• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2017 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 // This file is generated semi-automatically with this command:
9 //   $ src/jumper/build_stages.py
10 
11 #include <stdint.h>
12 
// CODE declares each generated array below as extern "C" and places it in a
// named section: an MSVC section called "code" declared read,execute, the
// Mach-O __TEXT,__text section, or .text on every other target.
// NOTE(review): presumably this is so the arrays of instruction words can be
// executed directly as functions -- confirm against the callers.
13 #if defined(_MSC_VER)
14     #pragma section("code", read,execute)
15     #define CODE extern "C" __declspec(allocate("code"))
16 #elif defined(__MACH__)
17     #define CODE extern "C" __attribute__((section("__TEXT,__text")))
18 #else
19     #define CODE extern "C" __attribute__((section(".text")))
20 #endif
21 
22 #if defined(__aarch64__)
23 
// Driver loop. Saves x19-x23 and x30 on the stack, then keeps its arguments
// in callee-saved registers: x21 = x0, x20 = x1 + 8 (after loading the first
// 8-byte entry at [x1] into x23), x22 = x2, x19 = x3.
// While x21 + 4 <= x19 (unsigned compare) it zeroes v0-v7, calls x23 with
// x0 = x21, x1 = x20, x2 = x22, and then advances x21 by 4.
// Returns in x0 the final value of x21, i.e. the first value for which
// x21 + 4 > x19.
// NOTE(review): presumably x0 is a start index, x3 a limit, and each call
// through x23 processes 4 items -- confirm against the caller.
24 CODE const uint32_t sk_start_pipeline_aarch64[] = {
25   0xa9bd5bf7,                             //stp           x23, x22, [sp, #-48]!
26   0xa90153f5,                             //stp           x21, x20, [sp, #16]
27   0xa9027bf3,                             //stp           x19, x30, [sp, #32]
28   0xaa0103f4,                             //mov           x20, x1
29   0xf8408697,                             //ldr           x23, [x20], #8
30   0xaa0003f5,                             //mov           x21, x0
31   0xaa0303f3,                             //mov           x19, x3
32   0x910012a8,                             //add           x8, x21, #0x4
33   0xeb13011f,                             //cmp           x8, x19
34   0xaa0203f6,                             //mov           x22, x2
35   0x54000069,                             //b.ls          34 <sk_start_pipeline_aarch64+0x34>  // b.plast
  // Not even one full step of 4 fits: return x21 unchanged via the epilogue at +0x78.
36   0xaa1503e0,                             //mov           x0, x21
37   0x14000012,                             //b             78 <sk_start_pipeline_aarch64+0x78>
  // Loop body (+0x34): clear v0-v7 and call x23.
38   0x6f00e400,                             //movi          v0.2d, #0x0
39   0x6f00e401,                             //movi          v1.2d, #0x0
40   0x6f00e402,                             //movi          v2.2d, #0x0
41   0x6f00e403,                             //movi          v3.2d, #0x0
42   0x6f00e404,                             //movi          v4.2d, #0x0
43   0x6f00e405,                             //movi          v5.2d, #0x0
44   0x6f00e406,                             //movi          v6.2d, #0x0
45   0x6f00e407,                             //movi          v7.2d, #0x0
46   0xaa1503e0,                             //mov           x0, x21
47   0xaa1403e1,                             //mov           x1, x20
48   0xaa1603e2,                             //mov           x2, x22
49   0xd63f02e0,                             //blr           x23
  // Advance by 4 and loop again while the next step (old x21 + 8) still fits within x19.
50   0x910012a0,                             //add           x0, x21, #0x4
51   0x910022a8,                             //add           x8, x21, #0x8
52   0xeb13011f,                             //cmp           x8, x19
53   0xaa0003f5,                             //mov           x21, x0
54   0x54fffe09,                             //b.ls          34 <sk_start_pipeline_aarch64+0x34>  // b.plast
  // Epilogue (+0x78): restore the saved registers and return.
55   0xa9427bf3,                             //ldp           x19, x30, [sp, #32]
56   0xa94153f5,                             //ldp           x21, x20, [sp, #16]
57   0xa8c35bf7,                             //ldp           x23, x22, [sp], #48
58   0xd65f03c0,                             //ret
59 };
60 
// A single `ret`: returns to the address in x30 without touching any state.
61 CODE const uint32_t sk_just_return_aarch64[] = {
62   0xd65f03c0,                             //ret
63 };
64 
// Loads two 8-byte entries from [x1] into x8 and x3 (x1 += 16), then sets:
//   v0 = float(w0) + 0.5 + the four floats at [x2]
//   v1 = float(the 32-bit integer at [x8]) + 0.5, broadcast to all lanes
//   v2 = 1.0
//   v3 ... v7 = 0
// and tail-calls x3. The 0.5 is the bit pattern 0x3f000000 built by
// `movi v7.4s, #0x3f, lsl #24`.
65 CODE const uint32_t sk_seed_shader_aarch64[] = {
66   0xa8c10c28,                             //ldp           x8, x3, [x1], #16
67   0x3dc00046,                             //ldr           q6, [x2]
68   0x4e040c00,                             //dup           v0.4s, w0
69   0x4f0167e7,                             //movi          v7.4s, #0x3f, lsl #24
70   0x4d40c901,                             //ld1r          {v1.4s}, [x8]
71   0x4e21d800,                             //scvtf         v0.4s, v0.4s
72   0x4e27d400,                             //fadd          v0.4s, v0.4s, v7.4s
73   0x4f03f602,                             //fmov          v2.4s, #1.000000000000000000e+00
74   0x4e21d821,                             //scvtf         v1.4s, v1.4s
75   0x6f00e403,                             //movi          v3.2d, #0x0
76   0x6f00e404,                             //movi          v4.2d, #0x0
77   0x6f00e405,                             //movi          v5.2d, #0x0
78   0x4e26d400,                             //fadd          v0.4s, v0.4s, v6.4s
79   0x6f00e406,                             //movi          v6.2d, #0x0
80   0x4e27d421,                             //fadd          v1.4s, v1.4s, v7.4s
81   0x6f00e407,                             //movi          v7.2d, #0x0
82   0xd61f0060,                             //br            x3
83 };
84 
// Loads two 8-byte entries from [x1] into x8 and x3 (x1 += 16), reads 16
// bytes at [x8], and broadcasts its four 32-bit lanes: lane 0 -> v0,
// lane 1 -> v1, lane 2 -> v2, lane 3 -> v3. Then tail-calls x3.
85 CODE const uint32_t sk_constant_color_aarch64[] = {
86   0xa8c10c28,                             //ldp           x8, x3, [x1], #16
87   0x3dc00103,                             //ldr           q3, [x8]
88   0x4e040460,                             //dup           v0.4s, v3.s[0]
89   0x4e0c0461,                             //dup           v1.4s, v3.s[1]
90   0x4e140462,                             //dup           v2.4s, v3.s[2]
91   0x4e1c0463,                             //dup           v3.4s, v3.s[3]
92   0xd61f0060,                             //br            x3
93 };
94 
// Loads the next 8-byte entry from [x1] into x3 (x1 += 8), zeroes v0-v3,
// and tail-calls x3.
95 CODE const uint32_t sk_clear_aarch64[] = {
96   0xf8408423,                             //ldr           x3, [x1], #8
97   0x6f00e400,                             //movi          v0.2d, #0x0
98   0x6f00e401,                             //movi          v1.2d, #0x0
99   0x6f00e402,                             //movi          v2.2d, #0x0
100   0x6f00e403,                             //movi          v3.2d, #0x0
101   0xd61f0060,                             //br            x3
102 };
103 
// Loads the next 8-byte entry from [x1] into x3 (x1 += 8), adds v4-v7 into
// v0-v3 lane-wise (v0 += v4, v1 += v5, v2 += v6, v3 += v7), and tail-calls x3.
104 CODE const uint32_t sk_plus__aarch64[] = {
105   0xf8408423,                             //ldr           x3, [x1], #8
106   0x4e24d400,                             //fadd          v0.4s, v0.4s, v4.4s
107   0x4e25d421,                             //fadd          v1.4s, v1.4s, v5.4s
108   0x4e26d442,                             //fadd          v2.4s, v2.4s, v6.4s
109   0x4e27d463,                             //fadd          v3.4s, v3.4s, v7.4s
110   0xd61f0060,                             //br            x3
111 };
112 
// Loads the next 8-byte entry from [x1] into x3 (x1 += 8), computes
// v16 = 1.0 - v3, then accumulates v0 += v16*v4, v1 += v16*v5, v2 += v16*v6,
// v3 += v16*v7 and tail-calls x3. The factor uses the value of v3 from
// before it is updated.
113 CODE const uint32_t sk_srcover_aarch64[] = {
114   0xf8408423,                             //ldr           x3, [x1], #8
115   0x4f03f610,                             //fmov          v16.4s, #1.000000000000000000e+00
116   0x4ea3d610,                             //fsub          v16.4s, v16.4s, v3.4s
117   0x4e24ce00,                             //fmla          v0.4s, v16.4s, v4.4s
118   0x4e25ce01,                             //fmla          v1.4s, v16.4s, v5.4s
119   0x4e26ce02,                             //fmla          v2.4s, v16.4s, v6.4s
120   0x4e27ce03,                             //fmla          v3.4s, v16.4s, v7.4s
121   0xd61f0060,                             //br            x3
122 };
123 
// Loads the next 8-byte entry from [x1] into x3 (x1 += 8), computes
// v20 = 1.0 - v7, and replaces each of v0-v3 with
//   v(4+i) + v20 * v(i)        for i = 0..3
// (built in the temporaries v16-v19 and copied back). v4-v7 are unchanged.
// Then tail-calls x3.
124 CODE const uint32_t sk_dstover_aarch64[] = {
125   0x4f03f611,                             //fmov          v17.4s, #1.000000000000000000e+00
126   0xf8408423,                             //ldr           x3, [x1], #8
127   0x4ea41c90,                             //mov           v16.16b, v4.16b
128   0x4ea7d634,                             //fsub          v20.4s, v17.4s, v7.4s
129   0x4ea51cb1,                             //mov           v17.16b, v5.16b
130   0x4ea61cd2,                             //mov           v18.16b, v6.16b
131   0x4ea71cf3,                             //mov           v19.16b, v7.16b
132   0x4e20ce90,                             //fmla          v16.4s, v20.4s, v0.4s
133   0x4e21ce91,                             //fmla          v17.4s, v20.4s, v1.4s
134   0x4e22ce92,                             //fmla          v18.4s, v20.4s, v2.4s
135   0x4e23ce93,                             //fmla          v19.4s, v20.4s, v3.4s
136   0x4eb01e00,                             //mov           v0.16b, v16.16b
137   0x4eb11e21,                             //mov           v1.16b, v17.16b
138   0x4eb21e42,                             //mov           v2.16b, v18.16b
139   0x4eb31e63,                             //mov           v3.16b, v19.16b
140   0xd61f0060,                             //br            x3
141 };
142 
// Loads the next 8-byte entry from [x1] into x3 (x1 += 8), replaces each of
// v0-v3 with fmax(value, 0.0) lane-wise, and tail-calls x3.
143 CODE const uint32_t sk_clamp_0_aarch64[] = {
144   0xf8408423,                             //ldr           x3, [x1], #8
145   0x6f00e410,                             //movi          v16.2d, #0x0
146   0x4e30f400,                             //fmax          v0.4s, v0.4s, v16.4s
147   0x4e30f421,                             //fmax          v1.4s, v1.4s, v16.4s
148   0x4e30f442,                             //fmax          v2.4s, v2.4s, v16.4s
149   0x4e30f463,                             //fmax          v3.4s, v3.4s, v16.4s
150   0xd61f0060,                             //br            x3
151 };
152 
// Loads the next 8-byte entry from [x1] into x3 (x1 += 8), replaces each of
// v0-v3 with fmin(value, 1.0) lane-wise, and tail-calls x3.
153 CODE const uint32_t sk_clamp_1_aarch64[] = {
154   0xf8408423,                             //ldr           x3, [x1], #8
155   0x4f03f610,                             //fmov          v16.4s, #1.000000000000000000e+00
156   0x4eb0f400,                             //fmin          v0.4s, v0.4s, v16.4s
157   0x4eb0f421,                             //fmin          v1.4s, v1.4s, v16.4s
158   0x4eb0f442,                             //fmin          v2.4s, v2.4s, v16.4s
159   0x4eb0f463,                             //fmin          v3.4s, v3.4s, v16.4s
160   0xd61f0060,                             //br            x3
161 };
162 
// Loads the next 8-byte entry from [x1] into x3 (x1 += 8), clamps v3 to at
// most 1.0, then clamps each of v0-v2 to at most the (already clamped) v3,
// and tail-calls x3.
163 CODE const uint32_t sk_clamp_a_aarch64[] = {
164   0xf8408423,                             //ldr           x3, [x1], #8
165   0x4f03f610,                             //fmov          v16.4s, #1.000000000000000000e+00
166   0x4eb0f463,                             //fmin          v3.4s, v3.4s, v16.4s
167   0x4ea3f400,                             //fmin          v0.4s, v0.4s, v3.4s
168   0x4ea3f421,                             //fmin          v1.4s, v1.4s, v3.4s
169   0x4ea3f442,                             //fmin          v2.4s, v2.4s, v3.4s
170   0xd61f0060,                             //br            x3
171 };
172 
// Loads two 8-byte entries from [x1] into x8 and x3 (x1 += 16), then
// broadcasts three consecutive 32-bit values starting at [x8]:
//   v0 = [x8 + 0], v1 = [x8 + 4], v2 = [x8 + 8]
// v3 is left untouched. Then tail-calls x3.
173 CODE const uint32_t sk_set_rgb_aarch64[] = {
174   0xa8c10c28,                             //ldp           x8, x3, [x1], #16
175   0xaa0803e9,                             //mov           x9, x8
176   0x4ddfc920,                             //ld1r          {v0.4s}, [x9], #4
177   0x91002108,                             //add           x8, x8, #0x8
178   0x4d40c902,                             //ld1r          {v2.4s}, [x8]
179   0x4d40c921,                             //ld1r          {v1.4s}, [x9]
180   0xd61f0060,                             //br            x3
181 };
182 
// Loads the next 8-byte entry from [x1] into x3 (x1 += 8), exchanges v0 and
// v2 using v16 as a temporary, and tail-calls x3.
183 CODE const uint32_t sk_swap_rb_aarch64[] = {
184   0xf8408423,                             //ldr           x3, [x1], #8
185   0x4ea01c10,                             //mov           v16.16b, v0.16b
186   0x4ea21c40,                             //mov           v0.16b, v2.16b
187   0x4eb01e02,                             //mov           v2.16b, v16.16b
188   0xd61f0060,                             //br            x3
189 };
190 
// Loads the next 8-byte entry from [x1] into x3 (x1 += 8) and exchanges the
// register groups v0-v3 and v4-v7 pairwise (v0<->v4, v1<->v5, v2<->v6,
// v3<->v7), using v16-v19 to hold the old v0-v3. Then tail-calls x3.
191 CODE const uint32_t sk_swap_aarch64[] = {
192   0xf8408423,                             //ldr           x3, [x1], #8
193   0x4ea31c70,                             //mov           v16.16b, v3.16b
194   0x4ea21c51,                             //mov           v17.16b, v2.16b
195   0x4ea11c32,                             //mov           v18.16b, v1.16b
196   0x4ea01c13,                             //mov           v19.16b, v0.16b
197   0x4ea41c80,                             //mov           v0.16b, v4.16b
198   0x4ea51ca1,                             //mov           v1.16b, v5.16b
199   0x4ea61cc2,                             //mov           v2.16b, v6.16b
200   0x4ea71ce3,                             //mov           v3.16b, v7.16b
201   0x4eb31e64,                             //mov           v4.16b, v19.16b
202   0x4eb21e45,                             //mov           v5.16b, v18.16b
203   0x4eb11e26,                             //mov           v6.16b, v17.16b
204   0x4eb01e07,                             //mov           v7.16b, v16.16b
205   0xd61f0060,                             //br            x3
206 };
207 
// Loads the next 8-byte entry from [x1] into x3 (x1 += 8), copies v0-v3
// into v4-v7 (v0-v3 keep their values), and tail-calls x3.
208 CODE const uint32_t sk_move_src_dst_aarch64[] = {
209   0xf8408423,                             //ldr           x3, [x1], #8
210   0x4ea01c04,                             //mov           v4.16b, v0.16b
211   0x4ea11c25,                             //mov           v5.16b, v1.16b
212   0x4ea21c46,                             //mov           v6.16b, v2.16b
213   0x4ea31c67,                             //mov           v7.16b, v3.16b
214   0xd61f0060,                             //br            x3
215 };
216 
// Loads the next 8-byte entry from [x1] into x3 (x1 += 8), copies v4-v7
// into v0-v3 (v4-v7 keep their values), and tail-calls x3.
217 CODE const uint32_t sk_move_dst_src_aarch64[] = {
218   0xf8408423,                             //ldr           x3, [x1], #8
219   0x4ea41c80,                             //mov           v0.16b, v4.16b
220   0x4ea51ca1,                             //mov           v1.16b, v5.16b
221   0x4ea61cc2,                             //mov           v2.16b, v6.16b
222   0x4ea71ce3,                             //mov           v3.16b, v7.16b
223   0xd61f0060,                             //br            x3
224 };
225 
// Loads the next 8-byte entry from [x1] into x3 (x1 += 8), multiplies each
// of v0-v2 by v3 lane-wise (v3 itself is unchanged), and tail-calls x3.
226 CODE const uint32_t sk_premul_aarch64[] = {
227   0xf8408423,                             //ldr           x3, [x1], #8
228   0x6e23dc00,                             //fmul          v0.4s, v0.4s, v3.4s
229   0x6e23dc21,                             //fmul          v1.4s, v1.4s, v3.4s
230   0x6e23dc42,                             //fmul          v2.4s, v2.4s, v3.4s
231   0xd61f0060,                             //br            x3
232 };
233 
// Loads the next 8-byte entry from [x1] into x3 (x1 += 8) and multiplies
// each of v0-v2 by a per-lane scale: 1.0 / v3 where v3 != 0.0, and 0 where
// v3 == 0.0. The zero case comes from the fcmeq mask being cleared out of
// the quotient with bic. v3 itself is unchanged. Then tail-calls x3.
234 CODE const uint32_t sk_unpremul_aarch64[] = {
235   0x4f03f611,                             //fmov          v17.4s, #1.000000000000000000e+00
236   0xf8408423,                             //ldr           x3, [x1], #8
237   0x4ea0d870,                             //fcmeq         v16.4s, v3.4s, #0.0
238   0x6e23fe31,                             //fdiv          v17.4s, v17.4s, v3.4s
239   0x4e701e30,                             //bic           v16.16b, v17.16b, v16.16b
240   0x6e20de00,                             //fmul          v0.4s, v16.4s, v0.4s
241   0x6e21de01,                             //fmul          v1.4s, v16.4s, v1.4s
242   0x6e22de02,                             //fmul          v2.4s, v16.4s, v2.4s
243   0xd61f0060,                             //br            x3
244 };
245 
// Applies the same piecewise function independently to each of v0, v1, v2
// (v3 is untouched), loads the next 8-byte entry from [x1] into x3
// (x1 += 8), and tail-calls x3. For an input lane s:
//   lo = s * K_lo                       K_lo = 0x3d9e8391 (~0.0774, ~1/12.92)
//   hi = C + (B + A*s) * (s*s)          A = 0x3e99999a (~0.3)
//                                       B = 0x3f328f5c (~0.6975)
//                                       C = 0x3b23d70a (~0.0025)
//   result = (T > s) ? lo : hi          T = 0x3d6147ae (~0.055)
// The constants are 32-bit bit patterns built in w8 with mov/movk and
// broadcast with dup; the decimal values are their approximate IEEE-754
// single-precision readings. The select is done with fcmgt + bsl.
246 CODE const uint32_t sk_from_srgb_aarch64[] = {
247   0x52a7d328,                             //mov           w8, #0x3e990000
248   0x72933348,                             //movk          w8, #0x999a
249   0x4e040d10,                             //dup           v16.4s, w8
250   0x52a7e648,                             //mov           w8, #0x3f320000
251   0x7291eb88,                             //movk          w8, #0x8f5c
252   0x4e040d11,                             //dup           v17.4s, w8
253   0x52a76468,                             //mov           w8, #0x3b230000
254   0x729ae148,                             //movk          w8, #0xd70a
255   0x4e040d12,                             //dup           v18.4s, w8
256   0x52a7b3c8,                             //mov           w8, #0x3d9e0000
257   0x72907228,                             //movk          w8, #0x8391
258   0x6e22dc54,                             //fmul          v20.4s, v2.4s, v2.4s
259   0x4eb11e35,                             //mov           v21.16b, v17.16b
260   0x4eb11e37,                             //mov           v23.16b, v17.16b
261   0x4e22ce11,                             //fmla          v17.4s, v16.4s, v2.4s
262   0x4eb21e56,                             //mov           v22.16b, v18.16b
263   0x4eb21e58,                             //mov           v24.16b, v18.16b
264   0x4e34ce32,                             //fmla          v18.4s, v17.4s, v20.4s
265   0x4e040d11,                             //dup           v17.4s, w8
266   0x52a7ac28,                             //mov           w8, #0x3d610000
267   0x6e20dc13,                             //fmul          v19.4s, v0.4s, v0.4s
268   0x7288f5c8,                             //movk          w8, #0x47ae
269   0x4e20ce15,                             //fmla          v21.4s, v16.4s, v0.4s
270   0xf8408423,                             //ldr           x3, [x1], #8
271   0x6e21dc34,                             //fmul          v20.4s, v1.4s, v1.4s
272   0x4e33ceb6,                             //fmla          v22.4s, v21.4s, v19.4s
273   0x4e040d13,                             //dup           v19.4s, w8
274   0x4e21ce17,                             //fmla          v23.4s, v16.4s, v1.4s
275   0x6e31dc15,                             //fmul          v21.4s, v0.4s, v17.4s
276   0x6ea0e660,                             //fcmgt         v0.4s, v19.4s, v0.4s
277   0x6e31dc30,                             //fmul          v16.4s, v1.4s, v17.4s
278   0x6ea1e661,                             //fcmgt         v1.4s, v19.4s, v1.4s
279   0x6e31dc51,                             //fmul          v17.4s, v2.4s, v17.4s
280   0x6ea2e662,                             //fcmgt         v2.4s, v19.4s, v2.4s
281   0x4e34cef8,                             //fmla          v24.4s, v23.4s, v20.4s
282   0x6e761ea0,                             //bsl           v0.16b, v21.16b, v22.16b
283   0x6e781e01,                             //bsl           v1.16b, v16.16b, v24.16b
284   0x6e721e22,                             //bsl           v2.16b, v17.16b, v18.16b
285   0xd61f0060,                             //br            x3
286 };
287 
// Applies the same piecewise function independently to each of v0, v1, v2
// (v3 is untouched), loads the next 8-byte entry from [x1] into x3
// (x1 += 8), and tail-calls x3. For an input lane l:
//   rs   = frsqrte(l) refined by one frsqrts step     (estimate of l^-1/2)
//   sq   = frecpe(rs) refined by one frecps step      (estimate of l^1/2)
//   ft   = frsqrte(rs) refined by one frsqrts step    (estimate of l^1/4)
//   lo   = l * K_lo                    K_lo = 0x41475c29 (~12.46)
//   hi   = min(1.0, C + A*sq + B*ft)   A = 0x3f306fce (~0.6892)
//                                      B = 0x3ed287c2 (~0.4112)
//                                      C = 0xbdca57a8 (~-0.0988)
//   result = (T > l) ? lo : hi         T = 0x3b8ce704 (~0.0043)
// The constants are 32-bit bit patterns built in w8 with mov/movk and
// broadcast with dup; the decimal values are their approximate IEEE-754
// single-precision readings. The work for the three inputs is interleaved,
// so the instruction order below does not follow the formula order.
288 CODE const uint32_t sk_to_srgb_aarch64[] = {
289   0x52a828e8,                             //mov           w8, #0x41470000
290   0x728b8528,                             //movk          w8, #0x5c29
291   0x4e040d12,                             //dup           v18.4s, w8
292   0x52a7e608,                             //mov           w8, #0x3f300000
293   0x728df9c8,                             //movk          w8, #0x6fce
294   0x6ea1d811,                             //frsqrte       v17.4s, v0.4s
295   0x4e040d13,                             //dup           v19.4s, w8
296   0x52b7b948,                             //mov           w8, #0xbdca0000
297   0x728af508,                             //movk          w8, #0x57a8
298   0x6ea1d834,                             //frsqrte       v20.4s, v1.4s
299   0x6e31de36,                             //fmul          v22.4s, v17.4s, v17.4s
300   0x4e040d10,                             //dup           v16.4s, w8
301   0x52a77188,                             //mov           w8, #0x3b8c0000
302   0x6ea1d855,                             //frsqrte       v21.4s, v2.4s
303   0x6e34de98,                             //fmul          v24.4s, v20.4s, v20.4s
304   0x4eb6fc16,                             //frsqrts       v22.4s, v0.4s, v22.4s
305   0x729ce088,                             //movk          w8, #0xe704
306   0x6e35deb9,                             //fmul          v25.4s, v21.4s, v21.4s
307   0x4eb8fc38,                             //frsqrts       v24.4s, v1.4s, v24.4s
308   0x6e36de31,                             //fmul          v17.4s, v17.4s, v22.4s
309   0x4e040d17,                             //dup           v23.4s, w8
310   0x4eb9fc59,                             //frsqrts       v25.4s, v2.4s, v25.4s
311   0x6e38de94,                             //fmul          v20.4s, v20.4s, v24.4s
312   0x4ea1da36,                             //frecpe        v22.4s, v17.4s
313   0x6e32dc1a,                             //fmul          v26.4s, v0.4s, v18.4s
314   0x6ea0e6e0,                             //fcmgt         v0.4s, v23.4s, v0.4s
315   0x6e32dc3c,                             //fmul          v28.4s, v1.4s, v18.4s
316   0x6ea1e6e1,                             //fcmgt         v1.4s, v23.4s, v1.4s
317   0x6e32dc52,                             //fmul          v18.4s, v2.4s, v18.4s
318   0x6ea2e6e2,                             //fcmgt         v2.4s, v23.4s, v2.4s
319   0x6e39deb5,                             //fmul          v21.4s, v21.4s, v25.4s
320   0x4ea1da97,                             //frecpe        v23.4s, v20.4s
321   0x4e36fe39,                             //frecps        v25.4s, v17.4s, v22.4s
322   0x4ea1dab8,                             //frecpe        v24.4s, v21.4s
323   0x6e39ded6,                             //fmul          v22.4s, v22.4s, v25.4s
324   0x4e37fe99,                             //frecps        v25.4s, v20.4s, v23.4s
325   0x4eb01e1b,                             //mov           v27.16b, v16.16b
326   0x6e39def7,                             //fmul          v23.4s, v23.4s, v25.4s
327   0x4e38feb9,                             //frecps        v25.4s, v21.4s, v24.4s
328   0x6e39df18,                             //fmul          v24.4s, v24.4s, v25.4s
329   0x4eb01e19,                             //mov           v25.16b, v16.16b
330   0x4e36ce7b,                             //fmla          v27.4s, v19.4s, v22.4s
331   0x6ea1da36,                             //frsqrte       v22.4s, v17.4s
332   0x4e37ce79,                             //fmla          v25.4s, v19.4s, v23.4s
333   0x6ea1da97,                             //frsqrte       v23.4s, v20.4s
334   0x4e38ce70,                             //fmla          v16.4s, v19.4s, v24.4s
335   0x6e36ded8,                             //fmul          v24.4s, v22.4s, v22.4s
336   0x6ea1dab3,                             //frsqrte       v19.4s, v21.4s
337   0x4eb8fe31,                             //frsqrts       v17.4s, v17.4s, v24.4s
338   0x6e37def8,                             //fmul          v24.4s, v23.4s, v23.4s
339   0x4eb8fe94,                             //frsqrts       v20.4s, v20.4s, v24.4s
340   0x6e33de78,                             //fmul          v24.4s, v19.4s, v19.4s
341   0x52a7da48,                             //mov           w8, #0x3ed20000
342   0x4eb8feb5,                             //frsqrts       v21.4s, v21.4s, v24.4s
343   0x7290f848,                             //movk          w8, #0x87c2
344   0x6e31ded1,                             //fmul          v17.4s, v22.4s, v17.4s
345   0x6e34def4,                             //fmul          v20.4s, v23.4s, v20.4s
346   0x6e35de73,                             //fmul          v19.4s, v19.4s, v21.4s
347   0x4e040d15,                             //dup           v21.4s, w8
348   0xf8408423,                             //ldr           x3, [x1], #8
349   0x4e31cebb,                             //fmla          v27.4s, v21.4s, v17.4s
350   0x4f03f611,                             //fmov          v17.4s, #1.000000000000000000e+00
351   0x4e34ceb9,                             //fmla          v25.4s, v21.4s, v20.4s
352   0x4e33ceb0,                             //fmla          v16.4s, v21.4s, v19.4s
353   0x4ebbf633,                             //fmin          v19.4s, v17.4s, v27.4s
354   0x4eb9f634,                             //fmin          v20.4s, v17.4s, v25.4s
355   0x4eb0f630,                             //fmin          v16.4s, v17.4s, v16.4s
356   0x6e731f40,                             //bsl           v0.16b, v26.16b, v19.16b
357   0x6e741f81,                             //bsl           v1.16b, v28.16b, v20.16b
358   0x6e701e42,                             //bsl           v2.16b, v18.16b, v16.16b
359   0xd61f0060,                             //br            x3
360 };
361 
// Loads two 8-byte entries from [x1] into x8 and x3 (x1 += 16), reads one
// 32-bit float from [x8], multiplies every lane of v0-v3 by it, and
// tail-calls x3.
362 CODE const uint32_t sk_scale_1_float_aarch64[] = {
363   0xa8c10c28,                             //ldp           x8, x3, [x1], #16
364   0xbd400110,                             //ldr           s16, [x8]
365   0x4f909000,                             //fmul          v0.4s, v0.4s, v16.s[0]
366   0x4f909021,                             //fmul          v1.4s, v1.4s, v16.s[0]
367   0x4f909042,                             //fmul          v2.4s, v2.4s, v16.s[0]
368   0x4f909063,                             //fmul          v3.4s, v3.4s, v16.s[0]
369   0xd61f0060,                             //br            x3
370 };
371 
// Loads two 8-byte entries from [x1] into x8 and x3 (x1 += 16), then loads a
// base pointer from [x8] and reads the four bytes at base + x0 + 0..3.
// Each byte is widened and converted to float (unsigned), multiplied by the
// constant 0x3b808081 (~1/255 as an IEEE-754 float), and the resulting
// per-lane factor multiplies v0, v1, v2 and v3. Then tail-calls x3.
372 CODE const uint32_t sk_scale_u8_aarch64[] = {
373   0xa8c10c28,                             //ldp           x8, x3, [x1], #16
374   0x52a77009,                             //mov           w9, #0x3b800000
375   0x72901029,                             //movk          w9, #0x8081
376   0x4e040d30,                             //dup           v16.4s, w9
377   0xf9400108,                             //ldr           x8, [x8]
378   0x8b000108,                             //add           x8, x8, x0
379   0x39400109,                             //ldrb          w9, [x8]
380   0x3940050a,                             //ldrb          w10, [x8, #1]
381   0x3940090b,                             //ldrb          w11, [x8, #2]
382   0x39400d08,                             //ldrb          w8, [x8, #3]
383   0x4e021d31,                             //mov           v17.h[0], w9
384   0x4e061d51,                             //mov           v17.h[1], w10
385   0x4e0a1d71,                             //mov           v17.h[2], w11
386   0x4e0e1d11,                             //mov           v17.h[3], w8
387   0x2f10a631,                             //uxtl          v17.4s, v17.4h
388   0x6e21da31,                             //ucvtf         v17.4s, v17.4s
389   0x6e30de30,                             //fmul          v16.4s, v17.4s, v16.4s
390   0x6e20de00,                             //fmul          v0.4s, v16.4s, v0.4s
391   0x6e21de01,                             //fmul          v1.4s, v16.4s, v1.4s
392   0x6e22de02,                             //fmul          v2.4s, v16.4s, v2.4s
393   0x6e23de03,                             //fmul          v3.4s, v16.4s, v3.4s
394   0xd61f0060,                             //br            x3
395 };
396 
// Loads two 8-byte entries from [x1] into x8 and x3 (x1 += 16), reads one
// 32-bit float t from [x8], and replaces each of v0-v3 with
//   v(4+i) + (v(i) - v(4+i)) * t        for i = 0..3
// v4-v7 are unchanged. Then tail-calls x3.
397 CODE const uint32_t sk_lerp_1_float_aarch64[] = {
398   0xa8c10c28,                             //ldp           x8, x3, [x1], #16
399   0x4ea4d411,                             //fsub          v17.4s, v0.4s, v4.4s
400   0x4ea41c80,                             //mov           v0.16b, v4.16b
401   0x4ea5d432,                             //fsub          v18.4s, v1.4s, v5.4s
402   0xbd400110,                             //ldr           s16, [x8]
403   0x4ea51ca1,                             //mov           v1.16b, v5.16b
404   0x4f901220,                             //fmla          v0.4s, v17.4s, v16.s[0]
405   0x4ea6d451,                             //fsub          v17.4s, v2.4s, v6.4s
406   0x4f901241,                             //fmla          v1.4s, v18.4s, v16.s[0]
407   0x4ea61cc2,                             //mov           v2.16b, v6.16b
408   0x4ea7d472,                             //fsub          v18.4s, v3.4s, v7.4s
409   0x4ea71ce3,                             //mov           v3.16b, v7.16b
410   0x4f901222,                             //fmla          v2.4s, v17.4s, v16.s[0]
411   0x4f901243,                             //fmla          v3.4s, v18.4s, v16.s[0]
412   0xd61f0060,                             //br            x3
413 };
414 
// Loads two 8-byte entries from [x1] into x8 and x3 (x1 += 16), then loads a
// base pointer from [x8] and reads the four bytes at base + x0 + 0..3.
// Each byte is widened, converted to float (unsigned) and multiplied by
// 0x3b808081 (~1/255 as an IEEE-754 float) to form a per-lane factor c.
// Each of v0-v3 is then replaced with
//   v(4+i) + c * (v(i) - v(4+i))        for i = 0..3
// v4-v7 are unchanged. Then tail-calls x3.
415 CODE const uint32_t sk_lerp_u8_aarch64[] = {
416   0xa8c10c28,                             //ldp           x8, x3, [x1], #16
417   0x52a77009,                             //mov           w9, #0x3b800000
418   0x72901029,                             //movk          w9, #0x8081
419   0x4e040d30,                             //dup           v16.4s, w9
420   0xf9400108,                             //ldr           x8, [x8]
421   0x4ea4d412,                             //fsub          v18.4s, v0.4s, v4.4s
422   0x8b000108,                             //add           x8, x8, x0
423   0x3940010a,                             //ldrb          w10, [x8]
424   0x39400509,                             //ldrb          w9, [x8, #1]
425   0x3940090b,                             //ldrb          w11, [x8, #2]
426   0x39400d08,                             //ldrb          w8, [x8, #3]
427   0x4e021d51,                             //mov           v17.h[0], w10
428   0x4e061d31,                             //mov           v17.h[1], w9
429   0x4e0a1d71,                             //mov           v17.h[2], w11
430   0x4e0e1d11,                             //mov           v17.h[3], w8
431   0x2f10a620,                             //uxtl          v0.4s, v17.4h
432   0x6e21d800,                             //ucvtf         v0.4s, v0.4s
433   0x6e30dc10,                             //fmul          v16.4s, v0.4s, v16.4s
434   0x4ea41c80,                             //mov           v0.16b, v4.16b
435   0x4ea5d431,                             //fsub          v17.4s, v1.4s, v5.4s
436   0x4ea51ca1,                             //mov           v1.16b, v5.16b
437   0x4e32ce00,                             //fmla          v0.4s, v16.4s, v18.4s
438   0x4ea6d452,                             //fsub          v18.4s, v2.4s, v6.4s
439   0x4e31ce01,                             //fmla          v1.4s, v16.4s, v17.4s
440   0x4ea61cc2,                             //mov           v2.16b, v6.16b
441   0x4ea7d471,                             //fsub          v17.4s, v3.4s, v7.4s
442   0x4ea71ce3,                             //mov           v3.16b, v7.16b
443   0x4e32ce02,                             //fmla          v2.4s, v16.4s, v18.4s
444   0x4e31ce03,                             //fmla          v3.4s, v16.4s, v17.4s
445   0xd61f0060,                             //br            x3
446 };
447 
// Loads two 8-byte entries from [x1] into x8 and x3 (x1 += 16), then loads a
// base pointer from [x8] and reads 8 bytes (four 16-bit values) at
// base + 2*x0, widening each to 32 bits. Three bit fields of each value
// become per-lane factors:
//   c0 = float(value & 0xf800) * 0x37842108   (~1/63488)
//   c1 = float(value & 0x07e0) * 0x3a020821   (~1/2016)
//   c2 = float(value & 0x001f) * 0x3d042108   (~1/31)
// where the multipliers are IEEE-754 single-precision bit patterns and the
// decimal values are approximate. v0-v2 are then replaced with
//   v(4+i) + c(i) * (v(i) - v(4+i))     for i = 0..2
// and v3 is set to 1.0 (its incoming value is overwritten by the load and
// never used). v4-v7 are unchanged. Then tail-calls x3.
448 CODE const uint32_t sk_lerp_565_aarch64[] = {
449   0xa8c10c28,                             //ldp           x8, x3, [x1], #16
450   0xd37ff809,                             //lsl           x9, x0, #1
451   0x4f072710,                             //movi          v16.4s, #0xf8, lsl #8
452   0x4ea4d413,                             //fsub          v19.4s, v0.4s, v4.4s
453   0xf9400108,                             //ldr           x8, [x8]
454   0xfc696903,                             //ldr           d3, [x8, x9]
455   0x52a6f088,                             //mov           w8, #0x37840000
456   0x72842108,                             //movk          w8, #0x2108
457   0x4e040d11,                             //dup           v17.4s, w8
458   0x2f10a463,                             //uxtl          v3.4s, v3.4h
459   0x321b17e8,                             //orr           w8, wzr, #0x7e0
460   0x4e301c60,                             //and           v0.16b, v3.16b, v16.16b
461   0x4e040d12,                             //dup           v18.4s, w8
462   0x52a74048,                             //mov           w8, #0x3a020000
463   0x4e21d800,                             //scvtf         v0.4s, v0.4s
464   0x72810428,                             //movk          w8, #0x821
465   0x6e31dc10,                             //fmul          v16.4s, v0.4s, v17.4s
466   0x4ea41c80,                             //mov           v0.16b, v4.16b
467   0x4e33ce00,                             //fmla          v0.4s, v16.4s, v19.4s
468   0x4f0007f0,                             //movi          v16.4s, #0x1f
469   0x4e040d11,                             //dup           v17.4s, w8
470   0x52a7a088,                             //mov           w8, #0x3d040000
471   0x4e321c72,                             //and           v18.16b, v3.16b, v18.16b
472   0x72842108,                             //movk          w8, #0x2108
473   0x4e301c63,                             //and           v3.16b, v3.16b, v16.16b
474   0x4ea6d450,                             //fsub          v16.4s, v2.4s, v6.4s
475   0x4e21da42,                             //scvtf         v2.4s, v18.4s
476   0x6e31dc51,                             //fmul          v17.4s, v2.4s, v17.4s
477   0x4e040d02,                             //dup           v2.4s, w8
478   0x4e21d863,                             //scvtf         v3.4s, v3.4s
479   0x4ea5d433,                             //fsub          v19.4s, v1.4s, v5.4s
480   0x4ea51ca1,                             //mov           v1.16b, v5.16b
481   0x6e22dc63,                             //fmul          v3.4s, v3.4s, v2.4s
482   0x4ea61cc2,                             //mov           v2.16b, v6.16b
483   0x4e33ce21,                             //fmla          v1.4s, v17.4s, v19.4s
484   0x4e30cc62,                             //fmla          v2.4s, v3.4s, v16.4s
485   0x4f03f603,                             //fmov          v3.4s, #1.000000000000000000e+00
486   0xd61f0060,                             //br            x3
487 };
488 
// sk_load_tables_aarch64: AArch64 machine code, one 32-bit instruction word per
// element; the trailing comment on each word is its disassembly.
// What the instructions do:
//   * Load x8 and x3 from [x1] and advance x1 by 16. x8 is then read as the
//     address of four consecutive 64-bit pointers: a source buffer followed by
//     three lookup tables.
//   * Read four 32-bit values from source + x0*4.
//   * Bits 0-7, 8-15 and 16-23 of each value index the first, second and third
//     table respectively (4-byte entries); the loaded entries fill v0, v1, v2.
//   * Bits 24-31 are converted to float and multiplied by ~1/255 into v3.
//   * Leave through `br x3`.
// NOTE(review): presumably x0 is the index of the first of four pixels and
// v0-v3 carry r,g,b,a -- confirm against the stage calling convention.
489 CODE const uint32_t sk_load_tables_aarch64[] = {
490   0xa8c10c28,                             //ldp           x8, x3, [x1], #16
491   0xd37ef409,                             //lsl           x9, x0, #2  // byte offset = x0 * 4
492   0x6f00e620,                             //movi          v0.2d, #0xff000000ff  // 0xff mask in every 32-bit lane
493   0x52a7700b,                             //mov           w11, #0x3b800000
494   0xa940310a,                             //ldp           x10, x12, [x8]  // x10 = source pointer, x12 = first table
495   0x7290102b,                             //movk          w11, #0x8081  // w11 = 0x3b808081, float bits of ~1/255
496   0x4e040d63,                             //dup           v3.4s, w11
497   0x3ce96942,                             //ldr           q2, [x10, x9]  // four packed 32-bit values
498   0xa9412109,                             //ldp           x9, x8, [x8, #16]  // x9 = second table, x8 = third table
499   0x4e201c41,                             //and           v1.16b, v2.16b, v0.16b  // indices for the first table (bits 0-7)
500   0x1e26002e,                             //fmov          w14, s1
501   0x6f380450,                             //ushr          v16.4s, v2.4s, #8
502   0x6f300451,                             //ushr          v17.4s, v2.4s, #16
503   0x8b2e498e,                             //add           x14, x12, w14, uxtw #2
504   0x0e0c3c2a,                             //mov           w10, v1.s[1]
505   0x0e143c2b,                             //mov           w11, v1.s[2]
506   0x0e1c3c2d,                             //mov           w13, v1.s[3]
507   0x4e201e01,                             //and           v1.16b, v16.16b, v0.16b  // indices for the second table (bits 8-15)
508   0x4e201e30,                             //and           v16.16b, v17.16b, v0.16b  // indices for the third table (bits 16-23)
509   0x0d4081c0,                             //ld1           {v0.s}[0], [x14]
510   0x8b2a498a,                             //add           x10, x12, w10, uxtw #2
511   0xbc6b5991,                             //ldr           s17, [x12, w11, uxtw #2]
512   0xbc6d5992,                             //ldr           s18, [x12, w13, uxtw #2]
513   0x0e0c3c2b,                             //mov           w11, v1.s[1]
514   0x0e143c2c,                             //mov           w12, v1.s[2]
515   0x0e1c3c2d,                             //mov           w13, v1.s[3]
516   0x1e26002e,                             //fmov          w14, s1
517   0x8b2e492e,                             //add           x14, x9, w14, uxtw #2
518   0xbc6c5933,                             //ldr           s19, [x9, w12, uxtw #2]
519   0xbc6d5934,                             //ldr           s20, [x9, w13, uxtw #2]
520   0x8b2b4929,                             //add           x9, x9, w11, uxtw #2
521   0x1e26020b,                             //fmov          w11, s16
522   0x6f280442,                             //ushr          v2.4s, v2.4s, #24  // top byte of each value
523   0x0d409140,                             //ld1           {v0.s}[1], [x10]
524   0x4e21d842,                             //scvtf         v2.4s, v2.4s
525   0x8b2b490a,                             //add           x10, x8, w11, uxtw #2
526   0x0d4081c1,                             //ld1           {v1.s}[0], [x14]
527   0x6e23dc43,                             //fmul          v3.4s, v2.4s, v3.4s  // v3 = top byte * ~1/255
528   0x0d408142,                             //ld1           {v2.s}[0], [x10]
529   0x0e0c3e0f,                             //mov           w15, v16.s[1]
530   0x0e143e0c,                             //mov           w12, v16.s[2]
531   0x8b2f490a,                             //add           x10, x8, w15, uxtw #2
532   0x0e1c3e0d,                             //mov           w13, v16.s[3]
533   0xbc6c5910,                             //ldr           s16, [x8, w12, uxtw #2]
534   0x0d409121,                             //ld1           {v1.s}[1], [x9]
535   0x0d409142,                             //ld1           {v2.s}[1], [x10]
536   0x6e140620,                             //mov           v0.s[2], v17.s[0]
537   0xbc6d5911,                             //ldr           s17, [x8, w13, uxtw #2]
538   0x6e140661,                             //mov           v1.s[2], v19.s[0]
539   0x6e140602,                             //mov           v2.s[2], v16.s[0]
540   0x6e1c0640,                             //mov           v0.s[3], v18.s[0]
541   0x6e1c0681,                             //mov           v1.s[3], v20.s[0]
542   0x6e1c0622,                             //mov           v2.s[3], v17.s[0]
543   0xd61f0060,                             //br            x3
544 };
545 
// sk_load_a8_aarch64: AArch64 machine code, one instruction word per element,
// disassembly in the trailing comments.
// What the instructions do:
//   * Load x8 and x3 from [x1] and advance x1 by 16; the pointer stored at
//     [x8] is the base of a byte buffer.
//   * Read four consecutive bytes at base + x0, widen them to 32 bits,
//     convert to float (unsigned) and multiply by ~1/255 into v3.
//   * Zero v0, v1 and v2, then leave through `br x3`.
// NOTE(review): presumably v3 is alpha and v0-v2 are the colour channels --
// confirm against the stage calling convention.
546 CODE const uint32_t sk_load_a8_aarch64[] = {
547   0xa8c10c28,                             //ldp           x8, x3, [x1], #16
548   0x52a77009,                             //mov           w9, #0x3b800000
549   0x72901029,                             //movk          w9, #0x8081  // w9 = 0x3b808081, float bits of ~1/255
550   0x4e040d22,                             //dup           v2.4s, w9
551   0xf9400108,                             //ldr           x8, [x8]
552   0x6f00e400,                             //movi          v0.2d, #0x0
553   0x6f00e401,                             //movi          v1.2d, #0x0
554   0x8b000108,                             //add           x8, x8, x0  // one byte per element
555   0x3940010a,                             //ldrb          w10, [x8]
556   0x3940050b,                             //ldrb          w11, [x8, #1]
557   0x3940090c,                             //ldrb          w12, [x8, #2]
558   0x39400d08,                             //ldrb          w8, [x8, #3]
559   0x4e021d43,                             //mov           v3.h[0], w10
560   0x4e061d63,                             //mov           v3.h[1], w11
561   0x4e0a1d83,                             //mov           v3.h[2], w12
562   0x4e0e1d03,                             //mov           v3.h[3], w8
563   0x2f10a463,                             //uxtl          v3.4s, v3.4h
564   0x6e21d863,                             //ucvtf         v3.4s, v3.4s
565   0x6e22dc63,                             //fmul          v3.4s, v3.4s, v2.4s
566   0x6f00e402,                             //movi          v2.2d, #0x0  // v2 held the scale factor until here
567   0xd61f0060,                             //br            x3
568 };
569 
// sk_store_a8_aarch64: AArch64 machine code, one instruction word per element,
// disassembly in the trailing comments.
// What the instructions do:
//   * Follow the pointer at [x1], then the pointer stored there, to get the
//     base of a byte buffer.
//   * Multiply v3 by 255.0, convert to unsigned integers (round to nearest,
//     ties to even), narrow to 16 bits, and store the low byte of each of
//     the four lanes at base + x0 + {0,1,2,3}.
//   * Load x3 from [x1, #8], advance x1 by 16 and leave through `br x3`.
// v0-v3 are not written.
// NOTE(review): presumably v3 is alpha in [0,1]; no clamp is applied here --
// confirm the caller guarantees the range.
570 CODE const uint32_t sk_store_a8_aarch64[] = {
571   0xf9400028,                             //ldr           x8, [x1]
572   0x52a86fe9,                             //mov           w9, #0x437f0000  // float bits of 255.0
573   0x4e040d30,                             //dup           v16.4s, w9
574   0x6e30dc70,                             //fmul          v16.4s, v3.4s, v16.4s
575   0xf9400108,                             //ldr           x8, [x8]
576   0x6e21aa10,                             //fcvtnu        v16.4s, v16.4s
577   0x0e612a10,                             //xtn           v16.4h, v16.4s
578   0x0e0e3e09,                             //umov          w9, v16.h[3]
579   0x8b000108,                             //add           x8, x8, x0  // one byte per element
580   0x39000d09,                             //strb          w9, [x8, #3]
581   0x0e0a3e09,                             //umov          w9, v16.h[2]
582   0x39000909,                             //strb          w9, [x8, #2]
583   0x0e063e09,                             //umov          w9, v16.h[1]
584   0x39000509,                             //strb          w9, [x8, #1]
585   0x0e023e09,                             //umov          w9, v16.h[0]
586   0x39000109,                             //strb          w9, [x8]
587   0xf9400423,                             //ldr           x3, [x1, #8]
588   0x91004021,                             //add           x1, x1, #0x10
589   0xd61f0060,                             //br            x3
590 };
591 
// sk_load_565_aarch64: AArch64 machine code, one instruction word per element,
// disassembly in the trailing comments.
// What the instructions do:
//   * Load x8 and x3 from [x1] and advance x1 by 16; the pointer stored at
//     [x8] is the base of a buffer of 16-bit values.
//   * Read four 16-bit values at base + x0*2 and widen them to 32 bits.
//   * v0 = (value & 0xf800) * ~1/0xf800
//     v1 = (value & 0x07e0) * ~1/0x07e0
//     v2 = (value & 0x001f) * ~1/0x001f
//     v3 = 1.0
//   * Leave through `br x3`.
// The masked fields are scaled in place (no shift); the reciprocal constants
// account for each field's position.
// NOTE(review): presumably v0-v3 are r,g,b,a -- confirm.
592 CODE const uint32_t sk_load_565_aarch64[] = {
593   0xa8c10c28,                             //ldp           x8, x3, [x1], #16
594   0xd37ff809,                             //lsl           x9, x0, #1  // byte offset = x0 * 2
595   0x4f072701,                             //movi          v1.4s, #0xf8, lsl #8  // mask 0xf800
596   0x4f0007e3,                             //movi          v3.4s, #0x1f
597   0xf9400108,                             //ldr           x8, [x8]
598   0xfc696900,                             //ldr           d0, [x8, x9]
599   0x321b17e8,                             //orr           w8, wzr, #0x7e0
600   0x4e040d02,                             //dup           v2.4s, w8
601   0x52a6f088,                             //mov           w8, #0x37840000
602   0x72842108,                             //movk          w8, #0x2108  // w8 = 0x37842108, float bits of ~1/63488 (1/0xf800)
603   0x2f10a400,                             //uxtl          v0.4s, v0.4h
604   0x4e211c01,                             //and           v1.16b, v0.16b, v1.16b
605   0x4e221c02,                             //and           v2.16b, v0.16b, v2.16b
606   0x4e231c03,                             //and           v3.16b, v0.16b, v3.16b
607   0x4e040d00,                             //dup           v0.4s, w8
608   0x52a74048,                             //mov           w8, #0x3a020000
609   0x72810428,                             //movk          w8, #0x821  // w8 = 0x3a020821, float bits of ~1/2016 (1/0x7e0)
610   0x4e21d821,                             //scvtf         v1.4s, v1.4s
611   0x6e20dc20,                             //fmul          v0.4s, v1.4s, v0.4s
612   0x4e040d01,                             //dup           v1.4s, w8
613   0x52a7a088,                             //mov           w8, #0x3d040000
614   0x72842108,                             //movk          w8, #0x2108  // w8 = 0x3d042108, float bits of ~1/31
615   0x4e21d842,                             //scvtf         v2.4s, v2.4s
616   0x6e21dc41,                             //fmul          v1.4s, v2.4s, v1.4s
617   0x4e040d02,                             //dup           v2.4s, w8
618   0x4e21d863,                             //scvtf         v3.4s, v3.4s
619   0x6e22dc62,                             //fmul          v2.4s, v3.4s, v2.4s
620   0x4f03f603,                             //fmov          v3.4s, #1.000000000000000000e+00
621   0xd61f0060,                             //br            x3
622 };
623 
// sk_store_565_aarch64: AArch64 machine code, one instruction word per element,
// disassembly in the trailing comments.
// What the instructions do:
//   * Follow the pointer at [x1], then the pointer stored there, to get the
//     base of a buffer of 16-bit values.
//   * Scale v0 by 31.0, v1 by 63.0 and v2 by 31.0, converting each to
//     unsigned integers (round to nearest, ties to even).
//   * Pack (v0 << 11) | (v1 << 5) | v2, narrow to 16 bits and store the four
//     results at base + x0*2.
//   * Load x3 from [x1, #8], advance x1 by 16 and leave through `br x3`.
// v0-v3 are not written; v3 is not read.
// NOTE(review): presumably v0-v2 are r,g,b in [0,1]; no clamp is applied here
// -- confirm the caller guarantees the range.
624 CODE const uint32_t sk_store_565_aarch64[] = {
625   0xf9400028,                             //ldr           x8, [x1]
626   0x52a84f8a,                             //mov           w10, #0x427c0000  // float bits of 63.0
627   0x4f01f7f0,                             //fmov          v16.4s, #3.100000000000000000e+01
628   0x4e040d52,                             //dup           v18.4s, w10
629   0x6e30dc11,                             //fmul          v17.4s, v0.4s, v16.4s
630   0x6e32dc32,                             //fmul          v18.4s, v1.4s, v18.4s
631   0x6e21aa31,                             //fcvtnu        v17.4s, v17.4s
632   0x6e21aa52,                             //fcvtnu        v18.4s, v18.4s
633   0x6e30dc50,                             //fmul          v16.4s, v2.4s, v16.4s
634   0x4f2b5631,                             //shl           v17.4s, v17.4s, #11
635   0xf9400108,                             //ldr           x8, [x8]
636   0x4f255652,                             //shl           v18.4s, v18.4s, #5
637   0x4eb11e51,                             //orr           v17.16b, v18.16b, v17.16b
638   0x6e21aa10,                             //fcvtnu        v16.4s, v16.4s
639   0x4eb01e30,                             //orr           v16.16b, v17.16b, v16.16b
640   0xd37ff809,                             //lsl           x9, x0, #1  // byte offset = x0 * 2
641   0x0e612a10,                             //xtn           v16.4h, v16.4s
642   0xfc296910,                             //str           d16, [x8, x9]
643   0xf9400423,                             //ldr           x3, [x1, #8]
644   0x91004021,                             //add           x1, x1, #0x10
645   0xd61f0060,                             //br            x3
646 };
647 
// sk_load_8888_aarch64: AArch64 machine code, one instruction word per element,
// disassembly in the trailing comments.
// What the instructions do:
//   * Load x8 and x3 from [x1] and advance x1 by 16; the pointer stored at
//     [x8] is the base of a buffer of 32-bit values.
//   * Read four 32-bit values at base + x0*4.
//   * v0 = bits 0-7, v1 = bits 8-15, v2 = bits 16-23, v3 = bits 24-31, each
//     converted to float and multiplied by ~1/255.
//   * Leave through `br x3`.
// NOTE(review): presumably v0-v3 are r,g,b,a -- confirm.
648 CODE const uint32_t sk_load_8888_aarch64[] = {
649   0xa8c10c28,                             //ldp           x8, x3, [x1], #16
650   0xd37ef409,                             //lsl           x9, x0, #2  // byte offset = x0 * 4
651   0x6f00e621,                             //movi          v1.2d, #0xff000000ff  // 0xff mask in every 32-bit lane
652   0xf9400108,                             //ldr           x8, [x8]
653   0x3ce96900,                             //ldr           q0, [x8, x9]
654   0x52a77008,                             //mov           w8, #0x3b800000
655   0x72901028,                             //movk          w8, #0x8081  // w8 = 0x3b808081, float bits of ~1/255
656   0x4e040d02,                             //dup           v2.4s, w8
657   0x6f380410,                             //ushr          v16.4s, v0.4s, #8
658   0x6f300411,                             //ushr          v17.4s, v0.4s, #16
659   0x4e211c03,                             //and           v3.16b, v0.16b, v1.16b
660   0x6f280400,                             //ushr          v0.4s, v0.4s, #24
661   0x4e211e10,                             //and           v16.16b, v16.16b, v1.16b
662   0x4e211e21,                             //and           v1.16b, v17.16b, v1.16b
663   0x4e21d863,                             //scvtf         v3.4s, v3.4s
664   0x4e21d811,                             //scvtf         v17.4s, v0.4s
665   0x4e21da10,                             //scvtf         v16.4s, v16.4s
666   0x4e21d832,                             //scvtf         v18.4s, v1.4s
667   0x6e22dc60,                             //fmul          v0.4s, v3.4s, v2.4s  // bits 0-7
668   0x6e22de23,                             //fmul          v3.4s, v17.4s, v2.4s  // bits 24-31
669   0x6e22de01,                             //fmul          v1.4s, v16.4s, v2.4s  // bits 8-15
670   0x6e22de42,                             //fmul          v2.4s, v18.4s, v2.4s  // bits 16-23; overwrites the scale last
671   0xd61f0060,                             //br            x3
672 };
673 
// sk_store_8888_aarch64: AArch64 machine code, one instruction word per
// element, disassembly in the trailing comments.
// What the instructions do:
//   * Follow the pointer at [x1], then the pointer stored there, to get the
//     base of a buffer of 32-bit values.
//   * Multiply each of v0-v3 by 255.0 and convert to unsigned integers
//     (round to nearest, ties to even).
//   * Pack v0 | (v1 << 8) | (v2 << 16) | (v3 << 24) and store the four 32-bit
//     results at base + x0*4.
//   * Load x3 from [x1, #8], advance x1 by 16 and leave through `br x3`.
// v0-v3 are not written.
// NOTE(review): no clamp is applied here -- confirm the caller guarantees
// the inputs are within [0,1].
674 CODE const uint32_t sk_store_8888_aarch64[] = {
675   0x52a86fea,                             //mov           w10, #0x437f0000  // float bits of 255.0
676   0x4e040d50,                             //dup           v16.4s, w10
677   0xf9400028,                             //ldr           x8, [x1]
678   0x6e30dc32,                             //fmul          v18.4s, v1.4s, v16.4s
679   0x6e30dc11,                             //fmul          v17.4s, v0.4s, v16.4s
680   0x6e21aa52,                             //fcvtnu        v18.4s, v18.4s
681   0x6e21aa31,                             //fcvtnu        v17.4s, v17.4s
682   0x4f285652,                             //shl           v18.4s, v18.4s, #8
683   0x4eb11e51,                             //orr           v17.16b, v18.16b, v17.16b
684   0x6e30dc52,                             //fmul          v18.4s, v2.4s, v16.4s
685   0x6e30dc70,                             //fmul          v16.4s, v3.4s, v16.4s
686   0x6e21aa52,                             //fcvtnu        v18.4s, v18.4s
687   0xf9400108,                             //ldr           x8, [x8]
688   0x6e21aa10,                             //fcvtnu        v16.4s, v16.4s
689   0x4f305652,                             //shl           v18.4s, v18.4s, #16
690   0x4eb21e31,                             //orr           v17.16b, v17.16b, v18.16b
691   0x4f385610,                             //shl           v16.4s, v16.4s, #24
692   0xd37ef409,                             //lsl           x9, x0, #2  // byte offset = x0 * 4
693   0x4eb01e30,                             //orr           v16.16b, v17.16b, v16.16b
694   0x3ca96910,                             //str           q16, [x8, x9]
695   0xf9400423,                             //ldr           x3, [x1, #8]
696   0x91004021,                             //add           x1, x1, #0x10
697   0xd61f0060,                             //br            x3
698 };
699 
// sk_load_f16_aarch64: AArch64 machine code, one instruction word per element,
// disassembly in the trailing comments.
// What the instructions do:
//   * Load x8 and x3 from [x1] and advance x1 by 16; the pointer stored at
//     [x8] is the base of a buffer with 8 bytes per element.
//   * At base + x0*8, de-interleave four 4-element groups of 16-bit values
//     (ld4) so that element k of every group lands in v(16+k).
//   * Widen each half-precision lane to single precision into v0-v3.
//   * Leave through `br x3`.
700 CODE const uint32_t sk_load_f16_aarch64[] = {
701   0xa8c10c28,                             //ldp           x8, x3, [x1], #16
702   0xf9400108,                             //ldr           x8, [x8]
703   0x8b000d08,                             //add           x8, x8, x0, lsl #3  // 8 bytes per element
704   0x0c400510,                             //ld4           {v16.4h-v19.4h}, [x8]
705   0x0e217a00,                             //fcvtl         v0.4s, v16.4h
706   0x0e217a21,                             //fcvtl         v1.4s, v17.4h
707   0x0e217a42,                             //fcvtl         v2.4s, v18.4h
708   0x0e217a63,                             //fcvtl         v3.4s, v19.4h
709   0xd61f0060,                             //br            x3
710 };
711 
// sk_store_f16_aarch64: AArch64 machine code, one instruction word per element,
// disassembly in the trailing comments.
// What the instructions do:
//   * Follow the pointer at [x1], then the pointer stored there, to get the
//     base of a buffer with 8 bytes per element.
//   * Narrow v0-v3 from single to half precision into v16-v19.
//   * Store them interleaved (st4) at base + x0*8, so each element is written
//     as four consecutive 16-bit values taken from v16, v17, v18, v19.
//   * Load x3 from [x1, #8], advance x1 by 16 and leave through `br x3`.
// v0-v3 are not written.
712 CODE const uint32_t sk_store_f16_aarch64[] = {
713   0xf9400028,                             //ldr           x8, [x1]
714   0x0e216810,                             //fcvtn         v16.4h, v0.4s
715   0x0e216831,                             //fcvtn         v17.4h, v1.4s
716   0x0e216852,                             //fcvtn         v18.4h, v2.4s
717   0xf9400108,                             //ldr           x8, [x8]
718   0x0e216873,                             //fcvtn         v19.4h, v3.4s
719   0x8b000d08,                             //add           x8, x8, x0, lsl #3  // 8 bytes per element
720   0x0c000510,                             //st4           {v16.4h-v19.4h}, [x8]
721   0xf9400423,                             //ldr           x3, [x1, #8]
722   0x91004021,                             //add           x1, x1, #0x10
723   0xd61f0060,                             //br            x3
724 };
725 
// sk_store_f32_aarch64: AArch64 machine code, one instruction word per element,
// disassembly in the trailing comments.
// What the instructions do:
//   * Follow the pointer at [x1], then the pointer stored there, to get the
//     base of a buffer with 16 bytes per element.
//   * Store v0-v3 interleaved (st4) at base + x0*16, so each element is
//     written as four consecutive 32-bit values taken from v0, v1, v2, v3.
//   * Load x3 from [x1, #8], advance x1 by 16 and leave through `br x3`.
// v0-v3 are stored as-is, with no conversion or scaling.
726 CODE const uint32_t sk_store_f32_aarch64[] = {
727   0xf9400028,                             //ldr           x8, [x1]
728   0xf9400108,                             //ldr           x8, [x8]
729   0x8b001108,                             //add           x8, x8, x0, lsl #4  // 16 bytes per element
730   0x4c000900,                             //st4           {v0.4s-v3.4s}, [x8]
731   0xf9400423,                             //ldr           x3, [x1, #8]
732   0x91004021,                             //add           x1, x1, #0x10
733   0xd61f0060,                             //br            x3
734 };
735 
// sk_clamp_x_aarch64: AArch64 machine code, one instruction word per element,
// disassembly in the trailing comments.
// What the instructions do:
//   * Load x8 and x3 from [x1] and advance x1 by 16.
//   * v0 = fmax(0, v0).
//   * Broadcast the 32-bit value at [x8] to all lanes and add the integer -1
//     to its bit pattern; v0 = fmin(v0, that value).
//   * Leave through `br x3`.
// Decrementing the bit pattern of a positive finite float yields the next
// representable value below it.
// NOTE(review): presumably [x8] holds a positive float limit, making the
// upper bound exclusive -- confirm.
736 CODE const uint32_t sk_clamp_x_aarch64[] = {
737   0xa8c10c28,                             //ldp           x8, x3, [x1], #16
738   0x6f00e411,                             //movi          v17.2d, #0x0
739   0x4e20f620,                             //fmax          v0.4s, v17.4s, v0.4s
740   0x6f07e7f1,                             //movi          v17.2d, #0xffffffffffffffff  // -1 in every 32-bit lane
741   0x4d40c910,                             //ld1r          {v16.4s}, [x8]
742   0x4eb18610,                             //add           v16.4s, v16.4s, v17.4s  // integer add: bit pattern - 1
743   0x4eb0f400,                             //fmin          v0.4s, v0.4s, v16.4s
744   0xd61f0060,                             //br            x3
745 };
746 
// sk_clamp_y_aarch64: AArch64 machine code, one instruction word per element,
// disassembly in the trailing comments. Same instruction sequence as
// sk_clamp_x_aarch64 but operating on v1 instead of v0.
// What the instructions do:
//   * Load x8 and x3 from [x1] and advance x1 by 16.
//   * v1 = fmax(0, v1).
//   * Broadcast the 32-bit value at [x8] to all lanes and add the integer -1
//     to its bit pattern; v1 = fmin(v1, that value).
//   * Leave through `br x3`.
// Decrementing the bit pattern of a positive finite float yields the next
// representable value below it.
// NOTE(review): presumably [x8] holds a positive float limit, making the
// upper bound exclusive -- confirm.
747 CODE const uint32_t sk_clamp_y_aarch64[] = {
748   0xa8c10c28,                             //ldp           x8, x3, [x1], #16
749   0x6f00e411,                             //movi          v17.2d, #0x0
750   0x4e21f621,                             //fmax          v1.4s, v17.4s, v1.4s
751   0x6f07e7f1,                             //movi          v17.2d, #0xffffffffffffffff  // -1 in every 32-bit lane
752   0x4d40c910,                             //ld1r          {v16.4s}, [x8]
753   0x4eb18610,                             //add           v16.4s, v16.4s, v17.4s  // integer add: bit pattern - 1
754   0x4eb0f421,                             //fmin          v1.4s, v1.4s, v16.4s
755   0xd61f0060,                             //br            x3
756 };
757 
// sk_repeat_x_aarch64: AArch64 machine code, one instruction word per element,
// disassembly in the trailing comments.
// What the instructions do, with L = the float loaded from [x8]:
//   * Load x8 and x3 from [x1] and advance x1 by 16.
//   * v0 = v0 - floor(v0 / L) * L        (fdiv, frintm, fmls)
//   * v0 = fmin(v0, L'), where L' is L with the integer -1 added to its bit
//     pattern.
//   * Leave through `br x3`.
// NOTE(review): presumably L is a positive limit, so L' is the largest float
// below L -- confirm. L == 0 would divide by zero; no guard is present here.
758 CODE const uint32_t sk_repeat_x_aarch64[] = {
759   0xa8c10c28,                             //ldp           x8, x3, [x1], #16
760   0x6f07e7f1,                             //movi          v17.2d, #0xffffffffffffffff  // -1 in every 32-bit lane
761   0xbd400110,                             //ldr           s16, [x8]
762   0x4e040612,                             //dup           v18.4s, v16.s[0]
763   0x4eb18651,                             //add           v17.4s, v18.4s, v17.4s  // integer add: bit pattern of L - 1
764   0x6e32fc12,                             //fdiv          v18.4s, v0.4s, v18.4s
765   0x4e219a52,                             //frintm        v18.4s, v18.4s  // round toward minus infinity (floor)
766   0x4f905240,                             //fmls          v0.4s, v18.4s, v16.s[0]
767   0x4eb1f400,                             //fmin          v0.4s, v0.4s, v17.4s
768   0xd61f0060,                             //br            x3
769 };
770 
// sk_repeat_y_aarch64: AArch64 machine code, one instruction word per element,
// disassembly in the trailing comments. Same instruction sequence as
// sk_repeat_x_aarch64 but operating on v1 instead of v0.
// What the instructions do, with L = the float loaded from [x8]:
//   * Load x8 and x3 from [x1] and advance x1 by 16.
//   * v1 = v1 - floor(v1 / L) * L        (fdiv, frintm, fmls)
//   * v1 = fmin(v1, L'), where L' is L with the integer -1 added to its bit
//     pattern.
//   * Leave through `br x3`.
// NOTE(review): presumably L is a positive limit, so L' is the largest float
// below L -- confirm. L == 0 would divide by zero; no guard is present here.
771 CODE const uint32_t sk_repeat_y_aarch64[] = {
772   0xa8c10c28,                             //ldp           x8, x3, [x1], #16
773   0x6f07e7f1,                             //movi          v17.2d, #0xffffffffffffffff  // -1 in every 32-bit lane
774   0xbd400110,                             //ldr           s16, [x8]
775   0x4e040612,                             //dup           v18.4s, v16.s[0]
776   0x4eb18651,                             //add           v17.4s, v18.4s, v17.4s  // integer add: bit pattern of L - 1
777   0x6e32fc32,                             //fdiv          v18.4s, v1.4s, v18.4s
778   0x4e219a52,                             //frintm        v18.4s, v18.4s  // round toward minus infinity (floor)
779   0x4f905241,                             //fmls          v1.4s, v18.4s, v16.s[0]
780   0x4eb1f421,                             //fmin          v1.4s, v1.4s, v17.4s
781   0xd61f0060,                             //br            x3
782 };
783 
// sk_mirror_x_aarch64: AArch64 machine code, one instruction word per element,
// disassembly in the trailing comments.
// What the instructions do, with L = the float loaded from [x8]:
//   * Load x8 and x3 from [x1] and advance x1 by 16.
//   * t  = v0 - L
//   * t  = t - floor(t / (2*L)) * (2*L)
//   * v0 = fabs(t - L)
//   * v0 = fmin(v0, L'), where L' is L with the integer -1 added to its bit
//     pattern.
//   * Leave through `br x3`.
// NOTE(review): presumably L is a positive limit, so L' is the largest float
// below L -- confirm. L == 0 would divide by zero; no guard is present here.
784 CODE const uint32_t sk_mirror_x_aarch64[] = {
785   0xa8c10c28,                             //ldp           x8, x3, [x1], #16
786   0xbd400110,                             //ldr           s16, [x8]
787   0x4e040611,                             //dup           v17.4s, v16.s[0]  // v17 = L in every lane
788   0x1e302a10,                             //fadd          s16, s16, s16  // s16 = 2*L
789   0x4eb1d400,                             //fsub          v0.4s, v0.4s, v17.4s
790   0x4e040612,                             //dup           v18.4s, v16.s[0]
791   0x6e32fc12,                             //fdiv          v18.4s, v0.4s, v18.4s
792   0x4e219a52,                             //frintm        v18.4s, v18.4s  // round toward minus infinity (floor)
793   0x4f905240,                             //fmls          v0.4s, v18.4s, v16.s[0]
794   0x6f07e7f0,                             //movi          v16.2d, #0xffffffffffffffff  // -1 in every 32-bit lane
795   0x4eb1d400,                             //fsub          v0.4s, v0.4s, v17.4s
796   0x4eb08630,                             //add           v16.4s, v17.4s, v16.4s  // integer add: bit pattern of L - 1
797   0x4ea0f800,                             //fabs          v0.4s, v0.4s
798   0x4eb0f400,                             //fmin          v0.4s, v0.4s, v16.4s
799   0xd61f0060,                             //br            x3
800 };
801 
// sk_mirror_y_aarch64: AArch64 machine code, one instruction word per element,
// disassembly in the trailing comments. Same instruction sequence as
// sk_mirror_x_aarch64 but operating on v1 instead of v0.
// What the instructions do, with L = the float loaded from [x8]:
//   * Load x8 and x3 from [x1] and advance x1 by 16.
//   * t  = v1 - L
//   * t  = t - floor(t / (2*L)) * (2*L)
//   * v1 = fabs(t - L)
//   * v1 = fmin(v1, L'), where L' is L with the integer -1 added to its bit
//     pattern.
//   * Leave through `br x3`.
// NOTE(review): presumably L is a positive limit, so L' is the largest float
// below L -- confirm. L == 0 would divide by zero; no guard is present here.
802 CODE const uint32_t sk_mirror_y_aarch64[] = {
803   0xa8c10c28,                             //ldp           x8, x3, [x1], #16
804   0xbd400110,                             //ldr           s16, [x8]
805   0x4e040611,                             //dup           v17.4s, v16.s[0]  // v17 = L in every lane
806   0x1e302a10,                             //fadd          s16, s16, s16  // s16 = 2*L
807   0x4eb1d421,                             //fsub          v1.4s, v1.4s, v17.4s
808   0x4e040612,                             //dup           v18.4s, v16.s[0]
809   0x6e32fc32,                             //fdiv          v18.4s, v1.4s, v18.4s
810   0x4e219a52,                             //frintm        v18.4s, v18.4s  // round toward minus infinity (floor)
811   0x4f905241,                             //fmls          v1.4s, v18.4s, v16.s[0]
812   0x6f07e7f0,                             //movi          v16.2d, #0xffffffffffffffff  // -1 in every 32-bit lane
813   0x4eb1d421,                             //fsub          v1.4s, v1.4s, v17.4s
814   0x4eb08630,                             //add           v16.4s, v17.4s, v16.4s  // integer add: bit pattern of L - 1
815   0x4ea0f821,                             //fabs          v1.4s, v1.4s
816   0x4eb0f421,                             //fmin          v1.4s, v1.4s, v16.4s
817   0xd61f0060,                             //br            x3
818 };
819 
// sk_luminance_to_alpha_aarch64: AArch64 machine code, one instruction word per
// element, disassembly in the trailing comments.
// What the instructions do:
//   * Load x3 from [x1] and advance x1 by 8 (only one word is consumed here).
//   * v3 = v0 * ~0.2126 + v1 * ~0.7152 + v2 * ~0.0722, using the incoming
//     values of v0, v1, v2; the incoming v3 is discarded.
//   * Zero v0, v1 and v2, then leave through `br x3`.
// NOTE(review): presumably v0-v2 are r,g,b and v3 is alpha -- confirm against
// the stage calling convention.
820 CODE const uint32_t sk_luminance_to_alpha_aarch64[] = {
821   0x52a7cb28,                             //mov           w8, #0x3e590000
822   0x72967a08,                             //movk          w8, #0xb3d0  // w8 = 0x3e59b3d0, float bits of ~0.2126
823   0x4e040d11,                             //dup           v17.4s, w8
824   0x52a7e6e8,                             //mov           w8, #0x3f370000
825   0x7282eb28,                             //movk          w8, #0x1759  // w8 = 0x3f371759, float bits of ~0.7152
826   0x4ea01c10,                             //mov           v16.16b, v0.16b  // keep the incoming v0; v0 is reused below
827   0x4e040d00,                             //dup           v0.4s, w8
828   0x52a7b268,                             //mov           w8, #0x3d930000
829   0xf8408423,                             //ldr           x3, [x1], #8
830   0x729bb308,                             //movk          w8, #0xdd98  // w8 = 0x3d93dd98, float bits of ~0.0722
831   0x6e20dc23,                             //fmul          v3.4s, v1.4s, v0.4s
832   0x4e30ce23,                             //fmla          v3.4s, v17.4s, v16.4s
833   0x4e040d10,                             //dup           v16.4s, w8
834   0x6f00e400,                             //movi          v0.2d, #0x0
835   0x6f00e401,                             //movi          v1.2d, #0x0
836   0x4e22ce03,                             //fmla          v3.4s, v16.4s, v2.4s
837   0x6f00e402,                             //movi          v2.2d, #0x0
838   0xd61f0060,                             //br            x3
839 };
840 
// sk_matrix_2x3_aarch64: AArch64 machine code, one instruction word per
// element, disassembly in the trailing comments.
// What the instructions do, with m[i] = the 32-bit float at x8 + 4*i:
//   * Load x8 and x3 from [x1] and advance x1 by 16.
//   * new v0 = m[0]*v0 + m[2]*v1 + m[4]
//     new v1 = m[1]*v0 + m[3]*v1 + m[5]
//     (both computed from the incoming v0 and v1, accumulated in v16/v17).
//   * Leave through `br x3`.
// v2 and v3 are not touched.
841 CODE const uint32_t sk_matrix_2x3_aarch64[] = {
842   0xa8c10c28,                             //ldp           x8, x3, [x1], #16
843   0xaa0803e9,                             //mov           x9, x8
844   0x9100410a,                             //add           x10, x8, #0x10
845   0x4ddfc932,                             //ld1r          {v18.4s}, [x9], #4  // v18 = m[0]; x9 now points at m[1]
846   0x4d40c950,                             //ld1r          {v16.4s}, [x10]  // v16 = m[4]
847   0x2d415113,                             //ldp           s19, s20, [x8, #8]  // s19 = m[2], s20 = m[3]
848   0x9100510a,                             //add           x10, x8, #0x14
849   0x4d40c951,                             //ld1r          {v17.4s}, [x10]  // v17 = m[5]
850   0x4f931030,                             //fmla          v16.4s, v1.4s, v19.s[0]
851   0xbd400133,                             //ldr           s19, [x9]  // s19 = m[1]
852   0x4f941031,                             //fmla          v17.4s, v1.4s, v20.s[0]
853   0x4e20ce50,                             //fmla          v16.4s, v18.4s, v0.4s
854   0x4f931011,                             //fmla          v17.4s, v0.4s, v19.s[0]
855   0x4eb01e00,                             //mov           v0.16b, v16.16b
856   0x4eb11e21,                             //mov           v1.16b, v17.16b
857   0xd61f0060,                             //br            x3
858 };
859 
// sk_matrix_3x4_aarch64: AArch64 machine code, one instruction word per
// element, disassembly in the trailing comments.
// What the instructions do, with m[i] = the 32-bit float at x8 + 4*i:
//   * Load x8 and x3 from [x1] and advance x1 by 16.
//   * new v0 = m[0]*v0 + m[3]*v1 + m[6]*v2 + m[9]
//     new v1 = m[1]*v0 + m[4]*v1 + m[7]*v2 + m[10]
//     new v2 = m[2]*v0 + m[5]*v1 + m[8]*v2 + m[11]
//     (all computed from the incoming v0-v2, accumulated in v16-v18).
//   * Leave through `br x3`.
// v3 is not touched. v2 is reused as a scalar scratch register once its three
// products have been accumulated.
860 CODE const uint32_t sk_matrix_3x4_aarch64[] = {
861   0xa8c10c28,                             //ldp           x8, x3, [x1], #16
862   0xaa0803e9,                             //mov           x9, x8
863   0x9100910a,                             //add           x10, x8, #0x24
864   0x4ddfc933,                             //ld1r          {v19.4s}, [x9], #4  // v19 = m[0]; x9 now points at m[1]
865   0x4d40c950,                             //ld1r          {v16.4s}, [x10]  // v16 = m[9]
866   0x9100a10a,                             //add           x10, x8, #0x28
867   0x4d40c951,                             //ld1r          {v17.4s}, [x10]  // v17 = m[10]
868   0x9100b10a,                             //add           x10, x8, #0x2c
869   0x2d435514,                             //ldp           s20, s21, [x8, #24]  // s20 = m[6], s21 = m[7]
870   0xbd402116,                             //ldr           s22, [x8, #32]  // s22 = m[8]
871   0x4d40c952,                             //ld1r          {v18.4s}, [x10]  // v18 = m[11]
872   0x4f941050,                             //fmla          v16.4s, v2.4s, v20.s[0]
873   0x4f951051,                             //fmla          v17.4s, v2.4s, v21.s[0]
874   0x4f961052,                             //fmla          v18.4s, v2.4s, v22.s[0]
875   0x2d425502,                             //ldp           s2, s21, [x8, #16]  // s2 = m[4], s21 = m[5]; incoming v2 no longer needed
876   0x2d415d14,                             //ldp           s20, s23, [x8, #8]  // s20 = m[2], s23 = m[3]
877   0x4f821031,                             //fmla          v17.4s, v1.4s, v2.s[0]
878   0xbd400122,                             //ldr           s2, [x9]  // s2 = m[1]
879   0x4f971030,                             //fmla          v16.4s, v1.4s, v23.s[0]
880   0x4f951032,                             //fmla          v18.4s, v1.4s, v21.s[0]
881   0x4e20ce70,                             //fmla          v16.4s, v19.4s, v0.4s
882   0x4f941012,                             //fmla          v18.4s, v0.4s, v20.s[0]
883   0x4f821011,                             //fmla          v17.4s, v0.4s, v2.s[0]
884   0x4eb01e00,                             //mov           v0.16b, v16.16b
885   0x4eb11e21,                             //mov           v1.16b, v17.16b
886   0x4eb21e42,                             //mov           v2.16b, v18.16b
887   0xd61f0060,                             //br            x3
888 };
889 
// sk_matrix_4x5_aarch64: AArch64 machine code, one instruction word per
// element, disassembly in the trailing comments.
// What the instructions do, with m[i] = the 32-bit float at x9 + 4*i and x9
// loaded from [x1]:
//   * For k = 0..3:
//       new v<k> = m[k]*v0 + m[4+k]*v1 + m[8+k]*v2 + m[12+k]*v3 + m[16+k]
//     (all computed from the incoming v0-v3, accumulated in v16-v19).
//   * Load x3 from [x1, #8], advance x1 by 16 and leave through `br x3`.
// v2 and v3 are reused as scalar scratch registers once their products have
// been accumulated.
890 CODE const uint32_t sk_matrix_4x5_aarch64[] = {
891   0xf9400029,                             //ldr           x9, [x1]
892   0xaa0903e8,                             //mov           x8, x9
893   0x9101012a,                             //add           x10, x9, #0x40
894   0x4ddfc914,                             //ld1r          {v20.4s}, [x8], #4  // v20 = m[0]; x8 now points at m[1]
895   0x4d40c950,                             //ld1r          {v16.4s}, [x10]  // v16 = m[16]
896   0x9101112a,                             //add           x10, x9, #0x44
897   0x4d40c951,                             //ld1r          {v17.4s}, [x10]  // v17 = m[17]
898   0x9101212a,                             //add           x10, x9, #0x48
899   0x4d40c952,                             //ld1r          {v18.4s}, [x10]  // v18 = m[18]
900   0x2d465533,                             //ldp           s19, s21, [x9, #48]  // s19 = m[12], s21 = m[13]
901   0x2d475d36,                             //ldp           s22, s23, [x9, #56]  // s22 = m[14], s23 = m[15]
902   0x9101312a,                             //add           x10, x9, #0x4c
903   0xf9400423,                             //ldr           x3, [x1, #8]
904   0x4f931070,                             //fmla          v16.4s, v3.4s, v19.s[0]
905   0x4d40c953,                             //ld1r          {v19.4s}, [x10]  // v19 = m[19]
906   0x4f951071,                             //fmla          v17.4s, v3.4s, v21.s[0]
907   0x4f961072,                             //fmla          v18.4s, v3.4s, v22.s[0]
908   0x2d445935,                             //ldp           s21, s22, [x9, #32]  // s21 = m[8], s22 = m[9]
909   0x4f971073,                             //fmla          v19.4s, v3.4s, v23.s[0]
910   0x2d455d23,                             //ldp           s3, s23, [x9, #40]  // s3 = m[10], s23 = m[11]; incoming v3 no longer needed
911   0x91004021,                             //add           x1, x1, #0x10
912   0x4f951050,                             //fmla          v16.4s, v2.4s, v21.s[0]
913   0x4f961051,                             //fmla          v17.4s, v2.4s, v22.s[0]
914   0x2d425935,                             //ldp           s21, s22, [x9, #16]  // s21 = m[4], s22 = m[5]
915   0x4f971053,                             //fmla          v19.4s, v2.4s, v23.s[0]
916   0x4f831052,                             //fmla          v18.4s, v2.4s, v3.s[0]
917   0x2d410d22,                             //ldp           s2, s3, [x9, #8]  // s2 = m[2], s3 = m[3]; incoming v2 no longer needed
918   0x4f951030,                             //fmla          v16.4s, v1.4s, v21.s[0]
919   0x2d435d35,                             //ldp           s21, s23, [x9, #24]  // s21 = m[6], s23 = m[7]
920   0x4f961031,                             //fmla          v17.4s, v1.4s, v22.s[0]
921   0xbd400116,                             //ldr           s22, [x8]  // s22 = m[1]
922   0x4e20ce90,                             //fmla          v16.4s, v20.4s, v0.4s
923   0x4f951032,                             //fmla          v18.4s, v1.4s, v21.s[0]
924   0x4f971033,                             //fmla          v19.4s, v1.4s, v23.s[0]
925   0x4f821012,                             //fmla          v18.4s, v0.4s, v2.s[0]
926   0x4f831013,                             //fmla          v19.4s, v0.4s, v3.s[0]
927   0x4f961011,                             //fmla          v17.4s, v0.4s, v22.s[0]
928   0x4eb01e00,                             //mov           v0.16b, v16.16b
929   0x4eb11e21,                             //mov           v1.16b, v17.16b
930   0x4eb21e42,                             //mov           v2.16b, v18.16b
931   0x4eb31e63,                             //mov           v3.16b, v19.16b
932   0xd61f0060,                             //br            x3
933 };
934 
// sk_matrix_perspective_aarch64: applies a 3x3 projective transform to (v0, v1).
// Loads two pointers from [x1] (post-incrementing x1 by 16): x8 points at nine
// consecutive 32-bit floats m[0..8]; x3 is the address jumped to at the end.
// Per lane, using the incoming v0/v1 throughout:
//   w  = m[6]*v0 + m[7]*v1 + m[8]
//   v0 = (m[0]*v0 + m[1]*v1 + m[2]) * (1/w)
//   v1 = (m[3]*v0 + m[4]*v1 + m[5]) * (1/w)
// 1/w is the frecpe estimate refined by a single frecps step, not a true division.
// Scratch registers: x8-x10, v16-v22.  v2 and v3 are not touched.
935 CODE const uint32_t sk_matrix_perspective_aarch64[] = {
936   0xa8c10c28,                             //ldp           x8, x3, [x1], #16
937   0xaa0803e9,                             //mov           x9, x8
938   0x9100510a,                             //add           x10, x8, #0x14
939   0x4ddfc930,                             //ld1r          {v16.4s}, [x9], #4      (v16 = splat m[0]; x9 now -> m[1])
940   0x4d40c951,                             //ld1r          {v17.4s}, [x10]         (v17 = splat m[5])
941   0x9100810a,                             //add           x10, x8, #0x20
942   0x4d40c952,                             //ld1r          {v18.4s}, [x10]         (v18 = splat m[8])
943   0x2d41d113,                             //ldp           s19, s20, [x8, #12]     (m[3], m[4])
944   0x2d435915,                             //ldp           s21, s22, [x8, #24]     (m[6], m[7])
945   0x91002108,                             //add           x8, x8, #0x8
946   0x4f941031,                             //fmla          v17.4s, v1.4s, v20.s[0]
947   0x4d40c914,                             //ld1r          {v20.4s}, [x8]          (v20 = splat m[2])
948   0x4f961032,                             //fmla          v18.4s, v1.4s, v22.s[0]
949   0xbd400136,                             //ldr           s22, [x9]               (m[1])
950   0x4f951012,                             //fmla          v18.4s, v0.4s, v21.s[0] (v18 = w)
951   0x4f931011,                             //fmla          v17.4s, v0.4s, v19.s[0]
952   0x4f961034,                             //fmla          v20.4s, v1.4s, v22.s[0]
953   0x4ea1da41,                             //frecpe        v1.4s, v18.4s           (estimate of 1/w; v1's old value is no longer needed)
954   0x4e21fe52,                             //frecps        v18.4s, v18.4s, v1.4s   (refinement factor 2 - w*estimate)
955   0x6e32dc32,                             //fmul          v18.4s, v1.4s, v18.4s   (v18 = refined 1/w)
956   0x4e20ce14,                             //fmla          v20.4s, v16.4s, v0.4s
957   0x6e32de21,                             //fmul          v1.4s, v17.4s, v18.4s
958   0x6e32de80,                             //fmul          v0.4s, v20.4s, v18.4s
959   0xd61f0060,                             //br            x3
960 };
961 
// sk_linear_gradient_2stops_aarch64: evaluates four linear functions of v0.
// Loads two pointers from [x1] (post-incrementing x1 by 16): x8 points at eight
// consecutive floats c[0..3] followed by d[0..3]; x3 is the jump target.
// Per lane, using the incoming v0 for every output:
//   v0 = c[0] + v0*d[0]    v1 = c[1] + v0*d[1]
//   v2 = c[2] + v0*d[2]    v3 = c[3] + v0*d[3]
// The v0 result is accumulated in v16 and copied over at the end so the incoming
// v0 stays available to all four fmla instructions.
// NOTE(review): presumably c is the start color and d the per-channel delta to the
// end color, with v0 as the gradient parameter t — confirm against the stage source.
962 CODE const uint32_t sk_linear_gradient_2stops_aarch64[] = {
963   0xa8c10c28,                             //ldp           x8, x3, [x1], #16
964   0xad404503,                             //ldp           q3, q17, [x8]           (q3 = c[0..3], q17 = d[0..3])
965   0x4e040470,                             //dup           v16.4s, v3.s[0]
966   0x4e0c0461,                             //dup           v1.4s, v3.s[1]
967   0x4e140462,                             //dup           v2.4s, v3.s[2]
968   0x4e1c0463,                             //dup           v3.4s, v3.s[3]
969   0x4f911010,                             //fmla          v16.4s, v0.4s, v17.s[0]
970   0x4fb11001,                             //fmla          v1.4s, v0.4s, v17.s[1]
971   0x4f911802,                             //fmla          v2.4s, v0.4s, v17.s[2]
972   0x4fb11803,                             //fmla          v3.4s, v0.4s, v17.s[3]
973   0x4eb01e00,                             //mov           v0.16b, v16.16b
974   0xd61f0060,                             //br            x3
975 };
976 #elif defined(__arm__)
977 
// sk_start_pipeline_vfp4: driver loop that repeatedly calls the function whose
// address is the first word at [r1].
// In:  r0 = starting counter, r1 = pointer to a word array whose first word is the
//      callee, r2 = value passed through to the callee unchanged, r3 = limit.
// While counter + 2 <= limit (unsigned compare): zero d0-d7, then call the callee
// with r0 = counter, r1 = the array pointer advanced past its first word and
// r2 = the original r2; afterwards counter += 2.
// Out: r0 = the first counter value that was not processed.
// r4-r8 and lr are pushed on entry; the final pop restores them and returns via pc.
// NOTE(review): presumably the counter is an x coordinate and each call handles two
// pixels (one per lane of a d register) — confirm against the caller.
978 CODE const uint32_t sk_start_pipeline_vfp4[] = {
979   0xe92d41f0,                             //push          {r4, r5, r6, r7, r8, lr}
980   0xe1a04000,                             //mov           r4, r0
981   0xe2840002,                             //add           r0, r4, #2
982   0xe1a05003,                             //mov           r5, r3
983   0xe1a08002,                             //mov           r8, r2
984   0xe1a07001,                             //mov           r7, r1
985   0xe1500005,                             //cmp           r0, r5
986   0x8a000010,                             //bhi           64 <sk_start_pipeline_vfp4+0x64>   (nothing to do: skip the loop)
987   0xe4976004,                             //ldr           r6, [r7], #4            (r6 = callee; r7 -> rest of the array)
988   0xf2800010,                             //vmov.i32      d0, #0                  (loop head, offset 0x24)
989   0xe1a00004,                             //mov           r0, r4
990   0xf2801010,                             //vmov.i32      d1, #0
991   0xe1a01007,                             //mov           r1, r7
992   0xf2802010,                             //vmov.i32      d2, #0
993   0xe1a02008,                             //mov           r2, r8
994   0xf2803010,                             //vmov.i32      d3, #0
995   0xf2804010,                             //vmov.i32      d4, #0
996   0xf2805010,                             //vmov.i32      d5, #0
997   0xf2806010,                             //vmov.i32      d6, #0
998   0xf2807010,                             //vmov.i32      d7, #0
999   0xe12fff36,                             //blx           r6
1000   0xe2840004,                             //add           r0, r4, #4              (would the next pair still fit?)
1001   0xe2844002,                             //add           r4, r4, #2
1002   0xe1500005,                             //cmp           r0, r5
1003   0x9affffef,                             //bls           24 <sk_start_pipeline_vfp4+0x24>
1004   0xe1a00004,                             //mov           r0, r4
1005   0xe8bd81f0,                             //pop           {r4, r5, r6, r7, r8, pc}
1006 };
1007 
// sk_just_return_vfp4: returns to the address in lr without touching any state.
// NOTE(review): presumably placed as the last entry of a stage list so control
// returns to the blx in sk_start_pipeline_vfp4 — confirm against the caller.
1008 CODE const uint32_t sk_just_return_vfp4[] = {
1009   0xe12fff1e,                             //bx            lr
1010 };
1011 
// sk_seed_shader_vfp4: initializes d0-d7 from r0, the two floats at [r2] and one
// 32-bit integer reached through the stage list.
// Loads r3 and ip from [r1] and advances r1 by 8; ip is the tail-jump target.
//   d0 = float(r0) + 0.5 + {f[0], f[1]}   where f = the two floats at [r2]
//   d1 = float(int32 at [r3]) + 0.5       (same value in both lanes)
//   d2 = 1.0
//   d3 = d4 = d5 = d6 = d7 = 0
// NOTE(review): presumably d0/d1 become pixel-center x/y coordinates and [r2] holds
// per-lane x offsets — confirm against the C++ stage source.
1012 CODE const uint32_t sk_seed_shader_vfp4[] = {
1013   0xee800b90,                             //vdup.32       d16, r0
1014   0xe8911008,                             //ldm           r1, {r3, ip}
1015   0xf3fb0620,                             //vcvt.f32.s32  d16, d16
1016   0xf2c3161f,                             //vmov.i32      d17, #1056964608        (0x3f000000 = 0.5f in each lane)
1017   0xedd23b00,                             //vldr          d19, [r2]
1018   0xf4e32c9f,                             //vld1.32       {d18[]}, [r3 :32]
1019   0xf2872f10,                             //vmov.f32      d2, #1
1020   0xf3fb2622,                             //vcvt.f32.s32  d18, d18
1021   0xe2811008,                             //add           r1, r1, #8
1022   0xf2400da1,                             //vadd.f32      d16, d16, d17
1023   0xf2803010,                             //vmov.i32      d3, #0
1024   0xf2804010,                             //vmov.i32      d4, #0
1025   0xf2021da1,                             //vadd.f32      d1, d18, d17
1026   0xf2000da3,                             //vadd.f32      d0, d16, d19
1027   0xf2805010,                             //vmov.i32      d5, #0
1028   0xf2806010,                             //vmov.i32      d6, #0
1029   0xf2807010,                             //vmov.i32      d7, #0
1030   0xe12fff1c,                             //bx            ip
1031 };
1032 
// sk_constant_color_vfp4: broadcasts four 32-bit values into d0-d3.
// Loads r3 and ip from [r1] and advances r1 by 8.  Reads 16 bytes at [r3] into
// d16-d17 and copies 32-bit element i into both lanes of d<i> for i = 0..3, then
// jumps to ip.
// NOTE(review): presumably the four values are float r, g, b, a — confirm.
1033 CODE const uint32_t sk_constant_color_vfp4[] = {
1034   0xe8911008,                             //ldm           r1, {r3, ip}
1035   0xe2811008,                             //add           r1, r1, #8
1036   0xf4630a0f,                             //vld1.8        {d16-d17}, [r3]
1037   0xf3b40c20,                             //vdup.32       d0, d16[0]
1038   0xf3bc1c20,                             //vdup.32       d1, d16[1]
1039   0xf3b42c21,                             //vdup.32       d2, d17[0]
1040   0xf3bc3c21,                             //vdup.32       d3, d17[1]
1041   0xe12fff1c,                             //bx            ip
1042 };
1043 
// sk_clear_vfp4: sets d0, d1, d2 and d3 to all-zero bits (0.0f in every lane).
// Fetches the next word from [r1] into r3 (advancing r1 by 4) and jumps to it.
1044 CODE const uint32_t sk_clear_vfp4[] = {
1045   0xe4913004,                             //ldr           r3, [r1], #4
1046   0xf2800010,                             //vmov.i32      d0, #0
1047   0xf2801010,                             //vmov.i32      d1, #0
1048   0xf2802010,                             //vmov.i32      d2, #0
1049   0xf2803010,                             //vmov.i32      d3, #0
1050   0xe12fff13,                             //bx            r3
1051 };
1052 
// sk_plus__vfp4: lane-wise float add, d<i> = d<i> + d<i+4> for i = 0..3.
// No clamping is applied to the sums.
// Fetches the next word from [r1] into r3 (advancing r1 by 4) and jumps to it.
// NOTE(review): presumably d0-d3 hold source r,g,b,a and d4-d7 destination
// r,g,b,a — confirm against the stage source.
1053 CODE const uint32_t sk_plus__vfp4[] = {
1054   0xf2000d04,                             //vadd.f32      d0, d0, d4
1055   0xe4913004,                             //ldr           r3, [r1], #4
1056   0xf2011d05,                             //vadd.f32      d1, d1, d5
1057   0xf2022d06,                             //vadd.f32      d2, d2, d6
1058   0xf2033d07,                             //vadd.f32      d3, d3, d7
1059   0xe12fff13,                             //bx            r3
1060 };
1061 
// sk_srcover_vfp4: d<i> = d<i> + d<i+4> * (1 - d3) for i = 0..3, lane-wise.
// The factor (1 - d3) is computed into d16 first, so the update of d3 itself still
// uses the incoming d3.
// Fetches the next word from [r1] into r3 (advancing r1 by 4) and jumps to it.
// NOTE(review): presumably d0-d3 are source r,g,b,a and d4-d7 destination r,g,b,a,
// making this the source-over blend — confirm against the stage source.
1062 CODE const uint32_t sk_srcover_vfp4[] = {
1063   0xf2c70f10,                             //vmov.f32      d16, #1
1064   0xe4913004,                             //ldr           r3, [r1], #4
1065   0xf2600d83,                             //vsub.f32      d16, d16, d3
1066   0xf2040c30,                             //vfma.f32      d0, d4, d16
1067   0xf2051c30,                             //vfma.f32      d1, d5, d16
1068   0xf2062c30,                             //vfma.f32      d2, d6, d16
1069   0xf2073c30,                             //vfma.f32      d3, d7, d16
1070   0xe12fff13,                             //bx            r3
1071 };
1072 
// sk_dstover_vfp4: d<i> = d<i+4> + d<i> * (1 - d7) for i = 0..3, lane-wise.
// The results are accumulated in d16-d19 (seeded with copies of d4-d7) and then
// moved into d0-d3, so d4-d7 are left unchanged.
// Fetches the next word from [r1] into r3 (advancing r1 by 4) and jumps to it.
// NOTE(review): presumably d0-d3 are source r,g,b,a and d4-d7 destination r,g,b,a,
// making this the destination-over blend — confirm against the stage source.
1073 CODE const uint32_t sk_dstover_vfp4[] = {
1074   0xf2c70f10,                             //vmov.f32      d16, #1
1075   0xe4913004,                             //ldr           r3, [r1], #4
1076   0xf2651115,                             //vorr          d17, d5, d5
1077   0xf2604d87,                             //vsub.f32      d20, d16, d7            (d20 = 1 - d7)
1078   0xf2640114,                             //vorr          d16, d4, d4
1079   0xf2662116,                             //vorr          d18, d6, d6
1080   0xf2673117,                             //vorr          d19, d7, d7
1081   0xf2400c34,                             //vfma.f32      d16, d0, d20
1082   0xf2411c34,                             //vfma.f32      d17, d1, d20
1083   0xf2422c34,                             //vfma.f32      d18, d2, d20
1084   0xf2433c34,                             //vfma.f32      d19, d3, d20
1085   0xf22001b0,                             //vorr          d0, d16, d16
1086   0xf22111b1,                             //vorr          d1, d17, d17
1087   0xf22221b2,                             //vorr          d2, d18, d18
1088   0xf22331b3,                             //vorr          d3, d19, d19
1089   0xe12fff13,                             //bx            r3
1090 };
1091 
// sk_clamp_0_vfp4: d<i> = max(d<i>, 0.0) for i = 0..3, lane-wise.
// Fetches the next word from [r1] into r3 (advancing r1 by 4) and jumps to it.
1092 CODE const uint32_t sk_clamp_0_vfp4[] = {
1093   0xf2c00010,                             //vmov.i32      d16, #0
1094   0xe4913004,                             //ldr           r3, [r1], #4
1095   0xf2000f20,                             //vmax.f32      d0, d0, d16
1096   0xf2011f20,                             //vmax.f32      d1, d1, d16
1097   0xf2022f20,                             //vmax.f32      d2, d2, d16
1098   0xf2033f20,                             //vmax.f32      d3, d3, d16
1099   0xe12fff13,                             //bx            r3
1100 };
1101 
// sk_clamp_1_vfp4: d<i> = min(d<i>, 1.0) for i = 0..3, lane-wise.
// Fetches the next word from [r1] into r3 (advancing r1 by 4) and jumps to it.
1102 CODE const uint32_t sk_clamp_1_vfp4[] = {
1103   0xf2c70f10,                             //vmov.f32      d16, #1
1104   0xe4913004,                             //ldr           r3, [r1], #4
1105   0xf2200f20,                             //vmin.f32      d0, d0, d16
1106   0xf2211f20,                             //vmin.f32      d1, d1, d16
1107   0xf2222f20,                             //vmin.f32      d2, d2, d16
1108   0xf2233f20,                             //vmin.f32      d3, d3, d16
1109   0xe12fff13,                             //bx            r3
1110 };
1111 
// sk_clamp_a_vfp4: first d3 = min(d3, 1.0), then d0, d1, d2 are each clamped to at
// most the new d3, lane-wise.
// Fetches the next word from [r1] into r3 (advancing r1 by 4) and jumps to it.
// NOTE(review): presumably d3 is alpha and this restores the premultiplied
// invariant (color <= alpha) — confirm against the stage source.
1112 CODE const uint32_t sk_clamp_a_vfp4[] = {
1113   0xf2c70f10,                             //vmov.f32      d16, #1
1114   0xe4913004,                             //ldr           r3, [r1], #4
1115   0xf2233f20,                             //vmin.f32      d3, d3, d16
1116   0xf2200f03,                             //vmin.f32      d0, d0, d3
1117   0xf2211f03,                             //vmin.f32      d1, d1, d3
1118   0xf2222f03,                             //vmin.f32      d2, d2, d3
1119   0xe12fff13,                             //bx            r3
1120 };
1121 
// sk_set_rgb_vfp4: broadcasts three consecutive 32-bit values into d0, d1, d2.
// Loads r3 and ip from [r1] and advances r1 by 8.  With v = the words at [r3]:
//   d0 = v[0], d1 = v[1], d2 = v[2]   (each copied to both lanes); d3 is not modified.
// lr is borrowed as a scratch address register (r3 + 8), which is why fp and lr are
// pushed on entry and popped again before the jump to ip.
1122 CODE const uint32_t sk_set_rgb_vfp4[] = {
1123   0xe92d4800,                             //push          {fp, lr}
1124   0xe8911008,                             //ldm           r1, {r3, ip}
1125   0xe2811008,                             //add           r1, r1, #8
1126   0xe283e008,                             //add           lr, r3, #8
1127   0xf4a30c9d,                             //vld1.32       {d0[]}, [r3 :32]!       (post-increment: r3 now -> v[1])
1128   0xf4ae2c9f,                             //vld1.32       {d2[]}, [lr :32]
1129   0xf4a31c9f,                             //vld1.32       {d1[]}, [r3 :32]
1130   0xe8bd4800,                             //pop           {fp, lr}
1131   0xe12fff1c,                             //bx            ip
1132 };
1133 
// sk_swap_rb_vfp4: exchanges the contents of d0 and d2, using d16 as the temporary.
// d1 and d3 are not touched.
// Fetches the next word from [r1] into r3 (advancing r1 by 4) and jumps to it.
1134 CODE const uint32_t sk_swap_rb_vfp4[] = {
1135   0xeef00b40,                             //vmov.f64      d16, d0
1136   0xe4913004,                             //ldr           r3, [r1], #4
1137   0xeeb00b42,                             //vmov.f64      d0, d2
1138   0xeeb02b60,                             //vmov.f64      d2, d16
1139   0xe12fff13,                             //bx            r3
1140 };
1141 
// sk_swap_vfp4: exchanges d0-d3 with d4-d7 (d0<->d4, d1<->d5, d2<->d6, d3<->d7).
// The old d0-d3 are parked in d19..d16 (in that reversed order) while d4-d7 are
// copied down, then written back into d4-d7.
// Fetches the next word from [r1] into r3 (advancing r1 by 4) and jumps to it.
1142 CODE const uint32_t sk_swap_vfp4[] = {
1143   0xeef00b43,                             //vmov.f64      d16, d3
1144   0xe4913004,                             //ldr           r3, [r1], #4
1145   0xeef01b42,                             //vmov.f64      d17, d2
1146   0xeef02b41,                             //vmov.f64      d18, d1
1147   0xeef03b40,                             //vmov.f64      d19, d0
1148   0xeeb00b44,                             //vmov.f64      d0, d4
1149   0xeeb01b45,                             //vmov.f64      d1, d5
1150   0xeeb02b46,                             //vmov.f64      d2, d6
1151   0xeeb03b47,                             //vmov.f64      d3, d7
1152   0xeeb04b63,                             //vmov.f64      d4, d19
1153   0xeeb05b62,                             //vmov.f64      d5, d18
1154   0xeeb06b61,                             //vmov.f64      d6, d17
1155   0xeeb07b60,                             //vmov.f64      d7, d16
1156   0xe12fff13,                             //bx            r3
1157 };
1158 
// sk_move_src_dst_vfp4: copies d0-d3 into d4-d7 (d4 = d0, ..., d7 = d3).
// d0-d3 keep their values.
// Fetches the next word from [r1] into r3 (advancing r1 by 4) and jumps to it.
1159 CODE const uint32_t sk_move_src_dst_vfp4[] = {
1160   0xeeb04b40,                             //vmov.f64      d4, d0
1161   0xe4913004,                             //ldr           r3, [r1], #4
1162   0xeeb05b41,                             //vmov.f64      d5, d1
1163   0xeeb06b42,                             //vmov.f64      d6, d2
1164   0xeeb07b43,                             //vmov.f64      d7, d3
1165   0xe12fff13,                             //bx            r3
1166 };
1167 
// sk_move_dst_src_vfp4: copies d4-d7 into d0-d3 (d0 = d4, ..., d3 = d7).
// d4-d7 keep their values.
// Fetches the next word from [r1] into r3 (advancing r1 by 4) and jumps to it.
1168 CODE const uint32_t sk_move_dst_src_vfp4[] = {
1169   0xeeb00b44,                             //vmov.f64      d0, d4
1170   0xe4913004,                             //ldr           r3, [r1], #4
1171   0xeeb01b45,                             //vmov.f64      d1, d5
1172   0xeeb02b46,                             //vmov.f64      d2, d6
1173   0xeeb03b47,                             //vmov.f64      d3, d7
1174   0xe12fff13,                             //bx            r3
1175 };
1176 
// sk_premul_vfp4: multiplies d0, d1 and d2 by d3, lane-wise; d3 is unchanged.
// Fetches the next word from [r1] into r3 (advancing r1 by 4) and jumps to it.
// NOTE(review): presumably d0-d2 are r,g,b and d3 is alpha — confirm.
1177 CODE const uint32_t sk_premul_vfp4[] = {
1178   0xf3000d13,                             //vmul.f32      d0, d0, d3
1179   0xe4913004,                             //ldr           r3, [r1], #4
1180   0xf3011d13,                             //vmul.f32      d1, d1, d3
1181   0xf3022d13,                             //vmul.f32      d2, d2, d3
1182   0xe12fff13,                             //bx            r3
1183 };
1184 
// sk_unpremul_vfp4: multiplies d0, d1 and d2 by 1/d3, or by 0 in lanes where
// d3 == 0.  d3 itself is unchanged.
// The reciprocal uses a true scalar division per lane (vdiv, not an estimate):
// s6/s7 are the two lanes of d3 and s18/s19 the two lanes of d9.  d8-d9 are pushed
// on entry and popped before leaving because they are used as temporaries.
// Fetches the next word from [r1] into r3 (advancing r1 by 4) and jumps to it.
// The trailing nop sits after the bx and is never executed.
1185 CODE const uint32_t sk_unpremul_vfp4[] = {
1186   0xed2d8b04,                             //vpush         {d8-d9}
1187   0xeeb78a00,                             //vmov.f32      s16, #112               (#112 is the raw imm8 0x70, which encodes 1.0f)
1188   0xf3f91503,                             //vceq.f32      d17, d3, #0             (d17 = all-ones mask where d3 == 0)
1189   0xf2c00010,                             //vmov.i32      d16, #0
1190   0xe4913004,                             //ldr           r3, [r1], #4
1191   0xeec89a23,                             //vdiv.f32      s19, s16, s7
1192   0xee889a03,                             //vdiv.f32      s18, s16, s6
1193   0xf3501199,                             //vbsl          d17, d16, d9            (d17 = mask ? 0 : 1/d3)
1194   0xf3010d90,                             //vmul.f32      d0, d17, d0
1195   0xf3011d91,                             //vmul.f32      d1, d17, d1
1196   0xf3012d92,                             //vmul.f32      d2, d17, d2
1197   0xecbd8b04,                             //vpop          {d8-d9}
1198   0xe12fff13,                             //bx            r3
1199   0xe320f000,                             //nop           {0}
1200 };
1201 
// sk_from_srgb_vfp4: applies the same piecewise function to every lane of d0, d1
// and d2; d3 is not touched.  For a lane value x:
//   x < 0.055  ->  x * 0.0774                     (0x3d9e8391, about 1/12.92)
//   otherwise  ->  0.0025 + x*x*(0.6975 + 0.3*x)
// Both branches are computed for every lane; the vcgt masks (0.055 > x) then pick
// one through vbsl.  The five constants live in the literal pool that follows the
// bx, each stored twice so a single vldr fills both lanes of a d register.  The
// vldr offsets are pc-relative, so instructions must not be added or removed
// without regenerating the file.
// Fetches the next word from [r1] into r3 (advancing r1 by 4) and jumps to it.
1202 CODE const uint32_t sk_from_srgb_vfp4[] = {
1203   0xeddf3b20,                             //vldr          d19, [pc, #128]         (0.6975)
1204   0xf3408d10,                             //vmul.f32      d24, d0, d0
1205   0xeddf0b1c,                             //vldr          d16, [pc, #112]         (0.3)
1206   0xf26341b3,                             //vorr          d20, d19, d19
1207   0xf26351b3,                             //vorr          d21, d19, d19
1208   0xeddf9b1f,                             //vldr          d25, [pc, #124]         (0.0774)
1209   0xf2404c30,                             //vfma.f32      d20, d0, d16
1210   0xeddf2b1b,                             //vldr          d18, [pc, #108]         (0.055)
1211   0xf2415c30,                             //vfma.f32      d21, d1, d16
1212   0xeddfcb1d,                             //vldr          d28, [pc, #116]         (0.0025)
1213   0xf2423c30,                             //vfma.f32      d19, d2, d16
1214   0xe4913004,                             //ldr           r3, [r1], #4
1215   0xf3426d12,                             //vmul.f32      d22, d2, d2
1216   0xf3417d11,                             //vmul.f32      d23, d1, d1
1217   0xf3620e80,                             //vcgt.f32      d16, d18, d0
1218   0xf3621e81,                             //vcgt.f32      d17, d18, d1
1219   0xf341ad39,                             //vmul.f32      d26, d1, d25
1220   0xf342bd39,                             //vmul.f32      d27, d2, d25
1221   0xf3622e82,                             //vcgt.f32      d18, d18, d2
1222   0xf3409d39,                             //vmul.f32      d25, d0, d25
1223   0xf26cd1bc,                             //vorr          d29, d28, d28
1224   0xf248dcb4,                             //vfma.f32      d29, d24, d20
1225   0xf26c41bc,                             //vorr          d20, d28, d28
1226   0xf2474cb5,                             //vfma.f32      d20, d23, d21
1227   0xf246ccb3,                             //vfma.f32      d28, d22, d19
1228   0xf35901bd,                             //vbsl          d16, d25, d29
1229   0xf35a11b4,                             //vbsl          d17, d26, d20
1230   0xf35b21bc,                             //vbsl          d18, d27, d28
1231   0xf22001b0,                             //vorr          d0, d16, d16
1232   0xf22111b1,                             //vorr          d1, d17, d17
1233   0xf22221b2,                             //vorr          d2, d18, d18
1234   0xe12fff13,                             //bx            r3
1235   0x3e99999a,                             //.word         0x3e99999a              (0.3f)
1236   0x3e99999a,                             //.word         0x3e99999a
1237   0x3f328f5c,                             //.word         0x3f328f5c              (0.6975f)
1238   0x3f328f5c,                             //.word         0x3f328f5c
1239   0x3d6147ae,                             //.word         0x3d6147ae              (0.055f)
1240   0x3d6147ae,                             //.word         0x3d6147ae
1241   0x3d9e8391,                             //.word         0x3d9e8391              (about 0.0774f)
1242   0x3d9e8391,                             //.word         0x3d9e8391
1243   0x3b23d70a,                             //.word         0x3b23d70a              (0.0025f)
1244   0x3b23d70a,                             //.word         0x3b23d70a
1245 };
1246 
// sk_to_srgb_vfp4: applies the same piecewise function to every lane of d0, d1
// and d2; d3 is not touched.  For a lane value x:
//   x < 0.0043 ->  x * 12.46
//   otherwise  ->  min(1.0, 0.6892*sqrt(x) + 0.4112*x^(1/4) - 0.0988)
// No square-root instruction is used: rsqrt(x) comes from vrsqrte, sqrt(x) is then
// approximated as recip(rsqrt(x)) via vrecpe and x^(1/4) as rsqrt(rsqrt(x)) via a
// second vrsqrte.  Each estimate is refined by one vrsqrts/vrecps step.
// Both branches are computed for every lane; the vcgt masks (0.0043 > x) then pick
// one through vbsl.  The five constants live in the literal pool that follows the
// bx, each stored twice so a single vldr fills both lanes of a d register.  The
// vldr offsets are pc-relative, so instructions must not be added or removed
// without regenerating the file.
// Fetches the next word from [r1] into r3 (advancing r1 by 4) and jumps to it.
1247 CODE const uint32_t sk_to_srgb_vfp4[] = {
1248   0xf3fb0582,                             //vrsqrte.f32   d16, d2
1249   0xe4913004,                             //ldr           r3, [r1], #4
1250   0xf3fb1581,                             //vrsqrte.f32   d17, d1
1251   0xf3fb2580,                             //vrsqrte.f32   d18, d0
1252   0xf3403db0,                             //vmul.f32      d19, d16, d16
1253   0xf3414db1,                             //vmul.f32      d20, d17, d17
1254   0xf3425db2,                             //vmul.f32      d21, d18, d18
1255   0xf2623f33,                             //vrsqrts.f32   d19, d2, d19
1256   0xf2614f34,                             //vrsqrts.f32   d20, d1, d20
1257   0xf2605f35,                             //vrsqrts.f32   d21, d0, d21
1258   0xf3400db3,                             //vmul.f32      d16, d16, d19           (d16/d17/d18 = refined rsqrt of d2/d1/d0)
1259   0xf3411db4,                             //vmul.f32      d17, d17, d20
1260   0xf3422db5,                             //vmul.f32      d18, d18, d21
1261   0xf3fb3520,                             //vrecpe.f32    d19, d16                (start of sqrt = 1/rsqrt)
1262   0xf3fb4521,                             //vrecpe.f32    d20, d17
1263   0xf3fb6522,                             //vrecpe.f32    d22, d18
1264   0xf3fb55a0,                             //vrsqrte.f32   d21, d16                (start of fourth root = rsqrt(rsqrt))
1265   0xf3fb75a1,                             //vrsqrte.f32   d23, d17
1266   0xf3fb85a2,                             //vrsqrte.f32   d24, d18
1267   0xf2409fb3,                             //vrecps.f32    d25, d16, d19
1268   0xf241afb4,                             //vrecps.f32    d26, d17, d20
1269   0xf242bfb6,                             //vrecps.f32    d27, d18, d22
1270   0xf345cdb5,                             //vmul.f32      d28, d21, d21
1271   0xf347ddb7,                             //vmul.f32      d29, d23, d23
1272   0xf348edb8,                             //vmul.f32      d30, d24, d24
1273   0xf2600fbc,                             //vrsqrts.f32   d16, d16, d28
1274   0xf2611fbd,                             //vrsqrts.f32   d17, d17, d29
1275   0xf2622fbe,                             //vrsqrts.f32   d18, d18, d30
1276   0xf3433db9,                             //vmul.f32      d19, d19, d25           (d19/d20/d22 = refined sqrt of d2/d1/d0)
1277   0xeddf9b21,                             //vldr          d25, [pc, #132]         (about 0.6892)
1278   0xf3444dba,                             //vmul.f32      d20, d20, d26
1279   0xeddfab21,                             //vldr          d26, [pc, #132]         (about -0.0988)
1280   0xf3466dbb,                             //vmul.f32      d22, d22, d27
1281   0xf26ab1ba,                             //vorr          d27, d26, d26
1282   0xf243bcb9,                             //vfma.f32      d27, d19, d25
1283   0xf26a31ba,                             //vorr          d19, d26, d26
1284   0xf2443cb9,                             //vfma.f32      d19, d20, d25
1285   0xeddf4b1d,                             //vldr          d20, [pc, #116]         (about 0.4112)
1286   0xf246acb9,                             //vfma.f32      d26, d22, d25
1287   0xf3450db0,                             //vmul.f32      d16, d21, d16           (d16/d17/d18 = refined fourth root of d2/d1/d0)
1288   0xeddf5b1c,                             //vldr          d21, [pc, #112]         (12.46)
1289   0xf3471db1,                             //vmul.f32      d17, d23, d17
1290   0xf3482db2,                             //vmul.f32      d18, d24, d18
1291   0xf3406d35,                             //vmul.f32      d22, d0, d21
1292   0xf240bcb4,                             //vfma.f32      d27, d16, d20
1293   0xf2413cb4,                             //vfma.f32      d19, d17, d20
1294   0xf242acb4,                             //vfma.f32      d26, d18, d20
1295   0xeddf2b17,                             //vldr          d18, [pc, #92]          (about 0.0043)
1296   0xf3417d35,                             //vmul.f32      d23, d1, d21
1297   0xf3620e80,                             //vcgt.f32      d16, d18, d0
1298   0xf3621e81,                             //vcgt.f32      d17, d18, d1
1299   0xf3622e82,                             //vcgt.f32      d18, d18, d2
1300   0xf3425d35,                             //vmul.f32      d21, d2, d21
1301   0xf2c74f10,                             //vmov.f32      d20, #1
1302   0xf2648faa,                             //vmin.f32      d24, d20, d26
1303   0xf2643fa3,                             //vmin.f32      d19, d20, d19
1304   0xf2644fab,                             //vmin.f32      d20, d20, d27
1305   0xf35601b8,                             //vbsl          d16, d22, d24
1306   0xf35711b3,                             //vbsl          d17, d23, d19
1307   0xf35521b4,                             //vbsl          d18, d21, d20
1308   0xf22001b0,                             //vorr          d0, d16, d16
1309   0xf22111b1,                             //vorr          d1, d17, d17
1310   0xf22221b2,                             //vorr          d2, d18, d18
1311   0xe12fff13,                             //bx            r3
1312   0x3f306fce,                             //.word         0x3f306fce              (about 0.6892f)
1313   0x3f306fce,                             //.word         0x3f306fce
1314   0xbdca57a8,                             //.word         0xbdca57a8              (about -0.0988f)
1315   0xbdca57a8,                             //.word         0xbdca57a8
1316   0x3ed287c2,                             //.word         0x3ed287c2              (about 0.4112f)
1317   0x3ed287c2,                             //.word         0x3ed287c2
1318   0x41475c29,                             //.word         0x41475c29              (12.46f)
1319   0x41475c29,                             //.word         0x41475c29
1320   0x3b8ce704,                             //.word         0x3b8ce704              (about 0.0043f)
1321   0x3b8ce704,                             //.word         0x3b8ce704
1322 };
1323 
// sk_scale_1_float_vfp4: multiplies d0, d1, d2 and d3 by a single scalar.
// Loads r3 and ip from [r1] and advances r1 by 8.  The 32-bit value at [r3] is
// broadcast to both lanes of d16 and used as the multiplier; then jumps to ip.
1324 CODE const uint32_t sk_scale_1_float_vfp4[] = {
1325   0xe8911008,                             //ldm           r1, {r3, ip}
1326   0xe2811008,                             //add           r1, r1, #8
1327   0xf4e30c9f,                             //vld1.32       {d16[]}, [r3 :32]
1328   0xf3000d90,                             //vmul.f32      d0, d16, d0
1329   0xf3001d91,                             //vmul.f32      d1, d16, d1
1330   0xf3002d92,                             //vmul.f32      d2, d16, d2
1331   0xf3003d93,                             //vmul.f32      d3, d16, d3
1332   0xe12fff1c,                             //bx            ip
1333 };
1334 
// sk_scale_u8_vfp4: multiplies d0-d3 by a per-lane factor read from a byte buffer.
// Loads r3 and ip from [r1] and advances r1 by 8.  r3 points at a pointer; that
// pointer is loaded and offset by r0 bytes, and two bytes are read from the result.
// The two bytes are bounced through a 4-byte stack slot so they can be loaded into
// a NEON lane, widened u8 -> u16 -> u32 (one byte per lane), converted to float and
// multiplied by 0x3b808081 (about 1/255) to give the factor c.  Then d<i> *= c for
// i = 0..3.  sp is restored before the jump to ip.
// The vldr offset is pc-relative to the literal after the bx, so instructions must
// not be added or removed without regenerating the file.
// NOTE(review): presumably the buffer is 8-bit coverage indexed by x — confirm.
1335 CODE const uint32_t sk_scale_u8_vfp4[] = {
1336   0xe24dd004,                             //sub           sp, sp, #4
1337   0xe8911008,                             //ldm           r1, {r3, ip}
1338   0xe2811008,                             //add           r1, r1, #8
1339   0xe5933000,                             //ldr           r3, [r3]
1340   0xe0833000,                             //add           r3, r3, r0
1341   0xe1d330b0,                             //ldrh          r3, [r3]                (two adjacent bytes in one halfword load)
1342   0xe1cd30b0,                             //strh          r3, [sp]
1343   0xe1a0300d,                             //mov           r3, sp
1344   0xf4e3041f,                             //vld1.16       {d16[0]}, [r3 :16]
1345   0xf3c80a30,                             //vmovl.u8      q8, d16
1346   0xf3d00a30,                             //vmovl.u16     q8, d16
1347   0xf3fb06a0,                             //vcvt.f32.u32  d16, d16
1348   0xeddf1b06,                             //vldr          d17, [pc, #24]          (about 1/255)
1349   0xf3400db1,                             //vmul.f32      d16, d16, d17
1350   0xf3000d90,                             //vmul.f32      d0, d16, d0
1351   0xf3001d91,                             //vmul.f32      d1, d16, d1
1352   0xf3002d92,                             //vmul.f32      d2, d16, d2
1353   0xf3003d93,                             //vmul.f32      d3, d16, d3
1354   0xe28dd004,                             //add           sp, sp, #4
1355   0xe12fff1c,                             //bx            ip
1356   0x3b808081,                             //.word         0x3b808081              (about 1/255)
1357   0x3b808081,                             //.word         0x3b808081
1358 };
1359 
// sk_lerp_1_float_vfp4: d<i> = d<i+4> + (d<i> - d<i+4>) * t for i = 0..3, lane-wise,
// where t is a single 32-bit float.
// Loads r3 and ip from [r1] and advances r1 by 8; t is read from [r3] and broadcast
// to both lanes of d20.  d4-d7 are left unchanged.  Jumps to ip at the end.
// NOTE(review): presumably d0-d3 are source and d4-d7 destination r,g,b,a, so this
// interpolates from destination towards source by t — confirm.
1360 CODE const uint32_t sk_lerp_1_float_vfp4[] = {
1361   0xe8911008,                             //ldm           r1, {r3, ip}
1362   0xf2600d04,                             //vsub.f32      d16, d0, d4
1363   0xf2611d05,                             //vsub.f32      d17, d1, d5
1364   0xf2622d06,                             //vsub.f32      d18, d2, d6
1365   0xe2811008,                             //add           r1, r1, #8
1366   0xf2633d07,                             //vsub.f32      d19, d3, d7
1367   0xf4e34c9f,                             //vld1.32       {d20[]}, [r3 :32]
1368   0xf2240114,                             //vorr          d0, d4, d4
1369   0xf2251115,                             //vorr          d1, d5, d5
1370   0xf2262116,                             //vorr          d2, d6, d6
1371   0xf2273117,                             //vorr          d3, d7, d7
1372   0xf2000cb4,                             //vfma.f32      d0, d16, d20
1373   0xf2011cb4,                             //vfma.f32      d1, d17, d20
1374   0xf2022cb4,                             //vfma.f32      d2, d18, d20
1375   0xf2033cb4,                             //vfma.f32      d3, d19, d20
1376   0xe12fff1c,                             //bx            ip
1377 };
1378 
// sk_lerp_u8_vfp4: d<i> = d<i+4> + (d<i> - d<i+4>) * c for i = 0..3, lane-wise,
// where c is a per-lane factor read from a byte buffer.
// Loads r3 and ip from [r1] and advances r1 by 8.  r3 points at a pointer; that
// pointer is loaded and offset by r0 bytes, and two bytes are read from the result.
// The two bytes are bounced through a 4-byte stack slot so they can be loaded into
// a NEON lane, widened u8 -> u16 -> u32 (one byte per lane), converted to float and
// multiplied by 0x3b808081 (about 1/255) to give c.  d4-d7 are left unchanged and
// sp is restored before the jump to ip.
// The vldr offset is pc-relative to the literal after the bx, so instructions must
// not be added or removed without regenerating the file.
// NOTE(review): presumably the buffer is 8-bit coverage indexed by x, d0-d3 are
// source and d4-d7 destination r,g,b,a — confirm.
1379 CODE const uint32_t sk_lerp_u8_vfp4[] = {
1380   0xe24dd004,                             //sub           sp, sp, #4
1381   0xe8911008,                             //ldm           r1, {r3, ip}
1382   0xf2602d04,                             //vsub.f32      d18, d0, d4
1383   0xf2623d06,                             //vsub.f32      d19, d2, d6
1384   0xf2634d07,                             //vsub.f32      d20, d3, d7
1385   0xe2811008,                             //add           r1, r1, #8
1386   0xe5933000,                             //ldr           r3, [r3]
1387   0xf2240114,                             //vorr          d0, d4, d4
1388   0xf2262116,                             //vorr          d2, d6, d6
1389   0xe0833000,                             //add           r3, r3, r0
1390   0xf2273117,                             //vorr          d3, d7, d7
1391   0xe1d330b0,                             //ldrh          r3, [r3]                (two adjacent bytes in one halfword load)
1392   0xe1cd30b0,                             //strh          r3, [sp]
1393   0xe1a0300d,                             //mov           r3, sp
1394   0xf4e3041f,                             //vld1.16       {d16[0]}, [r3 :16]
1395   0xf3c80a30,                             //vmovl.u8      q8, d16
1396   0xf3d00a30,                             //vmovl.u16     q8, d16
1397   0xf3fb06a0,                             //vcvt.f32.u32  d16, d16
1398   0xeddf1b08,                             //vldr          d17, [pc, #32]          (about 1/255)
1399   0xf3400db1,                             //vmul.f32      d16, d16, d17
1400   0xf2611d05,                             //vsub.f32      d17, d1, d5             (d17 is free again, reused for d1 - d5)
1401   0xf2251115,                             //vorr          d1, d5, d5
1402   0xf2020cb0,                             //vfma.f32      d0, d18, d16
1403   0xf2011cb0,                             //vfma.f32      d1, d17, d16
1404   0xf2032cb0,                             //vfma.f32      d2, d19, d16
1405   0xf2043cb0,                             //vfma.f32      d3, d20, d16
1406   0xe28dd004,                             //add           sp, sp, #4
1407   0xe12fff1c,                             //bx            ip
1408   0x3b808081,                             //.word         0x3b808081              (about 1/255)
1409   0x3b808081,                             //.word         0x3b808081
1410 };
1411 
// sk_lerp_565_vfp4: 32-bit ARM (NEON/VFPv4) machine code stored as raw instruction words.
// Loads r3 and ip from the two words at r1, dereferences r3 once more to get a base
// pointer, and reads 32 bits (two 16-bit values) at base + r0*2. Each 16-bit value is
// split into its 0xf800, 0x07e0 and 0x001f bit fields; each field is converted to float
// and scaled by 1/63488, 1/2016 and 1/31 respectively. The results are used as
// per-register interpolation weights:
//   d0 = d4 + (d0 - d4) * w_f800
//   d1 = d5 + (d1 - d5) * w_07e0
//   d2 = d6 + (d2 - d6) * w_001f
//   d3 = 1.0
// r1 is advanced by 8 and control leaves through `bx ip`.
// A 4-byte stack slot is used to move the loaded word into a NEON lane.
// NOTE(review): d0-d3 / d4-d7 presumably hold src / dst r,g,b,a for two pixels — confirm.
1412 CODE const uint32_t sk_lerp_565_vfp4[] = {
1413   0xe24dd004,                             //sub           sp, sp, #4
1414   0xe8911008,                             //ldm           r1, {r3, ip}
1415   0xf3c72218,                             //vmov.i32      d18, #63488
1416   0xf2c1101f,                             //vmov.i32      d17, #31
1417   0xf2603d04,                             //vsub.f32      d19, d0, d4
1418   0xe2811008,                             //add           r1, r1, #8
1419   0xe5933000,                             //ldr           r3, [r3]
1420   0xf2616d05,                             //vsub.f32      d22, d1, d5
1421   0xf2240114,                             //vorr          d0, d4, d4
1422   0xf2251115,                             //vorr          d1, d5, d5
1423   0xe7933080,                             //ldr           r3, [r3, r0, lsl #1]
1424   0xf2873f10,                             //vmov.f32      d3, #1
1425   0xe58d3000,                             //str           r3, [sp]
1426   0xe1a0300d,                             //mov           r3, sp
1427   0xf4e3083f,                             //vld1.32       {d16[0]}, [r3 :32]
1428   0xe3a03e7e,                             //mov           r3, #2016
1429   0xf3d04a30,                             //vmovl.u16     q10, d16
1430   0xee803b90,                             //vdup.32       d16, r3
1431   0xf24421b2,                             //vand          d18, d20, d18
1432   0xf24411b1,                             //vand          d17, d20, d17
1433   0xeddf5b12,                             //vldr          d21, [pc, #72]
1434   0xf24401b0,                             //vand          d16, d20, d16
1435   0xeddf4b0e,                             //vldr          d20, [pc, #56]
1436   0xf3fb2622,                             //vcvt.f32.s32  d18, d18
1437   0xf3fb0620,                             //vcvt.f32.s32  d16, d16
1438   0xf3fb1621,                             //vcvt.f32.s32  d17, d17
1439   0xf3422db4,                             //vmul.f32      d18, d18, d20
1440   0xeddf4b0d,                             //vldr          d20, [pc, #52]
1441   0xf3400db5,                             //vmul.f32      d16, d16, d21
1442   0xf2625d06,                             //vsub.f32      d21, d2, d6
1443   0xf3411db4,                             //vmul.f32      d17, d17, d20
1444   0xf2262116,                             //vorr          d2, d6, d6
1445   0xf2030cb2,                             //vfma.f32      d0, d19, d18
1446   0xf2061cb0,                             //vfma.f32      d1, d22, d16
1447   0xf2052cb1,                             //vfma.f32      d2, d21, d17
1448   0xe28dd004,                             //add           sp, sp, #4
1449   0xe12fff1c,                             //bx            ip
1450   0xe320f000,                             //nop           {0}
  // Literal pool read by the pc-relative vldr instructions above (each value duplicated to fill a d register).
1451   0x37842108,                             //.word         0x37842108  (float ~1/63488)
1452   0x37842108,                             //.word         0x37842108
1453   0x3a020821,                             //.word         0x3a020821  (float ~1/2016)
1454   0x3a020821,                             //.word         0x3a020821
1455   0x3d042108,                             //.word         0x3d042108  (float ~1/31)
1456   0x3d042108,                             //.word         0x3d042108
1457 };
1458 
// sk_load_tables_vfp4: 32-bit ARM (NEON/VFPv4) machine code stored as raw instruction words.
// Loads r3 and ip from the two words at r1. From r3 it reads four pointers:
// lr = [r3] (source base) and r4, r5, r7 = the following three words (lookup tables).
// Two 32-bit values are loaded from lr + r0*4. For each value:
//   bits  0-7  index 4-byte entries of the r4 table -> s0/s1 (d0)
//   bits  8-15 index 4-byte entries of the r5 table -> s2/s3 (d1)
//   bits 16-23 index 4-byte entries of the r7 table -> s4/s5 (d2)
//   bits 24-31 are converted to float and multiplied by ~1/255 -> d3
// r4-r7, fp and lr are saved on entry and restored before `bx ip`; r1 is advanced by 8.
// NOTE(review): the tables presumably hold floats for r, g, b — confirm against the stage source.
1459 CODE const uint32_t sk_load_tables_vfp4[] = {
1460   0xe92d48f0,                             //push          {r4, r5, r6, r7, fp, lr}
1461   0xe8911008,                             //ldm           r1, {r3, ip}
1462   0xf3c7001f,                             //vmov.i32      d16, #255
1463   0xe2811008,                             //add           r1, r1, #8
1464   0xe593e000,                             //ldr           lr, [r3]
1465   0xe99300b0,                             //ldmib         r3, {r4, r5, r7}
1466   0xe08e3100,                             //add           r3, lr, r0, lsl #2
1467   0xedd31b00,                             //vldr          d17, [r3]
1468   0xf24121b0,                             //vand          d18, d17, d16
1469   0xf3f83031,                             //vshr.u32      d19, d17, #8
1470   0xee323b90,                             //vmov.32       r3, d18[1]
1471   0xee126b90,                             //vmov.32       r6, d18[0]
1472   0xf3f02031,                             //vshr.u32      d18, d17, #16
1473   0xf24221b0,                             //vand          d18, d18, d16
1474   0xf24301b0,                             //vand          d16, d19, d16
1475   0xe0843103,                             //add           r3, r4, r3, lsl #2
1476   0xedd30a00,                             //vldr          s1, [r3]
1477   0xe0843106,                             //add           r3, r4, r6, lsl #2
1478   0xee326b90,                             //vmov.32       r6, d18[1]
1479   0xed930a00,                             //vldr          s0, [r3]
1480   0xee303b90,                             //vmov.32       r3, d16[1]
1481   0xee104b90,                             //vmov.32       r4, d16[0]
1482   0xf3e80031,                             //vshr.u32      d16, d17, #24
1483   0xeddf1b0d,                             //vldr          d17, [pc, #52]
1484   0xf3fb0620,                             //vcvt.f32.s32  d16, d16
1485   0xf3003db1,                             //vmul.f32      d3, d16, d17
1486   0xe087e106,                             //add           lr, r7, r6, lsl #2
1487   0xee126b90,                             //vmov.32       r6, d18[0]
1488   0xe0853103,                             //add           r3, r5, r3, lsl #2
1489   0xedde2a00,                             //vldr          s5, [lr]
1490   0xedd31a00,                             //vldr          s3, [r3]
1491   0xe0853104,                             //add           r3, r5, r4, lsl #2
1492   0xed931a00,                             //vldr          s2, [r3]
1493   0xe0873106,                             //add           r3, r7, r6, lsl #2
1494   0xed932a00,                             //vldr          s4, [r3]
1495   0xe8bd48f0,                             //pop           {r4, r5, r6, r7, fp, lr}
1496   0xe12fff1c,                             //bx            ip
1497   0xe320f000,                             //nop           {0}
  // Literal read by the pc-relative vldr above (duplicated to fill a d register).
1498   0x3b808081,                             //.word         0x3b808081  (float ~1/255)
1499   0x3b808081,                             //.word         0x3b808081
1500 };
1501 
// sk_load_a8_vfp4: 32-bit ARM (NEON/VFPv4) machine code stored as raw instruction words.
// Loads r3 and ip from the two words at r1, dereferences r3 once more to get a base
// pointer, and reads 16 bits (two bytes) at base + r0. The two bytes are widened to
// 32-bit lanes, converted to float and multiplied by ~1/255 into d3.
// d0, d1 and d2 are set to zero. r1 is advanced by 8 and control leaves through `bx ip`.
// A 4-byte stack slot is used to move the loaded halfword into a NEON lane.
1502 CODE const uint32_t sk_load_a8_vfp4[] = {
1503   0xe24dd004,                             //sub           sp, sp, #4
1504   0xe8911008,                             //ldm           r1, {r3, ip}
1505   0xe2811008,                             //add           r1, r1, #8
1506   0xf2800010,                             //vmov.i32      d0, #0
1507   0xf2801010,                             //vmov.i32      d1, #0
1508   0xe5933000,                             //ldr           r3, [r3]
1509   0xf2802010,                             //vmov.i32      d2, #0
1510   0xe0833000,                             //add           r3, r3, r0
1511   0xe1d330b0,                             //ldrh          r3, [r3]
1512   0xe1cd30b0,                             //strh          r3, [sp]
1513   0xe1a0300d,                             //mov           r3, sp
1514   0xf4e3041f,                             //vld1.16       {d16[0]}, [r3 :16]
1515   0xf3c80a30,                             //vmovl.u8      q8, d16
1516   0xf3d00a30,                             //vmovl.u16     q8, d16
1517   0xf3fb06a0,                             //vcvt.f32.u32  d16, d16
1518   0xeddf1b03,                             //vldr          d17, [pc, #12]
1519   0xf3003db1,                             //vmul.f32      d3, d16, d17
1520   0xe28dd004,                             //add           sp, sp, #4
1521   0xe12fff1c,                             //bx            ip
1522   0xe320f000,                             //nop           {0}
  // Literal read by the pc-relative vldr above (duplicated to fill a d register).
1523   0x3b808081,                             //.word         0x3b808081  (float ~1/255)
1524   0x3b808081,                             //.word         0x3b808081
1525 };
1526 
// sk_store_a8_vfp4: 32-bit ARM (NEON/VFPv4) machine code stored as raw instruction words.
// Computes d3 * 255 + 0.5 per lane, converts to unsigned 32-bit integers, and stores the
// low byte of each of the two lanes at base + r0 and base + r0 + 1, where base is
// obtained by dereferencing the first word at r1 twice.
// Then loads ip from [r1, #4], advances r1 by 8 and leaves through `bx ip`.
// fp/lr are pushed and popped because lr is used as a scratch register.
1527 CODE const uint32_t sk_store_a8_vfp4[] = {
1528   0xe92d4800,                             //push          {fp, lr}
1529   0xeddf0b0d,                             //vldr          d16, [pc, #52]
1530   0xf2c3161f,                             //vmov.i32      d17, #1056964608  (0x3f000000, the bit pattern of 0.5f)
1531   0xf2431c30,                             //vfma.f32      d17, d3, d16
1532   0xe5913000,                             //ldr           r3, [r1]
1533   0xe5933000,                             //ldr           r3, [r3]
1534   0xf3fb07a1,                             //vcvt.u32.f32  d16, d17
1535   0xee10eb90,                             //vmov.32       lr, d16[0]
1536   0xee30cb90,                             //vmov.32       ip, d16[1]
1537   0xe7e3e000,                             //strb          lr, [r3, r0]!
1538   0xe5c3c001,                             //strb          ip, [r3, #1]
1539   0xe2813008,                             //add           r3, r1, #8
1540   0xe591c004,                             //ldr           ip, [r1, #4]
1541   0xe1a01003,                             //mov           r1, r3
1542   0xe8bd4800,                             //pop           {fp, lr}
1543   0xe12fff1c,                             //bx            ip
  // Literal read by the pc-relative vldr above (duplicated to fill a d register).
1544   0x437f0000,                             //.word         0x437f0000  (float 255.0)
1545   0x437f0000,                             //.word         0x437f0000
1546 };
1547 
// sk_load_565_vfp4: 32-bit ARM (NEON/VFPv4) machine code stored as raw instruction words.
// Loads r3 and ip from the two words at r1, dereferences r3 once more to get a base
// pointer, and reads 32 bits (two 16-bit values) at base + r0*2. Each 16-bit value is
// widened to a 32-bit lane and split by mask:
//   d0 = float(v & 0xf800) * ~1/63488
//   d1 = float(v & 0x07e0) * ~1/2016
//   d2 = float(v & 0x001f) * ~1/31
//   d3 = 1.0
// r1 is advanced by 8 and control leaves through `bx ip`.
// A 4-byte stack slot is used to move the loaded word into a NEON lane.
1548 CODE const uint32_t sk_load_565_vfp4[] = {
1549   0xe24dd004,                             //sub           sp, sp, #4
1550   0xe8911008,                             //ldm           r1, {r3, ip}
1551   0xf2c1101f,                             //vmov.i32      d17, #31
1552   0xf3c72218,                             //vmov.i32      d18, #63488
1553   0xeddf3b16,                             //vldr          d19, [pc, #88]
1554   0xe2811008,                             //add           r1, r1, #8
1555   0xe5933000,                             //ldr           r3, [r3]
1556   0xf2873f10,                             //vmov.f32      d3, #1
1557   0xe7933080,                             //ldr           r3, [r3, r0, lsl #1]
1558   0xe58d3000,                             //str           r3, [sp]
1559   0xe1a0300d,                             //mov           r3, sp
1560   0xf4e3083f,                             //vld1.32       {d16[0]}, [r3 :32]
1561   0xe3a03e7e,                             //mov           r3, #2016
1562   0xf3d04a30,                             //vmovl.u16     q10, d16
1563   0xee803b90,                             //vdup.32       d16, r3
1564   0xf24411b1,                             //vand          d17, d20, d17
1565   0xeddf5b0e,                             //vldr          d21, [pc, #56]
1566   0xf24421b2,                             //vand          d18, d20, d18
1567   0xf24401b0,                             //vand          d16, d20, d16
1568   0xeddf4b09,                             //vldr          d20, [pc, #36]
1569   0xf3fb2622,                             //vcvt.f32.s32  d18, d18
1570   0xf3fb0620,                             //vcvt.f32.s32  d16, d16
1571   0xf3fb1621,                             //vcvt.f32.s32  d17, d17
1572   0xf3020db3,                             //vmul.f32      d0, d18, d19
1573   0xf3001db4,                             //vmul.f32      d1, d16, d20
1574   0xf3012db5,                             //vmul.f32      d2, d17, d21
1575   0xe28dd004,                             //add           sp, sp, #4
1576   0xe12fff1c,                             //bx            ip
  // Literal pool read by the pc-relative vldr instructions above (each value duplicated to fill a d register).
1577   0x37842108,                             //.word         0x37842108  (float ~1/63488)
1578   0x37842108,                             //.word         0x37842108
1579   0x3a020821,                             //.word         0x3a020821  (float ~1/2016)
1580   0x3a020821,                             //.word         0x3a020821
1581   0x3d042108,                             //.word         0x3d042108  (float ~1/31)
1582   0x3d042108,                             //.word         0x3d042108
1583 };
1584 
// sk_store_565_vfp4: 32-bit ARM (NEON/VFPv4) machine code stored as raw instruction words.
// Per lane, computes and converts to unsigned integers:
//   hi  = d0 * 31 + 0.5
//   mid = d1 * 63 + 0.5
//   lo  = d2 * 31 + 0.5
// then packs (hi << 11) | (mid << 5) | lo, narrows the two 32-bit lanes to two adjacent
// 16-bit values (vuzp.16) and stores them as one 32-bit write at base + r0*2, where
// base is obtained by dereferencing the first word at r1 twice.
// Then loads ip from [r1, #4], advances r1 by 8 and leaves through `bx ip`.
1585 CODE const uint32_t sk_store_565_vfp4[] = {
1586   0xf2c30f1f,                             //vmov.f32      d16, #31
1587   0xeddf1b15,                             //vldr          d17, [pc, #84]
1588   0xf2c3361f,                             //vmov.i32      d19, #1056964608  (0x3f000000, the bit pattern of 0.5f)
1589   0xe5913000,                             //ldr           r3, [r1]
1590   0xf2413c31,                             //vfma.f32      d19, d1, d17
1591   0xf2c3161f,                             //vmov.i32      d17, #1056964608
1592   0xf2401c30,                             //vfma.f32      d17, d0, d16
1593   0xe5933000,                             //ldr           r3, [r3]
1594   0xf2c3261f,                             //vmov.i32      d18, #1056964608
1595   0xf2422c30,                             //vfma.f32      d18, d2, d16
1596   0xe0833080,                             //add           r3, r3, r0, lsl #1
1597   0xf3fb07a3,                             //vcvt.u32.f32  d16, d19
1598   0xf3fb17a1,                             //vcvt.u32.f32  d17, d17
1599   0xf3fb27a2,                             //vcvt.u32.f32  d18, d18
1600   0xf2e50530,                             //vshl.s32      d16, d16, #5
1601   0xf2eb1531,                             //vshl.s32      d17, d17, #11
1602   0xf26001b1,                             //vorr          d16, d16, d17
1603   0xf26001b2,                             //vorr          d16, d16, d18
1604   0xf3f60121,                             //vuzp.16       d16, d17
1605   0xf4c3080f,                             //vst1.32       {d16[0]}, [r3]
1606   0xe2813008,                             //add           r3, r1, #8
1607   0xe591c004,                             //ldr           ip, [r1, #4]
1608   0xe1a01003,                             //mov           r1, r3
1609   0xe12fff1c,                             //bx            ip
  // Literal read by the pc-relative vldr above (duplicated to fill a d register).
1610   0x427c0000,                             //.word         0x427c0000  (float 63.0)
1611   0x427c0000,                             //.word         0x427c0000
1612 };
1613 
// sk_load_8888_vfp4: 32-bit ARM (NEON/VFPv4) machine code stored as raw instruction words.
// Loads r3 and ip from the two words at r1, dereferences r3 once more to get a base
// pointer, and reads two 32-bit values at base + r0*4. Per value:
//   d0 = float(bits  0-7 ) * ~1/255
//   d1 = float(bits  8-15) * ~1/255
//   d2 = float(bits 16-23) * ~1/255
//   d3 = float(bits 24-31) * ~1/255
// r1 is advanced by 8 and control leaves through `bx ip`.
1614 CODE const uint32_t sk_load_8888_vfp4[] = {
1615   0xe8911008,                             //ldm           r1, {r3, ip}
1616   0xf3c7001f,                             //vmov.i32      d16, #255
1617   0xe2811008,                             //add           r1, r1, #8
1618   0xe5933000,                             //ldr           r3, [r3]
1619   0xe0833100,                             //add           r3, r3, r0, lsl #2
1620   0xedd31b00,                             //vldr          d17, [r3]
1621   0xf24121b0,                             //vand          d18, d17, d16
1622   0xf3f83031,                             //vshr.u32      d19, d17, #8
1623   0xf3e84031,                             //vshr.u32      d20, d17, #24
1624   0xf3f01031,                             //vshr.u32      d17, d17, #16
1625   0xf24331b0,                             //vand          d19, d19, d16
1626   0xf24101b0,                             //vand          d16, d17, d16
1627   0xeddf1b08,                             //vldr          d17, [pc, #32]
1628   0xf3fb2622,                             //vcvt.f32.s32  d18, d18
1629   0xf3fb4624,                             //vcvt.f32.s32  d20, d20
1630   0xf3fb3623,                             //vcvt.f32.s32  d19, d19
1631   0xf3fb0620,                             //vcvt.f32.s32  d16, d16
1632   0xf3020db1,                             //vmul.f32      d0, d18, d17
1633   0xf3043db1,                             //vmul.f32      d3, d20, d17
1634   0xf3031db1,                             //vmul.f32      d1, d19, d17
1635   0xf3002db1,                             //vmul.f32      d2, d16, d17
1636   0xe12fff1c,                             //bx            ip
  // Literal read by the pc-relative vldr above (duplicated to fill a d register).
1637   0x3b808081,                             //.word         0x3b808081  (float ~1/255)
1638   0x3b808081,                             //.word         0x3b808081
1639 };
1640 
// sk_store_8888_vfp4: 32-bit ARM (NEON/VFPv4) machine code stored as raw instruction words.
// Per lane, computes dN * 255 + 0.5 for d0..d3, converts each to an unsigned integer
// and packs them as d0 | (d1 << 8) | (d2 << 16) | (d3 << 24). The two packed 32-bit
// lanes are stored at base + r0*4, where base is obtained by dereferencing the first
// word at r1 twice.
// Then loads ip from [r1, #4], advances r1 by 8 and leaves through `bx ip`.
1641 CODE const uint32_t sk_store_8888_vfp4[] = {
1642   0xeddf0b1a,                             //vldr          d16, [pc, #104]
1643   0xf2c3261f,                             //vmov.i32      d18, #1056964608  (0x3f000000, the bit pattern of 0.5f)
1644   0xf2412c30,                             //vfma.f32      d18, d1, d16
1645   0xe5913000,                             //ldr           r3, [r1]
1646   0xf2c3361f,                             //vmov.i32      d19, #1056964608
1647   0xf2c3161f,                             //vmov.i32      d17, #1056964608
1648   0xf2423c30,                             //vfma.f32      d19, d2, d16
1649   0xe5933000,                             //ldr           r3, [r3]
1650   0xf2c3461f,                             //vmov.i32      d20, #1056964608
1651   0xf2401c30,                             //vfma.f32      d17, d0, d16
1652   0xe0833100,                             //add           r3, r3, r0, lsl #2
1653   0xf2434c30,                             //vfma.f32      d20, d3, d16
1654   0xf3fb07a2,                             //vcvt.u32.f32  d16, d18
1655   0xf3fb27a3,                             //vcvt.u32.f32  d18, d19
1656   0xf3fb17a1,                             //vcvt.u32.f32  d17, d17
1657   0xf3fb37a4,                             //vcvt.u32.f32  d19, d20
1658   0xf2e80530,                             //vshl.s32      d16, d16, #8
1659   0xf2f02532,                             //vshl.s32      d18, d18, #16
1660   0xf26001b1,                             //vorr          d16, d16, d17
1661   0xf2f81533,                             //vshl.s32      d17, d19, #24
1662   0xf26001b2,                             //vorr          d16, d16, d18
1663   0xf26001b1,                             //vorr          d16, d16, d17
1664   0xedc30b00,                             //vstr          d16, [r3]
1665   0xe2813008,                             //add           r3, r1, #8
1666   0xe591c004,                             //ldr           ip, [r1, #4]
1667   0xe1a01003,                             //mov           r1, r3
1668   0xe12fff1c,                             //bx            ip
1669   0xe320f000,                             //nop           {0}
  // Literal read by the pc-relative vldr above (duplicated to fill a d register).
1670   0x437f0000,                             //.word         0x437f0000  (float 255.0)
1671   0x437f0000,                             //.word         0x437f0000
1672 };
1673 
// sk_load_f16_vfp4: 32-bit ARM (NEON/VFPv4) machine code stored as raw instruction words.
// Loads r3 and ip from the two words at r1, dereferences r3 once more to get a base
// pointer, and reads eight 16-bit half-floats (16 bytes) at base + r0*8 with a
// de-interleaving vld2.16. After widening to 32-bit floats and two vtrn.32 transposes,
// d0..d3 hold the 1st..4th element of each of the two consecutive 4-element groups.
// r1 is advanced by 8 and control leaves through `bx ip`.
1674 CODE const uint32_t sk_load_f16_vfp4[] = {
1675   0xe8911008,                             //ldm           r1, {r3, ip}
1676   0xe2811008,                             //add           r1, r1, #8
1677   0xe5933000,                             //ldr           r3, [r3]
1678   0xe0833180,                             //add           r3, r3, r0, lsl #3
1679   0xf463084f,                             //vld2.16       {d16-d17}, [r3]
1680   0xf3f62720,                             //vcvt.f32.f16  q9, d16
1681   0xf3f60721,                             //vcvt.f32.f16  q8, d17
1682   0xf22201b2,                             //vorr          d0, d18, d18
1683   0xf22011b0,                             //vorr          d1, d16, d16
1684   0xf3ba00a3,                             //vtrn.32       d0, d19
1685   0xf22321b3,                             //vorr          d2, d19, d19
1686   0xf3ba10a1,                             //vtrn.32       d1, d17
1687   0xf22131b1,                             //vorr          d3, d17, d17
1688   0xe12fff1c,                             //bx            ip
1689 };
1690 
// sk_store_f16_vfp4: 32-bit ARM (NEON/VFPv4) machine code stored as raw instruction words.
// Copies d0..d3 into scratch registers, transposes them with vtrn.32, narrows the
// 32-bit floats to 16-bit half-floats and writes eight half-floats (16 bytes) with an
// interleaving vst2.16 at base + r0*8, so that memory holds the d0,d1,d2,d3 values of
// lane 0 followed by those of lane 1. base is obtained by dereferencing the first word
// at r1 twice. d0..d3 themselves are not modified.
// Then loads ip from [r1, #4], advances r1 by 8 and leaves through `bx ip`.
1691 CODE const uint32_t sk_store_f16_vfp4[] = {
1692   0xeef00b41,                             //vmov.f64      d16, d1
1693   0xeef03b42,                             //vmov.f64      d19, d2
1694   0xf2631113,                             //vorr          d17, d3, d3
1695   0xf2602110,                             //vorr          d18, d0, d0
1696   0xf3fa00a1,                             //vtrn.32       d16, d17
1697   0xf3f61620,                             //vcvt.f16.f32  d17, q8
1698   0xf3fa20a3,                             //vtrn.32       d18, d19
1699   0xe5913000,                             //ldr           r3, [r1]
1700   0xf3f60622,                             //vcvt.f16.f32  d16, q9
1701   0xe5933000,                             //ldr           r3, [r3]
1702   0xe0833180,                             //add           r3, r3, r0, lsl #3
1703   0xf443084f,                             //vst2.16       {d16-d17}, [r3]
1704   0xe2813008,                             //add           r3, r1, #8
1705   0xe591c004,                             //ldr           ip, [r1, #4]
1706   0xe1a01003,                             //mov           r1, r3
1707   0xe12fff1c,                             //bx            ip
1708 };
1709 
// sk_store_f32_vfp4: 32-bit ARM (NEON/VFPv4) machine code stored as raw instruction words.
// Writes d0..d3 with an interleaving vst4.32 (32 bytes: the d0,d1,d2,d3 values of lane 0
// followed by those of lane 1) at base + r0*16, where base is obtained by dereferencing
// the first word at r1 twice.
// Then loads ip from [r1, #4], advances r1 by 8 and leaves through `bx ip`.
1710 CODE const uint32_t sk_store_f32_vfp4[] = {
1711   0xe5913000,                             //ldr           r3, [r1]
1712   0xe5933000,                             //ldr           r3, [r3]
1713   0xe0833200,                             //add           r3, r3, r0, lsl #4
1714   0xf403008f,                             //vst4.32       {d0-d3}, [r3]
1715   0xe2813008,                             //add           r3, r1, #8
1716   0xe591c004,                             //ldr           ip, [r1, #4]
1717   0xe1a01003,                             //mov           r1, r3
1718   0xe12fff1c,                             //bx            ip
1719 };
1720 
// sk_clamp_x_vfp4: 32-bit ARM (NEON/VFPv4) machine code stored as raw instruction words.
// Loads r3 and ip from the two words at r1 and broadcasts the 32-bit value at [r3]
// into both lanes of d18. Computes d0 = min(max(0, d0), limit'), where limit' is the
// loaded value with 0xffffffff added to its integer bit pattern (i.e. bits minus one).
// NOTE(review): for a positive float this yields the next representable value below
// it, presumably to keep the result strictly below the limit — confirm.
// r1 is advanced by 8 and control leaves through `bx ip`.
1721 CODE const uint32_t sk_clamp_x_vfp4[] = {
1722   0xe8911008,                             //ldm           r1, {r3, ip}
1723   0xf2c00010,                             //vmov.i32      d16, #0
1724   0xf3c71e1f,                             //vmov.i8       d17, #255
1725   0xf2400f80,                             //vmax.f32      d16, d16, d0
1726   0xe2811008,                             //add           r1, r1, #8
1727   0xf4e32c9f,                             //vld1.32       {d18[]}, [r3 :32]
1728   0xf26218a1,                             //vadd.i32      d17, d18, d17
1729   0xf2200fa1,                             //vmin.f32      d0, d16, d17
1730   0xe12fff1c,                             //bx            ip
1731 };
1732 
// sk_clamp_y_vfp4: 32-bit ARM (NEON/VFPv4) machine code stored as raw instruction words.
// Same sequence as sk_clamp_x_vfp4 but operating on d1 instead of d0:
// loads r3 and ip from the two words at r1, broadcasts the 32-bit value at [r3] into
// d18, and computes d1 = min(max(0, d1), limit'), where limit' is the loaded value
// with 0xffffffff added to its integer bit pattern (i.e. bits minus one).
// r1 is advanced by 8 and control leaves through `bx ip`.
1733 CODE const uint32_t sk_clamp_y_vfp4[] = {
1734   0xe8911008,                             //ldm           r1, {r3, ip}
1735   0xf2c00010,                             //vmov.i32      d16, #0
1736   0xf3c71e1f,                             //vmov.i8       d17, #255
1737   0xf2400f81,                             //vmax.f32      d16, d16, d1
1738   0xe2811008,                             //add           r1, r1, #8
1739   0xf4e32c9f,                             //vld1.32       {d18[]}, [r3 :32]
1740   0xf26218a1,                             //vadd.i32      d17, d18, d17
1741   0xf2201fa1,                             //vmin.f32      d1, d16, d17
1742   0xe12fff1c,                             //bx            ip
1743 };
1744 
// sk_repeat_x_vfp4: 32-bit ARM (NEON/VFPv4) machine code stored as raw instruction words.
// Loads r3 and ip from the two words at r1 and reads a float `limit` from [r3] into s16.
// Per lane it computes
//   q  = d0 / limit                       (two scalar vdiv into s18/s19 = d9)
//   f  = trunc(q) - (trunc(q) > q ? 1.0 : 0.0)   (truncate, then correct downward)
//   d0 = min(d0 - f * limit, limit')
// where limit' is `limit` with 0xffffffff added to its integer bit pattern.
// d8-d9 are saved on entry and restored before `bx ip`; r1 is advanced by 8.
1745 CODE const uint32_t sk_repeat_x_vfp4[] = {
1746   0xed2d8b04,                             //vpush         {d8-d9}
1747   0xe8911008,                             //ldm           r1, {r3, ip}
1748   0xf2c02010,                             //vmov.i32      d18, #0
1749   0xe2811008,                             //add           r1, r1, #8
1750   0xeddf3b10,                             //vldr          d19, [pc, #64]
1751   0xed938a00,                             //vldr          s16, [r3]
1752   0xeec09a88,                             //vdiv.f32      s19, s1, s16
1753   0xee809a08,                             //vdiv.f32      s18, s0, s16
1754   0xf3fb0709,                             //vcvt.s32.f32  d16, d9
1755   0xf3fb0620,                             //vcvt.f32.s32  d16, d16
1756   0xf3601e89,                             //vcgt.f32      d17, d16, d9
1757   0xf35311b2,                             //vbsl          d17, d19, d18
1758   0xf3f42c08,                             //vdup.32       d18, d8[0]
1759   0xf2600da1,                             //vsub.f32      d16, d16, d17
1760   0xf3c71e1f,                             //vmov.i8       d17, #255
1761   0xf26218a1,                             //vadd.i32      d17, d18, d17
1762   0xf2e009c8,                             //vmul.f32      d16, d16, d8[0]
1763   0xf2600d20,                             //vsub.f32      d16, d0, d16
1764   0xf2200fa1,                             //vmin.f32      d0, d16, d17
1765   0xecbd8b04,                             //vpop          {d8-d9}
1766   0xe12fff1c,                             //bx            ip
1767   0xe320f000,                             //nop           {0}
  // Literal read by the pc-relative vldr above (duplicated to fill a d register).
1768   0x3f800000,                             //.word         0x3f800000  (float 1.0)
1769   0x3f800000,                             //.word         0x3f800000
1770 };
1771 
// sk_repeat_y_vfp4: 32-bit ARM (NEON/VFPv4) machine code stored as raw instruction words.
// Same sequence as sk_repeat_x_vfp4 but operating on d1 (s2/s3) instead of d0:
// loads r3 and ip from the two words at r1, reads a float `limit` from [r3], and per lane
//   q  = d1 / limit
//   f  = trunc(q) - (trunc(q) > q ? 1.0 : 0.0)
//   d1 = min(d1 - f * limit, limit')
// where limit' is `limit` with 0xffffffff added to its integer bit pattern.
// d8-d9 are saved on entry and restored before `bx ip`; r1 is advanced by 8.
1772 CODE const uint32_t sk_repeat_y_vfp4[] = {
1773   0xed2d8b04,                             //vpush         {d8-d9}
1774   0xe8911008,                             //ldm           r1, {r3, ip}
1775   0xf2c02010,                             //vmov.i32      d18, #0
1776   0xe2811008,                             //add           r1, r1, #8
1777   0xeddf3b10,                             //vldr          d19, [pc, #64]
1778   0xed938a00,                             //vldr          s16, [r3]
1779   0xeec19a88,                             //vdiv.f32      s19, s3, s16
1780   0xee819a08,                             //vdiv.f32      s18, s2, s16
1781   0xf3fb0709,                             //vcvt.s32.f32  d16, d9
1782   0xf3fb0620,                             //vcvt.f32.s32  d16, d16
1783   0xf3601e89,                             //vcgt.f32      d17, d16, d9
1784   0xf35311b2,                             //vbsl          d17, d19, d18
1785   0xf3f42c08,                             //vdup.32       d18, d8[0]
1786   0xf2600da1,                             //vsub.f32      d16, d16, d17
1787   0xf3c71e1f,                             //vmov.i8       d17, #255
1788   0xf26218a1,                             //vadd.i32      d17, d18, d17
1789   0xf2e009c8,                             //vmul.f32      d16, d16, d8[0]
1790   0xf2610d20,                             //vsub.f32      d16, d1, d16
1791   0xf2201fa1,                             //vmin.f32      d1, d16, d17
1792   0xecbd8b04,                             //vpop          {d8-d9}
1793   0xe12fff1c,                             //bx            ip
1794   0xe320f000,                             //nop           {0}
  // Literal read by the pc-relative vldr above (duplicated to fill a d register).
1795   0x3f800000,                             //.word         0x3f800000  (float 1.0)
1796   0x3f800000,                             //.word         0x3f800000
1797 };
1798 
// sk_mirror_x_vfp4: 32-bit ARM (NEON/VFPv4) machine code stored as raw instruction words.
// Loads r3 and ip from the two words at r1 and reads a float `limit` from [r3].
// Per lane it computes
//   t  = d0 - limit
//   q  = t / (2*limit)
//   f  = trunc(q) - (trunc(q) > q ? 1.0 : 0.0)   (truncate, then correct downward)
//   d0 = min(|t - f * (2*limit) - limit|, limit')
// where limit' is `limit` with 0xffffffff added to its integer bit pattern.
// s16 first holds `limit` and is later overwritten by the division result, after
// `limit` has been broadcast into d16.
// d8-d9 are saved on entry and restored before `bx ip`; r1 is advanced by 8.
1799 CODE const uint32_t sk_mirror_x_vfp4[] = {
1800   0xed2d8b04,                             //vpush         {d8-d9}
1801   0xe8911008,                             //ldm           r1, {r3, ip}
1802   0xf2c03010,                             //vmov.i32      d19, #0
1803   0xe2811008,                             //add           r1, r1, #8
1804   0xeddf4b14,                             //vldr          d20, [pc, #80]
1805   0xed938a00,                             //vldr          s16, [r3]
1806   0xee389a08,                             //vadd.f32      s18, s16, s16
1807   0xf3f40c08,                             //vdup.32       d16, d8[0]
1808   0xf2200d20,                             //vsub.f32      d0, d0, d16
1809   0xeec08a89,                             //vdiv.f32      s17, s1, s18
1810   0xee808a09,                             //vdiv.f32      s16, s0, s18
1811   0xf3fb1708,                             //vcvt.s32.f32  d17, d8
1812   0xf3fb1621,                             //vcvt.f32.s32  d17, d17
1813   0xf3612e88,                             //vcgt.f32      d18, d17, d8
1814   0xf35421b3,                             //vbsl          d18, d20, d19
1815   0xf2611da2,                             //vsub.f32      d17, d17, d18
1816   0xf3c72e1f,                             //vmov.i8       d18, #255
1817   0xf2e119c9,                             //vmul.f32      d17, d17, d9[0]
1818   0xf2601d21,                             //vsub.f32      d17, d0, d17
1819   0xf2611da0,                             //vsub.f32      d17, d17, d16
1820   0xf26008a2,                             //vadd.i32      d16, d16, d18
1821   0xf3f91721,                             //vabs.f32      d17, d17
1822   0xf2210fa0,                             //vmin.f32      d0, d17, d16
1823   0xecbd8b04,                             //vpop          {d8-d9}
1824   0xe12fff1c,                             //bx            ip
1825   0xe320f000,                             //nop           {0}
  // Literal read by the pc-relative vldr above (duplicated to fill a d register).
1826   0x3f800000,                             //.word         0x3f800000  (float 1.0)
1827   0x3f800000,                             //.word         0x3f800000
1828 };
1829 
// sk_mirror_y_vfp4: 32-bit ARM (NEON/VFPv4) machine code stored as raw instruction words.
// Same sequence as sk_mirror_x_vfp4 but operating on d1 (s2/s3) instead of d0:
// loads r3 and ip from the two words at r1, reads a float `limit` from [r3], and per lane
//   t  = d1 - limit
//   q  = t / (2*limit)
//   f  = trunc(q) - (trunc(q) > q ? 1.0 : 0.0)
//   d1 = min(|t - f * (2*limit) - limit|, limit')
// where limit' is `limit` with 0xffffffff added to its integer bit pattern.
// d8-d9 are saved on entry and restored before `bx ip`; r1 is advanced by 8.
1830 CODE const uint32_t sk_mirror_y_vfp4[] = {
1831   0xed2d8b04,                             //vpush         {d8-d9}
1832   0xe8911008,                             //ldm           r1, {r3, ip}
1833   0xf2c03010,                             //vmov.i32      d19, #0
1834   0xe2811008,                             //add           r1, r1, #8
1835   0xeddf4b14,                             //vldr          d20, [pc, #80]
1836   0xed938a00,                             //vldr          s16, [r3]
1837   0xee389a08,                             //vadd.f32      s18, s16, s16
1838   0xf3f40c08,                             //vdup.32       d16, d8[0]
1839   0xf2211d20,                             //vsub.f32      d1, d1, d16
1840   0xeec18a89,                             //vdiv.f32      s17, s3, s18
1841   0xee818a09,                             //vdiv.f32      s16, s2, s18
1842   0xf3fb1708,                             //vcvt.s32.f32  d17, d8
1843   0xf3fb1621,                             //vcvt.f32.s32  d17, d17
1844   0xf3612e88,                             //vcgt.f32      d18, d17, d8
1845   0xf35421b3,                             //vbsl          d18, d20, d19
1846   0xf2611da2,                             //vsub.f32      d17, d17, d18
1847   0xf3c72e1f,                             //vmov.i8       d18, #255
1848   0xf2e119c9,                             //vmul.f32      d17, d17, d9[0]
1849   0xf2611d21,                             //vsub.f32      d17, d1, d17
1850   0xf2611da0,                             //vsub.f32      d17, d17, d16
1851   0xf26008a2,                             //vadd.i32      d16, d16, d18
1852   0xf3f91721,                             //vabs.f32      d17, d17
1853   0xf2211fa0,                             //vmin.f32      d1, d17, d16
1854   0xecbd8b04,                             //vpop          {d8-d9}
1855   0xe12fff1c,                             //bx            ip
1856   0xe320f000,                             //nop           {0}
  // Literal read by the pc-relative vldr above (duplicated to fill a d register).
1857   0x3f800000,                             //.word         0x3f800000  (float 1.0)
1858   0x3f800000,                             //.word         0x3f800000
1859 };
1860 
// sk_luminance_to_alpha_vfp4: 32-bit ARM (NEON/VFPv4) machine code stored as raw instruction words.
// Computes, per lane,
//   d3 = d0 * ~0.2126 + d1 * ~0.7152 + d2 * ~0.0722
// and sets d0, d1 and d2 to zero.
// Unlike the stages that read a context pointer, this one loads a single word from r1
// (post-incrementing r1 by 4) and leaves through `bx r3`.
1861 CODE const uint32_t sk_luminance_to_alpha_vfp4[] = {
1862   0xeddf0b0a,                             //vldr          d16, [pc, #40]
1863   0xeddf1b0b,                             //vldr          d17, [pc, #44]
1864   0xf3410d30,                             //vmul.f32      d16, d1, d16
1865   0xe4913004,                             //ldr           r3, [r1], #4
1866   0xf3401d31,                             //vmul.f32      d17, d0, d17
1867   0xf2800010,                             //vmov.i32      d0, #0
1868   0xf2801010,                             //vmov.i32      d1, #0
1869   0xf2013da0,                             //vadd.f32      d3, d17, d16
1870   0xeddf0b06,                             //vldr          d16, [pc, #24]
1871   0xf2023c30,                             //vfma.f32      d3, d2, d16
1872   0xf2802010,                             //vmov.i32      d2, #0
1873   0xe12fff13,                             //bx            r3
  // Literal pool read by the pc-relative vldr instructions above (each value duplicated to fill a d register).
1874   0x3f371759,                             //.word         0x3f371759  (float ~0.7152, multiplies d1)
1875   0x3f371759,                             //.word         0x3f371759
1876   0x3e59b3d0,                             //.word         0x3e59b3d0  (float ~0.2126, multiplies d0)
1877   0x3e59b3d0,                             //.word         0x3e59b3d0
1878   0x3d93dd98,                             //.word         0x3d93dd98  (float ~0.0722, multiplies d2)
1879   0x3d93dd98,                             //.word         0x3d93dd98
1880 };
1881 
// sk_matrix_2x3_vfp4: 32-bit ARM (NEON/VFPv4) machine code stored as raw instruction words.
// Loads lr and ip from the two words at r1 and treats lr as a pointer to six
// consecutive 32-bit floats m[0..5] (byte offsets 0..20), each broadcast to both lanes.
// Computes, per lane,
//   new d0 = d0 * m[0] + d1 * m[2] + m[4]
//   new d1 = d0 * m[1] + d1 * m[3] + m[5]
// using the original d0/d1 for both results (they are written back only at the end).
// m[0] is loaded with post-increment of lr so the following load reads m[1].
// fp/lr are pushed and popped because lr is used as a scratch register;
// r1 is advanced by 8 and control leaves through `bx ip`.
1882 CODE const uint32_t sk_matrix_2x3_vfp4[] = {
1883   0xe92d4800,                             //push          {fp, lr}
1884   0xe591e000,                             //ldr           lr, [r1]
1885   0xe591c004,                             //ldr           ip, [r1, #4]
1886   0xe2811008,                             //add           r1, r1, #8
1887   0xe28e300c,                             //add           r3, lr, #12
1888   0xf4e32c9f,                             //vld1.32       {d18[]}, [r3 :32]
1889   0xe28e3008,                             //add           r3, lr, #8
1890   0xf4e31c9f,                             //vld1.32       {d17[]}, [r3 :32]
1891   0xe28e3010,                             //add           r3, lr, #16
1892   0xf4e30c9f,                             //vld1.32       {d16[]}, [r3 :32]
1893   0xe28e3014,                             //add           r3, lr, #20
1894   0xf2410c31,                             //vfma.f32      d16, d1, d17
1895   0xf4e31c9f,                             //vld1.32       {d17[]}, [r3 :32]
1896   0xf2411c32,                             //vfma.f32      d17, d1, d18
1897   0xf4ee2c9d,                             //vld1.32       {d18[]}, [lr :32]!
1898   0xf4ee3c9f,                             //vld1.32       {d19[]}, [lr :32]
1899   0xf2400c32,                             //vfma.f32      d16, d0, d18
1900   0xf2401c33,                             //vfma.f32      d17, d0, d19
1901   0xf22001b0,                             //vorr          d0, d16, d16
1902   0xf22111b1,                             //vorr          d1, d17, d17
1903   0xe8bd4800,                             //pop           {fp, lr}
1904   0xe12fff1c,                             //bx            ip
1905 };
1906 
// sk_matrix_3x4_vfp4: 32-bit ARM (NEON / VFPv4) stage.
// Reads two words from [r1]: a pointer (into lr) to twelve floats m[0..11], and
// the address branched to at the end (into ip); r1 is advanced by 8.
// Each m[i] is broadcast to both lanes and combined with fused multiply-adds:
//   new d0 = d0*m[0] + d1*m[3] + d2*m[6] + m[9]
//   new d1 = d0*m[1] + d1*m[4] + d2*m[7] + m[10]
//   new d2 = d0*m[2] + d1*m[5] + d2*m[8] + m[11]
// All three results use the incoming d0/d1/d2; they are accumulated in
// d17/d18/d16 and copied back with vorr at the end.
1907 CODE const uint32_t sk_matrix_3x4_vfp4[] = {
1908   0xe92d4800,                             //push          {fp, lr}
1909   0xe591e000,                             //ldr           lr, [r1]
1910   0xe591c004,                             //ldr           ip, [r1, #4]
1911   0xe2811008,                             //add           r1, r1, #8
1912   0xe28e3020,                             //add           r3, lr, #32
1913   0xf4e33c9f,                             //vld1.32       {d19[]}, [r3 :32]
1914   0xe28e302c,                             //add           r3, lr, #44
1915   0xf4e30c9f,                             //vld1.32       {d16[]}, [r3 :32]
1916   0xe28e301c,                             //add           r3, lr, #28
1917   0xf2420c33,                             //vfma.f32      d16, d2, d19
1918   0xf4e34c9f,                             //vld1.32       {d20[]}, [r3 :32]
1919   0xe28e3018,                             //add           r3, lr, #24
1920   0xf4e32c9f,                             //vld1.32       {d18[]}, [r3 :32]
1921   0xe28e3024,                             //add           r3, lr, #36
1922   0xf4e31c9f,                             //vld1.32       {d17[]}, [r3 :32]
1923   0xe28e3028,                             //add           r3, lr, #40
1924   0xf2421c32,                             //vfma.f32      d17, d2, d18
1925   0xf4e32c9f,                             //vld1.32       {d18[]}, [r3 :32]
1926   0xe28e3010,                             //add           r3, lr, #16
1927   0xf2422c34,                             //vfma.f32      d18, d2, d20
1928   0xf4e33c9f,                             //vld1.32       {d19[]}, [r3 :32]
1929   0xe28e300c,                             //add           r3, lr, #12
1930   0xf4e34c9f,                             //vld1.32       {d20[]}, [r3 :32]
1931   0xe28e3014,                             //add           r3, lr, #20
1932   0xf2411c34,                             //vfma.f32      d17, d1, d20
1933   0xf4e34c9f,                             //vld1.32       {d20[]}, [r3 :32]
1934   0xf2410c34,                             //vfma.f32      d16, d1, d20
1935   0xe28e3008,                             //add           r3, lr, #8
1936   0xf2412c33,                             //vfma.f32      d18, d1, d19
1937   0xf4ee3c9d,                             //vld1.32       {d19[]}, [lr :32]!
1938   0xf4ee4c9f,                             //vld1.32       {d20[]}, [lr :32]
1939   0xf2401c33,                             //vfma.f32      d17, d0, d19
1940   0xf4e33c9f,                             //vld1.32       {d19[]}, [r3 :32]
1941   0xf2400c33,                             //vfma.f32      d16, d0, d19
1942   0xf2402c34,                             //vfma.f32      d18, d0, d20
1943   0xf22101b1,                             //vorr          d0, d17, d17
1944   0xf22021b0,                             //vorr          d2, d16, d16
1945   0xf22211b2,                             //vorr          d1, d18, d18
1946   0xe8bd4800,                             //pop           {fp, lr}
1947   0xe12fff1c,                             //bx            ip
1948 };
1949 
// sk_matrix_4x5_vfp4: 32-bit ARM (NEON / VFPv4) stage.
// Reads two words from [r1]: a pointer (into r3) to twenty floats m[0..19], and
// the address branched to at the end (into ip); r1 is advanced by 8.
// Each m[i] is broadcast to both lanes and combined with fused multiply-adds:
//   new d0 = d0*m[0] + d1*m[4] + d2*m[8]  + d3*m[12] + m[16]
//   new d1 = d0*m[1] + d1*m[5] + d2*m[9]  + d3*m[13] + m[17]
//   new d2 = d0*m[2] + d1*m[6] + d2*m[10] + d3*m[14] + m[18]
//   new d3 = d0*m[3] + d1*m[7] + d2*m[11] + d3*m[15] + m[19]
// All results use the incoming d0..d3.  The incoming d2 is copied to d16 first
// because d2 itself is used as the accumulator for the third result; d16 is
// only reloaded with matrix values after its last use as that copy.
// r4 and lr are used as scratch pointers and are saved/restored around the body.
1950 CODE const uint32_t sk_matrix_4x5_vfp4[] = {
1951   0xe92d4010,                             //push          {r4, lr}
1952   0xe8911008,                             //ldm           r1, {r3, ip}
1953   0xf2620112,                             //vorr          d16, d2, d2
1954   0xe2811008,                             //add           r1, r1, #8
1955   0xe2834014,                             //add           r4, r3, #20
1956   0xe1a0e003,                             //mov           lr, r3
1957   0xf4e45c9f,                             //vld1.32       {d21[]}, [r4 :32]
1958   0xe2834028,                             //add           r4, r3, #40
1959   0xf4e46c9f,                             //vld1.32       {d22[]}, [r4 :32]
1960   0xe2834038,                             //add           r4, r3, #56
1961   0xf4e47c9f,                             //vld1.32       {d23[]}, [r4 :32]
1962   0xe2834048,                             //add           r4, r3, #72
1963   0xf4a42c9f,                             //vld1.32       {d2[]}, [r4 :32]
1964   0xe2834034,                             //add           r4, r3, #52
1965   0xf2032c37,                             //vfma.f32      d2, d3, d23
1966   0xf4e48c9f,                             //vld1.32       {d24[]}, [r4 :32]
1967   0xe2834044,                             //add           r4, r3, #68
1968   0xf4e41c9f,                             //vld1.32       {d17[]}, [r4 :32]
1969   0xe2834030,                             //add           r4, r3, #48
1970   0xf2431c38,                             //vfma.f32      d17, d3, d24
1971   0xf4e49c9f,                             //vld1.32       {d25[]}, [r4 :32]
1972   0xe283403c,                             //add           r4, r3, #60
1973   0xf4e43c9f,                             //vld1.32       {d19[]}, [r4 :32]
1974   0xe283404c,                             //add           r4, r3, #76
1975   0xf2002cb6,                             //vfma.f32      d2, d16, d22
1976   0xf4e42c9f,                             //vld1.32       {d18[]}, [r4 :32]
1977   0xe2834040,                             //add           r4, r3, #64
1978   0xf2432c33,                             //vfma.f32      d18, d3, d19
1979   0xf4e43c9f,                             //vld1.32       {d19[]}, [r4 :32]
1980   0xe2834020,                             //add           r4, r3, #32
1981   0xf2433c39,                             //vfma.f32      d19, d3, d25
1982   0xf4e47c9f,                             //vld1.32       {d23[]}, [r4 :32]
1983   0xe283402c,                             //add           r4, r3, #44
1984   0xf4e48c9f,                             //vld1.32       {d24[]}, [r4 :32]
1985   0xe2834024,                             //add           r4, r3, #36
1986   0xf2402cb8,                             //vfma.f32      d18, d16, d24
1987   0xf4e48c9f,                             //vld1.32       {d24[]}, [r4 :32]
1988   0xf2401cb8,                             //vfma.f32      d17, d16, d24
1989   0xe2834010,                             //add           r4, r3, #16
1990   0xf2403cb7,                             //vfma.f32      d19, d16, d23
1991   0xf4ee4c9d,                             //vld1.32       {d20[]}, [lr :32]!
1992   0xf4e40c9f,                             //vld1.32       {d16[]}, [r4 :32]
1993   0xe283401c,                             //add           r4, r3, #28
1994   0xf4e46c9f,                             //vld1.32       {d22[]}, [r4 :32]
1995   0xe2834018,                             //add           r4, r3, #24
1996   0xf2412c36,                             //vfma.f32      d18, d1, d22
1997   0xf2411c35,                             //vfma.f32      d17, d1, d21
1998   0xf4ee5c9f,                             //vld1.32       {d21[]}, [lr :32]
1999   0xf2413c30,                             //vfma.f32      d19, d1, d16
2000   0xf4e40c9f,                             //vld1.32       {d16[]}, [r4 :32]
2001   0xe2834008,                             //add           r4, r3, #8
2002   0xe283300c,                             //add           r3, r3, #12
2003   0xf2012c30,                             //vfma.f32      d2, d1, d16
2004   0xf4e40c9f,                             //vld1.32       {d16[]}, [r4 :32]
2005   0xf2401c35,                             //vfma.f32      d17, d0, d21
2006   0xf2403c34,                             //vfma.f32      d19, d0, d20
2007   0xf4e34c9f,                             //vld1.32       {d20[]}, [r3 :32]
2008   0xf2402c34,                             //vfma.f32      d18, d0, d20
2009   0xf2002c30,                             //vfma.f32      d2, d0, d16
2010   0xf22111b1,                             //vorr          d1, d17, d17
2011   0xf22301b3,                             //vorr          d0, d19, d19
2012   0xf22231b2,                             //vorr          d3, d18, d18
2013   0xe8bd4010,                             //pop           {r4, lr}
2014   0xe12fff1c,                             //bx            ip
2015 };
2016 
// sk_matrix_perspective_vfp4: 32-bit ARM (NEON / VFPv4) stage.
// Reads two words from [r1]: a pointer (into lr) to nine floats m[0..8], and
// the address branched to at the end (into ip); r1 is advanced by 8.
// With x = incoming d0 and y = incoming d1 it computes
//   w      = x*m[6] + y*m[7] + m[8]
//   new d0 = (x*m[0] + y*m[1] + m[2]) * (1/w)
//   new d1 = (x*m[3] + y*m[4] + m[5]) * (1/w)
// 1/w is not an exact divide: it is the vrecpe estimate refined by a single
// vrecps step (estimate * (2 - w*estimate)).
// r4 and lr are used as scratch pointers and are saved/restored around the body.
2017 CODE const uint32_t sk_matrix_perspective_vfp4[] = {
2018   0xe92d4010,                             //push          {r4, lr}
2019   0xe591e000,                             //ldr           lr, [r1]
2020   0xe591c004,                             //ldr           ip, [r1, #4]
2021   0xe2811008,                             //add           r1, r1, #8
2022   0xe28e301c,                             //add           r3, lr, #28
2023   0xe28e4010,                             //add           r4, lr, #16
2024   0xf4e30c9f,                             //vld1.32       {d16[]}, [r3 :32]
2025   0xe28e3020,                             //add           r3, lr, #32
2026   0xf4e31c9f,                             //vld1.32       {d17[]}, [r3 :32]
2027   0xe28e3018,                             //add           r3, lr, #24
2028   0xf2411c30,                             //vfma.f32      d17, d1, d16
2029   0xf4e30c9f,                             //vld1.32       {d16[]}, [r3 :32]
2030   0xe1a0300e,                             //mov           r3, lr
2031   0xf4e42c9f,                             //vld1.32       {d18[]}, [r4 :32]
2032   0xe28e4008,                             //add           r4, lr, #8
2033   0xf4e43c9f,                             //vld1.32       {d19[]}, [r4 :32]
2034   0xf2401c30,                             //vfma.f32      d17, d0, d16
2035   0xf4e30c9d,                             //vld1.32       {d16[]}, [r3 :32]!
2036   0xf4e35c9f,                             //vld1.32       {d21[]}, [r3 :32]
2037   0xe28e3014,                             //add           r3, lr, #20
2038   0xf2413c35,                             //vfma.f32      d19, d1, d21
2039   0xf4e35c9f,                             //vld1.32       {d21[]}, [r3 :32]
2040   0xe28e300c,                             //add           r3, lr, #12
2041   0xf2415c32,                             //vfma.f32      d21, d1, d18
2042   0xf4e32c9f,                             //vld1.32       {d18[]}, [r3 :32]
2043   0xf3fb4521,                             //vrecpe.f32    d20, d17
2044   0xf2403c30,                             //vfma.f32      d19, d0, d16
2045   0xf2411fb4,                             //vrecps.f32    d17, d17, d20
2046   0xf2405c32,                             //vfma.f32      d21, d0, d18
2047   0xf3440db1,                             //vmul.f32      d16, d20, d17
2048   0xf3030db0,                             //vmul.f32      d0, d19, d16
2049   0xf3051db0,                             //vmul.f32      d1, d21, d16
2050   0xe8bd4010,                             //pop           {r4, lr}
2051   0xe12fff1c,                             //bx            ip
2052 };
2053 
// sk_linear_gradient_2stops_vfp4: 32-bit ARM (NEON / VFPv4) stage.
// Reads two words from [r1]: a pointer (into r3) to eight consecutive floats,
// and the address branched to at the end (into ip); r1 is advanced by 8.
// Calling the first four floats c[0..3] and the next four f[0..3], and t the
// incoming d0, it produces
//   new d0 = c[0] + t*f[0]      new d1 = c[1] + t*f[1]
//   new d2 = c[2] + t*f[2]      new d3 = c[3] + t*f[3]
// The first result is built in d16 and moved into d0 last, so every fused
// multiply-add sees the original t.
2054 CODE const uint32_t sk_linear_gradient_2stops_vfp4[] = {
2055   0xe8911008,                             //ldm           r1, {r3, ip}
2056   0xe2811008,                             //add           r1, r1, #8
2057   0xf4632a0d,                             //vld1.8        {d18-d19}, [r3]!
2058   0xf4634a0f,                             //vld1.8        {d20-d21}, [r3]
2059   0xf3f40c22,                             //vdup.32       d16, d18[0]
2060   0xf3f41c24,                             //vdup.32       d17, d20[0]
2061   0xf2400c31,                             //vfma.f32      d16, d0, d17
2062   0xf3fc6c24,                             //vdup.32       d22, d20[1]
2063   0xf3bc1c22,                             //vdup.32       d1, d18[1]
2064   0xf3b42c23,                             //vdup.32       d2, d19[0]
2065   0xf2001c36,                             //vfma.f32      d1, d0, d22
2066   0xf3f41c25,                             //vdup.32       d17, d21[0]
2067   0xf3fc4c25,                             //vdup.32       d20, d21[1]
2068   0xf2002c31,                             //vfma.f32      d2, d0, d17
2069   0xf3bc3c23,                             //vdup.32       d3, d19[1]
2070   0xf2003c34,                             //vfma.f32      d3, d0, d20
2071   0xf22001b0,                             //vorr          d0, d16, d16
2072   0xe12fff1c,                             //bx            ip
2073 };
2074 #elif defined(__x86_64__)
2075 
// sk_start_pipeline_hsw: x86-64 (AVX2-era encoding) driver that repeatedly
// calls the first entry of a pointer list.
// On entry it reads %rdi (start index, kept in %rbx), %rsi (pointer list),
// %rdx (kept in %r14 and passed through unchanged) and %rcx (limit, kept in
// %r13).  The first 8-byte word at (%rsi) is the function to call (%r15); the
// address just past it (%r12) is passed to that function in %rsi.
//   - While index + 8 <= limit: zero %ymm0-%ymm7, call the function with
//     %rdi = index and %ecx = 0, then advance index by 8.
//   - Afterwards, if limit - index is non-zero: zero %ymm0-%ymm7 and call the
//     function once more with %rdi = index and %rcx = limit - index.
// Returns the limit in %rax.  The callee-saved registers it uses (%rbx,
// %r12-%r15) are pushed/popped, and vzeroupper is issued before returning.
2076 CODE const uint8_t sk_start_pipeline_hsw[] = {
2077   65,87,                                  //push          %r15
2078   65,86,                                  //push          %r14
2079   65,85,                                  //push          %r13
2080   65,84,                                  //push          %r12
2081   83,                                     //push          %rbx
2082   73,137,205,                             //mov           %rcx,%r13
2083   73,137,214,                             //mov           %rdx,%r14
2084   72,137,251,                             //mov           %rdi,%rbx
2085   72,173,                                 //lods          %ds:(%rsi),%rax
2086   73,137,199,                             //mov           %rax,%r15
2087   73,137,244,                             //mov           %rsi,%r12
2088   72,141,67,8,                            //lea           0x8(%rbx),%rax
2089   76,57,232,                              //cmp           %r13,%rax
2090   118,5,                                  //jbe           28 <_sk_start_pipeline_hsw+0x28>
2091   72,137,223,                             //mov           %rbx,%rdi
2092   235,65,                                 //jmp           69 <_sk_start_pipeline_hsw+0x69>
  // Full-width loop body (offset 0x28): one call per 8 indices, %ecx = 0.
2093   185,0,0,0,0,                            //mov           $0x0,%ecx
2094   197,252,87,192,                         //vxorps        %ymm0,%ymm0,%ymm0
2095   197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
2096   197,236,87,210,                         //vxorps        %ymm2,%ymm2,%ymm2
2097   197,228,87,219,                         //vxorps        %ymm3,%ymm3,%ymm3
2098   197,220,87,228,                         //vxorps        %ymm4,%ymm4,%ymm4
2099   197,212,87,237,                         //vxorps        %ymm5,%ymm5,%ymm5
2100   197,204,87,246,                         //vxorps        %ymm6,%ymm6,%ymm6
2101   197,196,87,255,                         //vxorps        %ymm7,%ymm7,%ymm7
2102   72,137,223,                             //mov           %rbx,%rdi
2103   76,137,230,                             //mov           %r12,%rsi
2104   76,137,242,                             //mov           %r14,%rdx
2105   65,255,215,                             //callq         *%r15
2106   72,141,123,8,                           //lea           0x8(%rbx),%rdi
2107   72,131,195,16,                          //add           $0x10,%rbx
2108   76,57,235,                              //cmp           %r13,%rbx
2109   72,137,251,                             //mov           %rdi,%rbx
2110   118,191,                                //jbe           28 <_sk_start_pipeline_hsw+0x28>
  // Remainder (offset 0x69): %rcx = limit - index; skipped entirely when zero.
2111   76,137,233,                             //mov           %r13,%rcx
2112   72,41,249,                              //sub           %rdi,%rcx
2113   116,41,                                 //je            9a <_sk_start_pipeline_hsw+0x9a>
2114   197,252,87,192,                         //vxorps        %ymm0,%ymm0,%ymm0
2115   197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
2116   197,236,87,210,                         //vxorps        %ymm2,%ymm2,%ymm2
2117   197,228,87,219,                         //vxorps        %ymm3,%ymm3,%ymm3
2118   197,220,87,228,                         //vxorps        %ymm4,%ymm4,%ymm4
2119   197,212,87,237,                         //vxorps        %ymm5,%ymm5,%ymm5
2120   197,204,87,246,                         //vxorps        %ymm6,%ymm6,%ymm6
2121   197,196,87,255,                         //vxorps        %ymm7,%ymm7,%ymm7
2122   76,137,230,                             //mov           %r12,%rsi
2123   76,137,242,                             //mov           %r14,%rdx
2124   65,255,215,                             //callq         *%r15
  // Exit (offset 0x9a): return the limit and restore callee-saved registers.
2125   76,137,232,                             //mov           %r13,%rax
2126   91,                                     //pop           %rbx
2127   65,92,                                  //pop           %r12
2128   65,93,                                  //pop           %r13
2129   65,94,                                  //pop           %r14
2130   65,95,                                  //pop           %r15
2131   197,248,119,                            //vzeroupper
2132   195,                                    //retq
2133 };
2134 
// sk_just_return_hsw: x86-64 stage consisting of a single retq.  Unlike the
// other hsw stages it does not jump to a following stage; presumably it is the
// last entry of a pipeline and returns to whoever issued the callq.
2135 CODE const uint8_t sk_just_return_hsw[] = {
2136   195,                                    //retq
2137 };
2138 
// sk_seed_shader_hsw: x86-64 AVX2 stage that initialises the vector registers.
// Pops a pointer from the list at (%rsi) into %rax, then:
//   ymm0 = float(%edi) + 0.5 + the eight floats at (%rdx)
//   ymm1 = float(32-bit integer at (%rax)) + 0.5, broadcast to all lanes
//   ymm2 = 1.0 in all lanes
//   ymm3..ymm7 = 0
// (0x3f000000 is 0.5f and 0x3f800000 is 1.0f.)  Finally pops the next 8-byte
// word from (%rsi) and jumps to it.
2139 CODE const uint8_t sk_seed_shader_hsw[] = {
2140   72,173,                                 //lods          %ds:(%rsi),%rax
2141   197,249,110,199,                        //vmovd         %edi,%xmm0
2142   196,226,125,88,192,                     //vpbroadcastd  %xmm0,%ymm0
2143   197,252,91,192,                         //vcvtdq2ps     %ymm0,%ymm0
2144   65,184,0,0,0,63,                        //mov           $0x3f000000,%r8d
2145   196,193,121,110,200,                    //vmovd         %r8d,%xmm1
2146   196,226,125,88,201,                     //vpbroadcastd  %xmm1,%ymm1
2147   197,252,88,193,                         //vaddps        %ymm1,%ymm0,%ymm0
2148   197,252,88,2,                           //vaddps        (%rdx),%ymm0,%ymm0
2149   196,226,125,24,16,                      //vbroadcastss  (%rax),%ymm2
2150   197,252,91,210,                         //vcvtdq2ps     %ymm2,%ymm2
2151   197,236,88,201,                         //vaddps        %ymm1,%ymm2,%ymm1
2152   184,0,0,128,63,                         //mov           $0x3f800000,%eax
2153   197,249,110,208,                        //vmovd         %eax,%xmm2
2154   196,226,125,88,210,                     //vpbroadcastd  %xmm2,%ymm2
2155   72,173,                                 //lods          %ds:(%rsi),%rax
2156   197,228,87,219,                         //vxorps        %ymm3,%ymm3,%ymm3
2157   197,220,87,228,                         //vxorps        %ymm4,%ymm4,%ymm4
2158   197,212,87,237,                         //vxorps        %ymm5,%ymm5,%ymm5
2159   197,204,87,246,                         //vxorps        %ymm6,%ymm6,%ymm6
2160   197,196,87,255,                         //vxorps        %ymm7,%ymm7,%ymm7
2161   255,224,                                //jmpq          *%rax
2162 };
2163 
// sk_constant_color_hsw: x86-64 AVX2 stage.
// Pops a pointer from the list at (%rsi) and broadcasts the four floats at
// byte offsets 0, 4, 8 and 12 from it into every lane of ymm0, ymm1, ymm2 and
// ymm3 respectively.  Then pops the next 8-byte word from (%rsi) and jumps to it.
2164 CODE const uint8_t sk_constant_color_hsw[] = {
2165   72,173,                                 //lods          %ds:(%rsi),%rax
2166   196,226,125,24,0,                       //vbroadcastss  (%rax),%ymm0
2167   196,226,125,24,72,4,                    //vbroadcastss  0x4(%rax),%ymm1
2168   196,226,125,24,80,8,                    //vbroadcastss  0x8(%rax),%ymm2
2169   196,226,125,24,88,12,                   //vbroadcastss  0xc(%rax),%ymm3
2170   72,173,                                 //lods          %ds:(%rsi),%rax
2171   255,224,                                //jmpq          *%rax
2172 };
2173 
// sk_clear_hsw: x86-64 AVX stage that zeroes ymm0, ymm1, ymm2 and ymm3, then
// jumps to the 8-byte address popped from the list at (%rsi).
2174 CODE const uint8_t sk_clear_hsw[] = {
2175   72,173,                                 //lods          %ds:(%rsi),%rax
2176   197,252,87,192,                         //vxorps        %ymm0,%ymm0,%ymm0
2177   197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
2178   197,236,87,210,                         //vxorps        %ymm2,%ymm2,%ymm2
2179   197,228,87,219,                         //vxorps        %ymm3,%ymm3,%ymm3
2180   255,224,                                //jmpq          *%rax
2181 };
2182 
// sk_plus__hsw: x86-64 AVX stage performing a lane-wise add:
//   ymm0 += ymm4, ymm1 += ymm5, ymm2 += ymm6, ymm3 += ymm7.
// No clamping is applied here.  Then pops the next 8-byte word from (%rsi) and
// jumps to it.
2183 CODE const uint8_t sk_plus__hsw[] = {
2184   197,252,88,196,                         //vaddps        %ymm4,%ymm0,%ymm0
2185   197,244,88,205,                         //vaddps        %ymm5,%ymm1,%ymm1
2186   197,236,88,214,                         //vaddps        %ymm6,%ymm2,%ymm2
2187   197,228,88,223,                         //vaddps        %ymm7,%ymm3,%ymm3
2188   72,173,                                 //lods          %ds:(%rsi),%rax
2189   255,224,                                //jmpq          *%rax
2190 };
2191 
// sk_srcover_hsw: x86-64 AVX2/FMA stage.
// Forms k = 1.0 - ymm3 (0x3f800000 is 1.0f) in ymm8, then for each pair
// (ymm0,ymm4), (ymm1,ymm5), (ymm2,ymm6), (ymm3,ymm7) computes
//   ymmN = ymmN + ymm(N+4) * k
// with a fused multiply-add.  k is taken from ymm3 before ymm3 is updated.
// Then pops the next 8-byte word from (%rsi) and jumps to it.
2192 CODE const uint8_t sk_srcover_hsw[] = {
2193   184,0,0,128,63,                         //mov           $0x3f800000,%eax
2194   197,121,110,192,                        //vmovd         %eax,%xmm8
2195   196,66,125,88,192,                      //vpbroadcastd  %xmm8,%ymm8
2196   197,60,92,195,                          //vsubps        %ymm3,%ymm8,%ymm8
2197   196,194,93,184,192,                     //vfmadd231ps   %ymm8,%ymm4,%ymm0
2198   196,194,85,184,200,                     //vfmadd231ps   %ymm8,%ymm5,%ymm1
2199   196,194,77,184,208,                     //vfmadd231ps   %ymm8,%ymm6,%ymm2
2200   196,194,69,184,216,                     //vfmadd231ps   %ymm8,%ymm7,%ymm3
2201   72,173,                                 //lods          %ds:(%rsi),%rax
2202   255,224,                                //jmpq          *%rax
2203 };
2204 
// sk_dstover_hsw: x86-64 AVX2/FMA stage.
// Forms k = 1.0 - ymm7 (0x3f800000 is 1.0f) in ymm8, then for each pair
// (ymm0,ymm4), (ymm1,ymm5), (ymm2,ymm6), (ymm3,ymm7) computes
//   ymmN = ymmN * k + ymm(N+4)
// with a fused multiply-add; ymm4..ymm7 are left unchanged.
// Then pops the next 8-byte word from (%rsi) and jumps to it.
2205 CODE const uint8_t sk_dstover_hsw[] = {
2206   184,0,0,128,63,                         //mov           $0x3f800000,%eax
2207   197,121,110,192,                        //vmovd         %eax,%xmm8
2208   196,66,125,88,192,                      //vpbroadcastd  %xmm8,%ymm8
2209   197,60,92,199,                          //vsubps        %ymm7,%ymm8,%ymm8
2210   196,226,61,168,196,                     //vfmadd213ps   %ymm4,%ymm8,%ymm0
2211   196,226,61,168,205,                     //vfmadd213ps   %ymm5,%ymm8,%ymm1
2212   196,226,61,168,214,                     //vfmadd213ps   %ymm6,%ymm8,%ymm2
2213   196,226,61,168,223,                     //vfmadd213ps   %ymm7,%ymm8,%ymm3
2214   72,173,                                 //lods          %ds:(%rsi),%rax
2215   255,224,                                //jmpq          *%rax
2216 };
2217 
// sk_clamp_0_hsw: x86-64 AVX stage that clamps from below:
//   ymmN = max(ymmN, 0.0) for N = 0..3, using a zeroed ymm8 as the bound.
// Then pops the next 8-byte word from (%rsi) and jumps to it.
2218 CODE const uint8_t sk_clamp_0_hsw[] = {
2219   196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
2220   196,193,124,95,192,                     //vmaxps        %ymm8,%ymm0,%ymm0
2221   196,193,116,95,200,                     //vmaxps        %ymm8,%ymm1,%ymm1
2222   196,193,108,95,208,                     //vmaxps        %ymm8,%ymm2,%ymm2
2223   196,193,100,95,216,                     //vmaxps        %ymm8,%ymm3,%ymm3
2224   72,173,                                 //lods          %ds:(%rsi),%rax
2225   255,224,                                //jmpq          *%rax
2226 };
2227 
// sk_clamp_1_hsw: x86-64 AVX2 stage that clamps from above:
//   ymmN = min(ymmN, 1.0) for N = 0..3 (0x3f800000 is 1.0f, broadcast in ymm8).
// Then pops the next 8-byte word from (%rsi) and jumps to it.
2228 CODE const uint8_t sk_clamp_1_hsw[] = {
2229   184,0,0,128,63,                         //mov           $0x3f800000,%eax
2230   197,121,110,192,                        //vmovd         %eax,%xmm8
2231   196,66,125,88,192,                      //vpbroadcastd  %xmm8,%ymm8
2232   196,193,124,93,192,                     //vminps        %ymm8,%ymm0,%ymm0
2233   196,193,116,93,200,                     //vminps        %ymm8,%ymm1,%ymm1
2234   196,193,108,93,208,                     //vminps        %ymm8,%ymm2,%ymm2
2235   196,193,100,93,216,                     //vminps        %ymm8,%ymm3,%ymm3
2236   72,173,                                 //lods          %ds:(%rsi),%rax
2237   255,224,                                //jmpq          *%rax
2238 };
2239 
// sk_clamp_a_hsw: x86-64 AVX2 stage.
// First ymm3 = min(ymm3, 1.0) (0x3f800000 is 1.0f), then
//   ymmN = min(ymmN, ymm3) for N = 0..2, using the already-clamped ymm3.
// Then pops the next 8-byte word from (%rsi) and jumps to it.
2240 CODE const uint8_t sk_clamp_a_hsw[] = {
2241   184,0,0,128,63,                         //mov           $0x3f800000,%eax
2242   197,121,110,192,                        //vmovd         %eax,%xmm8
2243   196,66,125,88,192,                      //vpbroadcastd  %xmm8,%ymm8
2244   196,193,100,93,216,                     //vminps        %ymm8,%ymm3,%ymm3
2245   197,252,93,195,                         //vminps        %ymm3,%ymm0,%ymm0
2246   197,244,93,203,                         //vminps        %ymm3,%ymm1,%ymm1
2247   197,236,93,211,                         //vminps        %ymm3,%ymm2,%ymm2
2248   72,173,                                 //lods          %ds:(%rsi),%rax
2249   255,224,                                //jmpq          *%rax
2250 };
2251 
// sk_set_rgb_hsw: x86-64 AVX2 stage.
// Pops a pointer from the list at (%rsi) and broadcasts the three floats at
// byte offsets 0, 4 and 8 from it into every lane of ymm0, ymm1 and ymm2.
// ymm3 is not touched.  Then pops the next 8-byte word from (%rsi) and jumps
// to it.
2252 CODE const uint8_t sk_set_rgb_hsw[] = {
2253   72,173,                                 //lods          %ds:(%rsi),%rax
2254   196,226,125,24,0,                       //vbroadcastss  (%rax),%ymm0
2255   196,226,125,24,72,4,                    //vbroadcastss  0x4(%rax),%ymm1
2256   196,226,125,24,80,8,                    //vbroadcastss  0x8(%rax),%ymm2
2257   72,173,                                 //lods          %ds:(%rsi),%rax
2258   255,224,                                //jmpq          *%rax
2259 };
2260 
// sk_swap_rb_hsw: x86-64 AVX stage that exchanges ymm0 and ymm2, using ymm8 as
// the temporary; ymm1 and ymm3 are untouched.  Then jumps to the 8-byte address
// popped from the list at (%rsi).
2261 CODE const uint8_t sk_swap_rb_hsw[] = {
2262   197,124,40,192,                         //vmovaps       %ymm0,%ymm8
2263   72,173,                                 //lods          %ds:(%rsi),%rax
2264   197,252,40,194,                         //vmovaps       %ymm2,%ymm0
2265   197,124,41,194,                         //vmovaps       %ymm8,%ymm2
2266   255,224,                                //jmpq          *%rax
2267 };
2268 
// sk_swap_hsw: x86-64 AVX stage that exchanges the register group ymm0..ymm3
// with ymm4..ymm7 (ymm0<->ymm4, ymm1<->ymm5, ymm2<->ymm6, ymm3<->ymm7).
// ymm8..ymm11 hold the old ymm3..ymm0 during the exchange and are clobbered.
// Then jumps to the 8-byte address popped from the list at (%rsi).
2269 CODE const uint8_t sk_swap_hsw[] = {
2270   197,124,40,195,                         //vmovaps       %ymm3,%ymm8
2271   197,124,40,202,                         //vmovaps       %ymm2,%ymm9
2272   197,124,40,209,                         //vmovaps       %ymm1,%ymm10
2273   197,124,40,216,                         //vmovaps       %ymm0,%ymm11
2274   72,173,                                 //lods          %ds:(%rsi),%rax
2275   197,252,40,196,                         //vmovaps       %ymm4,%ymm0
2276   197,252,40,205,                         //vmovaps       %ymm5,%ymm1
2277   197,252,40,214,                         //vmovaps       %ymm6,%ymm2
2278   197,252,40,223,                         //vmovaps       %ymm7,%ymm3
2279   197,124,41,220,                         //vmovaps       %ymm11,%ymm4
2280   197,124,41,213,                         //vmovaps       %ymm10,%ymm5
2281   197,124,41,206,                         //vmovaps       %ymm9,%ymm6
2282   197,124,41,199,                         //vmovaps       %ymm8,%ymm7
2283   255,224,                                //jmpq          *%rax
2284 };
2285 
// sk_move_src_dst_hsw: x86-64 AVX stage that copies ymm0..ymm3 into
// ymm4..ymm7 (ymm0..ymm3 keep their values).  Then jumps to the 8-byte address
// popped from the list at (%rsi).
2286 CODE const uint8_t sk_move_src_dst_hsw[] = {
2287   72,173,                                 //lods          %ds:(%rsi),%rax
2288   197,252,40,224,                         //vmovaps       %ymm0,%ymm4
2289   197,252,40,233,                         //vmovaps       %ymm1,%ymm5
2290   197,252,40,242,                         //vmovaps       %ymm2,%ymm6
2291   197,252,40,251,                         //vmovaps       %ymm3,%ymm7
2292   255,224,                                //jmpq          *%rax
2293 };
2294 
// sk_move_dst_src_hsw: x86-64 AVX stage that copies ymm4..ymm7 into
// ymm0..ymm3 (ymm4..ymm7 keep their values).  Then jumps to the 8-byte address
// popped from the list at (%rsi).
2295 CODE const uint8_t sk_move_dst_src_hsw[] = {
2296   72,173,                                 //lods          %ds:(%rsi),%rax
2297   197,252,40,196,                         //vmovaps       %ymm4,%ymm0
2298   197,252,40,205,                         //vmovaps       %ymm5,%ymm1
2299   197,252,40,214,                         //vmovaps       %ymm6,%ymm2
2300   197,252,40,223,                         //vmovaps       %ymm7,%ymm3
2301   255,224,                                //jmpq          *%rax
2302 };
2303 
// sk_premul_hsw: x86-64 AVX stage that multiplies ymm0, ymm1 and ymm2 by ymm3
// lane-wise; ymm3 itself is unchanged.  Then pops the next 8-byte word from
// (%rsi) and jumps to it.
2304 CODE const uint8_t sk_premul_hsw[] = {
2305   197,252,89,195,                         //vmulps        %ymm3,%ymm0,%ymm0
2306   197,244,89,203,                         //vmulps        %ymm3,%ymm1,%ymm1
2307   197,236,89,211,                         //vmulps        %ymm3,%ymm2,%ymm2
2308   72,173,                                 //lods          %ds:(%rsi),%rax
2309   255,224,                                //jmpq          *%rax
2310 };
2311 
// sk_unpremul_hsw: x86-64 AVX2 stage.
// Builds a per-lane scale in ymm8: 0.0 where ymm3 == 0.0, otherwise 1.0 / ymm3
// (0x3f800000 is 1.0f; the divide is a full vdivps, and the compare mask in
// ymm9 selects the zero so lanes with ymm3 == 0 do not keep the divide result).
// ymm0, ymm1 and ymm2 are then multiplied by that scale; ymm3 is unchanged.
// Then pops the next 8-byte word from (%rsi) and jumps to it.
2312 CODE const uint8_t sk_unpremul_hsw[] = {
2313   196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
2314   196,65,100,194,200,0,                   //vcmpeqps      %ymm8,%ymm3,%ymm9
2315   184,0,0,128,63,                         //mov           $0x3f800000,%eax
2316   197,121,110,208,                        //vmovd         %eax,%xmm10
2317   196,66,125,88,210,                      //vpbroadcastd  %xmm10,%ymm10
2318   197,44,94,211,                          //vdivps        %ymm3,%ymm10,%ymm10
2319   196,67,45,74,192,144,                   //vblendvps     %ymm9,%ymm8,%ymm10,%ymm8
2320   197,188,89,192,                         //vmulps        %ymm0,%ymm8,%ymm0
2321   197,188,89,201,                         //vmulps        %ymm1,%ymm8,%ymm1
2322   197,188,89,210,                         //vmulps        %ymm2,%ymm8,%ymm2
2323   72,173,                                 //lods          %ds:(%rsi),%rax
2324   255,224,                                //jmpq          *%rax
2325 };
2326 
// sk_from_srgb_hsw: x86-64 AVX2/FMA stage applied independently to ymm0, ymm1
// and ymm2 (ymm3 is not touched).  For each input x it evaluates, per lane:
//   lo = x * A                       A = 0x3d9e8391 (~0.0774)
//   hi = (x*x) * (x*B + C) + D       B = 0x3e99999a (~0.3), C = 0x3f328f5c (~0.6975),
//                                    D = 0x3b23d70a (~0.0025)
//   result = (x < E) ? lo : hi       E = 0x3d6147ae (~0.055)
// The constants are materialised once in ymm8/ymm11/ymm12/ymm14/ymm10 and
// reused for all three registers; ymm11 (B) is consumed as the accumulator for
// the last one.  Presumably this is a polynomial approximation of the
// sRGB-to-linear transfer curve (inferred from the stage name).
// Then pops the next 8-byte word from (%rsi) and jumps to it.
2327 CODE const uint8_t sk_from_srgb_hsw[] = {
2328   184,145,131,158,61,                     //mov           $0x3d9e8391,%eax
2329   197,121,110,192,                        //vmovd         %eax,%xmm8
2330   196,66,125,88,192,                      //vpbroadcastd  %xmm8,%ymm8
2331   197,60,89,200,                          //vmulps        %ymm0,%ymm8,%ymm9
2332   197,124,89,208,                         //vmulps        %ymm0,%ymm0,%ymm10
2333   184,154,153,153,62,                     //mov           $0x3e99999a,%eax
2334   197,121,110,216,                        //vmovd         %eax,%xmm11
2335   196,66,125,88,219,                      //vpbroadcastd  %xmm11,%ymm11
2336   184,92,143,50,63,                       //mov           $0x3f328f5c,%eax
2337   197,121,110,224,                        //vmovd         %eax,%xmm12
2338   196,66,125,88,228,                      //vpbroadcastd  %xmm12,%ymm12
2339   196,65,125,111,235,                     //vmovdqa       %ymm11,%ymm13
2340   196,66,125,168,236,                     //vfmadd213ps   %ymm12,%ymm0,%ymm13
2341   184,10,215,35,59,                       //mov           $0x3b23d70a,%eax
2342   197,121,110,240,                        //vmovd         %eax,%xmm14
2343   196,66,125,88,246,                      //vpbroadcastd  %xmm14,%ymm14
2344   196,66,45,168,238,                      //vfmadd213ps   %ymm14,%ymm10,%ymm13
2345   184,174,71,97,61,                       //mov           $0x3d6147ae,%eax
2346   197,121,110,208,                        //vmovd         %eax,%xmm10
2347   196,66,125,88,210,                      //vpbroadcastd  %xmm10,%ymm10
2348   196,193,124,194,194,1,                  //vcmpltps      %ymm10,%ymm0,%ymm0
2349   196,195,21,74,193,0,                    //vblendvps     %ymm0,%ymm9,%ymm13,%ymm0
2350   197,60,89,201,                          //vmulps        %ymm1,%ymm8,%ymm9
2351   197,116,89,233,                         //vmulps        %ymm1,%ymm1,%ymm13
2352   196,65,125,111,251,                     //vmovdqa       %ymm11,%ymm15
2353   196,66,117,168,252,                     //vfmadd213ps   %ymm12,%ymm1,%ymm15
2354   196,66,21,168,254,                      //vfmadd213ps   %ymm14,%ymm13,%ymm15
2355   196,193,116,194,202,1,                  //vcmpltps      %ymm10,%ymm1,%ymm1
2356   196,195,5,74,201,16,                    //vblendvps     %ymm1,%ymm9,%ymm15,%ymm1
2357   197,60,89,194,                          //vmulps        %ymm2,%ymm8,%ymm8
2358   197,108,89,202,                         //vmulps        %ymm2,%ymm2,%ymm9
2359   196,66,109,168,220,                     //vfmadd213ps   %ymm12,%ymm2,%ymm11
2360   196,66,53,168,222,                      //vfmadd213ps   %ymm14,%ymm9,%ymm11
2361   196,193,108,194,210,1,                  //vcmpltps      %ymm10,%ymm2,%ymm2
2362   196,195,37,74,208,32,                   //vblendvps     %ymm2,%ymm8,%ymm11,%ymm2
2363   72,173,                                 //lods          %ds:(%rsi),%rax
2364   255,224,                                //jmpq          *%rax
2365 };
2366 
// sk_to_srgb_hsw: machine code that applies one piecewise curve to each of ymm0, ymm1
// and ymm2 (presumably the r, g, b channels -- TODO confirm against the generator);
// ymm3 is never read or written here.
//
// For every float lane x the result is
//     x < T  ?  K * x  :  min(1.0, A * s + B * q + C)
// where s = vrcpps(vrsqrtps(x))   (an approximation of sqrt(x))
//       q = vrsqrtps(vrsqrtps(x)) (an approximation of x^(1/4))
//       T = 0x3b8ce704 (~0.0043), K = 0x41475c29 (~12.46),
//       A = 0x3f306fce (~0.689),  B = 0x3ed287c2 (~0.411),
//       C = 0x3dca57a8 with its sign bit flipped (~-0.0988).
//
// Scratch: rax and ymm8-ymm15 are overwritten.
// Exit: lods reads the next 8-byte word at (%rsi) into rax and the code jumps to it.
2367 CODE const uint8_t sk_to_srgb_hsw[] = {
  // --- channel in ymm0; constants are materialised once and reused for ymm1/ymm2 ---
2368   197,124,82,192,                         //vrsqrtps      %ymm0,%ymm8
2369   196,65,124,83,216,                      //vrcpps        %ymm8,%ymm11
2370   196,65,124,82,224,                      //vrsqrtps      %ymm8,%ymm12
  // ymm8 = K (slope of the linear segment); ymm13 = K * x
2371   184,41,92,71,65,                        //mov           $0x41475c29,%eax
2372   197,121,110,192,                        //vmovd         %eax,%xmm8
2373   196,66,125,88,192,                      //vpbroadcastd  %xmm8,%ymm8
2374   197,60,89,232,                          //vmulps        %ymm0,%ymm8,%ymm13
  // ymm9 = 1.0 (upper clamp)
2375   184,0,0,128,63,                         //mov           $0x3f800000,%eax
2376   197,121,110,200,                        //vmovd         %eax,%xmm9
2377   196,66,125,88,201,                      //vpbroadcastd  %xmm9,%ymm9
  // ymm10 = B (weight of the fourth-root term)
2378   184,194,135,210,62,                     //mov           $0x3ed287c2,%eax
2379   197,121,110,208,                        //vmovd         %eax,%xmm10
2380   196,66,125,88,210,                      //vpbroadcastd  %xmm10,%ymm10
  // ymm14 = A (weight of the square-root term)
2381   184,206,111,48,63,                      //mov           $0x3f306fce,%eax
2382   197,121,110,240,                        //vmovd         %eax,%xmm14
2383   196,66,125,88,246,                      //vpbroadcastd  %xmm14,%ymm14
  // ymm15 = C; the xor flips the IEEE sign bit, negating the constant
2384   184,168,87,202,61,                      //mov           $0x3dca57a8,%eax
2385   53,0,0,0,128,                           //xor           $0x80000000,%eax
2386   197,121,110,248,                        //vmovd         %eax,%xmm15
2387   196,66,125,88,255,                      //vpbroadcastd  %xmm15,%ymm15
  // ymm11 = A*s + C, then += B*q, then clamped to at most 1.0
2388   196,66,13,168,223,                      //vfmadd213ps   %ymm15,%ymm14,%ymm11
2389   196,66,45,184,220,                      //vfmadd231ps   %ymm12,%ymm10,%ymm11
2390   196,65,52,93,219,                       //vminps        %ymm11,%ymm9,%ymm11
  // ymm12 = T; ymm0 becomes the (x < T) lane mask and then the blended result
2391   184,4,231,140,59,                       //mov           $0x3b8ce704,%eax
2392   197,121,110,224,                        //vmovd         %eax,%xmm12
2393   196,66,125,88,228,                      //vpbroadcastd  %xmm12,%ymm12
2394   196,193,124,194,196,1,                  //vcmpltps      %ymm12,%ymm0,%ymm0
2395   196,195,37,74,197,0,                    //vblendvps     %ymm0,%ymm13,%ymm11,%ymm0
  // --- channel in ymm1: same computation, using ymm11/ymm13 as temporaries ---
2396   197,124,82,217,                         //vrsqrtps      %ymm1,%ymm11
2397   196,65,124,83,235,                      //vrcpps        %ymm11,%ymm13
2398   196,65,124,82,219,                      //vrsqrtps      %ymm11,%ymm11
2399   196,66,13,168,239,                      //vfmadd213ps   %ymm15,%ymm14,%ymm13
2400   196,66,45,184,235,                      //vfmadd231ps   %ymm11,%ymm10,%ymm13
2401   197,60,89,217,                          //vmulps        %ymm1,%ymm8,%ymm11
2402   196,65,52,93,237,                       //vminps        %ymm13,%ymm9,%ymm13
2403   196,193,116,194,204,1,                  //vcmpltps      %ymm12,%ymm1,%ymm1
2404   196,195,21,74,203,16,                   //vblendvps     %ymm1,%ymm11,%ymm13,%ymm1
  // --- channel in ymm2: last use of the constants, so ymm8/ymm9 are overwritten ---
2405   197,124,82,218,                         //vrsqrtps      %ymm2,%ymm11
2406   196,65,124,83,235,                      //vrcpps        %ymm11,%ymm13
2407   196,66,13,168,239,                      //vfmadd213ps   %ymm15,%ymm14,%ymm13
2408   196,65,124,82,219,                      //vrsqrtps      %ymm11,%ymm11
2409   196,66,45,184,235,                      //vfmadd231ps   %ymm11,%ymm10,%ymm13
2410   196,65,52,93,205,                       //vminps        %ymm13,%ymm9,%ymm9
2411   197,60,89,194,                          //vmulps        %ymm2,%ymm8,%ymm8
2412   196,193,108,194,212,1,                  //vcmpltps      %ymm12,%ymm2,%ymm2
2413   196,195,53,74,208,32,                   //vblendvps     %ymm2,%ymm8,%ymm9,%ymm2
  // fetch the next 8-byte word from (%rsi) and jump to it
2414   72,173,                                 //lods          %ds:(%rsi),%rax
2415   255,224,                                //jmpq          *%rax
2416 };
2417 
// sk_scale_1_float_hsw: multiplies ymm0, ymm1, ymm2 and ymm3 by a single float.
//
// The first lods reads an 8-byte pointer from (%rsi) into rax; the float it points to
// is broadcast to all lanes of ymm8 and used as the common factor.
// Scratch: rax, ymm8.
// Exit: a second lods reads the next 8-byte word at (%rsi) and the code jumps to it.
2418 CODE const uint8_t sk_scale_1_float_hsw[] = {
2419   72,173,                                 //lods          %ds:(%rsi),%rax
2420   196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
2421   197,188,89,192,                         //vmulps        %ymm0,%ymm8,%ymm0
2422   197,188,89,201,                         //vmulps        %ymm1,%ymm8,%ymm1
2423   197,188,89,210,                         //vmulps        %ymm2,%ymm8,%ymm2
2424   197,188,89,219,                         //vmulps        %ymm3,%ymm8,%ymm3
2425   72,173,                                 //lods          %ds:(%rsi),%rax
2426   255,224,                                //jmpq          *%rax
2427 };
2428 
// sk_scale_u8_hsw: multiplies ymm0-ymm3 by per-lane factors read from a byte buffer.
//
// The first lods reads a pointer from (%rsi); the 8-byte value stored at that pointer
// plus rdi gives the address of the bytes. Each byte is zero-extended, converted to
// float and multiplied by 0x3b808081 (~1/255) before scaling ymm0-ymm3.
//
// rcx selects the load path: zero -> one 8-byte load; non-zero -> the loop at the end
// assembles rcx bytes one at a time (NOTE(review): rcx looks like a 1..7 tail count --
// confirm with the caller). rcx is parked in r8 and restored before leaving.
// Scratch: rax, r8-r11, ymm8, ymm9.
2429 CODE const uint8_t sk_scale_u8_hsw[] = {
2430   73,137,200,                             //mov           %rcx,%r8
2431   72,173,                                 //lods          %ds:(%rsi),%rax
2432   72,139,0,                               //mov           (%rax),%rax
2433   72,1,248,                               //add           %rdi,%rax
2434   77,133,192,                             //test          %r8,%r8
2435   117,56,                                 //jne           4bf <_sk_scale_u8_hsw+0x48>
  // full path: 8 bytes -> 8 dwords -> 8 floats, scaled by ~1/255
2436   197,122,126,0,                          //vmovq         (%rax),%xmm8
2437   196,66,125,49,192,                      //vpmovzxbd     %xmm8,%ymm8
2438   196,65,124,91,192,                      //vcvtdq2ps     %ymm8,%ymm8
2439   184,129,128,128,59,                     //mov           $0x3b808081,%eax
2440   197,121,110,200,                        //vmovd         %eax,%xmm9
2441   196,66,125,88,201,                      //vpbroadcastd  %xmm9,%ymm9
2442   196,65,60,89,193,                       //vmulps        %ymm9,%ymm8,%ymm8
2443   197,188,89,192,                         //vmulps        %ymm0,%ymm8,%ymm0
2444   197,188,89,201,                         //vmulps        %ymm1,%ymm8,%ymm1
2445   197,188,89,210,                         //vmulps        %ymm2,%ymm8,%ymm2
2446   197,188,89,219,                         //vmulps        %ymm3,%ymm8,%ymm3
  // fetch the next word from (%rsi), restore rcx, jump
2447   72,173,                                 //lods          %ds:(%rsi),%rax
2448   76,137,193,                             //mov           %r8,%rcx
2449   255,224,                                //jmpq          *%rax
  // partial path (+0x48): rcx = shift count, r10 = bytes left, r9 = packed result
2450   49,201,                                 //xor           %ecx,%ecx
2451   77,137,194,                             //mov           %r8,%r10
2452   69,49,201,                              //xor           %r9d,%r9d
  // loop (+0x50): r9 |= (next byte) << cl, then advance by one byte / 8 bits
2453   68,15,182,24,                           //movzbl        (%rax),%r11d
2454   72,255,192,                             //inc           %rax
2455   73,211,227,                             //shl           %cl,%r11
2456   77,9,217,                               //or            %r11,%r9
2457   72,131,193,8,                           //add           $0x8,%rcx
2458   73,255,202,                             //dec           %r10
2459   117,234,                                //jne           4c7 <_sk_scale_u8_hsw+0x50>
  // hand the packed bytes to xmm8 and rejoin the full path just after its vmovq load
2460   196,65,249,110,193,                     //vmovq         %r9,%xmm8
2461   235,167,                                //jmp           48b <_sk_scale_u8_hsw+0x14>
2462 };
2463 
// sk_lerp_1_float_hsw: blends ymm0-ymm3 towards ymm4-ymm7 with one shared float weight t:
//     ymm0 = (ymm0 - ymm4) * t + ymm4     (likewise ymm1/ymm5, ymm2/ymm6, ymm3/ymm7)
//
// t is the float pointed to by the 8-byte pointer that the first lods reads from (%rsi),
// broadcast to every lane of ymm8. ymm4-ymm7 are only read.
// (NOTE(review): ymm0-3 / ymm4-7 are presumably source and destination colours -- confirm.)
// Scratch: rax, ymm8.
// Exit: a second lods reads the next 8-byte word at (%rsi) and the code jumps to it.
2464 CODE const uint8_t sk_lerp_1_float_hsw[] = {
2465   72,173,                                 //lods          %ds:(%rsi),%rax
2466   196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
2467   197,252,92,196,                         //vsubps        %ymm4,%ymm0,%ymm0
2468   196,226,61,168,196,                     //vfmadd213ps   %ymm4,%ymm8,%ymm0
2469   197,244,92,205,                         //vsubps        %ymm5,%ymm1,%ymm1
2470   196,226,61,168,205,                     //vfmadd213ps   %ymm5,%ymm8,%ymm1
2471   197,236,92,214,                         //vsubps        %ymm6,%ymm2,%ymm2
2472   196,226,61,168,214,                     //vfmadd213ps   %ymm6,%ymm8,%ymm2
2473   197,228,92,223,                         //vsubps        %ymm7,%ymm3,%ymm3
2474   196,226,61,168,223,                     //vfmadd213ps   %ymm7,%ymm8,%ymm3
2475   72,173,                                 //lods          %ds:(%rsi),%rax
2476   255,224,                                //jmpq          *%rax
2477 };
2478 
// sk_lerp_u8_hsw: blends ymm0-ymm3 towards ymm4-ymm7 with per-lane weights t read from
// a byte buffer:
//     ymm0 = (ymm0 - ymm4) * t + ymm4     (likewise ymm1/ymm5, ymm2/ymm6, ymm3/ymm7)
//
// The first lods reads a pointer from (%rsi); the 8-byte value stored at that pointer
// plus rdi gives the address of the bytes. Each byte is zero-extended, converted to
// float and multiplied by 0x3b808081 (~1/255) to form t.
//
// rcx selects the load path: zero -> one 8-byte load; non-zero -> the loop at the end
// assembles rcx bytes one at a time (NOTE(review): rcx looks like a 1..7 tail count --
// confirm with the caller). rcx is parked in r8 and restored before leaving.
// Scratch: rax, r8-r11, ymm8, ymm9.
2479 CODE const uint8_t sk_lerp_u8_hsw[] = {
2480   73,137,200,                             //mov           %rcx,%r8
2481   72,173,                                 //lods          %ds:(%rsi),%rax
2482   72,139,0,                               //mov           (%rax),%rax
2483   72,1,248,                               //add           %rdi,%rax
2484   77,133,192,                             //test          %r8,%r8
2485   117,76,                                 //jne           56f <_sk_lerp_u8_hsw+0x5c>
  // full path: 8 bytes -> 8 dwords -> 8 floats, scaled by ~1/255 into ymm8 (= t)
2486   197,122,126,0,                          //vmovq         (%rax),%xmm8
2487   196,66,125,49,192,                      //vpmovzxbd     %xmm8,%ymm8
2488   196,65,124,91,192,                      //vcvtdq2ps     %ymm8,%ymm8
2489   184,129,128,128,59,                     //mov           $0x3b808081,%eax
2490   197,121,110,200,                        //vmovd         %eax,%xmm9
2491   196,66,125,88,201,                      //vpbroadcastd  %xmm9,%ymm9
2492   196,65,60,89,193,                       //vmulps        %ymm9,%ymm8,%ymm8
  // per register pair: difference first, then fused multiply-add with t
2493   197,252,92,196,                         //vsubps        %ymm4,%ymm0,%ymm0
2494   196,226,61,168,196,                     //vfmadd213ps   %ymm4,%ymm8,%ymm0
2495   197,244,92,205,                         //vsubps        %ymm5,%ymm1,%ymm1
2496   196,226,61,168,205,                     //vfmadd213ps   %ymm5,%ymm8,%ymm1
2497   197,236,92,214,                         //vsubps        %ymm6,%ymm2,%ymm2
2498   196,226,61,168,214,                     //vfmadd213ps   %ymm6,%ymm8,%ymm2
2499   197,228,92,223,                         //vsubps        %ymm7,%ymm3,%ymm3
2500   196,226,61,168,223,                     //vfmadd213ps   %ymm7,%ymm8,%ymm3
  // fetch the next word from (%rsi), restore rcx, jump
2501   72,173,                                 //lods          %ds:(%rsi),%rax
2502   76,137,193,                             //mov           %r8,%rcx
2503   255,224,                                //jmpq          *%rax
  // partial path (+0x5c): rcx = shift count, r10 = bytes left, r9 = packed result
2504   49,201,                                 //xor           %ecx,%ecx
2505   77,137,194,                             //mov           %r8,%r10
2506   69,49,201,                              //xor           %r9d,%r9d
  // loop (+0x64): r9 |= (next byte) << cl, then advance by one byte / 8 bits
2507   68,15,182,24,                           //movzbl        (%rax),%r11d
2508   72,255,192,                             //inc           %rax
2509   73,211,227,                             //shl           %cl,%r11
2510   77,9,217,                               //or            %r11,%r9
2511   72,131,193,8,                           //add           $0x8,%rcx
2512   73,255,202,                             //dec           %r10
2513   117,234,                                //jne           577 <_sk_lerp_u8_hsw+0x64>
  // hand the packed bytes to xmm8 and rejoin the full path just after its vmovq load
2514   196,65,249,110,193,                     //vmovq         %r9,%xmm8
2515   235,147,                                //jmp           527 <_sk_lerp_u8_hsw+0x14>
2516 };
2517 
// sk_lerp_565_hsw: blends ymm0/ymm1/ymm2 towards ymm4/ymm5/ymm6 using three weights
// unpacked from 16-bit values, then sets every lane of ymm3 to 1.0.
//
// The first lods reads a pointer from (%rsi); the 8-byte value stored there (r10) is
// the base of the 16-bit buffer, indexed as r10 + rdi*2. Each 16-bit value v yields
//     t0 = float(v & 0xf800) * 0x37842108   (~1/0xf800)  -> weight for ymm0/ymm4
//     t1 = float(v & 0x07e0) * 0x3a020821   (~1/0x07e0)  -> weight for ymm1/ymm5
//     t2 = float(v & 0x001f) * 0x3d042108   (~1/0x001f)  -> weight for ymm2/ymm6
// and the blend is  ymmN = (ymmN - ymmM) * t + ymmM.
//
// rcx selects the load path: zero -> one 16-byte load; non-zero -> a jump-table driven
// ladder of vpinsrw loads that reads only (rcx & 7) values and leaves the other lanes
// zero (NOTE(review): rcx looks like a 1..7 tail count -- confirm with the caller).
// Scratch: rax, r8, r9, r10, ymm3 (as a temporary), ymm8-ymm10.
//
// The last 28 bytes of the array are the jump table, not instructions; the
// disassembly text next to them is meaningless.
2518 CODE const uint8_t sk_lerp_565_hsw[] = {
2519   72,173,                                 //lods          %ds:(%rsi),%rax
2520   76,139,16,                              //mov           (%rax),%r10
2521   72,133,201,                             //test          %rcx,%rcx
2522   15,133,179,0,0,0,                       //jne           655 <_sk_lerp_565_hsw+0xc1>
  // full path: load eight 16-bit values; ymm8 = the values zero-extended to dwords
2523   196,193,122,111,28,122,                 //vmovdqu       (%r10,%rdi,2),%xmm3
2524   196,98,125,51,195,                      //vpmovzxwd     %xmm3,%ymm8
  // ymm9 = t0: bits selected by 0xf800, converted to float and scaled
2525   184,0,248,0,0,                          //mov           $0xf800,%eax
2526   197,249,110,216,                        //vmovd         %eax,%xmm3
2527   196,226,125,88,219,                     //vpbroadcastd  %xmm3,%ymm3
2528   196,193,101,219,216,                    //vpand         %ymm8,%ymm3,%ymm3
2529   197,124,91,203,                         //vcvtdq2ps     %ymm3,%ymm9
2530   184,8,33,132,55,                        //mov           $0x37842108,%eax
2531   197,249,110,216,                        //vmovd         %eax,%xmm3
2532   196,226,125,88,219,                     //vpbroadcastd  %xmm3,%ymm3
2533   197,52,89,203,                          //vmulps        %ymm3,%ymm9,%ymm9
  // ymm10 = t1: bits selected by 0x7e0, converted to float and scaled
2534   184,224,7,0,0,                          //mov           $0x7e0,%eax
2535   197,249,110,216,                        //vmovd         %eax,%xmm3
2536   196,226,125,88,219,                     //vpbroadcastd  %xmm3,%ymm3
2537   196,193,101,219,216,                    //vpand         %ymm8,%ymm3,%ymm3
2538   197,124,91,211,                         //vcvtdq2ps     %ymm3,%ymm10
2539   184,33,8,2,58,                          //mov           $0x3a020821,%eax
2540   197,249,110,216,                        //vmovd         %eax,%xmm3
2541   196,226,125,88,219,                     //vpbroadcastd  %xmm3,%ymm3
2542   197,44,89,211,                          //vmulps        %ymm3,%ymm10,%ymm10
  // ymm3 = t2: bits selected by 0x1f, converted to float and scaled
2543   184,31,0,0,0,                           //mov           $0x1f,%eax
2544   197,249,110,216,                        //vmovd         %eax,%xmm3
2545   196,226,125,88,219,                     //vpbroadcastd  %xmm3,%ymm3
2546   196,193,101,219,216,                    //vpand         %ymm8,%ymm3,%ymm3
2547   197,124,91,195,                         //vcvtdq2ps     %ymm3,%ymm8
2548   184,8,33,4,61,                          //mov           $0x3d042108,%eax
2549   197,249,110,216,                        //vmovd         %eax,%xmm3
2550   196,226,125,88,219,                     //vpbroadcastd  %xmm3,%ymm3
2551   197,188,89,219,                         //vmulps        %ymm3,%ymm8,%ymm3
  // blend each register pair with its own weight
2552   197,252,92,196,                         //vsubps        %ymm4,%ymm0,%ymm0
2553   196,226,53,168,196,                     //vfmadd213ps   %ymm4,%ymm9,%ymm0
2554   197,244,92,205,                         //vsubps        %ymm5,%ymm1,%ymm1
2555   196,226,45,168,205,                     //vfmadd213ps   %ymm5,%ymm10,%ymm1
2556   197,236,92,214,                         //vsubps        %ymm6,%ymm2,%ymm2
2557   196,226,101,168,214,                    //vfmadd213ps   %ymm6,%ymm3,%ymm2
  // ymm3 = 1.0 in every lane
2558   184,0,0,128,63,                         //mov           $0x3f800000,%eax
2559   197,249,110,216,                        //vmovd         %eax,%xmm3
2560   196,226,125,88,219,                     //vpbroadcastd  %xmm3,%ymm3
  // fetch the next 8-byte word from (%rsi) and jump to it
2561   72,173,                                 //lods          %ds:(%rsi),%rax
2562   255,224,                                //jmpq          *%rax
  // partial path (+0xc1): r8b = (rcx & 7) - 1 is the jump-table index; xmm3 starts as zero.
  // If rcx & 7 == 0 the decrement wraps above 6 and the ja rejoins at +0x14 with xmm3 = 0.
2563   65,137,200,                             //mov           %ecx,%r8d
2564   65,128,224,7,                           //and           $0x7,%r8b
2565   197,225,239,219,                        //vpxor         %xmm3,%xmm3,%xmm3
2566   65,254,200,                             //dec           %r8b
2567   65,128,248,6,                           //cmp           $0x6,%r8b
2568   15,135,59,255,255,255,                  //ja            5a8 <_sk_lerp_565_hsw+0x14>
  // dispatch: rax = table base + sign-extended 32-bit table[index], then jump
2569   69,15,182,192,                          //movzbl        %r8b,%r8d
2570   76,141,13,76,0,0,0,                     //lea           0x4c(%rip),%r9        # 6c4 <_sk_lerp_565_hsw+0x130>
2571   75,99,4,129,                            //movslq        (%r9,%r8,4),%rax
2572   76,1,200,                               //add           %r9,%rax
2573   255,224,                                //jmpq          *%rax
  // fall-through ladder: table entry k enters so that the inserts for lanes k..0 run
  // (the entry for index 6 starts at the vpxor, the entry for index 0 at the last vpinsrw)
2574   197,225,239,219,                        //vpxor         %xmm3,%xmm3,%xmm3
2575   196,193,97,196,92,122,12,6,             //vpinsrw       $0x6,0xc(%r10,%rdi,2),%xmm3,%xmm3
2576   196,193,97,196,92,122,10,5,             //vpinsrw       $0x5,0xa(%r10,%rdi,2),%xmm3,%xmm3
2577   196,193,97,196,92,122,8,4,              //vpinsrw       $0x4,0x8(%r10,%rdi,2),%xmm3,%xmm3
2578   196,193,97,196,92,122,6,3,              //vpinsrw       $0x3,0x6(%r10,%rdi,2),%xmm3,%xmm3
2579   196,193,97,196,92,122,4,2,              //vpinsrw       $0x2,0x4(%r10,%rdi,2),%xmm3,%xmm3
2580   196,193,97,196,92,122,2,1,              //vpinsrw       $0x1,0x2(%r10,%rdi,2),%xmm3,%xmm3
2581   196,193,97,196,28,122,0,                //vpinsrw       $0x0,(%r10,%rdi,2),%xmm3,%xmm3
  // rejoin the full path just after its vmovdqu load
2582   233,231,254,255,255,                    //jmpq          5a8 <_sk_lerp_565_hsw+0x14>
  // 3 bytes of padding before the table
2583   15,31,0,                                //nopl          (%rax)
  // jump table (+0x130): seven little-endian signed 32-bit offsets relative to the
  // table base: -15, -23, -31, -39, -47, -55, -67. This is data, not code.
2584   241,                                    //icebp
2585   255,                                    //(bad)
2586   255,                                    //(bad)
2587   255,                                    //(bad)
2588   233,255,255,255,225,                    //jmpq          ffffffffe20006cc <_sk_linear_gradient_2stops_hsw+0xffffffffe1fff4f0>
2589   255,                                    //(bad)
2590   255,                                    //(bad)
2591   255,                                    //(bad)
2592   217,255,                                //fcos
2593   255,                                    //(bad)
2594   255,209,                                //callq         *%rcx
2595   255,                                    //(bad)
2596   255,                                    //(bad)
2597   255,201,                                //dec           %ecx
2598   255,                                    //(bad)
2599   255,                                    //(bad)
2600   255,                                    //(bad)
2601   189,                                    //.byte         0xbd
2602   255,                                    //(bad)
2603   255,                                    //(bad)
2604   255,                                    //.byte         0xff
2605 };
2606 
// sk_load_tables_hsw: loads eight 32-bit values and produces ymm0-ymm3 from them,
// looking three of the four bytes up in float tables.
//
// The first lods reads a context pointer from (%rsi) into rax. The code reads four
// 8-byte fields from it:
//     0(rax)   base of the 32-bit input buffer (indexed as base + rdi*4)
//     8(rax)   float table indexed by bits  0..7   -> ymm0
//     16(rax)  float table indexed by bits  8..15  -> ymm1
//     24(rax)  float table indexed by bits 16..23  -> ymm2
// Bits 24..31 are converted to float and multiplied by 0x3b808081 (~1/255) -> ymm3.
// (NOTE(review): presumably r/g/b lookup tables plus a normalised alpha -- confirm.)
//
// rcx selects the load path: zero -> one 32-byte load; non-zero -> a masked load of
// the first rcx dwords (NOTE(review): rcx looks like a 1..7 tail count -- confirm).
// rcx is parked in r8 and restored before leaving.
// Scratch: rax, r8, r9, r10, ymm8-ymm10.
2607 CODE const uint8_t sk_load_tables_hsw[] = {
2608   73,137,200,                             //mov           %rcx,%r8
2609   72,173,                                 //lods          %ds:(%rsi),%rax
  // r9 = rdi*4 + input base = address of the first 32-bit value
2610   76,141,12,189,0,0,0,0,                  //lea           0x0(,%rdi,4),%r9
2611   76,3,8,                                 //add           (%rax),%r9
2612   77,133,192,                             //test          %r8,%r8
2613   117,121,                                //jne           76e <_sk_load_tables_hsw+0x8e>
2614   196,193,126,111,25,                     //vmovdqu       (%r9),%ymm3
  // +0x1a: ymm2 = 0xff in every dword, used as the byte mask
2615   185,255,0,0,0,                          //mov           $0xff,%ecx
2616   197,249,110,193,                        //vmovd         %ecx,%xmm0
2617   196,226,125,88,208,                     //vpbroadcastd  %xmm0,%ymm2
2618   197,237,219,203,                        //vpand         %ymm3,%ymm2,%ymm1
  // vpcmpeqd of a register with itself yields all-ones: these are the gather masks
  // (a separate one per gather: ymm9, ymm10, ymm8)
2619   196,65,61,118,192,                      //vpcmpeqd      %ymm8,%ymm8,%ymm8
2620   72,139,72,8,                            //mov           0x8(%rax),%rcx
2621   76,139,72,16,                           //mov           0x10(%rax),%r9
2622   196,65,53,118,201,                      //vpcmpeqd      %ymm9,%ymm9,%ymm9
  // ymm0 = table at 8(rax), indexed by bits 0..7
2623   196,226,53,146,4,137,                   //vgatherdps    %ymm9,(%rcx,%ymm1,4),%ymm0
  // ymm1 = table at 16(rax), indexed by bits 8..15
2624   197,245,114,211,8,                      //vpsrld        $0x8,%ymm3,%ymm1
2625   197,109,219,201,                        //vpand         %ymm1,%ymm2,%ymm9
2626   196,65,45,118,210,                      //vpcmpeqd      %ymm10,%ymm10,%ymm10
2627   196,130,45,146,12,137,                  //vgatherdps    %ymm10,(%r9,%ymm9,4),%ymm1
  // ymm2 = table at 24(rax), indexed by bits 16..23
2628   72,139,64,24,                           //mov           0x18(%rax),%rax
2629   197,181,114,211,16,                     //vpsrld        $0x10,%ymm3,%ymm9
2630   196,65,109,219,201,                     //vpand         %ymm9,%ymm2,%ymm9
2631   196,162,61,146,20,136,                  //vgatherdps    %ymm8,(%rax,%ymm9,4),%ymm2
  // ymm3 = float(bits 24..31) * ~1/255
2632   197,229,114,211,24,                     //vpsrld        $0x18,%ymm3,%ymm3
2633   197,124,91,195,                         //vcvtdq2ps     %ymm3,%ymm8
2634   184,129,128,128,59,                     //mov           $0x3b808081,%eax
2635   197,249,110,216,                        //vmovd         %eax,%xmm3
2636   196,226,125,88,219,                     //vpbroadcastd  %xmm3,%ymm3
2637   197,188,89,219,                         //vmulps        %ymm3,%ymm8,%ymm3
  // fetch the next word from (%rsi), restore rcx, jump
2638   72,173,                                 //lods          %ds:(%rsi),%rax
2639   76,137,193,                             //mov           %r8,%rcx
2640   255,224,                                //jmpq          *%rax
  // partial path (+0x8e): r10 = all-ones >> (8 * (8 - r8)), i.e. the low r8 bytes are 0xff;
  // vpmovsxbd sign-extends each byte to a dword, so the first r8 dword lanes are all-ones
2641   185,8,0,0,0,                            //mov           $0x8,%ecx
2642   68,41,193,                              //sub           %r8d,%ecx
2643   192,225,3,                              //shl           $0x3,%cl
2644   73,199,194,255,255,255,255,             //mov           $0xffffffffffffffff,%r10
2645   73,211,234,                             //shr           %cl,%r10
2646   196,193,249,110,194,                    //vmovq         %r10,%xmm0
2647   196,226,125,33,192,                     //vpmovsxbd     %xmm0,%ymm0
  // masked load into ymm3 (only the selected lanes are read), then rejoin at +0x1a
2648   196,194,125,140,25,                     //vpmaskmovd    (%r9),%ymm0,%ymm3
2649   233,99,255,255,255,                     //jmpq          6fa <_sk_load_tables_hsw+0x1a>
2650 };
2651 
// sk_load_a8_hsw: loads bytes into ymm3 as floats scaled by 0x3b808081 (~1/255) and
// zeroes ymm0, ymm1 and ymm2.
//
// The first lods reads a pointer from (%rsi); the 8-byte value stored at that pointer
// plus rdi gives the address of the bytes.
//
// rcx selects the load path: zero -> one 8-byte load; non-zero -> the loop at the end
// assembles rcx bytes one at a time (NOTE(review): rcx looks like a 1..7 tail count --
// confirm with the caller). rcx is parked in r8 and restored before leaving.
// Scratch: rax, r8-r11.
2652 CODE const uint8_t sk_load_a8_hsw[] = {
2653   73,137,200,                             //mov           %rcx,%r8
2654   72,173,                                 //lods          %ds:(%rsi),%rax
2655   72,139,0,                               //mov           (%rax),%rax
2656   72,1,248,                               //add           %rdi,%rax
2657   77,133,192,                             //test          %r8,%r8
2658   117,50,                                 //jne           7d9 <_sk_load_a8_hsw+0x42>
  // full path: 8 bytes -> 8 dwords -> 8 floats; ymm3 = value * ~1/255
2659   197,250,126,0,                          //vmovq         (%rax),%xmm0
2660   196,226,125,49,192,                     //vpmovzxbd     %xmm0,%ymm0
2661   197,252,91,192,                         //vcvtdq2ps     %ymm0,%ymm0
2662   184,129,128,128,59,                     //mov           $0x3b808081,%eax
2663   197,249,110,200,                        //vmovd         %eax,%xmm1
2664   196,226,125,88,201,                     //vpbroadcastd  %xmm1,%ymm1
2665   197,252,89,217,                         //vmulps        %ymm1,%ymm0,%ymm3
  // fetch the next word from (%rsi), clear ymm0-ymm2, restore rcx, jump
2666   72,173,                                 //lods          %ds:(%rsi),%rax
2667   197,252,87,192,                         //vxorps        %ymm0,%ymm0,%ymm0
2668   197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
2669   197,236,87,210,                         //vxorps        %ymm2,%ymm2,%ymm2
2670   76,137,193,                             //mov           %r8,%rcx
2671   255,224,                                //jmpq          *%rax
  // partial path (+0x42): rcx = shift count, r10 = bytes left, r9 = packed result
2672   49,201,                                 //xor           %ecx,%ecx
2673   77,137,194,                             //mov           %r8,%r10
2674   69,49,201,                              //xor           %r9d,%r9d
  // loop (+0x4a): r9 |= (next byte) << cl, then advance by one byte / 8 bits
2675   68,15,182,24,                           //movzbl        (%rax),%r11d
2676   72,255,192,                             //inc           %rax
2677   73,211,227,                             //shl           %cl,%r11
2678   77,9,217,                               //or            %r11,%r9
2679   72,131,193,8,                           //add           $0x8,%rcx
2680   73,255,202,                             //dec           %r10
2681   117,234,                                //jne           7e1 <_sk_load_a8_hsw+0x4a>
  // hand the packed bytes to xmm0 and rejoin the full path just after its vmovq load
2682   196,193,249,110,193,                    //vmovq         %r9,%xmm0
2683   235,173,                                //jmp           7ab <_sk_load_a8_hsw+0x14>
2684 };
2685 
// sk_store_a8_hsw: converts ymm3 to bytes and stores them.
//
// The first lods reads a pointer from (%rsi); the 8-byte value stored there (r9) is the
// destination base, indexed as r9 + rdi. Each lane of ymm3 is multiplied by 0x437f0000
// (255.0), converted to a 32-bit integer and narrowed to a byte with unsigned saturation
// (vpackusdw, then vpackuswb). ymm0-ymm3 are not modified.
//
// rcx selects the store path: zero -> one 8-byte store; non-zero -> a jump-table driven
// ladder of vpextrb stores that writes only (rcx & 7) bytes (NOTE(review): rcx looks
// like a 1..7 tail count -- confirm with the caller).
// Scratch: rax, r8, r9, ymm8, ymm9.
//
// The last 28 bytes of the array are the jump table, not instructions; the
// disassembly text next to them is meaningless.
2686 CODE const uint8_t sk_store_a8_hsw[] = {
2687   72,173,                                 //lods          %ds:(%rsi),%rax
2688   76,139,8,                               //mov           (%rax),%r9
  // ymm8 = ymm3 * 255.0, converted to integers
2689   184,0,0,127,67,                         //mov           $0x437f0000,%eax
2690   197,121,110,192,                        //vmovd         %eax,%xmm8
2691   196,66,125,88,192,                      //vpbroadcastd  %xmm8,%ymm8
2692   197,60,89,195,                          //vmulps        %ymm3,%ymm8,%ymm8
2693   196,65,125,91,192,                      //vcvtps2dq     %ymm8,%ymm8
  // narrow 8 dwords -> 8 words -> 8 bytes (low 8 bytes of xmm8)
2694   196,67,125,25,193,1,                    //vextractf128  $0x1,%ymm8,%xmm9
2695   196,66,57,43,193,                       //vpackusdw     %xmm9,%xmm8,%xmm8
2696   196,65,57,103,192,                      //vpackuswb     %xmm8,%xmm8,%xmm8
2697   72,133,201,                             //test          %rcx,%rcx
2698   117,10,                                 //jne           839 <_sk_store_a8_hsw+0x3b>
  // full path: store all 8 bytes at r9 + rdi
2699   196,65,123,17,4,57,                     //vmovsd        %xmm8,(%r9,%rdi,1)
  // +0x37: fetch the next 8-byte word from (%rsi) and jump to it
2700   72,173,                                 //lods          %ds:(%rsi),%rax
2701   255,224,                                //jmpq          *%rax
  // partial path (+0x3b): r8b = (rcx & 7) - 1 is the jump-table index.
  // If rcx & 7 == 0 the decrement wraps above 6 and the ja leaves without storing.
2702   65,137,200,                             //mov           %ecx,%r8d
2703   65,128,224,7,                           //and           $0x7,%r8b
2704   65,254,200,                             //dec           %r8b
2705   65,128,248,6,                           //cmp           $0x6,%r8b
2706   119,236,                                //ja            835 <_sk_store_a8_hsw+0x37>
  // widen bytes to words so that byte k now sits at byte position 2*k of xmm8
2707   196,66,121,48,192,                      //vpmovzxbw     %xmm8,%xmm8
  // dispatch: rax = table base + sign-extended 32-bit table[index], then jump
2708   65,15,182,192,                          //movzbl        %r8b,%eax
2709   76,141,5,67,0,0,0,                      //lea           0x43(%rip),%r8        # 89c <_sk_store_a8_hsw+0x9e>
2710   73,99,4,128,                            //movslq        (%r8,%rax,4),%rax
2711   76,1,192,                               //add           %r8,%rax
2712   255,224,                                //jmpq          *%rax
  // fall-through ladder: table entry k enters so that the stores for bytes k..0 run
2713   196,67,121,20,68,57,6,12,               //vpextrb       $0xc,%xmm8,0x6(%r9,%rdi,1)
2714   196,67,121,20,68,57,5,10,               //vpextrb       $0xa,%xmm8,0x5(%r9,%rdi,1)
2715   196,67,121,20,68,57,4,8,                //vpextrb       $0x8,%xmm8,0x4(%r9,%rdi,1)
2716   196,67,121,20,68,57,3,6,                //vpextrb       $0x6,%xmm8,0x3(%r9,%rdi,1)
2717   196,67,121,20,68,57,2,4,                //vpextrb       $0x4,%xmm8,0x2(%r9,%rdi,1)
2718   196,67,121,20,68,57,1,2,                //vpextrb       $0x2,%xmm8,0x1(%r9,%rdi,1)
2719   196,67,121,20,4,57,0,                   //vpextrb       $0x0,%xmm8,(%r9,%rdi,1)
  // rejoin the common exit at +0x37
2720   235,154,                                //jmp           835 <_sk_store_a8_hsw+0x37>
  // 1 byte of padding before the table
2721   144,                                    //nop
  // jump table (+0x9e): seven little-endian signed 32-bit offsets relative to the
  // table base: -10, -18, -26, -34, -42, -50, -58. This is data, not code.
2722   246,255,                                //idiv          %bh
2723   255,                                    //(bad)
2724   255,                                    //(bad)
2725   238,                                    //out           %al,(%dx)
2726   255,                                    //(bad)
2727   255,                                    //(bad)
2728   255,230,                                //jmpq          *%rsi
2729   255,                                    //(bad)
2730   255,                                    //(bad)
2731   255,                                    //(bad)
2732   222,255,                                //fdivrp        %st,%st(7)
2733   255,                                    //(bad)
2734   255,214,                                //callq         *%rsi
2735   255,                                    //(bad)
2736   255,                                    //(bad)
2737   255,206,                                //dec           %esi
2738   255,                                    //(bad)
2739   255,                                    //(bad)
2740   255,198,                                //inc           %esi
2741   255,                                    //(bad)
2742   255,                                    //(bad)
2743   255,                                    //.byte         0xff
2744 };
2745 
2746 CODE const uint8_t sk_load_565_hsw[] = {
2747   72,173,                                 //lods          %ds:(%rsi),%rax
2748   76,139,16,                              //mov           (%rax),%r10
2749   72,133,201,                             //test          %rcx,%rcx
2750   15,133,149,0,0,0,                       //jne           95b <_sk_load_565_hsw+0xa3>
2751   196,193,122,111,4,122,                  //vmovdqu       (%r10,%rdi,2),%xmm0
2752   196,226,125,51,208,                     //vpmovzxwd     %xmm0,%ymm2
2753   184,0,248,0,0,                          //mov           $0xf800,%eax
2754   197,249,110,192,                        //vmovd         %eax,%xmm0
2755   196,226,125,88,192,                     //vpbroadcastd  %xmm0,%ymm0
2756   197,253,219,194,                        //vpand         %ymm2,%ymm0,%ymm0
2757   197,252,91,192,                         //vcvtdq2ps     %ymm0,%ymm0
2758   184,8,33,132,55,                        //mov           $0x37842108,%eax
2759   197,249,110,200,                        //vmovd         %eax,%xmm1
2760   196,226,125,88,201,                     //vpbroadcastd  %xmm1,%ymm1
2761   197,252,89,193,                         //vmulps        %ymm1,%ymm0,%ymm0
2762   184,224,7,0,0,                          //mov           $0x7e0,%eax
2763   197,249,110,200,                        //vmovd         %eax,%xmm1
2764   196,226,125,88,201,                     //vpbroadcastd  %xmm1,%ymm1
2765   197,245,219,202,                        //vpand         %ymm2,%ymm1,%ymm1
2766   197,252,91,201,                         //vcvtdq2ps     %ymm1,%ymm1
2767   184,33,8,2,58,                          //mov           $0x3a020821,%eax
2768   197,249,110,216,                        //vmovd         %eax,%xmm3
2769   196,226,125,88,219,                     //vpbroadcastd  %xmm3,%ymm3
2770   197,244,89,203,                         //vmulps        %ymm3,%ymm1,%ymm1
2771   184,31,0,0,0,                           //mov           $0x1f,%eax
2772   197,249,110,216,                        //vmovd         %eax,%xmm3
2773   196,226,125,88,219,                     //vpbroadcastd  %xmm3,%ymm3
2774   197,229,219,210,                        //vpand         %ymm2,%ymm3,%ymm2
2775   197,252,91,210,                         //vcvtdq2ps     %ymm2,%ymm2
2776   184,8,33,4,61,                          //mov           $0x3d042108,%eax
2777   197,249,110,216,                        //vmovd         %eax,%xmm3
2778   196,226,125,88,219,                     //vpbroadcastd  %xmm3,%ymm3
2779   197,236,89,211,                         //vmulps        %ymm3,%ymm2,%ymm2
2780   184,0,0,128,63,                         //mov           $0x3f800000,%eax
2781   197,249,110,216,                        //vmovd         %eax,%xmm3
2782   196,226,125,88,219,                     //vpbroadcastd  %xmm3,%ymm3
2783   72,173,                                 //lods          %ds:(%rsi),%rax
2784   255,224,                                //jmpq          *%rax
2785   65,137,200,                             //mov           %ecx,%r8d
2786   65,128,224,7,                           //and           $0x7,%r8b
2787   197,249,239,192,                        //vpxor         %xmm0,%xmm0,%xmm0
2788   65,254,200,                             //dec           %r8b
2789   65,128,248,6,                           //cmp           $0x6,%r8b
2790   15,135,89,255,255,255,                  //ja            8cc <_sk_load_565_hsw+0x14>
2791   69,15,182,192,                          //movzbl        %r8b,%r8d
2792   76,141,13,74,0,0,0,                     //lea           0x4a(%rip),%r9        # 9c8 <_sk_load_565_hsw+0x110>
2793   75,99,4,129,                            //movslq        (%r9,%r8,4),%rax
2794   76,1,200,                               //add           %r9,%rax
2795   255,224,                                //jmpq          *%rax
2796   197,249,239,192,                        //vpxor         %xmm0,%xmm0,%xmm0
2797   196,193,121,196,68,122,12,6,            //vpinsrw       $0x6,0xc(%r10,%rdi,2),%xmm0,%xmm0
2798   196,193,121,196,68,122,10,5,            //vpinsrw       $0x5,0xa(%r10,%rdi,2),%xmm0,%xmm0
2799   196,193,121,196,68,122,8,4,             //vpinsrw       $0x4,0x8(%r10,%rdi,2),%xmm0,%xmm0
2800   196,193,121,196,68,122,6,3,             //vpinsrw       $0x3,0x6(%r10,%rdi,2),%xmm0,%xmm0
2801   196,193,121,196,68,122,4,2,             //vpinsrw       $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0
2802   196,193,121,196,68,122,2,1,             //vpinsrw       $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0
2803   196,193,121,196,4,122,0,                //vpinsrw       $0x0,(%r10,%rdi,2),%xmm0,%xmm0
2804   233,5,255,255,255,                      //jmpq          8cc <_sk_load_565_hsw+0x14>
2805   144,                                    //nop
2806   243,255,                                //repz          (bad)
2807   255,                                    //(bad)
2808   255,                                    //(bad)
2809   235,255,                                //jmp           9cd <_sk_load_565_hsw+0x115>
2810   255,                                    //(bad)
2811   255,227,                                //jmpq          *%rbx
2812   255,                                    //(bad)
2813   255,                                    //(bad)
2814   255,                                    //(bad)
2815   219,255,                                //(bad)
2816   255,                                    //(bad)
2817   255,211,                                //callq         *%rbx
2818   255,                                    //(bad)
2819   255,                                    //(bad)
2820   255,203,                                //dec           %ebx
2821   255,                                    //(bad)
2822   255,                                    //(bad)
2823   255,                                    //(bad)
2824   191,                                    //.byte         0xbf
2825   255,                                    //(bad)
2826   255,                                    //(bad)
2827   255,                                    //.byte         0xff
2828 };
2829 
// sk_store_565_hsw: x86-64 machine code bytes; the trailing comment on each
// row is the AT&T-syntax disassembly of those bytes.
//
// What the code does:
//   * lods reads the next qword of the stream at %rsi into %rax; the qword
//     stored at (%rax) is used as the destination base pointer (%r9).
//   * %ymm0, %ymm1 and %ymm2 are multiplied by 31.0 (0x41f80000),
//     63.0 (0x427c0000) and 31.0, converted to 32-bit integers, combined as
//     (ymm0 << 11) | (ymm1 << 5) | ymm2, and packed into eight 16-bit words.
//   * %rcx == 0: all eight words are stored at %r9 + 2*%rdi.
//     %rcx != 0: only the low (%rcx & 7) words are stored, through the jump
//     table at the end of the array.
//   * The closing lods/jmpq pair reads the next qword from the %rsi stream
//     and jumps to it.
2830 CODE const uint8_t sk_store_565_hsw[] = {
2831   72,173,                                 //lods          %ds:(%rsi),%rax
2832   76,139,8,                               //mov           (%rax),%r9
       // ymm9 = int(ymm0 * 31.0) << 11
2833   184,0,0,248,65,                         //mov           $0x41f80000,%eax
2834   197,121,110,192,                        //vmovd         %eax,%xmm8
2835   196,66,125,88,192,                      //vpbroadcastd  %xmm8,%ymm8
2836   197,60,89,200,                          //vmulps        %ymm0,%ymm8,%ymm9
2837   196,65,125,91,201,                      //vcvtps2dq     %ymm9,%ymm9
2838   196,193,53,114,241,11,                  //vpslld        $0xb,%ymm9,%ymm9
       // ymm10 = int(ymm1 * 63.0) << 5, then OR-ed into ymm9
2839   184,0,0,124,66,                         //mov           $0x427c0000,%eax
2840   197,121,110,208,                        //vmovd         %eax,%xmm10
2841   196,66,125,88,210,                      //vpbroadcastd  %xmm10,%ymm10
2842   197,44,89,209,                          //vmulps        %ymm1,%ymm10,%ymm10
2843   196,65,125,91,210,                      //vcvtps2dq     %ymm10,%ymm10
2844   196,193,45,114,242,5,                   //vpslld        $0x5,%ymm10,%ymm10
2845   196,65,45,235,201,                      //vpor          %ymm9,%ymm10,%ymm9
       // ymm8 = int(ymm2 * 31.0), OR-ed with the other two fields
2846   197,60,89,194,                          //vmulps        %ymm2,%ymm8,%ymm8
2847   196,65,125,91,192,                      //vcvtps2dq     %ymm8,%ymm8
2848   196,65,53,235,192,                      //vpor          %ymm8,%ymm9,%ymm8
       // Narrow the eight dwords to eight words in xmm8 (unsigned saturation).
2849   196,67,125,57,193,1,                    //vextracti128  $0x1,%ymm8,%xmm9
2850   196,66,57,43,193,                       //vpackusdw     %xmm9,%xmm8,%xmm8
       // Non-zero rcx takes the partial-store path at +0x6c.
2851   72,133,201,                             //test          %rcx,%rcx
2852   117,10,                                 //jne           a50 <_sk_store_565_hsw+0x6c>
2853   196,65,122,127,4,121,                   //vmovdqu       %xmm8,(%r9,%rdi,2)
       // +0x68: common exit, load the next qword from the stream and jump to it.
2854   72,173,                                 //lods          %ds:(%rsi),%rax
2855   255,224,                                //jmpq          *%rax
       // +0x6c: partial store. index = (rcx & 7) - 1; an index above 6 (i.e.
       // rcx & 7 == 0) stores nothing and goes straight to the exit.
2856   65,137,200,                             //mov           %ecx,%r8d
2857   65,128,224,7,                           //and           $0x7,%r8b
2858   65,254,200,                             //dec           %r8b
2859   65,128,248,6,                           //cmp           $0x6,%r8b
2860   119,236,                                //ja            a4c <_sk_store_565_hsw+0x68>
       // rax = table + sign-extended table[index]; jump into the vpextrw chain.
2861   65,15,182,192,                          //movzbl        %r8b,%eax
2862   76,141,5,69,0,0,0,                      //lea           0x45(%rip),%r8        # ab0 <_sk_store_565_hsw+0xcc>
2863   73,99,4,128,                            //movslq        (%r8,%rax,4),%rax
2864   76,1,192,                               //add           %r8,%rax
2865   255,224,                                //jmpq          *%rax
       // Fall-through chain: entering at word k stores words k, k-1, ..., 0.
2866   196,67,121,21,68,121,12,6,              //vpextrw       $0x6,%xmm8,0xc(%r9,%rdi,2)
2867   196,67,121,21,68,121,10,5,              //vpextrw       $0x5,%xmm8,0xa(%r9,%rdi,2)
2868   196,67,121,21,68,121,8,4,               //vpextrw       $0x4,%xmm8,0x8(%r9,%rdi,2)
2869   196,67,121,21,68,121,6,3,               //vpextrw       $0x3,%xmm8,0x6(%r9,%rdi,2)
2870   196,67,121,21,68,121,4,2,               //vpextrw       $0x2,%xmm8,0x4(%r9,%rdi,2)
2871   196,67,121,21,68,121,2,1,               //vpextrw       $0x1,%xmm8,0x2(%r9,%rdi,2)
2872   196,67,121,21,4,121,0,                  //vpextrw       $0x0,%xmm8,(%r9,%rdi,2)
2873   235,159,                                //jmp           a4c <_sk_store_565_hsw+0x68>
2874   15,31,0,                                //nopl          (%rax)
       // +0xcc: jump table data, not instructions. Seven little-endian 32-bit
       // offsets relative to the table start: -12, -20, -28, -36, -44, -52, -60,
       // selecting vpextrw $0 .. $6 above. The disassembly text on the rows
       // below is the disassembler decoding these data bytes as code.
2875   244,                                    //hlt
2876   255,                                    //(bad)
2877   255,                                    //(bad)
2878   255,                                    //(bad)
2879   236,                                    //in            (%dx),%al
2880   255,                                    //(bad)
2881   255,                                    //(bad)
2882   255,228,                                //jmpq          *%rsp
2883   255,                                    //(bad)
2884   255,                                    //(bad)
2885   255,                                    //(bad)
2886   220,255,                                //fdivr         %st,%st(7)
2887   255,                                    //(bad)
2888   255,212,                                //callq         *%rsp
2889   255,                                    //(bad)
2890   255,                                    //(bad)
2891   255,204,                                //dec           %esp
2892   255,                                    //(bad)
2893   255,                                    //(bad)
2894   255,196,                                //inc           %esp
2895   255,                                    //(bad)
2896   255,                                    //(bad)
2897   255,                                    //.byte         0xff
2898 };
2899 
// sk_load_8888_hsw: x86-64 machine code bytes; the trailing comment on each
// row is the AT&T-syntax disassembly of those bytes.
//
// What the code does:
//   * Saves %rcx in %r8, reads the next qword of the %rsi stream into %rax,
//     and forms the source address %r9 = (qword at (%rax)) + 4*%rdi.
//   * %rcx == 0: loads eight 32-bit values into %ymm3.
//     %rcx != 0: loads only the first %rcx values with a masked load
//     (vpmaskmovd), then rejoins the common path.
//   * Splits each 32-bit value into its four bytes (bits 0-7, 8-15, 16-23,
//     24-31), converts each to float and multiplies by 0x3b808081 (about
//     1/255), leaving the results in %ymm0, %ymm1, %ymm2, %ymm3 respectively.
//   * Restores %rcx from %r8, reads the next qword from the %rsi stream and
//     jumps to it.
2900 CODE const uint8_t sk_load_8888_hsw[] = {
2901   73,137,200,                             //mov           %rcx,%r8
2902   72,173,                                 //lods          %ds:(%rsi),%rax
2903   76,141,12,189,0,0,0,0,                  //lea           0x0(,%rdi,4),%r9
2904   76,3,8,                                 //add           (%rax),%r9
2905   77,133,192,                             //test          %r8,%r8
2906   117,104,                                //jne           b49 <_sk_load_8888_hsw+0x7d>
2907   196,193,126,111,25,                     //vmovdqu       (%r9),%ymm3
       // +0x1a: common path. ymm2 = 0xff in every lane (byte mask).
2908   184,255,0,0,0,                          //mov           $0xff,%eax
2909   197,249,110,192,                        //vmovd         %eax,%xmm0
2910   196,226,125,88,208,                     //vpbroadcastd  %xmm0,%ymm2
       // ymm0 = float(v & 0xff) * scale; ymm8 holds the broadcast scale.
2911   197,237,219,195,                        //vpand         %ymm3,%ymm2,%ymm0
2912   197,252,91,192,                         //vcvtdq2ps     %ymm0,%ymm0
2913   184,129,128,128,59,                     //mov           $0x3b808081,%eax
2914   197,249,110,200,                        //vmovd         %eax,%xmm1
2915   196,98,125,88,193,                      //vpbroadcastd  %xmm1,%ymm8
2916   196,193,124,89,192,                     //vmulps        %ymm8,%ymm0,%ymm0
       // ymm1 = float((v >> 8) & 0xff) * scale
2917   197,245,114,211,8,                      //vpsrld        $0x8,%ymm3,%ymm1
2918   197,237,219,201,                        //vpand         %ymm1,%ymm2,%ymm1
2919   197,252,91,201,                         //vcvtdq2ps     %ymm1,%ymm1
2920   196,193,116,89,200,                     //vmulps        %ymm8,%ymm1,%ymm1
       // ymm2 = float((v >> 16) & 0xff) * scale (the mask register is reused as output)
2921   197,181,114,211,16,                     //vpsrld        $0x10,%ymm3,%ymm9
2922   196,193,109,219,209,                    //vpand         %ymm9,%ymm2,%ymm2
2923   197,252,91,210,                         //vcvtdq2ps     %ymm2,%ymm2
2924   196,193,108,89,208,                     //vmulps        %ymm8,%ymm2,%ymm2
       // ymm3 = float(v >> 24) * scale (logical shift, so no mask is needed)
2925   197,229,114,211,24,                     //vpsrld        $0x18,%ymm3,%ymm3
2926   197,252,91,219,                         //vcvtdq2ps     %ymm3,%ymm3
2927   196,193,100,89,216,                     //vmulps        %ymm8,%ymm3,%ymm3
       // Exit: next qword from the stream, restore rcx, jump.
2928   72,173,                                 //lods          %ds:(%rsi),%rax
2929   76,137,193,                             //mov           %r8,%rcx
2930   255,224,                                //jmpq          *%rax
       // +0x7d: partial load. rax = ~0 >> (8*(8 - n)) keeps the low n bytes set
       // (n = r8d); vpmovsxbd sign-extends each byte to a dword, giving an
       // all-ones mask in the first n lanes for vpmaskmovd. rcx is clobbered
       // here and restored from r8 on the common path.
2931   185,8,0,0,0,                            //mov           $0x8,%ecx
2932   68,41,193,                              //sub           %r8d,%ecx
2933   192,225,3,                              //shl           $0x3,%cl
2934   72,199,192,255,255,255,255,             //mov           $0xffffffffffffffff,%rax
2935   72,211,232,                             //shr           %cl,%rax
2936   196,225,249,110,192,                    //vmovq         %rax,%xmm0
2937   196,226,125,33,192,                     //vpmovsxbd     %xmm0,%ymm0
2938   196,194,125,140,25,                     //vpmaskmovd    (%r9),%ymm0,%ymm3
2939   233,116,255,255,255,                    //jmpq          ae6 <_sk_load_8888_hsw+0x1a>
2940 };
2941 
// sk_store_8888_hsw: x86-64 machine code bytes; the trailing comment on each
// row is the AT&T-syntax disassembly of those bytes.
//
// What the code does:
//   * Saves %rcx in %r8, reads the next qword of the %rsi stream into %rax,
//     and forms the destination address %r9 = (qword at (%rax)) + 4*%rdi.
//   * Multiplies %ymm0..%ymm3 by 255.0 (0x437f0000), converts each to 32-bit
//     integers, and combines them per lane as
//     ymm0 | (ymm1 << 8) | (ymm2 << 16) | (ymm3 << 24).
//   * %rcx == 0: stores all eight 32-bit values at %r9.
//     %rcx != 0: stores only the first %rcx values with a masked store.
//   * Restores %rcx from %r8, reads the next qword from the %rsi stream and
//     jumps to it.
2942 CODE const uint8_t sk_store_8888_hsw[] = {
2943   73,137,200,                             //mov           %rcx,%r8
2944   72,173,                                 //lods          %ds:(%rsi),%rax
2945   76,141,12,189,0,0,0,0,                  //lea           0x0(,%rdi,4),%r9
2946   76,3,8,                                 //add           (%rax),%r9
       // ymm8 = 255.0 in every lane
2947   184,0,0,127,67,                         //mov           $0x437f0000,%eax
2948   197,121,110,192,                        //vmovd         %eax,%xmm8
2949   196,66,125,88,192,                      //vpbroadcastd  %xmm8,%ymm8
       // ymm9 = int(ymm0 * 255) | int(ymm1 * 255) << 8
2950   197,60,89,200,                          //vmulps        %ymm0,%ymm8,%ymm9
2951   196,65,125,91,201,                      //vcvtps2dq     %ymm9,%ymm9
2952   197,60,89,209,                          //vmulps        %ymm1,%ymm8,%ymm10
2953   196,65,125,91,210,                      //vcvtps2dq     %ymm10,%ymm10
2954   196,193,45,114,242,8,                   //vpslld        $0x8,%ymm10,%ymm10
2955   196,65,45,235,201,                      //vpor          %ymm9,%ymm10,%ymm9
       // ymm8 = int(ymm2 * 255) << 16 | int(ymm3 * 255) << 24, then OR with ymm9
2956   197,60,89,210,                          //vmulps        %ymm2,%ymm8,%ymm10
2957   196,65,125,91,210,                      //vcvtps2dq     %ymm10,%ymm10
2958   196,193,45,114,242,16,                  //vpslld        $0x10,%ymm10,%ymm10
2959   197,60,89,195,                          //vmulps        %ymm3,%ymm8,%ymm8
2960   196,65,125,91,192,                      //vcvtps2dq     %ymm8,%ymm8
2961   196,193,61,114,240,24,                  //vpslld        $0x18,%ymm8,%ymm8
2962   196,65,45,235,192,                      //vpor          %ymm8,%ymm10,%ymm8
2963   196,65,53,235,192,                      //vpor          %ymm8,%ymm9,%ymm8
       // Non-zero saved rcx takes the masked-store path at +0x74.
2964   77,133,192,                             //test          %r8,%r8
2965   117,12,                                 //jne           be6 <_sk_store_8888_hsw+0x74>
2966   196,65,126,127,1,                       //vmovdqu       %ymm8,(%r9)
       // +0x6d: common exit.
2967   72,173,                                 //lods          %ds:(%rsi),%rax
2968   76,137,193,                             //mov           %r8,%rcx
2969   255,224,                                //jmpq          *%rax
       // +0x74: partial store. rax = ~0 >> (8*(8 - n)) keeps the low n bytes set
       // (n = r8d); vpmovsxbd turns them into an all-ones dword mask for the
       // first n lanes. rcx is clobbered here and restored at the exit.
2970   185,8,0,0,0,                            //mov           $0x8,%ecx
2971   68,41,193,                              //sub           %r8d,%ecx
2972   192,225,3,                              //shl           $0x3,%cl
2973   72,199,192,255,255,255,255,             //mov           $0xffffffffffffffff,%rax
2974   72,211,232,                             //shr           %cl,%rax
2975   196,97,249,110,200,                     //vmovq         %rax,%xmm9
2976   196,66,125,33,201,                      //vpmovsxbd     %xmm9,%ymm9
2977   196,66,53,142,1,                        //vpmaskmovd    %ymm8,%ymm9,(%r9)
2978   235,211,                                //jmp           bdf <_sk_store_8888_hsw+0x6d>
2979 };
2980 
// sk_load_f16_hsw: x86-64 machine code bytes; the trailing comment on each
// row is the AT&T-syntax disassembly of those bytes.
//
// What the code does:
//   * Reads the next qword of the %rsi stream into %rax and replaces it with
//     the qword stored at (%rax), used as the source base pointer. Elements
//     are 8 bytes each, addressed as base + 8*%rdi.
//   * %rcx == 0: loads 64 bytes (eight elements) into four xmm registers.
//     %rcx != 0: loads 8 bytes per element for up to seven elements and
//     zeroes the registers that receive no data, then rejoins at +0x21.
//   * Transposes the 16-bit words so that word k of every element ends up in
//     one register, and converts each group of eight half-precision values
//     to single precision (vcvtph2ps) into %ymm0, %ymm1, %ymm2, %ymm3.
//   * Reads the next qword from the %rsi stream and jumps to it.
2981 CODE const uint8_t sk_load_f16_hsw[] = {
2982   72,173,                                 //lods          %ds:(%rsi),%rax
2983   72,139,0,                               //mov           (%rax),%rax
2984   72,133,201,                             //test          %rcx,%rcx
2985   117,97,                                 //jne           c77 <_sk_load_f16_hsw+0x6b>
       // Full load: 4 x 16 bytes.
2986   197,121,16,4,248,                       //vmovupd       (%rax,%rdi,8),%xmm8
2987   197,249,16,84,248,16,                   //vmovupd       0x10(%rax,%rdi,8),%xmm2
2988   197,249,16,92,248,32,                   //vmovupd       0x20(%rax,%rdi,8),%xmm3
2989   197,122,111,76,248,48,                  //vmovdqu       0x30(%rax,%rdi,8),%xmm9
       // +0x21: common path. Two rounds of word interleaves followed by qword
       // interleaves perform the transpose.
2990   197,185,97,194,                         //vpunpcklwd    %xmm2,%xmm8,%xmm0
2991   197,185,105,210,                        //vpunpckhwd    %xmm2,%xmm8,%xmm2
2992   196,193,97,97,201,                      //vpunpcklwd    %xmm9,%xmm3,%xmm1
2993   196,193,97,105,217,                     //vpunpckhwd    %xmm9,%xmm3,%xmm3
2994   197,121,97,194,                         //vpunpcklwd    %xmm2,%xmm0,%xmm8
2995   197,121,105,202,                        //vpunpckhwd    %xmm2,%xmm0,%xmm9
2996   197,241,97,211,                         //vpunpcklwd    %xmm3,%xmm1,%xmm2
2997   197,241,105,219,                        //vpunpckhwd    %xmm3,%xmm1,%xmm3
2998   197,185,108,194,                        //vpunpcklqdq   %xmm2,%xmm8,%xmm0
2999   196,226,125,19,192,                     //vcvtph2ps     %xmm0,%ymm0
3000   197,185,109,202,                        //vpunpckhqdq   %xmm2,%xmm8,%xmm1
3001   196,226,125,19,201,                     //vcvtph2ps     %xmm1,%ymm1
3002   197,177,108,211,                        //vpunpcklqdq   %xmm3,%xmm9,%xmm2
3003   196,226,125,19,210,                     //vcvtph2ps     %xmm2,%ymm2
3004   197,177,109,219,                        //vpunpckhqdq   %xmm3,%xmm9,%xmm3
3005   196,226,125,19,219,                     //vcvtph2ps     %xmm3,%ymm3
3006   72,173,                                 //lods          %ds:(%rsi),%rax
3007   255,224,                                //jmpq          *%rax
       // +0x6b: partial load. Elements are loaded one at a time into the low and
       // high halves of xmm8, xmm2, xmm3 and xmm9 while rcx is compared against
       // 1, 3, 5 and 7. xmm9 is zeroed up front; the exits at +0xca and +0xd7
       // zero whichever of xmm2/xmm3 was never loaded.
3008   197,123,16,4,248,                       //vmovsd        (%rax,%rdi,8),%xmm8
3009   196,65,49,239,201,                      //vpxor         %xmm9,%xmm9,%xmm9
3010   72,131,249,1,                           //cmp           $0x1,%rcx
3011   116,79,                                 //je            cd6 <_sk_load_f16_hsw+0xca>
3012   197,57,22,68,248,8,                     //vmovhpd       0x8(%rax,%rdi,8),%xmm8,%xmm8
3013   72,131,249,3,                           //cmp           $0x3,%rcx
3014   114,67,                                 //jb            cd6 <_sk_load_f16_hsw+0xca>
3015   197,251,16,84,248,16,                   //vmovsd        0x10(%rax,%rdi,8),%xmm2
3016   72,131,249,3,                           //cmp           $0x3,%rcx
3017   116,68,                                 //je            ce3 <_sk_load_f16_hsw+0xd7>
3018   197,233,22,84,248,24,                   //vmovhpd       0x18(%rax,%rdi,8),%xmm2,%xmm2
3019   72,131,249,5,                           //cmp           $0x5,%rcx
3020   114,56,                                 //jb            ce3 <_sk_load_f16_hsw+0xd7>
3021   197,251,16,92,248,32,                   //vmovsd        0x20(%rax,%rdi,8),%xmm3
3022   72,131,249,5,                           //cmp           $0x5,%rcx
3023   15,132,114,255,255,255,                 //je            c2d <_sk_load_f16_hsw+0x21>
3024   197,225,22,92,248,40,                   //vmovhpd       0x28(%rax,%rdi,8),%xmm3,%xmm3
3025   72,131,249,7,                           //cmp           $0x7,%rcx
3026   15,130,98,255,255,255,                  //jb            c2d <_sk_load_f16_hsw+0x21>
3027   197,122,126,76,248,48,                  //vmovq         0x30(%rax,%rdi,8),%xmm9
3028   233,87,255,255,255,                     //jmpq          c2d <_sk_load_f16_hsw+0x21>
       // +0xca: one or two elements loaded; xmm2 and xmm3 hold no data.
3029   197,225,87,219,                         //vxorpd        %xmm3,%xmm3,%xmm3
3030   197,233,87,210,                         //vxorpd        %xmm2,%xmm2,%xmm2
3031   233,74,255,255,255,                     //jmpq          c2d <_sk_load_f16_hsw+0x21>
       // +0xd7: three or four elements loaded; xmm3 holds no data.
3032   197,225,87,219,                         //vxorpd        %xmm3,%xmm3,%xmm3
3033   233,65,255,255,255,                     //jmpq          c2d <_sk_load_f16_hsw+0x21>
3034 };
3035 
// sk_store_f16_hsw: x86-64 machine code bytes; the trailing comment on each
// row is the AT&T-syntax disassembly of those bytes.
//
// What the code does:
//   * Reads the next qword of the %rsi stream into %rax and replaces it with
//     the qword stored at (%rax), used as the destination base pointer.
//     Elements are 8 bytes each, addressed as base + 8*%rdi.
//   * Converts %ymm0..%ymm3 from single to half precision (vcvtps2ph) and
//     interleaves words and then dwords so that each 8-byte element holds one
//     value from each of the four registers, in order ymm0, ymm1, ymm2, ymm3.
//   * %rcx == 0: stores 64 bytes (eight elements).
//     %rcx != 0: stores 8 bytes per element for the first %rcx elements (up
//     to seven).
//   * Reads the next qword from the %rsi stream and jumps to it.
3036 CODE const uint8_t sk_store_f16_hsw[] = {
3037   72,173,                                 //lods          %ds:(%rsi),%rax
3038   72,139,0,                               //mov           (%rax),%rax
3039   196,195,125,29,192,4,                   //vcvtps2ph     $0x4,%ymm0,%xmm8
3040   196,195,125,29,201,4,                   //vcvtps2ph     $0x4,%ymm1,%xmm9
3041   196,195,125,29,210,4,                   //vcvtps2ph     $0x4,%ymm2,%xmm10
3042   196,195,125,29,219,4,                   //vcvtps2ph     $0x4,%ymm3,%xmm11
       // Interleave: after these, xmm11, xmm10, xmm9, xmm8 hold elements
       // 0-1, 2-3, 4-5 and 6-7 respectively.
3043   196,65,57,97,225,                       //vpunpcklwd    %xmm9,%xmm8,%xmm12
3044   196,65,57,105,193,                      //vpunpckhwd    %xmm9,%xmm8,%xmm8
3045   196,65,41,97,203,                       //vpunpcklwd    %xmm11,%xmm10,%xmm9
3046   196,65,41,105,235,                      //vpunpckhwd    %xmm11,%xmm10,%xmm13
3047   196,65,25,98,217,                       //vpunpckldq    %xmm9,%xmm12,%xmm11
3048   196,65,25,106,209,                      //vpunpckhdq    %xmm9,%xmm12,%xmm10
3049   196,65,57,98,205,                       //vpunpckldq    %xmm13,%xmm8,%xmm9
3050   196,65,57,106,197,                      //vpunpckhdq    %xmm13,%xmm8,%xmm8
3051   72,133,201,                             //test          %rcx,%rcx
3052   117,27,                                 //jne           d51 <_sk_store_f16_hsw+0x65>
3053   197,120,17,28,248,                      //vmovups       %xmm11,(%rax,%rdi,8)
3054   197,120,17,84,248,16,                   //vmovups       %xmm10,0x10(%rax,%rdi,8)
3055   197,120,17,76,248,32,                   //vmovups       %xmm9,0x20(%rax,%rdi,8)
3056   197,122,127,68,248,48,                  //vmovdqu       %xmm8,0x30(%rax,%rdi,8)
       // +0x61: common exit.
3057   72,173,                                 //lods          %ds:(%rsi),%rax
3058   255,224,                                //jmpq          *%rax
       // +0x65: partial store, one 8-byte element at a time. Each je that follows
       // a vmovq reuses the flags of the preceding cmp (the stores in between
       // leave the flags unchanged).
3059   197,121,214,28,248,                     //vmovq         %xmm11,(%rax,%rdi,8)
3060   72,131,249,1,                           //cmp           $0x1,%rcx
3061   116,241,                                //je            d4d <_sk_store_f16_hsw+0x61>
3062   197,121,23,92,248,8,                    //vmovhpd       %xmm11,0x8(%rax,%rdi,8)
3063   72,131,249,3,                           //cmp           $0x3,%rcx
3064   114,229,                                //jb            d4d <_sk_store_f16_hsw+0x61>
3065   197,121,214,84,248,16,                  //vmovq         %xmm10,0x10(%rax,%rdi,8)
3066   116,221,                                //je            d4d <_sk_store_f16_hsw+0x61>
3067   197,121,23,84,248,24,                   //vmovhpd       %xmm10,0x18(%rax,%rdi,8)
3068   72,131,249,5,                           //cmp           $0x5,%rcx
3069   114,209,                                //jb            d4d <_sk_store_f16_hsw+0x61>
3070   197,121,214,76,248,32,                  //vmovq         %xmm9,0x20(%rax,%rdi,8)
3071   116,201,                                //je            d4d <_sk_store_f16_hsw+0x61>
3072   197,121,23,76,248,40,                   //vmovhpd       %xmm9,0x28(%rax,%rdi,8)
3073   72,131,249,7,                           //cmp           $0x7,%rcx
3074   114,189,                                //jb            d4d <_sk_store_f16_hsw+0x61>
3075   197,121,214,68,248,48,                  //vmovq         %xmm8,0x30(%rax,%rdi,8)
3076   235,181,                                //jmp           d4d <_sk_store_f16_hsw+0x61>
3077 };
3078 
// sk_store_f32_hsw: x86-64 machine code bytes; the trailing comment on each
// row is the AT&T-syntax disassembly of those bytes.
//
// What the code does:
//   * Reads the next qword of the %rsi stream into %rax; the qword stored at
//     (%rax) is the destination base pointer (%r8). %rax is then set to
//     4*%rdi and used with a scale of 4, so elements are 16 bytes each at
//     %r8 + 16*%rdi.
//   * Interleaves %ymm0..%ymm3 so that each 16-byte element holds one float
//     from each register, in order ymm0, ymm1, ymm2, ymm3.
//   * %rcx == 0: stores 128 bytes (eight elements).
//     %rcx != 0: stores 16 bytes per element for the first %rcx elements (up
//     to seven).
//   * Reads the next qword from the %rsi stream and jumps to it.
3079 CODE const uint8_t sk_store_f32_hsw[] = {
3080   72,173,                                 //lods          %ds:(%rsi),%rax
3081   76,139,0,                               //mov           (%rax),%r8
3082   72,141,4,189,0,0,0,0,                   //lea           0x0(,%rdi,4),%rax
       // After the unpacks (low lane | high lane): ymm10 = elements 0|4,
       // ymm9 = 1|5, ymm8 = 2|6, ymm11 = 3|7.
3083   197,124,20,193,                         //vunpcklps     %ymm1,%ymm0,%ymm8
3084   197,124,21,217,                         //vunpckhps     %ymm1,%ymm0,%ymm11
3085   197,108,20,203,                         //vunpcklps     %ymm3,%ymm2,%ymm9
3086   197,108,21,227,                         //vunpckhps     %ymm3,%ymm2,%ymm12
3087   196,65,61,20,209,                       //vunpcklpd     %ymm9,%ymm8,%ymm10
3088   196,65,61,21,201,                       //vunpckhpd     %ymm9,%ymm8,%ymm9
3089   196,65,37,20,196,                       //vunpcklpd     %ymm12,%ymm11,%ymm8
3090   196,65,37,21,220,                       //vunpckhpd     %ymm12,%ymm11,%ymm11
3091   72,133,201,                             //test          %rcx,%rcx
3092   117,55,                                 //jne           e05 <_sk_store_f32_hsw+0x6d>
       // Full store: regroup 128-bit lanes into elements 0-1, 2-3, 4-5, 6-7.
3093   196,67,45,24,225,1,                     //vinsertf128   $0x1,%xmm9,%ymm10,%ymm12
3094   196,67,61,24,235,1,                     //vinsertf128   $0x1,%xmm11,%ymm8,%ymm13
3095   196,67,45,6,201,49,                     //vperm2f128    $0x31,%ymm9,%ymm10,%ymm9
3096   196,67,61,6,195,49,                     //vperm2f128    $0x31,%ymm11,%ymm8,%ymm8
3097   196,65,125,17,36,128,                   //vmovupd       %ymm12,(%r8,%rax,4)
3098   196,65,125,17,108,128,32,               //vmovupd       %ymm13,0x20(%r8,%rax,4)
3099   196,65,125,17,76,128,64,                //vmovupd       %ymm9,0x40(%r8,%rax,4)
3100   196,65,125,17,68,128,96,                //vmovupd       %ymm8,0x60(%r8,%rax,4)
       // +0x69: common exit.
3101   72,173,                                 //lods          %ds:(%rsi),%rax
3102   255,224,                                //jmpq          *%rax
       // +0x6d: partial store, one 16-byte element at a time. Each je that
       // follows a store reuses the flags of the preceding cmp.
3103   196,65,121,17,20,128,                   //vmovupd       %xmm10,(%r8,%rax,4)
3104   72,131,249,1,                           //cmp           $0x1,%rcx
3105   116,240,                                //je            e01 <_sk_store_f32_hsw+0x69>
3106   196,65,121,17,76,128,16,                //vmovupd       %xmm9,0x10(%r8,%rax,4)
3107   72,131,249,3,                           //cmp           $0x3,%rcx
3108   114,227,                                //jb            e01 <_sk_store_f32_hsw+0x69>
3109   196,65,121,17,68,128,32,                //vmovupd       %xmm8,0x20(%r8,%rax,4)
3110   116,218,                                //je            e01 <_sk_store_f32_hsw+0x69>
3111   196,65,121,17,92,128,48,                //vmovupd       %xmm11,0x30(%r8,%rax,4)
3112   72,131,249,5,                           //cmp           $0x5,%rcx
3113   114,205,                                //jb            e01 <_sk_store_f32_hsw+0x69>
3114   196,67,125,25,84,128,64,1,              //vextractf128  $0x1,%ymm10,0x40(%r8,%rax,4)
3115   116,195,                                //je            e01 <_sk_store_f32_hsw+0x69>
3116   196,67,125,25,76,128,80,1,              //vextractf128  $0x1,%ymm9,0x50(%r8,%rax,4)
3117   72,131,249,7,                           //cmp           $0x7,%rcx
3118   114,181,                                //jb            e01 <_sk_store_f32_hsw+0x69>
3119   196,67,125,25,68,128,96,1,              //vextractf128  $0x1,%ymm8,0x60(%r8,%rax,4)
3120   235,171,                                //jmp           e01 <_sk_store_f32_hsw+0x69>
3121 };
3122 
// sk_clamp_x_hsw: x86-64 machine code bytes; the trailing comment on each
// row is the AT&T-syntax disassembly of those bytes.
//
// Clamps %ymm0 per lane: first max with 0, then min with an upper bound. The
// bound is the 32-bit value at the address read from the %rsi stream, with
// its bit pattern decremented by one (for a positive float that is the next
// representable value below it). The closing lods/jmpq reads the next qword
// from the stream and jumps to it.
3123 CODE const uint8_t sk_clamp_x_hsw[] = {
3124   72,173,                                 //lods          %ds:(%rsi),%rax
       // ymm0 = max(0, ymm0)
3125   196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
3126   197,188,95,192,                         //vmaxps        %ymm0,%ymm8,%ymm0
       // ymm8 = broadcast(*rax) + (-1) as integers; vpcmpeqd of a register with
       // itself yields all-ones (-1) in every lane.
3127   196,98,125,88,0,                        //vpbroadcastd  (%rax),%ymm8
3128   196,65,53,118,201,                      //vpcmpeqd      %ymm9,%ymm9,%ymm9
3129   196,65,61,254,193,                      //vpaddd        %ymm9,%ymm8,%ymm8
3130   196,193,124,93,192,                     //vminps        %ymm8,%ymm0,%ymm0
3131   72,173,                                 //lods          %ds:(%rsi),%rax
3132   255,224,                                //jmpq          *%rax
3133 };
3134 
// sk_clamp_y_hsw: x86-64 machine code bytes; the trailing comment on each
// row is the AT&T-syntax disassembly of those bytes.
//
// Clamps %ymm1 per lane: first max with 0, then min with an upper bound. The
// bound is the 32-bit value at the address read from the %rsi stream, with
// its bit pattern decremented by one (for a positive float that is the next
// representable value below it). The closing lods/jmpq reads the next qword
// from the stream and jumps to it.
3135 CODE const uint8_t sk_clamp_y_hsw[] = {
3136   72,173,                                 //lods          %ds:(%rsi),%rax
       // ymm1 = max(0, ymm1)
3137   196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
3138   197,188,95,201,                         //vmaxps        %ymm1,%ymm8,%ymm1
       // ymm8 = broadcast(*rax) + (-1) as integers; vpcmpeqd of a register with
       // itself yields all-ones (-1) in every lane.
3139   196,98,125,88,0,                        //vpbroadcastd  (%rax),%ymm8
3140   196,65,53,118,201,                      //vpcmpeqd      %ymm9,%ymm9,%ymm9
3141   196,65,61,254,193,                      //vpaddd        %ymm9,%ymm8,%ymm8
3142   196,193,116,93,200,                     //vminps        %ymm8,%ymm1,%ymm1
3143   72,173,                                 //lods          %ds:(%rsi),%rax
3144   255,224,                                //jmpq          *%rax
3145 };
3146 
// sk_repeat_x_hsw: x86-64 machine code bytes; the trailing comment on each
// row is the AT&T-syntax disassembly of those bytes.
//
// With L = the float at the address read from the %rsi stream, computes per
// lane  ymm0 = min(ymm0 - L * floor(ymm0 / L), L'),  where L' is L with its
// bit pattern decremented by one. The closing lods/jmpq reads the next qword
// from the stream and jumps to it.
3147 CODE const uint8_t sk_repeat_x_hsw[] = {
3148   72,173,                                 //lods          %ds:(%rsi),%rax
3149   196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
       // ymm9 = floor(ymm0 / L); vroundps immediate 1 rounds toward -infinity.
3150   196,65,124,94,200,                      //vdivps        %ymm8,%ymm0,%ymm9
3151   196,67,125,8,201,1,                     //vroundps      $0x1,%ymm9,%ymm9
       // ymm9 = -(L * ymm9) + ymm0
3152   196,98,61,172,200,                      //vfnmadd213ps  %ymm0,%ymm8,%ymm9
       // ymm0 = L + (-1) as integers, then the per-lane minimum.
3153   197,253,118,192,                        //vpcmpeqd      %ymm0,%ymm0,%ymm0
3154   197,189,254,192,                        //vpaddd        %ymm0,%ymm8,%ymm0
3155   197,180,93,192,                         //vminps        %ymm0,%ymm9,%ymm0
3156   72,173,                                 //lods          %ds:(%rsi),%rax
3157   255,224,                                //jmpq          *%rax
3158 };
3159 
// sk_repeat_y_hsw: x86-64 machine code bytes; the trailing comment on each
// row is the AT&T-syntax disassembly of those bytes.
//
// With L = the float at the address read from the %rsi stream, computes per
// lane  ymm1 = min(ymm1 - L * floor(ymm1 / L), L'),  where L' is L with its
// bit pattern decremented by one. The closing lods/jmpq reads the next qword
// from the stream and jumps to it.
3160 CODE const uint8_t sk_repeat_y_hsw[] = {
3161   72,173,                                 //lods          %ds:(%rsi),%rax
3162   196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
       // ymm9 = floor(ymm1 / L); vroundps immediate 1 rounds toward -infinity.
3163   196,65,116,94,200,                      //vdivps        %ymm8,%ymm1,%ymm9
3164   196,67,125,8,201,1,                     //vroundps      $0x1,%ymm9,%ymm9
       // ymm9 = -(L * ymm9) + ymm1
3165   196,98,61,172,201,                      //vfnmadd213ps  %ymm1,%ymm8,%ymm9
       // ymm1 = L + (-1) as integers, then the per-lane minimum.
3166   197,245,118,201,                        //vpcmpeqd      %ymm1,%ymm1,%ymm1
3167   197,189,254,201,                        //vpaddd        %ymm1,%ymm8,%ymm1
3168   197,180,93,201,                         //vminps        %ymm1,%ymm9,%ymm1
3169   72,173,                                 //lods          %ds:(%rsi),%rax
3170   255,224,                                //jmpq          *%rax
3171 };
3172 
// sk_mirror_x_hsw: x86-64 machine code bytes; the trailing comment on each
// row is the AT&T-syntax disassembly of those bytes.
//
// With L = the float at the address read from the %rsi stream and
// t = ymm0 - L, computes per lane
//     ymm0 = min(| (t - 2L * floor(t / 2L)) - L |, L')
// where L' is L with its bit pattern decremented by one. The closing
// lods/jmpq reads the next qword from the stream and jumps to it.
3173 CODE const uint8_t sk_mirror_x_hsw[] = {
3174   72,173,                                 //lods          %ds:(%rsi),%rax
       // ymm9 = L (broadcast), ymm10 = t = ymm0 - L
3175   197,122,16,0,                           //vmovss        (%rax),%xmm8
3176   196,66,125,24,200,                      //vbroadcastss  %xmm8,%ymm9
3177   196,65,124,92,209,                      //vsubps        %ymm9,%ymm0,%ymm10
       // ymm0 = 2L (broadcast)
3178   196,193,58,88,192,                      //vaddss        %xmm8,%xmm8,%xmm0
3179   196,226,125,24,192,                     //vbroadcastss  %xmm0,%ymm0
       // ymm8 = t - 2L * floor(t / 2L); vroundps immediate 1 rounds toward -infinity.
3180   197,44,94,192,                          //vdivps        %ymm0,%ymm10,%ymm8
3181   196,67,125,8,192,1,                     //vroundps      $0x1,%ymm8,%ymm8
3182   196,66,125,172,194,                     //vfnmadd213ps  %ymm10,%ymm0,%ymm8
3183   196,193,60,92,193,                      //vsubps        %ymm9,%ymm8,%ymm0
       // Absolute value: AND of v with (0 - v) clears the sign bit.
3184   196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
3185   197,60,92,192,                          //vsubps        %ymm0,%ymm8,%ymm8
3186   197,188,84,192,                         //vandps        %ymm0,%ymm8,%ymm0
       // ymm8 = L + (-1) as integers, then the per-lane minimum.
3187   196,65,61,118,192,                      //vpcmpeqd      %ymm8,%ymm8,%ymm8
3188   196,65,53,254,192,                      //vpaddd        %ymm8,%ymm9,%ymm8
3189   196,193,124,93,192,                     //vminps        %ymm8,%ymm0,%ymm0
3190   72,173,                                 //lods          %ds:(%rsi),%rax
3191   255,224,                                //jmpq          *%rax
3192 };
3193 
// sk_mirror_y_hsw: x86-64 machine code bytes; the trailing comment on each
// row is the AT&T-syntax disassembly of those bytes.
//
// With L = the float at the address read from the %rsi stream and
// t = ymm1 - L, computes per lane
//     ymm1 = min(| (t - 2L * floor(t / 2L)) - L |, L')
// where L' is L with its bit pattern decremented by one. The closing
// lods/jmpq reads the next qword from the stream and jumps to it.
3194 CODE const uint8_t sk_mirror_y_hsw[] = {
3195   72,173,                                 //lods          %ds:(%rsi),%rax
       // ymm9 = L (broadcast), ymm10 = t = ymm1 - L
3196   197,122,16,0,                           //vmovss        (%rax),%xmm8
3197   196,66,125,24,200,                      //vbroadcastss  %xmm8,%ymm9
3198   196,65,116,92,209,                      //vsubps        %ymm9,%ymm1,%ymm10
       // ymm1 = 2L (broadcast)
3199   196,193,58,88,200,                      //vaddss        %xmm8,%xmm8,%xmm1
3200   196,226,125,24,201,                     //vbroadcastss  %xmm1,%ymm1
       // ymm8 = t - 2L * floor(t / 2L); vroundps immediate 1 rounds toward -infinity.
3201   197,44,94,193,                          //vdivps        %ymm1,%ymm10,%ymm8
3202   196,67,125,8,192,1,                     //vroundps      $0x1,%ymm8,%ymm8
3203   196,66,117,172,194,                     //vfnmadd213ps  %ymm10,%ymm1,%ymm8
3204   196,193,60,92,201,                      //vsubps        %ymm9,%ymm8,%ymm1
       // Absolute value: AND of v with (0 - v) clears the sign bit.
3205   196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
3206   197,60,92,193,                          //vsubps        %ymm1,%ymm8,%ymm8
3207   197,188,84,201,                         //vandps        %ymm1,%ymm8,%ymm1
       // ymm8 = L + (-1) as integers, then the per-lane minimum.
3208   196,65,61,118,192,                      //vpcmpeqd      %ymm8,%ymm8,%ymm8
3209   196,65,53,254,192,                      //vpaddd        %ymm8,%ymm9,%ymm8
3210   196,193,116,93,200,                     //vminps        %ymm8,%ymm1,%ymm1
3211   72,173,                                 //lods          %ds:(%rsi),%rax
3212   255,224,                                //jmpq          *%rax
3213 };
3214 
// sk_luminance_to_alpha_hsw: x86-64 machine code bytes; the trailing comment
// on each row is the AT&T-syntax disassembly of those bytes.
//
// Computes per lane
//     ymm3 = c0*ymm0 + c1*ymm1 + c2*ymm2
// with c0 = 0x3e59b3d0 (about 0.2126), c1 = 0x3f371759 (about 0.7152) and
// c2 = 0x3d93dd98 (about 0.0722), then zeroes %ymm0, %ymm1 and %ymm2.
// This code reads only one qword from the %rsi stream (the jump target); it
// loads no other data from memory.
3215 CODE const uint8_t sk_luminance_to_alpha_hsw[] = {
       // ymm8 = c0
3216   184,208,179,89,62,                      //mov           $0x3e59b3d0,%eax
3217   197,249,110,216,                        //vmovd         %eax,%xmm3
3218   196,98,125,88,195,                      //vpbroadcastd  %xmm3,%ymm8
       // ymm1 = c1 * ymm1
3219   184,89,23,55,63,                        //mov           $0x3f371759,%eax
3220   197,249,110,216,                        //vmovd         %eax,%xmm3
3221   196,226,125,88,219,                     //vpbroadcastd  %xmm3,%ymm3
3222   197,228,89,201,                         //vmulps        %ymm1,%ymm3,%ymm1
       // ymm8 = ymm0 * ymm8 + ymm1
3223   196,98,125,168,193,                     //vfmadd213ps   %ymm1,%ymm0,%ymm8
       // ymm3 = ymm2 * c2 + ymm8 (xmm0 is used as scratch; ymm0 is zeroed below)
3224   184,152,221,147,61,                     //mov           $0x3d93dd98,%eax
3225   197,249,110,192,                        //vmovd         %eax,%xmm0
3226   196,226,125,88,216,                     //vpbroadcastd  %xmm0,%ymm3
3227   196,194,109,168,216,                    //vfmadd213ps   %ymm8,%ymm2,%ymm3
3228   72,173,                                 //lods          %ds:(%rsi),%rax
3229   197,253,239,192,                        //vpxor         %ymm0,%ymm0,%ymm0
3230   197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
3231   197,236,87,210,                         //vxorps        %ymm2,%ymm2,%ymm2
3232   255,224,                                //jmpq          *%rax
3233 };
3234 
// sk_matrix_2x3_hsw: x86-64 machine code bytes; the trailing comment on each
// row is the AT&T-syntax disassembly of those bytes.
//
// With m = the array of six floats at the address read from the %rsi stream
// (byte offsets 0x0 .. 0x14), computes per lane
//     new ymm0 = m[0]*ymm0 + m[2]*ymm1 + m[4]
//     new ymm1 = m[1]*ymm0 + m[3]*ymm1 + m[5]
// Both results are built in %ymm8/%ymm9 from the original inputs before
// %ymm0/%ymm1 are overwritten. The closing lods/jmpq reads the next qword
// from the stream and jumps to it.
3235 CODE const uint8_t sk_matrix_2x3_hsw[] = {
3236   72,173,                                 //lods          %ds:(%rsi),%rax
       // ymm8 = m[4] + ymm1*m[2] + ymm0*m[0]
3237   196,98,125,24,8,                        //vbroadcastss  (%rax),%ymm9
3238   196,98,125,24,80,8,                     //vbroadcastss  0x8(%rax),%ymm10
3239   196,98,125,24,64,16,                    //vbroadcastss  0x10(%rax),%ymm8
3240   196,66,117,184,194,                     //vfmadd231ps   %ymm10,%ymm1,%ymm8
3241   196,66,125,184,193,                     //vfmadd231ps   %ymm9,%ymm0,%ymm8
       // ymm9 = m[5] + ymm1*m[3] + ymm0*m[1]
3242   196,98,125,24,80,4,                     //vbroadcastss  0x4(%rax),%ymm10
3243   196,98,125,24,88,12,                    //vbroadcastss  0xc(%rax),%ymm11
3244   196,98,125,24,72,20,                    //vbroadcastss  0x14(%rax),%ymm9
3245   196,66,117,184,203,                     //vfmadd231ps   %ymm11,%ymm1,%ymm9
3246   196,66,125,184,202,                     //vfmadd231ps   %ymm10,%ymm0,%ymm9
3247   72,173,                                 //lods          %ds:(%rsi),%rax
3248   197,124,41,192,                         //vmovaps       %ymm8,%ymm0
3249   197,124,41,201,                         //vmovaps       %ymm9,%ymm1
3250   255,224,                                //jmpq          *%rax
3251 };
3252 
// sk_matrix_3x4_hsw: machine code for a stage that applies a 3x4 affine
// transform to (ymm0, ymm1, ymm2). The first word loaded from (%rsi) is used
// as a pointer to twelve floats m[0..11], read column by column:
//   new ymm0 = ymm0*m[0] + ymm1*m[3] + ymm2*m[6] + m[9]
//   new ymm1 = ymm0*m[1] + ymm1*m[4] + ymm2*m[7] + m[10]
//   new ymm2 = ymm0*m[2] + ymm1*m[5] + ymm2*m[8] + m[11]
// Results accumulate in ymm8..ymm10 and are copied back at the end, so every
// row sees the original inputs. ymm3 is not touched.
3253 CODE const uint8_t sk_matrix_3x4_hsw[] = {
3254   72,173,                                 //lods          %ds:(%rsi),%rax
3255   196,98,125,24,8,                        //vbroadcastss  (%rax),%ymm9
3256   196,98,125,24,80,12,                    //vbroadcastss  0xc(%rax),%ymm10
3257   196,98,125,24,88,24,                    //vbroadcastss  0x18(%rax),%ymm11
3258   196,98,125,24,64,36,                    //vbroadcastss  0x24(%rax),%ymm8
3259   196,66,109,184,195,                     //vfmadd231ps   %ymm11,%ymm2,%ymm8
3260   196,66,117,184,194,                     //vfmadd231ps   %ymm10,%ymm1,%ymm8
3261   196,66,125,184,193,                     //vfmadd231ps   %ymm9,%ymm0,%ymm8
3262   196,98,125,24,80,4,                     //vbroadcastss  0x4(%rax),%ymm10
3263   196,98,125,24,88,16,                    //vbroadcastss  0x10(%rax),%ymm11
3264   196,98,125,24,96,28,                    //vbroadcastss  0x1c(%rax),%ymm12
3265   196,98,125,24,72,40,                    //vbroadcastss  0x28(%rax),%ymm9
3266   196,66,109,184,204,                     //vfmadd231ps   %ymm12,%ymm2,%ymm9
3267   196,66,117,184,203,                     //vfmadd231ps   %ymm11,%ymm1,%ymm9
3268   196,66,125,184,202,                     //vfmadd231ps   %ymm10,%ymm0,%ymm9
3269   196,98,125,24,88,8,                     //vbroadcastss  0x8(%rax),%ymm11
3270   196,98,125,24,96,20,                    //vbroadcastss  0x14(%rax),%ymm12
3271   196,98,125,24,104,32,                   //vbroadcastss  0x20(%rax),%ymm13
3272   196,98,125,24,80,44,                    //vbroadcastss  0x2c(%rax),%ymm10
3273   196,66,109,184,213,                     //vfmadd231ps   %ymm13,%ymm2,%ymm10
3274   196,66,117,184,212,                     //vfmadd231ps   %ymm12,%ymm1,%ymm10
3275   196,66,125,184,211,                     //vfmadd231ps   %ymm11,%ymm0,%ymm10
3276   72,173,                                 //lods          %ds:(%rsi),%rax
3277   197,124,41,192,                         //vmovaps       %ymm8,%ymm0
3278   197,124,41,201,                         //vmovaps       %ymm9,%ymm1
3279   197,124,41,210,                         //vmovaps       %ymm10,%ymm2
3280   255,224,                                //jmpq          *%rax
3281 };
3282 
// sk_matrix_4x5_hsw: machine code for a stage that applies a 4x5 affine
// transform to (ymm0, ymm1, ymm2, ymm3). The first word loaded from (%rsi) is
// used as a pointer to twenty floats m[0..19], read column by column. For
// output row i in 0..3:
//   new ymm<i> = ymm0*m[i] + ymm1*m[4+i] + ymm2*m[8+i] + ymm3*m[12+i] + m[16+i]
// Results accumulate in ymm8..ymm11 and are copied back at the end, so every
// row sees the original inputs. ymm12..ymm15 are scratch.
3283 CODE const uint8_t sk_matrix_4x5_hsw[] = {
3284   72,173,                                 //lods          %ds:(%rsi),%rax
  // Row 0 -> ymm8.
3285   196,98,125,24,8,                        //vbroadcastss  (%rax),%ymm9
3286   196,98,125,24,80,16,                    //vbroadcastss  0x10(%rax),%ymm10
3287   196,98,125,24,88,32,                    //vbroadcastss  0x20(%rax),%ymm11
3288   196,98,125,24,96,48,                    //vbroadcastss  0x30(%rax),%ymm12
3289   196,98,125,24,64,64,                    //vbroadcastss  0x40(%rax),%ymm8
3290   196,66,101,184,196,                     //vfmadd231ps   %ymm12,%ymm3,%ymm8
3291   196,66,109,184,195,                     //vfmadd231ps   %ymm11,%ymm2,%ymm8
3292   196,66,117,184,194,                     //vfmadd231ps   %ymm10,%ymm1,%ymm8
3293   196,66,125,184,193,                     //vfmadd231ps   %ymm9,%ymm0,%ymm8
  // Row 1 -> ymm9.
3294   196,98,125,24,80,4,                     //vbroadcastss  0x4(%rax),%ymm10
3295   196,98,125,24,88,20,                    //vbroadcastss  0x14(%rax),%ymm11
3296   196,98,125,24,96,36,                    //vbroadcastss  0x24(%rax),%ymm12
3297   196,98,125,24,104,52,                   //vbroadcastss  0x34(%rax),%ymm13
3298   196,98,125,24,72,68,                    //vbroadcastss  0x44(%rax),%ymm9
3299   196,66,101,184,205,                     //vfmadd231ps   %ymm13,%ymm3,%ymm9
3300   196,66,109,184,204,                     //vfmadd231ps   %ymm12,%ymm2,%ymm9
3301   196,66,117,184,203,                     //vfmadd231ps   %ymm11,%ymm1,%ymm9
3302   196,66,125,184,202,                     //vfmadd231ps   %ymm10,%ymm0,%ymm9
  // Row 2 -> ymm10.
3303   196,98,125,24,88,8,                     //vbroadcastss  0x8(%rax),%ymm11
3304   196,98,125,24,96,24,                    //vbroadcastss  0x18(%rax),%ymm12
3305   196,98,125,24,104,40,                   //vbroadcastss  0x28(%rax),%ymm13
3306   196,98,125,24,112,56,                   //vbroadcastss  0x38(%rax),%ymm14
3307   196,98,125,24,80,72,                    //vbroadcastss  0x48(%rax),%ymm10
3308   196,66,101,184,214,                     //vfmadd231ps   %ymm14,%ymm3,%ymm10
3309   196,66,109,184,213,                     //vfmadd231ps   %ymm13,%ymm2,%ymm10
3310   196,66,117,184,212,                     //vfmadd231ps   %ymm12,%ymm1,%ymm10
3311   196,66,125,184,211,                     //vfmadd231ps   %ymm11,%ymm0,%ymm10
  // Row 3 -> ymm11.
3312   196,98,125,24,96,12,                    //vbroadcastss  0xc(%rax),%ymm12
3313   196,98,125,24,104,28,                   //vbroadcastss  0x1c(%rax),%ymm13
3314   196,98,125,24,112,44,                   //vbroadcastss  0x2c(%rax),%ymm14
3315   196,98,125,24,120,60,                   //vbroadcastss  0x3c(%rax),%ymm15
3316   196,98,125,24,88,76,                    //vbroadcastss  0x4c(%rax),%ymm11
3317   196,66,101,184,223,                     //vfmadd231ps   %ymm15,%ymm3,%ymm11
3318   196,66,109,184,222,                     //vfmadd231ps   %ymm14,%ymm2,%ymm11
3319   196,66,117,184,221,                     //vfmadd231ps   %ymm13,%ymm1,%ymm11
3320   196,66,125,184,220,                     //vfmadd231ps   %ymm12,%ymm0,%ymm11
3321   72,173,                                 //lods          %ds:(%rsi),%rax
3322   197,124,41,192,                         //vmovaps       %ymm8,%ymm0
3323   197,124,41,201,                         //vmovaps       %ymm9,%ymm1
3324   197,124,41,210,                         //vmovaps       %ymm10,%ymm2
3325   197,124,41,219,                         //vmovaps       %ymm11,%ymm3
3326   255,224,                                //jmpq          *%rax
3327 };
3328 
// sk_matrix_perspective_hsw: machine code for a stage that applies a 3x3
// projective transform to (ymm0, ymm1). The first word loaded from (%rsi) is
// used as a pointer to nine floats m[0..8], read row by row:
//   X = ymm0*m[0] + ymm1*m[1] + m[2]      (ymm10)
//   Y = ymm0*m[3] + ymm1*m[4] + m[5]      (ymm11)
//   W = ymm0*m[6] + ymm1*m[7] + m[8]      (ymm12)
//   new ymm0 = X * rcp(W),  new ymm1 = Y * rcp(W)
// The divide uses vrcpps, which is an approximate reciprocal, not an exact
// division. ymm8/ymm9 are scratch.
3329 CODE const uint8_t sk_matrix_perspective_hsw[] = {
3330   72,173,                                 //lods          %ds:(%rsi),%rax
3331   196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
3332   196,98,125,24,72,4,                     //vbroadcastss  0x4(%rax),%ymm9
3333   196,98,125,24,80,8,                     //vbroadcastss  0x8(%rax),%ymm10
3334   196,66,117,184,209,                     //vfmadd231ps   %ymm9,%ymm1,%ymm10
3335   196,66,125,184,208,                     //vfmadd231ps   %ymm8,%ymm0,%ymm10
3336   196,98,125,24,64,12,                    //vbroadcastss  0xc(%rax),%ymm8
3337   196,98,125,24,72,16,                    //vbroadcastss  0x10(%rax),%ymm9
3338   196,98,125,24,88,20,                    //vbroadcastss  0x14(%rax),%ymm11
3339   196,66,117,184,217,                     //vfmadd231ps   %ymm9,%ymm1,%ymm11
3340   196,66,125,184,216,                     //vfmadd231ps   %ymm8,%ymm0,%ymm11
3341   196,98,125,24,64,24,                    //vbroadcastss  0x18(%rax),%ymm8
3342   196,98,125,24,72,28,                    //vbroadcastss  0x1c(%rax),%ymm9
3343   196,98,125,24,96,32,                    //vbroadcastss  0x20(%rax),%ymm12
3344   196,66,117,184,225,                     //vfmadd231ps   %ymm9,%ymm1,%ymm12
3345   196,66,125,184,224,                     //vfmadd231ps   %ymm8,%ymm0,%ymm12
  // ymm1 temporarily holds rcp(W); both inputs are fully consumed by now.
3346   196,193,124,83,204,                     //vrcpps        %ymm12,%ymm1
3347   197,172,89,193,                         //vmulps        %ymm1,%ymm10,%ymm0
3348   197,164,89,201,                         //vmulps        %ymm1,%ymm11,%ymm1
3349   72,173,                                 //lods          %ds:(%rsi),%rax
3350   255,224,                                //jmpq          *%rax
3351 };
3352 
// sk_linear_gradient_2stops_hsw: machine code for a stage that turns the
// per-lane parameter in ymm0 into four channels. The first word loaded from
// (%rsi) is used as a pointer to eight floats c[0..7]:
//   new ymm<i> = c[i] + ymm0 * c[4+i]     for i = 0..3
// Presumably c[0..3] is the start colour and c[4..7] the per-channel delta;
// that is inferred from the stage name only.
// Channel 0 is built in ymm8 and copied last, because ymm0 is still needed as
// the multiplier for the other three channels.
3353 CODE const uint8_t sk_linear_gradient_2stops_hsw[] = {
3354   72,173,                                 //lods          %ds:(%rsi),%rax
3355   196,226,125,24,72,16,                   //vbroadcastss  0x10(%rax),%ymm1
3356   196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
3357   196,98,125,184,193,                     //vfmadd231ps   %ymm1,%ymm0,%ymm8
3358   196,226,125,24,80,20,                   //vbroadcastss  0x14(%rax),%ymm2
3359   196,226,125,24,72,4,                    //vbroadcastss  0x4(%rax),%ymm1
3360   196,226,125,184,202,                    //vfmadd231ps   %ymm2,%ymm0,%ymm1
3361   196,226,125,24,88,24,                   //vbroadcastss  0x18(%rax),%ymm3
3362   196,226,125,24,80,8,                    //vbroadcastss  0x8(%rax),%ymm2
3363   196,226,125,184,211,                    //vfmadd231ps   %ymm3,%ymm0,%ymm2
3364   196,98,125,24,72,28,                    //vbroadcastss  0x1c(%rax),%ymm9
3365   196,226,125,24,88,12,                   //vbroadcastss  0xc(%rax),%ymm3
3366   196,194,125,184,217,                    //vfmadd231ps   %ymm9,%ymm0,%ymm3
3367   72,173,                                 //lods          %ds:(%rsi),%rax
3368   197,124,41,192,                         //vmovaps       %ymm8,%ymm0
3369   255,224,                                //jmpq          *%rax
3370 };
3371 
// sk_start_pipeline_avx: machine code for the driver loop that runs a chain of
// stages over a range of positions, 8 at a time.
// Arguments are read from %rdi, %rsi, %rdx, %rcx (the first four System V
// AMD64 integer argument registers):
//   %rdi = starting position x
//   %rsi = pointer to the stage program; its first 8-byte word is the address
//          of the first stage
//   %rdx = pointer forwarded unchanged to every stage call
//   %rcx = limit (end position)
// Returns the limit in %rax.
// Register roles after the prologue: %rbx = x, %r12 = program pointer past the
// first word, %r13 = limit, %r14 = saved %rdx, %r15 = first stage address.
// Five pushes plus the return address keep %rsp 16-byte aligned at each call.
3372 CODE const uint8_t sk_start_pipeline_avx[] = {
3373   65,87,                                  //push          %r15
3374   65,86,                                  //push          %r14
3375   65,85,                                  //push          %r13
3376   65,84,                                  //push          %r12
3377   83,                                     //push          %rbx
3378   73,137,205,                             //mov           %rcx,%r13
3379   73,137,214,                             //mov           %rdx,%r14
3380   72,137,251,                             //mov           %rdi,%rbx
3381   72,173,                                 //lods          %ds:(%rsi),%rax
3382   73,137,199,                             //mov           %rax,%r15
3383   73,137,244,                             //mov           %rsi,%r12
  // If x + 8 <= limit (unsigned) enter the full-width loop at 0x28; otherwise
  // go straight to the remainder handling at 0x69.
3384   72,141,67,8,                            //lea           0x8(%rbx),%rax
3385   76,57,232,                              //cmp           %r13,%rax
3386   118,5,                                  //jbe           28 <_sk_start_pipeline_avx+0x28>
3387   72,137,223,                             //mov           %rbx,%rdi
3388   235,65,                                 //jmp           69 <_sk_start_pipeline_avx+0x69>
  // 0x28: full-width iteration. %ecx = 0 is passed to the stage (presumably
  // meaning "no partial tail"; the stages that read it are not in view), and
  // ymm0..ymm7 are cleared before every call.
3389   185,0,0,0,0,                            //mov           $0x0,%ecx
3390   197,252,87,192,                         //vxorps        %ymm0,%ymm0,%ymm0
3391   197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
3392   197,236,87,210,                         //vxorps        %ymm2,%ymm2,%ymm2
3393   197,228,87,219,                         //vxorps        %ymm3,%ymm3,%ymm3
3394   197,220,87,228,                         //vxorps        %ymm4,%ymm4,%ymm4
3395   197,212,87,237,                         //vxorps        %ymm5,%ymm5,%ymm5
3396   197,204,87,246,                         //vxorps        %ymm6,%ymm6,%ymm6
3397   197,196,87,255,                         //vxorps        %ymm7,%ymm7,%ymm7
3398   72,137,223,                             //mov           %rbx,%rdi
3399   76,137,230,                             //mov           %r12,%rsi
3400   76,137,242,                             //mov           %r14,%rdx
3401   65,255,215,                             //callq         *%r15
  // Advance x by 8 and loop while another full group fits (x + 16 <= limit,
  // measured from the old x). The mov between cmp and jbe leaves flags intact.
3402   72,141,123,8,                           //lea           0x8(%rbx),%rdi
3403   72,131,195,16,                          //add           $0x10,%rbx
3404   76,57,235,                              //cmp           %r13,%rbx
3405   72,137,251,                             //mov           %rdi,%rbx
3406   118,191,                                //jbe           28 <_sk_start_pipeline_avx+0x28>
  // 0x69: remainder. %rcx = limit - x; skip the final call when it is zero.
3407   76,137,233,                             //mov           %r13,%rcx
3408   72,41,249,                              //sub           %rdi,%rcx
3409   116,41,                                 //je            9a <_sk_start_pipeline_avx+0x9a>
3410   197,252,87,192,                         //vxorps        %ymm0,%ymm0,%ymm0
3411   197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
3412   197,236,87,210,                         //vxorps        %ymm2,%ymm2,%ymm2
3413   197,228,87,219,                         //vxorps        %ymm3,%ymm3,%ymm3
3414   197,220,87,228,                         //vxorps        %ymm4,%ymm4,%ymm4
3415   197,212,87,237,                         //vxorps        %ymm5,%ymm5,%ymm5
3416   197,204,87,246,                         //vxorps        %ymm6,%ymm6,%ymm6
3417   197,196,87,255,                         //vxorps        %ymm7,%ymm7,%ymm7
3418   76,137,230,                             //mov           %r12,%rsi
3419   76,137,242,                             //mov           %r14,%rdx
3420   65,255,215,                             //callq         *%r15
  // 0x9a: return the limit, restore callee-saved registers, and clear the
  // upper ymm halves before returning to the caller.
3421   76,137,232,                             //mov           %r13,%rax
3422   91,                                     //pop           %rbx
3423   65,92,                                  //pop           %r12
3424   65,93,                                  //pop           %r13
3425   65,94,                                  //pop           %r14
3426   65,95,                                  //pop           %r15
3427   197,248,119,                            //vzeroupper
3428   195,                                    //retq
3429 };
3430 
// sk_just_return_avx: machine code consisting of a single `ret`. Unlike the
// other stages it does not jump to a following stage, so it returns to
// whoever issued the `call` that started the chain.
3431 CODE const uint8_t sk_just_return_avx[] = {
3432   195,                                    //retq
3433 };
3434 
// sk_seed_shader_avx: machine code for a stage that initialises the working
// registers from the current position. The first word loaded from (%rsi) is
// used as a pointer to a 32-bit integer.
//   ymm0 = float(%edi) + 0.5 + eight floats loaded from (%rdx)
//          (presumably per-lane offsets 0..7; the data is not in view)
//   ymm1 = float(int at context pointer) + 0.5
//   ymm2 = 1.0
//   ymm3..ymm7 = 0
// Broadcasts are done with vpshufd/vpermilps + vinsertf128 rather than the
// vpbroadcastd form used by the _hsw arrays in this file.
3435 CODE const uint8_t sk_seed_shader_avx[] = {
3436   72,173,                                 //lods          %ds:(%rsi),%rax
3437   197,249,110,199,                        //vmovd         %edi,%xmm0
3438   197,249,112,192,0,                      //vpshufd       $0x0,%xmm0,%xmm0
3439   196,227,125,24,192,1,                   //vinsertf128   $0x1,%xmm0,%ymm0,%ymm0
3440   197,252,91,192,                         //vcvtdq2ps     %ymm0,%ymm0
  // 0x3f000000 is the bit pattern of 0.5f.
3441   65,184,0,0,0,63,                        //mov           $0x3f000000,%r8d
3442   196,193,121,110,200,                    //vmovd         %r8d,%xmm1
3443   196,227,121,4,201,0,                    //vpermilps     $0x0,%xmm1,%xmm1
3444   196,227,117,24,201,1,                   //vinsertf128   $0x1,%xmm1,%ymm1,%ymm1
3445   197,252,88,193,                         //vaddps        %ymm1,%ymm0,%ymm0
3446   197,252,88,2,                           //vaddps        (%rdx),%ymm0,%ymm0
3447   196,226,125,24,16,                      //vbroadcastss  (%rax),%ymm2
3448   197,252,91,210,                         //vcvtdq2ps     %ymm2,%ymm2
3449   197,236,88,201,                         //vaddps        %ymm1,%ymm2,%ymm1
  // 0x3f800000 is the bit pattern of 1.0f. %eax is free to reuse because the
  // context pointer has already been dereferenced.
3450   184,0,0,128,63,                         //mov           $0x3f800000,%eax
3451   197,249,110,208,                        //vmovd         %eax,%xmm2
3452   196,227,121,4,210,0,                    //vpermilps     $0x0,%xmm2,%xmm2
3453   196,227,109,24,210,1,                   //vinsertf128   $0x1,%xmm2,%ymm2,%ymm2
3454   72,173,                                 //lods          %ds:(%rsi),%rax
3455   197,228,87,219,                         //vxorps        %ymm3,%ymm3,%ymm3
3456   197,220,87,228,                         //vxorps        %ymm4,%ymm4,%ymm4
3457   197,212,87,237,                         //vxorps        %ymm5,%ymm5,%ymm5
3458   197,204,87,246,                         //vxorps        %ymm6,%ymm6,%ymm6
3459   197,196,87,255,                         //vxorps        %ymm7,%ymm7,%ymm7
3460   255,224,                                //jmpq          *%rax
3461 };
3462 
// sk_constant_color_avx: machine code for a stage that fills ymm0..ymm3 with
// constants. The first word loaded from (%rsi) is used as a pointer to four
// floats; float i is broadcast to every lane of ymm<i>. Then it jumps to the
// next word from (%rsi).
3463 CODE const uint8_t sk_constant_color_avx[] = {
3464   72,173,                                 //lods          %ds:(%rsi),%rax
3465   196,226,125,24,0,                       //vbroadcastss  (%rax),%ymm0
3466   196,226,125,24,72,4,                    //vbroadcastss  0x4(%rax),%ymm1
3467   196,226,125,24,80,8,                    //vbroadcastss  0x8(%rax),%ymm2
3468   196,226,125,24,88,12,                   //vbroadcastss  0xc(%rax),%ymm3
3469   72,173,                                 //lods          %ds:(%rsi),%rax
3470   255,224,                                //jmpq          *%rax
3471 };
3472 
// sk_clear_avx: machine code for a stage that zeroes ymm0..ymm3 and jumps to
// the next word loaded from (%rsi). It reads no context pointer.
3473 CODE const uint8_t sk_clear_avx[] = {
3474   72,173,                                 //lods          %ds:(%rsi),%rax
3475   197,252,87,192,                         //vxorps        %ymm0,%ymm0,%ymm0
3476   197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
3477   197,236,87,210,                         //vxorps        %ymm2,%ymm2,%ymm2
3478   197,228,87,219,                         //vxorps        %ymm3,%ymm3,%ymm3
3479   255,224,                                //jmpq          *%rax
3480 };
3481 
// sk_plus__avx: machine code for a stage that adds ymm4..ymm7 into ymm0..ymm3
// lane-wise (ymm<i> += ymm<i+4>), leaving ymm4..ymm7 unchanged, then jumps to
// the next word loaded from (%rsi). No clamping is done here.
3482 CODE const uint8_t sk_plus__avx[] = {
3483   197,252,88,196,                         //vaddps        %ymm4,%ymm0,%ymm0
3484   197,244,88,205,                         //vaddps        %ymm5,%ymm1,%ymm1
3485   197,236,88,214,                         //vaddps        %ymm6,%ymm2,%ymm2
3486   197,228,88,223,                         //vaddps        %ymm7,%ymm3,%ymm3
3487   72,173,                                 //lods          %ds:(%rsi),%rax
3488   255,224,                                //jmpq          *%rax
3489 };
3490 
// sk_srcover_avx: machine code for a stage computing, for i = 0..3,
//   ymm<i> = ymm<i> + ymm<i+4> * (1 - ymm3)
// where ymm3 on the right is its value on entry. ymm8 holds (1 - ymm3) and
// ymm9 is scratch; ymm4..ymm7 are unchanged. Going by the other stages in
// this file, ymm0..ymm3 are the source r,g,b,a and ymm4..ymm7 the destination.
3491 CODE const uint8_t sk_srcover_avx[] = {
  // Broadcast 1.0f (0x3f800000) to ymm8, then subtract ymm3.
3492   184,0,0,128,63,                         //mov           $0x3f800000,%eax
3493   197,121,110,192,                        //vmovd         %eax,%xmm8
3494   196,67,121,4,192,0,                     //vpermilps     $0x0,%xmm8,%xmm8
3495   196,67,61,24,192,1,                     //vinsertf128   $0x1,%xmm8,%ymm8,%ymm8
3496   197,60,92,195,                          //vsubps        %ymm3,%ymm8,%ymm8
3497   197,60,89,204,                          //vmulps        %ymm4,%ymm8,%ymm9
3498   197,180,88,192,                         //vaddps        %ymm0,%ymm9,%ymm0
3499   197,60,89,205,                          //vmulps        %ymm5,%ymm8,%ymm9
3500   197,180,88,201,                         //vaddps        %ymm1,%ymm9,%ymm1
3501   197,60,89,206,                          //vmulps        %ymm6,%ymm8,%ymm9
3502   197,180,88,210,                         //vaddps        %ymm2,%ymm9,%ymm2
3503   197,60,89,199,                          //vmulps        %ymm7,%ymm8,%ymm8
3504   197,188,88,219,                         //vaddps        %ymm3,%ymm8,%ymm3
3505   72,173,                                 //lods          %ds:(%rsi),%rax
3506   255,224,                                //jmpq          *%rax
3507 };
3508 
// sk_dstover_avx: machine code for a stage computing, for i = 0..3,
//   ymm<i> = ymm<i> * (1 - ymm7) + ymm<i+4>
// ymm8 holds (1 - ymm7); ymm4..ymm7 are unchanged. This mirrors sk_srcover_avx
// with the roles of the two register groups exchanged.
3509 CODE const uint8_t sk_dstover_avx[] = {
  // Broadcast 1.0f (0x3f800000) to ymm8, then subtract ymm7.
3510   184,0,0,128,63,                         //mov           $0x3f800000,%eax
3511   197,121,110,192,                        //vmovd         %eax,%xmm8
3512   196,67,121,4,192,0,                     //vpermilps     $0x0,%xmm8,%xmm8
3513   196,67,61,24,192,1,                     //vinsertf128   $0x1,%xmm8,%ymm8,%ymm8
3514   197,60,92,199,                          //vsubps        %ymm7,%ymm8,%ymm8
3515   197,188,89,192,                         //vmulps        %ymm0,%ymm8,%ymm0
3516   197,252,88,196,                         //vaddps        %ymm4,%ymm0,%ymm0
3517   197,188,89,201,                         //vmulps        %ymm1,%ymm8,%ymm1
3518   197,244,88,205,                         //vaddps        %ymm5,%ymm1,%ymm1
3519   197,188,89,210,                         //vmulps        %ymm2,%ymm8,%ymm2
3520   197,236,88,214,                         //vaddps        %ymm6,%ymm2,%ymm2
3521   197,188,89,219,                         //vmulps        %ymm3,%ymm8,%ymm3
3522   197,228,88,223,                         //vaddps        %ymm7,%ymm3,%ymm3
3523   72,173,                                 //lods          %ds:(%rsi),%rax
3524   255,224,                                //jmpq          *%rax
3525 };
3526 
// sk_clamp_0_avx: machine code for a stage that clamps ymm0..ymm3 from below
// at zero (ymm<i> = max(ymm<i>, 0)), using ymm8 as the zero vector, then jumps
// to the next word loaded from (%rsi).
3527 CODE const uint8_t sk_clamp_0_avx[] = {
3528   196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
3529   196,193,124,95,192,                     //vmaxps        %ymm8,%ymm0,%ymm0
3530   196,193,116,95,200,                     //vmaxps        %ymm8,%ymm1,%ymm1
3531   196,193,108,95,208,                     //vmaxps        %ymm8,%ymm2,%ymm2
3532   196,193,100,95,216,                     //vmaxps        %ymm8,%ymm3,%ymm3
3533   72,173,                                 //lods          %ds:(%rsi),%rax
3534   255,224,                                //jmpq          *%rax
3535 };
3536 
// sk_clamp_1_avx: machine code for a stage that clamps ymm0..ymm3 from above
// at 1.0 (ymm<i> = min(ymm<i>, 1.0)), then jumps to the next word loaded from
// (%rsi). ymm8 holds the broadcast 1.0f (bit pattern 0x3f800000).
3537 CODE const uint8_t sk_clamp_1_avx[] = {
3538   184,0,0,128,63,                         //mov           $0x3f800000,%eax
3539   197,121,110,192,                        //vmovd         %eax,%xmm8
3540   196,67,121,4,192,0,                     //vpermilps     $0x0,%xmm8,%xmm8
3541   196,67,61,24,192,1,                     //vinsertf128   $0x1,%xmm8,%ymm8,%ymm8
3542   196,193,124,93,192,                     //vminps        %ymm8,%ymm0,%ymm0
3543   196,193,116,93,200,                     //vminps        %ymm8,%ymm1,%ymm1
3544   196,193,108,93,208,                     //vminps        %ymm8,%ymm2,%ymm2
3545   196,193,100,93,216,                     //vminps        %ymm8,%ymm3,%ymm3
3546   72,173,                                 //lods          %ds:(%rsi),%rax
3547   255,224,                                //jmpq          *%rax
3548 };
3549 
// sk_clamp_a_avx: machine code for a stage that first clamps ymm3 to at most
// 1.0, then clamps ymm0..ymm2 to at most the (already clamped) ymm3:
//   ymm3 = min(ymm3, 1.0);  ymm<i> = min(ymm<i>, ymm3) for i = 0..2
// It then jumps to the next word loaded from (%rsi).
3550 CODE const uint8_t sk_clamp_a_avx[] = {
3551   184,0,0,128,63,                         //mov           $0x3f800000,%eax
3552   197,121,110,192,                        //vmovd         %eax,%xmm8
3553   196,67,121,4,192,0,                     //vpermilps     $0x0,%xmm8,%xmm8
3554   196,67,61,24,192,1,                     //vinsertf128   $0x1,%xmm8,%ymm8,%ymm8
3555   196,193,100,93,216,                     //vminps        %ymm8,%ymm3,%ymm3
3556   197,252,93,195,                         //vminps        %ymm3,%ymm0,%ymm0
3557   197,244,93,203,                         //vminps        %ymm3,%ymm1,%ymm1
3558   197,236,93,211,                         //vminps        %ymm3,%ymm2,%ymm2
3559   72,173,                                 //lods          %ds:(%rsi),%rax
3560   255,224,                                //jmpq          *%rax
3561 };
3562 
// sk_set_rgb_avx: machine code for a stage that fills ymm0..ymm2 with
// constants and leaves ymm3 untouched. The first word loaded from (%rsi) is
// used as a pointer to three floats; float i is broadcast to every lane of
// ymm<i>. Then it jumps to the next word from (%rsi).
3563 CODE const uint8_t sk_set_rgb_avx[] = {
3564   72,173,                                 //lods          %ds:(%rsi),%rax
3565   196,226,125,24,0,                       //vbroadcastss  (%rax),%ymm0
3566   196,226,125,24,72,4,                    //vbroadcastss  0x4(%rax),%ymm1
3567   196,226,125,24,80,8,                    //vbroadcastss  0x8(%rax),%ymm2
3568   72,173,                                 //lods          %ds:(%rsi),%rax
3569   255,224,                                //jmpq          *%rax
3570 };
3571 
// sk_swap_rb_avx: machine code for a stage that exchanges ymm0 and ymm2 via
// the temporary ymm8, leaving ymm1 and ymm3 untouched, then jumps to the next
// word loaded from (%rsi).
3572 CODE const uint8_t sk_swap_rb_avx[] = {
3573   197,124,40,192,                         //vmovaps       %ymm0,%ymm8
3574   72,173,                                 //lods          %ds:(%rsi),%rax
3575   197,252,40,194,                         //vmovaps       %ymm2,%ymm0
3576   197,124,41,194,                         //vmovaps       %ymm8,%ymm2
3577   255,224,                                //jmpq          *%rax
3578 };
3579 
// sk_swap_avx: machine code for a stage that exchanges the register group
// ymm0..ymm3 with ymm4..ymm7 (ymm<i> <-> ymm<i+4>), then jumps to the next
// word loaded from (%rsi). The old ymm0..ymm3 are parked in ymm11..ymm8
// (note the reversed order) while the copy happens.
3580 CODE const uint8_t sk_swap_avx[] = {
3581   197,124,40,195,                         //vmovaps       %ymm3,%ymm8
3582   197,124,40,202,                         //vmovaps       %ymm2,%ymm9
3583   197,124,40,209,                         //vmovaps       %ymm1,%ymm10
3584   197,124,40,216,                         //vmovaps       %ymm0,%ymm11
3585   72,173,                                 //lods          %ds:(%rsi),%rax
3586   197,252,40,196,                         //vmovaps       %ymm4,%ymm0
3587   197,252,40,205,                         //vmovaps       %ymm5,%ymm1
3588   197,252,40,214,                         //vmovaps       %ymm6,%ymm2
3589   197,252,40,223,                         //vmovaps       %ymm7,%ymm3
3590   197,124,41,220,                         //vmovaps       %ymm11,%ymm4
3591   197,124,41,213,                         //vmovaps       %ymm10,%ymm5
3592   197,124,41,206,                         //vmovaps       %ymm9,%ymm6
3593   197,124,41,199,                         //vmovaps       %ymm8,%ymm7
3594   255,224,                                //jmpq          *%rax
3595 };
3596 
// sk_move_src_dst_avx: machine code for a stage that copies ymm0..ymm3 into
// ymm4..ymm7 (ymm<i+4> = ymm<i>), leaving ymm0..ymm3 unchanged, then jumps to
// the next word loaded from (%rsi).
3597 CODE const uint8_t sk_move_src_dst_avx[] = {
3598   72,173,                                 //lods          %ds:(%rsi),%rax
3599   197,252,40,224,                         //vmovaps       %ymm0,%ymm4
3600   197,252,40,233,                         //vmovaps       %ymm1,%ymm5
3601   197,252,40,242,                         //vmovaps       %ymm2,%ymm6
3602   197,252,40,251,                         //vmovaps       %ymm3,%ymm7
3603   255,224,                                //jmpq          *%rax
3604 };
3605 
// sk_move_dst_src_avx: machine code for a stage that copies ymm4..ymm7 into
// ymm0..ymm3 (ymm<i> = ymm<i+4>), leaving ymm4..ymm7 unchanged, then jumps to
// the next word loaded from (%rsi).
3606 CODE const uint8_t sk_move_dst_src_avx[] = {
3607   72,173,                                 //lods          %ds:(%rsi),%rax
3608   197,252,40,196,                         //vmovaps       %ymm4,%ymm0
3609   197,252,40,205,                         //vmovaps       %ymm5,%ymm1
3610   197,252,40,214,                         //vmovaps       %ymm6,%ymm2
3611   197,252,40,223,                         //vmovaps       %ymm7,%ymm3
3612   255,224,                                //jmpq          *%rax
3613 };
3614 
// sk_premul_avx: machine code for a stage that multiplies ymm0..ymm2 by ymm3
// lane-wise (ymm<i> *= ymm3 for i = 0..2), leaving ymm3 unchanged, then jumps
// to the next word loaded from (%rsi).
3615 CODE const uint8_t sk_premul_avx[] = {
3616   197,252,89,195,                         //vmulps        %ymm3,%ymm0,%ymm0
3617   197,244,89,203,                         //vmulps        %ymm3,%ymm1,%ymm1
3618   197,236,89,211,                         //vmulps        %ymm3,%ymm2,%ymm2
3619   72,173,                                 //lods          %ds:(%rsi),%rax
3620   255,224,                                //jmpq          *%rax
3621 };
3622 
// sk_unpremul_avx: machine code for a stage that divides ymm0..ymm2 by ymm3,
// with a guard for zero:
//   scale = (ymm3 == 0) ? 0 : 1.0 / ymm3        (per lane, in ymm8)
//   ymm<i> *= scale for i = 0..2
// ymm3 is unchanged. It uses a true vdivps, not the approximate vrcpps.
3623 CODE const uint8_t sk_unpremul_avx[] = {
3624   196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
  // ymm9 = all-ones mask in lanes where ymm3 == 0.
3625   196,65,100,194,200,0,                   //vcmpeqps      %ymm8,%ymm3,%ymm9
3626   184,0,0,128,63,                         //mov           $0x3f800000,%eax
3627   197,121,110,208,                        //vmovd         %eax,%xmm10
3628   196,67,121,4,210,0,                     //vpermilps     $0x0,%xmm10,%xmm10
3629   196,67,45,24,210,1,                     //vinsertf128   $0x1,%xmm10,%ymm10,%ymm10
3630   197,44,94,211,                          //vdivps        %ymm3,%ymm10,%ymm10
  // Pick 0 (ymm8) where the mask is set, otherwise 1/ymm3 (ymm10).
3631   196,67,45,74,192,144,                   //vblendvps     %ymm9,%ymm8,%ymm10,%ymm8
3632   197,188,89,192,                         //vmulps        %ymm0,%ymm8,%ymm0
3633   197,188,89,201,                         //vmulps        %ymm1,%ymm8,%ymm1
3634   197,188,89,210,                         //vmulps        %ymm2,%ymm8,%ymm2
3635   72,173,                                 //lods          %ds:(%rsi),%rax
3636   255,224,                                //jmpq          *%rax
3637 };
3638 
// sk_from_srgb_avx: machine code for a stage that applies the same piecewise
// function to each of ymm0, ymm1, ymm2 (ymm3 is untouched):
//   f(x) = x * k0                          if x < k4
//        = x*x * (k1*x + k2) + k3          otherwise
// with float bit patterns (decimal values approximate):
//   k0 = 0x3d9e8391 ~ 0.0774 (about 1/12.92)    k1 = 0x3e99999a ~ 0.3
//   k2 = 0x3f328f5c ~ 0.6975                    k3 = 0x3b23d70a ~ 0.0025
//   k4 = 0x3d6147ae ~ 0.055
// By the stage name this is a polynomial approximation of the sRGB-to-linear
// transfer curve. Constants are kept in ymm8 (k0), ymm11 (k1), ymm12 (k2),
// ymm14 (k3), ymm13 (k4); ymm9, ymm10, ymm15 are scratch.
3639 CODE const uint8_t sk_from_srgb_avx[] = {
3640   184,145,131,158,61,                     //mov           $0x3d9e8391,%eax
3641   197,121,110,192,                        //vmovd         %eax,%xmm8
3642   196,67,121,4,192,0,                     //vpermilps     $0x0,%xmm8,%xmm8
3643   196,67,61,24,192,1,                     //vinsertf128   $0x1,%xmm8,%ymm8,%ymm8
  // Channel ymm0: ymm9 = linear branch, ymm10 = x*x.
3644   197,60,89,200,                          //vmulps        %ymm0,%ymm8,%ymm9
3645   197,124,89,208,                         //vmulps        %ymm0,%ymm0,%ymm10
3646   184,154,153,153,62,                     //mov           $0x3e99999a,%eax
3647   197,121,110,216,                        //vmovd         %eax,%xmm11
3648   196,67,121,4,219,0,                     //vpermilps     $0x0,%xmm11,%xmm11
3649   196,67,37,24,219,1,                     //vinsertf128   $0x1,%xmm11,%ymm11,%ymm11
3650   184,92,143,50,63,                       //mov           $0x3f328f5c,%eax
3651   197,121,110,224,                        //vmovd         %eax,%xmm12
3652   196,67,121,4,228,0,                     //vpermilps     $0x0,%xmm12,%xmm12
3653   196,67,29,24,228,1,                     //vinsertf128   $0x1,%xmm12,%ymm12,%ymm12
3654   197,36,89,232,                          //vmulps        %ymm0,%ymm11,%ymm13
3655   196,65,20,88,236,                       //vaddps        %ymm12,%ymm13,%ymm13
3656   184,10,215,35,59,                       //mov           $0x3b23d70a,%eax
3657   197,121,110,240,                        //vmovd         %eax,%xmm14
3658   196,67,121,4,246,0,                     //vpermilps     $0x0,%xmm14,%xmm14
3659   196,67,13,24,246,1,                     //vinsertf128   $0x1,%xmm14,%ymm14,%ymm14
3660   196,65,44,89,213,                       //vmulps        %ymm13,%ymm10,%ymm10
3661   196,65,12,88,210,                       //vaddps        %ymm10,%ymm14,%ymm10
  // ymm13 is reused here: it now holds the threshold k4.
3662   184,174,71,97,61,                       //mov           $0x3d6147ae,%eax
3663   197,121,110,232,                        //vmovd         %eax,%xmm13
3664   196,67,121,4,237,0,                     //vpermilps     $0x0,%xmm13,%xmm13
3665   196,67,21,24,237,1,                     //vinsertf128   $0x1,%xmm13,%ymm13,%ymm13
  // ymm0 becomes the (x < k4) mask, then selects linear vs polynomial branch.
3666   196,193,124,194,197,1,                  //vcmpltps      %ymm13,%ymm0,%ymm0
3667   196,195,45,74,193,0,                    //vblendvps     %ymm0,%ymm9,%ymm10,%ymm0
  // Channel ymm1: same computation, with ymm15 as the polynomial temporary.
3668   197,60,89,201,                          //vmulps        %ymm1,%ymm8,%ymm9
3669   197,116,89,209,                         //vmulps        %ymm1,%ymm1,%ymm10
3670   197,36,89,249,                          //vmulps        %ymm1,%ymm11,%ymm15
3671   196,65,28,88,255,                       //vaddps        %ymm15,%ymm12,%ymm15
3672   196,65,44,89,215,                       //vmulps        %ymm15,%ymm10,%ymm10
3673   196,65,12,88,210,                       //vaddps        %ymm10,%ymm14,%ymm10
3674   196,193,116,194,205,1,                  //vcmpltps      %ymm13,%ymm1,%ymm1
3675   196,195,45,74,201,16,                   //vblendvps     %ymm1,%ymm9,%ymm10,%ymm1
  // Channel ymm2: last use of k0, so ymm8 itself is overwritten with the
  // linear branch.
3676   197,60,89,194,                          //vmulps        %ymm2,%ymm8,%ymm8
3677   197,108,89,202,                         //vmulps        %ymm2,%ymm2,%ymm9
3678   197,36,89,210,                          //vmulps        %ymm2,%ymm11,%ymm10
3679   196,65,28,88,210,                       //vaddps        %ymm10,%ymm12,%ymm10
3680   196,65,52,89,202,                       //vmulps        %ymm10,%ymm9,%ymm9
3681   196,65,12,88,201,                       //vaddps        %ymm9,%ymm14,%ymm9
3682   196,193,108,194,213,1,                  //vcmpltps      %ymm13,%ymm2,%ymm2
3683   196,195,53,74,208,32,                   //vblendvps     %ymm2,%ymm8,%ymm9,%ymm2
3684   72,173,                                 //lods          %ds:(%rsi),%rax
3685   255,224,                                //jmpq          *%rax
3686 };
3687 
// sk_to_srgb_avx: raw x86-64 AVX machine code; the trailing comments are the AT&T disassembly.
// For each of ymm0, ymm1 and ymm2 (presumably the r, g, b lanes -- confirm against the stage
// source) it computes, per 32-bit float x:
//     lo = 12.46f * x
//     hi = min(1.0f, 0.411192f * rsqrt(rsqrt(x)) + 0.689206f * rcp(rsqrt(x)) - 0.0988f)
//     x  = (x < 0.0043f) ? lo : hi
// rcp(rsqrt(x)) and rsqrt(rsqrt(x)) are the hardware approximations standing in for x^(1/2)
// and x^(1/4). ymm3 is never written. The code ends by loading the next pointer from (%rsi)
// into %rax (lods also steps %rsi) and jumping to it.
3688 CODE const uint8_t sk_to_srgb_avx[] = {
       // --- ymm0: approximate square root (ymm13) and fourth root (ymm14) ---
3689   197,124,82,192,                         //vrsqrtps      %ymm0,%ymm8
3690   196,65,124,83,232,                      //vrcpps        %ymm8,%ymm13
3691   196,65,124,82,240,                      //vrsqrtps      %ymm8,%ymm14
       // ymm8 = 12.46f in all lanes (0x41475c29); ymm12 = lo = 12.46f * ymm0
3692   184,41,92,71,65,                        //mov           $0x41475c29,%eax
3693   197,121,110,192,                        //vmovd         %eax,%xmm8
3694   196,67,121,4,192,0,                     //vpermilps     $0x0,%xmm8,%xmm8
3695   196,67,61,24,192,1,                     //vinsertf128   $0x1,%xmm8,%ymm8,%ymm8
3696   197,60,89,224,                          //vmulps        %ymm0,%ymm8,%ymm12
       // ymm9 = 1.0f in all lanes (0x3f800000)
3697   184,0,0,128,63,                         //mov           $0x3f800000,%eax
3698   197,121,110,200,                        //vmovd         %eax,%xmm9
3699   196,67,121,4,201,0,                     //vpermilps     $0x0,%xmm9,%xmm9
3700   196,67,53,24,201,1,                     //vinsertf128   $0x1,%xmm9,%ymm9,%ymm9
       // ymm10 = 0.411192f in all lanes (0x3ed287c2), the fourth-root coefficient
3701   184,194,135,210,62,                     //mov           $0x3ed287c2,%eax
3702   197,121,110,208,                        //vmovd         %eax,%xmm10
3703   196,67,121,4,210,0,                     //vpermilps     $0x0,%xmm10,%xmm10
3704   196,67,45,24,210,1,                     //vinsertf128   $0x1,%xmm10,%ymm10,%ymm10
       // ymm11 = 0.689206f in all lanes (0x3f306fce), the square-root coefficient
3705   184,206,111,48,63,                      //mov           $0x3f306fce,%eax
3706   197,121,110,216,                        //vmovd         %eax,%xmm11
3707   196,67,121,4,219,0,                     //vpermilps     $0x0,%xmm11,%xmm11
3708   196,67,37,24,219,1,                     //vinsertf128   $0x1,%xmm11,%ymm11,%ymm11
       // ymm15 = -0.0988f in all lanes: 0x3dca57a8 (0.0988f) with the sign bit flipped by the xor
3709   184,168,87,202,61,                      //mov           $0x3dca57a8,%eax
3710   53,0,0,0,128,                           //xor           $0x80000000,%eax
3711   197,121,110,248,                        //vmovd         %eax,%xmm15
3712   196,67,121,4,255,0,                     //vpermilps     $0x0,%xmm15,%xmm15
3713   196,67,5,24,255,1,                      //vinsertf128   $0x1,%xmm15,%ymm15,%ymm15
       // ymm13 = hi = min(1.0f, 0.411192f*ftrt + (0.689206f*sqrt - 0.0988f))
3714   196,65,20,89,235,                       //vmulps        %ymm11,%ymm13,%ymm13
3715   196,65,20,88,239,                       //vaddps        %ymm15,%ymm13,%ymm13
3716   196,65,12,89,242,                       //vmulps        %ymm10,%ymm14,%ymm14
3717   196,65,12,88,237,                       //vaddps        %ymm13,%ymm14,%ymm13
3718   196,65,52,93,237,                       //vminps        %ymm13,%ymm9,%ymm13
       // ymm14 = 0.0043f in all lanes (0x3b8ce704), the lo/hi switch-over threshold
3719   184,4,231,140,59,                       //mov           $0x3b8ce704,%eax
3720   197,121,110,240,                        //vmovd         %eax,%xmm14
3721   196,67,121,4,246,0,                     //vpermilps     $0x0,%xmm14,%xmm14
3722   196,67,13,24,246,1,                     //vinsertf128   $0x1,%xmm14,%ymm14,%ymm14
       // ymm0 = (ymm0 < 0.0043f) ? lo (ymm12) : hi (ymm13); the compare result is the blend mask
3723   196,193,124,194,198,1,                  //vcmpltps      %ymm14,%ymm0,%ymm0
3724   196,195,21,74,196,0,                    //vblendvps     %ymm0,%ymm12,%ymm13,%ymm0
       // --- ymm1: same formula, reusing the constants in ymm8..ymm11, ymm14, ymm15 ---
3725   197,124,82,225,                         //vrsqrtps      %ymm1,%ymm12
3726   196,65,124,83,236,                      //vrcpps        %ymm12,%ymm13
3727   196,65,124,82,228,                      //vrsqrtps      %ymm12,%ymm12
3728   196,65,36,89,237,                       //vmulps        %ymm13,%ymm11,%ymm13
3729   196,65,4,88,237,                        //vaddps        %ymm13,%ymm15,%ymm13
3730   196,65,44,89,228,                       //vmulps        %ymm12,%ymm10,%ymm12
3731   196,65,28,88,229,                       //vaddps        %ymm13,%ymm12,%ymm12
3732   197,60,89,233,                          //vmulps        %ymm1,%ymm8,%ymm13
3733   196,65,52,93,228,                       //vminps        %ymm12,%ymm9,%ymm12
3734   196,193,116,194,206,1,                  //vcmpltps      %ymm14,%ymm1,%ymm1
3735   196,195,29,74,205,16,                   //vblendvps     %ymm1,%ymm13,%ymm12,%ymm1
       // --- ymm2: same formula; the constant registers ymm8..ymm11 are consumed as scratch here ---
3736   197,124,82,226,                         //vrsqrtps      %ymm2,%ymm12
3737   196,65,124,83,236,                      //vrcpps        %ymm12,%ymm13
3738   196,65,36,89,221,                       //vmulps        %ymm13,%ymm11,%ymm11
3739   196,65,4,88,219,                        //vaddps        %ymm11,%ymm15,%ymm11
3740   196,65,124,82,228,                      //vrsqrtps      %ymm12,%ymm12
3741   196,65,44,89,212,                       //vmulps        %ymm12,%ymm10,%ymm10
3742   196,65,44,88,211,                       //vaddps        %ymm11,%ymm10,%ymm10
3743   196,65,52,93,202,                       //vminps        %ymm10,%ymm9,%ymm9
3744   197,60,89,194,                          //vmulps        %ymm2,%ymm8,%ymm8
3745   196,193,108,194,214,1,                  //vcmpltps      %ymm14,%ymm2,%ymm2
3746   196,195,53,74,208,32,                   //vblendvps     %ymm2,%ymm8,%ymm9,%ymm2
       // Fetch the next code pointer from (%rsi) and jump to it.
3747   72,173,                                 //lods          %ds:(%rsi),%rax
3748   255,224,                                //jmpq          *%rax
3749 };
3750 
// sk_scale_1_float_avx: raw x86-64 AVX machine code; the trailing comments are the AT&T disassembly.
// Loads a pointer from (%rsi), broadcasts the single float it points to into ymm8, and
// multiplies ymm0..ymm3 by it. Then loads the next pointer from (%rsi) and jumps to it.
3751 CODE const uint8_t sk_scale_1_float_avx[] = {
       // rax = pointer to the scale factor; ymm8 = that float in all 8 lanes
3752   72,173,                                 //lods          %ds:(%rsi),%rax
3753   196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
       // ymm0..ymm3 *= scale
3754   197,188,89,192,                         //vmulps        %ymm0,%ymm8,%ymm0
3755   197,188,89,201,                         //vmulps        %ymm1,%ymm8,%ymm1
3756   197,188,89,210,                         //vmulps        %ymm2,%ymm8,%ymm2
3757   197,188,89,219,                         //vmulps        %ymm3,%ymm8,%ymm3
       // Fetch the next code pointer from (%rsi) and jump to it.
3758   72,173,                                 //lods          %ds:(%rsi),%rax
3759   255,224,                                //jmpq          *%rax
3760 };
3761 
// sk_scale_u8_avx: raw x86-64 AVX machine code; the trailing comments are the AT&T disassembly.
// Reads 8 bytes from (*ctx + %rdi), where ctx is the pointer loaded from (%rsi), converts each
// byte to a float scaled by 1/255, and multiplies ymm0..ymm3 by the result.
// When %rcx is non-zero (presumably a "tail" count of fewer than 8 pixels -- confirm against
// the caller) only %rcx bytes are read, one at a time, and the remaining lanes see zero.
// %rcx is preserved across the routine via %r8.
3762 CODE const uint8_t sk_scale_u8_avx[] = {
       // r8 = rcx (saved, because the tail loop uses %cl as a shift count)
3763   73,137,200,                             //mov           %rcx,%r8
       // rax = *ctx + rdi: address of the first byte to read
3764   72,173,                                 //lods          %ds:(%rsi),%rax
3765   72,139,0,                               //mov           (%rax),%rax
3766   72,1,248,                               //add           %rdi,%rax
       // Non-zero count: take the byte-at-a-time path at +0x60.
3767   77,133,192,                             //test          %r8,%r8
3768   117,80,                                 //jne           5a2 <_sk_scale_u8_avx+0x60>
       // Full path: load 8 bytes at once into xmm8.
3769   197,122,126,0,                          //vmovq         (%rax),%xmm8
       // +0x14 (tail path rejoins here): zero-extend bytes 0-3 and bytes 4-7 to 32-bit lanes of ymm8
3770   196,66,121,49,200,                      //vpmovzxbd     %xmm8,%xmm9
3771   196,67,121,4,192,229,                   //vpermilps     $0xe5,%xmm8,%xmm8
3772   196,66,121,49,192,                      //vpmovzxbd     %xmm8,%xmm8
3773   196,67,53,24,192,1,                     //vinsertf128   $0x1,%xmm8,%ymm9,%ymm8
3774   196,65,124,91,192,                      //vcvtdq2ps     %ymm8,%ymm8
       // ymm9 = 1/255 in all lanes (0x3b808081); ymm8 = byte * (1/255)
3775   184,129,128,128,59,                     //mov           $0x3b808081,%eax
3776   197,121,110,200,                        //vmovd         %eax,%xmm9
3777   196,67,121,4,201,0,                     //vpermilps     $0x0,%xmm9,%xmm9
3778   196,67,53,24,201,1,                     //vinsertf128   $0x1,%xmm9,%ymm9,%ymm9
3779   196,65,60,89,193,                       //vmulps        %ymm9,%ymm8,%ymm8
       // ymm0..ymm3 *= ymm8
3780   197,188,89,192,                         //vmulps        %ymm0,%ymm8,%ymm0
3781   197,188,89,201,                         //vmulps        %ymm1,%ymm8,%ymm1
3782   197,188,89,210,                         //vmulps        %ymm2,%ymm8,%ymm2
3783   197,188,89,219,                         //vmulps        %ymm3,%ymm8,%ymm3
       // Fetch the next code pointer, restore rcx, and jump.
3784   72,173,                                 //lods          %ds:(%rsi),%rax
3785   76,137,193,                             //mov           %r8,%rcx
3786   255,224,                                //jmpq          *%rax
       // +0x60, tail path: pack r8 bytes into r9, byte i landing at bit 8*i.
       // rcx = running shift amount, r10 = bytes left, r9 = accumulator.
3787   49,201,                                 //xor           %ecx,%ecx
3788   77,137,194,                             //mov           %r8,%r10
3789   69,49,201,                              //xor           %r9d,%r9d
       // +0x68, loop body: r9 |= (uint64_t)*rax++ << cl; cl += 8
3790   68,15,182,24,                           //movzbl        (%rax),%r11d
3791   72,255,192,                             //inc           %rax
3792   73,211,227,                             //shl           %cl,%r11
3793   77,9,217,                               //or            %r11,%r9
3794   72,131,193,8,                           //add           $0x8,%rcx
3795   73,255,202,                             //dec           %r10
3796   117,234,                                //jne           5aa <_sk_scale_u8_avx+0x68>
       // xmm8 = packed bytes; rejoin the full path right after its 8-byte load.
3797   196,65,249,110,193,                     //vmovq         %r9,%xmm8
3798   235,143,                                //jmp           556 <_sk_scale_u8_avx+0x14>
3799 };
3800 
// sk_lerp_1_float_avx: raw x86-64 AVX machine code; the trailing comments are the AT&T disassembly.
// Loads a pointer from (%rsi), broadcasts the float t it points to into ymm8, then for each
// pair (ymm0,ymm4), (ymm1,ymm5), (ymm2,ymm6), (ymm3,ymm7) computes
//     ymmN = (ymmN - ymmN+4) * t + ymmN+4
// i.e. a linear interpolation from ymm4..ymm7 towards ymm0..ymm3 (presumably dst and src
// colour registers -- confirm against the stage source). Ends by jumping to the next pointer
// loaded from (%rsi).
3801 CODE const uint8_t sk_lerp_1_float_avx[] = {
       // ymm8 = t in all 8 lanes
3802   72,173,                                 //lods          %ds:(%rsi),%rax
3803   196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
       // ymm0 = (ymm0 - ymm4) * t + ymm4
3804   197,252,92,196,                         //vsubps        %ymm4,%ymm0,%ymm0
3805   196,193,124,89,192,                     //vmulps        %ymm8,%ymm0,%ymm0
3806   197,252,88,196,                         //vaddps        %ymm4,%ymm0,%ymm0
       // ymm1 = (ymm1 - ymm5) * t + ymm5
3807   197,244,92,205,                         //vsubps        %ymm5,%ymm1,%ymm1
3808   196,193,116,89,200,                     //vmulps        %ymm8,%ymm1,%ymm1
3809   197,244,88,205,                         //vaddps        %ymm5,%ymm1,%ymm1
       // ymm2 = (ymm2 - ymm6) * t + ymm6
3810   197,236,92,214,                         //vsubps        %ymm6,%ymm2,%ymm2
3811   196,193,108,89,208,                     //vmulps        %ymm8,%ymm2,%ymm2
3812   197,236,88,214,                         //vaddps        %ymm6,%ymm2,%ymm2
       // ymm3 = (ymm3 - ymm7) * t + ymm7
3813   197,228,92,223,                         //vsubps        %ymm7,%ymm3,%ymm3
3814   196,193,100,89,216,                     //vmulps        %ymm8,%ymm3,%ymm3
3815   197,228,88,223,                         //vaddps        %ymm7,%ymm3,%ymm3
       // Fetch the next code pointer from (%rsi) and jump to it.
3816   72,173,                                 //lods          %ds:(%rsi),%rax
3817   255,224,                                //jmpq          *%rax
3818 };
3819 
// sk_lerp_u8_avx: raw x86-64 AVX machine code; the trailing comments are the AT&T disassembly.
// Reads 8 bytes from (*ctx + %rdi), where ctx is the pointer loaded from (%rsi), converts each
// byte to a float t = byte * (1/255), then for each pair (ymm0,ymm4) .. (ymm3,ymm7) computes
//     ymmN = (ymmN - ymmN+4) * t + ymmN+4
// When %rcx is non-zero (presumably a "tail" count of fewer than 8 pixels -- confirm against
// the caller) only %rcx bytes are read, one at a time. %rcx is preserved via %r8.
3820 CODE const uint8_t sk_lerp_u8_avx[] = {
       // r8 = rcx (saved, because the tail loop uses %cl as a shift count)
3821   73,137,200,                             //mov           %rcx,%r8
       // rax = *ctx + rdi: address of the first byte to read
3822   72,173,                                 //lods          %ds:(%rsi),%rax
3823   72,139,0,                               //mov           (%rax),%rax
3824   72,1,248,                               //add           %rdi,%rax
       // Non-zero count: take the byte-at-a-time path at +0x84.
3825   77,133,192,                             //test          %r8,%r8
3826   117,116,                                //jne           68a <_sk_lerp_u8_avx+0x84>
       // Full path: load 8 bytes at once into xmm8.
3827   197,122,126,0,                          //vmovq         (%rax),%xmm8
       // +0x14 (tail path rejoins here): zero-extend bytes 0-3 and bytes 4-7 to 32-bit lanes of ymm8
3828   196,66,121,49,200,                      //vpmovzxbd     %xmm8,%xmm9
3829   196,67,121,4,192,229,                   //vpermilps     $0xe5,%xmm8,%xmm8
3830   196,66,121,49,192,                      //vpmovzxbd     %xmm8,%xmm8
3831   196,67,53,24,192,1,                     //vinsertf128   $0x1,%xmm8,%ymm9,%ymm8
3832   196,65,124,91,192,                      //vcvtdq2ps     %ymm8,%ymm8
       // ymm9 = 1/255 in all lanes (0x3b808081); ymm8 = t = byte * (1/255)
3833   184,129,128,128,59,                     //mov           $0x3b808081,%eax
3834   197,121,110,200,                        //vmovd         %eax,%xmm9
3835   196,67,121,4,201,0,                     //vpermilps     $0x0,%xmm9,%xmm9
3836   196,67,53,24,201,1,                     //vinsertf128   $0x1,%xmm9,%ymm9,%ymm9
3837   196,65,60,89,193,                       //vmulps        %ymm9,%ymm8,%ymm8
       // ymm0 = (ymm0 - ymm4) * t + ymm4
3838   197,252,92,196,                         //vsubps        %ymm4,%ymm0,%ymm0
3839   196,193,124,89,192,                     //vmulps        %ymm8,%ymm0,%ymm0
3840   197,252,88,196,                         //vaddps        %ymm4,%ymm0,%ymm0
       // ymm1 = (ymm1 - ymm5) * t + ymm5
3841   197,244,92,205,                         //vsubps        %ymm5,%ymm1,%ymm1
3842   196,193,116,89,200,                     //vmulps        %ymm8,%ymm1,%ymm1
3843   197,244,88,205,                         //vaddps        %ymm5,%ymm1,%ymm1
       // ymm2 = (ymm2 - ymm6) * t + ymm6
3844   197,236,92,214,                         //vsubps        %ymm6,%ymm2,%ymm2
3845   196,193,108,89,208,                     //vmulps        %ymm8,%ymm2,%ymm2
3846   197,236,88,214,                         //vaddps        %ymm6,%ymm2,%ymm2
       // ymm3 = (ymm3 - ymm7) * t + ymm7
3847   197,228,92,223,                         //vsubps        %ymm7,%ymm3,%ymm3
3848   196,193,100,89,216,                     //vmulps        %ymm8,%ymm3,%ymm3
3849   197,228,88,223,                         //vaddps        %ymm7,%ymm3,%ymm3
       // Fetch the next code pointer, restore rcx, and jump.
3850   72,173,                                 //lods          %ds:(%rsi),%rax
3851   76,137,193,                             //mov           %r8,%rcx
3852   255,224,                                //jmpq          *%rax
       // +0x84, tail path: pack r8 bytes into r9, byte i landing at bit 8*i.
       // rcx = running shift amount, r10 = bytes left, r9 = accumulator.
3853   49,201,                                 //xor           %ecx,%ecx
3854   77,137,194,                             //mov           %r8,%r10
3855   69,49,201,                              //xor           %r9d,%r9d
       // +0x8c, loop body: r9 |= (uint64_t)*rax++ << cl; cl += 8
3856   68,15,182,24,                           //movzbl        (%rax),%r11d
3857   72,255,192,                             //inc           %rax
3858   73,211,227,                             //shl           %cl,%r11
3859   77,9,217,                               //or            %r11,%r9
3860   72,131,193,8,                           //add           $0x8,%rcx
3861   73,255,202,                             //dec           %r10
3862   117,234,                                //jne           692 <_sk_lerp_u8_avx+0x8c>
       // xmm8 = packed bytes; rejoin the full path right after its 8-byte load.
3863   196,65,249,110,193,                     //vmovq         %r9,%xmm8
3864   233,104,255,255,255,                    //jmpq          61a <_sk_lerp_u8_avx+0x14>
3865 };
3866 
// sk_lerp_565_avx: raw x86-64 AVX machine code; the trailing comments are the AT&T disassembly.
// Reads eight 16-bit values from (*ctx + 2*%rdi), where ctx is the pointer loaded from (%rsi),
// and splits each into three fields, each normalised to [0,1]:
//     ymm9  = (v & 0xf800) * (1/63488)     (top 5 bits)
//     ymm10 = (v & 0x07e0) * (1/2016)      (middle 6 bits)
//     ymm3  = (v & 0x001f) * (1/31)        (low 5 bits)
// These are used as per-register interpolation factors:
//     ymm0 = (ymm0 - ymm4) * ymm9  + ymm4
//     ymm1 = (ymm1 - ymm5) * ymm10 + ymm5
//     ymm2 = (ymm2 - ymm6) * ymm3  + ymm6
// Finally ymm3 is overwritten with 1.0f in every lane.
// When %rcx is non-zero the values are loaded one word at a time through a jump table indexed
// by (%rcx & 7) - 1 (presumably a "tail" of 1..7 pixels -- confirm against the caller).
3867 CODE const uint8_t sk_lerp_565_avx[] = {
       // r10 = *ctx: base pointer of the 16-bit data
3868   72,173,                                 //lods          %ds:(%rsi),%rax
3869   76,139,16,                              //mov           (%rax),%r10
       // Non-zero count: take the partial-load path at +0x108.
3870   72,133,201,                             //test          %rcx,%rcx
3871   15,133,250,0,0,0,                       //jne           7ba <_sk_lerp_565_avx+0x108>
       // Full path: load eight 16-bit values into xmm8.
3872   196,65,122,111,4,122,                   //vmovdqu       (%r10,%rdi,2),%xmm8
       // +0x14 (partial-load path rejoins here): zero-extend the 8 words to 32-bit lanes of ymm8
3873   197,225,239,219,                        //vpxor         %xmm3,%xmm3,%xmm3
3874   197,185,105,219,                        //vpunpckhwd    %xmm3,%xmm8,%xmm3
3875   196,66,121,51,192,                      //vpmovzxwd     %xmm8,%xmm8
3876   196,99,61,24,195,1,                     //vinsertf128   $0x1,%xmm3,%ymm8,%ymm8
       // ymm9 = float(v & 0xf800) * (1/63488)   (0x37842108)
3877   184,0,248,0,0,                          //mov           $0xf800,%eax
3878   197,249,110,216,                        //vmovd         %eax,%xmm3
3879   197,249,112,219,0,                      //vpshufd       $0x0,%xmm3,%xmm3
3880   196,227,101,24,219,1,                   //vinsertf128   $0x1,%xmm3,%ymm3,%ymm3
3881   196,193,100,84,216,                     //vandps        %ymm8,%ymm3,%ymm3
3882   197,124,91,203,                         //vcvtdq2ps     %ymm3,%ymm9
3883   184,8,33,132,55,                        //mov           $0x37842108,%eax
3884   197,249,110,216,                        //vmovd         %eax,%xmm3
3885   196,227,121,4,219,0,                    //vpermilps     $0x0,%xmm3,%xmm3
3886   196,227,101,24,219,1,                   //vinsertf128   $0x1,%xmm3,%ymm3,%ymm3
3887   197,52,89,203,                          //vmulps        %ymm3,%ymm9,%ymm9
       // ymm10 = float(v & 0x7e0) * (1/2016)   (0x3a020821)
3888   184,224,7,0,0,                          //mov           $0x7e0,%eax
3889   197,249,110,216,                        //vmovd         %eax,%xmm3
3890   197,249,112,219,0,                      //vpshufd       $0x0,%xmm3,%xmm3
3891   196,227,101,24,219,1,                   //vinsertf128   $0x1,%xmm3,%ymm3,%ymm3
3892   196,193,100,84,216,                     //vandps        %ymm8,%ymm3,%ymm3
3893   197,124,91,211,                         //vcvtdq2ps     %ymm3,%ymm10
3894   184,33,8,2,58,                          //mov           $0x3a020821,%eax
3895   197,249,110,216,                        //vmovd         %eax,%xmm3
3896   196,227,121,4,219,0,                    //vpermilps     $0x0,%xmm3,%xmm3
3897   196,227,101,24,219,1,                   //vinsertf128   $0x1,%xmm3,%ymm3,%ymm3
3898   197,44,89,211,                          //vmulps        %ymm3,%ymm10,%ymm10
       // ymm3 = float(v & 0x1f) * (1/31)   (0x3d042108)
3899   184,31,0,0,0,                           //mov           $0x1f,%eax
3900   197,249,110,216,                        //vmovd         %eax,%xmm3
3901   197,249,112,219,0,                      //vpshufd       $0x0,%xmm3,%xmm3
3902   196,227,101,24,219,1,                   //vinsertf128   $0x1,%xmm3,%ymm3,%ymm3
3903   196,193,100,84,216,                     //vandps        %ymm8,%ymm3,%ymm3
3904   197,124,91,195,                         //vcvtdq2ps     %ymm3,%ymm8
3905   184,8,33,4,61,                          //mov           $0x3d042108,%eax
3906   197,249,110,216,                        //vmovd         %eax,%xmm3
3907   196,227,121,4,219,0,                    //vpermilps     $0x0,%xmm3,%xmm3
3908   196,227,101,24,219,1,                   //vinsertf128   $0x1,%xmm3,%ymm3,%ymm3
3909   197,188,89,219,                         //vmulps        %ymm3,%ymm8,%ymm3
       // ymm0 = (ymm0 - ymm4) * ymm9 + ymm4
3910   197,252,92,196,                         //vsubps        %ymm4,%ymm0,%ymm0
3911   196,193,124,89,193,                     //vmulps        %ymm9,%ymm0,%ymm0
3912   197,252,88,196,                         //vaddps        %ymm4,%ymm0,%ymm0
       // ymm1 = (ymm1 - ymm5) * ymm10 + ymm5
3913   197,244,92,205,                         //vsubps        %ymm5,%ymm1,%ymm1
3914   196,193,116,89,202,                     //vmulps        %ymm10,%ymm1,%ymm1
3915   197,244,88,205,                         //vaddps        %ymm5,%ymm1,%ymm1
       // ymm2 = (ymm2 - ymm6) * ymm3 + ymm6
3916   197,236,92,214,                         //vsubps        %ymm6,%ymm2,%ymm2
3917   197,236,89,211,                         //vmulps        %ymm3,%ymm2,%ymm2
3918   197,236,88,214,                         //vaddps        %ymm6,%ymm2,%ymm2
       // ymm3 = 1.0f in all lanes (0x3f800000)
3919   184,0,0,128,63,                         //mov           $0x3f800000,%eax
3920   197,249,110,216,                        //vmovd         %eax,%xmm3
3921   196,227,121,4,219,0,                    //vpermilps     $0x0,%xmm3,%xmm3
3922   196,227,101,24,219,1,                   //vinsertf128   $0x1,%xmm3,%ymm3,%ymm3
       // Fetch the next code pointer from (%rsi) and jump to it.
3923   72,173,                                 //lods          %ds:(%rsi),%rax
3924   255,224,                                //jmpq          *%rax
       // +0x108, partial-load path: r8b = (rcx & 7) - 1, xmm8 = 0.
       // If r8b > 6 (unsigned; happens when rcx & 7 == 0) continue at +0x14 with all-zero input.
3925   65,137,200,                             //mov           %ecx,%r8d
3926   65,128,224,7,                           //and           $0x7,%r8b
3927   196,65,57,239,192,                      //vpxor         %xmm8,%xmm8,%xmm8
3928   65,254,200,                             //dec           %r8b
3929   65,128,248,6,                           //cmp           $0x6,%r8b
3930   15,135,243,254,255,255,                 //ja            6c6 <_sk_lerp_565_avx+0x14>
       // Dispatch through the table at +0x176: rax = table + (int32_t)table[r8]; jump to rax.
3931   69,15,182,192,                          //movzbl        %r8b,%r8d
3932   76,141,13,74,0,0,0,                     //lea           0x4a(%rip),%r9        # 828 <_sk_lerp_565_avx+0x176>
3933   75,99,4,129,                            //movslq        (%r9,%r8,4),%rax
3934   76,1,200,                               //add           %r9,%rax
3935   255,224,                                //jmpq          *%rax
       // Fall-through chain of word inserts: entry for 7 values starts here, each smaller count
       // enters one vpinsrw later, so exactly the first (rcx & 7) words are loaded into xmm8.
3936   197,225,239,219,                        //vpxor         %xmm3,%xmm3,%xmm3
3937   196,65,97,196,68,122,12,6,              //vpinsrw       $0x6,0xc(%r10,%rdi,2),%xmm3,%xmm8
3938   196,65,57,196,68,122,10,5,              //vpinsrw       $0x5,0xa(%r10,%rdi,2),%xmm8,%xmm8
3939   196,65,57,196,68,122,8,4,               //vpinsrw       $0x4,0x8(%r10,%rdi,2),%xmm8,%xmm8
3940   196,65,57,196,68,122,6,3,               //vpinsrw       $0x3,0x6(%r10,%rdi,2),%xmm8,%xmm8
3941   196,65,57,196,68,122,4,2,               //vpinsrw       $0x2,0x4(%r10,%rdi,2),%xmm8,%xmm8
3942   196,65,57,196,68,122,2,1,               //vpinsrw       $0x1,0x2(%r10,%rdi,2),%xmm8,%xmm8
3943   196,65,57,196,4,122,0,                  //vpinsrw       $0x0,(%r10,%rdi,2),%xmm8,%xmm8
       // Rejoin the full path right after its 16-byte load.
3944   233,159,254,255,255,                    //jmpq          6c6 <_sk_lerp_565_avx+0x14>
       // One byte of padding, then the jump table at +0x176. Everything below is DATA, not code:
       // seven little-endian int32 offsets relative to the table start
       // (-13, -21, -29, -37, -45, -53, -65), one per value of (rcx & 7) - 1.
       // The disassembler decoded these bytes as instructions; the mnemonics are meaningless.
3945   144,                                    //nop
3946   243,255,                                //repz          (bad)
3947   255,                                    //(bad)
3948   255,                                    //(bad)
3949   235,255,                                //jmp           82d <_sk_lerp_565_avx+0x17b>
3950   255,                                    //(bad)
3951   255,227,                                //jmpq          *%rbx
3952   255,                                    //(bad)
3953   255,                                    //(bad)
3954   255,                                    //(bad)
3955   219,255,                                //(bad)
3956   255,                                    //(bad)
3957   255,211,                                //callq         *%rbx
3958   255,                                    //(bad)
3959   255,                                    //(bad)
3960   255,203,                                //dec           %ebx
3961   255,                                    //(bad)
3962   255,                                    //(bad)
3963   255,                                    //(bad)
3964   191,                                    //.byte         0xbf
3965   255,                                    //(bad)
3966   255,                                    //(bad)
3967   255,                                    //.byte         0xff
3968 };
3969 
// sk_load_tables_avx: raw x86-64 AVX machine code; the trailing comments are the AT&T disassembly.
// ctx (the pointer loaded from (%rsi)) is read as four consecutive pointers:
//     0x00(ctx): base of the 32-bit input values; eight are loaded from base + 4*%rdi
//     0x08(ctx): float table indexed by bits  0..7  of each value -> ymm0
//     0x10(ctx): float table indexed by bits  8..15 of each value -> ymm1
//     0x18(ctx): float table indexed by bits 16..23 of each value -> ymm2
// Bits 24..31 are converted to float and multiplied by 1/255 -> ymm3.
// (Presumably 8888 pixels with per-channel lookup tables -- confirm against the stage source.)
// AVX has no gather here, so each index is moved to a general-purpose register and the eight
// table floats are loaded individually with vmovss/vinsertps.
// rbp, rbx and r12..r15 are pushed on entry and popped before the final jump.
// When %rcx is non-zero only (%rcx & 7) values are loaded, via the jump table at the end.
3970 CODE const uint8_t sk_load_tables_avx[] = {
       // Save the registers used as index scratch below.
3971   85,                                     //push          %rbp
3972   65,87,                                  //push          %r15
3973   65,86,                                  //push          %r14
3974   65,85,                                  //push          %r13
3975   65,84,                                  //push          %r12
3976   83,                                     //push          %rbx
       // rax = ctx; r8 = ctx[0], base pointer of the 32-bit input values
3977   72,173,                                 //lods          %ds:(%rsi),%rax
3978   76,139,0,                               //mov           (%rax),%r8
       // Non-zero count: take the partial-load path at +0x250.
3979   72,133,201,                             //test          %rcx,%rcx
3980   15,133,56,2,0,0,                        //jne           a94 <_sk_load_tables_avx+0x250>
       // Full path: ymm8 = eight 32-bit values.
3981   196,65,124,16,4,184,                    //vmovups       (%r8,%rdi,4),%ymm8
       // +0x1e (partial-load path rejoins here): ymm9 = 0xff in every 32-bit lane
3982   187,255,0,0,0,                          //mov           $0xff,%ebx
3983   197,249,110,195,                        //vmovd         %ebx,%xmm0
3984   197,249,112,192,0,                      //vpshufd       $0x0,%xmm0,%xmm0
3985   196,99,125,24,200,1,                    //vinsertf128   $0x1,%xmm0,%ymm0,%ymm9
       // ymm0 = value & 0xff; split the 8 indices into r11,r9,r14,r10 (low half) and r15,r12,r13,rbx (high half)
3986   196,193,52,84,192,                      //vandps        %ymm8,%ymm9,%ymm0
3987   196,193,249,126,193,                    //vmovq         %xmm0,%r9
3988   69,137,203,                             //mov           %r9d,%r11d
3989   196,195,249,22,194,1,                   //vpextrq       $0x1,%xmm0,%r10
3990   69,137,214,                             //mov           %r10d,%r14d
3991   73,193,234,32,                          //shr           $0x20,%r10
3992   73,193,233,32,                          //shr           $0x20,%r9
3993   196,227,125,25,192,1,                   //vextractf128  $0x1,%ymm0,%xmm0
3994   196,193,249,126,196,                    //vmovq         %xmm0,%r12
3995   69,137,231,                             //mov           %r12d,%r15d
3996   196,227,249,22,195,1,                   //vpextrq       $0x1,%xmm0,%rbx
3997   65,137,221,                             //mov           %ebx,%r13d
3998   72,193,235,32,                          //shr           $0x20,%rbx
3999   73,193,236,32,                          //shr           $0x20,%r12
       // rbp = ctx[1] (table for bits 0..7); r8 = ctx[2] (table for bits 8..15)
4000   72,139,104,8,                           //mov           0x8(%rax),%rbp
4001   76,139,64,16,                           //mov           0x10(%rax),%r8
       // ymm0 = eight floats looked up in the rbp table
4002   196,161,122,16,68,189,0,                //vmovss        0x0(%rbp,%r15,4),%xmm0
4003   196,163,121,33,68,165,0,16,             //vinsertps     $0x10,0x0(%rbp,%r12,4),%xmm0,%xmm0
4004   196,161,122,16,76,173,0,                //vmovss        0x0(%rbp,%r13,4),%xmm1
4005   196,227,121,33,193,32,                  //vinsertps     $0x20,%xmm1,%xmm0,%xmm0
4006   197,250,16,76,157,0,                    //vmovss        0x0(%rbp,%rbx,4),%xmm1
4007   196,227,121,33,193,48,                  //vinsertps     $0x30,%xmm1,%xmm0,%xmm0
4008   196,161,122,16,76,157,0,                //vmovss        0x0(%rbp,%r11,4),%xmm1
4009   196,163,113,33,76,141,0,16,             //vinsertps     $0x10,0x0(%rbp,%r9,4),%xmm1,%xmm1
4010   196,161,122,16,92,181,0,                //vmovss        0x0(%rbp,%r14,4),%xmm3
4011   196,227,113,33,203,32,                  //vinsertps     $0x20,%xmm3,%xmm1,%xmm1
4012   196,161,122,16,92,149,0,                //vmovss        0x0(%rbp,%r10,4),%xmm3
4013   196,227,113,33,203,48,                  //vinsertps     $0x30,%xmm3,%xmm1,%xmm1
4014   196,227,117,24,192,1,                   //vinsertf128   $0x1,%xmm0,%ymm1,%ymm0
       // ymm1 = (value >> 8) & 0xff; xmm10 keeps the upper four input values for later shifts
4015   196,193,113,114,208,8,                  //vpsrld        $0x8,%xmm8,%xmm1
4016   196,67,125,25,194,1,                    //vextractf128  $0x1,%ymm8,%xmm10
4017   196,193,105,114,210,8,                  //vpsrld        $0x8,%xmm10,%xmm2
4018   196,227,117,24,202,1,                   //vinsertf128   $0x1,%xmm2,%ymm1,%ymm1
4019   197,180,84,201,                         //vandps        %ymm1,%ymm9,%ymm1
       // Split the 8 indices into general-purpose registers again.
4020   196,193,249,126,201,                    //vmovq         %xmm1,%r9
4021   69,137,203,                             //mov           %r9d,%r11d
4022   196,195,249,22,202,1,                   //vpextrq       $0x1,%xmm1,%r10
4023   69,137,214,                             //mov           %r10d,%r14d
4024   73,193,234,32,                          //shr           $0x20,%r10
4025   73,193,233,32,                          //shr           $0x20,%r9
4026   196,227,125,25,201,1,                   //vextractf128  $0x1,%ymm1,%xmm1
4027   196,225,249,126,205,                    //vmovq         %xmm1,%rbp
4028   65,137,239,                             //mov           %ebp,%r15d
4029   196,227,249,22,203,1,                   //vpextrq       $0x1,%xmm1,%rbx
4030   65,137,220,                             //mov           %ebx,%r12d
4031   72,193,235,32,                          //shr           $0x20,%rbx
4032   72,193,237,32,                          //shr           $0x20,%rbp
       // ymm1 = eight floats looked up in the r8 table (ctx[2])
4033   196,129,122,16,12,184,                  //vmovss        (%r8,%r15,4),%xmm1
4034   196,195,113,33,12,168,16,               //vinsertps     $0x10,(%r8,%rbp,4),%xmm1,%xmm1
4035   196,129,122,16,20,160,                  //vmovss        (%r8,%r12,4),%xmm2
4036   196,227,113,33,202,32,                  //vinsertps     $0x20,%xmm2,%xmm1,%xmm1
4037   196,193,122,16,20,152,                  //vmovss        (%r8,%rbx,4),%xmm2
4038   196,227,113,33,202,48,                  //vinsertps     $0x30,%xmm2,%xmm1,%xmm1
4039   196,129,122,16,20,152,                  //vmovss        (%r8,%r11,4),%xmm2
4040   196,131,105,33,20,136,16,               //vinsertps     $0x10,(%r8,%r9,4),%xmm2,%xmm2
4041   196,129,122,16,28,176,                  //vmovss        (%r8,%r14,4),%xmm3
4042   196,227,105,33,211,32,                  //vinsertps     $0x20,%xmm3,%xmm2,%xmm2
4043   196,129,122,16,28,144,                  //vmovss        (%r8,%r10,4),%xmm3
4044   196,227,105,33,211,48,                  //vinsertps     $0x30,%xmm3,%xmm2,%xmm2
4045   196,227,109,24,201,1,                   //vinsertf128   $0x1,%xmm1,%ymm2,%ymm1
       // rax = ctx[3] (table for bits 16..23); ymm2 = (value >> 16) & 0xff
4046   72,139,64,24,                           //mov           0x18(%rax),%rax
4047   196,193,105,114,208,16,                 //vpsrld        $0x10,%xmm8,%xmm2
4048   196,193,97,114,210,16,                  //vpsrld        $0x10,%xmm10,%xmm3
4049   196,227,109,24,211,1,                   //vinsertf128   $0x1,%xmm3,%ymm2,%ymm2
4050   197,180,84,210,                         //vandps        %ymm2,%ymm9,%ymm2
       // Split the 8 indices into general-purpose registers again.
4051   196,193,249,126,208,                    //vmovq         %xmm2,%r8
4052   69,137,194,                             //mov           %r8d,%r10d
4053   196,195,249,22,209,1,                   //vpextrq       $0x1,%xmm2,%r9
4054   69,137,203,                             //mov           %r9d,%r11d
4055   73,193,233,32,                          //shr           $0x20,%r9
4056   73,193,232,32,                          //shr           $0x20,%r8
4057   196,227,125,25,210,1,                   //vextractf128  $0x1,%ymm2,%xmm2
4058   196,225,249,126,213,                    //vmovq         %xmm2,%rbp
4059   65,137,238,                             //mov           %ebp,%r14d
4060   196,227,249,22,211,1,                   //vpextrq       $0x1,%xmm2,%rbx
4061   65,137,223,                             //mov           %ebx,%r15d
4062   72,193,235,32,                          //shr           $0x20,%rbx
4063   72,193,237,32,                          //shr           $0x20,%rbp
       // ymm2 = eight floats looked up in the rax table (ctx[3]); ymm9 is reused as scratch
4064   196,161,122,16,20,176,                  //vmovss        (%rax,%r14,4),%xmm2
4065   196,227,105,33,20,168,16,               //vinsertps     $0x10,(%rax,%rbp,4),%xmm2,%xmm2
4066   196,161,122,16,28,184,                  //vmovss        (%rax,%r15,4),%xmm3
4067   196,227,105,33,211,32,                  //vinsertps     $0x20,%xmm3,%xmm2,%xmm2
4068   197,250,16,28,152,                      //vmovss        (%rax,%rbx,4),%xmm3
4069   196,99,105,33,203,48,                   //vinsertps     $0x30,%xmm3,%xmm2,%xmm9
4070   196,161,122,16,28,144,                  //vmovss        (%rax,%r10,4),%xmm3
4071   196,163,97,33,28,128,16,                //vinsertps     $0x10,(%rax,%r8,4),%xmm3,%xmm3
4072   196,161,122,16,20,152,                  //vmovss        (%rax,%r11,4),%xmm2
4073   196,227,97,33,210,32,                   //vinsertps     $0x20,%xmm2,%xmm3,%xmm2
4074   196,161,122,16,28,136,                  //vmovss        (%rax,%r9,4),%xmm3
4075   196,227,105,33,211,48,                  //vinsertps     $0x30,%xmm3,%xmm2,%xmm2
4076   196,195,109,24,209,1,                   //vinsertf128   $0x1,%xmm9,%ymm2,%ymm2
       // ymm3 = float(value >> 24) * (1/255)   (0x3b808081); no table for the top byte
4077   196,193,57,114,208,24,                  //vpsrld        $0x18,%xmm8,%xmm8
4078   196,193,97,114,210,24,                  //vpsrld        $0x18,%xmm10,%xmm3
4079   196,227,61,24,219,1,                    //vinsertf128   $0x1,%xmm3,%ymm8,%ymm3
4080   197,124,91,195,                         //vcvtdq2ps     %ymm3,%ymm8
4081   184,129,128,128,59,                     //mov           $0x3b808081,%eax
4082   197,249,110,216,                        //vmovd         %eax,%xmm3
4083   196,227,121,4,219,0,                    //vpermilps     $0x0,%xmm3,%xmm3
4084   196,227,101,24,219,1,                   //vinsertf128   $0x1,%xmm3,%ymm3,%ymm3
4085   197,188,89,219,                         //vmulps        %ymm3,%ymm8,%ymm3
       // Fetch the next code pointer, restore the saved registers, and jump.
4086   72,173,                                 //lods          %ds:(%rsi),%rax
4087   91,                                     //pop           %rbx
4088   65,92,                                  //pop           %r12
4089   65,93,                                  //pop           %r13
4090   65,94,                                  //pop           %r14
4091   65,95,                                  //pop           %r15
4092   93,                                     //pop           %rbp
4093   255,224,                                //jmpq          *%rax
       // +0x250, partial-load path: bl = (rcx & 7) - 1, ymm8 = 0.
       // If bl > 6 (unsigned; happens when rcx & 7 == 0) continue at +0x1e with all-zero input.
4094   137,203,                                //mov           %ecx,%ebx
4095   128,227,7,                              //and           $0x7,%bl
4096   196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
4097   254,203,                                //dec           %bl
4098   128,251,6,                              //cmp           $0x6,%bl
4099   15,135,185,253,255,255,                 //ja            862 <_sk_load_tables_avx+0x1e>
       // Dispatch through the table at +0x2f8: rbx = table + (int32_t)table[rbx]; jump to rbx.
4100   15,182,219,                             //movzbl        %bl,%ebx
4101   76,141,13,137,0,0,0,                    //lea           0x89(%rip),%r9        # b3c <_sk_load_tables_avx+0x2f8>
4102   73,99,28,153,                           //movslq        (%r9,%rbx,4),%rbx
4103   76,1,203,                               //add           %r9,%rbx
4104   255,227,                                //jmpq          *%rbx
       // Fall-through chain of 32-bit loads: the entry for 7 values starts here (value 6, offset
       // 0x18), and each smaller count enters later, so exactly the first (rcx & 7) values are
       // loaded into ymm8 and the rest stay zero.
4105   196,193,121,110,68,184,24,              //vmovd         0x18(%r8,%rdi,4),%xmm0
4106   197,249,112,192,68,                     //vpshufd       $0x44,%xmm0,%xmm0
4107   196,227,125,24,192,1,                   //vinsertf128   $0x1,%xmm0,%ymm0,%ymm0
4108   197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
4109   196,99,117,12,192,64,                   //vblendps      $0x40,%ymm0,%ymm1,%ymm8
4110   196,99,125,25,192,1,                    //vextractf128  $0x1,%ymm8,%xmm0
4111   196,195,121,34,68,184,20,1,             //vpinsrd       $0x1,0x14(%r8,%rdi,4),%xmm0,%xmm0
4112   196,99,61,24,192,1,                     //vinsertf128   $0x1,%xmm0,%ymm8,%ymm8
4113   196,99,125,25,192,1,                    //vextractf128  $0x1,%ymm8,%xmm0
4114   196,195,121,34,68,184,16,0,             //vpinsrd       $0x0,0x10(%r8,%rdi,4),%xmm0,%xmm0
4115   196,99,61,24,192,1,                     //vinsertf128   $0x1,%xmm0,%ymm8,%ymm8
4116   196,195,57,34,68,184,12,3,              //vpinsrd       $0x3,0xc(%r8,%rdi,4),%xmm8,%xmm0
4117   196,99,61,12,192,15,                    //vblendps      $0xf,%ymm0,%ymm8,%ymm8
4118   196,195,57,34,68,184,8,2,               //vpinsrd       $0x2,0x8(%r8,%rdi,4),%xmm8,%xmm0
4119   196,99,61,12,192,15,                    //vblendps      $0xf,%ymm0,%ymm8,%ymm8
4120   196,195,57,34,68,184,4,1,               //vpinsrd       $0x1,0x4(%r8,%rdi,4),%xmm8,%xmm0
4121   196,99,61,12,192,15,                    //vblendps      $0xf,%ymm0,%ymm8,%ymm8
4122   196,195,57,34,4,184,0,                  //vpinsrd       $0x0,(%r8,%rdi,4),%xmm8,%xmm0
4123   196,99,61,12,192,15,                    //vblendps      $0xf,%ymm0,%ymm8,%ymm8
       // Rejoin the full path right after its 32-byte load.
4124   233,38,253,255,255,                     //jmpq          862 <_sk_load_tables_avx+0x1e>
       // Jump table at +0x2f8. Everything below is DATA, not code: seven little-endian int32
       // offsets relative to the table start (-18, -32, -46, -60, -80, -100, -128), one per
       // value of (rcx & 7) - 1. The disassembler decoded these bytes as instructions; the
       // mnemonics are meaningless.
4125   238,                                    //out           %al,(%dx)
4126   255,                                    //(bad)
4127   255,                                    //(bad)
4128   255,224,                                //jmpq          *%rax
4129   255,                                    //(bad)
4130   255,                                    //(bad)
4131   255,210,                                //callq         *%rdx
4132   255,                                    //(bad)
4133   255,                                    //(bad)
4134   255,196,                                //inc           %esp
4135   255,                                    //(bad)
4136   255,                                    //(bad)
4137   255,176,255,255,255,156,                //pushq         -0x63000001(%rax)
4138   255,                                    //(bad)
4139   255,                                    //(bad)
4140   255,                                    //.byte         0xff
4141   128,255,255,                            //cmp           $0xff,%bh
4142   255,                                    //.byte         0xff
4143 };
4144 
// sk_load_a8_avx: raw x86-64 AVX machine code (AT&T disassembly in the trailing comments).
// Presumably the "load 8-bit alpha" pipeline stage -- inferred from the name; confirm with the generator.
//
// Register usage, as read from the instructions below:
//   rsi  cursor over a list of 8-byte words; each lods reads one word into rax and advances rsi
//        (assuming the direction flag is clear). The first word points at a location holding the
//        source base pointer; the second word is the address jumped to on exit.
//   rdi  byte offset added to the source base pointer.
//   rcx  0 selects the full 8-byte load; non-zero is the number of bytes read by the tail loop.
//        rcx is parked in r8 on entry and restored just before the final jump.
// Result: ymm3 = the loaded bytes, zero-extended and converted to float, times 0x3b808081
//         (the float bit pattern for ~1/255); ymm0, ymm1 and ymm2 are zeroed.
// Scratch: rax, r8; the tail path additionally uses r9, r10, r11 and rcx (as the shift count).
4145 CODE const uint8_t sk_load_a8_avx[] = {
4146   73,137,200,                             //mov           %rcx,%r8
4147   72,173,                                 //lods          %ds:(%rsi),%rax
4148   72,139,0,                               //mov           (%rax),%rax
4149   72,1,248,                               //add           %rdi,%rax
4150   77,133,192,                             //test          %r8,%r8
4151   117,74,                                 //jne           bb2 <_sk_load_a8_avx+0x5a>
  // Full path: load 8 bytes, widen bytes 0-3 (xmm1) and bytes 4-7 (xmm0 after the shuffle) to
  // 32-bit lanes, merge into one ymm register and convert to float.
4152   197,250,126,0,                          //vmovq         (%rax),%xmm0
4153   196,226,121,49,200,                     //vpmovzxbd     %xmm0,%xmm1
4154   196,227,121,4,192,229,                  //vpermilps     $0xe5,%xmm0,%xmm0
4155   196,226,121,49,192,                     //vpmovzxbd     %xmm0,%xmm0
4156   196,227,117,24,192,1,                   //vinsertf128   $0x1,%xmm0,%ymm1,%ymm0
4157   197,252,91,192,                         //vcvtdq2ps     %ymm0,%ymm0
  // Broadcast the scale constant to all 8 lanes and multiply into ymm3.
4158   184,129,128,128,59,                     //mov           $0x3b808081,%eax
4159   197,249,110,200,                        //vmovd         %eax,%xmm1
4160   196,227,121,4,201,0,                    //vpermilps     $0x0,%xmm1,%xmm1
4161   196,227,117,24,201,1,                   //vinsertf128   $0x1,%xmm1,%ymm1,%ymm1
4162   197,252,89,217,                         //vmulps        %ymm1,%ymm0,%ymm3
  // Fetch the next word from the rsi list, zero ymm0-ymm2, restore rcx, and jump to that word.
4163   72,173,                                 //lods          %ds:(%rsi),%rax
4164   197,252,87,192,                         //vxorps        %ymm0,%ymm0,%ymm0
4165   197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
4166   197,236,87,210,                         //vxorps        %ymm2,%ymm2,%ymm2
4167   76,137,193,                             //mov           %r8,%rcx
4168   255,224,                                //jmpq          *%rax
  // Tail path (rcx != 0): read r8 bytes one at a time, OR-ing byte i into r9 shifted left by 8*i,
  // then move r9 into xmm0 and rejoin the full path at the first vpmovzxbd (+0x14).
4169   49,201,                                 //xor           %ecx,%ecx
4170   77,137,194,                             //mov           %r8,%r10
4171   69,49,201,                              //xor           %r9d,%r9d
4172   68,15,182,24,                           //movzbl        (%rax),%r11d
4173   72,255,192,                             //inc           %rax
4174   73,211,227,                             //shl           %cl,%r11
4175   77,9,217,                               //or            %r11,%r9
4176   72,131,193,8,                           //add           $0x8,%rcx
4177   73,255,202,                             //dec           %r10
4178   117,234,                                //jne           bba <_sk_load_a8_avx+0x62>
4179   196,193,249,110,193,                    //vmovq         %r9,%xmm0
4180   235,149,                                //jmp           b6c <_sk_load_a8_avx+0x14>
4181 };
4182 
// sk_store_a8_avx: raw x86-64 AVX machine code (AT&T disassembly in the trailing comments).
// Presumably the "store 8-bit alpha" pipeline stage -- inferred from the name; confirm with the generator.
//
// What the instructions do:
//   - lods reads an 8-byte word from (%rsi) into rax (advancing rsi); that word is dereferenced to
//     get the destination base pointer (r9).
//   - ymm3 is multiplied by 0x437f0000 (255.0f broadcast to all lanes), converted to 32-bit
//     integers, then packed with unsigned saturation to 16-bit and to 8-bit values in xmm8.
//   - rcx == 0: the low 8 bytes of xmm8 are written to (r9 + rdi) with a single vmovsd.
//   - rcx != 0: (cl & 7) - 1 selects an entry in the jump table at the end of the array; the
//     entry lands inside a fall-through chain of vpextrb stores so that only the low (cl & 7)
//     bytes are written. If (cl & 7) == 0 the "ja" skips the stores entirely.
//   - Exit: lods fetches the next word from (%rsi) and jumps to it.
// Scratch: rax, r8, r9, xmm8/ymm8, xmm9. ymm3 is read but not modified.
4183 CODE const uint8_t sk_store_a8_avx[] = {
4184   72,173,                                 //lods          %ds:(%rsi),%rax
4185   76,139,8,                               //mov           (%rax),%r9
4186   184,0,0,127,67,                         //mov           $0x437f0000,%eax
4187   197,121,110,192,                        //vmovd         %eax,%xmm8
4188   196,67,121,4,192,0,                     //vpermilps     $0x0,%xmm8,%xmm8
4189   196,67,61,24,192,1,                     //vinsertf128   $0x1,%xmm8,%ymm8,%ymm8
4190   197,60,89,195,                          //vmulps        %ymm3,%ymm8,%ymm8
4191   196,65,125,91,192,                      //vcvtps2dq     %ymm8,%ymm8
4192   196,67,125,25,193,1,                    //vextractf128  $0x1,%ymm8,%xmm9
4193   196,66,57,43,193,                       //vpackusdw     %xmm9,%xmm8,%xmm8
4194   196,65,57,103,192,                      //vpackuswb     %xmm8,%xmm8,%xmm8
4195   72,133,201,                             //test          %rcx,%rcx
4196   117,10,                                 //jne           c19 <_sk_store_a8_avx+0x42>
4197   196,65,123,17,4,57,                     //vmovsd        %xmm8,(%r9,%rdi,1)
4198   72,173,                                 //lods          %ds:(%rsi),%rax
4199   255,224,                                //jmpq          *%rax
  // Tail path (rcx != 0): compute the table index, widen the packed bytes to words (so byte i
  // sits at byte position 2*i, matching the even vpextrb indices below), and dispatch.
4200   65,137,200,                             //mov           %ecx,%r8d
4201   65,128,224,7,                           //and           $0x7,%r8b
4202   65,254,200,                             //dec           %r8b
4203   65,128,248,6,                           //cmp           $0x6,%r8b
4204   119,236,                                //ja            c15 <_sk_store_a8_avx+0x3e>
4205   196,66,121,48,192,                      //vpmovzxbw     %xmm8,%xmm8
4206   65,15,182,192,                          //movzbl        %r8b,%eax
4207   76,141,5,67,0,0,0,                      //lea           0x43(%rip),%r8        # c7c <_sk_store_a8_avx+0xa5>
4208   73,99,4,128,                            //movslq        (%r8,%rax,4),%rax
4209   76,1,192,                               //add           %r8,%rax
4210   255,224,                                //jmpq          *%rax
  // Fall-through store chain: entering later in the chain stores fewer bytes.
4211   196,67,121,20,68,57,6,12,               //vpextrb       $0xc,%xmm8,0x6(%r9,%rdi,1)
4212   196,67,121,20,68,57,5,10,               //vpextrb       $0xa,%xmm8,0x5(%r9,%rdi,1)
4213   196,67,121,20,68,57,4,8,                //vpextrb       $0x8,%xmm8,0x4(%r9,%rdi,1)
4214   196,67,121,20,68,57,3,6,                //vpextrb       $0x6,%xmm8,0x3(%r9,%rdi,1)
4215   196,67,121,20,68,57,2,4,                //vpextrb       $0x4,%xmm8,0x2(%r9,%rdi,1)
4216   196,67,121,20,68,57,1,2,                //vpextrb       $0x2,%xmm8,0x1(%r9,%rdi,1)
4217   196,67,121,20,4,57,0,                   //vpextrb       $0x0,%xmm8,(%r9,%rdi,1)
4218   235,154,                                //jmp           c15 <_sk_store_a8_avx+0x3e>
4219   144,                                    //nop
  // Jump table, NOT instructions: seven little-endian signed 32-bit offsets, each relative to the
  // table's own address (the lea target above). The disassembly text on these lines is an
  // artifact of decoding data as code.
4220   246,255,                                //idiv          %bh
4221   255,                                    //(bad)
4222   255,                                    //(bad)
4223   238,                                    //out           %al,(%dx)
4224   255,                                    //(bad)
4225   255,                                    //(bad)
4226   255,230,                                //jmpq          *%rsi
4227   255,                                    //(bad)
4228   255,                                    //(bad)
4229   255,                                    //(bad)
4230   222,255,                                //fdivrp        %st,%st(7)
4231   255,                                    //(bad)
4232   255,214,                                //callq         *%rsi
4233   255,                                    //(bad)
4234   255,                                    //(bad)
4235   255,206,                                //dec           %esi
4236   255,                                    //(bad)
4237   255,                                    //(bad)
4238   255,198,                                //inc           %esi
4239   255,                                    //(bad)
4240   255,                                    //(bad)
4241   255,                                    //.byte         0xff
4242 };
4243 
// sk_load_565_avx: raw x86-64 AVX machine code (AT&T disassembly in the trailing comments).
// Presumably the "load RGB565 pixels" pipeline stage -- inferred from the name and the bit masks;
// confirm with the generator.
//
// What the instructions do:
//   - lods reads an 8-byte word from (%rsi) into rax (advancing rsi); that word is dereferenced to
//     get the source base pointer (r10). Elements are 16 bits wide and indexed by rdi (scale 2).
//   - rcx == 0: eight 16-bit values are loaded with one vmovdqu.
//     rcx != 0: (cl & 7) - 1 selects a jump-table entry into a fall-through chain of vpinsrw
//     loads that fills only the low (cl & 7) lanes of a zeroed xmm0, then rejoins at +0x14.
//   - The values are zero-extended to 32-bit lanes (ymm2) and split into three fields:
//       ymm0 = float(v & 0xf800) * 0x37842108   (float bits for ~1/(31*2048))
//       ymm1 = float(v & 0x07e0) * 0x3a020821   (float bits for ~1/(63*32))
//       ymm2 = float(v & 0x001f) * 0x3d042108   (float bits for ~1/31)
//       ymm3 = 0x3f800000 (1.0f) in every lane
//   - Exit: lods fetches the next word from (%rsi) and jumps to it.
// Scratch: rax, r10; the tail path additionally uses r8 and r9.
4244 CODE const uint8_t sk_load_565_avx[] = {
4245   72,173,                                 //lods          %ds:(%rsi),%rax
4246   76,139,16,                              //mov           (%rax),%r10
4247   72,133,201,                             //test          %rcx,%rcx
4248   15,133,209,0,0,0,                       //jne           d77 <_sk_load_565_avx+0xdf>
4249   196,193,122,111,4,122,                  //vmovdqu       (%r10,%rdi,2),%xmm0
  // +0x14: common path; xmm0 holds the eight 16-bit inputs. Widen to eight 32-bit lanes in ymm2.
4250   197,241,239,201,                        //vpxor         %xmm1,%xmm1,%xmm1
4251   197,249,105,201,                        //vpunpckhwd    %xmm1,%xmm0,%xmm1
4252   196,226,121,51,192,                     //vpmovzxwd     %xmm0,%xmm0
4253   196,227,125,24,209,1,                   //vinsertf128   $0x1,%xmm1,%ymm0,%ymm2
  // ymm0 = float(v & 0xf800) * scale
4254   184,0,248,0,0,                          //mov           $0xf800,%eax
4255   197,249,110,192,                        //vmovd         %eax,%xmm0
4256   197,249,112,192,0,                      //vpshufd       $0x0,%xmm0,%xmm0
4257   196,227,125,24,192,1,                   //vinsertf128   $0x1,%xmm0,%ymm0,%ymm0
4258   197,252,84,194,                         //vandps        %ymm2,%ymm0,%ymm0
4259   197,252,91,192,                         //vcvtdq2ps     %ymm0,%ymm0
4260   184,8,33,132,55,                        //mov           $0x37842108,%eax
4261   197,249,110,200,                        //vmovd         %eax,%xmm1
4262   196,227,121,4,201,0,                    //vpermilps     $0x0,%xmm1,%xmm1
4263   196,227,117,24,201,1,                   //vinsertf128   $0x1,%xmm1,%ymm1,%ymm1
4264   197,252,89,193,                         //vmulps        %ymm1,%ymm0,%ymm0
  // ymm1 = float(v & 0x7e0) * scale
4265   184,224,7,0,0,                          //mov           $0x7e0,%eax
4266   197,249,110,200,                        //vmovd         %eax,%xmm1
4267   197,249,112,201,0,                      //vpshufd       $0x0,%xmm1,%xmm1
4268   196,227,117,24,201,1,                   //vinsertf128   $0x1,%xmm1,%ymm1,%ymm1
4269   197,244,84,202,                         //vandps        %ymm2,%ymm1,%ymm1
4270   197,252,91,201,                         //vcvtdq2ps     %ymm1,%ymm1
4271   184,33,8,2,58,                          //mov           $0x3a020821,%eax
4272   197,249,110,216,                        //vmovd         %eax,%xmm3
4273   196,227,121,4,219,0,                    //vpermilps     $0x0,%xmm3,%xmm3
4274   196,227,101,24,219,1,                   //vinsertf128   $0x1,%xmm3,%ymm3,%ymm3
4275   197,244,89,203,                         //vmulps        %ymm3,%ymm1,%ymm1
  // ymm2 = float(v & 0x1f) * scale
4276   184,31,0,0,0,                           //mov           $0x1f,%eax
4277   197,249,110,216,                        //vmovd         %eax,%xmm3
4278   197,249,112,219,0,                      //vpshufd       $0x0,%xmm3,%xmm3
4279   196,227,101,24,219,1,                   //vinsertf128   $0x1,%xmm3,%ymm3,%ymm3
4280   197,228,84,210,                         //vandps        %ymm2,%ymm3,%ymm2
4281   197,252,91,210,                         //vcvtdq2ps     %ymm2,%ymm2
4282   184,8,33,4,61,                          //mov           $0x3d042108,%eax
4283   197,249,110,216,                        //vmovd         %eax,%xmm3
4284   196,227,121,4,219,0,                    //vpermilps     $0x0,%xmm3,%xmm3
4285   196,227,101,24,219,1,                   //vinsertf128   $0x1,%xmm3,%ymm3,%ymm3
4286   197,236,89,211,                         //vmulps        %ymm3,%ymm2,%ymm2
  // ymm3 = 1.0f in every lane
4287   184,0,0,128,63,                         //mov           $0x3f800000,%eax
4288   197,249,110,216,                        //vmovd         %eax,%xmm3
4289   196,227,121,4,219,0,                    //vpermilps     $0x0,%xmm3,%xmm3
4290   196,227,101,24,219,1,                   //vinsertf128   $0x1,%xmm3,%ymm3,%ymm3
4291   72,173,                                 //lods          %ds:(%rsi),%rax
4292   255,224,                                //jmpq          *%rax
  // Tail path (rcx != 0): index = (cl & 7) - 1. An index above 6 (i.e. (cl & 7) == 0) rejoins
  // the common path with xmm0 all zero; otherwise dispatch through the jump table.
4293   65,137,200,                             //mov           %ecx,%r8d
4294   65,128,224,7,                           //and           $0x7,%r8b
4295   197,249,239,192,                        //vpxor         %xmm0,%xmm0,%xmm0
4296   65,254,200,                             //dec           %r8b
4297   65,128,248,6,                           //cmp           $0x6,%r8b
4298   15,135,29,255,255,255,                  //ja            cac <_sk_load_565_avx+0x14>
4299   69,15,182,192,                          //movzbl        %r8b,%r8d
4300   76,141,13,74,0,0,0,                     //lea           0x4a(%rip),%r9        # de4 <_sk_load_565_avx+0x14c>
4301   75,99,4,129,                            //movslq        (%r9,%r8,4),%rax
4302   76,1,200,                               //add           %r9,%rax
4303   255,224,                                //jmpq          *%rax
  // Fall-through load chain: entering later in the chain loads fewer 16-bit elements.
4304   197,249,239,192,                        //vpxor         %xmm0,%xmm0,%xmm0
4305   196,193,121,196,68,122,12,6,            //vpinsrw       $0x6,0xc(%r10,%rdi,2),%xmm0,%xmm0
4306   196,193,121,196,68,122,10,5,            //vpinsrw       $0x5,0xa(%r10,%rdi,2),%xmm0,%xmm0
4307   196,193,121,196,68,122,8,4,             //vpinsrw       $0x4,0x8(%r10,%rdi,2),%xmm0,%xmm0
4308   196,193,121,196,68,122,6,3,             //vpinsrw       $0x3,0x6(%r10,%rdi,2),%xmm0,%xmm0
4309   196,193,121,196,68,122,4,2,             //vpinsrw       $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0
4310   196,193,121,196,68,122,2,1,             //vpinsrw       $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0
4311   196,193,121,196,4,122,0,                //vpinsrw       $0x0,(%r10,%rdi,2),%xmm0,%xmm0
4312   233,201,254,255,255,                    //jmpq          cac <_sk_load_565_avx+0x14>
4313   144,                                    //nop
  // Jump table, NOT instructions: seven little-endian signed 32-bit offsets, each relative to the
  // table's own address (the lea target above). The disassembly text on these lines is an
  // artifact of decoding data as code.
4314   243,255,                                //repz          (bad)
4315   255,                                    //(bad)
4316   255,                                    //(bad)
4317   235,255,                                //jmp           de9 <_sk_load_565_avx+0x151>
4318   255,                                    //(bad)
4319   255,227,                                //jmpq          *%rbx
4320   255,                                    //(bad)
4321   255,                                    //(bad)
4322   255,                                    //(bad)
4323   219,255,                                //(bad)
4324   255,                                    //(bad)
4325   255,211,                                //callq         *%rbx
4326   255,                                    //(bad)
4327   255,                                    //(bad)
4328   255,203,                                //dec           %ebx
4329   255,                                    //(bad)
4330   255,                                    //(bad)
4331   255,                                    //(bad)
4332   191,                                    //.byte         0xbf
4333   255,                                    //(bad)
4334   255,                                    //(bad)
4335   255,                                    //.byte         0xff
4336 };
4337 
// sk_store_565_avx: raw x86-64 AVX machine code (AT&T disassembly in the trailing comments).
// Presumably the "store RGB565 pixels" pipeline stage -- inferred from the name and the shifts;
// confirm with the generator.
//
// What the instructions do:
//   - lods reads an 8-byte word from (%rsi) into rax (advancing rsi); that word is dereferenced to
//     get the destination base pointer (r9). Elements are 16 bits wide and indexed by rdi (scale 2).
//   - Builds one 32-bit value per lane:
//       (int(ymm0 * 31.0f) << 11) | (int(ymm1 * 63.0f) << 5) | int(ymm2 * 31.0f)
//     using 0x41f80000 (31.0f) and 0x427c0000 (63.0f). The 256-bit values are shifted one 128-bit
//     half at a time (vextractf128 / vpslld / vinsertf128).
//   - vpackusdw narrows the eight 32-bit results to eight 16-bit values in xmm8.
//   - rcx == 0: all 16 bytes are written with one vmovdqu.
//     rcx != 0: (cl & 7) - 1 selects a jump-table entry into a fall-through chain of vpextrw
//     stores so that only the low (cl & 7) elements are written; (cl & 7) == 0 stores nothing.
//   - Exit: lods fetches the next word from (%rsi) and jumps to it.
// Scratch: rax, r8, r9, ymm8-ymm11. ymm0-ymm2 are read but not modified.
4338 CODE const uint8_t sk_store_565_avx[] = {
4339   72,173,                                 //lods          %ds:(%rsi),%rax
4340   76,139,8,                               //mov           (%rax),%r9
  // ymm8 = 31.0f in every lane; ymm9 = int(ymm0 * 31) << 11
4341   184,0,0,248,65,                         //mov           $0x41f80000,%eax
4342   197,121,110,192,                        //vmovd         %eax,%xmm8
4343   196,67,121,4,192,0,                     //vpermilps     $0x0,%xmm8,%xmm8
4344   196,67,61,24,192,1,                     //vinsertf128   $0x1,%xmm8,%ymm8,%ymm8
4345   197,60,89,200,                          //vmulps        %ymm0,%ymm8,%ymm9
4346   196,65,125,91,201,                      //vcvtps2dq     %ymm9,%ymm9
4347   196,193,41,114,241,11,                  //vpslld        $0xb,%xmm9,%xmm10
4348   196,67,125,25,201,1,                    //vextractf128  $0x1,%ymm9,%xmm9
4349   196,193,49,114,241,11,                  //vpslld        $0xb,%xmm9,%xmm9
4350   196,67,45,24,201,1,                     //vinsertf128   $0x1,%xmm9,%ymm10,%ymm9
  // ymm10 = int(ymm1 * 63) << 5, then OR into ymm9
4351   184,0,0,124,66,                         //mov           $0x427c0000,%eax
4352   197,121,110,208,                        //vmovd         %eax,%xmm10
4353   196,67,121,4,210,0,                     //vpermilps     $0x0,%xmm10,%xmm10
4354   196,67,45,24,210,1,                     //vinsertf128   $0x1,%xmm10,%ymm10,%ymm10
4355   197,44,89,209,                          //vmulps        %ymm1,%ymm10,%ymm10
4356   196,65,125,91,210,                      //vcvtps2dq     %ymm10,%ymm10
4357   196,193,33,114,242,5,                   //vpslld        $0x5,%xmm10,%xmm11
4358   196,67,125,25,210,1,                    //vextractf128  $0x1,%ymm10,%xmm10
4359   196,193,41,114,242,5,                   //vpslld        $0x5,%xmm10,%xmm10
4360   196,67,37,24,210,1,                     //vinsertf128   $0x1,%xmm10,%ymm11,%ymm10
4361   196,65,45,86,201,                       //vorpd         %ymm9,%ymm10,%ymm9
  // ymm8 = int(ymm2 * 31) (no shift), OR with ymm9, then narrow to 16-bit elements
4362   197,60,89,194,                          //vmulps        %ymm2,%ymm8,%ymm8
4363   196,65,125,91,192,                      //vcvtps2dq     %ymm8,%ymm8
4364   196,65,53,86,192,                       //vorpd         %ymm8,%ymm9,%ymm8
4365   196,67,125,25,193,1,                    //vextractf128  $0x1,%ymm8,%xmm9
4366   196,66,57,43,193,                       //vpackusdw     %xmm9,%xmm8,%xmm8
4367   72,133,201,                             //test          %rcx,%rcx
4368   117,10,                                 //jne           e9e <_sk_store_565_avx+0x9e>
4369   196,65,122,127,4,121,                   //vmovdqu       %xmm8,(%r9,%rdi,2)
4370   72,173,                                 //lods          %ds:(%rsi),%rax
4371   255,224,                                //jmpq          *%rax
  // Tail path (rcx != 0): index = (cl & 7) - 1; dispatch through the jump table.
4372   65,137,200,                             //mov           %ecx,%r8d
4373   65,128,224,7,                           //and           $0x7,%r8b
4374   65,254,200,                             //dec           %r8b
4375   65,128,248,6,                           //cmp           $0x6,%r8b
4376   119,236,                                //ja            e9a <_sk_store_565_avx+0x9a>
4377   65,15,182,192,                          //movzbl        %r8b,%eax
4378   76,141,5,67,0,0,0,                      //lea           0x43(%rip),%r8        # efc <_sk_store_565_avx+0xfc>
4379   73,99,4,128,                            //movslq        (%r8,%rax,4),%rax
4380   76,1,192,                               //add           %r8,%rax
4381   255,224,                                //jmpq          *%rax
  // Fall-through store chain: entering later in the chain stores fewer 16-bit elements.
4382   196,67,121,21,68,121,12,6,              //vpextrw       $0x6,%xmm8,0xc(%r9,%rdi,2)
4383   196,67,121,21,68,121,10,5,              //vpextrw       $0x5,%xmm8,0xa(%r9,%rdi,2)
4384   196,67,121,21,68,121,8,4,               //vpextrw       $0x4,%xmm8,0x8(%r9,%rdi,2)
4385   196,67,121,21,68,121,6,3,               //vpextrw       $0x3,%xmm8,0x6(%r9,%rdi,2)
4386   196,67,121,21,68,121,4,2,               //vpextrw       $0x2,%xmm8,0x4(%r9,%rdi,2)
4387   196,67,121,21,68,121,2,1,               //vpextrw       $0x1,%xmm8,0x2(%r9,%rdi,2)
4388   196,67,121,21,4,121,0,                  //vpextrw       $0x0,%xmm8,(%r9,%rdi,2)
4389   235,159,                                //jmp           e9a <_sk_store_565_avx+0x9a>
4390   144,                                    //nop
  // Jump table, NOT instructions: seven little-endian signed 32-bit offsets, each relative to the
  // table's own address (the lea target above). The disassembly text on these lines is an
  // artifact of decoding data as code.
4391   246,255,                                //idiv          %bh
4392   255,                                    //(bad)
4393   255,                                    //(bad)
4394   238,                                    //out           %al,(%dx)
4395   255,                                    //(bad)
4396   255,                                    //(bad)
4397   255,230,                                //jmpq          *%rsi
4398   255,                                    //(bad)
4399   255,                                    //(bad)
4400   255,                                    //(bad)
4401   222,255,                                //fdivrp        %st,%st(7)
4402   255,                                    //(bad)
4403   255,214,                                //callq         *%rsi
4404   255,                                    //(bad)
4405   255,                                    //(bad)
4406   255,206,                                //dec           %esi
4407   255,                                    //(bad)
4408   255,                                    //(bad)
4409   255,198,                                //inc           %esi
4410   255,                                    //(bad)
4411   255,                                    //(bad)
4412   255,                                    //.byte         0xff
4413 };
4414 
// sk_load_8888_avx: raw x86-64 AVX machine code (AT&T disassembly in the trailing comments).
// Presumably the "load 8888 (four 8-bit channels) pixels" pipeline stage -- inferred from the name
// and the byte-wise shifts; confirm with the generator.
//
// What the instructions do:
//   - lods reads an 8-byte word from (%rsi) into rax (advancing rsi); that word is dereferenced to
//     get the source base pointer (r10). Elements are 32 bits wide and indexed by rdi (scale 4).
//   - rcx == 0: eight 32-bit values are loaded into ymm9 with one vmovups.
//     rcx != 0: (cl & 7) - 1 selects a jump-table entry into a fall-through chain of
//     vpinsrd/vblendps loads that fills only the low (cl & 7) lanes of ymm9 (the rest stay zero),
//     then rejoins at +0x14.
//   - Each lane v is split into four bytes, converted to float and scaled by 0x3b808081
//     (the float bit pattern for ~1/255):
//       ymm0 = float( v        & 0xff) * scale
//       ymm1 = float((v >>  8) & 0xff) * scale
//       ymm2 = float((v >> 16) & 0xff) * scale
//       ymm3 = float( v >> 24        ) * scale
//     Shifts are done one 128-bit half at a time (vpsrld on xmm, then vinsertf128).
//   - Exit: lods fetches the next word from (%rsi) and jumps to it.
// Scratch: rax, r10, ymm8-ymm11; the tail path additionally uses r8 and r9.
4415 CODE const uint8_t sk_load_8888_avx[] = {
4416   72,173,                                 //lods          %ds:(%rsi),%rax
4417   76,139,16,                              //mov           (%rax),%r10
4418   72,133,201,                             //test          %rcx,%rcx
4419   15,133,157,0,0,0,                       //jne           fc3 <_sk_load_8888_avx+0xab>
4420   196,65,124,16,12,186,                   //vmovups       (%r10,%rdi,4),%ymm9
  // +0x14: common path; ymm9 holds the eight 32-bit inputs. ymm11 = 0xff mask in every lane.
4421   184,255,0,0,0,                          //mov           $0xff,%eax
4422   197,249,110,192,                        //vmovd         %eax,%xmm0
4423   197,249,112,192,0,                      //vpshufd       $0x0,%xmm0,%xmm0
4424   196,99,125,24,216,1,                    //vinsertf128   $0x1,%xmm0,%ymm0,%ymm11
4425   196,193,36,84,193,                      //vandps        %ymm9,%ymm11,%ymm0
4426   197,252,91,192,                         //vcvtdq2ps     %ymm0,%ymm0
  // ymm8 = scale constant in every lane
4427   184,129,128,128,59,                     //mov           $0x3b808081,%eax
4428   197,249,110,200,                        //vmovd         %eax,%xmm1
4429   196,227,121,4,201,0,                    //vpermilps     $0x0,%xmm1,%xmm1
4430   196,99,117,24,193,1,                    //vinsertf128   $0x1,%xmm1,%ymm1,%ymm8
4431   196,193,124,89,192,                     //vmulps        %ymm8,%ymm0,%ymm0
  // ymm1 from bits 8-15 (xmm3 keeps the upper 128-bit half of the input for later shifts)
4432   196,193,41,114,209,8,                   //vpsrld        $0x8,%xmm9,%xmm10
4433   196,99,125,25,203,1,                    //vextractf128  $0x1,%ymm9,%xmm3
4434   197,241,114,211,8,                      //vpsrld        $0x8,%xmm3,%xmm1
4435   196,227,45,24,201,1,                    //vinsertf128   $0x1,%xmm1,%ymm10,%ymm1
4436   197,164,84,201,                         //vandps        %ymm1,%ymm11,%ymm1
4437   197,252,91,201,                         //vcvtdq2ps     %ymm1,%ymm1
4438   196,193,116,89,200,                     //vmulps        %ymm8,%ymm1,%ymm1
  // ymm2 from bits 16-23
4439   196,193,41,114,209,16,                  //vpsrld        $0x10,%xmm9,%xmm10
4440   197,233,114,211,16,                     //vpsrld        $0x10,%xmm3,%xmm2
4441   196,227,45,24,210,1,                    //vinsertf128   $0x1,%xmm2,%ymm10,%ymm2
4442   197,164,84,210,                         //vandps        %ymm2,%ymm11,%ymm2
4443   197,252,91,210,                         //vcvtdq2ps     %ymm2,%ymm2
4444   196,193,108,89,208,                     //vmulps        %ymm8,%ymm2,%ymm2
  // ymm3 from bits 24-31 (no mask needed after a logical shift by 24)
4445   196,193,49,114,209,24,                  //vpsrld        $0x18,%xmm9,%xmm9
4446   197,225,114,211,24,                     //vpsrld        $0x18,%xmm3,%xmm3
4447   196,227,53,24,219,1,                    //vinsertf128   $0x1,%xmm3,%ymm9,%ymm3
4448   197,252,91,219,                         //vcvtdq2ps     %ymm3,%ymm3
4449   196,193,100,89,216,                     //vmulps        %ymm8,%ymm3,%ymm3
4450   72,173,                                 //lods          %ds:(%rsi),%rax
4451   255,224,                                //jmpq          *%rax
  // Tail path (rcx != 0): index = (cl & 7) - 1. An index above 6 (i.e. (cl & 7) == 0) rejoins
  // the common path with ymm9 all zero; otherwise dispatch through the jump table.
4452   65,137,200,                             //mov           %ecx,%r8d
4453   65,128,224,7,                           //and           $0x7,%r8b
4454   196,65,52,87,201,                       //vxorps        %ymm9,%ymm9,%ymm9
4455   65,254,200,                             //dec           %r8b
4456   65,128,248,6,                           //cmp           $0x6,%r8b
4457   15,135,80,255,255,255,                  //ja            f2c <_sk_load_8888_avx+0x14>
4458   69,15,182,192,                          //movzbl        %r8b,%r8d
4459   76,141,13,137,0,0,0,                    //lea           0x89(%rip),%r9        # 1070 <_sk_load_8888_avx+0x158>
4460   75,99,4,129,                            //movslq        (%r9,%r8,4),%rax
4461   76,1,200,                               //add           %r9,%rax
4462   255,224,                                //jmpq          *%rax
  // Fall-through load chain: entering later in the chain loads fewer 32-bit elements.
  // Elements 4-6 go into the upper 128-bit half of ymm9, elements 0-3 into the lower half.
4463   196,193,121,110,68,186,24,              //vmovd         0x18(%r10,%rdi,4),%xmm0
4464   197,249,112,192,68,                     //vpshufd       $0x44,%xmm0,%xmm0
4465   196,227,125,24,192,1,                   //vinsertf128   $0x1,%xmm0,%ymm0,%ymm0
4466   197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
4467   196,99,117,12,200,64,                   //vblendps      $0x40,%ymm0,%ymm1,%ymm9
4468   196,99,125,25,200,1,                    //vextractf128  $0x1,%ymm9,%xmm0
4469   196,195,121,34,68,186,20,1,             //vpinsrd       $0x1,0x14(%r10,%rdi,4),%xmm0,%xmm0
4470   196,99,53,24,200,1,                     //vinsertf128   $0x1,%xmm0,%ymm9,%ymm9
4471   196,99,125,25,200,1,                    //vextractf128  $0x1,%ymm9,%xmm0
4472   196,195,121,34,68,186,16,0,             //vpinsrd       $0x0,0x10(%r10,%rdi,4),%xmm0,%xmm0
4473   196,99,53,24,200,1,                     //vinsertf128   $0x1,%xmm0,%ymm9,%ymm9
4474   196,195,49,34,68,186,12,3,              //vpinsrd       $0x3,0xc(%r10,%rdi,4),%xmm9,%xmm0
4475   196,99,53,12,200,15,                    //vblendps      $0xf,%ymm0,%ymm9,%ymm9
4476   196,195,49,34,68,186,8,2,               //vpinsrd       $0x2,0x8(%r10,%rdi,4),%xmm9,%xmm0
4477   196,99,53,12,200,15,                    //vblendps      $0xf,%ymm0,%ymm9,%ymm9
4478   196,195,49,34,68,186,4,1,               //vpinsrd       $0x1,0x4(%r10,%rdi,4),%xmm9,%xmm0
4479   196,99,53,12,200,15,                    //vblendps      $0xf,%ymm0,%ymm9,%ymm9
4480   196,195,49,34,4,186,0,                  //vpinsrd       $0x0,(%r10,%rdi,4),%xmm9,%xmm0
4481   196,99,53,12,200,15,                    //vblendps      $0xf,%ymm0,%ymm9,%ymm9
4482   233,188,254,255,255,                    //jmpq          f2c <_sk_load_8888_avx+0x14>
  // Jump table, NOT instructions: seven little-endian signed 32-bit offsets, each relative to the
  // table's own address (the lea target above). The disassembly text on these lines is an
  // artifact of decoding data as code.
4483   238,                                    //out           %al,(%dx)
4484   255,                                    //(bad)
4485   255,                                    //(bad)
4486   255,224,                                //jmpq          *%rax
4487   255,                                    //(bad)
4488   255,                                    //(bad)
4489   255,210,                                //callq         *%rdx
4490   255,                                    //(bad)
4491   255,                                    //(bad)
4492   255,196,                                //inc           %esp
4493   255,                                    //(bad)
4494   255,                                    //(bad)
4495   255,176,255,255,255,156,                //pushq         -0x63000001(%rax)
4496   255,                                    //(bad)
4497   255,                                    //(bad)
4498   255,                                    //.byte         0xff
4499   128,255,255,                            //cmp           $0xff,%bh
4500   255,                                    //.byte         0xff
4501 };
4502 
4503 CODE const uint8_t sk_store_8888_avx[] = {
4504   72,173,                                 //lods          %ds:(%rsi),%rax
4505   76,139,8,                               //mov           (%rax),%r9
4506   184,0,0,127,67,                         //mov           $0x437f0000,%eax
4507   197,121,110,192,                        //vmovd         %eax,%xmm8
4508   196,67,121,4,192,0,                     //vpermilps     $0x0,%xmm8,%xmm8
4509   196,67,61,24,192,1,                     //vinsertf128   $0x1,%xmm8,%ymm8,%ymm8
4510   197,60,89,200,                          //vmulps        %ymm0,%ymm8,%ymm9
4511   196,65,125,91,201,                      //vcvtps2dq     %ymm9,%ymm9
4512   197,60,89,209,                          //vmulps        %ymm1,%ymm8,%ymm10
4513   196,65,125,91,210,                      //vcvtps2dq     %ymm10,%ymm10
4514   196,193,33,114,242,8,                   //vpslld        $0x8,%xmm10,%xmm11
4515   196,67,125,25,210,1,                    //vextractf128  $0x1,%ymm10,%xmm10
4516   196,193,41,114,242,8,                   //vpslld        $0x8,%xmm10,%xmm10
4517   196,67,37,24,210,1,                     //vinsertf128   $0x1,%xmm10,%ymm11,%ymm10
4518   196,65,45,86,201,                       //vorpd         %ymm9,%ymm10,%ymm9
4519   197,60,89,210,                          //vmulps        %ymm2,%ymm8,%ymm10
4520   196,65,125,91,210,                      //vcvtps2dq     %ymm10,%ymm10
4521   196,193,33,114,242,16,                  //vpslld        $0x10,%xmm10,%xmm11
4522   196,67,125,25,210,1,                    //vextractf128  $0x1,%ymm10,%xmm10
4523   196,193,41,114,242,16,                  //vpslld        $0x10,%xmm10,%xmm10
4524   196,67,37,24,210,1,                     //vinsertf128   $0x1,%xmm10,%ymm11,%ymm10
4525   197,60,89,195,                          //vmulps        %ymm3,%ymm8,%ymm8
4526   196,65,125,91,192,                      //vcvtps2dq     %ymm8,%ymm8
4527   196,193,33,114,240,24,                  //vpslld        $0x18,%xmm8,%xmm11
4528   196,67,125,25,192,1,                    //vextractf128  $0x1,%ymm8,%xmm8
4529   196,193,57,114,240,24,                  //vpslld        $0x18,%xmm8,%xmm8
4530   196,67,37,24,192,1,                     //vinsertf128   $0x1,%xmm8,%ymm11,%ymm8
4531   196,65,45,86,192,                       //vorpd         %ymm8,%ymm10,%ymm8
4532   196,65,53,86,192,                       //vorpd         %ymm8,%ymm9,%ymm8
4533   72,133,201,                             //test          %rcx,%rcx
4534   117,10,                                 //jne           1130 <_sk_store_8888_avx+0xa4>
4535   196,65,124,17,4,185,                    //vmovups       %ymm8,(%r9,%rdi,4)
4536   72,173,                                 //lods          %ds:(%rsi),%rax
4537   255,224,                                //jmpq          *%rax
4538   65,137,200,                             //mov           %ecx,%r8d
4539   65,128,224,7,                           //and           $0x7,%r8b
4540   65,254,200,                             //dec           %r8b
4541   65,128,248,6,                           //cmp           $0x6,%r8b
4542   119,236,                                //ja            112c <_sk_store_8888_avx+0xa0>
4543   65,15,182,192,                          //movzbl        %r8b,%eax
4544   76,141,5,85,0,0,0,                      //lea           0x55(%rip),%r8        # 11a0 <_sk_store_8888_avx+0x114>
4545   73,99,4,128,                            //movslq        (%r8,%rax,4),%rax
4546   76,1,192,                               //add           %r8,%rax
4547   255,224,                                //jmpq          *%rax
4548   196,67,125,25,193,1,                    //vextractf128  $0x1,%ymm8,%xmm9
4549   196,67,121,22,76,185,24,2,              //vpextrd       $0x2,%xmm9,0x18(%r9,%rdi,4)
4550   196,67,125,25,193,1,                    //vextractf128  $0x1,%ymm8,%xmm9
4551   196,67,121,22,76,185,20,1,              //vpextrd       $0x1,%xmm9,0x14(%r9,%rdi,4)
4552   196,67,125,25,193,1,                    //vextractf128  $0x1,%ymm8,%xmm9
4553   196,65,122,17,76,185,16,                //vmovss        %xmm9,0x10(%r9,%rdi,4)
4554   196,67,121,22,68,185,12,3,              //vpextrd       $0x3,%xmm8,0xc(%r9,%rdi,4)
4555   196,67,121,22,68,185,8,2,               //vpextrd       $0x2,%xmm8,0x8(%r9,%rdi,4)
4556   196,67,121,22,68,185,4,1,               //vpextrd       $0x1,%xmm8,0x4(%r9,%rdi,4)
4557   196,65,121,126,4,185,                   //vmovd         %xmm8,(%r9,%rdi,4)
4558   235,143,                                //jmp           112c <_sk_store_8888_avx+0xa0>
4559   15,31,0,                                //nopl          (%rax)
4560   245,                                    //cmc
4561   255,                                    //(bad)
4562   255,                                    //(bad)
4563   255,                                    //(bad)
4564   237,                                    //in            (%dx),%eax
4565   255,                                    //(bad)
4566   255,                                    //(bad)
4567   255,229,                                //jmpq          *%rbp
4568   255,                                    //(bad)
4569   255,                                    //(bad)
4570   255,                                    //(bad)
4571   221,255,                                //(bad)
4572   255,                                    //(bad)
4573   255,208,                                //callq         *%rax
4574   255,                                    //(bad)
4575   255,                                    //(bad)
4576   255,194,                                //inc           %edx
4577   255,                                    //(bad)
4578   255,                                    //(bad)
4579   255,                                    //.byte         0xff
4580   180,255,                                //mov           $0xff,%ah
4581   255,                                    //(bad)
4582   255,                                    //.byte         0xff
4583 };
4584 
// sk_load_f16_avx: machine code for a stage that reads 8-byte pixels (four
// 16-bit components each) and expands them to 32-bit floats in ymm0..ymm3.
//   - lods fetches the next 8-byte program word from (%rsi) into %rax; that
//     word is dereferenced once to get the base address, indexed as
//     (%rax,%rdi,8).
//   - %rcx == 0 takes the full path (64 bytes = 8 pixels).  Otherwise the code
//     at +0x110 loads 8 bytes per pixel for 1..7 pixels, zeroes the source
//     registers it does not fill, and rejoins the shared code at +0x25.
//   - The vpunpck{l,h}wd sequence deinterleaves the words so that ymm0..ymm3
//     end up holding the 1st..4th component of every pixel.
//   - Any 16-bit value that is (signed) less than 0x0400 is replaced by zero;
//     the survivors are zero-extended to 32 bits, shifted left by 13 and
//     multiplied by the float whose bit pattern is 0x77800000 (2^112).
//   - Finishes by loading the next program word and jumping to it.
// NOTE(review): the components are presumably IEEE half-float r,g,b,a --
// confirm against the generator's source.
4585 CODE const uint8_t sk_load_f16_avx[] = {
4586   72,173,                                 //lods          %ds:(%rsi),%rax
4587   72,139,0,                               //mov           (%rax),%rax
4588   72,133,201,                             //test          %rcx,%rcx
4589   15,133,2,1,0,0,                         //jne           12cc <_sk_load_f16_avx+0x110>
4590   197,121,16,4,248,                       //vmovupd       (%rax,%rdi,8),%xmm8
4591   197,249,16,84,248,16,                   //vmovupd       0x10(%rax,%rdi,8),%xmm2
4592   197,249,16,92,248,32,                   //vmovupd       0x20(%rax,%rdi,8),%xmm3
4593   197,122,111,76,248,48,                  //vmovdqu       0x30(%rax,%rdi,8),%xmm9
       // +0x25: shared by the full and tail paths -- deinterleave the 16-bit words.
4594   197,185,97,194,                         //vpunpcklwd    %xmm2,%xmm8,%xmm0
4595   197,185,105,210,                        //vpunpckhwd    %xmm2,%xmm8,%xmm2
4596   196,193,97,97,201,                      //vpunpcklwd    %xmm9,%xmm3,%xmm1
4597   196,193,97,105,217,                     //vpunpckhwd    %xmm9,%xmm3,%xmm3
4598   197,121,97,194,                         //vpunpcklwd    %xmm2,%xmm0,%xmm8
4599   197,249,105,194,                        //vpunpckhwd    %xmm2,%xmm0,%xmm0
4600   197,241,97,211,                         //vpunpcklwd    %xmm3,%xmm1,%xmm2
4601   197,113,105,203,                        //vpunpckhwd    %xmm3,%xmm1,%xmm9
       // Zero every word that is (signed) below 0x0400: mask = 0x0400 > v, then v &= ~mask.
4602   184,0,4,0,4,                            //mov           $0x4000400,%eax
4603   197,249,110,216,                        //vmovd         %eax,%xmm3
4604   197,249,112,219,0,                      //vpshufd       $0x0,%xmm3,%xmm3
4605   196,193,97,101,200,                     //vpcmpgtw      %xmm8,%xmm3,%xmm1
4606   196,65,113,223,192,                     //vpandn        %xmm8,%xmm1,%xmm8
4607   197,225,101,200,                        //vpcmpgtw      %xmm0,%xmm3,%xmm1
4608   197,241,223,192,                        //vpandn        %xmm0,%xmm1,%xmm0
4609   197,225,101,202,                        //vpcmpgtw      %xmm2,%xmm3,%xmm1
4610   197,241,223,202,                        //vpandn        %xmm2,%xmm1,%xmm1
4611   196,193,97,101,209,                     //vpcmpgtw      %xmm9,%xmm3,%xmm2
4612   196,193,105,223,209,                    //vpandn        %xmm9,%xmm2,%xmm2
       // Widen each word to 32 bits (low halves via vpmovzxwd, high halves via unpack with zero).
4613   196,66,121,51,208,                      //vpmovzxwd     %xmm8,%xmm10
4614   196,98,121,51,201,                      //vpmovzxwd     %xmm1,%xmm9
4615   197,225,239,219,                        //vpxor         %xmm3,%xmm3,%xmm3
4616   197,57,105,195,                         //vpunpckhwd    %xmm3,%xmm8,%xmm8
4617   197,241,105,203,                        //vpunpckhwd    %xmm3,%xmm1,%xmm1
4618   196,98,121,51,216,                      //vpmovzxwd     %xmm0,%xmm11
4619   196,98,121,51,226,                      //vpmovzxwd     %xmm2,%xmm12
4620   197,121,105,235,                        //vpunpckhwd    %xmm3,%xmm0,%xmm13
4621   197,105,105,243,                        //vpunpckhwd    %xmm3,%xmm2,%xmm14
       // Per component: shift left 13, join the two 128-bit halves, scale by 0x77800000 (2^112).
4622   196,193,121,114,242,13,                 //vpslld        $0xd,%xmm10,%xmm0
4623   196,193,105,114,241,13,                 //vpslld        $0xd,%xmm9,%xmm2
4624   196,227,125,24,194,1,                   //vinsertf128   $0x1,%xmm2,%ymm0,%ymm0
4625   184,0,0,128,119,                        //mov           $0x77800000,%eax
4626   197,249,110,208,                        //vmovd         %eax,%xmm2
4627   197,249,112,210,0,                      //vpshufd       $0x0,%xmm2,%xmm2
4628   196,99,109,24,202,1,                    //vinsertf128   $0x1,%xmm2,%ymm2,%ymm9
4629   197,180,89,192,                         //vmulps        %ymm0,%ymm9,%ymm0
4630   196,193,105,114,240,13,                 //vpslld        $0xd,%xmm8,%xmm2
4631   197,241,114,241,13,                     //vpslld        $0xd,%xmm1,%xmm1
4632   196,227,109,24,201,1,                   //vinsertf128   $0x1,%xmm1,%ymm2,%ymm1
4633   197,180,89,201,                         //vmulps        %ymm1,%ymm9,%ymm1
4634   196,193,105,114,243,13,                 //vpslld        $0xd,%xmm11,%xmm2
4635   196,193,97,114,244,13,                  //vpslld        $0xd,%xmm12,%xmm3
4636   196,227,109,24,211,1,                   //vinsertf128   $0x1,%xmm3,%ymm2,%ymm2
4637   197,180,89,210,                         //vmulps        %ymm2,%ymm9,%ymm2
4638   196,193,57,114,245,13,                  //vpslld        $0xd,%xmm13,%xmm8
4639   196,193,97,114,246,13,                  //vpslld        $0xd,%xmm14,%xmm3
4640   196,227,61,24,219,1,                    //vinsertf128   $0x1,%xmm3,%ymm8,%ymm3
4641   197,180,89,219,                         //vmulps        %ymm3,%ymm9,%ymm3
4642   72,173,                                 //lods          %ds:(%rsi),%rax
4643   255,224,                                //jmpq          *%rax
       // +0x110: tail path (%rcx != 0) -- load one 8-byte pixel at a time, then rejoin at +0x25.
4644   197,123,16,4,248,                       //vmovsd        (%rax,%rdi,8),%xmm8
4645   196,65,49,239,201,                      //vpxor         %xmm9,%xmm9,%xmm9
4646   72,131,249,1,                           //cmp           $0x1,%rcx
4647   116,79,                                 //je            132b <_sk_load_f16_avx+0x16f>
4648   197,57,22,68,248,8,                     //vmovhpd       0x8(%rax,%rdi,8),%xmm8,%xmm8
4649   72,131,249,3,                           //cmp           $0x3,%rcx
4650   114,67,                                 //jb            132b <_sk_load_f16_avx+0x16f>
4651   197,251,16,84,248,16,                   //vmovsd        0x10(%rax,%rdi,8),%xmm2
4652   72,131,249,3,                           //cmp           $0x3,%rcx
4653   116,68,                                 //je            1338 <_sk_load_f16_avx+0x17c>
4654   197,233,22,84,248,24,                   //vmovhpd       0x18(%rax,%rdi,8),%xmm2,%xmm2
4655   72,131,249,5,                           //cmp           $0x5,%rcx
4656   114,56,                                 //jb            1338 <_sk_load_f16_avx+0x17c>
4657   197,251,16,92,248,32,                   //vmovsd        0x20(%rax,%rdi,8),%xmm3
4658   72,131,249,5,                           //cmp           $0x5,%rcx
4659   15,132,209,254,255,255,                 //je            11e1 <_sk_load_f16_avx+0x25>
4660   197,225,22,92,248,40,                   //vmovhpd       0x28(%rax,%rdi,8),%xmm3,%xmm3
4661   72,131,249,7,                           //cmp           $0x7,%rcx
4662   15,130,193,254,255,255,                 //jb            11e1 <_sk_load_f16_avx+0x25>
4663   197,122,126,76,248,48,                  //vmovq         0x30(%rax,%rdi,8),%xmm9
4664   233,182,254,255,255,                    //jmpq          11e1 <_sk_load_f16_avx+0x25>
       // +0x16f: at most two pixels were loaded -- clear xmm3 and xmm2 before rejoining.
4665   197,225,87,219,                         //vxorpd        %xmm3,%xmm3,%xmm3
4666   197,233,87,210,                         //vxorpd        %xmm2,%xmm2,%xmm2
4667   233,169,254,255,255,                    //jmpq          11e1 <_sk_load_f16_avx+0x25>
       // +0x17c: at most four pixels were loaded -- clear xmm3 before rejoining.
4668   197,225,87,219,                         //vxorpd        %xmm3,%xmm3,%xmm3
4669   233,160,254,255,255,                    //jmpq          11e1 <_sk_load_f16_avx+0x25>
4670 };
4671 
// sk_store_f16_avx: machine code for a stage that narrows the floats in
// ymm0..ymm3 to 16-bit values and writes them as interleaved 8-byte pixels.
//   - lods fetches the next 8-byte program word from (%rsi) into %rax; that
//     word is dereferenced once to get the destination base in %r8, indexed
//     as (%r8,%rdi,8).
//   - Each input register is multiplied by the float whose bit pattern is
//     0x07800000 (2^-112) and the 32-bit results are shifted right by 13.
//   - vpslldq $2 + vpor pair up the ymm0/ymm1 results and the ymm2/ymm3
//     results as 16-bit halves of a dword; vpunpck{l,h}dq then interleave
//     those pairs into whole pixels (xmm11, xmm10, xmm9, xmm8 = pixels 0-1,
//     2-3, 4-5, 6-7).
//   - %rcx == 0 stores all 64 bytes; otherwise the code at +0xd6 stores
//     8 bytes per pixel for 1..7 pixels and jumps back to the exit at +0xd2.
//   - ymm0..ymm3 are not written by this stage.
// NOTE(review): presumably the inverse of sk_load_f16_avx (float -> IEEE
// half) -- confirm against the generator's source.
4672 CODE const uint8_t sk_store_f16_avx[] = {
4673   72,173,                                 //lods          %ds:(%rsi),%rax
4674   76,139,0,                               //mov           (%rax),%r8
4675   184,0,0,128,7,                          //mov           $0x7800000,%eax
4676   197,121,110,192,                        //vmovd         %eax,%xmm8
4677   196,65,121,112,192,0,                   //vpshufd       $0x0,%xmm8,%xmm8
4678   196,67,61,24,192,1,                     //vinsertf128   $0x1,%xmm8,%ymm8,%ymm8
       // For each of ymm0..ymm3: scale by 2^-112, then shift both 128-bit halves right by 13.
4679   197,60,89,200,                          //vmulps        %ymm0,%ymm8,%ymm9
4680   196,67,125,25,202,1,                    //vextractf128  $0x1,%ymm9,%xmm10
4681   196,193,41,114,210,13,                  //vpsrld        $0xd,%xmm10,%xmm10
4682   196,193,49,114,209,13,                  //vpsrld        $0xd,%xmm9,%xmm9
4683   197,60,89,217,                          //vmulps        %ymm1,%ymm8,%ymm11
4684   196,67,125,25,220,1,                    //vextractf128  $0x1,%ymm11,%xmm12
4685   196,193,25,114,212,13,                  //vpsrld        $0xd,%xmm12,%xmm12
4686   196,193,33,114,211,13,                  //vpsrld        $0xd,%xmm11,%xmm11
4687   197,60,89,234,                          //vmulps        %ymm2,%ymm8,%ymm13
4688   196,67,125,25,238,1,                    //vextractf128  $0x1,%ymm13,%xmm14
4689   196,193,9,114,214,13,                   //vpsrld        $0xd,%xmm14,%xmm14
4690   196,193,17,114,213,13,                  //vpsrld        $0xd,%xmm13,%xmm13
4691   197,60,89,195,                          //vmulps        %ymm3,%ymm8,%ymm8
4692   196,67,125,25,199,1,                    //vextractf128  $0x1,%ymm8,%xmm15
4693   196,193,1,114,215,13,                   //vpsrld        $0xd,%xmm15,%xmm15
4694   196,193,57,114,208,13,                  //vpsrld        $0xd,%xmm8,%xmm8
       // Pack pairs of components into one dword: (second << 2 bytes) | first.
4695   196,193,33,115,251,2,                   //vpslldq       $0x2,%xmm11,%xmm11
4696   196,65,33,235,201,                      //vpor          %xmm9,%xmm11,%xmm9
4697   196,193,33,115,252,2,                   //vpslldq       $0x2,%xmm12,%xmm11
4698   196,65,33,235,226,                      //vpor          %xmm10,%xmm11,%xmm12
4699   196,193,57,115,248,2,                   //vpslldq       $0x2,%xmm8,%xmm8
4700   196,65,57,235,197,                      //vpor          %xmm13,%xmm8,%xmm8
4701   196,193,41,115,255,2,                   //vpslldq       $0x2,%xmm15,%xmm10
4702   196,65,41,235,238,                      //vpor          %xmm14,%xmm10,%xmm13
       // Interleave the dword pairs into complete 8-byte pixels.
4703   196,65,49,98,216,                       //vpunpckldq    %xmm8,%xmm9,%xmm11
4704   196,65,49,106,208,                      //vpunpckhdq    %xmm8,%xmm9,%xmm10
4705   196,65,25,98,205,                       //vpunpckldq    %xmm13,%xmm12,%xmm9
4706   196,65,25,106,197,                      //vpunpckhdq    %xmm13,%xmm12,%xmm8
4707   72,133,201,                             //test          %rcx,%rcx
4708   117,31,                                 //jne           1417 <_sk_store_f16_avx+0xd6>
4709   196,65,120,17,28,248,                   //vmovups       %xmm11,(%r8,%rdi,8)
4710   196,65,120,17,84,248,16,                //vmovups       %xmm10,0x10(%r8,%rdi,8)
4711   196,65,120,17,76,248,32,                //vmovups       %xmm9,0x20(%r8,%rdi,8)
4712   196,65,122,127,68,248,48,               //vmovdqu       %xmm8,0x30(%r8,%rdi,8)
       // +0xd2: common exit -- fetch the next program word and jump to it.
4713   72,173,                                 //lods          %ds:(%rsi),%rax
4714   255,224,                                //jmpq          *%rax
       // +0xd6: tail path (%rcx != 0) -- store one 8-byte pixel at a time.
       // The je branches reuse the flags of the preceding cmp; the vector stores leave flags untouched.
4715   196,65,121,214,28,248,                  //vmovq         %xmm11,(%r8,%rdi,8)
4716   72,131,249,1,                           //cmp           $0x1,%rcx
4717   116,240,                                //je            1413 <_sk_store_f16_avx+0xd2>
4718   196,65,121,23,92,248,8,                 //vmovhpd       %xmm11,0x8(%r8,%rdi,8)
4719   72,131,249,3,                           //cmp           $0x3,%rcx
4720   114,227,                                //jb            1413 <_sk_store_f16_avx+0xd2>
4721   196,65,121,214,84,248,16,               //vmovq         %xmm10,0x10(%r8,%rdi,8)
4722   116,218,                                //je            1413 <_sk_store_f16_avx+0xd2>
4723   196,65,121,23,84,248,24,                //vmovhpd       %xmm10,0x18(%r8,%rdi,8)
4724   72,131,249,5,                           //cmp           $0x5,%rcx
4725   114,205,                                //jb            1413 <_sk_store_f16_avx+0xd2>
4726   196,65,121,214,76,248,32,               //vmovq         %xmm9,0x20(%r8,%rdi,8)
4727   116,196,                                //je            1413 <_sk_store_f16_avx+0xd2>
4728   196,65,121,23,76,248,40,                //vmovhpd       %xmm9,0x28(%r8,%rdi,8)
4729   72,131,249,7,                           //cmp           $0x7,%rcx
4730   114,183,                                //jb            1413 <_sk_store_f16_avx+0xd2>
4731   196,65,121,214,68,248,48,               //vmovq         %xmm8,0x30(%r8,%rdi,8)
4732   235,174,                                //jmp           1413 <_sk_store_f16_avx+0xd2>
4733 };
4734 
// sk_store_f32_avx: machine code for a stage that writes ymm0..ymm3 to memory
// as interleaved 16-byte pixels (four 32-bit floats each).
//   - lods fetches the next 8-byte program word from (%rsi) into %rax; that
//     word is dereferenced once to get the destination base in %r8.
//   - %rax is then set to %rdi*4 and used with scale 4, so pixel %rdi lives at
//     %r8 + %rdi*16.
//   - vunpck{l,h}ps / vunpck{l,h}pd transpose the four registers so that each
//     128-bit lane holds one pixel (ymm10 = pixels 0|4, ymm9 = 1|5,
//     ymm8 = 2|6, ymm11 = 3|7).
//   - %rcx == 0 rearranges the lanes into pixel order and stores 128 bytes
//     (8 pixels); otherwise the code at +0x6d stores 16 bytes per pixel for
//     1..7 pixels and jumps back to the exit at +0x69.
//   - ymm0..ymm3 are not written by this stage.
4735 CODE const uint8_t sk_store_f32_avx[] = {
4736   72,173,                                 //lods          %ds:(%rsi),%rax
4737   76,139,0,                               //mov           (%rax),%r8
4738   72,141,4,189,0,0,0,0,                   //lea           0x0(,%rdi,4),%rax
4739   197,124,20,193,                         //vunpcklps     %ymm1,%ymm0,%ymm8
4740   197,124,21,217,                         //vunpckhps     %ymm1,%ymm0,%ymm11
4741   197,108,20,203,                         //vunpcklps     %ymm3,%ymm2,%ymm9
4742   197,108,21,227,                         //vunpckhps     %ymm3,%ymm2,%ymm12
4743   196,65,61,20,209,                       //vunpcklpd     %ymm9,%ymm8,%ymm10
4744   196,65,61,21,201,                       //vunpckhpd     %ymm9,%ymm8,%ymm9
4745   196,65,37,20,196,                       //vunpcklpd     %ymm12,%ymm11,%ymm8
4746   196,65,37,21,220,                       //vunpckhpd     %ymm12,%ymm11,%ymm11
4747   72,133,201,                             //test          %rcx,%rcx
4748   117,55,                                 //jne           14d2 <_sk_store_f32_avx+0x6d>
       // Full path: put the 128-bit lanes in pixel order (0-1, 2-3, 4-5, 6-7) and store 4 x 32 bytes.
4749   196,67,45,24,225,1,                     //vinsertf128   $0x1,%xmm9,%ymm10,%ymm12
4750   196,67,61,24,235,1,                     //vinsertf128   $0x1,%xmm11,%ymm8,%ymm13
4751   196,67,45,6,201,49,                     //vperm2f128    $0x31,%ymm9,%ymm10,%ymm9
4752   196,67,61,6,195,49,                     //vperm2f128    $0x31,%ymm11,%ymm8,%ymm8
4753   196,65,125,17,36,128,                   //vmovupd       %ymm12,(%r8,%rax,4)
4754   196,65,125,17,108,128,32,               //vmovupd       %ymm13,0x20(%r8,%rax,4)
4755   196,65,125,17,76,128,64,                //vmovupd       %ymm9,0x40(%r8,%rax,4)
4756   196,65,125,17,68,128,96,                //vmovupd       %ymm8,0x60(%r8,%rax,4)
       // +0x69: common exit -- fetch the next program word and jump to it.
4757   72,173,                                 //lods          %ds:(%rsi),%rax
4758   255,224,                                //jmpq          *%rax
       // +0x6d: tail path (%rcx != 0) -- store one 16-byte pixel at a time.
       // The je branches reuse the flags of the preceding cmp; the vector stores leave flags untouched.
4759   196,65,121,17,20,128,                   //vmovupd       %xmm10,(%r8,%rax,4)
4760   72,131,249,1,                           //cmp           $0x1,%rcx
4761   116,240,                                //je            14ce <_sk_store_f32_avx+0x69>
4762   196,65,121,17,76,128,16,                //vmovupd       %xmm9,0x10(%r8,%rax,4)
4763   72,131,249,3,                           //cmp           $0x3,%rcx
4764   114,227,                                //jb            14ce <_sk_store_f32_avx+0x69>
4765   196,65,121,17,68,128,32,                //vmovupd       %xmm8,0x20(%r8,%rax,4)
4766   116,218,                                //je            14ce <_sk_store_f32_avx+0x69>
4767   196,65,121,17,92,128,48,                //vmovupd       %xmm11,0x30(%r8,%rax,4)
4768   72,131,249,5,                           //cmp           $0x5,%rcx
4769   114,205,                                //jb            14ce <_sk_store_f32_avx+0x69>
4770   196,67,125,25,84,128,64,1,              //vextractf128  $0x1,%ymm10,0x40(%r8,%rax,4)
4771   116,195,                                //je            14ce <_sk_store_f32_avx+0x69>
4772   196,67,125,25,76,128,80,1,              //vextractf128  $0x1,%ymm9,0x50(%r8,%rax,4)
4773   72,131,249,7,                           //cmp           $0x7,%rcx
4774   114,181,                                //jb            14ce <_sk_store_f32_avx+0x69>
4775   196,67,125,25,68,128,96,1,              //vextractf128  $0x1,%ymm8,0x60(%r8,%rax,4)
4776   235,171,                                //jmp           14ce <_sk_store_f32_avx+0x69>
4777 };
4778 
// sk_clamp_x_avx: machine code for a stage that clamps every lane of ymm0.
//   ymm0 = min(max(0, ymm0), L')
// where L is the 32-bit float at the address fetched by the first lods, and L'
// is L with its bit pattern decremented by one (integer add of -1, done per
// 128-bit half because the integer add is only used on xmm registers here).
// For a positive finite L that is the next representable float below L, which
// makes the upper bound exclusive.
// Uses ymm8..ymm10 as scratch, then loads the next program word and jumps to it.
4779 CODE const uint8_t sk_clamp_x_avx[] = {
4780   72,173,                                 //lods          %ds:(%rsi),%rax
4781   196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
4782   197,60,95,200,                          //vmaxps        %ymm0,%ymm8,%ymm9
4783   196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
4784   196,99,125,25,192,1,                    //vextractf128  $0x1,%ymm8,%xmm0
4785   196,65,41,118,210,                      //vpcmpeqd      %xmm10,%xmm10,%xmm10
4786   196,193,121,254,194,                    //vpaddd        %xmm10,%xmm0,%xmm0
4787   196,65,57,254,194,                      //vpaddd        %xmm10,%xmm8,%xmm8
4788   196,227,61,24,192,1,                    //vinsertf128   $0x1,%xmm0,%ymm8,%ymm0
4789   197,180,93,192,                         //vminps        %ymm0,%ymm9,%ymm0
4790   72,173,                                 //lods          %ds:(%rsi),%rax
4791   255,224,                                //jmpq          *%rax
4792 };
4793 
// sk_clamp_y_avx: same instruction sequence as sk_clamp_x_avx, applied to
// ymm1 instead of ymm0.
//   ymm1 = min(max(0, ymm1), L')
// where L is the 32-bit float at the address fetched by the first lods, and L'
// is L with its bit pattern decremented by one (for a positive finite L, the
// next representable float below L).
// Uses ymm8..ymm10 as scratch, then loads the next program word and jumps to it.
4794 CODE const uint8_t sk_clamp_y_avx[] = {
4795   72,173,                                 //lods          %ds:(%rsi),%rax
4796   196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
4797   197,60,95,201,                          //vmaxps        %ymm1,%ymm8,%ymm9
4798   196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
4799   196,99,125,25,193,1,                    //vextractf128  $0x1,%ymm8,%xmm1
4800   196,65,41,118,210,                      //vpcmpeqd      %xmm10,%xmm10,%xmm10
4801   196,193,113,254,202,                    //vpaddd        %xmm10,%xmm1,%xmm1
4802   196,65,57,254,194,                      //vpaddd        %xmm10,%xmm8,%xmm8
4803   196,227,61,24,201,1,                    //vinsertf128   $0x1,%xmm1,%ymm8,%ymm1
4804   197,180,93,201,                         //vminps        %ymm1,%ymm9,%ymm1
4805   72,173,                                 //lods          %ds:(%rsi),%rax
4806   255,224,                                //jmpq          *%rax
4807 };
4808 
// sk_repeat_x_avx: machine code for a stage that wraps every lane of ymm0
// into a repeating interval of length L.
//   t    = ymm0 - floor(ymm0 / L) * L      (vroundps $0x1 rounds toward -inf)
//   ymm0 = min(t, L')
// where L is the 32-bit float at the address fetched by the first lods, and L'
// is L with its bit pattern decremented by one (for a positive finite L, the
// next representable float below L).
// Uses ymm8..ymm10 as scratch, then loads the next program word and jumps to it.
4809 CODE const uint8_t sk_repeat_x_avx[] = {
4810   72,173,                                 //lods          %ds:(%rsi),%rax
4811   196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
4812   196,65,124,94,200,                      //vdivps        %ymm8,%ymm0,%ymm9
4813   196,67,125,8,201,1,                     //vroundps      $0x1,%ymm9,%ymm9
4814   196,65,52,89,200,                       //vmulps        %ymm8,%ymm9,%ymm9
4815   196,65,124,92,201,                      //vsubps        %ymm9,%ymm0,%ymm9
4816   196,99,125,25,192,1,                    //vextractf128  $0x1,%ymm8,%xmm0
4817   196,65,41,118,210,                      //vpcmpeqd      %xmm10,%xmm10,%xmm10
4818   196,193,121,254,194,                    //vpaddd        %xmm10,%xmm0,%xmm0
4819   196,65,57,254,194,                      //vpaddd        %xmm10,%xmm8,%xmm8
4820   196,227,61,24,192,1,                    //vinsertf128   $0x1,%xmm0,%ymm8,%ymm0
4821   197,180,93,192,                         //vminps        %ymm0,%ymm9,%ymm0
4822   72,173,                                 //lods          %ds:(%rsi),%rax
4823   255,224,                                //jmpq          *%rax
4824 };
4825 
// sk_repeat_y_avx: same instruction sequence as sk_repeat_x_avx, applied to
// ymm1 instead of ymm0.
//   t    = ymm1 - floor(ymm1 / L) * L      (vroundps $0x1 rounds toward -inf)
//   ymm1 = min(t, L')
// where L is the 32-bit float at the address fetched by the first lods, and L'
// is L with its bit pattern decremented by one (for a positive finite L, the
// next representable float below L).
// Uses ymm8..ymm10 as scratch, then loads the next program word and jumps to it.
4826 CODE const uint8_t sk_repeat_y_avx[] = {
4827   72,173,                                 //lods          %ds:(%rsi),%rax
4828   196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
4829   196,65,116,94,200,                      //vdivps        %ymm8,%ymm1,%ymm9
4830   196,67,125,8,201,1,                     //vroundps      $0x1,%ymm9,%ymm9
4831   196,65,52,89,200,                       //vmulps        %ymm8,%ymm9,%ymm9
4832   196,65,116,92,201,                      //vsubps        %ymm9,%ymm1,%ymm9
4833   196,99,125,25,193,1,                    //vextractf128  $0x1,%ymm8,%xmm1
4834   196,65,41,118,210,                      //vpcmpeqd      %xmm10,%xmm10,%xmm10
4835   196,193,113,254,202,                    //vpaddd        %xmm10,%xmm1,%xmm1
4836   196,65,57,254,194,                      //vpaddd        %xmm10,%xmm8,%xmm8
4837   196,227,61,24,201,1,                    //vinsertf128   $0x1,%xmm1,%ymm8,%ymm1
4838   197,180,93,201,                         //vminps        %ymm1,%ymm9,%ymm1
4839   72,173,                                 //lods          %ds:(%rsi),%rax
4840   255,224,                                //jmpq          *%rax
4841 };
4842 
// sk_mirror_x_avx: machine code for a stage that reflects every lane of ymm0
// back and forth across an interval of length L.
//   u    = ymm0 - L
//   v    = (u - floor(u / (2*L)) * (2*L)) - L   (vroundps $0x1 rounds toward -inf)
//   ymm0 = min(|v|, L')
// where L is the 32-bit float at the address fetched by the first lods, and L'
// is L with its bit pattern decremented by one (for a positive finite L, the
// next representable float below L).  |v| is formed as (0 - v) & v, which
// clears the sign bit.
// Uses ymm8..ymm10 as scratch, then loads the next program word and jumps to it.
4843 CODE const uint8_t sk_mirror_x_avx[] = {
4844   72,173,                                 //lods          %ds:(%rsi),%rax
4845   197,121,110,0,                          //vmovd         (%rax),%xmm8
4846   196,65,121,112,200,0,                   //vpshufd       $0x0,%xmm8,%xmm9
4847   196,67,53,24,201,1,                     //vinsertf128   $0x1,%xmm9,%ymm9,%ymm9
4848   196,65,124,92,209,                      //vsubps        %ymm9,%ymm0,%ymm10
4849   196,193,58,88,192,                      //vaddss        %xmm8,%xmm8,%xmm0
4850   196,227,121,4,192,0,                    //vpermilps     $0x0,%xmm0,%xmm0
4851   196,227,125,24,192,1,                   //vinsertf128   $0x1,%xmm0,%ymm0,%ymm0
4852   197,44,94,192,                          //vdivps        %ymm0,%ymm10,%ymm8
4853   196,67,125,8,192,1,                     //vroundps      $0x1,%ymm8,%ymm8
4854   197,188,89,192,                         //vmulps        %ymm0,%ymm8,%ymm0
4855   197,172,92,192,                         //vsubps        %ymm0,%ymm10,%ymm0
4856   196,193,124,92,193,                     //vsubps        %ymm9,%ymm0,%ymm0
4857   196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
4858   197,60,92,192,                          //vsubps        %ymm0,%ymm8,%ymm8
4859   197,60,84,192,                          //vandps        %ymm0,%ymm8,%ymm8
4860   196,99,125,25,200,1,                    //vextractf128  $0x1,%ymm9,%xmm0
4861   196,65,41,118,210,                      //vpcmpeqd      %xmm10,%xmm10,%xmm10
4862   196,193,121,254,194,                    //vpaddd        %xmm10,%xmm0,%xmm0
4863   196,65,49,254,202,                      //vpaddd        %xmm10,%xmm9,%xmm9
4864   196,227,53,24,192,1,                    //vinsertf128   $0x1,%xmm0,%ymm9,%ymm0
4865   197,188,93,192,                         //vminps        %ymm0,%ymm8,%ymm0
4866   72,173,                                 //lods          %ds:(%rsi),%rax
4867   255,224,                                //jmpq          *%rax
4868 };
4869 
// sk_mirror_y_avx: same instruction sequence as sk_mirror_x_avx, applied to
// ymm1 instead of ymm0.
//   u    = ymm1 - L
//   v    = (u - floor(u / (2*L)) * (2*L)) - L   (vroundps $0x1 rounds toward -inf)
//   ymm1 = min(|v|, L')
// where L is the 32-bit float at the address fetched by the first lods, and L'
// is L with its bit pattern decremented by one (for a positive finite L, the
// next representable float below L).  |v| is formed as (0 - v) & v, which
// clears the sign bit.
// Uses ymm8..ymm10 as scratch, then loads the next program word and jumps to it.
4870 CODE const uint8_t sk_mirror_y_avx[] = {
4871   72,173,                                 //lods          %ds:(%rsi),%rax
4872   197,121,110,0,                          //vmovd         (%rax),%xmm8
4873   196,65,121,112,200,0,                   //vpshufd       $0x0,%xmm8,%xmm9
4874   196,67,53,24,201,1,                     //vinsertf128   $0x1,%xmm9,%ymm9,%ymm9
4875   196,65,116,92,209,                      //vsubps        %ymm9,%ymm1,%ymm10
4876   196,193,58,88,200,                      //vaddss        %xmm8,%xmm8,%xmm1
4877   196,227,121,4,201,0,                    //vpermilps     $0x0,%xmm1,%xmm1
4878   196,227,117,24,201,1,                   //vinsertf128   $0x1,%xmm1,%ymm1,%ymm1
4879   197,44,94,193,                          //vdivps        %ymm1,%ymm10,%ymm8
4880   196,67,125,8,192,1,                     //vroundps      $0x1,%ymm8,%ymm8
4881   197,188,89,201,                         //vmulps        %ymm1,%ymm8,%ymm1
4882   197,172,92,201,                         //vsubps        %ymm1,%ymm10,%ymm1
4883   196,193,116,92,201,                     //vsubps        %ymm9,%ymm1,%ymm1
4884   196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
4885   197,60,92,193,                          //vsubps        %ymm1,%ymm8,%ymm8
4886   197,60,84,193,                          //vandps        %ymm1,%ymm8,%ymm8
4887   196,99,125,25,201,1,                    //vextractf128  $0x1,%ymm9,%xmm1
4888   196,65,41,118,210,                      //vpcmpeqd      %xmm10,%xmm10,%xmm10
4889   196,193,113,254,202,                    //vpaddd        %xmm10,%xmm1,%xmm1
4890   196,65,49,254,202,                      //vpaddd        %xmm10,%xmm9,%xmm9
4891   196,227,53,24,201,1,                    //vinsertf128   $0x1,%xmm1,%ymm9,%ymm1
4892   197,188,93,201,                         //vminps        %ymm1,%ymm8,%ymm1
4893   72,173,                                 //lods          %ds:(%rsi),%rax
4894   255,224,                                //jmpq          *%rax
4895 };
4896 
// sk_luminance_to_alpha_avx: machine code for a stage that replaces ymm3 with
// a weighted sum of ymm0, ymm1 and ymm2 and then zeroes those three registers.
//   ymm3 = ymm0 * c0 + ymm1 * c1 + ymm2 * c2
//   ymm0 = ymm1 = ymm2 = 0
// The weights are float immediates broadcast to all lanes:
//   c0 = 0x3e59b3d0 (~0.2126), c1 = 0x3f371759 (~0.7152), c2 = 0x3d93dd98 (~0.0722).
// The single lods only fetches the next program word, which the final jmpq
// jumps to; no other memory is read.
// NOTE(review): ymm0..ymm3 are presumably r,g,b,a -- confirm against the
// generator's source.
4897 CODE const uint8_t sk_luminance_to_alpha_avx[] = {
4898   184,208,179,89,62,                      //mov           $0x3e59b3d0,%eax
4899   197,249,110,216,                        //vmovd         %eax,%xmm3
4900   196,227,121,4,219,0,                    //vpermilps     $0x0,%xmm3,%xmm3
4901   196,227,101,24,219,1,                   //vinsertf128   $0x1,%xmm3,%ymm3,%ymm3
4902   197,228,89,192,                         //vmulps        %ymm0,%ymm3,%ymm0
4903   184,89,23,55,63,                        //mov           $0x3f371759,%eax
4904   197,249,110,216,                        //vmovd         %eax,%xmm3
4905   196,227,121,4,219,0,                    //vpermilps     $0x0,%xmm3,%xmm3
4906   196,227,101,24,219,1,                   //vinsertf128   $0x1,%xmm3,%ymm3,%ymm3
4907   197,228,89,201,                         //vmulps        %ymm1,%ymm3,%ymm1
4908   197,252,88,193,                         //vaddps        %ymm1,%ymm0,%ymm0
4909   184,152,221,147,61,                     //mov           $0x3d93dd98,%eax
4910   197,249,110,200,                        //vmovd         %eax,%xmm1
4911   196,227,121,4,201,0,                    //vpermilps     $0x0,%xmm1,%xmm1
4912   196,227,117,24,201,1,                   //vinsertf128   $0x1,%xmm1,%ymm1,%ymm1
4913   197,244,89,202,                         //vmulps        %ymm2,%ymm1,%ymm1
4914   197,252,88,217,                         //vaddps        %ymm1,%ymm0,%ymm3
4915   72,173,                                 //lods          %ds:(%rsi),%rax
4916   197,252,87,192,                         //vxorps        %ymm0,%ymm0,%ymm0
4917   197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
4918   197,236,87,210,                         //vxorps        %ymm2,%ymm2,%ymm2
4919   255,224,                                //jmpq          *%rax
4920 };
4921 
// sk_matrix_2x3_avx: machine code for a stage that applies an affine transform
// to (ymm0, ymm1).  With m[] the six 32-bit floats at the address fetched by
// the first lods (each broadcast to all lanes):
//   new ymm0 = m[0]*ymm0 + (m[2]*ymm1 + m[4])
//   new ymm1 = m[1]*ymm0 + (m[3]*ymm1 + m[5])
// Both results are computed from the original inputs; the first one is held
// in ymm8 until the end.  Uses ymm8..ymm11 as scratch, then jumps to the next
// program word.
4922 CODE const uint8_t sk_matrix_2x3_avx[] = {
4923   72,173,                                 //lods          %ds:(%rsi),%rax
4924   196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
4925   196,98,125,24,72,8,                     //vbroadcastss  0x8(%rax),%ymm9
4926   196,98,125,24,80,16,                    //vbroadcastss  0x10(%rax),%ymm10
4927   197,52,89,201,                          //vmulps        %ymm1,%ymm9,%ymm9
4928   196,65,52,88,202,                       //vaddps        %ymm10,%ymm9,%ymm9
4929   197,60,89,192,                          //vmulps        %ymm0,%ymm8,%ymm8
4930   196,65,60,88,193,                       //vaddps        %ymm9,%ymm8,%ymm8
4931   196,98,125,24,72,4,                     //vbroadcastss  0x4(%rax),%ymm9
4932   196,98,125,24,80,12,                    //vbroadcastss  0xc(%rax),%ymm10
4933   196,98,125,24,88,20,                    //vbroadcastss  0x14(%rax),%ymm11
4934   197,172,89,201,                         //vmulps        %ymm1,%ymm10,%ymm1
4935   196,193,116,88,203,                     //vaddps        %ymm11,%ymm1,%ymm1
4936   197,180,89,192,                         //vmulps        %ymm0,%ymm9,%ymm0
4937   197,252,88,201,                         //vaddps        %ymm1,%ymm0,%ymm1
4938   72,173,                                 //lods          %ds:(%rsi),%rax
4939   197,124,41,192,                         //vmovaps       %ymm8,%ymm0
4940   255,224,                                //jmpq          *%rax
4941 };
4942 
// sk_matrix_3x4_avx: machine code for a stage that applies an affine transform
// to (ymm0, ymm1, ymm2).  With m[] the twelve 32-bit floats at the address
// fetched by the first lods (each broadcast to all lanes):
//   new ymm0 = m[0]*ymm0 + (m[3]*ymm1 + (m[6]*ymm2 + m[ 9]))
//   new ymm1 = m[1]*ymm0 + (m[4]*ymm1 + (m[7]*ymm2 + m[10]))
//   new ymm2 = m[2]*ymm0 + (m[5]*ymm1 + (m[8]*ymm2 + m[11]))
// All three results are computed from the original inputs; the first two are
// held in ymm8/ymm9 until the end.  Uses ymm8..ymm13 as scratch, then jumps to
// the next program word.
4943 CODE const uint8_t sk_matrix_3x4_avx[] = {
4944   72,173,                                 //lods          %ds:(%rsi),%rax
4945   196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
4946   196,98,125,24,72,12,                    //vbroadcastss  0xc(%rax),%ymm9
4947   196,98,125,24,80,24,                    //vbroadcastss  0x18(%rax),%ymm10
4948   196,98,125,24,88,36,                    //vbroadcastss  0x24(%rax),%ymm11
4949   197,44,89,210,                          //vmulps        %ymm2,%ymm10,%ymm10
4950   196,65,44,88,211,                       //vaddps        %ymm11,%ymm10,%ymm10
4951   197,52,89,201,                          //vmulps        %ymm1,%ymm9,%ymm9
4952   196,65,52,88,202,                       //vaddps        %ymm10,%ymm9,%ymm9
4953   197,60,89,192,                          //vmulps        %ymm0,%ymm8,%ymm8
4954   196,65,60,88,193,                       //vaddps        %ymm9,%ymm8,%ymm8
4955   196,98,125,24,72,4,                     //vbroadcastss  0x4(%rax),%ymm9
4956   196,98,125,24,80,16,                    //vbroadcastss  0x10(%rax),%ymm10
4957   196,98,125,24,88,28,                    //vbroadcastss  0x1c(%rax),%ymm11
4958   196,98,125,24,96,40,                    //vbroadcastss  0x28(%rax),%ymm12
4959   197,36,89,218,                          //vmulps        %ymm2,%ymm11,%ymm11
4960   196,65,36,88,220,                       //vaddps        %ymm12,%ymm11,%ymm11
4961   197,44,89,209,                          //vmulps        %ymm1,%ymm10,%ymm10
4962   196,65,44,88,211,                       //vaddps        %ymm11,%ymm10,%ymm10
4963   197,52,89,200,                          //vmulps        %ymm0,%ymm9,%ymm9
4964   196,65,52,88,202,                       //vaddps        %ymm10,%ymm9,%ymm9
4965   196,98,125,24,80,8,                     //vbroadcastss  0x8(%rax),%ymm10
4966   196,98,125,24,88,20,                    //vbroadcastss  0x14(%rax),%ymm11
4967   196,98,125,24,96,32,                    //vbroadcastss  0x20(%rax),%ymm12
4968   196,98,125,24,104,44,                   //vbroadcastss  0x2c(%rax),%ymm13
4969   197,156,89,210,                         //vmulps        %ymm2,%ymm12,%ymm2
4970   196,193,108,88,213,                     //vaddps        %ymm13,%ymm2,%ymm2
4971   197,164,89,201,                         //vmulps        %ymm1,%ymm11,%ymm1
4972   197,244,88,202,                         //vaddps        %ymm2,%ymm1,%ymm1
4973   197,172,89,192,                         //vmulps        %ymm0,%ymm10,%ymm0
4974   197,252,88,209,                         //vaddps        %ymm1,%ymm0,%ymm2
4975   72,173,                                 //lods          %ds:(%rsi),%rax
4976   197,124,41,192,                         //vmovaps       %ymm8,%ymm0
4977   197,124,41,201,                         //vmovaps       %ymm9,%ymm1
4978   255,224,                                //jmpq          *%rax
4979 };
4980 
// sk_matrix_4x5_avx: machine code for an AVX stage that replaces the four
// vectors in %ymm0-%ymm3 with an affine combination of themselves.
// With f[i] = the float at byte offset 4*i from the pointer fetched by the
// first lods (each coefficient is broadcast to all 8 lanes):
//   ymm0' = f[0]*ymm0 + f[4]*ymm1 + f[8]*ymm2  + f[12]*ymm3 + f[16]
//   ymm1' = f[1]*ymm0 + f[5]*ymm1 + f[9]*ymm2  + f[13]*ymm3 + f[17]
//   ymm2' = f[2]*ymm0 + f[6]*ymm1 + f[10]*ymm2 + f[14]*ymm3 + f[18]
//   ymm3' = f[3]*ymm0 + f[7]*ymm1 + f[11]*ymm2 + f[15]*ymm3 + f[19]
// %ymm8-%ymm15 are used as scratch. The stage ends by jumping to the address
// fetched by the second lods.
// NOTE(review): presumably ymm0-3 carry r,g,b,a and f[] is a column-major
// 4x5 color matrix — confirm against the stage's C++ source.
4981 CODE const uint8_t sk_matrix_4x5_avx[] = {
       // rax = pointer to the 20 coefficients; rsi advances to the next slot.
4982   72,173,                                 //lods          %ds:(%rsi),%rax
       // Row 0 -> ymm8, accumulated from the constant term inward.
4983   196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
4984   196,98,125,24,72,16,                    //vbroadcastss  0x10(%rax),%ymm9
4985   196,98,125,24,80,32,                    //vbroadcastss  0x20(%rax),%ymm10
4986   196,98,125,24,88,48,                    //vbroadcastss  0x30(%rax),%ymm11
4987   196,98,125,24,96,64,                    //vbroadcastss  0x40(%rax),%ymm12
4988   197,36,89,219,                          //vmulps        %ymm3,%ymm11,%ymm11
4989   196,65,36,88,220,                       //vaddps        %ymm12,%ymm11,%ymm11
4990   197,44,89,210,                          //vmulps        %ymm2,%ymm10,%ymm10
4991   196,65,44,88,211,                       //vaddps        %ymm11,%ymm10,%ymm10
4992   197,52,89,201,                          //vmulps        %ymm1,%ymm9,%ymm9
4993   196,65,52,88,202,                       //vaddps        %ymm10,%ymm9,%ymm9
4994   197,60,89,192,                          //vmulps        %ymm0,%ymm8,%ymm8
4995   196,65,60,88,193,                       //vaddps        %ymm9,%ymm8,%ymm8
       // Row 1 -> ymm9.
4996   196,98,125,24,72,4,                     //vbroadcastss  0x4(%rax),%ymm9
4997   196,98,125,24,80,20,                    //vbroadcastss  0x14(%rax),%ymm10
4998   196,98,125,24,88,36,                    //vbroadcastss  0x24(%rax),%ymm11
4999   196,98,125,24,96,52,                    //vbroadcastss  0x34(%rax),%ymm12
5000   196,98,125,24,104,68,                   //vbroadcastss  0x44(%rax),%ymm13
5001   197,28,89,227,                          //vmulps        %ymm3,%ymm12,%ymm12
5002   196,65,28,88,229,                       //vaddps        %ymm13,%ymm12,%ymm12
5003   197,36,89,218,                          //vmulps        %ymm2,%ymm11,%ymm11
5004   196,65,36,88,220,                       //vaddps        %ymm12,%ymm11,%ymm11
5005   197,44,89,209,                          //vmulps        %ymm1,%ymm10,%ymm10
5006   196,65,44,88,211,                       //vaddps        %ymm11,%ymm10,%ymm10
5007   197,52,89,200,                          //vmulps        %ymm0,%ymm9,%ymm9
5008   196,65,52,88,202,                       //vaddps        %ymm10,%ymm9,%ymm9
       // Row 2 -> ymm10.
5009   196,98,125,24,80,8,                     //vbroadcastss  0x8(%rax),%ymm10
5010   196,98,125,24,88,24,                    //vbroadcastss  0x18(%rax),%ymm11
5011   196,98,125,24,96,40,                    //vbroadcastss  0x28(%rax),%ymm12
5012   196,98,125,24,104,56,                   //vbroadcastss  0x38(%rax),%ymm13
5013   196,98,125,24,112,72,                   //vbroadcastss  0x48(%rax),%ymm14
5014   197,20,89,235,                          //vmulps        %ymm3,%ymm13,%ymm13
5015   196,65,20,88,238,                       //vaddps        %ymm14,%ymm13,%ymm13
5016   197,28,89,226,                          //vmulps        %ymm2,%ymm12,%ymm12
5017   196,65,28,88,229,                       //vaddps        %ymm13,%ymm12,%ymm12
5018   197,36,89,217,                          //vmulps        %ymm1,%ymm11,%ymm11
5019   196,65,36,88,220,                       //vaddps        %ymm12,%ymm11,%ymm11
5020   197,44,89,208,                          //vmulps        %ymm0,%ymm10,%ymm10
5021   196,65,44,88,211,                       //vaddps        %ymm11,%ymm10,%ymm10
       // Row 3 is the last reader of the inputs, so it overwrites ymm3..ymm0
       // in place and leaves its result directly in ymm3.
5022   196,98,125,24,88,12,                    //vbroadcastss  0xc(%rax),%ymm11
5023   196,98,125,24,96,28,                    //vbroadcastss  0x1c(%rax),%ymm12
5024   196,98,125,24,104,44,                   //vbroadcastss  0x2c(%rax),%ymm13
5025   196,98,125,24,112,60,                   //vbroadcastss  0x3c(%rax),%ymm14
5026   196,98,125,24,120,76,                   //vbroadcastss  0x4c(%rax),%ymm15
5027   197,140,89,219,                         //vmulps        %ymm3,%ymm14,%ymm3
5028   196,193,100,88,223,                     //vaddps        %ymm15,%ymm3,%ymm3
5029   197,148,89,210,                         //vmulps        %ymm2,%ymm13,%ymm2
5030   197,236,88,211,                         //vaddps        %ymm3,%ymm2,%ymm2
5031   197,156,89,201,                         //vmulps        %ymm1,%ymm12,%ymm1
5032   197,244,88,202,                         //vaddps        %ymm2,%ymm1,%ymm1
5033   197,164,89,192,                         //vmulps        %ymm0,%ymm11,%ymm0
5034   197,252,88,217,                         //vaddps        %ymm1,%ymm0,%ymm3
       // Fetch the next stage's address, move rows 0-2 into place, tail-jump.
5035   72,173,                                 //lods          %ds:(%rsi),%rax
5036   197,124,41,192,                         //vmovaps       %ymm8,%ymm0
5037   197,124,41,201,                         //vmovaps       %ymm9,%ymm1
5038   197,124,41,210,                         //vmovaps       %ymm10,%ymm2
5039   255,224,                                //jmpq          *%rax
5040 };
5041 
// sk_matrix_perspective_avx: machine code for an AVX stage that applies a
// 3x3 projective transform to the pair (%ymm0, %ymm1).
// With f[i] = the float at byte offset 4*i from the pointer fetched by the
// first lods (row-major access order: f[0..2], f[3..5], f[6..8]):
//   X = f[0]*ymm0 + f[1]*ymm1 + f[2]
//   Y = f[3]*ymm0 + f[4]*ymm1 + f[5]
//   W = f[6]*ymm0 + f[7]*ymm1 + f[8]
//   ymm0' = X * rcp(W),  ymm1' = Y * rcp(W)
// rcp is vrcpps, i.e. a hardware reciprocal approximation, not an exact
// division. %ymm8-%ymm12 are scratch; %ymm2 and %ymm3 are not touched.
5042 CODE const uint8_t sk_matrix_perspective_avx[] = {
       // rax = pointer to the 9 coefficients.
5043   72,173,                                 //lods          %ds:(%rsi),%rax
       // X -> ymm8.
5044   196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
5045   196,98,125,24,72,4,                     //vbroadcastss  0x4(%rax),%ymm9
5046   196,98,125,24,80,8,                     //vbroadcastss  0x8(%rax),%ymm10
5047   197,52,89,201,                          //vmulps        %ymm1,%ymm9,%ymm9
5048   196,65,52,88,202,                       //vaddps        %ymm10,%ymm9,%ymm9
5049   197,60,89,192,                          //vmulps        %ymm0,%ymm8,%ymm8
5050   196,65,60,88,193,                       //vaddps        %ymm9,%ymm8,%ymm8
       // Y -> ymm9.
5051   196,98,125,24,72,12,                    //vbroadcastss  0xc(%rax),%ymm9
5052   196,98,125,24,80,16,                    //vbroadcastss  0x10(%rax),%ymm10
5053   196,98,125,24,88,20,                    //vbroadcastss  0x14(%rax),%ymm11
5054   197,44,89,209,                          //vmulps        %ymm1,%ymm10,%ymm10
5055   196,65,44,88,211,                       //vaddps        %ymm11,%ymm10,%ymm10
5056   197,52,89,200,                          //vmulps        %ymm0,%ymm9,%ymm9
5057   196,65,52,88,202,                       //vaddps        %ymm10,%ymm9,%ymm9
       // W -> ymm0 (the inputs are dead after this, so they are overwritten).
5058   196,98,125,24,80,24,                    //vbroadcastss  0x18(%rax),%ymm10
5059   196,98,125,24,88,28,                    //vbroadcastss  0x1c(%rax),%ymm11
5060   196,98,125,24,96,32,                    //vbroadcastss  0x20(%rax),%ymm12
5061   197,164,89,201,                         //vmulps        %ymm1,%ymm11,%ymm1
5062   196,193,116,88,204,                     //vaddps        %ymm12,%ymm1,%ymm1
5063   197,172,89,192,                         //vmulps        %ymm0,%ymm10,%ymm0
5064   197,252,88,193,                         //vaddps        %ymm1,%ymm0,%ymm0
       // Perspective divide via approximate reciprocal of W.
5065   197,252,83,200,                         //vrcpps        %ymm0,%ymm1
5066   197,188,89,193,                         //vmulps        %ymm1,%ymm8,%ymm0
5067   197,180,89,201,                         //vmulps        %ymm1,%ymm9,%ymm1
       // Tail-jump to the next stage.
5068   72,173,                                 //lods          %ds:(%rsi),%rax
5069   255,224,                                //jmpq          *%rax
5070 };
5071 
// sk_linear_gradient_2stops_avx: machine code for an AVX stage that expands
// the single vector t = %ymm0 into four vectors by per-channel linear
// evaluation. With f[i] = the float at byte offset 4*i from the pointer
// fetched by the first lods:
//   ymm0' = f[0] + f[4]*t
//   ymm1' = f[1] + f[5]*t
//   ymm2' = f[2] + f[6]*t
//   ymm3' = f[3] + f[7]*t
// NOTE(review): presumably f[0..3] is the start color and f[4..7] the
// per-channel delta to the end color — confirm against the stage source.
5072 CODE const uint8_t sk_linear_gradient_2stops_avx[] = {
       // rax = pointer to the 8 coefficients.
5073   72,173,                                 //lods          %ds:(%rsi),%rax
       // Channel 0 goes to ymm8 because ymm0 (t) is still needed below.
5074   196,226,125,24,72,16,                   //vbroadcastss  0x10(%rax),%ymm1
5075   196,226,125,24,16,                      //vbroadcastss  (%rax),%ymm2
5076   197,244,89,200,                         //vmulps        %ymm0,%ymm1,%ymm1
5077   197,108,88,193,                         //vaddps        %ymm1,%ymm2,%ymm8
       // Channel 1 -> ymm1.
5078   196,226,125,24,72,20,                   //vbroadcastss  0x14(%rax),%ymm1
5079   196,226,125,24,80,4,                    //vbroadcastss  0x4(%rax),%ymm2
5080   197,244,89,200,                         //vmulps        %ymm0,%ymm1,%ymm1
5081   197,236,88,201,                         //vaddps        %ymm1,%ymm2,%ymm1
       // Channel 2 -> ymm2.
5082   196,226,125,24,80,24,                   //vbroadcastss  0x18(%rax),%ymm2
5083   196,226,125,24,88,8,                    //vbroadcastss  0x8(%rax),%ymm3
5084   197,236,89,208,                         //vmulps        %ymm0,%ymm2,%ymm2
5085   197,228,88,210,                         //vaddps        %ymm2,%ymm3,%ymm2
       // Channel 3 -> ymm3; this is the last use of t, so ymm0 is clobbered.
5086   196,226,125,24,88,28,                   //vbroadcastss  0x1c(%rax),%ymm3
5087   196,98,125,24,72,12,                    //vbroadcastss  0xc(%rax),%ymm9
5088   197,228,89,192,                         //vmulps        %ymm0,%ymm3,%ymm0
5089   197,180,88,216,                         //vaddps        %ymm0,%ymm9,%ymm3
       // Move channel 0 into place and tail-jump to the next stage.
5090   72,173,                                 //lods          %ds:(%rsi),%rax
5091   197,124,41,192,                         //vmovaps       %ymm8,%ymm0
5092   255,224,                                //jmpq          *%rax
5093 };
5094 
// sk_start_pipeline_sse41: machine code for the SSE4.1 driver loop that
// repeatedly calls the first stage of a pipeline, 4 units at a time.
// Inputs as read by the code:
//   %rdi = starting index x, %rsi = pointer to the stage list (its first
//   8-byte entry is the address that gets called), %rdx = value passed
//   through unchanged to each call, %rcx = limit.
// While x + 4 <= limit it zeroes %xmm0-%xmm7, calls the first stage with
// (%rdi = x, %rsi = stage list past the first entry, %rdx = original %rdx),
// then advances x by 4. Returns in %rax the first x for which
// x + 4 > limit (the original x if the loop never runs).
// %rbx and %r12-%r15 are saved and restored around the loop.
// NOTE(review): the register usage matches the System V AMD64 calling
// convention — confirm that a separate variant exists for Windows.
5095 CODE const uint8_t sk_start_pipeline_sse41[] = {
5096   65,87,                                  //push          %r15
5097   65,86,                                  //push          %r14
5098   65,85,                                  //push          %r13
5099   65,84,                                  //push          %r12
5100   83,                                     //push          %rbx
       // r15 = limit, r14 = pass-through arg, rbx = x.
5101   73,137,207,                             //mov           %rcx,%r15
5102   73,137,214,                             //mov           %rdx,%r14
5103   72,137,251,                             //mov           %rdi,%rbx
       // r12 = address of the first stage, r13 = stage list after that entry.
5104   72,173,                                 //lods          %ds:(%rsi),%rax
5105   73,137,196,                             //mov           %rax,%r12
5106   73,137,245,                             //mov           %rsi,%r13
       // Skip the loop entirely (returning x) unless x + 4 <= limit (unsigned).
5107   72,141,67,4,                            //lea           0x4(%rbx),%rax
5108   76,57,248,                              //cmp           %r15,%rax
5109   118,5,                                  //jbe           28 <_sk_start_pipeline_sse41+0x28>
5110   72,137,216,                             //mov           %rbx,%rax
5111   235,52,                                 //jmp           5c <_sk_start_pipeline_sse41+0x5c>
       // Loop body (offset 0x28): every iteration starts with xmm0-7 zeroed.
5112   15,87,192,                              //xorps         %xmm0,%xmm0
5113   15,87,201,                              //xorps         %xmm1,%xmm1
5114   15,87,210,                              //xorps         %xmm2,%xmm2
5115   15,87,219,                              //xorps         %xmm3,%xmm3
5116   15,87,228,                              //xorps         %xmm4,%xmm4
5117   15,87,237,                              //xorps         %xmm5,%xmm5
5118   15,87,246,                              //xorps         %xmm6,%xmm6
5119   15,87,255,                              //xorps         %xmm7,%xmm7
5120   72,137,223,                             //mov           %rbx,%rdi
5121   76,137,238,                             //mov           %r13,%rsi
5122   76,137,242,                             //mov           %r14,%rdx
5123   65,255,212,                             //callq         *%r12
       // rax = x + 4 (the next x, also the return value on exit). The compare
       // tests (x + 8) against the limit; the mov that follows does not alter
       // flags, so jbe still sees the result of that compare.
5124   72,141,67,4,                            //lea           0x4(%rbx),%rax
5125   72,131,195,8,                           //add           $0x8,%rbx
5126   76,57,251,                              //cmp           %r15,%rbx
5127   72,137,195,                             //mov           %rax,%rbx
5128   118,204,                                //jbe           28 <_sk_start_pipeline_sse41+0x28>
       // Exit (offset 0x5c): restore saved registers in reverse push order.
5129   91,                                     //pop           %rbx
5130   65,92,                                  //pop           %r12
5131   65,93,                                  //pop           %r13
5132   65,94,                                  //pop           %r14
5133   65,95,                                  //pop           %r15
5134   195,                                    //retq
5135 };
5136 
// sk_just_return_sse41: machine code consisting of a single retq. Unlike the
// other stages here it does not jump onward, so it returns to whoever made
// the most recent call.
// NOTE(review): presumably used as the final entry of a stage list so the
// chain returns to sk_start_pipeline_sse41 — confirm against the pipeline
// builder.
5137 CODE const uint8_t sk_just_return_sse41[] = {
5138   195,                                    //retq
5139 };
5140 
// sk_seed_shader_sse41: machine code for an SSE4.1 stage that initialises
// the working registers from the integer in %edi, the 4 floats at (%rdx) and
// the 32-bit integer pointed to by the first stage-list entry:
//   xmm0 = float(%edi) + 0.5 + (4 floats loaded from (%rdx))
//   xmm1 = float(int32 at the fetched pointer) + 0.5, in all lanes
//   xmm2 = 1.0 in all lanes
//   xmm3..xmm7 = 0
// Clobbers %ecx and %eax as scratch for the float constants.
// NOTE(review): presumably %edi is the x coordinate, (%rdx) holds per-lane
// offsets such as {0,1,2,3}, and the int32 is y — confirm against the stage
// source.
5141 CODE const uint8_t sk_seed_shader_sse41[] = {
       // rax = pointer to this stage's int32 argument.
5142   72,173,                                 //lods          %ds:(%rsi),%rax
       // xmm1 = float(edi) broadcast to all lanes.
5143   102,15,110,199,                         //movd          %edi,%xmm0
5144   102,15,112,192,0,                       //pshufd        $0x0,%xmm0,%xmm0
5145   15,91,200,                              //cvtdq2ps      %xmm0,%xmm1
       // xmm2 = 0.5f (bit pattern 0x3f000000) in all lanes.
5146   185,0,0,0,63,                           //mov           $0x3f000000,%ecx
5147   102,15,110,209,                         //movd          %ecx,%xmm2
5148   15,198,210,0,                           //shufps        $0x0,%xmm2,%xmm2
5149   15,88,202,                              //addps         %xmm2,%xmm1
       // Unaligned 16-byte load from (%rdx), added to float(edi) + 0.5.
5150   15,16,2,                                //movups        (%rdx),%xmm0
5151   15,88,193,                              //addps         %xmm1,%xmm0
       // xmm1 = float(int32 at (%rax)) + 0.5 in all lanes.
5152   102,15,110,8,                           //movd          (%rax),%xmm1
5153   102,15,112,201,0,                       //pshufd        $0x0,%xmm1,%xmm1
5154   15,91,201,                              //cvtdq2ps      %xmm1,%xmm1
5155   15,88,202,                              //addps         %xmm2,%xmm1
       // xmm2 = 1.0f (bit pattern 0x3f800000) in all lanes. %eax is free to
       // clobber here because the pointer in %rax has already been consumed.
5156   184,0,0,128,63,                         //mov           $0x3f800000,%eax
5157   102,15,110,208,                         //movd          %eax,%xmm2
5158   15,198,210,0,                           //shufps        $0x0,%xmm2,%xmm2
       // Fetch the next stage's address, zero xmm3-7, tail-jump.
5159   72,173,                                 //lods          %ds:(%rsi),%rax
5160   15,87,219,                              //xorps         %xmm3,%xmm3
5161   15,87,228,                              //xorps         %xmm4,%xmm4
5162   15,87,237,                              //xorps         %xmm5,%xmm5
5163   15,87,246,                              //xorps         %xmm6,%xmm6
5164   15,87,255,                              //xorps         %xmm7,%xmm7
5165   255,224,                                //jmpq          *%rax
5166 };
5167 
// sk_constant_color_sse41: machine code for an SSE4.1 stage that loads four
// floats from the pointer fetched by the first lods and splats each one
// across a register: xmm0 = float 0, xmm1 = float 1, xmm2 = float 2,
// xmm3 = float 3 (each replicated to all 4 lanes). The load is unaligned
// (movups), so the data needs no particular alignment.
5168 CODE const uint8_t sk_constant_color_sse41[] = {
5169   72,173,                                 //lods          %ds:(%rsi),%rax
       // xmm3 temporarily holds all four floats; it is splatted last.
5170   15,16,24,                               //movups        (%rax),%xmm3
5171   15,40,195,                              //movaps        %xmm3,%xmm0
5172   15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
5173   15,40,203,                              //movaps        %xmm3,%xmm1
5174   15,198,201,85,                          //shufps        $0x55,%xmm1,%xmm1
5175   15,40,211,                              //movaps        %xmm3,%xmm2
5176   15,198,210,170,                         //shufps        $0xaa,%xmm2,%xmm2
5177   15,198,219,255,                         //shufps        $0xff,%xmm3,%xmm3
       // Tail-jump to the next stage.
5178   72,173,                                 //lods          %ds:(%rsi),%rax
5179   255,224,                                //jmpq          *%rax
5180 };
5181 
// sk_clear_sse41: machine code for an SSE4.1 stage that zeroes %xmm0-%xmm3
// and jumps to the next stage. %xmm4-%xmm7 are left untouched.
5182 CODE const uint8_t sk_clear_sse41[] = {
       // The next stage's address is fetched up front; no argument is read.
5183   72,173,                                 //lods          %ds:(%rsi),%rax
5184   15,87,192,                              //xorps         %xmm0,%xmm0
5185   15,87,201,                              //xorps         %xmm1,%xmm1
5186   15,87,210,                              //xorps         %xmm2,%xmm2
5187   15,87,219,                              //xorps         %xmm3,%xmm3
5188   255,224,                                //jmpq          *%rax
5189 };
5190 
// sk_plus__sse41: machine code for an SSE4.1 stage that adds %xmm4-%xmm7
// lane-wise into %xmm0-%xmm3 (xmm0 += xmm4, ..., xmm3 += xmm7) and jumps to
// the next stage. No clamping is performed here.
// NOTE(review): presumably xmm0-3 are the source r,g,b,a and xmm4-7 the
// destination r,g,b,a — confirm against the stage source.
5191 CODE const uint8_t sk_plus__sse41[] = {
5192   15,88,196,                              //addps         %xmm4,%xmm0
5193   15,88,205,                              //addps         %xmm5,%xmm1
5194   15,88,214,                              //addps         %xmm6,%xmm2
5195   15,88,223,                              //addps         %xmm7,%xmm3
       // Tail-jump to the next stage.
5196   72,173,                                 //lods          %ds:(%rsi),%rax
5197   255,224,                                //jmpq          *%rax
5198 };
5199 
// sk_srcover_sse41: machine code for an SSE4.1 stage computing, lane-wise,
//   k = 1.0 - xmm3
//   xmm0 += k * xmm4;  xmm1 += k * xmm5;  xmm2 += k * xmm6;  xmm3 += k * xmm7
// using the value of xmm3 on entry for k. %xmm8/%xmm9 and %eax are scratch.
// NOTE(review): presumably xmm0-3 = source r,g,b,a and xmm4-7 = destination
// r,g,b,a, making this the src-over blend — confirm against the stage source.
5200 CODE const uint8_t sk_srcover_sse41[] = {
       // xmm8 = 1.0f (0x3f800000) in all lanes, then k = 1.0 - xmm3.
5201   184,0,0,128,63,                         //mov           $0x3f800000,%eax
5202   102,68,15,110,192,                      //movd          %eax,%xmm8
5203   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
5204   68,15,92,195,                           //subps         %xmm3,%xmm8
       // k is copied to xmm9 for each product so xmm8 keeps k intact.
5205   69,15,40,200,                           //movaps        %xmm8,%xmm9
5206   68,15,89,204,                           //mulps         %xmm4,%xmm9
5207   65,15,88,193,                           //addps         %xmm9,%xmm0
5208   69,15,40,200,                           //movaps        %xmm8,%xmm9
5209   68,15,89,205,                           //mulps         %xmm5,%xmm9
5210   65,15,88,201,                           //addps         %xmm9,%xmm1
5211   69,15,40,200,                           //movaps        %xmm8,%xmm9
5212   68,15,89,206,                           //mulps         %xmm6,%xmm9
5213   65,15,88,209,                           //addps         %xmm9,%xmm2
       // Last use of k, so the product is formed in xmm8 directly.
5214   68,15,89,199,                           //mulps         %xmm7,%xmm8
5215   65,15,88,216,                           //addps         %xmm8,%xmm3
       // Tail-jump to the next stage.
5216   72,173,                                 //lods          %ds:(%rsi),%rax
5217   255,224,                                //jmpq          *%rax
5218 };
5219 
// sk_dstover_sse41: machine code for an SSE4.1 stage computing, lane-wise,
//   k = 1.0 - xmm7
//   xmm0 = xmm0*k + xmm4;  xmm1 = xmm1*k + xmm5
//   xmm2 = xmm2*k + xmm6;  xmm3 = xmm3*k + xmm7
// %xmm4-%xmm7 are read but not modified; %xmm8 and %eax are scratch.
// NOTE(review): presumably xmm0-3 = source r,g,b,a and xmm4-7 = destination
// r,g,b,a, making this the dst-over blend — confirm against the stage source.
5220 CODE const uint8_t sk_dstover_sse41[] = {
       // xmm8 = 1.0f (0x3f800000) in all lanes, then k = 1.0 - xmm7.
5221   184,0,0,128,63,                         //mov           $0x3f800000,%eax
5222   102,68,15,110,192,                      //movd          %eax,%xmm8
5223   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
5224   68,15,92,199,                           //subps         %xmm7,%xmm8
5225   65,15,89,192,                           //mulps         %xmm8,%xmm0
5226   15,88,196,                              //addps         %xmm4,%xmm0
5227   65,15,89,200,                           //mulps         %xmm8,%xmm1
5228   15,88,205,                              //addps         %xmm5,%xmm1
5229   65,15,89,208,                           //mulps         %xmm8,%xmm2
5230   15,88,214,                              //addps         %xmm6,%xmm2
5231   65,15,89,216,                           //mulps         %xmm8,%xmm3
5232   15,88,223,                              //addps         %xmm7,%xmm3
       // Tail-jump to the next stage.
5233   72,173,                                 //lods          %ds:(%rsi),%rax
5234   255,224,                                //jmpq          *%rax
5235 };
5236 
// sk_clamp_0_sse41: machine code for an SSE4.1 stage that clamps each lane
// of %xmm0-%xmm3 from below at zero (xmmN = max(xmmN, 0)), then jumps to the
// next stage. %xmm8 is scratch.
5237 CODE const uint8_t sk_clamp_0_sse41[] = {
       // xmm8 = 0.0 in all lanes.
5238   69,15,87,192,                           //xorps         %xmm8,%xmm8
5239   65,15,95,192,                           //maxps         %xmm8,%xmm0
5240   65,15,95,200,                           //maxps         %xmm8,%xmm1
5241   65,15,95,208,                           //maxps         %xmm8,%xmm2
5242   65,15,95,216,                           //maxps         %xmm8,%xmm3
       // Tail-jump to the next stage.
5243   72,173,                                 //lods          %ds:(%rsi),%rax
5244   255,224,                                //jmpq          *%rax
5245 };
5246 
// sk_clamp_1_sse41: machine code for an SSE4.1 stage that clamps each lane
// of %xmm0-%xmm3 from above at 1.0 (xmmN = min(xmmN, 1.0)), then jumps to
// the next stage. %xmm8 and %eax are scratch.
5247 CODE const uint8_t sk_clamp_1_sse41[] = {
       // xmm8 = 1.0f (0x3f800000) in all lanes.
5248   184,0,0,128,63,                         //mov           $0x3f800000,%eax
5249   102,68,15,110,192,                      //movd          %eax,%xmm8
5250   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
5251   65,15,93,192,                           //minps         %xmm8,%xmm0
5252   65,15,93,200,                           //minps         %xmm8,%xmm1
5253   65,15,93,208,                           //minps         %xmm8,%xmm2
5254   65,15,93,216,                           //minps         %xmm8,%xmm3
       // Tail-jump to the next stage.
5255   72,173,                                 //lods          %ds:(%rsi),%rax
5256   255,224,                                //jmpq          *%rax
5257 };
5258 
// sk_clamp_a_sse41: machine code for an SSE4.1 stage that first clamps
// %xmm3 from above at 1.0, then clamps %xmm0-%xmm2 from above at the clamped
// %xmm3 (xmm3 = min(xmm3, 1.0); xmmN = min(xmmN, xmm3) for N = 0..2).
// %xmm8 and %eax are scratch.
// NOTE(review): presumably xmm3 is alpha and this keeps premultiplied color
// channels no greater than alpha — confirm against the stage source.
5259 CODE const uint8_t sk_clamp_a_sse41[] = {
       // xmm8 = 1.0f (0x3f800000) in all lanes.
5260   184,0,0,128,63,                         //mov           $0x3f800000,%eax
5261   102,68,15,110,192,                      //movd          %eax,%xmm8
5262   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
5263   65,15,93,216,                           //minps         %xmm8,%xmm3
5264   15,93,195,                              //minps         %xmm3,%xmm0
5265   15,93,203,                              //minps         %xmm3,%xmm1
5266   15,93,211,                              //minps         %xmm3,%xmm2
       // Tail-jump to the next stage.
5267   72,173,                                 //lods          %ds:(%rsi),%rax
5268   255,224,                                //jmpq          *%rax
5269 };
5270 
// sk_set_rgb_sse41: machine code for an SSE4.1 stage that reads three
// consecutive floats from the pointer fetched by the first lods and
// broadcasts them: xmm0 = float at +0, xmm1 = float at +4, xmm2 = float at
// +8 (each in all 4 lanes). %xmm3 is left unchanged.
5271 CODE const uint8_t sk_set_rgb_sse41[] = {
       // rax = pointer to the three floats.
5272   72,173,                                 //lods          %ds:(%rsi),%rax
5273   243,15,16,0,                            //movss         (%rax),%xmm0
5274   243,15,16,72,4,                         //movss         0x4(%rax),%xmm1
5275   15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
5276   15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
5277   243,15,16,80,8,                         //movss         0x8(%rax),%xmm2
5278   15,198,210,0,                           //shufps        $0x0,%xmm2,%xmm2
       // Tail-jump to the next stage.
5279   72,173,                                 //lods          %ds:(%rsi),%rax
5280   255,224,                                //jmpq          *%rax
5281 };
5282 
// sk_swap_rb_sse41: machine code for an SSE4.1 stage that exchanges %xmm0
// and %xmm2, using %xmm8 as the temporary, then jumps to the next stage.
// %xmm1 and %xmm3 are untouched.
// NOTE(review): presumably xmm0 = red and xmm2 = blue — confirm against the
// stage source.
5283 CODE const uint8_t sk_swap_rb_sse41[] = {
5284   68,15,40,192,                           //movaps        %xmm0,%xmm8
5285   72,173,                                 //lods          %ds:(%rsi),%rax
5286   15,40,194,                              //movaps        %xmm2,%xmm0
5287   65,15,40,208,                           //movaps        %xmm8,%xmm2
5288   255,224,                                //jmpq          *%rax
5289 };
5290 
// sk_swap_sse41: machine code for an SSE4.1 stage that exchanges the
// register group %xmm0-%xmm3 with %xmm4-%xmm7 (xmm0<->xmm4, xmm1<->xmm5,
// xmm2<->xmm6, xmm3<->xmm7), using %xmm8-%xmm11 as temporaries, then jumps
// to the next stage.
5291 CODE const uint8_t sk_swap_sse41[] = {
       // Stash xmm3..xmm0 in xmm8..xmm11 (note the reversed order).
5292   68,15,40,195,                           //movaps        %xmm3,%xmm8
5293   68,15,40,202,                           //movaps        %xmm2,%xmm9
5294   68,15,40,209,                           //movaps        %xmm1,%xmm10
5295   68,15,40,216,                           //movaps        %xmm0,%xmm11
5296   72,173,                                 //lods          %ds:(%rsi),%rax
       // xmm0-3 take the old xmm4-7.
5297   15,40,196,                              //movaps        %xmm4,%xmm0
5298   15,40,205,                              //movaps        %xmm5,%xmm1
5299   15,40,214,                              //movaps        %xmm6,%xmm2
5300   15,40,223,                              //movaps        %xmm7,%xmm3
       // xmm4-7 take the stashed old xmm0-3.
5301   65,15,40,227,                           //movaps        %xmm11,%xmm4
5302   65,15,40,234,                           //movaps        %xmm10,%xmm5
5303   65,15,40,241,                           //movaps        %xmm9,%xmm6
5304   65,15,40,248,                           //movaps        %xmm8,%xmm7
5305   255,224,                                //jmpq          *%rax
5306 };
5307 
// sk_move_src_dst_sse41: machine code for an SSE4.1 stage that copies
// %xmm0-%xmm3 into %xmm4-%xmm7 (xmm4 = xmm0, ..., xmm7 = xmm3) and jumps to
// the next stage. %xmm0-%xmm3 keep their values.
5308 CODE const uint8_t sk_move_src_dst_sse41[] = {
5309   72,173,                                 //lods          %ds:(%rsi),%rax
5310   15,40,224,                              //movaps        %xmm0,%xmm4
5311   15,40,233,                              //movaps        %xmm1,%xmm5
5312   15,40,242,                              //movaps        %xmm2,%xmm6
5313   15,40,251,                              //movaps        %xmm3,%xmm7
5314   255,224,                                //jmpq          *%rax
5315 };
5316 
// sk_move_dst_src_sse41: machine code for an SSE4.1 stage that copies
// %xmm4-%xmm7 into %xmm0-%xmm3 (xmm0 = xmm4, ..., xmm3 = xmm7) and jumps to
// the next stage. %xmm4-%xmm7 keep their values.
5317 CODE const uint8_t sk_move_dst_src_sse41[] = {
5318   72,173,                                 //lods          %ds:(%rsi),%rax
5319   15,40,196,                              //movaps        %xmm4,%xmm0
5320   15,40,205,                              //movaps        %xmm5,%xmm1
5321   15,40,214,                              //movaps        %xmm6,%xmm2
5322   15,40,223,                              //movaps        %xmm7,%xmm3
5323   255,224,                                //jmpq          *%rax
5324 };
5325 
// sk_premul_sse41: machine code for an SSE4.1 stage that multiplies
// %xmm0-%xmm2 lane-wise by %xmm3 and jumps to the next stage. %xmm3 itself
// is unchanged.
// NOTE(review): presumably xmm0-2 are r,g,b and xmm3 is alpha — confirm
// against the stage source.
5326 CODE const uint8_t sk_premul_sse41[] = {
5327   15,89,195,                              //mulps         %xmm3,%xmm0
5328   15,89,203,                              //mulps         %xmm3,%xmm1
5329   15,89,211,                              //mulps         %xmm3,%xmm2
       // Tail-jump to the next stage.
5330   72,173,                                 //lods          %ds:(%rsi),%rax
5331   255,224,                                //jmpq          *%rax
5332 };
5333 
// sk_unpremul_sse41: machine code for an SSE4.1 stage that scales
// %xmm0-%xmm2 lane-wise by s, where s = 1.0 / xmm3 in lanes where xmm3
// compares not-equal to 0 and s = 0 in lanes where xmm3 == 0. The mask
// avoids multiplying by the infinity that 1.0/0 would produce. %xmm3 is
// unchanged; %xmm8, %xmm9 and %eax are scratch.
// NOTE(review): presumably xmm0-2 are premultiplied r,g,b and xmm3 is
// alpha — confirm against the stage source.
5334 CODE const uint8_t sk_unpremul_sse41[] = {
       // xmm8 = 0.0; xmm9 = 1.0f (0x3f800000) in all lanes.
5335   69,15,87,192,                           //xorps         %xmm8,%xmm8
5336   184,0,0,128,63,                         //mov           $0x3f800000,%eax
5337   102,68,15,110,200,                      //movd          %eax,%xmm9
5338   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
       // xmm9 = 1.0 / xmm3 (full-precision divide, not rcpps).
5339   68,15,94,203,                           //divps         %xmm3,%xmm9
       // xmm8 = all-ones mask where 0 != xmm3, then mask & (1/xmm3).
5340   68,15,194,195,4,                        //cmpneqps      %xmm3,%xmm8
5341   69,15,84,193,                           //andps         %xmm9,%xmm8
5342   65,15,89,192,                           //mulps         %xmm8,%xmm0
5343   65,15,89,200,                           //mulps         %xmm8,%xmm1
5344   65,15,89,208,                           //mulps         %xmm8,%xmm2
       // Tail-jump to the next stage.
5345   72,173,                                 //lods          %ds:(%rsi),%rax
5346   255,224,                                //jmpq          *%rax
5347 };
5348 
// sk_from_srgb_sse41: machine code for an SSE4.1 stage that applies the same
// piecewise function to every lane of %xmm0, %xmm1 and %xmm2:
//   lo(x) = A * x
//   hi(x) = (B * x + C) * x*x + D
//   f(x)  = (x < T) ? lo(x) : hi(x)
// with the constants given as IEEE-754 single bit patterns:
//   A = 0x3d9e8391 (approximately 0.0774, i.e. about 1/12.92)
//   B = 0x3e99999a (approximately 0.3)
//   C = 0x3f328f5c (approximately 0.6975)
//   D = 0x3b23d70a (approximately 0.0025)
//   T = 0x3d6147ae (approximately 0.055)
// %xmm3 is left unchanged. %xmm8-%xmm15 and %eax are scratch. blendvps uses
// %xmm0 as its implicit mask register, which is why each comparison result
// is moved into %xmm0 before blending and why the first channel's result is
// kept in %xmm9 until the end.
// NOTE(review): presumably this is a polynomial approximation of the
// sRGB-to-linear transfer curve applied to r,g,b — confirm against the
// stage source.
5349 CODE const uint8_t sk_from_srgb_sse41[] = {
       // xmm11 = A in all lanes; xmm10 = lo(xmm0).
5350   184,145,131,158,61,                     //mov           $0x3d9e8391,%eax
5351   102,68,15,110,216,                      //movd          %eax,%xmm11
5352   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
5353   69,15,40,211,                           //movaps        %xmm11,%xmm10
5354   68,15,89,208,                           //mulps         %xmm0,%xmm10
       // xmm14 = xmm0 squared.
5355   68,15,40,240,                           //movaps        %xmm0,%xmm14
5356   69,15,89,246,                           //mulps         %xmm14,%xmm14
       // xmm8 = B, xmm12 = C (both kept for the other two channels).
5357   184,154,153,153,62,                     //mov           $0x3e99999a,%eax
5358   102,68,15,110,192,                      //movd          %eax,%xmm8
5359   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
5360   184,92,143,50,63,                       //mov           $0x3f328f5c,%eax
5361   102,68,15,110,224,                      //movd          %eax,%xmm12
5362   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
       // xmm9 = B*xmm0 + C.
5363   69,15,40,200,                           //movaps        %xmm8,%xmm9
5364   68,15,89,200,                           //mulps         %xmm0,%xmm9
5365   69,15,88,204,                           //addps         %xmm12,%xmm9
       // xmm13 = D; xmm9 = hi(xmm0).
5366   184,10,215,35,59,                       //mov           $0x3b23d70a,%eax
5367   102,68,15,110,232,                      //movd          %eax,%xmm13
5368   69,15,198,237,0,                        //shufps        $0x0,%xmm13,%xmm13
5369   69,15,89,206,                           //mulps         %xmm14,%xmm9
5370   69,15,88,205,                           //addps         %xmm13,%xmm9
       // xmm14 is reused for T; xmm0 becomes the (x < T) mask and
       // xmm9 = mask ? lo : hi.
5371   184,174,71,97,61,                       //mov           $0x3d6147ae,%eax
5372   102,68,15,110,240,                      //movd          %eax,%xmm14
5373   69,15,198,246,0,                        //shufps        $0x0,%xmm14,%xmm14
5374   65,15,194,198,1,                        //cmpltps       %xmm14,%xmm0
5375   102,69,15,56,20,202,                    //blendvps      %xmm0,%xmm10,%xmm9
       // Second channel: xmm15 = lo(xmm1), xmm10 = hi(xmm1), blended into xmm10.
5376   69,15,40,251,                           //movaps        %xmm11,%xmm15
5377   68,15,89,249,                           //mulps         %xmm1,%xmm15
5378   15,40,193,                              //movaps        %xmm1,%xmm0
5379   15,89,192,                              //mulps         %xmm0,%xmm0
5380   69,15,40,208,                           //movaps        %xmm8,%xmm10
5381   68,15,89,209,                           //mulps         %xmm1,%xmm10
5382   69,15,88,212,                           //addps         %xmm12,%xmm10
5383   68,15,89,208,                           //mulps         %xmm0,%xmm10
5384   69,15,88,213,                           //addps         %xmm13,%xmm10
5385   65,15,194,206,1,                        //cmpltps       %xmm14,%xmm1
5386   15,40,193,                              //movaps        %xmm1,%xmm0
5387   102,69,15,56,20,215,                    //blendvps      %xmm0,%xmm15,%xmm10
       // Third channel: A and B are no longer needed afterwards, so xmm11 and
       // xmm8 are overwritten in place with lo(xmm2) and hi(xmm2).
5388   68,15,89,218,                           //mulps         %xmm2,%xmm11
5389   15,40,194,                              //movaps        %xmm2,%xmm0
5390   15,89,192,                              //mulps         %xmm0,%xmm0
5391   68,15,89,194,                           //mulps         %xmm2,%xmm8
5392   69,15,88,196,                           //addps         %xmm12,%xmm8
5393   68,15,89,192,                           //mulps         %xmm0,%xmm8
5394   69,15,88,197,                           //addps         %xmm13,%xmm8
5395   65,15,194,214,1,                        //cmpltps       %xmm14,%xmm2
5396   15,40,194,                              //movaps        %xmm2,%xmm0
5397   102,69,15,56,20,195,                    //blendvps      %xmm0,%xmm11,%xmm8
       // Move the three results into xmm0-2 and tail-jump to the next stage.
5398   72,173,                                 //lods          %ds:(%rsi),%rax
5399   65,15,40,193,                           //movaps        %xmm9,%xmm0
5400   65,15,40,202,                           //movaps        %xmm10,%xmm1
5401   65,15,40,208,                           //movaps        %xmm8,%xmm2
5402   255,224,                                //jmpq          *%rax
5403 };
5404 
5405 CODE const uint8_t sk_to_srgb_sse41[] = {
5406   72,131,236,24,                          //sub           $0x18,%rsp
5407   15,41,60,36,                            //movaps        %xmm7,(%rsp)
5408   15,40,254,                              //movaps        %xmm6,%xmm7
5409   15,40,245,                              //movaps        %xmm5,%xmm6
5410   15,40,236,                              //movaps        %xmm4,%xmm5
5411   15,40,227,                              //movaps        %xmm3,%xmm4
5412   15,40,218,                              //movaps        %xmm2,%xmm3
5413   15,40,209,                              //movaps        %xmm1,%xmm2
5414   68,15,82,192,                           //rsqrtps       %xmm0,%xmm8
5415   69,15,83,200,                           //rcpps         %xmm8,%xmm9
5416   69,15,82,248,                           //rsqrtps       %xmm8,%xmm15
5417   184,41,92,71,65,                        //mov           $0x41475c29,%eax
5418   102,68,15,110,216,                      //movd          %eax,%xmm11
5419   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
5420   69,15,40,211,                           //movaps        %xmm11,%xmm10
5421   68,15,89,208,                           //mulps         %xmm0,%xmm10
5422   184,0,0,128,63,                         //mov           $0x3f800000,%eax
5423   102,68,15,110,192,                      //movd          %eax,%xmm8
5424   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
5425   184,194,135,210,62,                     //mov           $0x3ed287c2,%eax
5426   102,68,15,110,224,                      //movd          %eax,%xmm12
5427   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
5428   184,206,111,48,63,                      //mov           $0x3f306fce,%eax
5429   102,68,15,110,232,                      //movd          %eax,%xmm13
5430   69,15,198,237,0,                        //shufps        $0x0,%xmm13,%xmm13
5431   184,168,87,202,61,                      //mov           $0x3dca57a8,%eax
5432   53,0,0,0,128,                           //xor           $0x80000000,%eax
5433   102,68,15,110,240,                      //movd          %eax,%xmm14
5434   69,15,198,246,0,                        //shufps        $0x0,%xmm14,%xmm14
5435   69,15,89,205,                           //mulps         %xmm13,%xmm9
5436   69,15,88,206,                           //addps         %xmm14,%xmm9
5437   69,15,89,252,                           //mulps         %xmm12,%xmm15
5438   69,15,88,249,                           //addps         %xmm9,%xmm15
5439   69,15,40,200,                           //movaps        %xmm8,%xmm9
5440   69,15,93,207,                           //minps         %xmm15,%xmm9
5441   184,4,231,140,59,                       //mov           $0x3b8ce704,%eax
5442   102,68,15,110,248,                      //movd          %eax,%xmm15
5443   69,15,198,255,0,                        //shufps        $0x0,%xmm15,%xmm15
5444   65,15,194,199,1,                        //cmpltps       %xmm15,%xmm0
5445   102,69,15,56,20,202,                    //blendvps      %xmm0,%xmm10,%xmm9
5446   68,15,82,210,                           //rsqrtps       %xmm2,%xmm10
5447   65,15,83,194,                           //rcpps         %xmm10,%xmm0
5448   69,15,82,210,                           //rsqrtps       %xmm10,%xmm10
5449   65,15,89,197,                           //mulps         %xmm13,%xmm0
5450   65,15,88,198,                           //addps         %xmm14,%xmm0
5451   69,15,89,212,                           //mulps         %xmm12,%xmm10
5452   68,15,88,208,                           //addps         %xmm0,%xmm10
5453   65,15,40,200,                           //movaps        %xmm8,%xmm1
5454   65,15,93,202,                           //minps         %xmm10,%xmm1
5455   69,15,40,211,                           //movaps        %xmm11,%xmm10
5456   68,15,89,210,                           //mulps         %xmm2,%xmm10
5457   65,15,194,215,1,                        //cmpltps       %xmm15,%xmm2
5458   15,40,194,                              //movaps        %xmm2,%xmm0
5459   102,65,15,56,20,202,                    //blendvps      %xmm0,%xmm10,%xmm1
5460   15,82,195,                              //rsqrtps       %xmm3,%xmm0
5461   15,83,208,                              //rcpps         %xmm0,%xmm2
5462   65,15,89,213,                           //mulps         %xmm13,%xmm2
5463   65,15,88,214,                           //addps         %xmm14,%xmm2
5464   15,82,192,                              //rsqrtps       %xmm0,%xmm0
5465   65,15,89,196,                           //mulps         %xmm12,%xmm0
5466   15,88,194,                              //addps         %xmm2,%xmm0
5467   68,15,93,192,                           //minps         %xmm0,%xmm8
5468   68,15,89,219,                           //mulps         %xmm3,%xmm11
5469   65,15,194,223,1,                        //cmpltps       %xmm15,%xmm3
5470   15,40,195,                              //movaps        %xmm3,%xmm0
5471   102,69,15,56,20,195,                    //blendvps      %xmm0,%xmm11,%xmm8
5472   72,173,                                 //lods          %ds:(%rsi),%rax
5473   65,15,40,193,                           //movaps        %xmm9,%xmm0
5474   65,15,40,208,                           //movaps        %xmm8,%xmm2
5475   15,40,220,                              //movaps        %xmm4,%xmm3
5476   15,40,229,                              //movaps        %xmm5,%xmm4
5477   15,40,238,                              //movaps        %xmm6,%xmm5
5478   15,40,247,                              //movaps        %xmm7,%xmm6
5479   15,40,60,36,                            //movaps        (%rsp),%xmm7
5480   72,131,196,24,                          //add           $0x18,%rsp
5481   255,224,                                //jmpq          *%rax
5482 };
5483 
// sk_scale_1_float, SSE4.1 variant: multiplies xmm0..xmm3 by one scalar.
//   - The leading lods reads the 8-byte word at (%rsi) into %rax and advances
//     %rsi (by 8 when the direction flag is clear); that word is used as a
//     pointer to a single float.
//   - movss + shufps $0 broadcast that float to all four lanes of xmm8.
//   - The trailing lods/jmpq reads the next word at (%rsi) and jumps to it;
//     control leaves through that indirect jump, there is no ret.
5484 CODE const uint8_t sk_scale_1_float_sse41[] = {
5485   72,173,                                 //lods          %ds:(%rsi),%rax
5486   243,68,15,16,0,                         //movss         (%rax),%xmm8
5487   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
5488   65,15,89,192,                           //mulps         %xmm8,%xmm0
5489   65,15,89,200,                           //mulps         %xmm8,%xmm1
5490   65,15,89,208,                           //mulps         %xmm8,%xmm2
5491   65,15,89,216,                           //mulps         %xmm8,%xmm3
5492   72,173,                                 //lods          %ds:(%rsi),%rax
5493   255,224,                                //jmpq          *%rax
5494 };
5495 
// sk_scale_u8, SSE4.1 variant: multiplies xmm0..xmm3 by a per-lane factor
// read from a byte buffer.
//   - lods reads the next 8-byte word at (%rsi) into %rax (advancing %rsi);
//     the word is dereferenced once more to obtain the byte buffer pointer.
//   - pmovzxbd loads the 4 bytes at buffer + %rdi and zero-extends them to
//     four 32-bit lanes, which cvtdq2ps turns into floats.
//   - 0x3b808081 is the float closest to 1/255, so xmm9 = byte * (1/255).
//   - The trailing lods/jmpq fetches the next word at (%rsi) and jumps to it.
5496 CODE const uint8_t sk_scale_u8_sse41[] = {
5497   72,173,                                 //lods          %ds:(%rsi),%rax
5498   72,139,0,                               //mov           (%rax),%rax
5499   102,68,15,56,49,4,56,                   //pmovzxbd      (%rax,%rdi,1),%xmm8
5500   69,15,91,192,                           //cvtdq2ps      %xmm8,%xmm8
5501   184,129,128,128,59,                     //mov           $0x3b808081,%eax
5502   102,68,15,110,200,                      //movd          %eax,%xmm9
5503   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
5504   69,15,89,200,                           //mulps         %xmm8,%xmm9
5505   65,15,89,193,                           //mulps         %xmm9,%xmm0
5506   65,15,89,201,                           //mulps         %xmm9,%xmm1
5507   65,15,89,209,                           //mulps         %xmm9,%xmm2
5508   65,15,89,217,                           //mulps         %xmm9,%xmm3
5509   72,173,                                 //lods          %ds:(%rsi),%rax
5510   255,224,                                //jmpq          *%rax
5511 };
5512 
// sk_lerp_1_float, SSE4.1 variant: linear interpolation with one scalar t.
// For N = 0..3 it computes
//     xmmN = (xmmN - xmm(N+4)) * t + xmm(N+4)
// i.e. t == 0 yields xmm(N+4) and t == 1 yields the incoming xmmN.
//   - lods reads the next 8-byte word at (%rsi) into %rax (advancing %rsi);
//     it is used as a pointer to the float t, broadcast into xmm8.
//   - xmm4..xmm7 are only read, never written.
//   - NOTE(review): xmm0..3 / xmm4..7 look like source / destination colour
//     channels; that naming is not visible here, confirm against the stage
//     definitions.
//   - The trailing lods/jmpq fetches the next word at (%rsi) and jumps to it.
5513 CODE const uint8_t sk_lerp_1_float_sse41[] = {
5514   72,173,                                 //lods          %ds:(%rsi),%rax
5515   243,68,15,16,0,                         //movss         (%rax),%xmm8
5516   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
5517   15,92,196,                              //subps         %xmm4,%xmm0
5518   65,15,89,192,                           //mulps         %xmm8,%xmm0
5519   15,88,196,                              //addps         %xmm4,%xmm0
5520   15,92,205,                              //subps         %xmm5,%xmm1
5521   65,15,89,200,                           //mulps         %xmm8,%xmm1
5522   15,88,205,                              //addps         %xmm5,%xmm1
5523   15,92,214,                              //subps         %xmm6,%xmm2
5524   65,15,89,208,                           //mulps         %xmm8,%xmm2
5525   15,88,214,                              //addps         %xmm6,%xmm2
5526   15,92,223,                              //subps         %xmm7,%xmm3
5527   65,15,89,216,                           //mulps         %xmm8,%xmm3
5528   15,88,223,                              //addps         %xmm7,%xmm3
5529   72,173,                                 //lods          %ds:(%rsi),%rax
5530   255,224,                                //jmpq          *%rax
5531 };
5532 
// sk_lerp_u8, SSE4.1 variant: linear interpolation with a per-lane t read
// from a byte buffer. For N = 0..3 it computes
//     xmmN = (xmmN - xmm(N+4)) * t + xmm(N+4)
//   - lods reads the next 8-byte word at (%rsi) into %rax (advancing %rsi);
//     the word is dereferenced once more to obtain the byte buffer pointer.
//   - pmovzxbd loads the 4 bytes at buffer + %rdi, zero-extended to 32 bits;
//     after cvtdq2ps they are scaled by 0x3b808081 (the float closest to
//     1/255) to form t in xmm9.
//   - xmm4..xmm7 are only read, never written.
//   - The trailing lods/jmpq fetches the next word at (%rsi) and jumps to it.
5533 CODE const uint8_t sk_lerp_u8_sse41[] = {
5534   72,173,                                 //lods          %ds:(%rsi),%rax
5535   72,139,0,                               //mov           (%rax),%rax
5536   102,68,15,56,49,4,56,                   //pmovzxbd      (%rax,%rdi,1),%xmm8
5537   69,15,91,192,                           //cvtdq2ps      %xmm8,%xmm8
5538   184,129,128,128,59,                     //mov           $0x3b808081,%eax
5539   102,68,15,110,200,                      //movd          %eax,%xmm9
5540   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
5541   69,15,89,200,                           //mulps         %xmm8,%xmm9
5542   15,92,196,                              //subps         %xmm4,%xmm0
5543   65,15,89,193,                           //mulps         %xmm9,%xmm0
5544   15,88,196,                              //addps         %xmm4,%xmm0
5545   15,92,205,                              //subps         %xmm5,%xmm1
5546   65,15,89,201,                           //mulps         %xmm9,%xmm1
5547   15,88,205,                              //addps         %xmm5,%xmm1
5548   15,92,214,                              //subps         %xmm6,%xmm2
5549   65,15,89,209,                           //mulps         %xmm9,%xmm2
5550   15,88,214,                              //addps         %xmm6,%xmm2
5551   15,92,223,                              //subps         %xmm7,%xmm3
5552   65,15,89,217,                           //mulps         %xmm9,%xmm3
5553   15,88,223,                              //addps         %xmm7,%xmm3
5554   72,173,                                 //lods          %ds:(%rsi),%rax
5555   255,224,                                //jmpq          *%rax
5556 };
5557 
// sk_lerp_565, SSE4.1 variant: linear interpolation of xmm0..xmm2 towards
// xmm4..xmm6 using three per-lane factors unpacked from 16-bit values.
//   - lods reads the next 8-byte word at (%rsi) into %rax (advancing %rsi);
//     the word is dereferenced once more to obtain the 16-bit buffer pointer.
//   - pmovzxwd loads four 16-bit values at buffer + 2*%rdi into xmm8.
//   - Each bit field is masked in place (no shift), converted to float and
//     multiplied by the reciprocal of the field's maximum in-place value:
//         bits 0xf800 * 0x37842108 (~1/63488 = 1/(31<<11)) -> xmm10
//         bits 0x07e0 * 0x3a020821 (~1/2016  = 1/(63<<5))  -> xmm11
//         bits 0x001f * 0x3d042108 (~1/31)                 -> xmm3
//   - Then xmm0 = (xmm0-xmm4)*xmm10 + xmm4, xmm1 = (xmm1-xmm5)*xmm11 + xmm5,
//     xmm2 = (xmm2-xmm6)*xmm3 + xmm6.
//   - The incoming xmm3 is used as scratch and is finally set to 1.0f
//     (0x3f800000) in all lanes; xmm7 is not touched.
//   - The trailing lods/jmpq fetches the next word at (%rsi) and jumps to it.
5558 CODE const uint8_t sk_lerp_565_sse41[] = {
5559   72,173,                                 //lods          %ds:(%rsi),%rax
5560   72,139,0,                               //mov           (%rax),%rax
5561   102,68,15,56,51,4,120,                  //pmovzxwd      (%rax,%rdi,2),%xmm8
  // Field 0xf800 -> xmm10.
5562   184,0,248,0,0,                          //mov           $0xf800,%eax
5563   102,15,110,216,                         //movd          %eax,%xmm3
5564   102,15,112,219,0,                       //pshufd        $0x0,%xmm3,%xmm3
5565   102,65,15,219,216,                      //pand          %xmm8,%xmm3
5566   68,15,91,203,                           //cvtdq2ps      %xmm3,%xmm9
5567   184,8,33,132,55,                        //mov           $0x37842108,%eax
5568   102,68,15,110,208,                      //movd          %eax,%xmm10
5569   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
5570   69,15,89,209,                           //mulps         %xmm9,%xmm10
  // Field 0x07e0 -> xmm11.
5571   184,224,7,0,0,                          //mov           $0x7e0,%eax
5572   102,15,110,216,                         //movd          %eax,%xmm3
5573   102,15,112,219,0,                       //pshufd        $0x0,%xmm3,%xmm3
5574   102,65,15,219,216,                      //pand          %xmm8,%xmm3
5575   68,15,91,203,                           //cvtdq2ps      %xmm3,%xmm9
5576   184,33,8,2,58,                          //mov           $0x3a020821,%eax
5577   102,68,15,110,216,                      //movd          %eax,%xmm11
5578   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
5579   69,15,89,217,                           //mulps         %xmm9,%xmm11
  // Field 0x001f -> xmm3.
5580   184,31,0,0,0,                           //mov           $0x1f,%eax
5581   102,15,110,216,                         //movd          %eax,%xmm3
5582   102,15,112,219,0,                       //pshufd        $0x0,%xmm3,%xmm3
5583   102,65,15,219,216,                      //pand          %xmm8,%xmm3
5584   68,15,91,195,                           //cvtdq2ps      %xmm3,%xmm8
5585   184,8,33,4,61,                          //mov           $0x3d042108,%eax
5586   102,15,110,216,                         //movd          %eax,%xmm3
5587   15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
5588   65,15,89,216,                           //mulps         %xmm8,%xmm3
  // Interpolate xmm0..xmm2 towards xmm4..xmm6.
5589   15,92,196,                              //subps         %xmm4,%xmm0
5590   65,15,89,194,                           //mulps         %xmm10,%xmm0
5591   15,88,196,                              //addps         %xmm4,%xmm0
5592   15,92,205,                              //subps         %xmm5,%xmm1
5593   65,15,89,203,                           //mulps         %xmm11,%xmm1
5594   15,88,205,                              //addps         %xmm5,%xmm1
5595   15,92,214,                              //subps         %xmm6,%xmm2
5596   15,89,211,                              //mulps         %xmm3,%xmm2
5597   15,88,214,                              //addps         %xmm6,%xmm2
  // xmm3 = 1.0f in every lane.
5598   184,0,0,128,63,                         //mov           $0x3f800000,%eax
5599   102,15,110,216,                         //movd          %eax,%xmm3
5600   15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
5601   72,173,                                 //lods          %ds:(%rsi),%rax
5602   255,224,                                //jmpq          *%rax
5603 };
5604 
// sk_load_tables, SSE4.1 variant: loads four 32-bit pixels and maps three of
// their bytes through float lookup tables.
//   - lods reads the next 8-byte word at (%rsi) into %rax (advancing %rsi).
//     %rax then points at a record of four pointers:
//         0x00: pixel buffer   0x08: table for byte 0
//         0x10: table for byte 1   0x18: table for byte 2
//   - movdqu loads 16 bytes at pixels + 4*%rdi into xmm8.
//   - xmm0 = px & 0xff, xmm1 = (px >> 8) & 0xff, xmm2 = (px >> 16) & 0xff are
//     used as indices; SSE4.1 has no gather, so each lane is moved to a
//     general register and the floats are fetched with movss/insertps.
//   - xmm3 = float(px >> 24) * 0x3b808081 (the float closest to 1/255); the
//     top byte is not table-mapped.
//   - Scratch registers written: %rcx, %r8..%r11, xmm8. %rax is overwritten
//     mid-way (at the movq from xmm2) and reloaded by the final lods.
//   - The trailing lods/jmpq fetches the next word at (%rsi) and jumps to it.
5605 CODE const uint8_t sk_load_tables_sse41[] = {
5606   72,173,                                 //lods          %ds:(%rsi),%rax
5607   72,139,8,                               //mov           (%rax),%rcx
5608   76,139,64,8,                            //mov           0x8(%rax),%r8
5609   243,68,15,111,4,185,                    //movdqu        (%rcx,%rdi,4),%xmm8
  // Split the pixels into three byte-index vectors using a 0xff mask.
5610   185,255,0,0,0,                          //mov           $0xff,%ecx
5611   102,15,110,193,                         //movd          %ecx,%xmm0
5612   102,15,112,192,0,                       //pshufd        $0x0,%xmm0,%xmm0
5613   102,65,15,111,200,                      //movdqa        %xmm8,%xmm1
5614   102,15,114,209,8,                       //psrld         $0x8,%xmm1
5615   102,15,219,200,                         //pand          %xmm0,%xmm1
5616   102,65,15,111,208,                      //movdqa        %xmm8,%xmm2
5617   102,15,114,210,16,                      //psrld         $0x10,%xmm2
5618   102,15,219,208,                         //pand          %xmm0,%xmm2
5619   102,65,15,219,192,                      //pand          %xmm8,%xmm0
  // Lookup for byte 0 through the table in %r8: lanes 0/1 come from the low
  // quadword (%r10), lanes 2/3 from the high quadword (%rcx).
5620   102,72,15,58,22,193,1,                  //pextrq        $0x1,%xmm0,%rcx
5621   65,137,201,                             //mov           %ecx,%r9d
5622   72,193,233,32,                          //shr           $0x20,%rcx
5623   102,73,15,126,194,                      //movq          %xmm0,%r10
5624   69,137,211,                             //mov           %r10d,%r11d
5625   73,193,234,32,                          //shr           $0x20,%r10
5626   243,67,15,16,4,152,                     //movss         (%r8,%r11,4),%xmm0
5627   102,67,15,58,33,4,144,16,               //insertps      $0x10,(%r8,%r10,4),%xmm0
5628   102,67,15,58,33,4,136,32,               //insertps      $0x20,(%r8,%r9,4),%xmm0
5629   102,65,15,58,33,4,136,48,               //insertps      $0x30,(%r8,%rcx,4),%xmm0
  // Lookup for byte 1 through the table at 0x10(%rax). The shift by 0x1e turns
  // the upper 32-bit lane directly into a byte offset (index * 4); the lower
  // lane contributes nothing because it was masked to at most 0xff. xmm3 is
  // used as a temporary for lanes 2 and 3.
5630   76,139,64,16,                           //mov           0x10(%rax),%r8
5631   102,73,15,58,22,202,1,                  //pextrq        $0x1,%xmm1,%r10
5632   77,137,209,                             //mov           %r10,%r9
5633   73,193,233,32,                          //shr           $0x20,%r9
5634   102,72,15,126,201,                      //movq          %xmm1,%rcx
5635   65,137,203,                             //mov           %ecx,%r11d
5636   65,129,227,255,255,255,0,               //and           $0xffffff,%r11d
5637   72,193,233,30,                          //shr           $0x1e,%rcx
5638   65,129,226,255,255,255,0,               //and           $0xffffff,%r10d
5639   243,67,15,16,12,152,                    //movss         (%r8,%r11,4),%xmm1
5640   102,65,15,58,33,12,8,16,                //insertps      $0x10,(%r8,%rcx,1),%xmm1
5641   243,67,15,16,28,144,                    //movss         (%r8,%r10,4),%xmm3
5642   102,15,58,33,203,32,                    //insertps      $0x20,%xmm3,%xmm1
5643   243,67,15,16,28,136,                    //movss         (%r8,%r9,4),%xmm3
5644   102,15,58,33,203,48,                    //insertps      $0x30,%xmm3,%xmm1
  // Lookup for byte 2 through the table at 0x18(%rax); same scheme. %rax is
  // reused as a scratch register from the movq onwards.
5645   76,139,72,24,                           //mov           0x18(%rax),%r9
5646   102,72,15,58,22,209,1,                  //pextrq        $0x1,%xmm2,%rcx
5647   68,15,183,193,                          //movzwl        %cx,%r8d
5648   72,193,233,32,                          //shr           $0x20,%rcx
5649   102,72,15,126,208,                      //movq          %xmm2,%rax
5650   68,15,183,208,                          //movzwl        %ax,%r10d
5651   72,193,232,30,                          //shr           $0x1e,%rax
5652   243,67,15,16,20,145,                    //movss         (%r9,%r10,4),%xmm2
5653   102,65,15,58,33,20,1,16,                //insertps      $0x10,(%r9,%rax,1),%xmm2
5654   243,67,15,16,28,129,                    //movss         (%r9,%r8,4),%xmm3
5655   102,15,58,33,211,32,                    //insertps      $0x20,%xmm3,%xmm2
5656   243,65,15,16,28,137,                    //movss         (%r9,%rcx,4),%xmm3
5657   102,15,58,33,211,48,                    //insertps      $0x30,%xmm3,%xmm2
  // Top byte: xmm3 = float(px >> 24) * (1/255).
5658   102,65,15,114,208,24,                   //psrld         $0x18,%xmm8
5659   69,15,91,192,                           //cvtdq2ps      %xmm8,%xmm8
5660   184,129,128,128,59,                     //mov           $0x3b808081,%eax
5661   102,15,110,216,                         //movd          %eax,%xmm3
5662   15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
5663   65,15,89,216,                           //mulps         %xmm8,%xmm3
5664   72,173,                                 //lods          %ds:(%rsi),%rax
5665   255,224,                                //jmpq          *%rax
5666 };
5667 
// sk_load_a8, SSE4.1 variant: loads four bytes into xmm3 as floats scaled by
// 1/255 and zeroes xmm0..xmm2.
//   - lods reads the next 8-byte word at (%rsi) into %rax (advancing %rsi);
//     the word is dereferenced once more to obtain the byte buffer pointer.
//   - pmovzxbd loads the 4 bytes at buffer + %rdi zero-extended to 32 bits;
//     xmm0 is only a temporary for them and is cleared afterwards.
//   - 0x3b808081 is the float closest to 1/255.
//   - The second lods is issued before the xorps instructions; the jump to
//     the fetched word happens at the end.
5668 CODE const uint8_t sk_load_a8_sse41[] = {
5669   72,173,                                 //lods          %ds:(%rsi),%rax
5670   72,139,0,                               //mov           (%rax),%rax
5671   102,15,56,49,4,56,                      //pmovzxbd      (%rax,%rdi,1),%xmm0
5672   15,91,192,                              //cvtdq2ps      %xmm0,%xmm0
5673   184,129,128,128,59,                     //mov           $0x3b808081,%eax
5674   102,15,110,216,                         //movd          %eax,%xmm3
5675   15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
5676   15,89,216,                              //mulps         %xmm0,%xmm3
5677   72,173,                                 //lods          %ds:(%rsi),%rax
5678   15,87,192,                              //xorps         %xmm0,%xmm0
5679   15,87,201,                              //xorps         %xmm1,%xmm1
5680   15,87,210,                              //xorps         %xmm2,%xmm2
5681   255,224,                                //jmpq          *%rax
5682 };
5683 
// sk_store_a8, SSE4.1 variant: writes xmm3 as four bytes.
//   - lods reads the next 8-byte word at (%rsi) into %rax (advancing %rsi);
//     the word is dereferenced once more to obtain the byte buffer pointer.
//   - xmm8 = xmm3 * 255.0f (0x437f0000), converted to int32 by cvtps2dq
//     (rounding per the current MXCSR mode).
//   - packusdw then packuswb narrow 32 -> 16 -> 8 bits with unsigned
//     saturation, leaving the four result bytes in the low dword of xmm8.
//   - movd stores those 4 bytes at buffer + %rdi.
//   - xmm0..xmm7 are left unmodified; %rcx and xmm8 are scratch.
//   - The trailing lods/jmpq fetches the next word at (%rsi) and jumps to it.
5684 CODE const uint8_t sk_store_a8_sse41[] = {
5685   72,173,                                 //lods          %ds:(%rsi),%rax
5686   72,139,0,                               //mov           (%rax),%rax
5687   185,0,0,127,67,                         //mov           $0x437f0000,%ecx
5688   102,68,15,110,193,                      //movd          %ecx,%xmm8
5689   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
5690   68,15,89,195,                           //mulps         %xmm3,%xmm8
5691   102,69,15,91,192,                       //cvtps2dq      %xmm8,%xmm8
5692   102,69,15,56,43,192,                    //packusdw      %xmm8,%xmm8
5693   102,69,15,103,192,                      //packuswb      %xmm8,%xmm8
5694   102,68,15,126,4,56,                     //movd          %xmm8,(%rax,%rdi,1)
5695   72,173,                                 //lods          %ds:(%rsi),%rax
5696   255,224,                                //jmpq          *%rax
5697 };
5698 
// sk_load_565, SSE4.1 variant: unpacks four 16-bit values into three float
// vectors and sets xmm3 to 1.0f.
//   - lods reads the next 8-byte word at (%rsi) into %rax (advancing %rsi);
//     the word is dereferenced once more to obtain the 16-bit buffer pointer.
//   - pmovzxwd loads four 16-bit values at buffer + 2*%rdi into xmm2.
//   - Each bit field is masked in place (no shift), converted to float and
//     multiplied by the reciprocal of the field's maximum in-place value:
//         xmm0 = (px & 0xf800) * 0x37842108  (~1/63488 = 1/(31<<11))
//         xmm1 = (px & 0x07e0) * 0x3a020821  (~1/2016  = 1/(63<<5))
//         xmm2 = (px & 0x001f) * 0x3d042108  (~1/31)
//   - xmm3 = 1.0f (0x3f800000) in every lane.
//   - The trailing lods/jmpq fetches the next word at (%rsi) and jumps to it.
5699 CODE const uint8_t sk_load_565_sse41[] = {
5700   72,173,                                 //lods          %ds:(%rsi),%rax
5701   72,139,0,                               //mov           (%rax),%rax
5702   102,15,56,51,20,120,                    //pmovzxwd      (%rax,%rdi,2),%xmm2
  // Field 0xf800 -> xmm0.
5703   184,0,248,0,0,                          //mov           $0xf800,%eax
5704   102,15,110,192,                         //movd          %eax,%xmm0
5705   102,15,112,192,0,                       //pshufd        $0x0,%xmm0,%xmm0
5706   102,15,219,194,                         //pand          %xmm2,%xmm0
5707   15,91,200,                              //cvtdq2ps      %xmm0,%xmm1
5708   184,8,33,132,55,                        //mov           $0x37842108,%eax
5709   102,15,110,192,                         //movd          %eax,%xmm0
5710   15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
5711   15,89,193,                              //mulps         %xmm1,%xmm0
  // Field 0x07e0 -> xmm1.
5712   184,224,7,0,0,                          //mov           $0x7e0,%eax
5713   102,15,110,200,                         //movd          %eax,%xmm1
5714   102,15,112,201,0,                       //pshufd        $0x0,%xmm1,%xmm1
5715   102,15,219,202,                         //pand          %xmm2,%xmm1
5716   15,91,217,                              //cvtdq2ps      %xmm1,%xmm3
5717   184,33,8,2,58,                          //mov           $0x3a020821,%eax
5718   102,15,110,200,                         //movd          %eax,%xmm1
5719   15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
5720   15,89,203,                              //mulps         %xmm3,%xmm1
  // Field 0x001f -> xmm2 (xmm3 holds the masked integers temporarily).
5721   184,31,0,0,0,                           //mov           $0x1f,%eax
5722   102,15,110,216,                         //movd          %eax,%xmm3
5723   102,15,112,219,0,                       //pshufd        $0x0,%xmm3,%xmm3
5724   102,15,219,218,                         //pand          %xmm2,%xmm3
5725   15,91,219,                              //cvtdq2ps      %xmm3,%xmm3
5726   184,8,33,4,61,                          //mov           $0x3d042108,%eax
5727   102,15,110,208,                         //movd          %eax,%xmm2
5728   15,198,210,0,                           //shufps        $0x0,%xmm2,%xmm2
5729   15,89,211,                              //mulps         %xmm3,%xmm2
  // xmm3 = 1.0f in every lane.
5730   184,0,0,128,63,                         //mov           $0x3f800000,%eax
5731   102,15,110,216,                         //movd          %eax,%xmm3
5732   15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
5733   72,173,                                 //lods          %ds:(%rsi),%rax
5734   255,224,                                //jmpq          *%rax
5735 };
5736 
// sk_store_565, SSE4.1 variant: packs xmm0..xmm2 into four 16-bit values.
//   - lods reads the next 8-byte word at (%rsi) into %rax (advancing %rsi);
//     the word is dereferenced once more to obtain the 16-bit buffer pointer.
//   - Per lane: int(xmm0 * 31.0f) << 11 | int(xmm1 * 63.0f) << 5
//               | int(xmm2 * 31.0f)
//     where 0x41f80000 = 31.0f, 0x427c0000 = 63.0f and cvtps2dq rounds per
//     the current MXCSR mode.
//   - packusdw narrows to 16 bits with unsigned saturation; movq stores the
//     low 8 bytes at buffer + 2*%rdi.
//   - No clamping of the inputs happens here; NOTE(review): presumably the
//     values are already within [0,1] when this runs, confirm upstream.
//   - xmm3 is not read; xmm0..xmm7 are left unmodified.
//   - The trailing lods/jmpq fetches the next word at (%rsi) and jumps to it.
5737 CODE const uint8_t sk_store_565_sse41[] = {
5738   72,173,                                 //lods          %ds:(%rsi),%rax
5739   72,139,0,                               //mov           (%rax),%rax
5740   185,0,0,248,65,                         //mov           $0x41f80000,%ecx
5741   102,68,15,110,193,                      //movd          %ecx,%xmm8
5742   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
5743   69,15,40,200,                           //movaps        %xmm8,%xmm9
5744   68,15,89,200,                           //mulps         %xmm0,%xmm9
5745   102,69,15,91,201,                       //cvtps2dq      %xmm9,%xmm9
5746   102,65,15,114,241,11,                   //pslld         $0xb,%xmm9
5747   185,0,0,124,66,                         //mov           $0x427c0000,%ecx
5748   102,68,15,110,209,                      //movd          %ecx,%xmm10
5749   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
5750   68,15,89,209,                           //mulps         %xmm1,%xmm10
5751   102,69,15,91,210,                       //cvtps2dq      %xmm10,%xmm10
5752   102,65,15,114,242,5,                    //pslld         $0x5,%xmm10
5753   102,69,15,235,209,                      //por           %xmm9,%xmm10
5754   68,15,89,194,                           //mulps         %xmm2,%xmm8
5755   102,69,15,91,192,                       //cvtps2dq      %xmm8,%xmm8
5756   102,69,15,86,194,                       //orpd          %xmm10,%xmm8
5757   102,69,15,56,43,192,                    //packusdw      %xmm8,%xmm8
5758   102,68,15,214,4,120,                    //movq          %xmm8,(%rax,%rdi,2)
5759   72,173,                                 //lods          %ds:(%rsi),%rax
5760   255,224,                                //jmpq          *%rax
5761 };
5762 
// sk_load_8888, SSE4.1 variant: loads four 32-bit pixels and splits each
// into four floats scaled by 1/255.
//   - lods reads the next 8-byte word at (%rsi) into %rax (advancing %rsi);
//     the word is dereferenced once more to obtain the pixel buffer pointer.
//   - movdqu (unaligned) loads 16 bytes at buffer + 4*%rdi into xmm3.
//   - xmm0 = px & 0xff, xmm1 = (px >> 8) & 0xff, xmm2 = (px >> 16) & 0xff,
//     xmm3 = px >> 24; each is converted with cvtdq2ps and multiplied by
//     0x3b808081 (the float closest to 1/255), held in xmm8.
//   - The trailing lods/jmpq fetches the next word at (%rsi) and jumps to it.
5763 CODE const uint8_t sk_load_8888_sse41[] = {
5764   72,173,                                 //lods          %ds:(%rsi),%rax
5765   72,139,0,                               //mov           (%rax),%rax
5766   243,15,111,28,184,                      //movdqu        (%rax,%rdi,4),%xmm3
5767   184,255,0,0,0,                          //mov           $0xff,%eax
5768   102,15,110,192,                         //movd          %eax,%xmm0
5769   102,15,112,192,0,                       //pshufd        $0x0,%xmm0,%xmm0
5770   102,15,111,203,                         //movdqa        %xmm3,%xmm1
5771   102,15,114,209,8,                       //psrld         $0x8,%xmm1
5772   102,15,219,200,                         //pand          %xmm0,%xmm1
5773   102,15,111,211,                         //movdqa        %xmm3,%xmm2
5774   102,15,114,210,16,                      //psrld         $0x10,%xmm2
5775   102,15,219,208,                         //pand          %xmm0,%xmm2
5776   102,15,219,195,                         //pand          %xmm3,%xmm0
5777   15,91,192,                              //cvtdq2ps      %xmm0,%xmm0
5778   184,129,128,128,59,                     //mov           $0x3b808081,%eax
5779   102,68,15,110,192,                      //movd          %eax,%xmm8
5780   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
5781   65,15,89,192,                           //mulps         %xmm8,%xmm0
5782   15,91,201,                              //cvtdq2ps      %xmm1,%xmm1
5783   65,15,89,200,                           //mulps         %xmm8,%xmm1
5784   15,91,210,                              //cvtdq2ps      %xmm2,%xmm2
5785   65,15,89,208,                           //mulps         %xmm8,%xmm2
5786   102,15,114,211,24,                      //psrld         $0x18,%xmm3
5787   15,91,219,                              //cvtdq2ps      %xmm3,%xmm3
5788   65,15,89,216,                           //mulps         %xmm8,%xmm3
5789   72,173,                                 //lods          %ds:(%rsi),%rax
5790   255,224,                                //jmpq          *%rax
5791 };
5792 
// sk_store_8888, SSE4.1 variant: packs xmm0..xmm3 into four 32-bit pixels.
//   - lods reads the next 8-byte word at (%rsi) into %rax (advancing %rsi);
//     the word is dereferenced once more to obtain the pixel buffer pointer.
//   - Per lane: int(xmm0*255) | int(xmm1*255) << 8 | int(xmm2*255) << 16
//               | int(xmm3*255) << 24
//     with 255.0f = 0x437f0000 and cvtps2dq rounding per the current MXCSR
//     mode.
//   - movdqu (unaligned) stores the 16 bytes at buffer + 4*%rdi.
//   - The converted integers are OR-ed together without masking or clamping;
//     NOTE(review): presumably inputs are already within [0,1], confirm
//     upstream.
//   - xmm0..xmm7 are left unmodified; %rcx and xmm8..xmm10 are scratch.
//   - The trailing lods/jmpq fetches the next word at (%rsi) and jumps to it.
5793 CODE const uint8_t sk_store_8888_sse41[] = {
5794   72,173,                                 //lods          %ds:(%rsi),%rax
5795   72,139,0,                               //mov           (%rax),%rax
5796   185,0,0,127,67,                         //mov           $0x437f0000,%ecx
5797   102,68,15,110,193,                      //movd          %ecx,%xmm8
5798   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
5799   69,15,40,200,                           //movaps        %xmm8,%xmm9
5800   68,15,89,200,                           //mulps         %xmm0,%xmm9
5801   102,69,15,91,201,                       //cvtps2dq      %xmm9,%xmm9
5802   69,15,40,208,                           //movaps        %xmm8,%xmm10
5803   68,15,89,209,                           //mulps         %xmm1,%xmm10
5804   102,69,15,91,210,                       //cvtps2dq      %xmm10,%xmm10
5805   102,65,15,114,242,8,                    //pslld         $0x8,%xmm10
5806   102,69,15,235,209,                      //por           %xmm9,%xmm10
5807   69,15,40,200,                           //movaps        %xmm8,%xmm9
5808   68,15,89,202,                           //mulps         %xmm2,%xmm9
5809   102,69,15,91,201,                       //cvtps2dq      %xmm9,%xmm9
5810   102,65,15,114,241,16,                   //pslld         $0x10,%xmm9
5811   68,15,89,195,                           //mulps         %xmm3,%xmm8
5812   102,69,15,91,192,                       //cvtps2dq      %xmm8,%xmm8
5813   102,65,15,114,240,24,                   //pslld         $0x18,%xmm8
5814   102,69,15,235,193,                      //por           %xmm9,%xmm8
5815   102,69,15,235,194,                      //por           %xmm10,%xmm8
5816   243,68,15,127,4,184,                    //movdqu        %xmm8,(%rax,%rdi,4)
5817   72,173,                                 //lods          %ds:(%rsi),%rax
5818   255,224,                                //jmpq          *%rax
5819 };
5820 
// sk_load_f16, SSE4.1 variant: loads four pixels of four 16-bit values each
// (32 bytes) and expands them into four float vectors.
//   - lods reads the next 8-byte word at (%rsi) into %rax (advancing %rsi);
//     the word is dereferenced once more to obtain the buffer pointer.
//   - Two movdqu loads fetch 32 bytes at buffer + 8*%rdi.
//   - Two rounds of punpcklwd/punpckhwd transpose the data so that
//     xmm8 = [word 0 of pixels 0..3, word 1 of pixels 0..3] and
//     xmm2 = [word 2 of pixels 0..3, word 3 of pixels 0..3].
//   - 0x04000400 broadcasts the 16-bit value 0x0400. pcmpgtw is a signed
//     compare, so pcmpgtw + pandn zero every word that is below 0x0400 as a
//     signed 16-bit integer: small magnitudes and any word with bit 15 set.
//   - Each surviving word is zero-extended to 32 bits, shifted left by 13 and
//     multiplied as a float by 0x77800000 (2^112) to form the output float.
//   - Result: xmm0..xmm3 hold words 0..3 of the four pixels as floats.
//   - The trailing lods/jmpq fetches the next word at (%rsi) and jumps to it.
5821 CODE const uint8_t sk_load_f16_sse41[] = {
5822   72,173,                                 //lods          %ds:(%rsi),%rax
5823   72,139,0,                               //mov           (%rax),%rax
5824   243,15,111,4,248,                       //movdqu        (%rax,%rdi,8),%xmm0
5825   243,15,111,76,248,16,                   //movdqu        0x10(%rax,%rdi,8),%xmm1
  // Transpose 4 pixels x 4 words into per-word groups.
5826   102,15,111,208,                         //movdqa        %xmm0,%xmm2
5827   102,15,97,209,                          //punpcklwd     %xmm1,%xmm2
5828   102,15,105,193,                         //punpckhwd     %xmm1,%xmm0
5829   102,68,15,111,194,                      //movdqa        %xmm2,%xmm8
5830   102,68,15,97,192,                       //punpcklwd     %xmm0,%xmm8
5831   102,15,105,208,                         //punpckhwd     %xmm0,%xmm2
  // Zero every word that compares below 0x0400 (signed).
5832   184,0,4,0,4,                            //mov           $0x4000400,%eax
5833   102,15,110,192,                         //movd          %eax,%xmm0
5834   102,15,112,216,0,                       //pshufd        $0x0,%xmm0,%xmm3
5835   102,15,111,203,                         //movdqa        %xmm3,%xmm1
5836   102,65,15,101,200,                      //pcmpgtw       %xmm8,%xmm1
5837   102,65,15,223,200,                      //pandn         %xmm8,%xmm1
5838   102,15,101,218,                         //pcmpgtw       %xmm2,%xmm3
5839   102,15,223,218,                         //pandn         %xmm2,%xmm3
  // Widen to 32 bits, shift into float position, rescale by 2^112.
5840   102,15,56,51,193,                       //pmovzxwd      %xmm1,%xmm0
5841   102,15,114,240,13,                      //pslld         $0xd,%xmm0
5842   184,0,0,128,119,                        //mov           $0x77800000,%eax
5843   102,15,110,208,                         //movd          %eax,%xmm2
5844   102,68,15,112,194,0,                    //pshufd        $0x0,%xmm2,%xmm8
5845   65,15,89,192,                           //mulps         %xmm8,%xmm0
5846   102,69,15,239,201,                      //pxor          %xmm9,%xmm9
5847   102,65,15,105,201,                      //punpckhwd     %xmm9,%xmm1
5848   102,15,114,241,13,                      //pslld         $0xd,%xmm1
5849   65,15,89,200,                           //mulps         %xmm8,%xmm1
5850   102,15,56,51,211,                       //pmovzxwd      %xmm3,%xmm2
5851   102,15,114,242,13,                      //pslld         $0xd,%xmm2
5852   65,15,89,208,                           //mulps         %xmm8,%xmm2
5853   102,65,15,105,217,                      //punpckhwd     %xmm9,%xmm3
5854   102,15,114,243,13,                      //pslld         $0xd,%xmm3
5855   65,15,89,216,                           //mulps         %xmm8,%xmm3
5856   72,173,                                 //lods          %ds:(%rsi),%rax
5857   255,224,                                //jmpq          *%rax
5858 };
5859 
// sk_store_f16, SSE4.1 variant: converts xmm0..xmm3 to 16-bit values and
// writes four pixels of four 16-bit values each (32 bytes).
//   - lods reads the next 8-byte word at (%rsi) into %rax (advancing %rsi);
//     the word is dereferenced once more to obtain the buffer pointer.
//   - Each input is multiplied as a float by 0x07800000 (2^-112) and its bit
//     pattern is shifted right (logically) by 13.
//   - pslldq $2 + por place the xmm1 results in the upper 16 bits of each
//     32-bit lane next to the xmm0 results (likewise xmm3 next to xmm2).
//     punpckldq/punpckhdq then interleave the two pair-vectors so each
//     pixel's four 16-bit values are contiguous.
//   - No masking precedes the por. NOTE(review): this appears to rely on the
//     shifted values fitting in 16 bits (e.g. non-negative, in-range inputs);
//     confirm against the stage's contract.
//   - Two movdqu (unaligned) stores write pixels 0-1 at buffer + 8*%rdi and
//     pixels 2-3 at buffer + 8*%rdi + 16.
//   - xmm0..xmm7 are left unmodified; %rcx and xmm8..xmm11 are scratch.
//   - The trailing lods/jmpq fetches the next word at (%rsi) and jumps to it.
5860 CODE const uint8_t sk_store_f16_sse41[] = {
5861   72,173,                                 //lods          %ds:(%rsi),%rax
5862   72,139,0,                               //mov           (%rax),%rax
5863   185,0,0,128,7,                          //mov           $0x7800000,%ecx
5864   102,68,15,110,193,                      //movd          %ecx,%xmm8
5865   102,69,15,112,192,0,                    //pshufd        $0x0,%xmm8,%xmm8
5866   102,69,15,111,200,                      //movdqa        %xmm8,%xmm9
5867   68,15,89,200,                           //mulps         %xmm0,%xmm9
5868   102,65,15,114,209,13,                   //psrld         $0xd,%xmm9
5869   102,69,15,111,208,                      //movdqa        %xmm8,%xmm10
5870   68,15,89,209,                           //mulps         %xmm1,%xmm10
5871   102,65,15,114,210,13,                   //psrld         $0xd,%xmm10
5872   102,69,15,111,216,                      //movdqa        %xmm8,%xmm11
5873   68,15,89,218,                           //mulps         %xmm2,%xmm11
5874   102,65,15,114,211,13,                   //psrld         $0xd,%xmm11
5875   68,15,89,195,                           //mulps         %xmm3,%xmm8
5876   102,65,15,114,208,13,                   //psrld         $0xd,%xmm8
  // Pair up (xmm0,xmm1) and (xmm2,xmm3) results, then interleave per pixel.
5877   102,65,15,115,250,2,                    //pslldq        $0x2,%xmm10
5878   102,69,15,235,209,                      //por           %xmm9,%xmm10
5879   102,65,15,115,248,2,                    //pslldq        $0x2,%xmm8
5880   102,69,15,235,195,                      //por           %xmm11,%xmm8
5881   102,69,15,111,202,                      //movdqa        %xmm10,%xmm9
5882   102,69,15,98,200,                       //punpckldq     %xmm8,%xmm9
5883   243,68,15,127,12,248,                   //movdqu        %xmm9,(%rax,%rdi,8)
5884   102,69,15,106,208,                      //punpckhdq     %xmm8,%xmm10
5885   243,68,15,127,84,248,16,                //movdqu        %xmm10,0x10(%rax,%rdi,8)
5886   72,173,                                 //lods          %ds:(%rsi),%rax
5887   255,224,                                //jmpq          *%rax
5888 };
5889 
// sk_store_f32, SSE4.1 variant: writes xmm0..xmm3 as four pixels of four
// 32-bit floats each (64 bytes), interleaved per pixel.
//   - lods reads the next 8-byte word at (%rsi) into %rax (advancing %rsi);
//     the word is dereferenced once more to obtain the buffer pointer.
//   - %rcx = %rdi << 4 is the byte offset (16 bytes per pixel).
//   - unpcklps/unpckhps followed by unpcklpd/movhlps perform a 4x4 transpose:
//         xmm12 = lane 0 of xmm0..xmm3    xmm11 = lane 1 of xmm0..xmm3
//         xmm9  = lane 2 of xmm0..xmm3    xmm10 = lane 3 of xmm0..xmm3
//   - Four unaligned 16-byte stores write them at offsets 0, 16, 32 and 48
//     from buffer + %rcx. The movupd/movups mix is only an encoding
//     difference; both store 16 bytes.
//   - xmm0..xmm7 are left unmodified; %rcx and xmm8..xmm12 are scratch.
//   - The trailing lods/jmpq fetches the next word at (%rsi) and jumps to it.
5890 CODE const uint8_t sk_store_f32_sse41[] = {
5891   72,173,                                 //lods          %ds:(%rsi),%rax
5892   72,139,0,                               //mov           (%rax),%rax
5893   72,137,249,                             //mov           %rdi,%rcx
5894   72,193,225,4,                           //shl           $0x4,%rcx
5895   68,15,40,192,                           //movaps        %xmm0,%xmm8
5896   68,15,40,200,                           //movaps        %xmm0,%xmm9
5897   68,15,20,201,                           //unpcklps      %xmm1,%xmm9
5898   68,15,40,210,                           //movaps        %xmm2,%xmm10
5899   68,15,40,218,                           //movaps        %xmm2,%xmm11
5900   68,15,20,219,                           //unpcklps      %xmm3,%xmm11
5901   68,15,21,193,                           //unpckhps      %xmm1,%xmm8
5902   68,15,21,211,                           //unpckhps      %xmm3,%xmm10
5903   69,15,40,225,                           //movaps        %xmm9,%xmm12
5904   102,69,15,20,227,                       //unpcklpd      %xmm11,%xmm12
5905   69,15,18,217,                           //movhlps       %xmm9,%xmm11
5906   69,15,40,200,                           //movaps        %xmm8,%xmm9
5907   102,69,15,20,202,                       //unpcklpd      %xmm10,%xmm9
5908   69,15,18,208,                           //movhlps       %xmm8,%xmm10
5909   102,68,15,17,36,8,                      //movupd        %xmm12,(%rax,%rcx,1)
5910   68,15,17,92,8,16,                       //movups        %xmm11,0x10(%rax,%rcx,1)
5911   102,68,15,17,76,8,32,                   //movupd        %xmm9,0x20(%rax,%rcx,1)
5912   68,15,17,84,8,48,                       //movups        %xmm10,0x30(%rax,%rcx,1)
5913   72,173,                                 //lods          %ds:(%rsi),%rax
5914   255,224,                                //jmpq          *%rax
5915 };
5916 
// sk_clamp_x_sse41: clamps each lane of xmm0 to a lower bound of 0 and an upper
// bound derived from a float limit.
//   - The first lods reads an 8-byte word from the stream at %rsi; the float it
//     points to is the limit, broadcast to all four lanes of xmm9.
//   - pcmpeqd x,x yields all-ones (-1 per 32-bit lane); paddd then subtracts 1
//     from the limit's raw bit pattern. For a positive, finite limit that is the
//     next smaller representable float, so the upper bound is exclusive of limit.
//   - Result: xmm0 = min(max(0, xmm0), limit_bits - 1). xmm8/xmm9 are scratch.
//   - Ends by loading the next stream word and jumping to it.
5917 CODE const uint8_t sk_clamp_x_sse41[] = {
5918   72,173,                                 //lods          %ds:(%rsi),%rax
5919   69,15,87,192,                           //xorps         %xmm8,%xmm8
5920   68,15,95,192,                           //maxps         %xmm0,%xmm8
5921   243,68,15,16,8,                         //movss         (%rax),%xmm9
5922   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
5923   102,15,118,192,                         //pcmpeqd       %xmm0,%xmm0
5924   102,65,15,254,193,                      //paddd         %xmm9,%xmm0
5925   68,15,93,192,                           //minps         %xmm0,%xmm8
5926   72,173,                                 //lods          %ds:(%rsi),%rax
5927   65,15,40,192,                           //movaps        %xmm8,%xmm0
5928   255,224,                                //jmpq          *%rax
5929 };
5930 
// sk_clamp_y_sse41: same sequence as sk_clamp_x_sse41, applied to xmm1 instead
// of xmm0.
//   - The float pointed to by the first stream word is the limit, broadcast to
//     all four lanes of xmm9.
//   - pcmpeqd/paddd form (limit's raw bits - 1); for a positive, finite limit
//     that is the next smaller representable float.
//   - Result: xmm1 = min(max(0, xmm1), limit_bits - 1). xmm8/xmm9 are scratch.
//   - Ends by loading the next stream word and jumping to it.
5931 CODE const uint8_t sk_clamp_y_sse41[] = {
5932   72,173,                                 //lods          %ds:(%rsi),%rax
5933   69,15,87,192,                           //xorps         %xmm8,%xmm8
5934   68,15,95,193,                           //maxps         %xmm1,%xmm8
5935   243,68,15,16,8,                         //movss         (%rax),%xmm9
5936   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
5937   102,15,118,201,                         //pcmpeqd       %xmm1,%xmm1
5938   102,65,15,254,201,                      //paddd         %xmm9,%xmm1
5939   68,15,93,193,                           //minps         %xmm1,%xmm8
5940   72,173,                                 //lods          %ds:(%rsi),%rax
5941   65,15,40,200,                           //movaps        %xmm8,%xmm1
5942   255,224,                                //jmpq          *%rax
5943 };
5944 
// sk_repeat_x_sse41: reduces each lane of xmm0 modulo a float limit.
//   - The float pointed to by the first stream word is the limit, broadcast to
//     all four lanes of xmm8.
//   - roundps with immediate 1 rounds toward negative infinity (floor), so the
//     main step is xmm0 -= floor(xmm0 / limit) * limit.
//   - pcmpeqd/paddd form (limit's raw bits - 1); the final minps caps xmm0 at
//     that value, which for a positive, finite limit is the next smaller float.
//   - xmm8/xmm9 are scratch. Ends by loading the next stream word and jumping
//     to it.
5945 CODE const uint8_t sk_repeat_x_sse41[] = {
5946   72,173,                                 //lods          %ds:(%rsi),%rax
5947   243,68,15,16,0,                         //movss         (%rax),%xmm8
5948   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
5949   68,15,40,200,                           //movaps        %xmm0,%xmm9
5950   69,15,94,200,                           //divps         %xmm8,%xmm9
5951   102,69,15,58,8,201,1,                   //roundps       $0x1,%xmm9,%xmm9
5952   69,15,89,200,                           //mulps         %xmm8,%xmm9
5953   65,15,92,193,                           //subps         %xmm9,%xmm0
5954   102,69,15,118,201,                      //pcmpeqd       %xmm9,%xmm9
5955   102,69,15,254,200,                      //paddd         %xmm8,%xmm9
5956   65,15,93,193,                           //minps         %xmm9,%xmm0
5957   72,173,                                 //lods          %ds:(%rsi),%rax
5958   255,224,                                //jmpq          *%rax
5959 };
5960 
// sk_repeat_y_sse41: same sequence as sk_repeat_x_sse41, applied to xmm1
// instead of xmm0.
//   - The float pointed to by the first stream word is the limit, broadcast to
//     all four lanes of xmm8.
//   - roundps with immediate 1 is floor, so the main step is
//     xmm1 -= floor(xmm1 / limit) * limit.
//   - The final minps caps xmm1 at the float whose raw bits are (limit's bits
//     - 1), i.e. the next smaller float for a positive, finite limit.
//   - xmm8/xmm9 are scratch. Ends by loading the next stream word and jumping
//     to it.
5961 CODE const uint8_t sk_repeat_y_sse41[] = {
5962   72,173,                                 //lods          %ds:(%rsi),%rax
5963   243,68,15,16,0,                         //movss         (%rax),%xmm8
5964   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
5965   68,15,40,201,                           //movaps        %xmm1,%xmm9
5966   69,15,94,200,                           //divps         %xmm8,%xmm9
5967   102,69,15,58,8,201,1,                   //roundps       $0x1,%xmm9,%xmm9
5968   69,15,89,200,                           //mulps         %xmm8,%xmm9
5969   65,15,92,201,                           //subps         %xmm9,%xmm1
5970   102,69,15,118,201,                      //pcmpeqd       %xmm9,%xmm9
5971   102,69,15,254,200,                      //paddd         %xmm8,%xmm9
5972   65,15,93,201,                           //minps         %xmm9,%xmm1
5973   72,173,                                 //lods          %ds:(%rsi),%rax
5974   255,224,                                //jmpq          *%rax
5975 };
5976 
// sk_mirror_x_sse41: folds each lane of xmm0 back and forth across a float
// limit L (the float pointed to by the first stream word).
// Step by step, per lane, with v = xmm0:
//   v  = v - L
//   v  = v - floor(v / 2L) * 2L      (roundps immediate 1 is floor)
//   v  = v - L
//   v  = v & (0 - v)                 (v and -v differ only in the sign bit,
//                                     so the AND clears it: v = |v|)
//   v  = min(v, float with raw bits (L's bits - 1))
// xmm9 keeps L broadcast for the whole routine; xmm8 is reused for L, 2L, -v
// and the upper bound in turn; xmm10 is scratch.
// Ends by loading the next stream word and jumping to it.
5977 CODE const uint8_t sk_mirror_x_sse41[] = {
5978   72,173,                                 //lods          %ds:(%rsi),%rax
5979   243,68,15,16,0,                         //movss         (%rax),%xmm8
5980   69,15,40,200,                           //movaps        %xmm8,%xmm9
5981   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
5982   65,15,92,193,                           //subps         %xmm9,%xmm0
5983   243,69,15,88,192,                       //addss         %xmm8,%xmm8
5984   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
5985   68,15,40,208,                           //movaps        %xmm0,%xmm10
5986   69,15,94,208,                           //divps         %xmm8,%xmm10
5987   102,69,15,58,8,210,1,                   //roundps       $0x1,%xmm10,%xmm10
5988   69,15,89,208,                           //mulps         %xmm8,%xmm10
5989   65,15,92,194,                           //subps         %xmm10,%xmm0
5990   65,15,92,193,                           //subps         %xmm9,%xmm0
5991   69,15,87,192,                           //xorps         %xmm8,%xmm8
5992   68,15,92,192,                           //subps         %xmm0,%xmm8
5993   65,15,84,192,                           //andps         %xmm8,%xmm0
5994   102,69,15,118,192,                      //pcmpeqd       %xmm8,%xmm8
5995   102,69,15,254,193,                      //paddd         %xmm9,%xmm8
5996   65,15,93,192,                           //minps         %xmm8,%xmm0
5997   72,173,                                 //lods          %ds:(%rsi),%rax
5998   255,224,                                //jmpq          *%rax
5999 };
6000 
// sk_mirror_y_sse41: same sequence as sk_mirror_x_sse41, applied to xmm1
// instead of xmm0. L is the float pointed to by the first stream word.
// Step by step, per lane, with v = xmm1:
//   v  = v - L
//   v  = v - floor(v / 2L) * 2L      (roundps immediate 1 is floor)
//   v  = v - L
//   v  = v & (0 - v)                 (clears the sign bit: v = |v|)
//   v  = min(v, float with raw bits (L's bits - 1))
// xmm9 keeps L broadcast; xmm8 and xmm10 are scratch.
// Ends by loading the next stream word and jumping to it.
6001 CODE const uint8_t sk_mirror_y_sse41[] = {
6002   72,173,                                 //lods          %ds:(%rsi),%rax
6003   243,68,15,16,0,                         //movss         (%rax),%xmm8
6004   69,15,40,200,                           //movaps        %xmm8,%xmm9
6005   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
6006   65,15,92,201,                           //subps         %xmm9,%xmm1
6007   243,69,15,88,192,                       //addss         %xmm8,%xmm8
6008   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
6009   68,15,40,209,                           //movaps        %xmm1,%xmm10
6010   69,15,94,208,                           //divps         %xmm8,%xmm10
6011   102,69,15,58,8,210,1,                   //roundps       $0x1,%xmm10,%xmm10
6012   69,15,89,208,                           //mulps         %xmm8,%xmm10
6013   65,15,92,202,                           //subps         %xmm10,%xmm1
6014   65,15,92,201,                           //subps         %xmm9,%xmm1
6015   69,15,87,192,                           //xorps         %xmm8,%xmm8
6016   68,15,92,193,                           //subps         %xmm1,%xmm8
6017   65,15,84,200,                           //andps         %xmm8,%xmm1
6018   102,69,15,118,192,                      //pcmpeqd       %xmm8,%xmm8
6019   102,69,15,254,193,                      //paddd         %xmm9,%xmm8
6020   65,15,93,200,                           //minps         %xmm8,%xmm1
6021   72,173,                                 //lods          %ds:(%rsi),%rax
6022   255,224,                                //jmpq          *%rax
6023 };
6024 
// sk_luminance_to_alpha_sse41: replaces xmm3 with a weighted sum of xmm0, xmm1
// and xmm2, then zeroes xmm0..xmm2.
//   xmm3 = c0*xmm0 + c1*xmm1 + c2*xmm2, with float bit patterns
//     c0 = 0x3e59b3d0 (~0.2126), c1 = 0x3f371759 (~0.7152),
//     c2 = 0x3d93dd98 (~0.0722).
//   Each constant is materialised through %eax and broadcast with shufps $0.
// This stage reads no context: its only lods fetches the next stream word,
// which is jumped to at the end. The incoming value of xmm3 is discarded.
// NOTE(review): by the stage name xmm0..xmm3 are presumably r,g,b,a -- confirm.
6025 CODE const uint8_t sk_luminance_to_alpha_sse41[] = {
6026   184,208,179,89,62,                      //mov           $0x3e59b3d0,%eax
6027   102,15,110,216,                         //movd          %eax,%xmm3
6028   15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
6029   15,89,216,                              //mulps         %xmm0,%xmm3
6030   184,89,23,55,63,                        //mov           $0x3f371759,%eax
6031   102,15,110,192,                         //movd          %eax,%xmm0
6032   15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
6033   15,89,193,                              //mulps         %xmm1,%xmm0
6034   15,88,195,                              //addps         %xmm3,%xmm0
6035   184,152,221,147,61,                     //mov           $0x3d93dd98,%eax
6036   102,15,110,216,                         //movd          %eax,%xmm3
6037   15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
6038   15,89,218,                              //mulps         %xmm2,%xmm3
6039   15,88,216,                              //addps         %xmm0,%xmm3
6040   72,173,                                 //lods          %ds:(%rsi),%rax
6041   15,87,192,                              //xorps         %xmm0,%xmm0
6042   15,87,201,                              //xorps         %xmm1,%xmm1
6043   15,87,210,                              //xorps         %xmm2,%xmm2
6044   255,224,                                //jmpq          *%rax
6045 };
6046 
// sk_matrix_2x3_sse41: applies a 2x3 affine transform to (xmm0, xmm1).
// The first stream word points at six floats m[0..5], read at byte offsets
// 0x0..0x14. With x = incoming xmm0 and y = incoming xmm1:
//   xmm0 = m[0]*x + (m[2]*y + m[4])
//   xmm1 = m[1]*x + (m[3]*y + m[5])
// The inputs are first copied to xmm8 (x) and xmm9 (y) so that xmm0/xmm1 can
// be reused as coefficient registers; xmm10/xmm11 are scratch.
// Ends by loading the next stream word and jumping to it.
6047 CODE const uint8_t sk_matrix_2x3_sse41[] = {
6048   68,15,40,201,                           //movaps        %xmm1,%xmm9
6049   68,15,40,192,                           //movaps        %xmm0,%xmm8
6050   72,173,                                 //lods          %ds:(%rsi),%rax
6051   243,15,16,0,                            //movss         (%rax),%xmm0
6052   243,15,16,72,4,                         //movss         0x4(%rax),%xmm1
6053   15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
6054   243,68,15,16,80,8,                      //movss         0x8(%rax),%xmm10
6055   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
6056   243,68,15,16,88,16,                     //movss         0x10(%rax),%xmm11
6057   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
6058   69,15,89,209,                           //mulps         %xmm9,%xmm10
6059   69,15,88,211,                           //addps         %xmm11,%xmm10
6060   65,15,89,192,                           //mulps         %xmm8,%xmm0
6061   65,15,88,194,                           //addps         %xmm10,%xmm0
6062   15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
6063   243,68,15,16,80,12,                     //movss         0xc(%rax),%xmm10
6064   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
6065   243,68,15,16,88,20,                     //movss         0x14(%rax),%xmm11
6066   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
6067   69,15,89,209,                           //mulps         %xmm9,%xmm10
6068   69,15,88,211,                           //addps         %xmm11,%xmm10
6069   65,15,89,200,                           //mulps         %xmm8,%xmm1
6070   65,15,88,202,                           //addps         %xmm10,%xmm1
6071   72,173,                                 //lods          %ds:(%rsi),%rax
6072   255,224,                                //jmpq          *%rax
6073 };
6074 
// sk_matrix_3x4_sse41: applies a 3x4 affine transform to (xmm0, xmm1, xmm2).
// The first stream word points at twelve floats m[0..11], read at byte offsets
// 0x0..0x2c. With x, y, z = incoming xmm0, xmm1, xmm2:
//   xmm0 = m[0]*x + (m[3]*y + (m[6]*z + m[ 9]))
//   xmm1 = m[1]*x + (m[4]*y + (m[7]*z + m[10]))
//   xmm2 = m[2]*x + (m[5]*y + (m[8]*z + m[11]))
// x and y are copied to xmm8/xmm9 up front; z stays live in xmm2 until the
// third row has been accumulated in xmm10 and is moved into xmm2 at the end.
// xmm10..xmm13 are scratch. Ends by jumping to the next stream word.
6075 CODE const uint8_t sk_matrix_3x4_sse41[] = {
6076   68,15,40,201,                           //movaps        %xmm1,%xmm9
6077   68,15,40,192,                           //movaps        %xmm0,%xmm8
6078   72,173,                                 //lods          %ds:(%rsi),%rax
       // Output 0 (coefficients m[0], m[3], m[6], m[9]) -> xmm0.
6079   243,15,16,0,                            //movss         (%rax),%xmm0
6080   243,15,16,72,4,                         //movss         0x4(%rax),%xmm1
6081   15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
6082   243,68,15,16,80,12,                     //movss         0xc(%rax),%xmm10
6083   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
6084   243,68,15,16,88,24,                     //movss         0x18(%rax),%xmm11
6085   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
6086   243,68,15,16,96,36,                     //movss         0x24(%rax),%xmm12
6087   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
6088   68,15,89,218,                           //mulps         %xmm2,%xmm11
6089   69,15,88,220,                           //addps         %xmm12,%xmm11
6090   69,15,89,209,                           //mulps         %xmm9,%xmm10
6091   69,15,88,211,                           //addps         %xmm11,%xmm10
6092   65,15,89,192,                           //mulps         %xmm8,%xmm0
6093   65,15,88,194,                           //addps         %xmm10,%xmm0
       // Output 1 (coefficients m[1], m[4], m[7], m[10]) -> xmm1.
6094   15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
6095   243,68,15,16,80,16,                     //movss         0x10(%rax),%xmm10
6096   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
6097   243,68,15,16,88,28,                     //movss         0x1c(%rax),%xmm11
6098   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
6099   243,68,15,16,96,40,                     //movss         0x28(%rax),%xmm12
6100   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
6101   68,15,89,218,                           //mulps         %xmm2,%xmm11
6102   69,15,88,220,                           //addps         %xmm12,%xmm11
6103   69,15,89,209,                           //mulps         %xmm9,%xmm10
6104   69,15,88,211,                           //addps         %xmm11,%xmm10
6105   65,15,89,200,                           //mulps         %xmm8,%xmm1
6106   65,15,88,202,                           //addps         %xmm10,%xmm1
       // Output 2 (coefficients m[2], m[5], m[8], m[11]) -> xmm10, then xmm2.
6107   243,68,15,16,80,8,                      //movss         0x8(%rax),%xmm10
6108   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
6109   243,68,15,16,88,20,                     //movss         0x14(%rax),%xmm11
6110   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
6111   243,68,15,16,96,32,                     //movss         0x20(%rax),%xmm12
6112   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
6113   243,68,15,16,104,44,                    //movss         0x2c(%rax),%xmm13
6114   69,15,198,237,0,                        //shufps        $0x0,%xmm13,%xmm13
6115   68,15,89,226,                           //mulps         %xmm2,%xmm12
6116   69,15,88,229,                           //addps         %xmm13,%xmm12
6117   69,15,89,217,                           //mulps         %xmm9,%xmm11
6118   69,15,88,220,                           //addps         %xmm12,%xmm11
6119   69,15,89,208,                           //mulps         %xmm8,%xmm10
6120   69,15,88,211,                           //addps         %xmm11,%xmm10
6121   72,173,                                 //lods          %ds:(%rsi),%rax
6122   65,15,40,210,                           //movaps        %xmm10,%xmm2
6123   255,224,                                //jmpq          *%rax
6124 };
6125 
// sk_matrix_4x5_sse41: applies a 4x5 affine transform to (xmm0..xmm3).
// The first stream word points at twenty floats m[0..19], read at byte offsets
// 0x0..0x4c. With x, y, z, w = incoming xmm0, xmm1, xmm2, xmm3:
//   xmm0 = m[0]*x + (m[4]*y + (m[ 8]*z + (m[12]*w + m[16])))
//   xmm1 = m[1]*x + (m[5]*y + (m[ 9]*z + (m[13]*w + m[17])))
//   xmm2 = m[2]*x + (m[6]*y + (m[10]*z + (m[14]*w + m[18])))
//   xmm3 = m[3]*x + (m[7]*y + (m[11]*z + (m[15]*w + m[19])))
// x and y are copied to xmm8/xmm9 up front; z and w stay live in xmm2/xmm3
// until outputs 2 and 3 have been accumulated in xmm10/xmm11, which are moved
// into xmm2/xmm3 at the very end. xmm10..xmm15 are scratch.
// Ends by jumping to the next stream word.
6126 CODE const uint8_t sk_matrix_4x5_sse41[] = {
6127   68,15,40,201,                           //movaps        %xmm1,%xmm9
6128   68,15,40,192,                           //movaps        %xmm0,%xmm8
6129   72,173,                                 //lods          %ds:(%rsi),%rax
       // Output 0 (coefficients m[0], m[4], m[8], m[12], m[16]) -> xmm0.
6130   243,15,16,0,                            //movss         (%rax),%xmm0
6131   243,15,16,72,4,                         //movss         0x4(%rax),%xmm1
6132   15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
6133   243,68,15,16,80,16,                     //movss         0x10(%rax),%xmm10
6134   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
6135   243,68,15,16,88,32,                     //movss         0x20(%rax),%xmm11
6136   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
6137   243,68,15,16,96,48,                     //movss         0x30(%rax),%xmm12
6138   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
6139   243,68,15,16,104,64,                    //movss         0x40(%rax),%xmm13
6140   69,15,198,237,0,                        //shufps        $0x0,%xmm13,%xmm13
6141   68,15,89,227,                           //mulps         %xmm3,%xmm12
6142   69,15,88,229,                           //addps         %xmm13,%xmm12
6143   68,15,89,218,                           //mulps         %xmm2,%xmm11
6144   69,15,88,220,                           //addps         %xmm12,%xmm11
6145   69,15,89,209,                           //mulps         %xmm9,%xmm10
6146   69,15,88,211,                           //addps         %xmm11,%xmm10
6147   65,15,89,192,                           //mulps         %xmm8,%xmm0
6148   65,15,88,194,                           //addps         %xmm10,%xmm0
       // Output 1 (coefficients m[1], m[5], m[9], m[13], m[17]) -> xmm1.
6149   15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
6150   243,68,15,16,80,20,                     //movss         0x14(%rax),%xmm10
6151   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
6152   243,68,15,16,88,36,                     //movss         0x24(%rax),%xmm11
6153   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
6154   243,68,15,16,96,52,                     //movss         0x34(%rax),%xmm12
6155   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
6156   243,68,15,16,104,68,                    //movss         0x44(%rax),%xmm13
6157   69,15,198,237,0,                        //shufps        $0x0,%xmm13,%xmm13
6158   68,15,89,227,                           //mulps         %xmm3,%xmm12
6159   69,15,88,229,                           //addps         %xmm13,%xmm12
6160   68,15,89,218,                           //mulps         %xmm2,%xmm11
6161   69,15,88,220,                           //addps         %xmm12,%xmm11
6162   69,15,89,209,                           //mulps         %xmm9,%xmm10
6163   69,15,88,211,                           //addps         %xmm11,%xmm10
6164   65,15,89,200,                           //mulps         %xmm8,%xmm1
6165   65,15,88,202,                           //addps         %xmm10,%xmm1
       // Output 2 (coefficients m[2], m[6], m[10], m[14], m[18]) -> xmm10.
6166   243,68,15,16,80,8,                      //movss         0x8(%rax),%xmm10
6167   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
6168   243,68,15,16,88,24,                     //movss         0x18(%rax),%xmm11
6169   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
6170   243,68,15,16,96,40,                     //movss         0x28(%rax),%xmm12
6171   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
6172   243,68,15,16,104,56,                    //movss         0x38(%rax),%xmm13
6173   69,15,198,237,0,                        //shufps        $0x0,%xmm13,%xmm13
6174   243,68,15,16,112,72,                    //movss         0x48(%rax),%xmm14
6175   69,15,198,246,0,                        //shufps        $0x0,%xmm14,%xmm14
6176   68,15,89,235,                           //mulps         %xmm3,%xmm13
6177   69,15,88,238,                           //addps         %xmm14,%xmm13
6178   68,15,89,226,                           //mulps         %xmm2,%xmm12
6179   69,15,88,229,                           //addps         %xmm13,%xmm12
6180   69,15,89,217,                           //mulps         %xmm9,%xmm11
6181   69,15,88,220,                           //addps         %xmm12,%xmm11
6182   69,15,89,208,                           //mulps         %xmm8,%xmm10
6183   69,15,88,211,                           //addps         %xmm11,%xmm10
       // Output 3 (coefficients m[3], m[7], m[11], m[15], m[19]) -> xmm11.
6184   243,68,15,16,88,12,                     //movss         0xc(%rax),%xmm11
6185   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
6186   243,68,15,16,96,28,                     //movss         0x1c(%rax),%xmm12
6187   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
6188   243,68,15,16,104,44,                    //movss         0x2c(%rax),%xmm13
6189   69,15,198,237,0,                        //shufps        $0x0,%xmm13,%xmm13
6190   243,68,15,16,112,60,                    //movss         0x3c(%rax),%xmm14
6191   69,15,198,246,0,                        //shufps        $0x0,%xmm14,%xmm14
6192   243,68,15,16,120,76,                    //movss         0x4c(%rax),%xmm15
6193   69,15,198,255,0,                        //shufps        $0x0,%xmm15,%xmm15
6194   68,15,89,243,                           //mulps         %xmm3,%xmm14
6195   69,15,88,247,                           //addps         %xmm15,%xmm14
6196   68,15,89,234,                           //mulps         %xmm2,%xmm13
6197   69,15,88,238,                           //addps         %xmm14,%xmm13
6198   69,15,89,225,                           //mulps         %xmm9,%xmm12
6199   69,15,88,229,                           //addps         %xmm13,%xmm12
6200   69,15,89,216,                           //mulps         %xmm8,%xmm11
6201   69,15,88,220,                           //addps         %xmm12,%xmm11
       // Only now is it safe to overwrite the z/w inputs with outputs 2 and 3.
6202   72,173,                                 //lods          %ds:(%rsi),%rax
6203   65,15,40,210,                           //movaps        %xmm10,%xmm2
6204   65,15,40,219,                           //movaps        %xmm11,%xmm3
6205   255,224,                                //jmpq          *%rax
6206 };
6207 
// sk_matrix_perspective_sse41: applies a 3x3 projective transform to
// (xmm0, xmm1).
// The first stream word points at nine floats m[0..8], read at byte offsets
// 0x0..0x20. With x = incoming xmm0 and y = incoming xmm1:
//   X = m[0]*x + (m[1]*y + m[2])
//   Y = m[3]*x + (m[4]*y + m[5])
//   Z = m[6]*x + (m[7]*y + m[8])
//   xmm0 = X * rcp(Z),  xmm1 = Y * rcp(Z)
// rcp is the rcpps instruction, i.e. an approximate reciprocal rather than a
// full-precision divide. Unlike the other matrix stages in this file, the
// coefficients for one output are consecutive in memory here.
// x is copied to xmm8; y stays in xmm1 until Z has been formed. xmm9..xmm12
// are scratch. Ends by jumping to the next stream word.
6208 CODE const uint8_t sk_matrix_perspective_sse41[] = {
6209   68,15,40,192,                           //movaps        %xmm0,%xmm8
6210   72,173,                                 //lods          %ds:(%rsi),%rax
       // X -> xmm0.
6211   243,15,16,0,                            //movss         (%rax),%xmm0
6212   243,68,15,16,72,4,                      //movss         0x4(%rax),%xmm9
6213   15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
6214   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
6215   243,68,15,16,80,8,                      //movss         0x8(%rax),%xmm10
6216   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
6217   68,15,89,201,                           //mulps         %xmm1,%xmm9
6218   69,15,88,202,                           //addps         %xmm10,%xmm9
6219   65,15,89,192,                           //mulps         %xmm8,%xmm0
6220   65,15,88,193,                           //addps         %xmm9,%xmm0
       // Y -> xmm9.
6221   243,68,15,16,72,12,                     //movss         0xc(%rax),%xmm9
6222   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
6223   243,68,15,16,80,16,                     //movss         0x10(%rax),%xmm10
6224   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
6225   243,68,15,16,88,20,                     //movss         0x14(%rax),%xmm11
6226   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
6227   68,15,89,209,                           //mulps         %xmm1,%xmm10
6228   69,15,88,211,                           //addps         %xmm11,%xmm10
6229   69,15,89,200,                           //mulps         %xmm8,%xmm9
6230   69,15,88,202,                           //addps         %xmm10,%xmm9
       // Z -> xmm10.
6231   243,68,15,16,80,24,                     //movss         0x18(%rax),%xmm10
6232   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
6233   243,68,15,16,88,28,                     //movss         0x1c(%rax),%xmm11
6234   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
6235   243,68,15,16,96,32,                     //movss         0x20(%rax),%xmm12
6236   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
6237   68,15,89,217,                           //mulps         %xmm1,%xmm11
6238   69,15,88,220,                           //addps         %xmm12,%xmm11
6239   69,15,89,208,                           //mulps         %xmm8,%xmm10
6240   69,15,88,211,                           //addps         %xmm11,%xmm10
       // Scale X and Y by the approximate reciprocal of Z.
6241   65,15,83,202,                           //rcpps         %xmm10,%xmm1
6242   15,89,193,                              //mulps         %xmm1,%xmm0
6243   68,15,89,201,                           //mulps         %xmm1,%xmm9
6244   72,173,                                 //lods          %ds:(%rsi),%rax
6245   65,15,40,201,                           //movaps        %xmm9,%xmm1
6246   255,224,                                //jmpq          *%rax
6247 };
6248 
// sk_linear_gradient_2stops_sse41: evaluates four linear functions of xmm0.
// The first stream word points at eight floats, loaded (unaligned) as two
// 4-float vectors: c = floats 0..3 (into xmm9) and f = floats 4..7 (into xmm3).
// With t = incoming xmm0:
//   xmm0 = f[0]*t + c[0]
//   xmm1 = f[1]*t + c[1]
//   xmm2 = f[2]*t + c[2]
//   xmm3 = f[3]*t + c[3]
// shufps immediates 0x00 / 0x55 / 0xaa / 0xff broadcast lanes 0 / 1 / 2 / 3.
// The lane-0 result is built in xmm8 because t must stay live in xmm0 until
// all four products are formed; it is moved into xmm0 last.
// Incoming xmm1..xmm3 are overwritten; xmm8..xmm10 are scratch.
// Ends by jumping to the next stream word.
6249 CODE const uint8_t sk_linear_gradient_2stops_sse41[] = {
6250   72,173,                                 //lods          %ds:(%rsi),%rax
6251   68,15,16,8,                             //movups        (%rax),%xmm9
6252   15,16,88,16,                            //movups        0x10(%rax),%xmm3
6253   68,15,40,195,                           //movaps        %xmm3,%xmm8
6254   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
6255   65,15,40,201,                           //movaps        %xmm9,%xmm1
6256   15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
6257   68,15,89,192,                           //mulps         %xmm0,%xmm8
6258   68,15,88,193,                           //addps         %xmm1,%xmm8
6259   15,40,203,                              //movaps        %xmm3,%xmm1
6260   15,198,201,85,                          //shufps        $0x55,%xmm1,%xmm1
6261   65,15,40,209,                           //movaps        %xmm9,%xmm2
6262   15,198,210,85,                          //shufps        $0x55,%xmm2,%xmm2
6263   15,89,200,                              //mulps         %xmm0,%xmm1
6264   15,88,202,                              //addps         %xmm2,%xmm1
6265   15,40,211,                              //movaps        %xmm3,%xmm2
6266   15,198,210,170,                         //shufps        $0xaa,%xmm2,%xmm2
6267   69,15,40,209,                           //movaps        %xmm9,%xmm10
6268   69,15,198,210,170,                      //shufps        $0xaa,%xmm10,%xmm10
6269   15,89,208,                              //mulps         %xmm0,%xmm2
6270   65,15,88,210,                           //addps         %xmm10,%xmm2
6271   15,198,219,255,                         //shufps        $0xff,%xmm3,%xmm3
6272   69,15,198,201,255,                      //shufps        $0xff,%xmm9,%xmm9
6273   15,89,216,                              //mulps         %xmm0,%xmm3
6274   65,15,88,217,                           //addps         %xmm9,%xmm3
6275   72,173,                                 //lods          %ds:(%rsi),%rax
6276   65,15,40,192,                           //movaps        %xmm8,%xmm0
6277   255,224,                                //jmpq          *%rax
6278 };
6279 
// sk_start_pipeline_sse2: driver loop that calls the first stage once per
// group of 4 indices.
// Inputs, as the code uses them:
//   %rdi = starting index (kept in %rbx)
//   %rsi = pointer to a stream of 8-byte words; the first word is the address
//          of the first stage (kept in %r12), the rest of the stream (%r13) is
//          handed to that stage in %rsi on every call
//   %rdx = opaque value passed through to the stage unchanged (kept in %r14)
//   %rcx = limit (kept in %r15)
// Loop: while (index + 4 <= limit, unsigned compare) { zero xmm0..xmm7;
//       call stage(rdi = index, rsi = stream, rdx = passthrough); index += 4; }
// Returns in %rax the first index that was NOT processed, so any remainder of
// fewer than 4 is left to the caller.
// The five callee-saved registers used as loop state are pushed on entry and
// popped before retq.
// NOTE(review): with a standard SysV entry (rsp % 16 == 8) the five pushes
// leave the stack 16-byte aligned at the callq -- confirm this is relied upon.
6280 CODE const uint8_t sk_start_pipeline_sse2[] = {
6281   65,87,                                  //push          %r15
6282   65,86,                                  //push          %r14
6283   65,85,                                  //push          %r13
6284   65,84,                                  //push          %r12
6285   83,                                     //push          %rbx
6286   73,137,207,                             //mov           %rcx,%r15
6287   73,137,214,                             //mov           %rdx,%r14
6288   72,137,251,                             //mov           %rdi,%rbx
6289   72,173,                                 //lods          %ds:(%rsi),%rax
6290   73,137,196,                             //mov           %rax,%r12
6291   73,137,245,                             //mov           %rsi,%r13
       // Initial bounds check: skip the loop entirely if index + 4 > limit.
6292   72,141,67,4,                            //lea           0x4(%rbx),%rax
6293   76,57,248,                              //cmp           %r15,%rax
6294   118,5,                                  //jbe           28 <_sk_start_pipeline_sse2+0x28>
6295   72,137,216,                             //mov           %rbx,%rax
6296   235,52,                                 //jmp           5c <_sk_start_pipeline_sse2+0x5c>
       // Loop body (offset 0x28): clear xmm0..xmm7, then call the first stage.
6297   15,87,192,                              //xorps         %xmm0,%xmm0
6298   15,87,201,                              //xorps         %xmm1,%xmm1
6299   15,87,210,                              //xorps         %xmm2,%xmm2
6300   15,87,219,                              //xorps         %xmm3,%xmm3
6301   15,87,228,                              //xorps         %xmm4,%xmm4
6302   15,87,237,                              //xorps         %xmm5,%xmm5
6303   15,87,246,                              //xorps         %xmm6,%xmm6
6304   15,87,255,                              //xorps         %xmm7,%xmm7
6305   72,137,223,                             //mov           %rbx,%rdi
6306   76,137,238,                             //mov           %r13,%rsi
6307   76,137,242,                             //mov           %r14,%rdx
6308   65,255,212,                             //callq         *%r12
       // rax = index + 4 (the next index); compare index + 8 against the limit
       // to decide whether another full group of 4 fits, then commit rbx = rax.
6309   72,141,67,4,                            //lea           0x4(%rbx),%rax
6310   72,131,195,8,                           //add           $0x8,%rbx
6311   76,57,251,                              //cmp           %r15,%rbx
6312   72,137,195,                             //mov           %rax,%rbx
6313   118,204,                                //jbe           28 <_sk_start_pipeline_sse2+0x28>
       // Epilogue (offset 0x5c): rax already holds the return value.
6314   91,                                     //pop           %rbx
6315   65,92,                                  //pop           %r12
6316   65,93,                                  //pop           %r13
6317   65,94,                                  //pop           %r14
6318   65,95,                                  //pop           %r15
6319   195,                                    //retq
6320 };
6321 
// sk_just_return_sse2: a single retq. Unlike the other stages in this file it
// does not load and jump to a further stream word; it simply returns to
// whoever issued the call.
// NOTE(review): presumably placed last in a stage list so control returns to
// the callq in sk_start_pipeline_sse2 -- confirm against the pipeline builder.
6322 CODE const uint8_t sk_just_return_sse2[] = {
6323   195,                                    //retq
6324 };
6325 
// sk_seed_shader_sse2: initialises xmm0..xmm7 from the current index and a
// context value.
//   xmm0 = float(%edi) + 0.5 + (4 floats loaded unaligned from (%rdx))
//   xmm1 = float(int32 at *ctx) + 0.5, broadcast to all lanes
//   xmm2 = 1.0 in all lanes
//   xmm3..xmm7 = 0
// where ctx is the first stream word, 0x3f000000 is the bit pattern of 0.5f
// and 0x3f800000 that of 1.0f. Both integer inputs are converted with
// cvtdq2ps, i.e. treated as signed 32-bit integers.
// %rax and %rcx are scratch. Ends by jumping to the next stream word.
// NOTE(review): the 4 floats at (%rdx) are presumably per-lane offsets such as
// {0,1,2,3}, and *ctx presumably the y coordinate -- neither is visible here;
// confirm against the caller.
6326 CODE const uint8_t sk_seed_shader_sse2[] = {
6327   72,173,                                 //lods          %ds:(%rsi),%rax
6328   102,15,110,199,                         //movd          %edi,%xmm0
6329   102,15,112,192,0,                       //pshufd        $0x0,%xmm0,%xmm0
6330   15,91,200,                              //cvtdq2ps      %xmm0,%xmm1
6331   185,0,0,0,63,                           //mov           $0x3f000000,%ecx
6332   102,15,110,209,                         //movd          %ecx,%xmm2
6333   15,198,210,0,                           //shufps        $0x0,%xmm2,%xmm2
6334   15,88,202,                              //addps         %xmm2,%xmm1
6335   15,16,2,                                //movups        (%rdx),%xmm0
6336   15,88,193,                              //addps         %xmm1,%xmm0
6337   102,15,110,8,                           //movd          (%rax),%xmm1
6338   102,15,112,201,0,                       //pshufd        $0x0,%xmm1,%xmm1
6339   15,91,201,                              //cvtdq2ps      %xmm1,%xmm1
6340   15,88,202,                              //addps         %xmm2,%xmm1
6341   184,0,0,128,63,                         //mov           $0x3f800000,%eax
6342   102,15,110,208,                         //movd          %eax,%xmm2
6343   15,198,210,0,                           //shufps        $0x0,%xmm2,%xmm2
6344   72,173,                                 //lods          %ds:(%rsi),%rax
6345   15,87,219,                              //xorps         %xmm3,%xmm3
6346   15,87,228,                              //xorps         %xmm4,%xmm4
6347   15,87,237,                              //xorps         %xmm5,%xmm5
6348   15,87,246,                              //xorps         %xmm6,%xmm6
6349   15,87,255,                              //xorps         %xmm7,%xmm7
6350   255,224,                                //jmpq          *%rax
6351 };
6352 
// sk_constant_color_sse2: x86-64/SSE2 machine code that loads four floats
// through the pointer fetched from (%rsi) and broadcasts them:
//   xmm0 = lane 0, xmm1 = lane 1, xmm2 = lane 2, xmm3 = lane 3,
// each replicated across all four lanes. It then fetches the next 8-byte
// word from (%rsi) and jumps to it.
// NOTE(review): presumably the four floats are an r,g,b,a colour (inferred
// from the name only) -- confirm against the generating source.
6353 CODE const uint8_t sk_constant_color_sse2[] = {
6354   72,173,                                 //lods          %ds:(%rsi),%rax
6355   15,16,24,                               //movups        (%rax),%xmm3
6356   15,40,195,                              //movaps        %xmm3,%xmm0
6357   15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
6358   15,40,203,                              //movaps        %xmm3,%xmm1
6359   15,198,201,85,                          //shufps        $0x55,%xmm1,%xmm1
6360   15,40,211,                              //movaps        %xmm3,%xmm2
6361   15,198,210,170,                         //shufps        $0xaa,%xmm2,%xmm2
  // xmm3 is broadcast last because it is the source of the other three copies.
6362   15,198,219,255,                         //shufps        $0xff,%xmm3,%xmm3
6363   72,173,                                 //lods          %ds:(%rsi),%rax
6364   255,224,                                //jmpq          *%rax
6365 };
6366 
// sk_clear_sse2: x86-64/SSE2 machine code that zeroes xmm0..xmm3, then jumps
// to the 8-byte word it fetched from (%rsi) (which also advances %rsi).
6367 CODE const uint8_t sk_clear_sse2[] = {
6368   72,173,                                 //lods          %ds:(%rsi),%rax
6369   15,87,192,                              //xorps         %xmm0,%xmm0
6370   15,87,201,                              //xorps         %xmm1,%xmm1
6371   15,87,210,                              //xorps         %xmm2,%xmm2
6372   15,87,219,                              //xorps         %xmm3,%xmm3
6373   255,224,                                //jmpq          *%rax
6374 };
6375 
// sk_plus__sse2: x86-64/SSE2 machine code that adds lane-wise
//   xmm0 += xmm4, xmm1 += xmm5, xmm2 += xmm6, xmm3 += xmm7,
// then fetches the next 8-byte word from (%rsi) and jumps to it.
// NOTE(review): presumably xmm0..3 / xmm4..7 are source / destination colour
// channels (inferred from sibling names) -- confirm against the generator.
6376 CODE const uint8_t sk_plus__sse2[] = {
6377   15,88,196,                              //addps         %xmm4,%xmm0
6378   15,88,205,                              //addps         %xmm5,%xmm1
6379   15,88,214,                              //addps         %xmm6,%xmm2
6380   15,88,223,                              //addps         %xmm7,%xmm3
6381   72,173,                                 //lods          %ds:(%rsi),%rax
6382   255,224,                                //jmpq          *%rax
6383 };
6384 
// sk_srcover_sse2: x86-64/SSE2 machine code computing, lane-wise,
//   k    = 1.0 - xmm3
//   xmm0 += xmm4 * k;  xmm1 += xmm5 * k;  xmm2 += xmm6 * k;  xmm3 += xmm7 * k
// using xmm8 (k) and xmm9 (product) as scratch. It then fetches the next
// 8-byte word from (%rsi) and jumps to it.
// NOTE(review): presumably xmm0..3 are source r,g,b,a and xmm4..7 destination
// r,g,b,a (inferred from the name) -- confirm against the generating source.
6385 CODE const uint8_t sk_srcover_sse2[] = {
  // xmm8 = 1.0 (0x3f800000) in every lane, minus xmm3.
6386   184,0,0,128,63,                         //mov           $0x3f800000,%eax
6387   102,68,15,110,192,                      //movd          %eax,%xmm8
6388   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
6389   68,15,92,195,                           //subps         %xmm3,%xmm8
6390   69,15,40,200,                           //movaps        %xmm8,%xmm9
6391   68,15,89,204,                           //mulps         %xmm4,%xmm9
6392   65,15,88,193,                           //addps         %xmm9,%xmm0
6393   69,15,40,200,                           //movaps        %xmm8,%xmm9
6394   68,15,89,205,                           //mulps         %xmm5,%xmm9
6395   65,15,88,201,                           //addps         %xmm9,%xmm1
6396   69,15,40,200,                           //movaps        %xmm8,%xmm9
6397   68,15,89,206,                           //mulps         %xmm6,%xmm9
6398   65,15,88,209,                           //addps         %xmm9,%xmm2
  // Last use of k, so it is multiplied in place instead of copied to xmm9.
6399   68,15,89,199,                           //mulps         %xmm7,%xmm8
6400   65,15,88,216,                           //addps         %xmm8,%xmm3
6401   72,173,                                 //lods          %ds:(%rsi),%rax
6402   255,224,                                //jmpq          *%rax
6403 };
6404 
// sk_dstover_sse2: x86-64/SSE2 machine code computing, lane-wise,
//   k    = 1.0 - xmm7
//   xmm0 = xmm0 * k + xmm4;  xmm1 = xmm1 * k + xmm5;
//   xmm2 = xmm2 * k + xmm6;  xmm3 = xmm3 * k + xmm7
// using xmm8 (k) as scratch. It then fetches the next 8-byte word from
// (%rsi) and jumps to it.
// NOTE(review): presumably xmm0..3 are source r,g,b,a and xmm4..7 destination
// r,g,b,a (inferred from the name) -- confirm against the generating source.
6405 CODE const uint8_t sk_dstover_sse2[] = {
  // xmm8 = 1.0 (0x3f800000) in every lane, minus xmm7.
6406   184,0,0,128,63,                         //mov           $0x3f800000,%eax
6407   102,68,15,110,192,                      //movd          %eax,%xmm8
6408   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
6409   68,15,92,199,                           //subps         %xmm7,%xmm8
6410   65,15,89,192,                           //mulps         %xmm8,%xmm0
6411   15,88,196,                              //addps         %xmm4,%xmm0
6412   65,15,89,200,                           //mulps         %xmm8,%xmm1
6413   15,88,205,                              //addps         %xmm5,%xmm1
6414   65,15,89,208,                           //mulps         %xmm8,%xmm2
6415   15,88,214,                              //addps         %xmm6,%xmm2
6416   65,15,89,216,                           //mulps         %xmm8,%xmm3
6417   15,88,223,                              //addps         %xmm7,%xmm3
6418   72,173,                                 //lods          %ds:(%rsi),%rax
6419   255,224,                                //jmpq          *%rax
6420 };
6421 
// sk_clamp_0_sse2: x86-64/SSE2 machine code that replaces each lane of
// xmm0..xmm3 with max(lane, 0.0), using xmm8 as the zero vector. It then
// fetches the next 8-byte word from (%rsi) and jumps to it.
6422 CODE const uint8_t sk_clamp_0_sse2[] = {
6423   69,15,87,192,                           //xorps         %xmm8,%xmm8
6424   65,15,95,192,                           //maxps         %xmm8,%xmm0
6425   65,15,95,200,                           //maxps         %xmm8,%xmm1
6426   65,15,95,208,                           //maxps         %xmm8,%xmm2
6427   65,15,95,216,                           //maxps         %xmm8,%xmm3
6428   72,173,                                 //lods          %ds:(%rsi),%rax
6429   255,224,                                //jmpq          *%rax
6430 };
6431 
// sk_clamp_1_sse2: x86-64/SSE2 machine code that replaces each lane of
// xmm0..xmm3 with min(lane, 1.0), using xmm8 as the 1.0 vector. It then
// fetches the next 8-byte word from (%rsi) and jumps to it.
6432 CODE const uint8_t sk_clamp_1_sse2[] = {
  // xmm8 = 1.0 (0x3f800000) in every lane.
6433   184,0,0,128,63,                         //mov           $0x3f800000,%eax
6434   102,68,15,110,192,                      //movd          %eax,%xmm8
6435   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
6436   65,15,93,192,                           //minps         %xmm8,%xmm0
6437   65,15,93,200,                           //minps         %xmm8,%xmm1
6438   65,15,93,208,                           //minps         %xmm8,%xmm2
6439   65,15,93,216,                           //minps         %xmm8,%xmm3
6440   72,173,                                 //lods          %ds:(%rsi),%rax
6441   255,224,                                //jmpq          *%rax
6442 };
6443 
// sk_clamp_a_sse2: x86-64/SSE2 machine code that first clamps xmm3 to at
// most 1.0 and then clamps xmm0, xmm1 and xmm2 to at most the new xmm3:
//   xmm3 = min(xmm3, 1.0);  xmmN = min(xmmN, xmm3) for N = 0..2
// xmm8 is scratch. It then fetches the next 8-byte word from (%rsi) and
// jumps to it.
// NOTE(review): presumably xmm3 is alpha and xmm0..2 are r,g,b (inferred from
// the name) -- confirm against the generating source.
6444 CODE const uint8_t sk_clamp_a_sse2[] = {
  // xmm8 = 1.0 (0x3f800000) in every lane.
6445   184,0,0,128,63,                         //mov           $0x3f800000,%eax
6446   102,68,15,110,192,                      //movd          %eax,%xmm8
6447   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
6448   65,15,93,216,                           //minps         %xmm8,%xmm3
6449   15,93,195,                              //minps         %xmm3,%xmm0
6450   15,93,203,                              //minps         %xmm3,%xmm1
6451   15,93,211,                              //minps         %xmm3,%xmm2
6452   72,173,                                 //lods          %ds:(%rsi),%rax
6453   255,224,                                //jmpq          *%rax
6454 };
6455 
// sk_set_rgb_sse2: x86-64/SSE2 machine code that fetches a pointer from
// (%rsi) and broadcasts three consecutive floats read through it:
//   xmm0 = float at +0, xmm1 = float at +4, xmm2 = float at +8,
// each replicated across all four lanes. xmm3 is left untouched. It then
// fetches the next 8-byte word from (%rsi) and jumps to it.
6456 CODE const uint8_t sk_set_rgb_sse2[] = {
6457   72,173,                                 //lods          %ds:(%rsi),%rax
6458   243,15,16,0,                            //movss         (%rax),%xmm0
6459   243,15,16,72,4,                         //movss         0x4(%rax),%xmm1
6460   15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
6461   15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
6462   243,15,16,80,8,                         //movss         0x8(%rax),%xmm2
6463   15,198,210,0,                           //shufps        $0x0,%xmm2,%xmm2
6464   72,173,                                 //lods          %ds:(%rsi),%rax
6465   255,224,                                //jmpq          *%rax
6466 };
6467 
// sk_swap_rb_sse2: x86-64/SSE2 machine code that exchanges xmm0 and xmm2,
// using xmm8 as the temporary; xmm1 and xmm3 are untouched. It then jumps to
// the 8-byte word it fetched from (%rsi).
// NOTE(review): presumably xmm0 is red and xmm2 is blue (inferred from the
// name) -- confirm against the generating source.
6468 CODE const uint8_t sk_swap_rb_sse2[] = {
6469   68,15,40,192,                           //movaps        %xmm0,%xmm8
6470   72,173,                                 //lods          %ds:(%rsi),%rax
6471   15,40,194,                              //movaps        %xmm2,%xmm0
6472   65,15,40,208,                           //movaps        %xmm8,%xmm2
6473   255,224,                                //jmpq          *%rax
6474 };
6475 
// sk_swap_sse2: x86-64/SSE2 machine code that exchanges the register group
// xmm0..xmm3 with xmm4..xmm7 (xmm0<->xmm4, xmm1<->xmm5, xmm2<->xmm6,
// xmm3<->xmm7), using xmm8..xmm11 as temporaries. It then jumps to the
// 8-byte word it fetched from (%rsi).
6476 CODE const uint8_t sk_swap_sse2[] = {
  // Save xmm3, xmm2, xmm1, xmm0 into xmm8, xmm9, xmm10, xmm11.
6477   68,15,40,195,                           //movaps        %xmm3,%xmm8
6478   68,15,40,202,                           //movaps        %xmm2,%xmm9
6479   68,15,40,209,                           //movaps        %xmm1,%xmm10
6480   68,15,40,216,                           //movaps        %xmm0,%xmm11
6481   72,173,                                 //lods          %ds:(%rsi),%rax
  // xmm0..3 <- xmm4..7.
6482   15,40,196,                              //movaps        %xmm4,%xmm0
6483   15,40,205,                              //movaps        %xmm5,%xmm1
6484   15,40,214,                              //movaps        %xmm6,%xmm2
6485   15,40,223,                              //movaps        %xmm7,%xmm3
  // xmm4..7 <- saved copies of the old xmm0..3.
6486   65,15,40,227,                           //movaps        %xmm11,%xmm4
6487   65,15,40,234,                           //movaps        %xmm10,%xmm5
6488   65,15,40,241,                           //movaps        %xmm9,%xmm6
6489   65,15,40,248,                           //movaps        %xmm8,%xmm7
6490   255,224,                                //jmpq          *%rax
6491 };
6492 
// sk_move_src_dst_sse2: x86-64/SSE2 machine code that copies xmm0..xmm3 into
// xmm4..xmm7 (xmm0..xmm3 keep their values), then jumps to the 8-byte word it
// fetched from (%rsi).
6493 CODE const uint8_t sk_move_src_dst_sse2[] = {
6494   72,173,                                 //lods          %ds:(%rsi),%rax
6495   15,40,224,                              //movaps        %xmm0,%xmm4
6496   15,40,233,                              //movaps        %xmm1,%xmm5
6497   15,40,242,                              //movaps        %xmm2,%xmm6
6498   15,40,251,                              //movaps        %xmm3,%xmm7
6499   255,224,                                //jmpq          *%rax
6500 };
6501 
// sk_move_dst_src_sse2: x86-64/SSE2 machine code that copies xmm4..xmm7 into
// xmm0..xmm3 (xmm4..xmm7 keep their values), then jumps to the 8-byte word it
// fetched from (%rsi).
6502 CODE const uint8_t sk_move_dst_src_sse2[] = {
6503   72,173,                                 //lods          %ds:(%rsi),%rax
6504   15,40,196,                              //movaps        %xmm4,%xmm0
6505   15,40,205,                              //movaps        %xmm5,%xmm1
6506   15,40,214,                              //movaps        %xmm6,%xmm2
6507   15,40,223,                              //movaps        %xmm7,%xmm3
6508   255,224,                                //jmpq          *%rax
6509 };
6510 
// sk_premul_sse2: x86-64/SSE2 machine code that multiplies xmm0, xmm1 and
// xmm2 lane-wise by xmm3 (xmm3 itself is unchanged), then fetches the next
// 8-byte word from (%rsi) and jumps to it.
// NOTE(review): presumably xmm0..2 are r,g,b and xmm3 is alpha (inferred from
// the name) -- confirm against the generating source.
6511 CODE const uint8_t sk_premul_sse2[] = {
6512   15,89,195,                              //mulps         %xmm3,%xmm0
6513   15,89,203,                              //mulps         %xmm3,%xmm1
6514   15,89,211,                              //mulps         %xmm3,%xmm2
6515   72,173,                                 //lods          %ds:(%rsi),%rax
6516   255,224,                                //jmpq          *%rax
6517 };
6518 
// sk_unpremul_sse2: x86-64/SSE2 machine code computing, per lane,
//   scale = (xmm3 != 0) ? 1.0 / xmm3 : 0
//   xmm0 *= scale;  xmm1 *= scale;  xmm2 *= scale
// xmm3 is unchanged; xmm8 and xmm9 are scratch. It then fetches the next
// 8-byte word from (%rsi) and jumps to it.
// NOTE(review): presumably xmm0..2 are r,g,b and xmm3 is alpha (inferred from
// the name) -- confirm against the generating source.
6519 CODE const uint8_t sk_unpremul_sse2[] = {
6520   69,15,87,192,                           //xorps         %xmm8,%xmm8
  // xmm9 = 1.0 (0x3f800000) in every lane, divided by xmm3.
6521   184,0,0,128,63,                         //mov           $0x3f800000,%eax
6522   102,68,15,110,200,                      //movd          %eax,%xmm9
6523   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
6524   68,15,94,203,                           //divps         %xmm3,%xmm9
  // xmm8 = all-ones where 0 != xmm3; ANDing keeps 1/xmm3 there and leaves 0
  // in lanes where xmm3 == 0, so the division result is discarded for them.
6525   68,15,194,195,4,                        //cmpneqps      %xmm3,%xmm8
6526   69,15,84,193,                           //andps         %xmm9,%xmm8
6527   65,15,89,192,                           //mulps         %xmm8,%xmm0
6528   65,15,89,200,                           //mulps         %xmm8,%xmm1
6529   65,15,89,208,                           //mulps         %xmm8,%xmm2
6530   72,173,                                 //lods          %ds:(%rsi),%rax
6531   255,224,                                //jmpq          *%rax
6532 };
6533 
// sk_from_srgb_sse2: x86-64/SSE2 machine code that applies the same piecewise
// function f to every lane of xmm0, xmm1 and xmm2 (xmm3 is not touched):
//   f(s) = s * 0x3d9e8391                                  if s < 0x3d6147ae
//        = (s * 0x3e99999a + 0x3f328f5c) * s*s + 0x3b23d70a  otherwise
// The immediates are IEEE-754 single bit patterns, roughly 0.0774, 0.055,
// 0.3, 0.6975 and 0.0025 respectively. xmm8..xmm15 are scratch. It then
// fetches the next 8-byte word from (%rsi) and jumps to it.
// NOTE(review): judging by the name this approximates the sRGB-to-linear
// transfer curve -- confirm against the generating source.
6534 CODE const uint8_t sk_from_srgb_sse2[] = {
  // xmm8 = 0x3d9e8391 broadcast; xmm13 = xmm0 * xmm8 (low branch for xmm0).
6535   184,145,131,158,61,                     //mov           $0x3d9e8391,%eax
6536   102,68,15,110,192,                      //movd          %eax,%xmm8
6537   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
6538   69,15,40,232,                           //movaps        %xmm8,%xmm13
6539   68,15,89,232,                           //mulps         %xmm0,%xmm13
  // xmm12 = xmm0 squared.
6540   68,15,40,224,                           //movaps        %xmm0,%xmm12
6541   69,15,89,228,                           //mulps         %xmm12,%xmm12
  // xmm9 = 0x3e99999a, xmm10 = 0x3f328f5c (broadcast);
  // xmm14 = xmm0 * xmm9 + xmm10.
6542   184,154,153,153,62,                     //mov           $0x3e99999a,%eax
6543   102,68,15,110,200,                      //movd          %eax,%xmm9
6544   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
6545   184,92,143,50,63,                       //mov           $0x3f328f5c,%eax
6546   102,68,15,110,208,                      //movd          %eax,%xmm10
6547   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
6548   69,15,40,241,                           //movaps        %xmm9,%xmm14
6549   68,15,89,240,                           //mulps         %xmm0,%xmm14
6550   69,15,88,242,                           //addps         %xmm10,%xmm14
  // xmm11 = 0x3b23d70a broadcast; xmm14 = xmm14 * xmm0^2 + xmm11 (high branch).
6551   184,10,215,35,59,                       //mov           $0x3b23d70a,%eax
6552   102,68,15,110,216,                      //movd          %eax,%xmm11
6553   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
6554   69,15,89,244,                           //mulps         %xmm12,%xmm14
6555   69,15,88,243,                           //addps         %xmm11,%xmm14
  // xmm12 = 0x3d6147ae broadcast (threshold); xmm0 becomes the s < threshold
  // mask and then the and/andn/or blend of the low and high branches.
6556   184,174,71,97,61,                       //mov           $0x3d6147ae,%eax
6557   102,68,15,110,224,                      //movd          %eax,%xmm12
6558   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
6559   65,15,194,196,1,                        //cmpltps       %xmm12,%xmm0
6560   68,15,84,232,                           //andps         %xmm0,%xmm13
6561   65,15,85,198,                           //andnps        %xmm14,%xmm0
6562   65,15,86,197,                           //orps          %xmm13,%xmm0
  // Same computation for xmm1 (low branch in xmm13, high branch in xmm15).
6563   69,15,40,232,                           //movaps        %xmm8,%xmm13
6564   68,15,89,233,                           //mulps         %xmm1,%xmm13
6565   68,15,40,241,                           //movaps        %xmm1,%xmm14
6566   69,15,89,246,                           //mulps         %xmm14,%xmm14
6567   69,15,40,249,                           //movaps        %xmm9,%xmm15
6568   68,15,89,249,                           //mulps         %xmm1,%xmm15
6569   69,15,88,250,                           //addps         %xmm10,%xmm15
6570   69,15,89,254,                           //mulps         %xmm14,%xmm15
6571   69,15,88,251,                           //addps         %xmm11,%xmm15
6572   65,15,194,204,1,                        //cmpltps       %xmm12,%xmm1
6573   68,15,84,233,                           //andps         %xmm1,%xmm13
6574   65,15,85,207,                           //andnps        %xmm15,%xmm1
6575   65,15,86,205,                           //orps          %xmm13,%xmm1
  // Same computation for xmm2; the constant registers xmm8 and xmm9 are
  // consumed in place here because this is their last use.
6576   68,15,89,194,                           //mulps         %xmm2,%xmm8
6577   68,15,40,234,                           //movaps        %xmm2,%xmm13
6578   69,15,89,237,                           //mulps         %xmm13,%xmm13
6579   68,15,89,202,                           //mulps         %xmm2,%xmm9
6580   69,15,88,202,                           //addps         %xmm10,%xmm9
6581   69,15,89,205,                           //mulps         %xmm13,%xmm9
6582   69,15,88,203,                           //addps         %xmm11,%xmm9
6583   65,15,194,212,1,                        //cmpltps       %xmm12,%xmm2
6584   68,15,84,194,                           //andps         %xmm2,%xmm8
6585   65,15,85,209,                           //andnps        %xmm9,%xmm2
6586   65,15,86,208,                           //orps          %xmm8,%xmm2
6587   72,173,                                 //lods          %ds:(%rsi),%rax
6588   255,224,                                //jmpq          *%rax
6589 };
6590 
// sk_to_srgb_sse2: x86-64/SSE2 machine code that applies the same piecewise
// function g to every lane of xmm0, xmm1 and xmm2 (xmm3 is not touched).
// With q = rsqrtps(l) (approximate 1/sqrt(l)):
//   g(l) = l * 0x41475c29                                   if l < 0x3b8ce704
//        = min(1.0, rcpps(q) * 0x3f306fce + rsqrtps(q) * 0x3ed287c2
//                   - 0x3dca57a8)                           otherwise
// The immediates are IEEE-754 single bit patterns, roughly 12.46, 0.0043,
// 0.689206, 0.411192 and 0.0988; rcpps(q) and rsqrtps(q) are the hardware
// approximations of l^(1/2) and l^(1/4). xmm8..xmm15 are scratch. It then
// fetches the next 8-byte word from (%rsi) and jumps to it.
// NOTE(review): judging by the name this approximates the linear-to-sRGB
// transfer curve -- confirm against the generating source.
6591 CODE const uint8_t sk_to_srgb_sse2[] = {
  // xmm15 ~= sqrt(xmm0), xmm13 ~= xmm0^(1/4), both via rsqrtps(xmm0).
6592   68,15,82,192,                           //rsqrtps       %xmm0,%xmm8
6593   69,15,83,248,                           //rcpps         %xmm8,%xmm15
6594   69,15,82,232,                           //rsqrtps       %xmm8,%xmm13
  // xmm8 = 0x41475c29 broadcast; xmm14 = xmm0 * xmm8 (low branch for xmm0).
6595   184,41,92,71,65,                        //mov           $0x41475c29,%eax
6596   102,68,15,110,192,                      //movd          %eax,%xmm8
6597   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
6598   69,15,40,240,                           //movaps        %xmm8,%xmm14
6599   68,15,89,240,                           //mulps         %xmm0,%xmm14
  // Broadcast constants: xmm9 = 1.0, xmm10 = 0x3ed287c2, xmm11 = 0x3f306fce.
6600   184,0,0,128,63,                         //mov           $0x3f800000,%eax
6601   102,68,15,110,200,                      //movd          %eax,%xmm9
6602   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
6603   184,194,135,210,62,                     //mov           $0x3ed287c2,%eax
6604   102,68,15,110,208,                      //movd          %eax,%xmm10
6605   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
6606   184,206,111,48,63,                      //mov           $0x3f306fce,%eax
6607   102,68,15,110,216,                      //movd          %eax,%xmm11
6608   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
  // xmm12 = -(0x3dca57a8): the xor flips the IEEE-754 sign bit.
6609   184,168,87,202,61,                      //mov           $0x3dca57a8,%eax
6610   53,0,0,0,128,                           //xor           $0x80000000,%eax
6611   102,68,15,110,224,                      //movd          %eax,%xmm12
6612   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
  // High branch for xmm0: xmm15 = min(1.0, xmm15*xmm11 + xmm12 + xmm13*xmm10).
6613   69,15,89,251,                           //mulps         %xmm11,%xmm15
6614   69,15,88,252,                           //addps         %xmm12,%xmm15
6615   69,15,89,234,                           //mulps         %xmm10,%xmm13
6616   69,15,88,239,                           //addps         %xmm15,%xmm13
6617   69,15,40,249,                           //movaps        %xmm9,%xmm15
6618   69,15,93,253,                           //minps         %xmm13,%xmm15
  // xmm13 = 0x3b8ce704 broadcast (threshold); xmm0 becomes the l < threshold
  // mask and then the and/andn/or blend of the low and high branches.
6619   184,4,231,140,59,                       //mov           $0x3b8ce704,%eax
6620   102,68,15,110,232,                      //movd          %eax,%xmm13
6621   69,15,198,237,0,                        //shufps        $0x0,%xmm13,%xmm13
6622   65,15,194,197,1,                        //cmpltps       %xmm13,%xmm0
6623   68,15,84,240,                           //andps         %xmm0,%xmm14
6624   65,15,85,199,                           //andnps        %xmm15,%xmm0
6625   65,15,86,198,                           //orps          %xmm14,%xmm0
  // Same computation for xmm1.
6626   68,15,82,241,                           //rsqrtps       %xmm1,%xmm14
6627   69,15,83,254,                           //rcpps         %xmm14,%xmm15
6628   69,15,82,246,                           //rsqrtps       %xmm14,%xmm14
6629   69,15,89,251,                           //mulps         %xmm11,%xmm15
6630   69,15,88,252,                           //addps         %xmm12,%xmm15
6631   69,15,89,242,                           //mulps         %xmm10,%xmm14
6632   69,15,88,247,                           //addps         %xmm15,%xmm14
6633   69,15,40,249,                           //movaps        %xmm9,%xmm15
6634   69,15,93,254,                           //minps         %xmm14,%xmm15
6635   69,15,40,240,                           //movaps        %xmm8,%xmm14
6636   68,15,89,241,                           //mulps         %xmm1,%xmm14
6637   65,15,194,205,1,                        //cmpltps       %xmm13,%xmm1
6638   68,15,84,241,                           //andps         %xmm1,%xmm14
6639   65,15,85,207,                           //andnps        %xmm15,%xmm1
6640   65,15,86,206,                           //orps          %xmm14,%xmm1
  // Same computation for xmm2; the constant registers xmm11, xmm9 and xmm8
  // are overwritten in place here because this is their last use.
6641   68,15,82,242,                           //rsqrtps       %xmm2,%xmm14
6642   69,15,83,254,                           //rcpps         %xmm14,%xmm15
6643   69,15,89,251,                           //mulps         %xmm11,%xmm15
6644   69,15,88,252,                           //addps         %xmm12,%xmm15
6645   69,15,82,222,                           //rsqrtps       %xmm14,%xmm11
6646   69,15,89,218,                           //mulps         %xmm10,%xmm11
6647   69,15,88,223,                           //addps         %xmm15,%xmm11
6648   69,15,93,203,                           //minps         %xmm11,%xmm9
6649   68,15,89,194,                           //mulps         %xmm2,%xmm8
6650   65,15,194,213,1,                        //cmpltps       %xmm13,%xmm2
6651   68,15,84,194,                           //andps         %xmm2,%xmm8
6652   65,15,85,209,                           //andnps        %xmm9,%xmm2
6653   65,15,86,208,                           //orps          %xmm8,%xmm2
6654   72,173,                                 //lods          %ds:(%rsi),%rax
6655   255,224,                                //jmpq          *%rax
6656 };
6657 
// sk_scale_1_float_sse2: x86-64/SSE2 machine code that fetches a pointer
// from (%rsi), reads one float through it, broadcasts it into xmm8 and
// multiplies xmm0..xmm3 by it. It then fetches the next 8-byte word from
// (%rsi) and jumps to it.
6658 CODE const uint8_t sk_scale_1_float_sse2[] = {
6659   72,173,                                 //lods          %ds:(%rsi),%rax
6660   243,68,15,16,0,                         //movss         (%rax),%xmm8
6661   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
6662   65,15,89,192,                           //mulps         %xmm8,%xmm0
6663   65,15,89,200,                           //mulps         %xmm8,%xmm1
6664   65,15,89,208,                           //mulps         %xmm8,%xmm2
6665   65,15,89,216,                           //mulps         %xmm8,%xmm3
6666   72,173,                                 //lods          %ds:(%rsi),%rax
6667   255,224,                                //jmpq          *%rax
6668 };
6669 
// sk_scale_u8_sse2: x86-64/SSE2 machine code that scales xmm0..xmm3 by four
// per-lane factors derived from bytes in memory:
//   base = *(pointer fetched from (%rsi))
//   c[i] = float(byte at base + %rdi + i) * 0x3b808081   (about 1/255)
//   xmmN *= c   for N = 0..3
// xmm8 and xmm9 are scratch. It then fetches the next 8-byte word from
// (%rsi) and jumps to it.
// NOTE(review): presumably %rdi is the current pixel index and the bytes are
// 8-bit coverage -- not visible here; confirm against the generating source.
6670 CODE const uint8_t sk_scale_u8_sse2[] = {
6671   72,173,                                 //lods          %ds:(%rsi),%rax
6672   72,139,0,                               //mov           (%rax),%rax
  // Load 4 bytes and zero-extend each to a 32-bit lane, then convert to float.
6673   102,68,15,110,4,56,                     //movd          (%rax,%rdi,1),%xmm8
6674   102,69,15,239,201,                      //pxor          %xmm9,%xmm9
6675   102,69,15,96,193,                       //punpcklbw     %xmm9,%xmm8
6676   102,69,15,97,193,                       //punpcklwd     %xmm9,%xmm8
6677   69,15,91,192,                           //cvtdq2ps      %xmm8,%xmm8
  // xmm9 = bytes-as-float * 0x3b808081.
6678   184,129,128,128,59,                     //mov           $0x3b808081,%eax
6679   102,68,15,110,200,                      //movd          %eax,%xmm9
6680   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
6681   69,15,89,200,                           //mulps         %xmm8,%xmm9
6682   65,15,89,193,                           //mulps         %xmm9,%xmm0
6683   65,15,89,201,                           //mulps         %xmm9,%xmm1
6684   65,15,89,209,                           //mulps         %xmm9,%xmm2
6685   65,15,89,217,                           //mulps         %xmm9,%xmm3
6686   72,173,                                 //lods          %ds:(%rsi),%rax
6687   255,224,                                //jmpq          *%rax
6688 };
6689 
// sk_lerp_1_float_sse2: x86-64/SSE2 machine code that fetches a pointer from
// (%rsi), reads one float t through it, broadcasts it into xmm8 and computes
//   xmm0 = (xmm0 - xmm4) * t + xmm4;  xmm1 = (xmm1 - xmm5) * t + xmm5;
//   xmm2 = (xmm2 - xmm6) * t + xmm6;  xmm3 = (xmm3 - xmm7) * t + xmm7
// It then fetches the next 8-byte word from (%rsi) and jumps to it.
// NOTE(review): presumably xmm0..3 / xmm4..7 are source / destination colour
// channels (inferred from sibling names) -- confirm against the generator.
6690 CODE const uint8_t sk_lerp_1_float_sse2[] = {
6691   72,173,                                 //lods          %ds:(%rsi),%rax
6692   243,68,15,16,0,                         //movss         (%rax),%xmm8
6693   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
6694   15,92,196,                              //subps         %xmm4,%xmm0
6695   65,15,89,192,                           //mulps         %xmm8,%xmm0
6696   15,88,196,                              //addps         %xmm4,%xmm0
6697   15,92,205,                              //subps         %xmm5,%xmm1
6698   65,15,89,200,                           //mulps         %xmm8,%xmm1
6699   15,88,205,                              //addps         %xmm5,%xmm1
6700   15,92,214,                              //subps         %xmm6,%xmm2
6701   65,15,89,208,                           //mulps         %xmm8,%xmm2
6702   15,88,214,                              //addps         %xmm6,%xmm2
6703   15,92,223,                              //subps         %xmm7,%xmm3
6704   65,15,89,216,                           //mulps         %xmm8,%xmm3
6705   15,88,223,                              //addps         %xmm7,%xmm3
6706   72,173,                                 //lods          %ds:(%rsi),%rax
6707   255,224,                                //jmpq          *%rax
6708 };
6709 
// sk_lerp_u8_sse2: x86-64/SSE2 machine code that interpolates xmm0..xmm3
// towards xmm4..xmm7 with per-lane factors derived from bytes in memory:
//   base = *(pointer fetched from (%rsi))
//   t[i] = float(byte at base + %rdi + i) * 0x3b808081   (about 1/255)
//   xmmN = (xmmN - xmm(N+4)) * t + xmm(N+4)   for N = 0..3
// xmm8 and xmm9 are scratch. It then fetches the next 8-byte word from
// (%rsi) and jumps to it.
// NOTE(review): presumably %rdi is the current pixel index and the bytes are
// 8-bit coverage -- not visible here; confirm against the generating source.
6710 CODE const uint8_t sk_lerp_u8_sse2[] = {
6711   72,173,                                 //lods          %ds:(%rsi),%rax
6712   72,139,0,                               //mov           (%rax),%rax
  // Load 4 bytes and zero-extend each to a 32-bit lane, then convert to float.
6713   102,68,15,110,4,56,                     //movd          (%rax,%rdi,1),%xmm8
6714   102,69,15,239,201,                      //pxor          %xmm9,%xmm9
6715   102,69,15,96,193,                       //punpcklbw     %xmm9,%xmm8
6716   102,69,15,97,193,                       //punpcklwd     %xmm9,%xmm8
6717   69,15,91,192,                           //cvtdq2ps      %xmm8,%xmm8
  // xmm9 = t = bytes-as-float * 0x3b808081.
6718   184,129,128,128,59,                     //mov           $0x3b808081,%eax
6719   102,68,15,110,200,                      //movd          %eax,%xmm9
6720   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
6721   69,15,89,200,                           //mulps         %xmm8,%xmm9
6722   15,92,196,                              //subps         %xmm4,%xmm0
6723   65,15,89,193,                           //mulps         %xmm9,%xmm0
6724   15,88,196,                              //addps         %xmm4,%xmm0
6725   15,92,205,                              //subps         %xmm5,%xmm1
6726   65,15,89,201,                           //mulps         %xmm9,%xmm1
6727   15,88,205,                              //addps         %xmm5,%xmm1
6728   15,92,214,                              //subps         %xmm6,%xmm2
6729   65,15,89,209,                           //mulps         %xmm9,%xmm2
6730   15,88,214,                              //addps         %xmm6,%xmm2
6731   15,92,223,                              //subps         %xmm7,%xmm3
6732   65,15,89,217,                           //mulps         %xmm9,%xmm3
6733   15,88,223,                              //addps         %xmm7,%xmm3
6734   72,173,                                 //lods          %ds:(%rsi),%rax
6735   255,224,                                //jmpq          *%rax
6736 };
6737 
// sk_lerp_565_sse2: x86-64/SSE2 machine code that interpolates xmm0, xmm1 and
// xmm2 towards xmm4, xmm5 and xmm6 using three separate factors unpacked from
// 16-bit values in memory, then forces xmm3 to 1.0:
//   base = *(pointer fetched from (%rsi));  v[i] = uint16 at base + 2*%rdi + 2*i
//   t0 = float(v & 0xf800) * 0x37842108     (about 1/63488)
//   t1 = float(v & 0x07e0) * 0x3a020821     (about 1/2016)
//   t2 = float(v & 0x001f) * 0x3d042108     (about 1/31)
//   xmmN = (xmmN - xmm(N+4)) * tN + xmm(N+4)   for N = 0..2
//   xmm3 = 1.0 in every lane
// The incoming xmm3 is discarded: it is used as scratch from the first pxor
// onwards. xmm8..xmm11 are scratch as well. It then fetches the next 8-byte
// word from (%rsi) and jumps to it.
// NOTE(review): presumably the 16-bit values are RGB565 coverage and %rdi is
// the pixel index (inferred from the name) -- confirm against the generator.
6738 CODE const uint8_t sk_lerp_565_sse2[] = {
6739   72,173,                                 //lods          %ds:(%rsi),%rax
6740   72,139,0,                               //mov           (%rax),%rax
  // Load four 16-bit values and zero-extend each to a 32-bit lane in xmm8.
6741   243,68,15,126,4,120,                    //movq          (%rax,%rdi,2),%xmm8
6742   102,15,239,219,                         //pxor          %xmm3,%xmm3
6743   102,68,15,97,195,                       //punpcklwd     %xmm3,%xmm8
  // xmm10 = t0 = float(v & 0xf800) * 0x37842108.
6744   184,0,248,0,0,                          //mov           $0xf800,%eax
6745   102,15,110,216,                         //movd          %eax,%xmm3
6746   102,15,112,219,0,                       //pshufd        $0x0,%xmm3,%xmm3
6747   102,65,15,219,216,                      //pand          %xmm8,%xmm3
6748   68,15,91,203,                           //cvtdq2ps      %xmm3,%xmm9
6749   184,8,33,132,55,                        //mov           $0x37842108,%eax
6750   102,68,15,110,208,                      //movd          %eax,%xmm10
6751   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
6752   69,15,89,209,                           //mulps         %xmm9,%xmm10
  // xmm11 = t1 = float(v & 0x7e0) * 0x3a020821.
6753   184,224,7,0,0,                          //mov           $0x7e0,%eax
6754   102,15,110,216,                         //movd          %eax,%xmm3
6755   102,15,112,219,0,                       //pshufd        $0x0,%xmm3,%xmm3
6756   102,65,15,219,216,                      //pand          %xmm8,%xmm3
6757   68,15,91,203,                           //cvtdq2ps      %xmm3,%xmm9
6758   184,33,8,2,58,                          //mov           $0x3a020821,%eax
6759   102,68,15,110,216,                      //movd          %eax,%xmm11
6760   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
6761   69,15,89,217,                           //mulps         %xmm9,%xmm11
  // xmm3 = t2 = float(v & 0x1f) * 0x3d042108.
6762   184,31,0,0,0,                           //mov           $0x1f,%eax
6763   102,15,110,216,                         //movd          %eax,%xmm3
6764   102,15,112,219,0,                       //pshufd        $0x0,%xmm3,%xmm3
6765   102,65,15,219,216,                      //pand          %xmm8,%xmm3
6766   68,15,91,195,                           //cvtdq2ps      %xmm3,%xmm8
6767   184,8,33,4,61,                          //mov           $0x3d042108,%eax
6768   102,15,110,216,                         //movd          %eax,%xmm3
6769   15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
6770   65,15,89,216,                           //mulps         %xmm8,%xmm3
  // Interpolate xmm0..2 towards xmm4..6 with t0, t1, t2.
6771   15,92,196,                              //subps         %xmm4,%xmm0
6772   65,15,89,194,                           //mulps         %xmm10,%xmm0
6773   15,88,196,                              //addps         %xmm4,%xmm0
6774   15,92,205,                              //subps         %xmm5,%xmm1
6775   65,15,89,203,                           //mulps         %xmm11,%xmm1
6776   15,88,205,                              //addps         %xmm5,%xmm1
6777   15,92,214,                              //subps         %xmm6,%xmm2
6778   15,89,211,                              //mulps         %xmm3,%xmm2
6779   15,88,214,                              //addps         %xmm6,%xmm2
  // xmm3 = 1.0 (0x3f800000) in every lane.
6780   184,0,0,128,63,                         //mov           $0x3f800000,%eax
6781   102,15,110,216,                         //movd          %eax,%xmm3
6782   15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
6783   72,173,                                 //lods          %ds:(%rsi),%rax
6784   255,224,                                //jmpq          *%rax
6785 };
6786 
// sk_load_tables_sse2: raw x86-64 SSE2 machine code; the trailing comment on
// each row is its AT&T-syntax disassembly.
// The first lods fetches a context pointer (ctx) from the qword stream at
// (%rsi). Four 32-bit pixels are loaded from the pointer stored at ctx+0,
// indexed by %rdi, and split per byte:
//   xmm0 = float table at ctx+0x08, indexed by bits  0..7  of each pixel
//   xmm1 = float table at ctx+0x10, indexed by bits  8..15 of each pixel
//   xmm2 = float table at ctx+0x18, indexed by bits 16..23 of each pixel
//   xmm3 = float(bits 24..31) * 0x3b808081 (float bit pattern, ~1/255)
// The closing lods/jmpq fetches the next qword from the (%rsi) stream and
// jumps to it.
6787 CODE const uint8_t sk_load_tables_sse2[] = {
6788   72,173,                                 //lods          %ds:(%rsi),%rax
6789   72,139,8,                               //mov           (%rax),%rcx
6790   76,139,64,8,                            //mov           0x8(%rax),%r8
6791   243,68,15,111,4,185,                    //movdqu        (%rcx,%rdi,4),%xmm8
       // xmm0 = 0xff in every lane; xmm9 / xmm10 / xmm0 = byte 1 / byte 2 / byte 0 of each pixel.
6792   185,255,0,0,0,                          //mov           $0xff,%ecx
6793   102,15,110,193,                         //movd          %ecx,%xmm0
6794   102,15,112,192,0,                       //pshufd        $0x0,%xmm0,%xmm0
6795   102,69,15,111,200,                      //movdqa        %xmm8,%xmm9
6796   102,65,15,114,209,8,                    //psrld         $0x8,%xmm9
6797   102,68,15,219,200,                      //pand          %xmm0,%xmm9
6798   102,69,15,111,208,                      //movdqa        %xmm8,%xmm10
6799   102,65,15,114,210,16,                   //psrld         $0x10,%xmm10
6800   102,68,15,219,208,                      //pand          %xmm0,%xmm10
6801   102,65,15,219,192,                      //pand          %xmm8,%xmm0
       // SSE2 has no gather: move the four byte-0 indices to general registers
       // and rebuild xmm0 from four scalar loads out of the table at ctx+0x08.
6802   102,15,112,216,78,                      //pshufd        $0x4e,%xmm0,%xmm3
6803   102,72,15,126,217,                      //movq          %xmm3,%rcx
6804   65,137,201,                             //mov           %ecx,%r9d
6805   72,193,233,32,                          //shr           $0x20,%rcx
6806   102,73,15,126,194,                      //movq          %xmm0,%r10
6807   69,137,211,                             //mov           %r10d,%r11d
6808   73,193,234,32,                          //shr           $0x20,%r10
6809   243,67,15,16,28,144,                    //movss         (%r8,%r10,4),%xmm3
6810   243,65,15,16,4,136,                     //movss         (%r8,%rcx,4),%xmm0
6811   15,20,216,                              //unpcklps      %xmm0,%xmm3
6812   243,67,15,16,4,152,                     //movss         (%r8,%r11,4),%xmm0
6813   243,67,15,16,12,136,                    //movss         (%r8,%r9,4),%xmm1
6814   15,20,193,                              //unpcklps      %xmm1,%xmm0
6815   15,20,195,                              //unpcklps      %xmm3,%xmm0
       // Same lookup for byte 1 using the table at ctx+0x10. The shr by 0x1e
       // (not 0x20) leaves lane1*4 in %rcx, a byte offset, which is why the
       // load through %rcx below uses scale 1.
6816   76,139,64,16,                           //mov           0x10(%rax),%r8
6817   102,65,15,112,201,78,                   //pshufd        $0x4e,%xmm9,%xmm1
6818   102,73,15,126,202,                      //movq          %xmm1,%r10
6819   77,137,209,                             //mov           %r10,%r9
6820   73,193,233,32,                          //shr           $0x20,%r9
6821   102,76,15,126,201,                      //movq          %xmm9,%rcx
6822   65,137,203,                             //mov           %ecx,%r11d
6823   65,129,227,255,255,255,0,               //and           $0xffffff,%r11d
6824   72,193,233,30,                          //shr           $0x1e,%rcx
6825   65,129,226,255,255,255,0,               //and           $0xffffff,%r10d
6826   243,65,15,16,28,8,                      //movss         (%r8,%rcx,1),%xmm3
6827   243,67,15,16,12,136,                    //movss         (%r8,%r9,4),%xmm1
6828   15,20,217,                              //unpcklps      %xmm1,%xmm3
6829   243,67,15,16,12,152,                    //movss         (%r8,%r11,4),%xmm1
6830   243,67,15,16,20,144,                    //movss         (%r8,%r10,4),%xmm2
6831   15,20,202,                              //unpcklps      %xmm2,%xmm1
6832   15,20,203,                              //unpcklps      %xmm3,%xmm1
       // Same lookup for byte 2 using the table at ctx+0x18. %rax (ctx) is
       // reused as a scratch register from here on.
6833   76,139,72,24,                           //mov           0x18(%rax),%r9
6834   102,65,15,112,210,78,                   //pshufd        $0x4e,%xmm10,%xmm2
6835   102,72,15,126,209,                      //movq          %xmm2,%rcx
6836   68,15,183,193,                          //movzwl        %cx,%r8d
6837   72,193,233,32,                          //shr           $0x20,%rcx
6838   102,76,15,126,208,                      //movq          %xmm10,%rax
6839   68,15,183,208,                          //movzwl        %ax,%r10d
6840   72,193,232,30,                          //shr           $0x1e,%rax
6841   243,69,15,16,12,1,                      //movss         (%r9,%rax,1),%xmm9
6842   243,65,15,16,20,137,                    //movss         (%r9,%rcx,4),%xmm2
6843   68,15,20,202,                           //unpcklps      %xmm2,%xmm9
6844   243,67,15,16,20,145,                    //movss         (%r9,%r10,4),%xmm2
6845   243,67,15,16,28,129,                    //movss         (%r9,%r8,4),%xmm3
6846   15,20,211,                              //unpcklps      %xmm3,%xmm2
6847   65,15,20,209,                           //unpcklps      %xmm9,%xmm2
       // Top byte is not table-mapped: convert to float and scale by ~1/255.
6848   102,65,15,114,208,24,                   //psrld         $0x18,%xmm8
6849   69,15,91,192,                           //cvtdq2ps      %xmm8,%xmm8
6850   184,129,128,128,59,                     //mov           $0x3b808081,%eax
6851   102,15,110,216,                         //movd          %eax,%xmm3
6852   15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
6853   65,15,89,216,                           //mulps         %xmm8,%xmm3
6854   72,173,                                 //lods          %ds:(%rsi),%rax
6855   255,224,                                //jmpq          *%rax
6856 };
6857 
// sk_load_a8_sse2: raw x86-64 SSE2 machine code; the trailing comment on each
// row is its AT&T-syntax disassembly.
// Fetches a context pointer from the qword stream at (%rsi), dereferences it
// to a base pointer, and loads 4 bytes at base + %rdi. Each byte is
// zero-extended to 32 bits, converted to float and multiplied by 0x3b808081
// (float bit pattern, ~1/255); the result is left in xmm3. xmm0, xmm1 and
// xmm2 are zeroed. The closing lods/jmpq jumps to the next qword in the stream.
6858 CODE const uint8_t sk_load_a8_sse2[] = {
6859   72,173,                                 //lods          %ds:(%rsi),%rax
6860   72,139,0,                               //mov           (%rax),%rax
6861   102,15,110,4,56,                        //movd          (%rax,%rdi,1),%xmm0
       // Unpacking against a zero register widens bytes -> words -> dwords.
6862   102,15,239,201,                         //pxor          %xmm1,%xmm1
6863   102,15,96,193,                          //punpcklbw     %xmm1,%xmm0
6864   102,15,97,193,                          //punpcklwd     %xmm1,%xmm0
6865   15,91,192,                              //cvtdq2ps      %xmm0,%xmm0
6866   184,129,128,128,59,                     //mov           $0x3b808081,%eax
6867   102,15,110,216,                         //movd          %eax,%xmm3
6868   15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
6869   15,89,216,                              //mulps         %xmm0,%xmm3
6870   72,173,                                 //lods          %ds:(%rsi),%rax
       // Clear the other three vectors before handing off.
6871   15,87,192,                              //xorps         %xmm0,%xmm0
6872   102,15,239,201,                         //pxor          %xmm1,%xmm1
6873   15,87,210,                              //xorps         %xmm2,%xmm2
6874   255,224,                                //jmpq          *%rax
6875 };
6876 
// sk_store_a8_sse2: raw x86-64 SSE2 machine code; the trailing comment on each
// row is its AT&T-syntax disassembly.
// Fetches a context pointer from the qword stream at (%rsi), dereferences it
// to a base pointer, and writes 4 bytes at base + %rdi. Each byte is
// xmm3 * 0x437f0000 (float bit pattern of 255.0) converted to an integer
// with cvtps2dq and packed down to 8 bits. xmm0..xmm3 are not modified; xmm8
// is scratch. The closing lods/jmpq jumps to the next qword in the stream.
6877 CODE const uint8_t sk_store_a8_sse2[] = {
6878   72,173,                                 //lods          %ds:(%rsi),%rax
6879   72,139,0,                               //mov           (%rax),%rax
6880   185,0,0,127,67,                         //mov           $0x437f0000,%ecx
6881   102,68,15,110,193,                      //movd          %ecx,%xmm8
6882   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
6883   68,15,89,195,                           //mulps         %xmm3,%xmm8
6884   102,69,15,91,192,                       //cvtps2dq      %xmm8,%xmm8
       // Shift left then arithmetic-right by 16 sign-extends the low 16 bits of
       // each lane, so the signed-saturating packssdw below keeps those bits.
6885   102,65,15,114,240,16,                   //pslld         $0x10,%xmm8
6886   102,65,15,114,224,16,                   //psrad         $0x10,%xmm8
6887   102,69,15,107,192,                      //packssdw      %xmm8,%xmm8
6888   102,69,15,103,192,                      //packuswb      %xmm8,%xmm8
6889   102,68,15,126,4,56,                     //movd          %xmm8,(%rax,%rdi,1)
6890   72,173,                                 //lods          %ds:(%rsi),%rax
6891   255,224,                                //jmpq          *%rax
6892 };
6893 
// sk_load_565_sse2: raw x86-64 SSE2 machine code; the trailing comment on each
// row is its AT&T-syntax disassembly.
// Fetches a context pointer from the qword stream at (%rsi), dereferences it
// to a base pointer, and loads 8 bytes (four 16-bit pixels) at base + %rdi*2.
// Each pixel p is zero-extended to 32 bits and unpacked into floats:
//   xmm0 = float(p & 0xf800) * 0x37842108   (float bit pattern, ~1/0xf800)
//   xmm1 = float(p & 0x07e0) * 0x3a020821   (float bit pattern, ~1/0x7e0)
//   xmm2 = float(p & 0x001f) * 0x3d042108   (float bit pattern, ~1/0x1f)
//   xmm3 = 0x3f800000 (1.0f) in every lane
// The fields are masked in place and the scale factors absorb the bit
// position, so no shifts are needed. The closing lods/jmpq jumps to the next
// qword in the stream.
6894 CODE const uint8_t sk_load_565_sse2[] = {
6895   72,173,                                 //lods          %ds:(%rsi),%rax
6896   72,139,0,                               //mov           (%rax),%rax
6897   243,15,126,20,120,                      //movq          (%rax,%rdi,2),%xmm2
6898   102,15,239,192,                         //pxor          %xmm0,%xmm0
6899   102,15,97,208,                          //punpcklwd     %xmm0,%xmm2
       // Field in bits 11..15 -> xmm0.
6900   184,0,248,0,0,                          //mov           $0xf800,%eax
6901   102,15,110,192,                         //movd          %eax,%xmm0
6902   102,15,112,192,0,                       //pshufd        $0x0,%xmm0,%xmm0
6903   102,15,219,194,                         //pand          %xmm2,%xmm0
6904   15,91,200,                              //cvtdq2ps      %xmm0,%xmm1
6905   184,8,33,132,55,                        //mov           $0x37842108,%eax
6906   102,15,110,192,                         //movd          %eax,%xmm0
6907   15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
6908   15,89,193,                              //mulps         %xmm1,%xmm0
       // Field in bits 5..10 -> xmm1.
6909   184,224,7,0,0,                          //mov           $0x7e0,%eax
6910   102,15,110,200,                         //movd          %eax,%xmm1
6911   102,15,112,201,0,                       //pshufd        $0x0,%xmm1,%xmm1
6912   102,15,219,202,                         //pand          %xmm2,%xmm1
6913   15,91,217,                              //cvtdq2ps      %xmm1,%xmm3
6914   184,33,8,2,58,                          //mov           $0x3a020821,%eax
6915   102,15,110,200,                         //movd          %eax,%xmm1
6916   15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
6917   15,89,203,                              //mulps         %xmm3,%xmm1
       // Field in bits 0..4 -> xmm2 (the raw pixels in xmm2 are consumed here).
6918   184,31,0,0,0,                           //mov           $0x1f,%eax
6919   102,15,110,216,                         //movd          %eax,%xmm3
6920   102,15,112,219,0,                       //pshufd        $0x0,%xmm3,%xmm3
6921   102,15,219,218,                         //pand          %xmm2,%xmm3
6922   15,91,219,                              //cvtdq2ps      %xmm3,%xmm3
6923   184,8,33,4,61,                          //mov           $0x3d042108,%eax
6924   102,15,110,208,                         //movd          %eax,%xmm2
6925   15,198,210,0,                           //shufps        $0x0,%xmm2,%xmm2
6926   15,89,211,                              //mulps         %xmm3,%xmm2
       // xmm3 = 1.0f in all lanes.
6927   184,0,0,128,63,                         //mov           $0x3f800000,%eax
6928   102,15,110,216,                         //movd          %eax,%xmm3
6929   15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
6930   72,173,                                 //lods          %ds:(%rsi),%rax
6931   255,224,                                //jmpq          *%rax
6932 };
6933 
// sk_store_565_sse2: raw x86-64 SSE2 machine code; the trailing comment on
// each row is its AT&T-syntax disassembly.
// Fetches a context pointer from the qword stream at (%rsi), dereferences it
// to a base pointer, and writes 8 bytes (four 16-bit pixels) at
// base + %rdi*2. Each pixel is assembled as
//   (int(xmm0 * 31.0) << 11) | (int(xmm1 * 63.0) << 5) | int(xmm2 * 31.0)
// where 31.0 / 63.0 are the immediates 0x41f80000 / 0x427c0000 and the
// float->int conversion is cvtps2dq. xmm3 is not read; xmm0..xmm3 are not
// modified. No clamping is performed here. The closing lods/jmpq jumps to the
// next qword in the stream.
6934 CODE const uint8_t sk_store_565_sse2[] = {
6935   72,173,                                 //lods          %ds:(%rsi),%rax
6936   72,139,0,                               //mov           (%rax),%rax
6937   185,0,0,248,65,                         //mov           $0x41f80000,%ecx
6938   102,68,15,110,193,                      //movd          %ecx,%xmm8
6939   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
6940   69,15,40,200,                           //movaps        %xmm8,%xmm9
6941   68,15,89,200,                           //mulps         %xmm0,%xmm9
6942   102,69,15,91,201,                       //cvtps2dq      %xmm9,%xmm9
6943   102,65,15,114,241,11,                   //pslld         $0xb,%xmm9
6944   185,0,0,124,66,                         //mov           $0x427c0000,%ecx
6945   102,68,15,110,209,                      //movd          %ecx,%xmm10
6946   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
6947   68,15,89,209,                           //mulps         %xmm1,%xmm10
6948   102,69,15,91,210,                       //cvtps2dq      %xmm10,%xmm10
6949   102,65,15,114,242,5,                    //pslld         $0x5,%xmm10
6950   102,69,15,235,209,                      //por           %xmm9,%xmm10
       // xmm8 still holds the 31.0 broadcast; reuse it for the xmm2 field.
6951   68,15,89,194,                           //mulps         %xmm2,%xmm8
6952   102,69,15,91,192,                       //cvtps2dq      %xmm8,%xmm8
6953   102,69,15,86,194,                       //orpd          %xmm10,%xmm8
       // Sign-extend the low 16 bits of each lane so the signed-saturating
       // packssdw keeps those 16 bits unchanged.
6954   102,65,15,114,240,16,                   //pslld         $0x10,%xmm8
6955   102,65,15,114,224,16,                   //psrad         $0x10,%xmm8
6956   102,69,15,107,192,                      //packssdw      %xmm8,%xmm8
6957   102,68,15,214,4,120,                    //movq          %xmm8,(%rax,%rdi,2)
6958   72,173,                                 //lods          %ds:(%rsi),%rax
6959   255,224,                                //jmpq          *%rax
6960 };
6961 
// sk_load_8888_sse2: raw x86-64 SSE2 machine code; the trailing comment on
// each row is its AT&T-syntax disassembly.
// Fetches a context pointer from the qword stream at (%rsi), dereferences it
// to a base pointer, and loads 16 bytes (four 32-bit pixels) at
// base + %rdi*4. Each byte lane is converted to float and multiplied by
// 0x3b808081 (float bit pattern, ~1/255):
//   xmm0 = bits 0..7, xmm1 = bits 8..15, xmm2 = bits 16..23, xmm3 = bits 24..31
// xmm8 is scratch (holds the scale factor). The closing lods/jmpq jumps to
// the next qword in the stream.
6962 CODE const uint8_t sk_load_8888_sse2[] = {
6963   72,173,                                 //lods          %ds:(%rsi),%rax
6964   72,139,0,                               //mov           (%rax),%rax
6965   243,15,111,28,184,                      //movdqu        (%rax,%rdi,4),%xmm3
       // xmm0 = 0xff mask in every lane; isolate the three low bytes.
6966   184,255,0,0,0,                          //mov           $0xff,%eax
6967   102,15,110,192,                         //movd          %eax,%xmm0
6968   102,15,112,192,0,                       //pshufd        $0x0,%xmm0,%xmm0
6969   102,15,111,203,                         //movdqa        %xmm3,%xmm1
6970   102,15,114,209,8,                       //psrld         $0x8,%xmm1
6971   102,15,219,200,                         //pand          %xmm0,%xmm1
6972   102,15,111,211,                         //movdqa        %xmm3,%xmm2
6973   102,15,114,210,16,                      //psrld         $0x10,%xmm2
6974   102,15,219,208,                         //pand          %xmm0,%xmm2
6975   102,15,219,195,                         //pand          %xmm3,%xmm0
       // Convert each byte lane to float and scale by ~1/255.
6976   15,91,192,                              //cvtdq2ps      %xmm0,%xmm0
6977   184,129,128,128,59,                     //mov           $0x3b808081,%eax
6978   102,68,15,110,192,                      //movd          %eax,%xmm8
6979   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
6980   65,15,89,192,                           //mulps         %xmm8,%xmm0
6981   15,91,201,                              //cvtdq2ps      %xmm1,%xmm1
6982   65,15,89,200,                           //mulps         %xmm8,%xmm1
6983   15,91,210,                              //cvtdq2ps      %xmm2,%xmm2
6984   65,15,89,208,                           //mulps         %xmm8,%xmm2
       // The logical shift by 24 leaves only the top byte, so no mask is needed.
6985   102,15,114,211,24,                      //psrld         $0x18,%xmm3
6986   15,91,219,                              //cvtdq2ps      %xmm3,%xmm3
6987   65,15,89,216,                           //mulps         %xmm8,%xmm3
6988   72,173,                                 //lods          %ds:(%rsi),%rax
6989   255,224,                                //jmpq          *%rax
6990 };
6991 
// sk_store_8888_sse2: raw x86-64 SSE2 machine code; the trailing comment on
// each row is its AT&T-syntax disassembly.
// Fetches a context pointer from the qword stream at (%rsi), dereferences it
// to a base pointer, and writes 16 bytes (four 32-bit pixels) at
// base + %rdi*4. With s = 0x437f0000 (float bit pattern of 255.0) and
// float->int via cvtps2dq, each pixel is
//   int(xmm0*s) | int(xmm1*s) << 8 | int(xmm2*s) << 16 | int(xmm3*s) << 24
// No clamping or masking is done here, so out-of-range inputs would spill
// into neighbouring byte fields. xmm0..xmm3 are not modified. The closing
// lods/jmpq jumps to the next qword in the stream.
6992 CODE const uint8_t sk_store_8888_sse2[] = {
6993   72,173,                                 //lods          %ds:(%rsi),%rax
6994   72,139,0,                               //mov           (%rax),%rax
6995   185,0,0,127,67,                         //mov           $0x437f0000,%ecx
6996   102,68,15,110,193,                      //movd          %ecx,%xmm8
6997   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
6998   69,15,40,200,                           //movaps        %xmm8,%xmm9
6999   68,15,89,200,                           //mulps         %xmm0,%xmm9
7000   102,69,15,91,201,                       //cvtps2dq      %xmm9,%xmm9
7001   69,15,40,208,                           //movaps        %xmm8,%xmm10
7002   68,15,89,209,                           //mulps         %xmm1,%xmm10
7003   102,69,15,91,210,                       //cvtps2dq      %xmm10,%xmm10
7004   102,65,15,114,242,8,                    //pslld         $0x8,%xmm10
7005   102,69,15,235,209,                      //por           %xmm9,%xmm10
7006   69,15,40,200,                           //movaps        %xmm8,%xmm9
7007   68,15,89,202,                           //mulps         %xmm2,%xmm9
7008   102,69,15,91,201,                       //cvtps2dq      %xmm9,%xmm9
7009   102,65,15,114,241,16,                   //pslld         $0x10,%xmm9
7010   68,15,89,195,                           //mulps         %xmm3,%xmm8
7011   102,69,15,91,192,                       //cvtps2dq      %xmm8,%xmm8
7012   102,65,15,114,240,24,                   //pslld         $0x18,%xmm8
       // Merge the four shifted byte fields into one 32-bit pixel per lane.
7013   102,69,15,235,193,                      //por           %xmm9,%xmm8
7014   102,69,15,235,194,                      //por           %xmm10,%xmm8
7015   243,68,15,127,4,184,                    //movdqu        %xmm8,(%rax,%rdi,4)
7016   72,173,                                 //lods          %ds:(%rsi),%rax
7017   255,224,                                //jmpq          *%rax
7018 };
7019 
// sk_load_f16_sse2: raw x86-64 SSE2 machine code; the trailing comment on each
// row is its AT&T-syntax disassembly.
// Fetches a context pointer from the qword stream at (%rsi), dereferences it
// to a base pointer, and loads 32 bytes at base + %rdi*8 (four pixels of four
// 16-bit values each). The words are transposed so each output register
// holds one channel for all four pixels:
//   xmm0 = word 0, xmm1 = word 1, xmm2 = word 2, xmm3 = word 3 of each pixel
// Each 16-bit value is widened to 32 bits, shifted left by 13 and multiplied
// by 0x77800000 (float bit pattern of 2^112) to rebias it into a 32-bit
// float. Words that compare below 0x0400 as signed 16-bit integers are
// replaced with zero first. The closing lods/jmpq jumps to the next qword in
// the stream.
7020 CODE const uint8_t sk_load_f16_sse2[] = {
7021   72,173,                                 //lods          %ds:(%rsi),%rax
7022   72,139,0,                               //mov           (%rax),%rax
7023   243,15,111,4,248,                       //movdqu        (%rax,%rdi,8),%xmm0
7024   243,15,111,76,248,16,                   //movdqu        0x10(%rax,%rdi,8),%xmm1
       // Two rounds of word interleaving transpose the 4x4 block of words:
       // afterwards xmm8 = words 0 and 1 of all pixels, xmm2 = words 2 and 3.
7025   102,15,111,208,                         //movdqa        %xmm0,%xmm2
7026   102,15,97,209,                          //punpcklwd     %xmm1,%xmm2
7027   102,15,105,193,                         //punpckhwd     %xmm1,%xmm0
7028   102,68,15,111,194,                      //movdqa        %xmm2,%xmm8
7029   102,68,15,97,192,                       //punpcklwd     %xmm0,%xmm8
7030   102,15,105,208,                         //punpckhwd     %xmm0,%xmm2
       // mask = (0x0400 > word), signed; pandn keeps only words where the mask
       // is clear, zeroing the rest.
7031   184,0,4,0,4,                            //mov           $0x4000400,%eax
7032   102,15,110,192,                         //movd          %eax,%xmm0
7033   102,15,112,216,0,                       //pshufd        $0x0,%xmm0,%xmm3
7034   102,15,111,203,                         //movdqa        %xmm3,%xmm1
7035   102,65,15,101,200,                      //pcmpgtw       %xmm8,%xmm1
7036   102,65,15,223,200,                      //pandn         %xmm8,%xmm1
7037   102,15,101,218,                         //pcmpgtw       %xmm2,%xmm3
7038   102,15,223,218,                         //pandn         %xmm2,%xmm3
       // Zero-extend each word to a dword, shift left by 13, multiply by 2^112.
7039   102,69,15,239,192,                      //pxor          %xmm8,%xmm8
7040   102,15,111,193,                         //movdqa        %xmm1,%xmm0
7041   102,65,15,97,192,                       //punpcklwd     %xmm8,%xmm0
7042   102,15,114,240,13,                      //pslld         $0xd,%xmm0
7043   184,0,0,128,119,                        //mov           $0x77800000,%eax
7044   102,15,110,208,                         //movd          %eax,%xmm2
7045   102,68,15,112,202,0,                    //pshufd        $0x0,%xmm2,%xmm9
7046   65,15,89,193,                           //mulps         %xmm9,%xmm0
7047   102,65,15,105,200,                      //punpckhwd     %xmm8,%xmm1
7048   102,15,114,241,13,                      //pslld         $0xd,%xmm1
7049   65,15,89,201,                           //mulps         %xmm9,%xmm1
7050   102,15,111,211,                         //movdqa        %xmm3,%xmm2
7051   102,65,15,97,208,                       //punpcklwd     %xmm8,%xmm2
7052   102,15,114,242,13,                      //pslld         $0xd,%xmm2
7053   65,15,89,209,                           //mulps         %xmm9,%xmm2
7054   102,65,15,105,216,                      //punpckhwd     %xmm8,%xmm3
7055   102,15,114,243,13,                      //pslld         $0xd,%xmm3
7056   65,15,89,217,                           //mulps         %xmm9,%xmm3
7057   72,173,                                 //lods          %ds:(%rsi),%rax
7058   255,224,                                //jmpq          *%rax
7059 };
7060 
// sk_store_f16_sse2: raw x86-64 SSE2 machine code; the trailing comment on
// each row is its AT&T-syntax disassembly.
// Fetches a context pointer from the qword stream at (%rsi), dereferences it
// to a base pointer, and writes 32 bytes at base + %rdi*8 (four pixels of
// four 16-bit values each). Each of xmm0..xmm3 is multiplied by 0x07800000
// (float bit pattern of 2^-112) and its bits are logically shifted right by
// 13; the four results are then interleaved so that each pixel stores the
// values derived from xmm0, xmm1, xmm2, xmm3 in that order.
// NOTE(review): the por-based merge assumes every shifted value fits in 16
// bits; nothing here masks or clamps - confirm callers guarantee the range.
// xmm0..xmm3 are not modified. The closing lods/jmpq jumps to the next qword
// in the stream.
7061 CODE const uint8_t sk_store_f16_sse2[] = {
7062   72,173,                                 //lods          %ds:(%rsi),%rax
7063   72,139,0,                               //mov           (%rax),%rax
7064   185,0,0,128,7,                          //mov           $0x7800000,%ecx
7065   102,68,15,110,193,                      //movd          %ecx,%xmm8
7066   102,69,15,112,192,0,                    //pshufd        $0x0,%xmm8,%xmm8
7067   102,69,15,111,200,                      //movdqa        %xmm8,%xmm9
7068   68,15,89,200,                           //mulps         %xmm0,%xmm9
7069   102,65,15,114,209,13,                   //psrld         $0xd,%xmm9
7070   102,69,15,111,208,                      //movdqa        %xmm8,%xmm10
7071   68,15,89,209,                           //mulps         %xmm1,%xmm10
7072   102,65,15,114,210,13,                   //psrld         $0xd,%xmm10
7073   102,69,15,111,216,                      //movdqa        %xmm8,%xmm11
7074   68,15,89,218,                           //mulps         %xmm2,%xmm11
7075   102,65,15,114,211,13,                   //psrld         $0xd,%xmm11
7076   68,15,89,195,                           //mulps         %xmm3,%xmm8
7077   102,65,15,114,208,13,                   //psrld         $0xd,%xmm8
       // The 2-byte whole-register shift moves the xmm1-derived (resp.
       // xmm3-derived) value into the upper half of each dword; OR it with the
       // xmm0-derived (resp. xmm2-derived) value in the lower half.
7078   102,65,15,115,250,2,                    //pslldq        $0x2,%xmm10
7079   102,69,15,235,209,                      //por           %xmm9,%xmm10
7080   102,65,15,115,248,2,                    //pslldq        $0x2,%xmm8
7081   102,69,15,235,195,                      //por           %xmm11,%xmm8
       // Interleave dwords: low halves give pixels 0-1, high halves pixels 2-3.
7082   102,69,15,111,202,                      //movdqa        %xmm10,%xmm9
7083   102,69,15,98,200,                       //punpckldq     %xmm8,%xmm9
7084   243,68,15,127,12,248,                   //movdqu        %xmm9,(%rax,%rdi,8)
7085   102,69,15,106,208,                      //punpckhdq     %xmm8,%xmm10
7086   243,68,15,127,84,248,16,                //movdqu        %xmm10,0x10(%rax,%rdi,8)
7087   72,173,                                 //lods          %ds:(%rsi),%rax
7088   255,224,                                //jmpq          *%rax
7089 };
7090 
// sk_store_f32_sse2: raw x86-64 SSE2 machine code; the trailing comment on
// each row is its AT&T-syntax disassembly.
// Fetches a context pointer from the qword stream at (%rsi), dereferences it
// to a base pointer, and writes 64 bytes at base + %rdi*16. xmm0..xmm3 (one
// value per pixel in each lane) are transposed so that each 16-byte store
// holds lane i of xmm0, xmm1, xmm2, xmm3 in that order, for i = 0..3.
// Values are stored as-is with no scaling or conversion; xmm0..xmm3 are not
// modified. The closing lods/jmpq jumps to the next qword in the stream.
7091 CODE const uint8_t sk_store_f32_sse2[] = {
7092   72,173,                                 //lods          %ds:(%rsi),%rax
7093   72,139,0,                               //mov           (%rax),%rax
       // rcx = rdi * 16: byte offset of the first pixel (16 bytes per pixel).
7094   72,137,249,                             //mov           %rdi,%rcx
7095   72,193,225,4,                           //shl           $0x4,%rcx
       // 4x4 float transpose: interleave pairs (xmm0,xmm1) and (xmm2,xmm3),
       // then combine 64-bit halves.
7096   68,15,40,192,                           //movaps        %xmm0,%xmm8
7097   68,15,40,200,                           //movaps        %xmm0,%xmm9
7098   68,15,20,201,                           //unpcklps      %xmm1,%xmm9
7099   68,15,40,210,                           //movaps        %xmm2,%xmm10
7100   68,15,40,218,                           //movaps        %xmm2,%xmm11
7101   68,15,20,219,                           //unpcklps      %xmm3,%xmm11
7102   68,15,21,193,                           //unpckhps      %xmm1,%xmm8
7103   68,15,21,211,                           //unpckhps      %xmm3,%xmm10
7104   69,15,40,225,                           //movaps        %xmm9,%xmm12
7105   102,69,15,20,227,                       //unpcklpd      %xmm11,%xmm12
7106   69,15,18,217,                           //movhlps       %xmm9,%xmm11
7107   69,15,40,200,                           //movaps        %xmm8,%xmm9
7108   102,69,15,20,202,                       //unpcklpd      %xmm10,%xmm9
7109   69,15,18,208,                           //movhlps       %xmm8,%xmm10
       // Unaligned stores of pixels 0..3 (xmm12, xmm11, xmm9, xmm10).
7110   102,68,15,17,36,8,                      //movupd        %xmm12,(%rax,%rcx,1)
7111   68,15,17,92,8,16,                       //movups        %xmm11,0x10(%rax,%rcx,1)
7112   102,68,15,17,76,8,32,                   //movupd        %xmm9,0x20(%rax,%rcx,1)
7113   68,15,17,84,8,48,                       //movups        %xmm10,0x30(%rax,%rcx,1)
7114   72,173,                                 //lods          %ds:(%rsi),%rax
7115   255,224,                                //jmpq          *%rax
7116 };
7117 
// sk_clamp_x_sse2: raw x86-64 SSE2 machine code; the trailing comment on each
// row is its AT&T-syntax disassembly.
// Fetches a pointer from the qword stream at (%rsi) and reads one float
// (the limit) directly through it. Clamps every lane of xmm0 to
// [0, limit_bits - 1], where the upper bound is the limit's bit pattern with
// 1 subtracted as an integer (for a positive limit, the next float below
// it). Other registers in xmm0..xmm3 are untouched; xmm8/xmm9 are scratch.
// The closing lods/jmpq jumps to the next qword in the stream.
7118 CODE const uint8_t sk_clamp_x_sse2[] = {
7119   72,173,                                 //lods          %ds:(%rsi),%rax
       // xmm8 = max(0, xmm0)
7120   69,15,87,192,                           //xorps         %xmm8,%xmm8
7121   68,15,95,192,                           //maxps         %xmm0,%xmm8
7122   243,68,15,16,8,                         //movss         (%rax),%xmm9
7123   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
       // pcmpeqd of a register with itself gives all-ones (-1); adding that to
       // the limit's bits subtracts 1 from the bit pattern.
7124   102,15,118,192,                         //pcmpeqd       %xmm0,%xmm0
7125   102,65,15,254,193,                      //paddd         %xmm9,%xmm0
7126   68,15,93,192,                           //minps         %xmm0,%xmm8
7127   72,173,                                 //lods          %ds:(%rsi),%rax
7128   65,15,40,192,                           //movaps        %xmm8,%xmm0
7129   255,224,                                //jmpq          *%rax
7130 };
7131 
// sk_clamp_y_sse2: raw x86-64 SSE2 machine code; the trailing comment on each
// row is its AT&T-syntax disassembly.
// Fetches a pointer from the qword stream at (%rsi) and reads one float
// (the limit) directly through it. Clamps every lane of xmm1 to
// [0, limit_bits - 1], where the upper bound is the limit's bit pattern with
// 1 subtracted as an integer (for a positive limit, the next float below
// it). Other registers in xmm0..xmm3 are untouched; xmm8/xmm9 are scratch.
// The closing lods/jmpq jumps to the next qword in the stream.
7132 CODE const uint8_t sk_clamp_y_sse2[] = {
7133   72,173,                                 //lods          %ds:(%rsi),%rax
       // xmm8 = max(0, xmm1)
7134   69,15,87,192,                           //xorps         %xmm8,%xmm8
7135   68,15,95,193,                           //maxps         %xmm1,%xmm8
7136   243,68,15,16,8,                         //movss         (%rax),%xmm9
7137   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
       // pcmpeqd of a register with itself gives all-ones (-1); adding that to
       // the limit's bits subtracts 1 from the bit pattern.
7138   102,15,118,201,                         //pcmpeqd       %xmm1,%xmm1
7139   102,65,15,254,201,                      //paddd         %xmm9,%xmm1
7140   68,15,93,193,                           //minps         %xmm1,%xmm8
7141   72,173,                                 //lods          %ds:(%rsi),%rax
7142   65,15,40,200,                           //movaps        %xmm8,%xmm1
7143   255,224,                                //jmpq          *%rax
7144 };
7145 
// sk_repeat_x_sse2: raw x86-64 SSE2 machine code; the trailing comment on each
// row is its AT&T-syntax disassembly.
// Fetches a pointer from the qword stream at (%rsi) and reads one float
// (the limit) directly through it. For every lane of xmm0 it computes
//   xmm0 = min(xmm0 - floor(xmm0 / limit) * limit, limit_bits - 1)
// where limit_bits - 1 is the limit's bit pattern with 1 subtracted as an
// integer. floor is built from a truncating conversion plus a correction.
// Other registers in xmm0..xmm3 are untouched; xmm8..xmm11 are scratch.
// The closing lods/jmpq jumps to the next qword in the stream.
7146 CODE const uint8_t sk_repeat_x_sse2[] = {
7147   72,173,                                 //lods          %ds:(%rsi),%rax
7148   243,68,15,16,0,                         //movss         (%rax),%xmm8
7149   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
7150   68,15,40,200,                           //movaps        %xmm0,%xmm9
7151   69,15,94,200,                           //divps         %xmm8,%xmm9
       // floor(q): truncate toward zero, then subtract 1.0 wherever q < trunc(q).
7152   243,69,15,91,209,                       //cvttps2dq     %xmm9,%xmm10
7153   69,15,91,210,                           //cvtdq2ps      %xmm10,%xmm10
7154   69,15,194,202,1,                        //cmpltps       %xmm10,%xmm9
7155   184,0,0,128,63,                         //mov           $0x3f800000,%eax
7156   102,68,15,110,216,                      //movd          %eax,%xmm11
7157   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
7158   69,15,84,217,                           //andps         %xmm9,%xmm11
7159   69,15,92,211,                           //subps         %xmm11,%xmm10
7160   69,15,89,208,                           //mulps         %xmm8,%xmm10
7161   65,15,92,194,                           //subps         %xmm10,%xmm0
       // Upper bound: all-ones (-1) added to the limit's bit pattern.
7162   102,69,15,118,201,                      //pcmpeqd       %xmm9,%xmm9
7163   102,69,15,254,200,                      //paddd         %xmm8,%xmm9
7164   65,15,93,193,                           //minps         %xmm9,%xmm0
7165   72,173,                                 //lods          %ds:(%rsi),%rax
7166   255,224,                                //jmpq          *%rax
7167 };
7168 
// sk_repeat_y_sse2: raw x86-64 SSE2 machine code; the trailing comment on each
// row is its AT&T-syntax disassembly.
// Fetches a pointer from the qword stream at (%rsi) and reads one float
// (the limit) directly through it. For every lane of xmm1 it computes
//   xmm1 = min(xmm1 - floor(xmm1 / limit) * limit, limit_bits - 1)
// where limit_bits - 1 is the limit's bit pattern with 1 subtracted as an
// integer. floor is built from a truncating conversion plus a correction.
// Other registers in xmm0..xmm3 are untouched; xmm8..xmm11 are scratch.
// The closing lods/jmpq jumps to the next qword in the stream.
7169 CODE const uint8_t sk_repeat_y_sse2[] = {
7170   72,173,                                 //lods          %ds:(%rsi),%rax
7171   243,68,15,16,0,                         //movss         (%rax),%xmm8
7172   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
7173   68,15,40,201,                           //movaps        %xmm1,%xmm9
7174   69,15,94,200,                           //divps         %xmm8,%xmm9
       // floor(q): truncate toward zero, then subtract 1.0 wherever q < trunc(q).
7175   243,69,15,91,209,                       //cvttps2dq     %xmm9,%xmm10
7176   69,15,91,210,                           //cvtdq2ps      %xmm10,%xmm10
7177   69,15,194,202,1,                        //cmpltps       %xmm10,%xmm9
7178   184,0,0,128,63,                         //mov           $0x3f800000,%eax
7179   102,68,15,110,216,                      //movd          %eax,%xmm11
7180   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
7181   69,15,84,217,                           //andps         %xmm9,%xmm11
7182   69,15,92,211,                           //subps         %xmm11,%xmm10
7183   69,15,89,208,                           //mulps         %xmm8,%xmm10
7184   65,15,92,202,                           //subps         %xmm10,%xmm1
       // Upper bound: all-ones (-1) added to the limit's bit pattern.
7185   102,69,15,118,201,                      //pcmpeqd       %xmm9,%xmm9
7186   102,69,15,254,200,                      //paddd         %xmm8,%xmm9
7187   65,15,93,201,                           //minps         %xmm9,%xmm1
7188   72,173,                                 //lods          %ds:(%rsi),%rax
7189   255,224,                                //jmpq          *%rax
7190 };
7191 
// sk_mirror_x_sse2: raw x86-64 SSE2 machine code; the trailing comment on each
// row is its AT&T-syntax disassembly.
// Fetches a pointer from the qword stream at (%rsi) and reads one float
// (the limit) directly through it. With v = xmm0 - limit and
// period = 2 * limit, every lane of xmm0 becomes
//   min(|v - floor(v / period) * period - limit|, limit_bits - 1)
// where limit_bits - 1 is the limit's bit pattern with 1 subtracted as an
// integer. Other registers in xmm0..xmm3 are untouched; xmm8..xmm12 are
// scratch. The closing lods/jmpq jumps to the next qword in the stream.
7192 CODE const uint8_t sk_mirror_x_sse2[] = {
7193   72,173,                                 //lods          %ds:(%rsi),%rax
7194   243,68,15,16,8,                         //movss         (%rax),%xmm9
       // xmm8 = limit broadcast; xmm0 = v = x - limit; xmm9 = 2*limit broadcast.
7195   69,15,40,193,                           //movaps        %xmm9,%xmm8
7196   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
7197   65,15,92,192,                           //subps         %xmm8,%xmm0
7198   243,69,15,88,201,                       //addss         %xmm9,%xmm9
7199   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
7200   68,15,40,208,                           //movaps        %xmm0,%xmm10
7201   69,15,94,209,                           //divps         %xmm9,%xmm10
       // floor(q): truncate toward zero, then subtract 1.0 wherever q < trunc(q).
7202   243,69,15,91,218,                       //cvttps2dq     %xmm10,%xmm11
7203   69,15,91,219,                           //cvtdq2ps      %xmm11,%xmm11
7204   69,15,194,211,1,                        //cmpltps       %xmm11,%xmm10
7205   184,0,0,128,63,                         //mov           $0x3f800000,%eax
7206   102,68,15,110,224,                      //movd          %eax,%xmm12
7207   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
7208   69,15,84,226,                           //andps         %xmm10,%xmm12
7209   69,15,87,210,                           //xorps         %xmm10,%xmm10
7210   69,15,92,220,                           //subps         %xmm12,%xmm11
7211   69,15,89,217,                           //mulps         %xmm9,%xmm11
7212   65,15,92,195,                           //subps         %xmm11,%xmm0
7213   65,15,92,192,                           //subps         %xmm8,%xmm0
       // Absolute value: xmm10 = 0 - xmm0, then AND with xmm0 clears the sign bit.
7214   68,15,92,208,                           //subps         %xmm0,%xmm10
7215   65,15,84,194,                           //andps         %xmm10,%xmm0
       // Upper bound: all-ones (-1) added to the limit's bit pattern.
7216   102,69,15,118,201,                      //pcmpeqd       %xmm9,%xmm9
7217   102,69,15,254,200,                      //paddd         %xmm8,%xmm9
7218   65,15,93,193,                           //minps         %xmm9,%xmm0
7219   72,173,                                 //lods          %ds:(%rsi),%rax
7220   255,224,                                //jmpq          *%rax
7221 };
7222 
// sk_mirror_y_sse2: raw x86-64 SSE2 machine code; the trailing comment on each
// row is its AT&T-syntax disassembly.
// Fetches a pointer from the qword stream at (%rsi) and reads one float
// (the limit) directly through it. With v = xmm1 - limit and
// period = 2 * limit, every lane of xmm1 becomes
//   min(|v - floor(v / period) * period - limit|, limit_bits - 1)
// where limit_bits - 1 is the limit's bit pattern with 1 subtracted as an
// integer. Other registers in xmm0..xmm3 are untouched; xmm8..xmm12 are
// scratch. The closing lods/jmpq jumps to the next qword in the stream.
7223 CODE const uint8_t sk_mirror_y_sse2[] = {
7224   72,173,                                 //lods          %ds:(%rsi),%rax
7225   243,68,15,16,8,                         //movss         (%rax),%xmm9
       // xmm8 = limit broadcast; xmm1 = v = y - limit; xmm9 = 2*limit broadcast.
7226   69,15,40,193,                           //movaps        %xmm9,%xmm8
7227   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
7228   65,15,92,200,                           //subps         %xmm8,%xmm1
7229   243,69,15,88,201,                       //addss         %xmm9,%xmm9
7230   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
7231   68,15,40,209,                           //movaps        %xmm1,%xmm10
7232   69,15,94,209,                           //divps         %xmm9,%xmm10
       // floor(q): truncate toward zero, then subtract 1.0 wherever q < trunc(q).
7233   243,69,15,91,218,                       //cvttps2dq     %xmm10,%xmm11
7234   69,15,91,219,                           //cvtdq2ps      %xmm11,%xmm11
7235   69,15,194,211,1,                        //cmpltps       %xmm11,%xmm10
7236   184,0,0,128,63,                         //mov           $0x3f800000,%eax
7237   102,68,15,110,224,                      //movd          %eax,%xmm12
7238   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
7239   69,15,84,226,                           //andps         %xmm10,%xmm12
7240   69,15,87,210,                           //xorps         %xmm10,%xmm10
7241   69,15,92,220,                           //subps         %xmm12,%xmm11
7242   69,15,89,217,                           //mulps         %xmm9,%xmm11
7243   65,15,92,203,                           //subps         %xmm11,%xmm1
7244   65,15,92,200,                           //subps         %xmm8,%xmm1
       // Absolute value: xmm10 = 0 - xmm1, then AND with xmm1 clears the sign bit.
7245   68,15,92,209,                           //subps         %xmm1,%xmm10
7246   65,15,84,202,                           //andps         %xmm10,%xmm1
       // Upper bound: all-ones (-1) added to the limit's bit pattern.
7247   102,69,15,118,201,                      //pcmpeqd       %xmm9,%xmm9
7248   102,69,15,254,200,                      //paddd         %xmm8,%xmm9
7249   65,15,93,201,                           //minps         %xmm9,%xmm1
7250   72,173,                                 //lods          %ds:(%rsi),%rax
7251   255,224,                                //jmpq          *%rax
7252 };
7253 
// sk_luminance_to_alpha_sse2: raw x86-64 SSE2 machine code; the trailing
// comment on each row is its AT&T-syntax disassembly.
// Computes a weighted sum of xmm0, xmm1 and xmm2 into xmm3:
//   xmm3 = xmm0 * 0x3e59b3d0 + xmm1 * 0x3f371759 + xmm2 * 0x3d93dd98
// (float bit patterns, ~0.2126, ~0.7152 and ~0.0722 respectively), then
// zeroes xmm0, xmm1 and xmm2. The previous contents of xmm3 are discarded.
// The single lods fetches the next qword from the stream at (%rsi) and jmpq
// jumps to it.
7254 CODE const uint8_t sk_luminance_to_alpha_sse2[] = {
7255   184,208,179,89,62,                      //mov           $0x3e59b3d0,%eax
7256   102,15,110,216,                         //movd          %eax,%xmm3
7257   15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
7258   15,89,216,                              //mulps         %xmm0,%xmm3
7259   184,89,23,55,63,                        //mov           $0x3f371759,%eax
7260   102,15,110,192,                         //movd          %eax,%xmm0
7261   15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
7262   15,89,193,                              //mulps         %xmm1,%xmm0
7263   15,88,195,                              //addps         %xmm3,%xmm0
7264   184,152,221,147,61,                     //mov           $0x3d93dd98,%eax
7265   102,15,110,216,                         //movd          %eax,%xmm3
7266   15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
7267   15,89,218,                              //mulps         %xmm2,%xmm3
7268   15,88,216,                              //addps         %xmm0,%xmm3
7269   72,173,                                 //lods          %ds:(%rsi),%rax
       // Clear the three input vectors before handing off.
7270   15,87,192,                              //xorps         %xmm0,%xmm0
7271   15,87,201,                              //xorps         %xmm1,%xmm1
7272   15,87,210,                              //xorps         %xmm2,%xmm2
7273   255,224,                                //jmpq          *%rax
7274 };
7275 
// sk_matrix_2x3_sse2: pre-assembled SSE2 stage applying a 2x3 affine transform.
// The first lods loads a pointer into %rax; six floats m[0..5] are read from it
// (byte offsets 0x0..0x14). With x = incoming xmm0 and y = incoming xmm1:
//   xmm0 = m[0]*x + m[2]*y + m[4]
//   xmm1 = m[1]*x + m[3]*y + m[5]
// Scratch registers: xmm8..xmm11. Exit: second lods + jmpq *%rax.
7276 CODE const uint8_t sk_matrix_2x3_sse2[] = {
7277   68,15,40,201,                           //movaps        %xmm1,%xmm9
7278   68,15,40,192,                           //movaps        %xmm0,%xmm8
7279   72,173,                                 //lods          %ds:(%rsi),%rax
7280   243,15,16,0,                            //movss         (%rax),%xmm0
7281   243,15,16,72,4,                         //movss         0x4(%rax),%xmm1
7282   15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
7283   243,68,15,16,80,8,                      //movss         0x8(%rax),%xmm10
7284   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
7285   243,68,15,16,88,16,                     //movss         0x10(%rax),%xmm11
7286   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
7287   69,15,89,209,                           //mulps         %xmm9,%xmm10
7288   69,15,88,211,                           //addps         %xmm11,%xmm10
7289   65,15,89,192,                           //mulps         %xmm8,%xmm0
7290   65,15,88,194,                           //addps         %xmm10,%xmm0
7291   15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
7292   243,68,15,16,80,12,                     //movss         0xc(%rax),%xmm10
7293   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
7294   243,68,15,16,88,20,                     //movss         0x14(%rax),%xmm11
7295   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
7296   69,15,89,209,                           //mulps         %xmm9,%xmm10
7297   69,15,88,211,                           //addps         %xmm11,%xmm10
7298   65,15,89,200,                           //mulps         %xmm8,%xmm1
7299   65,15,88,202,                           //addps         %xmm10,%xmm1
7300   72,173,                                 //lods          %ds:(%rsi),%rax
7301   255,224,                                //jmpq          *%rax
7302 };
7303 
// sk_matrix_3x4_sse2: pre-assembled SSE2 stage applying a 3x4 affine transform.
// The first lods loads a pointer into %rax; twelve floats m[0..11] are read from it
// (byte offsets 0x0..0x2c). With x, y, z = incoming xmm0, xmm1, xmm2:
//   xmm0 = m[0]*x + m[3]*y + m[6]*z + m[9]
//   xmm1 = m[1]*x + m[4]*y + m[7]*z + m[10]
//   xmm2 = m[2]*x + m[5]*y + m[8]*z + m[11]
// Scratch registers: xmm8..xmm13. Exit: second lods + jmpq *%rax.
7304 CODE const uint8_t sk_matrix_3x4_sse2[] = {
7305   68,15,40,201,                           //movaps        %xmm1,%xmm9
7306   68,15,40,192,                           //movaps        %xmm0,%xmm8
7307   72,173,                                 //lods          %ds:(%rsi),%rax
7308   243,15,16,0,                            //movss         (%rax),%xmm0
7309   243,15,16,72,4,                         //movss         0x4(%rax),%xmm1
7310   15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
7311   243,68,15,16,80,12,                     //movss         0xc(%rax),%xmm10
7312   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
7313   243,68,15,16,88,24,                     //movss         0x18(%rax),%xmm11
7314   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
7315   243,68,15,16,96,36,                     //movss         0x24(%rax),%xmm12
7316   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
7317   68,15,89,218,                           //mulps         %xmm2,%xmm11
7318   69,15,88,220,                           //addps         %xmm12,%xmm11
7319   69,15,89,209,                           //mulps         %xmm9,%xmm10
7320   69,15,88,211,                           //addps         %xmm11,%xmm10
7321   65,15,89,192,                           //mulps         %xmm8,%xmm0
7322   65,15,88,194,                           //addps         %xmm10,%xmm0
7323   15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
7324   243,68,15,16,80,16,                     //movss         0x10(%rax),%xmm10
7325   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
7326   243,68,15,16,88,28,                     //movss         0x1c(%rax),%xmm11
7327   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
7328   243,68,15,16,96,40,                     //movss         0x28(%rax),%xmm12
7329   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
7330   68,15,89,218,                           //mulps         %xmm2,%xmm11
7331   69,15,88,220,                           //addps         %xmm12,%xmm11
7332   69,15,89,209,                           //mulps         %xmm9,%xmm10
7333   69,15,88,211,                           //addps         %xmm11,%xmm10
7334   65,15,89,200,                           //mulps         %xmm8,%xmm1
7335   65,15,88,202,                           //addps         %xmm10,%xmm1
7336   243,68,15,16,80,8,                      //movss         0x8(%rax),%xmm10
7337   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
7338   243,68,15,16,88,20,                     //movss         0x14(%rax),%xmm11
7339   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
7340   243,68,15,16,96,32,                     //movss         0x20(%rax),%xmm12
7341   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
7342   243,68,15,16,104,44,                    //movss         0x2c(%rax),%xmm13
7343   69,15,198,237,0,                        //shufps        $0x0,%xmm13,%xmm13
7344   68,15,89,226,                           //mulps         %xmm2,%xmm12
7345   69,15,88,229,                           //addps         %xmm13,%xmm12
7346   69,15,89,217,                           //mulps         %xmm9,%xmm11
7347   69,15,88,220,                           //addps         %xmm12,%xmm11
7348   69,15,89,208,                           //mulps         %xmm8,%xmm10
7349   69,15,88,211,                           //addps         %xmm11,%xmm10
7350   72,173,                                 //lods          %ds:(%rsi),%rax
7351   65,15,40,210,                           //movaps        %xmm10,%xmm2
7352   255,224,                                //jmpq          *%rax
7353 };
7354 
// sk_matrix_4x5_sse2: pre-assembled SSE2 stage applying a 4x5 affine transform.
// The first lods loads a pointer into %rax; twenty floats m[0..19] are read from it
// (byte offsets 0x0..0x4c). With x, y, z, w = incoming xmm0, xmm1, xmm2, xmm3:
//   xmm0 = m[0]*x + m[4]*y + m[8]*z  + m[12]*w + m[16]
//   xmm1 = m[1]*x + m[5]*y + m[9]*z  + m[13]*w + m[17]
//   xmm2 = m[2]*x + m[6]*y + m[10]*z + m[14]*w + m[18]
//   xmm3 = m[3]*x + m[7]*y + m[11]*z + m[15]*w + m[19]
// The new xmm2/xmm3 are accumulated in xmm10/xmm11 and copied over only at the end,
// because the original z and w are still needed as inputs for every row.
// Scratch registers: xmm8..xmm15. Exit: second lods + jmpq *%rax.
7355 CODE const uint8_t sk_matrix_4x5_sse2[] = {
7356   68,15,40,201,                           //movaps        %xmm1,%xmm9
7357   68,15,40,192,                           //movaps        %xmm0,%xmm8
7358   72,173,                                 //lods          %ds:(%rsi),%rax
7359   243,15,16,0,                            //movss         (%rax),%xmm0
7360   243,15,16,72,4,                         //movss         0x4(%rax),%xmm1
7361   15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
7362   243,68,15,16,80,16,                     //movss         0x10(%rax),%xmm10
7363   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
7364   243,68,15,16,88,32,                     //movss         0x20(%rax),%xmm11
7365   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
7366   243,68,15,16,96,48,                     //movss         0x30(%rax),%xmm12
7367   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
7368   243,68,15,16,104,64,                    //movss         0x40(%rax),%xmm13
7369   69,15,198,237,0,                        //shufps        $0x0,%xmm13,%xmm13
7370   68,15,89,227,                           //mulps         %xmm3,%xmm12
7371   69,15,88,229,                           //addps         %xmm13,%xmm12
7372   68,15,89,218,                           //mulps         %xmm2,%xmm11
7373   69,15,88,220,                           //addps         %xmm12,%xmm11
7374   69,15,89,209,                           //mulps         %xmm9,%xmm10
7375   69,15,88,211,                           //addps         %xmm11,%xmm10
7376   65,15,89,192,                           //mulps         %xmm8,%xmm0
7377   65,15,88,194,                           //addps         %xmm10,%xmm0
7378   15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
7379   243,68,15,16,80,20,                     //movss         0x14(%rax),%xmm10
7380   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
7381   243,68,15,16,88,36,                     //movss         0x24(%rax),%xmm11
7382   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
7383   243,68,15,16,96,52,                     //movss         0x34(%rax),%xmm12
7384   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
7385   243,68,15,16,104,68,                    //movss         0x44(%rax),%xmm13
7386   69,15,198,237,0,                        //shufps        $0x0,%xmm13,%xmm13
7387   68,15,89,227,                           //mulps         %xmm3,%xmm12
7388   69,15,88,229,                           //addps         %xmm13,%xmm12
7389   68,15,89,218,                           //mulps         %xmm2,%xmm11
7390   69,15,88,220,                           //addps         %xmm12,%xmm11
7391   69,15,89,209,                           //mulps         %xmm9,%xmm10
7392   69,15,88,211,                           //addps         %xmm11,%xmm10
7393   65,15,89,200,                           //mulps         %xmm8,%xmm1
7394   65,15,88,202,                           //addps         %xmm10,%xmm1
7395   243,68,15,16,80,8,                      //movss         0x8(%rax),%xmm10
7396   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
7397   243,68,15,16,88,24,                     //movss         0x18(%rax),%xmm11
7398   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
7399   243,68,15,16,96,40,                     //movss         0x28(%rax),%xmm12
7400   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
7401   243,68,15,16,104,56,                    //movss         0x38(%rax),%xmm13
7402   69,15,198,237,0,                        //shufps        $0x0,%xmm13,%xmm13
7403   243,68,15,16,112,72,                    //movss         0x48(%rax),%xmm14
7404   69,15,198,246,0,                        //shufps        $0x0,%xmm14,%xmm14
7405   68,15,89,235,                           //mulps         %xmm3,%xmm13
7406   69,15,88,238,                           //addps         %xmm14,%xmm13
7407   68,15,89,226,                           //mulps         %xmm2,%xmm12
7408   69,15,88,229,                           //addps         %xmm13,%xmm12
7409   69,15,89,217,                           //mulps         %xmm9,%xmm11
7410   69,15,88,220,                           //addps         %xmm12,%xmm11
7411   69,15,89,208,                           //mulps         %xmm8,%xmm10
7412   69,15,88,211,                           //addps         %xmm11,%xmm10
7413   243,68,15,16,88,12,                     //movss         0xc(%rax),%xmm11
7414   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
7415   243,68,15,16,96,28,                     //movss         0x1c(%rax),%xmm12
7416   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
7417   243,68,15,16,104,44,                    //movss         0x2c(%rax),%xmm13
7418   69,15,198,237,0,                        //shufps        $0x0,%xmm13,%xmm13
7419   243,68,15,16,112,60,                    //movss         0x3c(%rax),%xmm14
7420   69,15,198,246,0,                        //shufps        $0x0,%xmm14,%xmm14
7421   243,68,15,16,120,76,                    //movss         0x4c(%rax),%xmm15
7422   69,15,198,255,0,                        //shufps        $0x0,%xmm15,%xmm15
7423   68,15,89,243,                           //mulps         %xmm3,%xmm14
7424   69,15,88,247,                           //addps         %xmm15,%xmm14
7425   68,15,89,234,                           //mulps         %xmm2,%xmm13
7426   69,15,88,238,                           //addps         %xmm14,%xmm13
7427   69,15,89,225,                           //mulps         %xmm9,%xmm12
7428   69,15,88,229,                           //addps         %xmm13,%xmm12
7429   69,15,89,216,                           //mulps         %xmm8,%xmm11
7430   69,15,88,220,                           //addps         %xmm12,%xmm11
7431   72,173,                                 //lods          %ds:(%rsi),%rax
7432   65,15,40,210,                           //movaps        %xmm10,%xmm2
7433   65,15,40,219,                           //movaps        %xmm11,%xmm3
7434   255,224,                                //jmpq          *%rax
7435 };
7436 
// sk_matrix_perspective_sse2: pre-assembled SSE2 stage applying a 3x3 projective transform.
// The first lods loads a pointer into %rax; nine floats m[0..8] are read from it
// (byte offsets 0x0..0x20). With x = incoming xmm0 and y = incoming xmm1:
//   X = m[0]*x + m[1]*y + m[2]
//   Y = m[3]*x + m[4]*y + m[5]
//   Z = m[6]*x + m[7]*y + m[8]
//   xmm0 = X * rcpps(Z),  xmm1 = Y * rcpps(Z)
// rcpps is the hardware approximate reciprocal, so the divide is not exact; no
// refinement step follows it in this code.
// Scratch registers: xmm8..xmm12. Exit: second lods + jmpq *%rax.
7437 CODE const uint8_t sk_matrix_perspective_sse2[] = {
7438   68,15,40,192,                           //movaps        %xmm0,%xmm8
7439   72,173,                                 //lods          %ds:(%rsi),%rax
7440   243,15,16,0,                            //movss         (%rax),%xmm0
7441   243,68,15,16,72,4,                      //movss         0x4(%rax),%xmm9
7442   15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
7443   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
7444   243,68,15,16,80,8,                      //movss         0x8(%rax),%xmm10
7445   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
7446   68,15,89,201,                           //mulps         %xmm1,%xmm9
7447   69,15,88,202,                           //addps         %xmm10,%xmm9
7448   65,15,89,192,                           //mulps         %xmm8,%xmm0
7449   65,15,88,193,                           //addps         %xmm9,%xmm0
7450   243,68,15,16,72,12,                     //movss         0xc(%rax),%xmm9
7451   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
7452   243,68,15,16,80,16,                     //movss         0x10(%rax),%xmm10
7453   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
7454   243,68,15,16,88,20,                     //movss         0x14(%rax),%xmm11
7455   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
7456   68,15,89,209,                           //mulps         %xmm1,%xmm10
7457   69,15,88,211,                           //addps         %xmm11,%xmm10
7458   69,15,89,200,                           //mulps         %xmm8,%xmm9
7459   69,15,88,202,                           //addps         %xmm10,%xmm9
7460   243,68,15,16,80,24,                     //movss         0x18(%rax),%xmm10
7461   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
7462   243,68,15,16,88,28,                     //movss         0x1c(%rax),%xmm11
7463   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
7464   243,68,15,16,96,32,                     //movss         0x20(%rax),%xmm12
7465   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
7466   68,15,89,217,                           //mulps         %xmm1,%xmm11
7467   69,15,88,220,                           //addps         %xmm12,%xmm11
7468   69,15,89,208,                           //mulps         %xmm8,%xmm10
7469   69,15,88,211,                           //addps         %xmm11,%xmm10
7470   65,15,83,202,                           //rcpps         %xmm10,%xmm1
7471   15,89,193,                              //mulps         %xmm1,%xmm0
7472   68,15,89,201,                           //mulps         %xmm1,%xmm9
7473   72,173,                                 //lods          %ds:(%rsi),%rax
7474   65,15,40,201,                           //movaps        %xmm9,%xmm1
7475   255,224,                                //jmpq          *%rax
7476 };
7477 
// sk_linear_gradient_2stops_sse2: pre-assembled SSE2 stage evaluating four linear ramps.
// The first lods loads a pointer into %rax; two unaligned 4-float vectors are read from it:
//   c = floats at offset 0x00 (kept in xmm9), f = floats at offset 0x10 (kept in xmm3).
// With t = incoming xmm0, each output register i in 0..3 becomes
//   xmm<i> = broadcast(f[i]) * t + broadcast(c[i])
// The lane-0 result is built in xmm8 and moved into xmm0 last, since t (xmm0) is an
// input to all four products. Presumably c is the start colour and f the per-channel
// delta -- inferred from the stage name, confirm against the caller.
// Scratch registers: xmm8..xmm10. Exit: second lods + jmpq *%rax.
7478 CODE const uint8_t sk_linear_gradient_2stops_sse2[] = {
7479   72,173,                                 //lods          %ds:(%rsi),%rax
7480   68,15,16,8,                             //movups        (%rax),%xmm9
7481   15,16,88,16,                            //movups        0x10(%rax),%xmm3
7482   68,15,40,195,                           //movaps        %xmm3,%xmm8
7483   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
7484   65,15,40,201,                           //movaps        %xmm9,%xmm1
7485   15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
7486   68,15,89,192,                           //mulps         %xmm0,%xmm8
7487   68,15,88,193,                           //addps         %xmm1,%xmm8
7488   15,40,203,                              //movaps        %xmm3,%xmm1
7489   15,198,201,85,                          //shufps        $0x55,%xmm1,%xmm1
7490   65,15,40,209,                           //movaps        %xmm9,%xmm2
7491   15,198,210,85,                          //shufps        $0x55,%xmm2,%xmm2
7492   15,89,200,                              //mulps         %xmm0,%xmm1
7493   15,88,202,                              //addps         %xmm2,%xmm1
7494   15,40,211,                              //movaps        %xmm3,%xmm2
7495   15,198,210,170,                         //shufps        $0xaa,%xmm2,%xmm2
7496   69,15,40,209,                           //movaps        %xmm9,%xmm10
7497   69,15,198,210,170,                      //shufps        $0xaa,%xmm10,%xmm10
7498   15,89,208,                              //mulps         %xmm0,%xmm2
7499   65,15,88,210,                           //addps         %xmm10,%xmm2
7500   15,198,219,255,                         //shufps        $0xff,%xmm3,%xmm3
7501   69,15,198,201,255,                      //shufps        $0xff,%xmm9,%xmm9
7502   15,89,216,                              //mulps         %xmm0,%xmm3
7503   65,15,88,217,                           //addps         %xmm9,%xmm3
7504   72,173,                                 //lods          %ds:(%rsi),%rax
7505   65,15,40,192,                           //movaps        %xmm8,%xmm0
7506   255,224,                                //jmpq          *%rax
7507 };
7508 #elif defined(_M_X64)
7509 
// sk_start_pipeline_hsw: pre-assembled AVX2 driver loop from the _M_X64 (Windows x64) section.
//
// Inputs are taken from rcx, rdx, r8, r9 (the Win64 integer argument registers):
//   rcx -> rbx : running index, advanced by 8 per full iteration
//   rdx -> rsi : pointer to a list of 8-byte entries; the first entry is loaded into
//                r15 (the call target) and the address of the following entry is kept in r12
//   r8  -> r14 : opaque value handed to every call in rdx
//   r9  -> r13 : upper limit for the index
// Presumably (x, program, constants, limit) -- confirm against the C++ caller.
//
// Each call to *r15 is made with rdi = index, rsi = r12, rdx = r14, ymm0..ymm7 = 0 and
// rcx = 0 for a full step of 8, or rcx = limit - index (1..7) for one final partial step.
// Returns r13 (the limit) in rax.
//
// Saves and restores r12-r15, rsi, rdi, rbx and xmm6-xmm15, i.e. the Win64 callee-saved
// registers this code or the called code may overwrite. Seven pushes plus sub $0xa0 leave
// rsp 16-byte aligned, which the vmovaps spills rely on.
// NOTE(review): no 32-byte home/shadow space is reserved below the xmm6/xmm7 save slots
// before the indirect calls; that is only safe if the called code never writes above its
// return address -- confirm that holds for every stage.
7510 CODE const uint8_t sk_start_pipeline_hsw[] = {
7511   65,87,                                  //push          %r15
7512   65,86,                                  //push          %r14
7513   65,85,                                  //push          %r13
7514   65,84,                                  //push          %r12
7515   86,                                     //push          %rsi
7516   87,                                     //push          %rdi
7517   83,                                     //push          %rbx
7518   72,129,236,160,0,0,0,                   //sub           $0xa0,%rsp
7519   197,120,41,188,36,144,0,0,0,            //vmovaps       %xmm15,0x90(%rsp)
7520   197,120,41,180,36,128,0,0,0,            //vmovaps       %xmm14,0x80(%rsp)
7521   197,120,41,108,36,112,                  //vmovaps       %xmm13,0x70(%rsp)
7522   197,120,41,100,36,96,                   //vmovaps       %xmm12,0x60(%rsp)
7523   197,120,41,92,36,80,                    //vmovaps       %xmm11,0x50(%rsp)
7524   197,120,41,84,36,64,                    //vmovaps       %xmm10,0x40(%rsp)
7525   197,120,41,76,36,48,                    //vmovaps       %xmm9,0x30(%rsp)
7526   197,120,41,68,36,32,                    //vmovaps       %xmm8,0x20(%rsp)
7527   197,248,41,124,36,16,                   //vmovaps       %xmm7,0x10(%rsp)
7528   197,248,41,52,36,                       //vmovaps       %xmm6,(%rsp)
7529   77,137,205,                             //mov           %r9,%r13
7530   77,137,198,                             //mov           %r8,%r14
7531   72,137,203,                             //mov           %rcx,%rbx
7532   72,137,214,                             //mov           %rdx,%rsi
7533   72,173,                                 //lods          %ds:(%rsi),%rax
7534   73,137,199,                             //mov           %rax,%r15
7535   73,137,244,                             //mov           %rsi,%r12
  // If index + 8 <= limit enter the full-step loop at +0x75, otherwise go straight to
  // the remainder handling at +0xb6 with rdi = index.
7536   72,141,67,8,                            //lea           0x8(%rbx),%rax
7537   76,57,232,                              //cmp           %r13,%rax
7538   118,5,                                  //jbe           75 <_sk_start_pipeline_hsw+0x75>
7539   72,137,223,                             //mov           %rbx,%rdi
7540   235,65,                                 //jmp           b6 <_sk_start_pipeline_hsw+0xb6>
  // +0x75: full-step loop body. rcx = 0, ymm0..ymm7 cleared, then call *r15.
7541   185,0,0,0,0,                            //mov           $0x0,%ecx
7542   197,252,87,192,                         //vxorps        %ymm0,%ymm0,%ymm0
7543   197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
7544   197,236,87,210,                         //vxorps        %ymm2,%ymm2,%ymm2
7545   197,228,87,219,                         //vxorps        %ymm3,%ymm3,%ymm3
7546   197,220,87,228,                         //vxorps        %ymm4,%ymm4,%ymm4
7547   197,212,87,237,                         //vxorps        %ymm5,%ymm5,%ymm5
7548   197,204,87,246,                         //vxorps        %ymm6,%ymm6,%ymm6
7549   197,196,87,255,                         //vxorps        %ymm7,%ymm7,%ymm7
7550   72,137,223,                             //mov           %rbx,%rdi
7551   76,137,230,                             //mov           %r12,%rsi
7552   76,137,242,                             //mov           %r14,%rdx
7553   65,255,215,                             //callq         *%r15
  // rdi = index + 8; loop again while (index + 16) <= limit. The mov between the cmp
  // and the jbe does not modify flags, so the branch still tests the cmp result.
7554   72,141,123,8,                           //lea           0x8(%rbx),%rdi
7555   72,131,195,16,                          //add           $0x10,%rbx
7556   76,57,235,                              //cmp           %r13,%rbx
7557   72,137,251,                             //mov           %rdi,%rbx
7558   118,191,                                //jbe           75 <_sk_start_pipeline_hsw+0x75>
  // +0xb6: remainder. rcx = limit - index; when it is zero skip the final call.
7559   76,137,233,                             //mov           %r13,%rcx
7560   72,41,249,                              //sub           %rdi,%rcx
7561   116,41,                                 //je            e7 <_sk_start_pipeline_hsw+0xe7>
7562   197,252,87,192,                         //vxorps        %ymm0,%ymm0,%ymm0
7563   197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
7564   197,236,87,210,                         //vxorps        %ymm2,%ymm2,%ymm2
7565   197,228,87,219,                         //vxorps        %ymm3,%ymm3,%ymm3
7566   197,220,87,228,                         //vxorps        %ymm4,%ymm4,%ymm4
7567   197,212,87,237,                         //vxorps        %ymm5,%ymm5,%ymm5
7568   197,204,87,246,                         //vxorps        %ymm6,%ymm6,%ymm6
7569   197,196,87,255,                         //vxorps        %ymm7,%ymm7,%ymm7
7570   76,137,230,                             //mov           %r12,%rsi
7571   76,137,242,                             //mov           %r14,%rdx
7572   65,255,215,                             //callq         *%r15
  // +0xe7: epilogue. Return value is the limit; restore saved registers in reverse order.
7573   76,137,232,                             //mov           %r13,%rax
7574   197,248,40,52,36,                       //vmovaps       (%rsp),%xmm6
7575   197,248,40,124,36,16,                   //vmovaps       0x10(%rsp),%xmm7
7576   197,120,40,68,36,32,                    //vmovaps       0x20(%rsp),%xmm8
7577   197,120,40,76,36,48,                    //vmovaps       0x30(%rsp),%xmm9
7578   197,120,40,84,36,64,                    //vmovaps       0x40(%rsp),%xmm10
7579   197,120,40,92,36,80,                    //vmovaps       0x50(%rsp),%xmm11
7580   197,120,40,100,36,96,                   //vmovaps       0x60(%rsp),%xmm12
7581   197,120,40,108,36,112,                  //vmovaps       0x70(%rsp),%xmm13
7582   197,120,40,180,36,128,0,0,0,            //vmovaps       0x80(%rsp),%xmm14
7583   197,120,40,188,36,144,0,0,0,            //vmovaps       0x90(%rsp),%xmm15
7584   72,129,196,160,0,0,0,                   //add           $0xa0,%rsp
7585   91,                                     //pop           %rbx
7586   95,                                     //pop           %rdi
7587   94,                                     //pop           %rsi
7588   65,92,                                  //pop           %r12
7589   65,93,                                  //pop           %r13
7590   65,94,                                  //pop           %r14
7591   65,95,                                  //pop           %r15
7592   197,248,119,                            //vzeroupper
7593   195,                                    //retq
7594 };
7595 
// sk_just_return_hsw: a single retq. Unlike the other stages visible here it does not
// fetch and jump to a further entry; it returns to whoever issued the call.
// Presumably used as the last entry of a stage list -- confirm against the caller.
7596 CODE const uint8_t sk_just_return_hsw[] = {
7597   195,                                    //retq
7598 };
7599 
// sk_seed_shader_hsw: pre-assembled AVX2 stage that initialises ymm0..ymm7.
// The first lods loads a pointer into %rax. On exit:
//   ymm0 = float(edi) + 0.5 + the eight floats at (%rdx)
//   ymm1 = float(32-bit integer at (%rax)) + 0.5, broadcast to all lanes
//   ymm2 = 1.0 in every lane
//   ymm3..ymm7 = 0
// Presumably edi is the x index, (%rdx) holds per-lane offsets and (%rax) holds y --
// inferred from the stage name, confirm against the caller.
// Overwrites r8d. Exit: second lods + jmpq *%rax.
7600 CODE const uint8_t sk_seed_shader_hsw[] = {
7601   72,173,                                 //lods          %ds:(%rsi),%rax
7602   197,249,110,199,                        //vmovd         %edi,%xmm0
7603   196,226,125,88,192,                     //vpbroadcastd  %xmm0,%ymm0
7604   197,252,91,192,                         //vcvtdq2ps     %ymm0,%ymm0
7605   65,184,0,0,0,63,                        //mov           $0x3f000000,%r8d   (0x3f000000 == 0.5f)
7606   196,193,121,110,200,                    //vmovd         %r8d,%xmm1
7607   196,226,125,88,201,                     //vpbroadcastd  %xmm1,%ymm1
7608   197,252,88,193,                         //vaddps        %ymm1,%ymm0,%ymm0
7609   197,252,88,2,                           //vaddps        (%rdx),%ymm0,%ymm0
7610   196,226,125,24,16,                      //vbroadcastss  (%rax),%ymm2
7611   197,252,91,210,                         //vcvtdq2ps     %ymm2,%ymm2
7612   197,236,88,201,                         //vaddps        %ymm1,%ymm2,%ymm1
7613   184,0,0,128,63,                         //mov           $0x3f800000,%eax   (0x3f800000 == 1.0f)
7614   197,249,110,208,                        //vmovd         %eax,%xmm2
7615   196,226,125,88,210,                     //vpbroadcastd  %xmm2,%ymm2
7616   72,173,                                 //lods          %ds:(%rsi),%rax
7617   197,228,87,219,                         //vxorps        %ymm3,%ymm3,%ymm3
7618   197,220,87,228,                         //vxorps        %ymm4,%ymm4,%ymm4
7619   197,212,87,237,                         //vxorps        %ymm5,%ymm5,%ymm5
7620   197,204,87,246,                         //vxorps        %ymm6,%ymm6,%ymm6
7621   197,196,87,255,                         //vxorps        %ymm7,%ymm7,%ymm7
7622   255,224,                                //jmpq          *%rax
7623 };
7624 
// sk_constant_color_hsw: pre-assembled AVX2 stage.
// The first lods loads a pointer into %rax; the four floats at offsets 0x0, 0x4, 0x8, 0xc
// are broadcast to every lane of ymm0, ymm1, ymm2, ymm3 respectively.
// Exit: second lods + jmpq *%rax.
7625 CODE const uint8_t sk_constant_color_hsw[] = {
7626   72,173,                                 //lods          %ds:(%rsi),%rax
7627   196,226,125,24,0,                       //vbroadcastss  (%rax),%ymm0
7628   196,226,125,24,72,4,                    //vbroadcastss  0x4(%rax),%ymm1
7629   196,226,125,24,80,8,                    //vbroadcastss  0x8(%rax),%ymm2
7630   196,226,125,24,88,12,                   //vbroadcastss  0xc(%rax),%ymm3
7631   72,173,                                 //lods          %ds:(%rsi),%rax
7632   255,224,                                //jmpq          *%rax
7633 };
7634 
// sk_clear_hsw: pre-assembled AVX2 stage that zeroes ymm0..ymm3.
// It issues a single lods, whose result is used only as the jump target at the end.
7635 CODE const uint8_t sk_clear_hsw[] = {
7636   72,173,                                 //lods          %ds:(%rsi),%rax
7637   197,252,87,192,                         //vxorps        %ymm0,%ymm0,%ymm0
7638   197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
7639   197,236,87,210,                         //vxorps        %ymm2,%ymm2,%ymm2
7640   197,228,87,219,                         //vxorps        %ymm3,%ymm3,%ymm3
7641   255,224,                                //jmpq          *%rax
7642 };
7643 
// sk_plus__hsw: pre-assembled AVX2 stage computing a lane-wise sum:
//   ymm0 += ymm4, ymm1 += ymm5, ymm2 += ymm6, ymm3 += ymm7.
// ymm4..ymm7 are left unchanged. Exit: lods + jmpq *%rax.
7644 CODE const uint8_t sk_plus__hsw[] = {
7645   197,252,88,196,                         //vaddps        %ymm4,%ymm0,%ymm0
7646   197,244,88,205,                         //vaddps        %ymm5,%ymm1,%ymm1
7647   197,236,88,214,                         //vaddps        %ymm6,%ymm2,%ymm2
7648   197,228,88,223,                         //vaddps        %ymm7,%ymm3,%ymm3
7649   72,173,                                 //lods          %ds:(%rsi),%rax
7650   255,224,                                //jmpq          *%rax
7651 };
7652 
// sk_srcover_hsw: pre-assembled AVX2+FMA stage.
// With k = 1.0 - ymm3 (held in ymm8), for i in 0..3:
//   ymm<i> = ymm<i> + ymm<4+i> * k
// ymm3 is updated last, so all four results use the incoming ymm3.
// Presumably ymm0..3 are source r,g,b,a and ymm4..7 destination r,g,b,a -- inferred
// from the stage name, confirm. Exit: lods + jmpq *%rax.
7653 CODE const uint8_t sk_srcover_hsw[] = {
7654   184,0,0,128,63,                         //mov           $0x3f800000,%eax   (0x3f800000 == 1.0f)
7655   197,121,110,192,                        //vmovd         %eax,%xmm8
7656   196,66,125,88,192,                      //vpbroadcastd  %xmm8,%ymm8
7657   197,60,92,195,                          //vsubps        %ymm3,%ymm8,%ymm8
7658   196,194,93,184,192,                     //vfmadd231ps   %ymm8,%ymm4,%ymm0
7659   196,194,85,184,200,                     //vfmadd231ps   %ymm8,%ymm5,%ymm1
7660   196,194,77,184,208,                     //vfmadd231ps   %ymm8,%ymm6,%ymm2
7661   196,194,69,184,216,                     //vfmadd231ps   %ymm8,%ymm7,%ymm3
7662   72,173,                                 //lods          %ds:(%rsi),%rax
7663   255,224,                                //jmpq          *%rax
7664 };
7665 
// sk_dstover_hsw: pre-assembled AVX2+FMA stage.
// With k = 1.0 - ymm7 (held in ymm8), for i in 0..3:
//   ymm<i> = ymm<i> * k + ymm<4+i>
// ymm4..ymm7 are left unchanged.
// Presumably ymm0..3 are source r,g,b,a and ymm4..7 destination r,g,b,a -- inferred
// from the stage name, confirm. Exit: lods + jmpq *%rax.
7666 CODE const uint8_t sk_dstover_hsw[] = {
7667   184,0,0,128,63,                         //mov           $0x3f800000,%eax   (0x3f800000 == 1.0f)
7668   197,121,110,192,                        //vmovd         %eax,%xmm8
7669   196,66,125,88,192,                      //vpbroadcastd  %xmm8,%ymm8
7670   197,60,92,199,                          //vsubps        %ymm7,%ymm8,%ymm8
7671   196,226,61,168,196,                     //vfmadd213ps   %ymm4,%ymm8,%ymm0
7672   196,226,61,168,205,                     //vfmadd213ps   %ymm5,%ymm8,%ymm1
7673   196,226,61,168,214,                     //vfmadd213ps   %ymm6,%ymm8,%ymm2
7674   196,226,61,168,223,                     //vfmadd213ps   %ymm7,%ymm8,%ymm3
7675   72,173,                                 //lods          %ds:(%rsi),%rax
7676   255,224,                                //jmpq          *%rax
7677 };
7678 
// sk_clamp_0_hsw: pre-assembled AVX2 stage clamping from below:
//   ymm<i> = max(ymm<i>, 0) for i in 0..3, using a zeroed ymm8 as the bound.
// Exit: lods + jmpq *%rax.
7679 CODE const uint8_t sk_clamp_0_hsw[] = {
7680   196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
7681   196,193,124,95,192,                     //vmaxps        %ymm8,%ymm0,%ymm0
7682   196,193,116,95,200,                     //vmaxps        %ymm8,%ymm1,%ymm1
7683   196,193,108,95,208,                     //vmaxps        %ymm8,%ymm2,%ymm2
7684   196,193,100,95,216,                     //vmaxps        %ymm8,%ymm3,%ymm3
7685   72,173,                                 //lods          %ds:(%rsi),%rax
7686   255,224,                                //jmpq          *%rax
7687 };
7688 
// sk_clamp_1_hsw: pre-assembled AVX2 stage clamping from above:
//   ymm<i> = min(ymm<i>, 1.0) for i in 0..3, with 1.0 broadcast into ymm8.
// Exit: lods + jmpq *%rax.
7689 CODE const uint8_t sk_clamp_1_hsw[] = {
7690   184,0,0,128,63,                         //mov           $0x3f800000,%eax   (0x3f800000 == 1.0f)
7691   197,121,110,192,                        //vmovd         %eax,%xmm8
7692   196,66,125,88,192,                      //vpbroadcastd  %xmm8,%ymm8
7693   196,193,124,93,192,                     //vminps        %ymm8,%ymm0,%ymm0
7694   196,193,116,93,200,                     //vminps        %ymm8,%ymm1,%ymm1
7695   196,193,108,93,208,                     //vminps        %ymm8,%ymm2,%ymm2
7696   196,193,100,93,216,                     //vminps        %ymm8,%ymm3,%ymm3
7697   72,173,                                 //lods          %ds:(%rsi),%rax
7698   255,224,                                //jmpq          *%rax
7699 };
7700 
// sk_clamp_a_hsw: pre-assembled AVX2 stage.
// First ymm3 = min(ymm3, 1.0); then ymm0, ymm1, ymm2 are each clamped to at most the
// new ymm3. Presumably ymm3 is alpha and this bounds r,g,b by it -- inferred from the
// stage name, confirm. Exit: lods + jmpq *%rax.
7701 CODE const uint8_t sk_clamp_a_hsw[] = {
7702   184,0,0,128,63,                         //mov           $0x3f800000,%eax   (0x3f800000 == 1.0f)
7703   197,121,110,192,                        //vmovd         %eax,%xmm8
7704   196,66,125,88,192,                      //vpbroadcastd  %xmm8,%ymm8
7705   196,193,100,93,216,                     //vminps        %ymm8,%ymm3,%ymm3
7706   197,252,93,195,                         //vminps        %ymm3,%ymm0,%ymm0
7707   197,244,93,203,                         //vminps        %ymm3,%ymm1,%ymm1
7708   197,236,93,211,                         //vminps        %ymm3,%ymm2,%ymm2
7709   72,173,                                 //lods          %ds:(%rsi),%rax
7710   255,224,                                //jmpq          *%rax
7711 };
7712 
// sk_set_rgb_hsw: pre-assembled AVX2 stage.
// The first lods loads a pointer into %rax; the three floats at offsets 0x0, 0x4, 0x8
// are broadcast to every lane of ymm0, ymm1, ymm2. ymm3 is not written.
// Exit: second lods + jmpq *%rax.
7713 CODE const uint8_t sk_set_rgb_hsw[] = {
7714   72,173,                                 //lods          %ds:(%rsi),%rax
7715   196,226,125,24,0,                       //vbroadcastss  (%rax),%ymm0
7716   196,226,125,24,72,4,                    //vbroadcastss  0x4(%rax),%ymm1
7717   196,226,125,24,80,8,                    //vbroadcastss  0x8(%rax),%ymm2
7718   72,173,                                 //lods          %ds:(%rsi),%rax
7719   255,224,                                //jmpq          *%rax
7720 };
7721 
// sk_swap_rb_hsw: machine code that exchanges ymm0 and ymm2 using ymm8 as a
// temporary, then jumps to the next entry fetched from the stream at %rsi.
// NOTE(review): ymm0/ymm2 are presumably the red and blue channels -- confirm.
7722 CODE const uint8_t sk_swap_rb_hsw[] = {
7723   197,124,40,192,                         //vmovaps       %ymm0,%ymm8  -- save old ymm0
7724   72,173,                                 //lods          %ds:(%rsi),%rax
7725   197,252,40,194,                         //vmovaps       %ymm2,%ymm0
7726   197,124,41,194,                         //vmovaps       %ymm8,%ymm2  -- ymm2 = old ymm0
7727   255,224,                                //jmpq          *%rax
7728 };
7729 
// sk_swap_hsw: machine code that exchanges the register group ymm0..ymm3 with
// ymm4..ymm7 (ymm0<->ymm4, ymm1<->ymm5, ymm2<->ymm6, ymm3<->ymm7), using
// ymm8..ymm11 as temporaries, then jumps to the next stream entry.
// NOTE(review): the two groups are presumably the source and destination colors -- confirm.
7730 CODE const uint8_t sk_swap_hsw[] = {
7731   197,124,40,195,                         //vmovaps       %ymm3,%ymm8  -- stash old ymm0..ymm3 in ymm11..ymm8
7732   197,124,40,202,                         //vmovaps       %ymm2,%ymm9
7733   197,124,40,209,                         //vmovaps       %ymm1,%ymm10
7734   197,124,40,216,                         //vmovaps       %ymm0,%ymm11
7735   72,173,                                 //lods          %ds:(%rsi),%rax
7736   197,252,40,196,                         //vmovaps       %ymm4,%ymm0  -- ymm0..ymm3 = ymm4..ymm7
7737   197,252,40,205,                         //vmovaps       %ymm5,%ymm1
7738   197,252,40,214,                         //vmovaps       %ymm6,%ymm2
7739   197,252,40,223,                         //vmovaps       %ymm7,%ymm3
7740   197,124,41,220,                         //vmovaps       %ymm11,%ymm4  -- ymm4..ymm7 = stashed old ymm0..ymm3
7741   197,124,41,213,                         //vmovaps       %ymm10,%ymm5
7742   197,124,41,206,                         //vmovaps       %ymm9,%ymm6
7743   197,124,41,199,                         //vmovaps       %ymm8,%ymm7
7744   255,224,                                //jmpq          *%rax
7745 };
7746 
// sk_move_src_dst_hsw: machine code that copies ymm0..ymm3 into ymm4..ymm7
// (ymm0..ymm3 are left unchanged), then jumps to the next stream entry.
7747 CODE const uint8_t sk_move_src_dst_hsw[] = {
7748   72,173,                                 //lods          %ds:(%rsi),%rax
7749   197,252,40,224,                         //vmovaps       %ymm0,%ymm4
7750   197,252,40,233,                         //vmovaps       %ymm1,%ymm5
7751   197,252,40,242,                         //vmovaps       %ymm2,%ymm6
7752   197,252,40,251,                         //vmovaps       %ymm3,%ymm7
7753   255,224,                                //jmpq          *%rax
7754 };
7755 
// sk_move_dst_src_hsw: machine code that copies ymm4..ymm7 into ymm0..ymm3
// (ymm4..ymm7 are left unchanged), then jumps to the next stream entry.
7756 CODE const uint8_t sk_move_dst_src_hsw[] = {
7757   72,173,                                 //lods          %ds:(%rsi),%rax
7758   197,252,40,196,                         //vmovaps       %ymm4,%ymm0
7759   197,252,40,205,                         //vmovaps       %ymm5,%ymm1
7760   197,252,40,214,                         //vmovaps       %ymm6,%ymm2
7761   197,252,40,223,                         //vmovaps       %ymm7,%ymm3
7762   255,224,                                //jmpq          *%rax
7763 };
7764 
// sk_premul_hsw: machine code that multiplies ymm0, ymm1 and ymm2 lane-wise by
// ymm3 (ymm3 itself is unchanged), then jumps to the next stream entry.
// NOTE(review): ymm3 is presumably alpha, making this a premultiply -- confirm.
7765 CODE const uint8_t sk_premul_hsw[] = {
7766   197,252,89,195,                         //vmulps        %ymm3,%ymm0,%ymm0
7767   197,244,89,203,                         //vmulps        %ymm3,%ymm1,%ymm1
7768   197,236,89,211,                         //vmulps        %ymm3,%ymm2,%ymm2
7769   72,173,                                 //lods          %ds:(%rsi),%rax
7770   255,224,                                //jmpq          *%rax
7771 };
7772 
// sk_unpremul_hsw: machine code that computes a per-lane scale
//     scale = (ymm3 == 0) ? 0 : 1.0f / ymm3
// and multiplies ymm0, ymm1 and ymm2 by it. The blend on the zero test avoids
// propagating the infinity produced by dividing by zero. ymm3 is unchanged.
// It then jumps to the next entry fetched from the stream at %rsi.
7773 CODE const uint8_t sk_unpremul_hsw[] = {
7774   196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8  -- ymm8 = 0
7775   196,65,100,194,200,0,                   //vcmpeqps      %ymm8,%ymm3,%ymm9  -- ymm9 = lane mask where ymm3 == 0
7776   184,0,0,128,63,                         //mov           $0x3f800000,%eax  -- IEEE-754 bit pattern of 1.0f
7777   197,121,110,208,                        //vmovd         %eax,%xmm10
7778   196,66,125,88,210,                      //vpbroadcastd  %xmm10,%ymm10
7779   197,44,94,211,                          //vdivps        %ymm3,%ymm10,%ymm10  -- ymm10 = 1.0f / ymm3
7780   196,67,45,74,192,144,                   //vblendvps     %ymm9,%ymm8,%ymm10,%ymm8  -- ymm8 = mask ? 0 : 1/ymm3
7781   197,188,89,192,                         //vmulps        %ymm0,%ymm8,%ymm0
7782   197,188,89,201,                         //vmulps        %ymm1,%ymm8,%ymm1
7783   197,188,89,210,                         //vmulps        %ymm2,%ymm8,%ymm2
7784   72,173,                                 //lods          %ds:(%rsi),%rax
7785   255,224,                                //jmpq          *%rax
7786 };
7787 
// sk_from_srgb_hsw: machine code that applies the same piecewise function to
// each of ymm0, ymm1 and ymm2 (ymm3 is not touched). Per lane, for input x:
//     lo = x * ~0.0774                      (0x3d9e8391, approx. 1/12.92)
//     hi = (0.3*x + 0.6975) * x*x + 0.0025  (two fused multiply-adds)
//     result = (x < 0.055) ? lo : hi
// NOTE(review): this looks like a polynomial approximation of the sRGB-to-linear
// transfer curve -- confirm against the stage source.
// The constants are broadcast once and reused for all three channels; the last
// channel consumes them in place. Ends by jumping to the next stream entry.
7788 CODE const uint8_t sk_from_srgb_hsw[] = {
7789   184,145,131,158,61,                     //mov           $0x3d9e8391,%eax  -- ~0.0774f (approx. 1/12.92)
7790   197,121,110,192,                        //vmovd         %eax,%xmm8
7791   196,66,125,88,192,                      //vpbroadcastd  %xmm8,%ymm8
7792   197,60,89,200,                          //vmulps        %ymm0,%ymm8,%ymm9  -- lo = x * ~0.0774
7793   197,124,89,208,                         //vmulps        %ymm0,%ymm0,%ymm10  -- x*x
7794   184,154,153,153,62,                     //mov           $0x3e99999a,%eax  -- 0.3f
7795   197,121,110,216,                        //vmovd         %eax,%xmm11
7796   196,66,125,88,219,                      //vpbroadcastd  %xmm11,%ymm11
7797   184,92,143,50,63,                       //mov           $0x3f328f5c,%eax  -- 0.6975f
7798   197,121,110,224,                        //vmovd         %eax,%xmm12
7799   196,66,125,88,228,                      //vpbroadcastd  %xmm12,%ymm12
7800   196,65,125,111,235,                     //vmovdqa       %ymm11,%ymm13  -- copy 0.3 so ymm11 survives for the other channels
7801   196,66,125,168,236,                     //vfmadd213ps   %ymm12,%ymm0,%ymm13  -- ymm13 = x*0.3 + 0.6975
7802   184,10,215,35,59,                       //mov           $0x3b23d70a,%eax  -- 0.0025f
7803   197,121,110,240,                        //vmovd         %eax,%xmm14
7804   196,66,125,88,246,                      //vpbroadcastd  %xmm14,%ymm14
7805   196,66,45,168,238,                      //vfmadd213ps   %ymm14,%ymm10,%ymm13  -- hi = x*x*ymm13 + 0.0025
7806   184,174,71,97,61,                       //mov           $0x3d6147ae,%eax  -- 0.055f threshold
7807   197,121,110,208,                        //vmovd         %eax,%xmm10
7808   196,66,125,88,210,                      //vpbroadcastd  %xmm10,%ymm10
7809   196,193,124,194,194,1,                  //vcmpltps      %ymm10,%ymm0,%ymm0  -- mask = x < 0.055
7810   196,195,21,74,193,0,                    //vblendvps     %ymm0,%ymm9,%ymm13,%ymm0  -- ymm0 = mask ? lo : hi
7811   197,60,89,201,                          //vmulps        %ymm1,%ymm8,%ymm9  -- same sequence for ymm1
7812   197,116,89,233,                         //vmulps        %ymm1,%ymm1,%ymm13
7813   196,65,125,111,251,                     //vmovdqa       %ymm11,%ymm15
7814   196,66,117,168,252,                     //vfmadd213ps   %ymm12,%ymm1,%ymm15
7815   196,66,21,168,254,                      //vfmadd213ps   %ymm14,%ymm13,%ymm15
7816   196,193,116,194,202,1,                  //vcmpltps      %ymm10,%ymm1,%ymm1
7817   196,195,5,74,201,16,                    //vblendvps     %ymm1,%ymm9,%ymm15,%ymm1
7818   197,60,89,194,                          //vmulps        %ymm2,%ymm8,%ymm8  -- same sequence for ymm2, overwriting the constants
7819   197,108,89,202,                         //vmulps        %ymm2,%ymm2,%ymm9
7820   196,66,109,168,220,                     //vfmadd213ps   %ymm12,%ymm2,%ymm11
7821   196,66,53,168,222,                      //vfmadd213ps   %ymm14,%ymm9,%ymm11
7822   196,193,108,194,210,1,                  //vcmpltps      %ymm10,%ymm2,%ymm2
7823   196,195,37,74,208,32,                   //vblendvps     %ymm2,%ymm8,%ymm11,%ymm2
7824   72,173,                                 //lods          %ds:(%rsi),%rax
7825   255,224,                                //jmpq          *%rax
7826 };
7827 
// sk_to_srgb_hsw: machine code that applies the same piecewise function to each
// of ymm0, ymm1 and ymm2 (ymm3 is not touched). Per lane, for input x:
//     r  = rsqrt(x)          (hardware approximation)
//     s  = rcp(r)            ~ sqrt(x)
//     q  = rsqrt(r)          ~ x^(1/4)
//     hi = min(1.0, 0.689206*s - 0.0988 + 0.411192*q)
//     lo = 12.46 * x
//     result = (x < 0.0043) ? lo : hi
// NOTE(review): this looks like an approximation of the linear-to-sRGB transfer
// curve -- confirm against the stage source.
// Ends by jumping to the next entry fetched from the stream at %rsi.
7828 CODE const uint8_t sk_to_srgb_hsw[] = {
7829   197,124,82,192,                         //vrsqrtps      %ymm0,%ymm8  -- r = rsqrt(x)
7830   196,65,124,83,216,                      //vrcpps        %ymm8,%ymm11  -- s = 1/r ~ sqrt(x)
7831   196,65,124,82,224,                      //vrsqrtps      %ymm8,%ymm12  -- q = rsqrt(r) ~ x^(1/4)
7832   184,41,92,71,65,                        //mov           $0x41475c29,%eax  -- 12.46f
7833   197,121,110,192,                        //vmovd         %eax,%xmm8
7834   196,66,125,88,192,                      //vpbroadcastd  %xmm8,%ymm8
7835   197,60,89,232,                          //vmulps        %ymm0,%ymm8,%ymm13  -- lo = 12.46 * x
7836   184,0,0,128,63,                         //mov           $0x3f800000,%eax  -- 1.0f
7837   197,121,110,200,                        //vmovd         %eax,%xmm9
7838   196,66,125,88,201,                      //vpbroadcastd  %xmm9,%ymm9
7839   184,194,135,210,62,                     //mov           $0x3ed287c2,%eax  -- ~0.411192f
7840   197,121,110,208,                        //vmovd         %eax,%xmm10
7841   196,66,125,88,210,                      //vpbroadcastd  %xmm10,%ymm10
7842   184,206,111,48,63,                      //mov           $0x3f306fce,%eax  -- ~0.689206f
7843   197,121,110,240,                        //vmovd         %eax,%xmm14
7844   196,66,125,88,246,                      //vpbroadcastd  %xmm14,%ymm14
7845   184,168,87,202,61,                      //mov           $0x3dca57a8,%eax  -- ~0.0988f
7846   53,0,0,0,128,                           //xor           $0x80000000,%eax  -- flip the sign bit: -0.0988f
7847   197,121,110,248,                        //vmovd         %eax,%xmm15
7848   196,66,125,88,255,                      //vpbroadcastd  %xmm15,%ymm15
7849   196,66,13,168,223,                      //vfmadd213ps   %ymm15,%ymm14,%ymm11  -- ymm11 = 0.689206*s - 0.0988
7850   196,66,45,184,220,                      //vfmadd231ps   %ymm12,%ymm10,%ymm11  -- ymm11 += 0.411192*q
7851   196,65,52,93,219,                       //vminps        %ymm11,%ymm9,%ymm11  -- hi = min(1.0, ymm11)
7852   184,4,231,140,59,                       //mov           $0x3b8ce704,%eax  -- 0.0043f threshold
7853   197,121,110,224,                        //vmovd         %eax,%xmm12
7854   196,66,125,88,228,                      //vpbroadcastd  %xmm12,%ymm12
7855   196,193,124,194,196,1,                  //vcmpltps      %ymm12,%ymm0,%ymm0  -- mask = x < 0.0043
7856   196,195,37,74,197,0,                    //vblendvps     %ymm0,%ymm13,%ymm11,%ymm0  -- ymm0 = mask ? lo : hi
7857   197,124,82,217,                         //vrsqrtps      %ymm1,%ymm11  -- same computation for ymm1
7858   196,65,124,83,235,                      //vrcpps        %ymm11,%ymm13
7859   196,65,124,82,219,                      //vrsqrtps      %ymm11,%ymm11
7860   196,66,13,168,239,                      //vfmadd213ps   %ymm15,%ymm14,%ymm13
7861   196,66,45,184,235,                      //vfmadd231ps   %ymm11,%ymm10,%ymm13
7862   197,60,89,217,                          //vmulps        %ymm1,%ymm8,%ymm11
7863   196,65,52,93,237,                       //vminps        %ymm13,%ymm9,%ymm13
7864   196,193,116,194,204,1,                  //vcmpltps      %ymm12,%ymm1,%ymm1
7865   196,195,21,74,203,16,                   //vblendvps     %ymm1,%ymm11,%ymm13,%ymm1
7866   197,124,82,218,                         //vrsqrtps      %ymm2,%ymm11  -- same computation for ymm2, overwriting the constants
7867   196,65,124,83,235,                      //vrcpps        %ymm11,%ymm13
7868   196,66,13,168,239,                      //vfmadd213ps   %ymm15,%ymm14,%ymm13
7869   196,65,124,82,219,                      //vrsqrtps      %ymm11,%ymm11
7870   196,66,45,184,235,                      //vfmadd231ps   %ymm11,%ymm10,%ymm13
7871   196,65,52,93,205,                       //vminps        %ymm13,%ymm9,%ymm9
7872   197,60,89,194,                          //vmulps        %ymm2,%ymm8,%ymm8
7873   196,193,108,194,212,1,                  //vcmpltps      %ymm12,%ymm2,%ymm2
7874   196,195,53,74,208,32,                   //vblendvps     %ymm2,%ymm8,%ymm9,%ymm2
7875   72,173,                                 //lods          %ds:(%rsi),%rax
7876   255,224,                                //jmpq          *%rax
7877 };
7878 
// sk_scale_1_float_hsw: machine code that fetches a pointer from the stream at
// %rsi, broadcasts the 32-bit float it points to, and multiplies ymm0..ymm3 by
// that value. It then fetches the next stream entry and jumps to it.
7879 CODE const uint8_t sk_scale_1_float_hsw[] = {
7880   72,173,                                 //lods          %ds:(%rsi),%rax  -- rax = pointer to the scale factor
7881   196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
7882   197,188,89,192,                         //vmulps        %ymm0,%ymm8,%ymm0
7883   197,188,89,201,                         //vmulps        %ymm1,%ymm8,%ymm1
7884   197,188,89,210,                         //vmulps        %ymm2,%ymm8,%ymm2
7885   197,188,89,219,                         //vmulps        %ymm3,%ymm8,%ymm3
7886   72,173,                                 //lods          %ds:(%rsi),%rax  -- rax = next stream entry
7887   255,224,                                //jmpq          *%rax
7888 };
7889 
// sk_scale_u8_hsw: machine code that reads bytes from (*ctx + %rdi), where ctx
// is a pointer fetched from the stream at %rsi, converts each byte to float,
// multiplies by ~1/255 and scales ymm0..ymm3 by the result.
//   * %rcx == 0: reads 8 bytes with a single 64-bit load.
//   * %rcx != 0: reads exactly %rcx bytes one at a time and packs them into a
//     64-bit value, so no byte beyond the first %rcx is touched; then rejoins
//     the main path at +0x14.
// %rcx is saved in %r8 on entry and restored before the final jump, because the
// byte loop reuses %cl as its shift count.
// NOTE(review): %rcx is presumably the tail count (1..7) and %rdi the x index -- confirm with the caller.
7890 CODE const uint8_t sk_scale_u8_hsw[] = {
7891   73,137,200,                             //mov           %rcx,%r8  -- keep rcx; the tail loop clobbers it
7892   72,173,                                 //lods          %ds:(%rsi),%rax
7893   72,139,0,                               //mov           (%rax),%rax  -- rax = base pointer stored in the context
7894   72,1,248,                               //add           %rdi,%rax  -- rax = base + rdi (byte address)
7895   77,133,192,                             //test          %r8,%r8
7896   117,56,                                 //jne           556 <_sk_scale_u8_hsw+0x48>  -- nonzero rcx: byte-at-a-time path
7897   197,122,126,0,                          //vmovq         (%rax),%xmm8  -- load 8 bytes
7898   196,66,125,49,192,                      //vpmovzxbd     %xmm8,%ymm8  -- zero-extend 8 bytes to 8 dwords
7899   196,65,124,91,192,                      //vcvtdq2ps     %ymm8,%ymm8
7900   184,129,128,128,59,                     //mov           $0x3b808081,%eax  -- ~1/255 as a float
7901   197,121,110,200,                        //vmovd         %eax,%xmm9
7902   196,66,125,88,201,                      //vpbroadcastd  %xmm9,%ymm9
7903   196,65,60,89,193,                       //vmulps        %ymm9,%ymm8,%ymm8  -- ymm8 = byte * (1/255)
7904   197,188,89,192,                         //vmulps        %ymm0,%ymm8,%ymm0
7905   197,188,89,201,                         //vmulps        %ymm1,%ymm8,%ymm1
7906   197,188,89,210,                         //vmulps        %ymm2,%ymm8,%ymm2
7907   197,188,89,219,                         //vmulps        %ymm3,%ymm8,%ymm3
7908   72,173,                                 //lods          %ds:(%rsi),%rax
7909   76,137,193,                             //mov           %r8,%rcx  -- restore rcx
7910   255,224,                                //jmpq          *%rax
7911   49,201,                                 //xor           %ecx,%ecx  -- tail path (+0x48): rcx = bit shift, starts at 0
7912   77,137,194,                             //mov           %r8,%r10  -- r10 = bytes left to read
7913   69,49,201,                              //xor           %r9d,%r9d  -- r9 = packed result
7914   68,15,182,24,                           //movzbl        (%rax),%r11d  -- loop (+0x50): read one byte
7915   72,255,192,                             //inc           %rax
7916   73,211,227,                             //shl           %cl,%r11  -- move it to its byte position
7917   77,9,217,                               //or            %r11,%r9
7918   72,131,193,8,                           //add           $0x8,%rcx
7919   73,255,202,                             //dec           %r10
7920   117,234,                                //jne           55e <_sk_scale_u8_hsw+0x50>
7921   196,65,249,110,193,                     //vmovq         %r9,%xmm8  -- unread bytes stay zero
7922   235,167,                                //jmp           522 <_sk_scale_u8_hsw+0x14>  -- rejoin after the 8-byte load
7923 };
7924 
// sk_lerp_1_float_hsw: machine code that fetches a pointer from the stream at
// %rsi, broadcasts the float t it points to, and for each pair
// (ymm0,ymm4), (ymm1,ymm5), (ymm2,ymm6), (ymm3,ymm7) computes
//     ymmN = (ymmN - ymmN+4) * t + ymmN+4
// i.e. a linear interpolation from ymm4..ymm7 towards ymm0..ymm3 by t.
// It then fetches the next stream entry and jumps to it.
7925 CODE const uint8_t sk_lerp_1_float_hsw[] = {
7926   72,173,                                 //lods          %ds:(%rsi),%rax  -- rax = pointer to t
7927   196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
7928   197,252,92,196,                         //vsubps        %ymm4,%ymm0,%ymm0  -- ymm0 -= ymm4
7929   196,226,61,168,196,                     //vfmadd213ps   %ymm4,%ymm8,%ymm0  -- ymm0 = t*ymm0 + ymm4
7930   197,244,92,205,                         //vsubps        %ymm5,%ymm1,%ymm1
7931   196,226,61,168,205,                     //vfmadd213ps   %ymm5,%ymm8,%ymm1
7932   197,236,92,214,                         //vsubps        %ymm6,%ymm2,%ymm2
7933   196,226,61,168,214,                     //vfmadd213ps   %ymm6,%ymm8,%ymm2
7934   197,228,92,223,                         //vsubps        %ymm7,%ymm3,%ymm3
7935   196,226,61,168,223,                     //vfmadd213ps   %ymm7,%ymm8,%ymm3
7936   72,173,                                 //lods          %ds:(%rsi),%rax  -- rax = next stream entry
7937   255,224,                                //jmpq          *%rax
7938 };
7939 
// sk_lerp_u8_hsw: machine code that reads bytes from (*ctx + %rdi), where ctx
// is a pointer fetched from the stream at %rsi, converts them to floats
// t = byte * ~1/255, and for each pair (ymm0,ymm4) .. (ymm3,ymm7) computes
//     ymmN = (ymmN - ymmN+4) * t + ymmN+4
//   * %rcx == 0: reads 8 bytes with a single 64-bit load.
//   * %rcx != 0: reads exactly %rcx bytes one at a time, packs them into a
//     64-bit value and rejoins the main path at +0x14.
// %rcx is saved in %r8 on entry and restored before the final jump, because the
// byte loop reuses %cl as its shift count.
// NOTE(review): %rcx is presumably the tail count (1..7) and %rdi the x index -- confirm with the caller.
7940 CODE const uint8_t sk_lerp_u8_hsw[] = {
7941   73,137,200,                             //mov           %rcx,%r8  -- keep rcx; the tail loop clobbers it
7942   72,173,                                 //lods          %ds:(%rsi),%rax
7943   72,139,0,                               //mov           (%rax),%rax  -- rax = base pointer stored in the context
7944   72,1,248,                               //add           %rdi,%rax  -- rax = base + rdi (byte address)
7945   77,133,192,                             //test          %r8,%r8
7946   117,76,                                 //jne           606 <_sk_lerp_u8_hsw+0x5c>  -- nonzero rcx: byte-at-a-time path
7947   197,122,126,0,                          //vmovq         (%rax),%xmm8  -- load 8 bytes
7948   196,66,125,49,192,                      //vpmovzxbd     %xmm8,%ymm8  -- zero-extend 8 bytes to 8 dwords
7949   196,65,124,91,192,                      //vcvtdq2ps     %ymm8,%ymm8
7950   184,129,128,128,59,                     //mov           $0x3b808081,%eax  -- ~1/255 as a float
7951   197,121,110,200,                        //vmovd         %eax,%xmm9
7952   196,66,125,88,201,                      //vpbroadcastd  %xmm9,%ymm9
7953   196,65,60,89,193,                       //vmulps        %ymm9,%ymm8,%ymm8  -- t = byte * (1/255)
7954   197,252,92,196,                         //vsubps        %ymm4,%ymm0,%ymm0
7955   196,226,61,168,196,                     //vfmadd213ps   %ymm4,%ymm8,%ymm0  -- ymm0 = t*(ymm0-ymm4) + ymm4
7956   197,244,92,205,                         //vsubps        %ymm5,%ymm1,%ymm1
7957   196,226,61,168,205,                     //vfmadd213ps   %ymm5,%ymm8,%ymm1
7958   197,236,92,214,                         //vsubps        %ymm6,%ymm2,%ymm2
7959   196,226,61,168,214,                     //vfmadd213ps   %ymm6,%ymm8,%ymm2
7960   197,228,92,223,                         //vsubps        %ymm7,%ymm3,%ymm3
7961   196,226,61,168,223,                     //vfmadd213ps   %ymm7,%ymm8,%ymm3
7962   72,173,                                 //lods          %ds:(%rsi),%rax
7963   76,137,193,                             //mov           %r8,%rcx  -- restore rcx
7964   255,224,                                //jmpq          *%rax
7965   49,201,                                 //xor           %ecx,%ecx  -- tail path (+0x5c): rcx = bit shift, starts at 0
7966   77,137,194,                             //mov           %r8,%r10  -- r10 = bytes left to read
7967   69,49,201,                              //xor           %r9d,%r9d  -- r9 = packed result
7968   68,15,182,24,                           //movzbl        (%rax),%r11d  -- loop (+0x64): read one byte
7969   72,255,192,                             //inc           %rax
7970   73,211,227,                             //shl           %cl,%r11  -- move it to its byte position
7971   77,9,217,                               //or            %r11,%r9
7972   72,131,193,8,                           //add           $0x8,%rcx
7973   73,255,202,                             //dec           %r10
7974   117,234,                                //jne           60e <_sk_lerp_u8_hsw+0x64>
7975   196,65,249,110,193,                     //vmovq         %r9,%xmm8  -- unread bytes stay zero
7976   235,147,                                //jmp           5be <_sk_lerp_u8_hsw+0x14>  -- rejoin after the 8-byte load
7977 };
7978 
// sk_lerp_565_hsw: machine code that reads 16-bit values from
// (*ctx + 2*%rdi), where ctx is a pointer fetched from the stream at %rsi,
// splits each into three bit fields and turns them into per-channel floats:
//     t0 = (v & 0xf800) * ~1/63488   -> used for ymm0
//     t1 = (v & 0x07e0) * ~1/2016    -> used for ymm1
//     t2 = (v & 0x001f) * ~1/31      -> used for ymm2
// then computes ymmN = (ymmN - ymmN+4) * tN + ymmN+4 for N = 0..2 and sets
// ymm3 to 1.0f in every lane.
//   * %rcx == 0: loads eight 16-bit values with one 128-bit load.
//   * %rcx != 0: dispatches on (%rcx & 7) through a jump table into a
//     fall-through chain of vpinsrw instructions, so exactly that many values
//     are loaded (the rest of xmm3 stays zero), then rejoins at +0x14.
// The bytes after the final jmpq are that jump table (seven 32-bit signed
// offsets relative to the table start), NOT instructions; the disassembly
// shown next to them is an artifact of decoding data as code.
// NOTE(review): %rcx is presumably the tail count and %rdi the x index -- confirm with the caller.
7979 CODE const uint8_t sk_lerp_565_hsw[] = {
7980   72,173,                                 //lods          %ds:(%rsi),%rax
7981   76,139,16,                              //mov           (%rax),%r10  -- r10 = base pointer stored in the context
7982   72,133,201,                             //test          %rcx,%rcx
7983   15,133,179,0,0,0,                       //jne           6ec <_sk_lerp_565_hsw+0xc1>  -- nonzero rcx: partial-load path
7984   196,193,122,111,28,122,                 //vmovdqu       (%r10,%rdi,2),%xmm3  -- load eight 16-bit values
7985   196,98,125,51,195,                      //vpmovzxwd     %xmm3,%ymm8  -- (+0x14) zero-extend to 8 dwords
7986   184,0,248,0,0,                          //mov           $0xf800,%eax  -- mask for bits 11..15
7987   197,249,110,216,                        //vmovd         %eax,%xmm3
7988   196,226,125,88,219,                     //vpbroadcastd  %xmm3,%ymm3
7989   196,193,101,219,216,                    //vpand         %ymm8,%ymm3,%ymm3
7990   197,124,91,203,                         //vcvtdq2ps     %ymm3,%ymm9
7991   184,8,33,132,55,                        //mov           $0x37842108,%eax  -- ~1/63488 (63488 == 0xf800)
7992   197,249,110,216,                        //vmovd         %eax,%xmm3
7993   196,226,125,88,219,                     //vpbroadcastd  %xmm3,%ymm3
7994   197,52,89,203,                          //vmulps        %ymm3,%ymm9,%ymm9  -- t0
7995   184,224,7,0,0,                          //mov           $0x7e0,%eax  -- mask for bits 5..10
7996   197,249,110,216,                        //vmovd         %eax,%xmm3
7997   196,226,125,88,219,                     //vpbroadcastd  %xmm3,%ymm3
7998   196,193,101,219,216,                    //vpand         %ymm8,%ymm3,%ymm3
7999   197,124,91,211,                         //vcvtdq2ps     %ymm3,%ymm10
8000   184,33,8,2,58,                          //mov           $0x3a020821,%eax  -- ~1/2016 (2016 == 0x7e0)
8001   197,249,110,216,                        //vmovd         %eax,%xmm3
8002   196,226,125,88,219,                     //vpbroadcastd  %xmm3,%ymm3
8003   197,44,89,211,                          //vmulps        %ymm3,%ymm10,%ymm10  -- t1
8004   184,31,0,0,0,                           //mov           $0x1f,%eax  -- mask for bits 0..4
8005   197,249,110,216,                        //vmovd         %eax,%xmm3
8006   196,226,125,88,219,                     //vpbroadcastd  %xmm3,%ymm3
8007   196,193,101,219,216,                    //vpand         %ymm8,%ymm3,%ymm3
8008   197,124,91,195,                         //vcvtdq2ps     %ymm3,%ymm8
8009   184,8,33,4,61,                          //mov           $0x3d042108,%eax  -- ~1/31
8010   197,249,110,216,                        //vmovd         %eax,%xmm3
8011   196,226,125,88,219,                     //vpbroadcastd  %xmm3,%ymm3
8012   197,188,89,219,                         //vmulps        %ymm3,%ymm8,%ymm3  -- t2 (held in ymm3 for now)
8013   197,252,92,196,                         //vsubps        %ymm4,%ymm0,%ymm0
8014   196,226,53,168,196,                     //vfmadd213ps   %ymm4,%ymm9,%ymm0  -- ymm0 = t0*(ymm0-ymm4) + ymm4
8015   197,244,92,205,                         //vsubps        %ymm5,%ymm1,%ymm1
8016   196,226,45,168,205,                     //vfmadd213ps   %ymm5,%ymm10,%ymm1
8017   197,236,92,214,                         //vsubps        %ymm6,%ymm2,%ymm2
8018   196,226,101,168,214,                    //vfmadd213ps   %ymm6,%ymm3,%ymm2
8019   184,0,0,128,63,                         //mov           $0x3f800000,%eax  -- 1.0f
8020   197,249,110,216,                        //vmovd         %eax,%xmm3
8021   196,226,125,88,219,                     //vpbroadcastd  %xmm3,%ymm3  -- ymm3 = 1.0f in all lanes
8022   72,173,                                 //lods          %ds:(%rsi),%rax
8023   255,224,                                //jmpq          *%rax
8024   65,137,200,                             //mov           %ecx,%r8d  -- partial-load path (+0xc1)
8025   65,128,224,7,                           //and           $0x7,%r8b  -- r8 = rcx & 7
8026   197,225,239,219,                        //vpxor         %xmm3,%xmm3,%xmm3
8027   65,254,200,                             //dec           %r8b
8028   65,128,248,6,                           //cmp           $0x6,%r8b
8029   15,135,59,255,255,255,                  //ja            63f <_sk_lerp_565_hsw+0x14>  -- (rcx & 7) == 0: continue with xmm3 = 0
8030   69,15,182,192,                          //movzbl        %r8b,%r8d
8031   76,141,13,73,0,0,0,                     //lea           0x49(%rip),%r9        # 758 <_sk_lerp_565_hsw+0x12d>  -- r9 = jump table base
8032   75,99,4,129,                            //movslq        (%r9,%r8,4),%rax  -- signed 32-bit offset for this count
8033   76,1,200,                               //add           %r9,%rax
8034   255,224,                                //jmpq          *%rax  -- enter the vpinsrw chain below
8035   197,225,239,219,                        //vpxor         %xmm3,%xmm3,%xmm3
8036   196,193,97,196,92,122,12,6,             //vpinsrw       $0x6,0xc(%r10,%rdi,2),%xmm3,%xmm3
8037   196,193,97,196,92,122,10,5,             //vpinsrw       $0x5,0xa(%r10,%rdi,2),%xmm3,%xmm3
8038   196,193,97,196,92,122,8,4,              //vpinsrw       $0x4,0x8(%r10,%rdi,2),%xmm3,%xmm3
8039   196,193,97,196,92,122,6,3,              //vpinsrw       $0x3,0x6(%r10,%rdi,2),%xmm3,%xmm3
8040   196,193,97,196,92,122,4,2,              //vpinsrw       $0x2,0x4(%r10,%rdi,2),%xmm3,%xmm3
8041   196,193,97,196,92,122,2,1,              //vpinsrw       $0x1,0x2(%r10,%rdi,2),%xmm3,%xmm3
8042   196,193,97,196,28,122,0,                //vpinsrw       $0x0,(%r10,%rdi,2),%xmm3,%xmm3
8043   233,231,254,255,255,                    //jmpq          63f <_sk_lerp_565_hsw+0x14>  -- rejoin after the 128-bit load
8044   244,                                    //hlt  -- jump table data starts here (+0x12d); entry 0 = -12, not a real hlt
8045   255,                                    //(bad)
8046   255,                                    //(bad)
8047   255,                                    //(bad)
8048   236,                                    //in            (%dx),%al  -- table entry 1 = -20 (data, not code)
8049   255,                                    //(bad)
8050   255,                                    //(bad)
8051   255,228,                                //jmpq          *%rsp  -- second byte starts table entry 2 = -28 (data)
8052   255,                                    //(bad)
8053   255,                                    //(bad)
8054   255,                                    //(bad)
8055   220,255,                                //fdivr         %st,%st(7)  -- first byte starts table entry 3 = -36 (data)
8056   255,                                    //(bad)
8057   255,212,                                //callq         *%rsp  -- second byte starts table entry 4 = -44 (data)
8058   255,                                    //(bad)
8059   255,                                    //(bad)
8060   255,204,                                //dec           %esp  -- second byte starts table entry 5 = -52 (data)
8061   255,                                    //(bad)
8062   255,                                    //(bad)
8063   255,192,                                //inc           %eax  -- second byte starts table entry 6 = -64 (data)
8064   255,                                    //(bad)
8065   255,                                    //(bad)
8066   255,                                    //.byte         0xff
8067 };
8068 
// sk_load_tables_hsw: machine code that loads 32-bit values from
// (ctx[0] + 4*%rdi), where ctx is a pointer fetched from the stream at %rsi,
// and uses three of each value's bytes as indices into float tables:
//     ymm0 = table at ctx+0x08 [ v        & 0xff ]
//     ymm1 = table at ctx+0x10 [ (v >> 8)  & 0xff ]
//     ymm2 = table at ctx+0x18 [ (v >> 16) & 0xff ]
//     ymm3 = float(v >> 24) * ~1/255
//   * %rcx == 0: loads all eight 32-bit values with one 256-bit load.
//   * %rcx != 0: builds a lane mask covering the first %rcx lanes and uses a
//     masked load, so lanes beyond that are not read; rejoins at +0x1a.
// Each gather gets its own all-ones mask register because vgatherdps clears
// its mask operand. %rcx is saved in %r8 and restored before the final jump.
// NOTE(review): %rcx is presumably the tail count (1..7) and %rdi the x index -- confirm with the caller.
8069 CODE const uint8_t sk_load_tables_hsw[] = {
8070   73,137,200,                             //mov           %rcx,%r8  -- keep rcx; it is reused as scratch below
8071   72,173,                                 //lods          %ds:(%rsi),%rax  -- rax = context pointer
8072   76,141,12,189,0,0,0,0,                  //lea           0x0(,%rdi,4),%r9  -- r9 = 4*rdi
8073   76,3,8,                                 //add           (%rax),%r9  -- r9 = ctx[0] + 4*rdi
8074   77,133,192,                             //test          %r8,%r8
8075   117,121,                                //jne           802 <_sk_load_tables_hsw+0x8e>  -- nonzero rcx: masked-load path
8076   196,193,126,111,25,                     //vmovdqu       (%r9),%ymm3  -- load eight 32-bit values
8077   185,255,0,0,0,                          //mov           $0xff,%ecx  -- (+0x1a) byte mask
8078   197,249,110,193,                        //vmovd         %ecx,%xmm0
8079   196,226,125,88,208,                     //vpbroadcastd  %xmm0,%ymm2
8080   197,237,219,203,                        //vpand         %ymm3,%ymm2,%ymm1  -- index 0 = v & 0xff
8081   196,65,61,118,192,                      //vpcmpeqd      %ymm8,%ymm8,%ymm8  -- all-ones gather mask (used by the third gather)
8082   72,139,72,8,                            //mov           0x8(%rax),%rcx  -- first table
8083   76,139,72,16,                           //mov           0x10(%rax),%r9  -- second table
8084   196,65,53,118,201,                      //vpcmpeqd      %ymm9,%ymm9,%ymm9  -- all-ones gather mask
8085   196,226,53,146,4,137,                   //vgatherdps    %ymm9,(%rcx,%ymm1,4),%ymm0
8086   197,245,114,211,8,                      //vpsrld        $0x8,%ymm3,%ymm1
8087   197,109,219,201,                        //vpand         %ymm1,%ymm2,%ymm9  -- index 1 = (v >> 8) & 0xff
8088   196,65,45,118,210,                      //vpcmpeqd      %ymm10,%ymm10,%ymm10  -- all-ones gather mask
8089   196,130,45,146,12,137,                  //vgatherdps    %ymm10,(%r9,%ymm9,4),%ymm1
8090   72,139,64,24,                           //mov           0x18(%rax),%rax  -- third table
8091   197,181,114,211,16,                     //vpsrld        $0x10,%ymm3,%ymm9
8092   196,65,109,219,201,                     //vpand         %ymm9,%ymm2,%ymm9  -- index 2 = (v >> 16) & 0xff
8093   196,162,61,146,20,136,                  //vgatherdps    %ymm8,(%rax,%ymm9,4),%ymm2
8094   197,229,114,211,24,                     //vpsrld        $0x18,%ymm3,%ymm3  -- top byte
8095   197,124,91,195,                         //vcvtdq2ps     %ymm3,%ymm8
8096   184,129,128,128,59,                     //mov           $0x3b808081,%eax  -- ~1/255 as a float
8097   197,249,110,216,                        //vmovd         %eax,%xmm3
8098   196,226,125,88,219,                     //vpbroadcastd  %xmm3,%ymm3
8099   197,188,89,219,                         //vmulps        %ymm3,%ymm8,%ymm3  -- ymm3 = top byte * (1/255)
8100   72,173,                                 //lods          %ds:(%rsi),%rax
8101   76,137,193,                             //mov           %r8,%rcx  -- restore rcx
8102   255,224,                                //jmpq          *%rax
8103   185,8,0,0,0,                            //mov           $0x8,%ecx  -- masked-load path (+0x8e)
8104   68,41,193,                              //sub           %r8d,%ecx  -- 8 - count
8105   192,225,3,                              //shl           $0x3,%cl  -- in bits: 8*(8 - count)
8106   73,199,194,255,255,255,255,             //mov           $0xffffffffffffffff,%r10
8107   73,211,234,                             //shr           %cl,%r10  -- low `count` bytes stay 0xff
8108   196,193,249,110,194,                    //vmovq         %r10,%xmm0
8109   196,226,125,33,192,                     //vpmovsxbd     %xmm0,%ymm0  -- widen each byte into a dword lane mask
8110   196,194,125,140,25,                     //vpmaskmovd    (%r9),%ymm0,%ymm3  -- load only the masked lanes
8111   233,99,255,255,255,                     //jmpq          78e <_sk_load_tables_hsw+0x1a>  -- rejoin after the full load
8112 };
8113 
// sk_load_a8_hsw: machine code that reads bytes from (*ctx + %rdi), where ctx
// is a pointer fetched from the stream at %rsi, converts each byte to float,
// multiplies by ~1/255 and stores the result in ymm3. ymm0, ymm1 and ymm2 are
// set to zero.
//   * %rcx == 0: reads 8 bytes with a single 64-bit load.
//   * %rcx != 0: reads exactly %rcx bytes one at a time, packs them into a
//     64-bit value and rejoins the main path at +0x14.
// %rcx is saved in %r8 on entry and restored before the final jump, because the
// byte loop reuses %cl as its shift count.
// NOTE(review): %rcx is presumably the tail count (1..7) and %rdi the x index -- confirm with the caller.
8114 CODE const uint8_t sk_load_a8_hsw[] = {
8115   73,137,200,                             //mov           %rcx,%r8  -- keep rcx; the tail loop clobbers it
8116   72,173,                                 //lods          %ds:(%rsi),%rax
8117   72,139,0,                               //mov           (%rax),%rax  -- rax = base pointer stored in the context
8118   72,1,248,                               //add           %rdi,%rax  -- rax = base + rdi (byte address)
8119   77,133,192,                             //test          %r8,%r8
8120   117,50,                                 //jne           86d <_sk_load_a8_hsw+0x42>  -- nonzero rcx: byte-at-a-time path
8121   197,250,126,0,                          //vmovq         (%rax),%xmm0  -- load 8 bytes
8122   196,226,125,49,192,                     //vpmovzxbd     %xmm0,%ymm0  -- (+0x14) zero-extend 8 bytes to 8 dwords
8123   197,252,91,192,                         //vcvtdq2ps     %ymm0,%ymm0
8124   184,129,128,128,59,                     //mov           $0x3b808081,%eax  -- ~1/255 as a float
8125   197,249,110,200,                        //vmovd         %eax,%xmm1
8126   196,226,125,88,201,                     //vpbroadcastd  %xmm1,%ymm1
8127   197,252,89,217,                         //vmulps        %ymm1,%ymm0,%ymm3  -- ymm3 = byte * (1/255)
8128   72,173,                                 //lods          %ds:(%rsi),%rax
8129   197,252,87,192,                         //vxorps        %ymm0,%ymm0,%ymm0  -- ymm0 = ymm1 = ymm2 = 0
8130   197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
8131   197,236,87,210,                         //vxorps        %ymm2,%ymm2,%ymm2
8132   76,137,193,                             //mov           %r8,%rcx  -- restore rcx
8133   255,224,                                //jmpq          *%rax
8134   49,201,                                 //xor           %ecx,%ecx  -- tail path (+0x42): rcx = bit shift, starts at 0
8135   77,137,194,                             //mov           %r8,%r10  -- r10 = bytes left to read
8136   69,49,201,                              //xor           %r9d,%r9d  -- r9 = packed result
8137   68,15,182,24,                           //movzbl        (%rax),%r11d  -- loop (+0x4a): read one byte
8138   72,255,192,                             //inc           %rax
8139   73,211,227,                             //shl           %cl,%r11  -- move it to its byte position
8140   77,9,217,                               //or            %r11,%r9
8141   72,131,193,8,                           //add           $0x8,%rcx
8142   73,255,202,                             //dec           %r10
8143   117,234,                                //jne           875 <_sk_load_a8_hsw+0x4a>
8144   196,193,249,110,193,                    //vmovq         %r9,%xmm0  -- unread bytes stay zero
8145   235,173,                                //jmp           83f <_sk_load_a8_hsw+0x14>  -- rejoin after the 8-byte load
8146 };
8147 
// sk_store_a8_hsw: AVX2 stage that multiplies the eight floats in ymm3 by
// 255.0f, converts them to integers, packs them with unsigned saturation down
// to eight bytes and stores those bytes at (ptr + rdi).  ptr is read through
// the context pointer that lods fetches from the stage stream at rsi.
// When rcx is non-zero only the first (rcx & 7) bytes are stored.
// The stage ends by fetching the next stream word and jumping to it.
// NOTE(review): ymm3 is presumably the alpha channel, going by the stage name.
8148 CODE const uint8_t sk_store_a8_hsw[] = {
8149   72,173,                                 //lods          %ds:(%rsi),%rax
8150   76,139,8,                               //mov           (%rax),%r9
  // 0x437f0000 is 255.0f; broadcast it and scale ymm3.
8151   184,0,0,127,67,                         //mov           $0x437f0000,%eax
8152   197,121,110,192,                        //vmovd         %eax,%xmm8
8153   196,66,125,88,192,                      //vpbroadcastd  %xmm8,%ymm8
8154   197,60,89,195,                          //vmulps        %ymm3,%ymm8,%ymm8
8155   196,65,125,91,192,                      //vcvtps2dq     %ymm8,%ymm8
  // Narrow 8 x int32 -> 8 x uint16 -> 8 x uint8 (low 8 bytes of xmm8).
8156   196,67,125,25,193,1,                    //vextractf128  $0x1,%ymm8,%xmm9
8157   196,66,57,43,193,                       //vpackusdw     %xmm9,%xmm8,%xmm8
8158   196,65,57,103,192,                      //vpackuswb     %xmm8,%xmm8,%xmm8
  // rcx == 0: store all 8 bytes; otherwise branch to the partial store below.
8159   72,133,201,                             //test          %rcx,%rcx
8160   117,10,                                 //jne           8cd <_sk_store_a8_hsw+0x3b>
8161   196,65,123,17,4,57,                     //vmovsd        %xmm8,(%r9,%rdi,1)
  // Common exit (+0x37): load the next stage pointer and jump to it.
8162   72,173,                                 //lods          %ds:(%rsi),%rax
8163   255,224,                                //jmpq          *%rax
  // Partial store: index = (rcx & 7) - 1.  An index above 6 (i.e. rcx & 7 == 0)
  // goes straight to the exit without storing anything.
8164   65,137,200,                             //mov           %ecx,%r8d
8165   65,128,224,7,                           //and           $0x7,%r8b
8166   65,254,200,                             //dec           %r8b
8167   65,128,248,6,                           //cmp           $0x6,%r8b
8168   119,236,                                //ja            8c9 <_sk_store_a8_hsw+0x37>
  // Widen bytes to words so that byte k sits at byte position 2*k for vpextrb.
8169   196,66,121,48,192,                      //vpmovzxbw     %xmm8,%xmm8
  // Dispatch through the jump table at +0x9e: target = table + table[index].
8170   65,15,182,192,                          //movzbl        %r8b,%eax
8171   76,141,5,67,0,0,0,                      //lea           0x43(%rip),%r8        # 930 <_sk_store_a8_hsw+0x9e>
8172   73,99,4,128,                            //movslq        (%r8,%rax,4),%rax
8173   76,1,192,                               //add           %r8,%rax
8174   255,224,                                //jmpq          *%rax
  // Fall-through chain: entering at the store for byte k also stores bytes k-1..0.
8175   196,67,121,20,68,57,6,12,               //vpextrb       $0xc,%xmm8,0x6(%r9,%rdi,1)
8176   196,67,121,20,68,57,5,10,               //vpextrb       $0xa,%xmm8,0x5(%r9,%rdi,1)
8177   196,67,121,20,68,57,4,8,                //vpextrb       $0x8,%xmm8,0x4(%r9,%rdi,1)
8178   196,67,121,20,68,57,3,6,                //vpextrb       $0x6,%xmm8,0x3(%r9,%rdi,1)
8179   196,67,121,20,68,57,2,4,                //vpextrb       $0x4,%xmm8,0x2(%r9,%rdi,1)
8180   196,67,121,20,68,57,1,2,                //vpextrb       $0x2,%xmm8,0x1(%r9,%rdi,1)
8181   196,67,121,20,4,57,0,                   //vpextrb       $0x0,%xmm8,(%r9,%rdi,1)
8182   235,154,                                //jmp           8c9 <_sk_store_a8_hsw+0x37>
8183   144,                                    //nop
  // Jump table (data, not code): seven little-endian signed 32-bit offsets,
  // relative to the table's own address, -10, -18, -26, -34, -42, -50, -58.
  // Entry k points at the vpextrb that stores byte k.  The mnemonics printed
  // by the disassembler for these bytes are meaningless.
8184   246,255,                                //idiv          %bh
8185   255,                                    //(bad)
8186   255,                                    //(bad)
8187   238,                                    //out           %al,(%dx)
8188   255,                                    //(bad)
8189   255,                                    //(bad)
8190   255,230,                                //jmpq          *%rsi
8191   255,                                    //(bad)
8192   255,                                    //(bad)
8193   255,                                    //(bad)
8194   222,255,                                //fdivrp        %st,%st(7)
8195   255,                                    //(bad)
8196   255,214,                                //callq         *%rsi
8197   255,                                    //(bad)
8198   255,                                    //(bad)
8199   255,206,                                //dec           %esi
8200   255,                                    //(bad)
8201   255,                                    //(bad)
8202   255,198,                                //inc           %esi
8203   255,                                    //(bad)
8204   255,                                    //(bad)
8205   255,                                    //.byte         0xff
8206 };
8207 
// sk_load_565_hsw: AVX2 stage that reads eight 16-bit values from
// (ptr + rdi*2), where ptr is read through the context pointer fetched from
// the stage stream at rsi, and expands them into four float vectors:
//   ymm0 = float(v & 0xf800) * 0x37842108   (approx. 1/63488)
//   ymm1 = float(v & 0x07e0) * 0x3a020821   (approx. 1/2016)
//   ymm2 = float(v & 0x001f) * 0x3d042108   (approx. 1/31)
//   ymm3 = 1.0f
// When rcx is non-zero only (rcx & 7) values are read; the remaining lanes of
// the input vector are zero.
8208 CODE const uint8_t sk_load_565_hsw[] = {
8209   72,173,                                 //lods          %ds:(%rsi),%rax
8210   76,139,16,                              //mov           (%rax),%r10
  // rcx != 0: build xmm0 piecewise in the partial-load path at +0xa3.
8211   72,133,201,                             //test          %rcx,%rcx
8212   15,133,149,0,0,0,                       //jne           9ef <_sk_load_565_hsw+0xa3>
8213   196,193,122,111,4,122,                  //vmovdqu       (%r10,%rdi,2),%xmm0
  // +0x14: the partial-load path rejoins here with xmm0 already populated.
8214   196,226,125,51,208,                     //vpmovzxwd     %xmm0,%ymm2
  // ymm0 = float(v & 0xf800) * scale
8215   184,0,248,0,0,                          //mov           $0xf800,%eax
8216   197,249,110,192,                        //vmovd         %eax,%xmm0
8217   196,226,125,88,192,                     //vpbroadcastd  %xmm0,%ymm0
8218   197,253,219,194,                        //vpand         %ymm2,%ymm0,%ymm0
8219   197,252,91,192,                         //vcvtdq2ps     %ymm0,%ymm0
8220   184,8,33,132,55,                        //mov           $0x37842108,%eax
8221   197,249,110,200,                        //vmovd         %eax,%xmm1
8222   196,226,125,88,201,                     //vpbroadcastd  %xmm1,%ymm1
8223   197,252,89,193,                         //vmulps        %ymm1,%ymm0,%ymm0
  // ymm1 = float(v & 0x7e0) * scale
8224   184,224,7,0,0,                          //mov           $0x7e0,%eax
8225   197,249,110,200,                        //vmovd         %eax,%xmm1
8226   196,226,125,88,201,                     //vpbroadcastd  %xmm1,%ymm1
8227   197,245,219,202,                        //vpand         %ymm2,%ymm1,%ymm1
8228   197,252,91,201,                         //vcvtdq2ps     %ymm1,%ymm1
8229   184,33,8,2,58,                          //mov           $0x3a020821,%eax
8230   197,249,110,216,                        //vmovd         %eax,%xmm3
8231   196,226,125,88,219,                     //vpbroadcastd  %xmm3,%ymm3
8232   197,244,89,203,                         //vmulps        %ymm3,%ymm1,%ymm1
  // ymm2 = float(v & 0x1f) * scale
8233   184,31,0,0,0,                           //mov           $0x1f,%eax
8234   197,249,110,216,                        //vmovd         %eax,%xmm3
8235   196,226,125,88,219,                     //vpbroadcastd  %xmm3,%ymm3
8236   197,229,219,210,                        //vpand         %ymm2,%ymm3,%ymm2
8237   197,252,91,210,                         //vcvtdq2ps     %ymm2,%ymm2
8238   184,8,33,4,61,                          //mov           $0x3d042108,%eax
8239   197,249,110,216,                        //vmovd         %eax,%xmm3
8240   196,226,125,88,219,                     //vpbroadcastd  %xmm3,%ymm3
8241   197,236,89,211,                         //vmulps        %ymm3,%ymm2,%ymm2
  // ymm3 = 1.0f (0x3f800000) in every lane.
8242   184,0,0,128,63,                         //mov           $0x3f800000,%eax
8243   197,249,110,216,                        //vmovd         %eax,%xmm3
8244   196,226,125,88,219,                     //vpbroadcastd  %xmm3,%ymm3
  // Load the next stage pointer and jump to it.
8245   72,173,                                 //lods          %ds:(%rsi),%rax
8246   255,224,                                //jmpq          *%rax
  // Partial load (+0xa3): zero xmm0, then index = (rcx & 7) - 1.  An index above
  // 6 (i.e. rcx & 7 == 0) rejoins the main path with xmm0 all zero.
8247   65,137,200,                             //mov           %ecx,%r8d
8248   65,128,224,7,                           //and           $0x7,%r8b
8249   197,249,239,192,                        //vpxor         %xmm0,%xmm0,%xmm0
8250   65,254,200,                             //dec           %r8b
8251   65,128,248,6,                           //cmp           $0x6,%r8b
8252   15,135,89,255,255,255,                  //ja            960 <_sk_load_565_hsw+0x14>
  // Dispatch through the jump table at +0x110: target = table + table[index].
8253   69,15,182,192,                          //movzbl        %r8b,%r8d
8254   76,141,13,74,0,0,0,                     //lea           0x4a(%rip),%r9        # a5c <_sk_load_565_hsw+0x110>
8255   75,99,4,129,                            //movslq        (%r9,%r8,4),%rax
8256   76,1,200,                               //add           %r9,%rax
8257   255,224,                                //jmpq          *%rax
  // Fall-through chain: entering at the insert for word k also inserts words k-1..0.
  // The last table entry targets the vpxor below rather than the first vpinsrw.
8258   197,249,239,192,                        //vpxor         %xmm0,%xmm0,%xmm0
8259   196,193,121,196,68,122,12,6,            //vpinsrw       $0x6,0xc(%r10,%rdi,2),%xmm0,%xmm0
8260   196,193,121,196,68,122,10,5,            //vpinsrw       $0x5,0xa(%r10,%rdi,2),%xmm0,%xmm0
8261   196,193,121,196,68,122,8,4,             //vpinsrw       $0x4,0x8(%r10,%rdi,2),%xmm0,%xmm0
8262   196,193,121,196,68,122,6,3,             //vpinsrw       $0x3,0x6(%r10,%rdi,2),%xmm0,%xmm0
8263   196,193,121,196,68,122,4,2,             //vpinsrw       $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0
8264   196,193,121,196,68,122,2,1,             //vpinsrw       $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0
8265   196,193,121,196,4,122,0,                //vpinsrw       $0x0,(%r10,%rdi,2),%xmm0,%xmm0
8266   233,5,255,255,255,                      //jmpq          960 <_sk_load_565_hsw+0x14>
8267   144,                                    //nop
  // Jump table (data, not code): seven little-endian signed 32-bit offsets,
  // relative to the table's own address, -13, -21, -29, -37, -45, -53, -65.
  // The mnemonics printed by the disassembler for these bytes are meaningless.
8268   243,255,                                //repz          (bad)
8269   255,                                    //(bad)
8270   255,                                    //(bad)
8271   235,255,                                //jmp           a61 <_sk_load_565_hsw+0x115>
8272   255,                                    //(bad)
8273   255,227,                                //jmpq          *%rbx
8274   255,                                    //(bad)
8275   255,                                    //(bad)
8276   255,                                    //(bad)
8277   219,255,                                //(bad)
8278   255,                                    //(bad)
8279   255,211,                                //callq         *%rbx
8280   255,                                    //(bad)
8281   255,                                    //(bad)
8282   255,203,                                //dec           %ebx
8283   255,                                    //(bad)
8284   255,                                    //(bad)
8285   255,                                    //(bad)
8286   191,                                    //.byte         0xbf
8287   255,                                    //(bad)
8288   255,                                    //(bad)
8289   255,                                    //.byte         0xff
8290 };
8291 
// sk_store_565_hsw: AVX2 stage that packs ymm0, ymm1 and ymm2 into eight
// 16-bit values and stores them at (ptr + rdi*2), where ptr is read through
// the context pointer fetched from the stage stream at rsi.  Each value is
//   (int(ymm0 * 31.0f) << 11) | (int(ymm1 * 63.0f) << 5) | int(ymm2 * 31.0f)
// narrowed to 16 bits with unsigned saturation.
// When rcx is non-zero only the first (rcx & 7) values are stored.
8292 CODE const uint8_t sk_store_565_hsw[] = {
8293   72,173,                                 //lods          %ds:(%rsi),%rax
8294   76,139,8,                               //mov           (%rax),%r9
  // ymm9 = int(ymm0 * 31.0f) << 11   (0x41f80000 is 31.0f)
8295   184,0,0,248,65,                         //mov           $0x41f80000,%eax
8296   197,121,110,192,                        //vmovd         %eax,%xmm8
8297   196,66,125,88,192,                      //vpbroadcastd  %xmm8,%ymm8
8298   197,60,89,200,                          //vmulps        %ymm0,%ymm8,%ymm9
8299   196,65,125,91,201,                      //vcvtps2dq     %ymm9,%ymm9
8300   196,193,53,114,241,11,                  //vpslld        $0xb,%ymm9,%ymm9
  // ymm9 |= int(ymm1 * 63.0f) << 5   (0x427c0000 is 63.0f)
8301   184,0,0,124,66,                         //mov           $0x427c0000,%eax
8302   197,121,110,208,                        //vmovd         %eax,%xmm10
8303   196,66,125,88,210,                      //vpbroadcastd  %xmm10,%ymm10
8304   197,44,89,209,                          //vmulps        %ymm1,%ymm10,%ymm10
8305   196,65,125,91,210,                      //vcvtps2dq     %ymm10,%ymm10
8306   196,193,45,114,242,5,                   //vpslld        $0x5,%ymm10,%ymm10
8307   196,65,45,235,201,                      //vpor          %ymm9,%ymm10,%ymm9
  // ymm8 = ymm9 | int(ymm2 * 31.0f), reusing the 31.0f still held in ymm8.
8308   197,60,89,194,                          //vmulps        %ymm2,%ymm8,%ymm8
8309   196,65,125,91,192,                      //vcvtps2dq     %ymm8,%ymm8
8310   196,65,53,235,192,                      //vpor          %ymm8,%ymm9,%ymm8
  // Narrow 8 x int32 to 8 x uint16 in xmm8.
8311   196,67,125,57,193,1,                    //vextracti128  $0x1,%ymm8,%xmm9
8312   196,66,57,43,193,                       //vpackusdw     %xmm9,%xmm8,%xmm8
  // rcx == 0: store all 16 bytes; otherwise branch to the partial store below.
8313   72,133,201,                             //test          %rcx,%rcx
8314   117,10,                                 //jne           ae4 <_sk_store_565_hsw+0x6c>
8315   196,65,122,127,4,121,                   //vmovdqu       %xmm8,(%r9,%rdi,2)
  // Common exit (+0x68): load the next stage pointer and jump to it.
8316   72,173,                                 //lods          %ds:(%rsi),%rax
8317   255,224,                                //jmpq          *%rax
  // Partial store: index = (rcx & 7) - 1.  An index above 6 (i.e. rcx & 7 == 0)
  // goes straight to the exit without storing anything.
8318   65,137,200,                             //mov           %ecx,%r8d
8319   65,128,224,7,                           //and           $0x7,%r8b
8320   65,254,200,                             //dec           %r8b
8321   65,128,248,6,                           //cmp           $0x6,%r8b
8322   119,236,                                //ja            ae0 <_sk_store_565_hsw+0x68>
  // Dispatch through the jump table at +0xcc: target = table + table[index].
8323   65,15,182,192,                          //movzbl        %r8b,%eax
8324   76,141,5,69,0,0,0,                      //lea           0x45(%rip),%r8        # b44 <_sk_store_565_hsw+0xcc>
8325   73,99,4,128,                            //movslq        (%r8,%rax,4),%rax
8326   76,1,192,                               //add           %r8,%rax
8327   255,224,                                //jmpq          *%rax
  // Fall-through chain: entering at the store for word k also stores words k-1..0.
8328   196,67,121,21,68,121,12,6,              //vpextrw       $0x6,%xmm8,0xc(%r9,%rdi,2)
8329   196,67,121,21,68,121,10,5,              //vpextrw       $0x5,%xmm8,0xa(%r9,%rdi,2)
8330   196,67,121,21,68,121,8,4,               //vpextrw       $0x4,%xmm8,0x8(%r9,%rdi,2)
8331   196,67,121,21,68,121,6,3,               //vpextrw       $0x3,%xmm8,0x6(%r9,%rdi,2)
8332   196,67,121,21,68,121,4,2,               //vpextrw       $0x2,%xmm8,0x4(%r9,%rdi,2)
8333   196,67,121,21,68,121,2,1,               //vpextrw       $0x1,%xmm8,0x2(%r9,%rdi,2)
8334   196,67,121,21,4,121,0,                  //vpextrw       $0x0,%xmm8,(%r9,%rdi,2)
8335   235,159,                                //jmp           ae0 <_sk_store_565_hsw+0x68>
8336   15,31,0,                                //nopl          (%rax)
  // Jump table (data, not code): seven little-endian signed 32-bit offsets,
  // relative to the table's own address, -12, -20, -28, -36, -44, -52, -60.
  // Entry k points at the vpextrw that stores word k.  The mnemonics printed
  // by the disassembler for these bytes are meaningless.
8337   244,                                    //hlt
8338   255,                                    //(bad)
8339   255,                                    //(bad)
8340   255,                                    //(bad)
8341   236,                                    //in            (%dx),%al
8342   255,                                    //(bad)
8343   255,                                    //(bad)
8344   255,228,                                //jmpq          *%rsp
8345   255,                                    //(bad)
8346   255,                                    //(bad)
8347   255,                                    //(bad)
8348   220,255,                                //fdivr         %st,%st(7)
8349   255,                                    //(bad)
8350   255,212,                                //callq         *%rsp
8351   255,                                    //(bad)
8352   255,                                    //(bad)
8353   255,204,                                //dec           %esp
8354   255,                                    //(bad)
8355   255,                                    //(bad)
8356   255,196,                                //inc           %esp
8357   255,                                    //(bad)
8358   255,                                    //(bad)
8359   255,                                    //.byte         0xff
8360 };
8361 
// sk_load_8888_hsw: AVX2 stage that reads eight 32-bit values from
// (ptr + rdi*4), where ptr is read through the context pointer fetched from
// the stage stream at rsi, and splits each value v into four floats:
//   ymm0 = float( v        & 0xff) * k
//   ymm1 = float((v >>  8) & 0xff) * k
//   ymm2 = float((v >> 16) & 0xff) * k
//   ymm3 = float( v >> 24        ) * k
// with k = 0x3b808081 (approx. 1/255).
// When rcx is non-zero a masked load reads only the first rcx values.
// rcx is parked in r8 for the whole stage because the masked path needs cl as
// a shift count; it is restored before jumping to the next stage.
8362 CODE const uint8_t sk_load_8888_hsw[] = {
8363   73,137,200,                             //mov           %rcx,%r8
8364   72,173,                                 //lods          %ds:(%rsi),%rax
  // r9 = ptr + rdi*4
8365   76,141,12,189,0,0,0,0,                  //lea           0x0(,%rdi,4),%r9
8366   76,3,8,                                 //add           (%rax),%r9
8367   77,133,192,                             //test          %r8,%r8
8368   117,104,                                //jne           bdd <_sk_load_8888_hsw+0x7d>
8369   196,193,126,111,25,                     //vmovdqu       (%r9),%ymm3
  // +0x1a: the masked-load path rejoins here with ymm3 already populated.
  // ymm2 = 0xff in every lane (byte mask); ymm8 = k in every lane.
8370   184,255,0,0,0,                          //mov           $0xff,%eax
8371   197,249,110,192,                        //vmovd         %eax,%xmm0
8372   196,226,125,88,208,                     //vpbroadcastd  %xmm0,%ymm2
8373   197,237,219,195,                        //vpand         %ymm3,%ymm2,%ymm0
8374   197,252,91,192,                         //vcvtdq2ps     %ymm0,%ymm0
8375   184,129,128,128,59,                     //mov           $0x3b808081,%eax
8376   197,249,110,200,                        //vmovd         %eax,%xmm1
8377   196,98,125,88,193,                      //vpbroadcastd  %xmm1,%ymm8
8378   196,193,124,89,192,                     //vmulps        %ymm8,%ymm0,%ymm0
8379   197,245,114,211,8,                      //vpsrld        $0x8,%ymm3,%ymm1
8380   197,237,219,201,                        //vpand         %ymm1,%ymm2,%ymm1
8381   197,252,91,201,                         //vcvtdq2ps     %ymm1,%ymm1
8382   196,193,116,89,200,                     //vmulps        %ymm8,%ymm1,%ymm1
8383   197,181,114,211,16,                     //vpsrld        $0x10,%ymm3,%ymm9
8384   196,193,109,219,209,                    //vpand         %ymm9,%ymm2,%ymm2
8385   197,252,91,210,                         //vcvtdq2ps     %ymm2,%ymm2
8386   196,193,108,89,208,                     //vmulps        %ymm8,%ymm2,%ymm2
  // The top byte needs no mask after a logical shift right by 24.
8387   197,229,114,211,24,                     //vpsrld        $0x18,%ymm3,%ymm3
8388   197,252,91,219,                         //vcvtdq2ps     %ymm3,%ymm3
8389   196,193,100,89,216,                     //vmulps        %ymm8,%ymm3,%ymm3
  // Load the next stage pointer, restore rcx, and jump.
8390   72,173,                                 //lods          %ds:(%rsi),%rax
8391   76,137,193,                             //mov           %r8,%rcx
8392   255,224,                                //jmpq          *%rax
  // Masked load (+0x7d): rax = ~0 >> ((8 - r8d) * 8) leaves 0xff in the low r8
  // bytes; sign-extending each byte to a dword gives a lane mask for vpmaskmovd.
8393   185,8,0,0,0,                            //mov           $0x8,%ecx
8394   68,41,193,                              //sub           %r8d,%ecx
8395   192,225,3,                              //shl           $0x3,%cl
8396   72,199,192,255,255,255,255,             //mov           $0xffffffffffffffff,%rax
8397   72,211,232,                             //shr           %cl,%rax
8398   196,225,249,110,192,                    //vmovq         %rax,%xmm0
8399   196,226,125,33,192,                     //vpmovsxbd     %xmm0,%ymm0
8400   196,194,125,140,25,                     //vpmaskmovd    (%r9),%ymm0,%ymm3
8401   233,116,255,255,255,                    //jmpq          b7a <_sk_load_8888_hsw+0x1a>
8402 };
8403 
// sk_store_8888_hsw: AVX2 stage that scales ymm0..ymm3 by 255.0f, converts
// each to integers and combines them per lane as
//   int(ymm0*255) | int(ymm1*255) << 8 | int(ymm2*255) << 16 | int(ymm3*255) << 24
// then stores the eight 32-bit results at (ptr + rdi*4), where ptr is read
// through the context pointer fetched from the stage stream at rsi.
// When rcx is non-zero a masked store writes only the first rcx values.
// rcx is parked in r8 for the whole stage because the masked path needs cl as
// a shift count; it is restored before jumping to the next stage.
8404 CODE const uint8_t sk_store_8888_hsw[] = {
8405   73,137,200,                             //mov           %rcx,%r8
8406   72,173,                                 //lods          %ds:(%rsi),%rax
  // r9 = ptr + rdi*4
8407   76,141,12,189,0,0,0,0,                  //lea           0x0(,%rdi,4),%r9
8408   76,3,8,                                 //add           (%rax),%r9
  // ymm8 = 255.0f (0x437f0000) in every lane.
8409   184,0,0,127,67,                         //mov           $0x437f0000,%eax
8410   197,121,110,192,                        //vmovd         %eax,%xmm8
8411   196,66,125,88,192,                      //vpbroadcastd  %xmm8,%ymm8
8412   197,60,89,200,                          //vmulps        %ymm0,%ymm8,%ymm9
8413   196,65,125,91,201,                      //vcvtps2dq     %ymm9,%ymm9
8414   197,60,89,209,                          //vmulps        %ymm1,%ymm8,%ymm10
8415   196,65,125,91,210,                      //vcvtps2dq     %ymm10,%ymm10
8416   196,193,45,114,242,8,                   //vpslld        $0x8,%ymm10,%ymm10
8417   196,65,45,235,201,                      //vpor          %ymm9,%ymm10,%ymm9
8418   197,60,89,210,                          //vmulps        %ymm2,%ymm8,%ymm10
8419   196,65,125,91,210,                      //vcvtps2dq     %ymm10,%ymm10
8420   196,193,45,114,242,16,                  //vpslld        $0x10,%ymm10,%ymm10
8421   197,60,89,195,                          //vmulps        %ymm3,%ymm8,%ymm8
8422   196,65,125,91,192,                      //vcvtps2dq     %ymm8,%ymm8
8423   196,193,61,114,240,24,                  //vpslld        $0x18,%ymm8,%ymm8
8424   196,65,45,235,192,                      //vpor          %ymm8,%ymm10,%ymm8
8425   196,65,53,235,192,                      //vpor          %ymm8,%ymm9,%ymm8
  // r8 (saved rcx) == 0: store all 32 bytes; otherwise use the masked store.
8426   77,133,192,                             //test          %r8,%r8
8427   117,12,                                 //jne           c7a <_sk_store_8888_hsw+0x74>
8428   196,65,126,127,1,                       //vmovdqu       %ymm8,(%r9)
  // Common exit (+0x6d): load the next stage pointer, restore rcx, and jump.
8429   72,173,                                 //lods          %ds:(%rsi),%rax
8430   76,137,193,                             //mov           %r8,%rcx
8431   255,224,                                //jmpq          *%rax
  // Masked store (+0x74): rax = ~0 >> ((8 - r8d) * 8) leaves 0xff in the low r8
  // bytes; sign-extending each byte to a dword gives a lane mask for vpmaskmovd.
8432   185,8,0,0,0,                            //mov           $0x8,%ecx
8433   68,41,193,                              //sub           %r8d,%ecx
8434   192,225,3,                              //shl           $0x3,%cl
8435   72,199,192,255,255,255,255,             //mov           $0xffffffffffffffff,%rax
8436   72,211,232,                             //shr           %cl,%rax
8437   196,97,249,110,200,                     //vmovq         %rax,%xmm9
8438   196,66,125,33,201,                      //vpmovsxbd     %xmm9,%ymm9
8439   196,66,53,142,1,                        //vpmaskmovd    %ymm8,%ymm9,(%r9)
8440   235,211,                                //jmp           c73 <_sk_store_8888_hsw+0x6d>
8441 };
8442 
// sk_load_f16_hsw: AVX2/F16C stage that reads eight 8-byte items (four
// half-precision values each) from (ptr + rdi*8), where ptr is read through
// the context pointer fetched from the stage stream at rsi.  The items are
// transposed so that the first, second, third and fourth half of every item
// end up together, then widened to floats in ymm0, ymm1, ymm2 and ymm3.
// When rcx is non-zero only the first rcx items are read, one 8-byte item at
// a time, and the input registers that were not loaded are zeroed.
8443 CODE const uint8_t sk_load_f16_hsw[] = {
8444   72,173,                                 //lods          %ds:(%rsi),%rax
8445   72,139,0,                               //mov           (%rax),%rax
8446   72,133,201,                             //test          %rcx,%rcx
8447   117,97,                                 //jne           d0b <_sk_load_f16_hsw+0x6b>
  // Full load: 4 x 16 bytes = 8 items, two items per xmm register.
8448   197,121,16,4,248,                       //vmovupd       (%rax,%rdi,8),%xmm8
8449   197,249,16,84,248,16,                   //vmovupd       0x10(%rax,%rdi,8),%xmm2
8450   197,249,16,92,248,32,                   //vmovupd       0x20(%rax,%rdi,8),%xmm3
8451   197,122,111,76,248,48,                  //vmovdqu       0x30(%rax,%rdi,8),%xmm9
  // +0x21: the partial-load path rejoins here with xmm8, xmm2, xmm3, xmm9 set.
  // Two rounds of 16-bit unpacks followed by 64-bit unpacks perform the transpose.
8452   197,185,97,194,                         //vpunpcklwd    %xmm2,%xmm8,%xmm0
8453   197,185,105,210,                        //vpunpckhwd    %xmm2,%xmm8,%xmm2
8454   196,193,97,97,201,                      //vpunpcklwd    %xmm9,%xmm3,%xmm1
8455   196,193,97,105,217,                     //vpunpckhwd    %xmm9,%xmm3,%xmm3
8456   197,121,97,194,                         //vpunpcklwd    %xmm2,%xmm0,%xmm8
8457   197,121,105,202,                        //vpunpckhwd    %xmm2,%xmm0,%xmm9
8458   197,241,97,211,                         //vpunpcklwd    %xmm3,%xmm1,%xmm2
8459   197,241,105,219,                        //vpunpckhwd    %xmm3,%xmm1,%xmm3
8460   197,185,108,194,                        //vpunpcklqdq   %xmm2,%xmm8,%xmm0
8461   196,226,125,19,192,                     //vcvtph2ps     %xmm0,%ymm0
8462   197,185,109,202,                        //vpunpckhqdq   %xmm2,%xmm8,%xmm1
8463   196,226,125,19,201,                     //vcvtph2ps     %xmm1,%ymm1
8464   197,177,108,211,                        //vpunpcklqdq   %xmm3,%xmm9,%xmm2
8465   196,226,125,19,210,                     //vcvtph2ps     %xmm2,%ymm2
8466   197,177,109,219,                        //vpunpckhqdq   %xmm3,%xmm9,%xmm3
8467   196,226,125,19,219,                     //vcvtph2ps     %xmm3,%ymm3
  // Load the next stage pointer and jump to it.
8468   72,173,                                 //lods          %ds:(%rsi),%rax
8469   255,224,                                //jmpq          *%rax
  // Partial load (+0x6b): item 0 is always read; xmm9 starts out zero.  Each
  // further item is read only if rcx is large enough, otherwise control leaves
  // for one of the zeroing stubs at the end or straight back to +0x21.
8470   197,123,16,4,248,                       //vmovsd        (%rax,%rdi,8),%xmm8
8471   196,65,49,239,201,                      //vpxor         %xmm9,%xmm9,%xmm9
8472   72,131,249,1,                           //cmp           $0x1,%rcx
8473   116,79,                                 //je            d6a <_sk_load_f16_hsw+0xca>
8474   197,57,22,68,248,8,                     //vmovhpd       0x8(%rax,%rdi,8),%xmm8,%xmm8
8475   72,131,249,3,                           //cmp           $0x3,%rcx
8476   114,67,                                 //jb            d6a <_sk_load_f16_hsw+0xca>
8477   197,251,16,84,248,16,                   //vmovsd        0x10(%rax,%rdi,8),%xmm2
8478   72,131,249,3,                           //cmp           $0x3,%rcx
8479   116,68,                                 //je            d77 <_sk_load_f16_hsw+0xd7>
8480   197,233,22,84,248,24,                   //vmovhpd       0x18(%rax,%rdi,8),%xmm2,%xmm2
8481   72,131,249,5,                           //cmp           $0x5,%rcx
8482   114,56,                                 //jb            d77 <_sk_load_f16_hsw+0xd7>
8483   197,251,16,92,248,32,                   //vmovsd        0x20(%rax,%rdi,8),%xmm3
8484   72,131,249,5,                           //cmp           $0x5,%rcx
8485   15,132,114,255,255,255,                 //je            cc1 <_sk_load_f16_hsw+0x21>
8486   197,225,22,92,248,40,                   //vmovhpd       0x28(%rax,%rdi,8),%xmm3,%xmm3
8487   72,131,249,7,                           //cmp           $0x7,%rcx
8488   15,130,98,255,255,255,                  //jb            cc1 <_sk_load_f16_hsw+0x21>
8489   197,122,126,76,248,48,                  //vmovq         0x30(%rax,%rdi,8),%xmm9
8490   233,87,255,255,255,                     //jmpq          cc1 <_sk_load_f16_hsw+0x21>
  // +0xca: fewer than three items were read; zero xmm3 and xmm2, then rejoin.
8491   197,225,87,219,                         //vxorpd        %xmm3,%xmm3,%xmm3
8492   197,233,87,210,                         //vxorpd        %xmm2,%xmm2,%xmm2
8493   233,74,255,255,255,                     //jmpq          cc1 <_sk_load_f16_hsw+0x21>
  // +0xd7: three or four items were read; zero xmm3 only, then rejoin.
8494   197,225,87,219,                         //vxorpd        %xmm3,%xmm3,%xmm3
8495   233,65,255,255,255,                     //jmpq          cc1 <_sk_load_f16_hsw+0x21>
8496 };
8497 
// sk_store_f16_hsw: AVX2/F16C stage that narrows ymm0..ymm3 to half precision,
// interleaves them so that each 8-byte item holds one half from each of the
// four registers (in ymm0, ymm1, ymm2, ymm3 order), and stores the items at
// (ptr + rdi*8).  ptr is read through the context pointer fetched from the
// stage stream at rsi.
// When rcx is non-zero only the first rcx items are stored, 8 bytes at a time.
8498 CODE const uint8_t sk_store_f16_hsw[] = {
8499   72,173,                                 //lods          %ds:(%rsi),%rax
8500   72,139,0,                               //mov           (%rax),%rax
  // Float -> half for each of the four input registers.
8501   196,195,125,29,192,4,                   //vcvtps2ph     $0x4,%ymm0,%xmm8
8502   196,195,125,29,201,4,                   //vcvtps2ph     $0x4,%ymm1,%xmm9
8503   196,195,125,29,210,4,                   //vcvtps2ph     $0x4,%ymm2,%xmm10
8504   196,195,125,29,219,4,                   //vcvtps2ph     $0x4,%ymm3,%xmm11
  // Interleave: 16-bit unpacks pair the registers, 32-bit unpacks finish the
  // job.  Results: xmm11 = items 0-1, xmm10 = items 2-3, xmm9 = items 4-5,
  // xmm8 = items 6-7.
8505   196,65,57,97,225,                       //vpunpcklwd    %xmm9,%xmm8,%xmm12
8506   196,65,57,105,193,                      //vpunpckhwd    %xmm9,%xmm8,%xmm8
8507   196,65,41,97,203,                       //vpunpcklwd    %xmm11,%xmm10,%xmm9
8508   196,65,41,105,235,                      //vpunpckhwd    %xmm11,%xmm10,%xmm13
8509   196,65,25,98,217,                       //vpunpckldq    %xmm9,%xmm12,%xmm11
8510   196,65,25,106,209,                      //vpunpckhdq    %xmm9,%xmm12,%xmm10
8511   196,65,57,98,205,                       //vpunpckldq    %xmm13,%xmm8,%xmm9
8512   196,65,57,106,197,                      //vpunpckhdq    %xmm13,%xmm8,%xmm8
  // rcx == 0: store all 64 bytes; otherwise branch to the partial store below.
8513   72,133,201,                             //test          %rcx,%rcx
8514   117,27,                                 //jne           de5 <_sk_store_f16_hsw+0x65>
8515   197,120,17,28,248,                      //vmovups       %xmm11,(%rax,%rdi,8)
8516   197,120,17,84,248,16,                   //vmovups       %xmm10,0x10(%rax,%rdi,8)
8517   197,120,17,76,248,32,                   //vmovups       %xmm9,0x20(%rax,%rdi,8)
8518   197,122,127,68,248,48,                  //vmovdqu       %xmm8,0x30(%rax,%rdi,8)
  // Common exit (+0x61): load the next stage pointer and jump to it.
8519   72,173,                                 //lods          %ds:(%rsi),%rax
8520   255,224,                                //jmpq          *%rax
  // Partial store (+0x65): item 0 is always written, then one more item per
  // step until rcx items are out.  Each je that directly follows a vector
  // store tests the flags left by the cmp above that store.
8521   197,121,214,28,248,                     //vmovq         %xmm11,(%rax,%rdi,8)
8522   72,131,249,1,                           //cmp           $0x1,%rcx
8523   116,241,                                //je            de1 <_sk_store_f16_hsw+0x61>
8524   197,121,23,92,248,8,                    //vmovhpd       %xmm11,0x8(%rax,%rdi,8)
8525   72,131,249,3,                           //cmp           $0x3,%rcx
8526   114,229,                                //jb            de1 <_sk_store_f16_hsw+0x61>
8527   197,121,214,84,248,16,                  //vmovq         %xmm10,0x10(%rax,%rdi,8)
8528   116,221,                                //je            de1 <_sk_store_f16_hsw+0x61>
8529   197,121,23,84,248,24,                   //vmovhpd       %xmm10,0x18(%rax,%rdi,8)
8530   72,131,249,5,                           //cmp           $0x5,%rcx
8531   114,209,                                //jb            de1 <_sk_store_f16_hsw+0x61>
8532   197,121,214,76,248,32,                  //vmovq         %xmm9,0x20(%rax,%rdi,8)
8533   116,201,                                //je            de1 <_sk_store_f16_hsw+0x61>
8534   197,121,23,76,248,40,                   //vmovhpd       %xmm9,0x28(%rax,%rdi,8)
8535   72,131,249,7,                           //cmp           $0x7,%rcx
8536   114,189,                                //jb            de1 <_sk_store_f16_hsw+0x61>
8537   197,121,214,68,248,48,                  //vmovq         %xmm8,0x30(%rax,%rdi,8)
8538   235,181,                                //jmp           de1 <_sk_store_f16_hsw+0x61>
8539 };
8540 
// sk_store_f32_hsw: AVX stage that interleaves ymm0..ymm3 so that each 16-byte
// item holds one float from each of the four registers (in ymm0, ymm1, ymm2,
// ymm3 order), and stores the items at (ptr + rdi*16).  ptr is read through
// the context pointer fetched from the stage stream at rsi.
// When rcx is non-zero only the first rcx items are stored, 16 bytes at a time.
8541 CODE const uint8_t sk_store_f32_hsw[] = {
8542   72,173,                                 //lods          %ds:(%rsi),%rax
8543   76,139,0,                               //mov           (%rax),%r8
  // rax = rdi*4; combined with the *4 scale in the stores this is rdi*16 bytes.
8544   72,141,4,189,0,0,0,0,                   //lea           0x0(,%rdi,4),%rax
  // Interleave within each 128-bit half.  Afterwards, low half / high half:
  //   ymm10 = item 0 / item 4     ymm9  = item 1 / item 5
  //   ymm8  = item 2 / item 6     ymm11 = item 3 / item 7
8545   197,124,20,193,                         //vunpcklps     %ymm1,%ymm0,%ymm8
8546   197,124,21,217,                         //vunpckhps     %ymm1,%ymm0,%ymm11
8547   197,108,20,203,                         //vunpcklps     %ymm3,%ymm2,%ymm9
8548   197,108,21,227,                         //vunpckhps     %ymm3,%ymm2,%ymm12
8549   196,65,61,20,209,                       //vunpcklpd     %ymm9,%ymm8,%ymm10
8550   196,65,61,21,201,                       //vunpckhpd     %ymm9,%ymm8,%ymm9
8551   196,65,37,20,196,                       //vunpcklpd     %ymm12,%ymm11,%ymm8
8552   196,65,37,21,220,                       //vunpckhpd     %ymm12,%ymm11,%ymm11
  // rcx == 0: store all 128 bytes; otherwise branch to the partial store below.
8553   72,133,201,                             //test          %rcx,%rcx
8554   117,55,                                 //jne           e99 <_sk_store_f32_hsw+0x6d>
  // Regroup the halves so each ymm holds two consecutive items, then store.
8555   196,67,45,24,225,1,                     //vinsertf128   $0x1,%xmm9,%ymm10,%ymm12
8556   196,67,61,24,235,1,                     //vinsertf128   $0x1,%xmm11,%ymm8,%ymm13
8557   196,67,45,6,201,49,                     //vperm2f128    $0x31,%ymm9,%ymm10,%ymm9
8558   196,67,61,6,195,49,                     //vperm2f128    $0x31,%ymm11,%ymm8,%ymm8
8559   196,65,125,17,36,128,                   //vmovupd       %ymm12,(%r8,%rax,4)
8560   196,65,125,17,108,128,32,               //vmovupd       %ymm13,0x20(%r8,%rax,4)
8561   196,65,125,17,76,128,64,                //vmovupd       %ymm9,0x40(%r8,%rax,4)
8562   196,65,125,17,68,128,96,                //vmovupd       %ymm8,0x60(%r8,%rax,4)
  // Common exit (+0x69): load the next stage pointer and jump to it.
8563   72,173,                                 //lods          %ds:(%rsi),%rax
8564   255,224,                                //jmpq          *%rax
  // Partial store (+0x6d): item 0 is always written, then one more item per
  // step until rcx items are out.  Each je that directly follows a vector
  // store tests the flags left by the cmp above that store.
8565   196,65,121,17,20,128,                   //vmovupd       %xmm10,(%r8,%rax,4)
8566   72,131,249,1,                           //cmp           $0x1,%rcx
8567   116,240,                                //je            e95 <_sk_store_f32_hsw+0x69>
8568   196,65,121,17,76,128,16,                //vmovupd       %xmm9,0x10(%r8,%rax,4)
8569   72,131,249,3,                           //cmp           $0x3,%rcx
8570   114,227,                                //jb            e95 <_sk_store_f32_hsw+0x69>
8571   196,65,121,17,68,128,32,                //vmovupd       %xmm8,0x20(%r8,%rax,4)
8572   116,218,                                //je            e95 <_sk_store_f32_hsw+0x69>
8573   196,65,121,17,92,128,48,                //vmovupd       %xmm11,0x30(%r8,%rax,4)
8574   72,131,249,5,                           //cmp           $0x5,%rcx
8575   114,205,                                //jb            e95 <_sk_store_f32_hsw+0x69>
8576   196,67,125,25,84,128,64,1,              //vextractf128  $0x1,%ymm10,0x40(%r8,%rax,4)
8577   116,195,                                //je            e95 <_sk_store_f32_hsw+0x69>
8578   196,67,125,25,76,128,80,1,              //vextractf128  $0x1,%ymm9,0x50(%r8,%rax,4)
8579   72,131,249,7,                           //cmp           $0x7,%rcx
8580   114,181,                                //jb            e95 <_sk_store_f32_hsw+0x69>
8581   196,67,125,25,68,128,96,1,              //vextractf128  $0x1,%ymm8,0x60(%r8,%rax,4)
8582   235,171,                                //jmp           e95 <_sk_store_f32_hsw+0x69>
8583 };
8584 
// sk_clamp_x_hsw: AVX2 stage that clamps every lane of ymm0 to the range
// [0, limit'], where limit is the 32-bit value that the word fetched from the
// stage stream at rsi points to, and limit' is that value with its bit
// pattern decremented by one as an integer.
// NOTE(review): for a positive finite float the integer decrement yields the
// next representable value below it, making the upper bound exclusive of
// limit -- confirm that limit is always such a float.
8585 CODE const uint8_t sk_clamp_x_hsw[] = {
8586   72,173,                                 //lods          %ds:(%rsi),%rax
  // ymm0 = max(0, ymm0)
8587   196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
8588   197,188,95,192,                         //vmaxps        %ymm0,%ymm8,%ymm0
  // ymm8 = broadcast(limit); ymm9 = all ones (-1 per dword); ymm8 += ymm9.
8589   196,98,125,88,0,                        //vpbroadcastd  (%rax),%ymm8
8590   196,65,53,118,201,                      //vpcmpeqd      %ymm9,%ymm9,%ymm9
8591   196,65,61,254,193,                      //vpaddd        %ymm9,%ymm8,%ymm8
  // ymm0 = min(ymm0, ymm8)
8592   196,193,124,93,192,                     //vminps        %ymm8,%ymm0,%ymm0
  // Load the next stage pointer and jump to it.
8593   72,173,                                 //lods          %ds:(%rsi),%rax
8594   255,224,                                //jmpq          *%rax
8595 };
8596 
// sk_clamp_y_hsw: AVX2 stage that clamps every lane of ymm1 to the range
// [0, limit'], where limit is the 32-bit value that the word fetched from the
// stage stream at rsi points to, and limit' is that value with its bit
// pattern decremented by one as an integer.  Same sequence as the x variant
// of this stage, applied to ymm1 instead of ymm0.
// NOTE(review): for a positive finite float the integer decrement yields the
// next representable value below it, making the upper bound exclusive of
// limit -- confirm that limit is always such a float.
8597 CODE const uint8_t sk_clamp_y_hsw[] = {
8598   72,173,                                 //lods          %ds:(%rsi),%rax
  // ymm1 = max(0, ymm1)
8599   196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
8600   197,188,95,201,                         //vmaxps        %ymm1,%ymm8,%ymm1
  // ymm8 = broadcast(limit); ymm9 = all ones (-1 per dword); ymm8 += ymm9.
8601   196,98,125,88,0,                        //vpbroadcastd  (%rax),%ymm8
8602   196,65,53,118,201,                      //vpcmpeqd      %ymm9,%ymm9,%ymm9
8603   196,65,61,254,193,                      //vpaddd        %ymm9,%ymm8,%ymm8
  // ymm1 = min(ymm1, ymm8)
8604   196,193,116,93,200,                     //vminps        %ymm8,%ymm1,%ymm1
  // Load the next stage pointer and jump to it.
8605   72,173,                                 //lods          %ds:(%rsi),%rax
8606   255,224,                                //jmpq          *%rax
8607 };
8608 
// sk_repeat_x_hsw: wraps every lane of %ymm0 into a period L, where L is the
// float at the context pointer loaded by the first lods.
//   t     = x - L * floor(x / L)      (vdivps, vroundps imm 1 = round down, vfnmadd213ps)
//   %ymm0 = min(t, bits(L) - 1)       (all-ones from vpcmpeqd added as an integer
//                                      decrements L's bit pattern by one)
// %ymm9 is used as scratch; %ymm0 is temporarily overwritten with all-ones
// only after the wrapped value has been computed into %ymm9.
// The second lods loads the next stage's address, which is then jumped to.
8609 CODE const uint8_t sk_repeat_x_hsw[] = {
8610   72,173,                                 //lods          %ds:(%rsi),%rax
8611   196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
8612   196,65,124,94,200,                      //vdivps        %ymm8,%ymm0,%ymm9
8613   196,67,125,8,201,1,                     //vroundps      $0x1,%ymm9,%ymm9
8614   196,98,61,172,200,                      //vfnmadd213ps  %ymm0,%ymm8,%ymm9
8615   197,253,118,192,                        //vpcmpeqd      %ymm0,%ymm0,%ymm0
8616   197,189,254,192,                        //vpaddd        %ymm0,%ymm8,%ymm0
8617   197,180,93,192,                         //vminps        %ymm0,%ymm9,%ymm0
8618   72,173,                                 //lods          %ds:(%rsi),%rax
8619   255,224,                                //jmpq          *%rax
8620 };
8621 
// sk_repeat_y_hsw: same instruction sequence as sk_repeat_x_hsw, applied to %ymm1.
//   t     = y - L * floor(y / L)      with L = float at the context pointer
//   %ymm1 = min(t, bits(L) - 1)       (integer decrement of L's bit pattern)
// %ymm8/%ymm9 are scratch. The second lods loads the next stage's address,
// which is then jumped to.
8622 CODE const uint8_t sk_repeat_y_hsw[] = {
8623   72,173,                                 //lods          %ds:(%rsi),%rax
8624   196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
8625   196,65,116,94,200,                      //vdivps        %ymm8,%ymm1,%ymm9
8626   196,67,125,8,201,1,                     //vroundps      $0x1,%ymm9,%ymm9
8627   196,98,61,172,201,                      //vfnmadd213ps  %ymm1,%ymm8,%ymm9
8628   197,245,118,201,                        //vpcmpeqd      %ymm1,%ymm1,%ymm1
8629   197,189,254,201,                        //vpaddd        %ymm1,%ymm8,%ymm1
8630   197,180,93,201,                         //vminps        %ymm1,%ymm9,%ymm1
8631   72,173,                                 //lods          %ds:(%rsi),%rax
8632   255,224,                                //jmpq          *%rax
8633 };
8634 
// sk_mirror_x_hsw: reflects every lane of %ymm0 about a limit L, where L is the
// scalar float at the context pointer loaded by the first lods.
//   u     = x - L
//   v     = u - 2L * floor(u / 2L)          (vroundps imm 1 = round down)
//   w     = v - L
//   %ymm0 = min(|w|, bits(L) - 1)
// |w| is formed as (0 - w) & w: ANDing a value with its negation clears the
// sign bit. bits(L) - 1 is L's bit pattern decremented by one via an integer
// add of all-ones (vpcmpeqd + vpaddd).
// %ymm8-%ymm10 are scratch. The second lods loads the next stage's address,
// which is then jumped to.
8635 CODE const uint8_t sk_mirror_x_hsw[] = {
8636   72,173,                                 //lods          %ds:(%rsi),%rax
8637   197,122,16,0,                           //vmovss        (%rax),%xmm8
8638   196,66,125,24,200,                      //vbroadcastss  %xmm8,%ymm9
8639   196,65,124,92,209,                      //vsubps        %ymm9,%ymm0,%ymm10
8640   196,193,58,88,192,                      //vaddss        %xmm8,%xmm8,%xmm0
8641   196,226,125,24,192,                     //vbroadcastss  %xmm0,%ymm0
8642   197,44,94,192,                          //vdivps        %ymm0,%ymm10,%ymm8
8643   196,67,125,8,192,1,                     //vroundps      $0x1,%ymm8,%ymm8
8644   196,66,125,172,194,                     //vfnmadd213ps  %ymm10,%ymm0,%ymm8
8645   196,193,60,92,193,                      //vsubps        %ymm9,%ymm8,%ymm0
8646   196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
8647   197,60,92,192,                          //vsubps        %ymm0,%ymm8,%ymm8
8648   197,188,84,192,                         //vandps        %ymm0,%ymm8,%ymm0
8649   196,65,61,118,192,                      //vpcmpeqd      %ymm8,%ymm8,%ymm8
8650   196,65,53,254,192,                      //vpaddd        %ymm8,%ymm9,%ymm8
8651   196,193,124,93,192,                     //vminps        %ymm8,%ymm0,%ymm0
8652   72,173,                                 //lods          %ds:(%rsi),%rax
8653   255,224,                                //jmpq          *%rax
8654 };
8655 
// sk_mirror_y_hsw: same instruction sequence as sk_mirror_x_hsw, applied to %ymm1.
//   u     = y - L                           (L = scalar float at the context pointer)
//   v     = u - 2L * floor(u / 2L)
//   w     = v - L
//   %ymm1 = min(|w|, bits(L) - 1)
// |w| is computed as (0 - w) & w; bits(L) - 1 is an integer decrement of L's
// bit pattern. %ymm8-%ymm10 are scratch. The second lods loads the next
// stage's address, which is then jumped to.
8656 CODE const uint8_t sk_mirror_y_hsw[] = {
8657   72,173,                                 //lods          %ds:(%rsi),%rax
8658   197,122,16,0,                           //vmovss        (%rax),%xmm8
8659   196,66,125,24,200,                      //vbroadcastss  %xmm8,%ymm9
8660   196,65,116,92,209,                      //vsubps        %ymm9,%ymm1,%ymm10
8661   196,193,58,88,200,                      //vaddss        %xmm8,%xmm8,%xmm1
8662   196,226,125,24,201,                     //vbroadcastss  %xmm1,%ymm1
8663   197,44,94,193,                          //vdivps        %ymm1,%ymm10,%ymm8
8664   196,67,125,8,192,1,                     //vroundps      $0x1,%ymm8,%ymm8
8665   196,66,117,172,194,                     //vfnmadd213ps  %ymm10,%ymm1,%ymm8
8666   196,193,60,92,201,                      //vsubps        %ymm9,%ymm8,%ymm1
8667   196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
8668   197,60,92,193,                          //vsubps        %ymm1,%ymm8,%ymm8
8669   197,188,84,201,                         //vandps        %ymm1,%ymm8,%ymm1
8670   196,65,61,118,192,                      //vpcmpeqd      %ymm8,%ymm8,%ymm8
8671   196,65,53,254,192,                      //vpaddd        %ymm8,%ymm9,%ymm8
8672   196,193,116,93,200,                     //vminps        %ymm8,%ymm1,%ymm1
8673   72,173,                                 //lods          %ds:(%rsi),%rax
8674   255,224,                                //jmpq          *%rax
8675 };
8676 
// sk_luminance_to_alpha_hsw: writes a weighted sum of %ymm0/%ymm1/%ymm2 into
// %ymm3 and zeroes %ymm0-%ymm2.
//   %ymm3 = %ymm0 * 0x3e59b3d0 (~0.2126f)
//         + %ymm1 * 0x3f371759 (~0.7152f)
//         + %ymm2 * 0x3d93dd98 (~0.0722f)
// The three weights are immediate constants materialised through %eax, so the
// stage reads no context: its single lods loads the next stage's address.
// %ymm3's incoming value is discarded (it is used as scratch for the weights).
// NOTE(review): presumably %ymm0-%ymm3 are the r,g,b,a channels -- confirm.
8677 CODE const uint8_t sk_luminance_to_alpha_hsw[] = {
8678   184,208,179,89,62,                      //mov           $0x3e59b3d0,%eax
8679   197,249,110,216,                        //vmovd         %eax,%xmm3
8680   196,98,125,88,195,                      //vpbroadcastd  %xmm3,%ymm8
8681   184,89,23,55,63,                        //mov           $0x3f371759,%eax
8682   197,249,110,216,                        //vmovd         %eax,%xmm3
8683   196,226,125,88,219,                     //vpbroadcastd  %xmm3,%ymm3
8684   197,228,89,201,                         //vmulps        %ymm1,%ymm3,%ymm1
8685   196,98,125,168,193,                     //vfmadd213ps   %ymm1,%ymm0,%ymm8
8686   184,152,221,147,61,                     //mov           $0x3d93dd98,%eax
8687   197,249,110,192,                        //vmovd         %eax,%xmm0
8688   196,226,125,88,216,                     //vpbroadcastd  %xmm0,%ymm3
8689   196,194,109,168,216,                    //vfmadd213ps   %ymm8,%ymm2,%ymm3
8690   72,173,                                 //lods          %ds:(%rsi),%rax
8691   197,253,239,192,                        //vpxor         %ymm0,%ymm0,%ymm0
8692   197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
8693   197,236,87,210,                         //vxorps        %ymm2,%ymm2,%ymm2
8694   255,224,                                //jmpq          *%rax
8695 };
8696 
// sk_matrix_2x3_hsw: applies a 2x3 affine transform to (%ymm0, %ymm1).
// With m = the six consecutive floats at the context pointer (first lods):
//   new %ymm0 = %ymm0*m[0] + %ymm1*m[2] + m[4]
//   new %ymm1 = %ymm0*m[1] + %ymm1*m[3] + m[5]
// Both results are accumulated in %ymm8/%ymm9 with fused multiply-adds from
// the original inputs and only copied back at the end. %ymm8-%ymm11 are scratch.
// The second lods loads the next stage's address, which is then jumped to.
8697 CODE const uint8_t sk_matrix_2x3_hsw[] = {
8698   72,173,                                 //lods          %ds:(%rsi),%rax
8699   196,98,125,24,8,                        //vbroadcastss  (%rax),%ymm9
8700   196,98,125,24,80,8,                     //vbroadcastss  0x8(%rax),%ymm10
8701   196,98,125,24,64,16,                    //vbroadcastss  0x10(%rax),%ymm8
8702   196,66,117,184,194,                     //vfmadd231ps   %ymm10,%ymm1,%ymm8
8703   196,66,125,184,193,                     //vfmadd231ps   %ymm9,%ymm0,%ymm8
8704   196,98,125,24,80,4,                     //vbroadcastss  0x4(%rax),%ymm10
8705   196,98,125,24,88,12,                    //vbroadcastss  0xc(%rax),%ymm11
8706   196,98,125,24,72,20,                    //vbroadcastss  0x14(%rax),%ymm9
8707   196,66,117,184,203,                     //vfmadd231ps   %ymm11,%ymm1,%ymm9
8708   196,66,125,184,202,                     //vfmadd231ps   %ymm10,%ymm0,%ymm9
8709   72,173,                                 //lods          %ds:(%rsi),%rax
8710   197,124,41,192,                         //vmovaps       %ymm8,%ymm0
8711   197,124,41,201,                         //vmovaps       %ymm9,%ymm1
8712   255,224,                                //jmpq          *%rax
8713 };
8714 
// sk_matrix_3x4_hsw: applies a 3x4 affine transform to (%ymm0, %ymm1, %ymm2).
// With m = the twelve consecutive floats at the context pointer (first lods):
//   new %ymm0 = %ymm0*m[0] + %ymm1*m[3] + %ymm2*m[6] + m[9]
//   new %ymm1 = %ymm0*m[1] + %ymm1*m[4] + %ymm2*m[7] + m[10]
//   new %ymm2 = %ymm0*m[2] + %ymm1*m[5] + %ymm2*m[8] + m[11]
// All three results are accumulated in %ymm8-%ymm10 from the original inputs
// and copied back at the end. %ymm8-%ymm13 are scratch.
// The second lods loads the next stage's address, which is then jumped to.
8715 CODE const uint8_t sk_matrix_3x4_hsw[] = {
8716   72,173,                                 //lods          %ds:(%rsi),%rax
8717   196,98,125,24,8,                        //vbroadcastss  (%rax),%ymm9
8718   196,98,125,24,80,12,                    //vbroadcastss  0xc(%rax),%ymm10
8719   196,98,125,24,88,24,                    //vbroadcastss  0x18(%rax),%ymm11
8720   196,98,125,24,64,36,                    //vbroadcastss  0x24(%rax),%ymm8
8721   196,66,109,184,195,                     //vfmadd231ps   %ymm11,%ymm2,%ymm8
8722   196,66,117,184,194,                     //vfmadd231ps   %ymm10,%ymm1,%ymm8
8723   196,66,125,184,193,                     //vfmadd231ps   %ymm9,%ymm0,%ymm8
8724   196,98,125,24,80,4,                     //vbroadcastss  0x4(%rax),%ymm10
8725   196,98,125,24,88,16,                    //vbroadcastss  0x10(%rax),%ymm11
8726   196,98,125,24,96,28,                    //vbroadcastss  0x1c(%rax),%ymm12
8727   196,98,125,24,72,40,                    //vbroadcastss  0x28(%rax),%ymm9
8728   196,66,109,184,204,                     //vfmadd231ps   %ymm12,%ymm2,%ymm9
8729   196,66,117,184,203,                     //vfmadd231ps   %ymm11,%ymm1,%ymm9
8730   196,66,125,184,202,                     //vfmadd231ps   %ymm10,%ymm0,%ymm9
8731   196,98,125,24,88,8,                     //vbroadcastss  0x8(%rax),%ymm11
8732   196,98,125,24,96,20,                    //vbroadcastss  0x14(%rax),%ymm12
8733   196,98,125,24,104,32,                   //vbroadcastss  0x20(%rax),%ymm13
8734   196,98,125,24,80,44,                    //vbroadcastss  0x2c(%rax),%ymm10
8735   196,66,109,184,213,                     //vfmadd231ps   %ymm13,%ymm2,%ymm10
8736   196,66,117,184,212,                     //vfmadd231ps   %ymm12,%ymm1,%ymm10
8737   196,66,125,184,211,                     //vfmadd231ps   %ymm11,%ymm0,%ymm10
8738   72,173,                                 //lods          %ds:(%rsi),%rax
8739   197,124,41,192,                         //vmovaps       %ymm8,%ymm0
8740   197,124,41,201,                         //vmovaps       %ymm9,%ymm1
8741   197,124,41,210,                         //vmovaps       %ymm10,%ymm2
8742   255,224,                                //jmpq          *%rax
8743 };
8744 
// sk_matrix_4x5_hsw: applies a 4x5 affine transform to (%ymm0, %ymm1, %ymm2, %ymm3).
// With m = the twenty consecutive floats at the context pointer (first lods),
// for output k in 0..3 (written to %ymm0..%ymm3 respectively):
//   out[k] = %ymm0*m[k] + %ymm1*m[4+k] + %ymm2*m[8+k] + %ymm3*m[12+k] + m[16+k]
// All four results are accumulated in %ymm8-%ymm11 from the original inputs
// and copied back at the end. %ymm8-%ymm15 are scratch.
// The second lods loads the next stage's address, which is then jumped to.
8745 CODE const uint8_t sk_matrix_4x5_hsw[] = {
8746   72,173,                                 //lods          %ds:(%rsi),%rax
8747   196,98,125,24,8,                        //vbroadcastss  (%rax),%ymm9
8748   196,98,125,24,80,16,                    //vbroadcastss  0x10(%rax),%ymm10
8749   196,98,125,24,88,32,                    //vbroadcastss  0x20(%rax),%ymm11
8750   196,98,125,24,96,48,                    //vbroadcastss  0x30(%rax),%ymm12
8751   196,98,125,24,64,64,                    //vbroadcastss  0x40(%rax),%ymm8
8752   196,66,101,184,196,                     //vfmadd231ps   %ymm12,%ymm3,%ymm8
8753   196,66,109,184,195,                     //vfmadd231ps   %ymm11,%ymm2,%ymm8
8754   196,66,117,184,194,                     //vfmadd231ps   %ymm10,%ymm1,%ymm8
8755   196,66,125,184,193,                     //vfmadd231ps   %ymm9,%ymm0,%ymm8
8756   196,98,125,24,80,4,                     //vbroadcastss  0x4(%rax),%ymm10
8757   196,98,125,24,88,20,                    //vbroadcastss  0x14(%rax),%ymm11
8758   196,98,125,24,96,36,                    //vbroadcastss  0x24(%rax),%ymm12
8759   196,98,125,24,104,52,                   //vbroadcastss  0x34(%rax),%ymm13
8760   196,98,125,24,72,68,                    //vbroadcastss  0x44(%rax),%ymm9
8761   196,66,101,184,205,                     //vfmadd231ps   %ymm13,%ymm3,%ymm9
8762   196,66,109,184,204,                     //vfmadd231ps   %ymm12,%ymm2,%ymm9
8763   196,66,117,184,203,                     //vfmadd231ps   %ymm11,%ymm1,%ymm9
8764   196,66,125,184,202,                     //vfmadd231ps   %ymm10,%ymm0,%ymm9
8765   196,98,125,24,88,8,                     //vbroadcastss  0x8(%rax),%ymm11
8766   196,98,125,24,96,24,                    //vbroadcastss  0x18(%rax),%ymm12
8767   196,98,125,24,104,40,                   //vbroadcastss  0x28(%rax),%ymm13
8768   196,98,125,24,112,56,                   //vbroadcastss  0x38(%rax),%ymm14
8769   196,98,125,24,80,72,                    //vbroadcastss  0x48(%rax),%ymm10
8770   196,66,101,184,214,                     //vfmadd231ps   %ymm14,%ymm3,%ymm10
8771   196,66,109,184,213,                     //vfmadd231ps   %ymm13,%ymm2,%ymm10
8772   196,66,117,184,212,                     //vfmadd231ps   %ymm12,%ymm1,%ymm10
8773   196,66,125,184,211,                     //vfmadd231ps   %ymm11,%ymm0,%ymm10
8774   196,98,125,24,96,12,                    //vbroadcastss  0xc(%rax),%ymm12
8775   196,98,125,24,104,28,                   //vbroadcastss  0x1c(%rax),%ymm13
8776   196,98,125,24,112,44,                   //vbroadcastss  0x2c(%rax),%ymm14
8777   196,98,125,24,120,60,                   //vbroadcastss  0x3c(%rax),%ymm15
8778   196,98,125,24,88,76,                    //vbroadcastss  0x4c(%rax),%ymm11
8779   196,66,101,184,223,                     //vfmadd231ps   %ymm15,%ymm3,%ymm11
8780   196,66,109,184,222,                     //vfmadd231ps   %ymm14,%ymm2,%ymm11
8781   196,66,117,184,221,                     //vfmadd231ps   %ymm13,%ymm1,%ymm11
8782   196,66,125,184,220,                     //vfmadd231ps   %ymm12,%ymm0,%ymm11
8783   72,173,                                 //lods          %ds:(%rsi),%rax
8784   197,124,41,192,                         //vmovaps       %ymm8,%ymm0
8785   197,124,41,201,                         //vmovaps       %ymm9,%ymm1
8786   197,124,41,210,                         //vmovaps       %ymm10,%ymm2
8787   197,124,41,219,                         //vmovaps       %ymm11,%ymm3
8788   255,224,                                //jmpq          *%rax
8789 };
8790 
// sk_matrix_perspective_hsw: applies a 3x3 projective transform to (%ymm0, %ymm1).
// With m = the nine consecutive floats at the context pointer (first lods):
//   X = %ymm0*m[0] + %ymm1*m[1] + m[2]
//   Y = %ymm0*m[3] + %ymm1*m[4] + m[5]
//   W = %ymm0*m[6] + %ymm1*m[7] + m[8]
//   new %ymm0 = X * rcp(W)
//   new %ymm1 = Y * rcp(W)
// rcp(W) comes from vrcpps, which is an approximate reciprocal rather than an
// exact division. %ymm8-%ymm12 are scratch.
// The second lods loads the next stage's address, which is then jumped to.
8791 CODE const uint8_t sk_matrix_perspective_hsw[] = {
8792   72,173,                                 //lods          %ds:(%rsi),%rax
8793   196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
8794   196,98,125,24,72,4,                     //vbroadcastss  0x4(%rax),%ymm9
8795   196,98,125,24,80,8,                     //vbroadcastss  0x8(%rax),%ymm10
8796   196,66,117,184,209,                     //vfmadd231ps   %ymm9,%ymm1,%ymm10
8797   196,66,125,184,208,                     //vfmadd231ps   %ymm8,%ymm0,%ymm10
8798   196,98,125,24,64,12,                    //vbroadcastss  0xc(%rax),%ymm8
8799   196,98,125,24,72,16,                    //vbroadcastss  0x10(%rax),%ymm9
8800   196,98,125,24,88,20,                    //vbroadcastss  0x14(%rax),%ymm11
8801   196,66,117,184,217,                     //vfmadd231ps   %ymm9,%ymm1,%ymm11
8802   196,66,125,184,216,                     //vfmadd231ps   %ymm8,%ymm0,%ymm11
8803   196,98,125,24,64,24,                    //vbroadcastss  0x18(%rax),%ymm8
8804   196,98,125,24,72,28,                    //vbroadcastss  0x1c(%rax),%ymm9
8805   196,98,125,24,96,32,                    //vbroadcastss  0x20(%rax),%ymm12
8806   196,66,117,184,225,                     //vfmadd231ps   %ymm9,%ymm1,%ymm12
8807   196,66,125,184,224,                     //vfmadd231ps   %ymm8,%ymm0,%ymm12
8808   196,193,124,83,204,                     //vrcpps        %ymm12,%ymm1
8809   197,172,89,193,                         //vmulps        %ymm1,%ymm10,%ymm0
8810   197,164,89,201,                         //vmulps        %ymm1,%ymm11,%ymm1
8811   72,173,                                 //lods          %ds:(%rsi),%rax
8812   255,224,                                //jmpq          *%rax
8813 };
8814 
// sk_linear_gradient_2stops_hsw: evaluates four linear functions of %ymm0.
// With c = the eight consecutive floats at the context pointer (first lods)
// and t = the incoming %ymm0:
//   new %ymm0 = c[0] + t*c[4]
//   new %ymm1 = c[1] + t*c[5]
//   new %ymm2 = c[2] + t*c[6]
//   new %ymm3 = c[3] + t*c[7]
// The first result is built in %ymm8 and moved into %ymm0 last, because t is
// still needed as a multiplicand for the other three. %ymm8/%ymm9 are scratch;
// the incoming %ymm1-%ymm3 are overwritten without being read.
// The second lods loads the next stage's address, which is then jumped to.
8815 CODE const uint8_t sk_linear_gradient_2stops_hsw[] = {
8816   72,173,                                 //lods          %ds:(%rsi),%rax
8817   196,226,125,24,72,16,                   //vbroadcastss  0x10(%rax),%ymm1
8818   196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
8819   196,98,125,184,193,                     //vfmadd231ps   %ymm1,%ymm0,%ymm8
8820   196,226,125,24,80,20,                   //vbroadcastss  0x14(%rax),%ymm2
8821   196,226,125,24,72,4,                    //vbroadcastss  0x4(%rax),%ymm1
8822   196,226,125,184,202,                    //vfmadd231ps   %ymm2,%ymm0,%ymm1
8823   196,226,125,24,88,24,                   //vbroadcastss  0x18(%rax),%ymm3
8824   196,226,125,24,80,8,                    //vbroadcastss  0x8(%rax),%ymm2
8825   196,226,125,184,211,                    //vfmadd231ps   %ymm3,%ymm0,%ymm2
8826   196,98,125,24,72,28,                    //vbroadcastss  0x1c(%rax),%ymm9
8827   196,226,125,24,88,12,                   //vbroadcastss  0xc(%rax),%ymm3
8828   196,194,125,184,217,                    //vfmadd231ps   %ymm9,%ymm0,%ymm3
8829   72,173,                                 //lods          %ds:(%rsi),%rax
8830   197,124,41,192,                         //vmovaps       %ymm8,%ymm0
8831   255,224,                                //jmpq          *%rax
8832 };
8833 
// sk_start_pipeline_avx: driver loop that repeatedly calls the first stage of a
// pipeline program, 8 units of x at a time, then once more for any remainder.
//
// Inputs (as read by the code):
//   %rcx  starting x                       -> kept in %rbx
//   %rdx  pointer to the program; its first qword is the first stage's address
//         (-> %r15) and the pointer just past it is kept in %r12
//   %r8   opaque pointer handed to every stage call in %rdx  -> kept in %r14
//   %r9   limit (end) value for x          -> kept in %r13
// Output:
//   %rax  = the limit value (%r13)
//
// Each stage call receives: %rdi = current x, %rsi = %r12, %rdx = %r14,
// %rcx = 0 for a full step or (limit - x) for the final partial step, and
// %ymm0-%ymm7 zeroed.
//
// The prologue/epilogue save and restore %rbx, %rsi, %rdi, %r12-%r15 and
// %xmm6-%xmm15, and vzeroupper is issued before returning.
// NOTE(review): argument registers and the saved-register set match the
// Microsoft x64 calling convention -- presumably this is the Windows build of
// the stage; confirm against the generator.
8834 CODE const uint8_t sk_start_pipeline_avx[] = {
  // --- prologue: save callee-saved GPRs and %xmm6-%xmm15 ---
8835   65,87,                                  //push          %r15
8836   65,86,                                  //push          %r14
8837   65,85,                                  //push          %r13
8838   65,84,                                  //push          %r12
8839   86,                                     //push          %rsi
8840   87,                                     //push          %rdi
8841   83,                                     //push          %rbx
8842   72,129,236,160,0,0,0,                   //sub           $0xa0,%rsp
8843   197,120,41,188,36,144,0,0,0,            //vmovaps       %xmm15,0x90(%rsp)
8844   197,120,41,180,36,128,0,0,0,            //vmovaps       %xmm14,0x80(%rsp)
8845   197,120,41,108,36,112,                  //vmovaps       %xmm13,0x70(%rsp)
8846   197,120,41,100,36,96,                   //vmovaps       %xmm12,0x60(%rsp)
8847   197,120,41,92,36,80,                    //vmovaps       %xmm11,0x50(%rsp)
8848   197,120,41,84,36,64,                    //vmovaps       %xmm10,0x40(%rsp)
8849   197,120,41,76,36,48,                    //vmovaps       %xmm9,0x30(%rsp)
8850   197,120,41,68,36,32,                    //vmovaps       %xmm8,0x20(%rsp)
8851   197,248,41,124,36,16,                   //vmovaps       %xmm7,0x10(%rsp)
8852   197,248,41,52,36,                       //vmovaps       %xmm6,(%rsp)
  // --- stash arguments; pull the first stage address out of the program ---
8853   77,137,205,                             //mov           %r9,%r13
8854   77,137,198,                             //mov           %r8,%r14
8855   72,137,203,                             //mov           %rcx,%rbx
8856   72,137,214,                             //mov           %rdx,%rsi
8857   72,173,                                 //lods          %ds:(%rsi),%rax
8858   73,137,199,                             //mov           %rax,%r15
8859   73,137,244,                             //mov           %rsi,%r12
  // --- if x + 8 > limit (unsigned), skip the full-step loop entirely ---
8860   72,141,67,8,                            //lea           0x8(%rbx),%rax
8861   76,57,232,                              //cmp           %r13,%rax
8862   118,5,                                  //jbe           75 <_sk_start_pipeline_avx+0x75>
8863   72,137,223,                             //mov           %rbx,%rdi
8864   235,65,                                 //jmp           b6 <_sk_start_pipeline_avx+0xb6>
  // --- full-step loop (offset 0x75): tail count 0, zeroed %ymm0-%ymm7 ---
8865   185,0,0,0,0,                            //mov           $0x0,%ecx
8866   197,252,87,192,                         //vxorps        %ymm0,%ymm0,%ymm0
8867   197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
8868   197,236,87,210,                         //vxorps        %ymm2,%ymm2,%ymm2
8869   197,228,87,219,                         //vxorps        %ymm3,%ymm3,%ymm3
8870   197,220,87,228,                         //vxorps        %ymm4,%ymm4,%ymm4
8871   197,212,87,237,                         //vxorps        %ymm5,%ymm5,%ymm5
8872   197,204,87,246,                         //vxorps        %ymm6,%ymm6,%ymm6
8873   197,196,87,255,                         //vxorps        %ymm7,%ymm7,%ymm7
8874   72,137,223,                             //mov           %rbx,%rdi
8875   76,137,230,                             //mov           %r12,%rsi
8876   76,137,242,                             //mov           %r14,%rdx
8877   65,255,215,                             //callq         *%r15
  // advance x by 8; loop again while the *next* step's x + 8 <= limit
8878   72,141,123,8,                           //lea           0x8(%rbx),%rdi
8879   72,131,195,16,                          //add           $0x10,%rbx
8880   76,57,235,                              //cmp           %r13,%rbx
8881   72,137,251,                             //mov           %rdi,%rbx
8882   118,191,                                //jbe           75 <_sk_start_pipeline_avx+0x75>
  // --- remainder (offset 0xb6): %rcx = limit - x; skip the call when it is 0 ---
8883   76,137,233,                             //mov           %r13,%rcx
8884   72,41,249,                              //sub           %rdi,%rcx
8885   116,41,                                 //je            e7 <_sk_start_pipeline_avx+0xe7>
8886   197,252,87,192,                         //vxorps        %ymm0,%ymm0,%ymm0
8887   197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
8888   197,236,87,210,                         //vxorps        %ymm2,%ymm2,%ymm2
8889   197,228,87,219,                         //vxorps        %ymm3,%ymm3,%ymm3
8890   197,220,87,228,                         //vxorps        %ymm4,%ymm4,%ymm4
8891   197,212,87,237,                         //vxorps        %ymm5,%ymm5,%ymm5
8892   197,204,87,246,                         //vxorps        %ymm6,%ymm6,%ymm6
8893   197,196,87,255,                         //vxorps        %ymm7,%ymm7,%ymm7
8894   76,137,230,                             //mov           %r12,%rsi
8895   76,137,242,                             //mov           %r14,%rdx
8896   65,255,215,                             //callq         *%r15
  // --- epilogue (offset 0xe7): return the limit, restore saved registers ---
8897   76,137,232,                             //mov           %r13,%rax
8898   197,248,40,52,36,                       //vmovaps       (%rsp),%xmm6
8899   197,248,40,124,36,16,                   //vmovaps       0x10(%rsp),%xmm7
8900   197,120,40,68,36,32,                    //vmovaps       0x20(%rsp),%xmm8
8901   197,120,40,76,36,48,                    //vmovaps       0x30(%rsp),%xmm9
8902   197,120,40,84,36,64,                    //vmovaps       0x40(%rsp),%xmm10
8903   197,120,40,92,36,80,                    //vmovaps       0x50(%rsp),%xmm11
8904   197,120,40,100,36,96,                   //vmovaps       0x60(%rsp),%xmm12
8905   197,120,40,108,36,112,                  //vmovaps       0x70(%rsp),%xmm13
8906   197,120,40,180,36,128,0,0,0,            //vmovaps       0x80(%rsp),%xmm14
8907   197,120,40,188,36,144,0,0,0,            //vmovaps       0x90(%rsp),%xmm15
8908   72,129,196,160,0,0,0,                   //add           $0xa0,%rsp
8909   91,                                     //pop           %rbx
8910   95,                                     //pop           %rdi
8911   94,                                     //pop           %rsi
8912   65,92,                                  //pop           %r12
8913   65,93,                                  //pop           %r13
8914   65,94,                                  //pop           %r14
8915   65,95,                                  //pop           %r15
8916   197,248,119,                            //vzeroupper
8917   195,                                    //retq
8918 };
8919 
// sk_just_return_avx: a stage consisting of a single retq. Unlike the other
// stages it does not load and jump to a next stage; it returns to whoever
// issued the call.
// NOTE(review): presumably placed last in a program so control returns to
// sk_start_pipeline_avx's callq -- confirm against the pipeline builder.
8920 CODE const uint8_t sk_just_return_avx[] = {
8921   195,                                    //retq
8922 };
8923 
// sk_seed_shader_avx: initialises the working registers from the x index in
// %edi, the 8 floats at (%rdx), and an integer at the context pointer.
//   %ymm0 = float(%edi) + 0.5f + the 8 floats at (%rdx)   (per lane)
//   %ymm1 = float(int32 at context) + 0.5f                (same in every lane)
//   %ymm2 = 1.0f in every lane                            (0x3f800000)
//   %ymm3..%ymm7 = 0
// Broadcasts from a GPR are done AVX1-style: vmovd, then an in-lane splat
// (vpshufd/vpermilps $0), then vinsertf128 to copy the low half to the high half.
// The first lods loads the context pointer; the second loads the next stage's
// address, which is then jumped to.
// NOTE(review): presumably (%rdx) holds per-lane offsets 0..7 and the context
// int is the y coordinate, giving pixel-centre (x, y) -- confirm.
8924 CODE const uint8_t sk_seed_shader_avx[] = {
8925   72,173,                                 //lods          %ds:(%rsi),%rax
8926   197,249,110,199,                        //vmovd         %edi,%xmm0
8927   197,249,112,192,0,                      //vpshufd       $0x0,%xmm0,%xmm0
8928   196,227,125,24,192,1,                   //vinsertf128   $0x1,%xmm0,%ymm0,%ymm0
8929   197,252,91,192,                         //vcvtdq2ps     %ymm0,%ymm0
8930   65,184,0,0,0,63,                        //mov           $0x3f000000,%r8d
8931   196,193,121,110,200,                    //vmovd         %r8d,%xmm1
8932   196,227,121,4,201,0,                    //vpermilps     $0x0,%xmm1,%xmm1
8933   196,227,117,24,201,1,                   //vinsertf128   $0x1,%xmm1,%ymm1,%ymm1
8934   197,252,88,193,                         //vaddps        %ymm1,%ymm0,%ymm0
8935   197,252,88,2,                           //vaddps        (%rdx),%ymm0,%ymm0
8936   196,226,125,24,16,                      //vbroadcastss  (%rax),%ymm2
8937   197,252,91,210,                         //vcvtdq2ps     %ymm2,%ymm2
8938   197,236,88,201,                         //vaddps        %ymm1,%ymm2,%ymm1
8939   184,0,0,128,63,                         //mov           $0x3f800000,%eax
8940   197,249,110,208,                        //vmovd         %eax,%xmm2
8941   196,227,121,4,210,0,                    //vpermilps     $0x0,%xmm2,%xmm2
8942   196,227,109,24,210,1,                   //vinsertf128   $0x1,%xmm2,%ymm2,%ymm2
8943   72,173,                                 //lods          %ds:(%rsi),%rax
8944   197,228,87,219,                         //vxorps        %ymm3,%ymm3,%ymm3
8945   197,220,87,228,                         //vxorps        %ymm4,%ymm4,%ymm4
8946   197,212,87,237,                         //vxorps        %ymm5,%ymm5,%ymm5
8947   197,204,87,246,                         //vxorps        %ymm6,%ymm6,%ymm6
8948   197,196,87,255,                         //vxorps        %ymm7,%ymm7,%ymm7
8949   255,224,                                //jmpq          *%rax
8950 };
8951 
// sk_constant_color_avx: fills %ymm0-%ymm3 with four constants.
// The first lods loads the context pointer; the four consecutive floats at
// offsets 0, 4, 8 and 12 are each broadcast to all lanes of %ymm0, %ymm1,
// %ymm2 and %ymm3 respectively. The second lods loads the next stage's
// address, which is then jumped to.
// NOTE(review): presumably the four floats are r,g,b,a -- confirm.
8952 CODE const uint8_t sk_constant_color_avx[] = {
8953   72,173,                                 //lods          %ds:(%rsi),%rax
8954   196,226,125,24,0,                       //vbroadcastss  (%rax),%ymm0
8955   196,226,125,24,72,4,                    //vbroadcastss  0x4(%rax),%ymm1
8956   196,226,125,24,80,8,                    //vbroadcastss  0x8(%rax),%ymm2
8957   196,226,125,24,88,12,                   //vbroadcastss  0xc(%rax),%ymm3
8958   72,173,                                 //lods          %ds:(%rsi),%rax
8959   255,224,                                //jmpq          *%rax
8960 };
8961 
// sk_clear_avx: zeroes %ymm0-%ymm3 (xor of each register with itself).
// The stage reads no context: its single lods loads the next stage's address,
// which is jumped to after the registers are cleared.
8962 CODE const uint8_t sk_clear_avx[] = {
8963   72,173,                                 //lods          %ds:(%rsi),%rax
8964   197,252,87,192,                         //vxorps        %ymm0,%ymm0,%ymm0
8965   197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
8966   197,236,87,210,                         //vxorps        %ymm2,%ymm2,%ymm2
8967   197,228,87,219,                         //vxorps        %ymm3,%ymm3,%ymm3
8968   255,224,                                //jmpq          *%rax
8969 };
8970 
// sk_plus__avx: lane-wise addition of two register groups.
//   %ymm0 += %ymm4;  %ymm1 += %ymm5;  %ymm2 += %ymm6;  %ymm3 += %ymm7
// The stage reads no context: its single lods loads the next stage's address,
// which is then jumped to.
// NOTE(review): presumably %ymm0-3 are source r,g,b,a and %ymm4-7 destination
// r,g,b,a -- confirm.
8971 CODE const uint8_t sk_plus__avx[] = {
8972   197,252,88,196,                         //vaddps        %ymm4,%ymm0,%ymm0
8973   197,244,88,205,                         //vaddps        %ymm5,%ymm1,%ymm1
8974   197,236,88,214,                         //vaddps        %ymm6,%ymm2,%ymm2
8975   197,228,88,223,                         //vaddps        %ymm7,%ymm3,%ymm3
8976   72,173,                                 //lods          %ds:(%rsi),%rax
8977   255,224,                                //jmpq          *%rax
8978 };
8979 
// sk_srcover_avx: blends %ymm4-%ymm7 under %ymm0-%ymm3.
// With k = 1.0f - %ymm3 (computed from the incoming %ymm3):
//   %ymm0 = %ymm0 + %ymm4 * k
//   %ymm1 = %ymm1 + %ymm5 * k
//   %ymm2 = %ymm2 + %ymm6 * k
//   %ymm3 = %ymm3 + %ymm7 * k
// 1.0f (0x3f800000) is broadcast AVX1-style via vmovd + vpermilps + vinsertf128.
// %ymm8/%ymm9 are scratch. The stage reads no context: its single lods loads
// the next stage's address, which is then jumped to.
// NOTE(review): presumably %ymm0-3 are source r,g,b,a and %ymm4-7 destination
// r,g,b,a, i.e. the usual src-over formula -- confirm.
8980 CODE const uint8_t sk_srcover_avx[] = {
8981   184,0,0,128,63,                         //mov           $0x3f800000,%eax
8982   197,121,110,192,                        //vmovd         %eax,%xmm8
8983   196,67,121,4,192,0,                     //vpermilps     $0x0,%xmm8,%xmm8
8984   196,67,61,24,192,1,                     //vinsertf128   $0x1,%xmm8,%ymm8,%ymm8
8985   197,60,92,195,                          //vsubps        %ymm3,%ymm8,%ymm8
8986   197,60,89,204,                          //vmulps        %ymm4,%ymm8,%ymm9
8987   197,180,88,192,                         //vaddps        %ymm0,%ymm9,%ymm0
8988   197,60,89,205,                          //vmulps        %ymm5,%ymm8,%ymm9
8989   197,180,88,201,                         //vaddps        %ymm1,%ymm9,%ymm1
8990   197,60,89,206,                          //vmulps        %ymm6,%ymm8,%ymm9
8991   197,180,88,210,                         //vaddps        %ymm2,%ymm9,%ymm2
8992   197,60,89,199,                          //vmulps        %ymm7,%ymm8,%ymm8
8993   197,188,88,219,                         //vaddps        %ymm3,%ymm8,%ymm3
8994   72,173,                                 //lods          %ds:(%rsi),%rax
8995   255,224,                                //jmpq          *%rax
8996 };
8997 
// sk_dstover_avx: blends %ymm0-%ymm3 under %ymm4-%ymm7; results go to %ymm0-%ymm3.
// With k = 1.0f - %ymm7:
//   %ymm0 = %ymm0 * k + %ymm4
//   %ymm1 = %ymm1 * k + %ymm5
//   %ymm2 = %ymm2 * k + %ymm6
//   %ymm3 = %ymm3 * k + %ymm7
// 1.0f (0x3f800000) is broadcast AVX1-style via vmovd + vpermilps + vinsertf128.
// %ymm8 is scratch; %ymm4-%ymm7 are left unchanged. The stage reads no
// context: its single lods loads the next stage's address, which is then
// jumped to.
// NOTE(review): presumably %ymm0-3 are source r,g,b,a and %ymm4-7 destination
// r,g,b,a, i.e. the usual dst-over formula -- confirm.
8998 CODE const uint8_t sk_dstover_avx[] = {
8999   184,0,0,128,63,                         //mov           $0x3f800000,%eax
9000   197,121,110,192,                        //vmovd         %eax,%xmm8
9001   196,67,121,4,192,0,                     //vpermilps     $0x0,%xmm8,%xmm8
9002   196,67,61,24,192,1,                     //vinsertf128   $0x1,%xmm8,%ymm8,%ymm8
9003   197,60,92,199,                          //vsubps        %ymm7,%ymm8,%ymm8
9004   197,188,89,192,                         //vmulps        %ymm0,%ymm8,%ymm0
9005   197,252,88,196,                         //vaddps        %ymm4,%ymm0,%ymm0
9006   197,188,89,201,                         //vmulps        %ymm1,%ymm8,%ymm1
9007   197,244,88,205,                         //vaddps        %ymm5,%ymm1,%ymm1
9008   197,188,89,210,                         //vmulps        %ymm2,%ymm8,%ymm2
9009   197,236,88,214,                         //vaddps        %ymm6,%ymm2,%ymm2
9010   197,188,89,219,                         //vmulps        %ymm3,%ymm8,%ymm3
9011   197,228,88,223,                         //vaddps        %ymm7,%ymm3,%ymm3
9012   72,173,                                 //lods          %ds:(%rsi),%rax
9013   255,224,                                //jmpq          *%rax
9014 };
9015 
// sk_clamp_0_avx: clamps %ymm0-%ymm3 from below at zero.
//   %ymmN = max(%ymmN, 0) for N = 0..3, using a zeroed %ymm8 as the bound.
// The stage reads no context: its single lods loads the next stage's address,
// which is then jumped to.
9016 CODE const uint8_t sk_clamp_0_avx[] = {
9017   196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
9018   196,193,124,95,192,                     //vmaxps        %ymm8,%ymm0,%ymm0
9019   196,193,116,95,200,                     //vmaxps        %ymm8,%ymm1,%ymm1
9020   196,193,108,95,208,                     //vmaxps        %ymm8,%ymm2,%ymm2
9021   196,193,100,95,216,                     //vmaxps        %ymm8,%ymm3,%ymm3
9022   72,173,                                 //lods          %ds:(%rsi),%rax
9023   255,224,                                //jmpq          *%rax
9024 };
9025 
// sk_clamp_1_avx: clamps %ymm0-%ymm3 from above at 1.0f.
//   %ymmN = min(%ymmN, 1.0f) for N = 0..3.
// 1.0f (0x3f800000) is broadcast into %ymm8 AVX1-style via vmovd + vpermilps +
// vinsertf128. The stage reads no context: its single lods loads the next
// stage's address, which is then jumped to.
9026 CODE const uint8_t sk_clamp_1_avx[] = {
9027   184,0,0,128,63,                         //mov           $0x3f800000,%eax
9028   197,121,110,192,                        //vmovd         %eax,%xmm8
9029   196,67,121,4,192,0,                     //vpermilps     $0x0,%xmm8,%xmm8
9030   196,67,61,24,192,1,                     //vinsertf128   $0x1,%xmm8,%ymm8,%ymm8
9031   196,193,124,93,192,                     //vminps        %ymm8,%ymm0,%ymm0
9032   196,193,116,93,200,                     //vminps        %ymm8,%ymm1,%ymm1
9033   196,193,108,93,208,                     //vminps        %ymm8,%ymm2,%ymm2
9034   196,193,100,93,216,                     //vminps        %ymm8,%ymm3,%ymm3
9035   72,173,                                 //lods          %ds:(%rsi),%rax
9036   255,224,                                //jmpq          *%rax
9037 };
9038 
// sk_clamp_a_avx: clamps %ymm3 to at most 1.0f, then clamps %ymm0-%ymm2 to at
// most the (already clamped) %ymm3.
//   %ymm3 = min(%ymm3, 1.0f)
//   %ymmN = min(%ymmN, %ymm3) for N = 0..2
// 1.0f (0x3f800000) is broadcast into %ymm8 AVX1-style. The stage reads no
// context: its single lods loads the next stage's address, which is then
// jumped to.
// NOTE(review): presumably %ymm3 is alpha and this enforces premultiplied
// colour <= alpha -- confirm.
9039 CODE const uint8_t sk_clamp_a_avx[] = {
9040   184,0,0,128,63,                         //mov           $0x3f800000,%eax
9041   197,121,110,192,                        //vmovd         %eax,%xmm8
9042   196,67,121,4,192,0,                     //vpermilps     $0x0,%xmm8,%xmm8
9043   196,67,61,24,192,1,                     //vinsertf128   $0x1,%xmm8,%ymm8,%ymm8
9044   196,193,100,93,216,                     //vminps        %ymm8,%ymm3,%ymm3
9045   197,252,93,195,                         //vminps        %ymm3,%ymm0,%ymm0
9046   197,244,93,203,                         //vminps        %ymm3,%ymm1,%ymm1
9047   197,236,93,211,                         //vminps        %ymm3,%ymm2,%ymm2
9048   72,173,                                 //lods          %ds:(%rsi),%rax
9049   255,224,                                //jmpq          *%rax
9050 };
9051 
// sk_set_rgb_avx: fills %ymm0-%ymm2 with three constants and leaves %ymm3 untouched.
// The first lods loads the context pointer; the floats at offsets 0, 4 and 8
// are each broadcast to all lanes of %ymm0, %ymm1 and %ymm2 respectively.
// The second lods loads the next stage's address, which is then jumped to.
// NOTE(review): presumably the three floats are r,g,b -- confirm.
9052 CODE const uint8_t sk_set_rgb_avx[] = {
9053   72,173,                                 //lods          %ds:(%rsi),%rax
9054   196,226,125,24,0,                       //vbroadcastss  (%rax),%ymm0
9055   196,226,125,24,72,4,                    //vbroadcastss  0x4(%rax),%ymm1
9056   196,226,125,24,80,8,                    //vbroadcastss  0x8(%rax),%ymm2
9057   72,173,                                 //lods          %ds:(%rsi),%rax
9058   255,224,                                //jmpq          *%rax
9059 };
9060 
// sk_swap_rb_avx: AVX machine code that exchanges the contents of ymm0 and
// ymm2, using ymm8 as the temporary.  The load of the next list entry is
// interleaved with the register moves.
// NOTE(review): the name suggests ymm0 is red and ymm2 is blue -- confirm.
9061 CODE const uint8_t sk_swap_rb_avx[] = {
  // ymm8 = old ymm0.
9062   197,124,40,192,                         //vmovaps       %ymm0,%ymm8
  // rax = next 8-byte entry from the list at %rsi (jumped to below).
9063   72,173,                                 //lods          %ds:(%rsi),%rax
  // ymm0 = old ymm2; ymm2 = old ymm0.
9064   197,252,40,194,                         //vmovaps       %ymm2,%ymm0
9065   197,124,41,194,                         //vmovaps       %ymm8,%ymm2
9066   255,224,                                //jmpq          *%rax
9067 };
9068 
// sk_swap_avx: AVX machine code that exchanges ymm0..ymm3 with ymm4..ymm7
// (ymm0<->ymm4, ymm1<->ymm5, ymm2<->ymm6, ymm3<->ymm7), using ymm8..ymm11 as
// temporaries.
9069 CODE const uint8_t sk_swap_avx[] = {
  // Save the old ymm3, ymm2, ymm1, ymm0 in ymm8, ymm9, ymm10, ymm11.
9070   197,124,40,195,                         //vmovaps       %ymm3,%ymm8
9071   197,124,40,202,                         //vmovaps       %ymm2,%ymm9
9072   197,124,40,209,                         //vmovaps       %ymm1,%ymm10
9073   197,124,40,216,                         //vmovaps       %ymm0,%ymm11
  // rax = next 8-byte entry from the list at %rsi (jumped to at the end).
9074   72,173,                                 //lods          %ds:(%rsi),%rax
  // ymm0..ymm3 = ymm4..ymm7.
9075   197,252,40,196,                         //vmovaps       %ymm4,%ymm0
9076   197,252,40,205,                         //vmovaps       %ymm5,%ymm1
9077   197,252,40,214,                         //vmovaps       %ymm6,%ymm2
9078   197,252,40,223,                         //vmovaps       %ymm7,%ymm3
  // ymm4..ymm7 = saved old ymm0..ymm3.
9079   197,124,41,220,                         //vmovaps       %ymm11,%ymm4
9080   197,124,41,213,                         //vmovaps       %ymm10,%ymm5
9081   197,124,41,206,                         //vmovaps       %ymm9,%ymm6
9082   197,124,41,199,                         //vmovaps       %ymm8,%ymm7
9083   255,224,                                //jmpq          *%rax
9084 };
9085 
// sk_move_src_dst_avx: AVX machine code that copies ymm0..ymm3 into
// ymm4..ymm7 (ymm0..ymm3 are left unchanged), then jumps to the next list
// entry read from %rsi.
// NOTE(review): per the name, ymm0..ymm3 are presumably the "src" registers
// and ymm4..ymm7 the "dst" registers -- confirm.
9086 CODE const uint8_t sk_move_src_dst_avx[] = {
9087   72,173,                                 //lods          %ds:(%rsi),%rax
9088   197,252,40,224,                         //vmovaps       %ymm0,%ymm4
9089   197,252,40,233,                         //vmovaps       %ymm1,%ymm5
9090   197,252,40,242,                         //vmovaps       %ymm2,%ymm6
9091   197,252,40,251,                         //vmovaps       %ymm3,%ymm7
9092   255,224,                                //jmpq          *%rax
9093 };
9094 
// sk_move_dst_src_avx: AVX machine code that copies ymm4..ymm7 into
// ymm0..ymm3 (ymm4..ymm7 are left unchanged), then jumps to the next list
// entry read from %rsi.
// NOTE(review): per the name, ymm4..ymm7 are presumably the "dst" registers
// and ymm0..ymm3 the "src" registers -- confirm.
9095 CODE const uint8_t sk_move_dst_src_avx[] = {
9096   72,173,                                 //lods          %ds:(%rsi),%rax
9097   197,252,40,196,                         //vmovaps       %ymm4,%ymm0
9098   197,252,40,205,                         //vmovaps       %ymm5,%ymm1
9099   197,252,40,214,                         //vmovaps       %ymm6,%ymm2
9100   197,252,40,223,                         //vmovaps       %ymm7,%ymm3
9101   255,224,                                //jmpq          *%rax
9102 };
9103 
// sk_premul_avx: AVX machine code that multiplies ymm0, ymm1 and ymm2
// lane-wise by ymm3.  ymm3 itself is not modified.
// NOTE(review): by the name, ymm3 is presumably alpha and ymm0..ymm2 the
// colour channels -- confirm.
9104 CODE const uint8_t sk_premul_avx[] = {
9105   197,252,89,195,                         //vmulps        %ymm3,%ymm0,%ymm0
9106   197,244,89,203,                         //vmulps        %ymm3,%ymm1,%ymm1
9107   197,236,89,211,                         //vmulps        %ymm3,%ymm2,%ymm2
  // Read the next 8-byte entry from the list at %rsi and jump to it.
9108   72,173,                                 //lods          %ds:(%rsi),%rax
9109   255,224,                                //jmpq          *%rax
9110 };
9111 
// sk_unpremul_avx: AVX machine code that multiplies ymm0, ymm1 and ymm2 by a
// per-lane scale derived from ymm3:
//     scale = (ymm3 == 0) ? 0 : 1.0f / ymm3
// ymm3 itself is not modified.
// NOTE(review): by the name, ymm3 is presumably alpha -- confirm.
9112 CODE const uint8_t sk_unpremul_avx[] = {
  // ymm8 = 0; ymm9 = per-lane mask of (ymm3 == 0).
9113   196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
9114   196,65,100,194,200,0,                   //vcmpeqps      %ymm8,%ymm3,%ymm9
  // ymm10 = 0x3f800000 (1.0f) in all lanes, then ymm10 = 1.0f / ymm3.
9115   184,0,0,128,63,                         //mov           $0x3f800000,%eax
9116   197,121,110,208,                        //vmovd         %eax,%xmm10
9117   196,67,121,4,210,0,                     //vpermilps     $0x0,%xmm10,%xmm10
9118   196,67,45,24,210,1,                     //vinsertf128   $0x1,%xmm10,%ymm10,%ymm10
9119   197,44,94,211,                          //vdivps        %ymm3,%ymm10,%ymm10
  // ymm8 = mask ? 0 (old ymm8) : 1/ymm3 -- lanes where ymm3 == 0 get scale 0
  // instead of the result of dividing by zero.
9120   196,67,45,74,192,144,                   //vblendvps     %ymm9,%ymm8,%ymm10,%ymm8
9121   197,188,89,192,                         //vmulps        %ymm0,%ymm8,%ymm0
9122   197,188,89,201,                         //vmulps        %ymm1,%ymm8,%ymm1
9123   197,188,89,210,                         //vmulps        %ymm2,%ymm8,%ymm2
  // Read the next 8-byte entry from the list at %rsi and jump to it.
9124   72,173,                                 //lods          %ds:(%rsi),%rax
9125   255,224,                                //jmpq          *%rax
9126 };
9127 
// sk_from_srgb_avx: AVX machine code that applies the same piecewise function
// independently to each lane of ymm0, ymm1 and ymm2 (ymm3 is not touched):
//     lo = x * k0                         k0 = 0x3d9e8391 (~0.0774, ~1/12.92)
//     hi = x*x * (k1*x + k2) + k3         k1 = 0x3e99999a (~0.3)
//                                         k2 = 0x3f328f5c (~0.6975)
//                                         k3 = 0x3b23d70a (~0.0025)
//     x  = (x < k4) ? lo : hi             k4 = 0x3d6147ae (~0.055)
// Each constant is materialised by mov/vmovd and replicated to all eight lanes
// with vpermilps $0 + vinsertf128 $1.
// NOTE(review): by the name this is an approximation of the sRGB-to-linear
// transfer function -- confirm accuracy expectations against the stage source.
9128 CODE const uint8_t sk_from_srgb_avx[] = {
  // ymm8 = k0 (all lanes).
9129   184,145,131,158,61,                     //mov           $0x3d9e8391,%eax
9130   197,121,110,192,                        //vmovd         %eax,%xmm8
9131   196,67,121,4,192,0,                     //vpermilps     $0x0,%xmm8,%xmm8
9132   196,67,61,24,192,1,                     //vinsertf128   $0x1,%xmm8,%ymm8,%ymm8
  // First channel (ymm0): ymm9 = lo, ymm10 = x*x.
9133   197,60,89,200,                          //vmulps        %ymm0,%ymm8,%ymm9
9134   197,124,89,208,                         //vmulps        %ymm0,%ymm0,%ymm10
  // ymm11 = k1 (all lanes).
9135   184,154,153,153,62,                     //mov           $0x3e99999a,%eax
9136   197,121,110,216,                        //vmovd         %eax,%xmm11
9137   196,67,121,4,219,0,                     //vpermilps     $0x0,%xmm11,%xmm11
9138   196,67,37,24,219,1,                     //vinsertf128   $0x1,%xmm11,%ymm11,%ymm11
  // ymm12 = k2 (all lanes).
9139   184,92,143,50,63,                       //mov           $0x3f328f5c,%eax
9140   197,121,110,224,                        //vmovd         %eax,%xmm12
9141   196,67,121,4,228,0,                     //vpermilps     $0x0,%xmm12,%xmm12
9142   196,67,29,24,228,1,                     //vinsertf128   $0x1,%xmm12,%ymm12,%ymm12
  // ymm13 = k1*x + k2.
9143   197,36,89,232,                          //vmulps        %ymm0,%ymm11,%ymm13
9144   196,65,20,88,236,                       //vaddps        %ymm12,%ymm13,%ymm13
  // ymm14 = k3 (all lanes).
9145   184,10,215,35,59,                       //mov           $0x3b23d70a,%eax
9146   197,121,110,240,                        //vmovd         %eax,%xmm14
9147   196,67,121,4,246,0,                     //vpermilps     $0x0,%xmm14,%xmm14
9148   196,67,13,24,246,1,                     //vinsertf128   $0x1,%xmm14,%ymm14,%ymm14
  // ymm10 = hi = x*x*(k1*x + k2) + k3.
9149   196,65,44,89,213,                       //vmulps        %ymm13,%ymm10,%ymm10
9150   196,65,12,88,210,                       //vaddps        %ymm10,%ymm14,%ymm10
  // ymm13 = k4 (all lanes); it is reused as the threshold for every channel.
9151   184,174,71,97,61,                       //mov           $0x3d6147ae,%eax
9152   197,121,110,232,                        //vmovd         %eax,%xmm13
9153   196,67,121,4,237,0,                     //vpermilps     $0x0,%xmm13,%xmm13
9154   196,67,21,24,237,1,                     //vinsertf128   $0x1,%xmm13,%ymm13,%ymm13
  // ymm0 = (x < k4) ? lo : hi.
9155   196,193,124,194,197,1,                  //vcmpltps      %ymm13,%ymm0,%ymm0
9156   196,195,45,74,193,0,                    //vblendvps     %ymm0,%ymm9,%ymm10,%ymm0
  // Second channel (ymm1): same computation, with ymm15 holding k1*x + k2.
9157   197,60,89,201,                          //vmulps        %ymm1,%ymm8,%ymm9
9158   197,116,89,209,                         //vmulps        %ymm1,%ymm1,%ymm10
9159   197,36,89,249,                          //vmulps        %ymm1,%ymm11,%ymm15
9160   196,65,28,88,255,                       //vaddps        %ymm15,%ymm12,%ymm15
9161   196,65,44,89,215,                       //vmulps        %ymm15,%ymm10,%ymm10
9162   196,65,12,88,210,                       //vaddps        %ymm10,%ymm14,%ymm10
9163   196,193,116,194,205,1,                  //vcmpltps      %ymm13,%ymm1,%ymm1
9164   196,195,45,74,201,16,                   //vblendvps     %ymm1,%ymm9,%ymm10,%ymm1
  // Third channel (ymm2): same computation.  ymm8 (k0) is overwritten with
  // lo here because it is no longer needed afterwards.
9165   197,60,89,194,                          //vmulps        %ymm2,%ymm8,%ymm8
9166   197,108,89,202,                         //vmulps        %ymm2,%ymm2,%ymm9
9167   197,36,89,210,                          //vmulps        %ymm2,%ymm11,%ymm10
9168   196,65,28,88,210,                       //vaddps        %ymm10,%ymm12,%ymm10
9169   196,65,52,89,202,                       //vmulps        %ymm10,%ymm9,%ymm9
9170   196,65,12,88,201,                       //vaddps        %ymm9,%ymm14,%ymm9
9171   196,193,108,194,213,1,                  //vcmpltps      %ymm13,%ymm2,%ymm2
9172   196,195,53,74,208,32,                   //vblendvps     %ymm2,%ymm8,%ymm9,%ymm2
  // Read the next 8-byte entry from the list at %rsi and jump to it.
9173   72,173,                                 //lods          %ds:(%rsi),%rax
9174   255,224,                                //jmpq          *%rax
9175 };
9176 
// sk_to_srgb_avx: AVX machine code that applies the same piecewise function
// independently to each lane of ymm0, ymm1 and ymm2 (ymm3 is not touched):
//     rs = rsqrt(x)                       (hardware approximation)
//     s  = rcp(rs)                        (~ x^(1/2))
//     q  = rsqrt(rs)                      (~ x^(1/4))
//     lo = k0 * x                         k0 = 0x41475c29 (~12.46)
//     hi = min(1.0f, k2*q + (k3*s + k4))  k2 = 0x3ed287c2 (~0.411)
//                                         k3 = 0x3f306fce (~0.689)
//                                         k4 = 0x3dca57a8 ^ 0x80000000 (~-0.0988)
//     x  = (x < k5) ? lo : hi             k5 = 0x3b8ce704 (~0.0043)
// vrsqrtps/vrcpps are approximate instructions, so results are approximate.
// NOTE(review): by the name this is an approximation of the linear-to-sRGB
// transfer function -- confirm accuracy expectations against the stage source.
9177 CODE const uint8_t sk_to_srgb_avx[] = {
  // First channel (ymm0): ymm8 = rs, ymm13 = s, ymm14 = q.
9178   197,124,82,192,                         //vrsqrtps      %ymm0,%ymm8
9179   196,65,124,83,232,                      //vrcpps        %ymm8,%ymm13
9180   196,65,124,82,240,                      //vrsqrtps      %ymm8,%ymm14
  // ymm8 = k0 (all lanes); ymm12 = lo = k0 * x.
9181   184,41,92,71,65,                        //mov           $0x41475c29,%eax
9182   197,121,110,192,                        //vmovd         %eax,%xmm8
9183   196,67,121,4,192,0,                     //vpermilps     $0x0,%xmm8,%xmm8
9184   196,67,61,24,192,1,                     //vinsertf128   $0x1,%xmm8,%ymm8,%ymm8
9185   197,60,89,224,                          //vmulps        %ymm0,%ymm8,%ymm12
  // ymm9 = 1.0f (all lanes), the upper clamp.
9186   184,0,0,128,63,                         //mov           $0x3f800000,%eax
9187   197,121,110,200,                        //vmovd         %eax,%xmm9
9188   196,67,121,4,201,0,                     //vpermilps     $0x0,%xmm9,%xmm9
9189   196,67,53,24,201,1,                     //vinsertf128   $0x1,%xmm9,%ymm9,%ymm9
  // ymm10 = k2 (all lanes).
9190   184,194,135,210,62,                     //mov           $0x3ed287c2,%eax
9191   197,121,110,208,                        //vmovd         %eax,%xmm10
9192   196,67,121,4,210,0,                     //vpermilps     $0x0,%xmm10,%xmm10
9193   196,67,45,24,210,1,                     //vinsertf128   $0x1,%xmm10,%ymm10,%ymm10
  // ymm11 = k3 (all lanes).
9194   184,206,111,48,63,                      //mov           $0x3f306fce,%eax
9195   197,121,110,216,                        //vmovd         %eax,%xmm11
9196   196,67,121,4,219,0,                     //vpermilps     $0x0,%xmm11,%xmm11
9197   196,67,37,24,219,1,                     //vinsertf128   $0x1,%xmm11,%ymm11,%ymm11
  // ymm15 = k4 (all lanes); the xor flips the float sign bit, negating the
  // constant before it is broadcast.
9198   184,168,87,202,61,                      //mov           $0x3dca57a8,%eax
9199   53,0,0,0,128,                           //xor           $0x80000000,%eax
9200   197,121,110,248,                        //vmovd         %eax,%xmm15
9201   196,67,121,4,255,0,                     //vpermilps     $0x0,%xmm15,%xmm15
9202   196,67,5,24,255,1,                      //vinsertf128   $0x1,%xmm15,%ymm15,%ymm15
  // ymm13 = hi = min(1.0f, k2*q + (k3*s + k4)).
9203   196,65,20,89,235,                       //vmulps        %ymm11,%ymm13,%ymm13
9204   196,65,20,88,239,                       //vaddps        %ymm15,%ymm13,%ymm13
9205   196,65,12,89,242,                       //vmulps        %ymm10,%ymm14,%ymm14
9206   196,65,12,88,237,                       //vaddps        %ymm13,%ymm14,%ymm13
9207   196,65,52,93,237,                       //vminps        %ymm13,%ymm9,%ymm13
  // ymm14 = k5 (all lanes); it is reused as the threshold for every channel.
9208   184,4,231,140,59,                       //mov           $0x3b8ce704,%eax
9209   197,121,110,240,                        //vmovd         %eax,%xmm14
9210   196,67,121,4,246,0,                     //vpermilps     $0x0,%xmm14,%xmm14
9211   196,67,13,24,246,1,                     //vinsertf128   $0x1,%xmm14,%ymm14,%ymm14
  // ymm0 = (x < k5) ? lo : hi.
9212   196,193,124,194,198,1,                  //vcmpltps      %ymm14,%ymm0,%ymm0
9213   196,195,21,74,196,0,                    //vblendvps     %ymm0,%ymm12,%ymm13,%ymm0
  // Second channel (ymm1): ymm12 = rs then q, ymm13 = s then lo.
9214   197,124,82,225,                         //vrsqrtps      %ymm1,%ymm12
9215   196,65,124,83,236,                      //vrcpps        %ymm12,%ymm13
9216   196,65,124,82,228,                      //vrsqrtps      %ymm12,%ymm12
9217   196,65,36,89,237,                       //vmulps        %ymm13,%ymm11,%ymm13
9218   196,65,4,88,237,                        //vaddps        %ymm13,%ymm15,%ymm13
9219   196,65,44,89,228,                       //vmulps        %ymm12,%ymm10,%ymm12
9220   196,65,28,88,229,                       //vaddps        %ymm13,%ymm12,%ymm12
9221   197,60,89,233,                          //vmulps        %ymm1,%ymm8,%ymm13
9222   196,65,52,93,228,                       //vminps        %ymm12,%ymm9,%ymm12
9223   196,193,116,194,206,1,                  //vcmpltps      %ymm14,%ymm1,%ymm1
9224   196,195,29,74,205,16,                   //vblendvps     %ymm1,%ymm13,%ymm12,%ymm1
  // Third channel (ymm2): the constant registers ymm11, ymm10, ymm9 and ymm8
  // are overwritten with intermediate results since this is their last use.
9225   197,124,82,226,                         //vrsqrtps      %ymm2,%ymm12
9226   196,65,124,83,236,                      //vrcpps        %ymm12,%ymm13
9227   196,65,36,89,221,                       //vmulps        %ymm13,%ymm11,%ymm11
9228   196,65,4,88,219,                        //vaddps        %ymm11,%ymm15,%ymm11
9229   196,65,124,82,228,                      //vrsqrtps      %ymm12,%ymm12
9230   196,65,44,89,212,                       //vmulps        %ymm12,%ymm10,%ymm10
9231   196,65,44,88,211,                       //vaddps        %ymm11,%ymm10,%ymm10
9232   196,65,52,93,202,                       //vminps        %ymm10,%ymm9,%ymm9
9233   197,60,89,194,                          //vmulps        %ymm2,%ymm8,%ymm8
9234   196,193,108,194,214,1,                  //vcmpltps      %ymm14,%ymm2,%ymm2
9235   196,195,53,74,208,32,                   //vblendvps     %ymm2,%ymm8,%ymm9,%ymm2
  // Read the next 8-byte entry from the list at %rsi and jump to it.
9236   72,173,                                 //lods          %ds:(%rsi),%rax
9237   255,224,                                //jmpq          *%rax
9238 };
9239 
// sk_scale_1_float_avx: AVX machine code that reads a pointer from the list at
// %rsi, broadcasts the single float it points to into ymm8, and multiplies
// ymm0, ymm1, ymm2 and ymm3 lane-wise by that value.
9240 CODE const uint8_t sk_scale_1_float_avx[] = {
  // rax = next list entry, used here as the address of the scale factor.
9241   72,173,                                 //lods          %ds:(%rsi),%rax
9242   196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
9243   197,188,89,192,                         //vmulps        %ymm0,%ymm8,%ymm0
9244   197,188,89,201,                         //vmulps        %ymm1,%ymm8,%ymm1
9245   197,188,89,210,                         //vmulps        %ymm2,%ymm8,%ymm2
9246   197,188,89,219,                         //vmulps        %ymm3,%ymm8,%ymm3
  // Read the following list entry and jump to it.
9247   72,173,                                 //lods          %ds:(%rsi),%rax
9248   255,224,                                //jmpq          *%rax
9249 };
9250 
// sk_scale_u8_avx: AVX machine code that loads up to eight bytes, converts
// each to a float scaled by 0x3b808081 (~1/255), and multiplies ymm0..ymm3
// lane-wise by the result.
//
// Inputs as used by the code:
//   %rsi  list of 8-byte entries; the first one read is dereferenced once more
//         to obtain the base address of the byte data.
//   %rdi  added to that base address as a byte offset.
//   %rcx  0  -> load eight bytes with a single vmovq;
//         !0 -> load exactly %rcx bytes one at a time (code at +0x60).
//         %rcx is preserved in %r8 and restored before leaving.
// NOTE(review): %rdi looks like the current pixel index and %rcx the number
// of pixels in a partial ("tail") batch -- confirm against the caller.
9251 CODE const uint8_t sk_scale_u8_avx[] = {
9252   73,137,200,                             //mov           %rcx,%r8
9253   72,173,                                 //lods          %ds:(%rsi),%rax
9254   72,139,0,                               //mov           (%rax),%rax
9255   72,1,248,                               //add           %rdi,%rax
9256   77,133,192,                             //test          %r8,%r8
9257   117,80,                                 //jne           639 <_sk_scale_u8_avx+0x60>
  // Full load: xmm8 = eight bytes from (%rax).
9258   197,122,126,0,                          //vmovq         (%rax),%xmm8
  // +0x14: the partial-load path rejoins here with the bytes already in xmm8.
  // Zero-extend bytes 0..3 into xmm9; vpermilps $0xe5 moves dword 1 (bytes
  // 4..7) into dword 0 so that they can be zero-extended as well.
9259   196,66,121,49,200,                      //vpmovzxbd     %xmm8,%xmm9
9260   196,67,121,4,192,229,                   //vpermilps     $0xe5,%xmm8,%xmm8
9261   196,66,121,49,192,                      //vpmovzxbd     %xmm8,%xmm8
9262   196,67,53,24,192,1,                     //vinsertf128   $0x1,%xmm8,%ymm9,%ymm8
9263   196,65,124,91,192,                      //vcvtdq2ps     %ymm8,%ymm8
  // ymm9 = 0x3b808081 (~1/255) in all lanes; ymm8 = byte * (1/255).
9264   184,129,128,128,59,                     //mov           $0x3b808081,%eax
9265   197,121,110,200,                        //vmovd         %eax,%xmm9
9266   196,67,121,4,201,0,                     //vpermilps     $0x0,%xmm9,%xmm9
9267   196,67,53,24,201,1,                     //vinsertf128   $0x1,%xmm9,%ymm9,%ymm9
9268   196,65,60,89,193,                       //vmulps        %ymm9,%ymm8,%ymm8
9269   197,188,89,192,                         //vmulps        %ymm0,%ymm8,%ymm0
9270   197,188,89,201,                         //vmulps        %ymm1,%ymm8,%ymm1
9271   197,188,89,210,                         //vmulps        %ymm2,%ymm8,%ymm2
9272   197,188,89,219,                         //vmulps        %ymm3,%ymm8,%ymm3
  // Read the next list entry, restore %rcx, and jump to that entry.
9273   72,173,                                 //lods          %ds:(%rsi),%rax
9274   76,137,193,                             //mov           %r8,%rcx
9275   255,224,                                //jmpq          *%rax
  // +0x60: partial load.  %rcx becomes the running shift count, %r10 the
  // number of bytes left, %r9 the accumulated value.
9276   49,201,                                 //xor           %ecx,%ecx
9277   77,137,194,                             //mov           %r8,%r10
9278   69,49,201,                              //xor           %r9d,%r9d
  // +0x68: loop -- r9 |= (next byte) << shift; shift += 8; until r10 == 0.
9279   68,15,182,24,                           //movzbl        (%rax),%r11d
9280   72,255,192,                             //inc           %rax
9281   73,211,227,                             //shl           %cl,%r11
9282   77,9,217,                               //or            %r11,%r9
9283   72,131,193,8,                           //add           $0x8,%rcx
9284   73,255,202,                             //dec           %r10
9285   117,234,                                //jne           641 <_sk_scale_u8_avx+0x68>
  // Move the assembled bytes into xmm8 and rejoin the common path at +0x14.
9286   196,65,249,110,193,                     //vmovq         %r9,%xmm8
9287   235,143,                                //jmp           5ed <_sk_scale_u8_avx+0x14>
9288 };
9289 
// sk_lerp_1_float_avx: AVX machine code that reads a pointer from the list at
// %rsi, broadcasts the single float t it points to into ymm8, and for each
// pair (ymm0,ymm4), (ymm1,ymm5), (ymm2,ymm6), (ymm3,ymm7) computes
//     ymmN = (ymmN - ymm(N+4)) * t + ymm(N+4)
// i.e. a linear interpolation from ymm(N+4) (t = 0) to ymmN (t = 1).
// ymm4..ymm7 are not modified.
9290 CODE const uint8_t sk_lerp_1_float_avx[] = {
  // rax = next list entry, used here as the address of t.
9291   72,173,                                 //lods          %ds:(%rsi),%rax
9292   196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
9293   197,252,92,196,                         //vsubps        %ymm4,%ymm0,%ymm0
9294   196,193,124,89,192,                     //vmulps        %ymm8,%ymm0,%ymm0
9295   197,252,88,196,                         //vaddps        %ymm4,%ymm0,%ymm0
9296   197,244,92,205,                         //vsubps        %ymm5,%ymm1,%ymm1
9297   196,193,116,89,200,                     //vmulps        %ymm8,%ymm1,%ymm1
9298   197,244,88,205,                         //vaddps        %ymm5,%ymm1,%ymm1
9299   197,236,92,214,                         //vsubps        %ymm6,%ymm2,%ymm2
9300   196,193,108,89,208,                     //vmulps        %ymm8,%ymm2,%ymm2
9301   197,236,88,214,                         //vaddps        %ymm6,%ymm2,%ymm2
9302   197,228,92,223,                         //vsubps        %ymm7,%ymm3,%ymm3
9303   196,193,100,89,216,                     //vmulps        %ymm8,%ymm3,%ymm3
9304   197,228,88,223,                         //vaddps        %ymm7,%ymm3,%ymm3
  // Read the following list entry and jump to it.
9305   72,173,                                 //lods          %ds:(%rsi),%rax
9306   255,224,                                //jmpq          *%rax
9307 };
9308 
// sk_lerp_u8_avx: AVX machine code that loads up to eight bytes, converts each
// to a float t scaled by 0x3b808081 (~1/255), and for each pair (ymm0,ymm4),
// (ymm1,ymm5), (ymm2,ymm6), (ymm3,ymm7) computes
//     ymmN = (ymmN - ymm(N+4)) * t + ymm(N+4)
//
// Inputs as used by the code:
//   %rsi  list of 8-byte entries; the first one read is dereferenced once more
//         to obtain the base address of the byte data.
//   %rdi  added to that base address as a byte offset.
//   %rcx  0  -> load eight bytes with a single vmovq;
//         !0 -> load exactly %rcx bytes one at a time (code at +0x84).
//         %rcx is preserved in %r8 and restored before leaving.
// NOTE(review): %rdi looks like the current pixel index and %rcx the number
// of pixels in a partial ("tail") batch -- confirm against the caller.
9309 CODE const uint8_t sk_lerp_u8_avx[] = {
9310   73,137,200,                             //mov           %rcx,%r8
9311   72,173,                                 //lods          %ds:(%rsi),%rax
9312   72,139,0,                               //mov           (%rax),%rax
9313   72,1,248,                               //add           %rdi,%rax
9314   77,133,192,                             //test          %r8,%r8
9315   117,116,                                //jne           721 <_sk_lerp_u8_avx+0x84>
  // Full load: xmm8 = eight bytes from (%rax).
9316   197,122,126,0,                          //vmovq         (%rax),%xmm8
  // +0x14: the partial-load path rejoins here with the bytes already in xmm8.
  // Zero-extend bytes 0..3 into xmm9; vpermilps $0xe5 moves dword 1 (bytes
  // 4..7) into dword 0 so that they can be zero-extended as well.
9317   196,66,121,49,200,                      //vpmovzxbd     %xmm8,%xmm9
9318   196,67,121,4,192,229,                   //vpermilps     $0xe5,%xmm8,%xmm8
9319   196,66,121,49,192,                      //vpmovzxbd     %xmm8,%xmm8
9320   196,67,53,24,192,1,                     //vinsertf128   $0x1,%xmm8,%ymm9,%ymm8
9321   196,65,124,91,192,                      //vcvtdq2ps     %ymm8,%ymm8
  // ymm9 = 0x3b808081 (~1/255) in all lanes; ymm8 = t = byte * (1/255).
9322   184,129,128,128,59,                     //mov           $0x3b808081,%eax
9323   197,121,110,200,                        //vmovd         %eax,%xmm9
9324   196,67,121,4,201,0,                     //vpermilps     $0x0,%xmm9,%xmm9
9325   196,67,53,24,201,1,                     //vinsertf128   $0x1,%xmm9,%ymm9,%ymm9
9326   196,65,60,89,193,                       //vmulps        %ymm9,%ymm8,%ymm8
  // ymmN = (ymmN - ymm(N+4)) * t + ymm(N+4) for N = 0..3.
9327   197,252,92,196,                         //vsubps        %ymm4,%ymm0,%ymm0
9328   196,193,124,89,192,                     //vmulps        %ymm8,%ymm0,%ymm0
9329   197,252,88,196,                         //vaddps        %ymm4,%ymm0,%ymm0
9330   197,244,92,205,                         //vsubps        %ymm5,%ymm1,%ymm1
9331   196,193,116,89,200,                     //vmulps        %ymm8,%ymm1,%ymm1
9332   197,244,88,205,                         //vaddps        %ymm5,%ymm1,%ymm1
9333   197,236,92,214,                         //vsubps        %ymm6,%ymm2,%ymm2
9334   196,193,108,89,208,                     //vmulps        %ymm8,%ymm2,%ymm2
9335   197,236,88,214,                         //vaddps        %ymm6,%ymm2,%ymm2
9336   197,228,92,223,                         //vsubps        %ymm7,%ymm3,%ymm3
9337   196,193,100,89,216,                     //vmulps        %ymm8,%ymm3,%ymm3
9338   197,228,88,223,                         //vaddps        %ymm7,%ymm3,%ymm3
  // Read the next list entry, restore %rcx, and jump to that entry.
9339   72,173,                                 //lods          %ds:(%rsi),%rax
9340   76,137,193,                             //mov           %r8,%rcx
9341   255,224,                                //jmpq          *%rax
  // +0x84: partial load.  %rcx becomes the running shift count, %r10 the
  // number of bytes left, %r9 the accumulated value.
9342   49,201,                                 //xor           %ecx,%ecx
9343   77,137,194,                             //mov           %r8,%r10
9344   69,49,201,                              //xor           %r9d,%r9d
  // +0x8c: loop -- r9 |= (next byte) << shift; shift += 8; until r10 == 0.
9345   68,15,182,24,                           //movzbl        (%rax),%r11d
9346   72,255,192,                             //inc           %rax
9347   73,211,227,                             //shl           %cl,%r11
9348   77,9,217,                               //or            %r11,%r9
9349   72,131,193,8,                           //add           $0x8,%rcx
9350   73,255,202,                             //dec           %r10
9351   117,234,                                //jne           729 <_sk_lerp_u8_avx+0x8c>
  // Move the assembled bytes into xmm8 and rejoin the common path at +0x14.
9352   196,65,249,110,193,                     //vmovq         %r9,%xmm8
9353   233,104,255,255,255,                    //jmpq          6b1 <_sk_lerp_u8_avx+0x14>
9354 };
9355 
// sk_lerp_565_avx: AVX machine code that loads up to eight 16-bit values,
// splits each into three bit fields, converts every field to a float factor,
// and interpolates ymm0..ymm2 towards ymm4..ymm6 with a separate factor per
// register:
//     t0 = float(v & 0xf800) * 0x37842108 (~1/63488)   -> used for ymm0/ymm4
//     t1 = float(v & 0x07e0) * 0x3a020821 (~1/2016)    -> used for ymm1/ymm5
//     t2 = float(v & 0x001f) * 0x3d042108 (~1/31)      -> used for ymm2/ymm6
//     ymmN = (ymmN - ymm(N+4)) * tN + ymm(N+4)
// ymm3 is used as scratch throughout and is finally set to 1.0f in all lanes,
// so its incoming value is discarded.
//
// Inputs as used by the code:
//   %rsi  list of 8-byte entries; the first one read is dereferenced once more
//         to obtain the base address (%r10) of the 16-bit data.
//   %rdi  element index into that data (scaled by 2 in the addressing).
//   %rcx  0  -> load eight values with one vmovdqu;
//         !0 -> load (%rcx & 7) values individually via the jump table at
//               +0x177; if (%rcx & 7) == 0 nothing is loaded and xmm8 stays 0.
// NOTE(review): the masks match an RGB565 layout (5-6-5 bits), and %rcx looks
// like the number of pixels in a partial ("tail") batch -- confirm.
9356 CODE const uint8_t sk_lerp_565_avx[] = {
9357   72,173,                                 //lods          %ds:(%rsi),%rax
9358   76,139,16,                              //mov           (%rax),%r10
9359   72,133,201,                             //test          %rcx,%rcx
9360   15,133,250,0,0,0,                       //jne           851 <_sk_lerp_565_avx+0x108>
  // Full load: xmm8 = eight 16-bit values.
9361   196,65,122,111,4,122,                   //vmovdqu       (%r10,%rdi,2),%xmm8
  // +0x14: the partial-load path rejoins here with the values already in xmm8.
  // Widen the eight 16-bit values to 32 bits: high four via unpack-with-zero,
  // low four via vpmovzxwd, then combine into ymm8.
9362   197,225,239,219,                        //vpxor         %xmm3,%xmm3,%xmm3
9363   197,185,105,219,                        //vpunpckhwd    %xmm3,%xmm8,%xmm3
9364   196,66,121,51,192,                      //vpmovzxwd     %xmm8,%xmm8
9365   196,99,61,24,195,1,                     //vinsertf128   $0x1,%xmm3,%ymm8,%ymm8
  // ymm9 = t0 = float(v & 0xf800) * 0x37842108.
9366   184,0,248,0,0,                          //mov           $0xf800,%eax
9367   197,249,110,216,                        //vmovd         %eax,%xmm3
9368   197,249,112,219,0,                      //vpshufd       $0x0,%xmm3,%xmm3
9369   196,227,101,24,219,1,                   //vinsertf128   $0x1,%xmm3,%ymm3,%ymm3
9370   196,193,100,84,216,                     //vandps        %ymm8,%ymm3,%ymm3
9371   197,124,91,203,                         //vcvtdq2ps     %ymm3,%ymm9
9372   184,8,33,132,55,                        //mov           $0x37842108,%eax
9373   197,249,110,216,                        //vmovd         %eax,%xmm3
9374   196,227,121,4,219,0,                    //vpermilps     $0x0,%xmm3,%xmm3
9375   196,227,101,24,219,1,                   //vinsertf128   $0x1,%xmm3,%ymm3,%ymm3
9376   197,52,89,203,                          //vmulps        %ymm3,%ymm9,%ymm9
  // ymm10 = t1 = float(v & 0x7e0) * 0x3a020821.
9377   184,224,7,0,0,                          //mov           $0x7e0,%eax
9378   197,249,110,216,                        //vmovd         %eax,%xmm3
9379   197,249,112,219,0,                      //vpshufd       $0x0,%xmm3,%xmm3
9380   196,227,101,24,219,1,                   //vinsertf128   $0x1,%xmm3,%ymm3,%ymm3
9381   196,193,100,84,216,                     //vandps        %ymm8,%ymm3,%ymm3
9382   197,124,91,211,                         //vcvtdq2ps     %ymm3,%ymm10
9383   184,33,8,2,58,                          //mov           $0x3a020821,%eax
9384   197,249,110,216,                        //vmovd         %eax,%xmm3
9385   196,227,121,4,219,0,                    //vpermilps     $0x0,%xmm3,%xmm3
9386   196,227,101,24,219,1,                   //vinsertf128   $0x1,%xmm3,%ymm3,%ymm3
9387   197,44,89,211,                          //vmulps        %ymm3,%ymm10,%ymm10
  // ymm3 = t2 = float(v & 0x1f) * 0x3d042108.
9388   184,31,0,0,0,                           //mov           $0x1f,%eax
9389   197,249,110,216,                        //vmovd         %eax,%xmm3
9390   197,249,112,219,0,                      //vpshufd       $0x0,%xmm3,%xmm3
9391   196,227,101,24,219,1,                   //vinsertf128   $0x1,%xmm3,%ymm3,%ymm3
9392   196,193,100,84,216,                     //vandps        %ymm8,%ymm3,%ymm3
9393   197,124,91,195,                         //vcvtdq2ps     %ymm3,%ymm8
9394   184,8,33,4,61,                          //mov           $0x3d042108,%eax
9395   197,249,110,216,                        //vmovd         %eax,%xmm3
9396   196,227,121,4,219,0,                    //vpermilps     $0x0,%xmm3,%xmm3
9397   196,227,101,24,219,1,                   //vinsertf128   $0x1,%xmm3,%ymm3,%ymm3
9398   197,188,89,219,                         //vmulps        %ymm3,%ymm8,%ymm3
  // ymmN = (ymmN - ymm(N+4)) * tN + ymm(N+4) for N = 0, 1, 2.
9399   197,252,92,196,                         //vsubps        %ymm4,%ymm0,%ymm0
9400   196,193,124,89,193,                     //vmulps        %ymm9,%ymm0,%ymm0
9401   197,252,88,196,                         //vaddps        %ymm4,%ymm0,%ymm0
9402   197,244,92,205,                         //vsubps        %ymm5,%ymm1,%ymm1
9403   196,193,116,89,202,                     //vmulps        %ymm10,%ymm1,%ymm1
9404   197,244,88,205,                         //vaddps        %ymm5,%ymm1,%ymm1
9405   197,236,92,214,                         //vsubps        %ymm6,%ymm2,%ymm2
9406   197,236,89,211,                         //vmulps        %ymm3,%ymm2,%ymm2
9407   197,236,88,214,                         //vaddps        %ymm6,%ymm2,%ymm2
  // ymm3 = 1.0f in all lanes.
9408   184,0,0,128,63,                         //mov           $0x3f800000,%eax
9409   197,249,110,216,                        //vmovd         %eax,%xmm3
9410   196,227,121,4,219,0,                    //vpermilps     $0x0,%xmm3,%xmm3
9411   196,227,101,24,219,1,                   //vinsertf128   $0x1,%xmm3,%ymm3,%ymm3
  // Read the next list entry and jump to it.
9412   72,173,                                 //lods          %ds:(%rsi),%rax
9413   255,224,                                //jmpq          *%rax
  // +0x108: partial load.  r8 = (%rcx & 7) - 1 is the jump-table index; xmm8
  // is zeroed first so that lanes which are not loaded read as 0.  An index
  // above 6 (i.e. %rcx & 7 == 0) skips the loads entirely.
9414   65,137,200,                             //mov           %ecx,%r8d
9415   65,128,224,7,                           //and           $0x7,%r8b
9416   196,65,57,239,192,                      //vpxor         %xmm8,%xmm8,%xmm8
9417   65,254,200,                             //dec           %r8b
9418   65,128,248,6,                           //cmp           $0x6,%r8b
9419   15,135,243,254,255,255,                 //ja            75d <_sk_lerp_565_avx+0x14>
  // rax = table base + sign-extended 32-bit table entry; jump into the ladder.
9420   69,15,182,192,                          //movzbl        %r8b,%r8d
9421   76,141,13,75,0,0,0,                     //lea           0x4b(%rip),%r9        # 8c0 <_sk_lerp_565_avx+0x177>
9422   75,99,4,129,                            //movslq        (%r9,%r8,4),%rax
9423   76,1,200,                               //add           %r9,%rax
9424   255,224,                                //jmpq          *%rax
  // Fall-through ladder: entering at the vpinsrw for element k loads elements
  // k, k-1, ..., 0.  The entry for seven elements starts at the vpxor, which
  // provides the zeroed source register for the first insert.
9425   197,225,239,219,                        //vpxor         %xmm3,%xmm3,%xmm3
9426   196,65,97,196,68,122,12,6,              //vpinsrw       $0x6,0xc(%r10,%rdi,2),%xmm3,%xmm8
9427   196,65,57,196,68,122,10,5,              //vpinsrw       $0x5,0xa(%r10,%rdi,2),%xmm8,%xmm8
9428   196,65,57,196,68,122,8,4,               //vpinsrw       $0x4,0x8(%r10,%rdi,2),%xmm8,%xmm8
9429   196,65,57,196,68,122,6,3,               //vpinsrw       $0x3,0x6(%r10,%rdi,2),%xmm8,%xmm8
9430   196,65,57,196,68,122,4,2,               //vpinsrw       $0x2,0x4(%r10,%rdi,2),%xmm8,%xmm8
9431   196,65,57,196,68,122,2,1,               //vpinsrw       $0x1,0x2(%r10,%rdi,2),%xmm8,%xmm8
9432   196,65,57,196,4,122,0,                  //vpinsrw       $0x0,(%r10,%rdi,2),%xmm8,%xmm8
9433   233,159,254,255,255,                    //jmpq          75d <_sk_lerp_565_avx+0x14>
  // Two-byte no-op; the jump table follows at +0x177.
9434   102,144,                                //xchg          %ax,%ax
  // Jump table (data, not code): seven little-endian signed 32-bit offsets,
  // each relative to the table's own address.  The disassembler decoded these
  // bytes as instructions, which is why the annotations below read "(bad)".
  // The values are -14, -22, -30, -38, -46, -54, -66, selecting the ladder
  // entry for 1, 2, ..., 7 elements respectively.
9435   242,255,                                //repnz         (bad)
9436   255,                                    //(bad)
9437   255,                                    //(bad)
9438   234,                                    //(bad)
9439   255,                                    //(bad)
9440   255,                                    //(bad)
9441   255,226,                                //jmpq          *%rdx
9442   255,                                    //(bad)
9443   255,                                    //(bad)
9444   255,                                    //(bad)
9445   218,255,                                //(bad)
9446   255,                                    //(bad)
9447   255,210,                                //callq         *%rdx
9448   255,                                    //(bad)
9449   255,                                    //(bad)
9450   255,202,                                //dec           %edx
9451   255,                                    //(bad)
9452   255,                                    //(bad)
9453   255,                                    //(bad)
9454   190,                                    //.byte         0xbe
9455   255,                                    //(bad)
9456   255,                                    //(bad)
9457   255,                                    //.byte         0xff
9458 };
9459 
9460 CODE const uint8_t sk_load_tables_avx[] = {
9461   85,                                     //push          %rbp
9462   65,87,                                  //push          %r15
9463   65,86,                                  //push          %r14
9464   65,85,                                  //push          %r13
9465   65,84,                                  //push          %r12
9466   83,                                     //push          %rbx
9467   72,173,                                 //lods          %ds:(%rsi),%rax
9468   76,139,0,                               //mov           (%rax),%r8
9469   72,133,201,                             //test          %rcx,%rcx
9470   15,133,56,2,0,0,                        //jne           b2c <_sk_load_tables_avx+0x250>
9471   196,65,124,16,4,184,                    //vmovups       (%r8,%rdi,4),%ymm8
9472   187,255,0,0,0,                          //mov           $0xff,%ebx
9473   197,249,110,195,                        //vmovd         %ebx,%xmm0
9474   197,249,112,192,0,                      //vpshufd       $0x0,%xmm0,%xmm0
9475   196,99,125,24,200,1,                    //vinsertf128   $0x1,%xmm0,%ymm0,%ymm9
9476   196,193,52,84,192,                      //vandps        %ymm8,%ymm9,%ymm0
9477   196,193,249,126,193,                    //vmovq         %xmm0,%r9
9478   69,137,203,                             //mov           %r9d,%r11d
9479   196,195,249,22,194,1,                   //vpextrq       $0x1,%xmm0,%r10
9480   69,137,214,                             //mov           %r10d,%r14d
9481   73,193,234,32,                          //shr           $0x20,%r10
9482   73,193,233,32,                          //shr           $0x20,%r9
9483   196,227,125,25,192,1,                   //vextractf128  $0x1,%ymm0,%xmm0
9484   196,193,249,126,196,                    //vmovq         %xmm0,%r12
9485   69,137,231,                             //mov           %r12d,%r15d
9486   196,227,249,22,195,1,                   //vpextrq       $0x1,%xmm0,%rbx
9487   65,137,221,                             //mov           %ebx,%r13d
9488   72,193,235,32,                          //shr           $0x20,%rbx
9489   73,193,236,32,                          //shr           $0x20,%r12
9490   72,139,104,8,                           //mov           0x8(%rax),%rbp
9491   76,139,64,16,                           //mov           0x10(%rax),%r8
9492   196,161,122,16,68,189,0,                //vmovss        0x0(%rbp,%r15,4),%xmm0
9493   196,163,121,33,68,165,0,16,             //vinsertps     $0x10,0x0(%rbp,%r12,4),%xmm0,%xmm0
9494   196,161,122,16,76,173,0,                //vmovss        0x0(%rbp,%r13,4),%xmm1
9495   196,227,121,33,193,32,                  //vinsertps     $0x20,%xmm1,%xmm0,%xmm0
9496   197,250,16,76,157,0,                    //vmovss        0x0(%rbp,%rbx,4),%xmm1
9497   196,227,121,33,193,48,                  //vinsertps     $0x30,%xmm1,%xmm0,%xmm0
9498   196,161,122,16,76,157,0,                //vmovss        0x0(%rbp,%r11,4),%xmm1
9499   196,163,113,33,76,141,0,16,             //vinsertps     $0x10,0x0(%rbp,%r9,4),%xmm1,%xmm1
9500   196,161,122,16,92,181,0,                //vmovss        0x0(%rbp,%r14,4),%xmm3
9501   196,227,113,33,203,32,                  //vinsertps     $0x20,%xmm3,%xmm1,%xmm1
9502   196,161,122,16,92,149,0,                //vmovss        0x0(%rbp,%r10,4),%xmm3
9503   196,227,113,33,203,48,                  //vinsertps     $0x30,%xmm3,%xmm1,%xmm1
9504   196,227,117,24,192,1,                   //vinsertf128   $0x1,%xmm0,%ymm1,%ymm0
9505   196,193,113,114,208,8,                  //vpsrld        $0x8,%xmm8,%xmm1
9506   196,67,125,25,194,1,                    //vextractf128  $0x1,%ymm8,%xmm10
9507   196,193,105,114,210,8,                  //vpsrld        $0x8,%xmm10,%xmm2
9508   196,227,117,24,202,1,                   //vinsertf128   $0x1,%xmm2,%ymm1,%ymm1
9509   197,180,84,201,                         //vandps        %ymm1,%ymm9,%ymm1
9510   196,193,249,126,201,                    //vmovq         %xmm1,%r9
9511   69,137,203,                             //mov           %r9d,%r11d
9512   196,195,249,22,202,1,                   //vpextrq       $0x1,%xmm1,%r10
9513   69,137,214,                             //mov           %r10d,%r14d
9514   73,193,234,32,                          //shr           $0x20,%r10
9515   73,193,233,32,                          //shr           $0x20,%r9
9516   196,227,125,25,201,1,                   //vextractf128  $0x1,%ymm1,%xmm1
9517   196,225,249,126,205,                    //vmovq         %xmm1,%rbp
9518   65,137,239,                             //mov           %ebp,%r15d
9519   196,227,249,22,203,1,                   //vpextrq       $0x1,%xmm1,%rbx
9520   65,137,220,                             //mov           %ebx,%r12d
9521   72,193,235,32,                          //shr           $0x20,%rbx
9522   72,193,237,32,                          //shr           $0x20,%rbp
9523   196,129,122,16,12,184,                  //vmovss        (%r8,%r15,4),%xmm1
9524   196,195,113,33,12,168,16,               //vinsertps     $0x10,(%r8,%rbp,4),%xmm1,%xmm1
9525   196,129,122,16,20,160,                  //vmovss        (%r8,%r12,4),%xmm2
9526   196,227,113,33,202,32,                  //vinsertps     $0x20,%xmm2,%xmm1,%xmm1
9527   196,193,122,16,20,152,                  //vmovss        (%r8,%rbx,4),%xmm2
9528   196,227,113,33,202,48,                  //vinsertps     $0x30,%xmm2,%xmm1,%xmm1
9529   196,129,122,16,20,152,                  //vmovss        (%r8,%r11,4),%xmm2
9530   196,131,105,33,20,136,16,               //vinsertps     $0x10,(%r8,%r9,4),%xmm2,%xmm2
9531   196,129,122,16,28,176,                  //vmovss        (%r8,%r14,4),%xmm3
9532   196,227,105,33,211,32,                  //vinsertps     $0x20,%xmm3,%xmm2,%xmm2
9533   196,129,122,16,28,144,                  //vmovss        (%r8,%r10,4),%xmm3
9534   196,227,105,33,211,48,                  //vinsertps     $0x30,%xmm3,%xmm2,%xmm2
9535   196,227,109,24,201,1,                   //vinsertf128   $0x1,%xmm1,%ymm2,%ymm1
9536   72,139,64,24,                           //mov           0x18(%rax),%rax
9537   196,193,105,114,208,16,                 //vpsrld        $0x10,%xmm8,%xmm2
9538   196,193,97,114,210,16,                  //vpsrld        $0x10,%xmm10,%xmm3
9539   196,227,109,24,211,1,                   //vinsertf128   $0x1,%xmm3,%ymm2,%ymm2
9540   197,180,84,210,                         //vandps        %ymm2,%ymm9,%ymm2
9541   196,193,249,126,208,                    //vmovq         %xmm2,%r8
9542   69,137,194,                             //mov           %r8d,%r10d
9543   196,195,249,22,209,1,                   //vpextrq       $0x1,%xmm2,%r9
9544   69,137,203,                             //mov           %r9d,%r11d
9545   73,193,233,32,                          //shr           $0x20,%r9
9546   73,193,232,32,                          //shr           $0x20,%r8
9547   196,227,125,25,210,1,                   //vextractf128  $0x1,%ymm2,%xmm2
9548   196,225,249,126,213,                    //vmovq         %xmm2,%rbp
9549   65,137,238,                             //mov           %ebp,%r14d
9550   196,227,249,22,211,1,                   //vpextrq       $0x1,%xmm2,%rbx
9551   65,137,223,                             //mov           %ebx,%r15d
9552   72,193,235,32,                          //shr           $0x20,%rbx
9553   72,193,237,32,                          //shr           $0x20,%rbp
9554   196,161,122,16,20,176,                  //vmovss        (%rax,%r14,4),%xmm2
9555   196,227,105,33,20,168,16,               //vinsertps     $0x10,(%rax,%rbp,4),%xmm2,%xmm2
9556   196,161,122,16,28,184,                  //vmovss        (%rax,%r15,4),%xmm3
9557   196,227,105,33,211,32,                  //vinsertps     $0x20,%xmm3,%xmm2,%xmm2
9558   197,250,16,28,152,                      //vmovss        (%rax,%rbx,4),%xmm3
9559   196,99,105,33,203,48,                   //vinsertps     $0x30,%xmm3,%xmm2,%xmm9
9560   196,161,122,16,28,144,                  //vmovss        (%rax,%r10,4),%xmm3
9561   196,163,97,33,28,128,16,                //vinsertps     $0x10,(%rax,%r8,4),%xmm3,%xmm3
9562   196,161,122,16,20,152,                  //vmovss        (%rax,%r11,4),%xmm2
9563   196,227,97,33,210,32,                   //vinsertps     $0x20,%xmm2,%xmm3,%xmm2
9564   196,161,122,16,28,136,                  //vmovss        (%rax,%r9,4),%xmm3
9565   196,227,105,33,211,48,                  //vinsertps     $0x30,%xmm3,%xmm2,%xmm2
9566   196,195,109,24,209,1,                   //vinsertf128   $0x1,%xmm9,%ymm2,%ymm2
9567   196,193,57,114,208,24,                  //vpsrld        $0x18,%xmm8,%xmm8
9568   196,193,97,114,210,24,                  //vpsrld        $0x18,%xmm10,%xmm3
9569   196,227,61,24,219,1,                    //vinsertf128   $0x1,%xmm3,%ymm8,%ymm3
9570   197,124,91,195,                         //vcvtdq2ps     %ymm3,%ymm8
9571   184,129,128,128,59,                     //mov           $0x3b808081,%eax
9572   197,249,110,216,                        //vmovd         %eax,%xmm3
9573   196,227,121,4,219,0,                    //vpermilps     $0x0,%xmm3,%xmm3
9574   196,227,101,24,219,1,                   //vinsertf128   $0x1,%xmm3,%ymm3,%ymm3
9575   197,188,89,219,                         //vmulps        %ymm3,%ymm8,%ymm3
9576   72,173,                                 //lods          %ds:(%rsi),%rax
9577   91,                                     //pop           %rbx
9578   65,92,                                  //pop           %r12
9579   65,93,                                  //pop           %r13
9580   65,94,                                  //pop           %r14
9581   65,95,                                  //pop           %r15
9582   93,                                     //pop           %rbp
9583   255,224,                                //jmpq          *%rax
9584   137,203,                                //mov           %ecx,%ebx
9585   128,227,7,                              //and           $0x7,%bl
9586   196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
9587   254,203,                                //dec           %bl
9588   128,251,6,                              //cmp           $0x6,%bl
9589   15,135,185,253,255,255,                 //ja            8fa <_sk_load_tables_avx+0x1e>
9590   15,182,219,                             //movzbl        %bl,%ebx
9591   76,141,13,137,0,0,0,                    //lea           0x89(%rip),%r9        # bd4 <_sk_load_tables_avx+0x2f8>
9592   73,99,28,153,                           //movslq        (%r9,%rbx,4),%rbx
9593   76,1,203,                               //add           %r9,%rbx
9594   255,227,                                //jmpq          *%rbx
9595   196,193,121,110,68,184,24,              //vmovd         0x18(%r8,%rdi,4),%xmm0
9596   197,249,112,192,68,                     //vpshufd       $0x44,%xmm0,%xmm0
9597   196,227,125,24,192,1,                   //vinsertf128   $0x1,%xmm0,%ymm0,%ymm0
9598   197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
9599   196,99,117,12,192,64,                   //vblendps      $0x40,%ymm0,%ymm1,%ymm8
9600   196,99,125,25,192,1,                    //vextractf128  $0x1,%ymm8,%xmm0
9601   196,195,121,34,68,184,20,1,             //vpinsrd       $0x1,0x14(%r8,%rdi,4),%xmm0,%xmm0
9602   196,99,61,24,192,1,                     //vinsertf128   $0x1,%xmm0,%ymm8,%ymm8
9603   196,99,125,25,192,1,                    //vextractf128  $0x1,%ymm8,%xmm0
9604   196,195,121,34,68,184,16,0,             //vpinsrd       $0x0,0x10(%r8,%rdi,4),%xmm0,%xmm0
9605   196,99,61,24,192,1,                     //vinsertf128   $0x1,%xmm0,%ymm8,%ymm8
9606   196,195,57,34,68,184,12,3,              //vpinsrd       $0x3,0xc(%r8,%rdi,4),%xmm8,%xmm0
9607   196,99,61,12,192,15,                    //vblendps      $0xf,%ymm0,%ymm8,%ymm8
9608   196,195,57,34,68,184,8,2,               //vpinsrd       $0x2,0x8(%r8,%rdi,4),%xmm8,%xmm0
9609   196,99,61,12,192,15,                    //vblendps      $0xf,%ymm0,%ymm8,%ymm8
9610   196,195,57,34,68,184,4,1,               //vpinsrd       $0x1,0x4(%r8,%rdi,4),%xmm8,%xmm0
9611   196,99,61,12,192,15,                    //vblendps      $0xf,%ymm0,%ymm8,%ymm8
9612   196,195,57,34,4,184,0,                  //vpinsrd       $0x0,(%r8,%rdi,4),%xmm8,%xmm0
9613   196,99,61,12,192,15,                    //vblendps      $0xf,%ymm0,%ymm8,%ymm8
9614   233,38,253,255,255,                     //jmpq          8fa <_sk_load_tables_avx+0x1e>
9615   238,                                    //out           %al,(%dx)
9616   255,                                    //(bad)
9617   255,                                    //(bad)
9618   255,224,                                //jmpq          *%rax
9619   255,                                    //(bad)
9620   255,                                    //(bad)
9621   255,210,                                //callq         *%rdx
9622   255,                                    //(bad)
9623   255,                                    //(bad)
9624   255,196,                                //inc           %esp
9625   255,                                    //(bad)
9626   255,                                    //(bad)
9627   255,176,255,255,255,156,                //pushq         -0x63000001(%rax)
9628   255,                                    //(bad)
9629   255,                                    //(bad)
9630   255,                                    //.byte         0xff
9631   128,255,255,                            //cmp           $0xff,%bh
9632   255,                                    //.byte         0xff
9633 };
9634 
// sk_load_a8_avx: x86-64 AVX machine code kept as raw bytes. The trailing comment
// on every row is the disassembly of that row (AT&T syntax: source, destination).
//
// Reading the disassembly, the routine:
//   - builds a source address from a pointer reached through the slot at (%rsi),
//     plus %rdi (one byte per element);
//   - widens eight source bytes to 32-bit lanes, converts them to float and scales
//     them by 0x3b808081 (~1/255 as an IEEE-754 single), leaving the result in %ymm3;
//   - zeroes %ymm0, %ymm1 and %ymm2;
//   - loads one more pointer with lods and jumps to it.
// When %rcx is non-zero on entry, only %rcx bytes are read (byte loop at +0x5a).
// NOTE(review): %rdi looks like a pixel index, %rcx like a count of valid pixels in a
// partial batch, and the final jump target like the next pipeline stage -- confirm
// against the generator named in the file header (src/jumper/build_stages.py).
9635 CODE const uint8_t sk_load_a8_avx[] = {
       // Keep a copy of %rcx in %r8: the byte loop at +0x5a reuses %rcx as a shift count.
9636   73,137,200,                             //mov           %rcx,%r8
       // %rax = *(slot at (%rsi)); %rax = *%rax; %rax += %rdi  -> address of the source bytes.
       // lods also steps %rsi past the slot it read.
9637   72,173,                                 //lods          %ds:(%rsi),%rax
9638   72,139,0,                               //mov           (%rax),%rax
9639   72,1,248,                               //add           %rdi,%rax
       // Non-zero %r8 (the saved %rcx) selects the partial-load path at +0x5a.
9640   77,133,192,                             //test          %r8,%r8
9641   117,74,                                 //jne           c4a <_sk_load_a8_avx+0x5a>
       // Full path: load 8 bytes. The partial path re-enters just after this load (+0x14).
9642   197,250,126,0,                          //vmovq         (%rax),%xmm0
       // Zero-extend bytes 0-3 into %xmm1; vpermilps $0xe5 moves dword 1 (bytes 4-7) into
       // dword 0 so the second vpmovzxbd can zero-extend bytes 4-7 into %xmm0.
9643   196,226,121,49,200,                     //vpmovzxbd     %xmm0,%xmm1
9644   196,227,121,4,192,229,                  //vpermilps     $0xe5,%xmm0,%xmm0
9645   196,226,121,49,192,                     //vpmovzxbd     %xmm0,%xmm0
       // %ymm0 = { bytes 0-3, bytes 4-7 } as eight 32-bit ints, then converted to float.
9646   196,227,117,24,192,1,                   //vinsertf128   $0x1,%xmm0,%ymm1,%ymm0
9647   197,252,91,192,                         //vcvtdq2ps     %ymm0,%ymm0
       // Broadcast 0x3b808081 (~1/255) to all eight lanes of %ymm1; %ymm3 = %ymm0 * %ymm1.
9648   184,129,128,128,59,                     //mov           $0x3b808081,%eax
9649   197,249,110,200,                        //vmovd         %eax,%xmm1
9650   196,227,121,4,201,0,                    //vpermilps     $0x0,%xmm1,%xmm1
9651   196,227,117,24,201,1,                   //vinsertf128   $0x1,%xmm1,%ymm1,%ymm1
9652   197,252,89,217,                         //vmulps        %ymm1,%ymm0,%ymm3
       // Fetch the address to continue at, clear %ymm0-%ymm2, restore %rcx from %r8, jump.
9653   72,173,                                 //lods          %ds:(%rsi),%rax
9654   197,252,87,192,                         //vxorps        %ymm0,%ymm0,%ymm0
9655   197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
9656   197,236,87,210,                         //vxorps        %ymm2,%ymm2,%ymm2
9657   76,137,193,                             //mov           %r8,%rcx
9658   255,224,                                //jmpq          *%rax
       // Partial path (+0x5a): %rcx = shift count (starts at 0), %r10 = bytes left to read
       // (starts at %r8), %r9 = accumulator (starts at 0).
9659   49,201,                                 //xor           %ecx,%ecx
9660   77,137,194,                             //mov           %r8,%r10
9661   69,49,201,                              //xor           %r9d,%r9d
       // Loop (+0x62): %r9 |= (next source byte) << %cl; advance %rax by 1 and the shift by 8;
       // repeat until %r10 reaches 0. The first byte read ends up in the lowest byte of %r9.
9662   68,15,182,24,                           //movzbl        (%rax),%r11d
9663   72,255,192,                             //inc           %rax
9664   73,211,227,                             //shl           %cl,%r11
9665   77,9,217,                               //or            %r11,%r9
9666   72,131,193,8,                           //add           $0x8,%rcx
9667   73,255,202,                             //dec           %r10
9668   117,234,                                //jne           c52 <_sk_load_a8_avx+0x62>
       // Put the packed bytes in %xmm0 and rejoin the full path right after its 8-byte load.
9669   196,193,249,110,193,                    //vmovq         %r9,%xmm0
9670   235,149,                                //jmp           c04 <_sk_load_a8_avx+0x14>
9671 };
9672 
// sk_store_a8_avx: x86-64 AVX machine code kept as raw bytes. The trailing comment
// on every row is the disassembly of that row (AT&T syntax: source, destination).
//
// Reading the disassembly, the routine:
//   - multiplies the eight floats in %ymm3 by 0x437f0000 (255.0f), converts them to
//     32-bit ints and narrows them with two unsigned-saturating packs to eight bytes;
//   - stores those bytes at (%r9 + %rdi), where %r9 is a pointer reached through the
//     slot at (%rsi);
//   - loads one more pointer with lods and jumps to it.
// When %rcx is non-zero on entry, only (%rcx & 7) bytes are written, through the jump
// table at the end of the array; if (%rcx & 7) is 0 in that case nothing is written.
// NOTE(review): %rdi looks like a pixel index, %rcx like a count of valid pixels in a
// partial batch, and the final jump target like the next pipeline stage -- confirm
// against the generator named in the file header (src/jumper/build_stages.py).
9673 CODE const uint8_t sk_store_a8_avx[] = {
       // %r9 = *(slot at (%rsi)) dereferenced once: base address of the destination bytes.
9674   72,173,                                 //lods          %ds:(%rsi),%rax
9675   76,139,8,                               //mov           (%rax),%r9
       // Broadcast 255.0f to %ymm8, then %ymm8 = %ymm3 * 255.0f, converted to 32-bit ints.
9676   184,0,0,127,67,                         //mov           $0x437f0000,%eax
9677   197,121,110,192,                        //vmovd         %eax,%xmm8
9678   196,67,121,4,192,0,                     //vpermilps     $0x0,%xmm8,%xmm8
9679   196,67,61,24,192,1,                     //vinsertf128   $0x1,%xmm8,%ymm8,%ymm8
9680   197,60,89,195,                          //vmulps        %ymm3,%ymm8,%ymm8
9681   196,65,125,91,192,                      //vcvtps2dq     %ymm8,%ymm8
       // Narrow 8 x int32 -> 8 x uint16 -> 8 x uint8 (both packs saturate); the eight
       // result bytes sit in the low 64 bits of %xmm8.
9682   196,67,125,25,193,1,                    //vextractf128  $0x1,%ymm8,%xmm9
9683   196,66,57,43,193,                       //vpackusdw     %xmm9,%xmm8,%xmm8
9684   196,65,57,103,192,                      //vpackuswb     %xmm8,%xmm8,%xmm8
       // Non-zero %rcx selects the partial-store path at +0x42.
9685   72,133,201,                             //test          %rcx,%rcx
9686   117,10,                                 //jne           cb1 <_sk_store_a8_avx+0x42>
       // Full path: write all eight bytes with one 64-bit store.
9687   196,65,123,17,4,57,                     //vmovsd        %xmm8,(%r9,%rdi,1)
       // Exit (+0x3e), shared by both paths: fetch the address to continue at and jump.
9688   72,173,                                 //lods          %ds:(%rsi),%rax
9689   255,224,                                //jmpq          *%rax
       // Partial path (+0x42): %r8b = (%cl & 7) - 1. An unsigned result above 6 can only
       // come from (%cl & 7) == 0, in which case go straight to the exit without storing.
9690   65,137,200,                             //mov           %ecx,%r8d
9691   65,128,224,7,                           //and           $0x7,%r8b
9692   65,254,200,                             //dec           %r8b
9693   65,128,248,6,                           //cmp           $0x6,%r8b
9694   119,236,                                //ja            cad <_sk_store_a8_avx+0x3e>
       // Spread each result byte into its own 16-bit word, so byte k is at byte offset 2k;
       // that is why the vpextrb immediates below are all even.
9695   196,66,121,48,192,                      //vpmovzxbw     %xmm8,%xmm8
       // Dispatch: %rax = table base (+0xa5) + sign-extended 32-bit table[%r8b]; jump there.
9696   65,15,182,192,                          //movzbl        %r8b,%eax
9697   76,141,5,67,0,0,0,                      //lea           0x43(%rip),%r8        # d14 <_sk_store_a8_avx+0xa5>
9698   73,99,4,128,                            //movslq        (%r8,%rax,4),%rax
9699   76,1,192,                               //add           %r8,%rax
9700   255,224,                                //jmpq          *%rax
       // Fall-through chain: entering at the store for byte k writes bytes k, k-1, ..., 0,
       // then jumps back to the exit at +0x3e.
9701   196,67,121,20,68,57,6,12,               //vpextrb       $0xc,%xmm8,0x6(%r9,%rdi,1)
9702   196,67,121,20,68,57,5,10,               //vpextrb       $0xa,%xmm8,0x5(%r9,%rdi,1)
9703   196,67,121,20,68,57,4,8,                //vpextrb       $0x8,%xmm8,0x4(%r9,%rdi,1)
9704   196,67,121,20,68,57,3,6,                //vpextrb       $0x6,%xmm8,0x3(%r9,%rdi,1)
9705   196,67,121,20,68,57,2,4,                //vpextrb       $0x4,%xmm8,0x2(%r9,%rdi,1)
9706   196,67,121,20,68,57,1,2,                //vpextrb       $0x2,%xmm8,0x1(%r9,%rdi,1)
9707   196,67,121,20,4,57,0,                   //vpextrb       $0x0,%xmm8,(%r9,%rdi,1)
9708   235,154,                                //jmp           cad <_sk_store_a8_avx+0x3e>
       // The nop is never reached by the code above; it appears to be padding that places
       // the table at +0xa5.
9709   144,                                    //nop
       // Everything from here to the closing brace is DATA, not code: the jump table used
       // by the dispatch above. It holds seven little-endian signed 32-bit offsets relative
       // to the table start: -10, -18, -26, -34, -42, -50, -58. Entry k is used when
       // (%rcx & 7) == k + 1 and lands on the vpextrb that stores byte k. The disassembly
       // comments on these rows are the disassembler decoding data bytes as instructions
       // and carry no meaning.
9710   246,255,                                //idiv          %bh
9711   255,                                    //(bad)
9712   255,                                    //(bad)
9713   238,                                    //out           %al,(%dx)
9714   255,                                    //(bad)
9715   255,                                    //(bad)
9716   255,230,                                //jmpq          *%rsi
9717   255,                                    //(bad)
9718   255,                                    //(bad)
9719   255,                                    //(bad)
9720   222,255,                                //fdivrp        %st,%st(7)
9721   255,                                    //(bad)
9722   255,214,                                //callq         *%rsi
9723   255,                                    //(bad)
9724   255,                                    //(bad)
9725   255,206,                                //dec           %esi
9726   255,                                    //(bad)
9727   255,                                    //(bad)
9728   255,198,                                //inc           %esi
9729   255,                                    //(bad)
9730   255,                                    //(bad)
9731   255,                                    //.byte         0xff
9732 };
9733 
// sk_load_565_avx: x86-64 AVX machine code kept as raw bytes. The trailing comment
// on every row is the disassembly of that row (AT&T syntax: source, destination).
//
// Reading the disassembly, the routine:
//   - loads eight 16-bit values from (%r10 + 2*%rdi), where %r10 is a pointer reached
//     through the slot at (%rsi), and zero-extends them to eight 32-bit lanes;
//   - %ymm0 = float(value & 0xf800) * 0x37842108  (~1/(31*2048) as a float);
//   - %ymm1 = float(value & 0x07e0) * 0x3a020821  (~1/(63*32)   as a float);
//   - %ymm2 = float(value & 0x001f) * 0x3d042108  (~1/31        as a float);
//   - %ymm3 = 0x3f800000 (1.0f) in every lane;
//   - loads one more pointer with lods and jumps to it.
// When %rcx is non-zero on entry, only (%rcx & 7) values are read, through the jump
// table at the end of the array; lanes that are not read are zero.
// NOTE(review): %ymm0..%ymm3 presumably carry red, green, blue and alpha; %rdi looks
// like a pixel index, %rcx like a count of valid pixels in a partial batch, and the
// final jump target like the next pipeline stage -- confirm against the generator
// named in the file header (src/jumper/build_stages.py).
9734 CODE const uint8_t sk_load_565_avx[] = {
       // %r10 = *(slot at (%rsi)) dereferenced once: base address of the 16-bit source values.
9735   72,173,                                 //lods          %ds:(%rsi),%rax
9736   76,139,16,                              //mov           (%rax),%r10
       // Non-zero %rcx selects the partial-load path at +0xdf.
9737   72,133,201,                             //test          %rcx,%rcx
9738   15,133,209,0,0,0,                       //jne           e0f <_sk_load_565_avx+0xdf>
       // Full path: load 16 bytes (eight 16-bit values). The partial path re-enters just
       // after this load (+0x14).
9739   196,193,122,111,4,122,                  //vmovdqu       (%r10,%rdi,2),%xmm0
       // Zero-extend to 32 bits: words 4-7 by interleaving with zero, words 0-3 with
       // vpmovzxwd; %ymm2 then holds all eight values as 32-bit ints.
9740   197,241,239,201,                        //vpxor         %xmm1,%xmm1,%xmm1
9741   197,249,105,201,                        //vpunpckhwd    %xmm1,%xmm0,%xmm1
9742   196,226,121,51,192,                     //vpmovzxwd     %xmm0,%xmm0
9743   196,227,125,24,209,1,                   //vinsertf128   $0x1,%xmm1,%ymm0,%ymm2
       // %ymm0 = float(%ymm2 & 0xf800) * ~1/(31*2048): bits 11-15 mapped onto [0, 1].
9744   184,0,248,0,0,                          //mov           $0xf800,%eax
9745   197,249,110,192,                        //vmovd         %eax,%xmm0
9746   197,249,112,192,0,                      //vpshufd       $0x0,%xmm0,%xmm0
9747   196,227,125,24,192,1,                   //vinsertf128   $0x1,%xmm0,%ymm0,%ymm0
9748   197,252,84,194,                         //vandps        %ymm2,%ymm0,%ymm0
9749   197,252,91,192,                         //vcvtdq2ps     %ymm0,%ymm0
9750   184,8,33,132,55,                        //mov           $0x37842108,%eax
9751   197,249,110,200,                        //vmovd         %eax,%xmm1
9752   196,227,121,4,201,0,                    //vpermilps     $0x0,%xmm1,%xmm1
9753   196,227,117,24,201,1,                   //vinsertf128   $0x1,%xmm1,%ymm1,%ymm1
9754   197,252,89,193,                         //vmulps        %ymm1,%ymm0,%ymm0
       // %ymm1 = float(%ymm2 & 0x7e0) * ~1/(63*32): bits 5-10 mapped onto [0, 1].
9755   184,224,7,0,0,                          //mov           $0x7e0,%eax
9756   197,249,110,200,                        //vmovd         %eax,%xmm1
9757   197,249,112,201,0,                      //vpshufd       $0x0,%xmm1,%xmm1
9758   196,227,117,24,201,1,                   //vinsertf128   $0x1,%xmm1,%ymm1,%ymm1
9759   197,244,84,202,                         //vandps        %ymm2,%ymm1,%ymm1
9760   197,252,91,201,                         //vcvtdq2ps     %ymm1,%ymm1
9761   184,33,8,2,58,                          //mov           $0x3a020821,%eax
9762   197,249,110,216,                        //vmovd         %eax,%xmm3
9763   196,227,121,4,219,0,                    //vpermilps     $0x0,%xmm3,%xmm3
9764   196,227,101,24,219,1,                   //vinsertf128   $0x1,%xmm3,%ymm3,%ymm3
9765   197,244,89,203,                         //vmulps        %ymm3,%ymm1,%ymm1
       // %ymm2 = float(%ymm2 & 0x1f) * ~1/31: bits 0-4 mapped onto [0, 1].
9766   184,31,0,0,0,                           //mov           $0x1f,%eax
9767   197,249,110,216,                        //vmovd         %eax,%xmm3
9768   197,249,112,219,0,                      //vpshufd       $0x0,%xmm3,%xmm3
9769   196,227,101,24,219,1,                   //vinsertf128   $0x1,%xmm3,%ymm3,%ymm3
9770   197,228,84,210,                         //vandps        %ymm2,%ymm3,%ymm2
9771   197,252,91,210,                         //vcvtdq2ps     %ymm2,%ymm2
9772   184,8,33,4,61,                          //mov           $0x3d042108,%eax
9773   197,249,110,216,                        //vmovd         %eax,%xmm3
9774   196,227,121,4,219,0,                    //vpermilps     $0x0,%xmm3,%xmm3
9775   196,227,101,24,219,1,                   //vinsertf128   $0x1,%xmm3,%ymm3,%ymm3
9776   197,236,89,211,                         //vmulps        %ymm3,%ymm2,%ymm2
       // %ymm3 = 1.0f in every lane.
9777   184,0,0,128,63,                         //mov           $0x3f800000,%eax
9778   197,249,110,216,                        //vmovd         %eax,%xmm3
9779   196,227,121,4,219,0,                    //vpermilps     $0x0,%xmm3,%xmm3
9780   196,227,101,24,219,1,                   //vinsertf128   $0x1,%xmm3,%ymm3,%ymm3
       // Fetch the address to continue at and jump.
9781   72,173,                                 //lods          %ds:(%rsi),%rax
9782   255,224,                                //jmpq          *%rax
       // Partial path (+0xdf): clear %xmm0 and compute %r8b = (%cl & 7) - 1. An unsigned
       // result above 6 can only come from (%cl & 7) == 0; in that case continue at +0x14
       // with all eight values zero.
9783   65,137,200,                             //mov           %ecx,%r8d
9784   65,128,224,7,                           //and           $0x7,%r8b
9785   197,249,239,192,                        //vpxor         %xmm0,%xmm0,%xmm0
9786   65,254,200,                             //dec           %r8b
9787   65,128,248,6,                           //cmp           $0x6,%r8b
9788   15,135,29,255,255,255,                  //ja            d44 <_sk_load_565_avx+0x14>
       // Dispatch: %rax = table base (+0x14c) + sign-extended 32-bit table[%r8]; jump there.
9789   69,15,182,192,                          //movzbl        %r8b,%r8d
9790   76,141,13,74,0,0,0,                     //lea           0x4a(%rip),%r9        # e7c <_sk_load_565_avx+0x14c>
9791   75,99,4,129,                            //movslq        (%r9,%r8,4),%rax
9792   76,1,200,                               //add           %r9,%rax
9793   255,224,                                //jmpq          *%rax
       // Fall-through chain: entering at the insert for word k loads words k, k-1, ..., 0
       // into %xmm0, then jumps back to +0x14. Only the 7-value case enters at this vpxor;
       // %xmm0 was already cleared above.
9794   197,249,239,192,                        //vpxor         %xmm0,%xmm0,%xmm0
9795   196,193,121,196,68,122,12,6,            //vpinsrw       $0x6,0xc(%r10,%rdi,2),%xmm0,%xmm0
9796   196,193,121,196,68,122,10,5,            //vpinsrw       $0x5,0xa(%r10,%rdi,2),%xmm0,%xmm0
9797   196,193,121,196,68,122,8,4,             //vpinsrw       $0x4,0x8(%r10,%rdi,2),%xmm0,%xmm0
9798   196,193,121,196,68,122,6,3,             //vpinsrw       $0x3,0x6(%r10,%rdi,2),%xmm0,%xmm0
9799   196,193,121,196,68,122,4,2,             //vpinsrw       $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0
9800   196,193,121,196,68,122,2,1,             //vpinsrw       $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0
9801   196,193,121,196,4,122,0,                //vpinsrw       $0x0,(%r10,%rdi,2),%xmm0,%xmm0
9802   233,201,254,255,255,                    //jmpq          d44 <_sk_load_565_avx+0x14>
       // The nop is never reached by the code above; it appears to be padding that places
       // the table at +0x14c.
9803   144,                                    //nop
       // Everything from here to the closing brace is DATA, not code: the jump table used
       // by the dispatch above. It holds seven little-endian signed 32-bit offsets relative
       // to the table start: -13, -21, -29, -37, -45, -53, -65. Entry k is used when
       // (%rcx & 7) == k + 1; entries 0-5 land on the vpinsrw for word k and entry 6 lands
       // on the vpxor ahead of the first vpinsrw. The disassembly comments on these rows
       // are the disassembler decoding data bytes as instructions and carry no meaning.
9804   243,255,                                //repz          (bad)
9805   255,                                    //(bad)
9806   255,                                    //(bad)
9807   235,255,                                //jmp           e81 <_sk_load_565_avx+0x151>
9808   255,                                    //(bad)
9809   255,227,                                //jmpq          *%rbx
9810   255,                                    //(bad)
9811   255,                                    //(bad)
9812   255,                                    //(bad)
9813   219,255,                                //(bad)
9814   255,                                    //(bad)
9815   255,211,                                //callq         *%rbx
9816   255,                                    //(bad)
9817   255,                                    //(bad)
9818   255,203,                                //dec           %ebx
9819   255,                                    //(bad)
9820   255,                                    //(bad)
9821   255,                                    //(bad)
9822   191,                                    //.byte         0xbf
9823   255,                                    //(bad)
9824   255,                                    //(bad)
9825   255,                                    //.byte         0xff
9826 };
9827 
// sk_store_565_avx: x86-64 AVX machine code kept as raw bytes. The trailing comment
// on every row is the disassembly of that row (AT&T syntax: source, destination).
//
// Reading the disassembly, the routine builds eight 16-bit values as
//     (int(%ymm0 * 31.0f) << 11) | (int(%ymm1 * 63.0f) << 5) | int(%ymm2 * 31.0f)
// narrows them to 16 bits with an unsigned-saturating pack, and stores them at
// (%r9 + 2*%rdi), where %r9 is a pointer reached through the slot at (%rsi). It then
// loads one more pointer with lods and jumps to it.
// When %rcx is non-zero on entry, only (%rcx & 7) values are written, through the jump
// table at the end of the array; if (%rcx & 7) is 0 in that case nothing is written.
// NOTE(review): %ymm0..%ymm2 presumably carry red, green and blue in [0, 1]; %rdi
// looks like a pixel index, %rcx like a count of valid pixels in a partial batch, and
// the final jump target like the next pipeline stage -- confirm against the generator
// named in the file header (src/jumper/build_stages.py).
9828 CODE const uint8_t sk_store_565_avx[] = {
       // %r9 = *(slot at (%rsi)) dereferenced once: base address of the 16-bit destination.
9829   72,173,                                 //lods          %ds:(%rsi),%rax
9830   76,139,8,                               //mov           (%rax),%r9
       // %ymm8 = 31.0f (0x41f80000) in every lane; it is reused for %ymm2 further down.
9831   184,0,0,248,65,                         //mov           $0x41f80000,%eax
9832   197,121,110,192,                        //vmovd         %eax,%xmm8
9833   196,67,121,4,192,0,                     //vpermilps     $0x0,%xmm8,%xmm8
9834   196,67,61,24,192,1,                     //vinsertf128   $0x1,%xmm8,%ymm8,%ymm8
       // %ymm9 = int(%ymm0 * 31.0f) << 11. The shift is applied to each 128-bit half
       // separately and the halves are then rejoined.
9835   197,60,89,200,                          //vmulps        %ymm0,%ymm8,%ymm9
9836   196,65,125,91,201,                      //vcvtps2dq     %ymm9,%ymm9
9837   196,193,41,114,241,11,                  //vpslld        $0xb,%xmm9,%xmm10
9838   196,67,125,25,201,1,                    //vextractf128  $0x1,%ymm9,%xmm9
9839   196,193,49,114,241,11,                  //vpslld        $0xb,%xmm9,%xmm9
9840   196,67,45,24,201,1,                     //vinsertf128   $0x1,%xmm9,%ymm10,%ymm9
       // %ymm10 = int(%ymm1 * 63.0f) << 5, with 63.0f = 0x427c0000; same per-half shift.
9841   184,0,0,124,66,                         //mov           $0x427c0000,%eax
9842   197,121,110,208,                        //vmovd         %eax,%xmm10
9843   196,67,121,4,210,0,                     //vpermilps     $0x0,%xmm10,%xmm10
9844   196,67,45,24,210,1,                     //vinsertf128   $0x1,%xmm10,%ymm10,%ymm10
9845   197,44,89,209,                          //vmulps        %ymm1,%ymm10,%ymm10
9846   196,65,125,91,210,                      //vcvtps2dq     %ymm10,%ymm10
9847   196,193,33,114,242,5,                   //vpslld        $0x5,%xmm10,%xmm11
9848   196,67,125,25,210,1,                    //vextractf128  $0x1,%ymm10,%xmm10
9849   196,193,41,114,242,5,                   //vpslld        $0x5,%xmm10,%xmm10
9850   196,67,37,24,210,1,                     //vinsertf128   $0x1,%xmm10,%ymm11,%ymm10
       // Combine: %ymm9 = %ymm9 | %ymm10; then %ymm8 = int(%ymm2 * 31.0f) | %ymm9.
9851   196,65,45,86,201,                       //vorpd         %ymm9,%ymm10,%ymm9
9852   197,60,89,194,                          //vmulps        %ymm2,%ymm8,%ymm8
9853   196,65,125,91,192,                      //vcvtps2dq     %ymm8,%ymm8
9854   196,65,53,86,192,                       //vorpd         %ymm8,%ymm9,%ymm8
       // Narrow 8 x int32 -> 8 x uint16 (saturating) into %xmm8.
9855   196,67,125,25,193,1,                    //vextractf128  $0x1,%ymm8,%xmm9
9856   196,66,57,43,193,                       //vpackusdw     %xmm9,%xmm8,%xmm8
       // Non-zero %rcx selects the partial-store path at +0x9e.
9857   72,133,201,                             //test          %rcx,%rcx
9858   117,10,                                 //jne           f36 <_sk_store_565_avx+0x9e>
       // Full path: write all eight 16-bit values with one 16-byte store.
9859   196,65,122,127,4,121,                   //vmovdqu       %xmm8,(%r9,%rdi,2)
       // Exit (+0x9a), shared by both paths: fetch the address to continue at and jump.
9860   72,173,                                 //lods          %ds:(%rsi),%rax
9861   255,224,                                //jmpq          *%rax
       // Partial path (+0x9e): %r8b = (%cl & 7) - 1. An unsigned result above 6 can only
       // come from (%cl & 7) == 0, in which case go straight to the exit without storing.
9862   65,137,200,                             //mov           %ecx,%r8d
9863   65,128,224,7,                           //and           $0x7,%r8b
9864   65,254,200,                             //dec           %r8b
9865   65,128,248,6,                           //cmp           $0x6,%r8b
9866   119,236,                                //ja            f32 <_sk_store_565_avx+0x9a>
       // Dispatch: %rax = table base (+0xfc) + sign-extended 32-bit table[%r8b]; jump there.
9867   65,15,182,192,                          //movzbl        %r8b,%eax
9868   76,141,5,67,0,0,0,                      //lea           0x43(%rip),%r8        # f94 <_sk_store_565_avx+0xfc>
9869   73,99,4,128,                            //movslq        (%r8,%rax,4),%rax
9870   76,1,192,                               //add           %r8,%rax
9871   255,224,                                //jmpq          *%rax
       // Fall-through chain: entering at the store for word k writes words k, k-1, ..., 0,
       // then jumps back to the exit at +0x9a.
9872   196,67,121,21,68,121,12,6,              //vpextrw       $0x6,%xmm8,0xc(%r9,%rdi,2)
9873   196,67,121,21,68,121,10,5,              //vpextrw       $0x5,%xmm8,0xa(%r9,%rdi,2)
9874   196,67,121,21,68,121,8,4,               //vpextrw       $0x4,%xmm8,0x8(%r9,%rdi,2)
9875   196,67,121,21,68,121,6,3,               //vpextrw       $0x3,%xmm8,0x6(%r9,%rdi,2)
9876   196,67,121,21,68,121,4,2,               //vpextrw       $0x2,%xmm8,0x4(%r9,%rdi,2)
9877   196,67,121,21,68,121,2,1,               //vpextrw       $0x1,%xmm8,0x2(%r9,%rdi,2)
9878   196,67,121,21,4,121,0,                  //vpextrw       $0x0,%xmm8,(%r9,%rdi,2)
9879   235,159,                                //jmp           f32 <_sk_store_565_avx+0x9a>
       // The nop is never reached by the code above; it appears to be padding that places
       // the table at +0xfc.
9880   144,                                    //nop
       // Everything from here to the closing brace is DATA, not code: the jump table used
       // by the dispatch above. It holds seven little-endian signed 32-bit offsets relative
       // to the table start: -10, -18, -26, -34, -42, -50, -58. Entry k is used when
       // (%rcx & 7) == k + 1 and lands on the vpextrw that stores word k. The disassembly
       // comments on these rows are the disassembler decoding data bytes as instructions
       // and carry no meaning.
9881   246,255,                                //idiv          %bh
9882   255,                                    //(bad)
9883   255,                                    //(bad)
9884   238,                                    //out           %al,(%dx)
9885   255,                                    //(bad)
9886   255,                                    //(bad)
9887   255,230,                                //jmpq          *%rsi
9888   255,                                    //(bad)
9889   255,                                    //(bad)
9890   255,                                    //(bad)
9891   222,255,                                //fdivrp        %st,%st(7)
9892   255,                                    //(bad)
9893   255,214,                                //callq         *%rsi
9894   255,                                    //(bad)
9895   255,                                    //(bad)
9896   255,206,                                //dec           %esi
9897   255,                                    //(bad)
9898   255,                                    //(bad)
9899   255,198,                                //inc           %esi
9900   255,                                    //(bad)
9901   255,                                    //(bad)
9902   255,                                    //.byte         0xff
9903 };
9904 
// sk_load_8888_avx: x86-64 AVX machine code; the trailing comments are its AT&T disassembly.
// Reads packed 32-bit pixels and expands them into four float vectors.
//   * The first lods fetches a pointer from the stream at rsi; dereferencing it gives the
//     pixel base address in r10.  Pixels are addressed as r10 + rdi*4.
//   * rcx == 0: all 8 pixels (32 bytes) are loaded into ymm9 with one vmovups.
//     rcx != 0: the tail path further down fills ymm9 one pixel at a time.
//   * ymm0, ymm1, ymm2, ymm3 = float((pixel >> 0, 8, 16, 24) & 0xff) * 0x3b808081, where
//     0x3b808081 is the bit pattern of a float approximately equal to 1/255.
//   * Ends by loading the next stream word with lods and jumping to it.
// NOTE(review): rdi and rcx look like the pixel index and the tail count -- confirm with the caller.
9905 CODE const uint8_t sk_load_8888_avx[] = {
9906   72,173,                                 //lods          %ds:(%rsi),%rax
9907   76,139,16,                              //mov           (%rax),%r10
9908   72,133,201,                             //test          %rcx,%rcx
9909   15,133,157,0,0,0,                       //jne           105b <_sk_load_8888_avx+0xab>
9910   196,65,124,16,12,186,                   //vmovups       (%r10,%rdi,4),%ymm9
  // Offset 0x14: the tail path rejoins here with ymm9 already filled.
  // ymm11 = 0x000000ff in every 32-bit lane; ymm0 = float(pixels & 0xff).
9911   184,255,0,0,0,                          //mov           $0xff,%eax
9912   197,249,110,192,                        //vmovd         %eax,%xmm0
9913   197,249,112,192,0,                      //vpshufd       $0x0,%xmm0,%xmm0
9914   196,99,125,24,216,1,                    //vinsertf128   $0x1,%xmm0,%ymm0,%ymm11
9915   196,193,36,84,193,                      //vandps        %ymm9,%ymm11,%ymm0
9916   197,252,91,192,                         //vcvtdq2ps     %ymm0,%ymm0
  // ymm8 = 0x3b808081 broadcast to all lanes (float bits, approximately 1/255).
9917   184,129,128,128,59,                     //mov           $0x3b808081,%eax
9918   197,249,110,200,                        //vmovd         %eax,%xmm1
9919   196,227,121,4,201,0,                    //vpermilps     $0x0,%xmm1,%xmm1
9920   196,99,117,24,193,1,                    //vinsertf128   $0x1,%xmm1,%ymm1,%ymm8
9921   196,193,124,89,192,                     //vmulps        %ymm8,%ymm0,%ymm0
  // ymm1: bits 8..15.  The integer shift is applied to each 128-bit half, then recombined.
9922   196,193,41,114,209,8,                   //vpsrld        $0x8,%xmm9,%xmm10
9923   196,99,125,25,203,1,                    //vextractf128  $0x1,%ymm9,%xmm3
9924   197,241,114,211,8,                      //vpsrld        $0x8,%xmm3,%xmm1
9925   196,227,45,24,201,1,                    //vinsertf128   $0x1,%xmm1,%ymm10,%ymm1
9926   197,164,84,201,                         //vandps        %ymm1,%ymm11,%ymm1
9927   197,252,91,201,                         //vcvtdq2ps     %ymm1,%ymm1
9928   196,193,116,89,200,                     //vmulps        %ymm8,%ymm1,%ymm1
  // ymm2: bits 16..23.
9929   196,193,41,114,209,16,                  //vpsrld        $0x10,%xmm9,%xmm10
9930   197,233,114,211,16,                     //vpsrld        $0x10,%xmm3,%xmm2
9931   196,227,45,24,210,1,                    //vinsertf128   $0x1,%xmm2,%ymm10,%ymm2
9932   197,164,84,210,                         //vandps        %ymm2,%ymm11,%ymm2
9933   197,252,91,210,                         //vcvtdq2ps     %ymm2,%ymm2
9934   196,193,108,89,208,                     //vmulps        %ymm8,%ymm2,%ymm2
  // ymm3: bits 24..31.  A logical shift by 24 leaves only 8 bits, so no vandps is needed.
9935   196,193,49,114,209,24,                  //vpsrld        $0x18,%xmm9,%xmm9
9936   197,225,114,211,24,                     //vpsrld        $0x18,%xmm3,%xmm3
9937   196,227,53,24,219,1,                    //vinsertf128   $0x1,%xmm3,%ymm9,%ymm3
9938   197,252,91,219,                         //vcvtdq2ps     %ymm3,%ymm3
9939   196,193,100,89,216,                     //vmulps        %ymm8,%ymm3,%ymm3
  // Hand off: fetch the next stream word and jump to it.
9940   72,173,                                 //lods          %ds:(%rsi),%rax
9941   255,224,                                //jmpq          *%rax
  // Tail path (offset 0xab, reached when rcx != 0).  ymm9 is zeroed, then
  // i = (rcx & 7) - 1 is computed in r8b.  If i > 6 (unsigned) nothing is loaded and
  // control returns to offset 0x14; otherwise the table below is indexed with i.
9942   65,137,200,                             //mov           %ecx,%r8d
9943   65,128,224,7,                           //and           $0x7,%r8b
9944   196,65,52,87,201,                       //vxorps        %ymm9,%ymm9,%ymm9
9945   65,254,200,                             //dec           %r8b
9946   65,128,248,6,                           //cmp           $0x6,%r8b
9947   15,135,80,255,255,255,                  //ja            fc4 <_sk_load_8888_avx+0x14>
9948   69,15,182,192,                          //movzbl        %r8b,%r8d
9949   76,141,13,137,0,0,0,                    //lea           0x89(%rip),%r9        # 1108 <_sk_load_8888_avx+0x158>
9950   75,99,4,129,                            //movslq        (%r9,%r8,4),%rax
9951   76,1,200,                               //add           %r9,%rax
9952   255,224,                                //jmpq          *%rax
  // Straight-line chain with no internal branches: it inserts pixel 6 first and pixel 0
  // last.  Table entry i enters the chain at the point where pixels i..0 remain.
9953   196,193,121,110,68,186,24,              //vmovd         0x18(%r10,%rdi,4),%xmm0
9954   197,249,112,192,68,                     //vpshufd       $0x44,%xmm0,%xmm0
9955   196,227,125,24,192,1,                   //vinsertf128   $0x1,%xmm0,%ymm0,%ymm0
9956   197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
9957   196,99,117,12,200,64,                   //vblendps      $0x40,%ymm0,%ymm1,%ymm9
9958   196,99,125,25,200,1,                    //vextractf128  $0x1,%ymm9,%xmm0
9959   196,195,121,34,68,186,20,1,             //vpinsrd       $0x1,0x14(%r10,%rdi,4),%xmm0,%xmm0
9960   196,99,53,24,200,1,                     //vinsertf128   $0x1,%xmm0,%ymm9,%ymm9
9961   196,99,125,25,200,1,                    //vextractf128  $0x1,%ymm9,%xmm0
9962   196,195,121,34,68,186,16,0,             //vpinsrd       $0x0,0x10(%r10,%rdi,4),%xmm0,%xmm0
9963   196,99,53,24,200,1,                     //vinsertf128   $0x1,%xmm0,%ymm9,%ymm9
9964   196,195,49,34,68,186,12,3,              //vpinsrd       $0x3,0xc(%r10,%rdi,4),%xmm9,%xmm0
9965   196,99,53,12,200,15,                    //vblendps      $0xf,%ymm0,%ymm9,%ymm9
9966   196,195,49,34,68,186,8,2,               //vpinsrd       $0x2,0x8(%r10,%rdi,4),%xmm9,%xmm0
9967   196,99,53,12,200,15,                    //vblendps      $0xf,%ymm0,%ymm9,%ymm9
9968   196,195,49,34,68,186,4,1,               //vpinsrd       $0x1,0x4(%r10,%rdi,4),%xmm9,%xmm0
9969   196,99,53,12,200,15,                    //vblendps      $0xf,%ymm0,%ymm9,%ymm9
9970   196,195,49,34,4,186,0,                  //vpinsrd       $0x0,(%r10,%rdi,4),%xmm9,%xmm0
9971   196,99,53,12,200,15,                    //vblendps      $0xf,%ymm0,%ymm9,%ymm9
9972   233,188,254,255,255,                    //jmpq          fc4 <_sk_load_8888_avx+0x14>
  // Jump table (offset 0x158): seven little-endian 32-bit offsets relative to the table
  // itself.  These bytes are data, so the disassembly shown beside them is meaningless.
9973   238,                                    //out           %al,(%dx)
9974   255,                                    //(bad)
9975   255,                                    //(bad)
9976   255,224,                                //jmpq          *%rax
9977   255,                                    //(bad)
9978   255,                                    //(bad)
9979   255,210,                                //callq         *%rdx
9980   255,                                    //(bad)
9981   255,                                    //(bad)
9982   255,196,                                //inc           %esp
9983   255,                                    //(bad)
9984   255,                                    //(bad)
9985   255,176,255,255,255,156,                //pushq         -0x63000001(%rax)
9986   255,                                    //(bad)
9987   255,                                    //(bad)
9988   255,                                    //.byte         0xff
9989   128,255,255,                            //cmp           $0xff,%bh
9990   255,                                    //.byte         0xff
9991 };
9992 
// sk_store_8888_avx: x86-64 AVX machine code; the trailing comments are its AT&T disassembly.
// Packs the four float vectors ymm0..ymm3 into 32-bit pixels and writes them to memory.
//   * The first lods fetches a pointer from the stream at rsi; dereferencing it gives the
//     destination base address in r9.  Pixels are addressed as r9 + rdi*4.
//   * Each of ymm0..ymm3 is multiplied by 0x437f0000 (the float 255.0) and converted to
//     32-bit integers with vcvtps2dq.
//   * ymm8 = i0 | (i1 << 8) | (i2 << 16) | (i3 << 24).  No clamping or masking is done here.
//   * rcx == 0: one 32-byte vmovups stores 8 pixels.  rcx != 0: the tail path stores
//     (rcx & 7) pixels through a jump table.
//   * Ends by loading the next stream word with lods and jumping to it.
// NOTE(review): rdi and rcx look like the pixel index and the tail count -- confirm with the caller.
9993 CODE const uint8_t sk_store_8888_avx[] = {
9994   72,173,                                 //lods          %ds:(%rsi),%rax
9995   76,139,8,                               //mov           (%rax),%r9
  // ymm8 = 255.0f broadcast to all lanes.
9996   184,0,0,127,67,                         //mov           $0x437f0000,%eax
9997   197,121,110,192,                        //vmovd         %eax,%xmm8
9998   196,67,121,4,192,0,                     //vpermilps     $0x0,%xmm8,%xmm8
9999   196,67,61,24,192,1,                     //vinsertf128   $0x1,%xmm8,%ymm8,%ymm8
  // ymm9 = int(ymm0 * 255): the low byte of every pixel.
10000   197,60,89,200,                          //vmulps        %ymm0,%ymm8,%ymm9
10001   196,65,125,91,201,                      //vcvtps2dq     %ymm9,%ymm9
  // ymm9 |= int(ymm1 * 255) << 8, shifting each 128-bit half separately.
10002   197,60,89,209,                          //vmulps        %ymm1,%ymm8,%ymm10
10003   196,65,125,91,210,                      //vcvtps2dq     %ymm10,%ymm10
10004   196,193,33,114,242,8,                   //vpslld        $0x8,%xmm10,%xmm11
10005   196,67,125,25,210,1,                    //vextractf128  $0x1,%ymm10,%xmm10
10006   196,193,41,114,242,8,                   //vpslld        $0x8,%xmm10,%xmm10
10007   196,67,37,24,210,1,                     //vinsertf128   $0x1,%xmm10,%ymm11,%ymm10
10008   196,65,45,86,201,                       //vorpd         %ymm9,%ymm10,%ymm9
  // ymm10 = int(ymm2 * 255) << 16.
10009   197,60,89,210,                          //vmulps        %ymm2,%ymm8,%ymm10
10010   196,65,125,91,210,                      //vcvtps2dq     %ymm10,%ymm10
10011   196,193,33,114,242,16,                  //vpslld        $0x10,%xmm10,%xmm11
10012   196,67,125,25,210,1,                    //vextractf128  $0x1,%ymm10,%xmm10
10013   196,193,41,114,242,16,                  //vpslld        $0x10,%xmm10,%xmm10
10014   196,67,37,24,210,1,                     //vinsertf128   $0x1,%xmm10,%ymm11,%ymm10
  // ymm8 = int(ymm3 * 255) << 24, then all four parts are OR-ed into ymm8.
10015   197,60,89,195,                          //vmulps        %ymm3,%ymm8,%ymm8
10016   196,65,125,91,192,                      //vcvtps2dq     %ymm8,%ymm8
10017   196,193,33,114,240,24,                  //vpslld        $0x18,%xmm8,%xmm11
10018   196,67,125,25,192,1,                    //vextractf128  $0x1,%ymm8,%xmm8
10019   196,193,57,114,240,24,                  //vpslld        $0x18,%xmm8,%xmm8
10020   196,67,37,24,192,1,                     //vinsertf128   $0x1,%xmm8,%ymm11,%ymm8
10021   196,65,45,86,192,                       //vorpd         %ymm8,%ymm10,%ymm8
10022   196,65,53,86,192,                       //vorpd         %ymm8,%ymm9,%ymm8
10023   72,133,201,                             //test          %rcx,%rcx
10024   117,10,                                 //jne           11c8 <_sk_store_8888_avx+0xa4>
10025   196,65,124,17,4,185,                    //vmovups       %ymm8,(%r9,%rdi,4)
  // Offset 0xa0: common exit -- fetch the next stream word and jump to it.
10026   72,173,                                 //lods          %ds:(%rsi),%rax
10027   255,224,                                //jmpq          *%rax
  // Tail path (offset 0xa4): i = (rcx & 7) - 1 in r8b.  If i > 6 (unsigned) nothing is
  // stored; otherwise the table below is indexed with i.
10028   65,137,200,                             //mov           %ecx,%r8d
10029   65,128,224,7,                           //and           $0x7,%r8b
10030   65,254,200,                             //dec           %r8b
10031   65,128,248,6,                           //cmp           $0x6,%r8b
10032   119,236,                                //ja            11c4 <_sk_store_8888_avx+0xa0>
10033   65,15,182,192,                          //movzbl        %r8b,%eax
10034   76,141,5,85,0,0,0,                      //lea           0x55(%rip),%r8        # 1238 <_sk_store_8888_avx+0x114>
10035   73,99,4,128,                            //movslq        (%r8,%rax,4),%rax
10036   76,1,192,                               //add           %r8,%rax
10037   255,224,                                //jmpq          *%rax
  // Straight-line chain with no internal branches: it stores pixel 6 first and pixel 0
  // last, so a later entry point stores fewer pixels.
10038   196,67,125,25,193,1,                    //vextractf128  $0x1,%ymm8,%xmm9
10039   196,67,121,22,76,185,24,2,              //vpextrd       $0x2,%xmm9,0x18(%r9,%rdi,4)
10040   196,67,125,25,193,1,                    //vextractf128  $0x1,%ymm8,%xmm9
10041   196,67,121,22,76,185,20,1,              //vpextrd       $0x1,%xmm9,0x14(%r9,%rdi,4)
10042   196,67,125,25,193,1,                    //vextractf128  $0x1,%ymm8,%xmm9
10043   196,65,122,17,76,185,16,                //vmovss        %xmm9,0x10(%r9,%rdi,4)
10044   196,67,121,22,68,185,12,3,              //vpextrd       $0x3,%xmm8,0xc(%r9,%rdi,4)
10045   196,67,121,22,68,185,8,2,               //vpextrd       $0x2,%xmm8,0x8(%r9,%rdi,4)
10046   196,67,121,22,68,185,4,1,               //vpextrd       $0x1,%xmm8,0x4(%r9,%rdi,4)
10047   196,65,121,126,4,185,                   //vmovd         %xmm8,(%r9,%rdi,4)
10048   235,143,                                //jmp           11c4 <_sk_store_8888_avx+0xa0>
  // 3-byte nop, then the jump table (offset 0x114): seven little-endian 32-bit offsets
  // relative to the table itself.  These bytes are data, so the disassembly shown beside
  // them is meaningless.
10049   15,31,0,                                //nopl          (%rax)
10050   245,                                    //cmc
10051   255,                                    //(bad)
10052   255,                                    //(bad)
10053   255,                                    //(bad)
10054   237,                                    //in            (%dx),%eax
10055   255,                                    //(bad)
10056   255,                                    //(bad)
10057   255,229,                                //jmpq          *%rbp
10058   255,                                    //(bad)
10059   255,                                    //(bad)
10060   255,                                    //(bad)
10061   221,255,                                //(bad)
10062   255,                                    //(bad)
10063   255,208,                                //callq         *%rax
10064   255,                                    //(bad)
10065   255,                                    //(bad)
10066   255,194,                                //inc           %edx
10067   255,                                    //(bad)
10068   255,                                    //(bad)
10069   255,                                    //.byte         0xff
10070   180,255,                                //mov           $0xff,%ah
10071   255,                                    //(bad)
10072   255,                                    //.byte         0xff
10073 };
10074 
// sk_load_f16_avx: x86-64 AVX machine code; the trailing comments are its AT&T disassembly.
// Reads pixels made of four 16-bit values (8 bytes per pixel) and expands them into the
// four float vectors ymm0..ymm3.
//   * The first lods fetches a pointer from the stream at rsi; dereferencing it gives the
//     source base address in rax.  Pixels are addressed as rax + rdi*8.
//   * rcx == 0: 64 bytes (8 pixels) are loaded into xmm8, xmm2, xmm3, xmm9.
//     rcx != 0: the tail path at the bottom loads 8 bytes per pixel, zeroes the registers
//     it does not fill, and rejoins at offset 0x25.
//   * Two rounds of vpunpck{l,h}wd regroup the 16-bit words before widening.
//   * Every 16-bit lane that compares (signed) below 0x0400 is forced to zero.
//   * Each 16-bit lane is zero-extended to 32 bits, shifted left by 13, and multiplied by
//     0x77800000 (the float 2^112).
//   * Ends by loading the next stream word with lods and jumping to it.
// NOTE(review): the shift-by-13 / 2^112 sequence looks like a half-to-float conversion with
// denormals (and negative inputs, because the compare is signed) flushed to zero -- confirm.
// NOTE(review): rdi and rcx look like the pixel index and the tail count -- confirm with the caller.
10075 CODE const uint8_t sk_load_f16_avx[] = {
10076   72,173,                                 //lods          %ds:(%rsi),%rax
10077   72,139,0,                               //mov           (%rax),%rax
10078   72,133,201,                             //test          %rcx,%rcx
10079   15,133,2,1,0,0,                         //jne           1364 <_sk_load_f16_avx+0x110>
10080   197,121,16,4,248,                       //vmovupd       (%rax,%rdi,8),%xmm8
10081   197,249,16,84,248,16,                   //vmovupd       0x10(%rax,%rdi,8),%xmm2
10082   197,249,16,92,248,32,                   //vmovupd       0x20(%rax,%rdi,8),%xmm3
10083   197,122,111,76,248,48,                  //vmovdqu       0x30(%rax,%rdi,8),%xmm9
  // Offset 0x25: the tail path rejoins here.  Regroup the 16-bit words.
10084   197,185,97,194,                         //vpunpcklwd    %xmm2,%xmm8,%xmm0
10085   197,185,105,210,                        //vpunpckhwd    %xmm2,%xmm8,%xmm2
10086   196,193,97,97,201,                      //vpunpcklwd    %xmm9,%xmm3,%xmm1
10087   196,193,97,105,217,                     //vpunpckhwd    %xmm9,%xmm3,%xmm3
10088   197,121,97,194,                         //vpunpcklwd    %xmm2,%xmm0,%xmm8
10089   197,249,105,194,                        //vpunpckhwd    %xmm2,%xmm0,%xmm0
10090   197,241,97,211,                         //vpunpcklwd    %xmm3,%xmm1,%xmm2
10091   197,113,105,203,                        //vpunpckhwd    %xmm3,%xmm1,%xmm9
  // xmm3 = 0x0400 in every 16-bit lane.  vpcmpgtw builds a mask of lanes where
  // 0x0400 > value (signed); vpandn then clears exactly those lanes.
10092   184,0,4,0,4,                            //mov           $0x4000400,%eax
10093   197,249,110,216,                        //vmovd         %eax,%xmm3
10094   197,249,112,219,0,                      //vpshufd       $0x0,%xmm3,%xmm3
10095   196,193,97,101,200,                     //vpcmpgtw      %xmm8,%xmm3,%xmm1
10096   196,65,113,223,192,                     //vpandn        %xmm8,%xmm1,%xmm8
10097   197,225,101,200,                        //vpcmpgtw      %xmm0,%xmm3,%xmm1
10098   197,241,223,192,                        //vpandn        %xmm0,%xmm1,%xmm0
10099   197,225,101,202,                        //vpcmpgtw      %xmm2,%xmm3,%xmm1
10100   197,241,223,202,                        //vpandn        %xmm2,%xmm1,%xmm1
10101   196,193,97,101,209,                     //vpcmpgtw      %xmm9,%xmm3,%xmm2
10102   196,193,105,223,209,                    //vpandn        %xmm9,%xmm2,%xmm2
  // Widen 16 -> 32 bits: vpmovzxwd takes the low four words, vpunpckhwd against the
  // zeroed xmm3 takes the high four.
10103   196,66,121,51,208,                      //vpmovzxwd     %xmm8,%xmm10
10104   196,98,121,51,201,                      //vpmovzxwd     %xmm1,%xmm9
10105   197,225,239,219,                        //vpxor         %xmm3,%xmm3,%xmm3
10106   197,57,105,195,                         //vpunpckhwd    %xmm3,%xmm8,%xmm8
10107   197,241,105,203,                        //vpunpckhwd    %xmm3,%xmm1,%xmm1
10108   196,98,121,51,216,                      //vpmovzxwd     %xmm0,%xmm11
10109   196,98,121,51,226,                      //vpmovzxwd     %xmm2,%xmm12
10110   197,121,105,235,                        //vpunpckhwd    %xmm3,%xmm0,%xmm13
10111   197,105,105,243,                        //vpunpckhwd    %xmm3,%xmm2,%xmm14
  // ymm0 = (lanes << 13) * 2^112; ymm9 holds the 2^112 multiplier for all four outputs.
10112   196,193,121,114,242,13,                 //vpslld        $0xd,%xmm10,%xmm0
10113   196,193,105,114,241,13,                 //vpslld        $0xd,%xmm9,%xmm2
10114   196,227,125,24,194,1,                   //vinsertf128   $0x1,%xmm2,%ymm0,%ymm0
10115   184,0,0,128,119,                        //mov           $0x77800000,%eax
10116   197,249,110,208,                        //vmovd         %eax,%xmm2
10117   197,249,112,210,0,                      //vpshufd       $0x0,%xmm2,%xmm2
10118   196,99,109,24,202,1,                    //vinsertf128   $0x1,%xmm2,%ymm2,%ymm9
10119   197,180,89,192,                         //vmulps        %ymm0,%ymm9,%ymm0
  // ymm1, ymm2, ymm3: same shift-and-scale for the remaining lane groups.
10120   196,193,105,114,240,13,                 //vpslld        $0xd,%xmm8,%xmm2
10121   197,241,114,241,13,                     //vpslld        $0xd,%xmm1,%xmm1
10122   196,227,109,24,201,1,                   //vinsertf128   $0x1,%xmm1,%ymm2,%ymm1
10123   197,180,89,201,                         //vmulps        %ymm1,%ymm9,%ymm1
10124   196,193,105,114,243,13,                 //vpslld        $0xd,%xmm11,%xmm2
10125   196,193,97,114,244,13,                  //vpslld        $0xd,%xmm12,%xmm3
10126   196,227,109,24,211,1,                   //vinsertf128   $0x1,%xmm3,%ymm2,%ymm2
10127   197,180,89,210,                         //vmulps        %ymm2,%ymm9,%ymm2
10128   196,193,57,114,245,13,                  //vpslld        $0xd,%xmm13,%xmm8
10129   196,193,97,114,246,13,                  //vpslld        $0xd,%xmm14,%xmm3
10130   196,227,61,24,219,1,                    //vinsertf128   $0x1,%xmm3,%ymm8,%ymm3
10131   197,180,89,219,                         //vmulps        %ymm3,%ymm9,%ymm3
  // Hand off: fetch the next stream word and jump to it.
10132   72,173,                                 //lods          %ds:(%rsi),%rax
10133   255,224,                                //jmpq          *%rax
  // Tail path (offset 0x110, rcx != 0): load one 8-byte pixel per step, comparing rcx
  // against 1, 3, 5 and 7 to decide when to stop.  xmm9 is zeroed up front; the two stubs
  // at the end zero xmm3 (and xmm2) when the tail stops before filling them.
10134   197,123,16,4,248,                       //vmovsd        (%rax,%rdi,8),%xmm8
10135   196,65,49,239,201,                      //vpxor         %xmm9,%xmm9,%xmm9
10136   72,131,249,1,                           //cmp           $0x1,%rcx
10137   116,79,                                 //je            13c3 <_sk_load_f16_avx+0x16f>
10138   197,57,22,68,248,8,                     //vmovhpd       0x8(%rax,%rdi,8),%xmm8,%xmm8
10139   72,131,249,3,                           //cmp           $0x3,%rcx
10140   114,67,                                 //jb            13c3 <_sk_load_f16_avx+0x16f>
10141   197,251,16,84,248,16,                   //vmovsd        0x10(%rax,%rdi,8),%xmm2
10142   72,131,249,3,                           //cmp           $0x3,%rcx
10143   116,68,                                 //je            13d0 <_sk_load_f16_avx+0x17c>
10144   197,233,22,84,248,24,                   //vmovhpd       0x18(%rax,%rdi,8),%xmm2,%xmm2
10145   72,131,249,5,                           //cmp           $0x5,%rcx
10146   114,56,                                 //jb            13d0 <_sk_load_f16_avx+0x17c>
10147   197,251,16,92,248,32,                   //vmovsd        0x20(%rax,%rdi,8),%xmm3
10148   72,131,249,5,                           //cmp           $0x5,%rcx
10149   15,132,209,254,255,255,                 //je            1279 <_sk_load_f16_avx+0x25>
10150   197,225,22,92,248,40,                   //vmovhpd       0x28(%rax,%rdi,8),%xmm3,%xmm3
10151   72,131,249,7,                           //cmp           $0x7,%rcx
10152   15,130,193,254,255,255,                 //jb            1279 <_sk_load_f16_avx+0x25>
10153   197,122,126,76,248,48,                  //vmovq         0x30(%rax,%rdi,8),%xmm9
10154   233,182,254,255,255,                    //jmpq          1279 <_sk_load_f16_avx+0x25>
  // Offset 0x16f: only xmm8 was filled -- zero xmm3 and xmm2, then rejoin.
10155   197,225,87,219,                         //vxorpd        %xmm3,%xmm3,%xmm3
10156   197,233,87,210,                         //vxorpd        %xmm2,%xmm2,%xmm2
10157   233,169,254,255,255,                    //jmpq          1279 <_sk_load_f16_avx+0x25>
  // Offset 0x17c: xmm8 and xmm2 were filled -- zero xmm3, then rejoin.
10158   197,225,87,219,                         //vxorpd        %xmm3,%xmm3,%xmm3
10159   233,160,254,255,255,                    //jmpq          1279 <_sk_load_f16_avx+0x25>
10160 };
10161 
// sk_store_f16_avx: x86-64 AVX machine code; the trailing comments are its AT&T disassembly.
// Narrows the four float vectors ymm0..ymm3 to 16-bit values and writes them out as
// 8-byte pixels.
//   * The first lods fetches a pointer from the stream at rsi; dereferencing it gives the
//     destination base address in r8.  Pixels are addressed as r8 + rdi*8.
//   * Each of ymm0..ymm3 is multiplied by 0x07800000 (the float 2^-112) and every 32-bit
//     lane is then shifted right by 13.
//   * vpslldq by 2 bytes plus vpor merges pairs of results into one register, and
//     vpunpck{l,h}dq interleaves those into xmm11, xmm10, xmm9, xmm8 in memory order.
//   * rcx == 0: four 16-byte stores write 8 pixels.  rcx != 0: the tail path writes one
//     8-byte pixel per step.
//   * Ends by loading the next stream word with lods and jumping to it.
// NOTE(review): the 2^-112 / shift-by-13 sequence looks like a float-to-half conversion
// with no rounding or range handling -- confirm.
// NOTE(review): rdi and rcx look like the pixel index and the tail count -- confirm with the caller.
10162 CODE const uint8_t sk_store_f16_avx[] = {
10163   72,173,                                 //lods          %ds:(%rsi),%rax
10164   76,139,0,                               //mov           (%rax),%r8
  // ymm8 = 2^-112 broadcast to all lanes.
10165   184,0,0,128,7,                          //mov           $0x7800000,%eax
10166   197,121,110,192,                        //vmovd         %eax,%xmm8
10167   196,65,121,112,192,0,                   //vpshufd       $0x0,%xmm8,%xmm8
10168   196,67,61,24,192,1,                     //vinsertf128   $0x1,%xmm8,%ymm8,%ymm8
  // For each input: scale, split into 128-bit halves, shift every lane right by 13.
10169   197,60,89,200,                          //vmulps        %ymm0,%ymm8,%ymm9
10170   196,67,125,25,202,1,                    //vextractf128  $0x1,%ymm9,%xmm10
10171   196,193,41,114,210,13,                  //vpsrld        $0xd,%xmm10,%xmm10
10172   196,193,49,114,209,13,                  //vpsrld        $0xd,%xmm9,%xmm9
10173   197,60,89,217,                          //vmulps        %ymm1,%ymm8,%ymm11
10174   196,67,125,25,220,1,                    //vextractf128  $0x1,%ymm11,%xmm12
10175   196,193,25,114,212,13,                  //vpsrld        $0xd,%xmm12,%xmm12
10176   196,193,33,114,211,13,                  //vpsrld        $0xd,%xmm11,%xmm11
10177   197,60,89,234,                          //vmulps        %ymm2,%ymm8,%ymm13
10178   196,67,125,25,238,1,                    //vextractf128  $0x1,%ymm13,%xmm14
10179   196,193,9,114,214,13,                   //vpsrld        $0xd,%xmm14,%xmm14
10180   196,193,17,114,213,13,                  //vpsrld        $0xd,%xmm13,%xmm13
10181   197,60,89,195,                          //vmulps        %ymm3,%ymm8,%ymm8
10182   196,67,125,25,199,1,                    //vextractf128  $0x1,%ymm8,%xmm15
10183   196,193,1,114,215,13,                   //vpsrld        $0xd,%xmm15,%xmm15
10184   196,193,57,114,208,13,                  //vpsrld        $0xd,%xmm8,%xmm8
  // Merge pairs: the ymm1 result moves up 2 bytes and is OR-ed with the ymm0 result; the
  // same is done for ymm3 with ymm2.
10185   196,193,33,115,251,2,                   //vpslldq       $0x2,%xmm11,%xmm11
10186   196,65,33,235,201,                      //vpor          %xmm9,%xmm11,%xmm9
10187   196,193,33,115,252,2,                   //vpslldq       $0x2,%xmm12,%xmm11
10188   196,65,33,235,226,                      //vpor          %xmm10,%xmm11,%xmm12
10189   196,193,57,115,248,2,                   //vpslldq       $0x2,%xmm8,%xmm8
10190   196,65,57,235,197,                      //vpor          %xmm13,%xmm8,%xmm8
10191   196,193,41,115,255,2,                   //vpslldq       $0x2,%xmm15,%xmm10
10192   196,65,41,235,238,                      //vpor          %xmm14,%xmm10,%xmm13
  // Interleave the merged pairs into four 16-byte groups of two pixels each.
10193   196,65,49,98,216,                       //vpunpckldq    %xmm8,%xmm9,%xmm11
10194   196,65,49,106,208,                      //vpunpckhdq    %xmm8,%xmm9,%xmm10
10195   196,65,25,98,205,                       //vpunpckldq    %xmm13,%xmm12,%xmm9
10196   196,65,25,106,197,                      //vpunpckhdq    %xmm13,%xmm12,%xmm8
10197   72,133,201,                             //test          %rcx,%rcx
10198   117,31,                                 //jne           14af <_sk_store_f16_avx+0xd6>
10199   196,65,120,17,28,248,                   //vmovups       %xmm11,(%r8,%rdi,8)
10200   196,65,120,17,84,248,16,                //vmovups       %xmm10,0x10(%r8,%rdi,8)
10201   196,65,120,17,76,248,32,                //vmovups       %xmm9,0x20(%r8,%rdi,8)
10202   196,65,122,127,68,248,48,               //vmovdqu       %xmm8,0x30(%r8,%rdi,8)
  // Offset 0xd2: common exit -- fetch the next stream word and jump to it.
10203   72,173,                                 //lods          %ds:(%rsi),%rax
10204   255,224,                                //jmpq          *%rax
  // Tail path (offset 0xd6, rcx != 0): store one 8-byte pixel per step.  Each je that
  // follows a store reuses the flags of the preceding cmp; the stores leave flags alone.
10205   196,65,121,214,28,248,                  //vmovq         %xmm11,(%r8,%rdi,8)
10206   72,131,249,1,                           //cmp           $0x1,%rcx
10207   116,240,                                //je            14ab <_sk_store_f16_avx+0xd2>
10208   196,65,121,23,92,248,8,                 //vmovhpd       %xmm11,0x8(%r8,%rdi,8)
10209   72,131,249,3,                           //cmp           $0x3,%rcx
10210   114,227,                                //jb            14ab <_sk_store_f16_avx+0xd2>
10211   196,65,121,214,84,248,16,               //vmovq         %xmm10,0x10(%r8,%rdi,8)
10212   116,218,                                //je            14ab <_sk_store_f16_avx+0xd2>
10213   196,65,121,23,84,248,24,                //vmovhpd       %xmm10,0x18(%r8,%rdi,8)
10214   72,131,249,5,                           //cmp           $0x5,%rcx
10215   114,205,                                //jb            14ab <_sk_store_f16_avx+0xd2>
10216   196,65,121,214,76,248,32,               //vmovq         %xmm9,0x20(%r8,%rdi,8)
10217   116,196,                                //je            14ab <_sk_store_f16_avx+0xd2>
10218   196,65,121,23,76,248,40,                //vmovhpd       %xmm9,0x28(%r8,%rdi,8)
10219   72,131,249,7,                           //cmp           $0x7,%rcx
10220   114,183,                                //jb            14ab <_sk_store_f16_avx+0xd2>
10221   196,65,121,214,68,248,48,               //vmovq         %xmm8,0x30(%r8,%rdi,8)
10222   235,174,                                //jmp           14ab <_sk_store_f16_avx+0xd2>
10223 };
10224 
// sk_store_f32_avx: x86-64 AVX machine code; the trailing comments are its AT&T disassembly.
// Interleaves the four float vectors ymm0..ymm3 and writes them out as 16-byte pixels
// (four floats each).
//   * The first lods fetches a pointer from the stream at rsi; dereferencing it gives the
//     destination base address in r8.
//   * rax = rdi*4 and every store uses r8 + rax*4, so pixels are addressed as r8 + rdi*16.
//   * vunpck{l,h}ps followed by vunpck{l,h}pd transposes within each 128-bit half, leaving
//     one whole pixel per half in ymm10, ymm9, ymm8, ymm11.
//   * rcx == 0: vinsertf128 / vperm2f128 put the halves in memory order and four 32-byte
//     stores write 8 pixels.  rcx != 0: the tail path writes one 16-byte pixel per step.
//   * Ends by loading the next stream word with lods and jumping to it.
// NOTE(review): rdi and rcx look like the pixel index and the tail count -- confirm with the caller.
10225 CODE const uint8_t sk_store_f32_avx[] = {
10226   72,173,                                 //lods          %ds:(%rsi),%rax
10227   76,139,0,                               //mov           (%rax),%r8
10228   72,141,4,189,0,0,0,0,                   //lea           0x0(,%rdi,4),%rax
  // Transpose within each 128-bit half.
10229   197,124,20,193,                         //vunpcklps     %ymm1,%ymm0,%ymm8
10230   197,124,21,217,                         //vunpckhps     %ymm1,%ymm0,%ymm11
10231   197,108,20,203,                         //vunpcklps     %ymm3,%ymm2,%ymm9
10232   197,108,21,227,                         //vunpckhps     %ymm3,%ymm2,%ymm12
10233   196,65,61,20,209,                       //vunpcklpd     %ymm9,%ymm8,%ymm10
10234   196,65,61,21,201,                       //vunpckhpd     %ymm9,%ymm8,%ymm9
10235   196,65,37,20,196,                       //vunpcklpd     %ymm12,%ymm11,%ymm8
10236   196,65,37,21,220,                       //vunpckhpd     %ymm12,%ymm11,%ymm11
10237   72,133,201,                             //test          %rcx,%rcx
10238   117,55,                                 //jne           156a <_sk_store_f32_avx+0x6d>
  // Full path: low halves form pixels 0..3, high halves form pixels 4..7.
10239   196,67,45,24,225,1,                     //vinsertf128   $0x1,%xmm9,%ymm10,%ymm12
10240   196,67,61,24,235,1,                     //vinsertf128   $0x1,%xmm11,%ymm8,%ymm13
10241   196,67,45,6,201,49,                     //vperm2f128    $0x31,%ymm9,%ymm10,%ymm9
10242   196,67,61,6,195,49,                     //vperm2f128    $0x31,%ymm11,%ymm8,%ymm8
10243   196,65,125,17,36,128,                   //vmovupd       %ymm12,(%r8,%rax,4)
10244   196,65,125,17,108,128,32,               //vmovupd       %ymm13,0x20(%r8,%rax,4)
10245   196,65,125,17,76,128,64,                //vmovupd       %ymm9,0x40(%r8,%rax,4)
10246   196,65,125,17,68,128,96,                //vmovupd       %ymm8,0x60(%r8,%rax,4)
  // Offset 0x69: common exit -- fetch the next stream word and jump to it.
10247   72,173,                                 //lods          %ds:(%rsi),%rax
10248   255,224,                                //jmpq          *%rax
  // Tail path (offset 0x6d, rcx != 0): store one 16-byte pixel per step.  Each je that
  // follows a store reuses the flags of the preceding cmp; the stores leave flags alone.
10249   196,65,121,17,20,128,                   //vmovupd       %xmm10,(%r8,%rax,4)
10250   72,131,249,1,                           //cmp           $0x1,%rcx
10251   116,240,                                //je            1566 <_sk_store_f32_avx+0x69>
10252   196,65,121,17,76,128,16,                //vmovupd       %xmm9,0x10(%r8,%rax,4)
10253   72,131,249,3,                           //cmp           $0x3,%rcx
10254   114,227,                                //jb            1566 <_sk_store_f32_avx+0x69>
10255   196,65,121,17,68,128,32,                //vmovupd       %xmm8,0x20(%r8,%rax,4)
10256   116,218,                                //je            1566 <_sk_store_f32_avx+0x69>
10257   196,65,121,17,92,128,48,                //vmovupd       %xmm11,0x30(%r8,%rax,4)
10258   72,131,249,5,                           //cmp           $0x5,%rcx
10259   114,205,                                //jb            1566 <_sk_store_f32_avx+0x69>
10260   196,67,125,25,84,128,64,1,              //vextractf128  $0x1,%ymm10,0x40(%r8,%rax,4)
10261   116,195,                                //je            1566 <_sk_store_f32_avx+0x69>
10262   196,67,125,25,76,128,80,1,              //vextractf128  $0x1,%ymm9,0x50(%r8,%rax,4)
10263   72,131,249,7,                           //cmp           $0x7,%rcx
10264   114,181,                                //jb            1566 <_sk_store_f32_avx+0x69>
10265   196,67,125,25,68,128,96,1,              //vextractf128  $0x1,%ymm8,0x60(%r8,%rax,4)
10266   235,171,                                //jmp           1566 <_sk_store_f32_avx+0x69>
10267 };
10268 
// sk_clamp_x_avx: x86-64 AVX machine code; the trailing comments are its AT&T disassembly.
// Clamps the eight floats in ymm0:
//     ymm0 = min(max(0, ymm0), L')
// L is the float at the address fetched by the first lods, broadcast to all lanes.  L' is
// L with its 32-bit pattern decremented by one (vpaddd with all-ones, per 128-bit half).
// Ends by loading the next stream word with lods and jumping to it.
// NOTE(review): for a positive finite L, L' is the next float below L, which would make
// the upper bound exclusive -- confirm that L is always positive.
10269 CODE const uint8_t sk_clamp_x_avx[] = {
10270   72,173,                                 //lods          %ds:(%rsi),%rax
  // ymm9 = max(0, ymm0)
10271   196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
10272   197,60,95,200,                          //vmaxps        %ymm0,%ymm8,%ymm9
  // ymm0 = bits(L) - 1 in every lane (xmm10 = all-ones = integer -1)
10273   196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
10274   196,99,125,25,192,1,                    //vextractf128  $0x1,%ymm8,%xmm0
10275   196,65,41,118,210,                      //vpcmpeqd      %xmm10,%xmm10,%xmm10
10276   196,193,121,254,194,                    //vpaddd        %xmm10,%xmm0,%xmm0
10277   196,65,57,254,194,                      //vpaddd        %xmm10,%xmm8,%xmm8
10278   196,227,61,24,192,1,                    //vinsertf128   $0x1,%xmm0,%ymm8,%ymm0
10279   197,180,93,192,                         //vminps        %ymm0,%ymm9,%ymm0
10280   72,173,                                 //lods          %ds:(%rsi),%rax
10281   255,224,                                //jmpq          *%rax
10282 };
10283 
// sk_clamp_y_avx: x86-64 AVX machine code; the trailing comments are its AT&T disassembly.
// Clamps the eight floats in ymm1:
//     ymm1 = min(max(0, ymm1), L')
// L is the float at the address fetched by the first lods, broadcast to all lanes.  L' is
// L with its 32-bit pattern decremented by one (vpaddd with all-ones, per 128-bit half).
// Ends by loading the next stream word with lods and jumping to it.
// NOTE(review): for a positive finite L, L' is the next float below L, which would make
// the upper bound exclusive -- confirm that L is always positive.
10284 CODE const uint8_t sk_clamp_y_avx[] = {
10285   72,173,                                 //lods          %ds:(%rsi),%rax
  // ymm9 = max(0, ymm1)
10286   196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
10287   197,60,95,201,                          //vmaxps        %ymm1,%ymm8,%ymm9
  // ymm1 = bits(L) - 1 in every lane (xmm10 = all-ones = integer -1)
10288   196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
10289   196,99,125,25,193,1,                    //vextractf128  $0x1,%ymm8,%xmm1
10290   196,65,41,118,210,                      //vpcmpeqd      %xmm10,%xmm10,%xmm10
10291   196,193,113,254,202,                    //vpaddd        %xmm10,%xmm1,%xmm1
10292   196,65,57,254,194,                      //vpaddd        %xmm10,%xmm8,%xmm8
10293   196,227,61,24,201,1,                    //vinsertf128   $0x1,%xmm1,%ymm8,%ymm1
10294   197,180,93,201,                         //vminps        %ymm1,%ymm9,%ymm1
10295   72,173,                                 //lods          %ds:(%rsi),%rax
10296   255,224,                                //jmpq          *%rax
10297 };
10298 
// sk_repeat_x_avx: x86-64 AVX machine code; the trailing comments are its AT&T disassembly.
// Wraps the eight floats in ymm0 into a repeating range:
//     ymm0 = min(ymm0 - floor(ymm0 / L) * L, L')
// L is the float at the address fetched by the first lods, broadcast to all lanes.  L' is
// L with its 32-bit pattern decremented by one (vpaddd with all-ones, per 128-bit half).
// floor is vroundps with immediate 1 (round toward negative infinity).
// Ends by loading the next stream word with lods and jumping to it.
// NOTE(review): for a positive finite L, L' is the next float below L, which would keep
// the result strictly below L -- confirm that L is always positive.
10299 CODE const uint8_t sk_repeat_x_avx[] = {
10300   72,173,                                 //lods          %ds:(%rsi),%rax
10301   196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
  // ymm9 = ymm0 - floor(ymm0 / L) * L
10302   196,65,124,94,200,                      //vdivps        %ymm8,%ymm0,%ymm9
10303   196,67,125,8,201,1,                     //vroundps      $0x1,%ymm9,%ymm9
10304   196,65,52,89,200,                       //vmulps        %ymm8,%ymm9,%ymm9
10305   196,65,124,92,201,                      //vsubps        %ymm9,%ymm0,%ymm9
  // ymm0 = bits(L) - 1 in every lane, then the final min
10306   196,99,125,25,192,1,                    //vextractf128  $0x1,%ymm8,%xmm0
10307   196,65,41,118,210,                      //vpcmpeqd      %xmm10,%xmm10,%xmm10
10308   196,193,121,254,194,                    //vpaddd        %xmm10,%xmm0,%xmm0
10309   196,65,57,254,194,                      //vpaddd        %xmm10,%xmm8,%xmm8
10310   196,227,61,24,192,1,                    //vinsertf128   $0x1,%xmm0,%ymm8,%ymm0
10311   197,180,93,192,                         //vminps        %ymm0,%ymm9,%ymm0
10312   72,173,                                 //lods          %ds:(%rsi),%rax
10313   255,224,                                //jmpq          *%rax
10314 };
10315 
// sk_repeat_y_avx: x86-64 AVX machine code; the trailing comments are its AT&T disassembly.
// Wraps the eight floats in ymm1 into a repeating range:
//     ymm1 = min(ymm1 - floor(ymm1 / L) * L, L')
// L is the float at the address fetched by the first lods, broadcast to all lanes.  L' is
// L with its 32-bit pattern decremented by one (vpaddd with all-ones, per 128-bit half).
// floor is vroundps with immediate 1 (round toward negative infinity).
// Ends by loading the next stream word with lods and jumping to it.
// NOTE(review): for a positive finite L, L' is the next float below L, which would keep
// the result strictly below L -- confirm that L is always positive.
10316 CODE const uint8_t sk_repeat_y_avx[] = {
10317   72,173,                                 //lods          %ds:(%rsi),%rax
10318   196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
  // ymm9 = ymm1 - floor(ymm1 / L) * L
10319   196,65,116,94,200,                      //vdivps        %ymm8,%ymm1,%ymm9
10320   196,67,125,8,201,1,                     //vroundps      $0x1,%ymm9,%ymm9
10321   196,65,52,89,200,                       //vmulps        %ymm8,%ymm9,%ymm9
10322   196,65,116,92,201,                      //vsubps        %ymm9,%ymm1,%ymm9
  // ymm1 = bits(L) - 1 in every lane, then the final min
10323   196,99,125,25,193,1,                    //vextractf128  $0x1,%ymm8,%xmm1
10324   196,65,41,118,210,                      //vpcmpeqd      %xmm10,%xmm10,%xmm10
10325   196,193,113,254,202,                    //vpaddd        %xmm10,%xmm1,%xmm1
10326   196,65,57,254,194,                      //vpaddd        %xmm10,%xmm8,%xmm8
10327   196,227,61,24,201,1,                    //vinsertf128   $0x1,%xmm1,%ymm8,%ymm1
10328   197,180,93,201,                         //vminps        %ymm1,%ymm9,%ymm1
10329   72,173,                                 //lods          %ds:(%rsi),%rax
10330   255,224,                                //jmpq          *%rax
10331 };
10332 
// sk_mirror_x_avx: x86-64 AVX machine code; the trailing comments are its AT&T disassembly.
// Reflects the eight floats in ymm0 back and forth across a range.  With L the float at
// the address fetched by the first lods:
//     t    = ymm0 - L
//     r    = t - floor(t / (2*L)) * (2*L)
//     ymm0 = min(|r - L|, L')
// L' is L with its 32-bit pattern decremented by one (vpaddd with all-ones, per 128-bit
// half).  floor is vroundps with immediate 1 (round toward negative infinity).  The
// absolute value is formed as (0 - v) & v, which clears the sign bit.
// Ends by loading the next stream word with lods and jumping to it.
// NOTE(review): for a positive finite L, L' is the next float below L, which would keep
// the result strictly below L -- confirm that L is always positive.
10333 CODE const uint8_t sk_mirror_x_avx[] = {
10334   72,173,                                 //lods          %ds:(%rsi),%rax
  // xmm8 = L (scalar); ymm9 = L broadcast; ymm10 = t = ymm0 - L
10335   197,121,110,0,                          //vmovd         (%rax),%xmm8
10336   196,65,121,112,200,0,                   //vpshufd       $0x0,%xmm8,%xmm9
10337   196,67,53,24,201,1,                     //vinsertf128   $0x1,%xmm9,%ymm9,%ymm9
10338   196,65,124,92,209,                      //vsubps        %ymm9,%ymm0,%ymm10
  // ymm0 = 2*L broadcast (L + L computed as a scalar, then splatted)
10339   196,193,58,88,192,                      //vaddss        %xmm8,%xmm8,%xmm0
10340   196,227,121,4,192,0,                    //vpermilps     $0x0,%xmm0,%xmm0
10341   196,227,125,24,192,1,                   //vinsertf128   $0x1,%xmm0,%ymm0,%ymm0
  // ymm0 = (t - floor(t / 2L) * 2L) - L
10342   197,44,94,192,                          //vdivps        %ymm0,%ymm10,%ymm8
10343   196,67,125,8,192,1,                     //vroundps      $0x1,%ymm8,%ymm8
10344   197,188,89,192,                         //vmulps        %ymm0,%ymm8,%ymm0
10345   197,172,92,192,                         //vsubps        %ymm0,%ymm10,%ymm0
10346   196,193,124,92,193,                     //vsubps        %ymm9,%ymm0,%ymm0
  // ymm8 = |ymm0|, computed as (0 - ymm0) & ymm0
10347   196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
10348   197,60,92,192,                          //vsubps        %ymm0,%ymm8,%ymm8
10349   197,60,84,192,                          //vandps        %ymm0,%ymm8,%ymm8
  // ymm0 = bits(L) - 1 in every lane, then the final min
10350   196,99,125,25,200,1,                    //vextractf128  $0x1,%ymm9,%xmm0
10351   196,65,41,118,210,                      //vpcmpeqd      %xmm10,%xmm10,%xmm10
10352   196,193,121,254,194,                    //vpaddd        %xmm10,%xmm0,%xmm0
10353   196,65,49,254,202,                      //vpaddd        %xmm10,%xmm9,%xmm9
10354   196,227,53,24,192,1,                    //vinsertf128   $0x1,%xmm0,%ymm9,%ymm0
10355   197,188,93,192,                         //vminps        %ymm0,%ymm8,%ymm0
10356   72,173,                                 //lods          %ds:(%rsi),%rax
10357   255,224,                                //jmpq          *%rax
10358 };
10359 
// sk_mirror_y_avx: same instruction sequence as the mirror_x stage, applied to %ymm1 instead of %ymm0.
// It fetches a pointer from the stream at (%rsi), reads one float L through it, and computes:
//   t    = ymm1 - L
//   t    = t - floor(t / 2L) * 2L     (vroundps $0x1 = round toward -infinity)
//   v    = |t - L|                    (built as (0 - u) & u, which clears the sign bit)
//   ymm1 = min(v, L with 1 subtracted from its 32-bit bit pattern)
// The final lods/jmpq pair loads the next pointer from (%rsi) and tail-jumps to it.
// Scratch registers written: %rax, %ymm8, %ymm9, %ymm10. %ymm0 is not touched.
// NOTE(review): "%ymm1 holds y coordinates" is inferred from the stage name only — confirm.
10360 CODE const uint8_t sk_mirror_y_avx[] = {
10361   72,173,                                 //lods          %ds:(%rsi),%rax
10362   197,121,110,0,                          //vmovd         (%rax),%xmm8
10363   196,65,121,112,200,0,                   //vpshufd       $0x0,%xmm8,%xmm9
10364   196,67,53,24,201,1,                     //vinsertf128   $0x1,%xmm9,%ymm9,%ymm9
10365   196,65,116,92,209,                      //vsubps        %ymm9,%ymm1,%ymm10
  // ymm1 = broadcast(L + L), the period used for the floor-based modulo below.
10366   196,193,58,88,200,                      //vaddss        %xmm8,%xmm8,%xmm1
10367   196,227,121,4,201,0,                    //vpermilps     $0x0,%xmm1,%xmm1
10368   196,227,117,24,201,1,                   //vinsertf128   $0x1,%xmm1,%ymm1,%ymm1
10369   197,44,94,193,                          //vdivps        %ymm1,%ymm10,%ymm8
10370   196,67,125,8,192,1,                     //vroundps      $0x1,%ymm8,%ymm8
10371   197,188,89,201,                         //vmulps        %ymm1,%ymm8,%ymm1
10372   197,172,92,201,                         //vsubps        %ymm1,%ymm10,%ymm1
10373   196,193,116,92,201,                     //vsubps        %ymm9,%ymm1,%ymm1
  // Absolute value: AND the value with its own negation.
10374   196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
10375   197,60,92,193,                          //vsubps        %ymm1,%ymm8,%ymm8
10376   197,60,84,193,                          //vandps        %ymm1,%ymm8,%ymm8
  // Add integer -1 (all-ones from vpcmpeqd) to L's bit pattern, one 128-bit half at a time,
  // then clamp the result to that upper bound.
10377   196,99,125,25,201,1,                    //vextractf128  $0x1,%ymm9,%xmm1
10378   196,65,41,118,210,                      //vpcmpeqd      %xmm10,%xmm10,%xmm10
10379   196,193,113,254,202,                    //vpaddd        %xmm10,%xmm1,%xmm1
10380   196,65,49,254,202,                      //vpaddd        %xmm10,%xmm9,%xmm9
10381   196,227,53,24,201,1,                    //vinsertf128   $0x1,%xmm1,%ymm9,%ymm1
10382   197,188,93,201,                         //vminps        %ymm1,%ymm8,%ymm1
10383   72,173,                                 //lods          %ds:(%rsi),%rax
10384   255,224,                                //jmpq          *%rax
10385 };
10386 
// sk_luminance_to_alpha_avx: machine code for an AVX stage that folds %ymm0..%ymm2 into %ymm3.
//   ymm3 = ymm0 * 0x3e59b3d0 + ymm1 * 0x3f371759 + ymm2 * 0x3d93dd98
// The three immediates are IEEE-754 single-precision bit patterns (approximately 0.2126, 0.7152
// and 0.0722) that are materialised in %eax and broadcast to all eight lanes.
// Afterwards %ymm0, %ymm1 and %ymm2 are zeroed. The previous contents of %ymm3 are discarded.
// No context pointer is read: the only lods loads the next pointer from (%rsi), which the final
// jmpq tail-jumps to.
// NOTE(review): the r/g/b/a meaning of ymm0..ymm3 is inferred from the stage name — confirm.
10387 CODE const uint8_t sk_luminance_to_alpha_avx[] = {
10388   184,208,179,89,62,                      //mov           $0x3e59b3d0,%eax
10389   197,249,110,216,                        //vmovd         %eax,%xmm3
10390   196,227,121,4,219,0,                    //vpermilps     $0x0,%xmm3,%xmm3
10391   196,227,101,24,219,1,                   //vinsertf128   $0x1,%xmm3,%ymm3,%ymm3
10392   197,228,89,192,                         //vmulps        %ymm0,%ymm3,%ymm0
10393   184,89,23,55,63,                        //mov           $0x3f371759,%eax
10394   197,249,110,216,                        //vmovd         %eax,%xmm3
10395   196,227,121,4,219,0,                    //vpermilps     $0x0,%xmm3,%xmm3
10396   196,227,101,24,219,1,                   //vinsertf128   $0x1,%xmm3,%ymm3,%ymm3
10397   197,228,89,201,                         //vmulps        %ymm1,%ymm3,%ymm1
10398   197,252,88,193,                         //vaddps        %ymm1,%ymm0,%ymm0
10399   184,152,221,147,61,                     //mov           $0x3d93dd98,%eax
10400   197,249,110,200,                        //vmovd         %eax,%xmm1
10401   196,227,121,4,201,0,                    //vpermilps     $0x0,%xmm1,%xmm1
10402   196,227,117,24,201,1,                   //vinsertf128   $0x1,%xmm1,%ymm1,%ymm1
10403   197,244,89,202,                         //vmulps        %ymm2,%ymm1,%ymm1
10404   197,252,88,217,                         //vaddps        %ymm1,%ymm0,%ymm3
10405   72,173,                                 //lods          %ds:(%rsi),%rax
10406   197,252,87,192,                         //vxorps        %ymm0,%ymm0,%ymm0
10407   197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
10408   197,236,87,210,                         //vxorps        %ymm2,%ymm2,%ymm2
10409   255,224,                                //jmpq          *%rax
10410 };
10411 
// sk_matrix_2x3_avx: machine code for an AVX stage applying a 2x3 affine transform to (%ymm0, %ymm1).
// It fetches a pointer m from the stream at (%rsi) and treats it as six consecutive floats:
//   new ymm0 = ymm0 * m[0] + (ymm1 * m[2] + m[4])
//   new ymm1 = ymm0 * m[1] + (ymm1 * m[3] + m[5])
// Both results use the incoming ymm0/ymm1; the first result is parked in %ymm8 until the end.
// The final lods/jmpq pair loads the next pointer from (%rsi) and tail-jumps to it.
// Scratch registers written: %rax, %ymm8..%ymm11.
10412 CODE const uint8_t sk_matrix_2x3_avx[] = {
10413   72,173,                                 //lods          %ds:(%rsi),%rax
10414   196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
10415   196,98,125,24,72,8,                     //vbroadcastss  0x8(%rax),%ymm9
10416   196,98,125,24,80,16,                    //vbroadcastss  0x10(%rax),%ymm10
10417   197,52,89,201,                          //vmulps        %ymm1,%ymm9,%ymm9
10418   196,65,52,88,202,                       //vaddps        %ymm10,%ymm9,%ymm9
10419   197,60,89,192,                          //vmulps        %ymm0,%ymm8,%ymm8
10420   196,65,60,88,193,                       //vaddps        %ymm9,%ymm8,%ymm8
10421   196,98,125,24,72,4,                     //vbroadcastss  0x4(%rax),%ymm9
10422   196,98,125,24,80,12,                    //vbroadcastss  0xc(%rax),%ymm10
10423   196,98,125,24,88,20,                    //vbroadcastss  0x14(%rax),%ymm11
10424   197,172,89,201,                         //vmulps        %ymm1,%ymm10,%ymm1
10425   196,193,116,88,203,                     //vaddps        %ymm11,%ymm1,%ymm1
10426   197,180,89,192,                         //vmulps        %ymm0,%ymm9,%ymm0
10427   197,252,88,201,                         //vaddps        %ymm1,%ymm0,%ymm1
10428   72,173,                                 //lods          %ds:(%rsi),%rax
10429   197,124,41,192,                         //vmovaps       %ymm8,%ymm0
10430   255,224,                                //jmpq          *%rax
10431 };
10432 
// sk_matrix_3x4_avx: machine code for an AVX stage applying a 3x4 affine transform to
// (%ymm0, %ymm1, %ymm2). It fetches a pointer m from the stream at (%rsi) and treats it as
// twelve consecutive floats:
//   new ymm0 = ymm0 * m[0] + ymm1 * m[3] + ymm2 * m[6] + m[9]
//   new ymm1 = ymm0 * m[1] + ymm1 * m[4] + ymm2 * m[7] + m[10]
//   new ymm2 = ymm0 * m[2] + ymm1 * m[5] + ymm2 * m[8] + m[11]
// All three results use the incoming values; the first two are parked in %ymm8/%ymm9 and
// moved into place just before the jump.
// The final lods/jmpq pair loads the next pointer from (%rsi) and tail-jumps to it.
// Scratch registers written: %rax, %ymm8..%ymm13.
10433 CODE const uint8_t sk_matrix_3x4_avx[] = {
10434   72,173,                                 //lods          %ds:(%rsi),%rax
10435   196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
10436   196,98,125,24,72,12,                    //vbroadcastss  0xc(%rax),%ymm9
10437   196,98,125,24,80,24,                    //vbroadcastss  0x18(%rax),%ymm10
10438   196,98,125,24,88,36,                    //vbroadcastss  0x24(%rax),%ymm11
10439   197,44,89,210,                          //vmulps        %ymm2,%ymm10,%ymm10
10440   196,65,44,88,211,                       //vaddps        %ymm11,%ymm10,%ymm10
10441   197,52,89,201,                          //vmulps        %ymm1,%ymm9,%ymm9
10442   196,65,52,88,202,                       //vaddps        %ymm10,%ymm9,%ymm9
10443   197,60,89,192,                          //vmulps        %ymm0,%ymm8,%ymm8
10444   196,65,60,88,193,                       //vaddps        %ymm9,%ymm8,%ymm8
10445   196,98,125,24,72,4,                     //vbroadcastss  0x4(%rax),%ymm9
10446   196,98,125,24,80,16,                    //vbroadcastss  0x10(%rax),%ymm10
10447   196,98,125,24,88,28,                    //vbroadcastss  0x1c(%rax),%ymm11
10448   196,98,125,24,96,40,                    //vbroadcastss  0x28(%rax),%ymm12
10449   197,36,89,218,                          //vmulps        %ymm2,%ymm11,%ymm11
10450   196,65,36,88,220,                       //vaddps        %ymm12,%ymm11,%ymm11
10451   197,44,89,209,                          //vmulps        %ymm1,%ymm10,%ymm10
10452   196,65,44,88,211,                       //vaddps        %ymm11,%ymm10,%ymm10
10453   197,52,89,200,                          //vmulps        %ymm0,%ymm9,%ymm9
10454   196,65,52,88,202,                       //vaddps        %ymm10,%ymm9,%ymm9
10455   196,98,125,24,80,8,                     //vbroadcastss  0x8(%rax),%ymm10
10456   196,98,125,24,88,20,                    //vbroadcastss  0x14(%rax),%ymm11
10457   196,98,125,24,96,32,                    //vbroadcastss  0x20(%rax),%ymm12
10458   196,98,125,24,104,44,                   //vbroadcastss  0x2c(%rax),%ymm13
10459   197,156,89,210,                         //vmulps        %ymm2,%ymm12,%ymm2
10460   196,193,108,88,213,                     //vaddps        %ymm13,%ymm2,%ymm2
10461   197,164,89,201,                         //vmulps        %ymm1,%ymm11,%ymm1
10462   197,244,88,202,                         //vaddps        %ymm2,%ymm1,%ymm1
10463   197,172,89,192,                         //vmulps        %ymm0,%ymm10,%ymm0
10464   197,252,88,209,                         //vaddps        %ymm1,%ymm0,%ymm2
10465   72,173,                                 //lods          %ds:(%rsi),%rax
10466   197,124,41,192,                         //vmovaps       %ymm8,%ymm0
10467   197,124,41,201,                         //vmovaps       %ymm9,%ymm1
10468   255,224,                                //jmpq          *%rax
10469 };
10470 
// sk_matrix_4x5_avx: machine code for an AVX stage applying a 4x5 affine transform to
// (%ymm0, %ymm1, %ymm2, %ymm3). It fetches a pointer m from the stream at (%rsi) and treats it
// as twenty consecutive floats:
//   new ymm0 = ymm0 * m[0] + ymm1 * m[4] + ymm2 * m[8]  + ymm3 * m[12] + m[16]
//   new ymm1 = ymm0 * m[1] + ymm1 * m[5] + ymm2 * m[9]  + ymm3 * m[13] + m[17]
//   new ymm2 = ymm0 * m[2] + ymm1 * m[6] + ymm2 * m[10] + ymm3 * m[14] + m[18]
//   new ymm3 = ymm0 * m[3] + ymm1 * m[7] + ymm2 * m[11] + ymm3 * m[15] + m[19]
// All four results use the incoming values; the first three are parked in %ymm8..%ymm10 and
// moved into place just before the jump.
// The final lods/jmpq pair loads the next pointer from (%rsi) and tail-jumps to it.
// Scratch registers written: %rax, %ymm8..%ymm15.
10471 CODE const uint8_t sk_matrix_4x5_avx[] = {
10472   72,173,                                 //lods          %ds:(%rsi),%rax
  // Row for the new ymm0 (kept in ymm8).
10473   196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
10474   196,98,125,24,72,16,                    //vbroadcastss  0x10(%rax),%ymm9
10475   196,98,125,24,80,32,                    //vbroadcastss  0x20(%rax),%ymm10
10476   196,98,125,24,88,48,                    //vbroadcastss  0x30(%rax),%ymm11
10477   196,98,125,24,96,64,                    //vbroadcastss  0x40(%rax),%ymm12
10478   197,36,89,219,                          //vmulps        %ymm3,%ymm11,%ymm11
10479   196,65,36,88,220,                       //vaddps        %ymm12,%ymm11,%ymm11
10480   197,44,89,210,                          //vmulps        %ymm2,%ymm10,%ymm10
10481   196,65,44,88,211,                       //vaddps        %ymm11,%ymm10,%ymm10
10482   197,52,89,201,                          //vmulps        %ymm1,%ymm9,%ymm9
10483   196,65,52,88,202,                       //vaddps        %ymm10,%ymm9,%ymm9
10484   197,60,89,192,                          //vmulps        %ymm0,%ymm8,%ymm8
10485   196,65,60,88,193,                       //vaddps        %ymm9,%ymm8,%ymm8
  // Row for the new ymm1 (kept in ymm9).
10486   196,98,125,24,72,4,                     //vbroadcastss  0x4(%rax),%ymm9
10487   196,98,125,24,80,20,                    //vbroadcastss  0x14(%rax),%ymm10
10488   196,98,125,24,88,36,                    //vbroadcastss  0x24(%rax),%ymm11
10489   196,98,125,24,96,52,                    //vbroadcastss  0x34(%rax),%ymm12
10490   196,98,125,24,104,68,                   //vbroadcastss  0x44(%rax),%ymm13
10491   197,28,89,227,                          //vmulps        %ymm3,%ymm12,%ymm12
10492   196,65,28,88,229,                       //vaddps        %ymm13,%ymm12,%ymm12
10493   197,36,89,218,                          //vmulps        %ymm2,%ymm11,%ymm11
10494   196,65,36,88,220,                       //vaddps        %ymm12,%ymm11,%ymm11
10495   197,44,89,209,                          //vmulps        %ymm1,%ymm10,%ymm10
10496   196,65,44,88,211,                       //vaddps        %ymm11,%ymm10,%ymm10
10497   197,52,89,200,                          //vmulps        %ymm0,%ymm9,%ymm9
10498   196,65,52,88,202,                       //vaddps        %ymm10,%ymm9,%ymm9
  // Row for the new ymm2 (kept in ymm10).
10499   196,98,125,24,80,8,                     //vbroadcastss  0x8(%rax),%ymm10
10500   196,98,125,24,88,24,                    //vbroadcastss  0x18(%rax),%ymm11
10501   196,98,125,24,96,40,                    //vbroadcastss  0x28(%rax),%ymm12
10502   196,98,125,24,104,56,                   //vbroadcastss  0x38(%rax),%ymm13
10503   196,98,125,24,112,72,                   //vbroadcastss  0x48(%rax),%ymm14
10504   197,20,89,235,                          //vmulps        %ymm3,%ymm13,%ymm13
10505   196,65,20,88,238,                       //vaddps        %ymm14,%ymm13,%ymm13
10506   197,28,89,226,                          //vmulps        %ymm2,%ymm12,%ymm12
10507   196,65,28,88,229,                       //vaddps        %ymm13,%ymm12,%ymm12
10508   197,36,89,217,                          //vmulps        %ymm1,%ymm11,%ymm11
10509   196,65,36,88,220,                       //vaddps        %ymm12,%ymm11,%ymm11
10510   197,44,89,208,                          //vmulps        %ymm0,%ymm10,%ymm10
10511   196,65,44,88,211,                       //vaddps        %ymm11,%ymm10,%ymm10
  // Row for the new ymm3; this one consumes ymm0..ymm3 in place.
10512   196,98,125,24,88,12,                    //vbroadcastss  0xc(%rax),%ymm11
10513   196,98,125,24,96,28,                    //vbroadcastss  0x1c(%rax),%ymm12
10514   196,98,125,24,104,44,                   //vbroadcastss  0x2c(%rax),%ymm13
10515   196,98,125,24,112,60,                   //vbroadcastss  0x3c(%rax),%ymm14
10516   196,98,125,24,120,76,                   //vbroadcastss  0x4c(%rax),%ymm15
10517   197,140,89,219,                         //vmulps        %ymm3,%ymm14,%ymm3
10518   196,193,100,88,223,                     //vaddps        %ymm15,%ymm3,%ymm3
10519   197,148,89,210,                         //vmulps        %ymm2,%ymm13,%ymm2
10520   197,236,88,211,                         //vaddps        %ymm3,%ymm2,%ymm2
10521   197,156,89,201,                         //vmulps        %ymm1,%ymm12,%ymm1
10522   197,244,88,202,                         //vaddps        %ymm2,%ymm1,%ymm1
10523   197,164,89,192,                         //vmulps        %ymm0,%ymm11,%ymm0
10524   197,252,88,217,                         //vaddps        %ymm1,%ymm0,%ymm3
10525   72,173,                                 //lods          %ds:(%rsi),%rax
10526   197,124,41,192,                         //vmovaps       %ymm8,%ymm0
10527   197,124,41,201,                         //vmovaps       %ymm9,%ymm1
10528   197,124,41,210,                         //vmovaps       %ymm10,%ymm2
10529   255,224,                                //jmpq          *%rax
10530 };
10531 
// sk_matrix_perspective_avx: machine code for an AVX stage applying a 3x3 projective transform
// to (%ymm0, %ymm1). It fetches a pointer m from the stream at (%rsi) and treats it as nine
// consecutive floats:
//   X = ymm0 * m[0] + ymm1 * m[1] + m[2]
//   Y = ymm0 * m[3] + ymm1 * m[4] + m[5]
//   W = ymm0 * m[6] + ymm1 * m[7] + m[8]
//   new ymm0 = X * rcp(W),  new ymm1 = Y * rcp(W)
// rcp is vrcpps, an approximate reciprocal rather than a true division.
// The final lods/jmpq pair loads the next pointer from (%rsi) and tail-jumps to it.
// Scratch registers written: %rax, %ymm8..%ymm12.
10532 CODE const uint8_t sk_matrix_perspective_avx[] = {
10533   72,173,                                 //lods          %ds:(%rsi),%rax
10534   196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
10535   196,98,125,24,72,4,                     //vbroadcastss  0x4(%rax),%ymm9
10536   196,98,125,24,80,8,                     //vbroadcastss  0x8(%rax),%ymm10
10537   197,52,89,201,                          //vmulps        %ymm1,%ymm9,%ymm9
10538   196,65,52,88,202,                       //vaddps        %ymm10,%ymm9,%ymm9
10539   197,60,89,192,                          //vmulps        %ymm0,%ymm8,%ymm8
10540   196,65,60,88,193,                       //vaddps        %ymm9,%ymm8,%ymm8
10541   196,98,125,24,72,12,                    //vbroadcastss  0xc(%rax),%ymm9
10542   196,98,125,24,80,16,                    //vbroadcastss  0x10(%rax),%ymm10
10543   196,98,125,24,88,20,                    //vbroadcastss  0x14(%rax),%ymm11
10544   197,44,89,209,                          //vmulps        %ymm1,%ymm10,%ymm10
10545   196,65,44,88,211,                       //vaddps        %ymm11,%ymm10,%ymm10
10546   197,52,89,200,                          //vmulps        %ymm0,%ymm9,%ymm9
10547   196,65,52,88,202,                       //vaddps        %ymm10,%ymm9,%ymm9
10548   196,98,125,24,80,24,                    //vbroadcastss  0x18(%rax),%ymm10
10549   196,98,125,24,88,28,                    //vbroadcastss  0x1c(%rax),%ymm11
10550   196,98,125,24,96,32,                    //vbroadcastss  0x20(%rax),%ymm12
10551   197,164,89,201,                         //vmulps        %ymm1,%ymm11,%ymm1
10552   196,193,116,88,204,                     //vaddps        %ymm12,%ymm1,%ymm1
10553   197,172,89,192,                         //vmulps        %ymm0,%ymm10,%ymm0
10554   197,252,88,193,                         //vaddps        %ymm1,%ymm0,%ymm0
10555   197,252,83,200,                         //vrcpps        %ymm0,%ymm1
10556   197,188,89,193,                         //vmulps        %ymm1,%ymm8,%ymm0
10557   197,180,89,201,                         //vmulps        %ymm1,%ymm9,%ymm1
10558   72,173,                                 //lods          %ds:(%rsi),%rax
10559   255,224,                                //jmpq          *%rax
10560 };
10561 
// sk_linear_gradient_2stops_avx: machine code for an AVX stage that evaluates four linear
// functions of the incoming %ymm0 (call it t). It fetches a pointer c from the stream at
// (%rsi) and treats it as eight consecutive floats:
//   new ymm0 = c[0] + t * c[4]
//   new ymm1 = c[1] + t * c[5]
//   new ymm2 = c[2] + t * c[6]
//   new ymm3 = c[3] + t * c[7]
// The previous contents of %ymm1..%ymm3 are discarded. The first result is parked in %ymm8
// because t (in %ymm0) is still needed for the remaining three.
// The final lods/jmpq pair loads the next pointer from (%rsi) and tail-jumps to it.
// Scratch registers written: %rax, %ymm8, %ymm9.
// NOTE(review): c[0..3] / c[4..7] look like a base colour and a per-channel delta — confirm.
10562 CODE const uint8_t sk_linear_gradient_2stops_avx[] = {
10563   72,173,                                 //lods          %ds:(%rsi),%rax
10564   196,226,125,24,72,16,                   //vbroadcastss  0x10(%rax),%ymm1
10565   196,226,125,24,16,                      //vbroadcastss  (%rax),%ymm2
10566   197,244,89,200,                         //vmulps        %ymm0,%ymm1,%ymm1
10567   197,108,88,193,                         //vaddps        %ymm1,%ymm2,%ymm8
10568   196,226,125,24,72,20,                   //vbroadcastss  0x14(%rax),%ymm1
10569   196,226,125,24,80,4,                    //vbroadcastss  0x4(%rax),%ymm2
10570   197,244,89,200,                         //vmulps        %ymm0,%ymm1,%ymm1
10571   197,236,88,201,                         //vaddps        %ymm1,%ymm2,%ymm1
10572   196,226,125,24,80,24,                   //vbroadcastss  0x18(%rax),%ymm2
10573   196,226,125,24,88,8,                    //vbroadcastss  0x8(%rax),%ymm3
10574   197,236,89,208,                         //vmulps        %ymm0,%ymm2,%ymm2
10575   197,228,88,210,                         //vaddps        %ymm2,%ymm3,%ymm2
10576   196,226,125,24,88,28,                   //vbroadcastss  0x1c(%rax),%ymm3
10577   196,98,125,24,72,12,                    //vbroadcastss  0xc(%rax),%ymm9
10578   197,228,89,192,                         //vmulps        %ymm0,%ymm3,%ymm0
10579   197,180,88,216,                         //vaddps        %ymm0,%ymm9,%ymm3
10580   72,173,                                 //lods          %ds:(%rsi),%rax
10581   197,124,41,192,                         //vmovaps       %ymm8,%ymm0
10582   255,224,                                //jmpq          *%rax
10583 };
10584 
// sk_start_pipeline_sse41: machine code for the SSE4.1 pipeline driver loop.
//
// Inputs are read from %rcx, %rdx, %r8 and %r9, and the prologue preserves %rsi, %rdi and
// %xmm6..%xmm15 in addition to %rbx/%r12..%r15 — the register set of the Microsoft x64
// calling convention. Roles as used by the code:
//   %rcx -> %rbx : running index, advanced by 4 per iteration
//   %rdx -> %rsi : pointer to a stream of 8-byte entries; the first entry is the code address
//                  to call (%r12) and the address just past it is kept in %r13
//   %r8  -> %r14 : opaque value forwarded to every call in %rdx
//   %r9  -> %r15 : limit the index is compared against (unsigned)
//
// While index + 4 <= limit, the loop zeroes %xmm0..%xmm7, sets %rdi = index, %rsi = %r13,
// %rdx = %r14, and calls *%r12. On exit %rax holds the first index that was not processed
// (the incoming index if no iteration ran).
//
// Stack: 7 pushes plus 0xa0 bytes holding the ten saved XMM registers, stored and reloaded
// with aligned movaps.
// NOTE(review): no extra 32-byte shadow area is reserved before the callq; presumably the
// callee is another generated stage that does not use one — confirm.
10585 CODE const uint8_t sk_start_pipeline_sse41[] = {
  // Prologue: save callee-saved general-purpose and XMM registers.
10586   65,87,                                  //push          %r15
10587   65,86,                                  //push          %r14
10588   65,85,                                  //push          %r13
10589   65,84,                                  //push          %r12
10590   86,                                     //push          %rsi
10591   87,                                     //push          %rdi
10592   83,                                     //push          %rbx
10593   72,129,236,160,0,0,0,                   //sub           $0xa0,%rsp
10594   68,15,41,188,36,144,0,0,0,              //movaps        %xmm15,0x90(%rsp)
10595   68,15,41,180,36,128,0,0,0,              //movaps        %xmm14,0x80(%rsp)
10596   68,15,41,108,36,112,                    //movaps        %xmm13,0x70(%rsp)
10597   68,15,41,100,36,96,                     //movaps        %xmm12,0x60(%rsp)
10598   68,15,41,92,36,80,                      //movaps        %xmm11,0x50(%rsp)
10599   68,15,41,84,36,64,                      //movaps        %xmm10,0x40(%rsp)
10600   68,15,41,76,36,48,                      //movaps        %xmm9,0x30(%rsp)
10601   68,15,41,68,36,32,                      //movaps        %xmm8,0x20(%rsp)
10602   15,41,124,36,16,                        //movaps        %xmm7,0x10(%rsp)
10603   15,41,52,36,                            //movaps        %xmm6,(%rsp)
  // Move the four incoming arguments into registers that survive the calls.
10604   77,137,207,                             //mov           %r9,%r15
10605   77,137,198,                             //mov           %r8,%r14
10606   72,137,203,                             //mov           %rcx,%rbx
10607   72,137,214,                             //mov           %rdx,%rsi
  // r12 = first stream entry (call target); r13 = stream pointer advanced past it.
10608   72,173,                                 //lods          %ds:(%rsi),%rax
10609   73,137,196,                             //mov           %rax,%r12
10610   73,137,245,                             //mov           %rsi,%r13
  // Skip the loop entirely when index + 4 > limit; the return value is then the input index.
10611   72,141,67,4,                            //lea           0x4(%rbx),%rax
10612   76,57,248,                              //cmp           %r15,%rax
10613   118,5,                                  //jbe           73 <_sk_start_pipeline_sse41+0x73>
10614   72,137,216,                             //mov           %rbx,%rax
10615   235,52,                                 //jmp           a7 <_sk_start_pipeline_sse41+0xa7>
  // Loop body (offset 0x73): clear xmm0..xmm7, set up rdi/rsi/rdx, call the first stage.
10616   15,87,192,                              //xorps         %xmm0,%xmm0
10617   15,87,201,                              //xorps         %xmm1,%xmm1
10618   15,87,210,                              //xorps         %xmm2,%xmm2
10619   15,87,219,                              //xorps         %xmm3,%xmm3
10620   15,87,228,                              //xorps         %xmm4,%xmm4
10621   15,87,237,                              //xorps         %xmm5,%xmm5
10622   15,87,246,                              //xorps         %xmm6,%xmm6
10623   15,87,255,                              //xorps         %xmm7,%xmm7
10624   72,137,223,                             //mov           %rbx,%rdi
10625   76,137,238,                             //mov           %r13,%rsi
10626   76,137,242,                             //mov           %r14,%rdx
10627   65,255,212,                             //callq         *%r12
  // rax = index + 4 (the next index); repeat while that next index + 4 <= limit.
  // The mov between cmp and jbe does not modify flags.
10628   72,141,67,4,                            //lea           0x4(%rbx),%rax
10629   72,131,195,8,                           //add           $0x8,%rbx
10630   76,57,251,                              //cmp           %r15,%rbx
10631   72,137,195,                             //mov           %rax,%rbx
10632   118,204,                                //jbe           73 <_sk_start_pipeline_sse41+0x73>
  // Epilogue (offset 0xa7): restore saved registers in reverse order and return rax.
10633   15,40,52,36,                            //movaps        (%rsp),%xmm6
10634   15,40,124,36,16,                        //movaps        0x10(%rsp),%xmm7
10635   68,15,40,68,36,32,                      //movaps        0x20(%rsp),%xmm8
10636   68,15,40,76,36,48,                      //movaps        0x30(%rsp),%xmm9
10637   68,15,40,84,36,64,                      //movaps        0x40(%rsp),%xmm10
10638   68,15,40,92,36,80,                      //movaps        0x50(%rsp),%xmm11
10639   68,15,40,100,36,96,                     //movaps        0x60(%rsp),%xmm12
10640   68,15,40,108,36,112,                    //movaps        0x70(%rsp),%xmm13
10641   68,15,40,180,36,128,0,0,0,              //movaps        0x80(%rsp),%xmm14
10642   68,15,40,188,36,144,0,0,0,              //movaps        0x90(%rsp),%xmm15
10643   72,129,196,160,0,0,0,                   //add           $0xa0,%rsp
10644   91,                                     //pop           %rbx
10645   95,                                     //pop           %rdi
10646   94,                                     //pop           %rsi
10647   65,92,                                  //pop           %r12
10648   65,93,                                  //pop           %r13
10649   65,94,                                  //pop           %r14
10650   65,95,                                  //pop           %r15
10651   195,                                    //retq
10652 };
10653 
// sk_just_return_sse41: a single retq. Unlike the other stages it does not load and jump to a
// next pointer, so control returns to whoever issued the original call.
// NOTE(review): presumably placed as the last entry of a stage stream to end the chain — confirm.
10654 CODE const uint8_t sk_just_return_sse41[] = {
10655   195,                                    //retq
10656 };
10657 
// sk_seed_shader_sse41: machine code for an SSE4.1 stage that initialises %xmm0..%xmm7.
// Inputs: %edi (32-bit integer), %rdx (pointer to four floats, read with unaligned movups),
// and a pointer fetched from the stream at (%rsi) whose target is read as a 32-bit integer.
//   xmm0 = float(edi) + 0.5 + the four floats at (%rdx)
//   xmm1 = float(int at context pointer) + 0.5, broadcast to all lanes
//   xmm2 = 1.0 in all lanes (0x3f800000)
//   xmm3..xmm7 = 0
// 0x3f000000 is the bit pattern of 0.5f. %ecx and %rax are used as scratch.
// The final lods/jmpq pair loads the next pointer from (%rsi) and tail-jumps to it.
// NOTE(review): presumably xmm0/xmm1 become pixel-centre x/y coordinates and (%rdx) holds
// per-lane offsets such as {0,1,2,3} — confirm against the stage's source.
10658 CODE const uint8_t sk_seed_shader_sse41[] = {
10659   72,173,                                 //lods          %ds:(%rsi),%rax
10660   102,15,110,199,                         //movd          %edi,%xmm0
10661   102,15,112,192,0,                       //pshufd        $0x0,%xmm0,%xmm0
10662   15,91,200,                              //cvtdq2ps      %xmm0,%xmm1
10663   185,0,0,0,63,                           //mov           $0x3f000000,%ecx
10664   102,15,110,209,                         //movd          %ecx,%xmm2
10665   15,198,210,0,                           //shufps        $0x0,%xmm2,%xmm2
10666   15,88,202,                              //addps         %xmm2,%xmm1
10667   15,16,2,                                //movups        (%rdx),%xmm0
10668   15,88,193,                              //addps         %xmm1,%xmm0
10669   102,15,110,8,                           //movd          (%rax),%xmm1
10670   102,15,112,201,0,                       //pshufd        $0x0,%xmm1,%xmm1
10671   15,91,201,                              //cvtdq2ps      %xmm1,%xmm1
10672   15,88,202,                              //addps         %xmm2,%xmm1
10673   184,0,0,128,63,                         //mov           $0x3f800000,%eax
10674   102,15,110,208,                         //movd          %eax,%xmm2
10675   15,198,210,0,                           //shufps        $0x0,%xmm2,%xmm2
10676   72,173,                                 //lods          %ds:(%rsi),%rax
10677   15,87,219,                              //xorps         %xmm3,%xmm3
10678   15,87,228,                              //xorps         %xmm4,%xmm4
10679   15,87,237,                              //xorps         %xmm5,%xmm5
10680   15,87,246,                              //xorps         %xmm6,%xmm6
10681   15,87,255,                              //xorps         %xmm7,%xmm7
10682   255,224,                                //jmpq          *%rax
10683 };
10684 
// sk_constant_color_sse41: machine code for an SSE4.1 stage that loads four floats and splats
// each one across its own register. It fetches a pointer from the stream at (%rsi), reads
// 16 bytes through it with an unaligned movups, then sets:
//   xmm0 = element 0 in all lanes, xmm1 = element 1, xmm2 = element 2, xmm3 = element 3
// The previous contents of %xmm0..%xmm3 are discarded.
// The final lods/jmpq pair loads the next pointer from (%rsi) and tail-jumps to it.
10685 CODE const uint8_t sk_constant_color_sse41[] = {
10686   72,173,                                 //lods          %ds:(%rsi),%rax
10687   15,16,24,                               //movups        (%rax),%xmm3
10688   15,40,195,                              //movaps        %xmm3,%xmm0
10689   15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
10690   15,40,203,                              //movaps        %xmm3,%xmm1
10691   15,198,201,85,                          //shufps        $0x55,%xmm1,%xmm1
10692   15,40,211,                              //movaps        %xmm3,%xmm2
10693   15,198,210,170,                         //shufps        $0xaa,%xmm2,%xmm2
10694   15,198,219,255,                         //shufps        $0xff,%xmm3,%xmm3
10695   72,173,                                 //lods          %ds:(%rsi),%rax
10696   255,224,                                //jmpq          *%rax
10697 };
10698 
// sk_clear_sse41: machine code for an SSE4.1 stage that zeroes %xmm0..%xmm3.
// No context pointer is read: the single lods at the top already fetches the next pointer
// from (%rsi), and the closing jmpq tail-jumps to it.
10699 CODE const uint8_t sk_clear_sse41[] = {
10700   72,173,                                 //lods          %ds:(%rsi),%rax
10701   15,87,192,                              //xorps         %xmm0,%xmm0
10702   15,87,201,                              //xorps         %xmm1,%xmm1
10703   15,87,210,                              //xorps         %xmm2,%xmm2
10704   15,87,219,                              //xorps         %xmm3,%xmm3
10705   255,224,                                //jmpq          *%rax
10706 };
10707 
// sk_plus__sse41: machine code for an SSE4.1 stage performing a lane-wise add:
//   xmm0 += xmm4, xmm1 += xmm5, xmm2 += xmm6, xmm3 += xmm7
// %xmm4..%xmm7 are left unchanged. No context pointer is read.
// The final lods/jmpq pair loads the next pointer from (%rsi) and tail-jumps to it.
// NOTE(review): presumably xmm0..3 are source r,g,b,a and xmm4..7 destination r,g,b,a — confirm.
10708 CODE const uint8_t sk_plus__sse41[] = {
10709   15,88,196,                              //addps         %xmm4,%xmm0
10710   15,88,205,                              //addps         %xmm5,%xmm1
10711   15,88,214,                              //addps         %xmm6,%xmm2
10712   15,88,223,                              //addps         %xmm7,%xmm3
10713   72,173,                                 //lods          %ds:(%rsi),%rax
10714   255,224,                                //jmpq          *%rax
10715 };
10716 
// sk_srcover_sse41: machine code for an SSE4.1 stage computing, with k = 1.0 - xmm3
// (taken from the incoming %xmm3; 0x3f800000 is the bit pattern of 1.0f):
//   xmm0 += xmm4 * k
//   xmm1 += xmm5 * k
//   xmm2 += xmm6 * k
//   xmm3 += xmm7 * k
// %xmm4..%xmm7 are left unchanged; %xmm8, %xmm9 and %rax are scratch. No context pointer is read.
// The final lods/jmpq pair loads the next pointer from (%rsi) and tail-jumps to it.
// NOTE(review): presumably xmm0..3 are source r,g,b,a and xmm4..7 destination r,g,b,a, making
// this the source-over blend — confirm.
10717 CODE const uint8_t sk_srcover_sse41[] = {
10718   184,0,0,128,63,                         //mov           $0x3f800000,%eax
10719   102,68,15,110,192,                      //movd          %eax,%xmm8
10720   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
10721   68,15,92,195,                           //subps         %xmm3,%xmm8
10722   69,15,40,200,                           //movaps        %xmm8,%xmm9
10723   68,15,89,204,                           //mulps         %xmm4,%xmm9
10724   65,15,88,193,                           //addps         %xmm9,%xmm0
10725   69,15,40,200,                           //movaps        %xmm8,%xmm9
10726   68,15,89,205,                           //mulps         %xmm5,%xmm9
10727   65,15,88,201,                           //addps         %xmm9,%xmm1
10728   69,15,40,200,                           //movaps        %xmm8,%xmm9
10729   68,15,89,206,                           //mulps         %xmm6,%xmm9
10730   65,15,88,209,                           //addps         %xmm9,%xmm2
10731   68,15,89,199,                           //mulps         %xmm7,%xmm8
10732   65,15,88,216,                           //addps         %xmm8,%xmm3
10733   72,173,                                 //lods          %ds:(%rsi),%rax
10734   255,224,                                //jmpq          *%rax
10735 };
10736 
// sk_dstover_sse41: machine code for an SSE4.1 stage computing, with k = 1.0 - xmm7
// (0x3f800000 is the bit pattern of 1.0f):
//   xmm0 = xmm0 * k + xmm4
//   xmm1 = xmm1 * k + xmm5
//   xmm2 = xmm2 * k + xmm6
//   xmm3 = xmm3 * k + xmm7
// %xmm4..%xmm7 are left unchanged; %xmm8 and %rax are scratch. No context pointer is read.
// The final lods/jmpq pair loads the next pointer from (%rsi) and tail-jumps to it.
// NOTE(review): presumably xmm0..3 are source r,g,b,a and xmm4..7 destination r,g,b,a, making
// this the destination-over blend — confirm.
10737 CODE const uint8_t sk_dstover_sse41[] = {
10738   184,0,0,128,63,                         //mov           $0x3f800000,%eax
10739   102,68,15,110,192,                      //movd          %eax,%xmm8
10740   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
10741   68,15,92,199,                           //subps         %xmm7,%xmm8
10742   65,15,89,192,                           //mulps         %xmm8,%xmm0
10743   15,88,196,                              //addps         %xmm4,%xmm0
10744   65,15,89,200,                           //mulps         %xmm8,%xmm1
10745   15,88,205,                              //addps         %xmm5,%xmm1
10746   65,15,89,208,                           //mulps         %xmm8,%xmm2
10747   15,88,214,                              //addps         %xmm6,%xmm2
10748   65,15,89,216,                           //mulps         %xmm8,%xmm3
10749   15,88,223,                              //addps         %xmm7,%xmm3
10750   72,173,                                 //lods          %ds:(%rsi),%rax
10751   255,224,                                //jmpq          *%rax
10752 };
10753 
// sk_clamp_0_sse41: machine code for an SSE4.1 stage that raises every lane of %xmm0..%xmm3
// to at least zero (maxps against a zeroed %xmm8). No context pointer is read.
// The final lods/jmpq pair loads the next pointer from (%rsi) and tail-jumps to it.
10754 CODE const uint8_t sk_clamp_0_sse41[] = {
10755   69,15,87,192,                           //xorps         %xmm8,%xmm8
10756   65,15,95,192,                           //maxps         %xmm8,%xmm0
10757   65,15,95,200,                           //maxps         %xmm8,%xmm1
10758   65,15,95,208,                           //maxps         %xmm8,%xmm2
10759   65,15,95,216,                           //maxps         %xmm8,%xmm3
10760   72,173,                                 //lods          %ds:(%rsi),%rax
10761   255,224,                                //jmpq          *%rax
10762 };
10763 
// sk_clamp_1_sse41: machine code for an SSE4.1 stage that caps every lane of %xmm0..%xmm3
// at 1.0 (minps against 0x3f800000, the bit pattern of 1.0f, broadcast into %xmm8).
// No context pointer is read; %rax and %xmm8 are scratch.
// The final lods/jmpq pair loads the next pointer from (%rsi) and tail-jumps to it.
10764 CODE const uint8_t sk_clamp_1_sse41[] = {
10765   184,0,0,128,63,                         //mov           $0x3f800000,%eax
10766   102,68,15,110,192,                      //movd          %eax,%xmm8
10767   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
10768   65,15,93,192,                           //minps         %xmm8,%xmm0
10769   65,15,93,200,                           //minps         %xmm8,%xmm1
10770   65,15,93,208,                           //minps         %xmm8,%xmm2
10771   65,15,93,216,                           //minps         %xmm8,%xmm3
10772   72,173,                                 //lods          %ds:(%rsi),%rax
10773   255,224,                                //jmpq          *%rax
10774 };
10775 
// sk_clamp_a_sse41: machine code for an SSE4.1 stage that first caps %xmm3 at 1.0
// (0x3f800000 is the bit pattern of 1.0f) and then caps %xmm0, %xmm1 and %xmm2 at the
// already-clamped %xmm3. No context pointer is read; %rax and %xmm8 are scratch.
// The final lods/jmpq pair loads the next pointer from (%rsi) and tail-jumps to it.
// NOTE(review): presumably xmm3 is alpha and this keeps colour channels <= alpha — confirm.
10776 CODE const uint8_t sk_clamp_a_sse41[] = {
10777   184,0,0,128,63,                         //mov           $0x3f800000,%eax
10778   102,68,15,110,192,                      //movd          %eax,%xmm8
10779   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
10780   65,15,93,216,                           //minps         %xmm8,%xmm3
10781   15,93,195,                              //minps         %xmm3,%xmm0
10782   15,93,203,                              //minps         %xmm3,%xmm1
10783   15,93,211,                              //minps         %xmm3,%xmm2
10784   72,173,                                 //lods          %ds:(%rsi),%rax
10785   255,224,                                //jmpq          *%rax
10786 };
10787 
// sk_set_rgb_sse41: machine code for an SSE4.1 stage that overwrites %xmm0..%xmm2 with
// constants. It fetches a pointer from the stream at (%rsi) and reads three consecutive floats
// through it (offsets 0, 4 and 8), broadcasting each across all four lanes:
//   xmm0 = f[0], xmm1 = f[1], xmm2 = f[2]
// %xmm3 is not modified.
// The final lods/jmpq pair loads the next pointer from (%rsi) and tail-jumps to it.
10788 CODE const uint8_t sk_set_rgb_sse41[] = {
10789   72,173,                                 //lods          %ds:(%rsi),%rax
10790   243,15,16,0,                            //movss         (%rax),%xmm0
10791   243,15,16,72,4,                         //movss         0x4(%rax),%xmm1
10792   15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
10793   15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
10794   243,15,16,80,8,                         //movss         0x8(%rax),%xmm2
10795   15,198,210,0,                           //shufps        $0x0,%xmm2,%xmm2
10796   72,173,                                 //lods          %ds:(%rsi),%rax
10797   255,224,                                //jmpq          *%rax
10798 };
10799 
// sk_swap_rb (SSE4.1 machine code): exchanges xmm0 and xmm2, using xmm8 as
// the temporary. xmm1 and xmm3 are not touched.
// The lods in the middle reads the next 8-byte value from (%rsi) into %rax;
// the final jmpq transfers control to it.
10800 CODE const uint8_t sk_swap_rb_sse41[] = {
10801   68,15,40,192,                           //movaps        %xmm0,%xmm8
10802   72,173,                                 //lods          %ds:(%rsi),%rax
10803   15,40,194,                              //movaps        %xmm2,%xmm0
10804   65,15,40,208,                           //movaps        %xmm8,%xmm2
10805   255,224,                                //jmpq          *%rax
10806 };
10807 
// sk_swap (SSE4.1 machine code): exchanges the register group xmm0..xmm3 with
// the group xmm4..xmm7 (xmm0<->xmm4, xmm1<->xmm5, xmm2<->xmm6, xmm3<->xmm7).
// xmm8..xmm11 are used as temporaries and are clobbered.
// The lods reads the next 8-byte value from (%rsi) into %rax; the final jmpq
// transfers control to it.
10808 CODE const uint8_t sk_swap_sse41[] = {
        // Stash xmm3, xmm2, xmm1, xmm0 in xmm8, xmm9, xmm10, xmm11.
10809   68,15,40,195,                           //movaps        %xmm3,%xmm8
10810   68,15,40,202,                           //movaps        %xmm2,%xmm9
10811   68,15,40,209,                           //movaps        %xmm1,%xmm10
10812   68,15,40,216,                           //movaps        %xmm0,%xmm11
10813   72,173,                                 //lods          %ds:(%rsi),%rax
        // xmm0..xmm3 = old xmm4..xmm7.
10814   15,40,196,                              //movaps        %xmm4,%xmm0
10815   15,40,205,                              //movaps        %xmm5,%xmm1
10816   15,40,214,                              //movaps        %xmm6,%xmm2
10817   15,40,223,                              //movaps        %xmm7,%xmm3
        // xmm4..xmm7 = stashed old xmm0..xmm3.
10818   65,15,40,227,                           //movaps        %xmm11,%xmm4
10819   65,15,40,234,                           //movaps        %xmm10,%xmm5
10820   65,15,40,241,                           //movaps        %xmm9,%xmm6
10821   65,15,40,248,                           //movaps        %xmm8,%xmm7
10822   255,224,                                //jmpq          *%rax
10823 };
10824 
// sk_move_src_dst (SSE4.1 machine code): copies xmm0..xmm3 into xmm4..xmm7
// (xmm0->xmm4, xmm1->xmm5, xmm2->xmm6, xmm3->xmm7). xmm0..xmm3 keep their
// values.
// The leading lods reads the next 8-byte value from (%rsi) into %rax; the
// final jmpq transfers control to it.
10825 CODE const uint8_t sk_move_src_dst_sse41[] = {
10826   72,173,                                 //lods          %ds:(%rsi),%rax
10827   15,40,224,                              //movaps        %xmm0,%xmm4
10828   15,40,233,                              //movaps        %xmm1,%xmm5
10829   15,40,242,                              //movaps        %xmm2,%xmm6
10830   15,40,251,                              //movaps        %xmm3,%xmm7
10831   255,224,                                //jmpq          *%rax
10832 };
10833 
// sk_move_dst_src (SSE4.1 machine code): copies xmm4..xmm7 into xmm0..xmm3
// (xmm4->xmm0, xmm5->xmm1, xmm6->xmm2, xmm7->xmm3). xmm4..xmm7 keep their
// values.
// The leading lods reads the next 8-byte value from (%rsi) into %rax; the
// final jmpq transfers control to it.
10834 CODE const uint8_t sk_move_dst_src_sse41[] = {
10835   72,173,                                 //lods          %ds:(%rsi),%rax
10836   15,40,196,                              //movaps        %xmm4,%xmm0
10837   15,40,205,                              //movaps        %xmm5,%xmm1
10838   15,40,214,                              //movaps        %xmm6,%xmm2
10839   15,40,223,                              //movaps        %xmm7,%xmm3
10840   255,224,                                //jmpq          *%rax
10841 };
10842 
// sk_premul (SSE4.1 machine code): multiplies xmm0, xmm1 and xmm2 per lane by
// xmm3; xmm3 itself is unchanged.
// NOTE(review): presumably xmm0..xmm2 are r,g,b and xmm3 is alpha -- inferred
// from the stage name only.
// Tail: lods reads the next 8-byte value at (%rsi) into %rax and the stage
// jumps to it.
10843 CODE const uint8_t sk_premul_sse41[] = {
10844   15,89,195,                              //mulps         %xmm3,%xmm0
10845   15,89,203,                              //mulps         %xmm3,%xmm1
10846   15,89,211,                              //mulps         %xmm3,%xmm2
10847   72,173,                                 //lods          %ds:(%rsi),%rax
10848   255,224,                                //jmpq          *%rax
10849 };
10850 
// sk_unpremul (SSE4.1 machine code): multiplies xmm0, xmm1 and xmm2 per lane
// by 1.0f/xmm3, except that lanes where xmm3 compares equal to 0 are
// multiplied by 0 instead. xmm3 is unchanged; xmm8 and xmm9 are clobbered.
// NOTE(review): presumably xmm0..xmm2 are r,g,b and xmm3 is alpha -- inferred
// from the stage name only.
// Tail: lods reads the next 8-byte value at (%rsi) into %rax and the stage
// jumps to it.
10851 CODE const uint8_t sk_unpremul_sse41[] = {
10852   69,15,87,192,                           //xorps         %xmm8,%xmm8
10853   184,0,0,128,63,                         //mov           $0x3f800000,%eax
10854   102,68,15,110,200,                      //movd          %eax,%xmm9
10855   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
        // xmm9 = 1.0f / xmm3 (per lane).
10856   68,15,94,203,                           //divps         %xmm3,%xmm9
        // xmm8 = all-ones mask in lanes where 0 != xmm3, all-zero otherwise.
10857   68,15,194,195,4,                        //cmpneqps      %xmm3,%xmm8
        // Masking the reciprocal zeroes the scale in the xmm3 == 0 lanes.
10858   69,15,84,193,                           //andps         %xmm9,%xmm8
10859   65,15,89,192,                           //mulps         %xmm8,%xmm0
10860   65,15,89,200,                           //mulps         %xmm8,%xmm1
10861   65,15,89,208,                           //mulps         %xmm8,%xmm2
10862   72,173,                                 //lods          %ds:(%rsi),%rax
10863   255,224,                                //jmpq          *%rax
10864 };
10865 
// sk_from_srgb (SSE4.1 machine code): applies the same piecewise curve to each
// of xmm0, xmm1 and xmm2; xmm3 is not touched. For each input lane x:
//     lo = x * C0                         C0 = 0x3d9e8391 (~0.0774)
//     hi = (C1*x + C2) * (x*x) + C3       C1 = 0x3e99999a (~0.3)
//                                         C2 = 0x3f328f5c (~0.6975)
//                                         C3 = 0x3b23d70a (~0.0025)
//     result = (x < C4) ? lo : hi         C4 = 0x3d6147ae (~0.055)
// NOTE(review): presumably a polynomial approximation of the sRGB->linear
// transfer function (C0 is close to 1/12.92) -- inferred from the stage name
// and constants; confirm against the generating source.
// xmm8..xmm15 are clobbered. blendvps takes its mask implicitly from xmm0,
// which is why xmm0 is reused as the mask/scratch register and the first
// channel's result is only moved back into xmm0 at the very end.
// The lods near the end reads the next 8-byte value from (%rsi) into %rax;
// the final jmpq transfers control to it.
10866 CODE const uint8_t sk_from_srgb_sse41[] = {
        // ---- channel in xmm0: result accumulates in xmm9 ----
        // xmm11 = C0 (broadcast); xmm10 = lo = C0 * x.
10867   184,145,131,158,61,                     //mov           $0x3d9e8391,%eax
10868   102,68,15,110,216,                      //movd          %eax,%xmm11
10869   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
10870   69,15,40,211,                           //movaps        %xmm11,%xmm10
10871   68,15,89,208,                           //mulps         %xmm0,%xmm10
        // xmm14 = x*x (xmm14 is later reloaded with C4).
10872   68,15,40,240,                           //movaps        %xmm0,%xmm14
10873   69,15,89,246,                           //mulps         %xmm14,%xmm14
        // xmm8 = C1, xmm12 = C2 (both broadcast).
10874   184,154,153,153,62,                     //mov           $0x3e99999a,%eax
10875   102,68,15,110,192,                      //movd          %eax,%xmm8
10876   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
10877   184,92,143,50,63,                       //mov           $0x3f328f5c,%eax
10878   102,68,15,110,224,                      //movd          %eax,%xmm12
10879   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
        // xmm9 = C1*x + C2.
10880   69,15,40,200,                           //movaps        %xmm8,%xmm9
10881   68,15,89,200,                           //mulps         %xmm0,%xmm9
10882   69,15,88,204,                           //addps         %xmm12,%xmm9
        // xmm13 = C3 (broadcast); xmm9 = hi = xmm9 * (x*x) + C3.
10883   184,10,215,35,59,                       //mov           $0x3b23d70a,%eax
10884   102,68,15,110,232,                      //movd          %eax,%xmm13
10885   69,15,198,237,0,                        //shufps        $0x0,%xmm13,%xmm13
10886   69,15,89,206,                           //mulps         %xmm14,%xmm9
10887   69,15,88,205,                           //addps         %xmm13,%xmm9
        // xmm14 = C4 (broadcast); xmm0 = mask (x < C4); pick lo where set.
10888   184,174,71,97,61,                       //mov           $0x3d6147ae,%eax
10889   102,68,15,110,240,                      //movd          %eax,%xmm14
10890   69,15,198,246,0,                        //shufps        $0x0,%xmm14,%xmm14
10891   65,15,194,198,1,                        //cmpltps       %xmm14,%xmm0
10892   102,69,15,56,20,202,                    //blendvps      %xmm0,%xmm10,%xmm9
        // ---- channel in xmm1: lo in xmm15, result accumulates in xmm10 ----
10893   69,15,40,251,                           //movaps        %xmm11,%xmm15
10894   68,15,89,249,                           //mulps         %xmm1,%xmm15
10895   15,40,193,                              //movaps        %xmm1,%xmm0
10896   15,89,192,                              //mulps         %xmm0,%xmm0
10897   69,15,40,208,                           //movaps        %xmm8,%xmm10
10898   68,15,89,209,                           //mulps         %xmm1,%xmm10
10899   69,15,88,212,                           //addps         %xmm12,%xmm10
10900   68,15,89,208,                           //mulps         %xmm0,%xmm10
10901   69,15,88,213,                           //addps         %xmm13,%xmm10
10902   65,15,194,206,1,                        //cmpltps       %xmm14,%xmm1
10903   15,40,193,                              //movaps        %xmm1,%xmm0
10904   102,69,15,56,20,215,                    //blendvps      %xmm0,%xmm15,%xmm10
        // ---- channel in xmm2: lo in xmm11, result accumulates in xmm8 ----
        // Last channel, so the constant registers xmm11/xmm8 are consumed in place.
10905   68,15,89,218,                           //mulps         %xmm2,%xmm11
10906   15,40,194,                              //movaps        %xmm2,%xmm0
10907   15,89,192,                              //mulps         %xmm0,%xmm0
10908   68,15,89,194,                           //mulps         %xmm2,%xmm8
10909   69,15,88,196,                           //addps         %xmm12,%xmm8
10910   68,15,89,192,                           //mulps         %xmm0,%xmm8
10911   69,15,88,197,                           //addps         %xmm13,%xmm8
10912   65,15,194,214,1,                        //cmpltps       %xmm14,%xmm2
10913   15,40,194,                              //movaps        %xmm2,%xmm0
10914   102,69,15,56,20,195,                    //blendvps      %xmm0,%xmm11,%xmm8
10915   72,173,                                 //lods          %ds:(%rsi),%rax
        // Move the three results back into xmm0, xmm1, xmm2.
10916   65,15,40,193,                           //movaps        %xmm9,%xmm0
10917   65,15,40,202,                           //movaps        %xmm10,%xmm1
10918   65,15,40,208,                           //movaps        %xmm8,%xmm2
10919   255,224,                                //jmpq          *%rax
10920 };
10921 
// sk_to_srgb (SSE4.1 machine code): applies the same piecewise curve to each
// of the incoming xmm0, xmm1 and xmm2; the incoming xmm3..xmm7 are restored
// unchanged on exit. For each input lane x:
//     rs = rsqrtps(x)
//     hi = min(1.0f, C2*rcpps(rs) + C3 + C1*rsqrtps(rs))
//              C1 = 0x3ed287c2 (~0.4112), C2 = 0x3f306fce (~0.6892),
//              C3 = 0x3dca57a8 with the sign bit flipped (~ -0.0988)
//     lo = x * C0                             C0 = 0x41475c29 (~12.46)
//     result = (x < C4) ? lo : hi             C4 = 0x3b8ce704 (~0.0043)
// rcpps(rs) and rsqrtps(rs) are hardware approximations of x^(1/2) and
// x^(1/4) respectively.
// NOTE(review): presumably an approximation of the linear->sRGB transfer
// function -- inferred from the stage name; confirm against the generator.
//
// Register juggling: on entry xmm1..xmm6 are shifted up by one register
// (xmm1->xmm2, ..., xmm6->xmm7) and xmm7 is spilled to the stack, so the
// second and third channels are processed from xmm2 and xmm3. The shift is
// undone before the final jump. xmm8..xmm15 are clobbered.
// Stack: 0x18 bytes are reserved and the spill uses movaps, which requires a
// 16-byte-aligned address -- i.e. this relies on %rsp being 8 mod 16 on entry.
// The lods near the end reads the next 8-byte value from (%rsi) into %rax;
// the final jmpq transfers control to it.
10922 CODE const uint8_t sk_to_srgb_sse41[] = {
10923   72,131,236,24,                          //sub           $0x18,%rsp
10924   15,41,60,36,                            //movaps        %xmm7,(%rsp)
10925   15,40,254,                              //movaps        %xmm6,%xmm7
10926   15,40,245,                              //movaps        %xmm5,%xmm6
10927   15,40,236,                              //movaps        %xmm4,%xmm5
10928   15,40,227,                              //movaps        %xmm3,%xmm4
10929   15,40,218,                              //movaps        %xmm2,%xmm3
10930   15,40,209,                              //movaps        %xmm1,%xmm2
        // ---- first channel (x in xmm0): result accumulates in xmm9 ----
        // xmm8 = rs, xmm9 = rcpps(rs), xmm15 = rsqrtps(rs).
10931   68,15,82,192,                           //rsqrtps       %xmm0,%xmm8
10932   69,15,83,200,                           //rcpps         %xmm8,%xmm9
10933   69,15,82,248,                           //rsqrtps       %xmm8,%xmm15
        // xmm11 = C0 (broadcast); xmm10 = lo = C0 * x.
10934   184,41,92,71,65,                        //mov           $0x41475c29,%eax
10935   102,68,15,110,216,                      //movd          %eax,%xmm11
10936   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
10937   69,15,40,211,                           //movaps        %xmm11,%xmm10
10938   68,15,89,208,                           //mulps         %xmm0,%xmm10
        // xmm8 = 1.0f, xmm12 = C1, xmm13 = C2, xmm14 = C3 (all broadcast).
10939   184,0,0,128,63,                         //mov           $0x3f800000,%eax
10940   102,68,15,110,192,                      //movd          %eax,%xmm8
10941   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
10942   184,194,135,210,62,                     //mov           $0x3ed287c2,%eax
10943   102,68,15,110,224,                      //movd          %eax,%xmm12
10944   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
10945   184,206,111,48,63,                      //mov           $0x3f306fce,%eax
10946   102,68,15,110,232,                      //movd          %eax,%xmm13
10947   69,15,198,237,0,                        //shufps        $0x0,%xmm13,%xmm13
        // The xor flips the float's sign bit, negating the constant.
10948   184,168,87,202,61,                      //mov           $0x3dca57a8,%eax
10949   53,0,0,0,128,                           //xor           $0x80000000,%eax
10950   102,68,15,110,240,                      //movd          %eax,%xmm14
10951   69,15,198,246,0,                        //shufps        $0x0,%xmm14,%xmm14
        // xmm15 = C1*rsqrtps(rs) + (C2*rcpps(rs) + C3); xmm9 = min(1.0f, xmm15).
10952   69,15,89,205,                           //mulps         %xmm13,%xmm9
10953   69,15,88,206,                           //addps         %xmm14,%xmm9
10954   69,15,89,252,                           //mulps         %xmm12,%xmm15
10955   69,15,88,249,                           //addps         %xmm9,%xmm15
10956   69,15,40,200,                           //movaps        %xmm8,%xmm9
10957   69,15,93,207,                           //minps         %xmm15,%xmm9
        // xmm15 = C4 (broadcast); xmm0 = mask (x < C4); pick lo where set.
10958   184,4,231,140,59,                       //mov           $0x3b8ce704,%eax
10959   102,68,15,110,248,                      //movd          %eax,%xmm15
10960   69,15,198,255,0,                        //shufps        $0x0,%xmm15,%xmm15
10961   65,15,194,199,1,                        //cmpltps       %xmm15,%xmm0
10962   102,69,15,56,20,202,                    //blendvps      %xmm0,%xmm10,%xmm9
        // ---- second channel (x in xmm2): result is built directly in xmm1 ----
10963   68,15,82,210,                           //rsqrtps       %xmm2,%xmm10
10964   65,15,83,194,                           //rcpps         %xmm10,%xmm0
10965   69,15,82,210,                           //rsqrtps       %xmm10,%xmm10
10966   65,15,89,197,                           //mulps         %xmm13,%xmm0
10967   65,15,88,198,                           //addps         %xmm14,%xmm0
10968   69,15,89,212,                           //mulps         %xmm12,%xmm10
10969   68,15,88,208,                           //addps         %xmm0,%xmm10
10970   65,15,40,200,                           //movaps        %xmm8,%xmm1
10971   65,15,93,202,                           //minps         %xmm10,%xmm1
10972   69,15,40,211,                           //movaps        %xmm11,%xmm10
10973   68,15,89,210,                           //mulps         %xmm2,%xmm10
10974   65,15,194,215,1,                        //cmpltps       %xmm15,%xmm2
10975   15,40,194,                              //movaps        %xmm2,%xmm0
10976   102,65,15,56,20,202,                    //blendvps      %xmm0,%xmm10,%xmm1
        // ---- third channel (x in xmm3): result accumulates in xmm8 ----
        // Last channel, so the 1.0f (xmm8) and C0 (xmm11) registers are consumed in place.
10977   15,82,195,                              //rsqrtps       %xmm3,%xmm0
10978   15,83,208,                              //rcpps         %xmm0,%xmm2
10979   65,15,89,213,                           //mulps         %xmm13,%xmm2
10980   65,15,88,214,                           //addps         %xmm14,%xmm2
10981   15,82,192,                              //rsqrtps       %xmm0,%xmm0
10982   65,15,89,196,                           //mulps         %xmm12,%xmm0
10983   15,88,194,                              //addps         %xmm2,%xmm0
10984   68,15,93,192,                           //minps         %xmm0,%xmm8
10985   68,15,89,219,                           //mulps         %xmm3,%xmm11
10986   65,15,194,223,1,                        //cmpltps       %xmm15,%xmm3
10987   15,40,195,                              //movaps        %xmm3,%xmm0
10988   102,69,15,56,20,195,                    //blendvps      %xmm0,%xmm11,%xmm8
10989   72,173,                                 //lods          %ds:(%rsi),%rax
        // Results into xmm0 and xmm2 (xmm1 already holds the second channel),
        // then undo the entry-time register shift and release the stack slot.
10990   65,15,40,193,                           //movaps        %xmm9,%xmm0
10991   65,15,40,208,                           //movaps        %xmm8,%xmm2
10992   15,40,220,                              //movaps        %xmm4,%xmm3
10993   15,40,229,                              //movaps        %xmm5,%xmm4
10994   15,40,238,                              //movaps        %xmm6,%xmm5
10995   15,40,247,                              //movaps        %xmm7,%xmm6
10996   15,40,60,36,                            //movaps        (%rsp),%xmm7
10997   72,131,196,24,                          //add           $0x18,%rsp
10998   255,224,                                //jmpq          *%rax
10999 };
11000 
// sk_scale_1_float (SSE4.1 machine code): multiplies xmm0..xmm3 by a single
// float.
// The first lods reads an 8-byte pointer from (%rsi) into %rax; the float it
// points at is broadcast into all four lanes of xmm8 and each of xmm0..xmm3
// is multiplied by xmm8.
// The second lods reads the next 8-byte value from (%rsi) and the stage jumps
// to it.
11001 CODE const uint8_t sk_scale_1_float_sse41[] = {
11002   72,173,                                 //lods          %ds:(%rsi),%rax
11003   243,68,15,16,0,                         //movss         (%rax),%xmm8
11004   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
11005   65,15,89,192,                           //mulps         %xmm8,%xmm0
11006   65,15,89,200,                           //mulps         %xmm8,%xmm1
11007   65,15,89,208,                           //mulps         %xmm8,%xmm2
11008   65,15,89,216,                           //mulps         %xmm8,%xmm3
11009   72,173,                                 //lods          %ds:(%rsi),%rax
11010   255,224,                                //jmpq          *%rax
11011 };
11012 
// sk_scale_u8 (SSE4.1 machine code): multiplies xmm0..xmm3 per lane by four
// byte values read from memory and scaled by 0x3b808081 (~1/255).
// The first lods reads an 8-byte value from (%rsi) into %rax; that value is
// dereferenced once more to obtain the byte buffer's base address. Four
// consecutive bytes at base + %rdi are zero-extended to 32-bit lanes and
// converted to float. xmm8 and xmm9 are clobbered.
// NOTE(review): %rdi is presumably the pixel index of the current batch --
// not established in this chunk.
// The second lods reads the next 8-byte value from (%rsi) and the stage jumps
// to it.
11013 CODE const uint8_t sk_scale_u8_sse41[] = {
11014   72,173,                                 //lods          %ds:(%rsi),%rax
11015   72,139,0,                               //mov           (%rax),%rax
11016   102,68,15,56,49,4,56,                   //pmovzxbd      (%rax,%rdi,1),%xmm8
11017   69,15,91,192,                           //cvtdq2ps      %xmm8,%xmm8
        // xmm9 = float(byte) * ~1/255.
11018   184,129,128,128,59,                     //mov           $0x3b808081,%eax
11019   102,68,15,110,200,                      //movd          %eax,%xmm9
11020   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
11021   69,15,89,200,                           //mulps         %xmm8,%xmm9
11022   65,15,89,193,                           //mulps         %xmm9,%xmm0
11023   65,15,89,201,                           //mulps         %xmm9,%xmm1
11024   65,15,89,209,                           //mulps         %xmm9,%xmm2
11025   65,15,89,217,                           //mulps         %xmm9,%xmm3
11026   72,173,                                 //lods          %ds:(%rsi),%rax
11027   255,224,                                //jmpq          *%rax
11028 };
11029 
// sk_lerp_1_float (SSE4.1 machine code): linearly interpolates each of
// xmm0..xmm3 towards itself from the matching register in xmm4..xmm7 by a
// single float t:
//     xmm0 = (xmm0 - xmm4) * t + xmm4      (likewise xmm1/xmm5, xmm2/xmm6,
//                                           xmm3/xmm7)
// The first lods reads an 8-byte pointer from (%rsi) into %rax; t is the float
// it points at, broadcast into xmm8. xmm4..xmm7 are only read.
// The second lods reads the next 8-byte value from (%rsi) and the stage jumps
// to it.
11030 CODE const uint8_t sk_lerp_1_float_sse41[] = {
11031   72,173,                                 //lods          %ds:(%rsi),%rax
11032   243,68,15,16,0,                         //movss         (%rax),%xmm8
11033   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
11034   15,92,196,                              //subps         %xmm4,%xmm0
11035   65,15,89,192,                           //mulps         %xmm8,%xmm0
11036   15,88,196,                              //addps         %xmm4,%xmm0
11037   15,92,205,                              //subps         %xmm5,%xmm1
11038   65,15,89,200,                           //mulps         %xmm8,%xmm1
11039   15,88,205,                              //addps         %xmm5,%xmm1
11040   15,92,214,                              //subps         %xmm6,%xmm2
11041   65,15,89,208,                           //mulps         %xmm8,%xmm2
11042   15,88,214,                              //addps         %xmm6,%xmm2
11043   15,92,223,                              //subps         %xmm7,%xmm3
11044   65,15,89,216,                           //mulps         %xmm8,%xmm3
11045   15,88,223,                              //addps         %xmm7,%xmm3
11046   72,173,                                 //lods          %ds:(%rsi),%rax
11047   255,224,                                //jmpq          *%rax
11048 };
11049 
// sk_lerp_u8 (SSE4.1 machine code): per-lane linear interpolation of
// xmm0..xmm3 against xmm4..xmm7 using a factor t read from a byte buffer:
//     t    = float(byte) * 0x3b808081 (~1/255)
//     xmm0 = (xmm0 - xmm4) * t + xmm4      (likewise xmm1/xmm5, xmm2/xmm6,
//                                           xmm3/xmm7)
// The first lods reads an 8-byte value from (%rsi) into %rax; that value is
// dereferenced once more to obtain the buffer's base address, and four
// consecutive bytes at base + %rdi are zero-extended to 32-bit lanes.
// xmm8 and xmm9 are clobbered; xmm4..xmm7 are only read.
// The second lods reads the next 8-byte value from (%rsi) and the stage jumps
// to it.
11050 CODE const uint8_t sk_lerp_u8_sse41[] = {
11051   72,173,                                 //lods          %ds:(%rsi),%rax
11052   72,139,0,                               //mov           (%rax),%rax
11053   102,68,15,56,49,4,56,                   //pmovzxbd      (%rax,%rdi,1),%xmm8
11054   69,15,91,192,                           //cvtdq2ps      %xmm8,%xmm8
        // xmm9 = t.
11055   184,129,128,128,59,                     //mov           $0x3b808081,%eax
11056   102,68,15,110,200,                      //movd          %eax,%xmm9
11057   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
11058   69,15,89,200,                           //mulps         %xmm8,%xmm9
11059   15,92,196,                              //subps         %xmm4,%xmm0
11060   65,15,89,193,                           //mulps         %xmm9,%xmm0
11061   15,88,196,                              //addps         %xmm4,%xmm0
11062   15,92,205,                              //subps         %xmm5,%xmm1
11063   65,15,89,201,                           //mulps         %xmm9,%xmm1
11064   15,88,205,                              //addps         %xmm5,%xmm1
11065   15,92,214,                              //subps         %xmm6,%xmm2
11066   65,15,89,209,                           //mulps         %xmm9,%xmm2
11067   15,88,214,                              //addps         %xmm6,%xmm2
11068   15,92,223,                              //subps         %xmm7,%xmm3
11069   65,15,89,217,                           //mulps         %xmm9,%xmm3
11070   15,88,223,                              //addps         %xmm7,%xmm3
11071   72,173,                                 //lods          %ds:(%rsi),%rax
11072   255,224,                                //jmpq          *%rax
11073 };
11074 
// sk_lerp_565 (SSE4.1 machine code): per-lane linear interpolation of xmm0,
// xmm1, xmm2 against xmm4, xmm5, xmm6, with a separate factor per register
// unpacked from four 16-bit values v read from memory:
//     t0 = float(v & 0xf800) * 0x37842108 (~1/63488)  -> used for xmm0/xmm4
//     t1 = float(v & 0x07e0) * 0x3a020821 (~1/2016)   -> used for xmm1/xmm5
//     t2 = float(v & 0x001f) * 0x3d042108 (~1/31)     -> used for xmm2/xmm6
//     xmmN = (xmmN - xmm(N+4)) * tN + xmm(N+4)
// The incoming xmm3 is discarded (xmm3 is used as scratch from the start) and
// the stage leaves xmm3 = 1.0f in every lane. xmm7 is not touched.
// The first lods reads an 8-byte value from (%rsi) into %rax; that value is
// dereferenced once more to obtain the buffer's base address, and the four
// 16-bit values are read from base + 2*%rdi. xmm8..xmm11 are clobbered.
// The second lods reads the next 8-byte value from (%rsi) and the stage jumps
// to it.
11075 CODE const uint8_t sk_lerp_565_sse41[] = {
11076   72,173,                                 //lods          %ds:(%rsi),%rax
11077   72,139,0,                               //mov           (%rax),%rax
11078   102,68,15,56,51,4,120,                  //pmovzxwd      (%rax,%rdi,2),%xmm8
        // xmm10 = t0.
11079   184,0,248,0,0,                          //mov           $0xf800,%eax
11080   102,15,110,216,                         //movd          %eax,%xmm3
11081   102,15,112,219,0,                       //pshufd        $0x0,%xmm3,%xmm3
11082   102,65,15,219,216,                      //pand          %xmm8,%xmm3
11083   68,15,91,203,                           //cvtdq2ps      %xmm3,%xmm9
11084   184,8,33,132,55,                        //mov           $0x37842108,%eax
11085   102,68,15,110,208,                      //movd          %eax,%xmm10
11086   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
11087   69,15,89,209,                           //mulps         %xmm9,%xmm10
        // xmm11 = t1.
11088   184,224,7,0,0,                          //mov           $0x7e0,%eax
11089   102,15,110,216,                         //movd          %eax,%xmm3
11090   102,15,112,219,0,                       //pshufd        $0x0,%xmm3,%xmm3
11091   102,65,15,219,216,                      //pand          %xmm8,%xmm3
11092   68,15,91,203,                           //cvtdq2ps      %xmm3,%xmm9
11093   184,33,8,2,58,                          //mov           $0x3a020821,%eax
11094   102,68,15,110,216,                      //movd          %eax,%xmm11
11095   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
11096   69,15,89,217,                           //mulps         %xmm9,%xmm11
        // xmm3 = t2.
11097   184,31,0,0,0,                           //mov           $0x1f,%eax
11098   102,15,110,216,                         //movd          %eax,%xmm3
11099   102,15,112,219,0,                       //pshufd        $0x0,%xmm3,%xmm3
11100   102,65,15,219,216,                      //pand          %xmm8,%xmm3
11101   68,15,91,195,                           //cvtdq2ps      %xmm3,%xmm8
11102   184,8,33,4,61,                          //mov           $0x3d042108,%eax
11103   102,15,110,216,                         //movd          %eax,%xmm3
11104   15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
11105   65,15,89,216,                           //mulps         %xmm8,%xmm3
        // Interpolate the three registers.
11106   15,92,196,                              //subps         %xmm4,%xmm0
11107   65,15,89,194,                           //mulps         %xmm10,%xmm0
11108   15,88,196,                              //addps         %xmm4,%xmm0
11109   15,92,205,                              //subps         %xmm5,%xmm1
11110   65,15,89,203,                           //mulps         %xmm11,%xmm1
11111   15,88,205,                              //addps         %xmm5,%xmm1
11112   15,92,214,                              //subps         %xmm6,%xmm2
11113   15,89,211,                              //mulps         %xmm3,%xmm2
11114   15,88,214,                              //addps         %xmm6,%xmm2
        // xmm3 = 1.0f (broadcast).
11115   184,0,0,128,63,                         //mov           $0x3f800000,%eax
11116   102,15,110,216,                         //movd          %eax,%xmm3
11117   15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
11118   72,173,                                 //lods          %ds:(%rsi),%rax
11119   255,224,                                //jmpq          *%rax
11120 };
11121 
// sk_load_tables (SSE4.1 machine code): loads four 32-bit values and turns
// each of their three low bytes into a float fetched from a lookup table.
// The first lods reads an 8-byte context pointer from (%rsi) into %rax. The
// context is read as four 8-byte pointers:
//     +0x00  source buffer; four 32-bit values p are loaded from it at
//            byte offset 4*%rdi (unaligned load)
//     +0x08  float table indexed by  p        & 0xff  -> xmm0
//     +0x10  float table indexed by (p >>  8) & 0xff  -> xmm1
//     +0x18  float table indexed by (p >> 16) & 0xff  -> xmm2
// xmm3 = float(p >> 24) * 0x3b808081 (~1/255).
// SSE4.1 has no gather instruction, so the four indices of each vector are
// extracted into general-purpose registers and the floats are loaded one by
// one with movss/insertps.
// Clobbers %rcx, %r8..%r11 and xmm8; %rax is reused as an index register once
// the last context field has been read.
// The second lods reads the next 8-byte value from (%rsi) and the stage jumps
// to it.
11122 CODE const uint8_t sk_load_tables_sse41[] = {
11123   72,173,                                 //lods          %ds:(%rsi),%rax
11124   72,139,8,                               //mov           (%rax),%rcx
11125   76,139,64,8,                            //mov           0x8(%rax),%r8
11126   243,68,15,111,4,185,                    //movdqu        (%rcx,%rdi,4),%xmm8
        // xmm0 = 0xff in every lane; split p into three index vectors.
11127   185,255,0,0,0,                          //mov           $0xff,%ecx
11128   102,15,110,193,                         //movd          %ecx,%xmm0
11129   102,15,112,192,0,                       //pshufd        $0x0,%xmm0,%xmm0
11130   102,65,15,111,200,                      //movdqa        %xmm8,%xmm1
11131   102,15,114,209,8,                       //psrld         $0x8,%xmm1
11132   102,15,219,200,                         //pand          %xmm0,%xmm1
11133   102,65,15,111,208,                      //movdqa        %xmm8,%xmm2
11134   102,15,114,210,16,                      //psrld         $0x10,%xmm2
11135   102,15,219,208,                         //pand          %xmm0,%xmm2
11136   102,65,15,219,192,                      //pand          %xmm8,%xmm0
        // ---- xmm0 lookup: lane indices -> r11 (0), r10 (1), r9 (2), rcx (3) ----
11137   102,72,15,58,22,193,1,                  //pextrq        $0x1,%xmm0,%rcx
11138   65,137,201,                             //mov           %ecx,%r9d
11139   72,193,233,32,                          //shr           $0x20,%rcx
11140   102,73,15,126,194,                      //movq          %xmm0,%r10
11141   69,137,211,                             //mov           %r10d,%r11d
11142   73,193,234,32,                          //shr           $0x20,%r10
11143   243,67,15,16,4,152,                     //movss         (%r8,%r11,4),%xmm0
11144   102,67,15,58,33,4,144,16,               //insertps      $0x10,(%r8,%r10,4),%xmm0
11145   102,67,15,58,33,4,136,32,               //insertps      $0x20,(%r8,%r9,4),%xmm0
11146   102,65,15,58,33,4,136,48,               //insertps      $0x30,(%r8,%rcx,4),%xmm0
        // ---- xmm1 lookup, table pointer from context +0x10 ----
11147   76,139,64,16,                           //mov           0x10(%rax),%r8
11148   102,73,15,58,22,202,1,                  //pextrq        $0x1,%xmm1,%r10
11149   77,137,209,                             //mov           %r10,%r9
11150   73,193,233,32,                          //shr           $0x20,%r9
11151   102,72,15,126,201,                      //movq          %xmm1,%rcx
11152   65,137,203,                             //mov           %ecx,%r11d
11153   65,129,227,255,255,255,0,               //and           $0xffffff,%r11d
        // Shifting the 64-bit pair right by 30 (not 32) leaves lane 1's index
        // already multiplied by 4, so it is used below with scale 1. Lane 0
        // is at most 0xff, so none of its bits survive the shift.
11154   72,193,233,30,                          //shr           $0x1e,%rcx
11155   65,129,226,255,255,255,0,               //and           $0xffffff,%r10d
11156   243,67,15,16,12,152,                    //movss         (%r8,%r11,4),%xmm1
11157   102,65,15,58,33,12,8,16,                //insertps      $0x10,(%r8,%rcx,1),%xmm1
11158   243,67,15,16,28,144,                    //movss         (%r8,%r10,4),%xmm3
11159   102,15,58,33,203,32,                    //insertps      $0x20,%xmm3,%xmm1
11160   243,67,15,16,28,136,                    //movss         (%r8,%r9,4),%xmm3
11161   102,15,58,33,203,48,                    //insertps      $0x30,%xmm3,%xmm1
        // ---- xmm2 lookup, table pointer from context +0x18 ----
        // This is the last read through %rax as the context pointer.
11162   76,139,72,24,                           //mov           0x18(%rax),%r9
11163   102,72,15,58,22,209,1,                  //pextrq        $0x1,%xmm2,%rcx
11164   68,15,183,193,                          //movzwl        %cx,%r8d
11165   72,193,233,32,                          //shr           $0x20,%rcx
11166   102,72,15,126,208,                      //movq          %xmm2,%rax
11167   68,15,183,208,                          //movzwl        %ax,%r10d
        // Same shift-by-30 trick as above: %rax = 4 * (lane 1 index).
11168   72,193,232,30,                          //shr           $0x1e,%rax
11169   243,67,15,16,20,145,                    //movss         (%r9,%r10,4),%xmm2
11170   102,65,15,58,33,20,1,16,                //insertps      $0x10,(%r9,%rax,1),%xmm2
11171   243,67,15,16,28,129,                    //movss         (%r9,%r8,4),%xmm3
11172   102,15,58,33,211,32,                    //insertps      $0x20,%xmm3,%xmm2
11173   243,65,15,16,28,137,                    //movss         (%r9,%rcx,4),%xmm3
11174   102,15,58,33,211,48,                    //insertps      $0x30,%xmm3,%xmm2
        // ---- xmm3 = float(p >> 24) * ~1/255 (no table) ----
11175   102,65,15,114,208,24,                   //psrld         $0x18,%xmm8
11176   69,15,91,192,                           //cvtdq2ps      %xmm8,%xmm8
11177   184,129,128,128,59,                     //mov           $0x3b808081,%eax
11178   102,15,110,216,                         //movd          %eax,%xmm3
11179   15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
11180   65,15,89,216,                           //mulps         %xmm8,%xmm3
11181   72,173,                                 //lods          %ds:(%rsi),%rax
11182   255,224,                                //jmpq          *%rax
11183 };
11184 
// sk_load_a8 (SSE4.1 machine code): loads four bytes into xmm3 as floats
// scaled by 0x3b808081 (~1/255) and zeroes xmm0, xmm1 and xmm2.
// The first lods reads an 8-byte value from (%rsi) into %rax; that value is
// dereferenced once more to obtain the byte buffer's base address, and four
// consecutive bytes at base + %rdi are zero-extended to 32-bit lanes.
// xmm0 is used as scratch for the conversion before being cleared.
// The second lods reads the next 8-byte value from (%rsi); the final jmpq
// transfers control to it.
11185 CODE const uint8_t sk_load_a8_sse41[] = {
11186   72,173,                                 //lods          %ds:(%rsi),%rax
11187   72,139,0,                               //mov           (%rax),%rax
11188   102,15,56,49,4,56,                      //pmovzxbd      (%rax,%rdi,1),%xmm0
11189   15,91,192,                              //cvtdq2ps      %xmm0,%xmm0
11190   184,129,128,128,59,                     //mov           $0x3b808081,%eax
11191   102,15,110,216,                         //movd          %eax,%xmm3
11192   15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
11193   15,89,216,                              //mulps         %xmm0,%xmm3
11194   72,173,                                 //lods          %ds:(%rsi),%rax
11195   15,87,192,                              //xorps         %xmm0,%xmm0
11196   15,87,201,                              //xorps         %xmm1,%xmm1
11197   15,87,210,                              //xorps         %xmm2,%xmm2
11198   255,224,                                //jmpq          *%rax
11199 };
11200 
// sk_store_a8 (SSE4.1 machine code): converts xmm3 to four bytes and stores
// them to memory. xmm0..xmm3 are left unchanged; %rcx and xmm8 are clobbered.
// The first lods reads an 8-byte value from (%rsi) into %rax; that value is
// dereferenced once more to obtain the destination buffer's base address.
// xmm3 is multiplied by 0x437f0000 (255.0f), converted to 32-bit integers
// with cvtps2dq, then narrowed 32->16->8 bits with unsigned saturation; the
// resulting four bytes are written as one 32-bit store at base + %rdi.
// The second lods reads the next 8-byte value from (%rsi) and the stage jumps
// to it.
11201 CODE const uint8_t sk_store_a8_sse41[] = {
11202   72,173,                                 //lods          %ds:(%rsi),%rax
11203   72,139,0,                               //mov           (%rax),%rax
11204   185,0,0,127,67,                         //mov           $0x437f0000,%ecx
11205   102,68,15,110,193,                      //movd          %ecx,%xmm8
11206   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
11207   68,15,89,195,                           //mulps         %xmm3,%xmm8
11208   102,69,15,91,192,                       //cvtps2dq      %xmm8,%xmm8
11209   102,69,15,56,43,192,                    //packusdw      %xmm8,%xmm8
11210   102,69,15,103,192,                      //packuswb      %xmm8,%xmm8
11211   102,68,15,126,4,56,                     //movd          %xmm8,(%rax,%rdi,1)
11212   72,173,                                 //lods          %ds:(%rsi),%rax
11213   255,224,                                //jmpq          *%rax
11214 };
11215 
// sk_load_565 (SSE4.1 machine code): loads four 16-bit values v and unpacks
// their bit fields into floats:
//     xmm0 = float(v & 0xf800) * 0x37842108 (~1/63488)
//     xmm1 = float(v & 0x07e0) * 0x3a020821 (~1/2016)
//     xmm2 = float(v & 0x001f) * 0x3d042108 (~1/31)
//     xmm3 = 1.0f in every lane
// Each scale is the reciprocal of its field's maximum masked value, so a
// fully-set field converts to approximately 1.0.
// The first lods reads an 8-byte value from (%rsi) into %rax; that value is
// dereferenced once more to obtain the buffer's base address, and the four
// 16-bit values are read from base + 2*%rdi and zero-extended into xmm2.
// The second lods reads the next 8-byte value from (%rsi) and the stage jumps
// to it.
11216 CODE const uint8_t sk_load_565_sse41[] = {
11217   72,173,                                 //lods          %ds:(%rsi),%rax
11218   72,139,0,                               //mov           (%rax),%rax
11219   102,15,56,51,20,120,                    //pmovzxwd      (%rax,%rdi,2),%xmm2
        // xmm0 = top 5-bit field, scaled.
11220   184,0,248,0,0,                          //mov           $0xf800,%eax
11221   102,15,110,192,                         //movd          %eax,%xmm0
11222   102,15,112,192,0,                       //pshufd        $0x0,%xmm0,%xmm0
11223   102,15,219,194,                         //pand          %xmm2,%xmm0
11224   15,91,200,                              //cvtdq2ps      %xmm0,%xmm1
11225   184,8,33,132,55,                        //mov           $0x37842108,%eax
11226   102,15,110,192,                         //movd          %eax,%xmm0
11227   15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
11228   15,89,193,                              //mulps         %xmm1,%xmm0
        // xmm1 = middle 6-bit field, scaled (xmm3 is scratch here).
11229   184,224,7,0,0,                          //mov           $0x7e0,%eax
11230   102,15,110,200,                         //movd          %eax,%xmm1
11231   102,15,112,201,0,                       //pshufd        $0x0,%xmm1,%xmm1
11232   102,15,219,202,                         //pand          %xmm2,%xmm1
11233   15,91,217,                              //cvtdq2ps      %xmm1,%xmm3
11234   184,33,8,2,58,                          //mov           $0x3a020821,%eax
11235   102,15,110,200,                         //movd          %eax,%xmm1
11236   15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
11237   15,89,203,                              //mulps         %xmm3,%xmm1
        // xmm2 = low 5-bit field, scaled; the raw values in xmm2 are consumed
        // by the pand before xmm2 is overwritten.
11238   184,31,0,0,0,                           //mov           $0x1f,%eax
11239   102,15,110,216,                         //movd          %eax,%xmm3
11240   102,15,112,219,0,                       //pshufd        $0x0,%xmm3,%xmm3
11241   102,15,219,218,                         //pand          %xmm2,%xmm3
11242   15,91,219,                              //cvtdq2ps      %xmm3,%xmm3
11243   184,8,33,4,61,                          //mov           $0x3d042108,%eax
11244   102,15,110,208,                         //movd          %eax,%xmm2
11245   15,198,210,0,                           //shufps        $0x0,%xmm2,%xmm2
11246   15,89,211,                              //mulps         %xmm3,%xmm2
        // xmm3 = 1.0f (broadcast).
11247   184,0,0,128,63,                         //mov           $0x3f800000,%eax
11248   102,15,110,216,                         //movd          %eax,%xmm3
11249   15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
11250   72,173,                                 //lods          %ds:(%rsi),%rax
11251   255,224,                                //jmpq          *%rax
11252 };
11253 
// sk_store_565_sse41: x86-64 SSE4.1 machine code emitted as raw bytes. It packs three float
// vectors into four 16-bit values and stores them:
//   v = (int(xmm0 * 31.0f) << 11) | (int(xmm1 * 63.0f) << 5) | int(xmm2 * 31.0f)
// cvtps2dq performs the float->int conversion and packusdw narrows each 32-bit lane to
// 16 bits with unsigned saturation. The low 8 bytes of the result are written to
// base + 2*%rdi, where base is read through the pointer fetched by the first lods.
// xmm0-xmm3 are not modified; xmm8-xmm10 and %rcx are used as scratch.
// The final lods/jmpq loads the next 8-byte entry from %rsi and jumps to it.
// NOTE(review): presumably xmm0/xmm1/xmm2 hold r/g/b in [0,1] — confirm with the caller.
11254 CODE const uint8_t sk_store_565_sse41[] = {
11255   72,173,                                 //lods          %ds:(%rsi),%rax
11256   72,139,0,                               //mov           (%rax),%rax
  // 0x41f80000 is 31.0f; it is reused below for the xmm2 term.
11257   185,0,0,248,65,                         //mov           $0x41f80000,%ecx
11258   102,68,15,110,193,                      //movd          %ecx,%xmm8
11259   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
11260   69,15,40,200,                           //movaps        %xmm8,%xmm9
11261   68,15,89,200,                           //mulps         %xmm0,%xmm9
11262   102,69,15,91,201,                       //cvtps2dq      %xmm9,%xmm9
11263   102,65,15,114,241,11,                   //pslld         $0xb,%xmm9
  // 0x427c0000 is 63.0f.
11264   185,0,0,124,66,                         //mov           $0x427c0000,%ecx
11265   102,68,15,110,209,                      //movd          %ecx,%xmm10
11266   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
11267   68,15,89,209,                           //mulps         %xmm1,%xmm10
11268   102,69,15,91,210,                       //cvtps2dq      %xmm10,%xmm10
11269   102,65,15,114,242,5,                    //pslld         $0x5,%xmm10
11270   102,69,15,235,209,                      //por           %xmm9,%xmm10
11271   68,15,89,194,                           //mulps         %xmm2,%xmm8
11272   102,69,15,91,192,                       //cvtps2dq      %xmm8,%xmm8
11273   102,69,15,86,194,                       //orpd          %xmm10,%xmm8
11274   102,69,15,56,43,192,                    //packusdw      %xmm8,%xmm8
11275   102,68,15,214,4,120,                    //movq          %xmm8,(%rax,%rdi,2)
11276   72,173,                                 //lods          %ds:(%rsi),%rax
11277   255,224,                                //jmpq          *%rax
11278 };
11279 
// sk_load_8888_sse41: x86-64 SSE4.1 machine code emitted as raw bytes. It loads 16 bytes
// (four 32-bit values) from base + 4*%rdi, where base is read through the pointer fetched
// by the first lods, and splits each value into its four bytes as floats:
//   xmm0 = float( v        & 0xff) * k
//   xmm1 = float((v >>  8) & 0xff) * k
//   xmm2 = float((v >> 16) & 0xff) * k
//   xmm3 = float( v >> 24        ) * k
// where k = 0x3b808081 (~1/255). xmm8 is used as scratch for k.
// The final lods/jmpq loads the next 8-byte entry from %rsi and jumps to it.
// NOTE(review): presumably the four bytes are R,G,B,A of one pixel and %rdi is a pixel
// index — confirm against the pipeline driver.
11280 CODE const uint8_t sk_load_8888_sse41[] = {
11281   72,173,                                 //lods          %ds:(%rsi),%rax
11282   72,139,0,                               //mov           (%rax),%rax
11283   243,15,111,28,184,                      //movdqu        (%rax,%rdi,4),%xmm3
11284   184,255,0,0,0,                          //mov           $0xff,%eax
11285   102,15,110,192,                         //movd          %eax,%xmm0
11286   102,15,112,192,0,                       //pshufd        $0x0,%xmm0,%xmm0
11287   102,15,111,203,                         //movdqa        %xmm3,%xmm1
11288   102,15,114,209,8,                       //psrld         $0x8,%xmm1
11289   102,15,219,200,                         //pand          %xmm0,%xmm1
11290   102,15,111,211,                         //movdqa        %xmm3,%xmm2
11291   102,15,114,210,16,                      //psrld         $0x10,%xmm2
11292   102,15,219,208,                         //pand          %xmm0,%xmm2
11293   102,15,219,195,                         //pand          %xmm3,%xmm0
11294   15,91,192,                              //cvtdq2ps      %xmm0,%xmm0
  // xmm8 = 0x3b808081 (~1/255) broadcast; shared by all four outputs.
11295   184,129,128,128,59,                     //mov           $0x3b808081,%eax
11296   102,68,15,110,192,                      //movd          %eax,%xmm8
11297   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
11298   65,15,89,192,                           //mulps         %xmm8,%xmm0
11299   15,91,201,                              //cvtdq2ps      %xmm1,%xmm1
11300   65,15,89,200,                           //mulps         %xmm8,%xmm1
11301   15,91,210,                              //cvtdq2ps      %xmm2,%xmm2
11302   65,15,89,208,                           //mulps         %xmm8,%xmm2
  // The top byte needs no mask: the logical right shift by 24 leaves only those 8 bits.
11303   102,15,114,211,24,                      //psrld         $0x18,%xmm3
11304   15,91,219,                              //cvtdq2ps      %xmm3,%xmm3
11305   65,15,89,216,                           //mulps         %xmm8,%xmm3
11306   72,173,                                 //lods          %ds:(%rsi),%rax
11307   255,224,                                //jmpq          *%rax
11308 };
11309 
// sk_store_8888_sse41: x86-64 SSE4.1 machine code emitted as raw bytes. It scales four
// float vectors by 255.0f (0x437f0000), converts them to integers with cvtps2dq, and
// combines them into four 32-bit values:
//   v = int(xmm0*255) | int(xmm1*255) << 8 | int(xmm2*255) << 16 | int(xmm3*255) << 24
// The 16 bytes are stored (unaligned) at base + 4*%rdi, where base is read through the
// pointer fetched by the first lods. xmm0-xmm3 are not modified; xmm8-xmm10 and %rcx are
// scratch. No masking is applied before the shifts, so inputs are expected to convert to
// values that fit in 8 bits.
// NOTE(review): presumably an earlier stage clamps to [0,1] — confirm.
// The final lods/jmpq loads the next 8-byte entry from %rsi and jumps to it.
11310 CODE const uint8_t sk_store_8888_sse41[] = {
11311   72,173,                                 //lods          %ds:(%rsi),%rax
11312   72,139,0,                               //mov           (%rax),%rax
11313   185,0,0,127,67,                         //mov           $0x437f0000,%ecx
11314   102,68,15,110,193,                      //movd          %ecx,%xmm8
11315   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
11316   69,15,40,200,                           //movaps        %xmm8,%xmm9
11317   68,15,89,200,                           //mulps         %xmm0,%xmm9
11318   102,69,15,91,201,                       //cvtps2dq      %xmm9,%xmm9
11319   69,15,40,208,                           //movaps        %xmm8,%xmm10
11320   68,15,89,209,                           //mulps         %xmm1,%xmm10
11321   102,69,15,91,210,                       //cvtps2dq      %xmm10,%xmm10
11322   102,65,15,114,242,8,                    //pslld         $0x8,%xmm10
11323   102,69,15,235,209,                      //por           %xmm9,%xmm10
11324   69,15,40,200,                           //movaps        %xmm8,%xmm9
11325   68,15,89,202,                           //mulps         %xmm2,%xmm9
11326   102,69,15,91,201,                       //cvtps2dq      %xmm9,%xmm9
11327   102,65,15,114,241,16,                   //pslld         $0x10,%xmm9
11328   68,15,89,195,                           //mulps         %xmm3,%xmm8
11329   102,69,15,91,192,                       //cvtps2dq      %xmm8,%xmm8
11330   102,65,15,114,240,24,                   //pslld         $0x18,%xmm8
11331   102,69,15,235,193,                      //por           %xmm9,%xmm8
11332   102,69,15,235,194,                      //por           %xmm10,%xmm8
11333   243,68,15,127,4,184,                    //movdqu        %xmm8,(%rax,%rdi,4)
11334   72,173,                                 //lods          %ds:(%rsi),%rax
11335   255,224,                                //jmpq          *%rax
11336 };
11337 
// sk_load_f16_sse41: x86-64 SSE4.1 machine code emitted as raw bytes. It loads 32 bytes
// (four 8-byte elements, each holding four 16-bit words) from base + 8*%rdi, where base is
// read through the pointer fetched by the first lods. Steps:
//   1. The punpck{l,h}wd sequence de-interleaves the words so that word k of all four
//      elements ends up contiguous (words 0,1 in xmm8; words 2,3 in xmm2).
//   2. Any 16-bit word that compares below 0x0400 as a signed value is replaced by zero
//      (pcmpgtw against 0x0400, then pandn). This covers words with the top bit set as well
//      as small positive ones.
//   3. Each surviving word is zero-extended to 32 bits, shifted left by 13, and multiplied
//      as a float by 0x77800000 (2^112).
// Outputs: xmm0..xmm3 = converted word 0..3 of the four elements. xmm8/xmm9 are scratch.
// The final lods/jmpq loads the next 8-byte entry from %rsi and jumps to it.
// NOTE(review): the shift-by-13 and 2^112 scale look like the usual IEEE half->float bit
// trick (denormals and negatives flushed to zero) — confirm against the generator source.
11338 CODE const uint8_t sk_load_f16_sse41[] = {
11339   72,173,                                 //lods          %ds:(%rsi),%rax
11340   72,139,0,                               //mov           (%rax),%rax
11341   243,15,111,4,248,                       //movdqu        (%rax,%rdi,8),%xmm0
11342   243,15,111,76,248,16,                   //movdqu        0x10(%rax,%rdi,8),%xmm1
11343   102,15,111,208,                         //movdqa        %xmm0,%xmm2
11344   102,15,97,209,                          //punpcklwd     %xmm1,%xmm2
11345   102,15,105,193,                         //punpckhwd     %xmm1,%xmm0
11346   102,68,15,111,194,                      //movdqa        %xmm2,%xmm8
11347   102,68,15,97,192,                       //punpcklwd     %xmm0,%xmm8
11348   102,15,105,208,                         //punpckhwd     %xmm0,%xmm2
  // 0x04000400 broadcast = 0x0400 in every 16-bit lane (threshold for the flush-to-zero).
11349   184,0,4,0,4,                            //mov           $0x4000400,%eax
11350   102,15,110,192,                         //movd          %eax,%xmm0
11351   102,15,112,216,0,                       //pshufd        $0x0,%xmm0,%xmm3
11352   102,15,111,203,                         //movdqa        %xmm3,%xmm1
11353   102,65,15,101,200,                      //pcmpgtw       %xmm8,%xmm1
11354   102,65,15,223,200,                      //pandn         %xmm8,%xmm1
11355   102,15,101,218,                         //pcmpgtw       %xmm2,%xmm3
11356   102,15,223,218,                         //pandn         %xmm2,%xmm3
11357   102,15,56,51,193,                       //pmovzxwd      %xmm1,%xmm0
11358   102,15,114,240,13,                      //pslld         $0xd,%xmm0
  // xmm8 = 0x77800000 (2^112 as a float) broadcast; shared by all four outputs.
11359   184,0,0,128,119,                        //mov           $0x77800000,%eax
11360   102,15,110,208,                         //movd          %eax,%xmm2
11361   102,68,15,112,194,0,                    //pshufd        $0x0,%xmm2,%xmm8
11362   65,15,89,192,                           //mulps         %xmm8,%xmm0
  // xmm9 = 0 so that punpckhwd zero-extends the upper four words.
11363   102,69,15,239,201,                      //pxor          %xmm9,%xmm9
11364   102,65,15,105,201,                      //punpckhwd     %xmm9,%xmm1
11365   102,15,114,241,13,                      //pslld         $0xd,%xmm1
11366   65,15,89,200,                           //mulps         %xmm8,%xmm1
11367   102,15,56,51,211,                       //pmovzxwd      %xmm3,%xmm2
11368   102,15,114,242,13,                      //pslld         $0xd,%xmm2
11369   65,15,89,208,                           //mulps         %xmm8,%xmm2
11370   102,65,15,105,217,                      //punpckhwd     %xmm9,%xmm3
11371   102,15,114,243,13,                      //pslld         $0xd,%xmm3
11372   65,15,89,216,                           //mulps         %xmm8,%xmm3
11373   72,173,                                 //lods          %ds:(%rsi),%rax
11374   255,224,                                //jmpq          *%rax
11375 };
11376 
// sk_store_f16_sse41: x86-64 SSE4.1 machine code emitted as raw bytes. Each of xmm0..xmm3 is
// multiplied as a float by 0x07800000 (2^-112), and the resulting bit pattern is shifted
// right (logically) by 13. The four results are then interleaved into 16-bit slots:
//   xmm10 = r0 | (r1 shifted up by 2 bytes),  xmm8 = r2 | (r3 shifted up by 2 bytes)
// punpckldq/punpckhdq merge these, and 32 bytes are stored (unaligned) at base + 8*%rdi,
// where base is read through the pointer fetched by the first lods.
// xmm0-xmm3 are not modified; xmm8-xmm11 and %rcx are scratch.
// The final lods/jmpq loads the next 8-byte entry from %rsi and jumps to it.
// NOTE(review): this looks like the float->IEEE-half bit trick, with no explicit handling
// of sign, overflow or rounding visible here — confirm the expected input range with the
// generator source.
11377 CODE const uint8_t sk_store_f16_sse41[] = {
11378   72,173,                                 //lods          %ds:(%rsi),%rax
11379   72,139,0,                               //mov           (%rax),%rax
11380   185,0,0,128,7,                          //mov           $0x7800000,%ecx
11381   102,68,15,110,193,                      //movd          %ecx,%xmm8
11382   102,69,15,112,192,0,                    //pshufd        $0x0,%xmm8,%xmm8
11383   102,69,15,111,200,                      //movdqa        %xmm8,%xmm9
11384   68,15,89,200,                           //mulps         %xmm0,%xmm9
11385   102,65,15,114,209,13,                   //psrld         $0xd,%xmm9
11386   102,69,15,111,208,                      //movdqa        %xmm8,%xmm10
11387   68,15,89,209,                           //mulps         %xmm1,%xmm10
11388   102,65,15,114,210,13,                   //psrld         $0xd,%xmm10
11389   102,69,15,111,216,                      //movdqa        %xmm8,%xmm11
11390   68,15,89,218,                           //mulps         %xmm2,%xmm11
11391   102,65,15,114,211,13,                   //psrld         $0xd,%xmm11
11392   68,15,89,195,                           //mulps         %xmm3,%xmm8
11393   102,65,15,114,208,13,                   //psrld         $0xd,%xmm8
  // pslldq shifts the whole 128-bit register left by 2 bytes, moving each 32-bit lane's low
  // 16 bits into that lane's high half before it is OR-ed with its partner.
11394   102,65,15,115,250,2,                    //pslldq        $0x2,%xmm10
11395   102,69,15,235,209,                      //por           %xmm9,%xmm10
11396   102,65,15,115,248,2,                    //pslldq        $0x2,%xmm8
11397   102,69,15,235,195,                      //por           %xmm11,%xmm8
11398   102,69,15,111,202,                      //movdqa        %xmm10,%xmm9
11399   102,69,15,98,200,                       //punpckldq     %xmm8,%xmm9
11400   243,68,15,127,12,248,                   //movdqu        %xmm9,(%rax,%rdi,8)
11401   102,69,15,106,208,                      //punpckhdq     %xmm8,%xmm10
11402   243,68,15,127,84,248,16,                //movdqu        %xmm10,0x10(%rax,%rdi,8)
11403   72,173,                                 //lods          %ds:(%rsi),%rax
11404   255,224,                                //jmpq          *%rax
11405 };
11406 
// sk_store_f32_sse41: x86-64 SSE4.1 machine code emitted as raw bytes. It transposes the
// 4x4 float matrix held in xmm0..xmm3 and stores it as 64 bytes at base + 16*%rdi, where
// base is read through the pointer fetched by the first lods. After the transpose, the
// 16-byte row i (at offset 16*i) contains lane i of xmm0, xmm1, xmm2, xmm3, in that order.
// xmm0-xmm3 are not modified; xmm8-xmm12 and %rcx are scratch. All stores are unaligned
// forms (movupd/movups).
// The final lods/jmpq loads the next 8-byte entry from %rsi and jumps to it.
11407 CODE const uint8_t sk_store_f32_sse41[] = {
11408   72,173,                                 //lods          %ds:(%rsi),%rax
11409   72,139,0,                               //mov           (%rax),%rax
  // rcx = rdi * 16: byte offset of the first output row.
11410   72,137,249,                             //mov           %rdi,%rcx
11411   72,193,225,4,                           //shl           $0x4,%rcx
11412   68,15,40,192,                           //movaps        %xmm0,%xmm8
11413   68,15,40,200,                           //movaps        %xmm0,%xmm9
11414   68,15,20,201,                           //unpcklps      %xmm1,%xmm9
11415   68,15,40,210,                           //movaps        %xmm2,%xmm10
11416   68,15,40,218,                           //movaps        %xmm2,%xmm11
11417   68,15,20,219,                           //unpcklps      %xmm3,%xmm11
11418   68,15,21,193,                           //unpckhps      %xmm1,%xmm8
11419   68,15,21,211,                           //unpckhps      %xmm3,%xmm10
11420   69,15,40,225,                           //movaps        %xmm9,%xmm12
11421   102,69,15,20,227,                       //unpcklpd      %xmm11,%xmm12
11422   69,15,18,217,                           //movhlps       %xmm9,%xmm11
11423   69,15,40,200,                           //movaps        %xmm8,%xmm9
11424   102,69,15,20,202,                       //unpcklpd      %xmm10,%xmm9
11425   69,15,18,208,                           //movhlps       %xmm8,%xmm10
  // Rows 0..3 now live in xmm12, xmm11, xmm9, xmm10 respectively.
11426   102,68,15,17,36,8,                      //movupd        %xmm12,(%rax,%rcx,1)
11427   68,15,17,92,8,16,                       //movups        %xmm11,0x10(%rax,%rcx,1)
11428   102,68,15,17,76,8,32,                   //movupd        %xmm9,0x20(%rax,%rcx,1)
11429   68,15,17,84,8,48,                       //movups        %xmm10,0x30(%rax,%rcx,1)
11430   72,173,                                 //lods          %ds:(%rsi),%rax
11431   255,224,                                //jmpq          *%rax
11432 };
11433 
// sk_clamp_x_sse41: x86-64 SSE4.1 machine code emitted as raw bytes. It clamps xmm0 per
// lane: xmm0 = min(max(0, xmm0), hi). The first lods fetches a pointer to a single float
// `limit`; hi is that float's bit pattern with integer 1 subtracted (paddd with all-ones),
// which for a positive finite limit is the largest float strictly below it.
// xmm8/xmm9 are scratch. The second lods loads the next 8-byte entry from %rsi and the
// jmpq jumps to it.
11434 CODE const uint8_t sk_clamp_x_sse41[] = {
11435   72,173,                                 //lods          %ds:(%rsi),%rax
11436   69,15,87,192,                           //xorps         %xmm8,%xmm8
11437   68,15,95,192,                           //maxps         %xmm0,%xmm8
11438   243,68,15,16,8,                         //movss         (%rax),%xmm9
11439   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
  // pcmpeqd of a register with itself yields all-ones (-1 per lane); adding it to the
  // limit's bits decrements the bit pattern by one.
11440   102,15,118,192,                         //pcmpeqd       %xmm0,%xmm0
11441   102,65,15,254,193,                      //paddd         %xmm9,%xmm0
11442   68,15,93,192,                           //minps         %xmm0,%xmm8
11443   72,173,                                 //lods          %ds:(%rsi),%rax
11444   65,15,40,192,                           //movaps        %xmm8,%xmm0
11445   255,224,                                //jmpq          *%rax
11446 };
11447 
// sk_clamp_y_sse41: x86-64 SSE4.1 machine code emitted as raw bytes. Same operation as
// sk_clamp_x_sse41 but applied to xmm1: xmm1 = min(max(0, xmm1), hi). The first lods
// fetches a pointer to a single float `limit`; hi is that float's bit pattern with integer
// 1 subtracted (paddd with all-ones), which for a positive finite limit is the largest
// float strictly below it.
// xmm8/xmm9 are scratch. The second lods loads the next 8-byte entry from %rsi and the
// jmpq jumps to it.
11448 CODE const uint8_t sk_clamp_y_sse41[] = {
11449   72,173,                                 //lods          %ds:(%rsi),%rax
11450   69,15,87,192,                           //xorps         %xmm8,%xmm8
11451   68,15,95,193,                           //maxps         %xmm1,%xmm8
11452   243,68,15,16,8,                         //movss         (%rax),%xmm9
11453   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
  // All-ones (-1 per lane) added to the limit's bits decrements the bit pattern by one.
11454   102,15,118,201,                         //pcmpeqd       %xmm1,%xmm1
11455   102,65,15,254,201,                      //paddd         %xmm9,%xmm1
11456   68,15,93,193,                           //minps         %xmm1,%xmm8
11457   72,173,                                 //lods          %ds:(%rsi),%rax
11458   65,15,40,200,                           //movaps        %xmm8,%xmm1
11459   255,224,                                //jmpq          *%rax
11460 };
11461 
// sk_repeat_x_sse41: x86-64 SSE4.1 machine code emitted as raw bytes. It wraps xmm0 into
// one period of length `limit`, where the first lods fetches a pointer to that single
// float:
//   xmm0 = xmm0 - floor(xmm0 / limit) * limit
//   xmm0 = min(xmm0, hi)
// hi is limit's bit pattern with integer 1 subtracted (the largest float below a positive
// finite limit). roundps with immediate 1 rounds toward negative infinity (floor).
// xmm8/xmm9 are scratch. The final lods/jmpq loads the next 8-byte entry from %rsi and
// jumps to it.
11462 CODE const uint8_t sk_repeat_x_sse41[] = {
11463   72,173,                                 //lods          %ds:(%rsi),%rax
11464   243,68,15,16,0,                         //movss         (%rax),%xmm8
11465   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
11466   68,15,40,200,                           //movaps        %xmm0,%xmm9
11467   69,15,94,200,                           //divps         %xmm8,%xmm9
11468   102,69,15,58,8,201,1,                   //roundps       $0x1,%xmm9,%xmm9
11469   69,15,89,200,                           //mulps         %xmm8,%xmm9
11470   65,15,92,193,                           //subps         %xmm9,%xmm0
  // xmm9 = limit bits - 1; the min keeps the result strictly below limit.
11471   102,69,15,118,201,                      //pcmpeqd       %xmm9,%xmm9
11472   102,69,15,254,200,                      //paddd         %xmm8,%xmm9
11473   65,15,93,193,                           //minps         %xmm9,%xmm0
11474   72,173,                                 //lods          %ds:(%rsi),%rax
11475   255,224,                                //jmpq          *%rax
11476 };
11477 
// sk_repeat_y_sse41: x86-64 SSE4.1 machine code emitted as raw bytes. Same operation as
// sk_repeat_x_sse41 but applied to xmm1. The first lods fetches a pointer to a single
// float `limit`:
//   xmm1 = xmm1 - floor(xmm1 / limit) * limit
//   xmm1 = min(xmm1, hi)
// hi is limit's bit pattern with integer 1 subtracted (the largest float below a positive
// finite limit). roundps with immediate 1 rounds toward negative infinity (floor).
// xmm8/xmm9 are scratch. The final lods/jmpq loads the next 8-byte entry from %rsi and
// jumps to it.
11478 CODE const uint8_t sk_repeat_y_sse41[] = {
11479   72,173,                                 //lods          %ds:(%rsi),%rax
11480   243,68,15,16,0,                         //movss         (%rax),%xmm8
11481   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
11482   68,15,40,201,                           //movaps        %xmm1,%xmm9
11483   69,15,94,200,                           //divps         %xmm8,%xmm9
11484   102,69,15,58,8,201,1,                   //roundps       $0x1,%xmm9,%xmm9
11485   69,15,89,200,                           //mulps         %xmm8,%xmm9
11486   65,15,92,201,                           //subps         %xmm9,%xmm1
  // xmm9 = limit bits - 1; the min keeps the result strictly below limit.
11487   102,69,15,118,201,                      //pcmpeqd       %xmm9,%xmm9
11488   102,69,15,254,200,                      //paddd         %xmm8,%xmm9
11489   65,15,93,201,                           //minps         %xmm9,%xmm1
11490   72,173,                                 //lods          %ds:(%rsi),%rax
11491   255,224,                                //jmpq          *%rax
11492 };
11493 
// sk_mirror_x_sse41: x86-64 SSE4.1 machine code emitted as raw bytes. It reflects xmm0 into
// [0, limit) with period 2*limit, where the first lods fetches a pointer to the single
// float `limit` (L below):
//   t    = xmm0 - L
//   t    = t - floor(t / 2L) * 2L - L
//   xmm0 = min(t & (0 - t), hi)
// The AND of t with its negation clears the sign bit, i.e. takes the absolute value. hi is
// L's bit pattern with integer 1 subtracted (the largest float below a positive finite L).
// xmm8-xmm10 are scratch. The final lods/jmpq loads the next 8-byte entry from %rsi and
// jumps to it.
11494 CODE const uint8_t sk_mirror_x_sse41[] = {
11495   72,173,                                 //lods          %ds:(%rsi),%rax
11496   243,68,15,16,0,                         //movss         (%rax),%xmm8
11497   69,15,40,200,                           //movaps        %xmm8,%xmm9
11498   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
11499   65,15,92,193,                           //subps         %xmm9,%xmm0
  // xmm8 = 2L broadcast (scalar L + L, then splat).
11500   243,69,15,88,192,                       //addss         %xmm8,%xmm8
11501   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
11502   68,15,40,208,                           //movaps        %xmm0,%xmm10
11503   69,15,94,208,                           //divps         %xmm8,%xmm10
11504   102,69,15,58,8,210,1,                   //roundps       $0x1,%xmm10,%xmm10
11505   69,15,89,208,                           //mulps         %xmm8,%xmm10
11506   65,15,92,194,                           //subps         %xmm10,%xmm0
11507   65,15,92,193,                           //subps         %xmm9,%xmm0
  // xmm8 = 0 - xmm0; AND with xmm0 clears the sign bit.
11508   69,15,87,192,                           //xorps         %xmm8,%xmm8
11509   68,15,92,192,                           //subps         %xmm0,%xmm8
11510   65,15,84,192,                           //andps         %xmm8,%xmm0
11511   102,69,15,118,192,                      //pcmpeqd       %xmm8,%xmm8
11512   102,69,15,254,193,                      //paddd         %xmm9,%xmm8
11513   65,15,93,192,                           //minps         %xmm8,%xmm0
11514   72,173,                                 //lods          %ds:(%rsi),%rax
11515   255,224,                                //jmpq          *%rax
11516 };
11517 
// sk_mirror_y_sse41: x86-64 SSE4.1 machine code emitted as raw bytes. Same operation as
// sk_mirror_x_sse41 but applied to xmm1. The first lods fetches a pointer to the single
// float `limit` (L below):
//   t    = xmm1 - L
//   t    = t - floor(t / 2L) * 2L - L
//   xmm1 = min(t & (0 - t), hi)
// The AND of t with its negation clears the sign bit, i.e. takes the absolute value. hi is
// L's bit pattern with integer 1 subtracted (the largest float below a positive finite L).
// xmm8-xmm10 are scratch. The final lods/jmpq loads the next 8-byte entry from %rsi and
// jumps to it.
11518 CODE const uint8_t sk_mirror_y_sse41[] = {
11519   72,173,                                 //lods          %ds:(%rsi),%rax
11520   243,68,15,16,0,                         //movss         (%rax),%xmm8
11521   69,15,40,200,                           //movaps        %xmm8,%xmm9
11522   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
11523   65,15,92,201,                           //subps         %xmm9,%xmm1
  // xmm8 = 2L broadcast (scalar L + L, then splat).
11524   243,69,15,88,192,                       //addss         %xmm8,%xmm8
11525   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
11526   68,15,40,209,                           //movaps        %xmm1,%xmm10
11527   69,15,94,208,                           //divps         %xmm8,%xmm10
11528   102,69,15,58,8,210,1,                   //roundps       $0x1,%xmm10,%xmm10
11529   69,15,89,208,                           //mulps         %xmm8,%xmm10
11530   65,15,92,202,                           //subps         %xmm10,%xmm1
11531   65,15,92,201,                           //subps         %xmm9,%xmm1
  // xmm8 = 0 - xmm1; AND with xmm1 clears the sign bit.
11532   69,15,87,192,                           //xorps         %xmm8,%xmm8
11533   68,15,92,193,                           //subps         %xmm1,%xmm8
11534   65,15,84,200,                           //andps         %xmm8,%xmm1
11535   102,69,15,118,192,                      //pcmpeqd       %xmm8,%xmm8
11536   102,69,15,254,193,                      //paddd         %xmm9,%xmm8
11537   65,15,93,200,                           //minps         %xmm8,%xmm1
11538   72,173,                                 //lods          %ds:(%rsi),%rax
11539   255,224,                                //jmpq          *%rax
11540 };
11541 
// sk_luminance_to_alpha_sse41: x86-64 SSE4.1 machine code emitted as raw bytes. It computes
// a weighted sum of xmm0, xmm1, xmm2 into xmm3 and then zeroes xmm0-xmm2:
//   xmm3 = xmm0 * 0x3e59b3d0 (~0.2126) + xmm1 * 0x3f371759 (~0.7152)
//        + xmm2 * 0x3d93dd98 (~0.0722)
//   xmm0 = xmm1 = xmm2 = 0
// Unlike the neighbouring stages, this one reads no context pointer: its only lods loads
// the next 8-byte entry from %rsi, which the jmpq then jumps to. The previous value of
// xmm3 is discarded.
// NOTE(review): presumably xmm0/xmm1/xmm2/xmm3 are r/g/b/a, making this a luminance
// weighting written to alpha — confirm.
11542 CODE const uint8_t sk_luminance_to_alpha_sse41[] = {
11543   184,208,179,89,62,                      //mov           $0x3e59b3d0,%eax
11544   102,15,110,216,                         //movd          %eax,%xmm3
11545   15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
11546   15,89,216,                              //mulps         %xmm0,%xmm3
11547   184,89,23,55,63,                        //mov           $0x3f371759,%eax
11548   102,15,110,192,                         //movd          %eax,%xmm0
11549   15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
11550   15,89,193,                              //mulps         %xmm1,%xmm0
11551   15,88,195,                              //addps         %xmm3,%xmm0
11552   184,152,221,147,61,                     //mov           $0x3d93dd98,%eax
11553   102,15,110,216,                         //movd          %eax,%xmm3
11554   15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
11555   15,89,218,                              //mulps         %xmm2,%xmm3
11556   15,88,216,                              //addps         %xmm0,%xmm3
11557   72,173,                                 //lods          %ds:(%rsi),%rax
11558   15,87,192,                              //xorps         %xmm0,%xmm0
11559   15,87,201,                              //xorps         %xmm1,%xmm1
11560   15,87,210,                              //xorps         %xmm2,%xmm2
11561   255,224,                                //jmpq          *%rax
11562 };
11563 
// sk_matrix_2x3_sse41: x86-64 SSE4.1 machine code emitted as raw bytes. It applies a 2x3
// affine transform to (xmm0, xmm1). The first lods fetches a pointer to six consecutive
// floats m[0..5] (byte offsets 0x0..0x14), read in column-major order:
//   x' = m[0]*x + m[2]*y + m[4]
//   y' = m[1]*x + m[3]*y + m[5]
// with x = incoming xmm0 and y = incoming xmm1; x' is left in xmm0 and y' in xmm1.
// xmm8-xmm11 are scratch. The final lods/jmpq loads the next 8-byte entry from %rsi and
// jumps to it.
11564 CODE const uint8_t sk_matrix_2x3_sse41[] = {
  // Save the inputs: xmm9 = y, xmm8 = x (xmm0/xmm1 are reused for the matrix entries).
11565   68,15,40,201,                           //movaps        %xmm1,%xmm9
11566   68,15,40,192,                           //movaps        %xmm0,%xmm8
11567   72,173,                                 //lods          %ds:(%rsi),%rax
11568   243,15,16,0,                            //movss         (%rax),%xmm0
11569   243,15,16,72,4,                         //movss         0x4(%rax),%xmm1
11570   15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
11571   243,68,15,16,80,8,                      //movss         0x8(%rax),%xmm10
11572   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
11573   243,68,15,16,88,16,                     //movss         0x10(%rax),%xmm11
11574   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
11575   69,15,89,209,                           //mulps         %xmm9,%xmm10
11576   69,15,88,211,                           //addps         %xmm11,%xmm10
11577   65,15,89,192,                           //mulps         %xmm8,%xmm0
11578   65,15,88,194,                           //addps         %xmm10,%xmm0
  // Second output row (y'), using m[1], m[3], m[5].
11579   15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
11580   243,68,15,16,80,12,                     //movss         0xc(%rax),%xmm10
11581   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
11582   243,68,15,16,88,20,                     //movss         0x14(%rax),%xmm11
11583   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
11584   69,15,89,209,                           //mulps         %xmm9,%xmm10
11585   69,15,88,211,                           //addps         %xmm11,%xmm10
11586   65,15,89,200,                           //mulps         %xmm8,%xmm1
11587   65,15,88,202,                           //addps         %xmm10,%xmm1
11588   72,173,                                 //lods          %ds:(%rsi),%rax
11589   255,224,                                //jmpq          *%rax
11590 };
11591 
// sk_matrix_3x4_sse41: x86-64 SSE4.1 machine code emitted as raw bytes. It applies a 3x4
// affine transform to (xmm0, xmm1, xmm2). The first lods fetches a pointer to twelve
// consecutive floats m[0..11] (byte offsets 0x0..0x2c), read in column-major order:
//   a' = m[0]*a + m[3]*b + m[6]*c + m[9]
//   b' = m[1]*a + m[4]*b + m[7]*c + m[10]
//   c' = m[2]*a + m[5]*b + m[8]*c + m[11]
// with a, b, c = incoming xmm0, xmm1, xmm2; the results are left in xmm0, xmm1, xmm2.
// xmm3 is not touched. xmm8-xmm13 are scratch. The incoming xmm2 stays live until the
// final movaps, so the third row is built in xmm10 and copied over at the end.
// The final lods/jmpq loads the next 8-byte entry from %rsi and jumps to it.
11592 CODE const uint8_t sk_matrix_3x4_sse41[] = {
  // Save the inputs: xmm9 = b, xmm8 = a (xmm0/xmm1 are reused for the matrix entries).
11593   68,15,40,201,                           //movaps        %xmm1,%xmm9
11594   68,15,40,192,                           //movaps        %xmm0,%xmm8
11595   72,173,                                 //lods          %ds:(%rsi),%rax
11596   243,15,16,0,                            //movss         (%rax),%xmm0
11597   243,15,16,72,4,                         //movss         0x4(%rax),%xmm1
11598   15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
11599   243,68,15,16,80,12,                     //movss         0xc(%rax),%xmm10
11600   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
11601   243,68,15,16,88,24,                     //movss         0x18(%rax),%xmm11
11602   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
11603   243,68,15,16,96,36,                     //movss         0x24(%rax),%xmm12
11604   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
11605   68,15,89,218,                           //mulps         %xmm2,%xmm11
11606   69,15,88,220,                           //addps         %xmm12,%xmm11
11607   69,15,89,209,                           //mulps         %xmm9,%xmm10
11608   69,15,88,211,                           //addps         %xmm11,%xmm10
11609   65,15,89,192,                           //mulps         %xmm8,%xmm0
11610   65,15,88,194,                           //addps         %xmm10,%xmm0
  // Second output row (b'), using m[1], m[4], m[7], m[10].
11611   15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
11612   243,68,15,16,80,16,                     //movss         0x10(%rax),%xmm10
11613   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
11614   243,68,15,16,88,28,                     //movss         0x1c(%rax),%xmm11
11615   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
11616   243,68,15,16,96,40,                     //movss         0x28(%rax),%xmm12
11617   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
11618   68,15,89,218,                           //mulps         %xmm2,%xmm11
11619   69,15,88,220,                           //addps         %xmm12,%xmm11
11620   69,15,89,209,                           //mulps         %xmm9,%xmm10
11621   69,15,88,211,                           //addps         %xmm11,%xmm10
11622   65,15,89,200,                           //mulps         %xmm8,%xmm1
11623   65,15,88,202,                           //addps         %xmm10,%xmm1
  // Third output row (c'), using m[2], m[5], m[8], m[11]; accumulated in xmm10.
11624   243,68,15,16,80,8,                      //movss         0x8(%rax),%xmm10
11625   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
11626   243,68,15,16,88,20,                     //movss         0x14(%rax),%xmm11
11627   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
11628   243,68,15,16,96,32,                     //movss         0x20(%rax),%xmm12
11629   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
11630   243,68,15,16,104,44,                    //movss         0x2c(%rax),%xmm13
11631   69,15,198,237,0,                        //shufps        $0x0,%xmm13,%xmm13
11632   68,15,89,226,                           //mulps         %xmm2,%xmm12
11633   69,15,88,229,                           //addps         %xmm13,%xmm12
11634   69,15,89,217,                           //mulps         %xmm9,%xmm11
11635   69,15,88,220,                           //addps         %xmm12,%xmm11
11636   69,15,89,208,                           //mulps         %xmm8,%xmm10
11637   69,15,88,211,                           //addps         %xmm11,%xmm10
11638   72,173,                                 //lods          %ds:(%rsi),%rax
11639   65,15,40,210,                           //movaps        %xmm10,%xmm2
11640   255,224,                                //jmpq          *%rax
11641 };
11642 
11643 CODE const uint8_t sk_matrix_4x5_sse41[] = {
11644   68,15,40,201,                           //movaps        %xmm1,%xmm9
11645   68,15,40,192,                           //movaps        %xmm0,%xmm8
11646   72,173,                                 //lods          %ds:(%rsi),%rax
11647   243,15,16,0,                            //movss         (%rax),%xmm0
11648   243,15,16,72,4,                         //movss         0x4(%rax),%xmm1
11649   15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
11650   243,68,15,16,80,16,                     //movss         0x10(%rax),%xmm10
11651   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
11652   243,68,15,16,88,32,                     //movss         0x20(%rax),%xmm11
11653   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
11654   243,68,15,16,96,48,                     //movss         0x30(%rax),%xmm12
11655   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
11656   243,68,15,16,104,64,                    //movss         0x40(%rax),%xmm13
11657   69,15,198,237,0,                        //shufps        $0x0,%xmm13,%xmm13
11658   68,15,89,227,                           //mulps         %xmm3,%xmm12
11659   69,15,88,229,                           //addps         %xmm13,%xmm12
11660   68,15,89,218,                           //mulps         %xmm2,%xmm11
11661   69,15,88,220,                           //addps         %xmm12,%xmm11
11662   69,15,89,209,                           //mulps         %xmm9,%xmm10
11663   69,15,88,211,                           //addps         %xmm11,%xmm10
11664   65,15,89,192,                           //mulps         %xmm8,%xmm0
11665   65,15,88,194,                           //addps         %xmm10,%xmm0
11666   15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
11667   243,68,15,16,80,20,                     //movss         0x14(%rax),%xmm10
11668   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
11669   243,68,15,16,88,36,                     //movss         0x24(%rax),%xmm11
11670   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
11671   243,68,15,16,96,52,                     //movss         0x34(%rax),%xmm12
11672   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
11673   243,68,15,16,104,68,                    //movss         0x44(%rax),%xmm13
11674   69,15,198,237,0,                        //shufps        $0x0,%xmm13,%xmm13
11675   68,15,89,227,                           //mulps         %xmm3,%xmm12
11676   69,15,88,229,                           //addps         %xmm13,%xmm12
11677   68,15,89,218,                           //mulps         %xmm2,%xmm11
11678   69,15,88,220,                           //addps         %xmm12,%xmm11
11679   69,15,89,209,                           //mulps         %xmm9,%xmm10
11680   69,15,88,211,                           //addps         %xmm11,%xmm10
11681   65,15,89,200,                           //mulps         %xmm8,%xmm1
11682   65,15,88,202,                           //addps         %xmm10,%xmm1
11683   243,68,15,16,80,8,                      //movss         0x8(%rax),%xmm10
11684   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
11685   243,68,15,16,88,24,                     //movss         0x18(%rax),%xmm11
11686   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
11687   243,68,15,16,96,40,                     //movss         0x28(%rax),%xmm12
11688   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
11689   243,68,15,16,104,56,                    //movss         0x38(%rax),%xmm13
11690   69,15,198,237,0,                        //shufps        $0x0,%xmm13,%xmm13
11691   243,68,15,16,112,72,                    //movss         0x48(%rax),%xmm14
11692   69,15,198,246,0,                        //shufps        $0x0,%xmm14,%xmm14
11693   68,15,89,235,                           //mulps         %xmm3,%xmm13
11694   69,15,88,238,                           //addps         %xmm14,%xmm13
11695   68,15,89,226,                           //mulps         %xmm2,%xmm12
11696   69,15,88,229,                           //addps         %xmm13,%xmm12
11697   69,15,89,217,                           //mulps         %xmm9,%xmm11
11698   69,15,88,220,                           //addps         %xmm12,%xmm11
11699   69,15,89,208,                           //mulps         %xmm8,%xmm10
11700   69,15,88,211,                           //addps         %xmm11,%xmm10
11701   243,68,15,16,88,12,                     //movss         0xc(%rax),%xmm11
11702   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
11703   243,68,15,16,96,28,                     //movss         0x1c(%rax),%xmm12
11704   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
11705   243,68,15,16,104,44,                    //movss         0x2c(%rax),%xmm13
11706   69,15,198,237,0,                        //shufps        $0x0,%xmm13,%xmm13
11707   243,68,15,16,112,60,                    //movss         0x3c(%rax),%xmm14
11708   69,15,198,246,0,                        //shufps        $0x0,%xmm14,%xmm14
11709   243,68,15,16,120,76,                    //movss         0x4c(%rax),%xmm15
11710   69,15,198,255,0,                        //shufps        $0x0,%xmm15,%xmm15
11711   68,15,89,243,                           //mulps         %xmm3,%xmm14
11712   69,15,88,247,                           //addps         %xmm15,%xmm14
11713   68,15,89,234,                           //mulps         %xmm2,%xmm13
11714   69,15,88,238,                           //addps         %xmm14,%xmm13
11715   69,15,89,225,                           //mulps         %xmm9,%xmm12
11716   69,15,88,229,                           //addps         %xmm13,%xmm12
11717   69,15,89,216,                           //mulps         %xmm8,%xmm11
11718   69,15,88,220,                           //addps         %xmm12,%xmm11
11719   72,173,                                 //lods          %ds:(%rsi),%rax
11720   65,15,40,210,                           //movaps        %xmm10,%xmm2
11721   65,15,40,219,                           //movaps        %xmm11,%xmm3
11722   255,224,                                //jmpq          *%rax
11723 };
11724 
// sk_matrix_perspective_sse41: x86-64 machine code for one pipeline stage.
// The first lods fetches a pointer from (%rsi) (advancing %rsi by 8); nine
// consecutive floats m[0..8] are read through it.  With x = %xmm0 and
// y = %xmm1 on entry, each lane computes
//   X = m[0]*x + m[1]*y + m[2]
//   Y = m[3]*x + m[4]*y + m[5]
//   W = m[6]*x + m[7]*y + m[8]
// and the stage leaves X*rcp(W) in %xmm0 and Y*rcp(W) in %xmm1, where rcp is
// the approximate reciprocal produced by rcpps (not an exact divide).
// %xmm8-%xmm12 are scratch.  The second lods fetches the next pointer from
// (%rsi) and the stage ends by jumping to it.
11725 CODE const uint8_t sk_matrix_perspective_sse41[] = {
11726   68,15,40,192,                           //movaps        %xmm0,%xmm8
11727   72,173,                                 //lods          %ds:(%rsi),%rax
  // Row 0: %xmm0 = m[0]*x + (m[1]*y + m[2]); x was saved in %xmm8 above.
11728   243,15,16,0,                            //movss         (%rax),%xmm0
11729   243,68,15,16,72,4,                      //movss         0x4(%rax),%xmm9
11730   15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
11731   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
11732   243,68,15,16,80,8,                      //movss         0x8(%rax),%xmm10
11733   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
11734   68,15,89,201,                           //mulps         %xmm1,%xmm9
11735   69,15,88,202,                           //addps         %xmm10,%xmm9
11736   65,15,89,192,                           //mulps         %xmm8,%xmm0
11737   65,15,88,193,                           //addps         %xmm9,%xmm0
  // Row 1: %xmm9 = m[3]*x + (m[4]*y + m[5]).
11738   243,68,15,16,72,12,                     //movss         0xc(%rax),%xmm9
11739   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
11740   243,68,15,16,80,16,                     //movss         0x10(%rax),%xmm10
11741   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
11742   243,68,15,16,88,20,                     //movss         0x14(%rax),%xmm11
11743   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
11744   68,15,89,209,                           //mulps         %xmm1,%xmm10
11745   69,15,88,211,                           //addps         %xmm11,%xmm10
11746   69,15,89,200,                           //mulps         %xmm8,%xmm9
11747   69,15,88,202,                           //addps         %xmm10,%xmm9
  // Row 2: %xmm10 = m[6]*x + (m[7]*y + m[8]), the divisor W.
11748   243,68,15,16,80,24,                     //movss         0x18(%rax),%xmm10
11749   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
11750   243,68,15,16,88,28,                     //movss         0x1c(%rax),%xmm11
11751   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
11752   243,68,15,16,96,32,                     //movss         0x20(%rax),%xmm12
11753   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
11754   68,15,89,217,                           //mulps         %xmm1,%xmm11
11755   69,15,88,220,                           //addps         %xmm12,%xmm11
11756   69,15,89,208,                           //mulps         %xmm8,%xmm10
11757   69,15,88,211,                           //addps         %xmm11,%xmm10
  // Scale rows 0 and 1 by the approximate reciprocal of W.
11758   65,15,83,202,                           //rcpps         %xmm10,%xmm1
11759   15,89,193,                              //mulps         %xmm1,%xmm0
11760   68,15,89,201,                           //mulps         %xmm1,%xmm9
11761   72,173,                                 //lods          %ds:(%rsi),%rax
11762   65,15,40,201,                           //movaps        %xmm9,%xmm1
11763   255,224,                                //jmpq          *%rax
11764 };
11765 
// sk_linear_gradient_2stops_sse41: x86-64 machine code for one pipeline stage.
// The first lods fetches a pointer from (%rsi); two unaligned 4-float vectors
// are read through it: c0 at offset 0 and c1 at offset 0x10.  With t = %xmm0
// on entry, the stage produces per lane
//   %xmm0 = c1[0]*t + c0[0]
//   %xmm1 = c1[1]*t + c0[1]
//   %xmm2 = c1[2]*t + c0[2]
//   %xmm3 = c1[3]*t + c0[3]
// (each c element is broadcast across the four lanes with shufps).
// %xmm8-%xmm10 are scratch; the %xmm0 result is built in %xmm8 because t is
// still needed for the other three outputs.  The second lods fetches the next
// pointer from (%rsi) and the stage ends by jumping to it.
11766 CODE const uint8_t sk_linear_gradient_2stops_sse41[] = {
11767   72,173,                                 //lods          %ds:(%rsi),%rax
11768   68,15,16,8,                             //movups        (%rax),%xmm9
11769   15,16,88,16,                            //movups        0x10(%rax),%xmm3
11770   68,15,40,195,                           //movaps        %xmm3,%xmm8
11771   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
11772   65,15,40,201,                           //movaps        %xmm9,%xmm1
11773   15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
11774   68,15,89,192,                           //mulps         %xmm0,%xmm8
11775   68,15,88,193,                           //addps         %xmm1,%xmm8
11776   15,40,203,                              //movaps        %xmm3,%xmm1
11777   15,198,201,85,                          //shufps        $0x55,%xmm1,%xmm1
11778   65,15,40,209,                           //movaps        %xmm9,%xmm2
11779   15,198,210,85,                          //shufps        $0x55,%xmm2,%xmm2
11780   15,89,200,                              //mulps         %xmm0,%xmm1
11781   15,88,202,                              //addps         %xmm2,%xmm1
11782   15,40,211,                              //movaps        %xmm3,%xmm2
11783   15,198,210,170,                         //shufps        $0xaa,%xmm2,%xmm2
11784   69,15,40,209,                           //movaps        %xmm9,%xmm10
11785   69,15,198,210,170,                      //shufps        $0xaa,%xmm10,%xmm10
11786   15,89,208,                              //mulps         %xmm0,%xmm2
11787   65,15,88,210,                           //addps         %xmm10,%xmm2
11788   15,198,219,255,                         //shufps        $0xff,%xmm3,%xmm3
11789   69,15,198,201,255,                      //shufps        $0xff,%xmm9,%xmm9
11790   15,89,216,                              //mulps         %xmm0,%xmm3
11791   65,15,88,217,                           //addps         %xmm9,%xmm3
11792   72,173,                                 //lods          %ds:(%rsi),%rax
11793   65,15,40,192,                           //movaps        %xmm8,%xmm0
11794   255,224,                                //jmpq          *%rax
11795 };
11796 
// sk_start_pipeline_sse2: x86-64 machine code for the pipeline driver loop.
// Inputs arrive in %rcx, %rdx, %r8 and %r9, and %rsi, %rdi and %xmm6-%xmm15
// are saved and restored, which matches the Microsoft x64 calling convention.
//   %rcx -> %rbx : starting index x
//   %rdx -> %rsi : pointer to a pointer list; its first entry (kept in %r12)
//                  is the function that gets called, and the address of the
//                  entry after it (kept in %r13) is handed to that function
//   %r8  -> %r14 : value passed through unchanged to the callee in %rdx
//   %r9  -> %r15 : limit
// While x + 4 <= limit (unsigned compare) the loop zeroes %xmm0-%xmm7, calls
// *%r12 with %rdi = x, %rsi = %r13, %rdx = %r14, then advances x by 4.
// The value left in %rax on return is the first x for which x + 4 > limit.
// The seven pushes plus the 0xa0-byte frame keep %rsp 16-byte aligned, which
// the movaps spills of %xmm6-%xmm15 require.
11797 CODE const uint8_t sk_start_pipeline_sse2[] = {
11798   65,87,                                  //push          %r15
11799   65,86,                                  //push          %r14
11800   65,85,                                  //push          %r13
11801   65,84,                                  //push          %r12
11802   86,                                     //push          %rsi
11803   87,                                     //push          %rdi
11804   83,                                     //push          %rbx
11805   72,129,236,160,0,0,0,                   //sub           $0xa0,%rsp
11806   68,15,41,188,36,144,0,0,0,              //movaps        %xmm15,0x90(%rsp)
11807   68,15,41,180,36,128,0,0,0,              //movaps        %xmm14,0x80(%rsp)
11808   68,15,41,108,36,112,                    //movaps        %xmm13,0x70(%rsp)
11809   68,15,41,100,36,96,                     //movaps        %xmm12,0x60(%rsp)
11810   68,15,41,92,36,80,                      //movaps        %xmm11,0x50(%rsp)
11811   68,15,41,84,36,64,                      //movaps        %xmm10,0x40(%rsp)
11812   68,15,41,76,36,48,                      //movaps        %xmm9,0x30(%rsp)
11813   68,15,41,68,36,32,                      //movaps        %xmm8,0x20(%rsp)
11814   15,41,124,36,16,                        //movaps        %xmm7,0x10(%rsp)
11815   15,41,52,36,                            //movaps        %xmm6,(%rsp)
  // Move the incoming arguments into registers that survive the call.
11816   77,137,207,                             //mov           %r9,%r15
11817   77,137,198,                             //mov           %r8,%r14
11818   72,137,203,                             //mov           %rcx,%rbx
11819   72,137,214,                             //mov           %rdx,%rsi
11820   72,173,                                 //lods          %ds:(%rsi),%rax
11821   73,137,196,                             //mov           %rax,%r12
11822   73,137,245,                             //mov           %rsi,%r13
  // Initial check: skip the loop entirely when x + 4 > limit, returning x.
11823   72,141,67,4,                            //lea           0x4(%rbx),%rax
11824   76,57,248,                              //cmp           %r15,%rax
11825   118,5,                                  //jbe           73 <_sk_start_pipeline_sse2+0x73>
11826   72,137,216,                             //mov           %rbx,%rax
11827   235,52,                                 //jmp           a7 <_sk_start_pipeline_sse2+0xa7>
  // Loop body (offset 0x73): clear %xmm0-%xmm7 and run one 4-wide step.
11828   15,87,192,                              //xorps         %xmm0,%xmm0
11829   15,87,201,                              //xorps         %xmm1,%xmm1
11830   15,87,210,                              //xorps         %xmm2,%xmm2
11831   15,87,219,                              //xorps         %xmm3,%xmm3
11832   15,87,228,                              //xorps         %xmm4,%xmm4
11833   15,87,237,                              //xorps         %xmm5,%xmm5
11834   15,87,246,                              //xorps         %xmm6,%xmm6
11835   15,87,255,                              //xorps         %xmm7,%xmm7
11836   72,137,223,                             //mov           %rbx,%rdi
11837   76,137,238,                             //mov           %r13,%rsi
11838   76,137,242,                             //mov           %r14,%rdx
11839   65,255,212,                             //callq         *%r12
  // %rax = x + 4 becomes the new x; the compare uses x + 8, i.e. new x + 4.
11840   72,141,67,4,                            //lea           0x4(%rbx),%rax
11841   72,131,195,8,                           //add           $0x8,%rbx
11842   76,57,251,                              //cmp           %r15,%rbx
11843   72,137,195,                             //mov           %rax,%rbx
11844   118,204,                                //jbe           73 <_sk_start_pipeline_sse2+0x73>
  // Epilogue (offset 0xa7): restore saved registers; %rax holds the result.
11845   15,40,52,36,                            //movaps        (%rsp),%xmm6
11846   15,40,124,36,16,                        //movaps        0x10(%rsp),%xmm7
11847   68,15,40,68,36,32,                      //movaps        0x20(%rsp),%xmm8
11848   68,15,40,76,36,48,                      //movaps        0x30(%rsp),%xmm9
11849   68,15,40,84,36,64,                      //movaps        0x40(%rsp),%xmm10
11850   68,15,40,92,36,80,                      //movaps        0x50(%rsp),%xmm11
11851   68,15,40,100,36,96,                     //movaps        0x60(%rsp),%xmm12
11852   68,15,40,108,36,112,                    //movaps        0x70(%rsp),%xmm13
11853   68,15,40,180,36,128,0,0,0,              //movaps        0x80(%rsp),%xmm14
11854   68,15,40,188,36,144,0,0,0,              //movaps        0x90(%rsp),%xmm15
11855   72,129,196,160,0,0,0,                   //add           $0xa0,%rsp
11856   91,                                     //pop           %rbx
11857   95,                                     //pop           %rdi
11858   94,                                     //pop           %rsi
11859   65,92,                                  //pop           %r12
11860   65,93,                                  //pop           %r13
11861   65,94,                                  //pop           %r14
11862   65,95,                                  //pop           %r15
11863   195,                                    //retq
11864 };
11865 
// sk_just_return_sse2: a single retq.  Unlike the other stages it does not
// fetch and jump to a following pointer; it returns to whoever issued the
// call.  NOTE(review): presumably placed last in a stage list so control
// returns to the sk_start_pipeline loop -- confirm against the caller.
11866 CODE const uint8_t sk_just_return_sse2[] = {
11867   195,                                    //retq
11868 };
11869 
// sk_seed_shader_sse2: x86-64 machine code for one pipeline stage.
// Inputs: %edi (an integer), %rdx (pointer to four floats, read unaligned) and
// a pointer fetched by the first lods from (%rsi), through which one 32-bit
// integer is read.  Outputs, per lane i:
//   %xmm0 = float(%edi) + 0.5 + ((float*)%rdx)[i]
//   %xmm1 = float(*(int32*)ctx) + 0.5
//   %xmm2 = 1.0
//   %xmm3..%xmm7 = 0
// NOTE(review): presumably %edi is the x index, ctx holds the y index and
// %rdx points at per-lane offsets such as {0,1,2,3} -- confirm with the caller.
// The second lods fetches the next pointer from (%rsi); the stage ends by
// jumping to it.
11870 CODE const uint8_t sk_seed_shader_sse2[] = {
11871   72,173,                                 //lods          %ds:(%rsi),%rax
11872   102,15,110,199,                         //movd          %edi,%xmm0
11873   102,15,112,192,0,                       //pshufd        $0x0,%xmm0,%xmm0
11874   15,91,200,                              //cvtdq2ps      %xmm0,%xmm1
  // 0x3f000000 is the IEEE-754 bit pattern of 0.5f.
11875   185,0,0,0,63,                           //mov           $0x3f000000,%ecx
11876   102,15,110,209,                         //movd          %ecx,%xmm2
11877   15,198,210,0,                           //shufps        $0x0,%xmm2,%xmm2
11878   15,88,202,                              //addps         %xmm2,%xmm1
11879   15,16,2,                                //movups        (%rdx),%xmm0
11880   15,88,193,                              //addps         %xmm1,%xmm0
11881   102,15,110,8,                           //movd          (%rax),%xmm1
11882   102,15,112,201,0,                       //pshufd        $0x0,%xmm1,%xmm1
11883   15,91,201,                              //cvtdq2ps      %xmm1,%xmm1
11884   15,88,202,                              //addps         %xmm2,%xmm1
  // 0x3f800000 is the IEEE-754 bit pattern of 1.0f.  %eax is free to clobber
  // here because the pointer in %rax is no longer needed.
11885   184,0,0,128,63,                         //mov           $0x3f800000,%eax
11886   102,15,110,208,                         //movd          %eax,%xmm2
11887   15,198,210,0,                           //shufps        $0x0,%xmm2,%xmm2
11888   72,173,                                 //lods          %ds:(%rsi),%rax
11889   15,87,219,                              //xorps         %xmm3,%xmm3
11890   15,87,228,                              //xorps         %xmm4,%xmm4
11891   15,87,237,                              //xorps         %xmm5,%xmm5
11892   15,87,246,                              //xorps         %xmm6,%xmm6
11893   15,87,255,                              //xorps         %xmm7,%xmm7
11894   255,224,                                //jmpq          *%rax
11895 };
11896 
// sk_constant_color_sse2: x86-64 machine code for one pipeline stage.
// The first lods fetches a pointer from (%rsi); four floats c[0..3] are read
// (unaligned) through it.  Each is broadcast to all four lanes of its own
// register: %xmm0 = c[0], %xmm1 = c[1], %xmm2 = c[2], %xmm3 = c[3].
// The second lods fetches the next pointer from (%rsi) and the stage ends by
// jumping to it.
11897 CODE const uint8_t sk_constant_color_sse2[] = {
11898   72,173,                                 //lods          %ds:(%rsi),%rax
11899   15,16,24,                               //movups        (%rax),%xmm3
11900   15,40,195,                              //movaps        %xmm3,%xmm0
11901   15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
11902   15,40,203,                              //movaps        %xmm3,%xmm1
11903   15,198,201,85,                          //shufps        $0x55,%xmm1,%xmm1
11904   15,40,211,                              //movaps        %xmm3,%xmm2
11905   15,198,210,170,                         //shufps        $0xaa,%xmm2,%xmm2
11906   15,198,219,255,                         //shufps        $0xff,%xmm3,%xmm3
11907   72,173,                                 //lods          %ds:(%rsi),%rax
11908   255,224,                                //jmpq          *%rax
11909 };
11910 
// sk_clear_sse2: x86-64 machine code for one pipeline stage.
// Sets %xmm0-%xmm3 to all zeros, then jumps to the pointer fetched from (%rsi)
// by lods (which also advances %rsi by 8).  %xmm4-%xmm7 are left untouched.
11911 CODE const uint8_t sk_clear_sse2[] = {
11912   72,173,                                 //lods          %ds:(%rsi),%rax
11913   15,87,192,                              //xorps         %xmm0,%xmm0
11914   15,87,201,                              //xorps         %xmm1,%xmm1
11915   15,87,210,                              //xorps         %xmm2,%xmm2
11916   15,87,219,                              //xorps         %xmm3,%xmm3
11917   255,224,                                //jmpq          *%rax
11918 };
11919 
// sk_plus__sse2: x86-64 machine code for one pipeline stage.
// Adds lane-wise: %xmm0 += %xmm4, %xmm1 += %xmm5, %xmm2 += %xmm6,
// %xmm3 += %xmm7.  No clamping is applied to the sums.  The stage then jumps
// to the pointer fetched from (%rsi) by lods.
11920 CODE const uint8_t sk_plus__sse2[] = {
11921   15,88,196,                              //addps         %xmm4,%xmm0
11922   15,88,205,                              //addps         %xmm5,%xmm1
11923   15,88,214,                              //addps         %xmm6,%xmm2
11924   15,88,223,                              //addps         %xmm7,%xmm3
11925   72,173,                                 //lods          %ds:(%rsi),%rax
11926   255,224,                                //jmpq          *%rax
11927 };
11928 
// sk_srcover_sse2: x86-64 machine code for one pipeline stage.
// With k = 1.0 - %xmm3 (computed from the value of %xmm3 on entry), updates
//   %xmm0 += %xmm4 * k
//   %xmm1 += %xmm5 * k
//   %xmm2 += %xmm6 * k
//   %xmm3 += %xmm7 * k
// NOTE(review): presumably %xmm0-3 are source r,g,b,a and %xmm4-7 destination
// r,g,b,a, making this the source-over blend -- confirm against stage source.
// %xmm8 and %xmm9 are scratch.  The stage then jumps to the pointer fetched
// from (%rsi) by lods.
11929 CODE const uint8_t sk_srcover_sse2[] = {
  // 0x3f800000 is the IEEE-754 bit pattern of 1.0f.
11930   184,0,0,128,63,                         //mov           $0x3f800000,%eax
11931   102,68,15,110,192,                      //movd          %eax,%xmm8
11932   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
11933   68,15,92,195,                           //subps         %xmm3,%xmm8
11934   69,15,40,200,                           //movaps        %xmm8,%xmm9
11935   68,15,89,204,                           //mulps         %xmm4,%xmm9
11936   65,15,88,193,                           //addps         %xmm9,%xmm0
11937   69,15,40,200,                           //movaps        %xmm8,%xmm9
11938   68,15,89,205,                           //mulps         %xmm5,%xmm9
11939   65,15,88,201,                           //addps         %xmm9,%xmm1
11940   69,15,40,200,                           //movaps        %xmm8,%xmm9
11941   68,15,89,206,                           //mulps         %xmm6,%xmm9
11942   65,15,88,209,                           //addps         %xmm9,%xmm2
11943   68,15,89,199,                           //mulps         %xmm7,%xmm8
11944   65,15,88,216,                           //addps         %xmm8,%xmm3
11945   72,173,                                 //lods          %ds:(%rsi),%rax
11946   255,224,                                //jmpq          *%rax
11947 };
11948 
// sk_dstover_sse2: x86-64 machine code for one pipeline stage.
// With k = 1.0 - %xmm7, updates
//   %xmm0 = %xmm0 * k + %xmm4
//   %xmm1 = %xmm1 * k + %xmm5
//   %xmm2 = %xmm2 * k + %xmm6
//   %xmm3 = %xmm3 * k + %xmm7
// NOTE(review): presumably %xmm0-3 are source r,g,b,a and %xmm4-7 destination
// r,g,b,a, making this the destination-over blend -- confirm against stage
// source.  %xmm8 is scratch.  The stage then jumps to the pointer fetched from
// (%rsi) by lods.
11949 CODE const uint8_t sk_dstover_sse2[] = {
  // 0x3f800000 is the IEEE-754 bit pattern of 1.0f.
11950   184,0,0,128,63,                         //mov           $0x3f800000,%eax
11951   102,68,15,110,192,                      //movd          %eax,%xmm8
11952   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
11953   68,15,92,199,                           //subps         %xmm7,%xmm8
11954   65,15,89,192,                           //mulps         %xmm8,%xmm0
11955   15,88,196,                              //addps         %xmm4,%xmm0
11956   65,15,89,200,                           //mulps         %xmm8,%xmm1
11957   15,88,205,                              //addps         %xmm5,%xmm1
11958   65,15,89,208,                           //mulps         %xmm8,%xmm2
11959   15,88,214,                              //addps         %xmm6,%xmm2
11960   65,15,89,216,                           //mulps         %xmm8,%xmm3
11961   15,88,223,                              //addps         %xmm7,%xmm3
11962   72,173,                                 //lods          %ds:(%rsi),%rax
11963   255,224,                                //jmpq          *%rax
11964 };
11965 
// sk_clamp_0_sse2: x86-64 machine code for one pipeline stage.
// Replaces each lane of %xmm0-%xmm3 with max(lane, 0.0), using a zeroed
// %xmm8 as the bound.  The stage then jumps to the pointer fetched from
// (%rsi) by lods.
11966 CODE const uint8_t sk_clamp_0_sse2[] = {
11967   69,15,87,192,                           //xorps         %xmm8,%xmm8
11968   65,15,95,192,                           //maxps         %xmm8,%xmm0
11969   65,15,95,200,                           //maxps         %xmm8,%xmm1
11970   65,15,95,208,                           //maxps         %xmm8,%xmm2
11971   65,15,95,216,                           //maxps         %xmm8,%xmm3
11972   72,173,                                 //lods          %ds:(%rsi),%rax
11973   255,224,                                //jmpq          *%rax
11974 };
11975 
// sk_clamp_1_sse2: x86-64 machine code for one pipeline stage.
// Replaces each lane of %xmm0-%xmm3 with min(lane, 1.0).  The bound is built
// by broadcasting 0x3f800000 (the IEEE-754 bit pattern of 1.0f) into %xmm8.
// The stage then jumps to the pointer fetched from (%rsi) by lods.
11976 CODE const uint8_t sk_clamp_1_sse2[] = {
11977   184,0,0,128,63,                         //mov           $0x3f800000,%eax
11978   102,68,15,110,192,                      //movd          %eax,%xmm8
11979   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
11980   65,15,93,192,                           //minps         %xmm8,%xmm0
11981   65,15,93,200,                           //minps         %xmm8,%xmm1
11982   65,15,93,208,                           //minps         %xmm8,%xmm2
11983   65,15,93,216,                           //minps         %xmm8,%xmm3
11984   72,173,                                 //lods          %ds:(%rsi),%rax
11985   255,224,                                //jmpq          *%rax
11986 };
11987 
// sk_clamp_a_sse2: x86-64 machine code for one pipeline stage.
// First limits %xmm3 to min(%xmm3, 1.0), then limits each of %xmm0, %xmm1 and
// %xmm2 to min(itself, the clamped %xmm3).  No lower bound is applied.
// NOTE(review): presumably %xmm3 is alpha and %xmm0-2 are r,g,b, so this keeps
// the colour premultiplied-valid (channel <= alpha <= 1) -- confirm against
// stage source.  %xmm8 is scratch.  The stage then jumps to the pointer
// fetched from (%rsi) by lods.
11988 CODE const uint8_t sk_clamp_a_sse2[] = {
  // 0x3f800000 is the IEEE-754 bit pattern of 1.0f.
11989   184,0,0,128,63,                         //mov           $0x3f800000,%eax
11990   102,68,15,110,192,                      //movd          %eax,%xmm8
11991   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
11992   65,15,93,216,                           //minps         %xmm8,%xmm3
11993   15,93,195,                              //minps         %xmm3,%xmm0
11994   15,93,203,                              //minps         %xmm3,%xmm1
11995   15,93,211,                              //minps         %xmm3,%xmm2
11996   72,173,                                 //lods          %ds:(%rsi),%rax
11997   255,224,                                //jmpq          *%rax
11998 };
11999 
// sk_set_rgb_sse2: x86-64 machine code for one pipeline stage.
// The first lods fetches a pointer from (%rsi); three floats are read through
// it at offsets 0, 4 and 8 and broadcast to all lanes of %xmm0, %xmm1 and
// %xmm2 respectively.  %xmm3 is not modified.  The second lods fetches the
// next pointer from (%rsi) and the stage ends by jumping to it.
12000 CODE const uint8_t sk_set_rgb_sse2[] = {
12001   72,173,                                 //lods          %ds:(%rsi),%rax
12002   243,15,16,0,                            //movss         (%rax),%xmm0
12003   243,15,16,72,4,                         //movss         0x4(%rax),%xmm1
12004   15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
12005   15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
12006   243,15,16,80,8,                         //movss         0x8(%rax),%xmm2
12007   15,198,210,0,                           //shufps        $0x0,%xmm2,%xmm2
12008   72,173,                                 //lods          %ds:(%rsi),%rax
12009   255,224,                                //jmpq          *%rax
12010 };
12011 
// sk_swap_rb_sse2: x86-64 machine code for one pipeline stage.
// Exchanges the contents of %xmm0 and %xmm2, using %xmm8 as the temporary;
// %xmm1 and %xmm3 are not modified.  The stage then jumps to the pointer
// fetched from (%rsi) by lods.
12012 CODE const uint8_t sk_swap_rb_sse2[] = {
12013   68,15,40,192,                           //movaps        %xmm0,%xmm8
12014   72,173,                                 //lods          %ds:(%rsi),%rax
12015   15,40,194,                              //movaps        %xmm2,%xmm0
12016   65,15,40,208,                           //movaps        %xmm8,%xmm2
12017   255,224,                                //jmpq          *%rax
12018 };
12019 
// sk_swap_sse2: x86-64 machine code for one pipeline stage.
// Exchanges %xmm0-%xmm3 with %xmm4-%xmm7 pairwise (0<->4, 1<->5, 2<->6,
// 3<->7).  The original %xmm0-%xmm3 are parked in %xmm11, %xmm10, %xmm9 and
// %xmm8 respectively while the copy is made.  The stage then jumps to the
// pointer fetched from (%rsi) by lods.
12020 CODE const uint8_t sk_swap_sse2[] = {
12021   68,15,40,195,                           //movaps        %xmm3,%xmm8
12022   68,15,40,202,                           //movaps        %xmm2,%xmm9
12023   68,15,40,209,                           //movaps        %xmm1,%xmm10
12024   68,15,40,216,                           //movaps        %xmm0,%xmm11
12025   72,173,                                 //lods          %ds:(%rsi),%rax
12026   15,40,196,                              //movaps        %xmm4,%xmm0
12027   15,40,205,                              //movaps        %xmm5,%xmm1
12028   15,40,214,                              //movaps        %xmm6,%xmm2
12029   15,40,223,                              //movaps        %xmm7,%xmm3
12030   65,15,40,227,                           //movaps        %xmm11,%xmm4
12031   65,15,40,234,                           //movaps        %xmm10,%xmm5
12032   65,15,40,241,                           //movaps        %xmm9,%xmm6
12033   65,15,40,248,                           //movaps        %xmm8,%xmm7
12034   255,224,                                //jmpq          *%rax
12035 };
12036 
// sk_move_src_dst_sse2: x86-64 machine code for one pipeline stage.
// Copies %xmm0-%xmm3 into %xmm4-%xmm7 (0->4, 1->5, 2->6, 3->7); the source
// registers keep their values.  The stage then jumps to the pointer fetched
// from (%rsi) by lods.
12037 CODE const uint8_t sk_move_src_dst_sse2[] = {
12038   72,173,                                 //lods          %ds:(%rsi),%rax
12039   15,40,224,                              //movaps        %xmm0,%xmm4
12040   15,40,233,                              //movaps        %xmm1,%xmm5
12041   15,40,242,                              //movaps        %xmm2,%xmm6
12042   15,40,251,                              //movaps        %xmm3,%xmm7
12043   255,224,                                //jmpq          *%rax
12044 };
12045 
// sk_move_dst_src_sse2: x86-64 machine code for one pipeline stage.
// Copies %xmm4-%xmm7 into %xmm0-%xmm3 (4->0, 5->1, 6->2, 7->3); %xmm4-%xmm7
// keep their values.  The stage then jumps to the pointer fetched from (%rsi)
// by lods.
12046 CODE const uint8_t sk_move_dst_src_sse2[] = {
12047   72,173,                                 //lods          %ds:(%rsi),%rax
12048   15,40,196,                              //movaps        %xmm4,%xmm0
12049   15,40,205,                              //movaps        %xmm5,%xmm1
12050   15,40,214,                              //movaps        %xmm6,%xmm2
12051   15,40,223,                              //movaps        %xmm7,%xmm3
12052   255,224,                                //jmpq          *%rax
12053 };
12054 
// sk_premul_sse2: x86-64 machine code for one pipeline stage.
// Multiplies %xmm0, %xmm1 and %xmm2 lane-wise by %xmm3; %xmm3 itself is not
// modified.  NOTE(review): presumably %xmm0-2 are r,g,b and %xmm3 is alpha --
// confirm against stage source.  The stage then jumps to the pointer fetched
// from (%rsi) by lods.
12055 CODE const uint8_t sk_premul_sse2[] = {
12056   15,89,195,                              //mulps         %xmm3,%xmm0
12057   15,89,203,                              //mulps         %xmm3,%xmm1
12058   15,89,211,                              //mulps         %xmm3,%xmm2
12059   72,173,                                 //lods          %ds:(%rsi),%rax
12060   255,224,                                //jmpq          *%rax
12061 };
12062 
// sk_unpremul_sse2: x86-64 machine code for one pipeline stage.
// Builds a per-lane scale s = (%xmm3 != 0) ? 1.0 / %xmm3 : 0 and multiplies
// %xmm0, %xmm1 and %xmm2 by it; %xmm3 is not modified.  The divide is a full
// divps, and the zero case is handled by masking the quotient with the
// cmpneqps result so a zero divisor yields a scale of 0 rather than infinity.
// NOTE(review): presumably %xmm0-2 are r,g,b and %xmm3 is alpha -- confirm
// against stage source.  %xmm8 and %xmm9 are scratch.  The stage then jumps to
// the pointer fetched from (%rsi) by lods.
12063 CODE const uint8_t sk_unpremul_sse2[] = {
12064   69,15,87,192,                           //xorps         %xmm8,%xmm8
  // 0x3f800000 is the IEEE-754 bit pattern of 1.0f.
12065   184,0,0,128,63,                         //mov           $0x3f800000,%eax
12066   102,68,15,110,200,                      //movd          %eax,%xmm9
12067   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
12068   68,15,94,203,                           //divps         %xmm3,%xmm9
  // %xmm8 becomes an all-ones mask in lanes where 0 != %xmm3.
12069   68,15,194,195,4,                        //cmpneqps      %xmm3,%xmm8
12070   69,15,84,193,                           //andps         %xmm9,%xmm8
12071   65,15,89,192,                           //mulps         %xmm8,%xmm0
12072   65,15,89,200,                           //mulps         %xmm8,%xmm1
12073   65,15,89,208,                           //mulps         %xmm8,%xmm2
12074   72,173,                                 //lods          %ds:(%rsi),%rax
12075   255,224,                                //jmpq          *%rax
12076 };
12077 
// sk_from_srgb_sse2: x86-64 machine code for one pipeline stage.
// Applies the same piecewise function independently to each lane of %xmm0,
// %xmm1 and %xmm2 (%xmm3 is not modified).  For an input v:
//   lo = v * A                       A = 0x3d9e8391 (~0.0774, about 1/12.92)
//   hi = (B*v + C) * (v*v) + D       B = 0x3e99999a (~0.3)
//                                    C = 0x3f328f5c (~0.6975)
//                                    D = 0x3b23d70a (~0.0025)
//   result = (v < E) ? lo : hi       E = 0x3d6147ae (~0.055)
// The select is branch-free: cmpltps builds a lane mask, then andps / andnps /
// orps merge lo and hi.
// NOTE(review): going by the name and constants this looks like a polynomial
// approximation of the sRGB-to-linear transfer curve -- confirm against the
// stage source.  %xmm8-%xmm15 are scratch.  The stage then jumps to the
// pointer fetched from (%rsi) by lods.
12078 CODE const uint8_t sk_from_srgb_sse2[] = {
  // Constants and the %xmm0 channel (constant setup is interleaved with it).
12079   184,145,131,158,61,                     //mov           $0x3d9e8391,%eax
12080   102,68,15,110,192,                      //movd          %eax,%xmm8
12081   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
12082   69,15,40,232,                           //movaps        %xmm8,%xmm13
12083   68,15,89,232,                           //mulps         %xmm0,%xmm13
12084   68,15,40,224,                           //movaps        %xmm0,%xmm12
12085   69,15,89,228,                           //mulps         %xmm12,%xmm12
12086   184,154,153,153,62,                     //mov           $0x3e99999a,%eax
12087   102,68,15,110,200,                      //movd          %eax,%xmm9
12088   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
12089   184,92,143,50,63,                       //mov           $0x3f328f5c,%eax
12090   102,68,15,110,208,                      //movd          %eax,%xmm10
12091   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
12092   69,15,40,241,                           //movaps        %xmm9,%xmm14
12093   68,15,89,240,                           //mulps         %xmm0,%xmm14
12094   69,15,88,242,                           //addps         %xmm10,%xmm14
12095   184,10,215,35,59,                       //mov           $0x3b23d70a,%eax
12096   102,68,15,110,216,                      //movd          %eax,%xmm11
12097   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
12098   69,15,89,244,                           //mulps         %xmm12,%xmm14
12099   69,15,88,243,                           //addps         %xmm11,%xmm14
  // %xmm12 (v*v for this channel) is consumed above, so it is reused to hold
  // the threshold E for all three channels.
12100   184,174,71,97,61,                       //mov           $0x3d6147ae,%eax
12101   102,68,15,110,224,                      //movd          %eax,%xmm12
12102   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
12103   65,15,194,196,1,                        //cmpltps       %xmm12,%xmm0
12104   68,15,84,232,                           //andps         %xmm0,%xmm13
12105   65,15,85,198,                           //andnps        %xmm14,%xmm0
12106   65,15,86,197,                           //orps          %xmm13,%xmm0
  // %xmm1 channel: lo in %xmm13, v*v in %xmm14, hi in %xmm15.
12107   69,15,40,232,                           //movaps        %xmm8,%xmm13
12108   68,15,89,233,                           //mulps         %xmm1,%xmm13
12109   68,15,40,241,                           //movaps        %xmm1,%xmm14
12110   69,15,89,246,                           //mulps         %xmm14,%xmm14
12111   69,15,40,249,                           //movaps        %xmm9,%xmm15
12112   68,15,89,249,                           //mulps         %xmm1,%xmm15
12113   69,15,88,250,                           //addps         %xmm10,%xmm15
12114   69,15,89,254,                           //mulps         %xmm14,%xmm15
12115   69,15,88,251,                           //addps         %xmm11,%xmm15
12116   65,15,194,204,1,                        //cmpltps       %xmm12,%xmm1
12117   68,15,84,233,                           //andps         %xmm1,%xmm13
12118   65,15,85,207,                           //andnps        %xmm15,%xmm1
12119   65,15,86,205,                           //orps          %xmm13,%xmm1
  // %xmm2 channel: the constant registers %xmm8 (A) and %xmm9 (B) are
  // overwritten in place since no later channel needs them.
12120   68,15,89,194,                           //mulps         %xmm2,%xmm8
12121   68,15,40,234,                           //movaps        %xmm2,%xmm13
12122   69,15,89,237,                           //mulps         %xmm13,%xmm13
12123   68,15,89,202,                           //mulps         %xmm2,%xmm9
12124   69,15,88,202,                           //addps         %xmm10,%xmm9
12125   69,15,89,205,                           //mulps         %xmm13,%xmm9
12126   69,15,88,203,                           //addps         %xmm11,%xmm9
12127   65,15,194,212,1,                        //cmpltps       %xmm12,%xmm2
12128   68,15,84,194,                           //andps         %xmm2,%xmm8
12129   65,15,85,209,                           //andnps        %xmm9,%xmm2
12130   65,15,86,208,                           //orps          %xmm8,%xmm2
12131   72,173,                                 //lods          %ds:(%rsi),%rax
12132   255,224,                                //jmpq          *%rax
12133 };
12134 
// sk_to_srgb_sse2: x86-64 SSE2 machine code (disassembly in the trailing comments).
// Applies the same piecewise curve independently to xmm0, xmm1 and xmm2; xmm3 is
// not touched. For each input vector x the code computes
//   lo = 12.46f * x
//   hi = min(1.0f, 0.411192f * rsqrtps(rsqrtps(x)) + (0.689206f * rcpps(rsqrtps(x)) - 0.0988f))
// and keeps lo in lanes where x < 0.0043f, hi in all other lanes.
// rsqrtps/rcpps are the hardware approximations, so rcpps(rsqrtps(x)) stands in for
// sqrt(x) and rsqrtps(rsqrtps(x)) for the fourth root of x.
// NOTE(review): presumably xmm0..xmm2 are r,g,b and this is the linear -> sRGB
// encoding the name suggests; confirm against the stage's C++ source.
// The final lods/jmpq pair loads the next 8-byte entry at (%rsi) into %rax and jumps to it.
12135 CODE const uint8_t sk_to_srgb_sse2[] = {
        // xmm0: start the sqrt (xmm15) and fourth-root (xmm13) approximations.
12136   68,15,82,192,                           //rsqrtps       %xmm0,%xmm8
12137   69,15,83,248,                           //rcpps         %xmm8,%xmm15
12138   69,15,82,232,                           //rsqrtps       %xmm8,%xmm13
        // xmm8 = splat(0x41475c29) ~ 12.46f; xmm14 = 12.46f * xmm0 (the "lo" branch).
12139   184,41,92,71,65,                        //mov           $0x41475c29,%eax
12140   102,68,15,110,192,                      //movd          %eax,%xmm8
12141   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
12142   69,15,40,240,                           //movaps        %xmm8,%xmm14
12143   68,15,89,240,                           //mulps         %xmm0,%xmm14
        // xmm9 = splat(1.0f), xmm10 = splat(~0.411192f), xmm11 = splat(~0.689206f).
12144   184,0,0,128,63,                         //mov           $0x3f800000,%eax
12145   102,68,15,110,200,                      //movd          %eax,%xmm9
12146   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
12147   184,194,135,210,62,                     //mov           $0x3ed287c2,%eax
12148   102,68,15,110,208,                      //movd          %eax,%xmm10
12149   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
12150   184,206,111,48,63,                      //mov           $0x3f306fce,%eax
12151   102,68,15,110,216,                      //movd          %eax,%xmm11
12152   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
        // xmm12 = splat(-0.0988f): 0x3dca57a8 (~0.0988f) with the sign bit flipped by the xor.
12153   184,168,87,202,61,                      //mov           $0x3dca57a8,%eax
12154   53,0,0,0,128,                           //xor           $0x80000000,%eax
12155   102,68,15,110,224,                      //movd          %eax,%xmm12
12156   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
        // xmm15 = min(1.0f, polynomial) for xmm0 (the "hi" branch).
12157   69,15,89,251,                           //mulps         %xmm11,%xmm15
12158   69,15,88,252,                           //addps         %xmm12,%xmm15
12159   69,15,89,234,                           //mulps         %xmm10,%xmm13
12160   69,15,88,239,                           //addps         %xmm15,%xmm13
12161   69,15,40,249,                           //movaps        %xmm9,%xmm15
12162   69,15,93,253,                           //minps         %xmm13,%xmm15
        // xmm13 = splat(0x3b8ce704) ~ 0.0043f, the threshold between the two branches.
12163   184,4,231,140,59,                       //mov           $0x3b8ce704,%eax
12164   102,68,15,110,232,                      //movd          %eax,%xmm13
12165   69,15,198,237,0,                        //shufps        $0x0,%xmm13,%xmm13
        // Branch-free select: xmm0 becomes the (x < 0.0043f) mask, then mask ? lo : hi.
12166   65,15,194,197,1,                        //cmpltps       %xmm13,%xmm0
12167   68,15,84,240,                           //andps         %xmm0,%xmm14
12168   65,15,85,199,                           //andnps        %xmm15,%xmm0
12169   65,15,86,198,                           //orps          %xmm14,%xmm0
        // Same computation for xmm1, reusing the constants in xmm8..xmm13.
12170   68,15,82,241,                           //rsqrtps       %xmm1,%xmm14
12171   69,15,83,254,                           //rcpps         %xmm14,%xmm15
12172   69,15,82,246,                           //rsqrtps       %xmm14,%xmm14
12173   69,15,89,251,                           //mulps         %xmm11,%xmm15
12174   69,15,88,252,                           //addps         %xmm12,%xmm15
12175   69,15,89,242,                           //mulps         %xmm10,%xmm14
12176   69,15,88,247,                           //addps         %xmm15,%xmm14
12177   69,15,40,249,                           //movaps        %xmm9,%xmm15
12178   69,15,93,254,                           //minps         %xmm14,%xmm15
12179   69,15,40,240,                           //movaps        %xmm8,%xmm14
12180   68,15,89,241,                           //mulps         %xmm1,%xmm14
12181   65,15,194,205,1,                        //cmpltps       %xmm13,%xmm1
12182   68,15,84,241,                           //andps         %xmm1,%xmm14
12183   65,15,85,207,                           //andnps        %xmm15,%xmm1
12184   65,15,86,206,                           //orps          %xmm14,%xmm1
        // Same computation for xmm2; this is the last use of the constants, so
        // xmm11, xmm9 and xmm8 are overwritten in place instead of being copied.
12185   68,15,82,242,                           //rsqrtps       %xmm2,%xmm14
12186   69,15,83,254,                           //rcpps         %xmm14,%xmm15
12187   69,15,89,251,                           //mulps         %xmm11,%xmm15
12188   69,15,88,252,                           //addps         %xmm12,%xmm15
12189   69,15,82,222,                           //rsqrtps       %xmm14,%xmm11
12190   69,15,89,218,                           //mulps         %xmm10,%xmm11
12191   69,15,88,223,                           //addps         %xmm15,%xmm11
12192   69,15,93,203,                           //minps         %xmm11,%xmm9
12193   68,15,89,194,                           //mulps         %xmm2,%xmm8
12194   65,15,194,213,1,                        //cmpltps       %xmm13,%xmm2
12195   68,15,84,194,                           //andps         %xmm2,%xmm8
12196   65,15,85,209,                           //andnps        %xmm9,%xmm2
12197   65,15,86,208,                           //orps          %xmm8,%xmm2
        // Fetch the next entry from the stream at (%rsi) and jump to it.
12198   72,173,                                 //lods          %ds:(%rsi),%rax
12199   255,224,                                //jmpq          *%rax
12200 };
12201 
// sk_scale_1_float_sse2: x86-64 SSE2 machine code (disassembly in the trailing comments).
// Fetches a pointer from the stream at (%rsi), reads one float through it, broadcasts
// that float to all four lanes of xmm8, and multiplies xmm0, xmm1, xmm2 and xmm3 by it.
// The final lods/jmpq pair loads the next 8-byte entry at (%rsi) into %rax and jumps to it.
12202 CODE const uint8_t sk_scale_1_float_sse2[] = {
12203   72,173,                                 //lods          %ds:(%rsi),%rax
        // xmm8 = splat(*(float*)rax)
12204   243,68,15,16,0,                         //movss         (%rax),%xmm8
12205   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
12206   65,15,89,192,                           //mulps         %xmm8,%xmm0
12207   65,15,89,200,                           //mulps         %xmm8,%xmm1
12208   65,15,89,208,                           //mulps         %xmm8,%xmm2
12209   65,15,89,216,                           //mulps         %xmm8,%xmm3
12210   72,173,                                 //lods          %ds:(%rsi),%rax
12211   255,224,                                //jmpq          *%rax
12212 };
12213 
// sk_scale_u8_sse2: x86-64 SSE2 machine code (disassembly in the trailing comments).
// Fetches a pointer from the stream at (%rsi) and dereferences it to get a byte base
// address. Loads 4 bytes at base + %rdi, zero-extends each byte to 32 bits, converts
// to float and multiplies by 0x3b808081 (~1/255). xmm0..xmm3 are then each multiplied
// by that per-lane factor.
// NOTE(review): %rdi looks like the index of the first of four pixels and the bytes
// like 8-bit coverage; confirm against the pipeline driver.
// The final lods/jmpq pair loads the next 8-byte entry at (%rsi) into %rax and jumps to it.
12214 CODE const uint8_t sk_scale_u8_sse2[] = {
12215   72,173,                                 //lods          %ds:(%rsi),%rax
12216   72,139,0,                               //mov           (%rax),%rax
12217   102,68,15,110,4,56,                     //movd          (%rax,%rdi,1),%xmm8
        // Unpacking against a zeroed xmm9 zero-extends u8 -> u16 -> u32.
12218   102,69,15,239,201,                      //pxor          %xmm9,%xmm9
12219   102,69,15,96,193,                       //punpcklbw     %xmm9,%xmm8
12220   102,69,15,97,193,                       //punpcklwd     %xmm9,%xmm8
12221   69,15,91,192,                           //cvtdq2ps      %xmm8,%xmm8
        // xmm9 = float(byte) * ~(1/255)
12222   184,129,128,128,59,                     //mov           $0x3b808081,%eax
12223   102,68,15,110,200,                      //movd          %eax,%xmm9
12224   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
12225   69,15,89,200,                           //mulps         %xmm8,%xmm9
12226   65,15,89,193,                           //mulps         %xmm9,%xmm0
12227   65,15,89,201,                           //mulps         %xmm9,%xmm1
12228   65,15,89,209,                           //mulps         %xmm9,%xmm2
12229   65,15,89,217,                           //mulps         %xmm9,%xmm3
12230   72,173,                                 //lods          %ds:(%rsi),%rax
12231   255,224,                                //jmpq          *%rax
12232 };
12233 
// sk_lerp_1_float_sse2: x86-64 SSE2 machine code (disassembly in the trailing comments).
// Fetches a pointer from the stream at (%rsi), reads one float t through it and
// broadcasts it into xmm8. Then, pairing xmm0/xmm4, xmm1/xmm5, xmm2/xmm6, xmm3/xmm7:
//   xmmN = (xmmN - xmmM) * t + xmmM
// NOTE(review): presumably xmm0..3 are the source color and xmm4..7 the destination
// color; confirm against the stage's C++ source.
// The final lods/jmpq pair loads the next 8-byte entry at (%rsi) into %rax and jumps to it.
12234 CODE const uint8_t sk_lerp_1_float_sse2[] = {
12235   72,173,                                 //lods          %ds:(%rsi),%rax
        // xmm8 = splat(*(float*)rax)
12236   243,68,15,16,0,                         //movss         (%rax),%xmm8
12237   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
12238   15,92,196,                              //subps         %xmm4,%xmm0
12239   65,15,89,192,                           //mulps         %xmm8,%xmm0
12240   15,88,196,                              //addps         %xmm4,%xmm0
12241   15,92,205,                              //subps         %xmm5,%xmm1
12242   65,15,89,200,                           //mulps         %xmm8,%xmm1
12243   15,88,205,                              //addps         %xmm5,%xmm1
12244   15,92,214,                              //subps         %xmm6,%xmm2
12245   65,15,89,208,                           //mulps         %xmm8,%xmm2
12246   15,88,214,                              //addps         %xmm6,%xmm2
12247   15,92,223,                              //subps         %xmm7,%xmm3
12248   65,15,89,216,                           //mulps         %xmm8,%xmm3
12249   15,88,223,                              //addps         %xmm7,%xmm3
12250   72,173,                                 //lods          %ds:(%rsi),%rax
12251   255,224,                                //jmpq          *%rax
12252 };
12253 
// sk_lerp_u8_sse2: x86-64 SSE2 machine code (disassembly in the trailing comments).
// Fetches a pointer from the stream at (%rsi) and dereferences it to get a byte base
// address. Loads 4 bytes at base + %rdi, zero-extends each to 32 bits, converts to
// float and multiplies by 0x3b808081 (~1/255) to form a per-lane factor t in xmm9.
// Then, pairing xmm0/xmm4, xmm1/xmm5, xmm2/xmm6, xmm3/xmm7:
//   xmmN = (xmmN - xmmM) * t + xmmM
// NOTE(review): presumably the bytes are 8-bit coverage, xmm0..3 the source color and
// xmm4..7 the destination color; confirm against the stage's C++ source.
// The final lods/jmpq pair loads the next 8-byte entry at (%rsi) into %rax and jumps to it.
12254 CODE const uint8_t sk_lerp_u8_sse2[] = {
12255   72,173,                                 //lods          %ds:(%rsi),%rax
12256   72,139,0,                               //mov           (%rax),%rax
12257   102,68,15,110,4,56,                     //movd          (%rax,%rdi,1),%xmm8
        // Unpacking against a zeroed xmm9 zero-extends u8 -> u16 -> u32.
12258   102,69,15,239,201,                      //pxor          %xmm9,%xmm9
12259   102,69,15,96,193,                       //punpcklbw     %xmm9,%xmm8
12260   102,69,15,97,193,                       //punpcklwd     %xmm9,%xmm8
12261   69,15,91,192,                           //cvtdq2ps      %xmm8,%xmm8
        // xmm9 = float(byte) * ~(1/255)
12262   184,129,128,128,59,                     //mov           $0x3b808081,%eax
12263   102,68,15,110,200,                      //movd          %eax,%xmm9
12264   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
12265   69,15,89,200,                           //mulps         %xmm8,%xmm9
12266   15,92,196,                              //subps         %xmm4,%xmm0
12267   65,15,89,193,                           //mulps         %xmm9,%xmm0
12268   15,88,196,                              //addps         %xmm4,%xmm0
12269   15,92,205,                              //subps         %xmm5,%xmm1
12270   65,15,89,201,                           //mulps         %xmm9,%xmm1
12271   15,88,205,                              //addps         %xmm5,%xmm1
12272   15,92,214,                              //subps         %xmm6,%xmm2
12273   65,15,89,209,                           //mulps         %xmm9,%xmm2
12274   15,88,214,                              //addps         %xmm6,%xmm2
12275   15,92,223,                              //subps         %xmm7,%xmm3
12276   65,15,89,217,                           //mulps         %xmm9,%xmm3
12277   15,88,223,                              //addps         %xmm7,%xmm3
12278   72,173,                                 //lods          %ds:(%rsi),%rax
12279   255,224,                                //jmpq          *%rax
12280 };
12281 
// sk_lerp_565_sse2: x86-64 SSE2 machine code (disassembly in the trailing comments).
// Fetches a pointer from the stream at (%rsi) and dereferences it to get a base
// address. Loads 8 bytes (four 16-bit values) at base + %rdi*2 and zero-extends them
// to 32 bits. Three per-lane factors are extracted from each value:
//   xmm10 = float(v & 0xf800) * 0x37842108   (~1/63488, i.e. 1/(31<<11))
//   xmm11 = float(v & 0x07e0) * 0x3a020821   (~1/2016,  i.e. 1/(63<<5))
//   xmm3  = float(v & 0x001f) * 0x3d042108   (~1/31)
// and used to interpolate the first three registers:
//   xmm0 = (xmm0 - xmm4) * xmm10 + xmm4
//   xmm1 = (xmm1 - xmm5) * xmm11 + xmm5
//   xmm2 = (xmm2 - xmm6) * xmm3  + xmm6
// xmm3 is used as scratch throughout and is finally overwritten with splat(1.0f);
// its incoming value and xmm7 are not read.
// NOTE(review): presumably the 16-bit values are RGB565 coverage and xmm0..2 / xmm4..6
// are source / destination r,g,b; confirm against the stage's C++ source.
// The final lods/jmpq pair loads the next 8-byte entry at (%rsi) into %rax and jumps to it.
12282 CODE const uint8_t sk_lerp_565_sse2[] = {
12283   72,173,                                 //lods          %ds:(%rsi),%rax
12284   72,139,0,                               //mov           (%rax),%rax
12285   243,68,15,126,4,120,                    //movq          (%rax,%rdi,2),%xmm8
        // Unpacking against a zeroed xmm3 zero-extends u16 -> u32.
12286   102,15,239,219,                         //pxor          %xmm3,%xmm3
12287   102,68,15,97,195,                       //punpcklwd     %xmm3,%xmm8
        // Top 5 bits -> xmm10.
12288   184,0,248,0,0,                          //mov           $0xf800,%eax
12289   102,15,110,216,                         //movd          %eax,%xmm3
12290   102,15,112,219,0,                       //pshufd        $0x0,%xmm3,%xmm3
12291   102,65,15,219,216,                      //pand          %xmm8,%xmm3
12292   68,15,91,203,                           //cvtdq2ps      %xmm3,%xmm9
12293   184,8,33,132,55,                        //mov           $0x37842108,%eax
12294   102,68,15,110,208,                      //movd          %eax,%xmm10
12295   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
12296   69,15,89,209,                           //mulps         %xmm9,%xmm10
        // Middle 6 bits -> xmm11.
12297   184,224,7,0,0,                          //mov           $0x7e0,%eax
12298   102,15,110,216,                         //movd          %eax,%xmm3
12299   102,15,112,219,0,                       //pshufd        $0x0,%xmm3,%xmm3
12300   102,65,15,219,216,                      //pand          %xmm8,%xmm3
12301   68,15,91,203,                           //cvtdq2ps      %xmm3,%xmm9
12302   184,33,8,2,58,                          //mov           $0x3a020821,%eax
12303   102,68,15,110,216,                      //movd          %eax,%xmm11
12304   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
12305   69,15,89,217,                           //mulps         %xmm9,%xmm11
        // Low 5 bits -> xmm3.
12306   184,31,0,0,0,                           //mov           $0x1f,%eax
12307   102,15,110,216,                         //movd          %eax,%xmm3
12308   102,15,112,219,0,                       //pshufd        $0x0,%xmm3,%xmm3
12309   102,65,15,219,216,                      //pand          %xmm8,%xmm3
12310   68,15,91,195,                           //cvtdq2ps      %xmm3,%xmm8
12311   184,8,33,4,61,                          //mov           $0x3d042108,%eax
12312   102,15,110,216,                         //movd          %eax,%xmm3
12313   15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
12314   65,15,89,216,                           //mulps         %xmm8,%xmm3
        // Interpolate xmm0..xmm2 towards xmm4..xmm6 with the three factors.
12315   15,92,196,                              //subps         %xmm4,%xmm0
12316   65,15,89,194,                           //mulps         %xmm10,%xmm0
12317   15,88,196,                              //addps         %xmm4,%xmm0
12318   15,92,205,                              //subps         %xmm5,%xmm1
12319   65,15,89,203,                           //mulps         %xmm11,%xmm1
12320   15,88,205,                              //addps         %xmm5,%xmm1
12321   15,92,214,                              //subps         %xmm6,%xmm2
12322   15,89,211,                              //mulps         %xmm3,%xmm2
12323   15,88,214,                              //addps         %xmm6,%xmm2
        // xmm3 = splat(1.0f)
12324   184,0,0,128,63,                         //mov           $0x3f800000,%eax
12325   102,15,110,216,                         //movd          %eax,%xmm3
12326   15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
12327   72,173,                                 //lods          %ds:(%rsi),%rax
12328   255,224,                                //jmpq          *%rax
12329 };
12330 
// sk_load_tables_sse2: x86-64 SSE2 machine code (disassembly in the trailing comments).
// Fetches a context pointer from the stream at (%rsi). From that context it reads a
// pixel base pointer at offset 0 and three table pointers at offsets 0x8, 0x10 and 0x18.
// Loads 16 bytes (four 32-bit pixels) at base + %rdi*4 and, per lane, produces
//   xmm0 = table@0x08[ px        & 0xff]
//   xmm1 = table@0x10[(px >>  8) & 0xff]
//   xmm2 = table@0x18[(px >> 16) & 0xff]
//   xmm3 = float(px >> 24) * 0x3b808081   (~1/255)
// where each table is read as an array of 4-byte floats (movss with scale 4).
// SSE2 has no gather instruction, so the four indices of each channel are moved
// through general-purpose registers and the loaded floats re-interleaved with unpcklps.
// NOTE(review): presumably the tables are per-channel r,g,b lookup tables and the
// pixels are 8888 with alpha in the top byte; confirm against the stage's C++ source.
// The final lods/jmpq pair loads the next 8-byte entry at (%rsi) into %rax and jumps to it.
12331 CODE const uint8_t sk_load_tables_sse2[] = {
12332   72,173,                                 //lods          %ds:(%rsi),%rax
12333   72,139,8,                               //mov           (%rax),%rcx
12334   76,139,64,8,                            //mov           0x8(%rax),%r8
12335   243,68,15,111,4,185,                    //movdqu        (%rcx,%rdi,4),%xmm8
        // xmm0 = px & 0xff, xmm9 = (px >> 8) & 0xff, xmm10 = (px >> 16) & 0xff.
12336   185,255,0,0,0,                          //mov           $0xff,%ecx
12337   102,15,110,193,                         //movd          %ecx,%xmm0
12338   102,15,112,192,0,                       //pshufd        $0x0,%xmm0,%xmm0
12339   102,69,15,111,200,                      //movdqa        %xmm8,%xmm9
12340   102,65,15,114,209,8,                    //psrld         $0x8,%xmm9
12341   102,68,15,219,200,                      //pand          %xmm0,%xmm9
12342   102,69,15,111,208,                      //movdqa        %xmm8,%xmm10
12343   102,65,15,114,210,16,                   //psrld         $0x10,%xmm10
12344   102,68,15,219,208,                      //pand          %xmm0,%xmm10
12345   102,65,15,219,192,                      //pand          %xmm8,%xmm0
        // Lookups through the table at context+0x8 (%r8) -> xmm0.
        // pshufd $0x4e swaps the 64-bit halves so lanes 2,3 can be read with movq.
12346   102,15,112,216,78,                      //pshufd        $0x4e,%xmm0,%xmm3
12347   102,72,15,126,217,                      //movq          %xmm3,%rcx
12348   65,137,201,                             //mov           %ecx,%r9d
12349   72,193,233,32,                          //shr           $0x20,%rcx
12350   102,73,15,126,194,                      //movq          %xmm0,%r10
12351   69,137,211,                             //mov           %r10d,%r11d
12352   73,193,234,32,                          //shr           $0x20,%r10
12353   243,67,15,16,28,144,                    //movss         (%r8,%r10,4),%xmm3
12354   243,65,15,16,4,136,                     //movss         (%r8,%rcx,4),%xmm0
12355   15,20,216,                              //unpcklps      %xmm0,%xmm3
12356   243,67,15,16,4,152,                     //movss         (%r8,%r11,4),%xmm0
12357   243,67,15,16,12,136,                    //movss         (%r8,%r9,4),%xmm1
12358   15,20,193,                              //unpcklps      %xmm1,%xmm0
12359   15,20,195,                              //unpcklps      %xmm3,%xmm0
        // Lookups through the table at context+0x10 (%r8) -> xmm1.
        // Every lane is already <= 0xff, so the 0xffffff masks leave the indices
        // unchanged, and "shr $0x1e" leaves (lane 1 index) * 4, which is why that one
        // load uses scale 1 instead of 4.
12360   76,139,64,16,                           //mov           0x10(%rax),%r8
12361   102,65,15,112,201,78,                   //pshufd        $0x4e,%xmm9,%xmm1
12362   102,73,15,126,202,                      //movq          %xmm1,%r10
12363   77,137,209,                             //mov           %r10,%r9
12364   73,193,233,32,                          //shr           $0x20,%r9
12365   102,76,15,126,201,                      //movq          %xmm9,%rcx
12366   65,137,203,                             //mov           %ecx,%r11d
12367   65,129,227,255,255,255,0,               //and           $0xffffff,%r11d
12368   72,193,233,30,                          //shr           $0x1e,%rcx
12369   65,129,226,255,255,255,0,               //and           $0xffffff,%r10d
12370   243,65,15,16,28,8,                      //movss         (%r8,%rcx,1),%xmm3
12371   243,67,15,16,12,136,                    //movss         (%r8,%r9,4),%xmm1
12372   15,20,217,                              //unpcklps      %xmm1,%xmm3
12373   243,67,15,16,12,152,                    //movss         (%r8,%r11,4),%xmm1
12374   243,67,15,16,20,144,                    //movss         (%r8,%r10,4),%xmm2
12375   15,20,202,                              //unpcklps      %xmm2,%xmm1
12376   15,20,203,                              //unpcklps      %xmm3,%xmm1
        // Lookups through the table at context+0x18 (%r9) -> xmm2. %rax is reused as
        // scratch from here on; the context pointer is no longer needed.
12377   76,139,72,24,                           //mov           0x18(%rax),%r9
12378   102,65,15,112,210,78,                   //pshufd        $0x4e,%xmm10,%xmm2
12379   102,72,15,126,209,                      //movq          %xmm2,%rcx
12380   68,15,183,193,                          //movzwl        %cx,%r8d
12381   72,193,233,32,                          //shr           $0x20,%rcx
12382   102,76,15,126,208,                      //movq          %xmm10,%rax
12383   68,15,183,208,                          //movzwl        %ax,%r10d
12384   72,193,232,30,                          //shr           $0x1e,%rax
12385   243,69,15,16,12,1,                      //movss         (%r9,%rax,1),%xmm9
12386   243,65,15,16,20,137,                    //movss         (%r9,%rcx,4),%xmm2
12387   68,15,20,202,                           //unpcklps      %xmm2,%xmm9
12388   243,67,15,16,20,145,                    //movss         (%r9,%r10,4),%xmm2
12389   243,67,15,16,28,129,                    //movss         (%r9,%r8,4),%xmm3
12390   15,20,211,                              //unpcklps      %xmm3,%xmm2
12391   65,15,20,209,                           //unpcklps      %xmm9,%xmm2
        // xmm3 = float(px >> 24) * ~(1/255); no table is used for the top byte.
12392   102,65,15,114,208,24,                   //psrld         $0x18,%xmm8
12393   69,15,91,192,                           //cvtdq2ps      %xmm8,%xmm8
12394   184,129,128,128,59,                     //mov           $0x3b808081,%eax
12395   102,15,110,216,                         //movd          %eax,%xmm3
12396   15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
12397   65,15,89,216,                           //mulps         %xmm8,%xmm3
12398   72,173,                                 //lods          %ds:(%rsi),%rax
12399   255,224,                                //jmpq          *%rax
12400 };
12401 
// sk_load_a8_sse2: x86-64 SSE2 machine code (disassembly in the trailing comments).
// Fetches a pointer from the stream at (%rsi) and dereferences it to get a byte base
// address. Loads 4 bytes at base + %rdi, zero-extends each to 32 bits, converts to
// float and multiplies by 0x3b808081 (~1/255); the result goes to xmm3.
// xmm0, xmm1 and xmm2 are set to zero.
// NOTE(review): presumably the bytes are 8-bit alpha and xmm0..2 are r,g,b; confirm
// against the stage's C++ source.
// The next stream entry is loaded into %rax before the registers are zeroed, and the
// final jmpq jumps to it.
12402 CODE const uint8_t sk_load_a8_sse2[] = {
12403   72,173,                                 //lods          %ds:(%rsi),%rax
12404   72,139,0,                               //mov           (%rax),%rax
12405   102,15,110,4,56,                        //movd          (%rax,%rdi,1),%xmm0
        // Unpacking against a zeroed xmm1 zero-extends u8 -> u16 -> u32.
12406   102,15,239,201,                         //pxor          %xmm1,%xmm1
12407   102,15,96,193,                          //punpcklbw     %xmm1,%xmm0
12408   102,15,97,193,                          //punpcklwd     %xmm1,%xmm0
12409   15,91,192,                              //cvtdq2ps      %xmm0,%xmm0
        // xmm3 = float(byte) * ~(1/255)
12410   184,129,128,128,59,                     //mov           $0x3b808081,%eax
12411   102,15,110,216,                         //movd          %eax,%xmm3
12412   15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
12413   15,89,216,                              //mulps         %xmm0,%xmm3
12414   72,173,                                 //lods          %ds:(%rsi),%rax
        // Zero xmm0..xmm2.
12415   15,87,192,                              //xorps         %xmm0,%xmm0
12416   102,15,239,201,                         //pxor          %xmm1,%xmm1
12417   15,87,210,                              //xorps         %xmm2,%xmm2
12418   255,224,                                //jmpq          *%rax
12419 };
12420 
// sk_store_a8_sse2: x86-64 SSE2 machine code (disassembly in the trailing comments).
// Fetches a pointer from the stream at (%rsi) and dereferences it to get a byte base
// address. Multiplies xmm3 by 255.0f (0x437f0000), converts to 32-bit integers with
// cvtps2dq (rounds per the current MXCSR mode), narrows the four lanes to bytes and
// stores those 4 bytes at base + %rdi. xmm0..xmm3 are left unchanged.
// NOTE(review): presumably xmm3 is alpha in [0,1]; confirm against the stage's C++ source.
// The final lods/jmpq pair loads the next 8-byte entry at (%rsi) into %rax and jumps to it.
12421 CODE const uint8_t sk_store_a8_sse2[] = {
12422   72,173,                                 //lods          %ds:(%rsi),%rax
12423   72,139,0,                               //mov           (%rax),%rax
        // xmm8 = int32(255.0f * xmm3)
12424   185,0,0,127,67,                         //mov           $0x437f0000,%ecx
12425   102,68,15,110,193,                      //movd          %ecx,%xmm8
12426   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
12427   68,15,89,195,                           //mulps         %xmm3,%xmm8
12428   102,69,15,91,192,                       //cvtps2dq      %xmm8,%xmm8
        // The shift pair sign-extends the low 16 bits of each lane, so the signed
        // saturating packssdw passes those 16 bits through; packuswb then saturates
        // the words to unsigned bytes.
12429   102,65,15,114,240,16,                   //pslld         $0x10,%xmm8
12430   102,65,15,114,224,16,                   //psrad         $0x10,%xmm8
12431   102,69,15,107,192,                      //packssdw      %xmm8,%xmm8
12432   102,69,15,103,192,                      //packuswb      %xmm8,%xmm8
12433   102,68,15,126,4,56,                     //movd          %xmm8,(%rax,%rdi,1)
12434   72,173,                                 //lods          %ds:(%rsi),%rax
12435   255,224,                                //jmpq          *%rax
12436 };
12437 
// sk_load_565_sse2: x86-64 SSE2 machine code (disassembly in the trailing comments).
// Fetches a pointer from the stream at (%rsi) and dereferences it to get a base
// address. Loads 8 bytes (four 16-bit values) at base + %rdi*2, zero-extends them to
// 32 bits and, per lane, produces
//   xmm0 = float(v & 0xf800) * 0x37842108   (~1/63488, i.e. 1/(31<<11))
//   xmm1 = float(v & 0x07e0) * 0x3a020821   (~1/2016,  i.e. 1/(63<<5))
//   xmm2 = float(v & 0x001f) * 0x3d042108   (~1/31)
//   xmm3 = 1.0f
// NOTE(review): presumably xmm0..3 are r,g,b,a of RGB565 pixels; confirm against the
// stage's C++ source.
// The final lods/jmpq pair loads the next 8-byte entry at (%rsi) into %rax and jumps to it.
12438 CODE const uint8_t sk_load_565_sse2[] = {
12439   72,173,                                 //lods          %ds:(%rsi),%rax
12440   72,139,0,                               //mov           (%rax),%rax
12441   243,15,126,20,120,                      //movq          (%rax,%rdi,2),%xmm2
        // Unpacking against a zeroed xmm0 zero-extends u16 -> u32.
12442   102,15,239,192,                         //pxor          %xmm0,%xmm0
12443   102,15,97,208,                          //punpcklwd     %xmm0,%xmm2
        // Top 5 bits -> xmm0.
12444   184,0,248,0,0,                          //mov           $0xf800,%eax
12445   102,15,110,192,                         //movd          %eax,%xmm0
12446   102,15,112,192,0,                       //pshufd        $0x0,%xmm0,%xmm0
12447   102,15,219,194,                         //pand          %xmm2,%xmm0
12448   15,91,200,                              //cvtdq2ps      %xmm0,%xmm1
12449   184,8,33,132,55,                        //mov           $0x37842108,%eax
12450   102,15,110,192,                         //movd          %eax,%xmm0
12451   15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
12452   15,89,193,                              //mulps         %xmm1,%xmm0
        // Middle 6 bits -> xmm1.
12453   184,224,7,0,0,                          //mov           $0x7e0,%eax
12454   102,15,110,200,                         //movd          %eax,%xmm1
12455   102,15,112,201,0,                       //pshufd        $0x0,%xmm1,%xmm1
12456   102,15,219,202,                         //pand          %xmm2,%xmm1
12457   15,91,217,                              //cvtdq2ps      %xmm1,%xmm3
12458   184,33,8,2,58,                          //mov           $0x3a020821,%eax
12459   102,15,110,200,                         //movd          %eax,%xmm1
12460   15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
12461   15,89,203,                              //mulps         %xmm3,%xmm1
        // Low 5 bits -> xmm2.
12462   184,31,0,0,0,                           //mov           $0x1f,%eax
12463   102,15,110,216,                         //movd          %eax,%xmm3
12464   102,15,112,219,0,                       //pshufd        $0x0,%xmm3,%xmm3
12465   102,15,219,218,                         //pand          %xmm2,%xmm3
12466   15,91,219,                              //cvtdq2ps      %xmm3,%xmm3
12467   184,8,33,4,61,                          //mov           $0x3d042108,%eax
12468   102,15,110,208,                         //movd          %eax,%xmm2
12469   15,198,210,0,                           //shufps        $0x0,%xmm2,%xmm2
12470   15,89,211,                              //mulps         %xmm3,%xmm2
        // xmm3 = splat(1.0f)
12471   184,0,0,128,63,                         //mov           $0x3f800000,%eax
12472   102,15,110,216,                         //movd          %eax,%xmm3
12473   15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
12474   72,173,                                 //lods          %ds:(%rsi),%rax
12475   255,224,                                //jmpq          *%rax
12476 };
12477 
// sk_store_565_sse2: x86-64 SSE2 machine code (disassembly in the trailing comments).
// Fetches a pointer from the stream at (%rsi) and dereferences it to get a base
// address. Per lane it builds
//   v = (int(xmm0 * 31.0f) << 11) | (int(xmm1 * 63.0f) << 5) | int(xmm2 * 31.0f)
// (0x41f80000 = 31.0f, 0x427c0000 = 63.0f; cvtps2dq rounds per the current MXCSR
// mode), narrows the four lanes to 16 bits and stores the resulting 8 bytes at
// base + %rdi*2. xmm0..xmm3 are left unchanged; xmm3 is not read.
// NOTE(review): presumably xmm0..2 are r,g,b in [0,1]; confirm against the stage's
// C++ source.
// The final lods/jmpq pair loads the next 8-byte entry at (%rsi) into %rax and jumps to it.
12478 CODE const uint8_t sk_store_565_sse2[] = {
12479   72,173,                                 //lods          %ds:(%rsi),%rax
12480   72,139,0,                               //mov           (%rax),%rax
        // xmm9 = int(31.0f * xmm0) << 11
12481   185,0,0,248,65,                         //mov           $0x41f80000,%ecx
12482   102,68,15,110,193,                      //movd          %ecx,%xmm8
12483   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
12484   69,15,40,200,                           //movaps        %xmm8,%xmm9
12485   68,15,89,200,                           //mulps         %xmm0,%xmm9
12486   102,69,15,91,201,                       //cvtps2dq      %xmm9,%xmm9
12487   102,65,15,114,241,11,                   //pslld         $0xb,%xmm9
        // xmm10 = (int(63.0f * xmm1) << 5) | xmm9
12488   185,0,0,124,66,                         //mov           $0x427c0000,%ecx
12489   102,68,15,110,209,                      //movd          %ecx,%xmm10
12490   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
12491   68,15,89,209,                           //mulps         %xmm1,%xmm10
12492   102,69,15,91,210,                       //cvtps2dq      %xmm10,%xmm10
12493   102,65,15,114,242,5,                    //pslld         $0x5,%xmm10
12494   102,69,15,235,209,                      //por           %xmm9,%xmm10
        // xmm8 = int(31.0f * xmm2) | xmm10 (orpd is a plain bitwise OR here).
12495   68,15,89,194,                           //mulps         %xmm2,%xmm8
12496   102,69,15,91,192,                       //cvtps2dq      %xmm8,%xmm8
12497   102,69,15,86,194,                       //orpd          %xmm10,%xmm8
        // The shift pair sign-extends the low 16 bits of each lane, so the signed
        // saturating packssdw passes those 16 bits through unchanged.
12498   102,65,15,114,240,16,                   //pslld         $0x10,%xmm8
12499   102,65,15,114,224,16,                   //psrad         $0x10,%xmm8
12500   102,69,15,107,192,                      //packssdw      %xmm8,%xmm8
12501   102,68,15,214,4,120,                    //movq          %xmm8,(%rax,%rdi,2)
12502   72,173,                                 //lods          %ds:(%rsi),%rax
12503   255,224,                                //jmpq          *%rax
12504 };
12505 
// sk_load_8888_sse2: x86-64 SSE2 machine code (disassembly in the trailing comments).
// Fetches a pointer from the stream at (%rsi) and dereferences it to get a base
// address. Loads 16 bytes (four 32-bit pixels) at base + %rdi*4 with an unaligned
// load and, per lane, produces (k = 0x3b808081, ~1/255)
//   xmm0 = float( px        & 0xff) * k
//   xmm1 = float((px >>  8) & 0xff) * k
//   xmm2 = float((px >> 16) & 0xff) * k
//   xmm3 = float( px >> 24        ) * k
// NOTE(review): presumably xmm0..3 are r,g,b,a; confirm against the stage's C++ source.
// The final lods/jmpq pair loads the next 8-byte entry at (%rsi) into %rax and jumps to it.
12506 CODE const uint8_t sk_load_8888_sse2[] = {
12507   72,173,                                 //lods          %ds:(%rsi),%rax
12508   72,139,0,                               //mov           (%rax),%rax
12509   243,15,111,28,184,                      //movdqu        (%rax,%rdi,4),%xmm3
        // Split the four bytes of each pixel into xmm0, xmm1, xmm2 (xmm3 keeps the raw pixels).
12510   184,255,0,0,0,                          //mov           $0xff,%eax
12511   102,15,110,192,                         //movd          %eax,%xmm0
12512   102,15,112,192,0,                       //pshufd        $0x0,%xmm0,%xmm0
12513   102,15,111,203,                         //movdqa        %xmm3,%xmm1
12514   102,15,114,209,8,                       //psrld         $0x8,%xmm1
12515   102,15,219,200,                         //pand          %xmm0,%xmm1
12516   102,15,111,211,                         //movdqa        %xmm3,%xmm2
12517   102,15,114,210,16,                      //psrld         $0x10,%xmm2
12518   102,15,219,208,                         //pand          %xmm0,%xmm2
12519   102,15,219,195,                         //pand          %xmm3,%xmm0
        // Convert each byte to float and scale by xmm8 = splat(~1/255).
12520   15,91,192,                              //cvtdq2ps      %xmm0,%xmm0
12521   184,129,128,128,59,                     //mov           $0x3b808081,%eax
12522   102,68,15,110,192,                      //movd          %eax,%xmm8
12523   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
12524   65,15,89,192,                           //mulps         %xmm8,%xmm0
12525   15,91,201,                              //cvtdq2ps      %xmm1,%xmm1
12526   65,15,89,200,                           //mulps         %xmm8,%xmm1
12527   15,91,210,                              //cvtdq2ps      %xmm2,%xmm2
12528   65,15,89,208,                           //mulps         %xmm8,%xmm2
12529   102,15,114,211,24,                      //psrld         $0x18,%xmm3
12530   15,91,219,                              //cvtdq2ps      %xmm3,%xmm3
12531   65,15,89,216,                           //mulps         %xmm8,%xmm3
12532   72,173,                                 //lods          %ds:(%rsi),%rax
12533   255,224,                                //jmpq          *%rax
12534 };
12535 
// sk_store_8888_sse2: x86-64 SSE2 machine code (disassembly in the trailing comments).
// Fetches a pointer from the stream at (%rsi) and dereferences it to get a base
// address. Per lane it builds
//   px = int(xmm0 * 255.0f) | int(xmm1 * 255.0f) << 8
//      | int(xmm2 * 255.0f) << 16 | int(xmm3 * 255.0f) << 24
// (0x437f0000 = 255.0f; cvtps2dq rounds per the current MXCSR mode) and stores the
// four 32-bit results (16 bytes) at base + %rdi*4 with an unaligned store.
// There is no clamp or mask in this code: a converted value outside 0..255 would
// spill into the neighbouring byte fields when OR-ed. xmm0..xmm3 are left unchanged.
// NOTE(review): presumably xmm0..3 are r,g,b,a already in [0,1]; confirm against the
// stage's C++ source.
// The final lods/jmpq pair loads the next 8-byte entry at (%rsi) into %rax and jumps to it.
12536 CODE const uint8_t sk_store_8888_sse2[] = {
12537   72,173,                                 //lods          %ds:(%rsi),%rax
12538   72,139,0,                               //mov           (%rax),%rax
        // xmm8 = splat(255.0f)
12539   185,0,0,127,67,                         //mov           $0x437f0000,%ecx
12540   102,68,15,110,193,                      //movd          %ecx,%xmm8
12541   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
        // xmm10 = int(255 * xmm0) | int(255 * xmm1) << 8
12542   69,15,40,200,                           //movaps        %xmm8,%xmm9
12543   68,15,89,200,                           //mulps         %xmm0,%xmm9
12544   102,69,15,91,201,                       //cvtps2dq      %xmm9,%xmm9
12545   69,15,40,208,                           //movaps        %xmm8,%xmm10
12546   68,15,89,209,                           //mulps         %xmm1,%xmm10
12547   102,69,15,91,210,                       //cvtps2dq      %xmm10,%xmm10
12548   102,65,15,114,242,8,                    //pslld         $0x8,%xmm10
12549   102,69,15,235,209,                      //por           %xmm9,%xmm10
        // xmm8 = int(255 * xmm2) << 16 | int(255 * xmm3) << 24 | xmm10
12550   69,15,40,200,                           //movaps        %xmm8,%xmm9
12551   68,15,89,202,                           //mulps         %xmm2,%xmm9
12552   102,69,15,91,201,                       //cvtps2dq      %xmm9,%xmm9
12553   102,65,15,114,241,16,                   //pslld         $0x10,%xmm9
12554   68,15,89,195,                           //mulps         %xmm3,%xmm8
12555   102,69,15,91,192,                       //cvtps2dq      %xmm8,%xmm8
12556   102,65,15,114,240,24,                   //pslld         $0x18,%xmm8
12557   102,69,15,235,193,                      //por           %xmm9,%xmm8
12558   102,69,15,235,194,                      //por           %xmm10,%xmm8
12559   243,68,15,127,4,184,                    //movdqu        %xmm8,(%rax,%rdi,4)
12560   72,173,                                 //lods          %ds:(%rsi),%rax
12561   255,224,                                //jmpq          *%rax
12562 };
12563 
// sk_load_f16_sse2: x86-64 SSE2 machine code (disassembly in the trailing comments).
// Fetches a pointer from the stream at (%rsi) and dereferences it to get a base
// address. Loads 32 bytes (two unaligned 16-byte loads) at base + %rdi*8, i.e. four
// groups of four 16-bit values, and transposes them with punpck{l,h}wd so that the
// 1st, 2nd, 3rd and 4th value of every group end up in xmm0, xmm1, xmm2 and xmm3.
// Each 16-bit value h is widened to a float without any F16C instruction:
//   - h is replaced by 0 when it compares below 0x0400 as a signed 16-bit integer
//     (pcmpgtw is signed, so this zeroes values with the top bit set as well as
//     values under 0x0400);
//   - h is zero-extended to 32 bits and shifted left by 13;
//   - the result is reinterpreted as float and multiplied by 0x77800000 (2^112).
// NOTE(review): presumably the input is IEEE half-float RGBA, the shift aligns the
// half exponent/mantissa with the float fields, and 2^112 rebiases the exponent
// (127 - 15); this would mean half denormals and negative inputs flush to zero.
// Confirm against the stage's C++ source.
// The final lods/jmpq pair loads the next 8-byte entry at (%rsi) into %rax and jumps to it.
12564 CODE const uint8_t sk_load_f16_sse2[] = {
12565   72,173,                                 //lods          %ds:(%rsi),%rax
12566   72,139,0,                               //mov           (%rax),%rax
12567   243,15,111,4,248,                       //movdqu        (%rax,%rdi,8),%xmm0
12568   243,15,111,76,248,16,                   //movdqu        0x10(%rax,%rdi,8),%xmm1
        // 16-bit transpose: afterwards xmm8 holds the 1st values (low half) and 2nd
        // values (high half) of the four groups, xmm2 the 3rd and 4th values.
12569   102,15,111,208,                         //movdqa        %xmm0,%xmm2
12570   102,15,97,209,                          //punpcklwd     %xmm1,%xmm2
12571   102,15,105,193,                         //punpckhwd     %xmm1,%xmm0
12572   102,68,15,111,194,                      //movdqa        %xmm2,%xmm8
12573   102,68,15,97,192,                       //punpcklwd     %xmm0,%xmm8
12574   102,15,105,208,                         //punpckhwd     %xmm0,%xmm2
        // Zero every 16-bit value for which (0x0400 > value) holds as a signed compare.
12575   184,0,4,0,4,                            //mov           $0x4000400,%eax
12576   102,15,110,192,                         //movd          %eax,%xmm0
12577   102,15,112,216,0,                       //pshufd        $0x0,%xmm0,%xmm3
12578   102,15,111,203,                         //movdqa        %xmm3,%xmm1
12579   102,65,15,101,200,                      //pcmpgtw       %xmm8,%xmm1
12580   102,65,15,223,200,                      //pandn         %xmm8,%xmm1
12581   102,15,101,218,                         //pcmpgtw       %xmm2,%xmm3
12582   102,15,223,218,                         //pandn         %xmm2,%xmm3
        // Widen each 16-bit value to 32 bits (unpack against zero), shift left by 13,
        // then multiply as float by xmm9 = splat(0x77800000) = 2^112.
12583   102,69,15,239,192,                      //pxor          %xmm8,%xmm8
12584   102,15,111,193,                         //movdqa        %xmm1,%xmm0
12585   102,65,15,97,192,                       //punpcklwd     %xmm8,%xmm0
12586   102,15,114,240,13,                      //pslld         $0xd,%xmm0
12587   184,0,0,128,119,                        //mov           $0x77800000,%eax
12588   102,15,110,208,                         //movd          %eax,%xmm2
12589   102,68,15,112,202,0,                    //pshufd        $0x0,%xmm2,%xmm9
12590   65,15,89,193,                           //mulps         %xmm9,%xmm0
12591   102,65,15,105,200,                      //punpckhwd     %xmm8,%xmm1
12592   102,15,114,241,13,                      //pslld         $0xd,%xmm1
12593   65,15,89,201,                           //mulps         %xmm9,%xmm1
12594   102,15,111,211,                         //movdqa        %xmm3,%xmm2
12595   102,65,15,97,208,                       //punpcklwd     %xmm8,%xmm2
12596   102,15,114,242,13,                      //pslld         $0xd,%xmm2
12597   65,15,89,209,                           //mulps         %xmm9,%xmm2
12598   102,65,15,105,216,                      //punpckhwd     %xmm8,%xmm3
12599   102,15,114,243,13,                      //pslld         $0xd,%xmm3
12600   65,15,89,217,                           //mulps         %xmm9,%xmm3
12601   72,173,                                 //lods          %ds:(%rsi),%rax
12602   255,224,                                //jmpq          *%rax
12603 };
12604 
// sk_store_f16_sse2: machine code that converts the four float vectors
// %xmm0..%xmm3 to 16-bit values and stores them interleaved, 8 bytes per
// element, for four elements.
//
//   * lods fetches a qword from (%rsi) into %rax; it is dereferenced once more
//     to get the base address.
//   * Each input vector is multiplied by the float whose bits are 0x07800000
//     (2^-112) and the resulting bit pattern is shifted right (logically) by 13.
//   * The %xmm1 results are byte-shifted left by 2 and OR-ed with the %xmm0
//     results; likewise %xmm3 with %xmm2.  This pairs two 16-bit values per
//     dword.
//   * punpckldq/punpckhdq interleave the two pair vectors, and the resulting
//     32 bytes are written to base + %rdi*8.
//   * %xmm0..%xmm3 are left unmodified; %xmm8..%xmm11 are scratch.
//   * The final lods/jmpq fetches the next qword from (%rsi) and jumps to it.
// NOTE(review): the OR-based packing only yields clean 16-bit fields when every
// shifted lane fits in 16 bits (e.g. a set sign bit would land in bit 18) --
// presumably callers guarantee non-negative, in-range inputs; confirm.
12605 CODE const uint8_t sk_store_f16_sse2[] = {
12606   72,173,                                 //lods          %ds:(%rsi),%rax
12607   72,139,0,                               //mov           (%rax),%rax
12608   185,0,0,128,7,                          //mov           $0x7800000,%ecx
12609   102,68,15,110,193,                      //movd          %ecx,%xmm8
12610   102,69,15,112,192,0,                    //pshufd        $0x0,%xmm8,%xmm8
12611   102,69,15,111,200,                      //movdqa        %xmm8,%xmm9
12612   68,15,89,200,                           //mulps         %xmm0,%xmm9
12613   102,65,15,114,209,13,                   //psrld         $0xd,%xmm9
12614   102,69,15,111,208,                      //movdqa        %xmm8,%xmm10
12615   68,15,89,209,                           //mulps         %xmm1,%xmm10
12616   102,65,15,114,210,13,                   //psrld         $0xd,%xmm10
12617   102,69,15,111,216,                      //movdqa        %xmm8,%xmm11
12618   68,15,89,218,                           //mulps         %xmm2,%xmm11
12619   102,65,15,114,211,13,                   //psrld         $0xd,%xmm11
12620   68,15,89,195,                           //mulps         %xmm3,%xmm8
12621   102,65,15,114,208,13,                   //psrld         $0xd,%xmm8
  // Pack: %xmm10 <- (%xmm1 result << 16 bits) | %xmm0 result,
  //       %xmm8  <- (%xmm3 result << 16 bits) | %xmm2 result.
12622   102,65,15,115,250,2,                    //pslldq        $0x2,%xmm10
12623   102,69,15,235,209,                      //por           %xmm9,%xmm10
12624   102,65,15,115,248,2,                    //pslldq        $0x2,%xmm8
12625   102,69,15,235,195,                      //por           %xmm11,%xmm8
12626   102,69,15,111,202,                      //movdqa        %xmm10,%xmm9
12627   102,69,15,98,200,                       //punpckldq     %xmm8,%xmm9
12628   243,68,15,127,12,248,                   //movdqu        %xmm9,(%rax,%rdi,8)
12629   102,69,15,106,208,                      //punpckhdq     %xmm8,%xmm10
12630   243,68,15,127,84,248,16,                //movdqu        %xmm10,0x10(%rax,%rdi,8)
12631   72,173,                                 //lods          %ds:(%rsi),%rax
12632   255,224,                                //jmpq          *%rax
12633 };
12634 
// sk_store_f32_sse2: machine code that transposes the 4x4 block of floats held
// in %xmm0..%xmm3 and stores it as four 16-byte elements.
//
//   * lods fetches a qword from (%rsi) into %rax; it is dereferenced once more
//     to get the base address.
//   * %rcx = %rdi << 4, i.e. a byte offset of 16 bytes per element.
//   * unpcklps/unpckhps followed by unpcklpd/movhlps regroup the lanes so that
//     output vector i holds lane i of %xmm0, %xmm1, %xmm2, %xmm3 in that order.
//   * The four vectors are written with unaligned stores to base + %rcx + 0,
//     + 0x10, + 0x20 and + 0x30 (64 bytes total).
//   * %xmm0..%xmm3 are left unmodified; %xmm8..%xmm12 are scratch.
//   * The final lods/jmpq fetches the next qword from (%rsi) and jumps to it.
12635 CODE const uint8_t sk_store_f32_sse2[] = {
12636   72,173,                                 //lods          %ds:(%rsi),%rax
12637   72,139,0,                               //mov           (%rax),%rax
12638   72,137,249,                             //mov           %rdi,%rcx
12639   72,193,225,4,                           //shl           $0x4,%rcx
12640   68,15,40,192,                           //movaps        %xmm0,%xmm8
12641   68,15,40,200,                           //movaps        %xmm0,%xmm9
12642   68,15,20,201,                           //unpcklps      %xmm1,%xmm9
12643   68,15,40,210,                           //movaps        %xmm2,%xmm10
12644   68,15,40,218,                           //movaps        %xmm2,%xmm11
12645   68,15,20,219,                           //unpcklps      %xmm3,%xmm11
12646   68,15,21,193,                           //unpckhps      %xmm1,%xmm8
12647   68,15,21,211,                           //unpckhps      %xmm3,%xmm10
12648   69,15,40,225,                           //movaps        %xmm9,%xmm12
12649   102,69,15,20,227,                       //unpcklpd      %xmm11,%xmm12
12650   69,15,18,217,                           //movhlps       %xmm9,%xmm11
12651   69,15,40,200,                           //movaps        %xmm8,%xmm9
12652   102,69,15,20,202,                       //unpcklpd      %xmm10,%xmm9
12653   69,15,18,208,                           //movhlps       %xmm8,%xmm10
  // %xmm12, %xmm11, %xmm9, %xmm10 now hold transposed rows 0, 1, 2, 3.
12654   102,68,15,17,36,8,                      //movupd        %xmm12,(%rax,%rcx,1)
12655   68,15,17,92,8,16,                       //movups        %xmm11,0x10(%rax,%rcx,1)
12656   102,68,15,17,76,8,32,                   //movupd        %xmm9,0x20(%rax,%rcx,1)
12657   68,15,17,84,8,48,                       //movups        %xmm10,0x30(%rax,%rcx,1)
12658   72,173,                                 //lods          %ds:(%rsi),%rax
12659   255,224,                                //jmpq          *%rax
12660 };
12661 
// sk_clamp_x_sse2: machine code that clamps every lane of %xmm0.
//
//   * lods fetches a qword from (%rsi) into %rax; a single float ("limit") is
//     read from that address and broadcast to all lanes.
//   * Lower bound: max(0.0, x) via maxps against a zeroed register.
//   * Upper bound: the limit's bit pattern minus one (all-ones from pcmpeqd is
//     added with paddd), which for a positive finite limit is the largest
//     float strictly below it; minps applies it.
//   * Result is written back to %xmm0; %xmm8 and %xmm9 are scratch.
//   * The second lods/jmpq fetches the next qword from (%rsi) and jumps to it.
12662 CODE const uint8_t sk_clamp_x_sse2[] = {
12663   72,173,                                 //lods          %ds:(%rsi),%rax
12664   69,15,87,192,                           //xorps         %xmm8,%xmm8
12665   68,15,95,192,                           //maxps         %xmm0,%xmm8
12666   243,68,15,16,8,                         //movss         (%rax),%xmm9
12667   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
12668   102,15,118,192,                         //pcmpeqd       %xmm0,%xmm0
12669   102,65,15,254,193,                      //paddd         %xmm9,%xmm0
12670   68,15,93,192,                           //minps         %xmm0,%xmm8
12671   72,173,                                 //lods          %ds:(%rsi),%rax
12672   65,15,40,192,                           //movaps        %xmm8,%xmm0
12673   255,224,                                //jmpq          *%rax
12674 };
12675 
// sk_clamp_y_sse2: machine code that clamps every lane of %xmm1; same sequence
// as sk_clamp_x_sse2 but operating on %xmm1 instead of %xmm0.
//
//   * lods fetches a qword from (%rsi) into %rax; a single float ("limit") is
//     read from that address and broadcast to all lanes.
//   * Lower bound: max(0.0, y) via maxps against a zeroed register.
//   * Upper bound: the limit's bit pattern minus one (all-ones from pcmpeqd is
//     added with paddd), which for a positive finite limit is the largest
//     float strictly below it; minps applies it.
//   * Result is written back to %xmm1; %xmm8 and %xmm9 are scratch.
//   * The second lods/jmpq fetches the next qword from (%rsi) and jumps to it.
12676 CODE const uint8_t sk_clamp_y_sse2[] = {
12677   72,173,                                 //lods          %ds:(%rsi),%rax
12678   69,15,87,192,                           //xorps         %xmm8,%xmm8
12679   68,15,95,193,                           //maxps         %xmm1,%xmm8
12680   243,68,15,16,8,                         //movss         (%rax),%xmm9
12681   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
12682   102,15,118,201,                         //pcmpeqd       %xmm1,%xmm1
12683   102,65,15,254,201,                      //paddd         %xmm9,%xmm1
12684   68,15,93,193,                           //minps         %xmm1,%xmm8
12685   72,173,                                 //lods          %ds:(%rsi),%rax
12686   65,15,40,200,                           //movaps        %xmm8,%xmm1
12687   255,224,                                //jmpq          *%rax
12688 };
12689 
// sk_repeat_x_sse2: machine code that wraps every lane of %xmm0 into a
// repeating interval of length "limit".
//
//   * lods fetches a qword from (%rsi) into %rax; a single float ("limit") is
//     read from that address and broadcast into %xmm8.
//   * q = x / limit.  floor(q) is built by hand: truncate with cvttps2dq /
//     cvtdq2ps, then subtract 1.0 in the lanes where q < trunc(q).
//   * x = x - floor(q) * limit.
//   * The result is then capped with minps at the limit's bit pattern minus
//     one (for a positive finite limit, the largest float strictly below it).
//   * Result is in %xmm0; %xmm8..%xmm11 are scratch.
//   * The final lods/jmpq fetches the next qword from (%rsi) and jumps to it.
12690 CODE const uint8_t sk_repeat_x_sse2[] = {
12691   72,173,                                 //lods          %ds:(%rsi),%rax
12692   243,68,15,16,0,                         //movss         (%rax),%xmm8
12693   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
12694   68,15,40,200,                           //movaps        %xmm0,%xmm9
12695   69,15,94,200,                           //divps         %xmm8,%xmm9
12696   243,69,15,91,209,                       //cvttps2dq     %xmm9,%xmm10
12697   69,15,91,210,                           //cvtdq2ps      %xmm10,%xmm10
12698   69,15,194,202,1,                        //cmpltps       %xmm10,%xmm9
  // 0x3f800000 is 1.0f; it is kept only in lanes where the compare mask is set.
12699   184,0,0,128,63,                         //mov           $0x3f800000,%eax
12700   102,68,15,110,216,                      //movd          %eax,%xmm11
12701   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
12702   69,15,84,217,                           //andps         %xmm9,%xmm11
12703   69,15,92,211,                           //subps         %xmm11,%xmm10
12704   69,15,89,208,                           //mulps         %xmm8,%xmm10
12705   65,15,92,194,                           //subps         %xmm10,%xmm0
12706   102,69,15,118,201,                      //pcmpeqd       %xmm9,%xmm9
12707   102,69,15,254,200,                      //paddd         %xmm8,%xmm9
12708   65,15,93,193,                           //minps         %xmm9,%xmm0
12709   72,173,                                 //lods          %ds:(%rsi),%rax
12710   255,224,                                //jmpq          *%rax
12711 };
12712 
// sk_repeat_y_sse2: machine code that wraps every lane of %xmm1 into a
// repeating interval of length "limit"; same sequence as sk_repeat_x_sse2 but
// operating on %xmm1 instead of %xmm0.
//
//   * lods fetches a qword from (%rsi) into %rax; a single float ("limit") is
//     read from that address and broadcast into %xmm8.
//   * q = y / limit.  floor(q) is built by hand: truncate with cvttps2dq /
//     cvtdq2ps, then subtract 1.0 in the lanes where q < trunc(q).
//   * y = y - floor(q) * limit.
//   * The result is then capped with minps at the limit's bit pattern minus
//     one (for a positive finite limit, the largest float strictly below it).
//   * Result is in %xmm1; %xmm8..%xmm11 are scratch.
//   * The final lods/jmpq fetches the next qword from (%rsi) and jumps to it.
12713 CODE const uint8_t sk_repeat_y_sse2[] = {
12714   72,173,                                 //lods          %ds:(%rsi),%rax
12715   243,68,15,16,0,                         //movss         (%rax),%xmm8
12716   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
12717   68,15,40,201,                           //movaps        %xmm1,%xmm9
12718   69,15,94,200,                           //divps         %xmm8,%xmm9
12719   243,69,15,91,209,                       //cvttps2dq     %xmm9,%xmm10
12720   69,15,91,210,                           //cvtdq2ps      %xmm10,%xmm10
12721   69,15,194,202,1,                        //cmpltps       %xmm10,%xmm9
  // 0x3f800000 is 1.0f; it is kept only in lanes where the compare mask is set.
12722   184,0,0,128,63,                         //mov           $0x3f800000,%eax
12723   102,68,15,110,216,                      //movd          %eax,%xmm11
12724   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
12725   69,15,84,217,                           //andps         %xmm9,%xmm11
12726   69,15,92,211,                           //subps         %xmm11,%xmm10
12727   69,15,89,208,                           //mulps         %xmm8,%xmm10
12728   65,15,92,202,                           //subps         %xmm10,%xmm1
12729   102,69,15,118,201,                      //pcmpeqd       %xmm9,%xmm9
12730   102,69,15,254,200,                      //paddd         %xmm8,%xmm9
12731   65,15,93,201,                           //minps         %xmm9,%xmm1
12732   72,173,                                 //lods          %ds:(%rsi),%rax
12733   255,224,                                //jmpq          *%rax
12734 };
12735 
// sk_mirror_x_sse2: machine code that reflects every lane of %xmm0 back and
// forth across an interval of length "limit" (period 2*limit).
//
//   * lods fetches a qword from (%rsi) into %rax; a single float ("limit") is
//     read from that address.  %xmm8 = broadcast(limit),
//     %xmm9 = broadcast(limit + limit).
//   * u = x - limit.
//   * u = u - floor(u / (2*limit)) * (2*limit), with floor built by hand from
//     cvttps2dq/cvtdq2ps plus a conditional subtraction of 1.0.
//   * u = u - limit, then |u| is formed as u AND (0 - u), which clears the
//     sign bit.
//   * The result is capped with minps at the limit's bit pattern minus one
//     (for a positive finite limit, the largest float strictly below it).
//   * Result is in %xmm0; %xmm8..%xmm12 are scratch.
//   * The final lods/jmpq fetches the next qword from (%rsi) and jumps to it.
12736 CODE const uint8_t sk_mirror_x_sse2[] = {
12737   72,173,                                 //lods          %ds:(%rsi),%rax
12738   243,68,15,16,8,                         //movss         (%rax),%xmm9
12739   69,15,40,193,                           //movaps        %xmm9,%xmm8
12740   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
12741   65,15,92,192,                           //subps         %xmm8,%xmm0
12742   243,69,15,88,201,                       //addss         %xmm9,%xmm9
12743   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
12744   68,15,40,208,                           //movaps        %xmm0,%xmm10
12745   69,15,94,209,                           //divps         %xmm9,%xmm10
12746   243,69,15,91,218,                       //cvttps2dq     %xmm10,%xmm11
12747   69,15,91,219,                           //cvtdq2ps      %xmm11,%xmm11
12748   69,15,194,211,1,                        //cmpltps       %xmm11,%xmm10
  // 0x3f800000 is 1.0f; it is kept only in lanes where the compare mask is set.
12749   184,0,0,128,63,                         //mov           $0x3f800000,%eax
12750   102,68,15,110,224,                      //movd          %eax,%xmm12
12751   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
12752   69,15,84,226,                           //andps         %xmm10,%xmm12
12753   69,15,87,210,                           //xorps         %xmm10,%xmm10
12754   69,15,92,220,                           //subps         %xmm12,%xmm11
12755   69,15,89,217,                           //mulps         %xmm9,%xmm11
12756   65,15,92,195,                           //subps         %xmm11,%xmm0
12757   65,15,92,192,                           //subps         %xmm8,%xmm0
  // Absolute value: %xmm10 = 0 - u, then u &= %xmm10.
12758   68,15,92,208,                           //subps         %xmm0,%xmm10
12759   65,15,84,194,                           //andps         %xmm10,%xmm0
12760   102,69,15,118,201,                      //pcmpeqd       %xmm9,%xmm9
12761   102,69,15,254,200,                      //paddd         %xmm8,%xmm9
12762   65,15,93,193,                           //minps         %xmm9,%xmm0
12763   72,173,                                 //lods          %ds:(%rsi),%rax
12764   255,224,                                //jmpq          *%rax
12765 };
12766 
// sk_mirror_y_sse2: machine code that reflects every lane of %xmm1 back and
// forth across an interval of length "limit" (period 2*limit); same sequence
// as sk_mirror_x_sse2 but operating on %xmm1 instead of %xmm0.
//
//   * lods fetches a qword from (%rsi) into %rax; a single float ("limit") is
//     read from that address.  %xmm8 = broadcast(limit),
//     %xmm9 = broadcast(limit + limit).
//   * u = y - limit.
//   * u = u - floor(u / (2*limit)) * (2*limit), with floor built by hand from
//     cvttps2dq/cvtdq2ps plus a conditional subtraction of 1.0.
//   * u = u - limit, then |u| is formed as u AND (0 - u), which clears the
//     sign bit.
//   * The result is capped with minps at the limit's bit pattern minus one
//     (for a positive finite limit, the largest float strictly below it).
//   * Result is in %xmm1; %xmm8..%xmm12 are scratch.
//   * The final lods/jmpq fetches the next qword from (%rsi) and jumps to it.
12767 CODE const uint8_t sk_mirror_y_sse2[] = {
12768   72,173,                                 //lods          %ds:(%rsi),%rax
12769   243,68,15,16,8,                         //movss         (%rax),%xmm9
12770   69,15,40,193,                           //movaps        %xmm9,%xmm8
12771   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
12772   65,15,92,200,                           //subps         %xmm8,%xmm1
12773   243,69,15,88,201,                       //addss         %xmm9,%xmm9
12774   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
12775   68,15,40,209,                           //movaps        %xmm1,%xmm10
12776   69,15,94,209,                           //divps         %xmm9,%xmm10
12777   243,69,15,91,218,                       //cvttps2dq     %xmm10,%xmm11
12778   69,15,91,219,                           //cvtdq2ps      %xmm11,%xmm11
12779   69,15,194,211,1,                        //cmpltps       %xmm11,%xmm10
  // 0x3f800000 is 1.0f; it is kept only in lanes where the compare mask is set.
12780   184,0,0,128,63,                         //mov           $0x3f800000,%eax
12781   102,68,15,110,224,                      //movd          %eax,%xmm12
12782   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
12783   69,15,84,226,                           //andps         %xmm10,%xmm12
12784   69,15,87,210,                           //xorps         %xmm10,%xmm10
12785   69,15,92,220,                           //subps         %xmm12,%xmm11
12786   69,15,89,217,                           //mulps         %xmm9,%xmm11
12787   65,15,92,203,                           //subps         %xmm11,%xmm1
12788   65,15,92,200,                           //subps         %xmm8,%xmm1
  // Absolute value: %xmm10 = 0 - u, then u &= %xmm10.
12789   68,15,92,209,                           //subps         %xmm1,%xmm10
12790   65,15,84,202,                           //andps         %xmm10,%xmm1
12791   102,69,15,118,201,                      //pcmpeqd       %xmm9,%xmm9
12792   102,69,15,254,200,                      //paddd         %xmm8,%xmm9
12793   65,15,93,201,                           //minps         %xmm9,%xmm1
12794   72,173,                                 //lods          %ds:(%rsi),%rax
12795   255,224,                                //jmpq          *%rax
12796 };
12797 
// sk_luminance_to_alpha_sse2: machine code that replaces %xmm3 with a weighted
// sum of %xmm0, %xmm1 and %xmm2, then zeroes %xmm0..%xmm2.
//
//   %xmm3 = k0 * %xmm0 + k1 * %xmm1 + k2 * %xmm2
//
// where the weights are the floats with bit patterns 0x3e59b3d0 (~0.2126),
// 0x3f371759 (~0.7152) and 0x3d93dd98 (~0.0722).  The previous contents of
// %xmm3 are discarded.  No memory is read other than the lods at the end,
// which fetches the next qword from (%rsi) for the closing jmpq.
// NOTE(review): presumably %xmm0..%xmm3 are r,g,b,a, as the stage name
// suggests -- confirm against the stage's C++ source.
12798 CODE const uint8_t sk_luminance_to_alpha_sse2[] = {
12799   184,208,179,89,62,                      //mov           $0x3e59b3d0,%eax
12800   102,15,110,216,                         //movd          %eax,%xmm3
12801   15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
12802   15,89,216,                              //mulps         %xmm0,%xmm3
12803   184,89,23,55,63,                        //mov           $0x3f371759,%eax
12804   102,15,110,192,                         //movd          %eax,%xmm0
12805   15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
12806   15,89,193,                              //mulps         %xmm1,%xmm0
12807   15,88,195,                              //addps         %xmm3,%xmm0
12808   184,152,221,147,61,                     //mov           $0x3d93dd98,%eax
12809   102,15,110,216,                         //movd          %eax,%xmm3
12810   15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
12811   15,89,218,                              //mulps         %xmm2,%xmm3
12812   15,88,216,                              //addps         %xmm0,%xmm3
12813   72,173,                                 //lods          %ds:(%rsi),%rax
12814   15,87,192,                              //xorps         %xmm0,%xmm0
12815   15,87,201,                              //xorps         %xmm1,%xmm1
12816   15,87,210,                              //xorps         %xmm2,%xmm2
12817   255,224,                                //jmpq          *%rax
12818 };
12819 
// sk_matrix_2x3_sse2: machine code that applies a 2x3 affine transform to the
// vectors in %xmm0 and %xmm1.
//
// lods fetches a qword from (%rsi) into %rax, which is used as the address of
// six consecutive floats m[0..5] (m[k] at byte offset 4*k).  With x and y the
// incoming %xmm0 and %xmm1:
//
//   %xmm0 = m[0]*x + m[2]*y + m[4]
//   %xmm1 = m[1]*x + m[3]*y + m[5]
//
// Each coefficient is broadcast to all lanes with shufps $0.  %xmm8..%xmm11
// are scratch.  The final lods/jmpq fetches the next qword from (%rsi) and
// jumps to it.
12820 CODE const uint8_t sk_matrix_2x3_sse2[] = {
12821   68,15,40,201,                           //movaps        %xmm1,%xmm9
12822   68,15,40,192,                           //movaps        %xmm0,%xmm8
12823   72,173,                                 //lods          %ds:(%rsi),%rax
12824   243,15,16,0,                            //movss         (%rax),%xmm0
12825   243,15,16,72,4,                         //movss         0x4(%rax),%xmm1
12826   15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
12827   243,68,15,16,80,8,                      //movss         0x8(%rax),%xmm10
12828   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
12829   243,68,15,16,88,16,                     //movss         0x10(%rax),%xmm11
12830   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
12831   69,15,89,209,                           //mulps         %xmm9,%xmm10
12832   69,15,88,211,                           //addps         %xmm11,%xmm10
12833   65,15,89,192,                           //mulps         %xmm8,%xmm0
12834   65,15,88,194,                           //addps         %xmm10,%xmm0
12835   15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
12836   243,68,15,16,80,12,                     //movss         0xc(%rax),%xmm10
12837   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
12838   243,68,15,16,88,20,                     //movss         0x14(%rax),%xmm11
12839   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
12840   69,15,89,209,                           //mulps         %xmm9,%xmm10
12841   69,15,88,211,                           //addps         %xmm11,%xmm10
12842   65,15,89,200,                           //mulps         %xmm8,%xmm1
12843   65,15,88,202,                           //addps         %xmm10,%xmm1
12844   72,173,                                 //lods          %ds:(%rsi),%rax
12845   255,224,                                //jmpq          *%rax
12846 };
12847 
// sk_matrix_3x4_sse2: machine code that applies a 3x4 affine transform to the
// vectors in %xmm0, %xmm1 and %xmm2.
//
// lods fetches a qword from (%rsi) into %rax, which is used as the address of
// twelve consecutive floats m[0..11] (m[k] at byte offset 4*k).  With a, b, c
// the incoming %xmm0, %xmm1, %xmm2:
//
//   %xmm0 = m[0]*a + m[3]*b + m[6]*c + m[9]
//   %xmm1 = m[1]*a + m[4]*b + m[7]*c + m[10]
//   %xmm2 = m[2]*a + m[5]*b + m[8]*c + m[11]
//
// %xmm2 is only overwritten at the very end, so all three sums read the
// original c.  %xmm3 is not touched.  %xmm8..%xmm13 are scratch.  The final
// lods/jmpq fetches the next qword from (%rsi) and jumps to it.
12848 CODE const uint8_t sk_matrix_3x4_sse2[] = {
12849   68,15,40,201,                           //movaps        %xmm1,%xmm9
12850   68,15,40,192,                           //movaps        %xmm0,%xmm8
12851   72,173,                                 //lods          %ds:(%rsi),%rax
12852   243,15,16,0,                            //movss         (%rax),%xmm0
12853   243,15,16,72,4,                         //movss         0x4(%rax),%xmm1
  // First output (%xmm0).
12854   15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
12855   243,68,15,16,80,12,                     //movss         0xc(%rax),%xmm10
12856   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
12857   243,68,15,16,88,24,                     //movss         0x18(%rax),%xmm11
12858   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
12859   243,68,15,16,96,36,                     //movss         0x24(%rax),%xmm12
12860   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
12861   68,15,89,218,                           //mulps         %xmm2,%xmm11
12862   69,15,88,220,                           //addps         %xmm12,%xmm11
12863   69,15,89,209,                           //mulps         %xmm9,%xmm10
12864   69,15,88,211,                           //addps         %xmm11,%xmm10
12865   65,15,89,192,                           //mulps         %xmm8,%xmm0
12866   65,15,88,194,                           //addps         %xmm10,%xmm0
  // Second output (%xmm1).
12867   15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
12868   243,68,15,16,80,16,                     //movss         0x10(%rax),%xmm10
12869   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
12870   243,68,15,16,88,28,                     //movss         0x1c(%rax),%xmm11
12871   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
12872   243,68,15,16,96,40,                     //movss         0x28(%rax),%xmm12
12873   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
12874   68,15,89,218,                           //mulps         %xmm2,%xmm11
12875   69,15,88,220,                           //addps         %xmm12,%xmm11
12876   69,15,89,209,                           //mulps         %xmm9,%xmm10
12877   69,15,88,211,                           //addps         %xmm11,%xmm10
12878   65,15,89,200,                           //mulps         %xmm8,%xmm1
12879   65,15,88,202,                           //addps         %xmm10,%xmm1
  // Third output, accumulated in %xmm10 and moved to %xmm2 at the end.
12880   243,68,15,16,80,8,                      //movss         0x8(%rax),%xmm10
12881   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
12882   243,68,15,16,88,20,                     //movss         0x14(%rax),%xmm11
12883   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
12884   243,68,15,16,96,32,                     //movss         0x20(%rax),%xmm12
12885   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
12886   243,68,15,16,104,44,                    //movss         0x2c(%rax),%xmm13
12887   69,15,198,237,0,                        //shufps        $0x0,%xmm13,%xmm13
12888   68,15,89,226,                           //mulps         %xmm2,%xmm12
12889   69,15,88,229,                           //addps         %xmm13,%xmm12
12890   69,15,89,217,                           //mulps         %xmm9,%xmm11
12891   69,15,88,220,                           //addps         %xmm12,%xmm11
12892   69,15,89,208,                           //mulps         %xmm8,%xmm10
12893   69,15,88,211,                           //addps         %xmm11,%xmm10
12894   72,173,                                 //lods          %ds:(%rsi),%rax
12895   65,15,40,210,                           //movaps        %xmm10,%xmm2
12896   255,224,                                //jmpq          *%rax
12897 };
12898 
// sk_matrix_4x5_sse2: machine code that applies a 4x5 affine transform to the
// vectors in %xmm0..%xmm3.
//
// lods fetches a qword from (%rsi) into %rax, which is used as the address of
// twenty consecutive floats m[0..19] (m[k] at byte offset 4*k).  With a, b, c,
// d the incoming %xmm0, %xmm1, %xmm2, %xmm3:
//
//   %xmm0 = m[0]*a + m[4]*b + m[8]*c  + m[12]*d + m[16]
//   %xmm1 = m[1]*a + m[5]*b + m[9]*c  + m[13]*d + m[17]
//   %xmm2 = m[2]*a + m[6]*b + m[10]*c + m[14]*d + m[18]
//   %xmm3 = m[3]*a + m[7]*b + m[11]*c + m[15]*d + m[19]
//
// %xmm2 and %xmm3 are only overwritten at the very end, so every sum reads the
// original c and d.  %xmm8..%xmm15 are scratch.  The final lods/jmpq fetches
// the next qword from (%rsi) and jumps to it.
12899 CODE const uint8_t sk_matrix_4x5_sse2[] = {
12900   68,15,40,201,                           //movaps        %xmm1,%xmm9
12901   68,15,40,192,                           //movaps        %xmm0,%xmm8
12902   72,173,                                 //lods          %ds:(%rsi),%rax
12903   243,15,16,0,                            //movss         (%rax),%xmm0
12904   243,15,16,72,4,                         //movss         0x4(%rax),%xmm1
  // First output (%xmm0).
12905   15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
12906   243,68,15,16,80,16,                     //movss         0x10(%rax),%xmm10
12907   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
12908   243,68,15,16,88,32,                     //movss         0x20(%rax),%xmm11
12909   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
12910   243,68,15,16,96,48,                     //movss         0x30(%rax),%xmm12
12911   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
12912   243,68,15,16,104,64,                    //movss         0x40(%rax),%xmm13
12913   69,15,198,237,0,                        //shufps        $0x0,%xmm13,%xmm13
12914   68,15,89,227,                           //mulps         %xmm3,%xmm12
12915   69,15,88,229,                           //addps         %xmm13,%xmm12
12916   68,15,89,218,                           //mulps         %xmm2,%xmm11
12917   69,15,88,220,                           //addps         %xmm12,%xmm11
12918   69,15,89,209,                           //mulps         %xmm9,%xmm10
12919   69,15,88,211,                           //addps         %xmm11,%xmm10
12920   65,15,89,192,                           //mulps         %xmm8,%xmm0
12921   65,15,88,194,                           //addps         %xmm10,%xmm0
  // Second output (%xmm1).
12922   15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
12923   243,68,15,16,80,20,                     //movss         0x14(%rax),%xmm10
12924   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
12925   243,68,15,16,88,36,                     //movss         0x24(%rax),%xmm11
12926   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
12927   243,68,15,16,96,52,                     //movss         0x34(%rax),%xmm12
12928   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
12929   243,68,15,16,104,68,                    //movss         0x44(%rax),%xmm13
12930   69,15,198,237,0,                        //shufps        $0x0,%xmm13,%xmm13
12931   68,15,89,227,                           //mulps         %xmm3,%xmm12
12932   69,15,88,229,                           //addps         %xmm13,%xmm12
12933   68,15,89,218,                           //mulps         %xmm2,%xmm11
12934   69,15,88,220,                           //addps         %xmm12,%xmm11
12935   69,15,89,209,                           //mulps         %xmm9,%xmm10
12936   69,15,88,211,                           //addps         %xmm11,%xmm10
12937   65,15,89,200,                           //mulps         %xmm8,%xmm1
12938   65,15,88,202,                           //addps         %xmm10,%xmm1
  // Third output, accumulated in %xmm10 and moved to %xmm2 at the end.
12939   243,68,15,16,80,8,                      //movss         0x8(%rax),%xmm10
12940   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
12941   243,68,15,16,88,24,                     //movss         0x18(%rax),%xmm11
12942   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
12943   243,68,15,16,96,40,                     //movss         0x28(%rax),%xmm12
12944   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
12945   243,68,15,16,104,56,                    //movss         0x38(%rax),%xmm13
12946   69,15,198,237,0,                        //shufps        $0x0,%xmm13,%xmm13
12947   243,68,15,16,112,72,                    //movss         0x48(%rax),%xmm14
12948   69,15,198,246,0,                        //shufps        $0x0,%xmm14,%xmm14
12949   68,15,89,235,                           //mulps         %xmm3,%xmm13
12950   69,15,88,238,                           //addps         %xmm14,%xmm13
12951   68,15,89,226,                           //mulps         %xmm2,%xmm12
12952   69,15,88,229,                           //addps         %xmm13,%xmm12
12953   69,15,89,217,                           //mulps         %xmm9,%xmm11
12954   69,15,88,220,                           //addps         %xmm12,%xmm11
12955   69,15,89,208,                           //mulps         %xmm8,%xmm10
12956   69,15,88,211,                           //addps         %xmm11,%xmm10
  // Fourth output, accumulated in %xmm11 and moved to %xmm3 at the end.
12957   243,68,15,16,88,12,                     //movss         0xc(%rax),%xmm11
12958   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
12959   243,68,15,16,96,28,                     //movss         0x1c(%rax),%xmm12
12960   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
12961   243,68,15,16,104,44,                    //movss         0x2c(%rax),%xmm13
12962   69,15,198,237,0,                        //shufps        $0x0,%xmm13,%xmm13
12963   243,68,15,16,112,60,                    //movss         0x3c(%rax),%xmm14
12964   69,15,198,246,0,                        //shufps        $0x0,%xmm14,%xmm14
12965   243,68,15,16,120,76,                    //movss         0x4c(%rax),%xmm15
12966   69,15,198,255,0,                        //shufps        $0x0,%xmm15,%xmm15
12967   68,15,89,243,                           //mulps         %xmm3,%xmm14
12968   69,15,88,247,                           //addps         %xmm15,%xmm14
12969   68,15,89,234,                           //mulps         %xmm2,%xmm13
12970   69,15,88,238,                           //addps         %xmm14,%xmm13
12971   69,15,89,225,                           //mulps         %xmm9,%xmm12
12972   69,15,88,229,                           //addps         %xmm13,%xmm12
12973   69,15,89,216,                           //mulps         %xmm8,%xmm11
12974   69,15,88,220,                           //addps         %xmm12,%xmm11
12975   72,173,                                 //lods          %ds:(%rsi),%rax
12976   65,15,40,210,                           //movaps        %xmm10,%xmm2
12977   65,15,40,219,                           //movaps        %xmm11,%xmm3
12978   255,224,                                //jmpq          *%rax
12979 };
12980 
// sk_matrix_perspective_sse2: machine code that applies a 3x3 projective
// transform to the vectors in %xmm0 and %xmm1.
//
// lods fetches a qword from (%rsi) into %rax, which is used as the address of
// nine consecutive floats m[0..8] (m[k] at byte offset 4*k).  With x and y the
// incoming %xmm0 and %xmm1:
//
//   X = m[0]*x + m[1]*y + m[2]
//   Y = m[3]*x + m[4]*y + m[5]
//   Z = m[6]*x + m[7]*y + m[8]
//   %xmm0 = X * rcp(Z)
//   %xmm1 = Y * rcp(Z)
//
// rcp(Z) comes from rcpps, which is an approximate reciprocal, so the results
// are not an exact division.  %xmm8..%xmm12 are scratch.  The final lods/jmpq
// fetches the next qword from (%rsi) and jumps to it.
12981 CODE const uint8_t sk_matrix_perspective_sse2[] = {
12982   68,15,40,192,                           //movaps        %xmm0,%xmm8
12983   72,173,                                 //lods          %ds:(%rsi),%rax
  // X, accumulated in %xmm0.
12984   243,15,16,0,                            //movss         (%rax),%xmm0
12985   243,68,15,16,72,4,                      //movss         0x4(%rax),%xmm9
12986   15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
12987   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
12988   243,68,15,16,80,8,                      //movss         0x8(%rax),%xmm10
12989   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
12990   68,15,89,201,                           //mulps         %xmm1,%xmm9
12991   69,15,88,202,                           //addps         %xmm10,%xmm9
12992   65,15,89,192,                           //mulps         %xmm8,%xmm0
12993   65,15,88,193,                           //addps         %xmm9,%xmm0
  // Y, accumulated in %xmm9.
12994   243,68,15,16,72,12,                     //movss         0xc(%rax),%xmm9
12995   69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
12996   243,68,15,16,80,16,                     //movss         0x10(%rax),%xmm10
12997   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
12998   243,68,15,16,88,20,                     //movss         0x14(%rax),%xmm11
12999   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
13000   68,15,89,209,                           //mulps         %xmm1,%xmm10
13001   69,15,88,211,                           //addps         %xmm11,%xmm10
13002   69,15,89,200,                           //mulps         %xmm8,%xmm9
13003   69,15,88,202,                           //addps         %xmm10,%xmm9
  // Z, accumulated in %xmm10.
13004   243,68,15,16,80,24,                     //movss         0x18(%rax),%xmm10
13005   69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
13006   243,68,15,16,88,28,                     //movss         0x1c(%rax),%xmm11
13007   69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
13008   243,68,15,16,96,32,                     //movss         0x20(%rax),%xmm12
13009   69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
13010   68,15,89,217,                           //mulps         %xmm1,%xmm11
13011   69,15,88,220,                           //addps         %xmm12,%xmm11
13012   69,15,89,208,                           //mulps         %xmm8,%xmm10
13013   69,15,88,211,                           //addps         %xmm11,%xmm10
  // Scale X and Y by the approximate reciprocal of Z.
13014   65,15,83,202,                           //rcpps         %xmm10,%xmm1
13015   15,89,193,                              //mulps         %xmm1,%xmm0
13016   68,15,89,201,                           //mulps         %xmm1,%xmm9
13017   72,173,                                 //lods          %ds:(%rsi),%rax
13018   65,15,40,201,                           //movaps        %xmm9,%xmm1
13019   255,224,                                //jmpq          *%rax
13020 };
13021 
// sk_linear_gradient_2stops_sse2: machine code that evaluates four linear
// functions of the vector t held in %xmm0.
//
// lods fetches a qword from (%rsi) into %rax, which is used as the address of
// eight floats: c0[0..3] at offset 0 and c1[0..3] at offset 0x10 (both read
// with unaligned loads).  For i = 0..3:
//
//   %xmm<i> = c1[i] * t + c0[i]
//
// Lane i of each 4-float group is broadcast with shufps ($0x00, $0x55, $0xaa,
// $0xff).  The %xmm0 result is built in %xmm8 and copied at the end because t
// is still needed for the other three outputs.  The incoming %xmm1..%xmm3 are
// overwritten without being read.  %xmm8..%xmm10 are scratch.  The final
// lods/jmpq fetches the next qword from (%rsi) and jumps to it.
// NOTE(review): presumably c0 is the start color and c1 the per-channel color
// delta of a two-stop gradient -- confirm against the stage's C++ source.
13022 CODE const uint8_t sk_linear_gradient_2stops_sse2[] = {
13023   72,173,                                 //lods          %ds:(%rsi),%rax
13024   68,15,16,8,                             //movups        (%rax),%xmm9
13025   15,16,88,16,                            //movups        0x10(%rax),%xmm3
13026   68,15,40,195,                           //movaps        %xmm3,%xmm8
13027   69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
13028   65,15,40,201,                           //movaps        %xmm9,%xmm1
13029   15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
13030   68,15,89,192,                           //mulps         %xmm0,%xmm8
13031   68,15,88,193,                           //addps         %xmm1,%xmm8
13032   15,40,203,                              //movaps        %xmm3,%xmm1
13033   15,198,201,85,                          //shufps        $0x55,%xmm1,%xmm1
13034   65,15,40,209,                           //movaps        %xmm9,%xmm2
13035   15,198,210,85,                          //shufps        $0x55,%xmm2,%xmm2
13036   15,89,200,                              //mulps         %xmm0,%xmm1
13037   15,88,202,                              //addps         %xmm2,%xmm1
13038   15,40,211,                              //movaps        %xmm3,%xmm2
13039   15,198,210,170,                         //shufps        $0xaa,%xmm2,%xmm2
13040   69,15,40,209,                           //movaps        %xmm9,%xmm10
13041   69,15,198,210,170,                      //shufps        $0xaa,%xmm10,%xmm10
13042   15,89,208,                              //mulps         %xmm0,%xmm2
13043   65,15,88,210,                           //addps         %xmm10,%xmm2
13044   15,198,219,255,                         //shufps        $0xff,%xmm3,%xmm3
13045   69,15,198,201,255,                      //shufps        $0xff,%xmm9,%xmm9
13046   15,89,216,                              //mulps         %xmm0,%xmm3
13047   65,15,88,217,                           //addps         %xmm9,%xmm3
13048   72,173,                                 //lods          %ds:(%rsi),%rax
13049   65,15,40,192,                           //movaps        %xmm8,%xmm0
13050   255,224,                                //jmpq          *%rax
13051 };
13052 #endif
13053