1 /* 2 * Copyright 2017 Google Inc. 3 * 4 * Use of this source code is governed by a BSD-style license that can be 5 * found in the LICENSE file. 6 */ 7 8 // This file is generated semi-automatically with this command: 9 // $ src/jumper/build_stages.py 10 11 #include <stdint.h> 12 13 #if defined(_MSC_VER) 14 #pragma section("code", read,execute) 15 #define CODE extern "C" __declspec(allocate("code")) 16 #elif defined(__MACH__) 17 #define CODE extern "C" __attribute__((section("__TEXT,__text"))) 18 #else 19 #define CODE extern "C" __attribute__((section(".text"))) 20 #endif 21 22 #if defined(__aarch64__) 23 24 CODE const uint32_t sk_start_pipeline_aarch64[] = { 25 0xa9bd5bf7, //stp x23, x22, [sp, #-48]! 26 0xa90153f5, //stp x21, x20, [sp, #16] 27 0xa9027bf3, //stp x19, x30, [sp, #32] 28 0xaa0103f4, //mov x20, x1 29 0xf8408697, //ldr x23, [x20], #8 30 0xaa0003f5, //mov x21, x0 31 0xaa0303f3, //mov x19, x3 32 0x910012a8, //add x8, x21, #0x4 33 0xeb13011f, //cmp x8, x19 34 0xaa0203f6, //mov x22, x2 35 0x54000069, //b.ls 34 <sk_start_pipeline_aarch64+0x34> // b.plast 36 0xaa1503e0, //mov x0, x21 37 0x14000012, //b 78 <sk_start_pipeline_aarch64+0x78> 38 0x6f00e400, //movi v0.2d, #0x0 39 0x6f00e401, //movi v1.2d, #0x0 40 0x6f00e402, //movi v2.2d, #0x0 41 0x6f00e403, //movi v3.2d, #0x0 42 0x6f00e404, //movi v4.2d, #0x0 43 0x6f00e405, //movi v5.2d, #0x0 44 0x6f00e406, //movi v6.2d, #0x0 45 0x6f00e407, //movi v7.2d, #0x0 46 0xaa1503e0, //mov x0, x21 47 0xaa1403e1, //mov x1, x20 48 0xaa1603e2, //mov x2, x22 49 0xd63f02e0, //blr x23 50 0x910012a0, //add x0, x21, #0x4 51 0x910022a8, //add x8, x21, #0x8 52 0xeb13011f, //cmp x8, x19 53 0xaa0003f5, //mov x21, x0 54 0x54fffe09, //b.ls 34 <sk_start_pipeline_aarch64+0x34> // b.plast 55 0xa9427bf3, //ldp x19, x30, [sp, #32] 56 0xa94153f5, //ldp x21, x20, [sp, #16] 57 0xa8c35bf7, //ldp x23, x22, [sp], #48 58 0xd65f03c0, //ret 59 }; 60 61 CODE const uint32_t sk_just_return_aarch64[] = { 62 0xd65f03c0, //ret 63 }; 64 65 CODE const uint32_t sk_seed_shader_aarch64[] = { 66 0xa8c10c28, //ldp x8, x3, [x1], #16 67 0x3dc00046, //ldr q6, [x2] 68 0x4e040c00, //dup v0.4s, w0 69 0x4f0167e7, //movi v7.4s, #0x3f, lsl #24 70 0x4d40c901, //ld1r {v1.4s}, [x8] 71 0x4e21d800, //scvtf v0.4s, v0.4s 72 0x4e27d400, //fadd v0.4s, v0.4s, v7.4s 73 0x4f03f602, //fmov v2.4s, #1.000000000000000000e+00 74 0x4e21d821, //scvtf v1.4s, v1.4s 75 0x6f00e403, //movi v3.2d, #0x0 76 0x6f00e404, //movi v4.2d, #0x0 77 0x6f00e405, //movi v5.2d, #0x0 78 0x4e26d400, //fadd v0.4s, v0.4s, v6.4s 79 0x6f00e406, //movi v6.2d, #0x0 80 0x4e27d421, //fadd v1.4s, v1.4s, v7.4s 81 0x6f00e407, //movi v7.2d, #0x0 82 0xd61f0060, //br x3 83 }; 84 85 CODE const uint32_t sk_constant_color_aarch64[] = { 86 0xa8c10c28, //ldp x8, x3, [x1], #16 87 0x3dc00103, //ldr q3, [x8] 88 0x4e040460, //dup v0.4s, v3.s[0] 89 0x4e0c0461, //dup v1.4s, v3.s[1] 90 0x4e140462, //dup v2.4s, v3.s[2] 91 0x4e1c0463, //dup v3.4s, v3.s[3] 92 0xd61f0060, //br x3 93 }; 94 95 CODE const uint32_t sk_clear_aarch64[] = { 96 0xf8408423, //ldr x3, [x1], #8 97 0x6f00e400, //movi v0.2d, #0x0 98 0x6f00e401, //movi v1.2d, #0x0 99 0x6f00e402, //movi v2.2d, #0x0 100 0x6f00e403, //movi v3.2d, #0x0 101 0xd61f0060, //br x3 102 }; 103 104 CODE const uint32_t sk_plus__aarch64[] = { 105 0xf8408423, //ldr x3, [x1], #8 106 0x4e24d400, //fadd v0.4s, v0.4s, v4.4s 107 0x4e25d421, //fadd v1.4s, v1.4s, v5.4s 108 0x4e26d442, //fadd v2.4s, v2.4s, v6.4s 109 0x4e27d463, //fadd v3.4s, v3.4s, v7.4s 110 0xd61f0060, //br x3 111 }; 112 113 CODE const uint32_t sk_srcover_aarch64[] = { 114 0xf8408423, //ldr x3, [x1], #8 115 0x4f03f610, //fmov v16.4s, #1.000000000000000000e+00 116 0x4ea3d610, //fsub v16.4s, v16.4s, v3.4s 117 0x4e24ce00, //fmla v0.4s, v16.4s, v4.4s 118 0x4e25ce01, //fmla v1.4s, v16.4s, v5.4s 119 0x4e26ce02, //fmla v2.4s, v16.4s, v6.4s 120 0x4e27ce03, //fmla v3.4s, v16.4s, v7.4s 121 0xd61f0060, //br x3 122 }; 123 124 CODE const uint32_t sk_dstover_aarch64[] = { 125 0x4f03f611, //fmov v17.4s, #1.000000000000000000e+00 126 0xf8408423, //ldr x3, [x1], #8 127 0x4ea41c90, //mov v16.16b, v4.16b 128 0x4ea7d634, //fsub v20.4s, v17.4s, v7.4s 129 0x4ea51cb1, //mov v17.16b, v5.16b 130 0x4ea61cd2, //mov v18.16b, v6.16b 131 0x4ea71cf3, //mov v19.16b, v7.16b 132 0x4e20ce90, //fmla v16.4s, v20.4s, v0.4s 133 0x4e21ce91, //fmla v17.4s, v20.4s, v1.4s 134 0x4e22ce92, //fmla v18.4s, v20.4s, v2.4s 135 0x4e23ce93, //fmla v19.4s, v20.4s, v3.4s 136 0x4eb01e00, //mov v0.16b, v16.16b 137 0x4eb11e21, //mov v1.16b, v17.16b 138 0x4eb21e42, //mov v2.16b, v18.16b 139 0x4eb31e63, //mov v3.16b, v19.16b 140 0xd61f0060, //br x3 141 }; 142 143 CODE const uint32_t sk_clamp_0_aarch64[] = { 144 0xf8408423, //ldr x3, [x1], #8 145 0x6f00e410, //movi v16.2d, #0x0 146 0x4e30f400, //fmax v0.4s, v0.4s, v16.4s 147 0x4e30f421, //fmax v1.4s, v1.4s, v16.4s 148 0x4e30f442, //fmax v2.4s, v2.4s, v16.4s 149 0x4e30f463, //fmax v3.4s, v3.4s, v16.4s 150 0xd61f0060, //br x3 151 }; 152 153 CODE const uint32_t sk_clamp_1_aarch64[] = { 154 0xf8408423, //ldr x3, [x1], #8 155 0x4f03f610, //fmov v16.4s, #1.000000000000000000e+00 156 0x4eb0f400, //fmin v0.4s, v0.4s, v16.4s 157 0x4eb0f421, //fmin v1.4s, v1.4s, v16.4s 158 0x4eb0f442, //fmin v2.4s, v2.4s, v16.4s 159 0x4eb0f463, //fmin v3.4s, v3.4s, v16.4s 160 0xd61f0060, //br x3 161 }; 162 163 CODE const uint32_t sk_clamp_a_aarch64[] = { 164 0xf8408423, //ldr x3, [x1], #8 165 0x4f03f610, //fmov v16.4s, #1.000000000000000000e+00 166 0x4eb0f463, //fmin v3.4s, v3.4s, v16.4s 167 0x4ea3f400, //fmin v0.4s, v0.4s, v3.4s 168 0x4ea3f421, //fmin v1.4s, v1.4s, v3.4s 169 0x4ea3f442, //fmin v2.4s, v2.4s, v3.4s 170 0xd61f0060, //br x3 171 }; 172 173 CODE const uint32_t sk_set_rgb_aarch64[] = { 174 0xa8c10c28, //ldp x8, x3, [x1], #16 175 0xaa0803e9, //mov x9, x8 176 0x4ddfc920, //ld1r {v0.4s}, [x9], #4 177 0x91002108, //add x8, x8, #0x8 178 0x4d40c902, //ld1r {v2.4s}, [x8] 179 0x4d40c921, //ld1r {v1.4s}, [x9] 180 0xd61f0060, //br x3 181 }; 182 183 CODE const uint32_t sk_swap_rb_aarch64[] = { 184 0xf8408423, //ldr x3, [x1], #8 185 0x4ea01c10, //mov v16.16b, v0.16b 186 0x4ea21c40, //mov v0.16b, v2.16b 187 0x4eb01e02, //mov v2.16b, v16.16b 188 0xd61f0060, //br x3 189 }; 190 191 CODE const uint32_t sk_swap_aarch64[] = { 192 0xf8408423, //ldr x3, [x1], #8 193 0x4ea31c70, //mov v16.16b, v3.16b 194 0x4ea21c51, //mov v17.16b, v2.16b 195 0x4ea11c32, //mov v18.16b, v1.16b 196 0x4ea01c13, //mov v19.16b, v0.16b 197 0x4ea41c80, //mov v0.16b, v4.16b 198 0x4ea51ca1, //mov v1.16b, v5.16b 199 0x4ea61cc2, //mov v2.16b, v6.16b 200 0x4ea71ce3, //mov v3.16b, v7.16b 201 0x4eb31e64, //mov v4.16b, v19.16b 202 0x4eb21e45, //mov v5.16b, v18.16b 203 0x4eb11e26, //mov v6.16b, v17.16b 204 0x4eb01e07, //mov v7.16b, v16.16b 205 0xd61f0060, //br x3 206 }; 207 208 CODE const uint32_t sk_move_src_dst_aarch64[] = { 209 0xf8408423, //ldr x3, [x1], #8 210 0x4ea01c04, //mov v4.16b, v0.16b 211 0x4ea11c25, //mov v5.16b, v1.16b 212 0x4ea21c46, //mov v6.16b, v2.16b 213 0x4ea31c67, //mov v7.16b, v3.16b 214 0xd61f0060, //br x3 215 }; 216 217 CODE const uint32_t sk_move_dst_src_aarch64[] = { 218 0xf8408423, //ldr x3, [x1], #8 219 0x4ea41c80, //mov v0.16b, v4.16b 220 0x4ea51ca1, //mov v1.16b, v5.16b 221 0x4ea61cc2, //mov v2.16b, v6.16b 222 0x4ea71ce3, //mov v3.16b, v7.16b 223 0xd61f0060, //br x3 224 }; 225 226 CODE const uint32_t sk_premul_aarch64[] = { 227 0xf8408423, //ldr x3, [x1], #8 228 0x6e23dc00, //fmul v0.4s, v0.4s, v3.4s 229 0x6e23dc21, //fmul v1.4s, v1.4s, v3.4s 230 0x6e23dc42, //fmul v2.4s, v2.4s, v3.4s 231 0xd61f0060, //br x3 232 }; 233 234 CODE const uint32_t sk_unpremul_aarch64[] = { 235 0x4f03f611, //fmov v17.4s, #1.000000000000000000e+00 236 0xf8408423, //ldr x3, [x1], #8 237 0x4ea0d870, //fcmeq v16.4s, v3.4s, #0.0 238 0x6e23fe31, //fdiv v17.4s, v17.4s, v3.4s 239 0x4e701e30, //bic v16.16b, v17.16b, v16.16b 240 0x6e20de00, //fmul v0.4s, v16.4s, v0.4s 241 0x6e21de01, //fmul v1.4s, v16.4s, v1.4s 242 0x6e22de02, //fmul v2.4s, v16.4s, v2.4s 243 0xd61f0060, //br x3 244 }; 245 246 CODE const uint32_t sk_from_srgb_aarch64[] = { 247 0x52a7d328, //mov w8, #0x3e990000 248 0x72933348, //movk w8, #0x999a 249 0x4e040d10, //dup v16.4s, w8 250 0x52a7e648, //mov w8, #0x3f320000 251 0x7291eb88, //movk w8, #0x8f5c 252 0x4e040d11, //dup v17.4s, w8 253 0x52a76468, //mov w8, #0x3b230000 254 0x729ae148, //movk w8, #0xd70a 255 0x4e040d12, //dup v18.4s, w8 256 0x52a7b3c8, //mov w8, #0x3d9e0000 257 0x72907228, //movk w8, #0x8391 258 0x6e22dc54, //fmul v20.4s, v2.4s, v2.4s 259 0x4eb11e35, //mov v21.16b, v17.16b 260 0x4eb11e37, //mov v23.16b, v17.16b 261 0x4e22ce11, //fmla v17.4s, v16.4s, v2.4s 262 0x4eb21e56, //mov v22.16b, v18.16b 263 0x4eb21e58, //mov v24.16b, v18.16b 264 0x4e34ce32, //fmla v18.4s, v17.4s, v20.4s 265 0x4e040d11, //dup v17.4s, w8 266 0x52a7ac28, //mov w8, #0x3d610000 267 0x6e20dc13, //fmul v19.4s, v0.4s, v0.4s 268 0x7288f5c8, //movk w8, #0x47ae 269 0x4e20ce15, //fmla v21.4s, v16.4s, v0.4s 270 0xf8408423, //ldr x3, [x1], #8 271 0x6e21dc34, //fmul v20.4s, v1.4s, v1.4s 272 0x4e33ceb6, //fmla v22.4s, v21.4s, v19.4s 273 0x4e040d13, //dup v19.4s, w8 274 0x4e21ce17, //fmla v23.4s, v16.4s, v1.4s 275 0x6e31dc15, //fmul v21.4s, v0.4s, v17.4s 276 0x6ea0e660, //fcmgt v0.4s, v19.4s, v0.4s 277 0x6e31dc30, //fmul v16.4s, v1.4s, v17.4s 278 0x6ea1e661, //fcmgt v1.4s, v19.4s, v1.4s 279 0x6e31dc51, //fmul v17.4s, v2.4s, v17.4s 280 0x6ea2e662, //fcmgt v2.4s, v19.4s, v2.4s 281 0x4e34cef8, //fmla v24.4s, v23.4s, v20.4s 282 0x6e761ea0, //bsl v0.16b, v21.16b, v22.16b 283 0x6e781e01, //bsl v1.16b, v16.16b, v24.16b 284 0x6e721e22, //bsl v2.16b, v17.16b, v18.16b 285 0xd61f0060, //br x3 286 }; 287 288 CODE const uint32_t sk_to_srgb_aarch64[] = { 289 0x52a828e8, //mov w8, #0x41470000 290 0x728b8528, //movk w8, #0x5c29 291 0x4e040d12, //dup v18.4s, w8 292 0x52a7e608, //mov w8, #0x3f300000 293 0x728df9c8, //movk w8, #0x6fce 294 0x6ea1d811, //frsqrte v17.4s, v0.4s 295 0x4e040d13, //dup v19.4s, w8 296 0x52b7b948, //mov w8, #0xbdca0000 297 0x728af508, //movk w8, #0x57a8 298 0x6ea1d834, //frsqrte v20.4s, v1.4s 299 0x6e31de36, //fmul v22.4s, v17.4s, v17.4s 300 0x4e040d10, //dup v16.4s, w8 301 0x52a77188, //mov w8, #0x3b8c0000 302 0x6ea1d855, //frsqrte v21.4s, v2.4s 303 0x6e34de98, //fmul v24.4s, v20.4s, v20.4s 304 0x4eb6fc16, //frsqrts v22.4s, v0.4s, v22.4s 305 0x729ce088, //movk w8, #0xe704 306 0x6e35deb9, //fmul v25.4s, v21.4s, v21.4s 307 0x4eb8fc38, //frsqrts v24.4s, v1.4s, v24.4s 308 0x6e36de31, //fmul v17.4s, v17.4s, v22.4s 309 0x4e040d17, //dup v23.4s, w8 310 0x4eb9fc59, //frsqrts v25.4s, v2.4s, v25.4s 311 0x6e38de94, //fmul v20.4s, v20.4s, v24.4s 312 0x4ea1da36, //frecpe v22.4s, v17.4s 313 0x6e32dc1a, //fmul v26.4s, v0.4s, v18.4s 314 0x6ea0e6e0, //fcmgt v0.4s, v23.4s, v0.4s 315 0x6e32dc3c, //fmul v28.4s, v1.4s, v18.4s 316 0x6ea1e6e1, //fcmgt v1.4s, v23.4s, v1.4s 317 0x6e32dc52, //fmul v18.4s, v2.4s, v18.4s 318 0x6ea2e6e2, //fcmgt v2.4s, v23.4s, v2.4s 319 0x6e39deb5, //fmul v21.4s, v21.4s, v25.4s 320 0x4ea1da97, //frecpe v23.4s, v20.4s 321 0x4e36fe39, //frecps v25.4s, v17.4s, v22.4s 322 0x4ea1dab8, //frecpe v24.4s, v21.4s 323 0x6e39ded6, //fmul v22.4s, v22.4s, v25.4s 324 0x4e37fe99, //frecps v25.4s, v20.4s, v23.4s 325 0x4eb01e1b, //mov v27.16b, v16.16b 326 0x6e39def7, //fmul v23.4s, v23.4s, v25.4s 327 0x4e38feb9, //frecps v25.4s, v21.4s, v24.4s 328 0x6e39df18, //fmul v24.4s, v24.4s, v25.4s 329 0x4eb01e19, //mov v25.16b, v16.16b 330 0x4e36ce7b, //fmla v27.4s, v19.4s, v22.4s 331 0x6ea1da36, //frsqrte v22.4s, v17.4s 332 0x4e37ce79, //fmla v25.4s, v19.4s, v23.4s 333 0x6ea1da97, //frsqrte v23.4s, v20.4s 334 0x4e38ce70, //fmla v16.4s, v19.4s, v24.4s 335 0x6e36ded8, //fmul v24.4s, v22.4s, v22.4s 336 0x6ea1dab3, //frsqrte v19.4s, v21.4s 337 0x4eb8fe31, //frsqrts v17.4s, v17.4s, v24.4s 338 0x6e37def8, //fmul v24.4s, v23.4s, v23.4s 339 0x4eb8fe94, //frsqrts v20.4s, v20.4s, v24.4s 340 0x6e33de78, //fmul v24.4s, v19.4s, v19.4s 341 0x52a7da48, //mov w8, #0x3ed20000 342 0x4eb8feb5, //frsqrts v21.4s, v21.4s, v24.4s 343 0x7290f848, //movk w8, #0x87c2 344 0x6e31ded1, //fmul v17.4s, v22.4s, v17.4s 345 0x6e34def4, //fmul v20.4s, v23.4s, v20.4s 346 0x6e35de73, //fmul v19.4s, v19.4s, v21.4s 347 0x4e040d15, //dup v21.4s, w8 348 0xf8408423, //ldr x3, [x1], #8 349 0x4e31cebb, //fmla v27.4s, v21.4s, v17.4s 350 0x4f03f611, //fmov v17.4s, #1.000000000000000000e+00 351 0x4e34ceb9, //fmla v25.4s, v21.4s, v20.4s 352 0x4e33ceb0, //fmla v16.4s, v21.4s, v19.4s 353 0x4ebbf633, //fmin v19.4s, v17.4s, v27.4s 354 0x4eb9f634, //fmin v20.4s, v17.4s, v25.4s 355 0x4eb0f630, //fmin v16.4s, v17.4s, v16.4s 356 0x6e731f40, //bsl v0.16b, v26.16b, v19.16b 357 0x6e741f81, //bsl v1.16b, v28.16b, v20.16b 358 0x6e701e42, //bsl v2.16b, v18.16b, v16.16b 359 0xd61f0060, //br x3 360 }; 361 362 CODE const uint32_t sk_scale_1_float_aarch64[] = { 363 0xa8c10c28, //ldp x8, x3, [x1], #16 364 0xbd400110, //ldr s16, [x8] 365 0x4f909000, //fmul v0.4s, v0.4s, v16.s[0] 366 0x4f909021, //fmul v1.4s, v1.4s, v16.s[0] 367 0x4f909042, //fmul v2.4s, v2.4s, v16.s[0] 368 0x4f909063, //fmul v3.4s, v3.4s, v16.s[0] 369 0xd61f0060, //br x3 370 }; 371 372 CODE const uint32_t sk_scale_u8_aarch64[] = { 373 0xa8c10c28, //ldp x8, x3, [x1], #16 374 0x52a77009, //mov w9, #0x3b800000 375 0x72901029, //movk w9, #0x8081 376 0x4e040d30, //dup v16.4s, w9 377 0xf9400108, //ldr x8, [x8] 378 0x8b000108, //add x8, x8, x0 379 0x39400109, //ldrb w9, [x8] 380 0x3940050a, //ldrb w10, [x8, #1] 381 0x3940090b, //ldrb w11, [x8, #2] 382 0x39400d08, //ldrb w8, [x8, #3] 383 0x4e021d31, //mov v17.h[0], w9 384 0x4e061d51, //mov v17.h[1], w10 385 0x4e0a1d71, //mov v17.h[2], w11 386 0x4e0e1d11, //mov v17.h[3], w8 387 0x2f10a631, //uxtl v17.4s, v17.4h 388 0x6e21da31, //ucvtf v17.4s, v17.4s 389 0x6e30de30, //fmul v16.4s, v17.4s, v16.4s 390 0x6e20de00, //fmul v0.4s, v16.4s, v0.4s 391 0x6e21de01, //fmul v1.4s, v16.4s, v1.4s 392 0x6e22de02, //fmul v2.4s, v16.4s, v2.4s 393 0x6e23de03, //fmul v3.4s, v16.4s, v3.4s 394 0xd61f0060, //br x3 395 }; 396 397 CODE const uint32_t sk_lerp_1_float_aarch64[] = { 398 0xa8c10c28, //ldp x8, x3, [x1], #16 399 0x4ea4d411, //fsub v17.4s, v0.4s, v4.4s 400 0x4ea41c80, //mov v0.16b, v4.16b 401 0x4ea5d432, //fsub v18.4s, v1.4s, v5.4s 402 0xbd400110, //ldr s16, [x8] 403 0x4ea51ca1, //mov v1.16b, v5.16b 404 0x4f901220, //fmla v0.4s, v17.4s, v16.s[0] 405 0x4ea6d451, //fsub v17.4s, v2.4s, v6.4s 406 0x4f901241, //fmla v1.4s, v18.4s, v16.s[0] 407 0x4ea61cc2, //mov v2.16b, v6.16b 408 0x4ea7d472, //fsub v18.4s, v3.4s, v7.4s 409 0x4ea71ce3, //mov v3.16b, v7.16b 410 0x4f901222, //fmla v2.4s, v17.4s, v16.s[0] 411 0x4f901243, //fmla v3.4s, v18.4s, v16.s[0] 412 0xd61f0060, //br x3 413 }; 414 415 CODE const uint32_t sk_lerp_u8_aarch64[] = { 416 0xa8c10c28, //ldp x8, x3, [x1], #16 417 0x52a77009, //mov w9, #0x3b800000 418 0x72901029, //movk w9, #0x8081 419 0x4e040d30, //dup v16.4s, w9 420 0xf9400108, //ldr x8, [x8] 421 0x4ea4d412, //fsub v18.4s, v0.4s, v4.4s 422 0x8b000108, //add x8, x8, x0 423 0x3940010a, //ldrb w10, [x8] 424 0x39400509, //ldrb w9, [x8, #1] 425 0x3940090b, //ldrb w11, [x8, #2] 426 0x39400d08, //ldrb w8, [x8, #3] 427 0x4e021d51, //mov v17.h[0], w10 428 0x4e061d31, //mov v17.h[1], w9 429 0x4e0a1d71, //mov v17.h[2], w11 430 0x4e0e1d11, //mov v17.h[3], w8 431 0x2f10a620, //uxtl v0.4s, v17.4h 432 0x6e21d800, //ucvtf v0.4s, v0.4s 433 0x6e30dc10, //fmul v16.4s, v0.4s, v16.4s 434 0x4ea41c80, //mov v0.16b, v4.16b 435 0x4ea5d431, //fsub v17.4s, v1.4s, v5.4s 436 0x4ea51ca1, //mov v1.16b, v5.16b 437 0x4e32ce00, //fmla v0.4s, v16.4s, v18.4s 438 0x4ea6d452, //fsub v18.4s, v2.4s, v6.4s 439 0x4e31ce01, //fmla v1.4s, v16.4s, v17.4s 440 0x4ea61cc2, //mov v2.16b, v6.16b 441 0x4ea7d471, //fsub v17.4s, v3.4s, v7.4s 442 0x4ea71ce3, //mov v3.16b, v7.16b 443 0x4e32ce02, //fmla v2.4s, v16.4s, v18.4s 444 0x4e31ce03, //fmla v3.4s, v16.4s, v17.4s 445 0xd61f0060, //br x3 446 }; 447 448 CODE const uint32_t sk_lerp_565_aarch64[] = { 449 0xa8c10c28, //ldp x8, x3, [x1], #16 450 0xd37ff809, //lsl x9, x0, #1 451 0x4f072710, //movi v16.4s, #0xf8, lsl #8 452 0x4ea4d413, //fsub v19.4s, v0.4s, v4.4s 453 0xf9400108, //ldr x8, [x8] 454 0xfc696903, //ldr d3, [x8, x9] 455 0x52a6f088, //mov w8, #0x37840000 456 0x72842108, //movk w8, #0x2108 457 0x4e040d11, //dup v17.4s, w8 458 0x2f10a463, //uxtl v3.4s, v3.4h 459 0x321b17e8, //orr w8, wzr, #0x7e0 460 0x4e301c60, //and v0.16b, v3.16b, v16.16b 461 0x4e040d12, //dup v18.4s, w8 462 0x52a74048, //mov w8, #0x3a020000 463 0x4e21d800, //scvtf v0.4s, v0.4s 464 0x72810428, //movk w8, #0x821 465 0x6e31dc10, //fmul v16.4s, v0.4s, v17.4s 466 0x4ea41c80, //mov v0.16b, v4.16b 467 0x4e33ce00, //fmla v0.4s, v16.4s, v19.4s 468 0x4f0007f0, //movi v16.4s, #0x1f 469 0x4e040d11, //dup v17.4s, w8 470 0x52a7a088, //mov w8, #0x3d040000 471 0x4e321c72, //and v18.16b, v3.16b, v18.16b 472 0x72842108, //movk w8, #0x2108 473 0x4e301c63, //and v3.16b, v3.16b, v16.16b 474 0x4ea6d450, //fsub v16.4s, v2.4s, v6.4s 475 0x4e21da42, //scvtf v2.4s, v18.4s 476 0x6e31dc51, //fmul v17.4s, v2.4s, v17.4s 477 0x4e040d02, //dup v2.4s, w8 478 0x4e21d863, //scvtf v3.4s, v3.4s 479 0x4ea5d433, //fsub v19.4s, v1.4s, v5.4s 480 0x4ea51ca1, //mov v1.16b, v5.16b 481 0x6e22dc63, //fmul v3.4s, v3.4s, v2.4s 482 0x4ea61cc2, //mov v2.16b, v6.16b 483 0x4e33ce21, //fmla v1.4s, v17.4s, v19.4s 484 0x4e30cc62, //fmla v2.4s, v3.4s, v16.4s 485 0x4f03f603, //fmov v3.4s, #1.000000000000000000e+00 486 0xd61f0060, //br x3 487 }; 488 489 CODE const uint32_t sk_load_tables_aarch64[] = { 490 0xa8c10c28, //ldp x8, x3, [x1], #16 491 0xd37ef409, //lsl x9, x0, #2 492 0x6f00e620, //movi v0.2d, #0xff000000ff 493 0x52a7700b, //mov w11, #0x3b800000 494 0xa940310a, //ldp x10, x12, [x8] 495 0x7290102b, //movk w11, #0x8081 496 0x4e040d63, //dup v3.4s, w11 497 0x3ce96942, //ldr q2, [x10, x9] 498 0xa9412109, //ldp x9, x8, [x8, #16] 499 0x4e201c41, //and v1.16b, v2.16b, v0.16b 500 0x1e26002e, //fmov w14, s1 501 0x6f380450, //ushr v16.4s, v2.4s, #8 502 0x6f300451, //ushr v17.4s, v2.4s, #16 503 0x8b2e498e, //add x14, x12, w14, uxtw #2 504 0x0e0c3c2a, //mov w10, v1.s[1] 505 0x0e143c2b, //mov w11, v1.s[2] 506 0x0e1c3c2d, //mov w13, v1.s[3] 507 0x4e201e01, //and v1.16b, v16.16b, v0.16b 508 0x4e201e30, //and v16.16b, v17.16b, v0.16b 509 0x0d4081c0, //ld1 {v0.s}[0], [x14] 510 0x8b2a498a, //add x10, x12, w10, uxtw #2 511 0xbc6b5991, //ldr s17, [x12, w11, uxtw #2] 512 0xbc6d5992, //ldr s18, [x12, w13, uxtw #2] 513 0x0e0c3c2b, //mov w11, v1.s[1] 514 0x0e143c2c, //mov w12, v1.s[2] 515 0x0e1c3c2d, //mov w13, v1.s[3] 516 0x1e26002e, //fmov w14, s1 517 0x8b2e492e, //add x14, x9, w14, uxtw #2 518 0xbc6c5933, //ldr s19, [x9, w12, uxtw #2] 519 0xbc6d5934, //ldr s20, [x9, w13, uxtw #2] 520 0x8b2b4929, //add x9, x9, w11, uxtw #2 521 0x1e26020b, //fmov w11, s16 522 0x6f280442, //ushr v2.4s, v2.4s, #24 523 0x0d409140, //ld1 {v0.s}[1], [x10] 524 0x4e21d842, //scvtf v2.4s, v2.4s 525 0x8b2b490a, //add x10, x8, w11, uxtw #2 526 0x0d4081c1, //ld1 {v1.s}[0], [x14] 527 0x6e23dc43, //fmul v3.4s, v2.4s, v3.4s 528 0x0d408142, //ld1 {v2.s}[0], [x10] 529 0x0e0c3e0f, //mov w15, v16.s[1] 530 0x0e143e0c, //mov w12, v16.s[2] 531 0x8b2f490a, //add x10, x8, w15, uxtw #2 532 0x0e1c3e0d, //mov w13, v16.s[3] 533 0xbc6c5910, //ldr s16, [x8, w12, uxtw #2] 534 0x0d409121, //ld1 {v1.s}[1], [x9] 535 0x0d409142, //ld1 {v2.s}[1], [x10] 536 0x6e140620, //mov v0.s[2], v17.s[0] 537 0xbc6d5911, //ldr s17, [x8, w13, uxtw #2] 538 0x6e140661, //mov v1.s[2], v19.s[0] 539 0x6e140602, //mov v2.s[2], v16.s[0] 540 0x6e1c0640, //mov v0.s[3], v18.s[0] 541 0x6e1c0681, //mov v1.s[3], v20.s[0] 542 0x6e1c0622, //mov v2.s[3], v17.s[0] 543 0xd61f0060, //br x3 544 }; 545 546 CODE const uint32_t sk_load_a8_aarch64[] = { 547 0xa8c10c28, //ldp x8, x3, [x1], #16 548 0x52a77009, //mov w9, #0x3b800000 549 0x72901029, //movk w9, #0x8081 550 0x4e040d22, //dup v2.4s, w9 551 0xf9400108, //ldr x8, [x8] 552 0x6f00e400, //movi v0.2d, #0x0 553 0x6f00e401, //movi v1.2d, #0x0 554 0x8b000108, //add x8, x8, x0 555 0x3940010a, //ldrb w10, [x8] 556 0x3940050b, //ldrb w11, [x8, #1] 557 0x3940090c, //ldrb w12, [x8, #2] 558 0x39400d08, //ldrb w8, [x8, #3] 559 0x4e021d43, //mov v3.h[0], w10 560 0x4e061d63, //mov v3.h[1], w11 561 0x4e0a1d83, //mov v3.h[2], w12 562 0x4e0e1d03, //mov v3.h[3], w8 563 0x2f10a463, //uxtl v3.4s, v3.4h 564 0x6e21d863, //ucvtf v3.4s, v3.4s 565 0x6e22dc63, //fmul v3.4s, v3.4s, v2.4s 566 0x6f00e402, //movi v2.2d, #0x0 567 0xd61f0060, //br x3 568 }; 569 570 CODE const uint32_t sk_store_a8_aarch64[] = { 571 0xf9400028, //ldr x8, [x1] 572 0x52a86fe9, //mov w9, #0x437f0000 573 0x4e040d30, //dup v16.4s, w9 574 0x6e30dc70, //fmul v16.4s, v3.4s, v16.4s 575 0xf9400108, //ldr x8, [x8] 576 0x6e21aa10, //fcvtnu v16.4s, v16.4s 577 0x0e612a10, //xtn v16.4h, v16.4s 578 0x0e0e3e09, //umov w9, v16.h[3] 579 0x8b000108, //add x8, x8, x0 580 0x39000d09, //strb w9, [x8, #3] 581 0x0e0a3e09, //umov w9, v16.h[2] 582 0x39000909, //strb w9, [x8, #2] 583 0x0e063e09, //umov w9, v16.h[1] 584 0x39000509, //strb w9, [x8, #1] 585 0x0e023e09, //umov w9, v16.h[0] 586 0x39000109, //strb w9, [x8] 587 0xf9400423, //ldr x3, [x1, #8] 588 0x91004021, //add x1, x1, #0x10 589 0xd61f0060, //br x3 590 }; 591 592 CODE const uint32_t sk_load_565_aarch64[] = { 593 0xa8c10c28, //ldp x8, x3, [x1], #16 594 0xd37ff809, //lsl x9, x0, #1 595 0x4f072701, //movi v1.4s, #0xf8, lsl #8 596 0x4f0007e3, //movi v3.4s, #0x1f 597 0xf9400108, //ldr x8, [x8] 598 0xfc696900, //ldr d0, [x8, x9] 599 0x321b17e8, //orr w8, wzr, #0x7e0 600 0x4e040d02, //dup v2.4s, w8 601 0x52a6f088, //mov w8, #0x37840000 602 0x72842108, //movk w8, #0x2108 603 0x2f10a400, //uxtl v0.4s, v0.4h 604 0x4e211c01, //and v1.16b, v0.16b, v1.16b 605 0x4e221c02, //and v2.16b, v0.16b, v2.16b 606 0x4e231c03, //and v3.16b, v0.16b, v3.16b 607 0x4e040d00, //dup v0.4s, w8 608 0x52a74048, //mov w8, #0x3a020000 609 0x72810428, //movk w8, #0x821 610 0x4e21d821, //scvtf v1.4s, v1.4s 611 0x6e20dc20, //fmul v0.4s, v1.4s, v0.4s 612 0x4e040d01, //dup v1.4s, w8 613 0x52a7a088, //mov w8, #0x3d040000 614 0x72842108, //movk w8, #0x2108 615 0x4e21d842, //scvtf v2.4s, v2.4s 616 0x6e21dc41, //fmul v1.4s, v2.4s, v1.4s 617 0x4e040d02, //dup v2.4s, w8 618 0x4e21d863, //scvtf v3.4s, v3.4s 619 0x6e22dc62, //fmul v2.4s, v3.4s, v2.4s 620 0x4f03f603, //fmov v3.4s, #1.000000000000000000e+00 621 0xd61f0060, //br x3 622 }; 623 624 CODE const uint32_t sk_store_565_aarch64[] = { 625 0xf9400028, //ldr x8, [x1] 626 0x52a84f8a, //mov w10, #0x427c0000 627 0x4f01f7f0, //fmov v16.4s, #3.100000000000000000e+01 628 0x4e040d52, //dup v18.4s, w10 629 0x6e30dc11, //fmul v17.4s, v0.4s, v16.4s 630 0x6e32dc32, //fmul v18.4s, v1.4s, v18.4s 631 0x6e21aa31, //fcvtnu v17.4s, v17.4s 632 0x6e21aa52, //fcvtnu v18.4s, v18.4s 633 0x6e30dc50, //fmul v16.4s, v2.4s, v16.4s 634 0x4f2b5631, //shl v17.4s, v17.4s, #11 635 0xf9400108, //ldr x8, [x8] 636 0x4f255652, //shl v18.4s, v18.4s, #5 637 0x4eb11e51, //orr v17.16b, v18.16b, v17.16b 638 0x6e21aa10, //fcvtnu v16.4s, v16.4s 639 0x4eb01e30, //orr v16.16b, v17.16b, v16.16b 640 0xd37ff809, //lsl x9, x0, #1 641 0x0e612a10, //xtn v16.4h, v16.4s 642 0xfc296910, //str d16, [x8, x9] 643 0xf9400423, //ldr x3, [x1, #8] 644 0x91004021, //add x1, x1, #0x10 645 0xd61f0060, //br x3 646 }; 647 648 CODE const uint32_t sk_load_8888_aarch64[] = { 649 0xa8c10c28, //ldp x8, x3, [x1], #16 650 0xd37ef409, //lsl x9, x0, #2 651 0x6f00e621, //movi v1.2d, #0xff000000ff 652 0xf9400108, //ldr x8, [x8] 653 0x3ce96900, //ldr q0, [x8, x9] 654 0x52a77008, //mov w8, #0x3b800000 655 0x72901028, //movk w8, #0x8081 656 0x4e040d02, //dup v2.4s, w8 657 0x6f380410, //ushr v16.4s, v0.4s, #8 658 0x6f300411, //ushr v17.4s, v0.4s, #16 659 0x4e211c03, //and v3.16b, v0.16b, v1.16b 660 0x6f280400, //ushr v0.4s, v0.4s, #24 661 0x4e211e10, //and v16.16b, v16.16b, v1.16b 662 0x4e211e21, //and v1.16b, v17.16b, v1.16b 663 0x4e21d863, //scvtf v3.4s, v3.4s 664 0x4e21d811, //scvtf v17.4s, v0.4s 665 0x4e21da10, //scvtf v16.4s, v16.4s 666 0x4e21d832, //scvtf v18.4s, v1.4s 667 0x6e22dc60, //fmul v0.4s, v3.4s, v2.4s 668 0x6e22de23, //fmul v3.4s, v17.4s, v2.4s 669 0x6e22de01, //fmul v1.4s, v16.4s, v2.4s 670 0x6e22de42, //fmul v2.4s, v18.4s, v2.4s 671 0xd61f0060, //br x3 672 }; 673 674 CODE const uint32_t sk_store_8888_aarch64[] = { 675 0x52a86fea, //mov w10, #0x437f0000 676 0x4e040d50, //dup v16.4s, w10 677 0xf9400028, //ldr x8, [x1] 678 0x6e30dc32, //fmul v18.4s, v1.4s, v16.4s 679 0x6e30dc11, //fmul v17.4s, v0.4s, v16.4s 680 0x6e21aa52, //fcvtnu v18.4s, v18.4s 681 0x6e21aa31, //fcvtnu v17.4s, v17.4s 682 0x4f285652, //shl v18.4s, v18.4s, #8 683 0x4eb11e51, //orr v17.16b, v18.16b, v17.16b 684 0x6e30dc52, //fmul v18.4s, v2.4s, v16.4s 685 0x6e30dc70, //fmul v16.4s, v3.4s, v16.4s 686 0x6e21aa52, //fcvtnu v18.4s, v18.4s 687 0xf9400108, //ldr x8, [x8] 688 0x6e21aa10, //fcvtnu v16.4s, v16.4s 689 0x4f305652, //shl v18.4s, v18.4s, #16 690 0x4eb21e31, //orr v17.16b, v17.16b, v18.16b 691 0x4f385610, //shl v16.4s, v16.4s, #24 692 0xd37ef409, //lsl x9, x0, #2 693 0x4eb01e30, //orr v16.16b, v17.16b, v16.16b 694 0x3ca96910, //str q16, [x8, x9] 695 0xf9400423, //ldr x3, [x1, #8] 696 0x91004021, //add x1, x1, #0x10 697 0xd61f0060, //br x3 698 }; 699 700 CODE const uint32_t sk_load_f16_aarch64[] = { 701 0xa8c10c28, //ldp x8, x3, [x1], #16 702 0xf9400108, //ldr x8, [x8] 703 0x8b000d08, //add x8, x8, x0, lsl #3 704 0x0c400510, //ld4 {v16.4h-v19.4h}, [x8] 705 0x0e217a00, //fcvtl v0.4s, v16.4h 706 0x0e217a21, //fcvtl v1.4s, v17.4h 707 0x0e217a42, //fcvtl v2.4s, v18.4h 708 0x0e217a63, //fcvtl v3.4s, v19.4h 709 0xd61f0060, //br x3 710 }; 711 712 CODE const uint32_t sk_store_f16_aarch64[] = { 713 0xf9400028, //ldr x8, [x1] 714 0x0e216810, //fcvtn v16.4h, v0.4s 715 0x0e216831, //fcvtn v17.4h, v1.4s 716 0x0e216852, //fcvtn v18.4h, v2.4s 717 0xf9400108, //ldr x8, [x8] 718 0x0e216873, //fcvtn v19.4h, v3.4s 719 0x8b000d08, //add x8, x8, x0, lsl #3 720 0x0c000510, //st4 {v16.4h-v19.4h}, [x8] 721 0xf9400423, //ldr x3, [x1, #8] 722 0x91004021, //add x1, x1, #0x10 723 0xd61f0060, //br x3 724 }; 725 726 CODE const uint32_t sk_store_f32_aarch64[] = { 727 0xf9400028, //ldr x8, [x1] 728 0xf9400108, //ldr x8, [x8] 729 0x8b001108, //add x8, x8, x0, lsl #4 730 0x4c000900, //st4 {v0.4s-v3.4s}, [x8] 731 0xf9400423, //ldr x3, [x1, #8] 732 0x91004021, //add x1, x1, #0x10 733 0xd61f0060, //br x3 734 }; 735 736 CODE const uint32_t sk_clamp_x_aarch64[] = { 737 0xa8c10c28, //ldp x8, x3, [x1], #16 738 0x6f00e411, //movi v17.2d, #0x0 739 0x4e20f620, //fmax v0.4s, v17.4s, v0.4s 740 0x6f07e7f1, //movi v17.2d, #0xffffffffffffffff 741 0x4d40c910, //ld1r {v16.4s}, [x8] 742 0x4eb18610, //add v16.4s, v16.4s, v17.4s 743 0x4eb0f400, //fmin v0.4s, v0.4s, v16.4s 744 0xd61f0060, //br x3 745 }; 746 747 CODE const uint32_t sk_clamp_y_aarch64[] = { 748 0xa8c10c28, //ldp x8, x3, [x1], #16 749 0x6f00e411, //movi v17.2d, #0x0 750 0x4e21f621, //fmax v1.4s, v17.4s, v1.4s 751 0x6f07e7f1, //movi v17.2d, #0xffffffffffffffff 752 0x4d40c910, //ld1r {v16.4s}, [x8] 753 0x4eb18610, //add v16.4s, v16.4s, v17.4s 754 0x4eb0f421, //fmin v1.4s, v1.4s, v16.4s 755 0xd61f0060, //br x3 756 }; 757 758 CODE const uint32_t sk_repeat_x_aarch64[] = { 759 0xa8c10c28, //ldp x8, x3, [x1], #16 760 0x6f07e7f1, //movi v17.2d, #0xffffffffffffffff 761 0xbd400110, //ldr s16, [x8] 762 0x4e040612, //dup v18.4s, v16.s[0] 763 0x4eb18651, //add v17.4s, v18.4s, v17.4s 764 0x6e32fc12, //fdiv v18.4s, v0.4s, v18.4s 765 0x4e219a52, //frintm v18.4s, v18.4s 766 0x4f905240, //fmls v0.4s, v18.4s, v16.s[0] 767 0x4eb1f400, //fmin v0.4s, v0.4s, v17.4s 768 0xd61f0060, //br x3 769 }; 770 771 CODE const uint32_t sk_repeat_y_aarch64[] = { 772 0xa8c10c28, //ldp x8, x3, [x1], #16 773 0x6f07e7f1, //movi v17.2d, #0xffffffffffffffff 774 0xbd400110, //ldr s16, [x8] 775 0x4e040612, //dup v18.4s, v16.s[0] 776 0x4eb18651, //add v17.4s, v18.4s, v17.4s 777 0x6e32fc32, //fdiv v18.4s, v1.4s, v18.4s 778 0x4e219a52, //frintm v18.4s, v18.4s 779 0x4f905241, //fmls v1.4s, v18.4s, v16.s[0] 780 0x4eb1f421, //fmin v1.4s, v1.4s, v17.4s 781 0xd61f0060, //br x3 782 }; 783 784 CODE const uint32_t sk_mirror_x_aarch64[] = { 785 0xa8c10c28, //ldp x8, x3, [x1], #16 786 0xbd400110, //ldr s16, [x8] 787 0x4e040611, //dup v17.4s, v16.s[0] 788 0x1e302a10, //fadd s16, s16, s16 789 0x4eb1d400, //fsub v0.4s, v0.4s, v17.4s 790 0x4e040612, //dup v18.4s, v16.s[0] 791 0x6e32fc12, //fdiv v18.4s, v0.4s, v18.4s 792 0x4e219a52, //frintm v18.4s, v18.4s 793 0x4f905240, //fmls v0.4s, v18.4s, v16.s[0] 794 0x6f07e7f0, //movi v16.2d, #0xffffffffffffffff 795 0x4eb1d400, //fsub v0.4s, v0.4s, v17.4s 796 0x4eb08630, //add v16.4s, v17.4s, v16.4s 797 0x4ea0f800, //fabs v0.4s, v0.4s 798 0x4eb0f400, //fmin v0.4s, v0.4s, v16.4s 799 0xd61f0060, //br x3 800 }; 801 802 CODE const uint32_t sk_mirror_y_aarch64[] = { 803 0xa8c10c28, //ldp x8, x3, [x1], #16 804 0xbd400110, //ldr s16, [x8] 805 0x4e040611, //dup v17.4s, v16.s[0] 806 0x1e302a10, //fadd s16, s16, s16 807 0x4eb1d421, //fsub v1.4s, v1.4s, v17.4s 808 0x4e040612, //dup v18.4s, v16.s[0] 809 0x6e32fc32, //fdiv v18.4s, v1.4s, v18.4s 810 0x4e219a52, //frintm v18.4s, v18.4s 811 0x4f905241, //fmls v1.4s, v18.4s, v16.s[0] 812 0x6f07e7f0, //movi v16.2d, #0xffffffffffffffff 813 0x4eb1d421, //fsub v1.4s, v1.4s, v17.4s 814 0x4eb08630, //add v16.4s, v17.4s, v16.4s 815 0x4ea0f821, //fabs v1.4s, v1.4s 816 0x4eb0f421, //fmin v1.4s, v1.4s, v16.4s 817 0xd61f0060, //br x3 818 }; 819 820 CODE const uint32_t sk_luminance_to_alpha_aarch64[] = { 821 0x52a7cb28, //mov w8, #0x3e590000 822 0x72967a08, //movk w8, #0xb3d0 823 0x4e040d11, //dup v17.4s, w8 824 0x52a7e6e8, //mov w8, #0x3f370000 825 0x7282eb28, //movk w8, #0x1759 826 0x4ea01c10, //mov v16.16b, v0.16b 827 0x4e040d00, //dup v0.4s, w8 828 0x52a7b268, //mov w8, #0x3d930000 829 0xf8408423, //ldr x3, [x1], #8 830 0x729bb308, //movk w8, #0xdd98 831 0x6e20dc23, //fmul v3.4s, v1.4s, v0.4s 832 0x4e30ce23, //fmla v3.4s, v17.4s, v16.4s 833 0x4e040d10, //dup v16.4s, w8 834 0x6f00e400, //movi v0.2d, #0x0 835 0x6f00e401, //movi v1.2d, #0x0 836 0x4e22ce03, //fmla v3.4s, v16.4s, v2.4s 837 0x6f00e402, //movi v2.2d, #0x0 838 0xd61f0060, //br x3 839 }; 840 841 CODE const uint32_t sk_matrix_2x3_aarch64[] = { 842 0xa8c10c28, //ldp x8, x3, [x1], #16 843 0xaa0803e9, //mov x9, x8 844 0x9100410a, //add x10, x8, #0x10 845 0x4ddfc932, //ld1r {v18.4s}, [x9], #4 846 0x4d40c950, //ld1r {v16.4s}, [x10] 847 0x2d415113, //ldp s19, s20, [x8, #8] 848 0x9100510a, //add x10, x8, #0x14 849 0x4d40c951, //ld1r {v17.4s}, [x10] 850 0x4f931030, //fmla v16.4s, v1.4s, v19.s[0] 851 0xbd400133, //ldr s19, [x9] 852 0x4f941031, //fmla v17.4s, v1.4s, v20.s[0] 853 0x4e20ce50, //fmla v16.4s, v18.4s, v0.4s 854 0x4f931011, //fmla v17.4s, v0.4s, v19.s[0] 855 0x4eb01e00, //mov v0.16b, v16.16b 856 0x4eb11e21, //mov v1.16b, v17.16b 857 0xd61f0060, //br x3 858 }; 859 860 CODE const uint32_t sk_matrix_3x4_aarch64[] = { 861 0xa8c10c28, //ldp x8, x3, [x1], #16 862 0xaa0803e9, //mov x9, x8 863 0x9100910a, //add x10, x8, #0x24 864 0x4ddfc933, //ld1r {v19.4s}, [x9], #4 865 0x4d40c950, //ld1r {v16.4s}, [x10] 866 0x9100a10a, //add x10, x8, #0x28 867 0x4d40c951, //ld1r {v17.4s}, [x10] 868 0x9100b10a, //add x10, x8, #0x2c 869 0x2d435514, //ldp s20, s21, [x8, #24] 870 0xbd402116, //ldr s22, [x8, #32] 871 0x4d40c952, //ld1r {v18.4s}, [x10] 872 0x4f941050, //fmla v16.4s, v2.4s, v20.s[0] 873 0x4f951051, //fmla v17.4s, v2.4s, v21.s[0] 874 0x4f961052, //fmla v18.4s, v2.4s, v22.s[0] 875 0x2d425502, //ldp s2, s21, [x8, #16] 876 0x2d415d14, //ldp s20, s23, [x8, #8] 877 0x4f821031, //fmla v17.4s, v1.4s, v2.s[0] 878 0xbd400122, //ldr s2, [x9] 879 0x4f971030, //fmla v16.4s, v1.4s, v23.s[0] 880 0x4f951032, //fmla v18.4s, v1.4s, v21.s[0] 881 0x4e20ce70, //fmla v16.4s, v19.4s, v0.4s 882 0x4f941012, //fmla v18.4s, v0.4s, v20.s[0] 883 0x4f821011, //fmla v17.4s, v0.4s, v2.s[0] 884 0x4eb01e00, //mov v0.16b, v16.16b 885 0x4eb11e21, //mov v1.16b, v17.16b 886 0x4eb21e42, //mov v2.16b, v18.16b 887 0xd61f0060, //br x3 888 }; 889 890 CODE const uint32_t sk_matrix_4x5_aarch64[] = { 891 0xf9400029, //ldr x9, [x1] 892 0xaa0903e8, //mov x8, x9 893 0x9101012a, //add x10, x9, #0x40 894 0x4ddfc914, //ld1r {v20.4s}, [x8], #4 895 0x4d40c950, //ld1r {v16.4s}, [x10] 896 0x9101112a, //add x10, x9, #0x44 897 0x4d40c951, //ld1r {v17.4s}, [x10] 898 0x9101212a, //add x10, x9, #0x48 899 0x4d40c952, //ld1r {v18.4s}, [x10] 900 0x2d465533, //ldp s19, s21, [x9, #48] 901 0x2d475d36, //ldp s22, s23, [x9, #56] 902 0x9101312a, //add x10, x9, #0x4c 903 0xf9400423, //ldr x3, [x1, #8] 904 0x4f931070, //fmla v16.4s, v3.4s, v19.s[0] 905 0x4d40c953, //ld1r {v19.4s}, [x10] 906 0x4f951071, //fmla v17.4s, v3.4s, v21.s[0] 907 0x4f961072, //fmla v18.4s, v3.4s, v22.s[0] 908 0x2d445935, //ldp s21, s22, [x9, #32] 909 0x4f971073, //fmla v19.4s, v3.4s, v23.s[0] 910 0x2d455d23, //ldp s3, s23, [x9, #40] 911 0x91004021, //add x1, x1, #0x10 912 0x4f951050, //fmla v16.4s, v2.4s, v21.s[0] 913 0x4f961051, //fmla v17.4s, v2.4s, v22.s[0] 914 0x2d425935, //ldp s21, s22, [x9, #16] 915 0x4f971053, //fmla v19.4s, v2.4s, v23.s[0] 916 0x4f831052, //fmla v18.4s, v2.4s, v3.s[0] 917 0x2d410d22, //ldp s2, s3, [x9, #8] 918 0x4f951030, //fmla v16.4s, v1.4s, v21.s[0] 919 0x2d435d35, //ldp s21, s23, [x9, #24] 920 0x4f961031, //fmla v17.4s, v1.4s, v22.s[0] 921 0xbd400116, //ldr s22, [x8] 922 0x4e20ce90, //fmla v16.4s, v20.4s, v0.4s 923 0x4f951032, //fmla v18.4s, v1.4s, v21.s[0] 924 0x4f971033, //fmla v19.4s, v1.4s, v23.s[0] 925 0x4f821012, //fmla v18.4s, v0.4s, v2.s[0] 926 0x4f831013, //fmla v19.4s, v0.4s, v3.s[0] 927 0x4f961011, //fmla v17.4s, v0.4s, v22.s[0] 928 0x4eb01e00, //mov v0.16b, v16.16b 929 0x4eb11e21, //mov v1.16b, v17.16b 930 0x4eb21e42, //mov v2.16b, v18.16b 931 0x4eb31e63, //mov v3.16b, v19.16b 932 0xd61f0060, //br x3 933 }; 934 935 CODE const uint32_t sk_matrix_perspective_aarch64[] = { 936 0xa8c10c28, //ldp x8, x3, [x1], #16 937 0xaa0803e9, //mov x9, x8 938 0x9100510a, //add x10, x8, #0x14 939 0x4ddfc930, //ld1r {v16.4s}, [x9], #4 940 0x4d40c951, //ld1r {v17.4s}, [x10] 941 0x9100810a, //add x10, x8, #0x20 942 0x4d40c952, //ld1r {v18.4s}, [x10] 943 0x2d41d113, //ldp s19, s20, [x8, #12] 944 0x2d435915, //ldp s21, s22, [x8, #24] 945 0x91002108, //add x8, x8, #0x8 946 0x4f941031, //fmla v17.4s, v1.4s, v20.s[0] 947 0x4d40c914, //ld1r {v20.4s}, [x8] 948 0x4f961032, //fmla v18.4s, v1.4s, v22.s[0] 949 0xbd400136, //ldr s22, [x9] 950 0x4f951012, //fmla v18.4s, v0.4s, v21.s[0] 951 0x4f931011, //fmla v17.4s, v0.4s, v19.s[0] 952 0x4f961034, //fmla v20.4s, v1.4s, v22.s[0] 953 0x4ea1da41, //frecpe v1.4s, v18.4s 954 0x4e21fe52, //frecps v18.4s, v18.4s, v1.4s 955 0x6e32dc32, //fmul v18.4s, v1.4s, v18.4s 956 0x4e20ce14, //fmla v20.4s, v16.4s, v0.4s 957 0x6e32de21, //fmul v1.4s, v17.4s, v18.4s 958 0x6e32de80, //fmul v0.4s, v20.4s, v18.4s 959 0xd61f0060, //br x3 960 }; 961 962 CODE const uint32_t sk_linear_gradient_2stops_aarch64[] = { 963 0xa8c10c28, //ldp x8, x3, [x1], #16 964 0xad404503, //ldp q3, q17, [x8] 965 0x4e040470, //dup v16.4s, v3.s[0] 966 0x4e0c0461, //dup v1.4s, v3.s[1] 967 0x4e140462, //dup v2.4s, v3.s[2] 968 0x4e1c0463, //dup v3.4s, v3.s[3] 969 0x4f911010, //fmla v16.4s, v0.4s, v17.s[0] 970 0x4fb11001, //fmla v1.4s, v0.4s, v17.s[1] 971 0x4f911802, //fmla v2.4s, v0.4s, v17.s[2] 972 0x4fb11803, //fmla v3.4s, v0.4s, v17.s[3] 973 0x4eb01e00, //mov v0.16b, v16.16b 974 0xd61f0060, //br x3 975 }; 976 #elif defined(__arm__) 977 978 CODE const uint32_t sk_start_pipeline_vfp4[] = { 979 0xe92d41f0, //push {r4, r5, r6, r7, r8, lr} 980 0xe1a04000, //mov r4, r0 981 0xe2840002, //add r0, r4, #2 982 0xe1a05003, //mov r5, r3 983 0xe1a08002, //mov r8, r2 984 0xe1a07001, //mov r7, r1 985 0xe1500005, //cmp r0, r5 986 0x8a000010, //bhi 64 <sk_start_pipeline_vfp4+0x64> 987 0xe4976004, //ldr r6, [r7], #4 988 0xf2800010, //vmov.i32 d0, #0 989 0xe1a00004, //mov r0, r4 990 0xf2801010, //vmov.i32 d1, #0 991 0xe1a01007, //mov r1, r7 992 0xf2802010, //vmov.i32 d2, #0 993 0xe1a02008, //mov r2, r8 994 0xf2803010, //vmov.i32 d3, #0 995 0xf2804010, //vmov.i32 d4, #0 996 0xf2805010, //vmov.i32 d5, #0 997 0xf2806010, //vmov.i32 d6, #0 998 0xf2807010, //vmov.i32 d7, #0 999 0xe12fff36, //blx r6 1000 0xe2840004, //add r0, r4, #4 1001 0xe2844002, //add r4, r4, #2 1002 0xe1500005, //cmp r0, r5 1003 0x9affffef, //bls 24 <sk_start_pipeline_vfp4+0x24> 1004 0xe1a00004, //mov r0, r4 1005 0xe8bd81f0, //pop {r4, r5, r6, r7, r8, pc} 1006 }; 1007 1008 CODE const uint32_t sk_just_return_vfp4[] = { 1009 0xe12fff1e, //bx lr 1010 }; 1011 1012 CODE const uint32_t sk_seed_shader_vfp4[] = { 1013 0xee800b90, //vdup.32 d16, r0 1014 0xe8911008, //ldm r1, {r3, ip} 1015 0xf3fb0620, //vcvt.f32.s32 d16, d16 1016 0xf2c3161f, //vmov.i32 d17, #1056964608 1017 0xedd23b00, //vldr d19, [r2] 1018 0xf4e32c9f, //vld1.32 {d18[]}, [r3 :32] 1019 0xf2872f10, //vmov.f32 d2, #1 1020 0xf3fb2622, //vcvt.f32.s32 d18, d18 1021 0xe2811008, //add r1, r1, #8 1022 0xf2400da1, //vadd.f32 d16, d16, d17 1023 0xf2803010, //vmov.i32 d3, #0 1024 0xf2804010, //vmov.i32 d4, #0 1025 0xf2021da1, //vadd.f32 d1, d18, d17 1026 0xf2000da3, //vadd.f32 d0, d16, d19 1027 0xf2805010, //vmov.i32 d5, #0 1028 0xf2806010, //vmov.i32 d6, #0 1029 0xf2807010, //vmov.i32 d7, #0 1030 0xe12fff1c, //bx ip 1031 }; 1032 1033 CODE const uint32_t sk_constant_color_vfp4[] = { 1034 0xe8911008, //ldm r1, {r3, ip} 1035 0xe2811008, //add r1, r1, #8 1036 0xf4630a0f, //vld1.8 {d16-d17}, [r3] 1037 0xf3b40c20, //vdup.32 d0, d16[0] 1038 0xf3bc1c20, //vdup.32 d1, d16[1] 1039 0xf3b42c21, //vdup.32 d2, d17[0] 1040 0xf3bc3c21, //vdup.32 d3, d17[1] 1041 0xe12fff1c, //bx ip 1042 }; 1043 1044 CODE const uint32_t sk_clear_vfp4[] = { 1045 0xe4913004, //ldr r3, [r1], #4 1046 0xf2800010, //vmov.i32 d0, #0 1047 0xf2801010, //vmov.i32 d1, #0 1048 0xf2802010, //vmov.i32 d2, #0 1049 0xf2803010, //vmov.i32 d3, #0 1050 0xe12fff13, //bx r3 1051 }; 1052 1053 CODE const uint32_t sk_plus__vfp4[] = { 1054 0xf2000d04, //vadd.f32 d0, d0, d4 1055 0xe4913004, //ldr r3, [r1], #4 1056 0xf2011d05, //vadd.f32 d1, d1, d5 1057 0xf2022d06, //vadd.f32 d2, d2, d6 1058 0xf2033d07, //vadd.f32 d3, d3, d7 1059 0xe12fff13, //bx r3 1060 }; 1061 1062 CODE const uint32_t sk_srcover_vfp4[] = { 1063 0xf2c70f10, //vmov.f32 d16, #1 1064 0xe4913004, //ldr r3, [r1], #4 1065 0xf2600d83, //vsub.f32 d16, d16, d3 1066 0xf2040c30, //vfma.f32 d0, d4, d16 1067 0xf2051c30, //vfma.f32 d1, d5, d16 1068 0xf2062c30, //vfma.f32 d2, d6, d16 1069 0xf2073c30, //vfma.f32 d3, d7, d16 1070 0xe12fff13, //bx r3 1071 }; 1072 1073 CODE const uint32_t sk_dstover_vfp4[] = { 1074 0xf2c70f10, //vmov.f32 d16, #1 1075 0xe4913004, //ldr r3, [r1], #4 1076 0xf2651115, //vorr d17, d5, d5 1077 0xf2604d87, //vsub.f32 d20, d16, d7 1078 0xf2640114, //vorr d16, d4, d4 1079 0xf2662116, //vorr d18, d6, d6 1080 0xf2673117, //vorr d19, d7, d7 1081 0xf2400c34, //vfma.f32 d16, d0, d20 1082 0xf2411c34, //vfma.f32 d17, d1, d20 1083 0xf2422c34, //vfma.f32 d18, d2, d20 1084 0xf2433c34, //vfma.f32 d19, d3, d20 1085 0xf22001b0, //vorr d0, d16, d16 1086 0xf22111b1, //vorr d1, d17, d17 1087 0xf22221b2, //vorr d2, d18, d18 1088 0xf22331b3, //vorr d3, d19, d19 1089 0xe12fff13, //bx r3 1090 }; 1091 1092 CODE const uint32_t sk_clamp_0_vfp4[] = { 1093 0xf2c00010, //vmov.i32 d16, #0 1094 0xe4913004, //ldr r3, [r1], #4 1095 0xf2000f20, //vmax.f32 d0, d0, d16 1096 0xf2011f20, //vmax.f32 d1, d1, d16 1097 0xf2022f20, //vmax.f32 d2, d2, d16 1098 0xf2033f20, //vmax.f32 d3, d3, d16 1099 0xe12fff13, //bx r3 1100 }; 1101 1102 CODE const uint32_t sk_clamp_1_vfp4[] = { 1103 0xf2c70f10, //vmov.f32 d16, #1 1104 0xe4913004, //ldr r3, [r1], #4 1105 0xf2200f20, //vmin.f32 d0, d0, d16 1106 0xf2211f20, //vmin.f32 d1, d1, d16 1107 0xf2222f20, //vmin.f32 d2, d2, d16 1108 0xf2233f20, //vmin.f32 d3, d3, d16 1109 0xe12fff13, //bx r3 1110 }; 1111 1112 CODE const uint32_t sk_clamp_a_vfp4[] = { 1113 0xf2c70f10, //vmov.f32 d16, #1 1114 0xe4913004, //ldr r3, [r1], #4 1115 0xf2233f20, //vmin.f32 d3, d3, d16 1116 0xf2200f03, //vmin.f32 d0, d0, d3 1117 0xf2211f03, //vmin.f32 d1, d1, d3 1118 0xf2222f03, //vmin.f32 d2, d2, d3 1119 0xe12fff13, //bx r3 1120 }; 1121 1122 CODE const uint32_t sk_set_rgb_vfp4[] = { 1123 0xe92d4800, //push {fp, lr} 1124 0xe8911008, //ldm r1, {r3, ip} 1125 0xe2811008, //add r1, r1, #8 1126 0xe283e008, //add lr, r3, #8 1127 0xf4a30c9d, //vld1.32 {d0[]}, [r3 :32]! 1128 0xf4ae2c9f, //vld1.32 {d2[]}, [lr :32] 1129 0xf4a31c9f, //vld1.32 {d1[]}, [r3 :32] 1130 0xe8bd4800, //pop {fp, lr} 1131 0xe12fff1c, //bx ip 1132 }; 1133 1134 CODE const uint32_t sk_swap_rb_vfp4[] = { 1135 0xeef00b40, //vmov.f64 d16, d0 1136 0xe4913004, //ldr r3, [r1], #4 1137 0xeeb00b42, //vmov.f64 d0, d2 1138 0xeeb02b60, //vmov.f64 d2, d16 1139 0xe12fff13, //bx r3 1140 }; 1141 1142 CODE const uint32_t sk_swap_vfp4[] = { 1143 0xeef00b43, //vmov.f64 d16, d3 1144 0xe4913004, //ldr r3, [r1], #4 1145 0xeef01b42, //vmov.f64 d17, d2 1146 0xeef02b41, //vmov.f64 d18, d1 1147 0xeef03b40, //vmov.f64 d19, d0 1148 0xeeb00b44, //vmov.f64 d0, d4 1149 0xeeb01b45, //vmov.f64 d1, d5 1150 0xeeb02b46, //vmov.f64 d2, d6 1151 0xeeb03b47, //vmov.f64 d3, d7 1152 0xeeb04b63, //vmov.f64 d4, d19 1153 0xeeb05b62, //vmov.f64 d5, d18 1154 0xeeb06b61, //vmov.f64 d6, d17 1155 0xeeb07b60, //vmov.f64 d7, d16 1156 0xe12fff13, //bx r3 1157 }; 1158 1159 CODE const uint32_t sk_move_src_dst_vfp4[] = { 1160 0xeeb04b40, //vmov.f64 d4, d0 1161 0xe4913004, //ldr r3, [r1], #4 1162 0xeeb05b41, //vmov.f64 d5, d1 1163 0xeeb06b42, //vmov.f64 d6, d2 1164 0xeeb07b43, //vmov.f64 d7, d3 1165 0xe12fff13, //bx r3 1166 }; 1167 1168 CODE const uint32_t sk_move_dst_src_vfp4[] = { 1169 0xeeb00b44, //vmov.f64 d0, d4 1170 0xe4913004, //ldr r3, [r1], #4 1171 0xeeb01b45, //vmov.f64 d1, d5 1172 0xeeb02b46, //vmov.f64 d2, d6 1173 0xeeb03b47, //vmov.f64 d3, d7 1174 0xe12fff13, //bx r3 1175 }; 1176 1177 CODE const uint32_t sk_premul_vfp4[] = { 1178 0xf3000d13, //vmul.f32 d0, d0, d3 1179 0xe4913004, //ldr r3, [r1], #4 1180 0xf3011d13, //vmul.f32 d1, d1, d3 1181 0xf3022d13, //vmul.f32 d2, d2, d3 1182 0xe12fff13, //bx r3 1183 }; 1184 1185 CODE const uint32_t sk_unpremul_vfp4[] = { 1186 0xed2d8b04, //vpush {d8-d9} 1187 0xeeb78a00, //vmov.f32 s16, #112 1188 0xf3f91503, //vceq.f32 d17, d3, #0 1189 0xf2c00010, //vmov.i32 d16, #0 1190 0xe4913004, //ldr r3, [r1], #4 1191 0xeec89a23, //vdiv.f32 s19, s16, s7 1192 0xee889a03, //vdiv.f32 s18, s16, s6 1193 0xf3501199, //vbsl d17, d16, d9 1194 0xf3010d90, //vmul.f32 d0, d17, d0 1195 0xf3011d91, //vmul.f32 d1, d17, d1 1196 0xf3012d92, //vmul.f32 d2, d17, d2 1197 0xecbd8b04, //vpop {d8-d9} 1198 0xe12fff13, //bx r3 1199 0xe320f000, //nop {0} 1200 }; 1201 1202 CODE const uint32_t sk_from_srgb_vfp4[] = { 1203 0xeddf3b20, //vldr d19, [pc, #128] 1204 0xf3408d10, //vmul.f32 d24, d0, d0 1205 0xeddf0b1c, //vldr d16, [pc, #112] 1206 0xf26341b3, //vorr d20, d19, d19 1207 0xf26351b3, //vorr d21, d19, d19 1208 0xeddf9b1f, //vldr d25, [pc, #124] 1209 0xf2404c30, //vfma.f32 d20, d0, d16 1210 0xeddf2b1b, //vldr d18, [pc, #108] 1211 0xf2415c30, //vfma.f32 d21, d1, d16 1212 0xeddfcb1d, //vldr d28, [pc, #116] 1213 0xf2423c30, //vfma.f32 d19, d2, d16 1214 0xe4913004, //ldr r3, [r1], #4 1215 0xf3426d12, //vmul.f32 d22, d2, d2 1216 0xf3417d11, //vmul.f32 d23, d1, d1 1217 0xf3620e80, //vcgt.f32 d16, d18, d0 1218 0xf3621e81, //vcgt.f32 d17, d18, d1 1219 0xf341ad39, //vmul.f32 d26, d1, d25 1220 0xf342bd39, //vmul.f32 d27, d2, d25 1221 0xf3622e82, //vcgt.f32 d18, d18, d2 1222 0xf3409d39, //vmul.f32 d25, d0, d25 1223 0xf26cd1bc, //vorr d29, d28, d28 1224 0xf248dcb4, //vfma.f32 d29, d24, d20 1225 0xf26c41bc, //vorr d20, d28, d28 1226 0xf2474cb5, //vfma.f32 d20, d23, d21 1227 0xf246ccb3, //vfma.f32 d28, d22, d19 1228 0xf35901bd, //vbsl d16, d25, d29 1229 0xf35a11b4, //vbsl d17, d26, d20 1230 0xf35b21bc, //vbsl d18, d27, d28 1231 0xf22001b0, //vorr d0, d16, d16 1232 0xf22111b1, //vorr d1, d17, d17 1233 0xf22221b2, //vorr d2, d18, d18 1234 0xe12fff13, //bx r3 1235 0x3e99999a, //.word 0x3e99999a 1236 0x3e99999a, //.word 0x3e99999a 1237 0x3f328f5c, //.word 0x3f328f5c 1238 0x3f328f5c, //.word 0x3f328f5c 1239 0x3d6147ae, //.word 0x3d6147ae 1240 0x3d6147ae, //.word 0x3d6147ae 1241 0x3d9e8391, //.word 0x3d9e8391 1242 0x3d9e8391, //.word 0x3d9e8391 1243 0x3b23d70a, //.word 0x3b23d70a 1244 0x3b23d70a, //.word 0x3b23d70a 1245 }; 1246 1247 CODE const uint32_t sk_to_srgb_vfp4[] = { 1248 0xf3fb0582, //vrsqrte.f32 d16, d2 1249 0xe4913004, //ldr r3, [r1], #4 1250 0xf3fb1581, //vrsqrte.f32 d17, d1 1251 0xf3fb2580, //vrsqrte.f32 d18, d0 1252 0xf3403db0, //vmul.f32 d19, d16, d16 1253 0xf3414db1, //vmul.f32 d20, d17, d17 1254 0xf3425db2, //vmul.f32 d21, d18, d18 1255 0xf2623f33, //vrsqrts.f32 d19, d2, d19 1256 0xf2614f34, //vrsqrts.f32 d20, d1, d20 1257 0xf2605f35, //vrsqrts.f32 d21, d0, d21 1258 0xf3400db3, //vmul.f32 d16, d16, d19 1259 0xf3411db4, //vmul.f32 d17, d17, d20 1260 0xf3422db5, //vmul.f32 d18, d18, d21 1261 0xf3fb3520, //vrecpe.f32 d19, d16 1262 0xf3fb4521, //vrecpe.f32 d20, d17 1263 0xf3fb6522, //vrecpe.f32 d22, d18 1264 0xf3fb55a0, //vrsqrte.f32 d21, d16 1265 0xf3fb75a1, //vrsqrte.f32 d23, d17 1266 0xf3fb85a2, //vrsqrte.f32 d24, d18 1267 0xf2409fb3, //vrecps.f32 d25, d16, d19 1268 0xf241afb4, //vrecps.f32 d26, d17, d20 1269 0xf242bfb6, //vrecps.f32 d27, d18, d22 1270 0xf345cdb5, //vmul.f32 d28, d21, d21 1271 0xf347ddb7, //vmul.f32 d29, d23, d23 1272 0xf348edb8, //vmul.f32 d30, d24, d24 1273 0xf2600fbc, //vrsqrts.f32 d16, d16, d28 1274 0xf2611fbd, //vrsqrts.f32 d17, d17, d29 1275 0xf2622fbe, //vrsqrts.f32 d18, d18, d30 1276 0xf3433db9, //vmul.f32 d19, d19, d25 1277 0xeddf9b21, //vldr d25, [pc, #132] 1278 0xf3444dba, //vmul.f32 d20, d20, d26 1279 0xeddfab21, //vldr d26, [pc, #132] 1280 0xf3466dbb, //vmul.f32 d22, d22, d27 1281 0xf26ab1ba, //vorr d27, d26, d26 1282 0xf243bcb9, //vfma.f32 d27, d19, d25 1283 0xf26a31ba, //vorr d19, d26, d26 1284 0xf2443cb9, //vfma.f32 d19, d20, d25 1285 0xeddf4b1d, //vldr d20, [pc, #116] 1286 0xf246acb9, //vfma.f32 d26, d22, d25 1287 0xf3450db0, //vmul.f32 d16, d21, d16 1288 0xeddf5b1c, //vldr d21, [pc, #112] 1289 0xf3471db1, //vmul.f32 d17, d23, d17 1290 0xf3482db2, //vmul.f32 d18, d24, d18 1291 0xf3406d35, //vmul.f32 d22, d0, d21 1292 0xf240bcb4, //vfma.f32 d27, d16, d20 1293 0xf2413cb4, //vfma.f32 d19, d17, d20 1294 0xf242acb4, //vfma.f32 d26, d18, d20 1295 0xeddf2b17, //vldr d18, [pc, #92] 1296 0xf3417d35, //vmul.f32 d23, d1, d21 1297 0xf3620e80, //vcgt.f32 d16, d18, d0 1298 0xf3621e81, //vcgt.f32 d17, d18, d1 1299 0xf3622e82, //vcgt.f32 d18, d18, d2 1300 0xf3425d35, //vmul.f32 d21, d2, d21 1301 0xf2c74f10, //vmov.f32 d20, #1 1302 0xf2648faa, //vmin.f32 d24, d20, d26 1303 0xf2643fa3, //vmin.f32 d19, d20, d19 1304 0xf2644fab, //vmin.f32 d20, d20, d27 1305 0xf35601b8, //vbsl d16, d22, d24 1306 0xf35711b3, //vbsl d17, d23, d19 1307 0xf35521b4, //vbsl d18, d21, d20 1308 0xf22001b0, //vorr d0, d16, d16 1309 0xf22111b1, //vorr d1, d17, d17 1310 0xf22221b2, //vorr d2, d18, d18 1311 0xe12fff13, //bx r3 1312 0x3f306fce, //.word 0x3f306fce 1313 0x3f306fce, //.word 0x3f306fce 1314 0xbdca57a8, //.word 0xbdca57a8 1315 0xbdca57a8, //.word 0xbdca57a8 1316 0x3ed287c2, //.word 0x3ed287c2 1317 0x3ed287c2, //.word 0x3ed287c2 1318 0x41475c29, //.word 0x41475c29 1319 0x41475c29, //.word 0x41475c29 1320 0x3b8ce704, //.word 0x3b8ce704 1321 0x3b8ce704, //.word 0x3b8ce704 1322 }; 1323 1324 CODE const uint32_t sk_scale_1_float_vfp4[] = { 1325 0xe8911008, //ldm r1, {r3, ip} 1326 0xe2811008, //add r1, r1, #8 1327 0xf4e30c9f, //vld1.32 {d16[]}, [r3 :32] 1328 0xf3000d90, //vmul.f32 d0, d16, d0 1329 0xf3001d91, //vmul.f32 d1, d16, d1 1330 0xf3002d92, //vmul.f32 d2, d16, d2 1331 0xf3003d93, //vmul.f32 d3, d16, d3 1332 0xe12fff1c, //bx ip 1333 }; 1334 1335 CODE const uint32_t sk_scale_u8_vfp4[] = { 1336 0xe24dd004, //sub sp, sp, #4 1337 0xe8911008, //ldm r1, {r3, ip} 1338 0xe2811008, //add r1, r1, #8 1339 0xe5933000, //ldr r3, [r3] 1340 0xe0833000, //add r3, r3, r0 1341 0xe1d330b0, //ldrh r3, [r3] 1342 0xe1cd30b0, //strh r3, [sp] 1343 0xe1a0300d, //mov r3, sp 1344 0xf4e3041f, //vld1.16 {d16[0]}, [r3 :16] 1345 0xf3c80a30, //vmovl.u8 q8, d16 1346 0xf3d00a30, //vmovl.u16 q8, d16 1347 0xf3fb06a0, //vcvt.f32.u32 d16, d16 1348 0xeddf1b06, //vldr d17, [pc, #24] 1349 0xf3400db1, //vmul.f32 d16, d16, d17 1350 0xf3000d90, //vmul.f32 d0, d16, d0 1351 0xf3001d91, //vmul.f32 d1, d16, d1 1352 0xf3002d92, //vmul.f32 d2, d16, d2 1353 0xf3003d93, //vmul.f32 d3, d16, d3 1354 0xe28dd004, //add sp, sp, #4 1355 0xe12fff1c, //bx ip 1356 0x3b808081, //.word 0x3b808081 1357 0x3b808081, //.word 0x3b808081 1358 }; 1359 1360 CODE const uint32_t sk_lerp_1_float_vfp4[] = { 1361 0xe8911008, //ldm r1, {r3, ip} 1362 0xf2600d04, //vsub.f32 d16, d0, d4 1363 0xf2611d05, //vsub.f32 d17, d1, d5 1364 0xf2622d06, //vsub.f32 d18, d2, d6 1365 0xe2811008, //add r1, r1, #8 1366 0xf2633d07, //vsub.f32 d19, d3, d7 1367 0xf4e34c9f, //vld1.32 {d20[]}, [r3 :32] 1368 0xf2240114, //vorr d0, d4, d4 1369 0xf2251115, //vorr d1, d5, d5 1370 0xf2262116, //vorr d2, d6, d6 1371 0xf2273117, //vorr d3, d7, d7 1372 0xf2000cb4, //vfma.f32 d0, d16, d20 1373 0xf2011cb4, //vfma.f32 d1, d17, d20 1374 0xf2022cb4, //vfma.f32 d2, d18, d20 1375 0xf2033cb4, //vfma.f32 d3, d19, d20 1376 0xe12fff1c, //bx ip 1377 }; 1378 1379 CODE const uint32_t sk_lerp_u8_vfp4[] = { 1380 0xe24dd004, //sub sp, sp, #4 1381 0xe8911008, //ldm r1, {r3, ip} 1382 0xf2602d04, //vsub.f32 d18, d0, d4 1383 0xf2623d06, //vsub.f32 d19, d2, d6 1384 0xf2634d07, //vsub.f32 d20, d3, d7 1385 0xe2811008, //add r1, r1, #8 1386 0xe5933000, //ldr r3, [r3] 1387 0xf2240114, //vorr d0, d4, d4 1388 0xf2262116, //vorr d2, d6, d6 1389 0xe0833000, //add r3, r3, r0 1390 0xf2273117, //vorr d3, d7, d7 1391 0xe1d330b0, //ldrh r3, [r3] 1392 0xe1cd30b0, //strh r3, [sp] 1393 0xe1a0300d, //mov r3, sp 1394 0xf4e3041f, //vld1.16 {d16[0]}, [r3 :16] 1395 0xf3c80a30, //vmovl.u8 q8, d16 1396 0xf3d00a30, //vmovl.u16 q8, d16 1397 0xf3fb06a0, //vcvt.f32.u32 d16, d16 1398 0xeddf1b08, //vldr d17, [pc, #32] 1399 0xf3400db1, //vmul.f32 d16, d16, d17 1400 0xf2611d05, //vsub.f32 d17, d1, d5 1401 0xf2251115, //vorr d1, d5, d5 1402 0xf2020cb0, //vfma.f32 d0, d18, d16 1403 0xf2011cb0, //vfma.f32 d1, d17, d16 1404 0xf2032cb0, //vfma.f32 d2, d19, d16 1405 0xf2043cb0, //vfma.f32 d3, d20, d16 1406 0xe28dd004, //add sp, sp, #4 1407 0xe12fff1c, //bx ip 1408 0x3b808081, //.word 0x3b808081 1409 0x3b808081, //.word 0x3b808081 1410 }; 1411 1412 CODE const uint32_t sk_lerp_565_vfp4[] = { 1413 0xe24dd004, //sub sp, sp, #4 1414 0xe8911008, //ldm r1, {r3, ip} 1415 0xf3c72218, //vmov.i32 d18, #63488 1416 0xf2c1101f, //vmov.i32 d17, #31 1417 0xf2603d04, //vsub.f32 d19, d0, d4 1418 0xe2811008, //add r1, r1, #8 1419 0xe5933000, //ldr r3, [r3] 1420 0xf2616d05, //vsub.f32 d22, d1, d5 1421 0xf2240114, //vorr d0, d4, d4 1422 0xf2251115, //vorr d1, d5, d5 1423 0xe7933080, //ldr r3, [r3, r0, lsl #1] 1424 0xf2873f10, //vmov.f32 d3, #1 1425 0xe58d3000, //str r3, [sp] 1426 0xe1a0300d, //mov r3, sp 1427 0xf4e3083f, //vld1.32 {d16[0]}, [r3 :32] 1428 0xe3a03e7e, //mov r3, #2016 1429 0xf3d04a30, //vmovl.u16 q10, d16 1430 0xee803b90, //vdup.32 d16, r3 1431 0xf24421b2, //vand d18, d20, d18 1432 0xf24411b1, //vand d17, d20, d17 1433 0xeddf5b12, //vldr d21, [pc, #72] 1434 0xf24401b0, //vand d16, d20, d16 1435 0xeddf4b0e, //vldr d20, [pc, #56] 1436 0xf3fb2622, //vcvt.f32.s32 d18, d18 1437 0xf3fb0620, //vcvt.f32.s32 d16, d16 1438 0xf3fb1621, //vcvt.f32.s32 d17, d17 1439 0xf3422db4, //vmul.f32 d18, d18, d20 1440 0xeddf4b0d, //vldr d20, [pc, #52] 1441 0xf3400db5, //vmul.f32 d16, d16, d21 1442 0xf2625d06, //vsub.f32 d21, d2, d6 1443 0xf3411db4, //vmul.f32 d17, d17, d20 1444 0xf2262116, //vorr d2, d6, d6 1445 0xf2030cb2, //vfma.f32 d0, d19, d18 1446 0xf2061cb0, //vfma.f32 d1, d22, d16 1447 0xf2052cb1, //vfma.f32 d2, d21, d17 1448 0xe28dd004, //add sp, sp, #4 1449 0xe12fff1c, //bx ip 1450 0xe320f000, //nop {0} 1451 0x37842108, //.word 0x37842108 1452 0x37842108, //.word 0x37842108 1453 0x3a020821, //.word 0x3a020821 1454 0x3a020821, //.word 0x3a020821 1455 0x3d042108, //.word 0x3d042108 1456 0x3d042108, //.word 0x3d042108 1457 }; 1458 1459 CODE const uint32_t sk_load_tables_vfp4[] = { 1460 0xe92d48f0, //push {r4, r5, r6, r7, fp, lr} 1461 0xe8911008, //ldm r1, {r3, ip} 1462 0xf3c7001f, //vmov.i32 d16, #255 1463 0xe2811008, //add r1, r1, #8 1464 0xe593e000, //ldr lr, [r3] 1465 0xe99300b0, //ldmib r3, {r4, r5, r7} 1466 0xe08e3100, //add r3, lr, r0, lsl #2 1467 0xedd31b00, //vldr d17, [r3] 1468 0xf24121b0, //vand d18, d17, d16 1469 0xf3f83031, //vshr.u32 d19, d17, #8 1470 0xee323b90, //vmov.32 r3, d18[1] 1471 0xee126b90, //vmov.32 r6, d18[0] 1472 0xf3f02031, //vshr.u32 d18, d17, #16 1473 0xf24221b0, //vand d18, d18, d16 1474 0xf24301b0, //vand d16, d19, d16 1475 0xe0843103, //add r3, r4, r3, lsl #2 1476 0xedd30a00, //vldr s1, [r3] 1477 0xe0843106, //add r3, r4, r6, lsl #2 1478 0xee326b90, //vmov.32 r6, d18[1] 1479 0xed930a00, //vldr s0, [r3] 1480 0xee303b90, //vmov.32 r3, d16[1] 1481 0xee104b90, //vmov.32 r4, d16[0] 1482 0xf3e80031, //vshr.u32 d16, d17, #24 1483 0xeddf1b0d, //vldr d17, [pc, #52] 1484 0xf3fb0620, //vcvt.f32.s32 d16, d16 1485 0xf3003db1, //vmul.f32 d3, d16, d17 1486 0xe087e106, //add lr, r7, r6, lsl #2 1487 0xee126b90, //vmov.32 r6, d18[0] 1488 0xe0853103, //add r3, r5, r3, lsl #2 1489 0xedde2a00, //vldr s5, [lr] 1490 0xedd31a00, //vldr s3, [r3] 1491 0xe0853104, //add r3, r5, r4, lsl #2 1492 0xed931a00, //vldr s2, [r3] 1493 0xe0873106, //add r3, r7, r6, lsl #2 1494 0xed932a00, //vldr s4, [r3] 1495 0xe8bd48f0, //pop {r4, r5, r6, r7, fp, lr} 1496 0xe12fff1c, //bx ip 1497 0xe320f000, //nop {0} 1498 0x3b808081, //.word 0x3b808081 1499 0x3b808081, //.word 0x3b808081 1500 }; 1501 1502 CODE const uint32_t sk_load_a8_vfp4[] = { 1503 0xe24dd004, //sub sp, sp, #4 1504 0xe8911008, //ldm r1, {r3, ip} 1505 0xe2811008, //add r1, r1, #8 1506 0xf2800010, //vmov.i32 d0, #0 1507 0xf2801010, //vmov.i32 d1, #0 1508 0xe5933000, //ldr r3, [r3] 1509 0xf2802010, //vmov.i32 d2, #0 1510 0xe0833000, //add r3, r3, r0 1511 0xe1d330b0, //ldrh r3, [r3] 1512 0xe1cd30b0, //strh r3, [sp] 1513 0xe1a0300d, //mov r3, sp 1514 0xf4e3041f, //vld1.16 {d16[0]}, [r3 :16] 1515 0xf3c80a30, //vmovl.u8 q8, d16 1516 0xf3d00a30, //vmovl.u16 q8, d16 1517 0xf3fb06a0, //vcvt.f32.u32 d16, d16 1518 0xeddf1b03, //vldr d17, [pc, #12] 1519 0xf3003db1, //vmul.f32 d3, d16, d17 1520 0xe28dd004, //add sp, sp, #4 1521 0xe12fff1c, //bx ip 1522 0xe320f000, //nop {0} 1523 0x3b808081, //.word 0x3b808081 1524 0x3b808081, //.word 0x3b808081 1525 }; 1526 1527 CODE const uint32_t sk_store_a8_vfp4[] = { 1528 0xe92d4800, //push {fp, lr} 1529 0xeddf0b0d, //vldr d16, [pc, #52] 1530 0xf2c3161f, //vmov.i32 d17, #1056964608 1531 0xf2431c30, //vfma.f32 d17, d3, d16 1532 0xe5913000, //ldr r3, [r1] 1533 0xe5933000, //ldr r3, [r3] 1534 0xf3fb07a1, //vcvt.u32.f32 d16, d17 1535 0xee10eb90, //vmov.32 lr, d16[0] 1536 0xee30cb90, //vmov.32 ip, d16[1] 1537 0xe7e3e000, //strb lr, [r3, r0]! 1538 0xe5c3c001, //strb ip, [r3, #1] 1539 0xe2813008, //add r3, r1, #8 1540 0xe591c004, //ldr ip, [r1, #4] 1541 0xe1a01003, //mov r1, r3 1542 0xe8bd4800, //pop {fp, lr} 1543 0xe12fff1c, //bx ip 1544 0x437f0000, //.word 0x437f0000 1545 0x437f0000, //.word 0x437f0000 1546 }; 1547 1548 CODE const uint32_t sk_load_565_vfp4[] = { 1549 0xe24dd004, //sub sp, sp, #4 1550 0xe8911008, //ldm r1, {r3, ip} 1551 0xf2c1101f, //vmov.i32 d17, #31 1552 0xf3c72218, //vmov.i32 d18, #63488 1553 0xeddf3b16, //vldr d19, [pc, #88] 1554 0xe2811008, //add r1, r1, #8 1555 0xe5933000, //ldr r3, [r3] 1556 0xf2873f10, //vmov.f32 d3, #1 1557 0xe7933080, //ldr r3, [r3, r0, lsl #1] 1558 0xe58d3000, //str r3, [sp] 1559 0xe1a0300d, //mov r3, sp 1560 0xf4e3083f, //vld1.32 {d16[0]}, [r3 :32] 1561 0xe3a03e7e, //mov r3, #2016 1562 0xf3d04a30, //vmovl.u16 q10, d16 1563 0xee803b90, //vdup.32 d16, r3 1564 0xf24411b1, //vand d17, d20, d17 1565 0xeddf5b0e, //vldr d21, [pc, #56] 1566 0xf24421b2, //vand d18, d20, d18 1567 0xf24401b0, //vand d16, d20, d16 1568 0xeddf4b09, //vldr d20, [pc, #36] 1569 0xf3fb2622, //vcvt.f32.s32 d18, d18 1570 0xf3fb0620, //vcvt.f32.s32 d16, d16 1571 0xf3fb1621, //vcvt.f32.s32 d17, d17 1572 0xf3020db3, //vmul.f32 d0, d18, d19 1573 0xf3001db4, //vmul.f32 d1, d16, d20 1574 0xf3012db5, //vmul.f32 d2, d17, d21 1575 0xe28dd004, //add sp, sp, #4 1576 0xe12fff1c, //bx ip 1577 0x37842108, //.word 0x37842108 1578 0x37842108, //.word 0x37842108 1579 0x3a020821, //.word 0x3a020821 1580 0x3a020821, //.word 0x3a020821 1581 0x3d042108, //.word 0x3d042108 1582 0x3d042108, //.word 0x3d042108 1583 }; 1584 1585 CODE const uint32_t sk_store_565_vfp4[] = { 1586 0xf2c30f1f, //vmov.f32 d16, #31 1587 0xeddf1b15, //vldr d17, [pc, #84] 1588 0xf2c3361f, //vmov.i32 d19, #1056964608 1589 0xe5913000, //ldr r3, [r1] 1590 0xf2413c31, //vfma.f32 d19, d1, d17 1591 0xf2c3161f, //vmov.i32 d17, #1056964608 1592 0xf2401c30, //vfma.f32 d17, d0, d16 1593 0xe5933000, //ldr r3, [r3] 1594 0xf2c3261f, //vmov.i32 d18, #1056964608 1595 0xf2422c30, //vfma.f32 d18, d2, d16 1596 0xe0833080, //add r3, r3, r0, lsl #1 1597 0xf3fb07a3, //vcvt.u32.f32 d16, d19 1598 0xf3fb17a1, //vcvt.u32.f32 d17, d17 1599 0xf3fb27a2, //vcvt.u32.f32 d18, d18 1600 0xf2e50530, //vshl.s32 d16, d16, #5 1601 0xf2eb1531, //vshl.s32 d17, d17, #11 1602 0xf26001b1, //vorr d16, d16, d17 1603 0xf26001b2, //vorr d16, d16, d18 1604 0xf3f60121, //vuzp.16 d16, d17 1605 0xf4c3080f, //vst1.32 {d16[0]}, [r3] 1606 0xe2813008, //add r3, r1, #8 1607 0xe591c004, //ldr ip, [r1, #4] 1608 0xe1a01003, //mov r1, r3 1609 0xe12fff1c, //bx ip 1610 0x427c0000, //.word 0x427c0000 1611 0x427c0000, //.word 0x427c0000 1612 }; 1613 1614 CODE const uint32_t sk_load_8888_vfp4[] = { 1615 0xe8911008, //ldm r1, {r3, ip} 1616 0xf3c7001f, //vmov.i32 d16, #255 1617 0xe2811008, //add r1, r1, #8 1618 0xe5933000, //ldr r3, [r3] 1619 0xe0833100, //add r3, r3, r0, lsl #2 1620 0xedd31b00, //vldr d17, [r3] 1621 0xf24121b0, //vand d18, d17, d16 1622 0xf3f83031, //vshr.u32 d19, d17, #8 1623 0xf3e84031, //vshr.u32 d20, d17, #24 1624 0xf3f01031, //vshr.u32 d17, d17, #16 1625 0xf24331b0, //vand d19, d19, d16 1626 0xf24101b0, //vand d16, d17, d16 1627 0xeddf1b08, //vldr d17, [pc, #32] 1628 0xf3fb2622, //vcvt.f32.s32 d18, d18 1629 0xf3fb4624, //vcvt.f32.s32 d20, d20 1630 0xf3fb3623, //vcvt.f32.s32 d19, d19 1631 0xf3fb0620, //vcvt.f32.s32 d16, d16 1632 0xf3020db1, //vmul.f32 d0, d18, d17 1633 0xf3043db1, //vmul.f32 d3, d20, d17 1634 0xf3031db1, //vmul.f32 d1, d19, d17 1635 0xf3002db1, //vmul.f32 d2, d16, d17 1636 0xe12fff1c, //bx ip 1637 0x3b808081, //.word 0x3b808081 1638 0x3b808081, //.word 0x3b808081 1639 }; 1640 1641 CODE const uint32_t sk_store_8888_vfp4[] = { 1642 0xeddf0b1a, //vldr d16, [pc, #104] 1643 0xf2c3261f, //vmov.i32 d18, #1056964608 1644 0xf2412c30, //vfma.f32 d18, d1, d16 1645 0xe5913000, //ldr r3, [r1] 1646 0xf2c3361f, //vmov.i32 d19, #1056964608 1647 0xf2c3161f, //vmov.i32 d17, #1056964608 1648 0xf2423c30, //vfma.f32 d19, d2, d16 1649 0xe5933000, //ldr r3, [r3] 1650 0xf2c3461f, //vmov.i32 d20, #1056964608 1651 0xf2401c30, //vfma.f32 d17, d0, d16 1652 0xe0833100, //add r3, r3, r0, lsl #2 1653 0xf2434c30, //vfma.f32 d20, d3, d16 1654 0xf3fb07a2, //vcvt.u32.f32 d16, d18 1655 0xf3fb27a3, //vcvt.u32.f32 d18, d19 1656 0xf3fb17a1, //vcvt.u32.f32 d17, d17 1657 0xf3fb37a4, //vcvt.u32.f32 d19, d20 1658 0xf2e80530, //vshl.s32 d16, d16, #8 1659 0xf2f02532, //vshl.s32 d18, d18, #16 1660 0xf26001b1, //vorr d16, d16, d17 1661 0xf2f81533, //vshl.s32 d17, d19, #24 1662 0xf26001b2, //vorr d16, d16, d18 1663 0xf26001b1, //vorr d16, d16, d17 1664 0xedc30b00, //vstr d16, [r3] 1665 0xe2813008, //add r3, r1, #8 1666 0xe591c004, //ldr ip, [r1, #4] 1667 0xe1a01003, //mov r1, r3 1668 0xe12fff1c, //bx ip 1669 0xe320f000, //nop {0} 1670 0x437f0000, //.word 0x437f0000 1671 0x437f0000, //.word 0x437f0000 1672 }; 1673 1674 CODE const uint32_t sk_load_f16_vfp4[] = { 1675 0xe8911008, //ldm r1, {r3, ip} 1676 0xe2811008, //add r1, r1, #8 1677 0xe5933000, //ldr r3, [r3] 1678 0xe0833180, //add r3, r3, r0, lsl #3 1679 0xf463084f, //vld2.16 {d16-d17}, [r3] 1680 0xf3f62720, //vcvt.f32.f16 q9, d16 1681 0xf3f60721, //vcvt.f32.f16 q8, d17 1682 0xf22201b2, //vorr d0, d18, d18 1683 0xf22011b0, //vorr d1, d16, d16 1684 0xf3ba00a3, //vtrn.32 d0, d19 1685 0xf22321b3, //vorr d2, d19, d19 1686 0xf3ba10a1, //vtrn.32 d1, d17 1687 0xf22131b1, //vorr d3, d17, d17 1688 0xe12fff1c, //bx ip 1689 }; 1690 1691 CODE const uint32_t sk_store_f16_vfp4[] = { 1692 0xeef00b41, //vmov.f64 d16, d1 1693 0xeef03b42, //vmov.f64 d19, d2 1694 0xf2631113, //vorr d17, d3, d3 1695 0xf2602110, //vorr d18, d0, d0 1696 0xf3fa00a1, //vtrn.32 d16, d17 1697 0xf3f61620, //vcvt.f16.f32 d17, q8 1698 0xf3fa20a3, //vtrn.32 d18, d19 1699 0xe5913000, //ldr r3, [r1] 1700 0xf3f60622, //vcvt.f16.f32 d16, q9 1701 0xe5933000, //ldr r3, [r3] 1702 0xe0833180, //add r3, r3, r0, lsl #3 1703 0xf443084f, //vst2.16 {d16-d17}, [r3] 1704 0xe2813008, //add r3, r1, #8 1705 0xe591c004, //ldr ip, [r1, #4] 1706 0xe1a01003, //mov r1, r3 1707 0xe12fff1c, //bx ip 1708 }; 1709 1710 CODE const uint32_t sk_store_f32_vfp4[] = { 1711 0xe5913000, //ldr r3, [r1] 1712 0xe5933000, //ldr r3, [r3] 1713 0xe0833200, //add r3, r3, r0, lsl #4 1714 0xf403008f, //vst4.32 {d0-d3}, [r3] 1715 0xe2813008, //add r3, r1, #8 1716 0xe591c004, //ldr ip, [r1, #4] 1717 0xe1a01003, //mov r1, r3 1718 0xe12fff1c, //bx ip 1719 }; 1720 1721 CODE const uint32_t sk_clamp_x_vfp4[] = { 1722 0xe8911008, //ldm r1, {r3, ip} 1723 0xf2c00010, //vmov.i32 d16, #0 1724 0xf3c71e1f, //vmov.i8 d17, #255 1725 0xf2400f80, //vmax.f32 d16, d16, d0 1726 0xe2811008, //add r1, r1, #8 1727 0xf4e32c9f, //vld1.32 {d18[]}, [r3 :32] 1728 0xf26218a1, //vadd.i32 d17, d18, d17 1729 0xf2200fa1, //vmin.f32 d0, d16, d17 1730 0xe12fff1c, //bx ip 1731 }; 1732 1733 CODE const uint32_t sk_clamp_y_vfp4[] = { 1734 0xe8911008, //ldm r1, {r3, ip} 1735 0xf2c00010, //vmov.i32 d16, #0 1736 0xf3c71e1f, //vmov.i8 d17, #255 1737 0xf2400f81, //vmax.f32 d16, d16, d1 1738 0xe2811008, //add r1, r1, #8 1739 0xf4e32c9f, //vld1.32 {d18[]}, [r3 :32] 1740 0xf26218a1, //vadd.i32 d17, d18, d17 1741 0xf2201fa1, //vmin.f32 d1, d16, d17 1742 0xe12fff1c, //bx ip 1743 }; 1744 1745 CODE const uint32_t sk_repeat_x_vfp4[] = { 1746 0xed2d8b04, //vpush {d8-d9} 1747 0xe8911008, //ldm r1, {r3, ip} 1748 0xf2c02010, //vmov.i32 d18, #0 1749 0xe2811008, //add r1, r1, #8 1750 0xeddf3b10, //vldr d19, [pc, #64] 1751 0xed938a00, //vldr s16, [r3] 1752 0xeec09a88, //vdiv.f32 s19, s1, s16 1753 0xee809a08, //vdiv.f32 s18, s0, s16 1754 0xf3fb0709, //vcvt.s32.f32 d16, d9 1755 0xf3fb0620, //vcvt.f32.s32 d16, d16 1756 0xf3601e89, //vcgt.f32 d17, d16, d9 1757 0xf35311b2, //vbsl d17, d19, d18 1758 0xf3f42c08, //vdup.32 d18, d8[0] 1759 0xf2600da1, //vsub.f32 d16, d16, d17 1760 0xf3c71e1f, //vmov.i8 d17, #255 1761 0xf26218a1, //vadd.i32 d17, d18, d17 1762 0xf2e009c8, //vmul.f32 d16, d16, d8[0] 1763 0xf2600d20, //vsub.f32 d16, d0, d16 1764 0xf2200fa1, //vmin.f32 d0, d16, d17 1765 0xecbd8b04, //vpop {d8-d9} 1766 0xe12fff1c, //bx ip 1767 0xe320f000, //nop {0} 1768 0x3f800000, //.word 0x3f800000 1769 0x3f800000, //.word 0x3f800000 1770 }; 1771 1772 CODE const uint32_t sk_repeat_y_vfp4[] = { 1773 0xed2d8b04, //vpush {d8-d9} 1774 0xe8911008, //ldm r1, {r3, ip} 1775 0xf2c02010, //vmov.i32 d18, #0 1776 0xe2811008, //add r1, r1, #8 1777 0xeddf3b10, //vldr d19, [pc, #64] 1778 0xed938a00, //vldr s16, [r3] 1779 0xeec19a88, //vdiv.f32 s19, s3, s16 1780 0xee819a08, //vdiv.f32 s18, s2, s16 1781 0xf3fb0709, //vcvt.s32.f32 d16, d9 1782 0xf3fb0620, //vcvt.f32.s32 d16, d16 1783 0xf3601e89, //vcgt.f32 d17, d16, d9 1784 0xf35311b2, //vbsl d17, d19, d18 1785 0xf3f42c08, //vdup.32 d18, d8[0] 1786 0xf2600da1, //vsub.f32 d16, d16, d17 1787 0xf3c71e1f, //vmov.i8 d17, #255 1788 0xf26218a1, //vadd.i32 d17, d18, d17 1789 0xf2e009c8, //vmul.f32 d16, d16, d8[0] 1790 0xf2610d20, //vsub.f32 d16, d1, d16 1791 0xf2201fa1, //vmin.f32 d1, d16, d17 1792 0xecbd8b04, //vpop {d8-d9} 1793 0xe12fff1c, //bx ip 1794 0xe320f000, //nop {0} 1795 0x3f800000, //.word 0x3f800000 1796 0x3f800000, //.word 0x3f800000 1797 }; 1798 1799 CODE const uint32_t sk_mirror_x_vfp4[] = { 1800 0xed2d8b04, //vpush {d8-d9} 1801 0xe8911008, //ldm r1, {r3, ip} 1802 0xf2c03010, //vmov.i32 d19, #0 1803 0xe2811008, //add r1, r1, #8 1804 0xeddf4b14, //vldr d20, [pc, #80] 1805 0xed938a00, //vldr s16, [r3] 1806 0xee389a08, //vadd.f32 s18, s16, s16 1807 0xf3f40c08, //vdup.32 d16, d8[0] 1808 0xf2200d20, //vsub.f32 d0, d0, d16 1809 0xeec08a89, //vdiv.f32 s17, s1, s18 1810 0xee808a09, //vdiv.f32 s16, s0, s18 1811 0xf3fb1708, //vcvt.s32.f32 d17, d8 1812 0xf3fb1621, //vcvt.f32.s32 d17, d17 1813 0xf3612e88, //vcgt.f32 d18, d17, d8 1814 0xf35421b3, //vbsl d18, d20, d19 1815 0xf2611da2, //vsub.f32 d17, d17, d18 1816 0xf3c72e1f, //vmov.i8 d18, #255 1817 0xf2e119c9, //vmul.f32 d17, d17, d9[0] 1818 0xf2601d21, //vsub.f32 d17, d0, d17 1819 0xf2611da0, //vsub.f32 d17, d17, d16 1820 0xf26008a2, //vadd.i32 d16, d16, d18 1821 0xf3f91721, //vabs.f32 d17, d17 1822 0xf2210fa0, //vmin.f32 d0, d17, d16 1823 0xecbd8b04, //vpop {d8-d9} 1824 0xe12fff1c, //bx ip 1825 0xe320f000, //nop {0} 1826 0x3f800000, //.word 0x3f800000 1827 0x3f800000, //.word 0x3f800000 1828 }; 1829 1830 CODE const uint32_t sk_mirror_y_vfp4[] = { 1831 0xed2d8b04, //vpush {d8-d9} 1832 0xe8911008, //ldm r1, {r3, ip} 1833 0xf2c03010, //vmov.i32 d19, #0 1834 0xe2811008, //add r1, r1, #8 1835 0xeddf4b14, //vldr d20, [pc, #80] 1836 0xed938a00, //vldr s16, [r3] 1837 0xee389a08, //vadd.f32 s18, s16, s16 1838 0xf3f40c08, //vdup.32 d16, d8[0] 1839 0xf2211d20, //vsub.f32 d1, d1, d16 1840 0xeec18a89, //vdiv.f32 s17, s3, s18 1841 0xee818a09, //vdiv.f32 s16, s2, s18 1842 0xf3fb1708, //vcvt.s32.f32 d17, d8 1843 0xf3fb1621, //vcvt.f32.s32 d17, d17 1844 0xf3612e88, //vcgt.f32 d18, d17, d8 1845 0xf35421b3, //vbsl d18, d20, d19 1846 0xf2611da2, //vsub.f32 d17, d17, d18 1847 0xf3c72e1f, //vmov.i8 d18, #255 1848 0xf2e119c9, //vmul.f32 d17, d17, d9[0] 1849 0xf2611d21, //vsub.f32 d17, d1, d17 1850 0xf2611da0, //vsub.f32 d17, d17, d16 1851 0xf26008a2, //vadd.i32 d16, d16, d18 1852 0xf3f91721, //vabs.f32 d17, d17 1853 0xf2211fa0, //vmin.f32 d1, d17, d16 1854 0xecbd8b04, //vpop {d8-d9} 1855 0xe12fff1c, //bx ip 1856 0xe320f000, //nop {0} 1857 0x3f800000, //.word 0x3f800000 1858 0x3f800000, //.word 0x3f800000 1859 }; 1860 1861 CODE const uint32_t sk_luminance_to_alpha_vfp4[] = { 1862 0xeddf0b0a, //vldr d16, [pc, #40] 1863 0xeddf1b0b, //vldr d17, [pc, #44] 1864 0xf3410d30, //vmul.f32 d16, d1, d16 1865 0xe4913004, //ldr r3, [r1], #4 1866 0xf3401d31, //vmul.f32 d17, d0, d17 1867 0xf2800010, //vmov.i32 d0, #0 1868 0xf2801010, //vmov.i32 d1, #0 1869 0xf2013da0, //vadd.f32 d3, d17, d16 1870 0xeddf0b06, //vldr d16, [pc, #24] 1871 0xf2023c30, //vfma.f32 d3, d2, d16 1872 0xf2802010, //vmov.i32 d2, #0 1873 0xe12fff13, //bx r3 1874 0x3f371759, //.word 0x3f371759 1875 0x3f371759, //.word 0x3f371759 1876 0x3e59b3d0, //.word 0x3e59b3d0 1877 0x3e59b3d0, //.word 0x3e59b3d0 1878 0x3d93dd98, //.word 0x3d93dd98 1879 0x3d93dd98, //.word 0x3d93dd98 1880 }; 1881 1882 CODE const uint32_t sk_matrix_2x3_vfp4[] = { 1883 0xe92d4800, //push {fp, lr} 1884 0xe591e000, //ldr lr, [r1] 1885 0xe591c004, //ldr ip, [r1, #4] 1886 0xe2811008, //add r1, r1, #8 1887 0xe28e300c, //add r3, lr, #12 1888 0xf4e32c9f, //vld1.32 {d18[]}, [r3 :32] 1889 0xe28e3008, //add r3, lr, #8 1890 0xf4e31c9f, //vld1.32 {d17[]}, [r3 :32] 1891 0xe28e3010, //add r3, lr, #16 1892 0xf4e30c9f, //vld1.32 {d16[]}, [r3 :32] 1893 0xe28e3014, //add r3, lr, #20 1894 0xf2410c31, //vfma.f32 d16, d1, d17 1895 0xf4e31c9f, //vld1.32 {d17[]}, [r3 :32] 1896 0xf2411c32, //vfma.f32 d17, d1, d18 1897 0xf4ee2c9d, //vld1.32 {d18[]}, [lr :32]! 1898 0xf4ee3c9f, //vld1.32 {d19[]}, [lr :32] 1899 0xf2400c32, //vfma.f32 d16, d0, d18 1900 0xf2401c33, //vfma.f32 d17, d0, d19 1901 0xf22001b0, //vorr d0, d16, d16 1902 0xf22111b1, //vorr d1, d17, d17 1903 0xe8bd4800, //pop {fp, lr} 1904 0xe12fff1c, //bx ip 1905 }; 1906 1907 CODE const uint32_t sk_matrix_3x4_vfp4[] = { 1908 0xe92d4800, //push {fp, lr} 1909 0xe591e000, //ldr lr, [r1] 1910 0xe591c004, //ldr ip, [r1, #4] 1911 0xe2811008, //add r1, r1, #8 1912 0xe28e3020, //add r3, lr, #32 1913 0xf4e33c9f, //vld1.32 {d19[]}, [r3 :32] 1914 0xe28e302c, //add r3, lr, #44 1915 0xf4e30c9f, //vld1.32 {d16[]}, [r3 :32] 1916 0xe28e301c, //add r3, lr, #28 1917 0xf2420c33, //vfma.f32 d16, d2, d19 1918 0xf4e34c9f, //vld1.32 {d20[]}, [r3 :32] 1919 0xe28e3018, //add r3, lr, #24 1920 0xf4e32c9f, //vld1.32 {d18[]}, [r3 :32] 1921 0xe28e3024, //add r3, lr, #36 1922 0xf4e31c9f, //vld1.32 {d17[]}, [r3 :32] 1923 0xe28e3028, //add r3, lr, #40 1924 0xf2421c32, //vfma.f32 d17, d2, d18 1925 0xf4e32c9f, //vld1.32 {d18[]}, [r3 :32] 1926 0xe28e3010, //add r3, lr, #16 1927 0xf2422c34, //vfma.f32 d18, d2, d20 1928 0xf4e33c9f, //vld1.32 {d19[]}, [r3 :32] 1929 0xe28e300c, //add r3, lr, #12 1930 0xf4e34c9f, //vld1.32 {d20[]}, [r3 :32] 1931 0xe28e3014, //add r3, lr, #20 1932 0xf2411c34, //vfma.f32 d17, d1, d20 1933 0xf4e34c9f, //vld1.32 {d20[]}, [r3 :32] 1934 0xf2410c34, //vfma.f32 d16, d1, d20 1935 0xe28e3008, //add r3, lr, #8 1936 0xf2412c33, //vfma.f32 d18, d1, d19 1937 0xf4ee3c9d, //vld1.32 {d19[]}, [lr :32]! 1938 0xf4ee4c9f, //vld1.32 {d20[]}, [lr :32] 1939 0xf2401c33, //vfma.f32 d17, d0, d19 1940 0xf4e33c9f, //vld1.32 {d19[]}, [r3 :32] 1941 0xf2400c33, //vfma.f32 d16, d0, d19 1942 0xf2402c34, //vfma.f32 d18, d0, d20 1943 0xf22101b1, //vorr d0, d17, d17 1944 0xf22021b0, //vorr d2, d16, d16 1945 0xf22211b2, //vorr d1, d18, d18 1946 0xe8bd4800, //pop {fp, lr} 1947 0xe12fff1c, //bx ip 1948 }; 1949 1950 CODE const uint32_t sk_matrix_4x5_vfp4[] = { 1951 0xe92d4010, //push {r4, lr} 1952 0xe8911008, //ldm r1, {r3, ip} 1953 0xf2620112, //vorr d16, d2, d2 1954 0xe2811008, //add r1, r1, #8 1955 0xe2834014, //add r4, r3, #20 1956 0xe1a0e003, //mov lr, r3 1957 0xf4e45c9f, //vld1.32 {d21[]}, [r4 :32] 1958 0xe2834028, //add r4, r3, #40 1959 0xf4e46c9f, //vld1.32 {d22[]}, [r4 :32] 1960 0xe2834038, //add r4, r3, #56 1961 0xf4e47c9f, //vld1.32 {d23[]}, [r4 :32] 1962 0xe2834048, //add r4, r3, #72 1963 0xf4a42c9f, //vld1.32 {d2[]}, [r4 :32] 1964 0xe2834034, //add r4, r3, #52 1965 0xf2032c37, //vfma.f32 d2, d3, d23 1966 0xf4e48c9f, //vld1.32 {d24[]}, [r4 :32] 1967 0xe2834044, //add r4, r3, #68 1968 0xf4e41c9f, //vld1.32 {d17[]}, [r4 :32] 1969 0xe2834030, //add r4, r3, #48 1970 0xf2431c38, //vfma.f32 d17, d3, d24 1971 0xf4e49c9f, //vld1.32 {d25[]}, [r4 :32] 1972 0xe283403c, //add r4, r3, #60 1973 0xf4e43c9f, //vld1.32 {d19[]}, [r4 :32] 1974 0xe283404c, //add r4, r3, #76 1975 0xf2002cb6, //vfma.f32 d2, d16, d22 1976 0xf4e42c9f, //vld1.32 {d18[]}, [r4 :32] 1977 0xe2834040, //add r4, r3, #64 1978 0xf2432c33, //vfma.f32 d18, d3, d19 1979 0xf4e43c9f, //vld1.32 {d19[]}, [r4 :32] 1980 0xe2834020, //add r4, r3, #32 1981 0xf2433c39, //vfma.f32 d19, d3, d25 1982 0xf4e47c9f, //vld1.32 {d23[]}, [r4 :32] 1983 0xe283402c, //add r4, r3, #44 1984 0xf4e48c9f, //vld1.32 {d24[]}, [r4 :32] 1985 0xe2834024, //add r4, r3, #36 1986 0xf2402cb8, //vfma.f32 d18, d16, d24 1987 0xf4e48c9f, //vld1.32 {d24[]}, [r4 :32] 1988 0xf2401cb8, //vfma.f32 d17, d16, d24 1989 0xe2834010, //add r4, r3, #16 1990 0xf2403cb7, //vfma.f32 d19, d16, d23 1991 0xf4ee4c9d, //vld1.32 {d20[]}, [lr :32]! 1992 0xf4e40c9f, //vld1.32 {d16[]}, [r4 :32] 1993 0xe283401c, //add r4, r3, #28 1994 0xf4e46c9f, //vld1.32 {d22[]}, [r4 :32] 1995 0xe2834018, //add r4, r3, #24 1996 0xf2412c36, //vfma.f32 d18, d1, d22 1997 0xf2411c35, //vfma.f32 d17, d1, d21 1998 0xf4ee5c9f, //vld1.32 {d21[]}, [lr :32] 1999 0xf2413c30, //vfma.f32 d19, d1, d16 2000 0xf4e40c9f, //vld1.32 {d16[]}, [r4 :32] 2001 0xe2834008, //add r4, r3, #8 2002 0xe283300c, //add r3, r3, #12 2003 0xf2012c30, //vfma.f32 d2, d1, d16 2004 0xf4e40c9f, //vld1.32 {d16[]}, [r4 :32] 2005 0xf2401c35, //vfma.f32 d17, d0, d21 2006 0xf2403c34, //vfma.f32 d19, d0, d20 2007 0xf4e34c9f, //vld1.32 {d20[]}, [r3 :32] 2008 0xf2402c34, //vfma.f32 d18, d0, d20 2009 0xf2002c30, //vfma.f32 d2, d0, d16 2010 0xf22111b1, //vorr d1, d17, d17 2011 0xf22301b3, //vorr d0, d19, d19 2012 0xf22231b2, //vorr d3, d18, d18 2013 0xe8bd4010, //pop {r4, lr} 2014 0xe12fff1c, //bx ip 2015 }; 2016 2017 CODE const uint32_t sk_matrix_perspective_vfp4[] = { 2018 0xe92d4010, //push {r4, lr} 2019 0xe591e000, //ldr lr, [r1] 2020 0xe591c004, //ldr ip, [r1, #4] 2021 0xe2811008, //add r1, r1, #8 2022 0xe28e301c, //add r3, lr, #28 2023 0xe28e4010, //add r4, lr, #16 2024 0xf4e30c9f, //vld1.32 {d16[]}, [r3 :32] 2025 0xe28e3020, //add r3, lr, #32 2026 0xf4e31c9f, //vld1.32 {d17[]}, [r3 :32] 2027 0xe28e3018, //add r3, lr, #24 2028 0xf2411c30, //vfma.f32 d17, d1, d16 2029 0xf4e30c9f, //vld1.32 {d16[]}, [r3 :32] 2030 0xe1a0300e, //mov r3, lr 2031 0xf4e42c9f, //vld1.32 {d18[]}, [r4 :32] 2032 0xe28e4008, //add r4, lr, #8 2033 0xf4e43c9f, //vld1.32 {d19[]}, [r4 :32] 2034 0xf2401c30, //vfma.f32 d17, d0, d16 2035 0xf4e30c9d, //vld1.32 {d16[]}, [r3 :32]! 2036 0xf4e35c9f, //vld1.32 {d21[]}, [r3 :32] 2037 0xe28e3014, //add r3, lr, #20 2038 0xf2413c35, //vfma.f32 d19, d1, d21 2039 0xf4e35c9f, //vld1.32 {d21[]}, [r3 :32] 2040 0xe28e300c, //add r3, lr, #12 2041 0xf2415c32, //vfma.f32 d21, d1, d18 2042 0xf4e32c9f, //vld1.32 {d18[]}, [r3 :32] 2043 0xf3fb4521, //vrecpe.f32 d20, d17 2044 0xf2403c30, //vfma.f32 d19, d0, d16 2045 0xf2411fb4, //vrecps.f32 d17, d17, d20 2046 0xf2405c32, //vfma.f32 d21, d0, d18 2047 0xf3440db1, //vmul.f32 d16, d20, d17 2048 0xf3030db0, //vmul.f32 d0, d19, d16 2049 0xf3051db0, //vmul.f32 d1, d21, d16 2050 0xe8bd4010, //pop {r4, lr} 2051 0xe12fff1c, //bx ip 2052 }; 2053 2054 CODE const uint32_t sk_linear_gradient_2stops_vfp4[] = { 2055 0xe8911008, //ldm r1, {r3, ip} 2056 0xe2811008, //add r1, r1, #8 2057 0xf4632a0d, //vld1.8 {d18-d19}, [r3]! 2058 0xf4634a0f, //vld1.8 {d20-d21}, [r3] 2059 0xf3f40c22, //vdup.32 d16, d18[0] 2060 0xf3f41c24, //vdup.32 d17, d20[0] 2061 0xf2400c31, //vfma.f32 d16, d0, d17 2062 0xf3fc6c24, //vdup.32 d22, d20[1] 2063 0xf3bc1c22, //vdup.32 d1, d18[1] 2064 0xf3b42c23, //vdup.32 d2, d19[0] 2065 0xf2001c36, //vfma.f32 d1, d0, d22 2066 0xf3f41c25, //vdup.32 d17, d21[0] 2067 0xf3fc4c25, //vdup.32 d20, d21[1] 2068 0xf2002c31, //vfma.f32 d2, d0, d17 2069 0xf3bc3c23, //vdup.32 d3, d19[1] 2070 0xf2003c34, //vfma.f32 d3, d0, d20 2071 0xf22001b0, //vorr d0, d16, d16 2072 0xe12fff1c, //bx ip 2073 }; 2074 #elif defined(__x86_64__) 2075 2076 CODE const uint8_t sk_start_pipeline_hsw[] = { 2077 65,87, //push %r15 2078 65,86, //push %r14 2079 65,85, //push %r13 2080 65,84, //push %r12 2081 83, //push %rbx 2082 73,137,205, //mov %rcx,%r13 2083 73,137,214, //mov %rdx,%r14 2084 72,137,251, //mov %rdi,%rbx 2085 72,173, //lods %ds:(%rsi),%rax 2086 73,137,199, //mov %rax,%r15 2087 73,137,244, //mov %rsi,%r12 2088 72,141,67,8, //lea 0x8(%rbx),%rax 2089 76,57,232, //cmp %r13,%rax 2090 118,5, //jbe 28 <_sk_start_pipeline_hsw+0x28> 2091 72,137,223, //mov %rbx,%rdi 2092 235,65, //jmp 69 <_sk_start_pipeline_hsw+0x69> 2093 185,0,0,0,0, //mov $0x0,%ecx 2094 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0 2095 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1 2096 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2 2097 197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3 2098 197,220,87,228, //vxorps %ymm4,%ymm4,%ymm4 2099 197,212,87,237, //vxorps %ymm5,%ymm5,%ymm5 2100 197,204,87,246, //vxorps %ymm6,%ymm6,%ymm6 2101 197,196,87,255, //vxorps %ymm7,%ymm7,%ymm7 2102 72,137,223, //mov %rbx,%rdi 2103 76,137,230, //mov %r12,%rsi 2104 76,137,242, //mov %r14,%rdx 2105 65,255,215, //callq *%r15 2106 72,141,123,8, //lea 0x8(%rbx),%rdi 2107 72,131,195,16, //add $0x10,%rbx 2108 76,57,235, //cmp %r13,%rbx 2109 72,137,251, //mov %rdi,%rbx 2110 118,191, //jbe 28 <_sk_start_pipeline_hsw+0x28> 2111 76,137,233, //mov %r13,%rcx 2112 72,41,249, //sub %rdi,%rcx 2113 116,41, //je 9a <_sk_start_pipeline_hsw+0x9a> 2114 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0 2115 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1 2116 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2 2117 197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3 2118 197,220,87,228, //vxorps %ymm4,%ymm4,%ymm4 2119 197,212,87,237, //vxorps %ymm5,%ymm5,%ymm5 2120 197,204,87,246, //vxorps %ymm6,%ymm6,%ymm6 2121 197,196,87,255, //vxorps %ymm7,%ymm7,%ymm7 2122 76,137,230, //mov %r12,%rsi 2123 76,137,242, //mov %r14,%rdx 2124 65,255,215, //callq *%r15 2125 76,137,232, //mov %r13,%rax 2126 91, //pop %rbx 2127 65,92, //pop %r12 2128 65,93, //pop %r13 2129 65,94, //pop %r14 2130 65,95, //pop %r15 2131 197,248,119, //vzeroupper 2132 195, //retq 2133 }; 2134 2135 CODE const uint8_t sk_just_return_hsw[] = { 2136 195, //retq 2137 }; 2138 2139 CODE const uint8_t sk_seed_shader_hsw[] = { 2140 72,173, //lods %ds:(%rsi),%rax 2141 197,249,110,199, //vmovd %edi,%xmm0 2142 196,226,125,88,192, //vpbroadcastd %xmm0,%ymm0 2143 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0 2144 65,184,0,0,0,63, //mov $0x3f000000,%r8d 2145 196,193,121,110,200, //vmovd %r8d,%xmm1 2146 196,226,125,88,201, //vpbroadcastd %xmm1,%ymm1 2147 197,252,88,193, //vaddps %ymm1,%ymm0,%ymm0 2148 197,252,88,2, //vaddps (%rdx),%ymm0,%ymm0 2149 196,226,125,24,16, //vbroadcastss (%rax),%ymm2 2150 197,252,91,210, //vcvtdq2ps %ymm2,%ymm2 2151 197,236,88,201, //vaddps %ymm1,%ymm2,%ymm1 2152 184,0,0,128,63, //mov $0x3f800000,%eax 2153 197,249,110,208, //vmovd %eax,%xmm2 2154 196,226,125,88,210, //vpbroadcastd %xmm2,%ymm2 2155 72,173, //lods %ds:(%rsi),%rax 2156 197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3 2157 197,220,87,228, //vxorps %ymm4,%ymm4,%ymm4 2158 197,212,87,237, //vxorps %ymm5,%ymm5,%ymm5 2159 197,204,87,246, //vxorps %ymm6,%ymm6,%ymm6 2160 197,196,87,255, //vxorps %ymm7,%ymm7,%ymm7 2161 255,224, //jmpq *%rax 2162 }; 2163 2164 CODE const uint8_t sk_constant_color_hsw[] = { 2165 72,173, //lods %ds:(%rsi),%rax 2166 196,226,125,24,0, //vbroadcastss (%rax),%ymm0 2167 196,226,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm1 2168 196,226,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm2 2169 196,226,125,24,88,12, //vbroadcastss 0xc(%rax),%ymm3 2170 72,173, //lods %ds:(%rsi),%rax 2171 255,224, //jmpq *%rax 2172 }; 2173 2174 CODE const uint8_t sk_clear_hsw[] = { 2175 72,173, //lods %ds:(%rsi),%rax 2176 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0 2177 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1 2178 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2 2179 197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3 2180 255,224, //jmpq *%rax 2181 }; 2182 2183 CODE const uint8_t sk_plus__hsw[] = { 2184 197,252,88,196, //vaddps %ymm4,%ymm0,%ymm0 2185 197,244,88,205, //vaddps %ymm5,%ymm1,%ymm1 2186 197,236,88,214, //vaddps %ymm6,%ymm2,%ymm2 2187 197,228,88,223, //vaddps %ymm7,%ymm3,%ymm3 2188 72,173, //lods %ds:(%rsi),%rax 2189 255,224, //jmpq *%rax 2190 }; 2191 2192 CODE const uint8_t sk_srcover_hsw[] = { 2193 184,0,0,128,63, //mov $0x3f800000,%eax 2194 197,121,110,192, //vmovd %eax,%xmm8 2195 196,66,125,88,192, //vpbroadcastd %xmm8,%ymm8 2196 197,60,92,195, //vsubps %ymm3,%ymm8,%ymm8 2197 196,194,93,184,192, //vfmadd231ps %ymm8,%ymm4,%ymm0 2198 196,194,85,184,200, //vfmadd231ps %ymm8,%ymm5,%ymm1 2199 196,194,77,184,208, //vfmadd231ps %ymm8,%ymm6,%ymm2 2200 196,194,69,184,216, //vfmadd231ps %ymm8,%ymm7,%ymm3 2201 72,173, //lods %ds:(%rsi),%rax 2202 255,224, //jmpq *%rax 2203 }; 2204 2205 CODE const uint8_t sk_dstover_hsw[] = { 2206 184,0,0,128,63, //mov $0x3f800000,%eax 2207 197,121,110,192, //vmovd %eax,%xmm8 2208 196,66,125,88,192, //vpbroadcastd %xmm8,%ymm8 2209 197,60,92,199, //vsubps %ymm7,%ymm8,%ymm8 2210 196,226,61,168,196, //vfmadd213ps %ymm4,%ymm8,%ymm0 2211 196,226,61,168,205, //vfmadd213ps %ymm5,%ymm8,%ymm1 2212 196,226,61,168,214, //vfmadd213ps %ymm6,%ymm8,%ymm2 2213 196,226,61,168,223, //vfmadd213ps %ymm7,%ymm8,%ymm3 2214 72,173, //lods %ds:(%rsi),%rax 2215 255,224, //jmpq *%rax 2216 }; 2217 2218 CODE const uint8_t sk_clamp_0_hsw[] = { 2219 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8 2220 196,193,124,95,192, //vmaxps %ymm8,%ymm0,%ymm0 2221 196,193,116,95,200, //vmaxps %ymm8,%ymm1,%ymm1 2222 196,193,108,95,208, //vmaxps %ymm8,%ymm2,%ymm2 2223 196,193,100,95,216, //vmaxps %ymm8,%ymm3,%ymm3 2224 72,173, //lods %ds:(%rsi),%rax 2225 255,224, //jmpq *%rax 2226 }; 2227 2228 CODE const uint8_t sk_clamp_1_hsw[] = { 2229 184,0,0,128,63, //mov $0x3f800000,%eax 2230 197,121,110,192, //vmovd %eax,%xmm8 2231 196,66,125,88,192, //vpbroadcastd %xmm8,%ymm8 2232 196,193,124,93,192, //vminps %ymm8,%ymm0,%ymm0 2233 196,193,116,93,200, //vminps %ymm8,%ymm1,%ymm1 2234 196,193,108,93,208, //vminps %ymm8,%ymm2,%ymm2 2235 196,193,100,93,216, //vminps %ymm8,%ymm3,%ymm3 2236 72,173, //lods %ds:(%rsi),%rax 2237 255,224, //jmpq *%rax 2238 }; 2239 2240 CODE const uint8_t sk_clamp_a_hsw[] = { 2241 184,0,0,128,63, //mov $0x3f800000,%eax 2242 197,121,110,192, //vmovd %eax,%xmm8 2243 196,66,125,88,192, //vpbroadcastd %xmm8,%ymm8 2244 196,193,100,93,216, //vminps %ymm8,%ymm3,%ymm3 2245 197,252,93,195, //vminps %ymm3,%ymm0,%ymm0 2246 197,244,93,203, //vminps %ymm3,%ymm1,%ymm1 2247 197,236,93,211, //vminps %ymm3,%ymm2,%ymm2 2248 72,173, //lods %ds:(%rsi),%rax 2249 255,224, //jmpq *%rax 2250 }; 2251 2252 CODE const uint8_t sk_set_rgb_hsw[] = { 2253 72,173, //lods %ds:(%rsi),%rax 2254 196,226,125,24,0, //vbroadcastss (%rax),%ymm0 2255 196,226,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm1 2256 196,226,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm2 2257 72,173, //lods %ds:(%rsi),%rax 2258 255,224, //jmpq *%rax 2259 }; 2260 2261 CODE const uint8_t sk_swap_rb_hsw[] = { 2262 197,124,40,192, //vmovaps %ymm0,%ymm8 2263 72,173, //lods %ds:(%rsi),%rax 2264 197,252,40,194, //vmovaps %ymm2,%ymm0 2265 197,124,41,194, //vmovaps %ymm8,%ymm2 2266 255,224, //jmpq *%rax 2267 }; 2268 2269 CODE const uint8_t sk_swap_hsw[] = { 2270 197,124,40,195, //vmovaps %ymm3,%ymm8 2271 197,124,40,202, //vmovaps %ymm2,%ymm9 2272 197,124,40,209, //vmovaps %ymm1,%ymm10 2273 197,124,40,216, //vmovaps %ymm0,%ymm11 2274 72,173, //lods %ds:(%rsi),%rax 2275 197,252,40,196, //vmovaps %ymm4,%ymm0 2276 197,252,40,205, //vmovaps %ymm5,%ymm1 2277 197,252,40,214, //vmovaps %ymm6,%ymm2 2278 197,252,40,223, //vmovaps %ymm7,%ymm3 2279 197,124,41,220, //vmovaps %ymm11,%ymm4 2280 197,124,41,213, //vmovaps %ymm10,%ymm5 2281 197,124,41,206, //vmovaps %ymm9,%ymm6 2282 197,124,41,199, //vmovaps %ymm8,%ymm7 2283 255,224, //jmpq *%rax 2284 }; 2285 2286 CODE const uint8_t sk_move_src_dst_hsw[] = { 2287 72,173, //lods %ds:(%rsi),%rax 2288 197,252,40,224, //vmovaps %ymm0,%ymm4 2289 197,252,40,233, //vmovaps %ymm1,%ymm5 2290 197,252,40,242, //vmovaps %ymm2,%ymm6 2291 197,252,40,251, //vmovaps %ymm3,%ymm7 2292 255,224, //jmpq *%rax 2293 }; 2294 2295 CODE const uint8_t sk_move_dst_src_hsw[] = { 2296 72,173, //lods %ds:(%rsi),%rax 2297 197,252,40,196, //vmovaps %ymm4,%ymm0 2298 197,252,40,205, //vmovaps %ymm5,%ymm1 2299 197,252,40,214, //vmovaps %ymm6,%ymm2 2300 197,252,40,223, //vmovaps %ymm7,%ymm3 2301 255,224, //jmpq *%rax 2302 }; 2303 2304 CODE const uint8_t sk_premul_hsw[] = { 2305 197,252,89,195, //vmulps %ymm3,%ymm0,%ymm0 2306 197,244,89,203, //vmulps %ymm3,%ymm1,%ymm1 2307 197,236,89,211, //vmulps %ymm3,%ymm2,%ymm2 2308 72,173, //lods %ds:(%rsi),%rax 2309 255,224, //jmpq *%rax 2310 }; 2311 2312 CODE const uint8_t sk_unpremul_hsw[] = { 2313 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8 2314 196,65,100,194,200,0, //vcmpeqps %ymm8,%ymm3,%ymm9 2315 184,0,0,128,63, //mov $0x3f800000,%eax 2316 197,121,110,208, //vmovd %eax,%xmm10 2317 196,66,125,88,210, //vpbroadcastd %xmm10,%ymm10 2318 197,44,94,211, //vdivps %ymm3,%ymm10,%ymm10 2319 196,67,45,74,192,144, //vblendvps %ymm9,%ymm8,%ymm10,%ymm8 2320 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0 2321 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1 2322 197,188,89,210, //vmulps %ymm2,%ymm8,%ymm2 2323 72,173, //lods %ds:(%rsi),%rax 2324 255,224, //jmpq *%rax 2325 }; 2326 2327 CODE const uint8_t sk_from_srgb_hsw[] = { 2328 184,145,131,158,61, //mov $0x3d9e8391,%eax 2329 197,121,110,192, //vmovd %eax,%xmm8 2330 196,66,125,88,192, //vpbroadcastd %xmm8,%ymm8 2331 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9 2332 197,124,89,208, //vmulps %ymm0,%ymm0,%ymm10 2333 184,154,153,153,62, //mov $0x3e99999a,%eax 2334 197,121,110,216, //vmovd %eax,%xmm11 2335 196,66,125,88,219, //vpbroadcastd %xmm11,%ymm11 2336 184,92,143,50,63, //mov $0x3f328f5c,%eax 2337 197,121,110,224, //vmovd %eax,%xmm12 2338 196,66,125,88,228, //vpbroadcastd %xmm12,%ymm12 2339 196,65,125,111,235, //vmovdqa %ymm11,%ymm13 2340 196,66,125,168,236, //vfmadd213ps %ymm12,%ymm0,%ymm13 2341 184,10,215,35,59, //mov $0x3b23d70a,%eax 2342 197,121,110,240, //vmovd %eax,%xmm14 2343 196,66,125,88,246, //vpbroadcastd %xmm14,%ymm14 2344 196,66,45,168,238, //vfmadd213ps %ymm14,%ymm10,%ymm13 2345 184,174,71,97,61, //mov $0x3d6147ae,%eax 2346 197,121,110,208, //vmovd %eax,%xmm10 2347 196,66,125,88,210, //vpbroadcastd %xmm10,%ymm10 2348 196,193,124,194,194,1, //vcmpltps %ymm10,%ymm0,%ymm0 2349 196,195,21,74,193,0, //vblendvps %ymm0,%ymm9,%ymm13,%ymm0 2350 197,60,89,201, //vmulps %ymm1,%ymm8,%ymm9 2351 197,116,89,233, //vmulps %ymm1,%ymm1,%ymm13 2352 196,65,125,111,251, //vmovdqa %ymm11,%ymm15 2353 196,66,117,168,252, //vfmadd213ps %ymm12,%ymm1,%ymm15 2354 196,66,21,168,254, //vfmadd213ps %ymm14,%ymm13,%ymm15 2355 196,193,116,194,202,1, //vcmpltps %ymm10,%ymm1,%ymm1 2356 196,195,5,74,201,16, //vblendvps %ymm1,%ymm9,%ymm15,%ymm1 2357 197,60,89,194, //vmulps %ymm2,%ymm8,%ymm8 2358 197,108,89,202, //vmulps %ymm2,%ymm2,%ymm9 2359 196,66,109,168,220, //vfmadd213ps %ymm12,%ymm2,%ymm11 2360 196,66,53,168,222, //vfmadd213ps %ymm14,%ymm9,%ymm11 2361 196,193,108,194,210,1, //vcmpltps %ymm10,%ymm2,%ymm2 2362 196,195,37,74,208,32, //vblendvps %ymm2,%ymm8,%ymm11,%ymm2 2363 72,173, //lods %ds:(%rsi),%rax 2364 255,224, //jmpq *%rax 2365 }; 2366 2367 CODE const uint8_t sk_to_srgb_hsw[] = { 2368 197,124,82,192, //vrsqrtps %ymm0,%ymm8 2369 196,65,124,83,216, //vrcpps %ymm8,%ymm11 2370 196,65,124,82,224, //vrsqrtps %ymm8,%ymm12 2371 184,41,92,71,65, //mov $0x41475c29,%eax 2372 197,121,110,192, //vmovd %eax,%xmm8 2373 196,66,125,88,192, //vpbroadcastd %xmm8,%ymm8 2374 197,60,89,232, //vmulps %ymm0,%ymm8,%ymm13 2375 184,0,0,128,63, //mov $0x3f800000,%eax 2376 197,121,110,200, //vmovd %eax,%xmm9 2377 196,66,125,88,201, //vpbroadcastd %xmm9,%ymm9 2378 184,194,135,210,62, //mov $0x3ed287c2,%eax 2379 197,121,110,208, //vmovd %eax,%xmm10 2380 196,66,125,88,210, //vpbroadcastd %xmm10,%ymm10 2381 184,206,111,48,63, //mov $0x3f306fce,%eax 2382 197,121,110,240, //vmovd %eax,%xmm14 2383 196,66,125,88,246, //vpbroadcastd %xmm14,%ymm14 2384 184,168,87,202,61, //mov $0x3dca57a8,%eax 2385 53,0,0,0,128, //xor $0x80000000,%eax 2386 197,121,110,248, //vmovd %eax,%xmm15 2387 196,66,125,88,255, //vpbroadcastd %xmm15,%ymm15 2388 196,66,13,168,223, //vfmadd213ps %ymm15,%ymm14,%ymm11 2389 196,66,45,184,220, //vfmadd231ps %ymm12,%ymm10,%ymm11 2390 196,65,52,93,219, //vminps %ymm11,%ymm9,%ymm11 2391 184,4,231,140,59, //mov $0x3b8ce704,%eax 2392 197,121,110,224, //vmovd %eax,%xmm12 2393 196,66,125,88,228, //vpbroadcastd %xmm12,%ymm12 2394 196,193,124,194,196,1, //vcmpltps %ymm12,%ymm0,%ymm0 2395 196,195,37,74,197,0, //vblendvps %ymm0,%ymm13,%ymm11,%ymm0 2396 197,124,82,217, //vrsqrtps %ymm1,%ymm11 2397 196,65,124,83,235, //vrcpps %ymm11,%ymm13 2398 196,65,124,82,219, //vrsqrtps %ymm11,%ymm11 2399 196,66,13,168,239, //vfmadd213ps %ymm15,%ymm14,%ymm13 2400 196,66,45,184,235, //vfmadd231ps %ymm11,%ymm10,%ymm13 2401 197,60,89,217, //vmulps %ymm1,%ymm8,%ymm11 2402 196,65,52,93,237, //vminps %ymm13,%ymm9,%ymm13 2403 196,193,116,194,204,1, //vcmpltps %ymm12,%ymm1,%ymm1 2404 196,195,21,74,203,16, //vblendvps %ymm1,%ymm11,%ymm13,%ymm1 2405 197,124,82,218, //vrsqrtps %ymm2,%ymm11 2406 196,65,124,83,235, //vrcpps %ymm11,%ymm13 2407 196,66,13,168,239, //vfmadd213ps %ymm15,%ymm14,%ymm13 2408 196,65,124,82,219, //vrsqrtps %ymm11,%ymm11 2409 196,66,45,184,235, //vfmadd231ps %ymm11,%ymm10,%ymm13 2410 196,65,52,93,205, //vminps %ymm13,%ymm9,%ymm9 2411 197,60,89,194, //vmulps %ymm2,%ymm8,%ymm8 2412 196,193,108,194,212,1, //vcmpltps %ymm12,%ymm2,%ymm2 2413 196,195,53,74,208,32, //vblendvps %ymm2,%ymm8,%ymm9,%ymm2 2414 72,173, //lods %ds:(%rsi),%rax 2415 255,224, //jmpq *%rax 2416 }; 2417 2418 CODE const uint8_t sk_scale_1_float_hsw[] = { 2419 72,173, //lods %ds:(%rsi),%rax 2420 196,98,125,24,0, //vbroadcastss (%rax),%ymm8 2421 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0 2422 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1 2423 197,188,89,210, //vmulps %ymm2,%ymm8,%ymm2 2424 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3 2425 72,173, //lods %ds:(%rsi),%rax 2426 255,224, //jmpq *%rax 2427 }; 2428 2429 CODE const uint8_t sk_scale_u8_hsw[] = { 2430 73,137,200, //mov %rcx,%r8 2431 72,173, //lods %ds:(%rsi),%rax 2432 72,139,0, //mov (%rax),%rax 2433 72,1,248, //add %rdi,%rax 2434 77,133,192, //test %r8,%r8 2435 117,56, //jne 4bf <_sk_scale_u8_hsw+0x48> 2436 197,122,126,0, //vmovq (%rax),%xmm8 2437 196,66,125,49,192, //vpmovzxbd %xmm8,%ymm8 2438 196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8 2439 184,129,128,128,59, //mov $0x3b808081,%eax 2440 197,121,110,200, //vmovd %eax,%xmm9 2441 196,66,125,88,201, //vpbroadcastd %xmm9,%ymm9 2442 196,65,60,89,193, //vmulps %ymm9,%ymm8,%ymm8 2443 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0 2444 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1 2445 197,188,89,210, //vmulps %ymm2,%ymm8,%ymm2 2446 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3 2447 72,173, //lods %ds:(%rsi),%rax 2448 76,137,193, //mov %r8,%rcx 2449 255,224, //jmpq *%rax 2450 49,201, //xor %ecx,%ecx 2451 77,137,194, //mov %r8,%r10 2452 69,49,201, //xor %r9d,%r9d 2453 68,15,182,24, //movzbl (%rax),%r11d 2454 72,255,192, //inc %rax 2455 73,211,227, //shl %cl,%r11 2456 77,9,217, //or %r11,%r9 2457 72,131,193,8, //add $0x8,%rcx 2458 73,255,202, //dec %r10 2459 117,234, //jne 4c7 <_sk_scale_u8_hsw+0x50> 2460 196,65,249,110,193, //vmovq %r9,%xmm8 2461 235,167, //jmp 48b <_sk_scale_u8_hsw+0x14> 2462 }; 2463 2464 CODE const uint8_t sk_lerp_1_float_hsw[] = { 2465 72,173, //lods %ds:(%rsi),%rax 2466 196,98,125,24,0, //vbroadcastss (%rax),%ymm8 2467 197,252,92,196, //vsubps %ymm4,%ymm0,%ymm0 2468 196,226,61,168,196, //vfmadd213ps %ymm4,%ymm8,%ymm0 2469 197,244,92,205, //vsubps %ymm5,%ymm1,%ymm1 2470 196,226,61,168,205, //vfmadd213ps %ymm5,%ymm8,%ymm1 2471 197,236,92,214, //vsubps %ymm6,%ymm2,%ymm2 2472 196,226,61,168,214, //vfmadd213ps %ymm6,%ymm8,%ymm2 2473 197,228,92,223, //vsubps %ymm7,%ymm3,%ymm3 2474 196,226,61,168,223, //vfmadd213ps %ymm7,%ymm8,%ymm3 2475 72,173, //lods %ds:(%rsi),%rax 2476 255,224, //jmpq *%rax 2477 }; 2478 2479 CODE const uint8_t sk_lerp_u8_hsw[] = { 2480 73,137,200, //mov %rcx,%r8 2481 72,173, //lods %ds:(%rsi),%rax 2482 72,139,0, //mov (%rax),%rax 2483 72,1,248, //add %rdi,%rax 2484 77,133,192, //test %r8,%r8 2485 117,76, //jne 56f <_sk_lerp_u8_hsw+0x5c> 2486 197,122,126,0, //vmovq (%rax),%xmm8 2487 196,66,125,49,192, //vpmovzxbd %xmm8,%ymm8 2488 196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8 2489 184,129,128,128,59, //mov $0x3b808081,%eax 2490 197,121,110,200, //vmovd %eax,%xmm9 2491 196,66,125,88,201, //vpbroadcastd %xmm9,%ymm9 2492 196,65,60,89,193, //vmulps %ymm9,%ymm8,%ymm8 2493 197,252,92,196, //vsubps %ymm4,%ymm0,%ymm0 2494 196,226,61,168,196, //vfmadd213ps %ymm4,%ymm8,%ymm0 2495 197,244,92,205, //vsubps %ymm5,%ymm1,%ymm1 2496 196,226,61,168,205, //vfmadd213ps %ymm5,%ymm8,%ymm1 2497 197,236,92,214, //vsubps %ymm6,%ymm2,%ymm2 2498 196,226,61,168,214, //vfmadd213ps %ymm6,%ymm8,%ymm2 2499 197,228,92,223, //vsubps %ymm7,%ymm3,%ymm3 2500 196,226,61,168,223, //vfmadd213ps %ymm7,%ymm8,%ymm3 2501 72,173, //lods %ds:(%rsi),%rax 2502 76,137,193, //mov %r8,%rcx 2503 255,224, //jmpq *%rax 2504 49,201, //xor %ecx,%ecx 2505 77,137,194, //mov %r8,%r10 2506 69,49,201, //xor %r9d,%r9d 2507 68,15,182,24, //movzbl (%rax),%r11d 2508 72,255,192, //inc %rax 2509 73,211,227, //shl %cl,%r11 2510 77,9,217, //or %r11,%r9 2511 72,131,193,8, //add $0x8,%rcx 2512 73,255,202, //dec %r10 2513 117,234, //jne 577 <_sk_lerp_u8_hsw+0x64> 2514 196,65,249,110,193, //vmovq %r9,%xmm8 2515 235,147, //jmp 527 <_sk_lerp_u8_hsw+0x14> 2516 }; 2517 2518 CODE const uint8_t sk_lerp_565_hsw[] = { 2519 72,173, //lods %ds:(%rsi),%rax 2520 76,139,16, //mov (%rax),%r10 2521 72,133,201, //test %rcx,%rcx 2522 15,133,179,0,0,0, //jne 655 <_sk_lerp_565_hsw+0xc1> 2523 196,193,122,111,28,122, //vmovdqu (%r10,%rdi,2),%xmm3 2524 196,98,125,51,195, //vpmovzxwd %xmm3,%ymm8 2525 184,0,248,0,0, //mov $0xf800,%eax 2526 197,249,110,216, //vmovd %eax,%xmm3 2527 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 2528 196,193,101,219,216, //vpand %ymm8,%ymm3,%ymm3 2529 197,124,91,203, //vcvtdq2ps %ymm3,%ymm9 2530 184,8,33,132,55, //mov $0x37842108,%eax 2531 197,249,110,216, //vmovd %eax,%xmm3 2532 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 2533 197,52,89,203, //vmulps %ymm3,%ymm9,%ymm9 2534 184,224,7,0,0, //mov $0x7e0,%eax 2535 197,249,110,216, //vmovd %eax,%xmm3 2536 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 2537 196,193,101,219,216, //vpand %ymm8,%ymm3,%ymm3 2538 197,124,91,211, //vcvtdq2ps %ymm3,%ymm10 2539 184,33,8,2,58, //mov $0x3a020821,%eax 2540 197,249,110,216, //vmovd %eax,%xmm3 2541 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 2542 197,44,89,211, //vmulps %ymm3,%ymm10,%ymm10 2543 184,31,0,0,0, //mov $0x1f,%eax 2544 197,249,110,216, //vmovd %eax,%xmm3 2545 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 2546 196,193,101,219,216, //vpand %ymm8,%ymm3,%ymm3 2547 197,124,91,195, //vcvtdq2ps %ymm3,%ymm8 2548 184,8,33,4,61, //mov $0x3d042108,%eax 2549 197,249,110,216, //vmovd %eax,%xmm3 2550 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 2551 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3 2552 197,252,92,196, //vsubps %ymm4,%ymm0,%ymm0 2553 196,226,53,168,196, //vfmadd213ps %ymm4,%ymm9,%ymm0 2554 197,244,92,205, //vsubps %ymm5,%ymm1,%ymm1 2555 196,226,45,168,205, //vfmadd213ps %ymm5,%ymm10,%ymm1 2556 197,236,92,214, //vsubps %ymm6,%ymm2,%ymm2 2557 196,226,101,168,214, //vfmadd213ps %ymm6,%ymm3,%ymm2 2558 184,0,0,128,63, //mov $0x3f800000,%eax 2559 197,249,110,216, //vmovd %eax,%xmm3 2560 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 2561 72,173, //lods %ds:(%rsi),%rax 2562 255,224, //jmpq *%rax 2563 65,137,200, //mov %ecx,%r8d 2564 65,128,224,7, //and $0x7,%r8b 2565 197,225,239,219, //vpxor %xmm3,%xmm3,%xmm3 2566 65,254,200, //dec %r8b 2567 65,128,248,6, //cmp $0x6,%r8b 2568 15,135,59,255,255,255, //ja 5a8 <_sk_lerp_565_hsw+0x14> 2569 69,15,182,192, //movzbl %r8b,%r8d 2570 76,141,13,76,0,0,0, //lea 0x4c(%rip),%r9 # 6c4 <_sk_lerp_565_hsw+0x130> 2571 75,99,4,129, //movslq (%r9,%r8,4),%rax 2572 76,1,200, //add %r9,%rax 2573 255,224, //jmpq *%rax 2574 197,225,239,219, //vpxor %xmm3,%xmm3,%xmm3 2575 196,193,97,196,92,122,12,6, //vpinsrw $0x6,0xc(%r10,%rdi,2),%xmm3,%xmm3 2576 196,193,97,196,92,122,10,5, //vpinsrw $0x5,0xa(%r10,%rdi,2),%xmm3,%xmm3 2577 196,193,97,196,92,122,8,4, //vpinsrw $0x4,0x8(%r10,%rdi,2),%xmm3,%xmm3 2578 196,193,97,196,92,122,6,3, //vpinsrw $0x3,0x6(%r10,%rdi,2),%xmm3,%xmm3 2579 196,193,97,196,92,122,4,2, //vpinsrw $0x2,0x4(%r10,%rdi,2),%xmm3,%xmm3 2580 196,193,97,196,92,122,2,1, //vpinsrw $0x1,0x2(%r10,%rdi,2),%xmm3,%xmm3 2581 196,193,97,196,28,122,0, //vpinsrw $0x0,(%r10,%rdi,2),%xmm3,%xmm3 2582 233,231,254,255,255, //jmpq 5a8 <_sk_lerp_565_hsw+0x14> 2583 15,31,0, //nopl (%rax) 2584 241, //icebp 2585 255, //(bad) 2586 255, //(bad) 2587 255, //(bad) 2588 233,255,255,255,225, //jmpq ffffffffe20006cc <_sk_linear_gradient_2stops_hsw+0xffffffffe1fff4f0> 2589 255, //(bad) 2590 255, //(bad) 2591 255, //(bad) 2592 217,255, //fcos 2593 255, //(bad) 2594 255,209, //callq *%rcx 2595 255, //(bad) 2596 255, //(bad) 2597 255,201, //dec %ecx 2598 255, //(bad) 2599 255, //(bad) 2600 255, //(bad) 2601 189, //.byte 0xbd 2602 255, //(bad) 2603 255, //(bad) 2604 255, //.byte 0xff 2605 }; 2606 2607 CODE const uint8_t sk_load_tables_hsw[] = { 2608 73,137,200, //mov %rcx,%r8 2609 72,173, //lods %ds:(%rsi),%rax 2610 76,141,12,189,0,0,0,0, //lea 0x0(,%rdi,4),%r9 2611 76,3,8, //add (%rax),%r9 2612 77,133,192, //test %r8,%r8 2613 117,121, //jne 76e <_sk_load_tables_hsw+0x8e> 2614 196,193,126,111,25, //vmovdqu (%r9),%ymm3 2615 185,255,0,0,0, //mov $0xff,%ecx 2616 197,249,110,193, //vmovd %ecx,%xmm0 2617 196,226,125,88,208, //vpbroadcastd %xmm0,%ymm2 2618 197,237,219,203, //vpand %ymm3,%ymm2,%ymm1 2619 196,65,61,118,192, //vpcmpeqd %ymm8,%ymm8,%ymm8 2620 72,139,72,8, //mov 0x8(%rax),%rcx 2621 76,139,72,16, //mov 0x10(%rax),%r9 2622 196,65,53,118,201, //vpcmpeqd %ymm9,%ymm9,%ymm9 2623 196,226,53,146,4,137, //vgatherdps %ymm9,(%rcx,%ymm1,4),%ymm0 2624 197,245,114,211,8, //vpsrld $0x8,%ymm3,%ymm1 2625 197,109,219,201, //vpand %ymm1,%ymm2,%ymm9 2626 196,65,45,118,210, //vpcmpeqd %ymm10,%ymm10,%ymm10 2627 196,130,45,146,12,137, //vgatherdps %ymm10,(%r9,%ymm9,4),%ymm1 2628 72,139,64,24, //mov 0x18(%rax),%rax 2629 197,181,114,211,16, //vpsrld $0x10,%ymm3,%ymm9 2630 196,65,109,219,201, //vpand %ymm9,%ymm2,%ymm9 2631 196,162,61,146,20,136, //vgatherdps %ymm8,(%rax,%ymm9,4),%ymm2 2632 197,229,114,211,24, //vpsrld $0x18,%ymm3,%ymm3 2633 197,124,91,195, //vcvtdq2ps %ymm3,%ymm8 2634 184,129,128,128,59, //mov $0x3b808081,%eax 2635 197,249,110,216, //vmovd %eax,%xmm3 2636 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 2637 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3 2638 72,173, //lods %ds:(%rsi),%rax 2639 76,137,193, //mov %r8,%rcx 2640 255,224, //jmpq *%rax 2641 185,8,0,0,0, //mov $0x8,%ecx 2642 68,41,193, //sub %r8d,%ecx 2643 192,225,3, //shl $0x3,%cl 2644 73,199,194,255,255,255,255, //mov $0xffffffffffffffff,%r10 2645 73,211,234, //shr %cl,%r10 2646 196,193,249,110,194, //vmovq %r10,%xmm0 2647 196,226,125,33,192, //vpmovsxbd %xmm0,%ymm0 2648 196,194,125,140,25, //vpmaskmovd (%r9),%ymm0,%ymm3 2649 233,99,255,255,255, //jmpq 6fa <_sk_load_tables_hsw+0x1a> 2650 }; 2651 2652 CODE const uint8_t sk_load_a8_hsw[] = { 2653 73,137,200, //mov %rcx,%r8 2654 72,173, //lods %ds:(%rsi),%rax 2655 72,139,0, //mov (%rax),%rax 2656 72,1,248, //add %rdi,%rax 2657 77,133,192, //test %r8,%r8 2658 117,50, //jne 7d9 <_sk_load_a8_hsw+0x42> 2659 197,250,126,0, //vmovq (%rax),%xmm0 2660 196,226,125,49,192, //vpmovzxbd %xmm0,%ymm0 2661 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0 2662 184,129,128,128,59, //mov $0x3b808081,%eax 2663 197,249,110,200, //vmovd %eax,%xmm1 2664 196,226,125,88,201, //vpbroadcastd %xmm1,%ymm1 2665 197,252,89,217, //vmulps %ymm1,%ymm0,%ymm3 2666 72,173, //lods %ds:(%rsi),%rax 2667 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0 2668 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1 2669 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2 2670 76,137,193, //mov %r8,%rcx 2671 255,224, //jmpq *%rax 2672 49,201, //xor %ecx,%ecx 2673 77,137,194, //mov %r8,%r10 2674 69,49,201, //xor %r9d,%r9d 2675 68,15,182,24, //movzbl (%rax),%r11d 2676 72,255,192, //inc %rax 2677 73,211,227, //shl %cl,%r11 2678 77,9,217, //or %r11,%r9 2679 72,131,193,8, //add $0x8,%rcx 2680 73,255,202, //dec %r10 2681 117,234, //jne 7e1 <_sk_load_a8_hsw+0x4a> 2682 196,193,249,110,193, //vmovq %r9,%xmm0 2683 235,173, //jmp 7ab <_sk_load_a8_hsw+0x14> 2684 }; 2685 2686 CODE const uint8_t sk_store_a8_hsw[] = { 2687 72,173, //lods %ds:(%rsi),%rax 2688 76,139,8, //mov (%rax),%r9 2689 184,0,0,127,67, //mov $0x437f0000,%eax 2690 197,121,110,192, //vmovd %eax,%xmm8 2691 196,66,125,88,192, //vpbroadcastd %xmm8,%ymm8 2692 197,60,89,195, //vmulps %ymm3,%ymm8,%ymm8 2693 196,65,125,91,192, //vcvtps2dq %ymm8,%ymm8 2694 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9 2695 196,66,57,43,193, //vpackusdw %xmm9,%xmm8,%xmm8 2696 196,65,57,103,192, //vpackuswb %xmm8,%xmm8,%xmm8 2697 72,133,201, //test %rcx,%rcx 2698 117,10, //jne 839 <_sk_store_a8_hsw+0x3b> 2699 196,65,123,17,4,57, //vmovsd %xmm8,(%r9,%rdi,1) 2700 72,173, //lods %ds:(%rsi),%rax 2701 255,224, //jmpq *%rax 2702 65,137,200, //mov %ecx,%r8d 2703 65,128,224,7, //and $0x7,%r8b 2704 65,254,200, //dec %r8b 2705 65,128,248,6, //cmp $0x6,%r8b 2706 119,236, //ja 835 <_sk_store_a8_hsw+0x37> 2707 196,66,121,48,192, //vpmovzxbw %xmm8,%xmm8 2708 65,15,182,192, //movzbl %r8b,%eax 2709 76,141,5,67,0,0,0, //lea 0x43(%rip),%r8 # 89c <_sk_store_a8_hsw+0x9e> 2710 73,99,4,128, //movslq (%r8,%rax,4),%rax 2711 76,1,192, //add %r8,%rax 2712 255,224, //jmpq *%rax 2713 196,67,121,20,68,57,6,12, //vpextrb $0xc,%xmm8,0x6(%r9,%rdi,1) 2714 196,67,121,20,68,57,5,10, //vpextrb $0xa,%xmm8,0x5(%r9,%rdi,1) 2715 196,67,121,20,68,57,4,8, //vpextrb $0x8,%xmm8,0x4(%r9,%rdi,1) 2716 196,67,121,20,68,57,3,6, //vpextrb $0x6,%xmm8,0x3(%r9,%rdi,1) 2717 196,67,121,20,68,57,2,4, //vpextrb $0x4,%xmm8,0x2(%r9,%rdi,1) 2718 196,67,121,20,68,57,1,2, //vpextrb $0x2,%xmm8,0x1(%r9,%rdi,1) 2719 196,67,121,20,4,57,0, //vpextrb $0x0,%xmm8,(%r9,%rdi,1) 2720 235,154, //jmp 835 <_sk_store_a8_hsw+0x37> 2721 144, //nop 2722 246,255, //idiv %bh 2723 255, //(bad) 2724 255, //(bad) 2725 238, //out %al,(%dx) 2726 255, //(bad) 2727 255, //(bad) 2728 255,230, //jmpq *%rsi 2729 255, //(bad) 2730 255, //(bad) 2731 255, //(bad) 2732 222,255, //fdivrp %st,%st(7) 2733 255, //(bad) 2734 255,214, //callq *%rsi 2735 255, //(bad) 2736 255, //(bad) 2737 255,206, //dec %esi 2738 255, //(bad) 2739 255, //(bad) 2740 255,198, //inc %esi 2741 255, //(bad) 2742 255, //(bad) 2743 255, //.byte 0xff 2744 }; 2745 2746 CODE const uint8_t sk_load_565_hsw[] = { 2747 72,173, //lods %ds:(%rsi),%rax 2748 76,139,16, //mov (%rax),%r10 2749 72,133,201, //test %rcx,%rcx 2750 15,133,149,0,0,0, //jne 95b <_sk_load_565_hsw+0xa3> 2751 196,193,122,111,4,122, //vmovdqu (%r10,%rdi,2),%xmm0 2752 196,226,125,51,208, //vpmovzxwd %xmm0,%ymm2 2753 184,0,248,0,0, //mov $0xf800,%eax 2754 197,249,110,192, //vmovd %eax,%xmm0 2755 196,226,125,88,192, //vpbroadcastd %xmm0,%ymm0 2756 197,253,219,194, //vpand %ymm2,%ymm0,%ymm0 2757 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0 2758 184,8,33,132,55, //mov $0x37842108,%eax 2759 197,249,110,200, //vmovd %eax,%xmm1 2760 196,226,125,88,201, //vpbroadcastd %xmm1,%ymm1 2761 197,252,89,193, //vmulps %ymm1,%ymm0,%ymm0 2762 184,224,7,0,0, //mov $0x7e0,%eax 2763 197,249,110,200, //vmovd %eax,%xmm1 2764 196,226,125,88,201, //vpbroadcastd %xmm1,%ymm1 2765 197,245,219,202, //vpand %ymm2,%ymm1,%ymm1 2766 197,252,91,201, //vcvtdq2ps %ymm1,%ymm1 2767 184,33,8,2,58, //mov $0x3a020821,%eax 2768 197,249,110,216, //vmovd %eax,%xmm3 2769 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 2770 197,244,89,203, //vmulps %ymm3,%ymm1,%ymm1 2771 184,31,0,0,0, //mov $0x1f,%eax 2772 197,249,110,216, //vmovd %eax,%xmm3 2773 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 2774 197,229,219,210, //vpand %ymm2,%ymm3,%ymm2 2775 197,252,91,210, //vcvtdq2ps %ymm2,%ymm2 2776 184,8,33,4,61, //mov $0x3d042108,%eax 2777 197,249,110,216, //vmovd %eax,%xmm3 2778 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 2779 197,236,89,211, //vmulps %ymm3,%ymm2,%ymm2 2780 184,0,0,128,63, //mov $0x3f800000,%eax 2781 197,249,110,216, //vmovd %eax,%xmm3 2782 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 2783 72,173, //lods %ds:(%rsi),%rax 2784 255,224, //jmpq *%rax 2785 65,137,200, //mov %ecx,%r8d 2786 65,128,224,7, //and $0x7,%r8b 2787 197,249,239,192, //vpxor %xmm0,%xmm0,%xmm0 2788 65,254,200, //dec %r8b 2789 65,128,248,6, //cmp $0x6,%r8b 2790 15,135,89,255,255,255, //ja 8cc <_sk_load_565_hsw+0x14> 2791 69,15,182,192, //movzbl %r8b,%r8d 2792 76,141,13,74,0,0,0, //lea 0x4a(%rip),%r9 # 9c8 <_sk_load_565_hsw+0x110> 2793 75,99,4,129, //movslq (%r9,%r8,4),%rax 2794 76,1,200, //add %r9,%rax 2795 255,224, //jmpq *%rax 2796 197,249,239,192, //vpxor %xmm0,%xmm0,%xmm0 2797 196,193,121,196,68,122,12,6, //vpinsrw $0x6,0xc(%r10,%rdi,2),%xmm0,%xmm0 2798 196,193,121,196,68,122,10,5, //vpinsrw $0x5,0xa(%r10,%rdi,2),%xmm0,%xmm0 2799 196,193,121,196,68,122,8,4, //vpinsrw $0x4,0x8(%r10,%rdi,2),%xmm0,%xmm0 2800 196,193,121,196,68,122,6,3, //vpinsrw $0x3,0x6(%r10,%rdi,2),%xmm0,%xmm0 2801 196,193,121,196,68,122,4,2, //vpinsrw $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0 2802 196,193,121,196,68,122,2,1, //vpinsrw $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0 2803 196,193,121,196,4,122,0, //vpinsrw $0x0,(%r10,%rdi,2),%xmm0,%xmm0 2804 233,5,255,255,255, //jmpq 8cc <_sk_load_565_hsw+0x14> 2805 144, //nop 2806 243,255, //repz (bad) 2807 255, //(bad) 2808 255, //(bad) 2809 235,255, //jmp 9cd <_sk_load_565_hsw+0x115> 2810 255, //(bad) 2811 255,227, //jmpq *%rbx 2812 255, //(bad) 2813 255, //(bad) 2814 255, //(bad) 2815 219,255, //(bad) 2816 255, //(bad) 2817 255,211, //callq *%rbx 2818 255, //(bad) 2819 255, //(bad) 2820 255,203, //dec %ebx 2821 255, //(bad) 2822 255, //(bad) 2823 255, //(bad) 2824 191, //.byte 0xbf 2825 255, //(bad) 2826 255, //(bad) 2827 255, //.byte 0xff 2828 }; 2829 2830 CODE const uint8_t sk_store_565_hsw[] = { 2831 72,173, //lods %ds:(%rsi),%rax 2832 76,139,8, //mov (%rax),%r9 2833 184,0,0,248,65, //mov $0x41f80000,%eax 2834 197,121,110,192, //vmovd %eax,%xmm8 2835 196,66,125,88,192, //vpbroadcastd %xmm8,%ymm8 2836 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9 2837 196,65,125,91,201, //vcvtps2dq %ymm9,%ymm9 2838 196,193,53,114,241,11, //vpslld $0xb,%ymm9,%ymm9 2839 184,0,0,124,66, //mov $0x427c0000,%eax 2840 197,121,110,208, //vmovd %eax,%xmm10 2841 196,66,125,88,210, //vpbroadcastd %xmm10,%ymm10 2842 197,44,89,209, //vmulps %ymm1,%ymm10,%ymm10 2843 196,65,125,91,210, //vcvtps2dq %ymm10,%ymm10 2844 196,193,45,114,242,5, //vpslld $0x5,%ymm10,%ymm10 2845 196,65,45,235,201, //vpor %ymm9,%ymm10,%ymm9 2846 197,60,89,194, //vmulps %ymm2,%ymm8,%ymm8 2847 196,65,125,91,192, //vcvtps2dq %ymm8,%ymm8 2848 196,65,53,235,192, //vpor %ymm8,%ymm9,%ymm8 2849 196,67,125,57,193,1, //vextracti128 $0x1,%ymm8,%xmm9 2850 196,66,57,43,193, //vpackusdw %xmm9,%xmm8,%xmm8 2851 72,133,201, //test %rcx,%rcx 2852 117,10, //jne a50 <_sk_store_565_hsw+0x6c> 2853 196,65,122,127,4,121, //vmovdqu %xmm8,(%r9,%rdi,2) 2854 72,173, //lods %ds:(%rsi),%rax 2855 255,224, //jmpq *%rax 2856 65,137,200, //mov %ecx,%r8d 2857 65,128,224,7, //and $0x7,%r8b 2858 65,254,200, //dec %r8b 2859 65,128,248,6, //cmp $0x6,%r8b 2860 119,236, //ja a4c <_sk_store_565_hsw+0x68> 2861 65,15,182,192, //movzbl %r8b,%eax 2862 76,141,5,69,0,0,0, //lea 0x45(%rip),%r8 # ab0 <_sk_store_565_hsw+0xcc> 2863 73,99,4,128, //movslq (%r8,%rax,4),%rax 2864 76,1,192, //add %r8,%rax 2865 255,224, //jmpq *%rax 2866 196,67,121,21,68,121,12,6, //vpextrw $0x6,%xmm8,0xc(%r9,%rdi,2) 2867 196,67,121,21,68,121,10,5, //vpextrw $0x5,%xmm8,0xa(%r9,%rdi,2) 2868 196,67,121,21,68,121,8,4, //vpextrw $0x4,%xmm8,0x8(%r9,%rdi,2) 2869 196,67,121,21,68,121,6,3, //vpextrw $0x3,%xmm8,0x6(%r9,%rdi,2) 2870 196,67,121,21,68,121,4,2, //vpextrw $0x2,%xmm8,0x4(%r9,%rdi,2) 2871 196,67,121,21,68,121,2,1, //vpextrw $0x1,%xmm8,0x2(%r9,%rdi,2) 2872 196,67,121,21,4,121,0, //vpextrw $0x0,%xmm8,(%r9,%rdi,2) 2873 235,159, //jmp a4c <_sk_store_565_hsw+0x68> 2874 15,31,0, //nopl (%rax) 2875 244, //hlt 2876 255, //(bad) 2877 255, //(bad) 2878 255, //(bad) 2879 236, //in (%dx),%al 2880 255, //(bad) 2881 255, //(bad) 2882 255,228, //jmpq *%rsp 2883 255, //(bad) 2884 255, //(bad) 2885 255, //(bad) 2886 220,255, //fdivr %st,%st(7) 2887 255, //(bad) 2888 255,212, //callq *%rsp 2889 255, //(bad) 2890 255, //(bad) 2891 255,204, //dec %esp 2892 255, //(bad) 2893 255, //(bad) 2894 255,196, //inc %esp 2895 255, //(bad) 2896 255, //(bad) 2897 255, //.byte 0xff 2898 }; 2899 2900 CODE const uint8_t sk_load_8888_hsw[] = { 2901 73,137,200, //mov %rcx,%r8 2902 72,173, //lods %ds:(%rsi),%rax 2903 76,141,12,189,0,0,0,0, //lea 0x0(,%rdi,4),%r9 2904 76,3,8, //add (%rax),%r9 2905 77,133,192, //test %r8,%r8 2906 117,104, //jne b49 <_sk_load_8888_hsw+0x7d> 2907 196,193,126,111,25, //vmovdqu (%r9),%ymm3 2908 184,255,0,0,0, //mov $0xff,%eax 2909 197,249,110,192, //vmovd %eax,%xmm0 2910 196,226,125,88,208, //vpbroadcastd %xmm0,%ymm2 2911 197,237,219,195, //vpand %ymm3,%ymm2,%ymm0 2912 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0 2913 184,129,128,128,59, //mov $0x3b808081,%eax 2914 197,249,110,200, //vmovd %eax,%xmm1 2915 196,98,125,88,193, //vpbroadcastd %xmm1,%ymm8 2916 196,193,124,89,192, //vmulps %ymm8,%ymm0,%ymm0 2917 197,245,114,211,8, //vpsrld $0x8,%ymm3,%ymm1 2918 197,237,219,201, //vpand %ymm1,%ymm2,%ymm1 2919 197,252,91,201, //vcvtdq2ps %ymm1,%ymm1 2920 196,193,116,89,200, //vmulps %ymm8,%ymm1,%ymm1 2921 197,181,114,211,16, //vpsrld $0x10,%ymm3,%ymm9 2922 196,193,109,219,209, //vpand %ymm9,%ymm2,%ymm2 2923 197,252,91,210, //vcvtdq2ps %ymm2,%ymm2 2924 196,193,108,89,208, //vmulps %ymm8,%ymm2,%ymm2 2925 197,229,114,211,24, //vpsrld $0x18,%ymm3,%ymm3 2926 197,252,91,219, //vcvtdq2ps %ymm3,%ymm3 2927 196,193,100,89,216, //vmulps %ymm8,%ymm3,%ymm3 2928 72,173, //lods %ds:(%rsi),%rax 2929 76,137,193, //mov %r8,%rcx 2930 255,224, //jmpq *%rax 2931 185,8,0,0,0, //mov $0x8,%ecx 2932 68,41,193, //sub %r8d,%ecx 2933 192,225,3, //shl $0x3,%cl 2934 72,199,192,255,255,255,255, //mov $0xffffffffffffffff,%rax 2935 72,211,232, //shr %cl,%rax 2936 196,225,249,110,192, //vmovq %rax,%xmm0 2937 196,226,125,33,192, //vpmovsxbd %xmm0,%ymm0 2938 196,194,125,140,25, //vpmaskmovd (%r9),%ymm0,%ymm3 2939 233,116,255,255,255, //jmpq ae6 <_sk_load_8888_hsw+0x1a> 2940 }; 2941 2942 CODE const uint8_t sk_store_8888_hsw[] = { 2943 73,137,200, //mov %rcx,%r8 2944 72,173, //lods %ds:(%rsi),%rax 2945 76,141,12,189,0,0,0,0, //lea 0x0(,%rdi,4),%r9 2946 76,3,8, //add (%rax),%r9 2947 184,0,0,127,67, //mov $0x437f0000,%eax 2948 197,121,110,192, //vmovd %eax,%xmm8 2949 196,66,125,88,192, //vpbroadcastd %xmm8,%ymm8 2950 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9 2951 196,65,125,91,201, //vcvtps2dq %ymm9,%ymm9 2952 197,60,89,209, //vmulps %ymm1,%ymm8,%ymm10 2953 196,65,125,91,210, //vcvtps2dq %ymm10,%ymm10 2954 196,193,45,114,242,8, //vpslld $0x8,%ymm10,%ymm10 2955 196,65,45,235,201, //vpor %ymm9,%ymm10,%ymm9 2956 197,60,89,210, //vmulps %ymm2,%ymm8,%ymm10 2957 196,65,125,91,210, //vcvtps2dq %ymm10,%ymm10 2958 196,193,45,114,242,16, //vpslld $0x10,%ymm10,%ymm10 2959 197,60,89,195, //vmulps %ymm3,%ymm8,%ymm8 2960 196,65,125,91,192, //vcvtps2dq %ymm8,%ymm8 2961 196,193,61,114,240,24, //vpslld $0x18,%ymm8,%ymm8 2962 196,65,45,235,192, //vpor %ymm8,%ymm10,%ymm8 2963 196,65,53,235,192, //vpor %ymm8,%ymm9,%ymm8 2964 77,133,192, //test %r8,%r8 2965 117,12, //jne be6 <_sk_store_8888_hsw+0x74> 2966 196,65,126,127,1, //vmovdqu %ymm8,(%r9) 2967 72,173, //lods %ds:(%rsi),%rax 2968 76,137,193, //mov %r8,%rcx 2969 255,224, //jmpq *%rax 2970 185,8,0,0,0, //mov $0x8,%ecx 2971 68,41,193, //sub %r8d,%ecx 2972 192,225,3, //shl $0x3,%cl 2973 72,199,192,255,255,255,255, //mov $0xffffffffffffffff,%rax 2974 72,211,232, //shr %cl,%rax 2975 196,97,249,110,200, //vmovq %rax,%xmm9 2976 196,66,125,33,201, //vpmovsxbd %xmm9,%ymm9 2977 196,66,53,142,1, //vpmaskmovd %ymm8,%ymm9,(%r9) 2978 235,211, //jmp bdf <_sk_store_8888_hsw+0x6d> 2979 }; 2980 2981 CODE const uint8_t sk_load_f16_hsw[] = { 2982 72,173, //lods %ds:(%rsi),%rax 2983 72,139,0, //mov (%rax),%rax 2984 72,133,201, //test %rcx,%rcx 2985 117,97, //jne c77 <_sk_load_f16_hsw+0x6b> 2986 197,121,16,4,248, //vmovupd (%rax,%rdi,8),%xmm8 2987 197,249,16,84,248,16, //vmovupd 0x10(%rax,%rdi,8),%xmm2 2988 197,249,16,92,248,32, //vmovupd 0x20(%rax,%rdi,8),%xmm3 2989 197,122,111,76,248,48, //vmovdqu 0x30(%rax,%rdi,8),%xmm9 2990 197,185,97,194, //vpunpcklwd %xmm2,%xmm8,%xmm0 2991 197,185,105,210, //vpunpckhwd %xmm2,%xmm8,%xmm2 2992 196,193,97,97,201, //vpunpcklwd %xmm9,%xmm3,%xmm1 2993 196,193,97,105,217, //vpunpckhwd %xmm9,%xmm3,%xmm3 2994 197,121,97,194, //vpunpcklwd %xmm2,%xmm0,%xmm8 2995 197,121,105,202, //vpunpckhwd %xmm2,%xmm0,%xmm9 2996 197,241,97,211, //vpunpcklwd %xmm3,%xmm1,%xmm2 2997 197,241,105,219, //vpunpckhwd %xmm3,%xmm1,%xmm3 2998 197,185,108,194, //vpunpcklqdq %xmm2,%xmm8,%xmm0 2999 196,226,125,19,192, //vcvtph2ps %xmm0,%ymm0 3000 197,185,109,202, //vpunpckhqdq %xmm2,%xmm8,%xmm1 3001 196,226,125,19,201, //vcvtph2ps %xmm1,%ymm1 3002 197,177,108,211, //vpunpcklqdq %xmm3,%xmm9,%xmm2 3003 196,226,125,19,210, //vcvtph2ps %xmm2,%ymm2 3004 197,177,109,219, //vpunpckhqdq %xmm3,%xmm9,%xmm3 3005 196,226,125,19,219, //vcvtph2ps %xmm3,%ymm3 3006 72,173, //lods %ds:(%rsi),%rax 3007 255,224, //jmpq *%rax 3008 197,123,16,4,248, //vmovsd (%rax,%rdi,8),%xmm8 3009 196,65,49,239,201, //vpxor %xmm9,%xmm9,%xmm9 3010 72,131,249,1, //cmp $0x1,%rcx 3011 116,79, //je cd6 <_sk_load_f16_hsw+0xca> 3012 197,57,22,68,248,8, //vmovhpd 0x8(%rax,%rdi,8),%xmm8,%xmm8 3013 72,131,249,3, //cmp $0x3,%rcx 3014 114,67, //jb cd6 <_sk_load_f16_hsw+0xca> 3015 197,251,16,84,248,16, //vmovsd 0x10(%rax,%rdi,8),%xmm2 3016 72,131,249,3, //cmp $0x3,%rcx 3017 116,68, //je ce3 <_sk_load_f16_hsw+0xd7> 3018 197,233,22,84,248,24, //vmovhpd 0x18(%rax,%rdi,8),%xmm2,%xmm2 3019 72,131,249,5, //cmp $0x5,%rcx 3020 114,56, //jb ce3 <_sk_load_f16_hsw+0xd7> 3021 197,251,16,92,248,32, //vmovsd 0x20(%rax,%rdi,8),%xmm3 3022 72,131,249,5, //cmp $0x5,%rcx 3023 15,132,114,255,255,255, //je c2d <_sk_load_f16_hsw+0x21> 3024 197,225,22,92,248,40, //vmovhpd 0x28(%rax,%rdi,8),%xmm3,%xmm3 3025 72,131,249,7, //cmp $0x7,%rcx 3026 15,130,98,255,255,255, //jb c2d <_sk_load_f16_hsw+0x21> 3027 197,122,126,76,248,48, //vmovq 0x30(%rax,%rdi,8),%xmm9 3028 233,87,255,255,255, //jmpq c2d <_sk_load_f16_hsw+0x21> 3029 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3 3030 197,233,87,210, //vxorpd %xmm2,%xmm2,%xmm2 3031 233,74,255,255,255, //jmpq c2d <_sk_load_f16_hsw+0x21> 3032 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3 3033 233,65,255,255,255, //jmpq c2d <_sk_load_f16_hsw+0x21> 3034 }; 3035 3036 CODE const uint8_t sk_store_f16_hsw[] = { 3037 72,173, //lods %ds:(%rsi),%rax 3038 72,139,0, //mov (%rax),%rax 3039 196,195,125,29,192,4, //vcvtps2ph $0x4,%ymm0,%xmm8 3040 196,195,125,29,201,4, //vcvtps2ph $0x4,%ymm1,%xmm9 3041 196,195,125,29,210,4, //vcvtps2ph $0x4,%ymm2,%xmm10 3042 196,195,125,29,219,4, //vcvtps2ph $0x4,%ymm3,%xmm11 3043 196,65,57,97,225, //vpunpcklwd %xmm9,%xmm8,%xmm12 3044 196,65,57,105,193, //vpunpckhwd %xmm9,%xmm8,%xmm8 3045 196,65,41,97,203, //vpunpcklwd %xmm11,%xmm10,%xmm9 3046 196,65,41,105,235, //vpunpckhwd %xmm11,%xmm10,%xmm13 3047 196,65,25,98,217, //vpunpckldq %xmm9,%xmm12,%xmm11 3048 196,65,25,106,209, //vpunpckhdq %xmm9,%xmm12,%xmm10 3049 196,65,57,98,205, //vpunpckldq %xmm13,%xmm8,%xmm9 3050 196,65,57,106,197, //vpunpckhdq %xmm13,%xmm8,%xmm8 3051 72,133,201, //test %rcx,%rcx 3052 117,27, //jne d51 <_sk_store_f16_hsw+0x65> 3053 197,120,17,28,248, //vmovups %xmm11,(%rax,%rdi,8) 3054 197,120,17,84,248,16, //vmovups %xmm10,0x10(%rax,%rdi,8) 3055 197,120,17,76,248,32, //vmovups %xmm9,0x20(%rax,%rdi,8) 3056 197,122,127,68,248,48, //vmovdqu %xmm8,0x30(%rax,%rdi,8) 3057 72,173, //lods %ds:(%rsi),%rax 3058 255,224, //jmpq *%rax 3059 197,121,214,28,248, //vmovq %xmm11,(%rax,%rdi,8) 3060 72,131,249,1, //cmp $0x1,%rcx 3061 116,241, //je d4d <_sk_store_f16_hsw+0x61> 3062 197,121,23,92,248,8, //vmovhpd %xmm11,0x8(%rax,%rdi,8) 3063 72,131,249,3, //cmp $0x3,%rcx 3064 114,229, //jb d4d <_sk_store_f16_hsw+0x61> 3065 197,121,214,84,248,16, //vmovq %xmm10,0x10(%rax,%rdi,8) 3066 116,221, //je d4d <_sk_store_f16_hsw+0x61> 3067 197,121,23,84,248,24, //vmovhpd %xmm10,0x18(%rax,%rdi,8) 3068 72,131,249,5, //cmp $0x5,%rcx 3069 114,209, //jb d4d <_sk_store_f16_hsw+0x61> 3070 197,121,214,76,248,32, //vmovq %xmm9,0x20(%rax,%rdi,8) 3071 116,201, //je d4d <_sk_store_f16_hsw+0x61> 3072 197,121,23,76,248,40, //vmovhpd %xmm9,0x28(%rax,%rdi,8) 3073 72,131,249,7, //cmp $0x7,%rcx 3074 114,189, //jb d4d <_sk_store_f16_hsw+0x61> 3075 197,121,214,68,248,48, //vmovq %xmm8,0x30(%rax,%rdi,8) 3076 235,181, //jmp d4d <_sk_store_f16_hsw+0x61> 3077 }; 3078 3079 CODE const uint8_t sk_store_f32_hsw[] = { 3080 72,173, //lods %ds:(%rsi),%rax 3081 76,139,0, //mov (%rax),%r8 3082 72,141,4,189,0,0,0,0, //lea 0x0(,%rdi,4),%rax 3083 197,124,20,193, //vunpcklps %ymm1,%ymm0,%ymm8 3084 197,124,21,217, //vunpckhps %ymm1,%ymm0,%ymm11 3085 197,108,20,203, //vunpcklps %ymm3,%ymm2,%ymm9 3086 197,108,21,227, //vunpckhps %ymm3,%ymm2,%ymm12 3087 196,65,61,20,209, //vunpcklpd %ymm9,%ymm8,%ymm10 3088 196,65,61,21,201, //vunpckhpd %ymm9,%ymm8,%ymm9 3089 196,65,37,20,196, //vunpcklpd %ymm12,%ymm11,%ymm8 3090 196,65,37,21,220, //vunpckhpd %ymm12,%ymm11,%ymm11 3091 72,133,201, //test %rcx,%rcx 3092 117,55, //jne e05 <_sk_store_f32_hsw+0x6d> 3093 196,67,45,24,225,1, //vinsertf128 $0x1,%xmm9,%ymm10,%ymm12 3094 196,67,61,24,235,1, //vinsertf128 $0x1,%xmm11,%ymm8,%ymm13 3095 196,67,45,6,201,49, //vperm2f128 $0x31,%ymm9,%ymm10,%ymm9 3096 196,67,61,6,195,49, //vperm2f128 $0x31,%ymm11,%ymm8,%ymm8 3097 196,65,125,17,36,128, //vmovupd %ymm12,(%r8,%rax,4) 3098 196,65,125,17,108,128,32, //vmovupd %ymm13,0x20(%r8,%rax,4) 3099 196,65,125,17,76,128,64, //vmovupd %ymm9,0x40(%r8,%rax,4) 3100 196,65,125,17,68,128,96, //vmovupd %ymm8,0x60(%r8,%rax,4) 3101 72,173, //lods %ds:(%rsi),%rax 3102 255,224, //jmpq *%rax 3103 196,65,121,17,20,128, //vmovupd %xmm10,(%r8,%rax,4) 3104 72,131,249,1, //cmp $0x1,%rcx 3105 116,240, //je e01 <_sk_store_f32_hsw+0x69> 3106 196,65,121,17,76,128,16, //vmovupd %xmm9,0x10(%r8,%rax,4) 3107 72,131,249,3, //cmp $0x3,%rcx 3108 114,227, //jb e01 <_sk_store_f32_hsw+0x69> 3109 196,65,121,17,68,128,32, //vmovupd %xmm8,0x20(%r8,%rax,4) 3110 116,218, //je e01 <_sk_store_f32_hsw+0x69> 3111 196,65,121,17,92,128,48, //vmovupd %xmm11,0x30(%r8,%rax,4) 3112 72,131,249,5, //cmp $0x5,%rcx 3113 114,205, //jb e01 <_sk_store_f32_hsw+0x69> 3114 196,67,125,25,84,128,64,1, //vextractf128 $0x1,%ymm10,0x40(%r8,%rax,4) 3115 116,195, //je e01 <_sk_store_f32_hsw+0x69> 3116 196,67,125,25,76,128,80,1, //vextractf128 $0x1,%ymm9,0x50(%r8,%rax,4) 3117 72,131,249,7, //cmp $0x7,%rcx 3118 114,181, //jb e01 <_sk_store_f32_hsw+0x69> 3119 196,67,125,25,68,128,96,1, //vextractf128 $0x1,%ymm8,0x60(%r8,%rax,4) 3120 235,171, //jmp e01 <_sk_store_f32_hsw+0x69> 3121 }; 3122 3123 CODE const uint8_t sk_clamp_x_hsw[] = { 3124 72,173, //lods %ds:(%rsi),%rax 3125 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8 3126 197,188,95,192, //vmaxps %ymm0,%ymm8,%ymm0 3127 196,98,125,88,0, //vpbroadcastd (%rax),%ymm8 3128 196,65,53,118,201, //vpcmpeqd %ymm9,%ymm9,%ymm9 3129 196,65,61,254,193, //vpaddd %ymm9,%ymm8,%ymm8 3130 196,193,124,93,192, //vminps %ymm8,%ymm0,%ymm0 3131 72,173, //lods %ds:(%rsi),%rax 3132 255,224, //jmpq *%rax 3133 }; 3134 3135 CODE const uint8_t sk_clamp_y_hsw[] = { 3136 72,173, //lods %ds:(%rsi),%rax 3137 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8 3138 197,188,95,201, //vmaxps %ymm1,%ymm8,%ymm1 3139 196,98,125,88,0, //vpbroadcastd (%rax),%ymm8 3140 196,65,53,118,201, //vpcmpeqd %ymm9,%ymm9,%ymm9 3141 196,65,61,254,193, //vpaddd %ymm9,%ymm8,%ymm8 3142 196,193,116,93,200, //vminps %ymm8,%ymm1,%ymm1 3143 72,173, //lods %ds:(%rsi),%rax 3144 255,224, //jmpq *%rax 3145 }; 3146 3147 CODE const uint8_t sk_repeat_x_hsw[] = { 3148 72,173, //lods %ds:(%rsi),%rax 3149 196,98,125,24,0, //vbroadcastss (%rax),%ymm8 3150 196,65,124,94,200, //vdivps %ymm8,%ymm0,%ymm9 3151 196,67,125,8,201,1, //vroundps $0x1,%ymm9,%ymm9 3152 196,98,61,172,200, //vfnmadd213ps %ymm0,%ymm8,%ymm9 3153 197,253,118,192, //vpcmpeqd %ymm0,%ymm0,%ymm0 3154 197,189,254,192, //vpaddd %ymm0,%ymm8,%ymm0 3155 197,180,93,192, //vminps %ymm0,%ymm9,%ymm0 3156 72,173, //lods %ds:(%rsi),%rax 3157 255,224, //jmpq *%rax 3158 }; 3159 3160 CODE const uint8_t sk_repeat_y_hsw[] = { 3161 72,173, //lods %ds:(%rsi),%rax 3162 196,98,125,24,0, //vbroadcastss (%rax),%ymm8 3163 196,65,116,94,200, //vdivps %ymm8,%ymm1,%ymm9 3164 196,67,125,8,201,1, //vroundps $0x1,%ymm9,%ymm9 3165 196,98,61,172,201, //vfnmadd213ps %ymm1,%ymm8,%ymm9 3166 197,245,118,201, //vpcmpeqd %ymm1,%ymm1,%ymm1 3167 197,189,254,201, //vpaddd %ymm1,%ymm8,%ymm1 3168 197,180,93,201, //vminps %ymm1,%ymm9,%ymm1 3169 72,173, //lods %ds:(%rsi),%rax 3170 255,224, //jmpq *%rax 3171 }; 3172 3173 CODE const uint8_t sk_mirror_x_hsw[] = { 3174 72,173, //lods %ds:(%rsi),%rax 3175 197,122,16,0, //vmovss (%rax),%xmm8 3176 196,66,125,24,200, //vbroadcastss %xmm8,%ymm9 3177 196,65,124,92,209, //vsubps %ymm9,%ymm0,%ymm10 3178 196,193,58,88,192, //vaddss %xmm8,%xmm8,%xmm0 3179 196,226,125,24,192, //vbroadcastss %xmm0,%ymm0 3180 197,44,94,192, //vdivps %ymm0,%ymm10,%ymm8 3181 196,67,125,8,192,1, //vroundps $0x1,%ymm8,%ymm8 3182 196,66,125,172,194, //vfnmadd213ps %ymm10,%ymm0,%ymm8 3183 196,193,60,92,193, //vsubps %ymm9,%ymm8,%ymm0 3184 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8 3185 197,60,92,192, //vsubps %ymm0,%ymm8,%ymm8 3186 197,188,84,192, //vandps %ymm0,%ymm8,%ymm0 3187 196,65,61,118,192, //vpcmpeqd %ymm8,%ymm8,%ymm8 3188 196,65,53,254,192, //vpaddd %ymm8,%ymm9,%ymm8 3189 196,193,124,93,192, //vminps %ymm8,%ymm0,%ymm0 3190 72,173, //lods %ds:(%rsi),%rax 3191 255,224, //jmpq *%rax 3192 }; 3193 3194 CODE const uint8_t sk_mirror_y_hsw[] = { 3195 72,173, //lods %ds:(%rsi),%rax 3196 197,122,16,0, //vmovss (%rax),%xmm8 3197 196,66,125,24,200, //vbroadcastss %xmm8,%ymm9 3198 196,65,116,92,209, //vsubps %ymm9,%ymm1,%ymm10 3199 196,193,58,88,200, //vaddss %xmm8,%xmm8,%xmm1 3200 196,226,125,24,201, //vbroadcastss %xmm1,%ymm1 3201 197,44,94,193, //vdivps %ymm1,%ymm10,%ymm8 3202 196,67,125,8,192,1, //vroundps $0x1,%ymm8,%ymm8 3203 196,66,117,172,194, //vfnmadd213ps %ymm10,%ymm1,%ymm8 3204 196,193,60,92,201, //vsubps %ymm9,%ymm8,%ymm1 3205 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8 3206 197,60,92,193, //vsubps %ymm1,%ymm8,%ymm8 3207 197,188,84,201, //vandps %ymm1,%ymm8,%ymm1 3208 196,65,61,118,192, //vpcmpeqd %ymm8,%ymm8,%ymm8 3209 196,65,53,254,192, //vpaddd %ymm8,%ymm9,%ymm8 3210 196,193,116,93,200, //vminps %ymm8,%ymm1,%ymm1 3211 72,173, //lods %ds:(%rsi),%rax 3212 255,224, //jmpq *%rax 3213 }; 3214 3215 CODE const uint8_t sk_luminance_to_alpha_hsw[] = { 3216 184,208,179,89,62, //mov $0x3e59b3d0,%eax 3217 197,249,110,216, //vmovd %eax,%xmm3 3218 196,98,125,88,195, //vpbroadcastd %xmm3,%ymm8 3219 184,89,23,55,63, //mov $0x3f371759,%eax 3220 197,249,110,216, //vmovd %eax,%xmm3 3221 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 3222 197,228,89,201, //vmulps %ymm1,%ymm3,%ymm1 3223 196,98,125,168,193, //vfmadd213ps %ymm1,%ymm0,%ymm8 3224 184,152,221,147,61, //mov $0x3d93dd98,%eax 3225 197,249,110,192, //vmovd %eax,%xmm0 3226 196,226,125,88,216, //vpbroadcastd %xmm0,%ymm3 3227 196,194,109,168,216, //vfmadd213ps %ymm8,%ymm2,%ymm3 3228 72,173, //lods %ds:(%rsi),%rax 3229 197,253,239,192, //vpxor %ymm0,%ymm0,%ymm0 3230 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1 3231 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2 3232 255,224, //jmpq *%rax 3233 }; 3234 3235 CODE const uint8_t sk_matrix_2x3_hsw[] = { 3236 72,173, //lods %ds:(%rsi),%rax 3237 196,98,125,24,8, //vbroadcastss (%rax),%ymm9 3238 196,98,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm10 3239 196,98,125,24,64,16, //vbroadcastss 0x10(%rax),%ymm8 3240 196,66,117,184,194, //vfmadd231ps %ymm10,%ymm1,%ymm8 3241 196,66,125,184,193, //vfmadd231ps %ymm9,%ymm0,%ymm8 3242 196,98,125,24,80,4, //vbroadcastss 0x4(%rax),%ymm10 3243 196,98,125,24,88,12, //vbroadcastss 0xc(%rax),%ymm11 3244 196,98,125,24,72,20, //vbroadcastss 0x14(%rax),%ymm9 3245 196,66,117,184,203, //vfmadd231ps %ymm11,%ymm1,%ymm9 3246 196,66,125,184,202, //vfmadd231ps %ymm10,%ymm0,%ymm9 3247 72,173, //lods %ds:(%rsi),%rax 3248 197,124,41,192, //vmovaps %ymm8,%ymm0 3249 197,124,41,201, //vmovaps %ymm9,%ymm1 3250 255,224, //jmpq *%rax 3251 }; 3252 3253 CODE const uint8_t sk_matrix_3x4_hsw[] = { 3254 72,173, //lods %ds:(%rsi),%rax 3255 196,98,125,24,8, //vbroadcastss (%rax),%ymm9 3256 196,98,125,24,80,12, //vbroadcastss 0xc(%rax),%ymm10 3257 196,98,125,24,88,24, //vbroadcastss 0x18(%rax),%ymm11 3258 196,98,125,24,64,36, //vbroadcastss 0x24(%rax),%ymm8 3259 196,66,109,184,195, //vfmadd231ps %ymm11,%ymm2,%ymm8 3260 196,66,117,184,194, //vfmadd231ps %ymm10,%ymm1,%ymm8 3261 196,66,125,184,193, //vfmadd231ps %ymm9,%ymm0,%ymm8 3262 196,98,125,24,80,4, //vbroadcastss 0x4(%rax),%ymm10 3263 196,98,125,24,88,16, //vbroadcastss 0x10(%rax),%ymm11 3264 196,98,125,24,96,28, //vbroadcastss 0x1c(%rax),%ymm12 3265 196,98,125,24,72,40, //vbroadcastss 0x28(%rax),%ymm9 3266 196,66,109,184,204, //vfmadd231ps %ymm12,%ymm2,%ymm9 3267 196,66,117,184,203, //vfmadd231ps %ymm11,%ymm1,%ymm9 3268 196,66,125,184,202, //vfmadd231ps %ymm10,%ymm0,%ymm9 3269 196,98,125,24,88,8, //vbroadcastss 0x8(%rax),%ymm11 3270 196,98,125,24,96,20, //vbroadcastss 0x14(%rax),%ymm12 3271 196,98,125,24,104,32, //vbroadcastss 0x20(%rax),%ymm13 3272 196,98,125,24,80,44, //vbroadcastss 0x2c(%rax),%ymm10 3273 196,66,109,184,213, //vfmadd231ps %ymm13,%ymm2,%ymm10 3274 196,66,117,184,212, //vfmadd231ps %ymm12,%ymm1,%ymm10 3275 196,66,125,184,211, //vfmadd231ps %ymm11,%ymm0,%ymm10 3276 72,173, //lods %ds:(%rsi),%rax 3277 197,124,41,192, //vmovaps %ymm8,%ymm0 3278 197,124,41,201, //vmovaps %ymm9,%ymm1 3279 197,124,41,210, //vmovaps %ymm10,%ymm2 3280 255,224, //jmpq *%rax 3281 }; 3282 3283 CODE const uint8_t sk_matrix_4x5_hsw[] = { 3284 72,173, //lods %ds:(%rsi),%rax 3285 196,98,125,24,8, //vbroadcastss (%rax),%ymm9 3286 196,98,125,24,80,16, //vbroadcastss 0x10(%rax),%ymm10 3287 196,98,125,24,88,32, //vbroadcastss 0x20(%rax),%ymm11 3288 196,98,125,24,96,48, //vbroadcastss 0x30(%rax),%ymm12 3289 196,98,125,24,64,64, //vbroadcastss 0x40(%rax),%ymm8 3290 196,66,101,184,196, //vfmadd231ps %ymm12,%ymm3,%ymm8 3291 196,66,109,184,195, //vfmadd231ps %ymm11,%ymm2,%ymm8 3292 196,66,117,184,194, //vfmadd231ps %ymm10,%ymm1,%ymm8 3293 196,66,125,184,193, //vfmadd231ps %ymm9,%ymm0,%ymm8 3294 196,98,125,24,80,4, //vbroadcastss 0x4(%rax),%ymm10 3295 196,98,125,24,88,20, //vbroadcastss 0x14(%rax),%ymm11 3296 196,98,125,24,96,36, //vbroadcastss 0x24(%rax),%ymm12 3297 196,98,125,24,104,52, //vbroadcastss 0x34(%rax),%ymm13 3298 196,98,125,24,72,68, //vbroadcastss 0x44(%rax),%ymm9 3299 196,66,101,184,205, //vfmadd231ps %ymm13,%ymm3,%ymm9 3300 196,66,109,184,204, //vfmadd231ps %ymm12,%ymm2,%ymm9 3301 196,66,117,184,203, //vfmadd231ps %ymm11,%ymm1,%ymm9 3302 196,66,125,184,202, //vfmadd231ps %ymm10,%ymm0,%ymm9 3303 196,98,125,24,88,8, //vbroadcastss 0x8(%rax),%ymm11 3304 196,98,125,24,96,24, //vbroadcastss 0x18(%rax),%ymm12 3305 196,98,125,24,104,40, //vbroadcastss 0x28(%rax),%ymm13 3306 196,98,125,24,112,56, //vbroadcastss 0x38(%rax),%ymm14 3307 196,98,125,24,80,72, //vbroadcastss 0x48(%rax),%ymm10 3308 196,66,101,184,214, //vfmadd231ps %ymm14,%ymm3,%ymm10 3309 196,66,109,184,213, //vfmadd231ps %ymm13,%ymm2,%ymm10 3310 196,66,117,184,212, //vfmadd231ps %ymm12,%ymm1,%ymm10 3311 196,66,125,184,211, //vfmadd231ps %ymm11,%ymm0,%ymm10 3312 196,98,125,24,96,12, //vbroadcastss 0xc(%rax),%ymm12 3313 196,98,125,24,104,28, //vbroadcastss 0x1c(%rax),%ymm13 3314 196,98,125,24,112,44, //vbroadcastss 0x2c(%rax),%ymm14 3315 196,98,125,24,120,60, //vbroadcastss 0x3c(%rax),%ymm15 3316 196,98,125,24,88,76, //vbroadcastss 0x4c(%rax),%ymm11 3317 196,66,101,184,223, //vfmadd231ps %ymm15,%ymm3,%ymm11 3318 196,66,109,184,222, //vfmadd231ps %ymm14,%ymm2,%ymm11 3319 196,66,117,184,221, //vfmadd231ps %ymm13,%ymm1,%ymm11 3320 196,66,125,184,220, //vfmadd231ps %ymm12,%ymm0,%ymm11 3321 72,173, //lods %ds:(%rsi),%rax 3322 197,124,41,192, //vmovaps %ymm8,%ymm0 3323 197,124,41,201, //vmovaps %ymm9,%ymm1 3324 197,124,41,210, //vmovaps %ymm10,%ymm2 3325 197,124,41,219, //vmovaps %ymm11,%ymm3 3326 255,224, //jmpq *%rax 3327 }; 3328 3329 CODE const uint8_t sk_matrix_perspective_hsw[] = { 3330 72,173, //lods %ds:(%rsi),%rax 3331 196,98,125,24,0, //vbroadcastss (%rax),%ymm8 3332 196,98,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm9 3333 196,98,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm10 3334 196,66,117,184,209, //vfmadd231ps %ymm9,%ymm1,%ymm10 3335 196,66,125,184,208, //vfmadd231ps %ymm8,%ymm0,%ymm10 3336 196,98,125,24,64,12, //vbroadcastss 0xc(%rax),%ymm8 3337 196,98,125,24,72,16, //vbroadcastss 0x10(%rax),%ymm9 3338 196,98,125,24,88,20, //vbroadcastss 0x14(%rax),%ymm11 3339 196,66,117,184,217, //vfmadd231ps %ymm9,%ymm1,%ymm11 3340 196,66,125,184,216, //vfmadd231ps %ymm8,%ymm0,%ymm11 3341 196,98,125,24,64,24, //vbroadcastss 0x18(%rax),%ymm8 3342 196,98,125,24,72,28, //vbroadcastss 0x1c(%rax),%ymm9 3343 196,98,125,24,96,32, //vbroadcastss 0x20(%rax),%ymm12 3344 196,66,117,184,225, //vfmadd231ps %ymm9,%ymm1,%ymm12 3345 196,66,125,184,224, //vfmadd231ps %ymm8,%ymm0,%ymm12 3346 196,193,124,83,204, //vrcpps %ymm12,%ymm1 3347 197,172,89,193, //vmulps %ymm1,%ymm10,%ymm0 3348 197,164,89,201, //vmulps %ymm1,%ymm11,%ymm1 3349 72,173, //lods %ds:(%rsi),%rax 3350 255,224, //jmpq *%rax 3351 }; 3352 3353 CODE const uint8_t sk_linear_gradient_2stops_hsw[] = { 3354 72,173, //lods %ds:(%rsi),%rax 3355 196,226,125,24,72,16, //vbroadcastss 0x10(%rax),%ymm1 3356 196,98,125,24,0, //vbroadcastss (%rax),%ymm8 3357 196,98,125,184,193, //vfmadd231ps %ymm1,%ymm0,%ymm8 3358 196,226,125,24,80,20, //vbroadcastss 0x14(%rax),%ymm2 3359 196,226,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm1 3360 196,226,125,184,202, //vfmadd231ps %ymm2,%ymm0,%ymm1 3361 196,226,125,24,88,24, //vbroadcastss 0x18(%rax),%ymm3 3362 196,226,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm2 3363 196,226,125,184,211, //vfmadd231ps %ymm3,%ymm0,%ymm2 3364 196,98,125,24,72,28, //vbroadcastss 0x1c(%rax),%ymm9 3365 196,226,125,24,88,12, //vbroadcastss 0xc(%rax),%ymm3 3366 196,194,125,184,217, //vfmadd231ps %ymm9,%ymm0,%ymm3 3367 72,173, //lods %ds:(%rsi),%rax 3368 197,124,41,192, //vmovaps %ymm8,%ymm0 3369 255,224, //jmpq *%rax 3370 }; 3371 3372 CODE const uint8_t sk_start_pipeline_avx[] = { 3373 65,87, //push %r15 3374 65,86, //push %r14 3375 65,85, //push %r13 3376 65,84, //push %r12 3377 83, //push %rbx 3378 73,137,205, //mov %rcx,%r13 3379 73,137,214, //mov %rdx,%r14 3380 72,137,251, //mov %rdi,%rbx 3381 72,173, //lods %ds:(%rsi),%rax 3382 73,137,199, //mov %rax,%r15 3383 73,137,244, //mov %rsi,%r12 3384 72,141,67,8, //lea 0x8(%rbx),%rax 3385 76,57,232, //cmp %r13,%rax 3386 118,5, //jbe 28 <_sk_start_pipeline_avx+0x28> 3387 72,137,223, //mov %rbx,%rdi 3388 235,65, //jmp 69 <_sk_start_pipeline_avx+0x69> 3389 185,0,0,0,0, //mov $0x0,%ecx 3390 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0 3391 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1 3392 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2 3393 197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3 3394 197,220,87,228, //vxorps %ymm4,%ymm4,%ymm4 3395 197,212,87,237, //vxorps %ymm5,%ymm5,%ymm5 3396 197,204,87,246, //vxorps %ymm6,%ymm6,%ymm6 3397 197,196,87,255, //vxorps %ymm7,%ymm7,%ymm7 3398 72,137,223, //mov %rbx,%rdi 3399 76,137,230, //mov %r12,%rsi 3400 76,137,242, //mov %r14,%rdx 3401 65,255,215, //callq *%r15 3402 72,141,123,8, //lea 0x8(%rbx),%rdi 3403 72,131,195,16, //add $0x10,%rbx 3404 76,57,235, //cmp %r13,%rbx 3405 72,137,251, //mov %rdi,%rbx 3406 118,191, //jbe 28 <_sk_start_pipeline_avx+0x28> 3407 76,137,233, //mov %r13,%rcx 3408 72,41,249, //sub %rdi,%rcx 3409 116,41, //je 9a <_sk_start_pipeline_avx+0x9a> 3410 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0 3411 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1 3412 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2 3413 197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3 3414 197,220,87,228, //vxorps %ymm4,%ymm4,%ymm4 3415 197,212,87,237, //vxorps %ymm5,%ymm5,%ymm5 3416 197,204,87,246, //vxorps %ymm6,%ymm6,%ymm6 3417 197,196,87,255, //vxorps %ymm7,%ymm7,%ymm7 3418 76,137,230, //mov %r12,%rsi 3419 76,137,242, //mov %r14,%rdx 3420 65,255,215, //callq *%r15 3421 76,137,232, //mov %r13,%rax 3422 91, //pop %rbx 3423 65,92, //pop %r12 3424 65,93, //pop %r13 3425 65,94, //pop %r14 3426 65,95, //pop %r15 3427 197,248,119, //vzeroupper 3428 195, //retq 3429 }; 3430 3431 CODE const uint8_t sk_just_return_avx[] = { 3432 195, //retq 3433 }; 3434 3435 CODE const uint8_t sk_seed_shader_avx[] = { 3436 72,173, //lods %ds:(%rsi),%rax 3437 197,249,110,199, //vmovd %edi,%xmm0 3438 197,249,112,192,0, //vpshufd $0x0,%xmm0,%xmm0 3439 196,227,125,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm0,%ymm0 3440 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0 3441 65,184,0,0,0,63, //mov $0x3f000000,%r8d 3442 196,193,121,110,200, //vmovd %r8d,%xmm1 3443 196,227,121,4,201,0, //vpermilps $0x0,%xmm1,%xmm1 3444 196,227,117,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm1,%ymm1 3445 197,252,88,193, //vaddps %ymm1,%ymm0,%ymm0 3446 197,252,88,2, //vaddps (%rdx),%ymm0,%ymm0 3447 196,226,125,24,16, //vbroadcastss (%rax),%ymm2 3448 197,252,91,210, //vcvtdq2ps %ymm2,%ymm2 3449 197,236,88,201, //vaddps %ymm1,%ymm2,%ymm1 3450 184,0,0,128,63, //mov $0x3f800000,%eax 3451 197,249,110,208, //vmovd %eax,%xmm2 3452 196,227,121,4,210,0, //vpermilps $0x0,%xmm2,%xmm2 3453 196,227,109,24,210,1, //vinsertf128 $0x1,%xmm2,%ymm2,%ymm2 3454 72,173, //lods %ds:(%rsi),%rax 3455 197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3 3456 197,220,87,228, //vxorps %ymm4,%ymm4,%ymm4 3457 197,212,87,237, //vxorps %ymm5,%ymm5,%ymm5 3458 197,204,87,246, //vxorps %ymm6,%ymm6,%ymm6 3459 197,196,87,255, //vxorps %ymm7,%ymm7,%ymm7 3460 255,224, //jmpq *%rax 3461 }; 3462 3463 CODE const uint8_t sk_constant_color_avx[] = { 3464 72,173, //lods %ds:(%rsi),%rax 3465 196,226,125,24,0, //vbroadcastss (%rax),%ymm0 3466 196,226,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm1 3467 196,226,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm2 3468 196,226,125,24,88,12, //vbroadcastss 0xc(%rax),%ymm3 3469 72,173, //lods %ds:(%rsi),%rax 3470 255,224, //jmpq *%rax 3471 }; 3472 3473 CODE const uint8_t sk_clear_avx[] = { 3474 72,173, //lods %ds:(%rsi),%rax 3475 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0 3476 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1 3477 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2 3478 197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3 3479 255,224, //jmpq *%rax 3480 }; 3481 3482 CODE const uint8_t sk_plus__avx[] = { 3483 197,252,88,196, //vaddps %ymm4,%ymm0,%ymm0 3484 197,244,88,205, //vaddps %ymm5,%ymm1,%ymm1 3485 197,236,88,214, //vaddps %ymm6,%ymm2,%ymm2 3486 197,228,88,223, //vaddps %ymm7,%ymm3,%ymm3 3487 72,173, //lods %ds:(%rsi),%rax 3488 255,224, //jmpq *%rax 3489 }; 3490 3491 CODE const uint8_t sk_srcover_avx[] = { 3492 184,0,0,128,63, //mov $0x3f800000,%eax 3493 197,121,110,192, //vmovd %eax,%xmm8 3494 196,67,121,4,192,0, //vpermilps $0x0,%xmm8,%xmm8 3495 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8 3496 197,60,92,195, //vsubps %ymm3,%ymm8,%ymm8 3497 197,60,89,204, //vmulps %ymm4,%ymm8,%ymm9 3498 197,180,88,192, //vaddps %ymm0,%ymm9,%ymm0 3499 197,60,89,205, //vmulps %ymm5,%ymm8,%ymm9 3500 197,180,88,201, //vaddps %ymm1,%ymm9,%ymm1 3501 197,60,89,206, //vmulps %ymm6,%ymm8,%ymm9 3502 197,180,88,210, //vaddps %ymm2,%ymm9,%ymm2 3503 197,60,89,199, //vmulps %ymm7,%ymm8,%ymm8 3504 197,188,88,219, //vaddps %ymm3,%ymm8,%ymm3 3505 72,173, //lods %ds:(%rsi),%rax 3506 255,224, //jmpq *%rax 3507 }; 3508 3509 CODE const uint8_t sk_dstover_avx[] = { 3510 184,0,0,128,63, //mov $0x3f800000,%eax 3511 197,121,110,192, //vmovd %eax,%xmm8 3512 196,67,121,4,192,0, //vpermilps $0x0,%xmm8,%xmm8 3513 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8 3514 197,60,92,199, //vsubps %ymm7,%ymm8,%ymm8 3515 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0 3516 197,252,88,196, //vaddps %ymm4,%ymm0,%ymm0 3517 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1 3518 197,244,88,205, //vaddps %ymm5,%ymm1,%ymm1 3519 197,188,89,210, //vmulps %ymm2,%ymm8,%ymm2 3520 197,236,88,214, //vaddps %ymm6,%ymm2,%ymm2 3521 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3 3522 197,228,88,223, //vaddps %ymm7,%ymm3,%ymm3 3523 72,173, //lods %ds:(%rsi),%rax 3524 255,224, //jmpq *%rax 3525 }; 3526 3527 CODE const uint8_t sk_clamp_0_avx[] = { 3528 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8 3529 196,193,124,95,192, //vmaxps %ymm8,%ymm0,%ymm0 3530 196,193,116,95,200, //vmaxps %ymm8,%ymm1,%ymm1 3531 196,193,108,95,208, //vmaxps %ymm8,%ymm2,%ymm2 3532 196,193,100,95,216, //vmaxps %ymm8,%ymm3,%ymm3 3533 72,173, //lods %ds:(%rsi),%rax 3534 255,224, //jmpq *%rax 3535 }; 3536 3537 CODE const uint8_t sk_clamp_1_avx[] = { 3538 184,0,0,128,63, //mov $0x3f800000,%eax 3539 197,121,110,192, //vmovd %eax,%xmm8 3540 196,67,121,4,192,0, //vpermilps $0x0,%xmm8,%xmm8 3541 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8 3542 196,193,124,93,192, //vminps %ymm8,%ymm0,%ymm0 3543 196,193,116,93,200, //vminps %ymm8,%ymm1,%ymm1 3544 196,193,108,93,208, //vminps %ymm8,%ymm2,%ymm2 3545 196,193,100,93,216, //vminps %ymm8,%ymm3,%ymm3 3546 72,173, //lods %ds:(%rsi),%rax 3547 255,224, //jmpq *%rax 3548 }; 3549 3550 CODE const uint8_t sk_clamp_a_avx[] = { 3551 184,0,0,128,63, //mov $0x3f800000,%eax 3552 197,121,110,192, //vmovd %eax,%xmm8 3553 196,67,121,4,192,0, //vpermilps $0x0,%xmm8,%xmm8 3554 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8 3555 196,193,100,93,216, //vminps %ymm8,%ymm3,%ymm3 3556 197,252,93,195, //vminps %ymm3,%ymm0,%ymm0 3557 197,244,93,203, //vminps %ymm3,%ymm1,%ymm1 3558 197,236,93,211, //vminps %ymm3,%ymm2,%ymm2 3559 72,173, //lods %ds:(%rsi),%rax 3560 255,224, //jmpq *%rax 3561 }; 3562 3563 CODE const uint8_t sk_set_rgb_avx[] = { 3564 72,173, //lods %ds:(%rsi),%rax 3565 196,226,125,24,0, //vbroadcastss (%rax),%ymm0 3566 196,226,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm1 3567 196,226,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm2 3568 72,173, //lods %ds:(%rsi),%rax 3569 255,224, //jmpq *%rax 3570 }; 3571 3572 CODE const uint8_t sk_swap_rb_avx[] = { 3573 197,124,40,192, //vmovaps %ymm0,%ymm8 3574 72,173, //lods %ds:(%rsi),%rax 3575 197,252,40,194, //vmovaps %ymm2,%ymm0 3576 197,124,41,194, //vmovaps %ymm8,%ymm2 3577 255,224, //jmpq *%rax 3578 }; 3579 3580 CODE const uint8_t sk_swap_avx[] = { 3581 197,124,40,195, //vmovaps %ymm3,%ymm8 3582 197,124,40,202, //vmovaps %ymm2,%ymm9 3583 197,124,40,209, //vmovaps %ymm1,%ymm10 3584 197,124,40,216, //vmovaps %ymm0,%ymm11 3585 72,173, //lods %ds:(%rsi),%rax 3586 197,252,40,196, //vmovaps %ymm4,%ymm0 3587 197,252,40,205, //vmovaps %ymm5,%ymm1 3588 197,252,40,214, //vmovaps %ymm6,%ymm2 3589 197,252,40,223, //vmovaps %ymm7,%ymm3 3590 197,124,41,220, //vmovaps %ymm11,%ymm4 3591 197,124,41,213, //vmovaps %ymm10,%ymm5 3592 197,124,41,206, //vmovaps %ymm9,%ymm6 3593 197,124,41,199, //vmovaps %ymm8,%ymm7 3594 255,224, //jmpq *%rax 3595 }; 3596 3597 CODE const uint8_t sk_move_src_dst_avx[] = { 3598 72,173, //lods %ds:(%rsi),%rax 3599 197,252,40,224, //vmovaps %ymm0,%ymm4 3600 197,252,40,233, //vmovaps %ymm1,%ymm5 3601 197,252,40,242, //vmovaps %ymm2,%ymm6 3602 197,252,40,251, //vmovaps %ymm3,%ymm7 3603 255,224, //jmpq *%rax 3604 }; 3605 3606 CODE const uint8_t sk_move_dst_src_avx[] = { 3607 72,173, //lods %ds:(%rsi),%rax 3608 197,252,40,196, //vmovaps %ymm4,%ymm0 3609 197,252,40,205, //vmovaps %ymm5,%ymm1 3610 197,252,40,214, //vmovaps %ymm6,%ymm2 3611 197,252,40,223, //vmovaps %ymm7,%ymm3 3612 255,224, //jmpq *%rax 3613 }; 3614 3615 CODE const uint8_t sk_premul_avx[] = { 3616 197,252,89,195, //vmulps %ymm3,%ymm0,%ymm0 3617 197,244,89,203, //vmulps %ymm3,%ymm1,%ymm1 3618 197,236,89,211, //vmulps %ymm3,%ymm2,%ymm2 3619 72,173, //lods %ds:(%rsi),%rax 3620 255,224, //jmpq *%rax 3621 }; 3622 3623 CODE const uint8_t sk_unpremul_avx[] = { 3624 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8 3625 196,65,100,194,200,0, //vcmpeqps %ymm8,%ymm3,%ymm9 3626 184,0,0,128,63, //mov $0x3f800000,%eax 3627 197,121,110,208, //vmovd %eax,%xmm10 3628 196,67,121,4,210,0, //vpermilps $0x0,%xmm10,%xmm10 3629 196,67,45,24,210,1, //vinsertf128 $0x1,%xmm10,%ymm10,%ymm10 3630 197,44,94,211, //vdivps %ymm3,%ymm10,%ymm10 3631 196,67,45,74,192,144, //vblendvps %ymm9,%ymm8,%ymm10,%ymm8 3632 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0 3633 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1 3634 197,188,89,210, //vmulps %ymm2,%ymm8,%ymm2 3635 72,173, //lods %ds:(%rsi),%rax 3636 255,224, //jmpq *%rax 3637 }; 3638 3639 CODE const uint8_t sk_from_srgb_avx[] = { 3640 184,145,131,158,61, //mov $0x3d9e8391,%eax 3641 197,121,110,192, //vmovd %eax,%xmm8 3642 196,67,121,4,192,0, //vpermilps $0x0,%xmm8,%xmm8 3643 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8 3644 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9 3645 197,124,89,208, //vmulps %ymm0,%ymm0,%ymm10 3646 184,154,153,153,62, //mov $0x3e99999a,%eax 3647 197,121,110,216, //vmovd %eax,%xmm11 3648 196,67,121,4,219,0, //vpermilps $0x0,%xmm11,%xmm11 3649 196,67,37,24,219,1, //vinsertf128 $0x1,%xmm11,%ymm11,%ymm11 3650 184,92,143,50,63, //mov $0x3f328f5c,%eax 3651 197,121,110,224, //vmovd %eax,%xmm12 3652 196,67,121,4,228,0, //vpermilps $0x0,%xmm12,%xmm12 3653 196,67,29,24,228,1, //vinsertf128 $0x1,%xmm12,%ymm12,%ymm12 3654 197,36,89,232, //vmulps %ymm0,%ymm11,%ymm13 3655 196,65,20,88,236, //vaddps %ymm12,%ymm13,%ymm13 3656 184,10,215,35,59, //mov $0x3b23d70a,%eax 3657 197,121,110,240, //vmovd %eax,%xmm14 3658 196,67,121,4,246,0, //vpermilps $0x0,%xmm14,%xmm14 3659 196,67,13,24,246,1, //vinsertf128 $0x1,%xmm14,%ymm14,%ymm14 3660 196,65,44,89,213, //vmulps %ymm13,%ymm10,%ymm10 3661 196,65,12,88,210, //vaddps %ymm10,%ymm14,%ymm10 3662 184,174,71,97,61, //mov $0x3d6147ae,%eax 3663 197,121,110,232, //vmovd %eax,%xmm13 3664 196,67,121,4,237,0, //vpermilps $0x0,%xmm13,%xmm13 3665 196,67,21,24,237,1, //vinsertf128 $0x1,%xmm13,%ymm13,%ymm13 3666 196,193,124,194,197,1, //vcmpltps %ymm13,%ymm0,%ymm0 3667 196,195,45,74,193,0, //vblendvps %ymm0,%ymm9,%ymm10,%ymm0 3668 197,60,89,201, //vmulps %ymm1,%ymm8,%ymm9 3669 197,116,89,209, //vmulps %ymm1,%ymm1,%ymm10 3670 197,36,89,249, //vmulps %ymm1,%ymm11,%ymm15 3671 196,65,28,88,255, //vaddps %ymm15,%ymm12,%ymm15 3672 196,65,44,89,215, //vmulps %ymm15,%ymm10,%ymm10 3673 196,65,12,88,210, //vaddps %ymm10,%ymm14,%ymm10 3674 196,193,116,194,205,1, //vcmpltps %ymm13,%ymm1,%ymm1 3675 196,195,45,74,201,16, //vblendvps %ymm1,%ymm9,%ymm10,%ymm1 3676 197,60,89,194, //vmulps %ymm2,%ymm8,%ymm8 3677 197,108,89,202, //vmulps %ymm2,%ymm2,%ymm9 3678 197,36,89,210, //vmulps %ymm2,%ymm11,%ymm10 3679 196,65,28,88,210, //vaddps %ymm10,%ymm12,%ymm10 3680 196,65,52,89,202, //vmulps %ymm10,%ymm9,%ymm9 3681 196,65,12,88,201, //vaddps %ymm9,%ymm14,%ymm9 3682 196,193,108,194,213,1, //vcmpltps %ymm13,%ymm2,%ymm2 3683 196,195,53,74,208,32, //vblendvps %ymm2,%ymm8,%ymm9,%ymm2 3684 72,173, //lods %ds:(%rsi),%rax 3685 255,224, //jmpq *%rax 3686 }; 3687 3688 CODE const uint8_t sk_to_srgb_avx[] = { 3689 197,124,82,192, //vrsqrtps %ymm0,%ymm8 3690 196,65,124,83,232, //vrcpps %ymm8,%ymm13 3691 196,65,124,82,240, //vrsqrtps %ymm8,%ymm14 3692 184,41,92,71,65, //mov $0x41475c29,%eax 3693 197,121,110,192, //vmovd %eax,%xmm8 3694 196,67,121,4,192,0, //vpermilps $0x0,%xmm8,%xmm8 3695 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8 3696 197,60,89,224, //vmulps %ymm0,%ymm8,%ymm12 3697 184,0,0,128,63, //mov $0x3f800000,%eax 3698 197,121,110,200, //vmovd %eax,%xmm9 3699 196,67,121,4,201,0, //vpermilps $0x0,%xmm9,%xmm9 3700 196,67,53,24,201,1, //vinsertf128 $0x1,%xmm9,%ymm9,%ymm9 3701 184,194,135,210,62, //mov $0x3ed287c2,%eax 3702 197,121,110,208, //vmovd %eax,%xmm10 3703 196,67,121,4,210,0, //vpermilps $0x0,%xmm10,%xmm10 3704 196,67,45,24,210,1, //vinsertf128 $0x1,%xmm10,%ymm10,%ymm10 3705 184,206,111,48,63, //mov $0x3f306fce,%eax 3706 197,121,110,216, //vmovd %eax,%xmm11 3707 196,67,121,4,219,0, //vpermilps $0x0,%xmm11,%xmm11 3708 196,67,37,24,219,1, //vinsertf128 $0x1,%xmm11,%ymm11,%ymm11 3709 184,168,87,202,61, //mov $0x3dca57a8,%eax 3710 53,0,0,0,128, //xor $0x80000000,%eax 3711 197,121,110,248, //vmovd %eax,%xmm15 3712 196,67,121,4,255,0, //vpermilps $0x0,%xmm15,%xmm15 3713 196,67,5,24,255,1, //vinsertf128 $0x1,%xmm15,%ymm15,%ymm15 3714 196,65,20,89,235, //vmulps %ymm11,%ymm13,%ymm13 3715 196,65,20,88,239, //vaddps %ymm15,%ymm13,%ymm13 3716 196,65,12,89,242, //vmulps %ymm10,%ymm14,%ymm14 3717 196,65,12,88,237, //vaddps %ymm13,%ymm14,%ymm13 3718 196,65,52,93,237, //vminps %ymm13,%ymm9,%ymm13 3719 184,4,231,140,59, //mov $0x3b8ce704,%eax 3720 197,121,110,240, //vmovd %eax,%xmm14 3721 196,67,121,4,246,0, //vpermilps $0x0,%xmm14,%xmm14 3722 196,67,13,24,246,1, //vinsertf128 $0x1,%xmm14,%ymm14,%ymm14 3723 196,193,124,194,198,1, //vcmpltps %ymm14,%ymm0,%ymm0 3724 196,195,21,74,196,0, //vblendvps %ymm0,%ymm12,%ymm13,%ymm0 3725 197,124,82,225, //vrsqrtps %ymm1,%ymm12 3726 196,65,124,83,236, //vrcpps %ymm12,%ymm13 3727 196,65,124,82,228, //vrsqrtps %ymm12,%ymm12 3728 196,65,36,89,237, //vmulps %ymm13,%ymm11,%ymm13 3729 196,65,4,88,237, //vaddps %ymm13,%ymm15,%ymm13 3730 196,65,44,89,228, //vmulps %ymm12,%ymm10,%ymm12 3731 196,65,28,88,229, //vaddps %ymm13,%ymm12,%ymm12 3732 197,60,89,233, //vmulps %ymm1,%ymm8,%ymm13 3733 196,65,52,93,228, //vminps %ymm12,%ymm9,%ymm12 3734 196,193,116,194,206,1, //vcmpltps %ymm14,%ymm1,%ymm1 3735 196,195,29,74,205,16, //vblendvps %ymm1,%ymm13,%ymm12,%ymm1 3736 197,124,82,226, //vrsqrtps %ymm2,%ymm12 3737 196,65,124,83,236, //vrcpps %ymm12,%ymm13 3738 196,65,36,89,221, //vmulps %ymm13,%ymm11,%ymm11 3739 196,65,4,88,219, //vaddps %ymm11,%ymm15,%ymm11 3740 196,65,124,82,228, //vrsqrtps %ymm12,%ymm12 3741 196,65,44,89,212, //vmulps %ymm12,%ymm10,%ymm10 3742 196,65,44,88,211, //vaddps %ymm11,%ymm10,%ymm10 3743 196,65,52,93,202, //vminps %ymm10,%ymm9,%ymm9 3744 197,60,89,194, //vmulps %ymm2,%ymm8,%ymm8 3745 196,193,108,194,214,1, //vcmpltps %ymm14,%ymm2,%ymm2 3746 196,195,53,74,208,32, //vblendvps %ymm2,%ymm8,%ymm9,%ymm2 3747 72,173, //lods %ds:(%rsi),%rax 3748 255,224, //jmpq *%rax 3749 }; 3750 3751 CODE const uint8_t sk_scale_1_float_avx[] = { 3752 72,173, //lods %ds:(%rsi),%rax 3753 196,98,125,24,0, //vbroadcastss (%rax),%ymm8 3754 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0 3755 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1 3756 197,188,89,210, //vmulps %ymm2,%ymm8,%ymm2 3757 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3 3758 72,173, //lods %ds:(%rsi),%rax 3759 255,224, //jmpq *%rax 3760 }; 3761 3762 CODE const uint8_t sk_scale_u8_avx[] = { 3763 73,137,200, //mov %rcx,%r8 3764 72,173, //lods %ds:(%rsi),%rax 3765 72,139,0, //mov (%rax),%rax 3766 72,1,248, //add %rdi,%rax 3767 77,133,192, //test %r8,%r8 3768 117,80, //jne 5a2 <_sk_scale_u8_avx+0x60> 3769 197,122,126,0, //vmovq (%rax),%xmm8 3770 196,66,121,49,200, //vpmovzxbd %xmm8,%xmm9 3771 196,67,121,4,192,229, //vpermilps $0xe5,%xmm8,%xmm8 3772 196,66,121,49,192, //vpmovzxbd %xmm8,%xmm8 3773 196,67,53,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm9,%ymm8 3774 196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8 3775 184,129,128,128,59, //mov $0x3b808081,%eax 3776 197,121,110,200, //vmovd %eax,%xmm9 3777 196,67,121,4,201,0, //vpermilps $0x0,%xmm9,%xmm9 3778 196,67,53,24,201,1, //vinsertf128 $0x1,%xmm9,%ymm9,%ymm9 3779 196,65,60,89,193, //vmulps %ymm9,%ymm8,%ymm8 3780 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0 3781 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1 3782 197,188,89,210, //vmulps %ymm2,%ymm8,%ymm2 3783 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3 3784 72,173, //lods %ds:(%rsi),%rax 3785 76,137,193, //mov %r8,%rcx 3786 255,224, //jmpq *%rax 3787 49,201, //xor %ecx,%ecx 3788 77,137,194, //mov %r8,%r10 3789 69,49,201, //xor %r9d,%r9d 3790 68,15,182,24, //movzbl (%rax),%r11d 3791 72,255,192, //inc %rax 3792 73,211,227, //shl %cl,%r11 3793 77,9,217, //or %r11,%r9 3794 72,131,193,8, //add $0x8,%rcx 3795 73,255,202, //dec %r10 3796 117,234, //jne 5aa <_sk_scale_u8_avx+0x68> 3797 196,65,249,110,193, //vmovq %r9,%xmm8 3798 235,143, //jmp 556 <_sk_scale_u8_avx+0x14> 3799 }; 3800 3801 CODE const uint8_t sk_lerp_1_float_avx[] = { 3802 72,173, //lods %ds:(%rsi),%rax 3803 196,98,125,24,0, //vbroadcastss (%rax),%ymm8 3804 197,252,92,196, //vsubps %ymm4,%ymm0,%ymm0 3805 196,193,124,89,192, //vmulps %ymm8,%ymm0,%ymm0 3806 197,252,88,196, //vaddps %ymm4,%ymm0,%ymm0 3807 197,244,92,205, //vsubps %ymm5,%ymm1,%ymm1 3808 196,193,116,89,200, //vmulps %ymm8,%ymm1,%ymm1 3809 197,244,88,205, //vaddps %ymm5,%ymm1,%ymm1 3810 197,236,92,214, //vsubps %ymm6,%ymm2,%ymm2 3811 196,193,108,89,208, //vmulps %ymm8,%ymm2,%ymm2 3812 197,236,88,214, //vaddps %ymm6,%ymm2,%ymm2 3813 197,228,92,223, //vsubps %ymm7,%ymm3,%ymm3 3814 196,193,100,89,216, //vmulps %ymm8,%ymm3,%ymm3 3815 197,228,88,223, //vaddps %ymm7,%ymm3,%ymm3 3816 72,173, //lods %ds:(%rsi),%rax 3817 255,224, //jmpq *%rax 3818 }; 3819 3820 CODE const uint8_t sk_lerp_u8_avx[] = { 3821 73,137,200, //mov %rcx,%r8 3822 72,173, //lods %ds:(%rsi),%rax 3823 72,139,0, //mov (%rax),%rax 3824 72,1,248, //add %rdi,%rax 3825 77,133,192, //test %r8,%r8 3826 117,116, //jne 68a <_sk_lerp_u8_avx+0x84> 3827 197,122,126,0, //vmovq (%rax),%xmm8 3828 196,66,121,49,200, //vpmovzxbd %xmm8,%xmm9 3829 196,67,121,4,192,229, //vpermilps $0xe5,%xmm8,%xmm8 3830 196,66,121,49,192, //vpmovzxbd %xmm8,%xmm8 3831 196,67,53,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm9,%ymm8 3832 196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8 3833 184,129,128,128,59, //mov $0x3b808081,%eax 3834 197,121,110,200, //vmovd %eax,%xmm9 3835 196,67,121,4,201,0, //vpermilps $0x0,%xmm9,%xmm9 3836 196,67,53,24,201,1, //vinsertf128 $0x1,%xmm9,%ymm9,%ymm9 3837 196,65,60,89,193, //vmulps %ymm9,%ymm8,%ymm8 3838 197,252,92,196, //vsubps %ymm4,%ymm0,%ymm0 3839 196,193,124,89,192, //vmulps %ymm8,%ymm0,%ymm0 3840 197,252,88,196, //vaddps %ymm4,%ymm0,%ymm0 3841 197,244,92,205, //vsubps %ymm5,%ymm1,%ymm1 3842 196,193,116,89,200, //vmulps %ymm8,%ymm1,%ymm1 3843 197,244,88,205, //vaddps %ymm5,%ymm1,%ymm1 3844 197,236,92,214, //vsubps %ymm6,%ymm2,%ymm2 3845 196,193,108,89,208, //vmulps %ymm8,%ymm2,%ymm2 3846 197,236,88,214, //vaddps %ymm6,%ymm2,%ymm2 3847 197,228,92,223, //vsubps %ymm7,%ymm3,%ymm3 3848 196,193,100,89,216, //vmulps %ymm8,%ymm3,%ymm3 3849 197,228,88,223, //vaddps %ymm7,%ymm3,%ymm3 3850 72,173, //lods %ds:(%rsi),%rax 3851 76,137,193, //mov %r8,%rcx 3852 255,224, //jmpq *%rax 3853 49,201, //xor %ecx,%ecx 3854 77,137,194, //mov %r8,%r10 3855 69,49,201, //xor %r9d,%r9d 3856 68,15,182,24, //movzbl (%rax),%r11d 3857 72,255,192, //inc %rax 3858 73,211,227, //shl %cl,%r11 3859 77,9,217, //or %r11,%r9 3860 72,131,193,8, //add $0x8,%rcx 3861 73,255,202, //dec %r10 3862 117,234, //jne 692 <_sk_lerp_u8_avx+0x8c> 3863 196,65,249,110,193, //vmovq %r9,%xmm8 3864 233,104,255,255,255, //jmpq 61a <_sk_lerp_u8_avx+0x14> 3865 }; 3866 3867 CODE const uint8_t sk_lerp_565_avx[] = { 3868 72,173, //lods %ds:(%rsi),%rax 3869 76,139,16, //mov (%rax),%r10 3870 72,133,201, //test %rcx,%rcx 3871 15,133,250,0,0,0, //jne 7ba <_sk_lerp_565_avx+0x108> 3872 196,65,122,111,4,122, //vmovdqu (%r10,%rdi,2),%xmm8 3873 197,225,239,219, //vpxor %xmm3,%xmm3,%xmm3 3874 197,185,105,219, //vpunpckhwd %xmm3,%xmm8,%xmm3 3875 196,66,121,51,192, //vpmovzxwd %xmm8,%xmm8 3876 196,99,61,24,195,1, //vinsertf128 $0x1,%xmm3,%ymm8,%ymm8 3877 184,0,248,0,0, //mov $0xf800,%eax 3878 197,249,110,216, //vmovd %eax,%xmm3 3879 197,249,112,219,0, //vpshufd $0x0,%xmm3,%xmm3 3880 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 3881 196,193,100,84,216, //vandps %ymm8,%ymm3,%ymm3 3882 197,124,91,203, //vcvtdq2ps %ymm3,%ymm9 3883 184,8,33,132,55, //mov $0x37842108,%eax 3884 197,249,110,216, //vmovd %eax,%xmm3 3885 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 3886 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 3887 197,52,89,203, //vmulps %ymm3,%ymm9,%ymm9 3888 184,224,7,0,0, //mov $0x7e0,%eax 3889 197,249,110,216, //vmovd %eax,%xmm3 3890 197,249,112,219,0, //vpshufd $0x0,%xmm3,%xmm3 3891 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 3892 196,193,100,84,216, //vandps %ymm8,%ymm3,%ymm3 3893 197,124,91,211, //vcvtdq2ps %ymm3,%ymm10 3894 184,33,8,2,58, //mov $0x3a020821,%eax 3895 197,249,110,216, //vmovd %eax,%xmm3 3896 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 3897 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 3898 197,44,89,211, //vmulps %ymm3,%ymm10,%ymm10 3899 184,31,0,0,0, //mov $0x1f,%eax 3900 197,249,110,216, //vmovd %eax,%xmm3 3901 197,249,112,219,0, //vpshufd $0x0,%xmm3,%xmm3 3902 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 3903 196,193,100,84,216, //vandps %ymm8,%ymm3,%ymm3 3904 197,124,91,195, //vcvtdq2ps %ymm3,%ymm8 3905 184,8,33,4,61, //mov $0x3d042108,%eax 3906 197,249,110,216, //vmovd %eax,%xmm3 3907 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 3908 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 3909 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3 3910 197,252,92,196, //vsubps %ymm4,%ymm0,%ymm0 3911 196,193,124,89,193, //vmulps %ymm9,%ymm0,%ymm0 3912 197,252,88,196, //vaddps %ymm4,%ymm0,%ymm0 3913 197,244,92,205, //vsubps %ymm5,%ymm1,%ymm1 3914 196,193,116,89,202, //vmulps %ymm10,%ymm1,%ymm1 3915 197,244,88,205, //vaddps %ymm5,%ymm1,%ymm1 3916 197,236,92,214, //vsubps %ymm6,%ymm2,%ymm2 3917 197,236,89,211, //vmulps %ymm3,%ymm2,%ymm2 3918 197,236,88,214, //vaddps %ymm6,%ymm2,%ymm2 3919 184,0,0,128,63, //mov $0x3f800000,%eax 3920 197,249,110,216, //vmovd %eax,%xmm3 3921 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 3922 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 3923 72,173, //lods %ds:(%rsi),%rax 3924 255,224, //jmpq *%rax 3925 65,137,200, //mov %ecx,%r8d 3926 65,128,224,7, //and $0x7,%r8b 3927 196,65,57,239,192, //vpxor %xmm8,%xmm8,%xmm8 3928 65,254,200, //dec %r8b 3929 65,128,248,6, //cmp $0x6,%r8b 3930 15,135,243,254,255,255, //ja 6c6 <_sk_lerp_565_avx+0x14> 3931 69,15,182,192, //movzbl %r8b,%r8d 3932 76,141,13,74,0,0,0, //lea 0x4a(%rip),%r9 # 828 <_sk_lerp_565_avx+0x176> 3933 75,99,4,129, //movslq (%r9,%r8,4),%rax 3934 76,1,200, //add %r9,%rax 3935 255,224, //jmpq *%rax 3936 197,225,239,219, //vpxor %xmm3,%xmm3,%xmm3 3937 196,65,97,196,68,122,12,6, //vpinsrw $0x6,0xc(%r10,%rdi,2),%xmm3,%xmm8 3938 196,65,57,196,68,122,10,5, //vpinsrw $0x5,0xa(%r10,%rdi,2),%xmm8,%xmm8 3939 196,65,57,196,68,122,8,4, //vpinsrw $0x4,0x8(%r10,%rdi,2),%xmm8,%xmm8 3940 196,65,57,196,68,122,6,3, //vpinsrw $0x3,0x6(%r10,%rdi,2),%xmm8,%xmm8 3941 196,65,57,196,68,122,4,2, //vpinsrw $0x2,0x4(%r10,%rdi,2),%xmm8,%xmm8 3942 196,65,57,196,68,122,2,1, //vpinsrw $0x1,0x2(%r10,%rdi,2),%xmm8,%xmm8 3943 196,65,57,196,4,122,0, //vpinsrw $0x0,(%r10,%rdi,2),%xmm8,%xmm8 3944 233,159,254,255,255, //jmpq 6c6 <_sk_lerp_565_avx+0x14> 3945 144, //nop 3946 243,255, //repz (bad) 3947 255, //(bad) 3948 255, //(bad) 3949 235,255, //jmp 82d <_sk_lerp_565_avx+0x17b> 3950 255, //(bad) 3951 255,227, //jmpq *%rbx 3952 255, //(bad) 3953 255, //(bad) 3954 255, //(bad) 3955 219,255, //(bad) 3956 255, //(bad) 3957 255,211, //callq *%rbx 3958 255, //(bad) 3959 255, //(bad) 3960 255,203, //dec %ebx 3961 255, //(bad) 3962 255, //(bad) 3963 255, //(bad) 3964 191, //.byte 0xbf 3965 255, //(bad) 3966 255, //(bad) 3967 255, //.byte 0xff 3968 }; 3969 3970 CODE const uint8_t sk_load_tables_avx[] = { 3971 85, //push %rbp 3972 65,87, //push %r15 3973 65,86, //push %r14 3974 65,85, //push %r13 3975 65,84, //push %r12 3976 83, //push %rbx 3977 72,173, //lods %ds:(%rsi),%rax 3978 76,139,0, //mov (%rax),%r8 3979 72,133,201, //test %rcx,%rcx 3980 15,133,56,2,0,0, //jne a94 <_sk_load_tables_avx+0x250> 3981 196,65,124,16,4,184, //vmovups (%r8,%rdi,4),%ymm8 3982 187,255,0,0,0, //mov $0xff,%ebx 3983 197,249,110,195, //vmovd %ebx,%xmm0 3984 197,249,112,192,0, //vpshufd $0x0,%xmm0,%xmm0 3985 196,99,125,24,200,1, //vinsertf128 $0x1,%xmm0,%ymm0,%ymm9 3986 196,193,52,84,192, //vandps %ymm8,%ymm9,%ymm0 3987 196,193,249,126,193, //vmovq %xmm0,%r9 3988 69,137,203, //mov %r9d,%r11d 3989 196,195,249,22,194,1, //vpextrq $0x1,%xmm0,%r10 3990 69,137,214, //mov %r10d,%r14d 3991 73,193,234,32, //shr $0x20,%r10 3992 73,193,233,32, //shr $0x20,%r9 3993 196,227,125,25,192,1, //vextractf128 $0x1,%ymm0,%xmm0 3994 196,193,249,126,196, //vmovq %xmm0,%r12 3995 69,137,231, //mov %r12d,%r15d 3996 196,227,249,22,195,1, //vpextrq $0x1,%xmm0,%rbx 3997 65,137,221, //mov %ebx,%r13d 3998 72,193,235,32, //shr $0x20,%rbx 3999 73,193,236,32, //shr $0x20,%r12 4000 72,139,104,8, //mov 0x8(%rax),%rbp 4001 76,139,64,16, //mov 0x10(%rax),%r8 4002 196,161,122,16,68,189,0, //vmovss 0x0(%rbp,%r15,4),%xmm0 4003 196,163,121,33,68,165,0,16, //vinsertps $0x10,0x0(%rbp,%r12,4),%xmm0,%xmm0 4004 196,161,122,16,76,173,0, //vmovss 0x0(%rbp,%r13,4),%xmm1 4005 196,227,121,33,193,32, //vinsertps $0x20,%xmm1,%xmm0,%xmm0 4006 197,250,16,76,157,0, //vmovss 0x0(%rbp,%rbx,4),%xmm1 4007 196,227,121,33,193,48, //vinsertps $0x30,%xmm1,%xmm0,%xmm0 4008 196,161,122,16,76,157,0, //vmovss 0x0(%rbp,%r11,4),%xmm1 4009 196,163,113,33,76,141,0,16, //vinsertps $0x10,0x0(%rbp,%r9,4),%xmm1,%xmm1 4010 196,161,122,16,92,181,0, //vmovss 0x0(%rbp,%r14,4),%xmm3 4011 196,227,113,33,203,32, //vinsertps $0x20,%xmm3,%xmm1,%xmm1 4012 196,161,122,16,92,149,0, //vmovss 0x0(%rbp,%r10,4),%xmm3 4013 196,227,113,33,203,48, //vinsertps $0x30,%xmm3,%xmm1,%xmm1 4014 196,227,117,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm1,%ymm0 4015 196,193,113,114,208,8, //vpsrld $0x8,%xmm8,%xmm1 4016 196,67,125,25,194,1, //vextractf128 $0x1,%ymm8,%xmm10 4017 196,193,105,114,210,8, //vpsrld $0x8,%xmm10,%xmm2 4018 196,227,117,24,202,1, //vinsertf128 $0x1,%xmm2,%ymm1,%ymm1 4019 197,180,84,201, //vandps %ymm1,%ymm9,%ymm1 4020 196,193,249,126,201, //vmovq %xmm1,%r9 4021 69,137,203, //mov %r9d,%r11d 4022 196,195,249,22,202,1, //vpextrq $0x1,%xmm1,%r10 4023 69,137,214, //mov %r10d,%r14d 4024 73,193,234,32, //shr $0x20,%r10 4025 73,193,233,32, //shr $0x20,%r9 4026 196,227,125,25,201,1, //vextractf128 $0x1,%ymm1,%xmm1 4027 196,225,249,126,205, //vmovq %xmm1,%rbp 4028 65,137,239, //mov %ebp,%r15d 4029 196,227,249,22,203,1, //vpextrq $0x1,%xmm1,%rbx 4030 65,137,220, //mov %ebx,%r12d 4031 72,193,235,32, //shr $0x20,%rbx 4032 72,193,237,32, //shr $0x20,%rbp 4033 196,129,122,16,12,184, //vmovss (%r8,%r15,4),%xmm1 4034 196,195,113,33,12,168,16, //vinsertps $0x10,(%r8,%rbp,4),%xmm1,%xmm1 4035 196,129,122,16,20,160, //vmovss (%r8,%r12,4),%xmm2 4036 196,227,113,33,202,32, //vinsertps $0x20,%xmm2,%xmm1,%xmm1 4037 196,193,122,16,20,152, //vmovss (%r8,%rbx,4),%xmm2 4038 196,227,113,33,202,48, //vinsertps $0x30,%xmm2,%xmm1,%xmm1 4039 196,129,122,16,20,152, //vmovss (%r8,%r11,4),%xmm2 4040 196,131,105,33,20,136,16, //vinsertps $0x10,(%r8,%r9,4),%xmm2,%xmm2 4041 196,129,122,16,28,176, //vmovss (%r8,%r14,4),%xmm3 4042 196,227,105,33,211,32, //vinsertps $0x20,%xmm3,%xmm2,%xmm2 4043 196,129,122,16,28,144, //vmovss (%r8,%r10,4),%xmm3 4044 196,227,105,33,211,48, //vinsertps $0x30,%xmm3,%xmm2,%xmm2 4045 196,227,109,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm2,%ymm1 4046 72,139,64,24, //mov 0x18(%rax),%rax 4047 196,193,105,114,208,16, //vpsrld $0x10,%xmm8,%xmm2 4048 196,193,97,114,210,16, //vpsrld $0x10,%xmm10,%xmm3 4049 196,227,109,24,211,1, //vinsertf128 $0x1,%xmm3,%ymm2,%ymm2 4050 197,180,84,210, //vandps %ymm2,%ymm9,%ymm2 4051 196,193,249,126,208, //vmovq %xmm2,%r8 4052 69,137,194, //mov %r8d,%r10d 4053 196,195,249,22,209,1, //vpextrq $0x1,%xmm2,%r9 4054 69,137,203, //mov %r9d,%r11d 4055 73,193,233,32, //shr $0x20,%r9 4056 73,193,232,32, //shr $0x20,%r8 4057 196,227,125,25,210,1, //vextractf128 $0x1,%ymm2,%xmm2 4058 196,225,249,126,213, //vmovq %xmm2,%rbp 4059 65,137,238, //mov %ebp,%r14d 4060 196,227,249,22,211,1, //vpextrq $0x1,%xmm2,%rbx 4061 65,137,223, //mov %ebx,%r15d 4062 72,193,235,32, //shr $0x20,%rbx 4063 72,193,237,32, //shr $0x20,%rbp 4064 196,161,122,16,20,176, //vmovss (%rax,%r14,4),%xmm2 4065 196,227,105,33,20,168,16, //vinsertps $0x10,(%rax,%rbp,4),%xmm2,%xmm2 4066 196,161,122,16,28,184, //vmovss (%rax,%r15,4),%xmm3 4067 196,227,105,33,211,32, //vinsertps $0x20,%xmm3,%xmm2,%xmm2 4068 197,250,16,28,152, //vmovss (%rax,%rbx,4),%xmm3 4069 196,99,105,33,203,48, //vinsertps $0x30,%xmm3,%xmm2,%xmm9 4070 196,161,122,16,28,144, //vmovss (%rax,%r10,4),%xmm3 4071 196,163,97,33,28,128,16, //vinsertps $0x10,(%rax,%r8,4),%xmm3,%xmm3 4072 196,161,122,16,20,152, //vmovss (%rax,%r11,4),%xmm2 4073 196,227,97,33,210,32, //vinsertps $0x20,%xmm2,%xmm3,%xmm2 4074 196,161,122,16,28,136, //vmovss (%rax,%r9,4),%xmm3 4075 196,227,105,33,211,48, //vinsertps $0x30,%xmm3,%xmm2,%xmm2 4076 196,195,109,24,209,1, //vinsertf128 $0x1,%xmm9,%ymm2,%ymm2 4077 196,193,57,114,208,24, //vpsrld $0x18,%xmm8,%xmm8 4078 196,193,97,114,210,24, //vpsrld $0x18,%xmm10,%xmm3 4079 196,227,61,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm8,%ymm3 4080 197,124,91,195, //vcvtdq2ps %ymm3,%ymm8 4081 184,129,128,128,59, //mov $0x3b808081,%eax 4082 197,249,110,216, //vmovd %eax,%xmm3 4083 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 4084 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 4085 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3 4086 72,173, //lods %ds:(%rsi),%rax 4087 91, //pop %rbx 4088 65,92, //pop %r12 4089 65,93, //pop %r13 4090 65,94, //pop %r14 4091 65,95, //pop %r15 4092 93, //pop %rbp 4093 255,224, //jmpq *%rax 4094 137,203, //mov %ecx,%ebx 4095 128,227,7, //and $0x7,%bl 4096 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8 4097 254,203, //dec %bl 4098 128,251,6, //cmp $0x6,%bl 4099 15,135,185,253,255,255, //ja 862 <_sk_load_tables_avx+0x1e> 4100 15,182,219, //movzbl %bl,%ebx 4101 76,141,13,137,0,0,0, //lea 0x89(%rip),%r9 # b3c <_sk_load_tables_avx+0x2f8> 4102 73,99,28,153, //movslq (%r9,%rbx,4),%rbx 4103 76,1,203, //add %r9,%rbx 4104 255,227, //jmpq *%rbx 4105 196,193,121,110,68,184,24, //vmovd 0x18(%r8,%rdi,4),%xmm0 4106 197,249,112,192,68, //vpshufd $0x44,%xmm0,%xmm0 4107 196,227,125,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm0,%ymm0 4108 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1 4109 196,99,117,12,192,64, //vblendps $0x40,%ymm0,%ymm1,%ymm8 4110 196,99,125,25,192,1, //vextractf128 $0x1,%ymm8,%xmm0 4111 196,195,121,34,68,184,20,1, //vpinsrd $0x1,0x14(%r8,%rdi,4),%xmm0,%xmm0 4112 196,99,61,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm8,%ymm8 4113 196,99,125,25,192,1, //vextractf128 $0x1,%ymm8,%xmm0 4114 196,195,121,34,68,184,16,0, //vpinsrd $0x0,0x10(%r8,%rdi,4),%xmm0,%xmm0 4115 196,99,61,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm8,%ymm8 4116 196,195,57,34,68,184,12,3, //vpinsrd $0x3,0xc(%r8,%rdi,4),%xmm8,%xmm0 4117 196,99,61,12,192,15, //vblendps $0xf,%ymm0,%ymm8,%ymm8 4118 196,195,57,34,68,184,8,2, //vpinsrd $0x2,0x8(%r8,%rdi,4),%xmm8,%xmm0 4119 196,99,61,12,192,15, //vblendps $0xf,%ymm0,%ymm8,%ymm8 4120 196,195,57,34,68,184,4,1, //vpinsrd $0x1,0x4(%r8,%rdi,4),%xmm8,%xmm0 4121 196,99,61,12,192,15, //vblendps $0xf,%ymm0,%ymm8,%ymm8 4122 196,195,57,34,4,184,0, //vpinsrd $0x0,(%r8,%rdi,4),%xmm8,%xmm0 4123 196,99,61,12,192,15, //vblendps $0xf,%ymm0,%ymm8,%ymm8 4124 233,38,253,255,255, //jmpq 862 <_sk_load_tables_avx+0x1e> 4125 238, //out %al,(%dx) 4126 255, //(bad) 4127 255, //(bad) 4128 255,224, //jmpq *%rax 4129 255, //(bad) 4130 255, //(bad) 4131 255,210, //callq *%rdx 4132 255, //(bad) 4133 255, //(bad) 4134 255,196, //inc %esp 4135 255, //(bad) 4136 255, //(bad) 4137 255,176,255,255,255,156, //pushq -0x63000001(%rax) 4138 255, //(bad) 4139 255, //(bad) 4140 255, //.byte 0xff 4141 128,255,255, //cmp $0xff,%bh 4142 255, //.byte 0xff 4143 }; 4144 4145 CODE const uint8_t sk_load_a8_avx[] = { 4146 73,137,200, //mov %rcx,%r8 4147 72,173, //lods %ds:(%rsi),%rax 4148 72,139,0, //mov (%rax),%rax 4149 72,1,248, //add %rdi,%rax 4150 77,133,192, //test %r8,%r8 4151 117,74, //jne bb2 <_sk_load_a8_avx+0x5a> 4152 197,250,126,0, //vmovq (%rax),%xmm0 4153 196,226,121,49,200, //vpmovzxbd %xmm0,%xmm1 4154 196,227,121,4,192,229, //vpermilps $0xe5,%xmm0,%xmm0 4155 196,226,121,49,192, //vpmovzxbd %xmm0,%xmm0 4156 196,227,117,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm1,%ymm0 4157 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0 4158 184,129,128,128,59, //mov $0x3b808081,%eax 4159 197,249,110,200, //vmovd %eax,%xmm1 4160 196,227,121,4,201,0, //vpermilps $0x0,%xmm1,%xmm1 4161 196,227,117,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm1,%ymm1 4162 197,252,89,217, //vmulps %ymm1,%ymm0,%ymm3 4163 72,173, //lods %ds:(%rsi),%rax 4164 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0 4165 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1 4166 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2 4167 76,137,193, //mov %r8,%rcx 4168 255,224, //jmpq *%rax 4169 49,201, //xor %ecx,%ecx 4170 77,137,194, //mov %r8,%r10 4171 69,49,201, //xor %r9d,%r9d 4172 68,15,182,24, //movzbl (%rax),%r11d 4173 72,255,192, //inc %rax 4174 73,211,227, //shl %cl,%r11 4175 77,9,217, //or %r11,%r9 4176 72,131,193,8, //add $0x8,%rcx 4177 73,255,202, //dec %r10 4178 117,234, //jne bba <_sk_load_a8_avx+0x62> 4179 196,193,249,110,193, //vmovq %r9,%xmm0 4180 235,149, //jmp b6c <_sk_load_a8_avx+0x14> 4181 }; 4182 4183 CODE const uint8_t sk_store_a8_avx[] = { 4184 72,173, //lods %ds:(%rsi),%rax 4185 76,139,8, //mov (%rax),%r9 4186 184,0,0,127,67, //mov $0x437f0000,%eax 4187 197,121,110,192, //vmovd %eax,%xmm8 4188 196,67,121,4,192,0, //vpermilps $0x0,%xmm8,%xmm8 4189 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8 4190 197,60,89,195, //vmulps %ymm3,%ymm8,%ymm8 4191 196,65,125,91,192, //vcvtps2dq %ymm8,%ymm8 4192 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9 4193 196,66,57,43,193, //vpackusdw %xmm9,%xmm8,%xmm8 4194 196,65,57,103,192, //vpackuswb %xmm8,%xmm8,%xmm8 4195 72,133,201, //test %rcx,%rcx 4196 117,10, //jne c19 <_sk_store_a8_avx+0x42> 4197 196,65,123,17,4,57, //vmovsd %xmm8,(%r9,%rdi,1) 4198 72,173, //lods %ds:(%rsi),%rax 4199 255,224, //jmpq *%rax 4200 65,137,200, //mov %ecx,%r8d 4201 65,128,224,7, //and $0x7,%r8b 4202 65,254,200, //dec %r8b 4203 65,128,248,6, //cmp $0x6,%r8b 4204 119,236, //ja c15 <_sk_store_a8_avx+0x3e> 4205 196,66,121,48,192, //vpmovzxbw %xmm8,%xmm8 4206 65,15,182,192, //movzbl %r8b,%eax 4207 76,141,5,67,0,0,0, //lea 0x43(%rip),%r8 # c7c <_sk_store_a8_avx+0xa5> 4208 73,99,4,128, //movslq (%r8,%rax,4),%rax 4209 76,1,192, //add %r8,%rax 4210 255,224, //jmpq *%rax 4211 196,67,121,20,68,57,6,12, //vpextrb $0xc,%xmm8,0x6(%r9,%rdi,1) 4212 196,67,121,20,68,57,5,10, //vpextrb $0xa,%xmm8,0x5(%r9,%rdi,1) 4213 196,67,121,20,68,57,4,8, //vpextrb $0x8,%xmm8,0x4(%r9,%rdi,1) 4214 196,67,121,20,68,57,3,6, //vpextrb $0x6,%xmm8,0x3(%r9,%rdi,1) 4215 196,67,121,20,68,57,2,4, //vpextrb $0x4,%xmm8,0x2(%r9,%rdi,1) 4216 196,67,121,20,68,57,1,2, //vpextrb $0x2,%xmm8,0x1(%r9,%rdi,1) 4217 196,67,121,20,4,57,0, //vpextrb $0x0,%xmm8,(%r9,%rdi,1) 4218 235,154, //jmp c15 <_sk_store_a8_avx+0x3e> 4219 144, //nop 4220 246,255, //idiv %bh 4221 255, //(bad) 4222 255, //(bad) 4223 238, //out %al,(%dx) 4224 255, //(bad) 4225 255, //(bad) 4226 255,230, //jmpq *%rsi 4227 255, //(bad) 4228 255, //(bad) 4229 255, //(bad) 4230 222,255, //fdivrp %st,%st(7) 4231 255, //(bad) 4232 255,214, //callq *%rsi 4233 255, //(bad) 4234 255, //(bad) 4235 255,206, //dec %esi 4236 255, //(bad) 4237 255, //(bad) 4238 255,198, //inc %esi 4239 255, //(bad) 4240 255, //(bad) 4241 255, //.byte 0xff 4242 }; 4243 4244 CODE const uint8_t sk_load_565_avx[] = { 4245 72,173, //lods %ds:(%rsi),%rax 4246 76,139,16, //mov (%rax),%r10 4247 72,133,201, //test %rcx,%rcx 4248 15,133,209,0,0,0, //jne d77 <_sk_load_565_avx+0xdf> 4249 196,193,122,111,4,122, //vmovdqu (%r10,%rdi,2),%xmm0 4250 197,241,239,201, //vpxor %xmm1,%xmm1,%xmm1 4251 197,249,105,201, //vpunpckhwd %xmm1,%xmm0,%xmm1 4252 196,226,121,51,192, //vpmovzxwd %xmm0,%xmm0 4253 196,227,125,24,209,1, //vinsertf128 $0x1,%xmm1,%ymm0,%ymm2 4254 184,0,248,0,0, //mov $0xf800,%eax 4255 197,249,110,192, //vmovd %eax,%xmm0 4256 197,249,112,192,0, //vpshufd $0x0,%xmm0,%xmm0 4257 196,227,125,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm0,%ymm0 4258 197,252,84,194, //vandps %ymm2,%ymm0,%ymm0 4259 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0 4260 184,8,33,132,55, //mov $0x37842108,%eax 4261 197,249,110,200, //vmovd %eax,%xmm1 4262 196,227,121,4,201,0, //vpermilps $0x0,%xmm1,%xmm1 4263 196,227,117,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm1,%ymm1 4264 197,252,89,193, //vmulps %ymm1,%ymm0,%ymm0 4265 184,224,7,0,0, //mov $0x7e0,%eax 4266 197,249,110,200, //vmovd %eax,%xmm1 4267 197,249,112,201,0, //vpshufd $0x0,%xmm1,%xmm1 4268 196,227,117,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm1,%ymm1 4269 197,244,84,202, //vandps %ymm2,%ymm1,%ymm1 4270 197,252,91,201, //vcvtdq2ps %ymm1,%ymm1 4271 184,33,8,2,58, //mov $0x3a020821,%eax 4272 197,249,110,216, //vmovd %eax,%xmm3 4273 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 4274 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 4275 197,244,89,203, //vmulps %ymm3,%ymm1,%ymm1 4276 184,31,0,0,0, //mov $0x1f,%eax 4277 197,249,110,216, //vmovd %eax,%xmm3 4278 197,249,112,219,0, //vpshufd $0x0,%xmm3,%xmm3 4279 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 4280 197,228,84,210, //vandps %ymm2,%ymm3,%ymm2 4281 197,252,91,210, //vcvtdq2ps %ymm2,%ymm2 4282 184,8,33,4,61, //mov $0x3d042108,%eax 4283 197,249,110,216, //vmovd %eax,%xmm3 4284 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 4285 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 4286 197,236,89,211, //vmulps %ymm3,%ymm2,%ymm2 4287 184,0,0,128,63, //mov $0x3f800000,%eax 4288 197,249,110,216, //vmovd %eax,%xmm3 4289 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 4290 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 4291 72,173, //lods %ds:(%rsi),%rax 4292 255,224, //jmpq *%rax 4293 65,137,200, //mov %ecx,%r8d 4294 65,128,224,7, //and $0x7,%r8b 4295 197,249,239,192, //vpxor %xmm0,%xmm0,%xmm0 4296 65,254,200, //dec %r8b 4297 65,128,248,6, //cmp $0x6,%r8b 4298 15,135,29,255,255,255, //ja cac <_sk_load_565_avx+0x14> 4299 69,15,182,192, //movzbl %r8b,%r8d 4300 76,141,13,74,0,0,0, //lea 0x4a(%rip),%r9 # de4 <_sk_load_565_avx+0x14c> 4301 75,99,4,129, //movslq (%r9,%r8,4),%rax 4302 76,1,200, //add %r9,%rax 4303 255,224, //jmpq *%rax 4304 197,249,239,192, //vpxor %xmm0,%xmm0,%xmm0 4305 196,193,121,196,68,122,12,6, //vpinsrw $0x6,0xc(%r10,%rdi,2),%xmm0,%xmm0 4306 196,193,121,196,68,122,10,5, //vpinsrw $0x5,0xa(%r10,%rdi,2),%xmm0,%xmm0 4307 196,193,121,196,68,122,8,4, //vpinsrw $0x4,0x8(%r10,%rdi,2),%xmm0,%xmm0 4308 196,193,121,196,68,122,6,3, //vpinsrw $0x3,0x6(%r10,%rdi,2),%xmm0,%xmm0 4309 196,193,121,196,68,122,4,2, //vpinsrw $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0 4310 196,193,121,196,68,122,2,1, //vpinsrw $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0 4311 196,193,121,196,4,122,0, //vpinsrw $0x0,(%r10,%rdi,2),%xmm0,%xmm0 4312 233,201,254,255,255, //jmpq cac <_sk_load_565_avx+0x14> 4313 144, //nop 4314 243,255, //repz (bad) 4315 255, //(bad) 4316 255, //(bad) 4317 235,255, //jmp de9 <_sk_load_565_avx+0x151> 4318 255, //(bad) 4319 255,227, //jmpq *%rbx 4320 255, //(bad) 4321 255, //(bad) 4322 255, //(bad) 4323 219,255, //(bad) 4324 255, //(bad) 4325 255,211, //callq *%rbx 4326 255, //(bad) 4327 255, //(bad) 4328 255,203, //dec %ebx 4329 255, //(bad) 4330 255, //(bad) 4331 255, //(bad) 4332 191, //.byte 0xbf 4333 255, //(bad) 4334 255, //(bad) 4335 255, //.byte 0xff 4336 }; 4337 4338 CODE const uint8_t sk_store_565_avx[] = { 4339 72,173, //lods %ds:(%rsi),%rax 4340 76,139,8, //mov (%rax),%r9 4341 184,0,0,248,65, //mov $0x41f80000,%eax 4342 197,121,110,192, //vmovd %eax,%xmm8 4343 196,67,121,4,192,0, //vpermilps $0x0,%xmm8,%xmm8 4344 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8 4345 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9 4346 196,65,125,91,201, //vcvtps2dq %ymm9,%ymm9 4347 196,193,41,114,241,11, //vpslld $0xb,%xmm9,%xmm10 4348 196,67,125,25,201,1, //vextractf128 $0x1,%ymm9,%xmm9 4349 196,193,49,114,241,11, //vpslld $0xb,%xmm9,%xmm9 4350 196,67,45,24,201,1, //vinsertf128 $0x1,%xmm9,%ymm10,%ymm9 4351 184,0,0,124,66, //mov $0x427c0000,%eax 4352 197,121,110,208, //vmovd %eax,%xmm10 4353 196,67,121,4,210,0, //vpermilps $0x0,%xmm10,%xmm10 4354 196,67,45,24,210,1, //vinsertf128 $0x1,%xmm10,%ymm10,%ymm10 4355 197,44,89,209, //vmulps %ymm1,%ymm10,%ymm10 4356 196,65,125,91,210, //vcvtps2dq %ymm10,%ymm10 4357 196,193,33,114,242,5, //vpslld $0x5,%xmm10,%xmm11 4358 196,67,125,25,210,1, //vextractf128 $0x1,%ymm10,%xmm10 4359 196,193,41,114,242,5, //vpslld $0x5,%xmm10,%xmm10 4360 196,67,37,24,210,1, //vinsertf128 $0x1,%xmm10,%ymm11,%ymm10 4361 196,65,45,86,201, //vorpd %ymm9,%ymm10,%ymm9 4362 197,60,89,194, //vmulps %ymm2,%ymm8,%ymm8 4363 196,65,125,91,192, //vcvtps2dq %ymm8,%ymm8 4364 196,65,53,86,192, //vorpd %ymm8,%ymm9,%ymm8 4365 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9 4366 196,66,57,43,193, //vpackusdw %xmm9,%xmm8,%xmm8 4367 72,133,201, //test %rcx,%rcx 4368 117,10, //jne e9e <_sk_store_565_avx+0x9e> 4369 196,65,122,127,4,121, //vmovdqu %xmm8,(%r9,%rdi,2) 4370 72,173, //lods %ds:(%rsi),%rax 4371 255,224, //jmpq *%rax 4372 65,137,200, //mov %ecx,%r8d 4373 65,128,224,7, //and $0x7,%r8b 4374 65,254,200, //dec %r8b 4375 65,128,248,6, //cmp $0x6,%r8b 4376 119,236, //ja e9a <_sk_store_565_avx+0x9a> 4377 65,15,182,192, //movzbl %r8b,%eax 4378 76,141,5,67,0,0,0, //lea 0x43(%rip),%r8 # efc <_sk_store_565_avx+0xfc> 4379 73,99,4,128, //movslq (%r8,%rax,4),%rax 4380 76,1,192, //add %r8,%rax 4381 255,224, //jmpq *%rax 4382 196,67,121,21,68,121,12,6, //vpextrw $0x6,%xmm8,0xc(%r9,%rdi,2) 4383 196,67,121,21,68,121,10,5, //vpextrw $0x5,%xmm8,0xa(%r9,%rdi,2) 4384 196,67,121,21,68,121,8,4, //vpextrw $0x4,%xmm8,0x8(%r9,%rdi,2) 4385 196,67,121,21,68,121,6,3, //vpextrw $0x3,%xmm8,0x6(%r9,%rdi,2) 4386 196,67,121,21,68,121,4,2, //vpextrw $0x2,%xmm8,0x4(%r9,%rdi,2) 4387 196,67,121,21,68,121,2,1, //vpextrw $0x1,%xmm8,0x2(%r9,%rdi,2) 4388 196,67,121,21,4,121,0, //vpextrw $0x0,%xmm8,(%r9,%rdi,2) 4389 235,159, //jmp e9a <_sk_store_565_avx+0x9a> 4390 144, //nop 4391 246,255, //idiv %bh 4392 255, //(bad) 4393 255, //(bad) 4394 238, //out %al,(%dx) 4395 255, //(bad) 4396 255, //(bad) 4397 255,230, //jmpq *%rsi 4398 255, //(bad) 4399 255, //(bad) 4400 255, //(bad) 4401 222,255, //fdivrp %st,%st(7) 4402 255, //(bad) 4403 255,214, //callq *%rsi 4404 255, //(bad) 4405 255, //(bad) 4406 255,206, //dec %esi 4407 255, //(bad) 4408 255, //(bad) 4409 255,198, //inc %esi 4410 255, //(bad) 4411 255, //(bad) 4412 255, //.byte 0xff 4413 }; 4414 4415 CODE const uint8_t sk_load_8888_avx[] = { 4416 72,173, //lods %ds:(%rsi),%rax 4417 76,139,16, //mov (%rax),%r10 4418 72,133,201, //test %rcx,%rcx 4419 15,133,157,0,0,0, //jne fc3 <_sk_load_8888_avx+0xab> 4420 196,65,124,16,12,186, //vmovups (%r10,%rdi,4),%ymm9 4421 184,255,0,0,0, //mov $0xff,%eax 4422 197,249,110,192, //vmovd %eax,%xmm0 4423 197,249,112,192,0, //vpshufd $0x0,%xmm0,%xmm0 4424 196,99,125,24,216,1, //vinsertf128 $0x1,%xmm0,%ymm0,%ymm11 4425 196,193,36,84,193, //vandps %ymm9,%ymm11,%ymm0 4426 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0 4427 184,129,128,128,59, //mov $0x3b808081,%eax 4428 197,249,110,200, //vmovd %eax,%xmm1 4429 196,227,121,4,201,0, //vpermilps $0x0,%xmm1,%xmm1 4430 196,99,117,24,193,1, //vinsertf128 $0x1,%xmm1,%ymm1,%ymm8 4431 196,193,124,89,192, //vmulps %ymm8,%ymm0,%ymm0 4432 196,193,41,114,209,8, //vpsrld $0x8,%xmm9,%xmm10 4433 196,99,125,25,203,1, //vextractf128 $0x1,%ymm9,%xmm3 4434 197,241,114,211,8, //vpsrld $0x8,%xmm3,%xmm1 4435 196,227,45,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm10,%ymm1 4436 197,164,84,201, //vandps %ymm1,%ymm11,%ymm1 4437 197,252,91,201, //vcvtdq2ps %ymm1,%ymm1 4438 196,193,116,89,200, //vmulps %ymm8,%ymm1,%ymm1 4439 196,193,41,114,209,16, //vpsrld $0x10,%xmm9,%xmm10 4440 197,233,114,211,16, //vpsrld $0x10,%xmm3,%xmm2 4441 196,227,45,24,210,1, //vinsertf128 $0x1,%xmm2,%ymm10,%ymm2 4442 197,164,84,210, //vandps %ymm2,%ymm11,%ymm2 4443 197,252,91,210, //vcvtdq2ps %ymm2,%ymm2 4444 196,193,108,89,208, //vmulps %ymm8,%ymm2,%ymm2 4445 196,193,49,114,209,24, //vpsrld $0x18,%xmm9,%xmm9 4446 197,225,114,211,24, //vpsrld $0x18,%xmm3,%xmm3 4447 196,227,53,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm9,%ymm3 4448 197,252,91,219, //vcvtdq2ps %ymm3,%ymm3 4449 196,193,100,89,216, //vmulps %ymm8,%ymm3,%ymm3 4450 72,173, //lods %ds:(%rsi),%rax 4451 255,224, //jmpq *%rax 4452 65,137,200, //mov %ecx,%r8d 4453 65,128,224,7, //and $0x7,%r8b 4454 196,65,52,87,201, //vxorps %ymm9,%ymm9,%ymm9 4455 65,254,200, //dec %r8b 4456 65,128,248,6, //cmp $0x6,%r8b 4457 15,135,80,255,255,255, //ja f2c <_sk_load_8888_avx+0x14> 4458 69,15,182,192, //movzbl %r8b,%r8d 4459 76,141,13,137,0,0,0, //lea 0x89(%rip),%r9 # 1070 <_sk_load_8888_avx+0x158> 4460 75,99,4,129, //movslq (%r9,%r8,4),%rax 4461 76,1,200, //add %r9,%rax 4462 255,224, //jmpq *%rax 4463 196,193,121,110,68,186,24, //vmovd 0x18(%r10,%rdi,4),%xmm0 4464 197,249,112,192,68, //vpshufd $0x44,%xmm0,%xmm0 4465 196,227,125,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm0,%ymm0 4466 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1 4467 196,99,117,12,200,64, //vblendps $0x40,%ymm0,%ymm1,%ymm9 4468 196,99,125,25,200,1, //vextractf128 $0x1,%ymm9,%xmm0 4469 196,195,121,34,68,186,20,1, //vpinsrd $0x1,0x14(%r10,%rdi,4),%xmm0,%xmm0 4470 196,99,53,24,200,1, //vinsertf128 $0x1,%xmm0,%ymm9,%ymm9 4471 196,99,125,25,200,1, //vextractf128 $0x1,%ymm9,%xmm0 4472 196,195,121,34,68,186,16,0, //vpinsrd $0x0,0x10(%r10,%rdi,4),%xmm0,%xmm0 4473 196,99,53,24,200,1, //vinsertf128 $0x1,%xmm0,%ymm9,%ymm9 4474 196,195,49,34,68,186,12,3, //vpinsrd $0x3,0xc(%r10,%rdi,4),%xmm9,%xmm0 4475 196,99,53,12,200,15, //vblendps $0xf,%ymm0,%ymm9,%ymm9 4476 196,195,49,34,68,186,8,2, //vpinsrd $0x2,0x8(%r10,%rdi,4),%xmm9,%xmm0 4477 196,99,53,12,200,15, //vblendps $0xf,%ymm0,%ymm9,%ymm9 4478 196,195,49,34,68,186,4,1, //vpinsrd $0x1,0x4(%r10,%rdi,4),%xmm9,%xmm0 4479 196,99,53,12,200,15, //vblendps $0xf,%ymm0,%ymm9,%ymm9 4480 196,195,49,34,4,186,0, //vpinsrd $0x0,(%r10,%rdi,4),%xmm9,%xmm0 4481 196,99,53,12,200,15, //vblendps $0xf,%ymm0,%ymm9,%ymm9 4482 233,188,254,255,255, //jmpq f2c <_sk_load_8888_avx+0x14> 4483 238, //out %al,(%dx) 4484 255, //(bad) 4485 255, //(bad) 4486 255,224, //jmpq *%rax 4487 255, //(bad) 4488 255, //(bad) 4489 255,210, //callq *%rdx 4490 255, //(bad) 4491 255, //(bad) 4492 255,196, //inc %esp 4493 255, //(bad) 4494 255, //(bad) 4495 255,176,255,255,255,156, //pushq -0x63000001(%rax) 4496 255, //(bad) 4497 255, //(bad) 4498 255, //.byte 0xff 4499 128,255,255, //cmp $0xff,%bh 4500 255, //.byte 0xff 4501 }; 4502 4503 CODE const uint8_t sk_store_8888_avx[] = { 4504 72,173, //lods %ds:(%rsi),%rax 4505 76,139,8, //mov (%rax),%r9 4506 184,0,0,127,67, //mov $0x437f0000,%eax 4507 197,121,110,192, //vmovd %eax,%xmm8 4508 196,67,121,4,192,0, //vpermilps $0x0,%xmm8,%xmm8 4509 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8 4510 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9 4511 196,65,125,91,201, //vcvtps2dq %ymm9,%ymm9 4512 197,60,89,209, //vmulps %ymm1,%ymm8,%ymm10 4513 196,65,125,91,210, //vcvtps2dq %ymm10,%ymm10 4514 196,193,33,114,242,8, //vpslld $0x8,%xmm10,%xmm11 4515 196,67,125,25,210,1, //vextractf128 $0x1,%ymm10,%xmm10 4516 196,193,41,114,242,8, //vpslld $0x8,%xmm10,%xmm10 4517 196,67,37,24,210,1, //vinsertf128 $0x1,%xmm10,%ymm11,%ymm10 4518 196,65,45,86,201, //vorpd %ymm9,%ymm10,%ymm9 4519 197,60,89,210, //vmulps %ymm2,%ymm8,%ymm10 4520 196,65,125,91,210, //vcvtps2dq %ymm10,%ymm10 4521 196,193,33,114,242,16, //vpslld $0x10,%xmm10,%xmm11 4522 196,67,125,25,210,1, //vextractf128 $0x1,%ymm10,%xmm10 4523 196,193,41,114,242,16, //vpslld $0x10,%xmm10,%xmm10 4524 196,67,37,24,210,1, //vinsertf128 $0x1,%xmm10,%ymm11,%ymm10 4525 197,60,89,195, //vmulps %ymm3,%ymm8,%ymm8 4526 196,65,125,91,192, //vcvtps2dq %ymm8,%ymm8 4527 196,193,33,114,240,24, //vpslld $0x18,%xmm8,%xmm11 4528 196,67,125,25,192,1, //vextractf128 $0x1,%ymm8,%xmm8 4529 196,193,57,114,240,24, //vpslld $0x18,%xmm8,%xmm8 4530 196,67,37,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm11,%ymm8 4531 196,65,45,86,192, //vorpd %ymm8,%ymm10,%ymm8 4532 196,65,53,86,192, //vorpd %ymm8,%ymm9,%ymm8 4533 72,133,201, //test %rcx,%rcx 4534 117,10, //jne 1130 <_sk_store_8888_avx+0xa4> 4535 196,65,124,17,4,185, //vmovups %ymm8,(%r9,%rdi,4) 4536 72,173, //lods %ds:(%rsi),%rax 4537 255,224, //jmpq *%rax 4538 65,137,200, //mov %ecx,%r8d 4539 65,128,224,7, //and $0x7,%r8b 4540 65,254,200, //dec %r8b 4541 65,128,248,6, //cmp $0x6,%r8b 4542 119,236, //ja 112c <_sk_store_8888_avx+0xa0> 4543 65,15,182,192, //movzbl %r8b,%eax 4544 76,141,5,85,0,0,0, //lea 0x55(%rip),%r8 # 11a0 <_sk_store_8888_avx+0x114> 4545 73,99,4,128, //movslq (%r8,%rax,4),%rax 4546 76,1,192, //add %r8,%rax 4547 255,224, //jmpq *%rax 4548 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9 4549 196,67,121,22,76,185,24,2, //vpextrd $0x2,%xmm9,0x18(%r9,%rdi,4) 4550 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9 4551 196,67,121,22,76,185,20,1, //vpextrd $0x1,%xmm9,0x14(%r9,%rdi,4) 4552 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9 4553 196,65,122,17,76,185,16, //vmovss %xmm9,0x10(%r9,%rdi,4) 4554 196,67,121,22,68,185,12,3, //vpextrd $0x3,%xmm8,0xc(%r9,%rdi,4) 4555 196,67,121,22,68,185,8,2, //vpextrd $0x2,%xmm8,0x8(%r9,%rdi,4) 4556 196,67,121,22,68,185,4,1, //vpextrd $0x1,%xmm8,0x4(%r9,%rdi,4) 4557 196,65,121,126,4,185, //vmovd %xmm8,(%r9,%rdi,4) 4558 235,143, //jmp 112c <_sk_store_8888_avx+0xa0> 4559 15,31,0, //nopl (%rax) 4560 245, //cmc 4561 255, //(bad) 4562 255, //(bad) 4563 255, //(bad) 4564 237, //in (%dx),%eax 4565 255, //(bad) 4566 255, //(bad) 4567 255,229, //jmpq *%rbp 4568 255, //(bad) 4569 255, //(bad) 4570 255, //(bad) 4571 221,255, //(bad) 4572 255, //(bad) 4573 255,208, //callq *%rax 4574 255, //(bad) 4575 255, //(bad) 4576 255,194, //inc %edx 4577 255, //(bad) 4578 255, //(bad) 4579 255, //.byte 0xff 4580 180,255, //mov $0xff,%ah 4581 255, //(bad) 4582 255, //.byte 0xff 4583 }; 4584 4585 CODE const uint8_t sk_load_f16_avx[] = { 4586 72,173, //lods %ds:(%rsi),%rax 4587 72,139,0, //mov (%rax),%rax 4588 72,133,201, //test %rcx,%rcx 4589 15,133,2,1,0,0, //jne 12cc <_sk_load_f16_avx+0x110> 4590 197,121,16,4,248, //vmovupd (%rax,%rdi,8),%xmm8 4591 197,249,16,84,248,16, //vmovupd 0x10(%rax,%rdi,8),%xmm2 4592 197,249,16,92,248,32, //vmovupd 0x20(%rax,%rdi,8),%xmm3 4593 197,122,111,76,248,48, //vmovdqu 0x30(%rax,%rdi,8),%xmm9 4594 197,185,97,194, //vpunpcklwd %xmm2,%xmm8,%xmm0 4595 197,185,105,210, //vpunpckhwd %xmm2,%xmm8,%xmm2 4596 196,193,97,97,201, //vpunpcklwd %xmm9,%xmm3,%xmm1 4597 196,193,97,105,217, //vpunpckhwd %xmm9,%xmm3,%xmm3 4598 197,121,97,194, //vpunpcklwd %xmm2,%xmm0,%xmm8 4599 197,249,105,194, //vpunpckhwd %xmm2,%xmm0,%xmm0 4600 197,241,97,211, //vpunpcklwd %xmm3,%xmm1,%xmm2 4601 197,113,105,203, //vpunpckhwd %xmm3,%xmm1,%xmm9 4602 184,0,4,0,4, //mov $0x4000400,%eax 4603 197,249,110,216, //vmovd %eax,%xmm3 4604 197,249,112,219,0, //vpshufd $0x0,%xmm3,%xmm3 4605 196,193,97,101,200, //vpcmpgtw %xmm8,%xmm3,%xmm1 4606 196,65,113,223,192, //vpandn %xmm8,%xmm1,%xmm8 4607 197,225,101,200, //vpcmpgtw %xmm0,%xmm3,%xmm1 4608 197,241,223,192, //vpandn %xmm0,%xmm1,%xmm0 4609 197,225,101,202, //vpcmpgtw %xmm2,%xmm3,%xmm1 4610 197,241,223,202, //vpandn %xmm2,%xmm1,%xmm1 4611 196,193,97,101,209, //vpcmpgtw %xmm9,%xmm3,%xmm2 4612 196,193,105,223,209, //vpandn %xmm9,%xmm2,%xmm2 4613 196,66,121,51,208, //vpmovzxwd %xmm8,%xmm10 4614 196,98,121,51,201, //vpmovzxwd %xmm1,%xmm9 4615 197,225,239,219, //vpxor %xmm3,%xmm3,%xmm3 4616 197,57,105,195, //vpunpckhwd %xmm3,%xmm8,%xmm8 4617 197,241,105,203, //vpunpckhwd %xmm3,%xmm1,%xmm1 4618 196,98,121,51,216, //vpmovzxwd %xmm0,%xmm11 4619 196,98,121,51,226, //vpmovzxwd %xmm2,%xmm12 4620 197,121,105,235, //vpunpckhwd %xmm3,%xmm0,%xmm13 4621 197,105,105,243, //vpunpckhwd %xmm3,%xmm2,%xmm14 4622 196,193,121,114,242,13, //vpslld $0xd,%xmm10,%xmm0 4623 196,193,105,114,241,13, //vpslld $0xd,%xmm9,%xmm2 4624 196,227,125,24,194,1, //vinsertf128 $0x1,%xmm2,%ymm0,%ymm0 4625 184,0,0,128,119, //mov $0x77800000,%eax 4626 197,249,110,208, //vmovd %eax,%xmm2 4627 197,249,112,210,0, //vpshufd $0x0,%xmm2,%xmm2 4628 196,99,109,24,202,1, //vinsertf128 $0x1,%xmm2,%ymm2,%ymm9 4629 197,180,89,192, //vmulps %ymm0,%ymm9,%ymm0 4630 196,193,105,114,240,13, //vpslld $0xd,%xmm8,%xmm2 4631 197,241,114,241,13, //vpslld $0xd,%xmm1,%xmm1 4632 196,227,109,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm2,%ymm1 4633 197,180,89,201, //vmulps %ymm1,%ymm9,%ymm1 4634 196,193,105,114,243,13, //vpslld $0xd,%xmm11,%xmm2 4635 196,193,97,114,244,13, //vpslld $0xd,%xmm12,%xmm3 4636 196,227,109,24,211,1, //vinsertf128 $0x1,%xmm3,%ymm2,%ymm2 4637 197,180,89,210, //vmulps %ymm2,%ymm9,%ymm2 4638 196,193,57,114,245,13, //vpslld $0xd,%xmm13,%xmm8 4639 196,193,97,114,246,13, //vpslld $0xd,%xmm14,%xmm3 4640 196,227,61,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm8,%ymm3 4641 197,180,89,219, //vmulps %ymm3,%ymm9,%ymm3 4642 72,173, //lods %ds:(%rsi),%rax 4643 255,224, //jmpq *%rax 4644 197,123,16,4,248, //vmovsd (%rax,%rdi,8),%xmm8 4645 196,65,49,239,201, //vpxor %xmm9,%xmm9,%xmm9 4646 72,131,249,1, //cmp $0x1,%rcx 4647 116,79, //je 132b <_sk_load_f16_avx+0x16f> 4648 197,57,22,68,248,8, //vmovhpd 0x8(%rax,%rdi,8),%xmm8,%xmm8 4649 72,131,249,3, //cmp $0x3,%rcx 4650 114,67, //jb 132b <_sk_load_f16_avx+0x16f> 4651 197,251,16,84,248,16, //vmovsd 0x10(%rax,%rdi,8),%xmm2 4652 72,131,249,3, //cmp $0x3,%rcx 4653 116,68, //je 1338 <_sk_load_f16_avx+0x17c> 4654 197,233,22,84,248,24, //vmovhpd 0x18(%rax,%rdi,8),%xmm2,%xmm2 4655 72,131,249,5, //cmp $0x5,%rcx 4656 114,56, //jb 1338 <_sk_load_f16_avx+0x17c> 4657 197,251,16,92,248,32, //vmovsd 0x20(%rax,%rdi,8),%xmm3 4658 72,131,249,5, //cmp $0x5,%rcx 4659 15,132,209,254,255,255, //je 11e1 <_sk_load_f16_avx+0x25> 4660 197,225,22,92,248,40, //vmovhpd 0x28(%rax,%rdi,8),%xmm3,%xmm3 4661 72,131,249,7, //cmp $0x7,%rcx 4662 15,130,193,254,255,255, //jb 11e1 <_sk_load_f16_avx+0x25> 4663 197,122,126,76,248,48, //vmovq 0x30(%rax,%rdi,8),%xmm9 4664 233,182,254,255,255, //jmpq 11e1 <_sk_load_f16_avx+0x25> 4665 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3 4666 197,233,87,210, //vxorpd %xmm2,%xmm2,%xmm2 4667 233,169,254,255,255, //jmpq 11e1 <_sk_load_f16_avx+0x25> 4668 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3 4669 233,160,254,255,255, //jmpq 11e1 <_sk_load_f16_avx+0x25> 4670 }; 4671 4672 CODE const uint8_t sk_store_f16_avx[] = { 4673 72,173, //lods %ds:(%rsi),%rax 4674 76,139,0, //mov (%rax),%r8 4675 184,0,0,128,7, //mov $0x7800000,%eax 4676 197,121,110,192, //vmovd %eax,%xmm8 4677 196,65,121,112,192,0, //vpshufd $0x0,%xmm8,%xmm8 4678 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8 4679 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9 4680 196,67,125,25,202,1, //vextractf128 $0x1,%ymm9,%xmm10 4681 196,193,41,114,210,13, //vpsrld $0xd,%xmm10,%xmm10 4682 196,193,49,114,209,13, //vpsrld $0xd,%xmm9,%xmm9 4683 197,60,89,217, //vmulps %ymm1,%ymm8,%ymm11 4684 196,67,125,25,220,1, //vextractf128 $0x1,%ymm11,%xmm12 4685 196,193,25,114,212,13, //vpsrld $0xd,%xmm12,%xmm12 4686 196,193,33,114,211,13, //vpsrld $0xd,%xmm11,%xmm11 4687 197,60,89,234, //vmulps %ymm2,%ymm8,%ymm13 4688 196,67,125,25,238,1, //vextractf128 $0x1,%ymm13,%xmm14 4689 196,193,9,114,214,13, //vpsrld $0xd,%xmm14,%xmm14 4690 196,193,17,114,213,13, //vpsrld $0xd,%xmm13,%xmm13 4691 197,60,89,195, //vmulps %ymm3,%ymm8,%ymm8 4692 196,67,125,25,199,1, //vextractf128 $0x1,%ymm8,%xmm15 4693 196,193,1,114,215,13, //vpsrld $0xd,%xmm15,%xmm15 4694 196,193,57,114,208,13, //vpsrld $0xd,%xmm8,%xmm8 4695 196,193,33,115,251,2, //vpslldq $0x2,%xmm11,%xmm11 4696 196,65,33,235,201, //vpor %xmm9,%xmm11,%xmm9 4697 196,193,33,115,252,2, //vpslldq $0x2,%xmm12,%xmm11 4698 196,65,33,235,226, //vpor %xmm10,%xmm11,%xmm12 4699 196,193,57,115,248,2, //vpslldq $0x2,%xmm8,%xmm8 4700 196,65,57,235,197, //vpor %xmm13,%xmm8,%xmm8 4701 196,193,41,115,255,2, //vpslldq $0x2,%xmm15,%xmm10 4702 196,65,41,235,238, //vpor %xmm14,%xmm10,%xmm13 4703 196,65,49,98,216, //vpunpckldq %xmm8,%xmm9,%xmm11 4704 196,65,49,106,208, //vpunpckhdq %xmm8,%xmm9,%xmm10 4705 196,65,25,98,205, //vpunpckldq %xmm13,%xmm12,%xmm9 4706 196,65,25,106,197, //vpunpckhdq %xmm13,%xmm12,%xmm8 4707 72,133,201, //test %rcx,%rcx 4708 117,31, //jne 1417 <_sk_store_f16_avx+0xd6> 4709 196,65,120,17,28,248, //vmovups %xmm11,(%r8,%rdi,8) 4710 196,65,120,17,84,248,16, //vmovups %xmm10,0x10(%r8,%rdi,8) 4711 196,65,120,17,76,248,32, //vmovups %xmm9,0x20(%r8,%rdi,8) 4712 196,65,122,127,68,248,48, //vmovdqu %xmm8,0x30(%r8,%rdi,8) 4713 72,173, //lods %ds:(%rsi),%rax 4714 255,224, //jmpq *%rax 4715 196,65,121,214,28,248, //vmovq %xmm11,(%r8,%rdi,8) 4716 72,131,249,1, //cmp $0x1,%rcx 4717 116,240, //je 1413 <_sk_store_f16_avx+0xd2> 4718 196,65,121,23,92,248,8, //vmovhpd %xmm11,0x8(%r8,%rdi,8) 4719 72,131,249,3, //cmp $0x3,%rcx 4720 114,227, //jb 1413 <_sk_store_f16_avx+0xd2> 4721 196,65,121,214,84,248,16, //vmovq %xmm10,0x10(%r8,%rdi,8) 4722 116,218, //je 1413 <_sk_store_f16_avx+0xd2> 4723 196,65,121,23,84,248,24, //vmovhpd %xmm10,0x18(%r8,%rdi,8) 4724 72,131,249,5, //cmp $0x5,%rcx 4725 114,205, //jb 1413 <_sk_store_f16_avx+0xd2> 4726 196,65,121,214,76,248,32, //vmovq %xmm9,0x20(%r8,%rdi,8) 4727 116,196, //je 1413 <_sk_store_f16_avx+0xd2> 4728 196,65,121,23,76,248,40, //vmovhpd %xmm9,0x28(%r8,%rdi,8) 4729 72,131,249,7, //cmp $0x7,%rcx 4730 114,183, //jb 1413 <_sk_store_f16_avx+0xd2> 4731 196,65,121,214,68,248,48, //vmovq %xmm8,0x30(%r8,%rdi,8) 4732 235,174, //jmp 1413 <_sk_store_f16_avx+0xd2> 4733 }; 4734 4735 CODE const uint8_t sk_store_f32_avx[] = { 4736 72,173, //lods %ds:(%rsi),%rax 4737 76,139,0, //mov (%rax),%r8 4738 72,141,4,189,0,0,0,0, //lea 0x0(,%rdi,4),%rax 4739 197,124,20,193, //vunpcklps %ymm1,%ymm0,%ymm8 4740 197,124,21,217, //vunpckhps %ymm1,%ymm0,%ymm11 4741 197,108,20,203, //vunpcklps %ymm3,%ymm2,%ymm9 4742 197,108,21,227, //vunpckhps %ymm3,%ymm2,%ymm12 4743 196,65,61,20,209, //vunpcklpd %ymm9,%ymm8,%ymm10 4744 196,65,61,21,201, //vunpckhpd %ymm9,%ymm8,%ymm9 4745 196,65,37,20,196, //vunpcklpd %ymm12,%ymm11,%ymm8 4746 196,65,37,21,220, //vunpckhpd %ymm12,%ymm11,%ymm11 4747 72,133,201, //test %rcx,%rcx 4748 117,55, //jne 14d2 <_sk_store_f32_avx+0x6d> 4749 196,67,45,24,225,1, //vinsertf128 $0x1,%xmm9,%ymm10,%ymm12 4750 196,67,61,24,235,1, //vinsertf128 $0x1,%xmm11,%ymm8,%ymm13 4751 196,67,45,6,201,49, //vperm2f128 $0x31,%ymm9,%ymm10,%ymm9 4752 196,67,61,6,195,49, //vperm2f128 $0x31,%ymm11,%ymm8,%ymm8 4753 196,65,125,17,36,128, //vmovupd %ymm12,(%r8,%rax,4) 4754 196,65,125,17,108,128,32, //vmovupd %ymm13,0x20(%r8,%rax,4) 4755 196,65,125,17,76,128,64, //vmovupd %ymm9,0x40(%r8,%rax,4) 4756 196,65,125,17,68,128,96, //vmovupd %ymm8,0x60(%r8,%rax,4) 4757 72,173, //lods %ds:(%rsi),%rax 4758 255,224, //jmpq *%rax 4759 196,65,121,17,20,128, //vmovupd %xmm10,(%r8,%rax,4) 4760 72,131,249,1, //cmp $0x1,%rcx 4761 116,240, //je 14ce <_sk_store_f32_avx+0x69> 4762 196,65,121,17,76,128,16, //vmovupd %xmm9,0x10(%r8,%rax,4) 4763 72,131,249,3, //cmp $0x3,%rcx 4764 114,227, //jb 14ce <_sk_store_f32_avx+0x69> 4765 196,65,121,17,68,128,32, //vmovupd %xmm8,0x20(%r8,%rax,4) 4766 116,218, //je 14ce <_sk_store_f32_avx+0x69> 4767 196,65,121,17,92,128,48, //vmovupd %xmm11,0x30(%r8,%rax,4) 4768 72,131,249,5, //cmp $0x5,%rcx 4769 114,205, //jb 14ce <_sk_store_f32_avx+0x69> 4770 196,67,125,25,84,128,64,1, //vextractf128 $0x1,%ymm10,0x40(%r8,%rax,4) 4771 116,195, //je 14ce <_sk_store_f32_avx+0x69> 4772 196,67,125,25,76,128,80,1, //vextractf128 $0x1,%ymm9,0x50(%r8,%rax,4) 4773 72,131,249,7, //cmp $0x7,%rcx 4774 114,181, //jb 14ce <_sk_store_f32_avx+0x69> 4775 196,67,125,25,68,128,96,1, //vextractf128 $0x1,%ymm8,0x60(%r8,%rax,4) 4776 235,171, //jmp 14ce <_sk_store_f32_avx+0x69> 4777 }; 4778 4779 CODE const uint8_t sk_clamp_x_avx[] = { 4780 72,173, //lods %ds:(%rsi),%rax 4781 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8 4782 197,60,95,200, //vmaxps %ymm0,%ymm8,%ymm9 4783 196,98,125,24,0, //vbroadcastss (%rax),%ymm8 4784 196,99,125,25,192,1, //vextractf128 $0x1,%ymm8,%xmm0 4785 196,65,41,118,210, //vpcmpeqd %xmm10,%xmm10,%xmm10 4786 196,193,121,254,194, //vpaddd %xmm10,%xmm0,%xmm0 4787 196,65,57,254,194, //vpaddd %xmm10,%xmm8,%xmm8 4788 196,227,61,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm8,%ymm0 4789 197,180,93,192, //vminps %ymm0,%ymm9,%ymm0 4790 72,173, //lods %ds:(%rsi),%rax 4791 255,224, //jmpq *%rax 4792 }; 4793 4794 CODE const uint8_t sk_clamp_y_avx[] = { 4795 72,173, //lods %ds:(%rsi),%rax 4796 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8 4797 197,60,95,201, //vmaxps %ymm1,%ymm8,%ymm9 4798 196,98,125,24,0, //vbroadcastss (%rax),%ymm8 4799 196,99,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm1 4800 196,65,41,118,210, //vpcmpeqd %xmm10,%xmm10,%xmm10 4801 196,193,113,254,202, //vpaddd %xmm10,%xmm1,%xmm1 4802 196,65,57,254,194, //vpaddd %xmm10,%xmm8,%xmm8 4803 196,227,61,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm8,%ymm1 4804 197,180,93,201, //vminps %ymm1,%ymm9,%ymm1 4805 72,173, //lods %ds:(%rsi),%rax 4806 255,224, //jmpq *%rax 4807 }; 4808 4809 CODE const uint8_t sk_repeat_x_avx[] = { 4810 72,173, //lods %ds:(%rsi),%rax 4811 196,98,125,24,0, //vbroadcastss (%rax),%ymm8 4812 196,65,124,94,200, //vdivps %ymm8,%ymm0,%ymm9 4813 196,67,125,8,201,1, //vroundps $0x1,%ymm9,%ymm9 4814 196,65,52,89,200, //vmulps %ymm8,%ymm9,%ymm9 4815 196,65,124,92,201, //vsubps %ymm9,%ymm0,%ymm9 4816 196,99,125,25,192,1, //vextractf128 $0x1,%ymm8,%xmm0 4817 196,65,41,118,210, //vpcmpeqd %xmm10,%xmm10,%xmm10 4818 196,193,121,254,194, //vpaddd %xmm10,%xmm0,%xmm0 4819 196,65,57,254,194, //vpaddd %xmm10,%xmm8,%xmm8 4820 196,227,61,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm8,%ymm0 4821 197,180,93,192, //vminps %ymm0,%ymm9,%ymm0 4822 72,173, //lods %ds:(%rsi),%rax 4823 255,224, //jmpq *%rax 4824 }; 4825 4826 CODE const uint8_t sk_repeat_y_avx[] = { 4827 72,173, //lods %ds:(%rsi),%rax 4828 196,98,125,24,0, //vbroadcastss (%rax),%ymm8 4829 196,65,116,94,200, //vdivps %ymm8,%ymm1,%ymm9 4830 196,67,125,8,201,1, //vroundps $0x1,%ymm9,%ymm9 4831 196,65,52,89,200, //vmulps %ymm8,%ymm9,%ymm9 4832 196,65,116,92,201, //vsubps %ymm9,%ymm1,%ymm9 4833 196,99,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm1 4834 196,65,41,118,210, //vpcmpeqd %xmm10,%xmm10,%xmm10 4835 196,193,113,254,202, //vpaddd %xmm10,%xmm1,%xmm1 4836 196,65,57,254,194, //vpaddd %xmm10,%xmm8,%xmm8 4837 196,227,61,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm8,%ymm1 4838 197,180,93,201, //vminps %ymm1,%ymm9,%ymm1 4839 72,173, //lods %ds:(%rsi),%rax 4840 255,224, //jmpq *%rax 4841 }; 4842 4843 CODE const uint8_t sk_mirror_x_avx[] = { 4844 72,173, //lods %ds:(%rsi),%rax 4845 197,121,110,0, //vmovd (%rax),%xmm8 4846 196,65,121,112,200,0, //vpshufd $0x0,%xmm8,%xmm9 4847 196,67,53,24,201,1, //vinsertf128 $0x1,%xmm9,%ymm9,%ymm9 4848 196,65,124,92,209, //vsubps %ymm9,%ymm0,%ymm10 4849 196,193,58,88,192, //vaddss %xmm8,%xmm8,%xmm0 4850 196,227,121,4,192,0, //vpermilps $0x0,%xmm0,%xmm0 4851 196,227,125,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm0,%ymm0 4852 197,44,94,192, //vdivps %ymm0,%ymm10,%ymm8 4853 196,67,125,8,192,1, //vroundps $0x1,%ymm8,%ymm8 4854 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0 4855 197,172,92,192, //vsubps %ymm0,%ymm10,%ymm0 4856 196,193,124,92,193, //vsubps %ymm9,%ymm0,%ymm0 4857 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8 4858 197,60,92,192, //vsubps %ymm0,%ymm8,%ymm8 4859 197,60,84,192, //vandps %ymm0,%ymm8,%ymm8 4860 196,99,125,25,200,1, //vextractf128 $0x1,%ymm9,%xmm0 4861 196,65,41,118,210, //vpcmpeqd %xmm10,%xmm10,%xmm10 4862 196,193,121,254,194, //vpaddd %xmm10,%xmm0,%xmm0 4863 196,65,49,254,202, //vpaddd %xmm10,%xmm9,%xmm9 4864 196,227,53,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm9,%ymm0 4865 197,188,93,192, //vminps %ymm0,%ymm8,%ymm0 4866 72,173, //lods %ds:(%rsi),%rax 4867 255,224, //jmpq *%rax 4868 }; 4869 4870 CODE const uint8_t sk_mirror_y_avx[] = { 4871 72,173, //lods %ds:(%rsi),%rax 4872 197,121,110,0, //vmovd (%rax),%xmm8 4873 196,65,121,112,200,0, //vpshufd $0x0,%xmm8,%xmm9 4874 196,67,53,24,201,1, //vinsertf128 $0x1,%xmm9,%ymm9,%ymm9 4875 196,65,116,92,209, //vsubps %ymm9,%ymm1,%ymm10 4876 196,193,58,88,200, //vaddss %xmm8,%xmm8,%xmm1 4877 196,227,121,4,201,0, //vpermilps $0x0,%xmm1,%xmm1 4878 196,227,117,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm1,%ymm1 4879 197,44,94,193, //vdivps %ymm1,%ymm10,%ymm8 4880 196,67,125,8,192,1, //vroundps $0x1,%ymm8,%ymm8 4881 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1 4882 197,172,92,201, //vsubps %ymm1,%ymm10,%ymm1 4883 196,193,116,92,201, //vsubps %ymm9,%ymm1,%ymm1 4884 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8 4885 197,60,92,193, //vsubps %ymm1,%ymm8,%ymm8 4886 197,60,84,193, //vandps %ymm1,%ymm8,%ymm8 4887 196,99,125,25,201,1, //vextractf128 $0x1,%ymm9,%xmm1 4888 196,65,41,118,210, //vpcmpeqd %xmm10,%xmm10,%xmm10 4889 196,193,113,254,202, //vpaddd %xmm10,%xmm1,%xmm1 4890 196,65,49,254,202, //vpaddd %xmm10,%xmm9,%xmm9 4891 196,227,53,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm9,%ymm1 4892 197,188,93,201, //vminps %ymm1,%ymm8,%ymm1 4893 72,173, //lods %ds:(%rsi),%rax 4894 255,224, //jmpq *%rax 4895 }; 4896 4897 CODE const uint8_t sk_luminance_to_alpha_avx[] = { 4898 184,208,179,89,62, //mov $0x3e59b3d0,%eax 4899 197,249,110,216, //vmovd %eax,%xmm3 4900 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 4901 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 4902 197,228,89,192, //vmulps %ymm0,%ymm3,%ymm0 4903 184,89,23,55,63, //mov $0x3f371759,%eax 4904 197,249,110,216, //vmovd %eax,%xmm3 4905 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 4906 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 4907 197,228,89,201, //vmulps %ymm1,%ymm3,%ymm1 4908 197,252,88,193, //vaddps %ymm1,%ymm0,%ymm0 4909 184,152,221,147,61, //mov $0x3d93dd98,%eax 4910 197,249,110,200, //vmovd %eax,%xmm1 4911 196,227,121,4,201,0, //vpermilps $0x0,%xmm1,%xmm1 4912 196,227,117,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm1,%ymm1 4913 197,244,89,202, //vmulps %ymm2,%ymm1,%ymm1 4914 197,252,88,217, //vaddps %ymm1,%ymm0,%ymm3 4915 72,173, //lods %ds:(%rsi),%rax 4916 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0 4917 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1 4918 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2 4919 255,224, //jmpq *%rax 4920 }; 4921 4922 CODE const uint8_t sk_matrix_2x3_avx[] = { 4923 72,173, //lods %ds:(%rsi),%rax 4924 196,98,125,24,0, //vbroadcastss (%rax),%ymm8 4925 196,98,125,24,72,8, //vbroadcastss 0x8(%rax),%ymm9 4926 196,98,125,24,80,16, //vbroadcastss 0x10(%rax),%ymm10 4927 197,52,89,201, //vmulps %ymm1,%ymm9,%ymm9 4928 196,65,52,88,202, //vaddps %ymm10,%ymm9,%ymm9 4929 197,60,89,192, //vmulps %ymm0,%ymm8,%ymm8 4930 196,65,60,88,193, //vaddps %ymm9,%ymm8,%ymm8 4931 196,98,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm9 4932 196,98,125,24,80,12, //vbroadcastss 0xc(%rax),%ymm10 4933 196,98,125,24,88,20, //vbroadcastss 0x14(%rax),%ymm11 4934 197,172,89,201, //vmulps %ymm1,%ymm10,%ymm1 4935 196,193,116,88,203, //vaddps %ymm11,%ymm1,%ymm1 4936 197,180,89,192, //vmulps %ymm0,%ymm9,%ymm0 4937 197,252,88,201, //vaddps %ymm1,%ymm0,%ymm1 4938 72,173, //lods %ds:(%rsi),%rax 4939 197,124,41,192, //vmovaps %ymm8,%ymm0 4940 255,224, //jmpq *%rax 4941 }; 4942 4943 CODE const uint8_t sk_matrix_3x4_avx[] = { 4944 72,173, //lods %ds:(%rsi),%rax 4945 196,98,125,24,0, //vbroadcastss (%rax),%ymm8 4946 196,98,125,24,72,12, //vbroadcastss 0xc(%rax),%ymm9 4947 196,98,125,24,80,24, //vbroadcastss 0x18(%rax),%ymm10 4948 196,98,125,24,88,36, //vbroadcastss 0x24(%rax),%ymm11 4949 197,44,89,210, //vmulps %ymm2,%ymm10,%ymm10 4950 196,65,44,88,211, //vaddps %ymm11,%ymm10,%ymm10 4951 197,52,89,201, //vmulps %ymm1,%ymm9,%ymm9 4952 196,65,52,88,202, //vaddps %ymm10,%ymm9,%ymm9 4953 197,60,89,192, //vmulps %ymm0,%ymm8,%ymm8 4954 196,65,60,88,193, //vaddps %ymm9,%ymm8,%ymm8 4955 196,98,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm9 4956 196,98,125,24,80,16, //vbroadcastss 0x10(%rax),%ymm10 4957 196,98,125,24,88,28, //vbroadcastss 0x1c(%rax),%ymm11 4958 196,98,125,24,96,40, //vbroadcastss 0x28(%rax),%ymm12 4959 197,36,89,218, //vmulps %ymm2,%ymm11,%ymm11 4960 196,65,36,88,220, //vaddps %ymm12,%ymm11,%ymm11 4961 197,44,89,209, //vmulps %ymm1,%ymm10,%ymm10 4962 196,65,44,88,211, //vaddps %ymm11,%ymm10,%ymm10 4963 197,52,89,200, //vmulps %ymm0,%ymm9,%ymm9 4964 196,65,52,88,202, //vaddps %ymm10,%ymm9,%ymm9 4965 196,98,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm10 4966 196,98,125,24,88,20, //vbroadcastss 0x14(%rax),%ymm11 4967 196,98,125,24,96,32, //vbroadcastss 0x20(%rax),%ymm12 4968 196,98,125,24,104,44, //vbroadcastss 0x2c(%rax),%ymm13 4969 197,156,89,210, //vmulps %ymm2,%ymm12,%ymm2 4970 196,193,108,88,213, //vaddps %ymm13,%ymm2,%ymm2 4971 197,164,89,201, //vmulps %ymm1,%ymm11,%ymm1 4972 197,244,88,202, //vaddps %ymm2,%ymm1,%ymm1 4973 197,172,89,192, //vmulps %ymm0,%ymm10,%ymm0 4974 197,252,88,209, //vaddps %ymm1,%ymm0,%ymm2 4975 72,173, //lods %ds:(%rsi),%rax 4976 197,124,41,192, //vmovaps %ymm8,%ymm0 4977 197,124,41,201, //vmovaps %ymm9,%ymm1 4978 255,224, //jmpq *%rax 4979 }; 4980 4981 CODE const uint8_t sk_matrix_4x5_avx[] = { 4982 72,173, //lods %ds:(%rsi),%rax 4983 196,98,125,24,0, //vbroadcastss (%rax),%ymm8 4984 196,98,125,24,72,16, //vbroadcastss 0x10(%rax),%ymm9 4985 196,98,125,24,80,32, //vbroadcastss 0x20(%rax),%ymm10 4986 196,98,125,24,88,48, //vbroadcastss 0x30(%rax),%ymm11 4987 196,98,125,24,96,64, //vbroadcastss 0x40(%rax),%ymm12 4988 197,36,89,219, //vmulps %ymm3,%ymm11,%ymm11 4989 196,65,36,88,220, //vaddps %ymm12,%ymm11,%ymm11 4990 197,44,89,210, //vmulps %ymm2,%ymm10,%ymm10 4991 196,65,44,88,211, //vaddps %ymm11,%ymm10,%ymm10 4992 197,52,89,201, //vmulps %ymm1,%ymm9,%ymm9 4993 196,65,52,88,202, //vaddps %ymm10,%ymm9,%ymm9 4994 197,60,89,192, //vmulps %ymm0,%ymm8,%ymm8 4995 196,65,60,88,193, //vaddps %ymm9,%ymm8,%ymm8 4996 196,98,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm9 4997 196,98,125,24,80,20, //vbroadcastss 0x14(%rax),%ymm10 4998 196,98,125,24,88,36, //vbroadcastss 0x24(%rax),%ymm11 4999 196,98,125,24,96,52, //vbroadcastss 0x34(%rax),%ymm12 5000 196,98,125,24,104,68, //vbroadcastss 0x44(%rax),%ymm13 5001 197,28,89,227, //vmulps %ymm3,%ymm12,%ymm12 5002 196,65,28,88,229, //vaddps %ymm13,%ymm12,%ymm12 5003 197,36,89,218, //vmulps %ymm2,%ymm11,%ymm11 5004 196,65,36,88,220, //vaddps %ymm12,%ymm11,%ymm11 5005 197,44,89,209, //vmulps %ymm1,%ymm10,%ymm10 5006 196,65,44,88,211, //vaddps %ymm11,%ymm10,%ymm10 5007 197,52,89,200, //vmulps %ymm0,%ymm9,%ymm9 5008 196,65,52,88,202, //vaddps %ymm10,%ymm9,%ymm9 5009 196,98,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm10 5010 196,98,125,24,88,24, //vbroadcastss 0x18(%rax),%ymm11 5011 196,98,125,24,96,40, //vbroadcastss 0x28(%rax),%ymm12 5012 196,98,125,24,104,56, //vbroadcastss 0x38(%rax),%ymm13 5013 196,98,125,24,112,72, //vbroadcastss 0x48(%rax),%ymm14 5014 197,20,89,235, //vmulps %ymm3,%ymm13,%ymm13 5015 196,65,20,88,238, //vaddps %ymm14,%ymm13,%ymm13 5016 197,28,89,226, //vmulps %ymm2,%ymm12,%ymm12 5017 196,65,28,88,229, //vaddps %ymm13,%ymm12,%ymm12 5018 197,36,89,217, //vmulps %ymm1,%ymm11,%ymm11 5019 196,65,36,88,220, //vaddps %ymm12,%ymm11,%ymm11 5020 197,44,89,208, //vmulps %ymm0,%ymm10,%ymm10 5021 196,65,44,88,211, //vaddps %ymm11,%ymm10,%ymm10 5022 196,98,125,24,88,12, //vbroadcastss 0xc(%rax),%ymm11 5023 196,98,125,24,96,28, //vbroadcastss 0x1c(%rax),%ymm12 5024 196,98,125,24,104,44, //vbroadcastss 0x2c(%rax),%ymm13 5025 196,98,125,24,112,60, //vbroadcastss 0x3c(%rax),%ymm14 5026 196,98,125,24,120,76, //vbroadcastss 0x4c(%rax),%ymm15 5027 197,140,89,219, //vmulps %ymm3,%ymm14,%ymm3 5028 196,193,100,88,223, //vaddps %ymm15,%ymm3,%ymm3 5029 197,148,89,210, //vmulps %ymm2,%ymm13,%ymm2 5030 197,236,88,211, //vaddps %ymm3,%ymm2,%ymm2 5031 197,156,89,201, //vmulps %ymm1,%ymm12,%ymm1 5032 197,244,88,202, //vaddps %ymm2,%ymm1,%ymm1 5033 197,164,89,192, //vmulps %ymm0,%ymm11,%ymm0 5034 197,252,88,217, //vaddps %ymm1,%ymm0,%ymm3 5035 72,173, //lods %ds:(%rsi),%rax 5036 197,124,41,192, //vmovaps %ymm8,%ymm0 5037 197,124,41,201, //vmovaps %ymm9,%ymm1 5038 197,124,41,210, //vmovaps %ymm10,%ymm2 5039 255,224, //jmpq *%rax 5040 }; 5041 5042 CODE const uint8_t sk_matrix_perspective_avx[] = { 5043 72,173, //lods %ds:(%rsi),%rax 5044 196,98,125,24,0, //vbroadcastss (%rax),%ymm8 5045 196,98,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm9 5046 196,98,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm10 5047 197,52,89,201, //vmulps %ymm1,%ymm9,%ymm9 5048 196,65,52,88,202, //vaddps %ymm10,%ymm9,%ymm9 5049 197,60,89,192, //vmulps %ymm0,%ymm8,%ymm8 5050 196,65,60,88,193, //vaddps %ymm9,%ymm8,%ymm8 5051 196,98,125,24,72,12, //vbroadcastss 0xc(%rax),%ymm9 5052 196,98,125,24,80,16, //vbroadcastss 0x10(%rax),%ymm10 5053 196,98,125,24,88,20, //vbroadcastss 0x14(%rax),%ymm11 5054 197,44,89,209, //vmulps %ymm1,%ymm10,%ymm10 5055 196,65,44,88,211, //vaddps %ymm11,%ymm10,%ymm10 5056 197,52,89,200, //vmulps %ymm0,%ymm9,%ymm9 5057 196,65,52,88,202, //vaddps %ymm10,%ymm9,%ymm9 5058 196,98,125,24,80,24, //vbroadcastss 0x18(%rax),%ymm10 5059 196,98,125,24,88,28, //vbroadcastss 0x1c(%rax),%ymm11 5060 196,98,125,24,96,32, //vbroadcastss 0x20(%rax),%ymm12 5061 197,164,89,201, //vmulps %ymm1,%ymm11,%ymm1 5062 196,193,116,88,204, //vaddps %ymm12,%ymm1,%ymm1 5063 197,172,89,192, //vmulps %ymm0,%ymm10,%ymm0 5064 197,252,88,193, //vaddps %ymm1,%ymm0,%ymm0 5065 197,252,83,200, //vrcpps %ymm0,%ymm1 5066 197,188,89,193, //vmulps %ymm1,%ymm8,%ymm0 5067 197,180,89,201, //vmulps %ymm1,%ymm9,%ymm1 5068 72,173, //lods %ds:(%rsi),%rax 5069 255,224, //jmpq *%rax 5070 }; 5071 5072 CODE const uint8_t sk_linear_gradient_2stops_avx[] = { 5073 72,173, //lods %ds:(%rsi),%rax 5074 196,226,125,24,72,16, //vbroadcastss 0x10(%rax),%ymm1 5075 196,226,125,24,16, //vbroadcastss (%rax),%ymm2 5076 197,244,89,200, //vmulps %ymm0,%ymm1,%ymm1 5077 197,108,88,193, //vaddps %ymm1,%ymm2,%ymm8 5078 196,226,125,24,72,20, //vbroadcastss 0x14(%rax),%ymm1 5079 196,226,125,24,80,4, //vbroadcastss 0x4(%rax),%ymm2 5080 197,244,89,200, //vmulps %ymm0,%ymm1,%ymm1 5081 197,236,88,201, //vaddps %ymm1,%ymm2,%ymm1 5082 196,226,125,24,80,24, //vbroadcastss 0x18(%rax),%ymm2 5083 196,226,125,24,88,8, //vbroadcastss 0x8(%rax),%ymm3 5084 197,236,89,208, //vmulps %ymm0,%ymm2,%ymm2 5085 197,228,88,210, //vaddps %ymm2,%ymm3,%ymm2 5086 196,226,125,24,88,28, //vbroadcastss 0x1c(%rax),%ymm3 5087 196,98,125,24,72,12, //vbroadcastss 0xc(%rax),%ymm9 5088 197,228,89,192, //vmulps %ymm0,%ymm3,%ymm0 5089 197,180,88,216, //vaddps %ymm0,%ymm9,%ymm3 5090 72,173, //lods %ds:(%rsi),%rax 5091 197,124,41,192, //vmovaps %ymm8,%ymm0 5092 255,224, //jmpq *%rax 5093 }; 5094 5095 CODE const uint8_t sk_start_pipeline_sse41[] = { 5096 65,87, //push %r15 5097 65,86, //push %r14 5098 65,85, //push %r13 5099 65,84, //push %r12 5100 83, //push %rbx 5101 73,137,207, //mov %rcx,%r15 5102 73,137,214, //mov %rdx,%r14 5103 72,137,251, //mov %rdi,%rbx 5104 72,173, //lods %ds:(%rsi),%rax 5105 73,137,196, //mov %rax,%r12 5106 73,137,245, //mov %rsi,%r13 5107 72,141,67,4, //lea 0x4(%rbx),%rax 5108 76,57,248, //cmp %r15,%rax 5109 118,5, //jbe 28 <_sk_start_pipeline_sse41+0x28> 5110 72,137,216, //mov %rbx,%rax 5111 235,52, //jmp 5c <_sk_start_pipeline_sse41+0x5c> 5112 15,87,192, //xorps %xmm0,%xmm0 5113 15,87,201, //xorps %xmm1,%xmm1 5114 15,87,210, //xorps %xmm2,%xmm2 5115 15,87,219, //xorps %xmm3,%xmm3 5116 15,87,228, //xorps %xmm4,%xmm4 5117 15,87,237, //xorps %xmm5,%xmm5 5118 15,87,246, //xorps %xmm6,%xmm6 5119 15,87,255, //xorps %xmm7,%xmm7 5120 72,137,223, //mov %rbx,%rdi 5121 76,137,238, //mov %r13,%rsi 5122 76,137,242, //mov %r14,%rdx 5123 65,255,212, //callq *%r12 5124 72,141,67,4, //lea 0x4(%rbx),%rax 5125 72,131,195,8, //add $0x8,%rbx 5126 76,57,251, //cmp %r15,%rbx 5127 72,137,195, //mov %rax,%rbx 5128 118,204, //jbe 28 <_sk_start_pipeline_sse41+0x28> 5129 91, //pop %rbx 5130 65,92, //pop %r12 5131 65,93, //pop %r13 5132 65,94, //pop %r14 5133 65,95, //pop %r15 5134 195, //retq 5135 }; 5136 5137 CODE const uint8_t sk_just_return_sse41[] = { 5138 195, //retq 5139 }; 5140 5141 CODE const uint8_t sk_seed_shader_sse41[] = { 5142 72,173, //lods %ds:(%rsi),%rax 5143 102,15,110,199, //movd %edi,%xmm0 5144 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0 5145 15,91,200, //cvtdq2ps %xmm0,%xmm1 5146 185,0,0,0,63, //mov $0x3f000000,%ecx 5147 102,15,110,209, //movd %ecx,%xmm2 5148 15,198,210,0, //shufps $0x0,%xmm2,%xmm2 5149 15,88,202, //addps %xmm2,%xmm1 5150 15,16,2, //movups (%rdx),%xmm0 5151 15,88,193, //addps %xmm1,%xmm0 5152 102,15,110,8, //movd (%rax),%xmm1 5153 102,15,112,201,0, //pshufd $0x0,%xmm1,%xmm1 5154 15,91,201, //cvtdq2ps %xmm1,%xmm1 5155 15,88,202, //addps %xmm2,%xmm1 5156 184,0,0,128,63, //mov $0x3f800000,%eax 5157 102,15,110,208, //movd %eax,%xmm2 5158 15,198,210,0, //shufps $0x0,%xmm2,%xmm2 5159 72,173, //lods %ds:(%rsi),%rax 5160 15,87,219, //xorps %xmm3,%xmm3 5161 15,87,228, //xorps %xmm4,%xmm4 5162 15,87,237, //xorps %xmm5,%xmm5 5163 15,87,246, //xorps %xmm6,%xmm6 5164 15,87,255, //xorps %xmm7,%xmm7 5165 255,224, //jmpq *%rax 5166 }; 5167 5168 CODE const uint8_t sk_constant_color_sse41[] = { 5169 72,173, //lods %ds:(%rsi),%rax 5170 15,16,24, //movups (%rax),%xmm3 5171 15,40,195, //movaps %xmm3,%xmm0 5172 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 5173 15,40,203, //movaps %xmm3,%xmm1 5174 15,198,201,85, //shufps $0x55,%xmm1,%xmm1 5175 15,40,211, //movaps %xmm3,%xmm2 5176 15,198,210,170, //shufps $0xaa,%xmm2,%xmm2 5177 15,198,219,255, //shufps $0xff,%xmm3,%xmm3 5178 72,173, //lods %ds:(%rsi),%rax 5179 255,224, //jmpq *%rax 5180 }; 5181 5182 CODE const uint8_t sk_clear_sse41[] = { 5183 72,173, //lods %ds:(%rsi),%rax 5184 15,87,192, //xorps %xmm0,%xmm0 5185 15,87,201, //xorps %xmm1,%xmm1 5186 15,87,210, //xorps %xmm2,%xmm2 5187 15,87,219, //xorps %xmm3,%xmm3 5188 255,224, //jmpq *%rax 5189 }; 5190 5191 CODE const uint8_t sk_plus__sse41[] = { 5192 15,88,196, //addps %xmm4,%xmm0 5193 15,88,205, //addps %xmm5,%xmm1 5194 15,88,214, //addps %xmm6,%xmm2 5195 15,88,223, //addps %xmm7,%xmm3 5196 72,173, //lods %ds:(%rsi),%rax 5197 255,224, //jmpq *%rax 5198 }; 5199 5200 CODE const uint8_t sk_srcover_sse41[] = { 5201 184,0,0,128,63, //mov $0x3f800000,%eax 5202 102,68,15,110,192, //movd %eax,%xmm8 5203 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 5204 68,15,92,195, //subps %xmm3,%xmm8 5205 69,15,40,200, //movaps %xmm8,%xmm9 5206 68,15,89,204, //mulps %xmm4,%xmm9 5207 65,15,88,193, //addps %xmm9,%xmm0 5208 69,15,40,200, //movaps %xmm8,%xmm9 5209 68,15,89,205, //mulps %xmm5,%xmm9 5210 65,15,88,201, //addps %xmm9,%xmm1 5211 69,15,40,200, //movaps %xmm8,%xmm9 5212 68,15,89,206, //mulps %xmm6,%xmm9 5213 65,15,88,209, //addps %xmm9,%xmm2 5214 68,15,89,199, //mulps %xmm7,%xmm8 5215 65,15,88,216, //addps %xmm8,%xmm3 5216 72,173, //lods %ds:(%rsi),%rax 5217 255,224, //jmpq *%rax 5218 }; 5219 5220 CODE const uint8_t sk_dstover_sse41[] = { 5221 184,0,0,128,63, //mov $0x3f800000,%eax 5222 102,68,15,110,192, //movd %eax,%xmm8 5223 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 5224 68,15,92,199, //subps %xmm7,%xmm8 5225 65,15,89,192, //mulps %xmm8,%xmm0 5226 15,88,196, //addps %xmm4,%xmm0 5227 65,15,89,200, //mulps %xmm8,%xmm1 5228 15,88,205, //addps %xmm5,%xmm1 5229 65,15,89,208, //mulps %xmm8,%xmm2 5230 15,88,214, //addps %xmm6,%xmm2 5231 65,15,89,216, //mulps %xmm8,%xmm3 5232 15,88,223, //addps %xmm7,%xmm3 5233 72,173, //lods %ds:(%rsi),%rax 5234 255,224, //jmpq *%rax 5235 }; 5236 5237 CODE const uint8_t sk_clamp_0_sse41[] = { 5238 69,15,87,192, //xorps %xmm8,%xmm8 5239 65,15,95,192, //maxps %xmm8,%xmm0 5240 65,15,95,200, //maxps %xmm8,%xmm1 5241 65,15,95,208, //maxps %xmm8,%xmm2 5242 65,15,95,216, //maxps %xmm8,%xmm3 5243 72,173, //lods %ds:(%rsi),%rax 5244 255,224, //jmpq *%rax 5245 }; 5246 5247 CODE const uint8_t sk_clamp_1_sse41[] = { 5248 184,0,0,128,63, //mov $0x3f800000,%eax 5249 102,68,15,110,192, //movd %eax,%xmm8 5250 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 5251 65,15,93,192, //minps %xmm8,%xmm0 5252 65,15,93,200, //minps %xmm8,%xmm1 5253 65,15,93,208, //minps %xmm8,%xmm2 5254 65,15,93,216, //minps %xmm8,%xmm3 5255 72,173, //lods %ds:(%rsi),%rax 5256 255,224, //jmpq *%rax 5257 }; 5258 5259 CODE const uint8_t sk_clamp_a_sse41[] = { 5260 184,0,0,128,63, //mov $0x3f800000,%eax 5261 102,68,15,110,192, //movd %eax,%xmm8 5262 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 5263 65,15,93,216, //minps %xmm8,%xmm3 5264 15,93,195, //minps %xmm3,%xmm0 5265 15,93,203, //minps %xmm3,%xmm1 5266 15,93,211, //minps %xmm3,%xmm2 5267 72,173, //lods %ds:(%rsi),%rax 5268 255,224, //jmpq *%rax 5269 }; 5270 5271 CODE const uint8_t sk_set_rgb_sse41[] = { 5272 72,173, //lods %ds:(%rsi),%rax 5273 243,15,16,0, //movss (%rax),%xmm0 5274 243,15,16,72,4, //movss 0x4(%rax),%xmm1 5275 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 5276 15,198,201,0, //shufps $0x0,%xmm1,%xmm1 5277 243,15,16,80,8, //movss 0x8(%rax),%xmm2 5278 15,198,210,0, //shufps $0x0,%xmm2,%xmm2 5279 72,173, //lods %ds:(%rsi),%rax 5280 255,224, //jmpq *%rax 5281 }; 5282 5283 CODE const uint8_t sk_swap_rb_sse41[] = { 5284 68,15,40,192, //movaps %xmm0,%xmm8 5285 72,173, //lods %ds:(%rsi),%rax 5286 15,40,194, //movaps %xmm2,%xmm0 5287 65,15,40,208, //movaps %xmm8,%xmm2 5288 255,224, //jmpq *%rax 5289 }; 5290 5291 CODE const uint8_t sk_swap_sse41[] = { 5292 68,15,40,195, //movaps %xmm3,%xmm8 5293 68,15,40,202, //movaps %xmm2,%xmm9 5294 68,15,40,209, //movaps %xmm1,%xmm10 5295 68,15,40,216, //movaps %xmm0,%xmm11 5296 72,173, //lods %ds:(%rsi),%rax 5297 15,40,196, //movaps %xmm4,%xmm0 5298 15,40,205, //movaps %xmm5,%xmm1 5299 15,40,214, //movaps %xmm6,%xmm2 5300 15,40,223, //movaps %xmm7,%xmm3 5301 65,15,40,227, //movaps %xmm11,%xmm4 5302 65,15,40,234, //movaps %xmm10,%xmm5 5303 65,15,40,241, //movaps %xmm9,%xmm6 5304 65,15,40,248, //movaps %xmm8,%xmm7 5305 255,224, //jmpq *%rax 5306 }; 5307 5308 CODE const uint8_t sk_move_src_dst_sse41[] = { 5309 72,173, //lods %ds:(%rsi),%rax 5310 15,40,224, //movaps %xmm0,%xmm4 5311 15,40,233, //movaps %xmm1,%xmm5 5312 15,40,242, //movaps %xmm2,%xmm6 5313 15,40,251, //movaps %xmm3,%xmm7 5314 255,224, //jmpq *%rax 5315 }; 5316 5317 CODE const uint8_t sk_move_dst_src_sse41[] = { 5318 72,173, //lods %ds:(%rsi),%rax 5319 15,40,196, //movaps %xmm4,%xmm0 5320 15,40,205, //movaps %xmm5,%xmm1 5321 15,40,214, //movaps %xmm6,%xmm2 5322 15,40,223, //movaps %xmm7,%xmm3 5323 255,224, //jmpq *%rax 5324 }; 5325 5326 CODE const uint8_t sk_premul_sse41[] = { 5327 15,89,195, //mulps %xmm3,%xmm0 5328 15,89,203, //mulps %xmm3,%xmm1 5329 15,89,211, //mulps %xmm3,%xmm2 5330 72,173, //lods %ds:(%rsi),%rax 5331 255,224, //jmpq *%rax 5332 }; 5333 5334 CODE const uint8_t sk_unpremul_sse41[] = { 5335 69,15,87,192, //xorps %xmm8,%xmm8 5336 184,0,0,128,63, //mov $0x3f800000,%eax 5337 102,68,15,110,200, //movd %eax,%xmm9 5338 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 5339 68,15,94,203, //divps %xmm3,%xmm9 5340 68,15,194,195,4, //cmpneqps %xmm3,%xmm8 5341 69,15,84,193, //andps %xmm9,%xmm8 5342 65,15,89,192, //mulps %xmm8,%xmm0 5343 65,15,89,200, //mulps %xmm8,%xmm1 5344 65,15,89,208, //mulps %xmm8,%xmm2 5345 72,173, //lods %ds:(%rsi),%rax 5346 255,224, //jmpq *%rax 5347 }; 5348 5349 CODE const uint8_t sk_from_srgb_sse41[] = { 5350 184,145,131,158,61, //mov $0x3d9e8391,%eax 5351 102,68,15,110,216, //movd %eax,%xmm11 5352 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 5353 69,15,40,211, //movaps %xmm11,%xmm10 5354 68,15,89,208, //mulps %xmm0,%xmm10 5355 68,15,40,240, //movaps %xmm0,%xmm14 5356 69,15,89,246, //mulps %xmm14,%xmm14 5357 184,154,153,153,62, //mov $0x3e99999a,%eax 5358 102,68,15,110,192, //movd %eax,%xmm8 5359 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 5360 184,92,143,50,63, //mov $0x3f328f5c,%eax 5361 102,68,15,110,224, //movd %eax,%xmm12 5362 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 5363 69,15,40,200, //movaps %xmm8,%xmm9 5364 68,15,89,200, //mulps %xmm0,%xmm9 5365 69,15,88,204, //addps %xmm12,%xmm9 5366 184,10,215,35,59, //mov $0x3b23d70a,%eax 5367 102,68,15,110,232, //movd %eax,%xmm13 5368 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13 5369 69,15,89,206, //mulps %xmm14,%xmm9 5370 69,15,88,205, //addps %xmm13,%xmm9 5371 184,174,71,97,61, //mov $0x3d6147ae,%eax 5372 102,68,15,110,240, //movd %eax,%xmm14 5373 69,15,198,246,0, //shufps $0x0,%xmm14,%xmm14 5374 65,15,194,198,1, //cmpltps %xmm14,%xmm0 5375 102,69,15,56,20,202, //blendvps %xmm0,%xmm10,%xmm9 5376 69,15,40,251, //movaps %xmm11,%xmm15 5377 68,15,89,249, //mulps %xmm1,%xmm15 5378 15,40,193, //movaps %xmm1,%xmm0 5379 15,89,192, //mulps %xmm0,%xmm0 5380 69,15,40,208, //movaps %xmm8,%xmm10 5381 68,15,89,209, //mulps %xmm1,%xmm10 5382 69,15,88,212, //addps %xmm12,%xmm10 5383 68,15,89,208, //mulps %xmm0,%xmm10 5384 69,15,88,213, //addps %xmm13,%xmm10 5385 65,15,194,206,1, //cmpltps %xmm14,%xmm1 5386 15,40,193, //movaps %xmm1,%xmm0 5387 102,69,15,56,20,215, //blendvps %xmm0,%xmm15,%xmm10 5388 68,15,89,218, //mulps %xmm2,%xmm11 5389 15,40,194, //movaps %xmm2,%xmm0 5390 15,89,192, //mulps %xmm0,%xmm0 5391 68,15,89,194, //mulps %xmm2,%xmm8 5392 69,15,88,196, //addps %xmm12,%xmm8 5393 68,15,89,192, //mulps %xmm0,%xmm8 5394 69,15,88,197, //addps %xmm13,%xmm8 5395 65,15,194,214,1, //cmpltps %xmm14,%xmm2 5396 15,40,194, //movaps %xmm2,%xmm0 5397 102,69,15,56,20,195, //blendvps %xmm0,%xmm11,%xmm8 5398 72,173, //lods %ds:(%rsi),%rax 5399 65,15,40,193, //movaps %xmm9,%xmm0 5400 65,15,40,202, //movaps %xmm10,%xmm1 5401 65,15,40,208, //movaps %xmm8,%xmm2 5402 255,224, //jmpq *%rax 5403 }; 5404 5405 CODE const uint8_t sk_to_srgb_sse41[] = { 5406 72,131,236,24, //sub $0x18,%rsp 5407 15,41,60,36, //movaps %xmm7,(%rsp) 5408 15,40,254, //movaps %xmm6,%xmm7 5409 15,40,245, //movaps %xmm5,%xmm6 5410 15,40,236, //movaps %xmm4,%xmm5 5411 15,40,227, //movaps %xmm3,%xmm4 5412 15,40,218, //movaps %xmm2,%xmm3 5413 15,40,209, //movaps %xmm1,%xmm2 5414 68,15,82,192, //rsqrtps %xmm0,%xmm8 5415 69,15,83,200, //rcpps %xmm8,%xmm9 5416 69,15,82,248, //rsqrtps %xmm8,%xmm15 5417 184,41,92,71,65, //mov $0x41475c29,%eax 5418 102,68,15,110,216, //movd %eax,%xmm11 5419 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 5420 69,15,40,211, //movaps %xmm11,%xmm10 5421 68,15,89,208, //mulps %xmm0,%xmm10 5422 184,0,0,128,63, //mov $0x3f800000,%eax 5423 102,68,15,110,192, //movd %eax,%xmm8 5424 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 5425 184,194,135,210,62, //mov $0x3ed287c2,%eax 5426 102,68,15,110,224, //movd %eax,%xmm12 5427 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 5428 184,206,111,48,63, //mov $0x3f306fce,%eax 5429 102,68,15,110,232, //movd %eax,%xmm13 5430 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13 5431 184,168,87,202,61, //mov $0x3dca57a8,%eax 5432 53,0,0,0,128, //xor $0x80000000,%eax 5433 102,68,15,110,240, //movd %eax,%xmm14 5434 69,15,198,246,0, //shufps $0x0,%xmm14,%xmm14 5435 69,15,89,205, //mulps %xmm13,%xmm9 5436 69,15,88,206, //addps %xmm14,%xmm9 5437 69,15,89,252, //mulps %xmm12,%xmm15 5438 69,15,88,249, //addps %xmm9,%xmm15 5439 69,15,40,200, //movaps %xmm8,%xmm9 5440 69,15,93,207, //minps %xmm15,%xmm9 5441 184,4,231,140,59, //mov $0x3b8ce704,%eax 5442 102,68,15,110,248, //movd %eax,%xmm15 5443 69,15,198,255,0, //shufps $0x0,%xmm15,%xmm15 5444 65,15,194,199,1, //cmpltps %xmm15,%xmm0 5445 102,69,15,56,20,202, //blendvps %xmm0,%xmm10,%xmm9 5446 68,15,82,210, //rsqrtps %xmm2,%xmm10 5447 65,15,83,194, //rcpps %xmm10,%xmm0 5448 69,15,82,210, //rsqrtps %xmm10,%xmm10 5449 65,15,89,197, //mulps %xmm13,%xmm0 5450 65,15,88,198, //addps %xmm14,%xmm0 5451 69,15,89,212, //mulps %xmm12,%xmm10 5452 68,15,88,208, //addps %xmm0,%xmm10 5453 65,15,40,200, //movaps %xmm8,%xmm1 5454 65,15,93,202, //minps %xmm10,%xmm1 5455 69,15,40,211, //movaps %xmm11,%xmm10 5456 68,15,89,210, //mulps %xmm2,%xmm10 5457 65,15,194,215,1, //cmpltps %xmm15,%xmm2 5458 15,40,194, //movaps %xmm2,%xmm0 5459 102,65,15,56,20,202, //blendvps %xmm0,%xmm10,%xmm1 5460 15,82,195, //rsqrtps %xmm3,%xmm0 5461 15,83,208, //rcpps %xmm0,%xmm2 5462 65,15,89,213, //mulps %xmm13,%xmm2 5463 65,15,88,214, //addps %xmm14,%xmm2 5464 15,82,192, //rsqrtps %xmm0,%xmm0 5465 65,15,89,196, //mulps %xmm12,%xmm0 5466 15,88,194, //addps %xmm2,%xmm0 5467 68,15,93,192, //minps %xmm0,%xmm8 5468 68,15,89,219, //mulps %xmm3,%xmm11 5469 65,15,194,223,1, //cmpltps %xmm15,%xmm3 5470 15,40,195, //movaps %xmm3,%xmm0 5471 102,69,15,56,20,195, //blendvps %xmm0,%xmm11,%xmm8 5472 72,173, //lods %ds:(%rsi),%rax 5473 65,15,40,193, //movaps %xmm9,%xmm0 5474 65,15,40,208, //movaps %xmm8,%xmm2 5475 15,40,220, //movaps %xmm4,%xmm3 5476 15,40,229, //movaps %xmm5,%xmm4 5477 15,40,238, //movaps %xmm6,%xmm5 5478 15,40,247, //movaps %xmm7,%xmm6 5479 15,40,60,36, //movaps (%rsp),%xmm7 5480 72,131,196,24, //add $0x18,%rsp 5481 255,224, //jmpq *%rax 5482 }; 5483 5484 CODE const uint8_t sk_scale_1_float_sse41[] = { 5485 72,173, //lods %ds:(%rsi),%rax 5486 243,68,15,16,0, //movss (%rax),%xmm8 5487 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 5488 65,15,89,192, //mulps %xmm8,%xmm0 5489 65,15,89,200, //mulps %xmm8,%xmm1 5490 65,15,89,208, //mulps %xmm8,%xmm2 5491 65,15,89,216, //mulps %xmm8,%xmm3 5492 72,173, //lods %ds:(%rsi),%rax 5493 255,224, //jmpq *%rax 5494 }; 5495 5496 CODE const uint8_t sk_scale_u8_sse41[] = { 5497 72,173, //lods %ds:(%rsi),%rax 5498 72,139,0, //mov (%rax),%rax 5499 102,68,15,56,49,4,56, //pmovzxbd (%rax,%rdi,1),%xmm8 5500 69,15,91,192, //cvtdq2ps %xmm8,%xmm8 5501 184,129,128,128,59, //mov $0x3b808081,%eax 5502 102,68,15,110,200, //movd %eax,%xmm9 5503 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 5504 69,15,89,200, //mulps %xmm8,%xmm9 5505 65,15,89,193, //mulps %xmm9,%xmm0 5506 65,15,89,201, //mulps %xmm9,%xmm1 5507 65,15,89,209, //mulps %xmm9,%xmm2 5508 65,15,89,217, //mulps %xmm9,%xmm3 5509 72,173, //lods %ds:(%rsi),%rax 5510 255,224, //jmpq *%rax 5511 }; 5512 5513 CODE const uint8_t sk_lerp_1_float_sse41[] = { 5514 72,173, //lods %ds:(%rsi),%rax 5515 243,68,15,16,0, //movss (%rax),%xmm8 5516 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 5517 15,92,196, //subps %xmm4,%xmm0 5518 65,15,89,192, //mulps %xmm8,%xmm0 5519 15,88,196, //addps %xmm4,%xmm0 5520 15,92,205, //subps %xmm5,%xmm1 5521 65,15,89,200, //mulps %xmm8,%xmm1 5522 15,88,205, //addps %xmm5,%xmm1 5523 15,92,214, //subps %xmm6,%xmm2 5524 65,15,89,208, //mulps %xmm8,%xmm2 5525 15,88,214, //addps %xmm6,%xmm2 5526 15,92,223, //subps %xmm7,%xmm3 5527 65,15,89,216, //mulps %xmm8,%xmm3 5528 15,88,223, //addps %xmm7,%xmm3 5529 72,173, //lods %ds:(%rsi),%rax 5530 255,224, //jmpq *%rax 5531 }; 5532 5533 CODE const uint8_t sk_lerp_u8_sse41[] = { 5534 72,173, //lods %ds:(%rsi),%rax 5535 72,139,0, //mov (%rax),%rax 5536 102,68,15,56,49,4,56, //pmovzxbd (%rax,%rdi,1),%xmm8 5537 69,15,91,192, //cvtdq2ps %xmm8,%xmm8 5538 184,129,128,128,59, //mov $0x3b808081,%eax 5539 102,68,15,110,200, //movd %eax,%xmm9 5540 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 5541 69,15,89,200, //mulps %xmm8,%xmm9 5542 15,92,196, //subps %xmm4,%xmm0 5543 65,15,89,193, //mulps %xmm9,%xmm0 5544 15,88,196, //addps %xmm4,%xmm0 5545 15,92,205, //subps %xmm5,%xmm1 5546 65,15,89,201, //mulps %xmm9,%xmm1 5547 15,88,205, //addps %xmm5,%xmm1 5548 15,92,214, //subps %xmm6,%xmm2 5549 65,15,89,209, //mulps %xmm9,%xmm2 5550 15,88,214, //addps %xmm6,%xmm2 5551 15,92,223, //subps %xmm7,%xmm3 5552 65,15,89,217, //mulps %xmm9,%xmm3 5553 15,88,223, //addps %xmm7,%xmm3 5554 72,173, //lods %ds:(%rsi),%rax 5555 255,224, //jmpq *%rax 5556 }; 5557 5558 CODE const uint8_t sk_lerp_565_sse41[] = { 5559 72,173, //lods %ds:(%rsi),%rax 5560 72,139,0, //mov (%rax),%rax 5561 102,68,15,56,51,4,120, //pmovzxwd (%rax,%rdi,2),%xmm8 5562 184,0,248,0,0, //mov $0xf800,%eax 5563 102,15,110,216, //movd %eax,%xmm3 5564 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3 5565 102,65,15,219,216, //pand %xmm8,%xmm3 5566 68,15,91,203, //cvtdq2ps %xmm3,%xmm9 5567 184,8,33,132,55, //mov $0x37842108,%eax 5568 102,68,15,110,208, //movd %eax,%xmm10 5569 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 5570 69,15,89,209, //mulps %xmm9,%xmm10 5571 184,224,7,0,0, //mov $0x7e0,%eax 5572 102,15,110,216, //movd %eax,%xmm3 5573 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3 5574 102,65,15,219,216, //pand %xmm8,%xmm3 5575 68,15,91,203, //cvtdq2ps %xmm3,%xmm9 5576 184,33,8,2,58, //mov $0x3a020821,%eax 5577 102,68,15,110,216, //movd %eax,%xmm11 5578 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 5579 69,15,89,217, //mulps %xmm9,%xmm11 5580 184,31,0,0,0, //mov $0x1f,%eax 5581 102,15,110,216, //movd %eax,%xmm3 5582 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3 5583 102,65,15,219,216, //pand %xmm8,%xmm3 5584 68,15,91,195, //cvtdq2ps %xmm3,%xmm8 5585 184,8,33,4,61, //mov $0x3d042108,%eax 5586 102,15,110,216, //movd %eax,%xmm3 5587 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 5588 65,15,89,216, //mulps %xmm8,%xmm3 5589 15,92,196, //subps %xmm4,%xmm0 5590 65,15,89,194, //mulps %xmm10,%xmm0 5591 15,88,196, //addps %xmm4,%xmm0 5592 15,92,205, //subps %xmm5,%xmm1 5593 65,15,89,203, //mulps %xmm11,%xmm1 5594 15,88,205, //addps %xmm5,%xmm1 5595 15,92,214, //subps %xmm6,%xmm2 5596 15,89,211, //mulps %xmm3,%xmm2 5597 15,88,214, //addps %xmm6,%xmm2 5598 184,0,0,128,63, //mov $0x3f800000,%eax 5599 102,15,110,216, //movd %eax,%xmm3 5600 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 5601 72,173, //lods %ds:(%rsi),%rax 5602 255,224, //jmpq *%rax 5603 }; 5604 5605 CODE const uint8_t sk_load_tables_sse41[] = { 5606 72,173, //lods %ds:(%rsi),%rax 5607 72,139,8, //mov (%rax),%rcx 5608 76,139,64,8, //mov 0x8(%rax),%r8 5609 243,68,15,111,4,185, //movdqu (%rcx,%rdi,4),%xmm8 5610 185,255,0,0,0, //mov $0xff,%ecx 5611 102,15,110,193, //movd %ecx,%xmm0 5612 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0 5613 102,65,15,111,200, //movdqa %xmm8,%xmm1 5614 102,15,114,209,8, //psrld $0x8,%xmm1 5615 102,15,219,200, //pand %xmm0,%xmm1 5616 102,65,15,111,208, //movdqa %xmm8,%xmm2 5617 102,15,114,210,16, //psrld $0x10,%xmm2 5618 102,15,219,208, //pand %xmm0,%xmm2 5619 102,65,15,219,192, //pand %xmm8,%xmm0 5620 102,72,15,58,22,193,1, //pextrq $0x1,%xmm0,%rcx 5621 65,137,201, //mov %ecx,%r9d 5622 72,193,233,32, //shr $0x20,%rcx 5623 102,73,15,126,194, //movq %xmm0,%r10 5624 69,137,211, //mov %r10d,%r11d 5625 73,193,234,32, //shr $0x20,%r10 5626 243,67,15,16,4,152, //movss (%r8,%r11,4),%xmm0 5627 102,67,15,58,33,4,144,16, //insertps $0x10,(%r8,%r10,4),%xmm0 5628 102,67,15,58,33,4,136,32, //insertps $0x20,(%r8,%r9,4),%xmm0 5629 102,65,15,58,33,4,136,48, //insertps $0x30,(%r8,%rcx,4),%xmm0 5630 76,139,64,16, //mov 0x10(%rax),%r8 5631 102,73,15,58,22,202,1, //pextrq $0x1,%xmm1,%r10 5632 77,137,209, //mov %r10,%r9 5633 73,193,233,32, //shr $0x20,%r9 5634 102,72,15,126,201, //movq %xmm1,%rcx 5635 65,137,203, //mov %ecx,%r11d 5636 65,129,227,255,255,255,0, //and $0xffffff,%r11d 5637 72,193,233,30, //shr $0x1e,%rcx 5638 65,129,226,255,255,255,0, //and $0xffffff,%r10d 5639 243,67,15,16,12,152, //movss (%r8,%r11,4),%xmm1 5640 102,65,15,58,33,12,8,16, //insertps $0x10,(%r8,%rcx,1),%xmm1 5641 243,67,15,16,28,144, //movss (%r8,%r10,4),%xmm3 5642 102,15,58,33,203,32, //insertps $0x20,%xmm3,%xmm1 5643 243,67,15,16,28,136, //movss (%r8,%r9,4),%xmm3 5644 102,15,58,33,203,48, //insertps $0x30,%xmm3,%xmm1 5645 76,139,72,24, //mov 0x18(%rax),%r9 5646 102,72,15,58,22,209,1, //pextrq $0x1,%xmm2,%rcx 5647 68,15,183,193, //movzwl %cx,%r8d 5648 72,193,233,32, //shr $0x20,%rcx 5649 102,72,15,126,208, //movq %xmm2,%rax 5650 68,15,183,208, //movzwl %ax,%r10d 5651 72,193,232,30, //shr $0x1e,%rax 5652 243,67,15,16,20,145, //movss (%r9,%r10,4),%xmm2 5653 102,65,15,58,33,20,1,16, //insertps $0x10,(%r9,%rax,1),%xmm2 5654 243,67,15,16,28,129, //movss (%r9,%r8,4),%xmm3 5655 102,15,58,33,211,32, //insertps $0x20,%xmm3,%xmm2 5656 243,65,15,16,28,137, //movss (%r9,%rcx,4),%xmm3 5657 102,15,58,33,211,48, //insertps $0x30,%xmm3,%xmm2 5658 102,65,15,114,208,24, //psrld $0x18,%xmm8 5659 69,15,91,192, //cvtdq2ps %xmm8,%xmm8 5660 184,129,128,128,59, //mov $0x3b808081,%eax 5661 102,15,110,216, //movd %eax,%xmm3 5662 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 5663 65,15,89,216, //mulps %xmm8,%xmm3 5664 72,173, //lods %ds:(%rsi),%rax 5665 255,224, //jmpq *%rax 5666 }; 5667 5668 CODE const uint8_t sk_load_a8_sse41[] = { 5669 72,173, //lods %ds:(%rsi),%rax 5670 72,139,0, //mov (%rax),%rax 5671 102,15,56,49,4,56, //pmovzxbd (%rax,%rdi,1),%xmm0 5672 15,91,192, //cvtdq2ps %xmm0,%xmm0 5673 184,129,128,128,59, //mov $0x3b808081,%eax 5674 102,15,110,216, //movd %eax,%xmm3 5675 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 5676 15,89,216, //mulps %xmm0,%xmm3 5677 72,173, //lods %ds:(%rsi),%rax 5678 15,87,192, //xorps %xmm0,%xmm0 5679 15,87,201, //xorps %xmm1,%xmm1 5680 15,87,210, //xorps %xmm2,%xmm2 5681 255,224, //jmpq *%rax 5682 }; 5683 5684 CODE const uint8_t sk_store_a8_sse41[] = { 5685 72,173, //lods %ds:(%rsi),%rax 5686 72,139,0, //mov (%rax),%rax 5687 185,0,0,127,67, //mov $0x437f0000,%ecx 5688 102,68,15,110,193, //movd %ecx,%xmm8 5689 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 5690 68,15,89,195, //mulps %xmm3,%xmm8 5691 102,69,15,91,192, //cvtps2dq %xmm8,%xmm8 5692 102,69,15,56,43,192, //packusdw %xmm8,%xmm8 5693 102,69,15,103,192, //packuswb %xmm8,%xmm8 5694 102,68,15,126,4,56, //movd %xmm8,(%rax,%rdi,1) 5695 72,173, //lods %ds:(%rsi),%rax 5696 255,224, //jmpq *%rax 5697 }; 5698 5699 CODE const uint8_t sk_load_565_sse41[] = { 5700 72,173, //lods %ds:(%rsi),%rax 5701 72,139,0, //mov (%rax),%rax 5702 102,15,56,51,20,120, //pmovzxwd (%rax,%rdi,2),%xmm2 5703 184,0,248,0,0, //mov $0xf800,%eax 5704 102,15,110,192, //movd %eax,%xmm0 5705 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0 5706 102,15,219,194, //pand %xmm2,%xmm0 5707 15,91,200, //cvtdq2ps %xmm0,%xmm1 5708 184,8,33,132,55, //mov $0x37842108,%eax 5709 102,15,110,192, //movd %eax,%xmm0 5710 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 5711 15,89,193, //mulps %xmm1,%xmm0 5712 184,224,7,0,0, //mov $0x7e0,%eax 5713 102,15,110,200, //movd %eax,%xmm1 5714 102,15,112,201,0, //pshufd $0x0,%xmm1,%xmm1 5715 102,15,219,202, //pand %xmm2,%xmm1 5716 15,91,217, //cvtdq2ps %xmm1,%xmm3 5717 184,33,8,2,58, //mov $0x3a020821,%eax 5718 102,15,110,200, //movd %eax,%xmm1 5719 15,198,201,0, //shufps $0x0,%xmm1,%xmm1 5720 15,89,203, //mulps %xmm3,%xmm1 5721 184,31,0,0,0, //mov $0x1f,%eax 5722 102,15,110,216, //movd %eax,%xmm3 5723 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3 5724 102,15,219,218, //pand %xmm2,%xmm3 5725 15,91,219, //cvtdq2ps %xmm3,%xmm3 5726 184,8,33,4,61, //mov $0x3d042108,%eax 5727 102,15,110,208, //movd %eax,%xmm2 5728 15,198,210,0, //shufps $0x0,%xmm2,%xmm2 5729 15,89,211, //mulps %xmm3,%xmm2 5730 184,0,0,128,63, //mov $0x3f800000,%eax 5731 102,15,110,216, //movd %eax,%xmm3 5732 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 5733 72,173, //lods %ds:(%rsi),%rax 5734 255,224, //jmpq *%rax 5735 }; 5736 5737 CODE const uint8_t sk_store_565_sse41[] = { 5738 72,173, //lods %ds:(%rsi),%rax 5739 72,139,0, //mov (%rax),%rax 5740 185,0,0,248,65, //mov $0x41f80000,%ecx 5741 102,68,15,110,193, //movd %ecx,%xmm8 5742 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 5743 69,15,40,200, //movaps %xmm8,%xmm9 5744 68,15,89,200, //mulps %xmm0,%xmm9 5745 102,69,15,91,201, //cvtps2dq %xmm9,%xmm9 5746 102,65,15,114,241,11, //pslld $0xb,%xmm9 5747 185,0,0,124,66, //mov $0x427c0000,%ecx 5748 102,68,15,110,209, //movd %ecx,%xmm10 5749 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 5750 68,15,89,209, //mulps %xmm1,%xmm10 5751 102,69,15,91,210, //cvtps2dq %xmm10,%xmm10 5752 102,65,15,114,242,5, //pslld $0x5,%xmm10 5753 102,69,15,235,209, //por %xmm9,%xmm10 5754 68,15,89,194, //mulps %xmm2,%xmm8 5755 102,69,15,91,192, //cvtps2dq %xmm8,%xmm8 5756 102,69,15,86,194, //orpd %xmm10,%xmm8 5757 102,69,15,56,43,192, //packusdw %xmm8,%xmm8 5758 102,68,15,214,4,120, //movq %xmm8,(%rax,%rdi,2) 5759 72,173, //lods %ds:(%rsi),%rax 5760 255,224, //jmpq *%rax 5761 }; 5762 5763 CODE const uint8_t sk_load_8888_sse41[] = { 5764 72,173, //lods %ds:(%rsi),%rax 5765 72,139,0, //mov (%rax),%rax 5766 243,15,111,28,184, //movdqu (%rax,%rdi,4),%xmm3 5767 184,255,0,0,0, //mov $0xff,%eax 5768 102,15,110,192, //movd %eax,%xmm0 5769 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0 5770 102,15,111,203, //movdqa %xmm3,%xmm1 5771 102,15,114,209,8, //psrld $0x8,%xmm1 5772 102,15,219,200, //pand %xmm0,%xmm1 5773 102,15,111,211, //movdqa %xmm3,%xmm2 5774 102,15,114,210,16, //psrld $0x10,%xmm2 5775 102,15,219,208, //pand %xmm0,%xmm2 5776 102,15,219,195, //pand %xmm3,%xmm0 5777 15,91,192, //cvtdq2ps %xmm0,%xmm0 5778 184,129,128,128,59, //mov $0x3b808081,%eax 5779 102,68,15,110,192, //movd %eax,%xmm8 5780 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 5781 65,15,89,192, //mulps %xmm8,%xmm0 5782 15,91,201, //cvtdq2ps %xmm1,%xmm1 5783 65,15,89,200, //mulps %xmm8,%xmm1 5784 15,91,210, //cvtdq2ps %xmm2,%xmm2 5785 65,15,89,208, //mulps %xmm8,%xmm2 5786 102,15,114,211,24, //psrld $0x18,%xmm3 5787 15,91,219, //cvtdq2ps %xmm3,%xmm3 5788 65,15,89,216, //mulps %xmm8,%xmm3 5789 72,173, //lods %ds:(%rsi),%rax 5790 255,224, //jmpq *%rax 5791 }; 5792 5793 CODE const uint8_t sk_store_8888_sse41[] = { 5794 72,173, //lods %ds:(%rsi),%rax 5795 72,139,0, //mov (%rax),%rax 5796 185,0,0,127,67, //mov $0x437f0000,%ecx 5797 102,68,15,110,193, //movd %ecx,%xmm8 5798 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 5799 69,15,40,200, //movaps %xmm8,%xmm9 5800 68,15,89,200, //mulps %xmm0,%xmm9 5801 102,69,15,91,201, //cvtps2dq %xmm9,%xmm9 5802 69,15,40,208, //movaps %xmm8,%xmm10 5803 68,15,89,209, //mulps %xmm1,%xmm10 5804 102,69,15,91,210, //cvtps2dq %xmm10,%xmm10 5805 102,65,15,114,242,8, //pslld $0x8,%xmm10 5806 102,69,15,235,209, //por %xmm9,%xmm10 5807 69,15,40,200, //movaps %xmm8,%xmm9 5808 68,15,89,202, //mulps %xmm2,%xmm9 5809 102,69,15,91,201, //cvtps2dq %xmm9,%xmm9 5810 102,65,15,114,241,16, //pslld $0x10,%xmm9 5811 68,15,89,195, //mulps %xmm3,%xmm8 5812 102,69,15,91,192, //cvtps2dq %xmm8,%xmm8 5813 102,65,15,114,240,24, //pslld $0x18,%xmm8 5814 102,69,15,235,193, //por %xmm9,%xmm8 5815 102,69,15,235,194, //por %xmm10,%xmm8 5816 243,68,15,127,4,184, //movdqu %xmm8,(%rax,%rdi,4) 5817 72,173, //lods %ds:(%rsi),%rax 5818 255,224, //jmpq *%rax 5819 }; 5820 5821 CODE const uint8_t sk_load_f16_sse41[] = { 5822 72,173, //lods %ds:(%rsi),%rax 5823 72,139,0, //mov (%rax),%rax 5824 243,15,111,4,248, //movdqu (%rax,%rdi,8),%xmm0 5825 243,15,111,76,248,16, //movdqu 0x10(%rax,%rdi,8),%xmm1 5826 102,15,111,208, //movdqa %xmm0,%xmm2 5827 102,15,97,209, //punpcklwd %xmm1,%xmm2 5828 102,15,105,193, //punpckhwd %xmm1,%xmm0 5829 102,68,15,111,194, //movdqa %xmm2,%xmm8 5830 102,68,15,97,192, //punpcklwd %xmm0,%xmm8 5831 102,15,105,208, //punpckhwd %xmm0,%xmm2 5832 184,0,4,0,4, //mov $0x4000400,%eax 5833 102,15,110,192, //movd %eax,%xmm0 5834 102,15,112,216,0, //pshufd $0x0,%xmm0,%xmm3 5835 102,15,111,203, //movdqa %xmm3,%xmm1 5836 102,65,15,101,200, //pcmpgtw %xmm8,%xmm1 5837 102,65,15,223,200, //pandn %xmm8,%xmm1 5838 102,15,101,218, //pcmpgtw %xmm2,%xmm3 5839 102,15,223,218, //pandn %xmm2,%xmm3 5840 102,15,56,51,193, //pmovzxwd %xmm1,%xmm0 5841 102,15,114,240,13, //pslld $0xd,%xmm0 5842 184,0,0,128,119, //mov $0x77800000,%eax 5843 102,15,110,208, //movd %eax,%xmm2 5844 102,68,15,112,194,0, //pshufd $0x0,%xmm2,%xmm8 5845 65,15,89,192, //mulps %xmm8,%xmm0 5846 102,69,15,239,201, //pxor %xmm9,%xmm9 5847 102,65,15,105,201, //punpckhwd %xmm9,%xmm1 5848 102,15,114,241,13, //pslld $0xd,%xmm1 5849 65,15,89,200, //mulps %xmm8,%xmm1 5850 102,15,56,51,211, //pmovzxwd %xmm3,%xmm2 5851 102,15,114,242,13, //pslld $0xd,%xmm2 5852 65,15,89,208, //mulps %xmm8,%xmm2 5853 102,65,15,105,217, //punpckhwd %xmm9,%xmm3 5854 102,15,114,243,13, //pslld $0xd,%xmm3 5855 65,15,89,216, //mulps %xmm8,%xmm3 5856 72,173, //lods %ds:(%rsi),%rax 5857 255,224, //jmpq *%rax 5858 }; 5859 5860 CODE const uint8_t sk_store_f16_sse41[] = { 5861 72,173, //lods %ds:(%rsi),%rax 5862 72,139,0, //mov (%rax),%rax 5863 185,0,0,128,7, //mov $0x7800000,%ecx 5864 102,68,15,110,193, //movd %ecx,%xmm8 5865 102,69,15,112,192,0, //pshufd $0x0,%xmm8,%xmm8 5866 102,69,15,111,200, //movdqa %xmm8,%xmm9 5867 68,15,89,200, //mulps %xmm0,%xmm9 5868 102,65,15,114,209,13, //psrld $0xd,%xmm9 5869 102,69,15,111,208, //movdqa %xmm8,%xmm10 5870 68,15,89,209, //mulps %xmm1,%xmm10 5871 102,65,15,114,210,13, //psrld $0xd,%xmm10 5872 102,69,15,111,216, //movdqa %xmm8,%xmm11 5873 68,15,89,218, //mulps %xmm2,%xmm11 5874 102,65,15,114,211,13, //psrld $0xd,%xmm11 5875 68,15,89,195, //mulps %xmm3,%xmm8 5876 102,65,15,114,208,13, //psrld $0xd,%xmm8 5877 102,65,15,115,250,2, //pslldq $0x2,%xmm10 5878 102,69,15,235,209, //por %xmm9,%xmm10 5879 102,65,15,115,248,2, //pslldq $0x2,%xmm8 5880 102,69,15,235,195, //por %xmm11,%xmm8 5881 102,69,15,111,202, //movdqa %xmm10,%xmm9 5882 102,69,15,98,200, //punpckldq %xmm8,%xmm9 5883 243,68,15,127,12,248, //movdqu %xmm9,(%rax,%rdi,8) 5884 102,69,15,106,208, //punpckhdq %xmm8,%xmm10 5885 243,68,15,127,84,248,16, //movdqu %xmm10,0x10(%rax,%rdi,8) 5886 72,173, //lods %ds:(%rsi),%rax 5887 255,224, //jmpq *%rax 5888 }; 5889 5890 CODE const uint8_t sk_store_f32_sse41[] = { 5891 72,173, //lods %ds:(%rsi),%rax 5892 72,139,0, //mov (%rax),%rax 5893 72,137,249, //mov %rdi,%rcx 5894 72,193,225,4, //shl $0x4,%rcx 5895 68,15,40,192, //movaps %xmm0,%xmm8 5896 68,15,40,200, //movaps %xmm0,%xmm9 5897 68,15,20,201, //unpcklps %xmm1,%xmm9 5898 68,15,40,210, //movaps %xmm2,%xmm10 5899 68,15,40,218, //movaps %xmm2,%xmm11 5900 68,15,20,219, //unpcklps %xmm3,%xmm11 5901 68,15,21,193, //unpckhps %xmm1,%xmm8 5902 68,15,21,211, //unpckhps %xmm3,%xmm10 5903 69,15,40,225, //movaps %xmm9,%xmm12 5904 102,69,15,20,227, //unpcklpd %xmm11,%xmm12 5905 69,15,18,217, //movhlps %xmm9,%xmm11 5906 69,15,40,200, //movaps %xmm8,%xmm9 5907 102,69,15,20,202, //unpcklpd %xmm10,%xmm9 5908 69,15,18,208, //movhlps %xmm8,%xmm10 5909 102,68,15,17,36,8, //movupd %xmm12,(%rax,%rcx,1) 5910 68,15,17,92,8,16, //movups %xmm11,0x10(%rax,%rcx,1) 5911 102,68,15,17,76,8,32, //movupd %xmm9,0x20(%rax,%rcx,1) 5912 68,15,17,84,8,48, //movups %xmm10,0x30(%rax,%rcx,1) 5913 72,173, //lods %ds:(%rsi),%rax 5914 255,224, //jmpq *%rax 5915 }; 5916 5917 CODE const uint8_t sk_clamp_x_sse41[] = { 5918 72,173, //lods %ds:(%rsi),%rax 5919 69,15,87,192, //xorps %xmm8,%xmm8 5920 68,15,95,192, //maxps %xmm0,%xmm8 5921 243,68,15,16,8, //movss (%rax),%xmm9 5922 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 5923 102,15,118,192, //pcmpeqd %xmm0,%xmm0 5924 102,65,15,254,193, //paddd %xmm9,%xmm0 5925 68,15,93,192, //minps %xmm0,%xmm8 5926 72,173, //lods %ds:(%rsi),%rax 5927 65,15,40,192, //movaps %xmm8,%xmm0 5928 255,224, //jmpq *%rax 5929 }; 5930 5931 CODE const uint8_t sk_clamp_y_sse41[] = { 5932 72,173, //lods %ds:(%rsi),%rax 5933 69,15,87,192, //xorps %xmm8,%xmm8 5934 68,15,95,193, //maxps %xmm1,%xmm8 5935 243,68,15,16,8, //movss (%rax),%xmm9 5936 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 5937 102,15,118,201, //pcmpeqd %xmm1,%xmm1 5938 102,65,15,254,201, //paddd %xmm9,%xmm1 5939 68,15,93,193, //minps %xmm1,%xmm8 5940 72,173, //lods %ds:(%rsi),%rax 5941 65,15,40,200, //movaps %xmm8,%xmm1 5942 255,224, //jmpq *%rax 5943 }; 5944 5945 CODE const uint8_t sk_repeat_x_sse41[] = { 5946 72,173, //lods %ds:(%rsi),%rax 5947 243,68,15,16,0, //movss (%rax),%xmm8 5948 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 5949 68,15,40,200, //movaps %xmm0,%xmm9 5950 69,15,94,200, //divps %xmm8,%xmm9 5951 102,69,15,58,8,201,1, //roundps $0x1,%xmm9,%xmm9 5952 69,15,89,200, //mulps %xmm8,%xmm9 5953 65,15,92,193, //subps %xmm9,%xmm0 5954 102,69,15,118,201, //pcmpeqd %xmm9,%xmm9 5955 102,69,15,254,200, //paddd %xmm8,%xmm9 5956 65,15,93,193, //minps %xmm9,%xmm0 5957 72,173, //lods %ds:(%rsi),%rax 5958 255,224, //jmpq *%rax 5959 }; 5960 5961 CODE const uint8_t sk_repeat_y_sse41[] = { 5962 72,173, //lods %ds:(%rsi),%rax 5963 243,68,15,16,0, //movss (%rax),%xmm8 5964 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 5965 68,15,40,201, //movaps %xmm1,%xmm9 5966 69,15,94,200, //divps %xmm8,%xmm9 5967 102,69,15,58,8,201,1, //roundps $0x1,%xmm9,%xmm9 5968 69,15,89,200, //mulps %xmm8,%xmm9 5969 65,15,92,201, //subps %xmm9,%xmm1 5970 102,69,15,118,201, //pcmpeqd %xmm9,%xmm9 5971 102,69,15,254,200, //paddd %xmm8,%xmm9 5972 65,15,93,201, //minps %xmm9,%xmm1 5973 72,173, //lods %ds:(%rsi),%rax 5974 255,224, //jmpq *%rax 5975 }; 5976 5977 CODE const uint8_t sk_mirror_x_sse41[] = { 5978 72,173, //lods %ds:(%rsi),%rax 5979 243,68,15,16,0, //movss (%rax),%xmm8 5980 69,15,40,200, //movaps %xmm8,%xmm9 5981 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 5982 65,15,92,193, //subps %xmm9,%xmm0 5983 243,69,15,88,192, //addss %xmm8,%xmm8 5984 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 5985 68,15,40,208, //movaps %xmm0,%xmm10 5986 69,15,94,208, //divps %xmm8,%xmm10 5987 102,69,15,58,8,210,1, //roundps $0x1,%xmm10,%xmm10 5988 69,15,89,208, //mulps %xmm8,%xmm10 5989 65,15,92,194, //subps %xmm10,%xmm0 5990 65,15,92,193, //subps %xmm9,%xmm0 5991 69,15,87,192, //xorps %xmm8,%xmm8 5992 68,15,92,192, //subps %xmm0,%xmm8 5993 65,15,84,192, //andps %xmm8,%xmm0 5994 102,69,15,118,192, //pcmpeqd %xmm8,%xmm8 5995 102,69,15,254,193, //paddd %xmm9,%xmm8 5996 65,15,93,192, //minps %xmm8,%xmm0 5997 72,173, //lods %ds:(%rsi),%rax 5998 255,224, //jmpq *%rax 5999 }; 6000 6001 CODE const uint8_t sk_mirror_y_sse41[] = { 6002 72,173, //lods %ds:(%rsi),%rax 6003 243,68,15,16,0, //movss (%rax),%xmm8 6004 69,15,40,200, //movaps %xmm8,%xmm9 6005 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 6006 65,15,92,201, //subps %xmm9,%xmm1 6007 243,69,15,88,192, //addss %xmm8,%xmm8 6008 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 6009 68,15,40,209, //movaps %xmm1,%xmm10 6010 69,15,94,208, //divps %xmm8,%xmm10 6011 102,69,15,58,8,210,1, //roundps $0x1,%xmm10,%xmm10 6012 69,15,89,208, //mulps %xmm8,%xmm10 6013 65,15,92,202, //subps %xmm10,%xmm1 6014 65,15,92,201, //subps %xmm9,%xmm1 6015 69,15,87,192, //xorps %xmm8,%xmm8 6016 68,15,92,193, //subps %xmm1,%xmm8 6017 65,15,84,200, //andps %xmm8,%xmm1 6018 102,69,15,118,192, //pcmpeqd %xmm8,%xmm8 6019 102,69,15,254,193, //paddd %xmm9,%xmm8 6020 65,15,93,200, //minps %xmm8,%xmm1 6021 72,173, //lods %ds:(%rsi),%rax 6022 255,224, //jmpq *%rax 6023 }; 6024 6025 CODE const uint8_t sk_luminance_to_alpha_sse41[] = { 6026 184,208,179,89,62, //mov $0x3e59b3d0,%eax 6027 102,15,110,216, //movd %eax,%xmm3 6028 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 6029 15,89,216, //mulps %xmm0,%xmm3 6030 184,89,23,55,63, //mov $0x3f371759,%eax 6031 102,15,110,192, //movd %eax,%xmm0 6032 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 6033 15,89,193, //mulps %xmm1,%xmm0 6034 15,88,195, //addps %xmm3,%xmm0 6035 184,152,221,147,61, //mov $0x3d93dd98,%eax 6036 102,15,110,216, //movd %eax,%xmm3 6037 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 6038 15,89,218, //mulps %xmm2,%xmm3 6039 15,88,216, //addps %xmm0,%xmm3 6040 72,173, //lods %ds:(%rsi),%rax 6041 15,87,192, //xorps %xmm0,%xmm0 6042 15,87,201, //xorps %xmm1,%xmm1 6043 15,87,210, //xorps %xmm2,%xmm2 6044 255,224, //jmpq *%rax 6045 }; 6046 6047 CODE const uint8_t sk_matrix_2x3_sse41[] = { 6048 68,15,40,201, //movaps %xmm1,%xmm9 6049 68,15,40,192, //movaps %xmm0,%xmm8 6050 72,173, //lods %ds:(%rsi),%rax 6051 243,15,16,0, //movss (%rax),%xmm0 6052 243,15,16,72,4, //movss 0x4(%rax),%xmm1 6053 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 6054 243,68,15,16,80,8, //movss 0x8(%rax),%xmm10 6055 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 6056 243,68,15,16,88,16, //movss 0x10(%rax),%xmm11 6057 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 6058 69,15,89,209, //mulps %xmm9,%xmm10 6059 69,15,88,211, //addps %xmm11,%xmm10 6060 65,15,89,192, //mulps %xmm8,%xmm0 6061 65,15,88,194, //addps %xmm10,%xmm0 6062 15,198,201,0, //shufps $0x0,%xmm1,%xmm1 6063 243,68,15,16,80,12, //movss 0xc(%rax),%xmm10 6064 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 6065 243,68,15,16,88,20, //movss 0x14(%rax),%xmm11 6066 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 6067 69,15,89,209, //mulps %xmm9,%xmm10 6068 69,15,88,211, //addps %xmm11,%xmm10 6069 65,15,89,200, //mulps %xmm8,%xmm1 6070 65,15,88,202, //addps %xmm10,%xmm1 6071 72,173, //lods %ds:(%rsi),%rax 6072 255,224, //jmpq *%rax 6073 }; 6074 6075 CODE const uint8_t sk_matrix_3x4_sse41[] = { 6076 68,15,40,201, //movaps %xmm1,%xmm9 6077 68,15,40,192, //movaps %xmm0,%xmm8 6078 72,173, //lods %ds:(%rsi),%rax 6079 243,15,16,0, //movss (%rax),%xmm0 6080 243,15,16,72,4, //movss 0x4(%rax),%xmm1 6081 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 6082 243,68,15,16,80,12, //movss 0xc(%rax),%xmm10 6083 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 6084 243,68,15,16,88,24, //movss 0x18(%rax),%xmm11 6085 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 6086 243,68,15,16,96,36, //movss 0x24(%rax),%xmm12 6087 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 6088 68,15,89,218, //mulps %xmm2,%xmm11 6089 69,15,88,220, //addps %xmm12,%xmm11 6090 69,15,89,209, //mulps %xmm9,%xmm10 6091 69,15,88,211, //addps %xmm11,%xmm10 6092 65,15,89,192, //mulps %xmm8,%xmm0 6093 65,15,88,194, //addps %xmm10,%xmm0 6094 15,198,201,0, //shufps $0x0,%xmm1,%xmm1 6095 243,68,15,16,80,16, //movss 0x10(%rax),%xmm10 6096 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 6097 243,68,15,16,88,28, //movss 0x1c(%rax),%xmm11 6098 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 6099 243,68,15,16,96,40, //movss 0x28(%rax),%xmm12 6100 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 6101 68,15,89,218, //mulps %xmm2,%xmm11 6102 69,15,88,220, //addps %xmm12,%xmm11 6103 69,15,89,209, //mulps %xmm9,%xmm10 6104 69,15,88,211, //addps %xmm11,%xmm10 6105 65,15,89,200, //mulps %xmm8,%xmm1 6106 65,15,88,202, //addps %xmm10,%xmm1 6107 243,68,15,16,80,8, //movss 0x8(%rax),%xmm10 6108 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 6109 243,68,15,16,88,20, //movss 0x14(%rax),%xmm11 6110 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 6111 243,68,15,16,96,32, //movss 0x20(%rax),%xmm12 6112 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 6113 243,68,15,16,104,44, //movss 0x2c(%rax),%xmm13 6114 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13 6115 68,15,89,226, //mulps %xmm2,%xmm12 6116 69,15,88,229, //addps %xmm13,%xmm12 6117 69,15,89,217, //mulps %xmm9,%xmm11 6118 69,15,88,220, //addps %xmm12,%xmm11 6119 69,15,89,208, //mulps %xmm8,%xmm10 6120 69,15,88,211, //addps %xmm11,%xmm10 6121 72,173, //lods %ds:(%rsi),%rax 6122 65,15,40,210, //movaps %xmm10,%xmm2 6123 255,224, //jmpq *%rax 6124 }; 6125 6126 CODE const uint8_t sk_matrix_4x5_sse41[] = { 6127 68,15,40,201, //movaps %xmm1,%xmm9 6128 68,15,40,192, //movaps %xmm0,%xmm8 6129 72,173, //lods %ds:(%rsi),%rax 6130 243,15,16,0, //movss (%rax),%xmm0 6131 243,15,16,72,4, //movss 0x4(%rax),%xmm1 6132 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 6133 243,68,15,16,80,16, //movss 0x10(%rax),%xmm10 6134 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 6135 243,68,15,16,88,32, //movss 0x20(%rax),%xmm11 6136 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 6137 243,68,15,16,96,48, //movss 0x30(%rax),%xmm12 6138 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 6139 243,68,15,16,104,64, //movss 0x40(%rax),%xmm13 6140 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13 6141 68,15,89,227, //mulps %xmm3,%xmm12 6142 69,15,88,229, //addps %xmm13,%xmm12 6143 68,15,89,218, //mulps %xmm2,%xmm11 6144 69,15,88,220, //addps %xmm12,%xmm11 6145 69,15,89,209, //mulps %xmm9,%xmm10 6146 69,15,88,211, //addps %xmm11,%xmm10 6147 65,15,89,192, //mulps %xmm8,%xmm0 6148 65,15,88,194, //addps %xmm10,%xmm0 6149 15,198,201,0, //shufps $0x0,%xmm1,%xmm1 6150 243,68,15,16,80,20, //movss 0x14(%rax),%xmm10 6151 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 6152 243,68,15,16,88,36, //movss 0x24(%rax),%xmm11 6153 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 6154 243,68,15,16,96,52, //movss 0x34(%rax),%xmm12 6155 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 6156 243,68,15,16,104,68, //movss 0x44(%rax),%xmm13 6157 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13 6158 68,15,89,227, //mulps %xmm3,%xmm12 6159 69,15,88,229, //addps %xmm13,%xmm12 6160 68,15,89,218, //mulps %xmm2,%xmm11 6161 69,15,88,220, //addps %xmm12,%xmm11 6162 69,15,89,209, //mulps %xmm9,%xmm10 6163 69,15,88,211, //addps %xmm11,%xmm10 6164 65,15,89,200, //mulps %xmm8,%xmm1 6165 65,15,88,202, //addps %xmm10,%xmm1 6166 243,68,15,16,80,8, //movss 0x8(%rax),%xmm10 6167 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 6168 243,68,15,16,88,24, //movss 0x18(%rax),%xmm11 6169 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 6170 243,68,15,16,96,40, //movss 0x28(%rax),%xmm12 6171 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 6172 243,68,15,16,104,56, //movss 0x38(%rax),%xmm13 6173 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13 6174 243,68,15,16,112,72, //movss 0x48(%rax),%xmm14 6175 69,15,198,246,0, //shufps $0x0,%xmm14,%xmm14 6176 68,15,89,235, //mulps %xmm3,%xmm13 6177 69,15,88,238, //addps %xmm14,%xmm13 6178 68,15,89,226, //mulps %xmm2,%xmm12 6179 69,15,88,229, //addps %xmm13,%xmm12 6180 69,15,89,217, //mulps %xmm9,%xmm11 6181 69,15,88,220, //addps %xmm12,%xmm11 6182 69,15,89,208, //mulps %xmm8,%xmm10 6183 69,15,88,211, //addps %xmm11,%xmm10 6184 243,68,15,16,88,12, //movss 0xc(%rax),%xmm11 6185 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 6186 243,68,15,16,96,28, //movss 0x1c(%rax),%xmm12 6187 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 6188 243,68,15,16,104,44, //movss 0x2c(%rax),%xmm13 6189 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13 6190 243,68,15,16,112,60, //movss 0x3c(%rax),%xmm14 6191 69,15,198,246,0, //shufps $0x0,%xmm14,%xmm14 6192 243,68,15,16,120,76, //movss 0x4c(%rax),%xmm15 6193 69,15,198,255,0, //shufps $0x0,%xmm15,%xmm15 6194 68,15,89,243, //mulps %xmm3,%xmm14 6195 69,15,88,247, //addps %xmm15,%xmm14 6196 68,15,89,234, //mulps %xmm2,%xmm13 6197 69,15,88,238, //addps %xmm14,%xmm13 6198 69,15,89,225, //mulps %xmm9,%xmm12 6199 69,15,88,229, //addps %xmm13,%xmm12 6200 69,15,89,216, //mulps %xmm8,%xmm11 6201 69,15,88,220, //addps %xmm12,%xmm11 6202 72,173, //lods %ds:(%rsi),%rax 6203 65,15,40,210, //movaps %xmm10,%xmm2 6204 65,15,40,219, //movaps %xmm11,%xmm3 6205 255,224, //jmpq *%rax 6206 }; 6207 6208 CODE const uint8_t sk_matrix_perspective_sse41[] = { 6209 68,15,40,192, //movaps %xmm0,%xmm8 6210 72,173, //lods %ds:(%rsi),%rax 6211 243,15,16,0, //movss (%rax),%xmm0 6212 243,68,15,16,72,4, //movss 0x4(%rax),%xmm9 6213 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 6214 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 6215 243,68,15,16,80,8, //movss 0x8(%rax),%xmm10 6216 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 6217 68,15,89,201, //mulps %xmm1,%xmm9 6218 69,15,88,202, //addps %xmm10,%xmm9 6219 65,15,89,192, //mulps %xmm8,%xmm0 6220 65,15,88,193, //addps %xmm9,%xmm0 6221 243,68,15,16,72,12, //movss 0xc(%rax),%xmm9 6222 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 6223 243,68,15,16,80,16, //movss 0x10(%rax),%xmm10 6224 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 6225 243,68,15,16,88,20, //movss 0x14(%rax),%xmm11 6226 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 6227 68,15,89,209, //mulps %xmm1,%xmm10 6228 69,15,88,211, //addps %xmm11,%xmm10 6229 69,15,89,200, //mulps %xmm8,%xmm9 6230 69,15,88,202, //addps %xmm10,%xmm9 6231 243,68,15,16,80,24, //movss 0x18(%rax),%xmm10 6232 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 6233 243,68,15,16,88,28, //movss 0x1c(%rax),%xmm11 6234 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 6235 243,68,15,16,96,32, //movss 0x20(%rax),%xmm12 6236 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 6237 68,15,89,217, //mulps %xmm1,%xmm11 6238 69,15,88,220, //addps %xmm12,%xmm11 6239 69,15,89,208, //mulps %xmm8,%xmm10 6240 69,15,88,211, //addps %xmm11,%xmm10 6241 65,15,83,202, //rcpps %xmm10,%xmm1 6242 15,89,193, //mulps %xmm1,%xmm0 6243 68,15,89,201, //mulps %xmm1,%xmm9 6244 72,173, //lods %ds:(%rsi),%rax 6245 65,15,40,201, //movaps %xmm9,%xmm1 6246 255,224, //jmpq *%rax 6247 }; 6248 6249 CODE const uint8_t sk_linear_gradient_2stops_sse41[] = { 6250 72,173, //lods %ds:(%rsi),%rax 6251 68,15,16,8, //movups (%rax),%xmm9 6252 15,16,88,16, //movups 0x10(%rax),%xmm3 6253 68,15,40,195, //movaps %xmm3,%xmm8 6254 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 6255 65,15,40,201, //movaps %xmm9,%xmm1 6256 15,198,201,0, //shufps $0x0,%xmm1,%xmm1 6257 68,15,89,192, //mulps %xmm0,%xmm8 6258 68,15,88,193, //addps %xmm1,%xmm8 6259 15,40,203, //movaps %xmm3,%xmm1 6260 15,198,201,85, //shufps $0x55,%xmm1,%xmm1 6261 65,15,40,209, //movaps %xmm9,%xmm2 6262 15,198,210,85, //shufps $0x55,%xmm2,%xmm2 6263 15,89,200, //mulps %xmm0,%xmm1 6264 15,88,202, //addps %xmm2,%xmm1 6265 15,40,211, //movaps %xmm3,%xmm2 6266 15,198,210,170, //shufps $0xaa,%xmm2,%xmm2 6267 69,15,40,209, //movaps %xmm9,%xmm10 6268 69,15,198,210,170, //shufps $0xaa,%xmm10,%xmm10 6269 15,89,208, //mulps %xmm0,%xmm2 6270 65,15,88,210, //addps %xmm10,%xmm2 6271 15,198,219,255, //shufps $0xff,%xmm3,%xmm3 6272 69,15,198,201,255, //shufps $0xff,%xmm9,%xmm9 6273 15,89,216, //mulps %xmm0,%xmm3 6274 65,15,88,217, //addps %xmm9,%xmm3 6275 72,173, //lods %ds:(%rsi),%rax 6276 65,15,40,192, //movaps %xmm8,%xmm0 6277 255,224, //jmpq *%rax 6278 }; 6279 6280 CODE const uint8_t sk_start_pipeline_sse2[] = { 6281 65,87, //push %r15 6282 65,86, //push %r14 6283 65,85, //push %r13 6284 65,84, //push %r12 6285 83, //push %rbx 6286 73,137,207, //mov %rcx,%r15 6287 73,137,214, //mov %rdx,%r14 6288 72,137,251, //mov %rdi,%rbx 6289 72,173, //lods %ds:(%rsi),%rax 6290 73,137,196, //mov %rax,%r12 6291 73,137,245, //mov %rsi,%r13 6292 72,141,67,4, //lea 0x4(%rbx),%rax 6293 76,57,248, //cmp %r15,%rax 6294 118,5, //jbe 28 <_sk_start_pipeline_sse2+0x28> 6295 72,137,216, //mov %rbx,%rax 6296 235,52, //jmp 5c <_sk_start_pipeline_sse2+0x5c> 6297 15,87,192, //xorps %xmm0,%xmm0 6298 15,87,201, //xorps %xmm1,%xmm1 6299 15,87,210, //xorps %xmm2,%xmm2 6300 15,87,219, //xorps %xmm3,%xmm3 6301 15,87,228, //xorps %xmm4,%xmm4 6302 15,87,237, //xorps %xmm5,%xmm5 6303 15,87,246, //xorps %xmm6,%xmm6 6304 15,87,255, //xorps %xmm7,%xmm7 6305 72,137,223, //mov %rbx,%rdi 6306 76,137,238, //mov %r13,%rsi 6307 76,137,242, //mov %r14,%rdx 6308 65,255,212, //callq *%r12 6309 72,141,67,4, //lea 0x4(%rbx),%rax 6310 72,131,195,8, //add $0x8,%rbx 6311 76,57,251, //cmp %r15,%rbx 6312 72,137,195, //mov %rax,%rbx 6313 118,204, //jbe 28 <_sk_start_pipeline_sse2+0x28> 6314 91, //pop %rbx 6315 65,92, //pop %r12 6316 65,93, //pop %r13 6317 65,94, //pop %r14 6318 65,95, //pop %r15 6319 195, //retq 6320 }; 6321 6322 CODE const uint8_t sk_just_return_sse2[] = { 6323 195, //retq 6324 }; 6325 6326 CODE const uint8_t sk_seed_shader_sse2[] = { 6327 72,173, //lods %ds:(%rsi),%rax 6328 102,15,110,199, //movd %edi,%xmm0 6329 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0 6330 15,91,200, //cvtdq2ps %xmm0,%xmm1 6331 185,0,0,0,63, //mov $0x3f000000,%ecx 6332 102,15,110,209, //movd %ecx,%xmm2 6333 15,198,210,0, //shufps $0x0,%xmm2,%xmm2 6334 15,88,202, //addps %xmm2,%xmm1 6335 15,16,2, //movups (%rdx),%xmm0 6336 15,88,193, //addps %xmm1,%xmm0 6337 102,15,110,8, //movd (%rax),%xmm1 6338 102,15,112,201,0, //pshufd $0x0,%xmm1,%xmm1 6339 15,91,201, //cvtdq2ps %xmm1,%xmm1 6340 15,88,202, //addps %xmm2,%xmm1 6341 184,0,0,128,63, //mov $0x3f800000,%eax 6342 102,15,110,208, //movd %eax,%xmm2 6343 15,198,210,0, //shufps $0x0,%xmm2,%xmm2 6344 72,173, //lods %ds:(%rsi),%rax 6345 15,87,219, //xorps %xmm3,%xmm3 6346 15,87,228, //xorps %xmm4,%xmm4 6347 15,87,237, //xorps %xmm5,%xmm5 6348 15,87,246, //xorps %xmm6,%xmm6 6349 15,87,255, //xorps %xmm7,%xmm7 6350 255,224, //jmpq *%rax 6351 }; 6352 6353 CODE const uint8_t sk_constant_color_sse2[] = { 6354 72,173, //lods %ds:(%rsi),%rax 6355 15,16,24, //movups (%rax),%xmm3 6356 15,40,195, //movaps %xmm3,%xmm0 6357 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 6358 15,40,203, //movaps %xmm3,%xmm1 6359 15,198,201,85, //shufps $0x55,%xmm1,%xmm1 6360 15,40,211, //movaps %xmm3,%xmm2 6361 15,198,210,170, //shufps $0xaa,%xmm2,%xmm2 6362 15,198,219,255, //shufps $0xff,%xmm3,%xmm3 6363 72,173, //lods %ds:(%rsi),%rax 6364 255,224, //jmpq *%rax 6365 }; 6366 6367 CODE const uint8_t sk_clear_sse2[] = { 6368 72,173, //lods %ds:(%rsi),%rax 6369 15,87,192, //xorps %xmm0,%xmm0 6370 15,87,201, //xorps %xmm1,%xmm1 6371 15,87,210, //xorps %xmm2,%xmm2 6372 15,87,219, //xorps %xmm3,%xmm3 6373 255,224, //jmpq *%rax 6374 }; 6375 6376 CODE const uint8_t sk_plus__sse2[] = { 6377 15,88,196, //addps %xmm4,%xmm0 6378 15,88,205, //addps %xmm5,%xmm1 6379 15,88,214, //addps %xmm6,%xmm2 6380 15,88,223, //addps %xmm7,%xmm3 6381 72,173, //lods %ds:(%rsi),%rax 6382 255,224, //jmpq *%rax 6383 }; 6384 6385 CODE const uint8_t sk_srcover_sse2[] = { 6386 184,0,0,128,63, //mov $0x3f800000,%eax 6387 102,68,15,110,192, //movd %eax,%xmm8 6388 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 6389 68,15,92,195, //subps %xmm3,%xmm8 6390 69,15,40,200, //movaps %xmm8,%xmm9 6391 68,15,89,204, //mulps %xmm4,%xmm9 6392 65,15,88,193, //addps %xmm9,%xmm0 6393 69,15,40,200, //movaps %xmm8,%xmm9 6394 68,15,89,205, //mulps %xmm5,%xmm9 6395 65,15,88,201, //addps %xmm9,%xmm1 6396 69,15,40,200, //movaps %xmm8,%xmm9 6397 68,15,89,206, //mulps %xmm6,%xmm9 6398 65,15,88,209, //addps %xmm9,%xmm2 6399 68,15,89,199, //mulps %xmm7,%xmm8 6400 65,15,88,216, //addps %xmm8,%xmm3 6401 72,173, //lods %ds:(%rsi),%rax 6402 255,224, //jmpq *%rax 6403 }; 6404 6405 CODE const uint8_t sk_dstover_sse2[] = { 6406 184,0,0,128,63, //mov $0x3f800000,%eax 6407 102,68,15,110,192, //movd %eax,%xmm8 6408 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 6409 68,15,92,199, //subps %xmm7,%xmm8 6410 65,15,89,192, //mulps %xmm8,%xmm0 6411 15,88,196, //addps %xmm4,%xmm0 6412 65,15,89,200, //mulps %xmm8,%xmm1 6413 15,88,205, //addps %xmm5,%xmm1 6414 65,15,89,208, //mulps %xmm8,%xmm2 6415 15,88,214, //addps %xmm6,%xmm2 6416 65,15,89,216, //mulps %xmm8,%xmm3 6417 15,88,223, //addps %xmm7,%xmm3 6418 72,173, //lods %ds:(%rsi),%rax 6419 255,224, //jmpq *%rax 6420 }; 6421 6422 CODE const uint8_t sk_clamp_0_sse2[] = { 6423 69,15,87,192, //xorps %xmm8,%xmm8 6424 65,15,95,192, //maxps %xmm8,%xmm0 6425 65,15,95,200, //maxps %xmm8,%xmm1 6426 65,15,95,208, //maxps %xmm8,%xmm2 6427 65,15,95,216, //maxps %xmm8,%xmm3 6428 72,173, //lods %ds:(%rsi),%rax 6429 255,224, //jmpq *%rax 6430 }; 6431 6432 CODE const uint8_t sk_clamp_1_sse2[] = { 6433 184,0,0,128,63, //mov $0x3f800000,%eax 6434 102,68,15,110,192, //movd %eax,%xmm8 6435 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 6436 65,15,93,192, //minps %xmm8,%xmm0 6437 65,15,93,200, //minps %xmm8,%xmm1 6438 65,15,93,208, //minps %xmm8,%xmm2 6439 65,15,93,216, //minps %xmm8,%xmm3 6440 72,173, //lods %ds:(%rsi),%rax 6441 255,224, //jmpq *%rax 6442 }; 6443 6444 CODE const uint8_t sk_clamp_a_sse2[] = { 6445 184,0,0,128,63, //mov $0x3f800000,%eax 6446 102,68,15,110,192, //movd %eax,%xmm8 6447 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 6448 65,15,93,216, //minps %xmm8,%xmm3 6449 15,93,195, //minps %xmm3,%xmm0 6450 15,93,203, //minps %xmm3,%xmm1 6451 15,93,211, //minps %xmm3,%xmm2 6452 72,173, //lods %ds:(%rsi),%rax 6453 255,224, //jmpq *%rax 6454 }; 6455 6456 CODE const uint8_t sk_set_rgb_sse2[] = { 6457 72,173, //lods %ds:(%rsi),%rax 6458 243,15,16,0, //movss (%rax),%xmm0 6459 243,15,16,72,4, //movss 0x4(%rax),%xmm1 6460 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 6461 15,198,201,0, //shufps $0x0,%xmm1,%xmm1 6462 243,15,16,80,8, //movss 0x8(%rax),%xmm2 6463 15,198,210,0, //shufps $0x0,%xmm2,%xmm2 6464 72,173, //lods %ds:(%rsi),%rax 6465 255,224, //jmpq *%rax 6466 }; 6467 6468 CODE const uint8_t sk_swap_rb_sse2[] = { 6469 68,15,40,192, //movaps %xmm0,%xmm8 6470 72,173, //lods %ds:(%rsi),%rax 6471 15,40,194, //movaps %xmm2,%xmm0 6472 65,15,40,208, //movaps %xmm8,%xmm2 6473 255,224, //jmpq *%rax 6474 }; 6475 6476 CODE const uint8_t sk_swap_sse2[] = { 6477 68,15,40,195, //movaps %xmm3,%xmm8 6478 68,15,40,202, //movaps %xmm2,%xmm9 6479 68,15,40,209, //movaps %xmm1,%xmm10 6480 68,15,40,216, //movaps %xmm0,%xmm11 6481 72,173, //lods %ds:(%rsi),%rax 6482 15,40,196, //movaps %xmm4,%xmm0 6483 15,40,205, //movaps %xmm5,%xmm1 6484 15,40,214, //movaps %xmm6,%xmm2 6485 15,40,223, //movaps %xmm7,%xmm3 6486 65,15,40,227, //movaps %xmm11,%xmm4 6487 65,15,40,234, //movaps %xmm10,%xmm5 6488 65,15,40,241, //movaps %xmm9,%xmm6 6489 65,15,40,248, //movaps %xmm8,%xmm7 6490 255,224, //jmpq *%rax 6491 }; 6492 6493 CODE const uint8_t sk_move_src_dst_sse2[] = { 6494 72,173, //lods %ds:(%rsi),%rax 6495 15,40,224, //movaps %xmm0,%xmm4 6496 15,40,233, //movaps %xmm1,%xmm5 6497 15,40,242, //movaps %xmm2,%xmm6 6498 15,40,251, //movaps %xmm3,%xmm7 6499 255,224, //jmpq *%rax 6500 }; 6501 6502 CODE const uint8_t sk_move_dst_src_sse2[] = { 6503 72,173, //lods %ds:(%rsi),%rax 6504 15,40,196, //movaps %xmm4,%xmm0 6505 15,40,205, //movaps %xmm5,%xmm1 6506 15,40,214, //movaps %xmm6,%xmm2 6507 15,40,223, //movaps %xmm7,%xmm3 6508 255,224, //jmpq *%rax 6509 }; 6510 6511 CODE const uint8_t sk_premul_sse2[] = { 6512 15,89,195, //mulps %xmm3,%xmm0 6513 15,89,203, //mulps %xmm3,%xmm1 6514 15,89,211, //mulps %xmm3,%xmm2 6515 72,173, //lods %ds:(%rsi),%rax 6516 255,224, //jmpq *%rax 6517 }; 6518 6519 CODE const uint8_t sk_unpremul_sse2[] = { 6520 69,15,87,192, //xorps %xmm8,%xmm8 6521 184,0,0,128,63, //mov $0x3f800000,%eax 6522 102,68,15,110,200, //movd %eax,%xmm9 6523 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 6524 68,15,94,203, //divps %xmm3,%xmm9 6525 68,15,194,195,4, //cmpneqps %xmm3,%xmm8 6526 69,15,84,193, //andps %xmm9,%xmm8 6527 65,15,89,192, //mulps %xmm8,%xmm0 6528 65,15,89,200, //mulps %xmm8,%xmm1 6529 65,15,89,208, //mulps %xmm8,%xmm2 6530 72,173, //lods %ds:(%rsi),%rax 6531 255,224, //jmpq *%rax 6532 }; 6533 6534 CODE const uint8_t sk_from_srgb_sse2[] = { 6535 184,145,131,158,61, //mov $0x3d9e8391,%eax 6536 102,68,15,110,192, //movd %eax,%xmm8 6537 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 6538 69,15,40,232, //movaps %xmm8,%xmm13 6539 68,15,89,232, //mulps %xmm0,%xmm13 6540 68,15,40,224, //movaps %xmm0,%xmm12 6541 69,15,89,228, //mulps %xmm12,%xmm12 6542 184,154,153,153,62, //mov $0x3e99999a,%eax 6543 102,68,15,110,200, //movd %eax,%xmm9 6544 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 6545 184,92,143,50,63, //mov $0x3f328f5c,%eax 6546 102,68,15,110,208, //movd %eax,%xmm10 6547 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 6548 69,15,40,241, //movaps %xmm9,%xmm14 6549 68,15,89,240, //mulps %xmm0,%xmm14 6550 69,15,88,242, //addps %xmm10,%xmm14 6551 184,10,215,35,59, //mov $0x3b23d70a,%eax 6552 102,68,15,110,216, //movd %eax,%xmm11 6553 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 6554 69,15,89,244, //mulps %xmm12,%xmm14 6555 69,15,88,243, //addps %xmm11,%xmm14 6556 184,174,71,97,61, //mov $0x3d6147ae,%eax 6557 102,68,15,110,224, //movd %eax,%xmm12 6558 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 6559 65,15,194,196,1, //cmpltps %xmm12,%xmm0 6560 68,15,84,232, //andps %xmm0,%xmm13 6561 65,15,85,198, //andnps %xmm14,%xmm0 6562 65,15,86,197, //orps %xmm13,%xmm0 6563 69,15,40,232, //movaps %xmm8,%xmm13 6564 68,15,89,233, //mulps %xmm1,%xmm13 6565 68,15,40,241, //movaps %xmm1,%xmm14 6566 69,15,89,246, //mulps %xmm14,%xmm14 6567 69,15,40,249, //movaps %xmm9,%xmm15 6568 68,15,89,249, //mulps %xmm1,%xmm15 6569 69,15,88,250, //addps %xmm10,%xmm15 6570 69,15,89,254, //mulps %xmm14,%xmm15 6571 69,15,88,251, //addps %xmm11,%xmm15 6572 65,15,194,204,1, //cmpltps %xmm12,%xmm1 6573 68,15,84,233, //andps %xmm1,%xmm13 6574 65,15,85,207, //andnps %xmm15,%xmm1 6575 65,15,86,205, //orps %xmm13,%xmm1 6576 68,15,89,194, //mulps %xmm2,%xmm8 6577 68,15,40,234, //movaps %xmm2,%xmm13 6578 69,15,89,237, //mulps %xmm13,%xmm13 6579 68,15,89,202, //mulps %xmm2,%xmm9 6580 69,15,88,202, //addps %xmm10,%xmm9 6581 69,15,89,205, //mulps %xmm13,%xmm9 6582 69,15,88,203, //addps %xmm11,%xmm9 6583 65,15,194,212,1, //cmpltps %xmm12,%xmm2 6584 68,15,84,194, //andps %xmm2,%xmm8 6585 65,15,85,209, //andnps %xmm9,%xmm2 6586 65,15,86,208, //orps %xmm8,%xmm2 6587 72,173, //lods %ds:(%rsi),%rax 6588 255,224, //jmpq *%rax 6589 }; 6590 6591 CODE const uint8_t sk_to_srgb_sse2[] = { 6592 68,15,82,192, //rsqrtps %xmm0,%xmm8 6593 69,15,83,248, //rcpps %xmm8,%xmm15 6594 69,15,82,232, //rsqrtps %xmm8,%xmm13 6595 184,41,92,71,65, //mov $0x41475c29,%eax 6596 102,68,15,110,192, //movd %eax,%xmm8 6597 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 6598 69,15,40,240, //movaps %xmm8,%xmm14 6599 68,15,89,240, //mulps %xmm0,%xmm14 6600 184,0,0,128,63, //mov $0x3f800000,%eax 6601 102,68,15,110,200, //movd %eax,%xmm9 6602 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 6603 184,194,135,210,62, //mov $0x3ed287c2,%eax 6604 102,68,15,110,208, //movd %eax,%xmm10 6605 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 6606 184,206,111,48,63, //mov $0x3f306fce,%eax 6607 102,68,15,110,216, //movd %eax,%xmm11 6608 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 6609 184,168,87,202,61, //mov $0x3dca57a8,%eax 6610 53,0,0,0,128, //xor $0x80000000,%eax 6611 102,68,15,110,224, //movd %eax,%xmm12 6612 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 6613 69,15,89,251, //mulps %xmm11,%xmm15 6614 69,15,88,252, //addps %xmm12,%xmm15 6615 69,15,89,234, //mulps %xmm10,%xmm13 6616 69,15,88,239, //addps %xmm15,%xmm13 6617 69,15,40,249, //movaps %xmm9,%xmm15 6618 69,15,93,253, //minps %xmm13,%xmm15 6619 184,4,231,140,59, //mov $0x3b8ce704,%eax 6620 102,68,15,110,232, //movd %eax,%xmm13 6621 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13 6622 65,15,194,197,1, //cmpltps %xmm13,%xmm0 6623 68,15,84,240, //andps %xmm0,%xmm14 6624 65,15,85,199, //andnps %xmm15,%xmm0 6625 65,15,86,198, //orps %xmm14,%xmm0 6626 68,15,82,241, //rsqrtps %xmm1,%xmm14 6627 69,15,83,254, //rcpps %xmm14,%xmm15 6628 69,15,82,246, //rsqrtps %xmm14,%xmm14 6629 69,15,89,251, //mulps %xmm11,%xmm15 6630 69,15,88,252, //addps %xmm12,%xmm15 6631 69,15,89,242, //mulps %xmm10,%xmm14 6632 69,15,88,247, //addps %xmm15,%xmm14 6633 69,15,40,249, //movaps %xmm9,%xmm15 6634 69,15,93,254, //minps %xmm14,%xmm15 6635 69,15,40,240, //movaps %xmm8,%xmm14 6636 68,15,89,241, //mulps %xmm1,%xmm14 6637 65,15,194,205,1, //cmpltps %xmm13,%xmm1 6638 68,15,84,241, //andps %xmm1,%xmm14 6639 65,15,85,207, //andnps %xmm15,%xmm1 6640 65,15,86,206, //orps %xmm14,%xmm1 6641 68,15,82,242, //rsqrtps %xmm2,%xmm14 6642 69,15,83,254, //rcpps %xmm14,%xmm15 6643 69,15,89,251, //mulps %xmm11,%xmm15 6644 69,15,88,252, //addps %xmm12,%xmm15 6645 69,15,82,222, //rsqrtps %xmm14,%xmm11 6646 69,15,89,218, //mulps %xmm10,%xmm11 6647 69,15,88,223, //addps %xmm15,%xmm11 6648 69,15,93,203, //minps %xmm11,%xmm9 6649 68,15,89,194, //mulps %xmm2,%xmm8 6650 65,15,194,213,1, //cmpltps %xmm13,%xmm2 6651 68,15,84,194, //andps %xmm2,%xmm8 6652 65,15,85,209, //andnps %xmm9,%xmm2 6653 65,15,86,208, //orps %xmm8,%xmm2 6654 72,173, //lods %ds:(%rsi),%rax 6655 255,224, //jmpq *%rax 6656 }; 6657 6658 CODE const uint8_t sk_scale_1_float_sse2[] = { 6659 72,173, //lods %ds:(%rsi),%rax 6660 243,68,15,16,0, //movss (%rax),%xmm8 6661 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 6662 65,15,89,192, //mulps %xmm8,%xmm0 6663 65,15,89,200, //mulps %xmm8,%xmm1 6664 65,15,89,208, //mulps %xmm8,%xmm2 6665 65,15,89,216, //mulps %xmm8,%xmm3 6666 72,173, //lods %ds:(%rsi),%rax 6667 255,224, //jmpq *%rax 6668 }; 6669 6670 CODE const uint8_t sk_scale_u8_sse2[] = { 6671 72,173, //lods %ds:(%rsi),%rax 6672 72,139,0, //mov (%rax),%rax 6673 102,68,15,110,4,56, //movd (%rax,%rdi,1),%xmm8 6674 102,69,15,239,201, //pxor %xmm9,%xmm9 6675 102,69,15,96,193, //punpcklbw %xmm9,%xmm8 6676 102,69,15,97,193, //punpcklwd %xmm9,%xmm8 6677 69,15,91,192, //cvtdq2ps %xmm8,%xmm8 6678 184,129,128,128,59, //mov $0x3b808081,%eax 6679 102,68,15,110,200, //movd %eax,%xmm9 6680 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 6681 69,15,89,200, //mulps %xmm8,%xmm9 6682 65,15,89,193, //mulps %xmm9,%xmm0 6683 65,15,89,201, //mulps %xmm9,%xmm1 6684 65,15,89,209, //mulps %xmm9,%xmm2 6685 65,15,89,217, //mulps %xmm9,%xmm3 6686 72,173, //lods %ds:(%rsi),%rax 6687 255,224, //jmpq *%rax 6688 }; 6689 6690 CODE const uint8_t sk_lerp_1_float_sse2[] = { 6691 72,173, //lods %ds:(%rsi),%rax 6692 243,68,15,16,0, //movss (%rax),%xmm8 6693 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 6694 15,92,196, //subps %xmm4,%xmm0 6695 65,15,89,192, //mulps %xmm8,%xmm0 6696 15,88,196, //addps %xmm4,%xmm0 6697 15,92,205, //subps %xmm5,%xmm1 6698 65,15,89,200, //mulps %xmm8,%xmm1 6699 15,88,205, //addps %xmm5,%xmm1 6700 15,92,214, //subps %xmm6,%xmm2 6701 65,15,89,208, //mulps %xmm8,%xmm2 6702 15,88,214, //addps %xmm6,%xmm2 6703 15,92,223, //subps %xmm7,%xmm3 6704 65,15,89,216, //mulps %xmm8,%xmm3 6705 15,88,223, //addps %xmm7,%xmm3 6706 72,173, //lods %ds:(%rsi),%rax 6707 255,224, //jmpq *%rax 6708 }; 6709 6710 CODE const uint8_t sk_lerp_u8_sse2[] = { 6711 72,173, //lods %ds:(%rsi),%rax 6712 72,139,0, //mov (%rax),%rax 6713 102,68,15,110,4,56, //movd (%rax,%rdi,1),%xmm8 6714 102,69,15,239,201, //pxor %xmm9,%xmm9 6715 102,69,15,96,193, //punpcklbw %xmm9,%xmm8 6716 102,69,15,97,193, //punpcklwd %xmm9,%xmm8 6717 69,15,91,192, //cvtdq2ps %xmm8,%xmm8 6718 184,129,128,128,59, //mov $0x3b808081,%eax 6719 102,68,15,110,200, //movd %eax,%xmm9 6720 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 6721 69,15,89,200, //mulps %xmm8,%xmm9 6722 15,92,196, //subps %xmm4,%xmm0 6723 65,15,89,193, //mulps %xmm9,%xmm0 6724 15,88,196, //addps %xmm4,%xmm0 6725 15,92,205, //subps %xmm5,%xmm1 6726 65,15,89,201, //mulps %xmm9,%xmm1 6727 15,88,205, //addps %xmm5,%xmm1 6728 15,92,214, //subps %xmm6,%xmm2 6729 65,15,89,209, //mulps %xmm9,%xmm2 6730 15,88,214, //addps %xmm6,%xmm2 6731 15,92,223, //subps %xmm7,%xmm3 6732 65,15,89,217, //mulps %xmm9,%xmm3 6733 15,88,223, //addps %xmm7,%xmm3 6734 72,173, //lods %ds:(%rsi),%rax 6735 255,224, //jmpq *%rax 6736 }; 6737 6738 CODE const uint8_t sk_lerp_565_sse2[] = { 6739 72,173, //lods %ds:(%rsi),%rax 6740 72,139,0, //mov (%rax),%rax 6741 243,68,15,126,4,120, //movq (%rax,%rdi,2),%xmm8 6742 102,15,239,219, //pxor %xmm3,%xmm3 6743 102,68,15,97,195, //punpcklwd %xmm3,%xmm8 6744 184,0,248,0,0, //mov $0xf800,%eax 6745 102,15,110,216, //movd %eax,%xmm3 6746 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3 6747 102,65,15,219,216, //pand %xmm8,%xmm3 6748 68,15,91,203, //cvtdq2ps %xmm3,%xmm9 6749 184,8,33,132,55, //mov $0x37842108,%eax 6750 102,68,15,110,208, //movd %eax,%xmm10 6751 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 6752 69,15,89,209, //mulps %xmm9,%xmm10 6753 184,224,7,0,0, //mov $0x7e0,%eax 6754 102,15,110,216, //movd %eax,%xmm3 6755 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3 6756 102,65,15,219,216, //pand %xmm8,%xmm3 6757 68,15,91,203, //cvtdq2ps %xmm3,%xmm9 6758 184,33,8,2,58, //mov $0x3a020821,%eax 6759 102,68,15,110,216, //movd %eax,%xmm11 6760 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 6761 69,15,89,217, //mulps %xmm9,%xmm11 6762 184,31,0,0,0, //mov $0x1f,%eax 6763 102,15,110,216, //movd %eax,%xmm3 6764 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3 6765 102,65,15,219,216, //pand %xmm8,%xmm3 6766 68,15,91,195, //cvtdq2ps %xmm3,%xmm8 6767 184,8,33,4,61, //mov $0x3d042108,%eax 6768 102,15,110,216, //movd %eax,%xmm3 6769 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 6770 65,15,89,216, //mulps %xmm8,%xmm3 6771 15,92,196, //subps %xmm4,%xmm0 6772 65,15,89,194, //mulps %xmm10,%xmm0 6773 15,88,196, //addps %xmm4,%xmm0 6774 15,92,205, //subps %xmm5,%xmm1 6775 65,15,89,203, //mulps %xmm11,%xmm1 6776 15,88,205, //addps %xmm5,%xmm1 6777 15,92,214, //subps %xmm6,%xmm2 6778 15,89,211, //mulps %xmm3,%xmm2 6779 15,88,214, //addps %xmm6,%xmm2 6780 184,0,0,128,63, //mov $0x3f800000,%eax 6781 102,15,110,216, //movd %eax,%xmm3 6782 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 6783 72,173, //lods %ds:(%rsi),%rax 6784 255,224, //jmpq *%rax 6785 }; 6786 6787 CODE const uint8_t sk_load_tables_sse2[] = { 6788 72,173, //lods %ds:(%rsi),%rax 6789 72,139,8, //mov (%rax),%rcx 6790 76,139,64,8, //mov 0x8(%rax),%r8 6791 243,68,15,111,4,185, //movdqu (%rcx,%rdi,4),%xmm8 6792 185,255,0,0,0, //mov $0xff,%ecx 6793 102,15,110,193, //movd %ecx,%xmm0 6794 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0 6795 102,69,15,111,200, //movdqa %xmm8,%xmm9 6796 102,65,15,114,209,8, //psrld $0x8,%xmm9 6797 102,68,15,219,200, //pand %xmm0,%xmm9 6798 102,69,15,111,208, //movdqa %xmm8,%xmm10 6799 102,65,15,114,210,16, //psrld $0x10,%xmm10 6800 102,68,15,219,208, //pand %xmm0,%xmm10 6801 102,65,15,219,192, //pand %xmm8,%xmm0 6802 102,15,112,216,78, //pshufd $0x4e,%xmm0,%xmm3 6803 102,72,15,126,217, //movq %xmm3,%rcx 6804 65,137,201, //mov %ecx,%r9d 6805 72,193,233,32, //shr $0x20,%rcx 6806 102,73,15,126,194, //movq %xmm0,%r10 6807 69,137,211, //mov %r10d,%r11d 6808 73,193,234,32, //shr $0x20,%r10 6809 243,67,15,16,28,144, //movss (%r8,%r10,4),%xmm3 6810 243,65,15,16,4,136, //movss (%r8,%rcx,4),%xmm0 6811 15,20,216, //unpcklps %xmm0,%xmm3 6812 243,67,15,16,4,152, //movss (%r8,%r11,4),%xmm0 6813 243,67,15,16,12,136, //movss (%r8,%r9,4),%xmm1 6814 15,20,193, //unpcklps %xmm1,%xmm0 6815 15,20,195, //unpcklps %xmm3,%xmm0 6816 76,139,64,16, //mov 0x10(%rax),%r8 6817 102,65,15,112,201,78, //pshufd $0x4e,%xmm9,%xmm1 6818 102,73,15,126,202, //movq %xmm1,%r10 6819 77,137,209, //mov %r10,%r9 6820 73,193,233,32, //shr $0x20,%r9 6821 102,76,15,126,201, //movq %xmm9,%rcx 6822 65,137,203, //mov %ecx,%r11d 6823 65,129,227,255,255,255,0, //and $0xffffff,%r11d 6824 72,193,233,30, //shr $0x1e,%rcx 6825 65,129,226,255,255,255,0, //and $0xffffff,%r10d 6826 243,65,15,16,28,8, //movss (%r8,%rcx,1),%xmm3 6827 243,67,15,16,12,136, //movss (%r8,%r9,4),%xmm1 6828 15,20,217, //unpcklps %xmm1,%xmm3 6829 243,67,15,16,12,152, //movss (%r8,%r11,4),%xmm1 6830 243,67,15,16,20,144, //movss (%r8,%r10,4),%xmm2 6831 15,20,202, //unpcklps %xmm2,%xmm1 6832 15,20,203, //unpcklps %xmm3,%xmm1 6833 76,139,72,24, //mov 0x18(%rax),%r9 6834 102,65,15,112,210,78, //pshufd $0x4e,%xmm10,%xmm2 6835 102,72,15,126,209, //movq %xmm2,%rcx 6836 68,15,183,193, //movzwl %cx,%r8d 6837 72,193,233,32, //shr $0x20,%rcx 6838 102,76,15,126,208, //movq %xmm10,%rax 6839 68,15,183,208, //movzwl %ax,%r10d 6840 72,193,232,30, //shr $0x1e,%rax 6841 243,69,15,16,12,1, //movss (%r9,%rax,1),%xmm9 6842 243,65,15,16,20,137, //movss (%r9,%rcx,4),%xmm2 6843 68,15,20,202, //unpcklps %xmm2,%xmm9 6844 243,67,15,16,20,145, //movss (%r9,%r10,4),%xmm2 6845 243,67,15,16,28,129, //movss (%r9,%r8,4),%xmm3 6846 15,20,211, //unpcklps %xmm3,%xmm2 6847 65,15,20,209, //unpcklps %xmm9,%xmm2 6848 102,65,15,114,208,24, //psrld $0x18,%xmm8 6849 69,15,91,192, //cvtdq2ps %xmm8,%xmm8 6850 184,129,128,128,59, //mov $0x3b808081,%eax 6851 102,15,110,216, //movd %eax,%xmm3 6852 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 6853 65,15,89,216, //mulps %xmm8,%xmm3 6854 72,173, //lods %ds:(%rsi),%rax 6855 255,224, //jmpq *%rax 6856 }; 6857 6858 CODE const uint8_t sk_load_a8_sse2[] = { 6859 72,173, //lods %ds:(%rsi),%rax 6860 72,139,0, //mov (%rax),%rax 6861 102,15,110,4,56, //movd (%rax,%rdi,1),%xmm0 6862 102,15,239,201, //pxor %xmm1,%xmm1 6863 102,15,96,193, //punpcklbw %xmm1,%xmm0 6864 102,15,97,193, //punpcklwd %xmm1,%xmm0 6865 15,91,192, //cvtdq2ps %xmm0,%xmm0 6866 184,129,128,128,59, //mov $0x3b808081,%eax 6867 102,15,110,216, //movd %eax,%xmm3 6868 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 6869 15,89,216, //mulps %xmm0,%xmm3 6870 72,173, //lods %ds:(%rsi),%rax 6871 15,87,192, //xorps %xmm0,%xmm0 6872 102,15,239,201, //pxor %xmm1,%xmm1 6873 15,87,210, //xorps %xmm2,%xmm2 6874 255,224, //jmpq *%rax 6875 }; 6876 6877 CODE const uint8_t sk_store_a8_sse2[] = { 6878 72,173, //lods %ds:(%rsi),%rax 6879 72,139,0, //mov (%rax),%rax 6880 185,0,0,127,67, //mov $0x437f0000,%ecx 6881 102,68,15,110,193, //movd %ecx,%xmm8 6882 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 6883 68,15,89,195, //mulps %xmm3,%xmm8 6884 102,69,15,91,192, //cvtps2dq %xmm8,%xmm8 6885 102,65,15,114,240,16, //pslld $0x10,%xmm8 6886 102,65,15,114,224,16, //psrad $0x10,%xmm8 6887 102,69,15,107,192, //packssdw %xmm8,%xmm8 6888 102,69,15,103,192, //packuswb %xmm8,%xmm8 6889 102,68,15,126,4,56, //movd %xmm8,(%rax,%rdi,1) 6890 72,173, //lods %ds:(%rsi),%rax 6891 255,224, //jmpq *%rax 6892 }; 6893 6894 CODE const uint8_t sk_load_565_sse2[] = { 6895 72,173, //lods %ds:(%rsi),%rax 6896 72,139,0, //mov (%rax),%rax 6897 243,15,126,20,120, //movq (%rax,%rdi,2),%xmm2 6898 102,15,239,192, //pxor %xmm0,%xmm0 6899 102,15,97,208, //punpcklwd %xmm0,%xmm2 6900 184,0,248,0,0, //mov $0xf800,%eax 6901 102,15,110,192, //movd %eax,%xmm0 6902 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0 6903 102,15,219,194, //pand %xmm2,%xmm0 6904 15,91,200, //cvtdq2ps %xmm0,%xmm1 6905 184,8,33,132,55, //mov $0x37842108,%eax 6906 102,15,110,192, //movd %eax,%xmm0 6907 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 6908 15,89,193, //mulps %xmm1,%xmm0 6909 184,224,7,0,0, //mov $0x7e0,%eax 6910 102,15,110,200, //movd %eax,%xmm1 6911 102,15,112,201,0, //pshufd $0x0,%xmm1,%xmm1 6912 102,15,219,202, //pand %xmm2,%xmm1 6913 15,91,217, //cvtdq2ps %xmm1,%xmm3 6914 184,33,8,2,58, //mov $0x3a020821,%eax 6915 102,15,110,200, //movd %eax,%xmm1 6916 15,198,201,0, //shufps $0x0,%xmm1,%xmm1 6917 15,89,203, //mulps %xmm3,%xmm1 6918 184,31,0,0,0, //mov $0x1f,%eax 6919 102,15,110,216, //movd %eax,%xmm3 6920 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3 6921 102,15,219,218, //pand %xmm2,%xmm3 6922 15,91,219, //cvtdq2ps %xmm3,%xmm3 6923 184,8,33,4,61, //mov $0x3d042108,%eax 6924 102,15,110,208, //movd %eax,%xmm2 6925 15,198,210,0, //shufps $0x0,%xmm2,%xmm2 6926 15,89,211, //mulps %xmm3,%xmm2 6927 184,0,0,128,63, //mov $0x3f800000,%eax 6928 102,15,110,216, //movd %eax,%xmm3 6929 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 6930 72,173, //lods %ds:(%rsi),%rax 6931 255,224, //jmpq *%rax 6932 }; 6933 6934 CODE const uint8_t sk_store_565_sse2[] = { 6935 72,173, //lods %ds:(%rsi),%rax 6936 72,139,0, //mov (%rax),%rax 6937 185,0,0,248,65, //mov $0x41f80000,%ecx 6938 102,68,15,110,193, //movd %ecx,%xmm8 6939 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 6940 69,15,40,200, //movaps %xmm8,%xmm9 6941 68,15,89,200, //mulps %xmm0,%xmm9 6942 102,69,15,91,201, //cvtps2dq %xmm9,%xmm9 6943 102,65,15,114,241,11, //pslld $0xb,%xmm9 6944 185,0,0,124,66, //mov $0x427c0000,%ecx 6945 102,68,15,110,209, //movd %ecx,%xmm10 6946 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 6947 68,15,89,209, //mulps %xmm1,%xmm10 6948 102,69,15,91,210, //cvtps2dq %xmm10,%xmm10 6949 102,65,15,114,242,5, //pslld $0x5,%xmm10 6950 102,69,15,235,209, //por %xmm9,%xmm10 6951 68,15,89,194, //mulps %xmm2,%xmm8 6952 102,69,15,91,192, //cvtps2dq %xmm8,%xmm8 6953 102,69,15,86,194, //orpd %xmm10,%xmm8 6954 102,65,15,114,240,16, //pslld $0x10,%xmm8 6955 102,65,15,114,224,16, //psrad $0x10,%xmm8 6956 102,69,15,107,192, //packssdw %xmm8,%xmm8 6957 102,68,15,214,4,120, //movq %xmm8,(%rax,%rdi,2) 6958 72,173, //lods %ds:(%rsi),%rax 6959 255,224, //jmpq *%rax 6960 }; 6961 6962 CODE const uint8_t sk_load_8888_sse2[] = { 6963 72,173, //lods %ds:(%rsi),%rax 6964 72,139,0, //mov (%rax),%rax 6965 243,15,111,28,184, //movdqu (%rax,%rdi,4),%xmm3 6966 184,255,0,0,0, //mov $0xff,%eax 6967 102,15,110,192, //movd %eax,%xmm0 6968 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0 6969 102,15,111,203, //movdqa %xmm3,%xmm1 6970 102,15,114,209,8, //psrld $0x8,%xmm1 6971 102,15,219,200, //pand %xmm0,%xmm1 6972 102,15,111,211, //movdqa %xmm3,%xmm2 6973 102,15,114,210,16, //psrld $0x10,%xmm2 6974 102,15,219,208, //pand %xmm0,%xmm2 6975 102,15,219,195, //pand %xmm3,%xmm0 6976 15,91,192, //cvtdq2ps %xmm0,%xmm0 6977 184,129,128,128,59, //mov $0x3b808081,%eax 6978 102,68,15,110,192, //movd %eax,%xmm8 6979 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 6980 65,15,89,192, //mulps %xmm8,%xmm0 6981 15,91,201, //cvtdq2ps %xmm1,%xmm1 6982 65,15,89,200, //mulps %xmm8,%xmm1 6983 15,91,210, //cvtdq2ps %xmm2,%xmm2 6984 65,15,89,208, //mulps %xmm8,%xmm2 6985 102,15,114,211,24, //psrld $0x18,%xmm3 6986 15,91,219, //cvtdq2ps %xmm3,%xmm3 6987 65,15,89,216, //mulps %xmm8,%xmm3 6988 72,173, //lods %ds:(%rsi),%rax 6989 255,224, //jmpq *%rax 6990 }; 6991 6992 CODE const uint8_t sk_store_8888_sse2[] = { 6993 72,173, //lods %ds:(%rsi),%rax 6994 72,139,0, //mov (%rax),%rax 6995 185,0,0,127,67, //mov $0x437f0000,%ecx 6996 102,68,15,110,193, //movd %ecx,%xmm8 6997 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 6998 69,15,40,200, //movaps %xmm8,%xmm9 6999 68,15,89,200, //mulps %xmm0,%xmm9 7000 102,69,15,91,201, //cvtps2dq %xmm9,%xmm9 7001 69,15,40,208, //movaps %xmm8,%xmm10 7002 68,15,89,209, //mulps %xmm1,%xmm10 7003 102,69,15,91,210, //cvtps2dq %xmm10,%xmm10 7004 102,65,15,114,242,8, //pslld $0x8,%xmm10 7005 102,69,15,235,209, //por %xmm9,%xmm10 7006 69,15,40,200, //movaps %xmm8,%xmm9 7007 68,15,89,202, //mulps %xmm2,%xmm9 7008 102,69,15,91,201, //cvtps2dq %xmm9,%xmm9 7009 102,65,15,114,241,16, //pslld $0x10,%xmm9 7010 68,15,89,195, //mulps %xmm3,%xmm8 7011 102,69,15,91,192, //cvtps2dq %xmm8,%xmm8 7012 102,65,15,114,240,24, //pslld $0x18,%xmm8 7013 102,69,15,235,193, //por %xmm9,%xmm8 7014 102,69,15,235,194, //por %xmm10,%xmm8 7015 243,68,15,127,4,184, //movdqu %xmm8,(%rax,%rdi,4) 7016 72,173, //lods %ds:(%rsi),%rax 7017 255,224, //jmpq *%rax 7018 }; 7019 7020 CODE const uint8_t sk_load_f16_sse2[] = { 7021 72,173, //lods %ds:(%rsi),%rax 7022 72,139,0, //mov (%rax),%rax 7023 243,15,111,4,248, //movdqu (%rax,%rdi,8),%xmm0 7024 243,15,111,76,248,16, //movdqu 0x10(%rax,%rdi,8),%xmm1 7025 102,15,111,208, //movdqa %xmm0,%xmm2 7026 102,15,97,209, //punpcklwd %xmm1,%xmm2 7027 102,15,105,193, //punpckhwd %xmm1,%xmm0 7028 102,68,15,111,194, //movdqa %xmm2,%xmm8 7029 102,68,15,97,192, //punpcklwd %xmm0,%xmm8 7030 102,15,105,208, //punpckhwd %xmm0,%xmm2 7031 184,0,4,0,4, //mov $0x4000400,%eax 7032 102,15,110,192, //movd %eax,%xmm0 7033 102,15,112,216,0, //pshufd $0x0,%xmm0,%xmm3 7034 102,15,111,203, //movdqa %xmm3,%xmm1 7035 102,65,15,101,200, //pcmpgtw %xmm8,%xmm1 7036 102,65,15,223,200, //pandn %xmm8,%xmm1 7037 102,15,101,218, //pcmpgtw %xmm2,%xmm3 7038 102,15,223,218, //pandn %xmm2,%xmm3 7039 102,69,15,239,192, //pxor %xmm8,%xmm8 7040 102,15,111,193, //movdqa %xmm1,%xmm0 7041 102,65,15,97,192, //punpcklwd %xmm8,%xmm0 7042 102,15,114,240,13, //pslld $0xd,%xmm0 7043 184,0,0,128,119, //mov $0x77800000,%eax 7044 102,15,110,208, //movd %eax,%xmm2 7045 102,68,15,112,202,0, //pshufd $0x0,%xmm2,%xmm9 7046 65,15,89,193, //mulps %xmm9,%xmm0 7047 102,65,15,105,200, //punpckhwd %xmm8,%xmm1 7048 102,15,114,241,13, //pslld $0xd,%xmm1 7049 65,15,89,201, //mulps %xmm9,%xmm1 7050 102,15,111,211, //movdqa %xmm3,%xmm2 7051 102,65,15,97,208, //punpcklwd %xmm8,%xmm2 7052 102,15,114,242,13, //pslld $0xd,%xmm2 7053 65,15,89,209, //mulps %xmm9,%xmm2 7054 102,65,15,105,216, //punpckhwd %xmm8,%xmm3 7055 102,15,114,243,13, //pslld $0xd,%xmm3 7056 65,15,89,217, //mulps %xmm9,%xmm3 7057 72,173, //lods %ds:(%rsi),%rax 7058 255,224, //jmpq *%rax 7059 }; 7060 7061 CODE const uint8_t sk_store_f16_sse2[] = { 7062 72,173, //lods %ds:(%rsi),%rax 7063 72,139,0, //mov (%rax),%rax 7064 185,0,0,128,7, //mov $0x7800000,%ecx 7065 102,68,15,110,193, //movd %ecx,%xmm8 7066 102,69,15,112,192,0, //pshufd $0x0,%xmm8,%xmm8 7067 102,69,15,111,200, //movdqa %xmm8,%xmm9 7068 68,15,89,200, //mulps %xmm0,%xmm9 7069 102,65,15,114,209,13, //psrld $0xd,%xmm9 7070 102,69,15,111,208, //movdqa %xmm8,%xmm10 7071 68,15,89,209, //mulps %xmm1,%xmm10 7072 102,65,15,114,210,13, //psrld $0xd,%xmm10 7073 102,69,15,111,216, //movdqa %xmm8,%xmm11 7074 68,15,89,218, //mulps %xmm2,%xmm11 7075 102,65,15,114,211,13, //psrld $0xd,%xmm11 7076 68,15,89,195, //mulps %xmm3,%xmm8 7077 102,65,15,114,208,13, //psrld $0xd,%xmm8 7078 102,65,15,115,250,2, //pslldq $0x2,%xmm10 7079 102,69,15,235,209, //por %xmm9,%xmm10 7080 102,65,15,115,248,2, //pslldq $0x2,%xmm8 7081 102,69,15,235,195, //por %xmm11,%xmm8 7082 102,69,15,111,202, //movdqa %xmm10,%xmm9 7083 102,69,15,98,200, //punpckldq %xmm8,%xmm9 7084 243,68,15,127,12,248, //movdqu %xmm9,(%rax,%rdi,8) 7085 102,69,15,106,208, //punpckhdq %xmm8,%xmm10 7086 243,68,15,127,84,248,16, //movdqu %xmm10,0x10(%rax,%rdi,8) 7087 72,173, //lods %ds:(%rsi),%rax 7088 255,224, //jmpq *%rax 7089 }; 7090 7091 CODE const uint8_t sk_store_f32_sse2[] = { 7092 72,173, //lods %ds:(%rsi),%rax 7093 72,139,0, //mov (%rax),%rax 7094 72,137,249, //mov %rdi,%rcx 7095 72,193,225,4, //shl $0x4,%rcx 7096 68,15,40,192, //movaps %xmm0,%xmm8 7097 68,15,40,200, //movaps %xmm0,%xmm9 7098 68,15,20,201, //unpcklps %xmm1,%xmm9 7099 68,15,40,210, //movaps %xmm2,%xmm10 7100 68,15,40,218, //movaps %xmm2,%xmm11 7101 68,15,20,219, //unpcklps %xmm3,%xmm11 7102 68,15,21,193, //unpckhps %xmm1,%xmm8 7103 68,15,21,211, //unpckhps %xmm3,%xmm10 7104 69,15,40,225, //movaps %xmm9,%xmm12 7105 102,69,15,20,227, //unpcklpd %xmm11,%xmm12 7106 69,15,18,217, //movhlps %xmm9,%xmm11 7107 69,15,40,200, //movaps %xmm8,%xmm9 7108 102,69,15,20,202, //unpcklpd %xmm10,%xmm9 7109 69,15,18,208, //movhlps %xmm8,%xmm10 7110 102,68,15,17,36,8, //movupd %xmm12,(%rax,%rcx,1) 7111 68,15,17,92,8,16, //movups %xmm11,0x10(%rax,%rcx,1) 7112 102,68,15,17,76,8,32, //movupd %xmm9,0x20(%rax,%rcx,1) 7113 68,15,17,84,8,48, //movups %xmm10,0x30(%rax,%rcx,1) 7114 72,173, //lods %ds:(%rsi),%rax 7115 255,224, //jmpq *%rax 7116 }; 7117 7118 CODE const uint8_t sk_clamp_x_sse2[] = { 7119 72,173, //lods %ds:(%rsi),%rax 7120 69,15,87,192, //xorps %xmm8,%xmm8 7121 68,15,95,192, //maxps %xmm0,%xmm8 7122 243,68,15,16,8, //movss (%rax),%xmm9 7123 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 7124 102,15,118,192, //pcmpeqd %xmm0,%xmm0 7125 102,65,15,254,193, //paddd %xmm9,%xmm0 7126 68,15,93,192, //minps %xmm0,%xmm8 7127 72,173, //lods %ds:(%rsi),%rax 7128 65,15,40,192, //movaps %xmm8,%xmm0 7129 255,224, //jmpq *%rax 7130 }; 7131 7132 CODE const uint8_t sk_clamp_y_sse2[] = { 7133 72,173, //lods %ds:(%rsi),%rax 7134 69,15,87,192, //xorps %xmm8,%xmm8 7135 68,15,95,193, //maxps %xmm1,%xmm8 7136 243,68,15,16,8, //movss (%rax),%xmm9 7137 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 7138 102,15,118,201, //pcmpeqd %xmm1,%xmm1 7139 102,65,15,254,201, //paddd %xmm9,%xmm1 7140 68,15,93,193, //minps %xmm1,%xmm8 7141 72,173, //lods %ds:(%rsi),%rax 7142 65,15,40,200, //movaps %xmm8,%xmm1 7143 255,224, //jmpq *%rax 7144 }; 7145 7146 CODE const uint8_t sk_repeat_x_sse2[] = { 7147 72,173, //lods %ds:(%rsi),%rax 7148 243,68,15,16,0, //movss (%rax),%xmm8 7149 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 7150 68,15,40,200, //movaps %xmm0,%xmm9 7151 69,15,94,200, //divps %xmm8,%xmm9 7152 243,69,15,91,209, //cvttps2dq %xmm9,%xmm10 7153 69,15,91,210, //cvtdq2ps %xmm10,%xmm10 7154 69,15,194,202,1, //cmpltps %xmm10,%xmm9 7155 184,0,0,128,63, //mov $0x3f800000,%eax 7156 102,68,15,110,216, //movd %eax,%xmm11 7157 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 7158 69,15,84,217, //andps %xmm9,%xmm11 7159 69,15,92,211, //subps %xmm11,%xmm10 7160 69,15,89,208, //mulps %xmm8,%xmm10 7161 65,15,92,194, //subps %xmm10,%xmm0 7162 102,69,15,118,201, //pcmpeqd %xmm9,%xmm9 7163 102,69,15,254,200, //paddd %xmm8,%xmm9 7164 65,15,93,193, //minps %xmm9,%xmm0 7165 72,173, //lods %ds:(%rsi),%rax 7166 255,224, //jmpq *%rax 7167 }; 7168 7169 CODE const uint8_t sk_repeat_y_sse2[] = { 7170 72,173, //lods %ds:(%rsi),%rax 7171 243,68,15,16,0, //movss (%rax),%xmm8 7172 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 7173 68,15,40,201, //movaps %xmm1,%xmm9 7174 69,15,94,200, //divps %xmm8,%xmm9 7175 243,69,15,91,209, //cvttps2dq %xmm9,%xmm10 7176 69,15,91,210, //cvtdq2ps %xmm10,%xmm10 7177 69,15,194,202,1, //cmpltps %xmm10,%xmm9 7178 184,0,0,128,63, //mov $0x3f800000,%eax 7179 102,68,15,110,216, //movd %eax,%xmm11 7180 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 7181 69,15,84,217, //andps %xmm9,%xmm11 7182 69,15,92,211, //subps %xmm11,%xmm10 7183 69,15,89,208, //mulps %xmm8,%xmm10 7184 65,15,92,202, //subps %xmm10,%xmm1 7185 102,69,15,118,201, //pcmpeqd %xmm9,%xmm9 7186 102,69,15,254,200, //paddd %xmm8,%xmm9 7187 65,15,93,201, //minps %xmm9,%xmm1 7188 72,173, //lods %ds:(%rsi),%rax 7189 255,224, //jmpq *%rax 7190 }; 7191 7192 CODE const uint8_t sk_mirror_x_sse2[] = { 7193 72,173, //lods %ds:(%rsi),%rax 7194 243,68,15,16,8, //movss (%rax),%xmm9 7195 69,15,40,193, //movaps %xmm9,%xmm8 7196 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 7197 65,15,92,192, //subps %xmm8,%xmm0 7198 243,69,15,88,201, //addss %xmm9,%xmm9 7199 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 7200 68,15,40,208, //movaps %xmm0,%xmm10 7201 69,15,94,209, //divps %xmm9,%xmm10 7202 243,69,15,91,218, //cvttps2dq %xmm10,%xmm11 7203 69,15,91,219, //cvtdq2ps %xmm11,%xmm11 7204 69,15,194,211,1, //cmpltps %xmm11,%xmm10 7205 184,0,0,128,63, //mov $0x3f800000,%eax 7206 102,68,15,110,224, //movd %eax,%xmm12 7207 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 7208 69,15,84,226, //andps %xmm10,%xmm12 7209 69,15,87,210, //xorps %xmm10,%xmm10 7210 69,15,92,220, //subps %xmm12,%xmm11 7211 69,15,89,217, //mulps %xmm9,%xmm11 7212 65,15,92,195, //subps %xmm11,%xmm0 7213 65,15,92,192, //subps %xmm8,%xmm0 7214 68,15,92,208, //subps %xmm0,%xmm10 7215 65,15,84,194, //andps %xmm10,%xmm0 7216 102,69,15,118,201, //pcmpeqd %xmm9,%xmm9 7217 102,69,15,254,200, //paddd %xmm8,%xmm9 7218 65,15,93,193, //minps %xmm9,%xmm0 7219 72,173, //lods %ds:(%rsi),%rax 7220 255,224, //jmpq *%rax 7221 }; 7222 7223 CODE const uint8_t sk_mirror_y_sse2[] = { 7224 72,173, //lods %ds:(%rsi),%rax 7225 243,68,15,16,8, //movss (%rax),%xmm9 7226 69,15,40,193, //movaps %xmm9,%xmm8 7227 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 7228 65,15,92,200, //subps %xmm8,%xmm1 7229 243,69,15,88,201, //addss %xmm9,%xmm9 7230 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 7231 68,15,40,209, //movaps %xmm1,%xmm10 7232 69,15,94,209, //divps %xmm9,%xmm10 7233 243,69,15,91,218, //cvttps2dq %xmm10,%xmm11 7234 69,15,91,219, //cvtdq2ps %xmm11,%xmm11 7235 69,15,194,211,1, //cmpltps %xmm11,%xmm10 7236 184,0,0,128,63, //mov $0x3f800000,%eax 7237 102,68,15,110,224, //movd %eax,%xmm12 7238 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 7239 69,15,84,226, //andps %xmm10,%xmm12 7240 69,15,87,210, //xorps %xmm10,%xmm10 7241 69,15,92,220, //subps %xmm12,%xmm11 7242 69,15,89,217, //mulps %xmm9,%xmm11 7243 65,15,92,203, //subps %xmm11,%xmm1 7244 65,15,92,200, //subps %xmm8,%xmm1 7245 68,15,92,209, //subps %xmm1,%xmm10 7246 65,15,84,202, //andps %xmm10,%xmm1 7247 102,69,15,118,201, //pcmpeqd %xmm9,%xmm9 7248 102,69,15,254,200, //paddd %xmm8,%xmm9 7249 65,15,93,201, //minps %xmm9,%xmm1 7250 72,173, //lods %ds:(%rsi),%rax 7251 255,224, //jmpq *%rax 7252 }; 7253 7254 CODE const uint8_t sk_luminance_to_alpha_sse2[] = { 7255 184,208,179,89,62, //mov $0x3e59b3d0,%eax 7256 102,15,110,216, //movd %eax,%xmm3 7257 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 7258 15,89,216, //mulps %xmm0,%xmm3 7259 184,89,23,55,63, //mov $0x3f371759,%eax 7260 102,15,110,192, //movd %eax,%xmm0 7261 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 7262 15,89,193, //mulps %xmm1,%xmm0 7263 15,88,195, //addps %xmm3,%xmm0 7264 184,152,221,147,61, //mov $0x3d93dd98,%eax 7265 102,15,110,216, //movd %eax,%xmm3 7266 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 7267 15,89,218, //mulps %xmm2,%xmm3 7268 15,88,216, //addps %xmm0,%xmm3 7269 72,173, //lods %ds:(%rsi),%rax 7270 15,87,192, //xorps %xmm0,%xmm0 7271 15,87,201, //xorps %xmm1,%xmm1 7272 15,87,210, //xorps %xmm2,%xmm2 7273 255,224, //jmpq *%rax 7274 }; 7275 7276 CODE const uint8_t sk_matrix_2x3_sse2[] = { 7277 68,15,40,201, //movaps %xmm1,%xmm9 7278 68,15,40,192, //movaps %xmm0,%xmm8 7279 72,173, //lods %ds:(%rsi),%rax 7280 243,15,16,0, //movss (%rax),%xmm0 7281 243,15,16,72,4, //movss 0x4(%rax),%xmm1 7282 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 7283 243,68,15,16,80,8, //movss 0x8(%rax),%xmm10 7284 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 7285 243,68,15,16,88,16, //movss 0x10(%rax),%xmm11 7286 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 7287 69,15,89,209, //mulps %xmm9,%xmm10 7288 69,15,88,211, //addps %xmm11,%xmm10 7289 65,15,89,192, //mulps %xmm8,%xmm0 7290 65,15,88,194, //addps %xmm10,%xmm0 7291 15,198,201,0, //shufps $0x0,%xmm1,%xmm1 7292 243,68,15,16,80,12, //movss 0xc(%rax),%xmm10 7293 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 7294 243,68,15,16,88,20, //movss 0x14(%rax),%xmm11 7295 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 7296 69,15,89,209, //mulps %xmm9,%xmm10 7297 69,15,88,211, //addps %xmm11,%xmm10 7298 65,15,89,200, //mulps %xmm8,%xmm1 7299 65,15,88,202, //addps %xmm10,%xmm1 7300 72,173, //lods %ds:(%rsi),%rax 7301 255,224, //jmpq *%rax 7302 }; 7303 7304 CODE const uint8_t sk_matrix_3x4_sse2[] = { 7305 68,15,40,201, //movaps %xmm1,%xmm9 7306 68,15,40,192, //movaps %xmm0,%xmm8 7307 72,173, //lods %ds:(%rsi),%rax 7308 243,15,16,0, //movss (%rax),%xmm0 7309 243,15,16,72,4, //movss 0x4(%rax),%xmm1 7310 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 7311 243,68,15,16,80,12, //movss 0xc(%rax),%xmm10 7312 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 7313 243,68,15,16,88,24, //movss 0x18(%rax),%xmm11 7314 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 7315 243,68,15,16,96,36, //movss 0x24(%rax),%xmm12 7316 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 7317 68,15,89,218, //mulps %xmm2,%xmm11 7318 69,15,88,220, //addps %xmm12,%xmm11 7319 69,15,89,209, //mulps %xmm9,%xmm10 7320 69,15,88,211, //addps %xmm11,%xmm10 7321 65,15,89,192, //mulps %xmm8,%xmm0 7322 65,15,88,194, //addps %xmm10,%xmm0 7323 15,198,201,0, //shufps $0x0,%xmm1,%xmm1 7324 243,68,15,16,80,16, //movss 0x10(%rax),%xmm10 7325 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 7326 243,68,15,16,88,28, //movss 0x1c(%rax),%xmm11 7327 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 7328 243,68,15,16,96,40, //movss 0x28(%rax),%xmm12 7329 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 7330 68,15,89,218, //mulps %xmm2,%xmm11 7331 69,15,88,220, //addps %xmm12,%xmm11 7332 69,15,89,209, //mulps %xmm9,%xmm10 7333 69,15,88,211, //addps %xmm11,%xmm10 7334 65,15,89,200, //mulps %xmm8,%xmm1 7335 65,15,88,202, //addps %xmm10,%xmm1 7336 243,68,15,16,80,8, //movss 0x8(%rax),%xmm10 7337 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 7338 243,68,15,16,88,20, //movss 0x14(%rax),%xmm11 7339 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 7340 243,68,15,16,96,32, //movss 0x20(%rax),%xmm12 7341 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 7342 243,68,15,16,104,44, //movss 0x2c(%rax),%xmm13 7343 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13 7344 68,15,89,226, //mulps %xmm2,%xmm12 7345 69,15,88,229, //addps %xmm13,%xmm12 7346 69,15,89,217, //mulps %xmm9,%xmm11 7347 69,15,88,220, //addps %xmm12,%xmm11 7348 69,15,89,208, //mulps %xmm8,%xmm10 7349 69,15,88,211, //addps %xmm11,%xmm10 7350 72,173, //lods %ds:(%rsi),%rax 7351 65,15,40,210, //movaps %xmm10,%xmm2 7352 255,224, //jmpq *%rax 7353 }; 7354 7355 CODE const uint8_t sk_matrix_4x5_sse2[] = { 7356 68,15,40,201, //movaps %xmm1,%xmm9 7357 68,15,40,192, //movaps %xmm0,%xmm8 7358 72,173, //lods %ds:(%rsi),%rax 7359 243,15,16,0, //movss (%rax),%xmm0 7360 243,15,16,72,4, //movss 0x4(%rax),%xmm1 7361 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 7362 243,68,15,16,80,16, //movss 0x10(%rax),%xmm10 7363 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 7364 243,68,15,16,88,32, //movss 0x20(%rax),%xmm11 7365 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 7366 243,68,15,16,96,48, //movss 0x30(%rax),%xmm12 7367 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 7368 243,68,15,16,104,64, //movss 0x40(%rax),%xmm13 7369 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13 7370 68,15,89,227, //mulps %xmm3,%xmm12 7371 69,15,88,229, //addps %xmm13,%xmm12 7372 68,15,89,218, //mulps %xmm2,%xmm11 7373 69,15,88,220, //addps %xmm12,%xmm11 7374 69,15,89,209, //mulps %xmm9,%xmm10 7375 69,15,88,211, //addps %xmm11,%xmm10 7376 65,15,89,192, //mulps %xmm8,%xmm0 7377 65,15,88,194, //addps %xmm10,%xmm0 7378 15,198,201,0, //shufps $0x0,%xmm1,%xmm1 7379 243,68,15,16,80,20, //movss 0x14(%rax),%xmm10 7380 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 7381 243,68,15,16,88,36, //movss 0x24(%rax),%xmm11 7382 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 7383 243,68,15,16,96,52, //movss 0x34(%rax),%xmm12 7384 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 7385 243,68,15,16,104,68, //movss 0x44(%rax),%xmm13 7386 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13 7387 68,15,89,227, //mulps %xmm3,%xmm12 7388 69,15,88,229, //addps %xmm13,%xmm12 7389 68,15,89,218, //mulps %xmm2,%xmm11 7390 69,15,88,220, //addps %xmm12,%xmm11 7391 69,15,89,209, //mulps %xmm9,%xmm10 7392 69,15,88,211, //addps %xmm11,%xmm10 7393 65,15,89,200, //mulps %xmm8,%xmm1 7394 65,15,88,202, //addps %xmm10,%xmm1 7395 243,68,15,16,80,8, //movss 0x8(%rax),%xmm10 7396 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 7397 243,68,15,16,88,24, //movss 0x18(%rax),%xmm11 7398 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 7399 243,68,15,16,96,40, //movss 0x28(%rax),%xmm12 7400 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 7401 243,68,15,16,104,56, //movss 0x38(%rax),%xmm13 7402 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13 7403 243,68,15,16,112,72, //movss 0x48(%rax),%xmm14 7404 69,15,198,246,0, //shufps $0x0,%xmm14,%xmm14 7405 68,15,89,235, //mulps %xmm3,%xmm13 7406 69,15,88,238, //addps %xmm14,%xmm13 7407 68,15,89,226, //mulps %xmm2,%xmm12 7408 69,15,88,229, //addps %xmm13,%xmm12 7409 69,15,89,217, //mulps %xmm9,%xmm11 7410 69,15,88,220, //addps %xmm12,%xmm11 7411 69,15,89,208, //mulps %xmm8,%xmm10 7412 69,15,88,211, //addps %xmm11,%xmm10 7413 243,68,15,16,88,12, //movss 0xc(%rax),%xmm11 7414 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 7415 243,68,15,16,96,28, //movss 0x1c(%rax),%xmm12 7416 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 7417 243,68,15,16,104,44, //movss 0x2c(%rax),%xmm13 7418 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13 7419 243,68,15,16,112,60, //movss 0x3c(%rax),%xmm14 7420 69,15,198,246,0, //shufps $0x0,%xmm14,%xmm14 7421 243,68,15,16,120,76, //movss 0x4c(%rax),%xmm15 7422 69,15,198,255,0, //shufps $0x0,%xmm15,%xmm15 7423 68,15,89,243, //mulps %xmm3,%xmm14 7424 69,15,88,247, //addps %xmm15,%xmm14 7425 68,15,89,234, //mulps %xmm2,%xmm13 7426 69,15,88,238, //addps %xmm14,%xmm13 7427 69,15,89,225, //mulps %xmm9,%xmm12 7428 69,15,88,229, //addps %xmm13,%xmm12 7429 69,15,89,216, //mulps %xmm8,%xmm11 7430 69,15,88,220, //addps %xmm12,%xmm11 7431 72,173, //lods %ds:(%rsi),%rax 7432 65,15,40,210, //movaps %xmm10,%xmm2 7433 65,15,40,219, //movaps %xmm11,%xmm3 7434 255,224, //jmpq *%rax 7435 }; 7436 7437 CODE const uint8_t sk_matrix_perspective_sse2[] = { 7438 68,15,40,192, //movaps %xmm0,%xmm8 7439 72,173, //lods %ds:(%rsi),%rax 7440 243,15,16,0, //movss (%rax),%xmm0 7441 243,68,15,16,72,4, //movss 0x4(%rax),%xmm9 7442 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 7443 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 7444 243,68,15,16,80,8, //movss 0x8(%rax),%xmm10 7445 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 7446 68,15,89,201, //mulps %xmm1,%xmm9 7447 69,15,88,202, //addps %xmm10,%xmm9 7448 65,15,89,192, //mulps %xmm8,%xmm0 7449 65,15,88,193, //addps %xmm9,%xmm0 7450 243,68,15,16,72,12, //movss 0xc(%rax),%xmm9 7451 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 7452 243,68,15,16,80,16, //movss 0x10(%rax),%xmm10 7453 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 7454 243,68,15,16,88,20, //movss 0x14(%rax),%xmm11 7455 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 7456 68,15,89,209, //mulps %xmm1,%xmm10 7457 69,15,88,211, //addps %xmm11,%xmm10 7458 69,15,89,200, //mulps %xmm8,%xmm9 7459 69,15,88,202, //addps %xmm10,%xmm9 7460 243,68,15,16,80,24, //movss 0x18(%rax),%xmm10 7461 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 7462 243,68,15,16,88,28, //movss 0x1c(%rax),%xmm11 7463 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 7464 243,68,15,16,96,32, //movss 0x20(%rax),%xmm12 7465 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 7466 68,15,89,217, //mulps %xmm1,%xmm11 7467 69,15,88,220, //addps %xmm12,%xmm11 7468 69,15,89,208, //mulps %xmm8,%xmm10 7469 69,15,88,211, //addps %xmm11,%xmm10 7470 65,15,83,202, //rcpps %xmm10,%xmm1 7471 15,89,193, //mulps %xmm1,%xmm0 7472 68,15,89,201, //mulps %xmm1,%xmm9 7473 72,173, //lods %ds:(%rsi),%rax 7474 65,15,40,201, //movaps %xmm9,%xmm1 7475 255,224, //jmpq *%rax 7476 }; 7477 7478 CODE const uint8_t sk_linear_gradient_2stops_sse2[] = { 7479 72,173, //lods %ds:(%rsi),%rax 7480 68,15,16,8, //movups (%rax),%xmm9 7481 15,16,88,16, //movups 0x10(%rax),%xmm3 7482 68,15,40,195, //movaps %xmm3,%xmm8 7483 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 7484 65,15,40,201, //movaps %xmm9,%xmm1 7485 15,198,201,0, //shufps $0x0,%xmm1,%xmm1 7486 68,15,89,192, //mulps %xmm0,%xmm8 7487 68,15,88,193, //addps %xmm1,%xmm8 7488 15,40,203, //movaps %xmm3,%xmm1 7489 15,198,201,85, //shufps $0x55,%xmm1,%xmm1 7490 65,15,40,209, //movaps %xmm9,%xmm2 7491 15,198,210,85, //shufps $0x55,%xmm2,%xmm2 7492 15,89,200, //mulps %xmm0,%xmm1 7493 15,88,202, //addps %xmm2,%xmm1 7494 15,40,211, //movaps %xmm3,%xmm2 7495 15,198,210,170, //shufps $0xaa,%xmm2,%xmm2 7496 69,15,40,209, //movaps %xmm9,%xmm10 7497 69,15,198,210,170, //shufps $0xaa,%xmm10,%xmm10 7498 15,89,208, //mulps %xmm0,%xmm2 7499 65,15,88,210, //addps %xmm10,%xmm2 7500 15,198,219,255, //shufps $0xff,%xmm3,%xmm3 7501 69,15,198,201,255, //shufps $0xff,%xmm9,%xmm9 7502 15,89,216, //mulps %xmm0,%xmm3 7503 65,15,88,217, //addps %xmm9,%xmm3 7504 72,173, //lods %ds:(%rsi),%rax 7505 65,15,40,192, //movaps %xmm8,%xmm0 7506 255,224, //jmpq *%rax 7507 }; 7508 #elif defined(_M_X64) 7509 7510 CODE const uint8_t sk_start_pipeline_hsw[] = { 7511 65,87, //push %r15 7512 65,86, //push %r14 7513 65,85, //push %r13 7514 65,84, //push %r12 7515 86, //push %rsi 7516 87, //push %rdi 7517 83, //push %rbx 7518 72,129,236,160,0,0,0, //sub $0xa0,%rsp 7519 197,120,41,188,36,144,0,0,0, //vmovaps %xmm15,0x90(%rsp) 7520 197,120,41,180,36,128,0,0,0, //vmovaps %xmm14,0x80(%rsp) 7521 197,120,41,108,36,112, //vmovaps %xmm13,0x70(%rsp) 7522 197,120,41,100,36,96, //vmovaps %xmm12,0x60(%rsp) 7523 197,120,41,92,36,80, //vmovaps %xmm11,0x50(%rsp) 7524 197,120,41,84,36,64, //vmovaps %xmm10,0x40(%rsp) 7525 197,120,41,76,36,48, //vmovaps %xmm9,0x30(%rsp) 7526 197,120,41,68,36,32, //vmovaps %xmm8,0x20(%rsp) 7527 197,248,41,124,36,16, //vmovaps %xmm7,0x10(%rsp) 7528 197,248,41,52,36, //vmovaps %xmm6,(%rsp) 7529 77,137,205, //mov %r9,%r13 7530 77,137,198, //mov %r8,%r14 7531 72,137,203, //mov %rcx,%rbx 7532 72,137,214, //mov %rdx,%rsi 7533 72,173, //lods %ds:(%rsi),%rax 7534 73,137,199, //mov %rax,%r15 7535 73,137,244, //mov %rsi,%r12 7536 72,141,67,8, //lea 0x8(%rbx),%rax 7537 76,57,232, //cmp %r13,%rax 7538 118,5, //jbe 75 <_sk_start_pipeline_hsw+0x75> 7539 72,137,223, //mov %rbx,%rdi 7540 235,65, //jmp b6 <_sk_start_pipeline_hsw+0xb6> 7541 185,0,0,0,0, //mov $0x0,%ecx 7542 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0 7543 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1 7544 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2 7545 197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3 7546 197,220,87,228, //vxorps %ymm4,%ymm4,%ymm4 7547 197,212,87,237, //vxorps %ymm5,%ymm5,%ymm5 7548 197,204,87,246, //vxorps %ymm6,%ymm6,%ymm6 7549 197,196,87,255, //vxorps %ymm7,%ymm7,%ymm7 7550 72,137,223, //mov %rbx,%rdi 7551 76,137,230, //mov %r12,%rsi 7552 76,137,242, //mov %r14,%rdx 7553 65,255,215, //callq *%r15 7554 72,141,123,8, //lea 0x8(%rbx),%rdi 7555 72,131,195,16, //add $0x10,%rbx 7556 76,57,235, //cmp %r13,%rbx 7557 72,137,251, //mov %rdi,%rbx 7558 118,191, //jbe 75 <_sk_start_pipeline_hsw+0x75> 7559 76,137,233, //mov %r13,%rcx 7560 72,41,249, //sub %rdi,%rcx 7561 116,41, //je e7 <_sk_start_pipeline_hsw+0xe7> 7562 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0 7563 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1 7564 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2 7565 197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3 7566 197,220,87,228, //vxorps %ymm4,%ymm4,%ymm4 7567 197,212,87,237, //vxorps %ymm5,%ymm5,%ymm5 7568 197,204,87,246, //vxorps %ymm6,%ymm6,%ymm6 7569 197,196,87,255, //vxorps %ymm7,%ymm7,%ymm7 7570 76,137,230, //mov %r12,%rsi 7571 76,137,242, //mov %r14,%rdx 7572 65,255,215, //callq *%r15 7573 76,137,232, //mov %r13,%rax 7574 197,248,40,52,36, //vmovaps (%rsp),%xmm6 7575 197,248,40,124,36,16, //vmovaps 0x10(%rsp),%xmm7 7576 197,120,40,68,36,32, //vmovaps 0x20(%rsp),%xmm8 7577 197,120,40,76,36,48, //vmovaps 0x30(%rsp),%xmm9 7578 197,120,40,84,36,64, //vmovaps 0x40(%rsp),%xmm10 7579 197,120,40,92,36,80, //vmovaps 0x50(%rsp),%xmm11 7580 197,120,40,100,36,96, //vmovaps 0x60(%rsp),%xmm12 7581 197,120,40,108,36,112, //vmovaps 0x70(%rsp),%xmm13 7582 197,120,40,180,36,128,0,0,0, //vmovaps 0x80(%rsp),%xmm14 7583 197,120,40,188,36,144,0,0,0, //vmovaps 0x90(%rsp),%xmm15 7584 72,129,196,160,0,0,0, //add $0xa0,%rsp 7585 91, //pop %rbx 7586 95, //pop %rdi 7587 94, //pop %rsi 7588 65,92, //pop %r12 7589 65,93, //pop %r13 7590 65,94, //pop %r14 7591 65,95, //pop %r15 7592 197,248,119, //vzeroupper 7593 195, //retq 7594 }; 7595 7596 CODE const uint8_t sk_just_return_hsw[] = { 7597 195, //retq 7598 }; 7599 7600 CODE const uint8_t sk_seed_shader_hsw[] = { 7601 72,173, //lods %ds:(%rsi),%rax 7602 197,249,110,199, //vmovd %edi,%xmm0 7603 196,226,125,88,192, //vpbroadcastd %xmm0,%ymm0 7604 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0 7605 65,184,0,0,0,63, //mov $0x3f000000,%r8d 7606 196,193,121,110,200, //vmovd %r8d,%xmm1 7607 196,226,125,88,201, //vpbroadcastd %xmm1,%ymm1 7608 197,252,88,193, //vaddps %ymm1,%ymm0,%ymm0 7609 197,252,88,2, //vaddps (%rdx),%ymm0,%ymm0 7610 196,226,125,24,16, //vbroadcastss (%rax),%ymm2 7611 197,252,91,210, //vcvtdq2ps %ymm2,%ymm2 7612 197,236,88,201, //vaddps %ymm1,%ymm2,%ymm1 7613 184,0,0,128,63, //mov $0x3f800000,%eax 7614 197,249,110,208, //vmovd %eax,%xmm2 7615 196,226,125,88,210, //vpbroadcastd %xmm2,%ymm2 7616 72,173, //lods %ds:(%rsi),%rax 7617 197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3 7618 197,220,87,228, //vxorps %ymm4,%ymm4,%ymm4 7619 197,212,87,237, //vxorps %ymm5,%ymm5,%ymm5 7620 197,204,87,246, //vxorps %ymm6,%ymm6,%ymm6 7621 197,196,87,255, //vxorps %ymm7,%ymm7,%ymm7 7622 255,224, //jmpq *%rax 7623 }; 7624 7625 CODE const uint8_t sk_constant_color_hsw[] = { 7626 72,173, //lods %ds:(%rsi),%rax 7627 196,226,125,24,0, //vbroadcastss (%rax),%ymm0 7628 196,226,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm1 7629 196,226,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm2 7630 196,226,125,24,88,12, //vbroadcastss 0xc(%rax),%ymm3 7631 72,173, //lods %ds:(%rsi),%rax 7632 255,224, //jmpq *%rax 7633 }; 7634 7635 CODE const uint8_t sk_clear_hsw[] = { 7636 72,173, //lods %ds:(%rsi),%rax 7637 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0 7638 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1 7639 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2 7640 197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3 7641 255,224, //jmpq *%rax 7642 }; 7643 7644 CODE const uint8_t sk_plus__hsw[] = { 7645 197,252,88,196, //vaddps %ymm4,%ymm0,%ymm0 7646 197,244,88,205, //vaddps %ymm5,%ymm1,%ymm1 7647 197,236,88,214, //vaddps %ymm6,%ymm2,%ymm2 7648 197,228,88,223, //vaddps %ymm7,%ymm3,%ymm3 7649 72,173, //lods %ds:(%rsi),%rax 7650 255,224, //jmpq *%rax 7651 }; 7652 7653 CODE const uint8_t sk_srcover_hsw[] = { 7654 184,0,0,128,63, //mov $0x3f800000,%eax 7655 197,121,110,192, //vmovd %eax,%xmm8 7656 196,66,125,88,192, //vpbroadcastd %xmm8,%ymm8 7657 197,60,92,195, //vsubps %ymm3,%ymm8,%ymm8 7658 196,194,93,184,192, //vfmadd231ps %ymm8,%ymm4,%ymm0 7659 196,194,85,184,200, //vfmadd231ps %ymm8,%ymm5,%ymm1 7660 196,194,77,184,208, //vfmadd231ps %ymm8,%ymm6,%ymm2 7661 196,194,69,184,216, //vfmadd231ps %ymm8,%ymm7,%ymm3 7662 72,173, //lods %ds:(%rsi),%rax 7663 255,224, //jmpq *%rax 7664 }; 7665 7666 CODE const uint8_t sk_dstover_hsw[] = { 7667 184,0,0,128,63, //mov $0x3f800000,%eax 7668 197,121,110,192, //vmovd %eax,%xmm8 7669 196,66,125,88,192, //vpbroadcastd %xmm8,%ymm8 7670 197,60,92,199, //vsubps %ymm7,%ymm8,%ymm8 7671 196,226,61,168,196, //vfmadd213ps %ymm4,%ymm8,%ymm0 7672 196,226,61,168,205, //vfmadd213ps %ymm5,%ymm8,%ymm1 7673 196,226,61,168,214, //vfmadd213ps %ymm6,%ymm8,%ymm2 7674 196,226,61,168,223, //vfmadd213ps %ymm7,%ymm8,%ymm3 7675 72,173, //lods %ds:(%rsi),%rax 7676 255,224, //jmpq *%rax 7677 }; 7678 7679 CODE const uint8_t sk_clamp_0_hsw[] = { 7680 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8 7681 196,193,124,95,192, //vmaxps %ymm8,%ymm0,%ymm0 7682 196,193,116,95,200, //vmaxps %ymm8,%ymm1,%ymm1 7683 196,193,108,95,208, //vmaxps %ymm8,%ymm2,%ymm2 7684 196,193,100,95,216, //vmaxps %ymm8,%ymm3,%ymm3 7685 72,173, //lods %ds:(%rsi),%rax 7686 255,224, //jmpq *%rax 7687 }; 7688 7689 CODE const uint8_t sk_clamp_1_hsw[] = { 7690 184,0,0,128,63, //mov $0x3f800000,%eax 7691 197,121,110,192, //vmovd %eax,%xmm8 7692 196,66,125,88,192, //vpbroadcastd %xmm8,%ymm8 7693 196,193,124,93,192, //vminps %ymm8,%ymm0,%ymm0 7694 196,193,116,93,200, //vminps %ymm8,%ymm1,%ymm1 7695 196,193,108,93,208, //vminps %ymm8,%ymm2,%ymm2 7696 196,193,100,93,216, //vminps %ymm8,%ymm3,%ymm3 7697 72,173, //lods %ds:(%rsi),%rax 7698 255,224, //jmpq *%rax 7699 }; 7700 7701 CODE const uint8_t sk_clamp_a_hsw[] = { 7702 184,0,0,128,63, //mov $0x3f800000,%eax 7703 197,121,110,192, //vmovd %eax,%xmm8 7704 196,66,125,88,192, //vpbroadcastd %xmm8,%ymm8 7705 196,193,100,93,216, //vminps %ymm8,%ymm3,%ymm3 7706 197,252,93,195, //vminps %ymm3,%ymm0,%ymm0 7707 197,244,93,203, //vminps %ymm3,%ymm1,%ymm1 7708 197,236,93,211, //vminps %ymm3,%ymm2,%ymm2 7709 72,173, //lods %ds:(%rsi),%rax 7710 255,224, //jmpq *%rax 7711 }; 7712 7713 CODE const uint8_t sk_set_rgb_hsw[] = { 7714 72,173, //lods %ds:(%rsi),%rax 7715 196,226,125,24,0, //vbroadcastss (%rax),%ymm0 7716 196,226,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm1 7717 196,226,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm2 7718 72,173, //lods %ds:(%rsi),%rax 7719 255,224, //jmpq *%rax 7720 }; 7721 7722 CODE const uint8_t sk_swap_rb_hsw[] = { 7723 197,124,40,192, //vmovaps %ymm0,%ymm8 7724 72,173, //lods %ds:(%rsi),%rax 7725 197,252,40,194, //vmovaps %ymm2,%ymm0 7726 197,124,41,194, //vmovaps %ymm8,%ymm2 7727 255,224, //jmpq *%rax 7728 }; 7729 7730 CODE const uint8_t sk_swap_hsw[] = { 7731 197,124,40,195, //vmovaps %ymm3,%ymm8 7732 197,124,40,202, //vmovaps %ymm2,%ymm9 7733 197,124,40,209, //vmovaps %ymm1,%ymm10 7734 197,124,40,216, //vmovaps %ymm0,%ymm11 7735 72,173, //lods %ds:(%rsi),%rax 7736 197,252,40,196, //vmovaps %ymm4,%ymm0 7737 197,252,40,205, //vmovaps %ymm5,%ymm1 7738 197,252,40,214, //vmovaps %ymm6,%ymm2 7739 197,252,40,223, //vmovaps %ymm7,%ymm3 7740 197,124,41,220, //vmovaps %ymm11,%ymm4 7741 197,124,41,213, //vmovaps %ymm10,%ymm5 7742 197,124,41,206, //vmovaps %ymm9,%ymm6 7743 197,124,41,199, //vmovaps %ymm8,%ymm7 7744 255,224, //jmpq *%rax 7745 }; 7746 7747 CODE const uint8_t sk_move_src_dst_hsw[] = { 7748 72,173, //lods %ds:(%rsi),%rax 7749 197,252,40,224, //vmovaps %ymm0,%ymm4 7750 197,252,40,233, //vmovaps %ymm1,%ymm5 7751 197,252,40,242, //vmovaps %ymm2,%ymm6 7752 197,252,40,251, //vmovaps %ymm3,%ymm7 7753 255,224, //jmpq *%rax 7754 }; 7755 7756 CODE const uint8_t sk_move_dst_src_hsw[] = { 7757 72,173, //lods %ds:(%rsi),%rax 7758 197,252,40,196, //vmovaps %ymm4,%ymm0 7759 197,252,40,205, //vmovaps %ymm5,%ymm1 7760 197,252,40,214, //vmovaps %ymm6,%ymm2 7761 197,252,40,223, //vmovaps %ymm7,%ymm3 7762 255,224, //jmpq *%rax 7763 }; 7764 7765 CODE const uint8_t sk_premul_hsw[] = { 7766 197,252,89,195, //vmulps %ymm3,%ymm0,%ymm0 7767 197,244,89,203, //vmulps %ymm3,%ymm1,%ymm1 7768 197,236,89,211, //vmulps %ymm3,%ymm2,%ymm2 7769 72,173, //lods %ds:(%rsi),%rax 7770 255,224, //jmpq *%rax 7771 }; 7772 7773 CODE const uint8_t sk_unpremul_hsw[] = { 7774 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8 7775 196,65,100,194,200,0, //vcmpeqps %ymm8,%ymm3,%ymm9 7776 184,0,0,128,63, //mov $0x3f800000,%eax 7777 197,121,110,208, //vmovd %eax,%xmm10 7778 196,66,125,88,210, //vpbroadcastd %xmm10,%ymm10 7779 197,44,94,211, //vdivps %ymm3,%ymm10,%ymm10 7780 196,67,45,74,192,144, //vblendvps %ymm9,%ymm8,%ymm10,%ymm8 7781 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0 7782 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1 7783 197,188,89,210, //vmulps %ymm2,%ymm8,%ymm2 7784 72,173, //lods %ds:(%rsi),%rax 7785 255,224, //jmpq *%rax 7786 }; 7787 7788 CODE const uint8_t sk_from_srgb_hsw[] = { 7789 184,145,131,158,61, //mov $0x3d9e8391,%eax 7790 197,121,110,192, //vmovd %eax,%xmm8 7791 196,66,125,88,192, //vpbroadcastd %xmm8,%ymm8 7792 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9 7793 197,124,89,208, //vmulps %ymm0,%ymm0,%ymm10 7794 184,154,153,153,62, //mov $0x3e99999a,%eax 7795 197,121,110,216, //vmovd %eax,%xmm11 7796 196,66,125,88,219, //vpbroadcastd %xmm11,%ymm11 7797 184,92,143,50,63, //mov $0x3f328f5c,%eax 7798 197,121,110,224, //vmovd %eax,%xmm12 7799 196,66,125,88,228, //vpbroadcastd %xmm12,%ymm12 7800 196,65,125,111,235, //vmovdqa %ymm11,%ymm13 7801 196,66,125,168,236, //vfmadd213ps %ymm12,%ymm0,%ymm13 7802 184,10,215,35,59, //mov $0x3b23d70a,%eax 7803 197,121,110,240, //vmovd %eax,%xmm14 7804 196,66,125,88,246, //vpbroadcastd %xmm14,%ymm14 7805 196,66,45,168,238, //vfmadd213ps %ymm14,%ymm10,%ymm13 7806 184,174,71,97,61, //mov $0x3d6147ae,%eax 7807 197,121,110,208, //vmovd %eax,%xmm10 7808 196,66,125,88,210, //vpbroadcastd %xmm10,%ymm10 7809 196,193,124,194,194,1, //vcmpltps %ymm10,%ymm0,%ymm0 7810 196,195,21,74,193,0, //vblendvps %ymm0,%ymm9,%ymm13,%ymm0 7811 197,60,89,201, //vmulps %ymm1,%ymm8,%ymm9 7812 197,116,89,233, //vmulps %ymm1,%ymm1,%ymm13 7813 196,65,125,111,251, //vmovdqa %ymm11,%ymm15 7814 196,66,117,168,252, //vfmadd213ps %ymm12,%ymm1,%ymm15 7815 196,66,21,168,254, //vfmadd213ps %ymm14,%ymm13,%ymm15 7816 196,193,116,194,202,1, //vcmpltps %ymm10,%ymm1,%ymm1 7817 196,195,5,74,201,16, //vblendvps %ymm1,%ymm9,%ymm15,%ymm1 7818 197,60,89,194, //vmulps %ymm2,%ymm8,%ymm8 7819 197,108,89,202, //vmulps %ymm2,%ymm2,%ymm9 7820 196,66,109,168,220, //vfmadd213ps %ymm12,%ymm2,%ymm11 7821 196,66,53,168,222, //vfmadd213ps %ymm14,%ymm9,%ymm11 7822 196,193,108,194,210,1, //vcmpltps %ymm10,%ymm2,%ymm2 7823 196,195,37,74,208,32, //vblendvps %ymm2,%ymm8,%ymm11,%ymm2 7824 72,173, //lods %ds:(%rsi),%rax 7825 255,224, //jmpq *%rax 7826 }; 7827 7828 CODE const uint8_t sk_to_srgb_hsw[] = { 7829 197,124,82,192, //vrsqrtps %ymm0,%ymm8 7830 196,65,124,83,216, //vrcpps %ymm8,%ymm11 7831 196,65,124,82,224, //vrsqrtps %ymm8,%ymm12 7832 184,41,92,71,65, //mov $0x41475c29,%eax 7833 197,121,110,192, //vmovd %eax,%xmm8 7834 196,66,125,88,192, //vpbroadcastd %xmm8,%ymm8 7835 197,60,89,232, //vmulps %ymm0,%ymm8,%ymm13 7836 184,0,0,128,63, //mov $0x3f800000,%eax 7837 197,121,110,200, //vmovd %eax,%xmm9 7838 196,66,125,88,201, //vpbroadcastd %xmm9,%ymm9 7839 184,194,135,210,62, //mov $0x3ed287c2,%eax 7840 197,121,110,208, //vmovd %eax,%xmm10 7841 196,66,125,88,210, //vpbroadcastd %xmm10,%ymm10 7842 184,206,111,48,63, //mov $0x3f306fce,%eax 7843 197,121,110,240, //vmovd %eax,%xmm14 7844 196,66,125,88,246, //vpbroadcastd %xmm14,%ymm14 7845 184,168,87,202,61, //mov $0x3dca57a8,%eax 7846 53,0,0,0,128, //xor $0x80000000,%eax 7847 197,121,110,248, //vmovd %eax,%xmm15 7848 196,66,125,88,255, //vpbroadcastd %xmm15,%ymm15 7849 196,66,13,168,223, //vfmadd213ps %ymm15,%ymm14,%ymm11 7850 196,66,45,184,220, //vfmadd231ps %ymm12,%ymm10,%ymm11 7851 196,65,52,93,219, //vminps %ymm11,%ymm9,%ymm11 7852 184,4,231,140,59, //mov $0x3b8ce704,%eax 7853 197,121,110,224, //vmovd %eax,%xmm12 7854 196,66,125,88,228, //vpbroadcastd %xmm12,%ymm12 7855 196,193,124,194,196,1, //vcmpltps %ymm12,%ymm0,%ymm0 7856 196,195,37,74,197,0, //vblendvps %ymm0,%ymm13,%ymm11,%ymm0 7857 197,124,82,217, //vrsqrtps %ymm1,%ymm11 7858 196,65,124,83,235, //vrcpps %ymm11,%ymm13 7859 196,65,124,82,219, //vrsqrtps %ymm11,%ymm11 7860 196,66,13,168,239, //vfmadd213ps %ymm15,%ymm14,%ymm13 7861 196,66,45,184,235, //vfmadd231ps %ymm11,%ymm10,%ymm13 7862 197,60,89,217, //vmulps %ymm1,%ymm8,%ymm11 7863 196,65,52,93,237, //vminps %ymm13,%ymm9,%ymm13 7864 196,193,116,194,204,1, //vcmpltps %ymm12,%ymm1,%ymm1 7865 196,195,21,74,203,16, //vblendvps %ymm1,%ymm11,%ymm13,%ymm1 7866 197,124,82,218, //vrsqrtps %ymm2,%ymm11 7867 196,65,124,83,235, //vrcpps %ymm11,%ymm13 7868 196,66,13,168,239, //vfmadd213ps %ymm15,%ymm14,%ymm13 7869 196,65,124,82,219, //vrsqrtps %ymm11,%ymm11 7870 196,66,45,184,235, //vfmadd231ps %ymm11,%ymm10,%ymm13 7871 196,65,52,93,205, //vminps %ymm13,%ymm9,%ymm9 7872 197,60,89,194, //vmulps %ymm2,%ymm8,%ymm8 7873 196,193,108,194,212,1, //vcmpltps %ymm12,%ymm2,%ymm2 7874 196,195,53,74,208,32, //vblendvps %ymm2,%ymm8,%ymm9,%ymm2 7875 72,173, //lods %ds:(%rsi),%rax 7876 255,224, //jmpq *%rax 7877 }; 7878 7879 CODE const uint8_t sk_scale_1_float_hsw[] = { 7880 72,173, //lods %ds:(%rsi),%rax 7881 196,98,125,24,0, //vbroadcastss (%rax),%ymm8 7882 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0 7883 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1 7884 197,188,89,210, //vmulps %ymm2,%ymm8,%ymm2 7885 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3 7886 72,173, //lods %ds:(%rsi),%rax 7887 255,224, //jmpq *%rax 7888 }; 7889 7890 CODE const uint8_t sk_scale_u8_hsw[] = { 7891 73,137,200, //mov %rcx,%r8 7892 72,173, //lods %ds:(%rsi),%rax 7893 72,139,0, //mov (%rax),%rax 7894 72,1,248, //add %rdi,%rax 7895 77,133,192, //test %r8,%r8 7896 117,56, //jne 556 <_sk_scale_u8_hsw+0x48> 7897 197,122,126,0, //vmovq (%rax),%xmm8 7898 196,66,125,49,192, //vpmovzxbd %xmm8,%ymm8 7899 196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8 7900 184,129,128,128,59, //mov $0x3b808081,%eax 7901 197,121,110,200, //vmovd %eax,%xmm9 7902 196,66,125,88,201, //vpbroadcastd %xmm9,%ymm9 7903 196,65,60,89,193, //vmulps %ymm9,%ymm8,%ymm8 7904 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0 7905 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1 7906 197,188,89,210, //vmulps %ymm2,%ymm8,%ymm2 7907 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3 7908 72,173, //lods %ds:(%rsi),%rax 7909 76,137,193, //mov %r8,%rcx 7910 255,224, //jmpq *%rax 7911 49,201, //xor %ecx,%ecx 7912 77,137,194, //mov %r8,%r10 7913 69,49,201, //xor %r9d,%r9d 7914 68,15,182,24, //movzbl (%rax),%r11d 7915 72,255,192, //inc %rax 7916 73,211,227, //shl %cl,%r11 7917 77,9,217, //or %r11,%r9 7918 72,131,193,8, //add $0x8,%rcx 7919 73,255,202, //dec %r10 7920 117,234, //jne 55e <_sk_scale_u8_hsw+0x50> 7921 196,65,249,110,193, //vmovq %r9,%xmm8 7922 235,167, //jmp 522 <_sk_scale_u8_hsw+0x14> 7923 }; 7924 7925 CODE const uint8_t sk_lerp_1_float_hsw[] = { 7926 72,173, //lods %ds:(%rsi),%rax 7927 196,98,125,24,0, //vbroadcastss (%rax),%ymm8 7928 197,252,92,196, //vsubps %ymm4,%ymm0,%ymm0 7929 196,226,61,168,196, //vfmadd213ps %ymm4,%ymm8,%ymm0 7930 197,244,92,205, //vsubps %ymm5,%ymm1,%ymm1 7931 196,226,61,168,205, //vfmadd213ps %ymm5,%ymm8,%ymm1 7932 197,236,92,214, //vsubps %ymm6,%ymm2,%ymm2 7933 196,226,61,168,214, //vfmadd213ps %ymm6,%ymm8,%ymm2 7934 197,228,92,223, //vsubps %ymm7,%ymm3,%ymm3 7935 196,226,61,168,223, //vfmadd213ps %ymm7,%ymm8,%ymm3 7936 72,173, //lods %ds:(%rsi),%rax 7937 255,224, //jmpq *%rax 7938 }; 7939 7940 CODE const uint8_t sk_lerp_u8_hsw[] = { 7941 73,137,200, //mov %rcx,%r8 7942 72,173, //lods %ds:(%rsi),%rax 7943 72,139,0, //mov (%rax),%rax 7944 72,1,248, //add %rdi,%rax 7945 77,133,192, //test %r8,%r8 7946 117,76, //jne 606 <_sk_lerp_u8_hsw+0x5c> 7947 197,122,126,0, //vmovq (%rax),%xmm8 7948 196,66,125,49,192, //vpmovzxbd %xmm8,%ymm8 7949 196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8 7950 184,129,128,128,59, //mov $0x3b808081,%eax 7951 197,121,110,200, //vmovd %eax,%xmm9 7952 196,66,125,88,201, //vpbroadcastd %xmm9,%ymm9 7953 196,65,60,89,193, //vmulps %ymm9,%ymm8,%ymm8 7954 197,252,92,196, //vsubps %ymm4,%ymm0,%ymm0 7955 196,226,61,168,196, //vfmadd213ps %ymm4,%ymm8,%ymm0 7956 197,244,92,205, //vsubps %ymm5,%ymm1,%ymm1 7957 196,226,61,168,205, //vfmadd213ps %ymm5,%ymm8,%ymm1 7958 197,236,92,214, //vsubps %ymm6,%ymm2,%ymm2 7959 196,226,61,168,214, //vfmadd213ps %ymm6,%ymm8,%ymm2 7960 197,228,92,223, //vsubps %ymm7,%ymm3,%ymm3 7961 196,226,61,168,223, //vfmadd213ps %ymm7,%ymm8,%ymm3 7962 72,173, //lods %ds:(%rsi),%rax 7963 76,137,193, //mov %r8,%rcx 7964 255,224, //jmpq *%rax 7965 49,201, //xor %ecx,%ecx 7966 77,137,194, //mov %r8,%r10 7967 69,49,201, //xor %r9d,%r9d 7968 68,15,182,24, //movzbl (%rax),%r11d 7969 72,255,192, //inc %rax 7970 73,211,227, //shl %cl,%r11 7971 77,9,217, //or %r11,%r9 7972 72,131,193,8, //add $0x8,%rcx 7973 73,255,202, //dec %r10 7974 117,234, //jne 60e <_sk_lerp_u8_hsw+0x64> 7975 196,65,249,110,193, //vmovq %r9,%xmm8 7976 235,147, //jmp 5be <_sk_lerp_u8_hsw+0x14> 7977 }; 7978 7979 CODE const uint8_t sk_lerp_565_hsw[] = { 7980 72,173, //lods %ds:(%rsi),%rax 7981 76,139,16, //mov (%rax),%r10 7982 72,133,201, //test %rcx,%rcx 7983 15,133,179,0,0,0, //jne 6ec <_sk_lerp_565_hsw+0xc1> 7984 196,193,122,111,28,122, //vmovdqu (%r10,%rdi,2),%xmm3 7985 196,98,125,51,195, //vpmovzxwd %xmm3,%ymm8 7986 184,0,248,0,0, //mov $0xf800,%eax 7987 197,249,110,216, //vmovd %eax,%xmm3 7988 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 7989 196,193,101,219,216, //vpand %ymm8,%ymm3,%ymm3 7990 197,124,91,203, //vcvtdq2ps %ymm3,%ymm9 7991 184,8,33,132,55, //mov $0x37842108,%eax 7992 197,249,110,216, //vmovd %eax,%xmm3 7993 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 7994 197,52,89,203, //vmulps %ymm3,%ymm9,%ymm9 7995 184,224,7,0,0, //mov $0x7e0,%eax 7996 197,249,110,216, //vmovd %eax,%xmm3 7997 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 7998 196,193,101,219,216, //vpand %ymm8,%ymm3,%ymm3 7999 197,124,91,211, //vcvtdq2ps %ymm3,%ymm10 8000 184,33,8,2,58, //mov $0x3a020821,%eax 8001 197,249,110,216, //vmovd %eax,%xmm3 8002 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 8003 197,44,89,211, //vmulps %ymm3,%ymm10,%ymm10 8004 184,31,0,0,0, //mov $0x1f,%eax 8005 197,249,110,216, //vmovd %eax,%xmm3 8006 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 8007 196,193,101,219,216, //vpand %ymm8,%ymm3,%ymm3 8008 197,124,91,195, //vcvtdq2ps %ymm3,%ymm8 8009 184,8,33,4,61, //mov $0x3d042108,%eax 8010 197,249,110,216, //vmovd %eax,%xmm3 8011 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 8012 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3 8013 197,252,92,196, //vsubps %ymm4,%ymm0,%ymm0 8014 196,226,53,168,196, //vfmadd213ps %ymm4,%ymm9,%ymm0 8015 197,244,92,205, //vsubps %ymm5,%ymm1,%ymm1 8016 196,226,45,168,205, //vfmadd213ps %ymm5,%ymm10,%ymm1 8017 197,236,92,214, //vsubps %ymm6,%ymm2,%ymm2 8018 196,226,101,168,214, //vfmadd213ps %ymm6,%ymm3,%ymm2 8019 184,0,0,128,63, //mov $0x3f800000,%eax 8020 197,249,110,216, //vmovd %eax,%xmm3 8021 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 8022 72,173, //lods %ds:(%rsi),%rax 8023 255,224, //jmpq *%rax 8024 65,137,200, //mov %ecx,%r8d 8025 65,128,224,7, //and $0x7,%r8b 8026 197,225,239,219, //vpxor %xmm3,%xmm3,%xmm3 8027 65,254,200, //dec %r8b 8028 65,128,248,6, //cmp $0x6,%r8b 8029 15,135,59,255,255,255, //ja 63f <_sk_lerp_565_hsw+0x14> 8030 69,15,182,192, //movzbl %r8b,%r8d 8031 76,141,13,73,0,0,0, //lea 0x49(%rip),%r9 # 758 <_sk_lerp_565_hsw+0x12d> 8032 75,99,4,129, //movslq (%r9,%r8,4),%rax 8033 76,1,200, //add %r9,%rax 8034 255,224, //jmpq *%rax 8035 197,225,239,219, //vpxor %xmm3,%xmm3,%xmm3 8036 196,193,97,196,92,122,12,6, //vpinsrw $0x6,0xc(%r10,%rdi,2),%xmm3,%xmm3 8037 196,193,97,196,92,122,10,5, //vpinsrw $0x5,0xa(%r10,%rdi,2),%xmm3,%xmm3 8038 196,193,97,196,92,122,8,4, //vpinsrw $0x4,0x8(%r10,%rdi,2),%xmm3,%xmm3 8039 196,193,97,196,92,122,6,3, //vpinsrw $0x3,0x6(%r10,%rdi,2),%xmm3,%xmm3 8040 196,193,97,196,92,122,4,2, //vpinsrw $0x2,0x4(%r10,%rdi,2),%xmm3,%xmm3 8041 196,193,97,196,92,122,2,1, //vpinsrw $0x1,0x2(%r10,%rdi,2),%xmm3,%xmm3 8042 196,193,97,196,28,122,0, //vpinsrw $0x0,(%r10,%rdi,2),%xmm3,%xmm3 8043 233,231,254,255,255, //jmpq 63f <_sk_lerp_565_hsw+0x14> 8044 244, //hlt 8045 255, //(bad) 8046 255, //(bad) 8047 255, //(bad) 8048 236, //in (%dx),%al 8049 255, //(bad) 8050 255, //(bad) 8051 255,228, //jmpq *%rsp 8052 255, //(bad) 8053 255, //(bad) 8054 255, //(bad) 8055 220,255, //fdivr %st,%st(7) 8056 255, //(bad) 8057 255,212, //callq *%rsp 8058 255, //(bad) 8059 255, //(bad) 8060 255,204, //dec %esp 8061 255, //(bad) 8062 255, //(bad) 8063 255,192, //inc %eax 8064 255, //(bad) 8065 255, //(bad) 8066 255, //.byte 0xff 8067 }; 8068 8069 CODE const uint8_t sk_load_tables_hsw[] = { 8070 73,137,200, //mov %rcx,%r8 8071 72,173, //lods %ds:(%rsi),%rax 8072 76,141,12,189,0,0,0,0, //lea 0x0(,%rdi,4),%r9 8073 76,3,8, //add (%rax),%r9 8074 77,133,192, //test %r8,%r8 8075 117,121, //jne 802 <_sk_load_tables_hsw+0x8e> 8076 196,193,126,111,25, //vmovdqu (%r9),%ymm3 8077 185,255,0,0,0, //mov $0xff,%ecx 8078 197,249,110,193, //vmovd %ecx,%xmm0 8079 196,226,125,88,208, //vpbroadcastd %xmm0,%ymm2 8080 197,237,219,203, //vpand %ymm3,%ymm2,%ymm1 8081 196,65,61,118,192, //vpcmpeqd %ymm8,%ymm8,%ymm8 8082 72,139,72,8, //mov 0x8(%rax),%rcx 8083 76,139,72,16, //mov 0x10(%rax),%r9 8084 196,65,53,118,201, //vpcmpeqd %ymm9,%ymm9,%ymm9 8085 196,226,53,146,4,137, //vgatherdps %ymm9,(%rcx,%ymm1,4),%ymm0 8086 197,245,114,211,8, //vpsrld $0x8,%ymm3,%ymm1 8087 197,109,219,201, //vpand %ymm1,%ymm2,%ymm9 8088 196,65,45,118,210, //vpcmpeqd %ymm10,%ymm10,%ymm10 8089 196,130,45,146,12,137, //vgatherdps %ymm10,(%r9,%ymm9,4),%ymm1 8090 72,139,64,24, //mov 0x18(%rax),%rax 8091 197,181,114,211,16, //vpsrld $0x10,%ymm3,%ymm9 8092 196,65,109,219,201, //vpand %ymm9,%ymm2,%ymm9 8093 196,162,61,146,20,136, //vgatherdps %ymm8,(%rax,%ymm9,4),%ymm2 8094 197,229,114,211,24, //vpsrld $0x18,%ymm3,%ymm3 8095 197,124,91,195, //vcvtdq2ps %ymm3,%ymm8 8096 184,129,128,128,59, //mov $0x3b808081,%eax 8097 197,249,110,216, //vmovd %eax,%xmm3 8098 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 8099 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3 8100 72,173, //lods %ds:(%rsi),%rax 8101 76,137,193, //mov %r8,%rcx 8102 255,224, //jmpq *%rax 8103 185,8,0,0,0, //mov $0x8,%ecx 8104 68,41,193, //sub %r8d,%ecx 8105 192,225,3, //shl $0x3,%cl 8106 73,199,194,255,255,255,255, //mov $0xffffffffffffffff,%r10 8107 73,211,234, //shr %cl,%r10 8108 196,193,249,110,194, //vmovq %r10,%xmm0 8109 196,226,125,33,192, //vpmovsxbd %xmm0,%ymm0 8110 196,194,125,140,25, //vpmaskmovd (%r9),%ymm0,%ymm3 8111 233,99,255,255,255, //jmpq 78e <_sk_load_tables_hsw+0x1a> 8112 }; 8113 8114 CODE const uint8_t sk_load_a8_hsw[] = { 8115 73,137,200, //mov %rcx,%r8 8116 72,173, //lods %ds:(%rsi),%rax 8117 72,139,0, //mov (%rax),%rax 8118 72,1,248, //add %rdi,%rax 8119 77,133,192, //test %r8,%r8 8120 117,50, //jne 86d <_sk_load_a8_hsw+0x42> 8121 197,250,126,0, //vmovq (%rax),%xmm0 8122 196,226,125,49,192, //vpmovzxbd %xmm0,%ymm0 8123 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0 8124 184,129,128,128,59, //mov $0x3b808081,%eax 8125 197,249,110,200, //vmovd %eax,%xmm1 8126 196,226,125,88,201, //vpbroadcastd %xmm1,%ymm1 8127 197,252,89,217, //vmulps %ymm1,%ymm0,%ymm3 8128 72,173, //lods %ds:(%rsi),%rax 8129 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0 8130 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1 8131 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2 8132 76,137,193, //mov %r8,%rcx 8133 255,224, //jmpq *%rax 8134 49,201, //xor %ecx,%ecx 8135 77,137,194, //mov %r8,%r10 8136 69,49,201, //xor %r9d,%r9d 8137 68,15,182,24, //movzbl (%rax),%r11d 8138 72,255,192, //inc %rax 8139 73,211,227, //shl %cl,%r11 8140 77,9,217, //or %r11,%r9 8141 72,131,193,8, //add $0x8,%rcx 8142 73,255,202, //dec %r10 8143 117,234, //jne 875 <_sk_load_a8_hsw+0x4a> 8144 196,193,249,110,193, //vmovq %r9,%xmm0 8145 235,173, //jmp 83f <_sk_load_a8_hsw+0x14> 8146 }; 8147 8148 CODE const uint8_t sk_store_a8_hsw[] = { 8149 72,173, //lods %ds:(%rsi),%rax 8150 76,139,8, //mov (%rax),%r9 8151 184,0,0,127,67, //mov $0x437f0000,%eax 8152 197,121,110,192, //vmovd %eax,%xmm8 8153 196,66,125,88,192, //vpbroadcastd %xmm8,%ymm8 8154 197,60,89,195, //vmulps %ymm3,%ymm8,%ymm8 8155 196,65,125,91,192, //vcvtps2dq %ymm8,%ymm8 8156 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9 8157 196,66,57,43,193, //vpackusdw %xmm9,%xmm8,%xmm8 8158 196,65,57,103,192, //vpackuswb %xmm8,%xmm8,%xmm8 8159 72,133,201, //test %rcx,%rcx 8160 117,10, //jne 8cd <_sk_store_a8_hsw+0x3b> 8161 196,65,123,17,4,57, //vmovsd %xmm8,(%r9,%rdi,1) 8162 72,173, //lods %ds:(%rsi),%rax 8163 255,224, //jmpq *%rax 8164 65,137,200, //mov %ecx,%r8d 8165 65,128,224,7, //and $0x7,%r8b 8166 65,254,200, //dec %r8b 8167 65,128,248,6, //cmp $0x6,%r8b 8168 119,236, //ja 8c9 <_sk_store_a8_hsw+0x37> 8169 196,66,121,48,192, //vpmovzxbw %xmm8,%xmm8 8170 65,15,182,192, //movzbl %r8b,%eax 8171 76,141,5,67,0,0,0, //lea 0x43(%rip),%r8 # 930 <_sk_store_a8_hsw+0x9e> 8172 73,99,4,128, //movslq (%r8,%rax,4),%rax 8173 76,1,192, //add %r8,%rax 8174 255,224, //jmpq *%rax 8175 196,67,121,20,68,57,6,12, //vpextrb $0xc,%xmm8,0x6(%r9,%rdi,1) 8176 196,67,121,20,68,57,5,10, //vpextrb $0xa,%xmm8,0x5(%r9,%rdi,1) 8177 196,67,121,20,68,57,4,8, //vpextrb $0x8,%xmm8,0x4(%r9,%rdi,1) 8178 196,67,121,20,68,57,3,6, //vpextrb $0x6,%xmm8,0x3(%r9,%rdi,1) 8179 196,67,121,20,68,57,2,4, //vpextrb $0x4,%xmm8,0x2(%r9,%rdi,1) 8180 196,67,121,20,68,57,1,2, //vpextrb $0x2,%xmm8,0x1(%r9,%rdi,1) 8181 196,67,121,20,4,57,0, //vpextrb $0x0,%xmm8,(%r9,%rdi,1) 8182 235,154, //jmp 8c9 <_sk_store_a8_hsw+0x37> 8183 144, //nop 8184 246,255, //idiv %bh 8185 255, //(bad) 8186 255, //(bad) 8187 238, //out %al,(%dx) 8188 255, //(bad) 8189 255, //(bad) 8190 255,230, //jmpq *%rsi 8191 255, //(bad) 8192 255, //(bad) 8193 255, //(bad) 8194 222,255, //fdivrp %st,%st(7) 8195 255, //(bad) 8196 255,214, //callq *%rsi 8197 255, //(bad) 8198 255, //(bad) 8199 255,206, //dec %esi 8200 255, //(bad) 8201 255, //(bad) 8202 255,198, //inc %esi 8203 255, //(bad) 8204 255, //(bad) 8205 255, //.byte 0xff 8206 }; 8207 8208 CODE const uint8_t sk_load_565_hsw[] = { 8209 72,173, //lods %ds:(%rsi),%rax 8210 76,139,16, //mov (%rax),%r10 8211 72,133,201, //test %rcx,%rcx 8212 15,133,149,0,0,0, //jne 9ef <_sk_load_565_hsw+0xa3> 8213 196,193,122,111,4,122, //vmovdqu (%r10,%rdi,2),%xmm0 8214 196,226,125,51,208, //vpmovzxwd %xmm0,%ymm2 8215 184,0,248,0,0, //mov $0xf800,%eax 8216 197,249,110,192, //vmovd %eax,%xmm0 8217 196,226,125,88,192, //vpbroadcastd %xmm0,%ymm0 8218 197,253,219,194, //vpand %ymm2,%ymm0,%ymm0 8219 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0 8220 184,8,33,132,55, //mov $0x37842108,%eax 8221 197,249,110,200, //vmovd %eax,%xmm1 8222 196,226,125,88,201, //vpbroadcastd %xmm1,%ymm1 8223 197,252,89,193, //vmulps %ymm1,%ymm0,%ymm0 8224 184,224,7,0,0, //mov $0x7e0,%eax 8225 197,249,110,200, //vmovd %eax,%xmm1 8226 196,226,125,88,201, //vpbroadcastd %xmm1,%ymm1 8227 197,245,219,202, //vpand %ymm2,%ymm1,%ymm1 8228 197,252,91,201, //vcvtdq2ps %ymm1,%ymm1 8229 184,33,8,2,58, //mov $0x3a020821,%eax 8230 197,249,110,216, //vmovd %eax,%xmm3 8231 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 8232 197,244,89,203, //vmulps %ymm3,%ymm1,%ymm1 8233 184,31,0,0,0, //mov $0x1f,%eax 8234 197,249,110,216, //vmovd %eax,%xmm3 8235 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 8236 197,229,219,210, //vpand %ymm2,%ymm3,%ymm2 8237 197,252,91,210, //vcvtdq2ps %ymm2,%ymm2 8238 184,8,33,4,61, //mov $0x3d042108,%eax 8239 197,249,110,216, //vmovd %eax,%xmm3 8240 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 8241 197,236,89,211, //vmulps %ymm3,%ymm2,%ymm2 8242 184,0,0,128,63, //mov $0x3f800000,%eax 8243 197,249,110,216, //vmovd %eax,%xmm3 8244 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 8245 72,173, //lods %ds:(%rsi),%rax 8246 255,224, //jmpq *%rax 8247 65,137,200, //mov %ecx,%r8d 8248 65,128,224,7, //and $0x7,%r8b 8249 197,249,239,192, //vpxor %xmm0,%xmm0,%xmm0 8250 65,254,200, //dec %r8b 8251 65,128,248,6, //cmp $0x6,%r8b 8252 15,135,89,255,255,255, //ja 960 <_sk_load_565_hsw+0x14> 8253 69,15,182,192, //movzbl %r8b,%r8d 8254 76,141,13,74,0,0,0, //lea 0x4a(%rip),%r9 # a5c <_sk_load_565_hsw+0x110> 8255 75,99,4,129, //movslq (%r9,%r8,4),%rax 8256 76,1,200, //add %r9,%rax 8257 255,224, //jmpq *%rax 8258 197,249,239,192, //vpxor %xmm0,%xmm0,%xmm0 8259 196,193,121,196,68,122,12,6, //vpinsrw $0x6,0xc(%r10,%rdi,2),%xmm0,%xmm0 8260 196,193,121,196,68,122,10,5, //vpinsrw $0x5,0xa(%r10,%rdi,2),%xmm0,%xmm0 8261 196,193,121,196,68,122,8,4, //vpinsrw $0x4,0x8(%r10,%rdi,2),%xmm0,%xmm0 8262 196,193,121,196,68,122,6,3, //vpinsrw $0x3,0x6(%r10,%rdi,2),%xmm0,%xmm0 8263 196,193,121,196,68,122,4,2, //vpinsrw $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0 8264 196,193,121,196,68,122,2,1, //vpinsrw $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0 8265 196,193,121,196,4,122,0, //vpinsrw $0x0,(%r10,%rdi,2),%xmm0,%xmm0 8266 233,5,255,255,255, //jmpq 960 <_sk_load_565_hsw+0x14> 8267 144, //nop 8268 243,255, //repz (bad) 8269 255, //(bad) 8270 255, //(bad) 8271 235,255, //jmp a61 <_sk_load_565_hsw+0x115> 8272 255, //(bad) 8273 255,227, //jmpq *%rbx 8274 255, //(bad) 8275 255, //(bad) 8276 255, //(bad) 8277 219,255, //(bad) 8278 255, //(bad) 8279 255,211, //callq *%rbx 8280 255, //(bad) 8281 255, //(bad) 8282 255,203, //dec %ebx 8283 255, //(bad) 8284 255, //(bad) 8285 255, //(bad) 8286 191, //.byte 0xbf 8287 255, //(bad) 8288 255, //(bad) 8289 255, //.byte 0xff 8290 }; 8291 8292 CODE const uint8_t sk_store_565_hsw[] = { 8293 72,173, //lods %ds:(%rsi),%rax 8294 76,139,8, //mov (%rax),%r9 8295 184,0,0,248,65, //mov $0x41f80000,%eax 8296 197,121,110,192, //vmovd %eax,%xmm8 8297 196,66,125,88,192, //vpbroadcastd %xmm8,%ymm8 8298 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9 8299 196,65,125,91,201, //vcvtps2dq %ymm9,%ymm9 8300 196,193,53,114,241,11, //vpslld $0xb,%ymm9,%ymm9 8301 184,0,0,124,66, //mov $0x427c0000,%eax 8302 197,121,110,208, //vmovd %eax,%xmm10 8303 196,66,125,88,210, //vpbroadcastd %xmm10,%ymm10 8304 197,44,89,209, //vmulps %ymm1,%ymm10,%ymm10 8305 196,65,125,91,210, //vcvtps2dq %ymm10,%ymm10 8306 196,193,45,114,242,5, //vpslld $0x5,%ymm10,%ymm10 8307 196,65,45,235,201, //vpor %ymm9,%ymm10,%ymm9 8308 197,60,89,194, //vmulps %ymm2,%ymm8,%ymm8 8309 196,65,125,91,192, //vcvtps2dq %ymm8,%ymm8 8310 196,65,53,235,192, //vpor %ymm8,%ymm9,%ymm8 8311 196,67,125,57,193,1, //vextracti128 $0x1,%ymm8,%xmm9 8312 196,66,57,43,193, //vpackusdw %xmm9,%xmm8,%xmm8 8313 72,133,201, //test %rcx,%rcx 8314 117,10, //jne ae4 <_sk_store_565_hsw+0x6c> 8315 196,65,122,127,4,121, //vmovdqu %xmm8,(%r9,%rdi,2) 8316 72,173, //lods %ds:(%rsi),%rax 8317 255,224, //jmpq *%rax 8318 65,137,200, //mov %ecx,%r8d 8319 65,128,224,7, //and $0x7,%r8b 8320 65,254,200, //dec %r8b 8321 65,128,248,6, //cmp $0x6,%r8b 8322 119,236, //ja ae0 <_sk_store_565_hsw+0x68> 8323 65,15,182,192, //movzbl %r8b,%eax 8324 76,141,5,69,0,0,0, //lea 0x45(%rip),%r8 # b44 <_sk_store_565_hsw+0xcc> 8325 73,99,4,128, //movslq (%r8,%rax,4),%rax 8326 76,1,192, //add %r8,%rax 8327 255,224, //jmpq *%rax 8328 196,67,121,21,68,121,12,6, //vpextrw $0x6,%xmm8,0xc(%r9,%rdi,2) 8329 196,67,121,21,68,121,10,5, //vpextrw $0x5,%xmm8,0xa(%r9,%rdi,2) 8330 196,67,121,21,68,121,8,4, //vpextrw $0x4,%xmm8,0x8(%r9,%rdi,2) 8331 196,67,121,21,68,121,6,3, //vpextrw $0x3,%xmm8,0x6(%r9,%rdi,2) 8332 196,67,121,21,68,121,4,2, //vpextrw $0x2,%xmm8,0x4(%r9,%rdi,2) 8333 196,67,121,21,68,121,2,1, //vpextrw $0x1,%xmm8,0x2(%r9,%rdi,2) 8334 196,67,121,21,4,121,0, //vpextrw $0x0,%xmm8,(%r9,%rdi,2) 8335 235,159, //jmp ae0 <_sk_store_565_hsw+0x68> 8336 15,31,0, //nopl (%rax) 8337 244, //hlt 8338 255, //(bad) 8339 255, //(bad) 8340 255, //(bad) 8341 236, //in (%dx),%al 8342 255, //(bad) 8343 255, //(bad) 8344 255,228, //jmpq *%rsp 8345 255, //(bad) 8346 255, //(bad) 8347 255, //(bad) 8348 220,255, //fdivr %st,%st(7) 8349 255, //(bad) 8350 255,212, //callq *%rsp 8351 255, //(bad) 8352 255, //(bad) 8353 255,204, //dec %esp 8354 255, //(bad) 8355 255, //(bad) 8356 255,196, //inc %esp 8357 255, //(bad) 8358 255, //(bad) 8359 255, //.byte 0xff 8360 }; 8361 8362 CODE const uint8_t sk_load_8888_hsw[] = { 8363 73,137,200, //mov %rcx,%r8 8364 72,173, //lods %ds:(%rsi),%rax 8365 76,141,12,189,0,0,0,0, //lea 0x0(,%rdi,4),%r9 8366 76,3,8, //add (%rax),%r9 8367 77,133,192, //test %r8,%r8 8368 117,104, //jne bdd <_sk_load_8888_hsw+0x7d> 8369 196,193,126,111,25, //vmovdqu (%r9),%ymm3 8370 184,255,0,0,0, //mov $0xff,%eax 8371 197,249,110,192, //vmovd %eax,%xmm0 8372 196,226,125,88,208, //vpbroadcastd %xmm0,%ymm2 8373 197,237,219,195, //vpand %ymm3,%ymm2,%ymm0 8374 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0 8375 184,129,128,128,59, //mov $0x3b808081,%eax 8376 197,249,110,200, //vmovd %eax,%xmm1 8377 196,98,125,88,193, //vpbroadcastd %xmm1,%ymm8 8378 196,193,124,89,192, //vmulps %ymm8,%ymm0,%ymm0 8379 197,245,114,211,8, //vpsrld $0x8,%ymm3,%ymm1 8380 197,237,219,201, //vpand %ymm1,%ymm2,%ymm1 8381 197,252,91,201, //vcvtdq2ps %ymm1,%ymm1 8382 196,193,116,89,200, //vmulps %ymm8,%ymm1,%ymm1 8383 197,181,114,211,16, //vpsrld $0x10,%ymm3,%ymm9 8384 196,193,109,219,209, //vpand %ymm9,%ymm2,%ymm2 8385 197,252,91,210, //vcvtdq2ps %ymm2,%ymm2 8386 196,193,108,89,208, //vmulps %ymm8,%ymm2,%ymm2 8387 197,229,114,211,24, //vpsrld $0x18,%ymm3,%ymm3 8388 197,252,91,219, //vcvtdq2ps %ymm3,%ymm3 8389 196,193,100,89,216, //vmulps %ymm8,%ymm3,%ymm3 8390 72,173, //lods %ds:(%rsi),%rax 8391 76,137,193, //mov %r8,%rcx 8392 255,224, //jmpq *%rax 8393 185,8,0,0,0, //mov $0x8,%ecx 8394 68,41,193, //sub %r8d,%ecx 8395 192,225,3, //shl $0x3,%cl 8396 72,199,192,255,255,255,255, //mov $0xffffffffffffffff,%rax 8397 72,211,232, //shr %cl,%rax 8398 196,225,249,110,192, //vmovq %rax,%xmm0 8399 196,226,125,33,192, //vpmovsxbd %xmm0,%ymm0 8400 196,194,125,140,25, //vpmaskmovd (%r9),%ymm0,%ymm3 8401 233,116,255,255,255, //jmpq b7a <_sk_load_8888_hsw+0x1a> 8402 }; 8403 8404 CODE const uint8_t sk_store_8888_hsw[] = { 8405 73,137,200, //mov %rcx,%r8 8406 72,173, //lods %ds:(%rsi),%rax 8407 76,141,12,189,0,0,0,0, //lea 0x0(,%rdi,4),%r9 8408 76,3,8, //add (%rax),%r9 8409 184,0,0,127,67, //mov $0x437f0000,%eax 8410 197,121,110,192, //vmovd %eax,%xmm8 8411 196,66,125,88,192, //vpbroadcastd %xmm8,%ymm8 8412 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9 8413 196,65,125,91,201, //vcvtps2dq %ymm9,%ymm9 8414 197,60,89,209, //vmulps %ymm1,%ymm8,%ymm10 8415 196,65,125,91,210, //vcvtps2dq %ymm10,%ymm10 8416 196,193,45,114,242,8, //vpslld $0x8,%ymm10,%ymm10 8417 196,65,45,235,201, //vpor %ymm9,%ymm10,%ymm9 8418 197,60,89,210, //vmulps %ymm2,%ymm8,%ymm10 8419 196,65,125,91,210, //vcvtps2dq %ymm10,%ymm10 8420 196,193,45,114,242,16, //vpslld $0x10,%ymm10,%ymm10 8421 197,60,89,195, //vmulps %ymm3,%ymm8,%ymm8 8422 196,65,125,91,192, //vcvtps2dq %ymm8,%ymm8 8423 196,193,61,114,240,24, //vpslld $0x18,%ymm8,%ymm8 8424 196,65,45,235,192, //vpor %ymm8,%ymm10,%ymm8 8425 196,65,53,235,192, //vpor %ymm8,%ymm9,%ymm8 8426 77,133,192, //test %r8,%r8 8427 117,12, //jne c7a <_sk_store_8888_hsw+0x74> 8428 196,65,126,127,1, //vmovdqu %ymm8,(%r9) 8429 72,173, //lods %ds:(%rsi),%rax 8430 76,137,193, //mov %r8,%rcx 8431 255,224, //jmpq *%rax 8432 185,8,0,0,0, //mov $0x8,%ecx 8433 68,41,193, //sub %r8d,%ecx 8434 192,225,3, //shl $0x3,%cl 8435 72,199,192,255,255,255,255, //mov $0xffffffffffffffff,%rax 8436 72,211,232, //shr %cl,%rax 8437 196,97,249,110,200, //vmovq %rax,%xmm9 8438 196,66,125,33,201, //vpmovsxbd %xmm9,%ymm9 8439 196,66,53,142,1, //vpmaskmovd %ymm8,%ymm9,(%r9) 8440 235,211, //jmp c73 <_sk_store_8888_hsw+0x6d> 8441 }; 8442 8443 CODE const uint8_t sk_load_f16_hsw[] = { 8444 72,173, //lods %ds:(%rsi),%rax 8445 72,139,0, //mov (%rax),%rax 8446 72,133,201, //test %rcx,%rcx 8447 117,97, //jne d0b <_sk_load_f16_hsw+0x6b> 8448 197,121,16,4,248, //vmovupd (%rax,%rdi,8),%xmm8 8449 197,249,16,84,248,16, //vmovupd 0x10(%rax,%rdi,8),%xmm2 8450 197,249,16,92,248,32, //vmovupd 0x20(%rax,%rdi,8),%xmm3 8451 197,122,111,76,248,48, //vmovdqu 0x30(%rax,%rdi,8),%xmm9 8452 197,185,97,194, //vpunpcklwd %xmm2,%xmm8,%xmm0 8453 197,185,105,210, //vpunpckhwd %xmm2,%xmm8,%xmm2 8454 196,193,97,97,201, //vpunpcklwd %xmm9,%xmm3,%xmm1 8455 196,193,97,105,217, //vpunpckhwd %xmm9,%xmm3,%xmm3 8456 197,121,97,194, //vpunpcklwd %xmm2,%xmm0,%xmm8 8457 197,121,105,202, //vpunpckhwd %xmm2,%xmm0,%xmm9 8458 197,241,97,211, //vpunpcklwd %xmm3,%xmm1,%xmm2 8459 197,241,105,219, //vpunpckhwd %xmm3,%xmm1,%xmm3 8460 197,185,108,194, //vpunpcklqdq %xmm2,%xmm8,%xmm0 8461 196,226,125,19,192, //vcvtph2ps %xmm0,%ymm0 8462 197,185,109,202, //vpunpckhqdq %xmm2,%xmm8,%xmm1 8463 196,226,125,19,201, //vcvtph2ps %xmm1,%ymm1 8464 197,177,108,211, //vpunpcklqdq %xmm3,%xmm9,%xmm2 8465 196,226,125,19,210, //vcvtph2ps %xmm2,%ymm2 8466 197,177,109,219, //vpunpckhqdq %xmm3,%xmm9,%xmm3 8467 196,226,125,19,219, //vcvtph2ps %xmm3,%ymm3 8468 72,173, //lods %ds:(%rsi),%rax 8469 255,224, //jmpq *%rax 8470 197,123,16,4,248, //vmovsd (%rax,%rdi,8),%xmm8 8471 196,65,49,239,201, //vpxor %xmm9,%xmm9,%xmm9 8472 72,131,249,1, //cmp $0x1,%rcx 8473 116,79, //je d6a <_sk_load_f16_hsw+0xca> 8474 197,57,22,68,248,8, //vmovhpd 0x8(%rax,%rdi,8),%xmm8,%xmm8 8475 72,131,249,3, //cmp $0x3,%rcx 8476 114,67, //jb d6a <_sk_load_f16_hsw+0xca> 8477 197,251,16,84,248,16, //vmovsd 0x10(%rax,%rdi,8),%xmm2 8478 72,131,249,3, //cmp $0x3,%rcx 8479 116,68, //je d77 <_sk_load_f16_hsw+0xd7> 8480 197,233,22,84,248,24, //vmovhpd 0x18(%rax,%rdi,8),%xmm2,%xmm2 8481 72,131,249,5, //cmp $0x5,%rcx 8482 114,56, //jb d77 <_sk_load_f16_hsw+0xd7> 8483 197,251,16,92,248,32, //vmovsd 0x20(%rax,%rdi,8),%xmm3 8484 72,131,249,5, //cmp $0x5,%rcx 8485 15,132,114,255,255,255, //je cc1 <_sk_load_f16_hsw+0x21> 8486 197,225,22,92,248,40, //vmovhpd 0x28(%rax,%rdi,8),%xmm3,%xmm3 8487 72,131,249,7, //cmp $0x7,%rcx 8488 15,130,98,255,255,255, //jb cc1 <_sk_load_f16_hsw+0x21> 8489 197,122,126,76,248,48, //vmovq 0x30(%rax,%rdi,8),%xmm9 8490 233,87,255,255,255, //jmpq cc1 <_sk_load_f16_hsw+0x21> 8491 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3 8492 197,233,87,210, //vxorpd %xmm2,%xmm2,%xmm2 8493 233,74,255,255,255, //jmpq cc1 <_sk_load_f16_hsw+0x21> 8494 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3 8495 233,65,255,255,255, //jmpq cc1 <_sk_load_f16_hsw+0x21> 8496 }; 8497 8498 CODE const uint8_t sk_store_f16_hsw[] = { 8499 72,173, //lods %ds:(%rsi),%rax 8500 72,139,0, //mov (%rax),%rax 8501 196,195,125,29,192,4, //vcvtps2ph $0x4,%ymm0,%xmm8 8502 196,195,125,29,201,4, //vcvtps2ph $0x4,%ymm1,%xmm9 8503 196,195,125,29,210,4, //vcvtps2ph $0x4,%ymm2,%xmm10 8504 196,195,125,29,219,4, //vcvtps2ph $0x4,%ymm3,%xmm11 8505 196,65,57,97,225, //vpunpcklwd %xmm9,%xmm8,%xmm12 8506 196,65,57,105,193, //vpunpckhwd %xmm9,%xmm8,%xmm8 8507 196,65,41,97,203, //vpunpcklwd %xmm11,%xmm10,%xmm9 8508 196,65,41,105,235, //vpunpckhwd %xmm11,%xmm10,%xmm13 8509 196,65,25,98,217, //vpunpckldq %xmm9,%xmm12,%xmm11 8510 196,65,25,106,209, //vpunpckhdq %xmm9,%xmm12,%xmm10 8511 196,65,57,98,205, //vpunpckldq %xmm13,%xmm8,%xmm9 8512 196,65,57,106,197, //vpunpckhdq %xmm13,%xmm8,%xmm8 8513 72,133,201, //test %rcx,%rcx 8514 117,27, //jne de5 <_sk_store_f16_hsw+0x65> 8515 197,120,17,28,248, //vmovups %xmm11,(%rax,%rdi,8) 8516 197,120,17,84,248,16, //vmovups %xmm10,0x10(%rax,%rdi,8) 8517 197,120,17,76,248,32, //vmovups %xmm9,0x20(%rax,%rdi,8) 8518 197,122,127,68,248,48, //vmovdqu %xmm8,0x30(%rax,%rdi,8) 8519 72,173, //lods %ds:(%rsi),%rax 8520 255,224, //jmpq *%rax 8521 197,121,214,28,248, //vmovq %xmm11,(%rax,%rdi,8) 8522 72,131,249,1, //cmp $0x1,%rcx 8523 116,241, //je de1 <_sk_store_f16_hsw+0x61> 8524 197,121,23,92,248,8, //vmovhpd %xmm11,0x8(%rax,%rdi,8) 8525 72,131,249,3, //cmp $0x3,%rcx 8526 114,229, //jb de1 <_sk_store_f16_hsw+0x61> 8527 197,121,214,84,248,16, //vmovq %xmm10,0x10(%rax,%rdi,8) 8528 116,221, //je de1 <_sk_store_f16_hsw+0x61> 8529 197,121,23,84,248,24, //vmovhpd %xmm10,0x18(%rax,%rdi,8) 8530 72,131,249,5, //cmp $0x5,%rcx 8531 114,209, //jb de1 <_sk_store_f16_hsw+0x61> 8532 197,121,214,76,248,32, //vmovq %xmm9,0x20(%rax,%rdi,8) 8533 116,201, //je de1 <_sk_store_f16_hsw+0x61> 8534 197,121,23,76,248,40, //vmovhpd %xmm9,0x28(%rax,%rdi,8) 8535 72,131,249,7, //cmp $0x7,%rcx 8536 114,189, //jb de1 <_sk_store_f16_hsw+0x61> 8537 197,121,214,68,248,48, //vmovq %xmm8,0x30(%rax,%rdi,8) 8538 235,181, //jmp de1 <_sk_store_f16_hsw+0x61> 8539 }; 8540 8541 CODE const uint8_t sk_store_f32_hsw[] = { 8542 72,173, //lods %ds:(%rsi),%rax 8543 76,139,0, //mov (%rax),%r8 8544 72,141,4,189,0,0,0,0, //lea 0x0(,%rdi,4),%rax 8545 197,124,20,193, //vunpcklps %ymm1,%ymm0,%ymm8 8546 197,124,21,217, //vunpckhps %ymm1,%ymm0,%ymm11 8547 197,108,20,203, //vunpcklps %ymm3,%ymm2,%ymm9 8548 197,108,21,227, //vunpckhps %ymm3,%ymm2,%ymm12 8549 196,65,61,20,209, //vunpcklpd %ymm9,%ymm8,%ymm10 8550 196,65,61,21,201, //vunpckhpd %ymm9,%ymm8,%ymm9 8551 196,65,37,20,196, //vunpcklpd %ymm12,%ymm11,%ymm8 8552 196,65,37,21,220, //vunpckhpd %ymm12,%ymm11,%ymm11 8553 72,133,201, //test %rcx,%rcx 8554 117,55, //jne e99 <_sk_store_f32_hsw+0x6d> 8555 196,67,45,24,225,1, //vinsertf128 $0x1,%xmm9,%ymm10,%ymm12 8556 196,67,61,24,235,1, //vinsertf128 $0x1,%xmm11,%ymm8,%ymm13 8557 196,67,45,6,201,49, //vperm2f128 $0x31,%ymm9,%ymm10,%ymm9 8558 196,67,61,6,195,49, //vperm2f128 $0x31,%ymm11,%ymm8,%ymm8 8559 196,65,125,17,36,128, //vmovupd %ymm12,(%r8,%rax,4) 8560 196,65,125,17,108,128,32, //vmovupd %ymm13,0x20(%r8,%rax,4) 8561 196,65,125,17,76,128,64, //vmovupd %ymm9,0x40(%r8,%rax,4) 8562 196,65,125,17,68,128,96, //vmovupd %ymm8,0x60(%r8,%rax,4) 8563 72,173, //lods %ds:(%rsi),%rax 8564 255,224, //jmpq *%rax 8565 196,65,121,17,20,128, //vmovupd %xmm10,(%r8,%rax,4) 8566 72,131,249,1, //cmp $0x1,%rcx 8567 116,240, //je e95 <_sk_store_f32_hsw+0x69> 8568 196,65,121,17,76,128,16, //vmovupd %xmm9,0x10(%r8,%rax,4) 8569 72,131,249,3, //cmp $0x3,%rcx 8570 114,227, //jb e95 <_sk_store_f32_hsw+0x69> 8571 196,65,121,17,68,128,32, //vmovupd %xmm8,0x20(%r8,%rax,4) 8572 116,218, //je e95 <_sk_store_f32_hsw+0x69> 8573 196,65,121,17,92,128,48, //vmovupd %xmm11,0x30(%r8,%rax,4) 8574 72,131,249,5, //cmp $0x5,%rcx 8575 114,205, //jb e95 <_sk_store_f32_hsw+0x69> 8576 196,67,125,25,84,128,64,1, //vextractf128 $0x1,%ymm10,0x40(%r8,%rax,4) 8577 116,195, //je e95 <_sk_store_f32_hsw+0x69> 8578 196,67,125,25,76,128,80,1, //vextractf128 $0x1,%ymm9,0x50(%r8,%rax,4) 8579 72,131,249,7, //cmp $0x7,%rcx 8580 114,181, //jb e95 <_sk_store_f32_hsw+0x69> 8581 196,67,125,25,68,128,96,1, //vextractf128 $0x1,%ymm8,0x60(%r8,%rax,4) 8582 235,171, //jmp e95 <_sk_store_f32_hsw+0x69> 8583 }; 8584 8585 CODE const uint8_t sk_clamp_x_hsw[] = { 8586 72,173, //lods %ds:(%rsi),%rax 8587 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8 8588 197,188,95,192, //vmaxps %ymm0,%ymm8,%ymm0 8589 196,98,125,88,0, //vpbroadcastd (%rax),%ymm8 8590 196,65,53,118,201, //vpcmpeqd %ymm9,%ymm9,%ymm9 8591 196,65,61,254,193, //vpaddd %ymm9,%ymm8,%ymm8 8592 196,193,124,93,192, //vminps %ymm8,%ymm0,%ymm0 8593 72,173, //lods %ds:(%rsi),%rax 8594 255,224, //jmpq *%rax 8595 }; 8596 8597 CODE const uint8_t sk_clamp_y_hsw[] = { 8598 72,173, //lods %ds:(%rsi),%rax 8599 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8 8600 197,188,95,201, //vmaxps %ymm1,%ymm8,%ymm1 8601 196,98,125,88,0, //vpbroadcastd (%rax),%ymm8 8602 196,65,53,118,201, //vpcmpeqd %ymm9,%ymm9,%ymm9 8603 196,65,61,254,193, //vpaddd %ymm9,%ymm8,%ymm8 8604 196,193,116,93,200, //vminps %ymm8,%ymm1,%ymm1 8605 72,173, //lods %ds:(%rsi),%rax 8606 255,224, //jmpq *%rax 8607 }; 8608 8609 CODE const uint8_t sk_repeat_x_hsw[] = { 8610 72,173, //lods %ds:(%rsi),%rax 8611 196,98,125,24,0, //vbroadcastss (%rax),%ymm8 8612 196,65,124,94,200, //vdivps %ymm8,%ymm0,%ymm9 8613 196,67,125,8,201,1, //vroundps $0x1,%ymm9,%ymm9 8614 196,98,61,172,200, //vfnmadd213ps %ymm0,%ymm8,%ymm9 8615 197,253,118,192, //vpcmpeqd %ymm0,%ymm0,%ymm0 8616 197,189,254,192, //vpaddd %ymm0,%ymm8,%ymm0 8617 197,180,93,192, //vminps %ymm0,%ymm9,%ymm0 8618 72,173, //lods %ds:(%rsi),%rax 8619 255,224, //jmpq *%rax 8620 }; 8621 8622 CODE const uint8_t sk_repeat_y_hsw[] = { 8623 72,173, //lods %ds:(%rsi),%rax 8624 196,98,125,24,0, //vbroadcastss (%rax),%ymm8 8625 196,65,116,94,200, //vdivps %ymm8,%ymm1,%ymm9 8626 196,67,125,8,201,1, //vroundps $0x1,%ymm9,%ymm9 8627 196,98,61,172,201, //vfnmadd213ps %ymm1,%ymm8,%ymm9 8628 197,245,118,201, //vpcmpeqd %ymm1,%ymm1,%ymm1 8629 197,189,254,201, //vpaddd %ymm1,%ymm8,%ymm1 8630 197,180,93,201, //vminps %ymm1,%ymm9,%ymm1 8631 72,173, //lods %ds:(%rsi),%rax 8632 255,224, //jmpq *%rax 8633 }; 8634 8635 CODE const uint8_t sk_mirror_x_hsw[] = { 8636 72,173, //lods %ds:(%rsi),%rax 8637 197,122,16,0, //vmovss (%rax),%xmm8 8638 196,66,125,24,200, //vbroadcastss %xmm8,%ymm9 8639 196,65,124,92,209, //vsubps %ymm9,%ymm0,%ymm10 8640 196,193,58,88,192, //vaddss %xmm8,%xmm8,%xmm0 8641 196,226,125,24,192, //vbroadcastss %xmm0,%ymm0 8642 197,44,94,192, //vdivps %ymm0,%ymm10,%ymm8 8643 196,67,125,8,192,1, //vroundps $0x1,%ymm8,%ymm8 8644 196,66,125,172,194, //vfnmadd213ps %ymm10,%ymm0,%ymm8 8645 196,193,60,92,193, //vsubps %ymm9,%ymm8,%ymm0 8646 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8 8647 197,60,92,192, //vsubps %ymm0,%ymm8,%ymm8 8648 197,188,84,192, //vandps %ymm0,%ymm8,%ymm0 8649 196,65,61,118,192, //vpcmpeqd %ymm8,%ymm8,%ymm8 8650 196,65,53,254,192, //vpaddd %ymm8,%ymm9,%ymm8 8651 196,193,124,93,192, //vminps %ymm8,%ymm0,%ymm0 8652 72,173, //lods %ds:(%rsi),%rax 8653 255,224, //jmpq *%rax 8654 }; 8655 8656 CODE const uint8_t sk_mirror_y_hsw[] = { 8657 72,173, //lods %ds:(%rsi),%rax 8658 197,122,16,0, //vmovss (%rax),%xmm8 8659 196,66,125,24,200, //vbroadcastss %xmm8,%ymm9 8660 196,65,116,92,209, //vsubps %ymm9,%ymm1,%ymm10 8661 196,193,58,88,200, //vaddss %xmm8,%xmm8,%xmm1 8662 196,226,125,24,201, //vbroadcastss %xmm1,%ymm1 8663 197,44,94,193, //vdivps %ymm1,%ymm10,%ymm8 8664 196,67,125,8,192,1, //vroundps $0x1,%ymm8,%ymm8 8665 196,66,117,172,194, //vfnmadd213ps %ymm10,%ymm1,%ymm8 8666 196,193,60,92,201, //vsubps %ymm9,%ymm8,%ymm1 8667 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8 8668 197,60,92,193, //vsubps %ymm1,%ymm8,%ymm8 8669 197,188,84,201, //vandps %ymm1,%ymm8,%ymm1 8670 196,65,61,118,192, //vpcmpeqd %ymm8,%ymm8,%ymm8 8671 196,65,53,254,192, //vpaddd %ymm8,%ymm9,%ymm8 8672 196,193,116,93,200, //vminps %ymm8,%ymm1,%ymm1 8673 72,173, //lods %ds:(%rsi),%rax 8674 255,224, //jmpq *%rax 8675 }; 8676 8677 CODE const uint8_t sk_luminance_to_alpha_hsw[] = { 8678 184,208,179,89,62, //mov $0x3e59b3d0,%eax 8679 197,249,110,216, //vmovd %eax,%xmm3 8680 196,98,125,88,195, //vpbroadcastd %xmm3,%ymm8 8681 184,89,23,55,63, //mov $0x3f371759,%eax 8682 197,249,110,216, //vmovd %eax,%xmm3 8683 196,226,125,88,219, //vpbroadcastd %xmm3,%ymm3 8684 197,228,89,201, //vmulps %ymm1,%ymm3,%ymm1 8685 196,98,125,168,193, //vfmadd213ps %ymm1,%ymm0,%ymm8 8686 184,152,221,147,61, //mov $0x3d93dd98,%eax 8687 197,249,110,192, //vmovd %eax,%xmm0 8688 196,226,125,88,216, //vpbroadcastd %xmm0,%ymm3 8689 196,194,109,168,216, //vfmadd213ps %ymm8,%ymm2,%ymm3 8690 72,173, //lods %ds:(%rsi),%rax 8691 197,253,239,192, //vpxor %ymm0,%ymm0,%ymm0 8692 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1 8693 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2 8694 255,224, //jmpq *%rax 8695 }; 8696 8697 CODE const uint8_t sk_matrix_2x3_hsw[] = { 8698 72,173, //lods %ds:(%rsi),%rax 8699 196,98,125,24,8, //vbroadcastss (%rax),%ymm9 8700 196,98,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm10 8701 196,98,125,24,64,16, //vbroadcastss 0x10(%rax),%ymm8 8702 196,66,117,184,194, //vfmadd231ps %ymm10,%ymm1,%ymm8 8703 196,66,125,184,193, //vfmadd231ps %ymm9,%ymm0,%ymm8 8704 196,98,125,24,80,4, //vbroadcastss 0x4(%rax),%ymm10 8705 196,98,125,24,88,12, //vbroadcastss 0xc(%rax),%ymm11 8706 196,98,125,24,72,20, //vbroadcastss 0x14(%rax),%ymm9 8707 196,66,117,184,203, //vfmadd231ps %ymm11,%ymm1,%ymm9 8708 196,66,125,184,202, //vfmadd231ps %ymm10,%ymm0,%ymm9 8709 72,173, //lods %ds:(%rsi),%rax 8710 197,124,41,192, //vmovaps %ymm8,%ymm0 8711 197,124,41,201, //vmovaps %ymm9,%ymm1 8712 255,224, //jmpq *%rax 8713 }; 8714 8715 CODE const uint8_t sk_matrix_3x4_hsw[] = { 8716 72,173, //lods %ds:(%rsi),%rax 8717 196,98,125,24,8, //vbroadcastss (%rax),%ymm9 8718 196,98,125,24,80,12, //vbroadcastss 0xc(%rax),%ymm10 8719 196,98,125,24,88,24, //vbroadcastss 0x18(%rax),%ymm11 8720 196,98,125,24,64,36, //vbroadcastss 0x24(%rax),%ymm8 8721 196,66,109,184,195, //vfmadd231ps %ymm11,%ymm2,%ymm8 8722 196,66,117,184,194, //vfmadd231ps %ymm10,%ymm1,%ymm8 8723 196,66,125,184,193, //vfmadd231ps %ymm9,%ymm0,%ymm8 8724 196,98,125,24,80,4, //vbroadcastss 0x4(%rax),%ymm10 8725 196,98,125,24,88,16, //vbroadcastss 0x10(%rax),%ymm11 8726 196,98,125,24,96,28, //vbroadcastss 0x1c(%rax),%ymm12 8727 196,98,125,24,72,40, //vbroadcastss 0x28(%rax),%ymm9 8728 196,66,109,184,204, //vfmadd231ps %ymm12,%ymm2,%ymm9 8729 196,66,117,184,203, //vfmadd231ps %ymm11,%ymm1,%ymm9 8730 196,66,125,184,202, //vfmadd231ps %ymm10,%ymm0,%ymm9 8731 196,98,125,24,88,8, //vbroadcastss 0x8(%rax),%ymm11 8732 196,98,125,24,96,20, //vbroadcastss 0x14(%rax),%ymm12 8733 196,98,125,24,104,32, //vbroadcastss 0x20(%rax),%ymm13 8734 196,98,125,24,80,44, //vbroadcastss 0x2c(%rax),%ymm10 8735 196,66,109,184,213, //vfmadd231ps %ymm13,%ymm2,%ymm10 8736 196,66,117,184,212, //vfmadd231ps %ymm12,%ymm1,%ymm10 8737 196,66,125,184,211, //vfmadd231ps %ymm11,%ymm0,%ymm10 8738 72,173, //lods %ds:(%rsi),%rax 8739 197,124,41,192, //vmovaps %ymm8,%ymm0 8740 197,124,41,201, //vmovaps %ymm9,%ymm1 8741 197,124,41,210, //vmovaps %ymm10,%ymm2 8742 255,224, //jmpq *%rax 8743 }; 8744 8745 CODE const uint8_t sk_matrix_4x5_hsw[] = { 8746 72,173, //lods %ds:(%rsi),%rax 8747 196,98,125,24,8, //vbroadcastss (%rax),%ymm9 8748 196,98,125,24,80,16, //vbroadcastss 0x10(%rax),%ymm10 8749 196,98,125,24,88,32, //vbroadcastss 0x20(%rax),%ymm11 8750 196,98,125,24,96,48, //vbroadcastss 0x30(%rax),%ymm12 8751 196,98,125,24,64,64, //vbroadcastss 0x40(%rax),%ymm8 8752 196,66,101,184,196, //vfmadd231ps %ymm12,%ymm3,%ymm8 8753 196,66,109,184,195, //vfmadd231ps %ymm11,%ymm2,%ymm8 8754 196,66,117,184,194, //vfmadd231ps %ymm10,%ymm1,%ymm8 8755 196,66,125,184,193, //vfmadd231ps %ymm9,%ymm0,%ymm8 8756 196,98,125,24,80,4, //vbroadcastss 0x4(%rax),%ymm10 8757 196,98,125,24,88,20, //vbroadcastss 0x14(%rax),%ymm11 8758 196,98,125,24,96,36, //vbroadcastss 0x24(%rax),%ymm12 8759 196,98,125,24,104,52, //vbroadcastss 0x34(%rax),%ymm13 8760 196,98,125,24,72,68, //vbroadcastss 0x44(%rax),%ymm9 8761 196,66,101,184,205, //vfmadd231ps %ymm13,%ymm3,%ymm9 8762 196,66,109,184,204, //vfmadd231ps %ymm12,%ymm2,%ymm9 8763 196,66,117,184,203, //vfmadd231ps %ymm11,%ymm1,%ymm9 8764 196,66,125,184,202, //vfmadd231ps %ymm10,%ymm0,%ymm9 8765 196,98,125,24,88,8, //vbroadcastss 0x8(%rax),%ymm11 8766 196,98,125,24,96,24, //vbroadcastss 0x18(%rax),%ymm12 8767 196,98,125,24,104,40, //vbroadcastss 0x28(%rax),%ymm13 8768 196,98,125,24,112,56, //vbroadcastss 0x38(%rax),%ymm14 8769 196,98,125,24,80,72, //vbroadcastss 0x48(%rax),%ymm10 8770 196,66,101,184,214, //vfmadd231ps %ymm14,%ymm3,%ymm10 8771 196,66,109,184,213, //vfmadd231ps %ymm13,%ymm2,%ymm10 8772 196,66,117,184,212, //vfmadd231ps %ymm12,%ymm1,%ymm10 8773 196,66,125,184,211, //vfmadd231ps %ymm11,%ymm0,%ymm10 8774 196,98,125,24,96,12, //vbroadcastss 0xc(%rax),%ymm12 8775 196,98,125,24,104,28, //vbroadcastss 0x1c(%rax),%ymm13 8776 196,98,125,24,112,44, //vbroadcastss 0x2c(%rax),%ymm14 8777 196,98,125,24,120,60, //vbroadcastss 0x3c(%rax),%ymm15 8778 196,98,125,24,88,76, //vbroadcastss 0x4c(%rax),%ymm11 8779 196,66,101,184,223, //vfmadd231ps %ymm15,%ymm3,%ymm11 8780 196,66,109,184,222, //vfmadd231ps %ymm14,%ymm2,%ymm11 8781 196,66,117,184,221, //vfmadd231ps %ymm13,%ymm1,%ymm11 8782 196,66,125,184,220, //vfmadd231ps %ymm12,%ymm0,%ymm11 8783 72,173, //lods %ds:(%rsi),%rax 8784 197,124,41,192, //vmovaps %ymm8,%ymm0 8785 197,124,41,201, //vmovaps %ymm9,%ymm1 8786 197,124,41,210, //vmovaps %ymm10,%ymm2 8787 197,124,41,219, //vmovaps %ymm11,%ymm3 8788 255,224, //jmpq *%rax 8789 }; 8790 8791 CODE const uint8_t sk_matrix_perspective_hsw[] = { 8792 72,173, //lods %ds:(%rsi),%rax 8793 196,98,125,24,0, //vbroadcastss (%rax),%ymm8 8794 196,98,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm9 8795 196,98,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm10 8796 196,66,117,184,209, //vfmadd231ps %ymm9,%ymm1,%ymm10 8797 196,66,125,184,208, //vfmadd231ps %ymm8,%ymm0,%ymm10 8798 196,98,125,24,64,12, //vbroadcastss 0xc(%rax),%ymm8 8799 196,98,125,24,72,16, //vbroadcastss 0x10(%rax),%ymm9 8800 196,98,125,24,88,20, //vbroadcastss 0x14(%rax),%ymm11 8801 196,66,117,184,217, //vfmadd231ps %ymm9,%ymm1,%ymm11 8802 196,66,125,184,216, //vfmadd231ps %ymm8,%ymm0,%ymm11 8803 196,98,125,24,64,24, //vbroadcastss 0x18(%rax),%ymm8 8804 196,98,125,24,72,28, //vbroadcastss 0x1c(%rax),%ymm9 8805 196,98,125,24,96,32, //vbroadcastss 0x20(%rax),%ymm12 8806 196,66,117,184,225, //vfmadd231ps %ymm9,%ymm1,%ymm12 8807 196,66,125,184,224, //vfmadd231ps %ymm8,%ymm0,%ymm12 8808 196,193,124,83,204, //vrcpps %ymm12,%ymm1 8809 197,172,89,193, //vmulps %ymm1,%ymm10,%ymm0 8810 197,164,89,201, //vmulps %ymm1,%ymm11,%ymm1 8811 72,173, //lods %ds:(%rsi),%rax 8812 255,224, //jmpq *%rax 8813 }; 8814 8815 CODE const uint8_t sk_linear_gradient_2stops_hsw[] = { 8816 72,173, //lods %ds:(%rsi),%rax 8817 196,226,125,24,72,16, //vbroadcastss 0x10(%rax),%ymm1 8818 196,98,125,24,0, //vbroadcastss (%rax),%ymm8 8819 196,98,125,184,193, //vfmadd231ps %ymm1,%ymm0,%ymm8 8820 196,226,125,24,80,20, //vbroadcastss 0x14(%rax),%ymm2 8821 196,226,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm1 8822 196,226,125,184,202, //vfmadd231ps %ymm2,%ymm0,%ymm1 8823 196,226,125,24,88,24, //vbroadcastss 0x18(%rax),%ymm3 8824 196,226,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm2 8825 196,226,125,184,211, //vfmadd231ps %ymm3,%ymm0,%ymm2 8826 196,98,125,24,72,28, //vbroadcastss 0x1c(%rax),%ymm9 8827 196,226,125,24,88,12, //vbroadcastss 0xc(%rax),%ymm3 8828 196,194,125,184,217, //vfmadd231ps %ymm9,%ymm0,%ymm3 8829 72,173, //lods %ds:(%rsi),%rax 8830 197,124,41,192, //vmovaps %ymm8,%ymm0 8831 255,224, //jmpq *%rax 8832 }; 8833 8834 CODE const uint8_t sk_start_pipeline_avx[] = { 8835 65,87, //push %r15 8836 65,86, //push %r14 8837 65,85, //push %r13 8838 65,84, //push %r12 8839 86, //push %rsi 8840 87, //push %rdi 8841 83, //push %rbx 8842 72,129,236,160,0,0,0, //sub $0xa0,%rsp 8843 197,120,41,188,36,144,0,0,0, //vmovaps %xmm15,0x90(%rsp) 8844 197,120,41,180,36,128,0,0,0, //vmovaps %xmm14,0x80(%rsp) 8845 197,120,41,108,36,112, //vmovaps %xmm13,0x70(%rsp) 8846 197,120,41,100,36,96, //vmovaps %xmm12,0x60(%rsp) 8847 197,120,41,92,36,80, //vmovaps %xmm11,0x50(%rsp) 8848 197,120,41,84,36,64, //vmovaps %xmm10,0x40(%rsp) 8849 197,120,41,76,36,48, //vmovaps %xmm9,0x30(%rsp) 8850 197,120,41,68,36,32, //vmovaps %xmm8,0x20(%rsp) 8851 197,248,41,124,36,16, //vmovaps %xmm7,0x10(%rsp) 8852 197,248,41,52,36, //vmovaps %xmm6,(%rsp) 8853 77,137,205, //mov %r9,%r13 8854 77,137,198, //mov %r8,%r14 8855 72,137,203, //mov %rcx,%rbx 8856 72,137,214, //mov %rdx,%rsi 8857 72,173, //lods %ds:(%rsi),%rax 8858 73,137,199, //mov %rax,%r15 8859 73,137,244, //mov %rsi,%r12 8860 72,141,67,8, //lea 0x8(%rbx),%rax 8861 76,57,232, //cmp %r13,%rax 8862 118,5, //jbe 75 <_sk_start_pipeline_avx+0x75> 8863 72,137,223, //mov %rbx,%rdi 8864 235,65, //jmp b6 <_sk_start_pipeline_avx+0xb6> 8865 185,0,0,0,0, //mov $0x0,%ecx 8866 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0 8867 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1 8868 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2 8869 197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3 8870 197,220,87,228, //vxorps %ymm4,%ymm4,%ymm4 8871 197,212,87,237, //vxorps %ymm5,%ymm5,%ymm5 8872 197,204,87,246, //vxorps %ymm6,%ymm6,%ymm6 8873 197,196,87,255, //vxorps %ymm7,%ymm7,%ymm7 8874 72,137,223, //mov %rbx,%rdi 8875 76,137,230, //mov %r12,%rsi 8876 76,137,242, //mov %r14,%rdx 8877 65,255,215, //callq *%r15 8878 72,141,123,8, //lea 0x8(%rbx),%rdi 8879 72,131,195,16, //add $0x10,%rbx 8880 76,57,235, //cmp %r13,%rbx 8881 72,137,251, //mov %rdi,%rbx 8882 118,191, //jbe 75 <_sk_start_pipeline_avx+0x75> 8883 76,137,233, //mov %r13,%rcx 8884 72,41,249, //sub %rdi,%rcx 8885 116,41, //je e7 <_sk_start_pipeline_avx+0xe7> 8886 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0 8887 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1 8888 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2 8889 197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3 8890 197,220,87,228, //vxorps %ymm4,%ymm4,%ymm4 8891 197,212,87,237, //vxorps %ymm5,%ymm5,%ymm5 8892 197,204,87,246, //vxorps %ymm6,%ymm6,%ymm6 8893 197,196,87,255, //vxorps %ymm7,%ymm7,%ymm7 8894 76,137,230, //mov %r12,%rsi 8895 76,137,242, //mov %r14,%rdx 8896 65,255,215, //callq *%r15 8897 76,137,232, //mov %r13,%rax 8898 197,248,40,52,36, //vmovaps (%rsp),%xmm6 8899 197,248,40,124,36,16, //vmovaps 0x10(%rsp),%xmm7 8900 197,120,40,68,36,32, //vmovaps 0x20(%rsp),%xmm8 8901 197,120,40,76,36,48, //vmovaps 0x30(%rsp),%xmm9 8902 197,120,40,84,36,64, //vmovaps 0x40(%rsp),%xmm10 8903 197,120,40,92,36,80, //vmovaps 0x50(%rsp),%xmm11 8904 197,120,40,100,36,96, //vmovaps 0x60(%rsp),%xmm12 8905 197,120,40,108,36,112, //vmovaps 0x70(%rsp),%xmm13 8906 197,120,40,180,36,128,0,0,0, //vmovaps 0x80(%rsp),%xmm14 8907 197,120,40,188,36,144,0,0,0, //vmovaps 0x90(%rsp),%xmm15 8908 72,129,196,160,0,0,0, //add $0xa0,%rsp 8909 91, //pop %rbx 8910 95, //pop %rdi 8911 94, //pop %rsi 8912 65,92, //pop %r12 8913 65,93, //pop %r13 8914 65,94, //pop %r14 8915 65,95, //pop %r15 8916 197,248,119, //vzeroupper 8917 195, //retq 8918 }; 8919 8920 CODE const uint8_t sk_just_return_avx[] = { 8921 195, //retq 8922 }; 8923 8924 CODE const uint8_t sk_seed_shader_avx[] = { 8925 72,173, //lods %ds:(%rsi),%rax 8926 197,249,110,199, //vmovd %edi,%xmm0 8927 197,249,112,192,0, //vpshufd $0x0,%xmm0,%xmm0 8928 196,227,125,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm0,%ymm0 8929 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0 8930 65,184,0,0,0,63, //mov $0x3f000000,%r8d 8931 196,193,121,110,200, //vmovd %r8d,%xmm1 8932 196,227,121,4,201,0, //vpermilps $0x0,%xmm1,%xmm1 8933 196,227,117,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm1,%ymm1 8934 197,252,88,193, //vaddps %ymm1,%ymm0,%ymm0 8935 197,252,88,2, //vaddps (%rdx),%ymm0,%ymm0 8936 196,226,125,24,16, //vbroadcastss (%rax),%ymm2 8937 197,252,91,210, //vcvtdq2ps %ymm2,%ymm2 8938 197,236,88,201, //vaddps %ymm1,%ymm2,%ymm1 8939 184,0,0,128,63, //mov $0x3f800000,%eax 8940 197,249,110,208, //vmovd %eax,%xmm2 8941 196,227,121,4,210,0, //vpermilps $0x0,%xmm2,%xmm2 8942 196,227,109,24,210,1, //vinsertf128 $0x1,%xmm2,%ymm2,%ymm2 8943 72,173, //lods %ds:(%rsi),%rax 8944 197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3 8945 197,220,87,228, //vxorps %ymm4,%ymm4,%ymm4 8946 197,212,87,237, //vxorps %ymm5,%ymm5,%ymm5 8947 197,204,87,246, //vxorps %ymm6,%ymm6,%ymm6 8948 197,196,87,255, //vxorps %ymm7,%ymm7,%ymm7 8949 255,224, //jmpq *%rax 8950 }; 8951 8952 CODE const uint8_t sk_constant_color_avx[] = { 8953 72,173, //lods %ds:(%rsi),%rax 8954 196,226,125,24,0, //vbroadcastss (%rax),%ymm0 8955 196,226,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm1 8956 196,226,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm2 8957 196,226,125,24,88,12, //vbroadcastss 0xc(%rax),%ymm3 8958 72,173, //lods %ds:(%rsi),%rax 8959 255,224, //jmpq *%rax 8960 }; 8961 8962 CODE const uint8_t sk_clear_avx[] = { 8963 72,173, //lods %ds:(%rsi),%rax 8964 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0 8965 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1 8966 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2 8967 197,228,87,219, //vxorps %ymm3,%ymm3,%ymm3 8968 255,224, //jmpq *%rax 8969 }; 8970 8971 CODE const uint8_t sk_plus__avx[] = { 8972 197,252,88,196, //vaddps %ymm4,%ymm0,%ymm0 8973 197,244,88,205, //vaddps %ymm5,%ymm1,%ymm1 8974 197,236,88,214, //vaddps %ymm6,%ymm2,%ymm2 8975 197,228,88,223, //vaddps %ymm7,%ymm3,%ymm3 8976 72,173, //lods %ds:(%rsi),%rax 8977 255,224, //jmpq *%rax 8978 }; 8979 8980 CODE const uint8_t sk_srcover_avx[] = { 8981 184,0,0,128,63, //mov $0x3f800000,%eax 8982 197,121,110,192, //vmovd %eax,%xmm8 8983 196,67,121,4,192,0, //vpermilps $0x0,%xmm8,%xmm8 8984 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8 8985 197,60,92,195, //vsubps %ymm3,%ymm8,%ymm8 8986 197,60,89,204, //vmulps %ymm4,%ymm8,%ymm9 8987 197,180,88,192, //vaddps %ymm0,%ymm9,%ymm0 8988 197,60,89,205, //vmulps %ymm5,%ymm8,%ymm9 8989 197,180,88,201, //vaddps %ymm1,%ymm9,%ymm1 8990 197,60,89,206, //vmulps %ymm6,%ymm8,%ymm9 8991 197,180,88,210, //vaddps %ymm2,%ymm9,%ymm2 8992 197,60,89,199, //vmulps %ymm7,%ymm8,%ymm8 8993 197,188,88,219, //vaddps %ymm3,%ymm8,%ymm3 8994 72,173, //lods %ds:(%rsi),%rax 8995 255,224, //jmpq *%rax 8996 }; 8997 8998 CODE const uint8_t sk_dstover_avx[] = { 8999 184,0,0,128,63, //mov $0x3f800000,%eax 9000 197,121,110,192, //vmovd %eax,%xmm8 9001 196,67,121,4,192,0, //vpermilps $0x0,%xmm8,%xmm8 9002 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8 9003 197,60,92,199, //vsubps %ymm7,%ymm8,%ymm8 9004 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0 9005 197,252,88,196, //vaddps %ymm4,%ymm0,%ymm0 9006 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1 9007 197,244,88,205, //vaddps %ymm5,%ymm1,%ymm1 9008 197,188,89,210, //vmulps %ymm2,%ymm8,%ymm2 9009 197,236,88,214, //vaddps %ymm6,%ymm2,%ymm2 9010 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3 9011 197,228,88,223, //vaddps %ymm7,%ymm3,%ymm3 9012 72,173, //lods %ds:(%rsi),%rax 9013 255,224, //jmpq *%rax 9014 }; 9015 9016 CODE const uint8_t sk_clamp_0_avx[] = { 9017 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8 9018 196,193,124,95,192, //vmaxps %ymm8,%ymm0,%ymm0 9019 196,193,116,95,200, //vmaxps %ymm8,%ymm1,%ymm1 9020 196,193,108,95,208, //vmaxps %ymm8,%ymm2,%ymm2 9021 196,193,100,95,216, //vmaxps %ymm8,%ymm3,%ymm3 9022 72,173, //lods %ds:(%rsi),%rax 9023 255,224, //jmpq *%rax 9024 }; 9025 9026 CODE const uint8_t sk_clamp_1_avx[] = { 9027 184,0,0,128,63, //mov $0x3f800000,%eax 9028 197,121,110,192, //vmovd %eax,%xmm8 9029 196,67,121,4,192,0, //vpermilps $0x0,%xmm8,%xmm8 9030 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8 9031 196,193,124,93,192, //vminps %ymm8,%ymm0,%ymm0 9032 196,193,116,93,200, //vminps %ymm8,%ymm1,%ymm1 9033 196,193,108,93,208, //vminps %ymm8,%ymm2,%ymm2 9034 196,193,100,93,216, //vminps %ymm8,%ymm3,%ymm3 9035 72,173, //lods %ds:(%rsi),%rax 9036 255,224, //jmpq *%rax 9037 }; 9038 9039 CODE const uint8_t sk_clamp_a_avx[] = { 9040 184,0,0,128,63, //mov $0x3f800000,%eax 9041 197,121,110,192, //vmovd %eax,%xmm8 9042 196,67,121,4,192,0, //vpermilps $0x0,%xmm8,%xmm8 9043 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8 9044 196,193,100,93,216, //vminps %ymm8,%ymm3,%ymm3 9045 197,252,93,195, //vminps %ymm3,%ymm0,%ymm0 9046 197,244,93,203, //vminps %ymm3,%ymm1,%ymm1 9047 197,236,93,211, //vminps %ymm3,%ymm2,%ymm2 9048 72,173, //lods %ds:(%rsi),%rax 9049 255,224, //jmpq *%rax 9050 }; 9051 9052 CODE const uint8_t sk_set_rgb_avx[] = { 9053 72,173, //lods %ds:(%rsi),%rax 9054 196,226,125,24,0, //vbroadcastss (%rax),%ymm0 9055 196,226,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm1 9056 196,226,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm2 9057 72,173, //lods %ds:(%rsi),%rax 9058 255,224, //jmpq *%rax 9059 }; 9060 9061 CODE const uint8_t sk_swap_rb_avx[] = { 9062 197,124,40,192, //vmovaps %ymm0,%ymm8 9063 72,173, //lods %ds:(%rsi),%rax 9064 197,252,40,194, //vmovaps %ymm2,%ymm0 9065 197,124,41,194, //vmovaps %ymm8,%ymm2 9066 255,224, //jmpq *%rax 9067 }; 9068 9069 CODE const uint8_t sk_swap_avx[] = { 9070 197,124,40,195, //vmovaps %ymm3,%ymm8 9071 197,124,40,202, //vmovaps %ymm2,%ymm9 9072 197,124,40,209, //vmovaps %ymm1,%ymm10 9073 197,124,40,216, //vmovaps %ymm0,%ymm11 9074 72,173, //lods %ds:(%rsi),%rax 9075 197,252,40,196, //vmovaps %ymm4,%ymm0 9076 197,252,40,205, //vmovaps %ymm5,%ymm1 9077 197,252,40,214, //vmovaps %ymm6,%ymm2 9078 197,252,40,223, //vmovaps %ymm7,%ymm3 9079 197,124,41,220, //vmovaps %ymm11,%ymm4 9080 197,124,41,213, //vmovaps %ymm10,%ymm5 9081 197,124,41,206, //vmovaps %ymm9,%ymm6 9082 197,124,41,199, //vmovaps %ymm8,%ymm7 9083 255,224, //jmpq *%rax 9084 }; 9085 9086 CODE const uint8_t sk_move_src_dst_avx[] = { 9087 72,173, //lods %ds:(%rsi),%rax 9088 197,252,40,224, //vmovaps %ymm0,%ymm4 9089 197,252,40,233, //vmovaps %ymm1,%ymm5 9090 197,252,40,242, //vmovaps %ymm2,%ymm6 9091 197,252,40,251, //vmovaps %ymm3,%ymm7 9092 255,224, //jmpq *%rax 9093 }; 9094 9095 CODE const uint8_t sk_move_dst_src_avx[] = { 9096 72,173, //lods %ds:(%rsi),%rax 9097 197,252,40,196, //vmovaps %ymm4,%ymm0 9098 197,252,40,205, //vmovaps %ymm5,%ymm1 9099 197,252,40,214, //vmovaps %ymm6,%ymm2 9100 197,252,40,223, //vmovaps %ymm7,%ymm3 9101 255,224, //jmpq *%rax 9102 }; 9103 9104 CODE const uint8_t sk_premul_avx[] = { 9105 197,252,89,195, //vmulps %ymm3,%ymm0,%ymm0 9106 197,244,89,203, //vmulps %ymm3,%ymm1,%ymm1 9107 197,236,89,211, //vmulps %ymm3,%ymm2,%ymm2 9108 72,173, //lods %ds:(%rsi),%rax 9109 255,224, //jmpq *%rax 9110 }; 9111 9112 CODE const uint8_t sk_unpremul_avx[] = { 9113 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8 9114 196,65,100,194,200,0, //vcmpeqps %ymm8,%ymm3,%ymm9 9115 184,0,0,128,63, //mov $0x3f800000,%eax 9116 197,121,110,208, //vmovd %eax,%xmm10 9117 196,67,121,4,210,0, //vpermilps $0x0,%xmm10,%xmm10 9118 196,67,45,24,210,1, //vinsertf128 $0x1,%xmm10,%ymm10,%ymm10 9119 197,44,94,211, //vdivps %ymm3,%ymm10,%ymm10 9120 196,67,45,74,192,144, //vblendvps %ymm9,%ymm8,%ymm10,%ymm8 9121 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0 9122 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1 9123 197,188,89,210, //vmulps %ymm2,%ymm8,%ymm2 9124 72,173, //lods %ds:(%rsi),%rax 9125 255,224, //jmpq *%rax 9126 }; 9127 9128 CODE const uint8_t sk_from_srgb_avx[] = { 9129 184,145,131,158,61, //mov $0x3d9e8391,%eax 9130 197,121,110,192, //vmovd %eax,%xmm8 9131 196,67,121,4,192,0, //vpermilps $0x0,%xmm8,%xmm8 9132 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8 9133 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9 9134 197,124,89,208, //vmulps %ymm0,%ymm0,%ymm10 9135 184,154,153,153,62, //mov $0x3e99999a,%eax 9136 197,121,110,216, //vmovd %eax,%xmm11 9137 196,67,121,4,219,0, //vpermilps $0x0,%xmm11,%xmm11 9138 196,67,37,24,219,1, //vinsertf128 $0x1,%xmm11,%ymm11,%ymm11 9139 184,92,143,50,63, //mov $0x3f328f5c,%eax 9140 197,121,110,224, //vmovd %eax,%xmm12 9141 196,67,121,4,228,0, //vpermilps $0x0,%xmm12,%xmm12 9142 196,67,29,24,228,1, //vinsertf128 $0x1,%xmm12,%ymm12,%ymm12 9143 197,36,89,232, //vmulps %ymm0,%ymm11,%ymm13 9144 196,65,20,88,236, //vaddps %ymm12,%ymm13,%ymm13 9145 184,10,215,35,59, //mov $0x3b23d70a,%eax 9146 197,121,110,240, //vmovd %eax,%xmm14 9147 196,67,121,4,246,0, //vpermilps $0x0,%xmm14,%xmm14 9148 196,67,13,24,246,1, //vinsertf128 $0x1,%xmm14,%ymm14,%ymm14 9149 196,65,44,89,213, //vmulps %ymm13,%ymm10,%ymm10 9150 196,65,12,88,210, //vaddps %ymm10,%ymm14,%ymm10 9151 184,174,71,97,61, //mov $0x3d6147ae,%eax 9152 197,121,110,232, //vmovd %eax,%xmm13 9153 196,67,121,4,237,0, //vpermilps $0x0,%xmm13,%xmm13 9154 196,67,21,24,237,1, //vinsertf128 $0x1,%xmm13,%ymm13,%ymm13 9155 196,193,124,194,197,1, //vcmpltps %ymm13,%ymm0,%ymm0 9156 196,195,45,74,193,0, //vblendvps %ymm0,%ymm9,%ymm10,%ymm0 9157 197,60,89,201, //vmulps %ymm1,%ymm8,%ymm9 9158 197,116,89,209, //vmulps %ymm1,%ymm1,%ymm10 9159 197,36,89,249, //vmulps %ymm1,%ymm11,%ymm15 9160 196,65,28,88,255, //vaddps %ymm15,%ymm12,%ymm15 9161 196,65,44,89,215, //vmulps %ymm15,%ymm10,%ymm10 9162 196,65,12,88,210, //vaddps %ymm10,%ymm14,%ymm10 9163 196,193,116,194,205,1, //vcmpltps %ymm13,%ymm1,%ymm1 9164 196,195,45,74,201,16, //vblendvps %ymm1,%ymm9,%ymm10,%ymm1 9165 197,60,89,194, //vmulps %ymm2,%ymm8,%ymm8 9166 197,108,89,202, //vmulps %ymm2,%ymm2,%ymm9 9167 197,36,89,210, //vmulps %ymm2,%ymm11,%ymm10 9168 196,65,28,88,210, //vaddps %ymm10,%ymm12,%ymm10 9169 196,65,52,89,202, //vmulps %ymm10,%ymm9,%ymm9 9170 196,65,12,88,201, //vaddps %ymm9,%ymm14,%ymm9 9171 196,193,108,194,213,1, //vcmpltps %ymm13,%ymm2,%ymm2 9172 196,195,53,74,208,32, //vblendvps %ymm2,%ymm8,%ymm9,%ymm2 9173 72,173, //lods %ds:(%rsi),%rax 9174 255,224, //jmpq *%rax 9175 }; 9176 9177 CODE const uint8_t sk_to_srgb_avx[] = { 9178 197,124,82,192, //vrsqrtps %ymm0,%ymm8 9179 196,65,124,83,232, //vrcpps %ymm8,%ymm13 9180 196,65,124,82,240, //vrsqrtps %ymm8,%ymm14 9181 184,41,92,71,65, //mov $0x41475c29,%eax 9182 197,121,110,192, //vmovd %eax,%xmm8 9183 196,67,121,4,192,0, //vpermilps $0x0,%xmm8,%xmm8 9184 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8 9185 197,60,89,224, //vmulps %ymm0,%ymm8,%ymm12 9186 184,0,0,128,63, //mov $0x3f800000,%eax 9187 197,121,110,200, //vmovd %eax,%xmm9 9188 196,67,121,4,201,0, //vpermilps $0x0,%xmm9,%xmm9 9189 196,67,53,24,201,1, //vinsertf128 $0x1,%xmm9,%ymm9,%ymm9 9190 184,194,135,210,62, //mov $0x3ed287c2,%eax 9191 197,121,110,208, //vmovd %eax,%xmm10 9192 196,67,121,4,210,0, //vpermilps $0x0,%xmm10,%xmm10 9193 196,67,45,24,210,1, //vinsertf128 $0x1,%xmm10,%ymm10,%ymm10 9194 184,206,111,48,63, //mov $0x3f306fce,%eax 9195 197,121,110,216, //vmovd %eax,%xmm11 9196 196,67,121,4,219,0, //vpermilps $0x0,%xmm11,%xmm11 9197 196,67,37,24,219,1, //vinsertf128 $0x1,%xmm11,%ymm11,%ymm11 9198 184,168,87,202,61, //mov $0x3dca57a8,%eax 9199 53,0,0,0,128, //xor $0x80000000,%eax 9200 197,121,110,248, //vmovd %eax,%xmm15 9201 196,67,121,4,255,0, //vpermilps $0x0,%xmm15,%xmm15 9202 196,67,5,24,255,1, //vinsertf128 $0x1,%xmm15,%ymm15,%ymm15 9203 196,65,20,89,235, //vmulps %ymm11,%ymm13,%ymm13 9204 196,65,20,88,239, //vaddps %ymm15,%ymm13,%ymm13 9205 196,65,12,89,242, //vmulps %ymm10,%ymm14,%ymm14 9206 196,65,12,88,237, //vaddps %ymm13,%ymm14,%ymm13 9207 196,65,52,93,237, //vminps %ymm13,%ymm9,%ymm13 9208 184,4,231,140,59, //mov $0x3b8ce704,%eax 9209 197,121,110,240, //vmovd %eax,%xmm14 9210 196,67,121,4,246,0, //vpermilps $0x0,%xmm14,%xmm14 9211 196,67,13,24,246,1, //vinsertf128 $0x1,%xmm14,%ymm14,%ymm14 9212 196,193,124,194,198,1, //vcmpltps %ymm14,%ymm0,%ymm0 9213 196,195,21,74,196,0, //vblendvps %ymm0,%ymm12,%ymm13,%ymm0 9214 197,124,82,225, //vrsqrtps %ymm1,%ymm12 9215 196,65,124,83,236, //vrcpps %ymm12,%ymm13 9216 196,65,124,82,228, //vrsqrtps %ymm12,%ymm12 9217 196,65,36,89,237, //vmulps %ymm13,%ymm11,%ymm13 9218 196,65,4,88,237, //vaddps %ymm13,%ymm15,%ymm13 9219 196,65,44,89,228, //vmulps %ymm12,%ymm10,%ymm12 9220 196,65,28,88,229, //vaddps %ymm13,%ymm12,%ymm12 9221 197,60,89,233, //vmulps %ymm1,%ymm8,%ymm13 9222 196,65,52,93,228, //vminps %ymm12,%ymm9,%ymm12 9223 196,193,116,194,206,1, //vcmpltps %ymm14,%ymm1,%ymm1 9224 196,195,29,74,205,16, //vblendvps %ymm1,%ymm13,%ymm12,%ymm1 9225 197,124,82,226, //vrsqrtps %ymm2,%ymm12 9226 196,65,124,83,236, //vrcpps %ymm12,%ymm13 9227 196,65,36,89,221, //vmulps %ymm13,%ymm11,%ymm11 9228 196,65,4,88,219, //vaddps %ymm11,%ymm15,%ymm11 9229 196,65,124,82,228, //vrsqrtps %ymm12,%ymm12 9230 196,65,44,89,212, //vmulps %ymm12,%ymm10,%ymm10 9231 196,65,44,88,211, //vaddps %ymm11,%ymm10,%ymm10 9232 196,65,52,93,202, //vminps %ymm10,%ymm9,%ymm9 9233 197,60,89,194, //vmulps %ymm2,%ymm8,%ymm8 9234 196,193,108,194,214,1, //vcmpltps %ymm14,%ymm2,%ymm2 9235 196,195,53,74,208,32, //vblendvps %ymm2,%ymm8,%ymm9,%ymm2 9236 72,173, //lods %ds:(%rsi),%rax 9237 255,224, //jmpq *%rax 9238 }; 9239 9240 CODE const uint8_t sk_scale_1_float_avx[] = { 9241 72,173, //lods %ds:(%rsi),%rax 9242 196,98,125,24,0, //vbroadcastss (%rax),%ymm8 9243 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0 9244 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1 9245 197,188,89,210, //vmulps %ymm2,%ymm8,%ymm2 9246 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3 9247 72,173, //lods %ds:(%rsi),%rax 9248 255,224, //jmpq *%rax 9249 }; 9250 9251 CODE const uint8_t sk_scale_u8_avx[] = { 9252 73,137,200, //mov %rcx,%r8 9253 72,173, //lods %ds:(%rsi),%rax 9254 72,139,0, //mov (%rax),%rax 9255 72,1,248, //add %rdi,%rax 9256 77,133,192, //test %r8,%r8 9257 117,80, //jne 639 <_sk_scale_u8_avx+0x60> 9258 197,122,126,0, //vmovq (%rax),%xmm8 9259 196,66,121,49,200, //vpmovzxbd %xmm8,%xmm9 9260 196,67,121,4,192,229, //vpermilps $0xe5,%xmm8,%xmm8 9261 196,66,121,49,192, //vpmovzxbd %xmm8,%xmm8 9262 196,67,53,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm9,%ymm8 9263 196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8 9264 184,129,128,128,59, //mov $0x3b808081,%eax 9265 197,121,110,200, //vmovd %eax,%xmm9 9266 196,67,121,4,201,0, //vpermilps $0x0,%xmm9,%xmm9 9267 196,67,53,24,201,1, //vinsertf128 $0x1,%xmm9,%ymm9,%ymm9 9268 196,65,60,89,193, //vmulps %ymm9,%ymm8,%ymm8 9269 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0 9270 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1 9271 197,188,89,210, //vmulps %ymm2,%ymm8,%ymm2 9272 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3 9273 72,173, //lods %ds:(%rsi),%rax 9274 76,137,193, //mov %r8,%rcx 9275 255,224, //jmpq *%rax 9276 49,201, //xor %ecx,%ecx 9277 77,137,194, //mov %r8,%r10 9278 69,49,201, //xor %r9d,%r9d 9279 68,15,182,24, //movzbl (%rax),%r11d 9280 72,255,192, //inc %rax 9281 73,211,227, //shl %cl,%r11 9282 77,9,217, //or %r11,%r9 9283 72,131,193,8, //add $0x8,%rcx 9284 73,255,202, //dec %r10 9285 117,234, //jne 641 <_sk_scale_u8_avx+0x68> 9286 196,65,249,110,193, //vmovq %r9,%xmm8 9287 235,143, //jmp 5ed <_sk_scale_u8_avx+0x14> 9288 }; 9289 9290 CODE const uint8_t sk_lerp_1_float_avx[] = { 9291 72,173, //lods %ds:(%rsi),%rax 9292 196,98,125,24,0, //vbroadcastss (%rax),%ymm8 9293 197,252,92,196, //vsubps %ymm4,%ymm0,%ymm0 9294 196,193,124,89,192, //vmulps %ymm8,%ymm0,%ymm0 9295 197,252,88,196, //vaddps %ymm4,%ymm0,%ymm0 9296 197,244,92,205, //vsubps %ymm5,%ymm1,%ymm1 9297 196,193,116,89,200, //vmulps %ymm8,%ymm1,%ymm1 9298 197,244,88,205, //vaddps %ymm5,%ymm1,%ymm1 9299 197,236,92,214, //vsubps %ymm6,%ymm2,%ymm2 9300 196,193,108,89,208, //vmulps %ymm8,%ymm2,%ymm2 9301 197,236,88,214, //vaddps %ymm6,%ymm2,%ymm2 9302 197,228,92,223, //vsubps %ymm7,%ymm3,%ymm3 9303 196,193,100,89,216, //vmulps %ymm8,%ymm3,%ymm3 9304 197,228,88,223, //vaddps %ymm7,%ymm3,%ymm3 9305 72,173, //lods %ds:(%rsi),%rax 9306 255,224, //jmpq *%rax 9307 }; 9308 9309 CODE const uint8_t sk_lerp_u8_avx[] = { 9310 73,137,200, //mov %rcx,%r8 9311 72,173, //lods %ds:(%rsi),%rax 9312 72,139,0, //mov (%rax),%rax 9313 72,1,248, //add %rdi,%rax 9314 77,133,192, //test %r8,%r8 9315 117,116, //jne 721 <_sk_lerp_u8_avx+0x84> 9316 197,122,126,0, //vmovq (%rax),%xmm8 9317 196,66,121,49,200, //vpmovzxbd %xmm8,%xmm9 9318 196,67,121,4,192,229, //vpermilps $0xe5,%xmm8,%xmm8 9319 196,66,121,49,192, //vpmovzxbd %xmm8,%xmm8 9320 196,67,53,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm9,%ymm8 9321 196,65,124,91,192, //vcvtdq2ps %ymm8,%ymm8 9322 184,129,128,128,59, //mov $0x3b808081,%eax 9323 197,121,110,200, //vmovd %eax,%xmm9 9324 196,67,121,4,201,0, //vpermilps $0x0,%xmm9,%xmm9 9325 196,67,53,24,201,1, //vinsertf128 $0x1,%xmm9,%ymm9,%ymm9 9326 196,65,60,89,193, //vmulps %ymm9,%ymm8,%ymm8 9327 197,252,92,196, //vsubps %ymm4,%ymm0,%ymm0 9328 196,193,124,89,192, //vmulps %ymm8,%ymm0,%ymm0 9329 197,252,88,196, //vaddps %ymm4,%ymm0,%ymm0 9330 197,244,92,205, //vsubps %ymm5,%ymm1,%ymm1 9331 196,193,116,89,200, //vmulps %ymm8,%ymm1,%ymm1 9332 197,244,88,205, //vaddps %ymm5,%ymm1,%ymm1 9333 197,236,92,214, //vsubps %ymm6,%ymm2,%ymm2 9334 196,193,108,89,208, //vmulps %ymm8,%ymm2,%ymm2 9335 197,236,88,214, //vaddps %ymm6,%ymm2,%ymm2 9336 197,228,92,223, //vsubps %ymm7,%ymm3,%ymm3 9337 196,193,100,89,216, //vmulps %ymm8,%ymm3,%ymm3 9338 197,228,88,223, //vaddps %ymm7,%ymm3,%ymm3 9339 72,173, //lods %ds:(%rsi),%rax 9340 76,137,193, //mov %r8,%rcx 9341 255,224, //jmpq *%rax 9342 49,201, //xor %ecx,%ecx 9343 77,137,194, //mov %r8,%r10 9344 69,49,201, //xor %r9d,%r9d 9345 68,15,182,24, //movzbl (%rax),%r11d 9346 72,255,192, //inc %rax 9347 73,211,227, //shl %cl,%r11 9348 77,9,217, //or %r11,%r9 9349 72,131,193,8, //add $0x8,%rcx 9350 73,255,202, //dec %r10 9351 117,234, //jne 729 <_sk_lerp_u8_avx+0x8c> 9352 196,65,249,110,193, //vmovq %r9,%xmm8 9353 233,104,255,255,255, //jmpq 6b1 <_sk_lerp_u8_avx+0x14> 9354 }; 9355 9356 CODE const uint8_t sk_lerp_565_avx[] = { 9357 72,173, //lods %ds:(%rsi),%rax 9358 76,139,16, //mov (%rax),%r10 9359 72,133,201, //test %rcx,%rcx 9360 15,133,250,0,0,0, //jne 851 <_sk_lerp_565_avx+0x108> 9361 196,65,122,111,4,122, //vmovdqu (%r10,%rdi,2),%xmm8 9362 197,225,239,219, //vpxor %xmm3,%xmm3,%xmm3 9363 197,185,105,219, //vpunpckhwd %xmm3,%xmm8,%xmm3 9364 196,66,121,51,192, //vpmovzxwd %xmm8,%xmm8 9365 196,99,61,24,195,1, //vinsertf128 $0x1,%xmm3,%ymm8,%ymm8 9366 184,0,248,0,0, //mov $0xf800,%eax 9367 197,249,110,216, //vmovd %eax,%xmm3 9368 197,249,112,219,0, //vpshufd $0x0,%xmm3,%xmm3 9369 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 9370 196,193,100,84,216, //vandps %ymm8,%ymm3,%ymm3 9371 197,124,91,203, //vcvtdq2ps %ymm3,%ymm9 9372 184,8,33,132,55, //mov $0x37842108,%eax 9373 197,249,110,216, //vmovd %eax,%xmm3 9374 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 9375 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 9376 197,52,89,203, //vmulps %ymm3,%ymm9,%ymm9 9377 184,224,7,0,0, //mov $0x7e0,%eax 9378 197,249,110,216, //vmovd %eax,%xmm3 9379 197,249,112,219,0, //vpshufd $0x0,%xmm3,%xmm3 9380 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 9381 196,193,100,84,216, //vandps %ymm8,%ymm3,%ymm3 9382 197,124,91,211, //vcvtdq2ps %ymm3,%ymm10 9383 184,33,8,2,58, //mov $0x3a020821,%eax 9384 197,249,110,216, //vmovd %eax,%xmm3 9385 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 9386 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 9387 197,44,89,211, //vmulps %ymm3,%ymm10,%ymm10 9388 184,31,0,0,0, //mov $0x1f,%eax 9389 197,249,110,216, //vmovd %eax,%xmm3 9390 197,249,112,219,0, //vpshufd $0x0,%xmm3,%xmm3 9391 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 9392 196,193,100,84,216, //vandps %ymm8,%ymm3,%ymm3 9393 197,124,91,195, //vcvtdq2ps %ymm3,%ymm8 9394 184,8,33,4,61, //mov $0x3d042108,%eax 9395 197,249,110,216, //vmovd %eax,%xmm3 9396 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 9397 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 9398 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3 9399 197,252,92,196, //vsubps %ymm4,%ymm0,%ymm0 9400 196,193,124,89,193, //vmulps %ymm9,%ymm0,%ymm0 9401 197,252,88,196, //vaddps %ymm4,%ymm0,%ymm0 9402 197,244,92,205, //vsubps %ymm5,%ymm1,%ymm1 9403 196,193,116,89,202, //vmulps %ymm10,%ymm1,%ymm1 9404 197,244,88,205, //vaddps %ymm5,%ymm1,%ymm1 9405 197,236,92,214, //vsubps %ymm6,%ymm2,%ymm2 9406 197,236,89,211, //vmulps %ymm3,%ymm2,%ymm2 9407 197,236,88,214, //vaddps %ymm6,%ymm2,%ymm2 9408 184,0,0,128,63, //mov $0x3f800000,%eax 9409 197,249,110,216, //vmovd %eax,%xmm3 9410 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 9411 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 9412 72,173, //lods %ds:(%rsi),%rax 9413 255,224, //jmpq *%rax 9414 65,137,200, //mov %ecx,%r8d 9415 65,128,224,7, //and $0x7,%r8b 9416 196,65,57,239,192, //vpxor %xmm8,%xmm8,%xmm8 9417 65,254,200, //dec %r8b 9418 65,128,248,6, //cmp $0x6,%r8b 9419 15,135,243,254,255,255, //ja 75d <_sk_lerp_565_avx+0x14> 9420 69,15,182,192, //movzbl %r8b,%r8d 9421 76,141,13,75,0,0,0, //lea 0x4b(%rip),%r9 # 8c0 <_sk_lerp_565_avx+0x177> 9422 75,99,4,129, //movslq (%r9,%r8,4),%rax 9423 76,1,200, //add %r9,%rax 9424 255,224, //jmpq *%rax 9425 197,225,239,219, //vpxor %xmm3,%xmm3,%xmm3 9426 196,65,97,196,68,122,12,6, //vpinsrw $0x6,0xc(%r10,%rdi,2),%xmm3,%xmm8 9427 196,65,57,196,68,122,10,5, //vpinsrw $0x5,0xa(%r10,%rdi,2),%xmm8,%xmm8 9428 196,65,57,196,68,122,8,4, //vpinsrw $0x4,0x8(%r10,%rdi,2),%xmm8,%xmm8 9429 196,65,57,196,68,122,6,3, //vpinsrw $0x3,0x6(%r10,%rdi,2),%xmm8,%xmm8 9430 196,65,57,196,68,122,4,2, //vpinsrw $0x2,0x4(%r10,%rdi,2),%xmm8,%xmm8 9431 196,65,57,196,68,122,2,1, //vpinsrw $0x1,0x2(%r10,%rdi,2),%xmm8,%xmm8 9432 196,65,57,196,4,122,0, //vpinsrw $0x0,(%r10,%rdi,2),%xmm8,%xmm8 9433 233,159,254,255,255, //jmpq 75d <_sk_lerp_565_avx+0x14> 9434 102,144, //xchg %ax,%ax 9435 242,255, //repnz (bad) 9436 255, //(bad) 9437 255, //(bad) 9438 234, //(bad) 9439 255, //(bad) 9440 255, //(bad) 9441 255,226, //jmpq *%rdx 9442 255, //(bad) 9443 255, //(bad) 9444 255, //(bad) 9445 218,255, //(bad) 9446 255, //(bad) 9447 255,210, //callq *%rdx 9448 255, //(bad) 9449 255, //(bad) 9450 255,202, //dec %edx 9451 255, //(bad) 9452 255, //(bad) 9453 255, //(bad) 9454 190, //.byte 0xbe 9455 255, //(bad) 9456 255, //(bad) 9457 255, //.byte 0xff 9458 }; 9459 9460 CODE const uint8_t sk_load_tables_avx[] = { 9461 85, //push %rbp 9462 65,87, //push %r15 9463 65,86, //push %r14 9464 65,85, //push %r13 9465 65,84, //push %r12 9466 83, //push %rbx 9467 72,173, //lods %ds:(%rsi),%rax 9468 76,139,0, //mov (%rax),%r8 9469 72,133,201, //test %rcx,%rcx 9470 15,133,56,2,0,0, //jne b2c <_sk_load_tables_avx+0x250> 9471 196,65,124,16,4,184, //vmovups (%r8,%rdi,4),%ymm8 9472 187,255,0,0,0, //mov $0xff,%ebx 9473 197,249,110,195, //vmovd %ebx,%xmm0 9474 197,249,112,192,0, //vpshufd $0x0,%xmm0,%xmm0 9475 196,99,125,24,200,1, //vinsertf128 $0x1,%xmm0,%ymm0,%ymm9 9476 196,193,52,84,192, //vandps %ymm8,%ymm9,%ymm0 9477 196,193,249,126,193, //vmovq %xmm0,%r9 9478 69,137,203, //mov %r9d,%r11d 9479 196,195,249,22,194,1, //vpextrq $0x1,%xmm0,%r10 9480 69,137,214, //mov %r10d,%r14d 9481 73,193,234,32, //shr $0x20,%r10 9482 73,193,233,32, //shr $0x20,%r9 9483 196,227,125,25,192,1, //vextractf128 $0x1,%ymm0,%xmm0 9484 196,193,249,126,196, //vmovq %xmm0,%r12 9485 69,137,231, //mov %r12d,%r15d 9486 196,227,249,22,195,1, //vpextrq $0x1,%xmm0,%rbx 9487 65,137,221, //mov %ebx,%r13d 9488 72,193,235,32, //shr $0x20,%rbx 9489 73,193,236,32, //shr $0x20,%r12 9490 72,139,104,8, //mov 0x8(%rax),%rbp 9491 76,139,64,16, //mov 0x10(%rax),%r8 9492 196,161,122,16,68,189,0, //vmovss 0x0(%rbp,%r15,4),%xmm0 9493 196,163,121,33,68,165,0,16, //vinsertps $0x10,0x0(%rbp,%r12,4),%xmm0,%xmm0 9494 196,161,122,16,76,173,0, //vmovss 0x0(%rbp,%r13,4),%xmm1 9495 196,227,121,33,193,32, //vinsertps $0x20,%xmm1,%xmm0,%xmm0 9496 197,250,16,76,157,0, //vmovss 0x0(%rbp,%rbx,4),%xmm1 9497 196,227,121,33,193,48, //vinsertps $0x30,%xmm1,%xmm0,%xmm0 9498 196,161,122,16,76,157,0, //vmovss 0x0(%rbp,%r11,4),%xmm1 9499 196,163,113,33,76,141,0,16, //vinsertps $0x10,0x0(%rbp,%r9,4),%xmm1,%xmm1 9500 196,161,122,16,92,181,0, //vmovss 0x0(%rbp,%r14,4),%xmm3 9501 196,227,113,33,203,32, //vinsertps $0x20,%xmm3,%xmm1,%xmm1 9502 196,161,122,16,92,149,0, //vmovss 0x0(%rbp,%r10,4),%xmm3 9503 196,227,113,33,203,48, //vinsertps $0x30,%xmm3,%xmm1,%xmm1 9504 196,227,117,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm1,%ymm0 9505 196,193,113,114,208,8, //vpsrld $0x8,%xmm8,%xmm1 9506 196,67,125,25,194,1, //vextractf128 $0x1,%ymm8,%xmm10 9507 196,193,105,114,210,8, //vpsrld $0x8,%xmm10,%xmm2 9508 196,227,117,24,202,1, //vinsertf128 $0x1,%xmm2,%ymm1,%ymm1 9509 197,180,84,201, //vandps %ymm1,%ymm9,%ymm1 9510 196,193,249,126,201, //vmovq %xmm1,%r9 9511 69,137,203, //mov %r9d,%r11d 9512 196,195,249,22,202,1, //vpextrq $0x1,%xmm1,%r10 9513 69,137,214, //mov %r10d,%r14d 9514 73,193,234,32, //shr $0x20,%r10 9515 73,193,233,32, //shr $0x20,%r9 9516 196,227,125,25,201,1, //vextractf128 $0x1,%ymm1,%xmm1 9517 196,225,249,126,205, //vmovq %xmm1,%rbp 9518 65,137,239, //mov %ebp,%r15d 9519 196,227,249,22,203,1, //vpextrq $0x1,%xmm1,%rbx 9520 65,137,220, //mov %ebx,%r12d 9521 72,193,235,32, //shr $0x20,%rbx 9522 72,193,237,32, //shr $0x20,%rbp 9523 196,129,122,16,12,184, //vmovss (%r8,%r15,4),%xmm1 9524 196,195,113,33,12,168,16, //vinsertps $0x10,(%r8,%rbp,4),%xmm1,%xmm1 9525 196,129,122,16,20,160, //vmovss (%r8,%r12,4),%xmm2 9526 196,227,113,33,202,32, //vinsertps $0x20,%xmm2,%xmm1,%xmm1 9527 196,193,122,16,20,152, //vmovss (%r8,%rbx,4),%xmm2 9528 196,227,113,33,202,48, //vinsertps $0x30,%xmm2,%xmm1,%xmm1 9529 196,129,122,16,20,152, //vmovss (%r8,%r11,4),%xmm2 9530 196,131,105,33,20,136,16, //vinsertps $0x10,(%r8,%r9,4),%xmm2,%xmm2 9531 196,129,122,16,28,176, //vmovss (%r8,%r14,4),%xmm3 9532 196,227,105,33,211,32, //vinsertps $0x20,%xmm3,%xmm2,%xmm2 9533 196,129,122,16,28,144, //vmovss (%r8,%r10,4),%xmm3 9534 196,227,105,33,211,48, //vinsertps $0x30,%xmm3,%xmm2,%xmm2 9535 196,227,109,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm2,%ymm1 9536 72,139,64,24, //mov 0x18(%rax),%rax 9537 196,193,105,114,208,16, //vpsrld $0x10,%xmm8,%xmm2 9538 196,193,97,114,210,16, //vpsrld $0x10,%xmm10,%xmm3 9539 196,227,109,24,211,1, //vinsertf128 $0x1,%xmm3,%ymm2,%ymm2 9540 197,180,84,210, //vandps %ymm2,%ymm9,%ymm2 9541 196,193,249,126,208, //vmovq %xmm2,%r8 9542 69,137,194, //mov %r8d,%r10d 9543 196,195,249,22,209,1, //vpextrq $0x1,%xmm2,%r9 9544 69,137,203, //mov %r9d,%r11d 9545 73,193,233,32, //shr $0x20,%r9 9546 73,193,232,32, //shr $0x20,%r8 9547 196,227,125,25,210,1, //vextractf128 $0x1,%ymm2,%xmm2 9548 196,225,249,126,213, //vmovq %xmm2,%rbp 9549 65,137,238, //mov %ebp,%r14d 9550 196,227,249,22,211,1, //vpextrq $0x1,%xmm2,%rbx 9551 65,137,223, //mov %ebx,%r15d 9552 72,193,235,32, //shr $0x20,%rbx 9553 72,193,237,32, //shr $0x20,%rbp 9554 196,161,122,16,20,176, //vmovss (%rax,%r14,4),%xmm2 9555 196,227,105,33,20,168,16, //vinsertps $0x10,(%rax,%rbp,4),%xmm2,%xmm2 9556 196,161,122,16,28,184, //vmovss (%rax,%r15,4),%xmm3 9557 196,227,105,33,211,32, //vinsertps $0x20,%xmm3,%xmm2,%xmm2 9558 197,250,16,28,152, //vmovss (%rax,%rbx,4),%xmm3 9559 196,99,105,33,203,48, //vinsertps $0x30,%xmm3,%xmm2,%xmm9 9560 196,161,122,16,28,144, //vmovss (%rax,%r10,4),%xmm3 9561 196,163,97,33,28,128,16, //vinsertps $0x10,(%rax,%r8,4),%xmm3,%xmm3 9562 196,161,122,16,20,152, //vmovss (%rax,%r11,4),%xmm2 9563 196,227,97,33,210,32, //vinsertps $0x20,%xmm2,%xmm3,%xmm2 9564 196,161,122,16,28,136, //vmovss (%rax,%r9,4),%xmm3 9565 196,227,105,33,211,48, //vinsertps $0x30,%xmm3,%xmm2,%xmm2 9566 196,195,109,24,209,1, //vinsertf128 $0x1,%xmm9,%ymm2,%ymm2 9567 196,193,57,114,208,24, //vpsrld $0x18,%xmm8,%xmm8 9568 196,193,97,114,210,24, //vpsrld $0x18,%xmm10,%xmm3 9569 196,227,61,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm8,%ymm3 9570 197,124,91,195, //vcvtdq2ps %ymm3,%ymm8 9571 184,129,128,128,59, //mov $0x3b808081,%eax 9572 197,249,110,216, //vmovd %eax,%xmm3 9573 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 9574 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 9575 197,188,89,219, //vmulps %ymm3,%ymm8,%ymm3 9576 72,173, //lods %ds:(%rsi),%rax 9577 91, //pop %rbx 9578 65,92, //pop %r12 9579 65,93, //pop %r13 9580 65,94, //pop %r14 9581 65,95, //pop %r15 9582 93, //pop %rbp 9583 255,224, //jmpq *%rax 9584 137,203, //mov %ecx,%ebx 9585 128,227,7, //and $0x7,%bl 9586 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8 9587 254,203, //dec %bl 9588 128,251,6, //cmp $0x6,%bl 9589 15,135,185,253,255,255, //ja 8fa <_sk_load_tables_avx+0x1e> 9590 15,182,219, //movzbl %bl,%ebx 9591 76,141,13,137,0,0,0, //lea 0x89(%rip),%r9 # bd4 <_sk_load_tables_avx+0x2f8> 9592 73,99,28,153, //movslq (%r9,%rbx,4),%rbx 9593 76,1,203, //add %r9,%rbx 9594 255,227, //jmpq *%rbx 9595 196,193,121,110,68,184,24, //vmovd 0x18(%r8,%rdi,4),%xmm0 9596 197,249,112,192,68, //vpshufd $0x44,%xmm0,%xmm0 9597 196,227,125,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm0,%ymm0 9598 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1 9599 196,99,117,12,192,64, //vblendps $0x40,%ymm0,%ymm1,%ymm8 9600 196,99,125,25,192,1, //vextractf128 $0x1,%ymm8,%xmm0 9601 196,195,121,34,68,184,20,1, //vpinsrd $0x1,0x14(%r8,%rdi,4),%xmm0,%xmm0 9602 196,99,61,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm8,%ymm8 9603 196,99,125,25,192,1, //vextractf128 $0x1,%ymm8,%xmm0 9604 196,195,121,34,68,184,16,0, //vpinsrd $0x0,0x10(%r8,%rdi,4),%xmm0,%xmm0 9605 196,99,61,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm8,%ymm8 9606 196,195,57,34,68,184,12,3, //vpinsrd $0x3,0xc(%r8,%rdi,4),%xmm8,%xmm0 9607 196,99,61,12,192,15, //vblendps $0xf,%ymm0,%ymm8,%ymm8 9608 196,195,57,34,68,184,8,2, //vpinsrd $0x2,0x8(%r8,%rdi,4),%xmm8,%xmm0 9609 196,99,61,12,192,15, //vblendps $0xf,%ymm0,%ymm8,%ymm8 9610 196,195,57,34,68,184,4,1, //vpinsrd $0x1,0x4(%r8,%rdi,4),%xmm8,%xmm0 9611 196,99,61,12,192,15, //vblendps $0xf,%ymm0,%ymm8,%ymm8 9612 196,195,57,34,4,184,0, //vpinsrd $0x0,(%r8,%rdi,4),%xmm8,%xmm0 9613 196,99,61,12,192,15, //vblendps $0xf,%ymm0,%ymm8,%ymm8 9614 233,38,253,255,255, //jmpq 8fa <_sk_load_tables_avx+0x1e> 9615 238, //out %al,(%dx) 9616 255, //(bad) 9617 255, //(bad) 9618 255,224, //jmpq *%rax 9619 255, //(bad) 9620 255, //(bad) 9621 255,210, //callq *%rdx 9622 255, //(bad) 9623 255, //(bad) 9624 255,196, //inc %esp 9625 255, //(bad) 9626 255, //(bad) 9627 255,176,255,255,255,156, //pushq -0x63000001(%rax) 9628 255, //(bad) 9629 255, //(bad) 9630 255, //.byte 0xff 9631 128,255,255, //cmp $0xff,%bh 9632 255, //.byte 0xff 9633 }; 9634 9635 CODE const uint8_t sk_load_a8_avx[] = { 9636 73,137,200, //mov %rcx,%r8 9637 72,173, //lods %ds:(%rsi),%rax 9638 72,139,0, //mov (%rax),%rax 9639 72,1,248, //add %rdi,%rax 9640 77,133,192, //test %r8,%r8 9641 117,74, //jne c4a <_sk_load_a8_avx+0x5a> 9642 197,250,126,0, //vmovq (%rax),%xmm0 9643 196,226,121,49,200, //vpmovzxbd %xmm0,%xmm1 9644 196,227,121,4,192,229, //vpermilps $0xe5,%xmm0,%xmm0 9645 196,226,121,49,192, //vpmovzxbd %xmm0,%xmm0 9646 196,227,117,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm1,%ymm0 9647 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0 9648 184,129,128,128,59, //mov $0x3b808081,%eax 9649 197,249,110,200, //vmovd %eax,%xmm1 9650 196,227,121,4,201,0, //vpermilps $0x0,%xmm1,%xmm1 9651 196,227,117,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm1,%ymm1 9652 197,252,89,217, //vmulps %ymm1,%ymm0,%ymm3 9653 72,173, //lods %ds:(%rsi),%rax 9654 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0 9655 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1 9656 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2 9657 76,137,193, //mov %r8,%rcx 9658 255,224, //jmpq *%rax 9659 49,201, //xor %ecx,%ecx 9660 77,137,194, //mov %r8,%r10 9661 69,49,201, //xor %r9d,%r9d 9662 68,15,182,24, //movzbl (%rax),%r11d 9663 72,255,192, //inc %rax 9664 73,211,227, //shl %cl,%r11 9665 77,9,217, //or %r11,%r9 9666 72,131,193,8, //add $0x8,%rcx 9667 73,255,202, //dec %r10 9668 117,234, //jne c52 <_sk_load_a8_avx+0x62> 9669 196,193,249,110,193, //vmovq %r9,%xmm0 9670 235,149, //jmp c04 <_sk_load_a8_avx+0x14> 9671 }; 9672 9673 CODE const uint8_t sk_store_a8_avx[] = { 9674 72,173, //lods %ds:(%rsi),%rax 9675 76,139,8, //mov (%rax),%r9 9676 184,0,0,127,67, //mov $0x437f0000,%eax 9677 197,121,110,192, //vmovd %eax,%xmm8 9678 196,67,121,4,192,0, //vpermilps $0x0,%xmm8,%xmm8 9679 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8 9680 197,60,89,195, //vmulps %ymm3,%ymm8,%ymm8 9681 196,65,125,91,192, //vcvtps2dq %ymm8,%ymm8 9682 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9 9683 196,66,57,43,193, //vpackusdw %xmm9,%xmm8,%xmm8 9684 196,65,57,103,192, //vpackuswb %xmm8,%xmm8,%xmm8 9685 72,133,201, //test %rcx,%rcx 9686 117,10, //jne cb1 <_sk_store_a8_avx+0x42> 9687 196,65,123,17,4,57, //vmovsd %xmm8,(%r9,%rdi,1) 9688 72,173, //lods %ds:(%rsi),%rax 9689 255,224, //jmpq *%rax 9690 65,137,200, //mov %ecx,%r8d 9691 65,128,224,7, //and $0x7,%r8b 9692 65,254,200, //dec %r8b 9693 65,128,248,6, //cmp $0x6,%r8b 9694 119,236, //ja cad <_sk_store_a8_avx+0x3e> 9695 196,66,121,48,192, //vpmovzxbw %xmm8,%xmm8 9696 65,15,182,192, //movzbl %r8b,%eax 9697 76,141,5,67,0,0,0, //lea 0x43(%rip),%r8 # d14 <_sk_store_a8_avx+0xa5> 9698 73,99,4,128, //movslq (%r8,%rax,4),%rax 9699 76,1,192, //add %r8,%rax 9700 255,224, //jmpq *%rax 9701 196,67,121,20,68,57,6,12, //vpextrb $0xc,%xmm8,0x6(%r9,%rdi,1) 9702 196,67,121,20,68,57,5,10, //vpextrb $0xa,%xmm8,0x5(%r9,%rdi,1) 9703 196,67,121,20,68,57,4,8, //vpextrb $0x8,%xmm8,0x4(%r9,%rdi,1) 9704 196,67,121,20,68,57,3,6, //vpextrb $0x6,%xmm8,0x3(%r9,%rdi,1) 9705 196,67,121,20,68,57,2,4, //vpextrb $0x4,%xmm8,0x2(%r9,%rdi,1) 9706 196,67,121,20,68,57,1,2, //vpextrb $0x2,%xmm8,0x1(%r9,%rdi,1) 9707 196,67,121,20,4,57,0, //vpextrb $0x0,%xmm8,(%r9,%rdi,1) 9708 235,154, //jmp cad <_sk_store_a8_avx+0x3e> 9709 144, //nop 9710 246,255, //idiv %bh 9711 255, //(bad) 9712 255, //(bad) 9713 238, //out %al,(%dx) 9714 255, //(bad) 9715 255, //(bad) 9716 255,230, //jmpq *%rsi 9717 255, //(bad) 9718 255, //(bad) 9719 255, //(bad) 9720 222,255, //fdivrp %st,%st(7) 9721 255, //(bad) 9722 255,214, //callq *%rsi 9723 255, //(bad) 9724 255, //(bad) 9725 255,206, //dec %esi 9726 255, //(bad) 9727 255, //(bad) 9728 255,198, //inc %esi 9729 255, //(bad) 9730 255, //(bad) 9731 255, //.byte 0xff 9732 }; 9733 9734 CODE const uint8_t sk_load_565_avx[] = { 9735 72,173, //lods %ds:(%rsi),%rax 9736 76,139,16, //mov (%rax),%r10 9737 72,133,201, //test %rcx,%rcx 9738 15,133,209,0,0,0, //jne e0f <_sk_load_565_avx+0xdf> 9739 196,193,122,111,4,122, //vmovdqu (%r10,%rdi,2),%xmm0 9740 197,241,239,201, //vpxor %xmm1,%xmm1,%xmm1 9741 197,249,105,201, //vpunpckhwd %xmm1,%xmm0,%xmm1 9742 196,226,121,51,192, //vpmovzxwd %xmm0,%xmm0 9743 196,227,125,24,209,1, //vinsertf128 $0x1,%xmm1,%ymm0,%ymm2 9744 184,0,248,0,0, //mov $0xf800,%eax 9745 197,249,110,192, //vmovd %eax,%xmm0 9746 197,249,112,192,0, //vpshufd $0x0,%xmm0,%xmm0 9747 196,227,125,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm0,%ymm0 9748 197,252,84,194, //vandps %ymm2,%ymm0,%ymm0 9749 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0 9750 184,8,33,132,55, //mov $0x37842108,%eax 9751 197,249,110,200, //vmovd %eax,%xmm1 9752 196,227,121,4,201,0, //vpermilps $0x0,%xmm1,%xmm1 9753 196,227,117,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm1,%ymm1 9754 197,252,89,193, //vmulps %ymm1,%ymm0,%ymm0 9755 184,224,7,0,0, //mov $0x7e0,%eax 9756 197,249,110,200, //vmovd %eax,%xmm1 9757 197,249,112,201,0, //vpshufd $0x0,%xmm1,%xmm1 9758 196,227,117,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm1,%ymm1 9759 197,244,84,202, //vandps %ymm2,%ymm1,%ymm1 9760 197,252,91,201, //vcvtdq2ps %ymm1,%ymm1 9761 184,33,8,2,58, //mov $0x3a020821,%eax 9762 197,249,110,216, //vmovd %eax,%xmm3 9763 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 9764 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 9765 197,244,89,203, //vmulps %ymm3,%ymm1,%ymm1 9766 184,31,0,0,0, //mov $0x1f,%eax 9767 197,249,110,216, //vmovd %eax,%xmm3 9768 197,249,112,219,0, //vpshufd $0x0,%xmm3,%xmm3 9769 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 9770 197,228,84,210, //vandps %ymm2,%ymm3,%ymm2 9771 197,252,91,210, //vcvtdq2ps %ymm2,%ymm2 9772 184,8,33,4,61, //mov $0x3d042108,%eax 9773 197,249,110,216, //vmovd %eax,%xmm3 9774 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 9775 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 9776 197,236,89,211, //vmulps %ymm3,%ymm2,%ymm2 9777 184,0,0,128,63, //mov $0x3f800000,%eax 9778 197,249,110,216, //vmovd %eax,%xmm3 9779 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 9780 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 9781 72,173, //lods %ds:(%rsi),%rax 9782 255,224, //jmpq *%rax 9783 65,137,200, //mov %ecx,%r8d 9784 65,128,224,7, //and $0x7,%r8b 9785 197,249,239,192, //vpxor %xmm0,%xmm0,%xmm0 9786 65,254,200, //dec %r8b 9787 65,128,248,6, //cmp $0x6,%r8b 9788 15,135,29,255,255,255, //ja d44 <_sk_load_565_avx+0x14> 9789 69,15,182,192, //movzbl %r8b,%r8d 9790 76,141,13,74,0,0,0, //lea 0x4a(%rip),%r9 # e7c <_sk_load_565_avx+0x14c> 9791 75,99,4,129, //movslq (%r9,%r8,4),%rax 9792 76,1,200, //add %r9,%rax 9793 255,224, //jmpq *%rax 9794 197,249,239,192, //vpxor %xmm0,%xmm0,%xmm0 9795 196,193,121,196,68,122,12,6, //vpinsrw $0x6,0xc(%r10,%rdi,2),%xmm0,%xmm0 9796 196,193,121,196,68,122,10,5, //vpinsrw $0x5,0xa(%r10,%rdi,2),%xmm0,%xmm0 9797 196,193,121,196,68,122,8,4, //vpinsrw $0x4,0x8(%r10,%rdi,2),%xmm0,%xmm0 9798 196,193,121,196,68,122,6,3, //vpinsrw $0x3,0x6(%r10,%rdi,2),%xmm0,%xmm0 9799 196,193,121,196,68,122,4,2, //vpinsrw $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0 9800 196,193,121,196,68,122,2,1, //vpinsrw $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0 9801 196,193,121,196,4,122,0, //vpinsrw $0x0,(%r10,%rdi,2),%xmm0,%xmm0 9802 233,201,254,255,255, //jmpq d44 <_sk_load_565_avx+0x14> 9803 144, //nop 9804 243,255, //repz (bad) 9805 255, //(bad) 9806 255, //(bad) 9807 235,255, //jmp e81 <_sk_load_565_avx+0x151> 9808 255, //(bad) 9809 255,227, //jmpq *%rbx 9810 255, //(bad) 9811 255, //(bad) 9812 255, //(bad) 9813 219,255, //(bad) 9814 255, //(bad) 9815 255,211, //callq *%rbx 9816 255, //(bad) 9817 255, //(bad) 9818 255,203, //dec %ebx 9819 255, //(bad) 9820 255, //(bad) 9821 255, //(bad) 9822 191, //.byte 0xbf 9823 255, //(bad) 9824 255, //(bad) 9825 255, //.byte 0xff 9826 }; 9827 9828 CODE const uint8_t sk_store_565_avx[] = { 9829 72,173, //lods %ds:(%rsi),%rax 9830 76,139,8, //mov (%rax),%r9 9831 184,0,0,248,65, //mov $0x41f80000,%eax 9832 197,121,110,192, //vmovd %eax,%xmm8 9833 196,67,121,4,192,0, //vpermilps $0x0,%xmm8,%xmm8 9834 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8 9835 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9 9836 196,65,125,91,201, //vcvtps2dq %ymm9,%ymm9 9837 196,193,41,114,241,11, //vpslld $0xb,%xmm9,%xmm10 9838 196,67,125,25,201,1, //vextractf128 $0x1,%ymm9,%xmm9 9839 196,193,49,114,241,11, //vpslld $0xb,%xmm9,%xmm9 9840 196,67,45,24,201,1, //vinsertf128 $0x1,%xmm9,%ymm10,%ymm9 9841 184,0,0,124,66, //mov $0x427c0000,%eax 9842 197,121,110,208, //vmovd %eax,%xmm10 9843 196,67,121,4,210,0, //vpermilps $0x0,%xmm10,%xmm10 9844 196,67,45,24,210,1, //vinsertf128 $0x1,%xmm10,%ymm10,%ymm10 9845 197,44,89,209, //vmulps %ymm1,%ymm10,%ymm10 9846 196,65,125,91,210, //vcvtps2dq %ymm10,%ymm10 9847 196,193,33,114,242,5, //vpslld $0x5,%xmm10,%xmm11 9848 196,67,125,25,210,1, //vextractf128 $0x1,%ymm10,%xmm10 9849 196,193,41,114,242,5, //vpslld $0x5,%xmm10,%xmm10 9850 196,67,37,24,210,1, //vinsertf128 $0x1,%xmm10,%ymm11,%ymm10 9851 196,65,45,86,201, //vorpd %ymm9,%ymm10,%ymm9 9852 197,60,89,194, //vmulps %ymm2,%ymm8,%ymm8 9853 196,65,125,91,192, //vcvtps2dq %ymm8,%ymm8 9854 196,65,53,86,192, //vorpd %ymm8,%ymm9,%ymm8 9855 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9 9856 196,66,57,43,193, //vpackusdw %xmm9,%xmm8,%xmm8 9857 72,133,201, //test %rcx,%rcx 9858 117,10, //jne f36 <_sk_store_565_avx+0x9e> 9859 196,65,122,127,4,121, //vmovdqu %xmm8,(%r9,%rdi,2) 9860 72,173, //lods %ds:(%rsi),%rax 9861 255,224, //jmpq *%rax 9862 65,137,200, //mov %ecx,%r8d 9863 65,128,224,7, //and $0x7,%r8b 9864 65,254,200, //dec %r8b 9865 65,128,248,6, //cmp $0x6,%r8b 9866 119,236, //ja f32 <_sk_store_565_avx+0x9a> 9867 65,15,182,192, //movzbl %r8b,%eax 9868 76,141,5,67,0,0,0, //lea 0x43(%rip),%r8 # f94 <_sk_store_565_avx+0xfc> 9869 73,99,4,128, //movslq (%r8,%rax,4),%rax 9870 76,1,192, //add %r8,%rax 9871 255,224, //jmpq *%rax 9872 196,67,121,21,68,121,12,6, //vpextrw $0x6,%xmm8,0xc(%r9,%rdi,2) 9873 196,67,121,21,68,121,10,5, //vpextrw $0x5,%xmm8,0xa(%r9,%rdi,2) 9874 196,67,121,21,68,121,8,4, //vpextrw $0x4,%xmm8,0x8(%r9,%rdi,2) 9875 196,67,121,21,68,121,6,3, //vpextrw $0x3,%xmm8,0x6(%r9,%rdi,2) 9876 196,67,121,21,68,121,4,2, //vpextrw $0x2,%xmm8,0x4(%r9,%rdi,2) 9877 196,67,121,21,68,121,2,1, //vpextrw $0x1,%xmm8,0x2(%r9,%rdi,2) 9878 196,67,121,21,4,121,0, //vpextrw $0x0,%xmm8,(%r9,%rdi,2) 9879 235,159, //jmp f32 <_sk_store_565_avx+0x9a> 9880 144, //nop 9881 246,255, //idiv %bh 9882 255, //(bad) 9883 255, //(bad) 9884 238, //out %al,(%dx) 9885 255, //(bad) 9886 255, //(bad) 9887 255,230, //jmpq *%rsi 9888 255, //(bad) 9889 255, //(bad) 9890 255, //(bad) 9891 222,255, //fdivrp %st,%st(7) 9892 255, //(bad) 9893 255,214, //callq *%rsi 9894 255, //(bad) 9895 255, //(bad) 9896 255,206, //dec %esi 9897 255, //(bad) 9898 255, //(bad) 9899 255,198, //inc %esi 9900 255, //(bad) 9901 255, //(bad) 9902 255, //.byte 0xff 9903 }; 9904 9905 CODE const uint8_t sk_load_8888_avx[] = { 9906 72,173, //lods %ds:(%rsi),%rax 9907 76,139,16, //mov (%rax),%r10 9908 72,133,201, //test %rcx,%rcx 9909 15,133,157,0,0,0, //jne 105b <_sk_load_8888_avx+0xab> 9910 196,65,124,16,12,186, //vmovups (%r10,%rdi,4),%ymm9 9911 184,255,0,0,0, //mov $0xff,%eax 9912 197,249,110,192, //vmovd %eax,%xmm0 9913 197,249,112,192,0, //vpshufd $0x0,%xmm0,%xmm0 9914 196,99,125,24,216,1, //vinsertf128 $0x1,%xmm0,%ymm0,%ymm11 9915 196,193,36,84,193, //vandps %ymm9,%ymm11,%ymm0 9916 197,252,91,192, //vcvtdq2ps %ymm0,%ymm0 9917 184,129,128,128,59, //mov $0x3b808081,%eax 9918 197,249,110,200, //vmovd %eax,%xmm1 9919 196,227,121,4,201,0, //vpermilps $0x0,%xmm1,%xmm1 9920 196,99,117,24,193,1, //vinsertf128 $0x1,%xmm1,%ymm1,%ymm8 9921 196,193,124,89,192, //vmulps %ymm8,%ymm0,%ymm0 9922 196,193,41,114,209,8, //vpsrld $0x8,%xmm9,%xmm10 9923 196,99,125,25,203,1, //vextractf128 $0x1,%ymm9,%xmm3 9924 197,241,114,211,8, //vpsrld $0x8,%xmm3,%xmm1 9925 196,227,45,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm10,%ymm1 9926 197,164,84,201, //vandps %ymm1,%ymm11,%ymm1 9927 197,252,91,201, //vcvtdq2ps %ymm1,%ymm1 9928 196,193,116,89,200, //vmulps %ymm8,%ymm1,%ymm1 9929 196,193,41,114,209,16, //vpsrld $0x10,%xmm9,%xmm10 9930 197,233,114,211,16, //vpsrld $0x10,%xmm3,%xmm2 9931 196,227,45,24,210,1, //vinsertf128 $0x1,%xmm2,%ymm10,%ymm2 9932 197,164,84,210, //vandps %ymm2,%ymm11,%ymm2 9933 197,252,91,210, //vcvtdq2ps %ymm2,%ymm2 9934 196,193,108,89,208, //vmulps %ymm8,%ymm2,%ymm2 9935 196,193,49,114,209,24, //vpsrld $0x18,%xmm9,%xmm9 9936 197,225,114,211,24, //vpsrld $0x18,%xmm3,%xmm3 9937 196,227,53,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm9,%ymm3 9938 197,252,91,219, //vcvtdq2ps %ymm3,%ymm3 9939 196,193,100,89,216, //vmulps %ymm8,%ymm3,%ymm3 9940 72,173, //lods %ds:(%rsi),%rax 9941 255,224, //jmpq *%rax 9942 65,137,200, //mov %ecx,%r8d 9943 65,128,224,7, //and $0x7,%r8b 9944 196,65,52,87,201, //vxorps %ymm9,%ymm9,%ymm9 9945 65,254,200, //dec %r8b 9946 65,128,248,6, //cmp $0x6,%r8b 9947 15,135,80,255,255,255, //ja fc4 <_sk_load_8888_avx+0x14> 9948 69,15,182,192, //movzbl %r8b,%r8d 9949 76,141,13,137,0,0,0, //lea 0x89(%rip),%r9 # 1108 <_sk_load_8888_avx+0x158> 9950 75,99,4,129, //movslq (%r9,%r8,4),%rax 9951 76,1,200, //add %r9,%rax 9952 255,224, //jmpq *%rax 9953 196,193,121,110,68,186,24, //vmovd 0x18(%r10,%rdi,4),%xmm0 9954 197,249,112,192,68, //vpshufd $0x44,%xmm0,%xmm0 9955 196,227,125,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm0,%ymm0 9956 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1 9957 196,99,117,12,200,64, //vblendps $0x40,%ymm0,%ymm1,%ymm9 9958 196,99,125,25,200,1, //vextractf128 $0x1,%ymm9,%xmm0 9959 196,195,121,34,68,186,20,1, //vpinsrd $0x1,0x14(%r10,%rdi,4),%xmm0,%xmm0 9960 196,99,53,24,200,1, //vinsertf128 $0x1,%xmm0,%ymm9,%ymm9 9961 196,99,125,25,200,1, //vextractf128 $0x1,%ymm9,%xmm0 9962 196,195,121,34,68,186,16,0, //vpinsrd $0x0,0x10(%r10,%rdi,4),%xmm0,%xmm0 9963 196,99,53,24,200,1, //vinsertf128 $0x1,%xmm0,%ymm9,%ymm9 9964 196,195,49,34,68,186,12,3, //vpinsrd $0x3,0xc(%r10,%rdi,4),%xmm9,%xmm0 9965 196,99,53,12,200,15, //vblendps $0xf,%ymm0,%ymm9,%ymm9 9966 196,195,49,34,68,186,8,2, //vpinsrd $0x2,0x8(%r10,%rdi,4),%xmm9,%xmm0 9967 196,99,53,12,200,15, //vblendps $0xf,%ymm0,%ymm9,%ymm9 9968 196,195,49,34,68,186,4,1, //vpinsrd $0x1,0x4(%r10,%rdi,4),%xmm9,%xmm0 9969 196,99,53,12,200,15, //vblendps $0xf,%ymm0,%ymm9,%ymm9 9970 196,195,49,34,4,186,0, //vpinsrd $0x0,(%r10,%rdi,4),%xmm9,%xmm0 9971 196,99,53,12,200,15, //vblendps $0xf,%ymm0,%ymm9,%ymm9 9972 233,188,254,255,255, //jmpq fc4 <_sk_load_8888_avx+0x14> 9973 238, //out %al,(%dx) 9974 255, //(bad) 9975 255, //(bad) 9976 255,224, //jmpq *%rax 9977 255, //(bad) 9978 255, //(bad) 9979 255,210, //callq *%rdx 9980 255, //(bad) 9981 255, //(bad) 9982 255,196, //inc %esp 9983 255, //(bad) 9984 255, //(bad) 9985 255,176,255,255,255,156, //pushq -0x63000001(%rax) 9986 255, //(bad) 9987 255, //(bad) 9988 255, //.byte 0xff 9989 128,255,255, //cmp $0xff,%bh 9990 255, //.byte 0xff 9991 }; 9992 9993 CODE const uint8_t sk_store_8888_avx[] = { 9994 72,173, //lods %ds:(%rsi),%rax 9995 76,139,8, //mov (%rax),%r9 9996 184,0,0,127,67, //mov $0x437f0000,%eax 9997 197,121,110,192, //vmovd %eax,%xmm8 9998 196,67,121,4,192,0, //vpermilps $0x0,%xmm8,%xmm8 9999 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8 10000 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9 10001 196,65,125,91,201, //vcvtps2dq %ymm9,%ymm9 10002 197,60,89,209, //vmulps %ymm1,%ymm8,%ymm10 10003 196,65,125,91,210, //vcvtps2dq %ymm10,%ymm10 10004 196,193,33,114,242,8, //vpslld $0x8,%xmm10,%xmm11 10005 196,67,125,25,210,1, //vextractf128 $0x1,%ymm10,%xmm10 10006 196,193,41,114,242,8, //vpslld $0x8,%xmm10,%xmm10 10007 196,67,37,24,210,1, //vinsertf128 $0x1,%xmm10,%ymm11,%ymm10 10008 196,65,45,86,201, //vorpd %ymm9,%ymm10,%ymm9 10009 197,60,89,210, //vmulps %ymm2,%ymm8,%ymm10 10010 196,65,125,91,210, //vcvtps2dq %ymm10,%ymm10 10011 196,193,33,114,242,16, //vpslld $0x10,%xmm10,%xmm11 10012 196,67,125,25,210,1, //vextractf128 $0x1,%ymm10,%xmm10 10013 196,193,41,114,242,16, //vpslld $0x10,%xmm10,%xmm10 10014 196,67,37,24,210,1, //vinsertf128 $0x1,%xmm10,%ymm11,%ymm10 10015 197,60,89,195, //vmulps %ymm3,%ymm8,%ymm8 10016 196,65,125,91,192, //vcvtps2dq %ymm8,%ymm8 10017 196,193,33,114,240,24, //vpslld $0x18,%xmm8,%xmm11 10018 196,67,125,25,192,1, //vextractf128 $0x1,%ymm8,%xmm8 10019 196,193,57,114,240,24, //vpslld $0x18,%xmm8,%xmm8 10020 196,67,37,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm11,%ymm8 10021 196,65,45,86,192, //vorpd %ymm8,%ymm10,%ymm8 10022 196,65,53,86,192, //vorpd %ymm8,%ymm9,%ymm8 10023 72,133,201, //test %rcx,%rcx 10024 117,10, //jne 11c8 <_sk_store_8888_avx+0xa4> 10025 196,65,124,17,4,185, //vmovups %ymm8,(%r9,%rdi,4) 10026 72,173, //lods %ds:(%rsi),%rax 10027 255,224, //jmpq *%rax 10028 65,137,200, //mov %ecx,%r8d 10029 65,128,224,7, //and $0x7,%r8b 10030 65,254,200, //dec %r8b 10031 65,128,248,6, //cmp $0x6,%r8b 10032 119,236, //ja 11c4 <_sk_store_8888_avx+0xa0> 10033 65,15,182,192, //movzbl %r8b,%eax 10034 76,141,5,85,0,0,0, //lea 0x55(%rip),%r8 # 1238 <_sk_store_8888_avx+0x114> 10035 73,99,4,128, //movslq (%r8,%rax,4),%rax 10036 76,1,192, //add %r8,%rax 10037 255,224, //jmpq *%rax 10038 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9 10039 196,67,121,22,76,185,24,2, //vpextrd $0x2,%xmm9,0x18(%r9,%rdi,4) 10040 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9 10041 196,67,121,22,76,185,20,1, //vpextrd $0x1,%xmm9,0x14(%r9,%rdi,4) 10042 196,67,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm9 10043 196,65,122,17,76,185,16, //vmovss %xmm9,0x10(%r9,%rdi,4) 10044 196,67,121,22,68,185,12,3, //vpextrd $0x3,%xmm8,0xc(%r9,%rdi,4) 10045 196,67,121,22,68,185,8,2, //vpextrd $0x2,%xmm8,0x8(%r9,%rdi,4) 10046 196,67,121,22,68,185,4,1, //vpextrd $0x1,%xmm8,0x4(%r9,%rdi,4) 10047 196,65,121,126,4,185, //vmovd %xmm8,(%r9,%rdi,4) 10048 235,143, //jmp 11c4 <_sk_store_8888_avx+0xa0> 10049 15,31,0, //nopl (%rax) 10050 245, //cmc 10051 255, //(bad) 10052 255, //(bad) 10053 255, //(bad) 10054 237, //in (%dx),%eax 10055 255, //(bad) 10056 255, //(bad) 10057 255,229, //jmpq *%rbp 10058 255, //(bad) 10059 255, //(bad) 10060 255, //(bad) 10061 221,255, //(bad) 10062 255, //(bad) 10063 255,208, //callq *%rax 10064 255, //(bad) 10065 255, //(bad) 10066 255,194, //inc %edx 10067 255, //(bad) 10068 255, //(bad) 10069 255, //.byte 0xff 10070 180,255, //mov $0xff,%ah 10071 255, //(bad) 10072 255, //.byte 0xff 10073 }; 10074 10075 CODE const uint8_t sk_load_f16_avx[] = { 10076 72,173, //lods %ds:(%rsi),%rax 10077 72,139,0, //mov (%rax),%rax 10078 72,133,201, //test %rcx,%rcx 10079 15,133,2,1,0,0, //jne 1364 <_sk_load_f16_avx+0x110> 10080 197,121,16,4,248, //vmovupd (%rax,%rdi,8),%xmm8 10081 197,249,16,84,248,16, //vmovupd 0x10(%rax,%rdi,8),%xmm2 10082 197,249,16,92,248,32, //vmovupd 0x20(%rax,%rdi,8),%xmm3 10083 197,122,111,76,248,48, //vmovdqu 0x30(%rax,%rdi,8),%xmm9 10084 197,185,97,194, //vpunpcklwd %xmm2,%xmm8,%xmm0 10085 197,185,105,210, //vpunpckhwd %xmm2,%xmm8,%xmm2 10086 196,193,97,97,201, //vpunpcklwd %xmm9,%xmm3,%xmm1 10087 196,193,97,105,217, //vpunpckhwd %xmm9,%xmm3,%xmm3 10088 197,121,97,194, //vpunpcklwd %xmm2,%xmm0,%xmm8 10089 197,249,105,194, //vpunpckhwd %xmm2,%xmm0,%xmm0 10090 197,241,97,211, //vpunpcklwd %xmm3,%xmm1,%xmm2 10091 197,113,105,203, //vpunpckhwd %xmm3,%xmm1,%xmm9 10092 184,0,4,0,4, //mov $0x4000400,%eax 10093 197,249,110,216, //vmovd %eax,%xmm3 10094 197,249,112,219,0, //vpshufd $0x0,%xmm3,%xmm3 10095 196,193,97,101,200, //vpcmpgtw %xmm8,%xmm3,%xmm1 10096 196,65,113,223,192, //vpandn %xmm8,%xmm1,%xmm8 10097 197,225,101,200, //vpcmpgtw %xmm0,%xmm3,%xmm1 10098 197,241,223,192, //vpandn %xmm0,%xmm1,%xmm0 10099 197,225,101,202, //vpcmpgtw %xmm2,%xmm3,%xmm1 10100 197,241,223,202, //vpandn %xmm2,%xmm1,%xmm1 10101 196,193,97,101,209, //vpcmpgtw %xmm9,%xmm3,%xmm2 10102 196,193,105,223,209, //vpandn %xmm9,%xmm2,%xmm2 10103 196,66,121,51,208, //vpmovzxwd %xmm8,%xmm10 10104 196,98,121,51,201, //vpmovzxwd %xmm1,%xmm9 10105 197,225,239,219, //vpxor %xmm3,%xmm3,%xmm3 10106 197,57,105,195, //vpunpckhwd %xmm3,%xmm8,%xmm8 10107 197,241,105,203, //vpunpckhwd %xmm3,%xmm1,%xmm1 10108 196,98,121,51,216, //vpmovzxwd %xmm0,%xmm11 10109 196,98,121,51,226, //vpmovzxwd %xmm2,%xmm12 10110 197,121,105,235, //vpunpckhwd %xmm3,%xmm0,%xmm13 10111 197,105,105,243, //vpunpckhwd %xmm3,%xmm2,%xmm14 10112 196,193,121,114,242,13, //vpslld $0xd,%xmm10,%xmm0 10113 196,193,105,114,241,13, //vpslld $0xd,%xmm9,%xmm2 10114 196,227,125,24,194,1, //vinsertf128 $0x1,%xmm2,%ymm0,%ymm0 10115 184,0,0,128,119, //mov $0x77800000,%eax 10116 197,249,110,208, //vmovd %eax,%xmm2 10117 197,249,112,210,0, //vpshufd $0x0,%xmm2,%xmm2 10118 196,99,109,24,202,1, //vinsertf128 $0x1,%xmm2,%ymm2,%ymm9 10119 197,180,89,192, //vmulps %ymm0,%ymm9,%ymm0 10120 196,193,105,114,240,13, //vpslld $0xd,%xmm8,%xmm2 10121 197,241,114,241,13, //vpslld $0xd,%xmm1,%xmm1 10122 196,227,109,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm2,%ymm1 10123 197,180,89,201, //vmulps %ymm1,%ymm9,%ymm1 10124 196,193,105,114,243,13, //vpslld $0xd,%xmm11,%xmm2 10125 196,193,97,114,244,13, //vpslld $0xd,%xmm12,%xmm3 10126 196,227,109,24,211,1, //vinsertf128 $0x1,%xmm3,%ymm2,%ymm2 10127 197,180,89,210, //vmulps %ymm2,%ymm9,%ymm2 10128 196,193,57,114,245,13, //vpslld $0xd,%xmm13,%xmm8 10129 196,193,97,114,246,13, //vpslld $0xd,%xmm14,%xmm3 10130 196,227,61,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm8,%ymm3 10131 197,180,89,219, //vmulps %ymm3,%ymm9,%ymm3 10132 72,173, //lods %ds:(%rsi),%rax 10133 255,224, //jmpq *%rax 10134 197,123,16,4,248, //vmovsd (%rax,%rdi,8),%xmm8 10135 196,65,49,239,201, //vpxor %xmm9,%xmm9,%xmm9 10136 72,131,249,1, //cmp $0x1,%rcx 10137 116,79, //je 13c3 <_sk_load_f16_avx+0x16f> 10138 197,57,22,68,248,8, //vmovhpd 0x8(%rax,%rdi,8),%xmm8,%xmm8 10139 72,131,249,3, //cmp $0x3,%rcx 10140 114,67, //jb 13c3 <_sk_load_f16_avx+0x16f> 10141 197,251,16,84,248,16, //vmovsd 0x10(%rax,%rdi,8),%xmm2 10142 72,131,249,3, //cmp $0x3,%rcx 10143 116,68, //je 13d0 <_sk_load_f16_avx+0x17c> 10144 197,233,22,84,248,24, //vmovhpd 0x18(%rax,%rdi,8),%xmm2,%xmm2 10145 72,131,249,5, //cmp $0x5,%rcx 10146 114,56, //jb 13d0 <_sk_load_f16_avx+0x17c> 10147 197,251,16,92,248,32, //vmovsd 0x20(%rax,%rdi,8),%xmm3 10148 72,131,249,5, //cmp $0x5,%rcx 10149 15,132,209,254,255,255, //je 1279 <_sk_load_f16_avx+0x25> 10150 197,225,22,92,248,40, //vmovhpd 0x28(%rax,%rdi,8),%xmm3,%xmm3 10151 72,131,249,7, //cmp $0x7,%rcx 10152 15,130,193,254,255,255, //jb 1279 <_sk_load_f16_avx+0x25> 10153 197,122,126,76,248,48, //vmovq 0x30(%rax,%rdi,8),%xmm9 10154 233,182,254,255,255, //jmpq 1279 <_sk_load_f16_avx+0x25> 10155 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3 10156 197,233,87,210, //vxorpd %xmm2,%xmm2,%xmm2 10157 233,169,254,255,255, //jmpq 1279 <_sk_load_f16_avx+0x25> 10158 197,225,87,219, //vxorpd %xmm3,%xmm3,%xmm3 10159 233,160,254,255,255, //jmpq 1279 <_sk_load_f16_avx+0x25> 10160 }; 10161 10162 CODE const uint8_t sk_store_f16_avx[] = { 10163 72,173, //lods %ds:(%rsi),%rax 10164 76,139,0, //mov (%rax),%r8 10165 184,0,0,128,7, //mov $0x7800000,%eax 10166 197,121,110,192, //vmovd %eax,%xmm8 10167 196,65,121,112,192,0, //vpshufd $0x0,%xmm8,%xmm8 10168 196,67,61,24,192,1, //vinsertf128 $0x1,%xmm8,%ymm8,%ymm8 10169 197,60,89,200, //vmulps %ymm0,%ymm8,%ymm9 10170 196,67,125,25,202,1, //vextractf128 $0x1,%ymm9,%xmm10 10171 196,193,41,114,210,13, //vpsrld $0xd,%xmm10,%xmm10 10172 196,193,49,114,209,13, //vpsrld $0xd,%xmm9,%xmm9 10173 197,60,89,217, //vmulps %ymm1,%ymm8,%ymm11 10174 196,67,125,25,220,1, //vextractf128 $0x1,%ymm11,%xmm12 10175 196,193,25,114,212,13, //vpsrld $0xd,%xmm12,%xmm12 10176 196,193,33,114,211,13, //vpsrld $0xd,%xmm11,%xmm11 10177 197,60,89,234, //vmulps %ymm2,%ymm8,%ymm13 10178 196,67,125,25,238,1, //vextractf128 $0x1,%ymm13,%xmm14 10179 196,193,9,114,214,13, //vpsrld $0xd,%xmm14,%xmm14 10180 196,193,17,114,213,13, //vpsrld $0xd,%xmm13,%xmm13 10181 197,60,89,195, //vmulps %ymm3,%ymm8,%ymm8 10182 196,67,125,25,199,1, //vextractf128 $0x1,%ymm8,%xmm15 10183 196,193,1,114,215,13, //vpsrld $0xd,%xmm15,%xmm15 10184 196,193,57,114,208,13, //vpsrld $0xd,%xmm8,%xmm8 10185 196,193,33,115,251,2, //vpslldq $0x2,%xmm11,%xmm11 10186 196,65,33,235,201, //vpor %xmm9,%xmm11,%xmm9 10187 196,193,33,115,252,2, //vpslldq $0x2,%xmm12,%xmm11 10188 196,65,33,235,226, //vpor %xmm10,%xmm11,%xmm12 10189 196,193,57,115,248,2, //vpslldq $0x2,%xmm8,%xmm8 10190 196,65,57,235,197, //vpor %xmm13,%xmm8,%xmm8 10191 196,193,41,115,255,2, //vpslldq $0x2,%xmm15,%xmm10 10192 196,65,41,235,238, //vpor %xmm14,%xmm10,%xmm13 10193 196,65,49,98,216, //vpunpckldq %xmm8,%xmm9,%xmm11 10194 196,65,49,106,208, //vpunpckhdq %xmm8,%xmm9,%xmm10 10195 196,65,25,98,205, //vpunpckldq %xmm13,%xmm12,%xmm9 10196 196,65,25,106,197, //vpunpckhdq %xmm13,%xmm12,%xmm8 10197 72,133,201, //test %rcx,%rcx 10198 117,31, //jne 14af <_sk_store_f16_avx+0xd6> 10199 196,65,120,17,28,248, //vmovups %xmm11,(%r8,%rdi,8) 10200 196,65,120,17,84,248,16, //vmovups %xmm10,0x10(%r8,%rdi,8) 10201 196,65,120,17,76,248,32, //vmovups %xmm9,0x20(%r8,%rdi,8) 10202 196,65,122,127,68,248,48, //vmovdqu %xmm8,0x30(%r8,%rdi,8) 10203 72,173, //lods %ds:(%rsi),%rax 10204 255,224, //jmpq *%rax 10205 196,65,121,214,28,248, //vmovq %xmm11,(%r8,%rdi,8) 10206 72,131,249,1, //cmp $0x1,%rcx 10207 116,240, //je 14ab <_sk_store_f16_avx+0xd2> 10208 196,65,121,23,92,248,8, //vmovhpd %xmm11,0x8(%r8,%rdi,8) 10209 72,131,249,3, //cmp $0x3,%rcx 10210 114,227, //jb 14ab <_sk_store_f16_avx+0xd2> 10211 196,65,121,214,84,248,16, //vmovq %xmm10,0x10(%r8,%rdi,8) 10212 116,218, //je 14ab <_sk_store_f16_avx+0xd2> 10213 196,65,121,23,84,248,24, //vmovhpd %xmm10,0x18(%r8,%rdi,8) 10214 72,131,249,5, //cmp $0x5,%rcx 10215 114,205, //jb 14ab <_sk_store_f16_avx+0xd2> 10216 196,65,121,214,76,248,32, //vmovq %xmm9,0x20(%r8,%rdi,8) 10217 116,196, //je 14ab <_sk_store_f16_avx+0xd2> 10218 196,65,121,23,76,248,40, //vmovhpd %xmm9,0x28(%r8,%rdi,8) 10219 72,131,249,7, //cmp $0x7,%rcx 10220 114,183, //jb 14ab <_sk_store_f16_avx+0xd2> 10221 196,65,121,214,68,248,48, //vmovq %xmm8,0x30(%r8,%rdi,8) 10222 235,174, //jmp 14ab <_sk_store_f16_avx+0xd2> 10223 }; 10224 10225 CODE const uint8_t sk_store_f32_avx[] = { 10226 72,173, //lods %ds:(%rsi),%rax 10227 76,139,0, //mov (%rax),%r8 10228 72,141,4,189,0,0,0,0, //lea 0x0(,%rdi,4),%rax 10229 197,124,20,193, //vunpcklps %ymm1,%ymm0,%ymm8 10230 197,124,21,217, //vunpckhps %ymm1,%ymm0,%ymm11 10231 197,108,20,203, //vunpcklps %ymm3,%ymm2,%ymm9 10232 197,108,21,227, //vunpckhps %ymm3,%ymm2,%ymm12 10233 196,65,61,20,209, //vunpcklpd %ymm9,%ymm8,%ymm10 10234 196,65,61,21,201, //vunpckhpd %ymm9,%ymm8,%ymm9 10235 196,65,37,20,196, //vunpcklpd %ymm12,%ymm11,%ymm8 10236 196,65,37,21,220, //vunpckhpd %ymm12,%ymm11,%ymm11 10237 72,133,201, //test %rcx,%rcx 10238 117,55, //jne 156a <_sk_store_f32_avx+0x6d> 10239 196,67,45,24,225,1, //vinsertf128 $0x1,%xmm9,%ymm10,%ymm12 10240 196,67,61,24,235,1, //vinsertf128 $0x1,%xmm11,%ymm8,%ymm13 10241 196,67,45,6,201,49, //vperm2f128 $0x31,%ymm9,%ymm10,%ymm9 10242 196,67,61,6,195,49, //vperm2f128 $0x31,%ymm11,%ymm8,%ymm8 10243 196,65,125,17,36,128, //vmovupd %ymm12,(%r8,%rax,4) 10244 196,65,125,17,108,128,32, //vmovupd %ymm13,0x20(%r8,%rax,4) 10245 196,65,125,17,76,128,64, //vmovupd %ymm9,0x40(%r8,%rax,4) 10246 196,65,125,17,68,128,96, //vmovupd %ymm8,0x60(%r8,%rax,4) 10247 72,173, //lods %ds:(%rsi),%rax 10248 255,224, //jmpq *%rax 10249 196,65,121,17,20,128, //vmovupd %xmm10,(%r8,%rax,4) 10250 72,131,249,1, //cmp $0x1,%rcx 10251 116,240, //je 1566 <_sk_store_f32_avx+0x69> 10252 196,65,121,17,76,128,16, //vmovupd %xmm9,0x10(%r8,%rax,4) 10253 72,131,249,3, //cmp $0x3,%rcx 10254 114,227, //jb 1566 <_sk_store_f32_avx+0x69> 10255 196,65,121,17,68,128,32, //vmovupd %xmm8,0x20(%r8,%rax,4) 10256 116,218, //je 1566 <_sk_store_f32_avx+0x69> 10257 196,65,121,17,92,128,48, //vmovupd %xmm11,0x30(%r8,%rax,4) 10258 72,131,249,5, //cmp $0x5,%rcx 10259 114,205, //jb 1566 <_sk_store_f32_avx+0x69> 10260 196,67,125,25,84,128,64,1, //vextractf128 $0x1,%ymm10,0x40(%r8,%rax,4) 10261 116,195, //je 1566 <_sk_store_f32_avx+0x69> 10262 196,67,125,25,76,128,80,1, //vextractf128 $0x1,%ymm9,0x50(%r8,%rax,4) 10263 72,131,249,7, //cmp $0x7,%rcx 10264 114,181, //jb 1566 <_sk_store_f32_avx+0x69> 10265 196,67,125,25,68,128,96,1, //vextractf128 $0x1,%ymm8,0x60(%r8,%rax,4) 10266 235,171, //jmp 1566 <_sk_store_f32_avx+0x69> 10267 }; 10268 10269 CODE const uint8_t sk_clamp_x_avx[] = { 10270 72,173, //lods %ds:(%rsi),%rax 10271 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8 10272 197,60,95,200, //vmaxps %ymm0,%ymm8,%ymm9 10273 196,98,125,24,0, //vbroadcastss (%rax),%ymm8 10274 196,99,125,25,192,1, //vextractf128 $0x1,%ymm8,%xmm0 10275 196,65,41,118,210, //vpcmpeqd %xmm10,%xmm10,%xmm10 10276 196,193,121,254,194, //vpaddd %xmm10,%xmm0,%xmm0 10277 196,65,57,254,194, //vpaddd %xmm10,%xmm8,%xmm8 10278 196,227,61,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm8,%ymm0 10279 197,180,93,192, //vminps %ymm0,%ymm9,%ymm0 10280 72,173, //lods %ds:(%rsi),%rax 10281 255,224, //jmpq *%rax 10282 }; 10283 10284 CODE const uint8_t sk_clamp_y_avx[] = { 10285 72,173, //lods %ds:(%rsi),%rax 10286 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8 10287 197,60,95,201, //vmaxps %ymm1,%ymm8,%ymm9 10288 196,98,125,24,0, //vbroadcastss (%rax),%ymm8 10289 196,99,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm1 10290 196,65,41,118,210, //vpcmpeqd %xmm10,%xmm10,%xmm10 10291 196,193,113,254,202, //vpaddd %xmm10,%xmm1,%xmm1 10292 196,65,57,254,194, //vpaddd %xmm10,%xmm8,%xmm8 10293 196,227,61,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm8,%ymm1 10294 197,180,93,201, //vminps %ymm1,%ymm9,%ymm1 10295 72,173, //lods %ds:(%rsi),%rax 10296 255,224, //jmpq *%rax 10297 }; 10298 10299 CODE const uint8_t sk_repeat_x_avx[] = { 10300 72,173, //lods %ds:(%rsi),%rax 10301 196,98,125,24,0, //vbroadcastss (%rax),%ymm8 10302 196,65,124,94,200, //vdivps %ymm8,%ymm0,%ymm9 10303 196,67,125,8,201,1, //vroundps $0x1,%ymm9,%ymm9 10304 196,65,52,89,200, //vmulps %ymm8,%ymm9,%ymm9 10305 196,65,124,92,201, //vsubps %ymm9,%ymm0,%ymm9 10306 196,99,125,25,192,1, //vextractf128 $0x1,%ymm8,%xmm0 10307 196,65,41,118,210, //vpcmpeqd %xmm10,%xmm10,%xmm10 10308 196,193,121,254,194, //vpaddd %xmm10,%xmm0,%xmm0 10309 196,65,57,254,194, //vpaddd %xmm10,%xmm8,%xmm8 10310 196,227,61,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm8,%ymm0 10311 197,180,93,192, //vminps %ymm0,%ymm9,%ymm0 10312 72,173, //lods %ds:(%rsi),%rax 10313 255,224, //jmpq *%rax 10314 }; 10315 10316 CODE const uint8_t sk_repeat_y_avx[] = { 10317 72,173, //lods %ds:(%rsi),%rax 10318 196,98,125,24,0, //vbroadcastss (%rax),%ymm8 10319 196,65,116,94,200, //vdivps %ymm8,%ymm1,%ymm9 10320 196,67,125,8,201,1, //vroundps $0x1,%ymm9,%ymm9 10321 196,65,52,89,200, //vmulps %ymm8,%ymm9,%ymm9 10322 196,65,116,92,201, //vsubps %ymm9,%ymm1,%ymm9 10323 196,99,125,25,193,1, //vextractf128 $0x1,%ymm8,%xmm1 10324 196,65,41,118,210, //vpcmpeqd %xmm10,%xmm10,%xmm10 10325 196,193,113,254,202, //vpaddd %xmm10,%xmm1,%xmm1 10326 196,65,57,254,194, //vpaddd %xmm10,%xmm8,%xmm8 10327 196,227,61,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm8,%ymm1 10328 197,180,93,201, //vminps %ymm1,%ymm9,%ymm1 10329 72,173, //lods %ds:(%rsi),%rax 10330 255,224, //jmpq *%rax 10331 }; 10332 10333 CODE const uint8_t sk_mirror_x_avx[] = { 10334 72,173, //lods %ds:(%rsi),%rax 10335 197,121,110,0, //vmovd (%rax),%xmm8 10336 196,65,121,112,200,0, //vpshufd $0x0,%xmm8,%xmm9 10337 196,67,53,24,201,1, //vinsertf128 $0x1,%xmm9,%ymm9,%ymm9 10338 196,65,124,92,209, //vsubps %ymm9,%ymm0,%ymm10 10339 196,193,58,88,192, //vaddss %xmm8,%xmm8,%xmm0 10340 196,227,121,4,192,0, //vpermilps $0x0,%xmm0,%xmm0 10341 196,227,125,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm0,%ymm0 10342 197,44,94,192, //vdivps %ymm0,%ymm10,%ymm8 10343 196,67,125,8,192,1, //vroundps $0x1,%ymm8,%ymm8 10344 197,188,89,192, //vmulps %ymm0,%ymm8,%ymm0 10345 197,172,92,192, //vsubps %ymm0,%ymm10,%ymm0 10346 196,193,124,92,193, //vsubps %ymm9,%ymm0,%ymm0 10347 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8 10348 197,60,92,192, //vsubps %ymm0,%ymm8,%ymm8 10349 197,60,84,192, //vandps %ymm0,%ymm8,%ymm8 10350 196,99,125,25,200,1, //vextractf128 $0x1,%ymm9,%xmm0 10351 196,65,41,118,210, //vpcmpeqd %xmm10,%xmm10,%xmm10 10352 196,193,121,254,194, //vpaddd %xmm10,%xmm0,%xmm0 10353 196,65,49,254,202, //vpaddd %xmm10,%xmm9,%xmm9 10354 196,227,53,24,192,1, //vinsertf128 $0x1,%xmm0,%ymm9,%ymm0 10355 197,188,93,192, //vminps %ymm0,%ymm8,%ymm0 10356 72,173, //lods %ds:(%rsi),%rax 10357 255,224, //jmpq *%rax 10358 }; 10359 10360 CODE const uint8_t sk_mirror_y_avx[] = { 10361 72,173, //lods %ds:(%rsi),%rax 10362 197,121,110,0, //vmovd (%rax),%xmm8 10363 196,65,121,112,200,0, //vpshufd $0x0,%xmm8,%xmm9 10364 196,67,53,24,201,1, //vinsertf128 $0x1,%xmm9,%ymm9,%ymm9 10365 196,65,116,92,209, //vsubps %ymm9,%ymm1,%ymm10 10366 196,193,58,88,200, //vaddss %xmm8,%xmm8,%xmm1 10367 196,227,121,4,201,0, //vpermilps $0x0,%xmm1,%xmm1 10368 196,227,117,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm1,%ymm1 10369 197,44,94,193, //vdivps %ymm1,%ymm10,%ymm8 10370 196,67,125,8,192,1, //vroundps $0x1,%ymm8,%ymm8 10371 197,188,89,201, //vmulps %ymm1,%ymm8,%ymm1 10372 197,172,92,201, //vsubps %ymm1,%ymm10,%ymm1 10373 196,193,116,92,201, //vsubps %ymm9,%ymm1,%ymm1 10374 196,65,60,87,192, //vxorps %ymm8,%ymm8,%ymm8 10375 197,60,92,193, //vsubps %ymm1,%ymm8,%ymm8 10376 197,60,84,193, //vandps %ymm1,%ymm8,%ymm8 10377 196,99,125,25,201,1, //vextractf128 $0x1,%ymm9,%xmm1 10378 196,65,41,118,210, //vpcmpeqd %xmm10,%xmm10,%xmm10 10379 196,193,113,254,202, //vpaddd %xmm10,%xmm1,%xmm1 10380 196,65,49,254,202, //vpaddd %xmm10,%xmm9,%xmm9 10381 196,227,53,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm9,%ymm1 10382 197,188,93,201, //vminps %ymm1,%ymm8,%ymm1 10383 72,173, //lods %ds:(%rsi),%rax 10384 255,224, //jmpq *%rax 10385 }; 10386 10387 CODE const uint8_t sk_luminance_to_alpha_avx[] = { 10388 184,208,179,89,62, //mov $0x3e59b3d0,%eax 10389 197,249,110,216, //vmovd %eax,%xmm3 10390 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 10391 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 10392 197,228,89,192, //vmulps %ymm0,%ymm3,%ymm0 10393 184,89,23,55,63, //mov $0x3f371759,%eax 10394 197,249,110,216, //vmovd %eax,%xmm3 10395 196,227,121,4,219,0, //vpermilps $0x0,%xmm3,%xmm3 10396 196,227,101,24,219,1, //vinsertf128 $0x1,%xmm3,%ymm3,%ymm3 10397 197,228,89,201, //vmulps %ymm1,%ymm3,%ymm1 10398 197,252,88,193, //vaddps %ymm1,%ymm0,%ymm0 10399 184,152,221,147,61, //mov $0x3d93dd98,%eax 10400 197,249,110,200, //vmovd %eax,%xmm1 10401 196,227,121,4,201,0, //vpermilps $0x0,%xmm1,%xmm1 10402 196,227,117,24,201,1, //vinsertf128 $0x1,%xmm1,%ymm1,%ymm1 10403 197,244,89,202, //vmulps %ymm2,%ymm1,%ymm1 10404 197,252,88,217, //vaddps %ymm1,%ymm0,%ymm3 10405 72,173, //lods %ds:(%rsi),%rax 10406 197,252,87,192, //vxorps %ymm0,%ymm0,%ymm0 10407 197,244,87,201, //vxorps %ymm1,%ymm1,%ymm1 10408 197,236,87,210, //vxorps %ymm2,%ymm2,%ymm2 10409 255,224, //jmpq *%rax 10410 }; 10411 10412 CODE const uint8_t sk_matrix_2x3_avx[] = { 10413 72,173, //lods %ds:(%rsi),%rax 10414 196,98,125,24,0, //vbroadcastss (%rax),%ymm8 10415 196,98,125,24,72,8, //vbroadcastss 0x8(%rax),%ymm9 10416 196,98,125,24,80,16, //vbroadcastss 0x10(%rax),%ymm10 10417 197,52,89,201, //vmulps %ymm1,%ymm9,%ymm9 10418 196,65,52,88,202, //vaddps %ymm10,%ymm9,%ymm9 10419 197,60,89,192, //vmulps %ymm0,%ymm8,%ymm8 10420 196,65,60,88,193, //vaddps %ymm9,%ymm8,%ymm8 10421 196,98,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm9 10422 196,98,125,24,80,12, //vbroadcastss 0xc(%rax),%ymm10 10423 196,98,125,24,88,20, //vbroadcastss 0x14(%rax),%ymm11 10424 197,172,89,201, //vmulps %ymm1,%ymm10,%ymm1 10425 196,193,116,88,203, //vaddps %ymm11,%ymm1,%ymm1 10426 197,180,89,192, //vmulps %ymm0,%ymm9,%ymm0 10427 197,252,88,201, //vaddps %ymm1,%ymm0,%ymm1 10428 72,173, //lods %ds:(%rsi),%rax 10429 197,124,41,192, //vmovaps %ymm8,%ymm0 10430 255,224, //jmpq *%rax 10431 }; 10432 10433 CODE const uint8_t sk_matrix_3x4_avx[] = { 10434 72,173, //lods %ds:(%rsi),%rax 10435 196,98,125,24,0, //vbroadcastss (%rax),%ymm8 10436 196,98,125,24,72,12, //vbroadcastss 0xc(%rax),%ymm9 10437 196,98,125,24,80,24, //vbroadcastss 0x18(%rax),%ymm10 10438 196,98,125,24,88,36, //vbroadcastss 0x24(%rax),%ymm11 10439 197,44,89,210, //vmulps %ymm2,%ymm10,%ymm10 10440 196,65,44,88,211, //vaddps %ymm11,%ymm10,%ymm10 10441 197,52,89,201, //vmulps %ymm1,%ymm9,%ymm9 10442 196,65,52,88,202, //vaddps %ymm10,%ymm9,%ymm9 10443 197,60,89,192, //vmulps %ymm0,%ymm8,%ymm8 10444 196,65,60,88,193, //vaddps %ymm9,%ymm8,%ymm8 10445 196,98,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm9 10446 196,98,125,24,80,16, //vbroadcastss 0x10(%rax),%ymm10 10447 196,98,125,24,88,28, //vbroadcastss 0x1c(%rax),%ymm11 10448 196,98,125,24,96,40, //vbroadcastss 0x28(%rax),%ymm12 10449 197,36,89,218, //vmulps %ymm2,%ymm11,%ymm11 10450 196,65,36,88,220, //vaddps %ymm12,%ymm11,%ymm11 10451 197,44,89,209, //vmulps %ymm1,%ymm10,%ymm10 10452 196,65,44,88,211, //vaddps %ymm11,%ymm10,%ymm10 10453 197,52,89,200, //vmulps %ymm0,%ymm9,%ymm9 10454 196,65,52,88,202, //vaddps %ymm10,%ymm9,%ymm9 10455 196,98,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm10 10456 196,98,125,24,88,20, //vbroadcastss 0x14(%rax),%ymm11 10457 196,98,125,24,96,32, //vbroadcastss 0x20(%rax),%ymm12 10458 196,98,125,24,104,44, //vbroadcastss 0x2c(%rax),%ymm13 10459 197,156,89,210, //vmulps %ymm2,%ymm12,%ymm2 10460 196,193,108,88,213, //vaddps %ymm13,%ymm2,%ymm2 10461 197,164,89,201, //vmulps %ymm1,%ymm11,%ymm1 10462 197,244,88,202, //vaddps %ymm2,%ymm1,%ymm1 10463 197,172,89,192, //vmulps %ymm0,%ymm10,%ymm0 10464 197,252,88,209, //vaddps %ymm1,%ymm0,%ymm2 10465 72,173, //lods %ds:(%rsi),%rax 10466 197,124,41,192, //vmovaps %ymm8,%ymm0 10467 197,124,41,201, //vmovaps %ymm9,%ymm1 10468 255,224, //jmpq *%rax 10469 }; 10470 10471 CODE const uint8_t sk_matrix_4x5_avx[] = { 10472 72,173, //lods %ds:(%rsi),%rax 10473 196,98,125,24,0, //vbroadcastss (%rax),%ymm8 10474 196,98,125,24,72,16, //vbroadcastss 0x10(%rax),%ymm9 10475 196,98,125,24,80,32, //vbroadcastss 0x20(%rax),%ymm10 10476 196,98,125,24,88,48, //vbroadcastss 0x30(%rax),%ymm11 10477 196,98,125,24,96,64, //vbroadcastss 0x40(%rax),%ymm12 10478 197,36,89,219, //vmulps %ymm3,%ymm11,%ymm11 10479 196,65,36,88,220, //vaddps %ymm12,%ymm11,%ymm11 10480 197,44,89,210, //vmulps %ymm2,%ymm10,%ymm10 10481 196,65,44,88,211, //vaddps %ymm11,%ymm10,%ymm10 10482 197,52,89,201, //vmulps %ymm1,%ymm9,%ymm9 10483 196,65,52,88,202, //vaddps %ymm10,%ymm9,%ymm9 10484 197,60,89,192, //vmulps %ymm0,%ymm8,%ymm8 10485 196,65,60,88,193, //vaddps %ymm9,%ymm8,%ymm8 10486 196,98,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm9 10487 196,98,125,24,80,20, //vbroadcastss 0x14(%rax),%ymm10 10488 196,98,125,24,88,36, //vbroadcastss 0x24(%rax),%ymm11 10489 196,98,125,24,96,52, //vbroadcastss 0x34(%rax),%ymm12 10490 196,98,125,24,104,68, //vbroadcastss 0x44(%rax),%ymm13 10491 197,28,89,227, //vmulps %ymm3,%ymm12,%ymm12 10492 196,65,28,88,229, //vaddps %ymm13,%ymm12,%ymm12 10493 197,36,89,218, //vmulps %ymm2,%ymm11,%ymm11 10494 196,65,36,88,220, //vaddps %ymm12,%ymm11,%ymm11 10495 197,44,89,209, //vmulps %ymm1,%ymm10,%ymm10 10496 196,65,44,88,211, //vaddps %ymm11,%ymm10,%ymm10 10497 197,52,89,200, //vmulps %ymm0,%ymm9,%ymm9 10498 196,65,52,88,202, //vaddps %ymm10,%ymm9,%ymm9 10499 196,98,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm10 10500 196,98,125,24,88,24, //vbroadcastss 0x18(%rax),%ymm11 10501 196,98,125,24,96,40, //vbroadcastss 0x28(%rax),%ymm12 10502 196,98,125,24,104,56, //vbroadcastss 0x38(%rax),%ymm13 10503 196,98,125,24,112,72, //vbroadcastss 0x48(%rax),%ymm14 10504 197,20,89,235, //vmulps %ymm3,%ymm13,%ymm13 10505 196,65,20,88,238, //vaddps %ymm14,%ymm13,%ymm13 10506 197,28,89,226, //vmulps %ymm2,%ymm12,%ymm12 10507 196,65,28,88,229, //vaddps %ymm13,%ymm12,%ymm12 10508 197,36,89,217, //vmulps %ymm1,%ymm11,%ymm11 10509 196,65,36,88,220, //vaddps %ymm12,%ymm11,%ymm11 10510 197,44,89,208, //vmulps %ymm0,%ymm10,%ymm10 10511 196,65,44,88,211, //vaddps %ymm11,%ymm10,%ymm10 10512 196,98,125,24,88,12, //vbroadcastss 0xc(%rax),%ymm11 10513 196,98,125,24,96,28, //vbroadcastss 0x1c(%rax),%ymm12 10514 196,98,125,24,104,44, //vbroadcastss 0x2c(%rax),%ymm13 10515 196,98,125,24,112,60, //vbroadcastss 0x3c(%rax),%ymm14 10516 196,98,125,24,120,76, //vbroadcastss 0x4c(%rax),%ymm15 10517 197,140,89,219, //vmulps %ymm3,%ymm14,%ymm3 10518 196,193,100,88,223, //vaddps %ymm15,%ymm3,%ymm3 10519 197,148,89,210, //vmulps %ymm2,%ymm13,%ymm2 10520 197,236,88,211, //vaddps %ymm3,%ymm2,%ymm2 10521 197,156,89,201, //vmulps %ymm1,%ymm12,%ymm1 10522 197,244,88,202, //vaddps %ymm2,%ymm1,%ymm1 10523 197,164,89,192, //vmulps %ymm0,%ymm11,%ymm0 10524 197,252,88,217, //vaddps %ymm1,%ymm0,%ymm3 10525 72,173, //lods %ds:(%rsi),%rax 10526 197,124,41,192, //vmovaps %ymm8,%ymm0 10527 197,124,41,201, //vmovaps %ymm9,%ymm1 10528 197,124,41,210, //vmovaps %ymm10,%ymm2 10529 255,224, //jmpq *%rax 10530 }; 10531 10532 CODE const uint8_t sk_matrix_perspective_avx[] = { 10533 72,173, //lods %ds:(%rsi),%rax 10534 196,98,125,24,0, //vbroadcastss (%rax),%ymm8 10535 196,98,125,24,72,4, //vbroadcastss 0x4(%rax),%ymm9 10536 196,98,125,24,80,8, //vbroadcastss 0x8(%rax),%ymm10 10537 197,52,89,201, //vmulps %ymm1,%ymm9,%ymm9 10538 196,65,52,88,202, //vaddps %ymm10,%ymm9,%ymm9 10539 197,60,89,192, //vmulps %ymm0,%ymm8,%ymm8 10540 196,65,60,88,193, //vaddps %ymm9,%ymm8,%ymm8 10541 196,98,125,24,72,12, //vbroadcastss 0xc(%rax),%ymm9 10542 196,98,125,24,80,16, //vbroadcastss 0x10(%rax),%ymm10 10543 196,98,125,24,88,20, //vbroadcastss 0x14(%rax),%ymm11 10544 197,44,89,209, //vmulps %ymm1,%ymm10,%ymm10 10545 196,65,44,88,211, //vaddps %ymm11,%ymm10,%ymm10 10546 197,52,89,200, //vmulps %ymm0,%ymm9,%ymm9 10547 196,65,52,88,202, //vaddps %ymm10,%ymm9,%ymm9 10548 196,98,125,24,80,24, //vbroadcastss 0x18(%rax),%ymm10 10549 196,98,125,24,88,28, //vbroadcastss 0x1c(%rax),%ymm11 10550 196,98,125,24,96,32, //vbroadcastss 0x20(%rax),%ymm12 10551 197,164,89,201, //vmulps %ymm1,%ymm11,%ymm1 10552 196,193,116,88,204, //vaddps %ymm12,%ymm1,%ymm1 10553 197,172,89,192, //vmulps %ymm0,%ymm10,%ymm0 10554 197,252,88,193, //vaddps %ymm1,%ymm0,%ymm0 10555 197,252,83,200, //vrcpps %ymm0,%ymm1 10556 197,188,89,193, //vmulps %ymm1,%ymm8,%ymm0 10557 197,180,89,201, //vmulps %ymm1,%ymm9,%ymm1 10558 72,173, //lods %ds:(%rsi),%rax 10559 255,224, //jmpq *%rax 10560 }; 10561 10562 CODE const uint8_t sk_linear_gradient_2stops_avx[] = { 10563 72,173, //lods %ds:(%rsi),%rax 10564 196,226,125,24,72,16, //vbroadcastss 0x10(%rax),%ymm1 10565 196,226,125,24,16, //vbroadcastss (%rax),%ymm2 10566 197,244,89,200, //vmulps %ymm0,%ymm1,%ymm1 10567 197,108,88,193, //vaddps %ymm1,%ymm2,%ymm8 10568 196,226,125,24,72,20, //vbroadcastss 0x14(%rax),%ymm1 10569 196,226,125,24,80,4, //vbroadcastss 0x4(%rax),%ymm2 10570 197,244,89,200, //vmulps %ymm0,%ymm1,%ymm1 10571 197,236,88,201, //vaddps %ymm1,%ymm2,%ymm1 10572 196,226,125,24,80,24, //vbroadcastss 0x18(%rax),%ymm2 10573 196,226,125,24,88,8, //vbroadcastss 0x8(%rax),%ymm3 10574 197,236,89,208, //vmulps %ymm0,%ymm2,%ymm2 10575 197,228,88,210, //vaddps %ymm2,%ymm3,%ymm2 10576 196,226,125,24,88,28, //vbroadcastss 0x1c(%rax),%ymm3 10577 196,98,125,24,72,12, //vbroadcastss 0xc(%rax),%ymm9 10578 197,228,89,192, //vmulps %ymm0,%ymm3,%ymm0 10579 197,180,88,216, //vaddps %ymm0,%ymm9,%ymm3 10580 72,173, //lods %ds:(%rsi),%rax 10581 197,124,41,192, //vmovaps %ymm8,%ymm0 10582 255,224, //jmpq *%rax 10583 }; 10584 10585 CODE const uint8_t sk_start_pipeline_sse41[] = { 10586 65,87, //push %r15 10587 65,86, //push %r14 10588 65,85, //push %r13 10589 65,84, //push %r12 10590 86, //push %rsi 10591 87, //push %rdi 10592 83, //push %rbx 10593 72,129,236,160,0,0,0, //sub $0xa0,%rsp 10594 68,15,41,188,36,144,0,0,0, //movaps %xmm15,0x90(%rsp) 10595 68,15,41,180,36,128,0,0,0, //movaps %xmm14,0x80(%rsp) 10596 68,15,41,108,36,112, //movaps %xmm13,0x70(%rsp) 10597 68,15,41,100,36,96, //movaps %xmm12,0x60(%rsp) 10598 68,15,41,92,36,80, //movaps %xmm11,0x50(%rsp) 10599 68,15,41,84,36,64, //movaps %xmm10,0x40(%rsp) 10600 68,15,41,76,36,48, //movaps %xmm9,0x30(%rsp) 10601 68,15,41,68,36,32, //movaps %xmm8,0x20(%rsp) 10602 15,41,124,36,16, //movaps %xmm7,0x10(%rsp) 10603 15,41,52,36, //movaps %xmm6,(%rsp) 10604 77,137,207, //mov %r9,%r15 10605 77,137,198, //mov %r8,%r14 10606 72,137,203, //mov %rcx,%rbx 10607 72,137,214, //mov %rdx,%rsi 10608 72,173, //lods %ds:(%rsi),%rax 10609 73,137,196, //mov %rax,%r12 10610 73,137,245, //mov %rsi,%r13 10611 72,141,67,4, //lea 0x4(%rbx),%rax 10612 76,57,248, //cmp %r15,%rax 10613 118,5, //jbe 73 <_sk_start_pipeline_sse41+0x73> 10614 72,137,216, //mov %rbx,%rax 10615 235,52, //jmp a7 <_sk_start_pipeline_sse41+0xa7> 10616 15,87,192, //xorps %xmm0,%xmm0 10617 15,87,201, //xorps %xmm1,%xmm1 10618 15,87,210, //xorps %xmm2,%xmm2 10619 15,87,219, //xorps %xmm3,%xmm3 10620 15,87,228, //xorps %xmm4,%xmm4 10621 15,87,237, //xorps %xmm5,%xmm5 10622 15,87,246, //xorps %xmm6,%xmm6 10623 15,87,255, //xorps %xmm7,%xmm7 10624 72,137,223, //mov %rbx,%rdi 10625 76,137,238, //mov %r13,%rsi 10626 76,137,242, //mov %r14,%rdx 10627 65,255,212, //callq *%r12 10628 72,141,67,4, //lea 0x4(%rbx),%rax 10629 72,131,195,8, //add $0x8,%rbx 10630 76,57,251, //cmp %r15,%rbx 10631 72,137,195, //mov %rax,%rbx 10632 118,204, //jbe 73 <_sk_start_pipeline_sse41+0x73> 10633 15,40,52,36, //movaps (%rsp),%xmm6 10634 15,40,124,36,16, //movaps 0x10(%rsp),%xmm7 10635 68,15,40,68,36,32, //movaps 0x20(%rsp),%xmm8 10636 68,15,40,76,36,48, //movaps 0x30(%rsp),%xmm9 10637 68,15,40,84,36,64, //movaps 0x40(%rsp),%xmm10 10638 68,15,40,92,36,80, //movaps 0x50(%rsp),%xmm11 10639 68,15,40,100,36,96, //movaps 0x60(%rsp),%xmm12 10640 68,15,40,108,36,112, //movaps 0x70(%rsp),%xmm13 10641 68,15,40,180,36,128,0,0,0, //movaps 0x80(%rsp),%xmm14 10642 68,15,40,188,36,144,0,0,0, //movaps 0x90(%rsp),%xmm15 10643 72,129,196,160,0,0,0, //add $0xa0,%rsp 10644 91, //pop %rbx 10645 95, //pop %rdi 10646 94, //pop %rsi 10647 65,92, //pop %r12 10648 65,93, //pop %r13 10649 65,94, //pop %r14 10650 65,95, //pop %r15 10651 195, //retq 10652 }; 10653 10654 CODE const uint8_t sk_just_return_sse41[] = { 10655 195, //retq 10656 }; 10657 10658 CODE const uint8_t sk_seed_shader_sse41[] = { 10659 72,173, //lods %ds:(%rsi),%rax 10660 102,15,110,199, //movd %edi,%xmm0 10661 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0 10662 15,91,200, //cvtdq2ps %xmm0,%xmm1 10663 185,0,0,0,63, //mov $0x3f000000,%ecx 10664 102,15,110,209, //movd %ecx,%xmm2 10665 15,198,210,0, //shufps $0x0,%xmm2,%xmm2 10666 15,88,202, //addps %xmm2,%xmm1 10667 15,16,2, //movups (%rdx),%xmm0 10668 15,88,193, //addps %xmm1,%xmm0 10669 102,15,110,8, //movd (%rax),%xmm1 10670 102,15,112,201,0, //pshufd $0x0,%xmm1,%xmm1 10671 15,91,201, //cvtdq2ps %xmm1,%xmm1 10672 15,88,202, //addps %xmm2,%xmm1 10673 184,0,0,128,63, //mov $0x3f800000,%eax 10674 102,15,110,208, //movd %eax,%xmm2 10675 15,198,210,0, //shufps $0x0,%xmm2,%xmm2 10676 72,173, //lods %ds:(%rsi),%rax 10677 15,87,219, //xorps %xmm3,%xmm3 10678 15,87,228, //xorps %xmm4,%xmm4 10679 15,87,237, //xorps %xmm5,%xmm5 10680 15,87,246, //xorps %xmm6,%xmm6 10681 15,87,255, //xorps %xmm7,%xmm7 10682 255,224, //jmpq *%rax 10683 }; 10684 10685 CODE const uint8_t sk_constant_color_sse41[] = { 10686 72,173, //lods %ds:(%rsi),%rax 10687 15,16,24, //movups (%rax),%xmm3 10688 15,40,195, //movaps %xmm3,%xmm0 10689 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 10690 15,40,203, //movaps %xmm3,%xmm1 10691 15,198,201,85, //shufps $0x55,%xmm1,%xmm1 10692 15,40,211, //movaps %xmm3,%xmm2 10693 15,198,210,170, //shufps $0xaa,%xmm2,%xmm2 10694 15,198,219,255, //shufps $0xff,%xmm3,%xmm3 10695 72,173, //lods %ds:(%rsi),%rax 10696 255,224, //jmpq *%rax 10697 }; 10698 10699 CODE const uint8_t sk_clear_sse41[] = { 10700 72,173, //lods %ds:(%rsi),%rax 10701 15,87,192, //xorps %xmm0,%xmm0 10702 15,87,201, //xorps %xmm1,%xmm1 10703 15,87,210, //xorps %xmm2,%xmm2 10704 15,87,219, //xorps %xmm3,%xmm3 10705 255,224, //jmpq *%rax 10706 }; 10707 10708 CODE const uint8_t sk_plus__sse41[] = { 10709 15,88,196, //addps %xmm4,%xmm0 10710 15,88,205, //addps %xmm5,%xmm1 10711 15,88,214, //addps %xmm6,%xmm2 10712 15,88,223, //addps %xmm7,%xmm3 10713 72,173, //lods %ds:(%rsi),%rax 10714 255,224, //jmpq *%rax 10715 }; 10716 10717 CODE const uint8_t sk_srcover_sse41[] = { 10718 184,0,0,128,63, //mov $0x3f800000,%eax 10719 102,68,15,110,192, //movd %eax,%xmm8 10720 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 10721 68,15,92,195, //subps %xmm3,%xmm8 10722 69,15,40,200, //movaps %xmm8,%xmm9 10723 68,15,89,204, //mulps %xmm4,%xmm9 10724 65,15,88,193, //addps %xmm9,%xmm0 10725 69,15,40,200, //movaps %xmm8,%xmm9 10726 68,15,89,205, //mulps %xmm5,%xmm9 10727 65,15,88,201, //addps %xmm9,%xmm1 10728 69,15,40,200, //movaps %xmm8,%xmm9 10729 68,15,89,206, //mulps %xmm6,%xmm9 10730 65,15,88,209, //addps %xmm9,%xmm2 10731 68,15,89,199, //mulps %xmm7,%xmm8 10732 65,15,88,216, //addps %xmm8,%xmm3 10733 72,173, //lods %ds:(%rsi),%rax 10734 255,224, //jmpq *%rax 10735 }; 10736 10737 CODE const uint8_t sk_dstover_sse41[] = { 10738 184,0,0,128,63, //mov $0x3f800000,%eax 10739 102,68,15,110,192, //movd %eax,%xmm8 10740 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 10741 68,15,92,199, //subps %xmm7,%xmm8 10742 65,15,89,192, //mulps %xmm8,%xmm0 10743 15,88,196, //addps %xmm4,%xmm0 10744 65,15,89,200, //mulps %xmm8,%xmm1 10745 15,88,205, //addps %xmm5,%xmm1 10746 65,15,89,208, //mulps %xmm8,%xmm2 10747 15,88,214, //addps %xmm6,%xmm2 10748 65,15,89,216, //mulps %xmm8,%xmm3 10749 15,88,223, //addps %xmm7,%xmm3 10750 72,173, //lods %ds:(%rsi),%rax 10751 255,224, //jmpq *%rax 10752 }; 10753 10754 CODE const uint8_t sk_clamp_0_sse41[] = { 10755 69,15,87,192, //xorps %xmm8,%xmm8 10756 65,15,95,192, //maxps %xmm8,%xmm0 10757 65,15,95,200, //maxps %xmm8,%xmm1 10758 65,15,95,208, //maxps %xmm8,%xmm2 10759 65,15,95,216, //maxps %xmm8,%xmm3 10760 72,173, //lods %ds:(%rsi),%rax 10761 255,224, //jmpq *%rax 10762 }; 10763 10764 CODE const uint8_t sk_clamp_1_sse41[] = { 10765 184,0,0,128,63, //mov $0x3f800000,%eax 10766 102,68,15,110,192, //movd %eax,%xmm8 10767 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 10768 65,15,93,192, //minps %xmm8,%xmm0 10769 65,15,93,200, //minps %xmm8,%xmm1 10770 65,15,93,208, //minps %xmm8,%xmm2 10771 65,15,93,216, //minps %xmm8,%xmm3 10772 72,173, //lods %ds:(%rsi),%rax 10773 255,224, //jmpq *%rax 10774 }; 10775 10776 CODE const uint8_t sk_clamp_a_sse41[] = { 10777 184,0,0,128,63, //mov $0x3f800000,%eax 10778 102,68,15,110,192, //movd %eax,%xmm8 10779 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 10780 65,15,93,216, //minps %xmm8,%xmm3 10781 15,93,195, //minps %xmm3,%xmm0 10782 15,93,203, //minps %xmm3,%xmm1 10783 15,93,211, //minps %xmm3,%xmm2 10784 72,173, //lods %ds:(%rsi),%rax 10785 255,224, //jmpq *%rax 10786 }; 10787 10788 CODE const uint8_t sk_set_rgb_sse41[] = { 10789 72,173, //lods %ds:(%rsi),%rax 10790 243,15,16,0, //movss (%rax),%xmm0 10791 243,15,16,72,4, //movss 0x4(%rax),%xmm1 10792 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 10793 15,198,201,0, //shufps $0x0,%xmm1,%xmm1 10794 243,15,16,80,8, //movss 0x8(%rax),%xmm2 10795 15,198,210,0, //shufps $0x0,%xmm2,%xmm2 10796 72,173, //lods %ds:(%rsi),%rax 10797 255,224, //jmpq *%rax 10798 }; 10799 10800 CODE const uint8_t sk_swap_rb_sse41[] = { 10801 68,15,40,192, //movaps %xmm0,%xmm8 10802 72,173, //lods %ds:(%rsi),%rax 10803 15,40,194, //movaps %xmm2,%xmm0 10804 65,15,40,208, //movaps %xmm8,%xmm2 10805 255,224, //jmpq *%rax 10806 }; 10807 10808 CODE const uint8_t sk_swap_sse41[] = { 10809 68,15,40,195, //movaps %xmm3,%xmm8 10810 68,15,40,202, //movaps %xmm2,%xmm9 10811 68,15,40,209, //movaps %xmm1,%xmm10 10812 68,15,40,216, //movaps %xmm0,%xmm11 10813 72,173, //lods %ds:(%rsi),%rax 10814 15,40,196, //movaps %xmm4,%xmm0 10815 15,40,205, //movaps %xmm5,%xmm1 10816 15,40,214, //movaps %xmm6,%xmm2 10817 15,40,223, //movaps %xmm7,%xmm3 10818 65,15,40,227, //movaps %xmm11,%xmm4 10819 65,15,40,234, //movaps %xmm10,%xmm5 10820 65,15,40,241, //movaps %xmm9,%xmm6 10821 65,15,40,248, //movaps %xmm8,%xmm7 10822 255,224, //jmpq *%rax 10823 }; 10824 10825 CODE const uint8_t sk_move_src_dst_sse41[] = { 10826 72,173, //lods %ds:(%rsi),%rax 10827 15,40,224, //movaps %xmm0,%xmm4 10828 15,40,233, //movaps %xmm1,%xmm5 10829 15,40,242, //movaps %xmm2,%xmm6 10830 15,40,251, //movaps %xmm3,%xmm7 10831 255,224, //jmpq *%rax 10832 }; 10833 10834 CODE const uint8_t sk_move_dst_src_sse41[] = { 10835 72,173, //lods %ds:(%rsi),%rax 10836 15,40,196, //movaps %xmm4,%xmm0 10837 15,40,205, //movaps %xmm5,%xmm1 10838 15,40,214, //movaps %xmm6,%xmm2 10839 15,40,223, //movaps %xmm7,%xmm3 10840 255,224, //jmpq *%rax 10841 }; 10842 10843 CODE const uint8_t sk_premul_sse41[] = { 10844 15,89,195, //mulps %xmm3,%xmm0 10845 15,89,203, //mulps %xmm3,%xmm1 10846 15,89,211, //mulps %xmm3,%xmm2 10847 72,173, //lods %ds:(%rsi),%rax 10848 255,224, //jmpq *%rax 10849 }; 10850 10851 CODE const uint8_t sk_unpremul_sse41[] = { 10852 69,15,87,192, //xorps %xmm8,%xmm8 10853 184,0,0,128,63, //mov $0x3f800000,%eax 10854 102,68,15,110,200, //movd %eax,%xmm9 10855 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 10856 68,15,94,203, //divps %xmm3,%xmm9 10857 68,15,194,195,4, //cmpneqps %xmm3,%xmm8 10858 69,15,84,193, //andps %xmm9,%xmm8 10859 65,15,89,192, //mulps %xmm8,%xmm0 10860 65,15,89,200, //mulps %xmm8,%xmm1 10861 65,15,89,208, //mulps %xmm8,%xmm2 10862 72,173, //lods %ds:(%rsi),%rax 10863 255,224, //jmpq *%rax 10864 }; 10865 10866 CODE const uint8_t sk_from_srgb_sse41[] = { 10867 184,145,131,158,61, //mov $0x3d9e8391,%eax 10868 102,68,15,110,216, //movd %eax,%xmm11 10869 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 10870 69,15,40,211, //movaps %xmm11,%xmm10 10871 68,15,89,208, //mulps %xmm0,%xmm10 10872 68,15,40,240, //movaps %xmm0,%xmm14 10873 69,15,89,246, //mulps %xmm14,%xmm14 10874 184,154,153,153,62, //mov $0x3e99999a,%eax 10875 102,68,15,110,192, //movd %eax,%xmm8 10876 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 10877 184,92,143,50,63, //mov $0x3f328f5c,%eax 10878 102,68,15,110,224, //movd %eax,%xmm12 10879 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 10880 69,15,40,200, //movaps %xmm8,%xmm9 10881 68,15,89,200, //mulps %xmm0,%xmm9 10882 69,15,88,204, //addps %xmm12,%xmm9 10883 184,10,215,35,59, //mov $0x3b23d70a,%eax 10884 102,68,15,110,232, //movd %eax,%xmm13 10885 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13 10886 69,15,89,206, //mulps %xmm14,%xmm9 10887 69,15,88,205, //addps %xmm13,%xmm9 10888 184,174,71,97,61, //mov $0x3d6147ae,%eax 10889 102,68,15,110,240, //movd %eax,%xmm14 10890 69,15,198,246,0, //shufps $0x0,%xmm14,%xmm14 10891 65,15,194,198,1, //cmpltps %xmm14,%xmm0 10892 102,69,15,56,20,202, //blendvps %xmm0,%xmm10,%xmm9 10893 69,15,40,251, //movaps %xmm11,%xmm15 10894 68,15,89,249, //mulps %xmm1,%xmm15 10895 15,40,193, //movaps %xmm1,%xmm0 10896 15,89,192, //mulps %xmm0,%xmm0 10897 69,15,40,208, //movaps %xmm8,%xmm10 10898 68,15,89,209, //mulps %xmm1,%xmm10 10899 69,15,88,212, //addps %xmm12,%xmm10 10900 68,15,89,208, //mulps %xmm0,%xmm10 10901 69,15,88,213, //addps %xmm13,%xmm10 10902 65,15,194,206,1, //cmpltps %xmm14,%xmm1 10903 15,40,193, //movaps %xmm1,%xmm0 10904 102,69,15,56,20,215, //blendvps %xmm0,%xmm15,%xmm10 10905 68,15,89,218, //mulps %xmm2,%xmm11 10906 15,40,194, //movaps %xmm2,%xmm0 10907 15,89,192, //mulps %xmm0,%xmm0 10908 68,15,89,194, //mulps %xmm2,%xmm8 10909 69,15,88,196, //addps %xmm12,%xmm8 10910 68,15,89,192, //mulps %xmm0,%xmm8 10911 69,15,88,197, //addps %xmm13,%xmm8 10912 65,15,194,214,1, //cmpltps %xmm14,%xmm2 10913 15,40,194, //movaps %xmm2,%xmm0 10914 102,69,15,56,20,195, //blendvps %xmm0,%xmm11,%xmm8 10915 72,173, //lods %ds:(%rsi),%rax 10916 65,15,40,193, //movaps %xmm9,%xmm0 10917 65,15,40,202, //movaps %xmm10,%xmm1 10918 65,15,40,208, //movaps %xmm8,%xmm2 10919 255,224, //jmpq *%rax 10920 }; 10921 10922 CODE const uint8_t sk_to_srgb_sse41[] = { 10923 72,131,236,24, //sub $0x18,%rsp 10924 15,41,60,36, //movaps %xmm7,(%rsp) 10925 15,40,254, //movaps %xmm6,%xmm7 10926 15,40,245, //movaps %xmm5,%xmm6 10927 15,40,236, //movaps %xmm4,%xmm5 10928 15,40,227, //movaps %xmm3,%xmm4 10929 15,40,218, //movaps %xmm2,%xmm3 10930 15,40,209, //movaps %xmm1,%xmm2 10931 68,15,82,192, //rsqrtps %xmm0,%xmm8 10932 69,15,83,200, //rcpps %xmm8,%xmm9 10933 69,15,82,248, //rsqrtps %xmm8,%xmm15 10934 184,41,92,71,65, //mov $0x41475c29,%eax 10935 102,68,15,110,216, //movd %eax,%xmm11 10936 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 10937 69,15,40,211, //movaps %xmm11,%xmm10 10938 68,15,89,208, //mulps %xmm0,%xmm10 10939 184,0,0,128,63, //mov $0x3f800000,%eax 10940 102,68,15,110,192, //movd %eax,%xmm8 10941 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 10942 184,194,135,210,62, //mov $0x3ed287c2,%eax 10943 102,68,15,110,224, //movd %eax,%xmm12 10944 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 10945 184,206,111,48,63, //mov $0x3f306fce,%eax 10946 102,68,15,110,232, //movd %eax,%xmm13 10947 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13 10948 184,168,87,202,61, //mov $0x3dca57a8,%eax 10949 53,0,0,0,128, //xor $0x80000000,%eax 10950 102,68,15,110,240, //movd %eax,%xmm14 10951 69,15,198,246,0, //shufps $0x0,%xmm14,%xmm14 10952 69,15,89,205, //mulps %xmm13,%xmm9 10953 69,15,88,206, //addps %xmm14,%xmm9 10954 69,15,89,252, //mulps %xmm12,%xmm15 10955 69,15,88,249, //addps %xmm9,%xmm15 10956 69,15,40,200, //movaps %xmm8,%xmm9 10957 69,15,93,207, //minps %xmm15,%xmm9 10958 184,4,231,140,59, //mov $0x3b8ce704,%eax 10959 102,68,15,110,248, //movd %eax,%xmm15 10960 69,15,198,255,0, //shufps $0x0,%xmm15,%xmm15 10961 65,15,194,199,1, //cmpltps %xmm15,%xmm0 10962 102,69,15,56,20,202, //blendvps %xmm0,%xmm10,%xmm9 10963 68,15,82,210, //rsqrtps %xmm2,%xmm10 10964 65,15,83,194, //rcpps %xmm10,%xmm0 10965 69,15,82,210, //rsqrtps %xmm10,%xmm10 10966 65,15,89,197, //mulps %xmm13,%xmm0 10967 65,15,88,198, //addps %xmm14,%xmm0 10968 69,15,89,212, //mulps %xmm12,%xmm10 10969 68,15,88,208, //addps %xmm0,%xmm10 10970 65,15,40,200, //movaps %xmm8,%xmm1 10971 65,15,93,202, //minps %xmm10,%xmm1 10972 69,15,40,211, //movaps %xmm11,%xmm10 10973 68,15,89,210, //mulps %xmm2,%xmm10 10974 65,15,194,215,1, //cmpltps %xmm15,%xmm2 10975 15,40,194, //movaps %xmm2,%xmm0 10976 102,65,15,56,20,202, //blendvps %xmm0,%xmm10,%xmm1 10977 15,82,195, //rsqrtps %xmm3,%xmm0 10978 15,83,208, //rcpps %xmm0,%xmm2 10979 65,15,89,213, //mulps %xmm13,%xmm2 10980 65,15,88,214, //addps %xmm14,%xmm2 10981 15,82,192, //rsqrtps %xmm0,%xmm0 10982 65,15,89,196, //mulps %xmm12,%xmm0 10983 15,88,194, //addps %xmm2,%xmm0 10984 68,15,93,192, //minps %xmm0,%xmm8 10985 68,15,89,219, //mulps %xmm3,%xmm11 10986 65,15,194,223,1, //cmpltps %xmm15,%xmm3 10987 15,40,195, //movaps %xmm3,%xmm0 10988 102,69,15,56,20,195, //blendvps %xmm0,%xmm11,%xmm8 10989 72,173, //lods %ds:(%rsi),%rax 10990 65,15,40,193, //movaps %xmm9,%xmm0 10991 65,15,40,208, //movaps %xmm8,%xmm2 10992 15,40,220, //movaps %xmm4,%xmm3 10993 15,40,229, //movaps %xmm5,%xmm4 10994 15,40,238, //movaps %xmm6,%xmm5 10995 15,40,247, //movaps %xmm7,%xmm6 10996 15,40,60,36, //movaps (%rsp),%xmm7 10997 72,131,196,24, //add $0x18,%rsp 10998 255,224, //jmpq *%rax 10999 }; 11000 11001 CODE const uint8_t sk_scale_1_float_sse41[] = { 11002 72,173, //lods %ds:(%rsi),%rax 11003 243,68,15,16,0, //movss (%rax),%xmm8 11004 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 11005 65,15,89,192, //mulps %xmm8,%xmm0 11006 65,15,89,200, //mulps %xmm8,%xmm1 11007 65,15,89,208, //mulps %xmm8,%xmm2 11008 65,15,89,216, //mulps %xmm8,%xmm3 11009 72,173, //lods %ds:(%rsi),%rax 11010 255,224, //jmpq *%rax 11011 }; 11012 11013 CODE const uint8_t sk_scale_u8_sse41[] = { 11014 72,173, //lods %ds:(%rsi),%rax 11015 72,139,0, //mov (%rax),%rax 11016 102,68,15,56,49,4,56, //pmovzxbd (%rax,%rdi,1),%xmm8 11017 69,15,91,192, //cvtdq2ps %xmm8,%xmm8 11018 184,129,128,128,59, //mov $0x3b808081,%eax 11019 102,68,15,110,200, //movd %eax,%xmm9 11020 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 11021 69,15,89,200, //mulps %xmm8,%xmm9 11022 65,15,89,193, //mulps %xmm9,%xmm0 11023 65,15,89,201, //mulps %xmm9,%xmm1 11024 65,15,89,209, //mulps %xmm9,%xmm2 11025 65,15,89,217, //mulps %xmm9,%xmm3 11026 72,173, //lods %ds:(%rsi),%rax 11027 255,224, //jmpq *%rax 11028 }; 11029 11030 CODE const uint8_t sk_lerp_1_float_sse41[] = { 11031 72,173, //lods %ds:(%rsi),%rax 11032 243,68,15,16,0, //movss (%rax),%xmm8 11033 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 11034 15,92,196, //subps %xmm4,%xmm0 11035 65,15,89,192, //mulps %xmm8,%xmm0 11036 15,88,196, //addps %xmm4,%xmm0 11037 15,92,205, //subps %xmm5,%xmm1 11038 65,15,89,200, //mulps %xmm8,%xmm1 11039 15,88,205, //addps %xmm5,%xmm1 11040 15,92,214, //subps %xmm6,%xmm2 11041 65,15,89,208, //mulps %xmm8,%xmm2 11042 15,88,214, //addps %xmm6,%xmm2 11043 15,92,223, //subps %xmm7,%xmm3 11044 65,15,89,216, //mulps %xmm8,%xmm3 11045 15,88,223, //addps %xmm7,%xmm3 11046 72,173, //lods %ds:(%rsi),%rax 11047 255,224, //jmpq *%rax 11048 }; 11049 11050 CODE const uint8_t sk_lerp_u8_sse41[] = { 11051 72,173, //lods %ds:(%rsi),%rax 11052 72,139,0, //mov (%rax),%rax 11053 102,68,15,56,49,4,56, //pmovzxbd (%rax,%rdi,1),%xmm8 11054 69,15,91,192, //cvtdq2ps %xmm8,%xmm8 11055 184,129,128,128,59, //mov $0x3b808081,%eax 11056 102,68,15,110,200, //movd %eax,%xmm9 11057 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 11058 69,15,89,200, //mulps %xmm8,%xmm9 11059 15,92,196, //subps %xmm4,%xmm0 11060 65,15,89,193, //mulps %xmm9,%xmm0 11061 15,88,196, //addps %xmm4,%xmm0 11062 15,92,205, //subps %xmm5,%xmm1 11063 65,15,89,201, //mulps %xmm9,%xmm1 11064 15,88,205, //addps %xmm5,%xmm1 11065 15,92,214, //subps %xmm6,%xmm2 11066 65,15,89,209, //mulps %xmm9,%xmm2 11067 15,88,214, //addps %xmm6,%xmm2 11068 15,92,223, //subps %xmm7,%xmm3 11069 65,15,89,217, //mulps %xmm9,%xmm3 11070 15,88,223, //addps %xmm7,%xmm3 11071 72,173, //lods %ds:(%rsi),%rax 11072 255,224, //jmpq *%rax 11073 }; 11074 11075 CODE const uint8_t sk_lerp_565_sse41[] = { 11076 72,173, //lods %ds:(%rsi),%rax 11077 72,139,0, //mov (%rax),%rax 11078 102,68,15,56,51,4,120, //pmovzxwd (%rax,%rdi,2),%xmm8 11079 184,0,248,0,0, //mov $0xf800,%eax 11080 102,15,110,216, //movd %eax,%xmm3 11081 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3 11082 102,65,15,219,216, //pand %xmm8,%xmm3 11083 68,15,91,203, //cvtdq2ps %xmm3,%xmm9 11084 184,8,33,132,55, //mov $0x37842108,%eax 11085 102,68,15,110,208, //movd %eax,%xmm10 11086 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 11087 69,15,89,209, //mulps %xmm9,%xmm10 11088 184,224,7,0,0, //mov $0x7e0,%eax 11089 102,15,110,216, //movd %eax,%xmm3 11090 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3 11091 102,65,15,219,216, //pand %xmm8,%xmm3 11092 68,15,91,203, //cvtdq2ps %xmm3,%xmm9 11093 184,33,8,2,58, //mov $0x3a020821,%eax 11094 102,68,15,110,216, //movd %eax,%xmm11 11095 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 11096 69,15,89,217, //mulps %xmm9,%xmm11 11097 184,31,0,0,0, //mov $0x1f,%eax 11098 102,15,110,216, //movd %eax,%xmm3 11099 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3 11100 102,65,15,219,216, //pand %xmm8,%xmm3 11101 68,15,91,195, //cvtdq2ps %xmm3,%xmm8 11102 184,8,33,4,61, //mov $0x3d042108,%eax 11103 102,15,110,216, //movd %eax,%xmm3 11104 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 11105 65,15,89,216, //mulps %xmm8,%xmm3 11106 15,92,196, //subps %xmm4,%xmm0 11107 65,15,89,194, //mulps %xmm10,%xmm0 11108 15,88,196, //addps %xmm4,%xmm0 11109 15,92,205, //subps %xmm5,%xmm1 11110 65,15,89,203, //mulps %xmm11,%xmm1 11111 15,88,205, //addps %xmm5,%xmm1 11112 15,92,214, //subps %xmm6,%xmm2 11113 15,89,211, //mulps %xmm3,%xmm2 11114 15,88,214, //addps %xmm6,%xmm2 11115 184,0,0,128,63, //mov $0x3f800000,%eax 11116 102,15,110,216, //movd %eax,%xmm3 11117 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 11118 72,173, //lods %ds:(%rsi),%rax 11119 255,224, //jmpq *%rax 11120 }; 11121 11122 CODE const uint8_t sk_load_tables_sse41[] = { 11123 72,173, //lods %ds:(%rsi),%rax 11124 72,139,8, //mov (%rax),%rcx 11125 76,139,64,8, //mov 0x8(%rax),%r8 11126 243,68,15,111,4,185, //movdqu (%rcx,%rdi,4),%xmm8 11127 185,255,0,0,0, //mov $0xff,%ecx 11128 102,15,110,193, //movd %ecx,%xmm0 11129 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0 11130 102,65,15,111,200, //movdqa %xmm8,%xmm1 11131 102,15,114,209,8, //psrld $0x8,%xmm1 11132 102,15,219,200, //pand %xmm0,%xmm1 11133 102,65,15,111,208, //movdqa %xmm8,%xmm2 11134 102,15,114,210,16, //psrld $0x10,%xmm2 11135 102,15,219,208, //pand %xmm0,%xmm2 11136 102,65,15,219,192, //pand %xmm8,%xmm0 11137 102,72,15,58,22,193,1, //pextrq $0x1,%xmm0,%rcx 11138 65,137,201, //mov %ecx,%r9d 11139 72,193,233,32, //shr $0x20,%rcx 11140 102,73,15,126,194, //movq %xmm0,%r10 11141 69,137,211, //mov %r10d,%r11d 11142 73,193,234,32, //shr $0x20,%r10 11143 243,67,15,16,4,152, //movss (%r8,%r11,4),%xmm0 11144 102,67,15,58,33,4,144,16, //insertps $0x10,(%r8,%r10,4),%xmm0 11145 102,67,15,58,33,4,136,32, //insertps $0x20,(%r8,%r9,4),%xmm0 11146 102,65,15,58,33,4,136,48, //insertps $0x30,(%r8,%rcx,4),%xmm0 11147 76,139,64,16, //mov 0x10(%rax),%r8 11148 102,73,15,58,22,202,1, //pextrq $0x1,%xmm1,%r10 11149 77,137,209, //mov %r10,%r9 11150 73,193,233,32, //shr $0x20,%r9 11151 102,72,15,126,201, //movq %xmm1,%rcx 11152 65,137,203, //mov %ecx,%r11d 11153 65,129,227,255,255,255,0, //and $0xffffff,%r11d 11154 72,193,233,30, //shr $0x1e,%rcx 11155 65,129,226,255,255,255,0, //and $0xffffff,%r10d 11156 243,67,15,16,12,152, //movss (%r8,%r11,4),%xmm1 11157 102,65,15,58,33,12,8,16, //insertps $0x10,(%r8,%rcx,1),%xmm1 11158 243,67,15,16,28,144, //movss (%r8,%r10,4),%xmm3 11159 102,15,58,33,203,32, //insertps $0x20,%xmm3,%xmm1 11160 243,67,15,16,28,136, //movss (%r8,%r9,4),%xmm3 11161 102,15,58,33,203,48, //insertps $0x30,%xmm3,%xmm1 11162 76,139,72,24, //mov 0x18(%rax),%r9 11163 102,72,15,58,22,209,1, //pextrq $0x1,%xmm2,%rcx 11164 68,15,183,193, //movzwl %cx,%r8d 11165 72,193,233,32, //shr $0x20,%rcx 11166 102,72,15,126,208, //movq %xmm2,%rax 11167 68,15,183,208, //movzwl %ax,%r10d 11168 72,193,232,30, //shr $0x1e,%rax 11169 243,67,15,16,20,145, //movss (%r9,%r10,4),%xmm2 11170 102,65,15,58,33,20,1,16, //insertps $0x10,(%r9,%rax,1),%xmm2 11171 243,67,15,16,28,129, //movss (%r9,%r8,4),%xmm3 11172 102,15,58,33,211,32, //insertps $0x20,%xmm3,%xmm2 11173 243,65,15,16,28,137, //movss (%r9,%rcx,4),%xmm3 11174 102,15,58,33,211,48, //insertps $0x30,%xmm3,%xmm2 11175 102,65,15,114,208,24, //psrld $0x18,%xmm8 11176 69,15,91,192, //cvtdq2ps %xmm8,%xmm8 11177 184,129,128,128,59, //mov $0x3b808081,%eax 11178 102,15,110,216, //movd %eax,%xmm3 11179 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 11180 65,15,89,216, //mulps %xmm8,%xmm3 11181 72,173, //lods %ds:(%rsi),%rax 11182 255,224, //jmpq *%rax 11183 }; 11184 11185 CODE const uint8_t sk_load_a8_sse41[] = { 11186 72,173, //lods %ds:(%rsi),%rax 11187 72,139,0, //mov (%rax),%rax 11188 102,15,56,49,4,56, //pmovzxbd (%rax,%rdi,1),%xmm0 11189 15,91,192, //cvtdq2ps %xmm0,%xmm0 11190 184,129,128,128,59, //mov $0x3b808081,%eax 11191 102,15,110,216, //movd %eax,%xmm3 11192 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 11193 15,89,216, //mulps %xmm0,%xmm3 11194 72,173, //lods %ds:(%rsi),%rax 11195 15,87,192, //xorps %xmm0,%xmm0 11196 15,87,201, //xorps %xmm1,%xmm1 11197 15,87,210, //xorps %xmm2,%xmm2 11198 255,224, //jmpq *%rax 11199 }; 11200 11201 CODE const uint8_t sk_store_a8_sse41[] = { 11202 72,173, //lods %ds:(%rsi),%rax 11203 72,139,0, //mov (%rax),%rax 11204 185,0,0,127,67, //mov $0x437f0000,%ecx 11205 102,68,15,110,193, //movd %ecx,%xmm8 11206 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 11207 68,15,89,195, //mulps %xmm3,%xmm8 11208 102,69,15,91,192, //cvtps2dq %xmm8,%xmm8 11209 102,69,15,56,43,192, //packusdw %xmm8,%xmm8 11210 102,69,15,103,192, //packuswb %xmm8,%xmm8 11211 102,68,15,126,4,56, //movd %xmm8,(%rax,%rdi,1) 11212 72,173, //lods %ds:(%rsi),%rax 11213 255,224, //jmpq *%rax 11214 }; 11215 11216 CODE const uint8_t sk_load_565_sse41[] = { 11217 72,173, //lods %ds:(%rsi),%rax 11218 72,139,0, //mov (%rax),%rax 11219 102,15,56,51,20,120, //pmovzxwd (%rax,%rdi,2),%xmm2 11220 184,0,248,0,0, //mov $0xf800,%eax 11221 102,15,110,192, //movd %eax,%xmm0 11222 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0 11223 102,15,219,194, //pand %xmm2,%xmm0 11224 15,91,200, //cvtdq2ps %xmm0,%xmm1 11225 184,8,33,132,55, //mov $0x37842108,%eax 11226 102,15,110,192, //movd %eax,%xmm0 11227 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 11228 15,89,193, //mulps %xmm1,%xmm0 11229 184,224,7,0,0, //mov $0x7e0,%eax 11230 102,15,110,200, //movd %eax,%xmm1 11231 102,15,112,201,0, //pshufd $0x0,%xmm1,%xmm1 11232 102,15,219,202, //pand %xmm2,%xmm1 11233 15,91,217, //cvtdq2ps %xmm1,%xmm3 11234 184,33,8,2,58, //mov $0x3a020821,%eax 11235 102,15,110,200, //movd %eax,%xmm1 11236 15,198,201,0, //shufps $0x0,%xmm1,%xmm1 11237 15,89,203, //mulps %xmm3,%xmm1 11238 184,31,0,0,0, //mov $0x1f,%eax 11239 102,15,110,216, //movd %eax,%xmm3 11240 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3 11241 102,15,219,218, //pand %xmm2,%xmm3 11242 15,91,219, //cvtdq2ps %xmm3,%xmm3 11243 184,8,33,4,61, //mov $0x3d042108,%eax 11244 102,15,110,208, //movd %eax,%xmm2 11245 15,198,210,0, //shufps $0x0,%xmm2,%xmm2 11246 15,89,211, //mulps %xmm3,%xmm2 11247 184,0,0,128,63, //mov $0x3f800000,%eax 11248 102,15,110,216, //movd %eax,%xmm3 11249 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 11250 72,173, //lods %ds:(%rsi),%rax 11251 255,224, //jmpq *%rax 11252 }; 11253 11254 CODE const uint8_t sk_store_565_sse41[] = { 11255 72,173, //lods %ds:(%rsi),%rax 11256 72,139,0, //mov (%rax),%rax 11257 185,0,0,248,65, //mov $0x41f80000,%ecx 11258 102,68,15,110,193, //movd %ecx,%xmm8 11259 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 11260 69,15,40,200, //movaps %xmm8,%xmm9 11261 68,15,89,200, //mulps %xmm0,%xmm9 11262 102,69,15,91,201, //cvtps2dq %xmm9,%xmm9 11263 102,65,15,114,241,11, //pslld $0xb,%xmm9 11264 185,0,0,124,66, //mov $0x427c0000,%ecx 11265 102,68,15,110,209, //movd %ecx,%xmm10 11266 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 11267 68,15,89,209, //mulps %xmm1,%xmm10 11268 102,69,15,91,210, //cvtps2dq %xmm10,%xmm10 11269 102,65,15,114,242,5, //pslld $0x5,%xmm10 11270 102,69,15,235,209, //por %xmm9,%xmm10 11271 68,15,89,194, //mulps %xmm2,%xmm8 11272 102,69,15,91,192, //cvtps2dq %xmm8,%xmm8 11273 102,69,15,86,194, //orpd %xmm10,%xmm8 11274 102,69,15,56,43,192, //packusdw %xmm8,%xmm8 11275 102,68,15,214,4,120, //movq %xmm8,(%rax,%rdi,2) 11276 72,173, //lods %ds:(%rsi),%rax 11277 255,224, //jmpq *%rax 11278 }; 11279 11280 CODE const uint8_t sk_load_8888_sse41[] = { 11281 72,173, //lods %ds:(%rsi),%rax 11282 72,139,0, //mov (%rax),%rax 11283 243,15,111,28,184, //movdqu (%rax,%rdi,4),%xmm3 11284 184,255,0,0,0, //mov $0xff,%eax 11285 102,15,110,192, //movd %eax,%xmm0 11286 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0 11287 102,15,111,203, //movdqa %xmm3,%xmm1 11288 102,15,114,209,8, //psrld $0x8,%xmm1 11289 102,15,219,200, //pand %xmm0,%xmm1 11290 102,15,111,211, //movdqa %xmm3,%xmm2 11291 102,15,114,210,16, //psrld $0x10,%xmm2 11292 102,15,219,208, //pand %xmm0,%xmm2 11293 102,15,219,195, //pand %xmm3,%xmm0 11294 15,91,192, //cvtdq2ps %xmm0,%xmm0 11295 184,129,128,128,59, //mov $0x3b808081,%eax 11296 102,68,15,110,192, //movd %eax,%xmm8 11297 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 11298 65,15,89,192, //mulps %xmm8,%xmm0 11299 15,91,201, //cvtdq2ps %xmm1,%xmm1 11300 65,15,89,200, //mulps %xmm8,%xmm1 11301 15,91,210, //cvtdq2ps %xmm2,%xmm2 11302 65,15,89,208, //mulps %xmm8,%xmm2 11303 102,15,114,211,24, //psrld $0x18,%xmm3 11304 15,91,219, //cvtdq2ps %xmm3,%xmm3 11305 65,15,89,216, //mulps %xmm8,%xmm3 11306 72,173, //lods %ds:(%rsi),%rax 11307 255,224, //jmpq *%rax 11308 }; 11309 11310 CODE const uint8_t sk_store_8888_sse41[] = { 11311 72,173, //lods %ds:(%rsi),%rax 11312 72,139,0, //mov (%rax),%rax 11313 185,0,0,127,67, //mov $0x437f0000,%ecx 11314 102,68,15,110,193, //movd %ecx,%xmm8 11315 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 11316 69,15,40,200, //movaps %xmm8,%xmm9 11317 68,15,89,200, //mulps %xmm0,%xmm9 11318 102,69,15,91,201, //cvtps2dq %xmm9,%xmm9 11319 69,15,40,208, //movaps %xmm8,%xmm10 11320 68,15,89,209, //mulps %xmm1,%xmm10 11321 102,69,15,91,210, //cvtps2dq %xmm10,%xmm10 11322 102,65,15,114,242,8, //pslld $0x8,%xmm10 11323 102,69,15,235,209, //por %xmm9,%xmm10 11324 69,15,40,200, //movaps %xmm8,%xmm9 11325 68,15,89,202, //mulps %xmm2,%xmm9 11326 102,69,15,91,201, //cvtps2dq %xmm9,%xmm9 11327 102,65,15,114,241,16, //pslld $0x10,%xmm9 11328 68,15,89,195, //mulps %xmm3,%xmm8 11329 102,69,15,91,192, //cvtps2dq %xmm8,%xmm8 11330 102,65,15,114,240,24, //pslld $0x18,%xmm8 11331 102,69,15,235,193, //por %xmm9,%xmm8 11332 102,69,15,235,194, //por %xmm10,%xmm8 11333 243,68,15,127,4,184, //movdqu %xmm8,(%rax,%rdi,4) 11334 72,173, //lods %ds:(%rsi),%rax 11335 255,224, //jmpq *%rax 11336 }; 11337 11338 CODE const uint8_t sk_load_f16_sse41[] = { 11339 72,173, //lods %ds:(%rsi),%rax 11340 72,139,0, //mov (%rax),%rax 11341 243,15,111,4,248, //movdqu (%rax,%rdi,8),%xmm0 11342 243,15,111,76,248,16, //movdqu 0x10(%rax,%rdi,8),%xmm1 11343 102,15,111,208, //movdqa %xmm0,%xmm2 11344 102,15,97,209, //punpcklwd %xmm1,%xmm2 11345 102,15,105,193, //punpckhwd %xmm1,%xmm0 11346 102,68,15,111,194, //movdqa %xmm2,%xmm8 11347 102,68,15,97,192, //punpcklwd %xmm0,%xmm8 11348 102,15,105,208, //punpckhwd %xmm0,%xmm2 11349 184,0,4,0,4, //mov $0x4000400,%eax 11350 102,15,110,192, //movd %eax,%xmm0 11351 102,15,112,216,0, //pshufd $0x0,%xmm0,%xmm3 11352 102,15,111,203, //movdqa %xmm3,%xmm1 11353 102,65,15,101,200, //pcmpgtw %xmm8,%xmm1 11354 102,65,15,223,200, //pandn %xmm8,%xmm1 11355 102,15,101,218, //pcmpgtw %xmm2,%xmm3 11356 102,15,223,218, //pandn %xmm2,%xmm3 11357 102,15,56,51,193, //pmovzxwd %xmm1,%xmm0 11358 102,15,114,240,13, //pslld $0xd,%xmm0 11359 184,0,0,128,119, //mov $0x77800000,%eax 11360 102,15,110,208, //movd %eax,%xmm2 11361 102,68,15,112,194,0, //pshufd $0x0,%xmm2,%xmm8 11362 65,15,89,192, //mulps %xmm8,%xmm0 11363 102,69,15,239,201, //pxor %xmm9,%xmm9 11364 102,65,15,105,201, //punpckhwd %xmm9,%xmm1 11365 102,15,114,241,13, //pslld $0xd,%xmm1 11366 65,15,89,200, //mulps %xmm8,%xmm1 11367 102,15,56,51,211, //pmovzxwd %xmm3,%xmm2 11368 102,15,114,242,13, //pslld $0xd,%xmm2 11369 65,15,89,208, //mulps %xmm8,%xmm2 11370 102,65,15,105,217, //punpckhwd %xmm9,%xmm3 11371 102,15,114,243,13, //pslld $0xd,%xmm3 11372 65,15,89,216, //mulps %xmm8,%xmm3 11373 72,173, //lods %ds:(%rsi),%rax 11374 255,224, //jmpq *%rax 11375 }; 11376 11377 CODE const uint8_t sk_store_f16_sse41[] = { 11378 72,173, //lods %ds:(%rsi),%rax 11379 72,139,0, //mov (%rax),%rax 11380 185,0,0,128,7, //mov $0x7800000,%ecx 11381 102,68,15,110,193, //movd %ecx,%xmm8 11382 102,69,15,112,192,0, //pshufd $0x0,%xmm8,%xmm8 11383 102,69,15,111,200, //movdqa %xmm8,%xmm9 11384 68,15,89,200, //mulps %xmm0,%xmm9 11385 102,65,15,114,209,13, //psrld $0xd,%xmm9 11386 102,69,15,111,208, //movdqa %xmm8,%xmm10 11387 68,15,89,209, //mulps %xmm1,%xmm10 11388 102,65,15,114,210,13, //psrld $0xd,%xmm10 11389 102,69,15,111,216, //movdqa %xmm8,%xmm11 11390 68,15,89,218, //mulps %xmm2,%xmm11 11391 102,65,15,114,211,13, //psrld $0xd,%xmm11 11392 68,15,89,195, //mulps %xmm3,%xmm8 11393 102,65,15,114,208,13, //psrld $0xd,%xmm8 11394 102,65,15,115,250,2, //pslldq $0x2,%xmm10 11395 102,69,15,235,209, //por %xmm9,%xmm10 11396 102,65,15,115,248,2, //pslldq $0x2,%xmm8 11397 102,69,15,235,195, //por %xmm11,%xmm8 11398 102,69,15,111,202, //movdqa %xmm10,%xmm9 11399 102,69,15,98,200, //punpckldq %xmm8,%xmm9 11400 243,68,15,127,12,248, //movdqu %xmm9,(%rax,%rdi,8) 11401 102,69,15,106,208, //punpckhdq %xmm8,%xmm10 11402 243,68,15,127,84,248,16, //movdqu %xmm10,0x10(%rax,%rdi,8) 11403 72,173, //lods %ds:(%rsi),%rax 11404 255,224, //jmpq *%rax 11405 }; 11406 11407 CODE const uint8_t sk_store_f32_sse41[] = { 11408 72,173, //lods %ds:(%rsi),%rax 11409 72,139,0, //mov (%rax),%rax 11410 72,137,249, //mov %rdi,%rcx 11411 72,193,225,4, //shl $0x4,%rcx 11412 68,15,40,192, //movaps %xmm0,%xmm8 11413 68,15,40,200, //movaps %xmm0,%xmm9 11414 68,15,20,201, //unpcklps %xmm1,%xmm9 11415 68,15,40,210, //movaps %xmm2,%xmm10 11416 68,15,40,218, //movaps %xmm2,%xmm11 11417 68,15,20,219, //unpcklps %xmm3,%xmm11 11418 68,15,21,193, //unpckhps %xmm1,%xmm8 11419 68,15,21,211, //unpckhps %xmm3,%xmm10 11420 69,15,40,225, //movaps %xmm9,%xmm12 11421 102,69,15,20,227, //unpcklpd %xmm11,%xmm12 11422 69,15,18,217, //movhlps %xmm9,%xmm11 11423 69,15,40,200, //movaps %xmm8,%xmm9 11424 102,69,15,20,202, //unpcklpd %xmm10,%xmm9 11425 69,15,18,208, //movhlps %xmm8,%xmm10 11426 102,68,15,17,36,8, //movupd %xmm12,(%rax,%rcx,1) 11427 68,15,17,92,8,16, //movups %xmm11,0x10(%rax,%rcx,1) 11428 102,68,15,17,76,8,32, //movupd %xmm9,0x20(%rax,%rcx,1) 11429 68,15,17,84,8,48, //movups %xmm10,0x30(%rax,%rcx,1) 11430 72,173, //lods %ds:(%rsi),%rax 11431 255,224, //jmpq *%rax 11432 }; 11433 11434 CODE const uint8_t sk_clamp_x_sse41[] = { 11435 72,173, //lods %ds:(%rsi),%rax 11436 69,15,87,192, //xorps %xmm8,%xmm8 11437 68,15,95,192, //maxps %xmm0,%xmm8 11438 243,68,15,16,8, //movss (%rax),%xmm9 11439 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 11440 102,15,118,192, //pcmpeqd %xmm0,%xmm0 11441 102,65,15,254,193, //paddd %xmm9,%xmm0 11442 68,15,93,192, //minps %xmm0,%xmm8 11443 72,173, //lods %ds:(%rsi),%rax 11444 65,15,40,192, //movaps %xmm8,%xmm0 11445 255,224, //jmpq *%rax 11446 }; 11447 11448 CODE const uint8_t sk_clamp_y_sse41[] = { 11449 72,173, //lods %ds:(%rsi),%rax 11450 69,15,87,192, //xorps %xmm8,%xmm8 11451 68,15,95,193, //maxps %xmm1,%xmm8 11452 243,68,15,16,8, //movss (%rax),%xmm9 11453 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 11454 102,15,118,201, //pcmpeqd %xmm1,%xmm1 11455 102,65,15,254,201, //paddd %xmm9,%xmm1 11456 68,15,93,193, //minps %xmm1,%xmm8 11457 72,173, //lods %ds:(%rsi),%rax 11458 65,15,40,200, //movaps %xmm8,%xmm1 11459 255,224, //jmpq *%rax 11460 }; 11461 11462 CODE const uint8_t sk_repeat_x_sse41[] = { 11463 72,173, //lods %ds:(%rsi),%rax 11464 243,68,15,16,0, //movss (%rax),%xmm8 11465 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 11466 68,15,40,200, //movaps %xmm0,%xmm9 11467 69,15,94,200, //divps %xmm8,%xmm9 11468 102,69,15,58,8,201,1, //roundps $0x1,%xmm9,%xmm9 11469 69,15,89,200, //mulps %xmm8,%xmm9 11470 65,15,92,193, //subps %xmm9,%xmm0 11471 102,69,15,118,201, //pcmpeqd %xmm9,%xmm9 11472 102,69,15,254,200, //paddd %xmm8,%xmm9 11473 65,15,93,193, //minps %xmm9,%xmm0 11474 72,173, //lods %ds:(%rsi),%rax 11475 255,224, //jmpq *%rax 11476 }; 11477 11478 CODE const uint8_t sk_repeat_y_sse41[] = { 11479 72,173, //lods %ds:(%rsi),%rax 11480 243,68,15,16,0, //movss (%rax),%xmm8 11481 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 11482 68,15,40,201, //movaps %xmm1,%xmm9 11483 69,15,94,200, //divps %xmm8,%xmm9 11484 102,69,15,58,8,201,1, //roundps $0x1,%xmm9,%xmm9 11485 69,15,89,200, //mulps %xmm8,%xmm9 11486 65,15,92,201, //subps %xmm9,%xmm1 11487 102,69,15,118,201, //pcmpeqd %xmm9,%xmm9 11488 102,69,15,254,200, //paddd %xmm8,%xmm9 11489 65,15,93,201, //minps %xmm9,%xmm1 11490 72,173, //lods %ds:(%rsi),%rax 11491 255,224, //jmpq *%rax 11492 }; 11493 11494 CODE const uint8_t sk_mirror_x_sse41[] = { 11495 72,173, //lods %ds:(%rsi),%rax 11496 243,68,15,16,0, //movss (%rax),%xmm8 11497 69,15,40,200, //movaps %xmm8,%xmm9 11498 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 11499 65,15,92,193, //subps %xmm9,%xmm0 11500 243,69,15,88,192, //addss %xmm8,%xmm8 11501 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 11502 68,15,40,208, //movaps %xmm0,%xmm10 11503 69,15,94,208, //divps %xmm8,%xmm10 11504 102,69,15,58,8,210,1, //roundps $0x1,%xmm10,%xmm10 11505 69,15,89,208, //mulps %xmm8,%xmm10 11506 65,15,92,194, //subps %xmm10,%xmm0 11507 65,15,92,193, //subps %xmm9,%xmm0 11508 69,15,87,192, //xorps %xmm8,%xmm8 11509 68,15,92,192, //subps %xmm0,%xmm8 11510 65,15,84,192, //andps %xmm8,%xmm0 11511 102,69,15,118,192, //pcmpeqd %xmm8,%xmm8 11512 102,69,15,254,193, //paddd %xmm9,%xmm8 11513 65,15,93,192, //minps %xmm8,%xmm0 11514 72,173, //lods %ds:(%rsi),%rax 11515 255,224, //jmpq *%rax 11516 }; 11517 11518 CODE const uint8_t sk_mirror_y_sse41[] = { 11519 72,173, //lods %ds:(%rsi),%rax 11520 243,68,15,16,0, //movss (%rax),%xmm8 11521 69,15,40,200, //movaps %xmm8,%xmm9 11522 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 11523 65,15,92,201, //subps %xmm9,%xmm1 11524 243,69,15,88,192, //addss %xmm8,%xmm8 11525 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 11526 68,15,40,209, //movaps %xmm1,%xmm10 11527 69,15,94,208, //divps %xmm8,%xmm10 11528 102,69,15,58,8,210,1, //roundps $0x1,%xmm10,%xmm10 11529 69,15,89,208, //mulps %xmm8,%xmm10 11530 65,15,92,202, //subps %xmm10,%xmm1 11531 65,15,92,201, //subps %xmm9,%xmm1 11532 69,15,87,192, //xorps %xmm8,%xmm8 11533 68,15,92,193, //subps %xmm1,%xmm8 11534 65,15,84,200, //andps %xmm8,%xmm1 11535 102,69,15,118,192, //pcmpeqd %xmm8,%xmm8 11536 102,69,15,254,193, //paddd %xmm9,%xmm8 11537 65,15,93,200, //minps %xmm8,%xmm1 11538 72,173, //lods %ds:(%rsi),%rax 11539 255,224, //jmpq *%rax 11540 }; 11541 11542 CODE const uint8_t sk_luminance_to_alpha_sse41[] = { 11543 184,208,179,89,62, //mov $0x3e59b3d0,%eax 11544 102,15,110,216, //movd %eax,%xmm3 11545 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 11546 15,89,216, //mulps %xmm0,%xmm3 11547 184,89,23,55,63, //mov $0x3f371759,%eax 11548 102,15,110,192, //movd %eax,%xmm0 11549 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 11550 15,89,193, //mulps %xmm1,%xmm0 11551 15,88,195, //addps %xmm3,%xmm0 11552 184,152,221,147,61, //mov $0x3d93dd98,%eax 11553 102,15,110,216, //movd %eax,%xmm3 11554 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 11555 15,89,218, //mulps %xmm2,%xmm3 11556 15,88,216, //addps %xmm0,%xmm3 11557 72,173, //lods %ds:(%rsi),%rax 11558 15,87,192, //xorps %xmm0,%xmm0 11559 15,87,201, //xorps %xmm1,%xmm1 11560 15,87,210, //xorps %xmm2,%xmm2 11561 255,224, //jmpq *%rax 11562 }; 11563 11564 CODE const uint8_t sk_matrix_2x3_sse41[] = { 11565 68,15,40,201, //movaps %xmm1,%xmm9 11566 68,15,40,192, //movaps %xmm0,%xmm8 11567 72,173, //lods %ds:(%rsi),%rax 11568 243,15,16,0, //movss (%rax),%xmm0 11569 243,15,16,72,4, //movss 0x4(%rax),%xmm1 11570 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 11571 243,68,15,16,80,8, //movss 0x8(%rax),%xmm10 11572 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 11573 243,68,15,16,88,16, //movss 0x10(%rax),%xmm11 11574 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 11575 69,15,89,209, //mulps %xmm9,%xmm10 11576 69,15,88,211, //addps %xmm11,%xmm10 11577 65,15,89,192, //mulps %xmm8,%xmm0 11578 65,15,88,194, //addps %xmm10,%xmm0 11579 15,198,201,0, //shufps $0x0,%xmm1,%xmm1 11580 243,68,15,16,80,12, //movss 0xc(%rax),%xmm10 11581 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 11582 243,68,15,16,88,20, //movss 0x14(%rax),%xmm11 11583 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 11584 69,15,89,209, //mulps %xmm9,%xmm10 11585 69,15,88,211, //addps %xmm11,%xmm10 11586 65,15,89,200, //mulps %xmm8,%xmm1 11587 65,15,88,202, //addps %xmm10,%xmm1 11588 72,173, //lods %ds:(%rsi),%rax 11589 255,224, //jmpq *%rax 11590 }; 11591 11592 CODE const uint8_t sk_matrix_3x4_sse41[] = { 11593 68,15,40,201, //movaps %xmm1,%xmm9 11594 68,15,40,192, //movaps %xmm0,%xmm8 11595 72,173, //lods %ds:(%rsi),%rax 11596 243,15,16,0, //movss (%rax),%xmm0 11597 243,15,16,72,4, //movss 0x4(%rax),%xmm1 11598 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 11599 243,68,15,16,80,12, //movss 0xc(%rax),%xmm10 11600 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 11601 243,68,15,16,88,24, //movss 0x18(%rax),%xmm11 11602 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 11603 243,68,15,16,96,36, //movss 0x24(%rax),%xmm12 11604 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 11605 68,15,89,218, //mulps %xmm2,%xmm11 11606 69,15,88,220, //addps %xmm12,%xmm11 11607 69,15,89,209, //mulps %xmm9,%xmm10 11608 69,15,88,211, //addps %xmm11,%xmm10 11609 65,15,89,192, //mulps %xmm8,%xmm0 11610 65,15,88,194, //addps %xmm10,%xmm0 11611 15,198,201,0, //shufps $0x0,%xmm1,%xmm1 11612 243,68,15,16,80,16, //movss 0x10(%rax),%xmm10 11613 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 11614 243,68,15,16,88,28, //movss 0x1c(%rax),%xmm11 11615 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 11616 243,68,15,16,96,40, //movss 0x28(%rax),%xmm12 11617 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 11618 68,15,89,218, //mulps %xmm2,%xmm11 11619 69,15,88,220, //addps %xmm12,%xmm11 11620 69,15,89,209, //mulps %xmm9,%xmm10 11621 69,15,88,211, //addps %xmm11,%xmm10 11622 65,15,89,200, //mulps %xmm8,%xmm1 11623 65,15,88,202, //addps %xmm10,%xmm1 11624 243,68,15,16,80,8, //movss 0x8(%rax),%xmm10 11625 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 11626 243,68,15,16,88,20, //movss 0x14(%rax),%xmm11 11627 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 11628 243,68,15,16,96,32, //movss 0x20(%rax),%xmm12 11629 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 11630 243,68,15,16,104,44, //movss 0x2c(%rax),%xmm13 11631 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13 11632 68,15,89,226, //mulps %xmm2,%xmm12 11633 69,15,88,229, //addps %xmm13,%xmm12 11634 69,15,89,217, //mulps %xmm9,%xmm11 11635 69,15,88,220, //addps %xmm12,%xmm11 11636 69,15,89,208, //mulps %xmm8,%xmm10 11637 69,15,88,211, //addps %xmm11,%xmm10 11638 72,173, //lods %ds:(%rsi),%rax 11639 65,15,40,210, //movaps %xmm10,%xmm2 11640 255,224, //jmpq *%rax 11641 }; 11642 11643 CODE const uint8_t sk_matrix_4x5_sse41[] = { 11644 68,15,40,201, //movaps %xmm1,%xmm9 11645 68,15,40,192, //movaps %xmm0,%xmm8 11646 72,173, //lods %ds:(%rsi),%rax 11647 243,15,16,0, //movss (%rax),%xmm0 11648 243,15,16,72,4, //movss 0x4(%rax),%xmm1 11649 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 11650 243,68,15,16,80,16, //movss 0x10(%rax),%xmm10 11651 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 11652 243,68,15,16,88,32, //movss 0x20(%rax),%xmm11 11653 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 11654 243,68,15,16,96,48, //movss 0x30(%rax),%xmm12 11655 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 11656 243,68,15,16,104,64, //movss 0x40(%rax),%xmm13 11657 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13 11658 68,15,89,227, //mulps %xmm3,%xmm12 11659 69,15,88,229, //addps %xmm13,%xmm12 11660 68,15,89,218, //mulps %xmm2,%xmm11 11661 69,15,88,220, //addps %xmm12,%xmm11 11662 69,15,89,209, //mulps %xmm9,%xmm10 11663 69,15,88,211, //addps %xmm11,%xmm10 11664 65,15,89,192, //mulps %xmm8,%xmm0 11665 65,15,88,194, //addps %xmm10,%xmm0 11666 15,198,201,0, //shufps $0x0,%xmm1,%xmm1 11667 243,68,15,16,80,20, //movss 0x14(%rax),%xmm10 11668 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 11669 243,68,15,16,88,36, //movss 0x24(%rax),%xmm11 11670 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 11671 243,68,15,16,96,52, //movss 0x34(%rax),%xmm12 11672 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 11673 243,68,15,16,104,68, //movss 0x44(%rax),%xmm13 11674 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13 11675 68,15,89,227, //mulps %xmm3,%xmm12 11676 69,15,88,229, //addps %xmm13,%xmm12 11677 68,15,89,218, //mulps %xmm2,%xmm11 11678 69,15,88,220, //addps %xmm12,%xmm11 11679 69,15,89,209, //mulps %xmm9,%xmm10 11680 69,15,88,211, //addps %xmm11,%xmm10 11681 65,15,89,200, //mulps %xmm8,%xmm1 11682 65,15,88,202, //addps %xmm10,%xmm1 11683 243,68,15,16,80,8, //movss 0x8(%rax),%xmm10 11684 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 11685 243,68,15,16,88,24, //movss 0x18(%rax),%xmm11 11686 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 11687 243,68,15,16,96,40, //movss 0x28(%rax),%xmm12 11688 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 11689 243,68,15,16,104,56, //movss 0x38(%rax),%xmm13 11690 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13 11691 243,68,15,16,112,72, //movss 0x48(%rax),%xmm14 11692 69,15,198,246,0, //shufps $0x0,%xmm14,%xmm14 11693 68,15,89,235, //mulps %xmm3,%xmm13 11694 69,15,88,238, //addps %xmm14,%xmm13 11695 68,15,89,226, //mulps %xmm2,%xmm12 11696 69,15,88,229, //addps %xmm13,%xmm12 11697 69,15,89,217, //mulps %xmm9,%xmm11 11698 69,15,88,220, //addps %xmm12,%xmm11 11699 69,15,89,208, //mulps %xmm8,%xmm10 11700 69,15,88,211, //addps %xmm11,%xmm10 11701 243,68,15,16,88,12, //movss 0xc(%rax),%xmm11 11702 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 11703 243,68,15,16,96,28, //movss 0x1c(%rax),%xmm12 11704 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 11705 243,68,15,16,104,44, //movss 0x2c(%rax),%xmm13 11706 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13 11707 243,68,15,16,112,60, //movss 0x3c(%rax),%xmm14 11708 69,15,198,246,0, //shufps $0x0,%xmm14,%xmm14 11709 243,68,15,16,120,76, //movss 0x4c(%rax),%xmm15 11710 69,15,198,255,0, //shufps $0x0,%xmm15,%xmm15 11711 68,15,89,243, //mulps %xmm3,%xmm14 11712 69,15,88,247, //addps %xmm15,%xmm14 11713 68,15,89,234, //mulps %xmm2,%xmm13 11714 69,15,88,238, //addps %xmm14,%xmm13 11715 69,15,89,225, //mulps %xmm9,%xmm12 11716 69,15,88,229, //addps %xmm13,%xmm12 11717 69,15,89,216, //mulps %xmm8,%xmm11 11718 69,15,88,220, //addps %xmm12,%xmm11 11719 72,173, //lods %ds:(%rsi),%rax 11720 65,15,40,210, //movaps %xmm10,%xmm2 11721 65,15,40,219, //movaps %xmm11,%xmm3 11722 255,224, //jmpq *%rax 11723 }; 11724 11725 CODE const uint8_t sk_matrix_perspective_sse41[] = { 11726 68,15,40,192, //movaps %xmm0,%xmm8 11727 72,173, //lods %ds:(%rsi),%rax 11728 243,15,16,0, //movss (%rax),%xmm0 11729 243,68,15,16,72,4, //movss 0x4(%rax),%xmm9 11730 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 11731 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 11732 243,68,15,16,80,8, //movss 0x8(%rax),%xmm10 11733 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 11734 68,15,89,201, //mulps %xmm1,%xmm9 11735 69,15,88,202, //addps %xmm10,%xmm9 11736 65,15,89,192, //mulps %xmm8,%xmm0 11737 65,15,88,193, //addps %xmm9,%xmm0 11738 243,68,15,16,72,12, //movss 0xc(%rax),%xmm9 11739 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 11740 243,68,15,16,80,16, //movss 0x10(%rax),%xmm10 11741 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 11742 243,68,15,16,88,20, //movss 0x14(%rax),%xmm11 11743 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 11744 68,15,89,209, //mulps %xmm1,%xmm10 11745 69,15,88,211, //addps %xmm11,%xmm10 11746 69,15,89,200, //mulps %xmm8,%xmm9 11747 69,15,88,202, //addps %xmm10,%xmm9 11748 243,68,15,16,80,24, //movss 0x18(%rax),%xmm10 11749 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 11750 243,68,15,16,88,28, //movss 0x1c(%rax),%xmm11 11751 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 11752 243,68,15,16,96,32, //movss 0x20(%rax),%xmm12 11753 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 11754 68,15,89,217, //mulps %xmm1,%xmm11 11755 69,15,88,220, //addps %xmm12,%xmm11 11756 69,15,89,208, //mulps %xmm8,%xmm10 11757 69,15,88,211, //addps %xmm11,%xmm10 11758 65,15,83,202, //rcpps %xmm10,%xmm1 11759 15,89,193, //mulps %xmm1,%xmm0 11760 68,15,89,201, //mulps %xmm1,%xmm9 11761 72,173, //lods %ds:(%rsi),%rax 11762 65,15,40,201, //movaps %xmm9,%xmm1 11763 255,224, //jmpq *%rax 11764 }; 11765 11766 CODE const uint8_t sk_linear_gradient_2stops_sse41[] = { 11767 72,173, //lods %ds:(%rsi),%rax 11768 68,15,16,8, //movups (%rax),%xmm9 11769 15,16,88,16, //movups 0x10(%rax),%xmm3 11770 68,15,40,195, //movaps %xmm3,%xmm8 11771 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 11772 65,15,40,201, //movaps %xmm9,%xmm1 11773 15,198,201,0, //shufps $0x0,%xmm1,%xmm1 11774 68,15,89,192, //mulps %xmm0,%xmm8 11775 68,15,88,193, //addps %xmm1,%xmm8 11776 15,40,203, //movaps %xmm3,%xmm1 11777 15,198,201,85, //shufps $0x55,%xmm1,%xmm1 11778 65,15,40,209, //movaps %xmm9,%xmm2 11779 15,198,210,85, //shufps $0x55,%xmm2,%xmm2 11780 15,89,200, //mulps %xmm0,%xmm1 11781 15,88,202, //addps %xmm2,%xmm1 11782 15,40,211, //movaps %xmm3,%xmm2 11783 15,198,210,170, //shufps $0xaa,%xmm2,%xmm2 11784 69,15,40,209, //movaps %xmm9,%xmm10 11785 69,15,198,210,170, //shufps $0xaa,%xmm10,%xmm10 11786 15,89,208, //mulps %xmm0,%xmm2 11787 65,15,88,210, //addps %xmm10,%xmm2 11788 15,198,219,255, //shufps $0xff,%xmm3,%xmm3 11789 69,15,198,201,255, //shufps $0xff,%xmm9,%xmm9 11790 15,89,216, //mulps %xmm0,%xmm3 11791 65,15,88,217, //addps %xmm9,%xmm3 11792 72,173, //lods %ds:(%rsi),%rax 11793 65,15,40,192, //movaps %xmm8,%xmm0 11794 255,224, //jmpq *%rax 11795 }; 11796 11797 CODE const uint8_t sk_start_pipeline_sse2[] = { 11798 65,87, //push %r15 11799 65,86, //push %r14 11800 65,85, //push %r13 11801 65,84, //push %r12 11802 86, //push %rsi 11803 87, //push %rdi 11804 83, //push %rbx 11805 72,129,236,160,0,0,0, //sub $0xa0,%rsp 11806 68,15,41,188,36,144,0,0,0, //movaps %xmm15,0x90(%rsp) 11807 68,15,41,180,36,128,0,0,0, //movaps %xmm14,0x80(%rsp) 11808 68,15,41,108,36,112, //movaps %xmm13,0x70(%rsp) 11809 68,15,41,100,36,96, //movaps %xmm12,0x60(%rsp) 11810 68,15,41,92,36,80, //movaps %xmm11,0x50(%rsp) 11811 68,15,41,84,36,64, //movaps %xmm10,0x40(%rsp) 11812 68,15,41,76,36,48, //movaps %xmm9,0x30(%rsp) 11813 68,15,41,68,36,32, //movaps %xmm8,0x20(%rsp) 11814 15,41,124,36,16, //movaps %xmm7,0x10(%rsp) 11815 15,41,52,36, //movaps %xmm6,(%rsp) 11816 77,137,207, //mov %r9,%r15 11817 77,137,198, //mov %r8,%r14 11818 72,137,203, //mov %rcx,%rbx 11819 72,137,214, //mov %rdx,%rsi 11820 72,173, //lods %ds:(%rsi),%rax 11821 73,137,196, //mov %rax,%r12 11822 73,137,245, //mov %rsi,%r13 11823 72,141,67,4, //lea 0x4(%rbx),%rax 11824 76,57,248, //cmp %r15,%rax 11825 118,5, //jbe 73 <_sk_start_pipeline_sse2+0x73> 11826 72,137,216, //mov %rbx,%rax 11827 235,52, //jmp a7 <_sk_start_pipeline_sse2+0xa7> 11828 15,87,192, //xorps %xmm0,%xmm0 11829 15,87,201, //xorps %xmm1,%xmm1 11830 15,87,210, //xorps %xmm2,%xmm2 11831 15,87,219, //xorps %xmm3,%xmm3 11832 15,87,228, //xorps %xmm4,%xmm4 11833 15,87,237, //xorps %xmm5,%xmm5 11834 15,87,246, //xorps %xmm6,%xmm6 11835 15,87,255, //xorps %xmm7,%xmm7 11836 72,137,223, //mov %rbx,%rdi 11837 76,137,238, //mov %r13,%rsi 11838 76,137,242, //mov %r14,%rdx 11839 65,255,212, //callq *%r12 11840 72,141,67,4, //lea 0x4(%rbx),%rax 11841 72,131,195,8, //add $0x8,%rbx 11842 76,57,251, //cmp %r15,%rbx 11843 72,137,195, //mov %rax,%rbx 11844 118,204, //jbe 73 <_sk_start_pipeline_sse2+0x73> 11845 15,40,52,36, //movaps (%rsp),%xmm6 11846 15,40,124,36,16, //movaps 0x10(%rsp),%xmm7 11847 68,15,40,68,36,32, //movaps 0x20(%rsp),%xmm8 11848 68,15,40,76,36,48, //movaps 0x30(%rsp),%xmm9 11849 68,15,40,84,36,64, //movaps 0x40(%rsp),%xmm10 11850 68,15,40,92,36,80, //movaps 0x50(%rsp),%xmm11 11851 68,15,40,100,36,96, //movaps 0x60(%rsp),%xmm12 11852 68,15,40,108,36,112, //movaps 0x70(%rsp),%xmm13 11853 68,15,40,180,36,128,0,0,0, //movaps 0x80(%rsp),%xmm14 11854 68,15,40,188,36,144,0,0,0, //movaps 0x90(%rsp),%xmm15 11855 72,129,196,160,0,0,0, //add $0xa0,%rsp 11856 91, //pop %rbx 11857 95, //pop %rdi 11858 94, //pop %rsi 11859 65,92, //pop %r12 11860 65,93, //pop %r13 11861 65,94, //pop %r14 11862 65,95, //pop %r15 11863 195, //retq 11864 }; 11865 11866 CODE const uint8_t sk_just_return_sse2[] = { 11867 195, //retq 11868 }; 11869 11870 CODE const uint8_t sk_seed_shader_sse2[] = { 11871 72,173, //lods %ds:(%rsi),%rax 11872 102,15,110,199, //movd %edi,%xmm0 11873 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0 11874 15,91,200, //cvtdq2ps %xmm0,%xmm1 11875 185,0,0,0,63, //mov $0x3f000000,%ecx 11876 102,15,110,209, //movd %ecx,%xmm2 11877 15,198,210,0, //shufps $0x0,%xmm2,%xmm2 11878 15,88,202, //addps %xmm2,%xmm1 11879 15,16,2, //movups (%rdx),%xmm0 11880 15,88,193, //addps %xmm1,%xmm0 11881 102,15,110,8, //movd (%rax),%xmm1 11882 102,15,112,201,0, //pshufd $0x0,%xmm1,%xmm1 11883 15,91,201, //cvtdq2ps %xmm1,%xmm1 11884 15,88,202, //addps %xmm2,%xmm1 11885 184,0,0,128,63, //mov $0x3f800000,%eax 11886 102,15,110,208, //movd %eax,%xmm2 11887 15,198,210,0, //shufps $0x0,%xmm2,%xmm2 11888 72,173, //lods %ds:(%rsi),%rax 11889 15,87,219, //xorps %xmm3,%xmm3 11890 15,87,228, //xorps %xmm4,%xmm4 11891 15,87,237, //xorps %xmm5,%xmm5 11892 15,87,246, //xorps %xmm6,%xmm6 11893 15,87,255, //xorps %xmm7,%xmm7 11894 255,224, //jmpq *%rax 11895 }; 11896 11897 CODE const uint8_t sk_constant_color_sse2[] = { 11898 72,173, //lods %ds:(%rsi),%rax 11899 15,16,24, //movups (%rax),%xmm3 11900 15,40,195, //movaps %xmm3,%xmm0 11901 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 11902 15,40,203, //movaps %xmm3,%xmm1 11903 15,198,201,85, //shufps $0x55,%xmm1,%xmm1 11904 15,40,211, //movaps %xmm3,%xmm2 11905 15,198,210,170, //shufps $0xaa,%xmm2,%xmm2 11906 15,198,219,255, //shufps $0xff,%xmm3,%xmm3 11907 72,173, //lods %ds:(%rsi),%rax 11908 255,224, //jmpq *%rax 11909 }; 11910 11911 CODE const uint8_t sk_clear_sse2[] = { 11912 72,173, //lods %ds:(%rsi),%rax 11913 15,87,192, //xorps %xmm0,%xmm0 11914 15,87,201, //xorps %xmm1,%xmm1 11915 15,87,210, //xorps %xmm2,%xmm2 11916 15,87,219, //xorps %xmm3,%xmm3 11917 255,224, //jmpq *%rax 11918 }; 11919 11920 CODE const uint8_t sk_plus__sse2[] = { 11921 15,88,196, //addps %xmm4,%xmm0 11922 15,88,205, //addps %xmm5,%xmm1 11923 15,88,214, //addps %xmm6,%xmm2 11924 15,88,223, //addps %xmm7,%xmm3 11925 72,173, //lods %ds:(%rsi),%rax 11926 255,224, //jmpq *%rax 11927 }; 11928 11929 CODE const uint8_t sk_srcover_sse2[] = { 11930 184,0,0,128,63, //mov $0x3f800000,%eax 11931 102,68,15,110,192, //movd %eax,%xmm8 11932 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 11933 68,15,92,195, //subps %xmm3,%xmm8 11934 69,15,40,200, //movaps %xmm8,%xmm9 11935 68,15,89,204, //mulps %xmm4,%xmm9 11936 65,15,88,193, //addps %xmm9,%xmm0 11937 69,15,40,200, //movaps %xmm8,%xmm9 11938 68,15,89,205, //mulps %xmm5,%xmm9 11939 65,15,88,201, //addps %xmm9,%xmm1 11940 69,15,40,200, //movaps %xmm8,%xmm9 11941 68,15,89,206, //mulps %xmm6,%xmm9 11942 65,15,88,209, //addps %xmm9,%xmm2 11943 68,15,89,199, //mulps %xmm7,%xmm8 11944 65,15,88,216, //addps %xmm8,%xmm3 11945 72,173, //lods %ds:(%rsi),%rax 11946 255,224, //jmpq *%rax 11947 }; 11948 11949 CODE const uint8_t sk_dstover_sse2[] = { 11950 184,0,0,128,63, //mov $0x3f800000,%eax 11951 102,68,15,110,192, //movd %eax,%xmm8 11952 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 11953 68,15,92,199, //subps %xmm7,%xmm8 11954 65,15,89,192, //mulps %xmm8,%xmm0 11955 15,88,196, //addps %xmm4,%xmm0 11956 65,15,89,200, //mulps %xmm8,%xmm1 11957 15,88,205, //addps %xmm5,%xmm1 11958 65,15,89,208, //mulps %xmm8,%xmm2 11959 15,88,214, //addps %xmm6,%xmm2 11960 65,15,89,216, //mulps %xmm8,%xmm3 11961 15,88,223, //addps %xmm7,%xmm3 11962 72,173, //lods %ds:(%rsi),%rax 11963 255,224, //jmpq *%rax 11964 }; 11965 11966 CODE const uint8_t sk_clamp_0_sse2[] = { 11967 69,15,87,192, //xorps %xmm8,%xmm8 11968 65,15,95,192, //maxps %xmm8,%xmm0 11969 65,15,95,200, //maxps %xmm8,%xmm1 11970 65,15,95,208, //maxps %xmm8,%xmm2 11971 65,15,95,216, //maxps %xmm8,%xmm3 11972 72,173, //lods %ds:(%rsi),%rax 11973 255,224, //jmpq *%rax 11974 }; 11975 11976 CODE const uint8_t sk_clamp_1_sse2[] = { 11977 184,0,0,128,63, //mov $0x3f800000,%eax 11978 102,68,15,110,192, //movd %eax,%xmm8 11979 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 11980 65,15,93,192, //minps %xmm8,%xmm0 11981 65,15,93,200, //minps %xmm8,%xmm1 11982 65,15,93,208, //minps %xmm8,%xmm2 11983 65,15,93,216, //minps %xmm8,%xmm3 11984 72,173, //lods %ds:(%rsi),%rax 11985 255,224, //jmpq *%rax 11986 }; 11987 11988 CODE const uint8_t sk_clamp_a_sse2[] = { 11989 184,0,0,128,63, //mov $0x3f800000,%eax 11990 102,68,15,110,192, //movd %eax,%xmm8 11991 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 11992 65,15,93,216, //minps %xmm8,%xmm3 11993 15,93,195, //minps %xmm3,%xmm0 11994 15,93,203, //minps %xmm3,%xmm1 11995 15,93,211, //minps %xmm3,%xmm2 11996 72,173, //lods %ds:(%rsi),%rax 11997 255,224, //jmpq *%rax 11998 }; 11999 12000 CODE const uint8_t sk_set_rgb_sse2[] = { 12001 72,173, //lods %ds:(%rsi),%rax 12002 243,15,16,0, //movss (%rax),%xmm0 12003 243,15,16,72,4, //movss 0x4(%rax),%xmm1 12004 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 12005 15,198,201,0, //shufps $0x0,%xmm1,%xmm1 12006 243,15,16,80,8, //movss 0x8(%rax),%xmm2 12007 15,198,210,0, //shufps $0x0,%xmm2,%xmm2 12008 72,173, //lods %ds:(%rsi),%rax 12009 255,224, //jmpq *%rax 12010 }; 12011 12012 CODE const uint8_t sk_swap_rb_sse2[] = { 12013 68,15,40,192, //movaps %xmm0,%xmm8 12014 72,173, //lods %ds:(%rsi),%rax 12015 15,40,194, //movaps %xmm2,%xmm0 12016 65,15,40,208, //movaps %xmm8,%xmm2 12017 255,224, //jmpq *%rax 12018 }; 12019 12020 CODE const uint8_t sk_swap_sse2[] = { 12021 68,15,40,195, //movaps %xmm3,%xmm8 12022 68,15,40,202, //movaps %xmm2,%xmm9 12023 68,15,40,209, //movaps %xmm1,%xmm10 12024 68,15,40,216, //movaps %xmm0,%xmm11 12025 72,173, //lods %ds:(%rsi),%rax 12026 15,40,196, //movaps %xmm4,%xmm0 12027 15,40,205, //movaps %xmm5,%xmm1 12028 15,40,214, //movaps %xmm6,%xmm2 12029 15,40,223, //movaps %xmm7,%xmm3 12030 65,15,40,227, //movaps %xmm11,%xmm4 12031 65,15,40,234, //movaps %xmm10,%xmm5 12032 65,15,40,241, //movaps %xmm9,%xmm6 12033 65,15,40,248, //movaps %xmm8,%xmm7 12034 255,224, //jmpq *%rax 12035 }; 12036 12037 CODE const uint8_t sk_move_src_dst_sse2[] = { 12038 72,173, //lods %ds:(%rsi),%rax 12039 15,40,224, //movaps %xmm0,%xmm4 12040 15,40,233, //movaps %xmm1,%xmm5 12041 15,40,242, //movaps %xmm2,%xmm6 12042 15,40,251, //movaps %xmm3,%xmm7 12043 255,224, //jmpq *%rax 12044 }; 12045 12046 CODE const uint8_t sk_move_dst_src_sse2[] = { 12047 72,173, //lods %ds:(%rsi),%rax 12048 15,40,196, //movaps %xmm4,%xmm0 12049 15,40,205, //movaps %xmm5,%xmm1 12050 15,40,214, //movaps %xmm6,%xmm2 12051 15,40,223, //movaps %xmm7,%xmm3 12052 255,224, //jmpq *%rax 12053 }; 12054 12055 CODE const uint8_t sk_premul_sse2[] = { 12056 15,89,195, //mulps %xmm3,%xmm0 12057 15,89,203, //mulps %xmm3,%xmm1 12058 15,89,211, //mulps %xmm3,%xmm2 12059 72,173, //lods %ds:(%rsi),%rax 12060 255,224, //jmpq *%rax 12061 }; 12062 12063 CODE const uint8_t sk_unpremul_sse2[] = { 12064 69,15,87,192, //xorps %xmm8,%xmm8 12065 184,0,0,128,63, //mov $0x3f800000,%eax 12066 102,68,15,110,200, //movd %eax,%xmm9 12067 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 12068 68,15,94,203, //divps %xmm3,%xmm9 12069 68,15,194,195,4, //cmpneqps %xmm3,%xmm8 12070 69,15,84,193, //andps %xmm9,%xmm8 12071 65,15,89,192, //mulps %xmm8,%xmm0 12072 65,15,89,200, //mulps %xmm8,%xmm1 12073 65,15,89,208, //mulps %xmm8,%xmm2 12074 72,173, //lods %ds:(%rsi),%rax 12075 255,224, //jmpq *%rax 12076 }; 12077 12078 CODE const uint8_t sk_from_srgb_sse2[] = { 12079 184,145,131,158,61, //mov $0x3d9e8391,%eax 12080 102,68,15,110,192, //movd %eax,%xmm8 12081 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 12082 69,15,40,232, //movaps %xmm8,%xmm13 12083 68,15,89,232, //mulps %xmm0,%xmm13 12084 68,15,40,224, //movaps %xmm0,%xmm12 12085 69,15,89,228, //mulps %xmm12,%xmm12 12086 184,154,153,153,62, //mov $0x3e99999a,%eax 12087 102,68,15,110,200, //movd %eax,%xmm9 12088 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 12089 184,92,143,50,63, //mov $0x3f328f5c,%eax 12090 102,68,15,110,208, //movd %eax,%xmm10 12091 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 12092 69,15,40,241, //movaps %xmm9,%xmm14 12093 68,15,89,240, //mulps %xmm0,%xmm14 12094 69,15,88,242, //addps %xmm10,%xmm14 12095 184,10,215,35,59, //mov $0x3b23d70a,%eax 12096 102,68,15,110,216, //movd %eax,%xmm11 12097 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 12098 69,15,89,244, //mulps %xmm12,%xmm14 12099 69,15,88,243, //addps %xmm11,%xmm14 12100 184,174,71,97,61, //mov $0x3d6147ae,%eax 12101 102,68,15,110,224, //movd %eax,%xmm12 12102 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 12103 65,15,194,196,1, //cmpltps %xmm12,%xmm0 12104 68,15,84,232, //andps %xmm0,%xmm13 12105 65,15,85,198, //andnps %xmm14,%xmm0 12106 65,15,86,197, //orps %xmm13,%xmm0 12107 69,15,40,232, //movaps %xmm8,%xmm13 12108 68,15,89,233, //mulps %xmm1,%xmm13 12109 68,15,40,241, //movaps %xmm1,%xmm14 12110 69,15,89,246, //mulps %xmm14,%xmm14 12111 69,15,40,249, //movaps %xmm9,%xmm15 12112 68,15,89,249, //mulps %xmm1,%xmm15 12113 69,15,88,250, //addps %xmm10,%xmm15 12114 69,15,89,254, //mulps %xmm14,%xmm15 12115 69,15,88,251, //addps %xmm11,%xmm15 12116 65,15,194,204,1, //cmpltps %xmm12,%xmm1 12117 68,15,84,233, //andps %xmm1,%xmm13 12118 65,15,85,207, //andnps %xmm15,%xmm1 12119 65,15,86,205, //orps %xmm13,%xmm1 12120 68,15,89,194, //mulps %xmm2,%xmm8 12121 68,15,40,234, //movaps %xmm2,%xmm13 12122 69,15,89,237, //mulps %xmm13,%xmm13 12123 68,15,89,202, //mulps %xmm2,%xmm9 12124 69,15,88,202, //addps %xmm10,%xmm9 12125 69,15,89,205, //mulps %xmm13,%xmm9 12126 69,15,88,203, //addps %xmm11,%xmm9 12127 65,15,194,212,1, //cmpltps %xmm12,%xmm2 12128 68,15,84,194, //andps %xmm2,%xmm8 12129 65,15,85,209, //andnps %xmm9,%xmm2 12130 65,15,86,208, //orps %xmm8,%xmm2 12131 72,173, //lods %ds:(%rsi),%rax 12132 255,224, //jmpq *%rax 12133 }; 12134 12135 CODE const uint8_t sk_to_srgb_sse2[] = { 12136 68,15,82,192, //rsqrtps %xmm0,%xmm8 12137 69,15,83,248, //rcpps %xmm8,%xmm15 12138 69,15,82,232, //rsqrtps %xmm8,%xmm13 12139 184,41,92,71,65, //mov $0x41475c29,%eax 12140 102,68,15,110,192, //movd %eax,%xmm8 12141 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 12142 69,15,40,240, //movaps %xmm8,%xmm14 12143 68,15,89,240, //mulps %xmm0,%xmm14 12144 184,0,0,128,63, //mov $0x3f800000,%eax 12145 102,68,15,110,200, //movd %eax,%xmm9 12146 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 12147 184,194,135,210,62, //mov $0x3ed287c2,%eax 12148 102,68,15,110,208, //movd %eax,%xmm10 12149 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 12150 184,206,111,48,63, //mov $0x3f306fce,%eax 12151 102,68,15,110,216, //movd %eax,%xmm11 12152 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 12153 184,168,87,202,61, //mov $0x3dca57a8,%eax 12154 53,0,0,0,128, //xor $0x80000000,%eax 12155 102,68,15,110,224, //movd %eax,%xmm12 12156 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 12157 69,15,89,251, //mulps %xmm11,%xmm15 12158 69,15,88,252, //addps %xmm12,%xmm15 12159 69,15,89,234, //mulps %xmm10,%xmm13 12160 69,15,88,239, //addps %xmm15,%xmm13 12161 69,15,40,249, //movaps %xmm9,%xmm15 12162 69,15,93,253, //minps %xmm13,%xmm15 12163 184,4,231,140,59, //mov $0x3b8ce704,%eax 12164 102,68,15,110,232, //movd %eax,%xmm13 12165 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13 12166 65,15,194,197,1, //cmpltps %xmm13,%xmm0 12167 68,15,84,240, //andps %xmm0,%xmm14 12168 65,15,85,199, //andnps %xmm15,%xmm0 12169 65,15,86,198, //orps %xmm14,%xmm0 12170 68,15,82,241, //rsqrtps %xmm1,%xmm14 12171 69,15,83,254, //rcpps %xmm14,%xmm15 12172 69,15,82,246, //rsqrtps %xmm14,%xmm14 12173 69,15,89,251, //mulps %xmm11,%xmm15 12174 69,15,88,252, //addps %xmm12,%xmm15 12175 69,15,89,242, //mulps %xmm10,%xmm14 12176 69,15,88,247, //addps %xmm15,%xmm14 12177 69,15,40,249, //movaps %xmm9,%xmm15 12178 69,15,93,254, //minps %xmm14,%xmm15 12179 69,15,40,240, //movaps %xmm8,%xmm14 12180 68,15,89,241, //mulps %xmm1,%xmm14 12181 65,15,194,205,1, //cmpltps %xmm13,%xmm1 12182 68,15,84,241, //andps %xmm1,%xmm14 12183 65,15,85,207, //andnps %xmm15,%xmm1 12184 65,15,86,206, //orps %xmm14,%xmm1 12185 68,15,82,242, //rsqrtps %xmm2,%xmm14 12186 69,15,83,254, //rcpps %xmm14,%xmm15 12187 69,15,89,251, //mulps %xmm11,%xmm15 12188 69,15,88,252, //addps %xmm12,%xmm15 12189 69,15,82,222, //rsqrtps %xmm14,%xmm11 12190 69,15,89,218, //mulps %xmm10,%xmm11 12191 69,15,88,223, //addps %xmm15,%xmm11 12192 69,15,93,203, //minps %xmm11,%xmm9 12193 68,15,89,194, //mulps %xmm2,%xmm8 12194 65,15,194,213,1, //cmpltps %xmm13,%xmm2 12195 68,15,84,194, //andps %xmm2,%xmm8 12196 65,15,85,209, //andnps %xmm9,%xmm2 12197 65,15,86,208, //orps %xmm8,%xmm2 12198 72,173, //lods %ds:(%rsi),%rax 12199 255,224, //jmpq *%rax 12200 }; 12201 12202 CODE const uint8_t sk_scale_1_float_sse2[] = { 12203 72,173, //lods %ds:(%rsi),%rax 12204 243,68,15,16,0, //movss (%rax),%xmm8 12205 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 12206 65,15,89,192, //mulps %xmm8,%xmm0 12207 65,15,89,200, //mulps %xmm8,%xmm1 12208 65,15,89,208, //mulps %xmm8,%xmm2 12209 65,15,89,216, //mulps %xmm8,%xmm3 12210 72,173, //lods %ds:(%rsi),%rax 12211 255,224, //jmpq *%rax 12212 }; 12213 12214 CODE const uint8_t sk_scale_u8_sse2[] = { 12215 72,173, //lods %ds:(%rsi),%rax 12216 72,139,0, //mov (%rax),%rax 12217 102,68,15,110,4,56, //movd (%rax,%rdi,1),%xmm8 12218 102,69,15,239,201, //pxor %xmm9,%xmm9 12219 102,69,15,96,193, //punpcklbw %xmm9,%xmm8 12220 102,69,15,97,193, //punpcklwd %xmm9,%xmm8 12221 69,15,91,192, //cvtdq2ps %xmm8,%xmm8 12222 184,129,128,128,59, //mov $0x3b808081,%eax 12223 102,68,15,110,200, //movd %eax,%xmm9 12224 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 12225 69,15,89,200, //mulps %xmm8,%xmm9 12226 65,15,89,193, //mulps %xmm9,%xmm0 12227 65,15,89,201, //mulps %xmm9,%xmm1 12228 65,15,89,209, //mulps %xmm9,%xmm2 12229 65,15,89,217, //mulps %xmm9,%xmm3 12230 72,173, //lods %ds:(%rsi),%rax 12231 255,224, //jmpq *%rax 12232 }; 12233 12234 CODE const uint8_t sk_lerp_1_float_sse2[] = { 12235 72,173, //lods %ds:(%rsi),%rax 12236 243,68,15,16,0, //movss (%rax),%xmm8 12237 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 12238 15,92,196, //subps %xmm4,%xmm0 12239 65,15,89,192, //mulps %xmm8,%xmm0 12240 15,88,196, //addps %xmm4,%xmm0 12241 15,92,205, //subps %xmm5,%xmm1 12242 65,15,89,200, //mulps %xmm8,%xmm1 12243 15,88,205, //addps %xmm5,%xmm1 12244 15,92,214, //subps %xmm6,%xmm2 12245 65,15,89,208, //mulps %xmm8,%xmm2 12246 15,88,214, //addps %xmm6,%xmm2 12247 15,92,223, //subps %xmm7,%xmm3 12248 65,15,89,216, //mulps %xmm8,%xmm3 12249 15,88,223, //addps %xmm7,%xmm3 12250 72,173, //lods %ds:(%rsi),%rax 12251 255,224, //jmpq *%rax 12252 }; 12253 12254 CODE const uint8_t sk_lerp_u8_sse2[] = { 12255 72,173, //lods %ds:(%rsi),%rax 12256 72,139,0, //mov (%rax),%rax 12257 102,68,15,110,4,56, //movd (%rax,%rdi,1),%xmm8 12258 102,69,15,239,201, //pxor %xmm9,%xmm9 12259 102,69,15,96,193, //punpcklbw %xmm9,%xmm8 12260 102,69,15,97,193, //punpcklwd %xmm9,%xmm8 12261 69,15,91,192, //cvtdq2ps %xmm8,%xmm8 12262 184,129,128,128,59, //mov $0x3b808081,%eax 12263 102,68,15,110,200, //movd %eax,%xmm9 12264 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 12265 69,15,89,200, //mulps %xmm8,%xmm9 12266 15,92,196, //subps %xmm4,%xmm0 12267 65,15,89,193, //mulps %xmm9,%xmm0 12268 15,88,196, //addps %xmm4,%xmm0 12269 15,92,205, //subps %xmm5,%xmm1 12270 65,15,89,201, //mulps %xmm9,%xmm1 12271 15,88,205, //addps %xmm5,%xmm1 12272 15,92,214, //subps %xmm6,%xmm2 12273 65,15,89,209, //mulps %xmm9,%xmm2 12274 15,88,214, //addps %xmm6,%xmm2 12275 15,92,223, //subps %xmm7,%xmm3 12276 65,15,89,217, //mulps %xmm9,%xmm3 12277 15,88,223, //addps %xmm7,%xmm3 12278 72,173, //lods %ds:(%rsi),%rax 12279 255,224, //jmpq *%rax 12280 }; 12281 12282 CODE const uint8_t sk_lerp_565_sse2[] = { 12283 72,173, //lods %ds:(%rsi),%rax 12284 72,139,0, //mov (%rax),%rax 12285 243,68,15,126,4,120, //movq (%rax,%rdi,2),%xmm8 12286 102,15,239,219, //pxor %xmm3,%xmm3 12287 102,68,15,97,195, //punpcklwd %xmm3,%xmm8 12288 184,0,248,0,0, //mov $0xf800,%eax 12289 102,15,110,216, //movd %eax,%xmm3 12290 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3 12291 102,65,15,219,216, //pand %xmm8,%xmm3 12292 68,15,91,203, //cvtdq2ps %xmm3,%xmm9 12293 184,8,33,132,55, //mov $0x37842108,%eax 12294 102,68,15,110,208, //movd %eax,%xmm10 12295 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 12296 69,15,89,209, //mulps %xmm9,%xmm10 12297 184,224,7,0,0, //mov $0x7e0,%eax 12298 102,15,110,216, //movd %eax,%xmm3 12299 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3 12300 102,65,15,219,216, //pand %xmm8,%xmm3 12301 68,15,91,203, //cvtdq2ps %xmm3,%xmm9 12302 184,33,8,2,58, //mov $0x3a020821,%eax 12303 102,68,15,110,216, //movd %eax,%xmm11 12304 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 12305 69,15,89,217, //mulps %xmm9,%xmm11 12306 184,31,0,0,0, //mov $0x1f,%eax 12307 102,15,110,216, //movd %eax,%xmm3 12308 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3 12309 102,65,15,219,216, //pand %xmm8,%xmm3 12310 68,15,91,195, //cvtdq2ps %xmm3,%xmm8 12311 184,8,33,4,61, //mov $0x3d042108,%eax 12312 102,15,110,216, //movd %eax,%xmm3 12313 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 12314 65,15,89,216, //mulps %xmm8,%xmm3 12315 15,92,196, //subps %xmm4,%xmm0 12316 65,15,89,194, //mulps %xmm10,%xmm0 12317 15,88,196, //addps %xmm4,%xmm0 12318 15,92,205, //subps %xmm5,%xmm1 12319 65,15,89,203, //mulps %xmm11,%xmm1 12320 15,88,205, //addps %xmm5,%xmm1 12321 15,92,214, //subps %xmm6,%xmm2 12322 15,89,211, //mulps %xmm3,%xmm2 12323 15,88,214, //addps %xmm6,%xmm2 12324 184,0,0,128,63, //mov $0x3f800000,%eax 12325 102,15,110,216, //movd %eax,%xmm3 12326 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 12327 72,173, //lods %ds:(%rsi),%rax 12328 255,224, //jmpq *%rax 12329 }; 12330 12331 CODE const uint8_t sk_load_tables_sse2[] = { 12332 72,173, //lods %ds:(%rsi),%rax 12333 72,139,8, //mov (%rax),%rcx 12334 76,139,64,8, //mov 0x8(%rax),%r8 12335 243,68,15,111,4,185, //movdqu (%rcx,%rdi,4),%xmm8 12336 185,255,0,0,0, //mov $0xff,%ecx 12337 102,15,110,193, //movd %ecx,%xmm0 12338 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0 12339 102,69,15,111,200, //movdqa %xmm8,%xmm9 12340 102,65,15,114,209,8, //psrld $0x8,%xmm9 12341 102,68,15,219,200, //pand %xmm0,%xmm9 12342 102,69,15,111,208, //movdqa %xmm8,%xmm10 12343 102,65,15,114,210,16, //psrld $0x10,%xmm10 12344 102,68,15,219,208, //pand %xmm0,%xmm10 12345 102,65,15,219,192, //pand %xmm8,%xmm0 12346 102,15,112,216,78, //pshufd $0x4e,%xmm0,%xmm3 12347 102,72,15,126,217, //movq %xmm3,%rcx 12348 65,137,201, //mov %ecx,%r9d 12349 72,193,233,32, //shr $0x20,%rcx 12350 102,73,15,126,194, //movq %xmm0,%r10 12351 69,137,211, //mov %r10d,%r11d 12352 73,193,234,32, //shr $0x20,%r10 12353 243,67,15,16,28,144, //movss (%r8,%r10,4),%xmm3 12354 243,65,15,16,4,136, //movss (%r8,%rcx,4),%xmm0 12355 15,20,216, //unpcklps %xmm0,%xmm3 12356 243,67,15,16,4,152, //movss (%r8,%r11,4),%xmm0 12357 243,67,15,16,12,136, //movss (%r8,%r9,4),%xmm1 12358 15,20,193, //unpcklps %xmm1,%xmm0 12359 15,20,195, //unpcklps %xmm3,%xmm0 12360 76,139,64,16, //mov 0x10(%rax),%r8 12361 102,65,15,112,201,78, //pshufd $0x4e,%xmm9,%xmm1 12362 102,73,15,126,202, //movq %xmm1,%r10 12363 77,137,209, //mov %r10,%r9 12364 73,193,233,32, //shr $0x20,%r9 12365 102,76,15,126,201, //movq %xmm9,%rcx 12366 65,137,203, //mov %ecx,%r11d 12367 65,129,227,255,255,255,0, //and $0xffffff,%r11d 12368 72,193,233,30, //shr $0x1e,%rcx 12369 65,129,226,255,255,255,0, //and $0xffffff,%r10d 12370 243,65,15,16,28,8, //movss (%r8,%rcx,1),%xmm3 12371 243,67,15,16,12,136, //movss (%r8,%r9,4),%xmm1 12372 15,20,217, //unpcklps %xmm1,%xmm3 12373 243,67,15,16,12,152, //movss (%r8,%r11,4),%xmm1 12374 243,67,15,16,20,144, //movss (%r8,%r10,4),%xmm2 12375 15,20,202, //unpcklps %xmm2,%xmm1 12376 15,20,203, //unpcklps %xmm3,%xmm1 12377 76,139,72,24, //mov 0x18(%rax),%r9 12378 102,65,15,112,210,78, //pshufd $0x4e,%xmm10,%xmm2 12379 102,72,15,126,209, //movq %xmm2,%rcx 12380 68,15,183,193, //movzwl %cx,%r8d 12381 72,193,233,32, //shr $0x20,%rcx 12382 102,76,15,126,208, //movq %xmm10,%rax 12383 68,15,183,208, //movzwl %ax,%r10d 12384 72,193,232,30, //shr $0x1e,%rax 12385 243,69,15,16,12,1, //movss (%r9,%rax,1),%xmm9 12386 243,65,15,16,20,137, //movss (%r9,%rcx,4),%xmm2 12387 68,15,20,202, //unpcklps %xmm2,%xmm9 12388 243,67,15,16,20,145, //movss (%r9,%r10,4),%xmm2 12389 243,67,15,16,28,129, //movss (%r9,%r8,4),%xmm3 12390 15,20,211, //unpcklps %xmm3,%xmm2 12391 65,15,20,209, //unpcklps %xmm9,%xmm2 12392 102,65,15,114,208,24, //psrld $0x18,%xmm8 12393 69,15,91,192, //cvtdq2ps %xmm8,%xmm8 12394 184,129,128,128,59, //mov $0x3b808081,%eax 12395 102,15,110,216, //movd %eax,%xmm3 12396 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 12397 65,15,89,216, //mulps %xmm8,%xmm3 12398 72,173, //lods %ds:(%rsi),%rax 12399 255,224, //jmpq *%rax 12400 }; 12401 12402 CODE const uint8_t sk_load_a8_sse2[] = { 12403 72,173, //lods %ds:(%rsi),%rax 12404 72,139,0, //mov (%rax),%rax 12405 102,15,110,4,56, //movd (%rax,%rdi,1),%xmm0 12406 102,15,239,201, //pxor %xmm1,%xmm1 12407 102,15,96,193, //punpcklbw %xmm1,%xmm0 12408 102,15,97,193, //punpcklwd %xmm1,%xmm0 12409 15,91,192, //cvtdq2ps %xmm0,%xmm0 12410 184,129,128,128,59, //mov $0x3b808081,%eax 12411 102,15,110,216, //movd %eax,%xmm3 12412 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 12413 15,89,216, //mulps %xmm0,%xmm3 12414 72,173, //lods %ds:(%rsi),%rax 12415 15,87,192, //xorps %xmm0,%xmm0 12416 102,15,239,201, //pxor %xmm1,%xmm1 12417 15,87,210, //xorps %xmm2,%xmm2 12418 255,224, //jmpq *%rax 12419 }; 12420 12421 CODE const uint8_t sk_store_a8_sse2[] = { 12422 72,173, //lods %ds:(%rsi),%rax 12423 72,139,0, //mov (%rax),%rax 12424 185,0,0,127,67, //mov $0x437f0000,%ecx 12425 102,68,15,110,193, //movd %ecx,%xmm8 12426 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 12427 68,15,89,195, //mulps %xmm3,%xmm8 12428 102,69,15,91,192, //cvtps2dq %xmm8,%xmm8 12429 102,65,15,114,240,16, //pslld $0x10,%xmm8 12430 102,65,15,114,224,16, //psrad $0x10,%xmm8 12431 102,69,15,107,192, //packssdw %xmm8,%xmm8 12432 102,69,15,103,192, //packuswb %xmm8,%xmm8 12433 102,68,15,126,4,56, //movd %xmm8,(%rax,%rdi,1) 12434 72,173, //lods %ds:(%rsi),%rax 12435 255,224, //jmpq *%rax 12436 }; 12437 12438 CODE const uint8_t sk_load_565_sse2[] = { 12439 72,173, //lods %ds:(%rsi),%rax 12440 72,139,0, //mov (%rax),%rax 12441 243,15,126,20,120, //movq (%rax,%rdi,2),%xmm2 12442 102,15,239,192, //pxor %xmm0,%xmm0 12443 102,15,97,208, //punpcklwd %xmm0,%xmm2 12444 184,0,248,0,0, //mov $0xf800,%eax 12445 102,15,110,192, //movd %eax,%xmm0 12446 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0 12447 102,15,219,194, //pand %xmm2,%xmm0 12448 15,91,200, //cvtdq2ps %xmm0,%xmm1 12449 184,8,33,132,55, //mov $0x37842108,%eax 12450 102,15,110,192, //movd %eax,%xmm0 12451 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 12452 15,89,193, //mulps %xmm1,%xmm0 12453 184,224,7,0,0, //mov $0x7e0,%eax 12454 102,15,110,200, //movd %eax,%xmm1 12455 102,15,112,201,0, //pshufd $0x0,%xmm1,%xmm1 12456 102,15,219,202, //pand %xmm2,%xmm1 12457 15,91,217, //cvtdq2ps %xmm1,%xmm3 12458 184,33,8,2,58, //mov $0x3a020821,%eax 12459 102,15,110,200, //movd %eax,%xmm1 12460 15,198,201,0, //shufps $0x0,%xmm1,%xmm1 12461 15,89,203, //mulps %xmm3,%xmm1 12462 184,31,0,0,0, //mov $0x1f,%eax 12463 102,15,110,216, //movd %eax,%xmm3 12464 102,15,112,219,0, //pshufd $0x0,%xmm3,%xmm3 12465 102,15,219,218, //pand %xmm2,%xmm3 12466 15,91,219, //cvtdq2ps %xmm3,%xmm3 12467 184,8,33,4,61, //mov $0x3d042108,%eax 12468 102,15,110,208, //movd %eax,%xmm2 12469 15,198,210,0, //shufps $0x0,%xmm2,%xmm2 12470 15,89,211, //mulps %xmm3,%xmm2 12471 184,0,0,128,63, //mov $0x3f800000,%eax 12472 102,15,110,216, //movd %eax,%xmm3 12473 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 12474 72,173, //lods %ds:(%rsi),%rax 12475 255,224, //jmpq *%rax 12476 }; 12477 12478 CODE const uint8_t sk_store_565_sse2[] = { 12479 72,173, //lods %ds:(%rsi),%rax 12480 72,139,0, //mov (%rax),%rax 12481 185,0,0,248,65, //mov $0x41f80000,%ecx 12482 102,68,15,110,193, //movd %ecx,%xmm8 12483 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 12484 69,15,40,200, //movaps %xmm8,%xmm9 12485 68,15,89,200, //mulps %xmm0,%xmm9 12486 102,69,15,91,201, //cvtps2dq %xmm9,%xmm9 12487 102,65,15,114,241,11, //pslld $0xb,%xmm9 12488 185,0,0,124,66, //mov $0x427c0000,%ecx 12489 102,68,15,110,209, //movd %ecx,%xmm10 12490 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 12491 68,15,89,209, //mulps %xmm1,%xmm10 12492 102,69,15,91,210, //cvtps2dq %xmm10,%xmm10 12493 102,65,15,114,242,5, //pslld $0x5,%xmm10 12494 102,69,15,235,209, //por %xmm9,%xmm10 12495 68,15,89,194, //mulps %xmm2,%xmm8 12496 102,69,15,91,192, //cvtps2dq %xmm8,%xmm8 12497 102,69,15,86,194, //orpd %xmm10,%xmm8 12498 102,65,15,114,240,16, //pslld $0x10,%xmm8 12499 102,65,15,114,224,16, //psrad $0x10,%xmm8 12500 102,69,15,107,192, //packssdw %xmm8,%xmm8 12501 102,68,15,214,4,120, //movq %xmm8,(%rax,%rdi,2) 12502 72,173, //lods %ds:(%rsi),%rax 12503 255,224, //jmpq *%rax 12504 }; 12505 12506 CODE const uint8_t sk_load_8888_sse2[] = { 12507 72,173, //lods %ds:(%rsi),%rax 12508 72,139,0, //mov (%rax),%rax 12509 243,15,111,28,184, //movdqu (%rax,%rdi,4),%xmm3 12510 184,255,0,0,0, //mov $0xff,%eax 12511 102,15,110,192, //movd %eax,%xmm0 12512 102,15,112,192,0, //pshufd $0x0,%xmm0,%xmm0 12513 102,15,111,203, //movdqa %xmm3,%xmm1 12514 102,15,114,209,8, //psrld $0x8,%xmm1 12515 102,15,219,200, //pand %xmm0,%xmm1 12516 102,15,111,211, //movdqa %xmm3,%xmm2 12517 102,15,114,210,16, //psrld $0x10,%xmm2 12518 102,15,219,208, //pand %xmm0,%xmm2 12519 102,15,219,195, //pand %xmm3,%xmm0 12520 15,91,192, //cvtdq2ps %xmm0,%xmm0 12521 184,129,128,128,59, //mov $0x3b808081,%eax 12522 102,68,15,110,192, //movd %eax,%xmm8 12523 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 12524 65,15,89,192, //mulps %xmm8,%xmm0 12525 15,91,201, //cvtdq2ps %xmm1,%xmm1 12526 65,15,89,200, //mulps %xmm8,%xmm1 12527 15,91,210, //cvtdq2ps %xmm2,%xmm2 12528 65,15,89,208, //mulps %xmm8,%xmm2 12529 102,15,114,211,24, //psrld $0x18,%xmm3 12530 15,91,219, //cvtdq2ps %xmm3,%xmm3 12531 65,15,89,216, //mulps %xmm8,%xmm3 12532 72,173, //lods %ds:(%rsi),%rax 12533 255,224, //jmpq *%rax 12534 }; 12535 12536 CODE const uint8_t sk_store_8888_sse2[] = { 12537 72,173, //lods %ds:(%rsi),%rax 12538 72,139,0, //mov (%rax),%rax 12539 185,0,0,127,67, //mov $0x437f0000,%ecx 12540 102,68,15,110,193, //movd %ecx,%xmm8 12541 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 12542 69,15,40,200, //movaps %xmm8,%xmm9 12543 68,15,89,200, //mulps %xmm0,%xmm9 12544 102,69,15,91,201, //cvtps2dq %xmm9,%xmm9 12545 69,15,40,208, //movaps %xmm8,%xmm10 12546 68,15,89,209, //mulps %xmm1,%xmm10 12547 102,69,15,91,210, //cvtps2dq %xmm10,%xmm10 12548 102,65,15,114,242,8, //pslld $0x8,%xmm10 12549 102,69,15,235,209, //por %xmm9,%xmm10 12550 69,15,40,200, //movaps %xmm8,%xmm9 12551 68,15,89,202, //mulps %xmm2,%xmm9 12552 102,69,15,91,201, //cvtps2dq %xmm9,%xmm9 12553 102,65,15,114,241,16, //pslld $0x10,%xmm9 12554 68,15,89,195, //mulps %xmm3,%xmm8 12555 102,69,15,91,192, //cvtps2dq %xmm8,%xmm8 12556 102,65,15,114,240,24, //pslld $0x18,%xmm8 12557 102,69,15,235,193, //por %xmm9,%xmm8 12558 102,69,15,235,194, //por %xmm10,%xmm8 12559 243,68,15,127,4,184, //movdqu %xmm8,(%rax,%rdi,4) 12560 72,173, //lods %ds:(%rsi),%rax 12561 255,224, //jmpq *%rax 12562 }; 12563 12564 CODE const uint8_t sk_load_f16_sse2[] = { 12565 72,173, //lods %ds:(%rsi),%rax 12566 72,139,0, //mov (%rax),%rax 12567 243,15,111,4,248, //movdqu (%rax,%rdi,8),%xmm0 12568 243,15,111,76,248,16, //movdqu 0x10(%rax,%rdi,8),%xmm1 12569 102,15,111,208, //movdqa %xmm0,%xmm2 12570 102,15,97,209, //punpcklwd %xmm1,%xmm2 12571 102,15,105,193, //punpckhwd %xmm1,%xmm0 12572 102,68,15,111,194, //movdqa %xmm2,%xmm8 12573 102,68,15,97,192, //punpcklwd %xmm0,%xmm8 12574 102,15,105,208, //punpckhwd %xmm0,%xmm2 12575 184,0,4,0,4, //mov $0x4000400,%eax 12576 102,15,110,192, //movd %eax,%xmm0 12577 102,15,112,216,0, //pshufd $0x0,%xmm0,%xmm3 12578 102,15,111,203, //movdqa %xmm3,%xmm1 12579 102,65,15,101,200, //pcmpgtw %xmm8,%xmm1 12580 102,65,15,223,200, //pandn %xmm8,%xmm1 12581 102,15,101,218, //pcmpgtw %xmm2,%xmm3 12582 102,15,223,218, //pandn %xmm2,%xmm3 12583 102,69,15,239,192, //pxor %xmm8,%xmm8 12584 102,15,111,193, //movdqa %xmm1,%xmm0 12585 102,65,15,97,192, //punpcklwd %xmm8,%xmm0 12586 102,15,114,240,13, //pslld $0xd,%xmm0 12587 184,0,0,128,119, //mov $0x77800000,%eax 12588 102,15,110,208, //movd %eax,%xmm2 12589 102,68,15,112,202,0, //pshufd $0x0,%xmm2,%xmm9 12590 65,15,89,193, //mulps %xmm9,%xmm0 12591 102,65,15,105,200, //punpckhwd %xmm8,%xmm1 12592 102,15,114,241,13, //pslld $0xd,%xmm1 12593 65,15,89,201, //mulps %xmm9,%xmm1 12594 102,15,111,211, //movdqa %xmm3,%xmm2 12595 102,65,15,97,208, //punpcklwd %xmm8,%xmm2 12596 102,15,114,242,13, //pslld $0xd,%xmm2 12597 65,15,89,209, //mulps %xmm9,%xmm2 12598 102,65,15,105,216, //punpckhwd %xmm8,%xmm3 12599 102,15,114,243,13, //pslld $0xd,%xmm3 12600 65,15,89,217, //mulps %xmm9,%xmm3 12601 72,173, //lods %ds:(%rsi),%rax 12602 255,224, //jmpq *%rax 12603 }; 12604 12605 CODE const uint8_t sk_store_f16_sse2[] = { 12606 72,173, //lods %ds:(%rsi),%rax 12607 72,139,0, //mov (%rax),%rax 12608 185,0,0,128,7, //mov $0x7800000,%ecx 12609 102,68,15,110,193, //movd %ecx,%xmm8 12610 102,69,15,112,192,0, //pshufd $0x0,%xmm8,%xmm8 12611 102,69,15,111,200, //movdqa %xmm8,%xmm9 12612 68,15,89,200, //mulps %xmm0,%xmm9 12613 102,65,15,114,209,13, //psrld $0xd,%xmm9 12614 102,69,15,111,208, //movdqa %xmm8,%xmm10 12615 68,15,89,209, //mulps %xmm1,%xmm10 12616 102,65,15,114,210,13, //psrld $0xd,%xmm10 12617 102,69,15,111,216, //movdqa %xmm8,%xmm11 12618 68,15,89,218, //mulps %xmm2,%xmm11 12619 102,65,15,114,211,13, //psrld $0xd,%xmm11 12620 68,15,89,195, //mulps %xmm3,%xmm8 12621 102,65,15,114,208,13, //psrld $0xd,%xmm8 12622 102,65,15,115,250,2, //pslldq $0x2,%xmm10 12623 102,69,15,235,209, //por %xmm9,%xmm10 12624 102,65,15,115,248,2, //pslldq $0x2,%xmm8 12625 102,69,15,235,195, //por %xmm11,%xmm8 12626 102,69,15,111,202, //movdqa %xmm10,%xmm9 12627 102,69,15,98,200, //punpckldq %xmm8,%xmm9 12628 243,68,15,127,12,248, //movdqu %xmm9,(%rax,%rdi,8) 12629 102,69,15,106,208, //punpckhdq %xmm8,%xmm10 12630 243,68,15,127,84,248,16, //movdqu %xmm10,0x10(%rax,%rdi,8) 12631 72,173, //lods %ds:(%rsi),%rax 12632 255,224, //jmpq *%rax 12633 }; 12634 12635 CODE const uint8_t sk_store_f32_sse2[] = { 12636 72,173, //lods %ds:(%rsi),%rax 12637 72,139,0, //mov (%rax),%rax 12638 72,137,249, //mov %rdi,%rcx 12639 72,193,225,4, //shl $0x4,%rcx 12640 68,15,40,192, //movaps %xmm0,%xmm8 12641 68,15,40,200, //movaps %xmm0,%xmm9 12642 68,15,20,201, //unpcklps %xmm1,%xmm9 12643 68,15,40,210, //movaps %xmm2,%xmm10 12644 68,15,40,218, //movaps %xmm2,%xmm11 12645 68,15,20,219, //unpcklps %xmm3,%xmm11 12646 68,15,21,193, //unpckhps %xmm1,%xmm8 12647 68,15,21,211, //unpckhps %xmm3,%xmm10 12648 69,15,40,225, //movaps %xmm9,%xmm12 12649 102,69,15,20,227, //unpcklpd %xmm11,%xmm12 12650 69,15,18,217, //movhlps %xmm9,%xmm11 12651 69,15,40,200, //movaps %xmm8,%xmm9 12652 102,69,15,20,202, //unpcklpd %xmm10,%xmm9 12653 69,15,18,208, //movhlps %xmm8,%xmm10 12654 102,68,15,17,36,8, //movupd %xmm12,(%rax,%rcx,1) 12655 68,15,17,92,8,16, //movups %xmm11,0x10(%rax,%rcx,1) 12656 102,68,15,17,76,8,32, //movupd %xmm9,0x20(%rax,%rcx,1) 12657 68,15,17,84,8,48, //movups %xmm10,0x30(%rax,%rcx,1) 12658 72,173, //lods %ds:(%rsi),%rax 12659 255,224, //jmpq *%rax 12660 }; 12661 12662 CODE const uint8_t sk_clamp_x_sse2[] = { 12663 72,173, //lods %ds:(%rsi),%rax 12664 69,15,87,192, //xorps %xmm8,%xmm8 12665 68,15,95,192, //maxps %xmm0,%xmm8 12666 243,68,15,16,8, //movss (%rax),%xmm9 12667 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 12668 102,15,118,192, //pcmpeqd %xmm0,%xmm0 12669 102,65,15,254,193, //paddd %xmm9,%xmm0 12670 68,15,93,192, //minps %xmm0,%xmm8 12671 72,173, //lods %ds:(%rsi),%rax 12672 65,15,40,192, //movaps %xmm8,%xmm0 12673 255,224, //jmpq *%rax 12674 }; 12675 12676 CODE const uint8_t sk_clamp_y_sse2[] = { 12677 72,173, //lods %ds:(%rsi),%rax 12678 69,15,87,192, //xorps %xmm8,%xmm8 12679 68,15,95,193, //maxps %xmm1,%xmm8 12680 243,68,15,16,8, //movss (%rax),%xmm9 12681 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 12682 102,15,118,201, //pcmpeqd %xmm1,%xmm1 12683 102,65,15,254,201, //paddd %xmm9,%xmm1 12684 68,15,93,193, //minps %xmm1,%xmm8 12685 72,173, //lods %ds:(%rsi),%rax 12686 65,15,40,200, //movaps %xmm8,%xmm1 12687 255,224, //jmpq *%rax 12688 }; 12689 12690 CODE const uint8_t sk_repeat_x_sse2[] = { 12691 72,173, //lods %ds:(%rsi),%rax 12692 243,68,15,16,0, //movss (%rax),%xmm8 12693 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 12694 68,15,40,200, //movaps %xmm0,%xmm9 12695 69,15,94,200, //divps %xmm8,%xmm9 12696 243,69,15,91,209, //cvttps2dq %xmm9,%xmm10 12697 69,15,91,210, //cvtdq2ps %xmm10,%xmm10 12698 69,15,194,202,1, //cmpltps %xmm10,%xmm9 12699 184,0,0,128,63, //mov $0x3f800000,%eax 12700 102,68,15,110,216, //movd %eax,%xmm11 12701 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 12702 69,15,84,217, //andps %xmm9,%xmm11 12703 69,15,92,211, //subps %xmm11,%xmm10 12704 69,15,89,208, //mulps %xmm8,%xmm10 12705 65,15,92,194, //subps %xmm10,%xmm0 12706 102,69,15,118,201, //pcmpeqd %xmm9,%xmm9 12707 102,69,15,254,200, //paddd %xmm8,%xmm9 12708 65,15,93,193, //minps %xmm9,%xmm0 12709 72,173, //lods %ds:(%rsi),%rax 12710 255,224, //jmpq *%rax 12711 }; 12712 12713 CODE const uint8_t sk_repeat_y_sse2[] = { 12714 72,173, //lods %ds:(%rsi),%rax 12715 243,68,15,16,0, //movss (%rax),%xmm8 12716 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 12717 68,15,40,201, //movaps %xmm1,%xmm9 12718 69,15,94,200, //divps %xmm8,%xmm9 12719 243,69,15,91,209, //cvttps2dq %xmm9,%xmm10 12720 69,15,91,210, //cvtdq2ps %xmm10,%xmm10 12721 69,15,194,202,1, //cmpltps %xmm10,%xmm9 12722 184,0,0,128,63, //mov $0x3f800000,%eax 12723 102,68,15,110,216, //movd %eax,%xmm11 12724 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 12725 69,15,84,217, //andps %xmm9,%xmm11 12726 69,15,92,211, //subps %xmm11,%xmm10 12727 69,15,89,208, //mulps %xmm8,%xmm10 12728 65,15,92,202, //subps %xmm10,%xmm1 12729 102,69,15,118,201, //pcmpeqd %xmm9,%xmm9 12730 102,69,15,254,200, //paddd %xmm8,%xmm9 12731 65,15,93,201, //minps %xmm9,%xmm1 12732 72,173, //lods %ds:(%rsi),%rax 12733 255,224, //jmpq *%rax 12734 }; 12735 12736 CODE const uint8_t sk_mirror_x_sse2[] = { 12737 72,173, //lods %ds:(%rsi),%rax 12738 243,68,15,16,8, //movss (%rax),%xmm9 12739 69,15,40,193, //movaps %xmm9,%xmm8 12740 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 12741 65,15,92,192, //subps %xmm8,%xmm0 12742 243,69,15,88,201, //addss %xmm9,%xmm9 12743 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 12744 68,15,40,208, //movaps %xmm0,%xmm10 12745 69,15,94,209, //divps %xmm9,%xmm10 12746 243,69,15,91,218, //cvttps2dq %xmm10,%xmm11 12747 69,15,91,219, //cvtdq2ps %xmm11,%xmm11 12748 69,15,194,211,1, //cmpltps %xmm11,%xmm10 12749 184,0,0,128,63, //mov $0x3f800000,%eax 12750 102,68,15,110,224, //movd %eax,%xmm12 12751 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 12752 69,15,84,226, //andps %xmm10,%xmm12 12753 69,15,87,210, //xorps %xmm10,%xmm10 12754 69,15,92,220, //subps %xmm12,%xmm11 12755 69,15,89,217, //mulps %xmm9,%xmm11 12756 65,15,92,195, //subps %xmm11,%xmm0 12757 65,15,92,192, //subps %xmm8,%xmm0 12758 68,15,92,208, //subps %xmm0,%xmm10 12759 65,15,84,194, //andps %xmm10,%xmm0 12760 102,69,15,118,201, //pcmpeqd %xmm9,%xmm9 12761 102,69,15,254,200, //paddd %xmm8,%xmm9 12762 65,15,93,193, //minps %xmm9,%xmm0 12763 72,173, //lods %ds:(%rsi),%rax 12764 255,224, //jmpq *%rax 12765 }; 12766 12767 CODE const uint8_t sk_mirror_y_sse2[] = { 12768 72,173, //lods %ds:(%rsi),%rax 12769 243,68,15,16,8, //movss (%rax),%xmm9 12770 69,15,40,193, //movaps %xmm9,%xmm8 12771 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 12772 65,15,92,200, //subps %xmm8,%xmm1 12773 243,69,15,88,201, //addss %xmm9,%xmm9 12774 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 12775 68,15,40,209, //movaps %xmm1,%xmm10 12776 69,15,94,209, //divps %xmm9,%xmm10 12777 243,69,15,91,218, //cvttps2dq %xmm10,%xmm11 12778 69,15,91,219, //cvtdq2ps %xmm11,%xmm11 12779 69,15,194,211,1, //cmpltps %xmm11,%xmm10 12780 184,0,0,128,63, //mov $0x3f800000,%eax 12781 102,68,15,110,224, //movd %eax,%xmm12 12782 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 12783 69,15,84,226, //andps %xmm10,%xmm12 12784 69,15,87,210, //xorps %xmm10,%xmm10 12785 69,15,92,220, //subps %xmm12,%xmm11 12786 69,15,89,217, //mulps %xmm9,%xmm11 12787 65,15,92,203, //subps %xmm11,%xmm1 12788 65,15,92,200, //subps %xmm8,%xmm1 12789 68,15,92,209, //subps %xmm1,%xmm10 12790 65,15,84,202, //andps %xmm10,%xmm1 12791 102,69,15,118,201, //pcmpeqd %xmm9,%xmm9 12792 102,69,15,254,200, //paddd %xmm8,%xmm9 12793 65,15,93,201, //minps %xmm9,%xmm1 12794 72,173, //lods %ds:(%rsi),%rax 12795 255,224, //jmpq *%rax 12796 }; 12797 12798 CODE const uint8_t sk_luminance_to_alpha_sse2[] = { 12799 184,208,179,89,62, //mov $0x3e59b3d0,%eax 12800 102,15,110,216, //movd %eax,%xmm3 12801 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 12802 15,89,216, //mulps %xmm0,%xmm3 12803 184,89,23,55,63, //mov $0x3f371759,%eax 12804 102,15,110,192, //movd %eax,%xmm0 12805 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 12806 15,89,193, //mulps %xmm1,%xmm0 12807 15,88,195, //addps %xmm3,%xmm0 12808 184,152,221,147,61, //mov $0x3d93dd98,%eax 12809 102,15,110,216, //movd %eax,%xmm3 12810 15,198,219,0, //shufps $0x0,%xmm3,%xmm3 12811 15,89,218, //mulps %xmm2,%xmm3 12812 15,88,216, //addps %xmm0,%xmm3 12813 72,173, //lods %ds:(%rsi),%rax 12814 15,87,192, //xorps %xmm0,%xmm0 12815 15,87,201, //xorps %xmm1,%xmm1 12816 15,87,210, //xorps %xmm2,%xmm2 12817 255,224, //jmpq *%rax 12818 }; 12819 12820 CODE const uint8_t sk_matrix_2x3_sse2[] = { 12821 68,15,40,201, //movaps %xmm1,%xmm9 12822 68,15,40,192, //movaps %xmm0,%xmm8 12823 72,173, //lods %ds:(%rsi),%rax 12824 243,15,16,0, //movss (%rax),%xmm0 12825 243,15,16,72,4, //movss 0x4(%rax),%xmm1 12826 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 12827 243,68,15,16,80,8, //movss 0x8(%rax),%xmm10 12828 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 12829 243,68,15,16,88,16, //movss 0x10(%rax),%xmm11 12830 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 12831 69,15,89,209, //mulps %xmm9,%xmm10 12832 69,15,88,211, //addps %xmm11,%xmm10 12833 65,15,89,192, //mulps %xmm8,%xmm0 12834 65,15,88,194, //addps %xmm10,%xmm0 12835 15,198,201,0, //shufps $0x0,%xmm1,%xmm1 12836 243,68,15,16,80,12, //movss 0xc(%rax),%xmm10 12837 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 12838 243,68,15,16,88,20, //movss 0x14(%rax),%xmm11 12839 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 12840 69,15,89,209, //mulps %xmm9,%xmm10 12841 69,15,88,211, //addps %xmm11,%xmm10 12842 65,15,89,200, //mulps %xmm8,%xmm1 12843 65,15,88,202, //addps %xmm10,%xmm1 12844 72,173, //lods %ds:(%rsi),%rax 12845 255,224, //jmpq *%rax 12846 }; 12847 12848 CODE const uint8_t sk_matrix_3x4_sse2[] = { 12849 68,15,40,201, //movaps %xmm1,%xmm9 12850 68,15,40,192, //movaps %xmm0,%xmm8 12851 72,173, //lods %ds:(%rsi),%rax 12852 243,15,16,0, //movss (%rax),%xmm0 12853 243,15,16,72,4, //movss 0x4(%rax),%xmm1 12854 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 12855 243,68,15,16,80,12, //movss 0xc(%rax),%xmm10 12856 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 12857 243,68,15,16,88,24, //movss 0x18(%rax),%xmm11 12858 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 12859 243,68,15,16,96,36, //movss 0x24(%rax),%xmm12 12860 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 12861 68,15,89,218, //mulps %xmm2,%xmm11 12862 69,15,88,220, //addps %xmm12,%xmm11 12863 69,15,89,209, //mulps %xmm9,%xmm10 12864 69,15,88,211, //addps %xmm11,%xmm10 12865 65,15,89,192, //mulps %xmm8,%xmm0 12866 65,15,88,194, //addps %xmm10,%xmm0 12867 15,198,201,0, //shufps $0x0,%xmm1,%xmm1 12868 243,68,15,16,80,16, //movss 0x10(%rax),%xmm10 12869 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 12870 243,68,15,16,88,28, //movss 0x1c(%rax),%xmm11 12871 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 12872 243,68,15,16,96,40, //movss 0x28(%rax),%xmm12 12873 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 12874 68,15,89,218, //mulps %xmm2,%xmm11 12875 69,15,88,220, //addps %xmm12,%xmm11 12876 69,15,89,209, //mulps %xmm9,%xmm10 12877 69,15,88,211, //addps %xmm11,%xmm10 12878 65,15,89,200, //mulps %xmm8,%xmm1 12879 65,15,88,202, //addps %xmm10,%xmm1 12880 243,68,15,16,80,8, //movss 0x8(%rax),%xmm10 12881 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 12882 243,68,15,16,88,20, //movss 0x14(%rax),%xmm11 12883 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 12884 243,68,15,16,96,32, //movss 0x20(%rax),%xmm12 12885 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 12886 243,68,15,16,104,44, //movss 0x2c(%rax),%xmm13 12887 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13 12888 68,15,89,226, //mulps %xmm2,%xmm12 12889 69,15,88,229, //addps %xmm13,%xmm12 12890 69,15,89,217, //mulps %xmm9,%xmm11 12891 69,15,88,220, //addps %xmm12,%xmm11 12892 69,15,89,208, //mulps %xmm8,%xmm10 12893 69,15,88,211, //addps %xmm11,%xmm10 12894 72,173, //lods %ds:(%rsi),%rax 12895 65,15,40,210, //movaps %xmm10,%xmm2 12896 255,224, //jmpq *%rax 12897 }; 12898 12899 CODE const uint8_t sk_matrix_4x5_sse2[] = { 12900 68,15,40,201, //movaps %xmm1,%xmm9 12901 68,15,40,192, //movaps %xmm0,%xmm8 12902 72,173, //lods %ds:(%rsi),%rax 12903 243,15,16,0, //movss (%rax),%xmm0 12904 243,15,16,72,4, //movss 0x4(%rax),%xmm1 12905 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 12906 243,68,15,16,80,16, //movss 0x10(%rax),%xmm10 12907 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 12908 243,68,15,16,88,32, //movss 0x20(%rax),%xmm11 12909 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 12910 243,68,15,16,96,48, //movss 0x30(%rax),%xmm12 12911 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 12912 243,68,15,16,104,64, //movss 0x40(%rax),%xmm13 12913 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13 12914 68,15,89,227, //mulps %xmm3,%xmm12 12915 69,15,88,229, //addps %xmm13,%xmm12 12916 68,15,89,218, //mulps %xmm2,%xmm11 12917 69,15,88,220, //addps %xmm12,%xmm11 12918 69,15,89,209, //mulps %xmm9,%xmm10 12919 69,15,88,211, //addps %xmm11,%xmm10 12920 65,15,89,192, //mulps %xmm8,%xmm0 12921 65,15,88,194, //addps %xmm10,%xmm0 12922 15,198,201,0, //shufps $0x0,%xmm1,%xmm1 12923 243,68,15,16,80,20, //movss 0x14(%rax),%xmm10 12924 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 12925 243,68,15,16,88,36, //movss 0x24(%rax),%xmm11 12926 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 12927 243,68,15,16,96,52, //movss 0x34(%rax),%xmm12 12928 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 12929 243,68,15,16,104,68, //movss 0x44(%rax),%xmm13 12930 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13 12931 68,15,89,227, //mulps %xmm3,%xmm12 12932 69,15,88,229, //addps %xmm13,%xmm12 12933 68,15,89,218, //mulps %xmm2,%xmm11 12934 69,15,88,220, //addps %xmm12,%xmm11 12935 69,15,89,209, //mulps %xmm9,%xmm10 12936 69,15,88,211, //addps %xmm11,%xmm10 12937 65,15,89,200, //mulps %xmm8,%xmm1 12938 65,15,88,202, //addps %xmm10,%xmm1 12939 243,68,15,16,80,8, //movss 0x8(%rax),%xmm10 12940 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 12941 243,68,15,16,88,24, //movss 0x18(%rax),%xmm11 12942 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 12943 243,68,15,16,96,40, //movss 0x28(%rax),%xmm12 12944 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 12945 243,68,15,16,104,56, //movss 0x38(%rax),%xmm13 12946 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13 12947 243,68,15,16,112,72, //movss 0x48(%rax),%xmm14 12948 69,15,198,246,0, //shufps $0x0,%xmm14,%xmm14 12949 68,15,89,235, //mulps %xmm3,%xmm13 12950 69,15,88,238, //addps %xmm14,%xmm13 12951 68,15,89,226, //mulps %xmm2,%xmm12 12952 69,15,88,229, //addps %xmm13,%xmm12 12953 69,15,89,217, //mulps %xmm9,%xmm11 12954 69,15,88,220, //addps %xmm12,%xmm11 12955 69,15,89,208, //mulps %xmm8,%xmm10 12956 69,15,88,211, //addps %xmm11,%xmm10 12957 243,68,15,16,88,12, //movss 0xc(%rax),%xmm11 12958 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 12959 243,68,15,16,96,28, //movss 0x1c(%rax),%xmm12 12960 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 12961 243,68,15,16,104,44, //movss 0x2c(%rax),%xmm13 12962 69,15,198,237,0, //shufps $0x0,%xmm13,%xmm13 12963 243,68,15,16,112,60, //movss 0x3c(%rax),%xmm14 12964 69,15,198,246,0, //shufps $0x0,%xmm14,%xmm14 12965 243,68,15,16,120,76, //movss 0x4c(%rax),%xmm15 12966 69,15,198,255,0, //shufps $0x0,%xmm15,%xmm15 12967 68,15,89,243, //mulps %xmm3,%xmm14 12968 69,15,88,247, //addps %xmm15,%xmm14 12969 68,15,89,234, //mulps %xmm2,%xmm13 12970 69,15,88,238, //addps %xmm14,%xmm13 12971 69,15,89,225, //mulps %xmm9,%xmm12 12972 69,15,88,229, //addps %xmm13,%xmm12 12973 69,15,89,216, //mulps %xmm8,%xmm11 12974 69,15,88,220, //addps %xmm12,%xmm11 12975 72,173, //lods %ds:(%rsi),%rax 12976 65,15,40,210, //movaps %xmm10,%xmm2 12977 65,15,40,219, //movaps %xmm11,%xmm3 12978 255,224, //jmpq *%rax 12979 }; 12980 12981 CODE const uint8_t sk_matrix_perspective_sse2[] = { 12982 68,15,40,192, //movaps %xmm0,%xmm8 12983 72,173, //lods %ds:(%rsi),%rax 12984 243,15,16,0, //movss (%rax),%xmm0 12985 243,68,15,16,72,4, //movss 0x4(%rax),%xmm9 12986 15,198,192,0, //shufps $0x0,%xmm0,%xmm0 12987 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 12988 243,68,15,16,80,8, //movss 0x8(%rax),%xmm10 12989 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 12990 68,15,89,201, //mulps %xmm1,%xmm9 12991 69,15,88,202, //addps %xmm10,%xmm9 12992 65,15,89,192, //mulps %xmm8,%xmm0 12993 65,15,88,193, //addps %xmm9,%xmm0 12994 243,68,15,16,72,12, //movss 0xc(%rax),%xmm9 12995 69,15,198,201,0, //shufps $0x0,%xmm9,%xmm9 12996 243,68,15,16,80,16, //movss 0x10(%rax),%xmm10 12997 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 12998 243,68,15,16,88,20, //movss 0x14(%rax),%xmm11 12999 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 13000 68,15,89,209, //mulps %xmm1,%xmm10 13001 69,15,88,211, //addps %xmm11,%xmm10 13002 69,15,89,200, //mulps %xmm8,%xmm9 13003 69,15,88,202, //addps %xmm10,%xmm9 13004 243,68,15,16,80,24, //movss 0x18(%rax),%xmm10 13005 69,15,198,210,0, //shufps $0x0,%xmm10,%xmm10 13006 243,68,15,16,88,28, //movss 0x1c(%rax),%xmm11 13007 69,15,198,219,0, //shufps $0x0,%xmm11,%xmm11 13008 243,68,15,16,96,32, //movss 0x20(%rax),%xmm12 13009 69,15,198,228,0, //shufps $0x0,%xmm12,%xmm12 13010 68,15,89,217, //mulps %xmm1,%xmm11 13011 69,15,88,220, //addps %xmm12,%xmm11 13012 69,15,89,208, //mulps %xmm8,%xmm10 13013 69,15,88,211, //addps %xmm11,%xmm10 13014 65,15,83,202, //rcpps %xmm10,%xmm1 13015 15,89,193, //mulps %xmm1,%xmm0 13016 68,15,89,201, //mulps %xmm1,%xmm9 13017 72,173, //lods %ds:(%rsi),%rax 13018 65,15,40,201, //movaps %xmm9,%xmm1 13019 255,224, //jmpq *%rax 13020 }; 13021 13022 CODE const uint8_t sk_linear_gradient_2stops_sse2[] = { 13023 72,173, //lods %ds:(%rsi),%rax 13024 68,15,16,8, //movups (%rax),%xmm9 13025 15,16,88,16, //movups 0x10(%rax),%xmm3 13026 68,15,40,195, //movaps %xmm3,%xmm8 13027 69,15,198,192,0, //shufps $0x0,%xmm8,%xmm8 13028 65,15,40,201, //movaps %xmm9,%xmm1 13029 15,198,201,0, //shufps $0x0,%xmm1,%xmm1 13030 68,15,89,192, //mulps %xmm0,%xmm8 13031 68,15,88,193, //addps %xmm1,%xmm8 13032 15,40,203, //movaps %xmm3,%xmm1 13033 15,198,201,85, //shufps $0x55,%xmm1,%xmm1 13034 65,15,40,209, //movaps %xmm9,%xmm2 13035 15,198,210,85, //shufps $0x55,%xmm2,%xmm2 13036 15,89,200, //mulps %xmm0,%xmm1 13037 15,88,202, //addps %xmm2,%xmm1 13038 15,40,211, //movaps %xmm3,%xmm2 13039 15,198,210,170, //shufps $0xaa,%xmm2,%xmm2 13040 69,15,40,209, //movaps %xmm9,%xmm10 13041 69,15,198,210,170, //shufps $0xaa,%xmm10,%xmm10 13042 15,89,208, //mulps %xmm0,%xmm2 13043 65,15,88,210, //addps %xmm10,%xmm2 13044 15,198,219,255, //shufps $0xff,%xmm3,%xmm3 13045 69,15,198,201,255, //shufps $0xff,%xmm9,%xmm9 13046 15,89,216, //mulps %xmm0,%xmm3 13047 65,15,88,217, //addps %xmm9,%xmm3 13048 72,173, //lods %ds:(%rsi),%rax 13049 65,15,40,192, //movaps %xmm8,%xmm0 13050 255,224, //jmpq *%rax 13051 }; 13052 #endif 13053