/*
 *
 * GStreamer
 * Copyright (c) 2001 Tom Barry.  All rights reserved.
 * Copyright (C) 2008,2010 Sebastian Dröge <slomo@collabora.co.uk>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 */


/*
 * Relicensed for GStreamer from GPL to LGPL with permit from Tom Barry.
26 * See: http://bugzilla.gnome.org/show_bug.cgi?id=163578 27 */ 28 29 30#include "x86-64_macros.inc" 31 32static void 33FUNCT_NAME_YUY2 (GstDeinterlaceMethodGreedyH *self, const guint8 * L1, const guint8 * L2, const guint8 * L3, const guint8 * L2P, guint8 * Dest, gint width) 34{ 35 36 // in tight loop some vars are accessed faster in local storage 37 gint64 YMask = 0x00ff00ff00ff00ffull; // to keep only luma 38 gint64 UVMask = 0xff00ff00ff00ff00ull; // to keep only chroma 39 gint64 ShiftMask = 0xfefefefefefefefeull; // to avoid shifting chroma to luma 40 gint64 QW256 = 0x0100010001000100ull; // 4 256's 41 gint64 MaxComb; 42 gint64 MotionThreshold; 43 gint64 MotionSense; 44 gint64 i; 45 glong LoopCtr; 46 glong oldbx = 0; 47 48 gint64 QW256B; 49 gint64 LastAvg = 0; //interp value from left qword 50 51 // FIXME: Use C implementation if the width is not a multiple of 4 52 // Do something more optimal later 53 if (width % 4 != 0) 54 C_FUNCT_YUY2 (self, L1, L2, L3, L2P, Dest, width); 55 56 // Set up our two parms that are actually evaluated for each pixel 57 i = self->max_comb; 58 MaxComb = 59 i << 56 | i << 48 | i << 40 | i << 32 | i << 24 | i << 16 | i << 8 | i; 60 61 i = self->motion_threshold; // scale to range of 0-257 62 MotionThreshold = i << 48 | i << 32 | i << 16 | i | UVMask; 63 64 i = self->motion_sense; // scale to range of 0-257 65 MotionSense = i << 48 | i << 32 | i << 16 | i; 66 67 i = 0xffffffff - 256; 68 QW256B = i << 48 | i << 32 | i << 16 | i; // save a couple instr on PMINSW instruct. 69 70 LoopCtr = width / 8 - 1; // there are LineLength / 4 qwords per line but do 1 less, adj at end of loop 71 72 // For ease of reading, the comments below assume that we're operating on an odd 73 // field (i.e., that InfoIsOdd is true). Assume the obvious for even lines.. 
74 __asm__ __volatile__ ( 75 // save ebx (-fPIC) 76 MOVX " %%" XBX ", %[oldbx]\n\t" 77 MOVX " %[L1], %%" XAX "\n\t" 78 LEAX " 8(%%" XAX "), %%" XBX "\n\t" // next qword needed by DJR 79 MOVX " %[L3], %%" XCX "\n\t" 80 SUBX " %%" XAX ", %%" XCX "\n\t" // carry L3 addr as an offset 81 MOVX " %[L2P], %%" XDX "\n\t" 82 MOVX " %[L2], %%" XSI "\n\t" 83 MOVX " %[Dest], %%" XDI "\n\t" // DL1 if Odd or DL2 if Even 84 85 ".align 8\n\t" 86 "1:\n\t" 87 "movq (%%" XSI "), %%mm0\n\t" // L2 - the newest weave pixel value 88 "movq (%%" XAX "), %%mm1\n\t" // L1 - the top pixel 89 "movq (%%" XDX "), %%mm2\n\t" // L2P - the prev weave pixel 90 "movq (%%" XAX ", %%" XCX "), %%mm3\n\t" // L3, next odd row 91 "movq %%mm1, %%mm6\n\t" // L1 - get simple single pixel interp 92 93 // pavgb mm6, mm3 // use macro below 94 V_PAVGB ("%%mm6", "%%mm3", "%%mm4", "%[ShiftMask]") 95 96 // DJR - Diagonal Jaggie Reduction 97 // In the event that we are going to use an average (Bob) pixel we do not want a jagged 98 // stair step effect. To combat this we avg in the 2 horizontally adjacen pixels into the 99 // interpolated Bob mix. This will do horizontal smoothing for only the Bob'd pixels. 
100 101 "movq %[LastAvg], %%mm4\n\t" // the bob value from prev qword in row 102 "movq %%mm6, %[LastAvg]\n\t" // save for next pass 103 "psrlq $48, %%mm4\n\t" // right justify 1 pixel 104 "movq %%mm6, %%mm7\n\t" // copy of simple bob pixel 105 "psllq $16, %%mm7\n\t" // left justify 3 pixels 106 "por %%mm7, %%mm4\n\t" // and combine 107 "movq (%%" XBX "), %%mm5\n\t" // next horiz qword from L1 108 // pavgb mm5, qword ptr[ebx+ecx] // next horiz qword from L3, use macro below 109 110 V_PAVGB ("%%mm5", "(%%" XBX ",%%" XCX ")", "%%mm7", "%[ShiftMask]") 111 "psllq $48, %%mm5\n\t" // left just 1 pixel 112 "movq %%mm6, %%mm7\n\t" // another copy of simple bob pixel 113 "psrlq $16, %%mm7\n\t" // right just 3 pixels 114 "por %%mm7, %%mm5\n\t" // combine 115 // pavgb mm4, mm5 // avg of forward and prev by 1 pixel, use macro 116 V_PAVGB ("%%mm4", "%%mm5", "%%mm5", "%[ShiftMask]") // mm5 gets modified if MMX 117 // pavgb mm6, mm4 // avg of center and surround interp vals, use macro 118 V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]") 119 120 // Don't do any more averaging than needed for mmx. It hurts performance and causes rounding errors. 
121#ifndef IS_MMX 122 // pavgb mm4, mm6 // 1/4 center, 3/4 adjacent 123 V_PAVGB ("%%mm4", "%%mm6", "%%mm7", "%[ShiftMask]") 124 // pavgb mm6, mm4 // 3/8 center, 5/8 adjacent 125 V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]") 126#endif 127 128 // get abs value of possible L2 comb 129 "movq %%mm6, %%mm4\n\t" // work copy of interp val 130 "movq %%mm2, %%mm7\n\t" // L2 131 "psubusb %%mm4, %%mm7\n\t" // L2 - avg 132 "movq %%mm4, %%mm5\n\t" // avg 133 "psubusb %%mm2, %%mm5\n\t" // avg - L2 134 "por %%mm7, %%mm5\n\t" // abs(avg-L2) 135 136 // get abs value of possible L2P comb 137 "movq %%mm0, %%mm7\n\t" // L2P 138 "psubusb %%mm4, %%mm7\n\t" // L2P - avg 139 "psubusb %%mm0, %%mm4\n\t" // avg - L2P 140 "por %%mm7, %%mm4\n\t" // abs(avg-L2P) 141 142 // use L2 or L2P depending upon which makes smaller comb 143 "psubusb %%mm5, %%mm4\n\t" // see if it goes to zero 144 "psubusb %%mm5, %%mm5\n\t" // 0 145 "pcmpeqb %%mm5, %%mm4\n\t" // if (mm4=0) then FF else 0 146 "pcmpeqb %%mm4, %%mm5\n\t" // opposite of mm4 147 148 // if Comb(L2P) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5 = 55 149 "pand %%mm2, %%mm5\n\t" // use L2 if mm5 == ff, else 0 150 "pand %%mm0, %%mm4\n\t" // use L2P if mm4 = ff, else 0 151 "por %%mm5, %%mm4\n\t" // may the best win 152 153 // Inventory: at this point we have the following values: 154 // mm0 = L2P (or L2) 155 // mm1 = L1 156 // mm2 = L2 (or L2P) 157 // mm3 = L3 158 // mm4 = the best of L2,L2P weave pixel, base upon comb 159 // mm6 = the avg interpolated value, if we need to use it 160 // Let's measure movement, as how much the weave pixel has changed 161 162 "movq %%mm2, %%mm7\n\t" 163 "psubusb %%mm0, %%mm2\n\t" 164 "psubusb %%mm7, %%mm0\n\t" 165 "por %%mm2, %%mm0\n\t" // abs value of change, used later 166 167 // Now lets clip our chosen value to be not outside of the range 168 // of the high/low range L1-L3 by more than MaxComb. 
169 // This allows some comb but limits the damages and also allows more 170 // detail than a boring oversmoothed clip. 171 172 "movq %%mm1, %%mm2\n\t" // copy L1 173 // pmaxub mm2, mm3 // use macro 174 V_PMAXUB ("%%mm2", "%%mm3") // now = Max(L1,L3) 175 "movq %%mm1, %%mm5\n\t" // copy L1 176 // pminub mm5, mm3 // now = Min(L1,L3), use macro 177 V_PMINUB ("%%mm5", "%%mm3", "%%mm7") 178 179 // allow the value to be above the high or below the low by amt of MaxComb 180 "psubusb %[MaxComb], %%mm5\n\t" // lower min by diff 181 "paddusb %[MaxComb], %%mm2\n\t" // increase max by diff 182 // pmaxub mm4, mm5 // now = Max(best,Min(L1,L3) use macro 183 V_PMAXUB ("%%mm4", "%%mm5") 184 // pminub mm4, mm2 // now = Min( Max(best, Min(L1,L3), L2 )=L2 clipped 185 V_PMINUB ("%%mm4", "%%mm2", "%%mm7") 186 187 // Blend weave pixel with bob pixel, depending on motion val in mm0 188 "psubusb %[MotionThreshold], %%mm0\n\t" // test Threshold, clear chroma change 189 "pmullw %[MotionSense], %%mm0\n\t" // mul by user factor, keep low 16 bits 190 "movq %[QW256], %%mm7\n\t" 191#ifdef IS_MMXEXT 192 "pminsw %%mm7, %%mm0\n\t" // max = 256 193#else 194 "paddusw %[QW256B], %%mm0\n\t" // add, may sat at fff.. 
195 "psubusw %[QW256B], %%mm0\n\t" // now = Min(L1,256) 196#endif 197 "psubusw %%mm0, %%mm7\n\t" // so the 2 sum to 256, weighted avg 198 "movq %%mm4, %%mm2\n\t" // save weave chroma info before trashing 199 "pand %[YMask], %%mm4\n\t" // keep only luma from calc'd value 200 "pmullw %%mm7, %%mm4\n\t" // use more weave for less motion 201 "pand %[YMask], %%mm6\n\t" // keep only luma from calc'd value 202 "pmullw %%mm0, %%mm6\n\t" // use more bob for large motion 203 "paddusw %%mm6, %%mm4\n\t" // combine 204 "psrlw $8, %%mm4\n\t" // div by 256 to get weighted avg 205 // chroma comes from weave pixel 206 "pand %[UVMask], %%mm2\n\t" // keep chroma 207 "por %%mm4, %%mm2\n\t" // and combine 208 V_MOVNTQ ("(%%" XDI ")", "%%mm2") // move in our clipped best, use macro 209 // bump ptrs and loop 210 LEAX " 8(%%" XAX "), %%" XAX "\n\t" 211 LEAX " 8(%%" XBX "), %%" XBX "\n\t" 212 LEAX " 8(%%" XDX "), %%" XDX "\n\t" 213 LEAX " 8(%%" XDI "), %%" XDI "\n\t" 214 LEAX " 8(%%" XSI "), %%" XSI "\n\t" 215 DECX " %[LoopCtr]\n\t" 216 217 "jg 1b\n\t" // loop if not to last line 218 // note P-III default assumes backward branches taken 219 "jl 1f\n\t" // done 220 MOVX " %%" XAX ", %%" XBX "\n\t" // sharpness lookahead 1 byte only, be wrong on 1 221 "jmp 1b\n\t" 222 223 "1:\n\t" 224 MOVX " %[oldbx], %%" XBX "\n\t" 225 "emms\n\t": /* no outputs */ 226 227 :[LastAvg] "m" (LastAvg), 228 [L1] "m" (L1), 229 [L3] "m" (L3), 230 [L2P] "m" (L2P), 231 [L2] "m" (L2), 232 [Dest] "m" (Dest), 233 [ShiftMask] "m" (ShiftMask), 234 [MaxComb] "m" (MaxComb), 235 [MotionThreshold] "m" (MotionThreshold), 236 [MotionSense] "m" (MotionSense), 237 [QW256B] "m" (QW256B), 238 [YMask] "m" (YMask), 239 [UVMask] "m" (UVMask), 240 [LoopCtr] "m" (LoopCtr), 241 [QW256] "m" (QW256), 242 [oldbx] "m" (oldbx) 243 : XAX, XCX, XDX, XSI, XDI, 244 "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)", 245#ifdef __MMX__ 246 "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", 247#endif 248 "memory", "cc"); 249} 
250 251static void 252FUNCT_NAME_UYVY (GstDeinterlaceMethodGreedyH *self, const guint8 * L1, const guint8 * L2, const guint8 * L3, const guint8 * L2P, guint8 * Dest, gint width) 253{ 254 255 // in tight loop some vars are accessed faster in local storage 256 gint64 YMask = 0xff00ff00ff00ff00ull; // to keep only luma 257 gint64 UVMask = 0x00ff00ff00ff00ffull; // to keep only chroma 258 gint64 ShiftMask = 0xfefefefefefefefeull; // to avoid shifting chroma to luma 259 gint64 QW256 = 0x0100010001000100ull; // 4 256's 260 gint64 MaxComb; 261 gint64 MotionThreshold; 262 gint64 MotionSense; 263 gint64 i; 264 glong LoopCtr; 265 glong oldbx = 0; 266 267 gint64 QW256B; 268 gint64 LastAvg = 0; //interp value from left qword 269 270 // FIXME: Use C implementation if the width is not a multiple of 4 271 // Do something more optimal later 272 if (width % 4 != 0) 273 C_FUNCT_UYVY (self, L1, L2, L3, L2P, Dest, width); 274 275 // Set up our two parms that are actually evaluated for each pixel 276 i = self->max_comb; 277 MaxComb = 278 i << 56 | i << 48 | i << 40 | i << 32 | i << 24 | i << 16 | i << 8 | i; 279 280 i = self->motion_threshold; // scale to range of 0-257 281 MotionThreshold = i << 48 | i << 32 | i << 16 | i | UVMask; 282 283 i = self->motion_sense; // scale to range of 0-257 284 MotionSense = i << 48 | i << 32 | i << 16 | i; 285 286 i = 0xffffffff - 256; 287 QW256B = i << 48 | i << 32 | i << 16 | i; // save a couple instr on PMINSW instruct. 288 289 LoopCtr = width / 8 - 1; // there are LineLength / 4 qwords per line but do 1 less, adj at end of loop 290 291 // For ease of reading, the comments below assume that we're operating on an odd 292 // field (i.e., that InfoIsOdd is true). Assume the obvious for even lines.. 
293 __asm__ __volatile__ ( 294 // save ebx (-fPIC) 295 MOVX " %%" XBX ", %[oldbx]\n\t" 296 MOVX " %[L1], %%" XAX "\n\t" 297 LEAX " 8(%%" XAX "), %%" XBX "\n\t" // next qword needed by DJR 298 MOVX " %[L3], %%" XCX "\n\t" 299 SUBX " %%" XAX ", %%" XCX "\n\t" // carry L3 addr as an offset 300 MOVX " %[L2P], %%" XDX "\n\t" 301 MOVX " %[L2], %%" XSI "\n\t" 302 MOVX " %[Dest], %%" XDI "\n\t" // DL1 if Odd or DL2 if Even 303 304 ".align 8\n\t" 305 "1:\n\t" 306 "movq (%%" XSI "), %%mm0\n\t" // L2 - the newest weave pixel value 307 "movq (%%" XAX "), %%mm1\n\t" // L1 - the top pixel 308 "movq (%%" XDX "), %%mm2\n\t" // L2P - the prev weave pixel 309 "movq (%%" XAX ", %%" XCX "), %%mm3\n\t" // L3, next odd row 310 "movq %%mm1, %%mm6\n\t" // L1 - get simple single pixel interp 311 312 // pavgb mm6, mm3 // use macro below 313 V_PAVGB ("%%mm6", "%%mm3", "%%mm4", "%[ShiftMask]") 314 315 // DJR - Diagonal Jaggie Reduction 316 // In the event that we are going to use an average (Bob) pixel we do not want a jagged 317 // stair step effect. To combat this we avg in the 2 horizontally adjacen pixels into the 318 // interpolated Bob mix. This will do horizontal smoothing for only the Bob'd pixels. 
319 320 "movq %[LastAvg], %%mm4\n\t" // the bob value from prev qword in row 321 "movq %%mm6, %[LastAvg]\n\t" // save for next pass 322 "psrlq $48, %%mm4\n\t" // right justify 1 pixel 323 "movq %%mm6, %%mm7\n\t" // copy of simple bob pixel 324 "psllq $16, %%mm7\n\t" // left justify 3 pixels 325 "por %%mm7, %%mm4\n\t" // and combine 326 "movq (%%" XBX "), %%mm5\n\t" // next horiz qword from L1 327 // pavgb mm5, qword ptr[ebx+ecx] // next horiz qword from L3, use macro below 328 329 V_PAVGB ("%%mm5", "(%%" XBX ",%%" XCX ")", "%%mm7", "%[ShiftMask]") 330 "psllq $48, %%mm5\n\t" // left just 1 pixel 331 "movq %%mm6, %%mm7\n\t" // another copy of simple bob pixel 332 "psrlq $16, %%mm7\n\t" // right just 3 pixels 333 "por %%mm7, %%mm5\n\t" // combine 334 // pavgb mm4, mm5 // avg of forward and prev by 1 pixel, use macro 335 V_PAVGB ("%%mm4", "%%mm5", "%%mm5", "%[ShiftMask]") // mm5 gets modified if MMX 336 // pavgb mm6, mm4 // avg of center and surround interp vals, use macro 337 V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]") 338 339 // Don't do any more averaging than needed for mmx. It hurts performance and causes rounding errors. 
340#ifndef IS_MMX 341 // pavgb mm4, mm6 // 1/4 center, 3/4 adjacent 342 V_PAVGB ("%%mm4", "%%mm6", "%%mm7", "%[ShiftMask]") 343 // pavgb mm6, mm4 // 3/8 center, 5/8 adjacent 344 V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]") 345#endif 346 347 // get abs value of possible L2 comb 348 "movq %%mm6, %%mm4\n\t" // work copy of interp val 349 "movq %%mm2, %%mm7\n\t" // L2 350 "psubusb %%mm4, %%mm7\n\t" // L2 - avg 351 "movq %%mm4, %%mm5\n\t" // avg 352 "psubusb %%mm2, %%mm5\n\t" // avg - L2 353 "por %%mm7, %%mm5\n\t" // abs(avg-L2) 354 355 // get abs value of possible L2P comb 356 "movq %%mm0, %%mm7\n\t" // L2P 357 "psubusb %%mm4, %%mm7\n\t" // L2P - avg 358 "psubusb %%mm0, %%mm4\n\t" // avg - L2P 359 "por %%mm7, %%mm4\n\t" // abs(avg-L2P) 360 361 // use L2 or L2P depending upon which makes smaller comb 362 "psubusb %%mm5, %%mm4\n\t" // see if it goes to zero 363 "psubusb %%mm5, %%mm5\n\t" // 0 364 "pcmpeqb %%mm5, %%mm4\n\t" // if (mm4=0) then FF else 0 365 "pcmpeqb %%mm4, %%mm5\n\t" // opposite of mm4 366 367 // if Comb(L2P) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5 = 55 368 "pand %%mm2, %%mm5\n\t" // use L2 if mm5 == ff, else 0 369 "pand %%mm0, %%mm4\n\t" // use L2P if mm4 = ff, else 0 370 "por %%mm5, %%mm4\n\t" // may the best win 371 372 // Inventory: at this point we have the following values: 373 // mm0 = L2P (or L2) 374 // mm1 = L1 375 // mm2 = L2 (or L2P) 376 // mm3 = L3 377 // mm4 = the best of L2,L2P weave pixel, base upon comb 378 // mm6 = the avg interpolated value, if we need to use it 379 // Let's measure movement, as how much the weave pixel has changed 380 381 "movq %%mm2, %%mm7\n\t" 382 "psubusb %%mm0, %%mm2\n\t" 383 "psubusb %%mm7, %%mm0\n\t" 384 "por %%mm2, %%mm0\n\t" // abs value of change, used later 385 386 // Now lets clip our chosen value to be not outside of the range 387 // of the high/low range L1-L3 by more than MaxComb. 
388 // This allows some comb but limits the damages and also allows more 389 // detail than a boring oversmoothed clip. 390 391 "movq %%mm1, %%mm2\n\t" // copy L1 392 // pmaxub mm2, mm3 // use macro 393 V_PMAXUB ("%%mm2", "%%mm3") // now = Max(L1,L3) 394 "movq %%mm1, %%mm5\n\t" // copy L1 395 // pminub mm5, mm3 // now = Min(L1,L3), use macro 396 V_PMINUB ("%%mm5", "%%mm3", "%%mm7") 397 398 // allow the value to be above the high or below the low by amt of MaxComb 399 "psubusb %[MaxComb], %%mm5\n\t" // lower min by diff 400 "paddusb %[MaxComb], %%mm2\n\t" // increase max by diff 401 // pmaxub mm4, mm5 // now = Max(best,Min(L1,L3) use macro 402 V_PMAXUB ("%%mm4", "%%mm5") 403 // pminub mm4, mm2 // now = Min( Max(best, Min(L1,L3), L2 )=L2 clipped 404 V_PMINUB ("%%mm4", "%%mm2", "%%mm7") 405 406 // Blend weave pixel with bob pixel, depending on motion val in mm0 407 "psubusb %[MotionThreshold], %%mm0\n\t" // test Threshold, clear chroma change 408 "psrlw $8, %%mm0\n\t" // div by 256 to get weighted avg 409 "pmullw %[MotionSense], %%mm0\n\t" // mul by user factor, keep low 16 bits 410 "movq %[QW256], %%mm7\n\t" 411#ifdef IS_MMXEXT 412 "pminsw %%mm7, %%mm0\n\t" // max = 256 413#else 414 "paddusw %[QW256B], %%mm0\n\t" // add, may sat at fff.. 
415 "psubusw %[QW256B], %%mm0\n\t" // now = Min(L1,256) 416#endif 417 "psubusw %%mm0, %%mm7\n\t" // so the 2 sum to 256, weighted avg 418 "movq %%mm4, %%mm2\n\t" // save weave chroma info before trashing 419 "pand %[YMask], %%mm4\n\t" // keep only luma from calc'd value 420 "psrlw $8, %%mm4\n\t" // div by 256 to get weighted avg 421 "pmullw %%mm7, %%mm4\n\t" // use more weave for less motion 422 "pand %[YMask], %%mm6\n\t" // keep only luma from calc'd value 423 "psrlw $8, %%mm6\n\t" // div by 256 to get weighted avg 424 "pmullw %%mm0, %%mm6\n\t" // use more bob for large motion 425 "paddusw %%mm6, %%mm4\n\t" // combine 426 "pand %[YMask], %%mm4\n\t" // keep only luma from calc'd value 427 // chroma comes from weave pixel 428 "pand %[UVMask], %%mm2\n\t" // keep chroma 429 "por %%mm4, %%mm2\n\t" // and combine 430 V_MOVNTQ ("(%%" XDI ")", "%%mm2") // move in our clipped best, use macro 431 // bump ptrs and loop 432 LEAX " 8(%%" XAX "), %%" XAX "\n\t" 433 LEAX " 8(%%" XBX "), %%" XBX "\n\t" 434 LEAX " 8(%%" XDX "), %%" XDX "\n\t" 435 LEAX " 8(%%" XDI "), %%" XDI "\n\t" 436 LEAX " 8(%%" XSI "), %%" XSI "\n\t" 437 DECX " %[LoopCtr]\n\t" 438 439 "jg 1b\n\t" // loop if not to last line 440 // note P-III default assumes backward branches taken 441 "jl 1f\n\t" // done 442 MOVX " %%" XAX ", %%" XBX "\n\t" // sharpness lookahead 1 byte only, be wrong on 1 443 "jmp 1b\n\t" 444 445 "1:\n\t" 446 MOVX " %[oldbx], %%" XBX "\n\t" 447 "emms\n\t": /* no outputs */ 448 449 :[LastAvg] "m" (LastAvg), 450 [L1] "m" (L1), 451 [L3] "m" (L3), 452 [L2P] "m" (L2P), 453 [L2] "m" (L2), 454 [Dest] "m" (Dest), 455 [ShiftMask] "m" (ShiftMask), 456 [MaxComb] "m" (MaxComb), 457 [MotionThreshold] "m" (MotionThreshold), 458 [MotionSense] "m" (MotionSense), 459 [QW256B] "m" (QW256B), 460 [YMask] "m" (YMask), 461 [UVMask] "m" (UVMask), 462 [LoopCtr] "m" (LoopCtr), 463 [QW256] "m" (QW256), 464 [oldbx] "m" (oldbx) 465 : XAX, XCX, XDX, XSI, XDI, 466 "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", 
"st(6)", "st(7)", 467#ifdef __MMX__ 468 "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", 469#endif 470 "memory", "cc"); 471} 472 473