1// -*- c++ -*- 2 3unsigned char* pDest; 4const unsigned char* pSrcP; 5const unsigned char* pSrc; 6const unsigned char* pBob; 7const unsigned char* pBobP; 8 9// long is int32 on ARCH_368, int64 on ARCH_AMD64. Declaring it this way 10// saves a lot of xor's to delete 64bit garbage. 11 12#if defined(DBL_RESIZE) || defined(USE_FOR_DSCALER) 13long src_pitch2 = src_pitch; // even & odd lines are not interleaved in DScaler 14#else 15long src_pitch2 = 2 * src_pitch; // even & odd lines are interleaved in Avisynth 16#endif 17 18 19long dst_pitch2 = 2 * dst_pitch; 20long y; 21 22long Last8; 23 24// XXX; silence unused-but-set warnings turned into errors with Werror 25(void) pSrc; 26(void) pSrcP; 27(void) pBob; 28(void) pBobP; 29 30 pSrc = pWeaveSrc; // points 1 weave line above 31 pSrcP = pWeaveSrcP; // " 32 33#ifdef DBL_RESIZE 34 35#ifdef USE_VERTICAL_FILTER 36 pDest = pWeaveDest + dst_pitch2; 37#else 38 pDest = pWeaveDest + 3*dst_pitch; 39#endif 40 41#else 42 43#ifdef USE_VERTICAL_FILTER 44 pDest = pWeaveDest + dst_pitch; 45#else 46 pDest = pWeaveDest + dst_pitch2; 47#endif 48 49#endif 50 51 if (TopFirst) 52 { 53 pBob = pCopySrc + src_pitch2; // remember one weave line just copied previously 54 pBobP = pCopySrcP + src_pitch2; 55 } 56 else 57 { 58 pBob = pCopySrc; 59 pBobP = pCopySrcP; 60 } 61 62#ifndef IS_C 63 64#ifndef _pBob 65#define _pBob "%0" 66#define _src_pitch2 "%1" 67#define _ShiftMask "%2" 68#define _pDest "%3" 69#define _dst_pitchw "%4" 70#define _Last8 "%5" 71#define _pSrc "%6" 72#define _pSrcP "%7" 73#define _pBobP "%8" 74#define _DiffThres "%9" 75#define _Min_Vals "%10" 76#define _Max_Vals "%11" 77#define _FOURS "%12" 78#define _TENS "%13" 79#define _ONES "%14" 80#define _UVMask "%15" 81#define _Max_Mov "%16" 82#define _YMask "%17" 83#define _oldbx "%18" 84#endif 85 Last8 = (rowsize-8); 86 87 for (y=1; y < FldHeight-1; y++) 88 { 89 long dst_pitchw = dst_pitch; // local stor so asm can ref 90 int64_t Max_Mov = 0x0404040404040404ull; 91 int64_t DiffThres = 0x0f0f0f0f0f0f0f0full; 92 int64_t YMask = 0x00ff00ff00ff00ffull; // keeps only luma 93 int64_t UVMask = 0xff00ff00ff00ff00ull; // keeps only chroma 94 int64_t TENS = 0x0a0a0a0a0a0a0a0aull; 95 int64_t FOURS = 0x0404040404040404ull; 96 int64_t ONES = 0x0101010101010101ull; 97 int64_t Min_Vals = 0x0000000000000000ull; 98 int64_t Max_Vals = 0x0000000000000000ull; 99 int64_t ShiftMask = 0xfefffefffefffeffull; 100 101 long oldbx = 0; 102 103 // pretend it's indented -->> 104 __asm__ __volatile__ 105 ( 106 // Loop general reg usage 107 // 108 // XAX - pBobP, then pDest 109 // XBX - pBob 110 // XCX - src_pitch2 111 // XDX - current offset 112 // XDI - prev weave pixels, 1 line up 113 // XSI - next weave pixels, 1 line up 114 115 // Save "XBX" (-fPIC) 116 MOVX" %%"XBX", "_oldbx"\n\t" 117 118 // simple bob first 8 bytes 119 MOVX" "_pBob", %%"XBX"\n\t" 120 MOVX" "_src_pitch2", %%"XCX"\n\t" 121 122#ifdef USE_VERTICAL_FILTER 123 "movq (%%"XBX"), %%mm0\n\t" 124 "movq (%%"XBX", %%"XCX"), %%mm1\n\t" //, qword ptr["XBX"+"XCX"] 125 "movq %%mm0, %%mm2\n\t" 126 V_PAVGB ("%%mm2", "%%mm1", "%%mm3", _ShiftMask) // halfway between 127 V_PAVGB ("%%mm0", "%%mm2", "%%mm3", _ShiftMask) // 1/4 way 128 V_PAVGB ("%%mm1", "%%mm2", "%%mm3", _ShiftMask) // 3/4 way 129 MOVX" "_pDest", %%"XDI"\n\t" 130 MOVX" "_dst_pitchw", %%"XAX"\n\t" 131 V_MOVNTQ ("(%%"XDI")", "%%mm0") 132 V_MOVNTQ ("(%%"XDI", %%"XAX")", "%%mm1") // qword ptr["XDI"+"XAX"], mm1 133 134 // simple bob last 8 bytes 135 MOVX" "_Last8", %%"XDX"\n\t" 136 LEAX" (%%"XBX", %%"XDX"), %%"XSI"\n\t" // ["XBX"+"XDX"] 137 "movq (%%"XSI"), %%mm0\n\t" 138 "movq (%%"XSI", %%"XCX"), %%mm1\n\t" // qword ptr["XSI"+"XCX"] 139 "movq %%mm0, %%mm2\n\t" 140 V_PAVGB ("%%mm2", "%%mm1", "%%mm3", _ShiftMask) // halfway between 141 V_PAVGB ("%%mm0", "%%mm2", "%%mm3", _ShiftMask) // 1/4 way 142 V_PAVGB ("%%mm1", "%%mm2", "%%mm3", _ShiftMask) // 3/4 way 143 ADDX" %%"XDX", %%"XDI"\n\t" // last 8 bytes of dest 144 V_MOVNTQ ("%%"XDI"", "%%mm0") 145 V_MOVNTQ ("(%%"XDI", %%"XAX")", "%%mm1") // qword ptr["XDI"+"XAX"], mm1) 146 147#else 148 "movq (%%"XBX"), %%mm0\n\t" 149 // pavgb mm0, qword ptr["XBX"+"XCX"] 150 V_PAVGB ("%%mm0", "(%%"XBX", %%"XCX")", "%%mm2", _ShiftMask) // qword ptr["XBX"+"XCX"], mm2, ShiftMask) 151 MOVX" "_pDest", %%"XDI"\n\t" 152 V_MOVNTQ ("(%%"XDI")", "%%mm0") 153 154 // simple bob last 8 bytes 155 MOVX" "_Last8", %%"XDX"\n\t" 156 LEAX" (%%"XBX", %%"XDX"), %%"XSI"\n\t" //"XSI", ["XBX"+"XDX"] 157 "movq (%%"XSI"), %%mm0\n\t" 158 // pavgb mm0, qword ptr["XSI"+"XCX"] 159 V_PAVGB ("%%mm0", "(%%"XSI", %%"XCX")", "%%mm2", _ShiftMask) // qword ptr["XSI"+"XCX"], mm2, ShiftMask) 160 V_MOVNTQ ("(%%"XDI", %%"XDX")", "%%mm0") // qword ptr["XDI"+"XDX"], mm0) 161#endif 162 // now loop and get the middle qwords 163 MOVX" "_pSrc", %%"XSI"\n\t" 164 MOVX" "_pSrcP", %%"XDI"\n\t" 165 MOVX" $8, %%"XDX"\n\t" // curr offset longo all lines 166 167 "1:\n\t" 168 MOVX" "_pBobP", %%"XAX"\n\t" 169 ADDX" $8, %%"XDI"\n\t" 170 ADDX" $8, %%"XSI"\n\t" 171 ADDX" $8, %%"XBX"\n\t" 172 ADDX" %%"XDX", %%"XAX"\n\t" 173 174#ifdef USE_STRANGE_BOB 175#include "StrangeBob.inc" 176#else 177#include "WierdBob.inc" 178#endif 179 180 // For non-SSE2: 181 // through out most of the rest of this loop we will maintain 182 // mm4 our min bob value 183 // mm5 best weave pixels so far 184 // mm6 our max Bob value 185 // mm7 best weighted pixel ratings so far 186 187 // We will keep a slight bias to using the weave pixels 188 // from the current location, by rating them by the min distance 189 // from the Bob value instead of the avg distance from that value. 190 // our best and only rating so far 191 "pcmpeqb %%mm7, %%mm7\n\t" // ffff, say we didn't find anything good yet 192 193#else 194 Last8 = (rowsize - 4); 195 196 for (y=1; y < FldHeight-1; y++) 197 { 198 #ifdef USE_STRANGE_BOB 199 long DiffThres = 0x0f; 200 #endif 201 202 #ifndef SKIP_SEARCH 203 long weave[2], MaxVals[2], MinVals[2]; 204 #endif 205 206 long diff[2], best[2], avg[2], diff2[2], out[2], x; 207 208#ifdef USE_VERTICAL_FILTER 209 pDest[0] = (3 * pBob[0] + pBob[src_pitch2]) / 4; 210 pDest[1] = (3 * pBob[1] + pBob[src_pitch2 + 1]) / 4; 211 pDest[2] = (3 * pBob[2] + pBob[src_pitch2 + 2]) / 4; 212 pDest[3] = (3 * pBob[3] + pBob[src_pitch2 + 3]) / 4; 213 pDest[dst_pitchw] = (pBob[0] + 3 * pBob[src_pitch2]) / 4; 214 pDest[dst_pitchw + 1] = (pBob[1] + 3 * pBob[src_pitch2 + 1]) / 4; 215 pDest[dst_pitchw + 2] = (pBob[2] + 3 * pBob[src_pitch2 + 2]) / 4; 216 pDest[dst_pitchw + 3] = (pBob[3] + 3 * pBob[src_pitch2 + 3]) / 4; 217 218 // simple bob last byte 219 pDest[Last8] = (3 * pBob[Last8] + pBob[Last8 + src_pitch2]) / 4; 220 pDest[Last8 + 1] = (3 * pBob[Last8 + 1] + pBob[Last8 + src_pitch2 + 1]) / 4; 221 pDest[Last8 + 2] = (3 * pBob[Last8 + 2] + pBob[Last8 + src_pitch2 + 2]) / 4; 222 pDest[Last8 + 3] = (3 * pBob[Last8 + 3] + pBob[Last8 + src_pitch2 + 3]) / 4; 223 pDest[Last8 + src_pitch2] = (pBob[Last8] + 3 * pBob[Last8 + src_pitch2]) / 4; 224 pDest[Last8 + src_pitch2 + 1] = (pBob[Last8 + 1] + 3 * pBob[Last8 + src_pitch2 + 1]) / 4; 225 pDest[Last8 + src_pitch2 + 2] = (pBob[Last8 + 2] + 3 * pBob[Last8 + src_pitch2 + 2]) / 4; 226 pDest[Last8 + src_pitch2 + 3] = (pBob[Last8 + 3] + 3 * pBob[Last8 + src_pitch2 + 3]) / 4; 227#else 228 pDest[0] = (pBob[0] + pBob[src_pitch2 + 1]) / 2; 229 pDest[1] = (pBob[1] + pBob[src_pitch2 + 1]) / 2; 230 pDest[2] = (pBob[2] + pBob[src_pitch2 + 2]) / 2; 231 pDest[3] = (pBob[3] + pBob[src_pitch2 + 3]) / 2; 232 233 // simple bob last byte 234 pDest[Last8] = (pBob[Last8] + pBob[Last8 + src_pitch2]) / 2; 235 pDest[Last8 + 1] = (pBob[Last8 + 1] + pBob[Last8 + src_pitch2 + 1]) / 2; 236 pDest[Last8 + 2] = (pBob[Last8 + 2] + pBob[Last8 + src_pitch2 + 2]) / 2; 237 pDest[Last8 + 3] = (pBob[Last8 + 3] + pBob[Last8 + src_pitch2 + 3]) / 2; 238#endif 239 240 pBob += 4; 241 pBobP += 4; 242 pSrc += 4; 243 pSrcP += 4; 244 245 for (x=4; x < Last8; x += 2) { 246 247#ifdef USE_STRANGE_BOB 248#include "StrangeBob.inc" 249#else 250#include "WierdBob.inc" 251#endif 252 253 // We will keep a slight bias to using the weave pixels 254 // from the current location, by rating them by the min distance 255 // from the Bob value instead of the avg distance from that value. 256 // our best and only rating so far 257 diff[0] = diff[1] = 255; 258 259 260#endif 261