1 #include <string.h> 2 #include <math.h> 3 4 // Define a few macros for CPU dependent instructions. 5 // I suspect I don't really understand how the C macro preprocessor works but 6 // this seems to get the job done. // TRB 7/01 7 8 // BEFORE USING THESE YOU MUST SET: 9 10 // #define SIMD_TYPE MMXEXT (or MMX or 3DNOW) 11 12 // some macros for pavgb instruction 13 // V_PAVGB(mmr1, mmr2, mmr work register, smask) mmr2 may = mmrw if you can trash it 14 15 #define V_PAVGB_MMX(mmr1, mmr2, mmrw, smask) \ 16 "movq "mmr2", "mmrw"\n\t" \ 17 "pand "smask", "mmrw"\n\t" \ 18 "psrlw $1, "mmrw"\n\t" \ 19 "pand "smask", "mmr1"\n\t" \ 20 "psrlw $1, "mmr1"\n\t" \ 21 "paddusb "mmrw", "mmr1"\n\t" 22 #define V_PAVGB_MMXEXT(mmr1, mmr2, mmrw, smask) "pavgb "mmr2", "mmr1"\n\t" 23 #define V_PAVGB_3DNOW(mmr1, mmr2, mmrw, smask) "pavgusb "mmr2", "mmr1"\n\t" 24 #define V_PAVGB(mmr1, mmr2, mmrw, smask) V_PAVGB2(mmr1, mmr2, mmrw, smask, SIMD_TYPE) 25 #define V_PAVGB2(mmr1, mmr2, mmrw, smask, simd_type) V_PAVGB3(mmr1, mmr2, mmrw, smask, simd_type) 26 #define V_PAVGB3(mmr1, mmr2, mmrw, smask, simd_type) V_PAVGB_##simd_type(mmr1, mmr2, mmrw, smask) 27 28 // some macros for pmaxub instruction 29 #define V_PMAXUB_MMX(mmr1, mmr2) \ 30 "psubusb "mmr2", "mmr1"\n\t" \ 31 "paddusb "mmr2", "mmr1"\n\t" 32 #define V_PMAXUB_MMXEXT(mmr1, mmr2) "pmaxub "mmr2", "mmr1"\n\t" 33 #define V_PMAXUB_3DNOW(mmr1, mmr2) V_PMAXUB_MMX(mmr1, mmr2) // use MMX version 34 #define V_PMAXUB(mmr1, mmr2) V_PMAXUB2(mmr1, mmr2, SIMD_TYPE) 35 #define V_PMAXUB2(mmr1, mmr2, simd_type) V_PMAXUB3(mmr1, mmr2, simd_type) 36 #define V_PMAXUB3(mmr1, mmr2, simd_type) V_PMAXUB_##simd_type(mmr1, mmr2) 37 38 // some macros for pminub instruction 39 // V_PMINUB(mmr1, mmr2, mmr work register) mmr2 may NOT = mmrw 40 #define V_PMINUB_MMX(mmr1, mmr2, mmrw) \ 41 "pcmpeqb "mmrw", "mmrw"\n\t" \ 42 "psubusb "mmr2", "mmrw"\n\t" \ 43 "paddusb "mmrw", "mmr1"\n\t" \ 44 "psubusb "mmrw", "mmr1"\n\t" 45 #define V_PMINUB_MMXEXT(mmr1, mmr2, mmrw) "pminub "mmr2", "mmr1"\n\t" 46 #define V_PMINUB_3DNOW(mmr1, mmr2, mmrw) V_PMINUB_MMX(mmr1, mmr2, mmrw) // use MMX version 47 #define V_PMINUB(mmr1, mmr2, mmrw) V_PMINUB2(mmr1, mmr2, mmrw, SIMD_TYPE) 48 #define V_PMINUB2(mmr1, mmr2, mmrw, simd_type) V_PMINUB3(mmr1, mmr2, mmrw, simd_type) 49 #define V_PMINUB3(mmr1, mmr2, mmrw, simd_type) V_PMINUB_##simd_type(mmr1, mmr2, mmrw) 50 51 // some macros for movntq instruction 52 // V_MOVNTQ(mmr1, mmr2) 53 #define V_MOVNTQ_MMX(mmr1, mmr2) "movq "mmr2", "mmr1"\n\t" 54 #define V_MOVNTQ_3DNOW(mmr1, mmr2) "movq "mmr2", "mmr1"\n\t" 55 #define V_MOVNTQ_MMXEXT(mmr1, mmr2) "movntq "mmr2", "mmr1"\n\t" 56 #define V_MOVNTQ(mmr1, mmr2) V_MOVNTQ2(mmr1, mmr2, SIMD_TYPE) 57 #define V_MOVNTQ2(mmr1, mmr2, simd_type) V_MOVNTQ3(mmr1, mmr2, simd_type) 58 #define V_MOVNTQ3(mmr1, mmr2, simd_type) V_MOVNTQ_##simd_type(mmr1, mmr2) 59 60 // end of macros 61 62 #ifdef IS_SSE2 63 64 #define MERGE4PIXavg(PADDR1, PADDR2) \ 65 "movdqu "PADDR1", %%xmm0\n\t" /* our 4 pixels */ \ 66 "movdqu "PADDR2", %%xmm1\n\t" /* our pixel2 value */ \ 67 "movdqa %%xmm0, %%xmm2\n\t" /* another copy of our pixel1 value */ \ 68 "movdqa %%xmm1, %%xmm3\n\t" /* another copy of our pixel1 value */ \ 69 "psubusb %%xmm1, %%xmm2\n\t" \ 70 "psubusb %%xmm0, %%xmm3\n\t" \ 71 "por %%xmm3, %%xmm2\n\t" \ 72 "pavgb %%xmm1, %%xmm0\n\t" /* avg of 2 pixels */ \ 73 "movdqa %%xmm2, %%xmm3\n\t" /* another copy of our our weights */ \ 74 "pxor %%xmm1, %%xmm1\n\t" \ 75 "psubusb %%xmm7, %%xmm3\n\t" /* nonzero where old weights lower, else 0 */ \ 76 "pcmpeqb %%xmm1, %%xmm3\n\t" /* now ff where new better, else 00 */ \ 77 "pcmpeqb %%xmm3, %%xmm1\n\t" /* here ff where old better, else 00 */ \ 78 "pand %%xmm3, %%xmm0\n\t" /* keep only better new pixels */ \ 79 "pand %%xmm3, %%xmm2\n\t" /* and weights */ \ 80 "pand %%xmm1, %%xmm5\n\t" /* keep only better old pixels */ \ 81 "pand %%xmm1, %%xmm7\n\t" \ 82 "por %%xmm0, %%xmm5\n\t" /* and merge new & old vals */ \ 83 "por %%xmm2, %%xmm7\n\t" 84 85 #define MERGE4PIXavgH(PADDR1A, PADDR1B, PADDR2A, PADDR2B) \ 86 "movdqu "PADDR1A", %%xmm0\n\t" /* our 4 pixels */ \ 87 "movdqu "PADDR2A", %%xmm1\n\t" /* our pixel2 value */ \ 88 "movdqu "PADDR1B", %%xmm2\n\t" /* our 4 pixels */ \ 89 "movdqu "PADDR2B", %%xmm3\n\t" /* our pixel2 value */ \ 90 "pavgb %%xmm2, %%xmm0\n\t" \ 91 "pavgb %%xmm3, %%xmm1\n\t" \ 92 "movdqa %%xmm0, %%xmm2\n\t" /* another copy of our pixel1 value */ \ 93 "movdqa %%xmm1, %%xmm3\n\t" /* another copy of our pixel1 value */ \ 94 "psubusb %%xmm1, %%xmm2\n\t" \ 95 "psubusb %%xmm0, %%xmm3\n\t" \ 96 "por %%xmm3, %%xmm2\n\t" \ 97 "pavgb %%xmm1, %%xmm0\n\t" /* avg of 2 pixels */ \ 98 "movdqa %%xmm2, %%xmm3\n\t" /* another copy of our our weights */ \ 99 "pxor %%xmm1, %%xmm1\n\t" \ 100 "psubusb %%xmm7, %%xmm3\n\t" /* nonzero where old weights lower, else 0 */ \ 101 "pcmpeqb %%xmm1, %%xmm3\n\t" /* now ff where new better, else 00 */ \ 102 "pcmpeqb %%xmm3, %%xmm1\n\t" /* here ff where old better, else 00 */ \ 103 "pand %%xmm3, %%xmm0\n\t" /* keep only better new pixels */ \ 104 "pand %%xmm3, %%xmm2\n\t" /* and weights */ \ 105 "pand %%xmm1, %%xmm5\n\t" /* keep only better old pixels */ \ 106 "pand %%xmm1, %%xmm7\n\t" \ 107 "por %%xmm0, %%xmm5\n\t" /* and merge new & old vals */ \ 108 "por %%xmm2, %%xmm7\n\t" 109 110 #define RESET_CHROMA "por "_UVMask", %%xmm7\n\t" 111 112 #else // ifdef IS_SSE2 113 114 #define MERGE4PIXavg(PADDR1, PADDR2) \ 115 "movq "PADDR1", %%mm0\n\t" /* our 4 pixels */ \ 116 "movq "PADDR2", %%mm1\n\t" /* our pixel2 value */ \ 117 "movq %%mm0, %%mm2\n\t" /* another copy of our pixel1 value */ \ 118 "movq %%mm1, %%mm3\n\t" /* another copy of our pixel1 value */ \ 119 "psubusb %%mm1, %%mm2\n\t" \ 120 "psubusb %%mm0, %%mm3\n\t" \ 121 "por %%mm3, %%mm2\n\t" \ 122 V_PAVGB ("%%mm0", "%%mm1", "%%mm3", _ShiftMask) /* avg of 2 pixels */ \ 123 "movq %%mm2, %%mm3\n\t" /* another copy of our our weights */ \ 124 "pxor %%mm1, %%mm1\n\t" \ 125 "psubusb %%mm7, %%mm3\n\t" /* nonzero where old weights lower, else 0 */ \ 126 "pcmpeqb %%mm1, %%mm3\n\t" /* now ff where new better, else 00 */ \ 127 "pcmpeqb %%mm3, %%mm1\n\t" /* here ff where old better, else 00 */ \ 128 "pand %%mm3, %%mm0\n\t" /* keep only better new pixels */ \ 129 "pand %%mm3, %%mm2\n\t" /* and weights */ \ 130 "pand %%mm1, %%mm5\n\t" /* keep only better old pixels */ \ 131 "pand %%mm1, %%mm7\n\t" \ 132 "por %%mm0, %%mm5\n\t" /* and merge new & old vals */ \ 133 "por %%mm2, %%mm7\n\t" 134 135 #define MERGE4PIXavgH(PADDR1A, PADDR1B, PADDR2A, PADDR2B) \ 136 "movq "PADDR1A", %%mm0\n\t" /* our 4 pixels */ \ 137 "movq "PADDR2A", %%mm1\n\t" /* our pixel2 value */ \ 138 "movq "PADDR1B", %%mm2\n\t" /* our 4 pixels */ \ 139 "movq "PADDR2B", %%mm3\n\t" /* our pixel2 value */ \ 140 V_PAVGB("%%mm0", "%%mm2", "%%mm2", _ShiftMask) \ 141 V_PAVGB("%%mm1", "%%mm3", "%%mm3", _ShiftMask) \ 142 "movq %%mm0, %%mm2\n\t" /* another copy of our pixel1 value */ \ 143 "movq %%mm1, %%mm3\n\t" /* another copy of our pixel1 value */ \ 144 "psubusb %%mm1, %%mm2\n\t" \ 145 "psubusb %%mm0, %%mm3\n\t" \ 146 "por %%mm3, %%mm2\n\t" \ 147 V_PAVGB("%%mm0", "%%mm1", "%%mm3", _ShiftMask) /* avg of 2 pixels */ \ 148 "movq %%mm2, %%mm3\n\t" /* another copy of our our weights */ \ 149 "pxor %%mm1, %%mm1\n\t" \ 150 "psubusb %%mm7, %%mm3\n\t" /* nonzero where old weights lower, else 0 */ \ 151 "pcmpeqb %%mm1, %%mm3\n\t" /* now ff where new better, else 00 */ \ 152 "pcmpeqb %%mm3, %%mm1\n\t" /* here ff where old better, else 00 */ \ 153 "pand %%mm3, %%mm0\n\t" /* keep only better new pixels */ \ 154 "pand %%mm3, %%mm2\n\t" /* and weights */ \ 155 "pand %%mm1, %%mm5\n\t" /* keep only better old pixels */ \ 156 "pand %%mm1, %%mm7\n\t" \ 157 "por %%mm0, %%mm5\n\t" /* and merge new & old vals */ \ 158 "por %%mm2, %%mm7\n\t" 159 160 #define RESET_CHROMA "por "_UVMask", %%mm7\n\t" 161 162 #endif 163 164 165