| CODENOTIFIER | Help | You are not signed in | Sign in |
Project: Theora
Revision: 15057
Author: xiphmont
Date: 22 Jun 2008 17:07:32
Changes: Eliminate use of SSE instructions in the loop filter MMX code by
replacing the current loop filter with the one from Derf's new decoder. This
required some amount of code refactoring, as Derf's code expects slightly
different input.
| ... | ...@@ -22,156 +22,381 @@ | |
| 22 | 22 | #if defined(USE_ASM) |
| 23 | 23 | |
| 24 | static const __attribute__((aligned(8),used)) ogg_int64_t V3= 0x0003000300030003LL; | |
| 25 | static const __attribute__((aligned(8),used)) ogg_int64_t V804= 0x0804080408040804LL; | |
| 24 | static const __attribute__((aligned(8),used)) ogg_int64_t OC_V3= | |
| 25 | 0x0003000300030003LL; | |
| 26 | static const __attribute__((aligned(8),used)) ogg_int64_t OC_V4= | |
| 27 | 0x0004000400040004LL; | |
| 26 | 28 | |
| 27 | #define OC_LOOP_H_4x4 \ | |
| 28 | "lea (%[ll],%[ll],2),%[s]\n" /* esi = ystride*3 */ \ | |
| 29 | "movd (%[pp]), %%mm0\n" /* 0 0 0 0 3 2 1 0 */ \ | |
| 30 | "movd (%[pp],%[ll]),%%mm1\n" /* 0 0 0 0 7 6 5 4 */ \ | |
| 31 | "movd (%[pp],%[ll],2),%%mm2\n" /* 0 0 0 0 b a 9 8 */ \ | |
| 32 | "movd (%[pp],%[s]),%%mm3\n" /* 0 0 0 0 f e d c */ \ | |
| 33 | "punpcklbw %%mm1,%%mm0\n" /* mm0 = 7 3 6 2 5 1 4 0 */ \ | |
| 34 | "punpcklbw %%mm3,%%mm2\n" /* mm2 = f b e a d 9 c 8 */ \ | |
| 35 | "movq %%mm0,%%mm1\n" /* mm1 = 7 3 6 2 5 1 4 0 */ \ | |
| 36 | "punpcklwd %%mm2,%%mm1\n" /* mm1 = d 9 5 1 c 8 4 0 */ \ | |
| 37 | "punpckhwd %%mm2,%%mm0\n" /* mm0 = f b 7 3 e a 6 2 */ \ | |
| 38 | "pxor %%mm7,%%mm7\n" \ | |
| 39 | "movq %%mm1,%%mm5\n" /* mm5 = d 9 5 1 c 8 4 0 */ \ | |
| 40 | "punpckhbw %%mm7,%%mm5\n" /* mm5 = 0 d 0 9 0 5 0 1 = pix[1]*/ \ | |
| 41 | "punpcklbw %%mm7,%%mm1\n" /* mm1 = 0 c 0 8 0 4 0 0 = pix[0]*/ \ | |
| 42 | "movq %%mm0,%%mm3\n" /* mm3 = f b 7 3 e a 6 2 */ \ | |
| 43 | "punpckhbw %%mm7,%%mm3\n" /* mm3 = 0 f 0 b 0 7 0 3 = pix[3]*/ \ | |
| 44 | "punpcklbw %%mm7,%%mm0\n" /* mm0 = 0 e 0 a 0 6 0 2 = pix[2]*/ \ | |
| 45 | \ | |
| 46 | "psubw %%mm3,%%mm1\n" /* mm1 = pix[0]-pix[3] mm1 - mm3 */ \ | |
| 47 | "movq %%mm0,%%mm7\n" /* mm7 = pix[2]*/ \ | |
| 48 | "psubw %%mm5,%%mm0\n" /* mm0 = pix[2]-pix[1] mm0 - mm5*/ \ | |
| 49 | "pmullw %[V3],%%mm0\n" /* *3 */ \ | |
| 50 | "paddw %%mm0,%%mm1\n" /* mm1 has f[0] ... f[4]*/ \ | |
| 51 | "paddw %[V804],%%mm1\n" /* add 4 */ /* add 256 after shift */ \ | |
| 52 | "psraw $3,%%mm1\n" /* >>3 */ \ | |
| 53 | " pextrw $0,%%mm1,%[s]\n" /* In MM1 we have 4 f coefs (16bits) */ \ | |
| 54 | " pextrw $1,%%mm1,%[d]\n" /* now perform MM4 = *(_bv+ f) */ \ | |
| 55 | " pinsrw $0,(%[bound],%[s],2),%%mm4\n" \ | |
| 56 | " pextrw $2,%%mm1,%[s]\n" \ | |
| 57 | " pinsrw $1,(%[bound],%[d],2),%%mm4\n" \ | |
| 58 | " pextrw $3,%%mm1,%[d]\n" \ | |
| 59 | " pinsrw $2,(%[bound],%[s],2),%%mm4\n" \ | |
| 60 | " pinsrw $3,(%[bound],%[d],2),%%mm4\n" /* new f vals loaded */ \ | |
| 61 | "pxor %%mm0,%%mm0\n" \ | |
| 62 | " paddw %%mm4,%%mm5\n" /*(pix[1]+f);*/ \ | |
| 63 | " psubw %%mm4,%%mm7\n" /* (pix[2]-f); */ \ | |
| 64 | " packuswb %%mm0,%%mm5\n" /* mm5 = x x x x newpix1 */ \ | |
| 65 | " packuswb %%mm0,%%mm7\n" /* mm7 = x x x x newpix2 */ \ | |
| 66 | " punpcklbw %%mm7,%%mm5\n" /* 2 1 2 1 2 1 2 1 */ \ | |
| 67 | " movd %%mm5,%[d]\n" /* edi = newpix21 */ \ | |
| 68 | " movw %[d],1(%[pp])\n" \ | |
| 69 | " psrlq $32,%%mm5\n" /* why is so big stall here ? */ \ | |
| 70 | " shr $16,%[d]\n" \ | |
| 71 | " movw %[d],1(%[pp],%[ll],1)\n" \ | |
| 72 | " movd %%mm5,%[d]\n" /* eax = newpix21 high part */ \ | |
| 73 | " lea (%[ll],%[ll],2),%[s]\n" \ | |
| 74 | " movw %[d],1(%[pp],%[ll],2)\n" \ | |
| 75 | " shr $16,%[d]\n" \ | |
| 76 | " movw %[d],1(%[pp],%[s])\n" | |
| 77 | ||
| 78 | static void FilterHoriz__mmx(unsigned char * PixelPtr, | |
| 79 | ogg_int32_t LineLength, | |
| 80 | ogg_int16_t *BoundingValuePtr){ | |
| 29 | static void loop_filter_v(unsigned char *_pix,int _ystride, | |
| 30 | const ogg_int16_t *_ll){ | |
| 81 | 31 | long esi; |
| 82 | long edi; | |
| 32 | _pix-=_ystride*2; | |
| 83 | 33 | __asm__ __volatile__( |
| 84 | OC_LOOP_H_4x4 | |
| 85 | : [s]"=&r"(esi),[d]"=&r"(edi) | |
| 86 | : [pp]"r"(PixelPtr), [ll]"r"((long)LineLength), [bound]"r"(BoundingValuePtr-256), [V3]"m"(V3), [V804]"m"(V804) | |
| 87 | : "memory" | |
| 34 | /*mm0=0*/ | |
| 35 | "pxor %%mm0,%%mm0\n\t" | |
| 36 | /*esi=_ystride*3*/ | |
| 37 | "lea (%[ystride],%[ystride],2),%[s]\n\t" | |
| 38 | /*mm7=_pix[0...8]*/ | |
| 39 | "movq (%[pix]),%%mm7\n\t" | |
| 40 | /*mm4=_pix[0...8+_ystride*3]*/ | |
| 41 | "movq (%[pix],%[s]),%%mm4\n\t" | |
| 42 | /*mm6=_pix[0...8]*/ | |
| 43 | "movq %%mm7,%%mm6\n\t" | |
| 44 | /*Expand unsigned _pix[0...3] to 16 bits.*/ | |
| 45 | "punpcklbw %%mm0,%%mm6\n\t" | |
| 46 | "movq %%mm4,%%mm5\n\t" | |
| 47 | /*Expand unsigned _pix[4...8] to 16 bits.*/ | |
| 48 | "punpckhbw %%mm0,%%mm7\n\t" | |
| 49 | /*Expand other arrays too.*/ | |
| 50 | "punpcklbw %%mm0,%%mm4\n\t" | |
| 51 | "punpckhbw %%mm0,%%mm5\n\t" | |
| 52 | /*mm7:mm6=_p[0...8]-_p[0...8+_ystride*3]:*/ | |
| 53 | "psubw %%mm4,%%mm6\n\t" | |
| 54 | "psubw %%mm5,%%mm7\n\t" | |
| 55 | /*mm5=mm4=_pix[0...8+_ystride]*/ | |
| 56 | "movq (%[pix],%[ystride]),%%mm4\n\t" | |
| 57 | /*mm1=mm3=mm2=_pix[0...8+_ystride*2]*/ | |
| 58 | "movq (%[pix],%[ystride],2),%%mm2\n\t" | |
| 59 | "movq %%mm4,%%mm5\n\t" | |
| 60 | "movq %%mm2,%%mm3\n\t" | |
| 61 | "movq %%mm2,%%mm1\n\t" | |
| 62 | /*Expand these arrays.*/ | |
| 63 | "punpckhbw %%mm0,%%mm5\n\t" | |
| 64 | "punpcklbw %%mm0,%%mm4\n\t" | |
| 65 | "punpckhbw %%mm0,%%mm3\n\t" | |
| 66 | "punpcklbw %%mm0,%%mm2\n\t" | |
| 67 | /*Preload...*/ | |
| 68 | "movq %[OC_V3],%%mm0\n\t" | |
| 69 | /*mm3:mm2=_pix[0...8+_ystride*2]-_pix[0...8+_ystride]*/ | |
| 70 | "psubw %%mm5,%%mm3\n\t" | |
| 71 | "psubw %%mm4,%%mm2\n\t" | |
| 72 | /*Scale by 3.*/ | |
| 73 | "pmullw %%mm0,%%mm3\n\t" | |
| 74 | "pmullw %%mm0,%%mm2\n\t" | |
| 75 | /*Preload...*/ | |
| 76 | "movq %[OC_V4],%%mm0\n\t" | |
| 77 | /*f=mm3:mm2==_pix[0...8]-_pix[0...8+_ystride*3]+ | |
| 78 | 3*(_pix[0...8+_ystride*2]-_pix[0...8+_ystride])*/ | |
| 79 | "paddw %%mm7,%%mm3\n\t" | |
| 80 | "paddw %%mm6,%%mm2\n\t" | |
| 81 | /*Add 4.*/ | |
| 82 | "paddw %%mm0,%%mm3\n\t" | |
| 83 | "paddw %%mm0,%%mm2\n\t" | |
| 84 | /*"Divide" by 8.*/ | |
| 85 | "psraw $3,%%mm3\n\t" | |
| 86 | "psraw $3,%%mm2\n\t" | |
| 87 | /*Now compute lflim of mm3:mm2 cf. Section 7.10 of the spec.*/ | |
| 88 | /*Free up mm5.*/ | |
| 89 | "packuswb %%mm5,%%mm4\n\t" | |
| 90 | /*mm0=L L L L*/ | |
| 91 | "movq (%[ll]),%%mm0\n\t" | |
| 92 | /*if(R_i<-2L||R_i>2L)R_i=0:*/ | |
| 93 | "movq %%mm2,%%mm5\n\t" | |
| 94 | "pxor %%mm6,%%mm6\n\t" | |
| 95 | "movq %%mm0,%%mm7\n\t" | |
| 96 | "psubw %%mm0,%%mm6\n\t" | |
| 97 | "psllw $1,%%mm7\n\t" | |
| 98 | "psllw $1,%%mm6\n\t" | |
| 99 | /*mm2==R_3 R_2 R_1 R_0*/ | |
| 100 | /*mm5==R_3 R_2 R_1 R_0*/ | |
| 101 | /*mm6==-2L -2L -2L -2L*/ | |
| 102 | /*mm7==2L 2L 2L 2L*/ | |
| 103 | "pcmpgtw %%mm2,%%mm7\n\t" | |
| 104 | "pcmpgtw %%mm6,%%mm5\n\t" | |
| 105 | "pand %%mm7,%%mm2\n\t" | |
| 106 | "movq %%mm0,%%mm7\n\t" | |
| 107 | "pand %%mm5,%%mm2\n\t" | |
| 108 | "psllw $1,%%mm7\n\t" | |
| 109 | "movq %%mm3,%%mm5\n\t" | |
| 110 | /*mm3==R_7 R_6 R_5 R_4*/ | |
| 111 | /*mm5==R_7 R_6 R_5 R_4*/ | |
| 112 | /*mm6==-2L -2L -2L -2L*/ | |
| 113 | /*mm7==2L 2L 2L 2L*/ | |
| 114 | "pcmpgtw %%mm3,%%mm7\n\t" | |
| 115 | "pcmpgtw %%mm6,%%mm5\n\t" | |
| 116 | "pand %%mm7,%%mm3\n\t" | |
| 117 | "movq %%mm0,%%mm7\n\t" | |
| 118 | "pand %%mm5,%%mm3\n\t" | |
| 119 | /*if(R_i<-L)R_i'=R_i+2L; | |
| 120 | if(R_i>L)R_i'=R_i-2L; | |
| 121 | if(R_i<-L||R_i>L)R_i=-R_i':*/ | |
| 122 | "psraw $1,%%mm6\n\t" | |
| 123 | "movq %%mm2,%%mm5\n\t" | |
| 124 | "psllw $1,%%mm7\n\t" | |
| 125 | /*mm2==R_3 R_2 R_1 R_0*/ | |
| 126 | /*mm5==R_3 R_2 R_1 R_0*/ | |
| 127 | /*mm6==-L -L -L -L*/ | |
| 128 | /*mm0==L L L L*/ | |
| 129 | /*mm5=R_i>L?FF:00*/ | |
| 130 | "pcmpgtw %%mm0,%%mm5\n\t" | |
| 131 | /*mm6=-L>R_i?FF:00*/ | |
| 132 | "pcmpgtw %%mm2,%%mm6\n\t" | |
| 133 | /*mm7=R_i>L?2L:0*/ | |
| 134 | "pand %%mm5,%%mm7\n\t" | |
| 135 | /*mm2=R_i>L?R_i-2L:R_i*/ | |
| 136 | "psubw %%mm7,%%mm2\n\t" | |
| 137 | "movq %%mm0,%%mm7\n\t" | |
| 138 | /*mm5=-L>R_i||R_i>L*/ | |
| 139 | "por %%mm6,%%mm5\n\t" | |
| 140 | "psllw $1,%%mm7\n\t" | |
| 141 | /*mm7=-L>R_i?2L:0*/ | |
| 142 | "pand %%mm6,%%mm7\n\t" | |
| 143 | "pxor %%mm6,%%mm6\n\t" | |
| 144 | /*mm2=-L>R_i?R_i+2L:R_i*/ | |
| 145 | "paddw %%mm7,%%mm2\n\t" | |
| 146 | "psubw %%mm0,%%mm6\n\t" | |
| 147 | /*mm5=-L>R_i||R_i>L?-R_i':0*/ | |
| 148 | "pand %%mm2,%%mm5\n\t" | |
| 149 | "movq %%mm0,%%mm7\n\t" | |
| 150 | /*mm2=-L>R_i||R_i>L?0:R_i*/ | |
| 151 | "psubw %%mm5,%%mm2\n\t" | |
| 152 | "psllw $1,%%mm7\n\t" | |
| 153 | /*mm2=-L>R_i||R_i>L?-R_i':R_i*/ | |
| 154 | "psubw %%mm5,%%mm2\n\t" | |
| 155 | "movq %%mm3,%%mm5\n\t" | |
| 156 | /*mm3==R_7 R_6 R_5 R_4*/ | |
| 157 | /*mm5==R_7 R_6 R_5 R_4*/ | |
| 158 | /*mm6==-L -L -L -L*/ | |
| 159 | /*mm0==L L L L*/ | |
| 160 | /*mm6=-L>R_i?FF:00*/ | |
| 161 | "pcmpgtw %%mm3,%%mm6\n\t" | |
| 162 | /*mm5=R_i>L?FF:00*/ | |
| 163 | "pcmpgtw %%mm0,%%mm5\n\t" | |
| 164 | /*mm7=R_i>L?2L:0*/ | |
| 165 | "pand %%mm5,%%mm7\n\t" | |
| 166 | /*mm3=R_i>L?R_i-2L:R_i*/ | |
| 167 | "psubw %%mm7,%%mm3\n\t" | |
| 168 | "psllw $1,%%mm0\n\t" | |
| 169 | /*mm5=-L>R_i||R_i>L*/ | |
| 170 | "por %%mm6,%%mm5\n\t" | |
| 171 | /*mm0=-L>R_i?2L:0*/ | |
| 172 | "pand %%mm6,%%mm0\n\t" | |
| 173 | /*mm3=-L>R_i?R_i+2L:R_i*/ | |
| 174 | "paddw %%mm0,%%mm3\n\t" | |
| 175 | /*mm5=-L>R_i||R_i>L?-R_i':0*/ | |
| 176 | "pand %%mm3,%%mm5\n\t" | |
| 177 | /*mm3=-L>R_i||R_i>L?0:R_i*/ | |
| 178 | "psubw %%mm5,%%mm3\n\t" | |
| 179 | /*mm3=-L>R_i||R_i>L?-R_i':R_i*/ | |
| 180 | "psubw %%mm5,%%mm3\n\t" | |
| 181 | /*Unfortunately, there's no unsigned byte+signed byte with unsigned | |
| 182 | saturation op code, so we have to promote things back 16 bits.*/ | |
| 183 | "pxor %%mm0,%%mm0\n\t" | |
| 184 | "movq %%mm4,%%mm5\n\t" | |
| 185 | "punpcklbw %%mm0,%%mm4\n\t" | |
| 186 | "punpckhbw %%mm0,%%mm5\n\t" | |
| 187 | "movq %%mm1,%%mm6\n\t" | |
| 188 | "punpcklbw %%mm0,%%mm1\n\t" | |
| 189 | "punpckhbw %%mm0,%%mm6\n\t" | |
| 190 | /*_pix[0...8+_ystride]+=R_i*/ | |
| 191 | "paddw %%mm2,%%mm4\n\t" | |
| 192 | "paddw %%mm3,%%mm5\n\t" | |
| 193 | /*_pix[0...8+_ystride*2]-=R_i*/ | |
| 194 | "psubw %%mm2,%%mm1\n\t" | |
| 195 | "psubw %%mm3,%%mm6\n\t" | |
| 196 | "packuswb %%mm5,%%mm4\n\t" | |
| 197 | "packuswb %%mm6,%%mm1\n\t" | |
| 198 | /*Write it back out.*/ | |
| 199 | "movq %%mm4,(%[pix],%[ystride])\n\t" | |
| 200 | "movq %%mm1,(%[pix],%[ystride],2)\n\t" | |
| 201 | :[s]"=&S"(esi) | |
| 202 | :[pix]"r"(_pix),[ystride]"r"((long)_ystride),[ll]"r"(_ll), | |
| 203 | [OC_V3]"m"(OC_V3),[OC_V4]"m"(OC_V4) | |
| 204 | :"memory" | |
| 88 | 205 | ); |
| 206 | } | |
| 89 | 207 | |
| 90 | PixelPtr += LineLength*4; | |
| 91 | ||
| 208 | /*This code implements the bulk of loop_filter_h(). | |
| 209 | Data are striped p0 p1 p2 p3 ... p0 p1 p2 p3 ..., so in order to load all | |
| 210 | four p0's to one register we must transpose the values in four mmx regs. | |
| 211 | When half is done we repeat this for the rest.*/ | |
| 212 | static void loop_filter_h4(unsigned char *_pix,long _ystride, | |
| 213 | const ogg_int16_t *_ll){ | |
| 214 | long esi; | |
| 215 | long edi; | |
| 92 | 216 | __asm__ __volatile__( |
| 93 | OC_LOOP_H_4x4 | |
| 94 | "emms\n" | |
| 95 | : [s]"=&r"(esi),[d]"=&r"(edi) \ | |
| 96 | : [pp]"r"(PixelPtr), [ll]"r"((long)LineLength), [bound]"r"(BoundingValuePtr-256), [V3]"m"(V3), [V804]"m"(V804) \ | |
| 97 | : "memory" \ | |
| 98 | ); | |
| 217 | /*x x x x 3 2 1 0*/ | |
| 218 | "movd (%[pix]),%%mm0\n\t" | |
| 219 | /*esi=_ystride*3*/ | |
| 220 | "lea (%[ystride],%[ystride],2),%[s]\n\t" | |
| 221 | /*x x x x 7 6 5 4*/ | |
| 222 | "movd (%[pix],%[ystride]),%%mm1\n\t" | |
| 223 | /*x x x x B A 9 8*/ | |
| 224 | "movd (%[pix],%[ystride],2),%%mm2\n\t" | |
| 225 | /*x x x x F E D C*/ | |
| 226 | "movd (%[pix],%[s]),%%mm3\n\t" | |
| 227 | /*mm0=7 3 6 2 5 1 4 0*/ | |
| 228 | "punpcklbw %%mm1,%%mm0\n\t" | |
| 229 | /*mm2=F B E A D 9 C 8*/ | |
| 230 | "punpcklbw %%mm3,%%mm2\n\t" | |
| 231 | /*mm1=7 3 6 2 5 1 4 0*/ | |
| 232 | "movq %%mm0,%%mm1\n\t" | |
| 233 | /*mm0=F B 7 3 E A 6 2*/ | |
| 234 | "punpckhwd %%mm2,%%mm0\n\t" | |
| 235 | /*mm1=D 9 5 1 C 8 4 0*/ | |
| 236 | "punpcklwd %%mm2,%%mm1\n\t" | |
| 237 | "pxor %%mm7,%%mm7\n\t" | |
| 238 | /*mm5=D 9 5 1 C 8 4 0*/ | |
| 239 | "movq %%mm1,%%mm5\n\t" | |
| 240 | /*mm1=x C x 8 x 4 x 0==pix[0]*/ | |
| 241 | "punpcklbw %%mm7,%%mm1\n\t" | |
| 242 | /*mm5=x D x 9 x 5 x 1==pix[1]*/ | |
| 243 | "punpckhbw %%mm7,%%mm5\n\t" | |
| 244 | /*mm3=F B 7 3 E A 6 2*/ | |
| 245 | "movq %%mm0,%%mm3\n\t" | |
| 246 | /*mm0=x E x A x 6 x 2==pix[2]*/ | |
| 247 | "punpcklbw %%mm7,%%mm0\n\t" | |
| 248 | /*mm3=x F x B x 7 x 3==pix[3]*/ | |
| 249 | "punpckhbw %%mm7,%%mm3\n\t" | |
| 250 | /*mm1=mm1-mm3==pix[0]-pix[3]*/ | |
| 251 | "psubw %%mm3,%%mm1\n\t" | |
| 252 | /*Save a copy of pix[2] for later.*/ | |
| 253 | "movq %%mm0,%%mm4\n\t" | |
| 254 | /*mm0=mm0-mm5==pix[2]-pix[1]*/ | |
| 255 | "psubw %%mm5,%%mm0\n\t" | |
| 256 | /*Scale by 3.*/ | |
| 257 | "pmullw %[OC_V3],%%mm0\n\t" | |
| 258 | /*f=mm0==_pix[0]-_pix[3]+ 3*(_pix[2]-_pix[1])*/ | |
| 259 | "paddw %%mm1,%%mm0\n\t" | |
| 260 | /*Add 4.*/ | |
| 261 | "paddw %[OC_V4],%%mm0\n\t" | |
| 262 | /*"Divide" by 8, producing the residuals R_i.*/ | |
| 263 | "psraw $3,%%mm0\n\t" | |
| 264 | /*Now compute lflim of mm0 cf. Section 7.10 of the spec.*/ | |
| 265 | /*mm6=L L L L*/ | |
| 266 | "movq (%[ll]),%%mm6\n\t" | |
| 267 | /*if(R_i<-2L||R_i>2L)R_i=0:*/ | |
| 268 | "movq %%mm0,%%mm1\n\t" | |
| 269 | "pxor %%mm2,%%mm2\n\t" | |
| 270 | "movq %%mm6,%%mm3\n\t" | |
| 271 | "psubw %%mm6,%%mm2\n\t" | |
| 272 | "psllw $1,%%mm3\n\t" | |
| 273 | "psllw $1,%%mm2\n\t" | |
| 274 | /*mm0==R_3 R_2 R_1 R_0*/ | |
| 275 | /*mm1==R_3 R_2 R_1 R_0*/ | |
| 276 | /*mm2==-2L -2L -2L -2L*/ | |
| 277 | /*mm3==2L 2L 2L 2L*/ | |
| 278 | "pcmpgtw %%mm0,%%mm3\n\t" | |
| 279 | "pcmpgtw %%mm2,%%mm1\n\t" | |
| 280 | "pand %%mm3,%%mm0\n\t" | |
| 281 | "pand %%mm1,%%mm0\n\t" | |
| 282 | /*if(R_i<-L)R_i'=R_i+2L; | |
| 283 | if(R_i>L)R_i'=R_i-2L; | |
| 284 | if(R_i<-L||R_i>L)R_i=-R_i':*/ | |
| 285 | "psraw $1,%%mm2\n\t" | |
| 286 | "movq %%mm0,%%mm1\n\t" | |
| 287 | "movq %%mm6,%%mm3\n\t" | |
| 288 | /*mm0==R_3 R_2 R_1 R_0*/ | |
| 289 | /*mm1==R_3 R_2 R_1 R_0*/ | |
| 290 | /*mm2==-L -L -L -L*/ | |
| 291 | /*mm6==L L L L*/ | |
| 292 | /*mm2=-L>R_i?FF:00*/ | |
| 293 | "pcmpgtw %%mm0,%%mm2\n\t" | |
| 294 | /*mm1=R_i>L?FF:00*/ | |
| 295 | "pcmpgtw %%mm6,%%mm1\n\t" | |
| 296 | /*mm3=2L 2L 2L 2L*/ | |
| 297 | "psllw $1,%%mm3\n\t" | |
| 298 | /*mm6=2L 2L 2L 2L*/ | |
| 299 | "psllw $1,%%mm6\n\t" | |
| 300 | /*mm3=R_i>L?2L:0*/ | |
| 301 | "pand %%mm1,%%mm3\n\t" | |
| 302 | /*mm6=-L>R_i?2L:0*/ | |
| 303 | "pand %%mm2,%%mm6\n\t" | |
| 304 | /*mm0=R_i>L?R_i-2L:R_i*/ | |
| 305 | "psubw %%mm3,%%mm0\n\t" | |
| 306 | /*mm1=-L>R_i||R_i>L*/ | |
| 307 | "por %%mm2,%%mm1\n\t" | |
| 308 | /*mm0=-L>R_i?R_i+2L:R_i*/ | |
| 309 | "paddw %%mm6,%%mm0\n\t" | |
| 310 | /*mm1=-L>R_i||R_i>L?R_i':0*/ | |
| 311 | "pand %%mm0,%%mm1\n\t" | |
| 312 | /*mm0=-L>R_i||R_i>L?0:R_i*/ | |
| 313 | "psubw %%mm1,%%mm0\n\t" | |
| 314 | /*mm0=-L>R_i||R_i>L?-R_i':R_i*/ | |
| 315 | "psubw %%mm1,%%mm0\n\t" | |
| 316 | /*_pix[1]+=R_i;*/ | |
| 317 | "paddw %%mm0,%%mm5\n\t" | |
| 318 | /*_pix[2]-=R_i;*/ | |
| 319 | "psubw %%mm0,%%mm4\n\t" | |
| 320 | /*mm5=x x x x D 9 5 1*/ | |
| 321 | "packuswb %%mm7,%%mm5\n\t" | |
| 322 | /*mm4=x x x x E A 6 2*/ | |
| 323 | "packuswb %%mm7,%%mm4\n\t" | |
| 324 | /*mm5=E D A 9 6 5 2 1*/ | |
| 325 | "punpcklbw %%mm4,%%mm5\n\t" | |
| 326 | /*edi=6 5 2 1*/ | |
| 327 | "movd %%mm5,%%edi\n\t" | |
| 328 | "movw %%di,1(%[pix])\n\t" | |
| 329 | /*Why is there such a big stall here?*/ | |
| 330 | "psrlq $32,%%mm5\n\t" | |
| 331 | "shrl $16,%%edi\n\t" | |
| 332 | "movw %%di,1(%[pix],%[ystride])\n\t" | |
| 333 | /*edi=E D A 9*/ | |
| 334 | "movd %%mm5,%%edi\n\t" | |
| 335 | "movw %%di,1(%[pix],%[ystride],2)\n\t" | |
| 336 | "shrl $16,%%edi\n\t" | |
| 337 | "movw %%di,1(%[pix],%[s])\n\t" | |
| 338 | :[s]"=&S"(esi),[d]"=&D"(edi), | |
| 339 | [pix]"+r"(_pix),[ystride]"+r"(_ystride),[ll]"+r"(_ll) | |
| 340 | :[OC_V3]"m"(OC_V3),[OC_V4]"m"(OC_V4) | |
| 341 | :"memory" | |
| 342 | ); | |
| 99 | 343 | } |
| 100 | 344 | |
| 101 | static void FilterVert__mmx(unsigned char * PixelPtr, | |
| 102 | ogg_int32_t LineLength, | |
| 103 | ogg_int16_t *BoundingValuePtr){ | |
| 104 | long esi,edi; | |
| 105 | __asm__ __volatile__( | |
| 106 | "pxor %%mm0,%%mm0\n" /* mm0 = 0 */ | |
| 107 | "movq (%[pp]),%%mm7\n" /* mm7 = pix[0..7] */ | |
| 108 | "lea (%[ll],%[ll],2),%[s]\n" /* esi = ystride*3 */ | |
| 109 | "movq (%[pp],%[s]),%%mm4\n" /* mm4 = pix[0..7+ystride*3] */ | |
| 110 | "movq %%mm7,%%mm6\n" /* mm6 = pix[0..7] */ | |
| 111 | "punpcklbw %%mm0,%%mm6\n" /* expand unsigned pix[0..3] to 16 bits */ | |
| 112 | "movq %%mm4,%%mm5\n" | |
| 113 | "punpckhbw %%mm0,%%mm7\n" /* expand unsigned pix[4..7] to 16 bits */ | |
| 114 | "punpcklbw %%mm0,%%mm4\n" /* expand other arrays too */ | |
| 115 | "punpckhbw %%mm0,%%mm5\n" | |
| 116 | "psubw %%mm4,%%mm6\n" /* mm6 = mm6 - mm4 */ | |
| 117 | "psubw %%mm5,%%mm7\n" /* mm7 = mm7 - mm5 */ | |
| 118 | /* mm7:mm6 = _p[0]-_p[ystride*3] */ | |
| 119 | "movq (%[pp],%[ll]),%%mm4\n" /* mm4 = pix[0..7+ystride] */ | |
| 120 | "movq %%mm4,%%mm5\n" | |
| 121 | "movq (%[pp],%[ll],2),%%mm2\n" /* mm2 = pix[0..7+ystride*2] */ | |
| 122 | "movq %%mm2,%%mm3\n" | |
| 123 | "movq %%mm2,%%mm1\n" //ystride*2 | |
| 124 | "punpckhbw %%mm0,%%mm5\n" | |
| 125 | "punpcklbw %%mm0,%%mm4\n" | |
| 126 | "punpckhbw %%mm0,%%mm3\n" | |
| 127 | "punpcklbw %%mm0,%%mm2\n" | |
| 128 | "psubw %%mm5,%%mm3\n" | |
| 129 | "psubw %%mm4,%%mm2\n" | |
| 130 | /* mm3:mm2 = (pix[ystride*2]-pix[ystride]); */ | |
| 131 | "pmullw %[V3],%%mm3\n" /* *3 */ | |
| 132 | "pmullw %[V3],%%mm2\n" /* *3 */ | |
| 133 | "paddw %%mm7,%%mm3\n" /* highpart */ | |
| 134 | "paddw %%mm6,%%mm2\n" /* lowpart of pix[0]-pix[ystride*3]+3*(pix[ystride*2]-pix[ystride]); */ | |
| 135 | "paddw %[V804],%%mm3\n" /* add 4 */ /* add 256 after shift */ | |
| 136 | "paddw %[V804],%%mm2\n" /* add 4 */ /* add 256 after shift */ | |
| 137 | "psraw $3,%%mm3\n" /* >>3 f coefs high */ | |
| 138 | "psraw $3,%%mm2\n" /* >>3 f coefs low */ | |
| 139 | ||
| 140 | " pextrw $0,%%mm2,%[s]\n" /* In MM3:MM2 we have f coefs (16bits) */ | |
| 141 | " pextrw $1,%%mm2,%[d]\n" /* now perform MM7:MM6 = *(_bv+ f) */ | |
| 142 | " pinsrw $0,(%[bound],%[s],2),%%mm6\n" | |
| 143 | " pinsrw $1,(%[bound],%[d],2),%%mm6\n" | |
| 144 | ||
| 145 | " pextrw $2,%%mm2,%[s]\n" | |
| 146 | " pextrw $3,%%mm2,%[d]\n" | |
| 147 | " pinsrw $2,(%[bound],%[s],2),%%mm6\n" | |
| 148 | " pinsrw $3,(%[bound],%[d],2),%%mm6\n" | |
| 149 | ||
| 150 | " pextrw $0,%%mm3,%[s]\n" | |
| 151 | " pextrw $1,%%mm3,%[d]\n" | |
| 152 | " pinsrw $0,(%[bound],%[s],2),%%mm7\n" | |
| 153 | " pinsrw $1,(%[bound],%[d],2),%%mm7\n" | |
| 154 | ||
| 155 | " pextrw $2,%%mm3,%[s]\n" | |
| 156 | " pextrw $3,%%mm3,%[d]\n" | |
| 157 | " pinsrw $2,(%[bound],%[s],2),%%mm7\n" | |
| 158 | " pinsrw $3,(%[bound],%[d],2),%%mm7\n" //MM7 MM6 f=*(_bv+(f+4>>3)); | |
| 159 | ||
| 160 | "paddw %%mm6,%%mm4\n" /* (pix[ystride]+f); */ | |
| 161 | "paddw %%mm7,%%mm5\n" /* (pix[ystride]+f); */ | |
| 162 | "movq %%mm1,%%mm2\n" | |
| 163 | "punpcklbw %%mm0,%%mm1\n" | |
| 164 | "punpckhbw %%mm0,%%mm2\n" //[ystride*2] | |
| 165 | "psubw %%mm6,%%mm1\n" /* (pix[ystride*2]-f); */ | |
| 166 | "psubw %%mm7,%%mm2\n" /* (pix[ystride*2]-f); */ | |
| 167 | "packuswb %%mm2,%%mm1\n" | |
| 168 | "packuswb %%mm5,%%mm4\n" | |
| 169 | "movq %%mm1,(%[pp],%[ll],2)\n" /* pix[ystride*2]= */ | |
| 170 | "movq %%mm4,(%[pp],%[ll])\n" /* pix[ystride]= */ | |
| 171 | "emms\n" | |
| 172 | : [s]"=&r"(esi),[d]"=&r"(edi) | |
| 173 | : [pp]"r"(PixelPtr-2*LineLength), [ll]"r"((long)LineLength), [bound]"r"(BoundingValuePtr-256), [V3]"m"(V3), [V804]"m"(V804) | |
| 174 | : "memory" | |
| 175 | ); | |
| 345 | static void loop_filter_h(unsigned char *_pix,int _ystride, | |
| 346 | const ogg_int16_t *_ll){ | |
| 347 | _pix-=2; | |
| 348 | loop_filter_h4(_pix,_ystride,_ll); | |
| 349 | loop_filter_h4(_pix+(_ystride<<2),_ystride,_ll); | |
| 350 | } | |
| 351 | ||
| 352 | static void loop_filter_mmx(PB_INSTANCE *pbi, int FLimit){ | |
| 353 | int j; | |
| 354 | ogg_int16_t __attribute__((aligned(8))) ll[4]; | |
| 355 | unsigned char *cp = pbi->display_fragments; | |
| 356 | ogg_uint32_t *bp = pbi->recon_pixel_index_table; | |
| 357 | ||
| 358 | if ( FLimit == 0 ) return; | |
| 359 | ll[0]=ll[1]=ll[2]=ll[3]=FLimit; | |
| 360 | ||
| 361 | for ( j = 0; j < 3 ; j++){ | |
| 362 | ogg_uint32_t *bp_begin = bp; | |
| 363 | ogg_uint32_t *bp_end; | |
| 364 | int stride; | |
| 365 | int h; | |
| 366 | ||
| 367 | switch(j) { | |
| 368 | case 0: /* y */ | |
| 369 | bp_end = bp + pbi->YPlaneFragments; | |
| 370 | h = pbi->HFragments; | |
| 371 | stride = pbi->YStride; | |
| 372 | break; | |
| 373 | default: /* u,v, 4:2:0 specific */ | |
| 374 | bp_end = bp + pbi->UVPlaneFragments; | |
| 375 | h = pbi->HFragments >> 1; | |
| 376 | stride = pbi->UVStride; | |
| 377 | break; | |
| 378 | } | |
| 379 | ||
| 380 | while(bp<bp_end){ | |
| 381 | ogg_uint32_t *bp_left = bp; | |
| 382 | ogg_uint32_t *bp_right = bp + h; | |
| 383 | while(bp<bp_right){ | |
| 384 | if(cp[0]){ | |
| 385 | if(bp>bp_left) | |
| 386 | loop_filter_h(&pbi->LastFrameRecon[bp[0]],stride,ll); | |
| 387 | if(bp_left>bp_begin) | |
| 388 | loop_filter_v(&pbi->LastFrameRecon[bp[0]],stride,ll); | |
| 389 | if(bp+1<bp_right && !cp[1]) | |
| 390 | loop_filter_h(&pbi->LastFrameRecon[bp[0]]+8,stride,ll); | |
| 391 | if(bp+stride<bp_end && !cp[stride]) | |
| 392 | loop_filter_v(&pbi->LastFrameRecon[bp[h]]+8,stride,ll); | |
| 393 | } | |
| 394 | bp++; | |
| 395 | cp++; | |
| 396 | } | |
| 397 | } | |
| 398 | } | |
| 399 | ||
| 400 | __asm__ __volatile__("emms\n\t"); | |
| 176 | 401 | } |
| 177 | 402 | |
| ... | ...@@ -179,6 +404,5 @@ | |
| 179 | 404 | void dsp_mmx_dct_decode_init(DspFunctions *funcs) |
| 180 | 405 | { |
| 181 | funcs->FilterVert = FilterVert__mmx; | |
| 182 | funcs->FilterHoriz = FilterHoriz__mmx; | |
| 406 | funcs->LoopFilter = loop_filter_mmx; | |
| 183 | 407 | } |
| 184 | 408 |
| ... | ...@@ -22,156 +22,381 @@ | |
| 22 | 22 | #if defined(USE_ASM) |
| 23 | 23 | |
| 24 | static const __attribute__((aligned(8),used)) ogg_int64_t V3= 0x0003000300030003LL; | |
| 25 | static const __attribute__((aligned(8),used)) ogg_int64_t V804= 0x0804080408040804LL; | |
| 24 | static const __attribute__((aligned(8),used)) ogg_int64_t OC_V3= | |
| 25 | 0x0003000300030003LL; | |
| 26 | static const __attribute__((aligned(8),used)) ogg_int64_t OC_V4= | |
| 27 | 0x0004000400040004LL; | |
| 26 | 28 | |
| 27 | #define OC_LOOP_H_4x4 \ | |
| 28 | "lea (%[ll],%[ll],2),%[s]\n" /* esi = ystride*3 */ \ | |
| 29 | "movd (%[pp]), %%mm0\n" /* 0 0 0 0 3 2 1 0 */ \ | |
| 30 | "movd (%[pp],%[ll]),%%mm1\n" /* 0 0 0 0 7 6 5 4 */ \ | |
| 31 | "movd (%[pp],%[ll],2),%%mm2\n" /* 0 0 0 0 b a 9 8 */ \ | |
| 32 | "movd (%[pp],%[s]),%%mm3\n" /* 0 0 0 0 f e d c */ \ | |
| 33 | "punpcklbw %%mm1,%%mm0\n" /* mm0 = 7 3 6 2 5 1 4 0 */ \ | |
| 34 | "punpcklbw %%mm3,%%mm2\n" /* mm2 = f b e a d 9 c 8 */ \ | |
| 35 | "movq %%mm0,%%mm1\n" /* mm1 = 7 3 6 2 5 1 4 0 */ \ | |
| 36 | "punpcklwd %%mm2,%%mm1\n" /* mm1 = d 9 5 1 c 8 4 0 */ \ | |
| 37 | "punpckhwd %%mm2,%%mm0\n" /* mm0 = f b 7 3 e a 6 2 */ \ | |
| 38 | "pxor %%mm7,%%mm7\n" \ | |
| 39 | "movq %%mm1,%%mm5\n" /* mm5 = d 9 5 1 c 8 4 0 */ \ | |
| 40 | "punpckhbw %%mm7,%%mm5\n" /* mm5 = 0 d 0 9 0 5 0 1 = pix[1]*/ \ | |
| 41 | "punpcklbw %%mm7,%%mm1\n" /* mm1 = 0 c 0 8 0 4 0 0 = pix[0]*/ \ | |
| 42 | "movq %%mm0,%%mm3\n" /* mm3 = f b 7 3 e a 6 2 */ \ | |
| 43 | "punpckhbw %%mm7,%%mm3\n" /* mm3 = 0 f 0 b 0 7 0 3 = pix[3]*/ \ | |
| 44 | "punpcklbw %%mm7,%%mm0\n" /* mm0 = 0 e 0 a 0 6 0 2 = pix[2]*/ \ | |
| 45 | \ | |
| 46 | "psubw %%mm3,%%mm1\n" /* mm1 = pix[0]-pix[3] mm1 - mm3 */ \ | |
| 47 | "movq %%mm0,%%mm7\n" /* mm7 = pix[2]*/ \ | |
| 48 | "psubw %%mm5,%%mm0\n" /* mm0 = pix[2]-pix[1] mm0 - mm5*/ \ | |
| 49 | "pmullw %[V3],%%mm0\n" /* *3 */ \ | |
| 50 | "paddw %%mm0,%%mm1\n" /* mm1 has f[0] ... f[4]*/ \ | |
| 51 | "paddw %[V804],%%mm1\n" /* add 4 */ /* add 256 after shift */ \ | |
| 52 | "psraw $3,%%mm1\n" /* >>3 */ \ | |
| 53 | " pextrw $0,%%mm1,%[s]\n" /* In MM1 we have 4 f coefs (16bits) */ \ | |
| 54 | " pextrw $1,%%mm1,%[d]\n" /* now perform MM4 = *(_bv+ f) */ \ | |
| 55 | " pinsrw $0,(%[bound],%[s],2),%%mm4\n" \ | |
| 56 | " pextrw $2,%%mm1,%[s]\n" \ | |
| 57 | " pinsrw $1,(%[bound],%[d],2),%%mm4\n" \ | |
| 58 | " pextrw $3,%%mm1,%[d]\n" \ | |
| 59 | " pinsrw $2,(%[bound],%[s],2),%%mm4\n" \ | |
| 60 | " pinsrw $3,(%[bound],%[d],2),%%mm4\n" /* new f vals loaded */ \ | |
| 61 | "pxor %%mm0,%%mm0\n" \ | |
| 62 | " paddw %%mm4,%%mm5\n" /*(pix[1]+f);*/ \ | |
| 63 | " psubw %%mm4,%%mm7\n" /* (pix[2]-f); */ \ | |
| 64 | " packuswb %%mm0,%%mm5\n" /* mm5 = x x x x newpix1 */ \ | |
| 65 | " packuswb %%mm0,%%mm7\n" /* mm7 = x x x x newpix2 */ \ | |
| 66 | " punpcklbw %%mm7,%%mm5\n" /* 2 1 2 1 2 1 2 1 */ \ | |
| 67 | " movd %%mm5,%[d]\n" /* edi = newpix21 */ \ | |
| 68 | " movw %[d],1(%[pp])\n" \ | |
| 69 | " psrlq $32,%%mm5\n" /* why is so big stall here ? */ \ | |
| 70 | " shr $16,%[d]\n" \ | |
| 71 | " movw %[d],1(%[pp],%[ll],1)\n" \ | |
| 72 | " movd %%mm5,%[d]\n" /* eax = newpix21 high part */ \ | |
| 73 | " lea (%[ll],%[ll],2),%[s]\n" \ | |
| 74 | " movw %[d],1(%[pp],%[ll],2)\n" \ | |
| 75 | " shr $16,%[d]\n" \ | |
| 76 | " movw %[d],1(%[pp],%[s])\n" | |
| 77 | ||
| 78 | static void FilterHoriz__mmx(unsigned char * PixelPtr, | |
| 79 | ogg_int32_t LineLength, | |
| 80 | ogg_int16_t *BoundingValuePtr){ | |
| 29 | static void loop_filter_v(unsigned char *_pix,int _ystride, | |
| 30 | const ogg_int16_t *_ll){ | |
| 81 | 31 | long esi; |
| 82 | long edi; | |
| 32 | _pix-=_ystride*2; | |
| 83 | 33 | __asm__ __volatile__( |
| 84 | OC_LOOP_H_4x4 | |
| 85 | : [s]"=&r"(esi),[d]"=&r"(edi) | |
| 86 | : [pp]"r"(PixelPtr), [ll]"r"((long)LineLength), [bound]"r"(BoundingValuePtr-256), [V3]"m"(V3), [V804]"m"(V804) | |
| 87 | : "memory" | |
| 34 | /*mm0=0*/ | |
| 35 | "pxor %%mm0,%%mm0\n\t" | |
| 36 | /*esi=_ystride*3*/ | |
| 37 | "lea (%[ystride],%[ystride],2),%[s]\n\t" | |
| 38 | /*mm7=_pix[0...8]*/ | |
| 39 | "movq (%[pix]),%%mm7\n\t" | |
| 40 | /*mm4=_pix[0...8+_ystride*3]*/ | |
| 41 | "movq (%[pix],%[s]),%%mm4\n\t" | |
| 42 | /*mm6=_pix[0...8]*/ | |
| 43 | "movq %%mm7,%%mm6\n\t" | |
| 44 | /*Expand unsigned _pix[0...3] to 16 bits.*/ | |
| 45 | "punpcklbw %%mm0,%%mm6\n\t" | |
| 46 | "movq %%mm4,%%mm5\n\t" | |
| 47 | /*Expand unsigned _pix[4...8] to 16 bits.*/ | |
| 48 | "punpckhbw %%mm0,%%mm7\n\t" | |
| 49 | /*Expand other arrays too.*/ | |
| 50 | "punpcklbw %%mm0,%%mm4\n\t" | |
| 51 | "punpckhbw %%mm0,%%mm5\n\t" | |
| 52 | /*mm7:mm6=_p[0...8]-_p[0...8+_ystride*3]:*/ | |
| 53 | "psubw %%mm4,%%mm6\n\t" | |
| 54 | "psubw %%mm5,%%mm7\n\t" | |
| 55 | /*mm5=mm4=_pix[0...8+_ystride]*/ | |
| 56 | "movq (%[pix],%[ystride]),%%mm4\n\t" | |
| 57 | /*mm1=mm3=mm2=_pix[0...8+_ystride*2]*/ | |
| 58 | "movq (%[pix],%[ystride],2),%%mm2\n\t" | |
| 59 | "movq %%mm4,%%mm5\n\t" | |
| 60 | "movq %%mm2,%%mm3\n\t" | |
| 61 | "movq %%mm2,%%mm1\n\t" | |
| 62 | /*Expand these arrays.*/ | |
| 63 | "punpckhbw %%mm0,%%mm5\n\t" | |
| 64 | "punpcklbw %%mm0,%%mm4\n\t" | |
| 65 | "punpckhbw %%mm0,%%mm3\n\t" | |
| 66 | "punpcklbw %%mm0,%%mm2\n\t" | |
| 67 | /*Preload...*/ | |
| 68 | "movq %[OC_V3],%%mm0\n\t" | |
| 69 | /*mm3:mm2=_pix[0...8+_ystride*2]-_pix[0...8+_ystride]*/ | |
| 70 | "psubw %%mm5,%%mm3\n\t" | |
| 71 | "psubw %%mm4,%%mm2\n\t" | |
| 72 | /*Scale by 3.*/ | |
| 73 | "pmullw %%mm0,%%mm3\n\t" | |
| 74 | "pmullw %%mm0,%%mm2\n\t" | |
| 75 | /*Preload...*/ | |
| 76 | "movq %[OC_V4],%%mm0\n\t" | |
| 77 | /*f=mm3:mm2==_pix[0...8]-_pix[0...8+_ystride*3]+ | |
| 78 | 3*(_pix[0...8+_ystride*2]-_pix[0...8+_ystride])*/ | |
| 79 | "paddw %%mm7,%%mm3\n\t" | |
| 80 | "paddw %%mm6,%%mm2\n\t" | |
| 81 | /*Add 4.*/ | |
| 82 | "paddw %%mm0,%%mm3\n\t" | |
| 83 | "paddw %%mm0,%%mm2\n\t" | |
| 84 | /*"Divide" by 8.*/ | |
| 85 | "psraw $3,%%mm3\n\t" | |
| 86 | "psraw $3,%%mm2\n\t" | |
| 87 | /*Now compute lflim of mm3:mm2 cf. Section 7.10 of the spec.*/ | |
| 88 | /*Free up mm5.*/ | |
| 89 | "packuswb %%mm5,%%mm4\n\t" | |
| 90 | /*mm0=L L L L*/ | |
| 91 | "movq (%[ll]),%%mm0\n\t" | |
| 92 | /*if(R_i<-2L||R_i>2L)R_i=0:*/ | |
| 93 | "movq %%mm2,%%mm5\n\t" | |
| 94 | "pxor %%mm6,%%mm6\n\t" | |
| 95 | "movq %%mm0,%%mm7\n\t" | |
| 96 | "psubw %%mm0,%%mm6\n\t" | |
| 97 | "psllw $1,%%mm7\n\t" | |
| 98 | "psllw $1,%%mm6\n\t" | |
| 99 | /*mm2==R_3 R_2 R_1 R_0*/ | |
| 100 | /*mm5==R_3 R_2 R_1 R_0*/ | |
| 101 | /*mm6==-2L -2L -2L -2L*/ | |
| 102 | /*mm7==2L 2L 2L 2L*/ | |
| 103 | "pcmpgtw %%mm2,%%mm7\n\t" | |
| 104 | "pcmpgtw %%mm6,%%mm5\n\t" | |
| 105 | "pand %%mm7,%%mm2\n\t" | |
| 106 | "movq %%mm0,%%mm7\n\t" | |
| 107 | "pand %%mm5,%%mm2\n\t" | |
| 108 | "psllw $1,%%mm7\n\t" | |
| 109 | "movq %%mm3,%%mm5\n\t" | |
| 110 | /*mm3==R_7 R_6 R_5 R_4*/ | |
| 111 | /*mm5==R_7 R_6 R_5 R_4*/ | |
| 112 | /*mm6==-2L -2L -2L -2L*/ | |
| 113 | /*mm7==2L 2L 2L 2L*/ | |
| 114 | "pcmpgtw %%mm3,%%mm7\n\t" | |
| 115 | "pcmpgtw %%mm6,%%mm5\n\t" | |
| 116 | "pand %%mm7,%%mm3\n\t" | |
| 117 | "movq %%mm0,%%mm7\n\t" | |
| 118 | "pand %%mm5,%%mm3\n\t" | |
| 119 | /*if(R_i<-L)R_i'=R_i+2L; | |
| 120 | if(R_i>L)R_i'=R_i-2L; | |
| 121 | if(R_i<-L||R_i>L)R_i=-R_i':*/ | |
| 122 | "psraw $1,%%mm6\n\t" | |
| 123 | "movq %%mm2,%%mm5\n\t" | |
| 124 | "psllw $1,%%mm7\n\t" | |
| 125 | /*mm2==R_3 R_2 R_1 R_0*/ | |
| 126 | /*mm5==R_3 R_2 R_1 R_0*/ | |
| 127 | /*mm6==-L -L -L -L*/ | |
| 128 | /*mm0==L L L L*/ | |
| 129 | /*mm5=R_i>L?FF:00*/ | |
| 130 | "pcmpgtw %%mm0,%%mm5\n\t" | |
| 131 | /*mm6=-L>R_i?FF:00*/ | |
| 132 | "pcmpgtw %%mm2,%%mm6\n\t" | |
| 133 | /*mm7=R_i>L?2L:0*/ | |
| 134 | "pand %%mm5,%%mm7\n\t" | |
| 135 | /*mm2=R_i>L?R_i-2L:R_i*/ | |
| 136 | "psubw %%mm7,%%mm2\n\t" | |
| 137 | "movq %%mm0,%%mm7\n\t" | |
| 138 | /*mm5=-L>R_i||R_i>L*/ | |
| 139 | "por %%mm6,%%mm5\n\t" | |
| 140 | "psllw $1,%%mm7\n\t" | |
| 141 | /*mm7=-L>R_i?2L:0*/ | |
| 142 | "pand %%mm6,%%mm7\n\t" | |
| 143 | "pxor %%mm6,%%mm6\n\t" | |
| 144 | /*mm2=-L>R_i?R_i+2L:R_i*/ | |
| 145 | "paddw %%mm7,%%mm2\n\t" | |
| 146 | "psubw %%mm0,%%mm6\n\t" | |
| 147 | /*mm5=-L>R_i||R_i>L?-R_i':0*/ | |
| 148 | "pand %%mm2,%%mm5\n\t" | |
| 149 | "movq %%mm0,%%mm7\n\t" | |
| 150 | /*mm2=-L>R_i||R_i>L?0:R_i*/ | |
| 151 | "psubw %%mm5,%%mm2\n\t" | |
| 152 | "psllw $1,%%mm7\n\t" | |
| 153 | /*mm2=-L>R_i||R_i>L?-R_i':R_i*/ | |
| 154 | "psubw %%mm5,%%mm2\n\t" | |
| 155 | "movq %%mm3,%%mm5\n\t" | |
| 156 | /*mm3==R_7 R_6 R_5 R_4*/ | |
| 157 | /*mm5==R_7 R_6 R_5 R_4*/ | |
| 158 | /*mm6==-L -L -L -L*/ | |
| 159 | /*mm0==L L L L*/ | |
| 160 | /*mm6=-L>R_i?FF:00*/ | |
| 161 | "pcmpgtw %%mm3,%%mm6\n\t" | |
| 162 | /*mm5=R_i>L?FF:00*/ | |
| 163 | "pcmpgtw %%mm0,%%mm5\n\t" | |
| 164 | /*mm7=R_i>L?2L:0*/ | |
| 165 | "pand %%mm5,%%mm7\n\t" | |
| 166 | /*mm3=R_i>L?R_i-2L:R_i*/ | |
| 167 | "psubw %%mm7,%%mm3\n\t" | |
| 168 | "psllw $1,%%mm0\n\t" | |
| 169 | /*mm5=-L>R_i||R_i>L*/ | |
| 170 | "por %%mm6,%%mm5\n\t" | |
| 171 | /*mm0=-L>R_i?2L:0*/ | |
| 172 | "pand %%mm6,%%mm0\n\t" | |
| 173 | /*mm3=-L>R_i?R_i+2L:R_i*/ | |
| 174 | "paddw %%mm0,%%mm3\n\t" | |
| 175 | /*mm5=-L>R_i||R_i>L?-R_i':0*/ | |
| 176 | "pand %%mm3,%%mm5\n\t" | |
| 177 | /*mm3=-L>R_i||R_i>L?0:R_i*/ | |
| 178 | "psubw %%mm5,%%mm3\n\t" | |
| 179 | /*mm3=-L>R_i||R_i>L?-R_i':R_i*/ | |
| 180 | "psubw %%mm5,%%mm3\n\t" | |
| 181 | /*Unfortunately, there's no unsigned byte+signed byte with unsigned | |
| 182 | saturation op code, so we have to promote things back 16 bits.*/ | |
| 183 | "pxor %%mm0,%%mm0\n\t" | |
| 184 | "movq %%mm4,%%mm5\n\t" | |
| 185 | "punpcklbw %%mm0,%%mm4\n\t" | |
| 186 | "punpckhbw %%mm0,%%mm5\n\t" | |
| 187 | "movq %%mm1,%%mm6\n\t" | |
| 188 | "punpcklbw %%mm0,%%mm1\n\t" | |
| 189 | "punpckhbw %%mm0,%%mm6\n\t" | |
| 190 | /*_pix[0...8+_ystride]+=R_i*/ | |
| 191 | "paddw %%mm2,%%mm4\n\t" | |
| 192 | "paddw %%mm3,%%mm5\n\t" | |
| 193 | /*_pix[0...8+_ystride*2]-=R_i*/ | |
| 194 | "psubw %%mm2,%%mm1\n\t" | |
| 195 | "psubw %%mm3,%%mm6\n\t" | |
| 196 | "packuswb %%mm5,%%mm4\n\t" | |
| 197 | "packuswb %%mm6,%%mm1\n\t" | |
| 198 | /*Write it back out.*/ | |
| 199 | "movq %%mm4,(%[pix],%[ystride])\n\t" | |
| 200 | "movq %%mm1,(%[pix],%[ystride],2)\n\t" | |
| 201 | :[s]"=&S"(esi) | |
| 202 | :[pix]"r"(_pix),[ystride]"r"((long)_ystride),[ll]"r"(_ll), | |
| 203 | [OC_V3]"m"(OC_V3),[OC_V4]"m"(OC_V4) | |
| 204 | :"memory" | |
| 88 | 205 | ); |
| 206 | } | |
| 89 | 207 | |
| 90 | PixelPtr += LineLength*4; | |
| 91 | ||
| 208 | /*This code implements the bulk of loop_filter_h(). | |
| 209 | Data are striped p0 p1 p2 p3 ... p0 p1 p2 p3 ..., so in order to load all | |
| 210 | four p0's to one register we must transpose the values in four mmx regs. | |
| 211 | When half is done we repeat this for the rest.*/ | |
| 212 | static void loop_filter_h4(unsigned char *_pix,long _ystride, | |
| 213 | const ogg_int16_t *_ll){ | |
| 214 | long esi; | |
| 215 | long edi; | |
| 92 | 216 | __asm__ __volatile__( |
| 93 | OC_LOOP_H_4x4 | |
| 94 | "emms\n" | |
| 95 | : [s]"=&r"(esi),[d]"=&r"(edi) \ | |
| 96 | : [pp]"r"(PixelPtr), [ll]"r"((long)LineLength), [bound]"r"(BoundingValuePtr-256), [V3]"m"(V3), [V804]"m"(V804) \ | |
| 97 | : "memory" \ | |
| 98 | ); | |
| 217 | /*x x x x 3 2 1 0*/ | |
| 218 | "movd (%[pix]),%%mm0\n\t" | |
| 219 | /*esi=_ystride*3*/ | |
| 220 | "lea (%[ystride],%[ystride],2),%[s]\n\t" | |
| 221 | /*x x x x 7 6 5 4*/ | |
| 222 | "movd (%[pix],%[ystride]),%%mm1\n\t" | |
| 223 | /*x x x x B A 9 8*/ | |
| 224 | "movd (%[pix],%[ystride],2),%%mm2\n\t" | |
| 225 | /*x x x x F E D C*/ | |
| 226 | "movd (%[pix],%[s]),%%mm3\n\t" | |
| 227 | /*mm0=7 3 6 2 5 1 4 0*/ | |
| 228 | "punpcklbw %%mm1,%%mm0\n\t" | |
| 229 | /*mm2=F B E A D 9 C 8*/ | |
| 230 | "punpcklbw %%mm3,%%mm2\n\t" | |
| 231 | /*mm1=7 3 6 2 5 1 4 0*/ | |
| 232 | "movq %%mm0,%%mm1\n\t" | |
| 233 | /*mm0=F B 7 3 E A 6 2*/ | |
| 234 | "punpckhwd %%mm2,%%mm0\n\t" | |
| 235 | /*mm1=D 9 5 1 C 8 4 0*/ | |
| 236 | "punpcklwd %%mm2,%%mm1\n\t" | |
| 237 | "pxor %%mm7,%%mm7\n\t" | |
| 238 | /*mm5=D 9 5 1 C 8 4 0*/ | |
| 239 | "movq %%mm1,%%mm5\n\t" | |
| 240 | /*mm1=x C x 8 x 4 x 0==pix[0]*/ | |
| 241 | "punpcklbw %%mm7,%%mm1\n\t" | |
| 242 | /*mm5=x D x 9 x 5 x 1==pix[1]*/ | |
| 243 | "punpckhbw %%mm7,%%mm5\n\t" | |
| 244 | /*mm3=F B 7 3 E A 6 2*/ | |