#if COMPILE_TEMPLATE_MMXEXT
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#define MOVNTQ2 "movntq "
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#define MOVNTQ2 "movq "
#endif

#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
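/* MOVNTQ resolves to a non-temporal (cache-bypassing) movntq store on
 * MMXEXT-capable builds and to a plain movq store otherwise; the
 * REAL_MOVNTQ/MOVNTQ indirection only exists so that macro arguments are
 * expanded before the # stringification in REAL_MOVNTQ. */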
#if !COMPILE_TEMPLATE_MMXEXT
        __asm__ volatile("pxor %%mm0, %%mm0\n\t"
                         "movq (%0), %%mm3\n\t"
                         "movq %%mm3, %%mm4\n\t"
                         "psrlq $24, %%mm3\n\t"
                         "psllq $40, %%mm4\n\t"
                         "por %%mm4, %%mm3\n\t"
                         "movq %%mm3, %%mm4\n\t"
                         "punpcklbw %%mm0, %%mm3\n\t"
                         "punpckhbw %%mm0, %%mm4\n\t"
        __asm__ volatile("pxor %%mm0, %%mm0\n\t"
                         "movq (%0), %%mm3\n\t"
                         "movq %%mm3, %%mm4\n\t"
                         "punpcklbw %%mm0, %%mm3\n\t"
                         "punpckhbw %%mm0, %%mm4\n\t"
77 "punpcklwd %%mm1, %%mm1\n\t"
78 "punpckldq %%mm1, %%mm1\n\t"
80 "paddw %%mm1, %%mm3\n\t"
81 "paddw %%mm1, %%mm4\n\t"
88 "movq %%mm3, %%mm6\n\t"
89 "movq %%mm4, %%mm7\n\t"
91 "mov %0, %%"FF_REG_d
" \n\t"\
92 "mov (%%"FF_REG_d
"), %%"FF_REG_S
" \n\t"\
95 "movq 8(%%"FF_REG_d
"), %%mm0 \n\t" \
96 "movq (%%"FF_REG_S
", %%"FF_REG_c
", 2), %%mm2 \n\t" \
97 "movq 8(%%"FF_REG_S
", %%"FF_REG_c
", 2), %%mm5 \n\t" \
98 "add $16, %%"FF_REG_d
" \n\t"\
99 "mov (%%"FF_REG_d
"), %%"FF_REG_S
" \n\t"\
100 "test %%"FF_REG_S
", %%"FF_REG_S
" \n\t"\
101 "pmulhw %%mm0, %%mm2 \n\t"\
102 "pmulhw %%mm0, %%mm5 \n\t"\
103 "paddw %%mm2, %%mm3 \n\t"\
104 "paddw %%mm5, %%mm4 \n\t"\
106 "psraw $3, %%mm3 \n\t"\
107 "psraw $3, %%mm4 \n\t"\
108 "packuswb %%mm4, %%mm3 \n\t"
109 MOVNTQ2 " %%mm3, (%1, %%"FF_REG_c
")\n\t"
110 "add $8, %%"FF_REG_c
" \n\t"\
111 "cmp %2, %%"FF_REG_c
" \n\t"\
112 "movq %%mm6, %%mm3\n\t"
113 "movq %%mm7, %%mm4\n\t"
114 "mov %0, %%"FF_REG_d
" \n\t"\
115 "mov (%%"FF_REG_d
"), %%"FF_REG_S
" \n\t"\
119 :
"%"FF_REG_d,
"%"FF_REG_S,
"%"FF_REG_c
#define YSCALEYUV2PACKEDX_UV \
    "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"FF_REG_d" \n\t"\
    "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "movq 8(%%"FF_REG_d"), %%mm0 \n\t" \
    "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm2 \n\t" \
    "add %6, %%"FF_REG_S" \n\t" \
    "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm5 \n\t" \
    "add $16, %%"FF_REG_d" \n\t"\
    "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\

#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea "offset"(%0), %%"FF_REG_d" \n\t"\
    "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
    "movq "#dst1", "#dst2" \n\t"\
    "movq 8(%%"FF_REG_d"), "#coeff" \n\t" \
    "movq (%%"FF_REG_S", %%"FF_REG_a", 2), "#src1" \n\t" \
    "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), "#src2" \n\t" \
    "add $16, %%"FF_REG_d" \n\t"\
    "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "pmulhw "#coeff", "#src1" \n\t"\
    "pmulhw "#coeff", "#src2" \n\t"\
    "paddw "#src1", "#dst1" \n\t"\
    "paddw "#src2", "#dst2" \n\t"\
    "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\

#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \

#define YSCALEYUV2PACKEDX_END \
    :: "r" (&c->redDither), \
       "m" (dummy), "m" (dummy), "m" (dummy),\
       "r" (dest), "m" (dstW_reg), "m"(uv_off) \
       NAMED_CONSTRAINTS_ADD(bF8,bFC) \
    : "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_S \
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"FF_REG_d" \n\t"\
    "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm0 \n\t" \
    "add %6, %%"FF_REG_S" \n\t" \
    "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm2 \n\t" \
    "mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm1 \n\t" \
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"FF_REG_d"), %%mm1 \n\t" \
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "add %6, %%"FF_REG_S" \n\t" \
    "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm3 \n\t" \
    "mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"FF_REG_d" \n\t"\
    "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "movq %%mm4, "U_TEMP"(%0) \n\t"\
    "movq %%mm6, "V_TEMP"(%0) \n\t"\
#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea "offset"(%0), %%"FF_REG_d" \n\t"\
    "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "movq (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm0 \n\t" \
    "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm2 \n\t" \
    "mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "movq (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm4 \n\t" \
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm4, %%mm0 \n\t"\
    "punpckhwd %%mm4, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"FF_REG_d"), %%mm4 \n\t" \
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm3 \n\t"\
    "paddd %%mm0, %%mm1 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm3 \n\t" \
    "mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"FF_REG_d" \n\t"\
    "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm2 \n\t"\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "paddd %%mm2, %%mm7 \n\t"\
    "paddd %%mm0, %%mm6 \n\t"\
    "psrad $16, %%mm1 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm1 \n\t"\
    "packssdw %%mm6, %%mm7 \n\t"\
    "paddw %%mm0, %%mm1 \n\t"\
    "paddw %%mm0, %%mm7 \n\t"\
    "movq "U_TEMP"(%0), %%mm3 \n\t"\
    "movq "V_TEMP"(%0), %%mm4 \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
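/* The _ACCURATE variants differ from the plain ones above in that each loop
 * iteration consumes two filter taps at once: sample pairs are interleaved
 * with punpcklwd/punpckhwd and multiplied with pmaddwd, so accumulation
 * happens in 32-bit precision and is only narrowed back to 16 bits (psrad,
 * packssdw) once all taps have been summed. */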
#define YSCALEYUV2RGBX \
    "psubw "U_OFFSET"(%0), %%mm3 \n\t" \
    "psubw "V_OFFSET"(%0), %%mm4 \n\t" \
    "movq %%mm3, %%mm2 \n\t" \
    "movq %%mm4, %%mm5 \n\t" \
    "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
    "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1 \n\t" \
    "psubw "Y_OFFSET"(%0), %%mm7 \n\t" \
    "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\
#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq "#b", "#q2" \n\t" \
    "movq "#r", "#t" \n\t" \
    "punpcklbw "#g", "#b" \n\t" \
    "punpcklbw "#a", "#r" \n\t" \
    "punpckhbw "#g", "#q2" \n\t" \
    "punpckhbw "#a", "#t" \n\t" \
    "movq "#b", "#q0" \n\t" \
    "movq "#q2", "#q3" \n\t" \
    "punpcklwd "#r", "#q0" \n\t" \
    "punpckhwd "#r", "#b" \n\t" \
    "punpcklwd "#t", "#q2" \n\t" \
    "punpckhwd "#t", "#q3" \n\t" \
    MOVNTQ(   q0,   (dst, index, 4))\
    MOVNTQ(    b,  8(dst, index, 4))\
    MOVNTQ(   q2, 16(dst, index, 4))\
    MOVNTQ(   q3, 24(dst, index, 4))\
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\

#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
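/* REAL_WRITEBGR32 interleaves the four byte planes (b, g, r, a each hold 8
 * packed samples) into 8 four-byte pixels: punpcklbw/punpckhbw build b+g and
 * r+a pairs, punpcklwd/punpckhwd merge those into full pixels, and the four
 * MOVNTQ stores write 32 output bytes.  The trailing add/cmp advance the
 * pixel index for the caller's loop. */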
                              const int16_t **lumSrc, int lumFilterSize,
                              const int16_t *chrFilter, const int16_t **chrUSrc,
                              const int16_t **chrVSrc, int chrFilterSize,
                              const int16_t **alpSrc, uint8_t *dest,
                              int dstW, int dstY)
            "movq %%mm2, "U_TEMP"(%0) \n\t"
            "movq %%mm4, "V_TEMP"(%0) \n\t"
            "movq %%mm5, "Y_TEMP"(%0) \n\t"
            "movq "Y_TEMP"(%0), %%mm5 \n\t"
            "psraw $3, %%mm1 \n\t"
            "psraw $3, %%mm7 \n\t"
            "packuswb %%mm7, %%mm1 \n\t"
            WRITEBGR32(%4, "%5", %%FF_REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
            "pcmpeqd %%mm7, %%mm7 \n\t"
            WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                              const int16_t **lumSrc, int lumFilterSize,
                              const int16_t *chrFilter, const int16_t **chrUSrc,
                              const int16_t **chrVSrc, int chrFilterSize,
                              const int16_t **alpSrc, uint8_t *dest,
                              int dstW, int dstY)
            "psraw $3, %%mm1 \n\t"
            "psraw $3, %%mm7 \n\t"
            "packuswb %%mm7, %%mm1 \n\t"
            WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
            "pcmpeqd %%mm7, %%mm7 \n\t"
            WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                              const int16_t **lumSrc, int lumFilterSize,
                              const int16_t *chrFilter, const int16_t **chrUSrc,
                              const int16_t **chrVSrc, int chrFilterSize,
                              const int16_t **alpSrc, uint8_t *dest,
                              int dstW, int dstY)
            "psraw $3, %%mm1 \n\t"
            "psraw $3, %%mm7 \n\t"
            "packuswb %%mm7, %%mm1 \n\t"
            WRITEBGR32(%4, "%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
            "pcmpeqd %%mm7, %%mm7 \n\t"
            WRITEBGR32(%4, "%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" \
    "pand "MANGLE(bFC)", %%mm4 \n\t" \
    "pand "MANGLE(bF8)", %%mm5 \n\t" \
    "psrlq $3, %%mm2 \n\t"\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
    "psllq $3, %%mm3 \n\t"\
    "psllq $3, %%mm4 \n\t"\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\

#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
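/* The net effect of WRITERGB16 is standard RGB565 packing of the byte planes
 * left in %%mm2 (B), %%mm4 (G) and %%mm5 (R).  A scalar sketch of how one such
 * 16-bit pixel is assembled (illustrative only, not compiled): */
#if 0
#include <stdint.h>

static uint16_t pack_rgb565(uint8_t r, uint8_t g, uint8_t b)
{
    /* 5 bits red, 6 bits green, 5 bits blue */
    return (uint16_t)(((r & 0xF8) << 8) | ((g & 0xFC) << 3) | (b >> 3));
}
#endif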
                              const int16_t **lumSrc, int lumFilterSize,
                              const int16_t *chrFilter, const int16_t **chrUSrc,
                              const int16_t **chrVSrc, int chrFilterSize,
                              const int16_t **alpSrc, uint8_t *dest,
                              int dstW, int dstY)
        "pxor %%mm7, %%mm7 \n\t"
                              const int16_t **lumSrc, int lumFilterSize,
                              const int16_t *chrFilter, const int16_t **chrUSrc,
                              const int16_t **chrVSrc, int chrFilterSize,
                              const int16_t **alpSrc, uint8_t *dest,
                              int dstW, int dstY)
        "pxor %%mm7, %%mm7 \n\t"
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" \
    "pand "MANGLE(bF8)", %%mm4 \n\t" \
    "pand "MANGLE(bF8)", %%mm5 \n\t" \
    "psrlq $3, %%mm2 \n\t"\
    "psrlq $1, %%mm5 \n\t"\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
    "psllq $2, %%mm3 \n\t"\
    "psllq $2, %%mm4 \n\t"\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\

#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
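/* Same idea as WRITERGB16, but for RGB555: per pixel this computes
 * ((r & 0xF8) << 7) | ((g & 0xF8) << 2) | (b >> 3), leaving the top bit of
 * each 16-bit word clear. */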
                              const int16_t **lumSrc, int lumFilterSize,
                              const int16_t *chrFilter, const int16_t **chrUSrc,
                              const int16_t **chrVSrc, int chrFilterSize,
                              const int16_t **alpSrc, uint8_t *dest,
                              int dstW, int dstY)
        "pxor %%mm7, %%mm7 \n\t"
                              const int16_t **lumSrc, int lumFilterSize,
                              const int16_t *chrFilter, const int16_t **chrUSrc,
                              const int16_t **chrVSrc, int chrFilterSize,
                              const int16_t **alpSrc, uint8_t *dest,
                              int dstW, int dstY)
        "pxor %%mm7, %%mm7 \n\t"
#define WRITEBGR24MMX(dst, dstw, index) \
    "movq %%mm2, %%mm1 \n\t" \
    "movq %%mm5, %%mm6 \n\t" \
    "punpcklbw %%mm4, %%mm2 \n\t" \
    "punpcklbw %%mm7, %%mm5 \n\t" \
    "punpckhbw %%mm4, %%mm1 \n\t" \
    "punpckhbw %%mm7, %%mm6 \n\t" \
    "movq %%mm2, %%mm0 \n\t" \
    "movq %%mm1, %%mm3 \n\t" \
    "punpcklwd %%mm5, %%mm0 \n\t" \
    "punpckhwd %%mm5, %%mm2 \n\t" \
    "punpcklwd %%mm6, %%mm1 \n\t" \
    "punpckhwd %%mm6, %%mm3 \n\t" \
    "movq %%mm0, %%mm4 \n\t" \
    "movq %%mm2, %%mm6 \n\t" \
    "movq %%mm1, %%mm5 \n\t" \
    "movq %%mm3, %%mm7 \n\t" \
    "psllq $40, %%mm0 \n\t" \
    "psllq $40, %%mm2 \n\t" \
    "psllq $40, %%mm1 \n\t" \
    "psllq $40, %%mm3 \n\t" \
    "punpckhdq %%mm4, %%mm0 \n\t" \
    "punpckhdq %%mm6, %%mm2 \n\t" \
    "punpckhdq %%mm5, %%mm1 \n\t" \
    "punpckhdq %%mm7, %%mm3 \n\t" \
    "psrlq $8, %%mm0 \n\t" \
    "movq %%mm2, %%mm6 \n\t" \
    "psllq $40, %%mm2 \n\t" \
    "por %%mm2, %%mm0 \n\t" \
    MOVNTQ(%%mm0, (dst))\
    "psrlq $24, %%mm6 \n\t" \
    "movq %%mm1, %%mm5 \n\t" \
    "psllq $24, %%mm1 \n\t" \
    "por %%mm1, %%mm6 \n\t" \
    MOVNTQ(%%mm6, 8(dst))\
    "psrlq $40, %%mm5 \n\t" \
    "psllq $8, %%mm3 \n\t" \
    "por %%mm3, %%mm5 \n\t" \
    MOVNTQ(%%mm5, 16(dst))\
    "add $24, "#dst" \n\t"\
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\
#define WRITEBGR24MMXEXT(dst, dstw, index) \
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" \
    "pshufw $0x50, %%mm4, %%mm3 \n\t" \
    "pshufw $0x00, %%mm5, %%mm6 \n\t" \
    "pand %%mm0, %%mm1 \n\t" \
    "pand %%mm0, %%mm3 \n\t" \
    "pand %%mm7, %%mm6 \n\t" \
    "psllq $8, %%mm3 \n\t" \
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
    "psrlq $8, %%mm4 \n\t" \
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" \
    "pshufw $0x55, %%mm4, %%mm3 \n\t" \
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" \
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" \
    "pand %%mm7, %%mm3 \n\t" \
    "pand %%mm0, %%mm6 \n\t" \
    "por %%mm1, %%mm3 \n\t" \
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" \
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" \
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" \
    "pand %%mm7, %%mm1 \n\t" \
    "pand %%mm0, %%mm3 \n\t" \
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" \
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
    "add $24, "#dst" \n\t"\
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\
#if COMPILE_TEMPLATE_MMXEXT
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMXEXT(dst, dstw, index)
#else
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif
                              const int16_t **lumSrc, int lumFilterSize,
                              const int16_t *chrFilter, const int16_t **chrUSrc,
                              const int16_t **chrVSrc, int chrFilterSize,
                              const int16_t **alpSrc, uint8_t *dest,
                              int dstW, int dstY)
        "pxor %%mm7, %%mm7 \n\t"
        "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_c" \n\t"
        "add %4, %%"FF_REG_c" \n\t"
        :: "r" (&c->redDither),
           "r" (dest), "m" (dstW_reg), "m"(uv_off)
        : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S
                              const int16_t **lumSrc, int lumFilterSize,
                              const int16_t *chrFilter, const int16_t **chrUSrc,
                              const int16_t **chrVSrc, int chrFilterSize,
                              const int16_t **alpSrc, uint8_t *dest,
                              int dstW, int dstY)
        "pxor %%mm7, %%mm7 \n\t"
        "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_c" \n\t"
        "add %4, %%"FF_REG_c" \n\t"
        :: "r" (&c->redDither),
           "r" (dest), "m" (dstW_reg), "m"(uv_off)
        : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\

#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
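/* WRITEYUY2 packs 8 luma and 4 U/4 V samples into the YUYV (Y0 Cb Y1 Cr)
 * byte order of AV_PIX_FMT_YUYV422: chroma is interleaved first
 * (U0 V0 U1 V1 ...), then merged byte by byte with the luma row, giving two
 * 8-byte stores per group of 8 output pixels. */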
                              const int16_t **lumSrc, int lumFilterSize,
                              const int16_t *chrFilter, const int16_t **chrUSrc,
                              const int16_t **chrVSrc, int chrFilterSize,
                              const int16_t **alpSrc, uint8_t *dest,
                              int dstW, int dstY)
        "psraw $3, %%mm3 \n\t"
        "psraw $3, %%mm4 \n\t"
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
                              const int16_t **lumSrc, int lumFilterSize,
                              const int16_t *chrFilter, const int16_t **chrUSrc,
                              const int16_t **chrVSrc, int chrFilterSize,
                              const int16_t **alpSrc, uint8_t *dest,
                              int dstW, int dstY)
        "psraw $3, %%mm3 \n\t"
        "psraw $3, %%mm4 \n\t"
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor "#index", "#index" \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" \
    "movq (%3, "#index"), %%mm3 \n\t" \
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" \
    "movq (%3, "#index"), %%mm4 \n\t" \
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "psubw %%mm3, %%mm2 \n\t" \
    "psubw %%mm4, %%mm5 \n\t" \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" \
    "pmulhw %%mm0, %%mm5 \n\t" \
    "psraw $4, %%mm3 \n\t" \
    "psraw $4, %%mm4 \n\t" \
    "paddw %%mm2, %%mm3 \n\t" \
    "paddw %%mm5, %%mm4 \n\t" \
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" \
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" \
    "movq %%mm3, %%mm2 \n\t" \
    "movq %%mm4, %%mm5 \n\t" \
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq ("#b1", "#index", 2), %%mm0 \n\t" \
    "movq ("#b2", "#index", 2), %%mm1 \n\t" \
    "movq 8("#b1", "#index", 2), %%mm6 \n\t" \
    "movq 8("#b2", "#index", 2), %%mm7 \n\t" \
    "psubw %%mm1, %%mm0 \n\t" \
    "psubw %%mm7, %%mm6 \n\t" \
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" \
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" \
    "psraw $4, %%mm1 \n\t" \
    "psraw $4, %%mm7 \n\t" \
    "paddw %%mm0, %%mm1 \n\t" \
    "paddw %%mm6, %%mm7 \n\t" \

#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" \
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" \
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)

#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)
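/* YSCALEYUV2RGB is the two-line ("bilinear") path: instead of a full filter
 * loop it blends exactly two source lines per plane with a
 * psubw/pmulhw/paddw linear interpolation, using the blend factors stored at
 * LUM_MMX_FILTER_OFFSET+8 and CHR_MMX_FILTER_OFFSET+8 in the context.  The
 * _COEFF tail then applies the same YUV->RGB matrix as YSCALEYUV2RGBX and
 * leaves packed B/R/G bytes in mm2/mm5/mm4. */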
                               const int16_t *ubuf[2], const int16_t *vbuf[2],
                               const int16_t *abuf[2], uint8_t *dest,
                               int dstW, int yalpha, int uvalpha, int y)
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
        const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
            "psraw $3, %%mm1 \n\t"
            "psraw $3, %%mm7 \n\t"
            "packuswb %%mm7, %%mm1 \n\t"
            WRITEBGR32(%4, DSTW_OFFSET"(%5)", %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
               "r" (abuf0), "r" (abuf1)
            c->u_temp=(intptr_t)abuf0;
            c->v_temp=(intptr_t)abuf1;
912 "mov %4, %%"FF_REG_b
" \n\t"
913 "push %%"FF_REG_BP
" \n\t"
917 "mov "U_TEMP"(%5), %0 \n\t"
918 "mov "V_TEMP"(%5), %1 \n\t"
920 "psraw $3, %%mm1 \n\t"
921 "psraw $3, %%mm7 \n\t"
922 "packuswb %%mm7, %%mm1 \n\t"
925 WRITEBGR32(%%FF_REGb,
DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
926 "pop %%"FF_REG_BP
" \n\t"
928 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
935 "mov %4, %%"FF_REG_b
" \n\t"
936 "push %%"FF_REG_BP
" \n\t"
938 "pcmpeqd %%mm7, %%mm7 \n\t"
939 WRITEBGR32(%%FF_REGb,
DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
940 "pop %%"FF_REG_BP
" \n\t"
942 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
                               const int16_t *ubuf[2], const int16_t *vbuf[2],
                               const int16_t *abuf[2], uint8_t *dest,
                               int dstW, int yalpha, int uvalpha, int y)
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
        "mov %4, %%"FF_REG_b" \n\t"
        "push %%"FF_REG_BP" \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "pop %%"FF_REG_BP" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                               const int16_t *ubuf[2], const int16_t *vbuf[2],
                               const int16_t *abuf[2], uint8_t *dest,
                               int dstW, int yalpha, int uvalpha, int y)
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
        "mov %4, %%"FF_REG_b" \n\t"
        "push %%"FF_REG_BP" \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "pop %%"FF_REG_BP" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                               const int16_t *ubuf[2], const int16_t *vbuf[2],
                               const int16_t *abuf[2], uint8_t *dest,
                               int dstW, int yalpha, int uvalpha, int y)
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
        "mov %4, %%"FF_REG_b" \n\t"
        "push %%"FF_REG_BP" \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "pop %%"FF_REG_BP" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
    "psraw $3, %%mm0 \n\t"\
    "psraw $3, %%mm1 \n\t"\
    "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor "#index", "#index" \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" \
    "movq (%3, "#index"), %%mm3 \n\t" \
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" \
    "movq (%3, "#index"), %%mm4 \n\t" \
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "psubw %%mm3, %%mm2 \n\t" \
    "psubw %%mm4, %%mm5 \n\t" \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" \
    "pmulhw %%mm0, %%mm5 \n\t" \
    "psraw $7, %%mm3 \n\t" \
    "psraw $7, %%mm4 \n\t" \
    "paddw %%mm2, %%mm3 \n\t" \
    "paddw %%mm5, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm0 \n\t" \
    "movq (%1, "#index", 2), %%mm1 \n\t" \
    "movq 8(%0, "#index", 2), %%mm6 \n\t" \
    "movq 8(%1, "#index", 2), %%mm7 \n\t" \
    "psubw %%mm1, %%mm0 \n\t" \
    "psubw %%mm7, %%mm6 \n\t" \
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" \
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" \
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t" \
    "paddw %%mm0, %%mm1 \n\t" \
    "paddw %%mm6, %%mm7 \n\t" \

#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
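/* YSCALEYUV2PACKED is the two-line blend used for packed YUV output: since no
 * colorspace matrix follows, it pre-shifts the stored blend factors right by
 * 3 and shifts the samples down by 7 so that the interpolated Y/U/V values
 * come out at the 8-bit scale WRITEYUY2 expects. */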
                               const int16_t *ubuf[2], const int16_t *vbuf[2],
                               const int16_t *abuf[2], uint8_t *dest,
                               int dstW, int yalpha, int uvalpha, int y)
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
        "mov %4, %%"FF_REG_b" \n\t"
        "push %%"FF_REG_BP" \n\t"
        "pop %%"FF_REG_BP" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor "#index", "#index" \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" \
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm4 \n\t" \
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "psraw $4, %%mm3 \n\t" \
    "psraw $4, %%mm4 \n\t" \
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" \
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" \
    "movq %%mm3, %%mm2 \n\t" \
    "movq %%mm4, %%mm5 \n\t" \
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    "movq (%0, "#index", 2), %%mm1 \n\t" \
    "movq 8(%0, "#index", 2), %%mm7 \n\t" \
    "psraw $4, %%mm1 \n\t" \
    "psraw $4, %%mm7 \n\t" \
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" \
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" \
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor "#index", "#index" \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" \
    "movq (%3, "#index"), %%mm3 \n\t" \
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" \
    "movq (%3, "#index"), %%mm4 \n\t" \
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "paddw %%mm2, %%mm3 \n\t" \
    "paddw %%mm5, %%mm4 \n\t" \
    "psrlw $5, %%mm3 \n\t" \
    "psrlw $5, %%mm4 \n\t" \
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" \
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" \
    "movq %%mm3, %%mm2 \n\t" \
    "movq %%mm4, %%mm5 \n\t" \
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    "movq (%0, "#index", 2), %%mm1 \n\t" \
    "movq 8(%0, "#index", 2), %%mm7 \n\t" \
    "psraw $4, %%mm1 \n\t" \
    "psraw $4, %%mm7 \n\t" \
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" \
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" \
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
    "movq (%1, "#index", 2), %%mm7 \n\t" \
    "movq 8(%1, "#index", 2), %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t" \
    "psraw $7, %%mm1 \n\t" \
    "packuswb %%mm1, %%mm7 \n\t"
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
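/* The *1 / *1b macros are the single-source paths used when no vertical
 * blending of luma is needed: *1 reads one chroma line directly (uvalpha <
 * 2048 in the callers below), while *1b averages the two neighbouring chroma
 * lines with paddw + psrlw.  *1_ALPHA additionally packs one alpha line to
 * bytes in mm7. */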
                               const int16_t *ubuf[2], const int16_t *vbuf[2],
                               const int16_t *abuf0, uint8_t *dest,
                               int dstW, int uvalpha, int y)
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1= buf0;
    if (uvalpha < 2048) {
        const int16_t *ubuf1 = ubuf[0];
            "mov %4, %%"FF_REG_b" \n\t"
            "push %%"FF_REG_BP" \n\t"
            WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
            "pop %%"FF_REG_BP" \n\t"
            :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
            "mov %4, %%"FF_REG_b" \n\t"
            "push %%"FF_REG_BP" \n\t"
            "pcmpeqd %%mm7, %%mm7 \n\t"
            WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
            "pop %%"FF_REG_BP" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
        const int16_t *ubuf1 = ubuf[1];
            "mov %4, %%"FF_REG_b" \n\t"
            "push %%"FF_REG_BP" \n\t"
            WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
            "pop %%"FF_REG_BP" \n\t"
            :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
            "mov %4, %%"FF_REG_b" \n\t"
            "push %%"FF_REG_BP" \n\t"
            "pcmpeqd %%mm7, %%mm7 \n\t"
            WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
            "pop %%"FF_REG_BP" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                               const int16_t *ubuf[2], const int16_t *vbuf[2],
                               const int16_t *abuf0, uint8_t *dest,
                               int dstW, int uvalpha, int y)
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1= buf0;
    if (uvalpha < 2048) {
        const int16_t *ubuf1 = ubuf[0];
        "mov %4, %%"FF_REG_b" \n\t"
        "push %%"FF_REG_BP" \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "pop %%"FF_REG_BP" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
        const int16_t *ubuf1 = ubuf[1];
        "mov %4, %%"FF_REG_b" \n\t"
        "push %%"FF_REG_BP" \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "pop %%"FF_REG_BP" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                               const int16_t *ubuf[2], const int16_t *vbuf[2],
                               const int16_t *abuf0, uint8_t *dest,
                               int dstW, int uvalpha, int y)
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1= buf0;
    if (uvalpha < 2048) {
        const int16_t *ubuf1 = ubuf[0];
        "mov %4, %%"FF_REG_b" \n\t"
        "push %%"FF_REG_BP" \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "pop %%"FF_REG_BP" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
        const int16_t *ubuf1 = ubuf[1];
        "mov %4, %%"FF_REG_b" \n\t"
        "push %%"FF_REG_BP" \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "pop %%"FF_REG_BP" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                               const int16_t *ubuf[2], const int16_t *vbuf[2],
                               const int16_t *abuf0, uint8_t *dest,
                               int dstW, int uvalpha, int y)
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1= buf0;
    if (uvalpha < 2048) {
        const int16_t *ubuf1 = ubuf[0];
        "mov %4, %%"FF_REG_b" \n\t"
        "push %%"FF_REG_BP" \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "pop %%"FF_REG_BP" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
        const int16_t *ubuf1 = ubuf[1];
        "mov %4, %%"FF_REG_b" \n\t"
        "push %%"FF_REG_BP" \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "pop %%"FF_REG_BP" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor "#index", "#index" \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" \
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm4 \n\t" \
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "psraw $7, %%mm3 \n\t" \
    "psraw $7, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" \
    "movq 8(%0, "#index", 2), %%mm7 \n\t" \
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t" \

#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)

#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index" \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" \
    "movq (%3, "#index"), %%mm3 \n\t" \
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" \
    "movq (%3, "#index"), %%mm4 \n\t" \
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "paddw %%mm2, %%mm3 \n\t" \
    "paddw %%mm5, %%mm4 \n\t" \
    "psrlw $8, %%mm3 \n\t" \
    "psrlw $8, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" \
    "movq 8(%0, "#index", 2), %%mm7 \n\t" \
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
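/* Same split as for the RGB *1 / *1b macros, but for packed YUV output:
 * samples are only brought down to 8-bit range (psraw $7, or paddw + psrlw $8
 * for the averaged-chroma variant) and then handed to WRITEYUY2. */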
                               const int16_t *ubuf[2], const int16_t *vbuf[2],
                               const int16_t *abuf0, uint8_t *dest,
                               int dstW, int uvalpha, int y)
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1= buf0;
    if (uvalpha < 2048) {
        const int16_t *ubuf1 = ubuf[0];
        "mov %4, %%"FF_REG_b" \n\t"
        "push %%"FF_REG_BP" \n\t"
        "pop %%"FF_REG_BP" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
        const int16_t *ubuf1 = ubuf[1];
        "mov %4, %%"FF_REG_b" \n\t"
        "push %%"FF_REG_BP" \n\t"
        "pop %%"FF_REG_BP" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
    c->use_mmx_vfilter = 0;
    switch (c->dstFormat) {
    c->use_mmx_vfilter = 1;
    switch (c->dstFormat) {
    switch (c->dstFormat) {
    if (c->srcBpc == 8 && c->dstBpc <= 14) {
#if COMPILE_TEMPLATE_MMXEXT
    c->hyscale_fast = NULL;
    c->hcscale_fast = NULL;
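/* The fast bilinear horizontal scalers are disabled by default; the
 * #if COMPILE_TEMPLATE_MMXEXT block that follows re-enables
 * ff_hyscale_fast_mmxext / ff_hcscale_fast_mmxext when SWS_FAST_BILINEAR is
 * requested and the context allows it. */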
#if COMPILE_TEMPLATE_MMXEXT