#if COMPILE_TEMPLATE_AMD3DNOW
#define PREFETCH  "prefetch"
#define PAVGB     "pavgusb"
#elif COMPILE_TEMPLATE_MMXEXT
#define PREFETCH "prefetchnta"
#define PAVGB     "pavgb"
#else
#define PREFETCH  " # nop"
#endif

#if COMPILE_TEMPLATE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

#if COMPILE_TEMPLATE_MMXEXT
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif

#if !COMPILE_TEMPLATE_SSE2

#if !COMPILE_TEMPLATE_AMD3DNOW
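/*
 * Added note: this file is a template.  It is #included several times with
 * different COMPILE_TEMPLATE_* settings, so each RENAME()'d function below
 * is emitted once per CPU flavour (plain MMX, MMXEXT, 3DNow!, SSE2) with
 * the matching prefetch, average, store and FPU-state-clear instructions
 * selected by the macros above.
 */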
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    __asm__ volatile("movq        %0, %%mm7"::"m"(mask32a):"memory");
        "movd        (%1), %%mm0    \n\t"
        "punpckldq  3(%1), %%mm0    \n\t"
        "movd       6(%1), %%mm1    \n\t"
        "punpckldq  9(%1), %%mm1    \n\t"
        "movd      12(%1), %%mm2    \n\t"
        "punpckldq 15(%1), %%mm2    \n\t"
        "movd      18(%1), %%mm3    \n\t"
        "punpckldq 21(%1), %%mm3    \n\t"
        "por        %%mm7, %%mm0    \n\t"
        "por        %%mm7, %%mm1    \n\t"
        "por        %%mm7, %%mm2    \n\t"
        "por        %%mm7, %%mm3    \n\t"
        MOVNTQ"     %%mm0,   (%0)   \n\t"
        MOVNTQ"     %%mm1,  8(%0)   \n\t"
        MOVNTQ"     %%mm2, 16(%0)   \n\t"
        MOVNTQ"     %%mm3, 24(%0)"
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#define STORE_BGR24_MMX \
    "psrlq         $8, %%mm2    \n\t" \
    "psrlq         $8, %%mm3    \n\t" \
    "psrlq         $8, %%mm6    \n\t" \
    "psrlq         $8, %%mm7    \n\t" \
    "pand "MANGLE(mask24l)", %%mm0\n\t" \
    "pand "MANGLE(mask24l)", %%mm1\n\t" \
    "pand "MANGLE(mask24l)", %%mm4\n\t" \
    "pand "MANGLE(mask24l)", %%mm5\n\t" \
    "pand "MANGLE(mask24h)", %%mm2\n\t" \
    "pand "MANGLE(mask24h)", %%mm3\n\t" \
    "pand "MANGLE(mask24h)", %%mm6\n\t" \
    "pand "MANGLE(mask24h)", %%mm7\n\t" \
    "por        %%mm2, %%mm0    \n\t" \
    "por        %%mm3, %%mm1    \n\t" \
    "por        %%mm6, %%mm4    \n\t" \
    "por        %%mm7, %%mm5    \n\t" \
    \
    "movq       %%mm1, %%mm2    \n\t" \
    "movq       %%mm4, %%mm3    \n\t" \
    "psllq       $48, %%mm2     \n\t" \
    "psllq       $32, %%mm3     \n\t" \
    "por        %%mm2, %%mm0    \n\t" \
    "psrlq       $16, %%mm1     \n\t" \
    "psrlq       $32, %%mm4     \n\t" \
    "psllq       $16, %%mm5     \n\t" \
    "por        %%mm3, %%mm1    \n\t" \
    "por        %%mm5, %%mm4    \n\t" \
    \
    MOVNTQ"     %%mm0,   (%0)   \n\t" \
    MOVNTQ"     %%mm1,  8(%0)   \n\t" \
    MOVNTQ"     %%mm4, 16(%0)"
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
        "movq    (%1), %%mm0    \n\t"
        "movq   8(%1), %%mm1    \n\t"
        "movq  16(%1), %%mm4    \n\t"
        "movq  24(%1), %%mm5    \n\t"
        "movq   %%mm0, %%mm2    \n\t"
        "movq   %%mm1, %%mm3    \n\t"
        "movq   %%mm4, %%mm6    \n\t"
        "movq   %%mm5, %%mm7    \n\t"
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    __asm__ volatile(PREFETCH"    %0"::"m"(*s));
    __asm__ volatile("movq        %0, %%mm4"::"m"(mask15s));
        "movq   (%1), %%mm0    \n\t"
        "movq  8(%1), %%mm2    \n\t"
        "movq  %%mm0, %%mm1    \n\t"
        "movq  %%mm2, %%mm3    \n\t"
        "pand  %%mm4, %%mm0    \n\t"
        "pand  %%mm4, %%mm2    \n\t"
        "paddw %%mm1, %%mm0    \n\t"
        "paddw %%mm3, %%mm2    \n\t"
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        register unsigned x= *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        register unsigned short x= *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
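    /*
     * Added note: both the MMX paddw above and the scalar tail use the same
     * trick.  x & 0x7FE0 keeps the two upper 5-bit fields of a 15-bit
     * pixel, and adding that masked copy back doubles those fields, i.e.
     * shifts each of them one bit left into its 16-bit (565) position,
     * while the low 5-bit field passes through untouched.  The low bit of
     * the resulting 6-bit middle field is left at zero.
     */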
    __asm__ volatile(PREFETCH"    %0"::"m"(*s));
    __asm__ volatile("movq        %0, %%mm7"::"m"(mask15rg));
    __asm__ volatile("movq        %0, %%mm6"::"m"(mask15b));
        "movq   (%1), %%mm0    \n\t"
        "movq  8(%1), %%mm2    \n\t"
        "movq  %%mm0, %%mm1    \n\t"
        "movq  %%mm2, %%mm3    \n\t"
        "psrlq    $1, %%mm0    \n\t"
        "psrlq    $1, %%mm2    \n\t"
        "pand  %%mm7, %%mm0    \n\t"
        "pand  %%mm7, %%mm2    \n\t"
        "pand  %%mm6, %%mm1    \n\t"
        "pand  %%mm6, %%mm3    \n\t"
        "por   %%mm1, %%mm0    \n\t"
        "por   %%mm3, %%mm2    \n\t"
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        register uint32_t x= *((const uint32_t*)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        register uint16_t x= *((const uint16_t*)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
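    /*
     * Added note: the reverse direction simply drops the low green bit --
     * shifting the whole 16-bit word right by one moves the two upper
     * fields into their 15-bit slots (mask15rg keeps bits 5..14), while
     * the low 5-bit field is taken unshifted from the original word
     * (mask15b keeps bits 0..4).
     */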
    uint16_t *d = (uint16_t *)dst;
        "movq          %3, %%mm5    \n\t"
        "movq          %4, %%mm6    \n\t"
        "movq          %5, %%mm7    \n\t"
        "movd        (%1), %%mm0    \n\t"
        "movd       4(%1), %%mm3    \n\t"
        "punpckldq  8(%1), %%mm0    \n\t"
        "punpckldq 12(%1), %%mm3    \n\t"
        "movq       %%mm0, %%mm1    \n\t"
        "movq       %%mm3, %%mm4    \n\t"
        "pand       %%mm6, %%mm0    \n\t"
        "pand       %%mm6, %%mm3    \n\t"
        "pmaddwd    %%mm7, %%mm0    \n\t"
        "pmaddwd    %%mm7, %%mm3    \n\t"
        "pand       %%mm5, %%mm1    \n\t"
        "pand       %%mm5, %%mm4    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "psrld         $5, %%mm0    \n\t"
        "pslld        $11, %%mm3    \n\t"
        "por        %%mm3, %%mm0    \n\t"
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
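    /*
     * Added note: the scalar tail shows the whole 32->16 conversion in one
     * expression: take the top 5 bits of the first 8-bit channel (>>3),
     * the top 6 bits of the second shifted to bit 5 (0xFC00>>5), and the
     * top 5 bits of the third shifted to bit 11 (0xF80000>>8).  The three
     * fields cannot overlap, so '+' acts as a bitwise OR here.
     */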
    uint16_t *d = (uint16_t *)dst;
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
        "movq          %0, %%mm7    \n\t"
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
        "movd        (%1), %%mm0    \n\t"
        "movd       4(%1), %%mm3    \n\t"
        "punpckldq  8(%1), %%mm0    \n\t"
        "punpckldq 12(%1), %%mm3    \n\t"
        "movq       %%mm0, %%mm1    \n\t"
        "movq       %%mm0, %%mm2    \n\t"
        "movq       %%mm3, %%mm4    \n\t"
        "movq       %%mm3, %%mm5    \n\t"
        "psllq         $8, %%mm0    \n\t"
        "psllq         $8, %%mm3    \n\t"
        "pand       %%mm7, %%mm0    \n\t"
        "pand       %%mm7, %%mm3    \n\t"
        "psrlq         $5, %%mm1    \n\t"
        "psrlq         $5, %%mm4    \n\t"
        "pand       %%mm6, %%mm1    \n\t"
        "pand       %%mm6, %%mm4    \n\t"
        "psrlq        $19, %%mm2    \n\t"
        "psrlq        $19, %%mm5    \n\t"
        "pand          %2, %%mm2    \n\t"
        "pand          %2, %%mm5    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "por        %%mm5, %%mm3    \n\t"
        "psllq        $16, %%mm3    \n\t"
        "por        %%mm3, %%mm0    \n\t"
        ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    uint16_t *d = (uint16_t *)dst;
        "movq          %3, %%mm5    \n\t"
        "movq          %4, %%mm6    \n\t"
        "movq          %5, %%mm7    \n\t"
        "movd        (%1), %%mm0    \n\t"
        "movd       4(%1), %%mm3    \n\t"
        "punpckldq  8(%1), %%mm0    \n\t"
        "punpckldq 12(%1), %%mm3    \n\t"
        "movq       %%mm0, %%mm1    \n\t"
        "movq       %%mm3, %%mm4    \n\t"
        "pand       %%mm6, %%mm0    \n\t"
        "pand       %%mm6, %%mm3    \n\t"
        "pmaddwd    %%mm7, %%mm0    \n\t"
        "pmaddwd    %%mm7, %%mm3    \n\t"
        "pand       %%mm5, %%mm1    \n\t"
        "pand       %%mm5, %%mm4    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "psrld         $6, %%mm0    \n\t"
        "pslld        $10, %%mm3    \n\t"
        "por        %%mm3, %%mm0    \n\t"
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    uint16_t *d = (uint16_t *)dst;
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
        "movq          %0, %%mm7    \n\t"
        "movq          %1, %%mm6    \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
        "movd        (%1), %%mm0    \n\t"
        "movd       4(%1), %%mm3    \n\t"
        "punpckldq  8(%1), %%mm0    \n\t"
        "punpckldq 12(%1), %%mm3    \n\t"
        "movq       %%mm0, %%mm1    \n\t"
        "movq       %%mm0, %%mm2    \n\t"
        "movq       %%mm3, %%mm4    \n\t"
        "movq       %%mm3, %%mm5    \n\t"
        "psllq         $7, %%mm0    \n\t"
        "psllq         $7, %%mm3    \n\t"
        "pand       %%mm7, %%mm0    \n\t"
        "pand       %%mm7, %%mm3    \n\t"
        "psrlq         $6, %%mm1    \n\t"
        "psrlq         $6, %%mm4    \n\t"
        "pand       %%mm6, %%mm1    \n\t"
        "pand       %%mm6, %%mm4    \n\t"
        "psrlq        $19, %%mm2    \n\t"
        "psrlq        $19, %%mm5    \n\t"
        "pand          %2, %%mm2    \n\t"
        "pand          %2, %%mm5    \n\t"
        "por        %%mm1, %%mm0    \n\t"
        "por        %%mm4, %%mm3    \n\t"
        "por        %%mm2, %%mm0    \n\t"
        "por        %%mm5, %%mm3    \n\t"
        "psllq        $16, %%mm3    \n\t"
        "por        %%mm3, %%mm0    \n\t"
        ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    uint16_t *d = (uint16_t *)dst;
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
        "movq         %0, %%mm7    \n\t"
        "movq         %1, %%mm6    \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
        "movd       (%1), %%mm0    \n\t"
        "movd      3(%1), %%mm3    \n\t"
        "punpckldq 6(%1), %%mm0    \n\t"
        "punpckldq 9(%1), %%mm3    \n\t"
        "movq      %%mm0, %%mm1    \n\t"
        "movq      %%mm0, %%mm2    \n\t"
        "movq      %%mm3, %%mm4    \n\t"
        "movq      %%mm3, %%mm5    \n\t"
        "psrlq        $3, %%mm0    \n\t"
        "psrlq        $3, %%mm3    \n\t"
        "pand         %2, %%mm0    \n\t"
        "pand         %2, %%mm3    \n\t"
        "psrlq        $5, %%mm1    \n\t"
        "psrlq        $5, %%mm4    \n\t"
        "pand      %%mm6, %%mm1    \n\t"
        "pand      %%mm6, %%mm4    \n\t"
        "psrlq        $8, %%mm2    \n\t"
        "psrlq        $8, %%mm5    \n\t"
        "pand      %%mm7, %%mm2    \n\t"
        "pand      %%mm7, %%mm5    \n\t"
        "por       %%mm1, %%mm0    \n\t"
        "por       %%mm4, %%mm3    \n\t"
        "por       %%mm2, %%mm0    \n\t"
        "por       %%mm5, %%mm3    \n\t"
        "psllq       $16, %%mm3    \n\t"
        "por       %%mm3, %%mm0    \n\t"
        ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    uint16_t *d = (uint16_t *)dst;
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
        "movq         %0, %%mm7    \n\t"
        "movq         %1, %%mm6    \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
        "movd       (%1), %%mm0    \n\t"
        "movd      3(%1), %%mm3    \n\t"
        "punpckldq 6(%1), %%mm0    \n\t"
        "punpckldq 9(%1), %%mm3    \n\t"
        "movq      %%mm0, %%mm1    \n\t"
        "movq      %%mm0, %%mm2    \n\t"
        "movq      %%mm3, %%mm4    \n\t"
        "movq      %%mm3, %%mm5    \n\t"
        "psllq        $8, %%mm0    \n\t"
        "psllq        $8, %%mm3    \n\t"
        "pand      %%mm7, %%mm0    \n\t"
        "pand      %%mm7, %%mm3    \n\t"
        "psrlq        $5, %%mm1    \n\t"
        "psrlq        $5, %%mm4    \n\t"
        "pand      %%mm6, %%mm1    \n\t"
        "pand      %%mm6, %%mm4    \n\t"
        "psrlq       $19, %%mm2    \n\t"
        "psrlq       $19, %%mm5    \n\t"
        "pand         %2, %%mm2    \n\t"
        "pand         %2, %%mm5    \n\t"
        "por       %%mm1, %%mm0    \n\t"
        "por       %%mm4, %%mm3    \n\t"
        "por       %%mm2, %%mm0    \n\t"
        "por       %%mm5, %%mm3    \n\t"
        "psllq       $16, %%mm3    \n\t"
        "por       %%mm3, %%mm0    \n\t"
        ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    uint16_t *d = (uint16_t *)dst;
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
        "movq         %0, %%mm7    \n\t"
        "movq         %1, %%mm6    \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
        "movd       (%1), %%mm0    \n\t"
        "movd      3(%1), %%mm3    \n\t"
        "punpckldq 6(%1), %%mm0    \n\t"
        "punpckldq 9(%1), %%mm3    \n\t"
        "movq      %%mm0, %%mm1    \n\t"
        "movq      %%mm0, %%mm2    \n\t"
        "movq      %%mm3, %%mm4    \n\t"
        "movq      %%mm3, %%mm5    \n\t"
        "psrlq        $3, %%mm0    \n\t"
        "psrlq        $3, %%mm3    \n\t"
        "pand         %2, %%mm0    \n\t"
        "pand         %2, %%mm3    \n\t"
        "psrlq        $6, %%mm1    \n\t"
        "psrlq        $6, %%mm4    \n\t"
        "pand      %%mm6, %%mm1    \n\t"
        "pand      %%mm6, %%mm4    \n\t"
        "psrlq        $9, %%mm2    \n\t"
        "psrlq        $9, %%mm5    \n\t"
        "pand      %%mm7, %%mm2    \n\t"
        "pand      %%mm7, %%mm5    \n\t"
        "por       %%mm1, %%mm0    \n\t"
        "por       %%mm4, %%mm3    \n\t"
        "por       %%mm2, %%mm0    \n\t"
        "por       %%mm5, %%mm3    \n\t"
        "psllq       $16, %%mm3    \n\t"
        "por       %%mm3, %%mm0    \n\t"
        ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    uint16_t *d = (uint16_t *)dst;
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
        "movq         %0, %%mm7    \n\t"
        "movq         %1, %%mm6    \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
        "movd       (%1), %%mm0    \n\t"
        "movd      3(%1), %%mm3    \n\t"
        "punpckldq 6(%1), %%mm0    \n\t"
        "punpckldq 9(%1), %%mm3    \n\t"
        "movq      %%mm0, %%mm1    \n\t"
        "movq      %%mm0, %%mm2    \n\t"
        "movq      %%mm3, %%mm4    \n\t"
        "movq      %%mm3, %%mm5    \n\t"
        "psllq        $7, %%mm0    \n\t"
        "psllq        $7, %%mm3    \n\t"
        "pand      %%mm7, %%mm0    \n\t"
        "pand      %%mm7, %%mm3    \n\t"
        "psrlq        $6, %%mm1    \n\t"
        "psrlq        $6, %%mm4    \n\t"
        "pand      %%mm6, %%mm1    \n\t"
        "pand      %%mm6, %%mm4    \n\t"
        "psrlq       $19, %%mm2    \n\t"
        "psrlq       $19, %%mm5    \n\t"
        "pand         %2, %%mm2    \n\t"
        "pand         %2, %%mm5    \n\t"
        "por       %%mm1, %%mm0    \n\t"
        "por       %%mm4, %%mm3    \n\t"
        "por       %%mm2, %%mm0    \n\t"
        "por       %%mm5, %%mm3    \n\t"
        "psllq       $16, %%mm3    \n\t"
        "por       %%mm3, %%mm0    \n\t"
        ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
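    /*
     * Added note: the four 24bpp->16/15bpp loops above differ only in their
     * shift amounts and mask set.  Each iteration handles four pixels: mm0
     * gathers pixels 0 and 2, mm3 pixels 1 and 3; three shifted copies are
     * masked down to the three colour fields and OR-merged, then
     * "psllq $16" + "por" interleaves the two registers back into pixel
     * order for a single 8-byte store.
     */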
    const uint16_t *mm_end;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
        "movq       (%1), %%mm0    \n\t"
        "movq       (%1), %%mm1    \n\t"
        "movq       (%1), %%mm2    \n\t"
        "pand         %2, %%mm0    \n\t"
        "pand         %3, %%mm1    \n\t"
        "pand         %4, %%mm2    \n\t"
        "psllq        $5, %%mm0    \n\t"
        "pmulhw "MANGLE(mul15_mid)", %%mm0    \n\t"
        "pmulhw "MANGLE(mul15_mid)", %%mm1    \n\t"
        "pmulhw "MANGLE(mul15_hi)", %%mm2     \n\t"
        "movq      %%mm0, %%mm3    \n\t"
        "movq      %%mm1, %%mm4    \n\t"
        "movq      %%mm2, %%mm5    \n\t"
        "punpcklwd    %5, %%mm0    \n\t"
        "punpcklwd    %5, %%mm1    \n\t"
        "punpcklwd    %5, %%mm2    \n\t"
        "punpckhwd    %5, %%mm3    \n\t"
        "punpckhwd    %5, %%mm4    \n\t"
        "punpckhwd    %5, %%mm5    \n\t"
        "psllq        $8, %%mm1    \n\t"
        "psllq       $16, %%mm2    \n\t"
        "por       %%mm1, %%mm0    \n\t"
        "por       %%mm2, %%mm0    \n\t"
        "psllq        $8, %%mm4    \n\t"
        "psllq       $16, %%mm5    \n\t"
        "por       %%mm4, %%mm3    \n\t"
        "por       %%mm5, %%mm3    \n\t"

        "movq      %%mm0, %%mm6    \n\t"
        "movq      %%mm3, %%mm7    \n\t"

        "movq      8(%1), %%mm0    \n\t"
        "movq      8(%1), %%mm1    \n\t"
        "movq      8(%1), %%mm2    \n\t"
        "pand         %2, %%mm0    \n\t"
        "pand         %3, %%mm1    \n\t"
        "pand         %4, %%mm2    \n\t"
        "psllq        $5, %%mm0    \n\t"
        "pmulhw "MANGLE(mul15_mid)", %%mm0    \n\t"
        "pmulhw "MANGLE(mul15_mid)", %%mm1    \n\t"
        "pmulhw "MANGLE(mul15_hi)", %%mm2     \n\t"
        "movq      %%mm0, %%mm3    \n\t"
        "movq      %%mm1, %%mm4    \n\t"
        "movq      %%mm2, %%mm5    \n\t"
        "punpcklwd    %5, %%mm0    \n\t"
        "punpcklwd    %5, %%mm1    \n\t"
        "punpcklwd    %5, %%mm2    \n\t"
        "punpckhwd    %5, %%mm3    \n\t"
        "punpckhwd    %5, %%mm4    \n\t"
        "punpckhwd    %5, %%mm5    \n\t"
        "psllq        $8, %%mm1    \n\t"
        "psllq       $16, %%mm2    \n\t"
        "por       %%mm1, %%mm0    \n\t"
        "por       %%mm2, %%mm0    \n\t"
        "psllq        $8, %%mm4    \n\t"
        "psllq       $16, %%mm5    \n\t"
        "por       %%mm4, %%mm3    \n\t"
        "por       %%mm5, %%mm3    \n\t"
        : "r"(s), "m"(mask15b), "m"(mask15g), "m"(mask15r), "m"(mmx_null)
        "movq      %%mm0, %%mm4    \n\t"
        "movq      %%mm3, %%mm5    \n\t"
        "movq      %%mm6, %%mm0    \n\t"
        "movq      %%mm7, %%mm1    \n\t"

        "movq      %%mm4, %%mm6    \n\t"
        "movq      %%mm5, %%mm7    \n\t"
        "movq      %%mm0, %%mm2    \n\t"
        "movq      %%mm1, %%mm3    \n\t"
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        register uint16_t bgr;
        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
        *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
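    /*
     * Added note: the scalar tail expands a 5-bit field to 8 bits by
     * replicating its top bits into the low end: v8 = (v5<<3) | (v5>>2).
     * This maps 0 -> 0 and 31 -> 255 exactly, which a plain shift alone
     * would not (31<<3 = 248).
     */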
    const uint16_t *mm_end;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
        "movq       (%1), %%mm0    \n\t"
        "movq       (%1), %%mm1    \n\t"
        "movq       (%1), %%mm2    \n\t"
        "pand         %2, %%mm0    \n\t"
        "pand         %3, %%mm1    \n\t"
        "pand         %4, %%mm2    \n\t"
        "psllq        $5, %%mm0    \n\t"
        "psrlq        $1, %%mm2    \n\t"
        "pmulhw "MANGLE(mul15_mid)", %%mm0    \n\t"
        "pmulhw "MANGLE(mul16_mid)", %%mm1    \n\t"
        "pmulhw "MANGLE(mul15_hi)", %%mm2     \n\t"
        "movq      %%mm0, %%mm3    \n\t"
        "movq      %%mm1, %%mm4    \n\t"
        "movq      %%mm2, %%mm5    \n\t"
        "punpcklwd    %5, %%mm0    \n\t"
        "punpcklwd    %5, %%mm1    \n\t"
        "punpcklwd    %5, %%mm2    \n\t"
        "punpckhwd    %5, %%mm3    \n\t"
        "punpckhwd    %5, %%mm4    \n\t"
        "punpckhwd    %5, %%mm5    \n\t"
        "psllq        $8, %%mm1    \n\t"
        "psllq       $16, %%mm2    \n\t"
        "por       %%mm1, %%mm0    \n\t"
        "por       %%mm2, %%mm0    \n\t"
        "psllq        $8, %%mm4    \n\t"
        "psllq       $16, %%mm5    \n\t"
        "por       %%mm4, %%mm3    \n\t"
        "por       %%mm5, %%mm3    \n\t"

        "movq      %%mm0, %%mm6    \n\t"
        "movq      %%mm3, %%mm7    \n\t"

        "movq      8(%1), %%mm0    \n\t"
        "movq      8(%1), %%mm1    \n\t"
        "movq      8(%1), %%mm2    \n\t"
        "pand         %2, %%mm0    \n\t"
        "pand         %3, %%mm1    \n\t"
        "pand         %4, %%mm2    \n\t"
        "psllq        $5, %%mm0    \n\t"
        "psrlq        $1, %%mm2    \n\t"
        "pmulhw "MANGLE(mul15_mid)", %%mm0    \n\t"
        "pmulhw "MANGLE(mul16_mid)", %%mm1    \n\t"
        "pmulhw "MANGLE(mul15_hi)", %%mm2     \n\t"
        "movq      %%mm0, %%mm3    \n\t"
        "movq      %%mm1, %%mm4    \n\t"
        "movq      %%mm2, %%mm5    \n\t"
        "punpcklwd    %5, %%mm0    \n\t"
        "punpcklwd    %5, %%mm1    \n\t"
        "punpcklwd    %5, %%mm2    \n\t"
        "punpckhwd    %5, %%mm3    \n\t"
        "punpckhwd    %5, %%mm4    \n\t"
        "punpckhwd    %5, %%mm5    \n\t"
        "psllq        $8, %%mm1    \n\t"
        "psllq       $16, %%mm2    \n\t"
        "por       %%mm1, %%mm0    \n\t"
        "por       %%mm2, %%mm0    \n\t"
        "psllq        $8, %%mm4    \n\t"
        "psllq       $16, %%mm5    \n\t"
        "por       %%mm4, %%mm3    \n\t"
        "por       %%mm5, %%mm3    \n\t"
        : "r"(s), "m"(mask16b), "m"(mask16g), "m"(mask16r), "m"(mmx_null)
        "movq      %%mm0, %%mm4    \n\t"
        "movq      %%mm3, %%mm5    \n\t"
        "movq      %%mm6, %%mm0    \n\t"
        "movq      %%mm7, %%mm1    \n\t"

        "movq      %%mm4, %%mm6    \n\t"
        "movq      %%mm5, %%mm7    \n\t"
        "movq      %%mm0, %%mm2    \n\t"
        "movq      %%mm1, %%mm3    \n\t"
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        register uint16_t bgr;
        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
        *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
939 "packuswb %%mm7, %%mm0 \n\t" \
940 "packuswb %%mm7, %%mm1 \n\t" \
941 "packuswb %%mm7, %%mm2 \n\t" \
942 "punpcklbw %%mm1, %%mm0 \n\t" \
943 "punpcklbw %%mm6, %%mm2 \n\t" \
944 "movq %%mm0, %%mm3 \n\t" \
945 "punpcklwd %%mm2, %%mm0 \n\t" \
946 "punpckhwd %%mm2, %%mm3 \n\t" \
947 MOVNTQ" %%mm0, (%0) \n\t" \
948 MOVNTQ" %%mm3, 8(%0) \n\t" \
953 const uint16_t *mm_end;
955 const uint16_t *
s = (
const uint16_t *)
src;
956 end = s + src_size/2;
957 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
958 __asm__
volatile(
"pxor %%mm7,%%mm7 \n\t":::
"memory");
959 __asm__
volatile(
"pcmpeqd %%mm6,%%mm6 \n\t":::
"memory");
964 "movq (%1), %%mm0 \n\t"
965 "movq (%1), %%mm1 \n\t"
966 "movq (%1), %%mm2 \n\t"
967 "pand %2, %%mm0 \n\t"
968 "pand %3, %%mm1 \n\t"
969 "pand %4, %%mm2 \n\t"
970 "psllq $5, %%mm0 \n\t"
971 "pmulhw %5, %%mm0 \n\t"
972 "pmulhw %5, %%mm1 \n\t"
973 "pmulhw "MANGLE(mul15_hi)
", %%mm2 \n\t"
975 ::
"r"(d),
"r"(s),
"m"(mask15b),
"m"(mask15g),
"m"(mask15r) ,
"m"(mul15_mid)
981 __asm__
volatile(
SFENCE:::
"memory");
982 __asm__
volatile(
EMMS:::
"memory");
984 register uint16_t bgr;
986 *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
987 *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
988 *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
    const uint16_t *mm_end;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
    while (s < mm_end) {
        "movq       (%1), %%mm0    \n\t"
        "movq       (%1), %%mm1    \n\t"
        "movq       (%1), %%mm2    \n\t"
        "pand         %2, %%mm0    \n\t"
        "pand         %3, %%mm1    \n\t"
        "pand         %4, %%mm2    \n\t"
        "psllq        $5, %%mm0    \n\t"
        "psrlq        $1, %%mm2    \n\t"
        "pmulhw       %5, %%mm0    \n\t"
        "pmulhw "MANGLE(mul16_mid)", %%mm1    \n\t"
        "pmulhw "MANGLE(mul15_hi)", %%mm2     \n\t"
        ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid)
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        register uint16_t bgr;
        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
        *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
1046 "movq %3, %%mm7 \n\t"
1047 "pxor %4, %%mm7 \n\t"
1048 "movq %%mm7, %%mm6 \n\t"
1049 "pxor %5, %%mm7 \n\t"
1053 "movq (%1, %0), %%mm0 \n\t"
1054 "movq 8(%1, %0), %%mm1 \n\t"
1055 # if COMPILE_TEMPLATE_MMXEXT
1056 "pshufw $177, %%mm0, %%mm3 \n\t"
1057 "pshufw $177, %%mm1, %%mm5 \n\t"
1058 "pand %%mm7, %%mm0 \n\t"
1059 "pand %%mm6, %%mm3 \n\t"
1060 "pand %%mm7, %%mm1 \n\t"
1061 "pand %%mm6, %%mm5 \n\t"
1062 "por %%mm3, %%mm0 \n\t"
1063 "por %%mm5, %%mm1 \n\t"
1065 "movq %%mm0, %%mm2 \n\t"
1066 "movq %%mm1, %%mm4 \n\t"
1067 "pand %%mm7, %%mm0 \n\t"
1068 "pand %%mm6, %%mm2 \n\t"
1069 "pand %%mm7, %%mm1 \n\t"
1070 "pand %%mm6, %%mm4 \n\t"
1071 "movq %%mm2, %%mm3 \n\t"
1072 "movq %%mm4, %%mm5 \n\t"
1073 "pslld $16, %%mm2 \n\t"
1074 "psrld $16, %%mm3 \n\t"
1075 "pslld $16, %%mm4 \n\t"
1076 "psrld $16, %%mm5 \n\t"
1077 "por %%mm2, %%mm0 \n\t"
1078 "por %%mm4, %%mm1 \n\t"
1079 "por %%mm3, %%mm0 \n\t"
1080 "por %%mm5, %%mm1 \n\t"
1082 MOVNTQ" %%mm0, (%2, %0) \n\t"
1083 MOVNTQ" %%mm1, 8(%2, %0) \n\t"
1090 :
"r" (s),
"r" (d),
"m" (mask32b),
"m" (mask32r),
"m" (mmx_one)
1092 for (; idx<15; idx+=4) {
1093 register unsigned v = *(
const uint32_t *)&s[idx],
g = v & 0xff00ff00;
1095 *(uint32_t *)&d[idx] = (v>>16) +
g + (v<<16);
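    /*
     * Added note: "2103" names the byte permutation applied to each 32-bit
     * pixel (bytes 0 and 2 swapped, i.e. R and B exchanged).  The scalar
     * tail keeps bytes 1 and 3 in g, masks v down to bytes 0 and 2, and
     * swaps those two with the shifts; the three terms never overlap, so
     * '+' acts as a bitwise OR.
     */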
    x86_reg mmx_size= 23 - src_size;
        "test             %%"REG_a", %%"REG_a"  \n\t"
        "movq "MANGLE(mask24r)", %%mm5          \n\t"
        "movq "MANGLE(mask24g)", %%mm6          \n\t"
        "movq "MANGLE(mask24b)", %%mm7          \n\t"
        "movq    (%1, %%"REG_a"), %%mm0         \n\t"
        "movq    (%1, %%"REG_a"), %%mm1         \n\t"
        "movq   2(%1, %%"REG_a"), %%mm2         \n\t"
        "psllq        $16, %%mm0                \n\t"
        "pand       %%mm5, %%mm0                \n\t"
        "pand       %%mm6, %%mm1                \n\t"
        "pand       %%mm7, %%mm2                \n\t"
        "por        %%mm0, %%mm1                \n\t"
        "por        %%mm2, %%mm1                \n\t"
        "movq   6(%1, %%"REG_a"), %%mm0         \n\t"
        MOVNTQ"     %%mm1,   (%2, %%"REG_a")    \n\t"
        "movq   8(%1, %%"REG_a"), %%mm1         \n\t"
        "movq  10(%1, %%"REG_a"), %%mm2         \n\t"
        "pand       %%mm7, %%mm0                \n\t"
        "pand       %%mm5, %%mm1                \n\t"
        "pand       %%mm6, %%mm2                \n\t"
        "por        %%mm0, %%mm1                \n\t"
        "por        %%mm2, %%mm1                \n\t"
        "movq  14(%1, %%"REG_a"), %%mm0         \n\t"
        MOVNTQ"     %%mm1,  8(%2, %%"REG_a")    \n\t"
        "movq  16(%1, %%"REG_a"), %%mm1         \n\t"
        "movq  18(%1, %%"REG_a"), %%mm2         \n\t"
        "pand       %%mm6, %%mm0                \n\t"
        "pand       %%mm7, %%mm1                \n\t"
        "pand       %%mm5, %%mm2                \n\t"
        "por        %%mm0, %%mm1                \n\t"
        "por        %%mm2, %%mm1                \n\t"
        MOVNTQ"     %%mm1, 16(%2, %%"REG_a")    \n\t"
        "add          $24, %%"REG_a"            \n\t"
        : "r" (src-mmx_size), "r"(dst-mmx_size)
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");

    if (mmx_size==23) return;

    src_size= 23-mmx_size;
    for (i=0; i<src_size; i+=3) {
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
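    /*
     * Added note: the MMX loop above swaps R and B in packed 24bpp data by
     * ANDing each 8-byte group with three channel masks (mask24r/g/b).
     * Because a pixel is 3 bytes, each mask's channel role advances by one
     * every 8 bytes, which is why the pand assignments rotate across the
     * three stores before the pattern repeats at its 24-byte period.
     */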
                                    int lumStride, int chromStride, int dstStride,
                                    int vertLumPerChroma)
    for (y=0; y<height; y++) {
        "xor %%"REG_a", %%"REG_a"           \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2)  \n\t"
            "movq    (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
            "movq             %%mm0, %%mm2  \n\t" // U(0)
            "movq    (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw        %%mm1, %%mm0  \n\t" // UVUV UVUV(0)
            "punpckhbw        %%mm1, %%mm2  \n\t" // UVUV UVUV(8)

            "movq  (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
            "movq             %%mm3, %%mm4  \n\t" // Y(0)
            "movq             %%mm5, %%mm6  \n\t" // Y(8)
            "punpcklbw        %%mm0, %%mm3  \n\t" // YUYV YUYV(0)
            "punpckhbw        %%mm0, %%mm4  \n\t" // YUYV YUYV(4)
            "punpcklbw        %%mm2, %%mm5  \n\t" // YUYV YUYV(8)
            "punpckhbw        %%mm2, %%mm6  \n\t" // YUYV YUYV(12)

            MOVNTQ"           %%mm3,   (%0, %%"REG_a", 4)   \n\t"
            MOVNTQ"           %%mm4,  8(%0, %%"REG_a", 4)   \n\t"
            MOVNTQ"           %%mm5, 16(%0, %%"REG_a", 4)   \n\t"
            MOVNTQ"           %%mm6, 24(%0, %%"REG_a", 4)   \n\t"

            "add                 $8, %%"REG_a"  \n\t"
            "cmp                 %4, %%"REG_a"  \n\t"
            :: "r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
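        /*
         * Added note: this is the classic planar->packed interleave.
         * punpcklbw/punpckhbw first zip U with V into a UVUV... stream,
         * then zip Y with that stream into YUYV..., producing 32 output
         * bytes per 16 luma bytes.  vertLumPerChroma (2 for YV12 input,
         * 1 for 4:2:2 input) controls how many luma lines share one
         * chroma line, so the same code serves both subsamplings.
         */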
                               int lumStride, int chromStride, int dstStride)
                                    int lumStride, int chromStride, int dstStride,
                                    int vertLumPerChroma)
    for (y=0; y<height; y++) {
        "xor %%"REG_a", %%"REG_a"           \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2)  \n\t"
            "movq    (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
            "movq             %%mm0, %%mm2  \n\t" // U(0)
            "movq    (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw        %%mm1, %%mm0  \n\t" // UVUV UVUV(0)
            "punpckhbw        %%mm1, %%mm2  \n\t" // UVUV UVUV(8)

            "movq  (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
            "movq             %%mm0, %%mm4  \n\t" // UVUV UVUV(0)
            "movq             %%mm2, %%mm6  \n\t" // UVUV UVUV(8)
            "punpcklbw        %%mm3, %%mm0  \n\t" // UYVY UYVY(0)
            "punpckhbw        %%mm3, %%mm4  \n\t" // UYVY UYVY(4)
            "punpcklbw        %%mm5, %%mm2  \n\t" // UYVY UYVY(8)
            "punpckhbw        %%mm5, %%mm6  \n\t" // UYVY UYVY(12)

            MOVNTQ"           %%mm0,   (%0, %%"REG_a", 4)   \n\t"
            MOVNTQ"           %%mm4,  8(%0, %%"REG_a", 4)   \n\t"
            MOVNTQ"           %%mm2, 16(%0, %%"REG_a", 4)   \n\t"
            MOVNTQ"           %%mm6, 24(%0, %%"REG_a", 4)   \n\t"

            "add                 $8, %%"REG_a"  \n\t"
            "cmp                 %4, %%"REG_a"  \n\t"
            :: "r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
                              int lumStride, int chromStride, int dstStride)
                                 int lumStride, int chromStride, int dstStride)
                                 int lumStride, int chromStride, int dstStride)
                                 int lumStride, int chromStride, int srcStride)
    for (y=0; y<height; y+=2) {
        "xor             %%"REG_a", %%"REG_a"   \n\t"
        "pcmpeqw            %%mm7, %%mm7        \n\t"
        "psrlw                 $8, %%mm7        \n\t" // FF,00,FF,00...
            PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
            "movq    (%0, %%"REG_a", 4), %%mm0      \n\t" // YUYV YUYV(0)
            "movq   8(%0, %%"REG_a", 4), %%mm1      \n\t" // YUYV YUYV(4)
            "movq                %%mm0, %%mm2       \n\t"
            "movq                %%mm1, %%mm3       \n\t"
            "psrlw                  $8, %%mm0       \n\t" // U0V0 U0V0(0)
            "psrlw                  $8, %%mm1       \n\t" // U0V0 U0V0(4)
            "pand                %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(0)
            "pand                %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(4)
            "packuswb            %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
            "packuswb            %%mm3, %%mm2       \n\t" // YYYY YYYY(0)

            MOVNTQ"              %%mm2,  (%1, %%"REG_a", 2) \n\t"

            "movq  16(%0, %%"REG_a", 4), %%mm1      \n\t" // YUYV YUYV(8)
            "movq  24(%0, %%"REG_a", 4), %%mm2      \n\t" // YUYV YUYV(12)
            "movq                %%mm1, %%mm3       \n\t"
            "movq                %%mm2, %%mm4       \n\t"
            "psrlw                  $8, %%mm1       \n\t"
            "psrlw                  $8, %%mm2       \n\t"
            "pand                %%mm7, %%mm3       \n\t"
            "pand                %%mm7, %%mm4       \n\t"
            "packuswb            %%mm2, %%mm1       \n\t" // UVUV UVUV(8)
            "packuswb            %%mm4, %%mm3       \n\t" // YYYY YYYY(8)

            MOVNTQ"              %%mm3, 8(%1, %%"REG_a", 2) \n\t"

            "movq                %%mm0, %%mm2       \n\t"
            "movq                %%mm1, %%mm3       \n\t"
            "psrlw                  $8, %%mm0       \n\t" // V0V0 V0V0(0)
            "psrlw                  $8, %%mm1       \n\t" // V0V0 V0V0(8)
            "pand                %%mm7, %%mm2       \n\t" // U0U0 U0U0(0)
            "pand                %%mm7, %%mm3       \n\t" // U0U0 U0U0(8)
            "packuswb            %%mm1, %%mm0       \n\t" // VVVV VVVV(0)
            "packuswb            %%mm3, %%mm2       \n\t" // UUUU UUUU(0)

            MOVNTQ"              %%mm0, (%3, %%"REG_a")     \n\t"
            MOVNTQ"              %%mm2, (%2, %%"REG_a")     \n\t"

            "add                    $8, %%"REG_a"   \n\t"
            "cmp                    %4, %%"REG_a"   \n\t"
            :: "r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        "xor %%"REG_a", %%"REG_a"               \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
            "movq    (%0, %%"REG_a", 4), %%mm0      \n\t"
            "movq   8(%0, %%"REG_a", 4), %%mm1      \n\t"
            "movq  16(%0, %%"REG_a", 4), %%mm2      \n\t"
            "movq  24(%0, %%"REG_a", 4), %%mm3      \n\t"
            "pand                %%mm7, %%mm0       \n\t"
            "pand                %%mm7, %%mm1       \n\t"
            "pand                %%mm7, %%mm2       \n\t"
            "pand                %%mm7, %%mm3       \n\t"
            "packuswb            %%mm1, %%mm0       \n\t"
            "packuswb            %%mm3, %%mm2       \n\t"

            MOVNTQ"              %%mm0,  (%1, %%"REG_a", 2) \n\t"
            MOVNTQ"              %%mm2, 8(%1, %%"REG_a", 2) \n\t"

            "add                    $8, %%"REG_a"   \n\t"
            "cmp                    %4, %%"REG_a"   \n\t"
            :: "r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        udst += chromStride;
        vdst += chromStride;
    __asm__ volatile(EMMS"       \n\t"
                     SFENCE"     \n\t"
                     :::"memory");
#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

    for (y=1; y<srcHeight; y++) {
        const x86_reg mmxSize= srcWidth&~15;
            "mov                     %4, %%"REG_a"  \n\t"
            "movq "MANGLE(mmx_ff)", %%mm0           \n\t"
            "movq      (%0, %%"REG_a"), %%mm4       \n\t"
            "movq             %%mm4, %%mm2          \n\t"
            "psllq               $8, %%mm4          \n\t"
            "pand             %%mm0, %%mm2          \n\t"
            "por              %%mm2, %%mm4          \n\t"
            "movq      (%1, %%"REG_a"), %%mm5       \n\t"
            "movq             %%mm5, %%mm3          \n\t"
            "psllq               $8, %%mm5          \n\t"
            "pand             %%mm0, %%mm3          \n\t"
            "por              %%mm3, %%mm5          \n\t"
            "movq      (%0, %%"REG_a"), %%mm0       \n\t"
            "movq      (%1, %%"REG_a"), %%mm1       \n\t"
            "movq     1(%0, %%"REG_a"), %%mm2       \n\t"
            "movq     1(%1, %%"REG_a"), %%mm3       \n\t"
            PAVGB"            %%mm0, %%mm5          \n\t"
            PAVGB"            %%mm0, %%mm3          \n\t"
            PAVGB"            %%mm0, %%mm5          \n\t"
            PAVGB"            %%mm0, %%mm3          \n\t"
            PAVGB"            %%mm1, %%mm4          \n\t"
            PAVGB"            %%mm1, %%mm2          \n\t"
            PAVGB"            %%mm1, %%mm4          \n\t"
            PAVGB"            %%mm1, %%mm2          \n\t"
            "movq             %%mm5, %%mm7          \n\t"
            "movq             %%mm4, %%mm6          \n\t"
            "punpcklbw        %%mm3, %%mm5          \n\t"
            "punpckhbw        %%mm3, %%mm7          \n\t"
            "punpcklbw        %%mm2, %%mm4          \n\t"
            "punpckhbw        %%mm2, %%mm6          \n\t"
            MOVNTQ"           %%mm5,  (%2, %%"REG_a", 2)    \n\t"
            MOVNTQ"           %%mm7, 8(%2, %%"REG_a", 2)    \n\t"
            MOVNTQ"           %%mm4,  (%3, %%"REG_a", 2)    \n\t"
            MOVNTQ"           %%mm6, 8(%3, %%"REG_a", 2)    \n\t"
            "add                 $8, %%"REG_a"      \n\t"
            "movq    -1(%0, %%"REG_a"), %%mm4       \n\t"
            "movq    -1(%1, %%"REG_a"), %%mm5       \n\t"
            :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
               "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
        for (x=mmxSize-1; x<srcWidth-1; x++) {
            dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
            dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
        }
        dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
        dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

    __asm__ volatile(EMMS"       \n\t"
                     SFENCE"     \n\t"
                     :::"memory");
#if !COMPILE_TEMPLATE_AMD3DNOW
                              int lumStride, int chromStride, int srcStride)
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y+=2) {
        "xor             %%"REG_a", %%"REG_a"   \n\t"
        "pcmpeqw            %%mm7, %%mm7        \n\t"
        "psrlw                 $8, %%mm7        \n\t" // FF,00,FF,00...
            PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
            "movq    (%0, %%"REG_a", 4), %%mm0      \n\t" // UYVY UYVY(0)
            "movq   8(%0, %%"REG_a", 4), %%mm1      \n\t" // UYVY UYVY(4)
            "movq                %%mm0, %%mm2       \n\t"
            "movq                %%mm1, %%mm3       \n\t"
            "pand                %%mm7, %%mm0       \n\t" // U0V0 U0V0(0)
            "pand                %%mm7, %%mm1       \n\t" // U0V0 U0V0(4)
            "psrlw                  $8, %%mm2       \n\t" // Y0Y0 Y0Y0(0)
            "psrlw                  $8, %%mm3       \n\t" // Y0Y0 Y0Y0(4)
            "packuswb            %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
            "packuswb            %%mm3, %%mm2       \n\t" // YYYY YYYY(0)

            MOVNTQ"              %%mm2,  (%1, %%"REG_a", 2) \n\t"

            "movq  16(%0, %%"REG_a", 4), %%mm1      \n\t"
            "movq  24(%0, %%"REG_a", 4), %%mm2      \n\t"
            "movq                %%mm1, %%mm3       \n\t"
            "movq                %%mm2, %%mm4       \n\t"
            "pand                %%mm7, %%mm1       \n\t"
            "pand                %%mm7, %%mm2       \n\t"
            "psrlw                  $8, %%mm3       \n\t"
            "psrlw                  $8, %%mm4       \n\t"
            "packuswb            %%mm2, %%mm1       \n\t"
            "packuswb            %%mm4, %%mm3       \n\t"

            MOVNTQ"              %%mm3, 8(%1, %%"REG_a", 2) \n\t"

            "movq                %%mm0, %%mm2       \n\t"
            "movq                %%mm1, %%mm3       \n\t"
            "psrlw                  $8, %%mm0       \n\t"
            "psrlw                  $8, %%mm1       \n\t"
            "pand                %%mm7, %%mm2       \n\t"
            "pand                %%mm7, %%mm3       \n\t"
            "packuswb            %%mm1, %%mm0       \n\t"
            "packuswb            %%mm3, %%mm2       \n\t"

            MOVNTQ"              %%mm0, (%3, %%"REG_a")     \n\t"
            MOVNTQ"              %%mm2, (%2, %%"REG_a")     \n\t"

            "add                    $8, %%"REG_a"   \n\t"
            "cmp                    %4, %%"REG_a"   \n\t"
            :: "r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        "xor %%"REG_a", %%"REG_a"               \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
            "movq    (%0, %%"REG_a", 4), %%mm0      \n\t"
            "movq   8(%0, %%"REG_a", 4), %%mm1      \n\t"
            "movq  16(%0, %%"REG_a", 4), %%mm2      \n\t"
            "movq  24(%0, %%"REG_a", 4), %%mm3      \n\t"
            "psrlw                  $8, %%mm0       \n\t"
            "psrlw                  $8, %%mm1       \n\t"
            "psrlw                  $8, %%mm2       \n\t"
            "psrlw                  $8, %%mm3       \n\t"
            "packuswb            %%mm1, %%mm0       \n\t"
            "packuswb            %%mm3, %%mm2       \n\t"

            MOVNTQ"              %%mm0,  (%1, %%"REG_a", 2) \n\t"
            MOVNTQ"              %%mm2, 8(%1, %%"REG_a", 2) \n\t"

            "add                    $8, %%"REG_a"   \n\t"
            "cmp                    %4, %%"REG_a"   \n\t"
            :: "r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        udst += chromStride;
        vdst += chromStride;
    __asm__ volatile(EMMS"       \n\t"
                     SFENCE"     \n\t"
                     :::"memory");
                                       int lumStride, int chromStride, int srcStride,
#define BGR2Y_IDX "16*4+16*32"
#define BGR2U_IDX "16*4+16*33"
#define BGR2V_IDX "16*4+16*34"
    const x86_reg chromWidth= width>>1;
        ydst += 2*lumStride;
        udst += chromStride;
        vdst += chromStride;
    for (y=0; y<height-2; y+=2) {
        for (i=0; i<2; i++) {
                "mov                        %2, %%"REG_a"   \n\t"
                "movq          "BGR2Y_IDX"(%3), %%mm6       \n\t"
                "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
                "pxor                    %%mm7, %%mm7       \n\t"
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
                "movd          (%0, %%"REG_d"), %%mm0       \n\t"
                "movd         3(%0, %%"REG_d"), %%mm1       \n\t"
                "punpcklbw               %%mm7, %%mm0       \n\t"
                "punpcklbw               %%mm7, %%mm1       \n\t"
                "movd         6(%0, %%"REG_d"), %%mm2       \n\t"
                "movd         9(%0, %%"REG_d"), %%mm3       \n\t"
                "punpcklbw               %%mm7, %%mm2       \n\t"
                "punpcklbw               %%mm7, %%mm3       \n\t"
                "pmaddwd                 %%mm6, %%mm0       \n\t"
                "pmaddwd                 %%mm6, %%mm1       \n\t"
                "pmaddwd                 %%mm6, %%mm2       \n\t"
                "pmaddwd                 %%mm6, %%mm3       \n\t"
                "psrad                      $8, %%mm0       \n\t"
                "psrad                      $8, %%mm1       \n\t"
                "psrad                      $8, %%mm2       \n\t"
                "psrad                      $8, %%mm3       \n\t"
                "packssdw                %%mm1, %%mm0       \n\t"
                "packssdw                %%mm3, %%mm2       \n\t"
                "pmaddwd                 %%mm5, %%mm0       \n\t"
                "pmaddwd                 %%mm5, %%mm2       \n\t"
                "packssdw                %%mm2, %%mm0       \n\t"
                "psraw                      $7, %%mm0       \n\t"

                "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
                "movd        15(%0, %%"REG_d"), %%mm1       \n\t"
                "punpcklbw               %%mm7, %%mm4       \n\t"
                "punpcklbw               %%mm7, %%mm1       \n\t"
                "movd        18(%0, %%"REG_d"), %%mm2       \n\t"
                "movd        21(%0, %%"REG_d"), %%mm3       \n\t"
                "punpcklbw               %%mm7, %%mm2       \n\t"
                "punpcklbw               %%mm7, %%mm3       \n\t"
                "pmaddwd                 %%mm6, %%mm4       \n\t"
                "pmaddwd                 %%mm6, %%mm1       \n\t"
                "pmaddwd                 %%mm6, %%mm2       \n\t"
                "pmaddwd                 %%mm6, %%mm3       \n\t"
                "psrad                      $8, %%mm4       \n\t"
                "psrad                      $8, %%mm1       \n\t"
                "psrad                      $8, %%mm2       \n\t"
                "psrad                      $8, %%mm3       \n\t"
                "packssdw                %%mm1, %%mm4       \n\t"
                "packssdw                %%mm3, %%mm2       \n\t"
                "pmaddwd                 %%mm5, %%mm4       \n\t"
                "pmaddwd                 %%mm5, %%mm2       \n\t"
                "add                       $24, %%"REG_d"   \n\t"
                "packssdw                %%mm2, %%mm4       \n\t"
                "psraw                      $7, %%mm4       \n\t"

                "packuswb                %%mm4, %%mm0       \n\t"
                "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0    \n\t"

                MOVNTQ"                  %%mm0, (%1, %%"REG_a") \n\t"
                "add                        $8, %%"REG_a"   \n\t"
                : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width), "r"(rgb2yuv)
                : "%"REG_a, "%"REG_d
1721 "mov %4, %%"REG_a
" \n\t"
1722 "movq "MANGLE(ff_w1111)
", %%mm5 \n\t"
1723 "movq "BGR2U_IDX
"(%5), %%mm6 \n\t"
1724 "pxor %%mm7, %%mm7 \n\t"
1725 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_d
" \n\t"
1726 "add %%"REG_d
", %%"REG_d
" \n\t"
1731 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
1732 "movq (%0, %%"REG_d
"), %%mm0 \n\t"
1733 "movq (%1, %%"REG_d
"), %%mm1 \n\t"
1734 "movq 6(%0, %%"REG_d
"), %%mm2 \n\t"
1735 "movq 6(%1, %%"REG_d
"), %%mm3 \n\t"
1736 PAVGB" %%mm1, %%mm0 \n\t"
1737 PAVGB" %%mm3, %%mm2 \n\t"
1738 "movq %%mm0, %%mm1 \n\t"
1739 "movq %%mm2, %%mm3 \n\t"
1740 "psrlq $24, %%mm0 \n\t"
1741 "psrlq $24, %%mm2 \n\t"
1742 PAVGB" %%mm1, %%mm0 \n\t"
1743 PAVGB" %%mm3, %%mm2 \n\t"
1744 "punpcklbw %%mm7, %%mm0 \n\t"
1745 "punpcklbw %%mm7, %%mm2 \n\t"
1747 "movd (%0, %%"REG_d
"), %%mm0 \n\t"
1748 "movd (%1, %%"REG_d
"), %%mm1 \n\t"
1749 "movd 3(%0, %%"REG_d
"), %%mm2 \n\t"
1750 "movd 3(%1, %%"REG_d
"), %%mm3 \n\t"
1751 "punpcklbw %%mm7, %%mm0 \n\t"
1752 "punpcklbw %%mm7, %%mm1 \n\t"
1753 "punpcklbw %%mm7, %%mm2 \n\t"
1754 "punpcklbw %%mm7, %%mm3 \n\t"
1755 "paddw %%mm1, %%mm0 \n\t"
1756 "paddw %%mm3, %%mm2 \n\t"
1757 "paddw %%mm2, %%mm0 \n\t"
1758 "movd 6(%0, %%"REG_d
"), %%mm4 \n\t"
1759 "movd 6(%1, %%"REG_d
"), %%mm1 \n\t"
1760 "movd 9(%0, %%"REG_d
"), %%mm2 \n\t"
1761 "movd 9(%1, %%"REG_d
"), %%mm3 \n\t"
1762 "punpcklbw %%mm7, %%mm4 \n\t"
1763 "punpcklbw %%mm7, %%mm1 \n\t"
1764 "punpcklbw %%mm7, %%mm2 \n\t"
1765 "punpcklbw %%mm7, %%mm3 \n\t"
1766 "paddw %%mm1, %%mm4 \n\t"
1767 "paddw %%mm3, %%mm2 \n\t"
1768 "paddw %%mm4, %%mm2 \n\t"
1769 "psrlw $2, %%mm0 \n\t"
1770 "psrlw $2, %%mm2 \n\t"
1772 "movq "BGR2V_IDX
"(%5), %%mm1 \n\t"
1773 "movq "BGR2V_IDX
"(%5), %%mm3 \n\t"
1775 "pmaddwd %%mm0, %%mm1 \n\t"
1776 "pmaddwd %%mm2, %%mm3 \n\t"
1777 "pmaddwd %%mm6, %%mm0 \n\t"
1778 "pmaddwd %%mm6, %%mm2 \n\t"
1779 "psrad $8, %%mm0 \n\t"
1780 "psrad $8, %%mm1 \n\t"
1781 "psrad $8, %%mm2 \n\t"
1782 "psrad $8, %%mm3 \n\t"
1783 "packssdw %%mm2, %%mm0 \n\t"
1784 "packssdw %%mm3, %%mm1 \n\t"
1785 "pmaddwd %%mm5, %%mm0 \n\t"
1786 "pmaddwd %%mm5, %%mm1 \n\t"
1787 "packssdw %%mm1, %%mm0 \n\t"
1788 "psraw $7, %%mm0 \n\t"
1790 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
1791 "movq 12(%0, %%"REG_d
"), %%mm4 \n\t"
1792 "movq 12(%1, %%"REG_d
"), %%mm1 \n\t"
1793 "movq 18(%0, %%"REG_d
"), %%mm2 \n\t"
1794 "movq 18(%1, %%"REG_d
"), %%mm3 \n\t"
1795 PAVGB" %%mm1, %%mm4 \n\t"
1796 PAVGB" %%mm3, %%mm2 \n\t"
1797 "movq %%mm4, %%mm1 \n\t"
1798 "movq %%mm2, %%mm3 \n\t"
1799 "psrlq $24, %%mm4 \n\t"
1800 "psrlq $24, %%mm2 \n\t"
1801 PAVGB" %%mm1, %%mm4 \n\t"
1802 PAVGB" %%mm3, %%mm2 \n\t"
1803 "punpcklbw %%mm7, %%mm4 \n\t"
1804 "punpcklbw %%mm7, %%mm2 \n\t"
1806 "movd 12(%0, %%"REG_d
"), %%mm4 \n\t"
1807 "movd 12(%1, %%"REG_d
"), %%mm1 \n\t"
1808 "movd 15(%0, %%"REG_d
"), %%mm2 \n\t"
1809 "movd 15(%1, %%"REG_d
"), %%mm3 \n\t"
1810 "punpcklbw %%mm7, %%mm4 \n\t"
1811 "punpcklbw %%mm7, %%mm1 \n\t"
1812 "punpcklbw %%mm7, %%mm2 \n\t"
1813 "punpcklbw %%mm7, %%mm3 \n\t"
1814 "paddw %%mm1, %%mm4 \n\t"
1815 "paddw %%mm3, %%mm2 \n\t"
1816 "paddw %%mm2, %%mm4 \n\t"
1817 "movd 18(%0, %%"REG_d
"), %%mm5 \n\t"
1818 "movd 18(%1, %%"REG_d
"), %%mm1 \n\t"
1819 "movd 21(%0, %%"REG_d
"), %%mm2 \n\t"
1820 "movd 21(%1, %%"REG_d
"), %%mm3 \n\t"
1821 "punpcklbw %%mm7, %%mm5 \n\t"
1822 "punpcklbw %%mm7, %%mm1 \n\t"
1823 "punpcklbw %%mm7, %%mm2 \n\t"
1824 "punpcklbw %%mm7, %%mm3 \n\t"
1825 "paddw %%mm1, %%mm5 \n\t"
1826 "paddw %%mm3, %%mm2 \n\t"
1827 "paddw %%mm5, %%mm2 \n\t"
1828 "movq "MANGLE(ff_w1111)
", %%mm5 \n\t"
1829 "psrlw $2, %%mm4 \n\t"
1830 "psrlw $2, %%mm2 \n\t"
1832 "movq "BGR2V_IDX
"(%5), %%mm1 \n\t"
1833 "movq "BGR2V_IDX
"(%5), %%mm3 \n\t"
1835 "pmaddwd %%mm4, %%mm1 \n\t"
1836 "pmaddwd %%mm2, %%mm3 \n\t"
1837 "pmaddwd %%mm6, %%mm4 \n\t"
1838 "pmaddwd %%mm6, %%mm2 \n\t"
1839 "psrad $8, %%mm4 \n\t"
1840 "psrad $8, %%mm1 \n\t"
1841 "psrad $8, %%mm2 \n\t"
1842 "psrad $8, %%mm3 \n\t"
1843 "packssdw %%mm2, %%mm4 \n\t"
1844 "packssdw %%mm3, %%mm1 \n\t"
1845 "pmaddwd %%mm5, %%mm4 \n\t"
1846 "pmaddwd %%mm5, %%mm1 \n\t"
1847 "add $24, %%"REG_d
" \n\t"
1848 "packssdw %%mm1, %%mm4 \n\t"
1849 "psraw $7, %%mm4 \n\t"
1851 "movq %%mm0, %%mm1 \n\t"
1852 "punpckldq %%mm4, %%mm0 \n\t"
1853 "punpckhdq %%mm4, %%mm1 \n\t"
1854 "packsswb %%mm1, %%mm0 \n\t"
1855 "paddb "MANGLE(ff_bgr2UVOffset)
", %%mm0 \n\t"
1856 "movd %%mm0, (%2, %%"REG_a
") \n\t"
1857 "punpckhdq %%mm0, %%mm0 \n\t"
1858 "movd %%mm0, (%3, %%"REG_a
") \n\t"
1859 "add $4, %%"REG_a
" \n\t"
1861 : :
"r" (src+chromWidth*6),
"r" (src+srcStride+chromWidth*6),
"r" (udst+chromWidth),
"r" (vdst+chromWidth),
"g" (-chromWidth),
"r"(
rgb2yuv)
1863 :
"%"REG_a,
"%"REG_d
        udst += chromStride;
        vdst += chromStride;
    __asm__ volatile(EMMS"       \n\t"
                     SFENCE"     \n\t"
                     :::"memory");
#if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX
                            int src2Stride, int dstStride)
    for (h=0; h < height; h++) {
#if COMPILE_TEMPLATE_SSE2
        "xor %%"REG_a", %%"REG_a"               \n\t"
            "movdqa (%1, %%"REG_a"), %%xmm0     \n\t"
            "movdqa (%1, %%"REG_a"), %%xmm1     \n\t"
            "movdqa (%2, %%"REG_a"), %%xmm2     \n\t"
            "punpcklbw       %%xmm2, %%xmm0     \n\t"
            "punpckhbw       %%xmm2, %%xmm1     \n\t"
            "movntdq         %%xmm0,   (%0, %%"REG_a", 2)   \n\t"
            "movntdq         %%xmm1, 16(%0, %%"REG_a", 2)   \n\t"
            "add                $16, %%"REG_a"  \n\t"
            "cmp                 %3, %%"REG_a"  \n\t"
            :: "r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
            : "memory", XMM_CLOBBERS("xmm0", "xmm1", "xmm2",) "%"REG_a
#else
        "xor %%"REG_a", %%"REG_a"               \n\t"
            "movq  (%1, %%"REG_a"), %%mm0       \n\t"
            "movq 8(%1, %%"REG_a"), %%mm2       \n\t"
            "movq            %%mm0, %%mm1       \n\t"
            "movq            %%mm2, %%mm3       \n\t"
            "movq  (%2, %%"REG_a"), %%mm4       \n\t"
            "movq 8(%2, %%"REG_a"), %%mm5       \n\t"
            "punpcklbw       %%mm4, %%mm0       \n\t"
            "punpckhbw       %%mm4, %%mm1       \n\t"
            "punpcklbw       %%mm5, %%mm2       \n\t"
            "punpckhbw       %%mm5, %%mm3       \n\t"
            MOVNTQ"          %%mm0,   (%0, %%"REG_a", 2)    \n\t"
            MOVNTQ"          %%mm1,  8(%0, %%"REG_a", 2)    \n\t"
            MOVNTQ"          %%mm2, 16(%0, %%"REG_a", 2)    \n\t"
            MOVNTQ"          %%mm3, 24(%0, %%"REG_a", 2)    \n\t"
            "add               $16, %%"REG_a"   \n\t"
            "cmp                %3, %%"REG_a"   \n\t"
            :: "r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
            : "memory", "%"REG_a
#endif
        for (w= (width&(~15)); w < width; w++) {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
#if !COMPILE_TEMPLATE_SSE2
#if !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL
#if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM
                            int dst1Stride, int dst2Stride)
    for (h = 0; h < height; h++) {
#if !COMPILE_TEMPLATE_SSE2
#if !COMPILE_TEMPLATE_SSE2
#if !COMPILE_TEMPLATE_AMD3DNOW
                                 int srcStride1, int srcStride2,
                                 int dstStride1, int dstStride2)
    w=width/2; h=height/2;
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
        const uint8_t* s1=src1+srcStride1*(y>>1);
        for (;x<w-31;x+=32) {
            "movq    (%1,%2), %%mm0     \n\t"
            "movq   8(%1,%2), %%mm2     \n\t"
            "movq  16(%1,%2), %%mm4     \n\t"
            "movq  24(%1,%2), %%mm6     \n\t"
            "movq      %%mm0, %%mm1     \n\t"
            "movq      %%mm2, %%mm3     \n\t"
            "movq      %%mm4, %%mm5     \n\t"
            "movq      %%mm6, %%mm7     \n\t"
            "punpcklbw %%mm0, %%mm0     \n\t"
            "punpckhbw %%mm1, %%mm1     \n\t"
            "punpcklbw %%mm2, %%mm2     \n\t"
            "punpckhbw %%mm3, %%mm3     \n\t"
            "punpcklbw %%mm4, %%mm4     \n\t"
            "punpckhbw %%mm5, %%mm5     \n\t"
            "punpcklbw %%mm6, %%mm6     \n\t"
            "punpckhbw %%mm7, %%mm7     \n\t"
            MOVNTQ"    %%mm0,   (%0,%2,2)   \n\t"
            MOVNTQ"    %%mm1,  8(%0,%2,2)   \n\t"
            MOVNTQ"    %%mm2, 16(%0,%2,2)   \n\t"
            MOVNTQ"    %%mm3, 24(%0,%2,2)   \n\t"
            MOVNTQ"    %%mm4, 32(%0,%2,2)   \n\t"
            MOVNTQ"    %%mm5, 40(%0,%2,2)   \n\t"
            MOVNTQ"    %%mm6, 48(%0,%2,2)   \n\t"
            MOVNTQ"    %%mm7, 56(%0,%2,2)"
            :: "r"(d), "r"(s1), "r"(x)
        for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
        const uint8_t* s2=src2+srcStride2*(y>>1);
        for (;x<w-31;x+=32) {
            "movq    (%1,%2), %%mm0     \n\t"
            "movq   8(%1,%2), %%mm2     \n\t"
            "movq  16(%1,%2), %%mm4     \n\t"
            "movq  24(%1,%2), %%mm6     \n\t"
            "movq      %%mm0, %%mm1     \n\t"
            "movq      %%mm2, %%mm3     \n\t"
            "movq      %%mm4, %%mm5     \n\t"
            "movq      %%mm6, %%mm7     \n\t"
            "punpcklbw %%mm0, %%mm0     \n\t"
            "punpckhbw %%mm1, %%mm1     \n\t"
            "punpcklbw %%mm2, %%mm2     \n\t"
            "punpckhbw %%mm3, %%mm3     \n\t"
            "punpcklbw %%mm4, %%mm4     \n\t"
            "punpckhbw %%mm5, %%mm5     \n\t"
            "punpcklbw %%mm6, %%mm6     \n\t"
            "punpckhbw %%mm7, %%mm7     \n\t"
            MOVNTQ"    %%mm0,   (%0,%2,2)   \n\t"
            MOVNTQ"    %%mm1,  8(%0,%2,2)   \n\t"
            MOVNTQ"    %%mm2, 16(%0,%2,2)   \n\t"
            MOVNTQ"    %%mm3, 24(%0,%2,2)   \n\t"
            MOVNTQ"    %%mm4, 32(%0,%2,2)   \n\t"
            MOVNTQ"    %%mm5, 40(%0,%2,2)   \n\t"
            MOVNTQ"    %%mm6, 48(%0,%2,2)   \n\t"
            MOVNTQ"    %%mm7, 56(%0,%2,2)"
            :: "r"(d), "r"(s2), "r"(x)
        for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
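    /*
     * Added note: "punpcklbw x, x" duplicates each byte of the low half of
     * x (AABBCCDD...), so the unpack pair plus eight MOVNTQ stores doubles
     * 32 source bytes to 64 -- a horizontal 2x upscale of the plane, just
     * like the scalar d[2*x]=d[2*x+1]=s[x] fallback.  The (y>>1) source
     * indexing provides the matching vertical doubling.
     */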
                                 int srcStride1, int srcStride2,
                                 int srcStride3, int dstStride)
        const uint8_t* yp=src1+srcStride1*y;
        const uint8_t* up=src2+srcStride2*(y>>2);
        const uint8_t* vp=src3+srcStride3*(y>>2);
            "movq  (%1, %0, 4), %%mm0   \n\t"
            "movq     (%2, %0), %%mm1   \n\t"
            "movq     (%3, %0), %%mm2   \n\t"
            "movq        %%mm0, %%mm3   \n\t"
            "movq        %%mm1, %%mm4   \n\t"
            "movq        %%mm2, %%mm5   \n\t"
            "punpcklbw   %%mm1, %%mm1   \n\t"
            "punpcklbw   %%mm2, %%mm2   \n\t"
            "punpckhbw   %%mm4, %%mm4   \n\t"
            "punpckhbw   %%mm5, %%mm5   \n\t"

            "movq        %%mm1, %%mm6   \n\t"
            "punpcklbw   %%mm2, %%mm1   \n\t"
            "punpcklbw   %%mm1, %%mm0   \n\t"
            "punpckhbw   %%mm1, %%mm3   \n\t"
            MOVNTQ"      %%mm0,  (%4, %0, 8)    \n\t"
            MOVNTQ"      %%mm3, 8(%4, %0, 8)    \n\t"

            "punpckhbw   %%mm2, %%mm6   \n\t"
            "movq 8(%1, %0, 4), %%mm0   \n\t"
            "movq        %%mm0, %%mm3   \n\t"
            "punpcklbw   %%mm6, %%mm0   \n\t"
            "punpckhbw   %%mm6, %%mm3   \n\t"
            MOVNTQ"      %%mm0, 16(%4, %0, 8)   \n\t"
            MOVNTQ"      %%mm3, 24(%4, %0, 8)   \n\t"

            "movq        %%mm4, %%mm6   \n\t"
            "movq 16(%1, %0, 4), %%mm0  \n\t"
            "movq        %%mm0, %%mm3   \n\t"
            "punpcklbw   %%mm5, %%mm4   \n\t"
            "punpcklbw   %%mm4, %%mm0   \n\t"
            "punpckhbw   %%mm4, %%mm3   \n\t"
            MOVNTQ"      %%mm0, 32(%4, %0, 8)   \n\t"
            MOVNTQ"      %%mm3, 40(%4, %0, 8)   \n\t"

            "punpckhbw   %%mm5, %%mm6   \n\t"
            "movq 24(%1, %0, 4), %%mm0  \n\t"
            "movq        %%mm0, %%mm3   \n\t"
            "punpcklbw   %%mm6, %%mm0   \n\t"
            "punpckhbw   %%mm6, %%mm3   \n\t"
            MOVNTQ"      %%mm0, 48(%4, %0, 8)   \n\t"
            MOVNTQ"      %%mm3, 56(%4, %0, 8)   \n\t"
            : "r"(yp), "r" (up), "r"(vp), "r"(d)
            const int x2 = x<<2;
            d[8*x+2] = yp[x2+1];
            d[8*x+4] = yp[x2+2];
            d[8*x+6] = yp[x2+3];
2176 "pcmpeqw %%mm7, %%mm7 \n\t"
2177 "psrlw $8, %%mm7 \n\t"
2179 "movq -30(%1, %0, 2), %%mm0 \n\t"
2180 "movq -22(%1, %0, 2), %%mm1 \n\t"
2181 "movq -14(%1, %0, 2), %%mm2 \n\t"
2182 "movq -6(%1, %0, 2), %%mm3 \n\t"
2183 "pand %%mm7, %%mm0 \n\t"
2184 "pand %%mm7, %%mm1 \n\t"
2185 "pand %%mm7, %%mm2 \n\t"
2186 "pand %%mm7, %%mm3 \n\t"
2187 "packuswb %%mm1, %%mm0 \n\t"
2188 "packuswb %%mm3, %%mm2 \n\t"
2189 MOVNTQ" %%mm0,-15(%2, %0) \n\t"
2190 MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
2194 :
"r"(src),
"r"(dst)
2214 "pcmpeqw %%mm7, %%mm7 \n\t"
2215 "psrlw $8, %%mm7 \n\t"
2217 "movq -32(%1, %0, 2), %%mm0 \n\t"
2218 "movq -24(%1, %0, 2), %%mm1 \n\t"
2219 "movq -16(%1, %0, 2), %%mm2 \n\t"
2220 "movq -8(%1, %0, 2), %%mm3 \n\t"
2221 "pand %%mm7, %%mm0 \n\t"
2222 "pand %%mm7, %%mm1 \n\t"
2223 "pand %%mm7, %%mm2 \n\t"
2224 "pand %%mm7, %%mm3 \n\t"
2225 "packuswb %%mm1, %%mm0 \n\t"
2226 "packuswb %%mm3, %%mm2 \n\t"
2227 MOVNTQ" %%mm0,-16(%2, %0) \n\t"
2228 MOVNTQ" %%mm2,- 8(%2, %0) \n\t"
2232 :
"r"(src),
"r"(dst)
#if !COMPILE_TEMPLATE_AMD3DNOW
        "pcmpeqw           %%mm7, %%mm7    \n\t"
        "psrlw                $8, %%mm7    \n\t"
        "movq     -28(%1, %0, 4), %%mm0    \n\t"
        "movq     -20(%1, %0, 4), %%mm1    \n\t"
        "movq     -12(%1, %0, 4), %%mm2    \n\t"
        "movq      -4(%1, %0, 4), %%mm3    \n\t"
        "pand              %%mm7, %%mm0    \n\t"
        "pand              %%mm7, %%mm1    \n\t"
        "pand              %%mm7, %%mm2    \n\t"
        "pand              %%mm7, %%mm3    \n\t"
        "packuswb          %%mm1, %%mm0    \n\t"
        "packuswb          %%mm3, %%mm2    \n\t"
        "movq              %%mm0, %%mm1    \n\t"
        "movq              %%mm2, %%mm3    \n\t"
        "psrlw                $8, %%mm0    \n\t"
        "psrlw                $8, %%mm2    \n\t"
        "pand              %%mm7, %%mm1    \n\t"
        "pand              %%mm7, %%mm3    \n\t"
        "packuswb          %%mm2, %%mm0    \n\t"
        "packuswb          %%mm3, %%mm1    \n\t"
        MOVNTQ"            %%mm0,- 7(%3, %0)    \n\t"
        MOVNTQ"            %%mm1,- 7(%2, %0)    \n\t"
        : "r"(src), "r"(dst0), "r"(dst1)
2301 "pcmpeqw %%mm7, %%mm7 \n\t"
2302 "psrlw $8, %%mm7 \n\t"
2304 "movq -28(%1, %0, 4), %%mm0 \n\t"
2305 "movq -20(%1, %0, 4), %%mm1 \n\t"
2306 "movq -12(%1, %0, 4), %%mm2 \n\t"
2307 "movq -4(%1, %0, 4), %%mm3 \n\t"
2308 PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2309 PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2310 PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2311 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2312 "pand %%mm7, %%mm0 \n\t"
2313 "pand %%mm7, %%mm1 \n\t"
2314 "pand %%mm7, %%mm2 \n\t"
2315 "pand %%mm7, %%mm3 \n\t"
2316 "packuswb %%mm1, %%mm0 \n\t"
2317 "packuswb %%mm3, %%mm2 \n\t"
2318 "movq %%mm0, %%mm1 \n\t"
2319 "movq %%mm2, %%mm3 \n\t"
2320 "psrlw $8, %%mm0 \n\t"
2321 "psrlw $8, %%mm2 \n\t"
2322 "pand %%mm7, %%mm1 \n\t"
2323 "pand %%mm7, %%mm3 \n\t"
2324 "packuswb %%mm2, %%mm0 \n\t"
2325 "packuswb %%mm3, %%mm1 \n\t"
2326 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2327 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2331 :
"r"(src0),
"r"(src1),
"r"(dst0),
"r"(dst1)
#if !COMPILE_TEMPLATE_AMD3DNOW
        "pcmpeqw           %%mm7, %%mm7    \n\t"
        "psrlw                $8, %%mm7    \n\t"
        "movq     -28(%1, %0, 4), %%mm0    \n\t"
        "movq     -20(%1, %0, 4), %%mm1    \n\t"
        "movq     -12(%1, %0, 4), %%mm2    \n\t"
        "movq      -4(%1, %0, 4), %%mm3    \n\t"
        "psrlw                $8, %%mm0    \n\t"
        "psrlw                $8, %%mm1    \n\t"
        "psrlw                $8, %%mm2    \n\t"
        "psrlw                $8, %%mm3    \n\t"
        "packuswb          %%mm1, %%mm0    \n\t"
        "packuswb          %%mm3, %%mm2    \n\t"
        "movq              %%mm0, %%mm1    \n\t"
        "movq              %%mm2, %%mm3    \n\t"
        "psrlw                $8, %%mm0    \n\t"
        "psrlw                $8, %%mm2    \n\t"
        "pand              %%mm7, %%mm1    \n\t"
        "pand              %%mm7, %%mm3    \n\t"
        "packuswb          %%mm2, %%mm0    \n\t"
        "packuswb          %%mm3, %%mm1    \n\t"
        MOVNTQ"            %%mm0,- 7(%3, %0)    \n\t"
        MOVNTQ"            %%mm1,- 7(%2, %0)    \n\t"
        : "r"(src), "r"(dst0), "r"(dst1)
2403 "pcmpeqw %%mm7, %%mm7 \n\t"
2404 "psrlw $8, %%mm7 \n\t"
2406 "movq -28(%1, %0, 4), %%mm0 \n\t"
2407 "movq -20(%1, %0, 4), %%mm1 \n\t"
2408 "movq -12(%1, %0, 4), %%mm2 \n\t"
2409 "movq -4(%1, %0, 4), %%mm3 \n\t"
2410 PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2411 PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2412 PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2413 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2414 "psrlw $8, %%mm0 \n\t"
2415 "psrlw $8, %%mm1 \n\t"
2416 "psrlw $8, %%mm2 \n\t"
2417 "psrlw $8, %%mm3 \n\t"
2418 "packuswb %%mm1, %%mm0 \n\t"
2419 "packuswb %%mm3, %%mm2 \n\t"
2420 "movq %%mm0, %%mm1 \n\t"
2421 "movq %%mm2, %%mm3 \n\t"
2422 "psrlw $8, %%mm0 \n\t"
2423 "psrlw $8, %%mm2 \n\t"
2424 "pand %%mm7, %%mm1 \n\t"
2425 "pand %%mm7, %%mm3 \n\t"
2426 "packuswb %%mm2, %%mm0 \n\t"
2427 "packuswb %%mm3, %%mm1 \n\t"
2428 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2429 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2433 :
"r"(src0),
"r"(src1),
"r"(dst0),
"r"(dst1)
                             int lumStride, int chromStride, int srcStride)
    for (y=0; y<height; y++) {
#if !COMPILE_TEMPLATE_AMD3DNOW
                             int lumStride, int chromStride, int srcStride)
    for (y=0; y<height; y++) {
                             int lumStride, int chromStride, int srcStride)
    for (y=0; y<height; y++) {
#if !COMPILE_TEMPLATE_AMD3DNOW
                             int lumStride, int chromStride, int srcStride)
    for (y=0; y<height; y++) {
#if !COMPILE_TEMPLATE_SSE2
#if !COMPILE_TEMPLATE_AMD3DNOW
#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
#if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX
#if !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL
#if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM