35 #if COMPILE_TEMPLATE_AMD3DNOW
36 #define PREFETCH "prefetch"
37 #define PAVGB "pavgusb"
38 #elif COMPILE_TEMPLATE_MMXEXT
39 #define PREFETCH "prefetchnta"
42 #define PREFETCH " # nop"
45 #if COMPILE_TEMPLATE_AMD3DNOW
52 #if COMPILE_TEMPLATE_MMXEXT
53 #define MOVNTQ "movntq"
54 #define SFENCE "sfence"
57 #define SFENCE " # nop"
60 #if !COMPILE_TEMPLATE_SSE2
62 #if !COMPILE_TEMPLATE_AMD3DNOW
71 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
73 __asm__
volatile(
"movq %0, %%mm7"::
"m"(mask32a):
"memory");
77 "movd (%1), %%mm0 \n\t"
78 "punpckldq 3(%1), %%mm0 \n\t"
79 "movd 6(%1), %%mm1 \n\t"
80 "punpckldq 9(%1), %%mm1 \n\t"
81 "movd 12(%1), %%mm2 \n\t"
82 "punpckldq 15(%1), %%mm2 \n\t"
83 "movd 18(%1), %%mm3 \n\t"
84 "punpckldq 21(%1), %%mm3 \n\t"
85 "por %%mm7, %%mm0 \n\t"
86 "por %%mm7, %%mm1 \n\t"
87 "por %%mm7, %%mm2 \n\t"
88 "por %%mm7, %%mm3 \n\t"
91 MOVNTQ" %%mm2, 16(%0) \n\t"
98 __asm__
volatile(
SFENCE:::
"memory");
99 __asm__
volatile(
EMMS:::
"memory");
/*
 * STORE_BGR24_MMX -- pack eight 32 bpp pixels down to 24 bpp and store them.
 *
 * On entry %%mm0..%%mm7 hold the source data as pairwise copies:
 * %%mm2/%%mm3/%%mm6/%%mm7 duplicate %%mm0/%%mm1/%%mm4/%%mm5 (the callers
 * "movq" each source quadword into two registers before invoking this).
 * The macro discards every fourth byte (the alpha/padding byte), squeezes
 * the four 8-byte quadwords into three, and streams the resulting 24 bytes
 * to (%0), 8(%0) and 16(%0) with MOVNTQ (movntq or plain movq depending on
 * the compile template).
 *
 * mask24l/mask24h are 64-bit constants that select complementary byte lanes
 * of a quadword (presumably the low/high 24-bit pixel group -- confirm
 * against the mask definitions, which are outside this chunk).  MANGLE()
 * applies platform symbol-name mangling.  All eight MMX registers are
 * clobbered; %0 must point to at least 24 writable bytes.
 */
#define STORE_BGR24_MMX \
    /* shift the duplicate copies right one byte so their pixel data \
     * lines up in the lanes that mask24h keeps */ \
    "psrlq $8, %%mm2 \n\t" \
    "psrlq $8, %%mm3 \n\t" \
    "psrlq $8, %%mm6 \n\t" \
    "psrlq $8, %%mm7 \n\t" \
    /* keep complementary lanes of each copy, then merge: alpha bytes gone */ \
    "pand "MANGLE(mask24l)", %%mm0\n\t" \
    "pand "MANGLE(mask24l)", %%mm1\n\t" \
    "pand "MANGLE(mask24l)", %%mm4\n\t" \
    "pand "MANGLE(mask24l)", %%mm5\n\t" \
    "pand "MANGLE(mask24h)", %%mm2\n\t" \
    "pand "MANGLE(mask24h)", %%mm3\n\t" \
    "pand "MANGLE(mask24h)", %%mm6\n\t" \
    "pand "MANGLE(mask24h)", %%mm7\n\t" \
    "por %%mm2, %%mm0 \n\t" \
    "por %%mm3, %%mm1 \n\t" \
    "por %%mm6, %%mm4 \n\t" \
    "por %%mm7, %%mm5 \n\t" \
    \
    /* funnel the four quadwords of 6-byte pixel pairs into three \
     * contiguous quadwords (4 x 6 bytes = 24 bytes = 3 x 8 bytes) */ \
    "movq %%mm1, %%mm2 \n\t" \
    "movq %%mm4, %%mm3 \n\t" \
    "psllq $48, %%mm2 \n\t" \
    "psllq $32, %%mm3 \n\t" \
    "por %%mm2, %%mm0 \n\t" \
    "psrlq $16, %%mm1 \n\t" \
    "psrlq $32, %%mm4 \n\t" \
    "psllq $16, %%mm5 \n\t" \
    "por %%mm3, %%mm1 \n\t" \
    "por %%mm5, %%mm4 \n\t" \
    \
    /* non-temporal stores: the surrounding code issues SFENCE afterwards */ \
    MOVNTQ" %%mm0, (%0) \n\t" \
    MOVNTQ" %%mm1, 8(%0) \n\t" \
    MOVNTQ" %%mm4, 16(%0)"
149 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
154 "movq (%1), %%mm0 \n\t"
155 "movq 8(%1), %%mm1 \n\t"
156 "movq 16(%1), %%mm4 \n\t"
157 "movq 24(%1), %%mm5 \n\t"
158 "movq %%mm0, %%mm2 \n\t"
159 "movq %%mm1, %%mm3 \n\t"
160 "movq %%mm4, %%mm6 \n\t"
161 "movq %%mm5, %%mm7 \n\t"
168 __asm__
volatile(
SFENCE:::
"memory");
169 __asm__
volatile(
EMMS:::
"memory");
191 __asm__
volatile(
PREFETCH" %0"::
"m"(*s));
192 __asm__
volatile(
"movq %0, %%mm4"::
"m"(mask15s));
197 "movq (%1), %%mm0 \n\t"
198 "movq 8(%1), %%mm2 \n\t"
199 "movq %%mm0, %%mm1 \n\t"
200 "movq %%mm2, %%mm3 \n\t"
201 "pand %%mm4, %%mm0 \n\t"
202 "pand %%mm4, %%mm2 \n\t"
203 "paddw %%mm1, %%mm0 \n\t"
204 "paddw %%mm3, %%mm2 \n\t"
212 __asm__
volatile(
SFENCE:::
"memory");
213 __asm__
volatile(
EMMS:::
"memory");
216 register unsigned x= *((
const uint32_t *)s);
217 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
222 register unsigned short x= *((
const uint16_t *)s);
223 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
234 __asm__
volatile(
PREFETCH" %0"::
"m"(*s));
235 __asm__
volatile(
"movq %0, %%mm7"::
"m"(mask15rg));
236 __asm__
volatile(
"movq %0, %%mm6"::
"m"(mask15b));
241 "movq (%1), %%mm0 \n\t"
242 "movq 8(%1), %%mm2 \n\t"
243 "movq %%mm0, %%mm1 \n\t"
244 "movq %%mm2, %%mm3 \n\t"
245 "psrlq $1, %%mm0 \n\t"
246 "psrlq $1, %%mm2 \n\t"
247 "pand %%mm7, %%mm0 \n\t"
248 "pand %%mm7, %%mm2 \n\t"
249 "pand %%mm6, %%mm1 \n\t"
250 "pand %%mm6, %%mm3 \n\t"
251 "por %%mm1, %%mm0 \n\t"
252 "por %%mm3, %%mm2 \n\t"
260 __asm__
volatile(
SFENCE:::
"memory");
261 __asm__
volatile(
EMMS:::
"memory");
264 register uint32_t x= *((
const uint32_t*)s);
265 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
270 register uint16_t x= *((
const uint16_t*)s);
271 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
280 uint16_t *d = (uint16_t *)
dst;
284 "movq %3, %%mm5 \n\t"
285 "movq %4, %%mm6 \n\t"
286 "movq %5, %%mm7 \n\t"
291 "movd (%1), %%mm0 \n\t"
292 "movd 4(%1), %%mm3 \n\t"
293 "punpckldq 8(%1), %%mm0 \n\t"
294 "punpckldq 12(%1), %%mm3 \n\t"
295 "movq %%mm0, %%mm1 \n\t"
296 "movq %%mm3, %%mm4 \n\t"
297 "pand %%mm6, %%mm0 \n\t"
298 "pand %%mm6, %%mm3 \n\t"
299 "pmaddwd %%mm7, %%mm0 \n\t"
300 "pmaddwd %%mm7, %%mm3 \n\t"
301 "pand %%mm5, %%mm1 \n\t"
302 "pand %%mm5, %%mm4 \n\t"
303 "por %%mm1, %%mm0 \n\t"
304 "por %%mm4, %%mm3 \n\t"
305 "psrld $5, %%mm0 \n\t"
306 "pslld $11, %%mm3 \n\t"
307 "por %%mm3, %%mm0 \n\t"
315 :
"r" (mm_end),
"m" (mask3216g),
"m" (mask3216br),
"m" (mul3216)
317 __asm__
volatile(
SFENCE:::
"memory");
318 __asm__
volatile(
EMMS:::
"memory");
320 register int rgb = *(
const uint32_t*)s; s += 4;
321 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
330 uint16_t *d = (uint16_t *)
dst;
332 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
334 "movq %0, %%mm7 \n\t"
335 "movq %1, %%mm6 \n\t"
336 ::
"m"(red_16mask),
"m"(green_16mask));
341 "movd (%1), %%mm0 \n\t"
342 "movd 4(%1), %%mm3 \n\t"
343 "punpckldq 8(%1), %%mm0 \n\t"
344 "punpckldq 12(%1), %%mm3 \n\t"
345 "movq %%mm0, %%mm1 \n\t"
346 "movq %%mm0, %%mm2 \n\t"
347 "movq %%mm3, %%mm4 \n\t"
348 "movq %%mm3, %%mm5 \n\t"
349 "psllq $8, %%mm0 \n\t"
350 "psllq $8, %%mm3 \n\t"
351 "pand %%mm7, %%mm0 \n\t"
352 "pand %%mm7, %%mm3 \n\t"
353 "psrlq $5, %%mm1 \n\t"
354 "psrlq $5, %%mm4 \n\t"
355 "pand %%mm6, %%mm1 \n\t"
356 "pand %%mm6, %%mm4 \n\t"
357 "psrlq $19, %%mm2 \n\t"
358 "psrlq $19, %%mm5 \n\t"
359 "pand %2, %%mm2 \n\t"
360 "pand %2, %%mm5 \n\t"
361 "por %%mm1, %%mm0 \n\t"
362 "por %%mm4, %%mm3 \n\t"
363 "por %%mm2, %%mm0 \n\t"
364 "por %%mm5, %%mm3 \n\t"
365 "psllq $16, %%mm3 \n\t"
366 "por %%mm3, %%mm0 \n\t"
368 ::
"r"(d),
"r"(s),
"m"(blue_16mask):
"memory");
372 __asm__
volatile(
SFENCE:::
"memory");
373 __asm__
volatile(
EMMS:::
"memory");
375 register int rgb = *(
const uint32_t*)s; s += 4;
376 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
385 uint16_t *d = (uint16_t *)
dst;
389 "movq %3, %%mm5 \n\t"
390 "movq %4, %%mm6 \n\t"
391 "movq %5, %%mm7 \n\t"
396 "movd (%1), %%mm0 \n\t"
397 "movd 4(%1), %%mm3 \n\t"
398 "punpckldq 8(%1), %%mm0 \n\t"
399 "punpckldq 12(%1), %%mm3 \n\t"
400 "movq %%mm0, %%mm1 \n\t"
401 "movq %%mm3, %%mm4 \n\t"
402 "pand %%mm6, %%mm0 \n\t"
403 "pand %%mm6, %%mm3 \n\t"
404 "pmaddwd %%mm7, %%mm0 \n\t"
405 "pmaddwd %%mm7, %%mm3 \n\t"
406 "pand %%mm5, %%mm1 \n\t"
407 "pand %%mm5, %%mm4 \n\t"
408 "por %%mm1, %%mm0 \n\t"
409 "por %%mm4, %%mm3 \n\t"
410 "psrld $6, %%mm0 \n\t"
411 "pslld $10, %%mm3 \n\t"
412 "por %%mm3, %%mm0 \n\t"
420 :
"r" (mm_end),
"m" (mask3215g),
"m" (mask3216br),
"m" (mul3215)
422 __asm__
volatile(
SFENCE:::
"memory");
423 __asm__
volatile(
EMMS:::
"memory");
425 register int rgb = *(
const uint32_t*)s; s += 4;
426 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
435 uint16_t *d = (uint16_t *)
dst;
437 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
439 "movq %0, %%mm7 \n\t"
440 "movq %1, %%mm6 \n\t"
441 ::
"m"(red_15mask),
"m"(green_15mask));
446 "movd (%1), %%mm0 \n\t"
447 "movd 4(%1), %%mm3 \n\t"
448 "punpckldq 8(%1), %%mm0 \n\t"
449 "punpckldq 12(%1), %%mm3 \n\t"
450 "movq %%mm0, %%mm1 \n\t"
451 "movq %%mm0, %%mm2 \n\t"
452 "movq %%mm3, %%mm4 \n\t"
453 "movq %%mm3, %%mm5 \n\t"
454 "psllq $7, %%mm0 \n\t"
455 "psllq $7, %%mm3 \n\t"
456 "pand %%mm7, %%mm0 \n\t"
457 "pand %%mm7, %%mm3 \n\t"
458 "psrlq $6, %%mm1 \n\t"
459 "psrlq $6, %%mm4 \n\t"
460 "pand %%mm6, %%mm1 \n\t"
461 "pand %%mm6, %%mm4 \n\t"
462 "psrlq $19, %%mm2 \n\t"
463 "psrlq $19, %%mm5 \n\t"
464 "pand %2, %%mm2 \n\t"
465 "pand %2, %%mm5 \n\t"
466 "por %%mm1, %%mm0 \n\t"
467 "por %%mm4, %%mm3 \n\t"
468 "por %%mm2, %%mm0 \n\t"
469 "por %%mm5, %%mm3 \n\t"
470 "psllq $16, %%mm3 \n\t"
471 "por %%mm3, %%mm0 \n\t"
473 ::
"r"(d),
"r"(s),
"m"(blue_15mask):
"memory");
477 __asm__
volatile(
SFENCE:::
"memory");
478 __asm__
volatile(
EMMS:::
"memory");
480 register int rgb = *(
const uint32_t*)s; s += 4;
481 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
490 uint16_t *d = (uint16_t *)
dst;
492 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
494 "movq %0, %%mm7 \n\t"
495 "movq %1, %%mm6 \n\t"
496 ::
"m"(red_16mask),
"m"(green_16mask));
501 "movd (%1), %%mm0 \n\t"
502 "movd 3(%1), %%mm3 \n\t"
503 "punpckldq 6(%1), %%mm0 \n\t"
504 "punpckldq 9(%1), %%mm3 \n\t"
505 "movq %%mm0, %%mm1 \n\t"
506 "movq %%mm0, %%mm2 \n\t"
507 "movq %%mm3, %%mm4 \n\t"
508 "movq %%mm3, %%mm5 \n\t"
509 "psrlq $3, %%mm0 \n\t"
510 "psrlq $3, %%mm3 \n\t"
511 "pand %2, %%mm0 \n\t"
512 "pand %2, %%mm3 \n\t"
513 "psrlq $5, %%mm1 \n\t"
514 "psrlq $5, %%mm4 \n\t"
515 "pand %%mm6, %%mm1 \n\t"
516 "pand %%mm6, %%mm4 \n\t"
517 "psrlq $8, %%mm2 \n\t"
518 "psrlq $8, %%mm5 \n\t"
519 "pand %%mm7, %%mm2 \n\t"
520 "pand %%mm7, %%mm5 \n\t"
521 "por %%mm1, %%mm0 \n\t"
522 "por %%mm4, %%mm3 \n\t"
523 "por %%mm2, %%mm0 \n\t"
524 "por %%mm5, %%mm3 \n\t"
525 "psllq $16, %%mm3 \n\t"
526 "por %%mm3, %%mm0 \n\t"
528 ::
"r"(d),
"r"(s),
"m"(blue_16mask):
"memory");
532 __asm__
volatile(
SFENCE:::
"memory");
533 __asm__
volatile(
EMMS:::
"memory");
538 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
547 uint16_t *d = (uint16_t *)
dst;
549 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
551 "movq %0, %%mm7 \n\t"
552 "movq %1, %%mm6 \n\t"
553 ::
"m"(red_16mask),
"m"(green_16mask));
558 "movd (%1), %%mm0 \n\t"
559 "movd 3(%1), %%mm3 \n\t"
560 "punpckldq 6(%1), %%mm0 \n\t"
561 "punpckldq 9(%1), %%mm3 \n\t"
562 "movq %%mm0, %%mm1 \n\t"
563 "movq %%mm0, %%mm2 \n\t"
564 "movq %%mm3, %%mm4 \n\t"
565 "movq %%mm3, %%mm5 \n\t"
566 "psllq $8, %%mm0 \n\t"
567 "psllq $8, %%mm3 \n\t"
568 "pand %%mm7, %%mm0 \n\t"
569 "pand %%mm7, %%mm3 \n\t"
570 "psrlq $5, %%mm1 \n\t"
571 "psrlq $5, %%mm4 \n\t"
572 "pand %%mm6, %%mm1 \n\t"
573 "pand %%mm6, %%mm4 \n\t"
574 "psrlq $19, %%mm2 \n\t"
575 "psrlq $19, %%mm5 \n\t"
576 "pand %2, %%mm2 \n\t"
577 "pand %2, %%mm5 \n\t"
578 "por %%mm1, %%mm0 \n\t"
579 "por %%mm4, %%mm3 \n\t"
580 "por %%mm2, %%mm0 \n\t"
581 "por %%mm5, %%mm3 \n\t"
582 "psllq $16, %%mm3 \n\t"
583 "por %%mm3, %%mm0 \n\t"
585 ::
"r"(d),
"r"(s),
"m"(blue_16mask):
"memory");
589 __asm__
volatile(
SFENCE:::
"memory");
590 __asm__
volatile(
EMMS:::
"memory");
595 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
604 uint16_t *d = (uint16_t *)
dst;
606 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
608 "movq %0, %%mm7 \n\t"
609 "movq %1, %%mm6 \n\t"
610 ::
"m"(red_15mask),
"m"(green_15mask));
615 "movd (%1), %%mm0 \n\t"
616 "movd 3(%1), %%mm3 \n\t"
617 "punpckldq 6(%1), %%mm0 \n\t"
618 "punpckldq 9(%1), %%mm3 \n\t"
619 "movq %%mm0, %%mm1 \n\t"
620 "movq %%mm0, %%mm2 \n\t"
621 "movq %%mm3, %%mm4 \n\t"
622 "movq %%mm3, %%mm5 \n\t"
623 "psrlq $3, %%mm0 \n\t"
624 "psrlq $3, %%mm3 \n\t"
625 "pand %2, %%mm0 \n\t"
626 "pand %2, %%mm3 \n\t"
627 "psrlq $6, %%mm1 \n\t"
628 "psrlq $6, %%mm4 \n\t"
629 "pand %%mm6, %%mm1 \n\t"
630 "pand %%mm6, %%mm4 \n\t"
631 "psrlq $9, %%mm2 \n\t"
632 "psrlq $9, %%mm5 \n\t"
633 "pand %%mm7, %%mm2 \n\t"
634 "pand %%mm7, %%mm5 \n\t"
635 "por %%mm1, %%mm0 \n\t"
636 "por %%mm4, %%mm3 \n\t"
637 "por %%mm2, %%mm0 \n\t"
638 "por %%mm5, %%mm3 \n\t"
639 "psllq $16, %%mm3 \n\t"
640 "por %%mm3, %%mm0 \n\t"
642 ::
"r"(d),
"r"(s),
"m"(blue_15mask):
"memory");
646 __asm__
volatile(
SFENCE:::
"memory");
647 __asm__
volatile(
EMMS:::
"memory");
652 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
661 uint16_t *d = (uint16_t *)
dst;
663 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
665 "movq %0, %%mm7 \n\t"
666 "movq %1, %%mm6 \n\t"
667 ::
"m"(red_15mask),
"m"(green_15mask));
672 "movd (%1), %%mm0 \n\t"
673 "movd 3(%1), %%mm3 \n\t"
674 "punpckldq 6(%1), %%mm0 \n\t"
675 "punpckldq 9(%1), %%mm3 \n\t"
676 "movq %%mm0, %%mm1 \n\t"
677 "movq %%mm0, %%mm2 \n\t"
678 "movq %%mm3, %%mm4 \n\t"
679 "movq %%mm3, %%mm5 \n\t"
680 "psllq $7, %%mm0 \n\t"
681 "psllq $7, %%mm3 \n\t"
682 "pand %%mm7, %%mm0 \n\t"
683 "pand %%mm7, %%mm3 \n\t"
684 "psrlq $6, %%mm1 \n\t"
685 "psrlq $6, %%mm4 \n\t"
686 "pand %%mm6, %%mm1 \n\t"
687 "pand %%mm6, %%mm4 \n\t"
688 "psrlq $19, %%mm2 \n\t"
689 "psrlq $19, %%mm5 \n\t"
690 "pand %2, %%mm2 \n\t"
691 "pand %2, %%mm5 \n\t"
692 "por %%mm1, %%mm0 \n\t"
693 "por %%mm4, %%mm3 \n\t"
694 "por %%mm2, %%mm0 \n\t"
695 "por %%mm5, %%mm3 \n\t"
696 "psllq $16, %%mm3 \n\t"
697 "por %%mm3, %%mm0 \n\t"
699 ::
"r"(d),
"r"(s),
"m"(blue_15mask):
"memory");
703 __asm__
volatile(
SFENCE:::
"memory");
704 __asm__
volatile(
EMMS:::
"memory");
709 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
716 const uint16_t *mm_end;
718 const uint16_t *s = (
const uint16_t*)src;
719 end = s + src_size/2;
720 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
725 "movq (%1), %%mm0 \n\t"
726 "movq (%1), %%mm1 \n\t"
727 "movq (%1), %%mm2 \n\t"
728 "pand %2, %%mm0 \n\t"
729 "pand %3, %%mm1 \n\t"
730 "pand %4, %%mm2 \n\t"
731 "psllq $5, %%mm0 \n\t"
732 "pmulhw "MANGLE(mul15_mid)
", %%mm0 \n\t"
733 "pmulhw "MANGLE(mul15_mid)
", %%mm1 \n\t"
734 "pmulhw "MANGLE(mul15_hi)
", %%mm2 \n\t"
735 "movq %%mm0, %%mm3 \n\t"
736 "movq %%mm1, %%mm4 \n\t"
737 "movq %%mm2, %%mm5 \n\t"
738 "punpcklwd %5, %%mm0 \n\t"
739 "punpcklwd %5, %%mm1 \n\t"
740 "punpcklwd %5, %%mm2 \n\t"
741 "punpckhwd %5, %%mm3 \n\t"
742 "punpckhwd %5, %%mm4 \n\t"
743 "punpckhwd %5, %%mm5 \n\t"
744 "psllq $8, %%mm1 \n\t"
745 "psllq $16, %%mm2 \n\t"
746 "por %%mm1, %%mm0 \n\t"
747 "por %%mm2, %%mm0 \n\t"
748 "psllq $8, %%mm4 \n\t"
749 "psllq $16, %%mm5 \n\t"
750 "por %%mm4, %%mm3 \n\t"
751 "por %%mm5, %%mm3 \n\t"
753 "movq %%mm0, %%mm6 \n\t"
754 "movq %%mm3, %%mm7 \n\t"
756 "movq 8(%1), %%mm0 \n\t"
757 "movq 8(%1), %%mm1 \n\t"
758 "movq 8(%1), %%mm2 \n\t"
759 "pand %2, %%mm0 \n\t"
760 "pand %3, %%mm1 \n\t"
761 "pand %4, %%mm2 \n\t"
762 "psllq $5, %%mm0 \n\t"
763 "pmulhw "MANGLE(mul15_mid)
", %%mm0 \n\t"
764 "pmulhw "MANGLE(mul15_mid)
", %%mm1 \n\t"
765 "pmulhw "MANGLE(mul15_hi)
", %%mm2 \n\t"
766 "movq %%mm0, %%mm3 \n\t"
767 "movq %%mm1, %%mm4 \n\t"
768 "movq %%mm2, %%mm5 \n\t"
769 "punpcklwd %5, %%mm0 \n\t"
770 "punpcklwd %5, %%mm1 \n\t"
771 "punpcklwd %5, %%mm2 \n\t"
772 "punpckhwd %5, %%mm3 \n\t"
773 "punpckhwd %5, %%mm4 \n\t"
774 "punpckhwd %5, %%mm5 \n\t"
775 "psllq $8, %%mm1 \n\t"
776 "psllq $16, %%mm2 \n\t"
777 "por %%mm1, %%mm0 \n\t"
778 "por %%mm2, %%mm0 \n\t"
779 "psllq $8, %%mm4 \n\t"
780 "psllq $16, %%mm5 \n\t"
781 "por %%mm4, %%mm3 \n\t"
782 "por %%mm5, %%mm3 \n\t"
785 :
"r"(s),
"m"(mask15b),
"m"(mask15g),
"m"(mask15r),
"m"(mmx_null)
789 "movq %%mm0, %%mm4 \n\t"
790 "movq %%mm3, %%mm5 \n\t"
791 "movq %%mm6, %%mm0 \n\t"
792 "movq %%mm7, %%mm1 \n\t"
794 "movq %%mm4, %%mm6 \n\t"
795 "movq %%mm5, %%mm7 \n\t"
796 "movq %%mm0, %%mm2 \n\t"
797 "movq %%mm1, %%mm3 \n\t"
806 __asm__
volatile(
SFENCE:::
"memory");
807 __asm__
volatile(
EMMS:::
"memory");
809 register uint16_t bgr;
811 *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
812 *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
813 *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
820 const uint16_t *mm_end;
822 const uint16_t *s = (
const uint16_t *)src;
823 end = s + src_size/2;
824 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
829 "movq (%1), %%mm0 \n\t"
830 "movq (%1), %%mm1 \n\t"
831 "movq (%1), %%mm2 \n\t"
832 "pand %2, %%mm0 \n\t"
833 "pand %3, %%mm1 \n\t"
834 "pand %4, %%mm2 \n\t"
835 "psllq $5, %%mm0 \n\t"
836 "psrlq $1, %%mm2 \n\t"
837 "pmulhw "MANGLE(mul15_mid)
", %%mm0 \n\t"
838 "pmulhw "MANGLE(mul16_mid)
", %%mm1 \n\t"
839 "pmulhw "MANGLE(mul15_hi)
", %%mm2 \n\t"
840 "movq %%mm0, %%mm3 \n\t"
841 "movq %%mm1, %%mm4 \n\t"
842 "movq %%mm2, %%mm5 \n\t"
843 "punpcklwd %5, %%mm0 \n\t"
844 "punpcklwd %5, %%mm1 \n\t"
845 "punpcklwd %5, %%mm2 \n\t"
846 "punpckhwd %5, %%mm3 \n\t"
847 "punpckhwd %5, %%mm4 \n\t"
848 "punpckhwd %5, %%mm5 \n\t"
849 "psllq $8, %%mm1 \n\t"
850 "psllq $16, %%mm2 \n\t"
851 "por %%mm1, %%mm0 \n\t"
852 "por %%mm2, %%mm0 \n\t"
853 "psllq $8, %%mm4 \n\t"
854 "psllq $16, %%mm5 \n\t"
855 "por %%mm4, %%mm3 \n\t"
856 "por %%mm5, %%mm3 \n\t"
858 "movq %%mm0, %%mm6 \n\t"
859 "movq %%mm3, %%mm7 \n\t"
861 "movq 8(%1), %%mm0 \n\t"
862 "movq 8(%1), %%mm1 \n\t"
863 "movq 8(%1), %%mm2 \n\t"
864 "pand %2, %%mm0 \n\t"
865 "pand %3, %%mm1 \n\t"
866 "pand %4, %%mm2 \n\t"
867 "psllq $5, %%mm0 \n\t"
868 "psrlq $1, %%mm2 \n\t"
869 "pmulhw "MANGLE(mul15_mid)
", %%mm0 \n\t"
870 "pmulhw "MANGLE(mul16_mid)
", %%mm1 \n\t"
871 "pmulhw "MANGLE(mul15_hi)
", %%mm2 \n\t"
872 "movq %%mm0, %%mm3 \n\t"
873 "movq %%mm1, %%mm4 \n\t"
874 "movq %%mm2, %%mm5 \n\t"
875 "punpcklwd %5, %%mm0 \n\t"
876 "punpcklwd %5, %%mm1 \n\t"
877 "punpcklwd %5, %%mm2 \n\t"
878 "punpckhwd %5, %%mm3 \n\t"
879 "punpckhwd %5, %%mm4 \n\t"
880 "punpckhwd %5, %%mm5 \n\t"
881 "psllq $8, %%mm1 \n\t"
882 "psllq $16, %%mm2 \n\t"
883 "por %%mm1, %%mm0 \n\t"
884 "por %%mm2, %%mm0 \n\t"
885 "psllq $8, %%mm4 \n\t"
886 "psllq $16, %%mm5 \n\t"
887 "por %%mm4, %%mm3 \n\t"
888 "por %%mm5, %%mm3 \n\t"
890 :
"r"(s),
"m"(mask16b),
"m"(mask16g),
"m"(mask16r),
"m"(mmx_null)
894 "movq %%mm0, %%mm4 \n\t"
895 "movq %%mm3, %%mm5 \n\t"
896 "movq %%mm6, %%mm0 \n\t"
897 "movq %%mm7, %%mm1 \n\t"
899 "movq %%mm4, %%mm6 \n\t"
900 "movq %%mm5, %%mm7 \n\t"
901 "movq %%mm0, %%mm2 \n\t"
902 "movq %%mm1, %%mm3 \n\t"
911 __asm__
volatile(
SFENCE:::
"memory");
912 __asm__
volatile(
EMMS:::
"memory");
914 register uint16_t bgr;
916 *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
917 *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
918 *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
930 "packuswb %%mm7, %%mm0 \n\t" \
931 "packuswb %%mm7, %%mm1 \n\t" \
932 "packuswb %%mm7, %%mm2 \n\t" \
933 "punpcklbw %%mm1, %%mm0 \n\t" \
934 "punpcklbw %%mm6, %%mm2 \n\t" \
935 "movq %%mm0, %%mm3 \n\t" \
936 "punpcklwd %%mm2, %%mm0 \n\t" \
937 "punpckhwd %%mm2, %%mm3 \n\t" \
938 MOVNTQ" %%mm0, (%0) \n\t" \
939 MOVNTQ" %%mm3, 8(%0) \n\t" \
944 const uint16_t *mm_end;
946 const uint16_t *s = (
const uint16_t *)src;
947 end = s + src_size/2;
948 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
949 __asm__
volatile(
"pxor %%mm7,%%mm7 \n\t":::
"memory");
950 __asm__
volatile(
"pcmpeqd %%mm6,%%mm6 \n\t":::
"memory");
955 "movq (%1), %%mm0 \n\t"
956 "movq (%1), %%mm1 \n\t"
957 "movq (%1), %%mm2 \n\t"
958 "pand %2, %%mm0 \n\t"
959 "pand %3, %%mm1 \n\t"
960 "pand %4, %%mm2 \n\t"
961 "psllq $5, %%mm0 \n\t"
962 "pmulhw %5, %%mm0 \n\t"
963 "pmulhw %5, %%mm1 \n\t"
964 "pmulhw "MANGLE(mul15_hi)
", %%mm2 \n\t"
966 ::
"r"(d),
"r"(s),
"m"(mask15b),
"m"(mask15g),
"m"(mask15r) ,
"m"(mul15_mid)
971 __asm__
volatile(
SFENCE:::
"memory");
972 __asm__
volatile(
EMMS:::
"memory");
974 register uint16_t bgr;
976 *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
977 *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
978 *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
986 const uint16_t *mm_end;
988 const uint16_t *s = (
const uint16_t*)src;
989 end = s + src_size/2;
990 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
991 __asm__
volatile(
"pxor %%mm7,%%mm7 \n\t":::
"memory");
992 __asm__
volatile(
"pcmpeqd %%mm6,%%mm6 \n\t":::
"memory");
997 "movq (%1), %%mm0 \n\t"
998 "movq (%1), %%mm1 \n\t"
999 "movq (%1), %%mm2 \n\t"
1000 "pand %2, %%mm0 \n\t"
1001 "pand %3, %%mm1 \n\t"
1002 "pand %4, %%mm2 \n\t"
1003 "psllq $5, %%mm0 \n\t"
1004 "psrlq $1, %%mm2 \n\t"
1005 "pmulhw %5, %%mm0 \n\t"
1006 "pmulhw "MANGLE(mul16_mid)
", %%mm1 \n\t"
1007 "pmulhw "MANGLE(mul15_hi)
", %%mm2 \n\t"
1009 ::
"r"(d),
"r"(s),
"m"(mask16b),
"m"(mask16g),
"m"(mask16r),
"m"(mul15_mid)
1014 __asm__
volatile(
SFENCE:::
"memory");
1015 __asm__
volatile(
EMMS:::
"memory");
1017 register uint16_t bgr;
1019 *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
1020 *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
1021 *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
1035 "movq %3, %%mm7 \n\t"
1036 "pxor %4, %%mm7 \n\t"
1037 "movq %%mm7, %%mm6 \n\t"
1038 "pxor %5, %%mm7 \n\t"
1042 "movq (%1, %0), %%mm0 \n\t"
1043 "movq 8(%1, %0), %%mm1 \n\t"
1044 # if COMPILE_TEMPLATE_MMXEXT
1045 "pshufw $177, %%mm0, %%mm3 \n\t"
1046 "pshufw $177, %%mm1, %%mm5 \n\t"
1047 "pand %%mm7, %%mm0 \n\t"
1048 "pand %%mm6, %%mm3 \n\t"
1049 "pand %%mm7, %%mm1 \n\t"
1050 "pand %%mm6, %%mm5 \n\t"
1051 "por %%mm3, %%mm0 \n\t"
1052 "por %%mm5, %%mm1 \n\t"
1054 "movq %%mm0, %%mm2 \n\t"
1055 "movq %%mm1, %%mm4 \n\t"
1056 "pand %%mm7, %%mm0 \n\t"
1057 "pand %%mm6, %%mm2 \n\t"
1058 "pand %%mm7, %%mm1 \n\t"
1059 "pand %%mm6, %%mm4 \n\t"
1060 "movq %%mm2, %%mm3 \n\t"
1061 "movq %%mm4, %%mm5 \n\t"
1062 "pslld $16, %%mm2 \n\t"
1063 "psrld $16, %%mm3 \n\t"
1064 "pslld $16, %%mm4 \n\t"
1065 "psrld $16, %%mm5 \n\t"
1066 "por %%mm2, %%mm0 \n\t"
1067 "por %%mm4, %%mm1 \n\t"
1068 "por %%mm3, %%mm0 \n\t"
1069 "por %%mm5, %%mm1 \n\t"
1071 MOVNTQ" %%mm0, (%2, %0) \n\t"
1072 MOVNTQ" %%mm1, 8(%2, %0) \n\t"
1079 :
"r" (s),
"r" (d),
"m" (mask32b),
"m" (mask32r),
"m" (mmx_one)
1081 for (; idx<15; idx+=4) {
1082 register int v = *(
const uint32_t *)&s[idx],
g = v & 0xff00ff00;
1084 *(uint32_t *)&d[idx] = (v>>16) +
g + (v<<16);
1091 x86_reg mmx_size= 23 - src_size;
1093 "test %%"REG_a
", %%"REG_a
" \n\t"
1095 "movq "MANGLE(mask24r)
", %%mm5 \n\t"
1096 "movq "MANGLE(mask24g)
", %%mm6 \n\t"
1097 "movq "MANGLE(mask24b)
", %%mm7 \n\t"
1101 "movq (%1, %%"REG_a
"), %%mm0 \n\t"
1102 "movq (%1, %%"REG_a
"), %%mm1 \n\t"
1103 "movq 2(%1, %%"REG_a
"), %%mm2 \n\t"
1104 "psllq $16, %%mm0 \n\t"
1105 "pand %%mm5, %%mm0 \n\t"
1106 "pand %%mm6, %%mm1 \n\t"
1107 "pand %%mm7, %%mm2 \n\t"
1108 "por %%mm0, %%mm1 \n\t"
1109 "por %%mm2, %%mm1 \n\t"
1110 "movq 6(%1, %%"REG_a
"), %%mm0 \n\t"
1111 MOVNTQ" %%mm1, (%2, %%"REG_a
") \n\t"
1112 "movq 8(%1, %%"REG_a
"), %%mm1 \n\t"
1113 "movq 10(%1, %%"REG_a
"), %%mm2 \n\t"
1114 "pand %%mm7, %%mm0 \n\t"
1115 "pand %%mm5, %%mm1 \n\t"
1116 "pand %%mm6, %%mm2 \n\t"
1117 "por %%mm0, %%mm1 \n\t"
1118 "por %%mm2, %%mm1 \n\t"
1119 "movq 14(%1, %%"REG_a
"), %%mm0 \n\t"
1120 MOVNTQ" %%mm1, 8(%2, %%"REG_a
") \n\t"
1121 "movq 16(%1, %%"REG_a
"), %%mm1 \n\t"
1122 "movq 18(%1, %%"REG_a
"), %%mm2 \n\t"
1123 "pand %%mm6, %%mm0 \n\t"
1124 "pand %%mm7, %%mm1 \n\t"
1125 "pand %%mm5, %%mm2 \n\t"
1126 "por %%mm0, %%mm1 \n\t"
1127 "por %%mm2, %%mm1 \n\t"
1128 MOVNTQ" %%mm1, 16(%2, %%"REG_a
") \n\t"
1129 "add $24, %%"REG_a
" \n\t"
1133 :
"r" (src-mmx_size),
"r"(
dst-mmx_size)
1136 __asm__
volatile(
SFENCE:::
"memory");
1137 __asm__
volatile(
EMMS:::
"memory");
1139 if (mmx_size==23)
return;
1143 src_size= 23-mmx_size;
1146 for (i=0; i<src_size; i+=3) {
1149 dst[i + 1] = src[i + 1];
1150 dst[i + 2] = src[i + 0];
1157 int lumStride,
int chromStride,
int dstStride,
int vertLumPerChroma)
1161 for (y=0; y<
height; y++) {
1164 "xor %%"REG_a
", %%"REG_a
" \n\t"
1167 PREFETCH" 32(%1, %%"REG_a
", 2) \n\t"
1170 "movq (%2, %%"REG_a
"), %%mm0 \n\t"
1171 "movq %%mm0, %%mm2 \n\t"
1172 "movq (%3, %%"REG_a
"), %%mm1 \n\t"
1173 "punpcklbw %%mm1, %%mm0 \n\t"
1174 "punpckhbw %%mm1, %%mm2 \n\t"
1176 "movq (%1, %%"REG_a
",2), %%mm3 \n\t"
1177 "movq 8(%1, %%"REG_a
",2), %%mm5 \n\t"
1178 "movq %%mm3, %%mm4 \n\t"
1179 "movq %%mm5, %%mm6 \n\t"
1180 "punpcklbw %%mm0, %%mm3 \n\t"
1181 "punpckhbw %%mm0, %%mm4 \n\t"
1182 "punpcklbw %%mm2, %%mm5 \n\t"
1183 "punpckhbw %%mm2, %%mm6 \n\t"
1185 MOVNTQ" %%mm3, (%0, %%"REG_a
", 4) \n\t"
1186 MOVNTQ" %%mm4, 8(%0, %%"REG_a
", 4) \n\t"
1187 MOVNTQ" %%mm5, 16(%0, %%"REG_a
", 4) \n\t"
1188 MOVNTQ" %%mm6, 24(%0, %%"REG_a
", 4) \n\t"
1190 "add $8, %%"REG_a
" \n\t"
1191 "cmp %4, %%"REG_a
" \n\t"
1193 ::
"r"(
dst),
"r"(ysrc),
"r"(usrc),
"r"(vsrc),
"g" (chromWidth)
1196 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1197 usrc += chromStride;
1198 vsrc += chromStride;
1214 int lumStride,
int chromStride,
int dstStride)
1222 int lumStride,
int chromStride,
int dstStride,
int vertLumPerChroma)
1226 for (y=0; y<
height; y++) {
1229 "xor %%"REG_a
", %%"REG_a
" \n\t"
1232 PREFETCH" 32(%1, %%"REG_a
", 2) \n\t"
1235 "movq (%2, %%"REG_a
"), %%mm0 \n\t"
1236 "movq %%mm0, %%mm2 \n\t"
1237 "movq (%3, %%"REG_a
"), %%mm1 \n\t"
1238 "punpcklbw %%mm1, %%mm0 \n\t"
1239 "punpckhbw %%mm1, %%mm2 \n\t"
1241 "movq (%1, %%"REG_a
",2), %%mm3 \n\t"
1242 "movq 8(%1, %%"REG_a
",2), %%mm5 \n\t"
1243 "movq %%mm0, %%mm4 \n\t"
1244 "movq %%mm2, %%mm6 \n\t"
1245 "punpcklbw %%mm3, %%mm0 \n\t"
1246 "punpckhbw %%mm3, %%mm4 \n\t"
1247 "punpcklbw %%mm5, %%mm2 \n\t"
1248 "punpckhbw %%mm5, %%mm6 \n\t"
1250 MOVNTQ" %%mm0, (%0, %%"REG_a
", 4) \n\t"
1251 MOVNTQ" %%mm4, 8(%0, %%"REG_a
", 4) \n\t"
1252 MOVNTQ" %%mm2, 16(%0, %%"REG_a
", 4) \n\t"
1253 MOVNTQ" %%mm6, 24(%0, %%"REG_a
", 4) \n\t"
1255 "add $8, %%"REG_a
" \n\t"
1256 "cmp %4, %%"REG_a
" \n\t"
1258 ::
"r"(
dst),
"r"(ysrc),
"r"(usrc),
"r"(vsrc),
"g" (chromWidth)
1261 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1262 usrc += chromStride;
1263 vsrc += chromStride;
1279 int lumStride,
int chromStride,
int dstStride)
1290 int lumStride,
int chromStride,
int dstStride)
1300 int lumStride,
int chromStride,
int dstStride)
1311 int lumStride,
int chromStride,
int srcStride)
1315 for (y=0; y<
height; y+=2) {
1317 "xor %%"REG_a
", %%"REG_a
" \n\t"
1318 "pcmpeqw %%mm7, %%mm7 \n\t"
1319 "psrlw $8, %%mm7 \n\t"
1322 PREFETCH" 64(%0, %%"REG_a
", 4) \n\t"
1323 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t"
1324 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t"
1325 "movq %%mm0, %%mm2 \n\t"
1326 "movq %%mm1, %%mm3 \n\t"
1327 "psrlw $8, %%mm0 \n\t"
1328 "psrlw $8, %%mm1 \n\t"
1329 "pand %%mm7, %%mm2 \n\t"
1330 "pand %%mm7, %%mm3 \n\t"
1331 "packuswb %%mm1, %%mm0 \n\t"
1332 "packuswb %%mm3, %%mm2 \n\t"
1334 MOVNTQ" %%mm2, (%1, %%"REG_a
", 2) \n\t"
1336 "movq 16(%0, %%"REG_a
", 4), %%mm1 \n\t"
1337 "movq 24(%0, %%"REG_a
", 4), %%mm2 \n\t"
1338 "movq %%mm1, %%mm3 \n\t"
1339 "movq %%mm2, %%mm4 \n\t"
1340 "psrlw $8, %%mm1 \n\t"
1341 "psrlw $8, %%mm2 \n\t"
1342 "pand %%mm7, %%mm3 \n\t"
1343 "pand %%mm7, %%mm4 \n\t"
1344 "packuswb %%mm2, %%mm1 \n\t"
1345 "packuswb %%mm4, %%mm3 \n\t"
1347 MOVNTQ" %%mm3, 8(%1, %%"REG_a
", 2) \n\t"
1349 "movq %%mm0, %%mm2 \n\t"
1350 "movq %%mm1, %%mm3 \n\t"
1351 "psrlw $8, %%mm0 \n\t"
1352 "psrlw $8, %%mm1 \n\t"
1353 "pand %%mm7, %%mm2 \n\t"
1354 "pand %%mm7, %%mm3 \n\t"
1355 "packuswb %%mm1, %%mm0 \n\t"
1356 "packuswb %%mm3, %%mm2 \n\t"
1358 MOVNTQ" %%mm0, (%3, %%"REG_a
") \n\t"
1359 MOVNTQ" %%mm2, (%2, %%"REG_a
") \n\t"
1361 "add $8, %%"REG_a
" \n\t"
1362 "cmp %4, %%"REG_a
" \n\t"
1364 ::
"r"(src),
"r"(ydst),
"r"(udst),
"r"(vdst),
"g" (chromWidth)
1365 :
"memory",
"%"REG_a
1372 "xor %%"REG_a
", %%"REG_a
" \n\t"
1375 PREFETCH" 64(%0, %%"REG_a
", 4) \n\t"
1376 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t"
1377 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t"
1378 "movq 16(%0, %%"REG_a
", 4), %%mm2 \n\t"
1379 "movq 24(%0, %%"REG_a
", 4), %%mm3 \n\t"
1380 "pand %%mm7, %%mm0 \n\t"
1381 "pand %%mm7, %%mm1 \n\t"
1382 "pand %%mm7, %%mm2 \n\t"
1383 "pand %%mm7, %%mm3 \n\t"
1384 "packuswb %%mm1, %%mm0 \n\t"
1385 "packuswb %%mm3, %%mm2 \n\t"
1387 MOVNTQ" %%mm0, (%1, %%"REG_a
", 2) \n\t"
1388 MOVNTQ" %%mm2, 8(%1, %%"REG_a
", 2) \n\t"
1390 "add $8, %%"REG_a
" \n\t"
1391 "cmp %4, %%"REG_a
" \n\t"
1394 ::
"r"(src),
"r"(ydst),
"r"(udst),
"r"(vdst),
"g" (chromWidth)
1395 :
"memory",
"%"REG_a
1397 udst += chromStride;
1398 vdst += chromStride;
1402 __asm__
volatile(
EMMS" \n\t"
1408 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
1416 for (x=0; x<srcWidth-1; x++) {
1417 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1418 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1420 dst[2*srcWidth-1]= src[srcWidth-1];
1424 for (y=1; y<srcHeight; y++) {
1425 const x86_reg mmxSize= srcWidth&~15;
1427 "mov %4, %%"REG_a
" \n\t"
1428 "movq "MANGLE(mmx_ff)
", %%mm0 \n\t"
1429 "movq (%0, %%"REG_a
"), %%mm4 \n\t"
1430 "movq %%mm4, %%mm2 \n\t"
1431 "psllq $8, %%mm4 \n\t"
1432 "pand %%mm0, %%mm2 \n\t"
1433 "por %%mm2, %%mm4 \n\t"
1434 "movq (%1, %%"REG_a
"), %%mm5 \n\t"
1435 "movq %%mm5, %%mm3 \n\t"
1436 "psllq $8, %%mm5 \n\t"
1437 "pand %%mm0, %%mm3 \n\t"
1438 "por %%mm3, %%mm5 \n\t"
1440 "movq (%0, %%"REG_a
"), %%mm0 \n\t"
1441 "movq (%1, %%"REG_a
"), %%mm1 \n\t"
1442 "movq 1(%0, %%"REG_a
"), %%mm2 \n\t"
1443 "movq 1(%1, %%"REG_a
"), %%mm3 \n\t"
1444 PAVGB" %%mm0, %%mm5 \n\t"
1445 PAVGB" %%mm0, %%mm3 \n\t"
1446 PAVGB" %%mm0, %%mm5 \n\t"
1447 PAVGB" %%mm0, %%mm3 \n\t"
1448 PAVGB" %%mm1, %%mm4 \n\t"
1449 PAVGB" %%mm1, %%mm2 \n\t"
1450 PAVGB" %%mm1, %%mm4 \n\t"
1451 PAVGB" %%mm1, %%mm2 \n\t"
1452 "movq %%mm5, %%mm7 \n\t"
1453 "movq %%mm4, %%mm6 \n\t"
1454 "punpcklbw %%mm3, %%mm5 \n\t"
1455 "punpckhbw %%mm3, %%mm7 \n\t"
1456 "punpcklbw %%mm2, %%mm4 \n\t"
1457 "punpckhbw %%mm2, %%mm6 \n\t"
1458 MOVNTQ" %%mm5, (%2, %%"REG_a
", 2) \n\t"
1459 MOVNTQ" %%mm7, 8(%2, %%"REG_a
", 2) \n\t"
1460 MOVNTQ" %%mm4, (%3, %%"REG_a
", 2) \n\t"
1461 MOVNTQ" %%mm6, 8(%3, %%"REG_a
", 2) \n\t"
1462 "add $8, %%"REG_a
" \n\t"
1463 "movq -1(%0, %%"REG_a
"), %%mm4 \n\t"
1464 "movq -1(%1, %%"REG_a
"), %%mm5 \n\t"
1466 ::
"r" (src + mmxSize ),
"r" (src + srcStride + mmxSize ),
1467 "r" (
dst + mmxSize*2),
"r" (
dst + dstStride + mmxSize*2),
1472 for (x=mmxSize-1; x<srcWidth-1; x++) {
1473 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1474 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1475 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1476 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1478 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1479 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1488 for (x=0; x<srcWidth-1; x++) {
1489 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1490 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1492 dst[2*srcWidth-1]= src[srcWidth-1];
1494 __asm__
volatile(
EMMS" \n\t"
1500 #if !COMPILE_TEMPLATE_AMD3DNOW
1509 int lumStride,
int chromStride,
int srcStride)
1512 const x86_reg chromWidth= width>>1;
1513 for (y=0; y<
height; y+=2) {
1515 "xor %%"REG_a
", %%"REG_a
" \n\t"
1516 "pcmpeqw %%mm7, %%mm7 \n\t"
1517 "psrlw $8, %%mm7 \n\t"
1520 PREFETCH" 64(%0, %%"REG_a
", 4) \n\t"
1521 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t"
1522 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t"
1523 "movq %%mm0, %%mm2 \n\t"
1524 "movq %%mm1, %%mm3 \n\t"
1525 "pand %%mm7, %%mm0 \n\t"
1526 "pand %%mm7, %%mm1 \n\t"
1527 "psrlw $8, %%mm2 \n\t"
1528 "psrlw $8, %%mm3 \n\t"
1529 "packuswb %%mm1, %%mm0 \n\t"
1530 "packuswb %%mm3, %%mm2 \n\t"
1532 MOVNTQ" %%mm2, (%1, %%"REG_a
", 2) \n\t"
1534 "movq 16(%0, %%"REG_a
", 4), %%mm1 \n\t"
1535 "movq 24(%0, %%"REG_a
", 4), %%mm2 \n\t"
1536 "movq %%mm1, %%mm3 \n\t"
1537 "movq %%mm2, %%mm4 \n\t"
1538 "pand %%mm7, %%mm1 \n\t"
1539 "pand %%mm7, %%mm2 \n\t"
1540 "psrlw $8, %%mm3 \n\t"
1541 "psrlw $8, %%mm4 \n\t"
1542 "packuswb %%mm2, %%mm1 \n\t"
1543 "packuswb %%mm4, %%mm3 \n\t"
1545 MOVNTQ" %%mm3, 8(%1, %%"REG_a
", 2) \n\t"
1547 "movq %%mm0, %%mm2 \n\t"
1548 "movq %%mm1, %%mm3 \n\t"
1549 "psrlw $8, %%mm0 \n\t"
1550 "psrlw $8, %%mm1 \n\t"
1551 "pand %%mm7, %%mm2 \n\t"
1552 "pand %%mm7, %%mm3 \n\t"
1553 "packuswb %%mm1, %%mm0 \n\t"
1554 "packuswb %%mm3, %%mm2 \n\t"
1556 MOVNTQ" %%mm0, (%3, %%"REG_a
") \n\t"
1557 MOVNTQ" %%mm2, (%2, %%"REG_a
") \n\t"
1559 "add $8, %%"REG_a
" \n\t"
1560 "cmp %4, %%"REG_a
" \n\t"
1562 ::
"r"(src),
"r"(ydst),
"r"(udst),
"r"(vdst),
"g" (chromWidth)
1563 :
"memory",
"%"REG_a
1570 "xor %%"REG_a
", %%"REG_a
" \n\t"
1573 PREFETCH" 64(%0, %%"REG_a
", 4) \n\t"
1574 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t"
1575 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t"
1576 "movq 16(%0, %%"REG_a
", 4), %%mm2 \n\t"
1577 "movq 24(%0, %%"REG_a
", 4), %%mm3 \n\t"
1578 "psrlw $8, %%mm0 \n\t"
1579 "psrlw $8, %%mm1 \n\t"
1580 "psrlw $8, %%mm2 \n\t"
1581 "psrlw $8, %%mm3 \n\t"
1582 "packuswb %%mm1, %%mm0 \n\t"
1583 "packuswb %%mm3, %%mm2 \n\t"
1585 MOVNTQ" %%mm0, (%1, %%"REG_a
", 2) \n\t"
1586 MOVNTQ" %%mm2, 8(%1, %%"REG_a
", 2) \n\t"
1588 "add $8, %%"REG_a
" \n\t"
1589 "cmp %4, %%"REG_a
" \n\t"
1592 ::
"r"(src),
"r"(ydst),
"r"(udst),
"r"(vdst),
"g" (chromWidth)
1593 :
"memory",
"%"REG_a
1595 udst += chromStride;
1596 vdst += chromStride;
1600 __asm__
volatile(
EMMS" \n\t"
1615 int lumStride,
int chromStride,
int srcStride)
1618 const x86_reg chromWidth= width>>1;
1621 rgb24toyv12_c(src, ydst, udst, vdst, width, 2, lumStride, chromStride, srcStride);
1623 ydst += 2*lumStride;
1624 udst += chromStride;
1625 vdst += chromStride;
1629 for (y=0; y<height-2; y+=2) {
1631 for (i=0; i<2; i++) {
1633 "mov %2, %%"REG_a
" \n\t"
1634 "movq "MANGLE(ff_bgr2YCoeff)
", %%mm6 \n\t"
1635 "movq "MANGLE(ff_w1111)
", %%mm5 \n\t"
1636 "pxor %%mm7, %%mm7 \n\t"
1637 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_d
" \n\t"
1641 "movd (%0, %%"REG_d
"), %%mm0 \n\t"
1642 "movd 3(%0, %%"REG_d
"), %%mm1 \n\t"
1643 "punpcklbw %%mm7, %%mm0 \n\t"
1644 "punpcklbw %%mm7, %%mm1 \n\t"
1645 "movd 6(%0, %%"REG_d
"), %%mm2 \n\t"
1646 "movd 9(%0, %%"REG_d
"), %%mm3 \n\t"
1647 "punpcklbw %%mm7, %%mm2 \n\t"
1648 "punpcklbw %%mm7, %%mm3 \n\t"
1649 "pmaddwd %%mm6, %%mm0 \n\t"
1650 "pmaddwd %%mm6, %%mm1 \n\t"
1651 "pmaddwd %%mm6, %%mm2 \n\t"
1652 "pmaddwd %%mm6, %%mm3 \n\t"
1653 #ifndef FAST_BGR2YV12
1654 "psrad $8, %%mm0 \n\t"
1655 "psrad $8, %%mm1 \n\t"
1656 "psrad $8, %%mm2 \n\t"
1657 "psrad $8, %%mm3 \n\t"
1659 "packssdw %%mm1, %%mm0 \n\t"
1660 "packssdw %%mm3, %%mm2 \n\t"
1661 "pmaddwd %%mm5, %%mm0 \n\t"
1662 "pmaddwd %%mm5, %%mm2 \n\t"
1663 "packssdw %%mm2, %%mm0 \n\t"
1664 "psraw $7, %%mm0 \n\t"
1666 "movd 12(%0, %%"REG_d
"), %%mm4 \n\t"
1667 "movd 15(%0, %%"REG_d
"), %%mm1 \n\t"
1668 "punpcklbw %%mm7, %%mm4 \n\t"
1669 "punpcklbw %%mm7, %%mm1 \n\t"
1670 "movd 18(%0, %%"REG_d
"), %%mm2 \n\t"
1671 "movd 21(%0, %%"REG_d
"), %%mm3 \n\t"
1672 "punpcklbw %%mm7, %%mm2 \n\t"
1673 "punpcklbw %%mm7, %%mm3 \n\t"
1674 "pmaddwd %%mm6, %%mm4 \n\t"
1675 "pmaddwd %%mm6, %%mm1 \n\t"
1676 "pmaddwd %%mm6, %%mm2 \n\t"
1677 "pmaddwd %%mm6, %%mm3 \n\t"
1678 #ifndef FAST_BGR2YV12
1679 "psrad $8, %%mm4 \n\t"
1680 "psrad $8, %%mm1 \n\t"
1681 "psrad $8, %%mm2 \n\t"
1682 "psrad $8, %%mm3 \n\t"
1684 "packssdw %%mm1, %%mm4 \n\t"
1685 "packssdw %%mm3, %%mm2 \n\t"
1686 "pmaddwd %%mm5, %%mm4 \n\t"
1687 "pmaddwd %%mm5, %%mm2 \n\t"
1688 "add $24, %%"REG_d
" \n\t"
1689 "packssdw %%mm2, %%mm4 \n\t"
1690 "psraw $7, %%mm4 \n\t"
1692 "packuswb %%mm4, %%mm0 \n\t"
1693 "paddusb "MANGLE(ff_bgr2YOffset)
", %%mm0 \n\t"
1695 MOVNTQ" %%mm0, (%1, %%"REG_a
") \n\t"
1696 "add $8, %%"REG_a
" \n\t"
1698 : :
"r" (src+width*3),
"r" (ydst+width),
"g" ((
x86_reg)-width)
1699 :
"%"REG_a,
"%"REG_d
1706 "mov %4, %%"REG_a
" \n\t"
1707 "movq "MANGLE(ff_w1111)
", %%mm5 \n\t"
1708 "movq "MANGLE(ff_bgr2UCoeff)
", %%mm6 \n\t"
1709 "pxor %%mm7, %%mm7 \n\t"
1710 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_d
" \n\t"
1711 "add %%"REG_d
", %%"REG_d
" \n\t"
1716 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
1717 "movq (%0, %%"REG_d
"), %%mm0 \n\t"
1718 "movq (%1, %%"REG_d
"), %%mm1 \n\t"
1719 "movq 6(%0, %%"REG_d
"), %%mm2 \n\t"
1720 "movq 6(%1, %%"REG_d
"), %%mm3 \n\t"
1721 PAVGB" %%mm1, %%mm0 \n\t"
1722 PAVGB" %%mm3, %%mm2 \n\t"
1723 "movq %%mm0, %%mm1 \n\t"
1724 "movq %%mm2, %%mm3 \n\t"
1725 "psrlq $24, %%mm0 \n\t"
1726 "psrlq $24, %%mm2 \n\t"
1727 PAVGB" %%mm1, %%mm0 \n\t"
1728 PAVGB" %%mm3, %%mm2 \n\t"
1729 "punpcklbw %%mm7, %%mm0 \n\t"
1730 "punpcklbw %%mm7, %%mm2 \n\t"
1732 "movd (%0, %%"REG_d
"), %%mm0 \n\t"
1733 "movd (%1, %%"REG_d
"), %%mm1 \n\t"
1734 "movd 3(%0, %%"REG_d
"), %%mm2 \n\t"
1735 "movd 3(%1, %%"REG_d
"), %%mm3 \n\t"
1736 "punpcklbw %%mm7, %%mm0 \n\t"
1737 "punpcklbw %%mm7, %%mm1 \n\t"
1738 "punpcklbw %%mm7, %%mm2 \n\t"
1739 "punpcklbw %%mm7, %%mm3 \n\t"
1740 "paddw %%mm1, %%mm0 \n\t"
1741 "paddw %%mm3, %%mm2 \n\t"
1742 "paddw %%mm2, %%mm0 \n\t"
1743 "movd 6(%0, %%"REG_d
"), %%mm4 \n\t"
1744 "movd 6(%1, %%"REG_d
"), %%mm1 \n\t"
1745 "movd 9(%0, %%"REG_d
"), %%mm2 \n\t"
1746 "movd 9(%1, %%"REG_d
"), %%mm3 \n\t"
1747 "punpcklbw %%mm7, %%mm4 \n\t"
1748 "punpcklbw %%mm7, %%mm1 \n\t"
1749 "punpcklbw %%mm7, %%mm2 \n\t"
1750 "punpcklbw %%mm7, %%mm3 \n\t"
1751 "paddw %%mm1, %%mm4 \n\t"
1752 "paddw %%mm3, %%mm2 \n\t"
1753 "paddw %%mm4, %%mm2 \n\t"
1754 "psrlw $2, %%mm0 \n\t"
1755 "psrlw $2, %%mm2 \n\t"
1757 "movq "MANGLE(ff_bgr2VCoeff)
", %%mm1 \n\t"
1758 "movq "MANGLE(ff_bgr2VCoeff)
", %%mm3 \n\t"
1760 "pmaddwd %%mm0, %%mm1 \n\t"
1761 "pmaddwd %%mm2, %%mm3 \n\t"
1762 "pmaddwd %%mm6, %%mm0 \n\t"
1763 "pmaddwd %%mm6, %%mm2 \n\t"
1764 #ifndef FAST_BGR2YV12
1765 "psrad $8, %%mm0 \n\t"
1766 "psrad $8, %%mm1 \n\t"
1767 "psrad $8, %%mm2 \n\t"
1768 "psrad $8, %%mm3 \n\t"
1770 "packssdw %%mm2, %%mm0 \n\t"
1771 "packssdw %%mm3, %%mm1 \n\t"
1772 "pmaddwd %%mm5, %%mm0 \n\t"
1773 "pmaddwd %%mm5, %%mm1 \n\t"
1774 "packssdw %%mm1, %%mm0 \n\t"
1775 "psraw $7, %%mm0 \n\t"
1777 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
1778 "movq 12(%0, %%"REG_d
"), %%mm4 \n\t"
1779 "movq 12(%1, %%"REG_d
"), %%mm1 \n\t"
1780 "movq 18(%0, %%"REG_d
"), %%mm2 \n\t"
1781 "movq 18(%1, %%"REG_d
"), %%mm3 \n\t"
1782 PAVGB" %%mm1, %%mm4 \n\t"
1783 PAVGB" %%mm3, %%mm2 \n\t"
1784 "movq %%mm4, %%mm1 \n\t"
1785 "movq %%mm2, %%mm3 \n\t"
1786 "psrlq $24, %%mm4 \n\t"
1787 "psrlq $24, %%mm2 \n\t"
1788 PAVGB" %%mm1, %%mm4 \n\t"
1789 PAVGB" %%mm3, %%mm2 \n\t"
1790 "punpcklbw %%mm7, %%mm4 \n\t"
1791 "punpcklbw %%mm7, %%mm2 \n\t"
1793 "movd 12(%0, %%"REG_d
"), %%mm4 \n\t"
1794 "movd 12(%1, %%"REG_d
"), %%mm1 \n\t"
1795 "movd 15(%0, %%"REG_d
"), %%mm2 \n\t"
1796 "movd 15(%1, %%"REG_d
"), %%mm3 \n\t"
1797 "punpcklbw %%mm7, %%mm4 \n\t"
1798 "punpcklbw %%mm7, %%mm1 \n\t"
1799 "punpcklbw %%mm7, %%mm2 \n\t"
1800 "punpcklbw %%mm7, %%mm3 \n\t"
1801 "paddw %%mm1, %%mm4 \n\t"
1802 "paddw %%mm3, %%mm2 \n\t"
1803 "paddw %%mm2, %%mm4 \n\t"
1804 "movd 18(%0, %%"REG_d
"), %%mm5 \n\t"
1805 "movd 18(%1, %%"REG_d
"), %%mm1 \n\t"
1806 "movd 21(%0, %%"REG_d
"), %%mm2 \n\t"
1807 "movd 21(%1, %%"REG_d
"), %%mm3 \n\t"
1808 "punpcklbw %%mm7, %%mm5 \n\t"
1809 "punpcklbw %%mm7, %%mm1 \n\t"
1810 "punpcklbw %%mm7, %%mm2 \n\t"
1811 "punpcklbw %%mm7, %%mm3 \n\t"
1812 "paddw %%mm1, %%mm5 \n\t"
1813 "paddw %%mm3, %%mm2 \n\t"
1814 "paddw %%mm5, %%mm2 \n\t"
1815 "movq "MANGLE(ff_w1111)
", %%mm5 \n\t"
1816 "psrlw $2, %%mm4 \n\t"
1817 "psrlw $2, %%mm2 \n\t"
1819 "movq "MANGLE(ff_bgr2VCoeff)
", %%mm1 \n\t"
1820 "movq "MANGLE(ff_bgr2VCoeff)
", %%mm3 \n\t"
1822 "pmaddwd %%mm4, %%mm1 \n\t"
1823 "pmaddwd %%mm2, %%mm3 \n\t"
1824 "pmaddwd %%mm6, %%mm4 \n\t"
1825 "pmaddwd %%mm6, %%mm2 \n\t"
1826 #ifndef FAST_BGR2YV12
1827 "psrad $8, %%mm4 \n\t"
1828 "psrad $8, %%mm1 \n\t"
1829 "psrad $8, %%mm2 \n\t"
1830 "psrad $8, %%mm3 \n\t"
1832 "packssdw %%mm2, %%mm4 \n\t"
1833 "packssdw %%mm3, %%mm1 \n\t"
1834 "pmaddwd %%mm5, %%mm4 \n\t"
1835 "pmaddwd %%mm5, %%mm1 \n\t"
1836 "add $24, %%"REG_d
" \n\t"
1837 "packssdw %%mm1, %%mm4 \n\t"
1838 "psraw $7, %%mm4 \n\t"
1840 "movq %%mm0, %%mm1 \n\t"
1841 "punpckldq %%mm4, %%mm0 \n\t"
1842 "punpckhdq %%mm4, %%mm1 \n\t"
1843 "packsswb %%mm1, %%mm0 \n\t"
1844 "paddb "MANGLE(ff_bgr2UVOffset)
", %%mm0 \n\t"
1845 "movd %%mm0, (%2, %%"REG_a
") \n\t"
1846 "punpckhdq %%mm0, %%mm0 \n\t"
1847 "movd %%mm0, (%3, %%"REG_a
") \n\t"
1848 "add $4, %%"REG_a
" \n\t"
1850 : :
"r" (src+chromWidth*6),
"r" (src+srcStride+chromWidth*6),
"r" (udst+chromWidth),
"r" (vdst+chromWidth),
"g" (-chromWidth)
1851 :
"%"REG_a,
"%"REG_d
1854 udst += chromStride;
1855 vdst += chromStride;
1859 __asm__
volatile(
EMMS" \n\t"
1863 rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride);
1867 #if !COMPILE_TEMPLATE_AMD3DNOW
1870 int src2Stride,
int dstStride)
1874 for (h=0; h <
height; h++) {
1878 #if COMPILE_TEMPLATE_SSE2
1880 "xor %%"REG_a
", %%"REG_a
" \n\t"
1884 "movdqa (%1, %%"REG_a
"), %%xmm0 \n\t"
1885 "movdqa (%1, %%"REG_a
"), %%xmm1 \n\t"
1886 "movdqa (%2, %%"REG_a
"), %%xmm2 \n\t"
1887 "punpcklbw %%xmm2, %%xmm0 \n\t"
1888 "punpckhbw %%xmm2, %%xmm1 \n\t"
1889 "movntdq %%xmm0, (%0, %%"REG_a
", 2) \n\t"
1890 "movntdq %%xmm1, 16(%0, %%"REG_a
", 2) \n\t"
1891 "add $16, %%"REG_a
" \n\t"
1892 "cmp %3, %%"REG_a
" \n\t"
1894 ::
"r"(dest),
"r"(src1),
"r"(src2),
"r" ((
x86_reg)width-15)
1895 :
"memory",
"%"REG_a
""
1899 "xor %%"REG_a
", %%"REG_a
" \n\t"
1903 "movq (%1, %%"REG_a
"), %%mm0 \n\t"
1904 "movq 8(%1, %%"REG_a
"), %%mm2 \n\t"
1905 "movq %%mm0, %%mm1 \n\t"
1906 "movq %%mm2, %%mm3 \n\t"
1907 "movq (%2, %%"REG_a
"), %%mm4 \n\t"
1908 "movq 8(%2, %%"REG_a
"), %%mm5 \n\t"
1909 "punpcklbw %%mm4, %%mm0 \n\t"
1910 "punpckhbw %%mm4, %%mm1 \n\t"
1911 "punpcklbw %%mm5, %%mm2 \n\t"
1912 "punpckhbw %%mm5, %%mm3 \n\t"
1913 MOVNTQ" %%mm0, (%0, %%"REG_a
", 2) \n\t"
1914 MOVNTQ" %%mm1, 8(%0, %%"REG_a
", 2) \n\t"
1915 MOVNTQ" %%mm2, 16(%0, %%"REG_a
", 2) \n\t"
1916 MOVNTQ" %%mm3, 24(%0, %%"REG_a
", 2) \n\t"
1917 "add $16, %%"REG_a
" \n\t"
1918 "cmp %3, %%"REG_a
" \n\t"
1920 ::
"r"(dest),
"r"(src1),
"r"(src2),
"r" ((
x86_reg)width-15)
1921 :
"memory",
"%"REG_a
1924 for (w= (width&(~15)); w <
width; w++) {
1925 dest[2*w+0] = src1[w];
1926 dest[2*w+1] = src2[w];
1940 #if !COMPILE_TEMPLATE_SSE2
1941 #if !COMPILE_TEMPLATE_AMD3DNOW
1945 int srcStride1,
int srcStride2,
1946 int dstStride1,
int dstStride2)
1950 w=width/2; h=height/2;
1954 ::
"m"(*(src1+srcStride1)),
"m"(*(src2+srcStride2)):
"memory");
1956 const uint8_t*
s1=src1+srcStride1*(y>>1);
1959 for (;x<w-31;x+=32) {
1962 "movq (%1,%2), %%mm0 \n\t"
1963 "movq 8(%1,%2), %%mm2 \n\t"
1964 "movq 16(%1,%2), %%mm4 \n\t"
1965 "movq 24(%1,%2), %%mm6 \n\t"
1966 "movq %%mm0, %%mm1 \n\t"
1967 "movq %%mm2, %%mm3 \n\t"
1968 "movq %%mm4, %%mm5 \n\t"
1969 "movq %%mm6, %%mm7 \n\t"
1970 "punpcklbw %%mm0, %%mm0 \n\t"
1971 "punpckhbw %%mm1, %%mm1 \n\t"
1972 "punpcklbw %%mm2, %%mm2 \n\t"
1973 "punpckhbw %%mm3, %%mm3 \n\t"
1974 "punpcklbw %%mm4, %%mm4 \n\t"
1975 "punpckhbw %%mm5, %%mm5 \n\t"
1976 "punpcklbw %%mm6, %%mm6 \n\t"
1977 "punpckhbw %%mm7, %%mm7 \n\t"
1978 MOVNTQ" %%mm0, (%0,%2,2) \n\t"
1979 MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
1980 MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
1981 MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
1982 MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
1983 MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
1984 MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
1985 MOVNTQ" %%mm7, 56(%0,%2,2)"
1986 ::
"r"(d),
"r"(s1),
"r"(x)
1989 for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
1992 const uint8_t*
s2=src2+srcStride2*(y>>1);
1995 for (;x<w-31;x+=32) {
1998 "movq (%1,%2), %%mm0 \n\t"
1999 "movq 8(%1,%2), %%mm2 \n\t"
2000 "movq 16(%1,%2), %%mm4 \n\t"
2001 "movq 24(%1,%2), %%mm6 \n\t"
2002 "movq %%mm0, %%mm1 \n\t"
2003 "movq %%mm2, %%mm3 \n\t"
2004 "movq %%mm4, %%mm5 \n\t"
2005 "movq %%mm6, %%mm7 \n\t"
2006 "punpcklbw %%mm0, %%mm0 \n\t"
2007 "punpckhbw %%mm1, %%mm1 \n\t"
2008 "punpcklbw %%mm2, %%mm2 \n\t"
2009 "punpckhbw %%mm3, %%mm3 \n\t"
2010 "punpcklbw %%mm4, %%mm4 \n\t"
2011 "punpckhbw %%mm5, %%mm5 \n\t"
2012 "punpcklbw %%mm6, %%mm6 \n\t"
2013 "punpckhbw %%mm7, %%mm7 \n\t"
2014 MOVNTQ" %%mm0, (%0,%2,2) \n\t"
2015 MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
2016 MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
2017 MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
2018 MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
2019 MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
2020 MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
2021 MOVNTQ" %%mm7, 56(%0,%2,2)"
2022 ::
"r"(d),
"r"(s2),
"r"(x)
2025 for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2037 int srcStride1,
int srcStride2,
2038 int srcStride3,
int dstStride)
2044 const uint8_t* yp=src1+srcStride1*
y;
2045 const uint8_t* up=src2+srcStride2*(y>>2);
2046 const uint8_t* vp=src3+srcStride3*(y>>2);
2054 "movq (%1, %0, 4), %%mm0 \n\t"
2055 "movq (%2, %0), %%mm1 \n\t"
2056 "movq (%3, %0), %%mm2 \n\t"
2057 "movq %%mm0, %%mm3 \n\t"
2058 "movq %%mm1, %%mm4 \n\t"
2059 "movq %%mm2, %%mm5 \n\t"
2060 "punpcklbw %%mm1, %%mm1 \n\t"
2061 "punpcklbw %%mm2, %%mm2 \n\t"
2062 "punpckhbw %%mm4, %%mm4 \n\t"
2063 "punpckhbw %%mm5, %%mm5 \n\t"
2065 "movq %%mm1, %%mm6 \n\t"
2066 "punpcklbw %%mm2, %%mm1 \n\t"
2067 "punpcklbw %%mm1, %%mm0 \n\t"
2068 "punpckhbw %%mm1, %%mm3 \n\t"
2069 MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
2070 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
2072 "punpckhbw %%mm2, %%mm6 \n\t"
2073 "movq 8(%1, %0, 4), %%mm0 \n\t"
2074 "movq %%mm0, %%mm3 \n\t"
2075 "punpcklbw %%mm6, %%mm0 \n\t"
2076 "punpckhbw %%mm6, %%mm3 \n\t"
2077 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
2078 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
2080 "movq %%mm4, %%mm6 \n\t"
2081 "movq 16(%1, %0, 4), %%mm0 \n\t"
2082 "movq %%mm0, %%mm3 \n\t"
2083 "punpcklbw %%mm5, %%mm4 \n\t"
2084 "punpcklbw %%mm4, %%mm0 \n\t"
2085 "punpckhbw %%mm4, %%mm3 \n\t"
2086 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
2087 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
2089 "punpckhbw %%mm5, %%mm6 \n\t"
2090 "movq 24(%1, %0, 4), %%mm0 \n\t"
2091 "movq %%mm0, %%mm3 \n\t"
2092 "punpcklbw %%mm6, %%mm0 \n\t"
2093 "punpckhbw %%mm6, %%mm3 \n\t"
2094 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
2095 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
2098 :
"r"(yp),
"r" (up),
"r"(vp),
"r"(d)
2102 const int x2 = x<<2;
2105 d[8*x+2] = yp[x2+1];
2107 d[8*x+4] = yp[x2+2];
2109 d[8*x+6] = yp[x2+3];
2130 "pcmpeqw %%mm7, %%mm7 \n\t"
2131 "psrlw $8, %%mm7 \n\t"
2133 "movq -30(%1, %0, 2), %%mm0 \n\t"
2134 "movq -22(%1, %0, 2), %%mm1 \n\t"
2135 "movq -14(%1, %0, 2), %%mm2 \n\t"
2136 "movq -6(%1, %0, 2), %%mm3 \n\t"
2137 "pand %%mm7, %%mm0 \n\t"
2138 "pand %%mm7, %%mm1 \n\t"
2139 "pand %%mm7, %%mm2 \n\t"
2140 "pand %%mm7, %%mm3 \n\t"
2141 "packuswb %%mm1, %%mm0 \n\t"
2142 "packuswb %%mm3, %%mm2 \n\t"
2143 MOVNTQ" %%mm0,-15(%2, %0) \n\t"
2144 MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
2148 :
"r"(src),
"r"(
dst)
2153 dst[count]= src[2*count];
2158 #if !COMPILE_TEMPLATE_AMD3DNOW
2168 "pcmpeqw %%mm7, %%mm7 \n\t"
2169 "psrlw $8, %%mm7 \n\t"
2171 "movq -28(%1, %0, 4), %%mm0 \n\t"
2172 "movq -20(%1, %0, 4), %%mm1 \n\t"
2173 "movq -12(%1, %0, 4), %%mm2 \n\t"
2174 "movq -4(%1, %0, 4), %%mm3 \n\t"
2175 "pand %%mm7, %%mm0 \n\t"
2176 "pand %%mm7, %%mm1 \n\t"
2177 "pand %%mm7, %%mm2 \n\t"
2178 "pand %%mm7, %%mm3 \n\t"
2179 "packuswb %%mm1, %%mm0 \n\t"
2180 "packuswb %%mm3, %%mm2 \n\t"
2181 "movq %%mm0, %%mm1 \n\t"
2182 "movq %%mm2, %%mm3 \n\t"
2183 "psrlw $8, %%mm0 \n\t"
2184 "psrlw $8, %%mm2 \n\t"
2185 "pand %%mm7, %%mm1 \n\t"
2186 "pand %%mm7, %%mm3 \n\t"
2187 "packuswb %%mm2, %%mm0 \n\t"
2188 "packuswb %%mm3, %%mm1 \n\t"
2189 MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2190 MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2194 :
"r"(src),
"r"(dst0),
"r"(dst1)
2199 dst0[count]= src[4*count+0];
2200 dst1[count]= src[4*count+2];
2217 "pcmpeqw %%mm7, %%mm7 \n\t"
2218 "psrlw $8, %%mm7 \n\t"
2220 "movq -28(%1, %0, 4), %%mm0 \n\t"
2221 "movq -20(%1, %0, 4), %%mm1 \n\t"
2222 "movq -12(%1, %0, 4), %%mm2 \n\t"
2223 "movq -4(%1, %0, 4), %%mm3 \n\t"
2224 PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2225 PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2226 PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2227 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2228 "pand %%mm7, %%mm0 \n\t"
2229 "pand %%mm7, %%mm1 \n\t"
2230 "pand %%mm7, %%mm2 \n\t"
2231 "pand %%mm7, %%mm3 \n\t"
2232 "packuswb %%mm1, %%mm0 \n\t"
2233 "packuswb %%mm3, %%mm2 \n\t"
2234 "movq %%mm0, %%mm1 \n\t"
2235 "movq %%mm2, %%mm3 \n\t"
2236 "psrlw $8, %%mm0 \n\t"
2237 "psrlw $8, %%mm2 \n\t"
2238 "pand %%mm7, %%mm1 \n\t"
2239 "pand %%mm7, %%mm3 \n\t"
2240 "packuswb %%mm2, %%mm0 \n\t"
2241 "packuswb %%mm3, %%mm1 \n\t"
2242 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2243 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2247 :
"r"(src0),
"r"(src1),
"r"(dst0),
"r"(dst1)
2253 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2254 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2259 #if !COMPILE_TEMPLATE_AMD3DNOW
2269 "pcmpeqw %%mm7, %%mm7 \n\t"
2270 "psrlw $8, %%mm7 \n\t"
2272 "movq -28(%1, %0, 4), %%mm0 \n\t"
2273 "movq -20(%1, %0, 4), %%mm1 \n\t"
2274 "movq -12(%1, %0, 4), %%mm2 \n\t"
2275 "movq -4(%1, %0, 4), %%mm3 \n\t"
2276 "psrlw $8, %%mm0 \n\t"
2277 "psrlw $8, %%mm1 \n\t"
2278 "psrlw $8, %%mm2 \n\t"
2279 "psrlw $8, %%mm3 \n\t"
2280 "packuswb %%mm1, %%mm0 \n\t"
2281 "packuswb %%mm3, %%mm2 \n\t"
2282 "movq %%mm0, %%mm1 \n\t"
2283 "movq %%mm2, %%mm3 \n\t"
2284 "psrlw $8, %%mm0 \n\t"
2285 "psrlw $8, %%mm2 \n\t"
2286 "pand %%mm7, %%mm1 \n\t"
2287 "pand %%mm7, %%mm3 \n\t"
2288 "packuswb %%mm2, %%mm0 \n\t"
2289 "packuswb %%mm3, %%mm1 \n\t"
2290 MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2291 MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2295 :
"r"(src),
"r"(dst0),
"r"(dst1)
2301 dst0[count]= src[4*count+0];
2302 dst1[count]= src[4*count+2];
2319 "pcmpeqw %%mm7, %%mm7 \n\t"
2320 "psrlw $8, %%mm7 \n\t"
2322 "movq -28(%1, %0, 4), %%mm0 \n\t"
2323 "movq -20(%1, %0, 4), %%mm1 \n\t"
2324 "movq -12(%1, %0, 4), %%mm2 \n\t"
2325 "movq -4(%1, %0, 4), %%mm3 \n\t"
2326 PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2327 PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2328 PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2329 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2330 "psrlw $8, %%mm0 \n\t"
2331 "psrlw $8, %%mm1 \n\t"
2332 "psrlw $8, %%mm2 \n\t"
2333 "psrlw $8, %%mm3 \n\t"
2334 "packuswb %%mm1, %%mm0 \n\t"
2335 "packuswb %%mm3, %%mm2 \n\t"
2336 "movq %%mm0, %%mm1 \n\t"
2337 "movq %%mm2, %%mm3 \n\t"
2338 "psrlw $8, %%mm0 \n\t"
2339 "psrlw $8, %%mm2 \n\t"
2340 "pand %%mm7, %%mm1 \n\t"
2341 "pand %%mm7, %%mm3 \n\t"
2342 "packuswb %%mm2, %%mm0 \n\t"
2343 "packuswb %%mm3, %%mm1 \n\t"
2344 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2345 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2349 :
"r"(src0),
"r"(src1),
"r"(dst0),
"r"(dst1)
2357 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2358 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2365 int lumStride,
int chromStride,
int srcStride)
2368 const int chromWidth= -((-
width)>>1);
2370 for (y=0; y<
height; y++) {
2388 #if !COMPILE_TEMPLATE_AMD3DNOW
2391 int lumStride,
int chromStride,
int srcStride)
2394 const int chromWidth= -((-
width)>>1);
2396 for (y=0; y<
height; y++) {
2415 int lumStride,
int chromStride,
int srcStride)
2418 const int chromWidth= -((-
width)>>1);
2420 for (y=0; y<
height; y++) {
2438 #if !COMPILE_TEMPLATE_AMD3DNOW
2441 int lumStride,
int chromStride,
int srcStride)
2444 const int chromWidth= -((-
width)>>1);
2446 for (y=0; y<
height; y++) {
2466 #if !COMPILE_TEMPLATE_SSE2
2467 #if !COMPILE_TEMPLATE_AMD3DNOW
2497 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
2506 #if !COMPILE_TEMPLATE_AMD3DNOW