39 #if COMPILE_TEMPLATE_AMD3DNOW 40 #define PREFETCH "prefetch" 41 #define PAVGB "pavgusb" 42 #elif COMPILE_TEMPLATE_MMXEXT 43 #define PREFETCH "prefetchnta" 46 #define PREFETCH " # nop" 49 #if COMPILE_TEMPLATE_AMD3DNOW 56 #if COMPILE_TEMPLATE_MMXEXT 57 #define MOVNTQ "movntq" 58 #define SFENCE "sfence" 61 #define SFENCE " # nop" 64 #if !COMPILE_TEMPLATE_SSE2 66 #if !COMPILE_TEMPLATE_AMD3DNOW 75 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
77 __asm__
volatile(
"movq %0, %%mm7"::
"m"(mask32a):
"memory");
81 "movd (%1), %%mm0 \n\t" 82 "punpckldq 3(%1), %%mm0 \n\t" 83 "movd 6(%1), %%mm1 \n\t" 84 "punpckldq 9(%1), %%mm1 \n\t" 85 "movd 12(%1), %%mm2 \n\t" 86 "punpckldq 15(%1), %%mm2 \n\t" 87 "movd 18(%1), %%mm3 \n\t" 88 "punpckldq 21(%1), %%mm3 \n\t" 89 "por %%mm7, %%mm0 \n\t" 90 "por %%mm7, %%mm1 \n\t" 91 "por %%mm7, %%mm2 \n\t" 92 "por %%mm7, %%mm3 \n\t" 95 MOVNTQ" %%mm2, 16(%0) \n\t" 102 __asm__
volatile(
SFENCE:::
"memory");
103 __asm__
volatile(
EMMS:::
"memory");
112 #define STORE_BGR24_MMX \ 113 "psrlq $8, %%mm2 \n\t" \ 114 "psrlq $8, %%mm3 \n\t" \ 115 "psrlq $8, %%mm6 \n\t" \ 116 "psrlq $8, %%mm7 \n\t" \ 117 "pand "MANGLE(mask24l)", %%mm0\n\t" \ 118 "pand "MANGLE(mask24l)", %%mm1\n\t" \ 119 "pand "MANGLE(mask24l)", %%mm4\n\t" \ 120 "pand "MANGLE(mask24l)", %%mm5\n\t" \ 121 "pand "MANGLE(mask24h)", %%mm2\n\t" \ 122 "pand "MANGLE(mask24h)", %%mm3\n\t" \ 123 "pand "MANGLE(mask24h)", %%mm6\n\t" \ 124 "pand "MANGLE(mask24h)", %%mm7\n\t" \ 125 "por %%mm2, %%mm0 \n\t" \ 126 "por %%mm3, %%mm1 \n\t" \ 127 "por %%mm6, %%mm4 \n\t" \ 128 "por %%mm7, %%mm5 \n\t" \ 130 "movq %%mm1, %%mm2 \n\t" \ 131 "movq %%mm4, %%mm3 \n\t" \ 132 "psllq $48, %%mm2 \n\t" \ 133 "psllq $32, %%mm3 \n\t" \ 134 "por %%mm2, %%mm0 \n\t" \ 135 "psrlq $16, %%mm1 \n\t" \ 136 "psrlq $32, %%mm4 \n\t" \ 137 "psllq $16, %%mm5 \n\t" \ 138 "por %%mm3, %%mm1 \n\t" \ 139 "por %%mm5, %%mm4 \n\t" \ 141 MOVNTQ" %%mm0, (%0) \n\t" \ 142 MOVNTQ" %%mm1, 8(%0) \n\t" \ 143 MOVNTQ" %%mm4, 16(%0)" 153 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
158 "movq (%1), %%mm0 \n\t" 159 "movq 8(%1), %%mm1 \n\t" 160 "movq 16(%1), %%mm4 \n\t" 161 "movq 24(%1), %%mm5 \n\t" 162 "movq %%mm0, %%mm2 \n\t" 163 "movq %%mm1, %%mm3 \n\t" 164 "movq %%mm4, %%mm6 \n\t" 165 "movq %%mm5, %%mm7 \n\t" 173 __asm__
volatile(
SFENCE:::
"memory");
174 __asm__
volatile(
EMMS:::
"memory");
196 __asm__
volatile(
PREFETCH" %0"::
"m"(*s));
197 __asm__
volatile(
"movq %0, %%mm4"::
"m"(mask15s));
202 "movq (%1), %%mm0 \n\t" 203 "movq 8(%1), %%mm2 \n\t" 204 "movq %%mm0, %%mm1 \n\t" 205 "movq %%mm2, %%mm3 \n\t" 206 "pand %%mm4, %%mm0 \n\t" 207 "pand %%mm4, %%mm2 \n\t" 208 "paddw %%mm1, %%mm0 \n\t" 209 "paddw %%mm3, %%mm2 \n\t" 217 __asm__
volatile(
SFENCE:::
"memory");
218 __asm__
volatile(
EMMS:::
"memory");
221 register unsigned x= *((
const uint32_t *)s);
222 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
227 register unsigned short x= *((
const uint16_t *)s);
228 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
239 __asm__
volatile(
PREFETCH" %0"::
"m"(*s));
240 __asm__
volatile(
"movq %0, %%mm7"::
"m"(mask15rg));
241 __asm__
volatile(
"movq %0, %%mm6"::
"m"(mask15b));
246 "movq (%1), %%mm0 \n\t" 247 "movq 8(%1), %%mm2 \n\t" 248 "movq %%mm0, %%mm1 \n\t" 249 "movq %%mm2, %%mm3 \n\t" 250 "psrlq $1, %%mm0 \n\t" 251 "psrlq $1, %%mm2 \n\t" 252 "pand %%mm7, %%mm0 \n\t" 253 "pand %%mm7, %%mm2 \n\t" 254 "pand %%mm6, %%mm1 \n\t" 255 "pand %%mm6, %%mm3 \n\t" 256 "por %%mm1, %%mm0 \n\t" 257 "por %%mm3, %%mm2 \n\t" 265 __asm__
volatile(
SFENCE:::
"memory");
266 __asm__
volatile(
EMMS:::
"memory");
269 register uint32_t x= *((
const uint32_t*)s);
270 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
275 register uint16_t x= *((
const uint16_t*)s);
276 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
285 uint16_t *d = (uint16_t *)dst;
289 "movq %3, %%mm5 \n\t" 290 "movq %4, %%mm6 \n\t" 291 "movq %5, %%mm7 \n\t" 296 "movd (%1), %%mm0 \n\t" 297 "movd 4(%1), %%mm3 \n\t" 298 "punpckldq 8(%1), %%mm0 \n\t" 299 "punpckldq 12(%1), %%mm3 \n\t" 300 "movq %%mm0, %%mm1 \n\t" 301 "movq %%mm3, %%mm4 \n\t" 302 "pand %%mm6, %%mm0 \n\t" 303 "pand %%mm6, %%mm3 \n\t" 304 "pmaddwd %%mm7, %%mm0 \n\t" 305 "pmaddwd %%mm7, %%mm3 \n\t" 306 "pand %%mm5, %%mm1 \n\t" 307 "pand %%mm5, %%mm4 \n\t" 308 "por %%mm1, %%mm0 \n\t" 309 "por %%mm4, %%mm3 \n\t" 310 "psrld $5, %%mm0 \n\t" 311 "pslld $11, %%mm3 \n\t" 312 "por %%mm3, %%mm0 \n\t" 320 :
"r" (mm_end),
"m" (mask3216g),
"m" (mask3216br),
"m" (mul3216)
322 __asm__
volatile(
SFENCE:::
"memory");
323 __asm__
volatile(
EMMS:::
"memory");
325 register int rgb = *(
const uint32_t*)s; s += 4;
326 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
335 uint16_t *d = (uint16_t *)dst;
337 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
339 "movq %0, %%mm7 \n\t" 340 "movq %1, %%mm6 \n\t" 341 ::
"m"(red_16mask),
"m"(green_16mask));
346 "movd (%1), %%mm0 \n\t" 347 "movd 4(%1), %%mm3 \n\t" 348 "punpckldq 8(%1), %%mm0 \n\t" 349 "punpckldq 12(%1), %%mm3 \n\t" 350 "movq %%mm0, %%mm1 \n\t" 351 "movq %%mm0, %%mm2 \n\t" 352 "movq %%mm3, %%mm4 \n\t" 353 "movq %%mm3, %%mm5 \n\t" 354 "psllq $8, %%mm0 \n\t" 355 "psllq $8, %%mm3 \n\t" 356 "pand %%mm7, %%mm0 \n\t" 357 "pand %%mm7, %%mm3 \n\t" 358 "psrlq $5, %%mm1 \n\t" 359 "psrlq $5, %%mm4 \n\t" 360 "pand %%mm6, %%mm1 \n\t" 361 "pand %%mm6, %%mm4 \n\t" 362 "psrlq $19, %%mm2 \n\t" 363 "psrlq $19, %%mm5 \n\t" 364 "pand %2, %%mm2 \n\t" 365 "pand %2, %%mm5 \n\t" 366 "por %%mm1, %%mm0 \n\t" 367 "por %%mm4, %%mm3 \n\t" 368 "por %%mm2, %%mm0 \n\t" 369 "por %%mm5, %%mm3 \n\t" 370 "psllq $16, %%mm3 \n\t" 371 "por %%mm3, %%mm0 \n\t" 373 ::
"r"(d),
"r"(s),
"m"(blue_16mask):
"memory");
377 __asm__
volatile(
SFENCE:::
"memory");
378 __asm__
volatile(
EMMS:::
"memory");
380 register int rgb = *(
const uint32_t*)s; s += 4;
381 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
390 uint16_t *d = (uint16_t *)dst;
394 "movq %3, %%mm5 \n\t" 395 "movq %4, %%mm6 \n\t" 396 "movq %5, %%mm7 \n\t" 401 "movd (%1), %%mm0 \n\t" 402 "movd 4(%1), %%mm3 \n\t" 403 "punpckldq 8(%1), %%mm0 \n\t" 404 "punpckldq 12(%1), %%mm3 \n\t" 405 "movq %%mm0, %%mm1 \n\t" 406 "movq %%mm3, %%mm4 \n\t" 407 "pand %%mm6, %%mm0 \n\t" 408 "pand %%mm6, %%mm3 \n\t" 409 "pmaddwd %%mm7, %%mm0 \n\t" 410 "pmaddwd %%mm7, %%mm3 \n\t" 411 "pand %%mm5, %%mm1 \n\t" 412 "pand %%mm5, %%mm4 \n\t" 413 "por %%mm1, %%mm0 \n\t" 414 "por %%mm4, %%mm3 \n\t" 415 "psrld $6, %%mm0 \n\t" 416 "pslld $10, %%mm3 \n\t" 417 "por %%mm3, %%mm0 \n\t" 425 :
"r" (mm_end),
"m" (mask3215g),
"m" (mask3216br),
"m" (mul3215)
427 __asm__
volatile(
SFENCE:::
"memory");
428 __asm__
volatile(
EMMS:::
"memory");
430 register int rgb = *(
const uint32_t*)s; s += 4;
431 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
440 uint16_t *d = (uint16_t *)dst;
442 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
444 "movq %0, %%mm7 \n\t" 445 "movq %1, %%mm6 \n\t" 446 ::
"m"(red_15mask),
"m"(green_15mask));
451 "movd (%1), %%mm0 \n\t" 452 "movd 4(%1), %%mm3 \n\t" 453 "punpckldq 8(%1), %%mm0 \n\t" 454 "punpckldq 12(%1), %%mm3 \n\t" 455 "movq %%mm0, %%mm1 \n\t" 456 "movq %%mm0, %%mm2 \n\t" 457 "movq %%mm3, %%mm4 \n\t" 458 "movq %%mm3, %%mm5 \n\t" 459 "psllq $7, %%mm0 \n\t" 460 "psllq $7, %%mm3 \n\t" 461 "pand %%mm7, %%mm0 \n\t" 462 "pand %%mm7, %%mm3 \n\t" 463 "psrlq $6, %%mm1 \n\t" 464 "psrlq $6, %%mm4 \n\t" 465 "pand %%mm6, %%mm1 \n\t" 466 "pand %%mm6, %%mm4 \n\t" 467 "psrlq $19, %%mm2 \n\t" 468 "psrlq $19, %%mm5 \n\t" 469 "pand %2, %%mm2 \n\t" 470 "pand %2, %%mm5 \n\t" 471 "por %%mm1, %%mm0 \n\t" 472 "por %%mm4, %%mm3 \n\t" 473 "por %%mm2, %%mm0 \n\t" 474 "por %%mm5, %%mm3 \n\t" 475 "psllq $16, %%mm3 \n\t" 476 "por %%mm3, %%mm0 \n\t" 478 ::
"r"(d),
"r"(s),
"m"(blue_15mask):
"memory");
482 __asm__
volatile(
SFENCE:::
"memory");
483 __asm__
volatile(
EMMS:::
"memory");
485 register int rgb = *(
const uint32_t*)s; s += 4;
486 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
495 uint16_t *d = (uint16_t *)dst;
497 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
499 "movq %0, %%mm7 \n\t" 500 "movq %1, %%mm6 \n\t" 501 ::
"m"(red_16mask),
"m"(green_16mask));
506 "movd (%1), %%mm0 \n\t" 507 "movd 3(%1), %%mm3 \n\t" 508 "punpckldq 6(%1), %%mm0 \n\t" 509 "punpckldq 9(%1), %%mm3 \n\t" 510 "movq %%mm0, %%mm1 \n\t" 511 "movq %%mm0, %%mm2 \n\t" 512 "movq %%mm3, %%mm4 \n\t" 513 "movq %%mm3, %%mm5 \n\t" 514 "psrlq $3, %%mm0 \n\t" 515 "psrlq $3, %%mm3 \n\t" 516 "pand %2, %%mm0 \n\t" 517 "pand %2, %%mm3 \n\t" 518 "psrlq $5, %%mm1 \n\t" 519 "psrlq $5, %%mm4 \n\t" 520 "pand %%mm6, %%mm1 \n\t" 521 "pand %%mm6, %%mm4 \n\t" 522 "psrlq $8, %%mm2 \n\t" 523 "psrlq $8, %%mm5 \n\t" 524 "pand %%mm7, %%mm2 \n\t" 525 "pand %%mm7, %%mm5 \n\t" 526 "por %%mm1, %%mm0 \n\t" 527 "por %%mm4, %%mm3 \n\t" 528 "por %%mm2, %%mm0 \n\t" 529 "por %%mm5, %%mm3 \n\t" 530 "psllq $16, %%mm3 \n\t" 531 "por %%mm3, %%mm0 \n\t" 533 ::
"r"(d),
"r"(s),
"m"(blue_16mask):
"memory");
537 __asm__
volatile(
SFENCE:::
"memory");
538 __asm__
volatile(
EMMS:::
"memory");
543 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
552 uint16_t *d = (uint16_t *)dst;
554 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
556 "movq %0, %%mm7 \n\t" 557 "movq %1, %%mm6 \n\t" 558 ::
"m"(red_16mask),
"m"(green_16mask));
563 "movd (%1), %%mm0 \n\t" 564 "movd 3(%1), %%mm3 \n\t" 565 "punpckldq 6(%1), %%mm0 \n\t" 566 "punpckldq 9(%1), %%mm3 \n\t" 567 "movq %%mm0, %%mm1 \n\t" 568 "movq %%mm0, %%mm2 \n\t" 569 "movq %%mm3, %%mm4 \n\t" 570 "movq %%mm3, %%mm5 \n\t" 571 "psllq $8, %%mm0 \n\t" 572 "psllq $8, %%mm3 \n\t" 573 "pand %%mm7, %%mm0 \n\t" 574 "pand %%mm7, %%mm3 \n\t" 575 "psrlq $5, %%mm1 \n\t" 576 "psrlq $5, %%mm4 \n\t" 577 "pand %%mm6, %%mm1 \n\t" 578 "pand %%mm6, %%mm4 \n\t" 579 "psrlq $19, %%mm2 \n\t" 580 "psrlq $19, %%mm5 \n\t" 581 "pand %2, %%mm2 \n\t" 582 "pand %2, %%mm5 \n\t" 583 "por %%mm1, %%mm0 \n\t" 584 "por %%mm4, %%mm3 \n\t" 585 "por %%mm2, %%mm0 \n\t" 586 "por %%mm5, %%mm3 \n\t" 587 "psllq $16, %%mm3 \n\t" 588 "por %%mm3, %%mm0 \n\t" 590 ::
"r"(d),
"r"(s),
"m"(blue_16mask):
"memory");
594 __asm__
volatile(
SFENCE:::
"memory");
595 __asm__
volatile(
EMMS:::
"memory");
600 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
609 uint16_t *d = (uint16_t *)dst;
611 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
613 "movq %0, %%mm7 \n\t" 614 "movq %1, %%mm6 \n\t" 615 ::
"m"(red_15mask),
"m"(green_15mask));
620 "movd (%1), %%mm0 \n\t" 621 "movd 3(%1), %%mm3 \n\t" 622 "punpckldq 6(%1), %%mm0 \n\t" 623 "punpckldq 9(%1), %%mm3 \n\t" 624 "movq %%mm0, %%mm1 \n\t" 625 "movq %%mm0, %%mm2 \n\t" 626 "movq %%mm3, %%mm4 \n\t" 627 "movq %%mm3, %%mm5 \n\t" 628 "psrlq $3, %%mm0 \n\t" 629 "psrlq $3, %%mm3 \n\t" 630 "pand %2, %%mm0 \n\t" 631 "pand %2, %%mm3 \n\t" 632 "psrlq $6, %%mm1 \n\t" 633 "psrlq $6, %%mm4 \n\t" 634 "pand %%mm6, %%mm1 \n\t" 635 "pand %%mm6, %%mm4 \n\t" 636 "psrlq $9, %%mm2 \n\t" 637 "psrlq $9, %%mm5 \n\t" 638 "pand %%mm7, %%mm2 \n\t" 639 "pand %%mm7, %%mm5 \n\t" 640 "por %%mm1, %%mm0 \n\t" 641 "por %%mm4, %%mm3 \n\t" 642 "por %%mm2, %%mm0 \n\t" 643 "por %%mm5, %%mm3 \n\t" 644 "psllq $16, %%mm3 \n\t" 645 "por %%mm3, %%mm0 \n\t" 647 ::
"r"(d),
"r"(s),
"m"(blue_15mask):
"memory");
651 __asm__
volatile(
SFENCE:::
"memory");
652 __asm__
volatile(
EMMS:::
"memory");
657 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
666 uint16_t *d = (uint16_t *)dst;
668 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
670 "movq %0, %%mm7 \n\t" 671 "movq %1, %%mm6 \n\t" 672 ::
"m"(red_15mask),
"m"(green_15mask));
677 "movd (%1), %%mm0 \n\t" 678 "movd 3(%1), %%mm3 \n\t" 679 "punpckldq 6(%1), %%mm0 \n\t" 680 "punpckldq 9(%1), %%mm3 \n\t" 681 "movq %%mm0, %%mm1 \n\t" 682 "movq %%mm0, %%mm2 \n\t" 683 "movq %%mm3, %%mm4 \n\t" 684 "movq %%mm3, %%mm5 \n\t" 685 "psllq $7, %%mm0 \n\t" 686 "psllq $7, %%mm3 \n\t" 687 "pand %%mm7, %%mm0 \n\t" 688 "pand %%mm7, %%mm3 \n\t" 689 "psrlq $6, %%mm1 \n\t" 690 "psrlq $6, %%mm4 \n\t" 691 "pand %%mm6, %%mm1 \n\t" 692 "pand %%mm6, %%mm4 \n\t" 693 "psrlq $19, %%mm2 \n\t" 694 "psrlq $19, %%mm5 \n\t" 695 "pand %2, %%mm2 \n\t" 696 "pand %2, %%mm5 \n\t" 697 "por %%mm1, %%mm0 \n\t" 698 "por %%mm4, %%mm3 \n\t" 699 "por %%mm2, %%mm0 \n\t" 700 "por %%mm5, %%mm3 \n\t" 701 "psllq $16, %%mm3 \n\t" 702 "por %%mm3, %%mm0 \n\t" 704 ::
"r"(d),
"r"(s),
"m"(blue_15mask):
"memory");
708 __asm__
volatile(
SFENCE:::
"memory");
709 __asm__
volatile(
EMMS:::
"memory");
714 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
721 const uint16_t *mm_end;
723 const uint16_t *
s = (
const uint16_t*)
src;
724 end = s + src_size/2;
725 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
730 "movq (%1), %%mm0 \n\t" 731 "movq (%1), %%mm1 \n\t" 732 "movq (%1), %%mm2 \n\t" 733 "pand %2, %%mm0 \n\t" 734 "pand %3, %%mm1 \n\t" 735 "pand %4, %%mm2 \n\t" 736 "psllq $5, %%mm0 \n\t" 737 "pmulhw "MANGLE(mul15_mid)
", %%mm0 \n\t" 738 "pmulhw "MANGLE(mul15_mid)
", %%mm1 \n\t" 739 "pmulhw "MANGLE(mul15_hi)
", %%mm2 \n\t" 740 "movq %%mm0, %%mm3 \n\t" 741 "movq %%mm1, %%mm4 \n\t" 742 "movq %%mm2, %%mm5 \n\t" 743 "punpcklwd %5, %%mm0 \n\t" 744 "punpcklwd %5, %%mm1 \n\t" 745 "punpcklwd %5, %%mm2 \n\t" 746 "punpckhwd %5, %%mm3 \n\t" 747 "punpckhwd %5, %%mm4 \n\t" 748 "punpckhwd %5, %%mm5 \n\t" 749 "psllq $8, %%mm1 \n\t" 750 "psllq $16, %%mm2 \n\t" 751 "por %%mm1, %%mm0 \n\t" 752 "por %%mm2, %%mm0 \n\t" 753 "psllq $8, %%mm4 \n\t" 754 "psllq $16, %%mm5 \n\t" 755 "por %%mm4, %%mm3 \n\t" 756 "por %%mm5, %%mm3 \n\t" 758 "movq %%mm0, %%mm6 \n\t" 759 "movq %%mm3, %%mm7 \n\t" 761 "movq 8(%1), %%mm0 \n\t" 762 "movq 8(%1), %%mm1 \n\t" 763 "movq 8(%1), %%mm2 \n\t" 764 "pand %2, %%mm0 \n\t" 765 "pand %3, %%mm1 \n\t" 766 "pand %4, %%mm2 \n\t" 767 "psllq $5, %%mm0 \n\t" 768 "pmulhw "MANGLE(mul15_mid)
", %%mm0 \n\t" 769 "pmulhw "MANGLE(mul15_mid)
", %%mm1 \n\t" 770 "pmulhw "MANGLE(mul15_hi)
", %%mm2 \n\t" 771 "movq %%mm0, %%mm3 \n\t" 772 "movq %%mm1, %%mm4 \n\t" 773 "movq %%mm2, %%mm5 \n\t" 774 "punpcklwd %5, %%mm0 \n\t" 775 "punpcklwd %5, %%mm1 \n\t" 776 "punpcklwd %5, %%mm2 \n\t" 777 "punpckhwd %5, %%mm3 \n\t" 778 "punpckhwd %5, %%mm4 \n\t" 779 "punpckhwd %5, %%mm5 \n\t" 780 "psllq $8, %%mm1 \n\t" 781 "psllq $16, %%mm2 \n\t" 782 "por %%mm1, %%mm0 \n\t" 783 "por %%mm2, %%mm0 \n\t" 784 "psllq $8, %%mm4 \n\t" 785 "psllq $16, %%mm5 \n\t" 786 "por %%mm4, %%mm3 \n\t" 787 "por %%mm5, %%mm3 \n\t" 790 :
"r"(
s),
"m"(mask15b),
"m"(mask15g),
"m"(mask15r),
"m"(mmx_null)
795 "movq %%mm0, %%mm4 \n\t" 796 "movq %%mm3, %%mm5 \n\t" 797 "movq %%mm6, %%mm0 \n\t" 798 "movq %%mm7, %%mm1 \n\t" 800 "movq %%mm4, %%mm6 \n\t" 801 "movq %%mm5, %%mm7 \n\t" 802 "movq %%mm0, %%mm2 \n\t" 803 "movq %%mm1, %%mm3 \n\t" 813 __asm__
volatile(
SFENCE:::
"memory");
814 __asm__
volatile(
EMMS:::
"memory");
816 register uint16_t bgr;
818 *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
819 *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
820 *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
827 const uint16_t *mm_end;
829 const uint16_t *
s = (
const uint16_t *)
src;
830 end = s + src_size/2;
831 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
836 "movq (%1), %%mm0 \n\t" 837 "movq (%1), %%mm1 \n\t" 838 "movq (%1), %%mm2 \n\t" 839 "pand %2, %%mm0 \n\t" 840 "pand %3, %%mm1 \n\t" 841 "pand %4, %%mm2 \n\t" 842 "psllq $5, %%mm0 \n\t" 843 "psrlq $1, %%mm2 \n\t" 844 "pmulhw "MANGLE(mul15_mid)
", %%mm0 \n\t" 845 "pmulhw "MANGLE(mul16_mid)
", %%mm1 \n\t" 846 "pmulhw "MANGLE(mul15_hi)
", %%mm2 \n\t" 847 "movq %%mm0, %%mm3 \n\t" 848 "movq %%mm1, %%mm4 \n\t" 849 "movq %%mm2, %%mm5 \n\t" 850 "punpcklwd %5, %%mm0 \n\t" 851 "punpcklwd %5, %%mm1 \n\t" 852 "punpcklwd %5, %%mm2 \n\t" 853 "punpckhwd %5, %%mm3 \n\t" 854 "punpckhwd %5, %%mm4 \n\t" 855 "punpckhwd %5, %%mm5 \n\t" 856 "psllq $8, %%mm1 \n\t" 857 "psllq $16, %%mm2 \n\t" 858 "por %%mm1, %%mm0 \n\t" 859 "por %%mm2, %%mm0 \n\t" 860 "psllq $8, %%mm4 \n\t" 861 "psllq $16, %%mm5 \n\t" 862 "por %%mm4, %%mm3 \n\t" 863 "por %%mm5, %%mm3 \n\t" 865 "movq %%mm0, %%mm6 \n\t" 866 "movq %%mm3, %%mm7 \n\t" 868 "movq 8(%1), %%mm0 \n\t" 869 "movq 8(%1), %%mm1 \n\t" 870 "movq 8(%1), %%mm2 \n\t" 871 "pand %2, %%mm0 \n\t" 872 "pand %3, %%mm1 \n\t" 873 "pand %4, %%mm2 \n\t" 874 "psllq $5, %%mm0 \n\t" 875 "psrlq $1, %%mm2 \n\t" 876 "pmulhw "MANGLE(mul15_mid)
", %%mm0 \n\t" 877 "pmulhw "MANGLE(mul16_mid)
", %%mm1 \n\t" 878 "pmulhw "MANGLE(mul15_hi)
", %%mm2 \n\t" 879 "movq %%mm0, %%mm3 \n\t" 880 "movq %%mm1, %%mm4 \n\t" 881 "movq %%mm2, %%mm5 \n\t" 882 "punpcklwd %5, %%mm0 \n\t" 883 "punpcklwd %5, %%mm1 \n\t" 884 "punpcklwd %5, %%mm2 \n\t" 885 "punpckhwd %5, %%mm3 \n\t" 886 "punpckhwd %5, %%mm4 \n\t" 887 "punpckhwd %5, %%mm5 \n\t" 888 "psllq $8, %%mm1 \n\t" 889 "psllq $16, %%mm2 \n\t" 890 "por %%mm1, %%mm0 \n\t" 891 "por %%mm2, %%mm0 \n\t" 892 "psllq $8, %%mm4 \n\t" 893 "psllq $16, %%mm5 \n\t" 894 "por %%mm4, %%mm3 \n\t" 895 "por %%mm5, %%mm3 \n\t" 897 :
"r"(
s),
"m"(mask16b),
"m"(mask16g),
"m"(mask16r),
"m"(mmx_null)
902 "movq %%mm0, %%mm4 \n\t" 903 "movq %%mm3, %%mm5 \n\t" 904 "movq %%mm6, %%mm0 \n\t" 905 "movq %%mm7, %%mm1 \n\t" 907 "movq %%mm4, %%mm6 \n\t" 908 "movq %%mm5, %%mm7 \n\t" 909 "movq %%mm0, %%mm2 \n\t" 910 "movq %%mm1, %%mm3 \n\t" 920 __asm__
volatile(
SFENCE:::
"memory");
921 __asm__
volatile(
EMMS:::
"memory");
923 register uint16_t bgr;
925 *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
926 *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
927 *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
939 "packuswb %%mm7, %%mm0 \n\t" \ 940 "packuswb %%mm7, %%mm1 \n\t" \ 941 "packuswb %%mm7, %%mm2 \n\t" \ 942 "punpcklbw %%mm1, %%mm0 \n\t" \ 943 "punpcklbw %%mm6, %%mm2 \n\t" \ 944 "movq %%mm0, %%mm3 \n\t" \ 945 "punpcklwd %%mm2, %%mm0 \n\t" \ 946 "punpckhwd %%mm2, %%mm3 \n\t" \ 947 MOVNTQ" %%mm0, (%0) \n\t" \ 948 MOVNTQ" %%mm3, 8(%0) \n\t" \ 953 const uint16_t *mm_end;
955 const uint16_t *
s = (
const uint16_t *)
src;
956 end = s + src_size/2;
957 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
958 __asm__
volatile(
"pxor %%mm7,%%mm7 \n\t":::
"memory");
959 __asm__
volatile(
"pcmpeqd %%mm6,%%mm6 \n\t":::
"memory");
964 "movq (%1), %%mm0 \n\t" 965 "movq (%1), %%mm1 \n\t" 966 "movq (%1), %%mm2 \n\t" 967 "pand %2, %%mm0 \n\t" 968 "pand %3, %%mm1 \n\t" 969 "pand %4, %%mm2 \n\t" 970 "psllq $5, %%mm0 \n\t" 971 "pmulhw %5, %%mm0 \n\t" 972 "pmulhw %5, %%mm1 \n\t" 973 "pmulhw "MANGLE(mul15_hi)
", %%mm2 \n\t" 975 ::
"r"(d),
"r"(s),
"m"(mask15b),
"m"(mask15g),
"m"(mask15r) ,
"m"(mul15_mid)
981 __asm__
volatile(
SFENCE:::
"memory");
982 __asm__
volatile(
EMMS:::
"memory");
984 register uint16_t bgr;
986 *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
987 *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
988 *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
996 const uint16_t *mm_end;
998 const uint16_t *
s = (
const uint16_t*)
src;
999 end = s + src_size/2;
1000 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
1001 __asm__
volatile(
"pxor %%mm7,%%mm7 \n\t":::
"memory");
1002 __asm__
volatile(
"pcmpeqd %%mm6,%%mm6 \n\t":::
"memory");
1004 while (s < mm_end) {
1007 "movq (%1), %%mm0 \n\t" 1008 "movq (%1), %%mm1 \n\t" 1009 "movq (%1), %%mm2 \n\t" 1010 "pand %2, %%mm0 \n\t" 1011 "pand %3, %%mm1 \n\t" 1012 "pand %4, %%mm2 \n\t" 1013 "psllq $5, %%mm0 \n\t" 1014 "psrlq $1, %%mm2 \n\t" 1015 "pmulhw %5, %%mm0 \n\t" 1016 "pmulhw "MANGLE(mul16_mid)
", %%mm1 \n\t" 1017 "pmulhw "MANGLE(mul15_hi)
", %%mm2 \n\t" 1019 ::
"r"(d),
"r"(s),
"m"(mask16b),
"m"(mask16g),
"m"(mask16r),
"m"(mul15_mid)
1025 __asm__
volatile(
SFENCE:::
"memory");
1026 __asm__
volatile(
EMMS:::
"memory");
1028 register uint16_t bgr;
1030 *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
1031 *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
1032 *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
1040 x86_reg mmx_size= 23 - src_size;
1042 "test %%"FF_REG_a
", %%"FF_REG_a
" \n\t" 1044 "movq "MANGLE(mask24r)
", %%mm5 \n\t" 1045 "movq "MANGLE(mask24g)
", %%mm6 \n\t" 1046 "movq "MANGLE(mask24b)
", %%mm7 \n\t" 1049 PREFETCH" 32(%1, %%"FF_REG_a
") \n\t" 1050 "movq (%1, %%"FF_REG_a
"), %%mm0 \n\t" 1051 "movq (%1, %%"FF_REG_a
"), %%mm1 \n\t" 1052 "movq 2(%1, %%"FF_REG_a
"), %%mm2 \n\t" 1053 "psllq $16, %%mm0 \n\t" 1054 "pand %%mm5, %%mm0 \n\t" 1055 "pand %%mm6, %%mm1 \n\t" 1056 "pand %%mm7, %%mm2 \n\t" 1057 "por %%mm0, %%mm1 \n\t" 1058 "por %%mm2, %%mm1 \n\t" 1059 "movq 6(%1, %%"FF_REG_a
"), %%mm0 \n\t" 1060 MOVNTQ" %%mm1,(%2, %%"FF_REG_a
") \n\t" 1061 "movq 8(%1, %%"FF_REG_a
"), %%mm1 \n\t" 1062 "movq 10(%1, %%"FF_REG_a
"), %%mm2 \n\t" 1063 "pand %%mm7, %%mm0 \n\t" 1064 "pand %%mm5, %%mm1 \n\t" 1065 "pand %%mm6, %%mm2 \n\t" 1066 "por %%mm0, %%mm1 \n\t" 1067 "por %%mm2, %%mm1 \n\t" 1068 "movq 14(%1, %%"FF_REG_a
"), %%mm0 \n\t" 1069 MOVNTQ" %%mm1, 8(%2, %%"FF_REG_a
")\n\t" 1070 "movq 16(%1, %%"FF_REG_a
"), %%mm1 \n\t" 1071 "movq 18(%1, %%"FF_REG_a
"), %%mm2 \n\t" 1072 "pand %%mm6, %%mm0 \n\t" 1073 "pand %%mm7, %%mm1 \n\t" 1074 "pand %%mm5, %%mm2 \n\t" 1075 "por %%mm0, %%mm1 \n\t" 1076 "por %%mm2, %%mm1 \n\t" 1077 MOVNTQ" %%mm1, 16(%2, %%"FF_REG_a
") \n\t" 1078 "add $24, %%"FF_REG_a
" \n\t" 1082 :
"r" (
src-mmx_size),
"r"(dst-mmx_size)
1086 __asm__
volatile(
SFENCE:::
"memory");
1087 __asm__
volatile(
EMMS:::
"memory");
1089 if (mmx_size==23)
return;
1093 src_size= 23-mmx_size;
1096 for (i=0; i<src_size; i+=3) {
1099 dst[i + 1] =
src[i + 1];
1100 dst[i + 2] =
src[i + 0];
1107 int lumStride,
int chromStride,
int dstStride,
int vertLumPerChroma)
1111 for (y=0; y<
height; y++) {
1114 "xor %%"FF_REG_a
", %%"FF_REG_a
" \n\t" 1117 PREFETCH" 32(%1, %%"FF_REG_a
", 2) \n\t" 1118 PREFETCH" 32(%2, %%"FF_REG_a
") \n\t" 1119 PREFETCH" 32(%3, %%"FF_REG_a
") \n\t" 1120 "movq (%2, %%"FF_REG_a
"), %%mm0 \n\t" 1121 "movq %%mm0, %%mm2 \n\t" 1122 "movq (%3, %%"FF_REG_a
"), %%mm1 \n\t" 1123 "punpcklbw %%mm1, %%mm0 \n\t" 1124 "punpckhbw %%mm1, %%mm2 \n\t" 1126 "movq (%1, %%"FF_REG_a
",2), %%mm3 \n\t" 1127 "movq 8(%1, %%"FF_REG_a
",2), %%mm5 \n\t" 1128 "movq %%mm3, %%mm4 \n\t" 1129 "movq %%mm5, %%mm6 \n\t" 1130 "punpcklbw %%mm0, %%mm3 \n\t" 1131 "punpckhbw %%mm0, %%mm4 \n\t" 1132 "punpcklbw %%mm2, %%mm5 \n\t" 1133 "punpckhbw %%mm2, %%mm6 \n\t" 1135 MOVNTQ" %%mm3, (%0, %%"FF_REG_a
", 4) \n\t" 1136 MOVNTQ" %%mm4, 8(%0, %%"FF_REG_a
", 4) \n\t" 1137 MOVNTQ" %%mm5, 16(%0, %%"FF_REG_a
", 4) \n\t" 1138 MOVNTQ" %%mm6, 24(%0, %%"FF_REG_a
", 4) \n\t" 1140 "add $8, %%"FF_REG_a
" \n\t" 1141 "cmp %4, %%"FF_REG_a
" \n\t" 1143 ::
"r"(dst),
"r"(ysrc),
"r"(usrc),
"r"(vsrc),
"g" (chromWidth)
1146 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1147 usrc += chromStride;
1148 vsrc += chromStride;
1164 int lumStride,
int chromStride,
int dstStride)
1172 int lumStride,
int chromStride,
int dstStride,
int vertLumPerChroma)
1176 for (y=0; y<
height; y++) {
1179 "xor %%"FF_REG_a
", %%"FF_REG_a
" \n\t" 1182 PREFETCH" 32(%1, %%"FF_REG_a
", 2) \n\t" 1183 PREFETCH" 32(%2, %%"FF_REG_a
") \n\t" 1184 PREFETCH" 32(%3, %%"FF_REG_a
") \n\t" 1185 "movq (%2, %%"FF_REG_a
"), %%mm0 \n\t" 1186 "movq %%mm0, %%mm2 \n\t" 1187 "movq (%3, %%"FF_REG_a
"), %%mm1 \n\t" 1188 "punpcklbw %%mm1, %%mm0 \n\t" 1189 "punpckhbw %%mm1, %%mm2 \n\t" 1191 "movq (%1, %%"FF_REG_a
",2), %%mm3 \n\t" 1192 "movq 8(%1, %%"FF_REG_a
",2), %%mm5 \n\t" 1193 "movq %%mm0, %%mm4 \n\t" 1194 "movq %%mm2, %%mm6 \n\t" 1195 "punpcklbw %%mm3, %%mm0 \n\t" 1196 "punpckhbw %%mm3, %%mm4 \n\t" 1197 "punpcklbw %%mm5, %%mm2 \n\t" 1198 "punpckhbw %%mm5, %%mm6 \n\t" 1200 MOVNTQ" %%mm0, (%0, %%"FF_REG_a
", 4) \n\t" 1201 MOVNTQ" %%mm4, 8(%0, %%"FF_REG_a
", 4) \n\t" 1202 MOVNTQ" %%mm2, 16(%0, %%"FF_REG_a
", 4) \n\t" 1203 MOVNTQ" %%mm6, 24(%0, %%"FF_REG_a
", 4) \n\t" 1205 "add $8, %%"FF_REG_a
" \n\t" 1206 "cmp %4, %%"FF_REG_a
" \n\t" 1208 ::
"r"(dst),
"r"(ysrc),
"r"(usrc),
"r"(vsrc),
"g" (chromWidth)
1211 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1212 usrc += chromStride;
1213 vsrc += chromStride;
1229 int lumStride,
int chromStride,
int dstStride)
1240 int lumStride,
int chromStride,
int dstStride)
1250 int lumStride,
int chromStride,
int dstStride)
1261 int lumStride,
int chromStride,
int srcStride)
1265 for (y=0; y<
height; y+=2) {
1267 "xor %%"FF_REG_a
", %%"FF_REG_a
"\n\t" 1268 "pcmpeqw %%mm7, %%mm7 \n\t" 1269 "psrlw $8, %%mm7 \n\t" 1272 PREFETCH" 64(%0, %%"FF_REG_a
", 4) \n\t" 1273 "movq (%0, %%"FF_REG_a
", 4), %%mm0 \n\t" 1274 "movq 8(%0, %%"FF_REG_a
", 4), %%mm1 \n\t" 1275 "movq %%mm0, %%mm2 \n\t" 1276 "movq %%mm1, %%mm3 \n\t" 1277 "psrlw $8, %%mm0 \n\t" 1278 "psrlw $8, %%mm1 \n\t" 1279 "pand %%mm7, %%mm2 \n\t" 1280 "pand %%mm7, %%mm3 \n\t" 1281 "packuswb %%mm1, %%mm0 \n\t" 1282 "packuswb %%mm3, %%mm2 \n\t" 1284 MOVNTQ" %%mm2, (%1, %%"FF_REG_a
", 2) \n\t" 1286 "movq 16(%0, %%"FF_REG_a
", 4), %%mm1 \n\t" 1287 "movq 24(%0, %%"FF_REG_a
", 4), %%mm2 \n\t" 1288 "movq %%mm1, %%mm3 \n\t" 1289 "movq %%mm2, %%mm4 \n\t" 1290 "psrlw $8, %%mm1 \n\t" 1291 "psrlw $8, %%mm2 \n\t" 1292 "pand %%mm7, %%mm3 \n\t" 1293 "pand %%mm7, %%mm4 \n\t" 1294 "packuswb %%mm2, %%mm1 \n\t" 1295 "packuswb %%mm4, %%mm3 \n\t" 1297 MOVNTQ" %%mm3, 8(%1, %%"FF_REG_a
", 2) \n\t" 1299 "movq %%mm0, %%mm2 \n\t" 1300 "movq %%mm1, %%mm3 \n\t" 1301 "psrlw $8, %%mm0 \n\t" 1302 "psrlw $8, %%mm1 \n\t" 1303 "pand %%mm7, %%mm2 \n\t" 1304 "pand %%mm7, %%mm3 \n\t" 1305 "packuswb %%mm1, %%mm0 \n\t" 1306 "packuswb %%mm3, %%mm2 \n\t" 1308 MOVNTQ" %%mm0, (%3, %%"FF_REG_a
") \n\t" 1309 MOVNTQ" %%mm2, (%2, %%"FF_REG_a
") \n\t" 1311 "add $8, %%"FF_REG_a
" \n\t" 1312 "cmp %4, %%"FF_REG_a
" \n\t" 1314 ::
"r"(
src),
"r"(ydst),
"r"(udst),
"r"(vdst),
"g" (chromWidth)
1315 :
"memory",
"%"FF_REG_a
1322 "xor %%"FF_REG_a
", %%"FF_REG_a
"\n\t" 1325 PREFETCH" 64(%0, %%"FF_REG_a
", 4) \n\t" 1326 "movq (%0, %%"FF_REG_a
", 4), %%mm0 \n\t" 1327 "movq 8(%0, %%"FF_REG_a
", 4), %%mm1 \n\t" 1328 "movq 16(%0, %%"FF_REG_a
", 4), %%mm2 \n\t" 1329 "movq 24(%0, %%"FF_REG_a
", 4), %%mm3 \n\t" 1330 "pand %%mm7, %%mm0 \n\t" 1331 "pand %%mm7, %%mm1 \n\t" 1332 "pand %%mm7, %%mm2 \n\t" 1333 "pand %%mm7, %%mm3 \n\t" 1334 "packuswb %%mm1, %%mm0 \n\t" 1335 "packuswb %%mm3, %%mm2 \n\t" 1337 MOVNTQ" %%mm0, (%1, %%"FF_REG_a
", 2) \n\t" 1338 MOVNTQ" %%mm2, 8(%1, %%"FF_REG_a
", 2) \n\t" 1340 "add $8, %%"FF_REG_a
"\n\t" 1341 "cmp %4, %%"FF_REG_a
"\n\t" 1344 ::
"r"(
src),
"r"(ydst),
"r"(udst),
"r"(vdst),
"g" (chromWidth)
1345 :
"memory",
"%"FF_REG_a
1347 udst += chromStride;
1348 vdst += chromStride;
1352 __asm__
volatile(
EMMS" \n\t" 1358 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW 1366 for (x=0; x<srcWidth-1; x++) {
1367 dst[2*x+1]= (3*
src[x] +
src[x+1])>>2;
1368 dst[2*x+2]= (
src[x] + 3*
src[x+1])>>2;
1370 dst[2*srcWidth-1]=
src[srcWidth-1];
1374 for (y=1; y<srcHeight; y++) {
1375 x86_reg mmxSize= srcWidth&~15;
1379 "mov %4, %%"FF_REG_a
" \n\t" 1380 "movq "MANGLE(mmx_ff)
", %%mm0 \n\t" 1381 "movq (%0, %%"FF_REG_a
"), %%mm4 \n\t" 1382 "movq %%mm4, %%mm2 \n\t" 1383 "psllq $8, %%mm4 \n\t" 1384 "pand %%mm0, %%mm2 \n\t" 1385 "por %%mm2, %%mm4 \n\t" 1386 "movq (%1, %%"FF_REG_a
"), %%mm5 \n\t" 1387 "movq %%mm5, %%mm3 \n\t" 1388 "psllq $8, %%mm5 \n\t" 1389 "pand %%mm0, %%mm3 \n\t" 1390 "por %%mm3, %%mm5 \n\t" 1392 "movq (%0, %%"FF_REG_a
"), %%mm0 \n\t" 1393 "movq (%1, %%"FF_REG_a
"), %%mm1 \n\t" 1394 "movq 1(%0, %%"FF_REG_a
"), %%mm2 \n\t" 1395 "movq 1(%1, %%"FF_REG_a
"), %%mm3 \n\t" 1396 PAVGB" %%mm0, %%mm5 \n\t" 1397 PAVGB" %%mm0, %%mm3 \n\t" 1398 PAVGB" %%mm0, %%mm5 \n\t" 1399 PAVGB" %%mm0, %%mm3 \n\t" 1400 PAVGB" %%mm1, %%mm4 \n\t" 1401 PAVGB" %%mm1, %%mm2 \n\t" 1402 PAVGB" %%mm1, %%mm4 \n\t" 1403 PAVGB" %%mm1, %%mm2 \n\t" 1404 "movq %%mm5, %%mm7 \n\t" 1405 "movq %%mm4, %%mm6 \n\t" 1406 "punpcklbw %%mm3, %%mm5 \n\t" 1407 "punpckhbw %%mm3, %%mm7 \n\t" 1408 "punpcklbw %%mm2, %%mm4 \n\t" 1409 "punpckhbw %%mm2, %%mm6 \n\t" 1410 MOVNTQ" %%mm5, (%2, %%"FF_REG_a
", 2) \n\t" 1411 MOVNTQ" %%mm7, 8(%2, %%"FF_REG_a
", 2) \n\t" 1412 MOVNTQ" %%mm4, (%3, %%"FF_REG_a
", 2) \n\t" 1413 MOVNTQ" %%mm6, 8(%3, %%"FF_REG_a
", 2) \n\t" 1414 "add $8, %%"FF_REG_a
" \n\t" 1415 "movq -1(%0, %%"FF_REG_a
"), %%mm4 \n\t" 1416 "movq -1(%1, %%"FF_REG_a
"), %%mm5 \n\t" 1418 ::
"r" (
src + mmxSize ),
"r" (
src + srcStride + mmxSize ),
1419 "r" (dst + mmxSize*2),
"r" (dst + dstStride + mmxSize*2),
1426 dst[0] = (
src[0] * 3 +
src[srcStride]) >> 2;
1427 dst[dstStride] = (
src[0] + 3 *
src[srcStride]) >> 2;
1430 for (x=mmxSize-1; x<srcWidth-1; x++) {
1431 dst[2*x +1]= (3*
src[x+0] +
src[x+srcStride+1])>>2;
1432 dst[2*x+dstStride+2]= (
src[x+0] + 3*
src[x+srcStride+1])>>2;
1433 dst[2*x+dstStride+1]= (
src[x+1] + 3*
src[x+srcStride ])>>2;
1434 dst[2*x +2]= (3*
src[x+1] +
src[x+srcStride ])>>2;
1436 dst[srcWidth*2 -1 ]= (3*
src[srcWidth-1] +
src[srcWidth-1 + srcStride])>>2;
1437 dst[srcWidth*2 -1 + dstStride]= (
src[srcWidth-1] + 3*
src[srcWidth-1 + srcStride])>>2;
1446 for (x=0; x<srcWidth-1; x++) {
1447 dst[2*x+1]= (3*
src[x] +
src[x+1])>>2;
1448 dst[2*x+2]= (
src[x] + 3*
src[x+1])>>2;
1450 dst[2*srcWidth-1]=
src[srcWidth-1];
1452 __asm__
volatile(
EMMS" \n\t" 1458 #if !COMPILE_TEMPLATE_AMD3DNOW 1467 int lumStride,
int chromStride,
int srcStride)
1471 for (y=0; y<
height; y+=2) {
1473 "xor %%"FF_REG_a
", %%"FF_REG_a
" \n\t" 1474 "pcmpeqw %%mm7, %%mm7 \n\t" 1475 "psrlw $8, %%mm7 \n\t" 1478 PREFETCH" 64(%0, %%"FF_REG_a
", 4) \n\t" 1479 "movq (%0, %%"FF_REG_a
", 4), %%mm0 \n\t" 1480 "movq 8(%0, %%"FF_REG_a
", 4), %%mm1 \n\t" 1481 "movq %%mm0, %%mm2 \n\t" 1482 "movq %%mm1, %%mm3 \n\t" 1483 "pand %%mm7, %%mm0 \n\t" 1484 "pand %%mm7, %%mm1 \n\t" 1485 "psrlw $8, %%mm2 \n\t" 1486 "psrlw $8, %%mm3 \n\t" 1487 "packuswb %%mm1, %%mm0 \n\t" 1488 "packuswb %%mm3, %%mm2 \n\t" 1490 MOVNTQ" %%mm2, (%1, %%"FF_REG_a
", 2) \n\t" 1492 "movq 16(%0, %%"FF_REG_a
", 4), %%mm1 \n\t" 1493 "movq 24(%0, %%"FF_REG_a
", 4), %%mm2 \n\t" 1494 "movq %%mm1, %%mm3 \n\t" 1495 "movq %%mm2, %%mm4 \n\t" 1496 "pand %%mm7, %%mm1 \n\t" 1497 "pand %%mm7, %%mm2 \n\t" 1498 "psrlw $8, %%mm3 \n\t" 1499 "psrlw $8, %%mm4 \n\t" 1500 "packuswb %%mm2, %%mm1 \n\t" 1501 "packuswb %%mm4, %%mm3 \n\t" 1503 MOVNTQ" %%mm3, 8(%1, %%"FF_REG_a
", 2) \n\t" 1505 "movq %%mm0, %%mm2 \n\t" 1506 "movq %%mm1, %%mm3 \n\t" 1507 "psrlw $8, %%mm0 \n\t" 1508 "psrlw $8, %%mm1 \n\t" 1509 "pand %%mm7, %%mm2 \n\t" 1510 "pand %%mm7, %%mm3 \n\t" 1511 "packuswb %%mm1, %%mm0 \n\t" 1512 "packuswb %%mm3, %%mm2 \n\t" 1514 MOVNTQ" %%mm0, (%3, %%"FF_REG_a
") \n\t" 1515 MOVNTQ" %%mm2, (%2, %%"FF_REG_a
") \n\t" 1517 "add $8, %%"FF_REG_a
" \n\t" 1518 "cmp %4, %%"FF_REG_a
" \n\t" 1520 ::
"r"(
src),
"r"(ydst),
"r"(udst),
"r"(vdst),
"g" (chromWidth)
1521 :
"memory",
"%"FF_REG_a
1528 "xor %%"FF_REG_a
", %%"FF_REG_a
" \n\t" 1531 PREFETCH" 64(%0, %%"FF_REG_a
", 4) \n\t" 1532 "movq (%0, %%"FF_REG_a
", 4), %%mm0 \n\t" 1533 "movq 8(%0, %%"FF_REG_a
", 4), %%mm1 \n\t" 1534 "movq 16(%0, %%"FF_REG_a
", 4), %%mm2 \n\t" 1535 "movq 24(%0, %%"FF_REG_a
", 4), %%mm3 \n\t" 1536 "psrlw $8, %%mm0 \n\t" 1537 "psrlw $8, %%mm1 \n\t" 1538 "psrlw $8, %%mm2 \n\t" 1539 "psrlw $8, %%mm3 \n\t" 1540 "packuswb %%mm1, %%mm0 \n\t" 1541 "packuswb %%mm3, %%mm2 \n\t" 1543 MOVNTQ" %%mm0, (%1, %%"FF_REG_a
", 2) \n\t" 1544 MOVNTQ" %%mm2, 8(%1, %%"FF_REG_a
", 2) \n\t" 1546 "add $8, %%"FF_REG_a
" \n\t" 1547 "cmp %4, %%"FF_REG_a
" \n\t" 1550 ::
"r"(
src),
"r"(ydst),
"r"(udst),
"r"(vdst),
"g" (chromWidth)
1551 :
"memory",
"%"FF_REG_a
1553 udst += chromStride;
1554 vdst += chromStride;
1558 __asm__
volatile(
EMMS" \n\t" 1574 int lumStride,
int chromStride,
int srcStride,
1577 #define BGR2Y_IDX "16*4+16*32" 1578 #define BGR2U_IDX "16*4+16*33" 1579 #define BGR2V_IDX "16*4+16*34" 1586 ydst += 2*lumStride;
1587 udst += chromStride;
1588 vdst += chromStride;
1592 for (y=0; y<
height-2; y+=2) {
1594 for (i=0; i<2; i++) {
1596 "mov %2, %%"FF_REG_a
"\n\t" 1597 "movq "BGR2Y_IDX
"(%3), %%mm6 \n\t" 1598 "movq "MANGLE(ff_w1111)
", %%mm5 \n\t" 1599 "pxor %%mm7, %%mm7 \n\t" 1600 "lea (%%"FF_REG_a
", %%"FF_REG_a
", 2), %%"FF_REG_d
" \n\t" 1603 PREFETCH" 64(%0, %%"FF_REG_d
") \n\t" 1604 "movd (%0, %%"FF_REG_d
"), %%mm0 \n\t" 1605 "movd 3(%0, %%"FF_REG_d
"), %%mm1 \n\t" 1606 "punpcklbw %%mm7, %%mm0 \n\t" 1607 "punpcklbw %%mm7, %%mm1 \n\t" 1608 "movd 6(%0, %%"FF_REG_d
"), %%mm2 \n\t" 1609 "movd 9(%0, %%"FF_REG_d
"), %%mm3 \n\t" 1610 "punpcklbw %%mm7, %%mm2 \n\t" 1611 "punpcklbw %%mm7, %%mm3 \n\t" 1612 "pmaddwd %%mm6, %%mm0 \n\t" 1613 "pmaddwd %%mm6, %%mm1 \n\t" 1614 "pmaddwd %%mm6, %%mm2 \n\t" 1615 "pmaddwd %%mm6, %%mm3 \n\t" 1616 "psrad $8, %%mm0 \n\t" 1617 "psrad $8, %%mm1 \n\t" 1618 "psrad $8, %%mm2 \n\t" 1619 "psrad $8, %%mm3 \n\t" 1620 "packssdw %%mm1, %%mm0 \n\t" 1621 "packssdw %%mm3, %%mm2 \n\t" 1622 "pmaddwd %%mm5, %%mm0 \n\t" 1623 "pmaddwd %%mm5, %%mm2 \n\t" 1624 "packssdw %%mm2, %%mm0 \n\t" 1625 "psraw $7, %%mm0 \n\t" 1627 "movd 12(%0, %%"FF_REG_d
"), %%mm4 \n\t" 1628 "movd 15(%0, %%"FF_REG_d
"), %%mm1 \n\t" 1629 "punpcklbw %%mm7, %%mm4 \n\t" 1630 "punpcklbw %%mm7, %%mm1 \n\t" 1631 "movd 18(%0, %%"FF_REG_d
"), %%mm2 \n\t" 1632 "movd 21(%0, %%"FF_REG_d
"), %%mm3 \n\t" 1633 "punpcklbw %%mm7, %%mm2 \n\t" 1634 "punpcklbw %%mm7, %%mm3 \n\t" 1635 "pmaddwd %%mm6, %%mm4 \n\t" 1636 "pmaddwd %%mm6, %%mm1 \n\t" 1637 "pmaddwd %%mm6, %%mm2 \n\t" 1638 "pmaddwd %%mm6, %%mm3 \n\t" 1639 "psrad $8, %%mm4 \n\t" 1640 "psrad $8, %%mm1 \n\t" 1641 "psrad $8, %%mm2 \n\t" 1642 "psrad $8, %%mm3 \n\t" 1643 "packssdw %%mm1, %%mm4 \n\t" 1644 "packssdw %%mm3, %%mm2 \n\t" 1645 "pmaddwd %%mm5, %%mm4 \n\t" 1646 "pmaddwd %%mm5, %%mm2 \n\t" 1647 "add $24, %%"FF_REG_d
"\n\t" 1648 "packssdw %%mm2, %%mm4 \n\t" 1649 "psraw $7, %%mm4 \n\t" 1651 "packuswb %%mm4, %%mm0 \n\t" 1652 "paddusb "MANGLE(ff_bgr2YOffset)
", %%mm0 \n\t" 1654 MOVNTQ" %%mm0, (%1, %%"FF_REG_a
") \n\t" 1655 "add $8, %%"FF_REG_a
" \n\t" 1659 :
"%"FF_REG_a,
"%"FF_REG_d
1666 "mov %4, %%"FF_REG_a
"\n\t" 1667 "movq "MANGLE(ff_w1111)
", %%mm5 \n\t" 1668 "movq "BGR2U_IDX
"(%5), %%mm6 \n\t" 1669 "pxor %%mm7, %%mm7 \n\t" 1670 "lea (%%"FF_REG_a
", %%"FF_REG_a
", 2), %%"FF_REG_d
" \n\t" 1671 "add %%"FF_REG_d
", %%"FF_REG_d
"\n\t" 1674 PREFETCH" 64(%0, %%"FF_REG_d
") \n\t" 1675 PREFETCH" 64(%1, %%"FF_REG_d
") \n\t" 1676 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW 1677 "movq (%0, %%"FF_REG_d
"), %%mm0 \n\t" 1678 "movq (%1, %%"FF_REG_d
"), %%mm1 \n\t" 1679 "movq 6(%0, %%"FF_REG_d
"), %%mm2 \n\t" 1680 "movq 6(%1, %%"FF_REG_d
"), %%mm3 \n\t" 1681 PAVGB" %%mm1, %%mm0 \n\t" 1682 PAVGB" %%mm3, %%mm2 \n\t" 1683 "movq %%mm0, %%mm1 \n\t" 1684 "movq %%mm2, %%mm3 \n\t" 1685 "psrlq $24, %%mm0 \n\t" 1686 "psrlq $24, %%mm2 \n\t" 1687 PAVGB" %%mm1, %%mm0 \n\t" 1688 PAVGB" %%mm3, %%mm2 \n\t" 1689 "punpcklbw %%mm7, %%mm0 \n\t" 1690 "punpcklbw %%mm7, %%mm2 \n\t" 1692 "movd (%0, %%"FF_REG_d
"), %%mm0 \n\t" 1693 "movd (%1, %%"FF_REG_d
"), %%mm1 \n\t" 1694 "movd 3(%0, %%"FF_REG_d
"), %%mm2 \n\t" 1695 "movd 3(%1, %%"FF_REG_d
"), %%mm3 \n\t" 1696 "punpcklbw %%mm7, %%mm0 \n\t" 1697 "punpcklbw %%mm7, %%mm1 \n\t" 1698 "punpcklbw %%mm7, %%mm2 \n\t" 1699 "punpcklbw %%mm7, %%mm3 \n\t" 1700 "paddw %%mm1, %%mm0 \n\t" 1701 "paddw %%mm3, %%mm2 \n\t" 1702 "paddw %%mm2, %%mm0 \n\t" 1703 "movd 6(%0, %%"FF_REG_d
"), %%mm4 \n\t" 1704 "movd 6(%1, %%"FF_REG_d
"), %%mm1 \n\t" 1705 "movd 9(%0, %%"FF_REG_d
"), %%mm2 \n\t" 1706 "movd 9(%1, %%"FF_REG_d
"), %%mm3 \n\t" 1707 "punpcklbw %%mm7, %%mm4 \n\t" 1708 "punpcklbw %%mm7, %%mm1 \n\t" 1709 "punpcklbw %%mm7, %%mm2 \n\t" 1710 "punpcklbw %%mm7, %%mm3 \n\t" 1711 "paddw %%mm1, %%mm4 \n\t" 1712 "paddw %%mm3, %%mm2 \n\t" 1713 "paddw %%mm4, %%mm2 \n\t" 1714 "psrlw $2, %%mm0 \n\t" 1715 "psrlw $2, %%mm2 \n\t" 1717 "movq "BGR2V_IDX
"(%5), %%mm1 \n\t" 1718 "movq "BGR2V_IDX
"(%5), %%mm3 \n\t" 1720 "pmaddwd %%mm0, %%mm1 \n\t" 1721 "pmaddwd %%mm2, %%mm3 \n\t" 1722 "pmaddwd %%mm6, %%mm0 \n\t" 1723 "pmaddwd %%mm6, %%mm2 \n\t" 1724 "psrad $8, %%mm0 \n\t" 1725 "psrad $8, %%mm1 \n\t" 1726 "psrad $8, %%mm2 \n\t" 1727 "psrad $8, %%mm3 \n\t" 1728 "packssdw %%mm2, %%mm0 \n\t" 1729 "packssdw %%mm3, %%mm1 \n\t" 1730 "pmaddwd %%mm5, %%mm0 \n\t" 1731 "pmaddwd %%mm5, %%mm1 \n\t" 1732 "packssdw %%mm1, %%mm0 \n\t" 1733 "psraw $7, %%mm0 \n\t" 1735 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW 1736 "movq 12(%0, %%"FF_REG_d
"), %%mm4 \n\t" 1737 "movq 12(%1, %%"FF_REG_d
"), %%mm1 \n\t" 1738 "movq 18(%0, %%"FF_REG_d
"), %%mm2 \n\t" 1739 "movq 18(%1, %%"FF_REG_d
"), %%mm3 \n\t" 1740 PAVGB" %%mm1, %%mm4 \n\t" 1741 PAVGB" %%mm3, %%mm2 \n\t" 1742 "movq %%mm4, %%mm1 \n\t" 1743 "movq %%mm2, %%mm3 \n\t" 1744 "psrlq $24, %%mm4 \n\t" 1745 "psrlq $24, %%mm2 \n\t" 1746 PAVGB" %%mm1, %%mm4 \n\t" 1747 PAVGB" %%mm3, %%mm2 \n\t" 1748 "punpcklbw %%mm7, %%mm4 \n\t" 1749 "punpcklbw %%mm7, %%mm2 \n\t" 1751 "movd 12(%0, %%"FF_REG_d
"), %%mm4 \n\t" 1752 "movd 12(%1, %%"FF_REG_d
"), %%mm1 \n\t" 1753 "movd 15(%0, %%"FF_REG_d
"), %%mm2 \n\t" 1754 "movd 15(%1, %%"FF_REG_d
"), %%mm3 \n\t" 1755 "punpcklbw %%mm7, %%mm4 \n\t" 1756 "punpcklbw %%mm7, %%mm1 \n\t" 1757 "punpcklbw %%mm7, %%mm2 \n\t" 1758 "punpcklbw %%mm7, %%mm3 \n\t" 1759 "paddw %%mm1, %%mm4 \n\t" 1760 "paddw %%mm3, %%mm2 \n\t" 1761 "paddw %%mm2, %%mm4 \n\t" 1762 "movd 18(%0, %%"FF_REG_d
"), %%mm5 \n\t" 1763 "movd 18(%1, %%"FF_REG_d
"), %%mm1 \n\t" 1764 "movd 21(%0, %%"FF_REG_d
"), %%mm2 \n\t" 1765 "movd 21(%1, %%"FF_REG_d
"), %%mm3 \n\t" 1766 "punpcklbw %%mm7, %%mm5 \n\t" 1767 "punpcklbw %%mm7, %%mm1 \n\t" 1768 "punpcklbw %%mm7, %%mm2 \n\t" 1769 "punpcklbw %%mm7, %%mm3 \n\t" 1770 "paddw %%mm1, %%mm5 \n\t" 1771 "paddw %%mm3, %%mm2 \n\t" 1772 "paddw %%mm5, %%mm2 \n\t" 1773 "movq "MANGLE(ff_w1111)
", %%mm5 \n\t" 1774 "psrlw $2, %%mm4 \n\t" 1775 "psrlw $2, %%mm2 \n\t" 1777 "movq "BGR2V_IDX
"(%5), %%mm1 \n\t" 1778 "movq "BGR2V_IDX
"(%5), %%mm3 \n\t" 1780 "pmaddwd %%mm4, %%mm1 \n\t" 1781 "pmaddwd %%mm2, %%mm3 \n\t" 1782 "pmaddwd %%mm6, %%mm4 \n\t" 1783 "pmaddwd %%mm6, %%mm2 \n\t" 1784 "psrad $8, %%mm4 \n\t" 1785 "psrad $8, %%mm1 \n\t" 1786 "psrad $8, %%mm2 \n\t" 1787 "psrad $8, %%mm3 \n\t" 1788 "packssdw %%mm2, %%mm4 \n\t" 1789 "packssdw %%mm3, %%mm1 \n\t" 1790 "pmaddwd %%mm5, %%mm4 \n\t" 1791 "pmaddwd %%mm5, %%mm1 \n\t" 1792 "add $24, %%"FF_REG_d
"\n\t" 1793 "packssdw %%mm1, %%mm4 \n\t" 1794 "psraw $7, %%mm4 \n\t" 1796 "movq %%mm0, %%mm1 \n\t" 1797 "punpckldq %%mm4, %%mm0 \n\t" 1798 "punpckhdq %%mm4, %%mm1 \n\t" 1799 "packsswb %%mm1, %%mm0 \n\t" 1800 "paddb "MANGLE(ff_bgr2UVOffset)
", %%mm0 \n\t" 1801 "movd %%mm0, (%2, %%"FF_REG_a
") \n\t" 1802 "punpckhdq %%mm0, %%mm0 \n\t" 1803 "movd %%mm0, (%3, %%"FF_REG_a
") \n\t" 1804 "add $4, %%"FF_REG_a
" \n\t" 1806 : :
"r" (
src+chromWidth*6),
"r" (
src+srcStride+chromWidth*6),
"r" (udst+chromWidth),
"r" (vdst+chromWidth),
"g" (-chromWidth),
"r"(
rgb2yuv)
1808 :
"%"FF_REG_a,
"%"FF_REG_d
1811 udst += chromStride;
1812 vdst += chromStride;
1816 __asm__
volatile(
EMMS" \n\t" 1825 #if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX 1828 int src2Stride,
int dstStride)
1832 for (h=0; h <
height; h++) {
1836 #if COMPILE_TEMPLATE_SSE2 1837 if (!((((intptr_t)
src1) | ((intptr_t)src2) | ((intptr_t)dest))&15)) {
1839 "xor %%"FF_REG_a
", %%"FF_REG_a
" \n\t" 1841 PREFETCH" 64(%1, %%"FF_REG_a
") \n\t" 1842 PREFETCH" 64(%2, %%"FF_REG_a
") \n\t" 1843 "movdqa (%1, %%"FF_REG_a
"), %%xmm0 \n\t" 1844 "movdqa (%1, %%"FF_REG_a
"), %%xmm1 \n\t" 1845 "movdqa (%2, %%"FF_REG_a
"), %%xmm2 \n\t" 1846 "punpcklbw %%xmm2, %%xmm0 \n\t" 1847 "punpckhbw %%xmm2, %%xmm1 \n\t" 1848 "movntdq %%xmm0, (%0, %%"FF_REG_a
", 2) \n\t" 1849 "movntdq %%xmm1, 16(%0, %%"FF_REG_a
", 2) \n\t" 1850 "add $16, %%"FF_REG_a
" \n\t" 1851 "cmp %3, %%"FF_REG_a
" \n\t" 1853 ::
"r"(dest),
"r"(src1),
"r"(src2),
"r" ((
x86_reg)
width-15)
1854 :
"memory",
XMM_CLOBBERS(
"xmm0",
"xmm1",
"xmm2",)
"%"FF_REG_a
1859 "xor %%"FF_REG_a
", %%"FF_REG_a
" \n\t" 1861 PREFETCH" 64(%1, %%"FF_REG_a
") \n\t" 1862 PREFETCH" 64(%2, %%"FF_REG_a
") \n\t" 1863 "movq (%1, %%"FF_REG_a
"), %%mm0 \n\t" 1864 "movq 8(%1, %%"FF_REG_a
"), %%mm2 \n\t" 1865 "movq %%mm0, %%mm1 \n\t" 1866 "movq %%mm2, %%mm3 \n\t" 1867 "movq (%2, %%"FF_REG_a
"), %%mm4 \n\t" 1868 "movq 8(%2, %%"FF_REG_a
"), %%mm5 \n\t" 1869 "punpcklbw %%mm4, %%mm0 \n\t" 1870 "punpckhbw %%mm4, %%mm1 \n\t" 1871 "punpcklbw %%mm5, %%mm2 \n\t" 1872 "punpckhbw %%mm5, %%mm3 \n\t" 1873 MOVNTQ" %%mm0, (%0, %%"FF_REG_a
", 2) \n\t" 1874 MOVNTQ" %%mm1, 8(%0, %%"FF_REG_a
", 2) \n\t" 1875 MOVNTQ" %%mm2, 16(%0, %%"FF_REG_a
", 2) \n\t" 1876 MOVNTQ" %%mm3, 24(%0, %%"FF_REG_a
", 2) \n\t" 1877 "add $16, %%"FF_REG_a
" \n\t" 1878 "cmp %3, %%"FF_REG_a
" \n\t" 1880 ::
"r"(dest),
"r"(src1),
"r"(src2),
"r" ((
x86_reg)
width-15)
1881 :
"memory",
"%"FF_REG_a
1886 dest[2*w+0] =
src1[
w];
1887 dest[2*w+1] = src2[
w];
1901 #if !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL 1902 #if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_X86ASM 1911 int dst1Stride,
int dst2Stride)
1915 for (h = 0; h <
height; h++) {
1922 #
if !COMPILE_TEMPLATE_SSE2
1932 #if !COMPILE_TEMPLATE_SSE2 1933 #if !COMPILE_TEMPLATE_AMD3DNOW 1937 int srcStride1,
int srcStride2,
1938 int dstStride1,
int dstStride2)
1946 ::
"m"(*(
src1+srcStride1)),
"m"(*(src2+srcStride2)):
"memory");
1951 for (;x<w-31;x+=32) {
1954 "movq (%1,%2), %%mm0 \n\t" 1955 "movq 8(%1,%2), %%mm2 \n\t" 1956 "movq 16(%1,%2), %%mm4 \n\t" 1957 "movq 24(%1,%2), %%mm6 \n\t" 1958 "movq %%mm0, %%mm1 \n\t" 1959 "movq %%mm2, %%mm3 \n\t" 1960 "movq %%mm4, %%mm5 \n\t" 1961 "movq %%mm6, %%mm7 \n\t" 1962 "punpcklbw %%mm0, %%mm0 \n\t" 1963 "punpckhbw %%mm1, %%mm1 \n\t" 1964 "punpcklbw %%mm2, %%mm2 \n\t" 1965 "punpckhbw %%mm3, %%mm3 \n\t" 1966 "punpcklbw %%mm4, %%mm4 \n\t" 1967 "punpckhbw %%mm5, %%mm5 \n\t" 1968 "punpcklbw %%mm6, %%mm6 \n\t" 1969 "punpckhbw %%mm7, %%mm7 \n\t" 1970 MOVNTQ" %%mm0, (%0,%2,2) \n\t" 1971 MOVNTQ" %%mm1, 8(%0,%2,2) \n\t" 1972 MOVNTQ" %%mm2, 16(%0,%2,2) \n\t" 1973 MOVNTQ" %%mm3, 24(%0,%2,2) \n\t" 1974 MOVNTQ" %%mm4, 32(%0,%2,2) \n\t" 1975 MOVNTQ" %%mm5, 40(%0,%2,2) \n\t" 1976 MOVNTQ" %%mm6, 48(%0,%2,2) \n\t" 1977 MOVNTQ" %%mm7, 56(%0,%2,2)" 1978 ::
"r"(d),
"r"(s1),
"r"(x)
1981 for (;x<
w;x++) d[2*x]=d[2*x+1]=s1[x];
1984 const uint8_t*
s2=src2+srcStride2*(y>>1);
1987 for (;x<w-31;x+=32) {
1990 "movq (%1,%2), %%mm0 \n\t" 1991 "movq 8(%1,%2), %%mm2 \n\t" 1992 "movq 16(%1,%2), %%mm4 \n\t" 1993 "movq 24(%1,%2), %%mm6 \n\t" 1994 "movq %%mm0, %%mm1 \n\t" 1995 "movq %%mm2, %%mm3 \n\t" 1996 "movq %%mm4, %%mm5 \n\t" 1997 "movq %%mm6, %%mm7 \n\t" 1998 "punpcklbw %%mm0, %%mm0 \n\t" 1999 "punpckhbw %%mm1, %%mm1 \n\t" 2000 "punpcklbw %%mm2, %%mm2 \n\t" 2001 "punpckhbw %%mm3, %%mm3 \n\t" 2002 "punpcklbw %%mm4, %%mm4 \n\t" 2003 "punpckhbw %%mm5, %%mm5 \n\t" 2004 "punpcklbw %%mm6, %%mm6 \n\t" 2005 "punpckhbw %%mm7, %%mm7 \n\t" 2006 MOVNTQ" %%mm0, (%0,%2,2) \n\t" 2007 MOVNTQ" %%mm1, 8(%0,%2,2) \n\t" 2008 MOVNTQ" %%mm2, 16(%0,%2,2) \n\t" 2009 MOVNTQ" %%mm3, 24(%0,%2,2) \n\t" 2010 MOVNTQ" %%mm4, 32(%0,%2,2) \n\t" 2011 MOVNTQ" %%mm5, 40(%0,%2,2) \n\t" 2012 MOVNTQ" %%mm6, 48(%0,%2,2) \n\t" 2013 MOVNTQ" %%mm7, 56(%0,%2,2)" 2014 ::
"r"(d),
"r"(s2),
"r"(x)
2017 for (;x<
w;x++) d[2*x]=d[2*x+1]=s2[x];
2029 int srcStride1,
int srcStride2,
2030 int srcStride3,
int dstStride)
2037 const uint8_t* up=src2+srcStride2*(y>>2);
2038 const uint8_t* vp=src3+srcStride3*(y>>2);
2046 "movq (%1, %0, 4), %%mm0 \n\t" 2047 "movq (%2, %0), %%mm1 \n\t" 2048 "movq (%3, %0), %%mm2 \n\t" 2049 "movq %%mm0, %%mm3 \n\t" 2050 "movq %%mm1, %%mm4 \n\t" 2051 "movq %%mm2, %%mm5 \n\t" 2052 "punpcklbw %%mm1, %%mm1 \n\t" 2053 "punpcklbw %%mm2, %%mm2 \n\t" 2054 "punpckhbw %%mm4, %%mm4 \n\t" 2055 "punpckhbw %%mm5, %%mm5 \n\t" 2057 "movq %%mm1, %%mm6 \n\t" 2058 "punpcklbw %%mm2, %%mm1 \n\t" 2059 "punpcklbw %%mm1, %%mm0 \n\t" 2060 "punpckhbw %%mm1, %%mm3 \n\t" 2061 MOVNTQ" %%mm0, (%4, %0, 8) \n\t" 2062 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t" 2064 "punpckhbw %%mm2, %%mm6 \n\t" 2065 "movq 8(%1, %0, 4), %%mm0 \n\t" 2066 "movq %%mm0, %%mm3 \n\t" 2067 "punpcklbw %%mm6, %%mm0 \n\t" 2068 "punpckhbw %%mm6, %%mm3 \n\t" 2069 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t" 2070 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t" 2072 "movq %%mm4, %%mm6 \n\t" 2073 "movq 16(%1, %0, 4), %%mm0 \n\t" 2074 "movq %%mm0, %%mm3 \n\t" 2075 "punpcklbw %%mm5, %%mm4 \n\t" 2076 "punpcklbw %%mm4, %%mm0 \n\t" 2077 "punpckhbw %%mm4, %%mm3 \n\t" 2078 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t" 2079 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t" 2081 "punpckhbw %%mm5, %%mm6 \n\t" 2082 "movq 24(%1, %0, 4), %%mm0 \n\t" 2083 "movq %%mm0, %%mm3 \n\t" 2084 "punpcklbw %%mm6, %%mm0 \n\t" 2085 "punpckhbw %%mm6, %%mm3 \n\t" 2086 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t" 2087 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t" 2090 :
"r"(yp),
"r" (up),
"r"(vp),
"r"(d)
2094 const int x2 = x<<2;
2097 d[8*x+2] = yp[x2+1];
2099 d[8*x+4] = yp[x2+2];
2101 d[8*x+6] = yp[x2+3];
2122 "pcmpeqw %%mm7, %%mm7 \n\t" 2123 "psrlw $8, %%mm7 \n\t" 2125 "movq -30(%1, %0, 2), %%mm0 \n\t" 2126 "movq -22(%1, %0, 2), %%mm1 \n\t" 2127 "movq -14(%1, %0, 2), %%mm2 \n\t" 2128 "movq -6(%1, %0, 2), %%mm3 \n\t" 2129 "pand %%mm7, %%mm0 \n\t" 2130 "pand %%mm7, %%mm1 \n\t" 2131 "pand %%mm7, %%mm2 \n\t" 2132 "pand %%mm7, %%mm3 \n\t" 2133 "packuswb %%mm1, %%mm0 \n\t" 2134 "packuswb %%mm3, %%mm2 \n\t" 2135 MOVNTQ" %%mm0,-15(%2, %0) \n\t" 2136 MOVNTQ" %%mm2,- 7(%2, %0) \n\t" 2140 :
"r"(
src),
"r"(dst)
2160 "pcmpeqw %%mm7, %%mm7 \n\t" 2161 "psrlw $8, %%mm7 \n\t" 2163 "movq -32(%1, %0, 2), %%mm0 \n\t" 2164 "movq -24(%1, %0, 2), %%mm1 \n\t" 2165 "movq -16(%1, %0, 2), %%mm2 \n\t" 2166 "movq -8(%1, %0, 2), %%mm3 \n\t" 2167 "pand %%mm7, %%mm0 \n\t" 2168 "pand %%mm7, %%mm1 \n\t" 2169 "pand %%mm7, %%mm2 \n\t" 2170 "pand %%mm7, %%mm3 \n\t" 2171 "packuswb %%mm1, %%mm0 \n\t" 2172 "packuswb %%mm3, %%mm2 \n\t" 2173 MOVNTQ" %%mm0,-16(%2, %0) \n\t" 2174 MOVNTQ" %%mm2,- 8(%2, %0) \n\t" 2178 :
"r"(
src),
"r"(dst)
2188 #if !COMPILE_TEMPLATE_AMD3DNOW 2198 "pcmpeqw %%mm7, %%mm7 \n\t" 2199 "psrlw $8, %%mm7 \n\t" 2201 "movq -28(%1, %0, 4), %%mm0 \n\t" 2202 "movq -20(%1, %0, 4), %%mm1 \n\t" 2203 "movq -12(%1, %0, 4), %%mm2 \n\t" 2204 "movq -4(%1, %0, 4), %%mm3 \n\t" 2205 "pand %%mm7, %%mm0 \n\t" 2206 "pand %%mm7, %%mm1 \n\t" 2207 "pand %%mm7, %%mm2 \n\t" 2208 "pand %%mm7, %%mm3 \n\t" 2209 "packuswb %%mm1, %%mm0 \n\t" 2210 "packuswb %%mm3, %%mm2 \n\t" 2211 "movq %%mm0, %%mm1 \n\t" 2212 "movq %%mm2, %%mm3 \n\t" 2213 "psrlw $8, %%mm0 \n\t" 2214 "psrlw $8, %%mm2 \n\t" 2215 "pand %%mm7, %%mm1 \n\t" 2216 "pand %%mm7, %%mm3 \n\t" 2217 "packuswb %%mm2, %%mm0 \n\t" 2218 "packuswb %%mm3, %%mm1 \n\t" 2219 MOVNTQ" %%mm0,- 7(%3, %0) \n\t" 2220 MOVNTQ" %%mm1,- 7(%2, %0) \n\t" 2224 :
"r"(
src),
"r"(dst0),
"r"(dst1)
2247 "pcmpeqw %%mm7, %%mm7 \n\t" 2248 "psrlw $8, %%mm7 \n\t" 2250 "movq -28(%1, %0, 4), %%mm0 \n\t" 2251 "movq -20(%1, %0, 4), %%mm1 \n\t" 2252 "movq -12(%1, %0, 4), %%mm2 \n\t" 2253 "movq -4(%1, %0, 4), %%mm3 \n\t" 2254 PAVGB" -28(%2, %0, 4), %%mm0 \n\t" 2255 PAVGB" -20(%2, %0, 4), %%mm1 \n\t" 2256 PAVGB" -12(%2, %0, 4), %%mm2 \n\t" 2257 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t" 2258 "pand %%mm7, %%mm0 \n\t" 2259 "pand %%mm7, %%mm1 \n\t" 2260 "pand %%mm7, %%mm2 \n\t" 2261 "pand %%mm7, %%mm3 \n\t" 2262 "packuswb %%mm1, %%mm0 \n\t" 2263 "packuswb %%mm3, %%mm2 \n\t" 2264 "movq %%mm0, %%mm1 \n\t" 2265 "movq %%mm2, %%mm3 \n\t" 2266 "psrlw $8, %%mm0 \n\t" 2267 "psrlw $8, %%mm2 \n\t" 2268 "pand %%mm7, %%mm1 \n\t" 2269 "pand %%mm7, %%mm3 \n\t" 2270 "packuswb %%mm2, %%mm0 \n\t" 2271 "packuswb %%mm3, %%mm1 \n\t" 2272 MOVNTQ" %%mm0,- 7(%4, %0) \n\t" 2273 MOVNTQ" %%mm1,- 7(%3, %0) \n\t" 2277 :
"r"(
src0),
"r"(
src1),
"r"(dst0),
"r"(dst1)
2289 #if !COMPILE_TEMPLATE_AMD3DNOW 2299 "pcmpeqw %%mm7, %%mm7 \n\t" 2300 "psrlw $8, %%mm7 \n\t" 2302 "movq -28(%1, %0, 4), %%mm0 \n\t" 2303 "movq -20(%1, %0, 4), %%mm1 \n\t" 2304 "movq -12(%1, %0, 4), %%mm2 \n\t" 2305 "movq -4(%1, %0, 4), %%mm3 \n\t" 2306 "psrlw $8, %%mm0 \n\t" 2307 "psrlw $8, %%mm1 \n\t" 2308 "psrlw $8, %%mm2 \n\t" 2309 "psrlw $8, %%mm3 \n\t" 2310 "packuswb %%mm1, %%mm0 \n\t" 2311 "packuswb %%mm3, %%mm2 \n\t" 2312 "movq %%mm0, %%mm1 \n\t" 2313 "movq %%mm2, %%mm3 \n\t" 2314 "psrlw $8, %%mm0 \n\t" 2315 "psrlw $8, %%mm2 \n\t" 2316 "pand %%mm7, %%mm1 \n\t" 2317 "pand %%mm7, %%mm3 \n\t" 2318 "packuswb %%mm2, %%mm0 \n\t" 2319 "packuswb %%mm3, %%mm1 \n\t" 2320 MOVNTQ" %%mm0,- 7(%3, %0) \n\t" 2321 MOVNTQ" %%mm1,- 7(%2, %0) \n\t" 2325 :
"r"(
src),
"r"(dst0),
"r"(dst1)
2349 "pcmpeqw %%mm7, %%mm7 \n\t" 2350 "psrlw $8, %%mm7 \n\t" 2352 "movq -28(%1, %0, 4), %%mm0 \n\t" 2353 "movq -20(%1, %0, 4), %%mm1 \n\t" 2354 "movq -12(%1, %0, 4), %%mm2 \n\t" 2355 "movq -4(%1, %0, 4), %%mm3 \n\t" 2356 PAVGB" -28(%2, %0, 4), %%mm0 \n\t" 2357 PAVGB" -20(%2, %0, 4), %%mm1 \n\t" 2358 PAVGB" -12(%2, %0, 4), %%mm2 \n\t" 2359 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t" 2360 "psrlw $8, %%mm0 \n\t" 2361 "psrlw $8, %%mm1 \n\t" 2362 "psrlw $8, %%mm2 \n\t" 2363 "psrlw $8, %%mm3 \n\t" 2364 "packuswb %%mm1, %%mm0 \n\t" 2365 "packuswb %%mm3, %%mm2 \n\t" 2366 "movq %%mm0, %%mm1 \n\t" 2367 "movq %%mm2, %%mm3 \n\t" 2368 "psrlw $8, %%mm0 \n\t" 2369 "psrlw $8, %%mm2 \n\t" 2370 "pand %%mm7, %%mm1 \n\t" 2371 "pand %%mm7, %%mm3 \n\t" 2372 "packuswb %%mm2, %%mm0 \n\t" 2373 "packuswb %%mm3, %%mm1 \n\t" 2374 MOVNTQ" %%mm0,- 7(%4, %0) \n\t" 2375 MOVNTQ" %%mm1,- 7(%3, %0) \n\t" 2379 :
"r"(
src0),
"r"(
src1),
"r"(dst0),
"r"(dst1)
2395 int lumStride,
int chromStride,
int srcStride)
2400 for (y=0; y<
height; y++) {
2418 #if !COMPILE_TEMPLATE_AMD3DNOW 2421 int lumStride,
int chromStride,
int srcStride)
2426 for (y=0; y<
height; y++) {
2445 int lumStride,
int chromStride,
int srcStride)
2450 for (y=0; y<
height; y++) {
2468 #if !COMPILE_TEMPLATE_AMD3DNOW 2471 int lumStride,
int chromStride,
int srcStride)
2476 for (y=0; y<
height; y++) {
2496 #if !COMPILE_TEMPLATE_SSE2 2497 #if !COMPILE_TEMPLATE_AMD3DNOW 2526 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW 2537 #if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX 2540 #if !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL 2541 #if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_X86ASM static void RENAME() rgb32tobgr15(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() yuyvtoyuv422(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride)
static void RENAME() vu9_to_vu12(const uint8_t *src1, const uint8_t *src2, uint8_t *dst1, uint8_t *dst2, int width, int height, int srcStride1, int srcStride2, int dstStride1, int dstStride2)
static void RENAME() uyvytoyuv422(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride)
static void RENAME() rgb16tobgr24(const uint8_t *src, uint8_t *dst, int src_size)
static void fn() rgb2yuv(uint8_t *_yuv[3], const ptrdiff_t yuv_stride[3], int16_t *rgb[3], ptrdiff_t s, int w, int h, const int16_t rgb2yuv_coeffs[3][3][8], const int16_t yuv_offset[8])
static void RENAME() rgb32tobgr16(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() rgb24tobgr24(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() yuv422ptoyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride)
Width should be a multiple of 16.
static void RENAME() extract_even2(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
static void RENAME() rgb15to32(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride)
Height should be a multiple of 2 and width should be a multiple of 16.
Macro definitions for various function/variable attributes.
static void RENAME() yuy2toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride)
Height should be a multiple of 2 and width should be a multiple of 16.
static av_cold int end(AVCodecContext *avctx)
void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride, int32_t *rgb2yuv)
Height should be a multiple of 2 and width should be a multiple of 2.
static void RENAME() rgb24tobgr15(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() extract_odd2avg(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
static void RENAME() rgb24tobgr16(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() uyvytoyuv420(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride)
static void RENAME() yv12touyvy(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride)
Height should be a multiple of 2 and width should be a multiple of 16 (If this is a problem for anyone then tell me, and I will fix it.)
static av_cold void RENAME() rgb2rgb_init(void)
static void RENAME() yuvPlanartouyvy(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
void(* ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride, int32_t *rgb2yuv)
Height should be a multiple of 2 and width should be a multiple of 2.
static void RENAME() rgb24to15(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() rgb15to16(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() rgb16to15(const uint8_t *src, uint8_t *dst, int src_size)
#define XMM_CLOBBERS(...)
static void RENAME() extract_odd(const uint8_t *src, uint8_t *dst, x86_reg count)
static void RENAME() rgb32to16(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() extract_even2avg(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
static void RENAME() yuv422ptouyvy(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride)
Width should be a multiple of 16.
void(* deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t *dst2, int width, int height, int srcStride, int dst1Stride, int dst2Stride)
typedef void(RENAME(mix_any_func_type))
static void RENAME() yuvPlanartoyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
static void RENAME() rgb16to32(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() rgb15tobgr24(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() extract_even(const uint8_t *src, uint8_t *dst, x86_reg count)
static void RENAME() rgb32tobgr24(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() yuyvtoyuv420(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride)
static void RENAME() yvu9_to_yuy2(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, uint8_t *dst, int width, int height, int srcStride1, int srcStride2, int srcStride3, int dstStride)
static void RENAME() extract_odd2(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
static void RENAME() rgb24tobgr32(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() yv12toyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride)
Height should be a multiple of 2 and width should be a multiple of 16.
void(* planar2x)(const uint8_t *src, uint8_t *dst, int width, int height, int srcStride, int dstStride)
static void RENAME() rgb24to16(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() rgb32to15(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() interleaveBytes(const uint8_t *src1, const uint8_t *src2, uint8_t *dest, int width, int height, int src1Stride, int src2Stride, int dstStride)
Height should be a multiple of 2 and width should be a multiple of 2.
#define NAMED_CONSTRAINTS_ADD(...)
#define AV_CEIL_RSHIFT(a, b)