メインリポジトリ
Revision | 24a53358f2eb31b2eb210c9a9c491515dd722ad6 (tree) |
---|---|
Time | 2009-09-09 11:58:09 |
Author | Noumi Akira <noumiakira@user...> |
Committer | Noumi Akira |
Optimize DCT coefficient decoding.
@@ -529,17 +529,6 @@ static __inline void IDCT_C_8_4_MMX( | ||
529 | 529 | |
530 | 530 | /* */ |
531 | 531 | |
532 | -static const UINT8 TZZ[64] = { | |
533 | - 0, 2, 3, 9, 10, 20, 21, 35, | |
534 | - 1, 4, 8, 11, 19, 22, 34, 36, | |
535 | - 5, 7, 12, 18, 23, 33, 37, 48, | |
536 | - 6, 13, 17, 24, 32, 38, 47, 49, | |
537 | - 14, 16, 25, 31, 39, 46, 50, 57, | |
538 | - 15, 26, 30, 40, 45, 51, 56, 58, | |
539 | - 27, 29, 41, 44, 52, 55, 59, 62, | |
540 | - 28, 42, 43, 53, 54, 60, 61, 63 | |
541 | -}; | |
542 | - | |
543 | 532 | static __inline void DequantizeIDCT8x8_MMX( |
544 | 533 | const INT16* block, |
545 | 534 | const INT16* matrix, |
@@ -547,34 +536,30 @@ static __inline void DequantizeIDCT8x8_MMX( | ||
547 | 536 | { |
548 | 537 | ALIGN(0x10) INT16 c0[64]; |
549 | 538 | |
550 | - { /* Reorder */ | |
551 | - const UINT8* t = TZZ; | |
552 | - | |
553 | - INT16* c = c0; | |
554 | - INT16* e = c + 64; | |
555 | - for (; c < e; c += 8, t += 8) { | |
556 | - c[0] = block[t[0]]; | |
557 | - c[1] = block[t[1]]; | |
558 | - c[2] = block[t[2]]; | |
559 | - c[3] = block[t[3]]; | |
560 | - c[4] = block[t[4]]; | |
561 | - c[5] = block[t[5]]; | |
562 | - c[6] = block[t[6]]; | |
563 | - c[7] = block[t[7]]; | |
564 | - } | |
565 | - } | |
566 | - | |
567 | 539 | { /* Dequantize */ |
568 | - __m64* d = (__m64*) c0; | |
569 | - __m64* e = (__m64*)(c0 + 64); | |
540 | + const __m64* b = (const __m64*)block; | |
570 | 541 | const __m64* m = (const __m64*)matrix; |
542 | + __m64* d = (__m64*) c0; | |
571 | 543 | |
572 | - for (; d < e; d += 4, m += 4) { | |
573 | - d[0] = _mm_mullo_pi16(d[0], m[0]); | |
574 | - d[1] = _mm_mullo_pi16(d[1], m[1]); | |
575 | - d[2] = _mm_mullo_pi16(d[2], m[2]); | |
576 | - d[3] = _mm_mullo_pi16(d[3], m[3]); | |
577 | - } | |
544 | + d[ 0] = _mm_mullo_pi16(b[ 0], m[ 0]); | |
545 | + d[ 1] = _mm_mullo_pi16(b[ 1], m[ 1]); | |
546 | + d[ 2] = _mm_mullo_pi16(b[ 2], m[ 2]); | |
547 | + d[ 3] = _mm_mullo_pi16(b[ 3], m[ 3]); | |
548 | + | |
549 | + d[ 4] = _mm_mullo_pi16(b[ 4], m[ 4]); | |
550 | + d[ 5] = _mm_mullo_pi16(b[ 5], m[ 5]); | |
551 | + d[ 6] = _mm_mullo_pi16(b[ 6], m[ 6]); | |
552 | + d[ 7] = _mm_mullo_pi16(b[ 7], m[ 7]); | |
553 | + | |
554 | + d[ 8] = _mm_mullo_pi16(b[ 8], m[ 8]); | |
555 | + d[ 9] = _mm_mullo_pi16(b[ 9], m[ 9]); | |
556 | + d[10] = _mm_mullo_pi16(b[10], m[10]); | |
557 | + d[11] = _mm_mullo_pi16(b[11], m[11]); | |
558 | + | |
559 | + d[12] = _mm_mullo_pi16(b[12], m[12]); | |
560 | + d[13] = _mm_mullo_pi16(b[13], m[13]); | |
561 | + d[14] = _mm_mullo_pi16(b[14], m[14]); | |
562 | + d[15] = _mm_mullo_pi16(b[15], m[15]); | |
578 | 563 | } |
579 | 564 | |
580 | 565 | /* iDCT Row */ |
@@ -596,36 +581,15 @@ static __inline void DequantizeIDCT8x8_16_MMX( | ||
596 | 581 | { |
597 | 582 | ALIGN(0x10) INT16 c0[64]; |
598 | 583 | |
599 | - const __m64 z = _mm_setzero_si64(); | |
600 | - | |
601 | - *((__m64*)(c0 + 0x00)) = z; | |
602 | - *((__m64*)(c0 + 0x08)) = z; | |
603 | - *((__m64*)(c0 + 0x10)) = z; | |
604 | - *((__m64*)(c0 + 0x18)) = z; | |
605 | - | |
606 | - /* Reorder */ | |
607 | - c0[ 0 + 0] = block[TZZ[ 0 + 0]]; | |
608 | - c0[ 0 + 1] = block[TZZ[ 0 + 1]]; | |
609 | - c0[ 0 + 2] = block[TZZ[ 0 + 2]]; | |
610 | - c0[ 0 + 3] = block[TZZ[ 0 + 3]]; | |
611 | - | |
612 | - c0[ 8 + 0] = block[TZZ[ 8 + 0]]; | |
613 | - c0[ 8 + 1] = block[TZZ[ 8 + 1]]; | |
614 | - c0[ 8 + 2] = block[TZZ[ 8 + 2]]; | |
615 | - | |
616 | - c0[16 + 0] = block[TZZ[16 + 0]]; | |
617 | - c0[16 + 1] = block[TZZ[16 + 1]]; | |
618 | - | |
619 | - c0[24 + 0] = block[TZZ[24 + 0]]; | |
620 | - | |
621 | 584 | { /* Dequantize */ |
585 | + const __m64* b = (const __m64*)block; | |
622 | 586 | const __m64* m = (const __m64*)matrix; |
623 | 587 | __m64* d = (__m64*)c0; |
624 | 588 | |
625 | - d[0 * 2] = _mm_mullo_pi16(d[0 * 2], m[0 * 2]); | |
626 | - d[1 * 2] = _mm_mullo_pi16(d[1 * 2], m[1 * 2]); | |
627 | - d[2 * 2] = _mm_mullo_pi16(d[2 * 2], m[2 * 2]); | |
628 | - d[3 * 2] = _mm_mullo_pi16(d[3 * 2], m[3 * 2]); | |
589 | + d[0 * 2] = _mm_mullo_pi16(b[0 * 2], m[0 * 2]); | |
590 | + d[1 * 2] = _mm_mullo_pi16(b[1 * 2], m[1 * 2]); | |
591 | + d[2 * 2] = _mm_mullo_pi16(b[2 * 2], m[2 * 2]); | |
592 | + d[3 * 2] = _mm_mullo_pi16(b[3 * 2], m[3 * 2]); | |
629 | 593 | } |
630 | 594 | |
631 | 595 | /* iDCT Row */ |
@@ -679,28 +643,50 @@ struct DecodeCoefficientsContext { | ||
679 | 643 | |
680 | 644 | typedef struct DecodeCoefficientsContext DecodeCoefficientsContext_t; |
681 | 645 | |
646 | +ALIGN(0x10) static const UINT8 IZZ[64] = { | |
647 | + 0, 8, 1, 2, 9, 16, 24, 17, | |
648 | + 10, 3, 4, 11, 18, 25, 32, 40, | |
649 | + 33, 26, 19, 12, 5, 6, 13, 20, | |
650 | + 27, 34, 41, 48, 56, 49, 42, 35, | |
651 | + 28, 21, 14, 7, 15, 22, 29, 36, | |
652 | + 43, 50, 57, 58, 51, 44, 37, 30, | |
653 | + 23, 31, 38, 45, 52, 59, 60, 53, | |
654 | + 46, 39, 47, 54, 61, 62, 55, 63 | |
655 | +}; | |
656 | + | |
682 | 657 | static INT32 DecodeCoefficients_MMX( |
683 | 658 | FrameDecoder_t* t, |
684 | 659 | DecodeCoefficientsContext_t* ctx, |
685 | 660 | INT16* block) |
686 | 661 | { |
687 | - INT16* b = block; | |
688 | - INT16* e = b + 64; | |
662 | + const INT8* bi = IZZ; | |
663 | + const INT8* ei = IZZ + 64; | |
689 | 664 | |
690 | 665 | DecodeCoefficientsLeaf_t* leaf = ctx->Leaf; |
691 | 666 | |
692 | 667 | const __m64 z = _mm_setzero_si64(); |
693 | 668 | |
694 | - for (; b < e; b += 16) { | |
695 | - *((__m64*)(b + 0)) = z; | |
696 | - *((__m64*)(b + 4)) = z; | |
697 | - *((__m64*)(b + 8)) = z; | |
698 | - *((__m64*)(b + 12)) = z; | |
699 | - } | |
669 | + *((__m64*)(block + 0x00)) = z; | |
670 | + *((__m64*)(block + 0x04)) = z; | |
671 | + *((__m64*)(block + 0x08)) = z; | |
672 | + *((__m64*)(block + 0x0c)) = z; | |
673 | + | |
674 | + *((__m64*)(block + 0x10)) = z; | |
675 | + *((__m64*)(block + 0x14)) = z; | |
676 | + *((__m64*)(block + 0x18)) = z; | |
677 | + *((__m64*)(block + 0x1c)) = z; | |
700 | 678 | |
701 | - b = block; | |
679 | + *((__m64*)(block + 0x20)) = z; | |
680 | + *((__m64*)(block + 0x24)) = z; | |
681 | + *((__m64*)(block + 0x28)) = z; | |
682 | + *((__m64*)(block + 0x2c)) = z; | |
702 | 683 | |
703 | - while (b < e) { | |
684 | + *((__m64*)(block + 0x30)) = z; | |
685 | + *((__m64*)(block + 0x34)) = z; | |
686 | + *((__m64*)(block + 0x38)) = z; | |
687 | + *((__m64*)(block + 0x3c)) = z; | |
688 | + | |
689 | + while (bi < ei) { | |
704 | 690 | if (leaf->EOB_Run > 0) { |
705 | 691 | leaf->EOB_Run -= 1; |
706 | 692 | break; |
@@ -713,19 +699,16 @@ static INT32 DecodeCoefficients_MMX( | ||
713 | 699 | leaf->EOB_Run = coeff; |
714 | 700 | |
715 | 701 | } else { |
716 | - b += run; | |
717 | - if (b >= e) { | |
718 | - break; | |
719 | - } | |
702 | + bi += run; | |
720 | 703 | |
721 | - *(b++) = coeff; | |
704 | + block[*(bi++)] = coeff; | |
722 | 705 | |
723 | - leaf = ctx->Leaf + (b - block); | |
706 | + leaf = ctx->Leaf + (bi - IZZ); | |
724 | 707 | } |
725 | 708 | } |
726 | 709 | } |
727 | 710 | |
728 | - return b - block; | |
711 | + return bi - IZZ; | |
729 | 712 | } |
730 | 713 | |
731 | 714 | /* */ |
@@ -741,7 +724,7 @@ static void Reconstruct_IntraBlock( | ||
741 | 724 | Plane_t* r, |
742 | 725 | DecodeCoefficientsContext_t* ctx) |
743 | 726 | { |
744 | - ALIGN(0x10) INT16 block[64]; | |
727 | + ALIGN(0x10) INT16 block[64 + 64]; | |
745 | 728 | ALIGN(0x10) INT16 coeff[64]; |
746 | 729 | |
747 | 730 | const INT16 (*mat)[64] = t->Reconstructor->Matrix[qi][0]; |
@@ -783,7 +766,7 @@ static void Reconstruct_InterBlock( | ||
783 | 766 | Plane_t* r, |
784 | 767 | DecodeCoefficientsContext_t* ctx) |
785 | 768 | { |
786 | - ALIGN(0x10) INT16 block[64]; | |
769 | + ALIGN(0x10) INT16 block[64 + 64]; | |
787 | 770 | ALIGN(0x10) INT16 coeff[64]; |
788 | 771 | |
789 | 772 | const INT16 (*mat)[64] = t->Reconstructor->Matrix[qi][1]; |
@@ -537,17 +537,6 @@ static __inline void Transpose_U_SSE2( | ||
537 | 537 | |
538 | 538 | /* */ |
539 | 539 | |
540 | -static const UINT8 TZZ[64] = { | |
541 | - 0, 2, 3, 9, 10, 20, 21, 35, | |
542 | - 1, 4, 8, 11, 19, 22, 34, 36, | |
543 | - 5, 7, 12, 18, 23, 33, 37, 48, | |
544 | - 6, 13, 17, 24, 32, 38, 47, 49, | |
545 | - 14, 16, 25, 31, 39, 46, 50, 57, | |
546 | - 15, 26, 30, 40, 45, 51, 56, 58, | |
547 | - 27, 29, 41, 44, 52, 55, 59, 62, | |
548 | - 28, 42, 43, 53, 54, 60, 61, 63 | |
549 | -}; | |
550 | - | |
551 | 540 | static __inline void DequantizeIDCT8x8_SSE2( |
552 | 541 | const INT16* block, |
553 | 542 | const INT16* matrix, |
@@ -555,35 +544,19 @@ static __inline void DequantizeIDCT8x8_SSE2( | ||
555 | 544 | { |
556 | 545 | ALIGN(0x10) INT16 c0[64]; |
557 | 546 | |
558 | - { /* Reorder */ | |
559 | - const UINT8* t = TZZ; | |
560 | - | |
561 | - INT16* c = c0; | |
562 | - INT16* e = c + 64; | |
563 | - for (; c < e; c += 8, t += 8) { | |
564 | - c[0] = block[t[0]]; | |
565 | - c[1] = block[t[1]]; | |
566 | - c[2] = block[t[2]]; | |
567 | - c[3] = block[t[3]]; | |
568 | - c[4] = block[t[4]]; | |
569 | - c[5] = block[t[5]]; | |
570 | - c[6] = block[t[6]]; | |
571 | - c[7] = block[t[7]]; | |
572 | - } | |
573 | - } | |
574 | - | |
575 | 547 | { /* Dequantize */ |
548 | + const __m128i* b = (const __m128i*)block; | |
576 | 549 | const __m128i* m = (const __m128i*)matrix; |
577 | 550 | __m128i* d = (__m128i*)c0; |
578 | 551 | |
579 | - d[0] = _mm_mullo_epi16(d[0], m[0]); | |
580 | - d[1] = _mm_mullo_epi16(d[1], m[1]); | |
581 | - d[2] = _mm_mullo_epi16(d[2], m[2]); | |
582 | - d[3] = _mm_mullo_epi16(d[3], m[3]); | |
583 | - d[4] = _mm_mullo_epi16(d[4], m[4]); | |
584 | - d[5] = _mm_mullo_epi16(d[5], m[5]); | |
585 | - d[6] = _mm_mullo_epi16(d[6], m[6]); | |
586 | - d[7] = _mm_mullo_epi16(d[7], m[7]); | |
552 | + d[0] = _mm_mullo_epi16(b[0], m[0]); | |
553 | + d[1] = _mm_mullo_epi16(b[1], m[1]); | |
554 | + d[2] = _mm_mullo_epi16(b[2], m[2]); | |
555 | + d[3] = _mm_mullo_epi16(b[3], m[3]); | |
556 | + d[4] = _mm_mullo_epi16(b[4], m[4]); | |
557 | + d[5] = _mm_mullo_epi16(b[5], m[5]); | |
558 | + d[6] = _mm_mullo_epi16(b[6], m[6]); | |
559 | + d[7] = _mm_mullo_epi16(b[7], m[7]); | |
587 | 560 | } |
588 | 561 | |
589 | 562 | /* iDCT Row */ |
@@ -605,36 +578,15 @@ static __inline void DequantizeIDCT8x8_16_SSE2( | ||
605 | 578 | { |
606 | 579 | ALIGN(0x10) INT16 c0[64]; |
607 | 580 | |
608 | - const __m128i z = _mm_setzero_si128(); | |
609 | - | |
610 | - _mm_store_si128((__m128i*)(c0 + 0x00), z); | |
611 | - _mm_store_si128((__m128i*)(c0 + 0x08), z); | |
612 | - _mm_store_si128((__m128i*)(c0 + 0x10), z); | |
613 | - _mm_store_si128((__m128i*)(c0 + 0x18), z); | |
614 | - | |
615 | - /* Reorder */ | |
616 | - c0[ 0 + 0] = block[TZZ[ 0 + 0]]; | |
617 | - c0[ 0 + 1] = block[TZZ[ 0 + 1]]; | |
618 | - c0[ 0 + 2] = block[TZZ[ 0 + 2]]; | |
619 | - c0[ 0 + 3] = block[TZZ[ 0 + 3]]; | |
620 | - | |
621 | - c0[ 8 + 0] = block[TZZ[ 8 + 0]]; | |
622 | - c0[ 8 + 1] = block[TZZ[ 8 + 1]]; | |
623 | - c0[ 8 + 2] = block[TZZ[ 8 + 2]]; | |
624 | - | |
625 | - c0[16 + 0] = block[TZZ[16 + 0]]; | |
626 | - c0[16 + 1] = block[TZZ[16 + 1]]; | |
627 | - | |
628 | - c0[24 + 0] = block[TZZ[24 + 0]]; | |
629 | - | |
630 | 581 | { /* Dequantize */ |
582 | + const __m64* b = (__m64*)block; | |
631 | 583 | const __m64* m = (const __m64*)matrix; |
632 | 584 | __m64* d = (__m64*)c0; |
633 | 585 | |
634 | - d[0 * 2] = _mm_mullo_pi16(d[0 * 2], m[0 * 2]); | |
635 | - d[1 * 2] = _mm_mullo_pi16(d[1 * 2], m[1 * 2]); | |
636 | - d[2 * 2] = _mm_mullo_pi16(d[2 * 2], m[2 * 2]); | |
637 | - d[3 * 2] = _mm_mullo_pi16(d[3 * 2], m[3 * 2]); | |
586 | + d[0 * 2] = _mm_mullo_pi16(b[0 * 2], m[0 * 2]); | |
587 | + d[1 * 2] = _mm_mullo_pi16(b[1 * 2], m[1 * 2]); | |
588 | + d[2 * 2] = _mm_mullo_pi16(b[2 * 2], m[2 * 2]); | |
589 | + d[3 * 2] = _mm_mullo_pi16(b[3 * 2], m[3 * 2]); | |
638 | 590 | } |
639 | 591 | |
640 | 592 | /* iDCT Row */ |
@@ -688,13 +640,24 @@ struct DecodeCoefficientsContext { | ||
688 | 640 | |
689 | 641 | typedef struct DecodeCoefficientsContext DecodeCoefficientsContext_t; |
690 | 642 | |
643 | +ALIGN(0x10) static const UINT8 IZZ[64] = { | |
644 | + 0, 8, 1, 2, 9, 16, 24, 17, | |
645 | + 10, 3, 4, 11, 18, 25, 32, 40, | |
646 | + 33, 26, 19, 12, 5, 6, 13, 20, | |
647 | + 27, 34, 41, 48, 56, 49, 42, 35, | |
648 | + 28, 21, 14, 7, 15, 22, 29, 36, | |
649 | + 43, 50, 57, 58, 51, 44, 37, 30, | |
650 | + 23, 31, 38, 45, 52, 59, 60, 53, | |
651 | + 46, 39, 47, 54, 61, 62, 55, 63 | |
652 | +}; | |
653 | + | |
691 | 654 | static INT32 DecodeCoefficients_SSE2( |
692 | 655 | FrameDecoder_t* t, |
693 | 656 | DecodeCoefficientsContext_t* ctx, |
694 | 657 | INT16* block) |
695 | 658 | { |
696 | - INT16* b = block; | |
697 | - INT16* e = b + 64; | |
659 | + const INT8* bi = IZZ; | |
660 | + const INT8* ei = IZZ + 64; | |
698 | 661 | |
699 | 662 | DecodeCoefficientsLeaf_t* leaf = ctx->Leaf; |
700 | 663 |
@@ -709,7 +672,7 @@ static INT32 DecodeCoefficients_SSE2( | ||
709 | 672 | _mm_store_si128((__m128i*)(block + 0x30), z); |
710 | 673 | _mm_store_si128((__m128i*)(block + 0x38), z); |
711 | 674 | |
712 | - while (b < e) { | |
675 | + while (bi < ei) { | |
713 | 676 | if (leaf->EOB_Run > 0) { |
714 | 677 | leaf->EOB_Run -= 1; |
715 | 678 | break; |
@@ -722,19 +685,16 @@ static INT32 DecodeCoefficients_SSE2( | ||
722 | 685 | leaf->EOB_Run = coeff; |
723 | 686 | |
724 | 687 | } else { |
725 | - b += run; | |
726 | - if (b >= e) { | |
727 | - break; | |
728 | - } | |
688 | + bi += run; | |
729 | 689 | |
730 | - *(b++) = coeff; | |
690 | + block[*(bi++)] = coeff; | |
731 | 691 | |
732 | - leaf = ctx->Leaf + (b - block); | |
692 | + leaf = ctx->Leaf + (bi - IZZ); | |
733 | 693 | } |
734 | 694 | } |
735 | 695 | } |
736 | 696 | |
737 | - return b - block; | |
697 | + return bi - IZZ; | |
738 | 698 | } |
739 | 699 | |
740 | 700 | /* */ |
@@ -750,7 +710,7 @@ static void Reconstruct_IntraBlock( | ||
750 | 710 | Plane_t* r, |
751 | 711 | DecodeCoefficientsContext_t* ctx) |
752 | 712 | { |
753 | - ALIGN(0x10) INT16 block[64]; | |
713 | + ALIGN(0x10) INT16 block[64 + 64]; | |
754 | 714 | ALIGN(0x10) INT16 coeff[64]; |
755 | 715 | |
756 | 716 | const INT16 (*mat)[64] = t->Reconstructor->Matrix[qi][0]; |
@@ -790,7 +750,7 @@ static void Reconstruct_InterBlock( | ||
790 | 750 | Plane_t* r, |
791 | 751 | DecodeCoefficientsContext_t* ctx) |
792 | 752 | { |
793 | - ALIGN(0x10) INT16 block[64]; | |
753 | + ALIGN(0x10) INT16 block[64 + 64]; | |
794 | 754 | ALIGN(0x10) INT16 coeff[64]; |
795 | 755 | |
796 | 756 | const INT16 (*mat)[64] = t->Reconstructor->Matrix[qi][1]; |