• R/O
  • HTTP
  • SSH
  • HTTPS

main: Commit

メインリポジトリ


Commit MetaInfo

Revision: 24a53358f2eb31b2eb210c9a9c491515dd722ad6 (tree)
Time: 2009-09-09 11:58:09
Author: Noumi Akira <noumiakira@user...>
Committer: Noumi Akira

Log Message

optimize DCT coefficients decoding.

Change Summary

Incremental Difference

--- a/Lib/QTheoraEx/FrameReconstructor_MMX.c
+++ b/Lib/QTheoraEx/FrameReconstructor_MMX.c
@@ -529,17 +529,6 @@ static __inline void IDCT_C_8_4_MMX(
529529
530530 /* */
531531
532-static const UINT8 TZZ[64] = {
533- 0, 2, 3, 9, 10, 20, 21, 35,
534- 1, 4, 8, 11, 19, 22, 34, 36,
535- 5, 7, 12, 18, 23, 33, 37, 48,
536- 6, 13, 17, 24, 32, 38, 47, 49,
537- 14, 16, 25, 31, 39, 46, 50, 57,
538- 15, 26, 30, 40, 45, 51, 56, 58,
539- 27, 29, 41, 44, 52, 55, 59, 62,
540- 28, 42, 43, 53, 54, 60, 61, 63
541-};
542-
543532 static __inline void DequantizeIDCT8x8_MMX(
544533 const INT16* block,
545534 const INT16* matrix,
@@ -547,34 +536,30 @@ static __inline void DequantizeIDCT8x8_MMX(
547536 {
548537 ALIGN(0x10) INT16 c0[64];
549538
550- { /* Reorder */
551- const UINT8* t = TZZ;
552-
553- INT16* c = c0;
554- INT16* e = c + 64;
555- for (; c < e; c += 8, t += 8) {
556- c[0] = block[t[0]];
557- c[1] = block[t[1]];
558- c[2] = block[t[2]];
559- c[3] = block[t[3]];
560- c[4] = block[t[4]];
561- c[5] = block[t[5]];
562- c[6] = block[t[6]];
563- c[7] = block[t[7]];
564- }
565- }
566-
567539 { /* Dequantize */
568- __m64* d = (__m64*) c0;
569- __m64* e = (__m64*)(c0 + 64);
540+ const __m64* b = (const __m64*)block;
570541 const __m64* m = (const __m64*)matrix;
542+ __m64* d = (__m64*) c0;
571543
572- for (; d < e; d += 4, m += 4) {
573- d[0] = _mm_mullo_pi16(d[0], m[0]);
574- d[1] = _mm_mullo_pi16(d[1], m[1]);
575- d[2] = _mm_mullo_pi16(d[2], m[2]);
576- d[3] = _mm_mullo_pi16(d[3], m[3]);
577- }
544+ d[ 0] = _mm_mullo_pi16(b[ 0], m[ 0]);
545+ d[ 1] = _mm_mullo_pi16(b[ 1], m[ 1]);
546+ d[ 2] = _mm_mullo_pi16(b[ 2], m[ 2]);
547+ d[ 3] = _mm_mullo_pi16(b[ 3], m[ 3]);
548+
549+ d[ 4] = _mm_mullo_pi16(b[ 4], m[ 4]);
550+ d[ 5] = _mm_mullo_pi16(b[ 5], m[ 5]);
551+ d[ 6] = _mm_mullo_pi16(b[ 6], m[ 6]);
552+ d[ 7] = _mm_mullo_pi16(b[ 7], m[ 7]);
553+
554+ d[ 8] = _mm_mullo_pi16(b[ 8], m[ 8]);
555+ d[ 9] = _mm_mullo_pi16(b[ 9], m[ 9]);
556+ d[10] = _mm_mullo_pi16(b[10], m[10]);
557+ d[11] = _mm_mullo_pi16(b[11], m[11]);
558+
559+ d[12] = _mm_mullo_pi16(b[12], m[12]);
560+ d[13] = _mm_mullo_pi16(b[13], m[13]);
561+ d[14] = _mm_mullo_pi16(b[14], m[14]);
562+ d[15] = _mm_mullo_pi16(b[15], m[15]);
578563 }
579564
580565 /* iDCT Row */
@@ -596,36 +581,15 @@ static __inline void DequantizeIDCT8x8_16_MMX(
596581 {
597582 ALIGN(0x10) INT16 c0[64];
598583
599- const __m64 z = _mm_setzero_si64();
600-
601- *((__m64*)(c0 + 0x00)) = z;
602- *((__m64*)(c0 + 0x08)) = z;
603- *((__m64*)(c0 + 0x10)) = z;
604- *((__m64*)(c0 + 0x18)) = z;
605-
606- /* Reorder */
607- c0[ 0 + 0] = block[TZZ[ 0 + 0]];
608- c0[ 0 + 1] = block[TZZ[ 0 + 1]];
609- c0[ 0 + 2] = block[TZZ[ 0 + 2]];
610- c0[ 0 + 3] = block[TZZ[ 0 + 3]];
611-
612- c0[ 8 + 0] = block[TZZ[ 8 + 0]];
613- c0[ 8 + 1] = block[TZZ[ 8 + 1]];
614- c0[ 8 + 2] = block[TZZ[ 8 + 2]];
615-
616- c0[16 + 0] = block[TZZ[16 + 0]];
617- c0[16 + 1] = block[TZZ[16 + 1]];
618-
619- c0[24 + 0] = block[TZZ[24 + 0]];
620-
621584 { /* Dequantize */
585+ const __m64* b = (const __m64*)block;
622586 const __m64* m = (const __m64*)matrix;
623587 __m64* d = (__m64*)c0;
624588
625- d[0 * 2] = _mm_mullo_pi16(d[0 * 2], m[0 * 2]);
626- d[1 * 2] = _mm_mullo_pi16(d[1 * 2], m[1 * 2]);
627- d[2 * 2] = _mm_mullo_pi16(d[2 * 2], m[2 * 2]);
628- d[3 * 2] = _mm_mullo_pi16(d[3 * 2], m[3 * 2]);
589+ d[0 * 2] = _mm_mullo_pi16(b[0 * 2], m[0 * 2]);
590+ d[1 * 2] = _mm_mullo_pi16(b[1 * 2], m[1 * 2]);
591+ d[2 * 2] = _mm_mullo_pi16(b[2 * 2], m[2 * 2]);
592+ d[3 * 2] = _mm_mullo_pi16(b[3 * 2], m[3 * 2]);
629593 }
630594
631595 /* iDCT Row */
@@ -679,28 +643,50 @@ struct DecodeCoefficientsContext {
679643
680644 typedef struct DecodeCoefficientsContext DecodeCoefficientsContext_t;
681645
646+ALIGN(0x10) static const UINT8 IZZ[64] = {
647+ 0, 8, 1, 2, 9, 16, 24, 17,
648+ 10, 3, 4, 11, 18, 25, 32, 40,
649+ 33, 26, 19, 12, 5, 6, 13, 20,
650+ 27, 34, 41, 48, 56, 49, 42, 35,
651+ 28, 21, 14, 7, 15, 22, 29, 36,
652+ 43, 50, 57, 58, 51, 44, 37, 30,
653+ 23, 31, 38, 45, 52, 59, 60, 53,
654+ 46, 39, 47, 54, 61, 62, 55, 63
655+};
656+
682657 static INT32 DecodeCoefficients_MMX(
683658 FrameDecoder_t* t,
684659 DecodeCoefficientsContext_t* ctx,
685660 INT16* block)
686661 {
687- INT16* b = block;
688- INT16* e = b + 64;
662+ const INT8* bi = IZZ;
663+ const INT8* ei = IZZ + 64;
689664
690665 DecodeCoefficientsLeaf_t* leaf = ctx->Leaf;
691666
692667 const __m64 z = _mm_setzero_si64();
693668
694- for (; b < e; b += 16) {
695- *((__m64*)(b + 0)) = z;
696- *((__m64*)(b + 4)) = z;
697- *((__m64*)(b + 8)) = z;
698- *((__m64*)(b + 12)) = z;
699- }
669+ *((__m64*)(block + 0x00)) = z;
670+ *((__m64*)(block + 0x04)) = z;
671+ *((__m64*)(block + 0x08)) = z;
672+ *((__m64*)(block + 0x0c)) = z;
673+
674+ *((__m64*)(block + 0x10)) = z;
675+ *((__m64*)(block + 0x14)) = z;
676+ *((__m64*)(block + 0x18)) = z;
677+ *((__m64*)(block + 0x1c)) = z;
700678
701- b = block;
679+ *((__m64*)(block + 0x20)) = z;
680+ *((__m64*)(block + 0x24)) = z;
681+ *((__m64*)(block + 0x28)) = z;
682+ *((__m64*)(block + 0x2c)) = z;
702683
703- while (b < e) {
684+ *((__m64*)(block + 0x30)) = z;
685+ *((__m64*)(block + 0x34)) = z;
686+ *((__m64*)(block + 0x38)) = z;
687+ *((__m64*)(block + 0x3c)) = z;
688+
689+ while (bi < ei) {
704690 if (leaf->EOB_Run > 0) {
705691 leaf->EOB_Run -= 1;
706692 break;
@@ -713,19 +699,16 @@ static INT32 DecodeCoefficients_MMX(
713699 leaf->EOB_Run = coeff;
714700
715701 } else {
716- b += run;
717- if (b >= e) {
718- break;
719- }
702+ bi += run;
720703
721- *(b++) = coeff;
704+ block[*(bi++)] = coeff;
722705
723- leaf = ctx->Leaf + (b - block);
706+ leaf = ctx->Leaf + (bi - IZZ);
724707 }
725708 }
726709 }
727710
728- return b - block;
711+ return bi - IZZ;
729712 }
730713
731714 /* */
@@ -741,7 +724,7 @@ static void Reconstruct_IntraBlock(
741724 Plane_t* r,
742725 DecodeCoefficientsContext_t* ctx)
743726 {
744- ALIGN(0x10) INT16 block[64];
727+ ALIGN(0x10) INT16 block[64 + 64];
745728 ALIGN(0x10) INT16 coeff[64];
746729
747730 const INT16 (*mat)[64] = t->Reconstructor->Matrix[qi][0];
@@ -783,7 +766,7 @@ static void Reconstruct_InterBlock(
783766 Plane_t* r,
784767 DecodeCoefficientsContext_t* ctx)
785768 {
786- ALIGN(0x10) INT16 block[64];
769+ ALIGN(0x10) INT16 block[64 + 64];
787770 ALIGN(0x10) INT16 coeff[64];
788771
789772 const INT16 (*mat)[64] = t->Reconstructor->Matrix[qi][1];
--- a/Lib/QTheoraEx/FrameReconstructor_SSE2.c
+++ b/Lib/QTheoraEx/FrameReconstructor_SSE2.c
@@ -537,17 +537,6 @@ static __inline void Transpose_U_SSE2(
537537
538538 /* */
539539
540-static const UINT8 TZZ[64] = {
541- 0, 2, 3, 9, 10, 20, 21, 35,
542- 1, 4, 8, 11, 19, 22, 34, 36,
543- 5, 7, 12, 18, 23, 33, 37, 48,
544- 6, 13, 17, 24, 32, 38, 47, 49,
545- 14, 16, 25, 31, 39, 46, 50, 57,
546- 15, 26, 30, 40, 45, 51, 56, 58,
547- 27, 29, 41, 44, 52, 55, 59, 62,
548- 28, 42, 43, 53, 54, 60, 61, 63
549-};
550-
551540 static __inline void DequantizeIDCT8x8_SSE2(
552541 const INT16* block,
553542 const INT16* matrix,
@@ -555,35 +544,19 @@ static __inline void DequantizeIDCT8x8_SSE2(
555544 {
556545 ALIGN(0x10) INT16 c0[64];
557546
558- { /* Reorder */
559- const UINT8* t = TZZ;
560-
561- INT16* c = c0;
562- INT16* e = c + 64;
563- for (; c < e; c += 8, t += 8) {
564- c[0] = block[t[0]];
565- c[1] = block[t[1]];
566- c[2] = block[t[2]];
567- c[3] = block[t[3]];
568- c[4] = block[t[4]];
569- c[5] = block[t[5]];
570- c[6] = block[t[6]];
571- c[7] = block[t[7]];
572- }
573- }
574-
575547 { /* Dequantize */
548+ const __m128i* b = (const __m128i*)block;
576549 const __m128i* m = (const __m128i*)matrix;
577550 __m128i* d = (__m128i*)c0;
578551
579- d[0] = _mm_mullo_epi16(d[0], m[0]);
580- d[1] = _mm_mullo_epi16(d[1], m[1]);
581- d[2] = _mm_mullo_epi16(d[2], m[2]);
582- d[3] = _mm_mullo_epi16(d[3], m[3]);
583- d[4] = _mm_mullo_epi16(d[4], m[4]);
584- d[5] = _mm_mullo_epi16(d[5], m[5]);
585- d[6] = _mm_mullo_epi16(d[6], m[6]);
586- d[7] = _mm_mullo_epi16(d[7], m[7]);
552+ d[0] = _mm_mullo_epi16(b[0], m[0]);
553+ d[1] = _mm_mullo_epi16(b[1], m[1]);
554+ d[2] = _mm_mullo_epi16(b[2], m[2]);
555+ d[3] = _mm_mullo_epi16(b[3], m[3]);
556+ d[4] = _mm_mullo_epi16(b[4], m[4]);
557+ d[5] = _mm_mullo_epi16(b[5], m[5]);
558+ d[6] = _mm_mullo_epi16(b[6], m[6]);
559+ d[7] = _mm_mullo_epi16(b[7], m[7]);
587560 }
588561
589562 /* iDCT Row */
@@ -605,36 +578,15 @@ static __inline void DequantizeIDCT8x8_16_SSE2(
605578 {
606579 ALIGN(0x10) INT16 c0[64];
607580
608- const __m128i z = _mm_setzero_si128();
609-
610- _mm_store_si128((__m128i*)(c0 + 0x00), z);
611- _mm_store_si128((__m128i*)(c0 + 0x08), z);
612- _mm_store_si128((__m128i*)(c0 + 0x10), z);
613- _mm_store_si128((__m128i*)(c0 + 0x18), z);
614-
615- /* Reorder */
616- c0[ 0 + 0] = block[TZZ[ 0 + 0]];
617- c0[ 0 + 1] = block[TZZ[ 0 + 1]];
618- c0[ 0 + 2] = block[TZZ[ 0 + 2]];
619- c0[ 0 + 3] = block[TZZ[ 0 + 3]];
620-
621- c0[ 8 + 0] = block[TZZ[ 8 + 0]];
622- c0[ 8 + 1] = block[TZZ[ 8 + 1]];
623- c0[ 8 + 2] = block[TZZ[ 8 + 2]];
624-
625- c0[16 + 0] = block[TZZ[16 + 0]];
626- c0[16 + 1] = block[TZZ[16 + 1]];
627-
628- c0[24 + 0] = block[TZZ[24 + 0]];
629-
630581 { /* Dequantize */
582+ const __m64* b = (__m64*)block;
631583 const __m64* m = (const __m64*)matrix;
632584 __m64* d = (__m64*)c0;
633585
634- d[0 * 2] = _mm_mullo_pi16(d[0 * 2], m[0 * 2]);
635- d[1 * 2] = _mm_mullo_pi16(d[1 * 2], m[1 * 2]);
636- d[2 * 2] = _mm_mullo_pi16(d[2 * 2], m[2 * 2]);
637- d[3 * 2] = _mm_mullo_pi16(d[3 * 2], m[3 * 2]);
586+ d[0 * 2] = _mm_mullo_pi16(b[0 * 2], m[0 * 2]);
587+ d[1 * 2] = _mm_mullo_pi16(b[1 * 2], m[1 * 2]);
588+ d[2 * 2] = _mm_mullo_pi16(b[2 * 2], m[2 * 2]);
589+ d[3 * 2] = _mm_mullo_pi16(b[3 * 2], m[3 * 2]);
638590 }
639591
640592 /* iDCT Row */
@@ -688,13 +640,24 @@ struct DecodeCoefficientsContext {
688640
689641 typedef struct DecodeCoefficientsContext DecodeCoefficientsContext_t;
690642
643+ALIGN(0x10) static const UINT8 IZZ[64] = {
644+ 0, 8, 1, 2, 9, 16, 24, 17,
645+ 10, 3, 4, 11, 18, 25, 32, 40,
646+ 33, 26, 19, 12, 5, 6, 13, 20,
647+ 27, 34, 41, 48, 56, 49, 42, 35,
648+ 28, 21, 14, 7, 15, 22, 29, 36,
649+ 43, 50, 57, 58, 51, 44, 37, 30,
650+ 23, 31, 38, 45, 52, 59, 60, 53,
651+ 46, 39, 47, 54, 61, 62, 55, 63
652+};
653+
691654 static INT32 DecodeCoefficients_SSE2(
692655 FrameDecoder_t* t,
693656 DecodeCoefficientsContext_t* ctx,
694657 INT16* block)
695658 {
696- INT16* b = block;
697- INT16* e = b + 64;
659+ const INT8* bi = IZZ;
660+ const INT8* ei = IZZ + 64;
698661
699662 DecodeCoefficientsLeaf_t* leaf = ctx->Leaf;
700663
@@ -709,7 +672,7 @@ static INT32 DecodeCoefficients_SSE2(
709672 _mm_store_si128((__m128i*)(block + 0x30), z);
710673 _mm_store_si128((__m128i*)(block + 0x38), z);
711674
712- while (b < e) {
675+ while (bi < ei) {
713676 if (leaf->EOB_Run > 0) {
714677 leaf->EOB_Run -= 1;
715678 break;
@@ -722,19 +685,16 @@ static INT32 DecodeCoefficients_SSE2(
722685 leaf->EOB_Run = coeff;
723686
724687 } else {
725- b += run;
726- if (b >= e) {
727- break;
728- }
688+ bi += run;
729689
730- *(b++) = coeff;
690+ block[*(bi++)] = coeff;
731691
732- leaf = ctx->Leaf + (b - block);
692+ leaf = ctx->Leaf + (bi - IZZ);
733693 }
734694 }
735695 }
736696
737- return b - block;
697+ return bi - IZZ;
738698 }
739699
740700 /* */
@@ -750,7 +710,7 @@ static void Reconstruct_IntraBlock(
750710 Plane_t* r,
751711 DecodeCoefficientsContext_t* ctx)
752712 {
753- ALIGN(0x10) INT16 block[64];
713+ ALIGN(0x10) INT16 block[64 + 64];
754714 ALIGN(0x10) INT16 coeff[64];
755715
756716 const INT16 (*mat)[64] = t->Reconstructor->Matrix[qi][0];
@@ -790,7 +750,7 @@ static void Reconstruct_InterBlock(
790750 Plane_t* r,
791751 DecodeCoefficientsContext_t* ctx)
792752 {
793- ALIGN(0x10) INT16 block[64];
753+ ALIGN(0x10) INT16 block[64 + 64];
794754 ALIGN(0x10) INT16 coeff[64];
795755
796756 const INT16 (*mat)[64] = t->Reconstructor->Matrix[qi][1];
Show on old repository browser