BoxBlur処理、繰り返し適用対応を少し進める。
@@ -94,10 +94,10 @@ | ||
94 | 94 | //blur_1b::test_4, |
95 | 95 | //blur_1b::test_5_h, |
96 | 96 | //blur_1b::test_5_v, |
97 | - blur_1b::test_5_h, | |
98 | - blur_1b::test_6_v, | |
99 | - //blur_1b::test_7_h, | |
100 | - //blur_1b::test_7_v, | |
97 | + //blur_1b::test_5_h, | |
98 | + //blur_1b::test_6_v, | |
99 | + blur_1b::test_7_h, | |
100 | + blur_1b::test_7_v, | |
101 | 101 | //blur_1b::test_8, |
102 | 102 | //blur_1b::test_9, |
103 | 103 | //blur_1b::test_10, |
@@ -766,137 +766,170 @@ | ||
766 | 766 | const size_t kxe0 = (size_t) std::max<int>(0, width - r); |
767 | 767 | const size_t kxs0_16end = kxs0 + (16 - (kxs0 % 16)); |
768 | 768 | const size_t kxe0_16end = kxe0 - (kxe0 % 16); |
769 | - const uint8_t* pSrcLine = pSrc; | |
770 | - uint8_t* pWorkLine = pWork; | |
771 | - for (size_t y=0; y<height; ++y) { | |
772 | - int total = *pSrcLine; | |
773 | - for (size_t kx=1; kx<kxs0; ++kx) { | |
774 | - total += pSrcLine[kx] * 2; | |
769 | + | |
770 | + for (size_t n=0; n<iterationCount; ++n) { | |
771 | + | |
772 | + const uint8_t* pFrom; | |
773 | + ptrdiff_t fromLineOffsetBytes; | |
774 | + if (n == 0) { | |
775 | + pFrom = pSrc; | |
776 | + fromLineOffsetBytes = srcLineOffsetBytes; | |
777 | + }else { | |
778 | + // TODO: make it compact | |
779 | + if (iterationCount & 1) { | |
780 | + pFrom = (n & 1) ? pWork : pWork2; | |
781 | + }else { | |
782 | + pFrom = (n & 1) ? pWork2 : pWork; | |
783 | + } | |
784 | + fromLineOffsetBytes = workLineOffsetBytes; | |
775 | 785 | } |
776 | - pWorkLine[0] = (total * invLen) >> SHIFT; | |
777 | - for (size_t x=1; x<kxs0; ++x) { | |
778 | - assert(kxs0 >= x); | |
779 | - total += - pSrcLine[kxs0 - x] + pSrcLine[kxs0 + x - 1]; | |
780 | - pWorkLine[x] = (total * invLen) >> SHIFT; | |
786 | + uint8_t* pTo; | |
787 | + ptrdiff_t toLineOffsetBytes; | |
788 | + if (n == iterationCount - 1) { | |
789 | + pTo = pWork; | |
790 | + toLineOffsetBytes = workLineOffsetBytes; | |
791 | + }else { | |
792 | + // TODO: make it compact | |
793 | + if (iterationCount & 1) { | |
794 | + pTo = (n & 1) ? pWork2 : pWork; | |
795 | + }else { | |
796 | + pTo = (n & 1) ? pWork : pWork2; | |
797 | + } | |
798 | + toLineOffsetBytes = workLineOffsetBytes; | |
781 | 799 | } |
782 | - for (size_t x=kxs0; x<kxs0_16end; ++x) { | |
783 | - total += - pSrcLine[x - r - 1] + pSrcLine[x + r]; | |
784 | - pWorkLine[x] = (total * invLen) >> SHIFT; | |
785 | - } | |
786 | 800 | |
787 | - __m128i* mPSub = (__m128i*) (pSrcLine + kxs0_16end - r - 1); | |
788 | - __m128i* mPAdd = (__m128i*) (pSrcLine + kxs0_16end + r); | |
789 | - __m128i mNextSub = _mm_loadu_si128(mPSub++); // hoist loading | |
790 | - __m128i mNextAdd = _mm_loadu_si128(mPAdd++); | |
791 | - __m128i* mPWork = (__m128i*) (pWorkLine + kxs0_16end); | |
792 | - | |
793 | -#if 1 | |
794 | - __m128i mTotal = _mm_set1_epi16(total); | |
795 | - for (size_t x=kxs0_16end; x<kxe0_16end; x+=16) { | |
796 | - __m128i mSub = mNextSub; | |
797 | - __m128i mAdd = mNextAdd; | |
798 | - mNextSub = _mm_loadu_si128(mPSub++); | |
799 | - mNextAdd = _mm_loadu_si128(mPAdd++); | |
801 | + const uint8_t* pFromLine = pFrom; | |
802 | + uint8_t* pToLine = pTo; | |
803 | + | |
804 | + for (size_t y=0; y<height; ++y) { | |
805 | + int total = *pFromLine; | |
806 | + for (size_t kx=1; kx<kxs0; ++kx) { | |
807 | + total += pFromLine[kx] * 2; | |
808 | + } | |
809 | + pToLine[0] = (total * invLen) >> SHIFT; | |
810 | + for (size_t x=1; x<kxs0; ++x) { | |
811 | + assert(kxs0 >= x); | |
812 | + total += - pFromLine[kxs0 - x] + pFromLine[kxs0 + x - 1]; | |
813 | + pToLine[x] = (total * invLen) >> SHIFT; | |
814 | + } | |
815 | + for (size_t x=kxs0; x<kxs0_16end; ++x) { | |
816 | + total += - pFromLine[x - r - 1] + pFromLine[x + r]; | |
817 | + pToLine[x] = (total * invLen) >> SHIFT; | |
818 | + } | |
800 | 819 | |
801 | - __m128i mSub0 = _mm_unpacklo_epi8(mSub, _mm_setzero_si128()); | |
802 | - __m128i mSub1 = _mm_unpackhi_epi8(mSub, _mm_setzero_si128()); | |
803 | - __m128i mAdd0 = _mm_unpacklo_epi8(mAdd, _mm_setzero_si128()); | |
804 | - __m128i mAdd1 = _mm_unpackhi_epi8(mAdd, _mm_setzero_si128()); | |
805 | - | |
806 | - __m128i mDiff0 = _mm_sub_epi16(mAdd0, mSub0); | |
807 | - __m128i mDiff1 = _mm_sub_epi16(mAdd1, mSub1); | |
808 | - mDiff0 = _mm_add_epi16(mDiff0, _mm_slli_si128(mDiff0, 2)); | |
809 | - mDiff0 = _mm_add_epi16(mDiff0, _mm_slli_si128(mDiff0, 4)); | |
810 | - mDiff0 = _mm_add_epi16(mDiff0, _mm_slli_si128(mDiff0, 8)); | |
811 | - __m128i jump = _mm_shufflehi_epi16(mDiff0, _MM_SHUFFLE(3,3,3,3)); | |
812 | - jump = _mm_unpackhi_epi64(jump, jump); | |
813 | - | |
814 | - mDiff1 = _mm_add_epi16(mDiff1, _mm_slli_si128(mDiff1, 2)); | |
815 | - mDiff1 = _mm_add_epi16(mDiff1, _mm_slli_si128(mDiff1, 4)); | |
816 | - mDiff1 = _mm_add_epi16(mDiff1, _mm_slli_si128(mDiff1, 8)); | |
817 | - mDiff1 = _mm_add_epi16(mDiff1, jump); | |
818 | - | |
819 | - __m128i left = _mm_add_epi16(mTotal, mDiff0); | |
820 | - __m128i right = _mm_add_epi16(mTotal, mDiff1); | |
821 | - __m128i left2 = _mm_mulhrs_epi16(left, mInvLeni); | |
822 | - __m128i right2 = _mm_mulhrs_epi16(right, mInvLeni); | |
823 | - __m128i result = _mm_packus_epi16(left2, right2); | |
824 | -// _mm_stream_si128(mPWork++, result); | |
825 | - *mPWork++ = result; | |
826 | - mTotal = _mm_shufflehi_epi16(right, _MM_SHUFFLE(3,3,3,3)); | |
827 | - mTotal = _mm_unpackhi_epi64(mTotal, mTotal); | |
828 | - } | |
829 | - total = _mm_extract_epi16(mTotal, 0); | |
830 | -#else | |
831 | - // SSE2 path | |
832 | - __m128i mTotal = _mm_set1_epi32(total); //_mm_shuffle_epi32(_mm_cvtsi32_si128(total), 1); | |
833 | - for (size_t x=kxs0_16end; x<kxe0_16end; x+=16) { | |
834 | - __m128i mSub = mNextSub; | |
835 | - __m128i mAdd = mNextAdd; | |
836 | - mNextSub = _mm_loadu_si128(mPSub++); | |
837 | - mNextAdd = _mm_loadu_si128(mPAdd++); | |
838 | - | |
839 | - __m128i mSub0 = _mm_unpacklo_epi8(mSub, _mm_setzero_si128()); | |
840 | - __m128i mSub1 = _mm_unpackhi_epi8(mSub, _mm_setzero_si128()); | |
841 | - __m128i mAdd0 = _mm_unpacklo_epi8(mAdd, _mm_setzero_si128()); | |
842 | - __m128i mAdd1 = _mm_unpackhi_epi8(mAdd, _mm_setzero_si128()); | |
843 | - | |
844 | - __m128i mDiff0 = _mm_sub_epi32(_mm_unpacklo_epi16(mAdd0, _mm_setzero_si128()), _mm_unpacklo_epi16(mSub0, _mm_setzero_si128())); | |
845 | - __m128i mDiff1 = _mm_sub_epi32(_mm_unpackhi_epi16(mAdd0, _mm_setzero_si128()), _mm_unpackhi_epi16(mSub0, _mm_setzero_si128())); | |
846 | - __m128i mDiff2 = _mm_sub_epi32(_mm_unpacklo_epi16(mAdd1, _mm_setzero_si128()), _mm_unpacklo_epi16(mSub1, _mm_setzero_si128())); | |
847 | - __m128i mDiff3 = _mm_sub_epi32(_mm_unpackhi_epi16(mAdd1, _mm_setzero_si128()), _mm_unpackhi_epi16(mSub1, _mm_setzero_si128())); | |
848 | - | |
849 | - mDiff0 = _mm_add_epi32(mDiff0, _mm_slli_si128(mDiff0, 4)); | |
850 | - mDiff0 = _mm_add_epi32(mDiff0, _mm_slli_si128(mDiff0, 8)); | |
851 | - mDiff1 = _mm_add_epi32(mDiff1, _mm_slli_si128(mDiff1, 4)); | |
852 | - mDiff1 = _mm_add_epi32(mDiff1, _mm_slli_si128(mDiff1, 8)); | |
853 | - mDiff2 = _mm_add_epi32(mDiff2, _mm_slli_si128(mDiff2, 4)); | |
854 | - mDiff2 = _mm_add_epi32(mDiff2, _mm_slli_si128(mDiff2, 8)); | |
855 | - mDiff3 = _mm_add_epi32(mDiff3, _mm_slli_si128(mDiff3, 4)); | |
856 | - mDiff3 = _mm_add_epi32(mDiff3, _mm_slli_si128(mDiff3, 8)); | |
820 | + __m128i* mPSub = (__m128i*) (pFromLine + kxs0_16end - r - 1); | |
821 | + __m128i* mPAdd = (__m128i*) (pFromLine + kxs0_16end + r); | |
822 | + __m128i mNextSub = _mm_loadu_si128(mPSub++); // hoist loading | |
823 | + __m128i mNextAdd = _mm_loadu_si128(mPAdd++); | |
824 | + __m128i* mPWork = (__m128i*) (pToLine + kxs0_16end); | |
857 | 825 | |
858 | - mTotal = _mm_add_epi32(mTotal, mDiff0); | |
859 | - __m128 mfTotal = _mm_cvtepi32_ps(mTotal); | |
860 | - __m128i mDest0 = _mm_cvttps_epi32(_mm_mul_ps(mfTotal, mInvLen)); | |
861 | - | |
862 | - mTotal = _mm_shuffle_epi32(mTotal, _MM_SHUFFLE(3,3,3,3)); | |
863 | - mTotal = _mm_add_epi32(mTotal, mDiff1); | |
864 | - mfTotal = _mm_cvtepi32_ps(mTotal); | |
865 | - __m128i mDest1 = _mm_cvttps_epi32(_mm_mul_ps(mfTotal, mInvLen)); | |
826 | + #if 1 | |
827 | + __m128i mTotal = _mm_set1_epi16(total); | |
828 | + for (size_t x=kxs0_16end; x<kxe0_16end; x+=16) { | |
829 | + __m128i mSub = mNextSub; | |
830 | + __m128i mAdd = mNextAdd; | |
831 | + mNextSub = _mm_loadu_si128(mPSub++); | |
832 | + mNextAdd = _mm_loadu_si128(mPAdd++); | |
833 | + | |
834 | + __m128i mSub0 = _mm_unpacklo_epi8(mSub, _mm_setzero_si128()); | |
835 | + __m128i mSub1 = _mm_unpackhi_epi8(mSub, _mm_setzero_si128()); | |
836 | + __m128i mAdd0 = _mm_unpacklo_epi8(mAdd, _mm_setzero_si128()); | |
837 | + __m128i mAdd1 = _mm_unpackhi_epi8(mAdd, _mm_setzero_si128()); | |
838 | + | |
839 | + __m128i mDiff0 = _mm_sub_epi16(mAdd0, mSub0); | |
840 | + __m128i mDiff1 = _mm_sub_epi16(mAdd1, mSub1); | |
841 | + mDiff0 = _mm_add_epi16(mDiff0, _mm_slli_si128(mDiff0, 2)); | |
842 | + mDiff0 = _mm_add_epi16(mDiff0, _mm_slli_si128(mDiff0, 4)); | |
843 | + mDiff0 = _mm_add_epi16(mDiff0, _mm_slli_si128(mDiff0, 8)); | |
844 | + __m128i jump = _mm_shufflehi_epi16(mDiff0, _MM_SHUFFLE(3,3,3,3)); | |
845 | + jump = _mm_unpackhi_epi64(jump, jump); | |
846 | + | |
847 | + mDiff1 = _mm_add_epi16(mDiff1, _mm_slli_si128(mDiff1, 2)); | |
848 | + mDiff1 = _mm_add_epi16(mDiff1, _mm_slli_si128(mDiff1, 4)); | |
849 | + mDiff1 = _mm_add_epi16(mDiff1, _mm_slli_si128(mDiff1, 8)); | |
850 | + mDiff1 = _mm_add_epi16(mDiff1, jump); | |
851 | + | |
852 | + __m128i left = _mm_add_epi16(mTotal, mDiff0); | |
853 | + __m128i right = _mm_add_epi16(mTotal, mDiff1); | |
854 | + __m128i left2 = _mm_mulhrs_epi16(left, mInvLeni); | |
855 | + __m128i right2 = _mm_mulhrs_epi16(right, mInvLeni); | |
856 | + __m128i result = _mm_packus_epi16(left2, right2); | |
857 | + // _mm_stream_si128(mPWork++, result); | |
858 | + *mPWork++ = result; | |
859 | + mTotal = _mm_shufflehi_epi16(right, _MM_SHUFFLE(3,3,3,3)); | |
860 | + mTotal = _mm_unpackhi_epi64(mTotal, mTotal); | |
861 | + } | |
862 | + total = _mm_extract_epi16(mTotal, 0); | |
863 | + #else | |
864 | + // SSE2 path | |
865 | + __m128i mTotal = _mm_set1_epi32(total); //_mm_shuffle_epi32(_mm_cvtsi32_si128(total), 1); | |
866 | + for (size_t x=kxs0_16end; x<kxe0_16end; x+=16) { | |
867 | + __m128i mSub = mNextSub; | |
868 | + __m128i mAdd = mNextAdd; | |
869 | + mNextSub = _mm_loadu_si128(mPSub++); | |
870 | + mNextAdd = _mm_loadu_si128(mPAdd++); | |
871 | + | |
872 | + __m128i mSub0 = _mm_unpacklo_epi8(mSub, _mm_setzero_si128()); | |
873 | + __m128i mSub1 = _mm_unpackhi_epi8(mSub, _mm_setzero_si128()); | |
874 | + __m128i mAdd0 = _mm_unpacklo_epi8(mAdd, _mm_setzero_si128()); | |
875 | + __m128i mAdd1 = _mm_unpackhi_epi8(mAdd, _mm_setzero_si128()); | |
876 | + | |
877 | + __m128i mDiff0 = _mm_sub_epi32(_mm_unpacklo_epi16(mAdd0, _mm_setzero_si128()), _mm_unpacklo_epi16(mSub0, _mm_setzero_si128())); | |
878 | + __m128i mDiff1 = _mm_sub_epi32(_mm_unpackhi_epi16(mAdd0, _mm_setzero_si128()), _mm_unpackhi_epi16(mSub0, _mm_setzero_si128())); | |
879 | + __m128i mDiff2 = _mm_sub_epi32(_mm_unpacklo_epi16(mAdd1, _mm_setzero_si128()), _mm_unpacklo_epi16(mSub1, _mm_setzero_si128())); | |
880 | + __m128i mDiff3 = _mm_sub_epi32(_mm_unpackhi_epi16(mAdd1, _mm_setzero_si128()), _mm_unpackhi_epi16(mSub1, _mm_setzero_si128())); | |
881 | + | |
882 | + mDiff0 = _mm_add_epi32(mDiff0, _mm_slli_si128(mDiff0, 4)); | |
883 | + mDiff0 = _mm_add_epi32(mDiff0, _mm_slli_si128(mDiff0, 8)); | |
884 | + mDiff1 = _mm_add_epi32(mDiff1, _mm_slli_si128(mDiff1, 4)); | |
885 | + mDiff1 = _mm_add_epi32(mDiff1, _mm_slli_si128(mDiff1, 8)); | |
886 | + mDiff2 = _mm_add_epi32(mDiff2, _mm_slli_si128(mDiff2, 4)); | |
887 | + mDiff2 = _mm_add_epi32(mDiff2, _mm_slli_si128(mDiff2, 8)); | |
888 | + mDiff3 = _mm_add_epi32(mDiff3, _mm_slli_si128(mDiff3, 4)); | |
889 | + mDiff3 = _mm_add_epi32(mDiff3, _mm_slli_si128(mDiff3, 8)); | |
866 | 890 | |
867 | - mTotal = _mm_shuffle_epi32(mTotal, _MM_SHUFFLE(3,3,3,3)); | |
868 | - mTotal = _mm_add_epi32(mTotal, mDiff2); | |
869 | - mfTotal = _mm_cvtepi32_ps(mTotal); | |
870 | - __m128i mDest2 = _mm_cvttps_epi32(_mm_mul_ps(mfTotal, mInvLen)); | |
891 | + mTotal = _mm_add_epi32(mTotal, mDiff0); | |
892 | + __m128 mfTotal = _mm_cvtepi32_ps(mTotal); | |
893 | + __m128i mDest0 = _mm_cvttps_epi32(_mm_mul_ps(mfTotal, mInvLen)); | |
894 | + | |
895 | + mTotal = _mm_shuffle_epi32(mTotal, _MM_SHUFFLE(3,3,3,3)); | |
896 | + mTotal = _mm_add_epi32(mTotal, mDiff1); | |
897 | + mfTotal = _mm_cvtepi32_ps(mTotal); | |
898 | + __m128i mDest1 = _mm_cvttps_epi32(_mm_mul_ps(mfTotal, mInvLen)); | |
871 | 899 | |
872 | - mTotal = _mm_shuffle_epi32(mTotal, _MM_SHUFFLE(3,3,3,3)); | |
873 | - mTotal = _mm_add_epi32(mTotal, mDiff3); | |
874 | - mfTotal = _mm_cvtepi32_ps(mTotal); | |
875 | - mTotal = _mm_shuffle_epi32(mTotal, _MM_SHUFFLE(3,3,3,3)); | |
876 | - __m128i mDest3 = _mm_cvttps_epi32(_mm_mul_ps(mfTotal, mInvLen)); | |
900 | + mTotal = _mm_shuffle_epi32(mTotal, _MM_SHUFFLE(3,3,3,3)); | |
901 | + mTotal = _mm_add_epi32(mTotal, mDiff2); | |
902 | + mfTotal = _mm_cvtepi32_ps(mTotal); | |
903 | + __m128i mDest2 = _mm_cvttps_epi32(_mm_mul_ps(mfTotal, mInvLen)); | |
877 | 904 | |
878 | - *mPWork++ = | |
879 | - _mm_packus_epi16( | |
880 | - _mm_packs_epi32(mDest0, mDest1), | |
881 | - _mm_packs_epi32(mDest2, mDest3) | |
882 | - ) | |
883 | - ; | |
905 | + mTotal = _mm_shuffle_epi32(mTotal, _MM_SHUFFLE(3,3,3,3)); | |
906 | + mTotal = _mm_add_epi32(mTotal, mDiff3); | |
907 | + mfTotal = _mm_cvtepi32_ps(mTotal); | |
908 | + mTotal = _mm_shuffle_epi32(mTotal, _MM_SHUFFLE(3,3,3,3)); | |
909 | + __m128i mDest3 = _mm_cvttps_epi32(_mm_mul_ps(mfTotal, mInvLen)); | |
910 | + | |
911 | + *mPWork++ = | |
912 | + _mm_packus_epi16( | |
913 | + _mm_packs_epi32(mDest0, mDest1), | |
914 | + _mm_packs_epi32(mDest2, mDest3) | |
915 | + ) | |
916 | + ; | |
917 | + } | |
918 | + total = _mm_cvtsi128_si32(mTotal); | |
919 | + #endif | |
920 | + | |
921 | + for (size_t x=kxe0_16end; x<kxe0; ++x) { | |
922 | + total += - pFromLine[x - r - 1] + pFromLine[x + r]; | |
923 | + pToLine[x] = (total * invLen) >> SHIFT; | |
924 | + } | |
925 | + for (size_t x=kxe0,cnt=0; x<width; ++x, ++cnt) { | |
926 | + total += - pFromLine[kxe0 - r + cnt] + pFromLine[width - 1 - cnt]; | |
927 | + pToLine[x] = (total * invLen) >> SHIFT; | |
928 | + } | |
929 | + OffsetPtr(pToLine, toLineOffsetBytes); | |
930 | + OffsetPtr(pFromLine, fromLineOffsetBytes); | |
884 | 931 | } |
885 | - total = _mm_cvtsi128_si32(mTotal); | |
886 | -#endif | |
887 | - | |
888 | - for (size_t x=kxe0_16end; x<kxe0; ++x) { | |
889 | - total += - pSrcLine[x - r - 1] + pSrcLine[x + r]; | |
890 | - pWorkLine[x] = (total * invLen) >> SHIFT; | |
891 | - } | |
892 | - for (size_t x=kxe0,cnt=0; x<width; ++x, ++cnt) { | |
893 | - total += - pSrcLine[kxe0 - r + cnt] + pSrcLine[width - 1 - cnt]; | |
894 | - pWorkLine[x] = (total * invLen) >> SHIFT; | |
895 | - } | |
896 | - OffsetPtr(pWorkLine, workLineOffsetBytes); | |
897 | - OffsetPtr(pSrcLine, srcLineOffsetBytes); | |
898 | 932 | } |
899 | - | |
900 | 933 | } |
901 | 934 | |
902 | 935 | void test_7_v(const Parameter& p) { |
@@ -975,86 +1008,109 @@ | ||
975 | 1008 | if (bBottom) { |
976 | 1009 | kye0 = std::max<int>(0, height - r); |
977 | 1010 | } |
978 | - const uint8_t* pWorkLine = pWork; | |
979 | - const uint8_t* pWorkLine2 = pWorkLine; | |
980 | - uint8_t* pDestLine = pDest; | |
981 | 1011 | |
982 | - if (bTop) { | |
983 | - for (size_t x=0; x<width; ++x) { | |
984 | - pTotalLine[x] = pWorkLine[x]; | |
1012 | + for (size_t n=0; n<iterationCount; ++n) { | |
1013 | + | |
1014 | + const uint8_t* pFrom; | |
1015 | + ptrdiff_t fromLineOffsetBytes; | |
1016 | + if (n == 0) { | |
1017 | + pFrom = pWork; | |
1018 | + fromLineOffsetBytes = workLineOffsetBytes; | |
1019 | + }else { | |
1020 | + pFrom = (n & 1) ? pWork2 : pWork; | |
1021 | + fromLineOffsetBytes = workLineOffsetBytes; | |
985 | 1022 | } |
986 | - OffsetPtr(pWorkLine, workLineOffsetBytes); | |
987 | - for (size_t ky=1; ky<=r; ++ky) { | |
1023 | + uint8_t* pTo; | |
1024 | + ptrdiff_t toLineOffsetBytes; | |
1025 | + if (n == iterationCount - 1) { | |
1026 | + pTo = pDest; | |
1027 | + toLineOffsetBytes = destLineOffsetBytes; | |
1028 | + }else { | |
1029 | + pTo = (n & 1) ? pWork : pWork2; | |
1030 | + toLineOffsetBytes = workLineOffsetBytes; | |
1031 | + } | |
1032 | + | |
1033 | + const uint8_t* pFromLine = pFrom; | |
1034 | + const uint8_t* pFromLine2 = pFromLine; | |
1035 | + uint8_t* pToLine = pTo; | |
1036 | + | |
1037 | + if (bTop) { | |
988 | 1038 | for (size_t x=0; x<width; ++x) { |
989 | - pTotalLine[x] += pWorkLine[x] * 2; | |
1039 | + pTotalLine[x] = pFromLine[x]; | |
990 | 1040 | } |
991 | - OffsetPtr(pWorkLine, workLineOffsetBytes); | |
992 | - } | |
993 | - for (size_t x=0; x<width; ++x) { | |
994 | - pDestLine[x] = (pTotalLine[x] * invLen) >> SHIFT; | |
995 | - } | |
996 | - OffsetPtr(pDestLine, destLineOffsetBytes); | |
997 | - OffsetPtr(pWorkLine2, r * workLineOffsetBytes); | |
998 | - | |
999 | - for (size_t y=1; y<=r; ++y) { | |
1000 | - Worker::process((const __m128i*)pWorkLine2, (const __m128i*)pWorkLine, (__m128i*)pTotalLine, (__m128i*)pDestLine, mInvLeni, width); | |
1001 | - OffsetPtr(pWorkLine2, -workLineOffsetBytes); | |
1002 | - OffsetPtr(pWorkLine, workLineOffsetBytes); | |
1003 | - OffsetPtr(pDestLine, destLineOffsetBytes); | |
1004 | - } | |
1005 | - | |
1006 | - }else { | |
1007 | - __m128i* pMTotal = (__m128i*)pTotalLine; | |
1008 | - for (size_t x=0; x<width>>4; ++x) { | |
1009 | - *pMTotal++ = _mm_setzero_si128(); | |
1010 | - *pMTotal++ = _mm_setzero_si128(); | |
1011 | - } | |
1012 | - for (size_t x=width&0xFFF0; x<width; ++x) { | |
1013 | - pTotalLine[x] = 0; | |
1014 | - } | |
1015 | - OffsetPtr(pWorkLine, -r * workLineOffsetBytes); | |
1016 | - pWorkLine2 = pWorkLine; | |
1017 | - for (int ky=-r; ky<=r; ++ky) { | |
1018 | - const __m128i* pMWork = (const __m128i*)pWorkLine; | |
1019 | - pMTotal = (__m128i*)pTotalLine; | |
1041 | + OffsetPtr(pFromLine, fromLineOffsetBytes); | |
1042 | + for (size_t ky=1; ky<=r; ++ky) { | |
1043 | + for (size_t x=0; x<width; ++x) { | |
1044 | + pTotalLine[x] += pFromLine[x] * 2; | |
1045 | + } | |
1046 | + OffsetPtr(pFromLine, fromLineOffsetBytes); | |
1047 | + } | |
1048 | + for (size_t x=0; x<width; ++x) { | |
1049 | + pToLine[x] = (pTotalLine[x] * invLen) >> SHIFT; | |
1050 | + } | |
1051 | + OffsetPtr(pToLine, toLineOffsetBytes); | |
1052 | + OffsetPtr(pFromLine2, r * fromLineOffsetBytes); | |
1053 | + | |
1054 | + for (size_t y=1; y<=r; ++y) { | |
1055 | + Worker::process((const __m128i*)pFromLine2, (const __m128i*)pFromLine, (__m128i*)pTotalLine, (__m128i*)pToLine, mInvLeni, width); | |
1056 | + OffsetPtr(pFromLine2, -fromLineOffsetBytes); | |
1057 | + OffsetPtr(pFromLine, fromLineOffsetBytes); | |
1058 | + OffsetPtr(pToLine, toLineOffsetBytes); | |
1059 | + } | |
1060 | + | |
1061 | + }else { | |
1062 | + __m128i* pMTotal = (__m128i*)pTotalLine; | |
1020 | 1063 | for (size_t x=0; x<width>>4; ++x) { |
1021 | - __m128i mData = pMWork[x]; | |
1022 | - __m128i mLeft = _mm_unpacklo_epi8(mData, _mm_setzero_si128()); | |
1023 | - __m128i mRight = _mm_unpackhi_epi8(mData, _mm_setzero_si128()); | |
1024 | - | |
1025 | - __m128i totalLeft = *pMTotal; | |
1026 | - __m128i totalRight = *(pMTotal+1); | |
1027 | - *pMTotal++ = _mm_add_epi16(totalLeft, mLeft); | |
1028 | - *pMTotal++ = _mm_add_epi16(totalRight, mRight); | |
1064 | + *pMTotal++ = _mm_setzero_si128(); | |
1065 | + *pMTotal++ = _mm_setzero_si128(); | |
1029 | 1066 | } |
1030 | 1067 | for (size_t x=width&0xFFF0; x<width; ++x) { |
1031 | - pTotalLine[x] += pWorkLine[x]; | |
1068 | + pTotalLine[x] = 0; | |
1032 | 1069 | } |
1033 | - OffsetPtr(pWorkLine, workLineOffsetBytes); | |
1070 | + OffsetPtr(pFromLine, -r * fromLineOffsetBytes); | |
1071 | + pFromLine2 = pFromLine; | |
1072 | + for (int ky=-r; ky<=r; ++ky) { | |
1073 | + const __m128i* pMWork = (const __m128i*)pFromLine; | |
1074 | + pMTotal = (__m128i*)pTotalLine; | |
1075 | + for (size_t x=0; x<width>>4; ++x) { | |
1076 | + __m128i mData = pMWork[x]; | |
1077 | + __m128i mLeft = _mm_unpacklo_epi8(mData, _mm_setzero_si128()); | |
1078 | + __m128i mRight = _mm_unpackhi_epi8(mData, _mm_setzero_si128()); | |
1079 | + | |
1080 | + __m128i totalLeft = *pMTotal; | |
1081 | + __m128i totalRight = *(pMTotal+1); | |
1082 | + *pMTotal++ = _mm_add_epi16(totalLeft, mLeft); | |
1083 | + *pMTotal++ = _mm_add_epi16(totalRight, mRight); | |
1084 | + } | |
1085 | + for (size_t x=width&0xFFF0; x<width; ++x) { | |
1086 | + pTotalLine[x] += pFromLine[x]; | |
1087 | + } | |
1088 | + OffsetPtr(pFromLine, fromLineOffsetBytes); | |
1089 | + } | |
1090 | + for (size_t x=0; x<width; ++x) { | |
1091 | + pToLine[x] = (pTotalLine[x] * invLen) >> SHIFT; | |
1092 | + } | |
1093 | + OffsetPtr(pToLine, toLineOffsetBytes); | |
1034 | 1094 | } |
1035 | - for (size_t x=0; x<width; ++x) { | |
1036 | - pDestLine[x] = (pTotalLine[x] * invLen) >> SHIFT; | |
1095 | + | |
1096 | + for (int y=kys0; y<kye0; ++y) { | |
1097 | + Worker::process((const __m128i*)pFromLine2, (const __m128i*)pFromLine, (__m128i*)pTotalLine, (__m128i*)pToLine, mInvLeni, width); | |
1098 | + OffsetPtr(pFromLine, fromLineOffsetBytes); | |
1099 | + OffsetPtr(pFromLine2, fromLineOffsetBytes); | |
1100 | + OffsetPtr(pToLine, toLineOffsetBytes); | |
1037 | 1101 | } |
1038 | - OffsetPtr(pDestLine, destLineOffsetBytes); | |
1039 | - } | |
1040 | - | |
1041 | - for (int y=kys0; y<kye0; ++y) { | |
1042 | - Worker::process((const __m128i*)pWorkLine2, (const __m128i*)pWorkLine, (__m128i*)pTotalLine, (__m128i*)pDestLine, mInvLeni, width); | |
1043 | - OffsetPtr(pWorkLine, workLineOffsetBytes); | |
1044 | - OffsetPtr(pWorkLine2, workLineOffsetBytes); | |
1045 | - OffsetPtr(pDestLine, destLineOffsetBytes); | |
1046 | - } | |
1047 | - | |
1048 | - if (bBottom) { | |
1049 | - pWorkLine2 = pWork; | |
1050 | - OffsetPtr(pWorkLine2, (kye0 - r) * workLineOffsetBytes); | |
1051 | - pWorkLine = pWork; | |
1052 | - OffsetPtr(pWorkLine, (height - 1) * workLineOffsetBytes); | |
1053 | - for (size_t y=kye0,cnt=0; y<height; ++y, ++cnt) { | |
1054 | - Worker::process((const __m128i*)pWorkLine2, (const __m128i*)pWorkLine, (__m128i*)pTotalLine, (__m128i*)pDestLine, mInvLeni, width); | |
1055 | - OffsetPtr(pWorkLine2, workLineOffsetBytes); | |
1056 | - OffsetPtr(pWorkLine, -workLineOffsetBytes); | |
1057 | - OffsetPtr(pDestLine, destLineOffsetBytes); | |
1102 | + | |
1103 | + if (bBottom) { | |
1104 | + pFromLine2 = pFrom; | |
1105 | + OffsetPtr(pFromLine2, (kye0 - r) * fromLineOffsetBytes); | |
1106 | + pFromLine = pFrom; | |
1107 | + OffsetPtr(pFromLine, (height - 1) * fromLineOffsetBytes); | |
1108 | + for (size_t y=kye0,cnt=0; y<height; ++y, ++cnt) { | |
1109 | + Worker::process((const __m128i*)pFromLine2, (const __m128i*)pFromLine, (__m128i*)pTotalLine, (__m128i*)pToLine, mInvLeni, width); | |
1110 | + OffsetPtr(pFromLine2, fromLineOffsetBytes); | |
1111 | + OffsetPtr(pFromLine, -fromLineOffsetBytes); | |
1112 | + OffsetPtr(pToLine, toLineOffsetBytes); | |
1113 | + } | |
1058 | 1114 | } |
1059 | 1115 | } |
1060 | 1116 | } |