• R/O
  • SSH
  • HTTPS

imagefilter: Commit


Commit MetaInfo

Revision: 10 (tree)
Time: 2010-02-15 02:31:49
Author: berupon

Log Message

BoxBlur処理、繰り返し適用対応を少し進める。

Change Summary

Incremental Difference

--- trunk/main.cpp (revision 9)
+++ trunk/main.cpp (revision 10)
@@ -94,10 +94,10 @@
9494 //blur_1b::test_4,
9595 //blur_1b::test_5_h,
9696 //blur_1b::test_5_v,
97- blur_1b::test_5_h,
98- blur_1b::test_6_v,
99- //blur_1b::test_7_h,
100- //blur_1b::test_7_v,
97+ //blur_1b::test_5_h,
98+ //blur_1b::test_6_v,
99+ blur_1b::test_7_h,
100+ blur_1b::test_7_v,
101101 //blur_1b::test_8,
102102 //blur_1b::test_9,
103103 //blur_1b::test_10,
--- trunk/blur_1b.cpp (revision 9)
+++ trunk/blur_1b.cpp (revision 10)
@@ -766,137 +766,170 @@
766766 const size_t kxe0 = (size_t) std::max<int>(0, width - r);
767767 const size_t kxs0_16end = kxs0 + (16 - (kxs0 % 16));
768768 const size_t kxe0_16end = kxe0 - (kxe0 % 16);
769- const uint8_t* pSrcLine = pSrc;
770- uint8_t* pWorkLine = pWork;
771- for (size_t y=0; y<height; ++y) {
772- int total = *pSrcLine;
773- for (size_t kx=1; kx<kxs0; ++kx) {
774- total += pSrcLine[kx] * 2;
769+
770+ for (size_t n=0; n<iterationCount; ++n) {
771+
772+ const uint8_t* pFrom;
773+ ptrdiff_t fromLineOffsetBytes;
774+ if (n == 0) {
775+ pFrom = pSrc;
776+ fromLineOffsetBytes = srcLineOffsetBytes;
777+ }else {
778+ // TODO: make it compact
779+ if (iterationCount & 1) {
780+ pFrom = (n & 1) ? pWork : pWork2;
781+ }else {
782+ pFrom = (n & 1) ? pWork2 : pWork;
783+ }
784+ fromLineOffsetBytes = workLineOffsetBytes;
775785 }
776- pWorkLine[0] = (total * invLen) >> SHIFT;
777- for (size_t x=1; x<kxs0; ++x) {
778- assert(kxs0 >= x);
779- total += - pSrcLine[kxs0 - x] + pSrcLine[kxs0 + x - 1];
780- pWorkLine[x] = (total * invLen) >> SHIFT;
786+ uint8_t* pTo;
787+ ptrdiff_t toLineOffsetBytes;
788+ if (n == iterationCount - 1) {
789+ pTo = pWork;
790+ toLineOffsetBytes = workLineOffsetBytes;
791+ }else {
792+ // TODO: make it compact
793+ if (iterationCount & 1) {
794+ pTo = (n & 1) ? pWork2 : pWork;
795+ }else {
796+ pTo = (n & 1) ? pWork : pWork2;
797+ }
798+ toLineOffsetBytes = workLineOffsetBytes;
781799 }
782- for (size_t x=kxs0; x<kxs0_16end; ++x) {
783- total += - pSrcLine[x - r - 1] + pSrcLine[x + r];
784- pWorkLine[x] = (total * invLen) >> SHIFT;
785- }
786800
787- __m128i* mPSub = (__m128i*) (pSrcLine + kxs0_16end - r - 1);
788- __m128i* mPAdd = (__m128i*) (pSrcLine + kxs0_16end + r);
789- __m128i mNextSub = _mm_loadu_si128(mPSub++); // hoist loading
790- __m128i mNextAdd = _mm_loadu_si128(mPAdd++);
791- __m128i* mPWork = (__m128i*) (pWorkLine + kxs0_16end);
792-
793-#if 1
794- __m128i mTotal = _mm_set1_epi16(total);
795- for (size_t x=kxs0_16end; x<kxe0_16end; x+=16) {
796- __m128i mSub = mNextSub;
797- __m128i mAdd = mNextAdd;
798- mNextSub = _mm_loadu_si128(mPSub++);
799- mNextAdd = _mm_loadu_si128(mPAdd++);
801+ const uint8_t* pFromLine = pFrom;
802+ uint8_t* pToLine = pTo;
803+
804+ for (size_t y=0; y<height; ++y) {
805+ int total = *pFromLine;
806+ for (size_t kx=1; kx<kxs0; ++kx) {
807+ total += pFromLine[kx] * 2;
808+ }
809+ pToLine[0] = (total * invLen) >> SHIFT;
810+ for (size_t x=1; x<kxs0; ++x) {
811+ assert(kxs0 >= x);
812+ total += - pFromLine[kxs0 - x] + pFromLine[kxs0 + x - 1];
813+ pToLine[x] = (total * invLen) >> SHIFT;
814+ }
815+ for (size_t x=kxs0; x<kxs0_16end; ++x) {
816+ total += - pFromLine[x - r - 1] + pFromLine[x + r];
817+ pToLine[x] = (total * invLen) >> SHIFT;
818+ }
800819
801- __m128i mSub0 = _mm_unpacklo_epi8(mSub, _mm_setzero_si128());
802- __m128i mSub1 = _mm_unpackhi_epi8(mSub, _mm_setzero_si128());
803- __m128i mAdd0 = _mm_unpacklo_epi8(mAdd, _mm_setzero_si128());
804- __m128i mAdd1 = _mm_unpackhi_epi8(mAdd, _mm_setzero_si128());
805-
806- __m128i mDiff0 = _mm_sub_epi16(mAdd0, mSub0);
807- __m128i mDiff1 = _mm_sub_epi16(mAdd1, mSub1);
808- mDiff0 = _mm_add_epi16(mDiff0, _mm_slli_si128(mDiff0, 2));
809- mDiff0 = _mm_add_epi16(mDiff0, _mm_slli_si128(mDiff0, 4));
810- mDiff0 = _mm_add_epi16(mDiff0, _mm_slli_si128(mDiff0, 8));
811- __m128i jump = _mm_shufflehi_epi16(mDiff0, _MM_SHUFFLE(3,3,3,3));
812- jump = _mm_unpackhi_epi64(jump, jump);
813-
814- mDiff1 = _mm_add_epi16(mDiff1, _mm_slli_si128(mDiff1, 2));
815- mDiff1 = _mm_add_epi16(mDiff1, _mm_slli_si128(mDiff1, 4));
816- mDiff1 = _mm_add_epi16(mDiff1, _mm_slli_si128(mDiff1, 8));
817- mDiff1 = _mm_add_epi16(mDiff1, jump);
818-
819- __m128i left = _mm_add_epi16(mTotal, mDiff0);
820- __m128i right = _mm_add_epi16(mTotal, mDiff1);
821- __m128i left2 = _mm_mulhrs_epi16(left, mInvLeni);
822- __m128i right2 = _mm_mulhrs_epi16(right, mInvLeni);
823- __m128i result = _mm_packus_epi16(left2, right2);
824-// _mm_stream_si128(mPWork++, result);
825- *mPWork++ = result;
826- mTotal = _mm_shufflehi_epi16(right, _MM_SHUFFLE(3,3,3,3));
827- mTotal = _mm_unpackhi_epi64(mTotal, mTotal);
828- }
829- total = _mm_extract_epi16(mTotal, 0);
830-#else
831- // SSE2 path
832- __m128i mTotal = _mm_set1_epi32(total); //_mm_shuffle_epi32(_mm_cvtsi32_si128(total), 1);
833- for (size_t x=kxs0_16end; x<kxe0_16end; x+=16) {
834- __m128i mSub = mNextSub;
835- __m128i mAdd = mNextAdd;
836- mNextSub = _mm_loadu_si128(mPSub++);
837- mNextAdd = _mm_loadu_si128(mPAdd++);
838-
839- __m128i mSub0 = _mm_unpacklo_epi8(mSub, _mm_setzero_si128());
840- __m128i mSub1 = _mm_unpackhi_epi8(mSub, _mm_setzero_si128());
841- __m128i mAdd0 = _mm_unpacklo_epi8(mAdd, _mm_setzero_si128());
842- __m128i mAdd1 = _mm_unpackhi_epi8(mAdd, _mm_setzero_si128());
843-
844- __m128i mDiff0 = _mm_sub_epi32(_mm_unpacklo_epi16(mAdd0, _mm_setzero_si128()), _mm_unpacklo_epi16(mSub0, _mm_setzero_si128()));
845- __m128i mDiff1 = _mm_sub_epi32(_mm_unpackhi_epi16(mAdd0, _mm_setzero_si128()), _mm_unpackhi_epi16(mSub0, _mm_setzero_si128()));
846- __m128i mDiff2 = _mm_sub_epi32(_mm_unpacklo_epi16(mAdd1, _mm_setzero_si128()), _mm_unpacklo_epi16(mSub1, _mm_setzero_si128()));
847- __m128i mDiff3 = _mm_sub_epi32(_mm_unpackhi_epi16(mAdd1, _mm_setzero_si128()), _mm_unpackhi_epi16(mSub1, _mm_setzero_si128()));
848-
849- mDiff0 = _mm_add_epi32(mDiff0, _mm_slli_si128(mDiff0, 4));
850- mDiff0 = _mm_add_epi32(mDiff0, _mm_slli_si128(mDiff0, 8));
851- mDiff1 = _mm_add_epi32(mDiff1, _mm_slli_si128(mDiff1, 4));
852- mDiff1 = _mm_add_epi32(mDiff1, _mm_slli_si128(mDiff1, 8));
853- mDiff2 = _mm_add_epi32(mDiff2, _mm_slli_si128(mDiff2, 4));
854- mDiff2 = _mm_add_epi32(mDiff2, _mm_slli_si128(mDiff2, 8));
855- mDiff3 = _mm_add_epi32(mDiff3, _mm_slli_si128(mDiff3, 4));
856- mDiff3 = _mm_add_epi32(mDiff3, _mm_slli_si128(mDiff3, 8));
820+ __m128i* mPSub = (__m128i*) (pFromLine + kxs0_16end - r - 1);
821+ __m128i* mPAdd = (__m128i*) (pFromLine + kxs0_16end + r);
822+ __m128i mNextSub = _mm_loadu_si128(mPSub++); // hoist loading
823+ __m128i mNextAdd = _mm_loadu_si128(mPAdd++);
824+ __m128i* mPWork = (__m128i*) (pToLine + kxs0_16end);
857825
858- mTotal = _mm_add_epi32(mTotal, mDiff0);
859- __m128 mfTotal = _mm_cvtepi32_ps(mTotal);
860- __m128i mDest0 = _mm_cvttps_epi32(_mm_mul_ps(mfTotal, mInvLen));
861-
862- mTotal = _mm_shuffle_epi32(mTotal, _MM_SHUFFLE(3,3,3,3));
863- mTotal = _mm_add_epi32(mTotal, mDiff1);
864- mfTotal = _mm_cvtepi32_ps(mTotal);
865- __m128i mDest1 = _mm_cvttps_epi32(_mm_mul_ps(mfTotal, mInvLen));
826+ #if 1
827+ __m128i mTotal = _mm_set1_epi16(total);
828+ for (size_t x=kxs0_16end; x<kxe0_16end; x+=16) {
829+ __m128i mSub = mNextSub;
830+ __m128i mAdd = mNextAdd;
831+ mNextSub = _mm_loadu_si128(mPSub++);
832+ mNextAdd = _mm_loadu_si128(mPAdd++);
833+
834+ __m128i mSub0 = _mm_unpacklo_epi8(mSub, _mm_setzero_si128());
835+ __m128i mSub1 = _mm_unpackhi_epi8(mSub, _mm_setzero_si128());
836+ __m128i mAdd0 = _mm_unpacklo_epi8(mAdd, _mm_setzero_si128());
837+ __m128i mAdd1 = _mm_unpackhi_epi8(mAdd, _mm_setzero_si128());
838+
839+ __m128i mDiff0 = _mm_sub_epi16(mAdd0, mSub0);
840+ __m128i mDiff1 = _mm_sub_epi16(mAdd1, mSub1);
841+ mDiff0 = _mm_add_epi16(mDiff0, _mm_slli_si128(mDiff0, 2));
842+ mDiff0 = _mm_add_epi16(mDiff0, _mm_slli_si128(mDiff0, 4));
843+ mDiff0 = _mm_add_epi16(mDiff0, _mm_slli_si128(mDiff0, 8));
844+ __m128i jump = _mm_shufflehi_epi16(mDiff0, _MM_SHUFFLE(3,3,3,3));
845+ jump = _mm_unpackhi_epi64(jump, jump);
846+
847+ mDiff1 = _mm_add_epi16(mDiff1, _mm_slli_si128(mDiff1, 2));
848+ mDiff1 = _mm_add_epi16(mDiff1, _mm_slli_si128(mDiff1, 4));
849+ mDiff1 = _mm_add_epi16(mDiff1, _mm_slli_si128(mDiff1, 8));
850+ mDiff1 = _mm_add_epi16(mDiff1, jump);
851+
852+ __m128i left = _mm_add_epi16(mTotal, mDiff0);
853+ __m128i right = _mm_add_epi16(mTotal, mDiff1);
854+ __m128i left2 = _mm_mulhrs_epi16(left, mInvLeni);
855+ __m128i right2 = _mm_mulhrs_epi16(right, mInvLeni);
856+ __m128i result = _mm_packus_epi16(left2, right2);
857+ // _mm_stream_si128(mPWork++, result);
858+ *mPWork++ = result;
859+ mTotal = _mm_shufflehi_epi16(right, _MM_SHUFFLE(3,3,3,3));
860+ mTotal = _mm_unpackhi_epi64(mTotal, mTotal);
861+ }
862+ total = _mm_extract_epi16(mTotal, 0);
863+ #else
864+ // SSE2 path
865+ __m128i mTotal = _mm_set1_epi32(total); //_mm_shuffle_epi32(_mm_cvtsi32_si128(total), 1);
866+ for (size_t x=kxs0_16end; x<kxe0_16end; x+=16) {
867+ __m128i mSub = mNextSub;
868+ __m128i mAdd = mNextAdd;
869+ mNextSub = _mm_loadu_si128(mPSub++);
870+ mNextAdd = _mm_loadu_si128(mPAdd++);
871+
872+ __m128i mSub0 = _mm_unpacklo_epi8(mSub, _mm_setzero_si128());
873+ __m128i mSub1 = _mm_unpackhi_epi8(mSub, _mm_setzero_si128());
874+ __m128i mAdd0 = _mm_unpacklo_epi8(mAdd, _mm_setzero_si128());
875+ __m128i mAdd1 = _mm_unpackhi_epi8(mAdd, _mm_setzero_si128());
876+
877+ __m128i mDiff0 = _mm_sub_epi32(_mm_unpacklo_epi16(mAdd0, _mm_setzero_si128()), _mm_unpacklo_epi16(mSub0, _mm_setzero_si128()));
878+ __m128i mDiff1 = _mm_sub_epi32(_mm_unpackhi_epi16(mAdd0, _mm_setzero_si128()), _mm_unpackhi_epi16(mSub0, _mm_setzero_si128()));
879+ __m128i mDiff2 = _mm_sub_epi32(_mm_unpacklo_epi16(mAdd1, _mm_setzero_si128()), _mm_unpacklo_epi16(mSub1, _mm_setzero_si128()));
880+ __m128i mDiff3 = _mm_sub_epi32(_mm_unpackhi_epi16(mAdd1, _mm_setzero_si128()), _mm_unpackhi_epi16(mSub1, _mm_setzero_si128()));
881+
882+ mDiff0 = _mm_add_epi32(mDiff0, _mm_slli_si128(mDiff0, 4));
883+ mDiff0 = _mm_add_epi32(mDiff0, _mm_slli_si128(mDiff0, 8));
884+ mDiff1 = _mm_add_epi32(mDiff1, _mm_slli_si128(mDiff1, 4));
885+ mDiff1 = _mm_add_epi32(mDiff1, _mm_slli_si128(mDiff1, 8));
886+ mDiff2 = _mm_add_epi32(mDiff2, _mm_slli_si128(mDiff2, 4));
887+ mDiff2 = _mm_add_epi32(mDiff2, _mm_slli_si128(mDiff2, 8));
888+ mDiff3 = _mm_add_epi32(mDiff3, _mm_slli_si128(mDiff3, 4));
889+ mDiff3 = _mm_add_epi32(mDiff3, _mm_slli_si128(mDiff3, 8));
866890
867- mTotal = _mm_shuffle_epi32(mTotal, _MM_SHUFFLE(3,3,3,3));
868- mTotal = _mm_add_epi32(mTotal, mDiff2);
869- mfTotal = _mm_cvtepi32_ps(mTotal);
870- __m128i mDest2 = _mm_cvttps_epi32(_mm_mul_ps(mfTotal, mInvLen));
891+ mTotal = _mm_add_epi32(mTotal, mDiff0);
892+ __m128 mfTotal = _mm_cvtepi32_ps(mTotal);
893+ __m128i mDest0 = _mm_cvttps_epi32(_mm_mul_ps(mfTotal, mInvLen));
894+
895+ mTotal = _mm_shuffle_epi32(mTotal, _MM_SHUFFLE(3,3,3,3));
896+ mTotal = _mm_add_epi32(mTotal, mDiff1);
897+ mfTotal = _mm_cvtepi32_ps(mTotal);
898+ __m128i mDest1 = _mm_cvttps_epi32(_mm_mul_ps(mfTotal, mInvLen));
871899
872- mTotal = _mm_shuffle_epi32(mTotal, _MM_SHUFFLE(3,3,3,3));
873- mTotal = _mm_add_epi32(mTotal, mDiff3);
874- mfTotal = _mm_cvtepi32_ps(mTotal);
875- mTotal = _mm_shuffle_epi32(mTotal, _MM_SHUFFLE(3,3,3,3));
876- __m128i mDest3 = _mm_cvttps_epi32(_mm_mul_ps(mfTotal, mInvLen));
900+ mTotal = _mm_shuffle_epi32(mTotal, _MM_SHUFFLE(3,3,3,3));
901+ mTotal = _mm_add_epi32(mTotal, mDiff2);
902+ mfTotal = _mm_cvtepi32_ps(mTotal);
903+ __m128i mDest2 = _mm_cvttps_epi32(_mm_mul_ps(mfTotal, mInvLen));
877904
878- *mPWork++ =
879- _mm_packus_epi16(
880- _mm_packs_epi32(mDest0, mDest1),
881- _mm_packs_epi32(mDest2, mDest3)
882- )
883- ;
905+ mTotal = _mm_shuffle_epi32(mTotal, _MM_SHUFFLE(3,3,3,3));
906+ mTotal = _mm_add_epi32(mTotal, mDiff3);
907+ mfTotal = _mm_cvtepi32_ps(mTotal);
908+ mTotal = _mm_shuffle_epi32(mTotal, _MM_SHUFFLE(3,3,3,3));
909+ __m128i mDest3 = _mm_cvttps_epi32(_mm_mul_ps(mfTotal, mInvLen));
910+
911+ *mPWork++ =
912+ _mm_packus_epi16(
913+ _mm_packs_epi32(mDest0, mDest1),
914+ _mm_packs_epi32(mDest2, mDest3)
915+ )
916+ ;
917+ }
918+ total = _mm_cvtsi128_si32(mTotal);
919+ #endif
920+
921+ for (size_t x=kxe0_16end; x<kxe0; ++x) {
922+ total += - pFromLine[x - r - 1] + pFromLine[x + r];
923+ pToLine[x] = (total * invLen) >> SHIFT;
924+ }
925+ for (size_t x=kxe0,cnt=0; x<width; ++x, ++cnt) {
926+ total += - pFromLine[kxe0 - r + cnt] + pFromLine[width - 1 - cnt];
927+ pToLine[x] = (total * invLen) >> SHIFT;
928+ }
929+ OffsetPtr(pToLine, toLineOffsetBytes);
930+ OffsetPtr(pFromLine, fromLineOffsetBytes);
884931 }
885- total = _mm_cvtsi128_si32(mTotal);
886-#endif
887-
888- for (size_t x=kxe0_16end; x<kxe0; ++x) {
889- total += - pSrcLine[x - r - 1] + pSrcLine[x + r];
890- pWorkLine[x] = (total * invLen) >> SHIFT;
891- }
892- for (size_t x=kxe0,cnt=0; x<width; ++x, ++cnt) {
893- total += - pSrcLine[kxe0 - r + cnt] + pSrcLine[width - 1 - cnt];
894- pWorkLine[x] = (total * invLen) >> SHIFT;
895- }
896- OffsetPtr(pWorkLine, workLineOffsetBytes);
897- OffsetPtr(pSrcLine, srcLineOffsetBytes);
898932 }
899-
900933 }
901934
902935 void test_7_v(const Parameter& p) {
@@ -975,86 +1008,109 @@
9751008 if (bBottom) {
9761009 kye0 = std::max<int>(0, height - r);
9771010 }
978- const uint8_t* pWorkLine = pWork;
979- const uint8_t* pWorkLine2 = pWorkLine;
980- uint8_t* pDestLine = pDest;
9811011
982- if (bTop) {
983- for (size_t x=0; x<width; ++x) {
984- pTotalLine[x] = pWorkLine[x];
1012+ for (size_t n=0; n<iterationCount; ++n) {
1013+
1014+ const uint8_t* pFrom;
1015+ ptrdiff_t fromLineOffsetBytes;
1016+ if (n == 0) {
1017+ pFrom = pWork;
1018+ fromLineOffsetBytes = workLineOffsetBytes;
1019+ }else {
1020+ pFrom = (n & 1) ? pWork2 : pWork;
1021+ fromLineOffsetBytes = workLineOffsetBytes;
9851022 }
986- OffsetPtr(pWorkLine, workLineOffsetBytes);
987- for (size_t ky=1; ky<=r; ++ky) {
1023+ uint8_t* pTo;
1024+ ptrdiff_t toLineOffsetBytes;
1025+ if (n == iterationCount - 1) {
1026+ pTo = pDest;
1027+ toLineOffsetBytes = destLineOffsetBytes;
1028+ }else {
1029+ pTo = (n & 1) ? pWork : pWork2;
1030+ toLineOffsetBytes = workLineOffsetBytes;
1031+ }
1032+
1033+ const uint8_t* pFromLine = pFrom;
1034+ const uint8_t* pFromLine2 = pFromLine;
1035+ uint8_t* pToLine = pTo;
1036+
1037+ if (bTop) {
9881038 for (size_t x=0; x<width; ++x) {
989- pTotalLine[x] += pWorkLine[x] * 2;
1039+ pTotalLine[x] = pFromLine[x];
9901040 }
991- OffsetPtr(pWorkLine, workLineOffsetBytes);
992- }
993- for (size_t x=0; x<width; ++x) {
994- pDestLine[x] = (pTotalLine[x] * invLen) >> SHIFT;
995- }
996- OffsetPtr(pDestLine, destLineOffsetBytes);
997- OffsetPtr(pWorkLine2, r * workLineOffsetBytes);
998-
999- for (size_t y=1; y<=r; ++y) {
1000- Worker::process((const __m128i*)pWorkLine2, (const __m128i*)pWorkLine, (__m128i*)pTotalLine, (__m128i*)pDestLine, mInvLeni, width);
1001- OffsetPtr(pWorkLine2, -workLineOffsetBytes);
1002- OffsetPtr(pWorkLine, workLineOffsetBytes);
1003- OffsetPtr(pDestLine, destLineOffsetBytes);
1004- }
1005-
1006- }else {
1007- __m128i* pMTotal = (__m128i*)pTotalLine;
1008- for (size_t x=0; x<width>>4; ++x) {
1009- *pMTotal++ = _mm_setzero_si128();
1010- *pMTotal++ = _mm_setzero_si128();
1011- }
1012- for (size_t x=width&0xFFF0; x<width; ++x) {
1013- pTotalLine[x] = 0;
1014- }
1015- OffsetPtr(pWorkLine, -r * workLineOffsetBytes);
1016- pWorkLine2 = pWorkLine;
1017- for (int ky=-r; ky<=r; ++ky) {
1018- const __m128i* pMWork = (const __m128i*)pWorkLine;
1019- pMTotal = (__m128i*)pTotalLine;
1041+ OffsetPtr(pFromLine, fromLineOffsetBytes);
1042+ for (size_t ky=1; ky<=r; ++ky) {
1043+ for (size_t x=0; x<width; ++x) {
1044+ pTotalLine[x] += pFromLine[x] * 2;
1045+ }
1046+ OffsetPtr(pFromLine, fromLineOffsetBytes);
1047+ }
1048+ for (size_t x=0; x<width; ++x) {
1049+ pToLine[x] = (pTotalLine[x] * invLen) >> SHIFT;
1050+ }
1051+ OffsetPtr(pToLine, toLineOffsetBytes);
1052+ OffsetPtr(pFromLine2, r * fromLineOffsetBytes);
1053+
1054+ for (size_t y=1; y<=r; ++y) {
1055+ Worker::process((const __m128i*)pFromLine2, (const __m128i*)pFromLine, (__m128i*)pTotalLine, (__m128i*)pToLine, mInvLeni, width);
1056+ OffsetPtr(pFromLine2, -fromLineOffsetBytes);
1057+ OffsetPtr(pFromLine, fromLineOffsetBytes);
1058+ OffsetPtr(pToLine, toLineOffsetBytes);
1059+ }
1060+
1061+ }else {
1062+ __m128i* pMTotal = (__m128i*)pTotalLine;
10201063 for (size_t x=0; x<width>>4; ++x) {
1021- __m128i mData = pMWork[x];
1022- __m128i mLeft = _mm_unpacklo_epi8(mData, _mm_setzero_si128());
1023- __m128i mRight = _mm_unpackhi_epi8(mData, _mm_setzero_si128());
1024-
1025- __m128i totalLeft = *pMTotal;
1026- __m128i totalRight = *(pMTotal+1);
1027- *pMTotal++ = _mm_add_epi16(totalLeft, mLeft);
1028- *pMTotal++ = _mm_add_epi16(totalRight, mRight);
1064+ *pMTotal++ = _mm_setzero_si128();
1065+ *pMTotal++ = _mm_setzero_si128();
10291066 }
10301067 for (size_t x=width&0xFFF0; x<width; ++x) {
1031- pTotalLine[x] += pWorkLine[x];
1068+ pTotalLine[x] = 0;
10321069 }
1033- OffsetPtr(pWorkLine, workLineOffsetBytes);
1070+ OffsetPtr(pFromLine, -r * fromLineOffsetBytes);
1071+ pFromLine2 = pFromLine;
1072+ for (int ky=-r; ky<=r; ++ky) {
1073+ const __m128i* pMWork = (const __m128i*)pFromLine;
1074+ pMTotal = (__m128i*)pTotalLine;
1075+ for (size_t x=0; x<width>>4; ++x) {
1076+ __m128i mData = pMWork[x];
1077+ __m128i mLeft = _mm_unpacklo_epi8(mData, _mm_setzero_si128());
1078+ __m128i mRight = _mm_unpackhi_epi8(mData, _mm_setzero_si128());
1079+
1080+ __m128i totalLeft = *pMTotal;
1081+ __m128i totalRight = *(pMTotal+1);
1082+ *pMTotal++ = _mm_add_epi16(totalLeft, mLeft);
1083+ *pMTotal++ = _mm_add_epi16(totalRight, mRight);
1084+ }
1085+ for (size_t x=width&0xFFF0; x<width; ++x) {
1086+ pTotalLine[x] += pFromLine[x];
1087+ }
1088+ OffsetPtr(pFromLine, fromLineOffsetBytes);
1089+ }
1090+ for (size_t x=0; x<width; ++x) {
1091+ pToLine[x] = (pTotalLine[x] * invLen) >> SHIFT;
1092+ }
1093+ OffsetPtr(pToLine, toLineOffsetBytes);
10341094 }
1035- for (size_t x=0; x<width; ++x) {
1036- pDestLine[x] = (pTotalLine[x] * invLen) >> SHIFT;
1095+
1096+ for (int y=kys0; y<kye0; ++y) {
1097+ Worker::process((const __m128i*)pFromLine2, (const __m128i*)pFromLine, (__m128i*)pTotalLine, (__m128i*)pToLine, mInvLeni, width);
1098+ OffsetPtr(pFromLine, fromLineOffsetBytes);
1099+ OffsetPtr(pFromLine2, fromLineOffsetBytes);
1100+ OffsetPtr(pToLine, toLineOffsetBytes);
10371101 }
1038- OffsetPtr(pDestLine, destLineOffsetBytes);
1039- }
1040-
1041- for (int y=kys0; y<kye0; ++y) {
1042- Worker::process((const __m128i*)pWorkLine2, (const __m128i*)pWorkLine, (__m128i*)pTotalLine, (__m128i*)pDestLine, mInvLeni, width);
1043- OffsetPtr(pWorkLine, workLineOffsetBytes);
1044- OffsetPtr(pWorkLine2, workLineOffsetBytes);
1045- OffsetPtr(pDestLine, destLineOffsetBytes);
1046- }
1047-
1048- if (bBottom) {
1049- pWorkLine2 = pWork;
1050- OffsetPtr(pWorkLine2, (kye0 - r) * workLineOffsetBytes);
1051- pWorkLine = pWork;
1052- OffsetPtr(pWorkLine, (height - 1) * workLineOffsetBytes);
1053- for (size_t y=kye0,cnt=0; y<height; ++y, ++cnt) {
1054- Worker::process((const __m128i*)pWorkLine2, (const __m128i*)pWorkLine, (__m128i*)pTotalLine, (__m128i*)pDestLine, mInvLeni, width);
1055- OffsetPtr(pWorkLine2, workLineOffsetBytes);
1056- OffsetPtr(pWorkLine, -workLineOffsetBytes);
1057- OffsetPtr(pDestLine, destLineOffsetBytes);
1102+
1103+ if (bBottom) {
1104+ pFromLine2 = pFrom;
1105+ OffsetPtr(pFromLine2, (kye0 - r) * fromLineOffsetBytes);
1106+ pFromLine = pFrom;
1107+ OffsetPtr(pFromLine, (height - 1) * fromLineOffsetBytes);
1108+ for (size_t y=kye0,cnt=0; y<height; ++y, ++cnt) {
1109+ Worker::process((const __m128i*)pFromLine2, (const __m128i*)pFromLine, (__m128i*)pTotalLine, (__m128i*)pToLine, mInvLeni, width);
1110+ OffsetPtr(pFromLine2, fromLineOffsetBytes);
1111+ OffsetPtr(pFromLine, -fromLineOffsetBytes);
1112+ OffsetPtr(pToLine, toLineOffsetBytes);
1113+ }
10581114 }
10591115 }
10601116 }
Show on old repository browser