• R/O
  • SSH
  • HTTPS

imagefilter: Commit


Commit MetaInfo

Revision: 31 (tree)
Time: 2013-08-05 00:29:08
Author: berupon

Log Message

Changed the new routine to take care of the right and bottom sides of an image.

Change Summary

Incremental Difference

--- trunk/main.cpp (revision 30)
+++ trunk/main.cpp (revision 31)
@@ -57,8 +57,8 @@
5757 // const size_t nThreads = 2;
5858 // const size_t nThreads = 4;
5959 #else
60- const size_t nThreads = si.dwNumberOfProcessors;
61-// const size_t nThreads = 1;
60+// const size_t nThreads = si.dwNumberOfProcessors;
61+ const size_t nThreads = 1;
6262 #endif
6363 Threads<blur_1b::Parameter> threads;
6464 threads.SetUp(nThreads);
@@ -71,7 +71,7 @@
7171 pCommon.srcLineOffsetBytes =
7272 pCommon.workLineOffsetBytes =
7373 pCommon.destLineOffsetBytes = lineSize;
74- pCommon.radius = 7;
74+ pCommon.radius = 14;
7575 pCommon.iterationCount = 1;
7676 std::vector<blur_1b::Parameter> params(nThreads);
7777 for (size_t i=0; i<nThreads; ++i) {
@@ -91,20 +91,20 @@
9191 }
9292 typedef void (*BlurFuncPtr)(const blur_1b::Parameter& p);
9393 BlurFuncPtr ptrs[] = {
94- blur_1b::test_1,
95- blur_1b::test_2,
96- blur_1b::test_3,
97- blur_1b::test_4,
98- blur_1b::test_5_h,
99- blur_1b::test_5_v,
100- blur_1b::test_5_h,
101- blur_1b::test_6_v,
102- blur_1b::test_7_h,
103- blur_1b::test_7_v,
104- blur_1b::test_8,
105- blur_1b::test_9,
106- blur_1b::test_10,
107- blur_1b::test_11,
94+ //blur_1b::test_1,
95+ //blur_1b::test_2,
96+ //blur_1b::test_3,
97+ //blur_1b::test_4,
98+ //blur_1b::test_5_h,
99+ //blur_1b::test_5_v,
100+ //blur_1b::test_5_h,
101+ //blur_1b::test_6_v,
102+ //blur_1b::test_7_h,
103+ //blur_1b::test_7_v,
104+ //blur_1b::test_8,
105+ //blur_1b::test_9,
106+ //blur_1b::test_10,
107+ //blur_1b::test_11,
108108 blur_1b::test_12,
109109
110110 //blur_1b::test_20,
--- trunk/blur_1b.cpp (revision 30)
+++ trunk/blur_1b.cpp (revision 31)
@@ -1966,6 +1966,7 @@
19661966 size_t len;
19671967
19681968 size_t vcnt;
1969+ size_t remainCount;
19691970 const __m128i* src;
19701971 __m128i* pSumLine;
19711972 __m128i* pSubLine;
@@ -1989,12 +1990,10 @@
19891990 diff1 = shiftAdd16(diff1);
19901991 sum1 = _mm_add_epi16(sum1, diff1);
19911992 }
1992-
1993-
1994- template <typename T>
1993+
19951994 __forceinline
1996- void process(T& storer) {
1997-
1995+ void getPlusMinus(__m128i& plus, __m128i& minus, __m128i& sum3, const __m128i*& plusSrc, const __m128i*& minusSrc)
1996+ {
19981997 // 反転して要素を生成
19991998 __m128i minusSrc2[2];
20001999 minusSrc2[0] = _mm_shuffle_epi8(src[0], REVERSE); // 15-1
@@ -2003,15 +2002,28 @@
20032002 for (size_t i=0; i<1+len*2; ++i) {
20042003 sum += ((const uint8_t*)(minusSrc2+1)) [i - (len+1)];
20052004 }
2006- __m128i sum3 = _mm_set1_epi16(sum);
2007- const __m128i* plusSrc = (const __m128i*) ((const uint8_t*)src + len);
2008- const __m128i* minusSrc = (const __m128i*) ((const uint8_t*)src - (len+1));
2009- __m128i plus = _mm_loadu_si128(plusSrc);
2010- __m128i minus = _mm_loadu_si128(
2005+ sum3.m128i_u16[7] = sum;
2006+ plusSrc = (const __m128i*) ((const uint8_t*)src + len);
2007+ minusSrc = (const __m128i*) ((const uint8_t*)src - (len+1));
2008+ plus = _mm_loadu_si128(plusSrc);
2009+ minus = _mm_loadu_si128(
20112010 (const __m128i*) (
20122011 (const uint8_t*)(minusSrc2+1) - (len+1)
20132012 )
20142013 );
2014+ }
2015+
2016+ template <typename T>
2017+ __forceinline
2018+ void process(T& storer) {
2019+
2020+ const __m128i* plusSrc = 0;
2021+ const __m128i* minusSrc = 0;
2022+ __m128i plus;
2023+ __m128i minus;
2024+ __m128i sum3;
2025+ getPlusMinus(plus, minus, sum3, plusSrc, minusSrc);
2026+
20152027 for (size_t i=0; i<vcnt; ++i) {
20162028 __m128i nextPlus = _mm_loadu_si128(plusSrc+i*2+1);
20172029 __m128i nextMinus = _mm_loadu_si128(minusSrc+i*2+1);
@@ -2062,27 +2074,73 @@
20622074 plus = nextPlus;
20632075 minus = nextMinus;
20642076 }
2077+
2078+ {
2079+ const size_t index = vcnt * 4;
2080+ __m128i vsum0 = pSumLine[index+0];
2081+ __m128i vsum1 = pSumLine[index+1];
2082+ __m128i vsum2 = pSumLine[index+2];
2083+ __m128i vsum3 = pSumLine[index+3];
2084+
2085+ __m128i vminus0 = pSubLine[index+0];
2086+ __m128i vminus1 = pSubLine[index+1];
2087+ __m128i vminus2 = pSubLine[index+2];
2088+ __m128i vminus3 = pSubLine[index+3];
2089+
2090+ storer(vsum0, vsum1, vsum2, vsum3);
2091+ const uint8_t* plusSrc2 = (const uint8_t*) (plusSrc + vcnt * 2);
2092+ const uint8_t* minusSrc2 = (const uint8_t*) (minusSrc + vcnt * 2);
2093+ uint16_t sum = sum3.m128i_u16[7];
2094+ union {
2095+ uint16_t shorts[32];
2096+ struct {
2097+ __m128i v0;
2098+ __m128i v1;
2099+ __m128i v2;
2100+ __m128i v3;
2101+ } str;
2102+ } uni;
2103+ size_t i;
2104+ for (i=0; i<remainCount-len; ++i) {
2105+ sum += plusSrc2[i];
2106+ sum -= minusSrc2[i];
2107+ uni.shorts[i] = sum;
2108+ }
2109+ for (size_t i2=0; i2<len; ++i2) {
2110+ sum += plusSrc2[i-2-i2];
2111+ sum -= minusSrc2[i+i2];
2112+ uni.shorts[i+i2] = sum;
2113+ }
2114+ vsum0 = _mm_sub_epi16(vsum0, vminus0);
2115+ vsum1 = _mm_sub_epi16(vsum1, vminus1);
2116+ vsum2 = _mm_sub_epi16(vsum2, vminus2);
2117+ vsum3 = _mm_sub_epi16(vsum3, vminus3);
2118+ vsum0 = _mm_add_epi16(vsum0, uni.str.v0);
2119+ vsum1 = _mm_add_epi16(vsum1, uni.str.v1);
2120+ vsum2 = _mm_add_epi16(vsum2, uni.str.v2);
2121+ vsum3 = _mm_add_epi16(vsum3, uni.str.v3);
2122+
2123+ pAddLine[index+0] = uni.str.v0;
2124+ pAddLine[index+1] = uni.str.v1;
2125+ pAddLine[index+2] = uni.str.v2;
2126+ pAddLine[index+3] = uni.str.v3;
2127+
2128+ pSumLine[index+0] = vsum0;
2129+ pSumLine[index+1] = vsum1;
2130+ pSumLine[index+2] = vsum2;
2131+ pSumLine[index+3] = vsum3;
2132+ }
2133+
20652134 }
20662135
20672136 __forceinline
20682137 void process2() {
2069- // 反転して要素を生成
2070- __m128i minusSrc2[2];
2071- minusSrc2[0] = _mm_shuffle_epi8(src[0], REVERSE); // 15-1
2072- minusSrc2[1] = src[0];
2073- uint16_t sum = 0;
2074- for (size_t i=0; i<1+len*2; ++i) {
2075- sum += ((const uint8_t*)(minusSrc2+1)) [i - (len+1)];
2076- }
2077- __m128i sum3 = _mm_set1_epi16(sum);
2078- const __m128i* plusSrc = (const __m128i*) ((const uint8_t*)src + len);
2079- const __m128i* minusSrc = (const __m128i*) ((const uint8_t*)src - (len+1));
2080- __m128i plus = _mm_loadu_si128(plusSrc);
2081- __m128i minus = _mm_loadu_si128(
2082- (const __m128i*) (
2083- (const uint8_t*)(minusSrc2+1) - (len+1)
2084- )
2085- );
2138+ const __m128i* plusSrc = 0;
2139+ const __m128i* minusSrc = 0;
2140+ __m128i plus;
2141+ __m128i minus;
2142+ __m128i sum3;
2143+ getPlusMinus(plus, minus, sum3, plusSrc, minusSrc);
20862144 for (size_t i=0; i<vcnt; ++i) {
20872145 __m128i nextPlus = _mm_loadu_si128(plusSrc+i*2+1);
20882146 __m128i nextMinus = _mm_loadu_si128(minusSrc+i*2+1);
@@ -2126,27 +2184,35 @@
21262184 plus = nextPlus;
21272185 minus = nextMinus;
21282186 }
2187+
2188+ const uint8_t* plusSrc2 = (const uint8_t*) (plusSrc + vcnt * 2);
2189+ const uint8_t* minusSrc2 = (const uint8_t*) (minusSrc + vcnt * 2);
2190+ uint16_t* pAddLine2 = (uint16_t*) (pAddLine + vcnt * 4);
2191+ uint16_t* pSumLine2 = (uint16_t*) (pSumLine + vcnt * 4);
2192+ uint16_t sum = sum3.m128i_u16[7];
2193+ size_t i;
2194+ for (i=0; i<remainCount-len; ++i) {
2195+ sum += plusSrc2[i];
2196+ sum -= minusSrc2[i];
2197+ pAddLine2[i] = sum;
2198+ pSumLine2[i] += sum * 2;
2199+ }
2200+ for (size_t i2=0; i2<len; ++i2) {
2201+ sum += plusSrc2[i-2-i2];
2202+ sum -= minusSrc2[i+i2];
2203+ pAddLine2[i+i2] = sum;
2204+ pSumLine2[i+i2] += sum * 2;
2205+ }
21292206 }
21302207
21312208 __forceinline
21322209 void process3() {
2133- // 反転して要素を生成
2134- __m128i minusSrc2[2];
2135- minusSrc2[0] = _mm_shuffle_epi8(src[0], REVERSE); // 15-1
2136- minusSrc2[1] = src[0];
2137- uint16_t sum = 0;
2138- for (size_t i=0; i<1+len*2; ++i) {
2139- sum += ((const uint8_t*)(minusSrc2+1)) [i - (len+1)];
2140- }
2141- __m128i sum3 = _mm_set1_epi16(sum);
2142- const __m128i* plusSrc = (const __m128i*) ((const uint8_t*)src + len);
2143- const __m128i* minusSrc = (const __m128i*) ((const uint8_t*)src - (len+1));
2144- __m128i plus = _mm_loadu_si128(plusSrc);
2145- __m128i minus = _mm_loadu_si128(
2146- (const __m128i*) (
2147- (const uint8_t*)(minusSrc2+1) - (len+1)
2148- )
2149- );
2210+ const __m128i* plusSrc = 0;
2211+ const __m128i* minusSrc = 0;
2212+ __m128i plus;
2213+ __m128i minus;
2214+ __m128i sum3;
2215+ getPlusMinus(plus, minus, sum3, plusSrc, minusSrc);
21502216 for (size_t i=0; i<vcnt; ++i) {
21512217 __m128i nextPlus = _mm_loadu_si128(plusSrc+i*2+1);
21522218 __m128i nextMinus = _mm_loadu_si128(minusSrc+i*2+1);
@@ -2176,7 +2242,65 @@
21762242 plus = nextPlus;
21772243 minus = nextMinus;
21782244 }
2245+
2246+ const uint8_t* plusSrc2 = (const uint8_t*) (plusSrc + vcnt * 2);
2247+ const uint8_t* minusSrc2 = (const uint8_t*) (minusSrc + vcnt * 2);
2248+ uint16_t* pAddLine2 = (uint16_t*) (pAddLine + vcnt * 4);
2249+ uint16_t* pSumLine2 = (uint16_t*) (pSumLine + vcnt * 4);
2250+ uint16_t sum = sum3.m128i_u16[7];
2251+ size_t i;
2252+ for (i=0; i<remainCount-len; ++i) {
2253+ sum += plusSrc2[i];
2254+ sum -= minusSrc2[i];
2255+ pAddLine2[i] = sum;
2256+ pSumLine2[i] = sum;
2257+ }
2258+ for (size_t i2=0; i2<len; ++i2) {
2259+ sum += plusSrc2[i-2-i2];
2260+ sum -= minusSrc2[i+i2];
2261+ pAddLine2[i+i2] = sum;
2262+ pSumLine2[i+i2] = sum;
2263+ }
21792264 }
2265+
2266+ template <typename T>
2267+ __forceinline
2268+ void process4(T& storer) {
2269+
2270+ for (size_t i=0; i<vcnt+1; ++i) {
2271+ __m128i vsum0 = pSumLine[i*4+0];
2272+ __m128i vsum1 = pSumLine[i*4+1];
2273+ __m128i vsum2 = pSumLine[i*4+2];
2274+ __m128i vsum3 = pSumLine[i*4+3];
2275+
2276+ __m128i vminus0 = pSubLine[i*4+0];
2277+ __m128i vminus1 = pSubLine[i*4+1];
2278+ __m128i vminus2 = pSubLine[i*4+2];
2279+ __m128i vminus3 = pSubLine[i*4+3];
2280+
2281+ __m128i vplus0 = pAddLine[i*4+0];
2282+ __m128i vplus1 = pAddLine[i*4+1];
2283+ __m128i vplus2 = pAddLine[i*4+2];
2284+ __m128i vplus3 = pAddLine[i*4+3];
2285+
2286+ storer(vsum0, vsum1, vsum2, vsum3);
2287+
2288+ vsum0 = _mm_sub_epi16(vsum0, vminus0);
2289+ vsum1 = _mm_sub_epi16(vsum1, vminus1);
2290+ vsum2 = _mm_sub_epi16(vsum2, vminus2);
2291+ vsum3 = _mm_sub_epi16(vsum3, vminus3);
2292+ vsum0 = _mm_add_epi16(vsum0, vplus0);
2293+ vsum1 = _mm_add_epi16(vsum1, vplus1);
2294+ vsum2 = _mm_add_epi16(vsum2, vplus2);
2295+ vsum3 = _mm_add_epi16(vsum3, vplus3);
2296+
2297+ pSumLine[i*4+0] = vsum0;
2298+ pSumLine[i*4+1] = vsum1;
2299+ pSumLine[i*4+2] = vsum2;
2300+ pSumLine[i*4+3] = vsum3;
2301+
2302+ }
2303+ }
21802304 };
21812305
21822306 void test_12(const Parameter& p) {
@@ -2188,7 +2312,8 @@
21882312 if (len > 14 || len == 0) {
21892313 return;
21902314 }
2191- size_t vcnt = width / 32;
2315+ size_t vcnt = (width - len) / 32;
2316+ size_t remainCount = width - 32 * vcnt;
21922317 if (vcnt == 0) {
21932318 return;
21942319 }
@@ -2205,6 +2330,7 @@
22052330 HProcessor hProc;
22062331 hProc.len = p.radius;
22072332 hProc.vcnt = vcnt;
2333+ hProc.remainCount = remainCount;
22082334 hProc.pSumLine = pSumLine;
22092335 memset(pSumLine, 0, p.srcLineOffsetBytes*2);
22102336 hProc.pSubLine = pSubLine;
@@ -2212,14 +2338,15 @@
22122338
22132339 struct Storer
22142340 {
2215- __m128i invLen;
2341+ uint16_t invLen;
2342+ __m128i mInvLen;
22162343 __m128i* dst;
22172344
22182345 __forceinline
22192346 void operator () (__m128i& vsum0, __m128i& vsum1, __m128i& vsum2, __m128i& vsum3)
22202347 {
2221- __m128i store0 = _mm_packus_epi16(_mm_mulhi_epu16(vsum0, invLen), _mm_mulhi_epu16(vsum1, invLen));
2222- __m128i store1 = _mm_packus_epi16(_mm_mulhi_epu16(vsum2, invLen), _mm_mulhi_epu16(vsum3, invLen));
2348+ __m128i store0 = _mm_packus_epi16(_mm_mulhi_epu16(vsum0, mInvLen), _mm_mulhi_epu16(vsum1, mInvLen));
2349+ __m128i store1 = _mm_packus_epi16(_mm_mulhi_epu16(vsum2, mInvLen), _mm_mulhi_epu16(vsum3, mInvLen));
22232350 _mm_stream_si128(dst+0, store0);
22242351 _mm_stream_si128(dst+1, store1);
22252352 dst += 2;
@@ -2226,7 +2353,8 @@
22262353 }
22272354 } storer;
22282355 const size_t diameter = 1 + len * 2;
2229- storer.invLen = _mm_set1_epi16(0xFFFF / (diameter*diameter));
2356+ storer.invLen = 0xFFFF / (diameter*diameter);
2357+ storer.mInvLen = _mm_set1_epi16(storer.invLen);
22302358
22312359 hProc.src = src;
22322360 // collect sum (0 to len)
@@ -2266,7 +2394,16 @@
22662394 OffsetPtr(dst, p.destLineOffsetBytes);
22672395 storer.dst = dst;
22682396 }
2269-
2397+ // remain set
2398+ pSubLine.movePrev();
2399+ pSubLine.movePrev();
2400+ for (size_t i=0; i<=len; ++i) {
2401+ hProc.process4(storer);
2402+ pSubLine.movePrev();
2403+ hProc.pSubLine = pSubLine;
2404+ OffsetPtr(dst, p.destLineOffsetBytes);
2405+ storer.dst = dst;
2406+ }
22702407 }
22712408
22722409 void test_20(const Parameter& p) {
Show on old repository browser