changed new routine to take care of the right and bottom sides of an image
@@ -57,8 +57,8 @@ | ||
57 | 57 | // const size_t nThreads = 2; |
58 | 58 | // const size_t nThreads = 4; |
59 | 59 | #else |
60 | - const size_t nThreads = si.dwNumberOfProcessors; | |
61 | -// const size_t nThreads = 1; | |
60 | +// const size_t nThreads = si.dwNumberOfProcessors; | |
61 | + const size_t nThreads = 1; | |
62 | 62 | #endif |
63 | 63 | Threads<blur_1b::Parameter> threads; |
64 | 64 | threads.SetUp(nThreads); |
@@ -71,7 +71,7 @@ | ||
71 | 71 | pCommon.srcLineOffsetBytes = |
72 | 72 | pCommon.workLineOffsetBytes = |
73 | 73 | pCommon.destLineOffsetBytes = lineSize; |
74 | - pCommon.radius = 7; | |
74 | + pCommon.radius = 14; | |
75 | 75 | pCommon.iterationCount = 1; |
76 | 76 | std::vector<blur_1b::Parameter> params(nThreads); |
77 | 77 | for (size_t i=0; i<nThreads; ++i) { |
@@ -91,20 +91,20 @@ | ||
91 | 91 | } |
92 | 92 | typedef void (*BlurFuncPtr)(const blur_1b::Parameter& p); |
93 | 93 | BlurFuncPtr ptrs[] = { |
94 | - blur_1b::test_1, | |
95 | - blur_1b::test_2, | |
96 | - blur_1b::test_3, | |
97 | - blur_1b::test_4, | |
98 | - blur_1b::test_5_h, | |
99 | - blur_1b::test_5_v, | |
100 | - blur_1b::test_5_h, | |
101 | - blur_1b::test_6_v, | |
102 | - blur_1b::test_7_h, | |
103 | - blur_1b::test_7_v, | |
104 | - blur_1b::test_8, | |
105 | - blur_1b::test_9, | |
106 | - blur_1b::test_10, | |
107 | - blur_1b::test_11, | |
94 | + //blur_1b::test_1, | |
95 | + //blur_1b::test_2, | |
96 | + //blur_1b::test_3, | |
97 | + //blur_1b::test_4, | |
98 | + //blur_1b::test_5_h, | |
99 | + //blur_1b::test_5_v, | |
100 | + //blur_1b::test_5_h, | |
101 | + //blur_1b::test_6_v, | |
102 | + //blur_1b::test_7_h, | |
103 | + //blur_1b::test_7_v, | |
104 | + //blur_1b::test_8, | |
105 | + //blur_1b::test_9, | |
106 | + //blur_1b::test_10, | |
107 | + //blur_1b::test_11, | |
108 | 108 | blur_1b::test_12, |
109 | 109 | |
110 | 110 | //blur_1b::test_20, |
@@ -1966,6 +1966,7 @@ | ||
1966 | 1966 | size_t len; |
1967 | 1967 | |
1968 | 1968 | size_t vcnt; |
1969 | + size_t remainCount; | |
1969 | 1970 | const __m128i* src; |
1970 | 1971 | __m128i* pSumLine; |
1971 | 1972 | __m128i* pSubLine; |
@@ -1989,12 +1990,10 @@ | ||
1989 | 1990 | diff1 = shiftAdd16(diff1); |
1990 | 1991 | sum1 = _mm_add_epi16(sum1, diff1); |
1991 | 1992 | } |
1992 | - | |
1993 | - | |
1994 | - template <typename T> | |
1993 | + | |
1995 | 1994 | __forceinline |
1996 | - void process(T& storer) { | |
1997 | - | |
1995 | + void getPlusMinus(__m128i& plus, __m128i& minus, __m128i& sum3, const __m128i*& plusSrc, const __m128i*& minusSrc) | |
1996 | + { | |
1998 | 1997 | // 反転して要素を生成 |
1999 | 1998 | __m128i minusSrc2[2]; |
2000 | 1999 | minusSrc2[0] = _mm_shuffle_epi8(src[0], REVERSE); // 15-1 |
@@ -2003,15 +2002,28 @@ | ||
2003 | 2002 | for (size_t i=0; i<1+len*2; ++i) { |
2004 | 2003 | sum += ((const uint8_t*)(minusSrc2+1)) [i - (len+1)]; |
2005 | 2004 | } |
2006 | - __m128i sum3 = _mm_set1_epi16(sum); | |
2007 | - const __m128i* plusSrc = (const __m128i*) ((const uint8_t*)src + len); | |
2008 | - const __m128i* minusSrc = (const __m128i*) ((const uint8_t*)src - (len+1)); | |
2009 | - __m128i plus = _mm_loadu_si128(plusSrc); | |
2010 | - __m128i minus = _mm_loadu_si128( | |
2005 | + sum3.m128i_u16[7] = sum; | |
2006 | + plusSrc = (const __m128i*) ((const uint8_t*)src + len); | |
2007 | + minusSrc = (const __m128i*) ((const uint8_t*)src - (len+1)); | |
2008 | + plus = _mm_loadu_si128(plusSrc); | |
2009 | + minus = _mm_loadu_si128( | |
2011 | 2010 | (const __m128i*) ( |
2012 | 2011 | (const uint8_t*)(minusSrc2+1) - (len+1) |
2013 | 2012 | ) |
2014 | 2013 | ); |
2014 | + } | |
2015 | + | |
2016 | + template <typename T> | |
2017 | + __forceinline | |
2018 | + void process(T& storer) { | |
2019 | + | |
2020 | + const __m128i* plusSrc = 0; | |
2021 | + const __m128i* minusSrc = 0; | |
2022 | + __m128i plus; | |
2023 | + __m128i minus; | |
2024 | + __m128i sum3; | |
2025 | + getPlusMinus(plus, minus, sum3, plusSrc, minusSrc); | |
2026 | + | |
2015 | 2027 | for (size_t i=0; i<vcnt; ++i) { |
2016 | 2028 | __m128i nextPlus = _mm_loadu_si128(plusSrc+i*2+1); |
2017 | 2029 | __m128i nextMinus = _mm_loadu_si128(minusSrc+i*2+1); |
@@ -2062,27 +2074,73 @@ | ||
2062 | 2074 | plus = nextPlus; |
2063 | 2075 | minus = nextMinus; |
2064 | 2076 | } |
2077 | + | |
2078 | + { | |
2079 | + const size_t index = vcnt * 4; | |
2080 | + __m128i vsum0 = pSumLine[index+0]; | |
2081 | + __m128i vsum1 = pSumLine[index+1]; | |
2082 | + __m128i vsum2 = pSumLine[index+2]; | |
2083 | + __m128i vsum3 = pSumLine[index+3]; | |
2084 | + | |
2085 | + __m128i vminus0 = pSubLine[index+0]; | |
2086 | + __m128i vminus1 = pSubLine[index+1]; | |
2087 | + __m128i vminus2 = pSubLine[index+2]; | |
2088 | + __m128i vminus3 = pSubLine[index+3]; | |
2089 | + | |
2090 | + storer(vsum0, vsum1, vsum2, vsum3); | |
2091 | + const uint8_t* plusSrc2 = (const uint8_t*) (plusSrc + vcnt * 2); | |
2092 | + const uint8_t* minusSrc2 = (const uint8_t*) (minusSrc + vcnt * 2); | |
2093 | + uint16_t sum = sum3.m128i_u16[7]; | |
2094 | + union { | |
2095 | + uint16_t shorts[32]; | |
2096 | + struct { | |
2097 | + __m128i v0; | |
2098 | + __m128i v1; | |
2099 | + __m128i v2; | |
2100 | + __m128i v3; | |
2101 | + } str; | |
2102 | + } uni; | |
2103 | + size_t i; | |
2104 | + for (i=0; i<remainCount-len; ++i) { | |
2105 | + sum += plusSrc2[i]; | |
2106 | + sum -= minusSrc2[i]; | |
2107 | + uni.shorts[i] = sum; | |
2108 | + } | |
2109 | + for (size_t i2=0; i2<len; ++i2) { | |
2110 | + sum += plusSrc2[i-2-i2]; | |
2111 | + sum -= minusSrc2[i+i2]; | |
2112 | + uni.shorts[i+i2] = sum; | |
2113 | + } | |
2114 | + vsum0 = _mm_sub_epi16(vsum0, vminus0); | |
2115 | + vsum1 = _mm_sub_epi16(vsum1, vminus1); | |
2116 | + vsum2 = _mm_sub_epi16(vsum2, vminus2); | |
2117 | + vsum3 = _mm_sub_epi16(vsum3, vminus3); | |
2118 | + vsum0 = _mm_add_epi16(vsum0, uni.str.v0); | |
2119 | + vsum1 = _mm_add_epi16(vsum1, uni.str.v1); | |
2120 | + vsum2 = _mm_add_epi16(vsum2, uni.str.v2); | |
2121 | + vsum3 = _mm_add_epi16(vsum3, uni.str.v3); | |
2122 | + | |
2123 | + pAddLine[index+0] = uni.str.v0; | |
2124 | + pAddLine[index+1] = uni.str.v1; | |
2125 | + pAddLine[index+2] = uni.str.v2; | |
2126 | + pAddLine[index+3] = uni.str.v3; | |
2127 | + | |
2128 | + pSumLine[index+0] = vsum0; | |
2129 | + pSumLine[index+1] = vsum1; | |
2130 | + pSumLine[index+2] = vsum2; | |
2131 | + pSumLine[index+3] = vsum3; | |
2132 | + } | |
2133 | + | |
2065 | 2134 | } |
2066 | 2135 | |
2067 | 2136 | __forceinline |
2068 | 2137 | void process2() { |
2069 | - // 反転して要素を生成 | |
2070 | - __m128i minusSrc2[2]; | |
2071 | - minusSrc2[0] = _mm_shuffle_epi8(src[0], REVERSE); // 15-1 | |
2072 | - minusSrc2[1] = src[0]; | |
2073 | - uint16_t sum = 0; | |
2074 | - for (size_t i=0; i<1+len*2; ++i) { | |
2075 | - sum += ((const uint8_t*)(minusSrc2+1)) [i - (len+1)]; | |
2076 | - } | |
2077 | - __m128i sum3 = _mm_set1_epi16(sum); | |
2078 | - const __m128i* plusSrc = (const __m128i*) ((const uint8_t*)src + len); | |
2079 | - const __m128i* minusSrc = (const __m128i*) ((const uint8_t*)src - (len+1)); | |
2080 | - __m128i plus = _mm_loadu_si128(plusSrc); | |
2081 | - __m128i minus = _mm_loadu_si128( | |
2082 | - (const __m128i*) ( | |
2083 | - (const uint8_t*)(minusSrc2+1) - (len+1) | |
2084 | - ) | |
2085 | - ); | |
2138 | + const __m128i* plusSrc = 0; | |
2139 | + const __m128i* minusSrc = 0; | |
2140 | + __m128i plus; | |
2141 | + __m128i minus; | |
2142 | + __m128i sum3; | |
2143 | + getPlusMinus(plus, minus, sum3, plusSrc, minusSrc); | |
2086 | 2144 | for (size_t i=0; i<vcnt; ++i) { |
2087 | 2145 | __m128i nextPlus = _mm_loadu_si128(plusSrc+i*2+1); |
2088 | 2146 | __m128i nextMinus = _mm_loadu_si128(minusSrc+i*2+1); |
@@ -2126,27 +2184,35 @@ | ||
2126 | 2184 | plus = nextPlus; |
2127 | 2185 | minus = nextMinus; |
2128 | 2186 | } |
2187 | + | |
2188 | + const uint8_t* plusSrc2 = (const uint8_t*) (plusSrc + vcnt * 2); | |
2189 | + const uint8_t* minusSrc2 = (const uint8_t*) (minusSrc + vcnt * 2); | |
2190 | + uint16_t* pAddLine2 = (uint16_t*) (pAddLine + vcnt * 4); | |
2191 | + uint16_t* pSumLine2 = (uint16_t*) (pSumLine + vcnt * 4); | |
2192 | + uint16_t sum = sum3.m128i_u16[7]; | |
2193 | + size_t i; | |
2194 | + for (i=0; i<remainCount-len; ++i) { | |
2195 | + sum += plusSrc2[i]; | |
2196 | + sum -= minusSrc2[i]; | |
2197 | + pAddLine2[i] = sum; | |
2198 | + pSumLine2[i] += sum * 2; | |
2199 | + } | |
2200 | + for (size_t i2=0; i2<len; ++i2) { | |
2201 | + sum += plusSrc2[i-2-i2]; | |
2202 | + sum -= minusSrc2[i+i2]; | |
2203 | + pAddLine2[i+i2] = sum; | |
2204 | + pSumLine2[i+i2] += sum * 2; | |
2205 | + } | |
2129 | 2206 | } |
2130 | 2207 | |
2131 | 2208 | __forceinline |
2132 | 2209 | void process3() { |
2133 | - // 反転して要素を生成 | |
2134 | - __m128i minusSrc2[2]; | |
2135 | - minusSrc2[0] = _mm_shuffle_epi8(src[0], REVERSE); // 15-1 | |
2136 | - minusSrc2[1] = src[0]; | |
2137 | - uint16_t sum = 0; | |
2138 | - for (size_t i=0; i<1+len*2; ++i) { | |
2139 | - sum += ((const uint8_t*)(minusSrc2+1)) [i - (len+1)]; | |
2140 | - } | |
2141 | - __m128i sum3 = _mm_set1_epi16(sum); | |
2142 | - const __m128i* plusSrc = (const __m128i*) ((const uint8_t*)src + len); | |
2143 | - const __m128i* minusSrc = (const __m128i*) ((const uint8_t*)src - (len+1)); | |
2144 | - __m128i plus = _mm_loadu_si128(plusSrc); | |
2145 | - __m128i minus = _mm_loadu_si128( | |
2146 | - (const __m128i*) ( | |
2147 | - (const uint8_t*)(minusSrc2+1) - (len+1) | |
2148 | - ) | |
2149 | - ); | |
2210 | + const __m128i* plusSrc = 0; | |
2211 | + const __m128i* minusSrc = 0; | |
2212 | + __m128i plus; | |
2213 | + __m128i minus; | |
2214 | + __m128i sum3; | |
2215 | + getPlusMinus(plus, minus, sum3, plusSrc, minusSrc); | |
2150 | 2216 | for (size_t i=0; i<vcnt; ++i) { |
2151 | 2217 | __m128i nextPlus = _mm_loadu_si128(plusSrc+i*2+1); |
2152 | 2218 | __m128i nextMinus = _mm_loadu_si128(minusSrc+i*2+1); |
@@ -2176,7 +2242,65 @@ | ||
2176 | 2242 | plus = nextPlus; |
2177 | 2243 | minus = nextMinus; |
2178 | 2244 | } |
2245 | + | |
2246 | + const uint8_t* plusSrc2 = (const uint8_t*) (plusSrc + vcnt * 2); | |
2247 | + const uint8_t* minusSrc2 = (const uint8_t*) (minusSrc + vcnt * 2); | |
2248 | + uint16_t* pAddLine2 = (uint16_t*) (pAddLine + vcnt * 4); | |
2249 | + uint16_t* pSumLine2 = (uint16_t*) (pSumLine + vcnt * 4); | |
2250 | + uint16_t sum = sum3.m128i_u16[7]; | |
2251 | + size_t i; | |
2252 | + for (i=0; i<remainCount-len; ++i) { | |
2253 | + sum += plusSrc2[i]; | |
2254 | + sum -= minusSrc2[i]; | |
2255 | + pAddLine2[i] = sum; | |
2256 | + pSumLine2[i] = sum; | |
2257 | + } | |
2258 | + for (size_t i2=0; i2<len; ++i2) { | |
2259 | + sum += plusSrc2[i-2-i2]; | |
2260 | + sum -= minusSrc2[i+i2]; | |
2261 | + pAddLine2[i+i2] = sum; | |
2262 | + pSumLine2[i+i2] = sum; | |
2263 | + } | |
2179 | 2264 | } |
2265 | + | |
2266 | + template <typename T> | |
2267 | + __forceinline | |
2268 | + void process4(T& storer) { | |
2269 | + | |
2270 | + for (size_t i=0; i<vcnt+1; ++i) { | |
2271 | + __m128i vsum0 = pSumLine[i*4+0]; | |
2272 | + __m128i vsum1 = pSumLine[i*4+1]; | |
2273 | + __m128i vsum2 = pSumLine[i*4+2]; | |
2274 | + __m128i vsum3 = pSumLine[i*4+3]; | |
2275 | + | |
2276 | + __m128i vminus0 = pSubLine[i*4+0]; | |
2277 | + __m128i vminus1 = pSubLine[i*4+1]; | |
2278 | + __m128i vminus2 = pSubLine[i*4+2]; | |
2279 | + __m128i vminus3 = pSubLine[i*4+3]; | |
2280 | + | |
2281 | + __m128i vplus0 = pAddLine[i*4+0]; | |
2282 | + __m128i vplus1 = pAddLine[i*4+1]; | |
2283 | + __m128i vplus2 = pAddLine[i*4+2]; | |
2284 | + __m128i vplus3 = pAddLine[i*4+3]; | |
2285 | + | |
2286 | + storer(vsum0, vsum1, vsum2, vsum3); | |
2287 | + | |
2288 | + vsum0 = _mm_sub_epi16(vsum0, vminus0); | |
2289 | + vsum1 = _mm_sub_epi16(vsum1, vminus1); | |
2290 | + vsum2 = _mm_sub_epi16(vsum2, vminus2); | |
2291 | + vsum3 = _mm_sub_epi16(vsum3, vminus3); | |
2292 | + vsum0 = _mm_add_epi16(vsum0, vplus0); | |
2293 | + vsum1 = _mm_add_epi16(vsum1, vplus1); | |
2294 | + vsum2 = _mm_add_epi16(vsum2, vplus2); | |
2295 | + vsum3 = _mm_add_epi16(vsum3, vplus3); | |
2296 | + | |
2297 | + pSumLine[i*4+0] = vsum0; | |
2298 | + pSumLine[i*4+1] = vsum1; | |
2299 | + pSumLine[i*4+2] = vsum2; | |
2300 | + pSumLine[i*4+3] = vsum3; | |
2301 | + | |
2302 | + } | |
2303 | + } | |
2180 | 2304 | }; |
2181 | 2305 | |
2182 | 2306 | void test_12(const Parameter& p) { |
@@ -2188,7 +2312,8 @@ | ||
2188 | 2312 | if (len > 14 || len == 0) { |
2189 | 2313 | return; |
2190 | 2314 | } |
2191 | - size_t vcnt = width / 32; | |
2315 | + size_t vcnt = (width - len) / 32; | |
2316 | + size_t remainCount = width - 32 * vcnt; | |
2192 | 2317 | if (vcnt == 0) { |
2193 | 2318 | return; |
2194 | 2319 | } |
@@ -2205,6 +2330,7 @@ | ||
2205 | 2330 | HProcessor hProc; |
2206 | 2331 | hProc.len = p.radius; |
2207 | 2332 | hProc.vcnt = vcnt; |
2333 | + hProc.remainCount = remainCount; | |
2208 | 2334 | hProc.pSumLine = pSumLine; |
2209 | 2335 | memset(pSumLine, 0, p.srcLineOffsetBytes*2); |
2210 | 2336 | hProc.pSubLine = pSubLine; |
@@ -2212,14 +2338,15 @@ | ||
2212 | 2338 | |
2213 | 2339 | struct Storer |
2214 | 2340 | { |
2215 | - __m128i invLen; | |
2341 | + uint16_t invLen; | |
2342 | + __m128i mInvLen; | |
2216 | 2343 | __m128i* dst; |
2217 | 2344 | |
2218 | 2345 | __forceinline |
2219 | 2346 | void operator () (__m128i& vsum0, __m128i& vsum1, __m128i& vsum2, __m128i& vsum3) |
2220 | 2347 | { |
2221 | - __m128i store0 = _mm_packus_epi16(_mm_mulhi_epu16(vsum0, invLen), _mm_mulhi_epu16(vsum1, invLen)); | |
2222 | - __m128i store1 = _mm_packus_epi16(_mm_mulhi_epu16(vsum2, invLen), _mm_mulhi_epu16(vsum3, invLen)); | |
2348 | + __m128i store0 = _mm_packus_epi16(_mm_mulhi_epu16(vsum0, mInvLen), _mm_mulhi_epu16(vsum1, mInvLen)); | |
2349 | + __m128i store1 = _mm_packus_epi16(_mm_mulhi_epu16(vsum2, mInvLen), _mm_mulhi_epu16(vsum3, mInvLen)); | |
2223 | 2350 | _mm_stream_si128(dst+0, store0); |
2224 | 2351 | _mm_stream_si128(dst+1, store1); |
2225 | 2352 | dst += 2; |
@@ -2226,7 +2353,8 @@ | ||
2226 | 2353 | } |
2227 | 2354 | } storer; |
2228 | 2355 | const size_t diameter = 1 + len * 2; |
2229 | - storer.invLen = _mm_set1_epi16(0xFFFF / (diameter*diameter)); | |
2356 | + storer.invLen = 0xFFFF / (diameter*diameter); | |
2357 | + storer.mInvLen = _mm_set1_epi16(storer.invLen); | |
2230 | 2358 | |
2231 | 2359 | hProc.src = src; |
2232 | 2360 | // collect sum (0 to len) |
@@ -2266,7 +2394,16 @@ | ||
2266 | 2394 | OffsetPtr(dst, p.destLineOffsetBytes); |
2267 | 2395 | storer.dst = dst; |
2268 | 2396 | } |
2269 | - | |
2397 | + // remain set | |
2398 | + pSubLine.movePrev(); | |
2399 | + pSubLine.movePrev(); | |
2400 | + for (size_t i=0; i<=len; ++i) { | |
2401 | + hProc.process4(storer); | |
2402 | + pSubLine.movePrev(); | |
2403 | + hProc.pSubLine = pSubLine; | |
2404 | + OffsetPtr(dst, p.destLineOffsetBytes); | |
2405 | + storer.dst = dst; | |
2406 | + } | |
2270 | 2407 | } |
2271 | 2408 | |
2272 | 2409 | void test_20(const Parameter& p) { |