• R/O
  • HTTP
  • SSH
  • HTTPS

Commit

Frequently used words (click to add to your profile)

java, c++, android, linux, c#, windows, objective-c, cocoa, 誰得, qt, python, php, ruby, game, gui, bathyscaphe, c, 計画中(planning stage), 翻訳, omegat, framework, twitter, dom, test, vb.net, directx, ゲームエンジン, btron, arduino, previewer

減色プログラム


Commit MetaInfo

Revision: 4b1196c90fb583a3002d91b6997db06f780726fe (tree)
Time: 2011-05-22 13:48:48
Author: beru <berupon@gmai...>
Committer: beru

Log Message

Used AVX intrinsics, but they don't improve processing speed. Actually, the AVX version is slower than the SSE version. Why is that?

Change Summary

Incremental Difference

--- a/Array.h
+++ b/Array.h
@@ -12,7 +12,7 @@ struct Array2D
1212
1313 void allocate() {
1414 // pBuff_ = new T[width_ * height_];
15- pBuff_ = (T*) _aligned_malloc(sizeof(T) * width_ * height_, 16);
15+ pBuff_ = (T*) _aligned_malloc(sizeof(T) * width_ * height_, 32);
1616 }
1717
1818 Array2D(const Array2D& arr)
@@ -52,102 +52,102 @@ struct Array2D
5252 }
5353
5454 __forceinline
55- T* operator[] (int row) {
56- return &pBuff_[row * width_];
57- }
58-
59- __forceinline
60- const T* operator[] (int row) const {
61- return &pBuff_[row * width_];
62- }
55+ T* operator[] (int row) {
56+ return &pBuff_[row * width_];
57+ }
58+
59+ __forceinline
60+ const T* operator[] (int row) const {
61+ return &pBuff_[row * width_];
62+ }
6363
64- Array2D<T>& operator *= (const T& scalar) {
65-#if 1
66- for (size_t i=0; i<width_*height_; ++i) {
67- pBuff_[i] *= scalar;
68- }
69-#else
70- for (int i=0; i<width_; i++) {
71- for (int j=0; j<height_; j++) {
72- (*this)[j][i] *= scalar;
73- }
74- }
75-#endif
76- return *this;
77- }
78-
79- template <typename T2>
80- Array2D<T> operator * (const T2& scalar) {
81- Array2D<T> result(*this);
82- result *= scalar;
83- return result;
84- }
85-
86- std::vector<T> operator * (const std::vector<T>& vec) {
87- std::vector<T> result(height_);
88- T sum;
89- for (int row=0; row<height_; row++) {
90- sum = 0;
91- for (int col=0; col<width_; col++) {
92- sum += (*this)[row][col] * vec[col];
93- }
94- result[row] = sum;
95- }
96- return result;
64+ Array2D<T>& operator *= (const T& scalar) {
65+#if 1
66+ for (size_t i=0; i<width_*height_; ++i) {
67+ pBuff_[i] *= scalar;
68+ }
69+#else
70+ for (int i=0; i<width_; i++) {
71+ for (int j=0; j<height_; j++) {
72+ (*this)[j][i] *= scalar;
73+ }
74+ }
75+#endif
76+ return *this;
9777 }
9878
99- Array2D<T>& multiply_row_scalar(int row, double mult) {
100- for (int i=0; i<width_; i++) {
101- (*this)[row][i] *= mult;
102- }
103- return *this;
104- }
105-
106- Array2D<T>& add_row_multiple(int from_row, int to_row, double mult) {
107- for (int i=0; i<width_; ++i) {
108- (*this)[to_row][i] += mult*(*this)[from_row][i];
109- }
110- return *this;
111- }
112-
113- // We use simple Gaussian elimination - perf doesn't matter since
114- // the matrices will be K x K, where K = number of palette entries.
115- Array2D<T> matrix_inverse() {
116- Array2D<T> result(width_, height_);
117- Array2D<T>& a = *this;
118-
119- // Set result to identity matrix
120- result *= 0;
121- for (int i=0; i<width_; i++) {
122- result[i][i] = 1;
123- }
124- // Reduce to echelon form, mirroring in result
125- for (int i=0; i<width_; i++) {
126- result.multiply_row_scalar(i, 1/a[i][i]);
127- multiply_row_scalar(i, 1/a[i][i]);
128- for (int j=i+1; j<height_; j++) {
129- result.add_row_multiple(i, j, -a[j][i]);
130- add_row_multiple(i, j, -a[j][i]);
131- }
132- }
133- // Back substitute, mirroring in result
134- for (int i=width_-1; i>=0; i--) {
135- for (int j=i-1; j>=0; j--) {
136- result.add_row_multiple(i, j, -a[j][i]);
137- add_row_multiple(i, j, -a[j][i]);
138- }
139- }
140- // result is now the inverse
141- return result;
142- }
79+ template <typename T2>
80+ Array2D<T> operator * (const T2& scalar) {
81+ Array2D<T> result(*this);
82+ result *= scalar;
83+ return result;
84+ }
85+
86+ std::vector<T> operator * (const std::vector<T>& vec) {
87+ std::vector<T> result(height_);
88+ T sum;
89+ for (int row=0; row<height_; row++) {
90+ sum = 0;
91+ for (int col=0; col<width_; col++) {
92+ sum += (*this)[row][col] * vec[col];
93+ }
94+ result[row] = sum;
95+ }
96+ return result;
97+ }
98+
99+ Array2D<T>& multiply_row_scalar(int row, double mult) {
100+ for (int i=0; i<width_; i++) {
101+ (*this)[row][i] *= mult;
102+ }
103+ return *this;
104+ }
105+
106+ Array2D<T>& add_row_multiple(int from_row, int to_row, double mult) {
107+ for (int i=0; i<width_; ++i) {
108+ (*this)[to_row][i] += mult*(*this)[from_row][i];
109+ }
110+ return *this;
111+ }
112+
113+ // We use simple Gaussian elimination - perf doesn't matter since
114+ // the matrices will be K x K, where K = number of palette entries.
115+ Array2D<T> matrix_inverse() {
116+ Array2D<T> result(width_, height_);
117+ Array2D<T>& a = *this;
118+
119+ // Set result to identity matrix
120+ result *= 0;
121+ for (int i=0; i<width_; i++) {
122+ result[i][i] = 1;
123+ }
124+ // Reduce to echelon form, mirroring in result
125+ for (int i=0; i<width_; i++) {
126+ result.multiply_row_scalar(i, 1/a[i][i]);
127+ multiply_row_scalar(i, 1/a[i][i]);
128+ for (int j=i+1; j<height_; j++) {
129+ result.add_row_multiple(i, j, -a[j][i]);
130+ add_row_multiple(i, j, -a[j][i]);
131+ }
132+ }
133+ // Back substitute, mirroring in result
134+ for (int i=width_-1; i>=0; i--) {
135+ for (int j=i-1; j>=0; j--) {
136+ result.add_row_multiple(i, j, -a[j][i]);
137+ add_row_multiple(i, j, -a[j][i]);
138+ }
139+ }
140+ // result is now the inverse
141+ return result;
142+ }
143143
144144 };
145145
146-template <typename T>
147-Array2D<T> operator * (T scalar, const Array2D<T>& a) {
148- Array2D<T> tmp = a;
149- return tmp * scalar;
150-}
146+template <typename T>
147+Array2D<T> operator * (T scalar, const Array2D<T>& a) {
148+ Array2D<T> tmp = a;
149+ return tmp * scalar;
150+}
151151
152152 template <typename T>
153153 struct Array3D
@@ -186,11 +186,11 @@ public:
186186 }
187187
188188 /*
189- Array2D<T> operator[] (int depth) {
190- return Array2D<T>(width_, height_, &pBuff_[depth * width_ * height_]);
189+ Array2D<T> operator[] (int depth) {
190+ return Array2D<T>(width_, height_, &pBuff_[depth * width_ * height_]);
191191 }
192- Array2D<T> operator[] (int depth) const {
193- return Array2D<T>(width_, height_, &pBuff_[depth * width_ * height_]);
192+ Array2D<T> operator[] (int depth) const {
193+ return Array2D<T>(width_, height_, &pBuff_[depth * width_ * height_]);
194194 }
195195 */
196196 __forceinline
--- /dev/null
+++ b/Color4d_avx.h
@@ -0,0 +1,115 @@
1+#pragma once
2+
3+#include <immintrin.h>
4+
5+struct Color4d
6+{
7+ __m256d v;
8+
9+ Color4d() {
10+ ;
11+ }
12+
13+ Color4d(const Color4d& c) {
14+ *this = c;
15+ }
16+
17+ Color4d(double r, double g, double b, double a) {
18+ v = _mm256_setr_pd(r,g,b,a);
19+ }
20+
21+ Color4d& operator = (const Color4d& rhs) {
22+ v = rhs.v;
23+ return *this;
24+ }
25+
26+ Color4d direct_product(const Color4d& rhs) const {
27+ Color4d result;
28+ result.v = _mm256_mul_pd(v, rhs.v);
29+ return result;
30+ }
31+
32+ double dot_product(const Color4d& rhs) {
33+// http://www.icnet.ne.jp/~nsystem/simd_tobira/dpps.html
34+ __m256d s = _mm256_mul_pd(this->v, rhs.v);
35+ __m128d s1 = _mm256_extractf128_pd(s, 0);
36+ __m128d s2 = _mm256_extractf128_pd(s, 1);
37+ __m128d as = _mm_add_pd(s1, s2);
38+ as = _mm_hadd_pd(as, as);
39+ return as.m128d_f64[0];
40+ }
41+
42+ Color4d& operator += (const Color4d& rhs) {
43+ v = _mm256_add_pd(v, rhs.v);
44+ return *this;
45+ }
46+
47+ Color4d operator + (const Color4d& rhs) {
48+ return Color4d(*this) += rhs;
49+ }
50+
51+ Color4d& operator -= (const Color4d& rhs) {
52+ v = _mm256_sub_pd(v, rhs.v);
53+ return *this;
54+ }
55+
56+ Color4d operator - (const Color4d& rhs) {
57+ return Color4d(*this) -= rhs;
58+ }
59+
60+ Color4d& operator *= (const Color4d& rhs) {
61+ v = _mm256_mul_pd(v, rhs.v);
62+ return *this;
63+ }
64+
65+ Color4d operator * (const Color4d& rhs) {
66+ return Color4d(*this) *= rhs;
67+ }
68+
69+ Color4d& operator *= (double scalar) {
70+ __m256d s = _mm256_set1_pd(scalar);
71+ v = _mm256_mul_pd(v, s);
72+ return *this;
73+ }
74+
75+ Color4d operator * (double scalar) {
76+ return Color4d(*this) *= scalar;
77+ }
78+
79+ double& operator[] (int idx) {
80+ return ((double*)&v)[idx];
81+ }
82+ const double& operator[] (int idx) const {
83+ return ((double*)&v)[idx];
84+ }
85+
86+ double norm_squared() {
87+#if 1
88+ __m256d s = _mm256_mul_pd(v, v);
89+ __m128d s1 = _mm256_extractf128_pd(s, 0);
90+ __m128d s2 = _mm256_extractf128_pd(s, 1);
91+ __m128d as = _mm_add_pd(s1, s2);
92+ as = _mm_hadd_pd(as, as);
93+ return as.m128d_f64[0];
94+#else
95+ double result = 0;
96+ for (int i=0; i<3; i++) {
97+ result += (*this)[i] * (*this)[i];
98+ }
99+ return result;
100+#endif
101+ }
102+
103+ void zero() {
104+ v = _mm256_setzero_pd();
105+ }
106+};
107+
108+inline Color4d operator * (double scalar, const Color4d& c) {
109+ return Color4d(c) *= scalar;
110+}
111+
112+inline Color4d operator * (const Color4d& c, double scalar) {
113+ return Color4d(c) *= scalar;
114+}
115+
--- a/Color4d.h
+++ b/Color4d_sse.h
@@ -1,5 +1,7 @@
11 #pragma once
22
3+#include <intrin.h>
4+
35 struct Color4d
46 {
57 __m128d v[2];
@@ -24,16 +26,16 @@ struct Color4d
2426 }
2527
2628 Color4d direct_product(const Color4d& rhs) const {
27- Color4d result;
28-#if 1
29- result.v[0] = _mm_mul_pd(v[0], rhs.v[0]);
30- result.v[1] = _mm_mul_pd(v[1], rhs.v[1]);
31-#else
32- for (int i=0; i<3; i++) {
33- result[i] = (*this)[i] * rhs[i];
34- }
35-#endif
36- return result;
29+ Color4d result;
30+#if 1
31+ result.v[0] = _mm_mul_pd(v[0], rhs.v[0]);
32+ result.v[1] = _mm_mul_pd(v[1], rhs.v[1]);
33+#else
34+ for (int i=0; i<3; i++) {
35+ result[i] = (*this)[i] * rhs[i];
36+ }
37+#endif
38+ return result;
3739 }
3840
3941 double dot_product(const Color4d& rhs) {
@@ -44,10 +46,10 @@ struct Color4d
4446 v = _mm_hadd_pd(v, v);
4547 return v.m128d_f64[0];
4648 #else
47- double result = 0;
48- for (int i=0; i<3; i++) {
49- result += (*this)[i] * rhs[i];
50- }
49+ double result = 0;
50+ for (int i=0; i<3; i++) {
51+ result += (*this)[i] * rhs[i];
52+ }
5153 return result;
5254 #endif
5355 }
@@ -101,14 +103,14 @@ struct Color4d
101103 return Color4d(*this) *= scalar;
102104 }
103105
104- double& operator[] (int idx) {
105- return ((double*)&v)[idx];
106+ double& operator[] (int idx) {
107+ return ((double*)&v)[idx];
106108 }
107- const double& operator[] (int idx) const {
108- return ((double*)&v)[idx];
109+ const double& operator[] (int idx) const {
110+ return ((double*)&v)[idx];
109111 }
110112
111- double norm_squared() {
113+ double norm_squared() {
112114 #if 1
113115 __m128d t = _mm_add_pd(
114116 _mm_mul_pd(v[0], v[0]),
@@ -117,13 +119,13 @@ struct Color4d
117119 t = _mm_hadd_pd(t, t);
118120 return t.m128d_f64[0];
119121 #else
120- double result = 0;
121- for (int i=0; i<3; i++) {
122- result += (*this)[i] * (*this)[i];
123- }
124- return result;
125-#endif
126- }
122+ double result = 0;
123+ for (int i=0; i<3; i++) {
124+ result += (*this)[i] * (*this)[i];
125+ }
126+ return result;
127+#endif
128+ }
127129
128130 void zero() {
129131 v[0] = _mm_setzero_pd();
@@ -131,11 +133,11 @@ struct Color4d
131133 }
132134 };
133135
134-inline Color4d operator * (double scalar, const Color4d& c) {
135- return Color4d(c) *= scalar;
136-}
136+inline Color4d operator * (double scalar, const Color4d& c) {
137+ return Color4d(c) *= scalar;
138+}
137139
138-inline Color4d operator * (const Color4d& c, double scalar) {
139- return Color4d(c) *= scalar;
140-}
140+inline Color4d operator * (const Color4d& c, double scalar) {
141+ return Color4d(c) *= scalar;
142+}
141143
--- a/Color4f.cpp
+++ b/Color4f_sse.cpp
@@ -1,7 +1,6 @@
11 #include "stdafx.h"
2-#include "Color4f.h"
3-
4-#include "Color4d.h"
2+#include "Color4f_sse.h"
3+#include "Color4d_sse.h"
54
65 Color4f& Color4f::operator = (const Color4d& rhs)
76 {
--- a/Color4f.h
+++ b/Color4f_sse.h
@@ -30,23 +30,23 @@ struct Color4f
3030 Color4f& operator = (const Color4d& rhs);
3131
3232 Color4f direct_product(const Color4f& rhs) const {
33- Color4f result;
34-#if 1
35- result.v = _mm_mul_ps(v, rhs.v);
36-#else
37- for (int i=0; i<3; i++) {
38- result[i] = (*this)[i] * rhs[i];
39- }
40-#endif
41- return result;
33+ Color4f result;
34+#if 1
35+ result.v = _mm_mul_ps(v, rhs.v);
36+#else
37+ for (int i=0; i<3; i++) {
38+ result[i] = (*this)[i] * rhs[i];
39+ }
40+#endif
41+ return result;
4242 }
4343
4444 float dot_product(const Color4f& rhs) {
4545 // http://www.icnet.ne.jp/~nsystem/simd_tobira/dpps.html
46- float result = 0;
47- for (int i=0; i<3; i++) {
48- result += (*this)[i] * rhs[i];
49- }
46+ float result = 0;
47+ for (int i=0; i<3; i++) {
48+ result += (*this)[i] * rhs[i];
49+ }
5050 return result;
5151 }
5252
@@ -95,28 +95,28 @@ struct Color4f
9595 return result;
9696 }
9797
98- float& operator[] (int idx) {
99- return v.m128_f32[3-idx];
98+ float& operator[] (int idx) {
99+ return v.m128_f32[3-idx];
100100 }
101- const float& operator[] (int idx) const {
102- return v.m128_f32[3-idx];
101+ const float& operator[] (int idx) const {
102+ return v.m128_f32[3-idx];
103103 }
104104
105- float norm_squared() {
106- float result = 0;
107- for (int i=0; i<3; i++) {
108- result += (*this)[i] * (*this)[i];
109- }
110- return result;
111- }
105+ float norm_squared() {
106+ float result = 0;
107+ for (int i=0; i<3; i++) {
108+ result += (*this)[i] * (*this)[i];
109+ }
110+ return result;
111+ }
112112
113113 void zero() {
114114 v = _mm_setzero_ps();
115115 }
116116 };
117117
118-inline Color4f operator * (float scalar, const Color4f& c) {
119- Color4f tmp = c;
120- return tmp * scalar;
121-}
118+inline Color4f operator * (float scalar, const Color4f& c) {
119+ Color4f tmp = c;
120+ return tmp * scalar;
121+}
122122
--- a/quantize.h
+++ b/quantize.h
@@ -1,8 +1,8 @@
11 #pragma once
22
33 #include "Array.h"
4-#include "Color4f.h"
5-#include "Color4d.h"
4+#include "Color4d_sse.h"
5+//#include "Color4d_avx.h"
66
77 typedef Color4d Color;
88 typedef Array2D<Color> Image;
--- a/vs2008/color_quantizer.vcproj
+++ b/vs2008/color_quantizer.vcproj
@@ -339,7 +339,7 @@
339339 UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
340340 >
341341 <File
342- RelativePath="..\Color4f.cpp"
342+ RelativePath="..\Color4f_sse.cpp"
343343 >
344344 </File>
345345 <File
@@ -385,11 +385,11 @@
385385 >
386386 </File>
387387 <File
388- RelativePath="..\Color4d.h"
388+ RelativePath="..\Color4d_sse.h"
389389 >
390390 </File>
391391 <File
392- RelativePath="..\Color4f.h"
392+ RelativePath="..\Color4f_sse.h"
393393 >
394394 </File>
395395 <File
--- a/vs2010/color_quantizer.vcxproj
+++ b/vs2010/color_quantizer.vcxproj
@@ -50,11 +50,12 @@
5050 </PrecompiledHeader>
5151 <WarningLevel>Level3</WarningLevel>
5252 <Optimization>Disabled</Optimization>
53- <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
53+ <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
5454 <AdditionalIncludeDirectories>../</AdditionalIncludeDirectories>
5555 <BrowseInformation>true</BrowseInformation>
5656 <MultiProcessorCompilation>true</MultiProcessorCompilation>
5757 <ForcedIncludeFiles>common.h</ForcedIncludeFiles>
58+ <AdditionalOptions>/arch:AVX %(AdditionalOptions)</AdditionalOptions>
5859 </ClCompile>
5960 <Link>
6061 <SubSystem>Console</SubSystem>
@@ -75,6 +76,9 @@
7576 <FloatingPointModel>Fast</FloatingPointModel>
7677 <MultiProcessorCompilation>true</MultiProcessorCompilation>
7778 <ForcedIncludeFiles>common.h</ForcedIncludeFiles>
79+ <EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
80+ <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
81+ <AdditionalOptions>/arch:__AVX %(AdditionalOptions)</AdditionalOptions>
7882 </ClCompile>
7983 <Link>
8084 <SubSystem>Console</SubSystem>
@@ -93,7 +97,8 @@
9397 </ItemGroup>
9498 <ItemGroup>
9599 <ClInclude Include="..\Array.h" />
96- <ClInclude Include="..\Color4d.h" />
100+ <ClInclude Include="..\Color4d_avx.h" />
101+ <ClInclude Include="..\Color4d_sse.h" />
97102 <ClInclude Include="..\common.h" />
98103 <ClInclude Include="..\dxor.h" />
99104 <ClInclude Include="..\quantize.h" />