OSDN > Developer >

berupon > Chamber > color_quantizer > Commit

berupon

color quantizer
Fork

(Original repository, No fork origin)

R/O
HTTP
SSH
HTTPS

Commit

Commit MetaInfo

Revision	4b1196c90fb583a3002d91b6997db06f780726fe (tree)
Time	2011-05-22 13:48:48
Author	beru <berupon@gmai...>
Committer	beru

Log Message

Used AVX intrinsics, but they don't improve processing speed. Actually, the AVX version is slower than the SSE version. Why is that?

Change Summary

modified: Array.h (diff)
delete: Color4d.h => Color4d_sse.h
add: Color4d_avx.h (diff)
delete: Color4f.cpp => Color4f_sse.cpp
delete: Color4f.h => Color4f_sse.h
modified: quantize.h (diff)
modified: vs2008/color_quantizer.vcproj (diff)
modified: vs2010/color_quantizer.vcxproj (diff)

Incremental Difference

--- a/Array.h

+++ b/Array.h

		@@ -12,7 +12,7 @@ struct Array2D
12	12
13	13	void allocate() {
14	14	// pBuff_ = new T[width_ * height_];
15		- pBuff_ = (T*) _aligned_malloc(sizeof(T) * width_ * height_, 16);
	15	+ pBuff_ = (T*) _aligned_malloc(sizeof(T) * width_ * height_, 32);
16	16	}
17	17
18	18	Array2D(const Array2D& arr)

		@@ -52,102 +52,102 @@ struct Array2D
52	52	}
53	53
54	54	__forceinline
55		- T* operator[] (int row) {
56		- return &pBuff_[row * width_];
57		- }
58		-
59		- __forceinline
60		- const T* operator[] (int row) const {
61		- return &pBuff_[row * width_];
62		- }
	55	+ T* operator[] (int row) {
	56	+ return &pBuff_[row * width_];
	57	+ }
	58	+
	59	+ __forceinline
	60	+ const T* operator[] (int row) const {
	61	+ return &pBuff_[row * width_];
	62	+ }
63	63
64		- Array2D<T>& operator *= (const T& scalar) {
65		-#if 1
66		- for (size_t i=0; i<width_*height_; ++i) {
67		- pBuff_[i] *= scalar;
68		- }
69		-#else
70		- for (int i=0; i<width_; i++) {
71		- for (int j=0; j<height_; j++) {
72		- (*this)[j][i] *= scalar;
73		- }
74		- }
75		-#endif
76		- return *this;
77		- }
78		-
79		- template <typename T2>
80		- Array2D<T> operator * (const T2& scalar) {
81		- Array2D<T> result(*this);
82		- result *= scalar;
83		- return result;
84		- }
85		-
86		- std::vector<T> operator * (const std::vector<T>& vec) {
87		- std::vector<T> result(height_);
88		- T sum;
89		- for (int row=0; row<height_; row++) {
90		- sum = 0;
91		- for (int col=0; col<width_; col++) {
92		- sum += (*this)[row][col] * vec[col];
93		- }
94		- result[row] = sum;
95		- }
96		- return result;
	64	+ Array2D<T>& operator *= (const T& scalar) {
	65	+#if 1
	66	+ for (size_t i=0; i<width_*height_; ++i) {
	67	+ pBuff_[i] *= scalar;
	68	+ }
	69	+#else
	70	+ for (int i=0; i<width_; i++) {
	71	+ for (int j=0; j<height_; j++) {
	72	+ (*this)[j][i] *= scalar;
	73	+ }
	74	+ }
	75	+#endif
	76	+ return *this;
97	77	}
98	78
99		- Array2D<T>& multiply_row_scalar(int row, double mult) {
100		- for (int i=0; i<width_; i++) {
101		- (*this)[row][i] *= mult;
102		- }
103		- return *this;
104		- }
105		-
106		- Array2D<T>& add_row_multiple(int from_row, int to_row, double mult) {
107		- for (int i=0; i<width_; ++i) {
108		- (*this)[to_row][i] += mult*(*this)[from_row][i];
109		- }
110		- return *this;
111		- }
112		-
113		- // We use simple Gaussian elimination - perf doesn't matter since
114		- // the matrices will be K x K, where K = number of palette entries.
115		- Array2D<T> matrix_inverse() {
116		- Array2D<T> result(width_, height_);
117		- Array2D<T>& a = *this;
118		-
119		- // Set result to identity matrix
120		- result *= 0;
121		- for (int i=0; i<width_; i++) {
122		- result[i][i] = 1;
123		- }
124		- // Reduce to echelon form, mirroring in result
125		- for (int i=0; i<width_; i++) {
126		- result.multiply_row_scalar(i, 1/a[i][i]);
127		- multiply_row_scalar(i, 1/a[i][i]);
128		- for (int j=i+1; j<height_; j++) {
129		- result.add_row_multiple(i, j, -a[j][i]);
130		- add_row_multiple(i, j, -a[j][i]);
131		- }
132		- }
133		- // Back substitute, mirroring in result
134		- for (int i=width_-1; i>=0; i--) {
135		- for (int j=i-1; j>=0; j--) {
136		- result.add_row_multiple(i, j, -a[j][i]);
137		- add_row_multiple(i, j, -a[j][i]);
138		- }
139		- }
140		- // result is now the inverse
141		- return result;
142		- }
	79	+ template <typename T2>
	80	+ Array2D<T> operator * (const T2& scalar) {
	81	+ Array2D<T> result(*this);
	82	+ result *= scalar;
	83	+ return result;
	84	+ }
	85	+
	86	+ std::vector<T> operator * (const std::vector<T>& vec) {
	87	+ std::vector<T> result(height_);
	88	+ T sum;
	89	+ for (int row=0; row<height_; row++) {
	90	+ sum = 0;
	91	+ for (int col=0; col<width_; col++) {
	92	+ sum += (*this)[row][col] * vec[col];
	93	+ }
	94	+ result[row] = sum;
	95	+ }
	96	+ return result;
	97	+ }
	98	+
	99	+ Array2D<T>& multiply_row_scalar(int row, double mult) {
	100	+ for (int i=0; i<width_; i++) {
	101	+ (*this)[row][i] *= mult;
	102	+ }
	103	+ return *this;
	104	+ }
	105	+
	106	+ Array2D<T>& add_row_multiple(int from_row, int to_row, double mult) {
	107	+ for (int i=0; i<width_; ++i) {
	108	+ (*this)[to_row][i] += mult*(*this)[from_row][i];
	109	+ }
	110	+ return *this;
	111	+ }
	112	+
	113	+ // We use simple Gaussian elimination - perf doesn't matter since
	114	+ // the matrices will be K x K, where K = number of palette entries.
	115	+ Array2D<T> matrix_inverse() {
	116	+ Array2D<T> result(width_, height_);
	117	+ Array2D<T>& a = *this;
	118	+
	119	+ // Set result to identity matrix
	120	+ result *= 0;
	121	+ for (int i=0; i<width_; i++) {
	122	+ result[i][i] = 1;
	123	+ }
	124	+ // Reduce to echelon form, mirroring in result
	125	+ for (int i=0; i<width_; i++) {
	126	+ result.multiply_row_scalar(i, 1/a[i][i]);
	127	+ multiply_row_scalar(i, 1/a[i][i]);
	128	+ for (int j=i+1; j<height_; j++) {
	129	+ result.add_row_multiple(i, j, -a[j][i]);
	130	+ add_row_multiple(i, j, -a[j][i]);
	131	+ }
	132	+ }
	133	+ // Back substitute, mirroring in result
	134	+ for (int i=width_-1; i>=0; i--) {
	135	+ for (int j=i-1; j>=0; j--) {
	136	+ result.add_row_multiple(i, j, -a[j][i]);
	137	+ add_row_multiple(i, j, -a[j][i]);
	138	+ }
	139	+ }
	140	+ // result is now the inverse
	141	+ return result;
	142	+ }
143	143
144	144	};
145	145
146		-template <typename T>
147		-Array2D<T> operator * (T scalar, const Array2D<T>& a) {
148		- Array2D<T> tmp = a;
149		- return tmp * scalar;
150		-}
	146	+template <typename T>
	147	+Array2D<T> operator * (T scalar, const Array2D<T>& a) {
	148	+ Array2D<T> tmp = a;
	149	+ return tmp * scalar;
	150	+}
151	151
152	152	template <typename T>
153	153	struct Array3D

		@@ -186,11 +186,11 @@ public:
186	186	}
187	187
188	188	/*
189		- Array2D<T> operator[] (int depth) {
190		- return Array2D<T>(width_, height_, &pBuff_[depth * width_ * height_]);
	189	+ Array2D<T> operator[] (int depth) {
	190	+ return Array2D<T>(width_, height_, &pBuff_[depth * width_ * height_]);
191	191	}
192		- Array2D<T> operator[] (int depth) const {
193		- return Array2D<T>(width_, height_, &pBuff_[depth * width_ * height_]);
	192	+ Array2D<T> operator[] (int depth) const {
	193	+ return Array2D<T>(width_, height_, &pBuff_[depth * width_ * height_]);
194	194	}
195	195	*/
196	196	__forceinline

--- /dev/null

+++ b/Color4d_avx.h

		@@ -0,0 +1,115 @@
	1	+#pragma once
	2	+
	3	+#include <immintrin.h>
	4	+
	5	+struct Color4d
	6	+{
	7	+ __m256d v;
	8	+
	9	+ Color4d() {
	10	+ ;
	11	+ }
	12	+
	13	+ Color4d(const Color4d& c) {
	14	+ *this = c;
	15	+ }
	16	+
	17	+ Color4d(double r, double g, double b, double a) {
	18	+ v = _mm256_setr_pd(r,g,b,a);
	19	+ }
	20	+
	21	+ Color4d& operator = (const Color4d& rhs) {
	22	+ v = rhs.v;
	23	+ return *this;
	24	+ }
	25	+
	26	+ Color4d direct_product(const Color4d& rhs) const {
	27	+ Color4d result;
	28	+ result.v = _mm256_mul_pd(v, rhs.v);
	29	+ return result;
	30	+ }
	31	+
	32	+ double dot_product(const Color4d& rhs) {
	33	+// http://www.icnet.ne.jp/~nsystem/simd_tobira/dpps.html
	34	+ __m256d s = _mm256_mul_pd(this->v, rhs.v);
	35	+ __m128d s1 = _mm256_extractf128_pd(s, 0);
	36	+ __m128d s2 = _mm256_extractf128_pd(s, 1);
	37	+ __m128d as = _mm_add_pd(s1, s2);
	38	+ as = _mm_hadd_pd(as, as);
	39	+ return as.m128d_f64[0];
	40	+ }
	41	+
	42	+ Color4d& operator += (const Color4d& rhs) {
	43	+ v = _mm256_add_pd(v, rhs.v);
	44	+ return *this;
	45	+ }
	46	+
	47	+ Color4d operator + (const Color4d& rhs) {
	48	+ return Color4d(*this) += rhs;
	49	+ }
	50	+
	51	+ Color4d& operator -= (const Color4d& rhs) {
	52	+ v = _mm256_sub_pd(v, rhs.v);
	53	+ return *this;
	54	+ }
	55	+
	56	+ Color4d operator - (const Color4d& rhs) {
	57	+ return Color4d(*this) -= rhs;
	58	+ }
	59	+
	60	+ Color4d& operator *= (const Color4d& rhs) {
	61	+ v = _mm256_mul_pd(v, rhs.v);
	62	+ return *this;
	63	+ }
	64	+
	65	+ Color4d operator * (const Color4d& rhs) {
	66	+ return Color4d(*this) *= rhs;
	67	+ }
	68	+
	69	+ Color4d& operator *= (double scalar) {
	70	+ __m256d s = _mm256_set1_pd(scalar);
	71	+ v = _mm256_mul_pd(v, s);
	72	+ return *this;
	73	+ }
	74	+
	75	+ Color4d operator * (double scalar) {
	76	+ return Color4d(*this) *= scalar;
	77	+ }
	78	+
	79	+ double& operator[] (int idx) {
	80	+ return ((double*)&v)[idx];
	81	+ }
	82	+ const double& operator[] (int idx) const {
	83	+ return ((double*)&v)[idx];
	84	+ }
	85	+
	86	+ double norm_squared() {
	87	+#if 1
	88	+ __m256d s = _mm256_mul_pd(v, v);
	89	+ __m128d s1 = _mm256_extractf128_pd(s, 0);
	90	+ __m128d s2 = _mm256_extractf128_pd(s, 1);
	91	+ __m128d as = _mm_add_pd(s1, s2);
	92	+ as = _mm_hadd_pd(as, as);
	93	+ return as.m128d_f64[0];
	94	+#else
	95	+ double result = 0;
	96	+ for (int i=0; i<3; i++) {
	97	+ result += (*this)[i] * (*this)[i];
	98	+ }
	99	+ return result;
	100	+#endif
	101	+ }
	102	+
	103	+ void zero() {
	104	+ v = _mm256_setzero_pd();
	105	+ }
	106	+};
	107	+
	108	+inline Color4d operator * (double scalar, const Color4d& c) {
	109	+ return Color4d(c) *= scalar;
	110	+}
	111	+
	112	+inline Color4d operator * (const Color4d& c, double scalar) {
	113	+ return Color4d(c) *= scalar;
	114	+}
	115	+

--- a/Color4d.h

+++ b/Color4d_sse.h

		@@ -1,5 +1,7 @@
1	1	#pragma once
2	2
	3	+#include <intrin.h>
	4	+
3	5	struct Color4d
4	6	{
5	7	__m128d v[2];

		@@ -24,16 +26,16 @@ struct Color4d
24	26	}
25	27
26	28	Color4d direct_product(const Color4d& rhs) const {
27		- Color4d result;
28		-#if 1
29		- result.v[0] = _mm_mul_pd(v[0], rhs.v[0]);
30		- result.v[1] = _mm_mul_pd(v[1], rhs.v[1]);
31		-#else
32		- for (int i=0; i<3; i++) {
33		- result[i] = (*this)[i] * rhs[i];
34		- }
35		-#endif
36		- return result;
	29	+ Color4d result;
	30	+#if 1
	31	+ result.v[0] = _mm_mul_pd(v[0], rhs.v[0]);
	32	+ result.v[1] = _mm_mul_pd(v[1], rhs.v[1]);
	33	+#else
	34	+ for (int i=0; i<3; i++) {
	35	+ result[i] = (*this)[i] * rhs[i];
	36	+ }
	37	+#endif
	38	+ return result;
37	39	}
38	40
39	41	double dot_product(const Color4d& rhs) {

		@@ -44,10 +46,10 @@ struct Color4d
44	46	v = _mm_hadd_pd(v, v);
45	47	return v.m128d_f64[0];
46	48	#else
47		- double result = 0;
48		- for (int i=0; i<3; i++) {
49		- result += (*this)[i] * rhs[i];
50		- }
	49	+ double result = 0;
	50	+ for (int i=0; i<3; i++) {
	51	+ result += (*this)[i] * rhs[i];
	52	+ }
51	53	return result;
52	54	#endif
53	55	}

		@@ -101,14 +103,14 @@ struct Color4d
101	103	return Color4d(*this) *= scalar;
102	104	}
103	105
104		- double& operator[] (int idx) {
105		- return ((double*)&v)[idx];
	106	+ double& operator[] (int idx) {
	107	+ return ((double*)&v)[idx];
106	108	}
107		- const double& operator[] (int idx) const {
108		- return ((double*)&v)[idx];
	109	+ const double& operator[] (int idx) const {
	110	+ return ((double*)&v)[idx];
109	111	}
110	112
111		- double norm_squared() {
	113	+ double norm_squared() {
112	114	#if 1
113	115	__m128d t = _mm_add_pd(
114	116	_mm_mul_pd(v[0], v[0]),

		@@ -117,13 +119,13 @@ struct Color4d
117	119	t = _mm_hadd_pd(t, t);
118	120	return t.m128d_f64[0];
119	121	#else
120		- double result = 0;
121		- for (int i=0; i<3; i++) {
122		- result += (*this)[i] * (*this)[i];
123		- }
124		- return result;
125		-#endif
126		- }
	122	+ double result = 0;
	123	+ for (int i=0; i<3; i++) {
	124	+ result += (*this)[i] * (*this)[i];
	125	+ }
	126	+ return result;
	127	+#endif
	128	+ }
127	129
128	130	void zero() {
129	131	v[0] = _mm_setzero_pd();

		@@ -131,11 +133,11 @@ struct Color4d
131	133	}
132	134	};
133	135
134		-inline Color4d operator * (double scalar, const Color4d& c) {
135		- return Color4d(c) *= scalar;
136		-}
	136	+inline Color4d operator * (double scalar, const Color4d& c) {
	137	+ return Color4d(c) *= scalar;
	138	+}
137	139
138		-inline Color4d operator * (const Color4d& c, double scalar) {
139		- return Color4d(c) *= scalar;
140		-}
	140	+inline Color4d operator * (const Color4d& c, double scalar) {
	141	+ return Color4d(c) *= scalar;
	142	+}
141	143

--- a/Color4f.cpp

+++ b/Color4f_sse.cpp

		@@ -1,7 +1,6 @@
1	1	#include "stdafx.h"
2		-#include "Color4f.h"
3		-
4		-#include "Color4d.h"
	2	+#include "Color4f_sse.h"
	3	+#include "Color4d_sse.h"
5	4
6	5	Color4f& Color4f::operator = (const Color4d& rhs)
7	6	{

--- a/Color4f.h

+++ b/Color4f_sse.h

		@@ -30,23 +30,23 @@ struct Color4f
30	30	Color4f& operator = (const Color4d& rhs);
31	31
32	32	Color4f direct_product(const Color4f& rhs) const {
33		- Color4f result;
34		-#if 1
35		- result.v = _mm_mul_ps(v, rhs.v);
36		-#else
37		- for (int i=0; i<3; i++) {
38		- result[i] = (*this)[i] * rhs[i];
39		- }
40		-#endif
41		- return result;
	33	+ Color4f result;
	34	+#if 1
	35	+ result.v = _mm_mul_ps(v, rhs.v);
	36	+#else
	37	+ for (int i=0; i<3; i++) {
	38	+ result[i] = (*this)[i] * rhs[i];
	39	+ }
	40	+#endif
	41	+ return result;
42	42	}
43	43
44	44	float dot_product(const Color4f& rhs) {
45	45	// http://www.icnet.ne.jp/~nsystem/simd_tobira/dpps.html
46		- float result = 0;
47		- for (int i=0; i<3; i++) {
48		- result += (*this)[i] * rhs[i];
49		- }
	46	+ float result = 0;
	47	+ for (int i=0; i<3; i++) {
	48	+ result += (*this)[i] * rhs[i];
	49	+ }
50	50	return result;
51	51	}
52	52

		@@ -95,28 +95,28 @@ struct Color4f
95	95	return result;
96	96	}
97	97
98		- float& operator[] (int idx) {
99		- return v.m128_f32[3-idx];
	98	+ float& operator[] (int idx) {
	99	+ return v.m128_f32[3-idx];
100	100	}
101		- const float& operator[] (int idx) const {
102		- return v.m128_f32[3-idx];
	101	+ const float& operator[] (int idx) const {
	102	+ return v.m128_f32[3-idx];
103	103	}
104	104
105		- float norm_squared() {
106		- float result = 0;
107		- for (int i=0; i<3; i++) {
108		- result += (*this)[i] * (*this)[i];
109		- }
110		- return result;
111		- }
	105	+ float norm_squared() {
	106	+ float result = 0;
	107	+ for (int i=0; i<3; i++) {
	108	+ result += (*this)[i] * (*this)[i];
	109	+ }
	110	+ return result;
	111	+ }
112	112
113	113	void zero() {
114	114	v = _mm_setzero_ps();
115	115	}
116	116	};
117	117
118		-inline Color4f operator * (float scalar, const Color4f& c) {
119		- Color4f tmp = c;
120		- return tmp * scalar;
121		-}
	118	+inline Color4f operator * (float scalar, const Color4f& c) {
	119	+ Color4f tmp = c;
	120	+ return tmp * scalar;
	121	+}
122	122

--- a/quantize.h

+++ b/quantize.h

		@@ -1,8 +1,8 @@
1	1	#pragma once
2	2
3	3	#include "Array.h"
4		-#include "Color4f.h"
5		-#include "Color4d.h"
	4	+#include "Color4d_sse.h"
	5	+//#include "Color4d_avx.h"
6	6
7	7	typedef Color4d Color;
8	8	typedef Array2D<Color> Image;

--- a/vs2008/color_quantizer.vcproj

+++ b/vs2008/color_quantizer.vcproj

		@@ -339,7 +339,7 @@
339	339	UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
340	340	>
341	341	<File
342		- RelativePath="..\Color4f.cpp"
	342	+ RelativePath="..\Color4f_sse.cpp"
343	343	>
344	344	</File>
345	345	<File

		@@ -385,11 +385,11 @@
385	385	>
386	386	</File>
387	387	<File
388		- RelativePath="..\Color4d.h"
	388	+ RelativePath="..\Color4d_sse.h"
389	389	>
390	390	</File>
391	391	<File
392		- RelativePath="..\Color4f.h"
	392	+ RelativePath="..\Color4f_sse.h"
393	393	>
394	394	</File>
395	395	<File

--- a/vs2010/color_quantizer.vcxproj

+++ b/vs2010/color_quantizer.vcxproj

		@@ -50,11 +50,12 @@
50	50	</PrecompiledHeader>
51	51	<WarningLevel>Level3</WarningLevel>
52	52	<Optimization>Disabled</Optimization>
53		- <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
	53	+ <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
54	54	<AdditionalIncludeDirectories>../</AdditionalIncludeDirectories>
55	55	<BrowseInformation>true</BrowseInformation>
56	56	<MultiProcessorCompilation>true</MultiProcessorCompilation>
57	57	<ForcedIncludeFiles>common.h</ForcedIncludeFiles>
	58	+ <AdditionalOptions>/arch:AVX %(AdditionalOptions)</AdditionalOptions>
58	59	</ClCompile>
59	60	<Link>
60	61	<SubSystem>Console</SubSystem>

		@@ -75,6 +76,9 @@
75	76	<FloatingPointModel>Fast</FloatingPointModel>
76	77	<MultiProcessorCompilation>true</MultiProcessorCompilation>
77	78	<ForcedIncludeFiles>common.h</ForcedIncludeFiles>
	79	+ <EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
	80	+ <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
	81	+ <AdditionalOptions>/arch:__AVX %(AdditionalOptions)</AdditionalOptions>
78	82	</ClCompile>
79	83	<Link>
80	84	<SubSystem>Console</SubSystem>

		@@ -93,7 +97,8 @@
93	97	</ItemGroup>
94	98	<ItemGroup>
95	99	<ClInclude Include="..\Array.h" />
96		- <ClInclude Include="..\Color4d.h" />
	100	+ <ClInclude Include="..\Color4d_avx.h" />
	101	+ <ClInclude Include="..\Color4d_sse.h" />
97	102	<ClInclude Include="..\common.h" />
98	103	<ClInclude Include="..\dxor.h" />
99	104	<ClInclude Include="..\quantize.h" />

color quantizer Fork