|
HPC/並列プログラミングポータルでは、HPC(High Performance Computing)プログラミングや並列プログラミングに関する情報を集積・発信しています。 |
[記事一覧を見る]
このページでは『最適化・並列化コードを生み出す最新コンパイラ「インテル コンパイラー」』記事内で使用したソースコードをまとめています。
/*
* Matrix calculation test program
* matrix1.c
*/
#include stdio.h
#include stdlib.h
#include time.h
#include string.h
#define N 400
#define REPEAT 100
#define AT(matrix,i,j) (matrix[i*dim+j])
static unsigned long next_rand = 1;
void print_matrix( double* matrix, int dim );
double my_rand(void);
void prod_matrix( double* matrixA, double* matrixB, double* matrixC, int dim );
void cp_matrix( double* matrixA, double* matrixB, int dim );
void mult_matrix( double* matrixA, double* matrixB, double x, int dim );
int main( int argc, const char **argv) {
int i, j;
int rep;
int dim = N;
double* matrixA;
double* matrixB;
double* matrixC;
clock_t clock_start, clock_end;
clock_start = clock();
/* allocate matrix */
matrixA = (double*)malloc( sizeof(double) * dim * dim );
matrixB = (double*)malloc( sizeof(double) * dim * dim );
matrixC = (double*)malloc( sizeof(double) * dim * dim );
if( (matrixA == NULL) ||
(matrixB == NULL) ||
(matrixC == NULL) ) {
printf( "cannot allocate memory. exit.\n" );
free(matrixA);
free(matrixB);
free(matrixC);
return -1;
}
/* initialize matrix */
for( i = 0; i dim; i++ ) {
for( j = 0; j dim; j++ ) {
AT(matrixA,i,j) = my_rand();
AT(matrixB,i,j) = my_rand();
}
}
for( rep = 0; rep REPEAT; rep++ ) {
prod_matrix( matrixC, matrixA, matrixB, dim );
mult_matrix( matrixA, matrixC, 2.0/(double)dim, dim );
}
/* print_matrix(matrixC, dim);*/
free(matrixA);
free(matrixB);
free(matrixC);
clock_end = clock();
printf( "start: %d end: %d elapse: %d.\n", clock_start, clock_end, clock_end - clock_start );
printf( "elapse time: %lf sec.", ((double)(clock_end - clock_start)) / (double)CLOCKS_PER_SEC );
return 0;
}
/* A = B; copy matrix B to matrix A */
void cp_matrix( double* matrixA, double* matrixB,int dim ) {
memcpy( matrixA, matrixB, sizeof(double)*dim*dim );
}
/* A = B * C; matrixA = matrixB * matrixC */
void prod_matrix( double* matrixA, double* matrixB, double* matrixC, int dim ) {
int i, j, m;
memset( matrixA, 0, sizeof(double)*dim*dim );
for( i = 0; i dim; i++ ) {
for( m = 0; m dim; m++ ) {
for( j = 0; j dim; j++ ) {
AT(matrixA,i,j) += AT(matrixB,i,m) * AT(matrixC, m, j);
}
}
}
}
/* matrixA = matrixB * x */
void mult_matrix( double* matrixA, double* matrixB, double x, int dim ) {
int i, j;
for( i = 0; i dim; i++ ) {
for( j = 0; j dim; j++ ) {
AT(matrixA,i,j) = AT(matrixB,i,j) * x;
}
}
}
void print_matrix( double* matrix, int dim ) {
int i,j;
for( i = 0; i dim; i++ ) {
for( j = 0; j dim; j++ ) {
// output
printf( "%lf ", AT(matrix,i,j) );
}
printf( "\n" );
}
}
double my_rand(void) {
next_rand = next_rand * 1103515245 + 12345;
return (((double)((unsigned)(next_rand/65536) % 32768))/32768);
}
/* mkl1.c */
#include stdio.h
#include stdlib.h
#include memory.h
#include time.h
#include mkl_cblas.h
#define ITEST_PARAM_N 100
#define ITEST_PARAM_REPEAT 100
#define AT(matrix,i,j) (matrix[i*dim+j])
static unsigned long next_rand = 1;
void print_matrix( double* matrix, int dim );
double my_rand(void);
void prod_matrix_with_blas( double* matrixA, double* matrixB, double* matrixC, int dim );
void cp_matrix( double* matrixA, double* matrixB,int dim );
void mult_matrix( double* matrixA, double* matrixB, double x, int dim );
void mult_matrix_with_blas( double* matrixA, double x, int dim );
int main( int argc, const char **argv) {
int i, j;
int rep;
int dim = ITEST_PARAM_N;
double* matrixA;
double* matrixB;
double* matrixC;
clock_t clock_start, clock_end;
clock_start = clock();
/* allocate matrix */
matrixA = (double*)malloc( sizeof(double) * dim * dim );
matrixB = (double*)malloc( sizeof(double) * dim * dim );
matrixC = (double*)malloc( sizeof(double) * dim * dim );
if( (matrixA == NULL) ||
(matrixB == NULL) ||
(matrixC == NULL) ) {
printf( "cannot allocate memory. exit.\n");
free(matrixA);
free(matrixB);
free(matrixC);
return -1;
}
/* initialize matrix */
for( i = 0; i dim; i++ ) {
for( j = 0; j dim; j++ ) {
AT(matrixA,i,j) = my_rand();
AT(matrixB,i,j) = my_rand();
}
}
mult_matrix_with_blas( matrixB, 2.0/(double)dim, dim );
#ifdef TEST_SHOW_RESULT
print_matrix(matrixB, dim);
#endif
/* calculate C = A*(B/a)^REPEAT */
for( rep = 0; rep ITEST_PARAM_REPEAT; rep++ ) {
prod_matrix_with_blas( matrixC, matrixA, matrixB, dim );
}
#ifdef TEST_SHOW_RESULT
print_matrix(matrixC, dim);
#endif
free(matrixA);
free(matrixB);
free(matrixC);
clock_end = clock();
printf( "start: %d end: %d elapse: %d.\n", clock_start, clock_end, clock_end - clock_start );
printf( "elapse time: %lf sec.", ((double)(clock_end - clock_start)) / (double)CLOCKS_PER_SEC );
return 0;
}
/* A = B; copy matrix B to matrix A */
void cp_matrix( double* matrixA, double* matrixB,int dim ) {
memcpy( matrixA, matrixB, sizeof(double)*dim*dim );
}
/* A = B * C; matrixA = matrixB * matrixC */
/*
void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
const enum CBLAS_TRANSPOSE TransB,
const int M, const int N, const int K, const double alpha,
const double *A, const int lda,
const double *B, const int ldb,
const double beta, double *C, const int ldc);
A:(M,K)
B:(K,N)
C:(M,N)
C = ALPHA*A*B + BETA*C
LDA,LDB,LDC=DIM
*/
void prod_matrix_with_blas( double* matrixA, double* matrixB, double* matrixC, int dim ) {
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
dim, dim, dim, 1.0,
matrixB, dim,
matrixC, dim, 0.0, matrixA, dim);
}
/* matrixA = matrixB * x */
void mult_matrix( double* matrixA, double* matrixB, double x, int dim ) {
int i, j;
for( i = 0; i dim; i++ ) {
for( j = 0; j dim; j++ ) {
AT(matrixA,i,j) = AT(matrixB,i,j) * x;
}
}
}
/* matrixA = matrixA * x */
/*
SUBROUTINE DSCAL(N,DA,DX,INCX)
DX = DA * DX
DA:scalar
DX:(N)
INCX:1
*/
void mult_matrix_with_blas( double* matrixA, double x, int dim ) {
cblas_dscal( dim*dim, x, matrixA, 1 );
}
void print_matrix( double* matrix, int dim ) {
int i,j;
for( i = 0; i dim; i++ ) {
for( j = 0; j dim; j++ ) {
// output
printf( "%lf ", AT(matrix,i,j) );
}
printf( "\n" );
}
}
double my_rand(void) {
next_rand = next_rand * 1103515245 + 12345;
return (((double)((unsigned)(next_rand/65536) % 32768))/32768);
}
/* fft1.c */
#include stdio.h
#include string.h
#include stdlib.h
#include memory.h
#include math.h
#include time.h
#include "fftw3.h"
/*
fft1 input file output_file
*/
#define FMODE_WRITE "w"
#define NORM(x,y) (sqrt(x*x+y*y))
#define POWR(x,y) (x*x+y*y)
#define c_re(x) (x[0])
#define c_im(x) (x[1])
/* prototypes */
unsigned char* load_image( char* filename,
int* pWidth,
int* pHeight,
int* pDepth );
void write_image( char* filename,
unsigned char* buff,
int width,
int height,
int depth );
unsigned char* load_image( char* filename,
int* pWidth,
int* pHeight,
int* pDepth ) {
FILE* fp_in;
unsigned char* buff;
char szBuf[512];
int height = 0;
int width = 0;
int depth = 0;
fp_in = fopen( filename, "r" );
if( fp_in == NULL ) {
fprintf( stderr, "cannot open file: %s\n", filename );
return NULL;
}
/* read PGM Header */
fgets( szBuf, 512, fp_in );
if( strlen( szBuf ) 3 ) {
fprintf( stderr, "invalid image file: %s\n", filename );
fclose( fp_in );
return NULL;
}
if( strcmp( szBuf, "P5\n" ) != 0 ) {
fprintf( stderr, "invalid image file: %s\n", filename );
fclose( fp_in );
return NULL;
}
/* read #comment */
fgets( szBuf, 512, fp_in );
while( szBuf[0] == '#' ) {
while( (szBuf[strlen(szBuf)-1] != '\n') (!feof(fp_in)) ) {
fgets( szBuf, 512, fp_in );
}
fgets( szBuf, 512, fp_in );
}
if(feof(fp_in)) {
fprintf( stderr, "invalid image file: %s\n", filename );
fclose( fp_in );
return NULL;
}
/* read image size */
if (szBuf[strlen(szBuf)-1] != '\n') {
fprintf( stderr, "image file is strange: %s\n", filename );
fclose( fp_in );
return NULL;
}
sscanf( szBuf, "%d %d", width, height );
if( (width == 0) || (height == 0) ) {
fprintf( stderr, "image file is strange: %s\n", filename );
fclose( fp_in );
return NULL;
}
fgets( szBuf, 512, fp_in );
/* read depth */
if (szBuf[strlen(szBuf)-1] != '\n') {
fprintf( stderr, "image file is strange: %s\n", filename );
fclose( fp_in );
return NULL;
}
sscanf( szBuf, "%d", depth );
if( depth == 0 ) {
fprintf( stderr, "image file is strange: %s\n", filename );
fclose( fp_in );
return NULL;
}
printf( "image size: %d x %d, depth: %d\n", width, height, depth );
/* allocate buffer */
buff = (unsigned char*)malloc( width * height * sizeof(unsigned char) );
if( buff == NULL ) {
fclose( fp_in );
return NULL;
}
fread( buff, 1, width*height, fp_in );
fclose( fp_in );
*pWidth = width;
*pHeight = height;
*pDepth = depth;
return buff;
}
void write_image( char* filename,
unsigned char* buff,
int width,
int height,
int depth ) {
FILE* fp_out;
fp_out = fopen( filename, FMODE_WRITE );
if( fp_out == NULL ) {
fprintf( stderr, "cannot open file: %s\n", filename );
return;
}
/* write PGM Header */
fprintf( fp_out, "P5\n" );
/* write image size */
fprintf( fp_out, "%d %d\n", width, height );
/* write depth */
fprintf( fp_out, "%d\n", depth );
fwrite( buff, 1, width*height, fp_out );
if( ferror(fp_out) ) {
fprintf( stderr, "file output error: %s\n", filename );
return;
}
}
void FFTWExchange(int sizex,int sizey, fftw_complex *src, fftw_complex *dst) {
int i,j,u,v;
int imgxd2=sizex/2, imgyd2=sizey/2;
for(j=0;jsizey;j++){
for(i=0;isizex;i++){
u = (iimgxd2)? imgxd2+i: i-imgxd2;
v = (jimgyd2)? imgyd2+j: j-imgyd2;
c_re(dst[j*sizex+i]) = c_re(src[u+sizey*v]);
c_im(dst[j*sizex+i]) = c_im(src[u+sizey*v]);
}
}
}
unsigned char* execute_fft( unsigned char* input_image_buf,
int width,
int height,
int depth ) {
fftw_complex *in, *out;
fftw_plan p;
unsigned char* output_image_buf;
int n;
int i,j;
double dMax, dMin, dNorm;
double* pTmp;
dMax = 0.0;
dMin = 0.0;
pTmp = (double*)malloc( width*height*sizeof(double) );
if( pTmp == NULL ) {
return NULL;
}
/* allocate memory */
in = fftw_malloc(sizeof(fftw_complex) * width * height);
out = fftw_malloc(sizeof(fftw_complex) * width * height);
/* copy data to fftw buffer */
for( n = 0; n width*height; n++ ) {
in[n][0] = (float)input_image_buf[n];
in[n][1] = 0.0;
}
/* create plan */
p = fftw_plan_dft_2d(height, width, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
/* do FFT */
fftw_execute(p);
/* allocate buffer */
output_image_buf =
(unsigned char*)malloc( width * height * sizeof(unsigned char) );
if( output_image_buf != NULL ) {
/* copy data to fftw buffer */
for( n = 0; n width*height; n++ ) {
pTmp[n] = log(POWR(out[n][0],out[n][1]) + 1);
if( pTmp[n] dMin ) {
dMin = pTmp[n];
}
if( dMax pTmp[n] ) {
dMax = pTmp[n];
}
}
for( n = 0; n width*height; n++ ) {
double dWork;
dWork = (pTmp[n] * 255.0 / (dMax - dMin));
output_image_buf[n] = (unsigned char)(dWork);
}
}
/* free memories */
fftwnd_destroy_plan(p);
free( pTmp );
fftw_free(in);
fftw_free(out);
return output_image_buf;
}
unsigned char* execute_ifft( unsigned char* input_image_buf,
int width,
int height,
int depth ) {
fftw_complex *in, *out;
fftw_plan p;
unsigned char* output_image_buf;
int n;
/* allocate memory and create plan */
in = fftw_malloc(sizeof(fftw_complex) * width * height);
out = fftw_malloc(sizeof(fftw_complex) * width * height);
p = fftw_plan_dft_2d(height, width, in, out, FFTW_BACKWARD, FFTW_ESTIMATE);
/* allocate output buffer */
/* copy data to fftw buffer */
for( n = 0; n width*height; n++ ) {
in[n][0] = input_image_buf[n]; /* Re */
in[n][1] = 0; /* Im */
}
/* do FFT */
fftw_execute(p);
/* allocate buffer */
output_image_buf =
(unsigned char*)malloc( width * height * sizeof(unsigned char) );
if( output_image_buf != NULL ) {
/* copy data to fftw buffer */
for( n = 0; n width*height; n++ ) {
output_image_buf[n] = out[n][0];
}
}
/* free memories */
/*fftwnd_destroy_plan(p);*/
fftw_free(in);
fftw_free(out);
return output_image_buf;
}
int main( int argv, char** argc ) {
char* in_file_path;
char* out_file_path;
unsigned char* input_image_buf;
unsigned char* output_image_buf;
unsigned char* image_buf;
int width;
int height;
int depth;
clock_t clock_start, clock_end;
if( argv 3 ) {
printf( "%s input_file output_file\n", argc[0] );
return -1;
}
in_file_path = argc[1];
out_file_path = argc[2];
input_image_buf = load_image( in_file_path, width, height, depth );
if( input_image_buf == NULL ) {
fprintf( stderr, "cannot load image. exit.\n" );
return -1;
}
clock_start = clock();
output_image_buf =
execute_fft( input_image_buf, width, height, depth );
clock_end = clock();
if( output_image_buf != NULL ) {
write_image( out_file_path, output_image_buf, width, height, depth );
} else {
fprintf( stderr, "FFT operation faild.\n" );
}
free( input_image_buf );
free( output_image_buf );
printf( "start: %d end: %d elapse: %d.\n", clock_start, clock_end, clock_end - clock_start );
printf( "elapse time: %lf sec.", ((double)(clock_end - clock_start)) / (double)CLOCKS_PER_SEC );
/* free( image_buf ); */
return 0;
};
/* CommentLine: H.264 Sequence.*/ test1.yuv /* name of source files */ 1500 /* number of frames to encode */ 1 250 1 /* 1(reserved) , N (# of frames in between I frames), IDR interval. */ 0 1 /* Number of B frames between I (or P) and next P, treat B as a reference (only 0 is supported!) */ 1 1 0 /* num_ref_frames (2-16), minimum length of list1 for backward prediction (only 1 is supported!), number of slices. */ 77 0 /* profile_idc (77-main, 100-high); level_idc (set 0 for automatic selection) (check that num_ref_frames and frame size are in accordance with the level) */ 1280 /* horizontal_size */ 720 /* vertical_size */ 4 /* frame_rate_code [0,8] (0-30 fps,1-15 fps,2-24 fps,3-25 fps,4-30 fps,5-30 fps,6-50 fps,7-60 fps,8-60 fps)*/ 1 8 8 /* High profile: chroma_format_idc (0 - monochrom, 1 - 420, 2 - 422), bit_depth_luma [8,12], bit_depth_chroma [8,12] */ 0 8 0 0 0 /* High profile: aux_format_idc: [0,3], bit_depth_aux: [8,12], alpha_incr_flag: 0, 1; alpha_opaque_value: [0, 2^(bit_depth_aux + 9) -1]; alpha_transparent_value: [0, 2^(bit_depth_aux + 9) - 1] */ 1 20 20 20 2048000 /* RC method(0 - quant_codes, 1 - CBR MBwise, 2 - CBR framewise, 3 - Debug); start qp values for I, P, B slices; bitrate (bits per second) */ 2 0 8 8 /* ME method (1-6), subblock split, search x,search_y */ 0 0 0 /* weighted prediction, weighted biprediction implicit weighted biprediction (not supported!)*/ 0 1 /* direct type (0 - temporal 1 - spatial 2 - auto); direct_inference_flag */ 0 2 2 /* disable_deblocking_idc: 1-- off, 0 - on, 2 -- on(without crossing slice boundaries); deblocking_filter_alpha, deblocking_filter_beta */ 1 0 0 /* High profile: transform_8x8_mode: 0 -- off, 1 - on; 0 -- use standard, 1 -- use default scaling matrices for 8x8 quantization; qpprime_y_zero_transform_bypass_flag: (0, 1) */ 1280 /* display_horizontal_size */ 720 /* display_vertical_size */ 1 1 /* entropy coding mode (0-cavlc,1-cabac); cabac_init_idc (0,1,2) */ 0 /* picture coding type (0 - only FRM, 1 - only FLD , 2 - only AFRM, 3 - pure PicAFF(no MBAFF) 4 PicAFF + MBAFF). Only 0 (FRM) is supported! */ 0 0 /* speed/quality grade [0,3] (0-maximum speed, 3-maximum quality); OptimalQuantization (0, 1) */
[PageInfo]
LastUpdate: 2009-11-17 17:10:06, ModifiedBy: hiromichi-m
[Permissions]
view:all, edit:login users, delete/config:members