sumom****@users*****
sumom****@users*****
2008年 6月 17日 (火) 10:20:51 JST
Index: julius4/libsent/src/wav2mfcc/mfcc-core.c diff -u julius4/libsent/src/wav2mfcc/mfcc-core.c:1.3 julius4/libsent/src/wav2mfcc/mfcc-core.c:1.4 --- julius4/libsent/src/wav2mfcc/mfcc-core.c:1.3 Fri May 9 14:58:17 2008 +++ julius4/libsent/src/wav2mfcc/mfcc-core.c Tue Jun 17 10:20:51 2008 @@ -17,7 +17,7 @@ * @author Akinobu Lee * @date Mon Aug 7 11:55:45 2006 * - * $Revision: 1.3 $ + * $Revision: 1.4 $ * */ /* @@ -157,6 +157,60 @@ return(1127 * log(1 + (k-1) * fres)); } +/** + * Create fbank center frequency for VTLN. + * + * @param cf [i/o] center frequency of channels in Mel, will be changed considering VTLN + * @param para [in] analysis parameter + * @param mlo [in] fbank lower bound in Mel + * @param mhi [in] fbank upper bound in Mel + * @param maxChan [in] maximum number of channels + * + */ +static boolean +VTLN_recreate_fbank_cf(float *cf, Value *para, float mlo, float mhi, int maxChan) +{ + int chan; + float minf, maxf, cf_orig, cf_new; + float scale, cu, cl, au, al; + + /* restore frequency range to non-Mel */ + minf = 700.0 * (exp(mlo / 1127.0) - 1.0); + maxf = 700.0 * (exp(mhi / 1127.0) - 1.0); + + if (para->vtln_upper > maxf) { + jlog("Error: VTLN upper cut-off greater than upper frequency bound: %.1f > %.1f\n", para->vtln_upper, maxf); + return FALSE; + } + if (para->vtln_lower < minf) { + jlog("Error: VTLN lower cut-off smaller than lower frequency bound: %.1f < %.1f\n", para->vtln_lower, minf); + return FALSE; + } + + /* prepare variables for warping */ + scale = 1.0 / para->vtln_alpha; + cu = para->vtln_upper * 2 / ( 1 + scale); + cl = para->vtln_lower * 2 / ( 1 + scale); + au = (maxf - cu * scale) / (maxf - cu); + al = (cl * scale - minf) / (cl - minf); + + for (chan = 1; chan <= maxChan; chan++) { + /* get center frequency, restore to non-Mel */ + cf_orig = 700.0 * (exp(cf[chan] / 1127.0) - 1.0); + /* do warping */ + if( cf_orig > cu ){ + cf_new = au * (cf_orig - cu) + scale * cu; + } else if ( cf_orig < cl){ + cf_new = al * (cf_orig - minf) + minf; + } else { + cf_new = scale * cf_orig; + } + /* convert the new center frequency to Mel and store */ + cf[chan] = 1127.0 * log (1.0 + cf_new / 700.0); + } + return TRUE; +} + /** * Build filterbank information and generate tables for MFCC comptutation. * @@ -165,7 +219,7 @@ * * @return the generated filterbank information. */ -void +boolean InitFBank(MFCCWork *w, Value *para) { float mlo, mhi, ms, melk; @@ -202,6 +256,13 @@ for (chan = 1; chan <= maxChan; chan++) w->fb.cf[chan] = ((float)chan / maxChan)*ms + mlo; + if (para->vtln_alpha != 1.0) { + /* Modify fbank center frequencies for VTLN */ + if (VTLN_recreate_fbank_cf(w->fb.cf, para, mlo, mhi, maxChan) == FALSE) { + return FALSE; + } + } + /* Create loChan map, loChan[fftindex] -> lower channel index */ w->fb.loChan = (short *)mymalloc((nv2 + 1) * sizeof(short)); for(k = 1, chan = 1; k <= nv2; k++){ @@ -232,6 +293,7 @@ w->sqrt2var = sqrt(2.0 / para->fbank_num); + return TRUE; } /** @@ -558,7 +620,7 @@ memset(w, 0, sizeof(MFCCWork)); /* set filterbank information */ - InitFBank(w, para); + if (InitFBank(w, para) == FALSE) return NULL; #ifdef MFCC_SINCOS_TABLE /* prepare tables */ Index: julius4/libsent/src/wav2mfcc/para.c diff -u julius4/libsent/src/wav2mfcc/para.c:1.5 julius4/libsent/src/wav2mfcc/para.c:1.6 --- julius4/libsent/src/wav2mfcc/para.c:1.5 Fri Jun 6 12:20:14 2008 +++ julius4/libsent/src/wav2mfcc/para.c Tue Jun 17 10:20:51 2008 @@ -16,7 +16,7 @@ * @author Akinobu Lee * @date Fri Oct 27 14:55:00 2006 * - * $Revision: 1.5 $ + * $Revision: 1.6 $ * */ /* @@ -54,10 +54,14 @@ para->hipass = -2; /* undef */ para->lopass = -2; /* undef */ para->cmn = -1; + para->cvn = -1; para->raw_e = -1; para->c0 = -1; //para->ss_alpha = -1; //para->ss_floor = -1; + para->vtln_alpha = -1; + para->vtln_upper = -1; + para->vtln_lower = -1; para->zmeanframe = -1; para->usepower = -1; para->delta = -1; @@ -93,10 +97,12 @@ para->enormal = FALSE; para->escale = DEF_ESCALE; para->silFloor = DEF_SILFLOOR; + para->cvn = 0; para->hipass = -1; /* disabled */ para->lopass = -1; /* disabled */ //para->ss_alpha = DEF_SSALPHA; //para->ss_floor = DEF_SSFLOOR; + para->vtln_alpha = 1.0; /* disabled */ para->zmeanframe = FALSE; para->usepower = FALSE; } @@ -123,6 +129,7 @@ para->silFloor = 50.0; para->hipass = -1; /* disabled */ para->lopass = -1; /* disabled */ + para->vtln_alpha = 1.0; /* disabled */ para->zmeanframe = FALSE; para->usepower = FALSE; } @@ -153,10 +160,14 @@ if (dst->hipass == -2) dst->hipass = src->hipass; if (dst->lopass == -2) dst->lopass = src->lopass; if (dst->cmn == -1) dst->cmn = src->cmn; + if (dst->cvn == -1) dst->cvn = src->cvn; if (dst->raw_e == -1) dst->raw_e = src->raw_e; if (dst->c0 == -1) dst->c0 = src->c0; //if (dst->ss_alpha == -1) dst->ss_alpha = src->ss_alpha; //if (dst->ss_floor == -1) dst->ss_floor = src->ss_floor; + if (dst->vtln_alpha == -1) dst->vtln_alpha = src->vtln_alpha; + if (dst->vtln_upper == -1) dst->vtln_upper = src->vtln_upper; + if (dst->vtln_lower == -1) dst->vtln_lower = src->vtln_lower; if (dst->zmeanframe == -1) dst->zmeanframe = src->zmeanframe; if (dst->usepower == -1) dst->usepower = src->usepower; if (dst->delta == -1) dst->delta = src->delta; @@ -255,6 +266,12 @@ para->escale = atof(a); } else if (strmatch(d, "SILFLOOR")) { /* -silfloor */ para->silFloor = atof(a); + } else if (strmatch(d, "WARPFREQ")) { /* -vtln (1) */ + para->vtln_alpha = atof(a); + } else if (strmatch(d, "WARPLCUTOFF")) { /* -vtln (2) */ + para->vtln_lower = atof(a); + } else if (strmatch(d, "WARPUCUTOFF")) { /* -vtln (3) */ + para->vtln_upper = atof(a); } else if (strmatch(d, "TARGETKIND")) { jlog("Warning: para: TARGETKIND skipped (will be determined by AM header)\n"); skipped = TRUE; @@ -384,4 +401,11 @@ fprintf(fp, "\t use power = "); if (para->usepower) fprintf(fp, "ON\n"); else fprintf(fp, "OFF\n"); + fprintf(fp, "\t CVN = "); + if (para->cvn) fprintf(fp, "ON\n"); + else fprintf(fp, "OFF\n"); + fprintf(fp, "\t VTLN = "); + if(para->vtln_alpha != 1.0) { + fprintf(fp, "ON, alpha=%.3f, f_low=%.1f, f_high=%.1f\n", para->vtln_alpha, para->vtln_lower, para->vtln_upper); + } else fprintf(fp, "OFF\n"); } Index: julius4/libsent/src/wav2mfcc/wav2mfcc-buffer.c diff -u julius4/libsent/src/wav2mfcc/wav2mfcc-buffer.c:1.3 julius4/libsent/src/wav2mfcc/wav2mfcc-buffer.c:1.4 --- julius4/libsent/src/wav2mfcc/wav2mfcc-buffer.c:1.3 Fri Jun 6 11:56:10 2008 +++ julius4/libsent/src/wav2mfcc/wav2mfcc-buffer.c Tue Jun 17 10:20:51 2008 @@ -24,7 +24,7 @@ * @author Akinobu LEE * @date Thu Feb 17 17:43:35 2005 * - * $Revision: 1.3 $ + * $Revision: 1.4 $ * */ @@ -93,8 +93,9 @@ /* Acceleration */ if (para->acc) Accel(mfcc, frame_num, para); - /* Cepstrum Mean Normalization */ - if(para->cmn) CMN(mfcc, frame_num, para->mfcc_dim + (para->c0 ? 1 : 0)); + /* Cepstrum Mean and/or Variance Normalization */ + if (para->cmn && ! para->cvn) CMN(mfcc, frame_num, para->mfcc_dim + (para->c0 ? 1 : 0)); + else if (para->cmn || para->cvn) MVN(mfcc, frame_num, para); return(frame_num); } @@ -236,3 +237,55 @@ free(sum); free(mfcc_ave); } + +/** + * Cepstrum Mean/Variance Normalization (buffered) + * + * @param mfcc [i/o] array of MFCC vectors + * @param frame_num [in] number of frames + * @param para [in] configuration parameters + */ +void MVN(float **mfcc, int frame_num, Value *para) +{ + int i, t; + float *mfcc_mean, *mfcc_sd; + float x; + int basedim; + + basedim = para->mfcc_dim + (para->c0 ? 1 : 0); + + mfcc_mean = (float *)mycalloc(para->veclen, sizeof(float)); + if (para->cvn) mfcc_sd = (float *)mycalloc(para->veclen, sizeof(float)); + + /* get mean */ + for(i = 0; i < para->veclen; i++){ + mfcc_mean[i] = 0.0; + for(t = 0; t < frame_num; t++) + mfcc_mean[i] += mfcc[t][i]; + mfcc_mean[i] /= (float)frame_num; + } + if (para->cvn) { + /* get standard deviation */ + for(i = 0; i < para->veclen; i++){ + mfcc_sd[i] = 0.0; + for(t = 0; t < frame_num; t++) { + x = mfcc[t][i] - mfcc_mean[i]; + mfcc_sd[i] += x * x; + } + mfcc_sd[i] = sqrt(mfcc_sd[i] / (float)frame_num); + } + } + for(t = 0; t < frame_num; t++){ + if (para->cmn) { + /* mean normalization (base MFCC only) */ + for(i = 0; i < basedim; i++) mfcc[t][i] -= mfcc_mean[i]; + } + if (para->cvn) { + /* variance normalization (full MFCC) */ + for(i = 0; i < para->veclen; i++) mfcc[t][i] /= mfcc_sd[i]; + } + } + + if (para->cvn) free(mfcc_sd); + free(mfcc_mean); +} Index: julius4/libsent/src/wav2mfcc/wav2mfcc-pipe.c diff -u julius4/libsent/src/wav2mfcc/wav2mfcc-pipe.c:1.2 julius4/libsent/src/wav2mfcc/wav2mfcc-pipe.c:1.3 --- julius4/libsent/src/wav2mfcc/wav2mfcc-pipe.c:1.2 Tue Dec 18 17:45:54 2007 +++ julius4/libsent/src/wav2mfcc/wav2mfcc-pipe.c Tue Jun 17 10:20:51 2008 @@ -20,7 +20,7 @@ * @author Akinobu LEE * @date Thu Feb 17 18:12:30 2005 * - * $Revision: 1.2 $ + * $Revision: 1.3 $ * */ /* @@ -45,6 +45,7 @@ #include <sent/stddefs.h> #include <sent/mfcc.h> +#include <sent/htk_param.h> /***********************************************************************/ /** @@ -238,12 +239,12 @@ /** * Initialize MAP-CMN at startup. * - * @param dimension [in] vector dimension + * @param para [in] MFCC computation configuration parameter * @param weight [in] initial cepstral mean weight * */ CMNWork * -CMN_realtime_new(int dimension, float weight) +CMN_realtime_new(Value *para, float weight) { int i; @@ -251,19 +252,24 @@ c = (CMNWork *)mymalloc(sizeof(CMNWork)); - c->dim = dimension; c->cweight = weight; - + c->mfcc_dim = para->mfcc_dim + (para->c0 ? 1 : 0); + c->veclen = para->veclen; + c->mean = para->cmn ? TRUE : FALSE; + c->var = para->cvn ? TRUE : FALSE; c->clist_max = CPSTEP; c->clist_num = 0; c->clist = (CMEAN *)mymalloc(sizeof(CMEAN) * c->clist_max); for(i=0;i<c->clist_max;i++) { - c->clist[i].mfcc_sum = (float *)mymalloc(sizeof(float)*c->dim); + c->clist[i].mfcc_sum = (float *)mymalloc(sizeof(float)*c->veclen); + if (c->var) c->clist[i].mfcc_var = (float *)mymalloc(sizeof(float)*c->veclen); c->clist[i].framenum = 0; } - c->now.mfcc_sum = (float *)mymalloc(sizeof(float) * c->dim); + c->now.mfcc_sum = (float *)mymalloc(sizeof(float) * c->veclen); + if (c->var) c->now.mfcc_var = (float *)mymalloc(sizeof(float) * c->veclen); - c->cmean_init = (float *)mymalloc(sizeof(float) * c->dim); + c->cmean_init = (float *)mymalloc(sizeof(float) * c->veclen); + if (c->var) c->cvar_init = (float *)mymalloc(sizeof(float) * c->veclen); c->cmean_init_set = FALSE; return c; @@ -282,7 +288,12 @@ free(c->cmean_init); free(c->now.mfcc_sum); + if (c->var) { + free(c->cvar_init); + free(c->now.mfcc_var); + } for(i=0;i<c->clist_max;i++) { + if (c->var) free(c->clist[i].mfcc_var); free(c->clist[i].mfcc_sum); } free(c->clist); @@ -299,7 +310,10 @@ { int d; - for(d=0;d<c->dim;d++) c->now.mfcc_sum[d] = 0.0; + for(d=0;d<c->veclen;d++) c->now.mfcc_sum[d] = 0.0; + if (c->var) { + for(d=0;d<c->veclen;d++) c->now.mfcc_var[d] = 0.0; + } c->now.framenum = 0; } @@ -318,18 +332,50 @@ c->now.framenum++; if (c->cmean_init_set) { - for(d=0;d<c->dim;d++) { - /* accumulate value of given MFCC to sum */ + /* initial data exists */ + for(d=0;d<c->veclen;d++) { + /* accumulate current MFCC to sum */ c->now.mfcc_sum[d] += mfcc[d]; - /* calculate map-cmn and perform subtraction to the given vector */ + /* calculate map-mean */ x = c->now.mfcc_sum[d] + c->cweight * c->cmean_init[d]; y = (double)c->now.framenum + c->cweight; - mfcc[d] -= x / y; + x /= y; + if (c->var) { + /* calculate map-var */ + c->now.mfcc_var[d] += (mfcc[d] - x) * (mfcc[d] - x); + } + if (c->mean && d < c->mfcc_dim) { + /* mean normalization */ + mfcc[d] -= x; + } + if (c->var) { + /* variance normalization */ + x = c->now.mfcc_var[d] + c->cweight * c->cvar_init[d]; + y = (double)c->now.framenum + c->cweight; + mfcc[d] /= sqrt(x / y); + } } } else { - for(d=0;d<c->dim;d++) { + /* no initial data */ + for(d=0;d<c->veclen;d++) { + /* accumulate current MFCC to sum */ c->now.mfcc_sum[d] += mfcc[d]; - mfcc[d] -= c->now.mfcc_sum[d] / c->now.framenum; + /* calculate current mean */ + x = c->now.mfcc_sum[d] / c->now.framenum; + if (c->var) { + /* calculate current variance */ + c->now.mfcc_var[d] += (mfcc[d] - x) * (mfcc[d] - x); + } + if (c->mean && d < c->mfcc_dim) { + /* mean normalization */ + mfcc[d] -= x; + } +#if 0 /* not perform variance normalization on no initial data */ + if (c->var) { + /* variance normalization */ + mfcc[d] /= sqrt(c->now.mfcc_var[d] / c->now.framenum); + } +#endif } } } @@ -340,9 +386,9 @@ * @param c [i/o] CMN calculation work area */ void -CMN_realtime_update(CMNWork *c) +CMN_realtime_update(CMNWork *c, HTK_Param *param) { - float *tmp; + float *tmp, *tmp2; int i, d; int frames; @@ -350,15 +396,44 @@ /* this may occur by pausing just after startup */ if (c->now.framenum == 0) return; + /* re-calculate variance based on the final mean at the given param */ + if (c->var && param != NULL) { + float m, x; + if (param->samplenum != c->now.framenum) { + jlog("InternalError: CMN_realtime_update: param->samplenum != c->now.framenum\n"); + } else if (param->veclen != c->veclen) { + jlog("InternalError: CMN_realtime_update: param->veclen != c->veclen\n"); + } else { + for(d=0;d<c->veclen;d++) { + m = c->now.mfcc_sum[d] / (float) c->now.framenum; + x = 0; + for(i=0;i<param->samplenum;i++) { + x += (param->parvec[i][d] - m) * (param->parvec[i][d] - m); + } + c->now.mfcc_var[d] = x; + } + } + } + /* compute cepstral mean from now and previous sums up to CPMAX frames */ - for(d=0;d<c->dim;d++) c->cmean_init[d] = c->now.mfcc_sum[d]; + for(d=0;d<c->veclen;d++) c->cmean_init[d] = c->now.mfcc_sum[d]; + if (c->var) { + for(d=0;d<c->veclen;d++) c->cvar_init[d] = c->now.mfcc_var[d]; + } frames = c->now.framenum; for(i=0;i<c->clist_num;i++) { - for(d=0;d<c->dim;d++) c->cmean_init[d] += c->clist[i].mfcc_sum[d]; + for(d=0;d<c->veclen;d++) c->cmean_init[d] += c->clist[i].mfcc_sum[d]; + if (c->var) { + for(d=0;d<c->veclen;d++) c->cvar_init[d] += c->clist[i].mfcc_var[d]; + } frames += c->clist[i].framenum; if (frames >= CPMAX) break; } - for(d=0;d<c->dim;d++) c->cmean_init[d] /= (float) frames; + for(d=0;d<c->veclen;d++) c->cmean_init[d] /= (float) frames; + if (c->var) { + for(d=0;d<c->veclen;d++) c->cvar_init[d] /= (float) frames; + } + c->cmean_init_set = TRUE; /* expand clist if neccessary */ @@ -366,17 +441,21 @@ c->clist_max += CPSTEP; c->clist = (CMEAN *)myrealloc(c->clist, sizeof(CMEAN) * c->clist_max); for(i=c->clist_num;i<c->clist_max;i++) { - c->clist[i].mfcc_sum = (float *)mymalloc(sizeof(float)*c->dim); + c->clist[i].mfcc_sum = (float *)mymalloc(sizeof(float)*c->veclen); + if (c->var) c->clist[i].mfcc_var = (float *)mymalloc(sizeof(float)*c->veclen); c->clist[i].framenum = 0; } } /* shift clist */ tmp = c->clist[c->clist_max-1].mfcc_sum; + if (c->var) tmp2 = c->clist[c->clist_max-1].mfcc_var; memmove(&(c->clist[1]), &(c->clist[0]), sizeof(CMEAN) * (c->clist_max - 1)); c->clist[0].mfcc_sum = tmp; + if (c->var) c->clist[0].mfcc_var = tmp2; /* copy now to clist[0] */ - memcpy(c->clist[0].mfcc_sum, c->now.mfcc_sum, sizeof(float) * c->dim); + memcpy(c->clist[0].mfcc_sum, c->now.mfcc_sum, sizeof(float) * c->veclen); + if (c->var) memcpy(c->clist[0].mfcc_var, c->now.mfcc_var, sizeof(float) * c->veclen); c->clist[0].framenum = c->now.framenum; if (c->clist_num < c->clist_max) c->clist_num++; @@ -457,17 +536,26 @@ return(FALSE); } /* check length */ - if (veclen != c->dim) { - jlog("Error: wav2mfcc-pipe: vector dimension mismatch\n"); + if (veclen != c->veclen) { + jlog("Error: wav2mfcc-pipe: cepstral dimension mismatch\n"); + jlog("Error: wav2mfcc-pipe: process = %d, file = %d\n", c->veclen, veclen); fclose_readfile(fp); return(FALSE); } /* read body */ - if (myread(c->cmean_init, sizeof(float), c->dim, fp) == FALSE) { - jlog("Error: wav2mfcc-pipe: failed to read\n"); + if (myread(c->cmean_init, sizeof(float), c->veclen, fp) == FALSE) { + jlog("Error: wav2mfcc-pipe: failed to read mean for CMN\n"); fclose_readfile(fp); return(FALSE); } + if (c->var) { + if (myread(c->cvar_init, sizeof(float), c->veclen, fp) == FALSE) { + jlog("Error: wav2mfcc-pipe: failed to read variance for CVN\n"); + fclose_readfile(fp); + return(FALSE); + } + } + if (fclose_readfile(fp) == -1) { jlog("Error: wav2mfcc-pipe: failed to close\n"); return(FALSE); @@ -492,27 +580,35 @@ { int fd; - jlog("Stat: wav2mfcc-pipe: writing current CM to file \"%s\"\n", filename); + jlog("Stat: wav2mfcc-pipe: writing current cepstral data to file \"%s\"\n", filename); if ((fd = creat(filename, 0644)) == -1) { - jlog("Error: wav2mfcc-pipe: failed to open\n"); + jlog("Error: wav2mfcc-pipe: failed to open \"%s\" to write current cepstral data\n", filename); return(FALSE); } /* write header */ - if (mywrite(&(c->dim), sizeof(int), 1, fd) == FALSE) { - jlog("Error: wav2mfcc-pipe: failed to write header\n"); + if (mywrite(&(c->veclen), sizeof(int), 1, fd) == FALSE) { + jlog("Error: wav2mfcc-pipe: cannot write header to \"%s\" as current cepstral data\n", filename); close(fd); return(FALSE); } /* write body */ - if (mywrite(c->cmean_init, sizeof(float), c->dim, fd) == FALSE) { - jlog("Error: wav2mfcc-pipe: failed to write header\n"); + if (mywrite(c->cmean_init, sizeof(float), c->veclen, fd) == FALSE) { + jlog("Error: wav2mfcc-pipe: cannot write mean to \"%s\" as current cepstral data\n", filename); close(fd); return(FALSE); } + if (c->var) { + if (mywrite(c->cvar_init, sizeof(float), c->veclen, fd) == FALSE) { + jlog("Error: wav2mfcc-pipe: cannot write variance to \"%s\" as current cepstrum\n", filename); + close(fd); + return(FALSE); + } + } + close(fd); - jlog("Stat: wav2mfcc-pipe: wrote current CM\n"); + jlog("Stat: wav2mfcc-pipe: current cepstral data written to \"%s\"\n", filename); return(TRUE); }