sumom****@users*****
sumom****@users*****
2009年 2月 10日 (火) 17:15:48 JST
Index: julius4/libsent/src/ngram/init_ngram.c diff -u julius4/libsent/src/ngram/init_ngram.c:1.5 julius4/libsent/src/ngram/init_ngram.c:1.6 --- julius4/libsent/src/ngram/init_ngram.c:1.5 Tue Feb 10 02:27:46 2009 +++ julius4/libsent/src/ngram/init_ngram.c Tue Feb 10 17:15:48 2009 @@ -12,7 +12,7 @@ * @author Akinobu LEE * @date Wed Feb 16 07:40:53 2005 * - * $Revision: 1.5 $ + * $Revision: 1.6 $ * */ /* @@ -145,7 +145,7 @@ } if (ok_flag == FALSE) { jlog("Error: --- Failed to map %d words in dictionary to N-gram\n", count); - jlog("Error: --- Please fix the dict, or use open vocabulary N-gram that has either \"%s\" or \"%s\"\n", UNK_WORD_DEFAULT, UNK_WORD_DEFAULT2); + jlog("Error: --- Specify the word to which those words are mapped with \"-mapunk\" (default: \"<unk>\" or \"<UNK>\"\n"); return FALSE; } @@ -161,23 +161,31 @@ /** * @brief Set unknown word ID to the N-gram data. * - * Unknown word string should be UNK_WORD_DEFAULT or UNK_WORD_DEFAULT2, - * whose default is "<unk>" and "<UNK>". If any of these is not found - * in vocabulary, treat the LM as closed vocabulary. * * @param ndata [out] N-gram data to set unknown word ID. + * @param str [in] word name string of unknown word */ void -set_unknown_id(NGRAM_INFO *ndata) +set_unknown_id(NGRAM_INFO *ndata, char *str) { - ndata->isopen = TRUE; - ndata->unk_id = ngram_lookup_word(ndata, UNK_WORD_DEFAULT); + ndata->unk_id = ngram_lookup_word(ndata, str); if (ndata->unk_id == WORD_INVALID) { - ndata->unk_id = ngram_lookup_word(ndata, UNK_WORD_DEFAULT2); + if (strmatch(str, UNK_WORD_DEFAULT)) { + /* if default "<unk>" is not found, also try "<UNK>" */ + ndata->unk_id = ngram_lookup_word(ndata, UNK_WORD_DEFAULT2); + if (ndata->unk_id == WORD_INVALID) { + jlog("Stat: init_ngram: either \"%s\" and \"%s\" not found, assuming close vocabulary LM\n", UNK_WORD_DEFAULT, UNK_WORD_DEFAULT2); + ndata->isopen = FALSE; + return; + } + } } if (ndata->unk_id == WORD_INVALID) { - jlog("Stat: \"%s\" or \"%s\" not found, assuming close vocabulary LM\n", UNK_WORD_DEFAULT, UNK_WORD_DEFAULT2); + jlog("Stat: init_ngram: \"%s\" not found, assuming close vocabulary LM\n", str); ndata->isopen = FALSE; + } else { + jlog("Stat: init_ngram: unknown words will be mapped to \"%s\"\n", str); + ndata->isopen = TRUE; } } Index: julius4/libsent/src/ngram/ngram_read_arpa.c diff -u julius4/libsent/src/ngram/ngram_read_arpa.c:1.14 julius4/libsent/src/ngram/ngram_read_arpa.c:1.15 --- julius4/libsent/src/ngram/ngram_read_arpa.c:1.14 Tue Feb 10 02:27:46 2009 +++ julius4/libsent/src/ngram/ngram_read_arpa.c Tue Feb 10 17:15:48 2009 @@ -20,7 +20,7 @@ * @author Akinobu LEE * @date Wed Feb 16 16:52:24 2005 * - * $Revision: 1.14 $ + * $Revision: 1.15 $ * */ /* @@ -30,7 +30,7 @@ * All rights reserved */ -/* $Id: ngram_read_arpa.c,v 1.14 2009/02/09 17:27:46 sumomo Exp $ */ +/* $Id: ngram_read_arpa.c,v 1.15 2009/02/10 08:15:48 sumomo Exp $ */ /* words should be alphabetically sorted */ @@ -555,7 +555,7 @@ /* check if the numbers are the same with already read n-gram */ for(i=0;i<2;i++) { if (ndata->d[i].totalnum != num[i]) { - jlog("Warning: ngram_read_arpa: %d-gram total num differ between forward N-gram and backward N-gram, may cause some error\n", n+1); + jlog("Warning: ngram_read_arpa: %d-gram total num differ between forward N-gram and backward N-gram, may cause some error\n", i+1); } } /* read additional 1-gram data */ @@ -653,9 +653,6 @@ } } - /* set unknown (=OOV) word id */ - set_unknown_id(ndata); - /* swap <s> and </s> for backward SRILM N-gram */ if (ndata->dir == DIR_RL) { WORD_ID bos, eos; Index: julius4/libsent/src/ngram/ngram_read_bin.c diff -u julius4/libsent/src/ngram/ngram_read_bin.c:1.5 julius4/libsent/src/ngram/ngram_read_bin.c:1.6 --- julius4/libsent/src/ngram/ngram_read_bin.c:1.5 Sat Jan 31 00:04:18 2009 +++ julius4/libsent/src/ngram/ngram_read_bin.c Tue Feb 10 17:15:48 2009 @@ -48,7 +48,7 @@ * @author Akinobu LEE * @date Wed Feb 16 17:12:08 2005 * - * $Revision: 1.5 $ + * $Revision: 1.6 $ * */ /* @@ -641,9 +641,6 @@ jlog("Stat: ngram_read_bin: making entry name index\n"); ngram_make_lookup_tree(ndata); - /* set unknown id */ - set_unknown_id(ndata); - bi_prob_func_set(ndata); return TRUE;