[Julius-cvs 386] CVS update: julius4/libsent/src/ngram

Back to archive index

sumom****@users***** sumom****@users*****
2009年 2月 10日 (火) 17:15:48 JST


Index: julius4/libsent/src/ngram/init_ngram.c
diff -u julius4/libsent/src/ngram/init_ngram.c:1.5 julius4/libsent/src/ngram/init_ngram.c:1.6
--- julius4/libsent/src/ngram/init_ngram.c:1.5	Tue Feb 10 02:27:46 2009
+++ julius4/libsent/src/ngram/init_ngram.c	Tue Feb 10 17:15:48 2009
@@ -12,7 +12,7 @@
  * @author Akinobu LEE
  * @date   Wed Feb 16 07:40:53 2005
  *
- * $Revision: 1.5 $
+ * $Revision: 1.6 $
  * 
  */
 /*
@@ -145,7 +145,7 @@
   }
   if (ok_flag == FALSE) {
     jlog("Error: --- Failed to map %d words in dictionary to N-gram\n", count);
-    jlog("Error: --- Please fix the dict, or use open vocabulary N-gram that has either \"%s\" or \"%s\"\n", UNK_WORD_DEFAULT, UNK_WORD_DEFAULT2);
+    jlog("Error: --- Specify the word to which those words are mapped with \"-mapunk\" (default: \"<unk>\" or \"<UNK>\")\n");
     return FALSE;
   }
       
@@ -161,23 +161,31 @@
 /** 
  * @brief  Set unknown word ID to the N-gram data.
  *
- * Unknown word string should be UNK_WORD_DEFAULT or UNK_WORD_DEFAULT2,
- * whose default is "<unk>" and "<UNK>".  If any of these is not found
- * in vocabulary, treat the LM as closed vocabulary.
  * 
  * @param ndata [out] N-gram data to set unknown word ID.
+ * @param str [in] word name string of unknown word
  */
 void
-set_unknown_id(NGRAM_INFO *ndata)
+set_unknown_id(NGRAM_INFO *ndata, char *str)
 {
-  ndata->isopen = TRUE;
-  ndata->unk_id = ngram_lookup_word(ndata, UNK_WORD_DEFAULT);
+  ndata->unk_id = ngram_lookup_word(ndata, str);
   if (ndata->unk_id == WORD_INVALID) {
-    ndata->unk_id = ngram_lookup_word(ndata, UNK_WORD_DEFAULT2);
+    if (strmatch(str, UNK_WORD_DEFAULT)) {
+      /* if default "<unk>" is not found, also try "<UNK>" */
+      ndata->unk_id = ngram_lookup_word(ndata, UNK_WORD_DEFAULT2);
+      if (ndata->unk_id == WORD_INVALID) {
+	jlog("Stat: init_ngram: neither \"%s\" nor \"%s\" found, assuming close vocabulary LM\n", UNK_WORD_DEFAULT, UNK_WORD_DEFAULT2);
+	ndata->isopen = FALSE;
+	return;
+      }
+    }
   }
   if (ndata->unk_id == WORD_INVALID) {
-    jlog("Stat: \"%s\" or \"%s\" not found, assuming close vocabulary LM\n", UNK_WORD_DEFAULT, UNK_WORD_DEFAULT2);
+    jlog("Stat: init_ngram: \"%s\" not found, assuming close vocabulary LM\n", str);
     ndata->isopen = FALSE;
+  } else {
+    jlog("Stat: init_ngram: unknown words will be mapped to \"%s\"\n", str);
+    ndata->isopen = TRUE;
   }
 }
 
Index: julius4/libsent/src/ngram/ngram_read_arpa.c
diff -u julius4/libsent/src/ngram/ngram_read_arpa.c:1.14 julius4/libsent/src/ngram/ngram_read_arpa.c:1.15
--- julius4/libsent/src/ngram/ngram_read_arpa.c:1.14	Tue Feb 10 02:27:46 2009
+++ julius4/libsent/src/ngram/ngram_read_arpa.c	Tue Feb 10 17:15:48 2009
@@ -20,7 +20,7 @@
  * @author Akinobu LEE
  * @date   Wed Feb 16 16:52:24 2005
  *
- * $Revision: 1.14 $
+ * $Revision: 1.15 $
  * 
  */
 /*
@@ -30,7 +30,7 @@
  * All rights reserved
  */
 
-/* $Id: ngram_read_arpa.c,v 1.14 2009/02/09 17:27:46 sumomo Exp $ */
+/* $Id: ngram_read_arpa.c,v 1.15 2009/02/10 08:15:48 sumomo Exp $ */
 
 /* words should be alphabetically sorted */
 
@@ -555,7 +555,7 @@
     /* check if the numbers are the same with already read n-gram */
     for(i=0;i<2;i++) {
       if (ndata->d[i].totalnum != num[i]) {
-	jlog("Warning: ngram_read_arpa: %d-gram total num differ between forward N-gram and backward N-gram, may cause some error\n", n+1);
+	jlog("Warning: ngram_read_arpa: %d-gram total num differ between forward N-gram and backward N-gram, may cause some error\n", i+1);
       }
     }
     /* read additional 1-gram data */
@@ -653,9 +653,6 @@
       }
     }
     
-    /* set unknown (=OOV) word id */
-    set_unknown_id(ndata);
-
     /* swap <s> and </s> for backward SRILM N-gram */
     if (ndata->dir == DIR_RL) {
       WORD_ID bos, eos;
Index: julius4/libsent/src/ngram/ngram_read_bin.c
diff -u julius4/libsent/src/ngram/ngram_read_bin.c:1.5 julius4/libsent/src/ngram/ngram_read_bin.c:1.6
--- julius4/libsent/src/ngram/ngram_read_bin.c:1.5	Sat Jan 31 00:04:18 2009
+++ julius4/libsent/src/ngram/ngram_read_bin.c	Tue Feb 10 17:15:48 2009
@@ -48,7 +48,7 @@
  * @author Akinobu LEE
  * @date   Wed Feb 16 17:12:08 2005
  *
- * $Revision: 1.5 $
+ * $Revision: 1.6 $
  * 
  */
 /*
@@ -641,9 +641,6 @@
   jlog("Stat: ngram_read_bin: making entry name index\n");
   ngram_make_lookup_tree(ndata);
 
-  /* set unknown id */
-  set_unknown_id(ndata);
-  
   bi_prob_func_set(ndata);
 
   return TRUE;



Julius-cvs メーリングリストの案内
Back to archive index