• R/O
  • SSH
  • HTTPS

unf: Commit


Commit MetaInfo

Revision: 45 (tree)
Time: 2011-11-19 11:14:53
Author: phjgt

Log Message

DAWG版の一つ目が一段落

Change Summary

Incremental Difference

--- trunk/lisp/gen-dawg-table.lisp (nonexistent)
+++ trunk/lisp/gen-dawg-table.lisp (revision 45)
@@ -0,0 +1,147 @@
1+(defun read-attr-def (path &aux acc)
2+ (each-file-line (line path)
3+ (push (list (subseq line 3) (parse-integer line :end 2 :radix 16)) acc))
4+ (sort (nreverse acc) #'string< :key #'first))
5+
6+(defun read-map-def (path &aux acc)
7+ (each-file-line (line path)
8+ (let ((p (position #\Tab line)))
9+ (push (list (subseq line 0 p) (subseq line (1+ p))) acc)))
10+ (sort (nreverse acc) #'string< :key #'first))
11+
12+(defparameter *cac*
13+ (read-map-def "data/canonical-composition.def"))
14+
15+(defparameter *cad*
16+ (read-map-def "data/canonical-decomposition.def"))
17+
18+(defparameter *cod*
19+ (read-map-def "data/compatibility-decomposition.def"))
20+
21+(defparameter *ccc*
22+ (read-attr-def "data/canonical-combining-class.def"))
23+
24+(defparameter *nic*
25+ (read-attr-def "data/nfc-illegal-char.def"))
26+
27+(defparameter *nfic*
28+ (read-attr-def "data/nfkc-illegal-char.def"))
29+
30+(defun add-prefix (prefix)
31+ (lambda (s)
32+ (s prefix (car s))))
33+
34+(defparameter *keys*
35+ (flatten
36+ (list (mapcar (add-prefix "0") *cac*)
37+ (mapcar (add-prefix "1") *cad*)
38+ (mapcar (add-prefix "2") *cod*)
39+ (mapcar (add-prefix "3") *ccc*)
40+ (mapcar (add-prefix "4") *nic*)
41+ (mapcar (add-prefix "5") *nfic*))))
42+
43+(defparameter *strs*
44+ (cat
45+ (sort
46+ (flatten
47+ (list (mapcar #'second *cac*)
48+ (mapcar #'second *cad*)
49+ (mapcar #'second *cod*)))
50+ #'> :key #'length)))
51+
52+(defparameter *octets* (string-to-octets *strs*))
53+
54+(defun cat (strs)
55+ (reduce (lambda (acc s)
56+ (declare (simple-string s acc))
57+ (let ((p (search s acc)))
58+ (if (null p)
59+ (concatenate 'string s acc)
60+ acc)))
61+ strs
62+ :initial-value ""))
63+
64+(dawg:build :input *keys* :output "/tmp/key.idx")
65+
66+(with-open-file (out "/tmp/str.dat" :direction :output
67+ :if-exists :supersede
68+ :element-type 'octet)
69+ (write-sequence (string-to-octets *strs*) out)
70+ 'done)
71+
72+(with-open-file (out "/tmp/val.dat" :direction :output
73+ :if-exists :supersede
74+ :element-type '(unsigned-byte 32))
75+ (loop FOR as IN (list *cac* *cad* *cod*)
76+ DO
77+ (loop FOR (_ v) IN as
78+ FOR bv = (string-to-octets v)
79+ FOR p = (search (the simple-octets bv) (the simple-octets *octets*))
80+ DO
81+ (write-byte (dpb (length bv) (byte 8 24) p) out)))
82+
83+ (loop FOR (_ attr) IN *ccc*
84+ DO
85+ (write-byte attr out))
86+ 'done)
87+
88+(defun gen-source (path)
89+ (with-open-file (out path :direction :output :if-exists :supersede)
90+ (format out "#ifndef UNF_TABLE2_HH~%")
91+ (format out "#define UNF_TABLE2_HH~%")
92+ (format out "namespace UNF {~%")
93+ (format out "namespace TABLE {~%")
94+
95+ (with-open-file (in "/tmp/key.idx" :element-type '(unsigned-byte 64))
96+ (let ((base (ldb (byte 29 0) (progn #1=(read-byte in nil nil) #1#))))
97+ (format out "const unsigned CANONICAL_COM_ROOT = ~d;~%" (+ base (char-code #\0)))
98+ (format out "const unsigned CANONICAL_DECOM_ROOT = ~d;~%" (+ base (char-code #\1)))
99+ (format out "const unsigned COMPATIBILITY_DECOM_ROOT = ~d;~%" (+ base (char-code #\2)))
100+ (format out "const unsigned CANONICAL_CLASS_ROOT = ~d;~%" (+ base (char-code #\3)))
101+ (format out "const unsigned NFC_ILLEGAL_ROOT = ~d;~%" (+ base (char-code #\4)))
102+ (format out "const unsigned NFKC_ILLEGAL_ROOT = ~d;~%" (+ base (char-code #\5)))))
103+
104+ (with-open-file (in "/tmp/key.idx" :element-type '(unsigned-byte 64))
105+ (format out "~%const unsigned NODES[]={")
106+ (read-byte in nil nil)
107+ (loop FOR v = (read-byte in nil nil)
108+ WHILE v
109+ FOR i FROM 0
110+ DO
111+ (when (zerop (mod i 5))
112+ (terpri out))
113+ (format out "0x~8,'0x," (ldb (byte 32 32) v))
114+ (format out "0x~8,'0x" (ldb (byte 32 0) v))
115+ (when (listen in)
116+ (format out ",")))
117+ (format out "};~%"))
118+
119+ (with-open-file (in "/tmp/val.dat" :element-type '(unsigned-byte 32))
120+ (format out "~%const unsigned VALUES[]={")
121+ (loop FOR v = (read-byte in nil nil)
122+ WHILE v
123+ FOR i FROM 0
124+ DO
125+ (when (zerop (mod i 10))
126+ (terpri out))
127+ (format out "0x~8,'0x" v)
128+ (when (listen in)
129+ (format out ",")))
130+ (format out "};~%"))
131+
132+ (with-open-file (in "/tmp/str.dat" :element-type '(signed-byte 8))
133+ (format out "~%const char STRINGS[]={")
134+ (loop FOR c = (read-byte in nil nil)
135+ WHILE c
136+ FOR i FROM 0
137+ DO
138+ (when (zerop (mod i 20))
139+ (terpri out))
140+ (format out "~4d" c)
141+ (when (listen in)
142+ (format out ",")))
143+ (format out "};~%"))
144+
145+ (format out "}~%")
146+ (format out "}~%")
147+ (format out "#endif~%")))
\ No newline at end of file
--- trunk/src/unf/trie/node2.hh (revision 44)
+++ trunk/src/unf/trie/node2.hh (revision 45)
@@ -31,8 +31,6 @@
3131 bool check_encoded_children(T& in) const {
3232 switch(type()) {
3333 case 0:
34- // std::cout << "@ " << (int)in.peek() << std::endl;
35- // std::cout << "# " << (int)enc_chck(0) << ", " << (int)enc_chck(1) << std::endl;
3634 return ((enc_chck(0)==0 || (enc_chck(0) == in.read() && !in.eos())) &&
3735 (enc_chck(1)==0 || (enc_chck(1) == in.read() && !in.eos())));
3836 case 1:
--- trunk/src/unf/trie/searcher.hh (revision 44)
+++ trunk/src/unf/trie/searcher.hh (revision 45)
@@ -27,8 +27,22 @@
2727 if(node.check_char() != in.peek())
2828 return default_value;
2929 }
30- }
30+ }
31+
32+ bool member(const char* key) const {
33+ Node2 node = nodes2[root];
3134
35+ for(CharStream in(key);; in.read()) {
36+ if(node.is_terminal()) return true;
37+ if(in.eos()) return false;
38+ if(node.check_encoded_children(in)==false) return false;
39+
40+ node = nodes2[node.jump(in.peek())];
41+ if(node.check_char() != in.peek())
42+ return false;
43+ }
44+ }
45+
3246 protected:
3347 const Node2* nodes2;
3448 const unsigned root;
@@ -106,8 +120,7 @@
106120 NormalizationForm(const unsigned* nodes, unsigned root, const unsigned* vals, const char* strs)
107121 : Searcher(Node2::from_uint_array(nodes), root, vals, strs) {}
108122
109- // TODO: set的な検索は専用メソッドを設ける
110- bool quick_check(const char* key) const { return find_value(key,0xFFFFFFFF)==0xFFFFFFFF; }
123+ bool quick_check(const char* key) const { return !member(key); }
111124
112125 void decompose(RangeCharStream in, std::string& buffer) const {
113126 loop_head:
Show on old repository browser