DAWG版の一つ目が一段落
@@ -0,0 +1,147 @@ | ||
;;; Read an attribute definition file.
;;;
;;; For every line, the first two characters are parsed as a hexadecimal
;;; integer (the attribute value) and everything from character index 3
;;; onward is taken as the key.  The character at index 2 is skipped --
;;; presumably a separator; confirm against the .def files.
;;;
;;; Returns a list of (KEY VALUE) lists sorted by KEY with STRING<.
(defun read-attr-def (path)
  "Return the (KEY VALUE) entries read from PATH, sorted by KEY."
  (let ((entries '()))
    (each-file-line (line path)
      (let ((key (subseq line 3))
            (value (parse-integer line :end 2 :radix 16)))
        (push (list key value) entries)))
    (sort (nreverse entries) #'string< :key #'first)))
5 | + | |
;;; Read a mapping definition file.
;;;
;;; Every line is split at its first Tab character: the text before the Tab
;;; is the key and the text after it is the mapped string.
;;; NOTE(review): a line without a Tab makes (1+ nil) signal an error --
;;; the input files are assumed to be well formed.
;;;
;;; Returns a list of (KEY VALUE) lists sorted by KEY with STRING<.
(defun read-map-def (path)
  "Return the (KEY VALUE) string pairs read from PATH, sorted by KEY."
  (let ((pairs '()))
    (each-file-line (line path)
      (let* ((tab (position #\Tab line))
             (key (subseq line 0 tab))
             (value (subseq line (1+ tab))))
        (push (list key value) pairs)))
    (sort (nreverse pairs) #'string< :key #'first)))
11 | + | |
;;; Source tables, loaded from the data/ directory when this file is
;;; evaluated.  The first three are mapping tables of (KEY STRING) pairs,
;;; the last three are attribute tables of (KEY INTEGER) pairs; each list
;;; is sorted by KEY.

(defparameter *cac* (read-map-def "data/canonical-composition.def")
  "Mapping table read from canonical-composition.def.")

(defparameter *cad* (read-map-def "data/canonical-decomposition.def")
  "Mapping table read from canonical-decomposition.def.")

(defparameter *cod* (read-map-def "data/compatibility-decomposition.def")
  "Mapping table read from compatibility-decomposition.def.")

(defparameter *ccc* (read-attr-def "data/canonical-combining-class.def")
  "Attribute table read from canonical-combining-class.def.")

(defparameter *nic* (read-attr-def "data/nfc-illegal-char.def")
  "Attribute table read from nfc-illegal-char.def.")

(defparameter *nfic* (read-attr-def "data/nfkc-illegal-char.def")
  "Attribute table read from nfkc-illegal-char.def.")
29 | + | |
;;; Build a one-argument function for use with MAPCAR over a table.
;;; The returned closure takes a table entry (a list whose first element is
;;; the key) and calls the helper function S -- defined elsewhere,
;;; presumably string concatenation -- on PREFIX and that key.
(defun add-prefix (prefix)
  "Return a closure that combines PREFIX with the CAR of its argument via S."
  (lambda (entry)
    (s prefix (car entry))))
33 | + | |
;;; All lookup keys in one flat list.  The keys of each table get a
;;; one-character tag ("0".."5") in front, in the order
;;; *cac*, *cad*, *cod*, *ccc*, *nic*, *nfic*.
(defparameter *keys*
  (flatten
   (loop for tag   in '("0" "1" "2" "3" "4" "5")
         for table in (list *cac* *cad* *cod* *ccc* *nic* *nfic*)
         collect (mapcar (add-prefix tag) table)))
  "Tagged keys of every source table, in table order.")
42 | + | |
;;; CAT must be defined before *STRS*: the DEFPARAMETER below calls it while
;;; the file is being loaded, and forms are evaluated top to bottom.

;;; Merge STRS into a single string that contains every element as a
;;; substring.  Elements are processed left to right; one that already
;;; occurs inside the accumulated string is dropped, any other is prepended.
(defun cat (strs)
  "Return one string in which every element of STRS occurs as a substring."
  (reduce (lambda (acc s)
            (declare (simple-string s acc))
            (if (search s acc)
                acc
                (concatenate 'string s acc)))
          strs
          :initial-value ""))

;;; Pool of all mapped strings from the three mapping tables.  Sorting by
;;; descending length feeds the longest strings to CAT first, so a shorter
;;; string that occurs inside an earlier one is not stored again.
(defparameter *strs*
  (cat
   (sort
    (flatten
     (list (mapcar #'second *cac*)
           (mapcar #'second *cad*)
           (mapcar #'second *cod*)))
    #'> :key #'length))
  "String pool containing every mapped string of *CAC*, *CAD* and *COD*.")

;;; Octet form of the pool; offsets into it are what val.dat records.
(defparameter *octets* (string-to-octets *strs*)
  "Result of STRING-TO-OCTETS applied to *STRS*.")
63 | + | |
;;; Build the key index from the tagged key list.
(dawg:build :input *keys*
            :output "/tmp/key.idx")

;;; Dump the string pool as raw octets.  The trailing 'DONE only gives the
;;; form a short return value.
(with-open-file (out "/tmp/str.dat"
                     :direction :output
                     :if-exists :supersede
                     :element-type 'octet)
  (let ((bytes (string-to-octets *strs*)))
    (write-sequence bytes out))
  'done)
71 | + | |
;;; Write the value table as 32-bit words.
;;;
;;; For each entry of the three mapping tables (in the order *cac*, *cad*,
;;; *cod*) one word is written: bits 24-31 hold the octet length of the
;;; mapped string, bits 0-23 its offset inside *OCTETS*.  After those, one
;;; word per *ccc* entry holds the attribute value itself.
(with-open-file (out "/tmp/val.dat" :direction :output
                                    :if-exists :supersede
                                    :element-type '(unsigned-byte 32))
  (dolist (table (list *cac* *cad* *cod*))
    (loop for (nil str) in table
          for bytes = (string-to-octets str)
          for offset = (search (the simple-octets bytes)
                               (the simple-octets *octets*))
          do
             ;; DPB would silently drop high bits of an oversized length and
             ;; overwrite high bits of an oversized offset, so fail loudly
             ;; instead of emitting a corrupt table.
             (assert offset ()
                     "String ~s was not found in the string pool" str)
             (assert (< (length bytes) (ash 1 8)) ()
                     "String ~s is ~d octets long; the length field holds 8 bits"
                     str (length bytes))
             (assert (< offset (ash 1 24)) ()
                     "Offset ~d of string ~s does not fit in 24 bits"
                     offset str)
             (write-byte (dpb (length bytes) (byte 8 24) offset) out)))

  (loop for (nil attr) in *ccc*
        do (write-byte attr out))
  'done)
87 | + | |
;;; Generate the C++ table header at PATH from the intermediate files
;;; /tmp/key.idx, /tmp/val.dat and /tmp/str.dat.
;;;
;;; The header contains, inside namespace UNF::TABLE:
;;;   - six *_ROOT constants, one per key tag "0".."5",
;;;   - NODES[]:   the 64-bit words of key.idx after the first one, each
;;;                split into its high and low 32-bit halves,
;;;   - VALUES[]:  the 32-bit words of val.dat,
;;;   - STRINGS[]: the bytes of str.dat read as signed 8-bit integers.
(defun gen-source (path)
  "Write the generated table header to PATH, superseding an existing file."
  (with-open-file (out path :direction :output :if-exists :supersede)
    (flet ((dump-array (declaration data-path element-type per-line emit
                        &key (skip 0))
             ;; Write `DECLARATION={ ... };` with one element per value read
             ;; from DATA-PATH.  EMIT prints a single value to OUT, PER-LINE
             ;; values go on each output line and the first SKIP values of
             ;; the file are discarded.
             (with-open-file (in data-path :element-type element-type)
               (format out "~%~a={" declaration)
               (loop repeat skip
                     do (read-byte in nil nil))
               ;; Iteration clauses come before WHILE, as the LOOP grammar
               ;; requires.
               (loop for i from 0
                     for v = (read-byte in nil nil)
                     while v
                     do (when (zerop (mod i per-line))
                          (terpri out))
                        (funcall emit v)
                        ;; A separator only when more input follows.
                        (when (listen in)
                          (format out ",")))
               (format out "};~%"))))

      (format out "#ifndef UNF_TABLE2_HH~%")
      (format out "#define UNF_TABLE2_HH~%")
      (format out "namespace UNF {~%")
      (format out "namespace TABLE {~%")

      ;; Root constants: the low 29 bits of the SECOND word of the index
      ;; plus the character code of each table's tag.  The first word is
      ;; read and discarded (presumably a header -- confirm against the
      ;; index format).
      (with-open-file (in "/tmp/key.idx" :element-type '(unsigned-byte 64))
        (read-byte in nil nil)
        (let ((base (ldb (byte 29 0) (read-byte in nil nil))))
          (loop for (name tag) in '(("CANONICAL_COM_ROOT"       #\0)
                                    ("CANONICAL_DECOM_ROOT"     #\1)
                                    ("COMPATIBILITY_DECOM_ROOT" #\2)
                                    ("CANONICAL_CLASS_ROOT"     #\3)
                                    ("NFC_ILLEGAL_ROOT"         #\4)
                                    ("NFKC_ILLEGAL_ROOT"        #\5))
                do (format out "const unsigned ~a = ~d;~%"
                           name (+ base (char-code tag))))))

      (dump-array "const unsigned NODES[]" "/tmp/key.idx"
                  '(unsigned-byte 64) 5
                  (lambda (v)
                    ;; High half first, then low half.
                    (format out "0x~8,'0x,0x~8,'0x"
                            (ldb (byte 32 32) v)
                            (ldb (byte 32 0) v)))
                  :skip 1)

      (dump-array "const unsigned VALUES[]" "/tmp/val.dat"
                  '(unsigned-byte 32) 10
                  (lambda (v) (format out "0x~8,'0x" v)))

      (dump-array "const char STRINGS[]" "/tmp/str.dat"
                  '(signed-byte 8) 20
                  (lambda (c) (format out "~4d" c)))

      (format out "}~%")
      (format out "}~%")
      (format out "#endif~%"))))
\ No newline at end of file |
@@ -31,8 +31,6 @@ | ||
31 | 31 | bool check_encoded_children(T& in) const { |
32 | 32 | switch(type()) { |
33 | 33 | case 0: |
34 | - // std::cout << "@ " << (int)in.peek() << std::endl; | |
35 | - // std::cout << "# " << (int)enc_chck(0) << ", " << (int)enc_chck(1) << std::endl; | |
36 | 34 | return ((enc_chck(0)==0 || (enc_chck(0) == in.read() && !in.eos())) && |
37 | 35 | (enc_chck(1)==0 || (enc_chck(1) == in.read() && !in.eos()))); |
38 | 36 | case 1: |
@@ -27,8 +27,22 @@ | ||
27 | 27 | if(node.check_char() != in.peek()) |
28 | 28 | return default_value; |
29 | 29 | } |
30 | - } | |
30 | + } | |
31 | + | |
32 | + bool member(const char* key) const { | |
33 | + Node2 node = nodes2[root]; | |
31 | 34 | |
35 | + for(CharStream in(key);; in.read()) { | |
36 | + if(node.is_terminal()) return true; | |
37 | + if(in.eos()) return false; | |
38 | + if(node.check_encoded_children(in)==false) return false; | |
39 | + | |
40 | + node = nodes2[node.jump(in.peek())]; | |
41 | + if(node.check_char() != in.peek()) | |
42 | + return false; | |
43 | + } | |
44 | + } | |
45 | + | |
32 | 46 | protected: |
33 | 47 | const Node2* nodes2; |
34 | 48 | const unsigned root; |
@@ -106,8 +120,7 @@ | ||
106 | 120 | NormalizationForm(const unsigned* nodes, unsigned root, const unsigned* vals, const char* strs) |
107 | 121 | : Searcher(Node2::from_uint_array(nodes), root, vals, strs) {} |
108 | 122 | |
109 | - // TODO: set的な検索は専用メソッドを設ける | |
110 | - bool quick_check(const char* key) const { return find_value(key,0xFFFFFFFF)==0xFFFFFFFF; } | |
123 | + bool quick_check(const char* key) const { return !member(key); } | |
111 | 124 | |
112 | 125 | void decompose(RangeCharStream in, std::string& buffer) const { |
113 | 126 | loop_head: |