ver0.0.4用のTrie(DAWG)構築ライブラリソースを追加
@@ -0,0 +1,166 @@ | ||
1 | +(defpackage dawg.double-array-builder | |
2 | + (:use :common-lisp :dawg.global) | |
3 | + (:export build-from-bintrie)) | |
4 | +(in-package :dawg.double-array-builder) | |
5 | + | |
6 | +(package-alias :dawg.double-array.node-allocator :node-allocator) | |
7 | +(package-alias :dawg.double-array.buffered-output :output) | |
8 | +(package-alias :dawg.bintrie-builder :bintrie) | |
9 | + | |
10 | +;;;;;;;;;;;;;;; | |
11 | +;;; declamation | |
12 | +(declaim #.*fastest* | |
13 | + (inline set-base set-chck set-opts)) | |
14 | + | |
15 | +;;;;;;;;;;;; | |
16 | +;;; constant | |
17 | +(defconstant +BUFFER_SIZE+ 819200) | |
18 | + | |
19 | +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
20 | +;;; da (abbreviation of "double array") | |
21 | +(defstruct da | |
22 | + (node t :type output:buffered-output) | |
23 | + (exts t :type stream) | |
24 | + (done-count 0 :type positive-fixnum)) | |
25 | + | |
26 | +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
27 | +;;; node | |
28 | +(defstruct node | |
29 | + (index 0 :type positive-fixnum) | |
30 | + (base 0 :type positive-fixnum) | |
31 | + (terminal? t :type boolean) | |
32 | + (sibling-total 0 :type positive-fixnum) | |
33 | + (chck 0 :type uint1) | |
34 | + (children '() :type list)) | |
35 | + | |
36 | +(defun new-node (parent-base-idx trie) | |
37 | + (declare (positive-fixnum parent-base-idx)) | |
38 | + (make-node :index (+ parent-base-idx (bintrie:node-label trie)) | |
39 | + :sibling-total #1=(bintrie:node-sibling-total trie) | |
40 | + :terminal? (bintrie:node-terminal? trie) | |
41 | + :base (if (bintrie:node-terminal? trie) (bintrie::node-value trie) 0) | |
42 | + :chck (bintrie:node-label trie))) | |
43 | + | |
44 | +(defun child-acceptable-p (node) | |
45 | + nil) | |
46 | + | |
47 | +(defun add-child (node child) | |
48 | + (with-slots (children) (the node node) | |
49 | + (setf children (nconc children (list (bintrie:node-label child)))))) | |
50 | + | |
51 | +;;;;;;;;;;;;;;;;;;;;;; | |
52 | +;;; auxiliary function | |
53 | + | |
54 | +(defun merge-files-native-order (destination files) | |
55 | + ;; write each file size | |
56 | + (with-open-output-file (out destination 'uint4) | |
57 | + (loop FOR (file) IN files | |
58 | + DO (with-open-file (in file :element-type 'uint1) | |
59 | + (write-byte (file-length in) out)))) | |
60 | + | |
61 | + ;; write each file content | |
62 | + (with-open-output-file (out destination 'uint1 :if-exists :append) | |
63 | + (loop FOR (file) IN files | |
64 | + DO (with-open-file (in file :element-type 'uint1) | |
65 | + (loop FOR b = (read-byte in nil nil) | |
66 | + WHILE b | |
67 | + DO (write-byte b out)))))) | |
68 | + | |
69 | +(defun merge-files-reverse-order (destination files) | |
70 | + ;; write each file size | |
71 | + (with-open-output-file (out destination 'uint4) | |
72 | + (loop FOR (file) IN files | |
73 | + DO (with-open-file (in file :element-type 'uint1) | |
74 | + (write-byte (byte-reverse (file-length in) 4) out)))) | |
75 | + | |
76 | + ;; write each file content | |
77 | + (loop FOR (file type) IN files | |
78 | + FOR byte-size = (ecase type (uint4 4) (uint4 4)) | |
79 | + DO | |
80 | + (with-open-output-file (out destination type :if-exists :append) | |
81 | + (with-open-file (in file :element-type type) | |
82 | + (loop FOR b = (read-byte in nil nil) | |
83 | + WHILE b | |
84 | + DO (write-byte (byte-reverse b byte-size) out)))))) | |
85 | + | |
86 | +(defun merge-files (destination byte-order files) | |
87 | + (if (or (eq byte-order :native) | |
88 | + (eq byte-order +NATIVE_ORDER+)) | |
89 | + (merge-files-native-order destination files) | |
90 | + (merge-files-reverse-order destination files)) | |
91 | + (mapc #'delete-file (mapcar #'first files))) | |
92 | + | |
93 | +(defmacro show (fmt &rest args) | |
94 | + `(when show-progress | |
95 | + (format t ,fmt ,@args))) | |
96 | + | |
97 | +;;;;;;;;;;;;;;;;;; | |
98 | +;;; build function | |
99 | +(defun write-node-impl (node da) | |
100 | + (with-slots (index type base terminal? sibling-total chck children) (the node node) | |
101 | + (let ((n 0)) | |
102 | + (declare ((unsigned-byte 32) n)) | |
103 | + (setf (ldb (byte 24 0) n) base | |
104 | + (ldb (byte 8 24) n) chck) | |
105 | + (output:write-uint n (da-node da) :position index)))) | |
106 | + | |
107 | +(defun write-node (node da &key base) | |
108 | + (when base | |
109 | + (setf (node-base node) base)) | |
110 | + (write-node-impl node da)) | |
111 | + | |
112 | +(defmacro show-and-write-node (node da &key base) | |
113 | + `(progn | |
114 | + (incf #1=(da-done-count ,da)) | |
115 | + (when (and show-progress (zerop (mod #1# 100000))) | |
116 | + (show "; ~a nodes~%" #1#)) | |
117 | + (write-node ,node ,da :base ,base))) | |
118 | + | |
119 | +(defun build-impl (trie alloca da node memo &optional show-progress) | |
120 | + (let ((children (bintrie:collect-children trie))) | |
121 | + (loop WHILE (and (not #1=(gethash (bintrie:node-child trie) memo)) | |
122 | + (null (cdr children)) | |
123 | + (not (bintrie::node-terminal? (car children))) | |
124 | + (child-acceptable-p node)) | |
125 | + DO | |
126 | + (add-child node (car children)) | |
127 | + (setf trie (car children)) | |
128 | + (setf children (bintrie:collect-children trie))) | |
129 | + | |
130 | + (a.if #1# | |
131 | + (show-and-write-node node da :base it) | |
132 | + (if (null children) | |
133 | + (show-and-write-node node da) | |
134 | + (let ((base-idx (node-allocator:allocate | |
135 | + alloca | |
136 | + (mapcar #'bintrie:node-label children)))) | |
137 | + (setf #1# base-idx) | |
138 | + (show-and-write-node node da :base base-idx) | |
139 | + | |
140 | + (dolist (child children) | |
141 | + (build-impl child alloca da (new-node base-idx child) memo show-progress))))))) | |
142 | + | |
143 | + | |
144 | +;;;;;;;;;;;;;;;;;;;;; | |
145 | +;;; external function | |
146 | +(defun build-from-bintrie (trie &key output-file byte-order show-progress) | |
147 | + (show "~2&; build double array from trie:~%") | |
148 | + (let ((node-file (format nil "~a.node" output-file)) | |
149 | + (exts-file (format nil "~a.ext" output-file))) | |
150 | + (show "; create tmpfiles: ~a, ~a~%" node-file exts-file) | |
151 | + | |
152 | + (show "; build:~%") | |
153 | + (output:with-output (node node-file :byte-width 4) | |
154 | + (with-open-output-file (exts exts-file 'uint4) | |
155 | + (let ((da (make-da :node node :exts exts))) | |
156 | + (build-impl trie (node-allocator:make) da | |
157 | + (new-node 0 trie) | |
158 | + (make-hash-table :test #'eq) | |
159 | + show-progress)))) | |
160 | + (show "; concatenate tempfiles to ~A~%" output-file) | |
161 | + (merge-files output-file byte-order `((,node-file uint4) (,exts-file uint4)))) | |
162 | + 'done) | |
163 | + | |
164 | +(package-alias :dawg.double-array.node-allocator) | |
165 | +(package-alias :dawg.double-array.buffered-output) | |
166 | +(package-alias :dawg.bintrie-builder) |
@@ -0,0 +1,16 @@ | ||
1 | +;# <- バイトオーダー判定用文字列 | |
2 | +;; ファイル名: byte-order.lisp | |
3 | +(in-package :dawg.global) | |
4 | + | |
5 | +(eval-when (:compile-toplevel :load-toplevel) | |
6 | + (defun guess-byte-order (sample-file) | |
7 | + (with-open-file (1byte sample-file :element-type '(unsigned-byte 8)) | |
8 | + (with-open-file (2byte sample-file :element-type '(unsigned-byte 16)) | |
9 | + (if (= (read-byte 2byte) | |
10 | + (+ (ash (read-byte 1byte) 8) (read-byte 1byte))) | |
11 | + :big | |
12 | + :little))))) | |
13 | + | |
14 | +(defconstant +NATIVE_ORDER+ (guess-byte-order (or *COMPILE-FILE-PATHNAME* | |
15 | + *LOAD-PATHNAME*))) | |
16 | + |
@@ -0,0 +1,168 @@ | ||
1 | +(defpackage dawg.bintrie-builder | |
2 | + (:use :common-lisp :dawg.global) | |
3 | + (:export build-from-file | |
4 | + build-from-list | |
5 | + collect-children | |
6 | + node-label | |
7 | + node-terminal? | |
8 | + node-sibling-total | |
9 | + node-child | |
10 | + node-options | |
11 | + element-count)) | |
12 | +(in-package :dawg.bintrie-builder) | |
13 | + | |
14 | +(package-alias :dawg.octet-stream :stream) | |
15 | + | |
16 | +;;;;;;;;;;;;;;; | |
17 | +;;; declamation | |
18 | +(declaim #.*fastest* | |
19 | + (inline make-node collect-children calc-child-total calc-sibling-total | |
20 | + node-options element-count)) | |
21 | + | |
22 | +;;;;;;;; | |
23 | +;;; node | |
24 | +(defstruct node | |
25 | + (label 0 :type octet) | |
26 | + (terminal? nil :type boolean) | |
27 | + (child nil :type (or null node)) | |
28 | + (sibling nil :type (or null node)) | |
29 | + (child-total 0 :type positive-fixnum) ; amount of child side nodes | |
30 | + (sibling-total 0 :type positive-fixnum) ; amount of sibling side nodes | |
31 | + (value -1 :type fixnum) | |
32 | + (hash -1 :type fixnum)) | |
33 | + | |
34 | +;;;;;;;;;;;;;;;;;;;;;; | |
35 | +;;; auxiliary function | |
36 | +(macrolet ((calc-xxx-total (node slot) | |
37 | + `(with-slots (,slot) (the node ,node) | |
38 | + (if (null ,slot) | |
39 | + 0 | |
40 | + (the positive-fixnum | |
41 | + (+ (if (node-terminal? ,slot) 1 0) | |
42 | + (node-child-total ,slot) (node-sibling-total ,slot))))))) | |
43 | + (defun calc-child-total (node) (calc-xxx-total node child)) | |
44 | + (defun calc-sibling-total (node) (calc-xxx-total node sibling))) | |
45 | + | |
46 | +;;;;;;;;;;;;;;;;; | |
47 | +;;; hash function | |
48 | +(defun node= (n1 n2) | |
49 | + (and (eq (node-child n1) (node-child n2)) | |
50 | + (eq (node-sibling n1) (node-sibling n2)) | |
51 | + (= (node-value n1) (node-value n2)) | |
52 | + (= (node-label n1) (node-label n2)) | |
53 | + (eq (node-terminal? n1) (node-terminal? n2)))) | |
54 | + | |
55 | +(defun sxhash-node (node) | |
56 | + (if (null node) | |
57 | + #.(sxhash nil) | |
58 | + (with-slots (hash child-total sibling-total) (the node node) | |
59 | + (when (= -1 hash) | |
60 | + (setf hash (logxor (sxhash (node-label node)) | |
61 | + (sxhash (node-value node)) | |
62 | + (sxhash (node-terminal? node)) | |
63 | + (fixnumize (* (sxhash-node (node-child node)) 7)) | |
64 | + (fixnumize (* (sxhash-node (node-sibling node)) 13)))) | |
65 | + (setf child-total (calc-child-total node) | |
66 | + sibling-total (calc-sibling-total node))) | |
67 | + hash))) | |
68 | + | |
69 | +;;;;;;;;;;;;;;;;;; | |
70 | +;;; build function | |
71 | +(defun share (node memo) | |
72 | + (if (null node) | |
73 | + nil | |
74 | + (or (dict:get node memo) | |
75 | + (progn | |
76 | + (setf (node-child node) (share (node-child node) memo) | |
77 | + (node-sibling node) (share (node-sibling node) memo)) | |
78 | + (dict:get node memo)) | |
79 | + (setf (dict:get node memo) node)))) | |
80 | + | |
81 | +(defun push-child (in parent value) | |
82 | + (if (stream:eos? in) | |
83 | + (setf (node-terminal? parent) t | |
84 | + (node-value parent) value) | |
85 | + (let ((new-node (make-node :label (stream:read in)))) | |
86 | + (shiftf (node-sibling new-node) (node-child parent) new-node) | |
87 | + (push-child in new-node value)))) | |
88 | + | |
89 | +(defun insert (in parent memo value) | |
90 | + (let ((node (node-child parent))) | |
91 | + (if (or (null node) | |
92 | + (stream:eos? in) | |
93 | + (/= (stream:peek in) (node-label node))) | |
94 | + (progn | |
95 | + (setf (node-child parent) (share node memo)) | |
96 | + (push-child in parent value)) | |
97 | + (insert (stream:eat in) node memo value)))) | |
98 | + | |
99 | +(defun build-impl (key-generator show-progress) | |
100 | + (loop WITH trie = (make-node) | |
101 | + WITH memo = (dict:make :test #'node= :hash #'sxhash-node) | |
102 | + FOR num fixnum FROM 0 | |
103 | + FOR (key . value) = (funcall key-generator) | |
104 | + WHILE key | |
105 | + DO | |
106 | + (when (and show-progress (zerop (mod num 100000))) | |
107 | + (format t "~&; ~A~%" num)) | |
108 | + (let ((in (stream:make key))) | |
109 | + (declare (dynamic-extent in)) | |
110 | + (insert in trie memo value)) | |
111 | + | |
112 | + FINALLY | |
113 | + (return (share trie memo)))) | |
114 | + | |
115 | +(defun build-from-list (keyset &key show-progress) | |
116 | + (when show-progress | |
117 | + (format t "~&; build trie list (size ~A):~%" (length keyset))) | |
118 | + (build-impl (lambda () (prog1 (car keyset) | |
119 | + (setf keyset (cdr keyset)))) | |
120 | + show-progress)) | |
121 | + | |
122 | +(defun build-from-file (filepath &key show-progress) | |
123 | + (when show-progress | |
124 | + (format t "~&; build trie from ~A:~%" filepath)) | |
125 | + (with-open-file (in filepath) | |
126 | + (build-impl (lambda () (read-line in nil nil)) | |
127 | + show-progress))) | |
128 | + | |
129 | +;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
130 | +;;; other external function | |
131 | +(defun node-options (node) | |
132 | + "Encode terminal? and sibling-total fields into fixnum" | |
133 | + (with-slots (terminal? sibling-total) (the node node) | |
134 | + (fixnumize | |
135 | + (+ (if terminal? 1 0) | |
136 | + (ash sibling-total 1))))) | |
137 | + | |
138 | +(defun element-count (node) | |
139 | + (with-slots (terminal? child-total) (the node node) | |
140 | + (the fixnum (+ (if terminal? 1 0) child-total)))) | |
141 | + | |
142 | +(defun collect-children (node) | |
143 | + (loop WITH acc = '() | |
144 | + FOR child = (node-child node) | |
145 | + THEN (node-sibling child) | |
146 | + WHILE child | |
147 | + DO | |
148 | + (push child acc) | |
149 | + FINALLY | |
150 | + (return acc))) | |
151 | + | |
152 | +;;;;;;;;;;;;; | |
153 | +;;; for debug | |
154 | +(defun member? (key trie) | |
155 | + (declare #.*interface* | |
156 | + (simple-characters key) | |
157 | + (node trie)) | |
158 | + (let ((in (stream:make key))) | |
159 | + (declare (dynamic-extent in)) | |
160 | + (nlet recur ((in in) (node (node-child trie)) (parent trie)) | |
161 | + (cond ((stream:eos? in) (node-terminal? parent)) | |
162 | + ((null node) nil) | |
163 | + ((= (stream:peek in) (node-label node)) | |
164 | + (recur (stream:eat in) (node-child node) node)) | |
165 | + ((< (stream:peek in) (node-label node)) | |
166 | + (recur in (node-sibling node) parent)))))) | |
167 | + | |
168 | +(package-alias :dawg.octet-stream) |
@@ -0,0 +1,91 @@ | ||
1 | +(defpackage dawg.global | |
2 | + (:use :common-lisp) | |
3 | + (:export ;; special variable | |
4 | + *fastest* | |
5 | + *interface* | |
6 | + | |
7 | + ;; type | |
8 | + array-index | |
9 | + positive-fixnum | |
10 | + octet | |
11 | + simple-characters | |
12 | + unicode | |
13 | + uint8 | |
14 | + uint4 | |
15 | + uint1 | |
16 | + | |
17 | + ;; byte order | |
18 | + +NATIVE_ORDER+ | |
19 | + byte-reverse | |
20 | + | |
21 | + ;; utility function | |
22 | + fixnumize | |
23 | + package-alias | |
24 | + muffle | |
25 | + a.if | |
26 | + nlet | |
27 | + with-open-output-file | |
28 | + | |
29 | + ;; symbol for anaphoric macro | |
30 | + it)) | |
31 | +(in-package :dawg.global) | |
32 | + | |
33 | +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
34 | +;;; special variable for optimize declaration | |
35 | +(defvar *fastest* '(optimize (speed 3) (safety 0) (debug 0) (compilation-speed 0))) | |
36 | +(defvar *interface* '(optimize (speed 3) (safety 2) (debug 1) (compilation-speed 0))) | |
37 | + | |
38 | +;;;;;;;;;;;;;;;;;;; | |
39 | +;;; type definition | |
40 | +(deftype array-index () `(mod ,array-dimension-limit)) | |
41 | +(deftype positive-fixnum () `(integer 0 ,most-positive-fixnum)) | |
42 | +(deftype octet () '(unsigned-byte 8)) | |
43 | +(deftype simple-characters () '(simple-array character)) | |
44 | +(deftype unicode () `(mod ,char-code-limit)) | |
45 | +(deftype uint8 () '(unsigned-byte 64)) | |
46 | +(deftype uint4 () '(unsigned-byte 32)) | |
47 | +(deftype uint1 () '(unsigned-byte 8)) | |
48 | + | |
49 | +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
50 | +;;; unility function and macro | |
51 | +(declaim (inline fixnumize)) | |
52 | +(defun fixnumize (n) | |
53 | + (ldb (byte #.(integer-length most-positive-fixnum) 0) n)) | |
54 | + | |
55 | +(defmacro package-alias (package &rest alias-list) | |
56 | + `(eval-when (:compile-toplevel :load-toplevel :execute) | |
57 | + (rename-package ,package ,package ',alias-list))) | |
58 | + | |
59 | +(defmacro muffle (&body body) | |
60 | + `(locally | |
61 | + (declare #+SBCL (sb-ext:muffle-conditions sb-ext:compiler-note)) | |
62 | + ,@body)) | |
63 | + | |
64 | +(defmacro a.if (exp then else) | |
65 | + `(let ((it ,exp)) | |
66 | + (if it | |
67 | + ,then | |
68 | + ,else))) | |
69 | + | |
70 | +(defmacro nlet (fn-name letargs &body body) | |
71 | + `(labels ((,fn-name ,(mapcar #'car letargs) | |
72 | + ,@body)) | |
73 | + (,fn-name ,@(mapcar #'cadr letargs)))) | |
74 | + | |
75 | +(defmacro with-open-output-file ((stream path element-type &key (if-exists :supersede)) &body body) | |
76 | + `(with-open-file (,stream ,path :direction :output | |
77 | + :if-exists ,if-exists | |
78 | + :element-type ,element-type) | |
79 | + ,@body)) | |
80 | + | |
81 | +(declaim (inline byte-reverse)) | |
82 | +(defun byte-reverse (n size) | |
83 | + (declare ((member 2 4 8) size)) | |
84 | + (muffle | |
85 | + (loop FOR u fixnum FROM (1- size) DOWNTO 0 | |
86 | + FOR l fixnum FROM 0 TO (1- size) | |
87 | + WHILE (> u l) | |
88 | + DO | |
89 | + (rotatef (ldb (byte 8 (* u 8)) n) | |
90 | + (ldb (byte 8 (* l 8)) n))) | |
91 | + n)) |
@@ -0,0 +1,21 @@ | ||
1 | +The MIT License | |
2 | + | |
3 | +Copyright (c) 2010 Takeru Ohta <phjgt308@gmail.com> | |
4 | + | |
5 | +Permission is hereby granted, free of charge, to any person obtaining a copy | |
6 | +of this software and associated documentation files (the "Software"), to deal | |
7 | +in the Software without restriction, including without limitation the rights | |
8 | +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
9 | +copies of the Software, and to permit persons to whom the Software is | |
10 | +furnished to do so, subject to the following conditions: | |
11 | + | |
12 | +The above copyright notice and this permission notice shall be included in | |
13 | +all copies or substantial portions of the Software. | |
14 | + | |
15 | +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
16 | +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
17 | +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
18 | +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
19 | +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
20 | +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
21 | +THE SOFTWARE. |
@@ -0,0 +1,136 @@ | ||
1 | +(defpackage dawg | |
2 | + (:use :common-lisp :dawg.global) | |
3 | + (:shadow :common-lisp load) | |
4 | + (:export dawg | |
5 | + build | |
6 | + load | |
7 | + member? | |
8 | + get-id)) | |
9 | +(in-package :dawg) | |
10 | + | |
11 | +(package-alias :dawg.octet-stream :stream) | |
12 | + | |
13 | +;;;;;;;;;;;;;;;;;;;; | |
14 | +;;; special variable | |
15 | +(eval-when (:compile-toplevel) | |
16 | + (defvar *args-type* '(simple-characters dawg &key (:start positive-fixnum) | |
17 | + (:end positive-fixnum)))) | |
18 | +(defconstant +ARC_LIMIT+ #x100) | |
19 | + | |
20 | +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
21 | +;;; dawg (double-array format) | |
22 | +(defstruct dawg | |
23 | + (node #() :type (simple-array uint4))) | |
24 | + | |
25 | +(defmethod print-object ((o dawg) stream) | |
26 | + (print-unreadable-object (o stream :type t :identity t) | |
27 | + (format stream "~A:~A" :node-count (length (dawg-node o))))) | |
28 | + | |
29 | +;;;;;;;;;;;;;;; | |
30 | +;;; declamation | |
31 | +(declaim (inline check-encoded-children get-node | |
32 | + base chck terminal? sibling-total inc-id | |
33 | + get-id-impl member?-impl | |
34 | + each-common-prefix-impl each-predictive-impl) | |
35 | + (ftype (function #.*args-type* boolean) member?) | |
36 | + (ftype (function #.*args-type* (or null positive-fixnum)) get-id)) | |
37 | + | |
38 | +;;;;;;;;;;;;;;;;;;;;;;;;; | |
39 | +;;; auxiliary function(1) | |
40 | +(defun read-array (index-path &key size element-type offset byte-order) | |
41 | + (declare ((member uint4 uint8) element-type)) | |
42 | + (with-open-file (in index-path :element-type element-type) | |
43 | + (file-position in offset) | |
44 | + (let ((ary (make-array size :element-type element-type))) | |
45 | + (read-sequence ary in) | |
46 | + (unless (or (eq byte-order :native) | |
47 | + (eq byte-order +NATIVE_ORDER+)) | |
48 | + (let ((byte-size (ecase element-type | |
49 | + (uint4 4) | |
50 | + (uint8 8)))) | |
51 | + (dotimes (i size) | |
52 | + (setf (aref ary i) (byte-reverse (aref ary i) byte-size))))) | |
53 | + ary))) | |
54 | + | |
55 | +;;;;;;;;;;;;;;;;;;;;;;;; | |
56 | +;;; external function(1) | |
57 | +(defun build (&key input output (byte-order :native) show-progress) | |
58 | + (declare ((or string pathname list) input) | |
59 | + ((or string pathname) output) | |
60 | + ((member :native :little :big) byte-order)) | |
61 | + (let ((trie (if (listp input) | |
62 | + (dawg.bintrie-builder:build-from-list input :show-progress show-progress) | |
63 | + (dawg.bintrie-builder:build-from-file input :show-progress show-progress)))) | |
64 | + (dawg.double-array-builder:build-from-bintrie | |
65 | + trie :output-file output :byte-order byte-order :show-progress show-progress)) | |
66 | + t) | |
67 | + | |
68 | +(defun load (index-path &key (byte-order :native)) | |
69 | + (declare ((or string pathname file-stream) index-path) | |
70 | + ((member :native :little :big) byte-order)) | |
71 | + (let ((sizes (read-array index-path :size 2 :element-type 'uint4 :offset 0 | |
72 | + :byte-order byte-order))) | |
73 | + (make-dawg | |
74 | + :node (read-array index-path :element-type 'uint4 | |
75 | + :size (/ (aref sizes 0) 4) | |
76 | + :offset 2 | |
77 | + :byte-order byte-order)))) | |
78 | + | |
79 | +;;;;;;;;;;;;;;;;;;;;;;;;; | |
80 | +;;; auxiliary function(2) | |
81 | +(defun base (node) (ldb (byte 24 0) (the uint8 node))) | |
82 | +(defun chck (node) (ldb (byte 8 24) (the uint8 node))) | |
83 | + | |
84 | +(defun get-node (dawg index) | |
85 | + (declare (dawg dawg) | |
86 | + (positive-fixnum index)) | |
87 | + (aref (dawg-node dawg) index)) | |
88 | + | |
89 | +;;;;;;;;;;;;;;;;;;;;;;;; | |
90 | +;;; external function(2) | |
91 | +(defun member?-impl (in dawg) | |
92 | + (nlet recur ((node (get-node dawg 0))) | |
93 | + (if (stream:eos? in) | |
94 | + t | |
95 | + (let* ((arc (stream:read in)) | |
96 | + (next (get-node dawg (+ (base node) arc)))) | |
97 | + (when (= (chck next) arc) | |
98 | + (recur next)))))) | |
99 | + | |
100 | +(defun get-id-impl (in dawg) | |
101 | + (nlet recur ((node (get-node dawg 0))) | |
102 | + (print (list :in #1=(+ (base node) (stream:peek in)) (get-node dawg #1#) | |
103 | + (stream:peek in))) | |
104 | + (if (stream:eos? in) | |
105 | + (ldb (byte 24 0) node) | |
106 | + (let* ((arc (stream:read in)) | |
107 | + (next (get-node dawg (+ (base node) arc)))) | |
108 | + (when (= (chck next) arc) | |
109 | + (recur next)))))) | |
110 | + | |
111 | + | |
112 | +(defmacro with-key-stream ((in key &key start end) &body body) | |
113 | + (let ((k (gensym)) | |
114 | + (s (gensym)) | |
115 | + (e (gensym))) | |
116 | + `(let ((,k ,key) | |
117 | + (,s ,start) | |
118 | + (,e ,end)) | |
119 | + (declare #.*interface* | |
120 | + (simple-characters ,k) | |
121 | + (positive-fixnum ,s ,e)) | |
122 | + (locally | |
123 | + (declare #.*fastest*) | |
124 | + (let ((,in (stream:make ,k :start ,s :end ,e))) | |
125 | + (declare (dynamic-extent ,in)) | |
126 | + ,@body))))) | |
127 | + | |
128 | +(defun member? (key dawg &key (start 0) (end (length key))) | |
129 | + (with-key-stream (in key :start start :end end) | |
130 | + (member?-impl in dawg))) | |
131 | + | |
132 | +(defun get-id (key dawg &key (start 0) (end (length key))) | |
133 | + (with-key-stream (in key :start start :end end) | |
134 | + (get-id-impl in dawg))) | |
135 | + | |
136 | +(package-alias :dawg.octet-stream) |
@@ -0,0 +1,64 @@ | ||
1 | +(defpackage dawg.double-array.buffered-output | |
2 | + (:use :common-lisp :dawg.global) | |
3 | + (:export buffered-output | |
4 | + with-output | |
5 | + write-uint)) | |
6 | +(in-package :dawg.double-array.buffered-output) | |
7 | + | |
8 | +;;;;;;;;;;;;;;; | |
9 | +;;; declamation | |
10 | +(declaim #.*fastest*) | |
11 | + | |
12 | +;;;;;;;;;;;; | |
13 | +;;; constant | |
14 | +(defconstant +BUFFER_SIZE+ 819200) | |
15 | + | |
16 | +;;;;;;;;;;;;;;;;;;; | |
17 | +;; buffered-output | |
18 | +(defstruct buffered-output | |
19 | + (binary-output nil :type file-stream) | |
20 | + (buffer #() :type simple-array) | |
21 | + (offset 0 :type array-index)) | |
22 | + | |
23 | +;;;;;;;;;;;;;;;;;;;;; | |
24 | +;;; external function | |
25 | +(defmacro with-output ((out path &key (byte-width 1)) &body body) | |
26 | + (declare ((member 1 2 4 8) byte-width)) | |
27 | + `(with-open-file (,out ,path :element-type #1='(unsigned-byte ,(* 8 byte-width)) | |
28 | + :direction :output | |
29 | + :if-exists :supersede) | |
30 | + (let ((,out (make-buffered-output | |
31 | + :binary-output ,out | |
32 | + :buffer (make-array ,+BUFFER_SIZE+ :element-type #1# | |
33 | + :initial-element #xFF000000)))) | |
34 | + (unwind-protect | |
35 | + (locally ,@body) | |
36 | + (flush ,out :final t))))) | |
37 | + | |
38 | +(defun write-uint (uint out &key (position 0)) | |
39 | + (declare (buffered-output out) | |
40 | + (positive-fixnum position)) | |
41 | + (with-slots (binary-output buffer offset) out | |
42 | + (cond ((< position offset) | |
43 | + (file-position binary-output position) | |
44 | + (write-byte uint binary-output)) | |
45 | + ((< position (+ offset +BUFFER_SIZE+)) | |
46 | + (muffle | |
47 | + (setf (aref buffer (- position offset)) uint))) | |
48 | + (t | |
49 | + (flush out) | |
50 | + (incf offset +BUFFER_SIZE+) | |
51 | + (fill buffer #xFF000000) | |
52 | + (write-uint uint out :position position))))) | |
53 | + | |
54 | +(defun flush (out &key final) | |
55 | + (declare (buffered-output out)) | |
56 | + (with-slots (binary-output buffer offset) out | |
57 | + (file-position binary-output offset) | |
58 | + (if (null final) | |
59 | + (write-sequence buffer binary-output) | |
60 | + (let ((end (muffle | |
61 | + (or (position-if-not (lambda (x) (= x #xFF000000)) buffer :from-end t) | |
62 | + (1- +BUFFER_SIZE+))))) | |
63 | + (write-sequence buffer binary-output :end (1+ end)) | |
64 | + (loop REPEAT #x100 DO (write-byte #xFF000000 binary-output)))))) |
@@ -0,0 +1,98 @@ | ||
1 | +[概要] | |
2 | +・DAWG(Direct Acyclic Word Graph)によるマップの実装 | |
3 | +・入力キーセットを静的に受け取り、各キーに一意なIDをマッピングする | |
4 | + ・入力キーセットはソート済みであり、かつ各キーはユニークでなければならない | |
5 | + ・キーのID == 先頭から数えたキーの出現位置、となる ※ 先頭のキーのID値は0となる | |
6 | +・数千万程度のキーセットをCommon Lispで比較的手軽に扱えるようにするのが目的 | |
7 | + ・2~4GBのメモリを積んでいる32bitマシンで | |
8 | +・masterからSBCLへの依存性を排除したブランチ | |
9 | + | |
10 | + | |
11 | +[バージョン] | |
12 | +・0.2.2 | |
13 | + | |
14 | + | |
15 | +[依存パッケージ] | |
16 | +・dict-0.0.2 | |
17 | + ・ポータブルなハッシュテーブル | |
18 | + ・https://github.com/sile/dict | |
19 | + | |
20 | + | |
21 | +[インストール] | |
22 | +> (require :asdf) | |
23 | +> (require :asdf-install) | |
24 | +> (asdf-install:install "cl-dawg-VERSION.tar.gz") | |
25 | + | |
26 | + | |
27 | +[API] | |
28 | +### | |
29 | +# dawg | |
30 | + メインパッケージ | |
31 | + | |
32 | +#### | |
33 | +# dawg:build (&key input output byte-order show-progress) => t | |
34 | + キーセットファイル※1からDAWGインデックスファイル※2を作成する。 | |
35 | + = input: 入力キーセットファイルのパス名 or キーセットが格納されたリスト (必須) | |
36 | + = output: 出力DAWGインデックスファイルのパス名 (必須) | |
37 | + = byte-order: インデックスファイルのバイトオーダーを指定する。:native or :big or :little | |
38 | + デフォルトは :native | |
39 | + = show-progress: 作成時の進捗表示を行うかどうか。デフォルトはnil | |
40 | + | |
41 | +※1: キーセットが昇順に改行区切りで格納されているファイル | |
42 | +※2: 正確にはDAWGをDoubleArray形式で保存したインデックスファイル | |
43 | + | |
44 | +#### | |
45 | +# dawg:load (index-path &key byte-order) => dawg:dawg | |
46 | + DAWGインデックスファイルからDAWG(DoubleArray形式)を読み込む。 | |
47 | + = index-path: build関数で作成したDAWGインデックスファイル | |
48 | + = byte-order: インデックスファイルのバイトオーダーを指定する。:native or :big or :little | |
49 | + デフォルトは :native | |
50 | + | |
51 | +### | |
52 | +# dawg:member? (key dawg &key start end) => boolean | |
53 | + キーがDAWGに含まれているかどうかを判定する。 | |
54 | + = key: 対象のキー文字列。(simple-array character)型でなければならない | |
55 | + = dawg: DAWG | |
56 | + = start: キー文字列内の開始位置 | |
57 | + = end: キー文字列内の終端位置 | |
58 | + | |
59 | +### | |
60 | +# dawg:get-id (key dawg &key start end) => (or null fixnum) | |
61 | + キーに紐付くIDを取得する。 | |
62 | + キーがDAWG内に存在しない場合はnilを返す。 | |
63 | + = key: 対象のキー文字列。(simple-array character)型でなければならない | |
64 | + = dawg: DAWG | |
65 | + = start: キー文字列内の開始位置 | |
66 | + = end: キー文字列内の終端位置 | |
67 | + | |
68 | +### | |
69 | +# dawg:each-common-prefix ((match-id match-end) | |
70 | + (key dawg &key start end) | |
71 | + &body body) | |
72 | + 入力キーに対して共通接頭辞検索を行う。 | |
73 | + 入力キーの接頭辞部分にマッチするDAWG内の各キーに対して、body部分の処理が実行される。 | |
74 | + ※ return関数を使うことで、途中でループを抜けることが可能 | |
75 | + = match-id: 入力キーの接頭辞部分にマッチしたキーのID値 | |
76 | + = match-end: 入力キー内のマッチした部分の終端位置 | |
77 | + = key: 入力キー文字列。(simple-array character)型でなければならない | |
78 | + = start: キー文字列内の開始位置 | |
79 | + = end: キー文字列内の終端位置 | |
80 | + = body: マッチの度に実行される式 | |
81 | + | |
82 | +### | |
83 | +# dawg:each-predictive ((match-id) | |
84 | + (key dawg &key start end) | |
85 | + &body body) | |
86 | + 入力キーが接頭辞となる全ての要素を走査する。 | |
87 | + 各走査で要素のIDを受け取り、body部分の処理が実行される。 | |
88 | + ※ return関数を使うことで、途中でループを抜けることが可能 | |
89 | + = match-id: 入力キーが接頭辞となる要素のID値 | |
90 | + = key: 入力キー文字列。(simple-array character)型でなければならない | |
91 | + = start: キー文字列内の開始位置 | |
92 | + = end: キー文字列内の終端位置 | |
93 | + = body: マッチの度に実行される式 | |
94 | + | |
95 | + | |
96 | +[注意事項] | |
97 | +・DAWGのキー内の文字にヌル文字を含めることは出来ない | |
98 | + ・DoubleArray中で(CHECK配列の初期値として)特別扱いしているため |
@@ -0,0 +1,131 @@ | ||
1 | +(defpackage dawg.double-array.node-allocator | |
2 | + (:use :common-lisp :dawg.global) | |
3 | + (:export make | |
4 | + allocate)) | |
5 | +(in-package :dawg.double-array.node-allocator) | |
6 | + | |
7 | +;;;;;;;;;;;;;;; | |
8 | +;;; declamation | |
9 | +(declaim #.*fastest* | |
10 | + (inline get-next can-allocate?)) | |
11 | + | |
12 | +;;;;;;;;;;;; | |
13 | +;;; constant | |
14 | +(eval-when (:compile-toplevel :load-toplevel :execute) | |
15 | + (defconstant +BUFFER_SIZE+ 89120)) | |
16 | + | |
17 | +;;;;;;;;;;;;;;;;;; | |
18 | +;;; node-allocator | |
19 | +(defstruct node-allocator | |
20 | + (head #x100 :type array-index) | |
21 | + (bits #* :type (simple-bit-vector #.+BUFFER_SIZE+)) | |
22 | + (nexts #() :type (simple-array fixnum (#.+BUFFER_SIZE+))) | |
23 | + (prevs #() :type (simple-array fixnum (#.+BUFFER_SIZE+))) | |
24 | + (offset 0 :type array-index)) | |
25 | + | |
26 | +;;;;;;;;;;;;;;; | |
27 | +;;; constructor | |
28 | +(defun make () | |
29 | + (let ((bits (make-array +BUFFER_SIZE+ :element-type 'bit :initial-element 0)) | |
30 | + (nexts (make-array +BUFFER_SIZE+ :element-type 'fixnum)) | |
31 | + (prevs (make-array +BUFFER_SIZE+ :element-type 'fixnum))) | |
32 | + (loop FOR i FROM 0 BELOW +BUFFER_SIZE+ | |
33 | + DO | |
34 | + (setf (aref nexts i) (1+ i) | |
35 | + (aref prevs i) (1- i))) | |
36 | + (make-node-allocator :nexts nexts :prevs prevs :bits bits))) | |
37 | + | |
38 | +;;;;;;;;;;;;;;;;;;;;;; | |
39 | +;;; auxiliary function | |
40 | +(defun shift (alloca) | |
41 | + (with-slots (bits nexts prevs offset head) (the node-allocator alloca) | |
42 | + (let ((new-offset head)) | |
43 | + (loop WHILE (< new-offset (+ offset (- +BUFFER_SIZE+ (* #x100 2)))) | |
44 | + DO | |
45 | + (setf new-offset (aref nexts (- new-offset offset)))) | |
46 | + (let* ((delta (- new-offset offset)) | |
47 | + (use-len (- +BUFFER_SIZE+ delta))) | |
48 | + (shiftf (subseq bits 0 use-len) (subseq bits delta)) | |
49 | + (fill bits 0 :start use-len) | |
50 | + | |
51 | + (setf offset new-offset) | |
52 | + | |
53 | + (shiftf (subseq nexts 0 use-len) (subseq nexts delta)) | |
54 | + (shiftf (subseq prevs 0 use-len) (subseq prevs delta)) | |
55 | + (loop FOR i FROM (+ offset use-len) BELOW (+ offset +BUFFER_SIZE+) | |
56 | + DO | |
57 | + (setf (aref nexts (- i offset)) (1+ i) | |
58 | + (aref prevs (- i offset)) (1- i))) | |
59 | + | |
60 | + (setf head offset) | |
61 | + (loop WHILE (< head (+ offset #x100)) | |
62 | + DO | |
63 | + (setf head (aref nexts (- head offset))))))) | |
64 | + alloca) | |
65 | + | |
66 | +(defun ref (alloca index) | |
67 | + (declare (array-index index)) | |
68 | + (with-slots (offset nexts) (the node-allocator alloca) | |
69 | + (if (<= (+ offset +BUFFER_SIZE+) index) | |
70 | + (ref (shift alloca) index) | |
71 | + (aref nexts (- index offset))))) | |
72 | + | |
73 | +(defun bref (alloca index) | |
74 | + (declare (array-index index)) | |
75 | + (with-slots (bits offset) (the node-allocator alloca) | |
76 | + (if (> offset index) | |
77 | + 1 | |
78 | + (if (<= (+ offset +BUFFER_SIZE+) index) | |
79 | + (bref (shift alloca) index) | |
80 | + (bit bits (- index offset)))))) | |
81 | + | |
82 | +(defun get-next (alloca index) | |
83 | + (ref alloca index)) | |
84 | + | |
85 | +(defun can-allocate? (alloca index arcs) | |
86 | + (declare (list arcs) | |
87 | + (array-index index)) | |
88 | + (and (zerop (bref alloca index)) | |
89 | + (every (lambda (arc) | |
90 | + (declare (octet arc)) | |
91 | + (/= -1 (ref alloca (+ index arc)))) | |
92 | + arcs))) | |
93 | + | |
94 | +(defun allocate-impl (alloca index arcs) | |
95 | + (declare (array-index index)) | |
96 | + (with-slots (bits head prevs nexts offset) (the node-allocator alloca) | |
97 | + (when (<= offset index) | |
98 | + (setf (bit bits (- index offset)) 1)) | |
99 | + (loop WITH base = index | |
100 | + FOR arc OF-TYPE (mod #x100) IN arcs | |
101 | + FOR index OF-TYPE fixnum = (+ base arc) | |
102 | + DO | |
103 | + (when (<= offset index) | |
104 | + (ref alloca index) | |
105 | + | |
106 | + (let ((prev (aref prevs (- index offset))) | |
107 | + (next (aref nexts (- index offset)))) | |
108 | + (setf (aref prevs (- index offset)) -1 | |
109 | + (aref nexts (- index offset)) -1) | |
110 | + | |
111 | + (when (= head index) | |
112 | + (setf head next)) | |
113 | + | |
114 | + (when (<= offset prev) | |
115 | + (setf (aref nexts (- prev offset)) next)) | |
116 | + | |
117 | + (when (<= offset next) | |
118 | + (ref alloca next) | |
119 | + (setf (aref prevs (- next offset)) prev))))))) | |
120 | + | |
121 | +;;;;;;;;;;;;;;;;;;;;; | |
122 | +;;; external function | |
123 | +(defun allocate (alloca arcs) | |
124 | + (with-slots (head) (the node-allocator alloca) | |
125 | + (loop WITH front OF-TYPE (mod #x100) = (car arcs) | |
126 | + FOR cur = (get-next alloca head) THEN (get-next alloca cur) | |
127 | + FOR base OF-TYPE fixnum = (- cur front) | |
128 | + UNTIL (and (plusp base) (can-allocate? alloca base (cdr arcs))) | |
129 | + FINALLY | |
130 | + (allocate-impl alloca base arcs) | |
131 | + (return base)))) |
@@ -0,0 +1,78 @@ | ||
1 | +(defpackage dawg.octet-stream | |
2 | + (:use :common-lisp :dawg.global) | |
3 | + (:shadow :common-lisp read peek position) | |
4 | + (:export make | |
5 | + read | |
6 | + peek | |
7 | + eos? | |
8 | + eat | |
9 | + position)) | |
10 | +(in-package :dawg.octet-stream) | |
11 | + | |
12 | +;;;;;;;;;;;;;;;; | |
13 | +;;; declamation | |
14 | +(declaim #.*fastest* | |
15 | + (inline make-octet-stream make eos? octet-length peek read eat position)) | |
16 | + | |
17 | +;;;;;;;;;;;;;;;; | |
18 | +;;; octet-stream | |
19 | +(defstruct octet-stream | |
20 | + (src "" :type simple-characters) | |
21 | + (pos 0 :type array-index) | |
22 | + (end 0 :type array-index) | |
23 | + (code 0 :type unicode) | |
24 | + (octet-pos 0 :type (mod 5)) | |
25 | + (octet-len 0 :type (mod 5))) | |
26 | + | |
27 | +;;;;;;;;;;;;;;;;;;;;;; | |
28 | +;;; auxiliary function | |
29 | +(defun octet-length (code) | |
30 | + (declare (unicode code)) | |
31 | + (cond ((< code #x80) 1) | |
32 | + ((< code #x800) 2) | |
33 | + ((< code #x10000) 3) | |
34 | + (t 4))) | |
35 | + | |
36 | +;;;;;;;;;;;;;;;;;;;;; | |
37 | +;;; external function | |
38 | +(defun position (in) | |
39 | + (octet-stream-pos in)) | |
40 | + | |
41 | +(defun make (string &key (start 0) (end (length string))) | |
42 | + (declare (simple-characters string) | |
43 | + (array-index start end)) | |
44 | + (let* ((code (if (= start (length string)) | |
45 | + 0 | |
46 | + (char-code (char string start)))) | |
47 | + (len (octet-length code))) | |
48 | + (make-octet-stream :src string :pos start :end end | |
49 | + :code code :octet-pos len :octet-len len))) | |
50 | + | |
51 | +(defun eos? (in) | |
52 | + (with-slots ( pos end) (the octet-stream in) | |
53 | + (= pos end))) | |
54 | + | |
55 | +(defun peek (in) | |
56 | + (with-slots (code octet-pos octet-len) (the octet-stream in) | |
57 | + (if (= octet-pos octet-len) | |
58 | + (case octet-len | |
59 | + (1 code) | |
60 | + (2 (+ #b11000000 (ldb (byte 5 6) code))) | |
61 | + (3 (+ #b11100000 (ldb (byte 4 12) code))) | |
62 | + (t (+ #b11110000 (ldb (byte 3 18) code)))) | |
63 | + (+ #b10000000 (ldb (byte 6 (* (the (mod 4) (1- octet-pos)) 6)) code))))) | |
64 | + | |
65 | +(defun eat (in) | |
66 | + (with-slots (src pos code octet-pos octet-len) (the octet-stream in) | |
67 | + (decf octet-pos) | |
68 | + (when (zerop octet-pos) | |
69 | + (incf pos) | |
70 | + (unless (eos? in) | |
71 | + (setf code (char-code (char src pos)) | |
72 | + octet-len (octet-length code) | |
73 | + octet-pos octet-len)))) | |
74 | + in) | |
75 | + | |
76 | +(defun read (in) | |
77 | + (prog1 (peek in) | |
78 | + (eat in))) |