• R/O
  • SSH
  • HTTPS

unf: Commit


Commit MetaInfo

Revision54 (tree)
Time2011-11-19 19:10:12
Authorphjgt

Log Message

ver0.0.4用のTrie(DAWG)構築ライブラリソースを追加

Change Summary

Incremental Difference

--- trunk/lisp/lib/cl-dawg-0.2.2-unf/double-array-builder.lisp (nonexistent)
+++ trunk/lisp/lib/cl-dawg-0.2.2-unf/double-array-builder.lisp (revision 54)
@@ -0,0 +1,166 @@
1+(defpackage dawg.double-array-builder
2+ (:use :common-lisp :dawg.global)
3+ (:export build-from-bintrie))
4+(in-package :dawg.double-array-builder)
5+
6+(package-alias :dawg.double-array.node-allocator :node-allocator)
7+(package-alias :dawg.double-array.buffered-output :output)
8+(package-alias :dawg.bintrie-builder :bintrie)
9+
10+;;;;;;;;;;;;;;;
11+;;; declamation
12+(declaim #.*fastest*
13+ (inline set-base set-chck set-opts))
14+
15+;;;;;;;;;;;;
16+;;; constant
17+(defconstant +BUFFER_SIZE+ 819200)
18+
19+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
20+;;; da (abbreviation of "double array")
21+(defstruct da
22+ (node t :type output:buffered-output)
23+ (exts t :type stream)
24+ (done-count 0 :type positive-fixnum))
25+
26+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
27+;;; node
28+(defstruct node
29+ (index 0 :type positive-fixnum)
30+ (base 0 :type positive-fixnum)
31+ (terminal? t :type boolean)
32+ (sibling-total 0 :type positive-fixnum)
33+ (chck 0 :type uint1)
34+ (children '() :type list))
35+
36+(defun new-node (parent-base-idx trie)
37+ (declare (positive-fixnum parent-base-idx))
38+ (make-node :index (+ parent-base-idx (bintrie:node-label trie))
39+ :sibling-total #1=(bintrie:node-sibling-total trie)
40+ :terminal? (bintrie:node-terminal? trie)
41+ :base (if (bintrie:node-terminal? trie) (bintrie::node-value trie) 0)
42+ :chck (bintrie:node-label trie)))
43+
44+(defun child-acceptable-p (node)
45+ nil)
46+
47+(defun add-child (node child)
48+ (with-slots (children) (the node node)
49+ (setf children (nconc children (list (bintrie:node-label child))))))
50+
51+;;;;;;;;;;;;;;;;;;;;;;
52+;;; auxiliary function
53+
54+(defun merge-files-native-order (destination files)
55+ ;; write each file size
56+ (with-open-output-file (out destination 'uint4)
57+ (loop FOR (file) IN files
58+ DO (with-open-file (in file :element-type 'uint1)
59+ (write-byte (file-length in) out))))
60+
61+ ;; write each file content
62+ (with-open-output-file (out destination 'uint1 :if-exists :append)
63+ (loop FOR (file) IN files
64+ DO (with-open-file (in file :element-type 'uint1)
65+ (loop FOR b = (read-byte in nil nil)
66+ WHILE b
67+ DO (write-byte b out))))))
68+
69+(defun merge-files-reverse-order (destination files)
70+ ;; write each file size
71+ (with-open-output-file (out destination 'uint4)
72+ (loop FOR (file) IN files
73+ DO (with-open-file (in file :element-type 'uint1)
74+ (write-byte (byte-reverse (file-length in) 4) out))))
75+
76+ ;; write each file content
77+ (loop FOR (file type) IN files
78+ FOR byte-size = (ecase type (uint4 4) (uint4 4))
79+ DO
80+ (with-open-output-file (out destination type :if-exists :append)
81+ (with-open-file (in file :element-type type)
82+ (loop FOR b = (read-byte in nil nil)
83+ WHILE b
84+ DO (write-byte (byte-reverse b byte-size) out))))))
85+
86+(defun merge-files (destination byte-order files)
87+ (if (or (eq byte-order :native)
88+ (eq byte-order +NATIVE_ORDER+))
89+ (merge-files-native-order destination files)
90+ (merge-files-reverse-order destination files))
91+ (mapc #'delete-file (mapcar #'first files)))
92+
93+(defmacro show (fmt &rest args)
94+ `(when show-progress
95+ (format t ,fmt ,@args)))
96+
97+;;;;;;;;;;;;;;;;;;
98+;;; build function
99+(defun write-node-impl (node da)
100+ (with-slots (index type base terminal? sibling-total chck children) (the node node)
101+ (let ((n 0))
102+ (declare ((unsigned-byte 32) n))
103+ (setf (ldb (byte 24 0) n) base
104+ (ldb (byte 8 24) n) chck)
105+ (output:write-uint n (da-node da) :position index))))
106+
107+(defun write-node (node da &key base)
108+ (when base
109+ (setf (node-base node) base))
110+ (write-node-impl node da))
111+
112+(defmacro show-and-write-node (node da &key base)
113+ `(progn
114+ (incf #1=(da-done-count ,da))
115+ (when (and show-progress (zerop (mod #1# 100000)))
116+ (show "; ~a nodes~%" #1#))
117+ (write-node ,node ,da :base ,base)))
118+
119+(defun build-impl (trie alloca da node memo &optional show-progress)
120+ (let ((children (bintrie:collect-children trie)))
121+ (loop WHILE (and (not #1=(gethash (bintrie:node-child trie) memo))
122+ (null (cdr children))
123+ (not (bintrie::node-terminal? (car children)))
124+ (child-acceptable-p node))
125+ DO
126+ (add-child node (car children))
127+ (setf trie (car children))
128+ (setf children (bintrie:collect-children trie)))
129+
130+ (a.if #1#
131+ (show-and-write-node node da :base it)
132+ (if (null children)
133+ (show-and-write-node node da)
134+ (let ((base-idx (node-allocator:allocate
135+ alloca
136+ (mapcar #'bintrie:node-label children))))
137+ (setf #1# base-idx)
138+ (show-and-write-node node da :base base-idx)
139+
140+ (dolist (child children)
141+ (build-impl child alloca da (new-node base-idx child) memo show-progress)))))))
142+
143+
144+;;;;;;;;;;;;;;;;;;;;;
145+;;; external function
146+(defun build-from-bintrie (trie &key output-file byte-order show-progress)
147+ (show "~2&; build double array from trie:~%")
148+ (let ((node-file (format nil "~a.node" output-file))
149+ (exts-file (format nil "~a.ext" output-file)))
150+ (show "; create tmpfiles: ~a, ~a~%" node-file exts-file)
151+
152+ (show "; build:~%")
153+ (output:with-output (node node-file :byte-width 4)
154+ (with-open-output-file (exts exts-file 'uint4)
155+ (let ((da (make-da :node node :exts exts)))
156+ (build-impl trie (node-allocator:make) da
157+ (new-node 0 trie)
158+ (make-hash-table :test #'eq)
159+ show-progress))))
160+ (show "; concatenate tempfiles to ~A~%" output-file)
161+ (merge-files output-file byte-order `((,node-file uint4) (,exts-file uint4))))
162+ 'done)
163+
164+(package-alias :dawg.double-array.node-allocator)
165+(package-alias :dawg.double-array.buffered-output)
166+(package-alias :dawg.bintrie-builder)
--- trunk/lisp/lib/cl-dawg-0.2.2-unf/byte-order.lisp (nonexistent)
+++ trunk/lisp/lib/cl-dawg-0.2.2-unf/byte-order.lisp (revision 54)
@@ -0,0 +1,16 @@
1+;# <- バイトオーダー判定用文字列
2+;; ファイル名: byte-order.lisp
3+(in-package :dawg.global)
4+
5+(eval-when (:compile-toplevel :load-toplevel)
6+ (defun guess-byte-order (sample-file)
7+ (with-open-file (1byte sample-file :element-type '(unsigned-byte 8))
8+ (with-open-file (2byte sample-file :element-type '(unsigned-byte 16))
9+ (if (= (read-byte 2byte)
10+ (+ (ash (read-byte 1byte) 8) (read-byte 1byte)))
11+ :big
12+ :little)))))
13+
14+(defconstant +NATIVE_ORDER+ (guess-byte-order (or *COMPILE-FILE-PATHNAME*
15+ *LOAD-PATHNAME*)))
16+
--- trunk/lisp/lib/cl-dawg-0.2.2-unf/bintrie-builder.lisp (nonexistent)
+++ trunk/lisp/lib/cl-dawg-0.2.2-unf/bintrie-builder.lisp (revision 54)
@@ -0,0 +1,168 @@
1+(defpackage dawg.bintrie-builder
2+ (:use :common-lisp :dawg.global)
3+ (:export build-from-file
4+ build-from-list
5+ collect-children
6+ node-label
7+ node-terminal?
8+ node-sibling-total
9+ node-child
10+ node-options
11+ element-count))
12+(in-package :dawg.bintrie-builder)
13+
14+(package-alias :dawg.octet-stream :stream)
15+
16+;;;;;;;;;;;;;;;
17+;;; declamation
18+(declaim #.*fastest*
19+ (inline make-node collect-children calc-child-total calc-sibling-total
20+ node-options element-count))
21+
22+;;;;;;;;
23+;;; node
24+(defstruct node
25+ (label 0 :type octet)
26+ (terminal? nil :type boolean)
27+ (child nil :type (or null node))
28+ (sibling nil :type (or null node))
29+ (child-total 0 :type positive-fixnum) ; amount of child side nodes
30+ (sibling-total 0 :type positive-fixnum) ; amount of sibling side nodes
31+ (value -1 :type fixnum)
32+ (hash -1 :type fixnum))
33+
34+;;;;;;;;;;;;;;;;;;;;;;
35+;;; auxiliary function
36+(macrolet ((calc-xxx-total (node slot)
37+ `(with-slots (,slot) (the node ,node)
38+ (if (null ,slot)
39+ 0
40+ (the positive-fixnum
41+ (+ (if (node-terminal? ,slot) 1 0)
42+ (node-child-total ,slot) (node-sibling-total ,slot)))))))
43+ (defun calc-child-total (node) (calc-xxx-total node child))
44+ (defun calc-sibling-total (node) (calc-xxx-total node sibling)))
45+
46+;;;;;;;;;;;;;;;;;
47+;;; hash function
48+(defun node= (n1 n2)
49+ (and (eq (node-child n1) (node-child n2))
50+ (eq (node-sibling n1) (node-sibling n2))
51+ (= (node-value n1) (node-value n2))
52+ (= (node-label n1) (node-label n2))
53+ (eq (node-terminal? n1) (node-terminal? n2))))
54+
55+(defun sxhash-node (node)
56+ (if (null node)
57+ #.(sxhash nil)
58+ (with-slots (hash child-total sibling-total) (the node node)
59+ (when (= -1 hash)
60+ (setf hash (logxor (sxhash (node-label node))
61+ (sxhash (node-value node))
62+ (sxhash (node-terminal? node))
63+ (fixnumize (* (sxhash-node (node-child node)) 7))
64+ (fixnumize (* (sxhash-node (node-sibling node)) 13))))
65+ (setf child-total (calc-child-total node)
66+ sibling-total (calc-sibling-total node)))
67+ hash)))
68+
69+;;;;;;;;;;;;;;;;;;
70+;;; build function
71+(defun share (node memo)
72+ (if (null node)
73+ nil
74+ (or (dict:get node memo)
75+ (progn
76+ (setf (node-child node) (share (node-child node) memo)
77+ (node-sibling node) (share (node-sibling node) memo))
78+ (dict:get node memo))
79+ (setf (dict:get node memo) node))))
80+
81+(defun push-child (in parent value)
82+ (if (stream:eos? in)
83+ (setf (node-terminal? parent) t
84+ (node-value parent) value)
85+ (let ((new-node (make-node :label (stream:read in))))
86+ (shiftf (node-sibling new-node) (node-child parent) new-node)
87+ (push-child in new-node value))))
88+
89+(defun insert (in parent memo value)
90+ (let ((node (node-child parent)))
91+ (if (or (null node)
92+ (stream:eos? in)
93+ (/= (stream:peek in) (node-label node)))
94+ (progn
95+ (setf (node-child parent) (share node memo))
96+ (push-child in parent value))
97+ (insert (stream:eat in) node memo value))))
98+
99+(defun build-impl (key-generator show-progress)
100+ (loop WITH trie = (make-node)
101+ WITH memo = (dict:make :test #'node= :hash #'sxhash-node)
102+ FOR num fixnum FROM 0
103+ FOR (key . value) = (funcall key-generator)
104+ WHILE key
105+ DO
106+ (when (and show-progress (zerop (mod num 100000)))
107+ (format t "~&; ~A~%" num))
108+ (let ((in (stream:make key)))
109+ (declare (dynamic-extent in))
110+ (insert in trie memo value))
111+
112+ FINALLY
113+ (return (share trie memo))))
114+
115+(defun build-from-list (keyset &key show-progress)
116+ (when show-progress
117+ (format t "~&; build trie list (size ~A):~%" (length keyset)))
118+ (build-impl (lambda () (prog1 (car keyset)
119+ (setf keyset (cdr keyset))))
120+ show-progress))
121+
122+(defun build-from-file (filepath &key show-progress)
123+ (when show-progress
124+ (format t "~&; build trie from ~A:~%" filepath))
125+ (with-open-file (in filepath)
126+ (build-impl (lambda () (read-line in nil nil))
127+ show-progress)))
128+
129+;;;;;;;;;;;;;;;;;;;;;;;;;;;
130+;;; other external function
131+(defun node-options (node)
132+ "Encode terminal? and sibling-total fields into fixnum"
133+ (with-slots (terminal? sibling-total) (the node node)
134+ (fixnumize
135+ (+ (if terminal? 1 0)
136+ (ash sibling-total 1)))))
137+
138+(defun element-count (node)
139+ (with-slots (terminal? child-total) (the node node)
140+ (the fixnum (+ (if terminal? 1 0) child-total))))
141+
142+(defun collect-children (node)
143+ (loop WITH acc = '()
144+ FOR child = (node-child node)
145+ THEN (node-sibling child)
146+ WHILE child
147+ DO
148+ (push child acc)
149+ FINALLY
150+ (return acc)))
151+
152+;;;;;;;;;;;;;
153+;;; for debug
154+(defun member? (key trie)
155+ (declare #.*interface*
156+ (simple-characters key)
157+ (node trie))
158+ (let ((in (stream:make key)))
159+ (declare (dynamic-extent in))
160+ (nlet recur ((in in) (node (node-child trie)) (parent trie))
161+ (cond ((stream:eos? in) (node-terminal? parent))
162+ ((null node) nil)
163+ ((= (stream:peek in) (node-label node))
164+ (recur (stream:eat in) (node-child node) node))
165+ ((< (stream:peek in) (node-label node))
166+ (recur in (node-sibling node) parent))))))
167+
168+(package-alias :dawg.octet-stream)
--- trunk/lisp/lib/cl-dawg-0.2.2-unf/global.lisp (nonexistent)
+++ trunk/lisp/lib/cl-dawg-0.2.2-unf/global.lisp (revision 54)
@@ -0,0 +1,91 @@
1+(defpackage dawg.global
2+ (:use :common-lisp)
3+ (:export ;; special variable
4+ *fastest*
5+ *interface*
6+
7+ ;; type
8+ array-index
9+ positive-fixnum
10+ octet
11+ simple-characters
12+ unicode
13+ uint8
14+ uint4
15+ uint1
16+
17+ ;; byte order
18+ +NATIVE_ORDER+
19+ byte-reverse
20+
21+ ;; utility function
22+ fixnumize
23+ package-alias
24+ muffle
25+ a.if
26+ nlet
27+ with-open-output-file
28+
29+ ;; symbol for anaphoric macro
30+ it))
31+(in-package :dawg.global)
32+
33+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
34+;;; special variable for optimize declaration
35+(defvar *fastest* '(optimize (speed 3) (safety 0) (debug 0) (compilation-speed 0)))
36+(defvar *interface* '(optimize (speed 3) (safety 2) (debug 1) (compilation-speed 0)))
37+
38+;;;;;;;;;;;;;;;;;;;
39+;;; type definition
40+(deftype array-index () `(mod ,array-dimension-limit))
41+(deftype positive-fixnum () `(integer 0 ,most-positive-fixnum))
42+(deftype octet () '(unsigned-byte 8))
43+(deftype simple-characters () '(simple-array character))
44+(deftype unicode () `(mod ,char-code-limit))
45+(deftype uint8 () '(unsigned-byte 64))
46+(deftype uint4 () '(unsigned-byte 32))
47+(deftype uint1 () '(unsigned-byte 8))
48+
49+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
50+;;; unility function and macro
51+(declaim (inline fixnumize))
52+(defun fixnumize (n)
53+ (ldb (byte #.(integer-length most-positive-fixnum) 0) n))
54+
55+(defmacro package-alias (package &rest alias-list)
56+ `(eval-when (:compile-toplevel :load-toplevel :execute)
57+ (rename-package ,package ,package ',alias-list)))
58+
59+(defmacro muffle (&body body)
60+ `(locally
61+ (declare #+SBCL (sb-ext:muffle-conditions sb-ext:compiler-note))
62+ ,@body))
63+
64+(defmacro a.if (exp then else)
65+ `(let ((it ,exp))
66+ (if it
67+ ,then
68+ ,else)))
69+
70+(defmacro nlet (fn-name letargs &body body)
71+ `(labels ((,fn-name ,(mapcar #'car letargs)
72+ ,@body))
73+ (,fn-name ,@(mapcar #'cadr letargs))))
74+
75+(defmacro with-open-output-file ((stream path element-type &key (if-exists :supersede)) &body body)
76+ `(with-open-file (,stream ,path :direction :output
77+ :if-exists ,if-exists
78+ :element-type ,element-type)
79+ ,@body))
80+
81+(declaim (inline byte-reverse))
82+(defun byte-reverse (n size)
83+ (declare ((member 2 4 8) size))
84+ (muffle
85+ (loop FOR u fixnum FROM (1- size) DOWNTO 0
86+ FOR l fixnum FROM 0 TO (1- size)
87+ WHILE (> u l)
88+ DO
89+ (rotatef (ldb (byte 8 (* u 8)) n)
90+ (ldb (byte 8 (* l 8)) n)))
91+ n))
--- trunk/lisp/lib/cl-dawg-0.2.2-unf/COPYING (nonexistent)
+++ trunk/lisp/lib/cl-dawg-0.2.2-unf/COPYING (revision 54)
@@ -0,0 +1,21 @@
1+The MIT License
2+
3+Copyright (c) 2010 Takeru Ohta <phjgt308@gmail.com>
4+
5+Permission is hereby granted, free of charge, to any person obtaining a copy
6+of this software and associated documentation files (the "Software"), to deal
7+in the Software without restriction, including without limitation the rights
8+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+copies of the Software, and to permit persons to whom the Software is
10+furnished to do so, subject to the following conditions:
11+
12+The above copyright notice and this permission notice shall be included in
13+all copies or substantial portions of the Software.
14+
15+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21+THE SOFTWARE.
--- trunk/lisp/lib/cl-dawg-0.2.2-unf/dawg.lisp (nonexistent)
+++ trunk/lisp/lib/cl-dawg-0.2.2-unf/dawg.lisp (revision 54)
@@ -0,0 +1,136 @@
1+(defpackage dawg
2+ (:use :common-lisp :dawg.global)
3+ (:shadow :common-lisp load)
4+ (:export dawg
5+ build
6+ load
7+ member?
8+ get-id))
9+(in-package :dawg)
10+
11+(package-alias :dawg.octet-stream :stream)
12+
13+;;;;;;;;;;;;;;;;;;;;
14+;;; special variable
15+(eval-when (:compile-toplevel)
16+ (defvar *args-type* '(simple-characters dawg &key (:start positive-fixnum)
17+ (:end positive-fixnum))))
18+(defconstant +ARC_LIMIT+ #x100)
19+
20+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
21+;;; dawg (double-array format)
22+(defstruct dawg
23+ (node #() :type (simple-array uint4)))
24+
25+(defmethod print-object ((o dawg) stream)
26+ (print-unreadable-object (o stream :type t :identity t)
27+ (format stream "~A:~A" :node-count (length (dawg-node o)))))
28+
29+;;;;;;;;;;;;;;;
30+;;; declamation
31+(declaim (inline check-encoded-children get-node
32+ base chck terminal? sibling-total inc-id
33+ get-id-impl member?-impl
34+ each-common-prefix-impl each-predictive-impl)
35+ (ftype (function #.*args-type* boolean) member?)
36+ (ftype (function #.*args-type* (or null positive-fixnum)) get-id))
37+
38+;;;;;;;;;;;;;;;;;;;;;;;;;
39+;;; auxiliary function(1)
40+(defun read-array (index-path &key size element-type offset byte-order)
41+ (declare ((member uint4 uint8) element-type))
42+ (with-open-file (in index-path :element-type element-type)
43+ (file-position in offset)
44+ (let ((ary (make-array size :element-type element-type)))
45+ (read-sequence ary in)
46+ (unless (or (eq byte-order :native)
47+ (eq byte-order +NATIVE_ORDER+))
48+ (let ((byte-size (ecase element-type
49+ (uint4 4)
50+ (uint8 8))))
51+ (dotimes (i size)
52+ (setf (aref ary i) (byte-reverse (aref ary i) byte-size)))))
53+ ary)))
54+
55+;;;;;;;;;;;;;;;;;;;;;;;;
56+;;; external function(1)
57+(defun build (&key input output (byte-order :native) show-progress)
58+ (declare ((or string pathname list) input)
59+ ((or string pathname) output)
60+ ((member :native :little :big) byte-order))
61+ (let ((trie (if (listp input)
62+ (dawg.bintrie-builder:build-from-list input :show-progress show-progress)
63+ (dawg.bintrie-builder:build-from-file input :show-progress show-progress))))
64+ (dawg.double-array-builder:build-from-bintrie
65+ trie :output-file output :byte-order byte-order :show-progress show-progress))
66+ t)
67+
68+(defun load (index-path &key (byte-order :native))
69+ (declare ((or string pathname file-stream) index-path)
70+ ((member :native :little :big) byte-order))
71+ (let ((sizes (read-array index-path :size 2 :element-type 'uint4 :offset 0
72+ :byte-order byte-order)))
73+ (make-dawg
74+ :node (read-array index-path :element-type 'uint4
75+ :size (/ (aref sizes 0) 4)
76+ :offset 2
77+ :byte-order byte-order))))
78+
79+;;;;;;;;;;;;;;;;;;;;;;;;;
80+;;; auxiliary function(2)
81+(defun base (node) (ldb (byte 24 0) (the uint8 node)))
82+(defun chck (node) (ldb (byte 8 24) (the uint8 node)))
83+
84+(defun get-node (dawg index)
85+ (declare (dawg dawg)
86+ (positive-fixnum index))
87+ (aref (dawg-node dawg) index))
88+
89+;;;;;;;;;;;;;;;;;;;;;;;;
90+;;; external function(2)
91+(defun member?-impl (in dawg)
92+ (nlet recur ((node (get-node dawg 0)))
93+ (if (stream:eos? in)
94+ t
95+ (let* ((arc (stream:read in))
96+ (next (get-node dawg (+ (base node) arc))))
97+ (when (= (chck next) arc)
98+ (recur next))))))
99+
100+(defun get-id-impl (in dawg)
101+ (nlet recur ((node (get-node dawg 0)))
102+ (print (list :in #1=(+ (base node) (stream:peek in)) (get-node dawg #1#)
103+ (stream:peek in)))
104+ (if (stream:eos? in)
105+ (ldb (byte 24 0) node)
106+ (let* ((arc (stream:read in))
107+ (next (get-node dawg (+ (base node) arc))))
108+ (when (= (chck next) arc)
109+ (recur next))))))
110+
111+
112+(defmacro with-key-stream ((in key &key start end) &body body)
113+ (let ((k (gensym))
114+ (s (gensym))
115+ (e (gensym)))
116+ `(let ((,k ,key)
117+ (,s ,start)
118+ (,e ,end))
119+ (declare #.*interface*
120+ (simple-characters ,k)
121+ (positive-fixnum ,s ,e))
122+ (locally
123+ (declare #.*fastest*)
124+ (let ((,in (stream:make ,k :start ,s :end ,e)))
125+ (declare (dynamic-extent ,in))
126+ ,@body)))))
127+
128+(defun member? (key dawg &key (start 0) (end (length key)))
129+ (with-key-stream (in key :start start :end end)
130+ (member?-impl in dawg)))
131+
132+(defun get-id (key dawg &key (start 0) (end (length key)))
133+ (with-key-stream (in key :start start :end end)
134+ (get-id-impl in dawg)))
135+
136+(package-alias :dawg.octet-stream)
--- trunk/lisp/lib/cl-dawg-0.2.2-unf/double-array-buffered-output.lisp (nonexistent)
+++ trunk/lisp/lib/cl-dawg-0.2.2-unf/double-array-buffered-output.lisp (revision 54)
@@ -0,0 +1,64 @@
1+(defpackage dawg.double-array.buffered-output
2+ (:use :common-lisp :dawg.global)
3+ (:export buffered-output
4+ with-output
5+ write-uint))
6+(in-package :dawg.double-array.buffered-output)
7+
8+;;;;;;;;;;;;;;;
9+;;; declamation
10+(declaim #.*fastest*)
11+
12+;;;;;;;;;;;;
13+;;; constant
14+(defconstant +BUFFER_SIZE+ 819200)
15+
16+;;;;;;;;;;;;;;;;;;;
17+;; buffered-output
18+(defstruct buffered-output
19+ (binary-output nil :type file-stream)
20+ (buffer #() :type simple-array)
21+ (offset 0 :type array-index))
22+
23+;;;;;;;;;;;;;;;;;;;;;
24+;;; external function
25+(defmacro with-output ((out path &key (byte-width 1)) &body body)
26+ (declare ((member 1 2 4 8) byte-width))
27+ `(with-open-file (,out ,path :element-type #1='(unsigned-byte ,(* 8 byte-width))
28+ :direction :output
29+ :if-exists :supersede)
30+ (let ((,out (make-buffered-output
31+ :binary-output ,out
32+ :buffer (make-array ,+BUFFER_SIZE+ :element-type #1#
33+ :initial-element #xFF000000))))
34+ (unwind-protect
35+ (locally ,@body)
36+ (flush ,out :final t)))))
37+
38+(defun write-uint (uint out &key (position 0))
39+ (declare (buffered-output out)
40+ (positive-fixnum position))
41+ (with-slots (binary-output buffer offset) out
42+ (cond ((< position offset)
43+ (file-position binary-output position)
44+ (write-byte uint binary-output))
45+ ((< position (+ offset +BUFFER_SIZE+))
46+ (muffle
47+ (setf (aref buffer (- position offset)) uint)))
48+ (t
49+ (flush out)
50+ (incf offset +BUFFER_SIZE+)
51+ (fill buffer #xFF000000)
52+ (write-uint uint out :position position)))))
53+
54+(defun flush (out &key final)
55+ (declare (buffered-output out))
56+ (with-slots (binary-output buffer offset) out
57+ (file-position binary-output offset)
58+ (if (null final)
59+ (write-sequence buffer binary-output)
60+ (let ((end (muffle
61+ (or (position-if-not (lambda (x) (= x #xFF000000)) buffer :from-end t)
62+ (1- +BUFFER_SIZE+)))))
63+ (write-sequence buffer binary-output :end (1+ end))
64+ (loop REPEAT #x100 DO (write-byte #xFF000000 binary-output))))))
--- trunk/lisp/lib/cl-dawg-0.2.2-unf/README (nonexistent)
+++ trunk/lisp/lib/cl-dawg-0.2.2-unf/README (revision 54)
@@ -0,0 +1,98 @@
1+[概要]
2+・DAWG(Direct Acyclic Word Graph)によるマップの実装
3+・入力キーセットを静的に受け取り、各キーに一意なIDをマッピングする
4+ ・入力キーセットはソート済みであり、かつ各キーはユニークでなければならない
5+ ・キーのID == 先頭から数えたキーの出現位置、となる ※ 先頭のキーのID値は0となる
6+・数千万程度のキーセットをCommon Lispで比較的手軽に扱えるようにするのが目的
7+ ・2~4GBのメモリを積んでいる32bitマシンで
8+・masterからSBCLへの依存性を排除したブランチ
9+
10+
11+[バージョン]
12+・0.2.2
13+
14+
15+[依存パッケージ]
16+・dict-0.0.2
17+ ・ポータブルなハッシュテーブル
18+ ・https://github.com/sile/dict
19+
20+
21+[インストール]
22+> (require :asdf)
23+> (require :asdf-install)
24+> (asdf-install:install "cl-dawg-VERSION.tar.gz")
25+
26+
27+[API]
28+###
29+# dawg
30+ メインパッケージ
31+
32+####
33+# dawg:build (&key input output byte-order show-progress) => t
34+ キーセットファイル※1からDAWGインデックスファイル※2を作成する。
35+ = input: 入力キーセットファイルのパス名 or キーセットが格納されたリスト (必須)
36+ = output: 出力DAWGインデックスファイルのパス名 (必須)
37+ = byte-order: インデックスファイルのバイトオーダーを指定する。:native or :big or :little
38+        デフォルトは :native
39+ = show-progress: 作成時の進捗表示を行うかどうか。デフォルトはnil
40+
41+※1: キーセットが昇順に改行区切りで格納されているファイル
42+※2: 正確にはDAWGをDoubleArray形式で保存したインデックスファイル
43+
44+####
45+# dawg:load (index-path &key byte-order) => dawg:dawg
46+ DAWGインデックスファイルからDAWG(DoubleArray形式)を読み込む。
47+ = index-path: build関数で作成したDAWGインデックスファイル
48+ = byte-order: インデックスファイルのバイトオーダーを指定する。:native or :big or :little
49+        デフォルトは :native
50+
51+###
52+# dawg:member? (key dawg &key start end) => boolean
53+ キーがDAWGに含まれているかどうかを判定する。
54+ = key: 対象のキー文字列。(simple-array character)型でなければならない
55+ = dawg: DAWG
56+ = start: キー文字列内の開始位置
57+ = end: キー文字列内の終端位置
58+
59+###
60+# dawg:get-id (key dawg &key start end) => (or null fixnum)
61+ キーに紐付くIDを取得する。
62+ キーがDAWG内に存在しない場合はnilを返す。
63+ = key: 対象のキー文字列。(simple-array character)型でなければならない
64+ = dawg: DAWG
65+ = start: キー文字列内の開始位置
66+ = end: キー文字列内の終端位置
67+ 
68+###
69+# dawg:each-common-prefix ((match-id match-end)
70+ (key dawg &key start end)
71+ &body body)
72+ 入力キーに対して共通接頭辞検索を行う。
73+ 入力キーの接頭辞部分にマッチするDAWG内の各キーに対して、body部分の処理が実行される。
74+ ※ return関数を使うことで、途中でループを抜けることが可能
75+ = match-id: 入力キーの接頭辞部分にマッチしたキーのID値
76+ = match-end: 入力キー内のマッチした部分の終端位置
77+ = key: 入力キー文字列。(simple-array character)型でなければならない
78+ = start: キー文字列内の開始位置
79+ = end: キー文字列内の終端位置
80+ = body: マッチの度に実行される式
81+
82+###
83+# dawg:each-predictive ((match-id)
84+ (key dawg &key start end)
85+ &body body)
86+ 入力キーが接頭辞となる全ての要素を走査する。
87+ 各走査で要素のIDを受け取り、body部分の処理が実行される。
88+ ※ return関数を使うことで、途中でループを抜けることが可能
89+ = match-id: 入力キーが接頭辞となる要素のID値
90+ = key: 入力キー文字列。(simple-array character)型でなければならない
91+ = start: キー文字列内の開始位置
92+ = end: キー文字列内の終端位置
93+ = body: マッチの度に実行される式
94+
95+
96+[注意事項]
97+・DAWGのキー内の文字にヌル文字を含めることは出来ない
98+ ・DoubleArray中で(CHECK配列の初期値として)特別扱いしているため
--- trunk/lisp/lib/cl-dawg-0.2.2-unf/double-array-node-allocator.lisp (nonexistent)
+++ trunk/lisp/lib/cl-dawg-0.2.2-unf/double-array-node-allocator.lisp (revision 54)
@@ -0,0 +1,131 @@
1+(defpackage dawg.double-array.node-allocator
2+ (:use :common-lisp :dawg.global)
3+ (:export make
4+ allocate))
5+(in-package :dawg.double-array.node-allocator)
6+
7+;;;;;;;;;;;;;;;
8+;;; declamation
9+(declaim #.*fastest*
10+ (inline get-next can-allocate?))
11+
12+;;;;;;;;;;;;
13+;;; constant
14+(eval-when (:compile-toplevel :load-toplevel :execute)
15+ (defconstant +BUFFER_SIZE+ 89120))
16+
17+;;;;;;;;;;;;;;;;;;
18+;;; node-allocator
19+(defstruct node-allocator
20+ (head #x100 :type array-index)
21+ (bits #* :type (simple-bit-vector #.+BUFFER_SIZE+))
22+ (nexts #() :type (simple-array fixnum (#.+BUFFER_SIZE+)))
23+ (prevs #() :type (simple-array fixnum (#.+BUFFER_SIZE+)))
24+ (offset 0 :type array-index))
25+
26+;;;;;;;;;;;;;;;
27+;;; constructor
28+(defun make ()
29+ (let ((bits (make-array +BUFFER_SIZE+ :element-type 'bit :initial-element 0))
30+ (nexts (make-array +BUFFER_SIZE+ :element-type 'fixnum))
31+ (prevs (make-array +BUFFER_SIZE+ :element-type 'fixnum)))
32+ (loop FOR i FROM 0 BELOW +BUFFER_SIZE+
33+ DO
34+ (setf (aref nexts i) (1+ i)
35+ (aref prevs i) (1- i)))
36+ (make-node-allocator :nexts nexts :prevs prevs :bits bits)))
37+
38+;;;;;;;;;;;;;;;;;;;;;;
39+;;; auxiliary function
40+(defun shift (alloca)
41+ (with-slots (bits nexts prevs offset head) (the node-allocator alloca)
42+ (let ((new-offset head))
43+ (loop WHILE (< new-offset (+ offset (- +BUFFER_SIZE+ (* #x100 2))))
44+ DO
45+ (setf new-offset (aref nexts (- new-offset offset))))
46+ (let* ((delta (- new-offset offset))
47+ (use-len (- +BUFFER_SIZE+ delta)))
48+ (shiftf (subseq bits 0 use-len) (subseq bits delta))
49+ (fill bits 0 :start use-len)
50+
51+ (setf offset new-offset)
52+
53+ (shiftf (subseq nexts 0 use-len) (subseq nexts delta))
54+ (shiftf (subseq prevs 0 use-len) (subseq prevs delta))
55+ (loop FOR i FROM (+ offset use-len) BELOW (+ offset +BUFFER_SIZE+)
56+ DO
57+ (setf (aref nexts (- i offset)) (1+ i)
58+ (aref prevs (- i offset)) (1- i)))
59+
60+ (setf head offset)
61+ (loop WHILE (< head (+ offset #x100))
62+ DO
63+ (setf head (aref nexts (- head offset)))))))
64+ alloca)
65+
66+(defun ref (alloca index)
67+ (declare (array-index index))
68+ (with-slots (offset nexts) (the node-allocator alloca)
69+ (if (<= (+ offset +BUFFER_SIZE+) index)
70+ (ref (shift alloca) index)
71+ (aref nexts (- index offset)))))
72+
73+(defun bref (alloca index)
74+ (declare (array-index index))
75+ (with-slots (bits offset) (the node-allocator alloca)
76+ (if (> offset index)
77+ 1
78+ (if (<= (+ offset +BUFFER_SIZE+) index)
79+ (bref (shift alloca) index)
80+ (bit bits (- index offset))))))
81+
82+(defun get-next (alloca index)
83+ (ref alloca index))
84+
85+(defun can-allocate? (alloca index arcs)
86+ (declare (list arcs)
87+ (array-index index))
88+ (and (zerop (bref alloca index))
89+ (every (lambda (arc)
90+ (declare (octet arc))
91+ (/= -1 (ref alloca (+ index arc))))
92+ arcs)))
93+
94+(defun allocate-impl (alloca index arcs)
95+ (declare (array-index index))
96+ (with-slots (bits head prevs nexts offset) (the node-allocator alloca)
97+ (when (<= offset index)
98+ (setf (bit bits (- index offset)) 1))
99+ (loop WITH base = index
100+ FOR arc OF-TYPE (mod #x100) IN arcs
101+ FOR index OF-TYPE fixnum = (+ base arc)
102+ DO
103+ (when (<= offset index)
104+ (ref alloca index)
105+
106+ (let ((prev (aref prevs (- index offset)))
107+ (next (aref nexts (- index offset))))
108+ (setf (aref prevs (- index offset)) -1
109+ (aref nexts (- index offset)) -1)
110+
111+ (when (= head index)
112+ (setf head next))
113+
114+ (when (<= offset prev)
115+ (setf (aref nexts (- prev offset)) next))
116+
117+ (when (<= offset next)
118+ (ref alloca next)
119+ (setf (aref prevs (- next offset)) prev)))))))
120+
121+;;;;;;;;;;;;;;;;;;;;;
122+;;; external function
123+(defun allocate (alloca arcs)
124+ (with-slots (head) (the node-allocator alloca)
125+ (loop WITH front OF-TYPE (mod #x100) = (car arcs)
126+ FOR cur = (get-next alloca head) THEN (get-next alloca cur)
127+ FOR base OF-TYPE fixnum = (- cur front)
128+ UNTIL (and (plusp base) (can-allocate? alloca base (cdr arcs)))
129+ FINALLY
130+ (allocate-impl alloca base arcs)
131+ (return base))))
--- trunk/lisp/lib/cl-dawg-0.2.2-unf/octet-stream.lisp (nonexistent)
+++ trunk/lisp/lib/cl-dawg-0.2.2-unf/octet-stream.lisp (revision 54)
@@ -0,0 +1,78 @@
1+(defpackage dawg.octet-stream
2+ (:use :common-lisp :dawg.global)
3+ (:shadow :common-lisp read peek position)
4+ (:export make
5+ read
6+ peek
7+ eos?
8+ eat
9+ position))
10+(in-package :dawg.octet-stream)
11+
12+;;;;;;;;;;;;;;;;
13+;;; declamation
14+(declaim #.*fastest*
15+ (inline make-octet-stream make eos? octet-length peek read eat position))
16+
17+;;;;;;;;;;;;;;;;
18+;;; octet-stream
19+(defstruct octet-stream
20+ (src "" :type simple-characters)
21+ (pos 0 :type array-index)
22+ (end 0 :type array-index)
23+ (code 0 :type unicode)
24+ (octet-pos 0 :type (mod 5))
25+ (octet-len 0 :type (mod 5)))
26+
27+;;;;;;;;;;;;;;;;;;;;;;
28+;;; auxiliary function
29+(defun octet-length (code)
30+ (declare (unicode code))
31+ (cond ((< code #x80) 1)
32+ ((< code #x800) 2)
33+ ((< code #x10000) 3)
34+ (t 4)))
35+
36+;;;;;;;;;;;;;;;;;;;;;
37+;;; external function
38+(defun position (in)
39+ (octet-stream-pos in))
40+
41+(defun make (string &key (start 0) (end (length string)))
42+ (declare (simple-characters string)
43+ (array-index start end))
44+ (let* ((code (if (= start (length string))
45+ 0
46+ (char-code (char string start))))
47+ (len (octet-length code)))
48+ (make-octet-stream :src string :pos start :end end
49+ :code code :octet-pos len :octet-len len)))
50+
51+(defun eos? (in)
52+ (with-slots ( pos end) (the octet-stream in)
53+ (= pos end)))
54+
55+(defun peek (in)
56+ (with-slots (code octet-pos octet-len) (the octet-stream in)
57+ (if (= octet-pos octet-len)
58+ (case octet-len
59+ (1 code)
60+ (2 (+ #b11000000 (ldb (byte 5 6) code)))
61+ (3 (+ #b11100000 (ldb (byte 4 12) code)))
62+ (t (+ #b11110000 (ldb (byte 3 18) code))))
63+ (+ #b10000000 (ldb (byte 6 (* (the (mod 4) (1- octet-pos)) 6)) code)))))
64+
65+(defun eat (in)
66+ (with-slots (src pos code octet-pos octet-len) (the octet-stream in)
67+ (decf octet-pos)
68+ (when (zerop octet-pos)
69+ (incf pos)
70+ (unless (eos? in)
71+ (setf code (char-code (char src pos))
72+ octet-len (octet-length code)
73+ octet-pos octet-len))))
74+ in)
75+
76+(defun read (in)
77+ (prog1 (peek in)
78+ (eat in)))
Show on old repository browser