cl-igo: 0.0.1: tags保存
@@ -0,0 +1,78 @@ | ||
1 | +(in-package :igo) | |
2 | + | |
3 | +(igo::set-package-nickname :igo.word-dic :wdc) | |
4 | +(igo::set-package-nickname :igo.unknown :unk) | |
5 | +(igo::set-package-nickname :igo.matrix :mtx) | |
6 | +(igo::set-package-nickname :igo.code-stream :code-stream) | |
7 | +(igo::set-package-nickname :igo.viterbi-node :vn) | |
8 | + | |
9 | +(defstruct tagger | |
10 | + (wdc nil :type wdc:word-dic) | |
11 | + (unk nil :type unk:unknown) | |
12 | + (mtx nil :type mtx:matrix)) | |
13 | + | |
14 | +(defun tagger-new (data-dir &optional (feature-parser #'identity)) | |
15 | + (make-tagger :wdc (wdc:load data-dir feature-parser) | |
16 | + :unk (unk:load data-dir) | |
17 | + :mtx (mtx:load data-dir))) | |
18 | + | |
19 | +(eval-when (:compile-toplevel :load-toplevel :execute) | |
20 | + (igo::defconst-once-only +BOS-NODES+ (list (vn:new-bos/eos)))) | |
21 | + | |
22 | +(defmacro nconcf (lst1 lst2) | |
23 | + `(setf ,lst1 (nconc ,lst1 ,lst2))) | |
24 | + | |
25 | +(defun set-mincost-node (vn prevs mtx wdc) | |
26 | + (flet ((calc-cost (prev cur) | |
27 | + (+ (vn:cost prev) (mtx:link-cost (vn:right-id prev) (vn:left-id cur) mtx)))) | |
28 | + (let ((fst (first prevs))) | |
29 | + (setf (vn:prev vn) fst | |
30 | + (vn:cost vn) (calc-cost fst vn)) | |
31 | + | |
32 | + (dolist (p (cdr prevs)) | |
33 | + (let ((cost (calc-cost p vn))) | |
34 | + (when (< cost (vn:cost vn)) | |
35 | + (setf (vn:cost vn) cost | |
36 | + (vn:prev vn) p)))) | |
37 | + | |
38 | + (incf (vn:cost vn) (wdc:cost (vn:word-id vn) wdc)) | |
39 | + vn))) | |
40 | + | |
41 | +(defun parse-impl (tagger cs len) | |
42 | + (let ((nodes (make-sequence 'simple-vector (1+ len) :initial-element nil)) | |
43 | + (wdc (tagger-wdc tagger)) | |
44 | + (unk (tagger-unk tagger)) | |
45 | + (mtx (tagger-mtx tagger))) | |
46 | + (setf (aref nodes 0) +BOS-NODES+) | |
47 | + | |
48 | + (loop FOR i FROM 0 BELOW len | |
49 | + FOR prevs = (aref nodes i) DO | |
50 | + (setf (code-stream:position cs) i) | |
51 | + (when prevs | |
52 | + (dolist (vn (unk:search cs unk wdc (wdc:search cs '() wdc))) | |
53 | + (if (vn:space? vn) | |
54 | + (nconcf (aref nodes (vn:end vn)) prevs) | |
55 | + (push (set-mincost-node vn prevs mtx wdc) (aref nodes (vn:end vn))))))) | |
56 | + | |
57 | + (vn:prev (set-mincost-node (vn:new-bos/eos) (aref nodes len) mtx wdc)))) | |
58 | + | |
59 | +(defun parse (tagger text &aux (wdc (tagger-wdc tagger)) rlt) | |
60 | + (do ((vn (parse-impl tagger (code-stream:make text 0) (length text)) | |
61 | + (vn:prev vn))) | |
62 | + ((null (vn:prev vn)) rlt) | |
63 | + (push (igo::morpheme-new (subseq text (vn:start vn) (vn:end vn)) | |
64 | + (wdc:word-data (vn:word-id vn) wdc) | |
65 | + (vn:start vn)) | |
66 | + rlt))) | |
67 | + | |
68 | +(defun wakati (tagger text &aux rlt) | |
69 | + (do ((vn (parse-impl tagger (code-stream:make text 0) (length text)) | |
70 | + (vn:prev vn))) | |
71 | + ((null (vn:prev vn)) rlt) | |
72 | + (push (subseq text (vn:start vn) (vn:end vn)) rlt))) | |
73 | + | |
74 | +(igo::delete-package-nickname :igo.word-dic) | |
75 | +(igo::delete-package-nickname :igo.unknown) | |
76 | +(igo::delete-package-nickname :igo.matrix) | |
77 | +(igo::delete-package-nickname :igo.code-stream) | |
78 | +(igo::delete-package-nickname :igo.viterbi-node) | |
\ No newline at end of file |
@@ -0,0 +1,10 @@ | ||
1 | +(defpackage igo | |
2 | + (:use :common-lisp) | |
3 | + (:export morpheme | |
4 | + morpheme-surface | |
5 | + morpheme-feature | |
6 | + morpheme-start | |
7 | + | |
8 | + tagger-new | |
9 | + parse | |
10 | + wakati)) | |
\ No newline at end of file |
@@ -0,0 +1,6 @@ | ||
1 | +(in-package :igo) | |
2 | + | |
3 | +(defstruct (morpheme (:constructor morpheme-new (surface feature start))) | |
4 | + (surface "" :type string) ; TODO: 共有した方が良いかどうかは後で検討する | |
5 | + (feature nil :type t) | |
6 | + (start 0 :type fixnum)) | |
\ No newline at end of file |
@@ -0,0 +1,48 @@ | ||
1 | +(in-package :igo) | |
2 | + | |
3 | +(defmacro set-package-nickname (package nickname) | |
4 | + `(eval-when (:compile-toplevel :load-toplevel :execute) | |
5 | + (rename-package ,package ,package '(,nickname)))) | |
6 | + | |
7 | +(defmacro delete-package-nickname (package) | |
8 | + `(eval-when (:compile-toplevel :load-toplevel :execute) | |
9 | + (rename-package ,package ,package))) | |
10 | + | |
11 | +(defmacro defconst-once-only (name value &optional documentation) | |
12 | + `(unless (boundp ',name) | |
13 | + (defconstant ,name ,value ,documentation))) | |
14 | + | |
15 | +(defun formalize-letargs (args) | |
16 | + (mapcar (lambda (a) (if (atom a) (list a) a)) args)) | |
17 | + | |
18 | +(defmacro nlet (fn-name letargs &body body) | |
19 | + (setf letargs (formalize-letargs letargs)) | |
20 | + `(labels ((,fn-name ,(mapcar #'car letargs) | |
21 | + ,@body)) | |
22 | + (,fn-name ,@(mapcar #'cadr letargs)))) | |
23 | + | |
24 | +;; XXX: | |
25 | +(defmacro split-by-chars (delims str &optional count (remove-delim t)) | |
26 | + (assert (typep delims 'string) (delims) "DELIMS must be STRING (input is ~A)" (type-of delims)) | |
27 | + `(let (tokens (len (length ,str)) ,@(when count (list (list 'cnt count)))) | |
28 | + ,(when count '(declare (fixnum cnt))) | |
29 | + (nlet self ((pos 0) (beg 0)) | |
30 | + (declare (fixnum pos beg)) | |
31 | + (if (= pos len) | |
32 | + (nreverse (if (= beg pos) tokens (cons (subseq ,str beg pos) tokens))) | |
33 | + (case (schar ,str pos) | |
34 | + (,(coerce delims 'list) | |
35 | + (push (subseq ,str beg pos) tokens) | |
36 | + (loop while (and (/= pos len) | |
37 | + (case (schar ,str pos) | |
38 | + (,(coerce delims 'list) | |
39 | + ,(unless remove-delim | |
40 | + `(push (subseq ,str pos (1+ pos)) tokens)) | |
41 | + (incf pos))))) | |
42 | + ,(when count | |
43 | + `(when (zerop (decf cnt)) | |
44 | + (return-from self (nreverse | |
45 | + (if (>= pos len) tokens (cons (subseq ,str pos) tokens)))))) | |
46 | + (self pos pos)) | |
47 | + (otherwise | |
48 | + (self (1+ pos) beg))))))) | |
\ No newline at end of file |
@@ -0,0 +1,62 @@ | ||
1 | +(defpackage :igo.unknown | |
2 | + (:use :common-lisp) | |
3 | + (:shadow load | |
4 | + search) | |
5 | + (:export load | |
6 | + unknown | |
7 | + search)) | |
8 | +(in-package :igo.unknown) | |
9 | + | |
10 | +(igo::set-package-nickname :igo.char-category :cc) | |
11 | +(igo::set-package-nickname :igo.code-stream :code-stream) | |
12 | +(igo::set-package-nickname :igo.word-dic :dic) | |
13 | + | |
14 | +(defstruct (unknown (:conc-name "")) | |
15 | + (categorys nil :type cc::category-set) | |
16 | + (space-id 0 :type fixnum)) | |
17 | + | |
18 | +(defun load (root-dir) | |
19 | + (let* ((cts (cc:load root-dir)) | |
20 | + (unk (make-unknown :categorys cts))) | |
21 | + (setf (space-id unk) | |
22 | + (cc:category-trie-id (cc:category (char-code #\Space) cts))) | |
23 | + unk)) | |
24 | + | |
25 | +(defun search (cs unk wdic result) | |
26 | + (prog* ((start (code-stream:position cs)) | |
27 | + (code (code-stream:read cs)) | |
28 | + (categorys (categorys unk)) | |
29 | + (ct (cc:category code categorys))) | |
30 | + (when (and result | |
31 | + (not (cc:category-invoke? ct))) | |
32 | + (go :end)) | |
33 | + | |
34 | + (let* ((trie-id (cc:category-trie-id ct)) | |
35 | + (space? (= trie-id (space-id unk))) | |
36 | + (limit (cc:category-length ct))) | |
37 | + (loop FOR len FROM 1 TO limit DO | |
38 | + (setf result | |
39 | + ;; XXX: (+ start len) => surrogate? | |
40 | + (dic:search-from-trie-id trie-id start (+ start len) space? result wdic)) | |
41 | + (when (or (code-stream:end? cs) | |
42 | + (not (cc:compatible? code (code-stream:read cs) categorys))) | |
43 | + (go :end))) | |
44 | + | |
45 | + (when (and (cc:category-group? ct)) | |
46 | + (if (code-stream:end? cs) | |
47 | + (setf result (dic:search-from-trie-id trie-id start (code-stream:length cs) | |
48 | + space? result wdic)) | |
49 | + (progn (loop WHILE (and (not (code-stream:end? cs)) | |
50 | + (cc:compatible? code (code-stream:read cs) categorys))) | |
51 | + | |
52 | + (setf result | |
53 | + (dic:search-from-trie-id trie-id start | |
54 | + (1- (code-stream:position cs)) | |
55 | + space? result wdic)))))) | |
56 | + :end | |
57 | + (setf (code-stream:position cs) start)) | |
58 | + result) | |
59 | + | |
60 | +(igo::delete-package-nickname :igo.char-category) | |
61 | +(igo::delete-package-nickname :igo.code-stream) | |
62 | +(igo::delete-package-nickname :igo.word-dic) | |
\ No newline at end of file |
@@ -0,0 +1,82 @@ | ||
1 | +(defpackage igo.code-stream | |
2 | + (:use :common-lisp) | |
3 | + (:shadow read | |
4 | + length | |
5 | + position) | |
6 | + (:export read | |
7 | + read2 | |
8 | + make | |
9 | + end? | |
10 | + length | |
11 | + position | |
12 | + +TERMINATE-CODE+)) | |
13 | +(in-package :igo.code-stream) | |
14 | + | |
15 | +(defstruct (code-stream (:constructor make (source start &aux (cur start))) | |
16 | + (:conc-name "")) | |
17 | + (source "" :type string) | |
18 | + (cur 0 :type fixnum) | |
19 | + (surrogate? nil :type boolean)) | |
20 | + | |
21 | +(eval-when (:compile-toplevel :load-toplevel :execute) | |
22 | + (igo::defconst-once-only +TERMINATE-CODE+ 0)) | |
23 | + | |
24 | +(defun position (code-stream) | |
25 | + (cur code-stream)) | |
26 | + | |
27 | +(defsetf position (code-stream) (new-position) | |
28 | + `(setf (cur ,code-stream) ,new-position)) | |
29 | + | |
30 | +;; XXX: なくす | |
31 | +(defun length (code-stream) | |
32 | + (common-lisp:length (source code-stream))) | |
33 | + | |
34 | +(defun end? (code-stream) | |
35 | + (>= (cur code-stream) (length code-stream))) | |
36 | + | |
37 | +(defun code (code-stream) | |
38 | + (char-code (schar (source code-stream) (cur code-stream)))) | |
39 | + | |
40 | +(defun low-surrogate (code) | |
41 | + (+ #xDC00 (ldb (byte 10 0) code))) | |
42 | + | |
43 | +(defun high-surrogate (code) | |
44 | + (+ #xB800 (- (ldb (byte 11 10) code) #b1000000))) | |
45 | + | |
46 | +(defun read (code-stream) | |
47 | + (with-slots (cur surrogate?) code-stream | |
48 | + (cond (surrogate? | |
49 | + (setf surrogate? nil) | |
50 | + (prog1 (low-surrogate (code code-stream)) | |
51 | + (incf cur))) | |
52 | + | |
53 | + ((end? code-stream) | |
54 | + ;;(incf cur) | |
55 | + +TERMINATE-CODE+) | |
56 | + | |
57 | + (t | |
58 | + (let ((code (code code-stream))) | |
59 | + (if (> code #xFFFF) | |
60 | + (progn (setf surrogate? t) | |
61 | + (high-surrogate code)) | |
62 | + (progn (incf cur) | |
63 | + code))))))) | |
64 | + | |
65 | +;; XXX | |
66 | +(defun read2 (code-stream) | |
67 | + (with-slots (cur surrogate?) code-stream | |
68 | + (cond (surrogate? | |
69 | + (setf surrogate? nil) | |
70 | + (prog1 (low-surrogate (code code-stream)) | |
71 | + (incf cur))) | |
72 | + | |
73 | + ((end? code-stream) | |
74 | + +TERMINATE-CODE+) | |
75 | + | |
76 | + (t | |
77 | + (let ((code (code code-stream))) | |
78 | + (if (> code #xFFFF) | |
79 | + (progn (setf surrogate? t) | |
80 | + (high-surrogate code)) | |
81 | + (progn (incf cur) | |
82 | + code))))))) | |
\ No newline at end of file |
@@ -0,0 +1,71 @@ | ||
1 | +(defpackage igo.trie | |
2 | + (:use :common-lisp) | |
3 | + (:shadow load) | |
4 | + (:export trie | |
5 | + load | |
6 | + each-common-prefix)) | |
7 | +(in-package :igo.trie) | |
8 | + | |
9 | +(igo::set-package-nickname :igo.varied-byte-stream :vbs) | |
10 | +(igo::set-package-nickname :igo.code-stream :code-stream) | |
11 | + | |
12 | +(defstruct (trie (:conc-name "")) | |
13 | + (element-count 0 :type fixnum) | |
14 | + (begs #() :type (simple-array (signed-byte 32))) | |
15 | + (lens #() :type (simple-array (signed-byte 16))) | |
16 | + (base #() :type (simple-array (signed-byte 32))) | |
17 | + (chck #() :type (simple-array (unsigned-byte 16))) | |
18 | + (tail #() :type (simple-array (unsigned-byte 16)))) | |
19 | + | |
20 | +(defmethod print-object ((o trie) stream) | |
21 | + (print-unreadable-object (o stream :type t) | |
22 | + (format stream ":element-count ~D" (element-count o)))) | |
23 | + | |
24 | +(defun load (path) | |
25 | + (vbs:with-input-file (in path) | |
26 | + (let ((node-size (vbs:read-byte in 4)) | |
27 | + (tind-size (vbs:read-byte in 4)) | |
28 | + (tail-size (vbs:read-byte in 4))) | |
29 | + (make-trie | |
30 | + :element-count tind-size | |
31 | + :begs (vbs:read-sequence in 4 tind-size) | |
32 | + :base (vbs:read-sequence in 4 node-size) | |
33 | + :lens (vbs:read-sequence in 2 tind-size) | |
34 | + :chck (vbs:read-sequence in 2 node-size :signed nil) | |
35 | + :tail (vbs:read-sequence in 2 tail-size :signed nil))))) | |
36 | + | |
37 | +(defun id (node) | |
38 | + (1- (- node))) | |
39 | + | |
40 | +(defun including-tail? (cs node trie &aux (id (id node))) | |
41 | + (let ((tail (tail trie)) | |
42 | + (beg (aref (begs trie) id)) | |
43 | + (len (aref (lens trie) id))) | |
44 | + (loop FOR i FROM beg BELOW (+ beg len) | |
45 | + ALWAYS (= (aref tail i) (code-stream:read cs))))) | |
46 | + | |
47 | +(defun each-common-prefix (fn cs trie) | |
48 | + (let* ((base (base trie)) | |
49 | + (chck (chck trie)) | |
50 | + (node (aref base 0))) | |
51 | + (loop FOR code = (code-stream:read cs) DO | |
52 | + (let ((terminal-idx (+ node code-stream:+TERMINATE-CODE+))) | |
53 | + (when (= (aref chck terminal-idx) code-stream:+TERMINATE-CODE+) | |
54 | + (if (code-stream:end? cs) | |
55 | + (progn (funcall fn (code-stream:position cs) (id (aref base terminal-idx))) | |
56 | + (return-from each-common-prefix)) | |
57 | + (funcall fn (1- (code-stream:position cs)) (id (aref base terminal-idx)))))) | |
58 | + | |
59 | + (prog ((idx (+ node code))) | |
60 | + (setf node (aref base idx)) | |
61 | + (when (= (aref chck idx) code) | |
62 | + (if (plusp node) | |
63 | + (go :continue) | |
64 | + (when (including-tail? cs node trie) | |
65 | + (funcall fn (code-stream:position cs) (id node))))) | |
66 | + (return-from each-common-prefix) | |
67 | + | |
68 | + :continue)))) | |
69 | + | |
70 | +(igo::delete-package-nickname :igo.varied-byte-stream) | |
71 | +(igo::delete-package-nickname :igo.code-stream) | |
\ No newline at end of file |
@@ -0,0 +1,24 @@ | ||
1 | +(defpackage igo.matrix | |
2 | + (:use :common-lisp) | |
3 | + (:shadow load) | |
4 | + (:export load | |
5 | + link-cost | |
6 | + matrix)) | |
7 | +(in-package :igo.matrix) | |
8 | + | |
9 | +(igo::set-package-nickname :igo.varied-byte-stream :vbs) | |
10 | + | |
11 | +(deftype matrix () 'function) | |
12 | + | |
13 | +(defun load (data-dir) | |
14 | + (vbs:with-input-file (in (merge-pathnames "matrix.bin" data-dir)) | |
15 | + (let* ((left-size (vbs:read-byte in 4)) | |
16 | + (right-size (vbs:read-byte in 4)) | |
17 | + (matrix (vbs:read-sequence in 2 (* left-size right-size)))) | |
18 | + (lambda (left-id right-id) | |
19 | + (aref matrix (+ (* right-id right-size) left-id)))))) | |
20 | + | |
21 | +(defun link-cost (left-id right-id matrix) | |
22 | + (funcall matrix left-id right-id)) | |
23 | + | |
24 | +(igo::delete-package-nickname :igo.varied-byte-stream) | |
\ No newline at end of file |
@@ -0,0 +1,27 @@ | ||
1 | +(defpackage igo.viterbi-node | |
2 | + (:use :common-lisp) | |
3 | + (:export new | |
4 | + new-bos/eos | |
5 | + cost | |
6 | + prev | |
7 | + word-id | |
8 | + left-id | |
9 | + right-id | |
10 | + start | |
11 | + end | |
12 | + space?)) | |
13 | +(in-package :igo.viterbi-node) | |
14 | + | |
15 | +(defstruct (viterbi-node (:constructor new (word-id start end left-id right-id space?)) | |
16 | + (:conc-name "")) | |
17 | + (cost 0 :type fixnum) | |
18 | + (prev nil :type (or null viterbi-node)) | |
19 | + (word-id 0 :type fixnum) | |
20 | + (left-id 0 :type fixnum) | |
21 | + (right-id 0 :type fixnum) | |
22 | + (start 0 :type fixnum) | |
23 | + (end 0 :type fixnum) | |
24 | + (space? nil :type boolean)) | |
25 | + | |
26 | +(defun new-bos/eos () | |
27 | + (new 0 0 0 0 0 nil)) | |
\ No newline at end of file |
@@ -0,0 +1,52 @@ | ||
1 | +(defpackage igo.char-category | |
2 | + (:use :common-lisp) | |
3 | + (:shadow load) | |
4 | + (:export load | |
5 | + category | |
6 | + compatible? | |
7 | + category-trie-id | |
8 | + category-length | |
9 | + category-invoke? | |
10 | + category-group?)) | |
11 | +(in-package :igo.char-category) | |
12 | + | |
13 | +(igo::set-package-nickname :igo.varied-byte-stream :vbs) | |
14 | + | |
15 | +(defstruct category | |
16 | + (trie-id 0 :type fixnum) | |
17 | + (length 0 :type fixnum) | |
18 | + (invoke? nil :type boolean) | |
19 | + (group? nil :type boolean)) | |
20 | + | |
21 | +(defstruct (category-set (:conc-name "")) | |
22 | + (categorys #() :type (simple-array category)) | |
23 | + (char->id #() :type (simple-array (signed-byte 32))) | |
24 | + (eql-masks #() :type (simple-array (signed-byte 32)))) | |
25 | + | |
26 | +(defun load-categorys (root-dir) | |
27 | + (vbs:with-input-file (in (merge-pathnames "char.category" root-dir)) | |
28 | + (let ((data (vbs:read-sequence in 4 (/ (vbs:file-size in) 4)))) | |
29 | + (coerce | |
30 | + (loop FOR i FROM 0 BELOW (length data) BY 4 COLLECT | |
31 | + (make-category :trie-id (aref data (+ i 0)) | |
32 | + :length (aref data (+ i 1)) | |
33 | + :invoke? (= 1 (aref data (+ i 2))) | |
34 | + :group? (= 1 (aref data (+ i 3))))) | |
35 | + 'vector)))) | |
36 | + | |
37 | +(defun load (root-dir) | |
38 | + (vbs:with-input-file (in (merge-pathnames "code2category" root-dir)) | |
39 | + (make-category-set | |
40 | + :categorys (load-categorys root-dir) | |
41 | + :char->id (vbs:read-sequence in 4 (/ (vbs:file-size in) 4 2)) | |
42 | + :eql-masks (vbs:read-sequence in 4 (/ (vbs:file-size in) 4 2))))) | |
43 | + | |
44 | +(defun category (code category-set) | |
45 | + (with-slots (categorys char->id) category-set | |
46 | + (aref categorys (aref char->id code)))) | |
47 | + | |
48 | +(defun compatible? (code1 code2 category-set) | |
49 | + (with-slots (eql-masks) category-set | |
50 | + (logtest (aref eql-masks code1) (aref eql-masks code2)))) | |
51 | + | |
52 | +(igo::delete-package-nickname :igo.varied-byte-stream) | |
\ No newline at end of file |
@@ -0,0 +1,93 @@ | ||
1 | +(defpackage igo.word-dic | |
2 | + (:use :common-lisp) | |
3 | + (:shadow load | |
4 | + search) | |
5 | + (:export load | |
6 | + *ipadic-feature-parser* | |
7 | + word-dic | |
8 | + word-data | |
9 | + cost | |
10 | + search | |
11 | + search-from-trie-id)) | |
12 | +(in-package :igo.word-dic) | |
13 | + | |
14 | +(igo::set-package-nickname :igo.varied-byte-stream :vbs) | |
15 | +(igo::set-package-nickname :igo.trie :trie) | |
16 | +(igo::set-package-nickname :igo.code-stream :code-stream) | |
17 | +(igo::set-package-nickname :igo.viterbi-node :viterbi-node) | |
18 | + | |
19 | +(defstruct word-dic | |
20 | + (trie nil :type trie:trie) | |
21 | + (costs #() :type (simple-array (signed-byte 16))) | |
22 | + (left-ids #() :type (simple-array (signed-byte 16))) | |
23 | + (right-ids #() :type (simple-array (signed-byte 16))) | |
24 | + (data #() :type simple-array) ;; XXX: surrogate? and TODO: features | |
25 | + (indices #() :type (simple-array (signed-byte 32)))) | |
26 | + | |
27 | +(defun read-indices (path) | |
28 | + (vbs:with-input-file (in path) | |
29 | + (vbs:read-sequence in 4 (/ (vbs:file-size in) 4)))) | |
30 | + | |
31 | +(defun read-data (path) | |
32 | + (vbs:with-input-file (in path) | |
33 | + (map 'string #'code-char (vbs:read-sequence in 2 (/ (vbs:file-size in) 2) :signed nil)))) | |
34 | + | |
35 | +(defun split-data (data offsets feature-parser) | |
36 | + (let ((ary (make-array (1- (length offsets))))) | |
37 | + (dotimes (i (length ary) ary) | |
38 | + (setf (aref ary i) | |
39 | + (funcall feature-parser (subseq data (aref offsets i) (aref offsets (1+ i)))))))) | |
40 | + | |
41 | +(defvar *ipadic-feature-parser* | |
42 | + (lambda (feature) | |
43 | + (flet ((kw (s) (intern s :keyword)) | |
44 | + (kw-if-* (s) (if (string= s "*") (intern s :keyword) s))) | |
45 | + (let ((fs (igo::split-by-chars "," feature))) | |
46 | + (nconc (mapcar #'kw (subseq fs 0 6)) | |
47 | + (mapcar #'kw-if-* (subseq fs 6))))))) | |
48 | + | |
49 | +(defun load (root-dir &optional (feature-parser #'identity)) | |
50 | + (flet ((fullpath (name) (merge-pathnames root-dir name))) | |
51 | + (vbs:with-input-file (in (fullpath "word.inf")) | |
52 | + (let* ((word-count (/ (vbs:file-size in) (+ 4 2 2 2))) | |
53 | + (data (read-data (fullpath "word.dat"))) | |
54 | + (offsets (vbs:read-sequence in 4 word-count))) | |
55 | + (make-word-dic | |
56 | + :trie (trie:load (fullpath "word2id")) | |
57 | + :indices (read-indices (fullpath "word.ary.idx")) | |
58 | + :data (split-data data offsets feature-parser) | |
59 | + | |
60 | + :left-ids (vbs:read-sequence in 2 word-count) | |
61 | + :right-ids (vbs:read-sequence in 2 word-count) | |
62 | + :costs (vbs:read-sequence in 2 word-count)))))) | |
63 | + | |
64 | +(defun cost (word-id wdic) (aref (word-dic-costs wdic) word-id)) | |
65 | +(defun left-id (word-id wdic) (aref (word-dic-left-ids wdic) word-id)) | |
66 | +(defun right-id (word-id wdic) (aref (word-dic-right-ids wdic) word-id)) | |
67 | + | |
68 | +(defun search (cs result wdic) | |
69 | + (let ((start (code-stream:position cs)) | |
70 | + (indices (word-dic-indices wdic))) | |
71 | + (trie:each-common-prefix | |
72 | + (lambda (end id) | |
73 | + (loop FOR i FROM (aref indices id) BELOW (aref indices (1+ id)) DO | |
74 | + (push (viterbi-node:new i start end (left-id i wdic) (right-id i wdic) nil) | |
75 | + result))) | |
76 | + cs | |
77 | + (word-dic-trie wdic))) | |
78 | + result) | |
79 | + | |
80 | +(defun search-from-trie-id (id start end space? result wdic) | |
81 | + (let ((indices (word-dic-indices wdic))) | |
82 | + (loop FOR i FROM (aref indices id) BELOW (aref indices (1+ id)) DO | |
83 | + (push (viterbi-node:new i start end (left-id i wdic) (right-id i wdic) space?) | |
84 | + result))) | |
85 | + result) | |
86 | + | |
87 | +(defun word-data (word-id wdic) | |
88 | + (aref (word-dic-data wdic) word-id)) | |
89 | + | |
90 | +(igo::delete-package-nickname :igo.varied-byte-stream) | |
91 | +(igo::delete-package-nickname :igo.trie) | |
92 | +(igo::delete-package-nickname :igo.code-stream) | |
93 | +(igo::delete-package-nickname :igo.viterbi-node) | |
\ No newline at end of file |
@@ -0,0 +1,21 @@ | ||
1 | +The MIT License | |
2 | + | |
3 | +Copyright (c) 2010 Takeru Ohta <phjgt308@ybb.ne.jp> | |
4 | + | |
5 | +Permission is hereby granted, free of charge, to any person obtaining a copy | |
6 | +of this software and associated documentation files (the "Software"), to deal | |
7 | +in the Software without restriction, including without limitation the rights | |
8 | +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
9 | +copies of the Software, and to permit persons to whom the Software is | |
10 | +furnished to do so, subject to the following conditions: | |
11 | + | |
12 | +The above copyright notice and this permission notice shall be included in | |
13 | +all copies or substantial portions of the Software. | |
14 | + | |
15 | +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
16 | +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
17 | +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
18 | +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
19 | +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
20 | +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
21 | +THE SOFTWARE. |
@@ -0,0 +1,40 @@ | ||
1 | +(defpackage igo.varied-byte-stream | |
2 | + (:use :common-lisp) | |
3 | + (:shadow read-byte | |
4 | + read-sequence) | |
5 | + (:export with-input-file | |
6 | + read-byte | |
7 | + read-sequence | |
8 | + file-size)) | |
9 | +(in-package :igo.varied-byte-stream) | |
10 | + | |
11 | +(deftype n-byte (byte-size signed?) | |
12 | + `(,(if signed? 'signed-byte 'unsigned-byte) ,(* byte-size 8))) | |
13 | + | |
14 | +(defstruct varied-byte-stream | |
15 | + (source nil :type file-stream) | |
16 | + (offset 0 :type fixnum)) | |
17 | + | |
18 | +(defmacro with-input-file ((stream filespec) &body body) | |
19 | + `(with-open-file (,stream ,filespec) | |
20 | + (let ((,stream (make-varied-byte-stream :source ,stream))) | |
21 | + ,@body))) | |
22 | + | |
23 | +(defun file-size (vbs) | |
24 | + (file-length (varied-byte-stream-source vbs))) | |
25 | + | |
26 | +(defun read-byte (varied-byte-stream byte-size &key (signed t)) | |
27 | + (with-slots (source offset) varied-byte-stream | |
28 | + (with-open-file (in source :element-type `(n-byte ,byte-size ,signed)) | |
29 | + (file-position in (/ offset byte-size)) | |
30 | + (prog1 (common-lisp:read-byte in) | |
31 | + (incf offset byte-size))))) | |
32 | + | |
33 | +(defun read-sequence (varied-byte-stream byte-size count &key (signed t)) | |
34 | + (with-slots (source offset) varied-byte-stream | |
35 | + (with-open-file (in source :element-type `(n-byte ,byte-size ,signed)) | |
36 | + (file-position in (/ offset byte-size)) | |
37 | + (let ((buf (make-array count :element-type `(n-byte ,byte-size ,signed)))) | |
38 | + (common-lisp:read-sequence buf in) | |
39 | + (incf offset (* byte-size count)) | |
40 | + buf)))) | |
\ No newline at end of file |