• R/O
  • SSH
  • HTTPS

creole: Commit


Commit MetaInfo

Revision12 (tree)
Time2010-05-02 21:09:53
Authorphjgt

Log Message

utf8デコード高速化の試行中

Change Summary

Incremental Difference

--- octets-to-string.lisp (revision 11)
+++ octets-to-string.lisp (revision 12)
@@ -18,14 +18,9 @@
1818 (external-format-filename external-format)
1919 (merge-pathnames #P"decode/" *data-dir*))))))
2020
21-(defun illegal-octet-error (octets position)
22- (declare (ignore octets))
23- (error "Illegal character starting at byte position ~D." position))
24-
25-(defun general-octets-to-string (octets trie replace-fn)
26- (declare (optimize (speed 3) (debug 0) (safety 0))
27- (simple-octets octets)
28- (function replace-fn))
21+(defun general-octets-to-string (octets trie)
22+ (declare #.*fastest*
23+ (simple-octets octets))
2924 (let* ((len (length octets))
3025 (buf (make-array len :element-type 'character))
3126 (tail-pos -1)
@@ -34,16 +29,7 @@
3429 (declare (fixnum tail-pos i))
3530 (loop (setf (values char i) (to-unicode octets i trie))
3631
37- (when (null char)
38- (multiple-value-bind (new-char #1=consuming-octets-count)
39- (funcall replace-fn octets (1- i))
40- (check-type new-char character)
41- (check-type #1# (or null positive-fixnum))
42- (setf char new-char)
43- (when #1#
44- (incf i (1- (the positive-fixnum #1#))))))
45-
46- (setf (aref buf (incf tail-pos)) char)
32+ (setf (aref buf (incf tail-pos)) (or char +UNKNOWN-CHAR+))
4733 (when (>= i len)
4834 (return)))
4935 (subseq buf 0 (1+ tail-pos))))
@@ -50,15 +36,13 @@
5036
5137 ;;;;;;;;;;;;;;;;;;;;;
5238 ;;; external function
53-(defun octets-to-string (octets &key (external-format *default-external-format*)
54- (replace-fn #'illegal-octet-error))
55- (ensure-function replace-fn)
39+(defun octets-to-string (octets &key (external-format *default-external-format*) (safe t))
40+ (declare #.*fastest*)
5641 (check-type octets simple-octets)
57-
58- (case (external-format-key external-format)
59- (:|utf-8| (utf8-octets-to-string octets replace-fn))
60- (:|utf-16be| (utf16be-octets-to-string octets replace-fn))
61- (:|utf-16le| (utf16le-octets-to-string octets replace-fn))
62- (t (general-octets-to-string octets
63- (get-decode-trie external-format)
64- replace-fn))))
\ No newline at end of file
42+ (locally
43+ (declare (simple-octets octets))
44+ (case (external-format-key external-format)
45+ (:|utf-8| (utf8-octets-to-string octets safe))
46+ (:|utf-16be| (utf16-octets-to-string octets :be))
47+ (:|utf-16le| (utf16-octets-to-string octets :le))
48+ (t (general-octets-to-string octets (get-decode-trie external-format))))))
--- string-to-octets.lisp (revision 11)
+++ string-to-octets.lisp (revision 12)
@@ -49,8 +49,7 @@
4949 ;;;;;;;;;;;;;;;;;;;;;
5050 ;;; external function
5151 (defun string-to-octets (string &key (external-format *default-external-format*))
52- (declare #.*fastest*
53- (string string))
52+ (declare #.*fastest*)
5453 (ensure-simple-characters string
5554 (case (external-format-key external-format)
5655 (:|utf-8| (utf8-string-to-octets string))
--- utf16.lisp (revision 11)
+++ utf16.lisp (revision 12)
@@ -63,76 +63,35 @@
6363
6464 ;;;;;;;;;;;;;;;;;;;;
6565 ;;; octets => string
66-(defun utf16be-octets-to-string (octets replace-fn &aux (octets-len (length octets)))
67- (declare (optimize (speed 3) (debug 0) (safety 0) (compilation-speed 0))
68- (simple-octets octets)
69- (function replace-fn))
70- (let ((buf (make-array octets-len :element-type 'character))
71- (pos -1)
72- (limit (if (evenp octets-len) octets-len (1- octets-len))))
73- (declare (fixnum pos))
74-
75- (macrolet ((with-replace ((new-char) &body body)
76- `(multiple-value-bind (,new-char #1=consuming-octets-count)
77- (funcall replace-fn octets (- i 2))
78- (check-type ,new-char character)
79- (check-type #1# (or null positive-fixnum))
80- (setf surrogate nil
81- i (+ (- i 2) (the positive-fixnum (or #1# 1))))
82- ,@body)))
83-
84- (loop WITH surrogate = nil
85- FOR i fixnum FROM 0 BELOW limit BY 2
86- FOR code = (to-utf16be-code octets i)
87- DO
88- (cond ((<= #xDC00 code #xDFFF)
89- (if (null surrogate)
90- (with-replace (new-char)
91- (setf (aref buf (incf pos)) new-char))
92- (setf (aref buf (incf pos)) (decode-surrogate-pair surrogate code)
93- surrogate nil)))
94- (surrogate
95- (with-replace (new-char)
96- (setf (aref buf (incf pos)) new-char)))
97- ((<= #xD800 code #xDBFF)
98- (setf surrogate code))
99- (t
100- (setf (aref buf (incf pos)) (code-char code)))))
101- (subseq buf 0 (1+ pos)))))
102-
103-(defun utf16le-octets-to-string (octets replace-fn &aux (octets-len (length octets)))
104- (declare (optimize (speed 3) (debug 0) (safety 0) (compilation-speed 0))
105- (simple-octets octets)
106- (function replace-fn))
107- (let ((buf (make-array octets-len :element-type 'character))
108- (pos -1)
109- (limit (if (evenp octets-len) octets-len (1- octets-len))))
110- (declare (fixnum pos))
111-
112- (macrolet ((with-replace ((new-char) &body body)
113- `(multiple-value-bind (,new-char #1=consuming-octets-count)
114- (funcall replace-fn octets (- i 2))
115- (check-type ,new-char character)
116- (check-type #1# (or null positive-fixnum))
117- (setf surrogate nil
118- i (+ (- i 2) (the positive-fixnum (or #1# 1))))
119- ,@body)))
120-
121- (loop WITH surrogate = nil
122- FOR i fixnum FROM 0 BELOW limit BY 2
123- FOR code = (to-utf16le-code octets i)
124- DO
125- (cond ((<= #xDC00 code #xDFFF)
126- (if (null surrogate)
127- (with-replace (new-char)
128- (setf (aref buf (incf pos)) new-char))
129- (setf (aref buf (incf pos)) (decode-surrogate-pair surrogate code)
130- surrogate nil)))
131- (surrogate
132- (with-replace (new-char)
133- (setf (aref buf (incf pos)) new-char)))
134- ((<= #xD800 code #xDBFF)
135- (setf surrogate code))
136- (t
137- (setf (aref buf (incf pos)) (code-char code)))))
138- (subseq buf 0 (1+ pos)))))
\ No newline at end of file
66+(defmacro utf16-octets-to-string (octets endian)
67+ `(let* ((octets-len (length ,octets))
68+ (buf-len (ceiling octets-len 2))
69+ (buf (make-array buf-len :element-type 'character))
70+ (pos -1)
71+ (including-illegal-octet? nil))
72+ (declare (fixnum pos))
73+ (flet ((add-char (char) (setf (aref buf (incf pos)) char))
74+ (add-unk-char () (setf (aref buf (incf pos)) +UNKNOWN-CHAR+
75+ including-illegal-octet? t)))
76+ (declare (inline add-char add-unk-char))
77+ (loop WITH surrogate = nil
78+ FOR i FROM 0 BELOW (1- octets-len) BY 2
79+ FOR code = (,(if (eq endian :le) 'to-utf16le-code 'to-utf16be-code) ,octets i)
80+ DO
81+ (cond ((<= #xDC00 code #xDFFF)
82+ (if (null surrogate)
83+ (add-unk-char)
84+ (progn
85+ (add-char (decode-surrogate-pair surrogate code))
86+ (setf surrogate nil))))
87+ (surrogate
88+ (add-unk-char)
89+ (setf surrogate nil))
90+ ((<= #xD800 code #xDBFF)
91+ (setf surrogate code))
92+ (t
93+ (add-char (code-char code)))))
94+ (when (oddp octets-len)
95+ (add-unk-char)))
96+ (values (if (= (1+ pos) buf-len) buf (subseq buf 0 (1+ pos)))
97+ (not including-illegal-octet?))))
\ No newline at end of file
--- utf8.lisp (revision 11)
+++ utf8.lisp (revision 12)
@@ -3,7 +3,8 @@
33 ;;;;;;;;;;;
44 ;;; declaim
55 (declaim (inline bit-off? bit-val 10xxxxxx-p
6- utf8-octets-length
6+ utf8-octets-length utf8-string-length
7+ utf8-octets-to-string-safe utf8-octets-to-string-unsafe
78 utf8-octets-to-unicode utf8-octets-to-string))
89
910 ;;;;;;;;;;;;;;;;;;;;;
@@ -17,18 +18,7 @@
1718 (defun 10xxxxxx-p (octet)
1819 (= (ldb (byte 2 6) octet) #b10))
1920
20-(defun utf8-call-replace-fn (replace-fn octets pos)
21- (declare (optimize (speed 3) (debug 0) (safety 0))
22- (fixnum pos)
23- (function replace-fn))
24- (multiple-value-bind (new-char #1=consuming-octets-count)
25- (funcall replace-fn octets pos)
26- (check-type new-char character)
27- (check-type #1# (or null positive-fixnum))
28- (values (char-code new-char)
29- (the positive-fixnum (+ pos (the positive-fixnum (or #1# 1)))))))
30-
31-(defun utf8-octets-to-unicode(octets pos string j replace-fn octets-len &aux (os octets))
21+(defun utf8-octets-to-unicode(octets pos string j octets-len &aux (os octets))
3222 (declare #.*fastest*
3323 (simple-octets os)
3424 (fixnum pos j octets-len)
@@ -35,20 +25,22 @@
3525 (simple-characters string))
3626 (macrolet ((with-validate (num exp)
3727 `(if (and (< (+ ,num pos) octets-len)
38- ;; TOOD: 一番初めは0ではいけない -> check
28+ ;; TODO: 一番初めは0ではいけない -> check
3929 ,@(loop FOR i fixnum FROM 1 TO num
4030 COLLECT `(10xxxxxx-p (aref os (+ ,i pos)))))
4131 (values ,exp
42- (the positive-fixnum (+ ,(1+ num) pos)))
43- (utf8-call-replace-fn replace-fn octets pos))))
44-
32+ (the positive-fixnum (+ ,(1+ num) pos))
33+ t)
34+ (values +UNKNOWN-CODE+
35+ (the positive-fixnum (+ ,(1+ num) pos))
36+ nil))))
4537 (let ((octet (aref os pos)))
46- (multiple-value-bind (code new-pos)
38+ (multiple-value-bind (code new-pos legal?)
4739 ;; #b0xxxxxxx
48- (cond ((bit-off? 7 octet) (values octet (1+ pos)))
40+ (cond ((bit-off? 7 octet) (values octet (1+ pos) t))
4941
5042 ;; #b10xxxxxx
51- ((bit-off? 6 octet) (utf8-call-replace-fn replace-fn os pos))
43+ ((bit-off? 6 octet) (values +UNKNOWN-CODE+ (1+ pos) nil))
5244
5345 ;; #b110xxxxx #b10xxxxxx
5446 ((bit-off? 5 octet) (with-validate 1
@@ -68,21 +60,70 @@
6860 (bit-val 6 (aref os (+ 2 pos)) 6)
6961 (bit-val 6 (aref os (+ 3 pos))))))
7062
71- (t (utf8-call-replace-fn replace-fn os pos)))
63+ (t (values +UNKNOWN-CODE+ (1+ pos) nil)))
7264 (setf (aref string j) (code-char code))
73- (the fixnum new-pos)))))
65+ (values (the fixnum new-pos) legal?)))))
7466
67+(defun utf8-octets-to-string-safe (octets &aux (len (length octets)))
68+ (let ((buf (make-array len :element-type 'character))
69+ (including-illegal-octet? nil)
70+ (legal? t))
71+ (do ((i 0)
72+ (j 0 (1+ j)))
73+ ((>= i len) (values (subseq buf 0 j)
74+ (not including-illegal-octet?)))
75+ (declare (optimize-hack-array-index j i))
76+ (setf (values i legal?) (utf8-octets-to-unicode octets i buf j len))
77+ (unless legal?
78+ (setf including-illegal-octet? t)))))
79+
80+(defun utf8-string-length (octets octets-len)
81+ (do ((i 0)
82+ (len 0 (1+ len)))
83+ ((= i octets-len) len)
84+ (declare (optimize-hack-array-index i len))
85+ (let ((octet (aref octets i)))
86+ (cond ((bit-off? 7 octet) (incf i))
87+ ((bit-off? 5 octet) (incf i 2))
88+ ((bit-off? 4 octet) (incf i 3))
89+ ((bit-off? 3 octet) (incf i 4))))))
90+
91+(defun utf8-octets-to-string-unsafe (octets &aux (len (length octets)))
92+ (let ((buf (make-array len #+C(utf8-string-length octets len) :element-type 'character)))
93+ (do ((i 0 (1+ i))
94+ (j 0))
95+ ((= j len) (values (subseq buf 0 i) t))
96+ (declare (optimize-hack-array-index i j))
97+ (flet ((add-char (code) (setf (aref buf i) (code-char code))))
98+ (declare (inline add-char))
99+ (let ((octet (aref octets j)))
100+ (cond ((bit-off? 7 octet)
101+ (add-char octet)
102+ (incf j))
103+ ((bit-off? 5 octet)
104+ (add-char (+ (bit-val 5 octet 6)
105+ (bit-val 6 (aref octets (+ 1 j)))))
106+ (incf j 2))
107+ ((bit-off? 4 octet)
108+ (add-char (+ (bit-val 4 octet 12)
109+ (bit-val 6 (aref octets (+ 1 j)) 6)
110+ (bit-val 6 (aref octets (+ 2 j)))))
111+ (incf j 3))
112+ ((bit-off? 3 octet)
113+ (add-char (+ (bit-val 3 octet 18)
114+ (bit-val 6 (aref octets (+ 1 j)) 12)
115+ (bit-val 6 (aref octets (+ 2 j)) 6)
116+ (bit-val 6 (aref octets (+ 3 j)))))
117+ (incf j 4))))))))
118+
75119 ;;;;;;;;;;;;;;;;;;;;
76120 ;;; octets => string
77-(defun utf8-octets-to-string (octets replace-fn &aux (len (length octets)))
121+(defun utf8-octets-to-string (octets safe)
78122 (declare #.*fastest*
79123 (simple-octets octets))
80- (let ((buf (make-array len :element-type 'character))
81- (j -1))
82- (declare (fixnum j))
83- (do ((i 0 (utf8-octets-to-unicode octets i buf (incf j) replace-fn len)))
84- ((>= i len) (subseq buf 0 (1+ j)))
85- (declare (fixnum i)))))
124+ (if safe
125+ (utf8-octets-to-string-safe octets)
126+ (utf8-octets-to-string-unsafe octets)))
86127
87128 ;;;;;;;;;;;;;;;;;;;;
88129 ;;; string => octets
Show on old repository browser