• R/O
  • SSH
  • HTTPS

creole: Commit


Commit MetaInfo

Revision11 (tree)
Time2010-05-02 17:54:15
Authorphjgt

Log Message

string-to-octets: エンコード不能文字は、#\?で埋めるように修正

Change Summary

Incremental Difference

--- constant.lisp (nonexistent)
+++ constant.lisp (revision 11)
@@ -0,0 +1,5 @@
1+(in-package :creole)
2+
3+(defconstant +UNKNOWN-CHAR+ #\?)
4+(defconstant +UNKNOWN-CODE+ (char-code #\?))
5+(defconst-onceonly +UNKNOWN-OCTETS+ (coerce `(,(char-code #\?)) '(vector octet)))
\ No newline at end of file
--- package.lisp (revision 10)
+++ package.lisp (revision 11)
@@ -2,10 +2,6 @@
22 (:use :common-lisp)
33 (:export string-to-octets
44 octets-to-string
5- octets-encoding-error ;; TODO
6- octet-decoding-error ;; TODO
7- illegal-octet-error
8- illegal-character-error
95 *external-formats*
106 *default-external-format*))
117 (in-package :creole)
@@ -22,4 +18,7 @@
2218 (deftype simple-characters () '(simple-array character (*)))
2319
2420 (deftype optimize-hack-array-index () #+SBCL '(integer 0 100)
25- #-SBCL 'array-index)
\ No newline at end of file
21+ #-SBCL 'array-index)
22+
23+(defvar *fastest* '(optimize (speed 3) (debug 0) (safety 0) (compilation-speed 0)))
24+
--- util.lisp (revision 10)
+++ util.lisp (revision 11)
@@ -11,18 +11,28 @@
1111 #+SBCL (declare (sb-ext:muffle-conditions sb-ext:compiler-note))
1212 ,@body))
1313
14+;; TODO: なくす
1415 (defmacro ensure-function (function-desginator)
1516 `(etypecase ,function-desginator
1617 (function)
1718 (symbol (setf ,function-desginator (symbol-function ,function-desginator)))))
1819
19-(defmacro ensure-simple-characters (s)
20- `(etypecase ,s
21- (simple-characters)
22- (simple-base-string (setf ,s (make-array (length ,s)
23- :element-type 'character
24- :initial-contents ,s)))
25- (string (setf ,s (muffle-warn (copy-seq ,s))))))
20+(defmacro each-char-code ((code string &optional return) &body body)
21+ (let ((char (gensym)))
22+ `(loop FOR ,char ACROSS ,string
23+ FOR ,code = (char-code ,char)
24+ DO ,@body
25+ FINALLY (return ,return))))
26+
27+(defmacro ensure-simple-characters (s &body body)
28+ `(let ((,s (etypecase ,s
29+ (simple-base-string (make-array (length ,s)
30+ :element-type 'character
31+ :initial-contents ,s))
32+ (simple-characters ,s)
33+ (string (muffle-warn (copy-seq ,s))))))
34+ (declare (simple-characters ,s))
35+ ,@body))
2636
2737 (defmacro defconst-onceonly (name value &optional doc)
2838 `(defconstant ,name (if (boundp ',name) (symbol-value ',name) ,value)
--- string-to-octets.lisp (revision 10)
+++ string-to-octets.lisp (revision 11)
@@ -2,8 +2,9 @@
22
33 ;;;;;;;;;;;
44 ;;; declaim
5-(declaim (inline get-encode-table string-to-octets legal-string-to-octets)
6- (ftype (function (string &key (:external-format t) (:replace-fn t)) simple-octets) string-to-octets))
5+(declaim (inline get-encode-table #+C string-to-octets legal-string-to-octets illegal-string-to-octets)
6+ (ftype (function (string &key (:external-format t)) (values simple-octets boolean)) string-to-octets)
7+ (ftype (function (t) simple-vector) get-encode-table))
78
89 ;;;;;;;;;;;;;;;;;;;;
910 ;;; special variable
@@ -28,65 +29,45 @@
2829 it
2930 (setf #1# (load-encode-table external-format))))
3031
31-(defun illegal-character-error (char-code)
32- (error "Unable to encode character ~D." char-code))
33-
3432 (defun legal-string-to-octets (string octets-length table)
35- (let ((buf (make-array octets-length :element-type 'octet)))
36- (loop WITH i OF-TYPE fixnum = -1
37- FOR ch ACROSS string
38- FOR octets = (svref table (char-code ch)) DO
39- (loop FOR o ACROSS (the simple-octets octets) DO
40- (setf (aref buf (incf i)) o)))
41- buf))
33+ (let ((buf (make-array octets-length :element-type 'octet))
34+ (i -1))
35+ (each-char-code (code string (values buf t))
36+ (loop FOR o ACROSS (the simple-octets (svref table code)) DO
37+ (setf (aref buf (incf (the fixnum i))) o)))))
4238
43-(defun illegal-string-to-octets (string table replace-fn)
44- (declare (optimize (speed 3) (debug 0) (safety 0))
45- (simple-characters string)
46- (function replace-fn)
47- (simple-vector table))
48- (let ((buf (make-array (length string) :fill-pointer 0
49- :adjustable t
50- :element-type 'octet))
51- (code-limit (length table)))
52- (loop FOR i OF-TYPE fixnum FROM 0 BELOW (length string)
53- FOR ch = (char string i)
54- FOR cd = (char-code ch)
55- FOR octets = (and (< cd code-limit) (svref table cd)) DO
56- (when (null octets)
57- (let ((new-octets (funcall replace-fn cd)))
58- (check-type new-octets simple-octets)
59- (setf octets new-octets)))
39+(defun illegal-string-to-octets (string octets-length table)
40+ (let ((buf (make-array octets-length :element-type 'octet))
41+ (code-limit (length table))
42+ (i -1))
43+ (each-char-code (code string (values buf nil))
44+ (loop FOR o ACROSS (the simple-octets
45+ (or (and (< code code-limit) (svref table code))
46+ +UNKNOWN-OCTETS+)) DO
47+ (setf (aref buf (incf (the fixnum i))) o)))))
6048
61- (loop FOR o ACROSS (the simple-octets octets) DO
62- (vector-push-extend o buf)))
63- (coerce buf '(vector octet))))
64-
6549 ;;;;;;;;;;;;;;;;;;;;;
6650 ;;; external function
67-(defun string-to-octets (string &key (external-format *default-external-format*)
68- (replace-fn #'illegal-character-error))
69- (declare (optimize (speed 3) (debug 1) (safety 0))
51+(defun string-to-octets (string &key (external-format *default-external-format*))
52+ (declare #.*fastest*
7053 (string string))
71- (ensure-function replace-fn)
72- (ensure-simple-characters string)
73- (locally
74- (declare (simple-characters string))
54+ (ensure-simple-characters string
7555 (case (external-format-key external-format)
7656 (:|utf-8| (utf8-string-to-octets string))
77- (:|utf-16be| (utf16-string-to-octets string :be replace-fn))
78- (:|utf-16le| (utf16-string-to-octets string :le replace-fn))
57+ (:|utf-16be| (utf16-string-to-octets string :be))
58+ (:|utf-16le| (utf16-string-to-octets string :le))
7959 (t
8060 (let* ((table (get-encode-table external-format))
81- (code-limit (length (the simple-vector table)))
82- (len (loop WITH len OF-TYPE fixnum = 0
83- FOR ch ACROSS string
84- FOR cd = (char-code ch)
85- FOR octets = (and (< cd code-limit) (svref table cd))
86- DO (when (null octets)
87- (return))
88- (incf len (length (the simple-octets octets)))
89- FINALLY (return len))))
90- (if len
91- (legal-string-to-octets string len table)
92- (illegal-string-to-octets string table replace-fn)))))))
\ No newline at end of file
61+ (code-limit (length table))
62+ (including-illegal-character? nil)
63+ (len 0))
64+ (declare (fixnum len))
65+ (each-char-code (code string)
66+ (let ((octets (and (< code code-limit) (svref table code))))
67+ (when (null octets)
68+ (setf octets +UNKNOWN-OCTETS+
69+ including-illegal-character? t))
70+ (incf len (length (the simple-octets octets)))))
71+ (if including-illegal-character?
72+ (illegal-string-to-octets string len table)
73+ (legal-string-to-octets string len table)))))))
\ No newline at end of file
--- utf16.lisp (revision 10)
+++ utf16.lisp (revision 11)
@@ -28,11 +28,15 @@
2828
2929 ;;;;;;;;;;;;;;;;;;;;
3030 ;;; string => octets
31-(defmacro utf16-string-to-octets (string endian replace-fn)
31+(defmacro utf16-string-to-octets (string endian)
3232 (symbol-macrolet ((p1 (if (eq endian :be) 8 0))
3333 (p2 (if (eq endian :be) 0 8)))
34- `(let ((octets (make-array (* 4 (length ,string)) :element-type 'octet))
35- (i 0))
34+ `(let* ((has-surrogate? (each-char-code (cd ,string nil)
35+ (when (>= cd #x10000)
36+ (return t))))
37+ (buf-len (* (if has-surrogate? 4 2) (length ,string)))
38+ (octets (make-array buf-len :element-type 'octet))
39+ (i 0))
3640 (declare (optimize-hack-array-index i))
3741 (macrolet ((add-octets (&rest octet-list &aux (n (length octet-list)))
3842 (declare (optimize (speed 0)))
@@ -40,23 +44,22 @@
4044 FOR o IN octet-list COLLECT
4145 `(setf (aref octets (+ i ,i)) ,o))
4246 (incf i ,n))))
43- (loop FOR char ACROSS ,string
44- FOR code = (char-code char) DO
45- (if (< code #x10000)
46- (progn
47- (loop WHILE (<= #xD800 code #xDFFF) DO
48- (multiple-value-bind (new-char) (funcall ,replace-fn code)
49- (check-type new-char character)
50- (setf code (char-code new-char))))
51- (add-octets (ldb (byte 8 ,p1) code)
52- (ldb (byte 8 ,p2) code)))
53- (let ((low (low-surrogate code))
54- (high (high-surrogate code)))
55- (add-octets (ldb (byte 8 ,p1) high)
56- (ldb (byte 8 ,p2) high)
57- (ldb (byte 8 ,p1) low)
58- (ldb (byte 8 ,p2) low))))))
59- (subseq octets 0 i))))
47+ (if (not has-surrogate?)
48+ (each-char-code (cd ,string octets)
49+ #1=(if (<= #xD800 cd #xDFFF)
50+ (add-octets (ldb (byte 8 ,p1) +UNKNOWN-CODE+)
51+ (ldb (byte 8 ,p2) +UNKNOWN-CODE+))
52+ (add-octets (ldb (byte 8 ,p1) cd)
53+ (ldb (byte 8 ,p2) cd))))
54+ (each-char-code (cd ,string (subseq octets 0 i))
55+ (if (< cd #x10000)
56+ #1#
57+ (let ((low (low-surrogate cd))
58+ (high (high-surrogate cd)))
59+ (add-octets (ldb (byte 8 ,p1) high)
60+ (ldb (byte 8 ,p2) high)
61+ (ldb (byte 8 ,p1) low)
62+ (ldb (byte 8 ,p2) low))))))))))
6063
6164 ;;;;;;;;;;;;;;;;;;;;
6265 ;;; octets => string
--- utf8.lisp (revision 10)
+++ utf8.lisp (revision 11)
@@ -3,6 +3,7 @@
33 ;;;;;;;;;;;
44 ;;; declaim
55 (declaim (inline bit-off? bit-val 10xxxxxx-p
6+ utf8-octets-length
67 utf8-octets-to-unicode utf8-octets-to-string))
78
89 ;;;;;;;;;;;;;;;;;;;;;
@@ -28,7 +29,7 @@
2829 (the positive-fixnum (+ pos (the positive-fixnum (or #1# 1)))))))
2930
3031 (defun utf8-octets-to-unicode(octets pos string j replace-fn octets-len &aux (os octets))
31- (declare (optimize (speed 3) (debug 0) (safety 0))
32+ (declare #.*fastest*
3233 (simple-octets os)
3334 (fixnum pos j octets-len)
3435 (simple-characters string))
@@ -74,7 +75,7 @@
7475 ;;;;;;;;;;;;;;;;;;;;
7576 ;;; octets => string
7677 (defun utf8-octets-to-string (octets replace-fn &aux (len (length octets)))
77- (declare (optimize (speed 3) (debug 0) (safety 0))
78+ (declare #.*fastest*
7879 (simple-octets octets))
7980 (let ((buf (make-array len :element-type 'character))
8081 (j -1))
@@ -85,25 +86,16 @@
8586
8687 ;;;;;;;;;;;;;;;;;;;;
8788 ;;; string => octets
88-(declaim (inline utf8-octets-length))
89-(defun utf8-octets-length (string)
90- (declare (optimize (speed 3) (debug 0) (safety 0) (compilation-speed 0))
91- (simple-string string))
92- (loop WITH len OF-TYPE array-index = 0
93- FOR ch ACROSS string
94- FOR cd = (char-code ch)
95- DO
96- (incf len
97- (cond ((< cd #x80) 1)
98- ((< cd #x800) 2)
89+(defun utf8-octets-length (string &aux (len 0))
90+ (each-char-code (cd string len)
91+ (incf (the array-index len)
92+ (cond ((< cd #x80) 1)
93+ ((< cd #x800) 2)
9994 ((< cd #x10000) 3)
100- (t 4)))
101- FINALLY (return len)))
95+ (t 4)))))
10296
103-(deftype optimize-hack-integer () '(integer 0 100))
104-
10597 (defun utf8-string-to-octets (string)
106- (declare (optimize (speed 3) (debug 0) (safety 0))
98+ (declare #.*fastest*
10799 (simple-characters string))
108100 (let ((octets (make-array (utf8-octets-length string) :element-type 'octet))
109101 (i 0))
@@ -114,8 +106,7 @@
114106 FOR o IN octet-list COLLECT
115107 `(setf (aref octets (+ i ,i)) ,o))
116108 (incf i ,n))))
117- (loop FOR ch ACROSS string
118- FOR cd = (char-code ch) DO
109+ (each-char-code (cd string)
119110 (cond ((< cd #x80)
120111 (add-octets cd))
121112 ((< cd #x800)
@@ -122,14 +113,16 @@
122113 (add-octets (+ #b11000000 (ldb (byte 5 6) cd))
123114 (+ #b10000000 (ldb (byte 6 0) cd))))
124115 ((< cd #x10000)
125- (when (<= #xd800 cd #xdfff)
126- (error ""))
127- (add-octets (+ #b11100000 (ldb (byte 4 12) cd))
128- (+ #b10000000 (ldb (byte 6 6) cd))
129- (+ #b10000000 (ldb (byte 6 0) cd))))
116+ (if (<= #xD800 cd #xDFFF)
117+ (add-octets +UNKNOWN-CODE+)
118+ (add-octets (+ #b11100000 (ldb (byte 4 12) cd))
119+ (+ #b10000000 (ldb (byte 6 6) cd))
120+ (+ #b10000000 (ldb (byte 6 0) cd)))))
130121 (t
131122 (add-octets (+ #b11110000 (ldb (byte 3 18) cd))
132123 (+ #b10000000 (ldb (byte 6 12) cd))
133124 (+ #b10000000 (ldb (byte 6 6) cd))
134125 (+ #b10000000 (ldb (byte 6 0) cd)))))))
135- octets))
\ No newline at end of file
126+ (if (= i (length octets))
127+ (values octets t)
128+ (values (subseq octets i) nil))))
\ No newline at end of file
Show on old repository browser