commit 3fa532f2a7325f94a5798f86d77e8964df6e3434
parent 6e32c8bc706b32b633157d46f2ccd4feb9d37b6f
Author: Tomas Hlavaty <tom@logand.com>
Date: Sun, 20 Sep 2015 18:15:40 +0200
utf8 codepoint reader added
Diffstat:
M | rw.lisp | | | 53 | +++++++++++++++++++++++++++++++++++++++++++++++++++++ |
1 file changed, 53 insertions(+), 0 deletions(-)
diff --git a/rw.lisp b/rw.lisp
@@ -44,6 +44,7 @@
:next-u32be
:next-u32le
:next-u8
+ :next-utf8
:next-z0
:peek
:peek-reader
@@ -63,6 +64,7 @@
:u32
:u32be
:u32le
+ :utf8-reader
:wrap-reader
:wrap-writer
:write-octets
@@ -334,6 +336,57 @@
(when x
(parse-integer (coerce x 'string) :radix radix))))
+(defun next-utf8 (reader)
+ (let ((i1 (rw:next reader)) i2 i3 i4 o2 o3 o4)
+ (macrolet ((wrong ()
+ `(error "wrong UTF-8 sequence ~x ~x ~x ~x" i1 i2 i3 i4))
+ (tail (i o)
+ `(progn
+ (setq ,i (rw:next reader))
+ (unless (and (typep ,i '(unsigned-byte 8))
+ (= #x80 (logand #b11000000 ,i)))
+ (wrong))
+ (setq ,o (logand #b00111111 ,i)))))
+ (cond
+ ((not i1) nil)
+ ((not (typep i1 '(unsigned-byte 8)))
+ (wrong))
+ ((<= #b00000000 i1 #b01111111) ;; one
+ i1)
+ ((<= #b11000000 i1 #b11011111) ;; two
+ (tail i2 o2)
+ (let ((z (logior (ash (logand #x1f i1) 6) o2)))
+ (unless (<= #x000080 z #x0007ff)
+ (wrong))
+ z))
+ ((<= #b11100000 i1 #b11101111) ;; three
+ (tail i2 o2)
+ (tail i3 o3)
+ (let ((z (logior (ash (logand #x0f i1) 12) (ash o2 6) o3)))
+ (unless (or (<= #x000800 z #x00d7ff)
+ (<= #x00e000 z #x00ffff))
+ (wrong))
+ z))
+ ((<= #b11110000 i1 #b11110111) ;; four
+ (tail i2 o2)
+ (tail i3 o3)
+ (tail i4 o4)
+ (let ((z (logior (ash (logand #x07 i1) 18) (ash o2 12) (ash o3 6) o4)))
+ (unless (<= #x010000 z #x10ffff)
+ (wrong))
+ z))
+ (t (wrong))))))
+
+(defun utf8-reader (octet-reader)
+ (lambda ()
+ (next-utf8 octet-reader)))
+
+;;(rw:till (rw:peek-reader (utf8-reader (rw:reader #(#x24)))))
+;;(rw:till (rw:peek-reader (utf8-reader (rw:reader #(#xc2 #xa2)))))
+;;(rw:till (rw:peek-reader (utf8-reader (rw:reader #(#xe2 #x82 #xac)))))
+;;(rw:till (rw:peek-reader (utf8-reader (rw:reader #(#xf0 #x90 #x8d #x88)))))
+;;(rw:till (rw:peek-reader (utf8-reader (rw:reader #(#xc0 #x80))))) ;; overlong
+
(defun bit-reader (octet-reader)
(let (octet bit)
(lambda ()