xml.lisp (6349B)
1 ;;; Copyright (C) 2013, 2014 Tomas Hlavaty <tom@logand.com> 2 ;;; 3 ;;; Permission is hereby granted, free of charge, to any person 4 ;;; obtaining a copy of this software and associated documentation 5 ;;; files (the "Software"), to deal in the Software without 6 ;;; restriction, including without limitation the rights to use, copy, 7 ;;; modify, merge, publish, distribute, sublicense, and/or sell copies 8 ;;; of the Software, and to permit persons to whom the Software is 9 ;;; furnished to do so, subject to the following conditions: 10 ;;; 11 ;;; The above copyright notice and this permission notice shall be 12 ;;; included in all copies or substantial portions of the Software. 13 ;;; 14 ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 ;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 ;;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 ;;; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 18 ;;; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 19 ;;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 ;;; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 ;;; DEALINGS IN THE SOFTWARE. 22 23 (defpackage :rw.xml 24 (:use :cl) 25 (:export :xmarkup-reader 26 :parse-xml)) 27 28 (in-package :rw.xml) 29 30 ;; https://github.com/drewc/smug/blob/master/smug.org 31 ;; http://www.htmlhelp.com/reference/wilbur/misc/comment.html 32 33 ;; TODO xml is made of bytes, not chars 34 35 (defun parse-xml-attributes (reader finish) ;; finish='((#\/))|'((#\?)) 36 (flet ((peek () (rw:peek reader)) 37 (next () (rw:next reader)) 38 (skip () (rw:skip reader)) 39 (till (markers) (rw:till reader markers))) 40 (do (z) 41 ((progn (skip) (eql #\> (peek))) 42 (next) 43 (let ((f (equal (car z) finish))) 44 (when f (pop z)) 45 (values (nreverse z) f))) 46 (push (cons (prog1 (till '(#\space #\tab #\return #\newline #\> #\=)) 47 (skip) 48 (when (eql #\= (peek)) 49 (next) 50 (skip))) 51 (let ((q (peek))) 52 (case q 53 ((#\" #\') 54 (next) 55 (prog1 (till (list q)) 56 (assert (eql q (next))))) 57 (t ;; TODO only when enabled no-quoted-attributes 58 (till '(#\space #\tab #\return #\newline #\>)))))) 59 z)))) 60 61 (defun xmarkup-reader (reader) 62 (flet ((peek () (rw:peek reader)) 63 (next () (rw:next reader)) 64 (skip () (rw:skip reader)) 65 (till (markers) (rw:till reader markers))) 66 (lambda () 67 (case (peek) 68 ((nil)) 69 (#\< 70 (next) 71 (skip) 72 (let ((e (till '(#\space #\tab #\return #\newline #\>)))) 73 (skip) 74 (case (car e) ;; TODO doctype 75 (#\? 76 (multiple-value-bind (a f) 77 (parse-xml-attributes reader '((#\?))) 78 (assert f) 79 (cons :pi (cons (cdr e) a)))) 80 (#\! 81 (prog1 (cons :comment (till '(#\>))) ;; TODO properly 82 (assert (eql #\> (next))))) 83 (#\/ 84 (assert (eql #\> (next))) 85 (cons :end (cdr e))) 86 (t 87 (multiple-value-bind (a f) 88 (parse-xml-attributes reader '((#\/))) 89 (unless f 90 (when (equal '(#\/) (last e)) 91 (setq f t 92 e (nreverse (cdr (nreverse e)))))) ;; TODO better 93 (cons (if f :begin/ :begin) (cons e a))))))) 94 (t (cons :text (till '(#\<)))))))) ;; TODO entities 95 96 (defun parse-xml (x &optional upcasep package) 97 (labels ((id (x) 98 (intern (if upcasep 99 (string-upcase (concatenate 'string x)) 100 (concatenate 'string x)) 101 (or package :keyword))) 102 (xattrs (x) 103 (loop 104 for (f . r) in x 105 appending (list (id f) (concatenate 'string r)))) 106 (parse (r) 107 (do ((z (list nil)) 108 (r (xmarkup-reader (rw:skip r))) 109 a) 110 ((not (setq a (rw:next r))) 111 (let ((y (pop z))) 112 (assert (not z)) 113 (assert (not (cdr y))) 114 (car y))) 115 (ecase (car a) 116 (:pi) 117 (:comment) 118 (:begin/ 119 (let ((tag (id (cadr a))) 120 (attrs (xattrs (cddr a)))) 121 (push (list (if attrs (cons tag attrs) tag)) (car z)))) 122 (:begin 123 (let ((tag (cadr a)) 124 (attrs (xattrs (cddr a)))) 125 (push (list (if attrs (cons tag attrs) tag)) z)) 126 (push nil z)) 127 (:end 128 (let ((tag (cdr a)) 129 (b (nreverse (pop z))) 130 (e (pop z))) 131 (assert e) 132 (assert z) 133 (let* ((h (car e)) 134 (tag2 (if (atom (car h)) h (car h))) 135 (attrs (unless (atom (car h)) (cdr h)))) 136 (assert (equal tag tag2)) 137 (push (cons (if attrs (cons (id tag) attrs) (id tag)) b) 138 (car z))))) 139 (:text 140 (when (and (cdr z) 141 (find-if-not 142 (lambda (c) 143 (or (member c '(#\space #\tab #\return #\newline)))) 144 (cdr a))) 145 (push (string-trim '(#\space #\tab #\return #\newline) 146 (concatenate 'string (cdr a))) 147 (car z)))))))) 148 (etypecase x 149 (function (parse x)) 150 (stream (parse (rw:peek-reader (rw:char-reader x)))) 151 ((or list vector) (parse (rw:peek-reader (rw:reader x)))) 152 (pathname (with-open-file (s x) 153 (parse (rw:peek-reader (rw:char-reader s)))))))) 154 155 ;;(parse-xml "<rss><ahoj/>hi<cau></cau><br x='314'/></rss>") 156 ;;(parse-xml "<rss a='1' b='2'>hi<br/></rss>") 157 ;;(parse-xml "<rss>hi<br/></hello>") 158 ;;(parse-xml #p"/home/tomas/git/cl-rw/a.xml")