;;; HTML parsing - very rudimentary
;;;
;;; Copyright (C) 1997-2002 by Sam Steingold
;;; This is Free Software, covered by the GNU GPL (v2)
;;; See http://www.gnu.org/copyleft/gpl.html
;;;
;;; $Id: html.lisp,v 1.18 2002/04/21 20:10:31 sds Exp $
;;; $Source: /cvsroot/clocc/clocc/src/cllib/html.lisp,v $
(eval-when (compile load eval)
(require :cllib-base (translate-logical-pathname "clocc:src;cllib;base"))
(require :port-gray (translate-logical-pathname "port:gray"))
;; `xml-read-comment'
(require :cllib-xml (translate-logical-pathname "cllib:xml"))
;; `with-open-url' in `dump-url-tokens'
(require :cllib-url (translate-logical-pathname "cllib:url")))
(in-package :cllib)
(export
'(*html-readtable* html-translate-specials
text-stream *ts-kill* read-next next-token next-number dump-url-tokens
xml-read-from-url))
;;;
;;; {{{ HTML parsing via `read'
;;;
;; (setq *read-eval* nil *read-suppress* t) ; for parsing
;; (setq *read-eval* t *read-suppress* nil) ; original
(defstruct html-tag data)
;;
(defcustom *html-specials* list
'(("gt" . #\>) ("lt" . #\<) ("quot" . #\") ("amp" . #\&) ("nbsp" . #\Space)
("acute" . #\') ("ast" . #\*) ("colon" . #\:) ("comma" . #\,)
("commat" . #\@) ("copy" . "(C)") ("curren" . #\$) ("divide" . #\/)
("dollar" . #\$) ("equals" . #\=) ("excl" . #\!) ("grave" . #\`)
("half" . "1/2") ("hyphen" . #\-) ("lowbar" . #\_) ("lpar" . #\()
("rpar" . #\)) ("lsqb" . #\[) ("rsqb" . #\]) ("num" . #\#) ("period" . #\.)
("plus" . #\+) ("plusmn" . "+-") ("pound" . #\#) ("quest" . #\?)
("laquo" . "<<") ("raquo" . ">>") ("lcub" . #\{) ("rcub" . #\})
("semi" . #\;) ("shy" . #\-) ("times" . #\*) ("verbar" . #\|))
"Alist of translations of HTML specials like `&*'.")
(defun html-translate-specials (str &optional space)
"Replace (non-destructively) HTML specals with their interpretations.
HTML tags, surrounded by `<>', are removed or replaced with a space, if
optional argument SPACE is non-nil."
(declare (string str))
(do ((beg 0 (1+ beg)) res (len (length str)))
((>= beg len) (coerce (nreverse res) 'string))
(declare (type index-t beg len))
(case (char str beg)
(#\< (setq beg (or (position #\> str :start beg) len))
(when space (push #\Space res)))
(#\&
(let ((pa (assoc str *html-specials* :test
(lambda (str tag)
(let ((end (+ beg (length tag))))
(and (>= len end)
(string= str tag :start1 beg
:end1 end)))))))
(cond (pa (incf beg (1- (length (car pa))))
(push (cdr pa) res))
(t (when space (push #\Space res))
(setq beg (or (position #\; str :start beg) len))))))
(t (push (char str beg) res)))))
(defun strip-html-markup (str)
"Return a new string, sans HTML."
(declare (simple-string str))
(do* ((p0 (position #\< str) (position #\< str :start p1))
(res (list (subseq str 0 p0)))
(p1 (position #\> str) (position #\> str :start (or p0 0))))
((or (null p0) (null p1))
(apply #'concatenate 'string (nreverse res)))
(push (subseq str (1+ p1) (position #\< str :start p1)) res)))
(defun read-html-markup (stream char)
"Skip through the HTML markup. CHAR=`<'"
(declare (stream stream) (character char))
(ecase char
(#\< (let ((obj (read stream t nil t)))
(make-html-tag
:data (if (eq :!-- obj) (xml-read-comment stream)
;; FIXME - there might be comments!
(cons obj (read-delimited-list #\> stream t))))))
(#\&
(do ((cc (read-char stream nil nil t) (read-char stream nil nil t)) rr)
((or (null cc) (char= cc #\;) (char= cc #\#))
(if (null cc) (error "`&' must be terminated with `;' or `#'")
(or (cdr (assoc (coerce (nreverse rr) 'string)
*html-specials* :test #'string-equal))
#\Space)))
(push cc rr)))))
(defun make-html-readtable (&optional (rt (copy-readtable)))
"Make the readtable for parsing HTML."
(set-macro-character #\< #'read-html-markup nil rt)
(set-macro-character #\& #'read-html-markup nil rt)
(set-macro-character #\> (get-macro-character #\)) nil rt)
(set-syntax-from-char #\; #\a rt)
;;(set-macro-character #\; #'read-standalone-char nil rt)
(set-syntax-from-char #\# #\a rt)
(set-syntax-from-char #\: #\a rt)
(set-macro-character #\: #'read-standalone-char nil rt)
(set-syntax-from-char #\, #\a rt)
(set-macro-character #\, #'read-standalone-char nil rt)
rt)
(defcustom *html-readtable* readtable (make-html-readtable)
"The readtable for HTML parsing.")
;;;
;;; }}}{{{ HTML streams
;;;
(defclass html-stream-in (fundamental-character-input-stream)
((input :initarg :stream :initarg :input :type stream :reader html-in))
(:documentation "The input stream for reading HTML."))
(defcustom *html-unterminated-tags* list '(:p :li :dd :dt :tr :td :th)
"*The list of tags without the corresponding `/' tag.")
(defun html-end-tag (tag)
(if (member tag *html-unterminated-tags* :test #'eq) tag
(keyword-concat "/" tag)))
(defmethod stream-read-char ((in html-stream-in))
(read-char (html-in in) nil :eof))
(defmethod stream-unread-char ((in html-stream-in) (char character))
(unread-char char (html-in in)))
(defmethod stream-read-char-no-hang ((in html-stream-in))
(read-char-no-hang (html-in in) nil :eof))
(defmethod stream-peek-char ((in html-stream-in))
(peek-char nil (html-in in) nil :eof))
(defmethod stream-listen ((in html-stream-in))
(listen (html-in in)))
(defmethod stream-read-line ((in html-stream-in))
(read-line (html-in in)))
(defmethod stream-clear-input ((in html-stream-in))
(clear-input (html-in in)))
;;;
;;; }}}{{{ HTML parsing via `text-stream'
;;;
(defstruct (text-stream (:conc-name ts-))
"Text stream - to read a tream of text - skipping junk."
(sock nil) ; socket to read from
(buff "" :type simple-string) ; buffer string
(posn 0 :type fixnum)) ; position in the buffer
(defcustom *ts-kill* list nil "*The list of extra characters to kill.")
(defun ts-pull-next (ts &optional (concat-p t) (kill *ts-kill*))
"Read the next line from the socket, put it into the buffer.
If CONCAT-P is non-NIL, the new line is appended,
otherwise the buffer is replaced.
Return the new buffer or NIL on EOF."
(declare (type text-stream ts))
(let ((str (or (read-line (ts-sock ts) nil nil)
(return-from ts-pull-next nil))))
(declare (type simple-string str))
(when kill
(dolist (ch (to-list kill))
(setq str (nsubstitute #\Space ch str))))
;; ' .. ' is an error and
;; (nsubstitute #\space #\. str) breaks floats, so we have to be smart
(do ((beg -1) (len (1- (length str))))
((or (>= beg len)
(null (setq beg (position #\. str :start (1+ beg))))))
(declare (type (signed-byte 21) beg len))
(if (or (and (plusp beg) (alphanumericp (schar str (1- beg))))
(and (< beg len) (alphanumericp (schar str (1+ beg)))))
(incf beg) (setf (schar str beg) #\Space)))
(if concat-p
(setf (ts-buff ts) (concatenate 'string (ts-buff ts) str))
(setf (ts-posn ts) 0 (ts-buff ts) str))))
(defun read-next (ts &key errorp (kill *ts-kill*) skip)
"Read the next something from TS - a text stream."
(declare (type text-stream ts) (type (or null function) skip))
(loop :with *package* = +kwd+ :and tok :and pos
:and *read-default-float-format* = 'double-float
:when (and (or (typep pos 'error)
(>= (ts-posn ts) (length (ts-buff ts))))
(null (ts-pull-next ts nil kill))) ; (typep pos 'error)
:do (if (typep pos 'error) (error pos)
(if errorp (error "EOF on ~a" ts)
(return-from read-next +eof+)))
:do (setf (values tok pos)
(ignore-errors (read-from-string (ts-buff ts) nil +eof+
:start (ts-posn ts))))
:unless (typep pos 'error) :do (setf (ts-posn ts) pos)
:unless (or (typep pos 'error) (eq +eof+ tok))
:return (if (and skip (funcall skip tok))
(read-next ts :errorp errorp :kill kill :skip skip)
tok)))
(defun ts-skip-scripts (ts)
"Read from the text stream one script."
(declare (type text-stream ts))
(do ((tok (read-next ts) (read-next ts)))
((and (html-tag-p tok) (eq :script (car (html-tag-data tok))))))
(do (pos)
((setq pos (search "" (ts-buff ts) :test #'char-equal))
(setf (ts-buff ts) (subseq (ts-buff ts) (+ pos (length "")))))
(ts-pull-next ts)))
(defun next-token (ts &key (num 1) type dflt (kill *ts-kill*))
"Get the next NUM-th non-tag token from the HTML stream TS."
(declare (type text-stream ts) (type index-t num))
(let (tt)
(dotimes (ii num)
(declare (type index-t ii))
(do () ((not (html-tag-p (setq tt (read-next ts :errorp t :kill kill))))
(mesg :log t "~d token (~s): ~s~%" ii (type-of tt) tt))
(mesg :log t "tag: ~s~%" tt)))
(if (and type (not (typep tt type))) dflt tt)))
(defun next-number (ts &key (num 1) (kill *ts-kill*))
"Get the next NUM-th number from the HTML stream TS."
(declare (type text-stream ts) (type index-t num))
(pushnew #\% kill :test #'char=)
(let (tt)
(dotimes (ii num)
(declare (type index-t ii))
(do () ((numberp (setq tt (next-token ts :kill kill)))))
(mesg :log t "~d - number: ~a~%" ii tt))
(mesg :log t " -><- number: ~a~%" tt)
tt))
(defun skip-tokens (ts end &key (test #'eql) (key #'identity) kill)
"Skip tokens until END, i.e., until (test (key token) end) is T."
(declare (type text-stream ts))
(do (tt) ((funcall test (setq tt (funcall key (next-token ts :kill kill)))
end)
tt)))
;;;###autoload
(defun dump-url-tokens (url &key (fmt "~3d: ~a~%")
(out *standard-output*) (err *error-output*)
((:max-retry *url-max-retry*) *url-max-retry*))
"Dump the URL token by token.
See `dump-url' about the optional parameters.
This is mostly a debugging function, to be called interactively."
(declare (stream out) (simple-string fmt))
(setq url (url url))
(with-open-url (sock url :rt *html-readtable* :err err)
(do (rr (ii 0 (1+ ii)) (ts (make-text-stream :sock sock)))
((eq +eof+ (setq rr (read-next ts))))
(declare (type index-t ii))
(format out fmt ii rr))))
;;;###autoload
(defun xml-read-from-url (url &key (repeat t)
(reset-ent (xml-default-reset-entities))
(resolve-namespaces *xml-read-balanced*)
(out *standard-output*)
((:max-retry *url-max-retry*) *url-max-retry*)
((:timeout *url-timeout*) *url-timeout*))
"Read all XML objects from the stream."
(when reset-ent (xml-init-entities :out out))
(let ((obj (with-open-url (sock url :err out)
(http-parse-header sock :out out)
(with-xml-input (xin sock)
(read-from-stream xin :repeat repeat)))))
(if resolve-namespaces
(xml-resolve-namespaces obj :out out)
obj)))
;;;}}}
(provide :cllib-html)
;;; file html.lisp ends here