;;; Extracting Links from an HTML File
;;;
;;; This example of reading HTML documents was originally from:
;;; http://developer.java.sun.com/developer/TechTips/1999/tt0923.html#tip1
;;;
;;; This version collects broken links.  For example (the example assumes
;;; java.util.HashSet has been imported):
'(walk-links (URL. "http://groupscheme.sourceforge.net") collect-broken-links '() 2 (HashSet.))

(load "elf/basic.scm")
(import "java.io.FileReader")
(import "java.io.InputStreamReader")
(import "java.lang.Boolean")
(import "java.net.URL")
(import "java.net.URLConnection")
(import "javax.swing.text.Document")
(import "javax.swing.text.Element")
(import "javax.swing.text.ElementIterator")
(import "javax.swing.text.SimpleAttributeSet")
(import "javax.swing.text.html.HTML")
(import "javax.swing.text.html.HTML$Attribute")
(import "javax.swing.text.html.HTML$Tag")
(import "javax.swing.text.html.HTMLEditorKit")
(import "jlib.ExceptionHandler")

;;; Provide an HTTPS url parser.
;;; The proper jars must be inserted in jre/lib/ext for this to actually work.
(System.setProperty "java.protocol.handler.pkgs"
                    "com.sun.net.ssl.internal.www.protocol")

;;; Open a connection to URL and return an InputStreamReader on its content.
;;; Returns #null if anything throws while connecting or opening the stream.
;;; The caller owns the returned reader and should pass it to closeReader.
(define (getReader url)
  (tryCatch (InputStreamReader. (.getInputStream (.openConnection url)))
            (lambda (e) #null)))

;;; Close a reader obtained from getReader.  A #null reader is ignored and
;;; any exception raised by .close is swallowed: closing is best-effort
;;; cleanup and must not abort a crawl.
(define (closeReader rd)
  (if (not (isNull rd))
      (tryCatch (.close rd)
                (lambda (e) #null))))

;;; Walk the <a href=...> links reachable from URL, at most LEVEL pages deep.
;;;
;;;   url    - java.net.URL of the page to read.
;;;   how    - procedure (how page href so-far) called once for every link
;;;            not seen before; its result becomes the new accumulator.
;;;   so-far - the accumulator threaded through every call of HOW.
;;;   level  - remaining depth; nothing is read when it is not positive.
;;;   set    - collection of URLs already visited (queried with .contains,
;;;            extended with .add); it is mutated.
;;;            NOTE(review): java.net.URL equality may resolve host names,
;;;            so membership tests on a HashSet of URLs can block on DNS.
;;;
;;; Returns the final accumulator.  Links whose href starts with
;;; "javascript:" or "mailto:" are ignored, as are hrefs that cannot be
;;; turned into a URL.
(define (walk-links url how so-far level set)

  ;; Parse the HTML available from RD into DOC.  Returns #t on success;
  ;; on any exception prints it and returns #f.
  (define (read-doc kit rd doc)
    (tryCatch (begin (.read kit rd doc 0) #t)
              (lambda (e) (print e) #f)))

  ;; Resolve HREF relative to the current page.  An href the URL
  ;; constructor rejects (e.g. an unknown scheme) is printed and yields
  ;; #null instead of aborting the whole walk.
  (define (resolve-link href)
    (tryCatch (URL. url href)
              (lambda (e) (print e) #null)))

  ;; Examine one document element; if it carries an anchor with a usable
  ;; href that has not been visited, record it, report it to HOW and
  ;; recurse into it with one less level.
  (define (handle-element elem)
    (let ((s (.getAttribute (.getAttributes elem) HTML$Tag.A$)))
      (if (not (isNull s))
          (let ((href (.getAttribute s HTML$Attribute.HREF$)))
            (if (and (not (isNull href))
                     (not (= (.indexOf href "javascript:") 0))
                     (not (= (.indexOf href "mailto:") 0)))
                (let ((u (resolve-link href)))
                  (if (not (isNull u))
                      (begin
                        (print u)
                        (if (not (.contains set u))
                            (begin
                              (.add set u)
                              (set! so-far (how url href so-far))
                              (set! so-far (walk-links u how so-far (- level 1) set))))))))))))

  (if (> level 0)
      (let* ((kit (HTMLEditorKit.))
             (doc (.createDefaultDocument kit))
             (rd (getReader url)))
        (if (not (isNull rd))
            (begin
              ;; KRA 28FEB02:
              ;; The Document class does not yet handle charsets properly.
              (.putProperty doc "IgnoreCharsetDirective" Boolean.TRUE$)
              ;; Parse the HTML, then release the reader before recursing
              ;; so that open streams do not pile up along the crawl.
              (let ((parsed (read-doc kit rd doc)))
                (closeReader rd)
                (if parsed
                    ;; Iterate through the elements of the HTML document.
                    (iterate (ElementIterator. doc) handle-element)))))))
  so-far)

;;; A HOW procedure for walk-links that gathers links which cannot be opened.
;;;
;;; Resolves LINK against PAGE and tries to open it with getReader.  When
;;; that fails, the value returned by (print (list page link)) is consed
;;; onto SO-FAR (presumably the (page link) list itself -- confirm against
;;; print's definition); otherwise SO-FAR is returned unchanged.  Prints
;;; PAGE and a "." progress mark for every link checked.
(define (collect-broken-links page link so-far)
  (print page)
  (let* ((u (URL. page link))
         (r (getReader u)))
    (display ".")
    (if (isNull r)
        (cons (print (list page link)) so-far)
        (begin
          ;; The reader was opened only to probe the link; release it.
          (closeReader r)
          so-far))))