#lang scheme (require 2htdp/batch-io test-engine/scheme-tests srfi/13) (provide (all-from-out 2htdp/batch-io)) (require xml/xml net/url net/head htdp/error html) (provide url-exists? ;; String -> Boolean ;; false, if this url returns a '404'; true otherwise url-html-neighbors ;; String -> [Listof String] ;; produce the list of (fully resolve) .html references in a elements from url xexpr-as-string ;; Xexpr -> String ;; turn the X-expression into a string read-xexpr ;; String -> Xexpr ;; given a file name (must exist in the same directory as the program), ;; read the ((first piece of) XML in the file and turn into an X-expression read-xexpr/web ;; String -> Xexpr or false ;; locate the FIRST xml element at the given string-url, produce as an Xexpr ;; false if the web page isn't found ;; signals an error if the url is unreachable or if it doesn't locate any XML ) (define (url-html-neighbors u) (define x (read-xexpr/web u)) (if x (url-html-neighbors-aux u x) '())) (define (url-exists? url:string) (define URL (string->url url:string)) (with-handlers ([exn:fail:network? (lambda (e) (define msg (format "working url expected (~a)" (exn-message e))) (check-arg 'read-xexpr/web #f msg "" url:string))]) (define h (call/input-url URL get-impure-port read-line)) (not (404? h)))) (define (read-xexpr f) (define a (and (string? f) (regexp-match "\\.xml$" f) (file-exists? f))) (check-arg 'read-xexpr a "(local) file with extension .xml" "" f) (xml->xexpr (with-input-from-file f read-xml/element))) (define (xexpr-as-string x) (check-arg 'xexpr->string (and (pair? x) (xexpr? x)) 'xexpr "first" x) (call-with-output-string (curry display-xml/content (xexpr->xml x)))) (define (read-xexpr/web url:string) (define URL (string->url url:string)) (with-handlers ([exn:fail:network? (lambda (e) (define msg (format "working url expected (~a)" (exn-message e))) (check-arg 'read-xexpr/web #f msg "" url:string))]) (define h (call/input-url URL get-impure-port read-line)) (if (404? h) #f (let () (define x (call/input-url URL get-pure-port (lambda (i) (read-html-as-xml i)))) (define e (filter element? x)) (if (empty? e) #f `(div ,@(map xml->xexpr e))))))) ;; String -> Boolean ;; does the string contain "404" (define (404? s) (pair? (regexp-match "404" s))) ; (define home (read-xexpr/web "http://www.ccs.neu.edu/home/matthias/")) ; (define bad (read-xexpr/web "http://www.ccs.neu.edu/home/mf/")) #| Xexpr is one of: -- (cons Symbol (cons LOA Xbody)) -- (cons Symbol Xbody) Xbody is one of: -- empty -- (cons String Xbody) -- (cons Xexpr Xbody) LOA is one of: -- empty -- (cons (list Symbol String) LOA) Note: (list Symbol String) is called an Attribute. Furthermore, (list 'a "some text") is called an a-Attribute and "some text" is its value. |# (check-expect (url-html-neighbors-aux "http://fun.com/" '(div (a "hello") (a ((href " one.html")) "world") (a ((href "Papers/four.pdf")) "world") (img ((alt "two"))) (img ((alt "three") (src "three.jpg"))))) '("http://fun.com/one.html")) (define (url-html-neighbors-aux u xexpr) (local ((define url (string->url u))) (if (boolean? xexpr) (error "bad xexpr!") (local ((define a-el (xexpr-elements xexpr 'a)) (define href (map string-trim-both (apply append (map (loa-attr 'href) a-el)))) (define comp (map (lambda (u) (combine-url/relative url u)) href)) (define html (filter url-ends-in-html? comp))) (map url->string html))))) ;; URL -> Boolean ;; does the url end in html? (check-expect (url-ends-in-html? (string->url "seconds.html")) true) (check-expect (url-ends-in-html? (string->url "seconds.pdf")) false) (define (url-ends-in-html? u) (local ((define p (url-path u)) (define q (reverse (map path/param-path p)))) (if (empty? q) false (pair? (regexp-match ".html$" (first q)))))) ;; Xexpr ;; [Attribute -> X] ;; Y [String Y -> Y] [Xexpr Y -> Y] ;; [Xexpr [Listof X] Z -> W] [Symbol Z -> W] ;; -> ;; W (define (xexpr-abs x0 attr body0 s-combine x-combine loa-combine plain-combine) (local (;; Xexpr -> W (define (f-xexpr x) (cond [(and (cons? (rest x)) (loa? (second x))) (loa-combine x (f-loa (second x)) (f-xbody (rest (rest x))))] [else (plain-combine x (f-xbody (rest x)))])) ;; Xbody -> Z (define (f-xbody x) (cond [(empty? x) body0] [(string? (first x)) (s-combine (first x) (f-xbody (rest x)))] [(cons? (first x)) (x-combine (f-xexpr (first x)) (f-xbody (rest x)))] [else (f-xbody (rest x))])) ;; LOA -> [Listof X] (define (f-loa x) (cond [(empty? x) '()] [else (attr (first x) (f-loa (rest x)))]))) (f-xexpr x0))) ;; String or (cons Symbol Y) or empty (cons (list Symbol String) Any) ;; --> Boolean ;; is the given value a loa? (define (loa? x) (or (empty? x) (and (cons? x) (cons? (first x))))) ;; Xexpr Symbol -> [Listof Xexpr] ;; retrieve all elements whose tag is 'tag' (check-expect (xexpr-elements '(p () (p ((align "center")) "hello") (a)) 'p) (list '(p () (p ((align "center")) "hello") (a)) '(p ((align "center")) "hello"))) (define (xexpr-elements x0 tag) (local ((define (K- fst rst) rst)) (xexpr-abs x0 K- '() K- append (lambda (e loa rst) (if (symbol=? (first e) tag) (cons e rst) rst)) (lambda (e rst) (if (symbol=? (first e) tag) (cons e rst) rst))))) ;; Symbol -> (LOA -> [Listof URL]) ;; retrieve all attribute values for attr (define (loa-attr attr) (lambda (x) (local ((define (f-loa x) (cond [(empty? x) '()] [else (local ((define fst (first x)) (define rst (f-loa (rest x)))) (if (symbol=? (first fst) attr) (cons (second fst) rst) rst))]))) (cond [(and (cons? (rest x)) (loa? (rest x))) (f-loa (second x))] [else '()])))) (test)