package crawler

import (
	"io"
	"log"
	"net/http"
	"net/url"
	"strings"
	"sync"

	"github.com/hellerve/crawl/pretty"
	"golang.org/x/net/html"
)

// Visited is the global list of visited links.
// This bit of global state makes the function interfaces cleaner.
// As long as we take care of locking this should be fine.
//
// Another consideration is that we currently only keep track of one level of
// nesting. If we made it a “real” sitemap with a graph or somesuch, this would
// be way more fancy.
var Visited = struct {
	sync.RWMutex
	visited map[string]bool
}{visited: make(map[string]bool)}

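// A sketch of the intended lock discipline (the URL here is only an
// illustration, nothing the crawler special-cases): take the read lock to
// check membership and the write lock to record a link.
//
//	Visited.RLock()
//	seen := Visited.visited["https://example.com/about"]
//	Visited.RUnlock()
//
//	Visited.Lock()
//	Visited.visited["https://example.com/about"] = true
//	Visited.Unlock()
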
// visitNode inspects the current node and, if it contains a link we haven’t
// visited yet, spawns a goroutine for it. It also returns that link, because
// we have to add it to our list of linked nodes.
func visitNode(node *html.Node, parent, current string, wg *sync.WaitGroup) (*string, error) {
	var val *string
	if node.Type == html.ElementNode && node.Data == "a" {
		for _, a := range node.Attr {
			if a.Key != "href" {
				continue
			}
			parsedUrl, err := url.Parse(a.Val)
			if err != nil {
				return nil, err
			}
			if parsedUrl.IsAbs() && !strings.HasPrefix(a.Val, parent) {
				continue
			}
			currentUrl, err := url.Parse(current)
			if err != nil {
				return nil, err
			}
			// ResolveReference turns relative hrefs into absolute URLs.
			resolved := currentUrl.ResolveReference(parsedUrl).String()
			val = &resolved
			Visited.RLock()
			seen := Visited.visited[resolved]
			Visited.RUnlock()
			if seen {
				continue
			}
			// Re-check under the write lock so two goroutines can’t both
			// claim the same link, and mark it before spawning the crawl.
			Visited.Lock()
			if !Visited.visited[resolved] {
				Visited.visited[resolved] = true
				Visited.Unlock()
				// Register with the WaitGroup before spawning so that
				// Crawl’s Wait can’t return while this link is pending.
				wg.Add(1)
				go doCrawl(resolved, parent, wg)
			} else {
				Visited.Unlock()
			}
		}
	}
	return val, nil
}
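
// For reference, ResolveReference follows the standard URL resolution rules.
// The values below are purely illustrative and not taken from the crawler:
//
//	base, _ := url.Parse("https://example.com/blog/post")
//	ref, _ := url.Parse("/about")
//	base.ResolveReference(ref).String() // "https://example.com/about"
//	ref, _ = url.Parse("other")
//	base.ResolveReference(ref).String() // "https://example.com/blog/other"
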
// parseNode parses a single node. It is recursive, and will first be called
// with the whole document node. We do a lot of appends, which is kind of yucky,
// but we don’t know the number of links we will encounter yet.
func parseNode(node *html.Node, parent, current string, wg *sync.WaitGroup) ([]string, error) {
	links := []string{}
	val, err := visitNode(node, parent, current, wg)
	if err != nil {
		return nil, err
	}
	if val != nil {
		links = append(links, *val)
	}
	for c := node.FirstChild; c != nil; c = c.NextSibling {
		newLinks, err := parseNode(c, parent, current, wg)
		if err != nil {
			return nil, err
		}
		links = append(links, newLinks...)
	}
	return links, nil
}

// parseRequest takes a single request body and parses it. It will then call
// parseNode, which recursively looks through the document. If the body cannot
// be parsed or walked, we log the error and bail.
func parseRequest(body io.ReadCloser, parent, url string, wg *sync.WaitGroup) {
	defer body.Close()
	document, err := html.Parse(body)
	if err != nil {
		log.Println(err)
		return
	}
	links, err := parseNode(document, parent, url, wg)
	if err != nil {
		log.Println(err)
		return
	}
	// this can look weird with concurrent printing, but oh well. I’m not sure
	// it’s worth it to make this linear for now.
	pretty.Print(url, links)
}

// doCrawl is the actual crawler. It keeps track of what to visit currently and
// what our base URL was. We also keep track of a WaitGroup to make sure we
// don’t exit prematurely, since this is all concurrent.
func doCrawl(toVisit string, parent string, wg *sync.WaitGroup) {
	// The caller registers this crawl with the WaitGroup; we only signal
	// completion here.
	defer wg.Done()
	resp, err := http.Get(toVisit)
	if err != nil {
		log.Println(err)
		return
	}
	// Mark the URL as visited. Links found during parsing are already marked
	// in visitNode, but the initial URL handed to Crawl still needs this.
	Visited.Lock()
	Visited.visited[toVisit] = true
	Visited.Unlock()
	parseRequest(resp.Body, parent, toVisit, wg)
}

// Crawl is our entry point. It creates a WaitGroup, registers the initial
// crawl with it, and calls the function that actually does the work (doCrawl).
func Crawl(toCrawl string) {
	var wg sync.WaitGroup
	wg.Add(1)
	doCrawl(toCrawl, toCrawl, &wg)
	wg.Wait()
}
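
// A minimal usage sketch. The import path assumes this package lives in the
// crawler/ directory of github.com/hellerve/crawl; the URL is a placeholder:
//
//	package main
//
//	import "github.com/hellerve/crawl/crawler"
//
//	func main() {
//		crawler.Crawl("https://example.com")
//	}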