package crawler

import (
	"io"
	"log"
	"net/http"
	"net/url"
	"strings"
	"sync"

	"golang.org/x/net/html"

	"github.com/hellerve/crawl/pretty"
)

// Visited is the global list of visited links.
// This bit of global state makes the function interfaces cleaner.
// As long as we take care of locking, this should be fine.
//
// Another consideration is that we currently only keep track of one level of
// nesting. If we made it a “real” sitemap with a graph or somesuch, this would
// be way more fancy.
var Visited = struct {
	sync.RWMutex
	visited map[string]bool
}{visited: make(map[string]bool)}

// visitNode inspects the current node and, if it contains a link we haven’t
// visited yet, spawns a goroutine for it. It also returns that link, because
// we have to add it to our list of linked nodes; if the node doesn’t contain a
// usable link, it returns nil.
func visitNode(node *html.Node, parent, current string, wg *sync.WaitGroup) (*string, error) {
	var val string

	if node.Type == html.ElementNode && node.Data == "a" {
		for _, a := range node.Attr {
			if a.Key != "href" {
				continue
			}

			parsedUrl, err := url.Parse(a.Val)
			if err != nil {
				return nil, err
			}

			// skip absolute links that point outside of our base URL
			if parsedUrl.IsAbs() && !strings.HasPrefix(a.Val, parent) {
				continue
			}

			currentUrl, err := url.Parse(current)
			if err != nil {
				return nil, err
			}

			val = currentUrl.ResolveReference(parsedUrl).String()

			// check and mark in one critical section, so that two goroutines
			// can’t both decide to crawl the same link
			Visited.Lock()
			if !Visited.visited[val] {
				Visited.visited[val] = true
				Visited.Unlock()

				// Add before spawning, so the WaitGroup counter can’t drop to
				// zero while this child is still being scheduled.
				wg.Add(1)
				go doCrawl(val, parent, wg)
			} else {
				Visited.Unlock()
			}
		}
	}

	if val == "" {
		return nil, nil
	}

	return &val, nil
}

// parseNode parses a single node. It is recursive, and will first be called
// with the whole document node. We do a lot of appends, which is kind of yucky,
// but we don’t know the number of links we will encounter yet.
func parseNode(node *html.Node, parent, current string, wg *sync.WaitGroup) ([]string, error) {
	links := []string{}

	val, err := visitNode(node, parent, current, wg)
	if err != nil {
		return nil, err
	}

	if val != nil {
		links = append(links, *val)
	}

	for c := node.FirstChild; c != nil; c = c.NextSibling {
		newLinks, err := parseNode(c, parent, current, wg)
		if err != nil {
			return nil, err
		}

		links = append(links, newLinks...)
	}

	return links, nil
}

// parseRequest takes a single request body and parses it. It then calls
// parseNode, which recursively looks through the document. If the body is not
// HTML, this will error.
func parseRequest(body io.ReadCloser, parent, url string, wg *sync.WaitGroup) {
	defer body.Close()

	document, err := html.Parse(body)
	if err != nil {
		log.Println(err)
		return
	}

	links, err := parseNode(document, parent, url, wg)
	if err != nil {
		log.Println(err)
		return
	}

	// this can look weird with concurrent printing, but oh well. I’m not sure
	// it’s worth it to make this linear for now.
	pretty.Print(url, links)
}

// doCrawl is the actual crawler. It keeps track of what to visit currently and
// what our base URL was. We also keep track of a WaitGroup to make sure we
// don’t exit prematurely, since this is all concurrent.
//
// Callers must call wg.Add(1) before starting doCrawl, so the counter never
// reaches zero while more work is still being scheduled.
func doCrawl(toVisit string, parent string, wg *sync.WaitGroup) {
	defer wg.Done()

	resp, err := http.Get(toVisit)
	if err != nil {
		log.Println(err)
		return
	}

	// mark the URL as visited; for the root URL this is the first time it
	// gets recorded
	Visited.Lock()
	Visited.visited[toVisit] = true
	Visited.Unlock()

	parseRequest(resp.Body, parent, toVisit, wg)
}

// Crawl is the public entry point. It creates a WaitGroup and calls the
// function that actually does the work (doCrawl).
func Crawl(toCrawl string) {
	var wg sync.WaitGroup

	// account for the initial, synchronous doCrawl call
	wg.Add(1)
	doCrawl(toCrawl, toCrawl, &wg)

	wg.Wait()
}
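// Example usage (a minimal sketch, not part of the original package; it
// assumes this package is importable as github.com/hellerve/crawl/crawler and
// uses https://example.com as a stand-in root URL):
//
//	package main
//
//	import "github.com/hellerve/crawl/crawler"
//
//	func main() {
//		// Crawl blocks until every reachable page under the root URL has
//		// been fetched and its links printed via the pretty package.
//		crawler.Crawl("https://example.com")
//	}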