package crawler

import (
	"io"
	"log"
	"net/http"
	"net/url"
	"strings"
	"sync"

	"golang.org/x/net/html"

	"github.com/hellerve/crawl/pretty"
)
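
// Note that golang.org/x/net/html lives outside the standard library; it is
// typically fetched with `go get golang.org/x/net/html` before building.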

// Visited is the global list of visited links.
// This bit of global state makes the function interfaces cleaner.
// As long as we take care of locking this should be fine.
//
// Another consideration is that we currently only keep track of one level of
// nesting. If we made it a “real” sitemap with a graph or somesuch, this would
// be way more fancy.
var Visited = struct {
	sync.RWMutex
	visited map[string]bool
}{visited: make(map[string]bool)}

// visitNode inspects the current node and, if it contains a link we haven’t
// visited yet, spawns a goroutine for it. It also returns that link, because
// we have to add it to our list of linked nodes.
func visitNode(node *html.Node, parent, current string, wg *sync.WaitGroup) (*string, error) {
	var val *string
	if node.Type == html.ElementNode && node.Data == "a" {
		for _, a := range node.Attr {
			if a.Key != "href" {
				continue
			}
			parsedUrl, err := url.Parse(a.Val)

			if err != nil {
				return nil, err
			}

			// skip absolute links that point outside the site we started on
			if parsedUrl.IsAbs() && !strings.HasPrefix(a.Val, parent) {
				continue
			}

			currentUrl, err := url.Parse(current)

			if err != nil {
				return nil, err
			}

			resolved := currentUrl.ResolveReference(parsedUrl).String()
			val = &resolved

			Visited.RLock()
			seen := Visited.visited[resolved]
			Visited.RUnlock()

			if !seen {
				Visited.Lock()
				// re-check under the write lock so two goroutines racing on
				// the same link don’t both crawl it
				if !Visited.visited[resolved] {
					Visited.visited[resolved] = true
					Visited.Unlock()
					wg.Add(1)
					go doCrawl(resolved, parent, wg)
				} else {
					Visited.Unlock()
				}
			}
		}
	}
	return val, nil
}

// parseNode parses a single node. It is recursive, and will first be called
// with the whole document node. We do a lot of appends, which is kind of yucky,
// but we don’t know the number of links we will encounter yet.
func parseNode(node *html.Node, parent, current string, wg *sync.WaitGroup) ([]string, error) {
	links := []string{}
	val, err := visitNode(node, parent, current, wg)

	if err != nil {
		return nil, err
	}

	if val != nil {
		links = append(links, *val)
	}

	for c := node.FirstChild; c != nil; c = c.NextSibling {
		newLinks, err := parseNode(c, parent, current, wg)

		if err != nil {
			return nil, err
		}
		links = append(links, newLinks...)
	}
	return links, nil
}

// parseRequest takes a single request body and parses it. It will then call
// parseNode, which recursively looks through the document. If the body cannot
// be read or parsed, we log the error and return.
func parseRequest(body io.ReadCloser, parent, url string, wg *sync.WaitGroup) {
	document, err := html.Parse(body)
	defer body.Close()

	if err != nil {
		log.Println(err)
		return
	}

	links, err := parseNode(document, parent, url, wg)

	if err != nil {
		log.Println(err)
		return
	}

	// this can look weird with concurrent printing, but oh well. I’m not sure
	// it’s worth it to make this linear for now.
	pretty.Print(url, links)
}

// doCrawl is the actual crawler. It keeps track of what to visit currently and
// what our base URL was. We also keep track of a WaitGroup to make sure we
// don’t exit prematurely, since this is all concurrent. The caller increments
// the WaitGroup before handing a URL to us; we only mark it done.
func doCrawl(toVisit string, parent string, wg *sync.WaitGroup) {
	defer wg.Done()

	resp, err := http.Get(toVisit)

	if err != nil {
		log.Println(err)
		return
	}

	Visited.Lock()
	Visited.visited[toVisit] = true
	Visited.Unlock()

	parseRequest(resp.Body, parent, toVisit, wg)
}

// Crawl is our actual crawler. It creates a WaitGroup and calls the function
// that actually does the work (doCrawl).
func Crawl(toCrawl string) {
	var wg sync.WaitGroup
	wg.Add(1)
	doCrawl(toCrawl, toCrawl, &wg)
	wg.Wait()
}
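
// Usage sketch (assuming this package lives at github.com/hellerve/crawl/crawler;
// the start URL below is just a placeholder, not something the package requires):
//
//	package main
//
//	import "github.com/hellerve/crawl/crawler"
//
//	func main() {
//		crawler.Crawl("https://example.com")
//	}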