crawl/crawler/crawl.go

package crawler

import (
	"io"
	"log"
	"net/http"
	"net/url"
	"strings"
	"sync"

	"golang.org/x/net/html"

	"github.com/hellerve/crawl/pretty"
)

// Visited is the global list of visited links.
// This bit of global state makes the function interfaces cleaner.
// As long as we take care of locking, this should be fine.
//
// Another consideration is that we currently only keep track of one level of
// nesting. If we made it a “real” sitemap with a graph or some such, this
// would be way more fancy.
var Visited = struct {
	sync.RWMutex
	visited map[string]bool
}{visited: make(map[string]bool)}
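
// A minimal sketch of the lock discipline callers inside this package are
// expected to follow (markVisited is hypothetical and not part of this file):
//
//	func markVisited(link string) (seen bool) {
//		Visited.Lock()
//		defer Visited.Unlock()
//		seen = Visited.visited[link]
//		Visited.visited[link] = true
//		return seen
//	}
//
// Reads that never mutate the map can use Visited.RLock()/RUnlock() instead.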

// visitNode inspects the current node and, if it contains a link we haven't
// visited yet, spawns a goroutine for it. It also returns that link, because
// we have to add it to our list of linked nodes.
func visitNode(node *html.Node, parent, current string, wg *sync.WaitGroup) (*string, error) {
	var val string
	if node.Type == html.ElementNode && node.Data == "a" {
		for _, a := range node.Attr {
			if a.Key != "href" {
				continue
			}
			parsedUrl, err := url.Parse(a.Val)
			if err != nil {
				return nil, err
			}
			if parsedUrl.IsAbs() && !strings.HasPrefix(a.Val, parent) {
				continue
			}
			currentUrl, err := url.Parse(current)
			if err != nil {
				return nil, err
			}
			val = currentUrl.ResolveReference(parsedUrl).String()
			// Check and mark under a single write lock, so two goroutines
			// cannot both decide to crawl the same link.
			Visited.Lock()
			seen := Visited.visited[val]
			Visited.visited[val] = true
			Visited.Unlock()
			if !seen {
				// Add to the WaitGroup before spawning, so Wait cannot
				// observe a zero counter while this crawl is still pending.
				wg.Add(1)
				go doCrawl(val, parent, wg)
			}
		}
	}
	if val == "" {
		// No in-scope link on this node.
		return nil, nil
	}
	return &val, nil
}
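
// For reference, url.ResolveReference is what turns a relative href into an
// absolute one; a rough standalone sketch (the URLs are made up):
//
//	base, _ := url.Parse("https://example.com/blog/")
//	ref, _ := url.Parse("post.html")
//	abs := base.ResolveReference(ref).String()
//	// abs == "https://example.com/blog/post.html"
//
// Absolute hrefs that point outside the parent are filtered out earlier by
// the strings.HasPrefix check.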

// parseNode parses a single node. It is recursive, and will first be called
// with the whole document node. We do a lot of appends, which is kind of
// yucky, but we don't know the number of links we will encounter yet.
func parseNode(node *html.Node, parent, current string, wg *sync.WaitGroup) ([]string, error) {
	links := []string{}
	val, err := visitNode(node, parent, current, wg)
	if err != nil {
		return nil, err
	}
	if val != nil {
		links = append(links, *val)
	}
	for c := node.FirstChild; c != nil; c = c.NextSibling {
		newLinks, err := parseNode(c, parent, current, wg)
		if err != nil {
			return nil, err
		}
		links = append(links, newLinks...)
	}
	return links, nil
}
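
// parseNode leans on the node layout of golang.org/x/net/html, which looks
// roughly like this (abridged from that package's docs):
//
//	type Node struct {
//		Parent, FirstChild, LastChild, PrevSibling, NextSibling *Node
//		Type NodeType
//		Data string
//		Attr []Attribute
//		// ...
//	}
//
// Walking FirstChild/NextSibling as above visits every node in the document.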

// parseRequest takes a single request body and parses it. It then calls
// parseNode, which recursively looks through the document. If the body cannot
// be parsed as HTML, the error is logged and the page is skipped.
func parseRequest(body io.ReadCloser, parent, url string, wg *sync.WaitGroup) {
	defer body.Close()
	document, err := html.Parse(body)
	if err != nil {
		log.Println(err)
		return
	}
	links, err := parseNode(document, parent, url, wg)
	if err != nil {
		log.Println(err)
		return
	}
	// this can look weird with concurrent printing, but oh well. I'm not sure
	// it's worth it to make this linear for now.
	pretty.Print(url, links)
}
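
// parseRequest can be fed a canned page by wrapping an in-memory reader in an
// io.ReadCloser, e.g. from a test (the HTML and URLs here are made up; any
// links that are found will still be fetched over the network):
//
//	var wg sync.WaitGroup
//	page := `<html><body><a href="/about">about</a></body></html>`
//	parseRequest(io.NopCloser(strings.NewReader(page)),
//		"https://example.com", "https://example.com", &wg)
//	wg.Wait()
//
// io.NopCloser requires Go 1.16 or newer; before that it lived in ioutil.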

// doCrawl is the actual crawler. It keeps track of what to visit currently and
// what our base URL was. We also keep track of a WaitGroup to make sure we
// don't exit prematurely, since this is all concurrent. Callers must call
// wg.Add(1) before doCrawl starts, so wg.Wait() cannot return while a crawl
// is still pending.
func doCrawl(toVisit string, parent string, wg *sync.WaitGroup) {
	defer wg.Done()
	resp, err := http.Get(toVisit)
	if err != nil {
		log.Println(err)
		return
	}
	Visited.Lock()
	Visited.visited[toVisit] = true
	Visited.Unlock()
	parseRequest(resp.Body, parent, toVisit, wg)
}
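
// The WaitGroup pattern used throughout this file, in isolation (urls and
// worker are hypothetical):
//
//	var wg sync.WaitGroup
//	for _, u := range urls {
//		wg.Add(1) // always before the goroutine starts
//		go func(u string) {
//			defer wg.Done()
//			worker(u)
//		}(u)
//	}
//	wg.Wait() // blocks until every Done has run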

// Crawl is our actual crawler. It creates a WaitGroup and calls the function
// that actually does the work (doCrawl).
func Crawl(toCrawl string) {
	var wg sync.WaitGroup
	wg.Add(1)
	doCrawl(toCrawl, toCrawl, &wg)
	wg.Wait()
}
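
// Example use from another package (the module path is assumed to match the
// import of the pretty package above):
//
//	package main
//
//	import "github.com/hellerve/crawl/crawler"
//
//	func main() {
//		crawler.Crawl("https://example.com")
//	}
//
// Crawl blocks until every page reachable under the start URL has been
// visited and printed.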