2018-05-22 19:24:19 +02:00
commit 546188fb7d
5 changed files with 180 additions and 0 deletions

crawler/crawl.go Normal file

@@ -0,0 +1,143 @@
package crawler

import (
	"io"
	"log"
	"net/http"
	"net/url"
	"strings"
	"sync"

	"github.com/hellerve/crawl/pretty"
	"golang.org/x/net/html"
)

// Visited is the global set of links we have already visited.
// This bit of global state makes the function interfaces cleaner.
// As long as we take care of locking, this should be fine.
//
// Another consideration is that we currently only keep track of one level of
// nesting. If we made it a “real” sitemap with a graph or somesuch, this would
// be way more fancy.
var Visited = struct {
	sync.RWMutex
	visited map[string]bool
}{visited: make(map[string]bool)}

// visitNode inspects the current node and, if it contains a link we haven't
// visited yet, spawns a goroutine for it. It also returns that link, because
// we have to add it to our list of linked nodes.
func visitNode(node *html.Node, parent, current string, wg *sync.WaitGroup) (*string, error) {
	var val *string
	if node.Type == html.ElementNode && node.Data == "a" {
		for _, a := range node.Attr {
			if a.Key != "href" {
				continue
			}
			parsedUrl, err := url.Parse(a.Val)
			if err != nil {
				return nil, err
			}
			// skip absolute links that point away from the site we started on
			if parsedUrl.IsAbs() && !strings.HasPrefix(a.Val, parent) {
				continue
			}
			currentUrl, err := url.Parse(current)
			if err != nil {
				return nil, err
			}
			resolved := currentUrl.ResolveReference(parsedUrl).String()
			val = &resolved
			Visited.RLock()
			if !Visited.visited[resolved] {
				Visited.RUnlock()
				Visited.Lock()
				// re-check under the write lock: another goroutine may have
				// claimed this link while we were upgrading from the read lock
				if !Visited.visited[resolved] {
					Visited.visited[resolved] = true
					// add to the WaitGroup before spawning, so wg.Wait cannot
					// return while this crawl is still pending
					wg.Add(1)
					go doCrawl(resolved, parent, wg)
				}
				Visited.Unlock()
			} else {
				Visited.RUnlock()
			}
		}
	}
	return val, nil
}

// parseNode parses a single node. It is recursive, and will first be called
// with the whole document node. We do a lot of appends, which is kind of yucky,
// but we don't know the number of links we will encounter yet.
func parseNode(node *html.Node, parent, current string, wg *sync.WaitGroup) ([]string, error) {
	links := []string{}
	val, err := visitNode(node, parent, current, wg)
	if err != nil {
		return nil, err
	}
	if val != nil {
		links = append(links, *val)
	}
	for c := node.FirstChild; c != nil; c = c.NextSibling {
		newLinks, err := parseNode(c, parent, current, wg)
		if err != nil {
			return nil, err
		}
		links = append(links, newLinks...)
	}
	return links, nil
}

// parseRequest takes a single request body and parses it. It will then call
// parseNode, which recursively looks through the document. If the body is not
// HTML, this will error.
func parseRequest(body io.ReadCloser, parent, url string, wg *sync.WaitGroup) {
	defer body.Close()
	document, err := html.Parse(body)
	if err != nil {
		log.Println(err)
		return
	}
	links, err := parseNode(document, parent, url, wg)
	if err != nil {
		log.Println(err)
		return
	}
	// this can look weird with concurrent printing, but oh well. I'm not sure
	// it's worth it to make this linear for now.
	pretty.Print(url, links)
}

// doCrawl is the actual crawler. It keeps track of what to visit currently and
// what our base URL was. We also keep track of a WaitGroup to make sure we
// don't exit prematurely, since this is all concurrent. Callers must call
// wg.Add(1) before starting doCrawl, so that wg.Wait cannot return early.
func doCrawl(toVisit string, parent string, wg *sync.WaitGroup) {
	defer wg.Done()
	resp, err := http.Get(toVisit)
	if err != nil {
		log.Println(err)
		return
	}
	// mark the URL itself as visited; this matters for the root URL, which is
	// never seen by visitNode
	Visited.Lock()
	Visited.visited[toVisit] = true
	Visited.Unlock()
	parseRequest(resp.Body, parent, toVisit, wg)
}

// Crawl is our actual crawler. It creates a WaitGroup and calls the function
// that actually does the work (doCrawl).
func Crawl(toCrawl string) {
	var wg sync.WaitGroup
	wg.Add(1)
	doCrawl(toCrawl, toCrawl, &wg)
	wg.Wait()
}
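
Not part of this file, but as a rough sketch of how the package might be driven: a minimal main package that takes the start URL from the command line. The import path github.com/hellerve/crawl/crawler is an assumption inferred from the pretty import above.

package main

import (
	"log"
	"os"

	"github.com/hellerve/crawl/crawler" // assumed import path for package crawler
)

func main() {
	if len(os.Args) < 2 {
		log.Fatal("usage: crawl <url>")
	}
	// Crawl blocks until every spawned goroutine has finished.
	crawler.Crawl(os.Args[1])
}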