2018-05-22 19:24:19 +02:00
commit 546188fb7d
5 changed files with 180 additions and 0 deletions

crawler/crawl.go Normal file

@@ -0,0 +1,143 @@
package crawler

import (
	"io"
	"log"
	"net/http"
	"net/url"
	"strings"
	"sync"

	"github.com/hellerve/crawl/pretty"
	"golang.org/x/net/html"
)

// Visited is the global set of links we have already visited.
// This bit of global state makes the function interfaces cleaner.
// As long as we take care of locking, this should be fine.
//
// Another consideration is that we currently only keep track of one level of
// nesting. If we made it a “real” sitemap with a graph or somesuch, this would
// be way more fancy.
var Visited = struct {
	sync.RWMutex
	visited map[string]bool
}{visited: make(map[string]bool)}

// visitNode inspects the current node and, if it contains a link we haven't
// visited yet, spawns a goroutine for it. It also returns that link, because
// we have to add it to our list of linked nodes.
func visitNode(node *html.Node, parent, current string, wg *sync.WaitGroup) (*string, error) {
	var val *string
	if node.Type == html.ElementNode && node.Data == "a" {
		for _, a := range node.Attr {
			if a.Key != "href" {
				continue
			}
			parsedUrl, err := url.Parse(a.Val)
			if err != nil {
				return nil, err
			}
			// skip absolute links that point away from the site we started on
			if parsedUrl.IsAbs() && !strings.HasPrefix(a.Val, parent) {
				continue
			}
			currentUrl, err := url.Parse(current)
			if err != nil {
				return nil, err
			}
			resolved := currentUrl.ResolveReference(parsedUrl).String()
			val = &resolved
			Visited.RLock()
			if !Visited.visited[resolved] {
				Visited.RUnlock()
				Visited.Lock()
				// re-check under the write lock: another goroutine may have
				// claimed this link while we were upgrading from the read lock
				if !Visited.visited[resolved] {
					Visited.visited[resolved] = true
					// add to the WaitGroup before spawning, so wg.Wait cannot
					// return while this crawl is still pending
					wg.Add(1)
					go doCrawl(resolved, parent, wg)
				}
				Visited.Unlock()
			} else {
				Visited.RUnlock()
			}
		}
	}
	return val, nil
}

// parseNode parses a single node. It is recursive, and will first be called
// with the whole document node. We do a lot of appends, which is kind of yucky,
// but we don't know the number of links we will encounter yet.
func parseNode(node *html.Node, parent, current string, wg *sync.WaitGroup) ([]string, error) {
	links := []string{}
	val, err := visitNode(node, parent, current, wg)
	if err != nil {
		return nil, err
	}
	if val != nil {
		links = append(links, *val)
	}
	for c := node.FirstChild; c != nil; c = c.NextSibling {
		newLinks, err := parseNode(c, parent, current, wg)
		if err != nil {
			return nil, err
		}
		links = append(links, newLinks...)
	}
	return links, nil
}

// parseRequest takes a single request body and parses it. It will then call
// parseNode, which recursively looks through the document. If the body is not
// HTML, this will error.
func parseRequest(body io.ReadCloser, parent, url string, wg *sync.WaitGroup) {
	defer body.Close()
	document, err := html.Parse(body)
	if err != nil {
		log.Println(err)
		return
	}
	links, err := parseNode(document, parent, url, wg)
	if err != nil {
		log.Println(err)
		return
	}
	// this can look weird with concurrent printing, but oh well. I'm not sure
	// it's worth it to make this linear for now.
	pretty.Print(url, links)
}

// doCrawl is the actual crawler. It keeps track of what to visit currently and
// what our base URL was. We also keep track of a WaitGroup to make sure we
// don't exit prematurely, since this is all concurrent. Callers must call
// wg.Add(1) before starting doCrawl, so that wg.Wait cannot return early.
func doCrawl(toVisit string, parent string, wg *sync.WaitGroup) {
	defer wg.Done()
	resp, err := http.Get(toVisit)
	if err != nil {
		log.Println(err)
		return
	}
	// mark the URL itself as visited; this matters for the root URL, which is
	// never seen by visitNode
	Visited.Lock()
	Visited.visited[toVisit] = true
	Visited.Unlock()
	parseRequest(resp.Body, parent, toVisit, wg)
}

// Crawl is our actual crawler. It creates a WaitGroup and calls the function
// that actually does the work (doCrawl).
func Crawl(toCrawl string) {
	var wg sync.WaitGroup
	wg.Add(1)
	doCrawl(toCrawl, toCrawl, &wg)
	wg.Wait()
}
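
Not part of this file, but as a rough sketch of how the package might be driven: a minimal main package that takes the start URL from the command line. The import path github.com/hellerve/crawl/crawler is an assumption inferred from the pretty import above.

package main

import (
	"log"
	"os"

	"github.com/hellerve/crawl/crawler" // assumed import path for package crawler
)

func main() {
	if len(os.Args) < 2 {
		log.Fatal("usage: crawl <url>")
	}
	// Crawl blocks until every spawned goroutine has finished.
	crawler.Crawl(os.Args[1])
}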