2018-05-22 19:24:19 +02:00
commit 546188fb7d
5 changed files with 180 additions and 0 deletions

.gitignore vendored Normal file

@@ -0,0 +1 @@
crawl

README.md Normal file

@@ -0,0 +1,7 @@
# crawl
`crawl` is a simple web crawler that stays local to one domain. It uses
goroutines to parallelize the crawling.

The generated sitemap is exceedingly simple: we do not keep track of a real
graph, but rather just a shallow list of links for every page.
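
Assuming a working Go toolchain, one way to run it is `go build` followed by
`./crawl -url https://example.com/` (the `-url` flag defaults to
`https://example.com/`). Every crawled page is printed on its own line,
followed by a tab-indented list of the links found on it.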

crawler/crawl.go Normal file

@@ -0,0 +1,143 @@
package crawler

import (
	"io"
	"log"
	"net/http"
	"net/url"
	"strings"
	"sync"

	"golang.org/x/net/html"

	"github.com/hellerve/crawl/pretty"
)
// Visited is the global set of visited links.
// This bit of global state keeps the function interfaces cleaner.
// As long as we take care of locking, this should be fine.
//
// Another consideration is that we currently only keep track of one level of
// nesting. If we made it a “real” sitemap with a graph or some such, this
// would be way fancier.
var Visited = struct {
	sync.RWMutex
	visited map[string]bool
}{visited: make(map[string]bool)}
// visitNode inspects the current node and, if it contains a link we haven't
// visited yet, spawns a goroutine for it. It also returns that link, because
// we have to add it to our list of linked nodes.
func visitNode(node *html.Node, parent, current string, wg *sync.WaitGroup) (*string, error) {
	var val *string

	if node.Type == html.ElementNode && node.Data == "a" {
		for _, a := range node.Attr {
			if a.Key != "href" {
				continue
			}

			parsedUrl, err := url.Parse(a.Val)
			if err != nil {
				return nil, err
			}

			// Absolute links that leave the domain we started on are skipped.
			if parsedUrl.IsAbs() && !strings.HasPrefix(a.Val, parent) {
				continue
			}

			currentUrl, err := url.Parse(current)
			if err != nil {
				return nil, err
			}

			resolved := currentUrl.ResolveReference(parsedUrl).String()
			val = &resolved

			// Check and mark under a single write lock so that two goroutines
			// can never claim the same link.
			Visited.Lock()
			if !Visited.visited[resolved] {
				Visited.visited[resolved] = true
				Visited.Unlock()
				wg.Add(1)
				go doCrawl(resolved, parent, wg)
			} else {
				Visited.Unlock()
			}
		}
	}
	return val, nil
}
// parseNode parses a single node. It is recursive, and will first be called
// with the whole document node. We do a lot of appends, which is kind of
// yucky, but we don't know the number of links we will encounter yet.
func parseNode(node *html.Node, parent, current string, wg *sync.WaitGroup) ([]string, error) {
	links := []string{}

	val, err := visitNode(node, parent, current, wg)
	if err != nil {
		return nil, err
	}
	if val != nil {
		links = append(links, *val)
	}

	for c := node.FirstChild; c != nil; c = c.NextSibling {
		newLinks, err := parseNode(c, parent, current, wg)
		if err != nil {
			return nil, err
		}
		links = append(links, newLinks...)
	}
	return links, nil
}
// parseRequest takes a single request body and parses it. It then calls
// parseNode, which recursively walks the document. If the body is not HTML,
// this will error.
func parseRequest(body io.ReadCloser, parent, url string, wg *sync.WaitGroup) {
	defer body.Close()

	document, err := html.Parse(body)
	if err != nil {
		log.Println(err)
		return
	}

	links, err := parseNode(document, parent, url, wg)
	if err != nil {
		log.Println(err)
		return
	}

	// This can look weird with concurrent printing, but oh well. I'm not sure
	// it's worth making this linear for now.
	pretty.Print(url, links)
}
// doCrawl is the actual crawler. It keeps track of what to visit right now
// and what our base URL was. We also carry a WaitGroup so we don't exit
// prematurely, since this is all concurrent: the caller increments it before
// starting us, and we signal completion here.
func doCrawl(toVisit string, parent string, wg *sync.WaitGroup) {
	defer wg.Done()

	resp, err := http.Get(toVisit)
	if err != nil {
		log.Println(err)
		return
	}

	Visited.Lock()
	Visited.visited[toVisit] = true
	Visited.Unlock()

	parseRequest(resp.Body, parent, toVisit, wg)
}
// Crawl is the exported entry point. It creates a WaitGroup, accounts for the
// initial page, and calls the function that actually does the work (doCrawl).
func Crawl(toCrawl string) {
	var wg sync.WaitGroup

	wg.Add(1)
	doCrawl(toCrawl, toCrawl, &wg)
	wg.Wait()
}
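
Since the whole point of the WaitGroup and the `Visited` locking is that `Crawl` only returns once every spawned goroutine is done, that behaviour can be exercised against a throwaway local server. A minimal sketch, assuming it lives next to `crawl.go` in the `crawler` package so it can read the unexported `visited` map; the handler layout and test name are illustrative and not part of this commit:

```go
package crawler

import (
	"fmt"
	"net/http"
	"net/http/httptest"
	"testing"
)

// TestCrawlVisitsLocalLinks serves one page that links to two leaf pages and
// checks that Crawl reaches both of them.
func TestCrawlVisitsLocalLinks(t *testing.T) {
	mux := http.NewServeMux()
	mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprint(w, `<html><body><a href="/a">a</a> <a href="/b">b</a></body></html>`)
	})
	mux.HandleFunc("/a", func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprint(w, `<html><body>leaf a</body></html>`)
	})
	mux.HandleFunc("/b", func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprint(w, `<html><body>leaf b</body></html>`)
	})
	srv := httptest.NewServer(mux)
	defer srv.Close()

	// Crawl blocks on its WaitGroup, so by the time it returns every spawned
	// goroutine has finished and Visited is safe to read.
	Crawl(srv.URL)

	for _, path := range []string{"/a", "/b"} {
		Visited.RLock()
		ok := Visited.visited[srv.URL+path]
		Visited.RUnlock()
		if !ok {
			t.Errorf("expected %s%s to be marked as visited", srv.URL, path)
		}
	}
}
```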

main.go Normal file

@@ -0,0 +1,14 @@
package main

import (
	"flag"

	"github.com/hellerve/crawl/crawler"
)

func main() {
	url := flag.String("url", "https://example.com/", "The URL to crawl")
	flag.Parse()

	crawler.Crawl(*url)
}

pretty/pretty.go Normal file

@@ -0,0 +1,15 @@
package pretty

import (
	"fmt"
)

// Print pretty-prints the sitemap. This is basically a stub for now, because
// it's the least interesting part of the program. If we used a graph this
// would be more fun.
func Print(url string, links []string) {
	fmt.Println(url, ":")
	for _, link := range links {
		fmt.Println("\t", link)
	}
}
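
As a quick illustration of the output format (the links below are made up), a call such as the following prints the URL followed by one tab-indented line per link:

```go
package main

import "github.com/hellerve/crawl/pretty"

func main() {
	// Prints:
	// https://example.com/ :
	//	 https://example.com/about
	//	 https://example.com/blog
	pretty.Print("https://example.com/", []string{
		"https://example.com/about",
		"https://example.com/blog",
	})
}
```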