commit 546188fb7dca6c923e0df1e150a1475a1348bfb4
Author: hellerve
Date:   Tue May 22 19:24:19 2018 +0200

    initial

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d1d3445
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+crawl
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..a91d270
--- /dev/null
+++ b/README.md
@@ -0,0 +1,7 @@
+# crawl
+
+`crawl` is a simple web crawler that is local to one domain. It uses
+goroutines to parallelize the work.
+
+The generated sitemap is exceedingly simple: we do not keep track of a real
+graph, but rather just a shallow list of links for every page.
diff --git a/crawler/crawl.go b/crawler/crawl.go
new file mode 100644
index 0000000..7c15cf4
--- /dev/null
+++ b/crawler/crawl.go
@@ -0,0 +1,151 @@
+package crawler
+
+import (
+	"io"
+	"log"
+	"net/http"
+	"net/url"
+	"strings"
+	"sync"
+
+	"golang.org/x/net/html"
+
+	"github.com/hellerve/crawl/pretty"
+)
+
+// Visited is the global list of visited links.
+// This bit of global state makes the function interfaces cleaner.
+// As long as we take care of locking, this should be fine.
+//
+// Another consideration is that we currently only keep track of one level of
+// nesting. If we made it a “real” sitemap with a graph or somesuch, this would
+// be way more fancy.
+var Visited = struct {
+	sync.RWMutex
+	visited map[string]bool
+}{visited: make(map[string]bool)}
+
+// visitNode inspects the current node and, if it contains a link we haven’t
+// visited yet, spawns a goroutine for it. It also returns that link, because
+// we have to add it to our list of linked nodes. An empty string means the
+// node did not contain a link we care about.
+func visitNode(node *html.Node, parent, current string, wg *sync.WaitGroup) (string, error) {
+	var val string
+	if node.Type == html.ElementNode && node.Data == "a" {
+		for _, a := range node.Attr {
+			if a.Key != "href" {
+				continue
+			}
+			parsedUrl, err := url.Parse(a.Val)
+
+			if err != nil {
+				return "", err
+			}
+
+			if parsedUrl.IsAbs() && !strings.HasPrefix(a.Val, parent) {
+				continue
+			}
+
+			currentUrl, err := url.Parse(current)
+
+			if err != nil {
+				return "", err
+			}
+
+			val = currentUrl.ResolveReference(parsedUrl).String()
+
+			// Check and mark under a single write lock, so that two
+			// goroutines cannot both decide to crawl the same link.
+			Visited.Lock()
+			if !Visited.visited[val] {
+				Visited.visited[val] = true
+				Visited.Unlock()
+				wg.Add(1)
+				go doCrawl(val, parent, wg)
+			} else {
+				Visited.Unlock()
+			}
+		}
+	}
+	return val, nil
+}
+
+// parseNode parses a single node. It is recursive, and will first be called
+// with the whole document node. We do a lot of appends, which is kind of yucky,
+// but we don’t know the amount of links we will encounter yet.
+func parseNode(node *html.Node, parent, current string, wg *sync.WaitGroup) ([]string, error) {
+	links := []string{}
+	val, err := visitNode(node, parent, current, wg)
+
+	if err != nil {
+		return nil, err
+	}
+
+	if val != "" {
+		links = append(links, val)
+	}
+
+	for c := node.FirstChild; c != nil; c = c.NextSibling {
+		newLinks, err := parseNode(c, parent, current, wg)
+
+		if err != nil {
+			return nil, err
+		}
+		links = append(links, newLinks...)
+	}
+	return links, nil
+}
+
+// parseRequest takes a single request body and parses it. It will then call
+// parseNode, which recursively looks through the document. If the body is not
+// HTML, this will error.
+func parseRequest(body io.ReadCloser, parent, url string, wg *sync.WaitGroup) {
+	defer body.Close()
+	document, err := html.Parse(body)
+
+	if err != nil {
+		log.Println(err)
+		return
+	}
+
+	links, err := parseNode(document, parent, url, wg)
+
+	if err != nil {
+		log.Println(err)
+		return
+	}
+
+	// this can look weird with concurrent printing, but oh well. I’m not sure
+	// it’s worth it to make this linear for now.
+	pretty.Print(url, links)
+}
+
+// doCrawl is the actual crawler. It keeps track of what to visit currently and
+// what our base URL was. We also keep track of a WaitGroup to make sure we
+// don’t exit prematurely, since this is all concurrent; the caller increments
+// it before calling us, and we mark ourselves done here.
+func doCrawl(toVisit string, parent string, wg *sync.WaitGroup) {
+	defer wg.Done()
+
+	resp, err := http.Get(toVisit)
+
+	if err != nil {
+		log.Println(err)
+		return
+	}
+
+	Visited.Lock()
+	Visited.visited[toVisit] = true
+	Visited.Unlock()
+
+	parseRequest(resp.Body, parent, toVisit, wg)
+}
+
+// Crawl is our actual crawler. It creates a WaitGroup and calls the function
+// that actually does the work (doCrawl).
+func Crawl(toCrawl string) {
+	var wg sync.WaitGroup
+	wg.Add(1)
+	doCrawl(toCrawl, toCrawl, &wg)
+	wg.Wait()
+}
diff --git a/main.go b/main.go
new file mode 100644
index 0000000..45d4cf1
--- /dev/null
+++ b/main.go
@@ -0,0 +1,14 @@
+package main
+
+import (
+	"flag"
+
+	"github.com/hellerve/crawl/crawler"
+)
+
+func main() {
+	url := flag.String("url", "https://example.com/", "The URL to crawl")
+	flag.Parse()
+
+	crawler.Crawl(*url)
+}
diff --git a/pretty/pretty.go b/pretty/pretty.go
new file mode 100644
index 0000000..642f35d
--- /dev/null
+++ b/pretty/pretty.go
@@ -0,0 +1,15 @@
+package pretty
+
+import (
+	"fmt"
+)
+
+// Print will pretty-print the sitemap. This is basically a stub for now,
+// because it’s the least interesting part of the program. If we used a graph
+// this would be more fun.
+func Print(url string, links []string) {
+	fmt.Println(url, ":")
+	for _, link := range links {
+		fmt.Println("\t", link)
+	}
+}
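
Aside: the crawl logic above leans on net/url's ResolveReference call in visitNode to turn relative href values into absolute URLs before they are recorded and crawled. A minimal standalone sketch of that behaviour, using made-up placeholder URLs:

package main

import (
	"fmt"
	"net/url"
)

func main() {
	// The page we are currently on, and a relative link found on it.
	base, _ := url.Parse("https://example.com/blog/")
	ref, _ := url.Parse("../about")

	// ResolveReference applies RFC 3986 resolution and yields an absolute URL.
	fmt.Println(base.ResolveReference(ref)) // https://example.com/about

	// Absolute references resolve to themselves, which is why visitNode skips
	// absolute links that do not share the parent prefix.
	abs, _ := url.Parse("https://other.example/page")
	fmt.Println(base.ResolveReference(abs)) // https://other.example/page
}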