initial

.gitignore (new file, vendored)

crawl

README.md (new file)

# crawl

`crawl` is a simple web crawler that is restricted to a single domain. It uses
goroutines to parallelize fetching.

The generated sitemap is exceedingly simple: we do not keep track of a real
graph, but rather just a shallow list of links for every page.
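
As an aside (not part of the committed README): with the repository checked out on your GOPATH or set up as a module, something like `go run . -url https://example.com/` (or `go build` followed by `./crawl -url ...`) should start a crawl; the `-url` flag and its `https://example.com/` default come from main.go below.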

crawler/crawl.go (new file)

package crawler

import (
	"io"
	"log"
	"net/http"
	"net/url"
	"strings"
	"sync"

	"golang.org/x/net/html"

	"github.com/hellerve/crawl/pretty"
)

// Visited is the global list of visited links.
// This bit of global state makes the function interfaces cleaner.
// As long as we take care of locking this should be fine.
//
// Another consideration is that we currently only keep track of one level of
// nesting. If we made it a “real” sitemap with a graph or somesuch, this would
// be way more fancy.
var Visited = struct {
	sync.RWMutex
	visited map[string]bool
}{visited: make(map[string]bool)}

// visitNode inspects the current node and, if it contains a link we haven’t
// visited yet, spawns a goroutine for it. It also returns that link, because
// we have to add it to our list of linked nodes.
func visitNode(node *html.Node, parent, current string, wg *sync.WaitGroup) (*string, error) {
	var val *string
	if node.Type == html.ElementNode && node.Data == "a" {
		for _, a := range node.Attr {
			if a.Key != "href" {
				continue
			}
			parsedUrl, err := url.Parse(a.Val)

			if err != nil {
				return nil, err
			}

			// Absolute links that point outside our domain are skipped.
			if parsedUrl.IsAbs() && !strings.HasPrefix(a.Val, parent) {
				continue
			}

			currentUrl, err := url.Parse(current)

			if err != nil {
				return nil, err
			}

			resolved := currentUrl.ResolveReference(parsedUrl).String()
			val = &resolved

			Visited.RLock()
			seen := Visited.visited[resolved]
			Visited.RUnlock()

			if !seen {
				Visited.Lock()
				// Re-check under the write lock: another goroutine may have
				// claimed this URL after we released the read lock.
				if !Visited.visited[resolved] {
					Visited.visited[resolved] = true
					wg.Add(1)
					go doCrawl(resolved, parent, wg)
				}
				Visited.Unlock()
			}
		}
	}
	return val, nil
}

// parseNode parses a single node. It is recursive, and will first be called
// with the whole document node. We do a lot of appends, which is kind of
// yucky, but we don’t know the number of links we will encounter yet.
func parseNode(node *html.Node, parent, current string, wg *sync.WaitGroup) ([]string, error) {
	links := []string{}
	val, err := visitNode(node, parent, current, wg)

	if err != nil {
		return nil, err
	}

	if val != nil {
		links = append(links, *val)
	}

	for c := node.FirstChild; c != nil; c = c.NextSibling {
		newLinks, err := parseNode(c, parent, current, wg)

		if err != nil {
			return nil, err
		}

		links = append(links, newLinks...)
	}
	return links, nil
}

// parseRequest takes a single request body and parses it. It will then call
// parseNode, which recursively looks through the document. If the body is not
// HTML, this will error.
func parseRequest(body io.ReadCloser, parent, url string, wg *sync.WaitGroup) {
	defer body.Close()

	document, err := html.Parse(body)

	if err != nil {
		log.Println(err)
		return
	}

	links, err := parseNode(document, parent, url, wg)

	if err != nil {
		log.Println(err)
		return
	}

	// this can look weird with concurrent printing, but oh well. I’m not sure
	// it’s worth it to make this linear for now.
	pretty.Print(url, links)
}

// doCrawl is the actual crawler. It keeps track of what to visit currently and
// what our base URL was. We also keep track of a WaitGroup to make sure we
// don’t exit prematurely, since this is all concurrent.
//
// Callers must call wg.Add(1) before handing a URL to doCrawl; doing the Add
// inside the goroutine itself would race with wg.Wait().
func doCrawl(toVisit string, parent string, wg *sync.WaitGroup) {
	defer wg.Done()

	resp, err := http.Get(toVisit)

	if err != nil {
		log.Println(err)
		return
	}

	Visited.Lock()
	Visited.visited[toVisit] = true
	Visited.Unlock()

	parseRequest(resp.Body, parent, toVisit, wg)
}

// Crawl is the entry point of our crawler. It creates a WaitGroup and calls
// the function that actually does the work (doCrawl).
func Crawl(toCrawl string) {
	var wg sync.WaitGroup
	wg.Add(1)
	doCrawl(toCrawl, toCrawl, &wg)
	wg.Wait()
}
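
Not part of this commit, but as a rough sketch of how the crawler could be exercised end to end against a throwaway local server: the file (crawler/crawl_test.go), the handlers, and their HTML below are invented for this illustration, and there are no assertions — it just runs a crawl and lets the per-page link lists go to stdout via pretty.Print.

package crawler_test

import (
	"fmt"
	"net/http"
	"net/http/httptest"
	"testing"

	"github.com/hellerve/crawl/crawler"
)

// TestCrawl starts a throwaway HTTP server with two pages that link to each
// other and crawls it starting from the root.
func TestCrawl(t *testing.T) {
	mux := http.NewServeMux()
	mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprint(w, `<html><body><a href="/about">about</a></body></html>`)
	})
	mux.HandleFunc("/about", func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprint(w, `<html><body><a href="/">home</a></body></html>`)
	})
	srv := httptest.NewServer(mux)
	defer srv.Close()

	// Crawl blocks until every page reachable on the test server has been
	// visited; output is printed as it is discovered.
	crawler.Crawl(srv.URL)
}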

main.go (new file)

package main

import (
	"flag"

	"github.com/hellerve/crawl/crawler"
)

func main() {
	url := flag.String("url", "https://example.com/", "The URL to crawl")
	flag.Parse()

	crawler.Crawl(*url)
}

pretty/pretty.go (new file)

package pretty

import (
	"fmt"
)

// Print will pretty-print the sitemap. This is basically a stub for now,
// because it’s the least interesting part of the program. If we used a graph
// this would be more fun.
func Print(url string, links []string) {
	fmt.Println(url, ":")
	for _, link := range links {
		fmt.Println("\t", link)
	}
}
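
For illustration (URLs invented): a call like pretty.Print("https://example.com/", []string{"https://example.com/a"}) prints the page URL followed by a colon, then each link on its own tab-indented line. Because fmt.Println separates its arguments with a space, the output reads "https://example.com/ :" with a space before the colon, and each link line carries a space after the tab.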