From b0ad7dce102d35123bb0092527b7ceea6df8ad86 Mon Sep 17 00:00:00 2001 From: cblgh Date: Wed, 30 Mar 2022 15:13:16 +0200 Subject: [PATCH] add custom webring selector for precrawl --- README.md | 3 +++ crawler/crawler.go | 21 ++++++++++------ types/types.go | 13 +++++----- util/util.go | 63 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 87 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 37255c1..f12b14c 100644 --- a/README.md +++ b/README.md @@ -70,6 +70,9 @@ The config file is written in [TOML](https://toml.io/en/). name = "Merveilles Webring" # used by the precrawl command and linked to in /about route url = "https://webring.xxiivv.com" +# used by the precrawl command to populate the Crawler.Webring file; +# takes simple html selectors. might be a bit wonky :) +webringSelector = "li > a[href]:first-of-type" port = 10001 [theme] diff --git a/crawler/crawler.go b/crawler/crawler.go index 91b1662..d85ec9c 100644 --- a/crawler/crawler.go +++ b/crawler/crawler.go @@ -154,6 +154,10 @@ func collectHeadingText(heading string, e *colly.HTMLElement) { } func SetupDefaultProxy(config types.Config) error { + // no proxy configured, go back + if config.General.Proxy == "" { + return nil + } proxyURL, err := url.Parse(config.General.Proxy) if err != nil { return err @@ -165,7 +169,6 @@ func SetupDefaultProxy(config types.Config) error { }, } - //colly.SetHTTPClient(httpClient) http.DefaultClient = httpClient return nil } @@ -176,6 +179,7 @@ func Precrawl(config types.Config) { if err != nil { log.Fatal(err) } + res, err := http.Get(config.General.URL) util.Check(err) defer res.Body.Close() @@ -188,11 +192,12 @@ func Precrawl(config types.Config) { util.Check(err) items := make([]string, 0) - doc.Find("li").Each(func(i int, s *goquery.Selection) { - if domain, exists := s.Find("a").Attr("href"); exists { - items = append(items, domain) - } - }) + s := doc.Find("html") + query := config.General.WebringSelector + if query == "" { + query = 
"li > a[href]:first-of-type" + } + util.QuerySelector(query, s, &items) BANNED := getBannedDomains(config.Crawler.BannedDomains) for _, item := range items { @@ -226,7 +231,9 @@ func Crawl(config types.Config) { c := colly.NewCollector( colly.MaxDepth(3), ) - c.SetProxy(config.General.Proxy) + if config.General.Proxy != "" { + c.SetProxy(config.General.Proxy) + } q, _ := queue.New( 5, /* threads */ diff --git a/types/types.go b/types/types.go index bb893c6..3503a5b 100644 --- a/types/types.go +++ b/types/types.go @@ -15,12 +15,13 @@ type PageData struct { type Config struct { General struct { - Name string `json:name` - Tagline string `json:tagline` - Placeholder string `json:placeholder` - URL string `json:url` - Port int `json:port` - Proxy string `json:proxy` + Name string `json:name` + Tagline string `json:tagline` + Placeholder string `json:placeholder` + URL string `json:url` + WebringSelector string `json:"webringSelector"` + Port int `json:port` + Proxy string `json:proxy` } `json:general` Theme struct { Foreground string `json:"foreground"` diff --git a/util/util.go b/util/util.go index fd34ac1..263871d 100644 --- a/util/util.go +++ b/util/util.go @@ -4,10 +4,12 @@ import ( "bytes" "encoding/json" "fmt" + "github.com/PuerkitoBio/goquery" "io/ioutil" "log" "net" "os" + "regexp" "strings" "lieu/types" @@ -30,6 +32,66 @@ func Check(err error) { } } +// QuerySelector provides limited document.querySelectorAll-style matching: element names, ">" between parts, el[attr] to extract an attribute value, and a trailing :first-of-type. There is no dedicated handling for class or id selectors yet.
+func QuerySelector(query string, current *goquery.Selection, results *[]string) { + var op, operand string + + attrPattern := regexp.MustCompile(`(\w+)\[(\w+)\](.+)?`) + attrValuePattern := regexp.MustCompile(`\[(\w+)\]`) + + if len(query) == 0 { + return + } + + fields := strings.Fields(query) + part := fields[0] + query = strings.Join(fields[1:], " ") + if part == ">" { + op = "subchild" + } else if attrPattern.MatchString(part) { + op = "element" + matches := attrPattern.FindStringSubmatch(part) + operand = matches[1] + var optional string + if len(matches) == 4 { + optional = matches[3] + } + query = strings.TrimSpace(fmt.Sprintf("[%s]%s %s", matches[2], optional, query)) + } else if attrValuePattern.MatchString(part) { + op = "attr" + operand = attrValuePattern.FindStringSubmatch(part)[1] + } else if len(query) == 0 { + op = "final" + } else { + op = "element" + operand = part + } + + switch op { + case "element": // e.g. [el]; bla > [el]; but also [el] > bla + current = current.Find(operand) + if strings.HasSuffix(query, "first-of-type") { + break + } + fallthrough + case "subchild": // [preceding] > [future] + // recurse querySelector on all [preceding] element types + current.Each(func(j int, s *goquery.Selection) { + QuerySelector(query, s, results) + }) + return + case "attr": // x[attr] + // extract the attribute + if str, exists := current.Attr(operand); exists { + *results = append(*results, str) + } + return + case "final": // no more in query, and we did not end on an attr: get text + *results = append(*results, current.Text()) + } + QuerySelector(query, current, results) +} + func DatabaseDoesNotExist(filepath string) { fmt.Printf("lieu: database %s does not exist\n", filepath) fmt.Println("lieu: try running `lieu ingest` if you have already crawled source data") @@ -104,6 +166,7 @@ func WriteMockConfig() { name = "Sweet Webring" # used by the precrawl command and linked to in /about route url = "https://example.com/" +webringSelector = "li > a" 
port = 10001 [theme]