From 687aa29ba359e9018ab58ad1a23efdbaf67d28e4 Mon Sep 17 00:00:00 2001 From: Slatian Date: Tue, 15 Nov 2022 16:38:02 +0100 Subject: [PATCH] Made scraping the preview text configurable and improved the cleanup function a tiny bit. --- crawler/crawler.go | 25 +++++++++++++++++++------ types/types.go | 9 +++++---- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/crawler/crawler.go b/crawler/crawler.go index d85ec9c..59bb2b5 100644 --- a/crawler/crawler.go +++ b/crawler/crawler.go @@ -34,6 +34,15 @@ func getBoringDomains(path string) []string { return util.ReadList(path, "\n") } +func getPreviewQueries(path string) []string { + previewQueries := util.ReadList(path, "\n") + if len(previewQueries) > 0 { + return previewQueries + } else { + return []string{"main p", "article p", "section p", "p"} + } +} + func find(list []string, query string) bool { for _, item := range list { if item == query { @@ -104,12 +113,12 @@ func cleanText(s string) string { s = strings.TrimSpace(s) s = strings.ReplaceAll(s, "\n", " ") s = strings.ReplaceAll(s, "|", " ") - whitespace := regexp.MustCompile(`\p{Z}`) + whitespace := regexp.MustCompile(`\p{Z}+`) s = whitespace.ReplaceAllString(s, " ") return s } -func handleIndexing(c *colly.Collector) { +func handleIndexing(c *colly.Collector, previewQueries []string) { c.OnHTML("meta[name=\"keywords\"]", func(e *colly.HTMLElement) { fmt.Println("keywords", cleanText(e.Attr("content")), e.Request.URL) }) @@ -134,9 +143,12 @@ func handleIndexing(c *colly.Collector) { }) c.OnHTML("body", func(e *colly.HTMLElement) { - paragraph := cleanText(e.DOM.Find("p").First().Text()) - if len(paragraph) < 1500 && len(paragraph) > 0 { - fmt.Println("para", paragraph, e.Request.URL) + for i := 0; i < len(previewQueries); i++ { + paragraph := cleanText(e.DOM.Find(previewQueries[i]).First().Text()) + if len(paragraph) < 1500 && len(paragraph) > 0 { + fmt.Println("para", paragraph, e.Request.URL) + break + } } // get all relevant page headings 
collectHeadingText("h1", e) @@ -254,6 +266,7 @@ func Crawl(config types.Config) { boringDomains := getBoringDomains(config.Crawler.BoringDomains) boringWords := getBoringWords(config.Crawler.BoringWords) + previewQueries := getPreviewQueries(config.Crawler.PreviewQueries) // on every a element which has an href attribute, call callback c.OnHTML("a[href]", func(e *colly.HTMLElement) { @@ -302,7 +315,7 @@ func Crawl(config types.Config) { } }) - handleIndexing(c) + handleIndexing(c, previewQueries) // start scraping q.Run(c) diff --git a/types/types.go b/types/types.go index 3503a5b..46f3fa9 100644 --- a/types/types.go +++ b/types/types.go @@ -29,10 +29,10 @@ type Config struct { Links string `json:"links"` } `json:"theme"` Data struct { - Source string `json:source` - Database string `json:database` - Heuristics string `json:heuristics` - Wordlist string `json:wordlist` + Source string `json:source` + Database string `json:database` + Heuristics string `json:heuristics` + Wordlist string `json:wordlist` } `json:data` Crawler struct { Webring string `json:webring` @@ -40,5 +40,6 @@ type Config struct { BannedSuffixes string `json:bannedSuffixes` BoringWords string `json:boringWords` BoringDomains string `json:boringDomains` + PreviewQueries string `json:"previewQueryList"` } `json:crawler` }