From 687aa29ba359e9018ab58ad1a23efdbaf67d28e4 Mon Sep 17 00:00:00 2001 From: Slatian Date: Tue, 15 Nov 2022 16:38:02 +0100 Subject: [PATCH] Made scraping the preview text configurable and improved the cleanup function a tiny bit. --- crawler/crawler.go | 25 +++++++++++++++++++------ types/types.go | 9 +++++---- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/crawler/crawler.go b/crawler/crawler.go index d85ec9c..59bb2b5 100644 --- a/crawler/crawler.go +++ b/crawler/crawler.go @@ -34,6 +34,15 @@ func getBoringDomains(path string) []string { return util.ReadList(path, "\n") } +func getPreviewQueries(path string) []string { + previewQueries := util.ReadList(path, "\n") + if len(previewQueries) > 0 { + return previewQueries + } else { + return []string{"main p", "article p", "section p", "p"} + } +} + func find(list []string, query string) bool { for _, item := range list { if item == query { @@ -104,12 +113,12 @@ func cleanText(s string) string { s = strings.TrimSpace(s) s = strings.ReplaceAll(s, "\n", " ") s = strings.ReplaceAll(s, "|", " ") - whitespace := regexp.MustCompile(`\p{Z}`) + whitespace := regexp.MustCompile(`\p{Z}+`) s = whitespace.ReplaceAllString(s, " ") return s } -func handleIndexing(c *colly.Collector) { +func handleIndexing(c *colly.Collector, previewQueries []string) { c.OnHTML("meta[name=\"keywords\"]", func(e *colly.HTMLElement) { fmt.Println("keywords", cleanText(e.Attr("content")), e.Request.URL) }) @@ -134,9 +143,12 @@ func handleIndexing(c *colly.Collector) { }) c.OnHTML("body", func(e *colly.HTMLElement) { - paragraph := cleanText(e.DOM.Find("p").First().Text()) - if len(paragraph) < 1500 && len(paragraph) > 0 { - fmt.Println("para", paragraph, e.Request.URL) + for i := 0; i < len(previewQueries); i++ { + paragraph := cleanText(e.DOM.Find(previewQueries[i]).First().Text()) + if len(paragraph) < 1500 && len(paragraph) > 0 { + fmt.Println("para", paragraph, e.Request.URL) + break + } } // get all relevant page headings 
collectHeadingText("h1", e) @@ -254,6 +266,7 @@ func Crawl(config types.Config) { boringDomains := getBoringDomains(config.Crawler.BoringDomains) boringWords := getBoringWords(config.Crawler.BoringWords) + previewQueries := getPreviewQueries(config.Crawler.PreviewQueries) // on every a element which has an href attribute, call callback c.OnHTML("a[href]", func(e *colly.HTMLElement) { @@ -302,7 +315,7 @@ func Crawl(config types.Config) { } }) - handleIndexing(c) + handleIndexing(c, previewQueries) // start scraping q.Run(c) diff --git a/types/types.go b/types/types.go index 3503a5b..46f3fa9 100644 --- a/types/types.go +++ b/types/types.go @@ -29,10 +29,10 @@ type Config struct { Links string `json:"links"` } `json:"theme"` Data struct { - Source string `json:source` - Database string `json:database` - Heuristics string `json:heuristics` - Wordlist string `json:wordlist` + Source string `json:source` + Database string `json:database` + Heuristics string `json:heuristics` + Wordlist string `json:wordlist` } `json:data` Crawler struct { Webring string `json:webring` @@ -40,5 +40,6 @@ type Config struct { BannedSuffixes string `json:bannedSuffixes` BoringWords string `json:boringWords` BoringDomains string `json:boringDomains` + PreviewQueries string `json:"previewQueryList"` } `json:crawler` }