diff --git a/crawler/crawler.go b/crawler/crawler.go
index e979b32..902c962 100644
--- a/crawler/crawler.go
+++ b/crawler/crawler.go
@@ -153,12 +153,12 @@ func handleIndexing(c *colly.Collector, previewQueries []string, heuristics []st
 	})
 
 	c.OnHTML("body", func(e *colly.HTMLElement) {
-	QueryLoop: 
+	QueryLoop:
 		for i := 0; i < len(previewQueries); i++ {
 			// After the fourth paragraph we're probably too far in to get something interesting for a preview
 			elements := e.DOM.Find(previewQueries[i])
-			for j := 0; j < 4 && j < elements.Length() ; j++ {
-				element_text := elements.Slice(j,j+1).Text()
+			for j := 0; j < 4 && j < elements.Length(); j++ {
+				element_text := elements.Slice(j, j+1).Text()
 				paragraph := cleanText(element_text)
 				if len(paragraph) < 1500 && len(paragraph) > 20 {
 					if !util.Contains(heuristics, strings.ToLower(paragraph)) {
@@ -172,7 +172,7 @@ func handleIndexing(c *colly.Collector, previewQueries []string, heuristics []st
 		if len(paragraph) < 1500 && len(paragraph) > 0 {
 			fmt.Println("para-just-p", paragraph, e.Request.URL)
 		}
-		
+
 		// get all relevant page headings
 		collectHeadingText("h1", e)
 		collectHeadingText("h2", e)
@@ -230,7 +230,7 @@ func Precrawl(config types.Config) {
 	s := doc.Find("html")
 	query := config.General.WebringSelector
 	if query == "" {
-		query = "li > a[href]:first-of-type" 
+		query = "li > a[href]:first-of-type"
 	}
 
 	util.QuerySelector(query, s, &items)
@@ -298,7 +298,7 @@ func Crawl(config types.Config) {
 		if e.Response.StatusCode >= 400 || e.Response.StatusCode <= 100 {
 			return
 		}
-		
+
 		link := getLink(e.Attr("href"))
 		if findSuffix(SUFFIXES, link) {
 			return
diff --git a/ingest/ingest.go b/ingest/ingest.go
index a723be5..0022ede 100644
--- a/ingest/ingest.go
+++ b/ingest/ingest.go
@@ -144,7 +144,7 @@ func Ingest(config types.Config) {
 				page.AboutSource = token
 				processed = partitionSentence(payload)
 			case "para":
-				if (page.AboutSource != "og-desc" || len(rawdata)*10 > len(page.About)*7) {
+				if page.AboutSource != "og-desc" || len(rawdata)*10 > len(page.About)*7 {
 					if performAboutHeuristic(config.Data.Heuristics, payload) {
 						page.About = rawdata
 						page.AboutSource = token
@@ -200,12 +200,14 @@ func ingestBatch(db *sql.DB, batch []types.SearchFragment, pageMap map[string]ty
 		i++
 	}
 	// TODO (2021-11-10): debug the "incomplete input" error / log, and find out where it is coming from
-	log.Println("starting to ingest batch (Pages:", len(pages), "Words:", len(batch), "Links:", len(links),")")
+	log.Println("starting to ingest batch (Pages:", len(pages), "Words:", len(batch), "Links:", len(links), ")")
 	database.InsertManyDomains(db, pages)
 	database.InsertManyPages(db, pages)
 	for i := 0; i < len(batch); i += 3000 {
 		end_i := i + 3000
-		if (end_i > len(batch)) { end_i = len(batch)}
+		if end_i > len(batch) {
+			end_i = len(batch)
+		}
 		database.InsertManyWords(db, batch[i:end_i])
 	}
 	database.InsertManyExternalLinks(db, links)
diff --git a/types/types.go b/types/types.go
index d26999d..f85919d 100644
--- a/types/types.go
+++ b/types/types.go
@@ -7,10 +7,10 @@ type SearchFragment struct {
 }
 
 type PageData struct {
-	URL   string
-	Title string
-	About string
-	Lang  string
+	URL         string
+	Title       string
+	About       string
+	Lang        string
 	AboutSource string
 }
 
@@ -30,10 +30,10 @@ type Config struct {
 		Links string `json:"links"`
 	} `json:"theme"`
 	Data struct {
-		Source string `json:source`
-		Database string `json:database`
-		Heuristics string `json:heuristics`
-		Wordlist string `json:wordlist`
+		Source     string `json:source`
+		Database   string `json:database`
+		Heuristics string `json:heuristics`
+		Wordlist   string `json:wordlist`
 	} `json:data`
 	Crawler struct {
 		Webring string `json:webring`