From b9387d3b04c09707a4f1f50172a8e0262d790322 Mon Sep 17 00:00:00 2001 From: cblgh Date: Wed, 1 Dec 2021 09:56:09 +0100 Subject: [PATCH] add user agent Lieu, misc fixes for null results --- crawler/crawler.go | 2 +- database/database.go | 14 ++++++++++++++ ingest/ingest.go | 2 ++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/crawler/crawler.go b/crawler/crawler.go index e9516e2..f33f7f8 100644 --- a/crawler/crawler.go +++ b/crawler/crawler.go @@ -52,7 +52,6 @@ func getLink(target string) string { target = strings.Split(target, "?")[0] } target = strings.TrimSpace(target) - target = strings.ToLower(target) // remove trailing / return strings.TrimSuffix(target, "/") } @@ -203,6 +202,7 @@ func Crawl(config types.Config) { q.AddURL(link) } + c.UserAgent = "Lieu" c.AllowedDomains = domains c.AllowURLRevisit = false c.DisallowedDomains = getBannedDomains(config.Crawler.BannedDomains) diff --git a/database/database.go b/database/database.go index 13ca1da..deecb07 100644 --- a/database/database.go +++ b/database/database.go @@ -276,6 +276,9 @@ func searchWords(db *sql.DB, words []string, searchByScore bool, domain ...strin } func InsertManyDomains(db *sql.DB, pages []types.PageData) { + if len(pages) == 0 { + return + } values := make([]string, 0, len(pages)) args := make([]interface{}, 0, len(pages)) @@ -292,6 +295,9 @@ func InsertManyDomains(db *sql.DB, pages []types.PageData) { } func InsertManyPages(db *sql.DB, pages []types.PageData) { + if len(pages) == 0 { + return + } values := make([]string, 0, len(pages)) args := make([]interface{}, 0, len(pages)) @@ -309,6 +315,10 @@ func InsertManyPages(db *sql.DB, pages []types.PageData) { } func InsertManyWords(db *sql.DB, batch []types.SearchFragment) { + if len(batch) == 0 { + return + } + values := make([]string, 0, len(batch)) args := make([]interface{}, 0, len(batch)) @@ -324,6 +334,10 @@ func InsertManyWords(db *sql.DB, batch []types.SearchFragment) { } func InsertManyExternalLinks(db *sql.DB, externalLinks []string) { + if len(externalLinks) == 0 { + return + } + values := make([]string, 0, len(externalLinks)) args := make([]interface{}, 0, len(externalLinks)) diff --git a/ingest/ingest.go b/ingest/ingest.go index 6aaa79b..a12bb06 100644 --- a/ingest/ingest.go +++ b/ingest/ingest.go @@ -175,6 +175,7 @@ func Ingest(config types.Config) { pages = make(map[string]types.PageData) } } + ingestBatch(db, batch, pages, externalLinks) fmt.Printf("ingested %d words\n", count) err = scanner.Err() @@ -188,6 +189,7 @@ func ingestBatch(db *sql.DB, batch []types.SearchFragment, pageMap map[string]ty pages[i] = pageMap[k] i++ } + // TODO (2021-11-10): debug the "incomplete input" error / log, and find out where it is coming from log.Println("starting to ingest batch") database.InsertManyDomains(db, pages) database.InsertManyPages(db, pages)