From e56f60ccb9ab3621493ea2e04094e590cef63d2e Mon Sep 17 00:00:00 2001 From: Slatian Date: Sat, 19 Nov 2022 15:47:01 +0100 Subject: [PATCH] Added batching functionality because wordlists become pretty long when the scraper finds lots of long paragraphs --- ingest/ingest.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ingest/ingest.go b/ingest/ingest.go index eb1e380..cead52f 100644 --- a/ingest/ingest.go +++ b/ingest/ingest.go @@ -200,10 +200,14 @@ func ingestBatch(db *sql.DB, batch []types.SearchFragment, pageMap map[string]ty i++ } // TODO (2021-11-10): debug the "incomplete input" error / log, and find out where it is coming from - log.Println("starting to ingest batch") + log.Println("starting to ingest batch (Pages:", len(pages), "Words:", len(batch), "Links:", len(links),")") database.InsertManyDomains(db, pages) database.InsertManyPages(db, pages) - database.InsertManyWords(db, batch) + for i := 0; i < len(batch); i += 3000 { + end_i := i + 3000 + if (end_i > len(batch)) { end_i = len(batch)} + database.InsertManyWords(db, batch[i:end_i]) + } database.InsertManyExternalLinks(db, links) log.Println("finished ingesting batch") }