Added batching functionality because wordlists become pretty long when the scraper finds lots of long paragraphs

pull/15/head
Slatian 2022-11-19 15:47:01 +01:00 zatwierdzone przez Alexander Cobleigh
rodzic ed5f5189b0
commit e56f60ccb9
1 zmienionych plików z 6 dodań i 2 usunięć

Wyświetl plik

@ -200,10 +200,14 @@ func ingestBatch(db *sql.DB, batch []types.SearchFragment, pageMap map[string]ty
i++
}
// TODO (2021-11-10): debug the "incomplete input" error / log, and find out where it is coming from
log.Println("starting to ingest batch")
log.Println("starting to ingest batch (Pages:", len(pages), "Words:", len(batch), "Links:", len(links),")")
database.InsertManyDomains(db, pages)
database.InsertManyPages(db, pages)
database.InsertManyWords(db, batch)
for i := 0; i < len(batch); i += 3000 {
end_i := i + 3000
if (end_i > len(batch)) { end_i = len(batch)}
database.InsertManyWords(db, batch[i:end_i])
}
database.InsertManyExternalLinks(db, links)
log.Println("finished ingesting batch")
}