add user agent Lieu, misc fixes for null results

pull/7/head
cblgh 2021-12-01 09:56:09 +01:00
rodzic 65907c5441
commit b9387d3b04
3 zmienione pliki z 17 dodaniami i 1 usunięciem

Wyświetl plik

@@ -52,7 +52,6 @@ func getLink(target string) string {
target = strings.Split(target, "?")[0]
}
target = strings.TrimSpace(target)
target = strings.ToLower(target)
// remove trailing /
return strings.TrimSuffix(target, "/")
}
@@ -203,6 +202,7 @@ func Crawl(config types.Config) {
q.AddURL(link)
}
c.UserAgent = "Lieu"
c.AllowedDomains = domains
c.AllowURLRevisit = false
c.DisallowedDomains = getBannedDomains(config.Crawler.BannedDomains)

Wyświetl plik

@@ -276,6 +276,9 @@ func searchWords(db *sql.DB, words []string, searchByScore bool, domain ...strin
}
func InsertManyDomains(db *sql.DB, pages []types.PageData) {
if len(pages) == 0 {
return
}
values := make([]string, 0, len(pages))
args := make([]interface{}, 0, len(pages))
@@ -292,6 +295,9 @@ func InsertManyDomains(db *sql.DB, pages []types.PageData) {
}
func InsertManyPages(db *sql.DB, pages []types.PageData) {
if len(pages) == 0 {
return
}
values := make([]string, 0, len(pages))
args := make([]interface{}, 0, len(pages))
@@ -309,6 +315,10 @@ func InsertManyPages(db *sql.DB, pages []types.PageData) {
}
func InsertManyWords(db *sql.DB, batch []types.SearchFragment) {
if len(batch) == 0 {
return
}
values := make([]string, 0, len(batch))
args := make([]interface{}, 0, len(batch))
@@ -324,6 +334,10 @@ func InsertManyWords(db *sql.DB, batch []types.SearchFragment) {
}
func InsertManyExternalLinks(db *sql.DB, externalLinks []string) {
if len(externalLinks) == 0 {
return
}
values := make([]string, 0, len(externalLinks))
args := make([]interface{}, 0, len(externalLinks))

Wyświetl plik

@@ -175,6 +175,7 @@ func Ingest(config types.Config) {
pages = make(map[string]types.PageData)
}
}
ingestBatch(db, batch, pages, externalLinks)
fmt.Printf("ingested %d words\n", count)
err = scanner.Err()
@@ -188,6 +189,7 @@ func ingestBatch(db *sql.DB, batch []types.SearchFragment, pageMap map[string]ty
pages[i] = pageMap[k]
i++
}
// TODO (2021-11-10): debug the "incomplete input" error / log, and find out where it is coming from
log.Println("starting to ingest batch")
database.InsertManyDomains(db, pages)
database.InsertManyPages(db, pages)