kopia lustrzana https://github.com/cblgh/lieu
add user agent Lieu, misc fixes for null results
rodzic
65907c5441
commit
b9387d3b04
|
@@ -52,7 +52,6 @@ func getLink(target string) string {
|
|||
target = strings.Split(target, "?")[0]
|
||||
}
|
||||
target = strings.TrimSpace(target)
|
||||
target = strings.ToLower(target)
|
||||
// remove trailing /
|
||||
return strings.TrimSuffix(target, "/")
|
||||
}
|
||||
|
@@ -203,6 +202,7 @@ func Crawl(config types.Config) {
|
|||
q.AddURL(link)
|
||||
}
|
||||
|
||||
c.UserAgent = "Lieu"
|
||||
c.AllowedDomains = domains
|
||||
c.AllowURLRevisit = false
|
||||
c.DisallowedDomains = getBannedDomains(config.Crawler.BannedDomains)
|
||||
|
|
|
@@ -276,6 +276,9 @@ func searchWords(db *sql.DB, words []string, searchByScore bool, domain ...strin
|
|||
}
|
||||
|
||||
func InsertManyDomains(db *sql.DB, pages []types.PageData) {
|
||||
if len(pages) == 0 {
|
||||
return
|
||||
}
|
||||
values := make([]string, 0, len(pages))
|
||||
args := make([]interface{}, 0, len(pages))
|
||||
|
||||
|
@@ -292,6 +295,9 @@ func InsertManyDomains(db *sql.DB, pages []types.PageData) {
|
|||
}
|
||||
|
||||
func InsertManyPages(db *sql.DB, pages []types.PageData) {
|
||||
if len(pages) == 0 {
|
||||
return
|
||||
}
|
||||
values := make([]string, 0, len(pages))
|
||||
args := make([]interface{}, 0, len(pages))
|
||||
|
||||
|
@@ -309,6 +315,10 @@ func InsertManyPages(db *sql.DB, pages []types.PageData) {
|
|||
}
|
||||
|
||||
func InsertManyWords(db *sql.DB, batch []types.SearchFragment) {
|
||||
if len(batch) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
values := make([]string, 0, len(batch))
|
||||
args := make([]interface{}, 0, len(batch))
|
||||
|
||||
|
@@ -324,6 +334,10 @@ func InsertManyWords(db *sql.DB, batch []types.SearchFragment) {
|
|||
}
|
||||
|
||||
func InsertManyExternalLinks(db *sql.DB, externalLinks []string) {
|
||||
if len(externalLinks) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
values := make([]string, 0, len(externalLinks))
|
||||
args := make([]interface{}, 0, len(externalLinks))
|
||||
|
||||
|
|
|
@@ -175,6 +175,7 @@ func Ingest(config types.Config) {
|
|||
pages = make(map[string]types.PageData)
|
||||
}
|
||||
}
|
||||
ingestBatch(db, batch, pages, externalLinks)
|
||||
fmt.Printf("ingested %d words\n", count)
|
||||
|
||||
err = scanner.Err()
|
||||
|
@@ -188,6 +189,7 @@ func ingestBatch(db *sql.DB, batch []types.SearchFragment, pageMap map[string]ty
|
|||
pages[i] = pageMap[k]
|
||||
i++
|
||||
}
|
||||
// TODO (2021-11-10): debug the "incomplete input" error / log, and find out where it is coming from
|
||||
log.Println("starting to ingest batch")
|
||||
database.InsertManyDomains(db, pages)
|
||||
database.InsertManyPages(db, pages)
|
||||
|
|
Ładowanie…
Reference in New Issue