kopia lustrzana https://github.com/cblgh/lieu
add user agent Lieu, misc fixes for null results
rodzic
65907c5441
commit
b9387d3b04
|
@ -52,7 +52,6 @@ func getLink(target string) string {
|
||||||
target = strings.Split(target, "?")[0]
|
target = strings.Split(target, "?")[0]
|
||||||
}
|
}
|
||||||
target = strings.TrimSpace(target)
|
target = strings.TrimSpace(target)
|
||||||
target = strings.ToLower(target)
|
|
||||||
// remove trailing /
|
// remove trailing /
|
||||||
return strings.TrimSuffix(target, "/")
|
return strings.TrimSuffix(target, "/")
|
||||||
}
|
}
|
||||||
|
@ -203,6 +202,7 @@ func Crawl(config types.Config) {
|
||||||
q.AddURL(link)
|
q.AddURL(link)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
c.UserAgent = "Lieu"
|
||||||
c.AllowedDomains = domains
|
c.AllowedDomains = domains
|
||||||
c.AllowURLRevisit = false
|
c.AllowURLRevisit = false
|
||||||
c.DisallowedDomains = getBannedDomains(config.Crawler.BannedDomains)
|
c.DisallowedDomains = getBannedDomains(config.Crawler.BannedDomains)
|
||||||
|
|
|
@ -276,6 +276,9 @@ func searchWords(db *sql.DB, words []string, searchByScore bool, domain ...strin
|
||||||
}
|
}
|
||||||
|
|
||||||
func InsertManyDomains(db *sql.DB, pages []types.PageData) {
|
func InsertManyDomains(db *sql.DB, pages []types.PageData) {
|
||||||
|
if len(pages) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
values := make([]string, 0, len(pages))
|
values := make([]string, 0, len(pages))
|
||||||
args := make([]interface{}, 0, len(pages))
|
args := make([]interface{}, 0, len(pages))
|
||||||
|
|
||||||
|
@ -292,6 +295,9 @@ func InsertManyDomains(db *sql.DB, pages []types.PageData) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func InsertManyPages(db *sql.DB, pages []types.PageData) {
|
func InsertManyPages(db *sql.DB, pages []types.PageData) {
|
||||||
|
if len(pages) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
values := make([]string, 0, len(pages))
|
values := make([]string, 0, len(pages))
|
||||||
args := make([]interface{}, 0, len(pages))
|
args := make([]interface{}, 0, len(pages))
|
||||||
|
|
||||||
|
@ -309,6 +315,10 @@ func InsertManyPages(db *sql.DB, pages []types.PageData) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func InsertManyWords(db *sql.DB, batch []types.SearchFragment) {
|
func InsertManyWords(db *sql.DB, batch []types.SearchFragment) {
|
||||||
|
if len(batch) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
values := make([]string, 0, len(batch))
|
values := make([]string, 0, len(batch))
|
||||||
args := make([]interface{}, 0, len(batch))
|
args := make([]interface{}, 0, len(batch))
|
||||||
|
|
||||||
|
@ -324,6 +334,10 @@ func InsertManyWords(db *sql.DB, batch []types.SearchFragment) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func InsertManyExternalLinks(db *sql.DB, externalLinks []string) {
|
func InsertManyExternalLinks(db *sql.DB, externalLinks []string) {
|
||||||
|
if len(externalLinks) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
values := make([]string, 0, len(externalLinks))
|
values := make([]string, 0, len(externalLinks))
|
||||||
args := make([]interface{}, 0, len(externalLinks))
|
args := make([]interface{}, 0, len(externalLinks))
|
||||||
|
|
||||||
|
|
|
@ -175,6 +175,7 @@ func Ingest(config types.Config) {
|
||||||
pages = make(map[string]types.PageData)
|
pages = make(map[string]types.PageData)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
ingestBatch(db, batch, pages, externalLinks)
|
||||||
fmt.Printf("ingested %d words\n", count)
|
fmt.Printf("ingested %d words\n", count)
|
||||||
|
|
||||||
err = scanner.Err()
|
err = scanner.Err()
|
||||||
|
@ -188,6 +189,7 @@ func ingestBatch(db *sql.DB, batch []types.SearchFragment, pageMap map[string]ty
|
||||||
pages[i] = pageMap[k]
|
pages[i] = pageMap[k]
|
||||||
i++
|
i++
|
||||||
}
|
}
|
||||||
|
// TODO (2021-11-10): debug the "incomplete input" error / log, and find out where it is coming from
|
||||||
log.Println("starting to ingest batch")
|
log.Println("starting to ingest batch")
|
||||||
database.InsertManyDomains(db, pages)
|
database.InsertManyDomains(db, pages)
|
||||||
database.InsertManyPages(db, pages)
|
database.InsertManyPages(db, pages)
|
||||||
|
|
Ładowanie…
Reference in New Issue