pull/15/head
cblgh 2022-11-22 14:08:59 +01:00
parent 9517f62de2
commit 9377bd6fab
3 changed files with 19 additions and 17 deletions

View file

@@ -153,12 +153,12 @@ func handleIndexing(c *colly.Collector, previewQueries []string, heuristics []st
     })
     c.OnHTML("body", func(e *colly.HTMLElement) {
     QueryLoop:
         for i := 0; i < len(previewQueries); i++ {
            // After the fourth paragraph we're probably too far in to get something interesting for a preview
            elements := e.DOM.Find(previewQueries[i])
-           for j := 0; j < 4 && j < elements.Length() ; j++ {
-               element_text := elements.Slice(j,j+1).Text()
+           for j := 0; j < 4 && j < elements.Length(); j++ {
+               element_text := elements.Slice(j, j+1).Text()
                paragraph := cleanText(element_text)
                if len(paragraph) < 1500 && len(paragraph) > 20 {
                    if !util.Contains(heuristics, strings.ToLower(paragraph)) {
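Note: the loop above accepts the first cleaned paragraph whose length lies between 20 and 1500 characters and which does not trip the heuristics check. A minimal standalone restatement of that acceptance test, as a sketch rather than code from this change (util.Contains is defined elsewhere in the project and its exact matching semantics are not shown in this diff, so the check is taken as a parameter instead of being guessed at):

import "strings"

// acceptablePreview restates the size and boilerplate checks from the loop
// above: the candidate must be longer than 20 and shorter than 1500
// characters, and its lower-cased form must pass the heuristics filter.
func acceptablePreview(paragraph string, heuristics []string, contains func([]string, string) bool) bool {
    return len(paragraph) > 20 && len(paragraph) < 1500 &&
        !contains(heuristics, strings.ToLower(paragraph))
}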
@@ -172,7 +172,7 @@ func handleIndexing(c *colly.Collector, previewQueries []string, heuristics []st
        if len(paragraph) < 1500 && len(paragraph) > 0 {
            fmt.Println("para-just-p", paragraph, e.Request.URL)
        }
        // get all relevant page headings
        collectHeadingText("h1", e)
        collectHeadingText("h2", e)
@@ -230,7 +230,7 @@ func Precrawl(config types.Config) {
    s := doc.Find("html")
    query := config.General.WebringSelector
    if query == "" {
        query = "li > a[href]:first-of-type"
    }
    util.QuerySelector(query, s, &items)
@@ -298,7 +298,7 @@ func Crawl(config types.Config) {
        if e.Response.StatusCode >= 400 || e.Response.StatusCode <= 100 {
            return
        }
        link := getLink(e.Attr("href"))
        if findSuffix(SUFFIXES, link) {
            return

View file

@@ -144,7 +144,7 @@ func Ingest(config types.Config) {
            page.AboutSource = token
            processed = partitionSentence(payload)
        case "para":
-           if (page.AboutSource != "og-desc" || len(rawdata)*10 > len(page.About)*7) {
+           if page.AboutSource != "og-desc" || len(rawdata)*10 > len(page.About)*7 {
                if performAboutHeuristic(config.Data.Heuristics, payload) {
                    page.About = rawdata
                    page.AboutSource = token
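Note: the unparenthesized condition above encodes the replacement rule for "para" candidates: an existing description sourced from og-desc is only overridden when the new text is more than 70% of its length (len(rawdata)*10 > len(page.About)*7 keeps the comparison in integer arithmetic). A small sketch of the same check with hypothetical names, not code from this change:

// shouldReplaceAbout mirrors the condition above: keep the current about
// text when it came from an og:description tag, unless the new candidate
// is more than 70% of that text's length.
func shouldReplaceAbout(aboutSource, about, candidate string) bool {
    return aboutSource != "og-desc" || len(candidate)*10 > len(about)*7
}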
@@ -200,12 +200,14 @@ func ingestBatch(db *sql.DB, batch []types.SearchFragment, pageMap map[string]ty
        i++
    }
    // TODO (2021-11-10): debug the "incomplete input" error / log, and find out where it is coming from
-   log.Println("starting to ingest batch (Pages:", len(pages), "Words:", len(batch), "Links:", len(links),")")
+   log.Println("starting to ingest batch (Pages:", len(pages), "Words:", len(batch), "Links:", len(links), ")")
    database.InsertManyDomains(db, pages)
    database.InsertManyPages(db, pages)
    for i := 0; i < len(batch); i += 3000 {
        end_i := i + 3000
-       if (end_i > len(batch)) { end_i = len(batch)}
+       if end_i > len(batch) {
+           end_i = len(batch)
+       }
        database.InsertManyWords(db, batch[i:end_i])
    }
    database.InsertManyExternalLinks(db, links)
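Note: the reformatted if above belongs to a loop that inserts the word fragments in slices of 3000 rather than in a single call. A generic sketch of that clamp-and-slice pattern (the 3000 figure comes from the loop above; keeping each multi-row insert under the database's parameter limits is an assumption about the motivation, not something this diff states):

// insertInChunks walks a slice in fixed-size windows, clamping the final
// window to the slice length, and hands each window to insert. It mirrors
// the batch[i:end_i] loop above with a chunk size of 3000.
func insertInChunks[T any](items []T, chunkSize int, insert func([]T)) {
    for i := 0; i < len(items); i += chunkSize {
        end := i + chunkSize
        if end > len(items) {
            end = len(items)
        }
        insert(items[i:end])
    }
}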

View file

@@ -7,10 +7,10 @@ type SearchFragment struct {
 }
 type PageData struct {
    URL string
    Title string
    About string
    Lang string
    AboutSource string
 }
@@ -30,10 +30,10 @@ type Config struct {
        Links string `json:"links"`
    } `json:"theme"`
    Data struct {
        Source string `json:source`
        Database string `json:database`
        Heuristics string `json:heuristics`
        Wordlist string `json:wordlist`
    } `json:data`
    Crawler struct {
        Webring string `json:webring`
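Note on the context lines above, which this change does not touch: Go struct tags are only parsed by reflect.StructTag.Get when the value is quoted, so encoding/json ignores forms like `json:source` and falls back to case-insensitive matching on the field name (which still works for these field names), while quoted tags like `json:"links"` are honored explicitly. A hypothetical quoted version of the Data block, wrapped in a standalone type so the sketch compiles on its own:

// configSketch is a hypothetical stand-in for the Config type above; only
// the tag quoting differs from the fields shown in the diff.
type configSketch struct {
    Data struct {
        Source     string `json:"source"`
        Database   string `json:"database"`
        Heuristics string `json:"heuristics"`
        Wordlist   string `json:"wordlist"`
    } `json:"data"`
}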