kopia lustrzana https://github.com/cblgh/lieu
go fmt
rodzic
9517f62de2
commit
9377bd6fab
|
@ -153,12 +153,12 @@ func handleIndexing(c *colly.Collector, previewQueries []string, heuristics []st
|
||||||
})
|
})
|
||||||
|
|
||||||
c.OnHTML("body", func(e *colly.HTMLElement) {
|
c.OnHTML("body", func(e *colly.HTMLElement) {
|
||||||
QueryLoop:
|
QueryLoop:
|
||||||
for i := 0; i < len(previewQueries); i++ {
|
for i := 0; i < len(previewQueries); i++ {
|
||||||
// After the fourth paragraph we're probably too far in to get something interesting for a preview
|
// After the fourth paragraph we're probably too far in to get something interesting for a preview
|
||||||
elements := e.DOM.Find(previewQueries[i])
|
elements := e.DOM.Find(previewQueries[i])
|
||||||
for j := 0; j < 4 && j < elements.Length() ; j++ {
|
for j := 0; j < 4 && j < elements.Length(); j++ {
|
||||||
element_text := elements.Slice(j,j+1).Text()
|
element_text := elements.Slice(j, j+1).Text()
|
||||||
paragraph := cleanText(element_text)
|
paragraph := cleanText(element_text)
|
||||||
if len(paragraph) < 1500 && len(paragraph) > 20 {
|
if len(paragraph) < 1500 && len(paragraph) > 20 {
|
||||||
if !util.Contains(heuristics, strings.ToLower(paragraph)) {
|
if !util.Contains(heuristics, strings.ToLower(paragraph)) {
|
||||||
|
@ -172,7 +172,7 @@ func handleIndexing(c *colly.Collector, previewQueries []string, heuristics []st
|
||||||
if len(paragraph) < 1500 && len(paragraph) > 0 {
|
if len(paragraph) < 1500 && len(paragraph) > 0 {
|
||||||
fmt.Println("para-just-p", paragraph, e.Request.URL)
|
fmt.Println("para-just-p", paragraph, e.Request.URL)
|
||||||
}
|
}
|
||||||
|
|
||||||
// get all relevant page headings
|
// get all relevant page headings
|
||||||
collectHeadingText("h1", e)
|
collectHeadingText("h1", e)
|
||||||
collectHeadingText("h2", e)
|
collectHeadingText("h2", e)
|
||||||
|
@ -230,7 +230,7 @@ func Precrawl(config types.Config) {
|
||||||
s := doc.Find("html")
|
s := doc.Find("html")
|
||||||
query := config.General.WebringSelector
|
query := config.General.WebringSelector
|
||||||
if query == "" {
|
if query == "" {
|
||||||
query = "li > a[href]:first-of-type"
|
query = "li > a[href]:first-of-type"
|
||||||
}
|
}
|
||||||
util.QuerySelector(query, s, &items)
|
util.QuerySelector(query, s, &items)
|
||||||
|
|
||||||
|
@ -298,7 +298,7 @@ func Crawl(config types.Config) {
|
||||||
if e.Response.StatusCode >= 400 || e.Response.StatusCode <= 100 {
|
if e.Response.StatusCode >= 400 || e.Response.StatusCode <= 100 {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
link := getLink(e.Attr("href"))
|
link := getLink(e.Attr("href"))
|
||||||
if findSuffix(SUFFIXES, link) {
|
if findSuffix(SUFFIXES, link) {
|
||||||
return
|
return
|
||||||
|
|
|
@ -144,7 +144,7 @@ func Ingest(config types.Config) {
|
||||||
page.AboutSource = token
|
page.AboutSource = token
|
||||||
processed = partitionSentence(payload)
|
processed = partitionSentence(payload)
|
||||||
case "para":
|
case "para":
|
||||||
if (page.AboutSource != "og-desc" || len(rawdata)*10 > len(page.About)*7) {
|
if page.AboutSource != "og-desc" || len(rawdata)*10 > len(page.About)*7 {
|
||||||
if performAboutHeuristic(config.Data.Heuristics, payload) {
|
if performAboutHeuristic(config.Data.Heuristics, payload) {
|
||||||
page.About = rawdata
|
page.About = rawdata
|
||||||
page.AboutSource = token
|
page.AboutSource = token
|
||||||
|
@ -200,12 +200,14 @@ func ingestBatch(db *sql.DB, batch []types.SearchFragment, pageMap map[string]ty
|
||||||
i++
|
i++
|
||||||
}
|
}
|
||||||
// TODO (2021-11-10): debug the "incomplete input" error / log, and find out where it is coming from
|
// TODO (2021-11-10): debug the "incomplete input" error / log, and find out where it is coming from
|
||||||
log.Println("starting to ingest batch (Pages:", len(pages), "Words:", len(batch), "Links:", len(links),")")
|
log.Println("starting to ingest batch (Pages:", len(pages), "Words:", len(batch), "Links:", len(links), ")")
|
||||||
database.InsertManyDomains(db, pages)
|
database.InsertManyDomains(db, pages)
|
||||||
database.InsertManyPages(db, pages)
|
database.InsertManyPages(db, pages)
|
||||||
for i := 0; i < len(batch); i += 3000 {
|
for i := 0; i < len(batch); i += 3000 {
|
||||||
end_i := i + 3000
|
end_i := i + 3000
|
||||||
if (end_i > len(batch)) { end_i = len(batch)}
|
if end_i > len(batch) {
|
||||||
|
end_i = len(batch)
|
||||||
|
}
|
||||||
database.InsertManyWords(db, batch[i:end_i])
|
database.InsertManyWords(db, batch[i:end_i])
|
||||||
}
|
}
|
||||||
database.InsertManyExternalLinks(db, links)
|
database.InsertManyExternalLinks(db, links)
|
||||||
|
|
|
@ -7,10 +7,10 @@ type SearchFragment struct {
|
||||||
}
|
}
|
||||||
|
|
||||||
type PageData struct {
|
type PageData struct {
|
||||||
URL string
|
URL string
|
||||||
Title string
|
Title string
|
||||||
About string
|
About string
|
||||||
Lang string
|
Lang string
|
||||||
AboutSource string
|
AboutSource string
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -30,10 +30,10 @@ type Config struct {
|
||||||
Links string `json:"links"`
|
Links string `json:"links"`
|
||||||
} `json:"theme"`
|
} `json:"theme"`
|
||||||
Data struct {
|
Data struct {
|
||||||
Source string `json:source`
|
Source string `json:source`
|
||||||
Database string `json:database`
|
Database string `json:database`
|
||||||
Heuristics string `json:heuristics`
|
Heuristics string `json:heuristics`
|
||||||
Wordlist string `json:wordlist`
|
Wordlist string `json:wordlist`
|
||||||
} `json:data`
|
} `json:data`
|
||||||
Crawler struct {
|
Crawler struct {
|
||||||
Webring string `json:webring`
|
Webring string `json:webring`
|
||||||
|
|
Ładowanie…
Reference in New Issue