pull/15/head
cblgh 2022-11-22 14:08:59 +01:00
rodzic 9517f62de2
commit 9377bd6fab
3 zmienionych plików z 19 dodań i 17 usunięć

Wyświetl plik

@ -153,12 +153,12 @@ func handleIndexing(c *colly.Collector, previewQueries []string, heuristics []st
})
c.OnHTML("body", func(e *colly.HTMLElement) {
QueryLoop:
QueryLoop:
for i := 0; i < len(previewQueries); i++ {
// After the fourth paragraph we're probably too far in to get something interesting for a preview
elements := e.DOM.Find(previewQueries[i])
for j := 0; j < 4 && j < elements.Length() ; j++ {
element_text := elements.Slice(j,j+1).Text()
for j := 0; j < 4 && j < elements.Length(); j++ {
element_text := elements.Slice(j, j+1).Text()
paragraph := cleanText(element_text)
if len(paragraph) < 1500 && len(paragraph) > 20 {
if !util.Contains(heuristics, strings.ToLower(paragraph)) {
@ -172,7 +172,7 @@ func handleIndexing(c *colly.Collector, previewQueries []string, heuristics []st
if len(paragraph) < 1500 && len(paragraph) > 0 {
fmt.Println("para-just-p", paragraph, e.Request.URL)
}
// get all relevant page headings
collectHeadingText("h1", e)
collectHeadingText("h2", e)
@ -230,7 +230,7 @@ func Precrawl(config types.Config) {
s := doc.Find("html")
query := config.General.WebringSelector
if query == "" {
query = "li > a[href]:first-of-type"
query = "li > a[href]:first-of-type"
}
util.QuerySelector(query, s, &items)
@ -298,7 +298,7 @@ func Crawl(config types.Config) {
if e.Response.StatusCode >= 400 || e.Response.StatusCode <= 100 {
return
}
link := getLink(e.Attr("href"))
if findSuffix(SUFFIXES, link) {
return

Wyświetl plik

@ -144,7 +144,7 @@ func Ingest(config types.Config) {
page.AboutSource = token
processed = partitionSentence(payload)
case "para":
if (page.AboutSource != "og-desc" || len(rawdata)*10 > len(page.About)*7) {
if page.AboutSource != "og-desc" || len(rawdata)*10 > len(page.About)*7 {
if performAboutHeuristic(config.Data.Heuristics, payload) {
page.About = rawdata
page.AboutSource = token
@ -200,12 +200,14 @@ func ingestBatch(db *sql.DB, batch []types.SearchFragment, pageMap map[string]ty
i++
}
// TODO (2021-11-10): debug the "incomplete input" error / log, and find out where it is coming from
log.Println("starting to ingest batch (Pages:", len(pages), "Words:", len(batch), "Links:", len(links),")")
log.Println("starting to ingest batch (Pages:", len(pages), "Words:", len(batch), "Links:", len(links), ")")
database.InsertManyDomains(db, pages)
database.InsertManyPages(db, pages)
for i := 0; i < len(batch); i += 3000 {
end_i := i + 3000
if (end_i > len(batch)) { end_i = len(batch)}
if end_i > len(batch) {
end_i = len(batch)
}
database.InsertManyWords(db, batch[i:end_i])
}
database.InsertManyExternalLinks(db, links)

Wyświetl plik

@ -7,10 +7,10 @@ type SearchFragment struct {
}
// PageData groups the URL, title, about text, language and about-text source
// recorded for a page.
// NOTE(review): this listing is a rendered diff hunk without +/- markers, so the
// URL/Title/About/Lang lines appear twice (before and after what appears to be a
// whitespace-only realignment); presumably the real struct declares each field once.
type PageData struct {
URL string
Title string
About string
Lang string
URL string
Title string
About string
Lang string
// AboutSource identifies where About came from. The ingest hunk in this listing
// assigns it from the current token and compares it against "og-desc" —
// presumably on a value of this type; verify against the ingest code.
AboutSource string
}
@ -30,10 +30,10 @@ type Config struct {
Links string `json:"links"`
} `json:"theme"`
Data struct {
Source string `json:source`
Database string `json:database`
Heuristics string `json:heuristics`
Wordlist string `json:wordlist`
Source string `json:source`
Database string `json:database`
Heuristics string `json:heuristics`
Wordlist string `json:wordlist`
} `json:data`
Crawler struct {
Webring string `json:webring`