lieu/crawler/crawler.go

package crawler

import (
	"fmt"
	"lieu/types"
	"lieu/util"
	"log"
	"net/http"
	"net/url"
	"regexp"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/gocolly/colly/v2"
	"github.com/gocolly/colly/v2/queue"
)

// the following domains are excluded from crawling & indexing, typically because they have a lot of microblog pages
// (very spammy)
func getBannedDomains(path string) []string {
	return util.ReadList(path, "\n")
}

func getBannedSuffixes(path string) []string {
	return util.ReadList(path, "\n")
}

func getBoringWords(path string) []string {
	return util.ReadList(path, "\n")
}

func getBoringDomains(path string) []string {
	return util.ReadList(path, "\n")
}

func getAboutHeuristics(path string) []string {
	return util.ReadList(path, "\n")
}
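
// getPreviewQueries returns the CSS selectors used to pull a short text
// preview from a page, falling back to a default set of paragraph selectors
// when the configured list is empty.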
func getPreviewQueries(path string) []string {
	previewQueries := util.ReadList(path, "\n")
	if len(previewQueries) > 0 {
		return previewQueries
	}
	return []string{"main p", "article p", "section p", "p"}
}
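
// find reports whether query is present in list.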
func find(list []string, query string) bool {
	for _, item := range list {
		if item == query {
			return true
		}
	}
	return false
}
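
// getLink normalizes an href value by dropping any anchor fragment and query
// string, trimming whitespace, and removing a trailing slash.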
func getLink(target string) string {
	// remove anchor links
	if strings.Contains(target, "#") {
		target = strings.Split(target, "#")[0]
	}
	if strings.Contains(target, "?") {
		target = strings.Split(target, "?")[0]
	}
	target = strings.TrimSpace(target)
	// remove trailing /
	return strings.TrimSuffix(target, "/")
}
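
// getWebringLinks reads the list of webring member URLs and normalizes each
// entry to an absolute URL, defaulting to https when no scheme is present.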
func getWebringLinks(path string) []string {
	var links []string
	candidates := util.ReadList(path, "\n")
	for _, l := range candidates {
		u, err := url.Parse(l)
		if err != nil {
			continue
		}
		if u.Scheme == "" {
			u.Scheme = "https"
		}
		links = append(links, u.String())
	}
	return links
}
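
// getDomains splits the webring links into plain domains and "pathsites":
// links that include a path component, for which only descendant pages
// should be crawled.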
func getDomains(links []string) ([]string, []string) {
	var domains []string
	// sites which should have stricter crawling enforced (e.g. applicable for shared sites like tilde sites)
	// pathsites are sites that are passed in which contain a path,
	// e.g. https://example.com/site/lupin -> only children pages of /site/lupin/ will be crawled
	var pathsites []string
	for _, l := range links {
		u, err := url.Parse(l)
		if err != nil {
			continue
		}
		domains = append(domains, u.Hostname())
		// a link only counts as a pathsite if its path is something other than
		// the bare root or index page
		if len(u.Path) > 0 && u.Path != "/" && u.Path != "index.html" {
			pathsites = append(pathsites, l)
		}
	}
	return domains, pathsites
}
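
// findSuffix reports whether query ends with any of the given suffixes,
// ignoring case.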
func findSuffix(suffixes []string, query string) bool {
	for _, suffix := range suffixes {
		if strings.HasSuffix(strings.ToLower(query), suffix) {
			return true
		}
	}
	return false
}
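
// cleanText trims surrounding whitespace and collapses newlines and runs of
// whitespace into single spaces.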
func cleanText(s string) string {
	s = strings.TrimSpace(s)
	s = strings.ReplaceAll(s, "\n", " ")
	whitespace := regexp.MustCompile(`\p{Z}+`)
	s = whitespace.ReplaceAllString(s, " ")
	return s
}
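
// handleIndexing registers the OnHTML callbacks that emit the data used for
// indexing (keywords, descriptions, language, title, preview paragraphs and
// headings) to stdout, one record per line.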
func handleIndexing(c *colly.Collector, previewQueries []string, heuristics []string) {
	c.OnHTML("meta[name=\"keywords\"]", func(e *colly.HTMLElement) {
		fmt.Println("keywords", cleanText(e.Attr("content")), e.Request.URL)
	})

	c.OnHTML("meta[name=\"description\"]", func(e *colly.HTMLElement) {
		desc := cleanText(e.Attr("content"))
		if len(desc) > 0 && len(desc) < 1500 {
			fmt.Println("desc", desc, e.Request.URL)
		}
	})

	c.OnHTML("meta[property=\"og:description\"]", func(e *colly.HTMLElement) {
		ogDesc := cleanText(e.Attr("content"))
		if len(ogDesc) > 0 && len(ogDesc) < 1500 {
			fmt.Println("og-desc", ogDesc, e.Request.URL)
		}
	})

	c.OnHTML("html[lang]", func(e *colly.HTMLElement) {
		lang := cleanText(e.Attr("lang"))
		if len(lang) > 0 && len(lang) < 100 {
			fmt.Println("lang", lang, e.Request.URL)
		}
	})

	// get page title
	c.OnHTML("title", func(e *colly.HTMLElement) {
		fmt.Println("title", cleanText(e.Text), e.Request.URL)
	})

	c.OnHTML("body", func(e *colly.HTMLElement) {
	QueryLoop:
		for i := 0; i < len(previewQueries); i++ {
			elements := e.DOM.Find(previewQueries[i])
			// after the fourth paragraph we're probably too far in to get
			// something interesting for a preview
			for j := 0; j < 4 && j < elements.Length(); j++ {
				elementText := elements.Slice(j, j+1).Text()
				paragraph := cleanText(elementText)
				if len(paragraph) < 1500 && len(paragraph) > 20 {
					if !util.Contains(heuristics, strings.ToLower(paragraph)) {
						fmt.Println("para", paragraph, e.Request.URL)
						break QueryLoop
					}
				}
			}
		}
		paragraph := cleanText(e.DOM.Find("p").First().Text())
		if len(paragraph) < 1500 && len(paragraph) > 0 {
			fmt.Println("para-just-p", paragraph, e.Request.URL)
		}
		// get all relevant page headings
		collectHeadingText("h1", e)
		collectHeadingText("h2", e)
		collectHeadingText("h3", e)
	})
}
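
// collectHeadingText prints the text of every heading of the given level
// found on the page, skipping very long headings.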
func collectHeadingText(heading string, e *colly.HTMLElement) {
	for _, headingText := range e.ChildTexts(heading) {
		if len(headingText) < 500 {
			fmt.Println(heading, cleanText(headingText), e.Request.URL)
		}
	}
}
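
// SetupDefaultProxy points http.DefaultClient at the proxy from the config,
// so requests made with the default client go through it. It is a no-op when
// no proxy is configured.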
func SetupDefaultProxy(config types.Config) error {
	// no proxy configured; nothing to do
	if config.General.Proxy == "" {
		return nil
	}
	proxyURL, err := url.Parse(config.General.Proxy)
	if err != nil {
		return err
	}

	httpClient := &http.Client{
		Transport: &http.Transport{
			Proxy: http.ProxyURL(proxyURL),
		},
	}
	http.DefaultClient = httpClient
	return nil
}
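
// Precrawl fetches the webring index page, extracts the member links with
// the configured CSS selector (or a default one) and prints each link that
// is not on a banned domain.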
func Precrawl(config types.Config) {
	// setup proxy
	err := SetupDefaultProxy(config)
	if err != nil {
		log.Fatal(err)
	}

	res, err := http.Get(config.General.URL)
	util.Check(err)
	defer res.Body.Close()

	if res.StatusCode != 200 {
		log.Fatalf("precrawl: GET %s returned status %d", config.General.URL, res.StatusCode)
	}

	doc, err := goquery.NewDocumentFromReader(res.Body)
	util.Check(err)

	items := make([]string, 0)
	s := doc.Find("html")
	query := config.General.WebringSelector
	if query == "" {
		query = "li > a[href]:first-of-type"
	}
	util.QuerySelector(query, s, &items)

	BANNED := getBannedDomains(config.Crawler.BannedDomains)
	for _, item := range items {
		link := getLink(item)
		u, err := url.Parse(link)
		// skip invalid links
		if err != nil {
			continue
		}
		domain := u.Hostname()
		if find(BANNED, domain) {
			continue
		}
		fmt.Println(link)
	}
}
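
// Crawl configures a colly collector from the config and crawls every domain
// in the webring, printing links between sites as well as the data used for
// indexing.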
func Crawl(config types.Config) {
	// setup proxy
	err := SetupDefaultProxy(config)
	if err != nil {
		log.Fatal(err)
	}

	SUFFIXES := getBannedSuffixes(config.Crawler.BannedSuffixes)
	links := getWebringLinks(config.Crawler.Webring)
	domains, pathsites := getDomains(links)
	initialDomain := config.General.URL

	// TODO: introduce c2 for scraping links (with depth 1) linked to from webring domains
	// instantiate default collector
	c := colly.NewCollector(
		colly.MaxDepth(3),
	)
	if config.General.Proxy != "" {
		c.SetProxy(config.General.Proxy)
	}

	q, _ := queue.New(
		5, /* threads */
		&queue.InMemoryQueueStorage{MaxSize: 100000},
	)
	for _, link := range links {
		q.AddURL(link)
	}

	c.UserAgent = "Lieu"
	c.AllowedDomains = domains
	c.AllowURLRevisit = false
	c.DisallowedDomains = getBannedDomains(config.Crawler.BannedDomains)

	delay, _ := time.ParseDuration("200ms")
	c.Limit(&colly.LimitRule{DomainGlob: "*", Delay: delay, Parallelism: 3})

	boringDomains := getBoringDomains(config.Crawler.BoringDomains)
	boringWords := getBoringWords(config.Crawler.BoringWords)
	previewQueries := getPreviewQueries(config.Crawler.PreviewQueries)
	heuristics := getAboutHeuristics(config.Data.Heuristics)

	// on every <a> element which has an href attribute, call the callback
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		if e.Response.StatusCode >= 400 || e.Response.StatusCode <= 100 {
			return
		}
		link := getLink(e.Attr("href"))
		if findSuffix(SUFFIXES, link) {
			return
		}

		link = e.Request.AbsoluteURL(link)
		u, err := url.Parse(link)
		if err != nil {
			return
		}

		outgoingDomain := u.Hostname()
		currentDomain := e.Request.URL.Hostname()

		// log which site links to what
		if !util.Contains(boringWords, link) && !util.Contains(boringDomains, link) {
			if !find(domains, outgoingDomain) {
				fmt.Println("non-webring-link", link, e.Request.URL)
				// solidarity! someone in the webring linked to someone else in it
			} else if outgoingDomain != currentDomain && outgoingDomain != initialDomain && currentDomain != initialDomain {
				fmt.Println("webring-link", link, e.Request.URL)
			}
		}

		// rule-based crawling
		var pathsite string
		for _, s := range pathsites {
			if strings.Contains(s, outgoingDomain) {
				pathsite = s
				break
			}
		}
		// the linked site is a so-called "pathsite": a site with restrictions on which
		// pages can be crawled (most often because it lives on a shared domain)
		if pathsite != "" {
			// make sure we're only crawling descendants of the original path
			if strings.HasPrefix(link, pathsite) {
				q.AddURL(link)
			}
		} else {
			// visit links from AllowedDomains
			q.AddURL(link)
		}
	})

	handleIndexing(c, previewQueries, heuristics)

	// start scraping
	q.Run(c)
}