package crawler

import (
	"fmt"
	"lieu/types"
	"lieu/util"
	"log"
	"net/http"
	"net/url"
	"regexp"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/gocolly/colly/v2"
	"github.com/gocolly/colly/v2/queue"
)

// the following domains are excluded from crawling & indexing, typically
// because they have a lot of microblog pages (very spammy)
func getBannedDomains(path string) []string {
	return util.ReadList(path, "\n")
}

func getBannedSuffixes(path string) []string {
	return util.ReadList(path, "\n")
}

func getBoringWords(path string) []string {
	return util.ReadList(path, "\n")
}

func getBoringDomains(path string) []string {
	return util.ReadList(path, "\n")
}

func getAboutHeuristics(path string) []string {
	return util.ReadList(path, "\n")
}

func getPreviewQueries(path string) []string {
	previewQueries := util.ReadList(path, "\n")
	if len(previewQueries) > 0 {
		return previewQueries
	}
	return []string{"main p", "article p", "section p", "p"}
}

func find(list []string, query string) bool {
	for _, item := range list {
		if item == query {
			return true
		}
	}
	return false
}

func getLink(target string) string {
	// remove anchor links
	if strings.Contains(target, "#") {
		target = strings.Split(target, "#")[0]
	}
	if strings.Contains(target, "?") {
		target = strings.Split(target, "?")[0]
	}
	target = strings.TrimSpace(target)
	// remove trailing /
	return strings.TrimSuffix(target, "/")
}

func getWebringLinks(path string) []string {
	var links []string
	candidates := util.ReadList(path, "\n")
	for _, l := range candidates {
		u, err := url.Parse(l)
		if err != nil {
			continue
		}
		if u.Scheme == "" {
			u.Scheme = "https"
		}
		links = append(links, u.String())
	}
	return links
}
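// A quick sketch of the normalization getLink performs, using hypothetical
// URLs: fragments and query strings are stripped and any trailing slash is
// removed, so all of the following reduce to "https://example.org/page":
//
//	getLink("https://example.org/page#top")
//	getLink("https://example.org/page?ref=webring")
//	getLink("https://example.org/page/")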
func getDomains(links []string) ([]string, []string) {
	var domains []string
	// pathsites are sites which should have stricter crawling enforced (e.g.
	// applicable for shared sites like tilde sites): sites that are passed in
	// with a path, e.g. https://example.com/site/lupin -> only children pages
	// of /site/lupin/ will be crawled
	var pathsites []string
	for _, l := range links {
		u, err := url.Parse(l)
		if err != nil {
			continue
		}
		domains = append(domains, u.Hostname())
		if len(u.Path) > 0 && u.Path != "/" && u.Path != "/index.html" {
			pathsites = append(pathsites, l)
		}
	}
	return domains, pathsites
}

func findSuffix(suffixes []string, query string) bool {
	for _, suffix := range suffixes {
		if strings.HasSuffix(strings.ToLower(query), suffix) {
			return true
		}
	}
	return false
}

var whitespace = regexp.MustCompile(`\p{Z}+`)

func cleanText(s string) string {
	s = strings.TrimSpace(s)
	s = strings.ReplaceAll(s, "\n", " ")
	return whitespace.ReplaceAllString(s, " ")
}

func handleIndexing(c *colly.Collector, previewQueries []string, heuristics []string) {
	c.OnHTML(`meta[name="keywords"]`, func(e *colly.HTMLElement) {
		fmt.Println("keywords", cleanText(e.Attr("content")), e.Request.URL)
	})

	c.OnHTML(`meta[name="description"]`, func(e *colly.HTMLElement) {
		desc := cleanText(e.Attr("content"))
		if len(desc) > 0 && len(desc) < 1500 {
			fmt.Println("desc", desc, e.Request.URL)
		}
	})

	c.OnHTML(`meta[property="og:description"]`, func(e *colly.HTMLElement) {
		ogDesc := cleanText(e.Attr("content"))
		if len(ogDesc) > 0 && len(ogDesc) < 1500 {
			fmt.Println("og-desc", ogDesc, e.Request.URL)
		}
	})

	c.OnHTML("html[lang]", func(e *colly.HTMLElement) {
		lang := cleanText(e.Attr("lang"))
		if len(lang) > 0 && len(lang) < 100 {
			fmt.Println("lang", lang, e.Request.URL)
		}
	})

	// get page title
	c.OnHTML("title", func(e *colly.HTMLElement) {
		fmt.Println("title", cleanText(e.Text), e.Request.URL)
	})

	c.OnHTML("body", func(e *colly.HTMLElement) {
	QueryLoop:
		for i := 0; i < len(previewQueries); i++ {
			elements := e.DOM.Find(previewQueries[i])
			// after the fourth paragraph we're probably too far in to get
			// something interesting for a preview
			for j := 0; j < 4 && j < elements.Length(); j++ {
				elementText := elements.Slice(j, j+1).Text()
				paragraph := cleanText(elementText)
				if len(paragraph) < 1500 && len(paragraph) > 20 {
					if !util.Contains(heuristics, strings.ToLower(paragraph)) {
						fmt.Println("para", paragraph, e.Request.URL)
						break QueryLoop
					}
				}
			}
		}
		paragraph := cleanText(e.DOM.Find("p").First().Text())
		if len(paragraph) < 1500 && len(paragraph) > 0 {
			fmt.Println("para-just-p", paragraph, e.Request.URL)
		}
		// get all relevant page headings
		collectHeadingText("h1", e)
		collectHeadingText("h2", e)
		collectHeadingText("h3", e)
	})
}

func collectHeadingText(heading string, e *colly.HTMLElement) {
	for _, headingText := range e.ChildTexts(heading) {
		if len(headingText) < 500 {
			fmt.Println(heading, cleanText(headingText), e.Request.URL)
		}
	}
}
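// For reference, the callbacks registered above write whitespace-separated
// records to stdout, one per line, of the shape "<field> <value...> <url>".
// With hypothetical values, the output for a single page might look like:
//
//	title My Homepage https://example.org/
//	desc A small site about plants https://example.org/
//	para Welcome to my corner of the web https://example.org/
//	h1 Hello! https://example.org/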
func SetupDefaultProxy(config types.Config) error {
	// no proxy configured; nothing to do
	if config.General.Proxy == "" {
		return nil
	}
	proxyURL, err := url.Parse(config.General.Proxy)
	if err != nil {
		return err
	}
	httpClient := &http.Client{
		Transport: &http.Transport{
			Proxy: http.ProxyURL(proxyURL),
		},
	}
	http.DefaultClient = httpClient
	return nil
}

func Precrawl(config types.Config) {
	// setup proxy
	err := SetupDefaultProxy(config)
	if err != nil {
		log.Fatal(err)
	}

	res, err := http.Get(config.General.URL)
	util.Check(err)
	defer res.Body.Close()

	if res.StatusCode != 200 {
		log.Fatal("status not 200")
	}

	doc, err := goquery.NewDocumentFromReader(res.Body)
	util.Check(err)

	items := make([]string, 0)
	s := doc.Find("html")
	query := config.General.WebringSelector
	if query == "" {
		query = "li > a[href]:first-of-type"
	}
	util.QuerySelector(query, s, &items)

	BANNED := getBannedDomains(config.Crawler.BannedDomains)
	for _, item := range items {
		link := getLink(item)
		u, err := url.Parse(link)
		// invalid link
		if err != nil {
			continue
		}
		domain := u.Hostname()
		if find(BANNED, domain) {
			continue
		}
		fmt.Println(link)
	}
}

func Crawl(config types.Config) {
	// setup proxy
	err := SetupDefaultProxy(config)
	if err != nil {
		log.Fatal(err)
	}
	SUFFIXES := getBannedSuffixes(config.Crawler.BannedSuffixes)
	links := getWebringLinks(config.Crawler.Webring)
	domains, pathsites := getDomains(links)
	initialDomain := config.General.URL

	// TODO: introduce c2 for scraping links (with depth 1) linked to from webring domains
	// instantiate default collector
	c := colly.NewCollector(
		colly.MaxDepth(3),
	)
	if config.General.Proxy != "" {
		c.SetProxy(config.General.Proxy)
	}

	q, _ := queue.New(
		5, /* threads */
		&queue.InMemoryQueueStorage{MaxSize: 100000},
	)

	for _, link := range links {
		q.AddURL(link)
	}

	c.UserAgent = "Lieu"
	c.AllowedDomains = domains
	c.AllowURLRevisit = false
	c.DisallowedDomains = getBannedDomains(config.Crawler.BannedDomains)

	c.Limit(&colly.LimitRule{DomainGlob: "*", Delay: 200 * time.Millisecond, Parallelism: 3})

	boringDomains := getBoringDomains(config.Crawler.BoringDomains)
	boringWords := getBoringWords(config.Crawler.BoringWords)
	previewQueries := getPreviewQueries(config.Crawler.PreviewQueries)
	heuristics := getAboutHeuristics(config.Data.Heuristics)

	// on every a element which has an href attribute, call callback
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		if e.Response.StatusCode >= 400 || e.Response.StatusCode <= 100 {
			return
		}
		link := getLink(e.Attr("href"))
		if findSuffix(SUFFIXES, link) {
			return
		}

		link = e.Request.AbsoluteURL(link)
		u, err := url.Parse(link)
		if err != nil {
			return
		}

		outgoingDomain := u.Hostname()
		currentDomain := e.Request.URL.Hostname()

		// log which site links to what
		if !util.Contains(boringWords, link) && !util.Contains(boringDomains, link) {
			if !find(domains, outgoingDomain) {
				fmt.Println("non-webring-link", link, e.Request.URL)
				// solidarity! someone in the webring linked to someone else in it
			} else if outgoingDomain != currentDomain && outgoingDomain != initialDomain && currentDomain != initialDomain {
				fmt.Println("webring-link", link, e.Request.URL)
			}
		}

		// rule-based crawling
		var pathsite string
		for _, s := range pathsites {
			if strings.Contains(s, outgoingDomain) {
				pathsite = s
				break
			}
		}
		// the linked site is a so-called "pathsite": a site with restrictions
		// on which pages can be crawled (most often due to existing on a
		// shared domain)
		if pathsite != "" {
			// make sure we're only crawling descendants of the original path
			if strings.HasPrefix(link, pathsite) {
				q.AddURL(link)
			}
		} else {
			// visits links from AllowedDomains
			q.AddURL(link)
		}
	})

	handleIndexing(c, previewQueries, heuristics)

	// start scraping
	q.Run(c)
}
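// A sketch of the pathsite rule above, using hypothetical webring entries:
// https://tilde.example/~lupin/ has a non-root path, so it is treated as a
// pathsite and only links sharing that prefix are enqueued, whereas a
// root-level entry like https://example.org is crawled via AllowedDomains:
//
//	https://tilde.example/~lupin/posts/one.html -> enqueued
//	https://tilde.example/~zenigata/            -> skipped
//	https://example.org/about.html              -> enqueued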