package crawler

import (
	"fmt"
	"lieu/types"
	"lieu/util"
	"log"
	"net/http"
	"net/url"
	"regexp"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/gocolly/colly/v2"
	"github.com/gocolly/colly/v2/queue"
)

// the following domains are excluded from crawling & indexing, typically because they have a lot of microblog pages
// (very spammy)
func getBannedDomains(path string) []string {
	return util.ReadList(path, "\n")
}

func getBannedSuffixes(path string) []string {
	return util.ReadList(path, "\n")
}

func getBoringWords(path string) []string {
	return util.ReadList(path, "\n")
}

func getBoringDomains(path string) []string {
	return util.ReadList(path, "\n")
}

func getAboutHeuristics(path string) []string {
	return util.ReadList(path, "\n")
}
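
// getPreviewQueries reads the CSS selectors used when extracting preview
// paragraphs, falling back to a default list when none are configured.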
func getPreviewQueries(path string) []string {
	previewQueries := util.ReadList(path, "\n")
	if len(previewQueries) > 0 {
		return previewQueries
	}
	return []string{"main p", "article p", "section p", "p"}
}
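
// find reports whether query appears verbatim in list.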
func find(list []string, query string) bool {
	for _, item := range list {
		if item == query {
			return true
		}
	}
	return false
}
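
// getLink normalizes an href for crawling: it strips the #fragment and ?query
// parts, trims whitespace, and removes a trailing slash.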
func getLink(target string) string {
	// remove anchor links
	if strings.Contains(target, "#") {
		target = strings.Split(target, "#")[0]
	}
	if strings.Contains(target, "?") {
		target = strings.Split(target, "?")[0]
	}
	target = strings.TrimSpace(target)
	// remove trailing /
	return strings.TrimSuffix(target, "/")
}
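
// getWebringLinks reads the webring list from path, skipping entries that fail
// to parse as URLs and defaulting schemeless entries to https.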
func getWebringLinks(path string) []string {
	var links []string
	candidates := util.ReadList(path, "\n")
	for _, l := range candidates {
		u, err := url.Parse(l)
		if err != nil {
			continue
		}
		if u.Scheme == "" {
			u.Scheme = "https"
		}
		links = append(links, u.String())
	}
	return links
}
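
// getDomains splits the webring links into plain hostnames and "pathsites":
// links that carry a meaningful path and therefore get stricter, path-scoped
// crawling (useful for shared hosts such as tilde servers).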
func getDomains(links []string) ([]string, []string) {
	var domains []string
	// sites which should have stricter crawling enforced (e.g. applicable for shared sites like tilde sites)
	// pathsites are sites that are passed in which contain a path,
	// e.g. https://example.com/site/lupin -> only child pages of /site/lupin/ will be crawled
	var pathsites []string
	for _, l := range links {
		u, err := url.Parse(l)
		if err != nil {
			continue
		}
		domains = append(domains, u.Hostname())
		// only treat the link as a pathsite if it points at something other than the site root
		if len(u.Path) > 0 && u.Path != "/" && u.Path != "index.html" {
			pathsites = append(pathsites, l)
		}
	}
	return domains, pathsites
}
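
// findSuffix reports whether the lowercased query ends in any of the given suffixes.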
func findSuffix(suffixes []string, query string) bool {
	for _, suffix := range suffixes {
		if strings.HasSuffix(strings.ToLower(query), suffix) {
			return true
		}
	}
	return false
}
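
// cleanText trims the string, replaces newlines with spaces, and collapses runs
// of Unicode space separators into a single space, e.g. "  a\n  b  " -> "a b".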
func cleanText(s string) string {
	s = strings.TrimSpace(s)
	s = strings.ReplaceAll(s, "\n", " ")
	whitespace := regexp.MustCompile(`\p{Z}+`)
	s = whitespace.ReplaceAllString(s, " ")
	return s
}
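
// handleIndexing registers the OnHTML callbacks that print index records
// (keywords, descriptions, language, title, preview paragraphs and headings)
// to stdout, one record per line.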
func handleIndexing(c *colly.Collector, previewQueries []string, heuristics []string) {
	c.OnHTML("meta[name=\"keywords\"]", func(e *colly.HTMLElement) {
		fmt.Println("keywords", cleanText(e.Attr("content")), e.Request.URL)
	})

	c.OnHTML("meta[name=\"description\"]", func(e *colly.HTMLElement) {
		desc := cleanText(e.Attr("content"))
		if len(desc) > 0 && len(desc) < 1500 {
			fmt.Println("desc", desc, e.Request.URL)
		}
	})

	c.OnHTML("meta[property=\"og:description\"]", func(e *colly.HTMLElement) {
		ogDesc := cleanText(e.Attr("content"))
		if len(ogDesc) > 0 && len(ogDesc) < 1500 {
			fmt.Println("og-desc", ogDesc, e.Request.URL)
		}
	})

	c.OnHTML("html[lang]", func(e *colly.HTMLElement) {
		lang := cleanText(e.Attr("lang"))
		if len(lang) > 0 && len(lang) < 100 {
			fmt.Println("lang", lang, e.Request.URL)
		}
	})

	// get page title
	c.OnHTML("title", func(e *colly.HTMLElement) {
		fmt.Println("title", cleanText(e.Text), e.Request.URL)
	})
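
	// page preview: emit the first paragraph of reasonable length matched by the
	// preview selectors, unless it is filtered out by the "about" heuristics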
	c.OnHTML("body", func(e *colly.HTMLElement) {
	QueryLoop:
		for i := 0; i < len(previewQueries); i++ {
			// After the fourth paragraph, we're probably too far in to get something interesting for a preview
			elements := e.DOM.Find(previewQueries[i])
			for j := 0; j < 4 && j < elements.Length(); j++ {
				elementText := elements.Slice(j, j+1).Text()
				paragraph := cleanText(elementText)
				if len(paragraph) < 1500 && len(paragraph) > 20 {
					if !util.Contains(heuristics, strings.ToLower(paragraph)) {
						fmt.Println("para", paragraph, e.Request.URL)
						break QueryLoop
					}
				}
			}
		}
		paragraph := cleanText(e.DOM.Find("p").First().Text())
		if len(paragraph) < 1500 && len(paragraph) > 0 {
			fmt.Println("para-just-p", paragraph, e.Request.URL)
		}

		// get all relevant page headings
		collectHeadingText("h1", e)
		collectHeadingText("h2", e)
		collectHeadingText("h3", e)
	})
}
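
// collectHeadingText prints every heading of the given level (h1, h2, h3) on the
// page, skipping headings of 500 characters or more.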
func collectHeadingText(heading string, e *colly.HTMLElement) {
	for _, headingText := range e.ChildTexts(heading) {
		if len(headingText) < 500 {
			fmt.Println(heading, cleanText(headingText), e.Request.URL)
		}
	}
}
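
// SetupDefaultProxy routes http.DefaultClient through the proxy named in the
// config, so plain http.Get calls (as used in Precrawl) also respect it.
// It is a no-op when no proxy is configured.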
func SetupDefaultProxy(config types.Config) error {
	// no proxy configured, nothing to do
	if config.General.Proxy == "" {
		return nil
	}
	proxyURL, err := url.Parse(config.General.Proxy)
	if err != nil {
		return err
	}

	httpClient := &http.Client{
		Transport: &http.Transport{
			Proxy: http.ProxyURL(proxyURL),
		},
	}

	http.DefaultClient = httpClient
	return nil
}
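
// Precrawl fetches the webring index page, extracts its member links using the
// configured CSS selector (falling back to "li > a[href]:first-of-type"),
// filters out banned domains, and prints the remaining links to stdout.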
func Precrawl(config types.Config) {
	// setup proxy
	err := SetupDefaultProxy(config)
	if err != nil {
		log.Fatal(err)
	}

	res, err := http.Get(config.General.URL)
	util.Check(err)
	defer res.Body.Close()

	if res.StatusCode != 200 {
		log.Fatal("status not 200")
	}

	doc, err := goquery.NewDocumentFromReader(res.Body)
	util.Check(err)

	items := make([]string, 0)
	s := doc.Find("html")
	query := config.General.WebringSelector
	if query == "" {
		query = "li > a[href]:first-of-type"
	}
	util.QuerySelector(query, s, &items)

	BANNED := getBannedDomains(config.Crawler.BannedDomains)
	for _, item := range items {
		link := getLink(item)
		u, err := url.Parse(link)
		// invalid link
		if err != nil {
			continue
		}
		domain := u.Hostname()
		if find(BANNED, domain) {
			continue
		}
		fmt.Println(link)
	}
}
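
// Crawl walks every site in the webring with a colly collector restricted to the
// webring domains, printing link-graph records and index records to stdout.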
func Crawl(config types.Config) {
	// setup proxy
	err := SetupDefaultProxy(config)
	if err != nil {
		log.Fatal(err)
	}
	SUFFIXES := getBannedSuffixes(config.Crawler.BannedSuffixes)
	links := getWebringLinks(config.Crawler.Webring)
	domains, pathsites := getDomains(links)
	// hostname of the webring index itself, used below to tell apart links to and from it
	initialURL, err := url.Parse(config.General.URL)
	if err != nil {
		log.Fatal(err)
	}
	initialDomain := initialURL.Hostname()

	// TODO: introduce c2 for scraping links (with depth 1) linked to from webring domains
	// instantiate default collector
	c := colly.NewCollector(
		colly.MaxDepth(3),
	)
	if config.General.Proxy != "" {
		c.SetProxy(config.General.Proxy)
	}

	q, _ := queue.New(
		5, /* threads */
		&queue.InMemoryQueueStorage{MaxSize: 100000},
	)

	for _, link := range links {
		q.AddURL(link)
	}

	c.UserAgent = "Lieu"
	c.AllowedDomains = domains
	c.AllowURLRevisit = false
	c.DisallowedDomains = getBannedDomains(config.Crawler.BannedDomains)

	delay, _ := time.ParseDuration("200ms")
	c.Limit(&colly.LimitRule{DomainGlob: "*", Delay: delay, Parallelism: 3})

	boringDomains := getBoringDomains(config.Crawler.BoringDomains)
	boringWords := getBoringWords(config.Crawler.BoringWords)
	previewQueries := getPreviewQueries(config.Crawler.PreviewQueries)
	heuristics := getAboutHeuristics(config.Data.Heuristics)

	// on every a element which has an href attribute, call callback
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
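		// skip link handling for error responses (>= 400) and responses without a real status code (<= 100)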
		if e.Response.StatusCode >= 400 || e.Response.StatusCode <= 100 {
			return
		}

		link := getLink(e.Attr("href"))
		if findSuffix(SUFFIXES, link) {
			return
		}

		link = e.Request.AbsoluteURL(link)
		u, err := url.Parse(link)
		if err != nil {
			return
		}

		outgoingDomain := u.Hostname()
		currentDomain := e.Request.URL.Hostname()

		// log which site links to what
		if !util.Contains(boringWords, link) && !util.Contains(boringDomains, link) {
			if !find(domains, outgoingDomain) {
				fmt.Println("non-webring-link", link, e.Request.URL)
				// solidarity! someone in the webring linked to someone else in it
			} else if outgoingDomain != currentDomain && outgoingDomain != initialDomain && currentDomain != initialDomain {
				fmt.Println("webring-link", link, e.Request.URL)
			}
		}

		// rule-based crawling
		var pathsite string
		for _, s := range pathsites {
			if strings.Contains(s, outgoingDomain) {
				pathsite = s
				break
			}
		}
		// the visited site was a so-called "pathsite": a site with restrictions on which pages may be crawled
		// (most often because it lives on a shared domain)
		if pathsite != "" {
			// make sure we're only crawling descendants of the original path
			if strings.HasPrefix(link, pathsite) {
				q.AddURL(link)
			}
		} else {
			// visits links from AllowedDomains
			q.AddURL(link)
		}
	})

	handleIndexing(c, previewQueries, heuristics)

	// start scraping
	q.Run(c)
}