package crawler

import (
	"fmt"
	"lieu/types"
	"lieu/util"
	"log"
	"net/http"
	"net/url"
	"regexp"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/gocolly/colly/v2"
	"github.com/gocolly/colly/v2/queue"
)

// the following domains are excluded from crawling & indexing, typically because they have a lot of microblog pages
// (very spammy)
func getBannedDomains(path string) []string {
	return util.ReadList(path, "\n")
}

func getBannedSuffixes(path string) []string {
	return util.ReadList(path, "\n")
}

func getBoringWords(path string) []string {
	return util.ReadList(path, "\n")
}

func getBoringDomains(path string) []string {
	return util.ReadList(path, "\n")
}

func getAboutHeuristics(path string) []string {
	return util.ReadList(path, "\n")
}
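
// getPreviewQueries reads the CSS selectors used when extracting preview
// paragraphs, falling back to a default list when none are configured.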
func getPreviewQueries(path string) []string {
	previewQueries := util.ReadList(path, "\n")
	if len(previewQueries) > 0 {
		return previewQueries
	}
	return []string{"main p", "article p", "section p", "p"}
}
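
// find reports whether query appears verbatim in list.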
func find(list []string, query string) bool {
	for _, item := range list {
		if item == query {
			return true
		}
	}
	return false
}
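
// getLink normalizes an href for crawling: it strips the #fragment and ?query
// parts, trims whitespace, and removes a trailing slash.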
func getLink(target string) string {
	// remove anchor links
	if strings.Contains(target, "#") {
		target = strings.Split(target, "#")[0]
	}
	if strings.Contains(target, "?") {
		target = strings.Split(target, "?")[0]
	}
	target = strings.TrimSpace(target)
	// remove trailing /
	return strings.TrimSuffix(target, "/")
}
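
// getWebringLinks reads the webring list from path, skipping entries that fail
// to parse as URLs and defaulting schemeless entries to https.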
func getWebringLinks(path string) []string {
	var links []string
	candidates := util.ReadList(path, "\n")
	for _, l := range candidates {
		u, err := url.Parse(l)
		if err != nil {
			continue
		}
		if u.Scheme == "" {
			u.Scheme = "https"
		}
		links = append(links, u.String())
	}
	return links
}
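
// getDomains splits the webring links into plain hostnames and "pathsites":
// links that carry a meaningful path and therefore get stricter, path-scoped
// crawling (useful for shared hosts such as tilde servers).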
func getDomains(links []string) ([]string, []string) {
	var domains []string
	// sites which should have stricter crawling enforced (e.g. applicable for shared sites like tilde sites)
	// pathsites are sites that are passed in which contain a path,
	// e.g. https://example.com/site/lupin -> only child pages of /site/lupin/ will be crawled
	var pathsites []string
	for _, l := range links {
		u, err := url.Parse(l)
		if err != nil {
			continue
		}
		domains = append(domains, u.Hostname())
		// only treat the link as a pathsite if it points at something other than the site root
		if len(u.Path) > 0 && u.Path != "/" && u.Path != "index.html" {
			pathsites = append(pathsites, l)
		}
	}
	return domains, pathsites
}
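
// findSuffix reports whether the lowercased query ends in any of the given suffixes.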
func findSuffix(suffixes []string, query string) bool {
	for _, suffix := range suffixes {
		if strings.HasSuffix(strings.ToLower(query), suffix) {
			return true
		}
	}
	return false
}
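
// cleanText trims the string, replaces newlines with spaces, and collapses runs
// of Unicode space separators into a single space, e.g. "  a\n  b  " -> "a b".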
func cleanText(s string) string {
	s = strings.TrimSpace(s)
	s = strings.ReplaceAll(s, "\n", " ")
	whitespace := regexp.MustCompile(`\p{Z}+`)
	s = whitespace.ReplaceAllString(s, " ")
	return s
}
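
// handleIndexing registers the OnHTML callbacks that print index records
// (keywords, descriptions, language, title, preview paragraphs and headings)
// to stdout, one record per line.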
func handleIndexing(c *colly.Collector, previewQueries []string, heuristics []string) {
	c.OnHTML("meta[name=\"keywords\"]", func(e *colly.HTMLElement) {
		fmt.Println("keywords", cleanText(e.Attr("content")), e.Request.URL)
	})

	c.OnHTML("meta[name=\"description\"]", func(e *colly.HTMLElement) {
		desc := cleanText(e.Attr("content"))
		if len(desc) > 0 && len(desc) < 1500 {
			fmt.Println("desc", desc, e.Request.URL)
		}
	})

	c.OnHTML("meta[property=\"og:description\"]", func(e *colly.HTMLElement) {
		ogDesc := cleanText(e.Attr("content"))
		if len(ogDesc) > 0 && len(ogDesc) < 1500 {
			fmt.Println("og-desc", ogDesc, e.Request.URL)
		}
	})

	c.OnHTML("html[lang]", func(e *colly.HTMLElement) {
		lang := cleanText(e.Attr("lang"))
		if len(lang) > 0 && len(lang) < 100 {
			fmt.Println("lang", lang, e.Request.URL)
		}
	})

	// get page title
	c.OnHTML("title", func(e *colly.HTMLElement) {
		fmt.Println("title", cleanText(e.Text), e.Request.URL)
	})
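
	// page preview: emit the first paragraph of reasonable length matched by the
	// preview selectors, unless it is filtered out by the "about" heuristics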
	c.OnHTML("body", func(e *colly.HTMLElement) {
	QueryLoop:
		for i := 0; i < len(previewQueries); i++ {
			// After the fourth paragraph, we're probably too far in to get something interesting for a preview
			elements := e.DOM.Find(previewQueries[i])
			for j := 0; j < 4 && j < elements.Length(); j++ {
				elementText := elements.Slice(j, j+1).Text()
				paragraph := cleanText(elementText)
				if len(paragraph) < 1500 && len(paragraph) > 20 {
					if !util.Contains(heuristics, strings.ToLower(paragraph)) {
						fmt.Println("para", paragraph, e.Request.URL)
						break QueryLoop
					}
				}
			}
		}
		paragraph := cleanText(e.DOM.Find("p").First().Text())
		if len(paragraph) < 1500 && len(paragraph) > 0 {
			fmt.Println("para-just-p", paragraph, e.Request.URL)
		}

		// get all relevant page headings
		collectHeadingText("h1", e)
		collectHeadingText("h2", e)
		collectHeadingText("h3", e)
	})
}
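
// collectHeadingText prints every heading of the given level (h1, h2, h3) on the
// page, skipping headings of 500 characters or more.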
func collectHeadingText(heading string, e *colly.HTMLElement) {
	for _, headingText := range e.ChildTexts(heading) {
		if len(headingText) < 500 {
			fmt.Println(heading, cleanText(headingText), e.Request.URL)
		}
	}
}
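
// SetupDefaultProxy routes http.DefaultClient through the proxy named in the
// config, so plain http.Get calls (as used in Precrawl) also respect it.
// It is a no-op when no proxy is configured.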
func SetupDefaultProxy(config types.Config) error {
	// no proxy configured, nothing to do
	if config.General.Proxy == "" {
		return nil
	}
	proxyURL, err := url.Parse(config.General.Proxy)
	if err != nil {
		return err
	}

	httpClient := &http.Client{
		Transport: &http.Transport{
			Proxy: http.ProxyURL(proxyURL),
		},
	}

	http.DefaultClient = httpClient
	return nil
}
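
// Precrawl fetches the webring index page, extracts its member links using the
// configured CSS selector (falling back to "li > a[href]:first-of-type"),
// filters out banned domains, and prints the remaining links to stdout.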
func Precrawl(config types.Config) {
	// setup proxy
	err := SetupDefaultProxy(config)
	if err != nil {
		log.Fatal(err)
	}

	res, err := http.Get(config.General.URL)
	util.Check(err)
	defer res.Body.Close()

	if res.StatusCode != 200 {
		log.Fatal("status not 200")
	}

	doc, err := goquery.NewDocumentFromReader(res.Body)
	util.Check(err)

	items := make([]string, 0)
	s := doc.Find("html")
	query := config.General.WebringSelector
	if query == "" {
		query = "li > a[href]:first-of-type"
	}
	util.QuerySelector(query, s, &items)

	BANNED := getBannedDomains(config.Crawler.BannedDomains)
	for _, item := range items {
		link := getLink(item)
		u, err := url.Parse(link)
		// invalid link
		if err != nil {
			continue
		}
		domain := u.Hostname()
		if find(BANNED, domain) {
			continue
		}
		fmt.Println(link)
	}
}
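
// Crawl walks every site in the webring with a colly collector restricted to the
// webring domains, printing link-graph records and index records to stdout.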
func Crawl(config types.Config) {
	// setup proxy
	err := SetupDefaultProxy(config)
	if err != nil {
		log.Fatal(err)
	}
	SUFFIXES := getBannedSuffixes(config.Crawler.BannedSuffixes)
	links := getWebringLinks(config.Crawler.Webring)
	domains, pathsites := getDomains(links)
	// hostname of the webring index itself, used below to tell apart links to and from it
	initialURL, err := url.Parse(config.General.URL)
	if err != nil {
		log.Fatal(err)
	}
	initialDomain := initialURL.Hostname()

	// TODO: introduce c2 for scraping links (with depth 1) linked to from webring domains
	// instantiate default collector
	c := colly.NewCollector(
		colly.MaxDepth(3),
	)
	if config.General.Proxy != "" {
		c.SetProxy(config.General.Proxy)
	}

	q, _ := queue.New(
		5, /* threads */
		&queue.InMemoryQueueStorage{MaxSize: 100000},
	)

	for _, link := range links {
		q.AddURL(link)
	}

	c.UserAgent = "Lieu"
	c.AllowedDomains = domains
	c.AllowURLRevisit = false
	c.DisallowedDomains = getBannedDomains(config.Crawler.BannedDomains)

	delay, _ := time.ParseDuration("200ms")
	c.Limit(&colly.LimitRule{DomainGlob: "*", Delay: delay, Parallelism: 3})

	boringDomains := getBoringDomains(config.Crawler.BoringDomains)
	boringWords := getBoringWords(config.Crawler.BoringWords)
	previewQueries := getPreviewQueries(config.Crawler.PreviewQueries)
	heuristics := getAboutHeuristics(config.Data.Heuristics)

	// on every a element which has an href attribute, call callback
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
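		// skip link handling for error responses (>= 400) and responses without a real status code (<= 100)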
		if e.Response.StatusCode >= 400 || e.Response.StatusCode <= 100 {
			return
		}

		link := getLink(e.Attr("href"))
		if findSuffix(SUFFIXES, link) {
			return
		}

		link = e.Request.AbsoluteURL(link)
		u, err := url.Parse(link)
		if err != nil {
			return
		}

		outgoingDomain := u.Hostname()
		currentDomain := e.Request.URL.Hostname()

		// log which site links to what
		if !util.Contains(boringWords, link) && !util.Contains(boringDomains, link) {
			if !find(domains, outgoingDomain) {
				fmt.Println("non-webring-link", link, e.Request.URL)
				// solidarity! someone in the webring linked to someone else in it
			} else if outgoingDomain != currentDomain && outgoingDomain != initialDomain && currentDomain != initialDomain {
				fmt.Println("webring-link", link, e.Request.URL)
			}
		}

		// rule-based crawling
		var pathsite string
		for _, s := range pathsites {
			if strings.Contains(s, outgoingDomain) {
				pathsite = s
				break
			}
		}
		// the visited site was a so-called "pathsite": a site with restrictions on which pages may be crawled
		// (most often because it lives on a shared domain)
		if pathsite != "" {
			// make sure we're only crawling descendants of the original path
			if strings.HasPrefix(link, pathsite) {
				q.AddURL(link)
			}
		} else {
			// visits links from AllowedDomains
			q.AddURL(link)
		}
	})

	handleIndexing(c, previewQueries, heuristics)

	// start scraping
	q.Run(c)
}