improve crawling rules wrt path-suffixed sites, close #6

pull/9/head 2022-03-07
cblgh 2022-03-07 11:21:23 +01:00 zatwierdzone przez Alexander Cobleigh
rodzic d98f76573f
commit 9f912b8323
2 zmienionych plików z 43 dodań i 12 usunięć

Wyświetl plik

@ -72,16 +72,23 @@ func getWebringLinks(path string) []string {
return links return links
} }
// getDomains splits the webring links into the domains to crawl and the
// "pathsites" that require stricter crawling. A pathsite is a link carrying a
// meaningful path component, e.g. https://example.com/site/lupin — only child
// pages of /site/lupin/ should be crawled. This applies to shared hosts such
// as tilde sites. Links that fail to parse are skipped. Returns the domain
// list and the pathsite list, in input order.
func getDomains(links []string) ([]string, []string) {
	var domains []string
	var pathsites []string
	for _, l := range links {
		u, err := url.Parse(l)
		if err != nil {
			// skip malformed entries rather than aborting crawl setup
			continue
		}
		domains = append(domains, u.Hostname())
		// Only treat the link as a pathsite when it has a non-trivial path.
		// BUGFIX: the previous check used || (a tautology — always true for
		// any non-empty path), so "/" and "/index.html" were wrongly counted
		// as pathsites; it also compared against "index.html" without the
		// leading slash that url.Parse always produces.
		if u.Path != "" && u.Path != "/" && u.Path != "/index.html" {
			pathsites = append(pathsites, l)
		}
	}
	return domains, pathsites
}
func findSuffix(suffixes []string, query string) bool { func findSuffix(suffixes []string, query string) bool {
@ -184,7 +191,7 @@ func Precrawl(config types.Config) {
func Crawl(config types.Config) { func Crawl(config types.Config) {
SUFFIXES := getBannedSuffixes(config.Crawler.BannedSuffixes) SUFFIXES := getBannedSuffixes(config.Crawler.BannedSuffixes)
links := getWebringLinks(config.Crawler.Webring) links := getWebringLinks(config.Crawler.Webring)
domains := getDomains(links) domains, pathsites := getDomains(links)
initialDomain := config.General.URL initialDomain := config.General.URL
// TODO: introduce c2 for scraping links (with depth 1) linked to from webring domains // TODO: introduce c2 for scraping links (with depth 1) linked to from webring domains
@ -219,12 +226,18 @@ func Crawl(config types.Config) {
if findSuffix(SUFFIXES, link) { if findSuffix(SUFFIXES, link) {
return return
} }
link = e.Request.AbsoluteURL(link) link = e.Request.AbsoluteURL(link)
u, err := url.Parse(link) u, err := url.Parse(link)
if err != nil {
return
}
outgoingDomain := u.Hostname()
currentDomain := e.Request.URL.Hostname()
// log which site links to what // log which site links to what
if err == nil && !util.Contains(boringWords, link) && !util.Contains(boringDomains, link) { if !util.Contains(boringWords, link) && !util.Contains(boringDomains, link) {
outgoingDomain := u.Hostname()
currentDomain := e.Request.URL.Hostname()
if !find(domains, outgoingDomain) { if !find(domains, outgoingDomain) {
fmt.Println("non-webring-link", link, e.Request.URL) fmt.Println("non-webring-link", link, e.Request.URL)
// solidarity! someone in the webring linked to someone else in it // solidarity! someone in the webring linked to someone else in it
@ -232,8 +245,26 @@ func Crawl(config types.Config) {
fmt.Println("webring-link", link, e.Request.URL) fmt.Println("webring-link", link, e.Request.URL)
} }
} }
// only visits links from AllowedDomains
q.AddURL(link) // rule-based crawling
var pathsite string
for _, s := range pathsites {
if strings.Contains(s, outgoingDomain) {
pathsite = s
break
}
}
// the visited site was a so-called "pathsite", a site with restrictions on which pages can be crawled (most often due to
// existing on a shared domain)
if pathsite != "" {
// make sure we're only crawling descendants of the original path
if strings.HasPrefix(link, pathsite) {
q.AddURL(link)
}
} else {
// visits links from AllowedDomains
q.AddURL(link)
}
}) })
handleIndexing(c) handleIndexing(c)

Wyświetl plik

@ -229,10 +229,10 @@ func (h RequestHandler) renderView(res http.ResponseWriter, tmpl string, view *T
func WriteTheme(config types.Config) { func WriteTheme(config types.Config) {
theme := config.Theme theme := config.Theme
// no theme is set, use the default // no theme is set, use the default
if theme.Foreground == "" { if theme.Foreground == "" {
return return
} }
colors := fmt.Sprintf(`:root { colors := fmt.Sprintf(`:root {
--primary: %s; --primary: %s;
--secondary: %s; --secondary: %s;