kopia lustrzana https://github.com/cblgh/lieu
rodzic
d98f76573f
commit
9f912b8323
|
@ -72,16 +72,23 @@ func getWebringLinks(path string) []string {
|
||||||
return links
|
return links
|
||||||
}
|
}
|
||||||
|
|
||||||
// getDomains splits the webring links into the hostnames that may be crawled
// and the subset of links whose crawling must stay below a given path.
//
// It returns two slices:
//   - domains: the hostname of every link that parses as a URL, in input order.
//   - pathsites: the original link text of every parsed link that points
//     somewhere other than the site root, e.g.
//     https://example.com/site/lupin -> only children pages of /site/lupin/
//     should be crawled. This is meant for sites living on a shared domain
//     (such as tilde sites), where stricter crawling is enforced.
//
// Links that fail to parse are skipped entirely.
func getDomains(links []string) ([]string, []string) {
	var domains []string
	var pathsites []string
	for _, l := range links {
		u, err := url.Parse(l)
		if err != nil {
			continue
		}
		domains = append(domains, u.Hostname())
		// A link to the site root (no path, "/", or the root index page) does
		// not restrict crawling. The previous check joined two != comparisons
		// with ||, which is always true, so every link with a path, including
		// a bare trailing slash, was treated as a pathsite.
		switch u.Path {
		case "", "/", "/index.html", "index.html":
			// root of the site: crawl the whole domain
		default:
			pathsites = append(pathsites, l)
		}
	}
	return domains, pathsites
}
|
||||||
|
|
||||||
func findSuffix(suffixes []string, query string) bool {
|
func findSuffix(suffixes []string, query string) bool {
|
||||||
|
@ -184,7 +191,7 @@ func Precrawl(config types.Config) {
|
||||||
func Crawl(config types.Config) {
|
func Crawl(config types.Config) {
|
||||||
SUFFIXES := getBannedSuffixes(config.Crawler.BannedSuffixes)
|
SUFFIXES := getBannedSuffixes(config.Crawler.BannedSuffixes)
|
||||||
links := getWebringLinks(config.Crawler.Webring)
|
links := getWebringLinks(config.Crawler.Webring)
|
||||||
domains := getDomains(links)
|
domains, pathsites := getDomains(links)
|
||||||
initialDomain := config.General.URL
|
initialDomain := config.General.URL
|
||||||
|
|
||||||
// TODO: introduce c2 for scraping links (with depth 1) linked to from webring domains
|
// TODO: introduce c2 for scraping links (with depth 1) linked to from webring domains
|
||||||
|
@ -219,12 +226,18 @@ func Crawl(config types.Config) {
|
||||||
if findSuffix(SUFFIXES, link) {
|
if findSuffix(SUFFIXES, link) {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
link = e.Request.AbsoluteURL(link)
|
link = e.Request.AbsoluteURL(link)
|
||||||
u, err := url.Parse(link)
|
u, err := url.Parse(link)
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
outgoingDomain := u.Hostname()
|
||||||
|
currentDomain := e.Request.URL.Hostname()
|
||||||
|
|
||||||
// log which site links to what
|
// log which site links to what
|
||||||
if err == nil && !util.Contains(boringWords, link) && !util.Contains(boringDomains, link) {
|
if !util.Contains(boringWords, link) && !util.Contains(boringDomains, link) {
|
||||||
outgoingDomain := u.Hostname()
|
|
||||||
currentDomain := e.Request.URL.Hostname()
|
|
||||||
if !find(domains, outgoingDomain) {
|
if !find(domains, outgoingDomain) {
|
||||||
fmt.Println("non-webring-link", link, e.Request.URL)
|
fmt.Println("non-webring-link", link, e.Request.URL)
|
||||||
// solidarity! someone in the webring linked to someone else in it
|
// solidarity! someone in the webring linked to someone else in it
|
||||||
|
@ -232,8 +245,26 @@ func Crawl(config types.Config) {
|
||||||
fmt.Println("webring-link", link, e.Request.URL)
|
fmt.Println("webring-link", link, e.Request.URL)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// only visits links from AllowedDomains
|
|
||||||
q.AddURL(link)
|
// rule-based crawling
|
||||||
|
var pathsite string
|
||||||
|
for _, s := range pathsites {
|
||||||
|
if strings.Contains(s, outgoingDomain) {
|
||||||
|
pathsite = s
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// the visited site was a so-called "pathsite", a site with restrictions on which pages can be crawled (most often due to
|
||||||
|
// existing on a shared domain)
|
||||||
|
if pathsite != "" {
|
||||||
|
// make sure we're only crawling descendants of the original path
|
||||||
|
if strings.HasPrefix(link, pathsite) {
|
||||||
|
q.AddURL(link)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// visits links from AllowedDomains
|
||||||
|
q.AddURL(link)
|
||||||
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
handleIndexing(c)
|
handleIndexing(c)
|
||||||
|
|
|
@ -229,10 +229,10 @@ func (h RequestHandler) renderView(res http.ResponseWriter, tmpl string, view *T
|
||||||
|
|
||||||
func WriteTheme(config types.Config) {
|
func WriteTheme(config types.Config) {
|
||||||
theme := config.Theme
|
theme := config.Theme
|
||||||
// no theme is set, use the default
|
// no theme is set, use the default
|
||||||
if theme.Foreground == "" {
|
if theme.Foreground == "" {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
colors := fmt.Sprintf(`:root {
|
colors := fmt.Sprintf(`:root {
|
||||||
--primary: %s;
|
--primary: %s;
|
||||||
--secondary: %s;
|
--secondary: %s;
|
||||||
|
|
Ładowanie…
Reference in New Issue