From 21ef8aac08b444e736e82885517527bf9c7dac07 Mon Sep 17 00:00:00 2001 From: idk Date: Tue, 29 Mar 2022 08:36:48 -0400 Subject: [PATCH] Allows the configuration of a proxy (#9) * Add proxy support, capability to crawl using SOCKS proxies --- .gitignore | 1 + crawler/crawler.go | 28 ++++++++++++++++++++++++++++ data/crawled.txt | 1 + data/webring.txt | 1 + types/types.go | 1 + util/util.go | 15 ++++++++++++++- 6 files changed, 46 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 0b898b7..9cc89f2 100755 --- a/.gitignore +++ b/.gitignore @@ -224,3 +224,4 @@ pip-log.txt #Mr Developer .mr.developer.cfg +lieu diff --git a/crawler/crawler.go b/crawler/crawler.go index c3081c4..91b1662 100644 --- a/crawler/crawler.go +++ b/crawler/crawler.go @@ -153,7 +153,29 @@ func collectHeadingText(heading string, e *colly.HTMLElement) { } } +func SetupDefaultProxy(config types.Config) error { + proxyURL, err := url.Parse(config.General.Proxy) + if err != nil { + return err + } + + httpClient := &http.Client{ + Transport: &http.Transport{ + Proxy: http.ProxyURL(proxyURL), + }, + } + + //colly.SetHTTPClient(httpClient) + http.DefaultClient = httpClient + return nil +} + func Precrawl(config types.Config) { + // setup proxy + err := SetupDefaultProxy(config) + if err != nil { + log.Fatal(err) + } res, err := http.Get(config.General.URL) util.Check(err) defer res.Body.Close() @@ -189,6 +211,11 @@ func Precrawl(config types.Config) { } func Crawl(config types.Config) { + // setup proxy + err := SetupDefaultProxy(config) + if err != nil { + log.Fatal(err) + } SUFFIXES := getBannedSuffixes(config.Crawler.BannedSuffixes) links := getWebringLinks(config.Crawler.Webring) domains, pathsites := getDomains(links) @@ -199,6 +226,7 @@ func Crawl(config types.Config) { c := colly.NewCollector( colly.MaxDepth(3), ) + c.SetProxy(config.General.Proxy) q, _ := queue.New( 5, /* threads */ diff --git a/data/crawled.txt b/data/crawled.txt index e69de29..8b13789 100644 --- 
// DeduplicateSlice returns a new slice holding the distinct strings of
// intSlice, in order of first appearance. The input slice is not modified.
// The result is always non-nil, even when intSlice is nil or empty.
//
// Despite its name, intSlice holds strings.
func DeduplicateSlice(intSlice []string) []string {
	// Both containers can never need more room than the input length.
	seen := make(map[string]struct{}, len(intSlice))
	unique := make([]string, 0, len(intSlice))
	for _, entry := range intSlice {
		if _, dup := seen[entry]; dup {
			continue
		}
		seen[entry] = struct{}{}
		unique = append(unique, entry)
	}
	return unique
}