kopia lustrzana https://github.com/cblgh/lieu
add custom webring selector for precrawl
rodzic
21ef8aac08
commit
b0ad7dce10
|
@ -70,6 +70,9 @@ The config file is written in [TOML](https://toml.io/en/).
|
||||||
name = "Merveilles Webring"
|
name = "Merveilles Webring"
|
||||||
# used by the precrawl command and linked to in /about route
|
# used by the precrawl command and linked to in /about route
|
||||||
url = "https://webring.xxiivv.com"
|
url = "https://webring.xxiivv.com"
|
||||||
|
# used by the precrawl command to populate the Crawler.Webring file;
|
||||||
|
# takes simple html selectors. might be a bit wonky :)
|
||||||
|
webringSelector = "li > a[href]:first-of-type"
|
||||||
port = 10001
|
port = 10001
|
||||||
|
|
||||||
[theme]
|
[theme]
|
||||||
|
|
|
@ -154,6 +154,10 @@ func collectHeadingText(heading string, e *colly.HTMLElement) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func SetupDefaultProxy(config types.Config) error {
|
func SetupDefaultProxy(config types.Config) error {
|
||||||
|
// no proxy configured, go back
|
||||||
|
if config.General.Proxy == "" {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
proxyURL, err := url.Parse(config.General.Proxy)
|
proxyURL, err := url.Parse(config.General.Proxy)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
|
@ -165,7 +169,6 @@ func SetupDefaultProxy(config types.Config) error {
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
//colly.SetHTTPClient(httpClient)
|
|
||||||
http.DefaultClient = httpClient
|
http.DefaultClient = httpClient
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
@ -176,6 +179,7 @@ func Precrawl(config types.Config) {
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatal(err)
|
log.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
res, err := http.Get(config.General.URL)
|
res, err := http.Get(config.General.URL)
|
||||||
util.Check(err)
|
util.Check(err)
|
||||||
defer res.Body.Close()
|
defer res.Body.Close()
|
||||||
|
@ -188,11 +192,12 @@ func Precrawl(config types.Config) {
|
||||||
util.Check(err)
|
util.Check(err)
|
||||||
|
|
||||||
items := make([]string, 0)
|
items := make([]string, 0)
|
||||||
doc.Find("li").Each(func(i int, s *goquery.Selection) {
|
s := doc.Find("html")
|
||||||
if domain, exists := s.Find("a").Attr("href"); exists {
|
query := config.General.WebringSelector
|
||||||
items = append(items, domain)
|
if query == "" {
|
||||||
}
|
query = "li > a[href]:first-of-type"
|
||||||
})
|
}
|
||||||
|
util.QuerySelector(query, s, &items)
|
||||||
|
|
||||||
BANNED := getBannedDomains(config.Crawler.BannedDomains)
|
BANNED := getBannedDomains(config.Crawler.BannedDomains)
|
||||||
for _, item := range items {
|
for _, item := range items {
|
||||||
|
@ -226,7 +231,9 @@ func Crawl(config types.Config) {
|
||||||
c := colly.NewCollector(
|
c := colly.NewCollector(
|
||||||
colly.MaxDepth(3),
|
colly.MaxDepth(3),
|
||||||
)
|
)
|
||||||
c.SetProxy(config.General.Proxy)
|
if config.General.Proxy != "" {
|
||||||
|
c.SetProxy(config.General.Proxy)
|
||||||
|
}
|
||||||
|
|
||||||
q, _ := queue.New(
|
q, _ := queue.New(
|
||||||
5, /* threads */
|
5, /* threads */
|
||||||
|
|
|
@ -15,12 +15,13 @@ type PageData struct {
|
||||||
|
|
||||||
type Config struct {
|
type Config struct {
|
||||||
General struct {
|
General struct {
|
||||||
Name string `json:name`
|
Name string `json:name`
|
||||||
Tagline string `json:tagline`
|
Tagline string `json:tagline`
|
||||||
Placeholder string `json:placeholder`
|
Placeholder string `json:placeholder`
|
||||||
URL string `json:url`
|
URL string `json:url`
|
||||||
Port int `json:port`
|
WebringSelector string `json:"webringSelector"`
|
||||||
Proxy string `json:proxy`
|
Port int `json:port`
|
||||||
|
Proxy string `json:proxy`
|
||||||
} `json:general`
|
} `json:general`
|
||||||
Theme struct {
|
Theme struct {
|
||||||
Foreground string `json:"foreground"`
|
Foreground string `json:"foreground"`
|
||||||
|
|
63
util/util.go
63
util/util.go
|
@ -4,10 +4,12 @@ import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"github.com/PuerkitoBio/goquery"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
"log"
|
"log"
|
||||||
"net"
|
"net"
|
||||||
"os"
|
"os"
|
||||||
|
"regexp"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"lieu/types"
|
"lieu/types"
|
||||||
|
@ -30,6 +32,66 @@ func Check(err error) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// document.querySelectorAll-type functionality. limited functionality as of now (no classes or id support atm, i think!!)
|
||||||
|
func QuerySelector(query string, current *goquery.Selection, results *[]string) {
|
||||||
|
var op, operand string
|
||||||
|
|
||||||
|
attrPattern := regexp.MustCompile(`(\w+)\[(\w+)\](.+)?`)
|
||||||
|
attrValuePattern := regexp.MustCompile(`\[(\w+)\]`)
|
||||||
|
|
||||||
|
if len(query) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
fields := strings.Fields(query)
|
||||||
|
part := fields[0]
|
||||||
|
query = strings.Join(fields[1:], " ")
|
||||||
|
if part == ">" {
|
||||||
|
op = "subchild"
|
||||||
|
} else if attrPattern.MatchString(part) {
|
||||||
|
op = "element"
|
||||||
|
matches := attrPattern.FindStringSubmatch(part)
|
||||||
|
operand = matches[1]
|
||||||
|
var optional string
|
||||||
|
if len(matches) == 4 {
|
||||||
|
optional = matches[3]
|
||||||
|
}
|
||||||
|
query = strings.TrimSpace(fmt.Sprintf("[%s]%s %s", matches[2], optional, query))
|
||||||
|
} else if attrValuePattern.MatchString(part) {
|
||||||
|
op = "attr"
|
||||||
|
operand = attrValuePattern.FindStringSubmatch(part)[1]
|
||||||
|
} else if len(query) == 0 {
|
||||||
|
op = "final"
|
||||||
|
} else {
|
||||||
|
op = "element"
|
||||||
|
operand = part
|
||||||
|
}
|
||||||
|
|
||||||
|
switch op {
|
||||||
|
case "element": // e.g. [el]; bla > [el]; but also [el] > bla
|
||||||
|
current = current.Find(operand)
|
||||||
|
if strings.HasSuffix(query, "first-of-type") {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
fallthrough
|
||||||
|
case "subchild": // [preceding] > [future]
|
||||||
|
// recurse querySelector on all [preceding] element types
|
||||||
|
current.Each(func(j int, s *goquery.Selection) {
|
||||||
|
QuerySelector(query, s, results)
|
||||||
|
})
|
||||||
|
return
|
||||||
|
case "attr": // x[attr]
|
||||||
|
// extract the attribute
|
||||||
|
if str, exists := current.Attr(operand); exists {
|
||||||
|
*results = append(*results, str)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
case "final": // no more in query, and we did not end on an attr: get text
|
||||||
|
*results = append(*results, current.Text())
|
||||||
|
}
|
||||||
|
QuerySelector(query, current, results)
|
||||||
|
}
|
||||||
|
|
||||||
func DatabaseDoesNotExist(filepath string) {
|
func DatabaseDoesNotExist(filepath string) {
|
||||||
fmt.Printf("lieu: database %s does not exist\n", filepath)
|
fmt.Printf("lieu: database %s does not exist\n", filepath)
|
||||||
fmt.Println("lieu: try running `lieu ingest` if you have already crawled source data")
|
fmt.Println("lieu: try running `lieu ingest` if you have already crawled source data")
|
||||||
|
@ -104,6 +166,7 @@ func WriteMockConfig() {
|
||||||
name = "Sweet Webring"
|
name = "Sweet Webring"
|
||||||
# used by the precrawl command and linked to in /about route
|
# used by the precrawl command and linked to in /about route
|
||||||
url = "https://example.com/"
|
url = "https://example.com/"
|
||||||
|
webringSelector = "li > a"
|
||||||
port = 10001
|
port = 10001
|
||||||
|
|
||||||
[theme]
|
[theme]
|
||||||
|
|
Ładowanie…
Reference in New Issue