Compare commits

...

23 commits

Author SHA1 Message Date
Alexander Cobleigh f27c45d4be
Update README.md 2023-05-10 15:43:01 +02:00
Slatian 22d1802337 Documented the theming part a bit better 2022-12-06 12:14:52 +01:00
cblgh 9173912782 tweak language for new search docs 2022-12-06 12:11:32 +01:00
Slatian f41b7f87e7 Added some pretty liberal limits on query length to make it more difficult to cause a DOS condition.
(the go http package by default limits the header length to 1 Megabyte, which is great at preventing someone from causing trouble at the http layer, but doesn't work too well when there is a pretty expensive search going on in the background)
2022-12-06 12:02:14 +01:00
Slatian e21cc9a9d0 Documented how to place search queries 2022-12-06 12:02:14 +01:00
Slatian b2a9947fb9 Removed debugging outputs 2022-12-06 12:02:14 +01:00
Slatian b431a15441 Added experimental support for "-site:" and "lang:" queries 2022-12-06 12:02:14 +01:00
Slatian b4a2e5e269 Added a robots.txt file 2022-12-06 11:23:04 +01:00
Slatian d02edd35ca Optimized favicons a bit 2022-12-06 11:22:19 +01:00
cblgh 9377bd6fab go fmt 2022-11-22 14:08:59 +01:00
cblgh 9517f62de2 tweak wording and minor details relating to preview queries 2022-11-22 14:08:44 +01:00
Slatian 7c6a63ce2c Added a bit of documentation for new features 2022-11-22 13:52:32 +01:00
Slatian 212f5c5655 Added new configuration option to lieu.toml 2022-11-22 13:52:32 +01:00
Slatian 27e1b68b66 URIs are not case-insensitive by default, so we shouldn't assume that they are 2022-11-22 13:52:32 +01:00
Slatian e56f60ccb9 Added batching functionality because wordlists become pretty long when the scraper found lots of long paragraphs 2022-11-22 13:52:32 +01:00
Slatian ed5f5189b0 Added a little check for the response code to not index pages that return errors or finish with codes in the 100 range 2022-11-22 13:52:32 +01:00
Slatian 34d6df3830 Removed "note:" from heuristics as too many sites are affected negatively by this 2022-11-22 13:52:32 +01:00
Slatian c72ea4c6ca Improved heuristics for English language text to skip over most fluff paragraphs to get better samples of sites 2022-11-22 13:52:32 +01:00
Slatian 5fe32df938 Added some extra mechanisms that come in handy for getting more useful previews 2022-11-22 13:52:32 +01:00
Slatian cbaa6e06b1 Add a default configuration that mostly works … 2022-11-22 13:52:32 +01:00
Slatian 0a85f38b36 Made scraping the preview text configurable and improved the cleanup function a tiny bit. 2022-11-22 13:52:32 +01:00
cblgh b0ad7dce10 add custom webring selector for precrawl 2022-03-30 15:18:38 +02:00
idk 21ef8aac08
Allows the configuration of a proxy (#9)
* Add proxy support, capability to crawl using SOCKS proxies
2022-03-29 14:36:48 +02:00
22 changed files with 389 additions and 66 deletions
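Commit f41b7f87e7 above motivates the query-length limits that appear in the serve.go hunk further down (at most 100 query fields, raw queries shorter than 8192 bytes). A minimal sketch of that kind of guard, assuming those same limits:

```
package main

import (
	"fmt"
	"strings"
)

// queryWithinLimits rejects oversized queries up front, before any
// expensive search work runs; the http package's default 1 MB header
// cap alone does not protect a costly search backend.
func queryWithinLimits(query string) bool {
	fields := strings.Fields(query)
	return len(fields) > 0 && len(fields) <= 100 && len(query) < 8192
}

func main() {
	fmt.Println(queryWithinLimits("cat dog"))                 // true
	fmt.Println(queryWithinLimits(strings.Repeat("a", 9000))) // false: too long
}
```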

1
.gitignore vendored
View file

@@ -224,3 +224,4 @@ pip-log.txt
#Mr Developer
.mr.developer.cfg
lieu

View file

@@ -18,6 +18,12 @@ engine, a way for personal webrings to increase serendipitous connexions.
## Usage
### How to search
For the full search syntax (including how to use `site:` and `-site:`), see the [search syntax and API documentation](docs/querying.md). For more tips, read the [appendix](https://cblgh.org/lieu/).
### Getting Lieu running
```
$ lieu help
Lieu: neighbourhood search engine
@@ -70,10 +76,15 @@ The config file is written in [TOML](https://toml.io/en/).
name = "Merveilles Webring"
# used by the precrawl command and linked to in /about route
url = "https://webring.xxiivv.com"
# used by the precrawl command to populate the Crawler.Webring file;
# takes simple html selectors. might be a bit wonky :)
webringSelector = "li > a[href]:first-of-type"
port = 10001
[theme]
# colors specified in hex (or valid css names) which determine the theme of the lieu instance
# NOTE: If (and only if) all three values are set, lieu uses them to generate the file html/assets/theme.css at startup.
# You can also write directly to that file instead of adding this section to your configuration file
foreground = "#ffffff"
background = "#000000"
links = "#ffffff"
@@ -99,6 +110,8 @@ bannedSuffixes = "data/banned-suffixes.txt"
boringWords = "data/boring-words.txt"
# domains that won't be output as outgoing links
boringDomains = "data/boring-domains.txt"
# queries to search for finding preview text
previewQueryList = "data/preview-query-list.txt"
```
For your own use, the following config fields should be customized:
@@ -116,6 +129,7 @@ The following config-defined files can stay as-is unless you have specific requi
* `heuristics`
* `wordlist`
* `bannedSuffixes`
* `previewQueryList`
For a full rundown of the files and their various jobs, see the [files
description](docs/files.md).

View file

@@ -34,6 +34,19 @@ func getBoringDomains(path string) []string {
return util.ReadList(path, "\n")
}
func getAboutHeuristics(path string) []string {
return util.ReadList(path, "\n")
}
func getPreviewQueries(path string) []string {
previewQueries := util.ReadList(path, "\n")
if len(previewQueries) > 0 {
return previewQueries
} else {
return []string{"main p", "article p", "section p", "p"}
}
}
func find(list []string, query string) bool {
for _, item := range list {
if item == query {
@@ -103,27 +116,33 @@ func findSuffix(suffixes []string, query string) bool {
func cleanText(s string) string {
s = strings.TrimSpace(s)
s = strings.ReplaceAll(s, "\n", " ")
s = strings.ReplaceAll(s, "|", " ")
whitespace := regexp.MustCompile(`\p{Z}`)
whitespace := regexp.MustCompile(`\p{Z}+`)
s = whitespace.ReplaceAllString(s, " ")
return s
}
func handleIndexing(c *colly.Collector) {
func handleIndexing(c *colly.Collector, previewQueries []string, heuristics []string) {
c.OnHTML("meta[name=\"keywords\"]", func(e *colly.HTMLElement) {
fmt.Println("keywords", cleanText(e.Attr("content")), e.Request.URL)
})
c.OnHTML("meta[name=\"description\"]", func(e *colly.HTMLElement) {
desc := cleanText(e.Attr("content"))
if len(desc) > 0 {
if len(desc) > 0 && len(desc) < 1500 {
fmt.Println("desc", desc, e.Request.URL)
}
})
c.OnHTML("meta[property=\"og:description\"]", func(e *colly.HTMLElement) {
ogDesc := cleanText(e.Attr("content"))
if len(ogDesc) > 0 && len(ogDesc) < 1500 {
fmt.Println("og-desc", ogDesc, e.Request.URL)
}
})
c.OnHTML("html[lang]", func(e *colly.HTMLElement) {
lang := cleanText(e.Attr("lang"))
if len(lang) > 0 {
if len(lang) > 0 && len(lang) < 100 {
fmt.Println("lang", lang, e.Request.URL)
}
})
@@ -134,10 +153,26 @@ func handleIndexing(c *colly.Collector) {
})
c.OnHTML("body", func(e *colly.HTMLElement) {
QueryLoop:
for i := 0; i < len(previewQueries); i++ {
// After the fourth paragraph we're probably too far in to get something interesting for a preview
elements := e.DOM.Find(previewQueries[i])
for j := 0; j < 4 && j < elements.Length(); j++ {
element_text := elements.Slice(j, j+1).Text()
paragraph := cleanText(element_text)
if len(paragraph) < 1500 && len(paragraph) > 20 {
if !util.Contains(heuristics, strings.ToLower(paragraph)) {
fmt.Println("para", paragraph, e.Request.URL)
break QueryLoop
}
}
}
}
paragraph := cleanText(e.DOM.Find("p").First().Text())
if len(paragraph) < 1500 && len(paragraph) > 0 {
fmt.Println("para", paragraph, e.Request.URL)
fmt.Println("para-just-p", paragraph, e.Request.URL)
}
// get all relevant page headings
collectHeadingText("h1", e)
collectHeadingText("h2", e)
@@ -153,7 +188,33 @@ func collectHeadingText(heading string, e *colly.HTMLElement) {
}
}
func SetupDefaultProxy(config types.Config) error {
// no proxy configured, go back
if config.General.Proxy == "" {
return nil
}
proxyURL, err := url.Parse(config.General.Proxy)
if err != nil {
return err
}
httpClient := &http.Client{
Transport: &http.Transport{
Proxy: http.ProxyURL(proxyURL),
},
}
http.DefaultClient = httpClient
return nil
}
func Precrawl(config types.Config) {
// setup proxy
err := SetupDefaultProxy(config)
if err != nil {
log.Fatal(err)
}
res, err := http.Get(config.General.URL)
util.Check(err)
defer res.Body.Close()
@@ -166,11 +227,12 @@ func Precrawl(config types.Config) {
util.Check(err)
items := make([]string, 0)
doc.Find("li").Each(func(i int, s *goquery.Selection) {
if domain, exists := s.Find("a").Attr("href"); exists {
items = append(items, domain)
}
})
s := doc.Find("html")
query := config.General.WebringSelector
if query == "" {
query = "li > a[href]:first-of-type"
}
util.QuerySelector(query, s, &items)
BANNED := getBannedDomains(config.Crawler.BannedDomains)
for _, item := range items {
@@ -189,6 +251,11 @@
}
func Crawl(config types.Config) {
// setup proxy
err := SetupDefaultProxy(config)
if err != nil {
log.Fatal(err)
}
SUFFIXES := getBannedSuffixes(config.Crawler.BannedSuffixes)
links := getWebringLinks(config.Crawler.Webring)
domains, pathsites := getDomains(links)
@@ -199,6 +266,9 @@
c := colly.NewCollector(
colly.MaxDepth(3),
)
if config.General.Proxy != "" {
c.SetProxy(config.General.Proxy)
}
q, _ := queue.New(
5, /* threads */
@@ -219,9 +289,16 @@
boringDomains := getBoringDomains(config.Crawler.BoringDomains)
boringWords := getBoringWords(config.Crawler.BoringWords)
previewQueries := getPreviewQueries(config.Crawler.PreviewQueries)
heuristics := getAboutHeuristics(config.Data.Heuristics)
// on every a element which has an href attribute, call callback
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
if e.Response.StatusCode >= 400 || e.Response.StatusCode <= 100 {
return
}
link := getLink(e.Attr("href"))
if findSuffix(SUFFIXES, link) {
return
@@ -267,7 +344,7 @@
}
})
handleIndexing(c)
handleIndexing(c, previewQueries, heuristics)
// start scraping
q.Run(c)

View file

@@ -0,0 +1 @@

View file

@@ -8,3 +8,16 @@ last edit
(c)
all rights reserved
licensed under
subscribe
|
generated by
powered by
this post was
click here for
click here to
published on:
published:
posted:
share this article
estimated read time

View file

@@ -0,0 +1,9 @@
header p.p-summary
main p.p-summary
main p:not(.post-meta):not(.alternate):not(header p):not(footer p):not(nav p):not(aside p):not(.sidebar p)
article p.p-summary
article p:not(.post-meta):not(.alternate):not(header p):not(footer p):not(nav p):not(aside p):not(.sidebar p)
p:not(.post-meta):not(.alternate):not(header p):not(footer p):not(nav p):not(aside p):not(.sidebar p)
header ~ p:not(.post-meta):not(.alternate):not(footer p):not(aside p):not(.sidebar p)
h1 ~ p:not(.post-meta):not(.alternate)
p:not(.post-meta):not(.alternate):not(footer p):not(aside p):not(.sidebar p)

View file

@@ -0,0 +1 @@

View file

@@ -19,10 +19,13 @@ import (
"log"
"net/url"
"strings"
"regexp"
_ "github.com/mattn/go-sqlite3"
)
var languageCodeSanityRegex = regexp.MustCompile("^[a-zA-Z\\-0-9]+$")
func InitDB(filepath string) *sql.DB {
db, err := sql.Open("sqlite3", filepath)
if err != nil {
@@ -95,17 +98,19 @@ query params:
&order=score, &order=count
*/
var emptyStringArray = []string{}
func SearchWordsByScore(db *sql.DB, words []string) []types.PageData {
return searchWords(db, words, true)
return SearchWords(db, words, true, emptyStringArray, emptyStringArray, emptyStringArray)
}
func SearchWordsBySite(db *sql.DB, words []string, domain string) []types.PageData {
// search words by site is same as search words by score, but adds a domain condition
return searchWords(db, words, true, domain)
return SearchWords(db, words, true, []string{domain}, emptyStringArray, emptyStringArray)
}
func SearchWordsByCount(db *sql.DB, words []string) []types.PageData {
return searchWords(db, words, false)
return SearchWords(db, words, false, emptyStringArray, emptyStringArray, emptyStringArray)
}
func FulltextSearchWords(db *sql.DB, phrase string) []types.PageData {
@@ -222,12 +227,16 @@ func countQuery(db *sql.DB, table string) int {
return count
}
func searchWords(db *sql.DB, words []string, searchByScore bool, domain ...string) []types.PageData {
var wordlist []string
func SearchWords(db *sql.DB, words []string, searchByScore bool, domain []string, nodomain []string, language []string) []types.PageData {
var args []interface{}
for _, word := range words {
wordlist = append(wordlist, "word = ?")
args = append(args, strings.ToLower(word))
wordlist := []string{"1"}
if len(words) > 0 && words[0] != "" {
wordlist = make([]string, 0)
for _, word := range words {
wordlist = append(wordlist, "word = ?")
args = append(args, strings.ToLower(word))
}
}
// the domains conditional defaults to just 'true' i.e. no domain condition
@@ -240,6 +249,28 @@ func searchWords(db *sql.DB, words []string, searchByScore bool, domain ...strin
}
}
nodomains := []string{"1"}
if len(nodomain) > 0 && nodomain[0] != "" {
nodomains = make([]string, 0)
for _, d := range nodomain {
nodomains = append(nodomains, "domain != ?")
args = append(args, d)
}
}
//This needs some wildcard support …
languages := []string{"1"}
if len(language) > 0 && language[0] != "" {
languages = make([]string, 0)
for _, d := range language {
// Do a little check to avoid the database being DOSed
if languageCodeSanityRegex.MatchString(d) {
languages = append(languages, "lang LIKE ?")
args = append(args, d+"%")
}
}
}
orderType := "SUM(score)"
if !searchByScore {
orderType = "COUNT(*)"
@@ -250,11 +281,13 @@ func searchWords(db *sql.DB, words []string, searchByScore bool, domain ...strin
FROM inv_index inv INNER JOIN pages p ON inv.url = p.url
WHERE (%s)
AND (%s)
AND (%s)
AND (%s)
GROUP BY inv.url
ORDER BY %s
DESC
LIMIT 15
`, strings.Join(wordlist, " OR "), strings.Join(domains, " OR "), orderType)
`, strings.Join(wordlist, " OR "), strings.Join(domains, " OR "), strings.Join(nodomains, " AND "), strings.Join(languages, " OR "), orderType)
stmt, err := db.Prepare(query)
util.Check(err)

View file

@@ -37,6 +37,8 @@ bannedSuffixes = "data/banned-suffixes.txt"
boringWords = "data/boring-words.txt"
# domains that won't be output as outgoing links
boringDomains = "data/boring-domains.txt"
# queries to search for finding preview text
previewQueryList = "data/preview-query-list.txt"
```
## HTML
@@ -120,6 +122,21 @@ are stopped from entering the search index. The default wordlist consists of the
1000 or so most common English words, albeit curated slightly to still allow for
interesting concepts and verbs—such as `reading` and `books`, for example.
#### `previewQueryList`
A list of CSS selectors—one per line—used to fetch preview paragraphs. The first paragraph
found passing a check against the `heuristics` file makes it into the search index. For
each selector in `previewQueryList`, Lieu tries the first four paragraphs—as found by the
selector—before trying to find a new set of paragraphs using the file's next selector.
To get good results, one usually wants to tune this list to get the first "real" paragraph
after common page headers, or to find a summary paragraph. The default has been, at the time of
writing, tuned for use with the [Fediring](https://fediring.net).
Depending on the structure of the websites you are indexing, this will get you 70-90% of the
way in terms of accurate link descriptions. For the rest of the way, fine-tune `heuristics.txt`
and reach out to the creators of the websites you are indexing; they often appreciate the
feedback.
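As a rough sketch of the lookup order described above (modeled on the crawler loop added in this diff; the real implementation also checks each candidate against the `heuristics` file):

```
package main

import (
	"fmt"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

// firstPreview tries each selector in order; per selector it inspects
// at most the first four matches and returns the first paragraph of a
// plausible length (more than 20 and fewer than 1500 characters).
func firstPreview(doc *goquery.Document, selectors []string) (string, bool) {
	for _, sel := range selectors {
		elements := doc.Find(sel)
		for j := 0; j < 4 && j < elements.Length(); j++ {
			text := strings.Join(strings.Fields(elements.Slice(j, j+1).Text()), " ")
			if len(text) > 20 && len(text) < 1500 {
				return text, true
			}
		}
	}
	return "", false
}

func main() {
	html := `<html><body><p class="post-meta">3 min read</p><main><p>A real first paragraph, long enough to keep.</p></main></body></html>`
	doc, _ := goquery.NewDocumentFromReader(strings.NewReader(html))
	preview, ok := firstPreview(doc, []string{"main p", "article p", "p"})
	fmt.Println(ok, preview)
}
```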
#### OpenSearch metadata
If you are running your own instance of Lieu, you might want to look into changing the URL
defined in the file `opensearch.xml`, which specifies [OpenSearch

41
docs/querying.md 100644
View file

@@ -0,0 +1,41 @@
# Querying Lieu
## Search Syntax
* `cat dog` - search for pages about cats or dogs, most probably both
* `fox site:example.org` - search example.org (if indexed) for term "fox"
* `fox -site:example.org` - search all indexed sites except `example.org` for term "fox"
* `emoji lang:de` - search pages that claim to mainly contain German content for the term "emoji"
When searching, capitalisation and inflection do not matter, as search terms are:
* Converted to lowercase using the Go standard library
* Passed through [jinzhu's inflection library](https://github.com/jinzhu/inflection) for
converting to a possible singular form (intended to work with English nouns)
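A hedged sketch of that normalization, assuming the `Singular` helper from jinzhu's inflection package (the library Lieu's `util.Inflect` uses):

```
package main

import (
	"fmt"
	"strings"

	"github.com/jinzhu/inflection"
)

// normalize lowercases every term and reduces it to a singular form,
// so "Books" and "book" match the same entries in the index.
func normalize(terms []string) []string {
	out := make([]string, 0, len(terms))
	for _, t := range terms {
		out = append(out, inflection.Singular(strings.ToLower(t)))
	}
	return out
}

func main() {
	fmt.Println(normalize([]string{"Cats", "Dogs", "Reading"})) // [cat dog reading]
}
```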
## Search API
Lieu currently only renders its results to HTML. A query can be passed to the `/` endpoint using a `GET` request.
It supports two URL parameters:
* `q` - used for the search query
* `site` - accepts one domain name and will have the same effect as the `site:<domain>` syntax.
You can use this to make your webring's search engine double as a search box on your website.
### Examples
To search `example.org` for the term "ssh" using `https://search.webring.example`:
```
https://search.webring.example/?q=ssh&site=example.org
```
Adding a form element to the HTML at example.org, to use Lieu as its search engine:
```
<form method="GET" action="https://search.webring.example">
<label for="search">Search example.org</label>
<input type="search" minlength="1" required="" name="q" placeholder="Your search query here" id="search">
<input type="hidden" name="site" value="example.org"> <!-- This hidden field tells lieu to only search example.org -->
<button type="submit">Let's go!</button>
</form>
```

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.1 KiB

After

Width:  |  Height:  |  Size: 326 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 7.0 KiB

After

Width:  |  Height:  |  Size: 2.7 KiB

View file

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8"?>
<svg width="420" height="420" fill="none" version="1.1" xmlns="http://www.w3.org/2000/svg"><g stroke-linecap="round"><rect width="420" height="420" fill="#000" stroke-width="12.8"/></g><path d="m210 87c-53.5 0-104 27.1-149 71.9l-28.6 28.6 50.7 50.7 28.3-27.9 4.76 13.4-38.8 38.8 57.3 57.3 34.5-34.5v2.72 4.5 40.5h81v-40.5-4.5-2.72l34.5 34.5 57.3-57.3-38.8-38.8 4.76-13.4 28.3 27.9 50.7-50.7-28.6-28.6c-44.8-44.8-95.1-71.9-149-71.9zm0 81c11.2 0 19.5 8.25 19.5 19.5s-8.25 19.5-19.5 19.5-19.5-8.25-19.5-19.5 8.25-19.5 19.5-19.5z" color="#000000" fill="#fff" stroke-linecap="square" stroke-linejoin="round" style="-inkscape-stroke:none"/></svg>

After

Width:  |  Height:  |  Size: 680 B

View file

@@ -15,7 +15,7 @@
<link href="/assets/theme.css" rel="stylesheet">
<link rel="icon" href="/assets/favicon.ico">
<link rel="icon" href="/assets/logo.svg" type="image/svg+xml">
<link rel="icon" href="/assets/favicon.svg" type="image/svg+xml">
<link rel="shortcut icon" href="/assets/favicon.png">
<link rel="apple-touch-icon" href="/assets/favicon.png">
<meta name="theme-color" content="#000000">

View file

@@ -20,7 +20,7 @@
<form class="search">
<label class="visually-hidden" for="search">Search {{ .SiteName }}</label>
<span class="search__input">
<input type="search" required minlength="1" name="q" placeholder="{{ .Data.Placeholder }}" class="flex-grow" id="search">
<input type="search" required minlength="1" name="q" placeholder="{{ .Data.Placeholder }}" class="flex-grow" id="search" maxlength="6000" >
<button type="submit" class="search__button" aria-label="Search" title="Search">
<svg viewBox="0 0 420 300" xmlns="http://www.w3.org/2000/svg" baseProfile="full" style="background:var(--secondary)" width="42" height="30" fill="none"><path d="M90 135q60-60 120-60 0 0 0 0 60 0 120 60m-120 60a60 60 0 01-60-60 60 60 0 0160-60 60 60 0 0160 60 60 60 0 01-60 60m45-15h0l30 30m-75-15h0v45m-45-60h0l-30 30" stroke-width="81" stroke-linecap="square" stroke-linejoin="round" stroke="var(--primary)"/></svg>
</button>

2
html/robots.txt 100644
View file

@@ -0,0 +1,2 @@
User-agent: *
Disallow: /*?

View file

@@ -6,7 +6,7 @@
<form method="GET" class="search">
<label for="search">Search {{ .SiteName }} </label>
<span class="search__input">
<input type="search" minlength="1" required name="q" placeholder="Search" value="{{ .Data.Query }}" class="search-box" id="search">
<input type="search" minlength="1" required name="q" placeholder="Search" value="{{ .Data.Query }}" class="search-box" id="search" maxlength="6000">
{{ if ne .Data.Site "" }}
<input type="hidden" value="{{ .Data.Site }}" name="site">
{{ end }}

View file

@@ -95,7 +95,7 @@ func Ingest(config types.Config) {
continue
}
pageurl := strings.ToLower(strings.TrimSuffix(strings.TrimSpace(line[lastSpace:len(line)]), "/"))
pageurl := strings.TrimSuffix(strings.TrimSpace(line[lastSpace:len(line)]), "/")
if !strings.HasPrefix(pageurl, "http") {
continue
}
@@ -117,6 +117,7 @@ func Ingest(config types.Config) {
case "title":
if len(page.About) == 0 {
page.About = rawdata
page.AboutSource = token
}
score = 5
page.Title = rawdata
@@ -124,6 +125,7 @@
case "h1":
if len(page.About) == 0 {
page.About = rawdata
page.AboutSource = token
}
fallthrough
case "h2":
@@ -132,13 +134,21 @@
score = 15
processed = partitionSentence(payload)
case "desc":
if len(page.About) < 30 && len(rawdata) < 100 {
if len(page.About) < 30 && len(rawdata) < 100 && len(rawdata) > len(page.About) {
page.About = rawdata
page.AboutSource = token
}
processed = partitionSentence(payload)
case "og-desc":
page.About = rawdata
page.AboutSource = token
processed = partitionSentence(payload)
case "para":
if performAboutHeuristic(config.Data.Heuristics, payload) {
page.About = rawdata
if page.AboutSource != "og-desc" || len(rawdata)*10 > len(page.About)*7 {
if performAboutHeuristic(config.Data.Heuristics, payload) {
page.About = rawdata
page.AboutSource = token
}
}
processed = partitionSentence(payload)
case "lang":
@@ -146,7 +156,7 @@
case "keywords":
processed = strings.Split(strings.ReplaceAll(payload, ", ", ","), ",")
case "non-webring-link":
externalLinks = append(externalLinks, payload)
externalLinks = append(externalLinks, rawdata)
default:
continue
}
@@ -162,7 +172,7 @@
// only extract path segments once per url.
// we do it here because every page is virtually guaranteed to have a title attr &
// it only appears once
for _, word := range extractPathSegments(pageurl) {
for _, word := range extractPathSegments(strings.ToLower(pageurl)) {
batch = append(batch, types.SearchFragment{Word: word, URL: pageurl, Score: 2})
}
}
@@ -190,10 +200,16 @@ func ingestBatch(db *sql.DB, batch []types.SearchFragment, pageMap map[string]ty
i++
}
// TODO (2021-11-10): debug the "incomplete input" error / log, and find out where it is coming from
log.Println("starting to ingest batch")
log.Println("starting to ingest batch (Pages:", len(pages), "Words:", len(batch), "Links:", len(links), ")")
database.InsertManyDomains(db, pages)
database.InsertManyPages(db, pages)
database.InsertManyWords(db, batch)
for i := 0; i < len(batch); i += 3000 {
end_i := i + 3000
if end_i > len(batch) {
end_i = len(batch)
}
database.InsertManyWords(db, batch[i:end_i])
}
database.InsertManyExternalLinks(db, links)
log.Println("finished ingesting batch")
}

View file

@@ -27,3 +27,5 @@ bannedSuffixes = "data/banned-suffixes.txt"
boringWords = "data/boring-words.txt"
# domains that won't be output as outgoing links
boringDomains = "data/boring-domains.txt"
# queries to search for finding preview text
previewQueryList = "data/preview-query-list.txt"

View file

@@ -7,7 +7,6 @@ import (
"net/http"
"net/url"
"os"
"regexp"
"strings"
"syscall"
@@ -61,17 +60,21 @@ var templates = template.Must(template.ParseFiles(
const useURLTitles = true
var sitePattern = regexp.MustCompile(`site:\S+`)
func (h RequestHandler) searchRoute(res http.ResponseWriter, req *http.Request) {
var query string
var domain string
view := &TemplateView{}
var domain string
if req.Method == http.MethodGet {
var domains = []string{}
var nodomains = []string{}
var langs = []string{}
var queryFields = []string{}
if req.Method == http.MethodGet{
params := req.URL.Query()
if words, exists := params["q"]; exists && words[0] != "" {
query = words[0]
queryFields = strings.Fields(query)
}
// how to use: https://gist.github.com/cblgh/29991ba0a9e65cccbe14f4afd7c975f1
@@ -80,29 +83,36 @@ func (h RequestHandler) searchRoute(res http.ResponseWriter, req *http.Request)
domain = strings.TrimPrefix(parts[0], "https://")
domain = strings.TrimPrefix(domain, "http://")
domain = strings.TrimSuffix(domain, "/")
} else if sitePattern.MatchString(query) {
// if user searched with "site:<domain>" in text box, behave the same way as if a query param was used
domain = sitePattern.FindString(query)[5:]
domains = append(domains, domain)
}
// if clear button was used -> clear site param / search text
if parts, exists := params["clear"]; exists && parts[0] != "" {
domain = ""
query = sitePattern.ReplaceAllString(query, "")
// don't process if there are too many fields
if len(queryFields) <= 100 {
var newQueryFields []string;
for _, word := range queryFields {
// This could be more efficient by splitting arrays, but I'm going with the more readable version for now
if strings.HasPrefix(word, "site:") {
domains = append(domains, strings.TrimPrefix(word, "site:"))
} else if strings.HasPrefix(word, "-site:") {
nodomains = append(nodomains, strings.TrimPrefix(word, "-site:"))
} else if strings.HasPrefix(word, "lang:") {
langs = append(langs, strings.TrimPrefix(word, "lang:"))
} else {
newQueryFields = append(newQueryFields, word)
}
}
queryFields = newQueryFields;
}
}
if len(query) == 0 {
if len(queryFields) == 0 || len(queryFields) > 100 || len(query) >= 8192 {
view.Data = IndexData{Tagline: h.config.General.Tagline, Placeholder: h.config.General.Placeholder}
h.renderView(res, "index", view)
return
}
var pages []types.PageData
if domain != "" {
pages = database.SearchWordsBySite(h.db, util.Inflect(strings.Fields(query)), domain)
} else {
pages = database.SearchWordsByScore(h.db, util.Inflect(strings.Fields(query)))
}
var pages = database.SearchWords(h.db, util.Inflect(queryFields), true, domains, nodomains, langs)
if useURLTitles {
for i, pageData := range pages {
@@ -230,14 +240,15 @@ func (h RequestHandler) renderView(res http.ResponseWriter, tmpl string, view *T
func WriteTheme(config types.Config) {
theme := config.Theme
// no theme is set, use the default
if theme.Foreground == "" {
if theme.Foreground == "" || theme.Background == "" || theme.Links =="" {
return
}
colors := fmt.Sprintf(`:root {
colors := fmt.Sprintf(`/*This file will be automatically regenerated by lieu on startup if the theme colors are set in the configuration file*/
:root {
--primary: %s;
--secondary: %s;
--link: %s;
}\n`, theme.Foreground, theme.Background, theme.Links)
}`, theme.Foreground, theme.Background, theme.Links)
err := os.WriteFile("html/assets/theme.css", []byte(colors), 0644)
util.Check(err)
}
@@ -255,8 +266,9 @@ func Serve(config types.Config) {
http.HandleFunc("/webring", handler.webringRoute)
http.HandleFunc("/filtered", handler.filteredRoute)
fileserver := http.FileServer(http.Dir("html/assets/"))
http.Handle("/assets/", http.StripPrefix("/assets/", fileserver))
fileserver := http.FileServer(http.Dir("html/"))
http.Handle("/assets/", fileserver)
http.Handle("/robots.txt", fileserver)
portstr := fmt.Sprintf(":%d", config.General.Port)
fmt.Println("Listening on port: ", portstr)

View file

@@ -7,19 +7,22 @@ type SearchFragment struct {
}
type PageData struct {
URL string
Title string
About string
Lang string
URL string
Title string
About string
Lang string
AboutSource string
}
type Config struct {
General struct {
Name string `json:name`
Tagline string `json:tagline`
Placeholder string `json:placeholder`
URL string `json:url`
Port int `json:port`
Name string `json:name`
Tagline string `json:tagline`
Placeholder string `json:placeholder`
URL string `json:url`
WebringSelector string `json:"webringSelector"`
Port int `json:port`
Proxy string `json:proxy`
} `json:general`
Theme struct {
Foreground string `json:"foreground"`
@@ -38,5 +41,6 @@ type Config struct {
BannedSuffixes string `json:bannedSuffixes`
BoringWords string `json:boringWords`
BoringDomains string `json:boringDomains`
PreviewQueries string `json:"previewQueryList"`
} `json:crawler`
}

View file

@@ -4,15 +4,18 @@ import (
"bytes"
"encoding/json"
"fmt"
"github.com/PuerkitoBio/goquery"
"io/ioutil"
"log"
"net"
"os"
"regexp"
"strings"
"lieu/types"
"github.com/jinzhu/inflection"
"github.com/komkom/toml"
"lieu/types"
)
func Inflect(words []string) []string {
@@ -29,6 +32,66 @@ func Check(err error) {
}
}
// document.querySelectorAll-type functionality. limited functionality as of now (no classes or id support atm, i think!!)
func QuerySelector(query string, current *goquery.Selection, results *[]string) {
var op, operand string
attrPattern := regexp.MustCompile(`(\w+)\[(\w+)\](.+)?`)
attrValuePattern := regexp.MustCompile(`\[(\w+)\]`)
if len(query) == 0 {
return
}
fields := strings.Fields(query)
part := fields[0]
query = strings.Join(fields[1:], " ")
if part == ">" {
op = "subchild"
} else if attrPattern.MatchString(part) {
op = "element"
matches := attrPattern.FindStringSubmatch(part)
operand = matches[1]
var optional string
if len(matches) == 4 {
optional = matches[3]
}
query = strings.TrimSpace(fmt.Sprintf("[%s]%s %s", matches[2], optional, query))
} else if attrValuePattern.MatchString(part) {
op = "attr"
operand = attrValuePattern.FindStringSubmatch(part)[1]
} else if len(query) == 0 {
op = "final"
} else {
op = "element"
operand = part
}
switch op {
case "element": // e.g. [el]; bla > [el]; but also [el] > bla
current = current.Find(operand)
if strings.HasSuffix(query, "first-of-type") {
break
}
fallthrough
case "subchild": // [preceding] > [future]
// recurse querySelector on all [preceding] element types
current.Each(func(j int, s *goquery.Selection) {
QuerySelector(query, s, results)
})
return
case "attr": // x[attr]
// extract the attribute
if str, exists := current.Attr(operand); exists {
*results = append(*results, str)
}
return
case "final": // no more in query, and we did not end on an attr: get text
*results = append(*results, current.Text())
}
QuerySelector(query, current, results)
}
func DatabaseDoesNotExist(filepath string) {
fmt.Printf("lieu: database %s does not exist\n", filepath)
fmt.Println("lieu: try running `lieu ingest` if you have already crawled source data")
@@ -103,6 +166,7 @@ func WriteMockConfig() {
name = "Sweet Webring"
# used by the precrawl command and linked to in /about route
url = "https://example.com/"
webringSelector = "li > a"
port = 10001
[theme]
@@ -132,6 +196,8 @@ bannedSuffixes = "data/banned-suffixes.txt"
boringWords = "data/boring-words.txt"
# domains that won't be output as outgoing links
boringDomains = "data/boring-domains.txt"
# queries to search for finding preview text
previewQueryList = "data/preview-query-list.txt"
`)
err := ioutil.WriteFile("lieu.toml", conf, 0644)
Check(err)
@@ -140,3 +206,15 @@ boringDomains = "data/boring-domains.txt"
func Exit() {
os.Exit(0)
}
func DeduplicateSlice(intSlice []string) []string {
keys := make(map[string]bool)
list := []string{}
for _, entry := range intSlice {
if _, value := keys[entry]; !value {
keys[entry] = true
list = append(list, entry)
}
}
return list
}