Compare commits

...

23 commits

Author SHA1 Message Date
Alexander Cobleigh f27c45d4be
Update README.md 2023-05-10 15:43:01 +02:00
Slatian 22d1802337 Documented the theming part a bit better 2022-12-06 12:14:52 +01:00
cblgh 9173912782 tweak language for new search docs 2022-12-06 12:11:32 +01:00
Slatian f41b7f87e7 Added some pretty liberal limits on query length to make it more difficult to cause a DOS condition.
(the go http package by default limits the header length to 1 Megabyte, which is great at preventing someone from causing trouble at the http layer, but doesn't work too well when there is a pretty expensive search going on in the background)
2022-12-06 12:02:14 +01:00
Slatian e21cc9a9d0 Documented how to place search queries 2022-12-06 12:02:14 +01:00
Slatian b2a9947fb9 Removed debugging outputs 2022-12-06 12:02:14 +01:00
Slatian b431a15441 Added experimental support for "-site:" and "lang:" queries 2022-12-06 12:02:14 +01:00
Slatian b4a2e5e269 Added a robots.txt file 2022-12-06 11:23:04 +01:00
Slatian d02edd35ca Optimized favicons a bit 2022-12-06 11:22:19 +01:00
cblgh 9377bd6fab go fmt 2022-11-22 14:08:59 +01:00
cblgh 9517f62de2 tweak wording and minor details relating to preview queries 2022-11-22 14:08:44 +01:00
Slatian 7c6a63ce2c Added a bit of documentation for new features 2022-11-22 13:52:32 +01:00
Slatian 212f5c5655 Added new configuration option to lieu.toml 2022-11-22 13:52:32 +01:00
Slatian 27e1b68b66 URIs are not case-insensitive by default, so we shouldn't assume that they are 2022-11-22 13:52:32 +01:00
Slatian e56f60ccb9 Added batching functionality because wordlists become pretty long when the scraper found lots of long paragraphs 2022-11-22 13:52:32 +01:00
Slatian ed5f5189b0 Added a little check for the response code to not index pages that return errors or finish with codes in the 100 range 2022-11-22 13:52:32 +01:00
Slatian 34d6df3830 Removed "note:" from heuristics as too many sites are affected negatively by this 2022-11-22 13:52:32 +01:00
Slatian c72ea4c6ca Improved heuristics for English language text to skip over most fluff paragraphs to get better samples of sites 2022-11-22 13:52:32 +01:00
Slatian 5fe32df938 Added some extra mechanisms that come in handy for getting more useful previews 2022-11-22 13:52:32 +01:00
Slatian cbaa6e06b1 Add a default configuration that mostly works … 2022-11-22 13:52:32 +01:00
Slatian 0a85f38b36 Made scraping the preview text configurable and improved the cleanup function a tiny bit. 2022-11-22 13:52:32 +01:00
cblgh b0ad7dce10 add custom webring selector for precrawl 2022-03-30 15:18:38 +02:00
idk 21ef8aac08
Allows the configuration of a proxy (#9)
* Add proxy support, capability to crawl using SOCKS proxies
2022-03-29 14:36:48 +02:00
22 changed files with 389 additions and 66 deletions
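Commit f41b7f87e7 above motivates the query-length limits that appear in the serve.go hunk further down (at most 100 query fields, raw queries shorter than 8192 bytes). A minimal sketch of that kind of guard, assuming those same limits:

```
package main

import (
	"fmt"
	"strings"
)

// queryWithinLimits rejects oversized queries up front, before any
// expensive search work runs; the http package's default 1 MB header
// cap alone does not protect a costly search backend.
func queryWithinLimits(query string) bool {
	fields := strings.Fields(query)
	return len(fields) > 0 && len(fields) <= 100 && len(query) < 8192
}

func main() {
	fmt.Println(queryWithinLimits("cat dog"))                 // true
	fmt.Println(queryWithinLimits(strings.Repeat("a", 9000))) // false: too long
}
```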

1
.gitignore vendored
View file

@@ -224,3 +224,4 @@ pip-log.txt
#Mr Developer
.mr.developer.cfg
lieu

View file

@@ -18,6 +18,12 @@ engine, a way for personal webrings to increase serendipitous connexions.
## Usage
### How to search
For the full search syntax (including how to use `site:` and `-site:`), see the [search syntax and API documentation](docs/querying.md). For more tips, read the [appendix](https://cblgh.org/lieu/).
### Getting Lieu running
```
$ lieu help
Lieu: neighbourhood search engine
@@ -70,10 +76,15 @@ The config file is written in [TOML](https://toml.io/en/).
name = "Merveilles Webring"
# used by the precrawl command and linked to in /about route
url = "https://webring.xxiivv.com"
# used by the precrawl command to populate the Crawler.Webring file;
# takes simple html selectors. might be a bit wonky :)
webringSelector = "li > a[href]:first-of-type"
port = 10001
[theme]
# colors specified in hex (or valid css names) which determine the theme of the lieu instance
# NOTE: If (and only if) all three values are set, lieu uses them to generate the file html/assets/theme.css at startup.
# You can also write directly to that file instead of adding this section to your configuration file
foreground = "#ffffff"
background = "#000000"
links = "#ffffff"
@@ -99,6 +110,8 @@ bannedSuffixes = "data/banned-suffixes.txt"
boringWords = "data/boring-words.txt"
# domains that won't be output as outgoing links
boringDomains = "data/boring-domains.txt"
# queries to search for finding preview text
previewQueryList = "data/preview-query-list.txt"
```
For your own use, the following config fields should be customized:
@@ -116,6 +129,7 @@ The following config-defined files can stay as-is unless you have specific requi
* `heuristics`
* `wordlist`
* `bannedSuffixes`
* `previewQueryList`
For a full rundown of the files and their various jobs, see the [files
description](docs/files.md).

View file

@@ -34,6 +34,19 @@ func getBoringDomains(path string) []string {
return util.ReadList(path, "\n")
}
func getAboutHeuristics(path string) []string {
return util.ReadList(path, "\n")
}
func getPreviewQueries(path string) []string {
previewQueries := util.ReadList(path, "\n")
if len(previewQueries) > 0 {
return previewQueries
} else {
return []string{"main p", "article p", "section p", "p"}
}
}
func find(list []string, query string) bool {
for _, item := range list {
if item == query {
@@ -103,27 +116,33 @@ func findSuffix(suffixes []string, query string) bool {
func cleanText(s string) string {
s = strings.TrimSpace(s)
s = strings.ReplaceAll(s, "\n", " ")
s = strings.ReplaceAll(s, "|", " ")
whitespace := regexp.MustCompile(`\p{Z}`)
whitespace := regexp.MustCompile(`\p{Z}+`)
s = whitespace.ReplaceAllString(s, " ")
return s
}
func handleIndexing(c *colly.Collector) {
func handleIndexing(c *colly.Collector, previewQueries []string, heuristics []string) {
c.OnHTML("meta[name=\"keywords\"]", func(e *colly.HTMLElement) {
fmt.Println("keywords", cleanText(e.Attr("content")), e.Request.URL)
})
c.OnHTML("meta[name=\"description\"]", func(e *colly.HTMLElement) {
desc := cleanText(e.Attr("content"))
if len(desc) > 0 {
if len(desc) > 0 && len(desc) < 1500 {
fmt.Println("desc", desc, e.Request.URL)
}
})
c.OnHTML("meta[property=\"og:description\"]", func(e *colly.HTMLElement) {
ogDesc := cleanText(e.Attr("content"))
if len(ogDesc) > 0 && len(ogDesc) < 1500 {
fmt.Println("og-desc", ogDesc, e.Request.URL)
}
})
c.OnHTML("html[lang]", func(e *colly.HTMLElement) {
lang := cleanText(e.Attr("lang"))
if len(lang) > 0 {
if len(lang) > 0 && len(lang) < 100 {
fmt.Println("lang", lang, e.Request.URL)
}
})
@@ -134,10 +153,26 @@ func handleIndexing(c *colly.Collector) {
})
c.OnHTML("body", func(e *colly.HTMLElement) {
QueryLoop:
for i := 0; i < len(previewQueries); i++ {
// After the fourth paragraph we're probably too far in to get something interesting for a preview
elements := e.DOM.Find(previewQueries[i])
for j := 0; j < 4 && j < elements.Length(); j++ {
element_text := elements.Slice(j, j+1).Text()
paragraph := cleanText(element_text)
if len(paragraph) < 1500 && len(paragraph) > 20 {
if !util.Contains(heuristics, strings.ToLower(paragraph)) {
fmt.Println("para", paragraph, e.Request.URL)
break QueryLoop
}
}
}
}
paragraph := cleanText(e.DOM.Find("p").First().Text())
if len(paragraph) < 1500 && len(paragraph) > 0 {
fmt.Println("para", paragraph, e.Request.URL)
fmt.Println("para-just-p", paragraph, e.Request.URL)
}
// get all relevant page headings
collectHeadingText("h1", e)
collectHeadingText("h2", e)
@@ -153,7 +188,33 @@ func collectHeadingText(heading string, e *colly.HTMLElement) {
}
}
func SetupDefaultProxy(config types.Config) error {
// no proxy configured, go back
if config.General.Proxy == "" {
return nil
}
proxyURL, err := url.Parse(config.General.Proxy)
if err != nil {
return err
}
httpClient := &http.Client{
Transport: &http.Transport{
Proxy: http.ProxyURL(proxyURL),
},
}
http.DefaultClient = httpClient
return nil
}
func Precrawl(config types.Config) {
// setup proxy
err := SetupDefaultProxy(config)
if err != nil {
log.Fatal(err)
}
res, err := http.Get(config.General.URL)
util.Check(err)
defer res.Body.Close()
@@ -166,11 +227,12 @@ func Precrawl(config types.Config) {
util.Check(err)
items := make([]string, 0)
doc.Find("li").Each(func(i int, s *goquery.Selection) {
if domain, exists := s.Find("a").Attr("href"); exists {
items = append(items, domain)
}
})
s := doc.Find("html")
query := config.General.WebringSelector
if query == "" {
query = "li > a[href]:first-of-type"
}
util.QuerySelector(query, s, &items)
BANNED := getBannedDomains(config.Crawler.BannedDomains)
for _, item := range items {
@@ -189,6 +251,11 @@
}
func Crawl(config types.Config) {
// setup proxy
err := SetupDefaultProxy(config)
if err != nil {
log.Fatal(err)
}
SUFFIXES := getBannedSuffixes(config.Crawler.BannedSuffixes)
links := getWebringLinks(config.Crawler.Webring)
domains, pathsites := getDomains(links)
@@ -199,6 +266,9 @@
c := colly.NewCollector(
colly.MaxDepth(3),
)
if config.General.Proxy != "" {
c.SetProxy(config.General.Proxy)
}
q, _ := queue.New(
5, /* threads */
@@ -219,9 +289,16 @@
boringDomains := getBoringDomains(config.Crawler.BoringDomains)
boringWords := getBoringWords(config.Crawler.BoringWords)
previewQueries := getPreviewQueries(config.Crawler.PreviewQueries)
heuristics := getAboutHeuristics(config.Data.Heuristics)
// on every a element which has an href attribute, call callback
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
if e.Response.StatusCode >= 400 || e.Response.StatusCode <= 100 {
return
}
link := getLink(e.Attr("href"))
if findSuffix(SUFFIXES, link) {
return
@@ -267,7 +344,7 @@
}
})
handleIndexing(c)
handleIndexing(c, previewQueries, heuristics)
// start scraping
q.Run(c)

View file

@@ -0,0 +1 @@

View file

@@ -8,3 +8,16 @@ last edit
(c)
all rights reserved
licensed under
subscribe
|
generated by
powered by
this post was
click here for
click here to
published on:
published:
posted:
share this article
estimated read time

View file

@@ -0,0 +1,9 @@
header p.p-summary
main p.p-summary
main p:not(.post-meta):not(.alternate):not(header p):not(footer p):not(nav p):not(aside p):not(.sidebar p)
article p.p-summary
article p:not(.post-meta):not(.alternate):not(header p):not(footer p):not(nav p):not(aside p):not(.sidebar p)
p:not(.post-meta):not(.alternate):not(header p):not(footer p):not(nav p):not(aside p):not(.sidebar p)
header ~ p:not(.post-meta):not(.alternate):not(footer p):not(aside p):not(.sidebar p)
h1 ~ p:not(.post-meta):not(.alternate)
p:not(.post-meta):not(.alternate):not(footer p):not(aside p):not(.sidebar p)

View file

@@ -0,0 +1 @@

View file

@@ -19,10 +19,13 @@ import (
"log"
"net/url"
"strings"
"regexp"
_ "github.com/mattn/go-sqlite3"
)
var languageCodeSanityRegex = regexp.MustCompile("^[a-zA-Z\\-0-9]+$")
func InitDB(filepath string) *sql.DB {
db, err := sql.Open("sqlite3", filepath)
if err != nil {
@@ -95,17 +98,19 @@ query params:
&order=score, &order=count
*/
var emptyStringArray = []string{}
func SearchWordsByScore(db *sql.DB, words []string) []types.PageData {
return searchWords(db, words, true)
return SearchWords(db, words, true, emptyStringArray, emptyStringArray, emptyStringArray)
}
func SearchWordsBySite(db *sql.DB, words []string, domain string) []types.PageData {
// search words by site is same as search words by score, but adds a domain condition
return searchWords(db, words, true, domain)
return SearchWords(db, words, true, []string{domain}, emptyStringArray, emptyStringArray)
}
func SearchWordsByCount(db *sql.DB, words []string) []types.PageData {
return searchWords(db, words, false)
return SearchWords(db, words, false, emptyStringArray, emptyStringArray, emptyStringArray)
}
func FulltextSearchWords(db *sql.DB, phrase string) []types.PageData {
@@ -222,12 +227,16 @@ func countQuery(db *sql.DB, table string) int {
return count
}
func searchWords(db *sql.DB, words []string, searchByScore bool, domain ...string) []types.PageData {
var wordlist []string
func SearchWords(db *sql.DB, words []string, searchByScore bool, domain []string, nodomain []string, language []string) []types.PageData {
var args []interface{}
for _, word := range words {
wordlist = append(wordlist, "word = ?")
args = append(args, strings.ToLower(word))
wordlist := []string{"1"}
if len(words) > 0 && words[0] != "" {
wordlist = make([]string, 0)
for _, word := range words {
wordlist = append(wordlist, "word = ?")
args = append(args, strings.ToLower(word))
}
}
// the domains conditional defaults to just 'true' i.e. no domain condition
@@ -240,6 +249,28 @@ func searchWords(db *sql.DB, words []string, searchByScore bool, domain ...strin
}
}
nodomains := []string{"1"}
if len(nodomain) > 0 && nodomain[0] != "" {
nodomains = make([]string, 0)
for _, d := range nodomain {
nodomains = append(nodomains, "domain != ?")
args = append(args, d)
}
}
//This needs some wildcard support …
languages := []string{"1"}
if len(language) > 0 && language[0] != "" {
languages = make([]string, 0)
for _, d := range language {
// Do a little check to avoid the database being DOSed
if languageCodeSanityRegex.MatchString(d) {
languages = append(languages, "lang LIKE ?")
args = append(args, d+"%")
}
}
}
orderType := "SUM(score)"
if !searchByScore {
orderType = "COUNT(*)"
@@ -250,11 +281,13 @@ func searchWords(db *sql.DB, words []string, searchByScore bool, domain ...strin
FROM inv_index inv INNER JOIN pages p ON inv.url = p.url
WHERE (%s)
AND (%s)
AND (%s)
AND (%s)
GROUP BY inv.url
ORDER BY %s
DESC
LIMIT 15
`, strings.Join(wordlist, " OR "), strings.Join(domains, " OR "), orderType)
`, strings.Join(wordlist, " OR "), strings.Join(domains, " OR "), strings.Join(nodomains, " AND "), strings.Join(languages, " OR "), orderType)
stmt, err := db.Prepare(query)
util.Check(err)

View file

@@ -37,6 +37,8 @@ bannedSuffixes = "data/banned-suffixes.txt"
boringWords = "data/boring-words.txt"
# domains that won't be output as outgoing links
boringDomains = "data/boring-domains.txt"
# queries to search for finding preview text
previewQueryList = "data/preview-query-list.txt"
```
## HTML
@@ -120,6 +122,21 @@ are stopped from entering the search index. The default wordlist consists of the
1000 or so most common English words, albeit curated slightly to still allow for
interesting concepts and verbs—such as `reading` and `books`, for example.
#### `previewQueryList`
A list of CSS selectors—one per line—used to fetch preview paragraphs. The first paragraph
found passing a check against the `heuristics` file makes it into the search index. For
each selector in `previewQueryList`, Lieu tries the first four paragraphs—as found by the
selector—before trying to find a new set of paragraphs using the file's next selector.
To get good results, one usually wants to tune this list to get the first "real" paragraph
after common page headers, or to find a summary paragraph. The default has been, at the time of
writing, tuned for use with the [Fediring](https://fediring.net).
Depending on the structure of the websites you are indexing, this will get you 70-90% of the
way in terms of accurate link descriptions. For the rest of the way, fine-tune `heuristics.txt`
and reach out to the creators of the websites you are indexing; they often appreciate the
feedback.
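As a rough sketch of the lookup order described above (modeled on the crawler loop added in this diff; the real implementation also checks each candidate against the `heuristics` file):

```
package main

import (
	"fmt"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

// firstPreview tries each selector in order; per selector it inspects
// at most the first four matches and returns the first paragraph of a
// plausible length (more than 20 and fewer than 1500 characters).
func firstPreview(doc *goquery.Document, selectors []string) (string, bool) {
	for _, sel := range selectors {
		elements := doc.Find(sel)
		for j := 0; j < 4 && j < elements.Length(); j++ {
			text := strings.Join(strings.Fields(elements.Slice(j, j+1).Text()), " ")
			if len(text) > 20 && len(text) < 1500 {
				return text, true
			}
		}
	}
	return "", false
}

func main() {
	html := `<html><body><p class="post-meta">3 min read</p><main><p>A real first paragraph, long enough to keep.</p></main></body></html>`
	doc, _ := goquery.NewDocumentFromReader(strings.NewReader(html))
	preview, ok := firstPreview(doc, []string{"main p", "article p", "p"})
	fmt.Println(ok, preview)
}
```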
#### OpenSearch metadata
If you are running your own instance of Lieu, you might want to look into changing the URL
defined in the file `opensearch.xml`, which specifies [OpenSearch

41
docs/querying.md 100644
View file

@@ -0,0 +1,41 @@
# Querying Lieu
## Search Syntax
* `cat dog` - search for pages about cats or dogs, most probably both
* `fox site:example.org` - search example.org (if indexed) for term "fox"
* `fox -site:example.org` - search all indexed sites except `example.org` for term "fox"
* `emoji lang:de` - search pages that claim to mainly contain German content for the term "emoji"
When searching, capitalisation and inflection do not matter, as search terms are:
* Converted to lowercase using the Go standard library
* Passed through [jinzhu's inflection library](https://github.com/jinzhu/inflection) for
converting to a possible singular form (intended to work with English nouns)
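A hedged sketch of that normalization, assuming the `Singular` helper from jinzhu's inflection package (the library Lieu's `util.Inflect` uses):

```
package main

import (
	"fmt"
	"strings"

	"github.com/jinzhu/inflection"
)

// normalize lowercases every term and reduces it to a singular form,
// so "Books" and "book" match the same entries in the index.
func normalize(terms []string) []string {
	out := make([]string, 0, len(terms))
	for _, t := range terms {
		out = append(out, inflection.Singular(strings.ToLower(t)))
	}
	return out
}

func main() {
	fmt.Println(normalize([]string{"Cats", "Dogs", "Reading"})) // [cat dog reading]
}
```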
## Search API
Lieu currently only renders its results to HTML. A query can be passed to the `/` endpoint using a `GET` request.
It supports two URL parameters:
* `q` - used for the search query
* `site` - accepts one domain name and will have the same effect as the `site:<domain>` syntax.
You can use this to make your webring's search engine double as a search box on your website.
### Examples
To search `example.org` for the term "ssh" using `https://search.webring.example`:
```
https://search.webring.example/?q=ssh&site=example.org
```
Adding a form element to the HTML at example.org, to use Lieu as its search engine:
```
<form method="GET" action="https://search.webring.example">
<label for="search">Search example.org</label>
<input type="search" minlength="1" required="" name="q" placeholder="Your search query here" id="search">
<input type="hidden" name="site" value="example.org"> <!-- This hidden field tells lieu to only search example.org -->
<button type="submit">Let's go!</button>
</form>
```

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.1 KiB

After

Width:  |  Height:  |  Size: 326 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 7.0 KiB

After

Width:  |  Height:  |  Size: 2.7 KiB

View file

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8"?>
<svg width="420" height="420" fill="none" version="1.1" xmlns="http://www.w3.org/2000/svg"><g stroke-linecap="round"><rect width="420" height="420" fill="#000" stroke-width="12.8"/></g><path d="m210 87c-53.5 0-104 27.1-149 71.9l-28.6 28.6 50.7 50.7 28.3-27.9 4.76 13.4-38.8 38.8 57.3 57.3 34.5-34.5v2.72 4.5 40.5h81v-40.5-4.5-2.72l34.5 34.5 57.3-57.3-38.8-38.8 4.76-13.4 28.3 27.9 50.7-50.7-28.6-28.6c-44.8-44.8-95.1-71.9-149-71.9zm0 81c11.2 0 19.5 8.25 19.5 19.5s-8.25 19.5-19.5 19.5-19.5-8.25-19.5-19.5 8.25-19.5 19.5-19.5z" color="#000000" fill="#fff" stroke-linecap="square" stroke-linejoin="round" style="-inkscape-stroke:none"/></svg>

After

Width:  |  Height:  |  Size: 680 B

View file

@@ -15,7 +15,7 @@
<link href="/assets/theme.css" rel="stylesheet">
<link rel="icon" href="/assets/favicon.ico">
<link rel="icon" href="/assets/logo.svg" type="image/svg+xml">
<link rel="icon" href="/assets/favicon.svg" type="image/svg+xml">
<link rel="shortcut icon" href="/assets/favicon.png">
<link rel="apple-touch-icon" href="/assets/favicon.png">
<meta name="theme-color" content="#000000">

View file

@@ -20,7 +20,7 @@
<form class="search">
<label class="visually-hidden" for="search">Search {{ .SiteName }}</label>
<span class="search__input">
<input type="search" required minlength="1" name="q" placeholder="{{ .Data.Placeholder }}" class="flex-grow" id="search">
<input type="search" required minlength="1" name="q" placeholder="{{ .Data.Placeholder }}" class="flex-grow" id="search" maxlength="6000" >
<button type="submit" class="search__button" aria-label="Search" title="Search">
<svg viewBox="0 0 420 300" xmlns="http://www.w3.org/2000/svg" baseProfile="full" style="background:var(--secondary)" width="42" height="30" fill="none"><path d="M90 135q60-60 120-60 0 0 0 0 60 0 120 60m-120 60a60 60 0 01-60-60 60 60 0 0160-60 60 60 0 0160 60 60 60 0 01-60 60m45-15h0l30 30m-75-15h0v45m-45-60h0l-30 30" stroke-width="81" stroke-linecap="square" stroke-linejoin="round" stroke="var(--primary)"/></svg>
</button>

2
html/robots.txt 100644
View file

@@ -0,0 +1,2 @@
User-agent: *
Disallow: /*?

View file

@@ -6,7 +6,7 @@
<form method="GET" class="search">
<label for="search">Search {{ .SiteName }} </label>
<span class="search__input">
<input type="search" minlength="1" required name="q" placeholder="Search" value="{{ .Data.Query }}" class="search-box" id="search">
<input type="search" minlength="1" required name="q" placeholder="Search" value="{{ .Data.Query }}" class="search-box" id="search" maxlength="6000">
{{ if ne .Data.Site "" }}
<input type="hidden" value="{{ .Data.Site }}" name="site">
{{ end }}

View file

@@ -95,7 +95,7 @@ func Ingest(config types.Config) {
continue
}
pageurl := strings.ToLower(strings.TrimSuffix(strings.TrimSpace(line[lastSpace:len(line)]), "/"))
pageurl := strings.TrimSuffix(strings.TrimSpace(line[lastSpace:len(line)]), "/")
if !strings.HasPrefix(pageurl, "http") {
continue
}
@@ -117,6 +117,7 @@ func Ingest(config types.Config) {
case "title":
if len(page.About) == 0 {
page.About = rawdata
page.AboutSource = token
}
score = 5
page.Title = rawdata
@@ -124,6 +125,7 @@
case "h1":
if len(page.About) == 0 {
page.About = rawdata
page.AboutSource = token
}
fallthrough
case "h2":
@@ -132,13 +134,21 @@
score = 15
processed = partitionSentence(payload)
case "desc":
if len(page.About) < 30 && len(rawdata) < 100 {
if len(page.About) < 30 && len(rawdata) < 100 && len(rawdata) > len(page.About) {
page.About = rawdata
page.AboutSource = token
}
processed = partitionSentence(payload)
case "og-desc":
page.About = rawdata
page.AboutSource = token
processed = partitionSentence(payload)
case "para":
if performAboutHeuristic(config.Data.Heuristics, payload) {
page.About = rawdata
if page.AboutSource != "og-desc" || len(rawdata)*10 > len(page.About)*7 {
if performAboutHeuristic(config.Data.Heuristics, payload) {
page.About = rawdata
page.AboutSource = token
}
}
processed = partitionSentence(payload)
case "lang":
@@ -146,7 +156,7 @@
case "keywords":
processed = strings.Split(strings.ReplaceAll(payload, ", ", ","), ",")
case "non-webring-link":
externalLinks = append(externalLinks, payload)
externalLinks = append(externalLinks, rawdata)
default:
continue
}
@@ -162,7 +172,7 @@
// only extract path segments once per url.
// we do it here because every page is virtually guaranteed to have a title attr &
// it only appears once
for _, word := range extractPathSegments(pageurl) {
for _, word := range extractPathSegments(strings.ToLower(pageurl)) {
batch = append(batch, types.SearchFragment{Word: word, URL: pageurl, Score: 2})
}
}
@@ -190,10 +200,16 @@ func ingestBatch(db *sql.DB, batch []types.SearchFragment, pageMap map[string]ty
i++
}
// TODO (2021-11-10): debug the "incomplete input" error / log, and find out where it is coming from
log.Println("starting to ingest batch")
log.Println("starting to ingest batch (Pages:", len(pages), "Words:", len(batch), "Links:", len(links), ")")
database.InsertManyDomains(db, pages)
database.InsertManyPages(db, pages)
database.InsertManyWords(db, batch)
for i := 0; i < len(batch); i += 3000 {
end_i := i + 3000
if end_i > len(batch) {
end_i = len(batch)
}
database.InsertManyWords(db, batch[i:end_i])
}
database.InsertManyExternalLinks(db, links)
log.Println("finished ingesting batch")
}

View file

@@ -27,3 +27,5 @@ bannedSuffixes = "data/banned-suffixes.txt"
boringWords = "data/boring-words.txt"
# domains that won't be output as outgoing links
boringDomains = "data/boring-domains.txt"
# queries to search for finding preview text
previewQueryList = "data/preview-query-list.txt"

View file

@@ -7,7 +7,6 @@ import (
"net/http"
"net/url"
"os"
"regexp"
"strings"
"syscall"
@@ -61,17 +60,21 @@ var templates = template.Must(template.ParseFiles(
const useURLTitles = true
var sitePattern = regexp.MustCompile(`site:\S+`)
func (h RequestHandler) searchRoute(res http.ResponseWriter, req *http.Request) {
var query string
var domain string
view := &TemplateView{}
var domain string
if req.Method == http.MethodGet {
var domains = []string{}
var nodomains = []string{}
var langs = []string{}
var queryFields = []string{}
if req.Method == http.MethodGet{
params := req.URL.Query()
if words, exists := params["q"]; exists && words[0] != "" {
query = words[0]
queryFields = strings.Fields(query)
}
// how to use: https://gist.github.com/cblgh/29991ba0a9e65cccbe14f4afd7c975f1
@@ -80,29 +83,36 @@ func (h RequestHandler) searchRoute(res http.ResponseWriter, req *http.Request)
domain = strings.TrimPrefix(parts[0], "https://")
domain = strings.TrimPrefix(domain, "http://")
domain = strings.TrimSuffix(domain, "/")
} else if sitePattern.MatchString(query) {
// if user searched with "site:<domain>" in text box, behave the same way as if a query param was used
domain = sitePattern.FindString(query)[5:]
domains = append(domains, domain)
}
// if clear button was used -> clear site param / search text
if parts, exists := params["clear"]; exists && parts[0] != "" {
domain = ""
query = sitePattern.ReplaceAllString(query, "")
// don't process if there are too many fields
if len(queryFields) <= 100 {
var newQueryFields []string;
for _, word := range queryFields {
// This could be more efficient by splitting arrays, but I'm going with the more readable version for now
if strings.HasPrefix(word, "site:") {
domains = append(domains, strings.TrimPrefix(word, "site:"))
} else if strings.HasPrefix(word, "-site:") {
nodomains = append(nodomains, strings.TrimPrefix(word, "-site:"))
} else if strings.HasPrefix(word, "lang:") {
langs = append(langs, strings.TrimPrefix(word, "lang:"))
} else {
newQueryFields = append(newQueryFields, word)
}
}
queryFields = newQueryFields;
}
}
if len(query) == 0 {
if len(queryFields) == 0 || len(queryFields) > 100 || len(query) >= 8192 {
view.Data = IndexData{Tagline: h.config.General.Tagline, Placeholder: h.config.General.Placeholder}
h.renderView(res, "index", view)
return
}
var pages []types.PageData
if domain != "" {
pages = database.SearchWordsBySite(h.db, util.Inflect(strings.Fields(query)), domain)
} else {
pages = database.SearchWordsByScore(h.db, util.Inflect(strings.Fields(query)))
}
var pages = database.SearchWords(h.db, util.Inflect(queryFields), true, domains, nodomains, langs)
if useURLTitles {
for i, pageData := range pages {
@@ -230,14 +240,15 @@ func (h RequestHandler) renderView(res http.ResponseWriter, tmpl string, view *T
func WriteTheme(config types.Config) {
theme := config.Theme
// no theme is set, use the default
if theme.Foreground == "" {
if theme.Foreground == "" || theme.Background == "" || theme.Links =="" {
return
}
colors := fmt.Sprintf(`:root {
colors := fmt.Sprintf(`/*This file will be automatically regenerated by lieu on startup if the theme colors are set in the configuration file*/
:root {
--primary: %s;
--secondary: %s;
--link: %s;
}\n`, theme.Foreground, theme.Background, theme.Links)
}`, theme.Foreground, theme.Background, theme.Links)
err := os.WriteFile("html/assets/theme.css", []byte(colors), 0644)
util.Check(err)
}
@@ -255,8 +266,9 @@ func Serve(config types.Config) {
http.HandleFunc("/webring", handler.webringRoute)
http.HandleFunc("/filtered", handler.filteredRoute)
fileserver := http.FileServer(http.Dir("html/assets/"))
http.Handle("/assets/", http.StripPrefix("/assets/", fileserver))
fileserver := http.FileServer(http.Dir("html/"))
http.Handle("/assets/", fileserver)
http.Handle("/robots.txt", fileserver)
portstr := fmt.Sprintf(":%d", config.General.Port)
fmt.Println("Listening on port: ", portstr)

View file

@@ -7,19 +7,22 @@ type SearchFragment struct {
}
type PageData struct {
URL string
Title string
About string
Lang string
URL string
Title string
About string
Lang string
AboutSource string
}
type Config struct {
General struct {
Name string `json:name`
Tagline string `json:tagline`
Placeholder string `json:placeholder`
URL string `json:url`
Port int `json:port`
Name string `json:name`
Tagline string `json:tagline`
Placeholder string `json:placeholder`
URL string `json:url`
WebringSelector string `json:"webringSelector"`
Port int `json:port`
Proxy string `json:proxy`
} `json:general`
Theme struct {
Foreground string `json:"foreground"`
@@ -38,5 +41,6 @@ type Config struct {
BannedSuffixes string `json:bannedSuffixes`
BoringWords string `json:boringWords`
BoringDomains string `json:boringDomains`
PreviewQueries string `json:"previewQueryList"`
} `json:crawler`
}

View file

@@ -4,15 +4,18 @@ import (
"bytes"
"encoding/json"
"fmt"
"github.com/PuerkitoBio/goquery"
"io/ioutil"
"log"
"net"
"os"
"regexp"
"strings"
"lieu/types"
"github.com/jinzhu/inflection"
"github.com/komkom/toml"
"lieu/types"
)
func Inflect(words []string) []string {
@@ -29,6 +32,66 @@ func Check(err error) {
}
}
// document.querySelectorAll-type functionality. limited functionality as of now (no classes or id support atm, i think!!)
func QuerySelector(query string, current *goquery.Selection, results *[]string) {
var op, operand string
attrPattern := regexp.MustCompile(`(\w+)\[(\w+)\](.+)?`)
attrValuePattern := regexp.MustCompile(`\[(\w+)\]`)
if len(query) == 0 {
return
}
fields := strings.Fields(query)
part := fields[0]
query = strings.Join(fields[1:], " ")
if part == ">" {
op = "subchild"
} else if attrPattern.MatchString(part) {
op = "element"
matches := attrPattern.FindStringSubmatch(part)
operand = matches[1]
var optional string
if len(matches) == 4 {
optional = matches[3]
}
query = strings.TrimSpace(fmt.Sprintf("[%s]%s %s", matches[2], optional, query))
} else if attrValuePattern.MatchString(part) {
op = "attr"
operand = attrValuePattern.FindStringSubmatch(part)[1]
} else if len(query) == 0 {
op = "final"
} else {
op = "element"
operand = part
}
switch op {
case "element": // e.g. [el]; bla > [el]; but also [el] > bla
current = current.Find(operand)
if strings.HasSuffix(query, "first-of-type") {
break
}
fallthrough
case "subchild": // [preceding] > [future]
// recurse querySelector on all [preceding] element types
current.Each(func(j int, s *goquery.Selection) {
QuerySelector(query, s, results)
})
return
case "attr": // x[attr]
// extract the attribute
if str, exists := current.Attr(operand); exists {
*results = append(*results, str)
}
return
case "final": // no more in query, and we did not end on an attr: get text
*results = append(*results, current.Text())
}
QuerySelector(query, current, results)
}
func DatabaseDoesNotExist(filepath string) {
fmt.Printf("lieu: database %s does not exist\n", filepath)
fmt.Println("lieu: try running `lieu ingest` if you have already crawled source data")
@@ -103,6 +166,7 @@ func WriteMockConfig() {
name = "Sweet Webring"
# used by the precrawl command and linked to in /about route
url = "https://example.com/"
webringSelector = "li > a"
port = 10001
[theme]
@@ -132,6 +196,8 @@ bannedSuffixes = "data/banned-suffixes.txt"
boringWords = "data/boring-words.txt"
# domains that won't be output as outgoing links
boringDomains = "data/boring-domains.txt"
# queries to search for finding preview text
previewQueryList = "data/preview-query-list.txt"
`)
err := ioutil.WriteFile("lieu.toml", conf, 0644)
Check(err)
@@ -140,3 +206,15 @@ boringDomains = "data/boring-domains.txt"
func Exit() {
os.Exit(0)
}
func DeduplicateSlice(intSlice []string) []string {
keys := make(map[string]bool)
list := []string{}
for _, entry := range intSlice {
if _, value := keys[entry]; !value {
keys[entry] = true
list = append(list, entry)
}
}
return list
}