kopia lustrzana https://github.com/cblgh/lieu
Porównaj commity
23 Commity
2022-03-07
...
main
Autor | SHA1 | Data |
---|---|---|
Alexander Cobleigh | f27c45d4be | |
Slatian | 22d1802337 | |
cblgh | 9173912782 | |
Slatian | f41b7f87e7 | |
Slatian | e21cc9a9d0 | |
Slatian | b2a9947fb9 | |
Slatian | b431a15441 | |
Slatian | b4a2e5e269 | |
Slatian | d02edd35ca | |
cblgh | 9377bd6fab | |
cblgh | 9517f62de2 | |
Slatian | 7c6a63ce2c | |
Slatian | 212f5c5655 | |
Slatian | 27e1b68b66 | |
Slatian | e56f60ccb9 | |
Slatian | ed5f5189b0 | |
Slatian | 34d6df3830 | |
Slatian | c72ea4c6ca | |
Slatian | 5fe32df938 | |
Slatian | cbaa6e06b1 | |
Slatian | 0a85f38b36 | |
cblgh | b0ad7dce10 | |
idk | 21ef8aac08 |
|
@ -224,3 +224,4 @@ pip-log.txt
|
|||
|
||||
#Mr Developer
|
||||
.mr.developer.cfg
|
||||
lieu
|
||||
|
|
14
README.md
14
README.md
|
@ -18,6 +18,12 @@ engine, a way for personal webrings to increase serendipitous connexions.
|
|||
|
||||
## Usage
|
||||
|
||||
### How to search
|
||||
|
||||
For the full search syntax (including how to use `site:` and `-site:`), see the [search syntax and API documentation](docs/querying.md). For more tips, read the [appendix](https://cblgh.org/lieu/).
|
||||
|
||||
### Getting Lieu running
|
||||
|
||||
```
|
||||
$ lieu help
|
||||
Lieu: neighbourhood search engine
|
||||
|
@ -70,10 +76,15 @@ The config file is written in [TOML](https://toml.io/en/).
|
|||
name = "Merveilles Webring"
|
||||
# used by the precrawl command and linked to in /about route
|
||||
url = "https://webring.xxiivv.com"
|
||||
# used by the precrawl command to populate the Crawler.Webring file;
|
||||
# takes simple html selectors. might be a bit wonky :)
|
||||
webringSelector = "li > a[href]:first-of-type"
|
||||
port = 10001
|
||||
|
||||
[theme]
|
||||
# colors specified in hex (or valid css names) which determine the theme of the lieu instance
|
||||
# NOTE: If (and only if) all three values are set, lieu uses them to generate the file html/assets/theme.css at startup.
|
||||
# You can also write directly to that file instead of adding this section to your configuration file
|
||||
foreground = "#ffffff"
|
||||
background = "#000000"
|
||||
links = "#ffffff"
|
||||
|
@ -99,6 +110,8 @@ bannedSuffixes = "data/banned-suffixes.txt"
|
|||
boringWords = "data/boring-words.txt"
|
||||
# domains that won't be output as outgoing links
|
||||
boringDomains = "data/boring-domains.txt"
|
||||
# queries to search for finding preview text
|
||||
previewQueryList = "data/preview-query-list.txt"
|
||||
```
|
||||
|
||||
For your own use, the following config fields should be customized:
|
||||
|
@ -116,6 +129,7 @@ The following config-defined files can stay as-is unless you have specific requi
|
|||
* `heuristics`
|
||||
* `wordlist`
|
||||
* `bannedSuffixes`
|
||||
* `previewQueryList`
|
||||
|
||||
For a full rundown of the files and their various jobs, see the [files
|
||||
description](docs/files.md).
|
||||
|
|
|
@ -34,6 +34,19 @@ func getBoringDomains(path string) []string {
|
|||
return util.ReadList(path, "\n")
|
||||
}
|
||||
|
||||
func getAboutHeuristics(path string) []string {
|
||||
return util.ReadList(path, "\n")
|
||||
}
|
||||
|
||||
func getPreviewQueries(path string) []string {
|
||||
previewQueries := util.ReadList(path, "\n")
|
||||
if len(previewQueries) > 0 {
|
||||
return previewQueries
|
||||
} else {
|
||||
return []string{"main p", "article p", "section p", "p"}
|
||||
}
|
||||
}
|
||||
|
||||
func find(list []string, query string) bool {
|
||||
for _, item := range list {
|
||||
if item == query {
|
||||
|
@ -103,27 +116,33 @@ func findSuffix(suffixes []string, query string) bool {
|
|||
func cleanText(s string) string {
|
||||
s = strings.TrimSpace(s)
|
||||
s = strings.ReplaceAll(s, "\n", " ")
|
||||
s = strings.ReplaceAll(s, "|", " ")
|
||||
whitespace := regexp.MustCompile(`\p{Z}`)
|
||||
whitespace := regexp.MustCompile(`\p{Z}+`)
|
||||
s = whitespace.ReplaceAllString(s, " ")
|
||||
return s
|
||||
}
|
||||
|
||||
func handleIndexing(c *colly.Collector) {
|
||||
func handleIndexing(c *colly.Collector, previewQueries []string, heuristics []string) {
|
||||
c.OnHTML("meta[name=\"keywords\"]", func(e *colly.HTMLElement) {
|
||||
fmt.Println("keywords", cleanText(e.Attr("content")), e.Request.URL)
|
||||
})
|
||||
|
||||
c.OnHTML("meta[name=\"description\"]", func(e *colly.HTMLElement) {
|
||||
desc := cleanText(e.Attr("content"))
|
||||
if len(desc) > 0 {
|
||||
if len(desc) > 0 && len(desc) < 1500 {
|
||||
fmt.Println("desc", desc, e.Request.URL)
|
||||
}
|
||||
})
|
||||
|
||||
c.OnHTML("meta[property=\"og:description\"]", func(e *colly.HTMLElement) {
|
||||
ogDesc := cleanText(e.Attr("content"))
|
||||
if len(ogDesc) > 0 && len(ogDesc) < 1500 {
|
||||
fmt.Println("og-desc", ogDesc, e.Request.URL)
|
||||
}
|
||||
})
|
||||
|
||||
c.OnHTML("html[lang]", func(e *colly.HTMLElement) {
|
||||
lang := cleanText(e.Attr("lang"))
|
||||
if len(lang) > 0 {
|
||||
if len(lang) > 0 && len(lang) < 100 {
|
||||
fmt.Println("lang", lang, e.Request.URL)
|
||||
}
|
||||
})
|
||||
|
@ -134,10 +153,26 @@ func handleIndexing(c *colly.Collector) {
|
|||
})
|
||||
|
||||
c.OnHTML("body", func(e *colly.HTMLElement) {
|
||||
QueryLoop:
|
||||
for i := 0; i < len(previewQueries); i++ {
|
||||
// After the fourth paragraph we're probably too far in to get something interesting for a preview
|
||||
elements := e.DOM.Find(previewQueries[i])
|
||||
for j := 0; j < 4 && j < elements.Length(); j++ {
|
||||
element_text := elements.Slice(j, j+1).Text()
|
||||
paragraph := cleanText(element_text)
|
||||
if len(paragraph) < 1500 && len(paragraph) > 20 {
|
||||
if !util.Contains(heuristics, strings.ToLower(paragraph)) {
|
||||
fmt.Println("para", paragraph, e.Request.URL)
|
||||
break QueryLoop
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
paragraph := cleanText(e.DOM.Find("p").First().Text())
|
||||
if len(paragraph) < 1500 && len(paragraph) > 0 {
|
||||
fmt.Println("para", paragraph, e.Request.URL)
|
||||
fmt.Println("para-just-p", paragraph, e.Request.URL)
|
||||
}
|
||||
|
||||
// get all relevant page headings
|
||||
collectHeadingText("h1", e)
|
||||
collectHeadingText("h2", e)
|
||||
|
@ -153,7 +188,33 @@ func collectHeadingText(heading string, e *colly.HTMLElement) {
|
|||
}
|
||||
}
|
||||
|
||||
func SetupDefaultProxy(config types.Config) error {
|
||||
// no proxy configured, go back
|
||||
if config.General.Proxy == "" {
|
||||
return nil
|
||||
}
|
||||
proxyURL, err := url.Parse(config.General.Proxy)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
httpClient := &http.Client{
|
||||
Transport: &http.Transport{
|
||||
Proxy: http.ProxyURL(proxyURL),
|
||||
},
|
||||
}
|
||||
|
||||
http.DefaultClient = httpClient
|
||||
return nil
|
||||
}
|
||||
|
||||
func Precrawl(config types.Config) {
|
||||
// setup proxy
|
||||
err := SetupDefaultProxy(config)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
res, err := http.Get(config.General.URL)
|
||||
util.Check(err)
|
||||
defer res.Body.Close()
|
||||
|
@ -166,11 +227,12 @@ func Precrawl(config types.Config) {
|
|||
util.Check(err)
|
||||
|
||||
items := make([]string, 0)
|
||||
doc.Find("li").Each(func(i int, s *goquery.Selection) {
|
||||
if domain, exists := s.Find("a").Attr("href"); exists {
|
||||
items = append(items, domain)
|
||||
}
|
||||
})
|
||||
s := doc.Find("html")
|
||||
query := config.General.WebringSelector
|
||||
if query == "" {
|
||||
query = "li > a[href]:first-of-type"
|
||||
}
|
||||
util.QuerySelector(query, s, &items)
|
||||
|
||||
BANNED := getBannedDomains(config.Crawler.BannedDomains)
|
||||
for _, item := range items {
|
||||
|
@ -189,6 +251,11 @@ func Precrawl(config types.Config) {
|
|||
}
|
||||
|
||||
func Crawl(config types.Config) {
|
||||
// setup proxy
|
||||
err := SetupDefaultProxy(config)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
SUFFIXES := getBannedSuffixes(config.Crawler.BannedSuffixes)
|
||||
links := getWebringLinks(config.Crawler.Webring)
|
||||
domains, pathsites := getDomains(links)
|
||||
|
@ -199,6 +266,9 @@ func Crawl(config types.Config) {
|
|||
c := colly.NewCollector(
|
||||
colly.MaxDepth(3),
|
||||
)
|
||||
if config.General.Proxy != "" {
|
||||
c.SetProxy(config.General.Proxy)
|
||||
}
|
||||
|
||||
q, _ := queue.New(
|
||||
5, /* threads */
|
||||
|
@ -219,9 +289,16 @@ func Crawl(config types.Config) {
|
|||
|
||||
boringDomains := getBoringDomains(config.Crawler.BoringDomains)
|
||||
boringWords := getBoringWords(config.Crawler.BoringWords)
|
||||
previewQueries := getPreviewQueries(config.Crawler.PreviewQueries)
|
||||
heuristics := getAboutHeuristics(config.Data.Heuristics)
|
||||
|
||||
// on every a element which has an href attribute, call callback
|
||||
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
|
||||
|
||||
if e.Response.StatusCode >= 400 || e.Response.StatusCode <= 100 {
|
||||
return
|
||||
}
|
||||
|
||||
link := getLink(e.Attr("href"))
|
||||
if findSuffix(SUFFIXES, link) {
|
||||
return
|
||||
|
@ -267,7 +344,7 @@ func Crawl(config types.Config) {
|
|||
}
|
||||
})
|
||||
|
||||
handleIndexing(c)
|
||||
handleIndexing(c, previewQueries, heuristics)
|
||||
|
||||
// start scraping
|
||||
q.Run(c)
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
|
|
@ -8,3 +8,16 @@ last edit
|
|||
(c)
|
||||
all rights reserved
|
||||
licensed under
|
||||
subscribe
|
||||
|
|
||||
•
|
||||
generated by
|
||||
powered by
|
||||
this post was
|
||||
click here for
|
||||
click here to
|
||||
published on:
|
||||
published:
|
||||
posted:
|
||||
share this article
|
||||
estimated read time
|
||||
|
|
|
@ -0,0 +1,9 @@
|
|||
header p.p-summary
|
||||
main p.p-summary
|
||||
main p:not(.post-meta):not(.alternate):not(header p):not(footer p):not(nav p):not(aside p):not(.sidebar p)
|
||||
article p.p-summary
|
||||
article p:not(.post-meta):not(.alternate):not(header p):not(footer p):not(nav p):not(aside p):not(.sidebar p)
|
||||
p:not(.post-meta):not(.alternate):not(header p):not(footer p):not(nav p):not(aside p):not(.sidebar p)
|
||||
header ~ p:not(.post-meta):not(.alternate):not(footer p):not(aside p):not(.sidebar p)
|
||||
h1 ~ p:not(.post-meta):not(.alternate)
|
||||
p:not(.post-meta):not(.alternate):not(footer p):not(aside p):not(.sidebar p)
|
|
@ -0,0 +1 @@
|
|||
|
|
@ -19,10 +19,13 @@ import (
|
|||
"log"
|
||||
"net/url"
|
||||
"strings"
|
||||
"regexp"
|
||||
|
||||
_ "github.com/mattn/go-sqlite3"
|
||||
)
|
||||
|
||||
var languageCodeSanityRegex = regexp.MustCompile("^[a-zA-Z\\-0-9]+$")
|
||||
|
||||
func InitDB(filepath string) *sql.DB {
|
||||
db, err := sql.Open("sqlite3", filepath)
|
||||
if err != nil {
|
||||
|
@ -95,17 +98,19 @@ query params:
|
|||
&order=score, &order=count
|
||||
*/
|
||||
|
||||
var emptyStringArray = []string{}
|
||||
|
||||
func SearchWordsByScore(db *sql.DB, words []string) []types.PageData {
|
||||
return searchWords(db, words, true)
|
||||
return SearchWords(db, words, true, emptyStringArray, emptyStringArray, emptyStringArray)
|
||||
}
|
||||
|
||||
func SearchWordsBySite(db *sql.DB, words []string, domain string) []types.PageData {
|
||||
// search words by site is same as search words by score, but adds a domain condition
|
||||
return searchWords(db, words, true, domain)
|
||||
return SearchWords(db, words, true, []string{domain}, emptyStringArray, emptyStringArray)
|
||||
}
|
||||
|
||||
func SearchWordsByCount(db *sql.DB, words []string) []types.PageData {
|
||||
return searchWords(db, words, false)
|
||||
return SearchWords(db, words, false, emptyStringArray, emptyStringArray, emptyStringArray)
|
||||
}
|
||||
|
||||
func FulltextSearchWords(db *sql.DB, phrase string) []types.PageData {
|
||||
|
@ -222,12 +227,16 @@ func countQuery(db *sql.DB, table string) int {
|
|||
return count
|
||||
}
|
||||
|
||||
func searchWords(db *sql.DB, words []string, searchByScore bool, domain ...string) []types.PageData {
|
||||
var wordlist []string
|
||||
func SearchWords(db *sql.DB, words []string, searchByScore bool, domain []string, nodomain []string, language []string) []types.PageData {
|
||||
var args []interface{}
|
||||
for _, word := range words {
|
||||
wordlist = append(wordlist, "word = ?")
|
||||
args = append(args, strings.ToLower(word))
|
||||
|
||||
wordlist := []string{"1"}
|
||||
if len(words) > 0 && words[0] != "" {
|
||||
wordlist = make([]string, 0)
|
||||
for _, word := range words {
|
||||
wordlist = append(wordlist, "word = ?")
|
||||
args = append(args, strings.ToLower(word))
|
||||
}
|
||||
}
|
||||
|
||||
// the domains conditional defaults to just 'true' i.e. no domain condition
|
||||
|
@ -240,6 +249,28 @@ func searchWords(db *sql.DB, words []string, searchByScore bool, domain ...strin
|
|||
}
|
||||
}
|
||||
|
||||
nodomains := []string{"1"}
|
||||
if len(nodomain) > 0 && nodomain[0] != "" {
|
||||
nodomains = make([]string, 0)
|
||||
for _, d := range nodomain {
|
||||
nodomains = append(nodomains, "domain != ?")
|
||||
args = append(args, d)
|
||||
}
|
||||
}
|
||||
|
||||
//This needs some wildcard support …
|
||||
languages := []string{"1"}
|
||||
if len(language) > 0 && language[0] != "" {
|
||||
languages = make([]string, 0)
|
||||
for _, d := range language {
|
||||
// Do a little check to avoid the database being DOSed
|
||||
if languageCodeSanityRegex.MatchString(d) {
|
||||
languages = append(languages, "lang LIKE ?")
|
||||
args = append(args, d+"%")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
orderType := "SUM(score)"
|
||||
if !searchByScore {
|
||||
orderType = "COUNT(*)"
|
||||
|
@ -250,11 +281,13 @@ func searchWords(db *sql.DB, words []string, searchByScore bool, domain ...strin
|
|||
FROM inv_index inv INNER JOIN pages p ON inv.url = p.url
|
||||
WHERE (%s)
|
||||
AND (%s)
|
||||
AND (%s)
|
||||
AND (%s)
|
||||
GROUP BY inv.url
|
||||
ORDER BY %s
|
||||
DESC
|
||||
LIMIT 15
|
||||
`, strings.Join(wordlist, " OR "), strings.Join(domains, " OR "), orderType)
|
||||
`, strings.Join(wordlist, " OR "), strings.Join(domains, " OR "), strings.Join(nodomains, " AND "), strings.Join(languages, " OR "), orderType)
|
||||
|
||||
stmt, err := db.Prepare(query)
|
||||
util.Check(err)
|
||||
|
|
|
@ -37,6 +37,8 @@ bannedSuffixes = "data/banned-suffixes.txt"
|
|||
boringWords = "data/boring-words.txt"
|
||||
# domains that won't be output as outgoing links
|
||||
boringDomains = "data/boring-domains.txt"
|
||||
# queries to search for finding preview text
|
||||
previewQueryList = "data/preview-query-list.txt"
|
||||
```
|
||||
|
||||
## HTML
|
||||
|
@ -120,6 +122,21 @@ are stopped from entering the search index. The default wordlist consists of the
|
|||
1000 or so most common English words, albeit curated slightly to still allow for
|
||||
interesting concepts and verbs—such as `reading` and `books`, for example.
|
||||
|
||||
#### `previewQueryList`
|
||||
A list of css selectors—one per line—used to fetch preview paragraphs. The first paragraph
|
||||
found passing a check against the `heuristics` file makes it into the search index. For
|
||||
each selector in `previewQueryList`, Lieu tries the first four paragraphs—as found by the
|
||||
selector—before trying to find a new set of paragraphs using the file's next selector.
|
||||
|
||||
To get good results, one usually wants to tune this list to get the first "real" paragraph
|
||||
after common page headers, or finding a summary paragraph. The default has been, at the time of
|
||||
writing, tuned for use with the [Fediring](https://fediring.net).
|
||||
|
||||
Depending on the structure of the websites you are indexing, this will get you 70-90% of the
|
||||
way in terms of accurate link descriptions. For the rest of the way, fine-tune `heuristics.txt`
|
||||
and reach out to the creators of the websites you are indexing; they often appreciate the
|
||||
feedback.
|
||||
|
||||
#### OpenSearch metadata
|
||||
If you are running your own instance of Lieu, you might want to look into changing the URL
|
||||
defined in the file `opensearch.xml`, which specifies [OpenSearch
|
||||
|
|
|
@ -0,0 +1,41 @@
|
|||
# Querying Lieu
|
||||
|
||||
## Search Syntax
|
||||
|
||||
* `cat dog` - search for pages about cats or dogs, most probably both
|
||||
* `fox site:example.org` - search example.org (if indexed) for term "fox"
|
||||
* `fox -site:example.org` - search all indexed sites except `example.org` for term "fox"
|
||||
* `emoji lang:de` - search pages that claim to mainly contain German content for the term "emoji"
|
||||
|
||||
When searching, capitalisation and inflection do not matter, as search terms are:
|
||||
|
||||
* Converted to lowercase using the Go standard library
|
||||
* Passed through [jinzhu's inflection library](https://github.com/jinzhu/inflection) for
|
||||
converting to a possible singular form (intended to work with English nouns)
|
||||
|
||||
## Search API
|
||||
|
||||
Lieu currently only renders its results to HTML. A query can be passed to the `/` endpoint using a `GET` request.
|
||||
|
||||
It supports two URL parameters:
|
||||
* `q` - used for the search query
|
||||
* `site` - accepts one domain name and will have the same effect as the `site:<domain>` syntax.
|
||||
You can use this to make your webring's search engine double as a searchbox on your website.
|
||||
|
||||
### Examples
|
||||
To search `example.org` for the term "ssh" using `https://search.webring.example`:
|
||||
|
||||
```
|
||||
https://search.webring.example/?q=ssh&site=example.org
|
||||
```
|
||||
|
||||
Adding a form element, to use Lieu as a search engine, to the HTML at example.org:
|
||||
|
||||
```
|
||||
<form method="GET" action="https://search.webring.example">
|
||||
<label for="search">Search example.org</label>
|
||||
<input type="search" minlength="1" required="" name="q" placeholder="Your search query here" id="search">
|
||||
<input type="hidden" name="site" value="example.org"> <!-- This hidden field tells lieu to only search example.org -->
|
||||
<button type="submit">Let's go!</button>
|
||||
</form>
|
||||
```
|
Plik binarny nie jest wyświetlany.
Przed Szerokość: | Wysokość: | Rozmiar: 1.1 KiB Po Szerokość: | Wysokość: | Rozmiar: 326 B |
Plik binarny nie jest wyświetlany.
Przed Szerokość: | Wysokość: | Rozmiar: 7.0 KiB Po Szerokość: | Wysokość: | Rozmiar: 2.7 KiB |
|
@ -0,0 +1,2 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<svg width="420" height="420" fill="none" version="1.1" xmlns="http://www.w3.org/2000/svg"><g stroke-linecap="round"><rect width="420" height="420" fill="#000" stroke-width="12.8"/></g><path d="m210 87c-53.5 0-104 27.1-149 71.9l-28.6 28.6 50.7 50.7 28.3-27.9 4.76 13.4-38.8 38.8 57.3 57.3 34.5-34.5v2.72 4.5 40.5h81v-40.5-4.5-2.72l34.5 34.5 57.3-57.3-38.8-38.8 4.76-13.4 28.3 27.9 50.7-50.7-28.6-28.6c-44.8-44.8-95.1-71.9-149-71.9zm0 81c11.2 0 19.5 8.25 19.5 19.5s-8.25 19.5-19.5 19.5-19.5-8.25-19.5-19.5 8.25-19.5 19.5-19.5z" color="#000000" fill="#fff" stroke-linecap="square" stroke-linejoin="round" style="-inkscape-stroke:none"/></svg>
|
Po Szerokość: | Wysokość: | Rozmiar: 680 B |
|
@ -15,7 +15,7 @@
|
|||
<link href="/assets/theme.css" rel="stylesheet">
|
||||
|
||||
<link rel="icon" href="/assets/favicon.ico">
|
||||
<link rel="icon" href="/assets/logo.svg" type="image/svg+xml">
|
||||
<link rel="icon" href="/assets/favicon.svg" type="image/svg+xml">
|
||||
<link rel="shortcut icon" href="/assets/favicon.png">
|
||||
<link rel="apple-touch-icon" href="/assets/favicon.png">
|
||||
<meta name="theme-color" content="#000000">
|
||||
|
|
|
@ -20,7 +20,7 @@
|
|||
<form class="search">
|
||||
<label class="visually-hidden" for="search">Search {{ .SiteName }}</label>
|
||||
<span class="search__input">
|
||||
<input type="search" required minlength="1" name="q" placeholder="{{ .Data.Placeholder }}" class="flex-grow" id="search">
|
||||
<input type="search" required minlength="1" name="q" placeholder="{{ .Data.Placeholder }}" class="flex-grow" id="search" maxlength="6000" >
|
||||
<button type="submit" class="search__button" aria-label="Search" title="Search">
|
||||
<svg viewBox="0 0 420 300" xmlns="http://www.w3.org/2000/svg" baseProfile="full" style="background:var(--secondary)" width="42" height="30" fill="none"><path d="M90 135q60-60 120-60 0 0 0 0 60 0 120 60m-120 60a60 60 0 01-60-60 60 60 0 0160-60 60 60 0 0160 60 60 60 0 01-60 60m45-15h0l30 30m-75-15h0v45m-45-60h0l-30 30" stroke-width="81" stroke-linecap="square" stroke-linejoin="round" stroke="var(--primary)"/></svg>
|
||||
</button>
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
User-agent: *
|
||||
Disallow: /*?
|
|
@ -6,7 +6,7 @@
|
|||
<form method="GET" class="search">
|
||||
<label for="search">Search {{ .SiteName }} </label>
|
||||
<span class="search__input">
|
||||
<input type="search" minlength="1" required name="q" placeholder="Search" value="{{ .Data.Query }}" class="search-box" id="search">
|
||||
<input type="search" minlength="1" required name="q" placeholder="Search" value="{{ .Data.Query }}" class="search-box" id="search" maxlength="6000">
|
||||
{{ if ne .Data.Site "" }}
|
||||
<input type="hidden" value="{{ .Data.Site }}" name="site">
|
||||
{{ end }}
|
||||
|
|
|
@ -95,7 +95,7 @@ func Ingest(config types.Config) {
|
|||
continue
|
||||
}
|
||||
|
||||
pageurl := strings.ToLower(strings.TrimSuffix(strings.TrimSpace(line[lastSpace:len(line)]), "/"))
|
||||
pageurl := strings.TrimSuffix(strings.TrimSpace(line[lastSpace:len(line)]), "/")
|
||||
if !strings.HasPrefix(pageurl, "http") {
|
||||
continue
|
||||
}
|
||||
|
@ -117,6 +117,7 @@ func Ingest(config types.Config) {
|
|||
case "title":
|
||||
if len(page.About) == 0 {
|
||||
page.About = rawdata
|
||||
page.AboutSource = token
|
||||
}
|
||||
score = 5
|
||||
page.Title = rawdata
|
||||
|
@ -124,6 +125,7 @@ func Ingest(config types.Config) {
|
|||
case "h1":
|
||||
if len(page.About) == 0 {
|
||||
page.About = rawdata
|
||||
page.AboutSource = token
|
||||
}
|
||||
fallthrough
|
||||
case "h2":
|
||||
|
@ -132,13 +134,21 @@ func Ingest(config types.Config) {
|
|||
score = 15
|
||||
processed = partitionSentence(payload)
|
||||
case "desc":
|
||||
if len(page.About) < 30 && len(rawdata) < 100 {
|
||||
if len(page.About) < 30 && len(rawdata) < 100 && len(rawdata) > len(page.About) {
|
||||
page.About = rawdata
|
||||
page.AboutSource = token
|
||||
}
|
||||
processed = partitionSentence(payload)
|
||||
case "og-desc":
|
||||
page.About = rawdata
|
||||
page.AboutSource = token
|
||||
processed = partitionSentence(payload)
|
||||
case "para":
|
||||
if performAboutHeuristic(config.Data.Heuristics, payload) {
|
||||
page.About = rawdata
|
||||
if page.AboutSource != "og-desc" || len(rawdata)*10 > len(page.About)*7 {
|
||||
if performAboutHeuristic(config.Data.Heuristics, payload) {
|
||||
page.About = rawdata
|
||||
page.AboutSource = token
|
||||
}
|
||||
}
|
||||
processed = partitionSentence(payload)
|
||||
case "lang":
|
||||
|
@ -146,7 +156,7 @@ func Ingest(config types.Config) {
|
|||
case "keywords":
|
||||
processed = strings.Split(strings.ReplaceAll(payload, ", ", ","), ",")
|
||||
case "non-webring-link":
|
||||
externalLinks = append(externalLinks, payload)
|
||||
externalLinks = append(externalLinks, rawdata)
|
||||
default:
|
||||
continue
|
||||
}
|
||||
|
@ -162,7 +172,7 @@ func Ingest(config types.Config) {
|
|||
// only extract path segments once per url.
|
||||
// we do it here because every page is virtually guaranteed to have a title attr &
|
||||
// it only appears once
|
||||
for _, word := range extractPathSegments(pageurl) {
|
||||
for _, word := range extractPathSegments(strings.ToLower(pageurl)) {
|
||||
batch = append(batch, types.SearchFragment{Word: word, URL: pageurl, Score: 2})
|
||||
}
|
||||
}
|
||||
|
@ -190,10 +200,16 @@ func ingestBatch(db *sql.DB, batch []types.SearchFragment, pageMap map[string]ty
|
|||
i++
|
||||
}
|
||||
// TODO (2021-11-10): debug the "incomplete input" error / log, and find out where it is coming from
|
||||
log.Println("starting to ingest batch")
|
||||
log.Println("starting to ingest batch (Pages:", len(pages), "Words:", len(batch), "Links:", len(links), ")")
|
||||
database.InsertManyDomains(db, pages)
|
||||
database.InsertManyPages(db, pages)
|
||||
database.InsertManyWords(db, batch)
|
||||
for i := 0; i < len(batch); i += 3000 {
|
||||
end_i := i + 3000
|
||||
if end_i > len(batch) {
|
||||
end_i = len(batch)
|
||||
}
|
||||
database.InsertManyWords(db, batch[i:end_i])
|
||||
}
|
||||
database.InsertManyExternalLinks(db, links)
|
||||
log.Println("finished ingesting batch")
|
||||
}
|
||||
|
|
|
@ -27,3 +27,5 @@ bannedSuffixes = "data/banned-suffixes.txt"
|
|||
boringWords = "data/boring-words.txt"
|
||||
# domains that won't be output as outgoing links
|
||||
boringDomains = "data/boring-domains.txt"
|
||||
# queries to search for finding preview text
|
||||
previewQueryList = "data/preview-query-list.txt"
|
||||
|
|
|
@ -7,7 +7,6 @@ import (
|
|||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"regexp"
|
||||
"strings"
|
||||
"syscall"
|
||||
|
||||
|
@ -61,17 +60,21 @@ var templates = template.Must(template.ParseFiles(
|
|||
|
||||
const useURLTitles = true
|
||||
|
||||
var sitePattern = regexp.MustCompile(`site:\S+`)
|
||||
|
||||
func (h RequestHandler) searchRoute(res http.ResponseWriter, req *http.Request) {
|
||||
var query string
|
||||
var domain string
|
||||
view := &TemplateView{}
|
||||
|
||||
var domain string
|
||||
if req.Method == http.MethodGet {
|
||||
var domains = []string{}
|
||||
var nodomains = []string{}
|
||||
var langs = []string{}
|
||||
var queryFields = []string{}
|
||||
|
||||
if req.Method == http.MethodGet{
|
||||
params := req.URL.Query()
|
||||
if words, exists := params["q"]; exists && words[0] != "" {
|
||||
query = words[0]
|
||||
queryFields = strings.Fields(query)
|
||||
}
|
||||
|
||||
// how to use: https://gist.github.com/cblgh/29991ba0a9e65cccbe14f4afd7c975f1
|
||||
|
@ -80,29 +83,36 @@ func (h RequestHandler) searchRoute(res http.ResponseWriter, req *http.Request)
|
|||
domain = strings.TrimPrefix(parts[0], "https://")
|
||||
domain = strings.TrimPrefix(domain, "http://")
|
||||
domain = strings.TrimSuffix(domain, "/")
|
||||
} else if sitePattern.MatchString(query) {
|
||||
// if user searched with "site:<domain>" in text box, behave the same way as if a query param was used
|
||||
domain = sitePattern.FindString(query)[5:]
|
||||
domains = append(domains, domain)
|
||||
}
|
||||
// if clear button was used -> clear site param / search text
|
||||
if parts, exists := params["clear"]; exists && parts[0] != "" {
|
||||
domain = ""
|
||||
query = sitePattern.ReplaceAllString(query, "")
|
||||
|
||||
// don't process if there are too many fields
|
||||
if len(queryFields) <= 100 {
|
||||
var newQueryFields []string;
|
||||
for _, word := range queryFields {
|
||||
// This could be more efficient by splitting arrays, but I'm going with the more readable version for now
|
||||
if strings.HasPrefix(word, "site:") {
|
||||
domains = append(domains, strings.TrimPrefix(word, "site:"))
|
||||
} else if strings.HasPrefix(word, "-site:") {
|
||||
nodomains = append(nodomains, strings.TrimPrefix(word, "-site:"))
|
||||
} else if strings.HasPrefix(word, "lang:") {
|
||||
langs = append(langs, strings.TrimPrefix(word, "lang:"))
|
||||
} else {
|
||||
newQueryFields = append(newQueryFields, word)
|
||||
}
|
||||
}
|
||||
queryFields = newQueryFields;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if len(query) == 0 {
|
||||
if len(queryFields) == 0 || len(queryFields) > 100 || len(query) >= 8192 {
|
||||
view.Data = IndexData{Tagline: h.config.General.Tagline, Placeholder: h.config.General.Placeholder}
|
||||
h.renderView(res, "index", view)
|
||||
return
|
||||
}
|
||||
|
||||
var pages []types.PageData
|
||||
if domain != "" {
|
||||
pages = database.SearchWordsBySite(h.db, util.Inflect(strings.Fields(query)), domain)
|
||||
} else {
|
||||
pages = database.SearchWordsByScore(h.db, util.Inflect(strings.Fields(query)))
|
||||
}
|
||||
var pages = database.SearchWords(h.db, util.Inflect(queryFields), true, domains, nodomains, langs)
|
||||
|
||||
if useURLTitles {
|
||||
for i, pageData := range pages {
|
||||
|
@ -230,14 +240,15 @@ func (h RequestHandler) renderView(res http.ResponseWriter, tmpl string, view *T
|
|||
func WriteTheme(config types.Config) {
|
||||
theme := config.Theme
|
||||
// no theme is set, use the default
|
||||
if theme.Foreground == "" {
|
||||
if theme.Foreground == "" || theme.Background == "" || theme.Links =="" {
|
||||
return
|
||||
}
|
||||
colors := fmt.Sprintf(`:root {
|
||||
colors := fmt.Sprintf(`/*This file will be automatically regenerated by lieu on startup if the theme colors are set in the configuration file*/
|
||||
:root {
|
||||
--primary: %s;
|
||||
--secondary: %s;
|
||||
--link: %s;
|
||||
}\n`, theme.Foreground, theme.Background, theme.Links)
|
||||
}`, theme.Foreground, theme.Background, theme.Links)
|
||||
err := os.WriteFile("html/assets/theme.css", []byte(colors), 0644)
|
||||
util.Check(err)
|
||||
}
|
||||
|
@ -255,8 +266,9 @@ func Serve(config types.Config) {
|
|||
http.HandleFunc("/webring", handler.webringRoute)
|
||||
http.HandleFunc("/filtered", handler.filteredRoute)
|
||||
|
||||
fileserver := http.FileServer(http.Dir("html/assets/"))
|
||||
http.Handle("/assets/", http.StripPrefix("/assets/", fileserver))
|
||||
fileserver := http.FileServer(http.Dir("html/"))
|
||||
http.Handle("/assets/", fileserver)
|
||||
http.Handle("/robots.txt", fileserver)
|
||||
|
||||
portstr := fmt.Sprintf(":%d", config.General.Port)
|
||||
fmt.Println("Listening on port: ", portstr)
|
||||
|
|
|
@ -7,19 +7,22 @@ type SearchFragment struct {
|
|||
}
|
||||
|
||||
type PageData struct {
|
||||
URL string
|
||||
Title string
|
||||
About string
|
||||
Lang string
|
||||
URL string
|
||||
Title string
|
||||
About string
|
||||
Lang string
|
||||
AboutSource string
|
||||
}
|
||||
|
||||
type Config struct {
|
||||
General struct {
|
||||
Name string `json:name`
|
||||
Tagline string `json:tagline`
|
||||
Placeholder string `json:placeholder`
|
||||
URL string `json:url`
|
||||
Port int `json:port`
|
||||
Name string `json:name`
|
||||
Tagline string `json:tagline`
|
||||
Placeholder string `json:placeholder`
|
||||
URL string `json:url`
|
||||
WebringSelector string `json:"webringSelector"`
|
||||
Port int `json:port`
|
||||
Proxy string `json:proxy`
|
||||
} `json:general`
|
||||
Theme struct {
|
||||
Foreground string `json:"foreground"`
|
||||
|
@ -38,5 +41,6 @@ type Config struct {
|
|||
BannedSuffixes string `json:bannedSuffixes`
|
||||
BoringWords string `json:boringWords`
|
||||
BoringDomains string `json:boringDomains`
|
||||
PreviewQueries string `json:"previewQueryList"`
|
||||
} `json:crawler`
|
||||
}
|
||||
|
|
80
util/util.go
80
util/util.go
|
@ -4,15 +4,18 @@ import (
|
|||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"net"
|
||||
"os"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"lieu/types"
|
||||
|
||||
"github.com/jinzhu/inflection"
|
||||
"github.com/komkom/toml"
|
||||
"lieu/types"
|
||||
)
|
||||
|
||||
func Inflect(words []string) []string {
|
||||
|
@ -29,6 +32,66 @@ func Check(err error) {
|
|||
}
|
||||
}
|
||||
|
||||
// document.querySelectorAll-type functionality. limited functionality as of now (no classes or id support atm, i think!!)
|
||||
func QuerySelector(query string, current *goquery.Selection, results *[]string) {
|
||||
var op, operand string
|
||||
|
||||
attrPattern := regexp.MustCompile(`(\w+)\[(\w+)\](.+)?`)
|
||||
attrValuePattern := regexp.MustCompile(`\[(\w+)\]`)
|
||||
|
||||
if len(query) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
fields := strings.Fields(query)
|
||||
part := fields[0]
|
||||
query = strings.Join(fields[1:], " ")
|
||||
if part == ">" {
|
||||
op = "subchild"
|
||||
} else if attrPattern.MatchString(part) {
|
||||
op = "element"
|
||||
matches := attrPattern.FindStringSubmatch(part)
|
||||
operand = matches[1]
|
||||
var optional string
|
||||
if len(matches) == 4 {
|
||||
optional = matches[3]
|
||||
}
|
||||
query = strings.TrimSpace(fmt.Sprintf("[%s]%s %s", matches[2], optional, query))
|
||||
} else if attrValuePattern.MatchString(part) {
|
||||
op = "attr"
|
||||
operand = attrValuePattern.FindStringSubmatch(part)[1]
|
||||
} else if len(query) == 0 {
|
||||
op = "final"
|
||||
} else {
|
||||
op = "element"
|
||||
operand = part
|
||||
}
|
||||
|
||||
switch op {
|
||||
case "element": // e.g. [el]; bla > [el]; but also [el] > bla
|
||||
current = current.Find(operand)
|
||||
if strings.HasSuffix(query, "first-of-type") {
|
||||
break
|
||||
}
|
||||
fallthrough
|
||||
case "subchild": // [preceding] > [future]
|
||||
// recurse querySelector on all [preceding] element types
|
||||
current.Each(func(j int, s *goquery.Selection) {
|
||||
QuerySelector(query, s, results)
|
||||
})
|
||||
return
|
||||
case "attr": // x[attr]
|
||||
// extract the attribute
|
||||
if str, exists := current.Attr(operand); exists {
|
||||
*results = append(*results, str)
|
||||
}
|
||||
return
|
||||
case "final": // no more in query, and we did not end on an attr: get text
|
||||
*results = append(*results, current.Text())
|
||||
}
|
||||
QuerySelector(query, current, results)
|
||||
}
|
||||
|
||||
func DatabaseDoesNotExist(filepath string) {
|
||||
fmt.Printf("lieu: database %s does not exist\n", filepath)
|
||||
fmt.Println("lieu: try running `lieu ingest` if you have already crawled source data")
|
||||
|
@ -103,6 +166,7 @@ func WriteMockConfig() {
|
|||
name = "Sweet Webring"
|
||||
# used by the precrawl command and linked to in /about route
|
||||
url = "https://example.com/"
|
||||
webringSelector = "li > a"
|
||||
port = 10001
|
||||
|
||||
[theme]
|
||||
|
@ -132,6 +196,8 @@ bannedSuffixes = "data/banned-suffixes.txt"
|
|||
boringWords = "data/boring-words.txt"
|
||||
# domains that won't be output as outgoing links
|
||||
boringDomains = "data/boring-domains.txt"
|
||||
# queries to search for finding preview text
|
||||
previewQueryList = "data/preview-query-list.txt"
|
||||
`)
|
||||
err := ioutil.WriteFile("lieu.toml", conf, 0644)
|
||||
Check(err)
|
||||
|
@ -140,3 +206,15 @@ boringDomains = "data/boring-domains.txt"
|
|||
func Exit() {
|
||||
os.Exit(0)
|
||||
}
|
||||
|
||||
func DeduplicateSlice(intSlice []string) []string {
|
||||
keys := make(map[string]bool)
|
||||
list := []string{}
|
||||
for _, entry := range intSlice {
|
||||
if _, value := keys[entry]; !value {
|
||||
keys[entry] = true
|
||||
list = append(list, entry)
|
||||
}
|
||||
}
|
||||
return list
|
||||
}
|
||||
|
|
Ładowanie…
Reference in New Issue