pull/4/head
cblgh 2021-02-03 09:12:30 +01:00
commit 28d052f4c3
34 changed files with 2133 additions and 0 deletions

223
.gitignore vendored 100755
@@ -0,0 +1,223 @@
#~top ignores~
node_modules/
*.vim
*bundle*.js
/html/*.html
*.sw[a-z]
config.conf
config.js
*.pdf
archives
builds
dist
#################
## Eclipse
#################
*.pydevproject
.project
.metadata
bin/
tmp/
*.tmp
*.bak
*.swp
*~.nib
local.properties
.classpath
.settings/
.loadpath
# External tool builders
.externalToolBuilders/
# Locally stored "Eclipse launch configurations"
*.launch
# CDT-specific
.cproject
# PDT-specific
.buildpath
#################
## Visual Studio
#################
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
# User-specific files
*.suo
*.user
*.sln.docstates
# Build results
[Dd]ebug/
[Rr]elease/
x64/
build/
[Bb]in/
[Oo]bj/
# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*
*_i.c
*_p.c
*.ilk
*.meta
*.obj
*.pch
*.pdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*.log
*.vspscc
*.vssscc
.builds
*.pidb
*.log
*.scc
# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opensdf
*.sdf
*.cachefile
# Visual Studio profiler
*.psess
*.vsp
*.vspx
# Guidance Automation Toolkit
*.gpState
# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
# TeamCity is a build add-in
_TeamCity*
# DotCover is a Code Coverage Tool
*.dotCover
# NCrunch
*.ncrunch*
.*crunch*.local.xml
# Installshield output folder
[Ee]xpress/
# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html
# Click-Once directory
publish/
# Publish Web Output
*.Publish.xml
*.pubxml
# Windows Azure Build Output
csx
*.build.csdef
# Windows Store app package directory
AppPackages/
# Others
sql/
*.Cache
ClientBin/
[Ss]tyle[Cc]op.*
~$*
*~
*.dbmdl
*.[Pp]ublish.xml
*.pfx
*.publishsettings
# RIA/Silverlight projects
Generated_Code/
# Backup & report files from converting an old project file to a newer
# Visual Studio version. Backup files are not needed, because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
# SQL Server files
App_Data/*.mdf
App_Data/*.ldf
#############
## Windows detritus
#############
# Windows image file caches
Thumbs.db
ehthumbs.db
# Folder config file
Desktop.ini
# Recycle Bin used on file shares
$RECYCLE.BIN/
# Mac crap
.DS_Store
#############
## Python
#############
*.py[co]
# Packages
*.egg
*.egg-info
dist/
build/
eggs/
parts/
var/
sdist/
develop-eggs/
.installed.cfg
# Installer logs
pip-log.txt
# Unit test / coverage reports
.coverage
.tox
#Translations
*.mo
#Mr Developer
.mr.developer.cfg

104
README.md 100644
@@ -0,0 +1,104 @@
# Lieu
_an alternative search engine_
Created in response to the environs of apathy concerning the use of hypertext
search and discovery. In Lieu, the internet is not what is made searchable, but
instead one's own neighbourhood. Put differently, Lieu is a neighbourhood search
engine, a way for personal webrings to increase serendipitous connexions.
## Goals
* Enable serendipitous discovery
* Support personal communities
* Be reusable, easily
## Usage
```
$ lieu help
Lieu: neighbourhood search engine
Commands
- precrawl (scrapes config's general.url for a list of links: <li> elements containing an anchor <a> tag)
- crawl (start crawler, crawls all urls in config's crawler.webring file)
- ingest (ingest crawled data, generates database)
- search (interactive cli for searching the database)
- host (hosts search engine over http)
Example:
lieu precrawl > data/webring.txt
lieu crawl > data/source.txt
lieu ingest
lieu host
```
Lieu's crawl & precrawl commands output to [standard
output](https://en.wikipedia.org/wiki/Standard_streams#Standard_output_(stdout)),
for easy inspection of the data. You typically want to redirect their output to
the files Lieu reads from, as defined in the config file. See below for a
typical workflow.
### Workflow
* Edit the config
* Add domains to crawl in `config.crawler.webring`
* **If you have a webpage with links you want to crawl:**
* Set the config's `url` field to that page
* Populate the list of domains to crawl with `precrawl`: `lieu precrawl > data/webring.txt`
* Crawl: `lieu crawl > data/source.txt`
* Create database: `lieu ingest`
* Host engine: `lieu host`
After ingesting the data with `lieu ingest`, you can also use lieu to search the
corpus in the terminal with `lieu search`.
## Config
The config file is written in [TOML](https://toml.io/en/).
```toml
[general]
name = "Merveilles Webring"
# used by the precrawl command and linked to in /about route
url = "https://webring.xxiivv.com"
port = 10001
[data]
# the source file should contain the crawl command's output
source = "data/crawled.txt"
# location & name of the sqlite database
database = "data/searchengine.db"
# contains words and phrases disqualifying scraped paragraphs from being presented in search results
heuristics = "data/heuristics.txt"
# aka stopwords, in the search engine biz: https://en.wikipedia.org/wiki/Stop_word
wordlist = "data/wordlist.txt"
[crawler]
# manually curated list of domains, or the output of the precrawl command
webring = "data/webring.txt"
# domains that are banned from being crawled but might originally be part of the webring
bannedDomains = "data/banned-domains.txt"
# file suffixes that are banned from being crawled
bannedSuffixes = "data/banned-suffixes.txt"
# phrases and words which won't be scraped (e.g. if contained in a link)
boringWords = "data/boring-words.txt"
# domains that won't be output as outgoing links
boringDomains = "data/boring-domains.txt"
```
For your own use, the following config fields should be customized:
* `name`
* `url`
* `port`
* `source`
* `webring`
* `bannedDomains`
The following config-defined files can stay as-is unless you have specific requirements:
* `database`
* `heuristics`
* `wordlist`
* `bannedSuffixes`
For a full rundown of the files and their various jobs, see the [files
description](docs/files.md).
### License
Source code: `AGPL-3.0-or-later`. Inter is available under the
`SIL OPEN FONT LICENSE Version 1.1`; Noto Serif is licensed under the
`Apache License, Version 2.0`.

125
cli.go 100644
@@ -0,0 +1,125 @@
package main
import (
"bufio"
"fmt"
"lieu/crawler"
"lieu/database"
"lieu/ingest"
"lieu/server"
"lieu/util"
"os"
"strings"
)
const help = `Lieu: neighbourhood search engine
Commands
- precrawl (scrapes config's general.url for a list of links: <li> elements containing an anchor <a> tag)
- crawl (start crawler, crawls all urls in config's crawler.webring file. outputs to stdout)
- ingest (ingest crawled data, generates database)
- search (interactive cli for searching the database)
- host (hosts search engine over http)
Example:
lieu precrawl > data/webring.txt
lieu crawl > data/source.txt
lieu ingest
lieu host
See the configuration file lieu.toml or
https://github.com/cblgh/lieu for more information.
`
func main() {
exists := util.CheckFileExists("lieu.toml")
if !exists {
fmt.Println("lieu: can't find config, saving an example config in the working directory")
util.WriteMockConfig()
fmt.Println("lieu: lieu.toml written to disk")
util.Exit()
}
config := util.ReadConfig()
var cmd string
if len(os.Args) > 1 {
cmd = os.Args[1]
} else {
cmd = "search"
}
switch cmd {
case "help":
fmt.Println(help)
case "precrawl":
if config.General.URL == "https://example.com/" {
fmt.Println("lieu: the url is not set (example.com)")
util.Exit()
}
crawler.Precrawl(config)
case "crawl":
exists := util.CheckFileExists(config.Crawler.Webring)
if !exists {
fmt.Printf("lieu: webring file %s does not exist\n", config.Data.Source)
util.Exit()
}
sourceLen := len(util.ReadList(config.Crawler.Webring, "\n"))
if sourceLen == 0 {
fmt.Printf("lieu: nothing to crawl; the webring file %s is empty\n", config.Data.Source)
util.Exit()
}
crawler.Crawl(config)
case "ingest":
exists := util.CheckFileExists(config.Data.Source)
if !exists {
fmt.Printf("lieu: data source %s does not exist\n", config.Data.Source)
fmt.Println("lieu: try running `lieu crawl`")
util.Exit()
}
sourceLen := len(util.ReadList(config.Data.Source, "\n"))
if sourceLen == 0 {
fmt.Printf("lieu: nothing to ingest; data source %s is empty\n", config.Data.Source)
fmt.Println("lieu: try running `lieu crawl`")
util.Exit()
}
fmt.Println("lieu: creating a new database & initiating ingestion")
ingest.Ingest(config)
case "search":
exists := util.CheckFileExists(config.Data.Database)
if !exists {
util.DatabaseDoesNotExist(config.Data.Database)
}
interactiveMode(config.Data.Database)
case "host":
exists := util.CheckFileExists(config.Data.Database)
if !exists {
util.DatabaseDoesNotExist(config.Data.Database)
}
open := util.CheckPortOpen(config.General.Port)
if !open {
fmt.Printf("lieu: port %d is not open; try another one\n", config.General.Port)
util.Exit()
}
server.Serve(config)
default:
fmt.Println("Lieu: no such command, currently. Try `lieu help`")
}
}
func interactiveMode(databasePath string) {
db := database.InitDB(databasePath)
reader := bufio.NewReader(os.Stdin)
for {
fmt.Printf("> ")
input, err := reader.ReadString('\n')
util.Check(err)
input = strings.TrimSuffix(input, "\n")
pages := database.SearchWordsByScore(db, util.Inflect(strings.Fields(input)))
for _, pageData := range pages {
fmt.Println(pageData.URL)
if len(pageData.About) > 0 {
fmt.Println(pageData.About)
}
}
}
}

244
crawler/crawler.go 100644
@@ -0,0 +1,244 @@
package crawler
import (
"fmt"
"lieu/types"
"lieu/util"
"log"
"net/http"
"net/url"
"regexp"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/gocolly/colly/v2"
"github.com/gocolly/colly/v2/queue"
)
// the following domains are excluded from crawling & indexing, typically because they have a lot of microblog pages
// (very spammy)
func getBannedDomains(path string) []string {
return util.ReadList(path, "\n")
}
func getBannedSuffixes(path string) []string {
return util.ReadList(path, "\n")
}
func getBoringWords(path string) []string {
return util.ReadList(path, "\n")
}
func getBoringDomains(path string) []string {
return util.ReadList(path, "\n")
}
func find(list []string, query string) bool {
for _, item := range list {
if item == query {
return true
}
}
return false
}
func getLink(target string) string {
// remove anchor links
if strings.Contains(target, "#") {
target = strings.Split(target, "#")[0]
}
if strings.Contains(target, "?") {
target = strings.Split(target, "?")[0]
}
target = strings.TrimSpace(target)
target = strings.ToLower(target)
// remove trailing /
return strings.TrimSuffix(target, "/")
}
func getWebringLinks(path string) []string {
var links []string
candidates := util.ReadList(path, "\n")
for _, l := range candidates {
u, err := url.Parse(l)
if err != nil {
continue
}
if u.Scheme == "" {
u.Scheme = "https"
}
links = append(links, u.String())
}
return links
}
func getDomains(links []string) []string {
var domains []string
for _, l := range links {
u, err := url.Parse(l)
if err != nil {
continue
}
domains = append(domains, u.Hostname())
}
return domains
}
func findSuffix(suffixes []string, query string) bool {
for _, suffix := range suffixes {
if strings.HasSuffix(strings.ToLower(query), suffix) {
return true
}
}
return false
}
func cleanText(s string) string {
s = strings.TrimSpace(s)
s = strings.ReplaceAll(s, "\n", " ")
s = strings.ReplaceAll(s, "|", " ")
whitespace := regexp.MustCompile(`\p{Z}`)
s = whitespace.ReplaceAllString(s, " ")
return s
}
func handleIndexing(c *colly.Collector) {
c.OnHTML("meta[name=\"keywords\"]", func(e *colly.HTMLElement) {
fmt.Println("keywords", cleanText(e.Attr("content")), e.Request.URL)
})
c.OnHTML("meta[name=\"description\"]", func(e *colly.HTMLElement) {
desc := cleanText(e.Attr("content"))
if len(desc) > 0 {
fmt.Println("desc", desc, e.Request.URL)
}
})
c.OnHTML("html[lang]", func(e *colly.HTMLElement) {
lang := cleanText(e.Attr("lang"))
if len(lang) > 0 {
fmt.Println("lang", lang, e.Request.URL)
}
})
// get page title
c.OnHTML("title", func(e *colly.HTMLElement) {
fmt.Println("title", cleanText(e.Text), e.Request.URL)
})
c.OnHTML("body", func(e *colly.HTMLElement) {
paragraph := cleanText(e.DOM.Find("p").First().Text())
if len(paragraph) < 1500 && len(paragraph) > 0 {
fmt.Println("para", paragraph, e.Request.URL)
}
// get all relevant page headings
collectHeadingText("h1", e)
collectHeadingText("h2", e)
collectHeadingText("h3", e)
})
}
func collectHeadingText(heading string, e *colly.HTMLElement) {
for _, headingText := range e.ChildTexts(heading) {
if len(headingText) < 500 {
fmt.Println(heading, cleanText(headingText), e.Request.URL)
}
}
}
func Precrawl(config types.Config) {
res, err := http.Get(config.General.URL)
util.Check(err)
defer res.Body.Close()
if res.StatusCode != 200 {
log.Fatal("status not 200")
}
doc, err := goquery.NewDocumentFromReader(res.Body)
util.Check(err)
items := make([]string, 0)
doc.Find("li").Each(func(i int, s *goquery.Selection) {
if domain, exists := s.Find("a").Attr("href"); exists {
items = append(items, domain)
}
})
BANNED := getBannedDomains(config.Crawler.BannedDomains)
for _, item := range items {
link := getLink(item)
u, err := url.Parse(link)
// invalid link
if err != nil {
continue
}
domain := u.Hostname()
if find(BANNED, domain) {
continue
}
fmt.Println(link)
}
}
func Crawl(config types.Config) {
SUFFIXES := getBannedSuffixes(config.Crawler.BannedSuffixes)
links := getWebringLinks(config.Crawler.Webring)
domains := getDomains(links)
// compare hostnames rather than the full webring url, so the checks below can match
parsedURL, err := url.Parse(config.General.URL)
util.Check(err)
initialDomain := parsedURL.Hostname()
// TODO: introduce c2 for scraping links (with depth 1) linked to from webring domains
// instantiate default collector
c := colly.NewCollector(
colly.MaxDepth(3),
)
q, _ := queue.New(
5, /* threads */
&queue.InMemoryQueueStorage{MaxSize: 100000},
)
for _, link := range links {
q.AddURL(link)
}
c.AllowedDomains = domains
c.AllowURLRevisit = false
c.DisallowedDomains = getBannedDomains(config.Crawler.BannedDomains)
delay, _ := time.ParseDuration("200ms")
c.Limit(&colly.LimitRule{DomainGlob: "*", Delay: delay, Parallelism: 3})
boringDomains := getBoringDomains(config.Crawler.BoringDomains)
boringWords := getBoringWords(config.Crawler.BoringWords)
// on every a element which has an href attribute, call callback
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
link := getLink(e.Attr("href"))
if findSuffix(SUFFIXES, link) {
return
}
link = e.Request.AbsoluteURL(link)
u, err := url.Parse(link)
// log which site links to what
if err == nil && !util.Contains(boringWords, link) && !util.Contains(boringDomains, link) {
outgoingDomain := u.Hostname()
currentDomain := e.Request.URL.Hostname()
if !find(domains, outgoingDomain) {
fmt.Println("non-webring-link", link, e.Request.URL)
// solidarity! someone in the webring linked to someone else in it
} else if outgoingDomain != currentDomain && outgoingDomain != initialDomain && currentDomain != initialDomain {
fmt.Println("webring-link", link, e.Request.URL)
}
}
// only visits links from AllowedDomains
q.AddURL(link)
})
handleIndexing(c)
// start scraping
q.Run(c)
}

data/banned-domains.txt 100644

17
data/banned-suffixes.txt 100644

@@ -0,0 +1,17 @@
.xml
.pdf
.rss
.jpg
.png
.gif
.avi
.webm
.mp4
.ogg
.mp3
.zip
.exe
.txt
.asc
.key
.csv

19
data/boring-domains.txt 100644

@@ -0,0 +1,19 @@
instagram.com
twitter.com
linkedin.com
facebook.com
getpoole.com
jekyllrb.com
twitter.com
amazon.com
google.com
microsoft.com
youtube.com
github.io
meetup.com
ebay.com
t.co
a.co
wsj.com
creativecommons.org
patreon.com

4
data/boring-words.txt 100644

@@ -0,0 +1,4 @@
bitcoin
javascript:
mailto:
subscribe

0
data/crawled.txt 100644

10
data/heuristics.txt 100644

@@ -0,0 +1,10 @@
incoming
tagged
edited
updated
last update
last edit
©
(c)
all rights reserved
licensed under

0
data/webring.txt 100644

File diff suppressed because one or more lines are too long

222
database/database.go 100644

@@ -0,0 +1,222 @@
package database
/* example queries
SELECT p.url
FROM inv_index inv
INNER JOIN pages p ON p.url = inv.url
WHERE inv.word = "project";

SELECT url FROM inv_index WHERE word = "esoteric" GROUP BY url ORDER BY SUM(score) DESC LIMIT 15;

SELECT url FROM inv_index WHERE word = "<word>" GROUP BY url ORDER BY SUM(score) DESC;
*/
import (
"database/sql"
"fmt"
"lieu/types"
"lieu/util"
"log"
"net/url"
"strings"
_ "github.com/mattn/go-sqlite3"
)
func InitDB(filepath string) *sql.DB {
db, err := sql.Open("sqlite3", filepath)
if err != nil {
log.Fatalln(err)
}
if db == nil {
log.Fatalln("db is nil")
}
createTables(db)
return db
}
func createTables(db *sql.DB) {
// create the table if it doesn't exist
queries := []string{`
CREATE TABLE IF NOT EXISTS domains (
id INTEGER PRIMARY KEY AUTOINCREMENT,
domain TEXT NOT NULL UNIQUE
);
`,
`
CREATE TABLE IF NOT EXISTS pages (
id INTEGER PRIMARY KEY AUTOINCREMENT,
url TEXT NOT NULL UNIQUE,
title TEXT,
about TEXT,
lang TEXT,
domain TEXT NOT NULL,
FOREIGN KEY(domain) REFERENCES domains(domain)
);
`,
`
CREATE TABLE IF NOT EXISTS external_pages (
id INTEGER PRIMARY KEY AUTOINCREMENT,
url TEXT NOT NULL UNIQUE,
domain TEXT NOT NULL,
title TEXT
);
`,
`
CREATE TABLE IF NOT EXISTS inv_index (
word TEXT NOT NULL,
score INTEGER NOT NULL,
url TEXT NOT NULL,
FOREIGN KEY(url) REFERENCES pages(url)
)`,
}
for _, query := range queries {
if _, err := db.Exec(query); err != nil {
log.Fatalln(err)
}
}
}
/* TODO: filters
lang:en|fr|en|<..>
site:wiki.xxiivv.com, site:cblgh.org
nosite:excluded-domain.com
"word1 word2 word3" strict query
query params:
&order=score, &order=count
&outgoing=true
*/
func SearchWordsByScore(db *sql.DB, words []string) []types.PageData {
return searchWords(db, words, true)
}
func SearchWordsByCount(db *sql.DB, words []string) []types.PageData {
return searchWords(db, words, false)
}
func GetDomainCount(db *sql.DB) int {
return countQuery(db, "domains")
}
func GetPageCount(db *sql.DB) int {
return countQuery(db, "pages")
}
func GetWordCount(db *sql.DB) int {
return countQuery(db, "inv_index")
}
func GetRandomPage(db *sql.DB) string {
rows, err := db.Query("SELECT url FROM pages ORDER BY RANDOM() LIMIT 1;")
util.Check(err)
var link string
for rows.Next() {
err = rows.Scan(&link)
util.Check(err)
}
return link
}
func countQuery(db *sql.DB, table string) int {
rows, err := db.Query(fmt.Sprintf("SELECT COUNT(*) FROM %s;", table))
util.Check(err)
var count int
for rows.Next() {
err = rows.Scan(&count)
util.Check(err)
}
return count
}
func searchWords(db *sql.DB, words []string, searchByScore bool) []types.PageData {
var wordlist []string
var args []interface{}
for _, word := range words {
wordlist = append(wordlist, "word = ?")
args = append(args, strings.ToLower(word))
}
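// rank pages by the summed score of all matched words (the default), or by the number of matched words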
orderType := "SUM(score)"
if !searchByScore {
orderType = "COUNT(*)"
}
query := fmt.Sprintf(`
SELECT p.url, p.about, p.title
FROM inv_index inv INNER JOIN pages p ON inv.url = p.url
WHERE %s
GROUP BY inv.url
ORDER BY %s
DESC
LIMIT 15
`, strings.Join(wordlist, " OR "), orderType)
stmt, err := db.Prepare(query)
util.Check(err)
defer stmt.Close()
rows, err := stmt.Query(args...)
util.Check(err)
var pageData types.PageData
var pages []types.PageData
for rows.Next() {
if err := rows.Scan(&pageData.URL, &pageData.About, &pageData.Title); err != nil {
log.Fatalln(err)
}
pages = append(pages, pageData)
}
return pages
}
func InsertManyDomains(db *sql.DB, pages []types.PageData) {
values := make([]string, 0, len(pages))
args := make([]interface{}, 0, len(pages))
for _, b := range pages {
values = append(values, "(?)")
u, err := url.Parse(b.URL)
util.Check(err)
args = append(args, u.Hostname())
}
stmt := fmt.Sprintf(`INSERT OR IGNORE INTO domains(domain) VALUES %s`, strings.Join(values, ","))
_, err := db.Exec(stmt, args...)
util.Check(err)
}
func InsertManyPages(db *sql.DB, pages []types.PageData) {
values := make([]string, 0, len(pages))
args := make([]interface{}, 0, len(pages))
for _, b := range pages {
// url, title, lang, about, domain
values = append(values, "(?, ?, ?, ?, ?)")
u, err := url.Parse(b.URL)
util.Check(err)
args = append(args, b.URL, b.Title, b.Lang, b.About, u.Hostname())
}
stmt := fmt.Sprintf(`INSERT OR IGNORE INTO pages(url, title, lang, about, domain) VALUES %s`, strings.Join(values, ","))
_, err := db.Exec(stmt, args...)
util.Check(err)
}
func InsertManyWords(db *sql.DB, batch []types.SearchFragment) {
values := make([]string, 0, len(batch))
args := make([]interface{}, 0, len(batch))
for _, b := range batch {
pageurl := strings.TrimSuffix(b.URL, "/")
values = append(values, "(?, ?, ?)")
args = append(args, b.Word, pageurl, b.Score)
}
stmt := fmt.Sprintf(`INSERT OR IGNORE INTO inv_index(word, url, score) VALUES %s`, strings.Join(values, ","))
_, err := db.Exec(stmt, args...)
util.Check(err)
}

121
docs/files.md 100644
@@ -0,0 +1,121 @@
# Files
_what the purposes are of all those damn files_
Lieu is based on a few files, which in turn configure various behaviours in the
**crawler** (visits urls & extracts relevant elements) and the **ingester**
(converts the crawled source data into database fields). The basic reason is to
minimize hardcoded assumptions in the source, furthering Lieu's reusability.
Below, I will refer to the files by their config-defined names. Here's the
config example from the [README](../README.md) again.
```toml
[general]
name = "Merveilles Webring"
# used by the precrawl command and linked to in /about route
url = "https://webring.xxiivv.com"
port = 10001
[data]
# the source file should contain the crawl command's output
source = "data/crawled.txt"
# location & name of the sqlite database
database = "data/searchengine.db"
# contains words and phrases disqualifying scraped paragraphs from being presented in search results
heuristics = "data/heuristics.txt"
# aka stopwords, in the search engine biz: https://en.wikipedia.org/wiki/Stop_word
wordlist = "data/wordlist.txt"
[crawler]
# manually curated list of domains, or the output of the precrawl command
webring = "data/webring.txt"
# domains that are banned from being crawled but might originally be part of the webring
bannedDomains = "data/banned-domains.txt"
# file suffixes that are banned from being crawled
bannedSuffixes = "data/banned-suffixes.txt"
# phrases and words which won't be scraped (e.g. if contained in a link)
boringWords = "data/boring-words.txt"
# domains that won't be output as outgoing links
boringDomains = "data/boring-domains.txt"
```
## HTML
Before we start, a quick note on the other types of files in use. The HTML
templates, used when presenting the search engine in the browser, are all
available in the [`html`](../html) folder. The includes—currently only css
& font files—are available in [`html/assets`](../html/assets).
## `[crawler]`
#### `webring`
Defines which domains will be crawled for pages. As of this writing, no domains
outside of this file will be crawled.
You can populate the `webring` file manually or by precrawling an existing
webpage that contains all of the domains you want to crawl:
lieu precrawl > data/webring.txt
#### `bannedDomains`
A list of domains that will not be crawled. This means that if they are present
in the `webring` file, they will be skipped over as candidates for crawling.
The rationale is that some of the domains of a webring may be unsuitable for ingestion
into the database. I typically find this is the case for domains that include
microblogs with hundreds or thousands of one-line pages—needlessly gunking up the search
results without providing anything of interest outside the individual creating
the logs.
#### `bannedSuffixes`
Eliminates html links that end with suffixes present in this file. Typically I want
to avoid crawling links to media formats such as `.mp4`, and other types of
non-html documents.
It's fine to leave this file intact with its defaults.
#### `boringWords`
This file is a bit more specific. It contains words which, if present in a link,
will prevent the link from being logged. The reason is that it suggests the
link target is boring—irrelevant for this application of the search engine.
These can be `javascript:` links, or other types of content that are less
relevant to the focus of the search engine & webring.
Link data of this type is as yet unused in Lieu's ingestion.
#### `boringDomains`
Like `boringWords` except it contains a list of domains which are banned from
having their links be logged, typically because they are deemed less relevant
for the focus of the search engine.
Link data of this type is as yet unused in Lieu's ingestion.
## `[data]`
#### `source`
Contains the linewise data that was produced by the crawler. The first word
identifies the type of data and the last word identifies the page the data
originated from.
Example:
```
h2 Prelude https://cblgh.org/articles/four-nights-in-tornio.html
```
* An `<h2>` tag was scraped,
* its contents were `Prelude`, and
* the originating article was https://cblgh.org/articles/four-nights-in-tornio.html
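To make the format concrete, here's a minimal Go sketch of how such a line can
be split apart. It mirrors the first-space/last-space slicing done by the
ingester in `ingest/ingest.go`; `parseLine` is a hypothetical helper name for
illustration, not part of Lieu's source.

```go
package main

import (
	"fmt"
	"strings"
)

// parseLine splits a crawl line into its data type token, its payload, and
// the url of the page it originated from; a minimal sketch mirroring how
// the ingester slices each line at the first and last space.
func parseLine(line string) (token, payload, pageurl string, ok bool) {
	firstSpace := strings.Index(line, " ")
	lastSpace := strings.LastIndex(line, " ")
	if firstSpace == -1 || firstSpace == lastSpace {
		return "", "", "", false
	}
	token = line[:firstSpace]
	payload = strings.TrimSpace(line[firstSpace:lastSpace])
	pageurl = strings.TrimSpace(line[lastSpace:])
	return token, payload, pageurl, true
}

func main() {
	token, payload, pageurl, ok := parseLine("h2 Prelude https://cblgh.org/articles/four-nights-in-tornio.html")
	if ok {
		fmt.Printf("%s | %s | %s\n", token, payload, pageurl)
		// h2 | Prelude | https://cblgh.org/articles/four-nights-in-tornio.html
	}
}
```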
#### `database`
The location the sqlite3 database will be created & read from.
#### `heuristics`
Heuristics contains a list of words or phrases which disqualify scraped
paragraphs from being used as descriptive text in Lieu's search results. Typically
excluded are paragraphs which contain copyright symbols—as that indicates we
have scraped the bottom-most paragraph, i.e. the page was likely a short stub,
with a better content description elsewhere.
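As a sketch of how the heuristic can be applied, assuming the disallowed list
has already been read from the `heuristics` file (`acceptableAbout` is a
hypothetical name mirroring `performAboutHeuristic` in `ingest/ingest.go`):

```go
package main

import (
	"fmt"
	"strings"
)

// acceptableAbout reports whether a scraped paragraph qualifies as a page
// description: it has to be longer than 20 characters and must not contain
// any of the disallowed words or phrases.
func acceptableAbout(disallowed []string, phrase string) bool {
	if len(phrase) <= 20 {
		return false
	}
	for _, heuristic := range disallowed {
		if strings.Contains(phrase, heuristic) {
			return false
		}
	}
	return true
}

func main() {
	disallowed := []string{"all rights reserved", "licensed under"}
	fmt.Println(acceptableAbout(disallowed, "a blog about plants, bread & mesh networks")) // true
	fmt.Println(acceptableAbout(disallowed, "© 2021, all rights reserved"))                // false
}
```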
#### `wordlist`
Also known as [stopwords](https://en.wikipedia.org/wiki/Stop_word)—words which
are stopped from entering the search index. The default wordlist consists of the
1000 or so most common English words, albeit curated slightly to still allow for
interesting concepts and verbs—such as `reading` and `books`.
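For illustration, a minimal sketch of how the wordlist is applied during
ingestion: single-letter words and stopwords are dropped, and the rest are
reduced to their singular form (mirroring `filterCommonWords` in
`ingest/ingest.go`).

```go
package main

import (
	"fmt"
	"strings"

	"github.com/jinzhu/inflection"
)

// filterCommonWords drops single-letter words and words found in the
// stopword list, and singularizes the remainder before it enters the
// search index.
func filterCommonWords(words, wordlist []string) []string {
	var filtered []string
	for _, word := range words {
		skip := len(word) == 1
		for _, common := range wordlist {
			if word == common {
				skip = true
				break
			}
		}
		if skip {
			continue
		}
		filtered = append(filtered, inflection.Singular(word))
	}
	return filtered
}

func main() {
	stopwords := []string{"the", "of", "a"}
	fmt.Println(filterCommonWords(strings.Fields("the history of books"), stopwords))
	// output: [history book]
}
```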

11
go.mod 100644
@@ -0,0 +1,11 @@
module lieu
go 1.14
require (
github.com/PuerkitoBio/goquery v1.5.1
github.com/gocolly/colly/v2 v2.1.0
github.com/jinzhu/inflection v1.0.0
github.com/komkom/toml v0.0.0-20210129103441-ff0648d25a4b
github.com/mattn/go-sqlite3 v1.14.6
)

144
go.sum 100644
@@ -0,0 +1,144 @@
cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE=
github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
github.com/anaskhan96/soup v1.2.4 h1:or+sKs9QbzJGZVTYFmTs2VBateEywoq00a6K14z331E=
github.com/anaskhan96/soup v1.2.4/go.mod h1:6YnEp9A2yywlYdM4EgDz9NEHclocMepEtku7wg6Cq3s=
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/andybalholm/cascadia v1.2.0 h1:vuRCkM5Ozh/BfmsaTm26kbjm0mIOM3yS5Ek/F5h18aE=
github.com/andybalholm/cascadia v1.2.0/go.mod h1:YCyR8vOZT9aZ1CHEd8ap0gMVm2aFgxBp0T0eFw1RUQY=
github.com/antchfx/htmlquery v1.2.3 h1:sP3NFDneHx2stfNXCKbhHFo8XgNjCACnU/4AO5gWz6M=
github.com/antchfx/htmlquery v1.2.3/go.mod h1:B0ABL+F5irhhMWg54ymEZinzMSi0Kt3I2if0BLYa3V0=
github.com/antchfx/xmlquery v1.2.4 h1:T/SH1bYdzdjTMoz2RgsfVKbM5uWh3gjDYYepFqQmFv4=
github.com/antchfx/xmlquery v1.2.4/go.mod h1:KQQuESaxSlqugE2ZBcM/qn+ebIpt+d+4Xx7YcSGAIrM=
github.com/antchfx/xpath v1.1.6/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk=
github.com/antchfx/xpath v1.1.8 h1:PcL6bIX42Px5usSx6xRYw/wjB3wYGkj0MJ9MBzEKVgk=
github.com/antchfx/xpath v1.1.8/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk=
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8=
github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI=
github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA=
github.com/gocolly/colly/v2 v2.1.0 h1:k0DuZkDoCsx51bKpRJNEmcxcp+W5N8ziuwGaSDuFoGs=
github.com/gocolly/colly/v2 v2.1.0/go.mod h1:I2MuhsLjQ+Ex+IzK3afNS8/1qP3AedHOusRPcRdC5o0=
github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q=
github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e h1:1r7pUrabqp18hOBcwBwiTsbnFeTZHV9eER/QT5JVZxY=
github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A=
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8=
github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA=
github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs=
github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w=
github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0=
github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8=
github.com/golang/protobuf v1.4.2 h1:+Z5KGCizgyZCbGh1KZqA0fcLLkwbsjIzS4aV2v7wJX0=
github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI=
github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/jawher/mow.cli v1.1.0/go.mod h1:aNaQlc7ozF3vw6IJ2dHjp2ZFiA4ozMIYY6PyuRJwlUg=
github.com/jinzhu/inflection v1.0.0 h1:K317FqzuhWc8YvSVlFMCCUb36O/S9MCKRDI7QkRKD/E=
github.com/jinzhu/inflection v1.0.0/go.mod h1:h+uFLlag+Qp1Va5pdKtLDYj+kHp5pxUVkryuEj+Srlc=
github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
github.com/komkom/toml v0.0.0-20210129103441-ff0648d25a4b h1:UmqyLHqfYJjkiuA2hddGeovwAGOCBm5gOTVKuxtvoMo=
github.com/komkom/toml v0.0.0-20210129103441-ff0648d25a4b/go.mod h1:wLcNqnyr6riTbnFObg4o2/GemTCso9AnsUdLsMsdspw=
github.com/mattn/go-sqlite3 v1.14.6 h1:dNPt6NO46WmLVt2DLNpwczCmdV5boIZ6g/tlDrlRUbg=
github.com/mattn/go-sqlite3 v1.14.6/go.mod h1:NyWgC/yNuGj7Q9rpYnZvas74GogHl5/Z4A/KQRfk6bU=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca h1:NugYot0LIVPxTvN8n+Kvkn6TrbMyxQiuvKdEwFdR9vI=
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/temoto/robotstxt v1.1.1 h1:Gh8RCs8ouX3hRSxxK7B1mO5RFByQ4CmJZDwgom++JaA=
github.com/temoto/robotstxt v1.1.1/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU=
golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200114155413-6afb5195e5aa/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
golang.org/x/net v0.0.0-20200602114024-627f9648deb9 h1:pNX+40auqi2JqRfOP1akLGtYcn15TUbkhwuCO3foqqM=
golang.org/x/net v0.0.0-20200602114024-627f9648deb9/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
golang.org/x/net v0.0.0-20201021035429-f5854403a974 h1:IX6qOQeG5uLjB/hjjwjedwfjND0hgjPMMyO1RoIXQNI=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
golang.org/x/text v0.3.3 h1:cokOdA+Jmi5PJGXLlLllQSgYigAEfHXJAERHVMaCc2k=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY=
golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q=
golang.org/x/tools v0.0.0-20190606124116-d0a3d012864b/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20210114065538-d78b04bdf963/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
google.golang.org/appengine v1.6.6 h1:lMO5rYAqUxkmaj76jAkRUvt5JZgFymx/+Q5Mzfivuhc=
google.golang.org/appengine v1.6.6/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc=
google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc=
google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc=
google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo=
google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c=
google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg=
google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk=
google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8=
google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0=
google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM=
google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE=
google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo=
google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
google.golang.org/protobuf v1.24.0 h1:UhZDfRO8JRQru4/+LlLE0BRKGF8L+PICnvYZmx/fEGA=
google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=

Binary file not shown.

Binary file not shown.

Binary file not shown.

@@ -0,0 +1,24 @@
@import url("base.css");
html {
max-width: 31rem;
}
h1 {
font-size: 3rem;
margin-bottom: 0.5rem;
}
h2 {
font-family: "Noto Serif";
font-style: italic;
font-weight: 400;
font-size: 1.5rem;
margin-top: 0;
margin-bottom: 2rem;
}
.lieu {
font-family: "Noto Serif";
font-weight: 400;
}

@@ -0,0 +1,162 @@
@import url('inter-ui-web/inter-ui.css');
@font-face {
font-family: "Noto Serif";
src: url("NotoSerif-Bold.ttf");
}
@font-face {
font-family: "Noto Serif";
font-weight: 400;
src: url("NotoSerif-Regular.ttf");
}
@font-face {
font-family: "Noto Serif";
font-weight: 400;
font-style: italic;
src: url("NotoSerif-Italic.ttf");
}
:root {
--primary: #fefefe;
--secondary: #000;
/* alt colorscheme: 1 */
/* --primary: red; */
/* --secondary: #fefefe; */
/* alt colorscheme: 2 */
/* --primary: #F35363; */
/* --secondary: black; */
}
li {
list-style-type: circle;
}
ul {
margin: 0;
padding-left: 1rem;
}
html {
font-family: "Inter UI", sans-serif;
background: var(--secondary);
color: var(--primary);
max-width: 650px;
padding-bottom: 2rem;
padding-left: 2rem;
margin-top: 2rem;
}
body {
margin: 0;
}
h1 {
font-family: "Noto Serif";
font-weight: 400;
font-size: 3rem;
margin-bottom: 1rem;
margin-top: 0;
}
h1 > a, h1 > a:hover {
border-bottom: none;
}
a {
cursor: pointer;
color: var(--primary);
text-decoration: none;
border-bottom: 0.1rem solid var(--primary);
word-wrap: break-word;
}
a:hover {
border-bottom-style: dotted;
}
p {
hyphens: auto;
margin-bottom: 1.5rem;
}
.entry {
-webkit-column-break-inside: avoid;
-moz-column-break-inside:avoid;
-moz-page-break-inside:avoid;
page-break-inside: avoid;
break-inside: avoid-column;
}
.search-container {
display: grid;
margin-bottom: 2rem;
height: 2.5rem;
align-items: center;
grid-template-columns: 16rem 3rem;
grid-auto-flow: column;
grid-column-gap: .5rem;
}
.search-box {
font-size: 1rem;
border-radius: 0.1rem;
padding: .5rem;
padding-left: 0.75rem;
border: 0;
color: var(--secondary);
background: var(--primary);
}
.search-button {
font-size: 2rem;
color: var(--primary);
background: var(--secondary);
border: 0;
cursor: pointer;
border-radius: 2px;
transition: opacity 150ms;
}
.search-button:hover {
opacity: 0.5;
transition: opacity 150ms;
}
.about-link {
position: absolute;
top: 1rem;
right: 1rem;
font-style: normal;
}
@media
only screen
and (min-device-width : 320px)
and (max-device-width : 720px)
{
html {
padding-left: 0.75rem;
padding-right: 0.75rem;
font-size: 30pt;
max-width: 100vw;
}
}
@media
only screen
and (min-device-width : 320px)
and (max-device-width : 374px) {
html {
font-size: 40pt;
}
}
/*
@media(prefers-color-scheme: light) {
:root {
--primary: #000;
--secondary: #fefefe;
}
}
*/

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

@@ -0,0 +1,92 @@
Copyright (c) 2016-2018 The Inter UI Project Authors (me@rsms.me)
This Font Software is licensed under the SIL Open Font License, Version 1.1.
This license is copied below, and is also available with a FAQ at:
http://scripts.sil.org/OFL
-----------------------------------------------------------
SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007
-----------------------------------------------------------
PREAMBLE
The goals of the Open Font License (OFL) are to stimulate worldwide
development of collaborative font projects, to support the font creation
efforts of academic and linguistic communities, and to provide a free and
open framework in which fonts may be shared and improved in partnership
with others.
The OFL allows the licensed fonts to be used, studied, modified and
redistributed freely as long as they are not sold by themselves. The
fonts, including any derivative works, can be bundled, embedded,
redistributed and/or sold with any software provided that any reserved
names are not used by derivative works. The fonts and derivatives,
however, cannot be released under any other type of license. The
requirement for fonts to remain under this license does not apply
to any document created using the fonts or their derivatives.
DEFINITIONS
"Font Software" refers to the set of files released by the Copyright
Holder(s) under this license and clearly marked as such. This may
include source files, build scripts and documentation.
"Reserved Font Name" refers to any names specified as such after the
copyright statement(s).
"Original Version" refers to the collection of Font Software components as
distributed by the Copyright Holder(s).
"Modified Version" refers to any derivative made by adding to, deleting,
or substituting -- in part or in whole -- any of the components of the
Original Version, by changing formats or by porting the Font Software to a
new environment.
"Author" refers to any designer, engineer, programmer, technical
writer or other person who contributed to the Font Software.
PERMISSION AND CONDITIONS
Permission is hereby granted, free of charge, to any person obtaining
a copy of the Font Software, to use, study, copy, merge, embed, modify,
redistribute, and sell modified and unmodified copies of the Font
Software, subject to the following conditions:
1) Neither the Font Software nor any of its individual components,
in Original or Modified Versions, may be sold by itself.
2) Original or Modified Versions of the Font Software may be bundled,
redistributed and/or sold with any software, provided that each copy
contains the above copyright notice and this license. These can be
included either as stand-alone text files, human-readable headers or
in the appropriate machine-readable metadata fields within text or
binary files as long as those fields can be easily viewed by the user.
3) No Modified Version of the Font Software may use the Reserved Font
Name(s) unless explicit written permission is granted by the corresponding
Copyright Holder. This restriction only applies to the primary font name as
presented to the users.
4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font
Software shall not be used to promote, endorse or advertise any
Modified Version, except to acknowledge the contribution(s) of the
Copyright Holder(s) and the Author(s) or with their explicit written
permission.
5) The Font Software, modified or unmodified, in part or in whole,
must be distributed entirely under this license, and must not be
distributed under any other license. The requirement for fonts to
remain under this license does not apply to any document created
using the Font Software.
TERMINATION
This license becomes null and void if any of the above conditions are
not met.
DISCLAIMER
THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE
COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL
DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM
OTHER DEALINGS IN THE FONT SOFTWARE.

@@ -0,0 +1,13 @@
@font-face {
font-family: 'Inter UI';
font-style: normal;
src: url("Inter-UI-Regular.woff2") format("woff2"),
url("Inter-UI-Regular.woff") format("woff");
}
@font-face {
font-family: 'Inter UI';
font-style: italic;
src: url("Inter-UI-Italic.woff2") format("woff2"),
url("Inter-UI-Italic.woff") format("woff");
}

@@ -0,0 +1,27 @@
@import url('base.css');
main {
columns: 2;
}
.entry {
-webkit-column-break-inside: avoid;
-moz-column-break-inside:avoid;
-moz-page-break-inside:avoid;
page-break-inside: avoid;
break-inside: avoid-column;
}
.link {
font-style: italic;
}
@media
only screen
and (min-device-width : 320px)
and (max-device-width : 720px)
{
main {
columns: 1 !important;
}
}

@@ -0,0 +1,24 @@
@import url("about.css");
html {
max-width: 100vw;
}
h2 {
margin-bottom: 1rem;
}
main {
display: grid;
justify-items: center;
align-items: center;
margin-top: 10rem;
}
.search-container {
grid-template-columns: 19rem 3rem;
}
.lieu-container {
justify-items: start;
}

205
ingest/ingest.go 100644
@@ -0,0 +1,205 @@
package ingest
import (
"bufio"
"database/sql"
"fmt"
"lieu/database"
"lieu/types"
"lieu/util"
"log"
"net/url"
"os"
"regexp"
"strings"
"github.com/jinzhu/inflection"
)
func partitionSentence(s string) []string {
punctuation := regexp.MustCompile(`\p{P}`)
whitespace := regexp.MustCompile(`\p{Z}`)
invisible := regexp.MustCompile(`\p{C}`)
symbols := regexp.MustCompile(`\p{S}`)
s = punctuation.ReplaceAllString(s, " ")
s = whitespace.ReplaceAllString(s, " ")
s = invisible.ReplaceAllString(s, " ")
s = symbols.ReplaceAllString(s, " ")
s = strings.ReplaceAll(s, "|", " ")
s = strings.ReplaceAll(s, "/", " ")
return strings.Fields(s)
}
func filterCommonWords(words, wordlist []string) []string {
var filtered []string
for _, word := range words {
// ingested word was too common, skip it
if len(word) == 1 || find(wordlist, word) {
continue
}
filtered = append(filtered, inflection.Singular(word))
}
return filtered
}
func find(slice []string, sought string) bool {
for _, item := range slice {
if item == sought {
return true
}
}
return false
}
func performAboutHeuristic(heuristicPath, phrase string) bool {
disallowed := util.ReadList(heuristicPath, "\n")
ok := !util.Contains(disallowed, phrase)
return ok && len(phrase) > 20
}
func Ingest(config types.Config) {
if _, err := os.Stat(config.Data.Database); err == nil || os.IsExist(err) {
err = os.Remove(config.Data.Database)
util.Check(err)
}
db := database.InitDB(config.Data.Database)
wordlist := util.ReadList(config.Data.Wordlist, "|")
buf, err := os.Open(config.Data.Source)
util.Check(err)
defer func() {
err = buf.Close()
util.Check(err)
}()
pages := make(map[string]types.PageData)
var count int
var batchsize = 100
batch := make([]types.SearchFragment, 0, 0)
scanner := bufio.NewScanner(buf)
for scanner.Scan() {
line := scanner.Text()
firstSpace := strings.Index(line, " ")
lastSpace := strings.LastIndex(line, " ")
if len(line) == 0 || firstSpace == -1 {
continue
}
pageurl := strings.ToLower(strings.TrimSuffix(strings.TrimSpace(line[lastSpace:]), "/"))
if !strings.HasPrefix(pageurl, "http") {
continue
}
var page types.PageData
if data, exists := pages[pageurl]; exists {
page = data
} else {
page.URL = pageurl
}
token := line[0:firstSpace]
rawdata := strings.TrimSpace(line[firstSpace:lastSpace])
payload := strings.ToLower(rawdata)
var processed []string
score := 1
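// weight a fragment by where it was found: headings score highest, then the title; body text and keywords default to 1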
switch token {
case "title":
if len(page.About) == 0 {
page.About = rawdata
}
score = 5
page.Title = rawdata
processed = partitionSentence(payload)
case "h1":
if len(page.About) == 0 {
page.About = rawdata
}
fallthrough
case "h2":
fallthrough
case "h3":
score = 15
processed = partitionSentence(payload)
case "desc":
if len(page.About) < 30 && len(rawdata) < 100 {
page.About = rawdata
}
processed = partitionSentence(payload)
case "para":
if performAboutHeuristic(config.Data.Heuristics, payload) {
page.About = rawdata
}
processed = partitionSentence(payload)
case "lang":
page.Lang = rawdata
case "keywords":
processed = strings.Split(strings.ReplaceAll(payload, ", ", ","), ",")
default:
continue
}
pages[pageurl] = page
processed = filterCommonWords(processed, wordlist)
count += len(processed)
for _, word := range processed {
batch = append(batch, types.SearchFragment{Word: word, URL: pageurl, Score: score})
}
if token == "title" {
// only extract path segments once per url.
// we do it here because every page is virtually guaranteed to have a title attr &
// it only appears once
for _, word := range extractPathSegments(pageurl) {
batch = append(batch, types.SearchFragment{Word: word, URL: pageurl, Score: 2})
}
}
if len(pages) > batchsize {
ingestBatch(db, batch, pages)
batch = make([]types.SearchFragment, 0, 0)
// TODO: make sure we don't partially insert any page data
pages = make(map[string]types.PageData)
}
}
// ingest the final batch, which may not have filled up completely
if len(pages) > 0 {
ingestBatch(db, batch, pages)
}
fmt.Printf("ingested %d words\n", count)
err = scanner.Err()
util.Check(err)
}
func ingestBatch(db *sql.DB, batch []types.SearchFragment, pageMap map[string]types.PageData) {
pages := make([]types.PageData, len(pageMap))
i := 0
for k := range pageMap {
pages[i] = pageMap[k]
i++
}
log.Println("starting to ingest batch")
database.InsertManyDomains(db, pages)
database.InsertManyPages(db, pages)
database.InsertManyWords(db, batch)
log.Println("finished ingesting batch")
}
func extractPathSegments(pageurl string) []string {
u, err := url.Parse(pageurl)
util.Check(err)
if len(u.Path) == 0 {
return make([]string, 0, 0)
}
s := u.Path
s = strings.TrimSuffix(s, ".html")
s = strings.TrimSuffix(s, ".htm")
s = strings.ReplaceAll(s, "/", " ")
s = strings.ReplaceAll(s, "-", " ")
s = strings.ReplaceAll(s, "_", " ")
s = strings.ToLower(s)
return strings.Fields(s)
}

27
lieu.toml 100644
@@ -0,0 +1,27 @@
[general]
name = "Sweet Webring"
# used by the precrawl command and linked to in /about route
url = "https://example.com/"
port = 10001
[data]
# the source file should contain the crawl command's output
source = "data/crawled.txt"
# location & name of the sqlite database
database = "data/searchengine.db"
# contains words and phrases disqualifying scraped paragraphs from being presented in search results
heuristics = "data/heuristics.txt"
# aka stopwords, in the search engine biz: https://en.wikipedia.org/wiki/Stop_word
wordlist = "data/wordlist.txt"
[crawler]
# manually curated list of domains, or the output of the precrawl command
webring = "data/webring.txt"
# domains that are banned from being crawled but might originally be part of the webring
bannedDomains = "data/banned-domains.txt"
# file suffixes that are banned from being crawled
bannedSuffixes = "data/banned-suffixes.txt"
# phrases and words which won't be scraped (e.g. if contained in a link)
boringWords = "data/boring-words.txt"
# domains that won't be output as outgoing links
boringDomains = "data/boring-domains.txt"

143
server/server.go 100644
@@ -0,0 +1,143 @@
package server
import (
"fmt"
"net/http"
"net/url"
"strings"
"lieu/database"
"lieu/types"
"lieu/util"
"html/template"
// "github.com/shurcooL/vfsgen"
)
type SearchData struct {
Query string
Pages []types.PageData
}
type AboutData struct {
DomainCount int
InstanceName string
PageCount string
TermCount string
FilteredLink string
RingLink string
}
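// when true, each result displays its cleaned-up url in place of the scraped page title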
const useURLTitles = true
func searchRoute(res http.ResponseWriter, req *http.Request, config types.Config) {
// render the empty index page unless a query arrived via GET
words, exists := req.URL.Query()["q"]
if req.Method != http.MethodGet || !exists {
view := template.Must(template.ParseFiles("html/index-template.html"))
var empty interface{}
view.Execute(res, empty)
return
}
query := words[0]
db := database.InitDB(config.Data.Database)
pages := database.SearchWordsByScore(db, util.Inflect(strings.Fields(query)))
if useURLTitles {
for i, pageData := range pages {
prettyURL, err := url.QueryUnescape(strings.TrimPrefix(strings.TrimPrefix(pageData.URL, "http://"), "https://"))
util.Check(err)
pageData.Title = prettyURL
pages[i] = pageData
}
}
view := template.Must(template.ParseFiles("html/search-template.html"))
data := SearchData{
Query: query,
Pages: pages,
}
view.Execute(res, data)
}
func aboutRoute(res http.ResponseWriter, req *http.Request, config types.Config) {
db := database.InitDB(config.Data.Database)
pageCount := util.Humanize(database.GetPageCount(db))
wordCount := util.Humanize(database.GetWordCount(db))
domainCount := database.GetDomainCount(db)
view := template.Must(template.ParseFiles("html/about-template.html"))
data := AboutData{
InstanceName: config.General.Name,
DomainCount: domainCount,
PageCount: pageCount,
TermCount: wordCount,
FilteredLink: "/filtered",
RingLink: config.General.URL,
}
view.Execute(res, data)
}
type ListData struct {
Title string
URLs []types.PageData
}
func filteredRoute(res http.ResponseWriter, req *http.Request, config types.Config) {
view := template.Must(template.ParseFiles("html/list-template.html"))
var URLs []types.PageData
for _, domain := range util.ReadList(config.Crawler.BannedDomains, "\n") {
u, err := url.Parse(domain)
if err != nil {
continue
}
u.Scheme = "https"
p := types.PageData{Title: domain, URL: u.String()}
URLs = append(URLs, p)
}
data := ListData{
Title: "Filtered Domains",
URLs: URLs,
}
view.Execute(res, data)
}
func randomRoute(res http.ResponseWriter, req *http.Request, config types.Config) {
db := database.InitDB(config.Data.Database)
link := database.GetRandomPage(db)
http.Redirect(res, req, link, http.StatusSeeOther)
}
func Serve(config types.Config) {
http.HandleFunc("/about", func(res http.ResponseWriter, req *http.Request) {
aboutRoute(res, req, config)
})
http.HandleFunc("/", func(res http.ResponseWriter, req *http.Request) {
searchRoute(res, req, config)
})
http.HandleFunc("/filtered", func(res http.ResponseWriter, req *http.Request) {
filteredRoute(res, req, config)
})
http.HandleFunc("/random", func(res http.ResponseWriter, req *http.Request) {
randomRoute(res, req, config)
})
fileserver := http.FileServer(http.Dir("html/assets/"))
http.Handle("/links/", http.StripPrefix("/links/", fileserver))
portstr := fmt.Sprintf(":%d", config.General.Port)
fmt.Println("listening on", portstr)
util.Check(http.ListenAndServe(portstr, nil))
}

35
types/types.go 100644
@@ -0,0 +1,35 @@
package types
type SearchFragment struct {
Word string
URL string
Score int
}
type PageData struct {
URL string
Title string
About string
Lang string
}
type Config struct {
General struct {
Name string `json:"name"`
URL string `json:"url"`
Port int `json:"port"`
} `json:"general"`
Data struct {
Source string `json:"source"`
Database string `json:"database"`
Heuristics string `json:"heuristics"`
Wordlist string `json:"wordlist"`
} `json:"data"`
Crawler struct {
Webring string `json:"webring"`
BannedDomains string `json:"bannedDomains"`
BannedSuffixes string `json:"bannedSuffixes"`
BoringWords string `json:"boringWords"`
BoringDomains string `json:"boringDomains"`
} `json:"crawler"`
}

136
util/util.go 100644
@@ -0,0 +1,136 @@
package util
import (
"os"
"bytes"
"encoding/json"
"fmt"
"net"
"io/ioutil"
"log"
"strings"
"lieu/types"
"github.com/jinzhu/inflection"
"github.com/komkom/toml"
)
func Inflect(words []string) []string {
var inflected []string
for _, word := range words {
inflected = append(inflected, inflection.Singular(word))
}
return inflected
}
func Check(err error) {
if err != nil {
log.Fatalln(err)
}
}
func DatabaseDoesNotExist(filepath string) {
fmt.Printf("lieu: database %s does not exist\n", filepath)
fmt.Println("lieu: try running `lieu ingest` if you have already crawled source data")
Exit()
}
func CheckFileExists(path string) bool {
_, err := os.Stat(path)
if err == nil {
return true
}
return os.IsExist(err)
}
func Humanize(n int) string {
// check millions first; otherwise every large number takes the thousands branch
if n > 1000000 {
return fmt.Sprintf("%dm", n/1000000)
} else if n > 1000 {
return fmt.Sprintf("%dk", n/1000)
}
// string(n) would interpret n as a rune; format it as a decimal number instead
return fmt.Sprintf("%d", n)
}
func Contains(arr []string, query string) bool {
for _, item := range arr {
if strings.Contains(query, item) {
return true
}
}
return false
}
func ReadList(filepath, sep string) []string {
data, err := ioutil.ReadFile(filepath)
if err != nil || len(data) == 0 {
return []string{}
}
return strings.Split(strings.TrimSuffix(string(data), sep), sep)
}
func CheckPortOpen(port int) bool {
tcpaddr, err := net.ResolveTCPAddr("tcp", fmt.Sprintf("localhost:%d", port))
if err != nil {
return false
}
l, err := net.ListenTCP("tcp", tcpaddr)
// check the error before deferring Close; on a failed listen, l is nil and closing it would panic
if err != nil {
return false
}
defer l.Close()
return true
}
func ReadConfig() types.Config {
data, err := ioutil.ReadFile("lieu.toml")
Check(err)
var conf types.Config
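// komkom/toml exposes the toml file as a json stream, which encoding/json decodes into the Config struct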
decoder := json.NewDecoder(toml.New(bytes.NewBuffer(data)))
err = decoder.Decode(&conf)
Check(err)
return conf
}
func WriteMockConfig() {
conf := []byte(`[general]
name = "Sweet Webring"
# used by the precrawl command and linked to in /about route
url = "https://example.com/"
port = 10001
[data]
# the source file should contain the crawl command's output
source = "data/crawled.txt"
# location & name of the sqlite database
database = "data/searchengine.db"
# contains words and phrases disqualifying scraped paragraphs from being presented in search results
heuristics = "data/heuristics.txt"
# aka stopwords, in the search engine biz: https://en.wikipedia.org/wiki/Stop_word
wordlist = "data/wordlist.txt"
[crawler]
# manually curated list of domains, or the output of the precrawl command
webring = "data/webring.txt"
# domains that are banned from being crawled but might originally be part of the webring
bannedDomains = "data/banned-domains.txt"
# file suffixes that are banned from being crawled
bannedSuffixes = "data/banned-suffixes.txt"
# phrases and words which won't be scraped (e.g. if contained in a link)
boringWords = "data/boring-words.txt"
# domains that won't be output as outgoing links
boringDomains = "data/boring-domains.txt"
`)
err := ioutil.WriteFile("lieu.toml", conf, 0644)
Check(err)
}
func Exit() {
os.Exit(0)
}