pull/4/head
cblgh 2021-02-03 09:12:30 +01:00
commit 28d052f4c3
34 changed files with 2133 additions and 0 deletions

223
.gitignore vendored 100755
@@ -0,0 +1,223 @@
#~top ignores~
node_modules/
*.vim
*bundle*.js
/html/*.html
*.sw[a-z]
config.conf
config.js
*.pdf
archives
builds
dist
#################
## Eclipse
#################
*.pydevproject
.project
.metadata
bin/
tmp/
*.tmp
*.bak
*.swp
*~.nib
local.properties
.classpath
.settings/
.loadpath
# External tool builders
.externalToolBuilders/
# Locally stored "Eclipse launch configurations"
*.launch
# CDT-specific
.cproject
# PDT-specific
.buildpath
#################
## Visual Studio
#################
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
# User-specific files
*.suo
*.user
*.sln.docstates
# Build results
[Dd]ebug/
[Rr]elease/
x64/
build/
[Bb]in/
[Oo]bj/
# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*
*_i.c
*_p.c
*.ilk
*.meta
*.obj
*.pch
*.pdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*.log
*.vspscc
*.vssscc
.builds
*.pidb
*.log
*.scc
# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opensdf
*.sdf
*.cachefile
# Visual Studio profiler
*.psess
*.vsp
*.vspx
# Guidance Automation Toolkit
*.gpState
# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
# TeamCity is a build add-in
_TeamCity*
# DotCover is a Code Coverage Tool
*.dotCover
# NCrunch
*.ncrunch*
.*crunch*.local.xml
# Installshield output folder
[Ee]xpress/
# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html
# Click-Once directory
publish/
# Publish Web Output
*.Publish.xml
*.pubxml
# Windows Azure Build Output
csx
*.build.csdef
# Windows Store app package directory
AppPackages/
# Others
sql/
*.Cache
ClientBin/
[Ss]tyle[Cc]op.*
~$*
*~
*.dbmdl
*.[Pp]ublish.xml
*.pfx
*.publishsettings
# RIA/Silverlight projects
Generated_Code/
# Backup & report files from converting an old project file to a newer
# Visual Studio version. Backup files are not needed, because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
# SQL Server files
App_Data/*.mdf
App_Data/*.ldf
#############
## Windows detritus
#############
# Windows image file caches
Thumbs.db
ehthumbs.db
# Folder config file
Desktop.ini
# Recycle Bin used on file shares
$RECYCLE.BIN/
# Mac crap
.DS_Store
#############
## Python
#############
*.py[co]
# Packages
*.egg
*.egg-info
dist/
build/
eggs/
parts/
var/
sdist/
develop-eggs/
.installed.cfg
# Installer logs
pip-log.txt
# Unit test / coverage reports
.coverage
.tox
#Translations
*.mo
#Mr Developer
.mr.developer.cfg

104
README.md 100644
@@ -0,0 +1,104 @@
# Lieu
_an alternative search engine_
Created in response to the environs of apathy concerning the use of hypertext
search and discovery. In Lieu, the internet is not what is made searchable, but
instead one's own neighbourhood. Put differently, Lieu is a neighbourhood search
engine, a way for personal webrings to increase serendipitous connexions.
## Goals
* Enable serendipitous discovery
* Support personal communities
* Be reusable, easily
## Usage
```
$ lieu help
Lieu: neighbourhood search engine
Commands
- precrawl (scrapes config's general.url for a list of links: <li> elements containing an anchor <a> tag)
- crawl (start crawler, crawls all urls in config's crawler.webring file)
- ingest (ingest crawled data, generates database)
- search (interactive cli for searching the database)
- host (hosts search engine over http)
Example:
lieu precrawl > data/webring.txt
lieu crawl > data/source.txt
lieu ingest
lieu host
```
Lieu's crawl & precrawl commands output to [standard
output](https://en.wikipedia.org/wiki/Standard_streams#Standard_output_(stdout)),
for easy inspection of the data. You typically want to redirect their output to
the files Lieu reads from, as defined in the config file. See below for a
typical workflow.
### Workflow
* Edit the config
* Add domains to crawl in `config.crawler.webring`
* **If you have a webpage with links you want to crawl:**
* Set the config's `url` field to that page
* Populate the list of domains to crawl with `precrawl`: `lieu precrawl > data/webring.txt`
* Crawl: `lieu crawl > data/source.txt`
* Create database: `lieu ingest`
* Host engine: `lieu host`
After ingesting the data with `lieu ingest`, you can also use lieu to search the
corpus in the terminal with `lieu search`.
## Config
The config file is written in [TOML](https://toml.io/en/).
```toml
[general]
name = "Merveilles Webring"
# used by the precrawl command and linked to in /about route
url = "https://webring.xxiivv.com"
port = 10001
[data]
# the source file should contain the crawl command's output
source = "data/crawled.txt"
# location & name of the sqlite database
database = "data/searchengine.db"
# contains words and phrases disqualifying scraped paragraphs from being presented in search results
heuristics = "data/heuristics.txt"
# aka stopwords, in the search engine biz: https://en.wikipedia.org/wiki/Stop_word
wordlist = "data/wordlist.txt"
[crawler]
# manually curated list of domains, or the output of the precrawl command
webring = "data/webring.txt"
# domains that are banned from being crawled but might originally be part of the webring
bannedDomains = "data/banned-domains.txt"
# file suffixes that are banned from being crawled
bannedSuffixes = "data/banned-suffixes.txt"
# phrases and words which won't be scraped (e.g. if contained in a link)
boringWords = "data/boring-words.txt"
# domains that won't be output as outgoing links
boringDomains = "data/boring-domains.txt"
```
For your own use, the following config fields should be customized:
* `name`
* `url`
* `port`
* `source`
* `webring`
* `bannedDomains`
The following config-defined files can stay as-is unless you have specific requirements:
* `database`
* `heuristics`
* `wordlist`
* `bannedSuffixes`
For a full rundown of the files and their various jobs, see the [files
description](docs/files.md).
### License
Source code: `AGPL-3.0-or-later`. Inter is available under the
`SIL OPEN FONT LICENSE Version 1.1`; Noto Serif is licensed under the
`Apache License, Version 2.0`.

125
cli.go 100644
@@ -0,0 +1,125 @@
package main
import (
"bufio"
"fmt"
"lieu/crawler"
"lieu/database"
"lieu/ingest"
"lieu/server"
"lieu/util"
"os"
"strings"
)
const help = `Lieu: neighbourhood search engine
Commands
- precrawl (scrapes config's general.url for a list of links: <li> elements containing an anchor <a> tag)
- crawl (start crawler, crawls all urls in config's crawler.webring file. outputs to stdout)
- ingest (ingest crawled data, generates database)
- search (interactive cli for searching the database)
- host (hosts search engine over http)
Example:
lieu precrawl > data/webring.txt
lieu crawl > data/source.txt
lieu ingest
lieu host
See the configuration file lieu.toml or
https://github.com/cblgh/lieu for more information.
`
func main() {
exists := util.CheckFileExists("lieu.toml")
if !exists {
fmt.Println("lieu: can't find config, saving an example config in the working directory")
util.WriteMockConfig()
fmt.Println("lieu: lieu.toml written to disk")
util.Exit()
}
config := util.ReadConfig()
var cmd string
if len(os.Args) > 1 {
cmd = os.Args[1]
} else {
cmd = "search"
}
switch cmd {
case "help":
fmt.Println(help)
case "precrawl":
if config.General.URL == "https://example.com/" {
fmt.Println("lieu: the url is not set (example.com)")
util.Exit()
}
crawler.Precrawl(config)
case "crawl":
exists := util.CheckFileExists(config.Crawler.Webring)
if !exists {
fmt.Printf("lieu: webring file %s does not exist\n", config.Data.Source)
util.Exit()
}
sourceLen := len(util.ReadList(config.Crawler.Webring, "\n"))
if sourceLen == 0 {
fmt.Printf("lieu: nothing to crawl; the webring file %s is empty\n", config.Data.Source)
util.Exit()
}
crawler.Crawl(config)
case "ingest":
exists := util.CheckFileExists(config.Data.Source)
if !exists {
fmt.Printf("lieu: data source %s does not exist\n", config.Data.Source)
fmt.Println("lieu: try running `lieu crawl`")
util.Exit()
}
sourceLen := len(util.ReadList(config.Data.Source, "\n"))
if sourceLen == 0 {
fmt.Printf("lieu: nothing to ingest; data source %s is empty\n", config.Data.Source)
fmt.Println("lieu: try running `lieu crawl`")
util.Exit()
}
fmt.Println("lieu: creating a new database & initiating ingestion")
ingest.Ingest(config)
case "search":
exists := util.CheckFileExists(config.Data.Database)
if !exists {
util.DatabaseDoesNotExist(config.Data.Database)
}
interactiveMode(config.Data.Database)
case "host":
exists := util.CheckFileExists(config.Data.Database)
if !exists {
util.DatabaseDoesNotExist(config.Data.Database)
}
open := util.CheckPortOpen(config.General.Port)
if !open {
fmt.Printf("lieu: port %d is not open; try another one\n", config.General.Port)
util.Exit()
}
server.Serve(config)
default:
fmt.Println("Lieu: no such command, currently. Try `lieu help`")
}
}
func interactiveMode(databasePath string) {
db := database.InitDB(databasePath)
reader := bufio.NewReader(os.Stdin)
for {
fmt.Printf("> ")
input, err := reader.ReadString('\n')
util.Check(err)
input = strings.TrimSuffix(input, "\n")
pages := database.SearchWordsByScore(db, util.Inflect(strings.Fields(input)))
for _, pageData := range pages {
fmt.Println(pageData.URL)
if len(pageData.About) > 0 {
fmt.Println(pageData.About)
}
}
}
}

244
crawler/crawler.go 100644
@@ -0,0 +1,244 @@
package crawler
import (
"fmt"
"lieu/types"
"lieu/util"
"log"
"net/http"
"net/url"
"regexp"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/gocolly/colly/v2"
"github.com/gocolly/colly/v2/queue"
)
// the following domains are excluded from crawling & indexing, typically because they have a lot of microblog pages
// (very spammy)
func getBannedDomains(path string) []string {
return util.ReadList(path, "\n")
}
func getBannedSuffixes(path string) []string {
return util.ReadList(path, "\n")
}
func getBoringWords(path string) []string {
return util.ReadList(path, "\n")
}
func getBoringDomains(path string) []string {
return util.ReadList(path, "\n")
}
func find(list []string, query string) bool {
for _, item := range list {
if item == query {
return true
}
}
return false
}
func getLink(target string) string {
// remove anchor links
if strings.Contains(target, "#") {
target = strings.Split(target, "#")[0]
}
if strings.Contains(target, "?") {
target = strings.Split(target, "?")[0]
}
target = strings.TrimSpace(target)
target = strings.ToLower(target)
// remove trailing /
return strings.TrimSuffix(target, "/")
}
func getWebringLinks(path string) []string {
var links []string
candidates := util.ReadList(path, "\n")
for _, l := range candidates {
u, err := url.Parse(l)
if err != nil {
continue
}
if u.Scheme == "" {
u.Scheme = "https"
}
links = append(links, u.String())
}
return links
}
func getDomains(links []string) []string {
var domains []string
for _, l := range links {
u, err := url.Parse(l)
if err != nil {
continue
}
domains = append(domains, u.Hostname())
}
return domains
}
func findSuffix(suffixes []string, query string) bool {
for _, suffix := range suffixes {
if strings.HasSuffix(strings.ToLower(query), suffix) {
return true
}
}
return false
}
func cleanText(s string) string {
s = strings.TrimSpace(s)
s = strings.ReplaceAll(s, "\n", " ")
s = strings.ReplaceAll(s, "|", " ")
whitespace := regexp.MustCompile(`\p{Z}`)
s = whitespace.ReplaceAllString(s, " ")
return s
}
func handleIndexing(c *colly.Collector) {
c.OnHTML("meta[name=\"keywords\"]", func(e *colly.HTMLElement) {
fmt.Println("keywords", cleanText(e.Attr("content")), e.Request.URL)
})
c.OnHTML("meta[name=\"description\"]", func(e *colly.HTMLElement) {
desc := cleanText(e.Attr("content"))
if len(desc) > 0 {
fmt.Println("desc", desc, e.Request.URL)
}
})
c.OnHTML("html[lang]", func(e *colly.HTMLElement) {
lang := cleanText(e.Attr("lang"))
if len(lang) > 0 {
fmt.Println("lang", lang, e.Request.URL)
}
})
// get page title
c.OnHTML("title", func(e *colly.HTMLElement) {
fmt.Println("title", cleanText(e.Text), e.Request.URL)
})
c.OnHTML("body", func(e *colly.HTMLElement) {
paragraph := cleanText(e.DOM.Find("p").First().Text())
if len(paragraph) < 1500 && len(paragraph) > 0 {
fmt.Println("para", paragraph, e.Request.URL)
}
// get all relevant page headings
collectHeadingText("h1", e)
collectHeadingText("h2", e)
collectHeadingText("h3", e)
})
}
func collectHeadingText(heading string, e *colly.HTMLElement) {
for _, headingText := range e.ChildTexts(heading) {
if len(headingText) < 500 {
fmt.Println(heading, cleanText(headingText), e.Request.URL)
}
}
}
func Precrawl(config types.Config) {
res, err := http.Get(config.General.URL)
util.Check(err)
defer res.Body.Close()
if res.StatusCode != 200 {
log.Fatal("status not 200")
}
doc, err := goquery.NewDocumentFromReader(res.Body)
util.Check(err)
items := make([]string, 0)
doc.Find("li").Each(func(i int, s *goquery.Selection) {
if domain, exists := s.Find("a").Attr("href"); exists {
items = append(items, domain)
}
})
BANNED := getBannedDomains(config.Crawler.BannedDomains)
for _, item := range items {
link := getLink(item)
u, err := url.Parse(link)
// invalid link
if err != nil {
continue
}
domain := u.Hostname()
if find(BANNED, domain) {
continue
}
fmt.Println(link)
}
}
func Crawl(config types.Config) {
SUFFIXES := getBannedSuffixes(config.Crawler.BannedSuffixes)
links := getWebringLinks(config.Crawler.Webring)
domains := getDomains(links)
// compare hostnames rather than the full webring url, so the checks below can match
parsedURL, err := url.Parse(config.General.URL)
util.Check(err)
initialDomain := parsedURL.Hostname()
// TODO: introduce c2 for scraping links (with depth 1) linked to from webring domains
// instantiate default collector
c := colly.NewCollector(
colly.MaxDepth(3),
)
q, _ := queue.New(
5, /* threads */
&queue.InMemoryQueueStorage{MaxSize: 100000},
)
for _, link := range links {
q.AddURL(link)
}
c.AllowedDomains = domains
c.AllowURLRevisit = false
c.DisallowedDomains = getBannedDomains(config.Crawler.BannedDomains)
delay, _ := time.ParseDuration("200ms")
c.Limit(&colly.LimitRule{DomainGlob: "*", Delay: delay, Parallelism: 3})
boringDomains := getBoringDomains(config.Crawler.BoringDomains)
boringWords := getBoringWords(config.Crawler.BoringWords)
// on every a element which has an href attribute, call callback
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
link := getLink(e.Attr("href"))
if findSuffix(SUFFIXES, link) {
return
}
link = e.Request.AbsoluteURL(link)
u, err := url.Parse(link)
// log which site links to what
if err == nil && !util.Contains(boringWords, link) && !util.Contains(boringDomains, link) {
outgoingDomain := u.Hostname()
currentDomain := e.Request.URL.Hostname()
if !find(domains, outgoingDomain) {
fmt.Println("non-webring-link", link, e.Request.URL)
// solidarity! someone in the webring linked to someone else in it
} else if outgoingDomain != currentDomain && outgoingDomain != initialDomain && currentDomain != initialDomain {
fmt.Println("webring-link", link, e.Request.URL)
}
}
// only visits links from AllowedDomains
q.AddURL(link)
})
handleIndexing(c)
// start scraping
q.Run(c)
}

data/banned-domains.txt 100644

17
data/banned-suffixes.txt 100644

@@ -0,0 +1,17 @@
.xml
.pdf
.rss
.jpg
.png
.gif
.avi
.webm
.mp4
.ogg
.mp3
.zip
.exe
.txt
.asc
.key
.csv

19
data/boring-domains.txt 100644

@@ -0,0 +1,19 @@
instagram.com
twitter.com
linkedin.com
facebook.com
getpoole.com
jekyllrb.com
twitter.com
amazon.com
google.com
microsoft.com
youtube.com
github.io
meetup.com
ebay.com
t.co
a.co
wsj.com
creativecommons.org
patreon.com

4
data/boring-words.txt 100644

@@ -0,0 +1,4 @@
bitcoin
javascript:
mailto:
subscribe

0
data/crawled.txt 100644

10
data/heuristics.txt 100644

@@ -0,0 +1,10 @@
incoming
tagged
edited
updated
last update
last edit
©
(c)
all rights reserved
licensed under

0
data/webring.txt 100644

File diff suppressed because one or more lines are too long

222
database/database.go 100644

@@ -0,0 +1,222 @@
package database
/* example queries
SELECT p.url
FROM inv_index inv
INNER JOIN pages p ON p.url = inv.url
WHERE inv.word = "project";

SELECT url FROM inv_index WHERE word = "esoteric" GROUP BY url ORDER BY SUM(score) DESC LIMIT 15;

SELECT url FROM inv_index WHERE word = "<word>" GROUP BY url ORDER BY SUM(score) DESC;
*/
import (
"database/sql"
"fmt"
"lieu/types"
"lieu/util"
"log"
"net/url"
"strings"
_ "github.com/mattn/go-sqlite3"
)
func InitDB(filepath string) *sql.DB {
db, err := sql.Open("sqlite3", filepath)
if err != nil {
log.Fatalln(err)
}
if db == nil {
log.Fatalln("db is nil")
}
createTables(db)
return db
}
func createTables(db *sql.DB) {
// create the table if it doesn't exist
queries := []string{`
CREATE TABLE IF NOT EXISTS domains (
id INTEGER PRIMARY KEY AUTOINCREMENT,
domain TEXT NOT NULL UNIQUE
);
`,
`
CREATE TABLE IF NOT EXISTS pages (
id INTEGER PRIMARY KEY AUTOINCREMENT,
url TEXT NOT NULL UNIQUE,
title TEXT,
about TEXT,
lang TEXT,
domain TEXT NOT NULL,
FOREIGN KEY(domain) REFERENCES domains(domain)
);
`,
`
CREATE TABLE IF NOT EXISTS external_pages (
id INTEGER PRIMARY KEY AUTOINCREMENT,
url TEXT NOT NULL UNIQUE,
domain TEXT NOT NULL,
title TEXT
);
`,
`
CREATE TABLE IF NOT EXISTS inv_index (
word TEXT NOT NULL,
score INTEGER NOT NULL,
url TEXT NOT NULL,
FOREIGN KEY(url) REFERENCES pages(url)
)`,
}
for _, query := range queries {
if _, err := db.Exec(query); err != nil {
log.Fatalln(err)
}
}
}
/* TODO: filters
lang:en|fr|en|<..>
site:wiki.xxiivv.com, site:cblgh.org
nosite:excluded-domain.com
"word1 word2 word3" strict query
query params:
&order=score, &order=count
&outgoing=true
*/
func SearchWordsByScore(db *sql.DB, words []string) []types.PageData {
return searchWords(db, words, true)
}
func SearchWordsByCount(db *sql.DB, words []string) []types.PageData {
return searchWords(db, words, false)
}
func GetDomainCount(db *sql.DB) int {
return countQuery(db, "domains")
}
func GetPageCount(db *sql.DB) int {
return countQuery(db, "pages")
}
func GetWordCount(db *sql.DB) int {
return countQuery(db, "inv_index")
}
func GetRandomPage(db *sql.DB) string {
rows, err := db.Query("SELECT url FROM pages ORDER BY RANDOM() LIMIT 1;")
util.Check(err)
var link string
for rows.Next() {
err = rows.Scan(&link)
util.Check(err)
}
return link
}
func countQuery(db *sql.DB, table string) int {
rows, err := db.Query(fmt.Sprintf("SELECT COUNT(*) FROM %s;", table))
util.Check(err)
var count int
for rows.Next() {
err = rows.Scan(&count)
util.Check(err)
}
return count
}
func searchWords(db *sql.DB, words []string, searchByScore bool) []types.PageData {
var wordlist []string
var args []interface{}
for _, word := range words {
wordlist = append(wordlist, "word = ?")
args = append(args, strings.ToLower(word))
}
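// rank pages by the summed score of all matched words (the default), or by the number of matched words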
orderType := "SUM(score)"
if !searchByScore {
orderType = "COUNT(*)"
}
query := fmt.Sprintf(`
SELECT p.url, p.about, p.title
FROM inv_index inv INNER JOIN pages p ON inv.url = p.url
WHERE %s
GROUP BY inv.url
ORDER BY %s
DESC
LIMIT 15
`, strings.Join(wordlist, " OR "), orderType)
stmt, err := db.Prepare(query)
util.Check(err)
defer stmt.Close()
rows, err := stmt.Query(args...)
util.Check(err)
var pageData types.PageData
var pages []types.PageData
for rows.Next() {
if err := rows.Scan(&pageData.URL, &pageData.About, &pageData.Title); err != nil {
log.Fatalln(err)
}
pages = append(pages, pageData)
}
return pages
}
func InsertManyDomains(db *sql.DB, pages []types.PageData) {
values := make([]string, 0, len(pages))
args := make([]interface{}, 0, len(pages))
for _, b := range pages {
values = append(values, "(?)")
u, err := url.Parse(b.URL)
util.Check(err)
args = append(args, u.Hostname())
}
stmt := fmt.Sprintf(`INSERT OR IGNORE INTO domains(domain) VALUES %s`, strings.Join(values, ","))
_, err := db.Exec(stmt, args...)
util.Check(err)
}
func InsertManyPages(db *sql.DB, pages []types.PageData) {
values := make([]string, 0, len(pages))
args := make([]interface{}, 0, len(pages))
for _, b := range pages {
// url, title, lang, about, domain
values = append(values, "(?, ?, ?, ?, ?)")
u, err := url.Parse(b.URL)
util.Check(err)
args = append(args, b.URL, b.Title, b.Lang, b.About, u.Hostname())
}
stmt := fmt.Sprintf(`INSERT OR IGNORE INTO pages(url, title, lang, about, domain) VALUES %s`, strings.Join(values, ","))
_, err := db.Exec(stmt, args...)
util.Check(err)
}
func InsertManyWords(db *sql.DB, batch []types.SearchFragment) {
values := make([]string, 0, len(batch))
args := make([]interface{}, 0, len(batch))
for _, b := range batch {
pageurl := strings.TrimSuffix(b.URL, "/")
values = append(values, "(?, ?, ?)")
args = append(args, b.Word, pageurl, b.Score)
}
stmt := fmt.Sprintf(`INSERT OR IGNORE INTO inv_index(word, url, score) VALUES %s`, strings.Join(values, ","))
_, err := db.Exec(stmt, args...)
util.Check(err)
}

121
docs/files.md 100644
@@ -0,0 +1,121 @@
# Files
_what the purposes are of all those damn files_
Lieu is based on a few files, which in turn configure various behaviours in the
**crawler** (visits urls & extracts relevant elements) and the **ingester**
(converts the crawled source data into database fields). The basic reason is to
minimize hardcoded assumptions in the source, furthering Lieu's reusability.
Below, I will refer to the files by their config-defined names. Here's the
config example from the [README](../README.md) again.
```toml
[general]
name = "Merveilles Webring"
# used by the precrawl command and linked to in /about route
url = "https://webring.xxiivv.com"
port = 10001
[data]
# the source file should contain the crawl command's output
source = "data/crawled.txt"
# location & name of the sqlite database
database = "data/searchengine.db"
# contains words and phrases disqualifying scraped paragraphs from being presented in search results
heuristics = "data/heuristics.txt"
# aka stopwords, in the search engine biz: https://en.wikipedia.org/wiki/Stop_word
wordlist = "data/wordlist.txt"
[crawler]
# manually curated list of domains, or the output of the precrawl command
webring = "data/webring.txt"
# domains that are banned from being crawled but might originally be part of the webring
bannedDomains = "data/banned-domains.txt"
# file suffixes that are banned from being crawled
bannedSuffixes = "data/banned-suffixes.txt"
# phrases and words which won't be scraped (e.g. if contained in a link)
boringWords = "data/boring-words.txt"
# domains that won't be output as outgoing links
boringDomains = "data/boring-domains.txt"
```
## HTML
Before we start, a quick note on the other types of files in use. The HTML
templates, used when presenting the search engine in the browser, are all
available in the [`html`](../html) folder. The includes—currently only css
& font files—are available in [`html/assets`](../html/assets).
## `[crawler]`
#### `webring`
Defines which domains will be crawled for pages. As of this writing, no domains
outside of this file will be crawled.
You can populate the `webring` file manually or by precrawling an existing
webpage that contains all of the domains you want to crawl:
lieu precrawl > data/webring.txt
#### `bannedDomains`
A list of domains that will not be crawled. This means that if they are present
in the `webring` file, they will be skipped over as candidates for crawling.
The rationale is that some of the domains of a webring may be unsuitable for ingestion
into the database. I typically find this is the case for domains that include
microblogs with hundreds or thousands of one-line pages—needlessly gunking up the search
results without providing anything of interest outside the individual creating
the logs.
#### `bannedSuffixes`
Eliminates html links that end with suffixes present in this file. Typically I want
to avoid crawling links to media formats such as `.mp4`, and other types of
non-html documents.
It's fine to leave this file intact with its defaults.
#### `boringWords`
This file is a bit more specific. It contains words which, if present in a link,
will prevent the link from being logged. The reason is that it suggests the
link target is boring—irrelevant for this application of the search engine.
These can be `javascript:` links, or other types of content that are less
relevant to the focus of the search engine & webring.
Link data of this type is as yet unused in Lieu's ingestion.
#### `boringDomains`
Like `boringWords` except it contains a list of domains which are banned from
having their links be logged, typically because they are deemed less relevant
for the focus of the search engine.
Link data of this type is as yet unused in Lieu's ingestion.
## `[data]`
#### `source`
Contains the linewise data that was produced by the crawler. The first word
identifies the type of data and the last word identifies the page the data
originated from.
Example:
```
h2 Prelude https://cblgh.org/articles/four-nights-in-tornio.html
```
* An `<h2>` tag was scraped,
* its contents were `Prelude`, and
* the originating article was https://cblgh.org/articles/four-nights-in-tornio.html
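To make the format concrete, here's a minimal Go sketch of how such a line can
be split apart. It mirrors the first-space/last-space slicing done by the
ingester in `ingest/ingest.go`; `parseLine` is a hypothetical helper name for
illustration, not part of Lieu's source.

```go
package main

import (
	"fmt"
	"strings"
)

// parseLine splits a crawl line into its data type token, its payload, and
// the url of the page it originated from; a minimal sketch mirroring how
// the ingester slices each line at the first and last space.
func parseLine(line string) (token, payload, pageurl string, ok bool) {
	firstSpace := strings.Index(line, " ")
	lastSpace := strings.LastIndex(line, " ")
	if firstSpace == -1 || firstSpace == lastSpace {
		return "", "", "", false
	}
	token = line[:firstSpace]
	payload = strings.TrimSpace(line[firstSpace:lastSpace])
	pageurl = strings.TrimSpace(line[lastSpace:])
	return token, payload, pageurl, true
}

func main() {
	token, payload, pageurl, ok := parseLine("h2 Prelude https://cblgh.org/articles/four-nights-in-tornio.html")
	if ok {
		fmt.Printf("%s | %s | %s\n", token, payload, pageurl)
		// h2 | Prelude | https://cblgh.org/articles/four-nights-in-tornio.html
	}
}
```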
#### `database`
The location the sqlite3 database will be created & read from.
#### `heuristics`
Heuristics contains a list of words or phrases which disqualify scraped
paragraphs from being used as descriptive text in Lieu's search results. Typically
excluded are paragraphs which contain copyright symbols—as that indicates we
have scraped the bottom-most paragraph, i.e. the page was likely a short stub,
with a better content description elsewhere.
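As a sketch of how the heuristic can be applied, assuming the disallowed list
has already been read from the `heuristics` file (`acceptableAbout` is a
hypothetical name mirroring `performAboutHeuristic` in `ingest/ingest.go`):

```go
package main

import (
	"fmt"
	"strings"
)

// acceptableAbout reports whether a scraped paragraph qualifies as a page
// description: it has to be longer than 20 characters and must not contain
// any of the disallowed words or phrases.
func acceptableAbout(disallowed []string, phrase string) bool {
	if len(phrase) <= 20 {
		return false
	}
	for _, heuristic := range disallowed {
		if strings.Contains(phrase, heuristic) {
			return false
		}
	}
	return true
}

func main() {
	disallowed := []string{"all rights reserved", "licensed under"}
	fmt.Println(acceptableAbout(disallowed, "a blog about plants, bread & mesh networks")) // true
	fmt.Println(acceptableAbout(disallowed, "© 2021, all rights reserved"))                // false
}
```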
#### `wordlist`
Also known as [stopwords](https://en.wikipedia.org/wiki/Stop_word)—words which
are stopped from entering the search index. The default wordlist consists of the
1000 or so most common English words, albeit curated slightly to still allow for
interesting concepts and verbs—such as `reading` and `books`.
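For illustration, a minimal sketch of how the wordlist is applied during
ingestion: single-letter words and stopwords are dropped, and the rest are
reduced to their singular form (mirroring `filterCommonWords` in
`ingest/ingest.go`).

```go
package main

import (
	"fmt"
	"strings"

	"github.com/jinzhu/inflection"
)

// filterCommonWords drops single-letter words and words found in the
// stopword list, and singularizes the remainder before it enters the
// search index.
func filterCommonWords(words, wordlist []string) []string {
	var filtered []string
	for _, word := range words {
		skip := len(word) == 1
		for _, common := range wordlist {
			if word == common {
				skip = true
				break
			}
		}
		if skip {
			continue
		}
		filtered = append(filtered, inflection.Singular(word))
	}
	return filtered
}

func main() {
	stopwords := []string{"the", "of", "a"}
	fmt.Println(filterCommonWords(strings.Fields("the history of books"), stopwords))
	// output: [history book]
}
```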

11
go.mod 100644
@@ -0,0 +1,11 @@
module lieu
go 1.14
require (
github.com/PuerkitoBio/goquery v1.5.1
github.com/gocolly/colly/v2 v2.1.0
github.com/jinzhu/inflection v1.0.0
github.com/komkom/toml v0.0.0-20210129103441-ff0648d25a4b
github.com/mattn/go-sqlite3 v1.14.6
)

144
go.sum 100644
@@ -0,0 +1,144 @@
cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE=
github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
github.com/anaskhan96/soup v1.2.4 h1:or+sKs9QbzJGZVTYFmTs2VBateEywoq00a6K14z331E=
github.com/anaskhan96/soup v1.2.4/go.mod h1:6YnEp9A2yywlYdM4EgDz9NEHclocMepEtku7wg6Cq3s=
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/andybalholm/cascadia v1.2.0 h1:vuRCkM5Ozh/BfmsaTm26kbjm0mIOM3yS5Ek/F5h18aE=
github.com/andybalholm/cascadia v1.2.0/go.mod h1:YCyR8vOZT9aZ1CHEd8ap0gMVm2aFgxBp0T0eFw1RUQY=
github.com/antchfx/htmlquery v1.2.3 h1:sP3NFDneHx2stfNXCKbhHFo8XgNjCACnU/4AO5gWz6M=
github.com/antchfx/htmlquery v1.2.3/go.mod h1:B0ABL+F5irhhMWg54ymEZinzMSi0Kt3I2if0BLYa3V0=
github.com/antchfx/xmlquery v1.2.4 h1:T/SH1bYdzdjTMoz2RgsfVKbM5uWh3gjDYYepFqQmFv4=
github.com/antchfx/xmlquery v1.2.4/go.mod h1:KQQuESaxSlqugE2ZBcM/qn+ebIpt+d+4Xx7YcSGAIrM=
github.com/antchfx/xpath v1.1.6/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk=
github.com/antchfx/xpath v1.1.8 h1:PcL6bIX42Px5usSx6xRYw/wjB3wYGkj0MJ9MBzEKVgk=
github.com/antchfx/xpath v1.1.8/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk=
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8=
github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI=
github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA=
github.com/gocolly/colly/v2 v2.1.0 h1:k0DuZkDoCsx51bKpRJNEmcxcp+W5N8ziuwGaSDuFoGs=
github.com/gocolly/colly/v2 v2.1.0/go.mod h1:I2MuhsLjQ+Ex+IzK3afNS8/1qP3AedHOusRPcRdC5o0=
github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q=
github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e h1:1r7pUrabqp18hOBcwBwiTsbnFeTZHV9eER/QT5JVZxY=
github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A=
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8=
github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA=
github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs=
github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w=
github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0=
github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8=
github.com/golang/protobuf v1.4.2 h1:+Z5KGCizgyZCbGh1KZqA0fcLLkwbsjIzS4aV2v7wJX0=
github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI=
github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/jawher/mow.cli v1.1.0/go.mod h1:aNaQlc7ozF3vw6IJ2dHjp2ZFiA4ozMIYY6PyuRJwlUg=
github.com/jinzhu/inflection v1.0.0 h1:K317FqzuhWc8YvSVlFMCCUb36O/S9MCKRDI7QkRKD/E=
github.com/jinzhu/inflection v1.0.0/go.mod h1:h+uFLlag+Qp1Va5pdKtLDYj+kHp5pxUVkryuEj+Srlc=
github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
github.com/komkom/toml v0.0.0-20210129103441-ff0648d25a4b h1:UmqyLHqfYJjkiuA2hddGeovwAGOCBm5gOTVKuxtvoMo=
github.com/komkom/toml v0.0.0-20210129103441-ff0648d25a4b/go.mod h1:wLcNqnyr6riTbnFObg4o2/GemTCso9AnsUdLsMsdspw=
github.com/mattn/go-sqlite3 v1.14.6 h1:dNPt6NO46WmLVt2DLNpwczCmdV5boIZ6g/tlDrlRUbg=
github.com/mattn/go-sqlite3 v1.14.6/go.mod h1:NyWgC/yNuGj7Q9rpYnZvas74GogHl5/Z4A/KQRfk6bU=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca h1:NugYot0LIVPxTvN8n+Kvkn6TrbMyxQiuvKdEwFdR9vI=
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/temoto/robotstxt v1.1.1 h1:Gh8RCs8ouX3hRSxxK7B1mO5RFByQ4CmJZDwgom++JaA=
github.com/temoto/robotstxt v1.1.1/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU=
golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200114155413-6afb5195e5aa/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
golang.org/x/net v0.0.0-20200602114024-627f9648deb9 h1:pNX+40auqi2JqRfOP1akLGtYcn15TUbkhwuCO3foqqM=
golang.org/x/net v0.0.0-20200602114024-627f9648deb9/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
golang.org/x/net v0.0.0-20201021035429-f5854403a974 h1:IX6qOQeG5uLjB/hjjwjedwfjND0hgjPMMyO1RoIXQNI=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
golang.org/x/text v0.3.3 h1:cokOdA+Jmi5PJGXLlLllQSgYigAEfHXJAERHVMaCc2k=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY=
golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q=
golang.org/x/tools v0.0.0-20190606124116-d0a3d012864b/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20210114065538-d78b04bdf963/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
google.golang.org/appengine v1.6.6 h1:lMO5rYAqUxkmaj76jAkRUvt5JZgFymx/+Q5Mzfivuhc=
google.golang.org/appengine v1.6.6/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc=
google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc=
google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc=
google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo=
google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c=
google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg=
google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk=
google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8=
google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0=
google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM=
google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE=
google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo=
google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
google.golang.org/protobuf v1.24.0 h1:UhZDfRO8JRQru4/+LlLE0BRKGF8L+PICnvYZmx/fEGA=
google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=

Binary file not shown.

Binary file not shown.

Binary file not shown.

@@ -0,0 +1,24 @@
@import url("base.css");
html {
max-width: 31rem;
}
h1 {
font-size: 3rem;
margin-bottom: 0.5rem;
}
h2 {
font-family: "Noto Serif";
font-style: italic;
font-weight: 400;
font-size: 1.5rem;
margin-top: 0;
margin-bottom: 2rem;
}
.lieu {
font-family: "Noto Serif";
font-weight: 400;
}

@@ -0,0 +1,162 @@
@import url('inter-ui-web/inter-ui.css');
@font-face {
font-family: "Noto Serif";
src: url("NotoSerif-Bold.ttf");
}
@font-face {
font-family: "Noto Serif";
font-weight: 400;
src: url("NotoSerif-Regular.ttf");
}
@font-face {
font-family: "Noto Serif";
font-weight: 400;
font-style: italic;
src: url("NotoSerif-Italic.ttf");
}
:root {
--primary: #fefefe;
--secondary: #000;
/* alt colorscheme: 1 */
/* --primary: red; */
/* --secondary: #fefefe; */
/* alt colorscheme: 2 */
/* --primary: #F35363; */
/* --secondary: black; */
}
li {
list-style-type: circle;
}
ul {
margin: 0;
padding-left: 1rem;
}
html {
font-family: "Inter UI", sans-serif;
background: var(--secondary);
color: var(--primary);
max-width: 650px;
padding-bottom: 2rem;
padding-left: 2rem;
margin-top: 2rem;
}
body {
margin: 0;
}
h1 {
font-family: "Noto Serif";
font-weight: 400;
font-size: 3rem;
margin-bottom: 1rem;
margin-top: 0;
}
h1 > a, h1 > a:hover {
border-bottom: none;
}
a {
cursor: pointer;
color: var(--primary);
text-decoration: none;
border-bottom: 0.1rem solid var(--primary);
word-wrap: break-word;
}
a:hover {
border-bottom-style: dotted;
}
p {
hyphens: auto;
margin-bottom: 1.5rem;
}
.entry {
-webkit-column-break-inside: avoid;
-moz-column-break-inside:avoid;
-moz-page-break-inside:avoid;
page-break-inside: avoid;
break-inside: avoid-column;
}
.search-container {
display: grid;
margin-bottom: 2rem;
height: 2.5rem;
align-items: center;
grid-template-columns: 16rem 3rem;
grid-auto-flow: column;
grid-column-gap: .5rem;
}
.search-box {
font-size: 1rem;
border-radius: 0.1rem;
padding: .5rem;
padding-left: 0.75rem;
border: 0;
color: var(--secondary);
background: var(--primary);
}
.search-button {
font-size: 2rem;
color: var(--primary);
background: var(--secondary);
border: 0;
cursor: pointer;
border-radius: 2px;
transition: opacity 150ms;
}
.search-button:hover {
opacity: 0.5;
transition: opacity 150ms;
}
.about-link {
position: absolute;
top: 1rem;
right: 1rem;
font-style: normal;
}
@media
only screen
and (min-device-width : 320px)
and (max-device-width : 720px)
{
html {
padding-left: 0.75rem;
padding-right: 0.75rem;
font-size: 30pt;
max-width: 100vw;
}
}
@media
only screen
and (min-device-width : 320px)
and (max-device-width : 374px) {
html {
font-size: 40pt;
}
}
/*
@media(prefers-color-scheme: light) {
:root {
--primary: #000;
--secondary: #fefefe;
}
}
*/

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

@@ -0,0 +1,92 @@
Copyright (c) 2016-2018 The Inter UI Project Authors (me@rsms.me)
This Font Software is licensed under the SIL Open Font License, Version 1.1.
This license is copied below, and is also available with a FAQ at:
http://scripts.sil.org/OFL
-----------------------------------------------------------
SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007
-----------------------------------------------------------
PREAMBLE
The goals of the Open Font License (OFL) are to stimulate worldwide
development of collaborative font projects, to support the font creation
efforts of academic and linguistic communities, and to provide a free and
open framework in which fonts may be shared and improved in partnership
with others.
The OFL allows the licensed fonts to be used, studied, modified and
redistributed freely as long as they are not sold by themselves. The
fonts, including any derivative works, can be bundled, embedded,
redistributed and/or sold with any software provided that any reserved
names are not used by derivative works. The fonts and derivatives,
however, cannot be released under any other type of license. The
requirement for fonts to remain under this license does not apply
to any document created using the fonts or their derivatives.
DEFINITIONS
"Font Software" refers to the set of files released by the Copyright
Holder(s) under this license and clearly marked as such. This may
include source files, build scripts and documentation.
"Reserved Font Name" refers to any names specified as such after the
copyright statement(s).
"Original Version" refers to the collection of Font Software components as
distributed by the Copyright Holder(s).
"Modified Version" refers to any derivative made by adding to, deleting,
or substituting -- in part or in whole -- any of the components of the
Original Version, by changing formats or by porting the Font Software to a
new environment.
"Author" refers to any designer, engineer, programmer, technical
writer or other person who contributed to the Font Software.
PERMISSION AND CONDITIONS
Permission is hereby granted, free of charge, to any person obtaining
a copy of the Font Software, to use, study, copy, merge, embed, modify,
redistribute, and sell modified and unmodified copies of the Font
Software, subject to the following conditions:
1) Neither the Font Software nor any of its individual components,
in Original or Modified Versions, may be sold by itself.
2) Original or Modified Versions of the Font Software may be bundled,
redistributed and/or sold with any software, provided that each copy
contains the above copyright notice and this license. These can be
included either as stand-alone text files, human-readable headers or
in the appropriate machine-readable metadata fields within text or
binary files as long as those fields can be easily viewed by the user.
3) No Modified Version of the Font Software may use the Reserved Font
Name(s) unless explicit written permission is granted by the corresponding
Copyright Holder. This restriction only applies to the primary font name as
presented to the users.
4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font
Software shall not be used to promote, endorse or advertise any
Modified Version, except to acknowledge the contribution(s) of the
Copyright Holder(s) and the Author(s) or with their explicit written
permission.
5) The Font Software, modified or unmodified, in part or in whole,
must be distributed entirely under this license, and must not be
distributed under any other license. The requirement for fonts to
remain under this license does not apply to any document created
using the Font Software.
TERMINATION
This license becomes null and void if any of the above conditions are
not met.
DISCLAIMER
THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE
COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL
DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM
OTHER DEALINGS IN THE FONT SOFTWARE.

@@ -0,0 +1,13 @@
@font-face {
font-family: 'Inter UI';
font-style: normal;
src: url("Inter-UI-Regular.woff2") format("woff2"),
url("Inter-UI-Regular.woff") format("woff");
}
@font-face {
font-family: 'Inter UI';
font-style: italic;
src: url("Inter-UI-Italic.woff2") format("woff2"),
url("Inter-UI-Italic.woff") format("woff");
}

@@ -0,0 +1,27 @@
@import url('base.css');
main {
columns: 2;
}
.entry {
-webkit-column-break-inside: avoid;
-moz-column-break-inside:avoid;
-moz-page-break-inside:avoid;
page-break-inside: avoid;
break-inside: avoid-column;
}
.link {
font-style: italic;
}
@media
only screen
and (min-device-width : 320px)
and (max-device-width : 720px)
{
main {
columns: 1 !important;
}
}

@@ -0,0 +1,24 @@
@import url("about.css");
html {
max-width: 100vw;
}
h2 {
margin-bottom: 1rem;
}
main {
display: grid;
justify-items: center;
align-items: center;
margin-top: 10rem;
}
.search-container {
grid-template-columns: 19rem 3rem;
}
.lieu-container {
justify-items: start;
}

205
ingest/ingest.go 100644
@@ -0,0 +1,205 @@
package ingest
import (
"bufio"
"database/sql"
"fmt"
"lieu/database"
"lieu/types"
"lieu/util"
"log"
"net/url"
"os"
"regexp"
"strings"
"github.com/jinzhu/inflection"
)
func partitionSentence(s string) []string {
punctuation := regexp.MustCompile(`\p{P}`)
whitespace := regexp.MustCompile(`\p{Z}`)
invisible := regexp.MustCompile(`\p{C}`)
symbols := regexp.MustCompile(`\p{S}`)
s = punctuation.ReplaceAllString(s, " ")
s = whitespace.ReplaceAllString(s, " ")
s = invisible.ReplaceAllString(s, " ")
s = symbols.ReplaceAllString(s, " ")
s = strings.ReplaceAll(s, "|", " ")
s = strings.ReplaceAll(s, "/", " ")
return strings.Fields(s)
}
func filterCommonWords(words, wordlist []string) []string {
var filtered []string
for _, word := range words {
// ingested word was too common, skip it
if len(word) == 1 || find(wordlist, word) {
continue
}
filtered = append(filtered, inflection.Singular(word))
}
return filtered
}
func find(slice []string, sought string) bool {
for _, item := range slice {
if item == sought {
return true
}
}
return false
}
func performAboutHeuristic(heuristicPath, phrase string) bool {
disallowed := util.ReadList(heuristicPath, "\n")
ok := !util.Contains(disallowed, phrase)
return ok && len(phrase) > 20
}
func Ingest(config types.Config) {
if _, err := os.Stat(config.Data.Database); err == nil || os.IsExist(err) {
err = os.Remove(config.Data.Database)
util.Check(err)
}
db := database.InitDB(config.Data.Database)
wordlist := util.ReadList(config.Data.Wordlist, "|")
buf, err := os.Open(config.Data.Source)
util.Check(err)
defer func() {
err = buf.Close()
util.Check(err)
}()
pages := make(map[string]types.PageData)
var count int
var batchsize = 100
batch := make([]types.SearchFragment, 0, 0)
scanner := bufio.NewScanner(buf)
for scanner.Scan() {
line := scanner.Text()
firstSpace := strings.Index(line, " ")
lastSpace := strings.LastIndex(line, " ")
if len(line) == 0 || firstSpace == -1 {
continue
}
pageurl := strings.ToLower(strings.TrimSuffix(strings.TrimSpace(line[lastSpace:]), "/"))
if !strings.HasPrefix(pageurl, "http") {
continue
}
var page types.PageData
if data, exists := pages[pageurl]; exists {
page = data
} else {
page.URL = pageurl
}
token := line[0:firstSpace]
rawdata := strings.TrimSpace(line[firstSpace:lastSpace])
payload := strings.ToLower(rawdata)
var processed []string
score := 1
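// weight a fragment by where it was found: headings score highest, then the title; body text and keywords default to 1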
switch token {
case "title":
if len(page.About) == 0 {
page.About = rawdata
}
score = 5
page.Title = rawdata
processed = partitionSentence(payload)
case "h1":
if len(page.About) == 0 {
page.About = rawdata
}
fallthrough
case "h2":
fallthrough
case "h3":
score = 15
processed = partitionSentence(payload)
case "desc":
if len(page.About) < 30 && len(rawdata) < 100 {
page.About = rawdata
}
processed = partitionSentence(payload)
case "para":
if performAboutHeuristic(config.Data.Heuristics, payload) {
page.About = rawdata
}
processed = partitionSentence(payload)
case "lang":
page.Lang = rawdata
case "keywords":
processed = strings.Split(strings.ReplaceAll(payload, ", ", ","), ",")
default:
continue
}
pages[pageurl] = page
processed = filterCommonWords(processed, wordlist)
count += len(processed)
for _, word := range processed {
batch = append(batch, types.SearchFragment{Word: word, URL: pageurl, Score: score})
}
if token == "title" {
// only extract path segments once per url.
// we do it here because every page is virtually guaranteed to have a title attr &
// it only appears once
for _, word := range extractPathSegments(pageurl) {
batch = append(batch, types.SearchFragment{Word: word, URL: pageurl, Score: 2})
}
}
if len(pages) > batchsize {
ingestBatch(db, batch, pages)
batch = make([]types.SearchFragment, 0, 0)
// TODO: make sure we don't partially insert any page data
pages = make(map[string]types.PageData)
}
}
// ingest the final batch, which may not have filled up completely
if len(pages) > 0 {
ingestBatch(db, batch, pages)
}
fmt.Printf("ingested %d words\n", count)
err = scanner.Err()
util.Check(err)
}
func ingestBatch(db *sql.DB, batch []types.SearchFragment, pageMap map[string]types.PageData) {
pages := make([]types.PageData, len(pageMap))
i := 0
for k := range pageMap {
pages[i] = pageMap[k]
i++
}
log.Println("starting to ingest batch")
database.InsertManyDomains(db, pages)
database.InsertManyPages(db, pages)
database.InsertManyWords(db, batch)
log.Println("finished ingesting batch")
}
func extractPathSegments(pageurl string) []string {
u, err := url.Parse(pageurl)
util.Check(err)
if len(u.Path) == 0 {
return make([]string, 0, 0)
}
s := u.Path
s = strings.TrimSuffix(s, ".html")
s = strings.TrimSuffix(s, ".htm")
s = strings.ReplaceAll(s, "/", " ")
s = strings.ReplaceAll(s, "-", " ")
s = strings.ReplaceAll(s, "_", " ")
s = strings.ToLower(s)
return strings.Fields(s)
}

27
lieu.toml 100644
@@ -0,0 +1,27 @@
[general]
name = "Sweet Webring"
# used by the precrawl command and linked to in /about route
url = "https://example.com/"
port = 10001
[data]
# the source file should contain the crawl command's output
source = "data/crawled.txt"
# location & name of the sqlite database
database = "data/searchengine.db"
# contains words and phrases disqualifying scraped paragraphs from being presented in search results
heuristics = "data/heuristics.txt"
# aka stopwords, in the search engine biz: https://en.wikipedia.org/wiki/Stop_word
wordlist = "data/wordlist.txt"
[crawler]
# manually curated list of domains, or the output of the precrawl command
webring = "data/webring.txt"
# domains that are banned from being crawled but might originally be part of the webring
bannedDomains = "data/banned-domains.txt"
# file suffixes that are banned from being crawled
bannedSuffixes = "data/banned-suffixes.txt"
# phrases and words which won't be scraped (e.g. if contained in a link)
boringWords = "data/boring-words.txt"
# domains that won't be output as outgoing links
boringDomains = "data/boring-domains.txt"

143
server/server.go 100644
@@ -0,0 +1,143 @@
package server
import (
"fmt"
"net/http"
"net/url"
"strings"
"lieu/database"
"lieu/types"
"lieu/util"
"html/template"
// "github.com/shurcooL/vfsgen"
)
type SearchData struct {
Query string
Pages []types.PageData
}
type AboutData struct {
DomainCount int
InstanceName string
PageCount string
TermCount string
FilteredLink string
RingLink string
}
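// when true, each result displays its cleaned-up url in place of the scraped page title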
const useURLTitles = true
func searchRoute(res http.ResponseWriter, req *http.Request, config types.Config) {
// render the empty index page unless a query arrived via GET
words, exists := req.URL.Query()["q"]
if req.Method != http.MethodGet || !exists {
view := template.Must(template.ParseFiles("html/index-template.html"))
var empty interface{}
view.Execute(res, empty)
return
}
query := words[0]
db := database.InitDB(config.Data.Database)
pages := database.SearchWordsByScore(db, util.Inflect(strings.Fields(query)))
if useURLTitles {
for i, pageData := range pages {
prettyURL, err := url.QueryUnescape(strings.TrimPrefix(strings.TrimPrefix(pageData.URL, "http://"), "https://"))
util.Check(err)
pageData.Title = prettyURL
pages[i] = pageData
}
}
view := template.Must(template.ParseFiles("html/search-template.html"))
data := SearchData{
Query: query,
Pages: pages,
}
view.Execute(res, data)
}
func aboutRoute(res http.ResponseWriter, req *http.Request, config types.Config) {
db := database.InitDB(config.Data.Database)
pageCount := util.Humanize(database.GetPageCount(db))
wordCount := util.Humanize(database.GetWordCount(db))
domainCount := database.GetDomainCount(db)
view := template.Must(template.ParseFiles("html/about-template.html"))
data := AboutData{
InstanceName: config.General.Name,
DomainCount: domainCount,
PageCount: pageCount,
TermCount: wordCount,
FilteredLink: "/filtered",
RingLink: config.General.URL,
}
view.Execute(res, data)
}
type ListData struct {
Title string
URLs []types.PageData
}
func filteredRoute(res http.ResponseWriter, req *http.Request, config types.Config) {
view := template.Must(template.ParseFiles("html/list-template.html"))
var URLs []types.PageData
for _, domain := range util.ReadList(config.Crawler.BannedDomains, "\n") {
u, err := url.Parse(domain)
if err != nil {
continue
}
u.Scheme = "https"
p := types.PageData{Title: domain, URL: u.String()}
URLs = append(URLs, p)
}
data := ListData{
Title: "Filtered Domains",
URLs: URLs,
}
view.Execute(res, data)
}
func randomRoute(res http.ResponseWriter, req *http.Request, config types.Config) {
db := database.InitDB(config.Data.Database)
link := database.GetRandomPage(db)
http.Redirect(res, req, link, http.StatusSeeOther)
}
func Serve(config types.Config) {
http.HandleFunc("/about", func(res http.ResponseWriter, req *http.Request) {
aboutRoute(res, req, config)
})
http.HandleFunc("/", func(res http.ResponseWriter, req *http.Request) {
searchRoute(res, req, config)
})
http.HandleFunc("/filtered", func(res http.ResponseWriter, req *http.Request) {
filteredRoute(res, req, config)
})
http.HandleFunc("/random", func(res http.ResponseWriter, req *http.Request) {
randomRoute(res, req, config)
})
fileserver := http.FileServer(http.Dir("html/assets/"))
http.Handle("/links/", http.StripPrefix("/links/", fileserver))
portstr := fmt.Sprintf(":%d", config.General.Port)
fmt.Println("listening on", portstr)
util.Check(http.ListenAndServe(portstr, nil))
}

35
types/types.go 100644
@@ -0,0 +1,35 @@
package types
type SearchFragment struct {
Word string
URL string
Score int
}
type PageData struct {
URL string
Title string
About string
Lang string
}
type Config struct {
General struct {
Name string `json:"name"`
URL string `json:"url"`
Port int `json:"port"`
} `json:"general"`
Data struct {
Source string `json:"source"`
Database string `json:"database"`
Heuristics string `json:"heuristics"`
Wordlist string `json:"wordlist"`
} `json:"data"`
Crawler struct {
Webring string `json:"webring"`
BannedDomains string `json:"bannedDomains"`
BannedSuffixes string `json:"bannedSuffixes"`
BoringWords string `json:"boringWords"`
BoringDomains string `json:"boringDomains"`
} `json:"crawler"`
}

136
util/util.go 100644
@@ -0,0 +1,136 @@
package util
import (
"os"
"bytes"
"encoding/json"
"fmt"
"net"
"io/ioutil"
"log"
"strings"
"lieu/types"
"github.com/jinzhu/inflection"
"github.com/komkom/toml"
)
func Inflect(words []string) []string {
var inflected []string
for _, word := range words {
inflected = append(inflected, inflection.Singular(word))
}
return inflected
}
func Check(err error) {
if err != nil {
log.Fatalln(err)
}
}
func DatabaseDoesNotExist(filepath string) {
fmt.Printf("lieu: database %s does not exist\n", filepath)
fmt.Println("lieu: try running `lieu ingest` if you have already crawled source data")
Exit()
}
func CheckFileExists(path string) bool {
_, err := os.Stat(path)
if err == nil {
return true
}
return os.IsExist(err)
}
func Humanize(n int) string {
// check millions first; otherwise every large number takes the thousands branch
if n > 1000000 {
return fmt.Sprintf("%dm", n/1000000)
} else if n > 1000 {
return fmt.Sprintf("%dk", n/1000)
}
// string(n) would interpret n as a rune; format it as a decimal number instead
return fmt.Sprintf("%d", n)
}
func Contains(arr []string, query string) bool {
for _, item := range arr {
if strings.Contains(query, item) {
return true
}
}
return false
}
func ReadList(filepath, sep string) []string {
data, err := ioutil.ReadFile(filepath)
if err != nil || len(data) == 0 {
return []string{}
}
return strings.Split(strings.TrimSuffix(string(data), sep), sep)
}
func CheckPortOpen(port int) bool {
tcpaddr, err := net.ResolveTCPAddr("tcp", fmt.Sprintf("localhost:%d", port))
if err != nil {
return false
}
l, err := net.ListenTCP("tcp", tcpaddr)
// check the error before deferring Close; on a failed listen, l is nil and closing it would panic
if err != nil {
return false
}
defer l.Close()
return true
}
func ReadConfig() types.Config {
data, err := ioutil.ReadFile("lieu.toml")
Check(err)
var conf types.Config
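// komkom/toml exposes the toml file as a json stream, which encoding/json decodes into the Config struct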
decoder := json.NewDecoder(toml.New(bytes.NewBuffer(data)))
err = decoder.Decode(&conf)
Check(err)
return conf
}
func WriteMockConfig() {
conf := []byte(`[general]
name = "Sweet Webring"
# used by the precrawl command and linked to in /about route
url = "https://example.com/"
port = 10001
[data]
# the source file should contain the crawl command's output
source = "data/crawled.txt"
# location & name of the sqlite database
database = "data/searchengine.db"
# contains words and phrases disqualifying scraped paragraphs from being presented in search results
heuristics = "data/heuristics.txt"
# aka stopwords, in the search engine biz: https://en.wikipedia.org/wiki/Stop_word
wordlist = "data/wordlist.txt"
[crawler]
# manually curated list of domains, or the output of the precrawl command
webring = "data/webring.txt"
# domains that are banned from being crawled but might originally be part of the webring
bannedDomains = "data/banned-domains.txt"
# file suffixes that are banned from being crawled
bannedSuffixes = "data/banned-suffixes.txt"
# phrases and words which won't be scraped (e.g. if contained in a link)
boringWords = "data/boring-words.txt"
# domains that won't be output as outgoing links
boringDomains = "data/boring-domains.txt"
`)
err := ioutil.WriteFile("lieu.toml", conf, 0644)
Check(err)
}
func Exit() {
os.Exit(0)
}