mirror of https://github.com/cblgh/lieu
launch
commit 28d052f4c3

@@ -0,0 +1,223 @@
#~top ignores~
node_modules/
*.vim
*bundle*.js
/html/*.html
*.sw[a-z]
config.conf
config.js
*.pdf
archives
builds
dist

#################
## Eclipse
#################
*.pydevproject
.project
.metadata
bin/
tmp/
*.tmp
*.bak
*.swp
*~.nib
local.properties
.classpath
.settings/
.loadpath

# External tool builders
.externalToolBuilders/

# Locally stored "Eclipse launch configurations"
*.launch

# CDT-specific
.cproject

# PDT-specific
.buildpath


#################
## Visual Studio
#################

## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.

# User-specific files
*.suo
*.user
*.sln.docstates

# Build results

[Dd]ebug/
[Rr]elease/
x64/
build/
[Bb]in/
[Oo]bj/

# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*

*_i.c
*_p.c
*.ilk
*.meta
*.obj
*.pch
*.pdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*.log
*.vspscc
*.vssscc
.builds
*.pidb
*.log
*.scc

# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opensdf
*.sdf
*.cachefile

# Visual Studio profiler
*.psess
*.vsp
*.vspx

# Guidance Automation Toolkit
*.gpState

# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper

# TeamCity is a build add-in
_TeamCity*

# DotCover is a Code Coverage Tool
*.dotCover

# NCrunch
*.ncrunch*
.*crunch*.local.xml

# Installshield output folder
[Ee]xpress/

# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html

# Click-Once directory
publish/

# Publish Web Output
*.Publish.xml
*.pubxml

# Windows Azure Build Output
csx
*.build.csdef

# Windows Store app package directory
AppPackages/

# Others
sql/
*.Cache
ClientBin/
[Ss]tyle[Cc]op.*
~$*
*~
*.dbmdl
*.[Pp]ublish.xml
*.pfx
*.publishsettings

# RIA/Silverlight projects
Generated_Code/

# Backup & report files from converting an old project file to a newer
# Visual Studio version. Backup files are not needed, because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm

# SQL Server files
App_Data/*.mdf
App_Data/*.ldf

#############
## Windows detritus
#############

# Windows image file caches
Thumbs.db
ehthumbs.db

# Folder config file
Desktop.ini

# Recycle Bin used on file shares
$RECYCLE.BIN/

# Mac crap
.DS_Store


#############
## Python
#############

*.py[co]

# Packages
*.egg
*.egg-info
dist/
build/
eggs/
parts/
var/
sdist/
develop-eggs/
.installed.cfg

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox

#Translations
*.mo

#Mr Developer
.mr.developer.cfg

@@ -0,0 +1,104 @@
# Lieu
_an alternative search engine_

Created in response to the environs of apathy concerning the use of hypertext
search and discovery. In Lieu, the internet is not what is made searchable, but
instead one's own neighbourhood. Put differently, Lieu is a neighbourhood search
engine, a way for personal webrings to increase serendipitous connexions.

## Goals
* Enable serendipitous discovery
* Support personal communities
* Be reusable, easily

## Usage
```
$ lieu help
Lieu: neighbourhood search engine

Commands
- precrawl (scrapes config's general.url for a list of links: <li> elements containing an anchor <a> tag)
- crawl (start crawler, crawls all urls in config's crawler.webring file)
- ingest (ingest crawled data, generates database)
- search (interactive cli for searching the database)
- host (hosts search engine over http)

Example:
    lieu precrawl > data/webring.txt
    lieu crawl > data/source.txt
    lieu ingest
    lieu host
```

Lieu's crawl & precrawl commands output to [standard
output](https://en.wikipedia.org/wiki/Standard_streams#Standard_output_(stdout)),
for easy inspection of the data. You typically want to redirect their output to
the files Lieu reads from, as defined in the config file. See below for a
typical workflow.

### Workflow
* Edit the config
* Add domains to crawl in `config.crawler.webring`
* **If you have a webpage with links you want to crawl:**
    * Set the config's `url` field to that page
    * Populate the list of domains to crawl with `precrawl`: `lieu precrawl > data/webring.txt`
* Crawl: `lieu crawl > data/source.txt`
* Create database: `lieu ingest`
* Host engine: `lieu host`

After ingesting the data with `lieu ingest`, you can also use lieu to search the
corpus in the terminal with `lieu search`.

## Config
The config file is written in [TOML](https://toml.io/en/).

```toml
[general]
name = "Merveilles Webring"
# used by the precrawl command and linked to in /about route
url = "https://webring.xxiivv.com"
port = 10001

[data]
# the source file should contain the crawl command's output
source = "data/crawled.txt"
# location & name of the sqlite database
database = "data/searchengine.db"
# contains words and phrases disqualifying scraped paragraphs from being presented in search results
heuristics = "data/heuristics.txt"
# aka stopwords, in the search engine biz: https://en.wikipedia.org/wiki/Stop_word
wordlist = "data/wordlist.txt"

[crawler]
# manually curated list of domains, or the output of the precrawl command
webring = "data/webring.txt"
# domains that are banned from being crawled but might originally be part of the webring
bannedDomains = "data/banned-domains.txt"
# file suffixes that are banned from being crawled
bannedSuffixes = "data/banned-suffixes.txt"
# phrases and words which won't be scraped (e.g. if contained in a link)
boringWords = "data/boring-words.txt"
# domains that won't be output as outgoing links
boringDomains = "data/boring-domains.txt"
```

For your own use, the following config fields should be customized:
* `name`
* `url`
* `port`
* `source`
* `webring`
* `bannedDomains`

The following config-defined files can stay as-is unless you have specific requirements:
* `database`
* `heuristics`
* `wordlist`
* `bannedSuffixes`

For a full rundown of the files and their various jobs, see the [files
description](docs/files.md).

### License
The source code is licensed `AGPL-3.0-or-later`; Inter is available under the
`SIL OPEN FONT LICENSE Version 1.1`; Noto Serif is licensed under the
`Apache License, Version 2.0`.

@@ -0,0 +1,125 @@
package main

import (
    "bufio"
    "fmt"
    "lieu/crawler"
    "lieu/database"
    "lieu/ingest"
    "lieu/server"
    "lieu/util"
    "os"
    "strings"
)

const help = `Lieu: neighbourhood search engine

Commands
- precrawl (scrapes config's general.url for a list of links: <li> elements containing an anchor <a> tag)
- crawl (start crawler, crawls all urls in config's crawler.webring file. outputs to stdout)
- ingest (ingest crawled data, generates database)
- search (interactive cli for searching the database)
- host (hosts search engine over http)

Example:
    lieu precrawl > data/webring.txt
    lieu crawl > data/source.txt
    lieu ingest
    lieu host

See the configuration file lieu.toml or
https://github.com/cblgh/lieu for more information.
`

func main() {
    exists := util.CheckFileExists("lieu.toml")
    if !exists {
        fmt.Println("lieu: can't find config, saving an example config in the working directory")
        util.WriteMockConfig()
        fmt.Println("lieu: lieu.toml written to disk")
        util.Exit()
    }
    config := util.ReadConfig()

    var cmd string
    if len(os.Args) > 1 {
        cmd = os.Args[1]
    } else {
        cmd = "search"
    }

    switch cmd {
    case "help":
        fmt.Println(help)
    case "precrawl":
        if config.General.URL == "https://example.com/" {
            fmt.Println("lieu: the url is not set (example.com)")
            util.Exit()
        }
        crawler.Precrawl(config)
    case "crawl":
        exists := util.CheckFileExists(config.Crawler.Webring)
        if !exists {
            fmt.Printf("lieu: webring file %s does not exist\n", config.Crawler.Webring)
            util.Exit()
        }
        sourceLen := len(util.ReadList(config.Crawler.Webring, "\n"))
        if sourceLen == 0 {
            fmt.Printf("lieu: nothing to crawl; the webring file %s is empty\n", config.Crawler.Webring)
            util.Exit()
        }
        crawler.Crawl(config)
    case "ingest":
        exists := util.CheckFileExists(config.Data.Source)
        if !exists {
            fmt.Printf("lieu: data source %s does not exist\n", config.Data.Source)
            fmt.Println("lieu: try running `lieu crawl`")
            util.Exit()
        }
        sourceLen := len(util.ReadList(config.Data.Source, "\n"))
        if sourceLen == 0 {
            fmt.Printf("lieu: nothing to ingest; data source %s is empty\n", config.Data.Source)
            fmt.Println("lieu: try running `lieu crawl`")
            util.Exit()
        }
        fmt.Println("lieu: creating a new database & initiating ingestion")
        ingest.Ingest(config)
    case "search":
        exists := util.CheckFileExists(config.Data.Database)
        if !exists {
            util.DatabaseDoesNotExist(config.Data.Database)
        }
        interactiveMode(config.Data.Database)
    case "host":
        exists := util.CheckFileExists(config.Data.Database)
        if !exists {
            util.DatabaseDoesNotExist(config.Data.Database)
        }
        open := util.CheckPortOpen(config.General.Port)
        if !open {
            fmt.Printf("lieu: port %d is not open; try another one\n", config.General.Port)
            util.Exit()
        }
        server.Serve(config)
    default:
        fmt.Println("Lieu: no such command, currently. Try `lieu help`")
    }
}

func interactiveMode(databasePath string) {
    db := database.InitDB(databasePath)
    reader := bufio.NewReader(os.Stdin)
    for {
        fmt.Printf("> ")
        input, err := reader.ReadString('\n')
        util.Check(err)
        input = strings.TrimSuffix(input, "\n")
        pages := database.SearchWordsByScore(db, util.Inflect(strings.Fields(input)))
        for _, pageData := range pages {
            fmt.Println(pageData.URL)
            if len(pageData.About) > 0 {
                fmt.Println(pageData.About)
            }
        }
    }
}

@@ -0,0 +1,244 @@
package crawler

import (
    "fmt"
    "lieu/types"
    "lieu/util"
    "log"
    "net/http"
    "net/url"
    "regexp"
    "strings"
    "time"

    "github.com/PuerkitoBio/goquery"
    "github.com/gocolly/colly/v2"
    "github.com/gocolly/colly/v2/queue"
)

// the following domains are excluded from crawling & indexing, typically because they have a lot of microblog pages
// (very spammy)
func getBannedDomains(path string) []string {
    return util.ReadList(path, "\n")
}

func getBannedSuffixes(path string) []string {
    return util.ReadList(path, "\n")
}

func getBoringWords(path string) []string {
    return util.ReadList(path, "\n")
}

func getBoringDomains(path string) []string {
    return util.ReadList(path, "\n")
}

func find(list []string, query string) bool {
    for _, item := range list {
        if item == query {
            return true
        }
    }
    return false
}

func getLink(target string) string {
    // remove anchor links
    if strings.Contains(target, "#") {
        target = strings.Split(target, "#")[0]
    }
    if strings.Contains(target, "?") {
        target = strings.Split(target, "?")[0]
    }
    target = strings.TrimSpace(target)
    target = strings.ToLower(target)
    // remove trailing /
    return strings.TrimSuffix(target, "/")
}

func getWebringLinks(path string) []string {
    var links []string
    candidates := util.ReadList(path, "\n")
    for _, l := range candidates {
        u, err := url.Parse(l)
        if err != nil {
            continue
        }
        if u.Scheme == "" {
            u.Scheme = "https"
        }
        links = append(links, u.String())
    }
    return links
}

func getDomains(links []string) []string {
    var domains []string
    for _, l := range links {
        u, err := url.Parse(l)
        if err != nil {
            continue
        }
        domains = append(domains, u.Hostname())
    }
    return domains
}

func findSuffix(suffixes []string, query string) bool {
    for _, suffix := range suffixes {
        if strings.HasSuffix(strings.ToLower(query), suffix) {
            return true
        }
    }
    return false
}

func cleanText(s string) string {
    s = strings.TrimSpace(s)
    s = strings.ReplaceAll(s, "\n", " ")
    s = strings.ReplaceAll(s, "|", " ")
    whitespace := regexp.MustCompile(`\p{Z}`)
    s = whitespace.ReplaceAllString(s, " ")
    return s
}

func handleIndexing(c *colly.Collector) {
    c.OnHTML("meta[name=\"keywords\"]", func(e *colly.HTMLElement) {
        fmt.Println("keywords", cleanText(e.Attr("content")), e.Request.URL)
    })

    c.OnHTML("meta[name=\"description\"]", func(e *colly.HTMLElement) {
        desc := cleanText(e.Attr("content"))
        if len(desc) > 0 {
            fmt.Println("desc", desc, e.Request.URL)
        }
    })

    c.OnHTML("html[lang]", func(e *colly.HTMLElement) {
        lang := cleanText(e.Attr("lang"))
        if len(lang) > 0 {
            fmt.Println("lang", lang, e.Request.URL)
        }
    })

    // get page title
    c.OnHTML("title", func(e *colly.HTMLElement) {
        fmt.Println("title", cleanText(e.Text), e.Request.URL)
    })

    c.OnHTML("body", func(e *colly.HTMLElement) {
        paragraph := cleanText(e.DOM.Find("p").First().Text())
        if len(paragraph) < 1500 && len(paragraph) > 0 {
            fmt.Println("para", paragraph, e.Request.URL)
        }
        // get all relevant page headings
        collectHeadingText("h1", e)
        collectHeadingText("h2", e)
        collectHeadingText("h3", e)
    })
}

func collectHeadingText(heading string, e *colly.HTMLElement) {
    for _, headingText := range e.ChildTexts(heading) {
        if len(headingText) < 500 {
            fmt.Println(heading, cleanText(headingText), e.Request.URL)
        }
    }
}

func Precrawl(config types.Config) {
    res, err := http.Get(config.General.URL)
    util.Check(err)
    defer res.Body.Close()

    if res.StatusCode != 200 {
        log.Fatal("status not 200")
    }

    doc, err := goquery.NewDocumentFromReader(res.Body)
    util.Check(err)

    items := make([]string, 0)
    doc.Find("li").Each(func(i int, s *goquery.Selection) {
        if domain, exists := s.Find("a").Attr("href"); exists {
            items = append(items, domain)
        }
    })

    BANNED := getBannedDomains(config.Crawler.BannedDomains)
    for _, item := range items {
        link := getLink(item)
        u, err := url.Parse(link)
        // invalid link
        if err != nil {
            continue
        }
        domain := u.Hostname()
        if find(BANNED, domain) {
            continue
        }
        fmt.Println(link)
    }
}

func Crawl(config types.Config) {
    SUFFIXES := getBannedSuffixes(config.Crawler.BannedSuffixes)
    links := getWebringLinks(config.Crawler.Webring)
    domains := getDomains(links)
    initialURL, err := url.Parse(config.General.URL)
    util.Check(err)
    initialDomain := initialURL.Hostname()

    // TODO: introduce c2 for scraping links (with depth 1) linked to from webring domains
    // instantiate default collector
    c := colly.NewCollector(
        colly.MaxDepth(3),
    )

    q, _ := queue.New(
        5, /* threads */
        &queue.InMemoryQueueStorage{MaxSize: 100000},
    )

    for _, link := range links {
        q.AddURL(link)
    }

    c.AllowedDomains = domains
    c.AllowURLRevisit = false
    c.DisallowedDomains = getBannedDomains(config.Crawler.BannedDomains)

    delay, _ := time.ParseDuration("200ms")
    c.Limit(&colly.LimitRule{DomainGlob: "*", Delay: delay, Parallelism: 3})

    boringDomains := getBoringDomains(config.Crawler.BoringDomains)
    boringWords := getBoringWords(config.Crawler.BoringWords)

    // on every a element which has an href attribute, call callback
    c.OnHTML("a[href]", func(e *colly.HTMLElement) {
        link := getLink(e.Attr("href"))
        if findSuffix(SUFFIXES, link) {
            return
        }
        link = e.Request.AbsoluteURL(link)
        u, err := url.Parse(link)
        // log which site links to what
        if err == nil && !util.Contains(boringWords, link) && !util.Contains(boringDomains, link) {
            outgoingDomain := u.Hostname()
            currentDomain := e.Request.URL.Hostname()
            if !find(domains, outgoingDomain) {
                fmt.Println("non-webring-link", link, e.Request.URL)
                // solidarity! someone in the webring linked to someone else in it
            } else if outgoingDomain != currentDomain && outgoingDomain != initialDomain && currentDomain != initialDomain {
                fmt.Println("webring-link", link, e.Request.URL)
            }
        }
        // only visits links from AllowedDomains
        q.AddURL(link)
    })

    handleIndexing(c)

    // start scraping
    q.Run(c)
}

@@ -0,0 +1,17 @@
.xml
.pdf
.rss
.jpg
.png
.gif
.avi
.webm
.mp4
.ogg
.mp3
.zip
.exe
.txt
.asc
.key
.csv

@@ -0,0 +1,19 @@
instagram.com
twitter.com
linkedin.com
facebook.com
getpoole.com
jekyllrb.com
twitter.com
amazon.com
google.com
microsoft.com
youtube.com
github.io
meetup.com
ebay.com
t.co
a.co
wsj.com
creativecommons.org
patreon.com

@@ -0,0 +1,4 @@
bitcoin
javascript:
mailto:
subscribe

@@ -0,0 +1,10 @@
incoming
tagged
edited
updated
last update
last edit
©
(c)
all rights reserved
licensed under

File diff suppressed because one or more lines are too long

@@ -0,0 +1,222 @@
package database

/* example query
SELECT p.url
FROM inv_index inv
INNER JOIN pages p ON p.url = inv.url
WHERE inv.word = "project";

select url from inv_index where word = "esoteric" group by url order by sum(score) desc limit 15;

select url from inv_index where word = "<word>" group by url order by sum(score) desc;
*/

import (
    "database/sql"
    "fmt"
    "lieu/types"
    "lieu/util"
    "log"
    "net/url"
    "strings"

    _ "github.com/mattn/go-sqlite3"
)

func InitDB(filepath string) *sql.DB {
    db, err := sql.Open("sqlite3", filepath)
    if err != nil {
        log.Fatalln(err)
    }
    if db == nil {
        log.Fatalln("db is nil")
    }
    createTables(db)
    return db
}

func createTables(db *sql.DB) {
    // create the tables if they don't exist
    queries := []string{`
    CREATE TABLE IF NOT EXISTS domains (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        domain TEXT NOT NULL UNIQUE
    );
    `,
        `
    CREATE TABLE IF NOT EXISTS pages (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        url TEXT NOT NULL UNIQUE,
        title TEXT,
        about TEXT,
        lang TEXT,
        domain TEXT NOT NULL,
        FOREIGN KEY(domain) REFERENCES domains(domain)
    );
    `,
        `
    CREATE TABLE IF NOT EXISTS external_pages (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        url TEXT NOT NULL UNIQUE,
        domain TEXT NOT NULL,
        title TEXT
    );
    `,
        `
    CREATE TABLE IF NOT EXISTS inv_index (
        word TEXT NOT NULL,
        score INTEGER NOT NULL,
        url TEXT NOT NULL,
        FOREIGN KEY(url) REFERENCES pages(url)
    )`,
    }

    for _, query := range queries {
        if _, err := db.Exec(query); err != nil {
            log.Fatalln(err)
        }
    }
}

/* TODO: filters
lang:en|fr|en|<..>
site:wiki.xxiivv.com, site:cblgh.org
nosite:excluded-domain.com

"word1 word2 word3" strict query

query params:
&order=score, &order=count
&outgoing=true
*/

func SearchWordsByScore(db *sql.DB, words []string) []types.PageData {
    return searchWords(db, words, true)
}

func SearchWordsByCount(db *sql.DB, words []string) []types.PageData {
    return searchWords(db, words, false)
}

func GetDomainCount(db *sql.DB) int {
    return countQuery(db, "domains")
}

func GetPageCount(db *sql.DB) int {
    return countQuery(db, "pages")
}

func GetWordCount(db *sql.DB) int {
    return countQuery(db, "inv_index")
}

func GetRandomPage(db *sql.DB) string {
    rows, err := db.Query("SELECT url FROM pages ORDER BY RANDOM() LIMIT 1;")
    util.Check(err)

    var link string
    for rows.Next() {
        err = rows.Scan(&link)
        util.Check(err)
    }
    return link
}

func countQuery(db *sql.DB, table string) int {
    rows, err := db.Query(fmt.Sprintf("SELECT COUNT(*) FROM %s;", table))
    util.Check(err)
    var count int
    for rows.Next() {
        err = rows.Scan(&count)
        util.Check(err)
    }
    return count
}

func searchWords(db *sql.DB, words []string, searchByScore bool) []types.PageData {
    var wordlist []string
    var args []interface{}
    for _, word := range words {
        wordlist = append(wordlist, "word = ?")
        args = append(args, strings.ToLower(word))
    }

    orderType := "SUM(score)"
    if !searchByScore {
        orderType = "COUNT(*)"
    }

    query := fmt.Sprintf(`
    SELECT p.url, p.about, p.title
    FROM inv_index inv INNER JOIN pages p ON inv.url = p.url
    WHERE %s
    GROUP BY inv.url
    ORDER BY %s
    DESC
    LIMIT 15
    `, strings.Join(wordlist, " OR "), orderType)

    stmt, err := db.Prepare(query)
    util.Check(err)
    defer stmt.Close()

    rows, err := stmt.Query(args...)
    util.Check(err)
    var pageData types.PageData
    var pages []types.PageData
    for rows.Next() {
        if err := rows.Scan(&pageData.URL, &pageData.About, &pageData.Title); err != nil {
            log.Fatalln(err)
        }
        pages = append(pages, pageData)
    }
    return pages
}

func InsertManyDomains(db *sql.DB, pages []types.PageData) {
    values := make([]string, 0, len(pages))
    args := make([]interface{}, 0, len(pages))

    for _, b := range pages {
        values = append(values, "(?)")
        u, err := url.Parse(b.URL)
        util.Check(err)
        args = append(args, u.Hostname())
    }

    stmt := fmt.Sprintf(`INSERT OR IGNORE INTO domains(domain) VALUES %s`, strings.Join(values, ","))
    _, err := db.Exec(stmt, args...)
    util.Check(err)
}

func InsertManyPages(db *sql.DB, pages []types.PageData) {
    values := make([]string, 0, len(pages))
    args := make([]interface{}, 0, len(pages))

    for _, b := range pages {
        // url, title, lang, about, domain
        values = append(values, "(?, ?, ?, ?, ?)")
        u, err := url.Parse(b.URL)
        util.Check(err)
        args = append(args, b.URL, b.Title, b.Lang, b.About, u.Hostname())
    }

    stmt := fmt.Sprintf(`INSERT OR IGNORE INTO pages(url, title, lang, about, domain) VALUES %s`, strings.Join(values, ","))
    _, err := db.Exec(stmt, args...)
    util.Check(err)
}

func InsertManyWords(db *sql.DB, batch []types.SearchFragment) {
    values := make([]string, 0, len(batch))
    args := make([]interface{}, 0, len(batch))

    for _, b := range batch {
        pageurl := strings.TrimSuffix(b.URL, "/")
        values = append(values, "(?, ?, ?)")
        args = append(args, b.Word, pageurl, b.Score)
    }

    stmt := fmt.Sprintf(`INSERT OR IGNORE INTO inv_index(word, url, score) VALUES %s`, strings.Join(values, ","))
    _, err := db.Exec(stmt, args...)
    util.Check(err)
}

@@ -0,0 +1,121 @@
# Files
_what the purposes are of all those damn files_

Lieu is based on a few files, which in turn configure various behaviours in the
**crawler** (visits urls & extracts relevant elements) and the **ingester**
(converts the crawled source data into database fields). The basic reason is to
minimize hardcoded assumptions in the source, furthering Lieu's reuse.

Below, I will refer to the files by their config-defined names. Here's the
config example from the [README](../README.md), again.

```toml
[general]
name = "Merveilles Webring"
# used by the precrawl command and linked to in /about route
url = "https://webring.xxiivv.com"
port = 10001

[data]
# the source file should contain the crawl command's output
source = "data/crawled.txt"
# location & name of the sqlite database
database = "data/searchengine.db"
# contains words and phrases disqualifying scraped paragraphs from being presented in search results
heuristics = "data/heuristics.txt"
# aka stopwords, in the search engine biz: https://en.wikipedia.org/wiki/Stop_word
wordlist = "data/wordlist.txt"

[crawler]
# manually curated list of domains, or the output of the precrawl command
webring = "data/webring.txt"
# domains that are banned from being crawled but might originally be part of the webring
bannedDomains = "data/banned-domains.txt"
# file suffixes that are banned from being crawled
bannedSuffixes = "data/banned-suffixes.txt"
# phrases and words which won't be scraped (e.g. if contained in a link)
boringWords = "data/boring-words.txt"
# domains that won't be output as outgoing links
boringDomains = "data/boring-domains.txt"
```

## HTML
Before we start, a final note on some other types of files in use. The HTML
templates, used when presenting the search engine in the browser, are all
available in the [`html`](../html) folder. The includes—currently only css
& font files—are available in [`html/assets`](../html/assets).

## `[crawler]`
#### `webring`
Defines which domains will be crawled for pages. As of this writing, no domains
outside of this file will be crawled.

You can populate the `webring` file manually or by precrawling an existing
webpage that contains all of the domains you want to crawl:

    lieu precrawl > data/webring.txt

#### `bannedDomains`
A list of domains that will not be crawled. This means that if they are present
in the `webring` file, they will be skipped over as candidates for crawling.

The rationale is that some of the domains of a webring may be unsuitable for
ingestion into the database. I typically find this is the case for domains that
include microblogs with 100s or 1000s of one-line pages—needlessly gunking up
the search results without providing anything of interest outside the
individual creating the logs.

#### `bannedSuffixes`
Eliminates html links that end with suffixes present in this file. Typically I
want to avoid crawling links to media formats such as `.mp4`, and other types
of non-html documents.

It's fine to leave this file intact with its defaults.

#### `boringWords`
This file is a bit more specific. It contains words which, if present in a
link, will prevent the link from being logged, the reason being that their
presence suggests the link target is boring—irrelevant for this application of
the search engine.

This can be `javascript:` script links, or other types of content that are
less relevant to the focus of the search engine & webring.

Link data of this type is as yet unused in Lieu's ingestion.

#### `boringDomains`
Like `boringWords`, except it contains a list of domains which are banned from
having their links be logged, typically because they are deemed less relevant
to the focus of the search engine.

Link data of this type is as yet unused in Lieu's ingestion.

## `[data]`
#### `source`
Contains the linewise data that was produced by the crawler. The first word
identifies the type of data and the last word identifies the page the data
originated from.

Example:
```
h2 Prelude https://cblgh.org/articles/four-nights-in-tornio.html
```

* An `<h2>` tag was scraped,
* its contents were `Prelude`, and
* the originating article was https://cblgh.org/articles/four-nights-in-tornio.html
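
To make the format concrete, here is a minimal, self-contained sketch of how
such a line can be split back into its three parts. It is illustrative only:
`CrawlEntry` and `parseLine` are hypothetical names, not part of Lieu's
`ingest` package (shown later in this commit).

```go
package main

import (
    "fmt"
    "strings"
)

// CrawlEntry is a hypothetical container for one line of crawler output:
// the element type, the scraped text, and the originating page.
type CrawlEntry struct {
    Kind string // e.g. "h2", "title", "para"
    Text string // the scraped contents
    URL  string // the page the data came from
}

// parseLine splits a crawl line on whitespace: the first field is the type,
// the last field is the url, and everything in between is the payload.
func parseLine(line string) CrawlEntry {
    fields := strings.Fields(line)
    if len(fields) < 3 {
        return CrawlEntry{}
    }
    return CrawlEntry{
        Kind: fields[0],
        Text: strings.Join(fields[1:len(fields)-1], " "),
        URL:  fields[len(fields)-1],
    }
}

func main() {
    entry := parseLine("h2 Prelude https://cblgh.org/articles/four-nights-in-tornio.html")
    fmt.Printf("%s | %s | %s\n", entry.Kind, entry.Text, entry.URL)
    // h2 | Prelude | https://cblgh.org/articles/four-nights-in-tornio.html
}
```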

#### `database`
The location the sqlite3 database will be created & read from.

#### `heuristics`
Heuristics contains a list of words or phrases which disqualify scraped
paragraphs from being used as descriptive text in Lieu's search results.
Typically excluded are e.g. paragraphs which contain copyright symbols—as that
indicates we have scraped the bottom-most paragraph, i.e. the page was likely a
short stub, with a better content description elsewhere.
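
The gist of the check can be sketched like this (a hedged illustration, not the
ingester's literal code; `disqualified` is a made-up name, and the
case-insensitive matching is an assumption):

```go
package main

import (
    "fmt"
    "strings"
)

// disqualified reports whether a scraped paragraph contains any of the words
// or phrases from the heuristics file; if it does, the paragraph should not
// be used as a page's descriptive text.
func disqualified(paragraph string, heuristics []string) bool {
    lowered := strings.ToLower(paragraph)
    for _, phrase := range heuristics {
        if strings.Contains(lowered, strings.ToLower(phrase)) {
            return true
        }
    }
    return false
}

func main() {
    // a few entries from the default heuristics file in this commit
    heuristics := []string{"all rights reserved", "licensed under", "(c)"}
    fmt.Println(disqualified("Short stub. (c) 2021, all rights reserved.", heuristics)) // true
    fmt.Println(disqualified("An essay about four nights in Tornio.", heuristics))      // false
}
```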

#### `wordlist`
Also known as [stopwords](https://en.wikipedia.org/wiki/Stop_word)—words which
are stopped from entering the search index. The default wordlist consists of the
1000 or so most common English words, albeit curated slightly to still allow for
interesting concepts and verbs—such as `reading` and `books`, for example.
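
In the ingester this is handled by `filterCommonWords` (in the `ingest` package
later in this commit), combined with singularization via the `inflection`
library. A condensed sketch of just the stopword part, with hypothetical names:

```go
package main

import "fmt"

// dropStopwords keeps only the words that are longer than one character and
// not present in the wordlist, mirroring the effect of filterCommonWords.
func dropStopwords(words, wordlist []string) []string {
    stop := make(map[string]bool, len(wordlist))
    for _, w := range wordlist {
        stop[w] = true
    }
    var kept []string
    for _, w := range words {
        if len(w) == 1 || stop[w] {
            continue
        }
        kept = append(kept, w)
    }
    return kept
}

func main() {
    wordlist := []string{"the", "of", "a"}
    fmt.Println(dropStopwords([]string{"the", "art", "of", "reading", "a", "book"}, wordlist))
    // [art reading book]
}
```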

@@ -0,0 +1,11 @@
module lieu

go 1.14

require (
    github.com/PuerkitoBio/goquery v1.5.1
    github.com/gocolly/colly/v2 v2.1.0
    github.com/jinzhu/inflection v1.0.0
    github.com/komkom/toml v0.0.0-20210129103441-ff0648d25a4b
    github.com/mattn/go-sqlite3 v1.14.6
)

@@ -0,0 +1,144 @@
cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE=
github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
github.com/anaskhan96/soup v1.2.4 h1:or+sKs9QbzJGZVTYFmTs2VBateEywoq00a6K14z331E=
github.com/anaskhan96/soup v1.2.4/go.mod h1:6YnEp9A2yywlYdM4EgDz9NEHclocMepEtku7wg6Cq3s=
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/andybalholm/cascadia v1.2.0 h1:vuRCkM5Ozh/BfmsaTm26kbjm0mIOM3yS5Ek/F5h18aE=
github.com/andybalholm/cascadia v1.2.0/go.mod h1:YCyR8vOZT9aZ1CHEd8ap0gMVm2aFgxBp0T0eFw1RUQY=
github.com/antchfx/htmlquery v1.2.3 h1:sP3NFDneHx2stfNXCKbhHFo8XgNjCACnU/4AO5gWz6M=
github.com/antchfx/htmlquery v1.2.3/go.mod h1:B0ABL+F5irhhMWg54ymEZinzMSi0Kt3I2if0BLYa3V0=
github.com/antchfx/xmlquery v1.2.4 h1:T/SH1bYdzdjTMoz2RgsfVKbM5uWh3gjDYYepFqQmFv4=
github.com/antchfx/xmlquery v1.2.4/go.mod h1:KQQuESaxSlqugE2ZBcM/qn+ebIpt+d+4Xx7YcSGAIrM=
github.com/antchfx/xpath v1.1.6/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk=
github.com/antchfx/xpath v1.1.8 h1:PcL6bIX42Px5usSx6xRYw/wjB3wYGkj0MJ9MBzEKVgk=
github.com/antchfx/xpath v1.1.8/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk=
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8=
github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI=
github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA=
github.com/gocolly/colly/v2 v2.1.0 h1:k0DuZkDoCsx51bKpRJNEmcxcp+W5N8ziuwGaSDuFoGs=
github.com/gocolly/colly/v2 v2.1.0/go.mod h1:I2MuhsLjQ+Ex+IzK3afNS8/1qP3AedHOusRPcRdC5o0=
github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q=
github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e h1:1r7pUrabqp18hOBcwBwiTsbnFeTZHV9eER/QT5JVZxY=
github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A=
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8=
github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA=
github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs=
github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w=
github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0=
github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8=
github.com/golang/protobuf v1.4.2 h1:+Z5KGCizgyZCbGh1KZqA0fcLLkwbsjIzS4aV2v7wJX0=
github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI=
github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/jawher/mow.cli v1.1.0/go.mod h1:aNaQlc7ozF3vw6IJ2dHjp2ZFiA4ozMIYY6PyuRJwlUg=
github.com/jinzhu/inflection v1.0.0 h1:K317FqzuhWc8YvSVlFMCCUb36O/S9MCKRDI7QkRKD/E=
github.com/jinzhu/inflection v1.0.0/go.mod h1:h+uFLlag+Qp1Va5pdKtLDYj+kHp5pxUVkryuEj+Srlc=
github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
github.com/komkom/toml v0.0.0-20210129103441-ff0648d25a4b h1:UmqyLHqfYJjkiuA2hddGeovwAGOCBm5gOTVKuxtvoMo=
github.com/komkom/toml v0.0.0-20210129103441-ff0648d25a4b/go.mod h1:wLcNqnyr6riTbnFObg4o2/GemTCso9AnsUdLsMsdspw=
github.com/mattn/go-sqlite3 v1.14.6 h1:dNPt6NO46WmLVt2DLNpwczCmdV5boIZ6g/tlDrlRUbg=
github.com/mattn/go-sqlite3 v1.14.6/go.mod h1:NyWgC/yNuGj7Q9rpYnZvas74GogHl5/Z4A/KQRfk6bU=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca h1:NugYot0LIVPxTvN8n+Kvkn6TrbMyxQiuvKdEwFdR9vI=
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/temoto/robotstxt v1.1.1 h1:Gh8RCs8ouX3hRSxxK7B1mO5RFByQ4CmJZDwgom++JaA=
github.com/temoto/robotstxt v1.1.1/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU=
golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200114155413-6afb5195e5aa/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
golang.org/x/net v0.0.0-20200602114024-627f9648deb9 h1:pNX+40auqi2JqRfOP1akLGtYcn15TUbkhwuCO3foqqM=
golang.org/x/net v0.0.0-20200602114024-627f9648deb9/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
golang.org/x/net v0.0.0-20201021035429-f5854403a974 h1:IX6qOQeG5uLjB/hjjwjedwfjND0hgjPMMyO1RoIXQNI=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
golang.org/x/text v0.3.3 h1:cokOdA+Jmi5PJGXLlLllQSgYigAEfHXJAERHVMaCc2k=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY=
golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q=
golang.org/x/tools v0.0.0-20190606124116-d0a3d012864b/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20210114065538-d78b04bdf963/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
google.golang.org/appengine v1.6.6 h1:lMO5rYAqUxkmaj76jAkRUvt5JZgFymx/+Q5Mzfivuhc=
google.golang.org/appengine v1.6.6/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc=
google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc=
google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc=
google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo=
google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c=
google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg=
google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk=
google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8=
google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0=
google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM=
google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE=
google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo=
google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
google.golang.org/protobuf v1.24.0 h1:UhZDfRO8JRQru4/+LlLE0BRKGF8L+PICnvYZmx/fEGA=
google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=

Binary file not shown.
Binary file not shown.
Binary file not shown.

@@ -0,0 +1,24 @@
@import url("base.css");

html {
  max-width: 31rem;
}

h1 {
  font-size: 3rem;
  margin-bottom: 0.5rem;
}

h2 {
  font-family: "Noto Serif";
  font-style: italic;
  font-weight: 400;
  font-size: 1.5rem;
  margin-top: 0;
  margin-bottom: 2rem;
}

.lieu {
  font-family: "Noto Serif";
  font-weight: 400;
}

@@ -0,0 +1,162 @@
@import url('inter-ui-web/inter-ui.css');

@font-face {
  font-family: "Noto Serif";
  src: url("NotoSerif-Bold.ttf");
}

@font-face {
  font-family: "Noto Serif";
  font-weight: 400;
  src: url("NotoSerif-Regular.ttf");
}

@font-face {
  font-family: "Noto Serif";
  font-weight: 400;
  font-style: italic;
  src: url("NotoSerif-Italic.ttf");
}

:root {
  --primary: #fefefe;
  --secondary: #000;
  /* alt colorscheme: 1 */
  /* --primary: red; */
  /* --secondary: #fefefe; */
  /* alt colorscheme: 2 */
  /* --primary: #F35363; */
  /* --secondary: black; */
}

li {
  list-style-type: circle;
}

ul {
  margin: 0;
  padding-left: 1rem;
}

html {
  font-family: "Inter UI", sans-serif;
  background: var(--secondary);
  color: var(--primary);
  max-width: 650px;
  padding-bottom: 2rem;
  padding-left: 2rem;
  margin-top: 2rem;
}

body {
  margin: 0;
}

h1 {
  font-family: "Noto Serif";
  font-weight: 400;
  font-size: 3rem;
  margin-bottom: 1rem;
  margin-top: 0;
}

h1 > a, h1 > a:hover {
  border-bottom: none;
}

a {
  cursor: pointer;
  color: var(--primary);
  text-decoration: none;
  border-bottom: 0.1rem solid var(--primary);
  word-wrap: break-word;
}

a:hover {
  border-bottom-style: dotted;
}

p {
  hyphens: auto;
  margin-bottom: 1.5rem;
}

.entry {
  -webkit-column-break-inside: avoid;
  -moz-column-break-inside: avoid;
  -moz-page-break-inside: avoid;
  page-break-inside: avoid;
  break-inside: avoid-column;
}

.search-container {
  display: grid;
  margin-bottom: 2rem;
  height: 2.5rem;
  align-items: center;
  grid-template-columns: 16rem 3rem;
  grid-auto-flow: column;
  grid-column-gap: .5rem;
}

.search-box {
  font-size: 1rem;
  border-radius: 0.1rem;
  padding: .5rem;
  padding-left: 0.75rem;
  border: 0;
  color: var(--secondary);
  background: var(--primary);
}

.search-button {
  font-size: 2rem;
  color: var(--primary);
  background: var(--secondary);
  border: 0;
  cursor: pointer;
  border-radius: 2px;
  transition: opacity 150ms;
}

.search-button:hover {
  opacity: 0.5;
  transition: opacity 150ms;
}

.about-link {
  position: absolute;
  top: 1rem;
  right: 1rem;
  font-style: normal;
}

@media
only screen
and (min-device-width : 320px)
and (max-device-width : 720px)
{
  html {
    padding-left: 0.75rem;
    padding-right: 0.75rem;
    font-size: 30pt;
    max-width: 100vw;
  }
}

@media
only screen
and (min-device-width : 320px)
and (max-device-width : 374px) {
  html {
    font-size: 40pt;
  }
}

/*
@media(prefers-color-scheme: light) {
  :root {
    --primary: #000;
    --secondary: #fefefe;
  }
}
*/

Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

@@ -0,0 +1,92 @@
Copyright (c) 2016-2018 The Inter UI Project Authors (me@rsms.me)
|
||||
|
||||
This Font Software is licensed under the SIL Open Font License, Version 1.1.
|
||||
This license is copied below, and is also available with a FAQ at:
|
||||
http://scripts.sil.org/OFL
|
||||
|
||||
-----------------------------------------------------------
|
||||
SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007
|
||||
-----------------------------------------------------------
|
||||
|
||||
PREAMBLE
|
||||
The goals of the Open Font License (OFL) are to stimulate worldwide
|
||||
development of collaborative font projects, to support the font creation
|
||||
efforts of academic and linguistic communities, and to provide a free and
|
||||
open framework in which fonts may be shared and improved in partnership
with others.

The OFL allows the licensed fonts to be used, studied, modified and
redistributed freely as long as they are not sold by themselves. The
fonts, including any derivative works, can be bundled, embedded,
redistributed and/or sold with any software provided that any reserved
names are not used by derivative works. The fonts and derivatives,
however, cannot be released under any other type of license. The
requirement for fonts to remain under this license does not apply
to any document created using the fonts or their derivatives.

DEFINITIONS
"Font Software" refers to the set of files released by the Copyright
Holder(s) under this license and clearly marked as such. This may
include source files, build scripts and documentation.

"Reserved Font Name" refers to any names specified as such after the
copyright statement(s).

"Original Version" refers to the collection of Font Software components as
distributed by the Copyright Holder(s).

"Modified Version" refers to any derivative made by adding to, deleting,
or substituting -- in part or in whole -- any of the components of the
Original Version, by changing formats or by porting the Font Software to a
new environment.

"Author" refers to any designer, engineer, programmer, technical
writer or other person who contributed to the Font Software.

PERMISSION AND CONDITIONS
Permission is hereby granted, free of charge, to any person obtaining
a copy of the Font Software, to use, study, copy, merge, embed, modify,
redistribute, and sell modified and unmodified copies of the Font
Software, subject to the following conditions:

1) Neither the Font Software nor any of its individual components,
in Original or Modified Versions, may be sold by itself.

2) Original or Modified Versions of the Font Software may be bundled,
redistributed and/or sold with any software, provided that each copy
contains the above copyright notice and this license. These can be
included either as stand-alone text files, human-readable headers or
in the appropriate machine-readable metadata fields within text or
binary files as long as those fields can be easily viewed by the user.

3) No Modified Version of the Font Software may use the Reserved Font
Name(s) unless explicit written permission is granted by the corresponding
Copyright Holder. This restriction only applies to the primary font name as
presented to the users.

4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font
Software shall not be used to promote, endorse or advertise any
Modified Version, except to acknowledge the contribution(s) of the
Copyright Holder(s) and the Author(s) or with their explicit written
permission.

5) The Font Software, modified or unmodified, in part or in whole,
must be distributed entirely under this license, and must not be
distributed under any other license. The requirement for fonts to
remain under this license does not apply to any document created
using the Font Software.

TERMINATION
This license becomes null and void if any of the above conditions are
not met.

DISCLAIMER
THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE
COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL
DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM
OTHER DEALINGS IN THE FONT SOFTWARE.

@@ -0,0 +1,13 @@
@font-face {
    font-family: 'Inter UI';
    font-style: normal;
    src: url("Inter-UI-Regular.woff2") format("woff2"),
         url("Inter-UI-Regular.woff") format("woff");
}

@font-face {
    font-family: 'Inter UI';
    font-style: italic;
    src: url("Inter-UI-Italic.woff2") format("woff2"),
         url("Inter-UI-Italic.woff") format("woff");
}

@@ -0,0 +1,27 @@
@import url('base.css');

main {
    columns: 2;
}

.entry {
    -webkit-column-break-inside: avoid;
    -moz-column-break-inside: avoid;
    -moz-page-break-inside: avoid;
    page-break-inside: avoid;
    break-inside: avoid-column;
}

.link {
    font-style: italic;
}

@media
only screen
and (min-device-width: 320px)
and (max-device-width: 720px)
{
    main {
        columns: 1 !important;
    }
}

@@ -0,0 +1,24 @@
@import url("about.css");

html {
    max-width: 100vw;
}

h2 {
    margin-bottom: 1rem;
}

main {
    display: grid;
    justify-items: center;
    align-items: center;
    margin-top: 10rem;
}

.search-container {
    grid-template-columns: 19rem 3rem;
}

.lieu-container {
    justify-items: start;
}

@@ -0,0 +1,205 @@
package ingest

import (
    "bufio"
    "database/sql"
    "fmt"
    "log"
    "net/url"
    "os"
    "regexp"
    "strings"

    "github.com/jinzhu/inflection"

    "lieu/database"
    "lieu/types"
    "lieu/util"
)

func partitionSentence(s string) []string {
    punctuation := regexp.MustCompile(`\p{P}`)
    whitespace := regexp.MustCompile(`\p{Z}`)
    invisible := regexp.MustCompile(`\p{C}`)
    symbols := regexp.MustCompile(`\p{S}`)

    s = punctuation.ReplaceAllString(s, " ")
    s = whitespace.ReplaceAllString(s, " ")
    s = invisible.ReplaceAllString(s, " ")
    s = symbols.ReplaceAllString(s, " ")
    s = strings.ReplaceAll(s, "|", " ")
    s = strings.ReplaceAll(s, "/", " ")
    return strings.Fields(s)
}
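
// A quick illustration of partitionSentence (hypothetical input, shown
// only for clarity; punctuation, symbols and slashes all become spaces):
//
//    partitionSentence("Hello, world: around/about!")
//    // → ["Hello" "world" "around" "about"]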

func filterCommonWords(words, wordlist []string) []string {
    var filtered []string
    for _, word := range words {
        // ingested word was too common, skip it
        if len(word) == 1 || find(wordlist, word) {
            continue
        }
        filtered = append(filtered, inflection.Singular(word))
    }
    return filtered
}

func find(slice []string, sought string) bool {
    for _, item := range slice {
        if item == sought {
            return true
        }
    }
    return false
}

func performAboutHeuristic(heuristicPath, phrase string) bool {
    disallowed := util.ReadList(heuristicPath, "\n")
    ok := !util.Contains(disallowed, phrase)
    return ok && len(phrase) > 20
}
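
// Ingest expects each line of the source file to be of the shape
// `<token> <payload...> <url>`: the token is the first field, the url is
// the last field, and everything in between is the payload. An
// illustrative line (hypothetical data):
//
//    title My Webring Homepage https://example.org/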

func Ingest(config types.Config) {
    // start from a clean slate: remove any database left over from a previous run
    if _, err := os.Stat(config.Data.Database); err == nil || os.IsExist(err) {
        err = os.Remove(config.Data.Database)
        util.Check(err)
    }

    db := database.InitDB(config.Data.Database)

    wordlist := util.ReadList(config.Data.Wordlist, "|")

    buf, err := os.Open(config.Data.Source)
    util.Check(err)

    defer func() {
        err = buf.Close()
        util.Check(err)
    }()

    pages := make(map[string]types.PageData)
    var count int
    var batchsize = 100
    batch := make([]types.SearchFragment, 0)

    scanner := bufio.NewScanner(buf)
    for scanner.Scan() {
        line := scanner.Text()
        firstSpace := strings.Index(line, " ")
        lastSpace := strings.LastIndex(line, " ")

        if len(line) == 0 || firstSpace == -1 {
            continue
        }

        pageurl := strings.ToLower(strings.TrimSuffix(strings.TrimSpace(line[lastSpace:]), "/"))
        if !strings.HasPrefix(pageurl, "http") {
            continue
        }

        var page types.PageData
        if data, exists := pages[pageurl]; exists {
            page = data
        } else {
            page.URL = pageurl
        }

        token := line[0:firstSpace]
        rawdata := strings.TrimSpace(line[firstSpace:lastSpace])
        payload := strings.ToLower(rawdata)

        var processed []string
        score := 1
        switch token {
        case "title":
            if len(page.About) == 0 {
                page.About = rawdata
            }
            score = 5
            page.Title = rawdata
            processed = partitionSentence(payload)
        case "h1":
            if len(page.About) == 0 {
                page.About = rawdata
            }
            fallthrough
        case "h2":
            fallthrough
        case "h3":
            score = 15
            processed = partitionSentence(payload)
        case "desc":
            if len(page.About) < 30 && len(rawdata) < 100 {
                page.About = rawdata
            }
            processed = partitionSentence(payload)
        case "para":
            if performAboutHeuristic(config.Data.Heuristics, payload) {
                page.About = rawdata
            }
            processed = partitionSentence(payload)
        case "lang":
            page.Lang = rawdata
        case "keywords":
            processed = strings.Split(strings.ReplaceAll(payload, ", ", ","), ",")
        default:
            continue
        }

        pages[pageurl] = page
        processed = filterCommonWords(processed, wordlist)
        count += len(processed)

        for _, word := range processed {
            batch = append(batch, types.SearchFragment{Word: word, URL: pageurl, Score: score})
        }
        if token == "title" {
            // only extract path segments once per url.
            // we do it here because every page is virtually guaranteed to have a title attr &
            // it only appears once
            for _, word := range extractPathSegments(pageurl) {
                batch = append(batch, types.SearchFragment{Word: word, URL: pageurl, Score: 2})
            }
        }

        if len(pages) > batchsize {
            ingestBatch(db, batch, pages)
            batch = make([]types.SearchFragment, 0)
            // TODO: make sure we don't partially insert any page data
            pages = make(map[string]types.PageData)
        }
    }
    fmt.Printf("ingested %d words\n", count)

    err = scanner.Err()
    util.Check(err)
}

func ingestBatch(db *sql.DB, batch []types.SearchFragment, pageMap map[string]types.PageData) {
    pages := make([]types.PageData, len(pageMap))
    i := 0
    for k := range pageMap {
        pages[i] = pageMap[k]
        i++
    }
    log.Println("starting to ingest batch")
    database.InsertManyDomains(db, pages)
    database.InsertManyPages(db, pages)
    database.InsertManyWords(db, batch)
    log.Println("finished ingesting batch")
}

func extractPathSegments(pageurl string) []string {
    u, err := url.Parse(pageurl)
    util.Check(err)
    if len(u.Path) == 0 {
        return []string{}
    }
    s := u.Path
    s = strings.TrimSuffix(s, ".html")
    s = strings.TrimSuffix(s, ".htm")
    s = strings.ReplaceAll(s, "/", " ")
    s = strings.ReplaceAll(s, "-", " ")
    s = strings.ReplaceAll(s, "_", " ")
    s = strings.ToLower(s)
    return strings.Fields(s)
}
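
// For instance (hypothetical URL, shown for illustration):
//
//    extractPathSegments("https://example.org/blog/my-first-post.html")
//    // → ["blog" "my" "first" "post"]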

@@ -0,0 +1,27 @@
[general]
name = "Sweet Webring"
# used by the precrawl command and linked to in /about route
url = "https://example.com/"
port = 10001

[data]
# the source file should contain the crawl command's output
source = "data/crawled.txt"
# location & name of the sqlite database
database = "data/searchengine.db"
# contains words and phrases disqualifying scraped paragraphs from being presented in search results
heuristics = "data/heuristics.txt"
# aka stopwords, in the search engine biz: https://en.wikipedia.org/wiki/Stop_word
wordlist = "data/wordlist.txt"

[crawler]
# manually curated list of domains, or the output of the precrawl command
webring = "data/webring.txt"
# domains that are banned from being crawled but might originally be part of the webring
bannedDomains = "data/banned-domains.txt"
# file suffixes that are banned from being crawled
bannedSuffixes = "data/banned-suffixes.txt"
# phrases and words which won't be scraped (e.g. if contained in a link)
boringWords = "data/boring-words.txt"
# domains that won't be output as outgoing links
boringDomains = "data/boring-domains.txt"
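
# A typical flow with this config might look like the following sketch.
# `lieu ingest` is referenced directly elsewhere in the repo; the other
# subcommand names are assumptions based on the comments above:
#   lieu crawl > data/crawled.txt   # write the crawl output to the source file
#   lieu ingest                     # build data/searchengine.db from it
#   lieu host                       # serve the search engine on port 10001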

@@ -0,0 +1,143 @@
package server

import (
    "fmt"
    "html/template"
    "net/http"
    "net/url"
    "strings"

    "lieu/database"
    "lieu/types"
    "lieu/util"
    // "github.com/shurcooL/vfsgen"
)

type SearchData struct {
    Query string
    Pages []types.PageData
}

type AboutData struct {
    DomainCount  int
    InstanceName string
    PageCount    string
    TermCount    string
    FilteredLink string
    RingLink     string
}

const useURLTitles = true

func searchRoute(res http.ResponseWriter, req *http.Request, config types.Config) {
    var query string

    if req.Method == http.MethodGet {
        params := req.URL.Query()
        words, exists := params["q"]
        if !exists {
            view := template.Must(template.ParseFiles("html/index-template.html"))
            var empty interface{}
            view.Execute(res, empty)
            return
        }
        query = words[0]
    } else {
        view := template.Must(template.ParseFiles("html/index-template.html"))
        var empty interface{}
        view.Execute(res, empty)
        return
    }

    db := database.InitDB(config.Data.Database)
    pages := database.SearchWordsByScore(db, util.Inflect(strings.Fields(query)))

    if useURLTitles {
        for i, pageData := range pages {
            prettyURL, err := url.QueryUnescape(strings.TrimPrefix(strings.TrimPrefix(pageData.URL, "http://"), "https://"))
            util.Check(err)
            pageData.Title = prettyURL
            pages[i] = pageData
        }
    }

    view := template.Must(template.ParseFiles("html/search-template.html"))
    data := SearchData{
        Query: query,
        Pages: pages,
    }
    view.Execute(res, data)
}
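
// A usage sketch, assuming the server listens on the port configured in
// lieu.toml (e.g. 10001):
//
//    curl 'http://localhost:10001/?q=webring'
//
// GET requests without a q parameter, and non-GET requests, fall back to
// rendering the index template.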

func aboutRoute(res http.ResponseWriter, req *http.Request, config types.Config) {
    db := database.InitDB(config.Data.Database)
    pageCount := util.Humanize(database.GetPageCount(db))
    wordCount := util.Humanize(database.GetWordCount(db))
    domainCount := database.GetDomainCount(db)

    view := template.Must(template.ParseFiles("html/about-template.html"))
    data := AboutData{
        InstanceName: config.General.Name,
        DomainCount:  domainCount,
        PageCount:    pageCount,
        TermCount:    wordCount,
        FilteredLink: "/filtered",
        RingLink:     config.General.URL,
    }
    view.Execute(res, data)
}

type ListData struct {
    Title string
    URLs  []types.PageData
}

func filteredRoute(res http.ResponseWriter, req *http.Request, config types.Config) {
    view := template.Must(template.ParseFiles("html/list-template.html"))
    var URLs []types.PageData
    for _, domain := range util.ReadList(config.Crawler.BannedDomains, "\n") {
        u, err := url.Parse(domain)
        if err != nil {
            continue
        }
        u.Scheme = "https"
        p := types.PageData{Title: domain, URL: u.String()}
        URLs = append(URLs, p)
    }
    data := ListData{
        Title: "Filtered Domains",
        URLs:  URLs,
    }
    view.Execute(res, data)
}

func randomRoute(res http.ResponseWriter, req *http.Request, config types.Config) {
    db := database.InitDB(config.Data.Database)
    link := database.GetRandomPage(db)
    http.Redirect(res, req, link, http.StatusSeeOther)
}

func Serve(config types.Config) {
    http.HandleFunc("/about", func(res http.ResponseWriter, req *http.Request) {
        aboutRoute(res, req, config)
    })
    http.HandleFunc("/", func(res http.ResponseWriter, req *http.Request) {
        searchRoute(res, req, config)
    })

    http.HandleFunc("/filtered", func(res http.ResponseWriter, req *http.Request) {
        filteredRoute(res, req, config)
    })

    http.HandleFunc("/random", func(res http.ResponseWriter, req *http.Request) {
        randomRoute(res, req, config)
    })
    fileserver := http.FileServer(http.Dir("html/assets/"))
    http.Handle("/links/", http.StripPrefix("/links/", fileserver))

    portstr := fmt.Sprintf(":%d", config.General.Port)
    fmt.Println("listening on", portstr)

    // ListenAndServe only returns on failure; surface that error instead of
    // dropping it silently
    util.Check(http.ListenAndServe(portstr, nil))
}

@@ -0,0 +1,35 @@
package types

type SearchFragment struct {
    Word  string
    URL   string
    Score int
}

type PageData struct {
    URL   string
    Title string
    About string
    Lang  string
}

type Config struct {
    General struct {
        Name string `json:"name"`
        URL  string `json:"url"`
        Port int    `json:"port"`
    } `json:"general"`
    Data struct {
        Source     string `json:"source"`
        Database   string `json:"database"`
        Heuristics string `json:"heuristics"`
        Wordlist   string `json:"wordlist"`
    } `json:"data"`
    Crawler struct {
        Webring        string `json:"webring"`
        BannedDomains  string `json:"bannedDomains"`
        BannedSuffixes string `json:"bannedSuffixes"`
        BoringWords    string `json:"boringWords"`
        BoringDomains  string `json:"boringDomains"`
    } `json:"crawler"`
}
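
// Note: the json tags are what the config loader keys on. util.ReadConfig
// streams lieu.toml through a TOML-to-JSON reader (komkom/toml) into
// encoding/json, so the tags above must mirror the key names in lieu.toml.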

@@ -0,0 +1,136 @@
package util

import (
    "bytes"
    "encoding/json"
    "fmt"
    "io/ioutil"
    "log"
    "net"
    "os"
    "strconv"
    "strings"

    "github.com/jinzhu/inflection"
    "github.com/komkom/toml"

    "lieu/types"
)

func Inflect(words []string) []string {
    var inflected []string
    for _, word := range words {
        inflected = append(inflected, inflection.Singular(word))
    }
    return inflected
}

func Check(err error) {
    if err != nil {
        log.Fatalln(err)
    }
}

func DatabaseDoesNotExist(filepath string) {
    fmt.Printf("lieu: database %s does not exist\n", filepath)
    fmt.Println("lieu: try running `lieu ingest` if you have already crawled source data")
    Exit()
}

func CheckFileExists(path string) bool {
    _, err := os.Stat(path)
    if err == nil {
        return true
    }
    return os.IsExist(err)
}

func Humanize(n int) string {
    // check millions before thousands, otherwise the millions branch can
    // never be reached
    if n > 1000000 {
        return fmt.Sprintf("%dm", n/1000000)
    } else if n > 1000 {
        return fmt.Sprintf("%dk", n/1000)
    }
    // strconv.Itoa renders the digits; string(n) would interpret n as a rune
    return strconv.Itoa(n)
}
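
// For example: Humanize(3) == "3", Humanize(12400) == "12k",
// Humanize(2500000) == "2m".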

func Contains(arr []string, query string) bool {
    for _, item := range arr {
        if strings.Contains(query, item) {
            return true
        }
    }
    return false
}

func ReadList(filepath, sep string) []string {
    data, err := ioutil.ReadFile(filepath)
    if err != nil || len(data) == 0 {
        return []string{}
    }
    return strings.Split(strings.TrimSuffix(string(data), sep), sep)
}
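
// e.g. a wordlist file whose contents are `a|the|and` (hypothetical) read
// with ReadList(path, "|") yields ["a" "the" "and"].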

func CheckPortOpen(port int) bool {
    tcpaddr, err := net.ResolveTCPAddr("tcp", fmt.Sprintf("localhost:%d", port))
    if err != nil {
        return false
    }

    l, err := net.ListenTCP("tcp", tcpaddr)
    // check the error before deferring the close: if the listen failed,
    // l is nil and closing it would panic
    if err != nil {
        return false
    }
    defer l.Close()

    return true
}

func ReadConfig() types.Config {
    data, err := ioutil.ReadFile("lieu.toml")
    Check(err)

    var conf types.Config
    decoder := json.NewDecoder(toml.New(bytes.NewBuffer(data)))

    err = decoder.Decode(&conf)
    Check(err)

    return conf
}

func WriteMockConfig() {
    conf := []byte(`[general]
name = "Sweet Webring"
# used by the precrawl command and linked to in /about route
url = "https://example.com/"
port = 10001

[data]
# the source file should contain the crawl command's output
source = "data/crawled.txt"
# location & name of the sqlite database
database = "data/searchengine.db"
# contains words and phrases disqualifying scraped paragraphs from being presented in search results
heuristics = "data/heuristics.txt"
# aka stopwords, in the search engine biz: https://en.wikipedia.org/wiki/Stop_word
wordlist = "data/wordlist.txt"

[crawler]
# manually curated list of domains, or the output of the precrawl command
webring = "data/webring.txt"
# domains that are banned from being crawled but might originally be part of the webring
bannedDomains = "data/banned-domains.txt"
# file suffixes that are banned from being crawled
bannedSuffixes = "data/banned-suffixes.txt"
# phrases and words which won't be scraped (e.g. if contained in a link)
boringWords = "data/boring-words.txt"
# domains that won't be output as outgoing links
boringDomains = "data/boring-domains.txt"
`)
    err := ioutil.WriteFile("lieu.toml", conf, 0644)
    Check(err)
}

func Exit() {
    os.Exit(0)
}