commit 28d052f4c302d583b9323c5a8defe85ffd92525d Author: cblgh Date: Wed Feb 3 09:12:30 2021 +0100 launch diff --git a/.gitignore b/.gitignore new file mode 100755 index 0000000..63ef458 --- /dev/null +++ b/.gitignore @@ -0,0 +1,223 @@ +#~top ignores~ +node_modules/ +*.vim +*bundle*.js +/html/*.html +*.sw[a-z] +config.conf +config.js +*.pdf +archives +builds +dist + +################# +## Eclipse +################# +*.pydevproject +.project +.metadata +bin/ +tmp/ +*.tmp +*.bak +*.swp +*~.nib +local.properties +.classpath +.settings/ +.loadpath + +# External tool builders +.externalToolBuilders/ + +# Locally stored "Eclipse launch configurations" +*.launch + +# CDT-specific +.cproject + +# PDT-specific +.buildpath + + +################# +## Visual Studio +################# + +## Ignore Visual Studio temporary files, build results, and +## files generated by popular Visual Studio add-ons. + +# User-specific files +*.suo +*.user +*.sln.docstates + +# Build results + +[Dd]ebug/ +[Rr]elease/ +x64/ +build/ +[Bb]in/ +[Oo]bj/ + +# MSTest test Results +[Tt]est[Rr]esult*/ +[Bb]uild[Ll]og.* + +*_i.c +*_p.c +*.ilk +*.meta +*.obj +*.pch +*.pdb +*.pgc +*.pgd +*.rsp +*.sbr +*.tlb +*.tli +*.tlh +*.tmp +*.tmp_proj +*.log +*.vspscc +*.vssscc +.builds +*.pidb +*.log +*.scc + +# Visual C++ cache files +ipch/ +*.aps +*.ncb +*.opensdf +*.sdf +*.cachefile + +# Visual Studio profiler +*.psess +*.vsp +*.vspx + +# Guidance Automation Toolkit +*.gpState + +# ReSharper is a .NET coding add-in +_ReSharper*/ +*.[Rr]e[Ss]harper + +# TeamCity is a build add-in +_TeamCity* + +# DotCover is a Code Coverage Tool +*.dotCover + +# NCrunch +*.ncrunch* +.*crunch*.local.xml + +# Installshield output folder +[Ee]xpress/ + +# DocProject is a documentation generator add-in +DocProject/buildhelp/ +DocProject/Help/*.HxT +DocProject/Help/*.HxC +DocProject/Help/*.hhc +DocProject/Help/*.hhk +DocProject/Help/*.hhp +DocProject/Help/Html2 +DocProject/Help/html + +# Click-Once directory +publish/ + +# Publish Web Output +*.Publish.xml +*.pubxml + +# Windows Azure Build Output +csx +*.build.csdef + +# Windows Store app package directory +AppPackages/ + +# Others +sql/ +*.Cache +ClientBin/ +[Ss]tyle[Cc]op.* +~$* +*~ +*.dbmdl +*.[Pp]ublish.xml +*.pfx +*.publishsettings + +# RIA/Silverlight projects +Generated_Code/ + +# Backup & report files from converting an old project file to a newer +# Visual Studio version. Backup files are not needed, because we have git ;-) +_UpgradeReport_Files/ +Backup*/ +UpgradeLog*.XML +UpgradeLog*.htm + +# SQL Server files +App_Data/*.mdf +App_Data/*.ldf + +############# +## Windows detritus +############# + +# Windows image file caches +Thumbs.db +ehthumbs.db + +# Folder config file +Desktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN/ + +# Mac crap +.DS_Store + + +############# +## Python +############# + +*.py[co] + +# Packages +*.egg +*.egg-info +dist/ +build/ +eggs/ +parts/ +var/ +sdist/ +develop-eggs/ +.installed.cfg + +# Installer logs +pip-log.txt + +# Unit test / coverage reports +.coverage +.tox + +#Translations +*.mo + +#Mr Developer +.mr.developer.cfg diff --git a/README.md b/README.md new file mode 100644 index 0000000..7192d8b --- /dev/null +++ b/README.md @@ -0,0 +1,104 @@ +# Lieu +_an alternative search engine_ + +Created in response to the environs of apathy concerning the use of hypertext +search and discovery. In Lieu, the internet is not what is made searchable, but +instead one's own neighbourhood. 
Put differently, Lieu is a neighbourhood search +engine, a way for personal webrings to increase serendipitous connexions. + + +## Goals +* Enable serendipitous discovery +* Support personal communities +* Be reusable, easily + +## Usage +``` +$ lieu help +Lieu: neighbourhood search engine + +Commands +- precrawl (scrapes config's general.url for a list of links:
  • elements containing an anchor tag) +- crawl (start crawler, crawls all urls in config's crawler.webring file) +- ingest (ingest crawled data, generates database) +- search (interactive cli for searching the database) +- host (hosts search engine over http) + +Example: + lieu precrawl > data/webring.txt + lieu ingest + lieu host +``` + +Lieu's crawl & precrawl commands output to [standard +output](https://en.wikipedia.org/wiki/Standard_streams#Standard_output_(stdout)), +for easy inspection of the data. You typically want to redirect their output to +the files Lieu reads from, as defined in the config file. See below for a +typical workflow. + +### Workflow +* Edit the config +* Add domains to crawl in `config.crawler.webring` + * **If you have a webpage with links you want to crawl:** + * Set the config's `url` field to that page + * Populate the list of domains to crawl with `precrawl`: `lieu precrawl > data/webring.txt` +* Crawl: `lieu crawl > data/source.txt` +* Create database: `lieu ingest` +* Host engine: `lieu host` + +After ingesting the data with `lieu ingest`, you can also use lieu to search the +corpus in the terminal with `lieu search`. + +## Config +The config file is written in [TOML](https://toml.io/en/). + +```toml +[general] +name = "Merveilles Webring" +# used by the precrawl command and linked to in /about route +url = "https://webring.xxiivv.com" +port = 10001 + +[data] +# the source file should contain the crawl command's output +source = "data/crawled.txt" +# location & name of the sqlite database +database = "data/searchengine.db" +# contains words and phrases disqualifying scraped paragraphs from being presented in search results +heuristics = "data/heuristics.txt" +# aka stopwords, in the search engine biz: https://en.wikipedia.org/wiki/Stop_word +wordlist = "data/wordlist.txt" + +[crawler] +# manually curated list of domains, or the output of the precrawl command +webring = "data/webring.txt" +# domains that are banned from being crawled but might originally be part of the webring +bannedDomains = "data/banned-domains.txt" +# file suffixes that are banned from being crawled +bannedSuffixes = "data/banned-suffixes.txt" +# phrases and words which won't be scraped (e.g. if a contained in a link) +boringWords = "data/boring-words.txt" +# domains that won't be output as outgoing links +boringDomains = "data/boring-domains.txt" +``` + +For your own use, the following config fields should be customized: +* `name` +* `url ` +* `port` +* `source` +* `webring` +* `bannedDomains` + +The following config-defined files can stay as-is unless you have specific requirements: +* `database` +* `heuristics` +* `wordlist` +* `bannedSuffixes` + +For a full rundown of the files and their various jobs, see the [files +description](docs/files.md). + +### License +Source code `AGPL-3.0-or-later`, Inter is available under `SIL OPEN FONT +LICENSE Version 1.1`, Noto Serif is licensed as `Apache License, Version 2.0`. diff --git a/cli.go b/cli.go new file mode 100644 index 0000000..8dff947 --- /dev/null +++ b/cli.go @@ -0,0 +1,125 @@ +package main + +import ( + "bufio" + "fmt" + "lieu/crawler" + "lieu/database" + "lieu/ingest" + "lieu/server" + "lieu/util" + "os" + "strings" +) + +const help = `Lieu: neighbourhood search engine + +Commands +- precrawl (scrapes config's general.url for a list of links:
<li> elements containing an anchor tag)
+- crawl (start crawler, crawls all urls in config's crawler.webring file. outputs to stdout)
+- ingest (ingest crawled data, generates database)
+- search (interactive cli for searching the database)
+- host (hosts search engine over http)
+
+Example:
+  lieu precrawl > data/webring.txt
+  lieu crawl > data/source.txt
+  lieu ingest
+  lieu host
+
+See the configuration file lieu.toml or
+https://github.com/cblgh/lieu for more information.
+`
+
+func main() {
+    exists := util.CheckFileExists("lieu.toml")
+    if !exists {
+        fmt.Println("lieu: can't find config, saving an example config in the working directory")
+        util.WriteMockConfig()
+        fmt.Println("lieu: lieu.toml written to disk")
+        util.Exit()
+    }
+    config := util.ReadConfig()
+
+    var cmd string
+    if len(os.Args) > 1 {
+        cmd = os.Args[1]
+    } else {
+        cmd = "search"
+    }
+
+    switch cmd {
+    case "help":
+        fmt.Println(help)
+    case "precrawl":
+        if config.General.URL == "https://example.com/" {
+            fmt.Println("lieu: the url is not set (example.com)")
+            util.Exit()
+        }
+        crawler.Precrawl(config)
+    case "crawl":
+        exists := util.CheckFileExists(config.Crawler.Webring)
+        if !exists {
+            fmt.Printf("lieu: webring file %s does not exist\n", config.Crawler.Webring)
+            util.Exit()
+        }
+        sourceLen := len(util.ReadList(config.Crawler.Webring, "\n"))
+        if sourceLen == 0 {
+            fmt.Printf("lieu: nothing to crawl; the webring file %s is empty\n", config.Crawler.Webring)
+            util.Exit()
+        }
+        crawler.Crawl(config)
+    case "ingest":
+        exists := util.CheckFileExists(config.Data.Source)
+        if !exists {
+            fmt.Printf("lieu: data source %s does not exist\n", config.Data.Source)
+            fmt.Println("lieu: try running `lieu crawl`")
+            util.Exit()
+        }
+        sourceLen := len(util.ReadList(config.Data.Source, "\n"))
+        if sourceLen == 0 {
+            fmt.Printf("lieu: nothing to ingest; data source %s is empty\n", config.Data.Source)
+            fmt.Println("lieu: try running `lieu crawl`")
+            util.Exit()
+        }
+        fmt.Println("lieu: creating a new database & initiating ingestion")
+        ingest.Ingest(config)
+    case "search":
+        exists := util.CheckFileExists(config.Data.Database)
+        if !exists {
+            util.DatabaseDoesNotExist(config.Data.Database)
+        }
+        interactiveMode(config.Data.Database)
+    case "host":
+        exists := util.CheckFileExists(config.Data.Database)
+        if !exists {
+            util.DatabaseDoesNotExist(config.Data.Database)
+        }
+        open := util.CheckPortOpen(config.General.Port)
+        if !open {
+            fmt.Printf("lieu: port %d is not open; try another one\n", config.General.Port)
+            util.Exit()
+        }
+        server.Serve(config)
+    default:
+        fmt.Println("lieu: no such command, currently. 
Try `lieu help`") + } +} + +func interactiveMode(databasePath string) { + db := database.InitDB(databasePath) + reader := bufio.NewReader(os.Stdin) + for { + fmt.Printf("> ") + input, err := reader.ReadString('\n') + util.Check(err) + input = strings.TrimSuffix(input, "\n") + pages := database.SearchWordsByScore(db, util.Inflect(strings.Fields(input))) + for _, pageData := range pages { + fmt.Println(pageData.URL) + if len(pageData.About) > 0 { + fmt.Println(pageData.About) + } + } + } +} diff --git a/crawler/crawler.go b/crawler/crawler.go new file mode 100644 index 0000000..99ac202 --- /dev/null +++ b/crawler/crawler.go @@ -0,0 +1,244 @@ +package crawler + +import ( + "fmt" + "lieu/types" + "lieu/util" + "log" + "net/http" + "net/url" + "regexp" + "strings" + "time" + + "github.com/PuerkitoBio/goquery" + "github.com/gocolly/colly/v2" + "github.com/gocolly/colly/v2/queue" +) + +// the following domains are excluded from crawling & indexing, typically because they have a lot of microblog pages +// (very spammy) +func getBannedDomains(path string) []string { + return util.ReadList(path, "\n") +} + +func getBannedSuffixes(path string) []string { + return util.ReadList(path, "\n") +} + +func getBoringWords(path string) []string { + return util.ReadList(path, "\n") +} + +func getBoringDomains(path string) []string { + return util.ReadList(path, "\n") +} + +func find(list []string, query string) bool { + for _, item := range list { + if item == query { + return true + } + } + return false +} + +func getLink(target string) string { + // remove anchor links + if strings.Contains(target, "#") { + target = strings.Split(target, "#")[0] + } + if strings.Contains(target, "?") { + target = strings.Split(target, "?")[0] + } + target = strings.TrimSpace(target) + target = strings.ToLower(target) + // remove trailing / + return strings.TrimSuffix(target, "/") +} + +func getWebringLinks(path string) []string { + var links []string + candidates := util.ReadList(path, "\n") + for _, l := range candidates { + u, err := url.Parse(l) + if err != nil { + continue + } + if u.Scheme == "" { + u.Scheme = "https" + } + links = append(links, u.String()) + } + return links +} + +func getDomains(links []string) []string { + var domains []string + for _, l := range links { + u, err := url.Parse(l) + if err != nil { + continue + } + domains = append(domains, u.Hostname()) + } + return domains +} + + +func findSuffix(suffixes []string, query string) bool { + for _, suffix := range suffixes { + if strings.HasSuffix(strings.ToLower(query), suffix) { + return true + } + } + return false +} + +func cleanText(s string) string { + s = strings.TrimSpace(s) + s = strings.ReplaceAll(s, "\n", " ") + s = strings.ReplaceAll(s, "|", " ") + whitespace := regexp.MustCompile(`\p{Z}`) + s = whitespace.ReplaceAllString(s, " ") + return s +} + +func handleIndexing(c *colly.Collector) { + c.OnHTML("meta[name=\"keywords\"]", func(e *colly.HTMLElement) { + fmt.Println("keywords", cleanText(e.Attr("content")), e.Request.URL) + }) + + c.OnHTML("meta[name=\"description\"]", func(e *colly.HTMLElement) { + desc := cleanText(e.Attr("content")) + if len(desc) > 0 { + fmt.Println("desc", desc, e.Request.URL) + } + }) + + c.OnHTML("html[lang]", func(e *colly.HTMLElement) { + lang := cleanText(e.Attr("lang")) + if len(lang) > 0 { + fmt.Println("lang", lang, e.Request.URL) + } + }) + + // get page title + c.OnHTML("title", func(e *colly.HTMLElement) { + fmt.Println("title", cleanText(e.Text), e.Request.URL) + }) + + c.OnHTML("body", func(e 
*colly.HTMLElement) { + paragraph := cleanText(e.DOM.Find("p").First().Text()) + if len(paragraph) < 1500 && len(paragraph) > 0 { + fmt.Println("para", paragraph, e.Request.URL) + } + // get all relevant page headings + collectHeadingText("h1", e) + collectHeadingText("h2", e) + collectHeadingText("h3", e) + }) +} + +func collectHeadingText(heading string, e *colly.HTMLElement) { + for _, headingText := range e.ChildTexts(heading) { + if len(headingText) < 500 { + fmt.Println(heading, cleanText(headingText), e.Request.URL) + } + } +} + +func Precrawl(config types.Config) { + res, err := http.Get(config.General.URL) + util.Check(err) + defer res.Body.Close() + + if res.StatusCode != 200 { + log.Fatal("status not 200") + } + + doc, err := goquery.NewDocumentFromReader(res.Body) + util.Check(err) + + items := make([]string, 0) + doc.Find("li").Each(func(i int, s *goquery.Selection) { + if domain, exists := s.Find("a").Attr("href"); exists { + items = append(items, domain) + } + }) + + BANNED := getBannedDomains(config.Crawler.BannedDomains) + for _, item := range items { + link := getLink(item) + u, err := url.Parse(link) + // invalid link + if err != nil { + continue + } + domain := u.Hostname() + if find(BANNED, domain) { + continue + } + fmt.Println(link) + } +} + +func Crawl(config types.Config) { + SUFFIXES := getBannedSuffixes(config.Crawler.BannedSuffixes) + links := getWebringLinks(config.Crawler.Webring) + domains := getDomains(links) + initialDomain := config.General.URL + + // TODO: introduce c2 for scraping links (with depth 1) linked to from webring domains + // instantiate default collector + c := colly.NewCollector( + colly.MaxDepth(3), + ) + + q, _ := queue.New( + 5, /* threads */ + &queue.InMemoryQueueStorage{MaxSize: 100000}, + ) + + for _, link := range links { + q.AddURL(link) + } + + c.AllowedDomains = domains + c.AllowURLRevisit = false + c.DisallowedDomains = getBannedDomains(config.Crawler.BannedDomains) + + delay, _ := time.ParseDuration("200ms") + c.Limit(&colly.LimitRule{DomainGlob: "*", Delay: delay, Parallelism: 3}) + + boringDomains := getBoringDomains(config.Crawler.BoringDomains) + boringWords := getBoringWords(config.Crawler.BoringWords) + + // on every a element which has an href attribute, call callback + c.OnHTML("a[href]", func(e *colly.HTMLElement) { + link := getLink(e.Attr("href")) + if findSuffix(SUFFIXES, link) { + return + } + link = e.Request.AbsoluteURL(link) + u, err := url.Parse(link) + // log which site links to what + if err == nil && !util.Contains(boringWords, link) && !util.Contains(boringDomains, link) { + outgoingDomain := u.Hostname() + currentDomain := e.Request.URL.Hostname() + if !find(domains, outgoingDomain) { + fmt.Println("non-webring-link", link, e.Request.URL) + // solidarity! 
someone in the webring linked to someone else in it + } else if outgoingDomain != currentDomain && outgoingDomain != initialDomain && currentDomain != initialDomain { + fmt.Println("webring-link", link, e.Request.URL) + } + } + // only visits links from AllowedDomains + q.AddURL(link) + }) + + handleIndexing(c) + + // start scraping + q.Run(c) +} diff --git a/data/banned-domains.txt b/data/banned-domains.txt new file mode 100644 index 0000000..e69de29 diff --git a/data/banned-suffixes.txt b/data/banned-suffixes.txt new file mode 100644 index 0000000..7081e55 --- /dev/null +++ b/data/banned-suffixes.txt @@ -0,0 +1,17 @@ +.xml +.pdf +.rss +.jpg +.png +.gif +.avi +.webm +.mp4 +.ogg +.mp3 +.zip +.exe +.txt +.asc +.key +.csv diff --git a/data/boring-domains.txt b/data/boring-domains.txt new file mode 100644 index 0000000..02ea5a9 --- /dev/null +++ b/data/boring-domains.txt @@ -0,0 +1,19 @@ +instagram.com +twitter.com +linkedin.com +facebook.com +getpoole.com +jekyllrb.com +twitter.com +amazon.com +google.com +microsoft.com +youtube.com +github.io +meetup.com +ebay.com +t.co +a.co +wsj.com +creativecommons.org +patreon.com diff --git a/data/boring-words.txt b/data/boring-words.txt new file mode 100644 index 0000000..9d57342 --- /dev/null +++ b/data/boring-words.txt @@ -0,0 +1,4 @@ +bitcoin +javascript: +mailto: +subscribe diff --git a/data/crawled.txt b/data/crawled.txt new file mode 100644 index 0000000..e69de29 diff --git a/data/heuristics.txt b/data/heuristics.txt new file mode 100644 index 0000000..99d2993 --- /dev/null +++ b/data/heuristics.txt @@ -0,0 +1,10 @@ +incoming +tagged +edited +updated +last update +last edit +© +(c) +all rights reserved +licensed under diff --git a/data/webring.txt b/data/webring.txt new file mode 100644 index 0000000..e69de29 diff --git a/data/wordlist.txt b/data/wordlist.txt new file mode 100644 index 0000000..729022a --- /dev/null +++ b/data/wordlist.txt @@ -0,0 +1 @@ 
+understandings|understanding|conversations|disappearing|informations|grandmothers|grandfathers|questionings|conversation|information|approaching|understands|immediately|positioning|grandmother|travellings|questioners|recognizing|recognizers|televisions|rememberers|expressions|discovering|disappeared|interesting|grandfather|straightest|controllers|controlling|considering|remembered|cigarettes|companying|completely|spreadings|considered|continuing|controlled|stationing|controller|straighter|stretching|businesses|somebodies|soldiering|countering|darknesses|situations|directions|disappears|younglings|suggesting|afternoons|breathings|distancing|screenings|schoolings|especially|everything|everywhere|explaining|explainers|expression|branchings|revealings|repeatings|surprising|rememberer|somewheres|television|themselves|recognizer|recognizes|recognized|belongings|finishings|travelling|questioner|beginnings|travelings|questioned|followings|pretending|forgetting|forgetters|forwarding|positioned|travellers|gatherings|perfecting|understand|understood|weightings|approaches|officering|numberings|happenings|mentioning|letterings|husbanding|imaginings|approached|apartments|whispering|interested|discovered|spinnings|clearings|climbings|spendings|clothings|colorings|soundings|truckings|somewhere|troubling|companies|companied|beautiful|computers|confusing|considers|travelers|youngling|continues|continued|traveller|traveling|yellowing|apartment|beginning|wheelings|travelled|sometimes|something|appearing|cornering|believing|countered|believers|countries|soldiered|coverings|creatures|crossings|accepting|daughters|belonging|situation|silvering|different|silencing|touchings|bettering|tomorrows|disappear|thinkings|boardings|discovers|admitting|wrappings|distances|distanced|sightings|shrugging|doctoring|showering|shoulders|shoppings|shootings|dressings|sheetings|shadowing|settlings|servicing|seriously|seconding|searching|weighting|screening|screaming|schooling|teachings|bothering|everybody|botherers|bottoming|excepting|expecting|explained|direction|explainer|surprised|surprises|waterings|branching|revealing|returning|surfacing|familiars|repeating|fathering|reminding|supposing|breasting|attacking|remembers|breathing|remaining|breathers|brightest|brownings|suggested|recognize|fightings|attention|figurings|receiving|reasoning|realizing|fingering|buildings|finishing|stupidest|stuffings|watchings|flashings|strongest|strikings|flighting|flowering|promisers|promising|following|bathrooms|prettiest|pretended|stretched|foreheads|foresting|stretches|forgotten|pressings|forgetter|strangest|preparing|forwarded|strangers|possibles|positions|afternoon|straights|pocketing|gardening|pleasings|wondering|gathering|picturing|personals|perfected|stomaches|stomached|carefully|stationed|catchings|parenting|paintings|orderings|groupings|wintering|officered|offerings|centering|numbering|neighbors|certainly|happening|narrowing|narrowest|mountains|mothering|mirroring|middlings|messaging|standings|mentioned|mattering|marriages|histories|machining|hospitals|listening|lightings|springing|lettering|husbanded|spreaders|whispered|imagining|imaginers|spreading|important|languages|answering|cigarette|interests|spiriting|cleanings|knockings|soundest|coatings|sounders|sounding|colleges|coloring|colorful|wouldn't|training|colorers|sorriest|worrying|belonged|approach|touchers|touching|computer|whatever|toppings|confused|confuses|workings|consider|bettered|teething|tonights|tonguers|tonguing|continue|arriving|tomorrow|controls|together|blacking|blackest|
throwers|throwing|coolings|someones|blockers|somebody|thirties|soldiers|cornered|weighted|counting|thoughts|counters|thinking|thinners|thinning|coursing|covering|thinnest|craziest|snapping|creating|creature|thickest|boarding|crossing|smokings|crowding|smelling|smallest|cuttings|slipping|slightly|dancings|sleepers|slamming|wordings|darkness|daughter|boatings|skinning|weddings|thanking|sittings|deciding|deciders|singling|singings|despites|simplest|terrible|silvered|tellings|wearings|youngest|watering|silences|teachers|bookings|agreeing|teaching|discover|attacked|bothered|botherer|watching|swingers|bottling|distance|silenced|signings|bottomed|sighting|shutting|shrugged|wondered|swinging|doctored|sweetest|showered|showings|doorways|shouting|shoulder|wronging|shortest|surprise|dragging|shopping|shooters|drawings|actually|shooting|dreaming|dressing|avoiding|shitting|shirting|shipping|drinking|drinkers|braining|sheeting|sharpest|drivings|sharpers|dropping|droppers|shadowed|surfaced|settling|washings|settings|services|serviced|earliest|backings|earthing|servings|branches|branched|seconded|seatings|surfaces|searched|searches|walkings|screened|waitings|screamed|supposed|emptiest|emptying|breaking|breakers|schooled|enjoying|enjoyers|entering|runnings|breasted|rounders|rounding|supposes|everyone|visitors|visiting|breathed|excepted|roofings|exciting|breathes|expected|rollings|bankings|breather|explains|villages|bridging|viewings|brighter|ringings|righting|suitings|bringing|revealed|bringers|returned|failings|repliers|replying|repeated|brothers|familiar|wintered|families|suggests|farthest|furthest|browning|fathered|removing|building|reminded|bathroom|allowing|suddenly|allowers|feedings|builders|burnings|feelings|remained|refusing|stupider|windings|although|stuffing|studying|business|angriest|fighting|fighters|students|figuring|received|twenties|receives|fillings|reasoned|findings|stronger|turnings|realizes|realized|readiest|fingered|readying|striking|trusters|finishes|trusting|finished|readings|reachers|reaching|quieters|quietest|quieting|fittings|quickest|writings|beaching|trucking|callings|stranger|flashing|beatings|answered|flattest|flatting|flighted|straight|troubled|flowered|pullings|storming|promiser|couldn't|promised|promises|couldn’t|followed|stoppers|problems|probably|prettier|stopping|pretends|stomachs|troubles|pressers|tripping|forehead|stickers|forested|pressing|whispers|carrying|sticking|carriers|stepping|stealers|forwards|stealing|becoming|prepares|prepared|powering|freeings|stations|possible|position|freshest|beddings|wrapping|fronting|catching|fuckings|policing|funniest|pointers|pointing|catchers|pocketed|gardened|ceilings|pleasing|gathered|starting|centered|platings|plastics|planning|pictured|pictures|traveler|pickings|personal|glancing|yourself|chancing|perfects|changing|peopling|partying|partings|parented|grabbing|grabbers|changers|checking|starring|bedrooms|checkers|pairings|standing|painting|outsides|greatest|cheeking|greening|greenest|grouping|ordering|anything|openings|guarding|wheeling|officers|guessing|spreader|offering|children|anywhere|numbered|choicest|noticers|noticing|hallways|nothings|hangings|nobodies|admitted|neighbor|choosing|choosers|happened|neckings|happiest|narrowed|narrower|spotting|churches|mouthing|traveled|mountain|mothered|accepted|mornings|mirrored|headings|spirited|hearings|heatings|circling|middling|messaged|messages|heaviest|wouldn’t|spinners|mentions|helpings|cleanest|memories|meetings|meanings|appeared|mattered|marrieds|marrying|marriage|yellowed|markings|
cleaning|managing|cleaners|holdings|machined|machines|lunching|luckiest|lowering|longings|clearest|hospital|lockings|littlest|clearing|listened|housings|lightest|lighting|lighters|spinning|hundreds|hurrying|believes|spenders|believed|husbands|lettered|lettings|leadings|ignoring|laughing|ignorers|imagines|yellower|imagined|climbers|imaginer|spending|closings|specials|speakers|language|believer|clothing|clouding|speaking|interest|spacings|landings|knowings|southest|jacketed|knocking|kitchens|kissings|killings|keepings|dresses|biggest|sticker|careful|shirted|warmers|shipped|birding|drinker|carries|sheeted|warming|carried|carrier|driving|sharper|tonight|drivers|casings|sharers|sharing|stepped|dropped|dropper|whisper|shapers|shaping|shakers|shaking|tonguer|shadows|stealer|several|tongued|staying|settles|settled|dusting|setting|tongues|catting|backing|catches|earlier|warmest|earthed|service|serving|warring|wanters|catcher|serious|eastest|sensing|senders|easiest|sending|sellers|selling|seeming|seeings|tiniest|seconds|station|causing|seating|edgings|stating|timings|efforts|causers|screens|blacker|ceiling|screams|centers|wanting|walling|walkers|certain|emptied|empties|emptier|thrower|endings|started|schools|scarers|scaring|sayings|engines|savings|sanding|enjoyed|starers|saddest|enjoyer|staring|enoughs|rushing|bagging|runners|entered|running|chances|entires|chancer|rubbing|rowings|rounder|chanced|rounded|starred|rooming|changed|changes|blocked|angrier|exactly|changer|blocker|excepts|checked|excited|walking|excites|roofing|through|expects|blooded|checker|cheeked|throats|explain|wakings|springs|thought|waiting|blowing|rolling|rocking|risings|ringing|baggers|animals|righter|righted|ridings|richest|facings|reveals|blowers|choicer|choices|returns|voicing|worries|resting|chooses|failing|spreads|replier|failers|falling|spotted|replies|replied|chooser|thinned|fallers|thinner|balling|boarded|repeats|visitor|farther|further|circles|another|removed|fastest|removes|fathers|thicker|circled|visited|reminds|fearing|spirits|classes|banking|boating|cleaned|feeding|spinner|thanked|village|worried|feeling|cleaner|remains|cleared|refuses|refused|workers|reddest|telling|yellows|spender|working|clearer|clearly|climbed|tearing|fighter|teaming|figured|figures|booking|viewing|climber|usually|closest|receive|filling|teacher|reasons|closing|finally|closers|anybody|finding|anymore|realize|special|finders|booting|realest|clothed|readier|readies|readied|fingers|teaches|tallest|speaker|readers|talkers|clouded|talking|reading|firings|spacing|takings|reacher|reached|coating|reaches|raising|raining|fishing|quietly|fittest|fitting|systems|whether|bothers|wrapped|fitters|quieted|quieter|quickly|coffees|quicker|fixings|coldest|sounded|sounder|actings|anyways|college|flashed|flashes|bottles|flatter|flatted|colored|bottled|wording|turning|sorting|flights|colorer|putting|pushers|pushing|flowers|pullers|swinger|wonders|sorrier|pulling|proving|comings|bottoms|promise|truster|boxings|company|follows|younger|sweeter|yelling|problem|without|beached|footing|confuse|beaches|brained|bearing|pretend|trucked|forcing|presser|wishing|trouble|forests|appears|beating|airings|forever|surface|control|forgets|accepts|pressed|wronged|winters|forming|presses|prepare|beaters|breaker|wheeled|because|forward|coolers|cooling|allowed|powered|pourers|freeing|pouring|tripped|coolest|breasts|someone|fresher|suppose|somehow|friends|breaths|copping|fronted|becomes|porches|poppers|popping|poorest|treeing|fucking|fullest|pooling|breathe|polices|funnier|funnies|policed|b
edding|corners|futures|pointer|pointed|gamings|counted|soldier|pockets|wetting|pleased|gardens|wetters|wettest|pleases|counter|sunning|players|westest|country|gathers|bridges|playing|plating|bridged|plastic|couples|softest|getting|planned|getters|placing|gifting|pinking|pilings|piecing|picture|coursed|courses|summers|picking|snowing|phoning|bedroom|glances|glanced|winging|snapped|glassed|glasses|perhaps|covered|crazies|crazier|perfect|peopled|persons|peoples|suiting|pausing|passing|goldest|partied|windows|parties|parting|creates|grabbed|smokers|created|grabber|brought|weights|bringer|arrives|crosser|crosses|grasses|parents|palming|graying|pairing|crossed|painted|arrived|greying|smoking|paining|outside|brother|greater|smilers|outings|greened|greener|crowded|travels|smiling|ordered|grounds|offings|smelled|openers|browner|grouped|opening|smaller|growing|okaying|officer|guarded|slowest|slowing|cupping|slipped|guessed|guesses|cutting|offices|gunning|offered|browned|allower|nursing|numbing|suggest|cutters|numbers|sliders|halving|sliding|noticer|wedding|notices|noticed|nothing|writers|hallway|handing|sleeper|normals|noising|hanging|nodding|dancing|wearing|writing|slammed|hangers|darkest|skinned|happens|trained|needing|builder|beliefs|happier|necking|nearest|hardest|nearing|burning|believe|winding|hatting|narrows|stupids|sitting|mouthed|deadest|watered|sisters|mothers|singled|winning|morning|mooning|moments|heading|missing|decides|decided|decider|mirrors|minutes|hearing|minings|already|minding|middled|heating|burners|singles|middles|deepest|stuffed|heaters|singing|simpler|heavier|heavies|belongs|message|despite|mention|simples|studies|studied|silvers|helping|helpers|members|meeting|willing|meanest|attacks|herself|meaning|dinners|student|hidings|matters|marries|married|busying|busiest|silence|against|highest|wildest|hilling|marking|mapping|manages|managed|himself|history|tracked|strikes|manning|hitting|makings|hitters|whiting|towards|watched|holding|toucher|machine|holders|lunches|lunched|watches|luckier|stretch|streets|lowered|loudest|lookers|looking|longing|calling|longest|locking|bending|washing|signing|hottest|littler|benders|strange|sighted|listens|linings|likings|housing|beneath|sighing|sicking|however|lighted|sickest|lighter|calming|lifters|hundred|calmest|hurried|hurries|lifting|touched|doesn't|doesn’t|hurting|touches|showers|husband|doctors|letters|cameras|letting|tossing|leaving|dogging|leaning|leafing|leaders|leading|whitest|layered|ignored|showing|ignores|stories|ignorer|shoving|laughed|lasting|largest|imaging|doorway|besting|imagine|shouted|stormed|downing|storing|topping|avoided|dragged|shorter|betters|stopper|landers|insides|instead|written|drawing|shopped|stopped|between|landing|shooter|knowing|jackets|dreamed|carding|toothed|knocked|knifing|kitchen|joining|teethed|stomach|joiners|kissing|kindest|killers|killing|shoeing|kidding|jumping|kickers|kicking|jumpers|keepers|dressed|keeping|enough|checks|kicked|jumper|kicker|kidded|jumped|killed|joking|killer|kinder|joiner|kisses|kissed|joined|knives|knifes|knifed|jacket|knocks|itself|ladies|landed|lander|inside|larger|images|lasted|imaged|laughs|ignore|aboves|laying|accept|layers|across|yellow|leaded|leader|leaved|leaned|learns|leaves|yelled|lesser|letter|living|lifted|lifter|humans|hugest|lights|wrongs|houses|liking|likers|lining|housed|acting|listen|hotels|little|hotter|locals|locked|horses|longer|longed|looked|hoping|looker|losing|adding|louder|loving|lovers|lowing|lowest|writer|lowers|homing|holing|holder|making|hitter|makers|manned|man
age|writes|admits|mapped|marked|hilled|higher|afraid|hiding|hidden|matter|ageing|helper|member|helped|hellos|heater|metals|middle|heated|mights|minded|hearts|mining|minute|headed|mirror|misses|missed|moment|moneys|monies|months|mooned|mostly|having|mother|worlds|hating|mouths|moving|movers|movies|worker|myself|naming|namers|narrow|hatted|hardly|nearer|neared|nearly|harder|necked|needed|happen|hanger|newest|nicest|nights|worked|nobody|nodded|handed|noises|noised|worded|normal|norths|nosing|agrees|noting|notice|halves|halved|number|guying|numbed|nurses|nursed|agreed|wooden|offing|gunned|offers|office|guards|wonder|okayed|okay'd|okay’d|ok'ing|ok’ing|oldest|womens|opened|opener|groups|womans|within|ground|orders|others|outing|wished|greens|greats|owning|wishes|owners|paging|pained|paints|greyed|greyer|paired|palest|grayed|palmed|papers|grayer|parent|parted|passed|golder|passes|pauses|paused|paying|person|people|wipers|goings|glance|phones|phoned|picked|giving|givens|pieces|pieced|piling|gifted|pinked|pinker|places|placed|getter|gotten|plated|plates|gently|played|gather|player|please|gating|garden|pocket|gamers|points|pointy|gaming|future|wiping|fuller|police|pooled|poorer|fucked|popped|popper|fronts|friend|freers|poured|pourer|freest|powers|formed|forget|forgot|forest|forces|forced|footed|pretty|follow|fliers|flyers|proven|airing|proves|proved|prover|pulled|flying|puller|flower|pushes|pushed|floors|pusher|flight|fixers|fixing|quicks|winter|fitted|quiets|fitter|winged|radios|rained|raises|raised|fishes|rather|fished|firsts|firing|reader|finish|finger|fining|finest|realer|finder|really|finals|reason|filled|figure|fought|fights|fields|fewest|redder|refuse|remain|feeing|remind|feared|father|faster|remove|repeat|family|faller|fallen|failer|failed|rested|fading|return|reveal|riches|richer|riding|ridden|window|riders|rights|facing|allows|ringed|rising|rivers|extras|rocked|rolled|expect|roofed|excite|except|rooves|roomed|events|rounds|rowing|evened|rubbed|almost|entire|runner|enters|keying|rushed|rushes|sadder|safest|sanded|enjoys|saving|engine|savers|winded|saying|enders|scared|scares|scarer|scenes|ending|school|scream|either|eights|screen|egging|effort|edging|seated|second|eaters|seeing|seemed|eating|seller|sender|senses|sensed|easier|easily|earths|serves|served|willed|dusted|settle|during|driers|sevens|sexing|shadow|shakes|shaken|dryers|shaker|always|shaped|driest|shapes|shaper|drying|shares|shared|sharer|sharps|driver|drives|driven|sheets|droves|drinks|shirts|drunks|shoots|shorts|dozens|should|downed|shouts|shoved|shoves|showed|wilder|shower|dogged|doctor|shrugs|didn’t|sicker|sicked|didn't|siding|sighed|doings|sights|signed|dinner|silent|silver|dyings|widest|simple|simply|deeper|single|decide|deaths|sister|deader|sizing|darker|wholes|dances|danced|slides|slider|cutter|slower|slowed|slowly|smalls|cupped|smells|smelly|crying|smiles|smiled|smiler|crowds|smokes|smoked|smoker|covers|snowed|whited|softer|course|softly|couple|counts|corner|whiter|copped|cooled|cooler|coming|whites|sorted|colors|colder|sounds|coffee|coated|spaces|clouds|spaced|spoken|speaks|clothe|closed|closes|closer|spends|climbs|clears|cleans|spirit|cities|circle|church|choose|spread|chosen|choice|chests|sprung|sprang|stages|stairs|cheeks|stands|keeper|change|chance|stared|stares|starer|chairs|starts|center|causer|caused|states|stated|causes|caught|catted|stayed|steals|stolen|casing|sticks|caring|carded|stones|animal|cannot|stored|stores|storms|calmer|calmed|called|street|buyers|bought|strike|struck|buying|anyone|strong|busier|busied|bu
sing|burner|stuffs|burned|stupid|builds|browns|suites|suited|brings|summer|bright|sunned|bridge|breath|breast|breaks|broken|surest|branch|brains|anyway|boxing|wheels|sweets|swings|bottom|bottle|system|bother|tables|taking|takers|talked|talker|boring|taller|booted|taught|booked|teamed|teared|boning|appear|bodies|thanks|boated|thicks|boards|bluest|things|thinks|blower|thirds|thirty|though|threes|throat|bloods|thrown|throws|blocks|blacks|tinier|biters|tiring|todays|biting|toning|tongue|arming|birded|bigger|wetter|toothy|beyond|better|topped|tossed|bested|tosses|beside|bender|toward|bended|tracks|belong|trains|belief|travel|behind|begins|before|bedded|became|become|beater|beaten|trucks|truest|aren’t|aren't|trusts|truths|trying|turned|twenty|around|uncles|weight|wasn’t|wasn't|arrive|unless|upping|wedded|viewed|barely|visits|banked|balled|voices|voiced|waited|bagger|waking|walked|bagged|walker|walled|asking|wanted|wanter|warred|waring|backed|warmed|warmer|babies|washed|washes|avoids|attack|waters|asleep|watery|waving|wavers|seems|party|minds|eaten|sells|sends|known|sense|hours|pasts|paths|easts|pause|mined|layer|payed|serve|earth|early|wills|aired|heard|hears|dusts|kills|goers|hotel|seven|dried|sexed|going|drier|dries|dryer|glass|heads|shake|leads|shook|gives|shape|picks|above|locks|money|drops|share|given|wrong|girls|month|sharp|piece|wilds|sheet|drove|drive|moons|lands|piles|ships|drink|piled|drank|drunk|shirt|pinks|shits|dress|shoes|mores|shoot|longs|shots|drawn|draws|drags|shops|haves|horse|short|gifts|dozen|place|downs|shout|hopes|shove|hoped|plans|wiper|doors|shown|shows|wiped|plate|world|mouth|doers|joins|shrug|shuts|leafs|moved|plays|moves|sicks|don’t|pleas|sided|sides|sighs|don't|gated|sight|looks|gates|wives|mover|signs|doing|dirts|knees|movie|gamer|gamed|dying|since|desks|sings|singe|deeps|point|acted|musts|yells|funny|wider|loses|sixes|whose|names|sizes|sized|skins|keyed|skies|pools|slams|darks|named|slept|namer|leave|dance|slide|hated|young|whole|fucks|who’s|slips|who's|slows|front|porch|loved|hates|small|fresh|cries|cried|smell|white|nears|loves|smile|freer|pours|lover|freed|power|smoke|frees|yeses|crowd|cross|jokes|fours|snaps|crazy|forms|cover|homed|snows|among|necks|happy|least|press|force|homes|count|needs|wipes|years|cools|foots|joked|never|songs|comes|sorry|flier|color|sorts|souls|lower|newer|flyer|colds|sound|flown|south|works|coats|space|nicer|prove|lucky|spoke|night|speak|cloud|hurts|yards|pulls|holed|flies|close|spent|spend|words|holes|hangs|clear|lunch|spins|clean|class|liars|floor|holds|spots|alive|noise|flats|chose|flash|nones|child|fixer|fixed|fixes|chest|cheek|mains|stage|hands|makes|stair|quick|stood|check|fiver|stand|fives|north|wrote|stare|lying|quiet|noses|quite|start|chair|nosed|lived|rains|notes|state|large|cause|raise|catch|noted|maker|stays|halls|angry|stole|steal|reach|first|cased|cases|steps|lives|fires|stuck|carry|stick|cares|still|cared|fired|cards|added|stone|halve|stops|can’t|ready|hairy|store|hairs|can't|storm|numbs|story|could|finer|knife|fines|calms|fined|calls|hurry|while|buyer|finds|nurse|found|which|lifts|admit|final|fills|lasts|keeps|where|buses|bused|study|offed|stuff|fight|woods|burnt|burns|field|human|built|wings|offer|brown|allow|guyed|suite|suits|bring|marks|fewer|feels|hills|wines|later|feeds|agree|guess|surer|fears|broke|break|guard|brain|highs|often|marry|ahead|knock|boxes|sweet|boxed|okays|swing|swung|falls|reply|hides|fails|huger|table|takes|taken|laugh|taker|rests|house|talks|bored|women|faded|fades|wheel|facts|wraps|boots|teach|faces|t
eams|older|tears|bones|maybe|faced|areas|boned|opens|tells|rides|grows|thank|their|boats|thens|there|these|thick|rider|after|board|right|bluer|thins|blues|blued|grown|thing|again|rings|think|blows|blown|third|would|means|those|risen|three|rises|blood|eying|heres|throw|threw|roses|group|river|black|tying|times|timed|roads|rocks|order|meant|green|tired|tires|extra|meets|today|rolls|biter|bitey|other|toned|tones|light|bites|worry|birds|roofs|armed|outer|rooms|outed|every|tooth|teeth|round|image|bests|event|liked|evens|rowed|likes|touch|bends|windy|bents|towns|winds|great|below|overs|owned|liker|train|enter|wound|begun|helps|began|begin|owner|beers|kinds|wests|paged|trees|treed|tripe|trips|pages|alone|hello|beats|enjoy|bears|truck|beach|safer|trues|truer|trued|safes|hells|sames|truth|pains|wells|sands|tried|tries|greys|turns|isn’t|isn't|heavy|twice|saves|uncle|saved|under|kicks|saver|paint|lines|grays|until|weeks|upped|pairs|using|asked|usual|scare|being|ender|metal|views|paled|banks|visit|pales|paler|voice|scene|heats|waits|balls|ended|empty|woken|palms|wakes|waked|lined|knows|pants|worse|paper|walls|worst|wants|eight|heart|along|backs|egged|jumps|warms|grass|might|edges|grabs|seats|avoid|parts|edged|aunts|watch|about|eater|won’t|water|won't|waved|waves|goods|waver|golds|wears|ears|grab|fits|each|sets|knee|lots|part|dust|noes|fish|stay|good|rain|cats|work|wild|laid|hang|gold|pass|step|loud|case|help|your|past|nods|home|care|path|hell|love|fire|gods|lift|card|stop|pays|keys|cars|paid|fine|none|real|into|drop|heat|wish|cans|kids|find|goer|goes|went|calm|just|lead|gone|call|fill|nose|ship|huge|acts|lows|buys|some|note|kind|shit|shat|mind|ices|busy|pick|hand|shod|shoe|gave|reds|shot|hall|fews|ours|feel|burn|drew|such|draw|shop|give|felt|wing|suit|drag|hear|feed|mine|girl|feds|iced|down|when|fees|half|suns|able|word|fear|nows|door|fast|sure|leaf|pile|jobs|show|wine|boys|dogs|yell|hair|guys|kept|doer|fall|fell|head|shut|gift|hole|rest|numb|kick|lean|take|both|sick|fail|fade|took|miss|side|sigh|held|talk|last|plan|bore|hold|done|tall|teas|fact|boot|like|wife|rich|sign|wood|team|does|main|offs|tear|tore|torn|rode|dirt|gets|bone|joke|ride|make|told|play|died|tell|dies|tens|area|body|than|boat|line|guns|desk|that|what|kiss|them|they|gate|sang|then|plea|kill|face|sing|sung|eyes|thin|blue|deep|made|rung|ring|sirs|wide|he’s|rang|moon|blow|eyed|sits|more|whys|dead|blew|days|this|left|grew|he's|size|rise|rose|whom|have|skin|most|late|grow|slam|road|game|tied|ties|arms|dark|rock|okay|ages|mens|roll|mans|tiny|slid|dads|airs|ok'd|tire|wets|ok’d|i’ll|roof|slip|full|cuts|pool|slow|tone|bite|lips|cups|bits|room|olds|poor|bird|adds|ever|knew|hate|fuck|pops|even|tops|wipe|hits|once|west|hour|rows|rubs|toss|best|ones|only|from|runs|bend|bent|onto|open|move|town|free|pour|legs|rush|jump|snap|many|hill|less|snow|keep|safe|much|soft|join|beer|i'll|beds|four|tree|same|sand|form|cops|must|year|cool|trip|lets|beat|mark|born|bear|with|come|save|know|true|sons|lock|song|soon|laws|came|outs|name|well|been|says|said|sort|feet|soul|high|yeah|were|hide|foot|turn|cold|wind|yard|twos|coat|over|hats|owns|ends|lady|aged|arts|else|long|flew|hurt|page|week|upon|lays|used|uses|hard|eggs|wins|very|mays|seas|pain|near|view|bars|weds|pull|edge|wrap|lies|bank|spin|ball|grey|seat|spun|lied|neck|push|wait|hope|bags|city|look|wake|spot|saws|woke|wear|pink|liar|eats|need|sees|seen|puts|seem|wall|want|pair|gray|sell|will|flat|back|pale|sold|asks|wars|land|send|mean|warm|baby|sent|also|wash|away|here|easy|hung|sens|hers|aunt|palm|worn|meet|wore|
east|live|news|five|wave|next|lost|lose|nice|ways|far|few|war|bad|bag|bar|wed|use|ups|art|was|two|try|are|bed|top|arm|wet|big|too|bit|tie|the|ten|tvs|tea|box|boy|sun|bus|but|buy|any|can|car|cat|and|son|cop|sos|cry|cup|cut|who|dad|sky|day|six|why|sit|sat|sir|die|did|dog|she|dry|set|ear|ate|eat|see|saw|win|won|egg|end|say|sad|ran|run|rub|row|eye|rid|ask|fed|fee|red|way|fit|fix|all|put|fly|for|pop|fun|get|got|god|pay|own|out|our|air|ors|one|old|ohs|gun|key|off|guy|now|not|nor|nod|nos|ago|new|hat|had|has|her|met|hey|may|hid|him|add|his|men|hit|mad|low|lot|hot|lip|how|lit|lie|kid|i'm|let|i’m|leg|i'd|i’d|ice|led|act|lay|law|ins|yes|yet|you|its|job|no|at|by|my|on|ha|do|ok|he|oh|is|tv|me|us|as|hi|go|if|of|am|up|to|we|so|in|or|it|be|an|i|a diff --git a/database/database.go b/database/database.go new file mode 100644 index 0000000..861891c --- /dev/null +++ b/database/database.go @@ -0,0 +1,222 @@ +package database + +/* example query +SELECT p.url +FROM inv_index index +INNER JOIN pages p ON p.id = index.pageid +WHERE i.word = "project"; + +select url from inv_index where word="esoteric" group by url order by sum(score) desc limit 15; + +select url from inv_index where word = "" group by url order by sum(score) desc; +*/ + +import ( + "database/sql" + "fmt" + "lieu/types" + "lieu/util" + "log" + "net/url" + "strings" + + _ "github.com/mattn/go-sqlite3" +) + +func InitDB(filepath string) *sql.DB { + db, err := sql.Open("sqlite3", filepath) + if err != nil { + log.Fatalln(err) + } + if db == nil { + log.Fatalln("db is nil") + } + createTables(db) + return db +} + +func createTables(db *sql.DB) { + // create the table if it doesn't exist + queries := []string{` + CREATE TABLE IF NOT EXISTS domains ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + domain TEXT NOT NULL UNIQUE + ); + `, + ` + CREATE TABLE IF NOT EXISTS pages ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + url TEXT NOT NULL UNIQUE, + title TEXT, + about TEXT, + lang TEXT, + domain TEXT NOT NULL, + FOREIGN KEY(domain) REFERENCES domains(domain) + ); + `, + ` + CREATE TABLE IF NOT EXISTS external_pages ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + url TEXT NOT NULL UNIQUE, + domain TEXT NOT NULL, + title TEXT + ); + `, + ` + CREATE TABLE IF NOT EXISTS inv_index ( + word TEXT NOT NULL, + score INTEGER NOT NULL, + url TEXT NOT NULL, + FOREIGN KEY(url) REFERENCES pages(url) + )`, + } + + for _, query := range queries { + if _, err := db.Exec(query); err != nil { + log.Fatalln(err) + } + } +} + +/* TODO: filters +lang:en|fr|en|<..> +site:wiki.xxiivv.com, site:cblgh.org +nosite:excluded-domain.com + +"word1 word2 word3" strict query + +query params: +&order=score, &order=count +&outgoing=true +*/ + +func SearchWordsByScore(db *sql.DB, words []string) []types.PageData { + return searchWords(db, words, true) +} + +func SearchWordsByCount(db *sql.DB, words []string) []types.PageData { + return searchWords(db, words, false) +} + +func GetDomainCount(db *sql.DB) int { + return countQuery(db, "domains") +} + +func GetPageCount(db *sql.DB) int { + return countQuery(db, "pages") +} + +func GetWordCount(db *sql.DB) int { + return countQuery(db, "inv_index") +} + +func GetRandomPage(db *sql.DB) string { + rows, err := db.Query("SELECT url FROM pages ORDER BY RANDOM() LIMIT 1;") + util.Check(err) + + var link string + for rows.Next() { + err = rows.Scan(&link) + util.Check(err) + } + return link +} + +func countQuery(db *sql.DB, table string) int { + rows, err := db.Query(fmt.Sprintf("SELECT COUNT(*) FROM %s;", table)) + util.Check(err) + var count int + for 
rows.Next() { + err = rows.Scan(&count) + util.Check(err) + } + return count +} + +func searchWords(db *sql.DB, words []string, searchByScore bool) []types.PageData { + var wordlist []string + var args []interface{} + for _, word := range words { + wordlist = append(wordlist, "word = ?") + args = append(args, strings.ToLower(word)) + } + + orderType := "SUM(score)" + if !searchByScore { + orderType = "COUNT(*)" + } + + query := fmt.Sprintf(` + SELECT p.url, p.about, p.title + FROM inv_index inv INNER JOIN pages p ON inv.url = p.url + WHERE %s + GROUP BY inv.url + ORDER BY %s + DESC + LIMIT 15 + `, strings.Join(wordlist, " OR "), orderType) + + stmt, err := db.Prepare(query) + util.Check(err) + defer stmt.Close() + + rows, err := stmt.Query(args...) + util.Check(err) + var pageData types.PageData + var pages []types.PageData + for rows.Next() { + if err := rows.Scan(&pageData.URL, &pageData.About, &pageData.Title); err != nil { + log.Fatalln(err) + } + pages = append(pages, pageData) + } + return pages +} + +func InsertManyDomains(db *sql.DB, pages []types.PageData) { + values := make([]string, 0, len(pages)) + args := make([]interface{}, 0, len(pages)) + + for _, b := range pages { + values = append(values, "(?)") + u, err := url.Parse(b.URL) + util.Check(err) + args = append(args, u.Hostname()) + } + + stmt := fmt.Sprintf(`INSERT OR IGNORE INTO domains(domain) VALUES %s`, strings.Join(values, ",")) + _, err := db.Exec(stmt, args...) + util.Check(err) +} + +func InsertManyPages(db *sql.DB, pages []types.PageData) { + values := make([]string, 0, len(pages)) + args := make([]interface{}, 0, len(pages)) + + for _, b := range pages { + // url, title, lang, about, domain + values = append(values, "(?, ?, ?, ?, ?)") + u, err := url.Parse(b.URL) + util.Check(err) + args = append(args, b.URL, b.Title, b.Lang, b.About, u.Hostname()) + } + + stmt := fmt.Sprintf(`INSERT OR IGNORE INTO pages(url, title, lang, about, domain) VALUES %s`, strings.Join(values, ",")) + _, err := db.Exec(stmt, args...) + util.Check(err) +} + +func InsertManyWords(db *sql.DB, batch []types.SearchFragment) { + values := make([]string, 0, len(batch)) + args := make([]interface{}, 0, len(batch)) + + for _, b := range batch { + pageurl := strings.TrimSuffix(b.URL, "/") + values = append(values, "(?, ?, ?)") + args = append(args, b.Word, pageurl, b.Score) + } + + stmt := fmt.Sprintf(`INSERT OR IGNORE INTO inv_index(word, url, score) VALUES %s`, strings.Join(values, ",")) + _, err := db.Exec(stmt, args...) + util.Check(err) +} diff --git a/docs/files.md b/docs/files.md new file mode 100644 index 0000000..5c28ed6 --- /dev/null +++ b/docs/files.md @@ -0,0 +1,121 @@ +# Files +_what the purposes are of all those damn files_ + +Lieu is based on a few files, which in turn configure various behaviours in the +**crawler** (visits urls & extracts relevant elements) and the **ingester** +(converts the crawled source data into database fields). The basic reason is to +minimize hardcoded assumptions in the source, furthering Lieu's reuse. + +Below, I will refer to the files by their config defined names. Here's the +config example from the [README](../README.md), again. 
+
+```toml
+[general]
+name = "Merveilles Webring"
+# used by the precrawl command and linked to in /about route
+url = "https://webring.xxiivv.com"
+port = 10001
+
+[data]
+# the source file should contain the crawl command's output
+source = "data/crawled.txt"
+# location & name of the sqlite database
+database = "data/searchengine.db"
+# contains words and phrases disqualifying scraped paragraphs from being presented in search results
+heuristics = "data/heuristics.txt"
+# aka stopwords, in the search engine biz: https://en.wikipedia.org/wiki/Stop_word
+wordlist = "data/wordlist.txt"
+
+[crawler]
+# manually curated list of domains, or the output of the precrawl command
+webring = "data/webring.txt"
+# domains that are banned from being crawled but might originally be part of the webring
+bannedDomains = "data/banned-domains.txt"
+# file suffixes that are banned from being crawled
+bannedSuffixes = "data/banned-suffixes.txt"
+# phrases and words which won't be scraped (e.g. if contained in a link)
+boringWords = "data/boring-words.txt"
+# domains that won't be output as outgoing links
+boringDomains = "data/boring-domains.txt"
+```
+
+## HTML
+Before we start, a final note on some other types of files in use. The HTML
+templates, used when presenting the search engine in the browser, are all
+available in the [`html`](../html) folder. The includes—currently only css
+& font files—are available in [`html/assets`](../html/assets).
+
+## `[crawler]`
+#### `webring`
+Defines which domains will be crawled for pages. As of this writing, no domains
+outside of this file will be crawled.
+
+You can populate the `webring` file manually or by precrawling an existing
+webpage that contains all of the domains you want to crawl:
+
+    lieu precrawl > data/webring.txt
+
+#### `bannedDomains`
+A list of domains that will not be crawled. This means that if they are present
+in the `webring` file, they will be skipped over as candidates for crawling.
+
+The rationale is that some of the domains of a webring may be unsuitable for ingestion
+into the database. I typically find this is the case for domains that include
+microblogs with 100s or 1000s of one-line pages—needlessly gunking up the search
+results without providing anything of interest outside the individual creating
+the logs.
+
+#### `bannedSuffixes`
+Eliminates html links that end with suffixes present in this file. Typically I want
+to avoid crawling links to media formats such as `.mp4`, and other types of
+non-html documents, really.
+
+It's fine to leave this file intact with its defaults.
+
+#### `boringWords`
+This file is a bit more specific. It contains words which, if present in a link,
+will prevent the link from being logged. The reason is that it suggests the
+link target is boring—irrelevant for this application of the search engine.
+
+These can be `javascript:` links, or other types of content that are less
+relevant to the focus of the search engine & webring.
+
+Link data of this type is as yet unused in Lieu's ingestion.
+
+#### `boringDomains`
+Like `boringWords`, except it contains a list of domains which are banned from
+having their links be logged, typically because they are deemed less relevant
+for the focus of the search engine.
+
+Link data of this type is as yet unused in Lieu's ingestion.
+
+## `[data]`
+#### `source`
+Contains the linewise data that was produced by the crawler. The first word
+identifies the type of data and the last word identifies the page the data
+originated from.
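Since the type is always the first whitespace-separated token and the originating page is always the last, a source line can be pulled apart with nothing fancier than `strings.Fields`. A minimal sketch of that idea follows; the `parseSourceLine` helper is hypothetical, for illustration only, and is not Lieu's actual ingest code. (The example line it parses is explained in full just below.)

```go
package main

import (
	"fmt"
	"strings"
)

// parseSourceLine splits one line of crawler output into its three parts:
// the entry type (first token), the scraped payload (middle tokens) and
// the originating page (last token). Hypothetical helper, for illustration.
func parseSourceLine(line string) (kind, payload, pageURL string) {
	fields := strings.Fields(line)
	if len(fields) < 2 {
		return "", "", "" // not a valid source line
	}
	kind = fields[0]
	pageURL = fields[len(fields)-1]
	payload = strings.Join(fields[1:len(fields)-1], " ")
	return kind, payload, pageURL
}

func main() {
	kind, payload, pageURL := parseSourceLine("h2 Prelude https://cblgh.org/articles/four-nights-in-tornio.html")
	fmt.Printf("%s | %s | %s\n", kind, payload, pageURL)
	// h2 | Prelude | https://cblgh.org/articles/four-nights-in-tornio.html
}
```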
+ +Example: +``` +h2 Prelude https://cblgh.org/articles/four-nights-in-tornio.html +``` + +* An `
h2
    ` tag was scraped, +* its contents were `Prelude`, and +* the originating article was https://cblgh.org/articles/four-nights-in-tornio.html + +#### `database` +The location the sqlite3 database will be created & read from. + +#### `heuristics` +Heuristics contains a list of words or phrases which disqualify scraped +paragraphs from being used as descriptive text Lieu's search results. Typically +excluded are e.g. paragraphs which contain copyright symbols—as that indicates we +have scraped the bottom-most paragraph, i.e. the page was likely a short stub, +with a better content description elsewhere. + +#### `wordlist` +Also known as [stopwords](https://en.wikipedia.org/wiki/Stop_word)—words which +are stopped from entering the search index. The default wordlist consists of the +1000 or so most common English words, albeit curated slightly to still allow for +interesting concepts and verbs—such as `reading` and `books`, for example. diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..0bfbf33 --- /dev/null +++ b/go.mod @@ -0,0 +1,11 @@ +module lieu + +go 1.14 + +require ( + github.com/PuerkitoBio/goquery v1.5.1 + github.com/gocolly/colly/v2 v2.1.0 + github.com/jinzhu/inflection v1.0.0 + github.com/komkom/toml v0.0.0-20210129103441-ff0648d25a4b + github.com/mattn/go-sqlite3 v1.14.6 +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..7278df0 --- /dev/null +++ b/go.sum @@ -0,0 +1,144 @@ +cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= +github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= +github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE= +github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc= +github.com/anaskhan96/soup v1.2.4 h1:or+sKs9QbzJGZVTYFmTs2VBateEywoq00a6K14z331E= +github.com/anaskhan96/soup v1.2.4/go.mod h1:6YnEp9A2yywlYdM4EgDz9NEHclocMepEtku7wg6Cq3s= +github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= +github.com/andybalholm/cascadia v1.2.0 h1:vuRCkM5Ozh/BfmsaTm26kbjm0mIOM3yS5Ek/F5h18aE= +github.com/andybalholm/cascadia v1.2.0/go.mod h1:YCyR8vOZT9aZ1CHEd8ap0gMVm2aFgxBp0T0eFw1RUQY= +github.com/antchfx/htmlquery v1.2.3 h1:sP3NFDneHx2stfNXCKbhHFo8XgNjCACnU/4AO5gWz6M= +github.com/antchfx/htmlquery v1.2.3/go.mod h1:B0ABL+F5irhhMWg54ymEZinzMSi0Kt3I2if0BLYa3V0= +github.com/antchfx/xmlquery v1.2.4 h1:T/SH1bYdzdjTMoz2RgsfVKbM5uWh3gjDYYepFqQmFv4= +github.com/antchfx/xmlquery v1.2.4/go.mod h1:KQQuESaxSlqugE2ZBcM/qn+ebIpt+d+4Xx7YcSGAIrM= +github.com/antchfx/xpath v1.1.6/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk= +github.com/antchfx/xpath v1.1.8 h1:PcL6bIX42Px5usSx6xRYw/wjB3wYGkj0MJ9MBzEKVgk= +github.com/antchfx/xpath v1.1.8/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk= +github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= +github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= +github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= +github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= 
+github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8= +github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI= +github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA= +github.com/gocolly/colly/v2 v2.1.0 h1:k0DuZkDoCsx51bKpRJNEmcxcp+W5N8ziuwGaSDuFoGs= +github.com/gocolly/colly/v2 v2.1.0/go.mod h1:I2MuhsLjQ+Ex+IzK3afNS8/1qP3AedHOusRPcRdC5o0= +github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= +github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e h1:1r7pUrabqp18hOBcwBwiTsbnFeTZHV9eER/QT5JVZxY= +github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= +github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8= +github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA= +github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= +github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= +github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= +github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8= +github.com/golang/protobuf v1.4.2 h1:+Z5KGCizgyZCbGh1KZqA0fcLLkwbsjIzS4aV2v7wJX0= +github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= +github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= +github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/jawher/mow.cli v1.1.0/go.mod h1:aNaQlc7ozF3vw6IJ2dHjp2ZFiA4ozMIYY6PyuRJwlUg= +github.com/jinzhu/inflection v1.0.0 h1:K317FqzuhWc8YvSVlFMCCUb36O/S9MCKRDI7QkRKD/E= +github.com/jinzhu/inflection v1.0.0/go.mod h1:h+uFLlag+Qp1Va5pdKtLDYj+kHp5pxUVkryuEj+Srlc= +github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o= +github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak= +github.com/komkom/toml v0.0.0-20210129103441-ff0648d25a4b h1:UmqyLHqfYJjkiuA2hddGeovwAGOCBm5gOTVKuxtvoMo= +github.com/komkom/toml v0.0.0-20210129103441-ff0648d25a4b/go.mod h1:wLcNqnyr6riTbnFObg4o2/GemTCso9AnsUdLsMsdspw= +github.com/mattn/go-sqlite3 v1.14.6 h1:dNPt6NO46WmLVt2DLNpwczCmdV5boIZ6g/tlDrlRUbg= +github.com/mattn/go-sqlite3 v1.14.6/go.mod h1:NyWgC/yNuGj7Q9rpYnZvas74GogHl5/Z4A/KQRfk6bU= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= +github.com/saintfish/chardet 
v0.0.0-20120816061221-3af4cd4741ca h1:NugYot0LIVPxTvN8n+Kvkn6TrbMyxQiuvKdEwFdR9vI= +github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/temoto/robotstxt v1.1.1 h1:Gh8RCs8ouX3hRSxxK7B1mO5RFByQ4CmJZDwgom++JaA= +github.com/temoto/robotstxt v1.1.1/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= +github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= +golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= +golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200114155413-6afb5195e5aa/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200602114024-627f9648deb9 h1:pNX+40auqi2JqRfOP1akLGtYcn15TUbkhwuCO3foqqM= +golang.org/x/net v0.0.0-20200602114024-627f9648deb9/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20201021035429-f5854403a974 h1:IX6qOQeG5uLjB/hjjwjedwfjND0hgjPMMyO1RoIXQNI= +golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= +golang.org/x/sync 
v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs= +golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= +golang.org/x/text v0.3.3 h1:cokOdA+Jmi5PJGXLlLllQSgYigAEfHXJAERHVMaCc2k= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= +golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/tools v0.0.0-20190606124116-d0a3d012864b/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20210114065538-d78b04bdf963/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= +google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= +google.golang.org/appengine v1.6.6 h1:lMO5rYAqUxkmaj76jAkRUvt5JZgFymx/+Q5Mzfivuhc= +google.golang.org/appengine v1.6.6/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= +google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= +google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= +google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= +google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= +google.golang.org/grpc v1.23.0/go.mod 
h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= +google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= +google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= +google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= +google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= +google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= +google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= +google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.24.0 h1:UhZDfRO8JRQru4/+LlLE0BRKGF8L+PICnvYZmx/fEGA= +google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= diff --git a/html/assets/NotoSerif-Bold.ttf b/html/assets/NotoSerif-Bold.ttf new file mode 100755 index 0000000..2726b0a Binary files /dev/null and b/html/assets/NotoSerif-Bold.ttf differ diff --git a/html/assets/NotoSerif-Italic.ttf b/html/assets/NotoSerif-Italic.ttf new file mode 100755 index 0000000..605b1b8 Binary files /dev/null and b/html/assets/NotoSerif-Italic.ttf differ diff --git a/html/assets/NotoSerif-Regular.ttf b/html/assets/NotoSerif-Regular.ttf new file mode 100755 index 0000000..0e87524 Binary files /dev/null and b/html/assets/NotoSerif-Regular.ttf differ diff --git a/html/assets/about.css b/html/assets/about.css new file mode 100644 index 0000000..51ffbf0 --- /dev/null +++ b/html/assets/about.css @@ -0,0 +1,24 @@ +@import url("base.css"); + +html { + max-width: 31rem; +} + +h1 { + font-size: 3rem; + margin-bottom: 0.5rem; +} + +h2 { + font-family: "Noto Serif"; + font-style: italic; + font-weight: 400; + font-size: 1.5rem; + margin-top: 0; + margin-bottom: 2rem; +} + +.lieu { + font-family: "Noto Serif"; + font-weight: 400; +} diff --git a/html/assets/base.css b/html/assets/base.css new file mode 100644 index 0000000..0ebca74 --- /dev/null +++ b/html/assets/base.css @@ -0,0 +1,162 @@ +@import url('inter-ui-web/inter-ui.css'); + +@font-face { + font-family: "Noto Serif"; + src: url("NotoSerif-Bold.ttf"); +} + +@font-face { + font-family: "Noto Serif"; + font-weight: 400; + src: url("NotoSerif-Regular.ttf"); +} + +@font-face { + font-family: "Noto Serif"; + font-weight: 400; + font-style: italic; + src: url("NotoSerif-Italic.ttf"); +} + +:root { + --primary: #fefefe; + --secondary: #000; + /* alt colorscheme: 1 */ + /* --primary: red; */ + /* --secondary: #fefefe; */ + /* alt colorscheme: 2 */ + /* --primary: #F35363; */ + /* --secondary: black; */ +} + +li { + list-style-type: circle; +} + +ul { + margin: 0; + padding-left: 1rem; +} + +html { + font-family: "Inter UI", sans-serif; + 
background: var(--secondary); + color: var(--primary); + max-width: 650px; + padding-bottom: 2rem; + padding-left: 2rem; + margin-top: 2rem; +} + +body { + margin: 0; +} + +h1 { + font-family: "Noto Serif"; + font-weight: 400; + font-size: 3rem; + margin-bottom: 1rem; + margin-top: 0; +} + +h1 > a, h1 > a:hover { + border-bottom: none; +} + +a { + cursor: pointer; + color: var(--primary); + text-decoration: none; + border-bottom: 0.1rem solid var(--primary); + word-wrap: break-word; +} + +a:hover { + border-bottom-style: dotted; +} + +p { + hyphens: auto; + margin-bottom: 1.5rem; +} + +.entry { + -webkit-column-break-inside: avoid; + -moz-column-break-inside:avoid; + -moz-page-break-inside:avoid; + page-break-inside: avoid; + break-inside: avoid-column; +} + +.search-container { + display: grid; + margin-bottom: 2rem; + height: 2.5rem; + align-items: center; + grid-template-columns: 16rem 3rem; + grid-auto-flow: column; + grid-column-gap: .5rem; +} + +.search-box { + font-size: 1rem; + border-radius: 0.1rem; + padding: .5rem; + padding-left: 0.75rem; + border: 0; + color: var(--secondary); + background: var(--primary); +} + +.search-button { + font-size: 2rem; + color: var(--primary); + background: var(--secondary); + border: 0; + cursor: pointer; + border-radius: 2px; + transition: opacity 150ms; +} + +.search-button:hover { + opacity: 0.5; + transition: opacity 150ms; +} + +.about-link { + position: absolute; + top: 1rem; + right: 1rem; + font-style: normal; +} + +@media +only screen +and (min-device-width : 320px) +and (max-device-width : 720px) +{ + html { + padding-left: 0.75rem; + padding-right: 0.75rem; + font-size: 30pt; + max-width: 100vw; + } +} + +@media +only screen +and (min-device-width : 320px) +and (max-device-width : 374px) { + html { + font-size: 40pt; + } +} + +/* +@media(prefers-color-scheme: light) { +:root { + --primary: #000; + --secondary: #fefefe; +} +*/ diff --git a/html/assets/inter-ui-web/Inter-UI-Italic.woff b/html/assets/inter-ui-web/Inter-UI-Italic.woff new file mode 100644 index 0000000..ca80404 Binary files /dev/null and b/html/assets/inter-ui-web/Inter-UI-Italic.woff differ diff --git a/html/assets/inter-ui-web/Inter-UI-Italic.woff2 b/html/assets/inter-ui-web/Inter-UI-Italic.woff2 new file mode 100644 index 0000000..920203e Binary files /dev/null and b/html/assets/inter-ui-web/Inter-UI-Italic.woff2 differ diff --git a/html/assets/inter-ui-web/Inter-UI-Regular.woff b/html/assets/inter-ui-web/Inter-UI-Regular.woff new file mode 100644 index 0000000..1252f19 Binary files /dev/null and b/html/assets/inter-ui-web/Inter-UI-Regular.woff differ diff --git a/html/assets/inter-ui-web/Inter-UI-Regular.woff2 b/html/assets/inter-ui-web/Inter-UI-Regular.woff2 new file mode 100644 index 0000000..b1c7d65 Binary files /dev/null and b/html/assets/inter-ui-web/Inter-UI-Regular.woff2 differ diff --git a/html/assets/inter-ui-web/LICENSE.txt b/html/assets/inter-ui-web/LICENSE.txt new file mode 100644 index 0000000..da64fc7 --- /dev/null +++ b/html/assets/inter-ui-web/LICENSE.txt @@ -0,0 +1,92 @@ +Copyright (c) 2016-2018 The Inter UI Project Authors (me@rsms.me) + +This Font Software is licensed under the SIL Open Font License, Version 1.1. 
+This license is copied below, and is also available with a FAQ at: +http://scripts.sil.org/OFL + +----------------------------------------------------------- +SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007 +----------------------------------------------------------- + +PREAMBLE +The goals of the Open Font License (OFL) are to stimulate worldwide +development of collaborative font projects, to support the font creation +efforts of academic and linguistic communities, and to provide a free and +open framework in which fonts may be shared and improved in partnership +with others. + +The OFL allows the licensed fonts to be used, studied, modified and +redistributed freely as long as they are not sold by themselves. The +fonts, including any derivative works, can be bundled, embedded, +redistributed and/or sold with any software provided that any reserved +names are not used by derivative works. The fonts and derivatives, +however, cannot be released under any other type of license. The +requirement for fonts to remain under this license does not apply +to any document created using the fonts or their derivatives. + +DEFINITIONS +"Font Software" refers to the set of files released by the Copyright +Holder(s) under this license and clearly marked as such. This may +include source files, build scripts and documentation. + +"Reserved Font Name" refers to any names specified as such after the +copyright statement(s). + +"Original Version" refers to the collection of Font Software components as +distributed by the Copyright Holder(s). + +"Modified Version" refers to any derivative made by adding to, deleting, +or substituting -- in part or in whole -- any of the components of the +Original Version, by changing formats or by porting the Font Software to a +new environment. + +"Author" refers to any designer, engineer, programmer, technical +writer or other person who contributed to the Font Software. + +PERMISSION AND CONDITIONS +Permission is hereby granted, free of charge, to any person obtaining +a copy of the Font Software, to use, study, copy, merge, embed, modify, +redistribute, and sell modified and unmodified copies of the Font +Software, subject to the following conditions: + +1) Neither the Font Software nor any of its individual components, +in Original or Modified Versions, may be sold by itself. + +2) Original or Modified Versions of the Font Software may be bundled, +redistributed and/or sold with any software, provided that each copy +contains the above copyright notice and this license. These can be +included either as stand-alone text files, human-readable headers or +in the appropriate machine-readable metadata fields within text or +binary files as long as those fields can be easily viewed by the user. + +3) No Modified Version of the Font Software may use the Reserved Font +Name(s) unless explicit written permission is granted by the corresponding +Copyright Holder. This restriction only applies to the primary font name as +presented to the users. + +4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font +Software shall not be used to promote, endorse or advertise any +Modified Version, except to acknowledge the contribution(s) of the +Copyright Holder(s) and the Author(s) or with their explicit written +permission. + +5) The Font Software, modified or unmodified, in part or in whole, +must be distributed entirely under this license, and must not be +distributed under any other license. 
The requirement for fonts to +remain under this license does not apply to any document created +using the Font Software. + +TERMINATION +This license becomes null and void if any of the above conditions are +not met. + +DISCLAIMER +THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT +OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE +COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL +DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM +OTHER DEALINGS IN THE FONT SOFTWARE. diff --git a/html/assets/inter-ui-web/inter-ui.css b/html/assets/inter-ui-web/inter-ui.css new file mode 100644 index 0000000..95017a0 --- /dev/null +++ b/html/assets/inter-ui-web/inter-ui.css @@ -0,0 +1,13 @@ +@font-face { + font-family: 'Inter UI'; + font-style: normal; + src: url("Inter-UI-Regular.woff2") format("woff2"), + url("Inter-UI-Regular.woff") format("woff"); +} + +@font-face { + font-family: 'Inter UI'; + font-style: italic; + src: url("Inter-UI-Italic.woff2") format("woff2"), + url("Inter-UI-Italic.woff") format("woff"); +} diff --git a/html/assets/search.css b/html/assets/search.css new file mode 100644 index 0000000..817d834 --- /dev/null +++ b/html/assets/search.css @@ -0,0 +1,27 @@ +@import url('base.css'); + +main { + columns: 2; +} + +.entry { + -webkit-column-break-inside: avoid; + -moz-column-break-inside:avoid; + -moz-page-break-inside:avoid; + page-break-inside: avoid; + break-inside: avoid-column; +} + +.link { + font-style: italic; +} + +@media +only screen +and (min-device-width : 320px) +and (max-device-width : 720px) +{ + main { + columns: 1 !important; + } +} diff --git a/html/assets/startpage.css b/html/assets/startpage.css new file mode 100644 index 0000000..da87f3f --- /dev/null +++ b/html/assets/startpage.css @@ -0,0 +1,24 @@ +@import url("about.css"); + +html { + max-width: 100vw; +} + +h2 { + margin-bottom: 1rem; +} + +main { + display: grid; + justify-items: center; + align-items: center; + margin-top: 10rem; +} + +.search-container { + grid-template-columns: 19rem 3rem; +} + +.lieu-container { + justify-items: start; +} diff --git a/ingest/ingest.go b/ingest/ingest.go new file mode 100644 index 0000000..40b144a --- /dev/null +++ b/ingest/ingest.go @@ -0,0 +1,205 @@ +package ingest + +import ( + "bufio" + "database/sql" + "fmt" + "lieu/database" + "lieu/types" + "lieu/util" + "log" + "net/url" + "os" + "regexp" + "strings" + + "github.com/jinzhu/inflection" +) + +func partitionSentence(s string) []string { + punctuation := regexp.MustCompile(`\p{P}`) + whitespace := regexp.MustCompile(`\p{Z}`) + invisible := regexp.MustCompile(`\p{C}`) + symbols := regexp.MustCompile(`\p{S}`) + + s = punctuation.ReplaceAllString(s, " ") + s = whitespace.ReplaceAllString(s, " ") + s = invisible.ReplaceAllString(s, " ") + s = symbols.ReplaceAllString(s, " ") + s = strings.ReplaceAll(s, "|", " ") + s = strings.ReplaceAll(s, "/", " ") + return strings.Fields(s) +} + +func filterCommonWords(words, wordlist []string) []string { + var filtered []string + for _, word := range words { + // ingested word was too common, skip it + if len(word) == 1 || find(wordlist, word) { + continue + } + filtered = append(filtered, inflection.Singular(word)) + } + return 
filtered
+}
+
+func find(slice []string, sought string) bool {
+	for _, item := range slice {
+		if item == sought {
+			return true
+		}
+	}
+	return false
+}
+
+func performAboutHeuristic(heuristicPath, phrase string) bool {
+	disallowed := util.ReadList(heuristicPath, "\n")
+	ok := !util.Contains(disallowed, phrase)
+	return ok && len(phrase) > 20
+}
+
+func Ingest(config types.Config) {
+	if _, err := os.Stat(config.Data.Database); err == nil || os.IsExist(err) {
+		err = os.Remove(config.Data.Database)
+		util.Check(err)
+	}
+
+	db := database.InitDB(config.Data.Database)
+
+	wordlist := util.ReadList(config.Data.Wordlist, "|")
+
+	buf, err := os.Open(config.Data.Source)
+	util.Check(err)
+
+	defer func() {
+		err = buf.Close()
+		util.Check(err)
+	}()
+
+	pages := make(map[string]types.PageData)
+	var count int
+	const batchsize = 100
+	batch := make([]types.SearchFragment, 0, batchsize)
+
+	scanner := bufio.NewScanner(buf)
+	for scanner.Scan() {
+		line := scanner.Text()
+		firstSpace := strings.Index(line, " ")
+		lastSpace := strings.LastIndex(line, " ")
+
+		if len(line) == 0 || firstSpace == -1 {
+			continue
+		}
+
+		pageurl := strings.ToLower(strings.TrimSuffix(strings.TrimSpace(line[lastSpace:]), "/"))
+		if !strings.HasPrefix(pageurl, "http") {
+			continue
+		}
+
+		var page types.PageData
+		if data, exists := pages[pageurl]; exists {
+			page = data
+		} else {
+			page.URL = pageurl
+		}
+
+		token := line[:firstSpace]
+		rawdata := strings.TrimSpace(line[firstSpace:lastSpace])
+		payload := strings.ToLower(rawdata)
+
+		var processed []string
+		score := 1
+		switch token {
+		case "title":
+			if len(page.About) == 0 {
+				page.About = rawdata
+			}
+			score = 5
+			page.Title = rawdata
+			processed = partitionSentence(payload)
+		case "h1":
+			if len(page.About) == 0 {
+				page.About = rawdata
+			}
+			fallthrough
+		case "h2":
+			fallthrough
+		case "h3":
+			score = 15
+			processed = partitionSentence(payload)
+		case "desc":
+			if len(page.About) < 30 && len(rawdata) < 100 {
+				page.About = rawdata
+			}
+			processed = partitionSentence(payload)
+		case "para":
+			if performAboutHeuristic(config.Data.Heuristics, payload) {
+				page.About = rawdata
+			}
+			processed = partitionSentence(payload)
+		case "lang":
+			page.Lang = rawdata
+		case "keywords":
+			processed = strings.Split(strings.ReplaceAll(payload, ", ", ","), ",")
+		default:
+			continue
+		}
+
+		pages[pageurl] = page
+		processed = filterCommonWords(processed, wordlist)
+		count += len(processed)
+
+		for _, word := range processed {
+			batch = append(batch, types.SearchFragment{Word: word, URL: pageurl, Score: score})
+		}
+		if token == "title" {
+			// only extract path segments once per url.
+			// we do it here because every page is virtually guaranteed to have a title attr &
+			// it only appears once
+			for _, word := range extractPathSegments(pageurl) {
+				batch = append(batch, types.SearchFragment{Word: word, URL: pageurl, Score: 2})
+			}
+		}
+
+		if len(pages) > batchsize {
+			ingestBatch(db, batch, pages)
+			batch = make([]types.SearchFragment, 0, batchsize)
+			// TODO: make sure we don't partially insert any page data
+			pages = make(map[string]types.PageData)
+		}
+	}
+	fmt.Printf("ingested %d words\n", count)
+
+	err = scanner.Err()
+	util.Check(err)
+}
+
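+// ingestBatch flushes one batch to the database: the page map is flattened
+// into a slice, after which domains, pages and word fragments are inserted.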
+func ingestBatch(db *sql.DB, batch []types.SearchFragment, pageMap map[string]types.PageData) {
+	pages := make([]types.PageData, len(pageMap))
+	i := 0
+	for k := range pageMap {
+		pages[i] = pageMap[k]
+		i++
+	}
+	log.Println("starting to ingest batch")
+	database.InsertManyDomains(db, pages)
+	database.InsertManyPages(db, pages)
+	database.InsertManyWords(db, batch)
+	log.Println("finished ingesting batch")
+}
+
+func extractPathSegments(pageurl string) []string {
+	u, err := url.Parse(pageurl)
+	util.Check(err)
+	if len(u.Path) == 0 {
+		return []string{}
+	}
+	s := u.Path
+	s = strings.TrimSuffix(s, ".html")
+	s = strings.TrimSuffix(s, ".htm")
+	s = strings.ReplaceAll(s, "/", " ")
+	s = strings.ReplaceAll(s, "-", " ")
+	s = strings.ReplaceAll(s, "_", " ")
+	s = strings.ToLower(s)
+	return strings.Fields(s)
+}
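+
+// For reference, a sketch of how one crawl line flows through Ingest above.
+// The url is the example from the README; the concrete values are only
+// illustrative:
+//
+//	line    = `h1 Prelude https://cblgh.org/articles/four-nights-in-tornio.html`
+//	token   = "h1"      (text up to the first space), giving score 15
+//	rawdata = "Prelude" (text between first and last space), partitioned into
+//	          words and filtered against the wordlist
+//	pageurl = "https://cblgh.org/articles/four-nights-in-tornio.html"
+//
+// For "title" lines, the url's path segments are additionally indexed, e.g.:
+//
+//	extractPathSegments("https://cblgh.org/articles/four-nights-in-tornio.html")
+//	// => []string{"articles", "four", "nights", "in", "tornio"}
+//
+// "para" lines only become a page's About text if they pass
+// performAboutHeuristic, i.e. they are longer than 20 characters and contain
+// none of the phrases in the configured heuristics file.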
if contained in a link)
+boringWords = "data/boring-words.txt"
+# domains that won't be output as outgoing links
+boringDomains = "data/boring-domains.txt"
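+
+# a note on file formats (see util.ReadList): the wordlist file is parsed as
+# a single `|`-separated list, while e.g. the heuristics and bannedDomains
+# files are newline-separated, one entry per line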
diff --git a/server/server.go b/server/server.go
new file mode 100644
index 0000000..10d9d51
--- /dev/null
+++ b/server/server.go
@@ -0,0 +1,143 @@
+package server
+
+import (
+	"fmt"
+	"html/template"
+	"net/http"
+	"net/url"
+	"strings"
+
+	"lieu/database"
+	"lieu/types"
+	"lieu/util"
+
+	// "github.com/shurcooL/vfsgen"
+)
+
+type SearchData struct {
+	Query string
+	Pages []types.PageData
+}
+
+type AboutData struct {
+	DomainCount  int
+	InstanceName string
+	PageCount    string
+	TermCount    string
+	FilteredLink string
+	RingLink     string
+}
+
+const useURLTitles = true
+
+func searchRoute(res http.ResponseWriter, req *http.Request, config types.Config) {
+	var query string
+
+	if req.Method == http.MethodGet {
+		params := req.URL.Query()
+		words, exists := params["q"]
+		if !exists {
+			view := template.Must(template.ParseFiles("html/index-template.html"))
+			var empty interface{}
+			view.Execute(res, empty)
+			return
+		}
+		query = words[0]
+	} else {
+		view := template.Must(template.ParseFiles("html/index-template.html"))
+		var empty interface{}
+		view.Execute(res, empty)
+		return
+	}
+
+	db := database.InitDB(config.Data.Database)
+	pages := database.SearchWordsByScore(db, util.Inflect(strings.Fields(query)))
+
+	if useURLTitles {
+		for i, pageData := range pages {
+			prettyURL, err := url.QueryUnescape(strings.TrimPrefix(strings.TrimPrefix(pageData.URL, "http://"), "https://"))
+			util.Check(err)
+			pageData.Title = prettyURL
+			pages[i] = pageData
+		}
+	}
+
+	view := template.Must(template.ParseFiles("html/search-template.html"))
+	data := SearchData{
+		Query: query,
+		Pages: pages,
+	}
+	view.Execute(res, data)
+}
+
+func aboutRoute(res http.ResponseWriter, req *http.Request, config types.Config) {
+	db := database.InitDB(config.Data.Database)
+	pageCount := util.Humanize(database.GetPageCount(db))
+	wordCount := util.Humanize(database.GetWordCount(db))
+	domainCount := database.GetDomainCount(db)
+
+	view := template.Must(template.ParseFiles("html/about-template.html"))
+	data := AboutData{
+		InstanceName: config.General.Name,
+		DomainCount:  domainCount,
+		PageCount:    pageCount,
+		TermCount:    wordCount,
+		FilteredLink: "/filtered",
+		RingLink:     config.General.URL,
+	}
+	view.Execute(res, data)
+}
+
+type ListData struct {
+	Title string
+	URLs  []types.PageData
+}
+
+func filteredRoute(res http.ResponseWriter, req *http.Request, config types.Config) {
+	view := template.Must(template.ParseFiles("html/list-template.html"))
+	var URLs []types.PageData
+	for _, domain := range util.ReadList(config.Crawler.BannedDomains, "\n") {
+		u, err := url.Parse(domain)
+		if err != nil {
+			continue
+		}
+		u.Scheme = "https"
+		p := types.PageData{Title: domain, URL: u.String()}
+		URLs = append(URLs, p)
+	}
+	data := ListData{
+		Title: "Filtered Domains",
+		URLs:  URLs,
+	}
+	view.Execute(res, data)
+}
+
+func randomRoute(res http.ResponseWriter, req *http.Request, config types.Config) {
+	db := database.InitDB(config.Data.Database)
+	link := database.GetRandomPage(db)
+	http.Redirect(res, req, link, http.StatusSeeOther)
+}
+
+func Serve(config types.Config) {
+	http.HandleFunc("/about", func(res http.ResponseWriter, req *http.Request) {
+		aboutRoute(res, req, config)
+	})
+	http.HandleFunc("/", func(res http.ResponseWriter, req *http.Request) {
+		searchRoute(res, req, config)
+	})
+	http.HandleFunc("/filtered", func(res http.ResponseWriter, req *http.Request) {
+		filteredRoute(res, req, config)
+	})
+	http.HandleFunc("/random", func(res http.ResponseWriter, req *http.Request) {
+		randomRoute(res, req, config)
+	})
+
+	fileserver := http.FileServer(http.Dir("html/assets/"))
+	http.Handle("/links/", http.StripPrefix("/links/", fileserver))
+
+	portstr := fmt.Sprintf(":%d", config.General.Port)
+	fmt.Println("listening on", portstr)
+
+	err := http.ListenAndServe(portstr, nil)
+	util.Check(err)
+}
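+
+// The routes registered above, for reference:
+//
+//	GET /?q=<words>  search results (or the start page, when q is absent)
+//	GET /about       instance name & index statistics
+//	GET /filtered    the list of banned domains, rendered as links
+//	GET /random      redirect to a random indexed page
+//	GET /links/...   static assets served from html/assets/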
"data/crawled.txt" +# location & name of the sqlite database +database = "data/searchengine.db" +# contains words and phrases disqualifying scraped paragraphs from being presented in search results +heuristics = "data/heuristics.txt" +# aka stopwords, in the search engine biz: https://en.wikipedia.org/wiki/Stop_word +wordlist = "data/wordlist.txt" + +[crawler] +# manually curated list of domains, or the output of the precrawl command +webring = "data/webring.txt" +# domains that are banned from being crawled but might originally be part of the webring +bannedDomains = "data/banned-domains.txt" +# file suffixes that are banned from being crawled +bannedSuffixes = "data/banned-suffixes.txt" +# phrases and words which won't be scraped (e.g. if a contained in a link) +boringWords = "data/boring-words.txt" +# domains that won't be output as outgoing links +boringDomains = "data/boring-domains.txt" +`) + err := ioutil.WriteFile("lieu.toml", conf, 0644) + Check(err) +} + +func Exit () { + os.Exit(0) +}