2021-02-03 08:12:30 +00:00
|
|
|
package main
|
|
|
|
|
|
|
|
import (
|
|
|
|
"bufio"
|
|
|
|
"fmt"
|
|
|
|
"lieu/crawler"
|
|
|
|
"lieu/database"
|
|
|
|
"lieu/ingest"
|
|
|
|
"lieu/server"
|
|
|
|
"lieu/util"
|
|
|
|
"os"
|
|
|
|
"strings"
|
|
|
|
)
|
|
|
|
|
|
|
|
// help is the usage text printed by the `help` command, which is also what
// runs when lieu is started without any command. Every command handled by the
// switch in main should have an entry here.
const help = `Lieu: neighbourhood search engine

Commands
- precrawl (scrapes config's general.url for a list of links: <li> elements containing an anchor <a> tag)
- crawl (start crawler, crawls all urls in config's crawler.webring file. outputs to stdout)
- ingest (ingest crawled data, generates database)
- search (interactive cli for searching the database)
- random (prints a random page from the database)
- host (hosts search engine over http)

Example:
lieu precrawl > data/webring.txt
lieu crawl > data/source.txt
lieu ingest
lieu host

See the configuration file lieu.toml or
https://github.com/cblgh/lieu for more information.
`
|
|
|
|
|
|
|
|
func main() {
|
2021-02-03 16:22:40 +00:00
|
|
|
exists := util.CheckFileExists("lieu.toml")
|
|
|
|
if !exists {
|
|
|
|
fmt.Println("lieu: can't find config, saving an example config in the working directory")
|
|
|
|
util.WriteMockConfig()
|
|
|
|
fmt.Println("lieu: lieu.toml written to disk")
|
|
|
|
util.Exit()
|
|
|
|
}
|
2021-02-03 08:12:30 +00:00
|
|
|
config := util.ReadConfig()
|
|
|
|
|
|
|
|
var cmd string
|
|
|
|
if len(os.Args) > 1 {
|
|
|
|
cmd = os.Args[1]
|
|
|
|
} else {
|
2021-02-03 16:22:40 +00:00
|
|
|
cmd = "help"
|
2021-02-03 08:12:30 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
switch cmd {
|
|
|
|
case "help":
|
2021-02-03 16:22:40 +00:00
|
|
|
fmt.Println(help)
|
2021-02-03 08:12:30 +00:00
|
|
|
case "precrawl":
|
2021-02-03 16:22:40 +00:00
|
|
|
if config.General.URL == "https://example.com/" {
|
|
|
|
fmt.Println("lieu: the url is not set (example.com)")
|
|
|
|
util.Exit()
|
|
|
|
}
|
2021-02-03 08:12:30 +00:00
|
|
|
crawler.Precrawl(config)
|
|
|
|
case "crawl":
|
2021-02-03 16:22:40 +00:00
|
|
|
exists := util.CheckFileExists(config.Crawler.Webring)
|
|
|
|
if !exists {
|
|
|
|
fmt.Printf("lieu: webring file %s does not exist\n", config.Crawler.Webring)
|
|
|
|
util.Exit()
|
|
|
|
}
|
|
|
|
sourceLen := len(util.ReadList(config.Crawler.Webring, "\n"))
|
|
|
|
if sourceLen == 0 {
|
|
|
|
fmt.Printf("lieu: nothing to crawl; the webring file %s is empty\n", config.Crawler.Webring)
|
|
|
|
util.Exit()
|
|
|
|
}
|
2021-02-03 08:12:30 +00:00
|
|
|
crawler.Crawl(config)
|
|
|
|
case "ingest":
|
2021-02-03 16:22:40 +00:00
|
|
|
exists := util.CheckFileExists(config.Data.Source)
|
|
|
|
if !exists {
|
|
|
|
fmt.Printf("lieu: data source %s does not exist\n", config.Data.Source)
|
|
|
|
fmt.Println("lieu: try running `lieu crawl`")
|
|
|
|
util.Exit()
|
|
|
|
}
|
|
|
|
sourceLen := len(util.ReadList(config.Data.Source, "\n"))
|
|
|
|
if sourceLen == 0 {
|
|
|
|
fmt.Printf("lieu: nothing to ingest; data source %s is empty\n", config.Data.Source)
|
|
|
|
fmt.Println("lieu: try running `lieu crawl`")
|
|
|
|
util.Exit()
|
|
|
|
}
|
|
|
|
fmt.Println("lieu: creating a new database & initiating ingestion")
|
2021-02-03 08:12:30 +00:00
|
|
|
ingest.Ingest(config)
|
|
|
|
case "search":
|
2021-02-03 16:22:40 +00:00
|
|
|
exists := util.CheckFileExists(config.Data.Database)
|
|
|
|
if !exists {
|
|
|
|
util.DatabaseDoesNotExist(config.Data.Database)
|
|
|
|
}
|
2021-02-03 08:12:30 +00:00
|
|
|
interactiveMode(config.Data.Database)
|
2021-03-12 17:00:24 +00:00
|
|
|
case "random":
|
|
|
|
exists := util.CheckFileExists(config.Data.Database)
|
|
|
|
if !exists {
|
|
|
|
util.DatabaseDoesNotExist(config.Data.Database)
|
|
|
|
}
|
|
|
|
db := database.InitDB(config.Data.Database)
|
|
|
|
fmt.Println(database.GetRandomPage(db))
|
2021-02-03 08:12:30 +00:00
|
|
|
case "host":
|
2021-02-03 16:22:40 +00:00
|
|
|
exists := util.CheckFileExists(config.Data.Database)
|
|
|
|
if !exists {
|
|
|
|
util.DatabaseDoesNotExist(config.Data.Database)
|
|
|
|
}
|
|
|
|
open := util.CheckPortOpen(config.General.Port)
|
|
|
|
if !open {
|
|
|
|
fmt.Printf("lieu: port %d is not open; try another one\n", config.General.Port)
|
|
|
|
util.Exit()
|
|
|
|
}
|
2021-02-03 08:12:30 +00:00
|
|
|
server.Serve(config)
|
|
|
|
default:
|
2021-02-03 16:22:40 +00:00
|
|
|
fmt.Println("Lieu: no such command, currently. Try `lieu help`")
|
2021-02-03 08:12:30 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func interactiveMode(databasePath string) {
|
|
|
|
db := database.InitDB(databasePath)
|
|
|
|
reader := bufio.NewReader(os.Stdin)
|
|
|
|
for {
|
|
|
|
fmt.Printf("> ")
|
|
|
|
input, err := reader.ReadString('\n')
|
|
|
|
util.Check(err)
|
|
|
|
input = strings.TrimSuffix(input, "\n")
|
|
|
|
pages := database.SearchWordsByScore(db, util.Inflect(strings.Fields(input)))
|
|
|
|
for _, pageData := range pages {
|
|
|
|
fmt.Println(pageData.URL)
|
|
|
|
if len(pageData.About) > 0 {
|
|
|
|
fmt.Println(pageData.About)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|