diff --git a/database/database.go b/database/database.go
index dd525dc..3b32521 100644
--- a/database/database.go
+++ b/database/database.go
@@ -69,6 +69,7 @@ func createTables(db *sql.DB) {
 		url TEXT NOT NULL,
 		FOREIGN KEY(url) REFERENCES pages(url)
 		)`,
+		`CREATE VIRTUAL TABLE IF NOT EXISTS external_links USING fts5 (url, tokenize="trigram")`,
 	}
 
 	for _, query := range queries {
@@ -98,6 +99,47 @@ func SearchWordsByCount(db *sql.DB, words []string) []types.PageData {
 	return searchWords(db, words, false)
 }
 
+// FulltextSearchWords returns up to 30 distinct external links, in random
+// order, whose URL matches every whitespace-separated term of phrase.
+//
+// Each term is handed to FTS5 as a quoted string, so URL punctuation such as
+// ":", "/" or "." is matched literally instead of being parsed as FTS5 query
+// syntax. A phrase that contains no term yields nil without querying the
+// database.
+func FulltextSearchWords(db *sql.DB, phrase string) []types.PageData {
+	terms := strings.Fields(phrase)
+	if len(terms) == 0 {
+		return nil
+	}
+	for i, term := range terms {
+		// Inside an FTS5 string a double quote is escaped by doubling it.
+		terms[i] = `"` + strings.ReplaceAll(term, `"`, `""`) + `"`
+	}
+
+	// GROUP BY collapses URLs that were stored more than once.
+	query := `SELECT url from external_links WHERE url MATCH ? GROUP BY url ORDER BY RANDOM() LIMIT 30`
+
+	stmt, err := db.Prepare(query)
+	util.Check(err)
+	defer stmt.Close()
+
+	rows, err := stmt.Query(strings.Join(terms, " "))
+	util.Check(err)
+	defer rows.Close()
+
+	var pages []types.PageData
+	for rows.Next() {
+		var pageData types.PageData
+		util.Check(rows.Scan(&pageData.URL))
+		// The table stores nothing but the URL, so it doubles as the title.
+		pageData.Title = pageData.URL
+		pages = append(pages, pageData)
+	}
+	// Next also returns false when iteration fails; surface that error.
+	util.Check(rows.Err())
+	return pages
+}
+
 func GetDomainCount(db *sql.DB) int {
 	return countQuery(db, "domains")
 }
@@ -123,6 +165,23 @@ func GetRandomDomain(db *sql.DB) string {
 	return domain
 }
 
+// GetRandomExternalLink returns the URL of one randomly chosen row of the
+// external_links table, or the empty string when the query yields no row.
+func GetRandomExternalLink(db *sql.DB) string {
+	rows, err := db.Query("SELECT url FROM external_links ORDER BY RANDOM() LIMIT 1;")
+	util.Check(err)
+	defer rows.Close()
+
+	var link string
+	for rows.Next() {
+		err = rows.Scan(&link)
+		util.Check(err)
+	}
+	// Distinguish "no rows" from an iteration that ended with an error.
+	util.Check(rows.Err())
+	return link
+}
+
 func GetRandomPage(db *sql.DB) string {
 	domain := GetRandomDomain(db)
 	stmt, err := db.Prepare("SELECT url FROM pages WHERE domain = ? ORDER BY RANDOM() LIMIT 1;")
@@ -242,3 +301,33 @@ func InsertManyWords(db *sql.DB, batch []types.SearchFragment) {
 	_, err := db.Exec(stmt, args...)
 	util.Check(err)
 }
+
+// InsertManyExternalLinks stores every URL of externalLinks in the
+// external_links table; an empty or nil slice is a no-op.
+//
+// The table is created without a uniqueness constraint, so duplicates are
+// presumably kept; FulltextSearchWords groups by URL to hide them.
+func InsertManyExternalLinks(db *sql.DB, externalLinks []string) {
+	// Every link binds one SQL parameter and SQLite caps the parameters per
+	// statement, so large inputs are written in several statements.
+	const chunkSize = 500
+
+	for start := 0; start < len(externalLinks); start += chunkSize {
+		end := start + chunkSize
+		if end > len(externalLinks) {
+			end = len(externalLinks)
+		}
+		chunk := externalLinks[start:end]
+
+		values := make([]string, 0, len(chunk))
+		args := make([]interface{}, 0, len(chunk))
+		for _, externalLink := range chunk {
+			values = append(values, "(?)")
+			args = append(args, externalLink)
+		}
+
+		stmt := fmt.Sprintf(`INSERT OR IGNORE INTO external_links(url) VALUES %s`, strings.Join(values, ","))
+		_, err := db.Exec(stmt, args...)
+		util.Check(err)
+	}
+}
diff --git a/ingest/ingest.go b/ingest/ingest.go
index 40b144a..a0d67b5 100644
--- a/ingest/ingest.go
+++ b/ingest/ingest.go
@@ -80,6 +80,7 @@ func Ingest(config types.Config) {
 	var count int
 	var batchsize = 100
 	batch := make([]types.SearchFragment, 0, 0)
+	var externalLinks []string
 
 	scanner := bufio.NewScanner(buf)
 	for scanner.Scan() {
@@ -141,6 +142,8 @@ func Ingest(config types.Config) {
 			page.Lang = rawdata
 		case "keywords":
 			processed = strings.Split(strings.ReplaceAll(payload, ", ", ","), ",")
+		case "non-webring-link":
+			externalLinks = append(externalLinks, payload)
 		default:
 			continue
 		}
@@ -162,7 +165,8 @@ func Ingest(config types.Config) {
 		}
 
 		if len(pages) > batchsize {
-			ingestBatch(db, batch, pages)
+			ingestBatch(db, batch, pages, externalLinks)
+			externalLinks = make([]string, 0, 0)
 			batch = make([]types.SearchFragment, 0, 0)
 			// TODO: make sure we don't partially insert any page data
 			pages = make(map[string]types.PageData)
@@ -174,7 +178,7 @@ func Ingest(config types.Config) {
 	util.Check(err)
 }
 
-func ingestBatch(db *sql.DB, batch []types.SearchFragment, pageMap map[string]types.PageData) {
+func ingestBatch(db *sql.DB, batch []types.SearchFragment, pageMap map[string]types.PageData, links []string) {
 	pages := make([]types.PageData, len(pageMap))
 	i := 0
 	for k := range pageMap {
@@ -185,6 +189,7 @@ func ingestBatch(db *sql.DB, batch []types.SearchFragment, pageMap map[string]ty
 	database.InsertManyDomains(db, pages)
 	database.InsertManyPages(db, pages)
 	database.InsertManyWords(db, batch)
+	database.InsertManyExternalLinks(db, links)
 	log.Println("finished ingesting batch")
 }
 
diff --git a/server/server.go b/server/server.go
index 3bd70f9..3d8181d 100644
--- a/server/server.go
+++ b/server/server.go
@@ -89,6 +89,39 @@ func (h RequestHandler) searchRoute(res http.ResponseWriter, req *http.Request)
 	h.renderView(res, "search", view)
 }
 
+// externalSearchRoute serves /external: it looks up the "q" query parameter
+// of a GET request among the indexed external links and renders the matches
+// with the "search" view. Without a query the result list is empty.
+func (h RequestHandler) externalSearchRoute(res http.ResponseWriter, req *http.Request) {
+	var query string
+	view := &TemplateView{}
+
+	if req.Method == http.MethodGet {
+		query = req.URL.Query().Get("q")
+	}
+
+	pages := database.FulltextSearchWords(h.db, query)
+
+	if useURLTitles {
+		for i := range pages {
+			stripped := strings.TrimPrefix(strings.TrimPrefix(pages[i].URL, "http://"), "https://")
+			prettyURL, err := url.QueryUnescape(stripped)
+			if err != nil {
+				// Malformed percent-escape in a stored link: keep the title
+				// FulltextSearchWords already set, which is the raw URL.
+				continue
+			}
+			pages[i].Title = prettyURL
+		}
+	}
+
+	view.Data = SearchData{
+		Query: query,
+		Pages: pages,
+	}
+	h.renderView(res, "search", view)
+}
+
 func (h RequestHandler) aboutRoute(res http.ResponseWriter, req *http.Request) {
 	view := &TemplateView{}
 
@@ -133,6 +166,19 @@ func (h RequestHandler) randomRoute(res http.ResponseWriter, req *http.Request)
 	http.Redirect(res, req, link, http.StatusSeeOther)
 }
 
+// randomExternalRoute serves /random/external by redirecting to a randomly
+// chosen external link, or answering 404 when none is available.
+func (h RequestHandler) randomExternalRoute(res http.ResponseWriter, req *http.Request) {
+	link := database.GetRandomExternalLink(h.db)
+	if link == "" {
+		// An empty Location would be resolved relative to this route rather
+		// than point at an external site.
+		http.NotFound(res, req)
+		return
+	}
+	http.Redirect(res, req, link, http.StatusSeeOther)
+}
+
 func (h RequestHandler) webringRoute(res http.ResponseWriter, req *http.Request) {
 	http.Redirect(res, req, h.config.General.URL, http.StatusSeeOther)
 }
@@ -157,6 +203,8 @@ func Serve(config types.Config) {
 
 	http.HandleFunc("/about", handler.aboutRoute)
 	http.HandleFunc("/", handler.searchRoute)
+	http.HandleFunc("/external", handler.externalSearchRoute)
+	http.HandleFunc("/random/external", handler.randomExternalRoute)
 	http.HandleFunc("/random", handler.randomRoute)
 	http.HandleFunc("/webring", handler.webringRoute)
 	http.HandleFunc("/filtered", handler.filteredRoute)