From 7c6a63ce2c3ec9466be7106fd0db683bd41af049 Mon Sep 17 00:00:00 2001 From: Slatian Date: Sat, 19 Nov 2022 23:29:03 +0100 Subject: [PATCH] Added a bit of documentation for new features --- README.md | 3 +++ docs/files.md | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/README.md b/README.md index f12b14c..c63d99b 100644 --- a/README.md +++ b/README.md @@ -102,6 +102,8 @@ bannedSuffixes = "data/banned-suffixes.txt" boringWords = "data/boring-words.txt" # domains that won't be output as outgoing links boringDomains = "data/boring-domains.txt" +# queries to search for finding preview text +previewQueryList = "data/preview-query-list.txt" ``` For your own use, the following config fields should be customized: @@ -119,6 +121,7 @@ The following config-defined files can stay as-is unless you have specific requi * `heuristics` * `wordlist` * `bannedSuffixes` +* `previewQueryList` For a full rundown of the files and their various jobs, see the [files description](docs/files.md). diff --git a/docs/files.md b/docs/files.md index 3d3d161..c3db40a 100644 --- a/docs/files.md +++ b/docs/files.md @@ -37,6 +37,8 @@ bannedSuffixes = "data/banned-suffixes.txt" boringWords = "data/boring-words.txt" # domains that won't be output as outgoing links boringDomains = "data/boring-domains.txt" +# queries to search for finding preview text +previewQueryList = "data/preview-query-list.txt" ``` ## HTML @@ -120,6 +122,23 @@ are stopped from entering the search index. The default wordlist consists of the 1000 or so most common English words, albeit curated slightly to still allow for interesting concepts and verbs—such as `reading` and `books`, for example. +#### `previewQueryList` +A list of CSS selectors (one per line) used to fetch preview paragraphs; +the first paragraph found that passes a check against the `heuristics` file makes +it into the search index. For each selector, lieu tries the first four paragraphs +it finds before skipping to the next selector.
+ +To get good results one usually wants to tune this to get the first "real" paragraph +after the header, or a summary paragraph if provided. It is also worth trying to avoid getting +irrelevant paragraphs as they clutter up your index and results; lieu will fall back to other +preview sources. + +The default has been (at the time of writing) tuned for use with the Fediring. + +Depending on how well the websites you are indexing use semantic HTML, this will +get you the 70 to 90% solution. For the rest, use heuristics and contact the creators of the +websites you are trying to index; they (usually) appreciate the feedback. + #### OpenSearch metadata If you are running your own instance of Lieu, you might want to look into changing the URL defined in the file `opensearch.xml`, which specifies [OpenSearch