koch-method-real-words/generate_wordlist.py

85 wiersze
3.0 KiB
Python
Executable File

#!/usr/bin/env python3
import argparse
import sys
import re
import subprocess
def raw_words(dic_file, aff_file, encoding):
    """Yield, lowercased, every line that hunspell's ``unmunch`` prints.

    ``unmunch`` is run with ``dic_file`` and ``aff_file`` as its two
    arguments, and both of its output streams are decoded with ``encoding``.
    Whatever it wrote to stderr is re-printed on this process' stderr.

    Raises ``subprocess.CalledProcessError`` if ``unmunch`` exits with a
    non-zero status.  No filtering happens here: the output is split on
    newlines only, so empty strings (e.g. after a trailing newline) can be
    yielded as well.
    """
    command = ["unmunch", dic_file, aff_file]
    completed = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        encoding=encoding,
        check=False,
    )
    # stderr is captured and printed by us rather than inherited: that way
    # it is decoded with `encoding` and re-emitted through Python's stderr
    # (per the original author, this transcodes ISO-8859-1 to UTF-8 when
    # needed).
    print(completed.stderr, file=sys.stderr, flush=True)
    # check=False above so that stderr is shown *before* a failure is raised.
    completed.check_returncode()
    yield from map(str.lower, completed.stdout.split("\n"))
def good_words(words):
    """Remove words that contain digits or are only one character long.

    Yields, unchanged, each word that consists of two or more word
    characters, none of which is a decimal digit or an underscore.
    Everything else (including the empty string) is dropped.
    """
    # `[^\W\d_]` is "a word character that is neither a digit nor `_`".
    # A plain `\w` would let digits and underscores through, contradicting
    # the contract above.  Raw string: `\W`/`\d` are regex escapes, not
    # Python string escapes.
    letters = re.compile(r"[^\W\d_]{2,}")
    for word in words:
        if letters.fullmatch(word):
            yield word
def ascii_words(words):
    """Transliterate German umlauts and sharp s, then keep plain a-z words.

    Each word has 'ä', 'ö', 'ü' and 'ß' replaced by 'ae', 'oe', 'ue' and
    'ss'.  The transliterated word is yielded only if it consists entirely
    of the lowercase letters a-z and is not a "roman number" -- which here
    means any word built solely from the letters l, x, v and i.
    """
    umlaut_table = str.maketrans({'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'ß': 'ss'})
    only_ascii_letters = re.compile("[a-z]+")
    roman_numeral = re.compile("[lxvi]+")
    for original in words:
        candidate = original.translate(umlaut_table)
        if not only_ascii_letters.fullmatch(candidate):
            # Still contains something outside a-z (or is empty).
            continue
        if roman_numeral.fullmatch(candidate):
            continue
        yield candidate
def short_ascii_words(words, max_word_length):
    """Yield only the words that are at most max_word_length characters long."""
    yield from (
        candidate for candidate in words if len(candidate) <= max_word_length
    )
def generate(outfile, language, max_word_length):
    """Write the sorted, de-duplicated wordlist to outfile, one word per line.

    `outfile` only needs a `write` method accepting strings.
    `language` must be one of de_DE, en_US or en_GB; any other value raises
    KeyError.  It selects both the encoding used to read the dictionary and
    the hunspell files under /usr/share/hunspell/.
    `max_word_length` is the longest word length that is still written.
    """
    encodings_by_language = {
        "de_DE": "ISO-8859-1",
        "en_US": "UTF-8",
        "en_GB": "UTF-8"
    }
    encoding = encodings_by_language[language]
    dic_file = f"/usr/share/hunspell/{language}.dic"
    aff_file = f"/usr/share/hunspell/{language}.aff"

    # Lazy filter pipeline; nothing runs until `set` consumes it below.
    expanded = raw_words(dic_file, aff_file, encoding)
    transliterated = ascii_words(expanded)
    multi_letter = good_words(transliterated)
    short_enough = short_ascii_words(multi_letter, max_word_length)

    unique_sorted = sorted(set(short_enough))
    for entry in unique_sorted:
        outfile.write(f"{entry}\n")
# Command-line entry point: parse the options and write the word list.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Fetch words from hunspell's unmunch (default German FraMi), filter out junk, and write list.")
    parser.add_argument("--lang", default="de_DE",
                        help="language to use",
                        choices=["de_DE", "en_GB", "en_US"])
    # The string default is also passed through `type`, so "wordlist.txt"
    # is opened for writing when --out is not given.
    parser.add_argument("--out",
                        type=argparse.FileType("w", encoding="UTF-8"),
                        default="wordlist.txt",
                        help="The output file of one word per line to be written (in UTF-8 encoding).")
    parser.add_argument("--max-word-length",
                        type=int,
                        default=6,
                        help="Do not use words longer than this many characters.")
    args = parser.parse_args()
    outfile = args.out
    try:
        generate(outfile, args.lang, args.max_word_length)
    finally:
        # argparse opened the file for us; close it explicitly so the list
        # is flushed even when generate() raises.  `--out -` yields
        # sys.stdout, which must stay open.
        if outfile is not sys.stdout:
            outfile.close()