#!/usr/bin/env python3
"""Build a sorted word list from a hunspell dictionary.

The dictionary is expanded with hunspell's ``unmunch`` tool, the resulting
words are lower-cased, transliterated to ASCII, filtered, de-duplicated,
sorted, and written one word per line.
"""

import argparse
import re
import subprocess
import sys
from typing import Iterable, Iterator, TextIO


def raw_words(dic_file: str, aff_file: str, encoding: str) -> Iterator[str]:
    """Yield every word produced by ``unmunch``, lower-cased.

    ``unmunch`` is run on ``dic_file`` and ``aff_file``; its stdout and
    stderr are decoded using ``encoding``.

    Raises:
        subprocess.CalledProcessError: if ``unmunch`` exits with a non-zero
            status (raised only after its stderr has been echoed).
    """
    unmunch = subprocess.run(
        ["unmunch", dic_file, aff_file],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        check=False,
        encoding=encoding,
    )
    # Catching stderr and printing it ourselves (instead of letting the child
    # inherit our stderr) provides transcoding from ISO-8859-1 to UTF-8 when
    # needed.  This happens before the return code check so that diagnostics
    # are visible even when unmunch failed.
    print(unmunch.stderr, file=sys.stderr, flush=True)
    unmunch.check_returncode()
    # Split on "\n" only: str.splitlines() would also break on characters
    # such as U+0085 that can legitimately appear in ISO-8859-1 decoded data.
    for word in unmunch.stdout.split("\n"):
        yield word.lower()


def good_words(words: Iterable[str]) -> Iterator[str]:
    """Remove words that contain digits or are only one character long.

    Only words made up of at least two letters are passed through; anything
    containing a digit, an underscore, or any other non-letter is dropped.
    """
    # ``\w`` alone would also accept digits and "_", so subtract them from it.
    letters = re.compile(r"[^\W\d_]{2,}")
    for word in words:
        if letters.fullmatch(word):
            yield word


def ascii_words(words: Iterable[str]) -> Iterator[str]:
    """Apply some common transliteration appropriate for German "Umlaute",
    then remove all words still not ASCII, and also remove roman numbers.

    Words are expected to be lower-case already; a word counts as ASCII only
    if it consists solely of the letters a-z after transliteration.
    """
    to_ascii = str.maketrans({'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'ß': 'ss'})
    is_ascii = re.compile(r"[a-z]+")
    # Any word made up solely of the letters l, x, v and i is treated as a
    # roman number.
    is_roman_number = re.compile(r"[lxvi]+")
    for word in words:
        word_maybeascii = word.translate(to_ascii)
        if (is_ascii.fullmatch(word_maybeascii)
                and not is_roman_number.fullmatch(word_maybeascii)):
            yield word_maybeascii


def short_ascii_words(words: Iterable[str], max_word_length: int) -> Iterator[str]:
    """Filter all words that are longer than max_word_length."""
    for word in words:
        if len(word) <= max_word_length:
            yield word


def generate(outfile: TextIO, language: str, max_word_length: int) -> None:
    """Generate the sorted wordlist into outfile (assumed to have a `write`
    method accepting strings).

    The parameter `language` can be set to de_DE, en_US, or en_GB.

    Raises:
        KeyError: if `language` is not one of the supported values.
        subprocess.CalledProcessError: if ``unmunch`` fails.
    """
    # Encoding used to decode the output of unmunch for each dictionary.
    encoding = {
        "de_DE": "ISO-8859-1",
        "en_US": "UTF-8",
        "en_GB": "UTF-8",
    }[language]
    dic_file = f"/usr/share/hunspell/{language}.dic"
    aff_file = f"/usr/share/hunspell/{language}.aff"

    words = raw_words(dic_file, aff_file, encoding)
    candidates = short_ascii_words(good_words(ascii_words(words)),
                                   max_word_length)
    # set() removes duplicates (e.g. words that only differed in case or that
    # collapse to the same transliteration) before sorting.
    for word in sorted(set(candidates)):
        outfile.write(f"{word}\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Fetch words from hunspell's unmunch (default German FraMi), filter out junk, and write list.")
    parser.add_argument("--lang", default="de_DE", help="language to use",
                        choices=["de_DE", "en_GB", "en_US"])
    parser.add_argument("--out",
                        type=argparse.FileType("w", encoding="UTF-8"),
                        default="wordlist.txt",
                        help="The output file of one word per line to be written (in UTF-8 encoding).")
    parser.add_argument("--max-word-length", type=int, default=6,
                        help="Do not use words longer than this many characters.")
    args = parser.parse_args()
    outfile = args.out
    try:
        generate(outfile, args.lang, args.max_word_length)
    finally:
        # argparse opened the file for us; close it deterministically, but
        # leave sys.stdout alone (argparse hands it out for "--out -").
        if outfile is not sys.stdout:
            outfile.close()