koch-method-real-words/generate_wordlist.py

#!/usr/bin/env python3

import argparse
import sys
import re
import subprocess

def raw_words(dic_file, aff_file, encoding):
    """Yield every word printed by hunspell's ``unmunch``, lowercased.

    ``unmunch`` is run on ``dic_file`` and ``aff_file`` and both of its
    output streams are decoded with ``encoding``.  Its stderr is echoed to
    this process's stderr before the exit status is checked, so a non-zero
    exit raises ``subprocess.CalledProcessError`` only after the
    diagnostics have been shown.

    Being a generator, nothing is executed until iteration starts.
    """
    command = ["unmunch", dic_file, aff_file]
    result = subprocess.run(
        command,
        check=False,
        encoding=encoding,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # Relaying the captured stderr through print() provides transcoding
    # from ISO-8859-1 to UTF-8 when needed.
    print(result.stderr, file=sys.stderr, flush=True)
    result.check_returncode()
    yield from map(str.lower, result.stdout.split("\n"))

def good_words(words):
    """Yield only the words made up of two or more letters.

    A word is dropped when it is empty, is a single character long, or
    contains a decimal digit, an underscore, or any non-word character
    (hyphen, apostrophe, whitespace, ...).  Accepted words are yielded
    unchanged and in their input order.
    """
    # Raw string: "\w" inside a plain literal is an invalid escape sequence.
    # [^\W\d_] means "a word character that is neither a digit nor an
    # underscore"; a plain \w would let words containing digits through.
    letters = re.compile(r"[^\W\d_]{2,}")
    for word in words:
        if letters.fullmatch(word):
            yield word

def ascii_words(words):
    """Transliterate German umlauts and keep only plain ASCII words.

    The replacements are ä -> ae, ö -> oe, ü -> ue and ß -> ss.  A word is
    yielded, in its transliterated form, only if it then consists solely
    of the lowercase letters a-z and is not built exclusively from the
    letters l, x, v and i (which is how roman numbers are recognised here).
    """
    umlaut_table = str.maketrans({'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'ß': 'ss'})
    lowercase_only = re.compile("[a-z]+")
    roman_numeral = re.compile("[lxvi]+")
    for candidate in map(lambda raw: raw.translate(umlaut_table), words):
        if not lowercase_only.fullmatch(candidate):
            continue
        if roman_numeral.fullmatch(candidate):
            continue
        yield candidate

def short_ascii_words(words, max_word_length):
    """Yield the words that are at most max_word_length characters long."""
    yield from filter(lambda entry: len(entry) <= max_word_length, words)

def generate(outfile, language, max_word_length):
    """Write the sorted, de-duplicated wordlist to outfile, one word per line.

    outfile only needs a `write` method that accepts strings.

    The parameter `language` can be set to de_DE, en_US, or en_GB; it
    selects both the hunspell dictionary files and the encoding used to
    decode them.  Any other value raises KeyError.

    Words longer than max_word_length characters are left out.
    """
    encoding_by_language = {
        "de_DE": "ISO-8859-1",
        "en_US": "UTF-8",
        "en_GB": "UTF-8",
    }
    encoding = encoding_by_language[language]

    dic_file = f"/usr/share/hunspell/{language}.dic"
    aff_file = f"/usr/share/hunspell/{language}.aff"

    # Chain the lazy filters one stage at a time.
    candidates = raw_words(dic_file, aff_file, encoding)
    candidates = ascii_words(candidates)
    candidates = good_words(candidates)
    candidates = short_ascii_words(candidates, max_word_length)

    unique_words = set(candidates)
    for word in sorted(unique_words):
        outfile.write(f"{word}\n")

if __name__ == "__main__":
    # Command-line entry point: parse the options, then write the wordlist.
    parser = argparse.ArgumentParser(description="Fetch words from hunspell's unmunch (default German FraMi), filter out junk, and write list.")
    parser.add_argument("--lang", default="de_DE",
                        help="language to use",
                        choices=["de_DE", "en_GB", "en_US"])

    # FileType opens the target while the arguments are parsed.  Because
    # the default is a string, argparse converts it through FileType too.
    parser.add_argument("--out",
                        type=argparse.FileType("w", encoding="UTF-8"),
                        default="wordlist.txt",
                        help="The output file of one word per line to be written (in UTF-8 encoding).")
    parser.add_argument("--max-word-length",
                        type=int,
                        default=6,
                        help="Do not use words longer than this many characters.")
    args = parser.parse_args()

    outfile = args.out
    try:
        generate(outfile, args.lang, args.max_word_length)
    finally:
        # Close (and thereby flush) the output file even if generation
        # fails.  FileType maps "-" to sys.stdout, which is left open.
        if outfile is not sys.stdout:
            outfile.close()