koch-method-real-words/mk_bin_file.py

#!/usr/bin/env python3

import argparse
import mmap
from datetime import datetime, timedelta
from multiprocessing import Pool
from os import truncate

from letters_rare_first import from_lettercount_file
from letter2bitmask import Letter2Bitmask
from collections import Counter

# The original brute force algorithm:
# def count_words(containing_bitmap):
#     words = 0
#     for (bm, cnt) in bitmap_count:
#         if bm & containing_bitmap == bm:
#             words += cnt
#     return (containing_bitmap, words)

# Do the same in chunks:
chunk_size = 2 ** 16
head_of_bitmap_filter = (2 ** 10 - 1) << 16

def count_words_per_chunk(initial_bitmap):

    # Some consistency checks:
    if initial_bitmap & head_of_bitmap_filter != initial_bitmap:
        raise RuntimeError(f"Nonsensical initial bitmap {initial_bitmap:x}")
    if (initial_bitmap + chunk_size - 1) & head_of_bitmap_filter != initial_bitmap:
        raise RuntimeError(f"Unexpected initial bitmap {initial_bitmap:x} - this is weird.")

    # We need not bother with words that need letters
    # which will never occure in our entire chunk:
    bitmap_count_with_fitting_head = []
    for (bm, cnt) in bitmap_count:
        if (bm & initial_bitmap) == (bm & head_of_bitmap_filter):
            bitmap_count_with_fitting_head.append((bm, cnt))

    if len(bitmap_count_with_fitting_head) == 0:
        # All words need letters that never occure in our chunk:
        return []
    else:
        # For each bitmap in our chunk, find out how many words fit:
        result = []
        for bitmap in range(initial_bitmap, initial_bitmap + chunk_size):
            words = 0
            for (bm, cnt) in bitmap_count_with_fitting_head:
                if bm & bitmap == bm:
                    words += cnt
            if 0 < words:
                result.append((bitmap, words))
        return result


def mk(lettercount_file, wordlist, outfile):

    global l2b
    l2b = Letter2Bitmask(from_lettercount_file(lettercount_file))

    bitmap2count = Counter()
    for line in wordlist:
        line = line.rstrip()
        index = l2b.number(line)
        bitmap2count[index] += 1

    global bitmap_count
    bitmap_count = [(bitmap, bitmap2count[bitmap]) for bitmap in bitmap2count.keys()]


    length = 2 ** 26 * 4
    outfile.seek(length-1)
    outfile.write(bytes([0]))
    outfile.flush()
    outfile.seek(0)
    with mmap.mmap(outfile.fileno(), length) as mm:
        with memoryview(mm).cast('I') as view:
            with Pool(processes=4) as pool:
                last_bitmap_for_estimate = 2 ** 26 - chunk_size
                estimate_period = timedelta(seconds = 20)
                start = datetime.now()
                next_estimate_due = start + estimate_period
                print(f"Starting kmrw database generation process at {start.isoformat()} UTC")
                print("The first estimates will be pessimistic and may take some minutes to come in.")

                # Reverse the order, as that makes the estimates pessimistic:
                chunk_starts = [cstart for cstart in range(0, 2 ** 26, chunk_size)]
                chunk_starts.reverse()

                for bm_words_pairs in pool.imap(count_words_per_chunk, chunk_starts, 1):
                    for (bm, words) in bm_words_pairs:
                        view[bm] = words
                        last_bitmap_for_estimate = bm
                    now = datetime.now()
                    if (next_estimate_due <= now):
                        remaining_work = last_bitmap_for_estimate / 2 ** 26
                        expected_end = now + (now - start) * (remaining_work / (1 - remaining_work))
                        print(f"{last_bitmap_for_estimate} left. Done at or before {expected_end.isoformat()} UTC.")
                        next_estimate_due = now + estimate_period

if __name__ == "__main__":

    parser = argparse.ArgumentParser(description="Construct the binary file from the wordlist file")

    parser.add_argument("--wordlist",
                        default="wordlist.txt",
                        type=argparse.FileType("r", encoding="UTF8"),
                        help="The input wordlist file as generated by generate-wordlist")
    parser.add_argument("--lettercount",
                        default="lettercount.txt",
                        type=argparse.FileType("r", encoding="UTF8"),
                        help="The input lettercount file as generated by lettercount")
    parser.add_argument("--binfile",
                        default="letterset2count.kmrw",
                        type=argparse.FileType(mode="w+b", bufsize=0),
                        help="The output file to be produced")

    args = parser.parse_args()

    mk(args.lettercount, args.wordlist, args.binfile)