# koch-method-real-words/mk_bin_file.py
#!/usr/bin/env python3
import argparse
import mmap
from collections import Counter
from datetime import datetime, timedelta, timezone
from multiprocessing import Pool
from os import truncate

from letter2bitmask import Letter2Bitmask
from letters_rare_first import from_lettercount_file
# Each word is represented as a bitmask of its distinct letters (via
# Letter2Bitmask); a word "fits" a letter set when its bitmask is a
# subset of the set's bitmask.
#
# The original brute force algorithm:
#
#     def count_words(containing_bitmap):
#         words = 0
#         for (bm, cnt) in bitmap_count:
#             if bm & containing_bitmap == bm:
#                 words += cnt
#         return (containing_bitmap, words)
#
# Do the same in chunks:
chunk_size = 2 ** 16  # bitmaps per chunk: one chunk spans all 16 low bits
head_of_bitmap_filter = (2 ** 10 - 1) << 16  # the 10 high bits of a 26-bit bitmap


def count_words_per_chunk(initial_bitmap, bitmap_counts=None):
    """Count the fitting words for every letter-set bitmap in one chunk.

    For each bitmap in [initial_bitmap, initial_bitmap + chunk_size),
    count how many words use only letters present in that bitmap.

    Args:
        initial_bitmap: First bitmap of the chunk; must have only the 10
            high bits set so the whole chunk shares a single "head".
        bitmap_counts: Optional list of (word_bitmap, word_count) pairs.
            Defaults to the module-global ``bitmap_count`` prepared by
            ``mk`` (forked Pool workers inherit it).

    Returns:
        A list of (bitmap, words) pairs, containing only bitmaps with
        at least one fitting word.

    Raises:
        RuntimeError: If initial_bitmap is not a valid chunk start.
    """
    if bitmap_counts is None:
        bitmap_counts = bitmap_count
    # Some consistency checks: the chunk start must consist of head bits
    # only, and the whole chunk must share that same head.
    if initial_bitmap & head_of_bitmap_filter != initial_bitmap:
        raise RuntimeError(f"Nonsensical initial bitmap {initial_bitmap:x}")
    if (initial_bitmap + chunk_size - 1) & head_of_bitmap_filter != initial_bitmap:
        raise RuntimeError(f"Unexpected initial bitmap {initial_bitmap:x} - this is weird.")
    # We need not bother with words that need letters which will never
    # occur in our entire chunk: a word can only ever fit here if its
    # head bits are a subset of the chunk's head.
    bitmap_count_with_fitting_head = [
        (bm, cnt)
        for (bm, cnt) in bitmap_counts
        if (bm & initial_bitmap) == (bm & head_of_bitmap_filter)
    ]
    if not bitmap_count_with_fitting_head:
        # All words need letters that never occur in our chunk:
        return []
    # For each bitmap in our chunk, find out how many words fit:
    result = []
    for bitmap in range(initial_bitmap, initial_bitmap + chunk_size):
        words = 0
        for (bm, cnt) in bitmap_count_with_fitting_head:
            if bm & bitmap == bm:
                words += cnt
        if 0 < words:
            result.append((bitmap, words))
    return result
def mk(lettercount_file, wordlist, outfile):
    """Build the letter-set -> word-count database file.

    Reads the letter frequency table and the word list, then fills
    ``outfile`` with one unsigned 32-bit counter per possible 26-letter
    bitmap (2**26 slots, 256 MiB): the number of words writable using
    exactly the letters of that bitmap.

    Args:
        lettercount_file: Open text file as produced by lettercount.
        wordlist: Open text file with one word per line.
        outfile: Open seekable binary file (mode "w+b"); it is grown to
            its final size and then written through an mmap.
    """
    # l2b and bitmap_count are module globals on purpose: the forked
    # Pool workers inherit them, and count_words_per_chunk reads
    # bitmap_count from module scope.
    global l2b
    l2b = Letter2Bitmask(from_lettercount_file(lettercount_file))
    bitmap2count = Counter()
    for line in wordlist:
        line = line.rstrip()
        # l2b.number presumably maps a word to its distinct-letter
        # bitmask — TODO confirm against letter2bitmask.
        index = l2b.number(line)
        bitmap2count[index] += 1
    global bitmap_count
    bitmap_count = [(bitmap, bitmap2count[bitmap]) for bitmap in bitmap2count.keys()]
    # Grow the file to its final size (2**26 uint32 slots) before
    # mmapping it; writing the last byte forces the allocation.
    length = 2 ** 26 * 4
    outfile.seek(length - 1)
    outfile.write(bytes([0]))
    outfile.flush()
    outfile.seek(0)
    with mmap.mmap(outfile.fileno(), length) as mm:
        # 'I' = native unsigned int (4 bytes), matching length / 4 slots.
        with memoryview(mm).cast('I') as view:
            with Pool(processes=4) as pool:
                last_bitmap_for_estimate = 2 ** 26 - chunk_size
                estimate_period = timedelta(seconds=20)
                # Fix: datetime.now() returned naive *local* time although
                # the messages label it "UTC"; use an aware UTC timestamp.
                start = datetime.now(timezone.utc)
                next_estimate_due = start + estimate_period
                print(f"Starting kmrw database generation process at {start.isoformat()} UTC")
                print("The first estimates will be pessimistic and may take some minutes to come in.")
                # Reverse the order, as that makes the estimates pessimistic:
                chunk_starts = [cstart for cstart in range(0, 2 ** 26, chunk_size)]
                chunk_starts.reverse()
                for bm_words_pairs in pool.imap(count_words_per_chunk, chunk_starts, 1):
                    for (bm, words) in bm_words_pairs:
                        view[bm] = words
                        last_bitmap_for_estimate = bm
                    now = datetime.now(timezone.utc)
                    if (next_estimate_due <= now):
                        # Chunks run in descending order, so the lowest
                        # bitmap written so far is the remaining fraction
                        # of the 2**26 total.
                        remaining_work = last_bitmap_for_estimate / 2 ** 26
                        expected_end = now + (now - start) * (remaining_work / (1 - remaining_work))
                        print(f"{last_bitmap_for_estimate} left. Done at or before {expected_end.isoformat()} UTC.")
                        next_estimate_due = now + estimate_period
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Construct the binary file from the wordlist file")
parser.add_argument("--wordlist",
default="wordlist.txt",
type=argparse.FileType("r", encoding="UTF8"),
help="The input wordlist file as generated by generate-wordlist")
parser.add_argument("--lettercount",
default="lettercount.txt",
type=argparse.FileType("r", encoding="UTF8"),
help="The input lettercount file as generated by lettercount")
parser.add_argument("--binfile",
default="letterset2count.kmrw",
type=argparse.FileType(mode="w+b", bufsize=0),
help="The output file to be produced")
args = parser.parse_args()
mk(args.lettercount, args.wordlist, args.binfile)