kopia lustrzana https://gitlab.com/4ham/koch-method-real-words
121 wiersze
4.7 KiB
Python
121 wiersze
4.7 KiB
Python
|
#!/usr/bin/env python3
|
||
|
|
||
|
import argparse
|
||
|
import mmap
|
||
|
from datetime import datetime, timedelta
|
||
|
from multiprocessing import Pool
|
||
|
from os import truncate
|
||
|
|
||
|
from letters_rare_first import from_lettercount_file
|
||
|
from letter2bitmask import Letter2Bitmask
|
||
|
from collections import Counter
|
||
|
|
||
|
# The original brute force algorithm:
|
||
|
# def count_words(containing_bitmap):
|
||
|
# words = 0
|
||
|
# for (bm, cnt) in bitmap_count:
|
||
|
# if bm & containing_bitmap == bm:
|
||
|
# words += cnt
|
||
|
# return (containing_bitmap, words)
|
||
|
|
||
|
# Do the same in chunks:
|
||
|
chunk_size = 2 ** 16
|
||
|
head_of_bitmap_filter = (2 ** 10 - 1) << 16
|
||
|
|
||
|
def count_words_per_chunk(initial_bitmap):
|
||
|
|
||
|
# Some consistency checks:
|
||
|
if initial_bitmap & head_of_bitmap_filter != initial_bitmap:
|
||
|
raise RuntimeError(f"Nonsensical initial bitmap {initial_bitmap:x}")
|
||
|
if (initial_bitmap + chunk_size - 1) & head_of_bitmap_filter != initial_bitmap:
|
||
|
raise RuntimeError(f"Unexpected initial bitmap {initial_bitmap:x} - this is weird.")
|
||
|
|
||
|
# We need not bother with words that need letters
|
||
|
# which will never occure in our entire chunk:
|
||
|
bitmap_count_with_fitting_head = []
|
||
|
for (bm, cnt) in bitmap_count:
|
||
|
if (bm & initial_bitmap) == (bm & head_of_bitmap_filter):
|
||
|
bitmap_count_with_fitting_head.append((bm, cnt))
|
||
|
|
||
|
if len(bitmap_count_with_fitting_head) == 0:
|
||
|
# All words need letters that never occure in our chunk:
|
||
|
return []
|
||
|
else:
|
||
|
# For each bitmap in our chunk, find out how many words fit:
|
||
|
result = []
|
||
|
for bitmap in range(initial_bitmap, initial_bitmap + chunk_size):
|
||
|
words = 0
|
||
|
for (bm, cnt) in bitmap_count_with_fitting_head:
|
||
|
if bm & bitmap == bm:
|
||
|
words += cnt
|
||
|
if 0 < words:
|
||
|
result.append((bitmap, words))
|
||
|
return result
|
||
|
|
||
|
|
||
|
def mk(lettercount_file, wordlist, outfile):
|
||
|
|
||
|
global l2b
|
||
|
l2b = Letter2Bitmask(from_lettercount_file(lettercount_file))
|
||
|
|
||
|
bitmap2count = Counter()
|
||
|
for line in wordlist:
|
||
|
line = line.rstrip()
|
||
|
index = l2b.number(line)
|
||
|
bitmap2count[index] += 1
|
||
|
|
||
|
global bitmap_count
|
||
|
bitmap_count = [(bitmap, bitmap2count[bitmap]) for bitmap in bitmap2count.keys()]
|
||
|
|
||
|
|
||
|
length = 2 ** 26 * 4
|
||
|
outfile.seek(length-1)
|
||
|
outfile.write(bytes([0]))
|
||
|
outfile.flush()
|
||
|
outfile.seek(0)
|
||
|
with mmap.mmap(outfile.fileno(), length) as mm:
|
||
|
with memoryview(mm).cast('I') as view:
|
||
|
with Pool(processes=4) as pool:
|
||
|
last_bitmap_for_estimate = 2 ** 26 - chunk_size
|
||
|
estimate_period = timedelta(seconds = 20)
|
||
|
start = datetime.now()
|
||
|
next_estimate_due = start + estimate_period
|
||
|
print(f"Starting kmrw database generation process at {start.isoformat()} UTC")
|
||
|
print("The first estimates will be pessimistic and may take some minutes to come in.")
|
||
|
|
||
|
# Reverse the order, as that makes the estimates pessimistic:
|
||
|
chunk_starts = [cstart for cstart in range(0, 2 ** 26, chunk_size)]
|
||
|
chunk_starts.reverse()
|
||
|
|
||
|
for bm_words_pairs in pool.imap(count_words_per_chunk, chunk_starts, 1):
|
||
|
for (bm, words) in bm_words_pairs:
|
||
|
view[bm] = words
|
||
|
last_bitmap_for_estimate = bm
|
||
|
now = datetime.now()
|
||
|
if (next_estimate_due <= now):
|
||
|
remaining_work = last_bitmap_for_estimate / 2 ** 26
|
||
|
expected_end = now + (now - start) * (remaining_work / (1 - remaining_work))
|
||
|
print(f"{last_bitmap_for_estimate} left. Done at or before {expected_end.isoformat()} UTC.")
|
||
|
next_estimate_due = now + estimate_period
|
||
|
|
||
|
if __name__ == "__main__":
|
||
|
|
||
|
parser = argparse.ArgumentParser(description="Construct the binary file from the wordlist file")
|
||
|
|
||
|
parser.add_argument("--wordlist",
|
||
|
default="wordlist.txt",
|
||
|
type=argparse.FileType("r", encoding="UTF8"),
|
||
|
help="The input wordlist file as generated by generate-wordlist")
|
||
|
parser.add_argument("--lettercount",
|
||
|
default="lettercount.txt",
|
||
|
type=argparse.FileType("r", encoding="UTF8"),
|
||
|
help="The input lettercount file as generated by lettercount")
|
||
|
parser.add_argument("--binfile",
|
||
|
default="letterset2count.kmrw",
|
||
|
type=argparse.FileType(mode="w+b", bufsize=0),
|
||
|
help="The output file to be produced")
|
||
|
|
||
|
args = parser.parse_args()
|
||
|
|
||
|
mk(args.lettercount, args.wordlist, args.binfile)
|