#!/usr/bin/env python3
"""Generate the letterset2count binary database from a wordlist.

For each of the 2**26 possible letter bitmaps the output file stores,
as one 32-bit unsigned integer, how many wordlist entries use only
letters contained in that bitmap.
"""
import argparse
import mmap
from collections import Counter
from datetime import datetime, timedelta, timezone
from multiprocessing import Pool

from letters_rare_first import from_lettercount_file
from letter2bitmask import Letter2Bitmask

# The original brute force algorithm:
#
# def count_words(containing_bitmap):
#     words = 0
#     for (bm, cnt) in bitmap_count:
#         if bm & containing_bitmap == bm:
#             words += cnt
#     return (containing_bitmap, words)

# Do the same in chunks: every chunk spans the full 16 low bits of the
# bitmap space, so all bitmaps within one chunk share the same 10 high
# ("head") bits.
chunk_size = 2 ** 16
head_of_bitmap_filter = (2 ** 10 - 1) << 16


def count_words_per_chunk(initial_bitmap):
    """Count fitting words for every bitmap in one chunk.

    Reads the module-global ``bitmap_count`` (a list of
    ``(bitmask, wordcount)`` pairs), which ``mk`` sets up before the
    worker pool is created so forked workers inherit it.

    :param initial_bitmap: first bitmap of the chunk; must be aligned to
        ``chunk_size``, i.e. its 16 low bits are all zero.
    :return: list of ``(bitmap, words)`` pairs; bitmaps with zero
        fitting words are omitted.
    :raises RuntimeError: if ``initial_bitmap`` is not chunk-aligned.
    """
    # Some consistency checks: the chunk start must consist of head bits
    # only, and the whole chunk must share those head bits.
    if initial_bitmap & head_of_bitmap_filter != initial_bitmap:
        raise RuntimeError(f"Nonsensical initial bitmap {initial_bitmap:x}")
    if (initial_bitmap + chunk_size - 1) & head_of_bitmap_filter != initial_bitmap:
        raise RuntimeError(f"Unexpected initial bitmap {initial_bitmap:x} - this is weird.")
    # We need not bother with words that need letters
    # which will never occur in our entire chunk:
    bitmap_count_with_fitting_head = [
        (bm, cnt)
        for (bm, cnt) in bitmap_count
        if (bm & initial_bitmap) == (bm & head_of_bitmap_filter)
    ]
    if not bitmap_count_with_fitting_head:
        # All words need letters that never occur in our chunk:
        return []
    # For each bitmap in our chunk, find out how many words fit:
    result = []
    for bitmap in range(initial_bitmap, initial_bitmap + chunk_size):
        words = 0
        for (bm, cnt) in bitmap_count_with_fitting_head:
            if bm & bitmap == bm:
                words += cnt
        if words:
            result.append((bitmap, words))
    return result


def mk(lettercount_file, wordlist, outfile, processes=4):
    """Build the binary database file from the wordlist.

    :param lettercount_file: open text file as produced by lettercount.
    :param wordlist: open text file, one word per line.
    :param outfile: open, writable, seekable binary file; it is grown to
        2**26 * 4 bytes and filled with one uint32 count per bitmap.
    :param processes: number of worker processes for the pool.
    """
    # Globals so that pool workers inherit them on fork.
    # NOTE(review): this relies on the "fork" start method; it would not
    # work with "spawn" (e.g. on Windows) -- confirm target platforms.
    global l2b
    l2b = Letter2Bitmask(from_lettercount_file(lettercount_file))
    bitmap2count = Counter()
    for line in wordlist:
        bitmap2count[l2b.number(line.rstrip())] += 1
    global bitmap_count
    bitmap_count = list(bitmap2count.items())
    # 2**26 possible bitmaps, 4 bytes (one uint32) each.
    length = 2 ** 26 * 4
    # Grow the file to its final size before mmapping it: write a single
    # zero byte at the last offset.
    outfile.seek(length - 1)
    outfile.write(bytes([0]))
    outfile.flush()
    outfile.seek(0)
    with mmap.mmap(outfile.fileno(), length) as mm:
        with memoryview(mm).cast('I') as view:
            with Pool(processes=processes) as pool:
                last_bitmap_for_estimate = 2 ** 26 - chunk_size
                estimate_period = timedelta(seconds=20)
                # Timezone-aware UTC: the messages advertise "UTC", so use
                # it (the old naive datetime.now() was the local clock).
                start = datetime.now(timezone.utc)
                next_estimate_due = start + estimate_period
                print(f"Starting kmrw database generation process at {start.isoformat()} UTC")
                print("The first estimates will be pessimistic and may take some minutes to come in.")
                # Reverse the order, as that makes the estimates pessimistic:
                chunk_starts = list(range(0, 2 ** 26, chunk_size))
                chunk_starts.reverse()
                for bm_words_pairs in pool.imap(count_words_per_chunk, chunk_starts, 1):
                    for (bm, words) in bm_words_pairs:
                        view[bm] = words
                        last_bitmap_for_estimate = bm
                    now = datetime.now(timezone.utc)
                    if next_estimate_due <= now:
                        # Chunks run high-to-low, so the last bitmap seen
                        # approximates the remaining fraction of the work.
                        remaining_work = last_bitmap_for_estimate / 2 ** 26
                        expected_end = now + (now - start) * (remaining_work / (1 - remaining_work))
                        print(f"{last_bitmap_for_estimate} left. Done at or before {expected_end.isoformat()} UTC.")
                        next_estimate_due = now + estimate_period


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Construct the binary file from the wordlist file")
    parser.add_argument("--wordlist", default="wordlist.txt",
                        type=argparse.FileType("r", encoding="UTF8"),
                        help="The input wordlist file as generated by generate-wordlist")
    parser.add_argument("--lettercount", default="lettercount.txt",
                        type=argparse.FileType("r", encoding="UTF8"),
                        help="The input lettercount file as generated by lettercount")
    parser.add_argument("--binfile", default="letterset2count.kmrw",
                        type=argparse.FileType(mode="w+b", bufsize=0),
                        help="The output file to be produced")
    args = parser.parse_args()
    mk(args.lettercount, args.wordlist, args.binfile)