koch-method-real-words/mk_bin_file.py

#!/usr/bin/env python3

import argparse
import mmap
from datetime import datetime, timedelta
from multiprocessing import Pool
from os import truncate

from letters_rare_first import from_lettercount_file
from letter2bitmask import Letter2Bitmask
from collections import Counter

# The original brute force algorithm:
# def count_words(containing_bitmap):
#     words = 0
#     for (bm, cnt) in bitmap_count:
#         if bm & containing_bitmap == bm:
#             words += cnt
#     return (containing_bitmap, words)

# Do the same in chunks:
# Number of consecutive bitmaps that make up one unit of work.
chunk_size = 1 << 16
# Mask selecting the ten bits directly above the low 16 bits of a bitmap;
# these "head" bits are constant across all bitmaps of one chunk.
head_of_bitmap_filter = ((1 << 10) - 1) << 16

def count_words_per_chunk(initial_bitmap):
    """Count the matching words for every bitmap of one chunk.

    The chunk consists of the ``chunk_size`` consecutive bitmaps starting at
    ``initial_bitmap``.  A word entry ``(mask, count)`` from the module-level
    ``bitmap_count`` list matches a bitmap when every bit set in ``mask`` is
    also set in that bitmap.

    Args:
        initial_bitmap: First bitmap of the chunk.  It may only have bits
            inside ``head_of_bitmap_filter`` set.

    Returns:
        A list of ``(bitmap, words)`` tuples in ascending bitmap order,
        containing only the bitmaps with a positive word total.  The list is
        empty when no word can match anywhere in the chunk.

    Raises:
        RuntimeError: If ``initial_bitmap`` has bits outside the head filter,
            or if the last bitmap of the chunk has a different head.
    """
    # Consistency checks on the chunk boundaries:
    if initial_bitmap != initial_bitmap & head_of_bitmap_filter:
        raise RuntimeError(f"Nonsensical initial bitmap {initial_bitmap:x}")
    last_in_chunk = initial_bitmap + chunk_size - 1
    if initial_bitmap != last_in_chunk & head_of_bitmap_filter:
        raise RuntimeError(f"Unexpected initial bitmap {initial_bitmap:x} - this is weird.")

    # Words that need a head bit which is not set in this chunk can never
    # match any bitmap of the chunk, so drop them up front:
    candidates = [
        (mask, count)
        for (mask, count) in bitmap_count
        if mask & head_of_bitmap_filter == mask & initial_bitmap
    ]
    if not candidates:
        # Every word needs letters that never occur in this chunk.
        return []

    # For each bitmap of the chunk, add up the counts of all words that fit:
    totals = []
    for candidate in range(initial_bitmap, initial_bitmap + chunk_size):
        fitting = sum(count for (mask, count) in candidates if mask & candidate == mask)
        if fitting > 0:
            totals.append((candidate, fitting))
    return totals


def _init_worker(pairs):
    """Install the (bitmap, count) table as a module global in a pool worker."""
    global bitmap_count
    bitmap_count = pairs


def mk(lettercount_file, wordlist, outfile):
    """Build the binary lookup file that maps each bitmap to a word count.

    The output is a flat array of ``2 ** 26`` unsigned integers (memoryview
    format ``'I'``, native byte order; the file size assumes 4 bytes each).
    The entry at index ``bitmap`` receives the word total reported by
    ``count_words_per_chunk`` for that bitmap; entries that are never
    reported stay zero.

    Args:
        lettercount_file: Handed to ``from_lettercount_file`` to build the
            ``Letter2Bitmask`` instance.
        wordlist: Iterable of text lines.  Trailing whitespace is stripped
            and each line is passed to ``l2b.number`` (presumably one word
            per line, mapped to the bitmap of its letters -- confirm against
            ``letter2bitmask``).
        outfile: Binary file object opened for reading and writing; it must
            support ``seek``, ``write``, ``flush`` and ``fileno``.

    Side effects:
        Sets the module globals ``l2b`` and ``bitmap_count`` and prints
        progress estimates to stdout.
    """
    # Function-scope import so the module's import block stays untouched.
    from datetime import timezone

    global l2b
    global bitmap_count

    def utc_now():
        # datetime.now() without a timezone is local time, but the messages
        # below are labelled "UTC".  Take the time in UTC and drop the tzinfo
        # so isoformat() prints no offset suffix in front of that label.
        return datetime.now(timezone.utc).replace(tzinfo=None)

    l2b = Letter2Bitmask(from_lettercount_file(lettercount_file))

    bitmap2count = Counter()
    for line in wordlist:
        bitmap2count[l2b.number(line.rstrip())] += 1

    bitmap_count = list(bitmap2count.items())

    total_bitmaps = 2 ** 26
    length = total_bitmaps * 4

    # Grow the file to its final size by writing a single byte at the end,
    # so that the whole range can be memory-mapped.
    outfile.seek(length - 1)
    outfile.write(bytes([0]))
    outfile.flush()
    outfile.seek(0)
    with mmap.mmap(outfile.fileno(), length) as mm:
        with memoryview(mm).cast('I') as view:
            # Hand the table to every worker explicitly.  Only a forked worker
            # inherits globals assigned at runtime; with the "spawn" and
            # "forkserver" start methods the workers would otherwise fail
            # with a NameError on bitmap_count.
            with Pool(processes=4,
                      initializer=_init_worker,
                      initargs=(bitmap_count,)) as pool:
                last_bitmap_for_estimate = total_bitmaps - chunk_size
                estimate_period = timedelta(seconds=20)
                start = utc_now()
                next_estimate_due = start + estimate_period
                print(f"Starting kmrw database generation process at {start.isoformat()} UTC")
                print("The first estimates will be pessimistic and may take some minutes to come in.")

                # Reverse the order, as that makes the estimates pessimistic:
                chunk_starts = list(reversed(range(0, total_bitmaps, chunk_size)))

                for bm_words_pairs in pool.imap(count_words_per_chunk, chunk_starts, 1):
                    for (bm, words) in bm_words_pairs:
                        view[bm] = words
                        last_bitmap_for_estimate = bm
                    now = utc_now()
                    if next_estimate_due <= now:
                        # Chunks are processed from high to low bitmaps, so the
                        # last bitmap written approximates the share still open.
                        remaining_work = last_bitmap_for_estimate / total_bitmaps
                        expected_end = now + (now - start) * (remaining_work / (1 - remaining_work))
                        print(f"{last_bitmap_for_estimate} left. Done at or before {expected_end.isoformat()} UTC.")
                        next_estimate_due = now + estimate_period

if __name__ == "__main__":

    # Both text inputs are opened read-only and decoded as UTF-8.
    utf8_input = argparse.FileType("r", encoding="UTF8")
    # The output is opened unbuffered in binary read/write mode ("w+b").
    raw_output = argparse.FileType(mode="w+b", bufsize=0)

    cli = argparse.ArgumentParser(
        description="Construct the binary file from the wordlist file")

    cli.add_argument(
        "--wordlist",
        default="wordlist.txt",
        type=utf8_input,
        help="The input wordlist file as generated by generate-wordlist")
    cli.add_argument(
        "--lettercount",
        default="lettercount.txt",
        type=utf8_input,
        help="The input lettercount file as generated by lettercount")
    cli.add_argument(
        "--binfile",
        default="letterset2count.kmrw",
        type=raw_output,
        help="The output file to be produced")

    options = cli.parse_args()

    # Argument order of mk() is (lettercount, wordlist, binfile).
    mk(options.lettercount, options.wordlist, options.binfile)
Koch method for learning Morse code with real words: Initial public commit. 2020-12-22 17:33:49 +00:00			`#!/usr/bin/env python3`

			`import argparse`
			`import mmap`
			`from datetime import datetime, timedelta`
			`from multiprocessing import Pool`
			`from os import truncate`

			`from letters_rare_first import from_lettercount_file`
			`from letter2bitmask import Letter2Bitmask`
			`from collections import Counter`

			`# The original brute force algorithm:`
			`# def count_words(containing_bitmap):`
			`# words = 0`
			`# for (bm, cnt) in bitmap_count:`
			`# if bm & containing_bitmap == bm:`
			`# words += cnt`
			`# return (containing_bitmap, words)`

			`# Do the same in chunks:`
			`chunk_size = 2 ** 16`
			`head_of_bitmap_filter = (2 ** 10 - 1) << 16`

			`def count_words_per_chunk(initial_bitmap):`

			`# Some consistency checks:`
			`if initial_bitmap & head_of_bitmap_filter != initial_bitmap:`
			`raise RuntimeError(f"Nonsensical initial bitmap {initial_bitmap:x}")`
			`if (initial_bitmap + chunk_size - 1) & head_of_bitmap_filter != initial_bitmap:`
			`raise RuntimeError(f"Unexpected initial bitmap {initial_bitmap:x} - this is weird.")`

			`# We need not bother with words that need letters`
			`# which will never occure in our entire chunk:`
			`bitmap_count_with_fitting_head = []`
			`for (bm, cnt) in bitmap_count:`
			`if (bm & initial_bitmap) == (bm & head_of_bitmap_filter):`
			`bitmap_count_with_fitting_head.append((bm, cnt))`

			`if len(bitmap_count_with_fitting_head) == 0:`
			`# All words need letters that never occure in our chunk:`
			`return []`
			`else:`
			`# For each bitmap in our chunk, find out how many words fit:`
			`result = []`
			`for bitmap in range(initial_bitmap, initial_bitmap + chunk_size):`
			`words = 0`
			`for (bm, cnt) in bitmap_count_with_fitting_head:`
			`if bm & bitmap == bm:`
			`words += cnt`
			`if 0 < words:`
			`result.append((bitmap, words))`
			`return result`


			`def mk(lettercount_file, wordlist, outfile):`

			`global l2b`
			`l2b = Letter2Bitmask(from_lettercount_file(lettercount_file))`

			`bitmap2count = Counter()`
			`for line in wordlist:`
			`line = line.rstrip()`
			`index = l2b.number(line)`
			`bitmap2count[index] += 1`

			`global bitmap_count`
			`bitmap_count = [(bitmap, bitmap2count[bitmap]) for bitmap in bitmap2count.keys()]`


			`length = 2 ** 26 * 4`
			`outfile.seek(length-1)`
			`outfile.write(bytes([0]))`
			`outfile.flush()`
			`outfile.seek(0)`
			`with mmap.mmap(outfile.fileno(), length) as mm:`
			`with memoryview(mm).cast('I') as view:`
			`with Pool(processes=4) as pool:`
			`last_bitmap_for_estimate = 2 ** 26 - chunk_size`
			`estimate_period = timedelta(seconds = 20)`
			`start = datetime.now()`
			`next_estimate_due = start + estimate_period`
			`print(f"Starting kmrw database generation process at {start.isoformat()} UTC")`
			`print("The first estimates will be pessimistic and may take some minutes to come in.")`

			`# Reverse the order, as that makes the estimates pessimistic:`
			`chunk_starts = [cstart for cstart in range(0, 2 ** 26, chunk_size)]`
			`chunk_starts.reverse()`

			`for bm_words_pairs in pool.imap(count_words_per_chunk, chunk_starts, 1):`
			`for (bm, words) in bm_words_pairs:`
			`view[bm] = words`
			`last_bitmap_for_estimate = bm`
			`now = datetime.now()`
			`if (next_estimate_due <= now):`
			`remaining_work = last_bitmap_for_estimate / 2 ** 26`
			`expected_end = now + (now - start) * (remaining_work / (1 - remaining_work))`
			`print(f"{last_bitmap_for_estimate} left. Done at or before {expected_end.isoformat()} UTC.")`
			`next_estimate_due = now + estimate_period`

			`if __name__ == "__main__":`

			`parser = argparse.ArgumentParser(description="Construct the binary file from the wordlist file")`

			`parser.add_argument("--wordlist",`
			`default="wordlist.txt",`
			`type=argparse.FileType("r", encoding="UTF8"),`
			`help="The input wordlist file as generated by generate-wordlist")`
			`parser.add_argument("--lettercount",`
			`default="lettercount.txt",`
			`type=argparse.FileType("r", encoding="UTF8"),`
			`help="The input lettercount file as generated by lettercount")`
			`parser.add_argument("--binfile",`
			`default="letterset2count.kmrw",`
			`type=argparse.FileType(mode="w+b", bufsize=0),`
			`help="The output file to be produced")`

			`args = parser.parse_args()`

			`mk(args.lettercount, args.wordlist, args.binfile)`