koch-method-real-words/find_learning_order.py

#!/usr/bin/env python3

import argparse
import mmap
import json
from itertools import combinations

from letters_rare_first import from_lettercount_file
from letter2bitmask import Letter2Bitmask

def _find_best_index(count_n_index, min_words_new_letter):
    """Pick the preferred ``(count, bitmask)`` pair from *count_n_index*.

    This is the (crude) way of choosing what to learn next.

    A pair is "rich" if its count is at least *min_words_new_letter*.
    If any rich pair exists, the rich pair with the SMALLEST bitmask is
    returned.  According to the original author's note, rare letters are
    mapped to small bitmask values, so preferring small bitmasks presumably
    introduces rare letters early and lets them be exercised more often.

    If no pair is rich, the pairs with the largest count are considered
    instead, and again the one with the smallest bitmask is returned.

    Args:
        count_n_index: non-empty list of ``(count, bitmask)`` tuples.
        min_words_new_letter: minimal count for a pair to be "rich".

    Returns:
        The selected ``(count, bitmask)`` tuple.

    Raises:
        ValueError: if *count_n_index* is empty.
    """
    def by_bitmask(pair):
        # Order primarily by bitmask; the count only breaks ties between
        # pairs that share the same bitmask.
        return (pair[1], pair[0])

    rich = [ci for ci in count_n_index if ci[0] >= min_words_new_letter]
    if rich:
        count, bitmask = min(rich, key=by_bitmask)
        return (count, bitmask)

    # No rich pair: fall back to the pairs that allow the most words.
    # Several pairs may share that count; prefer the smallest bitmask.
    largest_count = max(ci[0] for ci in count_n_index)
    count, bitmask = min(
        (ci for ci in count_n_index if ci[0] == largest_count),
        key=by_bitmask)
    return (count, bitmask)


def find(lettercount_file, min_words_new_letter, db_file):
    """Compute a lesson-by-lesson order in which to introduce the 26 letters.

    Lesson 1 introduces three letters at once; each following lesson adds
    exactly one further letter until all 26 letter bits are set (24 lessons
    in total).  Every choice is made by ``_find_best_index``.

    Args:
        lettercount_file: passed unchanged to ``from_lettercount_file``; the
            result is used to build the ``Letter2Bitmask`` that translates
            bitmasks back into letters.
        min_words_new_letter: number of new words a choice must unlock to be
            preferred (see ``_find_best_index``).
        db_file: open binary file object providing ``fileno()``.  Its first
            ``2 ** 26 * 4`` bytes are mapped read-only and read as an array
            of native unsigned ints indexed by letter-set bitmask.
            NOTE(review): presumably entry ``m`` is the number of words that
            can be written with the letters in ``m`` -- confirm against the
            tool that generates the file.

    Returns:
        A dict keyed ``"lesson_01"``, ``"lesson_02"``, ...  The first entry
        has the keys ``"letters"``, ``"new_words"`` and ``"words_total"``;
        all later entries have ``"new_letter"``, ``"old_letters"``,
        ``"new_words"`` and ``"words_total"``.
    """
    l2b = Letter2Bitmask(from_lettercount_file(lettercount_file))

    alphabet_size = 26
    # The 26 letters, as a list of one-bit bitmasks.
    single_bits = [1 << i for i in range(alphabet_size)]
    all_bits = (1 << alphabet_size) - 1

    # One table entry per possible letter set, 4 bytes each (this assumes
    # the native unsigned int used by cast("I") below is 4 bytes wide).
    # access=ACCESS_READ yields a read-only mapping on every platform,
    # whereas the prot= keyword only exists on Unix.
    table_bytes = (1 << alphabet_size) * 4

    with mmap.mmap(db_file.fileno(), table_bytes, access=mmap.ACCESS_READ) as mm:
        with memoryview(mm).cast("I") as view:

            # First step: try all combinations of three letters (bits, really)
            # by brute force and let _find_best_index choose among them.
            candidates = [
                (view[a | b | c], a | b | c)
                for a, b, c in combinations(single_bits, 3)
            ]
            count_initial, first_three_bits = _find_best_index(
                candidates, min_words_new_letter)

            result = {}
            result["lesson_01"] = {
                "letters": l2b.chars(first_three_bits),
                "new_words": count_initial,
                "words_total": count_initial}

            # Now consecutively add one bit (letter) at a time.
            bits_so_far = first_three_bits
            count_so_far = count_initial
            lesson_number = 2
            while bits_so_far != all_bits:
                # For every letter not learned yet: how many words does
                # adding it unlock beyond what is already available?
                candidates = [
                    (view[bits_so_far | bit] - count_so_far, bit)
                    for bit in single_bits
                    if not bits_so_far & bit
                ]

                new_count, new_bit = _find_best_index(
                    candidates, min_words_new_letter)
                result[f"lesson_{lesson_number:02d}"] = {
                    "new_letter": l2b.chars(new_bit)[0],
                    "old_letters": l2b.chars(bits_so_far),
                    "new_words": new_count,
                    "words_total": count_so_far + new_count
                }

                bits_so_far |= new_bit
                count_so_far += new_count
                lesson_number += 1

            return result

if __name__ == "__main__":
    # Script entry point: read the command-line options, compute the
    # learning order with find() and print it to stdout as indented JSON.

    cli = argparse.ArgumentParser(
        description="Find choices for initial three letters, given a binfile.")

    # Database file; opened unbuffered in binary mode by argparse.
    cli.add_argument(
        "--binfile",
        type=argparse.FileType(mode="rb", bufsize=0),
        default="letterset2count.kmrw",
        help="The database file used as a basis.")

    # Letter-count text file; opened as UTF-8 text by argparse.
    cli.add_argument(
        "--lettercount",
        type=argparse.FileType("r", encoding="UTF8"),
        default="lettercount.txt",
        help="The input lettercount file as generated by lettercount")

    # Threshold handed to find() as min_words_new_letter.
    cli.add_argument(
        "--words",
        type=int,
        default=120,
        help="Number of new words needed for a new letter to be learnable")

    options = cli.parse_args()

    learning_order = find(options.lettercount, options.words, options.binfile)
    print(json.dumps(learning_order, indent=2))