koch-method-real-words/find_learning_order.py

122 wiersze
5.1 KiB
Python
Executable File

#!/usr/bin/env python3
import argparse
import mmap
import json
from itertools import combinations
from letters_rare_first import from_lettercount_file
from letter2bitmask import Letter2Bitmask
def _pick_best_candidate(candidates, min_words_new_letter):
    """Choose which (count, bitmask) pair to learn next.

    Args:
        candidates: non-empty list of ``(word_count, bitmask)`` pairs.
        min_words_new_letter: minimum word count for a candidate to be
            considered "rich", i.e. to unlock enough new words.

    Returns:
        The selected ``(word_count, bitmask)`` pair:

        * If at least one candidate is rich, the rich candidate with the
          SMALLEST bitmask.  Rare letters are presumably mapped to small
          bitmask values, so preferring small bitmasks introduces rare
          letters early and lets them be exercised more often.
        * Otherwise, the candidate with the largest word count; ties are
          broken in favour of the smaller bitmask.

    Raises:
        ValueError: if ``candidates`` is empty.
    """
    rich = [c for c in candidates if c[0] >= min_words_new_letter]
    if rich:
        # Smallest bitmask first; the count only breaks (unexpected) ties.
        return min(rich, key=lambda c: (c[1], c[0]))
    # No rich candidate: take the most words, then the smaller bitmask.
    return min(candidates, key=lambda c: (-c[0], c[1]))


def find(lettercount_file, min_words_new_letter, db_file):
    """Compute a lesson-by-lesson order in which to learn the 26 letters.

    The first lesson consists of three letters chosen by brute force over
    all three-letter combinations.  Every following lesson adds exactly one
    letter, chosen greedily by ``_pick_best_candidate``, until all 26
    letters have been introduced.

    Args:
        lettercount_file: passed unchanged to ``from_lettercount_file``; the
            result is used to build the ``Letter2Bitmask`` that translates
            bitmasks back to letters.
        min_words_new_letter: number of new words a letter (or the initial
            letter triple) should unlock to be preferred.
        db_file: open binary file object (must support ``fileno()``).  Its
            first ``2 ** 26 * 4`` bytes are mapped read-only and read as
            native unsigned ints indexed by letter-set bitmask; each entry
            is treated as the number of words available with that letter set.

    Returns:
        dict mapping ``"lesson_NN"`` to a dict describing that lesson.
        ``lesson_01`` has the keys ``letters``, ``new_words`` and
        ``words_total``; later lessons have ``new_letter``, ``old_letters``,
        ``new_words`` and ``words_total``.
    """
    l2b = Letter2Bitmask(from_lettercount_file(lettercount_file))
    # The 26 letters, as one-bit bitmasks.  Built once and reused below.
    letter_bits = [1 << i for i in range(26)]
    all_bits = 2 ** 26 - 1
    # access=ACCESS_READ is the portable spelling of a read-only mapping
    # (the prot= keyword only exists on Unix).
    with mmap.mmap(db_file.fileno(), 2 ** 26 * 4, access=mmap.ACCESS_READ) as mm:
        with memoryview(mm).cast("I") as view:
            # First step: brute-force all combinations of three letters and
            # look up how many words each combination allows.
            candidates = [
                (view[a | b | c], a | b | c)
                for a, b, c in combinations(letter_bits, 3)
            ]
            count_initial, first_three_bits = _pick_best_candidate(
                candidates, min_words_new_letter)
            result = {
                "lesson_01": {
                    "letters": l2b.chars(first_three_bits),
                    "new_words": count_initial,
                    "words_total": count_initial,
                },
            }
            # Now add one letter (bit) at a time until all are known.
            bits_so_far = first_three_bits
            count_so_far = count_initial
            lesson_number = 2
            while bits_so_far != all_bits:
                # For every letter not yet learned: how many additional
                # words would become available by adding it?
                candidates = [
                    (view[bit | bits_so_far] - count_so_far, bit)
                    for bit in letter_bits
                    if not bit & bits_so_far
                ]
                new_count, new_bit = _pick_best_candidate(
                    candidates, min_words_new_letter)
                result[f"lesson_{lesson_number:02d}"] = {
                    "new_letter": l2b.chars(new_bit)[0],
                    "old_letters": l2b.chars(bits_so_far),
                    "new_words": new_count,
                    "words_total": count_so_far + new_count,
                }
                bits_so_far |= new_bit
                count_so_far += new_count
                lesson_number += 1
    return result
if __name__ == "__main__":
    # Command-line entry point: parse the options, compute the learning
    # order and print it to stdout as indented JSON.
    cli = argparse.ArgumentParser(
        description="Find choices for initial three letters, given a binfile.")
    # Database file, opened unbuffered in binary mode.
    cli.add_argument(
        "--binfile",
        type=argparse.FileType(mode="rb", bufsize=0),
        default="letterset2count.kmrw",
        help="The database file used as a basis.")
    # Letter-count text file.
    cli.add_argument(
        "--lettercount",
        type=argparse.FileType("r", encoding="UTF8"),
        default="lettercount.txt",
        help="The input lettercount file as generated by lettercount")
    # Threshold handed to find() as min_words_new_letter.
    cli.add_argument(
        "--words",
        type=int,
        default=120,
        help="Number of new words needed for a new letter to be learnable")
    options = cli.parse_args()
    learning_order = find(options.lettercount, options.words, options.binfile)
    print(json.dumps(learning_order, indent=2))