kopia lustrzana https://gitlab.com/4ham/koch-method-real-words
122 wiersze
5.1 KiB
Python
Executable File
122 wiersze
5.1 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
import argparse
|
|
import mmap
|
|
import json
|
|
from itertools import combinations
|
|
|
|
from letters_rare_first import from_lettercount_file
|
|
from letter2bitmask import Letter2Bitmask
|
|
|
|
# Size of the alphabet; every letter set is a bitmask with this many bits.
_LETTER_COUNT = 26

# The database holds one 4-byte counter per possible letter-set bitmask.
_TABLE_BYTES = 2 ** _LETTER_COUNT * 4


def _find_best_index(count_n_index, min_words_new_letter):
    """Choose the best ``(count, bitmask)`` pair from *count_n_index*.

    This is the (crude) way of choosing what to learn next.

    A pair is "rich" if its word count is at least
    *min_words_new_letter*.  If any rich pair exists, the rich pair with
    the smallest bitmask is returned.  Per the original author's notes,
    rare letters are mapped to small bitmask values, so preferring small
    bitmasks presumably introduces rare letters early, which gives them
    a chance to be practiced more often.

    If no pair is rich, the pair with the largest count is returned;
    ties on the count are broken in favour of the smaller bitmask.

    Args:
        count_n_index: Non-empty list of ``(count, bitmask)`` tuples.
        min_words_new_letter: Minimum count for a pair to be "rich".

    Returns:
        The selected ``(count, bitmask)`` tuple.

    Raises:
        ValueError: If *count_n_index* is empty.
    """
    rich = [ci for ci in count_n_index if ci[0] >= min_words_new_letter]
    if rich:
        # Smallest bitmask first; the count only breaks (unlikely) ties.
        return min(rich, key=lambda ci: (ci[1], ci[0]))
    # Nothing is rich: largest count wins, then the smaller bitmask.
    return min(count_n_index, key=lambda ci: (-ci[0], ci[1]))


def find(lettercount_file, min_words_new_letter, db_file):
    """Compute a lesson plan: three starting letters, then one per lesson.

    The database is read as an array of unsigned integers indexed by a
    letter-set bitmask; each entry is used as the number of words
    available for that letter set.

    Lesson 1 is chosen by brute force over every combination of three
    letters.  Each following lesson adds the single letter selected by
    ``_find_best_index`` from the number of *new* words each remaining
    letter would contribute, until all letters are in use.

    Args:
        lettercount_file: Lettercount input, handed unchanged to
            ``from_lettercount_file``.
        min_words_new_letter: Number of new words a candidate must
            provide to be preferred (see ``_find_best_index``).
        db_file: Open binary file object of the database; only its
            ``fileno()`` is used, to memory-map the table.

    Returns:
        A dict keyed ``"lesson_01"``, ``"lesson_02"``, ... describing the
        letters of each lesson and the word counts it unlocks.
    """
    l2b = Letter2Bitmask(from_lettercount_file(lettercount_file))

    # The letters, as a list of one-bit bitmasks (built once, reused below).
    single_bits = [1 << i for i in range(_LETTER_COUNT)]
    all_bits = 2 ** _LETTER_COUNT - 1

    with mmap.mmap(db_file.fileno(), _TABLE_BYTES, prot=mmap.PROT_READ) as mm:
        # Manage the raw view explicitly as well, so both buffer exports
        # are released before the mmap is closed.
        with memoryview(mm) as raw, raw.cast("I") as view:

            # First step: try all combinations of three letters (bits,
            # really) and look up how many words each combination allows.
            candidates = [
                (view[a | b | c], a | b | c)
                for a, b, c in combinations(single_bits, 3)
            ]
            count_initial, first_three_bits = _find_best_index(
                candidates, min_words_new_letter)

            result = {}

            result["lesson_01"] = {
                "letters": l2b.chars(first_three_bits),
                "new_words": count_initial,
                "words_total": count_initial}

            # Now consecutively add one bit at a time.
            bits_so_far = first_three_bits
            lesson_number = 2
            count_so_far = count_initial
            while bits_so_far != all_bits:
                # For every letter not yet learned: how many words would
                # it add on top of what is already available?
                candidates = [
                    (view[bits_so_far | new_bit] - count_so_far, new_bit)
                    for new_bit in single_bits
                    if not bits_so_far & new_bit
                ]

                new_count, new_bit = _find_best_index(
                    candidates, min_words_new_letter)
                result[f"lesson_{lesson_number:02d}"] = {
                    "new_letter": l2b.chars(new_bit)[0],
                    "old_letters": l2b.chars(bits_so_far),
                    "new_words": new_count,
                    "words_total": count_so_far + new_count
                }

                bits_so_far = bits_so_far | new_bit
                count_so_far += new_count
                lesson_number += 1

    return result
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: parse the options, compute the lesson
    # plan with find() and print it to stdout as indented JSON.
    cli = argparse.ArgumentParser(
        description="Find choices for initial three letters, given a binfile.",
    )

    # Binary database file, opened unbuffered for reading.
    cli.add_argument(
        "--binfile",
        type=argparse.FileType(mode="rb", bufsize=0),
        default="letterset2count.kmrw",
        help="The database file used as a basis.",
    )

    # Lettercount text file, opened for reading as UTF-8.
    cli.add_argument(
        "--lettercount",
        type=argparse.FileType(mode="r", encoding="UTF8"),
        default="lettercount.txt",
        help="The input lettercount file as generated by lettercount",
    )

    # Threshold passed to find() as min_words_new_letter.
    cli.add_argument(
        "--words",
        type=int,
        default=120,
        help="Number of new words needed for a new letter to be learnable",
    )

    options = cli.parse_args()

    lessons = find(options.lettercount, options.words, options.binfile)
    print(json.dumps(lessons, indent=2))
|
|
|