feat: (At last) add rarity ordering. Also too many chore liftups

This commit is contained in:
Dendy 2025-08-26 08:45:55 +02:00
parent 12ca7afd80
commit f729734a71
6 changed files with 156 additions and 68 deletions

View File

@ -8,18 +8,14 @@ use App\Entity\Term;
use App\Service\AnkiService;
use App\Utils\Japanese;
use App\Utils\Progress;
use Symfony\Component\Console\Attribute\AsCommand;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputArgument;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Input\InputOption;
use Symfony\Component\Console\Output\OutputInterface;
use Symfony\Component\Console\Style\SymfonyStyle;
#[AsCommand(
name: 'app:create-production',
description: 'Add a short description for your command',
)]
#[AsCommand('app:create:listening', 'Create new listening Anki Cards')]
class CreateProductionCommand extends Command
{
public function __construct(
@ -28,6 +24,7 @@ class CreateProductionCommand extends Command
parent::__construct();
}
/** @return list<string> */
private static function extractKanji(string $str): array
{
preg_match_all('/\p{Script=Han}/u', $str, $matches);
@ -39,6 +36,7 @@ class CreateProductionCommand extends Command
return preg_replace('/[^\p{Script=Han}]/u', '', $str);
}
/** @param array<string, 0> $ref */
private static function kanjiDiff(array &$ref, string $subject): bool
{
$len = mb_strlen($subject);
@ -47,7 +45,7 @@ class CreateProductionCommand extends Command
for ($i = 0; $i < $len; $i++) {
$subKanji = mb_substr($subject, $i, 1);
foreach ($ref as $refKanji => $value) {
foreach (array_keys($ref) as $refKanji) {
if ($subKanji === $refKanji) continue 2;
}
@ -60,10 +58,11 @@ class CreateProductionCommand extends Command
/**
 * Declares the command's input: a single `count` argument described as
 * "Amount of cards to make".
 *
 * NOTE(review): `count` is registered twice below, first as OPTIONAL with a
 * default of 1 and then as REQUIRED. This looks like the removed and the added
 * side of a diff rendered together; confirm which registration is current and
 * drop the other.
 */
protected function configure(): void
{
$this
->addArgument('count', InputArgument::OPTIONAL, 'Amount of cards to make', 1);
//->addOption('option1', null, InputOption::VALUE_NONE, 'Option description')
;
$this->addArgument(
'count',
InputArgument::REQUIRED,
'Amount of cards to make',
);
}
protected function createProductionNoteFromTerm(Term $term): void
@ -112,71 +111,105 @@ class CreateProductionCommand extends Command
protected function execute(InputInterface $input, OutputInterface $output): int
{
$allSentenceNotes = $this->getAllSentenceNotes();
$allSentenceNotes = $this->getAllSentenceNotes();
$allListeningNotes = $this->getAllSentenceListeningNotes();
// --------- Getting list<SentenceNote> into array<TermKanji, Term> ---
// Index of all the Terms indexed by its TermKanji
$allTerms = []; // ["パレートの法則" => App\Entity\Term]
// Set of known Kanji Characters
$knownKanji = []; // ["法" => 0, "則" => 0]
// How many times it appears (not as a term, but in KanjiSentence)
$termCounts = []; // ["パレートの法則" => 1]
$allTerms = [];
$knownKanji = [];
$termCounts = [];
printf('Indexing all terms...');
foreach ($allSentenceNotes as $note) {
foreach ($note->getTerms() as &$term) {
assert($term instanceof Term);
// Deduplicate list
if (key_exists($term->getKanji(), $allTerms)) continue;
$termCounts[$term->getKanji()] = 0;
// Actual indexing
$allTerms[$term->getKanji()] = &$term;
// Just simple initialization
$termCounts[$term->getKanji()] = 0;
foreach (self::extractKanji($term->getKanji()) as $kanji) {
$knownKanji[$kanji] = 0;
}
}
}
printf(" OK (%d)\n", count($allTerms));
printf(" OK (%d)\n", count($knownKanji));
// Populate $knownKanji ["例" => 378, ...];
// TODO: Move this into own function to prevent side-effects. It's
// looping through the whole thing again anyway, so there's no
// need for it in here.
//
// Maybe while you're at it, it could be simplified into a
// function like $this->anki->getKanji('origField', 'countField')
// Where count can be null so it's just a Set
$progress = new Progress('Getting frequenciees', count($allSentenceNotes));
foreach ($allSentenceNotes as $note) {
$progress->tick();
$sentKanji = str_replace(
// Sanitize sentence (remove those pesky \u{200E})
$_sentKanji = str_replace(
"\u{200E}",
'',
strip_tags($note->getFields()['SentKanji'])
);
foreach ($knownKanji as $kanji => &$count) {
if (str_contains($sentKanji, $kanji)) $count++;
if (str_contains($_sentKanji, $kanji)) $count++;
}
}
// TODO: Make progress a function with a callback? That way scope inside
// and side-effects are easy to control & track
unset($progress);
printf("\n");
$seenKanji = $this->ankiService->getKnownSlnKanjiCounts();
// Build the values to be used in the ordering process
// TODO: It kinda feels wrong that $termCounts is used in this special
// manner while $seenScore is separate. Does it make sense to
// build them at this stage? Make a generic orderer?
//
// $termOrdering = ['first' => 32, 'second' => 34, 'apple' => 2];
//
// At first we just built the term list, then we generate an
// ordering array where the list is ordered
printf('Rating terms...');
$studiedKanji = $this->ankiService->getKnownSlnKanjiCounts('ASC');
$seenScore = [];
foreach ($allTerms as $key => $term) {
$count = Japanese::kanjiDiff(
$term->getKanji(),
array_keys($seenKanji),
);
$diff = Japanese::kanjiDiff($term->getKanji(), array_keys($studiedKanji));
if ($count <= 0) {
if (count($diff) <= 0) {
unset($allTerms[$key]);
unset($termCounts[$key]);
continue;
}
$termCounts[$term->getKanji()] = $count;
// Build $seenScore
$seenScore[$key] = 0;
foreach (Japanese::getKanjiList($key) as $_kanji) {
$seenScore[$key] += $studiedKanji[$_kanji] ?? 0;
}
// Build $termCounts
$termCounts[$term->getKanji()] = count($diff);
}
// Ordering in having:
// 1. Least new Kanji (ideally just 1 new kanji)
// 2. Most Kanji (most amount of unique kanji)
// 3. Least studied kanji
uksort($termCounts, fn($a, $b) => $seenScore[$a] <=> $seenScore[$b]);
uksort($termCounts, function ($a, $b) {
$aLen = mb_strlen(Japanese::getOnlyKanji($a));
$bLen = mb_strlen(Japanese::getOnlyKanji($b));
$aLen = count(Japanese::getKanjiList($a));
$bLen = count(Japanese::getKanjiList($b));
return $bLen <=> $aLen;
});
asort($termCounts);
asort($termCounts, SORT_NUMERIC);
printf(" OK\n");
// Take into account the ones that have already been created.
// This will not only skip them but also update the general array for
// already seen kanji.
@ -184,8 +217,6 @@ class CreateProductionCommand extends Command
printf('Filtering out terms with no new kanji...');
// First pass: Get the list of the kanji we've seen
foreach ($allListeningNotes as $listeningNote) {
assert($listeningNote instanceof SentenceListeningNote);
$termKanji = self::getOnlyKanji($listeningNote->getTerm()->getKanji());
self::kanjiDiff($seenKanji, $termKanji);
}
@ -200,7 +231,7 @@ class CreateProductionCommand extends Command
}
printf(" OK\n");
asort($termCounts);
asort($termCounts, SORT_NUMERIC);
printf("\n");
@ -209,11 +240,8 @@ class CreateProductionCommand extends Command
foreach ($termCounts as $term => $count) {
if ($newNotesCount <= 0) break;
// FIXME: This shouldn't happen at all
if (!$allTerms[$term] instanceof Term) continue;
$termKanji = self::getOnlyKanji($term);
printf("%s: %d\n", $term, $count);
printf("%s %d | %d\n", "{$term}", $count, $seenScore[$term]);
//$len = mb_strlen($termKanji);
//for ($i = 0; $i < $len; $i++) {

View File

@ -73,6 +73,14 @@ class AnkiController extends AbstractController
]);
}
/**
 * Respond with every kanji counted by AnkiService::getKnownSnKanjiCounts(),
 * ordered from the highest count to the lowest.
 *
 * The kanji are concatenated into a single string with no separator.
 */
#[Route('/kanji', 'kanji', methods: 'GET')]
public function kanji(): Response
{
    $kanjiCounts = $this->ankiService->getKnownSnKanjiCounts();

    // asort() has no descending mode: SORT_DESC is an array_multisort() order
    // constant, not a sort flag, so asort($x, SORT_DESC) still sorts ascending.
    // arsort() is the key-preserving value sort that runs high to low.
    arsort($kanjiCounts, SORT_NUMERIC);

    return new Response(implode('', array_keys($kanjiCounts)));
}
#[Route('/note/{nid}/get', name: 'get_note')]
public function get_note(int $nid)
{

View File

@ -11,10 +11,12 @@ class SentenceNote extends Note
// -------------------------------------------------- Getters & setters ---
/**
 * Terms currently attached to this note.
 *
 * @return list<Term>
 */
public function getTerms(): array
{
    return $this->terms;
}
/** @param list<Term> $terms */
public function setTerms(array $terms): static
{
$this->terms = $terms;

View File

@ -18,15 +18,21 @@ class Term
return self::parseFurigana($this->kanji)['kanji'];
}
/**
* Get the kanji version & the reading for a given term
*
* TODO: Make this smarter & handle mixing of kanji & hiragana
*
* @return array{'kanji': string, 'reading': null|string}
* */
public static function parseFurigana(string $furigana): array
{
// 0: all, 1: (kanji/hiragana), 2: ([reading]): 3: (reading)
// 0: all, 1: (kanji/hiragana), 2: ([reading, ...]), 3: (reading)
preg_match_all('/([^ \[]+)(\[([^\]]*)\])? ?/', $furigana, $matches, PREG_SET_ORDER);
$matchedKanji = array_map(fn($x) => $x[1], $matches);
$matchedReading = array_map(fn($x) => $x[3] ?? $x[1], $matches);
return [
'kanji' => join('', $matchedKanji),
'reading' => $matchedKanji == $matchedReading
@ -35,7 +41,7 @@ class Term
];
}
public function toAnkiVocabDef()
public function toAnkiVocabDef(): string
{
$ret = '<span ' . Note::HIGHLIGHT_ATTR_KANJI . '>' . $this->kanji;
@ -79,10 +85,7 @@ class Term
'」' => ']',
' ' => ' ',
]));
$def = mb_trim($def);
if (!is_string($term->kanji)) {
return null;
}
$def = mb_trim($def);
// -------------------------------------------------- No definition ---
@ -139,6 +142,10 @@ class Term
return Term::fromVocabDefLine($kanji . $separator . $def);
}
/**
* @param array<string, string> $fields
* @return list<Term>
*/
public static function fromNoteFields(array $fields): array
{
// -------------------- Trying to extract it with the modern syntax ---

View File

@ -134,7 +134,7 @@ class AnkiService
}
/** @return array<string, int> */
public function getKnownSlnKanjiCounts(): array
public function getKnownSlnKanjiCounts(?string $order = null): array
{
$allListeningIds = $this->getAllSentenceListeningNoteIds();
$ret = [];
@ -152,6 +152,34 @@ class AnkiService
}
}
uasort($ret, function (int $a, int $b) use ($order) {
return $order === 'ASC' ? $a <=> $b : $b <=> $a;
});
return $ret;
}
/**
 * Count how many times each kanji occurs in the terms of all sentence notes.
 *
 * Every occurrence is counted: a kanji repeated inside one term, or present
 * in several terms or notes, is counted once per appearance. Keys appear in
 * the order each kanji was first encountered.
 *
 * @return array<string, int> Occurrence count keyed by kanji character
 */
public function getKnownSnKanjiCounts(): array
{
    $counts = [];
    $sentenceNotes = $this->getNotes($this->getAllSentenceNoteIds());

    foreach ($sentenceNotes as $sentenceNote) {
        assert($sentenceNote instanceof SentenceNote);

        foreach ($sentenceNote->getTerms() as $term) {
            // Non-kanji characters are dropped before counting.
            $kanjiOnly = Japanese::getOnlyKanji($term->getKanji());

            foreach (mb_str_split($kanjiOnly) as $character) {
                $counts[$character] = ($counts[$character] ?? 0) + 1;
            }
        }
    }

    return $counts;
}
}

View File

@ -6,33 +6,48 @@ class Japanese
{
/**
 * Strip everything except Han-script characters (kanji) from a string.
 *
 * @return string The kanji of $str in their original order, or an empty
 *                string when preg_replace() fails (e.g. on malformed UTF-8)
 */
public static function getOnlyKanji(string $str): string
{
    // preg_replace() returns null on failure, which would violate the
    // declared string return type, so fall back to an empty string.
    return preg_replace('/[^\p{Script=Han}]/u', '', $str) ?? '';
}
/** Get the number of kanji of a string that are not in the given set
* of kanji
*
* Only kanji are considered, not katakana, hiragana or any other symbols.
*
* @param list<string> $kanjiSet
*/
public static function kanjiDiff(string $str, array $kanjiSet): int
/**
 * Lazily walk a multibyte string, yielding one substring per character offset.
 *
 * Starting at $start, each step yields mb_substr($str, offset, $length) and
 * then advances the offset by exactly one character, whatever $length is.
 * With the default $length of 1 this yields the characters one by one.
 * Iteration stops at the first empty substring.
 *
 * @return \Generator<int, string>
 */
public static function mbIterate(
    string $str,
    int $start = 0,
    int $length = 1,
    ?string $encoding = null,
): \Generator {
    for ($offset = $start; ; $offset++) {
        $chunk = mb_substr($str, $offset, $length, $encoding);
        if ($chunk === '') {
            return;
        }

        yield $chunk;
    }
}
/** @return list<string> */
public static function getKanjiList(string $str): array
{
$ret = 0;
$ret = [];
$strKanji = self::getOnlyKanji($str);
$len = mb_strlen($strKanji);
for ($i = 0; $i < $len; $i++) {
$kanji = mb_substr($strKanji, $i, 1);
if (!array_search($kanji, $kanjiSet)) {
$ret++;
}
foreach (self::mbIterate(self::getOnlyKanji($str)) as $kanji) {
$ret[$kanji] = 0;
}
//dump($str, $strKanji, $ret);
//echo "\n";
return array_keys($ret);
}
return $ret;
/**
 * Get the distinct kanji of a string that are missing from a given set.
 *
 * Only kanji are considered, not katakana, hiragana or any other symbols.
 * Each missing kanji is reported once, in order of first appearance.
 *
 * @param list<string> $kanjiSet Kanji to treat as already known
 * @return list<string> Kanji that occur in $str but not in $kanjiSet
 */
public static function kanjiDiff(string $str, array $kanjiSet): array
{
    $missing = [];
    foreach (self::mbIterate(self::getOnlyKanji($str)) as $kanji) {
        // Strict in_array() rather than !array_search(): array_search()
        // returns index 0 for a match on the first element, which `!` would
        // misread as "not found".
        if (!in_array($kanji, $kanjiSet, true)) {
            $missing[$kanji] = 0;
        }
    }

    return array_keys($missing);
}
}