From da9d4d344ac934d754850c2b7eeb85cb35b86081 Mon Sep 17 00:00:00 2001 From: Dendy Faist Date: Thu, 1 May 2025 20:56:06 +0200 Subject: [PATCH] feat: Implement a better algo for production card creation term valuing --- src/Command/CreateProductionCommand.php | 71 ++++++++++++++----------- src/Service/AnkiService.php | 23 ++++++++ src/Utils/Japanese.php | 38 +++++++++++++ 3 files changed, 102 insertions(+), 30 deletions(-) create mode 100644 src/Utils/Japanese.php diff --git a/src/Command/CreateProductionCommand.php b/src/Command/CreateProductionCommand.php index 0549cab..32ffdd9 100644 --- a/src/Command/CreateProductionCommand.php +++ b/src/Command/CreateProductionCommand.php @@ -6,6 +6,7 @@ use App\Entity\SentenceListeningNote; use App\Entity\SentenceNote; use App\Entity\Term; use App\Service\AnkiService; +use App\Utils\Japanese; use App\Utils\Progress; use Symfony\Component\Console\Attribute\AsCommand; use Symfony\Component\Console\Command\Command; @@ -116,7 +117,7 @@ class CreateProductionCommand extends Command // --------- Getting list into array --- - $knownTerms = []; + $allTerms = []; $knownKanji = []; $termCounts = []; printf('Indexing all terms...'); @@ -124,15 +125,16 @@ class CreateProductionCommand extends Command foreach ($note->getTerms() as &$term) { assert($term instanceof Term); - if (key_exists($term->getKanji(), $knownTerms)) continue; + if (key_exists($term->getKanji(), $allTerms)) continue; $termCounts[$term->getKanji()] = 0; - $knownTerms[$term->getKanji()] = &$term; + $allTerms[$term->getKanji()] = &$term; foreach (self::extractKanji($term->getKanji()) as $kanji) { $knownKanji[$kanji] = 0; } } } - printf(" OK (%d)\n", count($knownTerms)); + printf(" OK (%d)\n", count($allTerms)); + $progress = new Progress('Getting frequenciees', count($allSentenceNotes)); foreach ($allSentenceNotes as $note) { @@ -150,67 +152,76 @@ class CreateProductionCommand extends Command } printf("\n"); - $seenKanji = []; - //uksort($knownTerms, function ($a, $b) { - 
// //return strlen(self::getOnlyKanji($b)) <=> strlen(self::getOnlyKanji($a)); // descending order - // return strlen($b) <=> strlen($a); // ascending order - //}); + $seenKanji = $this->ankiService->getKnownSlnKanjiCounts(); printf('Rating terms...'); - foreach ($knownTerms as $term) { - $termKanji = self::getOnlyKanji($term->getKanji()); - $weight = 1 / max(mb_strlen($termKanji), 1); + foreach ($allTerms as $key => $term) { + $count = Japanese::kanjiDiff( + $term->getKanji(), + array_keys($seenKanji), + ); - // First pass: Calculate the weight - foreach ($knownKanji as $kanji => $count) { - if (str_contains($termKanji, $kanji)) { - $termCounts[$term->getKanji()] += (int) ceil($count * $weight); - } + if ($count <= 0) { + unset($allTerms[$key]); + unset($termCounts[$key]); + continue; } - } - arsort($termCounts); + $termCounts[$term->getKanji()] = $count; + } + uksort($termCounts, function ($a, $b) { + $aLen = mb_strlen(Japanese::getOnlyKanji($a)); + $bLen = mb_strlen(Japanese::getOnlyKanji($b)); + return $bLen <=> $aLen; + }); + asort($termCounts); + printf(" OK\n"); // Have into account the ones that have already been created. // This will not only skip them but also update the general array for // already seen kanji. 
+ $seenKanji = []; + printf('Filtering out terms with no new kanji...'); + // First pass: Get the list of the kanji we've seen foreach ($allListeningNotes as $listeningNote) { assert($listeningNote instanceof SentenceListeningNote); $termKanji = self::getOnlyKanji($listeningNote->getTerm()->getKanji()); self::kanjiDiff($seenKanji, $termKanji); } - + // Second pass: Remove terms with no new kanji at all foreach ($termCounts as $term => $count) { $termKanji = self::getOnlyKanji($term); - // Second pass: Remove terms with no new kanji at all if (!self::kanjiDiff($seenKanji, $termKanji)) { unset($termCounts[$term]); - //unset($knownTerms[$term->getKanji()]); - //$termCounts[$term->getKanji()] = 0; + unset($allTerms[$term]); } } printf(" OK\n"); - arsort($termCounts); + asort($termCounts); printf("\n"); + $newNotesCount = intval($input->getArgument('count')); foreach ($termCounts as $term => $count) { if ($newNotesCount <= 0) break; + // FIXME: This shouldn't happen at all + if (!$allTerms[$term] instanceof Term) continue; + $termKanji = self::getOnlyKanji($term); printf("%s: %d\n", $term, $count); - $len = mb_strlen($termKanji); - for ($i = 0; $i < $len; $i++) { - $iKanji = mb_substr($termKanji, $i, 1); - printf(" - %s: %0.2f\n", $iKanji, $knownKanji[$iKanji] / $len); - } + //$len = mb_strlen($termKanji); + //for ($i = 0; $i < $len; $i++) { + // $iKanji = mb_substr($termKanji, $i, 1); + // printf(" - %s: %0.2f\n", $iKanji, $knownKanji[$iKanji] / $len); + //} - //$this->createProductionNoteFromTerm($knownTerms[$term]); + $this->createProductionNoteFromTerm($allTerms[$term]); $newNotesCount -= 1; }; diff --git a/src/Service/AnkiService.php b/src/Service/AnkiService.php index cd69c8a..e45011a 100644 --- a/src/Service/AnkiService.php +++ b/src/Service/AnkiService.php @@ -5,6 +5,7 @@ namespace App\Service; use App\Entity\Note; use App\Entity\SentenceListeningNote; use App\Entity\SentenceNote; +use App\Utils\Japanese; use Symfony\Contracts\HttpClient\HttpClientInterface; 
class AnkiService
@@ -131,4 +132,26 @@ class AnkiService
         $this->request('updateNoteFields', ['note' => $note->toAnki()]);
         $this->request('guiBrowse', ['query' => 'nid:' . $note->getId()]);
     }
+
+    /** @return array<string, int> */
+    public function getKnownSlnKanjiCounts(): array
+    {
+        $allListeningIds = $this->getAllSentenceListeningNoteIds();
+        $ret = [];
+
+        foreach ($this->getNotes($allListeningIds) as $slNote) {
+            assert($slNote instanceof SentenceListeningNote);
+
+            $termKanji = Japanese::getOnlyKanji($slNote->getTerm()->getKanji());
+            $len = mb_strlen($termKanji);
+            for ($i = 0; $i < $len; $i++) {
+                $kanji = mb_substr($termKanji, $i, 1);
+
+                $ret[$kanji] ??= 0;
+                $ret[$kanji]++;
+            }
+        }
+
+        return $ret;
+    }
 }
diff --git a/src/Utils/Japanese.php b/src/Utils/Japanese.php
new file mode 100644
index 0000000..f7a18e1
--- /dev/null
+++ b/src/Utils/Japanese.php
@@ -0,0 +1,38 @@
+ $kanjiSet
+     */
+    public static function kanjiDiff(string $str, array $kanjiSet): int
+    {
+        $ret = 0;
+
+        $strKanji = self::getOnlyKanji($str);
+        $len = mb_strlen($strKanji);
+        for ($i = 0; $i < $len; $i++) {
+            $kanji = mb_substr($strKanji, $i, 1);
+
+            // Membership test via in_array(): array_search() returns the matching
+            // key, so a hit at index 0 is falsy and was wrongly counted as a new
+            // kanji. Strict mode also avoids PHP's loose comparison rules.
+            if (!in_array($kanji, $kanjiSet, true)) {
+                $ret++;
+            }
+        }
+
+        return $ret;
+    }
+}