feat: Implement a better algo for production card creation term valuing

This commit is contained in:
Dendy 2025-05-01 20:56:06 +02:00
parent e2c178e998
commit da9d4d344a
3 changed files with 102 additions and 30 deletions

View File

@ -6,6 +6,7 @@ use App\Entity\SentenceListeningNote;
use App\Entity\SentenceNote; use App\Entity\SentenceNote;
use App\Entity\Term; use App\Entity\Term;
use App\Service\AnkiService; use App\Service\AnkiService;
use App\Utils\Japanese;
use App\Utils\Progress; use App\Utils\Progress;
use Symfony\Component\Console\Attribute\AsCommand; use Symfony\Component\Console\Attribute\AsCommand;
use Symfony\Component\Console\Command\Command; use Symfony\Component\Console\Command\Command;
@ -116,7 +117,7 @@ class CreateProductionCommand extends Command
// --------- Getting list<SentenceNote> into array<TermKanji, Term> --- // --------- Getting list<SentenceNote> into array<TermKanji, Term> ---
$knownTerms = []; $allTerms = [];
$knownKanji = []; $knownKanji = [];
$termCounts = []; $termCounts = [];
printf('Indexing all terms...'); printf('Indexing all terms...');
@ -124,15 +125,16 @@ class CreateProductionCommand extends Command
foreach ($note->getTerms() as &$term) { foreach ($note->getTerms() as &$term) {
assert($term instanceof Term); assert($term instanceof Term);
if (key_exists($term->getKanji(), $knownTerms)) continue; if (key_exists($term->getKanji(), $allTerms)) continue;
$termCounts[$term->getKanji()] = 0; $termCounts[$term->getKanji()] = 0;
$knownTerms[$term->getKanji()] = &$term; $allTerms[$term->getKanji()] = &$term;
foreach (self::extractKanji($term->getKanji()) as $kanji) { foreach (self::extractKanji($term->getKanji()) as $kanji) {
$knownKanji[$kanji] = 0; $knownKanji[$kanji] = 0;
} }
} }
} }
printf(" OK (%d)\n", count($knownTerms)); printf(" OK (%d)\n", count($allTerms));
$progress = new Progress('Getting frequenciees', count($allSentenceNotes)); $progress = new Progress('Getting frequenciees', count($allSentenceNotes));
foreach ($allSentenceNotes as $note) { foreach ($allSentenceNotes as $note) {
@ -150,67 +152,76 @@ class CreateProductionCommand extends Command
} }
printf("\n"); printf("\n");
$seenKanji = [];
//uksort($knownTerms, function ($a, $b) {
// //return strlen(self::getOnlyKanji($b)) <=> strlen(self::getOnlyKanji($a)); // descending order
// return strlen($b) <=> strlen($a); // ascending order
//});
$seenKanji = $this->ankiService->getKnownSlnKanjiCounts();
printf('Rating terms...'); printf('Rating terms...');
foreach ($knownTerms as $term) { foreach ($allTerms as $key => $term) {
$termKanji = self::getOnlyKanji($term->getKanji()); $count = Japanese::kanjiDiff(
$weight = 1 / max(mb_strlen($termKanji), 1); $term->getKanji(),
array_keys($seenKanji),
);
// First pass: Calculate the weight if ($count <= 0) {
foreach ($knownKanji as $kanji => $count) { unset($allTerms[$key]);
if (str_contains($termKanji, $kanji)) { unset($termCounts[$key]);
$termCounts[$term->getKanji()] += (int) ceil($count * $weight); continue;
}
}
} }
arsort($termCounts); $termCounts[$term->getKanji()] = $count;
}
uksort($termCounts, function ($a, $b) {
$aLen = mb_strlen(Japanese::getOnlyKanji($a));
$bLen = mb_strlen(Japanese::getOnlyKanji($b));
return $bLen <=> $aLen;
});
asort($termCounts);
printf(" OK\n");
// Take into account the ones that have already been created. // Take into account the ones that have already been created.
// This will not only skip them but also update the general array for // This will not only skip them but also update the general array for
// already seen kanji. // already seen kanji.
$seenKanji = [];
printf('Filtering out terms with no new kanji...');
// First pass: Get the list of the kanji we've seen
foreach ($allListeningNotes as $listeningNote) { foreach ($allListeningNotes as $listeningNote) {
assert($listeningNote instanceof SentenceListeningNote); assert($listeningNote instanceof SentenceListeningNote);
$termKanji = self::getOnlyKanji($listeningNote->getTerm()->getKanji()); $termKanji = self::getOnlyKanji($listeningNote->getTerm()->getKanji());
self::kanjiDiff($seenKanji, $termKanji); self::kanjiDiff($seenKanji, $termKanji);
} }
// Second pass: Remove terms with no new kanji at all
foreach ($termCounts as $term => $count) { foreach ($termCounts as $term => $count) {
$termKanji = self::getOnlyKanji($term); $termKanji = self::getOnlyKanji($term);
// Second pass: Remove terms with no new kanji at all
if (!self::kanjiDiff($seenKanji, $termKanji)) { if (!self::kanjiDiff($seenKanji, $termKanji)) {
unset($termCounts[$term]); unset($termCounts[$term]);
//unset($knownTerms[$term->getKanji()]); unset($allTerms[$term]);
//$termCounts[$term->getKanji()] = 0;
} }
} }
printf(" OK\n"); printf(" OK\n");
arsort($termCounts); asort($termCounts);
printf("\n"); printf("\n");
$newNotesCount = intval($input->getArgument('count')); $newNotesCount = intval($input->getArgument('count'));
foreach ($termCounts as $term => $count) { foreach ($termCounts as $term => $count) {
if ($newNotesCount <= 0) break; if ($newNotesCount <= 0) break;
// FIXME: This shouldn't happen at all
if (!$allTerms[$term] instanceof Term) continue;
$termKanji = self::getOnlyKanji($term); $termKanji = self::getOnlyKanji($term);
printf("%s: %d\n", $term, $count); printf("%s: %d\n", $term, $count);
$len = mb_strlen($termKanji); //$len = mb_strlen($termKanji);
for ($i = 0; $i < $len; $i++) { //for ($i = 0; $i < $len; $i++) {
$iKanji = mb_substr($termKanji, $i, 1); // $iKanji = mb_substr($termKanji, $i, 1);
printf(" - %s: %0.2f\n", $iKanji, $knownKanji[$iKanji] / $len); // printf(" - %s: %0.2f\n", $iKanji, $knownKanji[$iKanji] / $len);
} //}
//$this->createProductionNoteFromTerm($knownTerms[$term]); $this->createProductionNoteFromTerm($allTerms[$term]);
$newNotesCount -= 1; $newNotesCount -= 1;
}; };

View File

@ -5,6 +5,7 @@ namespace App\Service;
use App\Entity\Note; use App\Entity\Note;
use App\Entity\SentenceListeningNote; use App\Entity\SentenceListeningNote;
use App\Entity\SentenceNote; use App\Entity\SentenceNote;
use App\Utils\Japanese;
use Symfony\Contracts\HttpClient\HttpClientInterface; use Symfony\Contracts\HttpClient\HttpClientInterface;
class AnkiService class AnkiService
@ -131,4 +132,26 @@ class AnkiService
$this->request('updateNoteFields', ['note' => $note->toAnki()]); $this->request('updateNoteFields', ['note' => $note->toAnki()]);
$this->request('guiBrowse', ['query' => 'nid:' . $note->getId()]); $this->request('guiBrowse', ['query' => 'nid:' . $note->getId()]);
} }
/**
 * Tally how often each kanji occurs across the terms of all sentence
 * listening notes.
 *
 * Every note's term is reduced to its kanji only, and each kanji
 * occurrence (repeats included) adds one to that kanji's counter.
 *
 * @return array<string, int> occurrence count keyed by single kanji
 */
public function getKnownSlnKanjiCounts(): array
{
    $listeningNoteIds = $this->getAllSentenceListeningNoteIds();
    $counts = [];

    foreach ($this->getNotes($listeningNoteIds) as $listeningNote) {
        assert($listeningNote instanceof SentenceListeningNote);

        $kanjiOnly = Japanese::getOnlyKanji($listeningNote->getTerm()->getKanji());

        // Walk the string one multibyte character at a time.
        foreach (mb_str_split($kanjiOnly) as $character) {
            $counts[$character] = ($counts[$character] ?? 0) + 1;
        }
    }

    return $counts;
}
} }

38
src/Utils/Japanese.php Normal file
View File

@ -0,0 +1,38 @@
<?php
namespace App\Utils;
/**
 * Static helpers for inspecting the kanji contained in Japanese text.
 */
class Japanese
{
    /**
     * Return the given string with every non-Han character removed.
     *
     * Hiragana, katakana, Latin letters, digits and punctuation are all
     * dropped; the remaining kanji keep their original order.
     *
     * NOTE(review): preg_replace() yields null on failure (e.g. malformed
     * UTF-8), which would violate the declared string return type —
     * confirm callers only pass valid UTF-8.
     */
    public static function getOnlyKanji(string $str): string
    {
        return preg_replace('/[^\p{Script=Han}]/u', '', $str);
    }

    /**
     * Get the number of kanji of a string that are not in the given set
     * of kanji.
     *
     * Only kanji are considered, not katakana, hiragana or any other
     * symbols. A kanji that appears several times in the string is
     * counted once per occurrence.
     *
     * @param list<string> $kanjiSet single-kanji strings already known
     *
     * @return int number of kanji occurrences in $str absent from $kanjiSet
     */
    public static function kanjiDiff(string $str, array $kanjiSet): int
    {
        $missing = 0;
        $strKanji = self::getOnlyKanji($str);
        $len = mb_strlen($strKanji);

        for ($i = 0; $i < $len; $i++) {
            $kanji = mb_substr($strKanji, $i, 1);

            // Use a strict membership test. array_search() returns the
            // matching key, and key 0 is falsy, so negating its result
            // wrongly reported the first kanji of the set as missing.
            if (!in_array($kanji, $kanjiSet, true)) {
                $missing++;
            }
        }

        return $missing;
    }
}