feat: Implement a better algo for production card creation term valuing

This commit is contained in:
Dendy 2025-05-01 20:56:06 +02:00
parent e2c178e998
commit da9d4d344a
3 changed files with 102 additions and 30 deletions

View File

@ -6,6 +6,7 @@ use App\Entity\SentenceListeningNote;
use App\Entity\SentenceNote;
use App\Entity\Term;
use App\Service\AnkiService;
use App\Utils\Japanese;
use App\Utils\Progress;
use Symfony\Component\Console\Attribute\AsCommand;
use Symfony\Component\Console\Command\Command;
@ -116,7 +117,7 @@ class CreateProductionCommand extends Command
// --------- Getting list<SentenceNote> into array<TermKanji, Term> ---
$knownTerms = [];
$allTerms = [];
$knownKanji = [];
$termCounts = [];
printf('Indexing all terms...');
@ -124,15 +125,16 @@ class CreateProductionCommand extends Command
foreach ($note->getTerms() as &$term) {
assert($term instanceof Term);
if (key_exists($term->getKanji(), $knownTerms)) continue;
if (key_exists($term->getKanji(), $allTerms)) continue;
$termCounts[$term->getKanji()] = 0;
$knownTerms[$term->getKanji()] = &$term;
$allTerms[$term->getKanji()] = &$term;
foreach (self::extractKanji($term->getKanji()) as $kanji) {
$knownKanji[$kanji] = 0;
}
}
}
printf(" OK (%d)\n", count($knownTerms));
printf(" OK (%d)\n", count($allTerms));
$progress = new Progress('Getting frequenciees', count($allSentenceNotes));
foreach ($allSentenceNotes as $note) {
@ -150,67 +152,76 @@ class CreateProductionCommand extends Command
}
printf("\n");
$seenKanji = [];
//uksort($knownTerms, function ($a, $b) {
// //return strlen(self::getOnlyKanji($b)) <=> strlen(self::getOnlyKanji($a)); // descending order
// return strlen($b) <=> strlen($a); // ascending order
//});
$seenKanji = $this->ankiService->getKnownSlnKanjiCounts();
printf('Rating terms...');
foreach ($knownTerms as $term) {
$termKanji = self::getOnlyKanji($term->getKanji());
$weight = 1 / max(mb_strlen($termKanji), 1);
foreach ($allTerms as $key => $term) {
$count = Japanese::kanjiDiff(
$term->getKanji(),
array_keys($seenKanji),
);
// First pass: Calculate the weight
foreach ($knownKanji as $kanji => $count) {
if (str_contains($termKanji, $kanji)) {
$termCounts[$term->getKanji()] += (int) ceil($count * $weight);
}
}
if ($count <= 0) {
unset($allTerms[$key]);
unset($termCounts[$key]);
continue;
}
arsort($termCounts);
$termCounts[$term->getKanji()] = $count;
}
uksort($termCounts, function ($a, $b) {
$aLen = mb_strlen(Japanese::getOnlyKanji($a));
$bLen = mb_strlen(Japanese::getOnlyKanji($b));
return $bLen <=> $aLen;
});
asort($termCounts);
printf(" OK\n");
// Take into account the ones that have already been created.
// This will not only skip them but also update the general array for
// already seen kanji.
$seenKanji = [];
printf('Filtering out terms with no new kanji...');
// First pass: Get the list of the kanji we've seen
foreach ($allListeningNotes as $listeningNote) {
assert($listeningNote instanceof SentenceListeningNote);
$termKanji = self::getOnlyKanji($listeningNote->getTerm()->getKanji());
self::kanjiDiff($seenKanji, $termKanji);
}
// Second pass: Remove terms with no new kanji at all
foreach ($termCounts as $term => $count) {
$termKanji = self::getOnlyKanji($term);
// Second pass: Remove terms with no new kanji at all
if (!self::kanjiDiff($seenKanji, $termKanji)) {
unset($termCounts[$term]);
//unset($knownTerms[$term->getKanji()]);
//$termCounts[$term->getKanji()] = 0;
unset($allTerms[$term]);
}
}
printf(" OK\n");
arsort($termCounts);
asort($termCounts);
printf("\n");
$newNotesCount = intval($input->getArgument('count'));
foreach ($termCounts as $term => $count) {
if ($newNotesCount <= 0) break;
// FIXME: This shouldn't happen at all
if (!$allTerms[$term] instanceof Term) continue;
$termKanji = self::getOnlyKanji($term);
printf("%s: %d\n", $term, $count);
$len = mb_strlen($termKanji);
for ($i = 0; $i < $len; $i++) {
$iKanji = mb_substr($termKanji, $i, 1);
printf(" - %s: %0.2f\n", $iKanji, $knownKanji[$iKanji] / $len);
}
//$len = mb_strlen($termKanji);
//for ($i = 0; $i < $len; $i++) {
// $iKanji = mb_substr($termKanji, $i, 1);
// printf(" - %s: %0.2f\n", $iKanji, $knownKanji[$iKanji] / $len);
//}
//$this->createProductionNoteFromTerm($knownTerms[$term]);
$this->createProductionNoteFromTerm($allTerms[$term]);
$newNotesCount -= 1;
};

View File

@ -5,6 +5,7 @@ namespace App\Service;
use App\Entity\Note;
use App\Entity\SentenceListeningNote;
use App\Entity\SentenceNote;
use App\Utils\Japanese;
use Symfony\Contracts\HttpClient\HttpClientInterface;
class AnkiService
@ -131,4 +132,26 @@ class AnkiService
$this->request('updateNoteFields', ['note' => $note->toAnki()]);
$this->request('guiBrowse', ['query' => 'nid:' . $note->getId()]);
}
/**
 * Tally the kanji found in the terms of the notes fetched for the IDs
 * returned by getAllSentenceListeningNoteIds().
 *
 * Every occurrence is counted, so a kanji repeated inside a single term
 * contributes more than once.
 *
 * @return array<string, int> kanji character => number of occurrences
 */
public function getKnownSlnKanjiCounts(): array
{
    $noteIds = $this->getAllSentenceListeningNoteIds();
    $counts = [];

    foreach ($this->getNotes($noteIds) as $listeningNote) {
        assert($listeningNote instanceof SentenceListeningNote);

        // Drop kana and any other non-kanji symbols before counting.
        $kanjiOnly = Japanese::getOnlyKanji($listeningNote->getTerm()->getKanji());

        foreach (mb_str_split($kanjiOnly) as $character) {
            $counts[$character] = ($counts[$character] ?? 0) + 1;
        }
    }

    return $counts;
}
}

38
src/Utils/Japanese.php Normal file
View File

@ -0,0 +1,38 @@
<?php
namespace App\Utils;
/**
 * Small helpers for working with kanji inside Japanese strings.
 */
class Japanese
{
    /**
     * Remove every character that does not belong to the Unicode Han script.
     *
     * Kana, Latin letters, digits, punctuation and whitespace are all dropped;
     * the remaining kanji keep their original order.
     */
    public static function getOnlyKanji(string $str): string
    {
        return preg_replace('/[^\p{Script=Han}]/u', '', $str);
    }

    /**
     * Get the number of kanji of a string that are not in the given set
     * of kanji.
     *
     * Only kanji are considered, not katakana, hiragana or any other symbols.
     * Each occurrence is counted, so a missing kanji that appears twice in
     * $str adds two to the result.
     *
     * @param list<string> $kanjiSet kanji that are treated as already known
     *
     * @return int number of kanji occurrences in $str absent from $kanjiSet
     */
    public static function kanjiDiff(string $str, array $kanjiSet): int
    {
        $ret = 0;

        foreach (mb_str_split(self::getOnlyKanji($str)) as $kanji) {
            // in_array() rather than !array_search(): array_search() returns
            // the matching key, and key 0 is falsy, which made the first
            // kanji of the set always look like it was missing.
            if (!in_array($kanji, $kanjiSet, true)) {
                $ret++;
            }
        }

        return $ret;
    }
}