Compare commits

..

3 Commits

4 changed files with 154 additions and 66 deletions

View File

@ -6,6 +6,8 @@ use App\Entity\SentenceListeningNote;
use App\Entity\SentenceNote; use App\Entity\SentenceNote;
use App\Entity\Term; use App\Entity\Term;
use App\Service\AnkiService; use App\Service\AnkiService;
use App\Utils\Japanese;
use App\Utils\Progress;
use Symfony\Component\Console\Attribute\AsCommand; use Symfony\Component\Console\Attribute\AsCommand;
use Symfony\Component\Console\Command\Command; use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputArgument; use Symfony\Component\Console\Input\InputArgument;
@ -88,55 +90,55 @@ class CreateProductionCommand extends Command
} }
} }
protected function execute(InputInterface $input, OutputInterface $output): int /** @return list<SentenceNote> */
private function getAllSentenceNotes(): array
{ {
printf('Getting all SentenceCards...'); printf('Getting all SentenceNote...');
$allIds = $this->ankiService->getAllSentenceNoteIds(); $allIds = $this->ankiService->getAllSentenceNoteIds();
$allNotes = $this->ankiService->getNotes($allIds); $allNotes = $this->ankiService->getNotes($allIds);
printf(" OK (%d)\n", count($allNotes)); printf(" OK (%d)\n", count($allNotes));
return $allNotes;
}
printf('Getting all SentenceCards...'); /** @return list<SentenceListeningNote> */
private function getAllSentenceListeningNotes(): array
{
printf('Getting all SentenceListeningNote...');
$allListeningIds = $this->ankiService->getAllSentenceListeningNoteIds(); $allListeningIds = $this->ankiService->getAllSentenceListeningNoteIds();
$allListeningNotes = $this->ankiService->getNotes($allListeningIds); $allListeningNotes = $this->ankiService->getNotes($allListeningIds);
printf(" OK (%d)\n", count($allListeningNotes)); printf(" OK (%d)\n", count($allListeningNotes));
return $allListeningNotes;
}
printf('Indexing all terms...'); protected function execute(InputInterface $input, OutputInterface $output): int
$knownTerms = []; {
$allSentenceNotes = $this->getAllSentenceNotes();
$allListeningNotes = $this->getAllSentenceListeningNotes();
// --------- Getting list<SentenceNote> into array<TermKanji, Term> ---
$allTerms = [];
$knownKanji = []; $knownKanji = [];
$termCounts = []; $termCounts = [];
foreach ($allNotes as $note) { printf('Indexing all terms...');
if (!$note instanceof SentenceNote) throw new \Exception(sprintf( foreach ($allSentenceNotes as $note) {
'Expected SentenceNote, got %s',
$note::class,
));
foreach ($note->getTerms() as &$term) { foreach ($note->getTerms() as &$term) {
assert($term instanceof Term); assert($term instanceof Term);
if (key_exists($term->getKanji(), $knownTerms)) continue; if (key_exists($term->getKanji(), $allTerms)) continue;
$termCounts[$term->getKanji()] = 0; $termCounts[$term->getKanji()] = 0;
$knownTerms[$term->getKanji()] = &$term; $allTerms[$term->getKanji()] = &$term;
foreach (self::extractKanji($term->getKanji()) as $kanji) { foreach (self::extractKanji($term->getKanji()) as $kanji) {
$knownKanji[$kanji] = 0; $knownKanji[$kanji] = 0;
} }
} }
} }
printf(" OK (%d)\n", count($knownTerms)); printf(" OK (%d)\n", count($allTerms));
$total = count($knownTerms);
$i = 0;
foreach ($allNotes as $note) {
$i += 1;
if ($i % 12 === 0 or $i === $total) {
printf(
"\33[2K\r% 7d/% 7d | %.2f GiB | Getting frequencies",
$i,
$total,
memory_get_usage() / 1024 / 1024 / 1024
);
}
assert($note instanceof SentenceNote); $progress = new Progress('Getting frequenciees', count($allSentenceNotes));
foreach ($allSentenceNotes as $note) {
$progress->tick();
$sentKanji = str_replace( $sentKanji = str_replace(
"\u{200E}", "\u{200E}",
@ -144,83 +146,82 @@ class CreateProductionCommand extends Command
strip_tags($note->getFields()['SentKanji']) strip_tags($note->getFields()['SentKanji'])
); );
//foreach ($knownTerms as &$term) {
// assert($term instanceof Term);
// if (str_contains($sentKanji, $term->getKanji())) {
// $termCounts[$term->getKanji()] += 1;
// }
//}
foreach ($knownKanji as $kanji => &$count) { foreach ($knownKanji as $kanji => &$count) {
if (str_contains($sentKanji, $kanji)) { if (str_contains($sentKanji, $kanji)) $count++;
$count++;
}
} }
} }
printf("\n"); printf("\n");
$seenKanji = [];
//uksort($knownTerms, function ($a, $b) {
// //return strlen(self::getOnlyKanji($b)) <=> strlen(self::getOnlyKanji($a)); // descending order
// return strlen($b) <=> strlen($a); // ascending order
//});
$seenKanji = $this->ankiService->getKnownSlnKanjiCounts();
printf('Rating terms...'); printf('Rating terms...');
foreach ($knownTerms as $term) { foreach ($allTerms as $key => $term) {
$termKanji = self::getOnlyKanji($term->getKanji()); $count = Japanese::kanjiDiff(
$weight = 1 / max(mb_strlen($termKanji), 1); $term->getKanji(),
array_keys($seenKanji),
);
// First pass: Calculate the weight if ($count <= 0) {
foreach ($knownKanji as $kanji => $count) { unset($allTerms[$key]);
if (str_contains($termKanji, $kanji)) { unset($termCounts[$key]);
$termCounts[$term->getKanji()] += ceil($count * $weight); continue;
}
} }
}
arsort($termCounts); $termCounts[$term->getKanji()] = $count;
}
uksort($termCounts, function ($a, $b) {
$aLen = mb_strlen(Japanese::getOnlyKanji($a));
$bLen = mb_strlen(Japanese::getOnlyKanji($b));
return $bLen <=> $aLen;
});
asort($termCounts);
printf(" OK\n");
// Have into account the ones that have already been created. // Have into account the ones that have already been created.
// This will not only skip them but take into account the kanjis they // This will not only skip them but also update the general array for
// have. // already seen kanji.
$seenKanji = [];
printf('Filtering out terms with no new kanji...');
// First pass: Get the list of the kanji we've seen
foreach ($allListeningNotes as $listeningNote) { foreach ($allListeningNotes as $listeningNote) {
assert($listeningNote instanceof SentenceListeningNote); assert($listeningNote instanceof SentenceListeningNote);
$termKanji = self::getOnlyKanji($listeningNote->getTerm()->getKanji()); $termKanji = self::getOnlyKanji($listeningNote->getTerm()->getKanji());
self::kanjiDiff($seenKanji, $termKanji); self::kanjiDiff($seenKanji, $termKanji);
} }
// Second pass: Remove terms with no new kanji at all
foreach ($termCounts as $term => $count) { foreach ($termCounts as $term => $count) {
$termKanji = self::getOnlyKanji($term); $termKanji = self::getOnlyKanji($term);
// Second pass: Penalize terms with no new kanji at all
if (!self::kanjiDiff($seenKanji, $termKanji)) { if (!self::kanjiDiff($seenKanji, $termKanji)) {
unset($termCounts[$term]); unset($termCounts[$term]);
//unset($knownTerms[$term->getKanji()]); unset($allTerms[$term]);
//$termCounts[$term->getKanji()] = 0;
} }
} }
printf(" OK\n"); printf(" OK\n");
arsort($termCounts); asort($termCounts);
printf("\n"); printf("\n");
$newNotesCount = intval($input->getArgument('count')); $newNotesCount = intval($input->getArgument('count'));
foreach ($termCounts as $term => $count) { foreach ($termCounts as $term => $count) {
if ($newNotesCount <= 0) break; if ($newNotesCount <= 0) break;
// FIXME: This shouldn't happen at all
if (!$allTerms[$term] instanceof Term) continue;
$termKanji = self::getOnlyKanji($term); $termKanji = self::getOnlyKanji($term);
printf("%s: %d\n", $term, $count); printf("%s: %d\n", $term, $count);
$len = mb_strlen($termKanji); //$len = mb_strlen($termKanji);
for ($i = 0; $i < $len; $i++) { //for ($i = 0; $i < $len; $i++) {
$iKanji = mb_substr($termKanji, $i, 1); // $iKanji = mb_substr($termKanji, $i, 1);
printf(" - %s: %0.2f\n", $iKanji, $knownKanji[$iKanji] / $len); // printf(" - %s: %0.2f\n", $iKanji, $knownKanji[$iKanji] / $len);
} //}
$this->createProductionNoteFromTerm($knownTerms[$term]); $this->createProductionNoteFromTerm($allTerms[$term]);
$newNotesCount -= 1; $newNotesCount -= 1;
}; };

View File

@ -5,6 +5,7 @@ namespace App\Service;
use App\Entity\Note; use App\Entity\Note;
use App\Entity\SentenceListeningNote; use App\Entity\SentenceListeningNote;
use App\Entity\SentenceNote; use App\Entity\SentenceNote;
use App\Utils\Japanese;
use Symfony\Contracts\HttpClient\HttpClientInterface; use Symfony\Contracts\HttpClient\HttpClientInterface;
class AnkiService class AnkiService
@ -131,4 +132,26 @@ class AnkiService
$this->request('updateNoteFields', ['note' => $note->toAnki()]); $this->request('updateNoteFields', ['note' => $note->toAnki()]);
$this->request('guiBrowse', ['query' => 'nid:' . $note->getId()]); $this->request('guiBrowse', ['query' => 'nid:' . $note->getId()]);
} }
/**
 * Tally how many times each kanji occurs in the terms of the
 * SentenceListeningNote entries returned by the service.
 *
 * Non-kanji characters of each term are discarded before counting.
 *
 * @return array<string, int> occurrence count, keyed by the kanji itself
 */
public function getKnownSlnKanjiCounts(): array
{
    $counts = [];
    $listeningNotes = $this->getNotes($this->getAllSentenceListeningNoteIds());

    foreach ($listeningNotes as $listeningNote) {
        assert($listeningNote instanceof SentenceListeningNote);

        $onlyKanji = Japanese::getOnlyKanji(
            $listeningNote->getTerm()->getKanji()
        );

        // Walk the term one multibyte character at a time.
        foreach (mb_str_split($onlyKanji) as $character) {
            $counts[$character] = ($counts[$character] ?? 0) + 1;
        }
    }

    return $counts;
}
} }

38
src/Utils/Japanese.php Normal file
View File

@ -0,0 +1,38 @@
<?php
namespace App\Utils;
/**
 * Static helpers for inspecting the kanji contained in a string.
 */
class Japanese
{
    /**
     * Remove every character that does not belong to the Unicode Han script.
     *
     * @param string $str Text to filter; expected to be valid UTF-8 because
     *                    the pattern is applied with the `u` modifier.
     *
     * @return string The Han characters of $str, in their original order.
     *
     * @throws \InvalidArgumentException When PCRE cannot process $str
     *                                   (for instance malformed UTF-8).
     */
    public static function getOnlyKanji(string $str): string
    {
        $kanji = preg_replace('/[^\p{Script=Han}]/u', '', $str);

        if ($kanji === null) {
            // preg_replace() signals failure with null; surface the reason
            // instead of letting the null trip the string return type.
            throw new \InvalidArgumentException(sprintf(
                'Could not extract kanji: %s',
                preg_last_error_msg(),
            ));
        }

        return $kanji;
    }

    /**
     * Get the number of kanji of a string that are not in the given set
     * of kanji.
     *
     * Only kanji are considered, not katakana, hiragana or any other symbols.
     * A kanji that appears several times in $str is counted once per
     * occurrence.
     *
     * @param string       $str      Text whose kanji are checked.
     * @param list<string> $kanjiSet Kanji that count as already known.
     *
     * @return int Number of kanji occurrences in $str absent from $kanjiSet.
     *
     * @throws \InvalidArgumentException When $str cannot be processed by
     *                                   getOnlyKanji().
     */
    public static function kanjiDiff(string $str, array $kanjiSet): int
    {
        $strKanji = self::getOnlyKanji($str);
        if ($strKanji === '') {
            return 0;
        }

        $missing = 0;
        foreach (mb_str_split($strKanji) as $kanji) {
            // in_array() rather than a truthiness test on array_search():
            // the latter returns index 0 for the first element of the set,
            // which is falsy and would be mistaken for "not found".
            if (!in_array($kanji, $kanjiSet, true)) {
                $missing++;
            }
        }

        return $missing;
    }
}

26
src/Utils/Progress.php Normal file
View File

@ -0,0 +1,26 @@
<?php
namespace App\Utils;
/**
 * Minimal single-line progress indicator written to standard output.
 *
 * Each redraw erases the current terminal line and prints the tick counter,
 * the expected total, the current PHP memory usage and a label.
 */
class Progress
{
    /**
     * @param string $message Label printed after the counters.
     * @param int    $total   Tick count at which a redraw is always forced.
     * @param int    $speed   Redraw once every $speed ticks.
     * @param int    $i       Tick count to start from.
     *
     * @throws \InvalidArgumentException When $speed is 0, since tick() takes
     *                                   the counter modulo $speed.
     */
    public function __construct(
        private string $message,
        private int $total,
        private int $speed = 12,
        private int $i = 0,
    ) {
        if ($speed === 0) {
            throw new \InvalidArgumentException('Progress speed must not be 0.');
        }
    }

    /**
     * Advance the counter by one and redraw the line when due.
     *
     * A redraw happens every $speed ticks and when the counter reaches the
     * total, so the final state is always shown.
     */
    public function tick(): void
    {
        $this->i += 1;

        $isDue = $this->i % $this->speed === 0 || $this->i === $this->total;
        if (!$isDue) {
            return;
        }

        // "\33[2K" erases the whole line and "\r" returns to column 0, so
        // each redraw overwrites the previous one. The label is passed as a
        // %s argument rather than interpolated into the format, so a literal
        // "%" inside it is printed verbatim instead of being parsed as a
        // conversion specification.
        printf(
            "\33[2K\r% 7d/% 7d | %.2f GiB | %s",
            $this->i,
            $this->total,
            memory_get_usage() / 1024 / 1024 / 1024,
            $this->message,
        );
    }
}