anker/src/Command/CreateProductionCommand.php

272 lines
9.4 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
namespace App\Command;
use App\Entity\SentenceListeningNote;
use App\Entity\SentenceNote;
use App\Entity\Term;
use App\Service\AnkiService;
use App\Utils\Japanese;
use App\Utils\Progress;
use Symfony\Component\Console\Attribute\AsCommand;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputArgument;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Output\OutputInterface;
/**
 * Console command that picks the next terms worth studying and creates a
 * SentenceListeningNote for each of them through the AnkiService.
 *
 * Candidate terms come from the existing SentenceNotes. They are ranked so
 * that terms introducing the fewest not-yet-studied kanji come first, and
 * terms that would introduce no new kanji at all are skipped.
 */
#[AsCommand('app:create:listening', 'Create new listening Anki Cards')]
class CreateProductionCommand extends Command
{
    public function __construct(
        private AnkiService $ankiService,
    ) {
        parent::__construct();
    }

    /**
     * Collects the distinct Han-script characters contained in a string.
     *
     * @return list<string> Unique kanji, in order of first appearance.
     *
     * @throws \RuntimeException When PCRE cannot process $str (e.g. invalid UTF-8).
     */
    private static function extractKanji(string $str): array
    {
        if (preg_match_all('/\p{Script=Han}/u', $str, $matches) === false) {
            throw new \RuntimeException('Could not extract kanji: ' . preg_last_error_msg());
        }

        // array_unique() keeps the original keys; re-index so the result
        // really is the list<string> the signature promises.
        return array_values(array_unique($matches[0] ?? []));
    }

    /**
     * Strips every character that is not Han script from a string.
     *
     * @throws \RuntimeException When PCRE cannot process $str (e.g. invalid UTF-8).
     */
    private static function getOnlyKanji(string $str): string
    {
        $onlyKanji = preg_replace('/[^\p{Script=Han}]/u', '', $str);
        if ($onlyKanji === null) {
            // preg_replace() signals failure with null, which would otherwise
            // surface as an opaque TypeError on the string return type.
            throw new \RuntimeException('Could not filter kanji: ' . preg_last_error_msg());
        }

        return $onlyKanji;
    }

    /**
     * Registers every character of $subject in the $ref set and reports
     * whether at least one of them was not in the set before.
     *
     * @param array<string, 0> $ref     Set of already seen characters (character => 0); modified in place.
     * @param string           $subject Characters to check, one set entry per character.
     *
     * @return bool True when $subject contained a character missing from $ref.
     */
    private static function kanjiDiff(array &$ref, string $subject): bool
    {
        $hasUnseenKanji = false;
        foreach (mb_str_split($subject) as $character) {
            // Key lookup instead of scanning every key of the set.
            if (array_key_exists($character, $ref)) {
                continue;
            }
            $ref[$character] = 0;
            $hasUnseenKanji = true;
        }

        return $hasUnseenKanji;
    }

    protected function configure(): void
    {
        $this->addArgument(
            'count',
            InputArgument::REQUIRED,
            'Amount of cards to make',
        );
    }

    /**
     * Creates a SentenceListeningNote for $term, based on an existing
     * SentenceNote that mentions the term.
     *
     * The SentKanji field is searched first; VocabKanji is only consulted
     * when that yields nothing. The last id returned by the search is used.
     *
     * @throws \RuntimeException When no SentenceNote mentions the term.
     * @throws \Exception        When the AnkiService refuses the new note.
     */
    protected function createProductionNoteFromTerm(Term $term): void
    {
        $noteIds = $this->ankiService->findNotesIds(sprintf(
            '"SentKanji:*%s*" "note:%s"',
            $term->getKanji(),
            SentenceNote::MODEL_NAME,
        ));
        if (count($noteIds) === 0) {
            $noteIds = $this->ankiService->findNotesIds(sprintf(
                '"VocabKanji:*%s*" "note:%s"',
                $term->getKanji(),
                SentenceNote::MODEL_NAME,
            ));
        }

        $lastKey = array_key_last($noteIds);
        if ($lastKey === null) {
            // Both searches came back empty: fail loudly instead of asking
            // the service for a note with an undefined id.
            throw new \RuntimeException(sprintf(
                'No %s found for term "%s"',
                SentenceNote::MODEL_NAME,
                $term->getKanji(),
            ));
        }

        $sNote = $this->ankiService->getNote($noteIds[$lastKey]);
        $newSlNote = SentenceListeningNote::fromNote($sNote, $term);
        // NOTE(review): 'production' is presumably the target deck - confirm
        // against AnkiService::addNote().
        if (!$this->ankiService->addNote($newSlNote, 'production')) {
            throw new \Exception('Failed to add note!');
        }
    }

    /**
     * Fetches every SentenceNote, reporting progress on standard output.
     *
     * @return list<SentenceNote>
     */
    private function getAllSentenceNotes(): array
    {
        printf('Getting all SentenceNote...');
        $allIds = $this->ankiService->getAllSentenceNoteIds();
        $allNotes = $this->ankiService->getNotes($allIds);
        printf(" OK (%d)\n", count($allNotes));

        return $allNotes;
    }

    /**
     * Fetches every SentenceListeningNote, reporting progress on standard output.
     *
     * @return list<SentenceListeningNote>
     */
    private function getAllSentenceListeningNotes(): array
    {
        printf('Getting all SentenceListeningNote...');
        $allListeningIds = $this->ankiService->getAllSentenceListeningNoteIds();
        $allListeningNotes = $this->ankiService->getNotes($allListeningIds);
        printf(" OK (%d)\n", count($allListeningNotes));

        return $allListeningNotes;
    }

    /**
     * Ranks all known terms and creates up to `count` new listening notes.
     *
     * @throws \InvalidArgumentException When `count` is not a non-negative integer.
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        // Validate before talking to Anki so a typo fails immediately.
        // 0 stays valid: it runs the whole analysis without creating notes.
        $newNotesCount = filter_var(
            $input->getArgument('count'),
            FILTER_VALIDATE_INT,
            ['options' => ['min_range' => 0]],
        );
        if ($newNotesCount === false) {
            throw new \InvalidArgumentException('Argument "count" must be a non-negative integer.');
        }

        $allSentenceNotes = $this->getAllSentenceNotes();
        $allListeningNotes = $this->getAllSentenceListeningNotes();

        // Index of all the Terms indexed by its TermKanji
        $allTerms = []; // ["パレートの法則" => App\Entity\Term]
        // Set of known Kanji Characters
        $knownKanji = []; // ["法" => 0, "則" => 0]
        // Number of not-yet-studied kanji per term (filled in while rating)
        $termCounts = []; // ["パレートの法則" => 1]

        printf('Indexing all terms...');
        foreach ($allSentenceNotes as $note) {
            // Terms are objects, so plain by-value iteration already shares
            // the instance; no references (and no dangling ones) needed.
            foreach ($note->getTerms() as $term) {
                $termKey = $term->getKanji();
                // Deduplicate list
                if (array_key_exists($termKey, $allTerms)) {
                    continue;
                }
                $allTerms[$termKey] = $term;
                // Just simple initialization
                $termCounts[$termKey] = 0;
                foreach (self::extractKanji($termKey) as $kanji) {
                    $knownKanji[$kanji] = 0;
                }
            }
        }
        printf(" OK (%d)\n", count($knownKanji));

        // Populate $knownKanji ["例" => 378, ...];
        // NOTE: these frequencies are currently only consumed by the
        //       commented-out debug output in the creation loop below.
        // TODO: Move this into own function to prevent side-effects. It's
        //       looping through the whole thing again anyway, so there's no
        //       need for it in here.
        //
        //       Maybe while you're at it, it could be simplified into a
        //       function like $this->anki->getKanji('origField', 'countField')
        //       Where count can be null so it's just a Set
        $knownKanjiList = array_keys($knownKanji);
        $progress = new Progress('Getting frequenciees', count($allSentenceNotes));
        foreach ($allSentenceNotes as $note) {
            $progress->tick();
            // Sanitize sentence (remove those pesky \u{200E})
            $sentence = str_replace(
                "\u{200E}",
                '',
                strip_tags($note->getFields()['SentKanji']),
            );
            // Index-based update instead of a by-reference loop variable: a
            // reference left behind here would be silently overwritten by
            // the later loops that reuse the same variable name.
            foreach ($knownKanjiList as $kanji) {
                if (str_contains($sentence, (string) $kanji)) {
                    $knownKanji[$kanji]++;
                }
            }
        }
        // TODO: Make progress a function with a callback? That way scope inside
        //       and side-effects are easy to control & track
        unset($progress);
        printf("\n");

        // Build the values to be used in the ordering process
        // TODO: It kinda feels wrong that $termCounts is used in this special
        //       manner while $seenScore is separate. Does it make sense to
        //       build them at this stage? Make a generic orderer?
        //
        //       $termOrdering = ['first' => 32, 'second' => 34, 'apple' => 2];
        printf('Rating terms...');
        $studiedKanji = $this->ankiService->getKnownSlnKanjiCounts('ASC');
        $studiedKanjiList = array_keys($studiedKanji);
        $seenScore = [];   // term => sum of the study counts of its kanji
        $kanjiTotals = []; // term => number of entries in its kanji list
        foreach ($allTerms as $key => $term) {
            $diff = Japanese::kanjiDiff($term->getKanji(), $studiedKanjiList);
            if (count($diff) === 0) {
                // Nothing new to learn from this term.
                unset($allTerms[$key], $termCounts[$key]);
                continue;
            }

            // Computed once here so the sort comparator does not have to
            // rebuild the list on every comparison.
            // NOTE(review): assumes Japanese::getKanjiList() is deterministic - confirm.
            $kanjiList = Japanese::getKanjiList((string) $key);
            $kanjiTotals[$key] = count($kanjiList);

            $seenScore[$key] = 0;
            foreach ($kanjiList as $kanji) {
                $seenScore[$key] += $studiedKanji[$kanji] ?? 0;
            }

            $termCounts[$key] = count($diff);
        }

        // Ordering in having:
        //  1. Least new Kanji (ideally just 1 new kanji)
        //  2. Most Kanji (most amount of unique kanji)
        //  3. Least studied kanji
        // One stable sort (PHP >= 8.0) yields the same order as sorting by
        // criterion 3, then 2, then 1 in separate passes.
        $newKanjiCounts = $termCounts;
        uksort(
            $termCounts,
            static fn ($a, $b): int =>
                [$newKanjiCounts[$a], $kanjiTotals[$b], $seenScore[$a]]
                <=> [$newKanjiCounts[$b], $kanjiTotals[$a], $seenScore[$b]],
        );
        printf(" OK\n");

        // Have into account the ones that have already been created.
        // This will not only skip them but also update the general array for
        // already seen kanji.
        $seenKanji = [];
        printf('Filtering out terms with no new kanji...');
        // First pass: Get the list of the kanji we've seen
        foreach ($allListeningNotes as $listeningNote) {
            self::kanjiDiff(
                $seenKanji,
                self::getOnlyKanji($listeningNote->getTerm()->getKanji()),
            );
        }
        // Second pass: Remove terms with no new kanji at all. Every term that
        // is kept also marks its kanji as seen for the terms ranked after it.
        foreach (array_keys($termCounts) as $termKey) {
            if (!self::kanjiDiff($seenKanji, self::getOnlyKanji((string) $termKey))) {
                unset($termCounts[$termKey], $allTerms[$termKey]);
            }
        }
        printf(" OK\n");
        // Removing entries keeps the remaining ones in sorted order, so no
        // second sort is needed here.

        printf("\n");
        foreach ($termCounts as $termKey => $newKanjiCount) {
            if ($newNotesCount <= 0) {
                break;
            }
            printf("%s %d | %d\n", "{$termKey}", $newKanjiCount, $seenScore[$termKey]);
            // Debug aid: per-kanji frequency of the chosen term.
            //$termKanji = self::getOnlyKanji((string) $termKey);
            //$len = mb_strlen($termKanji);
            //for ($i = 0; $i < $len; $i++) {
            //    $iKanji = mb_substr($termKanji, $i, 1);
            //    printf(" - %s: %0.2f\n", $iKanji, $knownKanji[$iKanji] / $len);
            //}
            $this->createProductionNoteFromTerm($allTerms[$termKey]);
            $newNotesCount--;
        }

        // "total" is the number of candidate terms left after filtering,
        // not the number of notes created in this run.
        printf(
            <<<FMNT
            total: %d cards
            max usage: %0.2f MiB
            current usage: %0.2f MiB\n
            FMNT,
            count($termCounts),
            memory_get_peak_usage() / 1024 / 1024,
            memory_get_usage() / 1024 / 1024,
        );

        return Command::SUCCESS;
    }
}