diff --git a/src/Command/CreateProductionCommand.php b/src/Command/CreateProductionCommand.php index 32ffdd9..7a60624 100644 --- a/src/Command/CreateProductionCommand.php +++ b/src/Command/CreateProductionCommand.php @@ -8,18 +8,14 @@ use App\Entity\Term; use App\Service\AnkiService; use App\Utils\Japanese; use App\Utils\Progress; + use Symfony\Component\Console\Attribute\AsCommand; use Symfony\Component\Console\Command\Command; use Symfony\Component\Console\Input\InputArgument; use Symfony\Component\Console\Input\InputInterface; -use Symfony\Component\Console\Input\InputOption; use Symfony\Component\Console\Output\OutputInterface; -use Symfony\Component\Console\Style\SymfonyStyle; -#[AsCommand( - name: 'app:create-production', - description: 'Add a short description for your command', -)] +#[AsCommand('app:create:listening', 'Create new listening Anki Cards')] class CreateProductionCommand extends Command { public function __construct( @@ -28,6 +24,7 @@ class CreateProductionCommand extends Command parent::__construct(); } + /** @return list */ private static function extractKanji(string $str): array { preg_match_all('/\p{Script=Han}/u', $str, $matches); @@ -39,6 +36,7 @@ class CreateProductionCommand extends Command return preg_replace('/[^\p{Script=Han}]/u', '', $str); } + /** @param array $ref */ private static function kanjiDiff(array &$ref, string $subject): bool { $len = mb_strlen($subject); @@ -47,7 +45,7 @@ class CreateProductionCommand extends Command for ($i = 0; $i < $len; $i++) { $subKanji = mb_substr($subject, $i, 1); - foreach ($ref as $refKanji => $value) { + foreach (array_keys($ref) as $refKanji) { if ($subKanji === $refKanji) continue 2; } @@ -60,10 +58,11 @@ class CreateProductionCommand extends Command protected function configure(): void { - $this - ->addArgument('count', InputArgument::OPTIONAL, 'Amount of cards to make', 1); - //->addOption('option1', null, InputOption::VALUE_NONE, 'Option description') - ; + $this->addArgument( + 'count', + 
InputArgument::REQUIRED, + 'Amount of cards to make', + ); } protected function createProductionNoteFromTerm(Term $term): void @@ -112,71 +111,105 @@ class CreateProductionCommand extends Command protected function execute(InputInterface $input, OutputInterface $output): int { - $allSentenceNotes = $this->getAllSentenceNotes(); + $allSentenceNotes = $this->getAllSentenceNotes(); $allListeningNotes = $this->getAllSentenceListeningNotes(); - // --------- Getting list into array --- + // Index of all the Terms indexed by its TermKanji + $allTerms = []; // ["パレートの法則" => App\Entity\Term] + // Set of known Kanji Characters + $knownKanji = []; // ["法" => 0, "則" => 0] + // How many times it appears (not as a term, but in KanjiSentence) + $termCounts = []; // ["パレートの法則" => 1] - $allTerms = []; - $knownKanji = []; - $termCounts = []; printf('Indexing all terms...'); foreach ($allSentenceNotes as $note) { foreach ($note->getTerms() as &$term) { - assert($term instanceof Term); - + // Deduplicate list if (key_exists($term->getKanji(), $allTerms)) continue; - $termCounts[$term->getKanji()] = 0; + // Actual indexing $allTerms[$term->getKanji()] = &$term; + // Just simple intialization + $termCounts[$term->getKanji()] = 0; foreach (self::extractKanji($term->getKanji()) as $kanji) { $knownKanji[$kanji] = 0; } } } - printf(" OK (%d)\n", count($allTerms)); + printf(" OK (%d)\n", count($knownKanji)); + // Populate $knownKanji ["例" => 378, ...]; + // TODO: Move this into own function to prevent side-effects. It's + // looping through the whole thing again anyway, so there's no + // need for it in here. 
+ // + // Maybe while you're at it, it could be simplified into a + // function like $this->anki->getKanji('origField', 'countField') + // Where count can be null so it's just a Set $progress = new Progress('Getting frequenciees', count($allSentenceNotes)); foreach ($allSentenceNotes as $note) { $progress->tick(); - $sentKanji = str_replace( + // Sanitize sentence (remove those pesky \u{200E}) + $_sentKanji = str_replace( "\u{200E}", '', strip_tags($note->getFields()['SentKanji']) ); foreach ($knownKanji as $kanji => &$count) { - if (str_contains($sentKanji, $kanji)) $count++; + if (str_contains($_sentKanji, $kanji)) $count++; } } + // TODO: Make progress a function with a callback? That way scope inside + // and side-effects are easy to control & track + unset($progress); printf("\n"); - $seenKanji = $this->ankiService->getKnownSlnKanjiCounts(); + // Build the values to be used in the ordering process + // TODO: It kinda feels wrong that $termCounts is used in this special + // manner while $seenScore is separate. Does it make sense to + // build them at this stage? Make a generic orderer? + // + // $termOrdering = ['first' => 32, 'second' => 34, 'apple' => 2]; + // + // At first we just built the term list, then we generate an + // ordering array where the list is ordered printf('Rating terms...'); + $studiedKanji = $this->ankiService->getKnownSlnKanjiCounts('ASC'); + $seenScore = []; foreach ($allTerms as $key => $term) { - $count = Japanese::kanjiDiff( - $term->getKanji(), - array_keys($seenKanji), - ); + $diff = Japanese::kanjiDiff($term->getKanji(), array_keys($studiedKanji)); - if ($count <= 0) { + if (count($diff) <= 0) { unset($allTerms[$key]); unset($termCounts[$key]); continue; } - $termCounts[$term->getKanji()] = $count; + // Build $seenScore + $seenScore[$key] = 0; + foreach (Japanese::getKanjiList($key) as $_kanji) { + $seenScore[$key] += $studiedKanji[$_kanji] ?? 
0; + } + // Build $termCounts + $termCounts[$term->getKanji()] = count($diff); } + // Ordering in having: + // 1. Least new Kanji (ideally we just one 1 new kanji) + // 2. Most Kanji (most amount of unique kanji) + // 3. Least studied kanji + uksort($termCounts, fn($a, $b) => $seenScore[$a] <=> $seenScore[$b]); uksort($termCounts, function ($a, $b) { - $aLen = mb_strlen(Japanese::getOnlyKanji($a)); - $bLen = mb_strlen(Japanese::getOnlyKanji($b)); + $aLen = count(Japanese::getKanjiList($a)); + $bLen = count(Japanese::getKanjiList($b)); return $bLen <=> $aLen; }); - asort($termCounts); + asort($termCounts, SORT_NUMERIC); printf(" OK\n"); + // Have into account the ones that have already been created. // This will not only skip them but also update the general array for // already seen kanji. @@ -184,8 +217,6 @@ class CreateProductionCommand extends Command printf('Filtering out terms with no new kanji...'); // First pass: Get the list of the kanji we've seen foreach ($allListeningNotes as $listeningNote) { - assert($listeningNote instanceof SentenceListeningNote); - $termKanji = self::getOnlyKanji($listeningNote->getTerm()->getKanji()); self::kanjiDiff($seenKanji, $termKanji); } @@ -200,7 +231,7 @@ class CreateProductionCommand extends Command } printf(" OK\n"); - asort($termCounts); + asort($termCounts, SORT_NUMERIC); printf("\n"); @@ -209,11 +240,8 @@ class CreateProductionCommand extends Command foreach ($termCounts as $term => $count) { if ($newNotesCount <= 0) break; - // FIXME: This shouldn't happen at all - if (!$allTerms[$term] instanceof Term) continue; - $termKanji = self::getOnlyKanji($term); - printf("%s: %d\n", $term, $count); + printf("%s %d | %d\n", "{$term}:", $count, $seenScore[$term]); //$len = mb_strlen($termKanji); //for ($i = 0; $i < $len; $i++) { diff --git a/src/Controller/AnkiController.php b/src/Controller/AnkiController.php index 6b2f521..4e4383e 100644 --- a/src/Controller/AnkiController.php +++ b/src/Controller/AnkiController.php @@ -73,6 
+73,14 @@ class AnkiController extends AbstractController ]); } + #[Route('/kanji', 'kanji', methods: 'GET')] + public function kanji() + { + $thing = $this->ankiService->getKnownSnKanjiCounts(); + arsort($thing, SORT_NUMERIC); // highest count first; asort() has no SORT_DESC mode + return new Response(implode('', array_keys($thing))); + } + #[Route('/note/{nid}/get', name: 'get_note')] public function get_note(int $nid) { diff --git a/src/Entity/SentenceNote.php b/src/Entity/SentenceNote.php index 0c04d4b..38c2bc8 100644 --- a/src/Entity/SentenceNote.php +++ b/src/Entity/SentenceNote.php @@ -11,10 +11,12 @@ class SentenceNote extends Note // -------------------------------------------------- Getters & setters --- + /** @return list */ public function getTerms(): array { return $this->terms; } + /** @param list $terms */ public function setTerms(array $terms): static { $this->terms = $terms; diff --git a/src/Entity/Term.php b/src/Entity/Term.php index 486c30b..0048493 100644 --- a/src/Entity/Term.php +++ b/src/Entity/Term.php @@ -18,15 +18,21 @@ class Term return self::parseFurigana($this->kanji)['kanji']; } + /** + * Get the kanji version & the reading for a given term + * + * TODO: Make this smarter & handle mixing of kanji & hiragana + * + * @return array{'kanji': string, 'reading': null|string} + * */ public static function parseFurigana(string $furigana): array { - // 0: all, 1: (kanji/hiragana), 2: ([reading]): 3: (reading) + // 0: all, 1: (kanji/hiragana), 2: ([reading, ...]), 3: (reading) preg_match_all('/([^ \[]+)(\[([^\]]*)\])? ?/', $furigana, $matches, PREG_SET_ORDER); $matchedKanji = array_map(fn($x) => $x[1], $matches); $matchedReading = array_map(fn($x) => $x[3] ?? $x[1], $matches); - return [ 'kanji' => join('', $matchedKanji), 'reading' => $matchedKanji == $matchedReading @@ -35,7 +41,7 @@ class Term ]; } - public function toAnkiVocabDef() + public function toAnkiVocabDef(): string { $ret = '' . 
$this->kanji; @@ -79,10 +85,7 @@ class Term '」' => ']', ' ' => ' ', ])); - $def = mb_trim($def); - if (!is_string($term->kanji)) { - return null; - } + $def = mb_trim($def); // -------------------------------------------------- No definition --- @@ -139,6 +142,10 @@ class Term return Term::fromVocabDefLine($kanji . $separator . $def); } + /** + * @param array $fields + * @return list + */ public static function fromNoteFields(array $fields): array { // -------------------- Trying to extract it with the modern syntax --- diff --git a/src/Service/AnkiService.php b/src/Service/AnkiService.php index e45011a..181e49d 100644 --- a/src/Service/AnkiService.php +++ b/src/Service/AnkiService.php @@ -134,7 +134,7 @@ class AnkiService } /** @return array */ - public function getKnownSlnKanjiCounts(): array + public function getKnownSlnKanjiCounts(?string $order = null): array { $allListeningIds = $this->getAllSentenceListeningNoteIds(); $ret = []; @@ -152,6 +152,34 @@ class AnkiService } } + uasort($ret, function (int $a, int $b) use ($order) { + return $order === 'ASC' ? 
$a <=> $b : $b <=> $a; + }); + + return $ret; + } + + /** @return array */ + public function getKnownSnKanjiCounts(): array + { + $allListeningIds = $this->getAllSentenceNoteIds(); + $ret = []; + + foreach ($this->getNotes($allListeningIds) as $sNote) { + assert($sNote instanceof SentenceNote); + + foreach ($sNote->getTerms() as $term) { + $termKanji = Japanese::getOnlyKanji($term->getKanji()); + $len = mb_strlen($termKanji); + for ($i = 0; $i < $len; $i++) { + $kanji = mb_substr($termKanji, $i, 1); + + $ret[$kanji] ??= 0; + $ret[$kanji]++; + } + } + } + return $ret; } } diff --git a/src/Utils/Japanese.php b/src/Utils/Japanese.php index f7a18e1..4d22a57 100644 --- a/src/Utils/Japanese.php +++ b/src/Utils/Japanese.php @@ -6,33 +6,48 @@ class Japanese { public static function getOnlyKanji(string $str): string { - return preg_replace('/[^\p{Script=Han}]/u', '', $str); + return preg_replace('/[^\p{Script=Han}]/u', '', $str) ?? ''; } - /** Get the number of kanji of a string that are not in the given set - * of kanji - * - * Only kanji are considere, not katakana, hiragana or any other symbols. 
- * - * @param list $kanjiSet - */ - public static function kanjiDiff(string $str, array $kanjiSet): int + /** @return \Generator */ + public static function mbIterate( + string $str, + int $start = 0, + int $length = 1, + ?string $encoding = null, + ): \Generator { + while (($char = mb_substr($str, $start++, $length, $encoding)) !== '') { + yield $char; + } + } + + /** @return list */ + public static function getKanjiList(string $str): array { - $ret = 0; + $ret = []; - $strKanji = self::getOnlyKanji($str); - $len = mb_strlen($strKanji); - for ($i = 0; $i < $len; $i++) { - $kanji = mb_substr($strKanji, $i, 1); - - if (!array_search($kanji, $kanjiSet)) { - $ret++; - } + foreach (self::mbIterate(self::getOnlyKanji($str)) as $kanji) { + $ret[$kanji] = 0; } - //dump($str, $strKanji, $ret); - //echo "\n"; + return array_keys($ret); + } - return $ret; + /** + * Get the unique kanji of a string that are not present in the given set. + * Only kanji are considered, not katakana, hiragana or any other symbols. + * + * @param list<string> $kanjiSet + * @return list<string> + */ + public static function kanjiDiff(string $str, array $kanjiSet): array + { + $ret = []; + + foreach (self::mbIterate(self::getOnlyKanji($str)) as $kanji) { + if (!in_array($kanji, $kanjiSet, true)) $ret[$kanji] = 0; + } + + return array_keys($ret); } }