feat: (At last) add rarity ordering. Also too many chore liftups
This commit is contained in:
parent
12ca7afd80
commit
f729734a71
|
@ -8,18 +8,14 @@ use App\Entity\Term;
|
|||
use App\Service\AnkiService;
|
||||
use App\Utils\Japanese;
|
||||
use App\Utils\Progress;
|
||||
|
||||
use Symfony\Component\Console\Attribute\AsCommand;
|
||||
use Symfony\Component\Console\Command\Command;
|
||||
use Symfony\Component\Console\Input\InputArgument;
|
||||
use Symfony\Component\Console\Input\InputInterface;
|
||||
use Symfony\Component\Console\Input\InputOption;
|
||||
use Symfony\Component\Console\Output\OutputInterface;
|
||||
use Symfony\Component\Console\Style\SymfonyStyle;
|
||||
|
||||
#[AsCommand(
|
||||
name: 'app:create-production',
|
||||
description: 'Add a short description for your command',
|
||||
)]
|
||||
#[AsCommand('app:create:listening', 'Create new listening Anki Cards')]
|
||||
class CreateProductionCommand extends Command
|
||||
{
|
||||
public function __construct(
|
||||
|
@ -28,6 +24,7 @@ class CreateProductionCommand extends Command
|
|||
parent::__construct();
|
||||
}
|
||||
|
||||
/** @return list<string> */
|
||||
private static function extractKanji(string $str): array
|
||||
{
|
||||
preg_match_all('/\p{Script=Han}/u', $str, $matches);
|
||||
|
@ -39,6 +36,7 @@ class CreateProductionCommand extends Command
|
|||
return preg_replace('/[^\p{Script=Han}]/u', '', $str);
|
||||
}
|
||||
|
||||
/** @param array<string, 0> $ref */
|
||||
private static function kanjiDiff(array &$ref, string $subject): bool
|
||||
{
|
||||
$len = mb_strlen($subject);
|
||||
|
@ -47,7 +45,7 @@ class CreateProductionCommand extends Command
|
|||
for ($i = 0; $i < $len; $i++) {
|
||||
$subKanji = mb_substr($subject, $i, 1);
|
||||
|
||||
foreach ($ref as $refKanji => $value) {
|
||||
foreach (array_keys($ref) as $refKanji) {
|
||||
if ($subKanji === $refKanji) continue 2;
|
||||
}
|
||||
|
||||
|
@ -60,10 +58,11 @@ class CreateProductionCommand extends Command
|
|||
|
||||
/**
 * Declare the command's CLI input.
 *
 * Registers a single required `count` argument (amount of cards to make).
 */
protected function configure(): void
{
    // `count` must be registered exactly once: Symfony's InputDefinition
    // throws a LogicException when an argument name is added twice, so the
    // former optional (default 1) registration is not kept alongside this one.
    $this->addArgument(
        'count',
        InputArgument::REQUIRED,
        'Amount of cards to make',
    );
}
|
||||
|
||||
protected function createProductionNoteFromTerm(Term $term): void
|
||||
|
@ -115,68 +114,102 @@ class CreateProductionCommand extends Command
|
|||
$allSentenceNotes = $this->getAllSentenceNotes();
|
||||
$allListeningNotes = $this->getAllSentenceListeningNotes();
|
||||
|
||||
// --------- Getting list<SentenceNote> into array<TermKanji, Term> ---
|
||||
// Index of all the Terms indexed by its TermKanji
|
||||
$allTerms = []; // ["パレートの法則" => App\Entity\Term]
|
||||
// Set of known Kanji Characters
|
||||
$knownKanji = []; // ["法" => 0, "則" => 0]
|
||||
// How many times it appears (not as a term, but in KanjiSentence)
|
||||
$termCounts = []; // ["パレートの法則" => 1]
|
||||
|
||||
$allTerms = [];
|
||||
$knownKanji = [];
|
||||
$termCounts = [];
|
||||
printf('Indexing all terms...');
|
||||
foreach ($allSentenceNotes as $note) {
|
||||
foreach ($note->getTerms() as &$term) {
|
||||
assert($term instanceof Term);
|
||||
|
||||
// Deduplicate list
|
||||
if (key_exists($term->getKanji(), $allTerms)) continue;
|
||||
$termCounts[$term->getKanji()] = 0;
|
||||
// Actual indexing
|
||||
$allTerms[$term->getKanji()] = &$term;
|
||||
// Just simple initialization
|
||||
$termCounts[$term->getKanji()] = 0;
|
||||
foreach (self::extractKanji($term->getKanji()) as $kanji) {
|
||||
$knownKanji[$kanji] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
printf(" OK (%d)\n", count($allTerms));
|
||||
printf(" OK (%d)\n", count($knownKanji));
|
||||
|
||||
|
||||
// Populate $knownKanji ["例" => 378, ...];
|
||||
// TODO: Move this into own function to prevent side-effects. It's
|
||||
// looping through the whole thing again anyway, so there's no
|
||||
// need for it in here.
|
||||
//
|
||||
// Maybe while you're at it, it could be simplified into a
|
||||
// function like $this->anki->getKanji('origField', 'countField')
|
||||
// Where count can be null so it's just a Set
|
||||
$progress = new Progress('Getting frequenciees', count($allSentenceNotes));
|
||||
foreach ($allSentenceNotes as $note) {
|
||||
$progress->tick();
|
||||
|
||||
$sentKanji = str_replace(
|
||||
// Sanitize sentence (remove those pesky \u{200E})
|
||||
$_sentKanji = str_replace(
|
||||
"\u{200E}",
|
||||
'',
|
||||
strip_tags($note->getFields()['SentKanji'])
|
||||
);
|
||||
|
||||
foreach ($knownKanji as $kanji => &$count) {
|
||||
if (str_contains($sentKanji, $kanji)) $count++;
|
||||
if (str_contains($_sentKanji, $kanji)) $count++;
|
||||
}
|
||||
}
|
||||
// TODO: Make progress a function with a callback? That way scope inside
|
||||
// and side-effects are easy to control & track
|
||||
unset($progress);
|
||||
printf("\n");
|
||||
|
||||
|
||||
$seenKanji = $this->ankiService->getKnownSlnKanjiCounts();
|
||||
// Build the values to be used in the ordering process
|
||||
// TODO: It kinda feels wrong that $termCounts is used in this special
|
||||
// manner while $seenScore is separate. Does it make sense to
|
||||
// build them at this stage? Make a generic orderer?
|
||||
//
|
||||
// $termOrdering = ['first' => 32, 'second' => 34, 'apple' => 2];
|
||||
//
|
||||
// At first we just built the term list, then we generate an
|
||||
// ordering array where the list is ordered
|
||||
printf('Rating terms...');
|
||||
$studiedKanji = $this->ankiService->getKnownSlnKanjiCounts('ASC');
|
||||
$seenScore = [];
|
||||
foreach ($allTerms as $key => $term) {
|
||||
$count = Japanese::kanjiDiff(
|
||||
$term->getKanji(),
|
||||
array_keys($seenKanji),
|
||||
);
|
||||
$diff = Japanese::kanjiDiff($term->getKanji(), array_keys($studiedKanji));
|
||||
|
||||
if ($count <= 0) {
|
||||
if (count($diff) <= 0) {
|
||||
unset($allTerms[$key]);
|
||||
unset($termCounts[$key]);
|
||||
continue;
|
||||
}
|
||||
|
||||
$termCounts[$term->getKanji()] = $count;
|
||||
// Build $seenScore
|
||||
$seenScore[$key] = 0;
|
||||
foreach (Japanese::getKanjiList($key) as $_kanji) {
|
||||
$seenScore[$key] += $studiedKanji[$_kanji] ?? 0;
|
||||
}
|
||||
// Build $termCounts
|
||||
$termCounts[$term->getKanji()] = count($diff);
|
||||
}
|
||||
// Ordering in having:
|
||||
// 1. Least new Kanji (ideally just 1 new kanji)
|
||||
// 2. Most Kanji (most amount of unique kanji)
|
||||
// 3. Least studied kanji
|
||||
uksort($termCounts, fn($a, $b) => $seenScore[$a] <=> $seenScore[$b]);
|
||||
uksort($termCounts, function ($a, $b) {
|
||||
$aLen = mb_strlen(Japanese::getOnlyKanji($a));
|
||||
$bLen = mb_strlen(Japanese::getOnlyKanji($b));
|
||||
$aLen = count(Japanese::getKanjiList($a));
|
||||
$bLen = count(Japanese::getKanjiList($b));
|
||||
return $bLen <=> $aLen;
|
||||
});
|
||||
asort($termCounts);
|
||||
asort($termCounts, SORT_NUMERIC);
|
||||
printf(" OK\n");
|
||||
|
||||
|
||||
// Have into account the ones that have already been created.
|
||||
// This will not only skip them but also update the general array for
|
||||
// already seen kanji.
|
||||
|
@ -184,8 +217,6 @@ class CreateProductionCommand extends Command
|
|||
printf('Filtering out terms with no new kanji...');
|
||||
// First pass: Get the list of the kanji we've seen
|
||||
foreach ($allListeningNotes as $listeningNote) {
|
||||
assert($listeningNote instanceof SentenceListeningNote);
|
||||
|
||||
$termKanji = self::getOnlyKanji($listeningNote->getTerm()->getKanji());
|
||||
self::kanjiDiff($seenKanji, $termKanji);
|
||||
}
|
||||
|
@ -200,7 +231,7 @@ class CreateProductionCommand extends Command
|
|||
}
|
||||
printf(" OK\n");
|
||||
|
||||
asort($termCounts);
|
||||
asort($termCounts, SORT_NUMERIC);
|
||||
printf("\n");
|
||||
|
||||
|
||||
|
@ -209,11 +240,8 @@ class CreateProductionCommand extends Command
|
|||
foreach ($termCounts as $term => $count) {
|
||||
if ($newNotesCount <= 0) break;
|
||||
|
||||
// FIXME: This shouldn't happen at all
|
||||
if (!$allTerms[$term] instanceof Term) continue;
|
||||
|
||||
$termKanji = self::getOnlyKanji($term);
|
||||
printf("%s: %d\n", $term, $count);
|
||||
printf("%s %d | %d\n", "{$term}:", $count, $seenScore[$term]);
|
||||
|
||||
//$len = mb_strlen($termKanji);
|
||||
//for ($i = 0; $i < $len; $i++) {
|
||||
|
|
|
@ -73,6 +73,14 @@ class AnkiController extends AbstractController
|
|||
]);
|
||||
}
|
||||
|
||||
/**
 * Respond with every kanji returned by AnkiService::getKnownSnKanjiCounts(),
 * concatenated into one string and ordered from highest to lowest count.
 */
#[Route('/kanji', 'kanji', methods: 'GET')]
public function kanji(): Response
{
    $counts = $this->ankiService->getKnownSnKanjiCounts();

    // SORT_DESC is an array_multisort() order constant, not a sort flag:
    // passing it to asort() still sorts ascending. arsort() is the
    // key-preserving descending sort.
    arsort($counts, SORT_NUMERIC);

    return new Response(implode('', array_keys($counts)));
}
|
||||
|
||||
#[Route('/note/{nid}/get', name: 'get_note')]
|
||||
public function get_note(int $nid)
|
||||
{
|
||||
|
|
|
@ -11,10 +11,12 @@ class SentenceNote extends Note
|
|||
|
||||
// -------------------------------------------------- Getters & setters ---
|
||||
|
||||
/**
 * Return the value currently stored in the `terms` property, unmodified.
 *
 * @return list<Term>
 */
public function getTerms(): array
{
    return $this->terms;
}
|
||||
/** @param list<Term> $terms */
|
||||
public function setTerms(array $terms): static
|
||||
{
|
||||
$this->terms = $terms;
|
||||
|
|
|
@ -18,15 +18,21 @@ class Term
|
|||
return self::parseFurigana($this->kanji)['kanji'];
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the kanji version & the reading for a given term
|
||||
*
|
||||
* TODO: Make this smarter & handle mixing of kanji & hiragana
|
||||
*
|
||||
* @return array{'kanji': string, 'reading': null|string}
|
||||
* */
|
||||
public static function parseFurigana(string $furigana): array
|
||||
{
|
||||
// 0: all, 1: (kanji/hiragana), 2: ([reading]): 3: (reading)
|
||||
// 0: all, 1: (kanji/hiragana), 2: ([reading, ...]), 3: (reading)
|
||||
preg_match_all('/([^ \[]+)(\[([^\]]*)\])? ?/', $furigana, $matches, PREG_SET_ORDER);
|
||||
|
||||
$matchedKanji = array_map(fn($x) => $x[1], $matches);
|
||||
$matchedReading = array_map(fn($x) => $x[3] ?? $x[1], $matches);
|
||||
|
||||
|
||||
return [
|
||||
'kanji' => join('', $matchedKanji),
|
||||
'reading' => $matchedKanji == $matchedReading
|
||||
|
@ -35,7 +41,7 @@ class Term
|
|||
];
|
||||
}
|
||||
|
||||
public function toAnkiVocabDef()
|
||||
public function toAnkiVocabDef(): string
|
||||
{
|
||||
$ret = '<span ' . Note::HIGHLIGHT_ATTR_KANJI . '>' . $this->kanji;
|
||||
|
||||
|
@ -80,9 +86,6 @@ class Term
|
|||
' ' => ' ',
|
||||
]));
|
||||
$def = mb_trim($def);
|
||||
if (!is_string($term->kanji)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// -------------------------------------------------- No definition ---
|
||||
|
||||
|
@ -139,6 +142,10 @@ class Term
|
|||
return Term::fromVocabDefLine($kanji . $separator . $def);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array<string, string> $fields
|
||||
* @return list<Term>
|
||||
*/
|
||||
public static function fromNoteFields(array $fields): array
|
||||
{
|
||||
// -------------------- Trying to extract it with the modern syntax ---
|
||||
|
|
|
@ -134,7 +134,7 @@ class AnkiService
|
|||
}
|
||||
|
||||
/** @return array<string, int> */
|
||||
public function getKnownSlnKanjiCounts(): array
|
||||
public function getKnownSlnKanjiCounts(?string $order = null): array
|
||||
{
|
||||
$allListeningIds = $this->getAllSentenceListeningNoteIds();
|
||||
$ret = [];
|
||||
|
@ -152,6 +152,34 @@ class AnkiService
|
|||
}
|
||||
}
|
||||
|
||||
uasort($ret, function (int $a, int $b) use ($order) {
|
||||
return $order === 'ASC' ? $a <=> $b : $b <=> $a;
|
||||
});
|
||||
|
||||
return $ret;
|
||||
}
|
||||
|
||||
/**
 * Tally how often each kanji character occurs in the terms of the notes
 * returned for getAllSentenceNoteIds().
 *
 * Every occurrence is counted, including repeats inside a single term.
 * Non-kanji characters are removed by Japanese::getOnlyKanji() first.
 *
 * @return array<string, int> kanji character => number of occurrences
 */
public function getKnownSnKanjiCounts(): array
{
    $sentenceNoteIds = $this->getAllSentenceNoteIds();
    $counts = [];

    foreach ($this->getNotes($sentenceNoteIds) as $sentenceNote) {
        assert($sentenceNote instanceof SentenceNote);

        foreach ($sentenceNote->getTerms() as $term) {
            $kanjiOnly = Japanese::getOnlyKanji($term->getKanji());

            // One iteration per multibyte character of the kanji-only string.
            foreach (mb_str_split($kanjiOnly) as $kanji) {
                $counts[$kanji] = ($counts[$kanji] ?? 0) + 1;
            }
        }
    }

    return $counts;
}
|
||||
}
|
||||
|
|
|
class Japanese
{
    /**
     * Remove every character that is not in the Han script.
     *
     * @return string The Han characters of $str in their original order, or
     *                '' when preg_replace() fails and returns null.
     */
    public static function getOnlyKanji(string $str): string
    {
        // preg_replace() returns null on a PCRE error; fall back to '' so the
        // declared string return type always holds.
        return preg_replace('/[^\p{Script=Han}]/u', '', $str) ?? '';
    }

    /**
     * Lazily walk a multibyte string with mb_substr().
     *
     * The offset advances by one character per step regardless of $length,
     * so with $length > 1 consecutive chunks overlap. Iteration stops at the
     * first empty chunk.
     *
     * @param string      $str      String to iterate over.
     * @param int         $start    Offset of the first chunk (passed to mb_substr()).
     * @param int         $length   Length of each yielded chunk.
     * @param string|null $encoding Encoding passed to mb_substr(); null uses its default.
     *
     * @return \Generator<int, string>
     */
    public static function mbIterate(
        string $str,
        int $start = 0,
        int $length = 1,
        ?string $encoding = null,
    ): \Generator {
        while (($char = mb_substr($str, $start++, $length, $encoding)) !== '') {
            yield $char;
        }
    }

    /**
     * List the distinct kanji of a string in order of first appearance.
     *
     * @return list<string>
     */
    public static function getKanjiList(string $str): array
    {
        $ret = [];

        // Array keys double as a set, dropping repeated kanji.
        foreach (self::mbIterate(self::getOnlyKanji($str)) as $kanji) {
            $ret[$kanji] = 0;
        }

        return array_keys($ret);
    }

    /**
     * Get the distinct kanji of $str that are NOT present in $kanjiSet.
     * Only kanji are considered, not katakana, hiragana or any other symbols.
     *
     * @param list<string> $kanjiSet Kanji to exclude from the result.
     *
     * @return list<string> Kanji of $str missing from $kanjiSet, in order of
     *                      first appearance.
     */
    public static function kanjiDiff(string $str, array $kanjiSet): array
    {
        $missing = [];

        foreach (self::getKanjiList($str) as $kanji) {
            // Strict membership test. The former `!array_search(...)` treated
            // a match at index 0 as "not found", so the first kanji of the
            // set was always reported as missing.
            if (!in_array($kanji, $kanjiSet, true)) {
                $missing[] = $kanji;
            }
        }

        return $missing;
    }
}
|
||||
|
|
Loading…
Reference in New Issue