feat: (At last) add rarity ordering. Also too many chore liftups
This commit is contained in:
parent
12ca7afd80
commit
f729734a71
|
@ -8,18 +8,14 @@ use App\Entity\Term;
|
||||||
use App\Service\AnkiService;
|
use App\Service\AnkiService;
|
||||||
use App\Utils\Japanese;
|
use App\Utils\Japanese;
|
||||||
use App\Utils\Progress;
|
use App\Utils\Progress;
|
||||||
|
|
||||||
use Symfony\Component\Console\Attribute\AsCommand;
|
use Symfony\Component\Console\Attribute\AsCommand;
|
||||||
use Symfony\Component\Console\Command\Command;
|
use Symfony\Component\Console\Command\Command;
|
||||||
use Symfony\Component\Console\Input\InputArgument;
|
use Symfony\Component\Console\Input\InputArgument;
|
||||||
use Symfony\Component\Console\Input\InputInterface;
|
use Symfony\Component\Console\Input\InputInterface;
|
||||||
use Symfony\Component\Console\Input\InputOption;
|
|
||||||
use Symfony\Component\Console\Output\OutputInterface;
|
use Symfony\Component\Console\Output\OutputInterface;
|
||||||
use Symfony\Component\Console\Style\SymfonyStyle;
|
|
||||||
|
|
||||||
#[AsCommand(
|
#[AsCommand('app:create:listening', 'Create new listening Anki Cards')]
|
||||||
name: 'app:create-production',
|
|
||||||
description: 'Add a short description for your command',
|
|
||||||
)]
|
|
||||||
class CreateProductionCommand extends Command
|
class CreateProductionCommand extends Command
|
||||||
{
|
{
|
||||||
public function __construct(
|
public function __construct(
|
||||||
|
@ -28,6 +24,7 @@ class CreateProductionCommand extends Command
|
||||||
parent::__construct();
|
parent::__construct();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** @return list<string> */
|
||||||
private static function extractKanji(string $str): array
|
private static function extractKanji(string $str): array
|
||||||
{
|
{
|
||||||
preg_match_all('/\p{Script=Han}/u', $str, $matches);
|
preg_match_all('/\p{Script=Han}/u', $str, $matches);
|
||||||
|
@ -39,6 +36,7 @@ class CreateProductionCommand extends Command
|
||||||
return preg_replace('/[^\p{Script=Han}]/u', '', $str);
|
return preg_replace('/[^\p{Script=Han}]/u', '', $str);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** @param array<string, 0> $ref */
|
||||||
private static function kanjiDiff(array &$ref, string $subject): bool
|
private static function kanjiDiff(array &$ref, string $subject): bool
|
||||||
{
|
{
|
||||||
$len = mb_strlen($subject);
|
$len = mb_strlen($subject);
|
||||||
|
@ -47,7 +45,7 @@ class CreateProductionCommand extends Command
|
||||||
for ($i = 0; $i < $len; $i++) {
|
for ($i = 0; $i < $len; $i++) {
|
||||||
$subKanji = mb_substr($subject, $i, 1);
|
$subKanji = mb_substr($subject, $i, 1);
|
||||||
|
|
||||||
foreach ($ref as $refKanji => $value) {
|
foreach (array_keys($ref) as $refKanji) {
|
||||||
if ($subKanji === $refKanji) continue 2;
|
if ($subKanji === $refKanji) continue 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -60,10 +58,11 @@ class CreateProductionCommand extends Command
|
||||||
|
|
||||||
protected function configure(): void
|
protected function configure(): void
|
||||||
{
|
{
|
||||||
$this
|
$this->addArgument(
|
||||||
->addArgument('count', InputArgument::OPTIONAL, 'Amount of cards to make', 1);
|
'count',
|
||||||
//->addOption('option1', null, InputOption::VALUE_NONE, 'Option description')
|
InputArgument::REQUIRED,
|
||||||
;
|
'Amount of cards to make',
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected function createProductionNoteFromTerm(Term $term): void
|
protected function createProductionNoteFromTerm(Term $term): void
|
||||||
|
@ -115,68 +114,102 @@ class CreateProductionCommand extends Command
|
||||||
$allSentenceNotes = $this->getAllSentenceNotes();
|
$allSentenceNotes = $this->getAllSentenceNotes();
|
||||||
$allListeningNotes = $this->getAllSentenceListeningNotes();
|
$allListeningNotes = $this->getAllSentenceListeningNotes();
|
||||||
|
|
||||||
// --------- Getting list<SentenceNote> into array<TermKanji, Term> ---
|
// Index of all the Terms indexed by its TermKanji
|
||||||
|
$allTerms = []; // ["パレートの法則" => App\Entity\Term]
|
||||||
|
// Set of known Kanji Characters
|
||||||
|
$knownKanji = []; // ["法" => 0, "則" => 0]
|
||||||
|
// How many times it appears (not as a term, but in KanjiSentence)
|
||||||
|
$termCounts = []; // ["パレートの法則" => 1]
|
||||||
|
|
||||||
$allTerms = [];
|
|
||||||
$knownKanji = [];
|
|
||||||
$termCounts = [];
|
|
||||||
printf('Indexing all terms...');
|
printf('Indexing all terms...');
|
||||||
foreach ($allSentenceNotes as $note) {
|
foreach ($allSentenceNotes as $note) {
|
||||||
foreach ($note->getTerms() as &$term) {
|
foreach ($note->getTerms() as &$term) {
|
||||||
assert($term instanceof Term);
|
// Deduplicate list
|
||||||
|
|
||||||
if (key_exists($term->getKanji(), $allTerms)) continue;
|
if (key_exists($term->getKanji(), $allTerms)) continue;
|
||||||
$termCounts[$term->getKanji()] = 0;
|
// Actual indexing
|
||||||
$allTerms[$term->getKanji()] = &$term;
|
$allTerms[$term->getKanji()] = &$term;
|
||||||
|
// Just simple initialization
|
||||||
|
$termCounts[$term->getKanji()] = 0;
|
||||||
foreach (self::extractKanji($term->getKanji()) as $kanji) {
|
foreach (self::extractKanji($term->getKanji()) as $kanji) {
|
||||||
$knownKanji[$kanji] = 0;
|
$knownKanji[$kanji] = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
printf(" OK (%d)\n", count($allTerms));
|
printf(" OK (%d)\n", count($knownKanji));
|
||||||
|
|
||||||
|
|
||||||
|
// Populate $knownKanji ["例" => 378, ...];
|
||||||
|
// TODO: Move this into own function to prevent side-effects. It's
|
||||||
|
// looping through the whole thing again anyway, so there's no
|
||||||
|
// need for it in here.
|
||||||
|
//
|
||||||
|
// Maybe while you're at it, it could be simplified into a
|
||||||
|
// function like $this->anki->getKanji('origField', 'countField')
|
||||||
|
// Where count can be null so it's just a Set
|
||||||
$progress = new Progress('Getting frequenciees', count($allSentenceNotes));
|
$progress = new Progress('Getting frequenciees', count($allSentenceNotes));
|
||||||
foreach ($allSentenceNotes as $note) {
|
foreach ($allSentenceNotes as $note) {
|
||||||
$progress->tick();
|
$progress->tick();
|
||||||
|
|
||||||
$sentKanji = str_replace(
|
// Sanitize sentence (remove those pesky \u{200E})
|
||||||
|
$_sentKanji = str_replace(
|
||||||
"\u{200E}",
|
"\u{200E}",
|
||||||
'',
|
'',
|
||||||
strip_tags($note->getFields()['SentKanji'])
|
strip_tags($note->getFields()['SentKanji'])
|
||||||
);
|
);
|
||||||
|
|
||||||
foreach ($knownKanji as $kanji => &$count) {
|
foreach ($knownKanji as $kanji => &$count) {
|
||||||
if (str_contains($sentKanji, $kanji)) $count++;
|
if (str_contains($_sentKanji, $kanji)) $count++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// TODO: Make progress a function with a callback? That way scope inside
|
||||||
|
// and side-effects are easy to control & track
|
||||||
|
unset($progress);
|
||||||
printf("\n");
|
printf("\n");
|
||||||
|
|
||||||
|
|
||||||
$seenKanji = $this->ankiService->getKnownSlnKanjiCounts();
|
// Build the values to be used in the ordering process
|
||||||
|
// TODO: It kinda feels wrong that $termCounts is used in this special
|
||||||
|
// manner while $seenScore is separate. Does it make sense to
|
||||||
|
// build them at this stage? Make a generic orderer?
|
||||||
|
//
|
||||||
|
// $termOrdering = ['first' => 32, 'second' => 34, 'apple' => 2];
|
||||||
|
//
|
||||||
|
// At first we just built the term list, then we generate an
|
||||||
|
// ordering array where the list is ordered
|
||||||
printf('Rating terms...');
|
printf('Rating terms...');
|
||||||
|
$studiedKanji = $this->ankiService->getKnownSlnKanjiCounts('ASC');
|
||||||
|
$seenScore = [];
|
||||||
foreach ($allTerms as $key => $term) {
|
foreach ($allTerms as $key => $term) {
|
||||||
$count = Japanese::kanjiDiff(
|
$diff = Japanese::kanjiDiff($term->getKanji(), array_keys($studiedKanji));
|
||||||
$term->getKanji(),
|
|
||||||
array_keys($seenKanji),
|
|
||||||
);
|
|
||||||
|
|
||||||
if ($count <= 0) {
|
if (count($diff) <= 0) {
|
||||||
unset($allTerms[$key]);
|
unset($allTerms[$key]);
|
||||||
unset($termCounts[$key]);
|
unset($termCounts[$key]);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
$termCounts[$term->getKanji()] = $count;
|
// Build $seenScore
|
||||||
|
$seenScore[$key] = 0;
|
||||||
|
foreach (Japanese::getKanjiList($key) as $_kanji) {
|
||||||
|
$seenScore[$key] += $studiedKanji[$_kanji] ?? 0;
|
||||||
}
|
}
|
||||||
|
// Build $termCounts
|
||||||
|
$termCounts[$term->getKanji()] = count($diff);
|
||||||
|
}
|
||||||
|
// Ordering in having:
|
||||||
|
// 1. Least new Kanji (ideally we just want 1 new kanji)
|
||||||
|
// 2. Most Kanji (most amount of unique kanji)
|
||||||
|
// 3. Least studied kanji
|
||||||
|
uksort($termCounts, fn($a, $b) => $seenScore[$a] <=> $seenScore[$b]);
|
||||||
uksort($termCounts, function ($a, $b) {
|
uksort($termCounts, function ($a, $b) {
|
||||||
$aLen = mb_strlen(Japanese::getOnlyKanji($a));
|
$aLen = count(Japanese::getKanjiList($a));
|
||||||
$bLen = mb_strlen(Japanese::getOnlyKanji($b));
|
$bLen = count(Japanese::getKanjiList($b));
|
||||||
return $bLen <=> $aLen;
|
return $bLen <=> $aLen;
|
||||||
});
|
});
|
||||||
asort($termCounts);
|
asort($termCounts, SORT_NUMERIC);
|
||||||
printf(" OK\n");
|
printf(" OK\n");
|
||||||
|
|
||||||
|
|
||||||
// Take into account the ones that have already been created.
|
// Take into account the ones that have already been created.
|
||||||
// This will not only skip them but also update the general array for
|
// This will not only skip them but also update the general array for
|
||||||
// already seen kanji.
|
// already seen kanji.
|
||||||
|
@ -184,8 +217,6 @@ class CreateProductionCommand extends Command
|
||||||
printf('Filtering out terms with no new kanji...');
|
printf('Filtering out terms with no new kanji...');
|
||||||
// First pass: Get the list of the kanji we've seen
|
// First pass: Get the list of the kanji we've seen
|
||||||
foreach ($allListeningNotes as $listeningNote) {
|
foreach ($allListeningNotes as $listeningNote) {
|
||||||
assert($listeningNote instanceof SentenceListeningNote);
|
|
||||||
|
|
||||||
$termKanji = self::getOnlyKanji($listeningNote->getTerm()->getKanji());
|
$termKanji = self::getOnlyKanji($listeningNote->getTerm()->getKanji());
|
||||||
self::kanjiDiff($seenKanji, $termKanji);
|
self::kanjiDiff($seenKanji, $termKanji);
|
||||||
}
|
}
|
||||||
|
@ -200,7 +231,7 @@ class CreateProductionCommand extends Command
|
||||||
}
|
}
|
||||||
printf(" OK\n");
|
printf(" OK\n");
|
||||||
|
|
||||||
asort($termCounts);
|
asort($termCounts, SORT_NUMERIC);
|
||||||
printf("\n");
|
printf("\n");
|
||||||
|
|
||||||
|
|
||||||
|
@ -209,11 +240,8 @@ class CreateProductionCommand extends Command
|
||||||
foreach ($termCounts as $term => $count) {
|
foreach ($termCounts as $term => $count) {
|
||||||
if ($newNotesCount <= 0) break;
|
if ($newNotesCount <= 0) break;
|
||||||
|
|
||||||
// FIXME: This shouldn't happen at all
|
|
||||||
if (!$allTerms[$term] instanceof Term) continue;
|
|
||||||
|
|
||||||
$termKanji = self::getOnlyKanji($term);
|
$termKanji = self::getOnlyKanji($term);
|
||||||
printf("%s: %d\n", $term, $count);
|
printf("%s %d | %d\n", "{$term}:", $count, $seenScore[$term]);
|
||||||
|
|
||||||
//$len = mb_strlen($termKanji);
|
//$len = mb_strlen($termKanji);
|
||||||
//for ($i = 0; $i < $len; $i++) {
|
//for ($i = 0; $i < $len; $i++) {
|
||||||
|
|
|
@ -73,6 +73,14 @@ class AnkiController extends AbstractController
|
||||||
]);
|
]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Respond with every kanji counted by AnkiService::getKnownSnKanjiCounts(),
 * concatenated into a single string and ordered from the highest count to
 * the lowest.
 */
#[Route('/kanji', 'kanji', methods: 'GET')]
public function kanji(): Response
{
    $counts = $this->ankiService->getKnownSnKanjiCounts();

    // arsort() is what yields the descending order. SORT_DESC is an
    // array_multisort() order constant, not a valid asort() flag, so the
    // previous asort($thing, SORT_DESC) call still sorted ascending.
    arsort($counts, SORT_NUMERIC);

    return new Response(implode('', array_keys($counts)));
}
|
||||||
|
|
||||||
#[Route('/note/{nid}/get', name: 'get_note')]
|
#[Route('/note/{nid}/get', name: 'get_note')]
|
||||||
public function get_note(int $nid)
|
public function get_note(int $nid)
|
||||||
{
|
{
|
||||||
|
|
|
@ -11,10 +11,12 @@ class SentenceNote extends Note
|
||||||
|
|
||||||
// -------------------------------------------------- Getters & setters ---
|
// -------------------------------------------------- Getters & setters ---
|
||||||
|
|
||||||
|
/** @return list<Term> */
|
||||||
public function getTerms(): array
|
public function getTerms(): array
|
||||||
{
|
{
|
||||||
return $this->terms;
|
return $this->terms;
|
||||||
}
|
}
|
||||||
|
/** @param list<Term> $terms */
|
||||||
public function setTerms(array $terms): static
|
public function setTerms(array $terms): static
|
||||||
{
|
{
|
||||||
$this->terms = $terms;
|
$this->terms = $terms;
|
||||||
|
|
|
@ -18,15 +18,21 @@ class Term
|
||||||
return self::parseFurigana($this->kanji)['kanji'];
|
return self::parseFurigana($this->kanji)['kanji'];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the kanji version & the reading for a given term
|
||||||
|
*
|
||||||
|
* TODO: Make this smarter & handle mixing of kanji & hiragana
|
||||||
|
*
|
||||||
|
* @return array{'kanji': string, 'reading': null|string}
|
||||||
|
* */
|
||||||
public static function parseFurigana(string $furigana): array
|
public static function parseFurigana(string $furigana): array
|
||||||
{
|
{
|
||||||
// 0: all, 1: (kanji/hiragana), 2: ([reading]): 3: (reading)
|
// 0: all, 1: (kanji/hiragana), 2: ([reading, ...]), 3: (reading)
|
||||||
preg_match_all('/([^ \[]+)(\[([^\]]*)\])? ?/', $furigana, $matches, PREG_SET_ORDER);
|
preg_match_all('/([^ \[]+)(\[([^\]]*)\])? ?/', $furigana, $matches, PREG_SET_ORDER);
|
||||||
|
|
||||||
$matchedKanji = array_map(fn($x) => $x[1], $matches);
|
$matchedKanji = array_map(fn($x) => $x[1], $matches);
|
||||||
$matchedReading = array_map(fn($x) => $x[3] ?? $x[1], $matches);
|
$matchedReading = array_map(fn($x) => $x[3] ?? $x[1], $matches);
|
||||||
|
|
||||||
|
|
||||||
return [
|
return [
|
||||||
'kanji' => join('', $matchedKanji),
|
'kanji' => join('', $matchedKanji),
|
||||||
'reading' => $matchedKanji == $matchedReading
|
'reading' => $matchedKanji == $matchedReading
|
||||||
|
@ -35,7 +41,7 @@ class Term
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|
||||||
public function toAnkiVocabDef()
|
public function toAnkiVocabDef(): string
|
||||||
{
|
{
|
||||||
$ret = '<span ' . Note::HIGHLIGHT_ATTR_KANJI . '>' . $this->kanji;
|
$ret = '<span ' . Note::HIGHLIGHT_ATTR_KANJI . '>' . $this->kanji;
|
||||||
|
|
||||||
|
@ -80,9 +86,6 @@ class Term
|
||||||
' ' => ' ',
|
' ' => ' ',
|
||||||
]));
|
]));
|
||||||
$def = mb_trim($def);
|
$def = mb_trim($def);
|
||||||
if (!is_string($term->kanji)) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
// -------------------------------------------------- No definition ---
|
// -------------------------------------------------- No definition ---
|
||||||
|
|
||||||
|
@ -139,6 +142,10 @@ class Term
|
||||||
return Term::fromVocabDefLine($kanji . $separator . $def);
|
return Term::fromVocabDefLine($kanji . $separator . $def);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param array<string, string> $fields
|
||||||
|
* @return list<Term>
|
||||||
|
*/
|
||||||
public static function fromNoteFields(array $fields): array
|
public static function fromNoteFields(array $fields): array
|
||||||
{
|
{
|
||||||
// -------------------- Trying to extract it with the modern syntax ---
|
// -------------------- Trying to extract it with the modern syntax ---
|
||||||
|
|
|
@ -134,7 +134,7 @@ class AnkiService
|
||||||
}
|
}
|
||||||
|
|
||||||
/** @return array<string, int> */
|
/** @return array<string, int> */
|
||||||
public function getKnownSlnKanjiCounts(): array
|
public function getKnownSlnKanjiCounts(?string $order = null): array
|
||||||
{
|
{
|
||||||
$allListeningIds = $this->getAllSentenceListeningNoteIds();
|
$allListeningIds = $this->getAllSentenceListeningNoteIds();
|
||||||
$ret = [];
|
$ret = [];
|
||||||
|
@ -152,6 +152,34 @@ class AnkiService
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
uasort($ret, function (int $a, int $b) use ($order) {
|
||||||
|
return $order === 'ASC' ? $a <=> $b : $b <=> $a;
|
||||||
|
});
|
||||||
|
|
||||||
|
return $ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Count how many times each kanji character occurs in the terms of all
 * sentence notes. A kanji repeated inside one term is counted every time.
 *
 * @return array<string, int> Kanji character => number of occurrences
 */
public function getKnownSnKanjiCounts(): array
{
    $counts = [];
    $sentenceNoteIds = $this->getAllSentenceNoteIds();

    foreach ($this->getNotes($sentenceNoteIds) as $sentenceNote) {
        assert($sentenceNote instanceof SentenceNote);

        foreach ($sentenceNote->getTerms() as $term) {
            // Strip everything that is not a kanji, then walk the
            // remaining string one (multibyte) character at a time.
            $onlyKanji = Japanese::getOnlyKanji($term->getKanji());

            foreach (mb_str_split($onlyKanji) as $character) {
                $counts[$character] = ($counts[$character] ?? 0) + 1;
            }
        }
    }

    return $counts;
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,33 +6,48 @@ class Japanese
|
||||||
{
|
{
|
||||||
public static function getOnlyKanji(string $str): string
|
public static function getOnlyKanji(string $str): string
|
||||||
{
|
{
|
||||||
return preg_replace('/[^\p{Script=Han}]/u', '', $str);
|
return preg_replace('/[^\p{Script=Han}]/u', '', $str) ?? '';
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Get the number of kanji of a string that are not in the given set
|
/** @return \Generator<int, string> */
|
||||||
* of kanji
|
public static function mbIterate(
|
||||||
*
|
string $str,
|
||||||
* Only kanji are considered, not katakana, hiragana or any other symbols.
|
int $start = 0,
|
||||||
|
int $length = 1,
|
||||||
|
?string $encoding = null,
|
||||||
|
): \Generator {
|
||||||
|
while (($char = mb_substr($str, $start++, $length, $encoding)) !== '') {
|
||||||
|
yield $char;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * List the distinct kanji of a string, in order of first appearance.
 *
 * Anything that is not a kanji is dropped by getOnlyKanji() before the
 * string is walked.
 *
 * @return list<string>
 */
public static function getKanjiList(string $str): array
{
    $distinct = [];
    $onlyKanji = self::getOnlyKanji($str);

    foreach (self::mbIterate($onlyKanji) as $character) {
        // Keep only the first occurrence of every kanji
        if (!in_array($character, $distinct, true)) {
            $distinct[] = $character;
        }
    }

    return $distinct;
}
|
||||||
|
|
||||||
|
/**
 * Get the kanji of a string that are absent from a given set of kanji.
 *
 * Only kanji are considered, not katakana, hiragana or any other symbols.
 * Each missing kanji is reported once, in order of first appearance.
 *
 * @param list<string> $kanjiSet Kanji to compare against
 *
 * @return list<string> Distinct kanji of $str that are not in $kanjiSet
 */
public static function kanjiDiff(string $str, array $kanjiSet): array
{
    $missing = [];

    foreach (self::mbIterate(self::getOnlyKanji($str)) as $kanji) {
        // Strict in_array() rather than a negated array_search():
        // array_search() returns the matching key, and key 0 is falsy, so
        // a kanji sitting first in the set was wrongly reported as missing.
        if (!in_array($kanji, $kanjiSet, true)) {
            // Used as a set: keying by kanji removes duplicates
            $missing[$kanji] = 0;
        }
    }

    return array_keys($missing);
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue