Compare commits
No commits in common. "da9d4d344ac934d754850c2b7eeb85cb35b86081" and "b7ec7623b26b758a78f6d866a17786ed6003bf4a" have entirely different histories.
da9d4d344a
...
b7ec7623b2
|
@ -6,8 +6,6 @@ use App\Entity\SentenceListeningNote;
|
||||||
use App\Entity\SentenceNote;
|
use App\Entity\SentenceNote;
|
||||||
use App\Entity\Term;
|
use App\Entity\Term;
|
||||||
use App\Service\AnkiService;
|
use App\Service\AnkiService;
|
||||||
use App\Utils\Japanese;
|
|
||||||
use App\Utils\Progress;
|
|
||||||
use Symfony\Component\Console\Attribute\AsCommand;
|
use Symfony\Component\Console\Attribute\AsCommand;
|
||||||
use Symfony\Component\Console\Command\Command;
|
use Symfony\Component\Console\Command\Command;
|
||||||
use Symfony\Component\Console\Input\InputArgument;
|
use Symfony\Component\Console\Input\InputArgument;
|
||||||
|
@ -90,55 +88,55 @@ class CreateProductionCommand extends Command
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** @return list<SentenceNote> */
|
protected function execute(InputInterface $input, OutputInterface $output): int
|
||||||
private function getAllSentenceNotes(): array
|
|
||||||
{
|
{
|
||||||
printf('Getting all SentenceNote...');
|
printf('Getting all SentenceCards...');
|
||||||
$allIds = $this->ankiService->getAllSentenceNoteIds();
|
$allIds = $this->ankiService->getAllSentenceNoteIds();
|
||||||
$allNotes = $this->ankiService->getNotes($allIds);
|
$allNotes = $this->ankiService->getNotes($allIds);
|
||||||
printf(" OK (%d)\n", count($allNotes));
|
printf(" OK (%d)\n", count($allNotes));
|
||||||
return $allNotes;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** @return list<SentenceListeningNote> */
|
printf('Getting all SentenceCards...');
|
||||||
private function getAllSentenceListeningNotes(): array
|
|
||||||
{
|
|
||||||
printf('Getting all SentenceListeningNote...');
|
|
||||||
$allListeningIds = $this->ankiService->getAllSentenceListeningNoteIds();
|
$allListeningIds = $this->ankiService->getAllSentenceListeningNoteIds();
|
||||||
$allListeningNotes = $this->ankiService->getNotes($allListeningIds);
|
$allListeningNotes = $this->ankiService->getNotes($allListeningIds);
|
||||||
printf(" OK (%d)\n", count($allListeningNotes));
|
printf(" OK (%d)\n", count($allListeningNotes));
|
||||||
return $allListeningNotes;
|
|
||||||
}
|
|
||||||
|
|
||||||
protected function execute(InputInterface $input, OutputInterface $output): int
|
printf('Indexing all terms...');
|
||||||
{
|
$knownTerms = [];
|
||||||
$allSentenceNotes = $this->getAllSentenceNotes();
|
|
||||||
$allListeningNotes = $this->getAllSentenceListeningNotes();
|
|
||||||
|
|
||||||
// --------- Getting list<SentenceNote> into array<TermKanji, Term> ---
|
|
||||||
|
|
||||||
$allTerms = [];
|
|
||||||
$knownKanji = [];
|
$knownKanji = [];
|
||||||
$termCounts = [];
|
$termCounts = [];
|
||||||
printf('Indexing all terms...');
|
foreach ($allNotes as $note) {
|
||||||
foreach ($allSentenceNotes as $note) {
|
if (!$note instanceof SentenceNote) throw new \Exception(sprintf(
|
||||||
|
'Expected SentenceNote, got %s',
|
||||||
|
$note::class,
|
||||||
|
));
|
||||||
|
|
||||||
foreach ($note->getTerms() as &$term) {
|
foreach ($note->getTerms() as &$term) {
|
||||||
assert($term instanceof Term);
|
assert($term instanceof Term);
|
||||||
|
|
||||||
if (key_exists($term->getKanji(), $allTerms)) continue;
|
if (key_exists($term->getKanji(), $knownTerms)) continue;
|
||||||
$termCounts[$term->getKanji()] = 0;
|
$termCounts[$term->getKanji()] = 0;
|
||||||
$allTerms[$term->getKanji()] = &$term;
|
$knownTerms[$term->getKanji()] = &$term;
|
||||||
foreach (self::extractKanji($term->getKanji()) as $kanji) {
|
foreach (self::extractKanji($term->getKanji()) as $kanji) {
|
||||||
$knownKanji[$kanji] = 0;
|
$knownKanji[$kanji] = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
printf(" OK (%d)\n", count($allTerms));
|
printf(" OK (%d)\n", count($knownTerms));
|
||||||
|
|
||||||
|
$total = count($knownTerms);
|
||||||
|
$i = 0;
|
||||||
|
foreach ($allNotes as $note) {
|
||||||
|
$i += 1;
|
||||||
|
if ($i % 12 === 0 or $i === $total) {
|
||||||
|
printf(
|
||||||
|
"\33[2K\r% 7d/% 7d | %.2f GiB | Getting frequencies",
|
||||||
|
$i,
|
||||||
|
$total,
|
||||||
|
memory_get_usage() / 1024 / 1024 / 1024
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
$progress = new Progress('Getting frequenciees', count($allSentenceNotes));
|
assert($note instanceof SentenceNote);
|
||||||
foreach ($allSentenceNotes as $note) {
|
|
||||||
$progress->tick();
|
|
||||||
|
|
||||||
$sentKanji = str_replace(
|
$sentKanji = str_replace(
|
||||||
"\u{200E}",
|
"\u{200E}",
|
||||||
|
@ -146,82 +144,83 @@ class CreateProductionCommand extends Command
|
||||||
strip_tags($note->getFields()['SentKanji'])
|
strip_tags($note->getFields()['SentKanji'])
|
||||||
);
|
);
|
||||||
|
|
||||||
|
//foreach ($knownTerms as &$term) {
|
||||||
|
// assert($term instanceof Term);
|
||||||
|
|
||||||
|
// if (str_contains($sentKanji, $term->getKanji())) {
|
||||||
|
// $termCounts[$term->getKanji()] += 1;
|
||||||
|
// }
|
||||||
|
//}
|
||||||
|
|
||||||
foreach ($knownKanji as $kanji => &$count) {
|
foreach ($knownKanji as $kanji => &$count) {
|
||||||
if (str_contains($sentKanji, $kanji)) $count++;
|
if (str_contains($sentKanji, $kanji)) {
|
||||||
|
$count++;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
printf("\n");
|
printf("\n");
|
||||||
|
|
||||||
|
$seenKanji = [];
|
||||||
|
//uksort($knownTerms, function ($a, $b) {
|
||||||
|
// //return strlen(self::getOnlyKanji($b)) <=> strlen(self::getOnlyKanji($a)); // descending order
|
||||||
|
// return strlen($b) <=> strlen($a); // ascending order
|
||||||
|
//});
|
||||||
|
|
||||||
$seenKanji = $this->ankiService->getKnownSlnKanjiCounts();
|
|
||||||
printf('Rating terms...');
|
printf('Rating terms...');
|
||||||
foreach ($allTerms as $key => $term) {
|
foreach ($knownTerms as $term) {
|
||||||
$count = Japanese::kanjiDiff(
|
$termKanji = self::getOnlyKanji($term->getKanji());
|
||||||
$term->getKanji(),
|
$weight = 1 / max(mb_strlen($termKanji), 1);
|
||||||
array_keys($seenKanji),
|
|
||||||
);
|
|
||||||
|
|
||||||
if ($count <= 0) {
|
// First pass: Calculate the weight
|
||||||
unset($allTerms[$key]);
|
foreach ($knownKanji as $kanji => $count) {
|
||||||
unset($termCounts[$key]);
|
if (str_contains($termKanji, $kanji)) {
|
||||||
continue;
|
$termCounts[$term->getKanji()] += ceil($count * $weight);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
$termCounts[$term->getKanji()] = $count;
|
arsort($termCounts);
|
||||||
}
|
|
||||||
uksort($termCounts, function ($a, $b) {
|
|
||||||
$aLen = mb_strlen(Japanese::getOnlyKanji($a));
|
|
||||||
$bLen = mb_strlen(Japanese::getOnlyKanji($b));
|
|
||||||
return $bLen <=> $aLen;
|
|
||||||
});
|
|
||||||
asort($termCounts);
|
|
||||||
printf(" OK\n");
|
|
||||||
|
|
||||||
// Have into account the ones that have already been created.
|
// Have into account the ones that have already been created.
|
||||||
// This will not only skip them but also update the general array for
|
// This will not only skip them but take into account the kanjis they
|
||||||
// already seen kanji.
|
// have.
|
||||||
$seenKanji = [];
|
|
||||||
printf('Filtering out terms with no new kanji...');
|
|
||||||
// First pass: Get the list of the kanji we've seen
|
|
||||||
foreach ($allListeningNotes as $listeningNote) {
|
foreach ($allListeningNotes as $listeningNote) {
|
||||||
assert($listeningNote instanceof SentenceListeningNote);
|
assert($listeningNote instanceof SentenceListeningNote);
|
||||||
|
|
||||||
$termKanji = self::getOnlyKanji($listeningNote->getTerm()->getKanji());
|
$termKanji = self::getOnlyKanji($listeningNote->getTerm()->getKanji());
|
||||||
self::kanjiDiff($seenKanji, $termKanji);
|
self::kanjiDiff($seenKanji, $termKanji);
|
||||||
}
|
}
|
||||||
// Second pass: Remove terms with no new kanji at all
|
|
||||||
foreach ($termCounts as $term => $count) {
|
foreach ($termCounts as $term => $count) {
|
||||||
$termKanji = self::getOnlyKanji($term);
|
$termKanji = self::getOnlyKanji($term);
|
||||||
|
|
||||||
|
// Second pass: Penalize terms with no new kanji at all
|
||||||
if (!self::kanjiDiff($seenKanji, $termKanji)) {
|
if (!self::kanjiDiff($seenKanji, $termKanji)) {
|
||||||
unset($termCounts[$term]);
|
unset($termCounts[$term]);
|
||||||
unset($allTerms[$term]);
|
//unset($knownTerms[$term->getKanji()]);
|
||||||
|
//$termCounts[$term->getKanji()] = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
printf(" OK\n");
|
printf(" OK\n");
|
||||||
|
|
||||||
asort($termCounts);
|
arsort($termCounts);
|
||||||
printf("\n");
|
printf("\n");
|
||||||
|
|
||||||
|
|
||||||
$newNotesCount = intval($input->getArgument('count'));
|
$newNotesCount = intval($input->getArgument('count'));
|
||||||
|
|
||||||
foreach ($termCounts as $term => $count) {
|
foreach ($termCounts as $term => $count) {
|
||||||
if ($newNotesCount <= 0) break;
|
if ($newNotesCount <= 0) break;
|
||||||
|
|
||||||
// FIXME: This shouldn't happen at all
|
|
||||||
if (!$allTerms[$term] instanceof Term) continue;
|
|
||||||
|
|
||||||
$termKanji = self::getOnlyKanji($term);
|
$termKanji = self::getOnlyKanji($term);
|
||||||
printf("%s: %d\n", $term, $count);
|
printf("%s: %d\n", $term, $count);
|
||||||
|
|
||||||
//$len = mb_strlen($termKanji);
|
$len = mb_strlen($termKanji);
|
||||||
//for ($i = 0; $i < $len; $i++) {
|
for ($i = 0; $i < $len; $i++) {
|
||||||
// $iKanji = mb_substr($termKanji, $i, 1);
|
$iKanji = mb_substr($termKanji, $i, 1);
|
||||||
// printf(" - %s: %0.2f\n", $iKanji, $knownKanji[$iKanji] / $len);
|
printf(" - %s: %0.2f\n", $iKanji, $knownKanji[$iKanji] / $len);
|
||||||
//}
|
}
|
||||||
|
|
||||||
$this->createProductionNoteFromTerm($allTerms[$term]);
|
$this->createProductionNoteFromTerm($knownTerms[$term]);
|
||||||
$newNotesCount -= 1;
|
$newNotesCount -= 1;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -5,7 +5,6 @@ namespace App\Service;
|
||||||
use App\Entity\Note;
|
use App\Entity\Note;
|
||||||
use App\Entity\SentenceListeningNote;
|
use App\Entity\SentenceListeningNote;
|
||||||
use App\Entity\SentenceNote;
|
use App\Entity\SentenceNote;
|
||||||
use App\Utils\Japanese;
|
|
||||||
use Symfony\Contracts\HttpClient\HttpClientInterface;
|
use Symfony\Contracts\HttpClient\HttpClientInterface;
|
||||||
|
|
||||||
class AnkiService
|
class AnkiService
|
||||||
|
@ -132,26 +131,4 @@ class AnkiService
|
||||||
$this->request('updateNoteFields', ['note' => $note->toAnki()]);
|
$this->request('updateNoteFields', ['note' => $note->toAnki()]);
|
||||||
$this->request('guiBrowse', ['query' => 'nid:' . $note->getId()]);
|
$this->request('guiBrowse', ['query' => 'nid:' . $note->getId()]);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Tally how many times each kanji occurs in the terms of the notes returned
 * for all sentence-listening note ids.
 *
 * A kanji that appears several times (within one term or across terms) is
 * counted once per occurrence.
 *
 * @return array<string, int> occurrence count keyed by single kanji
 */
public function getKnownSlnKanjiCounts(): array
{
    $listeningIds = $this->getAllSentenceListeningNoteIds();
    $counts = [];

    foreach ($this->getNotes($listeningIds) as $listeningNote) {
        assert($listeningNote instanceof SentenceListeningNote);

        // Only the kanji of the term matter; everything else is stripped.
        $onlyKanji = Japanese::getOnlyKanji($listeningNote->getTerm()->getKanji());

        foreach (mb_str_split($onlyKanji) as $character) {
            $counts[$character] = ($counts[$character] ?? 0) + 1;
        }
    }

    return $counts;
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,38 +0,0 @@
|
||||||
<?php
|
|
||||||
|
|
||||||
namespace App\Utils;
|
|
||||||
|
|
||||||
/**
 * Static helpers for working with the kanji contained in a string.
 */
class Japanese
{
    /**
     * Remove every character that is not in the Unicode Han script.
     *
     * @param string $str text to filter (must be valid UTF-8 for the /u pattern)
     *
     * @return string the Han characters of $str, in their original order
     *
     * @throws \RuntimeException when the regular expression engine fails
     */
    public static function getOnlyKanji(string $str): string
    {
        $onlyKanji = preg_replace('/[^\p{Script=Han}]/u', '', $str);

        // preg_replace() yields null on failure (e.g. malformed UTF-8); report
        // that explicitly instead of tripping over the string return type.
        if ($onlyKanji === null) {
            throw new \RuntimeException(
                'Could not extract kanji: ' . preg_last_error_msg(),
            );
        }

        return $onlyKanji;
    }

    /**
     * Get the number of kanji of a string that are not in the given set
     * of kanji.
     *
     * Only kanji are considered, not katakana, hiragana or any other symbols.
     * Every occurrence is counted, so a missing kanji that appears twice in
     * $str contributes 2.
     *
     * @param list<string> $kanjiSet
     */
    public static function kanjiDiff(string $str, array $kanjiSet): int
    {
        $missing = 0;

        foreach (mb_str_split(self::getOnlyKanji($str)) as $kanji) {
            // Strict membership test: array_search() returns the index, and
            // index 0 is falsy, which made the first kanji of the set look
            // absent when the result was negated.
            if (!in_array($kanji, $kanjiSet, true)) {
                $missing++;
            }
        }

        return $missing;
    }
}
|
|
|
@ -1,26 +0,0 @@
|
||||||
<?php
|
|
||||||
|
|
||||||
namespace App\Utils;
|
|
||||||
|
|
||||||
/**
 * Minimal single-line progress indicator written to standard output.
 *
 * Each redraw erases the current terminal line and prints the tick counter,
 * the expected total, the current memory usage in GiB and a label.
 */
class Progress
{
    /**
     * @param string $message label printed after the counters
     * @param int    $total   tick count at which a redraw is always forced
     * @param int    $speed   redraw on every tick that is a multiple of this value
     * @param int    $i       number of ticks already counted
     *
     * @throws \InvalidArgumentException when $speed is 0
     */
    public function __construct(
        private string $message,
        private int $total,
        private int $speed = 12,
        private int $i = 0,
    ) {
        // tick() computes $i % $speed; fail here with a clear message rather
        // than with a modulo-by-zero error on the first tick.
        if ($speed === 0) {
            throw new \InvalidArgumentException('Progress speed must not be 0.');
        }
    }

    /**
     * Count one unit of work and redraw the line when it is due.
     */
    public function tick(): void
    {
        $this->i += 1;

        $isDue = $this->i % $this->speed === 0 || $this->i === $this->total;
        if (!$isDue) {
            return;
        }

        // ESC[2K erases the line and \r moves back to its start. The label is
        // passed as an argument so that a '%' in it is printed literally
        // instead of being parsed as a conversion specification.
        printf(
            "\33[2K\r% 7d/% 7d | %.2f GiB | %s",
            $this->i,
            $this->total,
            memory_get_usage() / 1024 / 1024 / 1024,
            $this->message,
        );
    }
}
|
|
Loading…
Reference in New Issue