feat: WIP | Prod. generation command. Get & rank terms to create
This commit is contained in:
parent
eadd8a01ea
commit
96963fb926
|
@ -0,0 +1,209 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
namespace App\Command;
|
||||||
|
|
||||||
|
use App\Entity\SentenceNote;
|
||||||
|
use App\Entity\Term;
|
||||||
|
use App\Service\AnkiService;
|
||||||
|
use Symfony\Component\Console\Attribute\AsCommand;
|
||||||
|
use Symfony\Component\Console\Command\Command;
|
||||||
|
use Symfony\Component\Console\Input\InputArgument;
|
||||||
|
use Symfony\Component\Console\Input\InputInterface;
|
||||||
|
use Symfony\Component\Console\Input\InputOption;
|
||||||
|
use Symfony\Component\Console\Output\OutputInterface;
|
||||||
|
use Symfony\Component\Console\Style\SymfonyStyle;
|
||||||
|
|
||||||
|
#[AsCommand(
    name: 'app:create-production',
    description: 'Rank the terms found in Anki sentence notes by kanji frequency',
)]
class CreateProductionCommand extends Command
{
    public function __construct(
        private AnkiService $ankiService,
    ) {
        parent::__construct();
    }

    /**
     * Returns the distinct Han characters contained in $str.
     *
     * @return array<int, string> distinct characters; keys are the ones kept by
     *                            array_unique(), so they are not necessarily sequential
     *
     * @throws \RuntimeException when the regular expression engine fails
     *                           (e.g. $str is not valid UTF-8)
     */
    private static function extractKanji(string $str): array
    {
        // preg_match_all() returns false on failure; without this check the
        // failure would only surface later as an unrelated TypeError.
        if (preg_match_all('/\p{Script=Han}/u', $str, $matches) === false) {
            throw new \RuntimeException(sprintf(
                'Could not extract kanji: %s',
                preg_last_error_msg(),
            ));
        }

        return array_unique($matches[0]);
    }

    /**
     * Returns $str with every character that is not a Han character removed.
     * Repeated characters are kept, so the length of the result is the number
     * of Han characters in $str.
     *
     * @throws \RuntimeException when the regular expression engine fails
     *                           (e.g. $str is not valid UTF-8)
     */
    private static function getOnlyKanji(string $str): string
    {
        $onlyKanji = preg_replace('/[^\p{Script=Han}]/u', '', $str);

        // preg_replace() returns null on failure, which would violate the
        // declared string return type.
        if ($onlyKanji === null) {
            throw new \RuntimeException(sprintf(
                'Could not filter kanji: %s',
                preg_last_error_msg(),
            ));
        }

        return $onlyKanji;
    }

    /**
     * Registers every character of $subject that is not yet a key of $ref.
     *
     * @param array<string, int> $ref     set of already seen characters (used as keys);
     *                                    new characters are added in place with value 0
     * @param string             $subject characters to check, one multibyte character at a time
     *
     * @return bool true when at least one character of $subject was not yet in $ref
     */
    private static function kanjiDiff(array &$ref, string $subject): bool
    {
        $hasUnseenKanji = false;

        foreach (mb_str_split($subject) as $char) {
            // Direct key lookup instead of scanning every key of $ref.
            if (array_key_exists($char, $ref)) {
                continue;
            }

            $ref[$char] = 0;
            $hasUnseenKanji = true;
        }

        return $hasUnseenKanji;
    }

    protected function configure(): void
    {
        // The command takes no arguments or options yet.
    }

    /**
     * Fetches all sentence notes, counts in how many sentences each known
     * kanji appears, rates every known term by the frequency of its kanji and
     * prints the terms that introduce at least one not-yet-seen kanji.
     *
     * Steps:
     *  1. index the distinct terms of all notes and the kanji they contain;
     *  2. for every note, increment the counter of each known kanji present
     *     in its "SentKanji" field;
     *  3. rate each term: sum of ceil(kanji count / number of kanji in the term);
     *  4. walk the terms from highest to lowest rating and drop those whose
     *     kanji were all already seen in a higher rated term;
     *  5. print the remaining terms in ascending rating order plus memory usage.
     *
     * @throws \Exception when the service returns something that is not a SentenceNote
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        printf('Getting all SentenceCards...');
        $allIds = $this->ankiService->getAllSentenceNoteIds();
        $allNotes = $this->ankiService->getNotes($allIds);
        printf(" OK (%d)\n", count($allNotes));

        printf('Indexing all terms...');
        $knownTerms = [];
        $knownKanji = [];
        $termCounts = [];
        foreach ($allNotes as $note) {
            if (!$note instanceof SentenceNote) {
                // get_debug_type() also works for non-objects, where
                // `$note::class` would raise its own TypeError.
                throw new \Exception(sprintf(
                    'Expected SentenceNote, got %s',
                    get_debug_type($note),
                ));
            }

            // Iterate by value: the former by-reference loop left a dangling
            // reference behind that later loops wrote through.
            foreach ($note->getTerms() as $term) {
                assert($term instanceof Term);

                $termKey = $term->getKanji();
                if (array_key_exists($termKey, $knownTerms)) {
                    continue;
                }

                $termCounts[$termKey] = 0;
                $knownTerms[$termKey] = $term;
                foreach (self::extractKanji($termKey) as $kanji) {
                    $knownKanji[$kanji] = 0;
                }
            }
        }
        printf(" OK (%d)\n", count($knownTerms));

        // The loop below walks the notes, so progress is measured against the
        // number of notes (not the number of terms).
        $total = count($allNotes);
        $processed = 0;
        foreach ($allNotes as $note) {
            $processed++;
            if ($processed % 12 === 0 || $processed === $total) {
                // "\33[2K\r" clears the current terminal line and returns the cursor
                printf(
                    "\33[2K\r% 7d/% 7d | %.2f GiB | Getting frequencies",
                    $processed,
                    $total,
                    memory_get_usage() / 1024 / 1024 / 1024
                );
            }

            assert($note instanceof SentenceNote);

            // Drop markup and U+200E (left-to-right mark) before matching
            $sentKanji = str_replace(
                "\u{200E}",
                '',
                strip_tags($note->getFields()['SentKanji'])
            );

            foreach ($knownKanji as $kanji => &$kanjiCount) {
                if (str_contains($sentKanji, (string) $kanji)) {
                    $kanjiCount++;
                }
            }
            // Break the reference, otherwise any later assignment to the loop
            // variable would overwrite the last element of $knownKanji.
            unset($kanjiCount);
        }
        printf("\n");

        printf('Rating terms...');
        foreach ($knownTerms as $knownTerm) {
            $termKey = $knownTerm->getKanji();
            $termKanji = self::getOnlyKanji($termKey);
            // max(..., 1) guards against terms without any kanji
            $weight = 1 / max(mb_strlen($termKanji), 1);

            // First pass: Calculate the weight
            foreach ($knownKanji as $kanji => $frequency) {
                if (str_contains($termKanji, (string) $kanji)) {
                    $termCounts[$termKey] += ceil($frequency * $weight);
                }
            }
        }

        // Highest rated terms first, so they get to "claim" their kanji first
        arsort($termCounts);

        $seenKanji = [];
        foreach (array_keys($termCounts) as $termKey) {
            $termKanji = self::getOnlyKanji((string) $termKey);

            // Second pass: Penalize terms with no new kanji at all
            if (!self::kanjiDiff($seenKanji, $termKanji)) {
                unset($termCounts[$termKey]);
            }
        }
        printf(" OK\n");

        asort($termCounts);
        foreach ($termCounts as $termKey => $rating) {
            $termKanji = self::getOnlyKanji((string) $termKey);
            printf("%s: %d\n", $termKey, $rating);

            $len = mb_strlen($termKanji);
            foreach (mb_str_split($termKanji) as $iKanji) {
                printf(" - %s: %0.2f\n", $iKanji, $knownKanji[$iKanji] / $len);
            }

            printf("\n");
        }

        // NOTE(review): dump() is a debugging helper; confirm it is available
        // in the environment this command is meant to run in.
        dump(count($termCounts));

        printf(
            <<<FMNT
            max usage: %0.2f MiB
            current usage: %0.2f MiB\n
            FMNT,
            memory_get_peak_usage() / 1024 / 1024,
            memory_get_usage() / 1024 / 1024,
        );

        return Command::SUCCESS;
    }
}
|
Loading…
Reference in New Issue