feat: Implement Chinese sentence parsing & production creation
This commit is contained in:
parent
ca6ea10f5d
commit
e8dab956cf
|
|
@ -0,0 +1,73 @@
|
|||
<?php
|
||||
|
||||
namespace App\Command;
|
||||
|
||||
use App\Entity\ChineseProductionNote;
|
||||
use App\Entity\ChineseSentenceNote;
|
||||
use App\Entity\SentenceNote;
|
||||
use App\Service\AnkiService;
|
||||
|
||||
use Symfony\Component\Console\Attribute\AsCommand;
|
||||
use Symfony\Component\Console\Command\Command;
|
||||
use Symfony\Component\Console\Input\InputInterface;
|
||||
use Symfony\Component\Console\Output\OutputInterface;
|
||||
|
||||
#[AsCommand('app:create:chinese:production', 'Create new Chinese production Anki cards')]
class CreateChineseProductionCommand extends Command
{
    public function __construct(
        private AnkiService $ankiService,
    ) {
        parent::__construct();
    }

    /**
     * This command currently declares no arguments or options.
     */
    protected function configure(): void
    {
    }

    /**
     * Create a production note for every term that appears in a Chinese
     * sentence note but does not have a production note yet.
     *
     * All new notes are built first and only then sent to Anki, so a failure
     * while building them does not leave a partially-imported batch behind.
     * Peak and current memory usage are reported on the console output.
     *
     * @return int Command::SUCCESS once every new note has been added
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        $sentenceNotes = $this->ankiService->getAllFromClass(ChineseSentenceNote::class);
        // NOTE(review): the AnkiService note factory hunk in this change only
        // registers ChineseSentenceNote; confirm ChineseProductionNote is
        // registered there too, otherwise this call cannot parse those notes.
        $productionNotes = $this->ankiService->getAllFromClass(ChineseProductionNote::class);

        // Terms are stored as array keys so membership checks are O(1).
        $knownTerms = [];
        foreach ($productionNotes as $productionNote) {
            $knownTerms[$productionNote->getTerm()->getKanji()] = true;
        }

        $newProductionNotes = [];
        foreach ($sentenceNotes as $sentenceNote) {
            foreach ($sentenceNote->getTerms() as $term) {
                $termStr = $term->getKanji();

                if (array_key_exists($termStr, $knownTerms)) {
                    continue;
                }

                // Remember the term so a later sentence with the same term
                // does not produce a duplicate production note.
                $knownTerms[$termStr] = true;
                $newProductionNotes[] = ChineseProductionNote::fromNote($sentenceNote, $term);
            }
        }

        foreach ($newProductionNotes as $newNote) {
            $this->ankiService->addNote($newNote);
        }

        // Write through the console output (not printf) so verbosity flags
        // and output capturing in tests are respected.
        $output->writeln(sprintf(
            'max usage: %0.2f MiB',
            memory_get_peak_usage() / 1024 / 1024,
        ));
        $output->writeln(sprintf(
            'current usage: %0.2f MiB',
            memory_get_usage() / 1024 / 1024,
        ));

        return Command::SUCCESS;
    }
}
|
||||
|
|
@ -0,0 +1,90 @@
|
|||
<?php
|
||||
|
||||
namespace App\Entity;
|
||||
|
||||
use App\Utils\Japanese;
|
||||
|
||||
/**
 * Anki note of the "Chinese production" model: a single vocabulary term
 * together with the sentence it was taken from.
 */
class ChineseProductionNote extends Note
{
    public const MODEL_NAME = 'Chinese production';
    public const DECK = '汉语::汉字';

    private ?array $mediaInfo = [];
    private ?Term $term = null;

    /**
     * Build a new production note out of an existing note and one of its terms.
     *
     * Every property of $origNote that is visible from this class is copied,
     * the term-related fields are filled in through setTerm(), and the
     * identity data (id, model, card ids) is reset so the result is a new note.
     *
     * @throws \Exception when the term cannot be used (see setTerm())
     */
    public static function fromNote(Note $origNote, Term $term): self
    {
        $slNote = new self();
        foreach (get_object_vars($origNote) as $prop => $value) {
            $slNote->$prop = $value;
        }

        // Related fields are updated using the setter
        $slNote->setTerm($term);
        // Reset relations and basic data
        $slNote->id = null;
        $slNote->model = self::MODEL_NAME;
        $slNote->cardIds = [];

        return $slNote;
    }

    // -------------------------------------------------- Getters & setters ---

    public function getTerm(): Term
    {
        return $this->term;
    }

    /**
     * Set the term and regenerate the fields derived from it.
     *
     * Each pinyin syllable, and the hanzi at the same position, is wrapped in
     * a span coloured according to the syllable's tone. The term is also
     * highlighted inside the "SentHanzi" field.
     *
     * @throws \Exception when the term has no audio
     */
    public function setTerm(Term $term): static
    {
        // Fail with a catchable, descriptive error instead of a debug dump.
        if (!isset($term->audio)) {
            throw new \Exception(sprintf(
                'Term "%s" has no audio, cannot create a production note from it',
                $term->getKanji(),
            ));
        }

        $hanzi = Japanese::getKanjiList($term->getKanji());
        // A term may have no reading; explode() must not receive null.
        $pinyin = explode(' ', $term->getReading() ?? '');
        $spanTmpl = '<span style="color: %s;">%s</span>';

        foreach ($pinyin as $key => $syllable) {
            $color = ChineseTone::fromPinyin($syllable)->getColor();

            // Only colour hanzi that exist at this position, so a
            // reading/hanzi length mismatch does not append empty spans.
            if (isset($hanzi[$key])) {
                $hanzi[$key] = sprintf($spanTmpl, $color, $hanzi[$key]);
            }
            $pinyin[$key] = sprintf($spanTmpl, $color, $syllable);
        }

        $this->fields['VocabHanzi'] = implode('', $hanzi);
        $this->fields['VocabPinyin'] = implode(' ', $pinyin);
        $this->fields['VocabDef'] = $term->toAnkiVocabDef();
        $this->fields['VocabAudio'] = $term->audio;
        $this->fields['SentHanzi'] = Note::stringHighlight(
            $this->fields['SentHanzi'],
            $term->getKanji(),
        );
        $this->term = $term;

        return $this;
    }

    // ------------------------------------------------------- Anki-related ---

    /**
     * Parse a note coming from Anki and make sure it is a production note
     * carrying a term.
     *
     * @param array<string, string> $noteInfo
     *
     * @throws \Exception when the model is not MODEL_NAME or no term is found
     */
    public static function fromAnki(array $noteInfo): static
    {
        $note = parent::fromAnki($noteInfo);

        if ($note->getModel() !== self::MODEL_NAME) {
            throw new \Exception('Trying to parse wrong model');
        }

        $note->mediaInfo = Note::parseMediaInfo($note->fields['Notes']);

        // A production note holds a single term: the first one parsed
        $note->term = Term::fromNoteFields($note->fields)[0] ?? null;
        if ($note->term === null) {
            throw new \Exception("Couldn't get term for Chinese production card");
        }

        return $note;
    }

    // ---------------------------------------------------- Derived methods ---
}
|
||||
|
|
@ -0,0 +1,172 @@
|
|||
<?php
|
||||
|
||||
namespace App\Entity;
|
||||
|
||||
use App\Utils\Japanese;
|
||||
|
||||
/**
 * Anki note of the "Chinese sentences" model: a sentence plus the list of
 * vocabulary terms studied in it.
 */
class ChineseSentenceNote extends Note
{
    public const MODEL_NAME = 'Chinese sentences';
    public const DECK = '汉语::朗读';

    private ?array $mediaInfo = [];
    /** @var list<Term> */
    private array $terms = [];

    // -------------------------------------------------- Getters & setters ---

    /** @return list<Term> */
    public function getTerms(): array
    {
        return $this->terms;
    }

    /** @param list<Term> $terms */
    public function setTerms(array $terms): static
    {
        $this->terms = $terms;

        return $this;
    }

    // ------------------------------------------------------- Anki-related ---

    /**
     * Parse a note coming from Anki, building its terms and attaching the
     * readings found in the "VocabPinyin" field.
     *
     * "VocabPinyin" holds one reading per term separated by "|"; a reading is
     * made of space-separated syllables, one per hanzi. "_", its full-width
     * form and the empty string mean "no reading for this term".
     *
     * @param array<string, string> $noteInfo
     *
     * @throws \Exception when the model is wrong, or when the number of
     *                    readings/syllables does not match the terms/hanzi
     */
    public static function fromAnki(array $noteInfo): static
    {
        $note = parent::fromAnki($noteInfo);

        if ($note->getModel() !== self::MODEL_NAME) {
            throw new \Exception('Trying to parse wrong model');
        }

        $note->mediaInfo = Note::parseMediaInfo($note->fields['Notes']);

        // Build the terms from the note fields
        $note->terms = Term::fromNoteFields($note->fields);

        // If unable to, create them from the highlighted parts in the sentence
        if (empty($note->terms)) {
            foreach ($note->getHighlightedKanji() as $highlightedKanji) {
                $term = new Term();
                $term->kanji = $highlightedKanji;
                $term->definitionEn = null;
                $term->definitionJp = null;
                $note->terms[] = $term;
            }
        }

        // Accept the full-width pipe as a separator too, then turn every
        // placeholder into null. The comparison is strict on purpose.
        $rawReadings = str_replace('｜', '|', $note->fields['VocabPinyin']);
        $readings = array_map(
            static fn(string $x): ?string => in_array($x, ['_', '＿', ''], true) ? null : $x,
            explode('|', $rawReadings),
        );

        if (count($readings) !== count($note->terms)) {
            throw new \Exception(sprintf(
                <<<FMT
                Number of terms and readings is not equal.
                Expected %d readings, got %d.

                Hanzi: "%s"
                Readings: "%s",
                FMT,
                count($note->terms),
                count($readings),
                implode('|', array_map(static fn(Term $x) => $x->kanji, $note->terms)),
                implode('|', $readings),
            ));
        }

        // Attach the readings to the terms that do not have one yet.
        // Terms are objects, so no by-reference iteration is needed.
        foreach ($note->terms as $key => $term) {
            // already has a reading
            if (null !== $term->getReading()) {
                continue;
            }
            // reading not specified
            $reading = $readings[$key] ?? null;
            if (null === $reading) {
                continue;
            }

            $syllables = explode(' ', $reading);
            if (count($syllables) !== mb_strlen($term->kanji)) {
                throw new \Exception(sprintf(
                    'Number of hanzi & readings not equal. "%s" & "%s".',
                    $term->kanji,
                    implode(' ', $syllables),
                ));
            }

            // Rewrite the term as "hanzi[pinyin]" pairs separated by spaces
            $term->kanji = implode(' ', array_map(
                static fn(string $hanzi, string $pinyin): string => "{$hanzi}[{$pinyin}]",
                Japanese::getKanjiList($term->kanji),
                $syllables,
            ));
        }

        return $note;
    }

    /**
     * Serialize the note for Anki, replacing the "fields" entry produced by
     * the parent with the vocabulary fields derived from the terms.
     *
     * NOTE(review): fromAnki() reads "VocabPinyin" but this writes
     * "VocabKanji"/"VocabFurigana"; confirm these are the field names of the
     * "Chinese sentences" model.
     *
     * @return mixed[]
     */
    public function toAnki(): array
    {
        return array_merge(parent::toAnki(), [
            'fields' => [
                'VocabKanji' => join('|', array_map(
                    static fn(Term $x) => $x->getKanji(),
                    $this->terms,
                )),
                'VocabFurigana' => join('|', array_map(
                    static fn(Term $x) => $x->getReading() ?? '_',
                    $this->terms,
                )),
                'VocabDef' => join("<br>\n", array_map(
                    static fn(Term $x) => $x->toAnkiVocabDef(),
                    $this->terms,
                )),
            ],
        ]);
    }

    // ---------------------------------------------------- Derived methods ---

    /**
     * Tell whether one of the terms has exactly the given raw kanji string.
     */
    public function hasTerm(string $kanji): bool
    {
        foreach ($this->terms as $term) {
            // Strict comparison: loose "==" treats numeric-looking strings
            // such as "1e3" and "1000" as equal.
            if ($term->kanji === $kanji) {
                return true;
            }
        }

        return false;
    }

    /**
     * Tell whether the "SentKanji" field contains the kanji highlight attribute.
     *
     * NOTE(review): ChineseProductionNote reads the sentence from "SentHanzi";
     * confirm "SentKanji" exists on the "Chinese sentences" model.
     */
    public function isSentKanjiHighlighted(): bool
    {
        return str_contains(
            $this->fields['SentKanji'],
            self::HIGHLIGHT_ATTR_KANJI,
        );
    }

    /**
     * Return an array of strings with the highlighted kanji in the SentKanji
     * field, trimmed, in order of appearance.
     *
     * @return list<string>
     */
    public function getHighlightedKanji(): array
    {
        $ret = [];
        $matches = [];

        // 1. Get all spans in the text
        preg_match_all(
            self::HIGHLIGHT_PATTERN,
            $this->fields['SentKanji'],
            $matches,
            PREG_SET_ORDER,
        );

        // 2. Keep the ones whose attribute is the kanji highlight
        foreach ($matches as $match) {
            if ($match[1] === self::HIGHLIGHT_ATTR_KANJI) {
                $ret[] = mb_trim($match[2]);
            }
        }

        return $ret;
    }
}
|
||||
|
|
@ -0,0 +1,34 @@
|
|||
<?php
|
||||
|
||||
namespace App\Entity;
|
||||
|
||||
/**
 * The tones of Mandarin Chinese, backed by their conventional tone number
 * (1-4, with 5 for the neutral tone).
 */
enum ChineseTone: int
{
    case flat = 1;
    case rising = 2;
    case dip = 3;
    case falling = 4;
    case neutral = 5;

    /**
     * Tone number => pattern matching a letter carrying that tone mark.
     *
     * Each class lists the precomposed vowels and syllabic nasals, plus the
     * bare combining mark (macron, acute, caron, grave) so that decomposed
     * (NFD) text is recognised as well.
     */
    private const TONE_MARK_PATTERNS = [
        1 => '/[āēīōūǖĀĒĪŌŪǕ\x{0304}]/u',
        2 => '/[áéíóúǘÁÉÍÓÚǗńŃḿḾ\x{0301}]/u',
        3 => '/[ǎěǐǒǔǚǍĚǏǑǓǙňŇ\x{030C}]/u',
        4 => '/[àèìòùǜÀÈÌÒÙǛǹǸ\x{0300}]/u',
    ];

    /**
     * Detect the tone of a single pinyin syllable.
     *
     * Tone marks are checked first, in tone order, so a string containing
     * several marks resolves to the lowest-numbered tone found. If there is
     * no mark, a trailing digit 1-5 (numeric notation such as "han4") is
     * used. Anything else is the neutral tone.
     */
    public static function fromPinyin(string $pinyin): self
    {
        foreach (self::TONE_MARK_PATTERNS as $tone => $pattern) {
            // preg_match() returns false on invalid UTF-8; treat as no match
            if (preg_match($pattern, $pinyin) === 1) {
                return self::from($tone);
            }
        }

        if (preg_match('/([1-5])$/', $pinyin, $digit) === 1) {
            return self::from((int) $digit[1]);
        }

        return self::neutral;
    }

    /**
     * CSS colour used to display a syllable of this tone.
     */
    public function getColor(): string
    {
        return match ($this) {
            self::flat => 'red',
            self::rising => 'darkorange',
            self::dip => 'forestgreen',
            self::falling => 'darkcyan',
            self::neutral => 'inherit',
        };
    }
}
|
||||
|
|
@ -98,8 +98,10 @@ class Note
|
|||
|
||||
// -------------------------------------------------- Utility functions ---
|
||||
|
||||
protected static function stringHighlight(string $haystack, string $needle): string
|
||||
{
|
||||
protected static function stringHighlight(
|
||||
string $haystack,
|
||||
string $needle,
|
||||
): string {
|
||||
$replace = sprintf(
|
||||
'<span %s>%s</span>',
|
||||
self::HIGHLIGHT_ATTR_KANJI,
|
||||
|
|
|
|||
|
|
@ -34,11 +34,14 @@ class Term
|
|||
$matchedKanji = array_map(fn($x) => $x[1], $matches);
|
||||
$matchedReading = array_map(fn($x) => $x[3] ?? $x[1], $matches);
|
||||
|
||||
$isJapanese = preg_match('/^[\x{3040}-\x{309F}]/u', implode('', $matchedReading)) === 1;
|
||||
$separator = $isJapanese ? '' : ' ';
|
||||
|
||||
return [
|
||||
'kanji' => join('', $matchedKanji),
|
||||
'reading' => $matchedKanji == $matchedReading
|
||||
'reading' => $matchedKanji === $matchedReading
|
||||
? null
|
||||
: join('', $matchedReading),
|
||||
: join($separator, $matchedReading),
|
||||
];
|
||||
}
|
||||
|
||||
|
|
@ -149,7 +152,9 @@ class Term
|
|||
*/
|
||||
public static function fromNoteFields(array $fields): array
|
||||
{
|
||||
$audios = explode('|', $fields['VocabAudio'] ?? '');
|
||||
// Get audio array with sanitization of wide characters
|
||||
$vocabAudio = str_replace('|', '|', $fields['VocabAudio'] ?? '');
|
||||
$audios = explode('|', $vocabAudio);
|
||||
|
||||
// -------------------- Trying to extract it with the modern syntax ---
|
||||
// 言葉: word
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@
|
|||
|
||||
namespace App\Service;
|
||||
|
||||
use App\Entity\ChineseSentenceNote;
|
||||
use App\Entity\KoreanProductionNote;
|
||||
use App\Entity\KoreanSentenceNote;
|
||||
use App\Entity\Note;
|
||||
|
|
@ -154,6 +155,7 @@ class AnkiService
|
|||
SentenceListeningNote::MODEL_NAME => SentenceListeningNote::fromAnki($noteInfo),
|
||||
KoreanSentenceNote::MODEL_NAME => KoreanSentenceNote::fromAnki($noteInfo),
|
||||
KoreanProductionNote::MODEL_NAME => KoreanProductionNote::fromAnki($noteInfo),
|
||||
ChineseSentenceNote::MODEL_NAME => ChineseSentenceNote::fromAnki($noteInfo),
|
||||
default => throw new \Exception(sprintf(
|
||||
'Unrecognized Note "%s" of type "%s"',
|
||||
$noteInfo['noteId'],
|
||||
|
|
|
|||
Loading…
Reference in New Issue