feat: Implement chinese sentence parsing & production creation

This commit is contained in:
Dendy 2025-11-08 10:22:47 +01:00
parent ca6ea10f5d
commit e8dab956cf
7 changed files with 383 additions and 5 deletions

View File

@ -0,0 +1,73 @@
<?php
namespace App\Command;
use App\Entity\ChineseProductionNote;
use App\Entity\ChineseSentenceNote;
use App\Entity\SentenceNote;
use App\Service\AnkiService;
use Symfony\Component\Console\Attribute\AsCommand;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Output\OutputInterface;
#[AsCommand('app:create:chinese:production', 'Create new listening Anki Cards')]
class CreateChineseProductionCommand extends Command
{
public function __construct(
private AnkiService $ankiService,
) {
parent::__construct();
}
protected function configure(): void
{
//$this->addArgument(
// 'count',
// InputArgument::REQUIRED,
// 'Amount of cards to make',
//);
}
protected function execute(InputInterface $input, OutputInterface $output): int
{
$csns = $this->ankiService->getAllFromClass(ChineseSentenceNote::class);
$cpns = $this->ankiService->getAllFromClass(ChineseProductionNote::class);
$existentTerms = [];
foreach ($cpns as $productionNote) {
$existentTerms[$productionNote->getTerm()->getKanji()] = null;
}
////$newNotesCount = intval($input->getArgument('count'));
$newProductionNotes = [];
foreach ($csns as $sentenceNote) {
foreach ($sentenceNote->getTerms() as $term) {
$termStr = $term->getKanji();
if (key_exists($termStr, $existentTerms)) continue;
$existentTerms[$term->getKanji()] = null;
$newProductionNotes[] = ChineseProductionNote::fromNote($sentenceNote, $term);
//if (count($newProductionNotes) >= $newNotesCount) break 2;
}
}
foreach ($newProductionNotes as $newNote) {
$this->ankiService->addNote($newNote);
}
printf(
<<<FMNT
max usage: %0.2f MiB
current usage: %0.2f MiB\n
FMNT,
memory_get_peak_usage() / 1024 / 1024,
memory_get_usage() / 1024 / 1024,
);
return Command::SUCCESS;
}
}

View File

@ -0,0 +1,90 @@
<?php
namespace App\Entity;
use App\Utils\Japanese;
class ChineseProductionNote extends Note
{
const MODEL_NAME = 'Chinese production';
const DECK = '汉语::汉字';
private ?array $mediaInfo = [];
private ?Term $term = null;
public static function fromNote(Note $origNote, Term $term): self
{
$slNote = new self();
foreach (get_object_vars($origNote) as $prop => $value) {
$slNote->$prop = $value;
}
// Related fields are updated using the setter
$slNote->setTerm($term);
// Reset relations and basic data
$slNote->id = null;
$slNote->model = self::MODEL_NAME;
$slNote->cardIds = [];
return $slNote;
}
// -------------------------------------------------- Getters & setters ---
public function getTerm(): Term
{
return $this->term;
}
public function setTerm(Term $term): static
{
$hanzi = Japanese::getKanjiList($term->getKanji());
$pinyin = explode(' ', $term->getReading());
foreach ($pinyin as $key => $iPinyin) {
$color = ChineseTone::fromPinyin($iPinyin)->getColor();
$spanTmpl = '<span style="color: %s;">%s</span>';
$hanzi[$key] = sprintf($spanTmpl, $color, $hanzi[$key]);
$pinyin[$key] = sprintf($spanTmpl, $color, $iPinyin);
}
if (!isset($term->audio)) dd($term);
$this->fields['VocabHanzi'] = implode('', $hanzi);
$this->fields['VocabPinyin'] = implode(' ', $pinyin);
$this->fields['VocabDef'] = $term->toAnkiVocabDef();
$this->fields['VocabAudio'] = $term->audio;
$this->fields['SentHanzi'] = Note::stringHighlight(
$this->fields['SentHanzi'],
$term->getKanji(),
);
$this->term = $term;
return $this;
}
// ------------------------------------------------------- Anki-related ---
/** @param array<string, string> $noteInfo */
public static function fromAnki(array $noteInfo): static
{
$note = parent::fromAnki($noteInfo);
if ($note->getModel() !== self::MODEL_NAME) {
throw new \Exception('Trying to parse wrong model');
}
$note->mediaInfo = Note::parseMediaInfo($note->fields['Notes']);
// Set VocabKanji field
$note->term = Term::fromNoteFields($note->fields)[0] ?? null;
if ($note->term === null) {
throw new \Exception("Couldn't get term for Listening card");
}
return $note;
}
// ---------------------------------------------------- Derived methods ---
}

View File

@ -0,0 +1,172 @@
<?php
namespace App\Entity;
use App\Utils\Japanese;
class ChineseSentenceNote extends Note
{
const MODEL_NAME = 'Chinese sentences';
const DECK = '汉语::朗读';
private ?array $mediaInfo = [];
/** @var list<Term> */
private array $terms = [];
// -------------------------------------------------- Getters & setters ---
/** @return list<Term> */
public function getTerms(): array
{
return $this->terms;
}
/** @param list<Term> $terms */
public function setTerms(array $terms): static
{
$this->terms = $terms;
return $this;
}
// ------------------------------------------------------- Anki-related ---
/** @param array<string, string> $noteInfo */
public static function fromAnki(array $noteInfo): static
{
$note = parent::fromAnki($noteInfo);
if ($note->getModel() !== self::MODEL_NAME) {
throw new \Exception('Trying to parse wrong model');
}
$note->mediaInfo = Note::parseMediaInfo($note->fields['Notes']);
// Set VocabKanji field
$note->terms = Term::fromNoteFields($note->fields);
// If unable to, create them from the highlighted parts in the sentence
if (empty($note->terms)) {
foreach ($note->getHighlightedKanji() as $highlighedKanji) {
$term = new Term();
$term->kanji = $highlighedKanji;
$term->definitionEn = null;
$term->definitionJp = null;
$note->terms[] = $term;
}
}
// Set to null whatever is null
$readings = array_map(
fn($x) => in_array($x, ['_', '_', '']) ? null : $x,
explode('', $note->fields['VocabPinyin']),
);
if (count($readings) !== count($note->terms)) throw new \Exception(sprintf(
<<<FMT
Number of terms and readings is not equal.
Expected %d readings, got %d.
Hanzi: "%s"
Readings: "%s",
FMT,
count($note->terms),
count($readings),
implode('|', array_map(fn(Term $x) => $x->kanji, $note->terms)),
implode('|', $readings),
));
// Set readings from furigana field
foreach ($note->terms as $key => &$term) {
// already has a reading
if (null !== $term->getReading()) continue;
// reading not specified
$reading = $readings[$key] ?? null;
if (null === $reading) continue;
$reading = explode(' ', $reading);
if (count($reading) !== mb_strlen($term->kanji))
throw new \Exception(sprintf(
'Number of hanzi & readings not equal. "%s" & "%s".',
$term->kanji,
implode(' ', $reading),
));
$term->kanji = implode(' ', array_map(
fn(string $hanzi, string $pinyin) => "{$hanzi}[{$pinyin}]",
Japanese::getKanjiList($term->kanji),
$reading,
));
}
return $note;
}
/** @return mixed[] */
public function toAnki(): array
{
return array_merge(parent::toAnki(), [
'fields' => [
'VocabKanji' => join('', array_map(
fn(Term $x) => $x->getKanji(),
$this->terms,
)),
'VocabFurigana' => join('', array_map(
fn(Term $x) => $x->getReading() ?? '_',
$this->terms,
)),
'VocabDef' => join("<br>\n", array_map(
fn(Term $x) => $x->toAnkiVocabDef(),
$this->terms,
)),
],
]);
}
// ---------------------------------------------------- Derived methods ---
public function hasTerm(string $kanji): bool
{
foreach ($this->terms as $term) {
if ($term->kanji == $kanji) return true;
}
return false;
}
public function isSentKanjiHighlighted(): bool
{
return str_contains(
$this->fields['SentKanji'],
self::HIGHLIGHT_ATTR_KANJI,
);
}
/**
* Return an array of strings with the highlighted kanji in the SentKanji
* @return list<string>
*/
public function getHighlightedKanji(): array
{
$ret = [];
$matches = [];
// 1. Get all spans in the text
preg_match_all(
self::HIGHLIGHT_PATTERN,
$this->fields['SentKanji'],
$matches,
PREG_SET_ORDER,
);
// 2. Check the ones that match with the kanji color
foreach ($matches as $match) {
if ($match[1] === self::HIGHLIGHT_ATTR_KANJI) {
$ret[] = mb_trim($match[2]);
}
}
return $ret;
}
}

View File

@ -0,0 +1,34 @@
<?php
namespace App\Entity;
enum ChineseTone: int
{
case flat = 1;
case rising = 2;
case dip = 3;
case falling = 4;
case neutral = 5;
public static function fromPinyin(string $pinyin): self
{
return match (1) {
preg_match('/[āēīōūǖĀĒĪŌŪǕ]/u', $pinyin) => self::flat,
preg_match('/[áéíóúǘÁÉÍÓÚǗ]/u', $pinyin) => self::rising,
preg_match('/[ǎěǐǒǔǚǍĚǏǑǓǙ]/u', $pinyin) => self::dip,
preg_match('/[àèìòùǜÀÈÌÒÙǛ]/u', $pinyin) => self::falling,
default => self::neutral,
};
}
public function getColor(): string
{
return match ($this) {
self::flat => 'red',
self::rising => 'darkorange',
self::dip => 'forestgreen',
self::falling => 'darkcyan',
self::neutral => 'inherit',
};
}
}

View File

@ -98,8 +98,10 @@ class Note
// -------------------------------------------------- Utility functions ---
protected static function stringHighlight(string $haystack, string $needle): string
{
protected static function stringHighlight(
string $haystack,
string $needle,
): string {
$replace = sprintf(
'<span %s>%s</span>',
self::HIGHLIGHT_ATTR_KANJI,

View File

@ -34,11 +34,14 @@ class Term
$matchedKanji = array_map(fn($x) => $x[1], $matches);
$matchedReading = array_map(fn($x) => $x[3] ?? $x[1], $matches);
$isJapanese = preg_match('/^[\x{3040}-\x{309F}]/u', implode('', $matchedReading)) === 1;
$separator = $isJapanese ? '' : ' ';
return [
'kanji' => join('', $matchedKanji),
'reading' => $matchedKanji == $matchedReading
'reading' => $matchedKanji === $matchedReading
? null
: join('', $matchedReading),
: join($separator, $matchedReading),
];
}
@ -149,7 +152,9 @@ class Term
*/
public static function fromNoteFields(array $fields): array
{
$audios = explode('|', $fields['VocabAudio'] ?? '');
// Get audio array with sanitization of wide characters
$vocabAudio = str_replace('', '|', $fields['VocabAudio'] ?? '');
$audios = explode('|', $vocabAudio);
// -------------------- Trying to extract it with the modern syntax ---
// 言葉: word

View File

@ -2,6 +2,7 @@
namespace App\Service;
use App\Entity\ChineseSentenceNote;
use App\Entity\KoreanProductionNote;
use App\Entity\KoreanSentenceNote;
use App\Entity\Note;
@ -154,6 +155,7 @@ class AnkiService
SentenceListeningNote::MODEL_NAME => SentenceListeningNote::fromAnki($noteInfo),
KoreanSentenceNote::MODEL_NAME => KoreanSentenceNote::fromAnki($noteInfo),
KoreanProductionNote::MODEL_NAME => KoreanProductionNote::fromAnki($noteInfo),
ChineseSentenceNote::MODEL_NAME => ChineseSentenceNote::fromAnki($noteInfo),
default => throw new \Exception(sprintf(
'Unrecognized Note "%s" of type "%s"',
$noteInfo['noteId'],