feat: Interpret terms from non-conformant definitions

This commit is contained in:
Dendy 2025-02-06 20:02:50 +09:00
parent 6fc6307f6e
commit c19250e29e
2 changed files with 37 additions and 41 deletions

View File

@ -67,39 +67,7 @@ class Note
$note->mediaInfo = $note->parseMediaInfo($note->fields['Notes']); $note->mediaInfo = $note->parseMediaInfo($note->fields['Notes']);
// Set VocabKanji field // Set VocabKanji field
$terms = Term::fromVocabDef($note->fields['VocabDef']); $terms = Term::fromNoteFields($note->fields);
if (null !== $terms) {
$note->terms = $terms;
} else {
// Something went wrong when trying to parse the definitions into
// terms, that means its format is non-conforming. If there's only
// one term that means that it's an old one that wasn't updated.
if (
str_contains($note->fields['VocabKanji'], '') or
str_contains($note->fields['VocabKanji'], '|')
) {
dump("ERROR: Multiple vocab kanjis with no proper definition.");
dd($note->fields);
}
if (mb_trim($note->fields['VocabDef']) === '') {
$note->fields['VocabDef'] = '_';
}
// Make the "<def>" turn into "<kanji>:<def>". Select the
// appropriate semicolon character for each
$separator = '';
if (preg_match('/[[:alpha:]]/u', $note->fields['VocabDef'])) {
$separator = ':';
}
$note->fields['VocabDef'] = $note->fields['VocabKanji']
. $separator
. $note->fields['VocabDef'];
$terms = Term::fromVocabDef($note->fields['VocabDef']);
$note->terms = $terms ?? dd($note->fields['VocabDef']);
}
// If not defined, find them from the highlighted parts in the sentence // If not defined, find them from the highlighted parts in the sentence
if (empty($note->terms)) { if (empty($note->terms)) {

View File

@ -127,18 +127,46 @@ class Term
dd("Unexpected error, couldn't parse definition line", $vocabDefLine); dd("Unexpected error, couldn't parse definition line", $vocabDefLine);
} }
public static function fromVocabDef(string $vocabDef): ?array /** Turns a def like "<def>" into "<kanji>:<def>" */
private static function fromLegacyLine(string $kanji, string $def): ?Term
{ {
if (mb_trim($vocabDef) === '') return null; // Select the appropriate separator (a colon, or nothing) for the job
$separator = preg_match('/[[:alpha:]]/u', $def)
? ': '
: '';
// Stick the kanji at the start and see if it makes sense
return Term::fromVocabDefLine($kanji . $separator . $def);
}
public static function fromNoteFields(array $fields): ?array
{
// -------------------- Trying to extract it with the modern syntax ---
// 言葉: word
// 上げる:上に動くこと。
// 雨:水粒を降ること。(rain)
$terms = []; $terms = [];
foreach (preg_split('|<br ?/?>|', $vocabDef) as $line) { foreach (preg_split('|<br ?/?>|', $fields['VocabDef']) as $line) {
$term = self::fromVocabDefLine(strip_tags($line)); $terms[] = self::fromVocabDefLine(strip_tags($line));
// Error parsing term, can't parse using vocabDef
if (null === $term) return null;
$terms[] = $term;
}; };
// If there's no nulls, everything went good
if (!in_array(null, $terms, true)) return $terms;
return $terms;
// ------------ Extracting failed, try to infer from other syntaxes ---
$kanjis = explode('', $fields['VocabKanji']);
$defs = explode('', $fields['VocabDef']);
// Number of legacy definitions is different from number of kanji
if (count($kanjis) !== count($defs)) return null;
$terms = [];
foreach (array_combine($kanjis, $defs) as $kanji => $def) {
$terms[] = self::fromLegacyLine($kanji, $def);
}
// Search for nulls, if found, it's owari da
return in_array(null, $terms, true) ? null : $terms;
} }
} }