feat: Interpret terms from non-conformant definitions

2025-02-06 20:02:50 +09:00 · 2025-02-06 20:02:50 +09:00 · c19250e29e
parent 6fc6307f6e
commit c19250e29e
2 changed files with 37 additions and 41 deletions
--- a/src/Entity/Note.php
+++ b/src/Entity/Note.php
@ -67,39 +67,7 @@ class Note
        $note->mediaInfo = $note->parseMediaInfo($note->fields['Notes']);
        // Set VocabKanji field
-        $terms = Term::fromVocabDef($note->fields['VocabDef']);
+        $terms = Term::fromNoteFields($note->fields);
        if (null !== $terms) {
            $note->terms = $terms;
        } else {
            // Something went wrong when trying to parse the definitions into
            // terms, that means its format is non-conforming. If there's only
            // one term that means that it's an old one that wasn't updated.
            if (
                str_contains($note->fields['VocabKanji'], '｜') or
                str_contains($note->fields['VocabKanji'], '|')
            ) {
                dump("ERROR: Multiple vocab kanjis with no proper definition.");
                dd($note->fields);
            }
            if (mb_trim($note->fields['VocabDef']) === '') {
                $note->fields['VocabDef'] = '＿';
            }
            // Make the "<def>" turn into "<kanji>:<def>". Select the
            // appropriate semicolon character for each
            $separator = '：';
            if (preg_match('/[[:alpha:]]/u', $note->fields['VocabDef'])) {
                $separator = ':';
            }
            $note->fields['VocabDef'] = $note->fields['VocabKanji']
                . $separator
                . $note->fields['VocabDef'];
            $terms = Term::fromVocabDef($note->fields['VocabDef']);
            $note->terms = $terms ?? dd($note->fields['VocabDef']);
        }
        // If not defined, find them from the highlighted parts in the sentence
        if (empty($note->terms)) {
--- a/src/Entity/Term.php
+++ b/src/Entity/Term.php
@ -127,18 +127,46 @@ class Term
        dd("Unexpected error, couldn't parse definition line", $vocabDefLine);
    }
-    public static function fromVocabDef(string $vocabDef): ?array
+    /** Turns a def like "<def>" into "<kanji>:<def>" and parses the result as a term */
    private static function fromLegacyLine(string $kanji, string $def): ?Term
    {
-        if (mb_trim($vocabDef) === '') return null;
+        // Select the appropriate colon (ASCII ': ' or full-width '：') for the job
        $separator = preg_match('/[[:alpha:]]/u', $def)
            ? ': '
            : '：';
        // Stick the kanji at the start and see if it makes sense
        return Term::fromVocabDefLine($kanji . $separator . $def);
    }
    public static function fromNoteFields(array $fields): ?array
    {
        // -------------------- Trying to extract it with the modern syntax ---
        // 言葉: word
        // 上げる：上に動くこと。
        // 雨：水粒を降ること。(rain)
        $terms = [];
-        foreach (preg_split('|<br ?/?>|', $vocabDef) as $line) {
+        foreach (preg_split('|<br ?/?>|', $fields['VocabDef']) as $line) {
-            $term = self::fromVocabDefLine(strip_tags($line));
+            $terms[] = self::fromVocabDefLine(strip_tags($line));
            // Error parsing term, can't parse using vocabDef
            if (null === $term) return null;
            $terms[] = $term;
        };
        // If there's no nulls, everything went good
        if (!in_array(null, $terms, true)) return $terms;
-        return $terms;
+
        // ------------ Extracting failed, try to infer from other syntaxes ---
        $kanjis = explode('｜', $fields['VocabKanji']);
        $defs   = explode('｜', $fields['VocabDef']);
        // Number of legacy definitions is different from number of kanji
        if (count($kanjis) !== count($defs)) return null;
        $terms = [];
        foreach (array_combine($kanjis, $defs) as $kanji => $def) {
            $terms[] = self::fromLegacyLine($kanji, $def);
        }
        // Search for nulls, if found, it's owari da
        return in_array(null, $terms, true) ? null : $terms;
    }
 }