173 lines
		
	
	
		
			5.8 KiB
		
	
	
	
		
			PHP
		
	
	
	
			
		
		
	
	
			173 lines
		
	
	
		
			5.8 KiB
		
	
	
	
		
			PHP
		
	
	
	
<?php
 | 
						||
 | 
						||
namespace App\Entity;
 | 
						||
 | 
						||
/**
 * A vocabulary term together with its optional Japanese and English
 * definitions.
 *
 * A term is serialised as one "vocab definition line". Which separator follows
 * the term tells the parser which language the definition is in:
 *
 *   言葉: word                  ASCII colon        -> English definition
 *   上げる：上に動くこと。          full-width colon   -> Japanese definition
 *   雨：水粒を降ること。(rain)     full-width colon   -> Japanese, English in parentheses
 *
 * The full-width characters are written as "\u{...}" escapes on purpose: they
 * look almost identical to their ASCII counterparts and are easily folded into
 * them by Unicode-normalising tools, which silently breaks the parser.
 */
class Term
{
    /** Full-width colon (U+FF1A): separates a term from a Japanese definition. */
    private const JP_SEPARATOR = "\u{FF1A}";

    /** ASCII colon: separates a term from an English definition. */
    private const EN_SEPARATOR = ':';

    /** Placeholders (ASCII and full-width low line) that mean "no definition". */
    private const BLANK_DEFINITIONS = ['', '_', "\u{FF3F}"];

    /** Term in furigana notation, e.g. "漢[かん]字[じ]". */
    public ?string $kanji = null;

    /** Japanese definition, or null when there is none. */
    public ?string $definitionJp = null;

    /** English definition, or null when there is none. */
    public ?string $definitionEn = null;

    /**
     * Reading of the term with every bracketed reading substituted for its
     * base text.
     *
     * @return string|null null when the term carries no reading that differs
     *                     from its base text (or when no term is set)
     */
    public function getReading(): ?string
    {
        return self::parseFurigana($this->kanji ?? '')['reading'];
    }

    /**
     * The term with all bracketed readings removed.
     *
     * @return string empty string when no term is set
     */
    public function getKanji(): string
    {
        return self::parseFurigana($this->kanji ?? '')['kanji'];
    }

    /**
     * Splits furigana notation ("漢[かん]字[じ]") into base text and reading.
     *
     * @param string $furigana space-separated chunks, each optionally followed
     *                         by a reading in square brackets
     *
     * @return array{kanji: string, reading: string|null} 'reading' is null when
     *         it would be identical to the base text
     */
    public static function parseFurigana(string $furigana): array
    {
        // 0: all, 1: (kanji/hiragana), 2: ([reading]), 3: (reading)
        preg_match_all('/([^ \[]+)(\[([^\]]*)\])? ?/', $furigana, $matches, PREG_SET_ORDER);

        $matchedKanji   = array_map(static fn (array $x): string => $x[1], $matches);
        // Group 3 is absent from the match set when the chunk has no brackets;
        // such a chunk reads as itself.
        $matchedReading = array_map(static fn (array $x): string => $x[3] ?? $x[1], $matches);

        return [
            'kanji'   => join('', $matchedKanji),
            // Strict comparison: with `==` numeric-looking chunks compare by
            // value ('1e3' == '1000'), which would discard a genuine reading.
            'reading' => $matchedKanji === $matchedReading
                ? null
                : join('', $matchedReading),
        ];
    }

    /**
     * Renders the term as an HTML vocab definition line that
     * fromNoteFields() can parse back.
     *
     * The separator is chosen to match the parser: full-width colon when a
     * Japanese definition is present, ASCII colon otherwise.
     */
    public function toAnkiVocabDef(): string
    {
        $hasJp = null !== $this->definitionJp;
        $hasEn = null !== $this->definitionEn;

        $definition = match (true) {
            $hasJp && $hasEn => self::JP_SEPARATOR . '</span>' . $this->definitionJp
                . '<span style="color: #aacebe;">(' . $this->definitionEn . ')</span>',
            $hasJp           => self::JP_SEPARATOR . '</span>' . $this->definitionJp,
            $hasEn           => self::EN_SEPARATOR . '</span> ' . $this->definitionEn,
            default          => self::EN_SEPARATOR . '</span>_',
        };

        return '<span ' . Note::HIGHLIGHT_ATTR_KANJI . '>' . $this->kanji . $definition;
    }

    /**
     * Parses one tag-free vocab definition line into a Term.
     *
     * @return Term|null null when the line contains neither separator and so
     *                   does not follow the "<term><separator><definition>" shape
     */
    private static function fromVocabDefLine(string $vocabDefLine): ?Term
    {
        // ------------------------------------------------------ Get Kanji ---

        $jpStart = mb_strpos($vocabDefLine, self::JP_SEPARATOR);
        $enStart = mb_strpos($vocabDefLine, self::EN_SEPARATOR);

        // The Japanese separator wins when both are present: an ASCII colon
        // may legitimately appear inside a Japanese definition.
        $isJapanese = false !== $jpStart;
        $splitAt    = $isJapanese ? $jpStart : $enStart;

        if (false === $splitAt) {
            // Can't extract term from definition, it doesn't conform to the
            // established pattern.
            return null;
        }

        $term = new Term();

        // Convert 「this」 into [this] and ideographic spaces into plain ones,
        // which is the notation parseFurigana() understands.
        $term->kanji = mb_trim(strtr(mb_substr($vocabDefLine, 0, $splitAt), [
            '「'        => '[',
            '」'        => ']',
            "\u{3000}" => ' ',
        ]));
        $def = mb_trim(mb_substr($vocabDefLine, $splitAt + 1));

        // -------------------------------------------------- No definition ---

        if (in_array($def, self::BLANK_DEFINITIONS, true)) {
            return $term;
        }

        // --------------------------------------------------- Only english ---

        if (!$isJapanese) {
            $term->definitionEn = $def;

            return $term;
        }

        // -------------------------------------------------- Only Japanese ---

        // An opening parenthesis means an English gloss follows the Japanese.
        $parentStart = mb_strpos($def, '(');

        if (false === $parentStart) {
            $term->definitionJp = $def;

            return $term;
        }

        // -------------------------------------- Both Japanese and English ---

        $english = mb_substr($def, $parentStart + 1);
        // Drop the closing parenthesis, but only if it is really there, so a
        // malformed line does not lose its last character.
        if (str_ends_with($english, ')')) {
            $english = mb_substr($english, 0, -1);
        }

        $term->definitionJp = mb_trim(mb_substr($def, 0, $parentStart));
        $term->definitionEn = mb_trim($english);

        return $term;
    }

    /** Turns a def like "<def>" into "<kanji><separator><def>" and parses it. */
    private static function fromLegacyLine(string $kanji, string $def): ?Term
    {
        // Select the appropriate colon for the job.
        // NOTE(review): with the `u` modifier PHP also enables Unicode
        // properties, so [[:alpha:]] matches kana/kanji as well as Latin
        // letters - confirm that legacy definitions are expected to be English.
        $separator = preg_match('/[[:alpha:]]/u', $def)
            ? self::EN_SEPARATOR . ' '
            : self::JP_SEPARATOR;

        // Stick the kanji at the start and see if it makes sense
        return self::fromVocabDefLine($kanji . $separator . $def);
    }

    /**
     * Builds the terms described by a note's fields.
     *
     * @param array $fields reads 'VocabDef' and, for the legacy fallback,
     *                      'VocabKanji'; a missing key is treated as ''
     *
     * @return list<Term>|null null when neither syntax could be parsed
     */
    public static function fromNoteFields(array $fields): ?array
    {
        $vocabDef   = $fields['VocabDef'] ?? '';
        $vocabKanji = $fields['VocabKanji'] ?? '';

        // -------------------- Trying to extract it with the modern syntax ---
        // One "<term><separator><definition>" line per <br>.

        $terms = [];
        foreach (preg_split('|<br ?/?>|', $vocabDef) as $line) {
            $terms[] = self::fromVocabDefLine(strip_tags($line));
        }
        // If there's no nulls, everything went good
        if (!in_array(null, $terms, true)) {
            return $terms;
        }

        // ------------ Extracting failed, try to infer from other syntaxes ---
        // Legacy notes keep pipe-separated kanji and definitions in two fields.

        $kanjis = explode('|', $vocabKanji);
        $defs   = explode('|', $vocabDef);
        // Number of legacy definitions is different from number of kanji
        if (count($kanjis) !== count($defs)) {
            return null;
        }

        // Pair by position. Keying the definitions by kanji (array_combine)
        // would silently drop a term whenever the same kanji appears twice.
        $terms = [];
        foreach ($kanjis as $index => $kanji) {
            $terms[] = self::fromLegacyLine($kanji, $defs[$index]);
        }

        // A single unparsable entry invalidates the whole note.
        return in_array(null, $terms, true) ? null : $terms;
    }
}
 |