anker/src/Entity/Term.php

173 lines
5.8 KiB
PHP
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
namespace App\Entity;
/**
 * A single vocabulary term: a kanji expression (optionally annotated with
 * furigana in the "漢字[かんじ]" bracket notation) plus optional Japanese and
 * English definitions.
 *
 * Terms are serialised to / parsed from one-line "vocab definitions":
 *
 *   言葉: word                  (ASCII colon       -> English definition)
 *   上げる:上に動くこと。        (full-width colon  -> Japanese definition)
 *   雨:水粒を降ること。(rain)    (full-width colon  -> Japanese + English)
 *
 * All non-ASCII separator characters are written as "\u{...}" escapes so that
 * look-alike or invisible characters cannot silently change in the source.
 */
class Term
{
    /** Full-width colon (U+FF1A): separates a term from a Japanese definition. */
    private const JP_SEPARATOR = "\u{FF1A}";

    /** ASCII colon: separates a term from an English-only definition. */
    private const EN_SEPARATOR = ':';

    /**
     * Item separators accepted in legacy list fields.
     *
     * NOTE(review): the separator literal could not be recovered from the
     * source (it rendered as an empty string, which makes explode() throw).
     * Assumed to be the ideographic comma (U+3001) or the full-width comma
     * (U+FF0C) - confirm against real legacy notes.
     */
    private const LEGACY_SEPARATORS = ["\u{3001}", "\u{FF0C}"];

    /** Term text, possibly with furigana brackets, e.g. "言葉[ことば]". */
    public ?string $kanji = null;

    /** Japanese definition, or null when there is none. */
    public ?string $definitionJp = null;

    /** English definition, or null when there is none. */
    public ?string $definitionEn = null;

    /**
     * Reading (furigana) of the term.
     *
     * @return string|null null when the term carries no bracketed reading
     *                     (or when no term is set)
     */
    public function getReading(): ?string
    {
        return self::parseFurigana($this->kanji ?? '')['reading'];
    }

    /**
     * Term text with every "[reading]" annotation removed; '' when no term
     * is set.
     */
    public function getKanji(): string
    {
        return self::parseFurigana($this->kanji ?? '')['kanji'];
    }

    /**
     * Splits a furigana-annotated string such as "漢字[かんじ]を 読[よ]む"
     * into its base text and its reading.
     *
     * @return array{kanji: string, reading: ?string} 'reading' is null when
     *         the input contains no bracket that changes the text
     */
    public static function parseFurigana(string $furigana): array
    {
        // Groups: 0 = whole chunk, 1 = base text, 2 = "[reading]", 3 = reading.
        // With PREG_SET_ORDER, trailing unmatched groups are omitted, so a
        // chunk without brackets has no index 3 and falls back to its base.
        preg_match_all('/([^ \[]+)(\[([^\]]*)\])? ?/', $furigana, $matches, PREG_SET_ORDER);

        $matchedKanji = array_map(static fn (array $x): string => $x[1], $matches);
        $matchedReading = array_map(static fn (array $x): string => $x[3] ?? $x[1], $matches);

        return [
            'kanji' => implode('', $matchedKanji),
            // Strict comparison: with "==" numeric-looking strings such as
            // "1000" and "1e3" compare equal and the reading would be lost.
            'reading' => $matchedKanji === $matchedReading
                ? null
                : implode('', $matchedReading),
        ];
    }

    /**
     * Renders the term as an HTML vocab-definition line.
     *
     * Japanese (and empty) definitions are introduced by the full-width
     * colon and English-only ones by the ASCII colon, so that the output can
     * be read back by fromNoteFields().
     *
     * @return string
     */
    public function toAnkiVocabDef()
    {
        $ret = '<span ' . Note::HIGHLIGHT_ATTR_KANJI . '>' . $this->kanji;
        $ret .= match ([null !== $this->definitionJp, null !== $this->definitionEn]) {
            [false, false] => self::JP_SEPARATOR . '</span>_',
            [false, true] => self::EN_SEPARATOR . '</span> ' . $this->definitionEn,
            [true, false] => self::JP_SEPARATOR . '</span>' . $this->definitionJp,
            [true, true] => self::JP_SEPARATOR . '</span>' . $this->definitionJp
                . '<span style="color: #aacebe;">(' . $this->definitionEn . ')</span>',
        };

        return $ret;
    }

    /**
     * Parses one tag-free vocab-definition line into a Term.
     *
     * The full-width colon takes priority: if it occurs anywhere in the line
     * the definition is treated as Japanese (optionally followed by an
     * English gloss in parentheses); otherwise the first ASCII colon marks an
     * English-only definition.
     *
     * @return Term|null null when the line contains neither separator
     */
    private static function fromVocabDefLine(string $vocabDefLine): ?Term
    {
        // ------------------------------------------------ Locate separator ---
        $jpStart = mb_strpos($vocabDefLine, self::JP_SEPARATOR);
        $enStart = mb_strpos($vocabDefLine, self::EN_SEPARATOR);

        if (false !== $jpStart) {
            $isJapanese = true;
            $separatorAt = $jpStart;
        } elseif (false !== $enStart) {
            $isJapanese = false;
            $separatorAt = $enStart;
        } else {
            // Can't extract a term, the line doesn't follow the pattern.
            return null;
        }

        $term = new Term();

        // ------------------------------------------------------ Get kanji ---
        // Convert 「this」 into [this] and normalise the ideographic space so
        // parseFurigana(), which splits on ASCII spaces, understands it.
        // NOTE(review): the original key of the space mapping was not visible
        // in the source; assumed to be U+3000 - confirm.
        $term->kanji = mb_trim(strtr(mb_substr($vocabDefLine, 0, $separatorAt), [
            '「' => '[',
            '」' => ']',
            "\u{3000}" => ' ',
        ]));
        $def = mb_trim(mb_substr($vocabDefLine, $separatorAt + 1));

        // Defensive guard kept from the original implementation.
        if (!is_string($term->kanji)) {
            return null;
        }

        // -------------------------------------------------- No definition ---
        // An empty definition or a lone underscore placeholder means "none".
        // NOTE(review): the second placeholder is assumed to be the
        // full-width underscore (U+FF3F) - confirm.
        if (in_array($def, ['', '_', "\u{FF3F}"], true)) {
            $term->definitionJp = null;
            $term->definitionEn = null;

            return $term;
        }

        // --------------------------------------------------- Only English ---
        if (!$isJapanese) {
            $term->definitionJp = null;
            $term->definitionEn = $def;

            return $term;
        }

        // -------------------------------------------------- Only Japanese ---
        $parenStart = mb_strpos($def, '(');
        if (false === $parenStart) {
            $term->definitionJp = $def;
            $term->definitionEn = null;

            return $term;
        }

        // -------------------------------------- Both Japanese and English ---
        $term->definitionJp = mb_trim(mb_substr($def, 0, $parenStart));
        $english = mb_substr($def, $parenStart + 1);
        // Drop the closing parenthesis only when it is really there, so an
        // unterminated "(gloss" does not lose its last character.
        if (str_ends_with($english, ')')) {
            $english = mb_substr($english, 0, -1);
        }
        $term->definitionEn = mb_trim($english);

        return $term;
    }

    /**
     * Builds a Term from a legacy kanji/definition pair by turning
     * "<def>" into "<kanji><separator><def>" and parsing the result.
     */
    private static function fromLegacyLine(string $kanji, string $def): ?Term
    {
        // Select the appropriate colon for the job.
        // NOTE(review): with the "u" modifier [[:alpha:]] may also match
        // kana/kanji (PCRE Unicode properties) - confirm that Japanese-only
        // definitions end up with the intended separator.
        $separator = preg_match('/[[:alpha:]]/u', $def)
            ? ': '
            : self::JP_SEPARATOR;

        // Stick the kanji at the start and see if it makes sense.
        return self::fromVocabDefLine($kanji . $separator . $def);
    }

    /** Splits a legacy list field on any of the accepted item separators. */
    private static function splitLegacyList(string $list): array
    {
        $primary = self::LEGACY_SEPARATORS[0];

        return explode($primary, str_replace(self::LEGACY_SEPARATORS, $primary, $list));
    }

    /**
     * Extracts every term of a note from its 'VocabDef' (and, for legacy
     * notes, 'VocabKanji') field.
     *
     * @param array $fields note fields keyed by field name
     *
     * @return Term[]|null null when the fields cannot be parsed
     */
    public static function fromNoteFields(array $fields): ?array
    {
        $vocabDef = (string) ($fields['VocabDef'] ?? '');

        // -------------------- Trying to extract it with the modern syntax ---
        // 言葉: word
        // 上げる:上に動くこと。
        // 雨:水粒を降ること。(rain)
        $terms = [];
        foreach (preg_split('|<br ?/?>|', $vocabDef) as $line) {
            $terms[] = self::fromVocabDefLine(strip_tags($line));
        }

        // If there are no nulls, everything went well.
        if (!in_array(null, $terms, true)) {
            return $terms;
        }

        // ------------ Extracting failed, try to infer from other syntaxes ---
        $kanjis = self::splitLegacyList((string) ($fields['VocabKanji'] ?? ''));
        $defs = self::splitLegacyList($vocabDef);

        // Number of legacy definitions is different from number of kanji.
        if (count($kanjis) !== count($defs)) {
            return null;
        }

        // Pair by position rather than array_combine(): using the kanji as
        // array keys would silently merge duplicated terms.
        $terms = [];
        foreach ($kanjis as $index => $kanji) {
            $terms[] = self::fromLegacyLine($kanji, $defs[$index]);
        }

        // Any null means the legacy syntax could not be parsed either.
        return in_array(null, $terms, true) ? null : $terms;
    }
}