# hinamizawa-downporter/src/scripter.py

#!/usr/bin/env python3
from enum import Enum
class TokenType(Enum):
    """Kind of a lexed NScripter script fragment."""

    # All non-dialogue keywords; consecutive commands with no dialogue
    # text between them are collected into a single token.
    COMMAND = 1
    # Double-width dialogue text.
    TEXT = 2
    # Everything from a ';' to the end of the line.
    COMMENT = 3
class ScripterToken():
    """A single lexed fragment of an NScripter script line."""

    def __init__(self, token: str, type: TokenType):
        # Raw text of the fragment and its classification.
        self.token = token
        self.type = type

    def __str__(self):
        # ANSI foreground color per token kind, followed by a reset.
        colors = {
            TokenType.TEXT: '\033[34m',
            TokenType.COMMAND: '\033[32m',
            TokenType.COMMENT: '\033[36m',
        }
        return f'{colors[self.type]}{self.token}\033[0m'

    def __repr__(self):
        return str(self)
def parse_line(line: str) -> list[ScripterToken]:
    """Tokenize one NScripter script line in Japanese mode.

    The original Japanese mode differentiates between single-byte and
    double-byte characters: a run of single-width characters lexes as a
    COMMAND token, a run of double-width characters as a TEXT token, and
    a ';' starts a COMMENT that consumes the rest of the line.

    Returns the tokens in source order. Zero-length tokens (empty line,
    line starting with ';') are not emitted.
    """
    token_list: list[ScripterToken] = []
    current_token = ScripterToken('', TokenType.COMMAND)
    for i, char in enumerate(line):
        # A newline never belongs to any token.
        if char == '\n':
            break
        if char == ';':
            # Flush whatever was being built, then take the remainder of
            # the line as a single comment token.
            if current_token.token:
                token_list.append(current_token)
            # removesuffix only strips a newline that is actually present,
            # unlike a bare [:-1] slice which would eat the last comment
            # character on lines without a trailing newline.
            current_token = ScripterToken(line[i:].removesuffix('\n'),
                                          TokenType.COMMENT)
            break
        if not current_token.token:
            # First character of a token decides its kind.
            current_token.type = (TokenType.TEXT if is_double_width(char)
                                  else TokenType.COMMAND)
        elif current_token.type == TokenType.COMMAND and is_double_width(char):
            # Width flipped: the command token ends, dialogue text begins.
            token_list.append(current_token)
            current_token = ScripterToken('', TokenType.TEXT)
        elif current_token.type == TokenType.TEXT and not is_double_width(char):
            # Width flipped back: dialogue text ends, a command begins.
            token_list.append(current_token)
            current_token = ScripterToken('', TokenType.COMMAND)
        current_token.token += char
    # Append the trailing token of the line, if it holds anything.
    if current_token.token:
        token_list.append(current_token)
    return token_list
def is_double_width(char: str) -> bool:
    """Return True when *char* is treated as double-width by NScripter.

    Every code point above U+00FF counts as double-width — a catch-all
    covering the ranges the format cares about: kanji (U+4E00–U+9FFF),
    hiragana (U+3040–U+309F), katakana (U+30A0–U+30FF), full-width Roman
    characters and symbols (U+FF00–U+FFEF), CJK symbols and punctuation
    including 「」 (U+3000–U+303F), the quotes “ ” (U+201C–U+201D),
    the ellipsis … (U+2026), and ☆ (U+2606).

    Below U+0100 the single exception is '×' (U+00D7), which NScripter
    would otherwise misparse as a command character.
    """
    cp = ord(char[0])
    # The catch-all used to live inside the range loop, where it was
    # loop-invariant and short-circuited every range test above U+00FF;
    # hoisting it keeps the exact same result with one comparison.
    if cp > 0xFF:
        return True
    return cp == 0xD7