From ea4b17ee405e4ed071b17957351e83c5c60aef42 Mon Sep 17 00:00:00 2001
From: dusk
Date: Fri, 23 Feb 2024 11:39:55 +0100
Subject: [PATCH] Change over to the new parsing

---
 src/orig.py     | 177 +++++++++++++-----------------------------------
 src/scripter.py |  14 +++-
 2 files changed, 58 insertions(+), 133 deletions(-)
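
Note (commentary below the diffstat, not part of the commit itself): the
core of this change is that swap_line_text rebuilds each script line twice,
once with the English translations and once with the original Japanese text,
so the Japanese rebuild can be checked byte-for-byte against the source
line. A minimal runnable sketch of that behaviour, using hypothetical
stand-in types in place of the real scripter and parser modules (the real
code also passes the English text through unidecode):

    from dataclasses import dataclass
    from enum import Enum

    class TokenType(Enum):
        COMMAND = 1
        TEXT = 2

    @dataclass
    class Token:          # stand-in for scripter.ScripterToken
        token: str        # raw source text of this token
        type: TokenType

    @dataclass
    class OutputLine:     # stand-in for parser.OutputLine
        text_jp: str      # original Japanese dialogue
        text_en: str      # its English translation

    def swap_line_text(tokens, translation_lines):
        ret_en, ret_jp, lines_written = '', '', 0
        for token in tokens:
            if token.type == TokenType.TEXT:
                ret_en += translation_lines[lines_written].text_en
                ret_jp += translation_lines[lines_written].text_jp
                lines_written += 1
            else:  # command tokens pass through unchanged in both rebuilds
                ret_en += token.token
                ret_jp += token.token
        return ret_en, ret_jp, lines_written

    # A dialogue line interrupted by an '@' click-wait command:
    tokens = [Token('こんにちは', TokenType.TEXT),
              Token('@', TokenType.COMMAND),
              Token('元気？', TokenType.TEXT)]
    buffer = [OutputLine('こんにちは', 'Hello'),
              OutputLine('元気？', 'How are you?')]

    en, jp, used = swap_line_text(tokens, buffer)
    assert jp == 'こんにちは@元気？'   # round-trips the original line
    assert en == 'Hello@How are you?'
    assert used == 2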
diff --git a/src/orig.py b/src/orig.py
index 87fa5f7..76034fd 100755
--- a/src/orig.py
+++ b/src/orig.py
@@ -11,39 +11,6 @@ import parser
 import flow
 import fix
 
-def line_should_be_translated(line: str) -> bool:
-    japanese_ranges = [
-        (0x4E00, 0x9FFF),  # Kanji
-        (0x3040, 0x309F),  # Hiragana
-        (0x30A0, 0x30FF),  # Katakana
-        (0xFF00, 0xFFEF),  # Full-width Roman characters and symbols
-        (0x3000, 0x303F),  # CJK symbols and punctuation (including 「」)
-        (0x201c, 0x201c),  # The character “
-        (0x2026, 0x2026),  # The character …
-    ]
-
-
-    for start, end in japanese_ranges:
-        if start <= ord(line[0]) <= end:
-            return True
-
-    # if line starts with special commands
-    if line.startswith(('!s', '!w', '!d', '@', '¥')):
-        # ignore line after comment
-        comment_i = line.find(';')
-        if comment_i != -1:
-            line = line[:comment_i]
-
-        # Check if line has japanese chars
-        for c in line:
-            for start, end in japanese_ranges:
-                if start <= ord(c) <= end:
-                    return True
-
-    return False
-
-
-
 debug_current_line = -1
 
 def process_sections():
@@ -69,61 +36,39 @@ def process_sections():
                 origfile,
                 flow.onik[section_name],
             )
+            print("finished section: ", section_name)
         else:
             outfile.write(line)
 
     outfile.close()
     origfile.close()
 
-def get_symbols(line: str) -> (str, list[str]):
-    res = []
-    start_symbol = ''
-    inbetween = 0
-    i = 0
-    while i < len(line):
-        if line[i] in ['@', '/', '¥']:
-            symbol = line[i]
-            i += 1
-        elif line[i:i+3] == '!sd':
-            symbol = line[i:i+3]
-            i += 3
-        elif line[i:i+2] in ['!d', '!w', '!s']:
-            x = i
-            i += 2
-            while i < len(line):
-                if line[i] >= '0' and line[i] <= '9':
-                    i += 1
-                    continue
+def swap_line_text(tokens, translation_lines: list[parser.OutputLine]) -> tuple[str, str, int]:
+    """
+    Given a token list and a buffer of translated lines, replace each
+    TEXT token with the next line from translation_lines.
 
-                symbol = line[x:i]
-                break
-        elif line[i] == '!':
-            raise Exception('Unhandled symbol', line)
-        else: # It's not a symbol, it's a regular character
-            #print(line[i])
-            inbetween += 1
-            i += 1
-            continue
+    Returns the rebuilt English and Japanese text and the number of lines consumed.
+    """
 
-        # Only reaches this if it's a symbol
+    # translation_lines is passed by reference, so don't mutate it here
+    ret_en = ''
+    ret_jp = ''
+    lines_written = 0
 
-        # Each symbol acts as a separator between dialog texts, if we
-        # have two symbols next to eachother, then we print more dialog
-        # than we should. Concatenate consicutive symbols together to
-        # prevent this
-        if inbetween > 0:
-            res.append(symbol)
+    for token in tokens:
+        if token.type == scripter.TokenType.TEXT:
+            ret_en += unidecode(translation_lines[lines_written].text_en)
+            ret_jp += translation_lines[lines_written].text_jp
+            lines_written += 1
         else:
-            # Symbols at the start should not have dialog before them.
-            # Treat them as a special "start_symbol"
-            if len(res) == 0:
-                start_symbol = symbol
-            else:
-                res[-1] += symbol
-        inbetween = 0
+            ret_en += token.token
+            ret_jp += token.token
+
+    return ret_en, ret_jp, lines_written
+
-    return start_symbol, res
 
 # Given a set of translation files, the original file and the output file
@@ -134,68 +79,38 @@ def write_translated(outfile, origfile, translation_file_paths):
         parser.parse_to_csv(transfilepath)
         structure = parser.parse_to_structure(transfilepath)
 
-        testfile = open('test.txt', 'w')
-
         for line in origfile:
             tokens = scripter.parse_line(line)
-            tkns = ''
-            for t in tokens:
-                if len(tkns) > 0:
-                    tkns += ' | '
-                tkns += t.token
-            testfile.write(tkns + '\n')
-
-            continue
+            # --- Debug ---
             global debug_current_line
             debug_current_line += 1
+            print("\n-", debug_current_line, transfilepath, tokens[0])
+            # -------------
 
-            # Check if the current line is a dialogue line or not
+            # Replace the text tokens with the translated ones
+            line_en, line_jp, lines_written = swap_line_text(tokens, structure)
+            # Drop the lines that have been consumed
+            structure = structure[lines_written:]
 
-            if line_should_be_translated(line):
+            outfile.write('`' + unidecode(line_en) + '\n')
+            # --- Debug ---
+            print(">", line_en)
+            print(">", line_jp)
+            print("<", line, end='')
+            # -------------
 
-                start, symbols = get_symbols(line)
-                print("\n-", debug_current_line, transfilepath, [start], symbols)
+            if line_jp+'\n' != line:
+                print()
+                print(" ------------------------------------------------------")
+                print(" !                  NO THAT'S WRONG!                   !")
+                print(" ------------------------------------------------------")
+                sys.exit(1)
 
-                outfile.write('`')
-                outfile.write(start)
-
-                _printed_line_jp = start
-                _printed_line_en = start
-                while True:
-                    _printed_line_jp += structure[0].text_jp
-                    _printed_line_en += structure[0].text_en
-                    outfile.write(
-                        unidecode(structure.pop(0).text_en)
-                    )
-
-                    if len(symbols) > 0:
-                        _printed_line_jp += symbols[0]
-                        _printed_line_en += symbols[0]
-                        outfile.write(symbols.pop(0))
-
-                    if len(symbols) <= 0:
-                        break
-
-                outfile.write('\n')
-
-                print(">", _printed_line_en)
-                print(">", _printed_line_jp)
-                print("<", line, end='')
-
-                if _printed_line_jp+'\n' != line:
-                    print()
-                    print("NO THAT'S WRONG!")
-                    sys.exit(1)
-
-                # Used up all of the structures, this chapter has ended.
-                # Got to the next one
-                if len(structure) <= 0:
-                    break
-
-            else:
-                outfile.write(line)
-
-        testfile.close()
+            # Used up all of the structures, this chapter has ended.
+            # Go on to the next one
+            if len(structure) <= 0:
+                print()
+                print(f'- finished "{transfilepath}"')
+                break
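
Note (between the two file diffs, not part of the patch): write_translated
above advances through the translation buffer by slicing
(structure = structure[lines_written:]) rather than letting swap_line_text
pop entries, so the helper stays side-effect free, and the loop stops as
soon as the buffer for the current chapter is exhausted. A hypothetical,
simplified model of that driver loop, with swap_line_text's results
precomputed as (en, jp, used) triples:

    script  = ['こんにちは@\n', '元気？¥\n']
    swapped = [('Hello@', 'こんにちは@', 1), ('How are you?¥', '元気？¥', 1)]
    buffer  = ['こんにちは', '元気？']   # stands in for the OutputLine list

    for line, (en, jp, used) in zip(script, swapped):
        assert jp + '\n' == line   # the Japanese rebuild must round-trip exactly
        buffer = buffer[used:]     # consume by slicing, never by mutating
        if not buffer:             # every translation used: chapter finished
            break
    assert buffer == []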
diff --git a/src/scripter.py b/src/scripter.py
index 6a1c26a..6c28620 100644
--- a/src/scripter.py
+++ b/src/scripter.py
@@ -9,24 +9,34 @@ class TokenType(Enum):
     # they will be parsed as a single token
     COMMAND = 1
     TEXT = 2
+    COMMENT = 3
 
 class ScripterToken():
     def __init__(self, token: str, type: TokenType):
         self.token = token
         self.type = type
 
+    def __str__(self):
+        return f' |{self.token}|'
+
 
 # Parse and tokenize an Nscripter script line in japanese mode
 # The original japanese mode differentiates between single and
 # double byte characters
 def parse_line(line: str) -> list[ScripterToken]:
-    current_token = ScripterToken('', TokenType.TEXT)
+    current_token = ScripterToken('', TokenType.COMMAND)
     token_list = []
 
     for i, char in enumerate(line):
         # Comments signify the end of what should be parsed
         # Newline does not constitute a token, skip
-        if char == ';' or char == '\n':
+        if char == '\n':
+            break
+
+        if char == ';':
+            token_list.append(current_token)
+            # Strip the trailing newline, if any, from the comment
+            current_token = ScripterToken(line[i:].rstrip('\n'), TokenType.COMMENT)
             break
 
         # First character of the token
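
Note (outside the diff): with the tokenizer change above, a trailing
;comment is no longer thrown away together with the rest of the line; it is
captured as a single COMMENT token, and parse_line now starts each line in
COMMAND mode instead of TEXT mode. A minimal sketch of just the
comment-splitting step, using a hypothetical split_comment helper (the real
parse_line does this inline while also splitting COMMAND and TEXT runs):

    from enum import Enum

    class TokenType(Enum):
        COMMAND = 1
        TEXT = 2
        COMMENT = 3

    class ScripterToken:
        def __init__(self, token, type):
            self.token = token
            self.type = type

    def split_comment(line):
        """Peel a trailing ';comment' off an NScripter script line."""
        body = line.rstrip('\n')    # the newline never becomes part of a token
        i = body.find(';')
        if i == -1:
            return body, None
        # everything from ';' to the end of the line is one COMMENT token
        return body[:i], ScripterToken(body[i:], TokenType.COMMENT)

    body, comment = split_comment('!w500;wait half a second\n')
    assert body == '!w500'
    assert comment.token == ';wait half a second'
    assert comment.type is TokenType.COMMENT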