Change over to the new parsing

2024-02-23 11:39:55 +01:00 · 2024-02-23 11:39:55 +01:00 · ea4b17ee40
parent c44c7ebc13
commit ea4b17ee40
2 changed files with 58 additions and 133 deletions
--- a/src/orig.py
+++ b/src/orig.py
@ -11,39 +11,6 @@ import parser
 import flow
 import fix

-def line_should_be_translated(line: str) -> bool:
-    japanese_ranges = [
-        (0x4E00, 0x9FFF),  # Kanji
-        (0x3040, 0x309F),  # Hiragana
-        (0x30A0, 0x30FF),  # Katakana
-        (0xFF00, 0xFFEF),  # Full-width Roman characters and symbols
-        (0x3000, 0x303F),  # CJK symbols and punctuation (including 「」)
-        (0x201c, 0x201c), # The character “
-        (0x2026, 0x2026), # The character …
-    ]
-
-
-    for start, end in japanese_ranges:
-        if start <= ord(line[0]) <= end:
-            return True
-
-    # if line starts with special commands
-    if line.startswith(('!s', '!w', '!d', '@', '¥')):
-        # ignore line after comment
-        comment_i = line.find(';')
-        if comment_i != -1:
-            line = line[:comment_i]
-
-        # Check if line has japanese chars
-        for c in line:
-            for start, end in japanese_ranges:
-                if start <= ord(c) <= end:
-                    return True
-
-    return False
-
-
-
 debug_current_line = -1

 def process_sections():
@ -69,61 +36,39 @@ def process_sections():
                    origfile,
                    flow.onik[section_name],
                )
+                print("finished section: ", section_name)
        else:
            outfile.write(line)

    outfile.close()
    origfile.close()

-def get_symbols(line: str) -> (str, list[str]):
-    res = []
-    start_symbol = ''

-    inbetween = 0
-    i = 0
-    while i < len(line):
-        if line[i] in ['@', '/', '¥']:
-            symbol = line[i]
-            i += 1
-        elif line[i:i+3] == '!sd':
-            symbol = line[i:i+3]
-            i += 3
-        elif line[i:i+2] in ['!d', '!w', '!s']:
-            x = i
-            i += 2
-            while i < len(line):
-                if line[i] >= '0' and line[i] <= '9':
-                    i += 1
-                    continue
+def swap_line_text(tokens, translation_lines: list[parser.OutputLine]) -> (str, str, int):
+    """
+    Given a token list and a buffer of translation lines, replace the text
+    tokens with lines taken in order from translation_lines.

-                symbol = line[x:i]
-                break
-        elif line[i] == '!':
-            raise Exception('Unhandled symbol', line)
-        else: # It's not a symbol, it's a regular character
-            #print(line[i])
-            inbetween += 1
-            i += 1
-            continue
+    Returns the rebuilt English line, the rebuilt Japanese line and the number of translation lines consumed.
+    """

-        # Only reaches this if it's a symbol
+    # Lists are passed by reference; don't mutate the caller's list
+    ret_en = ''
+    ret_jp = ''
+    lines_written = 0

-        # Each symbol acts as a separator between dialog texts, if we
-        # have two symbols next to eachother, then we print more dialog
-        # than we should. Concatenate consicutive symbols together to
-        # prevent this
-        if inbetween > 0:
-            res.append(symbol)
+    for token in tokens:
+        if token.type == scripter.TokenType.TEXT:
+            ret_en += unidecode(translation_lines[lines_written].text_en)
+            ret_jp += translation_lines[lines_written].text_jp
+            lines_written += 1
        else:
-            # Symbols at the start should not have dialog before them.
-            # Treat them as a special "start_symbol"
-            if len(res) == 0:
-                start_symbol = symbol
-            else:
-                res[-1] += symbol
-        inbetween = 0
+            ret_en += token.token
+            ret_jp += token.token
+
+    return ret_en, ret_jp, lines_written
+

-    return start_symbol, res


 # Given a set of translation files, the original file and the output file
@ -134,68 +79,38 @@ def write_translated(outfile, origfile, translation_file_paths):
        parser.parse_to_csv(transfilepath)
        structure = parser.parse_to_structure(transfilepath)

-        testfile = open('test.txt', 'w')
-
        for line in origfile:
-
            tokens = scripter.parse_line(line)
-            tkns = ''
-            for t in tokens:
-                if len(tkns) > 0:
-                    tkns += ' | '
-                tkns += t.token
-            testfile.write(tkns + '\n')
-
-            continue

+            # --- Debug ---
            global debug_current_line
            debug_current_line += 1
+            print("\n-", debug_current_line, transfilepath, tokens[0])
+            # -------------

-            # Check if the current line is a dialogue line or not
+            # Replace the text tokens with the translated ones
+            line_en, line_jp, lines_written = swap_line_text(tokens, structure)
+            # Remove the lines that have been written
+            structure =  structure[lines_written:]

-            if line_should_be_translated(line):
+            outfile.write('`' + unidecode(line_en) + '\n')

+            # --- Debug ---
+            print(">", line_en)
+            print(">", line_jp)
+            print("<", line, end='')
+            # -------------

-                start, symbols = get_symbols(line)
-                print("\n-", debug_current_line, transfilepath, [start], symbols)
+            if line_jp+'\n' != line:
+                print()
+                print(" ------------------------------------------------------")
+                print(" !                   NO THAT'S WRONG!                 !")
+                print(" ------------------------------------------------------")
+                sys.exit(1)

-                outfile.write('`')
-                outfile.write(start)
-
-                _printed_line_jp = start
-                _printed_line_en = start
-                while True:
-                    _printed_line_jp += structure[0].text_jp
-                    _printed_line_en += structure[0].text_en
-                    outfile.write(
-                        unidecode(structure.pop(0).text_en)
-                    )
-
-                    if len(symbols) > 0:
-                        _printed_line_jp += symbols[0]
-                        _printed_line_en += symbols[0]
-                        outfile.write(symbols.pop(0))
-
-                    if len(symbols) <= 0:
-                        break
-
-                outfile.write('\n')
-
-                print(">", _printed_line_en)
-                print(">", _printed_line_jp)
-                print("<", line, end='')
-
-                if _printed_line_jp+'\n' != line:
-                    print()
-                    print("NO THAT'S WRONG!")
-                    sys.exit(1)
-
-                # Used up all of the structures, this chapter has ended.
-                # Got to the next one
-                if len(structure) <= 0:
-                    break
-
-            else:
-                outfile.write(line)
-
-        testfile.close()
+            # Used up all of the structures, this chapter has ended.
+            # Go to the next one
+            if len(structure) <= 0:
+                print()
+                print(f'- finished "{transfilepath}"')
+                break
--- a/src/scripter.py
+++ b/src/scripter.py
@ -9,24 +9,34 @@ class TokenType(Enum):
    # they will be parsed as a single token
    COMMAND = 1
    TEXT = 2
+    COMMENT = 3

 class ScripterToken():
    def __init__(self, token: str, type: TokenType):
        self.token = token
        self.type = type

+    def __str__(self):
+        return f' |{self.token}|'
+

 # Parse and tokenize an Nscripter script line in japanese mode
 # The original japanese mode diferentiates between single and
 # double byte characters
 def parse_line(line: str) -> list[ScripterToken]:
-    current_token = ScripterToken('', TokenType.TEXT)
+    current_token = ScripterToken('', TokenType.COMMAND)
    token_list = []

    for i, char in enumerate(line):
        # Comments signify the end of what should be parsed
        # Newline does not constitute a token, skip
-        if char == ';' or char == '\n':
+        if char == '\n':
+            break
+
+        if char == ';':
+            token_list.append(current_token)
+            # [:-1] to remove the newline
+            current_token = ScripterToken(line[i:-1], TokenType.COMMENT)
            break

        # First character of the token