lint: Check utf16 strings (#449)

2026-04-23 09:04:21 +00:00 · 2025-03-18 17:15:50 -06:00 · 2025-03-18 17:15:50 -06:00 · 7b1df2234f
parent 497ff11434
commit 7b1df2234f
3 changed files with 108 additions and 22 deletions
--- a/data/data_strings.csv
+++ b/data/data_strings.csv
@ -21760,3 +21760,51 @@
 0x710187fb58,"NoName"
 0x710187fb60,"17AmiiboNpcDirector"
 0x710187fb74,"N2al9ISceneObjE"
+0x71018acfbc,u"NULL"
+0x71018acfc8,u"%02d"
+0x71018acfd2,u"?????"
+0x71018acfde,u"%d"
+0x71018acfe4,u"1"
+0x71018acfe8,u"0"
+0x71018acfec,u"%d位のひと"
+0x71018acffa,u"%ls"
+0x71018ad002,u"%u"
+0x71018ad008,u"%03d"
+0x71018ad012,u"\n"
+0x71018ad016,u"%s"
+0x71018ad01c,u"-----"
+0x71018ad028,u"%d/%d"
+0x71018ad034,u"??????"
+0x71018ad042,u"----"
+0x71018ad04c,u"@"
+0x71018ad050,u"テストステージ"
+0x71018ad060,u"---"
+0x71018ad068,u"/////////////////"
+0x71018ad08c,u"未設定"
+0x71018ad094,u"66666"
+0x71018ad0a0,u"56666"
+0x71018ad0ac,u"55666"
+0x71018ad0b8,u"55566"
+0x71018ad0c4,u"55556"
+0x71018ad0d0,u"55555"
+0x71018ad0dc,u"%lld"
+0x71018ad0e6,u"%.1f"
+0x71018ad0f0,u"★30"
+0x71018ad0f8,u"* +1"
+0x71018ad102,u"None"
+0x71018ad10c,u"＜シャイン名＞"
+0x71018ad11c,u"@title"
+0x71018ad12a,u"@space"
+0x71018ad138,u"5"
+0x71018ad13c,u"6"
+0x71018ad140,u"シャイン名未設定"
+0x71018ad152,u"メッセージ初期化失敗"
+0x71018ad168,u"%01d"
+0x71018ad172,u"%04d"
+0x71018ad17c,u"%05d"
+0x71018ad186,u"%06d"
+0x71018ad190,u"/"
+0x71018ad194,u"%2d"
+0x71018ad19c,u"%3d"
+0x71018ad1a4,u"%4d"
+0x71018ad1ac,u"%5d"
--- a/tools/check-format.py
+++ b/tools/check-format.py
@ -291,9 +291,12 @@ def common_string_finder(c, path):
        if "//" in line:
            continue

-        matches = re.findall(r'"(.*?)"', line)
+        matches = re.findall(r'(u?".*?")', line)

        for match in matches:
+            if not match.startswith("u"):
+                # Remove quotes from utf8 strings
+                match = match[1:-1]
            if len(match) < 2:
                continue
            found = False
--- a/tools/generate-strings.py
+++ b/tools/generate-strings.py
@ -10,21 +10,22 @@ MAX_TEXT_SIZE = 0x600
 CHUNK_SIZE = 0x200 # Read file chunk size

 NSO_OFFSET = 0x70FFFFFF00
-START_OFFSET = 0x710181c3d8 # String table start main 1.0.0
-END_OFFSET = 0x710187fb74 # String table end
+U8_START_OFFSET = 0x710181c3d8 # UTF8 String table start main 1.0.0
+U8_END_OFFSET = 0x710187fb74 # UTF8 String table end
+U16_START_OFFSET = 0x71018acfbc # UTF16 string table start main 1.0.0
+U16_END_OFFSET = 0x71018ad1b3 # UTF16 string table end

-def parse_string(buffer, offset, csv_file):
+def parse_string(buffer, offset, csv_file, encoding):
+    nso_addr = NSO_OFFSET + offset
+    
    if len (buffer) < MIN_TEXT_SIZE:
        return
    if len(buffer) > MAX_TEXT_SIZE:
-        print("Warning: String is bigger than buffer size")
-        print(buffer)
+        print(hex(nso_addr), "Warning: String is bigger than buffer size")
        return

-    nso_addr = NSO_OFFSET + offset
-
    try :
-        text = buffer.decode('utf-8')
+        text = buffer.decode(encoding)

        # Make escape sequences visible
        text = text.replace("\\", "\\\\")
@ -35,28 +36,30 @@ def parse_string(buffer, offset, csv_file):

        # Write to csv file
        csv_file.write(hex(nso_addr))
-        csv_file.write(",\"")
+        if encoding == 'utf-16':
+            csv_file.write(",u\"")
+        else:
+            csv_file.write(",\"")
        csv_file.write(text)
        csv_file.write("\"\n")
    except:
-        #Decode to utf-8 can fail on non-string data
-        print("Unable to parse string")
-        print(buffer)
+        # Decode can fail on non-string data
+        print(hex(nso_addr), "Unable to parse string")

-def parse_nso(csv_file, nso_file):
-    offset = START_OFFSET - NSO_OFFSET
-    end = END_OFFSET - NSO_OFFSET
+def parse_utf8(csv_file, nso_file):
+    offset = U8_START_OFFSET - NSO_OFFSET
+    end = U8_END_OFFSET - NSO_OFFSET
    nso_file.seek(offset)
    buffer = bytes()
-    
+
    while nso_file.tell() < end:
        chunk = nso_file.read(CHUNK_SIZE)
        if not chunk:
            return
-        buffer += chunk 
+        buffer += chunk
        previous_zero = 0
        last_zero = -1
-        
+
        for i, b in enumerate(buffer):
            if b != 0:
                continue
@ -65,14 +68,46 @@ def parse_nso(csv_file, nso_file):
            previous_zero = last_zero
            last_zero = i
            text = buffer[previous_zero + 1 : last_zero]
-            parse_string(text, offset, csv_file)
+            parse_string(text, offset, csv_file, 'utf-8')
            offset += last_zero - previous_zero
-            
+
+        buffer = buffer[last_zero + 1 : len(buffer)]
+
+def parse_utf16(csv_file, nso_file):  # Walk the UTF-16 string table in nso_file and hand each 00 00-terminated string to parse_string.
+    offset = U16_START_OFFSET - NSO_OFFSET  # file position of the table start (address minus NSO_OFFSET)
+    end = U16_END_OFFSET - NSO_OFFSET  # file position at which scanning stops
+    nso_file.seek(offset)
+    buffer = bytes()  # bytes read so far that have not yet been split into strings
+
+    while nso_file.tell() < end:
+        chunk = nso_file.read(CHUNK_SIZE)
+        if not chunk:  # read() returned nothing: no more data in the file
+            return
+        buffer += chunk
+        previous_zero = 0  # placeholder; reassigned from last_zero before it is ever read
+        last_zero = -1  # index of the last terminator byte consumed in this buffer; -1 means none yet
+        prev = 0  # first byte of the 2-byte unit currently being examined
+
+        for i, b in enumerate(buffer):
+            if i % 2 == 0:  # even index: first byte of a 2-byte unit, remember it and wait for the second
+                prev = b
+                continue
+            if prev != 0 or b != 0:  # only a 00 00 pair at (i - 1, i) ends a string
+                continue
+            if offset > end:  # the running offset has moved past the table end
+                return
+            previous_zero = last_zero
+            last_zero = i
+            text = buffer[previous_zero + 1 : last_zero - 1]  # bytes after the previous terminator, excluding this 00 00 pair
+            parse_string(text, offset, csv_file, 'utf-16')  # NOTE(review): plain 'utf-16' consumes a leading BOM and otherwise appears to use native byte order; confirm whether 'utf-16-le' is intended (parse_string compares against this exact name)
+            offset += last_zero - previous_zero  # advance by the string length plus its 2-byte terminator
+
        buffer = buffer[last_zero + 1 : len(buffer)]  # keep the unterminated tail so the next chunk can complete it

 def create_string_table(string_path, nso_path):
    with open(string_path, "w") as csv_file, open(nso_path, "rb") as nso_file:
-        parse_nso(csv_file, nso_file)
+        parse_utf8(csv_file, nso_file)
+        parse_utf16(csv_file, nso_file)

 project_root = setup.ROOT