From ec5ec84407fb376b593c59256a219689a82f5d8c Mon Sep 17 00:00:00 2001
From: itsRevela
Date: Fri, 17 Apr 2026 05:55:25 -0500
Subject: [PATCH] feat: add 4JLibs comparison tooling

Tools for comparing 4JLibs binary changes between git refs:
- compare-4jlibs.py: Extracts libs, parses symbol tables via dumpbin,
  demangles with undname, generates structured diff reports
- extract_lib.py: Extracts .obj members from COFF .lib archives
- ExportLibInfo.java: Ghidra headless script for non-LTCG object files
- list-lib-symbols.sh / compare-4jlibs.sh: Shell wrappers
---
 tools/ghidra/.gitignore          |   2 +
 tools/ghidra/ExportLibInfo.java  | 139 +++++++
 tools/ghidra/compare-4jlibs.py   | 632 +++++++++++++++++++++++++++++++
 tools/ghidra/compare-4jlibs.sh   | 344 +++++++++++++++++
 tools/ghidra/extract_lib.py      | 104 +++++
 tools/ghidra/list-lib-symbols.sh |  44 +++
 tools/ghidra/output/.gitkeep     |   0
 7 files changed, 1265 insertions(+)
 create mode 100644 tools/ghidra/.gitignore
 create mode 100644 tools/ghidra/ExportLibInfo.java
 create mode 100644 tools/ghidra/compare-4jlibs.py
 create mode 100644 tools/ghidra/compare-4jlibs.sh
 create mode 100644 tools/ghidra/extract_lib.py
 create mode 100644 tools/ghidra/list-lib-symbols.sh
 create mode 100644 tools/ghidra/output/.gitkeep

diff --git a/tools/ghidra/.gitignore b/tools/ghidra/.gitignore
new file mode 100644
index 00000000..55f0c71c
--- /dev/null
+++ b/tools/ghidra/.gitignore
@@ -0,0 +1,2 @@
+# Ghidra analysis output (generated, not tracked)
+output/report-*/
diff --git a/tools/ghidra/ExportLibInfo.java b/tools/ghidra/ExportLibInfo.java
new file mode 100644
index 00000000..cee9ac5d
--- /dev/null
+++ b/tools/ghidra/ExportLibInfo.java
@@ -0,0 +1,139 @@
+// Export symbols, functions, and external references from a COFF .lib to a JSON report.
+// Designed for headless mode: pass output path as first script argument.
+//
+// Usage with analyzeHeadless:
+//   analyzeHeadless <project_dir> <project_name> -import <file.lib> \
+//     -postScript ExportLibInfo.java <output.json> -deleteProject
+//
+//@category 4JLibs
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+import ghidra.app.script.GhidraScript;
+import ghidra.program.model.address.Address;
+import ghidra.program.model.listing.*;
+import ghidra.program.model.symbol.*;
+
+public class ExportLibInfo extends GhidraScript {
+
+    @Override
+    public void run() throws Exception {
+        String[] args = getScriptArgs();
+        if (args.length < 1) {
+            printerr("Usage: ExportLibInfo.java <output.json>");
+            return;
+        }
+
+        File outputFile = new File(args[0]);
+        outputFile.getAbsoluteFile().getParentFile().mkdirs(); // absolute form: a bare file name has no parent
+
+        PrintWriter pw = new PrintWriter(new FileWriter(outputFile, true)); // append: one JSON object per run
+
+        String programName = currentProgram.getName();
+        Listing listing = currentProgram.getListing();
+        SymbolTable symbolTable = currentProgram.getSymbolTable();
+        ExternalManager extMgr = currentProgram.getExternalManager();
+
+        // Collect functions
+        List<String> functions = new ArrayList<>();
+        FunctionIterator funcIter = listing.getFunctions(true);
+        while (funcIter.hasNext() && !monitor.isCancelled()) {
+            Function f = funcIter.next();
+            String sig = f.getPrototypeString(false, false);
+            String callingConv = f.getCallingConventionName();
+            long size = f.getBody().getNumAddresses();
+
+            functions.add(String.format(
+                " {\"name\": %s, \"entry\": %s, \"signature\": %s, \"callingConvention\": %s, \"size\": %d, \"paramCount\": %d}",
+                jsonStr(f.getName()),
+                jsonStr(f.getEntryPoint().toString()),
+                jsonStr(sig),
+                jsonStr(callingConv),
+                size,
+                f.getParameterCount()
+            ));
+        }
+
+        // Collect all symbols (non-function)
+        List<String> symbols = new ArrayList<>();
+        SymbolIterator symIter = symbolTable.getAllSymbols(true);
+        while (symIter.hasNext() && !monitor.isCancelled()) {
+            Symbol sym = symIter.next();
+            if (sym.getSymbolType() == SymbolType.FUNCTION) {
+                continue; // already captured above
+            }
+            if (sym.isExternal()) {
+                continue; // captured below
+            }
+            symbols.add(String.format(
+                " {\"name\": %s, \"type\": %s, \"address\": %s, \"source\": %s}",
+                jsonStr(sym.getName(true)),
+                jsonStr(sym.getSymbolType().toString()),
+                jsonStr(sym.getAddress().toString()),
+                jsonStr(sym.getSource().toString())
+            ));
+        }
+
+        // Collect external symbols (imports from other libraries)
+        List<String> externals = new ArrayList<>();
+        symIter = symbolTable.getExternalSymbols();
+        while (symIter.hasNext() && !monitor.isCancelled()) {
+            Symbol sym = symIter.next();
+            String extLib = "";
+            ExternalLocation extLoc = extMgr.getExternalLocation(sym);
+            if (extLoc != null && extLoc.getLibraryName() != null) {
+                extLib = extLoc.getLibraryName();
+            }
+            externals.add(String.format(
+                " {\"name\": %s, \"type\": %s, \"library\": %s}",
+                jsonStr(sym.getName(true)),
+                jsonStr(sym.getSymbolType().toString()),
+                jsonStr(extLib)
+            ));
+        }
+
+        // Write JSON object for this program/object-file
+        pw.println("{");
+        pw.println(" \"program\": " + jsonStr(programName) + ",");
+        pw.println(" \"language\": " + jsonStr(currentProgram.getLanguageID().toString()) + ",");
+        pw.println(" \"compiler\": " + jsonStr(currentProgram.getCompilerSpec().getCompilerSpecID().toString()) + ",");
+
+        pw.println(" \"functionCount\": " + functions.size() + ",");
+        pw.println(" \"functions\": [");
+        pw.println(String.join(",\n", functions));
+        pw.println(" ],");
+
+        pw.println(" \"symbolCount\": " + symbols.size() + ",");
+        pw.println(" \"symbols\": [");
+        pw.println(String.join(",\n", symbols));
+        pw.println(" ],");
+
+        pw.println(" \"externalCount\": " + externals.size() + ",");
+        pw.println(" \"externals\": [");
+        pw.println(String.join(",\n", externals));
+        pw.println(" ]");
+
+        pw.println("}");
+
+        pw.flush();
+        pw.close();
+
+        println("ExportLibInfo: wrote " + functions.size() + " functions, "
+            + symbols.size() + " symbols, " + externals.size() + " externals for "
+            + programName + " -> " + outputFile.getAbsolutePath());
+    }
+
+    private String jsonStr(String s) {
+        if (s == null) return "null";
+        return "\"" + s.replace("\\", "\\\\")
+                      .replace("\"", "\\\"")
+                      .replace("\n", "\\n")
+                      .replace("\r", "\\r")
+                      .replace("\t", "\\t") + "\"";
+    }
+}
diff --git a/tools/ghidra/compare-4jlibs.py b/tools/ghidra/compare-4jlibs.py
new file mode 100644
index 00000000..9b61e484
--- /dev/null
+++ b/tools/ghidra/compare-4jlibs.py
@@ -0,0 +1,632 @@
+"""Compare 4JLibs between two git refs.
+
+Extracts .lib files from both refs, parses their symbol tables (using dumpbin
+or direct ar-archive parsing), demangles MSVC symbols, and generates a
+structured diff report.
+
+Usage:
+    python compare-4jlibs.py [OLD_REF] [NEW_REF] [--filter PATTERN] [--no-demangle]
+
+Defaults:
+    OLD_REF = HEAD
+    NEW_REF = upstream/main
+
+Output:
+    tools/ghidra/output/report-<timestamp>/
+"""
+
+import argparse
+import json
+import os
+import re
+import struct
+import subprocess
+import sys
+import tempfile
+from collections import defaultdict
+from datetime import datetime
+from pathlib import Path
+
+
+# ---------------------------------------------------------------------------
+# Config
+# ---------------------------------------------------------------------------
+
+REPO_ROOT = Path(__file__).resolve().parent.parent.parent
+LIB_PATH = "Minecraft.Client/Windows64/4JLibs/libs"
+OUTPUT_BASE = Path(__file__).resolve().parent / "output"
+
+# MSVC tool discovery
+MSVC_SEARCH_PATHS = [
+    Path(r"C:\Program Files (x86)\Microsoft Visual Studio\18\BuildTools\VC\Tools\MSVC"),
+    Path(r"C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC"),
+    Path(r"C:\Program Files\Microsoft Visual Studio\2022\Professional\VC\Tools\MSVC"),
+    Path(r"C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Tools\MSVC"),
+]
+
+
+def find_msvc_tool(name):
+    """Find an MSVC tool (dumpbin.exe, undname.exe) in VS installations."""
+    for base in MSVC_SEARCH_PATHS:
+        if not base.exists():
+            continue
+        for version_dir in sorted(base.iterdir(), reverse=True):
+            tool = version_dir / "bin" / "Hostx64" / "x64" / name
+            if tool.exists():
+                return str(tool)
+    return None
+
+
+DUMPBIN = find_msvc_tool("dumpbin.exe")
+UNDNAME = find_msvc_tool("undname.exe")
+
+
+# ---------------------------------------------------------------------------
+# Symbol table parsing (direct, no external tools needed)
+# ---------------------------------------------------------------------------
+
+def parse_lib_symbols_direct(lib_path):
+    """Parse the first linker member of a .lib to get all public symbols."""
+    symbols = []
+    with open(lib_path, "rb") as f:
+        magic = f.read(8)
+        if magic != b"!<arch>\n":  # ar archive signature: exactly the 8 bytes read above
+            return symbols
+
+        # First linker member (big-endian)
+        header = f.read(60)
+        name = header[0:16].decode("ascii").strip()
+        size = int(header[48:58].decode("ascii").strip())
+
+        if name != "/":
+            return symbols
+
+        data = f.read(size)
+        num_symbols = struct.unpack(">I", data[0:4])[0]
+        offsets_end = 4 + num_symbols * 4
+        string_data = data[offsets_end:]
+
+        pos = 0
+        for _ in range(num_symbols):
+            end = string_data.find(b"\x00", pos)
+            if end == -1:
+                break
+            sym = string_data[pos:end].decode("ascii", errors="replace")
+            symbols.append(sym)
+            pos = end + 1
+
+    return symbols
+
+
+def parse_lib_symbols_dumpbin(lib_path):
+    """Use dumpbin /LINKERMEMBER to get symbols (more reliable for edge cases)."""
+    if not DUMPBIN:
+        return None
+
+    try:
+        result = subprocess.run(
+            [DUMPBIN, "/LINKERMEMBER:2", str(lib_path)],
+            capture_output=True, text=True, timeout=60
+        )
+        symbols = []
+        in_symbols = False
+        for line in result.stdout.splitlines():
+            line = line.strip()
+            if "public symbols" in line:
+                in_symbols = True
+                continue
+            if in_symbols and line:
+                # Format: " offset symbol_name"
+                parts = line.split(None, 1)
+                if len(parts) == 2 and all(c in "0123456789ABCDEFabcdef" for c in parts[0]):
+                    symbols.append(parts[1])
+                elif not line[0].isdigit():
+                    in_symbols = False
+        return symbols
+    except (subprocess.TimeoutExpired, FileNotFoundError):
+        return None
+
+
+def parse_lib_members(lib_path):
+    """Extract member (object file) names from a .lib archive."""
+    members = []
+    with open(lib_path, "rb") as f:
+        magic = f.read(8)
+        if magic != b"!<arch>\n":  # ar archive signature: exactly the 8 bytes read above
+            return members
+
+        long_names = b""
+
+        while True:
+            header = f.read(60)
+            if len(header) < 60:
+                break
+
+            raw_name = header[0:16].decode("ascii", errors="replace").rstrip()
+            size_str = header[48:58].decode("ascii").strip()
+            end_marker = header[58:60]
+
+            if end_marker != b"\x60\x0a":
+                break
+
+            size = int(size_str)
+
+            if raw_name == "/":
+                f.seek(size + (size % 2), 1)
+                continue
+            if raw_name == "//":
+                long_names = f.read(size)
+                if size % 2:
+                    f.read(1)
+                continue
+
+            name = raw_name
+            if name.startswith("/") and name[1:].isdigit():
+                offset = int(name[1:])
+                end = long_names.find(b"\x00", offset)
+                if end == -1:
+                    end = long_names.find(b"\n", offset)
+                if end == -1:
+                    end = len(long_names)
+                name = long_names[offset:end].decode("ascii", errors="replace").rstrip("/")
+
+            # Read first 2 bytes to check machine type
+            member_data = f.read(min(size, 2))
+            if len(member_data) >= 2:
+                machine = struct.unpack("<H", member_data)[0]  # little-endian unsigned 16-bit
+            else:
+                machine = 0  # member too short to carry a machine field
+            remaining = size - len(member_data)  # skip the rest of the member body
+            if remaining > 0:
+                f.seek(remaining, 1)
+            if size % 2:
+                f.read(1)
+
+            members.append({
+                "name": name,
+                "size": size,
+                "machine": f"0x{machine:04x}",
+                "is_ltcg": machine == 0x01f2,
+            })
+
+    return members
+
+
+# ---------------------------------------------------------------------------
+# Demangling
+# ---------------------------------------------------------------------------
+
+def demangle_symbols(mangled_symbols):
+    """Demangle MSVC-mangled symbols using undname.exe."""
+    if not UNDNAME or not mangled_symbols:
+        return {}
+
+    demangled = {}
+
+    # undname takes symbols as command-line arguments (not stdin).
+    # Output format:
+    #   Undecoration of :- "??0CProfile@@QAA@XZ"
+    #   is :- "public: __cdecl CProfile::CProfile(void)"
+    # Process in batches to avoid command line length limits.
+    batch_size = 100
+    for i in range(0, len(mangled_symbols), batch_size):
+        batch = mangled_symbols[i:i + batch_size]
+        try:
+            result = subprocess.run(
+                [UNDNAME] + batch,
+                capture_output=True, text=True, timeout=60
+            )
+            current_mangled = None
+            for line in result.stdout.splitlines():
+                line = line.strip()
+                if line.startswith('Undecoration of :- "'):
+                    current_mangled = line.split('"')[1]
+                elif line.startswith('is :- "') and current_mangled:
+                    dem = line.split('"')[1]
+                    demangled[current_mangled] = dem
+                    current_mangled = None
+        except (subprocess.TimeoutExpired, FileNotFoundError):
+            break
+
+    return demangled
+
+
+# ---------------------------------------------------------------------------
+# Classification
+# ---------------------------------------------------------------------------
+
+def classify_symbol(mangled, demangled=None):
+    """Classify a symbol into a category for organized reporting."""
+    name = demangled or mangled
+
+    # Filter out std:: library symbols
+    if "std::" in name or mangled.startswith("??_C@"):
+        return "std/compiler"
+
+    # Constructor/destructor
+    if mangled.startswith("??0"):
+        return "constructor"
+    if mangled.startswith("??1"):
+        return "destructor"
+
+    # Virtual function table (tested before the generic "??" prefix, which would otherwise shadow "??_7")
+    if mangled.startswith("??_7") or "vftable" in name.lower():
+        return "vtable"
+
+    # Operators
+    if mangled.startswith("??"):
+        return "operator"
+
+    # Static data
+    if mangled.startswith("?_") and "@" in mangled:
+        return "static_data"
+
+    # Check class membership
+    if "@C_4J" in mangled or "@C_4j" in mangled:
+        return "4j_interface"
+
+    for prefix in ["CAwardManager", "CProfile", "CProfileData", "CRichPresence",
+                   "CSys", "CStorage", "CInput", "CRender", "CRenderer"]:
+        if f"@{prefix}@@" in mangled or f"@{prefix}@" in mangled:
+            return "4j_class"
+
+    return "other"
+
+
+def extract_class_name(mangled):
+    """Try to extract the class name from a mangled symbol."""
+    # Pattern: ?Method@ClassName@@...
+    m = re.match(r"\?\??\d?(\w+)@(\w+)@@", mangled)
+    if m:
+        return m.group(2)
+
+    m = re.match(r"\?(\w+)@(\w+)@@", mangled)
+    if m:
+        return m.group(2)
+
+    return None
+
+
+# ---------------------------------------------------------------------------
+# Git operations
+# ---------------------------------------------------------------------------
+
+def git_extract_lib(ref, lib_rel_path, output_path):
+    """Extract a file from a git ref to a local path."""
+    try:
+        result = subprocess.run(
+            ["git", "cat-file", "-e", f"{ref}:{lib_rel_path}"],
+            capture_output=True, cwd=str(REPO_ROOT)
+        )
+        if result.returncode != 0:
+            return False
+
+        result = subprocess.run(
+            ["git", "show", f"{ref}:{lib_rel_path}"],
+            capture_output=True, cwd=str(REPO_ROOT)
+        )
+        if result.returncode != 0:
+            return False
+
+        os.makedirs(os.path.dirname(output_path), exist_ok=True)
+        with open(output_path, "wb") as f:
+            f.write(result.stdout)
+        return True
+    except Exception as e:
+        print(f" WARNING: Failed to extract {lib_rel_path} from {ref}: {e}", file=sys.stderr)
+        return False
+
+
+def git_changed_libs(old_ref, new_ref):
+    """Get list of .lib files that changed between two refs."""
+    result = subprocess.run(
+        ["git", "diff", "--name-only", old_ref, new_ref, "--", f"{LIB_PATH}/*.lib"],
+        capture_output=True, text=True, cwd=str(REPO_ROOT)
+    )
+    if result.returncode != 0 or not result.stdout.strip():
+        # Fallback: list all libs at new ref
+        result = subprocess.run(
+            ["git", "ls-tree", "--name-only", "-r", new_ref, "--", f"{LIB_PATH}/"],
+            capture_output=True, text=True, cwd=str(REPO_ROOT)
+        )
+    return [l for l in result.stdout.strip().splitlines() if l.endswith(".lib")]
+
+
+# ---------------------------------------------------------------------------
+# Report generation
+# ---------------------------------------------------------------------------
+
+def generate_lib_report(lib_name, old_syms, new_syms, old_demangled, new_demangled,
+                        old_members, new_members, old_size, new_size):
+    """Generate a detailed comparison report for one library."""
+    lines = []
+    lines.append(f"{'=' * 70}")
+    lines.append(f" {lib_name}")
+    lines.append(f"{'=' * 70}")
+    lines.append("")
+
+    # Status
+    if old_syms is None and new_syms is not None:
+        lines.append("STATUS: ADDED (new library)")
+    elif old_syms is not None and new_syms is None:
+        lines.append("STATUS: DELETED")
+    else:
+        lines.append("STATUS: MODIFIED")
+    lines.append("")
+
+    # Size
+    if old_size and new_size:
+        delta = new_size - old_size
+        pct = int(delta * 100 / old_size) if old_size else 0  # truncate toward zero; // floors -0.1% to -1%
+        sign = "+" if delta > 0 else ""
+        lines.append(f"SIZE: {old_size:,} -> {new_size:,} bytes ({sign}{delta:,}, {sign}{pct}%)")
+    elif new_size:
+        lines.append(f"SIZE: (new) {new_size:,} bytes")
+    elif old_size:
+        lines.append(f"SIZE: {old_size:,} bytes (deleted)")
+    lines.append("")
+
+    # Members
+    if old_members or new_members:
+        old_member_names = {m["name"] for m in (old_members or [])}
+        new_member_names = {m["name"] for m in (new_members or [])}
+        lines.append(f"OBJECT FILES: {len(old_member_names)} -> {len(new_member_names)}")
+        added_m = new_member_names - old_member_names
+        removed_m = old_member_names - new_member_names
+        if added_m:
+            lines.append(f" + Added: {', '.join(sorted(added_m))}")
+        if removed_m:
+            lines.append(f" - Removed: {', '.join(sorted(removed_m))}")
+        lines.append("")
+
+    old_set = set(old_syms or [])
+    new_set = set(new_syms or [])
+
+    # Filter out std/compiler symbols for the main diff
+    old_user = {s for s in old_set if classify_symbol(s) not in ("std/compiler",)}
+    new_user = {s for s in new_set if classify_symbol(s) not in ("std/compiler",)}
+
+    old_std = old_set - old_user
+    new_std = new_set - new_user
+
+    lines.append(f"SYMBOLS: {len(old_set)} -> {len(new_set)} total")
+    lines.append(f" User symbols: {len(old_user)} -> {len(new_user)}")
+    lines.append(f" Std/compiler: {len(old_std)} -> {len(new_std)}")
+    lines.append("")
+
+    # Added symbols (grouped by class)
+    added = sorted(new_user - old_user)
+    removed = sorted(old_user - new_user)
+    unchanged = old_user & new_user
+
+    if added:
+        lines.append(f"+++ ADDED SYMBOLS ({len(added)}) +++")
+        by_class = defaultdict(list)
+        for s in added:
+            cls = extract_class_name(s) or "(global)"
+            d = new_demangled.get(s, s)
+            by_class[cls].append(d)
+        for cls in sorted(by_class.keys()):
+            lines.append(f" [{cls}]")
+            for d in sorted(by_class[cls]):
+                lines.append(f" + {d}")
+        lines.append("")
+
+    if removed:
+        lines.append(f"--- REMOVED SYMBOLS ({len(removed)}) ---")
+        by_class = defaultdict(list)
+        for s in removed:
+            cls = extract_class_name(s) or "(global)"
+            d = old_demangled.get(s, s)
+            by_class[cls].append(d)
+        for cls in sorted(by_class.keys()):
+            lines.append(f" [{cls}]")
+            for d in sorted(by_class[cls]):
+                lines.append(f" - {d}")
+        lines.append("")
+
+    lines.append(f"UNCHANGED: {len(unchanged)} symbols")
+    lines.append("")
+
+    return "\n".join(lines)
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+def main():
+    parser = argparse.ArgumentParser(description="Compare 4JLibs between git refs")
+    parser.add_argument("old_ref", nargs="?", default="HEAD", help="Old git ref (default: HEAD)")
+    parser.add_argument("new_ref", nargs="?", default="upstream/main", help="New git ref (default: upstream/main)")
+    parser.add_argument("--filter", "-f", default="", help="Only compare libs matching this pattern")
+    parser.add_argument("--no-demangle", action="store_true", help="Skip demangling")
+    parser.add_argument("--json", action="store_true", help="Also output JSON data")
+    args = parser.parse_args()
+
+    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+    report_dir = OUTPUT_BASE / f"report-{timestamp}"
+    report_dir.mkdir(parents=True, exist_ok=True)
+
+    print("=" * 56)
+    print(" 4JLibs Comparison Tool")
+    print("=" * 56)
+    print(f" Old ref: {args.old_ref}")
+    print(f" New ref: {args.new_ref}")
+    print(f" Filter: {args.filter or '<none>'}")
+    print(f" Demangle: {not args.no_demangle}")
+    print(f" dumpbin: {'found' if DUMPBIN else 'not found (using direct parsing)'}")
+    print(f" undname: {'found' if UNDNAME else 'not found (no demangling)'}")
+    print(f" Output: {report_dir}")
+    print()
+
+    # Step 1: Find changed libs
+    print("[1/4] Finding changed libraries...")
+    changed_libs = git_changed_libs(args.old_ref, args.new_ref)
+    if args.filter:
+        changed_libs = [l for l in changed_libs if args.filter in os.path.basename(l)]
+
+    if not changed_libs:
+        print(" No matching .lib changes found.")
+        return
+
+    for lib in changed_libs:
+        print(f" {os.path.basename(lib)}")
+    print()
+
+    # Step 2: Extract libs from git
+    print("[2/4] Extracting libraries from git...")
+    old_dir = report_dir / "old"
+    new_dir = report_dir / "new"
+
+    lib_pairs = {} # name -> (old_path, new_path)
+    for lib_rel in changed_libs:
+        name = os.path.basename(lib_rel).replace(".lib", "")
+        old_path = old_dir / f"{name}.lib"
+        new_path = new_dir / f"{name}.lib"
+
+        old_ok = git_extract_lib(args.old_ref, lib_rel, str(old_path))
+        new_ok = git_extract_lib(args.new_ref, lib_rel, str(new_path))
+
+        old_size = old_path.stat().st_size if old_ok else None
+        new_size = new_path.stat().st_size if new_ok else None
+
+        print(f" {name}: old={'found' if old_ok else 'N/A'} new={'found' if new_ok else 'N/A'}")
+        lib_pairs[name] = (
+            str(old_path) if old_ok else None,
+            str(new_path) if new_ok else None,
+            old_size, new_size
+        )
+    print()
+
+    # Step 3: Parse symbols and generate diffs
+    print("[3/4] Parsing symbols...")
+    all_reports = []
+    json_data = {}
+
+    all_mangled_to_demangle = set()
+
+    for name, (old_path, new_path, old_size, new_size) in sorted(lib_pairs.items()):
+        print(f" Parsing {name}...")
+
+        old_syms = None
+        new_syms = None
+        old_members = None
+        new_members = None
+
+        if old_path:
+            old_syms = parse_lib_symbols_dumpbin(old_path) or parse_lib_symbols_direct(old_path)
+            old_members = parse_lib_members(old_path)
+            print(f" Old: {len(old_syms)} symbols, {len(old_members)} objects")
+
+        if new_path:
+            new_syms = parse_lib_symbols_dumpbin(new_path) or parse_lib_symbols_direct(new_path)
+            new_members = parse_lib_members(new_path)
+            print(f" New: {len(new_syms)} symbols, {len(new_members)} objects")
+
+        # Collect symbols needing demangling
+        if not args.no_demangle:
+            if old_syms:
+                all_mangled_to_demangle.update(old_syms)
+            if new_syms:
+                all_mangled_to_demangle.update(new_syms)
+
+        lib_pairs[name] = (old_path, new_path, old_size, new_size,
+                           old_syms, new_syms, old_members, new_members)
+    print()
+
+    # Step 3b: Batch demangle
+    old_demangled = {}
+    new_demangled = {}
+    if not args.no_demangle and all_mangled_to_demangle:
+        print(f" Demangling {len(all_mangled_to_demangle)} unique symbols...")
+        all_demangled = demangle_symbols(sorted(all_mangled_to_demangle))
+        print(f" Demangled {len(all_demangled)} symbols")
+        old_demangled = all_demangled
+        new_demangled = all_demangled
+        print()
+
+    # Step 4: Generate reports
+    print("[4/4] Generating reports...")
+
+    for name in sorted(lib_pairs.keys()):
+        entry = lib_pairs[name]
+        old_path, new_path, old_size, new_size = entry[0], entry[1], entry[2], entry[3]
+        old_syms, new_syms, old_members, new_members = entry[4], entry[5], entry[6], entry[7]
+
+        report = generate_lib_report(
+            name, old_syms, new_syms, old_demangled, new_demangled,
+            old_members, new_members, old_size, new_size
+        )
+        all_reports.append(report)
+
+        # Write individual report
+        diff_dir = report_dir / "diff"
+        diff_dir.mkdir(exist_ok=True)
+        (diff_dir / f"{name}.txt").write_text(report, encoding="utf-8")
+
+        if args.json:
+            json_data[name] = {
+                "old_size": old_size,
+                "new_size": new_size,
+                "old_symbol_count": len(old_syms) if old_syms else 0,
+                "new_symbol_count": len(new_syms) if new_syms else 0,
+                "added": sorted(set(new_syms or []) - set(old_syms or [])),
+                "removed": sorted(set(old_syms or []) - set(new_syms or [])),
+                "old_members": old_members,
+                "new_members": new_members,
+            }
+
+    # Write combined report
+    summary = []
+    summary.append("=" * 70)
+    summary.append(" 4JLibs Comparison Report")
+    summary.append("=" * 70)
+    summary.append(f" Old ref: {args.old_ref}")
+    summary.append(f" New ref: {args.new_ref}")
+    summary.append(f" Generated: {datetime.now().isoformat()}")
+    summary.append("")
+    summary.append("-" * 70)
+    summary.append(" Quick Summary")
+    summary.append("-" * 70)
+
+    for name in sorted(lib_pairs.keys()):
+        entry = lib_pairs[name]
+        old_syms, new_syms = entry[4], entry[5]
+        old_set = set(old_syms or [])
+        new_set = set(new_syms or [])
+        added = len(new_set - old_set)
+        removed = len(old_set - new_set)
+
+        if old_syms is None:
+            status = "ADDED"
+        elif new_syms is None:
+            status = "DELETED"
+        else:
+            status = "MODIFIED"
+
+        summary.append(f" {name:30s} {status:10s} +{added} -{removed} symbols")
+
+    summary.append("")
+    summary.append("=" * 70)
+    summary.append("")
+
+    full_report = "\n".join(summary) + "\n\n" + "\n\n".join(all_reports)
+    summary_path = report_dir / "summary.txt"
+    summary_path.write_text(full_report, encoding="utf-8")
+
+    if args.json:
+        json_path = report_dir / "data.json"
+        json_path.write_text(json.dumps(json_data, indent=2), encoding="utf-8")
+
+    print()
+    print("\n".join(summary))
+    print()
+    print(f"Full report: {summary_path}")
+    print(f"Per-lib diffs: {report_dir / 'diff'}")
+    if args.json:
+        print(f"JSON data: {report_dir / 'data.json'}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/ghidra/compare-4jlibs.sh b/tools/ghidra/compare-4jlibs.sh
new file mode 100644
index 00000000..88b7279a
--- /dev/null
+++ b/tools/ghidra/compare-4jlibs.sh
@@ -0,0 +1,344 @@
+#!/usr/bin/env bash
+# compare-4jlibs.sh - Compare 4JLibs between two git refs using Ghidra headless analysis.
+#
+# Extracts .lib files from both refs, runs Ghidra headless to export symbols/functions,
+# then generates a structured diff report.
+#
+# Usage:
+#   ./tools/ghidra/compare-4jlibs.sh [OLD_REF] [NEW_REF] [LIB_FILTER]
+#
+# Arguments:
+#   OLD_REF    - Git ref for the old version (default: HEAD)
+#   NEW_REF    - Git ref for the new version (default: upstream/main)
+#   LIB_FILTER - Optional: only compare libs matching this pattern (e.g. "4J_Input")
+#
+# Environment:
+#   GHIDRA_HOME - Path to Ghidra installation
+#                 (default: C:/Users/revela/Documents/Minecraft/Libraries/ghidra_12.0.4_PUBLIC)
+#
+# Output:
+#   tools/ghidra/output/report-<timestamp>/
+#     old/        - Extracted old .lib files
+#     new/        - Extracted new .lib files
+#     analysis/   - Ghidra JSON exports (old_*.json, new_*.json)
+#     diff/       - Per-library diff reports
+#     summary.txt - Overall summary of changes
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+GHIDRA_HOME="${GHIDRA_HOME:-C:/Users/revela/Documents/Minecraft/Libraries/ghidra_12.0.4_PUBLIC}"
+HEADLESS="$GHIDRA_HOME/support/analyzeHeadless"
+
+OLD_REF="${1:-HEAD}"
+NEW_REF="${2:-upstream/main}"
+LIB_FILTER="${3:-}"
+
+TIMESTAMP=$(date +%Y%m%d-%H%M%S)
+OUTPUT_DIR="$SCRIPT_DIR/output/report-$TIMESTAMP"
+OLD_DIR="$OUTPUT_DIR/old"
+NEW_DIR="$OUTPUT_DIR/new"
+ANALYSIS_DIR="$OUTPUT_DIR/analysis"
+DIFF_DIR="$OUTPUT_DIR/diff"
+PROJECT_DIR="$OUTPUT_DIR/ghidra-projects"
+
+mkdir -p "$OLD_DIR" "$NEW_DIR" "$ANALYSIS_DIR" "$DIFF_DIR" "$PROJECT_DIR"
+
+LIB_PATH="Minecraft.Client/Windows64/4JLibs/libs"
+
+echo "============================================"
+echo " 4JLibs Comparison Tool (Ghidra Headless)"
+echo "============================================"
+echo ""
+echo " Old ref: $OLD_REF"
+echo " New ref: $NEW_REF"
+echo " Filter: ${LIB_FILTER:-<none>}"
+echo " Output: $OUTPUT_DIR"
+echo " Ghidra: $GHIDRA_HOME"
+echo ""
+
+# -------------------------------------------------------
+# Step 1: Extract .lib files from both git refs
+# -------------------------------------------------------
+echo "[1/4] Extracting .lib files from git..."
+
+cd "$REPO_ROOT"
+
+# Get list of .lib files that changed between the two refs
+CHANGED_LIBS=$(git diff --name-only "$OLD_REF" "$NEW_REF" -- "$LIB_PATH/*.lib" 2>/dev/null || true)
+
+if [ -z "$CHANGED_LIBS" ]; then
+    echo " No .lib file changes found between $OLD_REF and $NEW_REF"
+    echo " Falling back to listing all libs at $NEW_REF..."
+    CHANGED_LIBS=$(git ls-tree --name-only -r "$NEW_REF" -- "$LIB_PATH/" | grep '\.lib$' || true)
+fi
+
+if [ -z "$CHANGED_LIBS" ]; then
+    echo "ERROR: No .lib files found."
+    exit 1
+fi
+
+echo " Changed libraries:"
+for lib in $CHANGED_LIBS; do
+    basename "$lib"
+    LIBNAME=$(basename "$lib" .lib)
+
+    # Apply filter if specified
+    if [ -n "$LIB_FILTER" ] && [[ "$LIBNAME" != *"$LIB_FILTER"* ]]; then
+        continue
+    fi
+
+    # Extract old version (may not exist if newly added)
+    if git cat-file -e "$OLD_REF:$lib" 2>/dev/null; then
+        git show "$OLD_REF:$lib" > "$OLD_DIR/$LIBNAME.lib"
+        echo " old: extracted $LIBNAME.lib ($(wc -c < "$OLD_DIR/$LIBNAME.lib") bytes)"
+    else
+        echo " old: $LIBNAME.lib does not exist at $OLD_REF"
+    fi
+
+    # Extract new version (may not exist if deleted)
+    if git cat-file -e "$NEW_REF:$lib" 2>/dev/null; then
+        git show "$NEW_REF:$lib" > "$NEW_DIR/$LIBNAME.lib"
+        echo " new: extracted $LIBNAME.lib ($(wc -c < "$NEW_DIR/$LIBNAME.lib") bytes)"
+    else
+        echo " new: $LIBNAME.lib does not exist at $NEW_REF (deleted)"
+    fi
+done
+echo ""
+
+# -------------------------------------------------------
+# Step 2: Run Ghidra headless analysis on each .lib
+# -------------------------------------------------------
+echo "[2/4] Running Ghidra headless analysis..."
+
+analyze_lib() {
+    local lib_file="$1"
+    local label="$2" # "old" or "new"
+    local libname
+    libname=$(basename "$lib_file" .lib)
+    local out_json="$ANALYSIS_DIR/${label}_${libname}.json"
+    local proj_dir="$PROJECT_DIR/${label}_${libname}"
+
+    mkdir -p "$proj_dir"
+
+    echo " Analyzing ${label}/${libname}.lib ..."
+
+    # Run Ghidra headless: import the .lib, analyze, run our export script, then delete the project
+    "$HEADLESS" "$proj_dir" "proj" \
+        -import "$lib_file" \
+        -postScript ExportLibInfo.java "$out_json" \
+        -scriptPath "$SCRIPT_DIR" \
+        -deleteProject \
+        -analysisTimeoutPerFile 300 \
+        -max-cpu 4 \
+        > "$ANALYSIS_DIR/${label}_${libname}_ghidra.log" 2>&1 || {
+        echo " WARNING: Ghidra analysis had issues for ${label}/${libname}. Check log."
+    }
+
+    if [ -f "$out_json" ]; then
+        local func_count
+        func_count=$(grep -c '"name"' "$out_json" 2>/dev/null || true) # grep -c already prints 0; it only exits 1
+        echo " Done: $out_json ($func_count entries)"
+    else
+        echo " WARNING: No output generated for ${label}/${libname}"
+    fi
+}
+
+# Analyze old libs
+for lib_file in "$OLD_DIR"/*.lib; do
+    [ -f "$lib_file" ] || continue
+    analyze_lib "$lib_file" "old"
+done
+
+# Analyze new libs
+for lib_file in "$NEW_DIR"/*.lib; do
+    [ -f "$lib_file" ] || continue
+    analyze_lib "$lib_file" "new"
+done
+echo ""
+
+# -------------------------------------------------------
+# Step 3: Generate diff reports
+# -------------------------------------------------------
+echo "[3/4] Generating diff reports..."
+
+generate_diff() {
+    local libname="$1"
+    local old_json="$ANALYSIS_DIR/old_${libname}.json"
+    local new_json="$ANALYSIS_DIR/new_${libname}.json"
+    local diff_file="$DIFF_DIR/${libname}.diff.txt"
+
+    echo " Diffing $libname..."
+    echo "=== $libname ===" > "$diff_file"
+    echo "" >> "$diff_file"
+
+    # Handle deleted libs
+    if [ ! -f "$new_json" ]; then
+        echo "STATUS: DELETED (library removed in new version)" >> "$diff_file"
+        echo "" >> "$diff_file"
+        if [ -f "$old_json" ]; then
+            echo "--- Functions that were in old version ---" >> "$diff_file"
+            grep '"name"' "$old_json" | head -200 >> "$diff_file" || true # no match / closed pipe must not trip pipefail
+        fi
+        return
+    fi
+
+    # Handle newly added libs
+    if [ ! -f "$old_json" ]; then
+        echo "STATUS: ADDED (library is new in new version)" >> "$diff_file"
+        echo "" >> "$diff_file"
+        echo "--- Functions in new version ---" >> "$diff_file"
+        grep '"name"' "$new_json" | head -200 >> "$diff_file" || true # no match / closed pipe must not trip pipefail
+        return
+    fi
+
+    # Both exist - compare
+    echo "STATUS: MODIFIED" >> "$diff_file"
+    echo "" >> "$diff_file"
+
+    # Extract function names from each
+    local old_funcs new_funcs
+    old_funcs=$(mktemp)
+    new_funcs=$(mktemp)
+
+    grep -oP '"name"\s*:\s*"[^"]*"' "$old_json" | sort -u > "$old_funcs" || true # an export with no names is not fatal
+    grep -oP '"name"\s*:\s*"[^"]*"' "$new_json" | sort -u > "$new_funcs" || true # an export with no names is not fatal
+
+    local old_count new_count
+    old_count=$(wc -l < "$old_funcs")
+    new_count=$(wc -l < "$new_funcs")
+
+    echo "Old function/symbol count: $old_count" >> "$diff_file"
+    echo "New function/symbol count: $new_count" >> "$diff_file"
+    echo "" >> "$diff_file"
+
+    # Functions only in old (removed)
+    local removed
+    removed=$(comm -23 "$old_funcs" "$new_funcs")
+    if [ -n "$removed" ]; then
+        echo "--- REMOVED (in old, not in new) ---" >> "$diff_file"
+        echo "$removed" >> "$diff_file"
+        echo "" >> "$diff_file"
+    fi
+
+    # Functions only in new (added)
+    local added
+    added=$(comm -13 "$old_funcs" "$new_funcs")
+    if [ -n "$added" ]; then
+        echo "+++ ADDED (in new, not in old) +++" >> "$diff_file"
+        echo "$added" >> "$diff_file"
+        echo "" >> "$diff_file"
+    fi
+
+    # Functions in both (check for signature changes)
+    local common
+    common=$(comm -12 "$old_funcs" "$new_funcs")
+    if [ -n "$common" ]; then
+        local common_count
+        common_count=$(echo "$common" | wc -l)
+        echo "=== UNCHANGED names: $common_count ===" >> "$diff_file"
+        echo "(Signature changes require detailed Ghidra comparison)" >> "$diff_file"
+        echo "" >> "$diff_file"
+    fi
+
+    # Extract signatures for comparison
+    local old_sigs new_sigs
+    old_sigs=$(mktemp)
+    new_sigs=$(mktemp)
+
+    grep -oP '"signature"\s*:\s*"[^"]*"' "$old_json" | sort -u > "$old_sigs" 2>/dev/null || true
+    grep -oP '"signature"\s*:\s*"[^"]*"' "$new_json" | sort -u > "$new_sigs" 2>/dev/null || true
+
+    local sig_removed sig_added
+    sig_removed=$(comm -23 "$old_sigs" "$new_sigs" 2>/dev/null || true)
+    sig_added=$(comm -13 "$old_sigs" "$new_sigs" 2>/dev/null || true)
+
+    if [ -n "$sig_removed" ] || [ -n "$sig_added" ]; then
+        echo "--- SIGNATURE CHANGES ---" >> "$diff_file"
+        if [ -n "$sig_removed" ]; then
+            echo " Old signatures no longer present:" >> "$diff_file"
+            echo "$sig_removed" >> "$diff_file"
+        fi
+        if [ -n "$sig_added" ]; then
+            echo " New signatures:" >> "$diff_file"
+            echo "$sig_added" >> "$diff_file"
+        fi
+        echo "" >> "$diff_file"
+    fi
+
+    # Size comparison
+    if [ -f "$OLD_DIR/${libname}.lib" ] && [ -f "$NEW_DIR/${libname}.lib" ]; then
+        local old_size new_size
+        old_size=$(wc -c < "$OLD_DIR/${libname}.lib")
+        new_size=$(wc -c < "$NEW_DIR/${libname}.lib")
+        local delta=$((new_size - old_size))
+        local pct=0
+        if [ "$old_size" -gt 0 ]; then
+            pct=$(( (delta * 100) / old_size ))
+        fi
+        echo "SIZE: $old_size -> $new_size bytes (${delta:+$delta} bytes, ${pct}%)" >> "$diff_file"
+    fi
+
+    rm -f "$old_funcs" "$new_funcs" "$old_sigs" "$new_sigs"
+}
+
+# Get unique lib names across old and new
+ALL_LIBS=$(cd "$OUTPUT_DIR" && (ls old/*.lib new/*.lib 2>/dev/null || true) | xargs -I{} basename {} .lib | sort -u)
+
+for libname in $ALL_LIBS; do
+    generate_diff "$libname"
+done
+echo ""
+
+# -------------------------------------------------------
+# Step 4: Generate summary
+# -------------------------------------------------------
+echo "[4/4] Generating summary..."
+
+# Build the human-readable summary: one entry per library with its STATUS,
+# its SIZE line (if any) and which added/removed sections its diff contains.
+SUMMARY="$OUTPUT_DIR/summary.txt"
+{
+    echo "============================================"
+    echo " 4JLibs Comparison Report"
+    echo "============================================"
+    echo ""
+    echo " Old ref: $OLD_REF"
+    echo " New ref: $NEW_REF"
+    echo " Generated: $(date)"
+    echo ""
+    echo "--------------------------------------------"
+    echo " Library Status"
+    echo "--------------------------------------------"
+
+    for libname in $ALL_LIBS; do
+        local_diff="$DIFF_DIR/${libname}.diff.txt"
+        if [ -f "$local_diff" ]; then
+            status=$(grep "^STATUS:" "$local_diff" | head -1 | cut -d: -f2 | xargs || true)
+            size_line=$(grep "^SIZE:" "$local_diff" | head -1 || echo "")
+            echo ""
+            echo " $libname: $status"
+            if [ -n "$size_line" ]; then
+                echo " $size_line"
+            fi
+
+            # Count added/removed sections.
+            # - grep -c already prints "0" when nothing matches (and exits 1),
+            #   so the fallback must not echo another "0": that produced the
+            #   two-line value "0\n0" and broke the -gt tests below.
+            # - Match the exact section headers; a bare '^---' also counted
+            #   the "--- SIGNATURE CHANGES ---" and "--- Functions ..." lines.
+            added_count=$(grep -c '^+++ ADDED' "$local_diff" 2>/dev/null || true)
+            removed_count=$(grep -c '^--- REMOVED' "$local_diff" 2>/dev/null || true)
+            added_count=${added_count:-0}
+            removed_count=${removed_count:-0}
+            if [ "$added_count" -gt 0 ] || [ "$removed_count" -gt 0 ]; then
+                echo " Sections: +$added_count added, -$removed_count removed"
+            fi
+        fi
+    done
+
+    echo ""
+    echo "--------------------------------------------"
+    echo " Detailed reports in: $DIFF_DIR/"
+    echo " Raw Ghidra JSON in: $ANALYSIS_DIR/"
+    echo " Ghidra logs in: $ANALYSIS_DIR/*_ghidra.log"
+    echo "--------------------------------------------"
+} > "$SUMMARY"
+
+cat "$SUMMARY"
+
+echo ""
+echo "Done. Full report: $OUTPUT_DIR"
diff --git a/tools/ghidra/extract_lib.py b/tools/ghidra/extract_lib.py
new file mode 100644
index 00000000..1aa6fbea
--- /dev/null
+++ b/tools/ghidra/extract_lib.py
@@ -0,0 +1,104 @@
+"""Extract .obj members from a COFF .lib (ar archive) into a directory.
+
+Usage:
+    python extract_lib.py <lib_path> <out_dir>
+
+Each member is written as <out_dir>/<name>.obj. The first/second linker
+members and the long-name string table are skipped.
+"""
+
+import os
+import sys
+
+
+def extract_lib(lib_path, out_dir):
+    """Extract every object member of the archive at lib_path into out_dir.
+
+    out_dir is created if it does not exist. Members named "/" (linker
+    members) and "//" (long-name string table) are not written. Member names
+    are sanitized for the filesystem and given a ".obj" suffix; a name that
+    already exists in out_dir gets a "_<counter>" suffix instead of being
+    overwritten.
+
+    Returns the number of files written. Calls sys.exit(1) if the file does
+    not start with the ar signature. A malformed member header stops the
+    scan with a warning; members read up to that point are still written.
+    """
+    os.makedirs(out_dir, exist_ok=True)
+
+    with open(lib_path, "rb") as f:
+        # The archive signature is exactly the 8 bytes read here.
+        magic = f.read(8)
+        if magic != b"!<arch>\n":
+            print(f"ERROR: Not an ar archive: {lib_path}", file=sys.stderr)
+            sys.exit(1)
+
+        # Long-name string table, filled in when the "//" member is seen
+        long_names = b""
+        members = []
+
+        while True:
+            pos = f.tell()
+            header = f.read(60)
+            if len(header) < 60:
+                break
+
+            raw_name = header[0:16]
+            size_str = header[48:58].decode("ascii", errors="replace").strip()
+            end_marker = header[58:60]
+
+            if end_marker != b"\x60\x0a":
+                print(f"WARNING: Bad end marker at offset {pos}, stopping.", file=sys.stderr)
+                break
+
+            # A non-numeric size field means the header is corrupt; stop the
+            # scan the same way as for a bad end marker instead of crashing.
+            try:
+                size = int(size_str)
+            except ValueError:
+                print(f"WARNING: Bad member size {size_str!r} at offset {pos}, stopping.", file=sys.stderr)
+                break
+            data = f.read(size)
+
+            # Pad to even boundary
+            if size % 2 != 0:
+                f.read(1)
+
+            name = raw_name.decode("ascii", errors="replace").rstrip()
+
+            # Skip first and second linker members (both named "/")
+            if name == "/":
+                continue
+
+            # Long-name string table
+            if name == "//":
+                long_names = data
+                continue
+
+            if name.startswith("/") and name[1:].isdigit():
+                # Long name reference like "/26": offset into the string table
+                offset = int(name[1:])
+                end = long_names.find(b"\x00", offset)
+                if end == -1:
+                    # No NUL terminator: fall back to a newline-terminated entry
+                    end = long_names.find(b"\n", offset)
+                if end == -1:
+                    end = len(long_names)
+                name = long_names[offset:end].decode("ascii", errors="replace").rstrip("/")
+            else:
+                # Short names are stored inline as "name/". Drop the "/"
+                # terminator, otherwise it is turned into "_" below and
+                # "foo.obj/" is written as "foo.obj_.obj".
+                name = name.rstrip("/")
+
+            # Clean the name for filesystem use
+            safe_name = name.replace("/", "_").replace("\\", "_").replace("..", "_")
+            if not safe_name.endswith(".obj"):
+                safe_name += ".obj"
+
+            members.append((safe_name, data))
+
+    # Write members
+    written = 0
+    for safe_name, data in members:
+        out_path = os.path.join(out_dir, safe_name)
+
+        # Handle duplicate names by appending a counter
+        if os.path.exists(out_path):
+            base, ext = os.path.splitext(safe_name)
+            counter = 2
+            while os.path.exists(os.path.join(out_dir, f"{base}_{counter}{ext}")):
+                counter += 1
+            out_path = os.path.join(out_dir, f"{base}_{counter}{ext}")
+
+        with open(out_path, "wb") as out_f:
+            out_f.write(data)
+        written += 1
+
+    print(f"Extracted {written} object files from {os.path.basename(lib_path)} -> {out_dir}")
+    return written
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        print(f"Usage: {sys.argv[0]} <lib_path> <out_dir>", file=sys.stderr)
+        sys.exit(1)
+
+    extract_lib(sys.argv[1], sys.argv[2])
diff --git a/tools/ghidra/list-lib-symbols.sh b/tools/ghidra/list-lib-symbols.sh
new file mode 100644
index 00000000..464df06f
--- /dev/null
+++ b/tools/ghidra/list-lib-symbols.sh
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+# list-lib-symbols.sh - Quick symbol listing for a single .lib file using Ghidra headless.
+#
+# Usage:
+#   ./tools/ghidra/list-lib-symbols.sh <path-to-lib> [output.json]
+#
+# If no output path is given, writes to tools/ghidra/output/<libname>.json
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+GHIDRA_HOME="${GHIDRA_HOME:-C:/Users/revela/Documents/Minecraft/Libraries/ghidra_12.0.4_PUBLIC}"
+HEADLESS="$GHIDRA_HOME/support/analyzeHeadless"
+
+LIB_FILE="${1:?Usage: list-lib-symbols.sh <path-to-lib> [output.json]}"
+LIBNAME=$(basename "$LIB_FILE" .lib)
+
+OUTPUT="${2:-$SCRIPT_DIR/output/${LIBNAME}.json}"
+mkdir -p "$(dirname "$OUTPUT")"
+
+PROJ_DIR=$(mktemp -d)
+# Remove the scratch project on every exit path; with "set -e" a failing
+# analyzeHeadless run would otherwise exit before any cleanup.
+trap 'rm -rf "$PROJ_DIR"' EXIT
+
+echo "Analyzing $LIB_FILE ..."
+echo " Output: $OUTPUT"
+
+"$HEADLESS" "$PROJ_DIR" "proj" \
+    -import "$LIB_FILE" \
+    -postScript ExportLibInfo.java "$OUTPUT" \
+    -scriptPath "$SCRIPT_DIR" \
+    -deleteProject \
+    -analysisTimeoutPerFile 300 \
+    -max-cpu 4 \
+    2>&1 | tail -5
+
+if [ -f "$OUTPUT" ]; then
+    # grep -c prints "0" itself when nothing matches (exit status 1), so the
+    # fallback must not echo a second "0".
+    func_count=$(grep -c '"signature"' "$OUTPUT" 2>/dev/null || true)
+    echo ""
+    echo "Done. ${func_count:-0} function entries exported to $OUTPUT"
+else
+    echo "ERROR: No output was generated."
+    exit 1
+fi
diff --git a/tools/ghidra/output/.gitkeep b/tools/ghidra/output/.gitkeep
new file mode 100644
index 00000000..e69de29b