From e0e249526ed9f03737a13aefcd2de9f6761d7406 Mon Sep 17 00:00:00 2001
From: Illidan
Date: Fri, 24 Jan 2025 09:12:55 +0100
Subject: [PATCH] Added compression support for the ARCH format.

---
 README.md             |   2 +-
 hacktools/__init__.py |   2 +-
 hacktools/arch.py     | 228 ++++++++++++++++++++++++++++++++----------
 3 files changed, 177 insertions(+), 55 deletions(-)

diff --git a/README.md b/README.md
index 1b80562..4f59568 100644
--- a/README.md
+++ b/README.md
@@ -46,7 +46,7 @@ External dependencies not included are marked as `(through *dependency*)`
 - BRFNT fonts (through [brfnt2tpl](https://wiki.tockdom.com/wiki/Brfnt2tpl) and [wimgt](https://szs.wiimm.de))
 ### Other / Generic
 - CPK archives
-- ARCH archives
+- ARCH archives (including compression/decompression)
 - LZ10, LZ11, Huffman, CRILAYLA and RACJIN compression/decompression
 - ARM/MIPS binary patching
 - xdelta patch creation
diff --git a/hacktools/__init__.py b/hacktools/__init__.py
index 96545f2..76aee68 100644
--- a/hacktools/__init__.py
+++ b/hacktools/__init__.py
@@ -1 +1 @@
-__version__ = "0.37.1"
+__version__ = "0.38.0"
diff --git a/hacktools/arch.py b/hacktools/arch.py
index 27aef9b..ce22268 100644
--- a/hacktools/arch.py
+++ b/hacktools/arch.py
@@ -1,4 +1,5 @@
 import os
+from collections import Counter
 
 from hacktools import common
 
@@ -65,17 +66,34 @@ def repack(fin, f, archive, infolder):
             f.seek(archive.dataoff + dataoff)
             f.write(fin.read(subfile.length))
         else:
-            # Set the file as not encoded and copy it
             size = os.path.getsize(filepath)
             f.seek(archive.fatoff + i * 16)
-            f.writeUInt(size)
-            f.writeUInt(size)
-            f.writeUInt(dataoff)
-            f.seek(2, 1)
-            f.writeUShort(0)
-            f.seek(archive.dataoff + dataoff)
-            with common.Stream(filepath, "rb") as subf:
-                f.write(subf.read())
+            # If the file was not compressed, just copy it
+            if not subfile.encoded:
+                f.writeUInt(size)
+                f.writeUInt(size)
+                f.writeUInt(dataoff)
+                f.seek(2, 1)
+                f.writeUShort(0)
+                f.seek(archive.dataoff + dataoff)
+                with common.Stream(filepath, "rb") as subf:
+                    f.write(subf.read())
+            else:
+                common.logDebug("Compressing", subfile.name)
+                with common.Stream(filepath, "rb") as subf:
+                    filedata = subf.read()
+                compdata = compress(filedata)
+                # For testing, verify that the file roundtrips
+                uncompdata = decompress(compdata, subfile.declength)
+                if uncompdata != filedata:
+                    common.logError("Error", subfile.name)
+                f.writeUInt(len(compdata))
+                f.writeUInt(size)
+                f.writeUInt(dataoff)
+                f.seek(2, 1)
+                f.writeUShort(1)
+                f.seek(archive.dataoff + dataoff)
+                f.write(compdata)
         # Align with 0s
         if f.tell() % 16 > 0:
             f.writeZero(16 - (f.tell() % 16))
@@ -89,49 +107,153 @@ def extract(f, archive, outfolder):
             if not subfile.encoded:
                 fout.write(f.read(subfile.length))
             else:
-                # Based on Tinke's ARCH implementation
-                startpos = f.tell()
-                buffer1 = []
-                buffer2 = []
-                for i in range(0x100):
-                    buffer1.append(0)
-                    buffer2.append(0)
-                while f.tell() - startpos < subfile.length:
-                    # InitBuffer
-                    for i in range(0x100):
-                        buffer2[i] = i
-                    # FillBuffer
-                    index = 0
-                    while index != 0x100:
-                        bufid = f.readByte()
-                        numloops = bufid
-                        if bufid > 0x7f:
-                            numloops = 0
-                            index += bufid - 0x7f
-                        if index == 0x100:
-                            break
-                        if numloops < 0:
-                            continue
-                        for i in range(numloops + 1):
-                            byte = f.readByte()
-                            buffer2[index] = byte
-                            if byte != index:
-                                buffer1[index] = f.readByte()
-                            index += 1
-                    # Process
-                    numloops = (f.readByte() << 8) + f.readByte()
-                    nextsamples = []
-                    while True:
-                        if len(nextsamples) == 0:
-                            if numloops == 0:
-                                break
-                            numloops -= 1
-                            index = f.readByte()
-                        else:
-                            index = nextsamples.pop()
-                        if buffer2[index] == index:
-                            fout.writeByte(index)
-                        else:
-                            nextsamples.append(buffer1[index])
-                            nextsamples.append(buffer2[index])
-                            index = len(nextsamples)
+                fout.write(decompress(f.read(subfile.length), subfile.declength))
+
+
+def compress(data):
+    # Find unused bytes in the data
+    dictkeys = []
+    for i in range(1, 0x100):
+        dictkeys.append(i)
+    for b in data:
+        if b in dictkeys:
+            dictkeys.remove(b)
+    dictvalues = {}
+    # Repeatedly find the most used pair and replace it in the copied data
+    content = bytearray(data)
+    while True:
+        if len(dictkeys) == 0:
+            break
+        # Write all the pairs in a list, for simplicity we just stick to halfwords
+        allpairs = []
+        for i in range(len(content) // 2):
+            allpairs.append((content[i * 2], content[i * 2 + 1]))
+        # Find the most common one
+        c = Counter(allpairs).most_common(1)
+        if len(c) < 1:
+            break
+        pair = c[0]
+        if pair[1] < 4:
+            break
+        dictkey = dictkeys.pop()
+        common.logDebug("Setting pair", common.toHex(pair[0][0]), common.toHex(pair[0][1]), "with", pair[1], "occurrences as dict key", common.toHex(dictkey))
pair", common.toHex(pair[0][0]), common.toHex(pair[0][1]), "with", pair[1], "occurrences as dict key", common.toHex(dictkey)) + dictvalues[dictkey] = pair[0] + content = content.replace(bytes(pair[0]), bytes([dictkey])) + with common.Stream() as f: + # Write the dictionary values + currentkey = 0 + ordkeys = list(dictvalues.keys()) + ordkeys.sort() + isconsecutive = False + # Special case where there are no dict keys + if len(ordkeys) == 0: + f.writeByte(0x7f + 0x7f) + f.writeByte(0x7f) + f.writeByte(0x7f + 0x7f) + f.writeByte(0xff) + currentkey = 0x100 + else: + for i in range(len(ordkeys)): + dictkey = ordkeys[i] + common.logDebug("Writing key", common.toHex(dictkey)) + # If the key is not consecutive, we need to skip places + if dictkey > currentkey: + keydiff = dictkey - currentkey + # Since we can only skip 0x7f bytes, we need to do an additional skip if it's bigger + while keydiff > 0x7f: + f.writeByte(0x7f + 0x7f) + # Also write a byte equal to the index + f.writeByte(0x7f) + keydiff -= 0x80 + f.writeByte(keydiff + 0x7f) + currentkey = dictkey + isconsecutive = False + elif not isconsecutive: + # If this is the first time we're writing a key, we need to check how many consecutive ones there are + consecutive = 1 + for j in range(i+1, len(ordkeys)): + if ordkeys[j] == dictkey + consecutive: + consecutive += 1 + f.writeByte(consecutive - 1) + isconsecutive = True + common.logDebug("Writing key pairs", common.toHex(dictvalues[dictkey][0]), common.toHex(dictvalues[dictkey][1])) + f.writeByte(dictvalues[dictkey][0]) + # Don't write the 2nd byte if it's the same as the index (shouldn't happen) + if dictvalues[dictkey][1] != dictkey: + f.writeByte(dictvalues[dictkey][1]) + currentkey += 1 + # We're forced to write all indexes even if they aren't used + if currentkey < 0x100: + f.writeByte(0x100 - currentkey - 1) + while currentkey < 0x100: + f.writeByte(currentkey) + currentkey += 1 + # Write the actual content + numloopspos = f.tell() + f.writeByte(0) + f.writeByte(0) + numloops = 0 + for b in content: + f.writeByte(b) + numloops += 1 + f.seek(numloopspos) + f.writeByte(numloops >> 8) + f.writeByte(numloops & 0xff) + f.seek(0) + return f.read() + + +def decompress(data, declen): + with common.Stream() as f: + with common.Stream() as fout: + f.write(data) + f.seek(0) + # Based on Tinke's ARCH implementation + buffer1 = [] + buffer2 = [] + for i in range(0x100): + buffer1.append(0) + buffer2.append(0) + while f.tell() < len(data): + # InitBuffer for i in range(0x100): - buffer1.append(0) - buffer2.append(0) - while f.tell() - startpos < subfile.length: - # InitBuffer - for i in range(0x100): - buffer2[i] = i - # FillBuffer - index = 0 - while index != 0x100: - bufid = f.readByte() - numloops = bufid - if bufid > 0x7f: - numloops = 0 - index += bufid - 0x7f - if index == 0x100: + buffer2[i] = i + # FillBuffer + index = 0 + while index != 0x100: + bufid = f.readByte() + numloops = bufid + if bufid > 0x7f: + numloops = 0 + index += bufid - 0x7f + if index == 0x100: + break + if numloops < 0: + continue + for i in range(numloops + 1): + byte = f.readByte() + buffer2[index] = byte + if byte != index: + buffer1[index] = f.readByte() + index += 1 + # Process + numloops = (f.readByte() << 8) + f.readByte() + common.logDebug("Decompressing with", common.toHex(numloops), "loops starting at", common.toHex(f.tell())) + nextsamples = [] + while True: + if len(nextsamples) == 0: + if numloops == 0: break - if numloops < 0: - continue - for i in range(numloops + 1): - byte = f.readByte() - buffer2[index] 
+                        if byte != index:
+                            buffer1[index] = f.readByte()
+                        index += 1
+                # Process
+                numloops = (f.readByte() << 8) + f.readByte()
+                common.logDebug("Decompressing with", common.toHex(numloops), "loops starting at", common.toHex(f.tell()))
+                nextsamples = []
+                while True:
+                    if len(nextsamples) == 0:
+                        if numloops == 0:
+                            break
+                        numloops -= 1
+                        index = f.readByte()
+                    else:
+                        index = nextsamples.pop()
+                    if buffer2[index] == index:
+                        fout.writeByte(index)
+                    else:
+                        nextsamples.append(buffer1[index])
+                        nextsamples.append(buffer2[index])
+                        index = len(nextsamples)
+                common.logDebug("Finished at", common.toHex(f.tell()), "with numloops", common.toHex(numloops))
+            fout.seek(0)
+            return fout.read()
\ No newline at end of file
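
The scheme is a byte-pair-encoding variant: byte values that never occur in the input become dictionary keys, each expanding to a two-byte pair, and the dictionary header is written with the skip/count bytes seen in compress() above. A worked example (values made up, traced by hand from the code): with a single key 0xff mapping to the pair 41 42, the header comes out as fe 7f fe 41 42: the first fe skips the decoder forward 0x7f indexes, 7f is the identity entry consumed right after that skip, the second fe skips to index 0xff, and 41 42 is the stored pair. A big-endian 16-bit count of the encoded content bytes follows, then the content itself.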
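A minimal round-trip sketch of the new module-level API (the sample data is made up; assumes the patched package is importable as hacktools):

    from hacktools import arch

    data = b"ABABABABABCDCDCDCDCD"
    comp = arch.compress(data)
    # decompress() takes the expected decompressed length, mirroring the
    # subfile.declength value used by extract() and repack() above
    assert arch.decompress(comp, len(data)) == data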