以下代码能够读取 bzipped 文件:
offset = 24
# Open the file in binary mode; the context manager closes the handle as
# soon as the read finishes, even if reading raises.
with open(filey, 'rb') as fobj:
    # Read the whole file into memory
    buffer = fobj.read()
# Decompress every bz2 frame found from `offset` onwards. Note that this
# step is decompression, not compression.
buffer_unbzip, places_to_bzip = bzip_blocks_decompress_all(buffer, offset)
其中 bzip_blocks_decompress_all 函数定义如下:
def bzip_blocks_decompress_all(data, offset):
    """Decompress consecutive length-prefixed bz2 frames stored in *data*.

    Starting at ``offset``, each frame consists of a 4-byte big-endian
    signed integer (its absolute value is taken as the compressed size)
    followed by that many bytes of bz2 data.

    Args:
        data: Bytes-like object holding the raw (compressed) contents.
        offset: Index in ``data`` of the first 4-byte length prefix.

    Returns:
        A ``(frames, places_to_bzip)`` tuple. ``frames`` is a bytearray with
        every decompressed payload concatenated. ``places_to_bzip`` is a
        list of ``[start, end]`` pairs giving where each frame's compressed
        bytes sit in ``data``; these are NOT offsets into ``frames``.

    Errors raised by ``bz2.decompress`` propagate to the caller.
    """
    import bz2

    frames = bytearray()
    places_to_bzip = []
    while offset < len(data):
        # The size is read as signed and then made positive with abs().
        block_cmp_bytes = abs(
            int.from_bytes(data[offset:offset + 4], 'big', signed=True)
        )
        offset += 4
        frames += bz2.decompress(data[offset:offset + block_cmp_bytes])
        # Record the compressed span inside `data` (not inside `frames`).
        places_to_bzip.append([offset, offset + block_cmp_bytes])
        offset += block_cmp_bytes
    return frames, places_to_bzip
所以我有对象被 bzip 压缩的位置(places_to_bzip)。所以我的想法是我们应该能够做如下的事情:
# Try to compress using bz2 just based on some of the places_to_bzip
# a1: the first frame's compressed bytes, sliced out of the raw file data.
a1 = buffer[places_to_bzip[0][0]:places_to_bzip[0][1]]
# NOTE(review): places_to_bzip holds offsets into the compressed `buffer`.
# Applying the same offsets to buffer_unbzip yields a same-length slice of
# the decompressed stream, not the payload that a1 decompresses to.
a2 = buffer_unbzip[places_to_bzip[0][0]:places_to_bzip[0][1]]
# Convert a2 back to a1 with a bzip compression
a3 = bz2.compress(a2)
# Print the three lengths for comparison.
print(len(a1))
print(len(a2))
print(len(a3))
104
104
70
为什么不能正确重新压缩?以下是用于测试的 a1 和 a2 的输出:
print(a1)
b'BZh51AY&SY\xe6\xb1\xacS\x00\x00\x02_\xab\xfe(@\x00\x10\x00@\x04\x00@\x00@\x800\x02\x00\x00\x01\x00@\x08\x00\x00\x18 \x00T4\x8d\x004\x01\xa0\x91(\x01\x90\xd3\xd2\x14\xac\xd6v\x85\xf0\x0fD\x85\xc3A}\xe09\xbc\xe1\x8b\x04Y\xbfb$"\xcc\x13\xc0B\r\x99\xf1Qa%S\x00|]\xc9\x14\xe1BC\x9a\xc6\xb1L'
print(a2)
bytearray(b'\x00\x0b\x00\x02\x05z\x00\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00X\x00\x00\x00\x00\x002\x04@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01h\x00\x00\x00\x00\x002\x04@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
正如我在评论中所说,`buffer_unbzip` 仅包含解压缩后的数据,而 `places_to_bzip` 中的偏移量是原始压缩数据(`buffer`)中各切片的开始/结束偏移量;各解压缩帧在 `buffer_unbzip` 中的偏移量并没有被记录,因此是未知的。
下面我对输入文件的格式进行了逆向工程,并生成了一个可复现的示例文件,然后使用 OP 的代码来提取数据。代码经过修改,还会返回每个解压缩帧的开始/结束位置;随后遍历这些偏移量,重新压缩每个帧,并与原始的压缩数据进行比较:
import bz2
import struct
### Reproducible input file example ###
def write_frame(f, data):
    """Compress *data* with bz2 and write it to *f* as one frame.

    A frame is the compressed size packed as a big-endian 4-byte unsigned
    integer, immediately followed by the compressed bytes.

    Args:
        f: Binary file-like object with a ``write`` method.
        data: Bytes-like payload to compress.
    """
    bzdata = bz2.compress(data)
    # Write size of compressed data as big-endian 4-byte integer,
    # then the compressed data.
    f.write(struct.pack('>L', len(bzdata)) + bzdata)
# Build a small sample file: 24 filler bytes followed by four
# length-prefixed bz2 frames.
with open('file.bin', 'wb') as f:
    f.write(b'A' * 24)  # header in the original data?
    write_frame(f, b'B' * 50)  # compressed frames
    write_frame(f, b'C' * 25)
    write_frame(f, b'D' * 30)
    write_frame(f, b'E' * 12)
### END ###
# Frames start after the first 24 bytes of the file.
offset = 24
# Open the object
with open('file.bin', 'rb') as fobj:
    # Read the data
    buffer = fobj.read()
def bzip_blocks_decompress_all(data, offset):
    """Decompress length-prefixed bz2 frames and record both offset sets.

    Starting at ``offset``, each frame is a big-endian 4-byte unsigned
    length followed by that many bytes of bz2 data.

    Args:
        data: Bytes-like object holding the raw (compressed) contents.
        offset: Index in ``data`` of the first 4-byte length prefix.

    Returns:
        A ``(frames, places_to_bzip, places_to_unbzip)`` tuple:

        * ``frames``: bytearray of all decompressed payloads concatenated.
        * ``places_to_bzip``: ``[start, end]`` of each frame inside
          ``frames``, i.e. the decompressed spans one would re-compress.
        * ``places_to_unbzip``: ``[start, end]`` of each frame's compressed
          bytes inside ``data``.

    Errors raised by ``bz2.decompress`` or ``struct.unpack_from`` propagate.
    """
    import bz2

    frames = bytearray()
    places_to_bzip = []
    places_to_unbzip = []
    while offset < len(data):
        # The length is read as unsigned because a length should never be
        # negative; the earlier version read it as signed and applied abs().
        # '>L' is a big-endian 4-byte unsigned integer. unpack_from returns
        # a tuple of the struct elements (here a 1-tuple), hence the [0].
        block_cmp_bytes = struct.unpack_from('>L', data, offset)[0]
        offset += 4
        # Measure `frames` before and after appending to get this frame's
        # span in the decompressed output.
        start = len(frames)
        frames += bz2.decompress(data[offset:offset + block_cmp_bytes])
        end = len(frames)
        places_to_bzip.append([start, end])
        places_to_unbzip.append([offset, offset + block_cmp_bytes])
        offset += block_cmp_bytes
    return frames, places_to_bzip, places_to_unbzip
# Decompress every frame, collecting the spans in both the decompressed
# output (places_to_bzip) and the compressed input (places_to_unbzip).
buffer_unbzip, places_to_bzip, places_to_unbzip = bzip_blocks_decompress_all(buffer, offset)
print(f'{buffer=}')
print(f'{buffer_unbzip=}')
# Re-compress each decompressed frame and compare it with the bytes that
# were originally stored for that frame.
# NOTE: byte-for-byte equality also requires the same compression level as
# the writer used; the sample file here was written with bz2.compress()
# defaults, which is what is used below as well.
for (bstart, bend), (unbstart, unbend) in zip(places_to_bzip, places_to_unbzip):
    a1 = buffer[unbstart:unbend]
    a2 = buffer_unbzip[bstart:bend]
    # Convert a2 back to a1 with a bzip compression
    a3 = bz2.compress(a2)
    print(a1 == a3, a2)
输出:
buffer=b"AAAAAAAAAAAAAAAAAAAAAAAA\x00\x00\x00'BZh91AY&SY?\xbf\xc2\x8b\x00\x00\x02\x14\x00\x00\x01\x10\x00 \x00!\x00\x82\x0b\x17rE8P\x90?\xbf\xc2\x8b\x00\x00\x00'BZh91AY&SY\x0b\xc7\x94'\x00\x00\x02$\x00\x02\x00\x08\x00 \x00!\x00\x82\x0b\x17rE8P\x90\x0b\xc7\x94'\x00\x00\x00'BZh91AY&SYX\xf3\xe3\x91\x00\x00\x02$\x00\x00\x10\x04\x00 \x00!\x00\x82\x0b\x17rE8P\x90X\xf3\xe3\x91\x00\x00\x00'BZh91AY&SY\xb6\xa1w{\x00\x00\x02D\x00\x00@\x02\x00 \x00!\x00\x82\x0b\x17rE8P\x90\xb6\xa1w{"
buffer_unbzip=bytearray(b'BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBCCCCCCCCCCCCCCCCCCCCCCCCCDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDEEEEEEEEEEEE')
True bytearray(b'BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB')
True bytearray(b'CCCCCCCCCCCCCCCCCCCCCCCCC')
True bytearray(b'DDDDDDDDDDDDDDDDDDDDDDDDDDDDDD')
True bytearray(b'EEEEEEEEEEEE')