diff --git a/ofrak_core/CHANGELOG.md b/ofrak_core/CHANGELOG.md index 3c478f320..2d493fc2c 100644 --- a/ofrak_core/CHANGELOG.md +++ b/ofrak_core/CHANGELOG.md @@ -31,7 +31,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) ### Changed - By default, the ofrak log is now `ofrak-YYYYMMDDhhmmss.log` rather than just `ofrak.log` and the name can be specified on the command line ([#480](https://github.com/redballoonsecurity/ofrak/pull/480)) -- In `GripUnpacker`, use `gzip.GzipFile` python unpacker for speed, fall back on `pigz` if needed ([#472](https://github.com/redballoonsecurity/ofrak/pull/472)) +- In `GzipUnpacker` and `GzipPacker`, use the standard python `zlib` library: `GzipUnpacker` decompresses all files with `zlib`, and `GzipPacker` compresses small files with `zlib`, using `pigz`, if it is installed, to compress files 1MB and larger. ([#472](https://github.com/redballoonsecurity/ofrak/pull/472) and [#485](https://github.com/redballoonsecurity/ofrak/pull/485)) - Change `FreeSpaceModifier` & `PartialFreeSpaceModifier` behavior: an optional stub that isn't free space can be provided and fill-bytes for free space can be specified. ([#409](https://github.com/redballoonsecurity/ofrak/pull/409)) - `Resource.flush_to_disk` method renamed to `Resource.flush_data_to_disk`. ([#373](https://github.com/redballoonsecurity/ofrak/pull/373)) - `build_image.py` supports building Docker images with OFRAK packages from any ancestor directory. 
([#425](https://github.com/redballoonsecurity/ofrak/pull/425)) diff --git a/ofrak_core/ofrak/core/gzip.py b/ofrak_core/ofrak/core/gzip.py index 8ec53f6fc..342810a6c 100644 --- a/ofrak_core/ofrak/core/gzip.py +++ b/ofrak_core/ofrak/core/gzip.py @@ -1,9 +1,9 @@ import asyncio import logging -import tempfile -from gzip import BadGzipFile, GzipFile -from io import BytesIO +from typing import Optional +import zlib from subprocess import CalledProcessError +import tempfile from ofrak.component.packer import Packer from ofrak.component.unpacker import Unpacker @@ -15,11 +15,23 @@ LOGGER = logging.getLogger(__name__) +# PIGZ provides significantly faster compression on multi core systems. +# It does not parallelize decompression, so we don't use it in GzipUnpacker. PIGZ = ComponentExternalTool( "pigz", "https://zlib.net/pigz/", "--help", apt_package="pigz", brew_package="pigz" ) +class PIGZInstalled: + _pigz_installed: Optional[bool] = None + + @staticmethod + async def is_pigz_installed() -> bool: + if PIGZInstalled._pigz_installed is None: + PIGZInstalled._pigz_installed = await PIGZ.is_tool_installed() + return PIGZInstalled._pigz_installed + + class GzipData(GenericBinary): """ A gzip binary blob. 
@@ -41,44 +53,32 @@ class GzipUnpacker(Unpacker[None]): async def unpack(self, resource: Resource, config=None): data = await resource.get_data() - # GzipFile is faster (spawning external processes has overhead), - # but pigz is more willing to tolerate things like extra junk at the end - try: - with GzipFile(fileobj=BytesIO(data), mode="r") as gzip_file: - return await resource.create_child( - tags=(GenericBinary,), - data=gzip_file.read(), - ) - except BadGzipFile: - # Create temporary file with .gz extension - with tempfile.NamedTemporaryFile(suffix=".gz") as temp_file: - temp_file.write(data) - temp_file.flush() - cmd = [ - "pigz", - "-d", - "-c", - temp_file.name, - ] - proc = await asyncio.create_subprocess_exec( - *cmd, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - stdout, stderr = await proc.communicate() - data = stdout - if proc.returncode: - # Forward any gzip warning message and continue - if proc.returncode == -2 or proc.returncode == 2: - LOGGER.warning(stderr) - data = stdout - else: - raise CalledProcessError(returncode=proc.returncode, cmd=cmd, stderr=stderr) - - await resource.create_child( - tags=(GenericBinary,), - data=data, - ) + unpacked_data = await self.unpack_with_zlib_module(data) + return await resource.create_child(tags=(GenericBinary,), data=unpacked_data) + + @staticmethod + async def unpack_with_zlib_module(data: bytes) -> bytes: + # We use zlib.decompressobj instead of the gzip module to decompress + # because of a bug that causes gzip to raise BadGzipFile if there's + # trailing garbage after a compressed file instead of correctly ignoring it + # https://github.com/python/cpython/issues/68489 + + # gzip files can consist of multiple members, so we need to read them in + # a loop and concatenate them in the end. \037\213 are magic bytes + # indicating the start of a gzip header. 
+ chunks = [] + while data.startswith(b"\037\213"): + # wbits > 16 handles the gzip header and footer + decompressor = zlib.decompressobj(wbits=16 + zlib.MAX_WBITS) + chunks.append(decompressor.decompress(data)) + if not decompressor.eof: + raise ValueError("Incomplete gzip file") + data = decompressor.unused_data.lstrip(b"\0") + + if not len(chunks): + raise ValueError("Not a gzipped file") + + return b"".join(chunks) class GzipPacker(Packer[None]): @@ -87,19 +87,48 @@ class GzipPacker(Packer[None]): """ targets = (GzipData,) - external_dependencies = (PIGZ,) async def pack(self, resource: Resource, config=None): gzip_view = await resource.view_as(GzipData) + gzip_child_r = await gzip_view.get_file() + data = await gzip_child_r.get_data() - result = BytesIO() - with GzipFile(fileobj=result, mode="w") as gzip_file: - gzip_child_r = await gzip_view.get_file() - gzip_data = await gzip_child_r.get_data() - gzip_file.write(gzip_data) + if len(data) >= 1024 * 1024 and await PIGZInstalled.is_pigz_installed(): + packed_data = await self.pack_with_pigz(data) + else: + packed_data = await self.pack_with_zlib_module(data) original_gzip_size = await gzip_view.resource.get_data_length() - resource.queue_patch(Range(0, original_gzip_size), result.getvalue()) + resource.queue_patch(Range(0, original_gzip_size), data=packed_data) + + @staticmethod + async def pack_with_zlib_module(data: bytes) -> bytes: + compressor = zlib.compressobj(wbits=16 + zlib.MAX_WBITS) + result = compressor.compress(data) + result += compressor.flush() + return result + + @staticmethod + async def pack_with_pigz(data: bytes) -> bytes: + with tempfile.NamedTemporaryFile() as uncompressed_file: + uncompressed_file.write(data) + uncompressed_file.flush() + + cmd = [ + "pigz", + "-c", + uncompressed_file.name, + ] + proc = await asyncio.create_subprocess_exec( + *cmd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await proc.communicate() + if proc.returncode: + 
raise CalledProcessError(returncode=proc.returncode, stderr=stderr, cmd=cmd) + + return stdout MagicMimeIdentifier.register(GzipData, "application/gzip") diff --git a/ofrak_core/test_ofrak/components/assets/hello_ofrak b/ofrak_core/test_ofrak/components/assets/hello_ofrak new file mode 100644 index 000000000..8ee09ada1 --- /dev/null +++ b/ofrak_core/test_ofrak/components/assets/hello_ofrak @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c160ca1603c2a07cdbfde8119b0889fe8348e70a64899ca558808b39638cc1d0 +size 12 diff --git a/ofrak_core/test_ofrak/components/assets/hello_world b/ofrak_core/test_ofrak/components/assets/hello_world new file mode 100644 index 000000000..0db788d9c --- /dev/null +++ b/ofrak_core/test_ofrak/components/assets/hello_world @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a948904f2f0f479b8f8197694b30184b0d2ed1c1cd2a1ec0fb85d299a192a447 +size 12 diff --git a/ofrak_core/test_ofrak/components/assets/random8M b/ofrak_core/test_ofrak/components/assets/random8M new file mode 100644 index 000000000..173a9ab53 --- /dev/null +++ b/ofrak_core/test_ofrak/components/assets/random8M @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4d088e0d0685acdcccabbe03cc1127411db251eec6718c6d913120bac76e4b3 +size 8388608 diff --git a/ofrak_core/test_ofrak/components/assets/random8M_modified b/ofrak_core/test_ofrak/components/assets/random8M_modified new file mode 100644 index 000000000..1c117861e --- /dev/null +++ b/ofrak_core/test_ofrak/components/assets/random8M_modified @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c68b685e228e64f2f66f4be1104c42be942d927847e4cd5912405613d1f2e64c +size 8388608 diff --git a/ofrak_core/test_ofrak/components/test_gzip_component.py b/ofrak_core/test_ofrak/components/test_gzip_component.py index 6fde60cc7..9abc7ccb7 100644 --- a/ofrak_core/test_ofrak/components/test_gzip_component.py +++ b/ofrak_core/test_ofrak/components/test_gzip_component.py 
@@ -1,87 +1,109 @@ -import os -import subprocess -import tempfile -from gzip import GzipFile -from io import BytesIO - +import zlib +import gzip +from pathlib import Path +from asyncio import create_subprocess_exec +from typing import Tuple +from unittest.mock import patch +from abc import ABC, abstractmethod + +from ofrak.component.abstract import ComponentSubprocessError import pytest -from ofrak import OFRAKContext +from ofrak.ofrak_context import OFRAKContext from ofrak.resource import Resource from ofrak.core.gzip import GzipData from pytest_ofrak.patterns.compressed_filesystem_unpack_modify_pack import ( CompressedFileUnpackModifyPackPattern, ) -from pytest_ofrak.patterns.unpack_modify_pack import UnpackModifyPackPattern + +ASSETS_DIR = Path(__file__).parent / "assets" -class TestGzipUnpackModifyPack(CompressedFileUnpackModifyPackPattern): +@pytest.fixture( + autouse=True, + scope="module", + params=[ + (ASSETS_DIR / "hello_world", ASSETS_DIR / "hello_ofrak", False), + (ASSETS_DIR / "random8M", ASSETS_DIR / "random8M_modified", True), + ], + ids=["hello world", ""], +) +def gzip_test_input(request): + initial_path, repacked_path, expect_pigz = request.param + with open(initial_path, "rb") as initial_file: + initial_data = initial_file.read() + with open(repacked_path, "rb") as repacked_file: + expected_repacked_data = repacked_file.read() + return (initial_data, expected_repacked_data, expect_pigz) + + +class GzipUnpackModifyPackPattern(CompressedFileUnpackModifyPackPattern, ABC): + """ + Template for tests that test different inputs the gzip component should support + unpacking. 
+ """ + + EXPECT_PIGZ: bool expected_tag = GzipData - @pytest.fixture(autouse=True) - def create_test_file(self, tmpdir): - d = tmpdir.mkdir("gzip") - fh = d.join("hello.gz") - result = BytesIO() - with GzipFile(fileobj=result, mode="w") as gzip_file: - gzip_file.write(self.INITIAL_DATA) - fh.write_binary(result.getvalue()) + @abstractmethod + def write_gzip(self, gzip_path: Path): + raise NotImplementedError() - self._test_file = fh.realpath() + @pytest.fixture(autouse=True) + def create_test_file(self, gzip_test_input: Tuple[bytes, bytes, bool], tmp_path: Path): + self.INITIAL_DATA, self.EXPECTED_REPACKED_DATA, self.EXPECT_PIGZ = gzip_test_input + gzip_path = tmp_path / "test.gz" + self.write_gzip(gzip_path) + self._test_file = gzip_path.resolve() + + async def test_unpack_modify_pack(self, ofrak_context: OFRAKContext): + with patch("asyncio.create_subprocess_exec", wraps=create_subprocess_exec) as mock_exec: + if self.EXPECT_PIGZ: + await super().test_unpack_modify_pack(ofrak_context) + assert any( + args[0][0] == "pigz" and args[0][1] == "-c" for args in mock_exec.call_args_list + ) + else: + await super().test_unpack_modify_pack(ofrak_context) + mock_exec.assert_not_called() async def verify(self, repacked_root_resource: Resource): - patched_gzip_file = GzipFile(fileobj=BytesIO(await repacked_root_resource.get_data())) - patched_decompressed_data = patched_gzip_file.read() + patched_decompressed_data = gzip.decompress(await repacked_root_resource.get_data()) assert patched_decompressed_data == self.EXPECTED_REPACKED_DATA -class TestGzipUnpackWithTrailingBytes(UnpackModifyPackPattern): - EXPECTED_TAG = GzipData - INITIAL_DATA = b"Hello World" - EXPECTED_DATA = INITIAL_DATA # Change expected when modifier is created - INNER_FILENAME = "hello.bin" - GZIP_FILENAME = "hello.bin.gz" - - async def create_root_resource(self, ofrak_context: OFRAKContext) -> Resource: - with tempfile.TemporaryDirectory() as d: - file_path = os.path.join(d, self.INNER_FILENAME) - with 
open(file_path, "wb") as f: - f.write(self.INITIAL_DATA) - - gzip_path = os.path.join(d, self.GZIP_FILENAME) - gzip_command = ["pigz", file_path] - subprocess.run(gzip_command, check=True, capture_output=True) - - # Add trailing bytes - with open(gzip_path, "ab") as a: - a.write(b"\xDE\xAD\xBE\xEF") - a.close() - return await ofrak_context.create_root_resource_from_file(gzip_path) - - async def unpack(self, root_resource: Resource) -> None: - await root_resource.unpack_recursively() - - async def modify(self, root_resource: Resource) -> None: - pass - - async def repack(self, root_resource: Resource) -> None: - pass - - async def verify(self, root_resource: Resource) -> None: - gzip_data = await root_resource.get_data() - with tempfile.TemporaryDirectory() as d: - gzip_path = os.path.join(d, self.GZIP_FILENAME) - with open(gzip_path, "wb") as f: - f.write(gzip_data) - - gunzip_command = ["pigz", "-d", "-c", gzip_path] - try: - result = subprocess.run(gunzip_command, check=True, capture_output=True) - data = result.stdout - except subprocess.CalledProcessError as e: - if e.returncode == 2 or e.returncode == -2: - data = e.stdout - else: - raise - - assert data == self.EXPECTED_DATA +class TestGzipUnpackModifyPack(GzipUnpackModifyPackPattern): + def write_gzip(self, gzip_path: Path): + with gzip.GzipFile(gzip_path, mode="w") as gzip_file: + gzip_file.write(self.INITIAL_DATA) + + +class TestGzipWithMultipleMembersUnpackModifyPack(GzipUnpackModifyPackPattern): + def write_gzip(self, gzip_path: Path): + middle = len(self.INITIAL_DATA) // 2 + with gzip.GzipFile(gzip_path, mode="w") as gzip_file: + gzip_file.write(self.INITIAL_DATA[:middle]) + + with gzip.GzipFile(gzip_path, mode="a") as gzip_file: + gzip_file.write(self.INITIAL_DATA[middle:]) + + +class TestGzipWithTrailingBytesUnpackModifyPack(GzipUnpackModifyPackPattern): + def write_gzip(self, gzip_path: Path): + with gzip.GzipFile(gzip_path, mode="w") as gzip_file: + gzip_file.write(self.INITIAL_DATA) + + with 
open(gzip_path, "ab") as raw_file: + raw_file.write(b"\xDE\xAD\xBE\xEF") + + +async def test_corrupted_gzip_fail( + gzip_test_input: Tuple[bytes, bytes, bool], ofrak_context: OFRAKContext +): + initial_data = gzip_test_input[0] + corrupted_data = bytearray(gzip.compress(initial_data)) + corrupted_data[10] = 255 + resource = await ofrak_context.create_root_resource("corrupted.gz", data=bytes(corrupted_data)) + with pytest.raises((zlib.error, ComponentSubprocessError)): + await resource.unpack()