From 0a8c0696d532a1b92fe61d544eedf4563ba5b9a2 Mon Sep 17 00:00:00 2001 From: Albert Zhang Date: Wed, 17 Jul 2024 18:23:55 +0000 Subject: [PATCH 01/18] Rewrite gzip component to support trailing bytes without external tool (using zlib library instead of gzip) --- ofrak_core/ofrak/core/gzip.py | 46 +++++----- .../components/test_gzip_component.py | 88 ++++++------------- 2 files changed, 53 insertions(+), 81 deletions(-) diff --git a/ofrak_core/ofrak/core/gzip.py b/ofrak_core/ofrak/core/gzip.py index 8ec53f6fc..58205c28c 100644 --- a/ofrak_core/ofrak/core/gzip.py +++ b/ofrak_core/ofrak/core/gzip.py @@ -1,8 +1,7 @@ import asyncio import logging import tempfile -from gzip import BadGzipFile, GzipFile -from io import BytesIO +import zlib from subprocess import CalledProcessError from ofrak.component.packer import Packer @@ -41,16 +40,24 @@ class GzipUnpacker(Unpacker[None]): async def unpack(self, resource: Resource, config=None): data = await resource.get_data() - # GzipFile is faster (spawning external processes has overhead), - # but pigz is more willing to tolerate things like extra junk at the end - try: - with GzipFile(fileobj=BytesIO(data), mode="r") as gzip_file: - return await resource.create_child( - tags=(GenericBinary,), - data=gzip_file.read(), - ) - except BadGzipFile: - # Create temporary file with .gz extension + if True: # TODO: add some heuristic to switch to using pigz + # wbits > 16 handles the gzip header and footer + # We need to create a zlib.Decompress object for Python < 3.11 + # to set this parameter + chunks = [] + + decompressor = zlib.decompressobj(wbits=16 + zlib.MAX_WBITS) + while data.startswith(b"\037\213"): + chunks.append(decompressor.decompress(data)) + if decompressor.eof: + break + data = decompressor.unused_data.lstrip(b"\0") + + if not len(chunks): + raise ValueError("Not a gzipped file") + + return await resource.create_child(tags=(GenericBinary,), data=b"".join(chunks)) + else: with tempfile.NamedTemporaryFile(suffix=".gz") as temp_file: temp_file.write(data) temp_file.flush() @@ -87,19 +94,16 @@ class GzipPacker(Packer[None]): """ targets = (GzipData,) - external_dependencies = (PIGZ,) async def pack(self, resource: Resource, config=None): gzip_view = await resource.view_as(GzipData) - - result = BytesIO() - with GzipFile(fileobj=result, mode="w") as gzip_file: - gzip_child_r = await gzip_view.get_file() - gzip_data = await gzip_child_r.get_data() - gzip_file.write(gzip_data) - + gzip_child_r = await gzip_view.get_file() + gzip_data = await gzip_child_r.get_data() + compressor = zlib.compressobj(wbits=16 + zlib.MAX_WBITS) + result = compressor.compress(gzip_data) + result += compressor.flush() original_gzip_size = await gzip_view.resource.get_data_length() - resource.queue_patch(Range(0, original_gzip_size), result.getvalue()) + resource.queue_patch(Range(0, original_gzip_size), result) MagicMimeIdentifier.register(GzipData, "application/gzip") diff --git a/ofrak_core/test_ofrak/components/test_gzip_component.py b/ofrak_core/test_ofrak/components/test_gzip_component.py index 6fde60cc7..47ff2d33b 100644 --- a/ofrak_core/test_ofrak/components/test_gzip_component.py +++ b/ofrak_core/test_ofrak/components/test_gzip_component.py @@ -1,33 +1,30 @@ -import os -import subprocess -import tempfile from gzip import GzipFile from io import BytesIO +from pathlib import Path import pytest -from ofrak import OFRAKContext from ofrak.resource import Resource from ofrak.core.gzip import GzipData from pytest_ofrak.patterns.compressed_filesystem_unpack_modify_pack import ( CompressedFileUnpackModifyPackPattern, ) -from pytest_ofrak.patterns.unpack_modify_pack import UnpackModifyPackPattern -class TestGzipUnpackModifyPack(CompressedFileUnpackModifyPackPattern): +class GzipUnpackModifyPackPattern(CompressedFileUnpackModifyPackPattern): + """Template for tests that test different inputs the gzip component should support + unpacking.""" + expected_tag = GzipData - @pytest.fixture(autouse=True) - def create_test_file(self, tmpdir): - d = tmpdir.mkdir("gzip") - fh = d.join("hello.gz") - result = BytesIO() - with GzipFile(fileobj=result, mode="w") as gzip_file: - gzip_file.write(self.INITIAL_DATA) - fh.write_binary(result.getvalue()) + def write_gzip(self, gzip_path: Path): + raise NotImplementedError - self._test_file = fh.realpath() + @pytest.fixture(autouse=True) + def create_test_file(self, tmp_path: Path): + gzip_path = tmp_path / "hello.gz" + self.write_gzip(gzip_path) + self._test_file = gzip_path.resolve() async def verify(self, repacked_root_resource: Resource): patched_gzip_file = GzipFile(fileobj=BytesIO(await repacked_root_resource.get_data())) @@ -35,53 +32,24 @@ async def verify(self, repacked_root_resource: Resource): assert patched_decompressed_data == self.EXPECTED_REPACKED_DATA -class TestGzipUnpackWithTrailingBytes(UnpackModifyPackPattern): - EXPECTED_TAG = GzipData - INITIAL_DATA = b"Hello World" - EXPECTED_DATA = INITIAL_DATA # Change expected when modifier is created - INNER_FILENAME = "hello.bin" - GZIP_FILENAME = "hello.bin.gz" - - async def create_root_resource(self, ofrak_context: OFRAKContext) -> Resource: - with tempfile.TemporaryDirectory() as d: - file_path = os.path.join(d, self.INNER_FILENAME) - with open(file_path, "wb") as f: - f.write(self.INITIAL_DATA) - - gzip_path = os.path.join(d, self.GZIP_FILENAME) - gzip_command = ["pigz", file_path] - subprocess.run(gzip_command, check=True, capture_output=True) - - # Add trailing bytes - with open(gzip_path, "ab") as a: - a.write(b"\xDE\xAD\xBE\xEF") - a.close() - return await ofrak_context.create_root_resource_from_file(gzip_path) - - async def unpack(self, root_resource: Resource) -> None: - await root_resource.unpack_recursively() +class TestGzipUnpackModifyPack(GzipUnpackModifyPackPattern): + def write_gzip(self, gzip_path: Path): + with GzipFile(gzip_path, mode="w") as gzip_file: + gzip_file.write(self.INITIAL_DATA) - async def modify(self, root_resource: Resource) -> None: - pass - async def repack(self, root_resource: Resource) -> None: - pass +class TestGzipWithMultipleMembersUnpackModifyPack(GzipUnpackModifyPackPattern): + def write_gzip(self, gzip_path: Path): + with GzipFile(gzip_path, mode="w") as gzip_file: + middle = len(self.INITIAL_DATA) // 2 + gzip_file.write(self.INITIAL_DATA[:middle]) + gzip_file.write(self.INITIAL_DATA[middle:]) - async def verify(self, root_resource: Resource) -> None: - gzip_data = await root_resource.get_data() - with tempfile.TemporaryDirectory() as d: - gzip_path = os.path.join(d, self.GZIP_FILENAME) - with open(gzip_path, "wb") as f: - f.write(gzip_data) - gunzip_command = ["pigz", "-d", "-c", gzip_path] - try: - result = subprocess.run(gunzip_command, check=True, capture_output=True) - data = result.stdout - except subprocess.CalledProcessError as e: - if e.returncode == 2 or e.returncode == -2: - data = e.stdout - else: - raise +class TestGzipWithTrailingBytesUnpackModifyPack(GzipUnpackModifyPackPattern): + def write_gzip(self, gzip_path: Path): + with GzipFile(gzip_path, mode="w") as gzip_file: + gzip_file.write(self.INITIAL_DATA) - assert data == self.EXPECTED_DATA + with open(gzip_path, "ab") as raw_file: + raw_file.write(b"\xDE\xAD\xBE\xEF") From 3331f714acee02ee0bb8183e01273350e65a4e6d Mon Sep 17 00:00:00 2001 From: Albert Zhang Date: Wed, 17 Jul 2024 21:04:12 +0000 Subject: [PATCH 02/18] Switch to pigz for 4MiB or larger files and pipe directly to stdin instead of temp file --- ofrak_core/ofrak/core/gzip.py | 50 ++++++++++++++--------------------- 1 file changed, 20 insertions(+), 30 deletions(-) diff --git a/ofrak_core/ofrak/core/gzip.py b/ofrak_core/ofrak/core/gzip.py index 58205c28c..d3baf1ce9 100644 --- a/ofrak_core/ofrak/core/gzip.py +++ b/ofrak_core/ofrak/core/gzip.py @@ -1,6 +1,5 @@ import asyncio import logging -import tempfile import zlib from subprocess import CalledProcessError @@ -40,7 +39,7 @@ class GzipUnpacker(Unpacker[None]): async def unpack(self, resource: Resource, config=None): data = await resource.get_data() - if True: # TODO: add some heuristic to switch to using pigz + if len(data) < 1024 * 1024 * 4: # Use pigz for more than 4MiB # wbits > 16 handles the gzip header and footer # We need to create a zlib.Decompress object for Python < 3.11 # to set this parameter @@ -58,34 +57,25 @@ async def unpack(self, resource: Resource, config=None): return await resource.create_child(tags=(GenericBinary,), data=b"".join(chunks)) else: - with tempfile.NamedTemporaryFile(suffix=".gz") as temp_file: - temp_file.write(data) - temp_file.flush() - cmd = [ - "pigz", - "-d", - "-c", - temp_file.name, - ] - proc = await asyncio.create_subprocess_exec( - *cmd, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - stdout, stderr = await proc.communicate() - data = stdout - if proc.returncode: - # Forward any gzip warning message and continue - if proc.returncode == -2 or proc.returncode == 2: - LOGGER.warning(stderr) - data = stdout - else: - raise CalledProcessError(returncode=proc.returncode, cmd=cmd, stderr=stderr) - - await resource.create_child( - tags=(GenericBinary,), - data=data, - ) + cmd = ["pigz", "-d"] + proc = await asyncio.create_subprocess_exec( + *cmd, + stdin=asyncio.subprocess.PIPE, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await proc.communicate(data) + if proc.returncode: + # Forward any gzip warning message and continue + if proc.returncode == -2 or proc.returncode == 2: + LOGGER.warning(stderr) + else: + raise CalledProcessError(returncode=proc.returncode, cmd=cmd, stderr=stderr) + + await resource.create_child( + tags=(GenericBinary,), + data=stdout, + ) class GzipPacker(Packer[None]): From eecfcbe4a2f627ae80e9021d33813b4967a984cd Mon Sep 17 00:00:00 2001 From: Albert Zhang Date: Thu, 18 Jul 2024 10:48:48 -0400 Subject: [PATCH 03/18] Update docstring in test_gzip_component.py Co-authored-by: Jacob Strieb <99368685+rbs-jacob@users.noreply.github.com> --- ofrak_core/test_ofrak/components/test_gzip_component.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ofrak_core/test_ofrak/components/test_gzip_component.py b/ofrak_core/test_ofrak/components/test_gzip_component.py index 47ff2d33b..416253283 100644 --- a/ofrak_core/test_ofrak/components/test_gzip_component.py +++ b/ofrak_core/test_ofrak/components/test_gzip_component.py @@ -12,8 +12,10 @@ class GzipUnpackModifyPackPattern(CompressedFileUnpackModifyPackPattern): - """Template for tests that test different inputs the gzip component should support - unpacking.""" + """ + Template for tests that test different inputs the gzip component should support + unpacking. + """ expected_tag = GzipData From 37ba179f83f4a341c305447c2fa4a66fd9d39c04 Mon Sep 17 00:00:00 2001 From: Albert Zhang Date: Thu, 18 Jul 2024 17:48:44 +0000 Subject: [PATCH 04/18] Refactor unpack logic to separate functions --- ofrak_core/ofrak/core/gzip.py | 74 ++++++++++++++++++----------------- 1 file changed, 38 insertions(+), 36 deletions(-) diff --git a/ofrak_core/ofrak/core/gzip.py b/ofrak_core/ofrak/core/gzip.py index d3baf1ce9..a6fd08c51 100644 --- a/ofrak_core/ofrak/core/gzip.py +++ b/ofrak_core/ofrak/core/gzip.py @@ -39,43 +39,45 @@ class GzipUnpacker(Unpacker[None]): async def unpack(self, resource: Resource, config=None): data = await resource.get_data() - if len(data) < 1024 * 1024 * 4: # Use pigz for more than 4MiB - # wbits > 16 handles the gzip header and footer - # We need to create a zlib.Decompress object for Python < 3.11 - # to set this parameter - chunks = [] - - decompressor = zlib.decompressobj(wbits=16 + zlib.MAX_WBITS) - while data.startswith(b"\037\213"): - chunks.append(decompressor.decompress(data)) - if decompressor.eof: - break - data = decompressor.unused_data.lstrip(b"\0") - - if not len(chunks): - raise ValueError("Not a gzipped file") - - return await resource.create_child(tags=(GenericBinary,), data=b"".join(chunks)) + if len(data) >= 1024 * 1024 * 4 and await PIGZ.is_tool_installed(): + uncompressed_data = await self.unpack_with_pigz(data) else: - cmd = ["pigz", "-d"] - proc = await asyncio.create_subprocess_exec( - *cmd, - stdin=asyncio.subprocess.PIPE, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - stdout, stderr = await proc.communicate(data) - if proc.returncode: - # Forward any gzip warning message and continue - if proc.returncode == -2 or proc.returncode == 2: - LOGGER.warning(stderr) - else: - raise CalledProcessError(returncode=proc.returncode, cmd=cmd, stderr=stderr) - - await resource.create_child( - tags=(GenericBinary,), - data=stdout, - ) + uncompressed_data = await self.unpack_with_zlib_module(data) + return await resource.create_child(tags=(GenericBinary,), data=uncompressed_data) + + @staticmethod + async def unpack_with_zlib_module(data: bytes) -> bytes: + chunks = [] + + # wbits > 16 handles the gzip header and footer + # We need to create a zlib.Decompress object in order to use this + # parameter in Python < 3.11 + decompressor = zlib.decompressobj(wbits=16 + zlib.MAX_WBITS) + while data.startswith(b"\037\213"): + chunks.append(decompressor.decompress(data)) + if decompressor.eof: + break + data = decompressor.unused_data.lstrip(b"\0") + + if not len(chunks): + raise ValueError("Not a gzipped file") + + return b"".join(chunks) + + @staticmethod + async def unpack_with_pigz(data: bytes) -> bytes: + cmd = ["pigz", "-d"] + proc = await asyncio.create_subprocess_exec( + *cmd, + stdin=asyncio.subprocess.PIPE, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await proc.communicate(data) + if proc.returncode: + raise CalledProcessError(returncode=proc.returncode, cmd=cmd, stderr=stderr) + + return stdout class GzipPacker(Packer[None]): From ed9005f11e854228cea04cf5ba33fe51f421c473 Mon Sep 17 00:00:00 2001 From: Albert Zhang Date: Thu, 18 Jul 2024 17:49:03 +0000 Subject: [PATCH 05/18] cache result of is_tool_installed --- ofrak_core/ofrak/model/component_model.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/ofrak_core/ofrak/model/component_model.py b/ofrak_core/ofrak/model/component_model.py index be1123c36..8823ca42c 100644 --- a/ofrak_core/ofrak/model/component_model.py +++ b/ofrak_core/ofrak/model/component_model.py @@ -18,7 +18,7 @@ class ComponentConfig: """ -@dataclass(frozen=True) +@dataclass class ComponentExternalTool: """ An external tool or utility (like `zip` or `squashfs`) a component depends on. Includes some @@ -45,14 +45,21 @@ class ComponentExternalTool: apt_package: Optional[str] = None brew_package: Optional[str] = None + _installed: Optional[bool] = field(default=None, init=False, compare=False) + async def is_tool_installed(self) -> bool: """ Check if a tool is installed by running it with the `install_check_arg`. - This method runs ` `. + This method runs ` ` the first time it is called. + The result is cached for future calls. :return: True if the `tool` command returned zero, False if `tool` could not be found or returned non-zero exit code. """ + + if self._installed is not None: + return self._installed + try: cmd = [ self.tool, @@ -65,10 +72,11 @@ async def is_tool_installed(self) -> bool: ) returncode = await proc.wait() + self._installed = 0 == returncode except FileNotFoundError: - return False + self._installed = False - return 0 == returncode + return self._installed CC = TypeVar("CC", bound=Optional[ComponentConfig]) From e9fbdb38b0f00fe5916c08381a370ae3585b72a6 Mon Sep 17 00:00:00 2001 From: Albert Zhang Date: Thu, 18 Jul 2024 17:49:30 +0000 Subject: [PATCH 06/18] Comprehensive gzip test cases --- .../test_ofrak/components/assets/hello_ofrak | 3 + .../test_ofrak/components/assets/hello_world | 3 + .../test_ofrak/components/assets/random8M | 3 + .../components/assets/random8M_modified | 3 + .../components/test_gzip_component.py | 64 ++++++++++++++++--- 5 files changed, 67 insertions(+), 9 deletions(-) create mode 100644 ofrak_core/test_ofrak/components/assets/hello_ofrak create mode 100644 ofrak_core/test_ofrak/components/assets/hello_world create mode 100644 ofrak_core/test_ofrak/components/assets/random8M create mode 100644 ofrak_core/test_ofrak/components/assets/random8M_modified diff --git a/ofrak_core/test_ofrak/components/assets/hello_ofrak b/ofrak_core/test_ofrak/components/assets/hello_ofrak new file mode 100644 index 000000000..8ee09ada1 --- /dev/null +++ b/ofrak_core/test_ofrak/components/assets/hello_ofrak @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c160ca1603c2a07cdbfde8119b0889fe8348e70a64899ca558808b39638cc1d0 +size 12 diff --git a/ofrak_core/test_ofrak/components/assets/hello_world b/ofrak_core/test_ofrak/components/assets/hello_world new file mode 100644 index 000000000..0db788d9c --- /dev/null +++ b/ofrak_core/test_ofrak/components/assets/hello_world @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a948904f2f0f479b8f8197694b30184b0d2ed1c1cd2a1ec0fb85d299a192a447 +size 12 diff --git a/ofrak_core/test_ofrak/components/assets/random8M b/ofrak_core/test_ofrak/components/assets/random8M new file mode 100644 index 000000000..173a9ab53 --- /dev/null +++ b/ofrak_core/test_ofrak/components/assets/random8M @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4d088e0d0685acdcccabbe03cc1127411db251eec6718c6d913120bac76e4b3 +size 8388608 diff --git a/ofrak_core/test_ofrak/components/assets/random8M_modified b/ofrak_core/test_ofrak/components/assets/random8M_modified new file mode 100644 index 000000000..1c117861e --- /dev/null +++ b/ofrak_core/test_ofrak/components/assets/random8M_modified @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c68b685e228e64f2f66f4be1104c42be942d927847e4cd5912405613d1f2e64c +size 8388608 diff --git a/ofrak_core/test_ofrak/components/test_gzip_component.py b/ofrak_core/test_ofrak/components/test_gzip_component.py index 416253283..1bfe1d5f4 100644 --- a/ofrak_core/test_ofrak/components/test_gzip_component.py +++ b/ofrak_core/test_ofrak/components/test_gzip_component.py @@ -1,15 +1,41 @@ -from gzip import GzipFile -from io import BytesIO +import zlib +import gzip from pathlib import Path +from asyncio import create_subprocess_exec +from asyncio.subprocess import PIPE +from typing import Tuple +from unittest.mock import patch +from ofrak.component.abstract import ComponentSubprocessError import pytest +from ofrak.ofrak_context import OFRAKContext from ofrak.resource import Resource from ofrak.core.gzip import GzipData from pytest_ofrak.patterns.compressed_filesystem_unpack_modify_pack import ( CompressedFileUnpackModifyPackPattern, ) +ASSETS_DIR = Path(__file__).parent / "assets" + + +@pytest.fixture( + autouse=True, + scope="module", + params=[ + (ASSETS_DIR / "hello_world", ASSETS_DIR / "hello_ofrak", False), + (ASSETS_DIR / "random8M", ASSETS_DIR / "random8M_modified", True), + ], + ids=["hello world", ""], +) +def gzip_test_input(request): + initial_path, repacked_path, expect_pigz = request.param + with open(initial_path, "rb") as initial_file: + initial_data = initial_file.read() + with open(repacked_path, "rb") as repacked_file: + expected_repacked_data = repacked_file.read() + return (initial_data, expected_repacked_data, expect_pigz) + class GzipUnpackModifyPackPattern(CompressedFileUnpackModifyPackPattern): """ @@ -17,32 +43,41 @@ class GzipUnpackModifyPackPattern(CompressedFileUnpackModifyPackPattern): unpacking. """ + EXPECT_PIGZ: bool expected_tag = GzipData def write_gzip(self, gzip_path: Path): raise NotImplementedError @pytest.fixture(autouse=True) - def create_test_file(self, tmp_path: Path): - gzip_path = tmp_path / "hello.gz" + def create_test_file(self, gzip_test_input: Tuple[bytes, bytes, bool], tmp_path: Path): + self.INITIAL_DATA, self.EXPECTED_REPACKED_DATA, self.EXPECT_PIGZ = gzip_test_input + gzip_path = tmp_path / "test.gz" self.write_gzip(gzip_path) self._test_file = gzip_path.resolve() + async def test_unpack_modify_pack(self, ofrak_context: OFRAKContext): + if self.EXPECT_PIGZ: + with patch("asyncio.create_subprocess_exec", wraps=create_subprocess_exec) as mock: + await super().test_unpack_modify_pack(ofrak_context) + mock.assert_any_call("pigz", "-d", stdin=PIPE, stdout=PIPE, stderr=PIPE) + else: + await super().test_unpack_modify_pack(ofrak_context) + async def verify(self, repacked_root_resource: Resource): - patched_gzip_file = GzipFile(fileobj=BytesIO(await repacked_root_resource.get_data())) - patched_decompressed_data = patched_gzip_file.read() + patched_decompressed_data = gzip.decompress(await repacked_root_resource.get_data()) assert patched_decompressed_data == self.EXPECTED_REPACKED_DATA class TestGzipUnpackModifyPack(GzipUnpackModifyPackPattern): def write_gzip(self, gzip_path: Path): - with GzipFile(gzip_path, mode="w") as gzip_file: + with gzip.GzipFile(gzip_path, mode="w") as gzip_file: gzip_file.write(self.INITIAL_DATA) class TestGzipWithMultipleMembersUnpackModifyPack(GzipUnpackModifyPackPattern): def write_gzip(self, gzip_path: Path): - with GzipFile(gzip_path, mode="w") as gzip_file: + with gzip.GzipFile(gzip_path, mode="w") as gzip_file: middle = len(self.INITIAL_DATA) // 2 gzip_file.write(self.INITIAL_DATA[:middle]) gzip_file.write(self.INITIAL_DATA[middle:]) @@ -50,8 +85,19 @@ def write_gzip(self, gzip_path: Path): class TestGzipWithTrailingBytesUnpackModifyPack(GzipUnpackModifyPackPattern): def write_gzip(self, gzip_path: Path): - with GzipFile(gzip_path, mode="w") as gzip_file: + with gzip.GzipFile(gzip_path, mode="w") as gzip_file: gzip_file.write(self.INITIAL_DATA) with open(gzip_path, "ab") as raw_file: raw_file.write(b"\xDE\xAD\xBE\xEF") + + +async def test_corrupted_gzip_fail( + gzip_test_input: Tuple[bytes, bytes, bool], ofrak_context: OFRAKContext +): + initial_data = gzip_test_input[0] + corrupted_data = bytearray(gzip.compress(initial_data)) + corrupted_data[10] = 255 + resource = await ofrak_context.create_root_resource("corrupted.gz", data=bytes(corrupted_data)) + with pytest.raises((zlib.error, ComponentSubprocessError)): + await resource.unpack() From 9b5315a1d7512f4f64dc71b44b499648b32b31cc Mon Sep 17 00:00:00 2001 From: Albert Zhang Date: Thu, 18 Jul 2024 20:42:40 +0000 Subject: [PATCH 07/18] Make ComponentExternalTool hashable based on tool and install_check_arg --- ofrak_core/ofrak/model/component_model.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ofrak_core/ofrak/model/component_model.py b/ofrak_core/ofrak/model/component_model.py index 8823ca42c..c7a040180 100644 --- a/ofrak_core/ofrak/model/component_model.py +++ b/ofrak_core/ofrak/model/component_model.py @@ -18,7 +18,7 @@ class ComponentConfig: """ -@dataclass +@dataclass(unsafe_hash=True) class ComponentExternalTool: """ An external tool or utility (like `zip` or `squashfs`) a component depends on. Includes some @@ -39,11 +39,11 @@ class ComponentExternalTool: """ - tool: str - tool_homepage: str - install_check_arg: str - apt_package: Optional[str] = None - brew_package: Optional[str] = None + tool: str = field(hash=True) + tool_homepage: str = field(hash=False) + install_check_arg: str = field(hash=True) + apt_package: Optional[str] = field(default=None, hash=False) + brew_package: Optional[str] = field(default=None, hash=False) _installed: Optional[bool] = field(default=None, init=False, compare=False) From 7144874658596cd983d2f3404b49f6e197fc3aa3 Mon Sep 17 00:00:00 2001 From: Albert Zhang Date: Wed, 7 Aug 2024 10:40:50 -0400 Subject: [PATCH 08/18] Update previous gzip related changelog message Last change was still in unreleased section --- ofrak_core/CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ofrak_core/CHANGELOG.md b/ofrak_core/CHANGELOG.md index 3c478f320..92477b994 100644 --- a/ofrak_core/CHANGELOG.md +++ b/ofrak_core/CHANGELOG.md @@ -31,7 +31,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) ### Changed - By default, the ofrak log is now `ofrak-YYYYMMDDhhmmss.log` rather than just `ofrak.log` and the name can be specified on the command line ([#480](https://github.com/redballoonsecurity/ofrak/pull/480)) -- In `GripUnpacker`, use `gzip.GzipFile` python unpacker for speed, fall back on `pigz` if needed ([#472](https://github.com/redballoonsecurity/ofrak/pull/472)) +- In `GripUnpacker`, use python `zlib` library unpacker for speed, fall back on `pigz` if needed and installed. ([#472](https://github.com/redballoonsecurity/ofrak/pull/472) and [#485](https://github.com/redballoonsecurity/ofrak/pull/485)) - Change `FreeSpaceModifier` & `PartialFreeSpaceModifier` behavior: an optional stub that isn't free space can be provided and fill-bytes for free space can be specified. ([#409](https://github.com/redballoonsecurity/ofrak/pull/409)) - `Resource.flush_to_disk` method renamed to `Resource.flush_data_to_disk`. ([#373](https://github.com/redballoonsecurity/ofrak/pull/373)) - `build_image.py` supports building Docker images with OFRAK packages from any ancestor directory. ([#425](https://github.com/redballoonsecurity/ofrak/pull/425)) From 6305010e0f22d375428a559a7f453b275ce0df60 Mon Sep 17 00:00:00 2001 From: Albert Zhang Date: Mon, 12 Aug 2024 04:03:28 +0000 Subject: [PATCH 09/18] Actually use pigz as a fallback, clarify changelog message --- ofrak_core/CHANGELOG.md | 2 +- ofrak_core/ofrak/core/gzip.py | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/ofrak_core/CHANGELOG.md b/ofrak_core/CHANGELOG.md index 92477b994..92c0243d1 100644 --- a/ofrak_core/CHANGELOG.md +++ b/ofrak_core/CHANGELOG.md @@ -31,7 +31,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) ### Changed - By default, the ofrak log is now `ofrak-YYYYMMDDhhmmss.log` rather than just `ofrak.log` and the name can be specified on the command line ([#480](https://github.com/redballoonsecurity/ofrak/pull/480)) -- In `GripUnpacker`, use python `zlib` library unpacker for speed, fall back on `pigz` if needed and installed. ([#472](https://github.com/redballoonsecurity/ofrak/pull/472) and [#485](https://github.com/redballoonsecurity/ofrak/pull/485)) +- In `GripUnpacker`, try to use standard python `zlib` library to decompress small files. Use `pigz` if it is installed for files 4MB and larger or as a fallback if python code fails. ([#472](https://github.com/redballoonsecurity/ofrak/pull/472) and [#485](https://github.com/redballoonsecurity/ofrak/pull/485)) - Change `FreeSpaceModifier` & `PartialFreeSpaceModifier` behavior: an optional stub that isn't free space can be provided and fill-bytes for free space can be specified. ([#409](https://github.com/redballoonsecurity/ofrak/pull/409)) - `Resource.flush_to_disk` method renamed to `Resource.flush_data_to_disk`. ([#373](https://github.com/redballoonsecurity/ofrak/pull/373)) - `build_image.py` supports building Docker images with OFRAK packages from any ancestor directory. ([#425](https://github.com/redballoonsecurity/ofrak/pull/425)) diff --git a/ofrak_core/ofrak/core/gzip.py b/ofrak_core/ofrak/core/gzip.py index a6fd08c51..2af23c74d 100644 --- a/ofrak_core/ofrak/core/gzip.py +++ b/ofrak_core/ofrak/core/gzip.py @@ -39,10 +39,16 @@ class GzipUnpacker(Unpacker[None]): async def unpack(self, resource: Resource, config=None): data = await resource.get_data() - if len(data) >= 1024 * 1024 * 4 and await PIGZ.is_tool_installed(): + pigz_installed = await PIGZ.is_tool_installed() + if len(data) >= 1024 * 1024 * 4 and pigz_installed: uncompressed_data = await self.unpack_with_pigz(data) else: - uncompressed_data = await self.unpack_with_zlib_module(data) + try: + uncompressed_data = await self.unpack_with_zlib_module(data) + except Exception: # pragma: no cover + if not pigz_installed: + raise + uncompressed_data = await self.unpack_with_pigz(data) return await resource.create_child(tags=(GenericBinary,), data=uncompressed_data) @staticmethod From 5e2c6200cf74dd5226dfd778aadb9b00c9626f3a Mon Sep 17 00:00:00 2001 From: Albert Zhang Date: Tue, 13 Aug 2024 14:31:50 +0000 Subject: [PATCH 10/18] Raise NotImplementedError instance in write_gzip() and make it abstractmethod --- ofrak_core/test_ofrak/components/test_gzip_component.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ofrak_core/test_ofrak/components/test_gzip_component.py b/ofrak_core/test_ofrak/components/test_gzip_component.py index 1bfe1d5f4..73f618c8a 100644 --- a/ofrak_core/test_ofrak/components/test_gzip_component.py +++ b/ofrak_core/test_ofrak/components/test_gzip_component.py @@ -5,6 +5,7 @@ from asyncio.subprocess import PIPE from typing import Tuple from unittest.mock import patch +from abc import ABC, abstractmethod from ofrak.component.abstract import ComponentSubprocessError import pytest @@ -37,7 +38,7 @@ def gzip_test_input(request): return (initial_data, expected_repacked_data, expect_pigz) -class GzipUnpackModifyPackPattern(CompressedFileUnpackModifyPackPattern): +class GzipUnpackModifyPackPattern(CompressedFileUnpackModifyPackPattern, ABC): """ Template for tests that test different inputs the gzip component should support unpacking. @@ -46,8 +47,9 @@ class GzipUnpackModifyPackPattern(CompressedFileUnpackModifyPackPattern): EXPECT_PIGZ: bool expected_tag = GzipData + @abstractmethod def write_gzip(self, gzip_path: Path): - raise NotImplementedError + raise NotImplementedError() @pytest.fixture(autouse=True) def create_test_file(self, gzip_test_input: Tuple[bytes, bytes, bool], tmp_path: Path): From ad6469a24002af6c6b76791dc2ff3907b4f9aa39 Mon Sep 17 00:00:00 2001 From: Albert Zhang Date: Tue, 13 Aug 2024 14:33:55 +0000 Subject: [PATCH 11/18] Revert caching of is_tool_installed in ComponentExternalTool --- ofrak_core/ofrak/model/component_model.py | 26 ++++++++--------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/ofrak_core/ofrak/model/component_model.py b/ofrak_core/ofrak/model/component_model.py index c7a040180..be1123c36 100644 --- a/ofrak_core/ofrak/model/component_model.py +++ b/ofrak_core/ofrak/model/component_model.py @@ -18,7 +18,7 @@ class ComponentConfig: """ -@dataclass(unsafe_hash=True) +@dataclass(frozen=True) class ComponentExternalTool: """ An external tool or utility (like `zip` or `squashfs`) a component depends on. Includes some @@ -39,27 +39,20 @@ class ComponentExternalTool: """ - tool: str = field(hash=True) - tool_homepage: str = field(hash=False) - install_check_arg: str = field(hash=True) - apt_package: Optional[str] = field(default=None, hash=False) - brew_package: Optional[str] = field(default=None, hash=False) - - _installed: Optional[bool] = field(default=None, init=False, compare=False) + tool: str + tool_homepage: str + install_check_arg: str + apt_package: Optional[str] = None + brew_package: Optional[str] = None async def is_tool_installed(self) -> bool: """ Check if a tool is installed by running it with the `install_check_arg`. - This method runs ` ` the first time it is called. - The result is cached for future calls. + This method runs ` `. :return: True if the `tool` command returned zero, False if `tool` could not be found or returned non-zero exit code. """ - - if self._installed is not None: - return self._installed - try: cmd = [ self.tool, @@ -72,11 +65,10 @@ async def is_tool_installed(self) -> bool: ) returncode = await proc.wait() - self._installed = 0 == returncode except FileNotFoundError: - self._installed = False + return False - return self._installed + return 0 == returncode CC = TypeVar("CC", bound=Optional[ComponentConfig]) From 595e5e1e04b708b354b9c9f6232b06b5049831d1 Mon Sep 17 00:00:00 2001 From: Albert Zhang Date: Tue, 13 Aug 2024 15:06:46 +0000 Subject: [PATCH 12/18] Cache PIGZ installed or not in gzip component module --- ofrak_core/ofrak/core/gzip.py | 57 ++++++++++++++++++++++++++++------- 1 file changed, 46 insertions(+), 11 deletions(-) diff --git a/ofrak_core/ofrak/core/gzip.py b/ofrak_core/ofrak/core/gzip.py index 2af23c74d..c97b25b12 100644 --- a/ofrak_core/ofrak/core/gzip.py +++ b/ofrak_core/ofrak/core/gzip.py @@ -1,5 +1,6 @@ import asyncio import logging +from typing import Optional import zlib from subprocess import CalledProcessError @@ -18,6 +19,16 @@ ) +class PIGZInstalled: + _pigz_installed: Optional[bool] = None + + @staticmethod + async def is_pigz_installed() -> bool: + if PIGZInstalled._pigz_installed is None: + PIGZInstalled._pigz_installed = await PIGZ.is_tool_installed() + return PIGZInstalled._pigz_installed + + class GzipData(GenericBinary): """ A gzip binary blob. @@ -39,17 +50,16 @@ class GzipUnpacker(Unpacker[None]): async def unpack(self, resource: Resource, config=None): data = await resource.get_data() - pigz_installed = await PIGZ.is_tool_installed() - if len(data) >= 1024 * 1024 * 4 and pigz_installed: - uncompressed_data = await self.unpack_with_pigz(data) + if len(data) >= 1024 * 1024 * 4 and await PIGZInstalled.is_pigz_installed(): + unpacked_data = await self.unpack_with_pigz(data) else: try: - uncompressed_data = await self.unpack_with_zlib_module(data) + unpacked_data = await self.unpack_with_zlib_module(data) except Exception: # pragma: no cover - if not pigz_installed: + if not PIGZInstalled.is_pigz_installed(): raise - uncompressed_data = await self.unpack_with_pigz(data) - return await resource.create_child(tags=(GenericBinary,), data=uncompressed_data) + unpacked_data = await self.unpack_with_pigz(data) + return await resource.create_child(tags=(GenericBinary,), data=unpacked_data) @staticmethod async def unpack_with_zlib_module(data: bytes) -> bytes: @@ -96,12 +106,37 @@ class GzipPacker(Packer[None]): async def pack(self, resource: Resource, config=None): gzip_view = await resource.view_as(GzipData) gzip_child_r = await gzip_view.get_file() - gzip_data = await gzip_child_r.get_data() + data = await gzip_child_r.get_data() + + if len(data) >= 1024 * 1024 and await PIGZInstalled.is_pigz_installed(): + packed_data = await self.pack_with_pigz(data) + else: + packed_data = await self.pack_with_zlib_module(data) + + original_gzip_size = await gzip_view.resource.get_data_length() + resource.queue_patch(Range(0, original_gzip_size), data=packed_data) + + @staticmethod + async def pack_with_zlib_module(data: bytes) -> bytes: compressor = zlib.compressobj(wbits=16 + zlib.MAX_WBITS) - result = compressor.compress(gzip_data) + result = compressor.compress(data) result += compressor.flush() - original_gzip_size = await gzip_view.resource.get_data_length() - resource.queue_patch(Range(0, original_gzip_size), result) + return result + + @staticmethod + async def pack_with_pigz(data: bytes) -> bytes: + cmd = ["pigz"] + proc = await asyncio.create_subprocess_exec( + *cmd, + stdin=asyncio.subprocess.PIPE, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await proc.communicate(data) + if proc.returncode: + raise CalledProcessError(returncode=proc.returncode, cmd=cmd, stderr=stderr) + + return stdout MagicMimeIdentifier.register(GzipData, "application/gzip") From 595b9af19c1fc6600d45146b9af3e406771aaf8f Mon Sep 17 00:00:00 2001 From: Albert Zhang Date: Tue, 13 Aug 2024 15:07:49 +0000 Subject: [PATCH 13/18] Test that PIGZ is used for packing large file and NOT used for small files --- .../test_ofrak/components/test_gzip_component.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/ofrak_core/test_ofrak/components/test_gzip_component.py b/ofrak_core/test_ofrak/components/test_gzip_component.py index 73f618c8a..ea0542b32 100644 --- a/ofrak_core/test_ofrak/components/test_gzip_component.py +++ b/ofrak_core/test_ofrak/components/test_gzip_component.py @@ -59,12 +59,14 @@ def create_test_file(self, gzip_test_input: Tuple[bytes, bytes, bool], tmp_path: self._test_file = gzip_path.resolve() async def test_unpack_modify_pack(self, ofrak_context: OFRAKContext): - if self.EXPECT_PIGZ: - with patch("asyncio.create_subprocess_exec", wraps=create_subprocess_exec) as mock: + with patch("asyncio.create_subprocess_exec", wraps=create_subprocess_exec) as mock_exec: + if self.EXPECT_PIGZ: await super().test_unpack_modify_pack(ofrak_context) - mock.assert_any_call("pigz", "-d", stdin=PIPE, stdout=PIPE, stderr=PIPE) - else: - await super().test_unpack_modify_pack(ofrak_context) + mock_exec.assert_any_call("pigz", "-d", stdin=PIPE, stdout=PIPE, stderr=PIPE) + mock_exec.assert_any_call("pigz", stdin=PIPE, stdout=PIPE, stderr=PIPE) + else: + await super().test_unpack_modify_pack(ofrak_context) + mock_exec.assert_not_called() async def verify(self, repacked_root_resource: Resource): patched_decompressed_data = gzip.decompress(await repacked_root_resource.get_data()) From 8609b8e2b77a034ada40f24e3950f94ac8cfc6b2 Mon Sep 17 00:00:00 2001 From: Albert Zhang Date: Wed, 14 Aug 2024 03:36:46 +0000 Subject: [PATCH 14/18] Only use PIGZ for compression, not decompression --- ofrak_core/CHANGELOG.md | 2 +- ofrak_core/ofrak/core/gzip.py | 59 ++++++++----------- .../components/test_gzip_component.py | 6 +- 3 files changed, 27 insertions(+), 40 deletions(-) diff --git a/ofrak_core/CHANGELOG.md b/ofrak_core/CHANGELOG.md index 92c0243d1..5b97ce71f 100644 --- a/ofrak_core/CHANGELOG.md +++ b/ofrak_core/CHANGELOG.md @@ -31,7 +31,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) ### Changed - By default, the ofrak log is now `ofrak-YYYYMMDDhhmmss.log` rather than just `ofrak.log` and the name can be specified on the command line ([#480](https://github.com/redballoonsecurity/ofrak/pull/480)) -- In `GripUnpacker`, try to use standard python `zlib` library to decompress small files. Use `pigz` if it is installed for files 4MB and larger or as a fallback if python code fails. ([#472](https://github.com/redballoonsecurity/ofrak/pull/472) and [#485](https://github.com/redballoonsecurity/ofrak/pull/485)) +- In `GripUnpacker`, use the standard python `zlib` library to compress small files and decompress all files. Use `pigz` if it is installed to compress files 1MB and larger. ([#472](https://github.com/redballoonsecurity/ofrak/pull/472) and [#485](https://github.com/redballoonsecurity/ofrak/pull/485)) - Change `FreeSpaceModifier` & `PartialFreeSpaceModifier` behavior: an optional stub that isn't free space can be provided and fill-bytes for free space can be specified. ([#409](https://github.com/redballoonsecurity/ofrak/pull/409)) - `Resource.flush_to_disk` method renamed to `Resource.flush_data_to_disk`. ([#373](https://github.com/redballoonsecurity/ofrak/pull/373)) - `build_image.py` supports building Docker images with OFRAK packages from any ancestor directory. ([#425](https://github.com/redballoonsecurity/ofrak/pull/425)) diff --git a/ofrak_core/ofrak/core/gzip.py b/ofrak_core/ofrak/core/gzip.py index c97b25b12..d5c7a463e 100644 --- a/ofrak_core/ofrak/core/gzip.py +++ b/ofrak_core/ofrak/core/gzip.py @@ -3,6 +3,7 @@ from typing import Optional import zlib from subprocess import CalledProcessError +import tempfile from ofrak.component.packer import Packer from ofrak.component.unpacker import Unpacker @@ -14,6 +15,8 @@ LOGGER = logging.getLogger(__name__) +# PIGZ provides significantly faster compression on multi core systems. +# It does not parallelize decompression, so we don't use it in GzipUnpacker. PIGZ = ComponentExternalTool( "pigz", "https://zlib.net/pigz/", "--help", apt_package="pigz", brew_package="pigz" ) @@ -50,15 +53,7 @@ class GzipUnpacker(Unpacker[None]): async def unpack(self, resource: Resource, config=None): data = await resource.get_data() - if len(data) >= 1024 * 1024 * 4 and await PIGZInstalled.is_pigz_installed(): - unpacked_data = await self.unpack_with_pigz(data) - else: - try: - unpacked_data = await self.unpack_with_zlib_module(data) - except Exception: # pragma: no cover - if not PIGZInstalled.is_pigz_installed(): - raise - unpacked_data = await self.unpack_with_pigz(data) + unpacked_data = await self.unpack_with_zlib_module(data) return await resource.create_child(tags=(GenericBinary,), data=unpacked_data) @staticmethod @@ -80,21 +75,6 @@ async def unpack_with_zlib_module(data: bytes) -> bytes: return b"".join(chunks) - @staticmethod - async def unpack_with_pigz(data: bytes) -> bytes: - cmd = ["pigz", "-d"] - proc = await asyncio.create_subprocess_exec( - *cmd, - stdin=asyncio.subprocess.PIPE, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - stdout, stderr = await proc.communicate(data) - if proc.returncode: - raise CalledProcessError(returncode=proc.returncode, cmd=cmd, stderr=stderr) - - return stdout - class GzipPacker(Packer[None]): """ @@ -125,18 +105,25 @@ async def pack_with_zlib_module(data: bytes) -> bytes: @staticmethod async def pack_with_pigz(data: bytes) -> bytes: - cmd = ["pigz"] - proc = await asyncio.create_subprocess_exec( - *cmd, - stdin=asyncio.subprocess.PIPE, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - stdout, stderr = await proc.communicate(data) - if proc.returncode: - raise CalledProcessError(returncode=proc.returncode, cmd=cmd, stderr=stderr) - - return stdout + with tempfile.NamedTemporaryFile() as uncompressed_file: + uncompressed_file.write(data) + uncompressed_file.flush() + + cmd = [ + "pigz", + "-c", + uncompressed_file.name, + ] + proc = await asyncio.create_subprocess_exec( + *cmd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await proc.communicate() + if proc.returncode: + raise CalledProcessError(returncode=proc.returncode, stderr=stderr, cmd=cmd) + + return stdout MagicMimeIdentifier.register(GzipData, "application/gzip") diff --git a/ofrak_core/test_ofrak/components/test_gzip_component.py b/ofrak_core/test_ofrak/components/test_gzip_component.py index ea0542b32..9809aad86 100644 --- a/ofrak_core/test_ofrak/components/test_gzip_component.py +++ b/ofrak_core/test_ofrak/components/test_gzip_component.py @@ -2,7 +2,6 @@ import gzip from pathlib import Path from asyncio import create_subprocess_exec -from asyncio.subprocess import PIPE from typing import Tuple from unittest.mock import patch from abc import ABC, abstractmethod @@ -62,8 +61,9 @@ async def test_unpack_modify_pack(self, ofrak_context: OFRAKContext): with patch("asyncio.create_subprocess_exec", wraps=create_subprocess_exec) as mock_exec: if self.EXPECT_PIGZ: await super().test_unpack_modify_pack(ofrak_context) - mock_exec.assert_any_call("pigz", "-d", stdin=PIPE, stdout=PIPE, stderr=PIPE) - mock_exec.assert_any_call("pigz", stdin=PIPE, stdout=PIPE, stderr=PIPE) + assert any( + args[0][0] == "pigz" and args[0][1] == "-c" for args in mock_exec.call_args_list + ) else: await super().test_unpack_modify_pack(ofrak_context) mock_exec.assert_not_called() From 9753ca5a8652a8f249b79650c9bbb965b07699bd Mon Sep 17 00:00:00 2001 From: Wyatt <53830972+whyitfor@users.noreply.github.com> Date: Thu, 15 Aug 2024 15:43:39 -0400 Subject: [PATCH 15/18] Update ofrak_core/CHANGELOG.md --- ofrak_core/CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ofrak_core/CHANGELOG.md b/ofrak_core/CHANGELOG.md index 5b97ce71f..2d493fc2c 100644 --- a/ofrak_core/CHANGELOG.md +++ b/ofrak_core/CHANGELOG.md @@ -31,7 +31,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) ### Changed - By default, the ofrak log is now `ofrak-YYYYMMDDhhmmss.log` rather than just `ofrak.log` and the name can be specified on the command line ([#480](https://github.com/redballoonsecurity/ofrak/pull/480)) -- In `GripUnpacker`, use the standard python `zlib` library to compress small files and decompress all files. Use `pigz` if it is installed to compress files 1MB and larger. ([#472](https://github.com/redballoonsecurity/ofrak/pull/472) and [#485](https://github.com/redballoonsecurity/ofrak/pull/485)) +- In `GzipUnpacker`, use the standard python `zlib` library to compress small files and decompress all files. Use `pigz` if it is installed to compress files 1MB and larger. ([#472](https://github.com/redballoonsecurity/ofrak/pull/472) and [#485](https://github.com/redballoonsecurity/ofrak/pull/485)) - Change `FreeSpaceModifier` & `PartialFreeSpaceModifier` behavior: an optional stub that isn't free space can be provided and fill-bytes for free space can be specified. ([#409](https://github.com/redballoonsecurity/ofrak/pull/409)) - `Resource.flush_to_disk` method renamed to `Resource.flush_data_to_disk`. ([#373](https://github.com/redballoonsecurity/ofrak/pull/373)) - `build_image.py` supports building Docker images with OFRAK packages from any ancestor directory. ([#425](https://github.com/redballoonsecurity/ofrak/pull/425)) From d954073693be1023fe57ace72bebe23bd21f619b Mon Sep 17 00:00:00 2001 From: Albert Zhang Date: Thu, 15 Aug 2024 21:04:47 +0000 Subject: [PATCH 16/18] Improve comments in `unpack_with_zlib_module` --- ofrak_core/ofrak/core/gzip.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/ofrak_core/ofrak/core/gzip.py b/ofrak_core/ofrak/core/gzip.py index d5c7a463e..e6bcbb119 100644 --- a/ofrak_core/ofrak/core/gzip.py +++ b/ofrak_core/ofrak/core/gzip.py @@ -58,12 +58,17 @@ async def unpack(self, resource: Resource, config=None): @staticmethod async def unpack_with_zlib_module(data: bytes) -> bytes: - chunks = [] - + # We use zlib.decompressobj instead of the gzip module to decompress + # because of a bug that causes gzip raise BadGzipFile if there's + # trailing garbage after a compressed file instead of correctly ignoring it + # https://github.com/python/cpython/issues/68489 # wbits > 16 handles the gzip header and footer - # We need to create a zlib.Decompress object in order to use this - # parameter in Python < 3.11 decompressor = zlib.decompressobj(wbits=16 + zlib.MAX_WBITS) + + # gzip files can consist of multiple members, so we need to read them in + # a loop and concatenate them in the end. \037\213 are magic bytes + # indicating the start of a gzip header. + chunks = [] while data.startswith(b"\037\213"): chunks.append(decompressor.decompress(data)) if decompressor.eof: From bec8fa147ebdcdc5426a88b5825f9f5724570abc Mon Sep 17 00:00:00 2001 From: Albert Zhang Date: Thu, 15 Aug 2024 21:37:41 +0000 Subject: [PATCH 17/18] Correctly handle multiple member decompression --- ofrak_core/ofrak/core/gzip.py | 8 ++++---- ofrak_core/test_ofrak/components/test_gzip_component.py | 4 +++- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/ofrak_core/ofrak/core/gzip.py b/ofrak_core/ofrak/core/gzip.py index e6bcbb119..4952df8cb 100644 --- a/ofrak_core/ofrak/core/gzip.py +++ b/ofrak_core/ofrak/core/gzip.py @@ -59,20 +59,20 @@ async def unpack(self, resource: Resource, config=None): @staticmethod async def unpack_with_zlib_module(data: bytes) -> bytes: # We use zlib.decompressobj instead of the gzip module to decompress - # because of a bug that causes gzip raise BadGzipFile if there's + # because of a bug that causes gzip to raise BadGzipFile if there's # trailing garbage after a compressed file instead of correctly ignoring it # https://github.com/python/cpython/issues/68489 # wbits > 16 handles the gzip header and footer - decompressor = zlib.decompressobj(wbits=16 + zlib.MAX_WBITS) # gzip files can consist of multiple members, so we need to read them in # a loop and concatenate them in the end. \037\213 are magic bytes # indicating the start of a gzip header. chunks = [] while data.startswith(b"\037\213"): + decompressor = zlib.decompressobj(wbits=16 + zlib.MAX_WBITS) chunks.append(decompressor.decompress(data)) - if decompressor.eof: - break + if not decompressor.eof: + raise ValueError("Incomplete gzip file") data = decompressor.unused_data.lstrip(b"\0") if not len(chunks): diff --git a/ofrak_core/test_ofrak/components/test_gzip_component.py b/ofrak_core/test_ofrak/components/test_gzip_component.py index 9809aad86..9abc7ccb7 100644 --- a/ofrak_core/test_ofrak/components/test_gzip_component.py +++ b/ofrak_core/test_ofrak/components/test_gzip_component.py @@ -81,9 +81,11 @@ def write_gzip(self, gzip_path: Path): class TestGzipWithMultipleMembersUnpackModifyPack(GzipUnpackModifyPackPattern): def write_gzip(self, gzip_path: Path): + middle = len(self.INITIAL_DATA) // 2 with gzip.GzipFile(gzip_path, mode="w") as gzip_file: - middle = len(self.INITIAL_DATA) // 2 gzip_file.write(self.INITIAL_DATA[:middle]) + + with gzip.GzipFile(gzip_path, mode="a") as gzip_file: gzip_file.write(self.INITIAL_DATA[middle:]) From 88f230b00be8f8a5013a39f2740e7f24b9da008b Mon Sep 17 00:00:00 2001 From: Albert Zhang Date: Thu, 15 Aug 2024 21:40:14 +0000 Subject: [PATCH 18/18] Move wbits comment line --- ofrak_core/ofrak/core/gzip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ofrak_core/ofrak/core/gzip.py b/ofrak_core/ofrak/core/gzip.py index 4952df8cb..342810a6c 100644 --- a/ofrak_core/ofrak/core/gzip.py +++ b/ofrak_core/ofrak/core/gzip.py @@ -62,13 +62,13 @@ async def unpack_with_zlib_module(data: bytes) -> bytes: # because of a bug that causes gzip to raise BadGzipFile if there's # trailing garbage after a compressed file instead of correctly ignoring it # https://github.com/python/cpython/issues/68489 - # wbits > 16 handles the gzip header and footer # gzip files can consist of multiple members, so we need to read them in # a loop and concatenate them in the end. \037\213 are magic bytes # indicating the start of a gzip header. chunks = [] while data.startswith(b"\037\213"): + # wbits > 16 handles the gzip header and footer decompressor = zlib.decompressobj(wbits=16 + zlib.MAX_WBITS) chunks.append(decompressor.decompress(data)) if not decompressor.eof: