Rewrite gzip component to support trailing bytes without external tool and compress with PIGZ. Addresses #476 #485

Merged Aug 15, 2024 (20 commits)

Changes from 10 commits

Commits:
0a8c069  Rewrite gzip component to support trailing bytes without external too… (alchzh, Jul 17, 2024)
3331f71  Switch to pigz for 4MiB or larger files and pipe directly to stdin in… (alchzh, Jul 17, 2024)
eecfcbe  Update docstring in test_gzip_component.py (alchzh, Jul 18, 2024)
37ba179  Refactor unpack logic to separate functions (alchzh, Jul 18, 2024)
ed9005f  cache result of is_tool_installed (alchzh, Jul 18, 2024)
e9fbdb3  Comprehensive gzip test cases (alchzh, Jul 18, 2024)
9b5315a  Make ComponentExternalTool hashable based on tool and install_check_arg (alchzh, Jul 18, 2024)
c0640ad  Merge branch 'redballoonsecurity:master' into python-gzip-trailing-bytes (alchzh, Aug 7, 2024)
7144874  Update previous gzip related changelog message (alchzh, Aug 7, 2024)
6305010  Actually use pigz as a fallback, clarify changelog message (alchzh, Aug 12, 2024)
5e2c620  Raise NotImplementedError instance in write_gzip() and make it abstra… (alchzh, Aug 13, 2024)
ad6469a  Revert caching of is_tool_installed in ComponentExternalTool (alchzh, Aug 13, 2024)
595e5e1  Cache PIGZ installed or not in gzip component module (alchzh, Aug 13, 2024)
595b9af  Test that PIGZ is used for packing large file and NOT used for small … (alchzh, Aug 13, 2024)
8609b8e  Only use PIGZ for compression, not decompression (alchzh, Aug 14, 2024)
f996194  Merge remote-tracking branch 'origin/master' into python-gzip-trailin… (alchzh, Aug 15, 2024)
9753ca5  Update ofrak_core/CHANGELOG.md (whyitfor, Aug 15, 2024)
d954073  Improve comments in `unpack_with_zlib_module` (alchzh, Aug 15, 2024)
bec8fa1  Correctly handle multiple member decompression (alchzh, Aug 15, 2024)
88f230b  Move wbits comment line (alchzh, Aug 15, 2024)
2 changes: 1 addition & 1 deletion ofrak_core/CHANGELOG.md
@@ -31,7 +31,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)

### Changed
- By default, the ofrak log is now `ofrak-YYYYMMDDhhmmss.log` rather than just `ofrak.log` and the name can be specified on the command line ([#480](https://github.com/redballoonsecurity/ofrak/pull/480))
- In `GzipUnpacker`, use `gzip.GzipFile` python unpacker for speed, fall back on `pigz` if needed ([#472](https://github.com/redballoonsecurity/ofrak/pull/472))
- In `GzipUnpacker`, try to use standard python `zlib` library to decompress small files. Use `pigz` if it is installed for files 4MB and larger or as a fallback if python code fails. ([#472](https://github.com/redballoonsecurity/ofrak/pull/472) and [#485](https://github.com/redballoonsecurity/ofrak/pull/485))
- Change `FreeSpaceModifier` & `PartialFreeSpaceModifier` behavior: an optional stub that isn't free space can be provided and fill-bytes for free space can be specified. ([#409](https://github.com/redballoonsecurity/ofrak/pull/409))
- `Resource.flush_to_disk` method renamed to `Resource.flush_data_to_disk`. ([#373](https://github.com/redballoonsecurity/ofrak/pull/373))
- `build_image.py` supports building Docker images with OFRAK packages from any ancestor directory. ([#425](https://github.com/redballoonsecurity/ofrak/pull/425))
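
The changelog entry above summarizes the new decompression strategy. As a point of reference, here is a minimal sketch (illustrative only, not part of this diff; it assumes Python 3.8+ for `gzip.BadGzipFile`) showing why the stdlib `gzip` module alone was not enough for #476, and how `zlib.decompressobj` with gzip-style `wbits` tolerates trailing bytes:

```python
import gzip
import zlib

# A valid single-member gzip stream followed by four arbitrary trailing bytes,
# i.e. the kind of input from #476 that a GzipFile-based unpacker rejects.
payload = gzip.compress(b"hello world") + b"\xde\xad\xbe\xef"

# The stdlib gzip module refuses the stream once it reaches the junk "header".
try:
    gzip.decompress(payload)
except gzip.BadGzipFile as exc:
    print("gzip.decompress rejected trailing bytes:", exc)

# zlib with wbits=16+MAX_WBITS parses the gzip wrapper, stops cleanly at the
# end of the member, and exposes whatever followed it via unused_data.
decompressor = zlib.decompressobj(wbits=16 + zlib.MAX_WBITS)
decompressed = decompressor.decompress(payload)
assert decompressed == b"hello world"
assert decompressor.eof
assert decompressor.unused_data == b"\xde\xad\xbe\xef"
```
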
102 changes: 52 additions & 50 deletions ofrak_core/ofrak/core/gzip.py
@@ -1,8 +1,6 @@
import asyncio
import logging
import tempfile
from gzip import BadGzipFile, GzipFile
from io import BytesIO
import zlib
from subprocess import CalledProcessError

from ofrak.component.packer import Packer
@@ -41,44 +39,51 @@ class GzipUnpacker(Unpacker[None]):

async def unpack(self, resource: Resource, config=None):
data = await resource.get_data()
# GzipFile is faster (spawning external processes has overhead),
# but pigz is more willing to tolerate things like extra junk at the end
try:
with GzipFile(fileobj=BytesIO(data), mode="r") as gzip_file:
return await resource.create_child(
tags=(GenericBinary,),
data=gzip_file.read(),
)
except BadGzipFile:
# Create temporary file with .gz extension
with tempfile.NamedTemporaryFile(suffix=".gz") as temp_file:
temp_file.write(data)
temp_file.flush()
cmd = [
"pigz",
"-d",
"-c",
temp_file.name,
]
proc = await asyncio.create_subprocess_exec(
*cmd,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
stdout, stderr = await proc.communicate()
data = stdout
if proc.returncode:
# Forward any gzip warning message and continue
if proc.returncode == -2 or proc.returncode == 2:
LOGGER.warning(stderr)
data = stdout
else:
raise CalledProcessError(returncode=proc.returncode, cmd=cmd, stderr=stderr)

await resource.create_child(
tags=(GenericBinary,),
data=data,
)
pigz_installed = await PIGZ.is_tool_installed()
if len(data) >= 1024 * 1024 * 4 and pigz_installed:
uncompressed_data = await self.unpack_with_pigz(data)
else:
try:
uncompressed_data = await self.unpack_with_zlib_module(data)
except Exception: # pragma: no cover
if not pigz_installed:
raise
uncompressed_data = await self.unpack_with_pigz(data)
return await resource.create_child(tags=(GenericBinary,), data=uncompressed_data)

@staticmethod
async def unpack_with_zlib_module(data: bytes) -> bytes:
chunks = []

# wbits > 16 handles the gzip header and footer
# We need to create a zlib.Decompress object in order to use this
# parameter in Python < 3.11
decompressor = zlib.decompressobj(wbits=16 + zlib.MAX_WBITS)
while data.startswith(b"\037\213"):
chunks.append(decompressor.decompress(data))
if decompressor.eof:
break
data = decompressor.unused_data.lstrip(b"\0")

if not len(chunks):
raise ValueError("Not a gzipped file")

return b"".join(chunks)

@staticmethod
async def unpack_with_pigz(data: bytes) -> bytes:
cmd = ["pigz", "-d"]
proc = await asyncio.create_subprocess_exec(
*cmd,
stdin=asyncio.subprocess.PIPE,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
stdout, stderr = await proc.communicate(data)
if proc.returncode:
raise CalledProcessError(returncode=proc.returncode, cmd=cmd, stderr=stderr)

return stdout


class GzipPacker(Packer[None]):
@@ -87,19 +92,16 @@ class GzipPacker(Packer[None]):
"""

targets = (GzipData,)
external_dependencies = (PIGZ,)

async def pack(self, resource: Resource, config=None):
gzip_view = await resource.view_as(GzipData)

result = BytesIO()
with GzipFile(fileobj=result, mode="w") as gzip_file:
gzip_child_r = await gzip_view.get_file()
gzip_data = await gzip_child_r.get_data()
gzip_file.write(gzip_data)

gzip_child_r = await gzip_view.get_file()
gzip_data = await gzip_child_r.get_data()
compressor = zlib.compressobj(wbits=16 + zlib.MAX_WBITS)
result = compressor.compress(gzip_data)
result += compressor.flush()
original_gzip_size = await gzip_view.resource.get_data_length()
resource.queue_patch(Range(0, original_gzip_size), result.getvalue())
resource.queue_patch(Range(0, original_gzip_size), result)


MagicMimeIdentifier.register(GzipData, "application/gzip")
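
The reworked unpacker and packer above both lean on `zlib`'s gzip container support (`wbits=16 + zlib.MAX_WBITS`). The standalone sketch below (illustrative, not part of the diff) shows why members must be drained one at a time for concatenated-member files — the case the later "Correctly handle multiple member decompression" commit in this PR addresses — and that `zlib.compressobj` output round-trips through the stdlib `gzip` module, as the new `GzipPacker.pack` assumes:

```python
import gzip
import zlib

GZIP_MAGIC = b"\037\213"

# Two gzip members back to back, e.g. what `cat a.gz b.gz` produces on disk.
stream = gzip.compress(b"first member, ") + gzip.compress(b"second member")

# A single decompressobj pass stops at the first member's trailer; the rest of
# the input (the second member) is left untouched in unused_data.
one_shot = zlib.decompressobj(wbits=16 + zlib.MAX_WBITS)
assert one_shot.decompress(stream) == b"first member, "
assert one_shot.eof and one_shot.unused_data.startswith(GZIP_MAGIC)

# Draining every member therefore needs a loop that restarts decompression on
# the unused data for as long as it still looks like a gzip member.
chunks = []
data = stream
while data.startswith(GZIP_MAGIC):
    decompressor = zlib.decompressobj(wbits=16 + zlib.MAX_WBITS)
    chunks.append(decompressor.decompress(data))
    if not decompressor.eof:
        break  # truncated final member
    data = decompressor.unused_data.lstrip(b"\0")
unpacked = b"".join(chunks)
assert unpacked == b"first member, second member"

# Packing side: compressobj with the same wbits emits a gzip stream that the
# stdlib gzip module reads back unchanged.
compressor = zlib.compressobj(wbits=16 + zlib.MAX_WBITS)
repacked = compressor.compress(unpacked) + compressor.flush()
assert gzip.decompress(repacked) == unpacked
```
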
26 changes: 17 additions & 9 deletions ofrak_core/ofrak/model/component_model.py
@@ -18,7 +18,7 @@ class ComponentConfig:
"""


@dataclass(frozen=True)
@dataclass(unsafe_hash=True)
class ComponentExternalTool:
"""
An external tool or utility (like `zip` or `squashfs`) a component depends on. Includes some
@@ -39,20 +39,27 @@

"""

tool: str
tool_homepage: str
install_check_arg: str
apt_package: Optional[str] = None
brew_package: Optional[str] = None
tool: str = field(hash=True)
tool_homepage: str = field(hash=False)
install_check_arg: str = field(hash=True)
apt_package: Optional[str] = field(default=None, hash=False)
brew_package: Optional[str] = field(default=None, hash=False)

_installed: Optional[bool] = field(default=None, init=False, compare=False)

async def is_tool_installed(self) -> bool:
"""
Check if a tool is installed by running it with the `install_check_arg`.
This method runs `<tool> <install_check_arg>`.
This method runs `<tool> <install_check_arg>` the first time it is called.
The result is cached for future calls.

:return: True if the `tool` command returned zero, False if `tool` could not be found or
returned non-zero exit code.
"""

if self._installed is not None:
return self._installed

try:
cmd = [
self.tool,
@@ -65,10 +72,11 @@ async def is_tool_installed(self) -> bool:
)

returncode = await proc.wait()
self._installed = 0 == returncode
except FileNotFoundError:
return False
self._installed = False

return 0 == returncode
return self._installed


CC = TypeVar("CC", bound=Optional[ComponentConfig])
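
To make the caching and hashing changes above concrete, here is a small hypothetical usage sketch. The `pigz` field values are illustrative rather than OFRAK's actual `PIGZ` definition referenced by the gzip component, and note that a later commit in this PR ("Revert caching of is_tool_installed in ComponentExternalTool") moves the caching out of this class:

```python
import asyncio

from ofrak.model.component_model import ComponentExternalTool

# Illustrative field values; the real PIGZ constant may differ.
PIGZ = ComponentExternalTool(
    "pigz",
    "https://zlib.net/pigz/",
    "--version",
    apt_package="pigz",
    brew_package="pigz",
)


async def main() -> None:
    # The first call runs `pigz --version` (or hits FileNotFoundError) and
    # stores the result in _installed; the second call returns the cached
    # value without spawning another subprocess.
    first = await PIGZ.is_tool_installed()
    second = await PIGZ.is_tool_installed()
    assert first == second

    # With unsafe_hash=True and hash=True only on `tool` and
    # `install_check_arg`, instances that run the same check hash identically,
    # so they can be used as dict keys or set members.
    other = ComponentExternalTool("pigz", "https://example.com/pigz", "--version")
    assert hash(PIGZ) == hash(other)


asyncio.run(main())
```
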
3 changes: 3 additions & 0 deletions ofrak_core/test_ofrak/components/assets/hello_ofrak
Git LFS file not shown
3 changes: 3 additions & 0 deletions ofrak_core/test_ofrak/components/assets/hello_world
Git LFS file not shown
3 changes: 3 additions & 0 deletions ofrak_core/test_ofrak/components/assets/random8M
Git LFS file not shown
3 changes: 3 additions & 0 deletions ofrak_core/test_ofrak/components/assets/random8M_modified
Git LFS file not shown
156 changes: 86 additions & 70 deletions ofrak_core/test_ofrak/components/test_gzip_component.py
@@ -1,87 +1,103 @@
import os
import subprocess
import tempfile
from gzip import GzipFile
from io import BytesIO

import zlib
import gzip
from pathlib import Path
from asyncio import create_subprocess_exec
from asyncio.subprocess import PIPE
from typing import Tuple
from unittest.mock import patch

from ofrak.component.abstract import ComponentSubprocessError
import pytest

from ofrak import OFRAKContext
from ofrak.ofrak_context import OFRAKContext
from ofrak.resource import Resource
from ofrak.core.gzip import GzipData
from pytest_ofrak.patterns.compressed_filesystem_unpack_modify_pack import (
CompressedFileUnpackModifyPackPattern,
)
from pytest_ofrak.patterns.unpack_modify_pack import UnpackModifyPackPattern

ASSETS_DIR = Path(__file__).parent / "assets"


class TestGzipUnpackModifyPack(CompressedFileUnpackModifyPackPattern):
@pytest.fixture(
autouse=True,
scope="module",
params=[
(ASSETS_DIR / "hello_world", ASSETS_DIR / "hello_ofrak", False),
(ASSETS_DIR / "random8M", ASSETS_DIR / "random8M_modified", True),
],
ids=["hello world", "<random 8MB data>"],
)
def gzip_test_input(request):
initial_path, repacked_path, expect_pigz = request.param
with open(initial_path, "rb") as initial_file:
initial_data = initial_file.read()
with open(repacked_path, "rb") as repacked_file:
expected_repacked_data = repacked_file.read()
return (initial_data, expected_repacked_data, expect_pigz)


class GzipUnpackModifyPackPattern(CompressedFileUnpackModifyPackPattern):
"""
Template for tests that test different inputs the gzip component should support
unpacking.
"""

EXPECT_PIGZ: bool
expected_tag = GzipData

@pytest.fixture(autouse=True)
def create_test_file(self, tmpdir):
d = tmpdir.mkdir("gzip")
fh = d.join("hello.gz")
result = BytesIO()
with GzipFile(fileobj=result, mode="w") as gzip_file:
gzip_file.write(self.INITIAL_DATA)
fh.write_binary(result.getvalue())
def write_gzip(self, gzip_path: Path):
raise NotImplementedError

self._test_file = fh.realpath()
@pytest.fixture(autouse=True)
def create_test_file(self, gzip_test_input: Tuple[bytes, bytes, bool], tmp_path: Path):
self.INITIAL_DATA, self.EXPECTED_REPACKED_DATA, self.EXPECT_PIGZ = gzip_test_input
gzip_path = tmp_path / "test.gz"
self.write_gzip(gzip_path)
self._test_file = gzip_path.resolve()

async def test_unpack_modify_pack(self, ofrak_context: OFRAKContext):
if self.EXPECT_PIGZ:
with patch("asyncio.create_subprocess_exec", wraps=create_subprocess_exec) as mock:
await super().test_unpack_modify_pack(ofrak_context)
mock.assert_any_call("pigz", "-d", stdin=PIPE, stdout=PIPE, stderr=PIPE)
else:
await super().test_unpack_modify_pack(ofrak_context)

async def verify(self, repacked_root_resource: Resource):
patched_gzip_file = GzipFile(fileobj=BytesIO(await repacked_root_resource.get_data()))
patched_decompressed_data = patched_gzip_file.read()
patched_decompressed_data = gzip.decompress(await repacked_root_resource.get_data())
assert patched_decompressed_data == self.EXPECTED_REPACKED_DATA


class TestGzipUnpackWithTrailingBytes(UnpackModifyPackPattern):
EXPECTED_TAG = GzipData
INITIAL_DATA = b"Hello World"
EXPECTED_DATA = INITIAL_DATA # Change expected when modifier is created
INNER_FILENAME = "hello.bin"
GZIP_FILENAME = "hello.bin.gz"

async def create_root_resource(self, ofrak_context: OFRAKContext) -> Resource:
with tempfile.TemporaryDirectory() as d:
file_path = os.path.join(d, self.INNER_FILENAME)
with open(file_path, "wb") as f:
f.write(self.INITIAL_DATA)

gzip_path = os.path.join(d, self.GZIP_FILENAME)
gzip_command = ["pigz", file_path]
subprocess.run(gzip_command, check=True, capture_output=True)

# Add trailing bytes
with open(gzip_path, "ab") as a:
a.write(b"\xDE\xAD\xBE\xEF")
a.close()
return await ofrak_context.create_root_resource_from_file(gzip_path)

async def unpack(self, root_resource: Resource) -> None:
await root_resource.unpack_recursively()

async def modify(self, root_resource: Resource) -> None:
pass

async def repack(self, root_resource: Resource) -> None:
pass

async def verify(self, root_resource: Resource) -> None:
gzip_data = await root_resource.get_data()
with tempfile.TemporaryDirectory() as d:
gzip_path = os.path.join(d, self.GZIP_FILENAME)
with open(gzip_path, "wb") as f:
f.write(gzip_data)

gunzip_command = ["pigz", "-d", "-c", gzip_path]
try:
result = subprocess.run(gunzip_command, check=True, capture_output=True)
data = result.stdout
except subprocess.CalledProcessError as e:
if e.returncode == 2 or e.returncode == -2:
data = e.stdout
else:
raise

assert data == self.EXPECTED_DATA
class TestGzipUnpackModifyPack(GzipUnpackModifyPackPattern):
def write_gzip(self, gzip_path: Path):
with gzip.GzipFile(gzip_path, mode="w") as gzip_file:
gzip_file.write(self.INITIAL_DATA)


class TestGzipWithMultipleMembersUnpackModifyPack(GzipUnpackModifyPackPattern):
def write_gzip(self, gzip_path: Path):
with gzip.GzipFile(gzip_path, mode="w") as gzip_file:
middle = len(self.INITIAL_DATA) // 2
gzip_file.write(self.INITIAL_DATA[:middle])
gzip_file.write(self.INITIAL_DATA[middle:])


class TestGzipWithTrailingBytesUnpackModifyPack(GzipUnpackModifyPackPattern):
def write_gzip(self, gzip_path: Path):
with gzip.GzipFile(gzip_path, mode="w") as gzip_file:
gzip_file.write(self.INITIAL_DATA)

with open(gzip_path, "ab") as raw_file:
raw_file.write(b"\xDE\xAD\xBE\xEF")


async def test_corrupted_gzip_fail(
gzip_test_input: Tuple[bytes, bytes, bool], ofrak_context: OFRAKContext
):
initial_data = gzip_test_input[0]
corrupted_data = bytearray(gzip.compress(initial_data))
corrupted_data[10] = 255
resource = await ofrak_context.create_root_resource("corrupted.gz", data=bytes(corrupted_data))
with pytest.raises((zlib.error, ComponentSubprocessError)):
await resource.unpack()