Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rewrite gzip component to support trailing bytes without external tool and compress with PIGZ. Addresses #476 #485

Merged
merged 20 commits into from
Aug 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
0a8c069
Rewrite gzip component to support trailing bytes without external too…
alchzh Jul 17, 2024
3331f71
Switch to pigz for 4MiB or larger files and pipe directly to stdin in…
alchzh Jul 17, 2024
eecfcbe
Update docstring in test_gzip_component.py
alchzh Jul 18, 2024
37ba179
Refactor unpack logic to separate functions
alchzh Jul 18, 2024
ed9005f
cache result of is_tool_installed
alchzh Jul 18, 2024
e9fbdb3
Comprehensive gzip test cases
alchzh Jul 18, 2024
9b5315a
Make ComponentExternalTool hashable based on tool and install_check_arg
alchzh Jul 18, 2024
c0640ad
Merge branch 'redballoonsecurity:master' into python-gzip-trailing-bytes
alchzh Aug 7, 2024
7144874
Update previous gzip related changelog message
alchzh Aug 7, 2024
6305010
Actually use pigz as a fallback, clarify changelog message
alchzh Aug 12, 2024
5e2c620
Raise NotImplementedError instance in write_gzip() and make it abstra…
alchzh Aug 13, 2024
ad6469a
Revert caching of is_tool_installed in ComponentExternalTool
alchzh Aug 13, 2024
595e5e1
Cache PIGZ installed or not in gzip component module
alchzh Aug 13, 2024
595b9af
Test that PIGZ is used for packing large file and NOT used for small …
alchzh Aug 13, 2024
8609b8e
Only use PIGZ for compression, not decompression
alchzh Aug 14, 2024
f996194
Merge remote-tracking branch 'origin/master' into python-gzip-trailin…
alchzh Aug 15, 2024
9753ca5
Update ofrak_core/CHANGELOG.md
whyitfor Aug 15, 2024
d954073
Improve comments in `unpack_with_zlib_module`
alchzh Aug 15, 2024
bec8fa1
Correctly handle multiple member decompression
alchzh Aug 15, 2024
88f230b
Move wbits comment line
alchzh Aug 15, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ofrak_core/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)

### Changed
- By default, the ofrak log is now `ofrak-YYYYMMDDhhmmss.log` rather than just `ofrak.log` and the name can be specified on the command line ([#480](https://github.com/redballoonsecurity/ofrak/pull/480))
- In `GripUnpacker`, use `gzip.GzipFile` python unpacker for speed, fall back on `pigz` if needed ([#472](https://github.com/redballoonsecurity/ofrak/pull/472))
- In `GzipUnpacker`, use the standard python `zlib` library to compress small files and decompress all files. Use `pigz` if it is installed to compress files 1MB and larger. ([#472](https://github.com/redballoonsecurity/ofrak/pull/472) and [#485](https://github.com/redballoonsecurity/ofrak/pull/485))
- Change `FreeSpaceModifier` & `PartialFreeSpaceModifier` behavior: an optional stub that isn't free space can be provided and fill-bytes for free space can be specified. ([#409](https://github.com/redballoonsecurity/ofrak/pull/409))
- `Resource.flush_to_disk` method renamed to `Resource.flush_data_to_disk`. ([#373](https://github.com/redballoonsecurity/ofrak/pull/373))
- `build_image.py` supports building Docker images with OFRAK packages from any ancestor directory. ([#425](https://github.com/redballoonsecurity/ofrak/pull/425))
Expand Down
125 changes: 77 additions & 48 deletions ofrak_core/ofrak/core/gzip.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import asyncio
import logging
import tempfile
from gzip import BadGzipFile, GzipFile
from io import BytesIO
from typing import Optional
import zlib
from subprocess import CalledProcessError
import tempfile

from ofrak.component.packer import Packer
from ofrak.component.unpacker import Unpacker
Expand All @@ -15,11 +15,23 @@

LOGGER = logging.getLogger(__name__)

# PIGZ provides significantly faster compression on multi core systems.
# It does not parallelize decompression, so we don't use it in GzipUnpacker.
# "--help" is the argument used to probe whether the tool is installed.
PIGZ = ComponentExternalTool(
    "pigz", "https://zlib.net/pigz/", "--help", apt_package="pigz", brew_package="pigz"
)


class PIGZInstalled:
    """Module-level cache of whether the pigz binary is available.

    The external-tool probe spawns a subprocess, so the result is looked up
    once and memoized for the lifetime of the process.
    """

    # None means "not checked yet"; afterwards holds the cached bool.
    _pigz_installed: Optional[bool] = None

    @staticmethod
    async def is_pigz_installed() -> bool:
        """Return True if pigz is installed, probing the tool at most once."""
        cached = PIGZInstalled._pigz_installed
        if cached is None:
            cached = await PIGZ.is_tool_installed()
            PIGZInstalled._pigz_installed = cached
        return cached


class GzipData(GenericBinary):
"""
A gzip binary blob.
Expand All @@ -41,44 +53,32 @@ class GzipUnpacker(Unpacker[None]):

async def unpack(self, resource: Resource, config=None):
    """Inflate the gzip blob and attach the decompressed payload as a child."""
    compressed = await resource.get_data()
    raw = await self.unpack_with_zlib_module(compressed)
    return await resource.create_child(tags=(GenericBinary,), data=raw)

@staticmethod
async def unpack_with_zlib_module(data: bytes) -> bytes:
    """Inflate one or more concatenated gzip members, tolerating trailing garbage.

    zlib.decompressobj is used instead of the gzip module because gzip raises
    BadGzipFile when trailing garbage follows a compressed stream instead of
    correctly ignoring it: https://github.com/python/cpython/issues/68489

    :raises ValueError: if the data does not start with a gzip header, or a
        member's stream ends before its gzip footer.
    """
    members = []
    remaining = data
    # A gzip file may contain several members back to back; \037\213 are the
    # magic bytes that open each member's header.
    while remaining[:2] == b"\037\213":
        # wbits above 16 makes zlib consume the gzip header and footer.
        inflater = zlib.decompressobj(wbits=16 + zlib.MAX_WBITS)
        members.append(inflater.decompress(remaining))
        if not inflater.eof:
            raise ValueError("Incomplete gzip file")
        # Members may be separated by zero padding; strip it before probing
        # for the next header.
        remaining = inflater.unused_data.lstrip(b"\0")

    if not members:
        raise ValueError("Not a gzipped file")

    return b"".join(members)


class GzipPacker(Packer[None]):
Expand All @@ -87,19 +87,48 @@ class GzipPacker(Packer[None]):
"""

targets = (GzipData,)
external_dependencies = (PIGZ,)
whyitfor marked this conversation as resolved.
Show resolved Hide resolved

async def pack(self, resource: Resource, config=None):
    """Recompress the child file and patch it over the original gzip data.

    pigz parallelizes compression, so it is preferred for payloads of 1 MiB
    or more when installed; smaller payloads are compressed in-process.
    """
    gzip_view = await resource.view_as(GzipData)
    child = await gzip_view.get_file()
    uncompressed = await child.get_data()

    large_enough_for_pigz = len(uncompressed) >= 1024 * 1024
    if large_enough_for_pigz and await PIGZInstalled.is_pigz_installed():
        packed_data = await self.pack_with_pigz(uncompressed)
    else:
        packed_data = await self.pack_with_zlib_module(uncompressed)

    original_gzip_size = await gzip_view.resource.get_data_length()
    resource.queue_patch(Range(0, original_gzip_size), data=packed_data)

@staticmethod
async def pack_with_zlib_module(data: bytes) -> bytes:
    """Gzip-compress data in-process; wbits above 16 emits a gzip wrapper."""
    deflater = zlib.compressobj(wbits=16 + zlib.MAX_WBITS)
    return deflater.compress(data) + deflater.flush()

@staticmethod
async def pack_with_pigz(data: bytes) -> bytes:
    """Gzip-compress data with the external pigz tool.

    The payload is staged in a temporary file because pigz reads its input
    from a path; compressed output is collected from stdout.

    :raises CalledProcessError: if pigz exits with a nonzero status.
    """
    with tempfile.NamedTemporaryFile() as uncompressed_file:
        uncompressed_file.write(data)
        uncompressed_file.flush()

        cmd = ["pigz", "-c", uncompressed_file.name]
        proc = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        stdout, stderr = await proc.communicate()

    if proc.returncode:
        raise CalledProcessError(returncode=proc.returncode, cmd=cmd, stderr=stderr)
    return stdout


MagicMimeIdentifier.register(GzipData, "application/gzip")
Expand Down
3 changes: 3 additions & 0 deletions ofrak_core/test_ofrak/components/assets/hello_ofrak
Git LFS file not shown
3 changes: 3 additions & 0 deletions ofrak_core/test_ofrak/components/assets/hello_world
Git LFS file not shown
3 changes: 3 additions & 0 deletions ofrak_core/test_ofrak/components/assets/random8M
Git LFS file not shown
3 changes: 3 additions & 0 deletions ofrak_core/test_ofrak/components/assets/random8M_modified
Git LFS file not shown
162 changes: 92 additions & 70 deletions ofrak_core/test_ofrak/components/test_gzip_component.py
Original file line number Diff line number Diff line change
@@ -1,87 +1,109 @@
import os
import subprocess
import tempfile
from gzip import GzipFile
from io import BytesIO

import zlib
import gzip
from pathlib import Path
from asyncio import create_subprocess_exec
from typing import Tuple
from unittest.mock import patch
from abc import ABC, abstractmethod

from ofrak.component.abstract import ComponentSubprocessError
import pytest

from ofrak import OFRAKContext
from ofrak.ofrak_context import OFRAKContext
from ofrak.resource import Resource
from ofrak.core.gzip import GzipData
from pytest_ofrak.patterns.compressed_filesystem_unpack_modify_pack import (
CompressedFileUnpackModifyPackPattern,
)
from pytest_ofrak.patterns.unpack_modify_pack import UnpackModifyPackPattern

ASSETS_DIR = Path(__file__).parent / "assets"


class TestGzipUnpackModifyPack(CompressedFileUnpackModifyPackPattern):
@pytest.fixture(
    autouse=True,
    scope="module",
    params=[
        (ASSETS_DIR / "hello_world", ASSETS_DIR / "hello_ofrak", False),
        (ASSETS_DIR / "random8M", ASSETS_DIR / "random8M_modified", True),
    ],
    ids=["hello world", "<random 8MB data>"],
)
def gzip_test_input(request):
    """Yield (initial data, expected repacked data, expect-pigz flag) triples.

    The large 8 MB asset is expected to be packed with pigz; the small one
    must be packed in-process with zlib.
    """
    initial_path, repacked_path, expect_pigz = request.param
    initial_data = Path(initial_path).read_bytes()
    expected_repacked_data = Path(repacked_path).read_bytes()
    return (initial_data, expected_repacked_data, expect_pigz)


class GzipUnpackModifyPackPattern(CompressedFileUnpackModifyPackPattern, ABC):
"""
Template for tests that test different inputs the gzip component should support
unpacking.
"""

EXPECT_PIGZ: bool
expected_tag = GzipData

@pytest.fixture(autouse=True)
def create_test_file(self, tmpdir):
d = tmpdir.mkdir("gzip")
fh = d.join("hello.gz")
result = BytesIO()
with GzipFile(fileobj=result, mode="w") as gzip_file:
gzip_file.write(self.INITIAL_DATA)
fh.write_binary(result.getvalue())
@abstractmethod
def write_gzip(self, gzip_path: Path):
    """Write the gzip test-input file to gzip_path; implemented by subclasses."""
    raise NotImplementedError()

self._test_file = fh.realpath()
@pytest.fixture(autouse=True)
def create_test_file(self, gzip_test_input: Tuple[bytes, bytes, bool], tmp_path: Path):
    """Materialize the parametrized gzip input as a file for the pattern to unpack."""
    (
        self.INITIAL_DATA,
        self.EXPECTED_REPACKED_DATA,
        self.EXPECT_PIGZ,
    ) = gzip_test_input
    gzip_path = tmp_path / "test.gz"
    # Subclasses decide how the gzip is produced (single member, multiple
    # members, trailing bytes, ...).
    self.write_gzip(gzip_path)
    self._test_file = gzip_path.resolve()

async def test_unpack_modify_pack(self, ofrak_context: OFRAKContext):
    """Run the inherited unpack/modify/pack flow, checking whether pigz was used.

    The super() call was previously duplicated in both branches; it is hoisted
    out of the conditional since only the post-condition differs.
    """
    with patch("asyncio.create_subprocess_exec", wraps=create_subprocess_exec) as mock_exec:
        await super().test_unpack_modify_pack(ofrak_context)
        if self.EXPECT_PIGZ:
            # Large inputs must be compressed via the external pigz tool.
            assert any(
                args[0][0] == "pigz" and args[0][1] == "-c" for args in mock_exec.call_args_list
            )
        else:
            # Small inputs must be compressed in-process, with no subprocess.
            mock_exec.assert_not_called()

async def verify(self, repacked_root_resource: Resource):
    """Assert the repacked resource decompresses to the expected modified data."""
    patched_decompressed_data = gzip.decompress(await repacked_root_resource.get_data())
    assert patched_decompressed_data == self.EXPECTED_REPACKED_DATA


class TestGzipUnpackWithTrailingBytes(UnpackModifyPackPattern):
EXPECTED_TAG = GzipData
INITIAL_DATA = b"Hello World"
EXPECTED_DATA = INITIAL_DATA # Change expected when modifier is created
INNER_FILENAME = "hello.bin"
GZIP_FILENAME = "hello.bin.gz"

async def create_root_resource(self, ofrak_context: OFRAKContext) -> Resource:
with tempfile.TemporaryDirectory() as d:
file_path = os.path.join(d, self.INNER_FILENAME)
with open(file_path, "wb") as f:
f.write(self.INITIAL_DATA)

gzip_path = os.path.join(d, self.GZIP_FILENAME)
gzip_command = ["pigz", file_path]
subprocess.run(gzip_command, check=True, capture_output=True)

# Add trailing bytes
with open(gzip_path, "ab") as a:
a.write(b"\xDE\xAD\xBE\xEF")
a.close()
return await ofrak_context.create_root_resource_from_file(gzip_path)

async def unpack(self, root_resource: Resource) -> None:
await root_resource.unpack_recursively()

async def modify(self, root_resource: Resource) -> None:
pass

async def repack(self, root_resource: Resource) -> None:
pass

async def verify(self, root_resource: Resource) -> None:
gzip_data = await root_resource.get_data()
with tempfile.TemporaryDirectory() as d:
gzip_path = os.path.join(d, self.GZIP_FILENAME)
with open(gzip_path, "wb") as f:
f.write(gzip_data)

gunzip_command = ["pigz", "-d", "-c", gzip_path]
try:
result = subprocess.run(gunzip_command, check=True, capture_output=True)
data = result.stdout
except subprocess.CalledProcessError as e:
if e.returncode == 2 or e.returncode == -2:
data = e.stdout
else:
raise

assert data == self.EXPECTED_DATA
class TestGzipUnpackModifyPack(GzipUnpackModifyPackPattern):
    """Exercise the gzip components on a plain single-member gzip file."""

    def write_gzip(self, gzip_path: Path):
        """Write one gzip member containing the full initial data."""
        with gzip.GzipFile(gzip_path, mode="w") as gz:
            gz.write(self.INITIAL_DATA)


class TestGzipWithMultipleMembersUnpackModifyPack(GzipUnpackModifyPackPattern):
    """Exercise the gzip components on a file made of two gzip members."""

    def write_gzip(self, gzip_path: Path):
        """Split the data in half and write each half as its own gzip member."""
        split_at = len(self.INITIAL_DATA) // 2
        with gzip.GzipFile(gzip_path, mode="w") as gz:
            gz.write(self.INITIAL_DATA[:split_at])

        # Reopening in append mode starts a second gzip member after the first.
        with gzip.GzipFile(gzip_path, mode="a") as gz:
            gz.write(self.INITIAL_DATA[split_at:])


class TestGzipWithTrailingBytesUnpackModifyPack(GzipUnpackModifyPackPattern):
    """Exercise the gzip components on a gzip file followed by junk bytes."""

    def write_gzip(self, gzip_path: Path):
        """Write one gzip member, then append four garbage bytes the unpacker must ignore."""
        with gzip.GzipFile(gzip_path, mode="w") as gz:
            gz.write(self.INITIAL_DATA)

        with open(gzip_path, "ab") as raw_file:
            raw_file.write(b"\xDE\xAD\xBE\xEF")


async def test_corrupted_gzip_fail(
    gzip_test_input: Tuple[bytes, bytes, bool], ofrak_context: OFRAKContext
):
    """Unpacking a corrupted gzip stream must raise rather than silently succeed."""
    payload = gzip_test_input[0]
    broken = bytearray(gzip.compress(payload))
    # Clobber the byte right after the fixed 10-byte gzip header so the
    # deflate stream is invalid.
    broken[10] = 255
    resource = await ofrak_context.create_root_resource("corrupted.gz", data=bytes(broken))
    with pytest.raises((zlib.error, ComponentSubprocessError)):
        await resource.unpack()
Loading