diff --git a/ofrak_core/CHANGELOG.md b/ofrak_core/CHANGELOG.md index b0fef6b15..1e173a5e5 100644 --- a/ofrak_core/CHANGELOG.md +++ b/ofrak_core/CHANGELOG.md @@ -29,6 +29,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) - View resource attribute string values containing only digits primarily as strings, alternatively as hex numbers. ([#423](https://github.com/redballoonsecurity/ofrak/pull/423)) ### Changed +- In `GzipUnpacker`, use `gzip.GzipFile` Python unpacker for speed, fall back on `pigz` if needed ([#472](https://github.com/redballoonsecurity/ofrak/pull/472)) - Change `FreeSpaceModifier` & `PartialFreeSpaceModifier` behavior: an optional stub that isn't free space can be provided and fill-bytes for free space can be specified. ([#409](https://github.com/redballoonsecurity/ofrak/pull/409)) - `Resource.flush_to_disk` method renamed to `Resource.flush_data_to_disk`. ([#373](https://github.com/redballoonsecurity/ofrak/pull/373)) - `build_image.py` supports building Docker images with OFRAK packages from any ancestor directory. 
([#425](https://github.com/redballoonsecurity/ofrak/pull/425)) diff --git a/ofrak_core/ofrak/core/gzip.py b/ofrak_core/ofrak/core/gzip.py index c5b7568b7..8ec53f6fc 100644 --- a/ofrak_core/ofrak/core/gzip.py +++ b/ofrak_core/ofrak/core/gzip.py @@ -1,7 +1,7 @@ import asyncio import logging import tempfile -from gzip import GzipFile +from gzip import BadGzipFile, GzipFile from io import BytesIO from subprocess import CalledProcessError @@ -40,35 +40,45 @@ class GzipUnpacker(Unpacker[None]): external_dependencies = (PIGZ,) async def unpack(self, resource: Resource, config=None): - # Create temporary file with .gz extension - with tempfile.NamedTemporaryFile(suffix=".gz") as temp_file: - temp_file.write(await resource.get_data()) - temp_file.flush() - cmd = [ - "pigz", - "-d", - "-c", - temp_file.name, - ] - proc = await asyncio.create_subprocess_exec( - *cmd, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - stdout, stderr = await proc.communicate() - data = stdout - if proc.returncode: - # Forward any gzip warning message and continue - if proc.returncode == -2 or proc.returncode == 2: - LOGGER.warning(stderr) - data = stdout - else: - raise CalledProcessError(returncode=proc.returncode, cmd=cmd) - - await resource.create_child( - tags=(GenericBinary,), - data=data, - ) + data = await resource.get_data() + # GzipFile is faster (spawning external processes has overhead), + # but pigz is more willing to tolerate things like extra junk at the end + try: + with GzipFile(fileobj=BytesIO(data), mode="r") as gzip_file: + return await resource.create_child( + tags=(GenericBinary,), + data=gzip_file.read(), + ) + except BadGzipFile: + # Create temporary file with .gz extension + with tempfile.NamedTemporaryFile(suffix=".gz") as temp_file: + temp_file.write(data) + temp_file.flush() + cmd = [ + "pigz", + "-d", + "-c", + temp_file.name, + ] + proc = await asyncio.create_subprocess_exec( + *cmd, + stdout=asyncio.subprocess.PIPE, + 
stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await proc.communicate() + data = stdout + if proc.returncode: + # Forward any gzip warning message and continue + if proc.returncode == -2 or proc.returncode == 2: + LOGGER.warning(stderr) + data = stdout + else: + raise CalledProcessError(returncode=proc.returncode, cmd=cmd, stderr=stderr) + + await resource.create_child( + tags=(GenericBinary,), + data=data, + ) class GzipPacker(Packer[None]):