Implement utility to extract CARv1 from a CARv2

Implement `ExtractV1File`, a function that takes the path to a CARv2 file and efficiently extracts its inner CARv1 payload. Note that the implementation only supports CARv2 as input and returns a dedicated error if the supplied input is already in CARv1 format. Implement benchmarks comparing extraction using `Reader` vs. `ExtractV1File`. Implement tests that assert in-place extraction, as well as invalid input and both v1/v2 input. Fixes #207. This commit was moved from ipld/go-car@8113794.
ipfs · Aug 11, 2021 · 240282a · 240282a
1 parent facff84
commit 240282a
Show file tree

Hide file tree

Showing 3 changed files with 249 additions and 0 deletions.
diff --git a/ipld/car/v2/bench_test.go b/ipld/car/v2/bench_test.go
@@ -2,12 +2,20 @@ package car_test
 
 import (
 	"io"
+	"math/rand"
 	"os"
+	"path/filepath"
 	"testing"
 
+	"github.com/ipfs/go-cid"
+	"github.com/ipfs/go-merkledag"
+	"github.com/ipld/go-car/v2/blockstore"
+
 	carv2 "github.com/ipld/go-car/v2"
 )
 
+var rng = rand.New(rand.NewSource(1413))
+
 // BenchmarkReadBlocks instantiates a BlockReader, and iterates over all blocks.
 // It essentially looks at the contents of any CARv1 or CARv2 file.
 // Note that this also uses internal carv1.ReadHeader underneath.
@@ -47,3 +55,93 @@ func BenchmarkReadBlocks(b *testing.B) {
 		}
 	})
 }
+
+// BenchmarkExtractV1File extracts inner CARv1 payload from a sample CARv2 file using ExtractV1File.
+// It generates a CARv2 file holding at least 10 MiB of block data, reports throughput relative to
+// the size of that file, and runs the extraction from parallel worker goroutines.
+func BenchmarkExtractV1File(b *testing.B) {
+	path := filepath.Join(b.TempDir(), "bench-large-v2.car")
+	generateRandomCarV2File(b, path, 10*1024*1024) // 10 MiB
+	defer os.Remove(path)
+
+	info, err := os.Stat(path)
+	if err != nil {
+		b.Fatal(err)
+	}
+	b.SetBytes(info.Size())
+	b.ReportAllocs()
+	b.ResetTimer()
+
+	b.RunParallel(func(pb *testing.PB) {
+		// Every worker asks for its own temporary directory, so destinations never collide.
+		dstPath := filepath.Join(b.TempDir(), "destination.car")
+		for pb.Next() {
+			// Keep the error local to this goroutine: assigning to the enclosing err from
+			// several workers at once is a data race.
+			if err := carv2.ExtractV1File(path, dstPath); err != nil {
+				// Fatal must only be called from the goroutine running the benchmark
+				// function, so record the failure and stop this worker instead.
+				b.Error(err)
+				return
+			}
+			// Best-effort removal so that the next iteration creates the destination afresh.
+			_ = os.Remove(dstPath)
+		}
+	})
+}
+
+// BenchmarkExtractV1UsingReader extracts inner CARv1 payload from a sample CARv2 file using Reader
+// API. This benchmark is implemented to be used as a comparison in conjunction with
+// BenchmarkExtractV1File.
+// It generates a CARv2 file holding at least 10 MiB of block data, reports throughput relative to
+// the size of that file, and copies the data payload from parallel worker goroutines.
+func BenchmarkExtractV1UsingReader(b *testing.B) {
+	path := filepath.Join(b.TempDir(), "bench-large-v2.car")
+	generateRandomCarV2File(b, path, 10*1024*1024) // 10 MiB
+	defer os.Remove(path)
+
+	info, err := os.Stat(path)
+	if err != nil {
+		b.Fatal(err)
+	}
+	b.SetBytes(info.Size())
+	b.ReportAllocs()
+	b.ResetTimer()
+
+	b.RunParallel(func(pb *testing.PB) {
+		// Every worker asks for its own temporary directory, so destinations never collide.
+		dstPath := filepath.Join(b.TempDir(), "destination.car")
+		for pb.Next() {
+			// Fatal must only be called from the goroutine running the benchmark function,
+			// so failures below are recorded with Error and the worker stops.
+			dst, err := os.Create(dstPath)
+			if err != nil {
+				b.Error(err)
+				return
+			}
+			reader, err := carv2.OpenReader(path)
+			if err != nil {
+				_ = dst.Close()
+				b.Error(err)
+				return
+			}
+			_, err = io.Copy(dst, reader.DataReader())
+			// Close both ends on every iteration; previously the reader was never closed, which
+			// leaked one open source handle per iteration. The first error encountered wins.
+			if cerr := reader.Close(); err == nil {
+				err = cerr
+			}
+			if cerr := dst.Close(); err == nil {
+				err = cerr
+			}
+			if err != nil {
+				b.Error(err)
+				return
+			}
+		}
+	})
+}
+
+// generateRandomCarV2File opens a read-write blockstore at path with no roots and fills it with
+// raw blocks of pseudo-random bytes, read 1 KiB at a time from the package-level rng, until at
+// least minTotalBlockSize bytes of block data have been put. The blockstore is finalized before
+// returning. Any failure, including a failure to finalize, fails the benchmark b.
+func generateRandomCarV2File(b *testing.B, path string, minTotalBlockSize int) {
+	bs, err := blockstore.OpenReadWrite(path, []cid.Cid{})
+	// Check the error before deferring Finalize: when opening fails bs must not be used, and a
+	// deferred Finalize registered earlier would still run on it while b.Fatal unwinds.
+	if err != nil {
+		b.Fatal(err)
+	}
+	defer func() {
+		if err := bs.Finalize(); err != nil {
+			b.Fatal(err)
+		}
+	}()
+	buf := make([]byte, 1024)
+	var totalBlockSize int
+	for totalBlockSize < minTotalBlockSize {
+		size, err := rng.Read(buf)
+		if err != nil {
+			b.Fatal(err)
+		}
+
+		// Only the bytes actually read make up the block, matching what is counted below.
+		blk := merkledag.NewRawNode(buf[:size])
+		if err := bs.Put(blk); err != nil {
+			b.Fatal(err)
+		}
+		totalBlockSize += size
+	}
+}
diff --git a/ipld/car/v2/writer.go b/ipld/car/v2/writer.go
@@ -1,6 +1,8 @@
 package car
 
 import (
+	"errors"
+	"fmt"
 	"io"
 	"os"
 
@@ -9,6 +11,9 @@ import (
 	"github.com/ipld/go-car/v2/index"
 )
 
+// ErrAlreadyV1 is the error ExtractV1File returns when its source is already in CARv1 format.
+var ErrAlreadyV1 = errors.New("already a CARv1")
+
 // WrapV1File is a wrapper around WrapV1 that takes filesystem paths.
 // The source path is assumed to exist, and the destination path is overwritten.
 // Note that the destination path might still be created even if an error
@@ -79,6 +84,109 @@ func WrapV1(src io.ReadSeeker, dst io.Writer) error {
 	return nil
 }
 
+// ExtractV1File takes a CARv2 file and extracts its CARv1 data payload, unmodified.
+// The resulting CARv1 file will not include any data payload padding that may be present in the
+// CARv2 srcPath.
+// If srcPath represents a CARv1, ErrAlreadyV1 is returned; any other non-CARv2 version is an error.
+// The srcPath is assumed to exist, and the destination path is created if it does not exist.
+// Note that the destination path might still be created even if an error
+// occurred.
+// If srcPath and dstPath are the same, then the dstPath is converted, in-place, to CARv1.
+func ExtractV1File(srcPath, dstPath string) (err error) {
+	src, err := os.Open(srcPath)
+	if err != nil {
+		return err
+	}
+
+	// Ignore close error since only reading from src.
+	defer src.Close()
+
+	// Detect CAR version.
+	version, err := ReadVersion(src)
+	if err != nil {
+		return err
+	}
+	if version == 1 {
+		return ErrAlreadyV1
+	}
+	if version != 2 {
+		return fmt.Errorf("invalid source version: %v", version)
+	}
+
+	// Read CARv2 header to locate data payload.
+	var v2h Header
+	if _, err := v2h.ReadFrom(src); err != nil {
+		return err
+	}
+
+	// TODO consider extracting this into Header.Validate since it is also implemented in BlockReader.
+	// Validate header: the payload must start after the pragma and header, and must be non-empty.
+	dataOffset := int64(v2h.DataOffset)
+	if dataOffset < PragmaSize+HeaderSize {
+		return fmt.Errorf("invalid data payload offset: %v", dataOffset)
+	}
+	dataSize := int64(v2h.DataSize)
+	if dataSize <= 0 {
+		return fmt.Errorf("invalid data payload size: %v", dataSize)
+	}
+
+	// Seek to the point where the data payload starts.
+	if _, err := src.Seek(dataOffset, io.SeekStart); err != nil {
+		return err
+	}
+
+	// Open destination as late as possible to minimise unintended file creation in case an error
+	// occurs earlier.
+	// Note, we explicitly do not use os.O_TRUNC here so that we can support in-place extraction.
+	// Otherwise, truncation of an existing file will wipe the data we would be reading from if
+	// source and destination paths are the same.
+	// Later, we do truncate the file to the right size to ensure there are no trailing extra bytes.
+	dst, err := os.OpenFile(dstPath, os.O_CREATE|os.O_WRONLY, 0o666)
+	if err != nil {
+		return err
+	}
+
+	defer func() {
+		// Close destination and report its error through the named result if none is set yet.
+		cerr := dst.Close()
+		if err == nil {
+			err = cerr
+		}
+	}()
+
+	// Copy data payload over, expecting to write exactly the right number of bytes.
+	// Note that we explicitly use io.CopyN using file descriptors to leverage the SDK's efficient
+	// byte copy which should stay out of userland.
+	// There are two benchmarks to measure this: BenchmarkExtractV1File vs. BenchmarkExtractV1UsingReader
+	written, err := io.CopyN(dst, src, dataSize)
+	if err != nil {
+		return err
+	}
+	if written != dataSize {
+		return fmt.Errorf("expected to write exactly %v but wrote %v", dataSize, written)
+	}
+
+	// Check that the size of the destination file matches the expected size.
+	// If it is bigger, truncate.
+	// Note, we need to truncate:
+	// - if file is changed in-place, i.e. src and dst paths are the same then index or padding
+	//   could be present after the data payload.
+	// - if an existing file is passed as destination which is different from source and is larger
+	//   than the data payload size.
+	// In general, we want to guarantee that this function produces a correct CARv1 payload in the
+	// destination.
+	stat, err := dst.Stat()
+	if err != nil {
+		return err
+	}
+	if stat.Size() > dataSize {
+		// Truncate to the expected size to ensure the resulting file is a correctly sized CARv1.
+		err = dst.Truncate(written)
+	}
+
+	return err
+}
+
 // AttachIndex attaches a given index to an existing CARv2 file at given path and offset.
 func AttachIndex(path string, idx index.Index, offset uint64) error {
 	// TODO: instead of offset, maybe take padding?

diff --git a/ipld/car/v2/writer_test.go b/ipld/car/v2/writer_test.go
@@ -59,6 +59,49 @@ func TestWrapV1(t *testing.T) {
 	require.Equal(t, wantIdx, gotIdx)
 }
 
+// TestExtractV1 writes a CARv1 file, wraps it with WrapV1File, and checks that ExtractV1File
+// reproduces the original bytes both into a separate destination and in-place.
+func TestExtractV1(t *testing.T) {
+	// Write a CARv1 file whose bytes are the expected extraction result.
+	svc := dstest.Mock()
+	origPath := filepath.Join(t.TempDir(), "original-test-v1.car")
+	origFile, err := os.Create(origPath)
+	require.NoError(t, err)
+	t.Cleanup(func() { require.NoError(t, origFile.Close()) })
+	roots := generateRootCid(t, svc)
+	require.NoError(t, carv1.WriteCar(context.Background(), svc, roots, origFile))
+	_, err = origFile.Seek(0, io.SeekStart)
+	require.NoError(t, err)
+	want, err := ioutil.ReadAll(origFile)
+	require.NoError(t, err)
+
+	// Wrap the CARv1 so there is a CARv2 input to extract from.
+	wrappedPath := filepath.Join(t.TempDir(), "wrapped-for-extract-test-v2.car")
+	require.NoError(t, WrapV1File(origPath, wrappedPath))
+
+	// Extracting into a separate destination must yield the original CARv1 bytes.
+	extractedPath := filepath.Join(t.TempDir(), "extract-file-test-v1.car")
+	require.NoError(t, ExtractV1File(wrappedPath, extractedPath))
+	got, err := ioutil.ReadFile(extractedPath)
+	require.NoError(t, err)
+	require.Equal(t, want, got)
+
+	// Extracting in-place must leave the original CARv1 bytes at the wrapped path.
+	require.NoError(t, ExtractV1File(wrappedPath, wrappedPath))
+	gotInPlace, err := ioutil.ReadFile(wrappedPath)
+	require.NoError(t, err)
+	require.Equal(t, want, gotInPlace)
+}
+
+func TestExtractV1WithUnknownVersionIsError(t *testing.T) {
+	dstPath := filepath.Join(t.TempDir(), "extract-dst-file-test-v42.car")
+	err := ExtractV1File("testdata/sample-rootless-v42.car", dstPath)
+	require.EqualError(t, err, "invalid source version: 42")
+}
+
+func TestExtractV1FromACarV1IsError(t *testing.T) {
+	dstPath := filepath.Join(t.TempDir(), "extract-dst-file-test-v1.car")
+	err := ExtractV1File("testdata/sample-v1.car", dstPath)
+	require.Equal(t, ErrAlreadyV1, err)
+}
+
 func generateRootCid(t *testing.T, adder format.NodeAdder) []cid.Cid {
 	// TODO convert this into a utility testing lib that takes an rng and generates a random DAG with some threshold for depth/breadth.
 	this := merkledag.NewRawNode([]byte("fish"))