Skip to content

Commit

Permalink
Implement utility to extract CARv1 from a CARv2
Browse files Browse the repository at this point in the history
Implement `ExtractV1File`, a function that takes the path to a CARv2 file
and efficiently extracts its inner CARv1 payload. Note that the
implementation only supports CARv2 as input and returns a dedicated
error if the supplied input is already in CARv1 format.

Implement benchmarks comparing extraction using `Reader` vs
`ExtractV1File`.

Implement tests that assert in-place extraction, as well as the handling of
invalid input and of both v1 and v2 input.

Fixes #207


This commit was moved from ipld/go-car@8113794
  • Loading branch information
masih committed Aug 11, 2021
1 parent facff84 commit 240282a
Show file tree
Hide file tree
Showing 3 changed files with 249 additions and 0 deletions.
98 changes: 98 additions & 0 deletions ipld/car/v2/bench_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,20 @@ package car_test

import (
"io"
"math/rand"
"os"
"path/filepath"
"testing"

"github.com/ipfs/go-cid"
"github.com/ipfs/go-merkledag"
"github.com/ipld/go-car/v2/blockstore"

carv2 "github.com/ipld/go-car/v2"
)

// rng is a pseudo-random generator created with a fixed seed. It is the source
// of block bytes in generateRandomCarV2File.
// NOTE(review): a *rand.Rand built from rand.NewSource is presumably not safe
// for concurrent use; keep calls on a single goroutine — confirm against the
// math/rand documentation.
var rng = rand.New(rand.NewSource(1413))

// BenchmarkReadBlocks instantiates a BlockReader, and iterates over all blocks.
// It essentially looks at the contents of any CARv1 or CARv2 file.
// Note that this also uses internal carv1.ReadHeader underneath.
Expand Down Expand Up @@ -47,3 +55,93 @@ func BenchmarkReadBlocks(b *testing.B) {
}
})
}

// BenchmarkExtractV1File extracts inner CARv1 payload from a sample CARv2 file using ExtractV1File.
//
// A 10 MiB CARv2 fixture is generated once, outside the timed region. Every
// parallel worker then repeatedly extracts it into its own temporary
// destination file, so workers never write to the same path.
func BenchmarkExtractV1File(b *testing.B) {
	path := filepath.Join(b.TempDir(), "bench-large-v2.car")
	generateRandomCarV2File(b, path, 10*1024*1024) // 10 MiB
	defer os.Remove(path)

	info, err := os.Stat(path)
	if err != nil {
		b.Fatal(err)
	}
	b.SetBytes(info.Size())
	b.ReportAllocs()
	b.ResetTimer()

	b.RunParallel(func(pb *testing.PB) {
		dstPath := filepath.Join(b.TempDir(), "destination.car")
		for pb.Next() {
			// Keep the error local to this worker: assigning to the outer err
			// from several goroutines at once is a data race.
			if err := carv2.ExtractV1File(path, dstPath); err != nil {
				// FailNow (and therefore Fatal) must only be called from the
				// goroutine running the benchmark, so record the failure and
				// stop this worker instead.
				b.Error(err)
				return
			}
			// Best-effort removal so each iteration starts without a
			// destination file; a failure here does not affect the measurement.
			_ = os.Remove(dstPath)
		}
	})
}

// BenchmarkExtractV1UsingReader extracts inner CARv1 payload from a sample CARv2 file using Reader
// API. This benchmark is implemented to be used as a comparison in conjunction with
// BenchmarkExtractV1File.
//
// A 10 MiB CARv2 fixture is generated once, outside the timed region. Every
// parallel worker then repeatedly copies the data payload into its own
// temporary destination file.
func BenchmarkExtractV1UsingReader(b *testing.B) {
	path := filepath.Join(b.TempDir(), "bench-large-v2.car")
	generateRandomCarV2File(b, path, 10*1024*1024) // 10 MiB
	defer os.Remove(path)

	info, err := os.Stat(path)
	if err != nil {
		b.Fatal(err)
	}
	b.SetBytes(info.Size())
	b.ReportAllocs()
	b.ResetTimer()

	// extract performs one full extraction. It lives in its own function so
	// that the deferred closes run at the end of every iteration rather than
	// piling up until the benchmark returns.
	extract := func(dstPath string) (err error) {
		dst, err := os.Create(dstPath)
		if err != nil {
			return err
		}
		defer func() {
			// Surface the close error of the written file unless an earlier
			// error is already being reported.
			if cerr := dst.Close(); err == nil {
				err = cerr
			}
		}()

		reader, err := carv2.OpenReader(path)
		if err != nil {
			return err
		}
		// Release the reader on every iteration; previously it was never
		// closed, leaking a file handle per loop. The close error is ignored
		// since the source is only read from.
		defer reader.Close()

		_, err = io.Copy(dst, reader.DataReader())
		return err
	}

	b.RunParallel(func(pb *testing.PB) {
		dstPath := filepath.Join(b.TempDir(), "destination.car")
		for pb.Next() {
			if err := extract(dstPath); err != nil {
				// FailNow (and therefore Fatal) must only be called from the
				// goroutine running the benchmark, so record the failure and
				// stop this worker instead.
				b.Error(err)
				return
			}
		}
	})
}

// generateRandomCarV2File writes a CARv2 file at path through a read-write
// blockstore, adding raw blocks of random bytes until at least
// minTotalBlockSize bytes of block data have been put. The blockstore is
// finalized before returning; any failure aborts the benchmark via b.Fatal.
func generateRandomCarV2File(b *testing.B, path string, minTotalBlockSize int) {
	b.Helper()

	bs, err := blockstore.OpenReadWrite(path, []cid.Cid{})
	// Check the error before registering the deferred Finalize: b.Fatal runs
	// deferred calls, and finalizing a blockstore that failed to open would
	// panic and hide the original error.
	if err != nil {
		b.Fatal(err)
	}
	defer func() {
		if err := bs.Finalize(); err != nil {
			b.Fatal(err)
		}
	}()

	buf := make([]byte, 1024)
	var totalBlockSize int
	for totalBlockSize < minTotalBlockSize {
		size, err := rng.Read(buf)
		if err != nil {
			b.Fatal(err)
		}

		blk := merkledag.NewRawNode(buf)
		if err := bs.Put(blk); err != nil {
			b.Fatal(err)
		}
		totalBlockSize += size
	}
}
108 changes: 108 additions & 0 deletions ipld/car/v2/writer.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package car

import (
"errors"
"fmt"
"io"
"os"

Expand All @@ -9,6 +11,9 @@ import (
"github.com/ipld/go-car/v2/index"
)

// ErrAlreadyV1 signals that the given payload is already in CARv1 format.
// ExtractV1File returns it when the detected version of the source is 1.
var ErrAlreadyV1 = errors.New("already a CARv1")

// WrapV1File is a wrapper around WrapV1 that takes filesystem paths.
// The source path is assumed to exist, and the destination path is overwritten.
// Note that the destination path might still be created even if an error
Expand Down Expand Up @@ -79,6 +84,109 @@ func WrapV1(src io.ReadSeeker, dst io.Writer) error {
return nil
}

// ExtractV1File reads the CARv2 file at srcPath and writes its inner CARv1 data payload,
// unmodified, to dstPath. Padding that may surround the data payload in the CARv2 file is
// not carried over.
//
// ErrAlreadyV1 is returned when srcPath is a CARv1. Any version other than 1 or 2 is
// reported as an invalid source version.
//
// srcPath is assumed to exist. dstPath is created when it does not exist, and it may be
// left behind even if an error occurs. When srcPath and dstPath are the same, the file is
// converted to CARv1 in place.
func ExtractV1File(srcPath, dstPath string) (err error) {
	in, err := os.Open(srcPath)
	if err != nil {
		return err
	}

	// The source is only ever read from, so its close error is not reported.
	defer in.Close()

	// Work out which CAR version the source is; only version 2 can be extracted.
	version, err := ReadVersion(in)
	if err != nil {
		return err
	}
	switch version {
	case 2:
		// Supported; carry on below.
	case 1:
		return ErrAlreadyV1
	default:
		return fmt.Errorf("invalid source version: %v", version)
	}

	// The CARv2 header tells us where the data payload lives.
	var header Header
	if _, err := header.ReadFrom(in); err != nil {
		return err
	}

	// TODO consider extracting this into Header.Validate since it is also implemented in BlockReader.
	// The payload cannot start before the end of pragma plus header, and must be non-empty.
	offset := int64(header.DataOffset)
	if offset < PragmaSize+HeaderSize {
		return fmt.Errorf("invalid data payload offset: %v", offset)
	}
	size := int64(header.DataSize)
	if size <= 0 {
		return fmt.Errorf("invalid data payload size: %v", size)
	}

	// Position the source at the first byte of the data payload.
	if _, err := in.Seek(offset, io.SeekStart); err != nil {
		return err
	}

	// The destination is opened only now so that earlier failures do not create a file.
	// os.O_TRUNC is deliberately absent: with identical source and destination paths,
	// truncating on open would wipe the bytes we are about to read. Trailing bytes are
	// trimmed after the copy instead.
	out, err := os.OpenFile(dstPath, os.O_CREATE|os.O_WRONLY, 0o666)
	if err != nil {
		return err
	}

	defer func() {
		// Report the destination's close error only when nothing else went wrong.
		if cerr := out.Close(); err == nil {
			err = cerr
		}
	}()

	// Copy exactly the payload bytes. io.CopyN is handed the files directly so that the
	// standard library can pick an efficient copy path.
	// BenchmarkExtractV1File and BenchmarkExtractV1UsingReader compare the two approaches.
	copied, err := io.CopyN(out, in, size)
	if err != nil {
		return err
	}
	if copied != size {
		return fmt.Errorf("expected to write exactly %v but wrote %v", size, copied)
	}

	// The destination may be longer than the payload:
	//  - for in-place extraction, padding or an index can follow the payload;
	//  - a pre-existing, unrelated destination file may simply have been larger.
	// Cut anything past the payload so that the result is a correctly sized CARv1.
	info, err := out.Stat()
	if err != nil {
		return err
	}
	if info.Size() <= size {
		return nil
	}
	return out.Truncate(copied)
}

// AttachIndex attaches a given index to an existing CARv2 file at given path and offset.
func AttachIndex(path string, idx index.Index, offset uint64) error {
// TODO: instead of offset, maybe take padding?
Expand Down
43 changes: 43 additions & 0 deletions ipld/car/v2/writer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,49 @@ func TestWrapV1(t *testing.T) {
require.Equal(t, wantIdx, gotIdx)
}

// TestExtractV1 checks that ExtractV1File recovers the exact CARv1 bytes that
// were wrapped into a CARv2, both when extracting to a separate destination
// and when extracting in place over the CARv2 file itself.
func TestExtractV1(t *testing.T) {
	// Write a CARv1 fixture to disk and capture its bytes as the expectation.
	svc := dstest.Mock()
	v1Path := filepath.Join(t.TempDir(), "original-test-v1.car")
	v1File, err := os.Create(v1Path)
	require.NoError(t, err)
	t.Cleanup(func() { require.NoError(t, v1File.Close()) })

	roots := generateRootCid(t, svc)
	require.NoError(t, carv1.WriteCar(context.Background(), svc, roots, v1File))

	// Rewind so the whole file is read back.
	_, err = v1File.Seek(0, io.SeekStart)
	require.NoError(t, err)
	want, err := ioutil.ReadAll(v1File)
	require.NoError(t, err)

	// Wrap the fixture into a CARv2, which is the input under test.
	wrappedPath := filepath.Join(t.TempDir(), "wrapped-for-extract-test-v2.car")
	require.NoError(t, WrapV1File(v1Path, wrappedPath))

	// Extraction into a separate file must reproduce the original CARv1.
	extractedPath := filepath.Join(t.TempDir(), "extract-file-test-v1.car")
	require.NoError(t, ExtractV1File(wrappedPath, extractedPath))
	gotSeparate, err := ioutil.ReadFile(extractedPath)
	require.NoError(t, err)
	require.Equal(t, want, gotSeparate)

	// Extraction with identical source and destination must do the same in place.
	require.NoError(t, ExtractV1File(wrappedPath, wrappedPath))
	gotInPlace, err := ioutil.ReadFile(wrappedPath)
	require.NoError(t, err)
	require.Equal(t, want, gotInPlace)
}

func TestExtractV1WithUnknownVersionIsError(t *testing.T) {
dstPath := filepath.Join(t.TempDir(), "extract-dst-file-test-v42.car")
err := ExtractV1File("testdata/sample-rootless-v42.car", dstPath)
require.EqualError(t, err, "invalid source version: 42")
}

// TestExtractV1FromACarV1IsError checks that handing a CARv1 file to
// ExtractV1File yields ErrAlreadyV1.
func TestExtractV1FromACarV1IsError(t *testing.T) {
	target := filepath.Join(t.TempDir(), "extract-dst-file-test-v1.car")
	got := ExtractV1File("testdata/sample-v1.car", target)
	require.Equal(t, ErrAlreadyV1, got)
}

func generateRootCid(t *testing.T, adder format.NodeAdder) []cid.Cid {
// TODO convert this into a utility testing lib that takes an rng and generates a random DAG with some threshold for depth/breadth.
this := merkledag.NewRawNode([]byte("fish"))
Expand Down

0 comments on commit 240282a

Please sign in to comment.