From 07aa29517a1b40cf74a487ed4da26dfe959e7571 Mon Sep 17 00:00:00 2001 From: Faye Amacker <33205765+fxamacker@users.noreply.github.com> Date: Wed, 13 Jul 2022 09:56:14 -0500 Subject: [PATCH 01/32] Reuse ledger state in checkpoints (save 152GB RAM) Avoid creating separate ledger state during checkpointing. Based on EN2 logs (July 8, 2022), this will - reduce operational memory (peak RAM) by at least 152GB - reduce checkpoint duration by 24 mins (from 45 mins) - reduce memory allocations (TBD) Impact of this change will increase as ledger state grows larger, so memory savings will be substantially better than 152GB this year. --- cmd/execution_builder.go | 16 +- ledger/complete/compactor.go | 276 ++++++++++++++++ ledger/complete/compactor_test.go | 400 +++++++++++++++++++++++ ledger/complete/ledger.go | 58 +++- ledger/complete/ledger_test.go | 66 +--- ledger/complete/mtrie/forest.go | 36 +- ledger/complete/wal/checkpointer_test.go | 6 +- ledger/complete/wal/compactor.go | 157 --------- ledger/complete/wal/compactor_test.go | 330 ------------------- ledger/complete/wal/fixtures/noopwal.go | 2 +- ledger/complete/wal/wal.go | 15 +- 11 files changed, 771 insertions(+), 591 deletions(-) create mode 100644 ledger/complete/compactor.go create mode 100644 ledger/complete/compactor_test.go delete mode 100644 ledger/complete/wal/compactor.go delete mode 100644 ledger/complete/wal/compactor_test.go diff --git a/cmd/execution_builder.go b/cmd/execution_builder.go index f2b6e48fff2..0dfb24f76fb 100644 --- a/cmd/execution_builder.go +++ b/cmd/execution_builder.go @@ -382,23 +382,19 @@ func (e *ExecutionNodeBuilder) LoadComponentsAndModules() { } } - ledgerStorage, err = ledger.NewLedger(diskWAL, int(e.exeConf.mTrieCacheSize), collector, node.Logger.With().Str("subcomponent", + ledgerStorage, err = ledger.NewSyncLedger(diskWAL, int(e.exeConf.mTrieCacheSize), collector, node.Logger.With().Str("subcomponent", "ledger").Logger(), ledger.DefaultPathFinderVersion) return ledgerStorage, err }). Component("execution state ledger WAL compactor", func(node *NodeConfig) (module.ReadyDoneAware, error) { - checkpointer, err := ledgerStorage.Checkpointer() - if err != nil { - return nil, fmt.Errorf("cannot create checkpointer: %w", err) - } - compactor := wal.NewCompactor(checkpointer, - 10*time.Second, + return ledger.NewCompactor( + ledgerStorage, + diskWAL, e.exeConf.checkpointDistance, e.exeConf.checkpointsToKeep, - node.Logger.With().Str("subcomponent", "checkpointer").Logger()) - - return compactor, nil + node.Logger.With().Str("subcomponent", "checkpointer").Logger(), + ) }). 
Component("execution data service", func(node *NodeConfig) (module.ReadyDoneAware, error) { err := os.MkdirAll(e.exeConf.executionDataDir, 0700) diff --git a/ledger/complete/compactor.go b/ledger/complete/compactor.go new file mode 100644 index 00000000000..e2d2afae88f --- /dev/null +++ b/ledger/complete/compactor.go @@ -0,0 +1,276 @@ +package complete + +import ( + "errors" + "fmt" + "time" + + "github.com/rs/zerolog" + "golang.org/x/sync/semaphore" + + "github.com/onflow/flow-go/ledger" + "github.com/onflow/flow-go/ledger/complete/mtrie/trie" + realWAL "github.com/onflow/flow-go/ledger/complete/wal" + "github.com/onflow/flow-go/module/lifecycle" + "github.com/onflow/flow-go/module/observable" +) + +type WALTrieUpdate struct { + Update *ledger.TrieUpdate + Resultc chan<- error +} + +type Compactor struct { + checkpointer *realWAL.Checkpointer + wal *realWAL.DiskWAL + ledger *Ledger + logger zerolog.Logger + stopc chan struct{} + trieUpdatec <-chan *WALTrieUpdate + lm *lifecycle.LifecycleManager + observers map[observable.Observer]struct{} + checkpointDistance uint + checkpointsToKeep uint +} + +func NewCompactor(l *Ledger, w *realWAL.DiskWAL, checkpointDistance uint, checkpointsToKeep uint, logger zerolog.Logger) (*Compactor, error) { + if checkpointDistance < 1 { + checkpointDistance = 1 + } + + checkpointer, err := w.NewCheckpointer() + if err != nil { + return nil, err + } + + trieUpdatec := l.TrieUpdateChan() + if trieUpdatec == nil { + return nil, errors.New("failed to get valid trie update channel from ledger") + } + + return &Compactor{ + checkpointer: checkpointer, + wal: w, + ledger: l, + logger: logger, + stopc: make(chan struct{}), + trieUpdatec: trieUpdatec, + observers: make(map[observable.Observer]struct{}), + lm: lifecycle.NewLifecycleManager(), + checkpointDistance: checkpointDistance, + checkpointsToKeep: checkpointsToKeep, + }, nil +} + +func (c *Compactor) Subscribe(observer observable.Observer) { + var void struct{} + c.observers[observer] = void +} + +func (c *Compactor) Unsubscribe(observer observable.Observer) { + delete(c.observers, observer) +} + +// Ready periodically fires Run function, every `interval` +func (c *Compactor) Ready() <-chan struct{} { + c.lm.OnStart(func() { + go c.start() + }) + return c.lm.Started() +} + +func (c *Compactor) Done() <-chan struct{} { + c.lm.OnStop(func() { + for observer := range c.observers { + observer.OnComplete() + } + c.stopc <- struct{}{} + }) + return c.lm.Stopped() +} + +func (c *Compactor) start() { + + // Use checkpointSem to limit checkpointing goroutine to one. + // If checkpointing isn't finished, then wait before starting new checkpointing. + checkpointSem := semaphore.NewWeighted(1) + + // Get active segment number + // activeSegmentNum is updated when record is written to a new segment. + _, activeSegmentNum, err := c.wal.Segments() + if err != nil { + // TODO: handle error + c.logger.Error().Err(err).Msg("error getting active segment number") + } + + lastCheckpointNum, err := c.checkpointer.LatestCheckpoint() + if err != nil { + // TODO: handle error + c.logger.Error().Err(err).Msg("error getting last checkpoint number") + } + + // Compute next checkpoint number + // nextCheckpointNum is updated when checkpointing goroutine starts. 
+ // NOTE: next checkpoint number must be >= active segment num + nextCheckpointNum := lastCheckpointNum + int(c.checkpointDistance) + if activeSegmentNum > nextCheckpointNum { + nextCheckpointNum = activeSegmentNum + } + +Loop: + for { + select { + + case <-c.stopc: + break Loop + + case update := <-c.trieUpdatec: + // RecordUpdate returns the segment number the record was written to. + // Returned segment number can be + // - the same as previous segment number (same segment), or + // - incremented by 1 from previous segment number (new segment) + segmentNum, skipped, err := c.wal.RecordUpdate(update.Update) + + if err != nil || skipped || segmentNum == activeSegmentNum { + update.Resultc <- err + continue + } + + // Check new segment number is incremented by 1 + if segmentNum != activeSegmentNum+1 { + // TODO: must handle error without panic before merging this code + panic(fmt.Sprintf("expected new segment number %d, got %d", activeSegmentNum+1, segmentNum)) + } + + // Update activeSegmentNum + prevSegmentNum := activeSegmentNum + activeSegmentNum = segmentNum + + if nextCheckpointNum > prevSegmentNum { + // Not enough segments for checkpointing + update.Resultc <- nil + continue + } + + // Enough segments are created for checkpointing + + // Get forest from ledger before sending WAL update result + tries, err := c.ledger.Tries() + if err != nil { + // TODO: handle error + c.logger.Error().Err(err).Msg("error getting ledger tries") + // Try again after active segment is finalized. + nextCheckpointNum = activeSegmentNum + continue + } + + // Send WAL update result after ledger state snapshot is done. + update.Resultc <- nil + + // Try to checkpoint + if checkpointSem.TryAcquire(1) { + + checkpointNum := nextCheckpointNum + + // Compute next checkpoint number + nextCheckpointNum += int(c.checkpointDistance) + + go func() { + defer checkpointSem.Release(1) + + err = c.checkpoint(tries, checkpointNum) + if err != nil { + c.logger.Error().Err(err).Msg("error checkpointing") + } + // TODO: retry if checkpointing fails. + }() + } else { + // Failed to get semaphore because checkpointing is running. + // Try again when active segment is finalized. + c.logger.Info().Msgf("checkpoint %d is delayed because prior checkpointing is ongoing", nextCheckpointNum) + nextCheckpointNum = activeSegmentNum + } + } + } + + // Drain and record remaining trie update in the channel. + for update := range c.trieUpdatec { + _, _, err := c.wal.RecordUpdate(update.Update) + update.Resultc <- err + } + + // TODO: wait for checkpointing goroutine? 
+} + +func (c *Compactor) checkpoint(tries []*trie.MTrie, checkpointNum int) error { + + err := createCheckpoint(c.checkpointer, c.logger, tries, checkpointNum) + if err != nil { + return fmt.Errorf("cannot create checkpoints: %w", err) + } + + err = cleanupCheckpoints(c.checkpointer, int(c.checkpointsToKeep)) + if err != nil { + return fmt.Errorf("cannot cleanup checkpoints: %w", err) + } + + if checkpointNum > 0 { + for observer := range c.observers { + observer.OnNext(checkpointNum) + } + } + + return nil +} + +func createCheckpoint(checkpointer *realWAL.Checkpointer, logger zerolog.Logger, tries []*trie.MTrie, checkpointNum int) error { + + logger.Info().Msgf("serializing checkpoint %d with %d tries", checkpointNum, len(tries)) + + startTime := time.Now() + + writer, err := checkpointer.CheckpointWriter(checkpointNum) + if err != nil { + return fmt.Errorf("cannot generate checkpoint writer: %w", err) + } + defer func() { + closeErr := writer.Close() + // Return close error if there isn't any prior error to return. + if err == nil { + err = closeErr + } + }() + + err = realWAL.StoreCheckpoint(writer, tries...) + if err != nil { + return fmt.Errorf("error serializing checkpoint (%d): %w", checkpointNum, err) + } + + duration := time.Since(startTime) + logger.Info().Float64("total_time_s", duration.Seconds()).Msgf("created checkpoint %d with %d tries", checkpointNum, len(tries)) + + return nil +} + +func cleanupCheckpoints(checkpointer *realWAL.Checkpointer, checkpointsToKeep int) error { + // Don't list checkpoints if we keep them all + if checkpointsToKeep == 0 { + return nil + } + checkpoints, err := checkpointer.Checkpoints() + if err != nil { + return fmt.Errorf("cannot list checkpoints: %w", err) + } + if len(checkpoints) > int(checkpointsToKeep) { + // if condition guarantees this never fails + checkpointsToRemove := checkpoints[:len(checkpoints)-int(checkpointsToKeep)] + + for _, checkpoint := range checkpointsToRemove { + err := checkpointer.RemoveCheckpoint(checkpoint) + if err != nil { + return fmt.Errorf("cannot remove checkpoint %d: %w", checkpoint, err) + } + } + } + return nil +} diff --git a/ledger/complete/compactor_test.go b/ledger/complete/compactor_test.go new file mode 100644 index 00000000000..49a29d99dc7 --- /dev/null +++ b/ledger/complete/compactor_test.go @@ -0,0 +1,400 @@ +package complete + +import ( + "fmt" + "os" + "path" + "reflect" + "testing" + "time" + + prometheusWAL "github.com/m4ksio/wal/wal" + "github.com/rs/zerolog" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/onflow/flow-go/ledger" + "github.com/onflow/flow-go/ledger/common/utils" + "github.com/onflow/flow-go/ledger/complete/mtrie" + "github.com/onflow/flow-go/ledger/complete/mtrie/trie" + realWAL "github.com/onflow/flow-go/ledger/complete/wal" + "github.com/onflow/flow-go/module/metrics" + "github.com/onflow/flow-go/utils/unittest" +) + +// Compactor observer that waits until it gets notified of a +// latest checkpoint larger than fromBound +type CompactorObserver struct { + fromBound int + done chan struct{} +} + +func (co *CompactorObserver) OnNext(val interface{}) { + res, ok := val.(int) + if ok { + new := res + if new >= co.fromBound { + co.done <- struct{}{} + } + } +} +func (co *CompactorObserver) OnError(err error) {} +func (co *CompactorObserver) OnComplete() { + close(co.done) +} + +func TestCompactor(t *testing.T) { + numInsPerStep := 2 + pathByteSize := 32 + minPayloadByteSize := 2 << 15 + maxPayloadByteSize := 2 << 16 + size := 10 + 
metricsCollector := &metrics.NoopCollector{} + checkpointDistance := uint(2) + checkpointsToKeep := uint(1) + + unittest.RunWithTempDir(t, func(dir string) { + + var l *Ledger + + // saved data after updates + savedData := make(map[ledger.RootHash]map[string]*ledger.Payload) + + t.Run("creates checkpoints", func(t *testing.T) { + + wal, err := realWAL.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir, size*10, pathByteSize, 32*1024) + require.NoError(t, err) + + l, err = NewSyncLedger(wal, size*10, metricsCollector, zerolog.Logger{}, DefaultPathFinderVersion) + require.NoError(t, err) + + // WAL segments are 32kB, so here we generate 2 keys 64kB each, times `size` + // so we should get at least `size` segments + + compactor, err := NewCompactor(l, wal, checkpointDistance, checkpointsToKeep, zerolog.Nop()) + require.NoError(t, err) + + co := CompactorObserver{fromBound: 9, done: make(chan struct{})} + compactor.Subscribe(&co) + + // Run Compactor in background. + <-compactor.Ready() + + rootHash := trie.EmptyTrieRootHash() + + // Generate the tree and create WAL + for i := 0; i < size; i++ { + + payloads := utils.RandomPayloads(numInsPerStep, minPayloadByteSize, maxPayloadByteSize) + + keys := make([]ledger.Key, len(payloads)) + values := make([]ledger.Value, len(payloads)) + for i, p := range payloads { + keys[i] = p.Key + values[i] = p.Value + } + + update, err := ledger.NewUpdate(ledger.State(rootHash), keys, values) + require.NoError(t, err) + + rootHash, _, err := l.Set(update) + require.NoError(t, err) + + require.FileExists(t, path.Join(dir, realWAL.NumberToFilenamePart(i))) + + data := make(map[string]*ledger.Payload, len(keys)) + for j, k := range keys { + ks := string(k.CanonicalForm()) + data[ks] = payloads[j] + } + + savedData[ledger.RootHash(rootHash)] = data + } + + // wait for the bound-checking observer to confirm checkpoints have been made + select { + case <-co.done: + // continue + case <-time.After(60 * time.Second): + assert.FailNow(t, "timed out") + } + + checkpointer, err := wal.NewCheckpointer() + require.NoError(t, err) + + from, to, err := checkpointer.NotCheckpointedSegments() + require.NoError(t, err) + + assert.True(t, from == 10 && to == 10, "from: %v, to: %v", from, to) // Make sure there is no leftover + + require.NoFileExists(t, path.Join(dir, "checkpoint.00000000")) + require.NoFileExists(t, path.Join(dir, "checkpoint.00000001")) + require.NoFileExists(t, path.Join(dir, "checkpoint.00000002")) + require.NoFileExists(t, path.Join(dir, "checkpoint.00000003")) + require.NoFileExists(t, path.Join(dir, "checkpoint.00000004")) + require.NoFileExists(t, path.Join(dir, "checkpoint.00000005")) + require.NoFileExists(t, path.Join(dir, "checkpoint.00000006")) + require.NoFileExists(t, path.Join(dir, "checkpoint.00000007")) + require.NoFileExists(t, path.Join(dir, "checkpoint.00000008")) + require.FileExists(t, path.Join(dir, "checkpoint.00000009")) + + <-compactor.Done() + <-wal.Done() + }) + + time.Sleep(2 * time.Second) + + t.Run("remove unnecessary files", func(t *testing.T) { + // Remove all files apart from target checkpoint and WAL segments ahead of it + // We know their names, so just hardcode them + dirF, _ := os.Open(dir) + files, _ := dirF.Readdir(0) + + for _, fileInfo := range files { + + name := fileInfo.Name() + + if name != "checkpoint.00000009" && + name != "00000010" { + err := os.Remove(path.Join(dir, name)) + require.NoError(t, err) + } + } + }) + + var l2 *Ledger + + time.Sleep(2 * time.Second) + + t.Run("load data from checkpoint and 
WAL", func(t *testing.T) { + + wal2, err := realWAL.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir, size*10, pathByteSize, 32*1024) + require.NoError(t, err) + + l2, err = NewSyncLedger(wal2, size*10, metricsCollector, zerolog.Logger{}, DefaultPathFinderVersion) + require.NoError(t, err) + + <-wal2.Done() + }) + + t.Run("make sure forests are equal", func(t *testing.T) { + + // Check for same data + for rootHash, data := range savedData { + + keys := make([]ledger.Key, 0, len(data)) + for _, p := range data { + keys = append(keys, p.Key) + } + + q, err := ledger.NewQuery(ledger.State(rootHash), keys) + require.NoError(t, err) + + values, err := l.Get(q) + require.NoError(t, err) + + values2, err := l2.Get(q) + require.NoError(t, err) + + for i, k := range keys { + ks := k.CanonicalForm() + require.Equal(t, data[string(ks)].Value, values[i]) + require.Equal(t, data[string(ks)].Value, values2[i]) + } + } + + // check for + forestTries, err := l.Tries() + require.NoError(t, err) + + forestTries2, err := l.Tries() + require.NoError(t, err) + + // order might be different + require.Equal(t, len(forestTries), len(forestTries2)) + }) + + }) +} + +// TestCompactorAccuracy expects checkpointed tries to match replayed tries. +// Replayed tries are tries updated by replaying all WAL segments +// (from segment 0, ignoring prior checkpoints) to the checkpoint number. +// This verifies that checkpointed tries are snapshopt of segments and at segment boundary. +func TestCompactorAccuracy(t *testing.T) { + numInsPerStep := 2 + pathByteSize := 32 + minPayloadByteSize := 2 << 15 + maxPayloadByteSize := 2 << 16 + size := 10 + metricsCollector := &metrics.NoopCollector{} + checkpointDistance := uint(5) + checkpointsToKeep := uint(0) + + unittest.RunWithTempDir(t, func(dir string) { + + lastCheckpointNum := -1 + + rootHash := trie.EmptyTrieRootHash() + + // Create DiskWAL and Ledger repeatedly to test rebuilding ledger state at restart. + for i := 0; i < 3; i++ { + + wal, err := realWAL.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir, size*10, pathByteSize, 32*1024) + require.NoError(t, err) + + l, err := NewSyncLedger(wal, size*10, metricsCollector, zerolog.Logger{}, DefaultPathFinderVersion) + require.NoError(t, err) + + checkpointer, err := wal.NewCheckpointer() + require.NoError(t, err) + + // WAL segments are 32kB, so here we generate 2 keys 64kB each, times `size` + // so we should get at least `size` segments + + compactor, err := NewCompactor(l, wal, checkpointDistance, checkpointsToKeep, zerolog.Nop()) + require.NoError(t, err) + + fromBound := lastCheckpointNum + int(checkpointDistance)*2 + lastCheckpointNum = fromBound + + co := CompactorObserver{fromBound: fromBound, done: make(chan struct{})} + compactor.Subscribe(&co) + + // Run Compactor in background. 
+ <-compactor.Ready() + + // Generate the tree and create WAL + for i := 0; i < size; i++ { + + payloads := utils.RandomPayloads(numInsPerStep, minPayloadByteSize, maxPayloadByteSize) + + keys := make([]ledger.Key, len(payloads)) + values := make([]ledger.Value, len(payloads)) + for i, p := range payloads { + keys[i] = p.Key + values[i] = p.Value + } + + update, err := ledger.NewUpdate(ledger.State(rootHash), keys, values) + require.NoError(t, err) + + _, _, err = l.Set(update) + require.NoError(t, err) + } + + // wait for the bound-checking observer to confirm checkpoints have been made + select { + case <-co.done: + // continue + case <-time.After(120 * time.Second): + assert.FailNow(t, "timed out") + } + + <-compactor.Done() + <-wal.Done() + + nums, err := checkpointer.Checkpoints() + require.NoError(t, err) + + for _, n := range nums { + testCheckpointedTriesMatchReplayedTriesFromSegments(t, checkpointer, n, dir) + } + } + }) +} + +func testCheckpointedTriesMatchReplayedTriesFromSegments(t *testing.T, checkpointer *realWAL.Checkpointer, checkpointNum int, dir string) { + + // Get tries by replaying segments up to checkpoint number + triesFromReplayingSegments, err := triesUpToSegment(dir, checkpointNum) + require.NoError(t, err) + + // Get tries by loading checkpoint + triesFromLoadingCheckpoint, err := checkpointer.LoadCheckpoint(checkpointNum) + require.NoError(t, err) + + // Test that checkpointed tries match replayed tries. + triesSetFromReplayingSegments := make(map[ledger.RootHash]struct{}) + for _, t := range triesFromReplayingSegments { + triesSetFromReplayingSegments[t.RootHash()] = struct{}{} + } + + triesSetFromLoadingCheckpoint := make(map[ledger.RootHash]struct{}) + for _, t := range triesFromLoadingCheckpoint { + triesSetFromLoadingCheckpoint[t.RootHash()] = struct{}{} + } + + require.True(t, reflect.DeepEqual(triesSetFromReplayingSegments, triesSetFromLoadingCheckpoint)) +} + +func triesUpToSegment(dir string, to int) ([]*trie.MTrie, error) { + + forest, err := mtrie.NewForest(500, &metrics.NoopCollector{}, nil) + if err != nil { + return nil, fmt.Errorf("cannot create Forest: %w", err) + } + + err = replaySegments( + dir, + to, + func(update *ledger.TrieUpdate) error { + _, err := forest.Update(update) + return err + }, func(rootHash ledger.RootHash) error { + return nil + }) + if err != nil { + return nil, err + } + + return forest.GetTries() +} + +func replaySegments( + dir string, + to int, + updateFn func(update *ledger.TrieUpdate) error, + deleteFn func(rootHash ledger.RootHash) error, +) error { + sr, err := prometheusWAL.NewSegmentsRangeReader(prometheusWAL.SegmentRange{ + Dir: dir, + First: 0, + Last: to, + }) + if err != nil { + return fmt.Errorf("cannot create segment reader: %w", err) + } + + reader := prometheusWAL.NewReader(sr) + + defer sr.Close() + + for reader.Next() { + record := reader.Record() + operation, rootHash, update, err := realWAL.Decode(record) + if err != nil { + return fmt.Errorf("cannot decode LedgerWAL record: %w", err) + } + + switch operation { + case realWAL.WALUpdate: + err = updateFn(update) + if err != nil { + return fmt.Errorf("error while processing LedgerWAL update: %w", err) + } + case realWAL.WALDelete: + err = deleteFn(rootHash) + if err != nil { + return fmt.Errorf("error while processing LedgerWAL deletion: %w", err) + } + } + + err = reader.Err() + if err != nil { + return fmt.Errorf("cannot read LedgerWAL: %w", err) + } + } + + return nil +} diff --git a/ledger/complete/ledger.go b/ledger/complete/ledger.go index 
245d84387bb..6ad846f8034 100644 --- a/ledger/complete/ledger.go +++ b/ledger/complete/ledger.go @@ -15,12 +15,13 @@ import ( "github.com/onflow/flow-go/ledger/common/pathfinder" "github.com/onflow/flow-go/ledger/complete/mtrie" "github.com/onflow/flow-go/ledger/complete/mtrie/trie" - "github.com/onflow/flow-go/ledger/complete/wal" + realWAL "github.com/onflow/flow-go/ledger/complete/wal" "github.com/onflow/flow-go/module" ) const DefaultCacheSize = 1000 const DefaultPathFinderVersion = 1 +const defaultTrieUpdateChanSize = 500 // Ledger (complete) is a fast memory-efficient fork-aware thread-safe trie-based key/value storage. // Ledger holds an array of registers (key-value pairs) and keeps tracks of changes over a limited time. @@ -35,15 +36,16 @@ const DefaultPathFinderVersion = 1 // for archival usage but make it possible for other software components to reconstruct very old tries using write-ahead logs. type Ledger struct { forest *mtrie.Forest - wal wal.LedgerWAL + wal realWAL.LedgerWAL metrics module.LedgerMetrics logger zerolog.Logger + trieUpdatec chan *WALTrieUpdate pathFinderVersion uint8 } // NewLedger creates a new in-memory trie-backed ledger storage with persistence. func NewLedger( - wal wal.LedgerWAL, + wal realWAL.LedgerWAL, capacity int, metrics module.LedgerMetrics, log zerolog.Logger, @@ -86,6 +88,26 @@ func NewLedger( return storage, nil } +func NewSyncLedger( + wal realWAL.LedgerWAL, + capacity int, + metrics module.LedgerMetrics, + log zerolog.Logger, + pathFinderVer uint8) (*Ledger, error) { + + l, err := NewLedger(wal, capacity, metrics, log, pathFinderVer) + if err != nil { + return nil, err + } + + l.trieUpdatec = make(chan *WALTrieUpdate, defaultTrieUpdateChanSize) + return l, nil +} + +func (l *Ledger) TrieUpdateChan() <-chan *WALTrieUpdate { + return l.trieUpdatec +} + // Ready implements interface module.ReadyDoneAware // it starts the EventLoop's internal processing loop. func (l *Ledger) Ready() <-chan struct{} { @@ -202,11 +224,16 @@ func (l *Ledger) Set(update *ledger.Update) (newState ledger.State, trieUpdate * walChan := make(chan error) - go func() { - walChan <- l.wal.RecordUpdate(trieUpdate) - }() + if l.trieUpdatec == nil { + go func() { + _, _, err := l.wal.RecordUpdate(trieUpdate) + walChan <- err + }() + } else { + l.trieUpdatec <- &WALTrieUpdate{Update: trieUpdate, Resultc: walChan} + } - newRootHash, err := l.forest.Update(trieUpdate) + newTrie, err := l.forest.NewTrie(trieUpdate) walError := <-walChan if err != nil { @@ -216,6 +243,11 @@ func (l *Ledger) Set(update *ledger.Update) (newState ledger.State, trieUpdate * return ledger.State(hash.DummyHash), nil, fmt.Errorf("error while writing LedgerWAL: %w", walError) } + err = l.forest.AddTrie(newTrie) + if err != nil { + return ledger.State(hash.DummyHash), nil, fmt.Errorf("adding updated trie to forest failed: %w", err) + } + // TODO update to proper value once https://github.com/onflow/flow-go/pull/3720 is merged l.metrics.ForestApproxMemorySize(0) @@ -228,6 +260,7 @@ func (l *Ledger) Set(update *ledger.Update) (newState ledger.State, trieUpdate * } state := update.State() + newRootHash := newTrie.RootHash() l.logger.Info().Hex("from", state[:]). Hex("to", newRootHash[:]). Int("update_size", update.Size()). 
@@ -273,8 +306,13 @@ func (l *Ledger) ForestSize() int { return l.forest.Size() } +// Tries returns the tries stored in the forest +func (l *Ledger) Tries() ([]*trie.MTrie, error) { + return l.forest.GetTries() +} + // Checkpointer returns a checkpointer instance -func (l *Ledger) Checkpointer() (*wal.Checkpointer, error) { +func (l *Ledger) Checkpointer() (*realWAL.Checkpointer, error) { checkpointer, err := l.wal.NewCheckpointer() if err != nil { return nil, fmt.Errorf("cannot create checkpointer for compactor: %w", err) @@ -388,14 +426,14 @@ func (l *Ledger) ExportCheckpointAt( l.logger.Info().Msgf("finished running pre-checkpoint reporters") l.logger.Info().Msg("creating a checkpoint for the new trie") - writer, err := wal.CreateCheckpointWriterForFile(outputDir, outputFile, &l.logger) + writer, err := realWAL.CreateCheckpointWriterForFile(outputDir, outputFile, &l.logger) if err != nil { return ledger.State(hash.DummyHash), fmt.Errorf("failed to create a checkpoint writer: %w", err) } l.logger.Info().Msg("storing the checkpoint to the file") - err = wal.StoreCheckpoint(writer, newTrie) + err = realWAL.StoreCheckpoint(writer, newTrie) // Writing the checkpoint takes time to write and copy. // Without relying on an exit code or stdout, we need to know when the copy is complete. diff --git a/ledger/complete/ledger_test.go b/ledger/complete/ledger_test.go index 841a57b2991..aa0d0243bcf 100644 --- a/ledger/complete/ledger_test.go +++ b/ledger/complete/ledger_test.go @@ -2,11 +2,9 @@ package complete_test import ( "bytes" - "encoding/hex" "errors" "fmt" "math/rand" - "sync" "testing" "time" @@ -734,69 +732,13 @@ func Test_ExportCheckpointAt(t *testing.T) { }) } -func TestWALUpdateIsRunInParallel(t *testing.T) { - - // The idea of this test is - WAL update should be run in parallel - // so we block it until we can find a new trie with expected state - // this doesn't really proves WAL update is run in parallel, but at least - // checks if its run after the trie update - - wg := sync.WaitGroup{} - wg.Add(1) - - w := &LongRunningDummyWAL{ - updateFn: func(update *ledger.TrieUpdate) error { - wg.Wait() //wg will let work after the trie has been updated - return nil - }, - } - - led, err := complete.NewLedger(w, 100, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) - require.NoError(t, err) - - key := ledger.NewKey([]ledger.KeyPart{ledger.NewKeyPart(0, []byte{1, 2, 3})}) - - values := []ledger.Value{[]byte{1, 2, 3}} - update, err := ledger.NewUpdate(led.InitialState(), []ledger.Key{key}, values) - require.NoError(t, err) - - // this state should correspond to fresh state with given update - decoded, err := hex.DecodeString("097b7f74413bc03200889c34c6979eacbad58345ef7c0c65e8057a071440df75") - var expectedState ledger.State - copy(expectedState[:], decoded) - require.NoError(t, err) - - query, err := ledger.NewQuery(expectedState, []ledger.Key{key}) - require.NoError(t, err) - - go func() { - newState, _, err := led.Set(update) - require.NoError(t, err) - require.Equal(t, newState, expectedState) - }() - - require.Eventually(t, func() bool { - retrievedValues, err := led.Get(query) - if err != nil { - return false - } - - require.NoError(t, err) - require.Equal(t, values, retrievedValues) - - wg.Done() - - return true - }, 500*time.Millisecond, 5*time.Millisecond) -} - func TestWALUpdateFailuresBubbleUp(t *testing.T) { theError := fmt.Errorf("error error") w := &LongRunningDummyWAL{ - updateFn: func(update *ledger.TrieUpdate) error { - return theError + updateFn: 
func(update *ledger.TrieUpdate) (int, bool, error) { + return 0, false, theError }, } @@ -849,10 +791,10 @@ func migrationByValue(p []ledger.Payload) ([]ledger.Payload, error) { type LongRunningDummyWAL struct { fixtures.NoopWAL - updateFn func(update *ledger.TrieUpdate) error + updateFn func(update *ledger.TrieUpdate) (int, bool, error) } -func (w *LongRunningDummyWAL) RecordUpdate(update *ledger.TrieUpdate) error { +func (w *LongRunningDummyWAL) RecordUpdate(update *ledger.TrieUpdate) (int, bool, error) { return w.updateFn(update) } diff --git a/ledger/complete/mtrie/forest.go b/ledger/complete/mtrie/forest.go index 5833f2fc678..c8d4e52ef5b 100644 --- a/ledger/complete/mtrie/forest.go +++ b/ledger/complete/mtrie/forest.go @@ -190,19 +190,36 @@ func (f *Forest) Read(r *ledger.TrieRead) ([]ledger.Value, error) { return orderedValues, nil } -// Update updates the Values for the registers and returns rootHash and error (if any). +// Update updates the Values for the registers, adds updated tries to forest, +// and returns rootHash and error (if any). +// In case there are multiple updates to the same register, Update will persist +// the latest written value. +func (f *Forest) Update(u *ledger.TrieUpdate) (ledger.RootHash, error) { + t, err := f.NewTrie(u) + if err != nil { + return ledger.RootHash(hash.DummyHash), err + } + + err = f.AddTrie(t) + if err != nil { + return ledger.RootHash(hash.DummyHash), fmt.Errorf("adding updated trie to forest failed: %w", err) + } + + return t.RootHash(), nil +} + +// NewTrie updates the Values for the registers and returns updated trie and error (if any). // In case there are multiple updates to the same register, Update will persist the latest // written value. -func (f *Forest) Update(u *ledger.TrieUpdate) (ledger.RootHash, error) { - emptyHash := ledger.RootHash(hash.DummyHash) +func (f *Forest) NewTrie(u *ledger.TrieUpdate) (*trie.MTrie, error) { parentTrie, err := f.GetTrie(u.RootHash) if err != nil { - return emptyHash, err + return nil, err } if len(u.Paths) == 0 { // no key no change - return u.RootHash, nil + return parentTrie, nil } // Deduplicate writes to the same register: we only retain the value of the last write @@ -235,7 +252,7 @@ func (f *Forest) Update(u *ledger.TrieUpdate) (ledger.RootHash, error) { applyPruning := true newTrie, maxDepthTouched, err := trie.NewTrieWithUpdatedRegisters(parentTrie, deduplicatedPaths, deduplicatedPayloads, applyPruning) if err != nil { - return emptyHash, fmt.Errorf("constructing updated trie failed: %w", err) + return nil, fmt.Errorf("constructing updated trie failed: %w", err) } f.metrics.LatestTrieRegCount(newTrie.AllocatedRegCount()) @@ -244,12 +261,7 @@ func (f *Forest) Update(u *ledger.TrieUpdate) (ledger.RootHash, error) { f.metrics.LatestTrieRegSizeDiff(int64(newTrie.AllocatedRegSize() - parentTrie.AllocatedRegSize())) f.metrics.LatestTrieMaxDepthTouched(maxDepthTouched) - err = f.AddTrie(newTrie) - if err != nil { - return emptyHash, fmt.Errorf("adding updated trie to forest failed: %w", err) - } - - return newTrie.RootHash(), nil + return newTrie, nil } // Proofs returns a batch proof for the given paths. 
diff --git a/ledger/complete/wal/checkpointer_test.go b/ledger/complete/wal/checkpointer_test.go index 53db783bbdf..24beed91efb 100644 --- a/ledger/complete/wal/checkpointer_test.go +++ b/ledger/complete/wal/checkpointer_test.go @@ -148,7 +148,7 @@ func Test_Checkpointing(t *testing.T) { trieUpdate, err := pathfinder.UpdateToTrieUpdate(update, pathFinderVersion) require.NoError(t, err) - err = wal.RecordUpdate(trieUpdate) + _, _, err = wal.RecordUpdate(trieUpdate) require.NoError(t, err) rootHash, err := f.Update(trieUpdate) @@ -277,7 +277,7 @@ func Test_Checkpointing(t *testing.T) { trieUpdate, err := pathfinder.UpdateToTrieUpdate(update, pathFinderVersion) require.NoError(t, err) - err = wal4.RecordUpdate(trieUpdate) + _, _, err = wal4.RecordUpdate(trieUpdate) require.NoError(t, err) rootHash, err = f.Update(trieUpdate) @@ -448,7 +448,7 @@ func TestCheckpointFileError(t *testing.T) { trieUpdate, err := pathfinder.UpdateToTrieUpdate(update, pathFinderVersion) require.NoError(t, err) - err = wal.RecordUpdate(trieUpdate) + _, _, err = wal.RecordUpdate(trieUpdate) require.NoError(t, err) // some buffer time of the checkpointer to run diff --git a/ledger/complete/wal/compactor.go b/ledger/complete/wal/compactor.go deleted file mode 100644 index bfcd4be729d..00000000000 --- a/ledger/complete/wal/compactor.go +++ /dev/null @@ -1,157 +0,0 @@ -package wal - -import ( - "fmt" - "io" - "sync" - "time" - - "github.com/rs/zerolog" - - "github.com/onflow/flow-go/module/lifecycle" - "github.com/onflow/flow-go/module/observable" -) - -type Compactor struct { - checkpointer *Checkpointer - logger zerolog.Logger - stopc chan struct{} - lm *lifecycle.LifecycleManager - sync.Mutex - observers map[observable.Observer]struct{} - interval time.Duration - checkpointDistance uint - checkpointsToKeep uint -} - -func NewCompactor(checkpointer *Checkpointer, interval time.Duration, checkpointDistance uint, checkpointsToKeep uint, logger zerolog.Logger) *Compactor { - if checkpointDistance < 1 { - checkpointDistance = 1 - } - return &Compactor{ - checkpointer: checkpointer, - logger: logger, - stopc: make(chan struct{}), - observers: make(map[observable.Observer]struct{}), - lm: lifecycle.NewLifecycleManager(), - interval: interval, - checkpointDistance: checkpointDistance, - checkpointsToKeep: checkpointsToKeep, - } -} - -func (c *Compactor) Subscribe(observer observable.Observer) { - var void struct{} - c.observers[observer] = void -} - -func (c *Compactor) Unsubscribe(observer observable.Observer) { - delete(c.observers, observer) -} - -// Ready periodically fires Run function, every `interval` -func (c *Compactor) Ready() <-chan struct{} { - c.lm.OnStart(func() { - go c.start() - }) - return c.lm.Started() -} - -func (c *Compactor) Done() <-chan struct{} { - c.lm.OnStop(func() { - for observer := range c.observers { - observer.OnComplete() - } - c.stopc <- struct{}{} - }) - return c.lm.Stopped() -} - -func (c *Compactor) start() { - for { - err := c.Run() - if err != nil { - c.logger.Error().Err(err).Msg("error running compactor") - } - - select { - case <-c.stopc: - return - case <-time.After(c.interval): - } - } -} - -func (c *Compactor) Run() error { - c.Lock() - defer c.Unlock() - - newLatestCheckpoint, err := c.createCheckpoints() - if err != nil { - return fmt.Errorf("cannot create checkpoints: %w", err) - } - - err = c.cleanupCheckpoints() - if err != nil { - return fmt.Errorf("cannot cleanup checkpoints: %w", err) - } - - if newLatestCheckpoint > 0 { - for observer := range c.observers { - 
observer.OnNext(newLatestCheckpoint) - } - } - - return nil -} - -func (c *Compactor) createCheckpoints() (int, error) { - from, to, err := c.checkpointer.NotCheckpointedSegments() - if err != nil { - return -1, fmt.Errorf("cannot get latest checkpoint: %w", err) - } - - // we only return a positive value if the latest checkpoint index has changed - newLatestCheckpoint := -1 - // more then one segment means we can checkpoint safely up to `to`-1 - // presumably last segment is being written to - if to-from > int(c.checkpointDistance) { - startTime := time.Now() - - checkpointNumber := to - 1 - c.logger.Info().Msgf("creating checkpoint %d from segment %d to segment %d", checkpointNumber, from, checkpointNumber) - err = c.checkpointer.Checkpoint(checkpointNumber, func() (io.WriteCloser, error) { - return c.checkpointer.CheckpointWriter(checkpointNumber) - }) - if err != nil { - return -1, fmt.Errorf("error creating checkpoint (%d): %w", checkpointNumber, err) - } - newLatestCheckpoint = checkpointNumber - - duration := time.Since(startTime) - c.logger.Info().Float64("total_time_s", duration.Seconds()).Msgf("created checkpoint %d from segment %d to segment %d", checkpointNumber, from, checkpointNumber) - } - return newLatestCheckpoint, nil -} - -func (c *Compactor) cleanupCheckpoints() error { - // don't bother listing checkpoints if we keep them all - if c.checkpointsToKeep == 0 { - return nil - } - checkpoints, err := c.checkpointer.Checkpoints() - if err != nil { - return fmt.Errorf("cannot list checkpoints: %w", err) - } - if len(checkpoints) > int(c.checkpointsToKeep) { - checkpointsToRemove := checkpoints[:len(checkpoints)-int(c.checkpointsToKeep)] // if condition guarantees this never fails - - for _, checkpoint := range checkpointsToRemove { - err := c.checkpointer.RemoveCheckpoint(checkpoint) - if err != nil { - return fmt.Errorf("cannot remove checkpoint %d: %w", checkpoint, err) - } - } - } - return nil -} diff --git a/ledger/complete/wal/compactor_test.go b/ledger/complete/wal/compactor_test.go deleted file mode 100644 index 4a29aad8ff5..00000000000 --- a/ledger/complete/wal/compactor_test.go +++ /dev/null @@ -1,330 +0,0 @@ -package wal - -import ( - "fmt" - "os" - "path" - "testing" - "time" - - "github.com/rs/zerolog" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - - "github.com/onflow/flow-go/ledger" - "github.com/onflow/flow-go/ledger/common/utils" - "github.com/onflow/flow-go/ledger/complete/mtrie" - "github.com/onflow/flow-go/ledger/complete/mtrie/trie" - "github.com/onflow/flow-go/model/bootstrap" - "github.com/onflow/flow-go/module/metrics" - "github.com/onflow/flow-go/utils/unittest" -) - -// Compactor observer that waits until it gets notified of a -// latest checkpoint larger than fromBound -type CompactorObserver struct { - fromBound int - done chan struct{} -} - -func (co *CompactorObserver) OnNext(val interface{}) { - res, ok := val.(int) - if ok { - new := res - if new >= co.fromBound { - co.done <- struct{}{} - } - } -} -func (co *CompactorObserver) OnError(err error) {} -func (co *CompactorObserver) OnComplete() { - close(co.done) -} - -func Test_Compactor(t *testing.T) { - numInsPerStep := 2 - pathByteSize := 32 - minPayloadByteSize := 2 << 15 - maxPayloadByteSize := 2 << 16 - size := 10 - metricsCollector := &metrics.NoopCollector{} - checkpointDistance := uint(2) - - unittest.RunWithTempDir(t, func(dir string) { - - f, err := mtrie.NewForest(size*10, metricsCollector, nil) - require.NoError(t, err) - - var rootHash = 
f.GetEmptyRootHash() - - //saved data after updates - savedData := make(map[ledger.RootHash]map[ledger.Path]*ledger.Payload) - - t.Run("Compactor creates checkpoints eventually", func(t *testing.T) { - - wal, err := NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir, size*10, pathByteSize, 32*1024) - require.NoError(t, err) - - // WAL segments are 32kB, so here we generate 2 keys 64kB each, times `size` - // so we should get at least `size` segments - - checkpointer, err := wal.NewCheckpointer() - require.NoError(t, err) - - compactor := NewCompactor(checkpointer, 100*time.Millisecond, checkpointDistance, 1, zerolog.Nop()) //keep only latest checkpoint - co := CompactorObserver{fromBound: 9, done: make(chan struct{})} - compactor.Subscribe(&co) - - // Run Compactor in background. - <-compactor.Ready() - - // Generate the tree and create WAL - for i := 0; i < size; i++ { - - paths0 := utils.RandomPaths(numInsPerStep) - payloads0 := utils.RandomPayloads(numInsPerStep, minPayloadByteSize, maxPayloadByteSize) - - var paths []ledger.Path - var payloads []*ledger.Payload - paths = append(paths, paths0...) - payloads = append(payloads, payloads0...) - - update := &ledger.TrieUpdate{RootHash: rootHash, Paths: paths, Payloads: payloads} - - err = wal.RecordUpdate(update) - require.NoError(t, err) - - rootHash, err = f.Update(update) - require.NoError(t, err) - - require.FileExists(t, path.Join(dir, NumberToFilenamePart(i))) - - data := make(map[ledger.Path]*ledger.Payload, len(paths)) - for j, path := range paths { - data[path] = payloads[j] - } - - savedData[rootHash] = data - } - - // wait for the bound-checking observer to confirm checkpoints have been made - select { - case <-co.done: - // continue - case <-time.After(60 * time.Second): - assert.FailNow(t, "timed out") - } - - from, to, err := checkpointer.NotCheckpointedSegments() - require.NoError(t, err) - - assert.True(t, from == 10 && to == 10, "from: %v, to: %v", from, to) //make sure there is no leftover - - require.NoFileExists(t, path.Join(dir, "checkpoint.00000000")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000001")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000002")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000003")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000004")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000005")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000006")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000007")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000008")) - require.FileExists(t, path.Join(dir, "checkpoint.00000009")) - - <-compactor.Done() - <-wal.Done() - require.NoError(t, err) - }) - - time.Sleep(2 * time.Second) - - t.Run("remove unnecessary files", func(t *testing.T) { - // Remove all files apart from target checkpoint and WAL segments ahead of it - // We know their names, so just hardcode them - dirF, _ := os.Open(dir) - files, _ := dirF.Readdir(0) - - for _, fileInfo := range files { - - name := fileInfo.Name() - - if name != "checkpoint.00000009" && - name != "00000010" { - err := os.Remove(path.Join(dir, name)) - require.NoError(t, err) - } - } - }) - - f2, err := mtrie.NewForest(size*10, metricsCollector, nil) - require.NoError(t, err) - - time.Sleep(2 * time.Second) - - t.Run("load data from checkpoint and WAL", func(t *testing.T) { - wal2, err := NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir, size*10, pathByteSize, 32*1024) - require.NoError(t, err) - - err = wal2.Replay( 
- func(tries []*trie.MTrie) error { - return f2.AddTries(tries) - }, - func(update *ledger.TrieUpdate) error { - _, err := f2.Update(update) - return err - }, - func(rootHash ledger.RootHash) error { - return fmt.Errorf("no deletion expected") - }, - ) - require.NoError(t, err) - - <-wal2.Done() - - }) - - t.Run("make sure forests are equal", func(t *testing.T) { - - //check for same data - for rootHash, data := range savedData { - - paths := make([]ledger.Path, 0, len(data)) - for path := range data { - paths = append(paths, path) - } - - read := &ledger.TrieRead{RootHash: rootHash, Paths: paths} - values, err := f.Read(read) - require.NoError(t, err) - - values2, err := f2.Read(read) - require.NoError(t, err) - - for i, path := range paths { - require.Equal(t, data[path].Value, values[i]) - require.Equal(t, data[path].Value, values2[i]) - } - } - - // check for - forestTries, err := f.GetTries() - require.NoError(t, err) - - forestTries2, err := f2.GetTries() - require.NoError(t, err) - - // order might be different - require.Equal(t, len(forestTries), len(forestTries2)) - }) - - }) -} - -func Test_Compactor_checkpointInterval(t *testing.T) { - - numInsPerStep := 2 - pathByteSize := 32 - minPayloadByteSize := 100 - maxPayloadByteSize := 2 << 16 - size := 20 - metricsCollector := &metrics.NoopCollector{} - checkpointDistance := uint(3) // there should be 3 WAL not checkpointed - - unittest.RunWithTempDir(t, func(dir string) { - - f, err := mtrie.NewForest(size*10, metricsCollector, nil) - require.NoError(t, err) - - var rootHash = f.GetEmptyRootHash() - - t.Run("Compactor creates checkpoints", func(t *testing.T) { - - wal, err := NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir, size*10, pathByteSize, 32*1024) - require.NoError(t, err) - - // WAL segments are 32kB, so here we generate 2 keys 64kB each, times `size` - // so we should get at least `size` segments - checkpointer, err := wal.NewCheckpointer() - require.NoError(t, err) - - compactor := NewCompactor(checkpointer, 100*time.Millisecond, checkpointDistance, 2, zerolog.Nop()) - - // Generate the tree and create WAL - for i := 0; i < size; i++ { - - paths0 := utils.RandomPaths(numInsPerStep) - payloads0 := utils.RandomPayloads(numInsPerStep, minPayloadByteSize, maxPayloadByteSize) - - var paths []ledger.Path - var payloads []*ledger.Payload - // TODO figure out twice insert - paths = append(paths, paths0...) - payloads = append(payloads, payloads0...) 
- - update := &ledger.TrieUpdate{RootHash: rootHash, Paths: paths, Payloads: payloads} - - err = wal.RecordUpdate(update) - require.NoError(t, err) - - rootHash, err = f.Update(update) - require.NoError(t, err) - - require.FileExists(t, path.Join(dir, NumberToFilenamePart(i))) - - // run checkpoint creation after every file - _, err = compactor.createCheckpoints() - require.NoError(t, err) - } - - // assert creation of checkpoint files precisely - require.NoFileExists(t, path.Join(dir, bootstrap.FilenameWALRootCheckpoint)) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000001")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000002")) - require.FileExists(t, path.Join(dir, "checkpoint.00000003")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000004")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000005")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000006")) - require.FileExists(t, path.Join(dir, "checkpoint.00000007")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000008")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000009")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000010")) - require.FileExists(t, path.Join(dir, "checkpoint.00000011")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000012")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000013")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000014")) - require.FileExists(t, path.Join(dir, "checkpoint.00000015")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000016")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000017")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000017")) - require.FileExists(t, path.Join(dir, "checkpoint.00000019")) - - // expect all but last 2 checkpoints gone - err = compactor.cleanupCheckpoints() - require.NoError(t, err) - - require.NoFileExists(t, path.Join(dir, bootstrap.FilenameWALRootCheckpoint)) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000001")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000002")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000003")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000004")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000005")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000006")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000007")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000008")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000009")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000010")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000011")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000012")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000013")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000014")) - require.FileExists(t, path.Join(dir, "checkpoint.00000015")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000016")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000017")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000017")) - require.FileExists(t, path.Join(dir, "checkpoint.00000019")) - - <-wal.Done() - require.NoError(t, err) - }) - }) -} diff --git a/ledger/complete/wal/fixtures/noopwal.go b/ledger/complete/wal/fixtures/noopwal.go index 8f705efdbf2..becefb042b1 100644 --- a/ledger/complete/wal/fixtures/noopwal.go +++ b/ledger/complete/wal/fixtures/noopwal.go @@ -29,7 +29,7 @@ func (w *NoopWAL) PauseRecord() {} func (w *NoopWAL) UnpauseRecord() {} -func 
(w *NoopWAL) RecordUpdate(update *ledger.TrieUpdate) error { return nil } +func (w *NoopWAL) RecordUpdate(update *ledger.TrieUpdate) (int, bool, error) { return 0, false, nil } func (w *NoopWAL) RecordDelete(rootHash ledger.RootHash) error { return nil } diff --git a/ledger/complete/wal/wal.go b/ledger/complete/wal/wal.go index c76df91aee1..9323fd18b43 100644 --- a/ledger/complete/wal/wal.go +++ b/ledger/complete/wal/wal.go @@ -49,20 +49,23 @@ func (w *DiskWAL) UnpauseRecord() { w.paused = false } -func (w *DiskWAL) RecordUpdate(update *ledger.TrieUpdate) error { +func (w *DiskWAL) RecordUpdate(update *ledger.TrieUpdate) (segmentNum int, skipped bool, err error) { if w.paused { - return nil + return 0, true, nil } bytes := EncodeUpdate(update) - _, err := w.wal.Log(bytes) + locations, err := w.wal.Log(bytes) if err != nil { - return fmt.Errorf("error while recording update in LedgerWAL: %w", err) + return 0, false, fmt.Errorf("error while recording update in LedgerWAL: %w", err) + } + if len(locations) != 1 { + return 0, false, fmt.Errorf("error while recording update in LedgerWAL: got %d locations, expected 1 location", len(locations)) } - return nil + return locations[0].Segment, false, nil } func (w *DiskWAL) RecordDelete(rootHash ledger.RootHash) error { @@ -314,7 +317,7 @@ type LedgerWAL interface { NewCheckpointer() (*Checkpointer, error) PauseRecord() UnpauseRecord() - RecordUpdate(update *ledger.TrieUpdate) error + RecordUpdate(update *ledger.TrieUpdate) (int, bool, error) RecordDelete(rootHash ledger.RootHash) error ReplayOnForest(forest *mtrie.Forest) error Segments() (first, last int, err error) From 69443275aaa3154bee915ae80d6f8a5b3179cbd5 Mon Sep 17 00:00:00 2001 From: Faye Amacker <33205765+fxamacker@users.noreply.github.com> Date: Tue, 19 Jul 2022 17:50:08 -0500 Subject: [PATCH 02/32] Refactor Ledger/DiskWAL/Compactor shutdowns Improve concurrency, so goroutines don't close the WAL file before writes to it finish: - Remove DiskWAL as standalone component in NodeBuilder. - Make Ledger responsible for starting DiskWAL. - Make Ledger responsible for shutting down DiskWAL to ensure that all WAL updates are completed before closing opened WAL segment. - Make Compactor close a channel to signal to Ledger that all WAL updates are done during shutdown. - Remove explicit DiskWAL shutdown when it is used by Ledger. 
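For orientation, a rough, hypothetical sketch (not part of this patch) of how the new pieces are wired and in which order they shut down, pieced together from the NodeBuilder and test changes below; the directory path and the cache/checkpoint parameter values are illustrative only:

    package main

    import (
        "github.com/rs/zerolog"

        "github.com/onflow/flow-go/ledger/common/pathfinder"
        "github.com/onflow/flow-go/ledger/complete"
        "github.com/onflow/flow-go/ledger/complete/wal"
        "github.com/onflow/flow-go/module/metrics"
    )

    func main() {
        logger := zerolog.Nop()
        collector := metrics.NewNoopCollector()

        // Ledger owns the DiskWAL and is responsible for starting and stopping it.
        diskWAL, _ := wal.NewDiskWAL(logger, nil, collector, "/tmp/triedir", 500, pathfinder.PathByteSize, wal.SegmentSize)
        led, _ := complete.NewSyncLedger(diskWAL, 500, collector, logger, complete.DefaultPathFinderVersion)
        compactor, _ := complete.NewCompactor(led, diskWAL, 100, 1, logger)

        <-led.Ready()       // starts the DiskWAL
        <-compactor.Ready() // starts consuming led.TrieUpdateChan()

        // ... led.Set(...) sends trie updates to the Compactor, which records them in the WAL ...

        // Shutdown: Ledger closes trieUpdateCh; Compactor drains it and closes
        // trieUpdateDoneCh; Ledger then shuts down the DiskWAL.
        ledgerDone := led.Done()
        compactorDone := compactor.Done()
        <-ledgerDone
        <-compactorDone
    }
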
--- cmd/bootstrap/run/execution_state.go | 6 +- cmd/execution_builder.go | 14 +++-- .../exec-data-json-export/ledger_exporter.go | 6 +- .../execution_state_extract.go | 6 +- .../execution_state_extract_test.go | 2 - ledger/complete/checkpoint_benchmark_test.go | 1 - ledger/complete/compactor.go | 57 +++++++++++++------ ledger/complete/compactor_test.go | 46 ++++++++++----- ledger/complete/ledger.go | 42 +++++++++++--- ledger/complete/ledger_benchmark_test.go | 35 +++++------- ledger/complete/ledger_test.go | 16 +++--- ledger/complete/wal/checkpointer_test.go | 2 - 12 files changed, 144 insertions(+), 89 deletions(-) diff --git a/cmd/bootstrap/run/execution_state.go b/cmd/bootstrap/run/execution_state.go index 0a5a3cc3d29..802b04a3ef0 100644 --- a/cmd/bootstrap/run/execution_state.go +++ b/cmd/bootstrap/run/execution_state.go @@ -41,14 +41,14 @@ func GenerateExecutionState( if err != nil { return flow.DummyStateCommitment, err } - defer func() { - <-diskWal.Done() - }() ledgerStorage, err := ledger.NewLedger(diskWal, 100, metricsCollector, zerolog.Nop(), ledger.DefaultPathFinderVersion) if err != nil { return flow.DummyStateCommitment, err } + defer func() { + <-ledgerStorage.Done() + }() return bootstrap.NewBootstrapper( zerolog.Nop()).BootstrapLedger( diff --git a/cmd/execution_builder.go b/cmd/execution_builder.go index 0dfb24f76fb..641221d6f65 100644 --- a/cmd/execution_builder.go +++ b/cmd/execution_builder.go @@ -341,12 +341,6 @@ func (e *ExecutionNodeBuilder) LoadComponentsAndModules() { } return nil }). - Component("Write-Ahead Log", func(node *NodeConfig) (module.ReadyDoneAware, error) { - var err error - diskWAL, err = wal.NewDiskWAL(node.Logger.With().Str("subcomponent", "wal").Logger(), - node.MetricsRegisterer, collector, e.exeConf.triedir, int(e.exeConf.mTrieCacheSize), pathfinder.PathByteSize, wal.SegmentSize) - return diskWAL, err - }). Component("execution state ledger", func(node *NodeConfig) (module.ReadyDoneAware, error) { // check if the execution database already exists @@ -382,6 +376,14 @@ func (e *ExecutionNodeBuilder) LoadComponentsAndModules() { } } + // Ledger is responsible for starting and shutting down the DiskWAL component. + // This ensures that all WAL updates are completed before closing opened WAL segment. 
+ diskWAL, err = wal.NewDiskWAL(node.Logger.With().Str("subcomponent", "wal").Logger(), + node.MetricsRegisterer, collector, e.exeConf.triedir, int(e.exeConf.mTrieCacheSize), pathfinder.PathByteSize, wal.SegmentSize) + if err != nil { + return nil, fmt.Errorf("failed to initialize wal: %w", err) + } + ledgerStorage, err = ledger.NewSyncLedger(diskWAL, int(e.exeConf.mTrieCacheSize), collector, node.Logger.With().Str("subcomponent", "ledger").Logger(), ledger.DefaultPathFinderVersion) return ledgerStorage, err diff --git a/cmd/util/cmd/exec-data-json-export/ledger_exporter.go b/cmd/util/cmd/exec-data-json-export/ledger_exporter.go index c1b9df611f9..e97cd9f34a2 100644 --- a/cmd/util/cmd/exec-data-json-export/ledger_exporter.go +++ b/cmd/util/cmd/exec-data-json-export/ledger_exporter.go @@ -24,13 +24,13 @@ func ExportLedger(ledgerPath string, targetstate string, outputPath string) erro if err != nil { return fmt.Errorf("cannot create WAL: %w", err) } - defer func() { - <-diskWal.Done() - }() led, err := complete.NewLedger(diskWal, complete.DefaultCacheSize, &metrics.NoopCollector{}, log.Logger, 0) if err != nil { return fmt.Errorf("cannot create ledger from write-a-head logs and checkpoints: %w", err) } + defer func() { + <-led.Done() + }() stateBytes, err := hex.DecodeString(targetstate) if err != nil { return fmt.Errorf("failed to decode hex code of state: %w", err) diff --git a/cmd/util/cmd/execution-state-extract/execution_state_extract.go b/cmd/util/cmd/execution-state-extract/execution_state_extract.go index 499a2b0ec27..7e6fdd96579 100644 --- a/cmd/util/cmd/execution-state-extract/execution_state_extract.go +++ b/cmd/util/cmd/execution-state-extract/execution_state_extract.go @@ -43,9 +43,6 @@ func extractExecutionState( if err != nil { return fmt.Errorf("cannot create disk WAL: %w", err) } - defer func() { - <-diskWal.Done() - }() led, err := complete.NewLedger( diskWal, @@ -56,6 +53,9 @@ func extractExecutionState( if err != nil { return fmt.Errorf("cannot create ledger from write-a-head logs and checkpoints: %w", err) } + defer func() { + <-led.Done() + }() var migrations []ledger.Migration var preCheckpointReporters, postCheckpointReporters []ledger.Reporter diff --git a/cmd/util/cmd/execution-state-extract/execution_state_extract_test.go b/cmd/util/cmd/execution-state-extract/execution_state_extract_test.go index edb029fce3a..f865a13107c 100644 --- a/cmd/util/cmd/execution-state-extract/execution_state_extract_test.go +++ b/cmd/util/cmd/execution-state-extract/execution_state_extract_test.go @@ -120,7 +120,6 @@ func TestExtractExecutionState(t *testing.T) { blocksInOrder[i] = blockID } - <-diskWal.Done() <-f.Done() err = db.Close() require.NoError(t, err) @@ -181,7 +180,6 @@ func TestExtractExecutionState(t *testing.T) { require.Error(t, err) } - <-diskWal.Done() <-storage.Done() }) } diff --git a/ledger/complete/checkpoint_benchmark_test.go b/ledger/complete/checkpoint_benchmark_test.go index 799651035a8..875ec0c1c0b 100644 --- a/ledger/complete/checkpoint_benchmark_test.go +++ b/ledger/complete/checkpoint_benchmark_test.go @@ -284,7 +284,6 @@ func benchmarkNewCheckpointRandomData(b *testing.B, segmentCount int) { b.Fatal(err) } - <-wal1.Done() <-led.Done() wal2, err := wal.NewDiskWAL( diff --git a/ledger/complete/compactor.go b/ledger/complete/compactor.go index e2d2afae88f..8389029cc88 100644 --- a/ledger/complete/compactor.go +++ b/ledger/complete/compactor.go @@ -16,8 +16,8 @@ import ( ) type WALTrieUpdate struct { - Update *ledger.TrieUpdate - Resultc chan<- error + Update 
*ledger.TrieUpdate + ResultCh chan<- error } type Compactor struct { @@ -25,8 +25,9 @@ type Compactor struct { wal *realWAL.DiskWAL ledger *Ledger logger zerolog.Logger - stopc chan struct{} - trieUpdatec <-chan *WALTrieUpdate + stopCh chan chan struct{} + trieUpdateCh <-chan *WALTrieUpdate + trieUpdateDoneCh chan<- struct{} lm *lifecycle.LifecycleManager observers map[observable.Observer]struct{} checkpointDistance uint @@ -43,18 +44,24 @@ func NewCompactor(l *Ledger, w *realWAL.DiskWAL, checkpointDistance uint, checkp return nil, err } - trieUpdatec := l.TrieUpdateChan() - if trieUpdatec == nil { + trieUpdateCh := l.TrieUpdateChan() + if trieUpdateCh == nil { return nil, errors.New("failed to get valid trie update channel from ledger") } + trieUpdateDoneCh := l.TrieUpdateDoneChan() + if trieUpdateDoneCh == nil { + return nil, errors.New("failed to get valid trie update done channel from ledger") + } + return &Compactor{ checkpointer: checkpointer, wal: w, ledger: l, logger: logger, - stopc: make(chan struct{}), - trieUpdatec: trieUpdatec, + stopCh: make(chan chan struct{}), + trieUpdateCh: trieUpdateCh, + trieUpdateDoneCh: l.trieUpdateDoneCh, observers: make(map[observable.Observer]struct{}), lm: lifecycle.NewLifecycleManager(), checkpointDistance: checkpointDistance, @@ -81,10 +88,20 @@ func (c *Compactor) Ready() <-chan struct{} { func (c *Compactor) Done() <-chan struct{} { c.lm.OnStop(func() { + // Notify observers for observer := range c.observers { observer.OnComplete() } - c.stopc <- struct{}{} + + // Signal Compactor goroutine to stop + doneCh := make(chan struct{}) + c.stopCh <- doneCh + + // Wait for Compactor goroutine to stop + <-doneCh + + // Close trieUpdateDoneCh to signal trie updates are finished + close(c.trieUpdateDoneCh) }) return c.lm.Stopped() } @@ -121,10 +138,11 @@ Loop: for { select { - case <-c.stopc: + case doneCh := <-c.stopCh: + defer close(doneCh) break Loop - case update := <-c.trieUpdatec: + case update := <-c.trieUpdateCh: // RecordUpdate returns the segment number the record was written to. // Returned segment number can be // - the same as previous segment number (same segment), or @@ -132,7 +150,7 @@ Loop: segmentNum, skipped, err := c.wal.RecordUpdate(update.Update) if err != nil || skipped || segmentNum == activeSegmentNum { - update.Resultc <- err + update.ResultCh <- err continue } @@ -148,7 +166,7 @@ Loop: if nextCheckpointNum > prevSegmentNum { // Not enough segments for checkpointing - update.Resultc <- nil + update.ResultCh <- nil continue } @@ -165,7 +183,7 @@ Loop: } // Send WAL update result after ledger state snapshot is done. - update.Resultc <- nil + update.ResultCh <- nil // Try to checkpoint if checkpointSem.TryAcquire(1) { @@ -193,13 +211,16 @@ Loop: } } - // Drain and record remaining trie update in the channel. - for update := range c.trieUpdatec { + // Drain and process remaining trie updates in channel. + for update := range c.trieUpdateCh { _, _, err := c.wal.RecordUpdate(update.Update) - update.Resultc <- err + select { + case update.ResultCh <- err: + default: + } } - // TODO: wait for checkpointing goroutine? + // Don't wait for checkpointing to finish because it might take too long. 
} func (c *Compactor) checkpoint(tries []*trie.MTrie, checkpointNum int) error { diff --git a/ledger/complete/compactor_test.go b/ledger/complete/compactor_test.go index 49a29d99dc7..74d42675ae4 100644 --- a/ledger/complete/compactor_test.go +++ b/ledger/complete/compactor_test.go @@ -80,7 +80,7 @@ func TestCompactor(t *testing.T) { // Run Compactor in background. <-compactor.Ready() - rootHash := trie.EmptyTrieRootHash() + rootState := l.InitialState() // Generate the tree and create WAL for i := 0; i < size; i++ { @@ -94,10 +94,10 @@ func TestCompactor(t *testing.T) { values[i] = p.Value } - update, err := ledger.NewUpdate(ledger.State(rootHash), keys, values) + update, err := ledger.NewUpdate(rootState, keys, values) require.NoError(t, err) - rootHash, _, err := l.Set(update) + newState, _, err := l.Set(update) require.NoError(t, err) require.FileExists(t, path.Join(dir, realWAL.NumberToFilenamePart(i))) @@ -108,7 +108,9 @@ func TestCompactor(t *testing.T) { data[ks] = payloads[j] } - savedData[ledger.RootHash(rootHash)] = data + savedData[ledger.RootHash(newState)] = data + + rootState = newState } // wait for the bound-checking observer to confirm checkpoints have been made @@ -138,8 +140,11 @@ func TestCompactor(t *testing.T) { require.NoFileExists(t, path.Join(dir, "checkpoint.00000008")) require.FileExists(t, path.Join(dir, "checkpoint.00000009")) - <-compactor.Done() - <-wal.Done() + ledgerDone := l.Done() + compactorDone := compactor.Done() + + <-ledgerDone + <-compactorDone }) time.Sleep(2 * time.Second) @@ -203,15 +208,23 @@ func TestCompactor(t *testing.T) { } } - // check for forestTries, err := l.Tries() require.NoError(t, err) + forestTriesSet := make(map[ledger.RootHash]struct{}) + for _, trie := range forestTries { + forestTriesSet[trie.RootHash()] = struct{}{} + } + forestTries2, err := l.Tries() require.NoError(t, err) - // order might be different - require.Equal(t, len(forestTries), len(forestTries2)) + forestTries2Set := make(map[ledger.RootHash]struct{}) + for _, trie := range forestTries2 { + forestTries2Set[trie.RootHash()] = struct{}{} + } + + require.Equal(t, forestTriesSet, forestTries2Set) }) }) @@ -256,7 +269,6 @@ func TestCompactorAccuracy(t *testing.T) { require.NoError(t, err) fromBound := lastCheckpointNum + int(checkpointDistance)*2 - lastCheckpointNum = fromBound co := CompactorObserver{fromBound: fromBound, done: make(chan struct{})} compactor.Subscribe(&co) @@ -279,20 +291,26 @@ func TestCompactorAccuracy(t *testing.T) { update, err := ledger.NewUpdate(ledger.State(rootHash), keys, values) require.NoError(t, err) - _, _, err = l.Set(update) + newState, _, err := l.Set(update) require.NoError(t, err) + + rootHash = ledger.RootHash(newState) } // wait for the bound-checking observer to confirm checkpoints have been made select { case <-co.done: // continue - case <-time.After(120 * time.Second): + case <-time.After(60 * time.Second): assert.FailNow(t, "timed out") } - <-compactor.Done() - <-wal.Done() + // Shutdown ledger and compactor + ledgerDone := l.Done() + compactorDone := compactor.Done() + + <-ledgerDone + <-compactorDone nums, err := checkpointer.Checkpoints() require.NoError(t, err) diff --git a/ledger/complete/ledger.go b/ledger/complete/ledger.go index 6ad846f8034..4a1dbf2c8c3 100644 --- a/ledger/complete/ledger.go +++ b/ledger/complete/ledger.go @@ -39,7 +39,8 @@ type Ledger struct { wal realWAL.LedgerWAL metrics module.LedgerMetrics logger zerolog.Logger - trieUpdatec chan *WALTrieUpdate + trieUpdateCh chan *WALTrieUpdate + 
trieUpdateDoneCh chan struct{} pathFinderVersion uint8 } @@ -100,27 +101,52 @@ func NewSyncLedger( return nil, err } - l.trieUpdatec = make(chan *WALTrieUpdate, defaultTrieUpdateChanSize) + l.trieUpdateCh = make(chan *WALTrieUpdate, defaultTrieUpdateChanSize) + l.trieUpdateDoneCh = make(chan struct{}) return l, nil } +// TrieUpdateChan returns a channel which is used to receive trie updates that needs to be logged in WALs. +// This channel is closed when ledger component shutdowns down. func (l *Ledger) TrieUpdateChan() <-chan *WALTrieUpdate { - return l.trieUpdatec + return l.trieUpdateCh +} + +// TrieUpdateDoneChan returns a channel which is closed when there are no more WAL updates. +// This is used to signal that it is safe to shutdown WAL component (closing opened WAL segment). +func (l *Ledger) TrieUpdateDoneChan() chan<- struct{} { + return l.trieUpdateDoneCh } // Ready implements interface module.ReadyDoneAware // it starts the EventLoop's internal processing loop. func (l *Ledger) Ready() <-chan struct{} { ready := make(chan struct{}) - close(ready) + go func() { + defer close(ready) + // Start WAL component. + <-l.wal.Ready() + }() return ready } // Done implements interface module.ReadyDoneAware -// it closes all the open write-ahead log files. func (l *Ledger) Done() <-chan struct{} { done := make(chan struct{}) - close(done) + go func() { + defer close(done) + if l.trieUpdateCh != nil { + // Compactor is running in parallel. + // Ledger is responsible for closing trieUpdateCh channel, + // so Compactor can drain and process remaining updates. + close(l.trieUpdateCh) + + // Wait for Compactor to finish all trie updates before closing WAL component. + <-l.trieUpdateDoneCh + } + // Shut down WAL component. + <-l.wal.Done() + }() return done } @@ -224,13 +250,13 @@ func (l *Ledger) Set(update *ledger.Update) (newState ledger.State, trieUpdate * walChan := make(chan error) - if l.trieUpdatec == nil { + if l.trieUpdateCh == nil { go func() { _, _, err := l.wal.RecordUpdate(trieUpdate) walChan <- err }() } else { - l.trieUpdatec <- &WALTrieUpdate{Update: trieUpdate, Resultc: walChan} + l.trieUpdateCh <- &WALTrieUpdate{Update: trieUpdate, ResultCh: walChan} } newTrie, err := l.forest.NewTrie(trieUpdate) diff --git a/ledger/complete/ledger_benchmark_test.go b/ledger/complete/ledger_benchmark_test.go index 519aee54e7c..c370fd68da7 100644 --- a/ledger/complete/ledger_benchmark_test.go +++ b/ledger/complete/ledger_benchmark_test.go @@ -43,12 +43,11 @@ func benchmarkStorage(steps int, b *testing.B) { diskWal, err := wal.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir, steps+1, pathfinder.PathByteSize, wal.SegmentSize) require.NoError(b, err) - defer func() { - <-diskWal.Done() - }() led, err := complete.NewLedger(diskWal, steps+1, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) - defer led.Done() + defer func() { + <-led.Done() + }() if err != nil { b.Fatal("can't create a new complete ledger") } @@ -151,12 +150,11 @@ func BenchmarkTrieUpdate(b *testing.B) { diskWal, err := wal.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir, 101, pathfinder.PathByteSize, wal.SegmentSize) require.NoError(b, err) - defer func() { - <-diskWal.Done() - }() led, err := complete.NewLedger(diskWal, 101, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) - defer led.Done() + defer func() { + <-led.Done() + }() if err != nil { b.Fatal("can't create a new complete ledger") } @@ -199,12 +197,11 @@ func BenchmarkTrieRead(b 
*testing.B) { diskWal, err := wal.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir, 101, pathfinder.PathByteSize, wal.SegmentSize) require.NoError(b, err) - defer func() { - <-diskWal.Done() - }() led, err := complete.NewLedger(diskWal, 101, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) - defer led.Done() + defer func() { + <-led.Done() + }() if err != nil { b.Fatal("can't create a new complete ledger") } @@ -256,12 +253,11 @@ func BenchmarkLedgerGetOneValue(b *testing.B) { diskWal, err := wal.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir, 101, pathfinder.PathByteSize, wal.SegmentSize) require.NoError(b, err) - defer func() { - <-diskWal.Done() - }() led, err := complete.NewLedger(diskWal, 101, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) - defer led.Done() + defer func() { + <-led.Done() + }() if err != nil { b.Fatal("can't create a new complete ledger") } @@ -330,12 +326,11 @@ func BenchmarkTrieProve(b *testing.B) { diskWal, err := wal.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir, 101, pathfinder.PathByteSize, wal.SegmentSize) require.NoError(b, err) - defer func() { - <-diskWal.Done() - }() led, err := complete.NewLedger(diskWal, 101, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) - defer led.Done() + defer func() { + <-led.Done() + }() if err != nil { b.Fatal("can't create a new complete ledger") } diff --git a/ledger/complete/ledger_test.go b/ledger/complete/ledger_test.go index aa0d0243bcf..d9240a2c39f 100644 --- a/ledger/complete/ledger_test.go +++ b/ledger/complete/ledger_test.go @@ -448,7 +448,6 @@ func Test_WAL(t *testing.T) { savedData[string(state[:])] = data } - <-diskWal.Done() <-led.Done() diskWal2, err := wal.NewDiskWAL(zerolog.Nop(), nil, metricsCollector, dir, size, pathfinder.PathByteSize, wal.SegmentSize) @@ -485,7 +484,6 @@ func Test_WAL(t *testing.T) { s := led2.ForestSize() assert.Equal(t, s, size) - <-diskWal2.Done() <-led2.Done() }) } @@ -595,7 +593,7 @@ func TestLedgerFunctionality(t *testing.T) { } state = newState } - <-diskWal.Done() + <-led.Done() }) } } @@ -641,8 +639,8 @@ func Test_ExportCheckpointAt(t *testing.T) { assert.Equal(t, v, retValues[i]) } - <-diskWal.Done() - <-diskWal2.Done() + <-led.Done() + <-led2.Done() }) }) }) @@ -683,8 +681,8 @@ func Test_ExportCheckpointAt(t *testing.T) { assert.Equal(t, retValues[0], ledger.Value([]byte{'C'})) assert.Equal(t, retValues[1], ledger.Value([]byte{'B'})) - <-diskWal.Done() - <-diskWal2.Done() + <-led.Done() + <-led2.Done() }) }) }) @@ -725,8 +723,8 @@ func Test_ExportCheckpointAt(t *testing.T) { assert.Equal(t, retValues[0], ledger.Value([]byte{'D'})) assert.Equal(t, retValues[1], ledger.Value([]byte{'B'})) - <-diskWal.Done() - <-diskWal2.Done() + <-led.Done() + <-led2.Done() }) }) }) diff --git a/ledger/complete/wal/checkpointer_test.go b/ledger/complete/wal/checkpointer_test.go index 24beed91efb..2b108236f32 100644 --- a/ledger/complete/wal/checkpointer_test.go +++ b/ledger/complete/wal/checkpointer_test.go @@ -80,7 +80,6 @@ func Test_WAL(t *testing.T) { savedData[string(state[:])] = data } - <-diskWal.Done() <-led.Done() diskWal2, err := realWAL.NewDiskWAL(zerolog.Nop(), nil, metricsCollector, dir, size, pathfinder.PathByteSize, realWAL.SegmentSize) @@ -112,7 +111,6 @@ func Test_WAL(t *testing.T) { } } - <-diskWal2.Done() <-led2.Done() }) } From 64d4c039f660ce936569d5deec582ffc62d080e1 Mon Sep 17 00:00:00 2001 From: Faye Amacker 
<33205765+fxamacker@users.noreply.github.com> Date: Tue, 19 Jul 2022 21:57:10 -0500 Subject: [PATCH 03/32] Handle more error conditions in compactor Retry checkpointing at next segment on - failure to checkpoint - failure to get tries snapshot - failure to start checkpointing because previous checkpointing is still running Add more logging. --- ledger/complete/compactor.go | 79 +++++++++++++++++++++++++----------- 1 file changed, 55 insertions(+), 24 deletions(-) diff --git a/ledger/complete/compactor.go b/ledger/complete/compactor.go index 8389029cc88..c2a12bcd1a3 100644 --- a/ledger/complete/compactor.go +++ b/ledger/complete/compactor.go @@ -20,6 +20,11 @@ type WALTrieUpdate struct { ResultCh chan<- error } +type checkpointResult struct { + num int + err error +} + type Compactor struct { checkpointer *realWAL.Checkpointer wal *realWAL.DiskWAL @@ -81,7 +86,7 @@ func (c *Compactor) Unsubscribe(observer observable.Observer) { // Ready periodically fires Run function, every `interval` func (c *Compactor) Ready() <-chan struct{} { c.lm.OnStart(func() { - go c.start() + go c.run() }) return c.lm.Started() } @@ -106,29 +111,35 @@ func (c *Compactor) Done() <-chan struct{} { return c.lm.Stopped() } -func (c *Compactor) start() { +func (c *Compactor) run() { - // Use checkpointSem to limit checkpointing goroutine to one. - // If checkpointing isn't finished, then wait before starting new checkpointing. + // checkpointSem is used to limit checkpointing to one. + // If previous checkpointing isn't finished when enough segments + // are finalized for next checkpointing, retry checkpointing + // again when next segment is finalized. + // This avoids having more tries in memory than needed. checkpointSem := semaphore.NewWeighted(1) - // Get active segment number + checkpointResultCh := make(chan checkpointResult, 1) + + // Get active segment number. // activeSegmentNum is updated when record is written to a new segment. _, activeSegmentNum, err := c.wal.Segments() if err != nil { - // TODO: handle error - c.logger.Error().Err(err).Msg("error getting active segment number") + c.logger.Error().Err(err).Msg("compactor failed to get active segment number") + activeSegmentNum = -1 } lastCheckpointNum, err := c.checkpointer.LatestCheckpoint() if err != nil { - // TODO: handle error - c.logger.Error().Err(err).Msg("error getting last checkpoint number") + c.logger.Error().Err(err).Msg("compactor failed to get last checkpoint number") + lastCheckpointNum = -1 } - // Compute next checkpoint number - // nextCheckpointNum is updated when checkpointing goroutine starts. - // NOTE: next checkpoint number must be >= active segment num + // Compute next checkpoint number. + // nextCheckpointNum is updated when checkpointing starts, fails to start, or fails. + // NOTE: next checkpoint number must >= active segment num. + // We can't reuse mtrie state to checkpoint tries in older segments. nextCheckpointNum := lastCheckpointNum + int(c.checkpointDistance) if activeSegmentNum > nextCheckpointNum { nextCheckpointNum = activeSegmentNum @@ -142,6 +153,16 @@ Loop: defer close(doneCh) break Loop + case checkpointResult := <-checkpointResultCh: + if checkpointResult.err != nil { + c.logger.Error().Err(checkpointResult.err).Msgf( + "compactor failed to checkpoint %d", checkpointResult.num, + ) + + // Retry checkpointing after active segment is finalized. + nextCheckpointNum = activeSegmentNum + } + case update := <-c.trieUpdateCh: // RecordUpdate returns the segment number the record was written to. 
// Returned segment number can be @@ -149,15 +170,26 @@ Loop: // - incremented by 1 from previous segment number (new segment) segmentNum, skipped, err := c.wal.RecordUpdate(update.Update) + if activeSegmentNum == -1 { + // Recover from failure to get active segment number outside loop + activeSegmentNum = segmentNum + if activeSegmentNum > nextCheckpointNum { + nextCheckpointNum = activeSegmentNum + } + } + if err != nil || skipped || segmentNum == activeSegmentNum { update.ResultCh <- err continue } + // In the remaining code: segmentNum > activeSegmentNum + + // active segment is finalized. + // Check new segment number is incremented by 1 if segmentNum != activeSegmentNum+1 { - // TODO: must handle error without panic before merging this code - panic(fmt.Sprintf("expected new segment numer %d, got %d", activeSegmentNum+1, segmentNum)) + c.logger.Error().Msg(fmt.Sprintf("compactor got unexpected new segment numer %d, want %d", segmentNum, activeSegmentNum+1)) } // Update activeSegmentNum @@ -170,13 +202,14 @@ Loop: continue } + // In the remaining code: nextCheckpointNum == prevSegmentNum + // Enough segments are created for checkpointing // Get forest from ledger before sending WAL update result tries, err := c.ledger.Tries() if err != nil { - // TODO: handle error - c.logger.Error().Err(err).Msg("error getting ledger tries") + c.logger.Error().Err(err).Msg("compactor failed to get ledger tries") // Try again after active segment is finalized. nextCheckpointNum = activeSegmentNum continue @@ -196,16 +229,14 @@ Loop: go func() { defer checkpointSem.Release(1) - err = c.checkpoint(tries, checkpointNum) - if err != nil { - c.logger.Error().Err(err).Msg("error checkpointing") - } - // TODO: retry if checkpointing fails. + err := c.checkpoint(tries, checkpointNum) + + checkpointResultCh <- checkpointResult{checkpointNum, err} }() } else { // Failed to get semaphore because checkpointing is running. // Try again when active segment is finalized. 
- c.logger.Info().Msgf("checkpoint %d is delayed because prior checkpointing is ongoing", nextCheckpointNum) + c.logger.Info().Msgf("compactor delayed checkpoint %d because prior checkpointing is ongoing", nextCheckpointNum) nextCheckpointNum = activeSegmentNum } } @@ -246,7 +277,7 @@ func (c *Compactor) checkpoint(tries []*trie.MTrie, checkpointNum int) error { func createCheckpoint(checkpointer *realWAL.Checkpointer, logger zerolog.Logger, tries []*trie.MTrie, checkpointNum int) error { - logger.Info().Msgf("serializing checkpoint %d with %d tries", checkpointNum, len(tries)) + logger.Info().Msgf("serializing checkpoint %d", checkpointNum) startTime := time.Now() @@ -268,7 +299,7 @@ func createCheckpoint(checkpointer *realWAL.Checkpointer, logger zerolog.Logger, } duration := time.Since(startTime) - logger.Info().Float64("total_time_s", duration.Seconds()).Msgf("created checkpoint %d with %d tries", checkpointNum, len(tries)) + logger.Info().Float64("total_time_s", duration.Seconds()).Msgf("created checkpoint %d", checkpointNum) return nil } From 3b35eb9a6da173f8c93984adad10ea80267d7400 Mon Sep 17 00:00:00 2001 From: Faye Amacker <33205765+fxamacker@users.noreply.github.com> Date: Tue, 19 Jul 2022 22:46:25 -0500 Subject: [PATCH 04/32] Check closed channel in Compactor --- ledger/complete/compactor.go | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/ledger/complete/compactor.go b/ledger/complete/compactor.go index c2a12bcd1a3..f0468092be8 100644 --- a/ledger/complete/compactor.go +++ b/ledger/complete/compactor.go @@ -137,7 +137,9 @@ func (c *Compactor) run() { } // Compute next checkpoint number. - // nextCheckpointNum is updated when checkpointing starts, fails to start, or fails. + // nextCheckpointNum is updated when + // - checkpointing starts, fails to start, or fails. + // - tries snapshot fails. // NOTE: next checkpoint number must >= active segment num. // We can't reuse mtrie state to checkpoint tries in older segments. nextCheckpointNum := lastCheckpointNum + int(c.checkpointDistance) @@ -163,7 +165,13 @@ Loop: nextCheckpointNum = activeSegmentNum } - case update := <-c.trieUpdateCh: + case update, ok := <-c.trieUpdateCh: + if !ok { + // trieUpdateCh channel is closed. + // Wait for stop signal from c.stopCh + continue + } + // RecordUpdate returns the segment number the record was written to. 
// Returned segment number can be // - the same as previous segment number (same segment), or From 6d60646f835088acaa1ed5c19df546dbd2dd3e8b Mon Sep 17 00:00:00 2001 From: Faye Amacker <33205765+fxamacker@users.noreply.github.com> Date: Wed, 20 Jul 2022 09:49:09 -0500 Subject: [PATCH 05/32] Add debug logging in test --- ledger/complete/compactor_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/ledger/complete/compactor_test.go b/ledger/complete/compactor_test.go index 74d42675ae4..b7044f73b77 100644 --- a/ledger/complete/compactor_test.go +++ b/ledger/complete/compactor_test.go @@ -33,6 +33,7 @@ func (co *CompactorObserver) OnNext(val interface{}) { res, ok := val.(int) if ok { new := res + fmt.Printf("Compactor observer received checkpoint num %d\n", new) if new >= co.fromBound { co.done <- struct{}{} } From 5a0044f48e3de9f28d79cafca013e82c51da78e9 Mon Sep 17 00:00:00 2001 From: Faye Amacker <33205765+fxamacker@users.noreply.github.com> Date: Wed, 20 Jul 2022 10:42:53 -0500 Subject: [PATCH 06/32] Add debug logging in test --- ledger/complete/compactor_test.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ledger/complete/compactor_test.go b/ledger/complete/compactor_test.go index b7044f73b77..8c0836c802b 100644 --- a/ledger/complete/compactor_test.go +++ b/ledger/complete/compactor_test.go @@ -2,6 +2,7 @@ package complete import ( "fmt" + "io/ioutil" "os" "path" "reflect" @@ -119,6 +120,14 @@ func TestCompactor(t *testing.T) { case <-co.done: // continue case <-time.After(60 * time.Second): + // Log segment and checkpoint files + files, err := ioutil.ReadDir(dir) + require.NoError(t, err) + + for _, file := range files { + fmt.Printf("%s, size %d\n", file.Name(), file.Size()) + } + assert.FailNow(t, "timed out") } From bf38e53d97175b917c8765810ae6e8daa4c2c62f Mon Sep 17 00:00:00 2001 From: Faye Amacker <33205765+fxamacker@users.noreply.github.com> Date: Thu, 21 Jul 2022 08:53:59 -0500 Subject: [PATCH 07/32] Avoid observer.OnNext() when Compactor is closing observer.OnComplete() is called when Compactor starts shutting down, which may close channel that observer.OnNext() uses to send data. --- ledger/complete/compactor.go | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/ledger/complete/compactor.go b/ledger/complete/compactor.go index f0468092be8..641feb08ffe 100644 --- a/ledger/complete/compactor.go +++ b/ledger/complete/compactor.go @@ -1,6 +1,7 @@ package complete import ( + "context" "errors" "fmt" "time" @@ -147,12 +148,15 @@ func (c *Compactor) run() { nextCheckpointNum = activeSegmentNum } + ctx, cancel := context.WithCancel(context.Background()) + Loop: for { select { case doneCh := <-c.stopCh: defer close(doneCh) + cancel() break Loop case checkpointResult := <-checkpointResultCh: @@ -237,7 +241,7 @@ Loop: go func() { defer checkpointSem.Release(1) - err := c.checkpoint(tries, checkpointNum) + err := c.checkpoint(ctx, tries, checkpointNum) checkpointResultCh <- checkpointResult{checkpointNum, err} }() @@ -262,13 +266,20 @@ Loop: // Don't wait for checkpointing to finish because it might take too long. } -func (c *Compactor) checkpoint(tries []*trie.MTrie, checkpointNum int) error { +func (c *Compactor) checkpoint(ctx context.Context, tries []*trie.MTrie, checkpointNum int) error { err := createCheckpoint(c.checkpointer, c.logger, tries, checkpointNum) if err != nil { return fmt.Errorf("cannot create checkpoints: %w", err) } + // Return if context is canceled. 
+ select { + case <-ctx.Done(): + return nil + default: + } + err = cleanupCheckpoints(c.checkpointer, int(c.checkpointsToKeep)) if err != nil { return fmt.Errorf("cannot cleanup checkpoints: %w", err) @@ -276,7 +287,15 @@ func (c *Compactor) checkpoint(tries []*trie.MTrie, checkpointNum int) error { if checkpointNum > 0 { for observer := range c.observers { - observer.OnNext(checkpointNum) + // Don't notify observer if context is canceled. + // observer.OnComplete() is called when Compactor starts shutting down, + // which may close channel that observer.OnNext() uses to send data. + select { + case <-ctx.Done(): + return nil + default: + observer.OnNext(checkpointNum) + } } } From 710f1c602b84c45b008cfe4af3e76839800ae160 Mon Sep 17 00:00:00 2001 From: Faye Amacker <33205765+fxamacker@users.noreply.github.com> Date: Thu, 21 Jul 2022 09:13:59 -0500 Subject: [PATCH 08/32] Handle ledger updates in parallel with Compactor Handle ledger updates in parallel for checkpointing by making WAL update and state update atomic through channels between Ledger and Compactor. In order to reuse ledger state for checkpointing, these two ops must be atomic: - WAL update (logging encoded update to WAL) - state update (adding new trie to forest) In other words, when Compactor starts processing new WAL update, previous WAL update and state update are complete. --- ledger/complete/compactor.go | 30 ++++++++-- ledger/complete/compactor_test.go | 94 +++++++++++++++++++++++++++++++ ledger/complete/ledger.go | 62 ++++++++++++-------- 3 files changed, 155 insertions(+), 31 deletions(-) diff --git a/ledger/complete/compactor.go b/ledger/complete/compactor.go index 641feb08ffe..ac1e2a95fde 100644 --- a/ledger/complete/compactor.go +++ b/ledger/complete/compactor.go @@ -19,6 +19,7 @@ import ( type WALTrieUpdate struct { Update *ledger.TrieUpdate ResultCh chan<- error + DoneCh <-chan struct{} } type checkpointResult struct { @@ -177,7 +178,7 @@ Loop: } // RecordUpdate returns the segment number the record was written to. - // Returned segment number can be + // Returned segment number (>= 0) can be // - the same as previous segment number (same segment), or // - incremented by 1 from previous segment number (new segment) segmentNum, skipped, err := c.wal.RecordUpdate(update.Update) @@ -191,7 +192,7 @@ Loop: } if err != nil || skipped || segmentNum == activeSegmentNum { - update.ResultCh <- err + processUpdateResult(update, err) continue } @@ -210,7 +211,7 @@ Loop: if nextCheckpointNum > prevSegmentNum { // Not enough segments for checkpointing - update.ResultCh <- nil + processUpdateResult(update, nil) continue } @@ -218,17 +219,23 @@ Loop: // Enough segments are created for checkpointing - // Get forest from ledger before sending WAL update result + // Get ledger snapshot before sending WAL update result. + // At this point, ledger snapshot contains tries up to + // last update (logged as last record in finalized segment) + // Ledger doesn't include new trie for this update + // until WAL result is sent back. tries, err := c.ledger.Tries() if err != nil { c.logger.Error().Err(err).Msg("compactor failed to get ledger tries") // Try again after active segment is finalized. nextCheckpointNum = activeSegmentNum + + processUpdateResult(update, nil) continue } - // Send WAL update result after ledger state snapshot is done. - update.ResultCh <- nil + // Send WAL update result after ledger state snapshot is taken. 
+ processUpdateResult(update, nil) // Try to checkpoint if checkpointSem.TryAcquire(1) { @@ -266,6 +273,17 @@ Loop: // Don't wait for checkpointing to finish because it might take too long. } +// processUpdateResult sends WAL update result using ResultCh channel +// and waits for signal from DoneCh channel. +// This ensures that WAL update and ledger state update are in sync. +func processUpdateResult(update *WALTrieUpdate, updateResult error) { + // Send result of WAL update + update.ResultCh <- updateResult + + // Wait for trie update to complete + <-update.DoneCh +} + func (c *Compactor) checkpoint(ctx context.Context, tries []*trie.MTrie, checkpointNum int) error { err := createCheckpoint(c.checkpointer, c.logger, tries, checkpointNum) diff --git a/ledger/complete/compactor_test.go b/ledger/complete/compactor_test.go index 8c0836c802b..5f5caf3c3af 100644 --- a/ledger/complete/compactor_test.go +++ b/ledger/complete/compactor_test.go @@ -426,3 +426,97 @@ func replaySegments( return nil } + +func TestCompactorConcurrency(t *testing.T) { + numInsPerStep := 2 + pathByteSize := 32 + minPayloadByteSize := 2 << 15 + maxPayloadByteSize := 2 << 16 + size := 10 + metricsCollector := &metrics.NoopCollector{} + checkpointDistance := uint(5) + checkpointsToKeep := uint(0) // keep all + + unittest.RunWithTempDir(t, func(dir string) { + + lastCheckpointNum := -1 + + rootHash := trie.EmptyTrieRootHash() + + // Create DiskWAL and Ledger repeatedly to test rebuilding ledger state at restart. + for i := 0; i < 3; i++ { + + wal, err := realWAL.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir, size*10, pathByteSize, 32*1024) + require.NoError(t, err) + + l, err := NewSyncLedger(wal, size*10, metricsCollector, zerolog.Logger{}, DefaultPathFinderVersion) + require.NoError(t, err) + + checkpointer, err := wal.NewCheckpointer() + require.NoError(t, err) + + // WAL segments are 32kB, so here we generate 2 keys 64kB each, times `size` + // so we should get at least `size` segments + + compactor, err := NewCompactor(l, wal, checkpointDistance, checkpointsToKeep, zerolog.Nop()) + require.NoError(t, err) + + fromBound := lastCheckpointNum + int(checkpointDistance)*2 + + co := CompactorObserver{fromBound: fromBound, done: make(chan struct{})} + compactor.Subscribe(&co) + + // Run Compactor in background. 
+ <-compactor.Ready() + + // Generate updates + var updates []*ledger.Update + for i := 0; i < size; i++ { + + payloads := utils.RandomPayloads(numInsPerStep, minPayloadByteSize, maxPayloadByteSize) + + keys := make([]ledger.Key, len(payloads)) + values := make([]ledger.Value, len(payloads)) + for i, p := range payloads { + keys[i] = p.Key + values[i] = p.Value + } + + update, err := ledger.NewUpdate(ledger.State(rootHash), keys, values) + require.NoError(t, err) + + updates = append(updates, update) + } + + // Generate the tree and create WAL in parallel + for _, update := range updates { + go func(update *ledger.Update) { + _, _, err := l.Set(update) + require.NoError(t, err) + }(update) + } + + // wait for the bound-checking observer to confirm checkpoints have been made + select { + case <-co.done: + // continue + case <-time.After(60 * time.Second): + assert.FailNow(t, "timed out") + } + + // Shutdown ledger and compactor + ledgerDone := l.Done() + compactorDone := compactor.Done() + + <-ledgerDone + <-compactorDone + + nums, err := checkpointer.Checkpoints() + require.NoError(t, err) + + for _, n := range nums { + testCheckpointedTriesMatchReplayedTriesFromSegments(t, checkpointer, n, dir) + } + } + }) +} diff --git a/ledger/complete/ledger.go b/ledger/complete/ledger.go index 4a1dbf2c8c3..ee2722fb064 100644 --- a/ledger/complete/ledger.go +++ b/ledger/complete/ledger.go @@ -248,50 +248,62 @@ func (l *Ledger) Set(update *ledger.Update) (newState ledger.State, trieUpdate * l.metrics.UpdateCount() - walChan := make(chan error) + newState, err = l.set(trieUpdate) + if err != nil { + return ledger.State(hash.DummyHash), nil, err + } + + // TODO update to proper value once https://github.com/onflow/flow-go/pull/3720 is merged + l.metrics.ForestApproxMemorySize(0) + + elapsed := time.Since(start) + l.metrics.UpdateDuration(elapsed) + + if len(trieUpdate.Paths) > 0 { + durationPerValue := time.Duration(elapsed.Nanoseconds() / int64(len(trieUpdate.Paths))) + l.metrics.UpdateDurationPerItem(durationPerValue) + } + + state := update.State() + l.logger.Info().Hex("from", state[:]). + Hex("to", newState[:]). + Int("update_size", update.Size()). 
+ Msg("ledger updated") + return newState, trieUpdate, nil +} + +func (l *Ledger) set(trieUpdate *ledger.TrieUpdate) (newState ledger.State, err error) { + + resultCh := make(chan error) if l.trieUpdateCh == nil { go func() { _, _, err := l.wal.RecordUpdate(trieUpdate) - walChan <- err + resultCh <- err }() } else { - l.trieUpdateCh <- &WALTrieUpdate{Update: trieUpdate, ResultCh: walChan} + doneCh := make(chan struct{}) + defer close(doneCh) + + l.trieUpdateCh <- &WALTrieUpdate{Update: trieUpdate, ResultCh: resultCh, DoneCh: doneCh} } newTrie, err := l.forest.NewTrie(trieUpdate) - walError := <-walChan + walError := <-resultCh if err != nil { - return ledger.State(hash.DummyHash), nil, fmt.Errorf("cannot update state: %w", err) + return ledger.State(hash.DummyHash), fmt.Errorf("cannot update state: %w", err) } if walError != nil { - return ledger.State(hash.DummyHash), nil, fmt.Errorf("error while writing LedgerWAL: %w", walError) + return ledger.State(hash.DummyHash), fmt.Errorf("error while writing LedgerWAL: %w", walError) } err = l.forest.AddTrie(newTrie) if err != nil { - return ledger.State(hash.DummyHash), nil, fmt.Errorf("adding updated trie to forest failed: %w", err) + return ledger.State(hash.DummyHash), fmt.Errorf("failed to add new trie to forest: %w", err) } - // TODO update to proper value once https://github.com/onflow/flow-go/pull/3720 is merged - l.metrics.ForestApproxMemorySize(0) - - elapsed := time.Since(start) - l.metrics.UpdateDuration(elapsed) - - if len(trieUpdate.Paths) > 0 { - durationPerValue := time.Duration(elapsed.Nanoseconds() / int64(len(trieUpdate.Paths))) - l.metrics.UpdateDurationPerItem(durationPerValue) - } - - state := update.State() - newRootHash := newTrie.RootHash() - l.logger.Info().Hex("from", state[:]). - Hex("to", newRootHash[:]). - Int("update_size", update.Size()). - Msg("ledger updated") - return ledger.State(newRootHash), trieUpdate, nil + return ledger.State(newTrie.RootHash()), nil } // Prove provides proofs for a ledger query and errors (if any). From 8e036c829f5c4e79f9ba65b98f980dd9bf4d18b8 Mon Sep 17 00:00:00 2001 From: Faye Amacker <33205765+fxamacker@users.noreply.github.com> Date: Thu, 21 Jul 2022 10:37:05 -0500 Subject: [PATCH 09/32] Make Compactor test less flaky --- ledger/complete/compactor_test.go | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/ledger/complete/compactor_test.go b/ledger/complete/compactor_test.go index 5f5caf3c3af..7d688abefcd 100644 --- a/ledger/complete/compactor_test.go +++ b/ledger/complete/compactor_test.go @@ -52,7 +52,7 @@ func TestCompactor(t *testing.T) { maxPayloadByteSize := 2 << 16 size := 10 metricsCollector := &metrics.NoopCollector{} - checkpointDistance := uint(2) + checkpointDistance := uint(3) checkpointsToKeep := uint(1) unittest.RunWithTempDir(t, func(dir string) { @@ -76,7 +76,7 @@ func TestCompactor(t *testing.T) { compactor, err := NewCompactor(l, wal, checkpointDistance, checkpointsToKeep, zerolog.Nop()) require.NoError(t, err) - co := CompactorObserver{fromBound: 9, done: make(chan struct{})} + co := CompactorObserver{fromBound: 8, done: make(chan struct{})} compactor.Subscribe(&co) // Run Compactor in background. 
@@ -137,7 +137,7 @@ func TestCompactor(t *testing.T) { from, to, err := checkpointer.NotCheckpointedSegments() require.NoError(t, err) - assert.True(t, from == 10 && to == 10, "from: %v, to: %v", from, to) // Make sure there is no leftover + assert.True(t, from == 9 && to == 10, "from: %v, to: %v", from, to) // Make sure there is no leftover require.NoFileExists(t, path.Join(dir, "checkpoint.00000000")) require.NoFileExists(t, path.Join(dir, "checkpoint.00000001")) @@ -147,8 +147,8 @@ func TestCompactor(t *testing.T) { require.NoFileExists(t, path.Join(dir, "checkpoint.00000005")) require.NoFileExists(t, path.Join(dir, "checkpoint.00000006")) require.NoFileExists(t, path.Join(dir, "checkpoint.00000007")) - require.NoFileExists(t, path.Join(dir, "checkpoint.00000008")) - require.FileExists(t, path.Join(dir, "checkpoint.00000009")) + require.FileExists(t, path.Join(dir, "checkpoint.00000008")) + require.NoFileExists(t, path.Join(dir, "checkpoint.00000009")) ledgerDone := l.Done() compactorDone := compactor.Done() @@ -169,7 +169,8 @@ func TestCompactor(t *testing.T) { name := fileInfo.Name() - if name != "checkpoint.00000009" && + if name != "checkpoint.00000008" && + name != "00000009" && name != "00000010" { err := os.Remove(path.Join(dir, name)) require.NoError(t, err) From 65f20620dc38ec35efce75e407a6ee241410c091 Mon Sep 17 00:00:00 2001 From: Faye Amacker <33205765+fxamacker@users.noreply.github.com> Date: Thu, 21 Jul 2022 17:51:12 -0500 Subject: [PATCH 10/32] Refactor trie update processing in Compactor --- ledger/complete/compactor.go | 142 ++++++++++++++++++++--------------- 1 file changed, 81 insertions(+), 61 deletions(-) diff --git a/ledger/complete/compactor.go b/ledger/complete/compactor.go index ac1e2a95fde..d1fde3a1fe2 100644 --- a/ledger/complete/compactor.go +++ b/ledger/complete/compactor.go @@ -177,79 +177,30 @@ Loop: continue } - // RecordUpdate returns the segment number the record was written to. - // Returned segment number (>= 0) can be - // - the same as previous segment number (same segment), or - // - incremented by 1 from previous segment number (new segment) - segmentNum, skipped, err := c.wal.RecordUpdate(update.Update) - - if activeSegmentNum == -1 { - // Recover from failure to get active segment number outside loop - activeSegmentNum = segmentNum - if activeSegmentNum > nextCheckpointNum { + var checkpointNum int + var checkpointTries []*trie.MTrie + activeSegmentNum, checkpointNum, checkpointTries = + c.processTrieUpdate(update, activeSegmentNum, nextCheckpointNum) + + if checkpointTries == nil { + // Don't checkpoint yet because + // - not enough segments for checkpointing (nextCheckpointNum >= activeSegmentNum), or + // - failed to get ledger state snapshop (nextCheckpointNum < activeSegmentNum) + if nextCheckpointNum < activeSegmentNum { nextCheckpointNum = activeSegmentNum } - } - - if err != nil || skipped || segmentNum == activeSegmentNum { - processUpdateResult(update, err) continue } - // In the remaining code: segmentNum > activeSegmentNum - - // active segment is finalized. 
- - // Check new segment number is incremented by 1 - if segmentNum != activeSegmentNum+1 { - c.logger.Error().Msg(fmt.Sprintf("compactor got unexpected new segment numer %d, want %d", segmentNum, activeSegmentNum+1)) - } - - // Update activeSegmentNum - prevSegmentNum := activeSegmentNum - activeSegmentNum = segmentNum - - if nextCheckpointNum > prevSegmentNum { - // Not enough segments for checkpointing - processUpdateResult(update, nil) - continue - } - - // In the remaining code: nextCheckpointNum == prevSegmentNum - - // Enough segments are created for checkpointing - - // Get ledger snapshot before sending WAL update result. - // At this point, ledger snapshot contains tries up to - // last update (logged as last record in finalized segment) - // Ledger doesn't include new trie for this update - // until WAL result is sent back. - tries, err := c.ledger.Tries() - if err != nil { - c.logger.Error().Err(err).Msg("compactor failed to get ledger tries") - // Try again after active segment is finalized. - nextCheckpointNum = activeSegmentNum - - processUpdateResult(update, nil) - continue - } - - // Send WAL update result after ledger state snapshot is taken. - processUpdateResult(update, nil) - // Try to checkpoint if checkpointSem.TryAcquire(1) { - checkpointNum := nextCheckpointNum - // Compute next checkpoint number - nextCheckpointNum += int(c.checkpointDistance) + nextCheckpointNum = checkpointNum + int(c.checkpointDistance) go func() { defer checkpointSem.Release(1) - - err := c.checkpoint(ctx, tries, checkpointNum) - + err := c.checkpoint(ctx, checkpointTries, checkpointNum) checkpointResultCh <- checkpointResult{checkpointNum, err} }() } else { @@ -371,3 +322,72 @@ func cleanupCheckpoints(checkpointer *realWAL.Checkpointer, checkpointsToKeep in } return nil } + +// processTrieUpdate writes trie update to WAL, updates activeSegmentNum, +// and takes snapshot of ledger state for checkpointing if needed. +// It also sends WAL update result and waits for trie update completion. +func (c *Compactor) processTrieUpdate( + update *WALTrieUpdate, + activeSegmentNum int, + nextCheckpointNum int, +) ( + _activeSegmentNum int, + checkpointNum int, + checkpointTries []*trie.MTrie, +) { + + // RecordUpdate returns the segment number the record was written to. + // Returned segment number (>= 0) can be + // - the same as previous segment number (same segment), or + // - incremented by 1 from previous segment number (new segment) + segmentNum, skipped, updateErr := c.wal.RecordUpdate(update.Update) + + // processUpdateResult must be called to ensure that ledger state update isn't blocked. + defer processUpdateResult(update, updateErr) + + if activeSegmentNum == -1 { + // Recover from failure to get active segment number at initialization. + return segmentNum, -1, nil + } + + if updateErr != nil || skipped || segmentNum == activeSegmentNum { + return activeSegmentNum, -1, nil + } + + // In the remaining code: segmentNum > activeSegmentNum + + // active segment is finalized. 
+ + // Check new segment number is incremented by 1 + if segmentNum != activeSegmentNum+1 { + c.logger.Error().Msg(fmt.Sprintf("compactor got unexpected new segment numer %d, want %d", segmentNum, activeSegmentNum+1)) + } + + // Update activeSegmentNum + prevSegmentNum := activeSegmentNum + activeSegmentNum = segmentNum + + if nextCheckpointNum > prevSegmentNum { + // Not enough segments for checkpointing + return activeSegmentNum, -1, nil + } + + // In the remaining code: nextCheckpointNum == prevSegmentNum + + // Enough segments are created for checkpointing + + // Get ledger snapshot before sending WAL update result. + // At this point, ledger snapshot contains tries up to + // last update (logged as last record in finalized segment) + // Ledger doesn't include new trie for this update + // until WAL result is sent back. + tries, err := c.ledger.Tries() + if err != nil { + c.logger.Error().Err(err).Msg("compactor failed to get ledger tries") + return activeSegmentNum, -1, nil + } + + checkpointNum = nextCheckpointNum + + return activeSegmentNum, checkpointNum, tries +} From b8797cc2e4564ffceae8cde1bb577cf2fd0863ba Mon Sep 17 00:00:00 2001 From: Faye Amacker <33205765+fxamacker@users.noreply.github.com> Date: Thu, 21 Jul 2022 21:13:32 -0500 Subject: [PATCH 11/32] Refactor Compactor shutdown Make Compactor shutdown use defer to close trieUpdateDoneCh. Although it was closing trieUpdateDoneCh, using defer is better. --- ledger/complete/compactor.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ledger/complete/compactor.go b/ledger/complete/compactor.go index d1fde3a1fe2..c93571b7510 100644 --- a/ledger/complete/compactor.go +++ b/ledger/complete/compactor.go @@ -95,6 +95,9 @@ func (c *Compactor) Ready() <-chan struct{} { func (c *Compactor) Done() <-chan struct{} { c.lm.OnStop(func() { + // Close trieUpdateDoneCh to signal trie updates are finished + defer close(c.trieUpdateDoneCh) + // Notify observers for observer := range c.observers { observer.OnComplete() @@ -106,9 +109,6 @@ func (c *Compactor) Done() <-chan struct{} { // Wait for Compactor goroutine to stop <-doneCh - - // Close trieUpdateDoneCh to signal trie updates are finished - close(c.trieUpdateDoneCh) }) return c.lm.Stopped() } From 6dddcf76e3a4672f5d8e7616f7378e2fd42fe6e1 Mon Sep 17 00:00:00 2001 From: Faye Amacker <33205765+fxamacker@users.noreply.github.com> Date: Sun, 24 Jul 2022 17:30:38 -0500 Subject: [PATCH 12/32] Use CheckpointQueue to store tries for checkpointing CheckpointQueue is a fix-sized FIFO queue. Compactor stores new tries in CheckpointQueue and retrieves all stored tries for checkpointing when needed. 
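A minimal usage sketch of that flow, using only the APIs added in this patch (the helper name and the capacity of 100 are illustrative, not part of the patch):

package complete

import "github.com/onflow/flow-go/ledger/complete/mtrie/trie"

// exampleCheckpointQueueUsage is an illustrative sketch: seed the queue with
// the tries already in the forest, push a newly updated trie (the oldest
// entry is overwritten once capacity is reached), and return the queued
// tries, oldest to newest, as the snapshot handed to the checkpointer.
func exampleCheckpointQueueUsage(initialTries []*trie.MTrie, newTrie *trie.MTrie) []*trie.MTrie {
	q := NewCheckpointQueueWithValues(100, initialTries) // capacity of 100 is illustrative
	q.Push(newTrie)
	return q.Tries()
}
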
--- cmd/execution_builder.go | 3 +- ledger/complete/checkpointqueue.go | 87 ++++++++++++++++ ledger/complete/checkpointqueue_test.go | 131 ++++++++++++++++++++++++ ledger/complete/compactor.go | 73 +++++++------ ledger/complete/compactor_test.go | 68 +++++++----- ledger/complete/ledger.go | 10 +- 6 files changed, 308 insertions(+), 64 deletions(-) create mode 100644 ledger/complete/checkpointqueue.go create mode 100644 ledger/complete/checkpointqueue_test.go diff --git a/cmd/execution_builder.go b/cmd/execution_builder.go index 6b9c1e30f36..3b1c9daaf2d 100644 --- a/cmd/execution_builder.go +++ b/cmd/execution_builder.go @@ -395,9 +395,10 @@ func (e *ExecutionNodeBuilder) LoadComponentsAndModules() { return ledger.NewCompactor( ledgerStorage, diskWAL, + node.Logger.With().Str("subcomponent", "checkpointer").Logger(), + uint(e.exeConf.mTrieCacheSize), e.exeConf.checkpointDistance, e.exeConf.checkpointsToKeep, - node.Logger.With().Str("subcomponent", "checkpointer").Logger(), ) }). Component("execution data service", func(node *NodeConfig) (module.ReadyDoneAware, error) { diff --git a/ledger/complete/checkpointqueue.go b/ledger/complete/checkpointqueue.go new file mode 100644 index 00000000000..f76998c8174 --- /dev/null +++ b/ledger/complete/checkpointqueue.go @@ -0,0 +1,87 @@ +package complete + +import ( + "github.com/onflow/flow-go/ledger/complete/mtrie/trie" +) + +// CheckpointQueue is a fix-sized FIFO queue of MTrie. +// CheckpointQueue is intentionally not threadsafe given its limited use case. +// CheckpointQueue is not a general purpose queue to avoid incurring overhead +// for features not needed for its limited use case. +type CheckpointQueue struct { + ts []*trie.MTrie + capacity int + tail int // element index to write to + count int // number of elements (count <= capacity) +} + +// NewCheckpointQueue returns a new CheckpointQueue with given capacity. +func NewCheckpointQueue(capacity uint) *CheckpointQueue { + return &CheckpointQueue{ + ts: make([]*trie.MTrie, capacity), + capacity: int(capacity), + } +} + +// NewCheckpointQueueWithValues returns a new CheckpointQueue with given capacity and initial values. +func NewCheckpointQueueWithValues(capacity uint, tries []*trie.MTrie) *CheckpointQueue { + q := NewCheckpointQueue(capacity) + + start := 0 + if len(tries) > q.capacity { + start = len(tries) - q.capacity + } + n := copy(q.ts, tries[start:]) + q.count = n + q.tail = q.count % q.capacity + + return q +} + +// Push pushes trie to queue. If queue is full, it overwrites the oldest element. +func (q *CheckpointQueue) Push(t *trie.MTrie) { + q.ts[q.tail] = t + q.tail = (q.tail + 1) % q.capacity + if !q.isFull() { + q.count++ + } +} + +// Tries returns elements in queue, starting from the oldest element +// to the newest element. +func (q *CheckpointQueue) Tries() []*trie.MTrie { + if q.count == 0 { + return nil + } + + tries := make([]*trie.MTrie, q.count) + + if q.isFull() { + // If queue is full, tail points to the oldest element. + head := q.tail + n := copy(tries, q.ts[head:]) + copy(tries[n:], q.ts[:q.tail]) + } else { + if q.tail >= q.count { // Data isn't wrapped around the slice. + head := q.tail - q.count + copy(tries, q.ts[head:q.tail]) + } else { // q.tail < q.count, data is wrapped around the slice. + // This branch isn't used until CheckpointQueue supports Pop (removing oldest element). + // At this time, there is no reason to implement Pop, so this branch is here to prevent future bug. 
+ head := q.capacity - q.count + q.tail + n := copy(tries, q.ts[head:]) + copy(tries[n:], q.ts[:q.tail]) + } + } + + return tries +} + +// Count returns element count. +func (q *CheckpointQueue) Count() int { + return q.count +} + +func (q *CheckpointQueue) isFull() bool { + return q.count == q.capacity +} diff --git a/ledger/complete/checkpointqueue_test.go b/ledger/complete/checkpointqueue_test.go new file mode 100644 index 00000000000..8ce75257927 --- /dev/null +++ b/ledger/complete/checkpointqueue_test.go @@ -0,0 +1,131 @@ +package complete + +import ( + "math/rand" + "testing" + + "github.com/onflow/flow-go/ledger" + "github.com/onflow/flow-go/ledger/common/hash" + "github.com/onflow/flow-go/ledger/complete/mtrie/node" + "github.com/onflow/flow-go/ledger/complete/mtrie/trie" + "github.com/stretchr/testify/require" +) + +func TestEmptyCheckpointQueue(t *testing.T) { + const capacity = 10 + + q := NewCheckpointQueue(capacity) + require.Equal(t, 0, q.Count()) + + tries := q.Tries() + require.Equal(t, 0, len(tries)) + require.Equal(t, 0, q.Count()) + + // savedTries contains all tries that are pushed to queue + var savedTries []*trie.MTrie + + // Push tries to queue to fill out capacity + for i := 0; i < capacity; i++ { + trie, err := randomMTrie() + require.NoError(t, err) + + q.Push(trie) + + savedTries = append(savedTries, trie) + + tr := q.Tries() + require.Equal(t, savedTries, tr) + require.Equal(t, len(savedTries), q.Count()) + } + + // Push more tries to queue to overwrite older elements + for i := 0; i < capacity; i++ { + trie, err := randomMTrie() + require.NoError(t, err) + + q.Push(trie) + + savedTries = append(savedTries, trie) + + tr := q.Tries() + require.Equal(t, capacity, len(tr)) + require.Equal(t, savedTries[len(savedTries)-capacity:], tr) + require.Equal(t, capacity, q.Count()) + } +} + +func TestCheckpointQueueWithInitialValues(t *testing.T) { + const capacity = 10 + + // Test CheckpointQueue with initial values. Numbers of initial values + // are from 1 to capacity + 1. + for initialValueCount := 1; initialValueCount <= capacity+1; initialValueCount++ { + + initialValues := make([]*trie.MTrie, initialValueCount) + for i := 0; i < len(initialValues); i++ { + tr, err := randomMTrie() + require.NoError(t, err) + initialValues[i] = tr + } + + expectedCount := initialValueCount + expectedTries := initialValues + if initialValueCount > capacity { + expectedCount = capacity + expectedTries = initialValues[initialValueCount-capacity:] + } + + q := NewCheckpointQueueWithValues(capacity, initialValues) + require.Equal(t, expectedCount, q.Count()) + + tries := q.Tries() + require.Equal(t, expectedTries, tries) + require.Equal(t, expectedCount, q.Count()) + + // savedTries contains all tries that are pushed to queue, including initial values. + savedTries := initialValues + + // Push tries to queue to fill out remaining capacity. + if initialValueCount < capacity { + for i := 0; i < capacity-initialValueCount; i++ { + trie, err := randomMTrie() + require.NoError(t, err) + + q.Push(trie) + + savedTries = append(savedTries, trie) + + tr := q.Tries() + require.Equal(t, savedTries, tr) + require.Equal(t, len(savedTries), q.Count()) + } + } + + // Push more tries to queue to overwrite older elements. 
+ for i := 0; i < capacity; i++ { + trie, err := randomMTrie() + require.NoError(t, err) + + q.Push(trie) + + savedTries = append(savedTries, trie) + + tr := q.Tries() + require.Equal(t, capacity, len(tr)) + require.Equal(t, savedTries[len(savedTries)-capacity:], tr) + require.Equal(t, capacity, q.Count()) + } + } +} + +func randomMTrie() (*trie.MTrie, error) { + var randomPath ledger.Path + rand.Read(randomPath[:]) + + var randomHashValue hash.Hash + rand.Read(randomHashValue[:]) + + root := node.NewNode(256, nil, nil, randomPath, nil, randomHashValue) + + return trie.NewMTrie(root, 1, 1) +} diff --git a/ledger/complete/compactor.go b/ledger/complete/compactor.go index c93571b7510..9c2e74f7a94 100644 --- a/ledger/complete/compactor.go +++ b/ledger/complete/compactor.go @@ -19,7 +19,7 @@ import ( type WALTrieUpdate struct { Update *ledger.TrieUpdate ResultCh chan<- error - DoneCh <-chan struct{} + TrieCh <-chan *trie.MTrie } type checkpointResult struct { @@ -31,6 +31,7 @@ type Compactor struct { checkpointer *realWAL.Checkpointer wal *realWAL.DiskWAL ledger *Ledger + checkpointQueue *CheckpointQueue logger zerolog.Logger stopCh chan chan struct{} trieUpdateCh <-chan *WALTrieUpdate @@ -41,7 +42,14 @@ type Compactor struct { checkpointsToKeep uint } -func NewCompactor(l *Ledger, w *realWAL.DiskWAL, checkpointDistance uint, checkpointsToKeep uint, logger zerolog.Logger) (*Compactor, error) { +func NewCompactor( + l *Ledger, + w *realWAL.DiskWAL, + logger zerolog.Logger, + checkpointCapacity uint, + checkpointDistance uint, + checkpointsToKeep uint, +) (*Compactor, error) { if checkpointDistance < 1 { checkpointDistance = 1 } @@ -61,10 +69,18 @@ func NewCompactor(l *Ledger, w *realWAL.DiskWAL, checkpointDistance uint, checkp return nil, errors.New("failed to get valid trie update done channel from ledger") } + tries, err := l.Tries() + if err != nil { + return nil, err + } + + checkpointQueue := NewCheckpointQueueWithValues(checkpointCapacity, tries) + return &Compactor{ checkpointer: checkpointer, wal: w, ledger: l, + checkpointQueue: checkpointQueue, logger: logger, stopCh: make(chan chan struct{}), trieUpdateCh: trieUpdateCh, @@ -180,15 +196,10 @@ Loop: var checkpointNum int var checkpointTries []*trie.MTrie activeSegmentNum, checkpointNum, checkpointTries = - c.processTrieUpdate(update, activeSegmentNum, nextCheckpointNum) + c.processTrieUpdate(update, c.checkpointQueue, activeSegmentNum, nextCheckpointNum) if checkpointTries == nil { - // Don't checkpoint yet because - // - not enough segments for checkpointing (nextCheckpointNum >= activeSegmentNum), or - // - failed to get ledger state snapshop (nextCheckpointNum < activeSegmentNum) - if nextCheckpointNum < activeSegmentNum { - nextCheckpointNum = activeSegmentNum - } + // Not enough segments for checkpointing (nextCheckpointNum >= activeSegmentNum) continue } @@ -224,17 +235,6 @@ Loop: // Don't wait for checkpointing to finish because it might take too long. } -// processUpdateResult sends WAL update result using ResultCh channel -// and waits for signal from DoneCh channel. -// This ensures that WAL update and ledger state update are in sync. 
-func processUpdateResult(update *WALTrieUpdate, updateResult error) { - // Send result of WAL update - update.ResultCh <- updateResult - - // Wait for trie update to complete - <-update.DoneCh -} - func (c *Compactor) checkpoint(ctx context.Context, tries []*trie.MTrie, checkpointNum int) error { err := createCheckpoint(c.checkpointer, c.logger, tries, checkpointNum) @@ -324,10 +324,11 @@ func cleanupCheckpoints(checkpointer *realWAL.Checkpointer, checkpointsToKeep in } // processTrieUpdate writes trie update to WAL, updates activeSegmentNum, -// and takes snapshot of ledger state for checkpointing if needed. +// and gets tries from checkpointQueue for checkpointing if needed. // It also sends WAL update result and waits for trie update completion. func (c *Compactor) processTrieUpdate( update *WALTrieUpdate, + checkpointQueue *CheckpointQueue, activeSegmentNum int, nextCheckpointNum int, ) ( @@ -342,8 +343,20 @@ func (c *Compactor) processTrieUpdate( // - incremented by 1 from previous segment number (new segment) segmentNum, skipped, updateErr := c.wal.RecordUpdate(update.Update) - // processUpdateResult must be called to ensure that ledger state update isn't blocked. - defer processUpdateResult(update, updateErr) + // This ensures that updated trie matches WAL update. + defer func(updateResult error) { + // Send result of WAL update + update.ResultCh <- updateResult + + // Wait for updated trie + trie := <-update.TrieCh + if trie == nil { + c.logger.Error().Msg("compactor failed to get updated trie") + return + } + + checkpointQueue.Push(trie) + }(updateErr) if activeSegmentNum == -1 { // Recover from failure to get active segment number at initialization. @@ -376,16 +389,12 @@ func (c *Compactor) processTrieUpdate( // Enough segments are created for checkpointing - // Get ledger snapshot before sending WAL update result. - // At this point, ledger snapshot contains tries up to + // Get tries from checkpoint queue. + // At this point, checkpoint queue contains tries up to // last update (logged as last record in finalized segment) - // Ledger doesn't include new trie for this update - // until WAL result is sent back. - tries, err := c.ledger.Tries() - if err != nil { - c.logger.Error().Err(err).Msg("compactor failed to get ledger tries") - return activeSegmentNum, -1, nil - } + // It doesn't include trie for this update + // until updated trie is received and added to checkpointQueue. 
+ tries := checkpointQueue.Tries() checkpointNum = nextCheckpointNum diff --git a/ledger/complete/compactor_test.go b/ledger/complete/compactor_test.go index 7d688abefcd..c7c08375e39 100644 --- a/ledger/complete/compactor_test.go +++ b/ledger/complete/compactor_test.go @@ -40,20 +40,25 @@ func (co *CompactorObserver) OnNext(val interface{}) { } } } + func (co *CompactorObserver) OnError(err error) {} func (co *CompactorObserver) OnComplete() { close(co.done) } func TestCompactor(t *testing.T) { - numInsPerStep := 2 - pathByteSize := 32 - minPayloadByteSize := 2 << 15 - maxPayloadByteSize := 2 << 16 - size := 10 + const ( + numInsPerStep = 2 + pathByteSize = 32 + minPayloadByteSize = 2 << 15 + maxPayloadByteSize = 2 << 16 + size = 10 + checkpointDistance = 3 + checkpointsToKeep = 1 + forestCapacity = size * 10 + ) + metricsCollector := &metrics.NoopCollector{} - checkpointDistance := uint(3) - checkpointsToKeep := uint(1) unittest.RunWithTempDir(t, func(dir string) { @@ -64,7 +69,7 @@ func TestCompactor(t *testing.T) { t.Run("creates checkpoints", func(t *testing.T) { - wal, err := realWAL.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir, size*10, pathByteSize, 32*1024) + wal, err := realWAL.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir, forestCapacity, pathByteSize, 32*1024) require.NoError(t, err) l, err = NewSyncLedger(wal, size*10, metricsCollector, zerolog.Logger{}, DefaultPathFinderVersion) @@ -73,7 +78,7 @@ func TestCompactor(t *testing.T) { // WAL segments are 32kB, so here we generate 2 keys 64kB each, times `size` // so we should get at least `size` segments - compactor, err := NewCompactor(l, wal, checkpointDistance, checkpointsToKeep, zerolog.Nop()) + compactor, err := NewCompactor(l, wal, zerolog.Nop(), forestCapacity, checkpointDistance, checkpointsToKeep) require.NoError(t, err) co := CompactorObserver{fromBound: 8, done: make(chan struct{})} @@ -246,14 +251,19 @@ func TestCompactor(t *testing.T) { // (from segment 0, ignoring prior checkpoints) to the checkpoint number. // This verifies that checkpointed tries are snapshopt of segments and at segment boundary. func TestCompactorAccuracy(t *testing.T) { - numInsPerStep := 2 - pathByteSize := 32 - minPayloadByteSize := 2 << 15 - maxPayloadByteSize := 2 << 16 - size := 10 + + const ( + numInsPerStep = 2 + pathByteSize = 32 + minPayloadByteSize = 2 << 15 + maxPayloadByteSize = 2 << 16 + size = 10 + checkpointDistance = 5 + checkpointsToKeep = 0 + forestCapacity = size * 10 + ) + metricsCollector := &metrics.NoopCollector{} - checkpointDistance := uint(5) - checkpointsToKeep := uint(0) unittest.RunWithTempDir(t, func(dir string) { @@ -264,7 +274,7 @@ func TestCompactorAccuracy(t *testing.T) { // Create DiskWAL and Ledger repeatedly to test rebuilding ledger state at restart. 
for i := 0; i < 3; i++ { - wal, err := realWAL.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir, size*10, pathByteSize, 32*1024) + wal, err := realWAL.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir, forestCapacity, pathByteSize, 32*1024) require.NoError(t, err) l, err := NewSyncLedger(wal, size*10, metricsCollector, zerolog.Logger{}, DefaultPathFinderVersion) @@ -276,7 +286,7 @@ func TestCompactorAccuracy(t *testing.T) { // WAL segments are 32kB, so here we generate 2 keys 64kB each, times `size` // so we should get at least `size` segments - compactor, err := NewCompactor(l, wal, checkpointDistance, checkpointsToKeep, zerolog.Nop()) + compactor, err := NewCompactor(l, wal, zerolog.Nop(), forestCapacity, checkpointDistance, checkpointsToKeep) require.NoError(t, err) fromBound := lastCheckpointNum + int(checkpointDistance)*2 @@ -429,14 +439,18 @@ func replaySegments( } func TestCompactorConcurrency(t *testing.T) { - numInsPerStep := 2 - pathByteSize := 32 - minPayloadByteSize := 2 << 15 - maxPayloadByteSize := 2 << 16 - size := 10 + const ( + numInsPerStep = 2 + pathByteSize = 32 + minPayloadByteSize = 2 << 15 + maxPayloadByteSize = 2 << 16 + size = 10 + checkpointDistance = 5 + checkpointsToKeep = 0 // keep all + forestCapacity = size * 10 + ) + metricsCollector := &metrics.NoopCollector{} - checkpointDistance := uint(5) - checkpointsToKeep := uint(0) // keep all unittest.RunWithTempDir(t, func(dir string) { @@ -447,7 +461,7 @@ func TestCompactorConcurrency(t *testing.T) { // Create DiskWAL and Ledger repeatedly to test rebuilding ledger state at restart. for i := 0; i < 3; i++ { - wal, err := realWAL.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir, size*10, pathByteSize, 32*1024) + wal, err := realWAL.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir, forestCapacity, pathByteSize, 32*1024) require.NoError(t, err) l, err := NewSyncLedger(wal, size*10, metricsCollector, zerolog.Logger{}, DefaultPathFinderVersion) @@ -459,7 +473,7 @@ func TestCompactorConcurrency(t *testing.T) { // WAL segments are 32kB, so here we generate 2 keys 64kB each, times `size` // so we should get at least `size` segments - compactor, err := NewCompactor(l, wal, checkpointDistance, checkpointsToKeep, zerolog.Nop()) + compactor, err := NewCompactor(l, wal, zerolog.Nop(), forestCapacity, checkpointDistance, checkpointsToKeep) require.NoError(t, err) fromBound := lastCheckpointNum + int(checkpointDistance)*2 diff --git a/ledger/complete/ledger.go b/ledger/complete/ledger.go index ee2722fb064..054979fae31 100644 --- a/ledger/complete/ledger.go +++ b/ledger/complete/ledger.go @@ -276,16 +276,16 @@ func (l *Ledger) set(trieUpdate *ledger.TrieUpdate) (newState ledger.State, err resultCh := make(chan error) + trieCh := make(chan *trie.MTrie, 1) + defer close(trieCh) + if l.trieUpdateCh == nil { go func() { _, _, err := l.wal.RecordUpdate(trieUpdate) resultCh <- err }() } else { - doneCh := make(chan struct{}) - defer close(doneCh) - - l.trieUpdateCh <- &WALTrieUpdate{Update: trieUpdate, ResultCh: resultCh, DoneCh: doneCh} + l.trieUpdateCh <- &WALTrieUpdate{Update: trieUpdate, ResultCh: resultCh, TrieCh: trieCh} } newTrie, err := l.forest.NewTrie(trieUpdate) @@ -303,6 +303,8 @@ func (l *Ledger) set(trieUpdate *ledger.TrieUpdate) (newState ledger.State, err return ledger.State(hash.DummyHash), fmt.Errorf("failed to add new trie to forest: %w", err) } + trieCh <- newTrie + return ledger.State(newTrie.RootHash()), nil } From ccf45718206e0ca7779dfab7c1db534f98a98ad2 Mon 
Sep 17 00:00:00 2001 From: Faye Amacker <33205765+fxamacker@users.noreply.github.com> Date: Sun, 24 Jul 2022 17:57:17 -0500 Subject: [PATCH 13/32] Fix lint error --- ledger/complete/checkpointqueue_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ledger/complete/checkpointqueue_test.go b/ledger/complete/checkpointqueue_test.go index 8ce75257927..add18e0cba9 100644 --- a/ledger/complete/checkpointqueue_test.go +++ b/ledger/complete/checkpointqueue_test.go @@ -4,11 +4,12 @@ import ( "math/rand" "testing" + "github.com/stretchr/testify/require" + "github.com/onflow/flow-go/ledger" "github.com/onflow/flow-go/ledger/common/hash" "github.com/onflow/flow-go/ledger/complete/mtrie/node" "github.com/onflow/flow-go/ledger/complete/mtrie/trie" - "github.com/stretchr/testify/require" ) func TestEmptyCheckpointQueue(t *testing.T) { From 91320b432aef95c4ad5f197196dee40d2640e889 Mon Sep 17 00:00:00 2001 From: Faye Amacker <33205765+fxamacker@users.noreply.github.com> Date: Mon, 25 Jul 2022 14:55:15 -0500 Subject: [PATCH 14/32] Remove anon goroutine updating WAL in Ledger.Set Remove using anonymous goroutine updating WAL in old implementation. Update tests to use compactor for WAL update. Move WAL component shutdown from Ledger to Compactor. Simplify shutdown communication between Ledger and Compactor. --- cmd/bootstrap/run/execution_state.go | 9 + cmd/execution_builder.go | 6 +- .../exec-data-json-export/ledger_exporter.go | 6 + .../execution_state_extract.go | 9 + .../execution_state_extract_test.go | 11 ++ .../execution_verification_test.go | 7 + engine/execution/computation/manager_test.go | 7 + .../state/bootstrap/bootstrap_test.go | 12 ++ engine/execution/state/state_test.go | 6 + engine/testutil/mock/nodes.go | 7 +- engine/testutil/nodes.go | 7 +- engine/verification/utils/unittest/fixture.go | 9 +- fvm/fvm_bench_test.go | 16 ++ ledger/complete/checkpoint_benchmark_test.go | 172 ----------------- ledger/complete/compactor.go | 33 ++-- ledger/complete/compactor_test.go | 8 +- ledger/complete/ledger.go | 51 +---- ledger/complete/ledger_benchmark_test.go | 56 ++++-- ledger/complete/ledger_test.go | 178 ++++++++++++++++-- ledger/complete/wal/checkpointer_test.go | 14 +- ledger/complete/wal/fixtures/noopcompactor.go | 48 +++++ ledger/partial/ledger_test.go | 12 +- module/chunks/chunkVerifier_test.go | 8 + 23 files changed, 404 insertions(+), 288 deletions(-) create mode 100644 ledger/complete/wal/fixtures/noopcompactor.go diff --git a/cmd/bootstrap/run/execution_state.go b/cmd/bootstrap/run/execution_state.go index 802b04a3ef0..34ee773c392 100644 --- a/cmd/bootstrap/run/execution_state.go +++ b/cmd/bootstrap/run/execution_state.go @@ -8,6 +8,7 @@ import ( "github.com/onflow/flow-go/engine/execution/state/bootstrap" "github.com/onflow/flow-go/fvm" "github.com/onflow/flow-go/ledger/common/pathfinder" + "github.com/onflow/flow-go/ledger/complete" ledger "github.com/onflow/flow-go/ledger/complete" "github.com/onflow/flow-go/ledger/complete/wal" "github.com/onflow/flow-go/model/flow" @@ -46,8 +47,16 @@ func GenerateExecutionState( if err != nil { return flow.DummyStateCommitment, err } + + compactor, err := complete.NewCompactor(ledgerStorage, diskWal, zerolog.Nop(), 100, 1_000_000, 1) + if err != nil { + return flow.DummyStateCommitment, err + } + <-compactor.Ready() + defer func() { <-ledgerStorage.Done() + <-compactor.Done() }() return bootstrap.NewBootstrapper( diff --git a/cmd/execution_builder.go b/cmd/execution_builder.go index 0fb5b799a4b..ec5994ee1cb 100644 --- 
a/cmd/execution_builder.go +++ b/cmd/execution_builder.go @@ -376,15 +376,15 @@ func (e *ExecutionNodeBuilder) LoadComponentsAndModules() { } } - // Ledger is responsible for starting and shutdowning DiskWAL component. - // This ensures that all WAL updates are completed before closing opened WAL segment. + // DiskWal is a dependent component because we need to ensure + // that all WAL updates are completed before closing opened WAL segment. diskWAL, err = wal.NewDiskWAL(node.Logger.With().Str("subcomponent", "wal").Logger(), node.MetricsRegisterer, collector, e.exeConf.triedir, int(e.exeConf.mTrieCacheSize), pathfinder.PathByteSize, wal.SegmentSize) if err != nil { return nil, fmt.Errorf("failed to initialize wal: %w", err) } - ledgerStorage, err = ledger.NewSyncLedger(diskWAL, int(e.exeConf.mTrieCacheSize), collector, node.Logger.With().Str("subcomponent", + ledgerStorage, err = ledger.NewLedger(diskWAL, int(e.exeConf.mTrieCacheSize), collector, node.Logger.With().Str("subcomponent", "ledger").Logger(), ledger.DefaultPathFinderVersion) return ledgerStorage, err }). diff --git a/cmd/util/cmd/exec-data-json-export/ledger_exporter.go b/cmd/util/cmd/exec-data-json-export/ledger_exporter.go index e97cd9f34a2..2e3b6b6cba6 100644 --- a/cmd/util/cmd/exec-data-json-export/ledger_exporter.go +++ b/cmd/util/cmd/exec-data-json-export/ledger_exporter.go @@ -28,8 +28,14 @@ func ExportLedger(ledgerPath string, targetstate string, outputPath string) erro if err != nil { return fmt.Errorf("cannot create ledger from write-a-head logs and checkpoints: %w", err) } + compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), complete.DefaultCacheSize, 1_000_000, 1) + if err != nil { + return fmt.Errorf("cannot create compactor: %w", err) + } + <-compactor.Ready() defer func() { <-led.Done() + <-compactor.Done() }() stateBytes, err := hex.DecodeString(targetstate) if err != nil { diff --git a/cmd/util/cmd/execution-state-extract/execution_state_extract.go b/cmd/util/cmd/execution-state-extract/execution_state_extract.go index 782ad63f1bb..22756be42cf 100644 --- a/cmd/util/cmd/execution-state-extract/execution_state_extract.go +++ b/cmd/util/cmd/execution-state-extract/execution_state_extract.go @@ -53,8 +53,17 @@ func extractExecutionState( if err != nil { return fmt.Errorf("cannot create ledger from write-a-head logs and checkpoints: %w", err) } + + compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), complete.DefaultCacheSize, 1_000_000, 1) + if err != nil { + return fmt.Errorf("cannot create compactor: %w", err) + } + + <-compactor.Ready() + defer func() { <-led.Done() + <-compactor.Done() }() var migrations []ledger.Migration diff --git a/cmd/util/cmd/execution-state-extract/execution_state_extract_test.go b/cmd/util/cmd/execution-state-extract/execution_state_extract_test.go index 7f05a6c09f9..61b882c3b6f 100644 --- a/cmd/util/cmd/execution-state-extract/execution_state_extract_test.go +++ b/cmd/util/cmd/execution-state-extract/execution_state_extract_test.go @@ -84,6 +84,9 @@ func TestExtractExecutionState(t *testing.T) { require.NoError(t, err) f, err := complete.NewLedger(diskWal, size*10, metr, zerolog.Nop(), complete.DefaultPathFinderVersion) require.NoError(t, err) + compactor, err := complete.NewCompactor(f, diskWal, zerolog.Nop(), uint(size), 1_000_000, 1) + require.NoError(t, err) + <-compactor.Ready() var stateCommitment = f.InitialState() @@ -121,6 +124,8 @@ func TestExtractExecutionState(t *testing.T) { } <-f.Done() + <-compactor.Done() + err = db.Close() 
require.NoError(t, err) @@ -151,6 +156,11 @@ func TestExtractExecutionState(t *testing.T) { storage, err := complete.NewLedger(diskWal, 1000, metr, zerolog.Nop(), complete.DefaultPathFinderVersion) require.NoError(t, err) + compactor, err := complete.NewCompactor(storage, diskWal, zerolog.Nop(), uint(size), 1_000_000, 1) + require.NoError(t, err) + + <-compactor.Ready() + data := keysValuesByCommit[string(stateCommitment[:])] keys := make([]ledger.Key, 0, len(data)) @@ -181,6 +191,7 @@ func TestExtractExecutionState(t *testing.T) { } <-storage.Done() + <-compactor.Done() }) } }) diff --git a/engine/execution/computation/execution_verification_test.go b/engine/execution/computation/execution_verification_test.go index 0e7443fad36..24901088874 100644 --- a/engine/execution/computation/execution_verification_test.go +++ b/engine/execution/computation/execution_verification_test.go @@ -643,6 +643,13 @@ func executeBlockAndVerifyWithParameters(t *testing.T, ledger, err := completeLedger.NewLedger(wal, 100, collector, logger, completeLedger.DefaultPathFinderVersion) require.NoError(t, err) + compactor := fixtures.NewNoopCompactor(ledger) + <-compactor.Ready() + defer func() { + <-ledger.Done() + <-compactor.Done() + }() + bootstrapper := bootstrapexec.NewBootstrapper(logger) initialCommit, err := bootstrapper.BootstrapLedger( diff --git a/engine/execution/computation/manager_test.go b/engine/execution/computation/manager_test.go index 765545da197..e2b0781bfd8 100644 --- a/engine/execution/computation/manager_test.go +++ b/engine/execution/computation/manager_test.go @@ -153,6 +153,13 @@ func TestComputeBlock_Uploader(t *testing.T) { ledger, err := complete.NewLedger(&fixtures.NoopWAL{}, 10, noopCollector, zerolog.Nop(), complete.DefaultPathFinderVersion) require.NoError(t, err) + compactor := fixtures.NewNoopCompactor(ledger) + <-compactor.Ready() + defer func() { + <-ledger.Done() + <-compactor.Done() + }() + me := new(module.Local) me.On("NodeID").Return(flow.ZeroID) diff --git a/engine/execution/state/bootstrap/bootstrap_test.go b/engine/execution/state/bootstrap/bootstrap_test.go index 25bc40f8a98..38c0cf3bfa5 100644 --- a/engine/execution/state/bootstrap/bootstrap_test.go +++ b/engine/execution/state/bootstrap/bootstrap_test.go @@ -25,6 +25,12 @@ func TestBootstrapLedger(t *testing.T) { wal := &fixtures.NoopWAL{} ls, err := completeLedger.NewLedger(wal, 100, metricsCollector, zerolog.Nop(), completeLedger.DefaultPathFinderVersion) require.NoError(t, err) + compactor := fixtures.NewNoopCompactor(ls) + <-compactor.Ready() + defer func() { + <-ls.Done() + <-compactor.Done() + }() stateCommitment, err := NewBootstrapper(zerolog.Nop()).BootstrapLedger( ls, @@ -59,6 +65,12 @@ func TestBootstrapLedger_ZeroTokenSupply(t *testing.T) { wal := &fixtures.NoopWAL{} ls, err := completeLedger.NewLedger(wal, 100, metricsCollector, zerolog.Nop(), completeLedger.DefaultPathFinderVersion) require.NoError(t, err) + compactor := fixtures.NewNoopCompactor(ls) + <-compactor.Ready() + defer func() { + <-ls.Done() + <-compactor.Done() + }() stateCommitment, err := NewBootstrapper(zerolog.Nop()).BootstrapLedger( ls, diff --git a/engine/execution/state/state_test.go b/engine/execution/state/state_test.go index a1aee4708e6..f797bd7a51c 100644 --- a/engine/execution/state/state_test.go +++ b/engine/execution/state/state_test.go @@ -31,6 +31,12 @@ func prepareTest(f func(t *testing.T, es state.ExecutionState, l *ledger.Ledger) diskWal := &fixtures.NoopWAL{} ls, err := ledger.NewLedger(diskWal, 100, metricsCollector, 
zerolog.Nop(), ledger.DefaultPathFinderVersion) require.NoError(t, err) + compactor := fixtures.NewNoopCompactor(ls) + <-compactor.Ready() + defer func() { + <-ls.Done() + <-compactor.Done() + }() ctrl := gomock.NewController(t) diff --git a/engine/testutil/mock/nodes.go b/engine/testutil/mock/nodes.go index dbb9f355942..ca6037c57f1 100644 --- a/engine/testutil/mock/nodes.go +++ b/engine/testutil/mock/nodes.go @@ -35,7 +35,7 @@ import ( "github.com/onflow/flow-go/fvm" fvmState "github.com/onflow/flow-go/fvm/state" "github.com/onflow/flow-go/ledger" - "github.com/onflow/flow-go/ledger/complete/wal" + "github.com/onflow/flow-go/ledger/complete" "github.com/onflow/flow-go/model/flow" "github.com/onflow/flow-go/module" "github.com/onflow/flow-go/module/finalizer/consensus" @@ -204,7 +204,7 @@ type ExecutionNode struct { ReceiptsEngine *executionprovider.Engine FollowerEngine *followereng.Engine SyncEngine *synchronization.Engine - DiskWAL *wal.DiskWAL + Compactor *complete.Compactor BadgerDB *badger.DB VM *fvm.VirtualMachine ExecutionState state.ExecutionState @@ -223,7 +223,6 @@ func (en ExecutionNode) Ready() { en.FollowerEngine, en.RequestEngine, en.SyncEngine, - en.DiskWAL, ) } @@ -236,7 +235,7 @@ func (en ExecutionNode) Done() { en.FollowerEngine, en.RequestEngine, en.SyncEngine, - en.DiskWAL, + en.Compactor, ) os.RemoveAll(en.LevelDbDir) en.GenericNode.Done() diff --git a/engine/testutil/nodes.go b/engine/testutil/nodes.go index 0c5e0cb1b47..3f7b2144a01 100644 --- a/engine/testutil/nodes.go +++ b/engine/testutil/nodes.go @@ -505,6 +505,11 @@ func ExecutionNode(t *testing.T, hub *stub.Hub, identity *flow.Identity, identit ls, err := completeLedger.NewLedger(diskWal, 100, metricsCollector, node.Log.With().Str("compontent", "ledger").Logger(), completeLedger.DefaultPathFinderVersion) require.NoError(t, err) + compactor, err := completeLedger.NewCompactor(ls, diskWal, zerolog.Nop(), 100, 1_000_000, 1) + require.NoError(t, err) + + <-compactor.Ready() // Need to start compactor here because BootstrapLedger() updates ledger state. 
+ genesisHead, err := node.State.Final().Head() require.NoError(t, err) @@ -666,7 +671,7 @@ func ExecutionNode(t *testing.T, hub *stub.Hub, identity *flow.Identity, identit Collections: collectionsStorage, Finalizer: finalizer, MyExecutionReceipts: myReceipts, - DiskWAL: diskWal, + Compactor: compactor, } } diff --git a/engine/verification/utils/unittest/fixture.go b/engine/verification/utils/unittest/fixture.go index c54e7dd5f3d..ce8e5bb2940 100644 --- a/engine/verification/utils/unittest/fixture.go +++ b/engine/verification/utils/unittest/fixture.go @@ -221,7 +221,14 @@ func ExecutionResultFixture(t *testing.T, chunkCount int, chain flow.Chain, refB led, err := completeLedger.NewLedger(w, 100, metricsCollector, zerolog.Nop(), completeLedger.DefaultPathFinderVersion) require.NoError(t, err) - defer led.Done() + + compactor := fixtures.NewNoopCompactor(led) + <-compactor.Ready() + + defer func() { + <-led.Done() + <-compactor.Done() + }() // set 0 clusters to pass n_collectors >= n_clusters check epochConfig := epochs.DefaultEpochConfig() diff --git a/fvm/fvm_bench_test.go b/fvm/fvm_bench_test.go index 23e72d6fd02..51edddd99be 100644 --- a/fvm/fvm_bench_test.go +++ b/fvm/fvm_bench_test.go @@ -128,6 +128,7 @@ type BasicBlockExecutor struct { activeStateCommitment flow.StateCommitment chain flow.Chain serviceAccount *TestBenchAccount + onStopFunc func() } func NewBasicBlockExecutor(tb testing.TB, chain flow.Chain, logger zerolog.Logger) *BasicBlockExecutor { @@ -149,6 +150,14 @@ func NewBasicBlockExecutor(tb testing.TB, chain flow.Chain, logger zerolog.Logge ledger, err := completeLedger.NewLedger(wal, 100, collector, logger, completeLedger.DefaultPathFinderVersion) require.NoError(tb, err) + compactor := fixtures.NewNoopCompactor(ledger) + <-compactor.Ready() + + onStopFunc := func() { + <-ledger.Done() + <-compactor.Done() + } + bootstrapper := bootstrapexec.NewBootstrapper(logger) serviceAccount := &TestBenchAccount{ @@ -182,6 +191,7 @@ func NewBasicBlockExecutor(tb testing.TB, chain flow.Chain, logger zerolog.Logge activeView: view, chain: chain, serviceAccount: serviceAccount, + onStopFunc: onStopFunc, } } @@ -327,6 +337,9 @@ func BenchmarkRuntimeTransaction(b *testing.B) { logger := zerolog.New(logE).Level(zerolog.DebugLevel) blockExecutor := NewBasicBlockExecutor(b, chain, logger) + defer func() { + blockExecutor.onStopFunc() + }() serviceAccount := blockExecutor.ServiceAccount(b) // Create an account private key. @@ -513,6 +526,9 @@ const TransferTxTemplate = ` // BenchmarkRuntimeNFTBatchTransfer runs BenchRunNFTBatchTransfer with BasicBlockExecutor func BenchmarkRuntimeNFTBatchTransfer(b *testing.B) { blockExecutor := NewBasicBlockExecutor(b, flow.Testnet.Chain(), zerolog.Nop()) + defer func() { + blockExecutor.onStopFunc() + }() // Create an account private key. 
privateKeys, err := testutil.GenerateAccountPrivateKeys(3) diff --git a/ledger/complete/checkpoint_benchmark_test.go b/ledger/complete/checkpoint_benchmark_test.go index 875ec0c1c0b..859b4d631cd 100644 --- a/ledger/complete/checkpoint_benchmark_test.go +++ b/ledger/complete/checkpoint_benchmark_test.go @@ -4,7 +4,6 @@ import ( "flag" "fmt" "io" - "math/rand" "os" "strconv" "strings" @@ -12,13 +11,9 @@ import ( "time" "github.com/rs/zerolog" - "github.com/stretchr/testify/require" "github.com/onflow/flow-go/ledger" - "github.com/onflow/flow-go/ledger/common/hash" "github.com/onflow/flow-go/ledger/common/pathfinder" - "github.com/onflow/flow-go/ledger/common/utils" - "github.com/onflow/flow-go/ledger/complete" "github.com/onflow/flow-go/ledger/complete/mtrie" "github.com/onflow/flow-go/ledger/complete/mtrie/trie" "github.com/onflow/flow-go/ledger/complete/wal" @@ -205,170 +200,3 @@ func hasCheckpointInDir(dir string) (bool, error) { return false, nil } - -func BenchmarkNewCheckpointRandom5Seg(b *testing.B) { benchmarkNewCheckpointRandomData(b, 5) } - -func BenchmarkNewCheckpointRandom10Seg(b *testing.B) { benchmarkNewCheckpointRandomData(b, 10) } - -func BenchmarkNewCheckpointRandom20Seg(b *testing.B) { benchmarkNewCheckpointRandomData(b, 20) } - -func BenchmarkNewCheckpointRandom30Seg(b *testing.B) { benchmarkNewCheckpointRandomData(b, 30) } - -func BenchmarkNewCheckpointRandom40Seg(b *testing.B) { benchmarkNewCheckpointRandomData(b, 40) } - -// benchmarkCheckpointCreate benchmarks checkpoint file creation. -// This benchmark creates segmentCount+1 WAL segments. It also creates two checkpoint files: -// - checkpoint file A from segment 0, and -// - checkpoint file B from checkpoint file A and all segments after segment 0. -// This benchmark measures the creation of checkpoint file B: -// - loading checkpoint file A -// - replaying all segments after segment 0 -// - creating checkpoint file B -// Because payload data is random, number of segments created can differ from segmentCount. -func benchmarkNewCheckpointRandomData(b *testing.B, segmentCount int) { - - const ( - updatePerSegment = 75 // 75 updates for 1 segment by approximation. - kvBatchCount = 500 // Each update has 500 new payloads. 
- ) - - if segmentCount < 1 { - segmentCount = 1 - } - - kvOpts := randKeyValueOptions{ - keyNumberOfParts: 3, - keyPartMinByteSize: 1, - keyPartMaxByteSize: 50, - valueMinByteSize: 50, - valueMaxByteSize: 1024 * 1.5, - } - updateCount := (segmentCount + 1) * updatePerSegment - - seed := uint64(0x9E3779B97F4A7C15) // golden ratio - rand.Seed(int64(seed)) - - dir, err := os.MkdirTemp("", "test-mtrie-") - defer os.RemoveAll(dir) - if err != nil { - b.Fatal(err) - } - - wal1, err := wal.NewDiskWAL( - zerolog.Nop(), - nil, - metrics.NewNoopCollector(), - dir, - 500, - pathfinder.PathByteSize, - wal.SegmentSize) - if err != nil { - b.Fatal(err) - } - - led, err := complete.NewLedger( - wal1, - 500, - &metrics.NoopCollector{}, - zerolog.Logger{}, - complete.DefaultPathFinderVersion, - ) - if err != nil { - b.Fatal(err) - } - - state := led.InitialState() - - _, err = updateLedgerWithRandomData(led, state, updateCount, kvBatchCount, kvOpts) - if err != nil { - b.Fatal(err) - } - - <-led.Done() - - wal2, err := wal.NewDiskWAL( - zerolog.Nop(), - nil, - metrics.NewNoopCollector(), - dir, - 500, - pathfinder.PathByteSize, - wal.SegmentSize, - ) - if err != nil { - b.Fatal(err) - } - - checkpointer, err := wal2.NewCheckpointer() - if err != nil { - b.Fatal(err) - } - - // Create checkpoint with only one segment as the base checkpoint for the next step. - err = checkpointer.Checkpoint(0, func() (io.WriteCloser, error) { - return checkpointer.CheckpointWriter(0) - }) - require.NoError(b, err) - - // Create checkpoint with remaining segments - _, to, err := wal2.Segments() - require.NoError(b, err) - - if to == 1 { - fmt.Printf("skip creating second checkpoint file because to segment is 1\n") - return - } - - start := time.Now() - b.ResetTimer() - - err = checkpointer.Checkpoint(to-1, func() (io.WriteCloser, error) { - return checkpointer.CheckpointWriter(to) - }) - - b.StopTimer() - elapsed := time.Since(start) - - if err != nil { - b.Fatal(err) - } - - b.ReportMetric(float64(elapsed/time.Millisecond), "newcheckpoint_rand_time_(ms)") - b.ReportAllocs() -} - -type randKeyValueOptions struct { - keyNumberOfParts int - keyPartMinByteSize int - keyPartMaxByteSize int - valueMinByteSize int - valueMaxByteSize int -} - -func updateLedgerWithRandomData( - led ledger.Ledger, - state ledger.State, - updateCount int, - kvBatchCount int, - kvOpts randKeyValueOptions, -) (ledger.State, error) { - - for i := 0; i < updateCount; i++ { - keys := utils.RandomUniqueKeys(kvBatchCount, kvOpts.keyNumberOfParts, kvOpts.keyPartMinByteSize, kvOpts.keyPartMaxByteSize) - values := utils.RandomValues(kvBatchCount, kvOpts.valueMinByteSize, kvOpts.valueMaxByteSize) - - update, err := ledger.NewUpdate(state, keys, values) - if err != nil { - return ledger.State(hash.DummyHash), err - } - - newState, _, err := led.Set(update) - if err != nil { - return ledger.State(hash.DummyHash), err - } - - state = newState - } - - return state, nil -} diff --git a/ledger/complete/compactor.go b/ledger/complete/compactor.go index 9c2e74f7a94..e45873e6905 100644 --- a/ledger/complete/compactor.go +++ b/ledger/complete/compactor.go @@ -29,22 +29,20 @@ type checkpointResult struct { type Compactor struct { checkpointer *realWAL.Checkpointer - wal *realWAL.DiskWAL - ledger *Ledger + wal realWAL.LedgerWAL checkpointQueue *CheckpointQueue logger zerolog.Logger - stopCh chan chan struct{} - trieUpdateCh <-chan *WALTrieUpdate - trieUpdateDoneCh chan<- struct{} lm *lifecycle.LifecycleManager observers map[observable.Observer]struct{} checkpointDistance 
uint checkpointsToKeep uint + stopCh chan chan struct{} + trieUpdateCh <-chan *WALTrieUpdate } func NewCompactor( l *Ledger, - w *realWAL.DiskWAL, + w realWAL.LedgerWAL, logger zerolog.Logger, checkpointCapacity uint, checkpointDistance uint, @@ -64,11 +62,6 @@ func NewCompactor( return nil, errors.New("failed to get valid trie update channel from ledger") } - trieUpdateDoneCh := l.TrieUpdateDoneChan() - if trieUpdateDoneCh == nil { - return nil, errors.New("failed to get valid trie update done channel from ledger") - } - tries, err := l.Tries() if err != nil { return nil, err @@ -79,12 +72,10 @@ func NewCompactor( return &Compactor{ checkpointer: checkpointer, wal: w, - ledger: l, checkpointQueue: checkpointQueue, logger: logger, stopCh: make(chan chan struct{}), trieUpdateCh: trieUpdateCh, - trieUpdateDoneCh: l.trieUpdateDoneCh, observers: make(map[observable.Observer]struct{}), lm: lifecycle.NewLifecycleManager(), checkpointDistance: checkpointDistance, @@ -111,20 +102,20 @@ func (c *Compactor) Ready() <-chan struct{} { func (c *Compactor) Done() <-chan struct{} { c.lm.OnStop(func() { - // Close trieUpdateDoneCh to signal trie updates are finished - defer close(c.trieUpdateDoneCh) - - // Notify observers - for observer := range c.observers { - observer.OnComplete() - } - // Signal Compactor goroutine to stop doneCh := make(chan struct{}) c.stopCh <- doneCh // Wait for Compactor goroutine to stop <-doneCh + + // Shut down WAL component. + <-c.wal.Done() + + // Notify observers + for observer := range c.observers { + observer.OnComplete() + } }) return c.lm.Stopped() } diff --git a/ledger/complete/compactor_test.go b/ledger/complete/compactor_test.go index c7c08375e39..f0de8f7d2ae 100644 --- a/ledger/complete/compactor_test.go +++ b/ledger/complete/compactor_test.go @@ -72,7 +72,7 @@ func TestCompactor(t *testing.T) { wal, err := realWAL.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir, forestCapacity, pathByteSize, 32*1024) require.NoError(t, err) - l, err = NewSyncLedger(wal, size*10, metricsCollector, zerolog.Logger{}, DefaultPathFinderVersion) + l, err = NewLedger(wal, size*10, metricsCollector, zerolog.Logger{}, DefaultPathFinderVersion) require.NoError(t, err) // WAL segments are 32kB, so here we generate 2 keys 64kB each, times `size` @@ -192,7 +192,7 @@ func TestCompactor(t *testing.T) { wal2, err := realWAL.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir, size*10, pathByteSize, 32*1024) require.NoError(t, err) - l2, err = NewSyncLedger(wal2, size*10, metricsCollector, zerolog.Logger{}, DefaultPathFinderVersion) + l2, err = NewLedger(wal2, size*10, metricsCollector, zerolog.Logger{}, DefaultPathFinderVersion) require.NoError(t, err) <-wal2.Done() @@ -277,7 +277,7 @@ func TestCompactorAccuracy(t *testing.T) { wal, err := realWAL.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir, forestCapacity, pathByteSize, 32*1024) require.NoError(t, err) - l, err := NewSyncLedger(wal, size*10, metricsCollector, zerolog.Logger{}, DefaultPathFinderVersion) + l, err := NewLedger(wal, size*10, metricsCollector, zerolog.Logger{}, DefaultPathFinderVersion) require.NoError(t, err) checkpointer, err := wal.NewCheckpointer() @@ -464,7 +464,7 @@ func TestCompactorConcurrency(t *testing.T) { wal, err := realWAL.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir, forestCapacity, pathByteSize, 32*1024) require.NoError(t, err) - l, err := NewSyncLedger(wal, size*10, metricsCollector, zerolog.Logger{}, DefaultPathFinderVersion) + l, err := NewLedger(wal, 
size*10, metricsCollector, zerolog.Logger{}, DefaultPathFinderVersion) require.NoError(t, err) checkpointer, err := wal.NewCheckpointer() diff --git a/ledger/complete/ledger.go b/ledger/complete/ledger.go index 054979fae31..2fd16f4a6e2 100644 --- a/ledger/complete/ledger.go +++ b/ledger/complete/ledger.go @@ -40,7 +40,6 @@ type Ledger struct { metrics module.LedgerMetrics logger zerolog.Logger trieUpdateCh chan *WALTrieUpdate - trieUpdateDoneCh chan struct{} pathFinderVersion uint8 } @@ -70,6 +69,7 @@ func NewLedger( metrics: metrics, logger: logger, pathFinderVersion: pathFinderVer, + trieUpdateCh: make(chan *WALTrieUpdate, defaultTrieUpdateChanSize), } // pause records to prevent double logging trie removals @@ -89,35 +89,12 @@ func NewLedger( return storage, nil } -func NewSyncLedger( - wal realWAL.LedgerWAL, - capacity int, - metrics module.LedgerMetrics, - log zerolog.Logger, - pathFinderVer uint8) (*Ledger, error) { - - l, err := NewLedger(wal, capacity, metrics, log, pathFinderVer) - if err != nil { - return nil, err - } - - l.trieUpdateCh = make(chan *WALTrieUpdate, defaultTrieUpdateChanSize) - l.trieUpdateDoneCh = make(chan struct{}) - return l, nil -} - // TrieUpdateChan returns a channel which is used to receive trie updates that needs to be logged in WALs. // This channel is closed when ledger component shutdowns down. func (l *Ledger) TrieUpdateChan() <-chan *WALTrieUpdate { return l.trieUpdateCh } -// TrieUpdateDoneChan returns a channel which is closed when there are no more WAL updates. -// This is used to signal that it is safe to shutdown WAL component (closing opened WAL segment). -func (l *Ledger) TrieUpdateDoneChan() chan<- struct{} { - return l.trieUpdateDoneCh -} - // Ready implements interface module.ReadyDoneAware // it starts the EventLoop's internal processing loop. func (l *Ledger) Ready() <-chan struct{} { @@ -135,17 +112,10 @@ func (l *Ledger) Done() <-chan struct{} { done := make(chan struct{}) go func() { defer close(done) - if l.trieUpdateCh != nil { - // Compactor is running in parallel. - // Ledger is responsible for closing trieUpdateCh channel, - // so Compactor can drain and process remaining updates. - close(l.trieUpdateCh) - - // Wait for Compactor to finish all trie updates before closing WAL component. - <-l.trieUpdateDoneCh - } - // Shut down WAL component. - <-l.wal.Done() + + // Ledger is responsible for closing trieUpdateCh channel, + // so Compactor can drain and process remaining updates. 
+ close(l.trieUpdateCh) }() return done } @@ -274,19 +244,12 @@ func (l *Ledger) Set(update *ledger.Update) (newState ledger.State, trieUpdate * func (l *Ledger) set(trieUpdate *ledger.TrieUpdate) (newState ledger.State, err error) { - resultCh := make(chan error) + resultCh := make(chan error, 1) trieCh := make(chan *trie.MTrie, 1) defer close(trieCh) - if l.trieUpdateCh == nil { - go func() { - _, _, err := l.wal.RecordUpdate(trieUpdate) - resultCh <- err - }() - } else { - l.trieUpdateCh <- &WALTrieUpdate{Update: trieUpdate, ResultCh: resultCh, TrieCh: trieCh} - } + l.trieUpdateCh <- &WALTrieUpdate{Update: trieUpdate, ResultCh: resultCh, TrieCh: trieCh} newTrie, err := l.forest.NewTrie(trieUpdate) walError := <-resultCh diff --git a/ledger/complete/ledger_benchmark_test.go b/ledger/complete/ledger_benchmark_test.go index c370fd68da7..52c0ce0c55a 100644 --- a/ledger/complete/ledger_benchmark_test.go +++ b/ledger/complete/ledger_benchmark_test.go @@ -45,12 +45,18 @@ func benchmarkStorage(steps int, b *testing.B) { require.NoError(b, err) led, err := complete.NewLedger(diskWal, steps+1, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) + require.NoError(b, err) + + compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), uint(steps+1), 1_000_000, 1) + require.NoError(b, err) + + <-compactor.Ready() + defer func() { <-led.Done() + <-compactor.Done() }() - if err != nil { - b.Fatal("can't create a new complete ledger") - } + totalUpdateTimeMS := 0 totalReadTimeMS := 0 totalProofTimeMS := 0 @@ -152,12 +158,17 @@ func BenchmarkTrieUpdate(b *testing.B) { require.NoError(b, err) led, err := complete.NewLedger(diskWal, 101, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) + require.NoError(b, err) + + compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), 101, 1_000_000, 1) + require.NoError(b, err) + + <-compactor.Ready() + defer func() { <-led.Done() + <-compactor.Done() }() - if err != nil { - b.Fatal("can't create a new complete ledger") - } state := led.InitialState() @@ -199,12 +210,17 @@ func BenchmarkTrieRead(b *testing.B) { require.NoError(b, err) led, err := complete.NewLedger(diskWal, 101, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) + require.NoError(b, err) + + compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), 101, 1_000_000, 1) + require.NoError(b, err) + + <-compactor.Ready() + defer func() { <-led.Done() + <-compactor.Done() }() - if err != nil { - b.Fatal("can't create a new complete ledger") - } state := led.InitialState() @@ -255,12 +271,17 @@ func BenchmarkLedgerGetOneValue(b *testing.B) { require.NoError(b, err) led, err := complete.NewLedger(diskWal, 101, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) + require.NoError(b, err) + + compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), 101, 1_000_000, 1) + require.NoError(b, err) + + <-compactor.Ready() + defer func() { <-led.Done() + <-compactor.Done() }() - if err != nil { - b.Fatal("can't create a new complete ledger") - } state := led.InitialState() @@ -328,12 +349,17 @@ func BenchmarkTrieProve(b *testing.B) { require.NoError(b, err) led, err := complete.NewLedger(diskWal, 101, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) + require.NoError(b, err) + + compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), 101, 1_000_000, 1) + require.NoError(b, err) + + <-compactor.Ready() + defer func() { 
<-led.Done() + <-compactor.Done() }() - if err != nil { - b.Fatal("can't create a new complete ledger") - } state := led.InitialState() diff --git a/ledger/complete/ledger_test.go b/ledger/complete/ledger_test.go index d9240a2c39f..7c30ccb08d4 100644 --- a/ledger/complete/ledger_test.go +++ b/ledger/complete/ledger_test.go @@ -41,6 +41,13 @@ func TestLedger_Update(t *testing.T) { l, err := complete.NewLedger(wal, 100, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) require.NoError(t, err) + compactor := fixtures.NewNoopCompactor(l) + <-compactor.Ready() + defer func() { + <-l.Done() + <-compactor.Done() + }() + // create empty update currentState := l.InitialState() up, err := ledger.NewEmptyUpdate(currentState) @@ -60,6 +67,13 @@ func TestLedger_Update(t *testing.T) { led, err := complete.NewLedger(wal, 100, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) require.NoError(t, err) + compactor := fixtures.NewNoopCompactor(led) + <-compactor.Ready() + defer func() { + <-led.Done() + <-compactor.Done() + }() + curSC := led.InitialState() u := utils.UpdateFixture() @@ -89,6 +103,13 @@ func TestLedger_Get(t *testing.T) { led, err := complete.NewLedger(wal, 100, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) require.NoError(t, err) + compactor := fixtures.NewNoopCompactor(led) + <-compactor.Ready() + defer func() { + <-led.Done() + <-compactor.Done() + }() + curSC := led.InitialState() q, err := ledger.NewEmptyQuery(curSC) require.NoError(t, err) @@ -105,6 +126,13 @@ func TestLedger_Get(t *testing.T) { led, err := complete.NewLedger(wal, 100, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) require.NoError(t, err) + compactor := fixtures.NewNoopCompactor(led) + <-compactor.Ready() + defer func() { + <-led.Done() + <-compactor.Done() + }() + curS := led.InitialState() q := utils.QueryFixture() @@ -133,6 +161,13 @@ func TestLedger_GetSingleValue(t *testing.T) { ) require.NoError(t, err) + compactor := fixtures.NewNoopCompactor(led) + <-compactor.Ready() + defer func() { + <-led.Done() + <-compactor.Done() + }() + state := led.InitialState() t.Run("non-existent key", func(t *testing.T) { @@ -223,6 +258,13 @@ func TestLedgerValueSizes(t *testing.T) { ) require.NoError(t, err) + compactor := fixtures.NewNoopCompactor(led) + <-compactor.Ready() + defer func() { + <-led.Done() + <-compactor.Done() + }() + curState := led.InitialState() q, err := ledger.NewEmptyQuery(curState) require.NoError(t, err) @@ -244,6 +286,13 @@ func TestLedgerValueSizes(t *testing.T) { ) require.NoError(t, err) + compactor := fixtures.NewNoopCompactor(led) + <-compactor.Ready() + defer func() { + <-led.Done() + <-compactor.Done() + }() + curState := led.InitialState() q := utils.QueryFixture() q.SetState(curState) @@ -268,6 +317,13 @@ func TestLedgerValueSizes(t *testing.T) { ) require.NoError(t, err) + compactor := fixtures.NewNoopCompactor(led) + <-compactor.Ready() + defer func() { + <-led.Done() + <-compactor.Done() + }() + curState := led.InitialState() u := utils.UpdateFixture() u.SetState(curState) @@ -299,6 +355,13 @@ func TestLedgerValueSizes(t *testing.T) { ) require.NoError(t, err) + compactor := fixtures.NewNoopCompactor(led) + <-compactor.Ready() + defer func() { + <-led.Done() + <-compactor.Done() + }() + curState := led.InitialState() u := utils.UpdateFixture() u.SetState(curState) @@ -343,6 +406,13 @@ func TestLedger_Proof(t *testing.T) { led, err := complete.NewLedger(wal, 100, 
&metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) require.NoError(t, err) + compactor := fixtures.NewNoopCompactor(led) + <-compactor.Ready() + defer func() { + <-led.Done() + <-compactor.Done() + }() + curSC := led.InitialState() q, err := ledger.NewEmptyQuery(curSC) require.NoError(t, err) @@ -362,6 +432,13 @@ func TestLedger_Proof(t *testing.T) { led, err := complete.NewLedger(wal, 100, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) require.NoError(t, err) + compactor := fixtures.NewNoopCompactor(led) + <-compactor.Ready() + defer func() { + <-led.Done() + <-compactor.Done() + }() + curS := led.InitialState() q := utils.QueryFixture() q.SetState(curS) @@ -383,6 +460,13 @@ func TestLedger_Proof(t *testing.T) { led, err := complete.NewLedger(wal, 100, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) require.NoError(t, err) + compactor := fixtures.NewNoopCompactor(led) + <-compactor.Ready() + defer func() { + <-led.Done() + <-compactor.Done() + }() + curS := led.InitialState() u := utils.UpdateFixture() @@ -424,6 +508,11 @@ func Test_WAL(t *testing.T) { led, err := complete.NewLedger(diskWal, size, metricsCollector, logger, complete.DefaultPathFinderVersion) require.NoError(t, err) + compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), uint(size), 1_000_000, 1) + require.NoError(t, err) + + <-compactor.Ready() + var state = led.InitialState() //saved data after updates @@ -449,6 +538,7 @@ func Test_WAL(t *testing.T) { } <-led.Done() + <-compactor.Done() diskWal2, err := wal.NewDiskWAL(zerolog.Nop(), nil, metricsCollector, dir, size, pathfinder.PathByteSize, wal.SegmentSize) require.NoError(t, err) @@ -456,6 +546,11 @@ func Test_WAL(t *testing.T) { led2, err := complete.NewLedger(diskWal2, size+10, metricsCollector, logger, complete.DefaultPathFinderVersion) require.NoError(t, err) + compactor2, err := complete.NewCompactor(led2, diskWal2, zerolog.Nop(), uint(size), 1_000_000, 1) + require.NoError(t, err) + + <-compactor2.Ready() + // random map iteration order is a benefit here for state, data := range savedData { @@ -485,6 +580,7 @@ func Test_WAL(t *testing.T) { assert.Equal(t, s, size) <-led2.Done() + <-compactor2.Done() }) } @@ -512,6 +608,10 @@ func TestLedgerFunctionality(t *testing.T) { require.NoError(t, err) led, err := complete.NewLedger(diskWal, activeTries, metricsCollector, logger, complete.DefaultPathFinderVersion) assert.NoError(t, err) + compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), uint(activeTries), 1_000_000, 1) + require.NoError(t, err) + <-compactor.Ready() + state := led.InitialState() for i := 0; i < steps; i++ { // add new keys @@ -594,6 +694,7 @@ func TestLedgerFunctionality(t *testing.T) { state = newState } <-led.Done() + <-compactor.Done() }) } } @@ -612,6 +713,9 @@ func Test_ExportCheckpointAt(t *testing.T) { require.NoError(t, err) led, err := complete.NewLedger(diskWal, 100, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) require.NoError(t, err) + compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), 100, 1_000_000, 1) + require.NoError(t, err) + <-compactor.Ready() state := led.InitialState() u := utils.UpdateFixture() @@ -628,6 +732,9 @@ func Test_ExportCheckpointAt(t *testing.T) { require.NoError(t, err) led2, err := complete.NewLedger(diskWal2, 100, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) require.NoError(t, err) + compactor2, err := 
complete.NewCompactor(led2, diskWal2, zerolog.Nop(), 100, 1_000_000, 1) + require.NoError(t, err) + <-compactor2.Ready() q, err := ledger.NewQuery(state, u.Keys()) require.NoError(t, err) @@ -640,7 +747,9 @@ func Test_ExportCheckpointAt(t *testing.T) { } <-led.Done() + <-compactor.Done() <-led2.Done() + <-compactor2.Done() }) }) }) @@ -656,6 +765,9 @@ func Test_ExportCheckpointAt(t *testing.T) { require.NoError(t, err) led, err := complete.NewLedger(diskWal, 100, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) require.NoError(t, err) + compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), 100, 1_000_000, 1) + require.NoError(t, err) + <-compactor.Ready() state := led.InitialState() u := utils.UpdateFixture() @@ -671,6 +783,9 @@ func Test_ExportCheckpointAt(t *testing.T) { require.NoError(t, err) led2, err := complete.NewLedger(diskWal2, 100, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) require.NoError(t, err) + compactor2, err := complete.NewCompactor(led2, diskWal2, zerolog.Nop(), 100, 1_000_000, 1) + require.NoError(t, err) + <-compactor2.Ready() q, err := ledger.NewQuery(newState, u.Keys()) require.NoError(t, err) @@ -682,7 +797,9 @@ func Test_ExportCheckpointAt(t *testing.T) { assert.Equal(t, retValues[1], ledger.Value([]byte{'B'})) <-led.Done() + <-compactor.Done() <-led2.Done() + <-compactor2.Done() }) }) }) @@ -698,6 +815,9 @@ func Test_ExportCheckpointAt(t *testing.T) { require.NoError(t, err) led, err := complete.NewLedger(diskWal, 100, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) require.NoError(t, err) + compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), 100, 1_000_000, 1) + require.NoError(t, err) + <-compactor.Ready() state := led.InitialState() u := utils.UpdateFixture() @@ -713,6 +833,9 @@ func Test_ExportCheckpointAt(t *testing.T) { require.NoError(t, err) led2, err := complete.NewLedger(diskWal2, 100, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) require.NoError(t, err) + compactor2, err := complete.NewCompactor(led2, diskWal2, zerolog.Nop(), 100, 1_000_000, 1) + require.NoError(t, err) + <-compactor2.Ready() q, err := ledger.NewQuery(newState, u.Keys()) require.NoError(t, err) @@ -724,34 +847,53 @@ func Test_ExportCheckpointAt(t *testing.T) { assert.Equal(t, retValues[1], ledger.Value([]byte{'B'})) <-led.Done() + <-compactor.Done() <-led2.Done() + <-compactor2.Done() }) }) }) } func TestWALUpdateFailuresBubbleUp(t *testing.T) { + unittest.RunWithTempDir(t, func(dir string) { + theError := fmt.Errorf("error error") - theError := fmt.Errorf("error error") + metricsCollector := &metrics.NoopCollector{} - w := &LongRunningDummyWAL{ - updateFn: func(update *ledger.TrieUpdate) (int, bool, error) { - return 0, false, theError - }, - } + diskWAL, err := wal.NewDiskWAL(zerolog.Nop(), nil, metricsCollector, dir, 100, pathfinder.PathByteSize, wal.SegmentSize) + require.NoError(t, err) - led, err := complete.NewLedger(w, 100, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) - require.NoError(t, err) + w := &CustomUpdateWAL{ + DiskWAL: diskWAL, + updateFn: func(update *ledger.TrieUpdate) (int, bool, error) { + return 0, false, theError + }, + } - key := ledger.NewKey([]ledger.KeyPart{ledger.NewKeyPart(0, []byte{1, 2, 3})}) + led, err := complete.NewLedger(w, 100, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) + require.NoError(t, err) - values := 
[]ledger.Value{[]byte{1, 2, 3}} - update, err := ledger.NewUpdate(led.InitialState(), []ledger.Key{key}, values) - require.NoError(t, err) + compactor, err := complete.NewCompactor(led, w, zerolog.Nop(), 100, 1_000_000, 1) + require.NoError(t, err) + + <-compactor.Ready() - _, _, err = led.Set(update) - require.Error(t, err) - require.True(t, errors.Is(err, theError)) + defer func() { + <-led.Done() + <-compactor.Done() + }() + + key := ledger.NewKey([]ledger.KeyPart{ledger.NewKeyPart(0, []byte{1, 2, 3})}) + + values := []ledger.Value{[]byte{1, 2, 3}} + update, err := ledger.NewUpdate(led.InitialState(), []ledger.Key{key}, values) + require.NoError(t, err) + + _, _, err = led.Set(update) + require.Error(t, err) + require.True(t, errors.Is(err, theError)) + }) } func valuesMatches(expected []ledger.Value, got []ledger.Value) bool { @@ -787,12 +929,12 @@ func migrationByValue(p []ledger.Payload) ([]ledger.Payload, error) { return ret, nil } -type LongRunningDummyWAL struct { - fixtures.NoopWAL +type CustomUpdateWAL struct { + *wal.DiskWAL updateFn func(update *ledger.TrieUpdate) (int, bool, error) } -func (w *LongRunningDummyWAL) RecordUpdate(update *ledger.TrieUpdate) (int, bool, error) { +func (w *CustomUpdateWAL) RecordUpdate(update *ledger.TrieUpdate) (int, bool, error) { return w.updateFn(update) } diff --git a/ledger/complete/wal/checkpointer_test.go b/ledger/complete/wal/checkpointer_test.go index 2b108236f32..af35f25a8cb 100644 --- a/ledger/complete/wal/checkpointer_test.go +++ b/ledger/complete/wal/checkpointer_test.go @@ -25,6 +25,7 @@ import ( "github.com/onflow/flow-go/ledger/complete/mtrie/trie" "github.com/onflow/flow-go/ledger/complete/wal" realWAL "github.com/onflow/flow-go/ledger/complete/wal" + "github.com/onflow/flow-go/ledger/complete/wal/fixtures" "github.com/onflow/flow-go/module/metrics" "github.com/onflow/flow-go/utils/unittest" ) @@ -53,6 +54,11 @@ func Test_WAL(t *testing.T) { led, err := complete.NewLedger(diskWal, size*10, metricsCollector, logger, complete.DefaultPathFinderVersion) require.NoError(t, err) + compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), uint(size), 1_000_000, 1) + require.NoError(t, err) + + <-compactor.Ready() + var state = led.InitialState() //saved data after updates @@ -70,8 +76,6 @@ func Test_WAL(t *testing.T) { state, _, err = led.Set(update) require.NoError(t, err) - fmt.Printf("Updated with %x\n", state) - data := make(map[string]ledger.Value, len(keys)) for j, key := range keys { data[string(encoding.EncodeKey(&key))] = values[j] @@ -81,11 +85,14 @@ func Test_WAL(t *testing.T) { } <-led.Done() + <-compactor.Done() diskWal2, err := realWAL.NewDiskWAL(zerolog.Nop(), nil, metricsCollector, dir, size, pathfinder.PathByteSize, realWAL.SegmentSize) require.NoError(t, err) led2, err := complete.NewLedger(diskWal2, (size*10)+10, metricsCollector, logger, complete.DefaultPathFinderVersion) require.NoError(t, err) + compactor2 := fixtures.NewNoopCompactor(led2) // noop compactor is used because no write is needed. 
+ <-compactor2.Ready() // random map iteration order is a benefit here for state, data := range savedData { @@ -97,8 +104,6 @@ func Test_WAL(t *testing.T) { keys = append(keys, *key) } - fmt.Printf("Querying with %x\n", state) - var ledgerState ledger.State copy(ledgerState[:], state) query, err := ledger.NewQuery(ledgerState, keys) @@ -112,6 +117,7 @@ func Test_WAL(t *testing.T) { } <-led2.Done() + <-compactor2.Done() }) } diff --git a/ledger/complete/wal/fixtures/noopcompactor.go b/ledger/complete/wal/fixtures/noopcompactor.go new file mode 100644 index 00000000000..403708dcb29 --- /dev/null +++ b/ledger/complete/wal/fixtures/noopcompactor.go @@ -0,0 +1,48 @@ +package fixtures + +import ( + "github.com/onflow/flow-go/ledger/complete" +) + +type NoopCompactor struct { + stopCh chan struct{} + trieUpdateCh <-chan *complete.WALTrieUpdate +} + +func NewNoopCompactor(l *complete.Ledger) *NoopCompactor { + return &NoopCompactor{ + stopCh: make(chan struct{}), + trieUpdateCh: l.TrieUpdateChan(), + } +} + +func (c *NoopCompactor) Ready() <-chan struct{} { + ch := make(chan struct{}) + close(ch) + go c.run() + return ch +} + +func (c *NoopCompactor) Done() <-chan struct{} { + close(c.stopCh) + return c.stopCh +} + +func (c *NoopCompactor) run() { + for { + select { + case <-c.stopCh: + return + case update, ok := <-c.trieUpdateCh: + if !ok { + continue + } + + // Send result of WAL update + update.ResultCh <- nil + + // Wait for trie update + <-update.TrieCh + } + } +} diff --git a/ledger/partial/ledger_test.go b/ledger/partial/ledger_test.go index 0e30ebb7565..31efda011d4 100644 --- a/ledger/partial/ledger_test.go +++ b/ledger/partial/ledger_test.go @@ -20,9 +20,19 @@ import ( func TestFunctionalityWithCompleteTrie(t *testing.T) { - l, err := complete.NewLedger(&fixtures.NoopWAL{}, 100, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) + w := &fixtures.NoopWAL{} + + l, err := complete.NewLedger(w, 100, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) require.NoError(t, err) + compactor := fixtures.NewNoopCompactor(l) + <-compactor.Ready() + + defer func() { + <-l.Done() + <-compactor.Done() + }() + // create empty update state := l.InitialState() keys := utils.RandomUniqueKeys(3, 2, 2, 4) diff --git a/module/chunks/chunkVerifier_test.go b/module/chunks/chunkVerifier_test.go index 79faf6a364b..02485724080 100644 --- a/module/chunks/chunkVerifier_test.go +++ b/module/chunks/chunkVerifier_test.go @@ -271,6 +271,14 @@ func GetBaselineVerifiableChunk(t *testing.T, script string, system bool) *verif f, _ := completeLedger.NewLedger(&fixtures.NoopWAL{}, 1000, metricsCollector, zerolog.Nop(), completeLedger.DefaultPathFinderVersion) + compactor := fixtures.NewNoopCompactor(f) + <-compactor.Ready() + + defer func() { + <-f.Done() + <-compactor.Done() + }() + keys := executionState.RegisterIDSToKeys(ids) update, err := ledger.NewUpdate( f.InitialState(), From 9b9d6f3bb96b3276d6442dcd4656ecfb3a392e0a Mon Sep 17 00:00:00 2001 From: Faye Amacker <33205765+fxamacker@users.noreply.github.com> Date: Tue, 26 Jul 2022 13:59:21 -0500 Subject: [PATCH 15/32] Add comments --- ledger/complete/checkpointqueue_test.go | 6 ++++++ ledger/complete/compactor.go | 20 +++++++++++++++++--- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/ledger/complete/checkpointqueue_test.go b/ledger/complete/checkpointqueue_test.go index add18e0cba9..7490232688a 100644 --- a/ledger/complete/checkpointqueue_test.go +++ b/ledger/complete/checkpointqueue_test.go @@ -50,6 
+50,12 @@ func TestEmptyCheckpointQueue(t *testing.T) { tr := q.Tries() require.Equal(t, capacity, len(tr)) + + // After queue reaches capacity in previous loop, + // queue overwrites older elements with new insertions, + // and element count is its capacity value. + // savedTries contains all elements inserted from previous loop and current loop, so + // tr (queue snapshot) matches the last C elements in savedTries (where C is capacity). require.Equal(t, savedTries[len(savedTries)-capacity:], tr) require.Equal(t, capacity, q.Count()) } diff --git a/ledger/complete/compactor.go b/ledger/complete/compactor.go index e45873e6905..ebac7740329 100644 --- a/ledger/complete/compactor.go +++ b/ledger/complete/compactor.go @@ -16,17 +16,31 @@ import ( "github.com/onflow/flow-go/module/observable" ) +// WALTrieUpdate is a message communicated through a channel between Ledger and Compactor. type WALTrieUpdate struct { - Update *ledger.TrieUpdate - ResultCh chan<- error - TrieCh <-chan *trie.MTrie + Update *ledger.TrieUpdate // Update data needs to be encoded and saved in WAL. + ResultCh chan<- error // ResultCh channel is used to send WAL update result from Compactor to Ledger. + TrieCh <-chan *trie.MTrie // TrieCh channel is used to send new trie from Ledger to Compactor. } +// checkpointResult is a message to communicate checkpointing number and error if any. type checkpointResult struct { num int err error } +// Compactor is a long-running goroutine responsible for: +// - writing WAL record from trie update, +// - starting checkpointing asynchronously when enough segments are finalized. +// +// Compactor communicates with Ledger through channels +// to ensure that by the end of any trie update processing, +// update is written to WAL and new trie is pushed to trie queue. +// +// Compactor stores pointers to tries in ledger state in a fixed-size +// checkpointing queue (FIFO). Checkpointing queue is decoupled from +// main ledger state to allow separate optimization, etc. +// NOTE: ledger state and checkpointing queue may contain different tries. type Compactor struct { checkpointer *realWAL.Checkpointer wal realWAL.LedgerWAL From f048d62d80f92f765a788822376009d686cd6e7d Mon Sep 17 00:00:00 2001 From: Faye Amacker <33205765+fxamacker@users.noreply.github.com> Date: Tue, 26 Jul 2022 17:12:18 -0500 Subject: [PATCH 16/32] Send WAL update result outside of defer --- ledger/complete/compactor.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ledger/complete/compactor.go b/ledger/complete/compactor.go index ebac7740329..a459798d1fd 100644 --- a/ledger/complete/compactor.go +++ b/ledger/complete/compactor.go @@ -348,11 +348,11 @@ func (c *Compactor) processTrieUpdate( // - incremented by 1 from previous segment number (new segment) segmentNum, skipped, updateErr := c.wal.RecordUpdate(update.Update) - // This ensures that updated trie matches WAL update. - defer func(updateResult error) { - // Send result of WAL update - update.ResultCh <- updateResult + // Send result of WAL update + update.ResultCh <- updateErr + // This ensures that updated trie matches WAL update. + defer func() { // Wait for updated trie trie := <-update.TrieCh if trie == nil { @@ -361,7 +361,7 @@ func (c *Compactor) processTrieUpdate( } checkpointQueue.Push(trie) - }(updateErr) + }()
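The two patches above first document and then tighten the handshake between Ledger and Compactor: the compactor writes the WAL record, reports the result on ResultCh right away, and only afterwards waits on TrieCh for the trie the ledger built from the same update, pushing it into the checkpoint queue. The following stand-alone Go sketch illustrates that two-channel protocol under simplified assumptions; walTrieUpdate, trieStub, and the in-memory queue are hypothetical stand-ins for illustration, not the flow-go types.

package main

import "fmt"

type trieStub struct{ root string }

type walTrieUpdate struct {
	update   string           // stands in for *ledger.TrieUpdate
	resultCh chan<- error     // WAL write result: compactor -> ledger
	trieCh   <-chan *trieStub // updated trie: ledger -> compactor
}

// compactor drains updates: "write" to the WAL, report the result first,
// then wait for the updated trie and push it into the checkpoint queue.
func compactor(updates <-chan *walTrieUpdate, queue chan<- *trieStub) {
	for u := range updates {
		var walErr error // pretend the WAL write succeeded
		u.resultCh <- walErr
		if t := <-u.trieCh; t != nil {
			queue <- t
		}
	}
	close(queue)
}

// ledgerSet mirrors the shape of Ledger.set: publish the update, build the
// new trie, check the WAL result, and only then release the trie.
func ledgerSet(updates chan<- *walTrieUpdate, payload string) (*trieStub, error) {
	resultCh := make(chan error, 1)
	trieCh := make(chan *trieStub, 1)
	defer close(trieCh)

	updates <- &walTrieUpdate{update: payload, resultCh: resultCh, trieCh: trieCh}

	newTrie := &trieStub{root: "hash(" + payload + ")"}
	if err := <-resultCh; err != nil {
		return nil, err // WAL failure bubbles up; trieCh is closed, compactor sees nil
	}
	trieCh <- newTrie
	return newTrie, nil
}

func main() {
	updates := make(chan *walTrieUpdate)
	queue := make(chan *trieStub, 8)
	go compactor(updates, queue)

	if t, err := ledgerSet(updates, "update-1"); err == nil {
		fmt.Println("new root:", t.root)
	}

	close(updates)
	for t := range queue {
		fmt.Println("queued for checkpoint:", t.root)
	}
}

As in the patched Ledger.set, both channels carry a buffer of one so neither side blocks once the other has already sent its message, and a nil trie (closed TrieCh) tells the compactor that the update failed and nothing should be queued.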
From c5455bd07435c7664565e5c169f6e1d601ecf827 Mon Sep 17 00:00:00 2001 From: Faye Amacker <33205765+fxamacker@users.noreply.github.com> Date: Tue, 26 Jul 2022 19:09:48 -0500 Subject: [PATCH 17/32] Refactor to use constants for default values --- cmd/bootstrap/run/execution_state.go | 14 ++- .../exec-data-json-export/ledger_exporter.go | 7 +- .../execution_state_extract.go | 7 +- .../execution_state_extract_test.go | 14 ++- engine/testutil/nodes.go | 12 +- ledger/complete/ledger_benchmark_test.go | 106 +++++++++++------- ledger/complete/ledger_test.go | 96 +++++++++++----- ledger/complete/wal/checkpointer_test.go | 17 ++- 8 files changed, 191 insertions(+), 82 deletions(-) diff --git a/cmd/bootstrap/run/execution_state.go b/cmd/bootstrap/run/execution_state.go index 34ee773c392..bde57a78866 100644 --- a/cmd/bootstrap/run/execution_state.go +++ b/cmd/bootstrap/run/execution_state.go @@ -1,6 +1,8 @@ package run import ( + "math" + "github.com/rs/zerolog" "github.com/onflow/flow-go/crypto" @@ -36,19 +38,25 @@ func GenerateExecutionState( chain flow.Chain, bootstrapOptions ...fvm.BootstrapProcedureOption, ) (flow.StateCommitment, error) { + const ( + capacity = 100 + checkpointDistance = math.MaxInt // A large number to prevent checkpoint creation. + checkpointsToKeep = 1 + ) + metricsCollector := &metrics.NoopCollector{} - diskWal, err := wal.NewDiskWAL(zerolog.Nop(), nil, metricsCollector, dbDir, 100, pathfinder.PathByteSize, wal.SegmentSize) + diskWal, err := wal.NewDiskWAL(zerolog.Nop(), nil, metricsCollector, dbDir, capacity, pathfinder.PathByteSize, wal.SegmentSize) if err != nil { return flow.DummyStateCommitment, err } - ledgerStorage, err := ledger.NewLedger(diskWal, 100, metricsCollector, zerolog.Nop(), ledger.DefaultPathFinderVersion) + ledgerStorage, err := ledger.NewLedger(diskWal, capacity, metricsCollector, zerolog.Nop(), ledger.DefaultPathFinderVersion) if err != nil { return flow.DummyStateCommitment, err } - compactor, err := complete.NewCompactor(ledgerStorage, diskWal, zerolog.Nop(), 100, 1_000_000, 1) + compactor, err := complete.NewCompactor(ledgerStorage, diskWal, zerolog.Nop(), capacity, checkpointDistance, checkpointsToKeep) if err != nil { return flow.DummyStateCommitment, err } diff --git a/cmd/util/cmd/exec-data-json-export/ledger_exporter.go b/cmd/util/cmd/exec-data-json-export/ledger_exporter.go index 2e3b6b6cba6..42340d13ca6 100644 --- a/cmd/util/cmd/exec-data-json-export/ledger_exporter.go +++ b/cmd/util/cmd/exec-data-json-export/ledger_exporter.go @@ -4,6 +4,7 @@ import ( "bufio" "encoding/hex" "fmt" + "math" "os" "path/filepath" @@ -19,6 +20,10 @@ import ( // ExportLedger exports ledger key value pairs at the given blockID func ExportLedger(ledgerPath string, targetstate string, outputPath string) error { + const ( + checkpointDistance = math.MaxInt // A large number to prevent checkpoint creation. 
+ checkpointsToKeep = 1 + ) diskWal, err := wal.NewDiskWAL(zerolog.Nop(), nil, &metrics.NoopCollector{}, ledgerPath, complete.DefaultCacheSize, pathfinder.PathByteSize, wal.SegmentSize) if err != nil { @@ -28,7 +33,7 @@ func ExportLedger(ledgerPath string, targetstate string, outputPath string) erro if err != nil { return fmt.Errorf("cannot create ledger from write-a-head logs and checkpoints: %w", err) } - compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), complete.DefaultCacheSize, 1_000_000, 1) + compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), complete.DefaultCacheSize, checkpointDistance, checkpointsToKeep) if err != nil { return fmt.Errorf("cannot create compactor: %w", err) } diff --git a/cmd/util/cmd/execution-state-extract/execution_state_extract.go b/cmd/util/cmd/execution-state-extract/execution_state_extract.go index 22756be42cf..678ffacadc8 100644 --- a/cmd/util/cmd/execution-state-extract/execution_state_extract.go +++ b/cmd/util/cmd/execution-state-extract/execution_state_extract.go @@ -2,6 +2,7 @@ package extract import ( "fmt" + "math" "github.com/rs/zerolog" @@ -54,7 +55,11 @@ func extractExecutionState( return fmt.Errorf("cannot create ledger from write-a-head logs and checkpoints: %w", err) } - compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), complete.DefaultCacheSize, 1_000_000, 1) + const ( + checkpointDistance = math.MaxInt // A large number to prevent checkpoint creation. + checkpointsToKeep = 1 + ) + compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), complete.DefaultCacheSize, checkpointDistance, checkpointsToKeep) if err != nil { return fmt.Errorf("cannot create compactor: %w", err) } diff --git a/cmd/util/cmd/execution-state-extract/execution_state_extract_test.go b/cmd/util/cmd/execution-state-extract/execution_state_extract_test.go index 61b882c3b6f..06b2832f72b 100644 --- a/cmd/util/cmd/execution-state-extract/execution_state_extract_test.go +++ b/cmd/util/cmd/execution-state-extract/execution_state_extract_test.go @@ -2,6 +2,7 @@ package extract import ( "crypto/rand" + "math" "testing" "github.com/rs/zerolog" @@ -74,6 +75,11 @@ func TestExtractExecutionState(t *testing.T) { withDirs(t, func(datadir, execdir, _ string) { + const ( + checkpointDistance = math.MaxInt // A large number to prevent checkpoint creation. + checkpointsToKeep = 1 + ) + db := common.InitStorage(datadir) commits := badger.NewCommits(metr, db) @@ -84,7 +90,7 @@ func TestExtractExecutionState(t *testing.T) { require.NoError(t, err) f, err := complete.NewLedger(diskWal, size*10, metr, zerolog.Nop(), complete.DefaultPathFinderVersion) require.NoError(t, err) - compactor, err := complete.NewCompactor(f, diskWal, zerolog.Nop(), uint(size), 1_000_000, 1) + compactor, err := complete.NewCompactor(f, diskWal, zerolog.Nop(), uint(size), checkpointDistance, checkpointsToKeep) require.NoError(t, err) <-compactor.Ready() @@ -156,7 +162,11 @@ func TestExtractExecutionState(t *testing.T) { storage, err := complete.NewLedger(diskWal, 1000, metr, zerolog.Nop(), complete.DefaultPathFinderVersion) require.NoError(t, err) - compactor, err := complete.NewCompactor(storage, diskWal, zerolog.Nop(), uint(size), 1_000_000, 1) + const ( + checkpointDistance = math.MaxInt // A large number to prevent checkpoint creation. 
+ checkpointsToKeep = 1 + ) + compactor, err := complete.NewCompactor(storage, diskWal, zerolog.Nop(), uint(size), checkpointDistance, checkpointsToKeep) require.NoError(t, err) <-compactor.Ready() diff --git a/engine/testutil/nodes.go b/engine/testutil/nodes.go index 3f7b2144a01..8aa39aa8f18 100644 --- a/engine/testutil/nodes.go +++ b/engine/testutil/nodes.go @@ -4,6 +4,7 @@ import ( "context" "encoding/json" "fmt" + "math" "path/filepath" "testing" "time" @@ -499,13 +500,18 @@ func ExecutionNode(t *testing.T, hub *stub.Hub, identity *flow.Identity, identit metricsCollector := &metrics.NoopCollector{} - diskWal, err := wal.NewDiskWAL(node.Log.With().Str("subcomponent", "wal").Logger(), nil, metricsCollector, dbDir, 100, pathfinder.PathByteSize, wal.SegmentSize) + const ( + capacity = 100 + checkpointDistance = math.MaxInt // A large number to prevent checkpoint creation. + checkpointsToKeep = 1 + ) + diskWal, err := wal.NewDiskWAL(node.Log.With().Str("subcomponent", "wal").Logger(), nil, metricsCollector, dbDir, capacity, pathfinder.PathByteSize, wal.SegmentSize) require.NoError(t, err) - ls, err := completeLedger.NewLedger(diskWal, 100, metricsCollector, node.Log.With().Str("compontent", "ledger").Logger(), completeLedger.DefaultPathFinderVersion) + ls, err := completeLedger.NewLedger(diskWal, capacity, metricsCollector, node.Log.With().Str("compontent", "ledger").Logger(), completeLedger.DefaultPathFinderVersion) require.NoError(t, err) - compactor, err := completeLedger.NewCompactor(ls, diskWal, zerolog.Nop(), 100, 1_000_000, 1) + compactor, err := completeLedger.NewCompactor(ls, diskWal, zerolog.Nop(), capacity, checkpointDistance, checkpointsToKeep) require.NoError(t, err) <-compactor.Ready() // Need to start compactor here because BootstrapLedger() updates ledger state. diff --git a/ledger/complete/ledger_benchmark_test.go b/ledger/complete/ledger_benchmark_test.go index 52c0ce0c55a..20ba93480e8 100644 --- a/ledger/complete/ledger_benchmark_test.go +++ b/ledger/complete/ledger_benchmark_test.go @@ -1,6 +1,7 @@ package complete_test import ( + "math" "math/rand" "os" "testing" @@ -28,11 +29,16 @@ func BenchmarkStorage(b *testing.B) { benchmarkStorage(100, b) } // BenchmarkStorage benchmarks the performance of the storage layer func benchmarkStorage(steps int, b *testing.B) { // assumption: 1000 key updates per collection - numInsPerStep := 1000 - keyNumberOfParts := 10 - keyPartMinByteSize := 1 - keyPartMaxByteSize := 100 - valueMaxByteSize := 32 + const ( + numInsPerStep = 1000 + keyNumberOfParts = 10 + keyPartMinByteSize = 1 + keyPartMaxByteSize = 100 + valueMaxByteSize = 32 + checkpointDistance = math.MaxInt // A large number to prevent checkpoint creation. 
+ checkpointsToKeep = 1 + ) + rand.Seed(time.Now().UnixNano()) dir, err := os.MkdirTemp("", "test-mtrie-") @@ -47,7 +53,7 @@ func benchmarkStorage(steps int, b *testing.B) { led, err := complete.NewLedger(diskWal, steps+1, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) require.NoError(b, err) - compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), uint(steps+1), 1_000_000, 1) + compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), uint(steps+1), checkpointDistance, checkpointsToKeep) require.NoError(b, err) <-compactor.Ready() @@ -141,11 +147,17 @@ func benchmarkStorage(steps int, b *testing.B) { // BenchmarkTrieUpdate benchmarks the performance of a trie update func BenchmarkTrieUpdate(b *testing.B) { // key updates per iteration - numInsPerStep := 10000 - keyNumberOfParts := 10 - keyPartMinByteSize := 1 - keyPartMaxByteSize := 100 - valueMaxByteSize := 32 + const ( + numInsPerStep = 10000 + keyNumberOfParts = 10 + keyPartMinByteSize = 1 + keyPartMaxByteSize = 100 + valueMaxByteSize = 32 + capacity = 101 + checkpointDistance = math.MaxInt // A large number to prevent checkpoint creation. + checkpointsToKeep = 1 + ) + rand.Seed(1) dir, err := os.MkdirTemp("", "test-mtrie-") @@ -154,13 +166,13 @@ func BenchmarkTrieUpdate(b *testing.B) { b.Fatal(err) } - diskWal, err := wal.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir, 101, pathfinder.PathByteSize, wal.SegmentSize) + diskWal, err := wal.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir, capacity, pathfinder.PathByteSize, wal.SegmentSize) require.NoError(b, err) - led, err := complete.NewLedger(diskWal, 101, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) + led, err := complete.NewLedger(diskWal, capacity, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) require.NoError(b, err) - compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), 101, 1_000_000, 1) + compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), capacity, checkpointDistance, checkpointsToKeep) require.NoError(b, err) <-compactor.Ready() @@ -193,11 +205,17 @@ func BenchmarkTrieUpdate(b *testing.B) { // BenchmarkTrieUpdate benchmarks the performance of a trie read func BenchmarkTrieRead(b *testing.B) { // key updates per iteration - numInsPerStep := 10000 - keyNumberOfParts := 10 - keyPartMinByteSize := 1 - keyPartMaxByteSize := 100 - valueMaxByteSize := 32 + const ( + numInsPerStep = 10000 + keyNumberOfParts = 10 + keyPartMinByteSize = 1 + keyPartMaxByteSize = 100 + valueMaxByteSize = 32 + capacity = 101 + checkpointDistance = math.MaxInt // A large number to prevent checkpoint creation. 
+ checkpointsToKeep = 1 + ) + rand.Seed(1) dir, err := os.MkdirTemp("", "test-mtrie-") @@ -206,13 +224,13 @@ func BenchmarkTrieRead(b *testing.B) { b.Fatal(err) } - diskWal, err := wal.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir, 101, pathfinder.PathByteSize, wal.SegmentSize) + diskWal, err := wal.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir, capacity, pathfinder.PathByteSize, wal.SegmentSize) require.NoError(b, err) - led, err := complete.NewLedger(diskWal, 101, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) + led, err := complete.NewLedger(diskWal, capacity, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) require.NoError(b, err) - compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), 101, 1_000_000, 1) + compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), capacity, checkpointDistance, checkpointsToKeep) require.NoError(b, err) <-compactor.Ready() @@ -254,11 +272,17 @@ func BenchmarkTrieRead(b *testing.B) { func BenchmarkLedgerGetOneValue(b *testing.B) { // key updates per iteration - numInsPerStep := 10000 - keyNumberOfParts := 10 - keyPartMinByteSize := 1 - keyPartMaxByteSize := 100 - valueMaxByteSize := 32 + const ( + numInsPerStep = 10000 + keyNumberOfParts = 10 + keyPartMinByteSize = 1 + keyPartMaxByteSize = 100 + valueMaxByteSize = 32 + capacity = 101 + checkpointDistance = math.MaxInt // A large number to prevent checkpoint creation. + checkpointsToKeep = 1 + ) + rand.Seed(1) dir, err := os.MkdirTemp("", "test-mtrie-") @@ -267,13 +291,13 @@ func BenchmarkLedgerGetOneValue(b *testing.B) { b.Fatal(err) } - diskWal, err := wal.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir, 101, pathfinder.PathByteSize, wal.SegmentSize) + diskWal, err := wal.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir, capacity, pathfinder.PathByteSize, wal.SegmentSize) require.NoError(b, err) - led, err := complete.NewLedger(diskWal, 101, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) + led, err := complete.NewLedger(diskWal, capacity, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) require.NoError(b, err) - compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), 101, 1_000_000, 1) + compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), capacity, checkpointDistance, checkpointsToKeep) require.NoError(b, err) <-compactor.Ready() @@ -332,11 +356,17 @@ func BenchmarkLedgerGetOneValue(b *testing.B) { // BenchmarkTrieUpdate benchmarks the performance of a trie prove func BenchmarkTrieProve(b *testing.B) { // key updates per iteration - numInsPerStep := 10000 - keyNumberOfParts := 10 - keyPartMinByteSize := 1 - keyPartMaxByteSize := 100 - valueMaxByteSize := 32 + const ( + numInsPerStep = 10000 + keyNumberOfParts = 10 + keyPartMinByteSize = 1 + keyPartMaxByteSize = 100 + valueMaxByteSize = 32 + capacity = 101 + checkpointDistance = math.MaxInt // A large number to prevent checkpoint creation. 
+ checkpointsToKeep = 1 + ) + rand.Seed(1) dir, err := os.MkdirTemp("", "test-mtrie-") @@ -345,13 +375,13 @@ func BenchmarkTrieProve(b *testing.B) { b.Fatal(err) } - diskWal, err := wal.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir, 101, pathfinder.PathByteSize, wal.SegmentSize) + diskWal, err := wal.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir, capacity, pathfinder.PathByteSize, wal.SegmentSize) require.NoError(b, err) - led, err := complete.NewLedger(diskWal, 101, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) + led, err := complete.NewLedger(diskWal, capacity, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) require.NoError(b, err) - compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), 101, 1_000_000, 1) + compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), capacity, checkpointDistance, checkpointsToKeep) require.NoError(b, err) <-compactor.Ready() diff --git a/ledger/complete/ledger_test.go b/ledger/complete/ledger_test.go index 7c30ccb08d4..408c61e6a6d 100644 --- a/ledger/complete/ledger_test.go +++ b/ledger/complete/ledger_test.go @@ -4,6 +4,7 @@ import ( "bytes" "errors" "fmt" + "math" "math/rand" "testing" "time" @@ -490,12 +491,17 @@ func TestLedger_Proof(t *testing.T) { } func Test_WAL(t *testing.T) { - numInsPerStep := 2 - keyNumberOfParts := 10 - keyPartMinByteSize := 1 - keyPartMaxByteSize := 100 - valueMaxByteSize := 2 << 16 //16kB - size := 10 + const ( + numInsPerStep = 2 + keyNumberOfParts = 10 + keyPartMinByteSize = 1 + keyPartMaxByteSize = 100 + valueMaxByteSize = 2 << 16 //16kB + size = 10 + checkpointDistance = math.MaxInt // A large number to prevent checkpoint creation. + checkpointsToKeep = 1 + ) + metricsCollector := &metrics.NoopCollector{} logger := zerolog.Logger{} @@ -508,7 +514,7 @@ func Test_WAL(t *testing.T) { led, err := complete.NewLedger(diskWal, size, metricsCollector, logger, complete.DefaultPathFinderVersion) require.NoError(t, err) - compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), uint(size), 1_000_000, 1) + compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), size, checkpointDistance, checkpointsToKeep) require.NoError(t, err) <-compactor.Ready() @@ -546,7 +552,7 @@ func Test_WAL(t *testing.T) { led2, err := complete.NewLedger(diskWal2, size+10, metricsCollector, logger, complete.DefaultPathFinderVersion) require.NoError(t, err) - compactor2, err := complete.NewCompactor(led2, diskWal2, zerolog.Nop(), uint(size), 1_000_000, 1) + compactor2, err := complete.NewCompactor(led2, diskWal2, zerolog.Nop(), uint(size), checkpointDistance, checkpointsToKeep) require.NoError(t, err) <-compactor2.Ready() @@ -585,6 +591,11 @@ func Test_WAL(t *testing.T) { } func TestLedgerFunctionality(t *testing.T) { + const ( + checkpointDistance = math.MaxInt // A large number to prevent checkpoint creation. 
+ checkpointsToKeep = 1 + ) + rand.Seed(time.Now().UnixNano()) // You can manually increase this for more coverage experimentRep := 2 @@ -608,7 +619,7 @@ func TestLedgerFunctionality(t *testing.T) { require.NoError(t, err) led, err := complete.NewLedger(diskWal, activeTries, metricsCollector, logger, complete.DefaultPathFinderVersion) assert.NoError(t, err) - compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), uint(activeTries), 1_000_000, 1) + compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), uint(activeTries), checkpointDistance, checkpointsToKeep) require.NoError(t, err) <-compactor.Ready() @@ -709,11 +720,17 @@ func Test_ExportCheckpointAt(t *testing.T) { unittest.RunWithTempDir(t, func(dbDir string) { unittest.RunWithTempDir(t, func(dir2 string) { - diskWal, err := wal.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dbDir, 100, pathfinder.PathByteSize, wal.SegmentSize) + const ( + capacity = 100 + checkpointDistance = math.MaxInt // A large number to prevent checkpoint creation. + checkpointsToKeep = 1 + ) + + diskWal, err := wal.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dbDir, capacity, pathfinder.PathByteSize, wal.SegmentSize) require.NoError(t, err) - led, err := complete.NewLedger(diskWal, 100, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) + led, err := complete.NewLedger(diskWal, capacity, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) require.NoError(t, err) - compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), 100, 1_000_000, 1) + compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), capacity, checkpointDistance, checkpointsToKeep) require.NoError(t, err) <-compactor.Ready() @@ -728,11 +745,11 @@ func Test_ExportCheckpointAt(t *testing.T) { require.NoError(t, err) assert.Equal(t, newState, state) - diskWal2, err := wal.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir2, 100, pathfinder.PathByteSize, wal.SegmentSize) + diskWal2, err := wal.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir2, capacity, pathfinder.PathByteSize, wal.SegmentSize) require.NoError(t, err) - led2, err := complete.NewLedger(diskWal2, 100, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) + led2, err := complete.NewLedger(diskWal2, capacity, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) require.NoError(t, err) - compactor2, err := complete.NewCompactor(led2, diskWal2, zerolog.Nop(), 100, 1_000_000, 1) + compactor2, err := complete.NewCompactor(led2, diskWal2, zerolog.Nop(), capacity, checkpointDistance, checkpointsToKeep) require.NoError(t, err) <-compactor2.Ready() @@ -761,11 +778,17 @@ func Test_ExportCheckpointAt(t *testing.T) { unittest.RunWithTempDir(t, func(dbDir string) { unittest.RunWithTempDir(t, func(dir2 string) { - diskWal, err := wal.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dbDir, 100, pathfinder.PathByteSize, wal.SegmentSize) + const ( + capacity = 100 + checkpointDistance = math.MaxInt // A large number to prevent checkpoint creation. 
+ checkpointsToKeep = 1 + ) + + diskWal, err := wal.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dbDir, capacity, pathfinder.PathByteSize, wal.SegmentSize) require.NoError(t, err) - led, err := complete.NewLedger(diskWal, 100, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) + led, err := complete.NewLedger(diskWal, capacity, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) require.NoError(t, err) - compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), 100, 1_000_000, 1) + compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), capacity, checkpointDistance, checkpointsToKeep) require.NoError(t, err) <-compactor.Ready() @@ -779,11 +802,11 @@ func Test_ExportCheckpointAt(t *testing.T) { newState, err := led.ExportCheckpointAt(state, []ledger.Migration{migrationByValue}, []ledger.Reporter{}, []ledger.Reporter{}, complete.DefaultPathFinderVersion, dir2, "root.checkpoint") require.NoError(t, err) - diskWal2, err := wal.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir2, 100, pathfinder.PathByteSize, wal.SegmentSize) + diskWal2, err := wal.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir2, capacity, pathfinder.PathByteSize, wal.SegmentSize) require.NoError(t, err) - led2, err := complete.NewLedger(diskWal2, 100, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) + led2, err := complete.NewLedger(diskWal2, capacity, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) require.NoError(t, err) - compactor2, err := complete.NewCompactor(led2, diskWal2, zerolog.Nop(), 100, 1_000_000, 1) + compactor2, err := complete.NewCompactor(led2, diskWal2, zerolog.Nop(), capacity, checkpointDistance, checkpointsToKeep) require.NoError(t, err) <-compactor2.Ready() @@ -811,11 +834,17 @@ func Test_ExportCheckpointAt(t *testing.T) { unittest.RunWithTempDir(t, func(dbDir string) { unittest.RunWithTempDir(t, func(dir2 string) { - diskWal, err := wal.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dbDir, 100, pathfinder.PathByteSize, wal.SegmentSize) + const ( + capacity = 100 + checkpointDistance = math.MaxInt // A large number to prevent checkpoint creation. 
+ checkpointsToKeep = 1 + ) + + diskWal, err := wal.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dbDir, capacity, pathfinder.PathByteSize, wal.SegmentSize) require.NoError(t, err) - led, err := complete.NewLedger(diskWal, 100, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) + led, err := complete.NewLedger(diskWal, capacity, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) require.NoError(t, err) - compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), 100, 1_000_000, 1) + compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), capacity, checkpointDistance, checkpointsToKeep) require.NoError(t, err) <-compactor.Ready() @@ -829,11 +858,11 @@ func Test_ExportCheckpointAt(t *testing.T) { newState, err := led.ExportCheckpointAt(state, []ledger.Migration{migrationByKey}, []ledger.Reporter{}, []ledger.Reporter{}, complete.DefaultPathFinderVersion, dir2, "root.checkpoint") require.NoError(t, err) - diskWal2, err := wal.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir2, 100, pathfinder.PathByteSize, wal.SegmentSize) + diskWal2, err := wal.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir2, capacity, pathfinder.PathByteSize, wal.SegmentSize) require.NoError(t, err) - led2, err := complete.NewLedger(diskWal2, 100, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) + led2, err := complete.NewLedger(diskWal2, capacity, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) require.NoError(t, err) - compactor2, err := complete.NewCompactor(led2, diskWal2, zerolog.Nop(), 100, 1_000_000, 1) + compactor2, err := complete.NewCompactor(led2, diskWal2, zerolog.Nop(), capacity, checkpointDistance, checkpointsToKeep) require.NoError(t, err) <-compactor2.Ready() @@ -857,11 +886,18 @@ func Test_ExportCheckpointAt(t *testing.T) { func TestWALUpdateFailuresBubbleUp(t *testing.T) { unittest.RunWithTempDir(t, func(dir string) { + + const ( + capacity = 100 + checkpointDistance = math.MaxInt // A large number to prevent checkpoint creation. 
+ checkpointsToKeep = 1 + ) + theError := fmt.Errorf("error error") metricsCollector := &metrics.NoopCollector{} - diskWAL, err := wal.NewDiskWAL(zerolog.Nop(), nil, metricsCollector, dir, 100, pathfinder.PathByteSize, wal.SegmentSize) + diskWAL, err := wal.NewDiskWAL(zerolog.Nop(), nil, metricsCollector, dir, capacity, pathfinder.PathByteSize, wal.SegmentSize) require.NoError(t, err) w := &CustomUpdateWAL{ @@ -871,10 +907,10 @@ func TestWALUpdateFailuresBubbleUp(t *testing.T) { }, } - led, err := complete.NewLedger(w, 100, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) + led, err := complete.NewLedger(w, capacity, &metrics.NoopCollector{}, zerolog.Logger{}, complete.DefaultPathFinderVersion) require.NoError(t, err) - compactor, err := complete.NewCompactor(led, w, zerolog.Nop(), 100, 1_000_000, 1) + compactor, err := complete.NewCompactor(led, w, zerolog.Nop(), capacity, checkpointDistance, checkpointsToKeep) require.NoError(t, err) <-compactor.Ready() diff --git a/ledger/complete/wal/checkpointer_test.go b/ledger/complete/wal/checkpointer_test.go index af35f25a8cb..23c60be0b59 100644 --- a/ledger/complete/wal/checkpointer_test.go +++ b/ledger/complete/wal/checkpointer_test.go @@ -6,6 +6,7 @@ import ( "fmt" "io" "io/ioutil" + "math" "math/rand" "os" "path" @@ -30,31 +31,39 @@ import ( "github.com/onflow/flow-go/utils/unittest" ) -var ( +const ( numInsPerStep = 2 keyNumberOfParts = 10 keyPartMinByteSize = 1 keyPartMaxByteSize = 100 valueMaxByteSize = 2 << 16 //16kB size = 10 - metricsCollector = &metrics.NoopCollector{} - logger = zerolog.Logger{} segmentSize = 32 * 1024 pathByteSize = 32 pathFinderVersion = uint8(complete.DefaultPathFinderVersion) ) +var ( + logger = zerolog.Logger{} + metricsCollector = &metrics.NoopCollector{} +) + func Test_WAL(t *testing.T) { unittest.RunWithTempDir(t, func(dir string) { + const ( + checkpointDistance = math.MaxInt // A large number to prevent checkpoint creation. + checkpointsToKeep = 1 + ) + diskWal, err := realWAL.NewDiskWAL(zerolog.Nop(), nil, metricsCollector, dir, size, pathfinder.PathByteSize, realWAL.SegmentSize) require.NoError(t, err) led, err := complete.NewLedger(diskWal, size*10, metricsCollector, logger, complete.DefaultPathFinderVersion) require.NoError(t, err) - compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), uint(size), 1_000_000, 1) + compactor, err := complete.NewCompactor(led, diskWal, zerolog.Nop(), size, checkpointDistance, checkpointsToKeep) require.NoError(t, err) <-compactor.Ready() From 240520d97949a6aecf4faa8f8ae5b1324822d139 Mon Sep 17 00:00:00 2001 From: Faye Amacker <33205765+fxamacker@users.noreply.github.com> Date: Tue, 26 Jul 2022 21:05:12 -0500 Subject: [PATCH 18/32] Add comment --- ledger/complete/mtrie/forest.go | 13 +++++++----- ledger/complete/mtrie/forest_test.go | 31 ++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 5 deletions(-) diff --git a/ledger/complete/mtrie/forest.go b/ledger/complete/mtrie/forest.go index c8d4e52ef5b..43a5809f199 100644 --- a/ledger/complete/mtrie/forest.go +++ b/ledger/complete/mtrie/forest.go @@ -190,10 +190,11 @@ func (f *Forest) Read(r *ledger.TrieRead) ([]ledger.Value, error) { return orderedValues, nil } -// Update updates the Values for the registers, adds updated tries to forest, -// and returns rootHash and error (if any). +// Update creates a new trie by updating Values for registers in the parent trie, +// adds new trie to forest, and returns rootHash and error (if any). 
// In case there are multiple updates to the same register, Update will persist // the latest written value. +// Note: Update adds new trie to forest, unlike NewTrie(). func (f *Forest) Update(u *ledger.TrieUpdate) (ledger.RootHash, error) { t, err := f.NewTrie(u) if err != nil { @@ -208,9 +209,11 @@ func (f *Forest) Update(u *ledger.TrieUpdate) (ledger.RootHash, error) { return t.RootHash(), nil } -// NewTrie updates the Values for the registers and returns updated trie and error (if any). -// In case there are multiple updates to the same register, Update will persist the latest -// written value. +// NewTrie creates a new trie by updating Values for registers in the parent trie, +// and returns new trie and error (if any). +// In case there are multiple updates to the same register, NewTrie will persist +// the latest written value. +// Note: NewTrie doesn't add new trie to forest, unlike Update(). func (f *Forest) NewTrie(u *ledger.TrieUpdate) (*trie.MTrie, error) { parentTrie, err := f.GetTrie(u.RootHash) diff --git a/ledger/complete/mtrie/forest_test.go b/ledger/complete/mtrie/forest_test.go index bf7c9b6eda4..78d36d3d48c 100644 --- a/ledger/complete/mtrie/forest_test.go +++ b/ledger/complete/mtrie/forest_test.go @@ -1041,3 +1041,34 @@ func TestValueSizesWithDuplicatedKeys(t *testing.T) { require.Equal(t, expectedValueSizes[i], retValueSizes[i]) } } + +func TestNow(t *testing.T) { + + metricsCollector := &metrics.NoopCollector{} + forest, err := NewForest(5, metricsCollector, nil) + require.NoError(t, err) + rootHash := forest.GetEmptyRootHash() + + p1 := pathByUint8s([]uint8{uint8(53), uint8(74)}) + v1 := payloadBySlices([]byte{'A'}, []byte{'A'}) + + paths := []ledger.Path{p1} + payloads := []*ledger.Payload{v1} + update := &ledger.TrieUpdate{RootHash: rootHash, Paths: paths, Payloads: payloads} + updatedRoot, err := forest.Update(update) + size := forest.Size() + require.NoError(t, err) + require.Equal(t, 2, size) + + read := &ledger.TrieRead{RootHash: updatedRoot, Paths: paths} + retValues, err := forest.Read(read) + require.NoError(t, err) + require.Equal(t, retValues[0], payloads[0].Value) + + update = &ledger.TrieUpdate{RootHash: updatedRoot, Paths: nil, Payloads: nil} + updatedRoot2, err := forest.Update(update) + size = forest.Size() + require.NoError(t, err) + require.Equal(t, updatedRoot, updatedRoot2) + require.Equal(t, 2, size) +} From 853ff0545636870a21393258e81c986dccdbfc4a Mon Sep 17 00:00:00 2001 From: Faye Amacker <33205765+fxamacker@users.noreply.github.com> Date: Wed, 27 Jul 2022 15:36:29 -0500 Subject: [PATCH 19/32] Rename CheckpointQueue to TrieQueue Rename CheckpointQueue to TrieQueue as suggested in PR review. Since the name TrieQueue is not specific to checkpointing, also moved it from complete package to wal package. Putting it in the wal package makes it more clear TrieQueue is specific to checkpointing and can prevent it from being used for other purposes, getting modified, and introducing side-effects to checkpointing. 
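Since TrieQueue is kept in a separate package mainly to protect these semantics, here is a minimal, self-contained sketch of the fixed-size FIFO-with-overwrite behavior that the queue and its tests below rely on; strings stand in for *trie.MTrie, and the fifo/Items names are illustrative stand-ins, not flow-go APIs.

package main

import "fmt"

// fifo mirrors the documented TrieQueue behavior: Push overwrites the oldest
// element once the queue is full, and Items returns elements oldest to newest.
type fifo struct {
	items    []string
	capacity int
	tail     int // index to write to
	count    int // number of elements (count <= capacity)
}

func newFIFO(capacity int) *fifo {
	return &fifo{items: make([]string, capacity), capacity: capacity}
}

func (q *fifo) Push(s string) {
	q.items[q.tail] = s
	q.tail = (q.tail + 1) % q.capacity
	if q.count < q.capacity {
		q.count++
	}
}

func (q *fifo) Items() []string {
	out := make([]string, 0, q.count)
	for i := 0; i < q.count; i++ {
		head := (q.tail - q.count + i + q.capacity) % q.capacity
		out = append(out, q.items[head])
	}
	return out
}

func main() {
	q := newFIFO(3)
	for _, s := range []string{"t1", "t2", "t3", "t4", "t5"} {
		q.Push(s)
	}
	fmt.Println(q.Items()) // [t3 t4 t5]: the oldest entries were overwritten
}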
--- ledger/complete/compactor.go | 18 ++++----- .../{checkpointqueue.go => wal/triequeue.go} | 37 +++++++++++-------- .../triequeue_test.go} | 12 +++--- 3 files changed, 36 insertions(+), 31 deletions(-) rename ledger/complete/{checkpointqueue.go => wal/triequeue.go} (60%) rename ledger/complete/{checkpointqueue_test.go => wal/triequeue_test.go} (91%) diff --git a/ledger/complete/compactor.go b/ledger/complete/compactor.go index a459798d1fd..0309b662a64 100644 --- a/ledger/complete/compactor.go +++ b/ledger/complete/compactor.go @@ -44,7 +44,7 @@ type checkpointResult struct { type Compactor struct { checkpointer *realWAL.Checkpointer wal realWAL.LedgerWAL - checkpointQueue *CheckpointQueue + trieQueue *realWAL.TrieQueue logger zerolog.Logger lm *lifecycle.LifecycleManager observers map[observable.Observer]struct{} @@ -81,12 +81,12 @@ func NewCompactor( return nil, err } - checkpointQueue := NewCheckpointQueueWithValues(checkpointCapacity, tries) + trieQueue := realWAL.NewTrieQueueWithValues(checkpointCapacity, tries) return &Compactor{ checkpointer: checkpointer, wal: w, - checkpointQueue: checkpointQueue, + trieQueue: trieQueue, logger: logger, stopCh: make(chan chan struct{}), trieUpdateCh: trieUpdateCh, @@ -201,7 +201,7 @@ Loop: var checkpointNum int var checkpointTries []*trie.MTrie activeSegmentNum, checkpointNum, checkpointTries = - c.processTrieUpdate(update, c.checkpointQueue, activeSegmentNum, nextCheckpointNum) + c.processTrieUpdate(update, c.trieQueue, activeSegmentNum, nextCheckpointNum) if checkpointTries == nil { // Not enough segments for checkpointing (nextCheckpointNum >= activeSegmentNum) @@ -329,11 +329,11 @@ func cleanupCheckpoints(checkpointer *realWAL.Checkpointer, checkpointsToKeep in } // processTrieUpdate writes trie update to WAL, updates activeSegmentNum, -// and gets tries from checkpointQueue for checkpointing if needed. +// and gets tries from trieQueue for checkpointing if needed. // It also sends WAL update result and waits for trie update completion. func (c *Compactor) processTrieUpdate( update *WALTrieUpdate, - checkpointQueue *CheckpointQueue, + trieQueue *realWAL.TrieQueue, activeSegmentNum int, nextCheckpointNum int, ) ( @@ -360,7 +360,7 @@ func (c *Compactor) processTrieUpdate( return } - checkpointQueue.Push(trie) + trieQueue.Push(trie) }() if activeSegmentNum == -1 { @@ -398,8 +398,8 @@ func (c *Compactor) processTrieUpdate( // At this point, checkpoint queue contains tries up to // last update (logged as last record in finalized segment) // It doesn't include trie for this update - // until updated trie is received and added to checkpointQueue. - tries := checkpointQueue.Tries() + // until updated trie is received and added to trieQueue. + tries := trieQueue.Tries() checkpointNum = nextCheckpointNum diff --git a/ledger/complete/checkpointqueue.go b/ledger/complete/wal/triequeue.go similarity index 60% rename from ledger/complete/checkpointqueue.go rename to ledger/complete/wal/triequeue.go index f76998c8174..c66d8c231a9 100644 --- a/ledger/complete/checkpointqueue.go +++ b/ledger/complete/wal/triequeue.go @@ -1,31 +1,36 @@ -package complete +package wal import ( "github.com/onflow/flow-go/ledger/complete/mtrie/trie" ) -// CheckpointQueue is a fix-sized FIFO queue of MTrie. -// CheckpointQueue is intentionally not threadsafe given its limited use case. 
-// CheckpointQueue is not a general purpose queue to avoid incurring overhead +// IMPORTANT: TrieQueue is in the wal package to prevent it +// from being used for other purposes and getting modified +// (to prevent introducing side-effects to checkpointing). + +// TrieQueue is a fix-sized FIFO queue of MTrie. +// It is only used by Compactor for checkpointing, and +// it is intentionally not threadsafe given its limited use case. +// It is not a general purpose queue to avoid incurring overhead // for features not needed for its limited use case. -type CheckpointQueue struct { +type TrieQueue struct { ts []*trie.MTrie capacity int tail int // element index to write to count int // number of elements (count <= capacity) } -// NewCheckpointQueue returns a new CheckpointQueue with given capacity. -func NewCheckpointQueue(capacity uint) *CheckpointQueue { - return &CheckpointQueue{ +// NewTrieQueue returns a new TrieQueue with given capacity. +func NewTrieQueue(capacity uint) *TrieQueue { + return &TrieQueue{ ts: make([]*trie.MTrie, capacity), capacity: int(capacity), } } -// NewCheckpointQueueWithValues returns a new CheckpointQueue with given capacity and initial values. -func NewCheckpointQueueWithValues(capacity uint, tries []*trie.MTrie) *CheckpointQueue { - q := NewCheckpointQueue(capacity) +// NewTrieQueueWithValues returns a new TrieQueue with given capacity and initial values. +func NewTrieQueueWithValues(capacity uint, tries []*trie.MTrie) *TrieQueue { + q := NewTrieQueue(capacity) start := 0 if len(tries) > q.capacity { @@ -39,7 +44,7 @@ func NewCheckpointQueueWithValues(capacity uint, tries []*trie.MTrie) *Checkpoin } // Push pushes trie to queue. If queue is full, it overwrites the oldest element. -func (q *CheckpointQueue) Push(t *trie.MTrie) { +func (q *TrieQueue) Push(t *trie.MTrie) { q.ts[q.tail] = t q.tail = (q.tail + 1) % q.capacity if !q.isFull() { @@ -49,7 +54,7 @@ func (q *CheckpointQueue) Push(t *trie.MTrie) { // Tries returns elements in queue, starting from the oldest element // to the newest element. -func (q *CheckpointQueue) Tries() []*trie.MTrie { +func (q *TrieQueue) Tries() []*trie.MTrie { if q.count == 0 { return nil } @@ -66,7 +71,7 @@ func (q *CheckpointQueue) Tries() []*trie.MTrie { head := q.tail - q.count copy(tries, q.ts[head:q.tail]) } else { // q.tail < q.count, data is wrapped around the slice. - // This branch isn't used until CheckpointQueue supports Pop (removing oldest element). + // This branch isn't used until TrieQueue supports Pop (removing oldest element). // At this time, there is no reason to implement Pop, so this branch is here to prevent future bug. head := q.capacity - q.count + q.tail n := copy(tries, q.ts[head:]) @@ -78,10 +83,10 @@ func (q *CheckpointQueue) Tries() []*trie.MTrie { } // Count returns element count. 
-func (q *CheckpointQueue) Count() int {
+func (q *TrieQueue) Count() int {
 	return q.count
 }
 
-func (q *CheckpointQueue) isFull() bool {
+func (q *TrieQueue) isFull() bool {
 	return q.count == q.capacity
 }
diff --git a/ledger/complete/checkpointqueue_test.go b/ledger/complete/wal/triequeue_test.go
similarity index 91%
rename from ledger/complete/checkpointqueue_test.go
rename to ledger/complete/wal/triequeue_test.go
index 7490232688a..54dd2e1ef6c 100644
--- a/ledger/complete/checkpointqueue_test.go
+++ b/ledger/complete/wal/triequeue_test.go
@@ -1,4 +1,4 @@
-package complete
+package wal
 
 import (
 	"math/rand"
@@ -12,10 +12,10 @@ import (
 	"github.com/onflow/flow-go/ledger/complete/mtrie/trie"
 )
 
-func TestEmptyCheckpointQueue(t *testing.T) {
+func TestEmptyTrieQueue(t *testing.T) {
 	const capacity = 10
 
-	q := NewCheckpointQueue(capacity)
+	q := NewTrieQueue(capacity)
 	require.Equal(t, 0, q.Count())
 
 	tries := q.Tries()
@@ -61,10 +61,10 @@ func TestEmptyCheckpointQueue(t *testing.T) {
 	}
 }
 
-func TestCheckpointQueueWithInitialValues(t *testing.T) {
+func TestTrieQueueWithInitialValues(t *testing.T) {
 	const capacity = 10
 
-	// Test CheckpointQueue with initial values. Numbers of initial values
+	// Test TrieQueue with initial values. Numbers of initial values
 	// are from 1 to capacity + 1.
 	for initialValueCount := 1; initialValueCount <= capacity+1; initialValueCount++ {
@@ -82,7 +82,7 @@ func TestCheckpointQueueWithInitialValues(t *testing.T) {
 			expectedTries = initialValues[initialValueCount-capacity:]
 		}
 
-		q := NewCheckpointQueueWithValues(capacity, initialValues)
+		q := NewTrieQueueWithValues(capacity, initialValues)
 		require.Equal(t, expectedCount, q.Count())
 
 		tries := q.Tries()

From 46576c5fa2d364249f59b82b1de16f7cca004a21 Mon Sep 17 00:00:00 2001
From: Faye Amacker <33205765+fxamacker@users.noreply.github.com>
Date: Wed, 27 Jul 2022 15:58:40 -0500
Subject: [PATCH 20/32] Add comment for DiskWAL.RecordUpdate

Co-authored-by: Leo Zhang
---
 ledger/complete/wal/wal.go | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/ledger/complete/wal/wal.go b/ledger/complete/wal/wal.go
index 9323fd18b43..888f101cdd7 100644
--- a/ledger/complete/wal/wal.go
+++ b/ledger/complete/wal/wal.go
@@ -49,6 +49,9 @@ func (w *DiskWAL) UnpauseRecord() {
 	w.paused = false
 }
 
+// RecordUpdate writes the trie update to the write-ahead log on disk.
+// If WAL recording is not paused, it returns the number of the WAL segment file that the trie update was written to.
+// If recording is paused, the update is skipped and the second return value (skipped) is true; otherwise it is false.
 func (w *DiskWAL) RecordUpdate(update *ledger.TrieUpdate) (segmentNum int, skipped bool, err error) {
 	if w.paused {
 		return 0, true, nil

From 31ff7e4acbd323f05e379d2e2823fcdc90e53cc3 Mon Sep 17 00:00:00 2001
From: Faye Amacker <33205765+fxamacker@users.noreply.github.com>
Date: Wed, 27 Jul 2022 16:05:37 -0500
Subject: [PATCH 21/32] Add comment for WAL shutdown in Compactor

Co-authored-by: Leo Zhang
---
 ledger/complete/compactor.go | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ledger/complete/compactor.go b/ledger/complete/compactor.go
index 0309b662a64..fd7b4e5c442 100644
--- a/ledger/complete/compactor.go
+++ b/ledger/complete/compactor.go
@@ -124,6 +124,8 @@ func (c *Compactor) Done() <-chan struct{} {
 		<-doneCh
 
 		// Shut down WAL component.
+		// only shut down wal after compactor has been shut down, in case there
+		// is still writing to WAL files.
<-c.wal.Done() // Notify observers From 7e29a67c6d7199d66dd0f29785f05868339d1f94 Mon Sep 17 00:00:00 2001 From: Faye Amacker <33205765+fxamacker@users.noreply.github.com> Date: Wed, 27 Jul 2022 16:12:45 -0500 Subject: [PATCH 22/32] Add more logging in Compactor --- ledger/complete/compactor.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ledger/complete/compactor.go b/ledger/complete/compactor.go index fd7b4e5c442..f457bb577a7 100644 --- a/ledger/complete/compactor.go +++ b/ledger/complete/compactor.go @@ -124,8 +124,8 @@ func (c *Compactor) Done() <-chan struct{} { <-doneCh // Shut down WAL component. - // only shut down wal after compactor has been shut down, in case there - // is still writing to WAL files. + // only shut down wal after compactor has been shut down, in case there + // is still writing to WAL files. <-c.wal.Done() // Notify observers @@ -231,6 +231,7 @@ Loop: } // Drain and process remaining trie updates in channel. + c.logger.Info().Msg("Starting draining trie update channel in compactor on shutdown") for update := range c.trieUpdateCh { _, _, err := c.wal.RecordUpdate(update.Update) select { @@ -238,6 +239,7 @@ Loop: default: } } + c.logger.Info().Msg("Finished draining trie update channel in compactor on shutdown") // Don't wait for checkpointing to finish because it might take too long. } From b657ca38ba2321e9dd1eded36e91422c7904e1b0 Mon Sep 17 00:00:00 2001 From: Faye Amacker <33205765+fxamacker@users.noreply.github.com> Date: Wed, 27 Jul 2022 17:05:27 -0500 Subject: [PATCH 23/32] Add comment for Compactor and Ledger Co-authored-by: Leo Zhang --- ledger/complete/ledger.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ledger/complete/ledger.go b/ledger/complete/ledger.go index 2fd16f4a6e2..c76daf48c44 100644 --- a/ledger/complete/ledger.go +++ b/ledger/complete/ledger.go @@ -249,6 +249,13 @@ func (l *Ledger) set(trieUpdate *ledger.TrieUpdate) (newState ledger.State, err trieCh := make(chan *trie.MTrie, 1) defer close(trieCh) + // These run concurrently in two goroutines: + // 1. writing the trie update to WAL (in Compactor goroutine) + // 2. creating a new trie from the trie update (in this goroutine) + // Since writing to WAL is running concurrently, we use resultCh to wait for WAL update result from Compactor. + // Compactor also needs new trie created here + // because Compactor caches new trie to minimize memory foot-print while checkpointing, + // `trieCh` is used to send created trie to Compactor. l.trieUpdateCh <- &WALTrieUpdate{Update: trieUpdate, ResultCh: resultCh, TrieCh: trieCh} newTrie, err := l.forest.NewTrie(trieUpdate) From 4d81b171c2ed8e491897c25f869a59857991f9ef Mon Sep 17 00:00:00 2001 From: Faye Amacker <33205765+fxamacker@users.noreply.github.com> Date: Wed, 27 Jul 2022 17:21:17 -0500 Subject: [PATCH 24/32] Fix linter error --- ledger/complete/ledger.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ledger/complete/ledger.go b/ledger/complete/ledger.go index c76daf48c44..e89ff0a6b53 100644 --- a/ledger/complete/ledger.go +++ b/ledger/complete/ledger.go @@ -249,13 +249,13 @@ func (l *Ledger) set(trieUpdate *ledger.TrieUpdate) (newState ledger.State, err trieCh := make(chan *trie.MTrie, 1) defer close(trieCh) - // These run concurrently in two goroutines: + // These run concurrently in two goroutines: // 1. writing the trie update to WAL (in Compactor goroutine) // 2. 
creating a new trie from the trie update (in this goroutine) // Since writing to WAL is running concurrently, we use resultCh to wait for WAL update result from Compactor. - // Compactor also needs new trie created here + // Compactor also needs new trie created here // because Compactor caches new trie to minimize memory foot-print while checkpointing, - // `trieCh` is used to send created trie to Compactor. + // `trieCh` is used to send created trie to Compactor. l.trieUpdateCh <- &WALTrieUpdate{Update: trieUpdate, ResultCh: resultCh, TrieCh: trieCh} newTrie, err := l.forest.NewTrie(trieUpdate) From 8afb36bc9ae75bae7d5cf83d722f099fd545a0a7 Mon Sep 17 00:00:00 2001 From: Faye Amacker <33205765+fxamacker@users.noreply.github.com> Date: Fri, 29 Jul 2022 11:28:00 -0500 Subject: [PATCH 25/32] Refactor compactor tests --- ledger/complete/compactor_test.go | 413 ++++++++++++++++++++---------- 1 file changed, 282 insertions(+), 131 deletions(-) diff --git a/ledger/complete/compactor_test.go b/ledger/complete/compactor_test.go index f0de8f7d2ae..03a2ccfba27 100644 --- a/ledger/complete/compactor_test.go +++ b/ledger/complete/compactor_test.go @@ -46,6 +46,8 @@ func (co *CompactorObserver) OnComplete() { close(co.done) } +// TestCompactor tests creation of WAL segments and checkpoints, and +// checks if the rebuilt ledger state matches previous ledger state. func TestCompactor(t *testing.T) { const ( numInsPerStep = 2 @@ -155,11 +157,8 @@ func TestCompactor(t *testing.T) { require.FileExists(t, path.Join(dir, "checkpoint.00000008")) require.NoFileExists(t, path.Join(dir, "checkpoint.00000009")) - ledgerDone := l.Done() - compactorDone := compactor.Done() - - <-ledgerDone - <-compactorDone + <-l.Done() + <-compactor.Done() }) time.Sleep(2 * time.Second) @@ -246,6 +245,111 @@ func TestCompactor(t *testing.T) { }) } +// TestCompactorSkipCheckpointing tests that only one +// checkpointing is running at a time. +func TestCompactorSkipCheckpointing(t *testing.T) { + const ( + numInsPerStep = 2 + pathByteSize = 32 + minPayloadByteSize = 2 << 15 + maxPayloadByteSize = 2 << 16 + size = 10 + checkpointDistance = 1 // checkpointDistance 1 triggers checkpointing for every segment. + checkpointsToKeep = 0 + forestCapacity = size * 10 + ) + + metricsCollector := &metrics.NoopCollector{} + + unittest.RunWithTempDir(t, func(dir string) { + + var l *Ledger + + // saved data after updates + savedData := make(map[ledger.RootHash]map[string]*ledger.Payload) + + wal, err := realWAL.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir, forestCapacity, pathByteSize, 32*1024) + require.NoError(t, err) + + l, err = NewLedger(wal, size*10, metricsCollector, zerolog.Logger{}, DefaultPathFinderVersion) + require.NoError(t, err) + + // WAL segments are 32kB, so here we generate 2 keys 64kB each, times `size` + // so we should get at least `size` segments + + compactor, err := NewCompactor(l, wal, zerolog.Nop(), forestCapacity, checkpointDistance, checkpointsToKeep) + require.NoError(t, err) + + co := CompactorObserver{fromBound: 8, done: make(chan struct{})} + compactor.Subscribe(&co) + + // Run Compactor in background. 
+ <-compactor.Ready() + + rootState := l.InitialState() + + // Generate the tree and create WAL + for i := 0; i < size; i++ { + + payloads := utils.RandomPayloads(numInsPerStep, minPayloadByteSize, maxPayloadByteSize) + + keys := make([]ledger.Key, len(payloads)) + values := make([]ledger.Value, len(payloads)) + for i, p := range payloads { + keys[i] = p.Key + values[i] = p.Value + } + + update, err := ledger.NewUpdate(rootState, keys, values) + require.NoError(t, err) + + newState, _, err := l.Set(update) + require.NoError(t, err) + + require.FileExists(t, path.Join(dir, realWAL.NumberToFilenamePart(i))) + + data := make(map[string]*ledger.Payload, len(keys)) + for j, k := range keys { + ks := string(k.CanonicalForm()) + data[ks] = payloads[j] + } + + savedData[ledger.RootHash(newState)] = data + + rootState = newState + } + + // wait for the bound-checking observer to confirm checkpoints have been made + select { + case <-co.done: + // continue + case <-time.After(60 * time.Second): + // Log segment and checkpoint files + files, err := ioutil.ReadDir(dir) + require.NoError(t, err) + + for _, file := range files { + fmt.Printf("%s, size %d\n", file.Name(), file.Size()) + } + + assert.FailNow(t, "timed out") + } + + <-l.Done() + <-compactor.Done() + + checkpointer, err := wal.NewCheckpointer() + require.NoError(t, err) + + nums, err := checkpointer.Checkpoints() + require.NoError(t, err) + + // Check that there are gaps between checkpoints (some checkpoints are skipped) + firstNum, lastNum := nums[0], nums[len(nums)-1] + require.True(t, len(nums) < lastNum-firstNum+1) + }) +} + // TestCompactorAccuracy expects checkpointed tries to match replayed tries. // Replayed tries are tries updated by replaying all WAL segments // (from segment 0, ignoring prior checkpoints) to the checkpoint number. @@ -255,18 +359,21 @@ func TestCompactorAccuracy(t *testing.T) { const ( numInsPerStep = 2 pathByteSize = 32 - minPayloadByteSize = 2 << 15 - maxPayloadByteSize = 2 << 16 - size = 10 + minPayloadByteSize = 2<<11 - 256 // 3840 bytes + maxPayloadByteSize = 2 << 11 // 4096 bytes + size = 20 checkpointDistance = 5 - checkpointsToKeep = 0 - forestCapacity = size * 10 + checkpointsToKeep = 0 // keep all + forestCapacity = 500 ) metricsCollector := &metrics.NoopCollector{} unittest.RunWithTempDir(t, func(dir string) { + // There appears to be 1-2 records per segment (according to logs), so + // generate size/2 segments. 
+ lastCheckpointNum := -1 rootHash := trie.EmptyTrieRootHash() @@ -277,19 +384,13 @@ func TestCompactorAccuracy(t *testing.T) { wal, err := realWAL.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir, forestCapacity, pathByteSize, 32*1024) require.NoError(t, err) - l, err := NewLedger(wal, size*10, metricsCollector, zerolog.Logger{}, DefaultPathFinderVersion) - require.NoError(t, err) - - checkpointer, err := wal.NewCheckpointer() + l, err := NewLedger(wal, forestCapacity, metricsCollector, zerolog.Logger{}, DefaultPathFinderVersion) require.NoError(t, err) - // WAL segments are 32kB, so here we generate 2 keys 64kB each, times `size` - // so we should get at least `size` segments - compactor, err := NewCompactor(l, wal, zerolog.Nop(), forestCapacity, checkpointDistance, checkpointsToKeep) require.NoError(t, err) - fromBound := lastCheckpointNum + int(checkpointDistance)*2 + fromBound := lastCheckpointNum + (size / 2) co := CompactorObserver{fromBound: fromBound, done: make(chan struct{})} compactor.Subscribe(&co) @@ -298,7 +399,8 @@ func TestCompactorAccuracy(t *testing.T) { <-compactor.Ready() // Generate the tree and create WAL - for i := 0; i < size; i++ { + // size+2 is used to ensure that size/2 segments are finalized. + for i := 0; i < size+2; i++ { payloads := utils.RandomPayloads(numInsPerStep, minPayloadByteSize, maxPayloadByteSize) @@ -327,33 +429,167 @@ func TestCompactorAccuracy(t *testing.T) { } // Shutdown ledger and compactor - ledgerDone := l.Done() - compactorDone := compactor.Done() + <-l.Done() + <-compactor.Done() - <-ledgerDone - <-compactorDone + checkpointer, err := wal.NewCheckpointer() + require.NoError(t, err) nums, err := checkpointer.Checkpoints() require.NoError(t, err) for _, n := range nums { - testCheckpointedTriesMatchReplayedTriesFromSegments(t, checkpointer, n, dir) + // TODO: After the LRU Cache (outside of checkpointing code) is replaced + // by a queue, etc. we should make sure the insertion order is preserved. + + checkSequence := false + if i == 0 { + // Sequence check only works when initial state is blank. + // When initial state is from ledger's forest (LRU cache), + // its sequence is altered by reads when replaying segment records. + // Insertion order is not preserved (which is the way + // it is currently on mainnet). However, with PR 2792, only the + // initial values are affected and those would likely + // get into insertion order for the next checkpoint. Once + // the LRU Cache (outside of checkpointing code) is replaced, + // then we can verify insertion order. + checkSequence = true + } + testCheckpointedTriesMatchReplayedTriesFromSegments(t, checkpointer, n, dir, checkSequence) } + + lastCheckpointNum = nums[len(nums)-1] } }) } -func testCheckpointedTriesMatchReplayedTriesFromSegments(t *testing.T, checkpointer *realWAL.Checkpointer, checkpointNum int, dir string) { +// TestCompactorConcurrency expects checkpointed tries to +// match replayed tries in sequence with concurrent updates. +// Replayed tries are tries updated by replaying all WAL segments +// (from segment 0, ignoring prior checkpoints) to the checkpoint number. +// This verifies that checkpointed tries are snapshopt of segments +// and at segment boundary. +// Note: sequence check only works when initial state is blank. +// When initial state is from ledger's forest (LRU cache), its +// sequence is altered by reads when replaying segment records. 
+func TestCompactorConcurrency(t *testing.T) { + const ( + numInsPerStep = 2 + pathByteSize = 32 + minPayloadByteSize = 2<<11 - 256 // 3840 bytes + maxPayloadByteSize = 2 << 11 // 4096 bytes + size = 20 + checkpointDistance = 5 + checkpointsToKeep = 0 // keep all + forestCapacity = 500 + numGoroutine = 4 + lastCheckpointNum = -1 + ) - // Get tries by replaying segments up to checkpoint number - triesFromReplayingSegments, err := triesUpToSegment(dir, checkpointNum) - require.NoError(t, err) + rootState := ledger.State(trie.EmptyTrieRootHash()) + + metricsCollector := &metrics.NoopCollector{} + + unittest.RunWithTempDir(t, func(dir string) { + + // There are 1-2 records per segment (according to logs), so + // generate size/2 segments. + + wal, err := realWAL.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir, forestCapacity, pathByteSize, 32*1024) + require.NoError(t, err) + + l, err := NewLedger(wal, forestCapacity, metricsCollector, zerolog.Logger{}, DefaultPathFinderVersion) + require.NoError(t, err) + + compactor, err := NewCompactor(l, wal, zerolog.Nop(), forestCapacity, checkpointDistance, checkpointsToKeep) + require.NoError(t, err) + + fromBound := lastCheckpointNum + (size / 2 * numGoroutine) + + co := CompactorObserver{fromBound: fromBound, done: make(chan struct{})} + compactor.Subscribe(&co) + + // Run Compactor in background. + <-compactor.Ready() + + // Run 4 goroutines and each goroutine updates size+1 tries. + for j := 0; j < numGoroutine; j++ { + go func(parentState ledger.State) { + // size+1 is used to ensure that size/2*numGoroutine segments are finalized. + for i := 0; i < size+1; i++ { + payloads := utils.RandomPayloads(numInsPerStep, minPayloadByteSize, maxPayloadByteSize) + + keys := make([]ledger.Key, len(payloads)) + values := make([]ledger.Value, len(payloads)) + for i, p := range payloads { + keys[i] = p.Key + values[i] = p.Value + } + + update, err := ledger.NewUpdate(parentState, keys, values) + require.NoError(t, err) + + newState, _, err := l.Set(update) + require.NoError(t, err) + + parentState = newState + } + }(rootState) + } + + // wait for the bound-checking observer to confirm checkpoints have been made + select { + case <-co.done: + // continue + case <-time.After(120 * time.Second): + assert.FailNow(t, "timed out") + } + + // Shutdown ledger and compactor + <-l.Done() + <-compactor.Done() + + checkpointer, err := wal.NewCheckpointer() + require.NoError(t, err) + + nums, err := checkpointer.Checkpoints() + require.NoError(t, err) + for _, n := range nums { + // For each created checkpoint: + // - get tries by loading checkpoint + // - get tries by replaying segments up to checkpoint number (ignoring all prior checkpoints) + // - test that these 2 sets of tries match in content and sequence (insertion order). + testCheckpointedTriesMatchReplayedTriesFromSegments(t, checkpointer, n, dir, true) + } + }) +} + +func testCheckpointedTriesMatchReplayedTriesFromSegments( + t *testing.T, + checkpointer *realWAL.Checkpointer, + checkpointNum int, + dir string, + inSequence bool, +) { // Get tries by loading checkpoint triesFromLoadingCheckpoint, err := checkpointer.LoadCheckpoint(checkpointNum) require.NoError(t, err) - // Test that checkpointed tries match replayed tries. 
+ // Get tries by replaying segments up to checkpoint number (ignoring checkpoints) + triesFromReplayingSegments, err := triesUpToSegment(dir, checkpointNum, len(triesFromLoadingCheckpoint)) + require.NoError(t, err) + + if inSequence { + // Test that checkpointed tries match replayed tries in content and sequence (insertion order). + require.Equal(t, len(triesFromReplayingSegments), len(triesFromLoadingCheckpoint)) + for i := 0; i < len(triesFromReplayingSegments); i++ { + require.Equal(t, triesFromReplayingSegments[i].RootHash(), triesFromLoadingCheckpoint[i].RootHash()) + } + return + } + + // Test that checkpointed tries match replayed tries in content (ignore order). triesSetFromReplayingSegments := make(map[ledger.RootHash]struct{}) for _, t := range triesFromReplayingSegments { triesSetFromReplayingSegments[t.RootHash()] = struct{}{} @@ -367,18 +603,31 @@ func testCheckpointedTriesMatchReplayedTriesFromSegments(t *testing.T, checkpoin require.True(t, reflect.DeepEqual(triesSetFromReplayingSegments, triesSetFromLoadingCheckpoint)) } -func triesUpToSegment(dir string, to int) ([]*trie.MTrie, error) { +func triesUpToSegment(dir string, to int, capacity int) ([]*trie.MTrie, error) { - forest, err := mtrie.NewForest(500, &metrics.NoopCollector{}, nil) + // forest is used to create new trie. + forest, err := mtrie.NewForest(capacity, &metrics.NoopCollector{}, nil) if err != nil { return nil, fmt.Errorf("cannot create Forest: %w", err) } + initialTries, err := forest.GetTries() + if err != nil { + return nil, fmt.Errorf("cannot get tries from forest: %w", err) + } + + // TrieQueue is used to store last n tries from segment files in order (n = capacity) + tries := realWAL.NewTrieQueueWithValues(uint(capacity), initialTries) + err = replaySegments( dir, to, func(update *ledger.TrieUpdate) error { - _, err := forest.Update(update) + t, err := forest.NewTrie(update) + if err == nil { + forest.AddTrie(t) + tries.Push(t) + } return err }, func(rootHash ledger.RootHash) error { return nil @@ -387,7 +636,7 @@ func triesUpToSegment(dir string, to int) ([]*trie.MTrie, error) { return nil, err } - return forest.GetTries() + return tries.Tries(), nil } func replaySegments( @@ -437,101 +686,3 @@ func replaySegments( return nil } - -func TestCompactorConcurrency(t *testing.T) { - const ( - numInsPerStep = 2 - pathByteSize = 32 - minPayloadByteSize = 2 << 15 - maxPayloadByteSize = 2 << 16 - size = 10 - checkpointDistance = 5 - checkpointsToKeep = 0 // keep all - forestCapacity = size * 10 - ) - - metricsCollector := &metrics.NoopCollector{} - - unittest.RunWithTempDir(t, func(dir string) { - - lastCheckpointNum := -1 - - rootHash := trie.EmptyTrieRootHash() - - // Create DiskWAL and Ledger repeatedly to test rebuilding ledger state at restart. 
- for i := 0; i < 3; i++ { - - wal, err := realWAL.NewDiskWAL(zerolog.Nop(), nil, metrics.NewNoopCollector(), dir, forestCapacity, pathByteSize, 32*1024) - require.NoError(t, err) - - l, err := NewLedger(wal, size*10, metricsCollector, zerolog.Logger{}, DefaultPathFinderVersion) - require.NoError(t, err) - - checkpointer, err := wal.NewCheckpointer() - require.NoError(t, err) - - // WAL segments are 32kB, so here we generate 2 keys 64kB each, times `size` - // so we should get at least `size` segments - - compactor, err := NewCompactor(l, wal, zerolog.Nop(), forestCapacity, checkpointDistance, checkpointsToKeep) - require.NoError(t, err) - - fromBound := lastCheckpointNum + int(checkpointDistance)*2 - - co := CompactorObserver{fromBound: fromBound, done: make(chan struct{})} - compactor.Subscribe(&co) - - // Run Compactor in background. - <-compactor.Ready() - - // Generate updates - var updates []*ledger.Update - for i := 0; i < size; i++ { - - payloads := utils.RandomPayloads(numInsPerStep, minPayloadByteSize, maxPayloadByteSize) - - keys := make([]ledger.Key, len(payloads)) - values := make([]ledger.Value, len(payloads)) - for i, p := range payloads { - keys[i] = p.Key - values[i] = p.Value - } - - update, err := ledger.NewUpdate(ledger.State(rootHash), keys, values) - require.NoError(t, err) - - updates = append(updates, update) - } - - // Generate the tree and create WAL in parallel - for _, update := range updates { - go func(update *ledger.Update) { - _, _, err := l.Set(update) - require.NoError(t, err) - }(update) - } - - // wait for the bound-checking observer to confirm checkpoints have been made - select { - case <-co.done: - // continue - case <-time.After(60 * time.Second): - assert.FailNow(t, "timed out") - } - - // Shutdown ledger and compactor - ledgerDone := l.Done() - compactorDone := compactor.Done() - - <-ledgerDone - <-compactorDone - - nums, err := checkpointer.Checkpoints() - require.NoError(t, err) - - for _, n := range nums { - testCheckpointedTriesMatchReplayedTriesFromSegments(t, checkpointer, n, dir) - } - } - }) -} From bd62030c48d8779953004ab3f8e1cc5e4fa9988f Mon Sep 17 00:00:00 2001 From: Faye Amacker <33205765+fxamacker@users.noreply.github.com> Date: Sun, 31 Jul 2022 12:18:55 -0500 Subject: [PATCH 26/32] Fix lint error --- ledger/complete/compactor_test.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ledger/complete/compactor_test.go b/ledger/complete/compactor_test.go index 03a2ccfba27..0c3dafa9d09 100644 --- a/ledger/complete/compactor_test.go +++ b/ledger/complete/compactor_test.go @@ -625,7 +625,10 @@ func triesUpToSegment(dir string, to int, capacity int) ([]*trie.MTrie, error) { func(update *ledger.TrieUpdate) error { t, err := forest.NewTrie(update) if err == nil { - forest.AddTrie(t) + err = forest.AddTrie(t) + if err != nil { + return err + } tries.Push(t) } return err From 9f2ec46a7bc7dd59984aefa82e1fb963cc27dc9e Mon Sep 17 00:00:00 2001 From: Faye Amacker <33205765+fxamacker@users.noreply.github.com> Date: Sun, 31 Jul 2022 14:44:48 -0500 Subject: [PATCH 27/32] Add code comments --- ledger/complete/compactor.go | 53 ++++++++++++++++++++++++++++-------- ledger/complete/ledger.go | 16 +++++++---- 2 files changed, 51 insertions(+), 18 deletions(-) diff --git a/ledger/complete/compactor.go b/ledger/complete/compactor.go index f457bb577a7..eaee362fa0f 100644 --- a/ledger/complete/compactor.go +++ b/ledger/complete/compactor.go @@ -39,8 +39,11 @@ type checkpointResult struct { // // Compactor stores pointers to tries in ledger 
state in a fix-sized // checkpointing queue (FIFO). Checkpointing queue is decoupled from -// main ledger state to allow separate optimizaiton, etc. -// NOTE: ledger state and checkpointing queue may contain different tries. +// main ledger state to allow separate optimization and looser coupling, etc. +// CAUTION: If the forest LRU Cache is used for main state, +// then ledger state and checkpointing queue may contain different tries. +// This will be resolved automaticaly after the forest LRU Cache +// (code outside checkpointing) is replaced by something like a FIFO queue. type Compactor struct { checkpointer *realWAL.Checkpointer wal realWAL.LedgerWAL @@ -54,6 +57,15 @@ type Compactor struct { trieUpdateCh <-chan *WALTrieUpdate } +// NewCompactor creates new Compactor which writes WAL record and triggers +// checkpointing asynchronously when enough segments are finalized. +// The checkpointDistance is a flag that specifies how many segments need to +// be finalized to trigger checkpointing. However, if a prior checkpointing +// is already running and not finished, then more segments than specified +// could be accumulated for the new checkpointing (to reduce memory). +// All returned errors indicate that Compactor can't be created. +// Since failure to create Compactor will end up blocking ledger updates, +// the caller should handle all returned errors as unrecoverable. func NewCompactor( l *Ledger, w realWAL.LedgerWAL, @@ -71,16 +83,20 @@ func NewCompactor( return nil, err } + // Get trieUpdateCh channel to communicate trieUpdate, WAL result, and new trie + // created from the update. trieUpdateCh := l.TrieUpdateChan() if trieUpdateCh == nil { return nil, errors.New("failed to get valid trie update channel from ledger") } + // Get all tries from ledger state. tries, err := l.Tries() if err != nil { return nil, err } + // Create trieQueue with initial values from ledger state. trieQueue := realWAL.NewTrieQueueWithValues(checkpointCapacity, tries) return &Compactor{ @@ -97,16 +113,18 @@ func NewCompactor( }, nil } +// Subscribe subscribes observer to Compactor. func (c *Compactor) Subscribe(observer observable.Observer) { var void struct{} c.observers[observer] = void } +// Unsubscribe unsubscribes observer to Compactor. func (c *Compactor) Unsubscribe(observer observable.Observer) { delete(c.observers, observer) } -// Ready periodically fires Run function, every `interval` +// Ready returns channel which would be closed when Compactor goroutine starts. func (c *Compactor) Ready() <-chan struct{} { c.lm.OnStart(func() { go c.run() @@ -114,6 +132,7 @@ func (c *Compactor) Ready() <-chan struct{} { return c.lm.Started() } +// Done returns channel which would be closed when Compactor goroutine exits. func (c *Compactor) Done() <-chan struct{} { c.lm.OnStop(func() { // Signal Compactor goroutine to stop @@ -136,6 +155,8 @@ func (c *Compactor) Done() <-chan struct{} { return c.lm.Stopped() } +// run writes WAL records from trie updates and starts checkpointing +// asynchronously when enough segments are finalized. func (c *Compactor) run() { // checkpointSem is used to limit checkpointing to one. @@ -147,7 +168,7 @@ func (c *Compactor) run() { checkpointResultCh := make(chan checkpointResult, 1) - // Get active segment number. + // Get active segment number (opened segment that new records write to). // activeSegmentNum is updated when record is written to a new segment. 
_, activeSegmentNum, err := c.wal.Segments() if err != nil { @@ -162,11 +183,8 @@ func (c *Compactor) run() { } // Compute next checkpoint number. - // nextCheckpointNum is updated when - // - checkpointing starts, fails to start, or fails. - // - tries snapshot fails. + // nextCheckpointNum is updated when checkpointing starts, fails to start, or fails. // NOTE: next checkpoint number must >= active segment num. - // We can't reuse mtrie state to checkpoint tries in older segments. nextCheckpointNum := lastCheckpointNum + int(c.checkpointDistance) if activeSegmentNum > nextCheckpointNum { nextCheckpointNum = activeSegmentNum @@ -189,7 +207,7 @@ Loop: "compactor failed to checkpoint %d", checkpointResult.num, ) - // Retry checkpointing after active segment is finalized. + // Retry checkpointing when active segment is finalized. nextCheckpointNum = activeSegmentNum } @@ -244,6 +262,11 @@ Loop: // Don't wait for checkpointing to finish because it might take too long. } +// checkpoint creates checkpoint of tries snapshot, +// deletes prior checkpoint files (if needed), and notifies observers. +// Errors indicate that checkpoint file can't be created or prior checkpoints can't be removed. +// Caller should handle returned errors by retrying checkpointing when appropriate. +// Since this function is only for checkpointing, Compactor isn't affected by returned error. func (c *Compactor) checkpoint(ctx context.Context, tries []*trie.MTrie, checkpointNum int) error { err := createCheckpoint(c.checkpointer, c.logger, tries, checkpointNum) @@ -280,6 +303,9 @@ func (c *Compactor) checkpoint(ctx context.Context, tries []*trie.MTrie, checkpo return nil } +// createCheckpoint creates checkpoint with given checkpointNum and tries. +// Errors indicate that checkpoint file can't be created. +// Caller should handle returned errors by retrying checkpointing when appropriate. func createCheckpoint(checkpointer *realWAL.Checkpointer, logger zerolog.Logger, tries []*trie.MTrie, checkpointNum int) error { logger.Info().Msgf("serializing checkpoint %d", checkpointNum) @@ -309,6 +335,8 @@ func createCheckpoint(checkpointer *realWAL.Checkpointer, logger zerolog.Logger, return nil } +// cleanupCheckpoints deletes prior checkpoint files if needed. +// Since the function is side-effect free, all failures are simply a no-op. func cleanupCheckpoints(checkpointer *realWAL.Checkpointer, checkpointsToKeep int) error { // Don't list checkpoints if we keep them all if checkpointsToKeep == 0 { @@ -333,8 +361,9 @@ func cleanupCheckpoints(checkpointer *realWAL.Checkpointer, checkpointsToKeep in } // processTrieUpdate writes trie update to WAL, updates activeSegmentNum, -// and gets tries from trieQueue for checkpointing if needed. -// It also sends WAL update result and waits for trie update completion. +// and returns tries for checkpointing if needed. +// It sends WAL update result, receives updated trie, and pushes updated trie to trieQueue. +// When this function returns, WAL update is in sync with trieQueue update. func (c *Compactor) processTrieUpdate( update *WALTrieUpdate, trieQueue *realWAL.TrieQueue, @@ -400,7 +429,7 @@ func (c *Compactor) processTrieUpdate( // Get tries from checkpoint queue. // At this point, checkpoint queue contains tries up to - // last update (logged as last record in finalized segment) + // last update (last record in finalized segment) // It doesn't include trie for this update // until updated trie is received and added to trieQueue. 
tries := trieQueue.Tries() diff --git a/ledger/complete/ledger.go b/ledger/complete/ledger.go index e89ff0a6b53..be2b76d25d8 100644 --- a/ledger/complete/ledger.go +++ b/ledger/complete/ledger.go @@ -200,8 +200,8 @@ func (l *Ledger) Get(query *ledger.Query) (values []ledger.Value, err error) { return values, err } -// Set updates the ledger given an update -// it returns the state after update and errors (if any) +// Set updates the ledger given an update. +// It returns the state after update and errors (if any) func (l *Ledger) Set(update *ledger.Update) (newState ledger.State, trieUpdate *ledger.TrieUpdate, err error) { start := time.Now() @@ -244,17 +244,21 @@ func (l *Ledger) Set(update *ledger.Update) (newState ledger.State, trieUpdate * func (l *Ledger) set(trieUpdate *ledger.TrieUpdate) (newState ledger.State, err error) { + // resultCh is a buffered channel to receive WAL update result. resultCh := make(chan error, 1) + // trieCh is a buffered channel to send updated trie. + // trieCh can be closed without sending updated trie to indicate failure to update trie. trieCh := make(chan *trie.MTrie, 1) defer close(trieCh) - // These run concurrently in two goroutines: + // There are two goroutines: // 1. writing the trie update to WAL (in Compactor goroutine) // 2. creating a new trie from the trie update (in this goroutine) - // Since writing to WAL is running concurrently, we use resultCh to wait for WAL update result from Compactor. - // Compactor also needs new trie created here - // because Compactor caches new trie to minimize memory foot-print while checkpointing, + // Since writing to WAL is running concurrently, we use resultCh + // to receive WAL update result from Compactor. + // Compactor also needs new trie created here because Compactor + // caches new trie to minimize memory foot-print while checkpointing. // `trieCh` is used to send created trie to Compactor. l.trieUpdateCh <- &WALTrieUpdate{Update: trieUpdate, ResultCh: resultCh, TrieCh: trieCh} From 22eeb3e14953197e4eed8120b2add8d32a9ed29a Mon Sep 17 00:00:00 2001 From: Faye Amacker <33205765+fxamacker@users.noreply.github.com> Date: Sun, 31 Jul 2022 15:47:03 -0500 Subject: [PATCH 28/32] Only retry checkpointing on appropriate error Asynchronous checkpointing can return createCheckpointError and removeCheckpointError. Only retry checkpointing if createCheckpointError is returned. Log and ignore removeCheckpointError. --- ledger/complete/compactor.go | 39 +++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/ledger/complete/compactor.go b/ledger/complete/compactor.go index eaee362fa0f..44c81a2bdcd 100644 --- a/ledger/complete/compactor.go +++ b/ledger/complete/compactor.go @@ -203,12 +203,14 @@ Loop: case checkpointResult := <-checkpointResultCh: if checkpointResult.err != nil { - c.logger.Error().Err(checkpointResult.err).Msgf( - "compactor failed to checkpoint %d", checkpointResult.num, + c.logger.Error().Err(checkpointResult.err).Msg( + "compactor failed to create or remove checkpoint", ) - - // Retry checkpointing when active segment is finalized. - nextCheckpointNum = activeSegmentNum + var createError *createCheckpointError + if errors.As(checkpointResult.err, &createError) { + // Retry checkpointing when active segment is finalized. 
+ nextCheckpointNum = activeSegmentNum + } } case update, ok := <-c.trieUpdateCh: @@ -271,7 +273,7 @@ func (c *Compactor) checkpoint(ctx context.Context, tries []*trie.MTrie, checkpo err := createCheckpoint(c.checkpointer, c.logger, tries, checkpointNum) if err != nil { - return fmt.Errorf("cannot create checkpoints: %w", err) + return &createCheckpointError{num: checkpointNum, err: err} } // Return if context is canceled. @@ -283,7 +285,7 @@ func (c *Compactor) checkpoint(ctx context.Context, tries []*trie.MTrie, checkpo err = cleanupCheckpoints(c.checkpointer, int(c.checkpointsToKeep)) if err != nil { - return fmt.Errorf("cannot cleanup checkpoints: %w", err) + return &removeCheckpointError{err: err} } if checkpointNum > 0 { @@ -438,3 +440,26 @@ func (c *Compactor) processTrieUpdate( return activeSegmentNum, checkpointNum, tries } + +// createCheckpointError creates a checkpoint creation error. +type createCheckpointError struct { + num int + err error +} + +func (e *createCheckpointError) Error() string { + return fmt.Sprintf("cannot create checkpoint %d: %s", e.num, e.err) +} + +func (e *createCheckpointError) Unwrap() error { return e.err } + +// removeCheckpointError creates a checkpoint removal error. +type removeCheckpointError struct { + err error +} + +func (e *removeCheckpointError) Error() string { + return fmt.Sprintf("cannot cleanup checkpoints: %s", e.err) +} + +func (e *removeCheckpointError) Unwrap() error { return e.err } From f7746174e19d884226aefa82dcda639c255b8a7c Mon Sep 17 00:00:00 2001 From: Faye Amacker <33205765+fxamacker@users.noreply.github.com> Date: Sun, 31 Jul 2022 17:08:40 -0500 Subject: [PATCH 29/32] Remove unnecessary branch in TrieQueue --- ledger/complete/wal/triequeue.go | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/ledger/complete/wal/triequeue.go b/ledger/complete/wal/triequeue.go index c66d8c231a9..dd63467f48d 100644 --- a/ledger/complete/wal/triequeue.go +++ b/ledger/complete/wal/triequeue.go @@ -61,22 +61,13 @@ func (q *TrieQueue) Tries() []*trie.MTrie { tries := make([]*trie.MTrie, q.count) - if q.isFull() { - // If queue is full, tail points to the oldest element. - head := q.tail + if q.tail >= q.count { // Data isn't wrapped around the slice. + head := q.tail - q.count + copy(tries, q.ts[head:q.tail]) + } else { // q.tail < q.count, data is wrapped around the slice. + head := q.capacity - q.count + q.tail n := copy(tries, q.ts[head:]) copy(tries[n:], q.ts[:q.tail]) - } else { - if q.tail >= q.count { // Data isn't wrapped around the slice. - head := q.tail - q.count - copy(tries, q.ts[head:q.tail]) - } else { // q.tail < q.count, data is wrapped around the slice. - // This branch isn't used until TrieQueue supports Pop (removing oldest element). - // At this time, there is no reason to implement Pop, so this branch is here to prevent future bug. - head := q.capacity - q.count + q.tail - n := copy(tries, q.ts[head:]) - copy(tries[n:], q.ts[:q.tail]) - } } return tries From ec132df8b545c0bf53ca929c28835139bfac0efd Mon Sep 17 00:00:00 2001 From: "Ramtin M. 
Seraj" Date: Tue, 2 Aug 2022 09:11:48 -0700 Subject: [PATCH 30/32] [Ledger] Replace LRU cache with a FIFO queue (circular buffer) (#2893) --- ledger/complete/checkpoint_benchmark_test.go | 1 - ledger/complete/ledger.go | 26 +-- ledger/complete/ledger_test.go | 5 - ledger/complete/mtrie/forest.go | 96 +++------- ledger/complete/mtrie/forest_test.go | 52 +++++- ledger/complete/mtrie/trieCache.go | 144 +++++++++++++++ ledger/complete/mtrie/trieCache_test.go | 185 +++++++++++++++++++ ledger/complete/wal/wal.go | 1 - 8 files changed, 408 insertions(+), 102 deletions(-) create mode 100644 ledger/complete/mtrie/trieCache.go create mode 100644 ledger/complete/mtrie/trieCache_test.go diff --git a/ledger/complete/checkpoint_benchmark_test.go b/ledger/complete/checkpoint_benchmark_test.go index 859b4d631cd..4ccfec49e21 100644 --- a/ledger/complete/checkpoint_benchmark_test.go +++ b/ledger/complete/checkpoint_benchmark_test.go @@ -145,7 +145,6 @@ func BenchmarkLoadCheckpointAndWALs(b *testing.B) { return err }, func(rootHash ledger.RootHash) error { - forest.RemoveTrie(rootHash) return nil }, ) diff --git a/ledger/complete/ledger.go b/ledger/complete/ledger.go index be2b76d25d8..d0e22ca293a 100644 --- a/ledger/complete/ledger.go +++ b/ledger/complete/ledger.go @@ -32,7 +32,7 @@ const defaultTrieUpdateChanSize = 500 // Ledger is fork-aware which means any update can be applied at any previous state which forms a tree of tries (forest). // The forest is in memory but all changes (e.g. register updates) are captured inside write-ahead-logs for crash recovery reasons. // In order to limit the memory usage and maintain the performance storage only keeps a limited number of -// tries and purge the old ones (LRU-based); in other words, Ledger is not designed to be used +// tries and purge the old ones (FIFO-based); in other words, Ledger is not designed to be used // for archival usage but make it possible for other software components to reconstruct very old tries using write-ahead logs. type Ledger struct { forest *mtrie.Forest @@ -53,12 +53,7 @@ func NewLedger( logger := log.With().Str("ledger", "complete").Logger() - forest, err := mtrie.NewForest(capacity, metrics, func(evictedTrie *trie.MTrie) { - err := wal.RecordDelete(evictedTrie.RootHash()) - if err != nil { - logger.Error().Err(err).Msg("failed to save delete record in wal") - } - }) + forest, err := mtrie.NewForest(capacity, metrics, nil) if err != nil { return nil, fmt.Errorf("cannot create forest: %w", err) } @@ -358,7 +353,7 @@ func (l *Ledger) ExportCheckpointAt( Str("hash", rh.String()). 
Msgf("Most recently touched root hash.") return ledger.State(hash.DummyHash), - fmt.Errorf("cannot get try at the given state commitment: %w", err) + fmt.Errorf("cannot get trie at the given state commitment: %w", err) } // clean up tries to release memory @@ -505,20 +500,7 @@ func (l *Ledger) keepOnlyOneTrie(state ledger.State) error { // don't write things to WALs l.wal.PauseRecord() defer l.wal.UnpauseRecord() - - allTries, err := l.forest.GetTries() - if err != nil { - return err - } - - targetRootHash := ledger.RootHash(state) - for _, trie := range allTries { - trieRootHash := trie.RootHash() - if trieRootHash != targetRootHash { - l.forest.RemoveTrie(trieRootHash) - } - } - return nil + return l.forest.PurgeCacheExcept(ledger.RootHash(state)) } func runReport(r ledger.Reporter, p []ledger.Payload, commit ledger.State, l zerolog.Logger) error { diff --git a/ledger/complete/ledger_test.go b/ledger/complete/ledger_test.go index 408c61e6a6d..60f4517e28c 100644 --- a/ledger/complete/ledger_test.go +++ b/ledger/complete/ledger_test.go @@ -532,7 +532,6 @@ func Test_WAL(t *testing.T) { assert.NoError(t, err) state, _, err = led.Set(update) require.NoError(t, err) - fmt.Printf("Updated with %x\n", state) data := make(map[string]ledger.Value, len(keys)) for j, key := range keys { @@ -581,10 +580,6 @@ func Test_WAL(t *testing.T) { } } - // test deletion - s := led2.ForestSize() - assert.Equal(t, s, size) - <-led2.Done() <-compactor2.Done() }) diff --git a/ledger/complete/mtrie/forest.go b/ledger/complete/mtrie/forest.go index 43a5809f199..e43891af8e6 100644 --- a/ledger/complete/mtrie/forest.go +++ b/ledger/complete/mtrie/forest.go @@ -1,11 +1,8 @@ package mtrie import ( - "errors" "fmt" - lru "github.com/hashicorp/golang-lru" - "github.com/onflow/flow-go/ledger" "github.com/onflow/flow-go/ledger/common/hash" "github.com/onflow/flow-go/ledger/complete/mtrie/trie" @@ -27,7 +24,7 @@ type Forest struct { // tries stores all MTries in the forest. It is NOT a CACHE in the conventional sense: // there is no mechanism to load a trie from disk in case of a cache miss. Missing a // needed trie in the forest might cause a fatal application logic error. - tries *lru.Cache + tries *TrieCache forestCapacity int onTreeEvicted func(tree *trie.MTrie) metrics module.LedgerMetrics @@ -36,30 +33,11 @@ type Forest struct { // NewForest returns a new instance of memory forest. // // CAUTION on forestCapacity: the specified capacity MUST be SUFFICIENT to store all needed MTries in the forest. -// If more tries are added than the capacity, the Least Recently Used trie is removed (evicted) from the Forest. -// THIS IS A ROUGH HEURISTIC as it might evict tries that are still needed. +// If more tries are added than the capacity, the Least Recently Added trie is removed (evicted) from the Forest (FIFO queue). // Make sure you chose a sufficiently large forestCapacity, such that, when reaching the capacity, the -// Least Recently Used trie will never be needed again. +// Least Recently Added trie will never be needed again. 
func NewForest(forestCapacity int, metrics module.LedgerMetrics, onTreeEvicted func(tree *trie.MTrie)) (*Forest, error) { - // init LRU cache as a SHORTCUT for a usage-related storage eviction policy - var cache *lru.Cache - var err error - if onTreeEvicted != nil { - cache, err = lru.NewWithEvict(forestCapacity, func(key interface{}, value interface{}) { - trie, ok := value.(*trie.MTrie) - if !ok { - panic(fmt.Sprintf("cache contains item of type %T", value)) - } - onTreeEvicted(trie) - }) - } else { - cache, err = lru.New(forestCapacity) - } - if err != nil { - return nil, fmt.Errorf("cannot create forest cache: %w", err) - } - - forest := &Forest{tries: cache, + forest := &Forest{tries: NewTrieCache(uint(forestCapacity), onTreeEvicted), forestCapacity: forestCapacity, onTreeEvicted: onTreeEvicted, metrics: metrics, @@ -67,7 +45,7 @@ func NewForest(forestCapacity int, metrics module.LedgerMetrics, onTreeEvicted f // add trie with no allocated registers emptyTrie := trie.NewEmptyMTrie() - err = forest.AddTrie(emptyTrie) + err := forest.AddTrie(emptyTrie) if err != nil { return nil, fmt.Errorf("adding empty trie to forest failed: %w", err) } @@ -333,11 +311,7 @@ func (f *Forest) HasTrie(rootHash ledger.RootHash) bool { // warning, use this function for read-only operation func (f *Forest) GetTrie(rootHash ledger.RootHash) (*trie.MTrie, error) { // if in memory - if ent, found := f.tries.Get(rootHash); found { - trie, ok := ent.(*trie.MTrie) - if !ok { - return nil, fmt.Errorf("forest contains an element of a wrong type") - } + if trie, found := f.tries.Get(rootHash); found { return trie, nil } return nil, fmt.Errorf("trie with the given rootHash %s not found", rootHash) @@ -345,21 +319,7 @@ func (f *Forest) GetTrie(rootHash ledger.RootHash) (*trie.MTrie, error) { // GetTries returns list of currently cached tree root hashes func (f *Forest) GetTries() ([]*trie.MTrie, error) { - // ToDo needs concurrency safety - keys := f.tries.Keys() - tries := make([]*trie.MTrie, len(keys)) - for i, key := range keys { - t, ok := f.tries.Get(key) - if !ok { - return nil, errors.New("concurrent Forest modification") - } - trie, ok := t.(*trie.MTrie) - if !ok { - return nil, errors.New("forest contains an element of a wrong type") - } - tries[i] = trie - } - return tries, nil + return f.tries.Tries(), nil } // AddTries adds a trie to the forest @@ -381,29 +341,16 @@ func (f *Forest) AddTrie(newTrie *trie.MTrie) error { // TODO: check Thread safety rootHash := newTrie.RootHash() - if storedTrie, found := f.tries.Get(rootHash); found { - trie, ok := storedTrie.(*trie.MTrie) - if !ok { - return fmt.Errorf("forest contains an element of a wrong type") - } - if trie.Equals(newTrie) { - return nil - } - return fmt.Errorf("forest already contains a tree with same root hash but other properties") + if _, found := f.tries.Get(rootHash); found { + // do no op + return nil } - f.tries.Add(rootHash, newTrie) - f.metrics.ForestNumberOfTrees(uint64(f.tries.Len())) + f.tries.Push(newTrie) + f.metrics.ForestNumberOfTrees(uint64(f.tries.Count())) return nil } -// RemoveTrie removes a trie to the forest -func (f *Forest) RemoveTrie(rootHash ledger.RootHash) { - // TODO remove from the file as well - f.tries.Remove(rootHash) - f.metrics.ForestNumberOfTrees(uint64(f.tries.Len())) -} - // GetEmptyRootHash returns the rootHash of empty Trie func (f *Forest) GetEmptyRootHash() ledger.RootHash { return trie.EmptyTrieRootHash() @@ -411,14 +358,25 @@ func (f *Forest) GetEmptyRootHash() ledger.RootHash { // MostRecentTouchedRootHash 
returns the rootHash of the most recently touched trie func (f *Forest) MostRecentTouchedRootHash() (ledger.RootHash, error) { - keys := f.tries.Keys() - if len(keys) > 0 { - return keys[len(keys)-1].(ledger.RootHash), nil + trie := f.tries.LastAddedTrie() + if trie != nil { + return trie.RootHash(), nil } return ledger.RootHash(hash.DummyHash), fmt.Errorf("no trie is stored in the forest") } +// PurgeCacheExcept removes all tries in the memory except the one with the given root hash +func (f *Forest) PurgeCacheExcept(rootHash ledger.RootHash) error { + trie, found := f.tries.Get(rootHash) + if !found { + return fmt.Errorf("trie with the given root hash not found") + } + f.tries.Purge() + f.tries.Push(trie) + return nil +} + // Size returns the number of active tries in this store func (f *Forest) Size() int { - return f.tries.Len() + return f.tries.Count() } diff --git a/ledger/complete/mtrie/forest_test.go b/ledger/complete/mtrie/forest_test.go index 78d36d3d48c..bfc4e72084d 100644 --- a/ledger/complete/mtrie/forest_test.go +++ b/ledger/complete/mtrie/forest_test.go @@ -41,10 +41,6 @@ func TestTrieOperations(t *testing.T) { require.NoError(t, err) require.Equal(t, retnt.RootHash(), updatedTrie.RootHash()) require.Equal(t, 2, forest.Size()) - - // Remove trie - forest.RemoveTrie(updatedTrie.RootHash()) - require.Equal(t, 1, forest.Size()) } // TestTrieUpdate updates the empty trie with some values and verifies that the @@ -1072,3 +1068,51 @@ func TestNow(t *testing.T) { require.Equal(t, updatedRoot, updatedRoot2) require.Equal(t, 2, size) } + +func TestPurgeCacheExcept(t *testing.T) { + forest, err := NewForest(5, &metrics.NoopCollector{}, nil) + require.NoError(t, err) + + nt := trie.NewEmptyMTrie() + p1 := pathByUint8s([]uint8{uint8(53), uint8(74)}) + v1 := payloadBySlices([]byte{'A'}, []byte{'A'}) + + updatedTrie1, _, err := trie.NewTrieWithUpdatedRegisters(nt, []ledger.Path{p1}, []ledger.Payload{*v1}, true) + require.NoError(t, err) + + err = forest.AddTrie(updatedTrie1) + require.NoError(t, err) + + p2 := pathByUint8s([]uint8{uint8(12), uint8(34)}) + v2 := payloadBySlices([]byte{'B'}, []byte{'B'}) + + updatedTrie2, _, err := trie.NewTrieWithUpdatedRegisters(nt, []ledger.Path{p2}, []ledger.Payload{*v2}, true) + require.NoError(t, err) + + err = forest.AddTrie(updatedTrie2) + require.NoError(t, err) + + require.Equal(t, 3, forest.tries.Count()) + forest.PurgeCacheExcept(updatedTrie2.RootHash()) + + require.Equal(t, 1, forest.tries.Count()) + ret, err := forest.GetTrie(updatedTrie2.RootHash()) + require.NoError(t, err) + require.Equal(t, ret, updatedTrie2) + + _, err = forest.GetTrie(updatedTrie1.RootHash()) + require.Error(t, err) + + // test purge when only a single target trie exist there + forest.PurgeCacheExcept(updatedTrie1.RootHash()) + ret, err = forest.GetTrie(updatedTrie2.RootHash()) + require.NoError(t, err) + require.Equal(t, ret, updatedTrie2) + + _, err = forest.GetTrie(updatedTrie1.RootHash()) + require.Error(t, err) + + // purge with non existing trie + forest.PurgeCacheExcept(updatedTrie2.RootHash()) + require.Error(t, err) +} diff --git a/ledger/complete/mtrie/trieCache.go b/ledger/complete/mtrie/trieCache.go new file mode 100644 index 00000000000..1597f94ea68 --- /dev/null +++ b/ledger/complete/mtrie/trieCache.go @@ -0,0 +1,144 @@ +package mtrie + +import ( + "sync" + + "github.com/onflow/flow-go/ledger" + "github.com/onflow/flow-go/ledger/complete/mtrie/trie" +) + +type OnTreeEvictedFunc func(tree *trie.MTrie) + +// TrieCache caches tries into memory, it acts as a 
fifo queue +// so when it reaches to the capacity it would evict the oldest trie +// from the cache. +// +// Under the hood it uses a circular buffer +// of mtrie pointers and a map of rootHash to cache index for fast lookup +type TrieCache struct { + tries []*trie.MTrie + lookup map[ledger.RootHash]int // index to item + lock sync.RWMutex + capacity int + tail int // element index to write to + count int // number of elements (count <= capacity) + onTreeEvicted OnTreeEvictedFunc +} + +// NewTrieCache returns a new TrieCache with given capacity. +func NewTrieCache(capacity uint, onTreeEvicted OnTreeEvictedFunc) *TrieCache { + return &TrieCache{ + tries: make([]*trie.MTrie, capacity), + lookup: make(map[ledger.RootHash]int, capacity), + lock: sync.RWMutex{}, + capacity: int(capacity), + tail: 0, + count: 0, + onTreeEvicted: onTreeEvicted, + } +} + +// Purge removes all mtries stored in the buffer +func (tc *TrieCache) Purge() { + tc.lock.Lock() + defer tc.lock.Unlock() + + if tc.count == 0 { + return + } + + toEvict := 0 + for i := 0; i < tc.capacity; i++ { + toEvict = (tc.tail + i) % tc.capacity + if tc.onTreeEvicted != nil { + if tc.tries[toEvict] != nil { + tc.onTreeEvicted(tc.tries[toEvict]) + } + } + tc.tries[toEvict] = nil + } + tc.tail = 0 + tc.count = 0 + tc.lookup = make(map[ledger.RootHash]int, tc.capacity) +} + +// Tries returns elements in queue, starting from the oldest element +// to the newest element. +func (tc *TrieCache) Tries() []*trie.MTrie { + tc.lock.RLock() + defer tc.lock.RUnlock() + + if tc.count == 0 { + return nil + } + + tries := make([]*trie.MTrie, tc.count) + + if tc.tail >= tc.count { // Data isn't wrapped around the slice. + head := tc.tail - tc.count + copy(tries, tc.tries[head:tc.tail]) + } else { // q.tail < q.count, data is wrapped around the slice. + // This branch isn't used until TrieQueue supports Pop (removing oldest element). + // At this time, there is no reason to implement Pop, so this branch is here to prevent future bug. + head := tc.capacity - tc.count + tc.tail + n := copy(tries, tc.tries[head:]) + copy(tries[n:], tc.tries[:tc.tail]) + } + + return tries +} + +// Push pushes trie to queue. If queue is full, it overwrites the oldest element. 
+func (tc *TrieCache) Push(t *trie.MTrie) { + tc.lock.Lock() + defer tc.lock.Unlock() + + // if its full + if tc.count == tc.capacity { + oldtrie := tc.tries[tc.tail] + if tc.onTreeEvicted != nil { + tc.onTreeEvicted(oldtrie) + } + delete(tc.lookup, oldtrie.RootHash()) + tc.count-- // so when we increment at the end of method we don't go beyond capacity + } + tc.tries[tc.tail] = t + tc.lookup[t.RootHash()] = tc.tail + tc.tail = (tc.tail + 1) % tc.capacity + tc.count++ +} + +// LastAddedTrie returns the last trie added to the cache +func (tc *TrieCache) LastAddedTrie() *trie.MTrie { + tc.lock.RLock() + defer tc.lock.RUnlock() + + if tc.count == 0 { + return nil + } + indx := tc.tail - 1 + if indx < 0 { + indx = tc.capacity - 1 + } + return tc.tries[indx] +} + +// Get returns the trie by rootHash, if not exist will return nil and false +func (tc *TrieCache) Get(rootHash ledger.RootHash) (*trie.MTrie, bool) { + tc.lock.RLock() + defer tc.lock.RUnlock() + + idx, found := tc.lookup[rootHash] + if !found { + return nil, false + } + return tc.tries[idx], true +} + +// Count returns number of items stored in the cache +func (tc *TrieCache) Count() int { + tc.lock.RLock() + defer tc.lock.RUnlock() + + return tc.count +} diff --git a/ledger/complete/mtrie/trieCache_test.go b/ledger/complete/mtrie/trieCache_test.go new file mode 100644 index 00000000000..df01688d627 --- /dev/null +++ b/ledger/complete/mtrie/trieCache_test.go @@ -0,0 +1,185 @@ +package mtrie + +// test addition +// test under capacity +// test on capacity +// test across boundry + +import ( + "math/rand" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/onflow/flow-go/ledger" + "github.com/onflow/flow-go/ledger/common/hash" + "github.com/onflow/flow-go/ledger/complete/mtrie/node" + "github.com/onflow/flow-go/ledger/complete/mtrie/trie" + "github.com/onflow/flow-go/utils/unittest" +) + +func TestTrieCache(t *testing.T) { + const capacity = 10 + + tc := NewTrieCache(capacity, nil) + require.Equal(t, 0, tc.Count()) + + tries := tc.Tries() + require.Equal(t, 0, len(tries)) + require.Equal(t, 0, tc.Count()) + require.Equal(t, 0, len(tc.lookup)) + + // savedTries contains all tries that are pushed to queue + var savedTries []*trie.MTrie + + // Push tries to queue to fill out capacity + for i := 0; i < capacity; i++ { + trie, err := randomMTrie() + require.NoError(t, err) + + tc.Push(trie) + + savedTries = append(savedTries, trie) + + tr := tc.Tries() + require.Equal(t, savedTries, tr) + require.Equal(t, len(savedTries), tc.Count()) + require.Equal(t, len(savedTries), len(tc.lookup)) + + retTrie, found := tc.Get(trie.RootHash()) + require.Equal(t, retTrie, trie) + require.True(t, found) + + // check last added trie functionality + retTrie = tc.LastAddedTrie() + require.Equal(t, retTrie, trie) + } + + // Push more tries to queue to overwrite older elements + for i := 0; i < capacity; i++ { + trie, err := randomMTrie() + require.NoError(t, err) + + tc.Push(trie) + + savedTries = append(savedTries, trie) + + tr := tc.Tries() + require.Equal(t, capacity, len(tr)) + + // After queue reaches capacity in previous loop, + // queue overwrites older elements with new insertions, + // and element count is its capacity value. + // savedTries contains all elements inserted from previous loop and current loop, so + // tr (queue snapshot) matches the last C elements in savedTries (where C is capacity). 
+ require.Equal(t, savedTries[len(savedTries)-capacity:], tr) + require.Equal(t, capacity, tc.Count()) + require.Equal(t, capacity, len(tc.lookup)) + + // check the trie is lookable + retTrie, found := tc.Get(trie.RootHash()) + require.Equal(t, retTrie, trie) + require.True(t, found) + + // check the last evicted value is not kept + retTrie, found = tc.Get(savedTries[len(savedTries)-capacity-1].RootHash()) + require.Nil(t, retTrie) + require.False(t, found) + + // check last added trie functionality + retTrie = tc.LastAddedTrie() + require.Equal(t, retTrie, trie) + } + +} + +func TestPurge(t *testing.T) { + const capacity = 5 + + trie1, err := randomMTrie() + require.NoError(t, err) + trie2, err := randomMTrie() + require.NoError(t, err) + trie3, err := randomMTrie() + require.NoError(t, err) + + called := 0 + tc := NewTrieCache(capacity, func(tree *trie.MTrie) { + switch called { + case 0: + require.Equal(t, trie1, tree) + case 1: + require.Equal(t, trie2, tree) + case 2: + require.Equal(t, trie3, tree) + } + called++ + + }) + tc.Push(trie1) + tc.Push(trie2) + tc.Push(trie3) + + tc.Purge() + require.Equal(t, 0, tc.Count()) + require.Equal(t, 0, tc.tail) + require.Equal(t, 0, len(tc.lookup)) + + require.Equal(t, 3, called) +} + +func TestEvictCallBack(t *testing.T) { + const capacity = 2 + + trie1, err := randomMTrie() + require.NoError(t, err) + + called := false + tc := NewTrieCache(capacity, func(tree *trie.MTrie) { + called = true + require.Equal(t, trie1, tree) + }) + tc.Push(trie1) + + trie2, err := randomMTrie() + require.NoError(t, err) + tc.Push(trie2) + + trie3, err := randomMTrie() + require.NoError(t, err) + tc.Push(trie3) + + require.True(t, called) +} + +func TestConcurrentAccess(t *testing.T) { + + const worker = 50 + const capacity = 100 // large enough to not worry evicts + + tc := NewTrieCache(capacity, nil) + + unittest.Concurrently(worker, func(i int) { + trie, err := randomMTrie() + require.NoError(t, err) + tc.Push(trie) + + ret, found := tc.Get(trie.RootHash()) + require.True(t, found) + require.Equal(t, trie, ret) + }) + + require.Equal(t, worker, tc.Count()) +} + +func randomMTrie() (*trie.MTrie, error) { + var randomPath ledger.Path + rand.Read(randomPath[:]) + + var randomHashValue hash.Hash + rand.Read(randomHashValue[:]) + + root := node.NewNode(256, nil, nil, randomPath, nil, randomHashValue) + + return trie.NewMTrie(root, 1, 1) +} diff --git a/ledger/complete/wal/wal.go b/ledger/complete/wal/wal.go index 888f101cdd7..9e287a58950 100644 --- a/ledger/complete/wal/wal.go +++ b/ledger/complete/wal/wal.go @@ -100,7 +100,6 @@ func (w *DiskWAL) ReplayOnForest(forest *mtrie.Forest) error { return err }, func(rootHash ledger.RootHash) error { - forest.RemoveTrie(rootHash) return nil }, ) From 711deaacd7a375e0e017f226646a917a1c84241c Mon Sep 17 00:00:00 2001 From: Faye Amacker <33205765+fxamacker@users.noreply.github.com> Date: Tue, 2 Aug 2022 11:44:08 -0500 Subject: [PATCH 31/32] Fix lint errors --- cmd/util/cmd/export-json-execution-state/cmd.go | 1 - ledger/complete/mtrie/forest_test.go | 11 +++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/cmd/util/cmd/export-json-execution-state/cmd.go b/cmd/util/cmd/export-json-execution-state/cmd.go index 0af67ca6ca5..f06a38c628a 100644 --- a/cmd/util/cmd/export-json-execution-state/cmd.go +++ b/cmd/util/cmd/export-json-execution-state/cmd.go @@ -140,7 +140,6 @@ func ExportLedger(ledgerPath string, targetstate string, outputPath string, full return err }, func(rootHash ledger.RootHash) error { - 
forest.RemoveTrie(rootHash) return nil }, ) diff --git a/ledger/complete/mtrie/forest_test.go b/ledger/complete/mtrie/forest_test.go index bfc4e72084d..d3d2e25925e 100644 --- a/ledger/complete/mtrie/forest_test.go +++ b/ledger/complete/mtrie/forest_test.go @@ -1091,9 +1091,10 @@ func TestPurgeCacheExcept(t *testing.T) { err = forest.AddTrie(updatedTrie2) require.NoError(t, err) - require.Equal(t, 3, forest.tries.Count()) - forest.PurgeCacheExcept(updatedTrie2.RootHash()) + + err = forest.PurgeCacheExcept(updatedTrie2.RootHash()) + require.NoError(t, err) require.Equal(t, 1, forest.tries.Count()) ret, err := forest.GetTrie(updatedTrie2.RootHash()) @@ -1104,7 +1105,9 @@ func TestPurgeCacheExcept(t *testing.T) { require.Error(t, err) // test purge when only a single target trie exist there - forest.PurgeCacheExcept(updatedTrie1.RootHash()) + err = forest.PurgeCacheExcept(updatedTrie1.RootHash()) + require.Error(t, err) + ret, err = forest.GetTrie(updatedTrie2.RootHash()) require.NoError(t, err) require.Equal(t, ret, updatedTrie2) @@ -1113,6 +1116,6 @@ func TestPurgeCacheExcept(t *testing.T) { require.Error(t, err) // purge with non existing trie - forest.PurgeCacheExcept(updatedTrie2.RootHash()) + err = forest.PurgeCacheExcept(updatedTrie2.RootHash()) require.Error(t, err) } From c7003898e7d5fee6febf9b16aa54594fd0b2af25 Mon Sep 17 00:00:00 2001 From: Faye Amacker <33205765+fxamacker@users.noreply.github.com> Date: Tue, 2 Aug 2022 12:17:46 -0500 Subject: [PATCH 32/32] Fix PurgeCacheExcept test --- ledger/complete/mtrie/forest_test.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/ledger/complete/mtrie/forest_test.go b/ledger/complete/mtrie/forest_test.go index d3d2e25925e..a7be2ae21da 100644 --- a/ledger/complete/mtrie/forest_test.go +++ b/ledger/complete/mtrie/forest_test.go @@ -1095,8 +1095,8 @@ func TestPurgeCacheExcept(t *testing.T) { err = forest.PurgeCacheExcept(updatedTrie2.RootHash()) require.NoError(t, err) - require.Equal(t, 1, forest.tries.Count()) + ret, err := forest.GetTrie(updatedTrie2.RootHash()) require.NoError(t, err) require.Equal(t, ret, updatedTrie2) @@ -1104,7 +1104,7 @@ func TestPurgeCacheExcept(t *testing.T) { _, err = forest.GetTrie(updatedTrie1.RootHash()) require.Error(t, err) - // test purge when only a single target trie exist there + // test purge with non existing trie err = forest.PurgeCacheExcept(updatedTrie1.RootHash()) require.Error(t, err) @@ -1115,7 +1115,8 @@ func TestPurgeCacheExcept(t *testing.T) { _, err = forest.GetTrie(updatedTrie1.RootHash()) require.Error(t, err) - // purge with non existing trie + // test purge when only a single target trie exist there err = forest.PurgeCacheExcept(updatedTrie2.RootHash()) - require.Error(t, err) + require.NoError(t, err) + require.Equal(t, 1, forest.tries.Count()) }
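
The retry decision introduced in PATCH 28 hinges on Go's error wrapping: the compactor only re-schedules a checkpoint when errors.As finds a create-checkpoint error in the chain, and merely logs the removal error. A minimal standalone sketch of that pattern (hypothetical names, not the flow-go types) looks like this:

package main

import (
	"errors"
	"fmt"
)

// createCheckpointErr mimics the wrapped error type used to decide whether a
// failed checkpoint should be retried (simplified, standalone sketch).
type createCheckpointErr struct {
	num int
	err error
}

func (e *createCheckpointErr) Error() string {
	return fmt.Sprintf("cannot create checkpoint %d: %s", e.num, e.err)
}

func (e *createCheckpointErr) Unwrap() error { return e.err }

func main() {
	err := fmt.Errorf("checkpointing failed: %w",
		&createCheckpointErr{num: 7, err: errors.New("disk full")})

	// errors.As walks the wrap chain, so the retry path is taken even when
	// the checkpoint error is wrapped by another error on the way up.
	var ce *createCheckpointErr
	if errors.As(err, &ce) {
		fmt.Println("retry checkpoint", ce.num)
	} else {
		fmt.Println("log and ignore")
	}
}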
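
PATCH 30's TrieCache (like the TrieQueue it mirrors) is a fixed-capacity circular buffer: tail is the next write slot, count tracks how many slots are filled, and taking a snapshot has to handle the case where the live window wraps around the end of the backing slice. The following self-contained sketch reproduces the same index arithmetic over plain strings (assumed, simplified types; it is not the flow-go API):

package main

import "fmt"

// ringBuffer is a minimal FIFO queue over a fixed slice, mirroring the
// tail/count bookkeeping used by TrieCache and TrieQueue above.
type ringBuffer struct {
	items    []string
	capacity int
	tail     int // next write position
	count    int // number of stored elements (count <= capacity)
}

func newRingBuffer(capacity int) *ringBuffer {
	return &ringBuffer{items: make([]string, capacity), capacity: capacity}
}

// push overwrites the oldest element once the buffer is full.
func (r *ringBuffer) push(s string) {
	if r.count == r.capacity {
		r.count-- // the slot at r.tail holds the oldest element; it is overwritten
	}
	r.items[r.tail] = s
	r.tail = (r.tail + 1) % r.capacity
	r.count++
}

// snapshot returns elements oldest-first, handling the wrapped case.
func (r *ringBuffer) snapshot() []string {
	out := make([]string, r.count)
	if r.tail >= r.count { // data is contiguous: [head, tail)
		head := r.tail - r.count
		copy(out, r.items[head:r.tail])
	} else { // data wraps around the end of the slice
		head := r.capacity - r.count + r.tail
		n := copy(out, r.items[head:])
		copy(out[n:], r.items[:r.tail])
	}
	return out
}

func main() {
	rb := newRingBuffer(3)
	for _, s := range []string{"t1", "t2", "t3", "t4"} { // t4 evicts t1
		rb.push(s)
	}
	fmt.Println(rb.snapshot()) // [t2 t3 t4]
}

Pushing a fourth element into a capacity-3 buffer overwrites the oldest entry, and the snapshot still comes back oldest-first ([t2 t3 t4]); that oldest-first ordering is the property the compactor relies on when it hands trieQueue.Tries() to the checkpointer.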