From 3860bc9c6c74ca4f84ff2f12d6c3521b3df99df2 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 24 May 2024 09:33:19 +0100 Subject: [PATCH] pageserver: post-shard-split layer rewrites (2/2) (#7531) ## Problem - After a shard split of a large existing tenant, child tenants can end up with oversized historic layers indefinitely, if those layers are prevented from being GC'd by branchpoints. This PR follows https://github.com/neondatabase/neon/pull/7531, and adds rewriting of layers that contain a mixture of needed & un-needed contents, in addition to dropping un-needed layers. Closes: https://github.com/neondatabase/neon/issues/7504 ## Summary of changes - Add methods to ImageLayer for reading back existing layers - Extend `compact_shard_ancestors` to rewrite layer files that contain a mixture of keys that we want and keys we do not, if unwanted keys are the majority of those in the file. - Amend initialization code to handle multiple layers with the same LayerName properly - Get rid of of renaming bad layer files to `.old` since that's now expected on restarts during rewrites. --- .../src/tenant/storage_layer/image_layer.rs | 200 +++++++++++++++++- pageserver/src/tenant/storage_layer/layer.rs | 32 ++- pageserver/src/tenant/timeline.rs | 84 +++----- pageserver/src/tenant/timeline/compaction.rs | 101 +++++++-- pageserver/src/tenant/timeline/init.rs | 189 ++++++++--------- .../src/tenant/timeline/layer_manager.rs | 27 ++- test_runner/fixtures/neon_fixtures.py | 4 +- test_runner/regress/test_sharding.py | 104 ++++++++- 8 files changed, 548 insertions(+), 193 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 67b489ce0d2d..8394b33f1947 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -47,7 +47,7 @@ use hex; use itertools::Itertools; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::LayerAccessKind; -use pageserver_api::shard::TenantShardId; +use pageserver_api::shard::{ShardIdentity, TenantShardId}; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; use std::fs::File; @@ -473,7 +473,7 @@ impl ImageLayerInner { ctx: &RequestContext, ) -> Result<(), GetVectoredError> { let reads = self - .plan_reads(keyspace, ctx) + .plan_reads(keyspace, None, ctx) .await .map_err(GetVectoredError::Other)?; @@ -485,9 +485,15 @@ impl ImageLayerInner { Ok(()) } + /// Traverse the layer's index to build read operations on the overlap of the input keyspace + /// and the keys in this layer. + /// + /// If shard_identity is provided, it will be used to filter keys down to those stored on + /// this shard. async fn plan_reads( &self, keyspace: KeySpace, + shard_identity: Option<&ShardIdentity>, ctx: &RequestContext, ) -> anyhow::Result> { let mut planner = VectoredReadPlanner::new( @@ -507,7 +513,6 @@ impl ImageLayerInner { for range in keyspace.ranges.iter() { let mut range_end_handled = false; - let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; range.start.write_to_byte_slice(&mut search_key); @@ -520,12 +525,22 @@ impl ImageLayerInner { let key = Key::from_slice(&raw_key[..KEY_SIZE]); assert!(key >= range.start); + let flag = if let Some(shard_identity) = shard_identity { + if shard_identity.is_key_disposable(&key) { + BlobFlag::Ignore + } else { + BlobFlag::None + } + } else { + BlobFlag::None + }; + if key >= range.end { planner.handle_range_end(offset); range_end_handled = true; break; } else { - planner.handle(key, self.lsn, offset, BlobFlag::None); + planner.handle(key, self.lsn, offset, flag); } } @@ -538,6 +553,50 @@ impl ImageLayerInner { Ok(planner.finish()) } + /// Given a key range, select the parts of that range that should be retained by the ShardIdentity, + /// then execute vectored GET operations, passing the results of all read keys into the writer. + pub(super) async fn filter( + &self, + shard_identity: &ShardIdentity, + writer: &mut ImageLayerWriter, + ctx: &RequestContext, + ) -> anyhow::Result { + // Fragment the range into the regions owned by this ShardIdentity + let plan = self + .plan_reads( + KeySpace { + // If asked for the total key space, plan_reads will give us all the keys in the layer + ranges: vec![Key::MIN..Key::MAX], + }, + Some(shard_identity), + ctx, + ) + .await?; + + let vectored_blob_reader = VectoredBlobReader::new(&self.file); + let mut key_count = 0; + for read in plan.into_iter() { + let buf_size = read.size(); + + let buf = BytesMut::with_capacity(buf_size); + let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?; + + let frozen_buf = blobs_buf.buf.freeze(); + + for meta in blobs_buf.blobs.iter() { + let img_buf = frozen_buf.slice(meta.start..meta.end); + + key_count += 1; + writer + .put_image(meta.meta.key, img_buf, ctx) + .await + .context(format!("Storing key {}", meta.meta.key))?; + } + } + + Ok(key_count) + } + async fn do_reads_and_update_state( &self, reads: Vec, @@ -855,3 +914,136 @@ impl Drop for ImageLayerWriter { } } } + +#[cfg(test)] +mod test { + use bytes::Bytes; + use pageserver_api::{ + key::Key, + shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}, + }; + use utils::{id::TimelineId, lsn::Lsn}; + + use crate::{tenant::harness::TenantHarness, DEFAULT_PG_VERSION}; + + use super::ImageLayerWriter; + + #[tokio::test] + async fn image_layer_rewrite() { + let harness = TenantHarness::create("test_image_layer_rewrite").unwrap(); + let (tenant, ctx) = harness.load().await; + + // The LSN at which we will create an image layer to filter + let lsn = Lsn(0xdeadbeef0000); + + let timeline_id = TimelineId::generate(); + let timeline = tenant + .create_test_timeline(timeline_id, lsn, DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + // This key range contains several 0x8000 page stripes, only one of which belongs to shard zero + let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap(); + let range = input_start..input_end; + + // Build an image layer to filter + let resident = { + let mut writer = ImageLayerWriter::new( + harness.conf, + timeline_id, + harness.tenant_shard_id, + &range, + lsn, + &ctx, + ) + .await + .unwrap(); + + let foo_img = Bytes::from_static(&[1, 2, 3, 4]); + let mut key = range.start; + while key < range.end { + writer.put_image(key, foo_img.clone(), &ctx).await.unwrap(); + + key = key.next(); + } + writer.finish(&timeline, &ctx).await.unwrap() + }; + let original_size = resident.metadata().file_size; + + // Filter for various shards: this exercises cases like values at start of key range, end of key + // range, middle of key range. + for shard_number in 0..4 { + let mut filtered_writer = ImageLayerWriter::new( + harness.conf, + timeline_id, + harness.tenant_shard_id, + &range, + lsn, + &ctx, + ) + .await + .unwrap(); + + // TenantHarness gave us an unsharded tenant, but we'll use a sharded ShardIdentity + // to exercise filter() + let shard_identity = ShardIdentity::new( + ShardNumber(shard_number), + ShardCount::new(4), + ShardStripeSize(0x8000), + ) + .unwrap(); + + let wrote_keys = resident + .filter(&shard_identity, &mut filtered_writer, &ctx) + .await + .unwrap(); + let replacement = if wrote_keys > 0 { + Some(filtered_writer.finish(&timeline, &ctx).await.unwrap()) + } else { + None + }; + + // This exact size and those below will need updating as/when the layer encoding changes, but + // should be deterministic for a given version of the format, as we used no randomness generating the input. + assert_eq!(original_size, 1597440); + + match shard_number { + 0 => { + // We should have written out just one stripe for our shard identity + assert_eq!(wrote_keys, 0x8000); + let replacement = replacement.unwrap(); + + // We should have dropped some of the data + assert!(replacement.metadata().file_size < original_size); + assert!(replacement.metadata().file_size > 0); + + // Assert that we dropped ~3/4 of the data. + assert_eq!(replacement.metadata().file_size, 417792); + } + 1 => { + // Shard 1 has no keys in our input range + assert_eq!(wrote_keys, 0x0); + assert!(replacement.is_none()); + } + 2 => { + // Shard 2 has one stripes in the input range + assert_eq!(wrote_keys, 0x8000); + let replacement = replacement.unwrap(); + assert!(replacement.metadata().file_size < original_size); + assert!(replacement.metadata().file_size > 0); + assert_eq!(replacement.metadata().file_size, 417792); + } + 3 => { + // Shard 3 has two stripes in the input range + assert_eq!(wrote_keys, 0x10000); + let replacement = replacement.unwrap(); + assert!(replacement.metadata().file_size < original_size); + assert!(replacement.metadata().file_size > 0); + assert_eq!(replacement.metadata().file_size, 811008); + } + _ => unreachable!(), + } + } + } +} diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index b2f3bdb55239..3ac799c69a21 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -4,7 +4,7 @@ use pageserver_api::keyspace::KeySpace; use pageserver_api::models::{ HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus, }; -use pageserver_api::shard::{ShardIndex, TenantShardId}; +use pageserver_api::shard::{ShardIdentity, ShardIndex, TenantShardId}; use std::ops::Range; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::{Arc, Weak}; @@ -23,10 +23,10 @@ use crate::tenant::timeline::GetVectoredError; use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline}; use super::delta_layer::{self, DeltaEntry}; -use super::image_layer; +use super::image_layer::{self}; use super::{ - AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerName, PersistentLayerDesc, - ValueReconstructResult, ValueReconstructState, ValuesReconstructState, + AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName, + PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, ValuesReconstructState, }; use utils::generation::Generation; @@ -1802,16 +1802,15 @@ impl ResidentLayer { use LayerKind::*; let owner = &self.owner.0; - match self.downloaded.get(owner, ctx).await? { Delta(ref d) => { + // this is valid because the DownloadedLayer::kind is a OnceCell, not a + // Mutex, so we cannot go and deinitialize the value with OnceCell::take + // while it's being held. owner .access_stats .record_access(LayerAccessKind::KeyIter, ctx); - // this is valid because the DownloadedLayer::kind is a OnceCell, not a - // Mutex, so we cannot go and deinitialize the value with OnceCell::take - // while it's being held. delta_layer::DeltaLayerInner::load_keys(d, ctx) .await .with_context(|| format!("Layer index is corrupted for {self}")) @@ -1820,6 +1819,23 @@ impl ResidentLayer { } } + /// Read all they keys in this layer which match the ShardIdentity, and write them all to + /// the provided writer. Return the number of keys written. + #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))] + pub(crate) async fn filter<'a>( + &'a self, + shard_identity: &ShardIdentity, + writer: &mut ImageLayerWriter, + ctx: &RequestContext, + ) -> anyhow::Result { + use LayerKind::*; + + match self.downloaded.get(&self.owner.0, ctx).await? { + Delta(_) => anyhow::bail!(format!("cannot filter() on a delta layer {self}")), + Image(i) => i.filter(shard_identity, writer, ctx).await, + } + } + /// Returns the amount of keys and values written to the writer. pub(crate) async fn copy_delta_prefix( &self, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 342fc4fc59e4..1bdbddd95f34 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -41,6 +41,7 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::{ bin_ser::BeSer, + fs_ext, sync::gate::{Gate, GateGuard}, vec_map::VecMap, }; @@ -60,6 +61,7 @@ use std::{ ops::ControlFlow, }; +use crate::pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS; use crate::{ aux_file::AuxFileSizeEstimator, tenant::{ @@ -88,9 +90,6 @@ use crate::{ metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize, }; use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; -use crate::{ - pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::timeline::init::LocalLayerFileMetadata, -}; use crate::{ pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}, virtual_file::{MaybeFatalIo, VirtualFile}, @@ -2454,8 +2453,6 @@ impl Timeline { let span = tracing::Span::current(); // Copy to move into the task we're about to spawn - let generation = self.generation; - let shard = self.get_shard_index(); let this = self.myself.upgrade().expect("&self method holds the arc"); let (loaded_layers, needs_cleanup, total_physical_size) = tokio::task::spawn_blocking({ @@ -2469,11 +2466,14 @@ impl Timeline { for discovered in discovered { let (name, kind) = match discovered { - Discovered::Layer(layer_file_name, local_path, file_size) => { - discovered_layers.push((layer_file_name, local_path, file_size)); + Discovered::Layer(layer_file_name, local_metadata) => { + discovered_layers.push((layer_file_name, local_metadata)); continue; } - Discovered::IgnoredBackup => { + Discovered::IgnoredBackup(path) => { + std::fs::remove_file(path) + .or_else(fs_ext::ignore_not_found) + .fatal_err("Removing .old file"); continue; } Discovered::Unknown(file_name) => { @@ -2499,13 +2499,8 @@ impl Timeline { ); } - let decided = init::reconcile( - discovered_layers, - index_part.as_ref(), - disk_consistent_lsn, - generation, - shard, - ); + let decided = + init::reconcile(discovered_layers, index_part.as_ref(), disk_consistent_lsn); let mut loaded_layers = Vec::new(); let mut needs_cleanup = Vec::new(); @@ -2513,21 +2508,6 @@ impl Timeline { for (name, decision) in decided { let decision = match decision { - Ok(UseRemote { local, remote }) => { - // Remote is authoritative, but we may still choose to retain - // the local file if the contents appear to match - if local.metadata.file_size == remote.file_size { - // Use the local file, but take the remote metadata so that we pick up - // the correct generation. - UseLocal(LocalLayerFileMetadata { - metadata: remote, - local_path: local.local_path, - }) - } else { - init::cleanup_local_file_for_remote(&local, &remote)?; - UseRemote { local, remote } - } - } Ok(decision) => decision, Err(DismissedLayer::Future { local }) => { if let Some(local) = local { @@ -2545,6 +2525,11 @@ impl Timeline { // this file never existed remotely, we will have to do rework continue; } + Err(DismissedLayer::BadMetadata(local)) => { + init::cleanup_local_file_for_remote(&local)?; + // this file never existed remotely, we will have to do rework + continue; + } }; match &name { @@ -2555,14 +2540,12 @@ impl Timeline { tracing::debug!(layer=%name, ?decision, "applied"); let layer = match decision { - UseLocal(local) => { - total_physical_size += local.metadata.file_size; - Layer::for_resident(conf, &this, local.local_path, name, local.metadata) + Resident { local, remote } => { + total_physical_size += local.file_size; + Layer::for_resident(conf, &this, local.local_path, name, remote) .drop_eviction_guard() } - Evicted(remote) | UseRemote { remote, .. } => { - Layer::for_evicted(conf, &this, name, remote) - } + Evicted(remote) => Layer::for_evicted(conf, &this, name, remote), }; loaded_layers.push(layer); @@ -4725,11 +4708,16 @@ impl Timeline { async fn rewrite_layers( self: &Arc, - replace_layers: Vec<(Layer, ResidentLayer)>, - drop_layers: Vec, + mut replace_layers: Vec<(Layer, ResidentLayer)>, + mut drop_layers: Vec, ) -> anyhow::Result<()> { let mut guard = self.layers.write().await; + // Trim our lists in case our caller (compaction) raced with someone else (GC) removing layers: we want + // to avoid double-removing, and avoid rewriting something that was removed. + replace_layers.retain(|(l, _)| guard.contains(l)); + drop_layers.retain(|l| guard.contains(l)); + guard.rewrite_layers(&replace_layers, &drop_layers, &self.metrics); let upload_layers: Vec<_> = replace_layers.into_iter().map(|r| r.1).collect(); @@ -5604,26 +5592,6 @@ fn is_send() { _assert_send::>(); } -/// Add a suffix to a layer file's name: .{num}.old -/// Uses the first available num (starts at 0) -fn rename_to_backup(path: &Utf8Path) -> anyhow::Result<()> { - let filename = path - .file_name() - .ok_or_else(|| anyhow!("Path {path} don't have a file name"))?; - let mut new_path = path.to_owned(); - - for i in 0u32.. { - new_path.set_file_name(format!("{filename}.{i}.old")); - if !new_path.exists() { - std::fs::rename(path, &new_path) - .with_context(|| format!("rename {path:?} to {new_path:?}"))?; - return Ok(()); - } - } - - bail!("couldn't find an unused backup number for {:?}", path) -} - #[cfg(test)] mod tests { use utils::{id::TimelineId, lsn::Lsn}; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index db8adfc16cb3..07a12f535aed 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -176,13 +176,24 @@ impl Timeline { async fn compact_shard_ancestors( self: &Arc, rewrite_max: usize, - _ctx: &RequestContext, + ctx: &RequestContext, ) -> anyhow::Result<()> { let mut drop_layers = Vec::new(); - let layers_to_rewrite: Vec = Vec::new(); + let mut layers_to_rewrite: Vec = Vec::new(); - // We will use the PITR cutoff as a condition for rewriting layers. - let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.pitr; + // We will use the Lsn cutoff of the last GC as a threshold for rewriting layers: if a + // layer is behind this Lsn, it indicates that the layer is being retained beyond the + // pitr_interval, for example because a branchpoint references it. + // + // Holding this read guard also blocks [`Self::gc_timeline`] from entering while we + // are rewriting layers. + let latest_gc_cutoff = self.get_latest_gc_cutoff_lsn(); + + tracing::info!( + "latest_gc_cutoff: {}, pitr cutoff {}", + *latest_gc_cutoff, + self.gc_info.read().unwrap().cutoffs.pitr + ); let layers = self.layers.read().await; for layer_desc in layers.layer_map().iter_historic_layers() { @@ -241,9 +252,9 @@ impl Timeline { // Don't bother re-writing a layer if it is within the PITR window: it will age-out eventually // without incurring the I/O cost of a rewrite. - if layer_desc.get_lsn_range().end >= pitr_cutoff { - debug!(%layer, "Skipping rewrite of layer still in PITR window ({} >= {})", - layer_desc.get_lsn_range().end, pitr_cutoff); + if layer_desc.get_lsn_range().end >= *latest_gc_cutoff { + debug!(%layer, "Skipping rewrite of layer still in GC window ({} >= {})", + layer_desc.get_lsn_range().end, *latest_gc_cutoff); continue; } @@ -253,13 +264,10 @@ impl Timeline { continue; } - // Only rewrite layers if they would have different remote paths: either they belong to this - // shard but an old generation, or they belonged to another shard. This also implicitly - // guarantees that the layer is persistent in remote storage (as only remote persistent - // layers are carried across shard splits, any local-only layer would be in the current generation) - if layer.metadata().generation == self.generation - && layer.metadata().shard.shard_count == self.shard_identity.count - { + // Only rewrite layers if their generations differ. This guarantees: + // - that local rewrite is safe, as local layer paths will differ between existing layer and rewritten one + // - that the layer is persistent in remote storage, as we only see old-generation'd layer via loading from remote storage + if layer.metadata().generation == self.generation { debug!(%layer, "Skipping rewrite, is not from old generation"); continue; } @@ -272,18 +280,69 @@ impl Timeline { } // Fall through: all our conditions for doing a rewrite passed. - // TODO: implement rewriting - tracing::debug!(%layer, "Would rewrite layer"); + layers_to_rewrite.push(layer); } - // Drop the layers read lock: we will acquire it for write in [`Self::rewrite_layers`] + // Drop read lock on layer map before we start doing time-consuming I/O drop(layers); - // TODO: collect layers to rewrite - let replace_layers = Vec::new(); + let mut replace_image_layers = Vec::new(); + + for layer in layers_to_rewrite { + tracing::info!(layer=%layer, "Rewriting layer after shard split..."); + let mut image_layer_writer = ImageLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + &layer.layer_desc().key_range, + layer.layer_desc().image_layer_lsn(), + ctx, + ) + .await?; + + // Safety of layer rewrites: + // - We are writing to a different local file path than we are reading from, so the old Layer + // cannot interfere with the new one. + // - In the page cache, contents for a particular VirtualFile are stored with a file_id that + // is different for two layers with the same name (in `ImageLayerInner::new` we always + // acquire a fresh id from [`crate::page_cache::next_file_id`]. So readers do not risk + // reading the index from one layer file, and then data blocks from the rewritten layer file. + // - Any readers that have a reference to the old layer will keep it alive until they are done + // with it. If they are trying to promote from remote storage, that will fail, but this is the same + // as for compaction generally: compaction is allowed to delete layers that readers might be trying to use. + // - We do not run concurrently with other kinds of compaction, so the only layer map writes we race with are: + // - GC, which at worst witnesses us "undelete" a layer that they just deleted. + // - ingestion, which only inserts layers, therefore cannot collide with us. + let resident = layer.download_and_keep_resident().await?; + + let keys_written = resident + .filter(&self.shard_identity, &mut image_layer_writer, ctx) + .await?; + + if keys_written > 0 { + let new_layer = image_layer_writer.finish(self, ctx).await?; + tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes", + layer.metadata().file_size, + new_layer.metadata().file_size); + + replace_image_layers.push((layer, new_layer)); + } else { + // Drop the old layer. Usually for this case we would already have noticed that + // the layer has no data for us with the ShardedRange check above, but + drop_layers.push(layer); + } + } + + // At this point, we have replaced local layer files with their rewritten form, but not yet uploaded + // metadata to reflect that. If we restart here, the replaced layer files will look invalid (size mismatch + // to remote index) and be removed. This is inefficient but safe. + fail::fail_point!("compact-shard-ancestors-localonly"); // Update the LayerMap so that readers will use the new layers, and enqueue it for writing to remote storage - self.rewrite_layers(replace_layers, drop_layers).await?; + self.rewrite_layers(replace_image_layers, drop_layers) + .await?; + + fail::fail_point!("compact-shard-ancestors-enqueued"); // We wait for all uploads to complete before finishing this compaction stage. This is not // necessary for correctness, but it simplifies testing, and avoids proceeding with another @@ -291,6 +350,8 @@ impl Timeline { // load. self.remote_client.wait_completion().await?; + fail::fail_point!("compact-shard-ancestors-persistent"); + Ok(()) } diff --git a/pageserver/src/tenant/timeline/init.rs b/pageserver/src/tenant/timeline/init.rs index 0cbaf395551d..5bc67c7133cb 100644 --- a/pageserver/src/tenant/timeline/init.rs +++ b/pageserver/src/tenant/timeline/init.rs @@ -7,19 +7,20 @@ use crate::{ index::{IndexPart, LayerFileMetadata}, }, storage_layer::LayerName, - Generation, }, }; use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; -use pageserver_api::shard::ShardIndex; -use std::{collections::HashMap, str::FromStr}; +use std::{ + collections::{hash_map, HashMap}, + str::FromStr, +}; use utils::lsn::Lsn; /// Identified files in the timeline directory. pub(super) enum Discovered { /// The only one we care about - Layer(LayerName, Utf8PathBuf, u64), + Layer(LayerName, LocalLayerFileMetadata), /// Old ephmeral files from previous launches, should be removed Ephemeral(String), /// Old temporary timeline files, unsure what these really are, should be removed @@ -27,7 +28,7 @@ pub(super) enum Discovered { /// Temporary on-demand download files, should be removed TemporaryDownload(String), /// Backup file from previously future layers - IgnoredBackup, + IgnoredBackup(Utf8PathBuf), /// Unrecognized, warn about these Unknown(String), } @@ -43,12 +44,15 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result { let file_size = direntry.metadata()?.len(); - Discovered::Layer(file_name, direntry.path().to_owned(), file_size) + Discovered::Layer( + file_name, + LocalLayerFileMetadata::new(direntry.path().to_owned(), file_size), + ) } Err(_) => { if file_name.ends_with(".old") { // ignore these - Discovered::IgnoredBackup + Discovered::IgnoredBackup(direntry.path().to_owned()) } else if remote_timeline_client::is_temp_download_file(direntry.path()) { Discovered::TemporaryDownload(file_name) } else if is_ephemeral_file(&file_name) { @@ -71,37 +75,32 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result Self { + pub fn new(local_path: Utf8PathBuf, file_size: u64) -> Self { Self { local_path, - metadata: LayerFileMetadata::new(file_size, generation, shard), + file_size, } } } -/// Decision on what to do with a layer file after considering its local and remote metadata. +/// For a layer that is present in remote metadata, this type describes how to handle +/// it during startup: it is either Resident (and we have some metadata about a local file), +/// or it is Evicted (and we only have remote metadata). #[derive(Clone, Debug)] pub(super) enum Decision { /// The layer is not present locally. Evicted(LayerFileMetadata), - /// The layer is present locally, but local metadata does not match remote; we must - /// delete it and treat it as evicted. - UseRemote { + /// The layer is present locally, and metadata matches: we may hook up this layer to the + /// existing file in local storage. + Resident { local: LocalLayerFileMetadata, remote: LayerFileMetadata, }, - /// The layer is present locally, and metadata matches. - UseLocal(LocalLayerFileMetadata), } /// A layer needs to be left out of the layer map. @@ -117,77 +116,81 @@ pub(super) enum DismissedLayer { /// In order to make crash safe updates to layer map, we must dismiss layers which are only /// found locally or not yet included in the remote `index_part.json`. LocalOnly(LocalLayerFileMetadata), + + /// The layer exists in remote storage but the local layer's metadata (e.g. file size) + /// does not match it + BadMetadata(LocalLayerFileMetadata), } /// Merges local discoveries and remote [`IndexPart`] to a collection of decisions. pub(super) fn reconcile( - discovered: Vec<(LayerName, Utf8PathBuf, u64)>, + local_layers: Vec<(LayerName, LocalLayerFileMetadata)>, index_part: Option<&IndexPart>, disk_consistent_lsn: Lsn, - generation: Generation, - shard: ShardIndex, ) -> Vec<(LayerName, Result)> { - use Decision::*; - - // name => (local_metadata, remote_metadata) - type Collected = - HashMap, Option)>; - - let mut discovered = discovered - .into_iter() - .map(|(layer_name, local_path, file_size)| { - ( - layer_name, - // The generation and shard here will be corrected to match IndexPart in the merge below, unless - // it is not in IndexPart, in which case using our current generation makes sense - // because it will be uploaded in this generation. - ( - Some(LocalLayerFileMetadata::new( - local_path, file_size, generation, shard, - )), - None, - ), - ) - }) - .collect::(); - - // merge any index_part information, when available + let Some(index_part) = index_part else { + // If we have no remote metadata, no local layer files are considered valid to load + return local_layers + .into_iter() + .map(|(layer_name, local_metadata)| { + (layer_name, Err(DismissedLayer::LocalOnly(local_metadata))) + }) + .collect(); + }; + + let mut result = Vec::new(); + + let mut remote_layers = HashMap::new(); + + // Construct Decisions for layers that are found locally, if they're in remote metadata. Otherwise + // construct DismissedLayers to get rid of them. + for (layer_name, local_metadata) in local_layers { + let Some(remote_metadata) = index_part.layer_metadata.get(&layer_name) else { + result.push((layer_name, Err(DismissedLayer::LocalOnly(local_metadata)))); + continue; + }; + + if remote_metadata.file_size != local_metadata.file_size { + result.push((layer_name, Err(DismissedLayer::BadMetadata(local_metadata)))); + continue; + } + + remote_layers.insert( + layer_name, + Decision::Resident { + local: local_metadata, + remote: remote_metadata.clone(), + }, + ); + } + + // Construct Decision for layers that were not found locally index_part - .as_ref() - .map(|ip| ip.layer_metadata.iter()) - .into_iter() - .flatten() - .map(|(name, metadata)| (name, metadata.clone())) + .layer_metadata + .iter() .for_each(|(name, metadata)| { - if let Some(existing) = discovered.get_mut(name) { - existing.1 = Some(metadata); - } else { - discovered.insert(name.to_owned(), (None, Some(metadata))); + if let hash_map::Entry::Vacant(entry) = remote_layers.entry(name.clone()) { + entry.insert(Decision::Evicted(metadata.clone())); } }); - discovered - .into_iter() - .map(|(name, (local, remote))| { - let decision = if name.is_in_future(disk_consistent_lsn) { - Err(DismissedLayer::Future { local }) - } else { - match (local, remote) { - (Some(local), Some(remote)) if local.metadata != remote => { - Ok(UseRemote { local, remote }) - } - (Some(x), Some(_)) => Ok(UseLocal(x)), - (None, Some(x)) => Ok(Evicted(x)), - (Some(x), None) => Err(DismissedLayer::LocalOnly(x)), - (None, None) => { - unreachable!("there must not be any non-local non-remote files") - } - } - }; + // For layers that were found in authoritative remote metadata, apply a final check that they are within + // the disk_consistent_lsn. + result.extend(remote_layers.into_iter().map(|(name, decision)| { + if name.is_in_future(disk_consistent_lsn) { + match decision { + Decision::Evicted(_remote) => (name, Err(DismissedLayer::Future { local: None })), + Decision::Resident { + local, + remote: _remote, + } => (name, Err(DismissedLayer::Future { local: Some(local) })), + } + } else { + (name, Ok(decision)) + } + })); - (name, decision) - }) - .collect::>() + result } pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> { @@ -196,25 +199,15 @@ pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> { std::fs::remove_file(path).with_context(|| format!("failed to remove {kind} at {path}")) } -pub(super) fn cleanup_local_file_for_remote( - local: &LocalLayerFileMetadata, - remote: &LayerFileMetadata, -) -> anyhow::Result<()> { - let local_size = local.metadata.file_size; - let remote_size = remote.file_size; +pub(super) fn cleanup_local_file_for_remote(local: &LocalLayerFileMetadata) -> anyhow::Result<()> { + let local_size = local.file_size; let path = &local.local_path; - let file_name = path.file_name().expect("must be file path"); - tracing::warn!("removing local file {file_name:?} because it has unexpected length {local_size}; length in remote index is {remote_size}"); - if let Err(err) = crate::tenant::timeline::rename_to_backup(path) { - assert!( - path.exists(), - "we would leave the local_layer without a file if this does not hold: {path}", - ); - Err(err) - } else { - Ok(()) - } + tracing::warn!( + "removing local file {file_name:?} because it has unexpected length {local_size};" + ); + + std::fs::remove_file(path).with_context(|| format!("failed to remove layer at {path}")) } pub(super) fn cleanup_future_layer( @@ -236,8 +229,8 @@ pub(super) fn cleanup_local_only_file( ) -> anyhow::Result<()> { let kind = name.kind(); tracing::info!( - "found local-only {kind} layer {name}, metadata {:?}", - local.metadata + "found local-only {kind} layer {name} size {}", + local.file_size ); std::fs::remove_file(&local.local_path)?; Ok(()) diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 248420e63208..884b71df7521 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -212,13 +212,34 @@ impl LayerManager { &mut self, rewrite_layers: &[(Layer, ResidentLayer)], drop_layers: &[Layer], - _metrics: &TimelineMetrics, + metrics: &TimelineMetrics, ) { let mut updates = self.layer_map.batch_update(); + for (old_layer, new_layer) in rewrite_layers { + debug_assert_eq!( + old_layer.layer_desc().key_range, + new_layer.layer_desc().key_range + ); + debug_assert_eq!( + old_layer.layer_desc().lsn_range, + new_layer.layer_desc().lsn_range + ); + + // Safety: we may never rewrite the same file in-place. Callers are responsible + // for ensuring that they only rewrite layers after something changes the path, + // such as an increment in the generation number. + assert_ne!(old_layer.local_path(), new_layer.local_path()); - // TODO: implement rewrites (currently this code path only used for drops) - assert!(rewrite_layers.is_empty()); + Self::delete_historic_layer(old_layer, &mut updates, &mut self.layer_fmgr); + Self::insert_historic_layer( + new_layer.as_ref().clone(), + &mut updates, + &mut self.layer_fmgr, + ); + + metrics.record_new_file_metrics(new_layer.layer_desc().file_size); + } for l in drop_layers { Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr); } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 796ae7217b02..36aa18f1f99a 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2667,7 +2667,9 @@ def tenant_load(self, tenant_id: TenantId): tenant_id, generation=self.env.storage_controller.attach_hook_issue(tenant_id, self.id) ) - def list_layers(self, tenant_id: TenantId, timeline_id: TimelineId) -> list[Path]: + def list_layers( + self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId + ) -> list[Path]: """ Inspect local storage on a pageserver to discover which layer files are present. diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index bbb1ad0c6d70..545ba05b172c 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -177,7 +177,16 @@ def test_sharding_split_unsharded( env.storage_controller.consistency_check() -def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize( + "failpoint", + [ + None, + "compact-shard-ancestors-localonly", + "compact-shard-ancestors-enqueued", + "compact-shard-ancestors-persistent", + ], +) +def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint: Optional[str]): """ Test that after a split, we clean up parent layer data in the child shards via compaction. """ @@ -196,6 +205,11 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder): "image_layer_creation_check_threshold": "0", } + neon_env_builder.storage_controller_config = { + # Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts. + "max_unavailable": "300s" + } + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) tenant_id = env.initial_tenant timeline_id = env.initial_timeline @@ -213,6 +227,10 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder): # Split one shard into two shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=2) + # Let all shards move into their stable locations, so that during subsequent steps we + # don't have reconciles in progress (simpler to reason about what messages we expect in logs) + env.storage_controller.reconcile_until_idle() + # Check we got the shard IDs we expected assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 2)) is not None assert env.storage_controller.inspect(TenantShardId(tenant_id, 1, 2)) is not None @@ -237,6 +255,90 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder): # Compaction shouldn't make anything unreadable workload.validate() + # Force a generation increase: layer rewrites are a long-term thing and only happen after + # the generation has increased. + env.pageserver.stop() + env.pageserver.start() + + # Cleanup part 2: once layers are outside the PITR window, they will be rewritten if they are partially redundant + env.storage_controller.pageserver_api().set_tenant_config(tenant_id, {"pitr_interval": "0s"}) + env.storage_controller.reconcile_until_idle() + + for shard in shards: + ps = env.get_tenant_pageserver(shard) + + # Apply failpoints for the layer-rewriting phase: this is the area of code that has sensitive behavior + # across restarts, as we will have local layer files that temporarily disagree with the remote metadata + # for the same local layer file name. + if failpoint is not None: + ps.http_client().configure_failpoints((failpoint, "exit")) + + # Do a GC to update gc_info (compaction uses this to decide whether a layer is to be rewritten) + # Set gc_horizon=0 to let PITR horizon control GC cutoff exclusively. + ps.http_client().timeline_gc(shard, timeline_id, gc_horizon=0) + + # We will compare stats before + after compaction + detail_before = ps.http_client().timeline_detail(shard, timeline_id) + + # Invoke compaction: this should rewrite layers that are behind the pitr horizon + try: + ps.http_client().timeline_compact(shard, timeline_id) + except requests.ConnectionError as e: + if failpoint is None: + raise e + else: + log.info(f"Compaction failed (failpoint={failpoint}): {e}") + + if failpoint in ( + "compact-shard-ancestors-localonly", + "compact-shard-ancestors-enqueued", + ): + # If we left local files that don't match remote metadata, we expect warnings on next startup + env.pageserver.allowed_errors.append( + ".*removing local file .+ because it has unexpected length.*" + ) + + # Post-failpoint: we check that the pageserver comes back online happily. + env.pageserver.running = False + env.pageserver.start() + else: + assert failpoint is None # We shouldn't reach success path if a failpoint was set + + detail_after = ps.http_client().timeline_detail(shard, timeline_id) + + # Physical size should shrink because layers are smaller + assert detail_after["current_physical_size"] < detail_before["current_physical_size"] + + # Validate size statistics + for shard in shards: + ps = env.get_tenant_pageserver(shard) + timeline_info = ps.http_client().timeline_detail(shard, timeline_id) + reported_size = timeline_info["current_physical_size"] + layer_paths = ps.list_layers(shard, timeline_id) + measured_size = 0 + for p in layer_paths: + abs_path = ps.timeline_dir(shard, timeline_id) / p + measured_size += os.stat(abs_path).st_size + + log.info( + f"shard {shard} reported size {reported_size}, measured size {measured_size} ({len(layer_paths)} layers)" + ) + + if failpoint in ( + "compact-shard-ancestors-localonly", + "compact-shard-ancestors-enqueued", + ): + # If we injected a failure between local rewrite and remote upload, then after + # restart we may end up with neither version of the file on local disk (the new file + # is cleaned up because it doesn't matchc remote metadata). So local size isn't + # necessarily going to match remote physical size. + continue + + assert measured_size == reported_size + + # Compaction shouldn't make anything unreadable + workload.validate() + def test_sharding_split_smoke( neon_env_builder: NeonEnvBuilder,