Skip to content

Commit

Permalink
Add eviction_state to control file (#8125)
Browse files Browse the repository at this point in the history
This is a preparation for #8022, to make the PR both backwards and
foward compatible.

This commit adds `eviction_state` field to control file. Adds support
for reading it, but writes control file in old format where possible, to
keep the disk format forward compatible.

Note: in `patch_control_file`, new field gets serialized to json like
this:
- `"eviction_state": "Present"`
- `"eviction_state": {"Offloaded": "0/8F"}`
  • Loading branch information
petuhovskiy authored and conradludgate committed Jun 27, 2024
1 parent 81c873e commit 06456f7
Show file tree
Hide file tree
Showing 4 changed files with 128 additions and 6 deletions.
19 changes: 15 additions & 4 deletions safekeeper/src/control_file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,16 @@ use std::ops::Deref;
use std::path::Path;
use std::time::Instant;

use crate::control_file_upgrade::downgrade_v9_to_v8;
use crate::metrics::PERSIST_CONTROL_FILE_SECONDS;
use crate::state::TimelinePersistentState;
use crate::state::{EvictionState, TimelinePersistentState};
use crate::{control_file_upgrade::upgrade_control_file, timeline::get_timeline_dir};
use utils::{bin_ser::LeSer, id::TenantTimelineId};

use crate::SafeKeeperConf;

pub const SK_MAGIC: u32 = 0xcafeceefu32;
pub const SK_FORMAT_VERSION: u32 = 8;
pub const SK_FORMAT_VERSION: u32 = 9;

// contains persistent metadata for safekeeper
pub const CONTROL_FILE_NAME: &str = "safekeeper.control";
Expand Down Expand Up @@ -178,8 +179,18 @@ impl Storage for FileStorage {
})?;
let mut buf: Vec<u8> = Vec::new();
WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_MAGIC)?;
WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_FORMAT_VERSION)?;
s.ser_into(&mut buf)?;

if s.eviction_state == EvictionState::Present {
// temp hack for forward compatibility
const PREV_FORMAT_VERSION: u32 = 8;
let prev = downgrade_v9_to_v8(s);
WriteBytesExt::write_u32::<LittleEndian>(&mut buf, PREV_FORMAT_VERSION)?;
prev.ser_into(&mut buf)?;
} else {
// otherwise, we write the current format version
WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_FORMAT_VERSION)?;
s.ser_into(&mut buf)?;
}

// calculate checksum before resize
let checksum = crc32c::crc32c(&buf);
Expand Down
94 changes: 93 additions & 1 deletion safekeeper/src/control_file_upgrade.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
//! Code to deal with safekeeper control file upgrades
use crate::{
safekeeper::{AcceptorState, PgUuid, ServerInfo, Term, TermHistory, TermLsn},
state::{PersistedPeers, TimelinePersistentState},
state::{EvictionState, PersistedPeers, TimelinePersistentState},
wal_backup_partial,
};
use anyhow::{bail, Result};
Expand Down Expand Up @@ -183,6 +183,55 @@ pub struct SafeKeeperStateV7 {
pub peers: PersistedPeers,
}

/// Persistent information stored on safekeeper node about timeline.
/// On disk data is prefixed by magic and format version and followed by checksum.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct SafeKeeperStateV8 {
#[serde(with = "hex")]
pub tenant_id: TenantId,
#[serde(with = "hex")]
pub timeline_id: TimelineId,
/// persistent acceptor state
pub acceptor_state: AcceptorState,
/// information about server
pub server: ServerInfo,
/// Unique id of the last *elected* proposer we dealt with. Not needed
/// for correctness, exists for monitoring purposes.
#[serde(with = "hex")]
pub proposer_uuid: PgUuid,
/// Since which LSN this timeline generally starts. Safekeeper might have
/// joined later.
pub timeline_start_lsn: Lsn,
/// Since which LSN safekeeper has (had) WAL for this timeline.
/// All WAL segments next to one containing local_start_lsn are
/// filled with data from the beginning.
pub local_start_lsn: Lsn,
/// Part of WAL acknowledged by quorum *and available locally*. Always points
/// to record boundary.
pub commit_lsn: Lsn,
/// LSN that points to the end of the last backed up segment. Useful to
/// persist to avoid finding out offloading progress on boot.
pub backup_lsn: Lsn,
/// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn
/// of last record streamed to everyone). Persisting it helps skipping
/// recovery in walproposer, generally we compute it from peers. In
/// walproposer proto called 'truncate_lsn'. Updates are currently drived
/// only by walproposer.
pub peer_horizon_lsn: Lsn,
/// LSN of the oldest known checkpoint made by pageserver and successfully
/// pushed to s3. We don't remove WAL beyond it. Persisted only for
/// informational purposes, we receive it from pageserver (or broker).
pub remote_consistent_lsn: Lsn,
/// Peers and their state as we remember it. Knowing peers themselves is
/// fundamental; but state is saved here only for informational purposes and
/// obviously can be stale. (Currently not saved at all, but let's provision
/// place to have less file version upgrades).
pub peers: PersistedPeers,
/// Holds names of partial segments uploaded to remote storage. Used to
/// clean up old objects without leaving garbage in remote storage.
pub partial_backup: wal_backup_partial::State,
}

pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersistentState> {
// migrate to storing full term history
if version == 1 {
Expand Down Expand Up @@ -213,6 +262,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
remote_consistent_lsn: Lsn(0),
peers: PersistedPeers(vec![]),
partial_backup: wal_backup_partial::State::default(),
eviction_state: EvictionState::Present,
});
// migrate to hexing some ids
} else if version == 2 {
Expand All @@ -237,6 +287,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
remote_consistent_lsn: Lsn(0),
peers: PersistedPeers(vec![]),
partial_backup: wal_backup_partial::State::default(),
eviction_state: EvictionState::Present,
});
// migrate to moving tenant_id/timeline_id to the top and adding some lsns
} else if version == 3 {
Expand All @@ -261,6 +312,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
remote_consistent_lsn: Lsn(0),
peers: PersistedPeers(vec![]),
partial_backup: wal_backup_partial::State::default(),
eviction_state: EvictionState::Present,
});
// migrate to having timeline_start_lsn
} else if version == 4 {
Expand All @@ -285,6 +337,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
remote_consistent_lsn: Lsn(0),
peers: PersistedPeers(vec![]),
partial_backup: wal_backup_partial::State::default(),
eviction_state: EvictionState::Present,
});
} else if version == 5 {
info!("reading safekeeper control file version {}", version);
Expand Down Expand Up @@ -329,6 +382,26 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
remote_consistent_lsn: oldstate.remote_consistent_lsn,
peers: oldstate.peers,
partial_backup: wal_backup_partial::State::default(),
eviction_state: EvictionState::Present,
});
} else if version == 8 {
let oldstate = SafeKeeperStateV8::des(&buf[..buf.len()])?;

return Ok(TimelinePersistentState {
tenant_id: oldstate.tenant_id,
timeline_id: oldstate.timeline_id,
acceptor_state: oldstate.acceptor_state,
server: oldstate.server,
proposer_uuid: oldstate.proposer_uuid,
timeline_start_lsn: oldstate.timeline_start_lsn,
local_start_lsn: oldstate.local_start_lsn,
commit_lsn: oldstate.commit_lsn,
backup_lsn: oldstate.backup_lsn,
peer_horizon_lsn: oldstate.peer_horizon_lsn,
remote_consistent_lsn: oldstate.remote_consistent_lsn,
peers: oldstate.peers,
partial_backup: oldstate.partial_backup,
eviction_state: EvictionState::Present,
});
}

Expand All @@ -338,6 +411,25 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
bail!("unsupported safekeeper control file version {}", version)
}

pub fn downgrade_v9_to_v8(state: &TimelinePersistentState) -> SafeKeeperStateV8 {
assert!(state.eviction_state == EvictionState::Present);
SafeKeeperStateV8 {
tenant_id: state.tenant_id,
timeline_id: state.timeline_id,
acceptor_state: state.acceptor_state.clone(),
server: state.server.clone(),
proposer_uuid: state.proposer_uuid,
timeline_start_lsn: state.timeline_start_lsn,
local_start_lsn: state.local_start_lsn,
commit_lsn: state.commit_lsn,
backup_lsn: state.backup_lsn,
peer_horizon_lsn: state.peer_horizon_lsn,
remote_consistent_lsn: state.remote_consistent_lsn,
peers: state.peers.clone(),
partial_backup: state.partial_backup.clone(),
}
}

#[cfg(test)]
mod tests {
use std::str::FromStr;
Expand Down
5 changes: 4 additions & 1 deletion safekeeper/src/safekeeper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -958,7 +958,7 @@ mod tests {

use super::*;
use crate::{
state::{PersistedPeers, TimelinePersistentState},
state::{EvictionState, PersistedPeers, TimelinePersistentState},
wal_storage::Storage,
};
use std::{ops::Deref, str::FromStr, time::Instant};
Expand Down Expand Up @@ -1225,6 +1225,7 @@ mod tests {
},
)]),
partial_backup: crate::wal_backup_partial::State::default(),
eviction_state: EvictionState::Present,
};

let ser = state.ser().unwrap();
Expand Down Expand Up @@ -1272,6 +1273,8 @@ mod tests {
0xb0, 0x01, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
// partial_backup
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
// eviction_state
0x00, 0x00, 0x00, 0x00,
];

assert_eq!(Hex(&ser), Hex(&expected));
Expand Down
16 changes: 16 additions & 0 deletions safekeeper/src/state.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,26 @@ pub struct TimelinePersistentState {
/// Holds names of partial segments uploaded to remote storage. Used to
/// clean up old objects without leaving garbage in remote storage.
pub partial_backup: wal_backup_partial::State,
/// Eviction state of the timeline. If it's Offloaded, we should download
/// WAL files from remote storage to serve the timeline.
pub eviction_state: EvictionState,
}

#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct PersistedPeers(pub Vec<(NodeId, PersistedPeerInfo)>);

/// State of the local WAL files. Used to track current timeline state,
/// that can be either WAL files are present on disk or last partial segment
/// is offloaded to remote storage.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)]
pub enum EvictionState {
/// WAL files are present on disk.
Present,
/// Last partial segment is offloaded to remote storage.
/// Contains flush_lsn of the last offloaded segment.
Offloaded(Lsn),
}

impl TimelinePersistentState {
pub fn new(
ttid: &TenantTimelineId,
Expand Down Expand Up @@ -98,6 +113,7 @@ impl TimelinePersistentState {
.collect(),
),
partial_backup: wal_backup_partial::State::default(),
eviction_state: EvictionState::Present,
}
}

Expand Down

0 comments on commit 06456f7

Please sign in to comment.