Skip to content

Commit

Permalink
feat(pageserver): add an optional lease to the get_lsn_by_timestamp A…
Browse files Browse the repository at this point in the history
…PI (#8104)

Part of #7497, closes #8072.

## Problem

Currently the `get_lsn_by_timestamp` and branch creation pageserver APIs do not provide a pleasant client experience where the looked-up LSN might be GC-ed between the two API calls.

This PR attempts to prevent common races between GC and branch creation by making use of LSN leases provided in #8084. A lease can be optionally granted to a looked-up LSN. With the lease, GC will not touch layers needed to reconstruct all pages at this LSN for the duration of the lease.

Signed-off-by: Yuchen Liang <yuchen@neon.tech>
  • Loading branch information
yliang412 authored and conradludgate committed Jun 27, 2024
1 parent e8f3411 commit 76fc580
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 11 deletions.
11 changes: 11 additions & 0 deletions pageserver/src/http/openapi_spec.yml
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,13 @@ paths:
type: string
format: date-time
description: A timestamp to get the LSN
- name: with_lease
in: query
required: false
schema:
type: boolean
description: Whether to grant a lease to the corresponding LSN. Default to false.

responses:
"200":
description: OK
Expand Down Expand Up @@ -1029,6 +1036,10 @@ components:
kind:
type: string
enum: [past, present, future, nodata]
valid_until:
type: string
format: date-time
description: The expiration time of the granted lease.

LsnLease:
type: object
Expand Down
27 changes: 26 additions & 1 deletion pageserver/src/http/routes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ use pageserver_api::models::IngestAuxFilesRequest;
use pageserver_api::models::ListAuxFilesRequest;
use pageserver_api::models::LocationConfig;
use pageserver_api::models::LocationConfigListResponse;
use pageserver_api::models::LsnLease;
use pageserver_api::models::ShardParameters;
use pageserver_api::models::TenantDetails;
use pageserver_api::models::TenantLocationConfigResponse;
Expand Down Expand Up @@ -728,6 +729,8 @@ async fn get_lsn_by_timestamp_handler(
.map_err(ApiError::BadRequest)?;
let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);

let with_lease = parse_query_param(&request, "with_lease")?.unwrap_or(false);

let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);

let timeline =
Expand All @@ -736,22 +739,44 @@ async fn get_lsn_by_timestamp_handler(
let result = timeline
.find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx)
.await?;

#[derive(serde::Serialize, Debug)]
struct Result {
lsn: Lsn,
kind: &'static str,
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(flatten)]
lease: Option<LsnLease>,
}
let (lsn, kind) = match result {
LsnForTimestamp::Present(lsn) => (lsn, "present"),
LsnForTimestamp::Future(lsn) => (lsn, "future"),
LsnForTimestamp::Past(lsn) => (lsn, "past"),
LsnForTimestamp::NoData(lsn) => (lsn, "nodata"),
};
let result = Result { lsn, kind };

let lease = if with_lease {
timeline
.make_lsn_lease(lsn, timeline.get_lsn_lease_length_for_ts(), &ctx)
.inspect_err(|_| {
warn!("fail to grant a lease to {}", lsn);
})
.ok()
} else {
None
};

let result = Result { lsn, kind, lease };
let valid_until = result
.lease
.as_ref()
.map(|l| humantime::format_rfc3339_millis(l.valid_until).to_string());
tracing::info!(
lsn=?result.lsn,
kind=%result.kind,
timestamp=%timestamp_raw,
valid_until=?valid_until,
"lsn_by_timestamp finished"
);
json_response(StatusCode::OK, result)
Expand Down
6 changes: 4 additions & 2 deletions test_runner/fixtures/pageserver/http.py
Original file line number Diff line number Diff line change
Expand Up @@ -619,13 +619,15 @@ def timeline_get_lsn_by_timestamp(
tenant_id: Union[TenantId, TenantShardId],
timeline_id: TimelineId,
timestamp: datetime,
with_lease: bool = False,
**kwargs,
):
log.info(
f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}"
f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}, {with_lease=}"
)
with_lease_query = f"{with_lease=}".lower()
res = self.get(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp.isoformat()}Z",
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp.isoformat()}Z&{with_lease_query}",
**kwargs,
)
self.verbose_error(res)
Expand Down
43 changes: 35 additions & 8 deletions test_runner/regress/test_lsn_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,24 @@
from requests.exceptions import ReadTimeout


#
# Test pageserver get_lsn_by_timestamp API
#
def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
def assert_lsn_lease_granted(result, with_lease: bool):
"""
Asserts an LSN lease is granted when `with_lease` flag is turned on.
Always asserts no LSN lease is granted when `with_lease` flag is off.
"""
if with_lease:
assert result.get("valid_until")
else:
assert result.get("valid_until") is None


@pytest.mark.parametrize("with_lease", [True, False])
def test_lsn_mapping(neon_env_builder: NeonEnvBuilder, with_lease: bool):
"""
Test pageserver get_lsn_by_timestamp API.
:param with_lease: Whether to get a lease associated with returned LSN.
"""
env = neon_env_builder.init_start()

tenant_id, _ = env.neon_cli.create_tenant(
Expand Down Expand Up @@ -67,23 +81,33 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
# Check edge cases
# Timestamp is in the future
probe_timestamp = tbl[-1][1] + timedelta(hours=1)
result = client.timeline_get_lsn_by_timestamp(tenant_id, timeline_id, probe_timestamp)
result = client.timeline_get_lsn_by_timestamp(
tenant_id, timeline_id, probe_timestamp, with_lease=with_lease
)
assert result["kind"] == "future"
assert_lsn_lease_granted(result, with_lease)
# make sure that we return a well advanced lsn here
assert Lsn(result["lsn"]) > start_lsn

# Timestamp is in the unreachable past
probe_timestamp = tbl[0][1] - timedelta(hours=10)
result = client.timeline_get_lsn_by_timestamp(tenant_id, timeline_id, probe_timestamp)
result = client.timeline_get_lsn_by_timestamp(
tenant_id, timeline_id, probe_timestamp, with_lease=with_lease
)
assert result["kind"] == "past"
assert_lsn_lease_granted(result, with_lease)

# make sure that we return the minimum lsn here at the start of the range
assert Lsn(result["lsn"]) < start_lsn

# Probe a bunch of timestamps in the valid range
for i in range(1, len(tbl), 100):
probe_timestamp = tbl[i][1]
result = client.timeline_get_lsn_by_timestamp(tenant_id, timeline_id, probe_timestamp)
result = client.timeline_get_lsn_by_timestamp(
tenant_id, timeline_id, probe_timestamp, with_lease=with_lease
)
assert result["kind"] not in ["past", "nodata"]
assert_lsn_lease_granted(result, with_lease)
lsn = result["lsn"]
# Call get_lsn_by_timestamp to get the LSN
# Launch a new read-only node at that LSN, and check that only the rows
Expand All @@ -105,8 +129,11 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):

# Timestamp is in the unreachable past
probe_timestamp = tbl[0][1] - timedelta(hours=10)
result = client.timeline_get_lsn_by_timestamp(tenant_id, timeline_id_child, probe_timestamp)
result = client.timeline_get_lsn_by_timestamp(
tenant_id, timeline_id_child, probe_timestamp, with_lease=with_lease
)
assert result["kind"] == "past"
assert_lsn_lease_granted(result, with_lease)
# make sure that we return the minimum lsn here at the start of the range
assert Lsn(result["lsn"]) >= last_flush_lsn

Expand Down

0 comments on commit 76fc580

Please sign in to comment.