diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 3db75b7d0e39..b1e4525cc03c 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -607,31 +607,6 @@ impl TenantConfigRequest { } } -#[derive(Debug, Deserialize)] -pub struct TenantAttachRequest { - #[serde(default)] - pub config: TenantAttachConfig, - #[serde(default)] - pub generation: Option, -} - -/// Newtype to enforce deny_unknown_fields on TenantConfig for -/// its usage inside `TenantAttachRequest`. -#[derive(Debug, Serialize, Deserialize, Default)] -#[serde(deny_unknown_fields)] -pub struct TenantAttachConfig { - #[serde(flatten)] - allowing_unknown_fields: TenantConfig, -} - -impl std::ops::Deref for TenantAttachConfig { - type Target = TenantConfig; - - fn deref(&self) -> &Self::Target { - &self.allowing_unknown_fields - } -} - /// See [`TenantState::attachment_status`] and the OpenAPI docs for context. #[derive(Serialize, Deserialize, Clone)] #[serde(tag = "slug", content = "data", rename_all = "snake_case")] @@ -1554,18 +1529,6 @@ mod tests { "expect unknown field `unknown_field` error, got: {}", err ); - - let attach_request = json!({ - "config": { - "unknown_field": "unknown_value".to_string(), - }, - }); - let err = serde_json::from_value::(attach_request).unwrap_err(); - assert!( - err.to_string().contains("unknown field `unknown_field`"), - "expect unknown field `unknown_field` error, got: {}", - err - ); } #[test] diff --git a/libs/utils/src/http/json.rs b/libs/utils/src/http/json.rs index 7ca62561feb2..6c25440b429d 100644 --- a/libs/utils/src/http/json.rs +++ b/libs/utils/src/http/json.rs @@ -8,22 +8,15 @@ use super::error::ApiError; pub async fn json_request Deserialize<'de>>( request: &mut Request, ) -> Result { - json_request_or_empty_body(request) - .await? - .context("missing request body") - .map_err(ApiError::BadRequest) -} - -/// Will be removed as part of -pub async fn json_request_or_empty_body Deserialize<'de>>( - request: &mut Request, -) -> Result, ApiError> { let body = hyper::body::aggregate(request.body_mut()) .await .context("Failed to read request body") .map_err(ApiError::BadRequest)?; + if body.remaining() == 0 { - return Ok(None); + return Err(ApiError::BadRequest(anyhow::anyhow!( + "missing request body" + ))); } let mut deser = serde_json::de::Deserializer::from_reader(body.reader()); @@ -31,7 +24,6 @@ pub async fn json_request_or_empty_body Deserialize<'de>>( serde_path_to_error::deserialize(&mut deser) // intentionally stringify because the debug version is not helpful in python logs .map_err(|e| anyhow::anyhow!("Failed to parse json request: {e}")) - .map(Some) .map_err(ApiError::BadRequest) } diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index e583992a58f9..58ff6e3f83cc 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -367,16 +367,7 @@ paths: $ref: "#/components/schemas/TenantLocationConfigResponse" "409": description: | - The tenant is already known to Pageserver in some way, - and hence this `/attach` call has been rejected. - - Some examples of how this can happen: - - tenant was created on this pageserver - - tenant attachment was started by an earlier call to `/attach`. - - Callers should poll the tenant status's `attachment_status` field, - like for status 202. See the longer description for `POST /attach` - for details. + The tenant is already being modified, perhaps by a concurrent call to this API content: application/json: schema: @@ -762,8 +753,6 @@ components: For example this can be caused by s3 being unreachable. The retry may be implemented with call to detach, though it would be better to not automate it and inspec failed state manually before proceeding with a retry. - - See the tenant `/attach` endpoint for more information. type: object required: - slug diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 450f89820e5b..d6ba9ee35e17 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -31,13 +31,11 @@ use pageserver_api::models::TenantShardLocation; use pageserver_api::models::TenantShardSplitRequest; use pageserver_api::models::TenantShardSplitResponse; use pageserver_api::models::TenantSorting; -use pageserver_api::models::TenantState; use pageserver_api::models::TopTenantShardItem; use pageserver_api::models::TopTenantShardsRequest; use pageserver_api::models::TopTenantShardsResponse; use pageserver_api::models::{ - DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest, - TenantLocationConfigRequest, + DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantLocationConfigRequest, }; use pageserver_api::shard::ShardCount; use pageserver_api::shard::TenantShardId; @@ -51,7 +49,6 @@ use utils::auth::JwtAuth; use utils::failpoint_support::failpoints_handler; use utils::http::endpoint::prometheus_metrics_handler; use utils::http::endpoint::request_span; -use utils::http::json::json_request_or_empty_body; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; use crate::context::{DownloadBehavior, RequestContext}; @@ -821,58 +818,6 @@ async fn get_timestamp_of_lsn_handler( } } -async fn tenant_attach_handler( - mut request: Request, - _cancel: CancellationToken, -) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; - - let maybe_body: Option = json_request_or_empty_body(&mut request).await?; - let tenant_conf = match &maybe_body { - Some(request) => TenantConfOpt::try_from(&*request.config).map_err(ApiError::BadRequest)?, - None => TenantConfOpt::default(), - }; - - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); - - info!("Handling tenant attach {tenant_id}"); - - let state = get_state(&request); - - let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?; - - let tenant_shard_id = TenantShardId::unsharded(tenant_id); - let shard_params = ShardParameters::default(); - let location_conf = LocationConf::attached_single(tenant_conf, generation, &shard_params); - - let tenant = state - .tenant_manager - .upsert_location(tenant_shard_id, location_conf, None, SpawnMode::Eager, &ctx) - .await?; - - let Some(tenant) = tenant else { - // This should never happen: indicates a bug in upsert_location - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "Upsert succeeded but didn't return tenant!" - ))); - }; - - // We might have successfully constructed a Tenant, but it could still - // end up in a broken state: - if let TenantState::Broken { - reason, - backtrace: _, - } = tenant.current_state() - { - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "Tenant state is Broken: {reason}" - ))); - } - - json_response(StatusCode::ACCEPTED, ()) -} - async fn timeline_delete_handler( request: Request, _cancel: CancellationToken, @@ -903,26 +848,6 @@ async fn timeline_delete_handler( json_response(StatusCode::ACCEPTED, ()) } -async fn tenant_detach_handler( - request: Request, - _cancel: CancellationToken, -) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; - // This is a legacy API (`/location_conf` is the replacement). It only supports unsharded tenants - let tenant_shard_id = TenantShardId::unsharded(tenant_id); - - let state = get_state(&request); - let conf = state.conf; - state - .tenant_manager - .detach_tenant(conf, tenant_shard_id, &state.deletion_queue_client) - .instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug())) - .await?; - - json_response(StatusCode::OK, ()) -} - async fn tenant_reset_handler( request: Request, _cancel: CancellationToken, @@ -2711,12 +2636,6 @@ pub fn make_router( .post("/v1/tenant/:tenant_shard_id/timeline", |r| { api_handler(r, timeline_create_handler) }) - .post("/v1/tenant/:tenant_id/attach", |r| { - api_handler(r, tenant_attach_handler) - }) - .post("/v1/tenant/:tenant_id/detach", |r| { - api_handler(r, tenant_detach_handler) - }) .post("/v1/tenant/:tenant_shard_id/reset", |r| { api_handler(r, tenant_reset_handler) }) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 388e0eadc8e9..e329f42dd610 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1231,6 +1231,13 @@ impl Service { &self, attach_req: AttachHookRequest, ) -> anyhow::Result { + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + attach_req.tenant_shard_id.tenant_id, + TenantOperations::ShardSplit, + ) + .await; + // This is a test hook. To enable using it on tenants that were created directly with // the pageserver API (not via this service), we will auto-create any missing tenant // shards with default state. diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b624c84fad42..84fb1f7cb47d 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2684,7 +2684,6 @@ def tenant_attach( self, tenant_id: TenantId, config: None | Dict[str, Any] = None, - config_null: bool = False, generation: Optional[int] = None, override_storage_controller_generation: bool = False, ): @@ -2702,7 +2701,6 @@ def tenant_attach( return client.tenant_attach( tenant_id, config, - config_null, generation=generation, ) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 64c7ddee6c8c..2a7cbea20010 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -1,6 +1,5 @@ from __future__ import annotations -import json import time from collections import defaultdict from dataclasses import dataclass @@ -253,39 +252,30 @@ def tenant_attach( self, tenant_id: Union[TenantId, TenantShardId], config: None | Dict[str, Any] = None, - config_null: bool = False, generation: Optional[int] = None, ): - if config_null: - assert config is None - body: Any = None - else: - # null-config is prohibited by the API - config = config or {} - body = {"config": config} - if generation is not None: - body.update({"generation": generation}) + config = config or {} - res = self.post( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/attach", - data=json.dumps(body), - headers={"Content-Type": "application/json"}, + return self.tenant_location_conf( + tenant_id, + location_conf={ + "mode": "AttachedSingle", + "secondary_conf": None, + "tenant_conf": config, + "generation": generation, + }, ) - self.verbose_error(res) - - def tenant_detach(self, tenant_id: TenantId, detach_ignored=False, timeout_secs=None): - params = {} - if detach_ignored: - params["detach_ignored"] = "true" - - kwargs = {} - if timeout_secs is not None: - kwargs["timeout"] = timeout_secs - res = self.post( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params, **kwargs + def tenant_detach(self, tenant_id: TenantId): + return self.tenant_location_conf( + tenant_id, + location_conf={ + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, ) - self.verbose_error(res) def tenant_reset(self, tenant_id: Union[TenantId, TenantShardId], drop_cache: bool): params = {} diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index f4667a82dc33..e117c2140f5e 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -7,7 +7,7 @@ NeonEnv, NeonEnvBuilder, ) -from fixtures.pageserver.http import PageserverApiException, TenantConfig +from fixtures.pageserver.http import TenantConfig from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind from fixtures.utils import wait_until @@ -82,8 +82,8 @@ def test_null_body(negative_env: NegativeTests): tenant_id = negative_env.tenant_id ps_http = env.pageserver.http_client() - res = ps_http.post( - f"{ps_http.base_url}/v1/tenant/{tenant_id}/attach", + res = ps_http.put( + f"{ps_http.base_url}/v1/tenant/{tenant_id}/location_config", data=b"null", headers={"Content-Type": "application/json"}, ) @@ -99,35 +99,16 @@ def test_null_config(negative_env: NegativeTests): tenant_id = negative_env.tenant_id ps_http = env.pageserver.http_client() - res = ps_http.post( - f"{ps_http.base_url}/v1/tenant/{tenant_id}/attach", - data=b'{"config": null}', + res = ps_http.put( + f"{ps_http.base_url}/v1/tenant/{tenant_id}/location_config", + json={"mode": "AttachedSingle", "generation": 1, "tenant_conf": None}, headers={"Content-Type": "application/json"}, ) assert res.status_code == 400 -def test_config_with_unknown_keys_is_bad_request(negative_env: NegativeTests): - """ - If we send a config with unknown keys, the request should be rejected with status 400. - """ - - env = negative_env.neon_env - tenant_id = negative_env.tenant_id - - config_with_unknown_keys = { - "compaction_period": "1h", - "this_key_does_not_exist": "some value", - } - - with pytest.raises(PageserverApiException) as e: - env.pageserver.tenant_attach(tenant_id, config=config_with_unknown_keys) - assert e.type == PageserverApiException - assert e.value.status_code == 400 - - @pytest.mark.parametrize("content_type", [None, "application/json"]) -def test_no_config(positive_env: NeonEnv, content_type: Optional[str]): +def test_empty_config(positive_env: NeonEnv, content_type: Optional[str]): """ When the 'config' body attribute is omitted, the request should be accepted and the tenant should use the default configuration @@ -141,11 +122,13 @@ def test_no_config(positive_env: NeonEnv, content_type: Optional[str]): ps_http.tenant_detach(tenant_id) assert tenant_id not in [TenantId(t["id"]) for t in ps_http.tenant_list()] - body = {"generation": env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id)} - - ps_http.post( - f"{ps_http.base_url}/v1/tenant/{tenant_id}/attach", - json=body, + ps_http.put( + f"{ps_http.base_url}/v1/tenant/{tenant_id}/location_config", + json={ + "mode": "AttachedSingle", + "generation": env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id), + "tenant_conf": {}, + }, headers=None if content_type else {"Content-Type": "application/json"}, ).raise_for_status() diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 7f79bf5d5cab..b26bd3422f30 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -164,13 +164,14 @@ def test_remote_storage_backup_and_restore( "data": {"reason": "storage-sync-list-remote-timelines"}, } + # Even though the tenant is broken, subsequent calls to location_conf API will succeed, but + # the tenant will always end up in a broken state as a result of the failpoint. # Ensure that even though the tenant is broken, retrying the attachment fails - with pytest.raises(Exception, match="Tenant state is Broken"): - # Use same generation as in previous attempt - gen_state = env.storage_controller.inspect(tenant_id) - assert gen_state is not None - generation = gen_state[0] - env.pageserver.tenant_attach(tenant_id, generation=generation) + tenant_info = wait_until_tenant_state(pageserver_http, tenant_id, "Broken", 15) + gen_state = env.storage_controller.inspect(tenant_id) + assert gen_state is not None + generation = gen_state[0] + env.pageserver.tenant_attach(tenant_id, generation=generation) # Restart again, this implicitly clears the failpoint. # test_remote_failures=1 remains active, though, as it's in the pageserver config. diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 4c49e6fb856c..2056840558e6 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -275,16 +275,6 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - # first check for non existing tenant - tenant_id = TenantId.generate() - with pytest.raises( - expected_exception=PageserverApiException, - match=f"NotFound: tenant {tenant_id}", - ) as excinfo: - pageserver_http.tenant_detach(tenant_id) - - assert excinfo.value.status_code == 404 - # create new nenant tenant_id, timeline_id = env.neon_cli.create_tenant() @@ -344,44 +334,6 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): pageserver_http.timeline_gc(tenant_id, timeline_id, 0) -# Creates a tenant, and detaches it with extra paremeter that forces ignored tenant detach. -# Tenant should be detached without issues. -def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv): - env = neon_simple_env - client = env.pageserver.http_client() - - # create a new tenant - tenant_id, _ = env.neon_cli.create_tenant() - - env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - - # assert tenant exists on disk - assert env.pageserver.tenant_dir(tenant_id).exists() - - endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) - # we rely upon autocommit after each statement - endpoint.safe_psql_many( - queries=[ - "CREATE TABLE t(key int primary key, value text)", - "INSERT INTO t SELECT generate_series(1,100000), 'payload'", - ] - ) - - log.info("detaching regular tenant with detach ignored flag") - client.tenant_detach(tenant_id, True) - - log.info("regular tenant detached without error") - - # check that nothing is left on disk for deleted tenant - assert not env.pageserver.tenant_dir(tenant_id).exists() - - # assert the tenant does not exists in the Pageserver - tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()] - assert ( - tenant_id not in tenants_after_detach - ), f"Ignored and then detached tenant {tenant_id} should not be present in pageserver's memory" - - def test_detach_while_attaching( neon_env_builder: NeonEnvBuilder, ): diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 3110833563cd..f47356839c26 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -840,7 +840,7 @@ def all_active(): # Detaching a stuck tenant should proceed promptly # (reproducer for https://github.com/neondatabase/neon/pull/6430) - env.pageserver.http_client().tenant_detach(detach_tenant_id, timeout_secs=10) + env.pageserver.http_client().tenant_detach(detach_tenant_id) tenant_ids.remove(detach_tenant_id) # FIXME: currently the mechanism for cancelling attach is to set state to broken, which is reported spuriously at error level env.pageserver.allowed_errors.append( diff --git a/test_runner/regress/test_walredo_not_left_behind_on_detach.py b/test_runner/regress/test_walredo_not_left_behind_on_detach.py index ad37807dba21..375cfcb4feb0 100644 --- a/test_runner/regress/test_walredo_not_left_behind_on_detach.py +++ b/test_runner/regress/test_walredo_not_left_behind_on_detach.py @@ -37,7 +37,7 @@ def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder): expected_exception=PageserverApiException, match=f"NotFound: tenant {tenant_id}", ): - pageserver_http.tenant_detach(tenant_id) + pageserver_http.tenant_status(tenant_id) # create new nenant tenant_id, _ = env.neon_cli.create_tenant()