Skip to content

Commit

Permalink
controller: add storcon-cli (#7114)
Browse files Browse the repository at this point in the history
## Problem

During incidents, we may need to quickly access the storage controller's
API without trying API client code or crafting `curl` CLIs on the fly. A
basic CLI client is needed for this.

## Summary of changes

- Update storage controller node listing API to only use public types in
controller_api.rs
- Add a storage controller API for listing tenants
- Add a basic test that the CLI can list and modify nodes and tenants.
  • Loading branch information
jcsp committed Apr 3, 2024
1 parent 582cec5 commit 6e3834d
Show file tree
Hide file tree
Showing 11 changed files with 822 additions and 67 deletions.
21 changes: 21 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ members = [
"compute_tools",
"control_plane",
"control_plane/attachment_service",
"control_plane/storcon_cli",
"pageserver",
"pageserver/compaction",
"pageserver/ctl",
Expand Down
1 change: 1 addition & 0 deletions control_plane/attachment_service/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ git-version.workspace = true
hex.workspace = true
hyper.workspace = true
humantime.workspace = true
itertools.workspace = true
lasso.workspace = true
once_cell.workspace = true
pageserver_api.workspace = true
Expand Down
17 changes: 16 additions & 1 deletion control_plane/attachment_service/src/http.rs
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,15 @@ async fn handle_tenant_describe(
json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
}

async fn handle_tenant_list(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;

json_response(StatusCode::OK, service.tenant_list())
}

async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;

Expand All @@ -412,7 +421,10 @@ async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError
check_permissions(&req, Scope::Admin)?;

let state = get_state(&req);
json_response(StatusCode::OK, state.service.node_list().await?)
let nodes = state.service.node_list().await?;
let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::<Vec<_>>();

json_response(StatusCode::OK, api_nodes)
}

async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
Expand Down Expand Up @@ -793,6 +805,9 @@ pub fn make_router(
RequestName("control_v1_tenant_describe"),
)
})
.get("/control/v1/tenant", |r| {
tenant_service_handler(r, handle_tenant_list, RequestName("control_v1_tenant_list"))
})
.put("/control/v1/tenant/:tenant_id/policy", |r| {
named_request_span(
r,
Expand Down
16 changes: 15 additions & 1 deletion control_plane/attachment_service/src/node.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ use std::{str::FromStr, time::Duration};
use hyper::StatusCode;
use pageserver_api::{
controller_api::{
NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, TenantLocateResponseShard,
NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy,
TenantLocateResponseShard,
},
shard::TenantShardId,
};
Expand Down Expand Up @@ -256,6 +257,19 @@ impl Node {
)
.await
}

/// Generate the simplified API-friendly description of a node's state
pub(crate) fn describe(&self) -> NodeDescribeResponse {
NodeDescribeResponse {
id: self.id,
availability: self.availability.into(),
scheduling: self.scheduling,
listen_http_addr: self.listen_http_addr.clone(),
listen_http_port: self.listen_http_port,
listen_pg_addr: self.listen_pg_addr.clone(),
listen_pg_port: self.listen_pg_port,
}
}
}

impl std::fmt::Display for Node {
Expand Down
67 changes: 47 additions & 20 deletions control_plane/attachment_service/src/service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ use control_plane::storage_controller::{
use diesel::result::DatabaseErrorKind;
use futures::{stream::FuturesUnordered, StreamExt};
use hyper::StatusCode;
use itertools::Itertools;
use pageserver_api::{
controller_api::{
NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
Expand Down Expand Up @@ -2735,47 +2736,73 @@ impl Service {
})
}

pub(crate) fn tenant_describe(
/// Returns None if the input iterator of shards does not include a shard with number=0
fn tenant_describe_impl<'a>(
&self,
tenant_id: TenantId,
) -> Result<TenantDescribeResponse, ApiError> {
let locked = self.inner.read().unwrap();

shards: impl Iterator<Item = &'a TenantState>,
) -> Option<TenantDescribeResponse> {
let mut shard_zero = None;
let mut shards = Vec::new();
let mut describe_shards = Vec::new();

for (tenant_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id))
{
if tenant_shard_id.is_zero() {
for shard in shards {
if shard.tenant_shard_id.is_zero() {
shard_zero = Some(shard);
}

let response_shard = TenantDescribeResponseShard {
tenant_shard_id: *tenant_shard_id,
describe_shards.push(TenantDescribeResponseShard {
tenant_shard_id: shard.tenant_shard_id,
node_attached: *shard.intent.get_attached(),
node_secondary: shard.intent.get_secondary().to_vec(),
last_error: shard.last_error.lock().unwrap().clone(),
is_reconciling: shard.reconciler.is_some(),
is_pending_compute_notification: shard.pending_compute_notification,
is_splitting: matches!(shard.splitting, SplitState::Splitting),
};
shards.push(response_shard);
scheduling_policy: *shard.get_scheduling_policy(),
})
}

let Some(shard_zero) = shard_zero else {
return Err(ApiError::NotFound(
anyhow::anyhow!("Tenant {tenant_id} not found").into(),
));
};
let shard_zero = shard_zero?;

Ok(TenantDescribeResponse {
shards,
Some(TenantDescribeResponse {
tenant_id: shard_zero.tenant_shard_id.tenant_id,
shards: describe_shards,
stripe_size: shard_zero.shard.stripe_size,
policy: shard_zero.policy.clone(),
config: shard_zero.config.clone(),
})
}

pub(crate) fn tenant_describe(
&self,
tenant_id: TenantId,
) -> Result<TenantDescribeResponse, ApiError> {
let locked = self.inner.read().unwrap();

self.tenant_describe_impl(
locked
.tenants
.range(TenantShardId::tenant_range(tenant_id))
.map(|(_k, v)| v),
)
.ok_or_else(|| ApiError::NotFound(anyhow::anyhow!("Tenant {tenant_id} not found").into()))
}

pub(crate) fn tenant_list(&self) -> Vec<TenantDescribeResponse> {
let locked = self.inner.read().unwrap();

let mut result = Vec::new();
for (_tenant_id, tenant_shards) in
&locked.tenants.iter().group_by(|(id, _shard)| id.tenant_id)
{
result.push(
self.tenant_describe_impl(tenant_shards.map(|(_k, v)| v))
.expect("Groups are always non-empty"),
);
}

result
}

#[instrument(skip_all, fields(tenant_id=%op.tenant_id))]
async fn abort_tenant_shard_split(
&self,
Expand Down
25 changes: 1 addition & 24 deletions control_plane/src/bin/neon_local.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,7 @@ use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
use control_plane::safekeeper::SafekeeperNode;
use control_plane::storage_controller::StorageController;
use control_plane::{broker, local_env};
use pageserver_api::controller_api::{
NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy,
};
use pageserver_api::controller_api::PlacementPolicy;
use pageserver_api::models::{
ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
};
Expand Down Expand Up @@ -1060,21 +1058,6 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
}
}

Some(("set-state", subcommand_args)) => {
let pageserver = get_pageserver(env, subcommand_args)?;
let scheduling = subcommand_args.get_one("scheduling");
let availability = subcommand_args.get_one("availability");

let storage_controller = StorageController::from_env(env);
storage_controller
.node_configure(NodeConfigureRequest {
node_id: pageserver.conf.id,
scheduling: scheduling.cloned(),
availability: availability.cloned(),
})
.await?;
}

Some(("status", subcommand_args)) => {
match get_pageserver(env, subcommand_args)?.check_status().await {
Ok(_) => println!("Page server is up and running"),
Expand Down Expand Up @@ -1515,12 +1498,6 @@ fn cli() -> Command {
.about("Restart local pageserver")
.arg(pageserver_config_args.clone())
)
.subcommand(Command::new("set-state")
.arg(Arg::new("availability").value_parser(value_parser!(NodeAvailability)).long("availability").action(ArgAction::Set).help("Availability state: offline,active"))
.arg(Arg::new("scheduling").value_parser(value_parser!(NodeSchedulingPolicy)).long("scheduling").action(ArgAction::Set).help("Scheduling state: draining,pause,filling,active"))
.about("Set scheduling or availability state of pageserver node")
.arg(pageserver_config_args.clone())
)
)
.subcommand(
Command::new("storage_controller")
Expand Down
23 changes: 23 additions & 0 deletions control_plane/storcon_cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
[package]
name = "storcon_cli"
version = "0.1.0"
edition.workspace = true
license.workspace = true


[dependencies]
anyhow.workspace = true
clap.workspace = true
comfy-table.workspace = true
hyper.workspace = true
pageserver_api.workspace = true
pageserver_client.workspace = true
reqwest.workspace = true
serde.workspace = true
serde_json = { workspace = true, features = ["raw_value"] }
thiserror.workspace = true
tokio.workspace = true
tracing.workspace = true
utils.workspace = true
workspace_hack.workspace = true

Loading

1 comment on commit 6e3834d

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

2834 tests run: 2681 passed, 0 failed, 153 skipped (full report)


Flaky tests (1)

Postgres 15

  • test_vm_bit_clear_on_heap_lock: debug

Code coverage* (full report)

  • functions: 28.1% (6380 of 22709 functions)
  • lines: 47.0% (44976 of 95727 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
6e3834d at 2024-04-03T11:19:53.610Z :recycle:

Please sign in to comment.