Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Blocking nvme disconnect fixes #776

Merged
merged 3 commits into from
Mar 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 45 additions & 48 deletions control-plane/agents/src/bin/ha/node/server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,52 +97,48 @@ impl NodeAgentSvc {
}

/// Disconnect cached NVMe controller.
fn disconnect_controller(ctrlr: &NvmeController, new_path: String) -> Result<(), SvcError> {
let parsed_path = parse_uri(new_path.as_str())?;

match get_nvme_path_entry(&ctrlr.path) {
async fn disconnect_controller(ctrlr: &NvmeController, new_path: String) -> Result<(), SvcError> {
let new_path_uri = parse_uri(new_path.as_str())?;
let path = &ctrlr.path;
match get_nvme_path_entry(path) {
Some(pbuf) => {
let subsystem = Subsystem::new(pbuf.path()).map_err(|_| SvcError::NotFound {
kind: ResourceKind::NvmeSubsystem,
id: ctrlr.path.to_owned(),
id: path.to_owned(),
})?;

// sanity check to make sure this information is still up to date!
if subsystem.nqn != parsed_path.nqn {
if subsystem.nqn != new_path_uri.nqn {
return Err(SvcError::UnexpectedSubsystemNqn {
nqn: subsystem.nqn,
expected_nqn: parsed_path.nqn,
expected_nqn: new_path_uri.nqn,
path: subsystem.name,
});
}

if subsystem
.address
.match_host_port(parsed_path.host(), &parsed_path.port().to_string())
.match_host_port(new_path_uri.host(), &new_path_uri.port().to_string())
{
tracing::info!(path = ctrlr.path, "Not disconnecting same NVMe controller");
tracing::info!(path, "Not disconnecting same NVMe controller");
Ok(())
} else {
tracing::info!(path = ctrlr.path, "Disconnecting NVMe controller");
tracing::info!(path, "Disconnecting NVMe controller");

// clarification: we're not disconnecting the subsystem, but rather the controller
subsystem.disconnect().map_err(|e| SvcError::Internal {
details: format!(
"Failed to disconnect NVMe controller {}: {:?}",
ctrlr.path, e,
),
tokio::task::block_in_place(move || subsystem.disconnect()).map_err(|error| {
SvcError::Internal {
details: format!("Failed to disconnect NVMe controller {path}: {error:?}"),
}
})
}
}
None => {
tracing::error!(
path = ctrlr.path,
"Failed to get system path for controller"
);
tracing::error!(path, "Failed to get system path for controller");

Err(SvcError::NotFound {
kind: ResourceKind::NvmePath,
id: ctrlr.path.to_owned(),
id: path.to_owned(),
})
}
}
Expand Down Expand Up @@ -193,30 +189,26 @@ impl NodeAgentSvc {
) {
Ok(subsystem) => {
tracing::info!(new_path, "Successfully connected to NVMe target");
subsystem
}
Err(error) => {
return Err(SubsystemNotFound {
nqn,
details: error.to_string(),
});
Ok(subsystem)
}
Err(error) => Err(SubsystemNotFound {
nqn: nqn.clone(),
details: error.to_string(),
}),
},
Err(error) => {
tracing::error!(
new_path,
%error,
"Failed to connect to new NVMe target"
);
let nvme_err = format!(
"Failed to connect to new NVMe target: {}, new path: {}, nqn: {}",
error.full_string(),
new_path,
nqn
let details = format!(
"Failed to connect to new NVMe target: {error}, new path: {new_path}, nqn: {nqn}",
error = error.full_string(),
);
return Err(SvcError::NvmeConnectError { details: nvme_err });
Err(SvcError::NvmeConnectError { details })
}
};
}?;

let now = Instant::now();
let timeout = self.path_connection_timeout;
Expand Down Expand Up @@ -301,16 +293,21 @@ impl NodeAgentSvc {
}
};

// Remove the subsystem.
if let Err(error) = curr_subsystem.disconnect() {
tracing::error!(
?curr_subsystem,
?error,
"Failed to disconnect NVMe subsystem"
);
} else {
tracing::info!(?curr_subsystem, "NVMe subsystem successfully disconnected");
}
tokio::task::block_in_place(move || {
// Remove the subsystem.
if let Err(error) = curr_subsystem.disconnect() {
tracing::error!(
?curr_subsystem,
?error,
"Failed to disconnect NVMe subsystem"
);
} else {
tracing::info!(
?curr_subsystem,
"NVMe subsystem successfully disconnected"
);
}
});
} else {
tracing::info!(?subsystem, "Keeping NVMe subsystem after connect timeout");
}
Expand All @@ -330,9 +327,9 @@ impl NodeAgentOperations for NodeAgentSvc {
request: &dyn ReplacePathInfo,
_context: Option<Context>,
) -> Result<(), ReplyError> {
tracing::info!("Replacing failed NVMe path: {:?}", request);
tracing::info!("Replacing failed NVMe path: {request:?}");
// Lookup NVMe controller whose path has failed.
let ctrlr = self
let ctrlrs = self
.path_cache
.lookup_controllers(request.target_nqn())
.await
Expand All @@ -359,8 +356,8 @@ impl NodeAgentOperations for NodeAgentSvc {
// path has been successfully created, so having the first failed path in addition
// to the second healthy one is OK: just display a warning and proceed as if
// the call has completed successfully.
for ctrl in ctrlr {
if let Err(error) = disconnect_controller(&ctrl, request.new_path()) {
for ctrl in ctrlrs {
if let Err(error) = disconnect_controller(&ctrl, request.new_path()).await {
tracing::warn!(
uri=%request.new_path(),
%error,
Expand Down
2 changes: 1 addition & 1 deletion control-plane/grpc/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ pub fn timeout_grpc(op_id: MessageId, timeout_opts: TimeoutOptions) -> Duration
MessageIdVs::DestroyReplicaSnapshot => min_timeouts.replica_snapshot(),

MessageIdVs::CreatePool => min_timeouts.pool(),
MessageIdVs::ImportPool => min_timeouts.pool(),
MessageIdVs::ImportPool => min_timeouts.pool() * 3,
MessageIdVs::DestroyPool => min_timeouts.pool(),

MessageIdVs::ReplacePathInfo => min_timeouts.nvme_reconnect(),
Expand Down
2 changes: 1 addition & 1 deletion control-plane/stor-port/src/transport_api/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -524,7 +524,7 @@ impl Default for RequestMinTimeout {
pool: Duration::from_secs(20),
nexus_shutdown: Duration::from_secs(15),
nexus_snapshot: Duration::from_secs(30),
nvme_reconnect: Duration::from_secs(12),
nvme_reconnect: Duration::from_secs(62),
}
}
}
Expand Down
Loading