fix(ClientGoalHandle): Made mutex recursive to prevent deadlocks #2267

Merged · 2 commits · Sep 7, 2023
@@ -163,7 +163,7 @@ class ClientGoalHandle
   ResultCallback result_callback_{nullptr};
   int8_t status_{GoalStatus::STATUS_ACCEPTED};
 
-  std::mutex handle_mutex_;
+  std::recursive_mutex handle_mutex_;
 };
 }  // namespace rclcpp_action

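For context, the deadlock being fixed: the client invokes user callbacks (such as the feedback callback) while holding handle_mutex_, and a callback that calls back into the handle (for example handle->get_status()) locks the same mutex again from the same thread. Below is a minimal standalone sketch of that pattern with hypothetical names (HandleModel is not rclcpp code); with std::mutex the second same-thread lock deadlocks, while std::recursive_mutex permits it.

```cpp
#include <functional>
#include <iostream>
#include <mutex>

// Hypothetical stand-in for ClientGoalHandle; not rclcpp code.
class HandleModel
{
public:
  void set_callback(std::function<void(HandleModel &)> cb)
  {
    std::lock_guard<std::recursive_mutex> guard(mutex_);
    callback_ = cb;
  }

  int get_status()
  {
    // Second lock from the same thread: deadlock with std::mutex,
    // allowed with std::recursive_mutex.
    std::lock_guard<std::recursive_mutex> guard(mutex_);
    return status_;
  }

  void call_callback()
  {
    // The lock is held across the user callback, mirroring call_feedback_callback().
    std::lock_guard<std::recursive_mutex> guard(mutex_);
    if (callback_) {
      callback_(*this);  // the callback re-enters get_status() while we hold mutex_
    }
  }

private:
  std::recursive_mutex mutex_;
  std::function<void(HandleModel &)> callback_;
  int status_{0};
};

int main()
{
  HandleModel handle;
  handle.set_callback([](HandleModel & h) {
      std::cout << "status: " << h.get_status() << "\n";
    });
  handle.call_callback();  // would deadlock here if mutex_ were std::mutex
  return 0;
}
```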
22 changes: 11 additions & 11 deletions rclcpp_action/include/rclcpp_action/client_goal_handle_impl.hpp
@@ -59,7 +59,7 @@ template<typename ActionT>
 std::shared_future<typename ClientGoalHandle<ActionT>::WrappedResult>
 ClientGoalHandle<ActionT>::async_get_result()
 {
-  std::lock_guard<std::mutex> guard(handle_mutex_);
+  std::lock_guard<std::recursive_mutex> guard(handle_mutex_);
   if (!is_result_aware_) {
     throw exceptions::UnawareGoalHandleError();
   }
@@ -70,7 +70,7 @@ template<typename ActionT>
 void
 ClientGoalHandle<ActionT>::set_result(const WrappedResult & wrapped_result)
 {
-  std::lock_guard<std::mutex> guard(handle_mutex_);
+  std::lock_guard<std::recursive_mutex> guard(handle_mutex_);
   status_ = static_cast<int8_t>(wrapped_result.code);
   result_promise_.set_value(wrapped_result);
   if (result_callback_) {
@@ -82,55 +82,55 @@ template<typename ActionT>
 void
 ClientGoalHandle<ActionT>::set_feedback_callback(FeedbackCallback callback)
 {
-  std::lock_guard<std::mutex> guard(handle_mutex_);
+  std::lock_guard<std::recursive_mutex> guard(handle_mutex_);
   feedback_callback_ = callback;
 }
 
 template<typename ActionT>
 void
 ClientGoalHandle<ActionT>::set_result_callback(ResultCallback callback)
 {
-  std::lock_guard<std::mutex> guard(handle_mutex_);
+  std::lock_guard<std::recursive_mutex> guard(handle_mutex_);
   result_callback_ = callback;
 }
 
 template<typename ActionT>
 int8_t
 ClientGoalHandle<ActionT>::get_status()
 {
-  std::lock_guard<std::mutex> guard(handle_mutex_);
+  std::lock_guard<std::recursive_mutex> guard(handle_mutex_);
   return status_;
 }
 
 template<typename ActionT>
 void
 ClientGoalHandle<ActionT>::set_status(int8_t status)
 {
-  std::lock_guard<std::mutex> guard(handle_mutex_);
+  std::lock_guard<std::recursive_mutex> guard(handle_mutex_);
   status_ = status;
 }
 
 template<typename ActionT>
 bool
 ClientGoalHandle<ActionT>::is_feedback_aware()
 {
-  std::lock_guard<std::mutex> guard(handle_mutex_);
+  std::lock_guard<std::recursive_mutex> guard(handle_mutex_);
   return feedback_callback_ != nullptr;
 }
 
 template<typename ActionT>
 bool
 ClientGoalHandle<ActionT>::is_result_aware()
 {
-  std::lock_guard<std::mutex> guard(handle_mutex_);
+  std::lock_guard<std::recursive_mutex> guard(handle_mutex_);
   return is_result_aware_;
 }
 
 template<typename ActionT>
 bool
 ClientGoalHandle<ActionT>::set_result_awareness(bool awareness)
 {
-  std::lock_guard<std::mutex> guard(handle_mutex_);
+  std::lock_guard<std::recursive_mutex> guard(handle_mutex_);
   bool previous = is_result_aware_;
   is_result_aware_ = awareness;
   return previous;
@@ -140,7 +140,7 @@ template<typename ActionT>
 void
 ClientGoalHandle<ActionT>::invalidate(const exceptions::UnawareGoalHandleError & ex)
 {
-  std::lock_guard<std::mutex> guard(handle_mutex_);
+  std::lock_guard<std::recursive_mutex> guard(handle_mutex_);
   // Guard against multiple calls
   if (is_invalidated()) {
     return;
@@ -168,7 +168,7 @@ ClientGoalHandle<ActionT>::call_feedback_callback(
     RCLCPP_ERROR(rclcpp::get_logger("rclcpp_action"), "Sent feedback to wrong goal handle.");
     return;
   }
-  std::lock_guard<std::mutex> guard(handle_mutex_);
+  std::lock_guard<std::recursive_mutex> guard(handle_mutex_);
   if (nullptr == feedback_callback_) {
     // Normal, some feedback messages may arrive after the goal result.
     RCLCPP_DEBUG(rclcpp::get_logger("rclcpp_action"), "Received feedback but goal ignores it.");

Contributor:

Alternative idea: do we need to hold the lock here? What if we hold the lock in a scope {...} only long enough to copy feedback_callback_ to a local variable, then release the lock before calling the callback? I think that solves the issue and avoids the (probably small) overhead of using a recursive mutex.
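A rough sketch of this proposal (hypothetical, not the merged code), written as the body of ClientGoalHandle<ActionT>::call_feedback_callback() using the names from the diff above:

```cpp
// Inside ClientGoalHandle<ActionT>::call_feedback_callback(shared_this, feedback_message):
FeedbackCallback feedback_callback_copy;
{
  // Hold a plain std::mutex only long enough to copy the callback.
  std::lock_guard<std::mutex> guard(handle_mutex_);
  if (nullptr == feedback_callback_) {
    // Normal, some feedback messages may arrive after the goal result.
    RCLCPP_DEBUG(rclcpp::get_logger("rclcpp_action"), "Received feedback but goal ignores it.");
    return;
  }
  feedback_callback_copy = feedback_callback_;
}
// Lock released: the callback may now call get_status() etc. without deadlocking.
feedback_callback_copy(shared_this, feedback_message);
```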

Contributor:

I'm not sure that will work.

There are a couple of assumptions at work here:

  1. That the shared_this being passed in here is directly the same as this. That is a weird API, but OK. In that case, we can consider that everything that would need to be protected from concurrent access to this also applies to shared_this.
  2. That the handle_mutex_ here is protecting all members of this object from concurrent modification/access.

Taken together, I think it means that we need to hold the mutex across the entire operation (since feedback_callback_ is passed shared_this). Does that make sense? Or am I maybe missing something?

Contributor:

> That the shared_this being passed in here is directly the same as this. That is a weird API, but OK. In that case, we can consider that everything that would need to be protected from concurrent access to this also applies to shared_this.

If shared_this isn't this, then holding a lock on this->handle_mutex_ isn't going to help, as the feedback callback is given shared_this, not this. I think we can ignore this, though, because there is a check that shared_this.get() == this above.

> That the handle_mutex_ here is protecting all members of this object from concurrent modification/access.
> [...] Taken together, I think it means that we need to hold the mutex across the entire operation [...]

IIUC holding the lock during the feedback's entire call would protect against a situation where the goal handle changed state between two feedback calls. Just to get an idea of what could change, it looks like all the public methods are:

  /// Get the unique ID for the goal.
  const GoalUUID &
  get_goal_id() const;

  /// Get the time when the goal was accepted.
  rclcpp::Time
  get_goal_stamp() const;

  /// Get the goal status code.
  int8_t
  get_status();

  /// Check if an action client has subscribed to feedback for the goal.
  bool
  is_feedback_aware();

  /// Check if an action client has requested the result for the goal.
  bool
  is_result_aware();

All of those methods only "read" data from the class, so the concern would be a "write" somewhere else happening at the same time as the feedback callback. "Writes" could come from the Client, as it's a friend of this class.

I'm choosing to ignore get_goal_id() and get_goal_stamp() as they're not even protected by locks. They assume info_ is constant (side note: I think this could be enforced by adding const here).

get_status() is protected by a lock. If we only wrap the copy of feedback_callback_ in the lock, then it's possible the feedback callback could call get_status() twice and get different results (executing on one call, then complete on the next).

is_feedback_aware() is protected by a lock and means "does this goal handle have a feedback callback?" If the callback is being executed, it already knows the answer and doesn't need to call it. I suppose the feedback callback could get set to nullptr while this one is running, and is_feedback_aware() might then return false if this function called it, but I wasn't able to find any calls to client_goal_handle->set_feedback_callback() anywhere (side note: if it's private and no one calls it, can we delete it?).

is_result_aware() is protected by a lock and means "does someone want the result of this goal?" It's possible that someone could write a feedback callback that has access to the Client and only asks for the result if the feedback reports a certain value. If they were using a multithreaded executor, put the Client into a Reentrant callback group, and feedback messages came in fast enough, then two feedback callbacks could be executed simultaneously and both would call client->async_get_result(). There can only be one result callback, so the second feedback callback's registration would replace the first's.

Affecting decision-making in the feedback callback based on is_result_aware() seems like the only potential negative consequence to me, and it requires the user to pass the Client into the callback some other way.

All that to say, I'm fine with the current approach, but I don't see any significant problems with releasing the lock during the feedback callback either. All holding the lock seems to do is make sure two feedback callbacks on one goal handle can't be run in parallel regardless of the executor or callback group.
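A standalone sketch of that scenario, with hypothetical names (GoalModel is not rclcpp code): two concurrent feedback callbacks both register a result callback, and only the last registration survives.

```cpp
#include <functional>
#include <iostream>
#include <mutex>
#include <thread>

// Hypothetical model of a goal handle with a single result-callback slot.
struct GoalModel
{
  std::mutex mutex;
  std::function<void()> result_callback;  // one slot, like result_callback_

  void async_get_result(std::function<void()> cb)
  {
    std::lock_guard<std::mutex> guard(mutex);
    result_callback = cb;  // last registration wins
  }
};

int main()
{
  GoalModel goal;

  // Models a feedback callback that decides to request the result.
  auto feedback = [&goal](int id) {
      goal.async_get_result([id]() {std::cout << "result goes to caller " << id << "\n";});
    };

  // Reentrant callback group + multithreaded executor, modeled as two threads.
  std::thread t1(feedback, 1);
  std::thread t2(feedback, 2);
  t1.join();
  t2.join();

  goal.result_callback();  // only one of the two registrations survived
  return 0;
}
```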

Collaborator:

> side note: if it's private and no one calls it, can we delete it?

I was thinking the same thing; I guess we can just remove it. But originally, was this supposed to be exposed to the user application so it could change the feedback callback? I was not sure about that.

Contributor Author:

I think releasing the mutex is a bad idea.
If a reentrant callback group is used, not holding the mutex could lead to strange behavior, like handle->get_status() returning a different value in the middle of the feedback callback.
I guess this is the reason why the call_feedback_callback function exists in the first place.

Contributor:

@sloretz Thanks for the detailed analysis, that is very helpful. Here are a few comments:

> IIUC holding the lock during the feedback's entire call would protect against a situation where the goal handle changed state between two feedback calls.

To be clear, we are already holding the lock across the feedback_callback_ right now. The difference in this PR is that we are switching that to a recursive_mutex. But your overall point stands that we could consider dropping the mutex before making that call.

Considering your points, I agree that given the code today we could probably drop the lock without too many negative consequences. But I think I still prefer to keep holding the lock for two reasons:

  1. @jmachowinski's point about things changing while the callback is running. We've run into these kinds of problems before in rclcpp, particularly around parameters, so we also hold a lock there for similar reasons.
  2. In the future it is possible someone could add a method that does a "write", and it would be hard to figure out that it breaks our current assumptions.

So all of that is to say that I'm good with this PR as-is. I'm going to do a review here, and then we can run CI on it.

80 changes: 80 additions & 0 deletions rclcpp_action/test/test_client.cpp
@@ -852,6 +852,86 @@ TEST_F(TestClientAgainstServer, async_cancel_some_goals_with_callback)
  EXPECT_EQ(rclcpp_action::GoalStatus::STATUS_CANCELED, goal_handle0->get_status());
}

TEST_F(TestClientAgainstServer, deadlock_in_callbacks)
{
  std::atomic<bool> feedback_callback_called = false;
  std::atomic<bool> response_callback_called = false;
  std::atomic<bool> result_callback_called = false;
  std::atomic<bool> no_deadlock = false;

  std::thread tr = std::thread(
    [&]() {
      auto action_client = rclcpp_action::create_client<ActionType>(client_node, action_name);
      ASSERT_TRUE(action_client->wait_for_action_server(WAIT_FOR_SERVER_TIMEOUT));

      ActionGoal goal;

      using GoalHandle = rclcpp_action::ClientGoalHandle<ActionType>;
      rclcpp_action::Client<ActionType>::SendGoalOptions ops;
      ops.feedback_callback =
        [&feedback_callback_called](const GoalHandle::SharedPtr handle,
        ActionType::Feedback::ConstSharedPtr) {
          // call functions on the handle that acquire the lock
          handle->get_status();
          handle->is_feedback_aware();
          handle->is_result_aware();

          feedback_callback_called = true;
        };
      ops.goal_response_callback = [&response_callback_called](
        const GoalHandle::SharedPtr & handle) {
          // call functions on the handle that acquire the lock
          handle->get_status();
          handle->is_feedback_aware();
          handle->is_result_aware();

          response_callback_called = true;
        };
      ops.result_callback = [&result_callback_called](
        const GoalHandle::WrappedResult &) {
          result_callback_called = true;
        };

      goal.order = 6;
      auto future_goal_handle = action_client->async_send_goal(goal, ops);
      dual_spin_until_future_complete(future_goal_handle);
      auto goal_handle = future_goal_handle.get();

      ASSERT_TRUE(goal_handle);

      ASSERT_EQ(RCL_RET_OK, rcl_set_ros_time_override(clock.get_clock_handle(), RCL_S_TO_NS(2)));

      auto result_future = action_client->async_get_result(goal_handle);
      dual_spin_until_future_complete(result_future);

      EXPECT_TRUE(result_future.valid());
      auto result = result_future.get();

      no_deadlock = true;
    });

  auto start_time = std::chrono::system_clock::now();

  while (std::chrono::system_clock::now() - start_time < std::chrono::milliseconds(2000) &&
    !no_deadlock)
  {
    std::this_thread::sleep_for(std::chrono::milliseconds(10));
  }

  if (no_deadlock) {
    tr.join();
  } else {
    // In case of a failure, the thread is assumed to be in a deadlock.
    // We detach the thread so we don't block further tests.
    tr.detach();
  }

  EXPECT_TRUE(no_deadlock);
  EXPECT_TRUE(response_callback_called);
  EXPECT_TRUE(result_callback_called);
  EXPECT_TRUE(feedback_callback_called);
}

TEST_F(TestClientAgainstServer, send_rcl_errors)
{
  auto action_client = rclcpp_action::create_client<ActionType>(client_node, action_name);