Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Check if copysets on chunkservers consistent with mds when check copyset health. #184

Merged
merged 1 commit into from
Dec 3, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions proto/topology.proto
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,14 @@ message GetClusterInfoResponse {
optional string clusterId = 2;
}

// Request of TopologyService.GetCopySetsInCluster.
// Carries no fields: the call takes no parameters.
message GetCopySetsInClusterRequest {
}

// Response of TopologyService.GetCopySetsInCluster.
message GetCopySetsInClusterResponse {
// Result code of the call (the mds side sets kTopoErrCodeSuccess on success).
required sint32 statusCode = 1;
// One entry per copyset returned by mds.
repeated CopysetInfo copysetInfos = 2;
}

//TODO(hzsunjianliang): update userPolicy and so on
service TopologyService {
rpc RegistChunkServer(ChunkServerRegistRequest) returns (ChunkServerRegistResponse);
Expand Down Expand Up @@ -468,6 +476,7 @@ service TopologyService {

rpc GetChunkServerListInCopySets(GetChunkServerListInCopySetsRequest) returns (GetChunkServerListInCopySetsResponse);
rpc GetCopySetsInChunkServer(GetCopySetsInChunkServerRequest) returns (GetCopySetsInChunkServerResponse);
rpc GetCopySetsInCluster(GetCopySetsInClusterRequest) returns (GetCopySetsInClusterResponse);
rpc GetClusterInfo(GetClusterInfoRequest) returns (GetClusterInfoResponse);
}

29 changes: 29 additions & 0 deletions src/mds/topology/topology_service.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -775,6 +775,35 @@ void TopologyServiceImpl::GetCopySetsInChunkServer(
}
}

// RPC handler for TopologyService::GetCopySetsInCluster.
// Logs the incoming request, lets the topology service manager fill in
// `response`, then logs the outcome: the full response on error, or only the
// number of copysets on success. `done` is run when the handler returns.
void TopologyServiceImpl::GetCopySetsInCluster(
    google::protobuf::RpcController* cntl_base,
    const GetCopySetsInClusterRequest* request,
    GetCopySetsInClusterResponse* response,
    google::protobuf::Closure* done) {
    // Runs `done` on every exit path of this handler.
    brpc::ClosureGuard closureGuard(done);
    brpc::Controller* controller = static_cast<brpc::Controller*>(cntl_base);

    LOG(INFO) << "Received request[log_id=" << controller->log_id()
              << "] from " << controller->remote_side()
              << " to " << controller->local_side()
              << ". [GetCopySetsInClusterRequest]";

    topology_->GetCopySetsInCluster(request, response);

    if (kTopoErrCodeSuccess == response->statuscode()) {
        // Success: log only the count of copysets, not the whole response.
        LOG(INFO) << "Send response[log_id=" << controller->log_id()
                  << "] from " << controller->local_side()
                  << " to " << controller->remote_side()
                  << ". [GetCopySetsInClusterResponse] copyset num: "
                  << response->copysetinfos_size();
        return;
    }

    // Failure: dump the full response to help diagnosis.
    LOG(ERROR) << "Send response[log_id=" << controller->log_id()
               << "] from " << controller->local_side()
               << " to " << controller->remote_side()
               << ". [GetCopySetsInClusterResponse] "
               << response->DebugString();
}

void TopologyServiceImpl::GetClusterInfo(
google::protobuf::RpcController* cntl_base,
const GetClusterInfoRequest* request,
Expand Down
6 changes: 6 additions & 0 deletions src/mds/topology/topology_service.h
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,12 @@ class TopologyServiceImpl : public TopologyService {
GetCopySetsInChunkServerResponse* response,
google::protobuf::Closure* done);

// RPC handler for the GetCopySetsInCluster method of TopologyService.
// `response` receives the status code and the copyset list; `done` is the
// completion closure of the RPC.
virtual void GetCopySetsInCluster(
google::protobuf::RpcController* cntl_base,
const GetCopySetsInClusterRequest* request,
GetCopySetsInClusterResponse* response,
google::protobuf::Closure* done);

virtual void GetClusterInfo(
google::protobuf::RpcController* cntl_base,
const GetClusterInfoRequest* request,
Expand Down
13 changes: 13 additions & 0 deletions src/mds/topology/topology_service_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1373,6 +1373,19 @@ void TopologyServiceManager::GetCopySetsInChunkServer(
}
}

// Fills `response` with the (logical pool id, copyset id) of every copyset
// returned by the topology and marks the response as successful.
// `request` has no fields and is not read.
void TopologyServiceManager::GetCopySetsInCluster(
    const GetCopySetsInClusterRequest* request,
    GetCopySetsInClusterResponse* response) {
    std::vector<CopySetKey> keysInCluster = topology_->GetCopySetsInCluster();

    response->set_statuscode(kTopoErrCodeSuccess);

    for (const CopySetKey& key : keysInCluster) {
        // key.first is the logical pool id, key.second the copyset id.
        CopysetInfo* entry = response->add_copysetinfos();
        entry->set_logicalpoolid(key.first);
        entry->set_copysetid(key.second);
    }
}

void TopologyServiceManager::GetClusterInfo(
const GetClusterInfoRequest* request,
GetClusterInfoResponse* response) {
Expand Down
4 changes: 4 additions & 0 deletions src/mds/topology/topology_service_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,10 @@ class TopologyServiceManager {
const GetCopySetsInChunkServerRequest* request,
GetCopySetsInChunkServerResponse* response);

// Lists the copysets of the cluster.
// `response` receives the status code and one CopysetInfo
// (logical pool id + copyset id) per copyset.
virtual void GetCopySetsInCluster(
const GetCopySetsInClusterRequest* request,
GetCopySetsInClusterResponse* response);
virtual void GetClusterInfo(
const GetClusterInfoRequest* request,
GetClusterInfoResponse* response);
Expand Down
59 changes: 59 additions & 0 deletions src/tools/copyset_check_core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,12 @@ int CopysetCheckCore::CheckCopysetsInCluster() {
isHealthy = false;
}
}
// 检查从chunkserver上获取的copyset数量与mds记录的数量是否一致
res = CheckCopysetsWithMds();
if (res != 0) {
std::cout << "CheckCopysetNumWithMds fail!" << std::endl;
return -1;
}
// 如果不健康,直接返回,如果健康,还需要对operator作出判断
if (!isHealthy) {
return -1;
Expand All @@ -416,6 +422,59 @@ int CopysetCheckCore::CheckCopysetsInCluster() {
return 0;
}

// Verifies that the copysets collected from the chunkservers
// (copysets_[kTotal]) are the same as the copysets recorded in mds.
//
// @return 0 if both sides hold exactly the same group ids; -1 if the mds
//         query fails, the copyset counts differ, or either side has a
//         copyset the other side lacks (details are printed to stdout).
int CopysetCheckCore::CheckCopysetsWithMds() {
    std::vector<CopysetInfo> copysetsInMds;
    int res = mdsClient_->GetCopySetsInCluster(&copysetsInMds);
    if (res != 0) {
        std::cout << "GetCopySetsInCluster fail!" << std::endl;
        return -1;
    }
    // NOTE(review): the returned statistics are never read below; the call is
    // kept only in case it has side effects - confirm and remove if not.
    CopysetStatistics statistics = GetCopysetStatistics();
    if (copysetsInMds.size() != copysets_[kTotal].size()) {
        std::cout << "Copyset numbers in chunkservers not consistent"
                     " with mds, please check! copysets on chunkserver: "
                  << copysets_[kTotal].size() << ", copysets in mds: "
                  << copysetsInMds.size() << std::endl;
        return -1;
    }
    // Convert the mds copysets to group ids so that they can be compared with
    // the group ids gathered from the chunkservers.
    std::set<std::string> copysetsInMdsGid;
    for (const auto& copyset : copysetsInMds) {
        std::string gId = ToGroupId(copyset.logicalpoolid(),
                                    copyset.copysetid());
        copysetsInMdsGid.insert(gId);
    }
    int ret = 0;
    // std::set_difference writes through the output iterator and never grows
    // the destination, so the destination must be pre-sized to the largest
    // possible result (the size of the first range) and trimmed afterwards.
    // A fixed size of 10 overflowed as soon as more than 10 copysets differed.
    // NOTE(review): set_difference needs both ranges sorted; copysets_[kTotal]
    // is presumably an ordered set of group-id strings - confirm.
    std::vector<std::string> copysetsInMdsNotInCs(copysetsInMdsGid.size());
    auto iter = std::set_difference(copysetsInMdsGid.begin(),
        copysetsInMdsGid.end(), copysets_[kTotal].begin(),
        copysets_[kTotal].end(), copysetsInMdsNotInCs.begin());
    copysetsInMdsNotInCs.resize(iter - copysetsInMdsNotInCs.begin());
    if (!copysetsInMdsNotInCs.empty()) {
        std::cout << "There are " << copysetsInMdsNotInCs.size()
                  << " copysets on mds not found on chunkserver, defail:";
        for (const auto& copyset : copysetsInMdsNotInCs) {
            std::cout << " " << copyset;
        }
        std::cout << std::endl;
        ret = -1;
    }
    // Same check in the other direction: on chunkservers but unknown to mds.
    std::vector<std::string> copysetsInCsNotInMds(copysets_[kTotal].size());
    iter = std::set_difference(copysets_[kTotal].begin(),
        copysets_[kTotal].end(), copysetsInMdsGid.begin(),
        copysetsInMdsGid.end(), copysetsInCsNotInMds.begin());
    copysetsInCsNotInMds.resize(iter - copysetsInCsNotInMds.begin());
    if (!copysetsInCsNotInMds.empty()) {
        std::cout << "There are " << copysetsInCsNotInMds.size()
                  << " copysets on chunkserver not found on Mds, defail:";
        for (const auto& copyset : copysetsInCsNotInMds) {
            std::cout << " " << copyset;
        }
        std::cout << std::endl;
        ret = -1;
    }
    return ret;
}

int CopysetCheckCore::CheckOperator(const std::string& opName,
uint64_t checkTimeSec) {
uint64_t startTime = curve::common::TimeUtility::GetTimeofDaySec();
Expand Down
2 changes: 2 additions & 0 deletions src/tools/copyset_check_core.h
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,8 @@ class CopysetCheckCore {
void UpdateChunkServerCopysets(const std::string& csAddr,
const CopySetInfosType& copysetInfos);

// Checks that the copysets found on chunkservers are consistent with the
// copysets recorded in mds. Returns 0 if consistent, -1 otherwise.
int CheckCopysetsWithMds();

private:
// 向mds发送RPC的client
std::shared_ptr<MDSClient> mdsClient_;
Expand Down
2 changes: 0 additions & 2 deletions src/tools/curve_tool_define.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,6 @@ const char kRemovePeerCmd[] = "remove-peer";
const char kTransferLeaderCmd[] = "transfer-leader";
const char kResetPeerCmd[] = "reset-peer";

// 快照检查命令
const char kSnapshotCheckCmd[] = "snapshot-check";
// 调度模块命令
const char kRapidLeaderSchedule[] = "rapid-leader-schedule";

Expand Down
5 changes: 4 additions & 1 deletion src/tools/curve_tool_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,10 @@ const char* kHelpStr = "Usage: curve_ops_tool [Command] [OPTIONS...]\n"
"etcd-status : show the etcd status\n"
"snapshot-clone-status : show the snapshot clone server status\n"
"copysets-status : check the health state of all copysets\n"
"chunkserver-list : show curve chunkserver-list, list all chunkserver infomation\n" //NOLINT
"chunkserver-list : show curve chunkserver-list, list all chunkserver information\n" //NOLINT
"server-list : list all server information\n"
"logical-pool-list : list all logical pool information\n"
"cluster-status : show cluster status\n"
"get : show the file info and the actual space of file\n"
"list : list the file info of files in the directory\n"
"seginfo : list the segments info of the file\n"
Expand Down
30 changes: 30 additions & 0 deletions src/tools/mds_client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -739,6 +739,36 @@ int MDSClient::GetCopySetsInChunkServer(
return -1;
}

int MDSClient::GetCopySetsInCluster(std::vector<CopysetInfo>* copysets) {
assert(copysets != nullptr);
curve::mds::topology::GetCopySetsInClusterRequest request;
curve::mds::topology::GetCopySetsInClusterResponse response;
curve::mds::topology::TopologyService_Stub stub(&channel_);

void (curve::mds::topology::TopologyService_Stub::*fp)(
google::protobuf::RpcController*,
const curve::mds::topology::GetCopySetsInClusterRequest*,
curve::mds::topology::GetCopySetsInClusterResponse*,
google::protobuf::Closure*);
fp = &curve::mds::topology::TopologyService_Stub::GetCopySetsInCluster;
if (SendRpcToMds(&request, &response, &stub, fp) != 0) {
std::cout << "GetCopySetsInCluster from all mds fail!"
<< std::endl;
return -1;
}

if (response.has_statuscode() &&
response.statuscode() == kTopoErrCodeSuccess) {
for (int i =0; i < response.copysetinfos_size(); ++i) {
copysets->emplace_back(response.copysetinfos(i));
}
return 0;
}
std::cout << "GetCopySetsInCluster fail with errCode: "
<< response.statuscode() << std::endl;
return -1;
}

int MDSClient::ListServersInCluster(std::vector<ServerInfo>* servers) {
assert(servers != nullptr);
// 先列出逻辑池
Expand Down
7 changes: 7 additions & 0 deletions src/tools/mds_client.h
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,13 @@ class MDSClient {
virtual int GetCopySetsInChunkServer(const std::string& csAddr,
std::vector<CopysetInfo>* copysets);

/**
* @brief Get all copysets in the cluster
* @param[out] copysets list of the copysets in the cluster
* @return 0 on success, -1 on failure
*/
virtual int GetCopySetsInCluster(std::vector<CopysetInfo>* copysets);

/**
* @brief 列出集群中的所有server
* @param[out] servers server信息的列表,返回值为0时有效
Expand Down
4 changes: 4 additions & 0 deletions test/mds/topology/mock_topology.h
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,10 @@ class MockTopologyServiceManager : public TopologyServiceManager {
const GetCopySetsInChunkServerRequest *request,
GetCopySetsInChunkServerResponse *response));

// gmock stub for the two-argument GetCopySetsInCluster(request, response).
MOCK_METHOD2(GetCopySetsInCluster, void(
const GetCopySetsInClusterRequest *request,
GetCopySetsInClusterResponse *response));

MOCK_METHOD2(GetClusterInfo,
void(const GetClusterInfoRequest* request,
GetClusterInfoResponse* response));
Expand Down
56 changes: 56 additions & 0 deletions test/mds/topology/test_topology_service.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1458,6 +1458,62 @@ TEST_F(TestTopologyService, test_GetCopySetsInChunkServer_fail) {
ASSERT_EQ(kTopoErrCodeInvalidParam, response.statuscode());
}

// The service must pass a success status produced by the (mocked) manager
// back to the RPC caller unchanged.
TEST_F(TestTopologyService, test_GetCopySetsInCluster_success) {
    // Canned reply that the mocked manager writes into the response argument.
    GetCopySetsInClusterResponse mockedReply;
    mockedReply.set_statuscode(kTopoErrCodeSuccess);
    EXPECT_CALL(*manager_, GetCopySetsInCluster(_, _))
        .WillRepeatedly(SetArgPointee<1>(mockedReply));

    brpc::Channel channel;
    if (0 != channel.Init(listenAddr_, NULL)) {
        FAIL() << "Fail to init channel "
               << std::endl;
    }
    TopologyService_Stub stub(&channel);

    GetCopySetsInClusterRequest request;
    GetCopySetsInClusterResponse response;
    brpc::Controller controller;

    // A null closure makes the call synchronous.
    stub.GetCopySetsInCluster(&controller, &request, &response, nullptr);

    if (controller.Failed()) {
        FAIL() << controller.ErrorText() << std::endl;
    }

    ASSERT_EQ(kTopoErrCodeSuccess, response.statuscode());
}

// The service must pass an error status produced by the (mocked) manager
// back to the RPC caller unchanged.
TEST_F(TestTopologyService, test_GetCopySetsInCluster_fail) {
    // Canned reply that the mocked manager writes into the response argument.
    GetCopySetsInClusterResponse mockedReply;
    mockedReply.set_statuscode(kTopoErrCodeInvalidParam);
    EXPECT_CALL(*manager_, GetCopySetsInCluster(_, _))
        .WillRepeatedly(SetArgPointee<1>(mockedReply));

    brpc::Channel channel;
    if (0 != channel.Init(listenAddr_, NULL)) {
        FAIL() << "Fail to init channel "
               << std::endl;
    }
    TopologyService_Stub stub(&channel);

    GetCopySetsInClusterRequest request;
    GetCopySetsInClusterResponse response;
    brpc::Controller controller;

    // A null closure makes the call synchronous.
    stub.GetCopySetsInCluster(&controller, &request, &response, nullptr);

    if (controller.Failed()) {
        FAIL() << controller.ErrorText() << std::endl;
    }

    ASSERT_EQ(kTopoErrCodeInvalidParam, response.statuscode());
}

} // namespace topology
} // namespace mds
} // namespace curve
Expand Down
35 changes: 35 additions & 0 deletions test/mds/topology/test_topology_service_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2783,6 +2783,41 @@ TEST_F(TestTopologyServiceManager,
ASSERT_EQ(0, response.copysetinfos_size());
}

// Adds 10 copysets (ids 1..10) to logical pool 1 and 10 copysets (ids 11..20)
// to logical pool 2, then expects GetCopySetsInCluster to return all 20,
// ordered by pool and copyset id.
TEST_F(TestTopologyServiceManager, test_GetCopySetsInCluster) {
    // Two physical pools, each backing one logical pool.
    PoolIdType physicalPoolId1 = 0x11;
    PoolIdType logicalPoolId1 = 0x1;
    PrepareAddPhysicalPool(physicalPoolId1);
    PrepareAddLogicalPool(logicalPoolId1, "logicalPool1", physicalPoolId1);
    PoolIdType physicalPoolId2 = 0x12;
    PoolIdType logicalPoolId2 = 0x2;
    PrepareAddPhysicalPool(physicalPoolId2);
    PrepareAddLogicalPool(logicalPoolId2, "logicalPool2", physicalPoolId2);

    std::set<ChunkServerIdType> members = {1, 2, 3};
    for (int copysetId = 1; copysetId <= 20; ++copysetId) {
        if (copysetId <= 10) {
            PrepareAddCopySet(copysetId, logicalPoolId1, members);
        } else {
            PrepareAddCopySet(copysetId, logicalPoolId2, members);
        }
    }

    GetCopySetsInClusterRequest request;
    GetCopySetsInClusterResponse response;
    serviceManager_->GetCopySetsInCluster(&request, &response);

    ASSERT_EQ(kTopoErrCodeSuccess, response.statuscode());
    ASSERT_EQ(20, response.copysetinfos_size());
    // The first ten entries belong to logical pool 1 ...
    for (int i = 0; i < 10; i++) {
        ASSERT_EQ(1, response.copysetinfos(i).logicalpoolid());
        ASSERT_EQ(i + 1, response.copysetinfos(i).copysetid());
    }
    // ... and the last ten to logical pool 2.
    for (int i = 10; i < 20; i++) {
        ASSERT_EQ(2, response.copysetinfos(i).logicalpoolid());
        ASSERT_EQ(i + 1, response.copysetinfos(i).copysetid());
    }
}


} // namespace topology
} // namespace mds
} // namespace curve
Expand Down
Loading