Skip to content

Commit

Permalink
Check if copysets on chunkservers consistent with mds when check copy…
Browse files Browse the repository at this point in the history
…sets health

Change-Id: I8fdb353323595b583dd2a31f5b6b397b8539bade
  • Loading branch information
bai-charisu committed Dec 3, 2020
1 parent fcbe654 commit c1997e2
Show file tree
Hide file tree
Showing 18 changed files with 390 additions and 3 deletions.
9 changes: 9 additions & 0 deletions proto/topology.proto
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,14 @@ message GetClusterInfoResponse {
optional string clusterId = 2;
}

// Request message of the GetCopySetsInCluster rpc; it declares no fields.
message GetCopySetsInClusterRequest {
}

// Response message of the GetCopySetsInCluster rpc.
message GetCopySetsInClusterResponse {
// Result code of the call (mandatory).
required sint32 statusCode = 1;
// One CopysetInfo entry per copyset returned by the call.
repeated CopysetInfo copysetInfos = 2;
}

//TODO(hzsunjianliang): update userPolicy and so on
service TopologyService {
rpc RegistChunkServer(ChunkServerRegistRequest) returns (ChunkServerRegistResponse);
Expand Down Expand Up @@ -468,6 +476,7 @@ service TopologyService {

rpc GetChunkServerListInCopySets(GetChunkServerListInCopySetsRequest) returns (GetChunkServerListInCopySetsResponse);
rpc GetCopySetsInChunkServer(GetCopySetsInChunkServerRequest) returns (GetCopySetsInChunkServerResponse);
rpc GetCopySetsInCluster(GetCopySetsInClusterRequest) returns (GetCopySetsInClusterResponse);
rpc GetClusterInfo(GetClusterInfoRequest) returns (GetClusterInfoResponse);
}

29 changes: 29 additions & 0 deletions src/mds/topology/topology_service.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -775,6 +775,35 @@ void TopologyServiceImpl::GetCopySetsInChunkServer(
}
}

// RPC entry point for GetCopySetsInCluster: logs the incoming request,
// forwards it to topology_ and then logs the outcome. A non-success status
// code is logged at ERROR level together with the full response; a success
// is logged at INFO level with the number of copysets returned.
void TopologyServiceImpl::GetCopySetsInCluster(
    google::protobuf::RpcController* cntl_base,
    const GetCopySetsInClusterRequest* request,
    GetCopySetsInClusterResponse* response,
    google::protobuf::Closure* done) {
    // Guard that takes care of `done` when this function returns.
    brpc::ClosureGuard doneGuard(done);
    auto* cntl = static_cast<brpc::Controller*>(cntl_base);

    LOG(INFO) << "Received request[log_id=" << cntl->log_id()
              << "] from " << cntl->remote_side()
              << " to " << cntl->local_side()
              << ". [GetCopySetsInClusterRequest]";

    topology_->GetCopySetsInCluster(request, response);

    const bool succeeded = (kTopoErrCodeSuccess == response->statuscode());
    if (succeeded) {
        LOG(INFO) << "Send response[log_id=" << cntl->log_id()
                  << "] from " << cntl->local_side()
                  << " to " << cntl->remote_side()
                  << ". [GetCopySetsInClusterResponse] copyset num: "
                  << response->copysetinfos_size();
        return;
    }

    LOG(ERROR) << "Send response[log_id=" << cntl->log_id()
               << "] from " << cntl->local_side()
               << " to " << cntl->remote_side()
               << ". [GetCopySetsInClusterResponse] "
               << response->DebugString();
}

void TopologyServiceImpl::GetClusterInfo(
google::protobuf::RpcController* cntl_base,
const GetClusterInfoRequest* request,
Expand Down
6 changes: 6 additions & 0 deletions src/mds/topology/topology_service.h
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,12 @@ class TopologyServiceImpl : public TopologyService {
GetCopySetsInChunkServerResponse* response,
google::protobuf::Closure* done);

// RPC handler for TopologyService::GetCopySetsInCluster.
// cntl_base: rpc controller of the call
// request:   incoming request message
// response:  message to be filled in for the caller
// done:      closure supplied by the rpc framework for this call
virtual void GetCopySetsInCluster(
google::protobuf::RpcController* cntl_base,
const GetCopySetsInClusterRequest* request,
GetCopySetsInClusterResponse* response,
google::protobuf::Closure* done);

virtual void GetClusterInfo(
google::protobuf::RpcController* cntl_base,
const GetClusterInfoRequest* request,
Expand Down
13 changes: 13 additions & 0 deletions src/mds/topology/topology_service_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1373,6 +1373,19 @@ void TopologyServiceManager::GetCopySetsInChunkServer(
}
}

void TopologyServiceManager::GetCopySetsInCluster(
const GetCopySetsInClusterRequest* request,
GetCopySetsInClusterResponse* response) {
response->set_statuscode(kTopoErrCodeSuccess);
std::vector<CopySetKey> copysets =
topology_->GetCopySetsInCluster();
for (const CopySetKey& copyset : copysets) {
CopysetInfo *info = response->add_copysetinfos();
info->set_logicalpoolid(copyset.first);
info->set_copysetid(copyset.second);
}
}

void TopologyServiceManager::GetClusterInfo(
const GetClusterInfoRequest* request,
GetClusterInfoResponse* response) {
Expand Down
4 changes: 4 additions & 0 deletions src/mds/topology/topology_service_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,10 @@ class TopologyServiceManager {
const GetCopySetsInChunkServerRequest* request,
GetCopySetsInChunkServerResponse* response);

// Fills `response` with the copysets of the cluster.
// request:  incoming request message
// response: receives the status code and the list of copyset infos
virtual void GetCopySetsInCluster(
const GetCopySetsInClusterRequest* request,
GetCopySetsInClusterResponse* response);

virtual void GetClusterInfo(
const GetClusterInfoRequest* request,
GetClusterInfoResponse* response);
Expand Down
60 changes: 60 additions & 0 deletions src/tools/copyset_check_core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,12 @@ int CopysetCheckCore::CheckCopysetsInCluster() {
isHealthy = false;
}
}
// 检查从chunkserver上获取的copyset数量与mds记录的数量是否一致
res = CheckCopysetsWithMds();
if (res != 0) {
std::cout << "CheckCopysetNumWithMds fail!" << std::endl;
return -1;
}
// 如果不健康,直接返回,如果健康,还需要对operator作出判断
if (!isHealthy) {
return -1;
Expand All @@ -416,6 +422,60 @@ int CopysetCheckCore::CheckCopysetsInCluster() {
return 0;
}

int CopysetCheckCore::CheckCopysetsWithMds() {
std::vector<CopysetInfo> copysetsInMds;
int res = mdsClient_->GetCopySetsInCluster(&copysetsInMds);
if (res != 0) {
std::cout << "GetCopySetsInCluster fail!" << std::endl;
return -1;
}
CopysetStatistics statistics = GetCopysetStatistics();
if (copysetsInMds.size() != copysets_[kTotal].size()) {
std::cout << "Copyset numbers in chunkservers not consistent"
" with mds, please check! copysets on chunkserver: "
<< copysets_[kTotal].size() << ", copysets in mds: "
<< copysetsInMds.size() << std::endl;
return -1;
}
if (copysetsInMds.empty()) return true;
std::set<std::string> copysetsInMdsGid;
for (const auto& copyset : copysetsInMds) {
std::string gId = ToGroupId(copyset.logicalpoolid(),
copyset.copysetid());
copysetsInMdsGid.insert(gId);
}
int ret = 0;
std::vector<std::string> copysetsInMdsNotInCs(10);
auto iter = std::set_difference(copysetsInMdsGid.begin(),
copysetsInMdsGid.end(), copysets_[kTotal].begin(),
copysets_[kTotal].end(), copysetsInMdsNotInCs.begin());
copysetsInMdsNotInCs.resize(iter - copysetsInMdsNotInCs.begin());
if (!copysetsInMdsNotInCs.empty()) {
std::cout << "There are " << copysetsInMdsNotInCs.size()
<< " copysets on mds not found on chunkserver, defail:";
for (const auto& copyset : copysetsInMdsNotInCs) {
std::cout << " " << copyset;
}
std::cout << std::endl;
ret = -1;
}
std::vector<std::string> copysetsInCsNotInMds(10);
iter = std::set_difference(copysets_[kTotal].begin(),
copysets_[kTotal].end(), copysetsInMdsGid.begin(),
copysetsInMdsGid.end(), copysetsInCsNotInMds.begin());
copysetsInCsNotInMds.resize(iter - copysetsInCsNotInMds.begin());
if (!copysetsInCsNotInMds.empty()) {
std::cout << "There are " << copysetsInCsNotInMds.size()
<< " copysets on chunkserver not found on Mds, defail:";
for (const auto& copyset : copysetsInCsNotInMds) {
std::cout << " " << copyset;
}
std::cout << std::endl;
ret = -1;
}
return ret;
}

int CopysetCheckCore::CheckOperator(const std::string& opName,
uint64_t checkTimeSec) {
uint64_t startTime = curve::common::TimeUtility::GetTimeofDaySec();
Expand Down
2 changes: 2 additions & 0 deletions src/tools/copyset_check_core.h
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,8 @@ class CopysetCheckCore {
void UpdateChunkServerCopysets(const std::string& csAddr,
const CopySetInfosType& copysetInfos);

// Compare the copysets collected from chunkservers with the copysets
// recorded by mds. Callers treat any non-zero return value as a failure.
int CheckCopysetsWithMds();

private:
// 向mds发送RPC的client
std::shared_ptr<MDSClient> mdsClient_;
Expand Down
2 changes: 0 additions & 2 deletions src/tools/curve_tool_define.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,6 @@ const char kRemovePeerCmd[] = "remove-peer";
const char kTransferLeaderCmd[] = "transfer-leader";
const char kResetPeerCmd[] = "reset-peer";

// 快照检查命令
const char kSnapshotCheckCmd[] = "snapshot-check";
// 调度模块命令
const char kRapidLeaderSchedule[] = "rapid-leader-schedule";

Expand Down
5 changes: 4 additions & 1 deletion src/tools/curve_tool_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,10 @@ const char* kHelpStr = "Usage: curve_ops_tool [Command] [OPTIONS...]\n"
"etcd-status : show the etcd status\n"
"snapshot-clone-status : show the snapshot clone server status\n"
"copysets-status : check the health state of all copysets\n"
"chunkserver-list : show curve chunkserver-list, list all chunkserver infomation\n" //NOLINT
"chunkserver-list : show curve chunkserver-list, list all chunkserver information\n" //NOLINT
"server-list : list all server information\n"
"logical-pool-list : list all logical pool information\n"
"cluster-status : show cluster status\n"
"get : show the file info and the actual space of file\n"
"list : list the file info of files in the directory\n"
"seginfo : list the segments info of the file\n"
Expand Down
30 changes: 30 additions & 0 deletions src/tools/mds_client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -739,6 +739,36 @@ int MDSClient::GetCopySetsInChunkServer(
return -1;
}

int MDSClient::GetCopySetsInCluster(std::vector<CopysetInfo>* copysets) {
assert(copysets != nullptr);
curve::mds::topology::GetCopySetsInClusterRequest request;
curve::mds::topology::GetCopySetsInClusterResponse response;
curve::mds::topology::TopologyService_Stub stub(&channel_);

void (curve::mds::topology::TopologyService_Stub::*fp)(
google::protobuf::RpcController*,
const curve::mds::topology::GetCopySetsInClusterRequest*,
curve::mds::topology::GetCopySetsInClusterResponse*,
google::protobuf::Closure*);
fp = &curve::mds::topology::TopologyService_Stub::GetCopySetsInCluster;
if (SendRpcToMds(&request, &response, &stub, fp) != 0) {
std::cout << "GetCopySetsInCluster from all mds fail!"
<< std::endl;
return -1;
}

if (response.has_statuscode() &&
response.statuscode() == kTopoErrCodeSuccess) {
for (int i =0; i < response.copysetinfos_size(); ++i) {
copysets->emplace_back(response.copysetinfos(i));
}
return 0;
}
std::cout << "GetCopySetsInCluster fail with errCode: "
<< response.statuscode() << std::endl;
return -1;
}

int MDSClient::ListServersInCluster(std::vector<ServerInfo>* servers) {
assert(servers != nullptr);
// 先列出逻辑池
Expand Down
7 changes: 7 additions & 0 deletions src/tools/mds_client.h
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,13 @@ class MDSClient {
virtual int GetCopySetsInChunkServer(const std::string& csAddr,
std::vector<CopysetInfo>* copysets);

/**
 * @brief Get all copysets in the cluster
 * @param[out] copysets list of the copysets in the cluster
 * @return 0 on success, -1 on failure
 */
virtual int GetCopySetsInCluster(std::vector<CopysetInfo>* copysets);

/**
* @brief 列出集群中的所有server
* @param[out] servers server信息的列表,返回值为0时有效
Expand Down
4 changes: 4 additions & 0 deletions test/mds/topology/mock_topology.h
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,10 @@ class MockTopologyServiceManager : public TopologyServiceManager {
const GetCopySetsInChunkServerRequest *request,
GetCopySetsInChunkServerResponse *response));

// Mock of GetCopySetsInCluster(request, response) so tests can script the
// response written by the manager.
MOCK_METHOD2(GetCopySetsInCluster, void(
const GetCopySetsInClusterRequest *request,
GetCopySetsInClusterResponse *response));

MOCK_METHOD2(GetClusterInfo,
void(const GetClusterInfoRequest* request,
GetClusterInfoResponse* response));
Expand Down
56 changes: 56 additions & 0 deletions test/mds/topology/test_topology_service.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1458,6 +1458,62 @@ TEST_F(TestTopologyService, test_GetCopySetsInChunkServer_fail) {
ASSERT_EQ(kTopoErrCodeInvalidParam, response.statuscode());
}

// The mocked manager writes a success status into the response; the rpc
// client must observe that same status code.
TEST_F(TestTopologyService, test_GetCopySetsInCluster_success) {
    // Script the manager first: every call stores a success response.
    GetCopySetsInClusterResponse scripted;
    scripted.set_statuscode(kTopoErrCodeSuccess);
    EXPECT_CALL(*manager_, GetCopySetsInCluster(_, _))
        .WillRepeatedly(SetArgPointee<1>(scripted));

    brpc::Channel channel;
    const int initRet = channel.Init(listenAddr_, NULL);
    if (initRet != 0) {
        FAIL() << "Fail to init channel "
               << std::endl;
    }
    TopologyService_Stub stub(&channel);

    brpc::Controller cntl;
    GetCopySetsInClusterRequest request;
    GetCopySetsInClusterResponse response;
    stub.GetCopySetsInCluster(&cntl, &request, &response, nullptr);

    const bool rpcFailed = cntl.Failed();
    if (rpcFailed) {
        FAIL() << cntl.ErrorText() << std::endl;
    }

    ASSERT_EQ(kTopoErrCodeSuccess, response.statuscode());
}

// The mocked manager writes kTopoErrCodeInvalidParam into the response; the
// rpc itself must still succeed and deliver that status code to the client.
TEST_F(TestTopologyService, test_GetCopySetsInCluster_fail) {
    // Script the manager first: every call stores an invalid-param response.
    GetCopySetsInClusterResponse scripted;
    scripted.set_statuscode(kTopoErrCodeInvalidParam);
    EXPECT_CALL(*manager_, GetCopySetsInCluster(_, _))
        .WillRepeatedly(SetArgPointee<1>(scripted));

    brpc::Channel channel;
    const int initRet = channel.Init(listenAddr_, NULL);
    if (initRet != 0) {
        FAIL() << "Fail to init channel "
               << std::endl;
    }
    TopologyService_Stub stub(&channel);

    brpc::Controller cntl;
    GetCopySetsInClusterRequest request;
    GetCopySetsInClusterResponse response;
    stub.GetCopySetsInCluster(&cntl, &request, &response, nullptr);

    const bool rpcFailed = cntl.Failed();
    if (rpcFailed) {
        FAIL() << cntl.ErrorText() << std::endl;
    }

    ASSERT_EQ(kTopoErrCodeInvalidParam, response.statuscode());
}

} // namespace topology
} // namespace mds
} // namespace curve
Expand Down
35 changes: 35 additions & 0 deletions test/mds/topology/test_topology_service_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2783,6 +2783,41 @@ TEST_F(TestTopologyServiceManager,
ASSERT_EQ(0, response.copysetinfos_size());
}

// Registers two logical pools holding copysets 1..10 and 11..20, then
// expects GetCopySetsInCluster to report all 20 of them in that order with
// the matching logical pool id.
TEST_F(TestTopologyServiceManager, test_GetCopySetsInCluster) {
    PoolIdType logicalPoolId1 = 0x1;
    PoolIdType physicalPoolId1 = 0x11;
    PrepareAddPhysicalPool(physicalPoolId1);
    PrepareAddLogicalPool(logicalPoolId1, "logicalPool1", physicalPoolId1);

    PoolIdType logicalPoolId2 = 0x2;
    PoolIdType physicalPoolId2 = 0x12;
    PrepareAddPhysicalPool(physicalPoolId2);
    PrepareAddLogicalPool(logicalPoolId2, "logicalPool2", physicalPoolId2);

    // Copysets 1..10 go to the first pool, 11..20 to the second one.
    std::set<ChunkServerIdType> members = {1, 2, 3};
    for (int id = 1; id <= 20; ++id) {
        if (id <= 10) {
            PrepareAddCopySet(id, logicalPoolId1, members);
        } else {
            PrepareAddCopySet(id, logicalPoolId2, members);
        }
    }

    GetCopySetsInClusterRequest request;
    GetCopySetsInClusterResponse response;
    serviceManager_->GetCopySetsInCluster(&request, &response);

    ASSERT_EQ(kTopoErrCodeSuccess, response.statuscode());
    ASSERT_EQ(20, response.copysetinfos_size());
    for (int idx = 0; idx < 20; idx++) {
        const int expectedPoolId = (idx < 10) ? 1 : 2;
        ASSERT_EQ(expectedPoolId, response.copysetinfos(idx).logicalpoolid());
        ASSERT_EQ(idx + 1, response.copysetinfos(idx).copysetid());
    }
}


} // namespace topology
} // namespace mds
} // namespace curve
Expand Down
Loading

0 comments on commit c1997e2

Please sign in to comment.