-
Notifications
You must be signed in to change notification settings - Fork 62
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[ML] Add new categorization stats to model_size_stats #989
Changes from 1 commit
4bde366
e55b927
9c2ede9
7047f85
7021ae1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -99,11 +99,35 @@ class MODEL_EXPORT CTokenListCategory { | |
//! if (category.missingCommonTokenWeight(uniqueTokenIds) == 0) | ||
//! instead of calling this method. However, this method is much faster | ||
//! as it can return false as soon as a mismatch occurs. | ||
bool isMissingCommonTokenWeightZero(const TSizeSizeMap& uniqueTokenIds) const; | ||
template<typename PAIR_CONTAINER> | ||
bool isMissingCommonTokenWeightZero(const PAIR_CONTAINER& uniqueTokenIds) const { | ||
|
||
auto commonIter = m_CommonUniqueTokenIds.begin(); | ||
auto testIter = uniqueTokenIds.begin(); | ||
while (commonIter != m_CommonUniqueTokenIds.end() && | ||
testIter != uniqueTokenIds.end()) { | ||
if (commonIter->first < testIter->first) { | ||
return false; | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It strikes me that this becomes somewhat cleaner using
I guess this is just moving well tested code to a different place, so feel free to leave as if you'd rather not change. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I more-or-less did this. The second part of the condition has to be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok, great. It's a shame comparison doesn't work with const. That feels like an error in the std::pair implementation. |
||
|
||
if (commonIter->first == testIter->first) { | ||
// The tokens must appear the same number of times in the two | ||
// strings | ||
if (commonIter->second != testIter->second) { | ||
return false; | ||
} | ||
++commonIter; | ||
} | ||
|
||
++testIter; | ||
} | ||
|
||
return commonIter == m_CommonUniqueTokenIds.end(); | ||
} | ||
|
||
//! Does the supplied token vector contain all our common tokens in the | ||
//! same order as our base token vector? | ||
bool containsCommonTokensInOrder(const TSizeSizePrVec& tokenIds) const; | ||
bool containsCommonInOrderTokensInOrder(const TSizeSizePrVec& tokenIds) const; | ||
|
||
//! \return Does the supplied token ID represent a common unique token? | ||
bool isTokenCommon(std::size_t tokenId) const; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -37,8 +37,13 @@ BOOST_AUTO_TEST_CASE(testWrite) { | |
model_t::E_MemoryStatusOk, // memory status | ||
core_t::TTime(1521046309), // bucket start time | ||
0, // model bytes exceeded | ||
50000 // model bytes memory limit | ||
}; | ||
50000, // model bytes memory limit | ||
1000, // categorized messages | ||
100, // total categories | ||
7, // frequent categories | ||
13, // rare categories | ||
2, // dead categories | ||
model_t::E_CategorizationStatusPoor}; | ||
|
||
CModelSnapshotJsonWriter::SModelSnapshotReport report{ | ||
"6.3.0", | ||
|
@@ -110,14 +115,27 @@ BOOST_AUTO_TEST_CASE(testWrite) { | |
BOOST_TEST_REQUIRE(modelSizeStats.HasMember("memory_status")); | ||
BOOST_REQUIRE_EQUAL(std::string("ok"), | ||
std::string(modelSizeStats["memory_status"].GetString())); | ||
BOOST_TEST_REQUIRE(modelSizeStats.HasMember("timestamp")); | ||
BOOST_REQUIRE_EQUAL(int64_t(1521046309000), modelSizeStats["timestamp"].GetInt64()); | ||
BOOST_TEST_REQUIRE(modelSizeStats.HasMember("log_time")); | ||
BOOST_TEST_REQUIRE(modelSizeStats.HasMember("model_bytes_exceeded")); | ||
BOOST_REQUIRE_EQUAL(int64_t(0), modelSizeStats["model_bytes_exceeded"].GetInt64()); | ||
BOOST_TEST_REQUIRE(modelSizeStats.HasMember("model_bytes_memory_limit")); | ||
BOOST_REQUIRE_EQUAL(int64_t(50000), | ||
modelSizeStats["model_bytes_memory_limit"].GetInt64()); | ||
BOOST_TEST_REQUIRE(modelSizeStats.HasMember("categorized_doc_count")); | ||
BOOST_REQUIRE_EQUAL(int64_t(1000), modelSizeStats["categorized_doc_count"].GetInt64()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: maybe include There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This file was also dodgy before my changes because many expected results should have been unsigned rather than signed. I copied that bug into my new lines, but I'll correct that throughout the file at the same time as adding |
||
BOOST_TEST_REQUIRE(modelSizeStats.HasMember("total_category_count")); | ||
BOOST_REQUIRE_EQUAL(int64_t(100), modelSizeStats["total_category_count"].GetInt64()); | ||
BOOST_TEST_REQUIRE(modelSizeStats.HasMember("frequent_category_count")); | ||
BOOST_REQUIRE_EQUAL(int64_t(7), modelSizeStats["frequent_category_count"].GetInt64()); | ||
BOOST_TEST_REQUIRE(modelSizeStats.HasMember("rare_category_count")); | ||
BOOST_REQUIRE_EQUAL(int64_t(13), modelSizeStats["rare_category_count"].GetInt64()); | ||
BOOST_TEST_REQUIRE(modelSizeStats.HasMember("dead_category_count")); | ||
BOOST_REQUIRE_EQUAL(int64_t(2), modelSizeStats["dead_category_count"].GetInt64()); | ||
BOOST_TEST_REQUIRE(modelSizeStats.HasMember("memory_status")); | ||
BOOST_REQUIRE_EQUAL(std::string("poor"), | ||
std::string(modelSizeStats["categorization_status"].GetString())); | ||
BOOST_TEST_REQUIRE(modelSizeStats.HasMember("timestamp")); | ||
BOOST_REQUIRE_EQUAL(int64_t(1521046309000), modelSizeStats["timestamp"].GetInt64()); | ||
BOOST_TEST_REQUIRE(modelSizeStats.HasMember("log_time")); | ||
|
||
BOOST_TEST_REQUIRE(snapshot.HasMember("quantiles")); | ||
const rapidjson::Value& quantiles = snapshot["quantiles"]; | ||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -272,10 +272,30 @@ CResourceMonitor::SResults CResourceMonitor::createMemoryUsageReport(core_t::TTi | |||||
res.s_AllocationFailures = 0; | ||||||
res.s_MemoryStatus = m_MemoryStatus; | ||||||
res.s_BucketStartTime = bucketStartTime; | ||||||
res.s_CategorizedMessages = 0; | ||||||
res.s_TotalCategories = 0; | ||||||
res.s_FrequentCategories = 0; | ||||||
res.s_RareCategories = 0; | ||||||
res.s_DeadCategories = 0; | ||||||
res.s_CategorizationStatus = model_t::E_CategorizationStatusOk; | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd make all these defaults in the class definition. |
||||||
for (const auto& resource : m_Resources) { | ||||||
resource.first->updateMemoryResults(res); | ||||||
} | ||||||
res.s_AllocationFailures += m_AllocationFailures.size(); | ||||||
// Categorization status is poor if: | ||||||
// - At least 100 messages have been categorized | ||||||
// and one of the following holds: | ||||||
// - There is only 1 category | ||||||
// - More than 90% of categories have 1 message | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
// - The number of categories is greater than 50% of the number of categorized messages | ||||||
// - There are no frequent match categories | ||||||
// - More than 50% of categories are dead | ||||||
if (res.s_CategorizedMessages > 100 && | ||||||
(res.s_TotalCategories == 1 || 10 * res.s_RareCategories > 9 * res.s_TotalCategories || | ||||||
2 * res.s_TotalCategories > res.s_CategorizedMessages || | ||||||
res.s_FrequentCategories == 0 || 2 * res.s_DeadCategories > res.s_TotalCategories)) { | ||||||
res.s_CategorizationStatus = model_t::E_CategorizationStatusPoor; | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This feels like an odd place for this logic: could it perhaps be better encapsulated. Maybe a static member on some class related to categorisation. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Come to think of it I can just put this in (Plus of course while we're not doing per-partition categorization the cost concerns are all theoretical anyway as we only have one categorizer object.) |
||||||
} | ||||||
return res; | ||||||
} | ||||||
|
||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I know you're only extending, but should this provide defaults? It seems like this object is a recipe for uninitialised variables as it stands.