From 4bde3667c6fe2e19b6a5ba9cced1f696efcac761 Mon Sep 17 00:00:00 2001 From: David Roberts Date: Tue, 4 Feb 2020 17:31:55 +0000 Subject: [PATCH 1/5] [ML] Add new categorization stats to model_size_stats This change adds support for the following new model_size_stats fields: - categorized_doc_count - total_category_count - frequent_category_count - rare_category_count - dead_category_count - categorization_status Relates #50749 --- include/model/CResourceMonitor.h | 6 ++ include/model/CTokenListCategory.h | 28 +++++- include/model/CTokenListDataCategorizer.h | 6 -- include/model/CTokenListDataCategorizerBase.h | 18 ++-- include/model/ModelTypes.h | 12 +++ lib/api/CModelSizeStatsJsonWriter.cc | 48 ++++++++--- .../unittest/CModelSnapshotJsonWriterTest.cc | 28 ++++-- lib/model/CResourceMonitor.cc | 20 +++++ lib/model/CTokenListCategory.cc | 43 ++-------- lib/model/CTokenListDataCategorizerBase.cc | 86 +++++++++++++------ lib/model/ModelTypes.cc | 10 +++ 11 files changed, 208 insertions(+), 97 deletions(-) diff --git a/include/model/CResourceMonitor.h b/include/model/CResourceMonitor.h index f5fd05332f..f23cbe858c 100644 --- a/include/model/CResourceMonitor.h +++ b/include/model/CResourceMonitor.h @@ -49,6 +49,12 @@ class MODEL_EXPORT CResourceMonitor { core_t::TTime s_BucketStartTime; std::size_t s_BytesExceeded; std::size_t s_BytesMemoryLimit; + std::size_t s_CategorizedMessages; + std::size_t s_TotalCategories; + std::size_t s_FrequentCategories; + std::size_t s_RareCategories; + std::size_t s_DeadCategories; + model_t::ECategorizationStatus s_CategorizationStatus; }; public: diff --git a/include/model/CTokenListCategory.h b/include/model/CTokenListCategory.h index 7470004daf..999ad5def6 100644 --- a/include/model/CTokenListCategory.h +++ b/include/model/CTokenListCategory.h @@ -99,11 +99,35 @@ class MODEL_EXPORT CTokenListCategory { //! if (category.missingCommonTokenWeight(uniqueTokenIds) == 0) //! instead of calling this method. 
However, this method is much faster //! as it can return false as soon as a mismatch occurs. - bool isMissingCommonTokenWeightZero(const TSizeSizeMap& uniqueTokenIds) const; + template<typename PAIR_CONTAINER> + bool isMissingCommonTokenWeightZero(const PAIR_CONTAINER& uniqueTokenIds) const { + + auto commonIter = m_CommonUniqueTokenIds.begin(); + auto testIter = uniqueTokenIds.begin(); + while (commonIter != m_CommonUniqueTokenIds.end() && + testIter != uniqueTokenIds.end()) { + if (commonIter->first < testIter->first) { + return false; + } + + if (commonIter->first == testIter->first) { + // The tokens must appear the same number of times in the two + // strings + if (commonIter->second != testIter->second) { + return false; + } + ++commonIter; + } + + ++testIter; + } + + return commonIter == m_CommonUniqueTokenIds.end(); + } //! Does the supplied token vector contain all our common tokens in the //! same order as our base token vector? - bool containsCommonTokensInOrder(const TSizeSizePrVec& tokenIds) const; + bool containsCommonInOrderTokensInOrder(const TSizeSizePrVec& tokenIds) const; //! \return Does the supplied token ID represent a common unique token? bool isTokenCommon(std::size_t tokenId) const; diff --git a/include/model/CTokenListDataCategorizer.h b/include/model/CTokenListDataCategorizer.h index d50fc22262..66757544a0 100644 --- a/include/model/CTokenListDataCategorizer.h +++ b/include/model/CTokenListDataCategorizer.h @@ -81,12 +81,6 @@ class CTokenListDataCategorizer : public CTokenListDataCategorizerBase { //! Get the static size of this object - used for virtual hierarchies std::size_t staticSize() const override { return sizeof(*this); } - //! Currently the overall model memory stats do not contain any categorizer - //! stats fields. - void updateMemoryResults(CResourceMonitor::SResults& /*results*/) const override { - // NO-OP - } - protected: //! Split the string into a list of tokens. The result of the //! 
tokenisation is returned in \p tokenIds, \p tokenUniqueIds and diff --git a/include/model/CTokenListDataCategorizerBase.h b/include/model/CTokenListDataCategorizerBase.h index bdb4ca773a..b4c655b994 100644 --- a/include/model/CTokenListDataCategorizerBase.h +++ b/include/model/CTokenListDataCategorizerBase.h @@ -14,7 +14,6 @@ #include #include -#include #include #include #include @@ -75,8 +74,9 @@ class MODEL_EXPORT CTokenListDataCategorizerBase : public CDataCategorizer { //! second -> weighting using TSizeSizePr = std::pair; - //! Used for storing token ID sequences + //! Used for storing token ID sequences and categories with counts using TSizeSizePrVec = std::vector; + using TSizeSizePrVecItr = TSizeSizePrVec::iterator; //! Used for storing distinct token IDs using TSizeSizeMap = std::map; @@ -157,6 +157,10 @@ class MODEL_EXPORT CTokenListDataCategorizerBase : public CDataCategorizer { //! Get the memory used by this categorizer. std::size_t memoryUsage() const override; + //! Currently the overall model memory stats do not contain any categorizer + //! stats fields. + void updateMemoryResults(CResourceMonitor::SResults& results) const override; + protected: //! Split the string into a list of tokens. The result of the //! tokenisation is returned in \p tokenIds, \p tokenUniqueIds and @@ -180,19 +184,13 @@ class MODEL_EXPORT CTokenListDataCategorizerBase : public CDataCategorizer { const TSizeSizePrVec& right, std::size_t rightWeight) const = 0; - //! Used to hold statistics about the categories we compute: - //! first -> count of matches - //! second -> category vector index - using TSizeSizePrList = std::list; - using TSizeSizePrListItr = TSizeSizePrList::iterator; - //! Add a match to an existing category void addCategoryMatch(bool isDryRun, const std::string& str, std::size_t rawStringLen, const TSizeSizePrVec& tokenIds, const TSizeSizeMap& tokenUniqueIds, - TSizeSizePrListItr& iter); + TSizeSizePrVecItr& iter); //! 
Given the total token weight in a vector and a threshold, what is //! the minimum possible token weight in a different vector that could @@ -304,7 +302,7 @@ class MODEL_EXPORT CTokenListDataCategorizerBase : public CDataCategorizer { //! List of match count/index into category vector in descending order of //! match count - TSizeSizePrList m_CategoriesByCount; + TSizeSizePrVec m_CategoriesByCount; //! Used for looking up tokens to a unique ID TTokenMIndex m_TokenIdLookup; diff --git a/include/model/ModelTypes.h b/include/model/ModelTypes.h index 6232c507ad..6945a83980 100644 --- a/include/model/ModelTypes.h +++ b/include/model/ModelTypes.h @@ -802,6 +802,18 @@ enum EMemoryStatus { MODEL_EXPORT std::string print(EMemoryStatus memoryStatus); +//! An enumeration of the TokenListDataCategorizer status - +//! Start in the OK state. Moves into the poor state if too +//! few categories are being seen frequently. +enum ECategorizationStatus { + E_CategorizationStatusOk = 0, //!< Categorization working as intended + E_CategorizationStatusPoor = 1 //!< Too many categories being created +}; + +//! Get a string description of \p categorizationStatus. +MODEL_EXPORT +std::string print(ECategorizationStatus categorizationStatus); + //! Styles of probability aggregation available: //! -# AggregatePeople: the style used to aggregate results for distinct //! values of the over and partition field. 
diff --git a/lib/api/CModelSizeStatsJsonWriter.cc b/lib/api/CModelSizeStatsJsonWriter.cc index 0959812912..e91668382e 100644 --- a/lib/api/CModelSizeStatsJsonWriter.cc +++ b/lib/api/CModelSizeStatsJsonWriter.cc @@ -13,18 +13,24 @@ namespace api { namespace { // JSON field names -const std::string JOB_ID("job_id"); -const std::string MODEL_SIZE_STATS("model_size_stats"); -const std::string MODEL_BYTES("model_bytes"); -const std::string MODEL_BYTES_EXCEEDED("model_bytes_exceeded"); -const std::string MODEL_BYTES_MEMORY_LIMIT("model_bytes_memory_limit"); -const std::string TOTAL_BY_FIELD_COUNT("total_by_field_count"); -const std::string TOTAL_OVER_FIELD_COUNT("total_over_field_count"); -const std::string TOTAL_PARTITION_FIELD_COUNT("total_partition_field_count"); -const std::string BUCKET_ALLOCATION_FAILURES_COUNT("bucket_allocation_failures_count"); -const std::string MEMORY_STATUS("memory_status"); -const std::string TIMESTAMP("timestamp"); -const std::string LOG_TIME("log_time"); +const std::string JOB_ID{"job_id"}; +const std::string MODEL_SIZE_STATS{"model_size_stats"}; +const std::string MODEL_BYTES{"model_bytes"}; +const std::string MODEL_BYTES_EXCEEDED{"model_bytes_exceeded"}; +const std::string MODEL_BYTES_MEMORY_LIMIT{"model_bytes_memory_limit"}; +const std::string TOTAL_BY_FIELD_COUNT{"total_by_field_count"}; +const std::string TOTAL_OVER_FIELD_COUNT{"total_over_field_count"}; +const std::string TOTAL_PARTITION_FIELD_COUNT{"total_partition_field_count"}; +const std::string BUCKET_ALLOCATION_FAILURES_COUNT{"bucket_allocation_failures_count"}; +const std::string MEMORY_STATUS{"memory_status"}; +const std::string CATEGORIZED_DOC_COUNT{"categorized_doc_count"}; +const std::string TOTAL_CATEGORY_COUNT{"total_category_count"}; +const std::string FREQUENT_CATEGORY_COUNT{"frequent_category_count"}; +const std::string RARE_CATEGORY_COUNT{"rare_category_count"}; +const std::string DEAD_CATEGORY_COUNT{"dead_category_count"}; +const std::string 
CATEGORIZATION_STATUS{"categorization_status"}; +const std::string TIMESTAMP{"timestamp"}; +const std::string LOG_TIME{"log_time"}; } void CModelSizeStatsJsonWriter::write(const std::string& jobId, @@ -60,6 +66,24 @@ void CModelSizeStatsJsonWriter::write(const std::string& jobId, writer.String(MEMORY_STATUS); writer.String(print(results.s_MemoryStatus)); + writer.String(CATEGORIZED_DOC_COUNT); + writer.Uint64(results.s_CategorizedMessages); + + writer.String(TOTAL_CATEGORY_COUNT); + writer.Uint64(results.s_TotalCategories); + + writer.String(FREQUENT_CATEGORY_COUNT); + writer.Uint64(results.s_FrequentCategories); + + writer.String(RARE_CATEGORY_COUNT); + writer.Uint64(results.s_RareCategories); + + writer.String(DEAD_CATEGORY_COUNT); + writer.Uint64(results.s_DeadCategories); + + writer.String(CATEGORIZATION_STATUS); + writer.String(print(results.s_CategorizationStatus)); + writer.String(TIMESTAMP); writer.Time(results.s_BucketStartTime); diff --git a/lib/api/unittest/CModelSnapshotJsonWriterTest.cc b/lib/api/unittest/CModelSnapshotJsonWriterTest.cc index ac352b98e1..bd325ca0c9 100644 --- a/lib/api/unittest/CModelSnapshotJsonWriterTest.cc +++ b/lib/api/unittest/CModelSnapshotJsonWriterTest.cc @@ -37,8 +37,13 @@ BOOST_AUTO_TEST_CASE(testWrite) { model_t::E_MemoryStatusOk, // memory status core_t::TTime(1521046309), // bucket start time 0, // model bytes exceeded - 50000 // model bytes memory limit - }; + 50000, // model bytes memory limit + 1000, // categorized messages + 100, // total categories + 7, // frequent categories + 13, // rare categories + 2, // dead categories + model_t::E_CategorizationStatusPoor}; CModelSnapshotJsonWriter::SModelSnapshotReport report{ "6.3.0", @@ -110,14 +115,27 @@ BOOST_AUTO_TEST_CASE(testWrite) { BOOST_TEST_REQUIRE(modelSizeStats.HasMember("memory_status")); BOOST_REQUIRE_EQUAL(std::string("ok"), std::string(modelSizeStats["memory_status"].GetString())); - BOOST_TEST_REQUIRE(modelSizeStats.HasMember("timestamp")); - 
BOOST_REQUIRE_EQUAL(int64_t(1521046309000), modelSizeStats["timestamp"].GetInt64()); - BOOST_TEST_REQUIRE(modelSizeStats.HasMember("log_time")); BOOST_TEST_REQUIRE(modelSizeStats.HasMember("model_bytes_exceeded")); BOOST_REQUIRE_EQUAL(int64_t(0), modelSizeStats["model_bytes_exceeded"].GetInt64()); BOOST_TEST_REQUIRE(modelSizeStats.HasMember("model_bytes_memory_limit")); BOOST_REQUIRE_EQUAL(int64_t(50000), modelSizeStats["model_bytes_memory_limit"].GetInt64()); + BOOST_TEST_REQUIRE(modelSizeStats.HasMember("categorized_doc_count")); + BOOST_REQUIRE_EQUAL(int64_t(1000), modelSizeStats["categorized_doc_count"].GetInt64()); + BOOST_TEST_REQUIRE(modelSizeStats.HasMember("total_category_count")); + BOOST_REQUIRE_EQUAL(int64_t(100), modelSizeStats["total_category_count"].GetInt64()); + BOOST_TEST_REQUIRE(modelSizeStats.HasMember("frequent_category_count")); + BOOST_REQUIRE_EQUAL(int64_t(7), modelSizeStats["frequent_category_count"].GetInt64()); + BOOST_TEST_REQUIRE(modelSizeStats.HasMember("rare_category_count")); + BOOST_REQUIRE_EQUAL(int64_t(13), modelSizeStats["rare_category_count"].GetInt64()); + BOOST_TEST_REQUIRE(modelSizeStats.HasMember("dead_category_count")); + BOOST_REQUIRE_EQUAL(int64_t(2), modelSizeStats["dead_category_count"].GetInt64()); + BOOST_TEST_REQUIRE(modelSizeStats.HasMember("categorization_status")); + BOOST_REQUIRE_EQUAL(std::string("poor"), + std::string(modelSizeStats["categorization_status"].GetString())); + BOOST_TEST_REQUIRE(modelSizeStats.HasMember("timestamp")); + BOOST_REQUIRE_EQUAL(int64_t(1521046309000), modelSizeStats["timestamp"].GetInt64()); + BOOST_TEST_REQUIRE(modelSizeStats.HasMember("log_time")); BOOST_TEST_REQUIRE(snapshot.HasMember("quantiles")); const rapidjson::Value& quantiles = snapshot["quantiles"]; diff --git a/lib/model/CResourceMonitor.cc b/lib/model/CResourceMonitor.cc index 04a50211e6..b75f54c41e 100644 --- a/lib/model/CResourceMonitor.cc +++ b/lib/model/CResourceMonitor.cc @@ -272,10 +272,30 @@ CResourceMonitor::SResults 
CResourceMonitor::createMemoryUsageReport(core_t::TTi res.s_AllocationFailures = 0; res.s_MemoryStatus = m_MemoryStatus; res.s_BucketStartTime = bucketStartTime; + res.s_CategorizedMessages = 0; + res.s_TotalCategories = 0; + res.s_FrequentCategories = 0; + res.s_RareCategories = 0; + res.s_DeadCategories = 0; + res.s_CategorizationStatus = model_t::E_CategorizationStatusOk; for (const auto& resource : m_Resources) { resource.first->updateMemoryResults(res); } res.s_AllocationFailures += m_AllocationFailures.size(); + // Categorization status is poor if: + // - More than 100 messages have been categorized + // and one of the following holds: + // - There is only 1 category + // - More than 90% of categories have 1 message + // - The number of categories is greater than 50% of the number of categorized messages + // - There are no frequent match categories + // - More than 50% of categories are dead + if (res.s_CategorizedMessages > 100 && + (res.s_TotalCategories == 1 || 10 * res.s_RareCategories > 9 * res.s_TotalCategories || + 2 * res.s_TotalCategories > res.s_CategorizedMessages || + res.s_FrequentCategories == 0 || 2 * res.s_DeadCategories > res.s_TotalCategories)) { + res.s_CategorizationStatus = model_t::E_CategorizationStatusPoor; + } return res; } diff --git a/lib/model/CTokenListCategory.cc b/lib/model/CTokenListCategory.cc index 9ad3d91ced..d486a50ddb 100644 --- a/lib/model/CTokenListCategory.cc +++ b/lib/model/CTokenListCategory.cc @@ -411,52 +411,27 @@ std::size_t CTokenListCategory::missingCommonTokenWeight(const TSizeSizeMap& uni return m_CommonUniqueTokenWeight - presentWeight; } -bool CTokenListCategory::isMissingCommonTokenWeightZero(const TSizeSizeMap& uniqueTokenIds) const { - // This method could be implemented as: - // return this->missingCommonTokenWeight(uniqueTokenIds) == 0; - // - // However, it's much faster to return false as soon as a mismatch occurs +bool CTokenListCategory::containsCommonInOrderTokensInOrder(const TSizeSizePrVec& 
tokenIds) const { - auto commonIter = m_CommonUniqueTokenIds.begin(); - auto testIter = uniqueTokenIds.begin(); - while (commonIter != m_CommonUniqueTokenIds.end() && - testIter != uniqueTokenIds.end()) { - if (commonIter->first < testIter->first) { - return false; - } - - if (commonIter->first == testIter->first) { - // The tokens must appear the same number of times in the two - // strings - if (commonIter->second != testIter->second) { - return false; - } - ++commonIter; - } - - ++testIter; - } - - return commonIter == m_CommonUniqueTokenIds.end(); -} - -bool CTokenListCategory::containsCommonTokensInOrder(const TSizeSizePrVec& tokenIds) const { auto testIter = tokenIds.begin(); - for (auto baseTokenId : m_BaseTokenIds) { + for (std::size_t index = m_OrderedCommonTokenBeginIndex; + index < m_OrderedCommonTokenEndIndex; ++index) { + std::size_t baseTokenId{m_BaseTokenIds[index].first}; + // Ignore tokens that are not in the common unique tokens - if (this->isTokenCommon(baseTokenId.first) == false) { + if (this->isTokenCommon(baseTokenId) == false) { continue; } // Skip tokens in the test tokens until we find one that matches the // base token. If we reach the end of the test tokens whilst doing - // this, it means the test tokens don't contain the base tokens in the - // correct order. + // this, it means the test tokens don't contain the common ordered base + // tokens in the correct order. 
do { if (testIter == tokenIds.end()) { return false; } - } while ((testIter++)->first != baseTokenId.first); + } while ((testIter++)->first != baseTokenId); } return true; diff --git a/lib/model/CTokenListDataCategorizerBase.cc b/lib/model/CTokenListDataCategorizerBase.cc index a873282676..f2880b2489 100644 --- a/lib/model/CTokenListDataCategorizerBase.cc +++ b/lib/model/CTokenListDataCategorizerBase.cc @@ -11,6 +11,8 @@ #include #include +#include + #include #include @@ -77,10 +79,9 @@ int CTokenListDataCategorizerBase::computeCategory(bool isDryRun, // We search previous categories in descending order of the number of matches // we've seen for them - TSizeSizePrListItr bestSoFarIter(m_CategoriesByCount.end()); + auto bestSoFarIter = m_CategoriesByCount.end(); double bestSoFarSimilarity(m_LowerThreshold); - for (TSizeSizePrListItr iter = m_CategoriesByCount.begin(); - iter != m_CategoriesByCount.end(); ++iter) { + for (auto iter = m_CategoriesByCount.begin(); iter != m_CategoriesByCount.end(); ++iter) { const CTokenListCategory& compCategory = m_Categories[iter->second]; const TSizeSizePrVec& baseTokenIds = compCategory.baseTokenIds(); std::size_t baseWeight(compCategory.baseWeight()); @@ -90,10 +91,11 @@ int CTokenListDataCategorizerBase::computeCategory(bool isDryRun, // further checks. The first condition here ensures that we never say // a string with tokens matches the reverse search of a string with no // tokens (which the other criteria alone might say matched). 
- bool matchesSearch((baseWeight == 0) == (workWeight == 0) && - compCategory.maxMatchingStringLen() >= rawStringLen && - compCategory.isMissingCommonTokenWeightZero(m_WorkTokenUniqueIds) && - compCategory.containsCommonTokensInOrder(m_WorkTokenIds)); + bool matchesSearch( + (baseWeight == 0) == (workWeight == 0) && + compCategory.maxMatchingStringLen() >= rawStringLen && + compCategory.isMissingCommonTokenWeightZero(m_WorkTokenUniqueIds) && + compCategory.containsCommonInOrderTokensInOrder(m_WorkTokenIds)); if (!matchesSearch) { // Quickly rule out wildly different token weights prior to doing // the expensive similarity calculations @@ -149,17 +151,16 @@ int CTokenListDataCategorizerBase::computeCategory(bool isDryRun, if (bestSoFarIter != m_CategoriesByCount.end()) { // Return the best match - use vector index plus one as ML category - int categoryId(1 + int(bestSoFarIter->second)); + int categoryId{1 + static_cast<int>(bestSoFarIter->second)}; this->addCategoryMatch(isDryRun, str, rawStringLen, m_WorkTokenIds, m_WorkTokenUniqueIds, bestSoFarIter); return categoryId; } // If we get here we haven't matched, so create a new category - CTokenListCategory obj{isDryRun, str, rawStringLen, - m_WorkTokenIds, workWeight, m_WorkTokenUniqueIds}; - m_CategoriesByCount.push_back(TSizeSizePr(1, m_Categories.size())); - m_Categories.push_back(obj); + m_CategoriesByCount.emplace_back(1, m_Categories.size()); + m_Categories.emplace_back(isDryRun, str, rawStringLen, m_WorkTokenIds, + workWeight, m_WorkTokenUniqueIds); m_HasChanged = true; // Increment the counts of categories that use a given token @@ -319,19 +320,6 @@ bool CTokenListDataCategorizerBase::createReverseSearch(int categoryId, return true; } -namespace { - -class CPairFirstElementGreater { -public: - //! This operator is designed for pairs that are small enough for - //! 
passing by value to be most efficient - template - bool operator()(const PAIR pr1, const PAIR pr2) { - return pr1.first > pr2.first; - } -}; -} - bool CTokenListDataCategorizerBase::hasChanged() const { return m_HasChanged; } @@ -374,7 +362,8 @@ bool CTokenListDataCategorizerBase::acceptRestoreTraverser(core::CStateRestoreTr // Categories are persisted in order of creation, but this list needs to be // sorted by count instead - m_CategoriesByCount.sort(CPairFirstElementGreater()); + std::sort(m_CategoriesByCount.begin(), m_CategoriesByCount.end(), + maths::COrderings::SFirstGreater()); return true; } @@ -420,7 +409,7 @@ void CTokenListDataCategorizerBase::addCategoryMatch(bool isDryRun, std::size_t rawStringLen, const TSizeSizePrVec& tokenIds, const TSizeSizeMap& tokenUniqueIds, - TSizeSizePrListItr& iter) { + TSizeSizePrVecItr& iter) { if (m_Categories[iter->second].addString(isDryRun, str, rawStringLen, tokenIds, tokenUniqueIds) == true) { m_HasChanged = true; @@ -489,7 +478,7 @@ std::size_t CTokenListDataCategorizerBase::idForToken(const std::string& token) return iter->index(); } - std::size_t nextIndex(m_TokenIdLookup.size()); + std::size_t nextIndex{m_TokenIdLookup.size()}; m_TokenIdLookup.push_back(CTokenInfoItem(token, nextIndex)); return nextIndex; } @@ -539,6 +528,47 @@ std::size_t CTokenListDataCategorizerBase::memoryUsage() const { return mem; } +void CTokenListDataCategorizerBase::updateMemoryResults(CResourceMonitor::SResults& results) const { + + results.s_TotalCategories = m_Categories.size(); + + std::size_t categorizedMessagesThisCategorizer{0}; + for (auto categoryByCount : m_CategoriesByCount) { + categorizedMessagesThisCategorizer += categoryByCount.first; + } + results.s_CategorizedMessages += categorizedMessagesThisCategorizer; + + for (std::size_t i = 0; i < m_CategoriesByCount.size(); ++i) { + const CTokenListCategory& category{m_Categories[m_CategoriesByCount[i].second]}; + // Definitions for frequent/rare categories are: + // - rare = 
single match + // - frequent = matches more than 1% of messages + if (category.numMatches() == 1) { + ++results.s_RareCategories; + } else if (category.numMatches() * 100 > categorizedMessagesThisCategorizer) { + ++results.s_FrequentCategories; + } + for (std::size_t j = 0; j < i; ++j) { + const CTokenListCategory& moreFrequentCategory{ + m_Categories[m_CategoriesByCount[j].second]}; + bool matchesSearch(moreFrequentCategory.maxMatchingStringLen() >= + category.maxMatchingStringLen() && + moreFrequentCategory.isMissingCommonTokenWeightZero( + category.commonUniqueTokenIds()) && + moreFrequentCategory.containsCommonInOrderTokensInOrder( + category.baseTokenIds())); + if (matchesSearch) { + ++results.s_DeadCategories; + LOG_DEBUG(<< "Category " << (m_CategoriesByCount[i].second + 1) + << " (" << category.baseString() << ") is killed by category " + << (m_CategoriesByCount[j].second + 1) << " (" + << moreFrequentCategory.baseString() << ")"); + break; + } + } + } +} + CTokenListDataCategorizerBase::CTokenInfoItem::CTokenInfoItem(const std::string& str, std::size_t index) : m_Str(str), m_Index(index), m_CategoryCount(0) { diff --git a/lib/model/ModelTypes.cc b/lib/model/ModelTypes.cc index 8df369b3cd..05889dc1a0 100644 --- a/lib/model/ModelTypes.cc +++ b/lib/model/ModelTypes.cc @@ -2045,5 +2045,15 @@ std::string print(EMemoryStatus memoryStatus) { } return "-"; } + +std::string print(ECategorizationStatus categorizationStatus) { + switch (categorizationStatus) { + case E_CategorizationStatusOk: + return "ok"; + case E_CategorizationStatusPoor: + return "poor"; + } + return "-"; +} } } From e55b927707844ecf18536817ca246212f23278c1 Mon Sep 17 00:00:00 2001 From: David Roberts Date: Thu, 6 Feb 2020 12:53:29 +0000 Subject: [PATCH 2/5] Address review comments --- include/api/CAnomalyJob.h | 6 +- include/api/CJsonOutputWriter.h | 2 +- include/api/CModelSizeStatsJsonWriter.h | 2 +- include/api/CModelSnapshotJsonWriter.h | 2 +- include/model/CAnomalyDetector.h | 6 +- 
include/model/CMonitoredResource.h | 5 +- include/model/CResourceMonitor.h | 39 ++++----- include/model/CTokenListCategory.h | 34 ++++---- include/model/CTokenListDataCategorizerBase.h | 5 +- lib/api/CAnomalyJob.cc | 4 +- lib/api/CJsonOutputWriter.cc | 2 +- lib/api/CModelSizeStatsJsonWriter.cc | 2 +- lib/api/unittest/CJsonOutputWriterTest.cc | 2 +- .../unittest/CModelSnapshotJsonWriterTest.cc | 56 +++++++------ lib/model/CAnomalyDetector.cc | 8 +- lib/model/CResourceMonitor.cc | 31 +------ lib/model/CTokenListCategory.cc | 1 - lib/model/CTokenListDataCategorizerBase.cc | 43 +++++++--- lib/model/unittest/CResourceMonitorTest.cc | 80 ++++++++++--------- 19 files changed, 168 insertions(+), 162 deletions(-) diff --git a/include/api/CAnomalyJob.h b/include/api/CAnomalyJob.h index ff753dacd3..e87837e65c 100644 --- a/include/api/CAnomalyJob.h +++ b/include/api/CAnomalyJob.h @@ -120,14 +120,14 @@ class API_EXPORT CAnomalyJob : public CDataProcessor { struct SBackgroundPersistArgs { SBackgroundPersistArgs(core_t::TTime time, - const model::CResourceMonitor::SResults& modelSizeStats, + const model::CResourceMonitor::SModelSizeStats& modelSizeStats, const model::CInterimBucketCorrector& interimBucketCorrector, const model::CHierarchicalResultsAggregator& aggregator, core_t::TTime latestRecordTime, core_t::TTime lastResultsTime); core_t::TTime s_Time; - model::CResourceMonitor::SResults s_ModelSizeStats; + model::CResourceMonitor::SModelSizeStats s_ModelSizeStats; model::CInterimBucketCorrector s_InterimBucketCorrector; model::CHierarchicalResultsAggregator s_Aggregator; std::string s_NormalizerState; @@ -258,7 +258,7 @@ class API_EXPORT CAnomalyJob : public CDataProcessor { bool persistCopiedState(const std::string& descriptionPrefix, core_t::TTime time, const TKeyCRefAnomalyDetectorPtrPrVec& detectors, - const model::CResourceMonitor::SResults& modelSizeStats, + const model::CResourceMonitor::SModelSizeStats& modelSizeStats, const model::CInterimBucketCorrector& 
interimBucketCorrector, const model::CHierarchicalResultsAggregator& aggregator, const std::string& normalizerState, diff --git a/include/api/CJsonOutputWriter.h b/include/api/CJsonOutputWriter.h index 220f014b02..ded2ef414b 100644 --- a/include/api/CJsonOutputWriter.h +++ b/include/api/CJsonOutputWriter.h @@ -232,7 +232,7 @@ class API_EXPORT CJsonOutputWriter : public COutputHandler { //! Report the current levels of resource usage, as given to us //! from the CResourceMonitor via a callback - void reportMemoryUsage(const model::CResourceMonitor::SResults& results); + void reportMemoryUsage(const model::CResourceMonitor::SModelSizeStats& modelSizeStats); //! Acknowledge a flush request by echoing back the flush ID void acknowledgeFlush(const std::string& flushId, core_t::TTime lastFinalizedBucketEnd); diff --git a/include/api/CModelSizeStatsJsonWriter.h b/include/api/CModelSizeStatsJsonWriter.h index 3296c0a6c1..1c6beccd07 100644 --- a/include/api/CModelSizeStatsJsonWriter.h +++ b/include/api/CModelSizeStatsJsonWriter.h @@ -24,7 +24,7 @@ class API_EXPORT CModelSizeStatsJsonWriter : private core::CNonInstantiatable { public: //! Writes the model size stats in the \p results in JSON format. 
static void write(const std::string& jobId, - const model::CResourceMonitor::SResults& results, + const model::CResourceMonitor::SModelSizeStats& results, core::CRapidJsonConcurrentLineWriter& writer); }; } diff --git a/include/api/CModelSnapshotJsonWriter.h b/include/api/CModelSnapshotJsonWriter.h index bf348a1a2c..934a190873 100644 --- a/include/api/CModelSnapshotJsonWriter.h +++ b/include/api/CModelSnapshotJsonWriter.h @@ -33,7 +33,7 @@ class API_EXPORT CModelSnapshotJsonWriter { std::string s_Description; std::string s_SnapshotId; size_t s_NumDocs; - model::CResourceMonitor::SResults s_ModelSizeStats; + model::CResourceMonitor::SModelSizeStats s_ModelSizeStats; std::string s_NormalizerState; core_t::TTime s_LatestRecordTime; core_t::TTime s_LatestFinalResultTime; diff --git a/include/model/CAnomalyDetector.h b/include/model/CAnomalyDetector.h index f875b05187..66d2aa450a 100644 --- a/include/model/CAnomalyDetector.h +++ b/include/model/CAnomalyDetector.h @@ -270,9 +270,9 @@ class MODEL_EXPORT CAnomalyDetector : public CMonitoredResource { //! Prune the model. void prune(std::size_t maximumAge) override; - //! Update the overall model memory stats results with stats from this - //! anomaly detector. - void updateMemoryResults(CResourceMonitor::SResults& results) const override; + //! Update the overall model size stats with information from this anomaly + //! detector. + void updateModelSizeStats(CResourceMonitor::SModelSizeStats& modelSizeStats) const override; //! Get end of the last complete bucket we've observed. const core_t::TTime& lastBucketEndTime() const; diff --git a/include/model/CMonitoredResource.h b/include/model/CMonitoredResource.h index 447874bc6f..a091991aa1 100644 --- a/include/model/CMonitoredResource.h +++ b/include/model/CMonitoredResource.h @@ -58,9 +58,10 @@ class MODEL_EXPORT CMonitoredResource { //! discarding the least recently seen entities it knows about. virtual void prune(std::size_t maximumAge); - //! 
Update the overall model memory stats results with stats from this + //! Update the overall model size stats results with stats from this //! monitored resource. - virtual void updateMemoryResults(CResourceMonitor::SResults& results) const = 0; + virtual void + updateModelSizeStats(CResourceMonitor::SModelSizeStats& modelSizeStats) const = 0; }; } } diff --git a/include/model/CResourceMonitor.h b/include/model/CResourceMonitor.h index f23cbe858c..c923fe8a88 100644 --- a/include/model/CResourceMonitor.h +++ b/include/model/CResourceMonitor.h @@ -38,29 +38,30 @@ class CMonitoredResource; //! Assess memory used by models and decide on further memory allocations. class MODEL_EXPORT CResourceMonitor { public: - struct MODEL_EXPORT SResults { - std::size_t s_Usage; - std::size_t s_AdjustedUsage; - std::size_t s_ByFields; - std::size_t s_PartitionFields; - std::size_t s_OverFields; - std::size_t s_AllocationFailures; - model_t::EMemoryStatus s_MemoryStatus; - core_t::TTime s_BucketStartTime; - std::size_t s_BytesExceeded; - std::size_t s_BytesMemoryLimit; - std::size_t s_CategorizedMessages; - std::size_t s_TotalCategories; - std::size_t s_FrequentCategories; - std::size_t s_RareCategories; - std::size_t s_DeadCategories; - model_t::ECategorizationStatus s_CategorizationStatus; + struct MODEL_EXPORT SModelSizeStats { + std::size_t s_Usage = 0; + std::size_t s_AdjustedUsage = 0; + std::size_t s_ByFields = 0; + std::size_t s_PartitionFields = 0; + std::size_t s_OverFields = 0; + std::size_t s_AllocationFailures = 0; + model_t::EMemoryStatus s_MemoryStatus = model_t::E_MemoryStatusOk; + core_t::TTime s_BucketStartTime = 0; + std::size_t s_BytesExceeded = 0; + std::size_t s_BytesMemoryLimit = 0; + std::size_t s_CategorizedMessages = 0; + std::size_t s_TotalCategories = 0; + std::size_t s_FrequentCategories = 0; + std::size_t s_RareCategories = 0; + std::size_t s_DeadCategories = 0; + model_t::ECategorizationStatus s_CategorizationStatus = model_t::E_CategorizationStatusOk; }; 
public: using TMonitoredResourcePtrSizeUMap = boost::unordered_map; - using TMemoryUsageReporterFunc = std::function; + using TMemoryUsageReporterFunc = + std::function; using TTimeSizeMap = std::map; //! The minimum time between prunes @@ -115,7 +116,7 @@ class MODEL_EXPORT CResourceMonitor { void sendMemoryUsageReport(core_t::TTime bucketStartTime); //! Create a memory usage report - SResults createMemoryUsageReport(core_t::TTime bucketStartTime); + SModelSizeStats createMemoryUsageReport(core_t::TTime bucketStartTime); //! We are being told that a class has failed to allocate memory //! based on the resource limits, and we will report this to the diff --git a/include/model/CTokenListCategory.h b/include/model/CTokenListCategory.h index 999ad5def6..ace04d4c01 100644 --- a/include/model/CTokenListCategory.h +++ b/include/model/CTokenListCategory.h @@ -10,6 +10,7 @@ #include +#include #include #include #include @@ -94,35 +95,34 @@ class MODEL_EXPORT CTokenListCategory { //! this category's common unique tokens? std::size_t missingCommonTokenWeight(const TSizeSizeMap& uniqueTokenIds) const; - //! Is the weight of tokens in a given map that are missing from this - //! category's common unique tokens equal to zero? It is possible to test: + //! Is the weight of tokens in the provided container that are missing from + //! this category's common unique tokens equal to zero? It is possible to + //! test: //! if (category.missingCommonTokenWeight(uniqueTokenIds) == 0) //! instead of calling this method. However, this method is much faster //! as it can return false as soon as a mismatch occurs. + //! \param uniqueTokenIds A container of pairs where the first element is + //! a token ID and the container is sorted into + //! ascending token ID order. 
template bool isMissingCommonTokenWeightZero(const PAIR_CONTAINER& uniqueTokenIds) const { - auto commonIter = m_CommonUniqueTokenIds.begin(); auto testIter = uniqueTokenIds.begin(); - while (commonIter != m_CommonUniqueTokenIds.end() && - testIter != uniqueTokenIds.end()) { - if (commonIter->first < testIter->first) { + for (auto commonIter = m_CommonUniqueTokenIds.begin(); + commonIter != m_CommonUniqueTokenIds.end(); ++commonIter) { + testIter = std::find_if(testIter, uniqueTokenIds.end(), + [&commonIter](const auto& testItem) { + return testItem.first >= commonIter->first; + }); + if (testIter == uniqueTokenIds.end() || + testIter->first != commonIter->first || + testIter->second != commonIter->second) { return false; } - - if (commonIter->first == testIter->first) { - // The tokens must appear the same number of times in the two - // strings - if (commonIter->second != testIter->second) { - return false; - } - ++commonIter; - } - ++testIter; } - return commonIter == m_CommonUniqueTokenIds.end(); + return true; } //! Does the supplied token vector contain all our common tokens in the diff --git a/include/model/CTokenListDataCategorizerBase.h b/include/model/CTokenListDataCategorizerBase.h index b4c655b994..1f475e17e2 100644 --- a/include/model/CTokenListDataCategorizerBase.h +++ b/include/model/CTokenListDataCategorizerBase.h @@ -157,9 +157,8 @@ class MODEL_EXPORT CTokenListDataCategorizerBase : public CDataCategorizer { //! Get the memory used by this categorizer. std::size_t memoryUsage() const override; - //! Currently the overall model memory stats do not contain any categorizer - //! stats fields. - void updateMemoryResults(CResourceMonitor::SResults& results) const override; + //! Update the model size stats with information from this categorizer. + void updateModelSizeStats(CResourceMonitor::SModelSizeStats& modelSizeStats) const override; protected: //! Split the string into a list of tokens. 
The result of the diff --git a/lib/api/CAnomalyJob.cc b/lib/api/CAnomalyJob.cc index 7724adf99c..2752884494 100644 --- a/lib/api/CAnomalyJob.cc +++ b/lib/api/CAnomalyJob.cc @@ -1179,7 +1179,7 @@ bool CAnomalyJob::persistModelsState(const TKeyCRefAnomalyDetectorPtrPrVec& dete bool CAnomalyJob::persistCopiedState(const std::string& descriptionPrefix, core_t::TTime time, const TKeyCRefAnomalyDetectorPtrPrVec& detectors, - const model::CResourceMonitor::SResults& modelSizeStats, + const model::CResourceMonitor::SModelSizeStats& modelSizeStats, const model::CInterimBucketCorrector& interimBucketCorrector, const model::CHierarchicalResultsAggregator& aggregator, const std::string& normalizerState, @@ -1563,7 +1563,7 @@ void CAnomalyJob::addRecord(const TAnomalyDetectorPtr detector, CAnomalyJob::SBackgroundPersistArgs::SBackgroundPersistArgs( core_t::TTime time, - const model::CResourceMonitor::SResults& modelSizeStats, + const model::CResourceMonitor::SModelSizeStats& modelSizeStats, const model::CInterimBucketCorrector& interimBucketCorrector, const model::CHierarchicalResultsAggregator& aggregator, core_t::TTime latestRecordTime, diff --git a/lib/api/CJsonOutputWriter.cc b/lib/api/CJsonOutputWriter.cc index b4d061ae7f..461d8d3fcc 100644 --- a/lib/api/CJsonOutputWriter.cc +++ b/lib/api/CJsonOutputWriter.cc @@ -857,7 +857,7 @@ void CJsonOutputWriter::popAllocator() { m_Writer.popAllocator(); } -void CJsonOutputWriter::reportMemoryUsage(const model::CResourceMonitor::SResults& results) { +void CJsonOutputWriter::reportMemoryUsage(const model::CResourceMonitor::SModelSizeStats& results) { m_Writer.StartObject(); CModelSizeStatsJsonWriter::write(m_JobId, results, m_Writer); m_Writer.EndObject(); diff --git a/lib/api/CModelSizeStatsJsonWriter.cc b/lib/api/CModelSizeStatsJsonWriter.cc index e91668382e..7d66e5875d 100644 --- a/lib/api/CModelSizeStatsJsonWriter.cc +++ b/lib/api/CModelSizeStatsJsonWriter.cc @@ -34,7 +34,7 @@ const std::string LOG_TIME{"log_time"}; } void 
CModelSizeStatsJsonWriter::write(const std::string& jobId, - const model::CResourceMonitor::SResults& results, + const model::CResourceMonitor::SModelSizeStats& results, core::CRapidJsonConcurrentLineWriter& writer) { writer.String(MODEL_SIZE_STATS); writer.StartObject(); diff --git a/lib/api/unittest/CJsonOutputWriterTest.cc b/lib/api/unittest/CJsonOutputWriterTest.cc index 5386f6960d..e85380f57c 100644 --- a/lib/api/unittest/CJsonOutputWriterTest.cc +++ b/lib/api/unittest/CJsonOutputWriterTest.cc @@ -1746,7 +1746,7 @@ BOOST_AUTO_TEST_CASE(testReportMemoryUsage) { ml::core::CJsonOutputStreamWrapper outputStream(sstream); ml::api::CJsonOutputWriter writer("job", outputStream); - ml::model::CResourceMonitor::SResults resourceUsage; + ml::model::CResourceMonitor::SModelSizeStats resourceUsage; resourceUsage.s_Usage = 1; resourceUsage.s_AdjustedUsage = 2; resourceUsage.s_ByFields = 3; diff --git a/lib/api/unittest/CModelSnapshotJsonWriterTest.cc b/lib/api/unittest/CModelSnapshotJsonWriterTest.cc index bd325ca0c9..ff12ce0e61 100644 --- a/lib/api/unittest/CModelSnapshotJsonWriterTest.cc +++ b/lib/api/unittest/CModelSnapshotJsonWriterTest.cc @@ -15,6 +15,7 @@ #include +#include #include BOOST_AUTO_TEST_SUITE(CModelSnapshotJsonWriterTest) @@ -27,7 +28,7 @@ BOOST_AUTO_TEST_CASE(testWrite) { // The output writer won't close the JSON structures until is is destroyed { - model::CResourceMonitor::SResults modelSizeStats{ + model::CResourceMonitor::SModelSizeStats modelSizeStats{ 10000, // bytes used 20000, // bytes used (adjusted) 3, // # by fields @@ -82,18 +83,18 @@ BOOST_AUTO_TEST_CASE(testWrite) { BOOST_REQUIRE_EQUAL(std::string("test_snapshot_id"), std::string(snapshot["snapshot_id"].GetString())); BOOST_TEST_REQUIRE(snapshot.HasMember("snapshot_doc_count")); - BOOST_REQUIRE_EQUAL(int64_t(15), snapshot["snapshot_doc_count"].GetInt64()); + BOOST_REQUIRE_EQUAL(std::uint64_t(15), snapshot["snapshot_doc_count"].GetUint64()); 
BOOST_TEST_REQUIRE(snapshot.HasMember("timestamp")); - BOOST_REQUIRE_EQUAL(int64_t(1521046309000), snapshot["timestamp"].GetInt64()); + BOOST_REQUIRE_EQUAL(std::int64_t(1521046309000), snapshot["timestamp"].GetInt64()); BOOST_TEST_REQUIRE(snapshot.HasMember("description")); BOOST_REQUIRE_EQUAL(std::string("the snapshot description"), std::string(snapshot["description"].GetString())); BOOST_TEST_REQUIRE(snapshot.HasMember("latest_record_time_stamp")); - BOOST_REQUIRE_EQUAL(int64_t(1521046409000), - snapshot["latest_record_time_stamp"].GetInt64()); + BOOST_REQUIRE_EQUAL(std::uint64_t(1521046409000), + snapshot["latest_record_time_stamp"].GetUint64()); BOOST_TEST_REQUIRE(snapshot.HasMember("latest_result_time_stamp")); - BOOST_REQUIRE_EQUAL(int64_t(1521040000000), - snapshot["latest_result_time_stamp"].GetInt64()); + BOOST_REQUIRE_EQUAL(std::uint64_t(1521040000000), + snapshot["latest_result_time_stamp"].GetUint64()); BOOST_TEST_REQUIRE(snapshot.HasMember("model_size_stats")); const rapidjson::Value& modelSizeStats = snapshot["model_size_stats"]; @@ -101,40 +102,47 @@ BOOST_AUTO_TEST_CASE(testWrite) { BOOST_REQUIRE_EQUAL(std::string("job"), std::string(modelSizeStats["job_id"].GetString())); BOOST_TEST_REQUIRE(modelSizeStats.HasMember("model_bytes")); - BOOST_REQUIRE_EQUAL(int64_t(20000), modelSizeStats["model_bytes"].GetInt64()); + BOOST_REQUIRE_EQUAL(std::uint64_t(20000), modelSizeStats["model_bytes"].GetUint64()); BOOST_TEST_REQUIRE(modelSizeStats.HasMember("total_by_field_count")); - BOOST_REQUIRE_EQUAL(int64_t(3), modelSizeStats["total_by_field_count"].GetInt64()); + BOOST_REQUIRE_EQUAL(std::uint64_t(3), + modelSizeStats["total_by_field_count"].GetUint64()); BOOST_TEST_REQUIRE(modelSizeStats.HasMember("total_partition_field_count")); - BOOST_REQUIRE_EQUAL(int64_t(1), - modelSizeStats["total_partition_field_count"].GetInt64()); + BOOST_REQUIRE_EQUAL(std::uint64_t(1), + modelSizeStats["total_partition_field_count"].GetUint64()); 
BOOST_TEST_REQUIRE(modelSizeStats.HasMember("total_over_field_count")); - BOOST_REQUIRE_EQUAL(int64_t(150), modelSizeStats["total_over_field_count"].GetInt64()); + BOOST_REQUIRE_EQUAL(std::uint64_t(150), + modelSizeStats["total_over_field_count"].GetUint64()); BOOST_TEST_REQUIRE(modelSizeStats.HasMember("bucket_allocation_failures_count")); - BOOST_REQUIRE_EQUAL( - int64_t(4), modelSizeStats["bucket_allocation_failures_count"].GetInt64()); + BOOST_REQUIRE_EQUAL(std::uint64_t(4), + modelSizeStats["bucket_allocation_failures_count"].GetUint64()); BOOST_TEST_REQUIRE(modelSizeStats.HasMember("memory_status")); BOOST_REQUIRE_EQUAL(std::string("ok"), std::string(modelSizeStats["memory_status"].GetString())); BOOST_TEST_REQUIRE(modelSizeStats.HasMember("model_bytes_exceeded")); - BOOST_REQUIRE_EQUAL(int64_t(0), modelSizeStats["model_bytes_exceeded"].GetInt64()); + BOOST_REQUIRE_EQUAL(std::uint64_t(0), + modelSizeStats["model_bytes_exceeded"].GetUint64()); BOOST_TEST_REQUIRE(modelSizeStats.HasMember("model_bytes_memory_limit")); - BOOST_REQUIRE_EQUAL(int64_t(50000), - modelSizeStats["model_bytes_memory_limit"].GetInt64()); + BOOST_REQUIRE_EQUAL(std::uint64_t(50000), + modelSizeStats["model_bytes_memory_limit"].GetUint64()); BOOST_TEST_REQUIRE(modelSizeStats.HasMember("categorized_doc_count")); - BOOST_REQUIRE_EQUAL(int64_t(1000), modelSizeStats["categorized_doc_count"].GetInt64()); + BOOST_REQUIRE_EQUAL(std::uint64_t(1000), + modelSizeStats["categorized_doc_count"].GetUint64()); BOOST_TEST_REQUIRE(modelSizeStats.HasMember("total_category_count")); - BOOST_REQUIRE_EQUAL(int64_t(100), modelSizeStats["total_category_count"].GetInt64()); + BOOST_REQUIRE_EQUAL(std::uint64_t(100), + modelSizeStats["total_category_count"].GetUint64()); BOOST_TEST_REQUIRE(modelSizeStats.HasMember("frequent_category_count")); - BOOST_REQUIRE_EQUAL(int64_t(7), modelSizeStats["frequent_category_count"].GetInt64()); + BOOST_REQUIRE_EQUAL(std::uint64_t(7), + 
modelSizeStats["frequent_category_count"].GetUint64()); BOOST_TEST_REQUIRE(modelSizeStats.HasMember("rare_category_count")); - BOOST_REQUIRE_EQUAL(int64_t(13), modelSizeStats["rare_category_count"].GetInt64()); + BOOST_REQUIRE_EQUAL(std::uint64_t(13), + modelSizeStats["rare_category_count"].GetUint64()); BOOST_TEST_REQUIRE(modelSizeStats.HasMember("dead_category_count")); - BOOST_REQUIRE_EQUAL(int64_t(2), modelSizeStats["dead_category_count"].GetInt64()); + BOOST_REQUIRE_EQUAL(std::uint64_t(2), modelSizeStats["dead_category_count"].GetUint64()); BOOST_TEST_REQUIRE(modelSizeStats.HasMember("memory_status")); BOOST_REQUIRE_EQUAL(std::string("poor"), std::string(modelSizeStats["categorization_status"].GetString())); BOOST_TEST_REQUIRE(modelSizeStats.HasMember("timestamp")); - BOOST_REQUIRE_EQUAL(int64_t(1521046309000), modelSizeStats["timestamp"].GetInt64()); + BOOST_REQUIRE_EQUAL(std::int64_t(1521046309000), modelSizeStats["timestamp"].GetInt64()); BOOST_TEST_REQUIRE(modelSizeStats.HasMember("log_time")); BOOST_TEST_REQUIRE(snapshot.HasMember("quantiles")); @@ -145,7 +153,7 @@ BOOST_AUTO_TEST_CASE(testWrite) { BOOST_REQUIRE_EQUAL(std::string("some normalizer state"), std::string(quantiles["quantile_state"].GetString())); BOOST_TEST_REQUIRE(quantiles.HasMember("timestamp")); - BOOST_REQUIRE_EQUAL(int64_t(1521040000000), quantiles["timestamp"].GetInt64()); + BOOST_REQUIRE_EQUAL(std::int64_t(1521040000000), quantiles["timestamp"].GetInt64()); } BOOST_AUTO_TEST_SUITE_END() diff --git a/lib/model/CAnomalyDetector.cc b/lib/model/CAnomalyDetector.cc index ff8cb72d79..2a851ec056 100644 --- a/lib/model/CAnomalyDetector.cc +++ b/lib/model/CAnomalyDetector.cc @@ -627,11 +627,11 @@ void CAnomalyDetector::prune(std::size_t maximumAge) { m_Model->prune(maximumAge); } -void CAnomalyDetector::updateMemoryResults(CResourceMonitor::SResults& results) const { - ++results.s_PartitionFields; +void CAnomalyDetector::updateModelSizeStats(CResourceMonitor::SModelSizeStats& modelSizeStats) 
const { + ++modelSizeStats.s_PartitionFields; const auto& dataGatherer = m_Model->dataGatherer(); - results.s_OverFields += dataGatherer.numberOverFieldValues(); - results.s_ByFields += dataGatherer.numberByFieldValues(); + modelSizeStats.s_OverFields += dataGatherer.numberOverFieldValues(); + modelSizeStats.s_ByFields += dataGatherer.numberByFieldValues(); } const core_t::TTime& CAnomalyDetector::lastBucketEndTime() const { diff --git a/lib/model/CResourceMonitor.cc b/lib/model/CResourceMonitor.cc index b75f54c41e..3b60903f7b 100644 --- a/lib/model/CResourceMonitor.cc +++ b/lib/model/CResourceMonitor.cc @@ -260,42 +260,19 @@ void CResourceMonitor::sendMemoryUsageReport(core_t::TTime bucketStartTime) { m_PreviousTotal = total; } -CResourceMonitor::SResults CResourceMonitor::createMemoryUsageReport(core_t::TTime bucketStartTime) { - SResults res; - res.s_ByFields = 0; - res.s_OverFields = 0; - res.s_PartitionFields = 0; +CResourceMonitor::SModelSizeStats +CResourceMonitor::createMemoryUsageReport(core_t::TTime bucketStartTime) { + SModelSizeStats res; res.s_Usage = this->totalMemory(); res.s_AdjustedUsage = this->adjustedUsage(res.s_Usage); res.s_BytesMemoryLimit = 2 * m_ByteLimitHigh; res.s_BytesExceeded = m_CurrentBytesExceeded; - res.s_AllocationFailures = 0; res.s_MemoryStatus = m_MemoryStatus; res.s_BucketStartTime = bucketStartTime; - res.s_CategorizedMessages = 0; - res.s_TotalCategories = 0; - res.s_FrequentCategories = 0; - res.s_RareCategories = 0; - res.s_DeadCategories = 0; - res.s_CategorizationStatus = model_t::E_CategorizationStatusOk; for (const auto& resource : m_Resources) { - resource.first->updateMemoryResults(res); + resource.first->updateModelSizeStats(res); } res.s_AllocationFailures += m_AllocationFailures.size(); - // Categorization status is poor if: - // - At least 100 messages have been categorized - // and one of the following holds: - // - There is only 1 category - // - More than 90% of categories have 1 message - // - The number of 
categories is greater than 50% of the number of categorized messages - // - There are no frequent match categories - // - More than 50% of categories are dead - if (res.s_CategorizedMessages > 100 && - (res.s_TotalCategories == 1 || 10 * res.s_RareCategories > 9 * res.s_TotalCategories || - 2 * res.s_TotalCategories > res.s_CategorizedMessages || - res.s_FrequentCategories == 0 || 2 * res.s_DeadCategories > res.s_TotalCategories)) { - res.s_CategorizationStatus = model_t::E_CategorizationStatusPoor; - } return res; } diff --git a/lib/model/CTokenListCategory.cc b/lib/model/CTokenListCategory.cc index d486a50ddb..16f3e41642 100644 --- a/lib/model/CTokenListCategory.cc +++ b/lib/model/CTokenListCategory.cc @@ -11,7 +11,6 @@ #include #include -#include #include namespace ml { diff --git a/lib/model/CTokenListDataCategorizerBase.cc b/lib/model/CTokenListDataCategorizerBase.cc index f2880b2489..c8a7e319ac 100644 --- a/lib/model/CTokenListDataCategorizerBase.cc +++ b/lib/model/CTokenListDataCategorizerBase.cc @@ -91,11 +91,11 @@ int CTokenListDataCategorizerBase::computeCategory(bool isDryRun, // further checks. The first condition here ensures that we never say // a string with tokens matches the reverse search of a string with no // tokens (which the other criteria alone might say matched). 
- bool matchesSearch( + bool matchesSearch{ (baseWeight == 0) == (workWeight == 0) && compCategory.maxMatchingStringLen() >= rawStringLen && compCategory.isMissingCommonTokenWeightZero(m_WorkTokenUniqueIds) && - compCategory.containsCommonInOrderTokensInOrder(m_WorkTokenIds)); + compCategory.containsCommonInOrderTokensInOrder(m_WorkTokenIds)}; if (!matchesSearch) { // Quickly rule out wildly different token weights prior to doing // the expensive similarity calculations @@ -528,15 +528,15 @@ std::size_t CTokenListDataCategorizerBase::memoryUsage() const { return mem; } -void CTokenListDataCategorizerBase::updateMemoryResults(CResourceMonitor::SResults& results) const { +void CTokenListDataCategorizerBase::updateModelSizeStats(CResourceMonitor::SModelSizeStats& modelSizeStats) const { - results.s_TotalCategories = m_Categories.size(); + modelSizeStats.s_TotalCategories = m_Categories.size(); std::size_t categorizedMessagesThisCategorizer{0}; for (auto categoryByCount : m_CategoriesByCount) { categorizedMessagesThisCategorizer += categoryByCount.first; } - results.s_CategorizedMessages += categorizedMessagesThisCategorizer; + modelSizeStats.s_CategorizedMessages += categorizedMessagesThisCategorizer; for (std::size_t i = 0; i < m_CategoriesByCount.size(); ++i) { const CTokenListCategory& category{m_Categories[m_CategoriesByCount[i].second]}; @@ -544,21 +544,21 @@ void CTokenListDataCategorizerBase::updateMemoryResults(CResourceMonitor::SResul // - rare = single match // - frequent = matches more than 1% of messages if (category.numMatches() == 1) { - ++results.s_RareCategories; + ++modelSizeStats.s_RareCategories; } else if (category.numMatches() * 100 > categorizedMessagesThisCategorizer) { - ++results.s_FrequentCategories; + ++modelSizeStats.s_FrequentCategories; } for (std::size_t j = 0; j < i; ++j) { const CTokenListCategory& moreFrequentCategory{ m_Categories[m_CategoriesByCount[j].second]}; - bool matchesSearch(moreFrequentCategory.maxMatchingStringLen() >= + 
bool matchesSearch{moreFrequentCategory.maxMatchingStringLen() >= category.maxMatchingStringLen() && moreFrequentCategory.isMissingCommonTokenWeightZero( category.commonUniqueTokenIds()) && moreFrequentCategory.containsCommonInOrderTokensInOrder( - category.baseTokenIds())); + category.baseTokenIds())}; if (matchesSearch) { - ++results.s_DeadCategories; + ++modelSizeStats.s_DeadCategories; LOG_DEBUG(<< "Category " << (m_CategoriesByCount[i].second + 1) << " (" << category.baseString() << ") is killed by category " << (m_CategoriesByCount[j].second + 1) << " (" @@ -567,11 +567,30 @@ void CTokenListDataCategorizerBase::updateMemoryResults(CResourceMonitor::SResul } } } + + // Categorization status is poor if: + // - At least 100 messages have been categorized + // and one of the following holds: + // - There is only 1 category + // - More than 90% of categories are rare + // - The number of categories is greater than 50% of the number of categorized messages + // - There are no frequent match categories + // - More than 50% of categories are dead + if (modelSizeStats.s_CategorizedMessages > 100 && + (modelSizeStats.s_TotalCategories == 1 || + 10 * modelSizeStats.s_RareCategories > 9 * modelSizeStats.s_TotalCategories || + 2 * modelSizeStats.s_TotalCategories > modelSizeStats.s_CategorizedMessages || + modelSizeStats.s_FrequentCategories == 0 || + 2 * modelSizeStats.s_DeadCategories > modelSizeStats.s_TotalCategories)) { + modelSizeStats.s_CategorizationStatus = model_t::E_CategorizationStatusPoor; + } else { + modelSizeStats.s_CategorizationStatus = model_t::E_CategorizationStatusOk; + } } CTokenListDataCategorizerBase::CTokenInfoItem::CTokenInfoItem(const std::string& str, std::size_t index) - : m_Str(str), m_Index(index), m_CategoryCount(0) { + : m_Str{str}, m_Index{index}, m_CategoryCount{0} { } const std::string& CTokenListDataCategorizerBase::CTokenInfoItem::str() const { @@ -611,7 +630,7 @@ 
CTokenListDataCategorizerBase::CSizePairFirstElementEquals::CSizePairFirstElemen CTokenListDataCategorizerBase::SIdTranslater::SIdTranslater(const CTokenListDataCategorizerBase& categorizer, const TSizeSizePrVec& tokenIds, char separator) - : s_Categorizer(categorizer), s_TokenIds(tokenIds), s_Separator(separator) { + : s_Categorizer{categorizer}, s_TokenIds{tokenIds}, s_Separator{separator} { } std::ostream& operator<<(std::ostream& strm, diff --git a/lib/model/unittest/CResourceMonitorTest.cc b/lib/model/unittest/CResourceMonitorTest.cc index 5742c1a646..047a13467a 100644 --- a/lib/model/unittest/CResourceMonitorTest.cc +++ b/lib/model/unittest/CResourceMonitorTest.cc @@ -32,8 +32,8 @@ class CTestFixture { CStringStore::influencers().clearEverythingTestOnly(); } - void reportCallback(const CResourceMonitor::SResults& results) { - m_CallbackResults = results; + void reportCallback(const CResourceMonitor::SModelSizeStats& modelSizeStats) { + m_ReportedModelSizeStats = modelSizeStats; } void addTestData(core_t::TTime& firstTime, @@ -83,7 +83,7 @@ class CTestFixture { } protected: - CResourceMonitor::SResults m_CallbackResults; + CResourceMonitor::SModelSizeStats m_ReportedModelSizeStats; }; BOOST_FIXTURE_TEST_CASE(testMonitor, CTestFixture) { @@ -252,15 +252,15 @@ BOOST_FIXTURE_TEST_CASE(testMonitor, CTestFixture) { mon.memoryUsageReporter(std::bind(&CTestFixture::reportCallback, this, std::placeholders::_1)); - m_CallbackResults.s_Usage = 0; - BOOST_REQUIRE_EQUAL(std::size_t(0), m_CallbackResults.s_Usage); + m_ReportedModelSizeStats.s_Usage = 0; + BOOST_REQUIRE_EQUAL(std::size_t(0), m_ReportedModelSizeStats.s_Usage); mon.refresh(categorizer); mon.refresh(detector1); mon.refresh(detector2); mon.sendMemoryUsageReportIfSignificantlyChanged(0); - BOOST_REQUIRE_EQUAL(mem, m_CallbackResults.s_Usage); - BOOST_REQUIRE_EQUAL(model_t::E_MemoryStatusOk, m_CallbackResults.s_MemoryStatus); + BOOST_REQUIRE_EQUAL(mem, m_ReportedModelSizeStats.s_Usage); + 
BOOST_REQUIRE_EQUAL(model_t::E_MemoryStatusOk, m_ReportedModelSizeStats.s_MemoryStatus); } { // Test the report callback for allocation failures @@ -271,8 +271,8 @@ BOOST_FIXTURE_TEST_CASE(testMonitor, CTestFixture) { mon.memoryUsageReporter(std::bind(&CTestFixture::reportCallback, this, std::placeholders::_1)); - m_CallbackResults.s_AllocationFailures = 0; - BOOST_REQUIRE_EQUAL(std::size_t(0), m_CallbackResults.s_AllocationFailures); + m_ReportedModelSizeStats.s_AllocationFailures = 0; + BOOST_REQUIRE_EQUAL(std::size_t(0), m_ReportedModelSizeStats.s_AllocationFailures); // Set a soft-limit degraded status mon.acceptPruningResult(); @@ -282,27 +282,28 @@ BOOST_FIXTURE_TEST_CASE(testMonitor, CTestFixture) { mon.refresh(detector1); mon.refresh(detector2); mon.sendMemoryUsageReportIfSignificantlyChanged(0); - BOOST_REQUIRE_EQUAL(std::size_t(0), m_CallbackResults.s_AllocationFailures); - BOOST_REQUIRE_EQUAL(mem, m_CallbackResults.s_Usage); - BOOST_REQUIRE_EQUAL(model_t::E_MemoryStatusSoftLimit, m_CallbackResults.s_MemoryStatus); + BOOST_REQUIRE_EQUAL(std::size_t(0), m_ReportedModelSizeStats.s_AllocationFailures); + BOOST_REQUIRE_EQUAL(mem, m_ReportedModelSizeStats.s_Usage); + BOOST_REQUIRE_EQUAL(model_t::E_MemoryStatusSoftLimit, + m_ReportedModelSizeStats.s_MemoryStatus); // Set some canary values - m_CallbackResults.s_AllocationFailures = 12345; - m_CallbackResults.s_ByFields = 54321; - m_CallbackResults.s_OverFields = 23456; - m_CallbackResults.s_PartitionFields = 65432; - m_CallbackResults.s_Usage = 1357924; + m_ReportedModelSizeStats.s_AllocationFailures = 12345; + m_ReportedModelSizeStats.s_ByFields = 54321; + m_ReportedModelSizeStats.s_OverFields = 23456; + m_ReportedModelSizeStats.s_PartitionFields = 65432; + m_ReportedModelSizeStats.s_Usage = 1357924; // This should not trigger a report mon.refresh(categorizer); mon.refresh(detector1); mon.refresh(detector2); mon.sendMemoryUsageReportIfSignificantlyChanged(0); - BOOST_REQUIRE_EQUAL(std::size_t(12345), 
m_CallbackResults.s_AllocationFailures); - BOOST_REQUIRE_EQUAL(std::size_t(54321), m_CallbackResults.s_ByFields); - BOOST_REQUIRE_EQUAL(std::size_t(23456), m_CallbackResults.s_OverFields); - BOOST_REQUIRE_EQUAL(std::size_t(65432), m_CallbackResults.s_PartitionFields); - BOOST_REQUIRE_EQUAL(std::size_t(1357924), m_CallbackResults.s_Usage); + BOOST_REQUIRE_EQUAL(std::size_t(12345), m_ReportedModelSizeStats.s_AllocationFailures); + BOOST_REQUIRE_EQUAL(std::size_t(54321), m_ReportedModelSizeStats.s_ByFields); + BOOST_REQUIRE_EQUAL(std::size_t(23456), m_ReportedModelSizeStats.s_OverFields); + BOOST_REQUIRE_EQUAL(std::size_t(65432), m_ReportedModelSizeStats.s_PartitionFields); + BOOST_REQUIRE_EQUAL(std::size_t(1357924), m_ReportedModelSizeStats.s_Usage); // Add some memory failures, which should be reported mon.acceptAllocationFailureResult(14400000); @@ -316,28 +317,29 @@ BOOST_FIXTURE_TEST_CASE(testMonitor, CTestFixture) { mon.refresh(detector1); mon.refresh(detector2); mon.sendMemoryUsageReportIfSignificantlyChanged(0); - BOOST_REQUIRE_EQUAL(std::size_t(3), m_CallbackResults.s_AllocationFailures); - BOOST_REQUIRE_EQUAL(mem, m_CallbackResults.s_Usage); + BOOST_REQUIRE_EQUAL(std::size_t(3), m_ReportedModelSizeStats.s_AllocationFailures); + BOOST_REQUIRE_EQUAL(mem, m_ReportedModelSizeStats.s_Usage); BOOST_REQUIRE_EQUAL(core_t::TTime(14402000), mon.m_LastAllocationFailureReport); - BOOST_REQUIRE_EQUAL(model_t::E_MemoryStatusHardLimit, m_CallbackResults.s_MemoryStatus); + BOOST_REQUIRE_EQUAL(model_t::E_MemoryStatusHardLimit, + m_ReportedModelSizeStats.s_MemoryStatus); // Set some canary values - m_CallbackResults.s_AllocationFailures = 12345; - m_CallbackResults.s_ByFields = 54321; - m_CallbackResults.s_OverFields = 23456; - m_CallbackResults.s_PartitionFields = 65432; - m_CallbackResults.s_Usage = 1357924; + m_ReportedModelSizeStats.s_AllocationFailures = 12345; + m_ReportedModelSizeStats.s_ByFields = 54321; + m_ReportedModelSizeStats.s_OverFields = 23456; + 
m_ReportedModelSizeStats.s_PartitionFields = 65432; + m_ReportedModelSizeStats.s_Usage = 1357924; // As nothing has changed, nothing should be reported mon.refresh(categorizer); mon.refresh(detector1); mon.refresh(detector2); mon.sendMemoryUsageReportIfSignificantlyChanged(0); - BOOST_REQUIRE_EQUAL(std::size_t(12345), m_CallbackResults.s_AllocationFailures); - BOOST_REQUIRE_EQUAL(std::size_t(54321), m_CallbackResults.s_ByFields); - BOOST_REQUIRE_EQUAL(std::size_t(23456), m_CallbackResults.s_OverFields); - BOOST_REQUIRE_EQUAL(std::size_t(65432), m_CallbackResults.s_PartitionFields); - BOOST_REQUIRE_EQUAL(std::size_t(1357924), m_CallbackResults.s_Usage); + BOOST_REQUIRE_EQUAL(std::size_t(12345), m_ReportedModelSizeStats.s_AllocationFailures); + BOOST_REQUIRE_EQUAL(std::size_t(54321), m_ReportedModelSizeStats.s_ByFields); + BOOST_REQUIRE_EQUAL(std::size_t(23456), m_ReportedModelSizeStats.s_OverFields); + BOOST_REQUIRE_EQUAL(std::size_t(65432), m_ReportedModelSizeStats.s_PartitionFields); + BOOST_REQUIRE_EQUAL(std::size_t(1357924), m_ReportedModelSizeStats.s_Usage); } { // Test the need to report usage based on a change in levels, up and down @@ -352,7 +354,7 @@ BOOST_FIXTURE_TEST_CASE(testMonitor, CTestFixture) { mon.m_MonitoredResourceCurrentMemory = 10; BOOST_TEST_REQUIRE(mon.needToSendReport()); mon.sendMemoryUsageReport(0); - BOOST_REQUIRE_EQUAL(origTotalMemory + 10, m_CallbackResults.s_Usage); + BOOST_REQUIRE_EQUAL(origTotalMemory + 10, m_ReportedModelSizeStats.s_Usage); // Nothing new added, so no report BOOST_TEST_REQUIRE(!mon.needToSendReport()); @@ -362,13 +364,13 @@ BOOST_FIXTURE_TEST_CASE(testMonitor, CTestFixture) { BOOST_TEST_REQUIRE(mon.needToSendReport()); mon.sendMemoryUsageReport(0); BOOST_REQUIRE_EQUAL(origTotalMemory + 11 + (origTotalMemory + 9) / 10, - m_CallbackResults.s_Usage); + m_ReportedModelSizeStats.s_Usage); // Huge increase should trigger a need mon.m_MonitoredResourceCurrentMemory = 1000; BOOST_TEST_REQUIRE(mon.needToSendReport()); 
mon.sendMemoryUsageReport(0); - BOOST_REQUIRE_EQUAL(origTotalMemory + 1000, m_CallbackResults.s_Usage); + BOOST_REQUIRE_EQUAL(origTotalMemory + 1000, m_ReportedModelSizeStats.s_Usage); // 0.1% increase should not trigger a need mon.m_MonitoredResourceCurrentMemory += 1 + (origTotalMemory + 999) / 1000; @@ -378,7 +380,7 @@ BOOST_FIXTURE_TEST_CASE(testMonitor, CTestFixture) { mon.m_MonitoredResourceCurrentMemory = 900; BOOST_TEST_REQUIRE(mon.needToSendReport()); mon.sendMemoryUsageReport(0); - BOOST_REQUIRE_EQUAL(origTotalMemory + 900, m_CallbackResults.s_Usage); + BOOST_REQUIRE_EQUAL(origTotalMemory + 900, m_ReportedModelSizeStats.s_Usage); // A tiny decrease should not trigger a need mon.m_MonitoredResourceCurrentMemory = 899; From 9c2ede99daf71fed166674985e1ca5573f705b9a Mon Sep 17 00:00:00 2001 From: David Roberts Date: Thu, 6 Feb 2020 14:07:53 +0000 Subject: [PATCH 3/5] Changing categorization status "poor" to "warn" --- include/model/ModelTypes.h | 4 ++-- lib/api/unittest/CModelSnapshotJsonWriterTest.cc | 4 ++-- lib/model/CTokenListDataCategorizerBase.cc | 4 ++-- lib/model/ModelTypes.cc | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/model/ModelTypes.h b/include/model/ModelTypes.h index 6945a83980..8d17f2afaf 100644 --- a/include/model/ModelTypes.h +++ b/include/model/ModelTypes.h @@ -803,11 +803,11 @@ MODEL_EXPORT std::string print(EMemoryStatus memoryStatus); //! An enumeration of the TokenListDataCategorizer status - -//! Start in the OK state. Moves into the poor state if too +//! Start in the OK state. Moves into the "warn" state if too //! few categories are being seen frequently. enum ECategorizationStatus { E_CategorizationStatusOk = 0, //!< Categorization working as intended - E_CategorizationStatusPoor = 1 //!< Too many categories being created + E_CategorizationStatusWarn = 1 //!< Too many categories being created }; //! Get a string description of \p categorizationStatus. 
diff --git a/lib/api/unittest/CModelSnapshotJsonWriterTest.cc b/lib/api/unittest/CModelSnapshotJsonWriterTest.cc index ff12ce0e61..cd75c7e1cb 100644 --- a/lib/api/unittest/CModelSnapshotJsonWriterTest.cc +++ b/lib/api/unittest/CModelSnapshotJsonWriterTest.cc @@ -44,7 +44,7 @@ BOOST_AUTO_TEST_CASE(testWrite) { 7, // frequent categories 13, // rare categories 2, // dead categories - model_t::E_CategorizationStatusPoor}; + model_t::E_CategorizationStatusWarn}; CModelSnapshotJsonWriter::SModelSnapshotReport report{ "6.3.0", @@ -139,7 +139,7 @@ BOOST_AUTO_TEST_CASE(testWrite) { BOOST_TEST_REQUIRE(modelSizeStats.HasMember("dead_category_count")); BOOST_REQUIRE_EQUAL(std::uint64_t(2), modelSizeStats["dead_category_count"].GetUint64()); BOOST_TEST_REQUIRE(modelSizeStats.HasMember("memory_status")); - BOOST_REQUIRE_EQUAL(std::string("poor"), + BOOST_REQUIRE_EQUAL(std::string("warn"), std::string(modelSizeStats["categorization_status"].GetString())); BOOST_TEST_REQUIRE(modelSizeStats.HasMember("timestamp")); BOOST_REQUIRE_EQUAL(std::int64_t(1521046309000), modelSizeStats["timestamp"].GetInt64()); diff --git a/lib/model/CTokenListDataCategorizerBase.cc b/lib/model/CTokenListDataCategorizerBase.cc index c8a7e319ac..db5055cd22 100644 --- a/lib/model/CTokenListDataCategorizerBase.cc +++ b/lib/model/CTokenListDataCategorizerBase.cc @@ -568,7 +568,7 @@ void CTokenListDataCategorizerBase::updateModelSizeStats(CResourceMonitor::SMode } } - // Categorization status is poor if: + // Categorization status is "warn" if: // - At least 100 messages have been categorized // and one of the following holds: // - There is only 1 category @@ -582,7 +582,7 @@ void CTokenListDataCategorizerBase::updateModelSizeStats(CResourceMonitor::SMode 2 * modelSizeStats.s_TotalCategories > modelSizeStats.s_CategorizedMessages || modelSizeStats.s_FrequentCategories == 0 || 2 * modelSizeStats.s_DeadCategories > modelSizeStats.s_TotalCategories)) { - modelSizeStats.s_CategorizationStatus = 
model_t::E_CategorizationStatusPoor; + modelSizeStats.s_CategorizationStatus = model_t::E_CategorizationStatusWarn; } else { modelSizeStats.s_CategorizationStatus = model_t::E_CategorizationStatusOk; } diff --git a/lib/model/ModelTypes.cc b/lib/model/ModelTypes.cc index 05889dc1a0..058982ec5f 100644 --- a/lib/model/ModelTypes.cc +++ b/lib/model/ModelTypes.cc @@ -2050,8 +2050,8 @@ std::string print(ECategorizationStatus categorizationStatus) { switch (categorizationStatus) { case E_CategorizationStatusOk: return "ok"; - case E_CategorizationStatusPoor: - return "poor"; + case E_CategorizationStatusWarn: + return "warn"; } return "-"; } From 7047f854eb3bfbd6c89c2a81561cc2f42652fe12 Mon Sep 17 00:00:00 2001 From: David Roberts Date: Thu, 6 Feb 2020 14:25:56 +0000 Subject: [PATCH 4/5] Add changelog --- docs/CHANGELOG.asciidoc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc index 19241db66d..b154bd85e7 100644 --- a/docs/CHANGELOG.asciidoc +++ b/docs/CHANGELOG.asciidoc @@ -46,6 +46,8 @@ progress, memory usage, etc. (See {ml-pull}906[#906].) * Improve initialization of learn rate for better and more stable results in regression and classification. (See {ml-pull}948[#948].) +* Add new model_size_stats fields to instrument categorization. (See {ml-pull}948[#948] +and {pull}51879[#51879], issue: {issue}50749[#50749].) 
=== Bug Fixes From 7021ae122c38d5df7031b59ae17b972a477f1441 Mon Sep 17 00:00:00 2001 From: David Roberts Date: Thu, 6 Feb 2020 15:27:12 +0000 Subject: [PATCH 5/5] Add a test for "warn" status --- include/model/CTokenListCategory.h | 11 ++--- include/model/CTokenListDataCategorizerBase.h | 16 +++++++ lib/model/CTokenListDataCategorizerBase.cc | 48 +++++++++++++++---- .../CTokenListDataCategorizerBaseTest.cc | 30 ++++++++++++ 4 files changed, 90 insertions(+), 15 deletions(-) diff --git a/include/model/CTokenListCategory.h b/include/model/CTokenListCategory.h index ace04d4c01..34208c5f59 100644 --- a/include/model/CTokenListCategory.h +++ b/include/model/CTokenListCategory.h @@ -108,15 +108,14 @@ class MODEL_EXPORT CTokenListCategory { bool isMissingCommonTokenWeightZero(const PAIR_CONTAINER& uniqueTokenIds) const { auto testIter = uniqueTokenIds.begin(); - for (auto commonIter = m_CommonUniqueTokenIds.begin(); - commonIter != m_CommonUniqueTokenIds.end(); ++commonIter) { + for (const auto& commonItem : m_CommonUniqueTokenIds) { testIter = std::find_if(testIter, uniqueTokenIds.end(), - [&commonIter](const auto& testItem) { - return testItem.first >= commonIter->first; + [&commonItem](const auto& testItem) { + return testItem.first >= commonItem.first; }); if (testIter == uniqueTokenIds.end() || - testIter->first != commonIter->first || - testIter->second != commonIter->second) { + testIter->first != commonItem.first || + testIter->second != commonItem.second) { return false; } ++testIter; diff --git a/include/model/CTokenListDataCategorizerBase.h b/include/model/CTokenListDataCategorizerBase.h index 1f475e17e2..132ebc313b 100644 --- a/include/model/CTokenListDataCategorizerBase.h +++ b/include/model/CTokenListDataCategorizerBase.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -160,6 +161,21 @@ class MODEL_EXPORT CTokenListDataCategorizerBase : public CDataCategorizer { //! 
Update the model size stats with information from this categorizer. void updateModelSizeStats(CResourceMonitor::SModelSizeStats& modelSizeStats) const override; + //! Categorization status is "warn" if: + //! - At least 100 messages have been categorized + //! and one of the following holds: + //! - There is only 1 category + //! - More than 90% of categories are rare + //! - The number of categories is greater than 50% of the number of categorized messages + //! - There are no frequent match categories + //! - More than 50% of categories are dead + static model_t::ECategorizationStatus + calculateCategorizationStatus(std::size_t categorizedMessages, + std::size_t totalCategories, + std::size_t frequentCategories, + std::size_t rareCategories, + std::size_t deadCategories); + protected: //! Split the string into a list of tokens. The result of the //! tokenisation is returned in \p tokenIds, \p tokenUniqueIds and diff --git a/lib/model/CTokenListDataCategorizerBase.cc b/lib/model/CTokenListDataCategorizerBase.cc index db5055cd22..c4569ad207 100644 --- a/lib/model/CTokenListDataCategorizerBase.cc +++ b/lib/model/CTokenListDataCategorizerBase.cc @@ -568,24 +568,54 @@ void CTokenListDataCategorizerBase::updateModelSizeStats(CResourceMonitor::SMode } } + modelSizeStats.s_CategorizationStatus = CTokenListDataCategorizerBase::calculateCategorizationStatus( + modelSizeStats.s_CategorizedMessages, modelSizeStats.s_TotalCategories, + modelSizeStats.s_FrequentCategories, modelSizeStats.s_RareCategories, + modelSizeStats.s_DeadCategories); +} + +model_t::ECategorizationStatus +CTokenListDataCategorizerBase::calculateCategorizationStatus(std::size_t categorizedMessages, + std::size_t totalCategories, + std::size_t frequentCategories, + std::size_t rareCategories, + std::size_t deadCategories) { + // Categorization status is "warn" if: + // - At least 100 messages have been categorized + if (categorizedMessages <= 100) { + return model_t::E_CategorizationStatusOk; + } + // and 
one of the following holds: + // - There is only 1 category + if (totalCategories == 1) { + return model_t::E_CategorizationStatusWarn; + } + // - More than 90% of categories are rare + if (10 * rareCategories > 9 * totalCategories) { + return model_t::E_CategorizationStatusWarn; + } + // - The number of categories is greater than 50% of the number of categorized messages + if (2 * totalCategories > categorizedMessages) { + return model_t::E_CategorizationStatusWarn; + } + // - There are no frequent match categories + if (frequentCategories == 0) { + return model_t::E_CategorizationStatusWarn; + } + // - More than 50% of categories are dead - if (modelSizeStats.s_CategorizedMessages > 100 && - (modelSizeStats.s_TotalCategories == 1 || - 10 * modelSizeStats.s_RareCategories > 9 * modelSizeStats.s_TotalCategories || - 2 * modelSizeStats.s_TotalCategories > modelSizeStats.s_CategorizedMessages || - modelSizeStats.s_FrequentCategories == 0 || - 2 * modelSizeStats.s_DeadCategories > modelSizeStats.s_TotalCategories)) { - modelSizeStats.s_CategorizationStatus = model_t::E_CategorizationStatusWarn; - } else { - modelSizeStats.s_CategorizationStatus = model_t::E_CategorizationStatusOk; + if (2 * deadCategories > totalCategories) { + return model_t::E_CategorizationStatusWarn; } + + return model_t::E_CategorizationStatusOk; } CTokenListDataCategorizerBase::CTokenInfoItem::CTokenInfoItem(const std::string& str, diff --git a/lib/model/unittest/CTokenListDataCategorizerBaseTest.cc b/lib/model/unittest/CTokenListDataCategorizerBaseTest.cc index ce28b5f722..93ba51e90f 100644 --- a/lib/model/unittest/CTokenListDataCategorizerBaseTest.cc +++ b/lib/model/unittest/CTokenListDataCategorizerBaseTest.cc @@ -60,4 +60,34 @@ BOOST_AUTO_TEST_CASE(testMaxMatchingWeights) { ml::model::CTokenListDataCategorizerBase::maxMatchingWeight(10, 0.7)); } +BOOST_AUTO_TEST_CASE(testCalculateCategorizationStatus) { + BOOST_REQUIRE_EQUAL(ml::model_t::E_CategorizationStatusOk, + 
ml::model::CTokenListDataCategorizerBase::calculateCategorizationStatus( + 99, 99, 0, 99, 0)); + + BOOST_REQUIRE_EQUAL(ml::model_t::E_CategorizationStatusWarn, + ml::model::CTokenListDataCategorizerBase::calculateCategorizationStatus( + 1000, 1, 1, 0, 0)); + + BOOST_REQUIRE_EQUAL(ml::model_t::E_CategorizationStatusWarn, + ml::model::CTokenListDataCategorizerBase::calculateCategorizationStatus( + 1000, 100, 3, 91, 1)); + + BOOST_REQUIRE_EQUAL(ml::model_t::E_CategorizationStatusWarn, + ml::model::CTokenListDataCategorizerBase::calculateCategorizationStatus( + 1000, 501, 1, 99, 0)); + + BOOST_REQUIRE_EQUAL(ml::model_t::E_CategorizationStatusWarn, + ml::model::CTokenListDataCategorizerBase::calculateCategorizationStatus( + 1000, 200, 0, 20, 0)); + + BOOST_REQUIRE_EQUAL(ml::model_t::E_CategorizationStatusWarn, + ml::model::CTokenListDataCategorizerBase::calculateCategorizationStatus( + 1000, 300, 2, 50, 151)); + + BOOST_REQUIRE_EQUAL(ml::model_t::E_CategorizationStatusOk, + ml::model::CTokenListDataCategorizerBase::calculateCategorizationStatus( + 1000, 120, 20, 40, 1)); +} + BOOST_AUTO_TEST_SUITE_END()