[ML] Add new categorization stats to model_size_stats #989

Merged: 5 commits, Feb 7, 2020
Changes from 1 commit
6 changes: 6 additions & 0 deletions include/model/CResourceMonitor.h
@@ -49,6 +49,12 @@ class MODEL_EXPORT CResourceMonitor {
core_t::TTime s_BucketStartTime;
std::size_t s_BytesExceeded;
std::size_t s_BytesMemoryLimit;
std::size_t s_CategorizedMessages;
std::size_t s_TotalCategories;
std::size_t s_FrequentCategories;
std::size_t s_RareCategories;
std::size_t s_DeadCategories;
model_t::ECategorizationStatus s_CategorizationStatus;
Contributor:
I know you're only extending, but should this provide defaults? It seems like this object is a recipe for uninitialised variables as it stands.
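
One way to address this (a minimal illustrative sketch, not the merged code) would be in-class default member initializers, shown here on a trimmed-down stand-in for the struct using the field names from the diff above:

#include <cstddef>

// Illustrative stand-in only: defaults so a freshly constructed stats object
// starts in a well-defined state instead of holding uninitialised values.
enum ECategorizationStatus { E_CategorizationStatusOk = 0, E_CategorizationStatusPoor = 1 };

struct SResultsSketch {
    std::size_t s_CategorizedMessages = 0;
    std::size_t s_TotalCategories = 0;
    std::size_t s_FrequentCategories = 0;
    std::size_t s_RareCategories = 0;
    std::size_t s_DeadCategories = 0;
    ECategorizationStatus s_CategorizationStatus = E_CategorizationStatusOk;
};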

};

public:
28 changes: 26 additions & 2 deletions include/model/CTokenListCategory.h
@@ -99,11 +99,35 @@ class MODEL_EXPORT CTokenListCategory {
//! if (category.missingCommonTokenWeight(uniqueTokenIds) == 0)
//! instead of calling this method. However, this method is much faster
//! as it can return false as soon as a mismatch occurs.
bool isMissingCommonTokenWeightZero(const TSizeSizeMap& uniqueTokenIds) const;
template<typename PAIR_CONTAINER>
bool isMissingCommonTokenWeightZero(const PAIR_CONTAINER& uniqueTokenIds) const {

auto commonIter = m_CommonUniqueTokenIds.begin();
auto testIter = uniqueTokenIds.begin();
while (commonIter != m_CommonUniqueTokenIds.end() &&
testIter != uniqueTokenIds.end()) {
if (commonIter->first < testIter->first) {
return false;
}
Contributor:
It strikes me that this becomes somewhat cleaner using std::find, i.e.

while (commonIter != m_CommonUniqueTokenIds.end()) {
  testIter = std::find(testIter, uniqueTokenIds.end(), *commonIter,
                       [](const auto& lhs, const auto& rhs) { return lhs->first >= rhs->first; }); 
  if (testIter == uniqueTokenIds.end() || *testIter != *commonIter) {
      return false;
  }
  ++commonIter;
}

I guess this is just moving well-tested code to a different place, so feel free to leave it as is if you'd rather not change it.

Contributor Author:
I more-or-less did this. The second part of the condition has to be testIter->first != commonIter->first || testIter->second != commonIter->second instead of simply *testIter != *commonIter because in some cases one is a std::pair<const std::size_t, std::size_t> and the other is a std::pair<std::size_t, std::size_t>.

Contributor:
Ok, great. It's a shame comparison doesn't work with const. That feels like an error in the std::pair implementation.
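
For readers skimming the thread, here is a hedged, standalone sketch of the shape being discussed (simplified, not the exact merged member function); it assumes both containers are sorted in ascending order of token ID:

#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

// Sketch: every common unique token must appear in the test container with the
// same count. std::find_if advances through the sorted test tokens; first and
// second are compared explicitly because the pair types can differ in const-ness.
template<typename PAIR_CONTAINER>
bool isMissingCommonTokenWeightZeroSketch(
    const std::vector<std::pair<std::size_t, std::size_t>>& commonUniqueTokenIds,
    const PAIR_CONTAINER& uniqueTokenIds) {
    auto testIter = uniqueTokenIds.begin();
    for (const auto& common : commonUniqueTokenIds) {
        testIter = std::find_if(testIter, uniqueTokenIds.end(), [&common](const auto& test) {
            return test.first >= common.first;
        });
        if (testIter == uniqueTokenIds.end() || testIter->first != common.first ||
            testIter->second != common.second) {
            return false;
        }
    }
    return true;
}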


if (commonIter->first == testIter->first) {
// The tokens must appear the same number of times in the two
// strings
if (commonIter->second != testIter->second) {
return false;
}
++commonIter;
}

++testIter;
}

return commonIter == m_CommonUniqueTokenIds.end();
}
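
A possible usage sketch illustrating why the method is now templated over PAIR_CONTAINER: the same check can be driven from either a std::map or a sorted vector of pairs without converting between them. The call sites and the ml::model namespace are assumptions for illustration:

#include <cstddef>
#include <map>
#include <utility>
#include <vector>

#include <model/CTokenListCategory.h>

// Hypothetical call sites: any sorted container of (token ID, count) pairs works.
void exampleChecks(const ml::model::CTokenListCategory& category) {
    std::map<std::size_t, std::size_t> uniqueTokenIdMap{{1, 2}, {4, 1}};
    std::vector<std::pair<std::size_t, std::size_t>> uniqueTokenIdVec{{1, 2}, {4, 1}};
    // Both calls should give the same answer for equivalent contents.
    bool fromMap{category.isMissingCommonTokenWeightZero(uniqueTokenIdMap)};
    bool fromVec{category.isMissingCommonTokenWeightZero(uniqueTokenIdVec)};
    (void)fromMap;
    (void)fromVec;
}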

//! Does the supplied token vector contain all our common tokens in the
//! same order as our base token vector?
bool containsCommonTokensInOrder(const TSizeSizePrVec& tokenIds) const;
bool containsCommonInOrderTokensInOrder(const TSizeSizePrVec& tokenIds) const;

//! \return Does the supplied token ID represent a common unique token?
bool isTokenCommon(std::size_t tokenId) const;
6 changes: 0 additions & 6 deletions include/model/CTokenListDataCategorizer.h
@@ -81,12 +81,6 @@ class CTokenListDataCategorizer : public CTokenListDataCategorizerBase {
//! Get the static size of this object - used for virtual hierarchies
std::size_t staticSize() const override { return sizeof(*this); }

//! Currently the overall model memory stats do not contain any categorizer
//! stats fields.
void updateMemoryResults(CResourceMonitor::SResults& /*results*/) const override {
// NO-OP
}

protected:
//! Split the string into a list of tokens. The result of the
//! tokenisation is returned in \p tokenIds, \p tokenUniqueIds and
18 changes: 8 additions & 10 deletions include/model/CTokenListDataCategorizerBase.h
@@ -14,7 +14,6 @@
#include <model/ImportExport.h>

#include <iosfwd>
#include <list>
#include <map>
#include <memory>
#include <string>
@@ -75,8 +74,9 @@ class MODEL_EXPORT CTokenListDataCategorizerBase : public CDataCategorizer {
//! second -> weighting
using TSizeSizePr = std::pair<std::size_t, std::size_t>;

//! Used for storing token ID sequences
//! Used for storing token ID sequences and categories with counts
using TSizeSizePrVec = std::vector<TSizeSizePr>;
using TSizeSizePrVecItr = TSizeSizePrVec::iterator;

//! Used for storing distinct token IDs
using TSizeSizeMap = std::map<std::size_t, std::size_t>;
@@ -157,6 +157,10 @@ class MODEL_EXPORT CTokenListDataCategorizerBase : public CDataCategorizer {
//! Get the memory used by this categorizer.
std::size_t memoryUsage() const override;

//! Currently the overall model memory stats do not contain any categorizer
//! stats fields.
void updateMemoryResults(CResourceMonitor::SResults& results) const override;

protected:
//! Split the string into a list of tokens. The result of the
//! tokenisation is returned in \p tokenIds, \p tokenUniqueIds and
@@ -180,19 +184,13 @@ class MODEL_EXPORT CTokenListDataCategorizerBase : public CDataCategorizer {
const TSizeSizePrVec& right,
std::size_t rightWeight) const = 0;

//! Used to hold statistics about the categories we compute:
//! first -> count of matches
//! second -> category vector index
using TSizeSizePrList = std::list<TSizeSizePr>;
using TSizeSizePrListItr = TSizeSizePrList::iterator;

//! Add a match to an existing category
void addCategoryMatch(bool isDryRun,
const std::string& str,
std::size_t rawStringLen,
const TSizeSizePrVec& tokenIds,
const TSizeSizeMap& tokenUniqueIds,
TSizeSizePrListItr& iter);
TSizeSizePrVecItr& iter);

//! Given the total token weight in a vector and a threshold, what is
//! the minimum possible token weight in a different vector that could
@@ -304,7 +302,7 @@ class MODEL_EXPORT CTokenListDataCategorizerBase : public CDataCategorizer {

//! List of match count/index into category vector in descending order of
//! match count
TSizeSizePrList m_CategoriesByCount;
TSizeSizePrVec m_CategoriesByCount;

//! Used for looking up tokens to a unique ID
TTokenMIndex m_TokenIdLookup;
12 changes: 12 additions & 0 deletions include/model/ModelTypes.h
@@ -802,6 +802,18 @@ enum EMemoryStatus {
MODEL_EXPORT
std::string print(EMemoryStatus memoryStatus);

//! An enumeration of the TokenListDataCategorizer status -
//! Start in the OK state. Moves into the poor state if too
//! few categories are being seen frequently.
enum ECategorizationStatus {
E_CategorizationStatusOk = 0, //!< Categorization working as intended
E_CategorizationStatusPoor = 1 //!< Too many categories being created
};

//! Get a string description of \p categorizationStatus.
MODEL_EXPORT
std::string print(ECategorizationStatus categorizationStatus);
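
The diff adds only the declaration; a plausible implementation might look like the sketch below. The "poor" string matches what the unit test later in this PR expects, while "ok", the fallback, and the enclosing ml namespace are assumptions by analogy with print(EMemoryStatus):

#include <model/ModelTypes.h>

#include <string>

namespace ml {
namespace model_t {

// Sketch only: map the enum to the lowercase strings used in the JSON output.
std::string print(ECategorizationStatus categorizationStatus) {
    switch (categorizationStatus) {
    case E_CategorizationStatusOk:
        return "ok";
    case E_CategorizationStatusPoor:
        return "poor";
    }
    return "-"; // assumed fallback for an out-of-range value
}
}
}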

//! Styles of probability aggregation available:
//! -# AggregatePeople: the style used to aggregate results for distinct
//! values of the over and partition field.
48 changes: 36 additions & 12 deletions lib/api/CModelSizeStatsJsonWriter.cc
@@ -13,18 +13,24 @@ namespace api {
namespace {

// JSON field names
const std::string JOB_ID("job_id");
const std::string MODEL_SIZE_STATS("model_size_stats");
const std::string MODEL_BYTES("model_bytes");
const std::string MODEL_BYTES_EXCEEDED("model_bytes_exceeded");
const std::string MODEL_BYTES_MEMORY_LIMIT("model_bytes_memory_limit");
const std::string TOTAL_BY_FIELD_COUNT("total_by_field_count");
const std::string TOTAL_OVER_FIELD_COUNT("total_over_field_count");
const std::string TOTAL_PARTITION_FIELD_COUNT("total_partition_field_count");
const std::string BUCKET_ALLOCATION_FAILURES_COUNT("bucket_allocation_failures_count");
const std::string MEMORY_STATUS("memory_status");
const std::string TIMESTAMP("timestamp");
const std::string LOG_TIME("log_time");
const std::string JOB_ID{"job_id"};
const std::string MODEL_SIZE_STATS{"model_size_stats"};
const std::string MODEL_BYTES{"model_bytes"};
const std::string MODEL_BYTES_EXCEEDED{"model_bytes_exceeded"};
const std::string MODEL_BYTES_MEMORY_LIMIT{"model_bytes_memory_limit"};
const std::string TOTAL_BY_FIELD_COUNT{"total_by_field_count"};
const std::string TOTAL_OVER_FIELD_COUNT{"total_over_field_count"};
const std::string TOTAL_PARTITION_FIELD_COUNT{"total_partition_field_count"};
const std::string BUCKET_ALLOCATION_FAILURES_COUNT{"bucket_allocation_failures_count"};
const std::string MEMORY_STATUS{"memory_status"};
const std::string CATEGORIZED_DOC_COUNT{"categorized_doc_count"};
const std::string TOTAL_CATEGORY_COUNT{"total_category_count"};
const std::string FREQUENT_CATEGORY_COUNT{"frequent_category_count"};
const std::string RARE_CATEGORY_COUNT{"rare_category_count"};
const std::string DEAD_CATEGORY_COUNT{"dead_category_count"};
const std::string CATEGORIZATION_STATUS{"categorization_status"};
const std::string TIMESTAMP{"timestamp"};
const std::string LOG_TIME{"log_time"};
}

void CModelSizeStatsJsonWriter::write(const std::string& jobId,
@@ -60,6 +66,24 @@ void CModelSizeStatsJsonWriter::write(const std::string& jobId,
writer.String(MEMORY_STATUS);
writer.String(print(results.s_MemoryStatus));

writer.String(CATEGORIZED_DOC_COUNT);
writer.Uint64(results.s_CategorizedMessages);

writer.String(TOTAL_CATEGORY_COUNT);
writer.Uint64(results.s_TotalCategories);

writer.String(FREQUENT_CATEGORY_COUNT);
writer.Uint64(results.s_FrequentCategories);

writer.String(RARE_CATEGORY_COUNT);
writer.Uint64(results.s_RareCategories);

writer.String(DEAD_CATEGORY_COUNT);
writer.Uint64(results.s_DeadCategories);

writer.String(CATEGORIZATION_STATUS);
writer.String(print(results.s_CategorizationStatus));

writer.String(TIMESTAMP);
writer.Time(results.s_BucketStartTime);

28 changes: 23 additions & 5 deletions lib/api/unittest/CModelSnapshotJsonWriterTest.cc
@@ -37,8 +37,13 @@ BOOST_AUTO_TEST_CASE(testWrite) {
model_t::E_MemoryStatusOk, // memory status
core_t::TTime(1521046309), // bucket start time
0, // model bytes exceeded
50000 // model bytes memory limit
};
50000, // model bytes memory limit
1000, // categorized messages
100, // total categories
7, // frequent categories
13, // rare categories
2, // dead categories
model_t::E_CategorizationStatusPoor};

CModelSnapshotJsonWriter::SModelSnapshotReport report{
"6.3.0",
Expand Down Expand Up @@ -110,14 +115,27 @@ BOOST_AUTO_TEST_CASE(testWrite) {
BOOST_TEST_REQUIRE(modelSizeStats.HasMember("memory_status"));
BOOST_REQUIRE_EQUAL(std::string("ok"),
std::string(modelSizeStats["memory_status"].GetString()));
BOOST_TEST_REQUIRE(modelSizeStats.HasMember("timestamp"));
BOOST_REQUIRE_EQUAL(int64_t(1521046309000), modelSizeStats["timestamp"].GetInt64());
BOOST_TEST_REQUIRE(modelSizeStats.HasMember("log_time"));
BOOST_TEST_REQUIRE(modelSizeStats.HasMember("model_bytes_exceeded"));
BOOST_REQUIRE_EQUAL(int64_t(0), modelSizeStats["model_bytes_exceeded"].GetInt64());
BOOST_TEST_REQUIRE(modelSizeStats.HasMember("model_bytes_memory_limit"));
BOOST_REQUIRE_EQUAL(int64_t(50000),
modelSizeStats["model_bytes_memory_limit"].GetInt64());
BOOST_TEST_REQUIRE(modelSizeStats.HasMember("categorized_doc_count"));
BOOST_REQUIRE_EQUAL(int64_t(1000), modelSizeStats["categorized_doc_count"].GetInt64());
Contributor:
nit: maybe include <cstdint> and use std::int64_t, based on our guidelines.

Contributor Author:
This file was also dodgy before my changes because many expected results should have been unsigned rather than signed. I copied that bug into my new lines, but I'll correct that throughout the file at the same time as adding std::.

BOOST_TEST_REQUIRE(modelSizeStats.HasMember("total_category_count"));
BOOST_REQUIRE_EQUAL(int64_t(100), modelSizeStats["total_category_count"].GetInt64());
BOOST_TEST_REQUIRE(modelSizeStats.HasMember("frequent_category_count"));
BOOST_REQUIRE_EQUAL(int64_t(7), modelSizeStats["frequent_category_count"].GetInt64());
BOOST_TEST_REQUIRE(modelSizeStats.HasMember("rare_category_count"));
BOOST_REQUIRE_EQUAL(int64_t(13), modelSizeStats["rare_category_count"].GetInt64());
BOOST_TEST_REQUIRE(modelSizeStats.HasMember("dead_category_count"));
BOOST_REQUIRE_EQUAL(int64_t(2), modelSizeStats["dead_category_count"].GetInt64());
BOOST_TEST_REQUIRE(modelSizeStats.HasMember("memory_status"));
BOOST_REQUIRE_EQUAL(std::string("poor"),
std::string(modelSizeStats["categorization_status"].GetString()));
BOOST_TEST_REQUIRE(modelSizeStats.HasMember("timestamp"));
BOOST_REQUIRE_EQUAL(int64_t(1521046309000), modelSizeStats["timestamp"].GetInt64());
BOOST_TEST_REQUIRE(modelSizeStats.HasMember("log_time"));

BOOST_TEST_REQUIRE(snapshot.HasMember("quantiles"));
const rapidjson::Value& quantiles = snapshot["quantiles"];
20 changes: 20 additions & 0 deletions lib/model/CResourceMonitor.cc
@@ -272,10 +272,30 @@ CResourceMonitor::SResults CResourceMonitor::createMemoryUsageReport(core_t::TTi
res.s_AllocationFailures = 0;
res.s_MemoryStatus = m_MemoryStatus;
res.s_BucketStartTime = bucketStartTime;
res.s_CategorizedMessages = 0;
res.s_TotalCategories = 0;
res.s_FrequentCategories = 0;
res.s_RareCategories = 0;
res.s_DeadCategories = 0;
res.s_CategorizationStatus = model_t::E_CategorizationStatusOk;
Contributor:
I'd make all these defaults in the class definition.

for (const auto& resource : m_Resources) {
resource.first->updateMemoryResults(res);
}
res.s_AllocationFailures += m_AllocationFailures.size();
// Categorization status is poor if:
// - At least 100 messages have been categorized
// and one of the following holds:
// - There is only 1 category
// - More than 90% of categories have 1 message
Contributor:
Suggested change
// - More than 90% of categories have 1 message
// - More than 90% of categories are rare

// - The number of categories is greater than 50% of the number of categorized messages
// - There are no frequent match categories
// - More than 50% of categories are dead
if (res.s_CategorizedMessages > 100 &&
(res.s_TotalCategories == 1 || 10 * res.s_RareCategories > 9 * res.s_TotalCategories ||
2 * res.s_TotalCategories > res.s_CategorizedMessages ||
res.s_FrequentCategories == 0 || 2 * res.s_DeadCategories > res.s_TotalCategories)) {
res.s_CategorizationStatus = model_t::E_CategorizationStatusPoor;
Contributor:
This feels like an odd place for this logic: could it perhaps be better encapsulated. Maybe a static member on some class related to categorisation.

Contributor Author:
Come to think of it, I can just put this in CTokenListDataCategorizerBase::updateModelSizeStats(). I was thinking we'd only want to do this calculation once if we start doing per-partition categorization, but since the stats it's based on will accumulate over the partitions, the last partition we calculate the status for would use the overall numbers, and that's the value that would get reported. The cost is evaluating some five simple comparisons per partition instead of once, but that won't take long, and I agree the encapsulation is much better with this formula moved into the categorization code.

(Plus, of course, while we're not doing per-partition categorization the cost concerns are all theoretical anyway, as we only have one categorizer object.)
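
A hedged sketch of what that encapsulation might look like, written as a hypothetical standalone helper (the name, signature, and eventual home are assumptions; the thresholds mirror the diff above):

#include <cstddef>

#include <model/ModelTypes.h>

// Hypothetical helper: status is poor once more than 100 messages have been
// categorized and any one of the warning conditions from the comment above holds.
ml::model_t::ECategorizationStatus
calculateCategorizationStatusSketch(std::size_t categorizedMessages,
                                    std::size_t totalCategories,
                                    std::size_t frequentCategories,
                                    std::size_t rareCategories,
                                    std::size_t deadCategories) {
    if (categorizedMessages > 100 &&
        (totalCategories == 1 || 10 * rareCategories > 9 * totalCategories ||
         2 * totalCategories > categorizedMessages || frequentCategories == 0 ||
         2 * deadCategories > totalCategories)) {
        return ml::model_t::E_CategorizationStatusPoor;
    }
    return ml::model_t::E_CategorizationStatusOk;
}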

}
return res;
}

43 changes: 9 additions & 34 deletions lib/model/CTokenListCategory.cc
@@ -411,52 +411,27 @@ std::size_t CTokenListCategory::missingCommonTokenWeight(const TSizeSizeMap& uni
return m_CommonUniqueTokenWeight - presentWeight;
}

bool CTokenListCategory::isMissingCommonTokenWeightZero(const TSizeSizeMap& uniqueTokenIds) const {
// This method could be implemented as:
// return this->missingCommonTokenWeight(uniqueTokenIds) == 0;
//
// However, it's much faster to return false as soon as a mismatch occurs
bool CTokenListCategory::containsCommonInOrderTokensInOrder(const TSizeSizePrVec& tokenIds) const {

auto commonIter = m_CommonUniqueTokenIds.begin();
auto testIter = uniqueTokenIds.begin();
while (commonIter != m_CommonUniqueTokenIds.end() &&
testIter != uniqueTokenIds.end()) {
if (commonIter->first < testIter->first) {
return false;
}

if (commonIter->first == testIter->first) {
// The tokens must appear the same number of times in the two
// strings
if (commonIter->second != testIter->second) {
return false;
}
++commonIter;
}

++testIter;
}

return commonIter == m_CommonUniqueTokenIds.end();
}

bool CTokenListCategory::containsCommonTokensInOrder(const TSizeSizePrVec& tokenIds) const {
auto testIter = tokenIds.begin();
for (auto baseTokenId : m_BaseTokenIds) {
for (std::size_t index = m_OrderedCommonTokenBeginIndex;
index < m_OrderedCommonTokenEndIndex; ++index) {
std::size_t baseTokenId{m_BaseTokenIds[index].first};

// Ignore tokens that are not in the common unique tokens
if (this->isTokenCommon(baseTokenId.first) == false) {
if (this->isTokenCommon(baseTokenId) == false) {
continue;
}

// Skip tokens in the test tokens until we find one that matches the
// base token. If we reach the end of the test tokens whilst doing
// this, it means the test tokens don't contain the base tokens in the
// correct order.
// this, it means the test tokens don't contain the common ordered base
// tokens in the correct order.
do {
if (testIter == tokenIds.end()) {
return false;
}
} while ((testIter++)->first != baseTokenId.first);
} while ((testIter++)->first != baseTokenId);
}

return true;