Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Unique limit #40

Merged
merged 4 commits into from
Sep 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 98 additions & 5 deletions src/algo_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,18 @@

#include <algorithm>
#include <charconv>
#include <cstring> // memmove
#include <execution>
#include <set>
#include <unordered_set>
#include <vector>

#include "likely_unlikely.hpp"

namespace choose {

template <typename ExecutionPolicy, typename it, typename Comp>
void stable_partial_sort(ExecutionPolicy&& policy, it begin, it middle, it end, Comp comp) {
static_assert(std::is_same_v<typename std::iterator_traits<it>::iterator_category, //
std::random_access_iterator_tag>);
// adapted from https://stackoverflow.com/a/27248519/15534181
std::vector<it> sorted;
sorted.resize(end - begin);
Expand Down Expand Up @@ -43,6 +46,98 @@ void stable_partial_sort(ExecutionPolicy&& policy, it begin, it middle, it end,
}
}

// An ordered set that only remembers the last n distinct elements inserted.
// Once more than n distinct elements have been inserted, the oldest one is
// evicted (FIFO by insertion order). The unused `unordered` template
// parameter is kept for interface compatibility with existing callers.
template <typename Key, typename Compare = std::less<Key>, typename Allocator = std::allocator<Key>, bool unordered = false>
class ForgetfulSet {
  std::set<Key, Compare, Allocator> s;
  const size_t n;                                    // cap for iters
  std::vector<typename decltype(s)::iterator> iters; // point within s, oldest insertion first

 public:
  ForgetfulSet(const Compare& comp, size_t n) : s(comp), n(n == 0 ? 1 : n) {
    // ^ n must be positive.
    // given the context where it is constructed, arg n is never 0 anyways. but this is for safety.
    // required for precondition below
    iters.reserve(this->n);
  }

  void clear() {
    s.clear();
    iters.clear();
  }

  // Inserts k. If this pushes the set past its capacity of n distinct
  // elements, the oldest remembered element is erased first-in-first-out.
  // Returns the pair from std::set::insert (iterator, inserted?).
  auto insert(Key k) {
    auto ret = this->s.insert(k);

    if (likely(this->s.size() > this->n)) {
      // precondition this->iters not empty

      // element was inserted AND it's now exceeding capacity;
      // evict the oldest remembered element
      this->s.erase(this->iters.front());

      // treating this->iters like a fixed size queue: drop the oldest slot and
      // append the new iterator. std::rotate is used instead of std::memmove
      // because std::set iterators are not guaranteed to be trivially
      // copyable, making memmove on them formally undefined behavior.
      std::rotate(this->iters.begin(), this->iters.begin() + 1, this->iters.end());
      this->iters.back() = ret.first;
      return ret;
    }

    if (ret.second) {
      this->iters.push_back(ret.first);
    }

    return ret;
  }
};

// An unordered set that only remembers the last n distinct elements inserted.
// Once more than n distinct elements have been inserted, the oldest one is
// evicted (FIFO by insertion order).
// largely copy paste from ForgetfulSet.
template <typename Key, typename Hash = std::hash<Key>, typename KeyEqual = std::equal_to<Key>, typename Allocator = std::allocator<Key>>
class ForgetfulUnorderedSet {
  std::unordered_set<Key, Hash, KeyEqual, Allocator> s;
  const size_t n;                                    // cap for iters
  std::vector<typename decltype(s)::iterator> iters; // point within s, oldest insertion first

 public:
  ForgetfulUnorderedSet(const Hash& hash, const KeyEqual key_equal, float load_factor, size_t n) : s(0, hash, key_equal), n(n == 0 ? 1 : n) {
    // ^ n must be positive.
    // given the context where it is constructed, arg n is never 0 anyways. but this is for safety.
    // required for precondition below
    iters.reserve(this->n);

    s.max_load_factor(load_factor);
    // prevent rehashing by allocating a large enough bucket size. required to prevent iters invalidation
    s.reserve(this->n);
  }

  void clear() {
    s.clear();
    iters.clear();
  }

  // Inserts k. If this pushes the set past its capacity of n distinct
  // elements, the oldest remembered element is erased first-in-first-out.
  // Returns the pair from std::unordered_set::insert (iterator, inserted?).
  auto insert(Key k) {
    auto ret = this->s.insert(k);

    if (likely(this->s.size() > this->n)) {
      // precondition this->iters not empty

      // element was inserted AND it's now exceeding capacity;
      // evict the oldest remembered element
      this->s.erase(this->iters.front());

      // treating this->iters like a fixed size queue: drop the oldest slot and
      // append the new iterator. std::rotate is used instead of std::memmove
      // because std::unordered_set iterators are not guaranteed to be
      // trivially copyable, making memmove on them formally undefined behavior.
      std::rotate(this->iters.begin(), this->iters.begin() + 1, this->iters.end());
      this->iters.back() = ret.first;
      return ret;
    }

    if (ret.second) {
      this->iters.push_back(ret.first);
    }

    return ret;
  }
};

bool general_numeric_compare(const char* lhs_begin, const char* lhs_end, const char* rhs_begin, const char* rhs_end) { //
float lhs, rhs;
// if from_chars isn't found, get a newer compiler. e.g.
Expand Down Expand Up @@ -96,11 +191,9 @@ size_t general_numeric_hash(const char* begin, const char* end) {
// implementation is based on c strings, versus here ranges were used. so this
// didn't exist yet.

// leveraged under the following assumptions:
// likely and unlikely are leveraged under the following assumptions:
// - end of string has not been reached
// - character frequency. e.g. obtaining any non-zero digit is more likely than zero digit (8/9 vs 1/9)
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

namespace {

Expand Down
16 changes: 13 additions & 3 deletions src/args.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ struct Arguments {
float unique_load_factor = UNIQUE_LOAD_FACTOR_DEFAULT;
bool unique_consecutive = false; // after sorting uniqueness

size_t unique_limit = 0; // 0 indicates unused

bool flip = false;
bool flush = false;
bool multiple_selections = false;
Expand Down Expand Up @@ -385,11 +387,14 @@ void print_help_message() {
" consecutive duplicate elements. requires --sort. ignored by\n"
" truncation --out/--tail (use normal -u in these cases instead)\n"
" --unique-numeric\n"
" apply uniqueness numerically. implies --unique\n"
" apply uniqueness numerically. implies -u\n"
" --unique-general-numeric\n"
" apply uniqueness general numerically. implies --unique\n"
" apply uniqueness general numerically. implies -u\n"
" --unique-limit [<#tokens>]\n"
" implies -u. checks uniqueness against only the most recent\n"
" distinct tokens. older tokens are forgotten\n"
" --unique-use-set\n"
" apply uniqueness with a tree instead of a hash table\n"
" implies -u. apply uniqueness with a tree instead of a hash table\n"
" --use-delimiter\n"
" don't ignore a delimiter at the end of the input\n"
" --utf\n"
Expand Down Expand Up @@ -502,6 +507,7 @@ Arguments handle_args(int argc, char* const* argv, FILE* input = NULL, FILE* out
{"load-factor", required_argument, NULL, 0},
{"locale", required_argument, NULL, 0},
{"replace", required_argument, NULL, 0},
{"unique-limit", required_argument, NULL, 0},
{"head", optional_argument, NULL, 0},
{"index", optional_argument, NULL, 0},
{"out", optional_argument, NULL, 0},
Expand Down Expand Up @@ -685,6 +691,9 @@ Arguments handle_args(int argc, char* const* argv, FILE* input = NULL, FILE* out
}
}
uncompiled_output.ordered_ops.push_back(uncompiled::UncompiledReplaceOp(optarg));
} else if (strcmp("unique-limit", name) == 0) {
ret.unique_limit = num::parse_number<decltype(ret.unique_limit)>(on_num_err, optarg, false);
ret.unique = true;
} else if (strcmp("sub", name) == 0 || strcmp("substitute", name) == 0) {
// special handing here since getopt doesn't normally support multiple arguments
if (optind >= argc) {
Expand Down Expand Up @@ -773,6 +782,7 @@ Arguments handle_args(int argc, char* const* argv, FILE* input = NULL, FILE* out
} else if (strcmp("index", name) == 0) {
index_handler(false);
} else if (strcmp("unique-use-set", name) == 0) {
ret.unique = true;
ret.unique_use_set = true;
} else if (strcmp("use-delimiter", name) == 0) {
ret.use_input_delimiter = true;
Expand Down
4 changes: 4 additions & 0 deletions src/likely_unlikely.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#pragma once

// branch prediction hints (GCC/Clang __builtin_expect).
// !! normalizes the condition to 0 or 1 before passing it to the builtin.
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
8 changes: 6 additions & 2 deletions src/string_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
#include <stdexcept>
#include <vector>

#include "likely_unlikely.hpp"

namespace choose {

namespace str {
Expand Down Expand Up @@ -299,9 +301,10 @@ struct QueuedOutput {
}
};

// block until n bytes have been made available in the file, or EOF
size_t get_bytes(FILE* f, size_t n, char* out) {
size_t read_ret = fread(out, sizeof(char), n, f);
if (read_ret == 0) {
if (unlikely(read_ret == 0)) {
if (feof(f)) {
return read_ret;
} else if (ferror(f)) {
Expand All @@ -312,9 +315,10 @@ size_t get_bytes(FILE* f, size_t n, char* out) {
return read_ret;
}

// returns once any positive number of bytes are made available in the file, or EOF
size_t get_bytes_unbuffered(int fileno, size_t n, char* out) {
ssize_t read_ret = read(fileno, out, n);
if (read_ret == -1) {
if (unlikely(read_ret == -1)) {
const char* err_string = strerror(errno);
throw std::runtime_error(err_string);
}
Expand Down
12 changes: 12 additions & 0 deletions src/test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -554,6 +554,18 @@ BOOST_AUTO_TEST_CASE(general_numeric_unique_with_parse_failure) {
BOOST_REQUIRE_EQUAL(out, correct_output);
}

// --unique-limit 3 with the tree-based set (--unique-use-set): only the 3 most
// recently seen distinct tokens are remembered, so the trailing "1" is emitted
// again after "4" pushed it out of the window.
BOOST_AUTO_TEST_CASE(unique_limit_set) {
choose_output out = run_choose("1\n1\n2\n2\n3\n3\n1\n2\n3\n4\n1\n1\n4\n4\n3\n3", {"--tui", "--unique-use-set", "--unique-limit", "3"});
choose_output correct_output{std::vector<choose::Token>{"1", "2", "3", "4", "1"}};
BOOST_REQUIRE_EQUAL(out, correct_output);
}

// --unique-limit 3 with the default hash-based set: same input and expected
// output as unique_limit_set above, exercising the unordered code path.
BOOST_AUTO_TEST_CASE(unique_limit) {
choose_output out = run_choose("1\n1\n2\n2\n3\n3\n1\n2\n3\n4\n1\n1\n4\n4\n3\n3", {"--tui", "--unique-limit", "3"});
choose_output correct_output{std::vector<choose::Token>{"1", "2", "3", "4", "1"}};
BOOST_REQUIRE_EQUAL(out, correct_output);
}

BOOST_AUTO_TEST_CASE(numeric_unique_use_set) {
choose_output out = run_choose("-0\n0\n.0\n1\n1.0\n0001.0", {"--unique-numeric", "--unique-use-set"});
choose_output correct_output{to_vec("-0\n1\n")};
Expand Down
40 changes: 30 additions & 10 deletions src/token.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,6 @@ std::vector<Token> create_tokens(choose::Arguments& args) {
const bool tail = args.tail;

const bool unique = args.unique;
const bool unique_use_set = args.unique_use_set;
const Comparison unique_type = args.unique_type;
const bool sort = args.sort;
const Comparison sort_type = args.sort_type;
Expand Down Expand Up @@ -346,18 +345,31 @@ std::vector<Token> create_tokens(choose::Arguments& args) {
}
};

using uniqueness_set_T = std::set<indirect, decltype(uniqueness_set_comparison)>;
using unordered_uniqueness_set_T = std::unordered_set<indirect, decltype(unordered_set_hash), decltype(unordered_set_equals)>;
using unique_checker_T = std::variant<std::monostate, uniqueness_set_T, unordered_uniqueness_set_T>;
using unordered_uniqueness_limit_set_T = ForgetfulUnorderedSet<indirect, decltype(unordered_set_hash), decltype(unordered_set_equals)>;
using uniqueness_set_T = std::set<indirect, decltype(uniqueness_set_comparison)>;
using uniqueness_limit_set_T = ForgetfulSet<indirect, decltype(uniqueness_set_comparison)>;
using unique_checker_T = std::variant<std::monostate, unordered_uniqueness_set_T, unordered_uniqueness_limit_set_T, uniqueness_set_T, uniqueness_limit_set_T>;

unique_checker_T unique_checker = [&]() -> unique_checker_T {
if (unique) {
if (unique_use_set) {
return unique_checker_T(uniqueness_set_T(uniqueness_set_comparison));
if (args.unique_use_set) {
if (args.unique_limit == 0) {
return unique_checker_T(uniqueness_set_T(uniqueness_set_comparison));
} else {
return unique_checker_T(uniqueness_limit_set_T(uniqueness_set_comparison, args.unique_limit));
}
} else {
auto s = unordered_uniqueness_set_T(8, unordered_set_hash, unordered_set_equals);
s.max_load_factor(args.unique_load_factor);
return unique_checker_T(std::move(s));
if (args.unique_limit == 0) {
auto s = unordered_uniqueness_set_T(8, unordered_set_hash, unordered_set_equals);
s.max_load_factor(args.unique_load_factor);
return unique_checker_T(std::move(s));
} else {
return unique_checker_T(unordered_uniqueness_limit_set_T(unordered_set_hash, //
unordered_set_equals, //
args.unique_load_factor, //
args.unique_limit));
}
}
} else {
return unique_checker_T();
Expand All @@ -366,10 +378,14 @@ std::vector<Token> create_tokens(choose::Arguments& args) {

// returns true if output[elem] is unique. requires unique == true
auto uniqueness_check = [&](indirect elem) -> bool { //
if (unordered_uniqueness_set_T* set = std::get_if<unordered_uniqueness_set_T>(&unique_checker)) {
if (unordered_uniqueness_set_T* unordered_set = std::get_if<unordered_uniqueness_set_T>(&unique_checker)) {
return unordered_set->insert(elem).second;
} else if (unordered_uniqueness_limit_set_T* unordered_set_limit = std::get_if<unordered_uniqueness_limit_set_T>(&unique_checker)) {
return unordered_set_limit->insert(elem).second;
} else if (uniqueness_set_T* set = std::get_if<uniqueness_set_T>(&unique_checker)) {
return set->insert(elem).second;
} else {
return std::get<uniqueness_set_T>(unique_checker).insert(elem).second;
return std::get<uniqueness_limit_set_T>(unique_checker).insert(elem).second;
}
};

Expand Down Expand Up @@ -806,8 +822,12 @@ std::vector<Token> create_tokens(choose::Arguments& args) {

if (unordered_uniqueness_set_T* uniqueness_unordered_set = std::get_if<unordered_uniqueness_set_T>(&unique_checker)) {
uniqueness_unordered_set->clear();
} else if (unordered_uniqueness_limit_set_T* uniqueness_limit_unordered_set = std::get_if<unordered_uniqueness_limit_set_T>(&unique_checker)) {
uniqueness_limit_unordered_set->clear();
} else if (uniqueness_set_T* set = std::get_if<uniqueness_set_T>(&unique_checker)) {
set->clear();
} else if (uniqueness_limit_set_T* set_limit = std::get_if<uniqueness_limit_set_T>(&unique_checker)) {
set_limit->clear();
}

if (!args.out_start && !args.out_end) {
Expand Down