diff --git a/src/algo_utils.hpp b/src/algo_utils.hpp
index dfad465..430a481 100644
--- a/src/algo_utils.hpp
+++ b/src/algo_utils.hpp
@@ -2,15 +2,18 @@
 
 #include <algorithm>
+#include <cstring> // memmove
 #include <charconv>
+#include <set>
+#include <unordered_set>
 #include <vector>
 
+#include "likely_unlikely.hpp"
+
 namespace choose {
 
 template <typename ExecutionPolicy, typename it, typename Comp>
 void stable_partial_sort(ExecutionPolicy&& policy, it begin, it middle, it end, Comp comp) {
-  static_assert(std::is_same_v<typename std::iterator_traits<it>::iterator_category, //
-                               std::random_access_iterator_tag>); //
   // adapted from https://stackoverflow.com/a/27248519/15534181
   std::vector<size_t> sorted;
   sorted.resize(end - begin);
@@ -43,6 +46,98 @@ void stable_partial_sort(ExecutionPolicy&& policy, it begin, it middle, it end,
   }
 }
 
+// only remembers last n elements
+template <typename Key, typename Compare = std::less<Key>, typename Allocator = std::allocator<Key>, bool unordered = false>
+class ForgetfulSet {
+  std::set<Key, Compare, Allocator> s;
+  const size_t n; // cap for iters
+  std::vector<typename std::set<Key, Compare, Allocator>::iterator> iters; // point within s
+ public:
+  ForgetfulSet(const Compare& comp, size_t n) : s(comp), n(n == 0 ? 1 : n) {
+    // ^ n must be positive.
+    // given the context where it is constructed, arg n is never 0 anyways. but this is for safety.
+    // required for precondition below
+    iters.reserve(this->n);
+  }
+
+  void clear() {
+    s.clear();
+    iters.clear();
+  }
+
+  auto insert(Key k) {
+    auto ret = this->s.insert(k);
+
+    if (likely(this->s.size() > this->n)) {
+      // precondition this->iters not empty
+
+      // element was inserted AND it's now exceeding capacity
+      this->s.erase(*this->iters.begin());
+
+      // treating this->iters like a fixed size array.
+      // erase first element and push_back
+      std::memmove(this->iters.data(), this->iters.data() + 1, (this->iters.size() - 1) * sizeof(typename decltype(iters)::value_type));
+      *this->iters.rbegin() = ret.first;
+      return ret;
+    }
+
+    if (ret.second) {
+      this->iters.push_back(ret.first);
+    }
+
+    return ret;
+  }
+};
+
+// only remembers last n elements.
+// largely copy paste from ForgetfulSet.
+template <typename Key, typename Hash = std::hash<Key>, typename KeyEqual = std::equal_to<Key>, typename Allocator = std::allocator<Key>>
+class ForgetfulUnorderedSet {
+  std::unordered_set<Key, Hash, KeyEqual, Allocator> s;
+  const size_t n; // cap for iters
+  std::vector<typename std::unordered_set<Key, Hash, KeyEqual, Allocator>::iterator> iters; // point within s
+
+ public:
+  ForgetfulUnorderedSet(const Hash& hash, const KeyEqual key_equal, float load_factor, size_t n) : s(0, hash, key_equal), n(n == 0 ? 1 : n) {
+    // ^ n must be positive.
+    // given the context where it is constructed, arg n is never 0 anyways. but this is for safety.
+    // required for precondition below
+    iters.reserve(this->n);
+
+    s.max_load_factor(load_factor);
+    // prevent rehashing by allocating a large enough bucket size. required to prevent iters invalidation
+    s.reserve(this->n);
+  }
+
+  void clear() {
+    s.clear();
+    iters.clear();
+  }
+
+  auto insert(Key k) {
+    auto ret = this->s.insert(k);
+
+    if (likely(this->s.size() > this->n)) {
+      // precondition this->iters not empty
+
+      // element was inserted AND it's now exceeding capacity
+      this->s.erase(*this->iters.begin());
+
+      // treating this->iters like a fixed size array.
+      // erase first element and push_back
+      std::memmove(this->iters.data(), this->iters.data() + 1, (this->iters.size() - 1) * sizeof(typename decltype(iters)::value_type));
+      *this->iters.rbegin() = ret.first;
+      return ret;
+    }
+
+    if (ret.second) {
+      this->iters.push_back(ret.first);
+    }
+
+    return ret;
+  }
+};
+
 bool general_numeric_compare(const char* lhs_begin, const char* lhs_end, const char* rhs_begin, const char* rhs_end) {
   // float lhs, rhs;
   // if from_chars isn't found, get a newer compiler. e.g.
@@ -96,11 +191,9 @@ size_t general_numeric_hash(const char* begin, const char* end) {
 // implementation is based on c strings, versus here ranges were used. so this
 // didn't exist yet.
 
-// leveraged under the following assumptions:
+// likely and unlikely are leveraged under the following assumptions:
 //  - end of string has not been reached
 //  - character frequency. e.g. obtaining any non-zero digit is more likely than zero digit (8/9 vs 1/9)
-#define likely(x) __builtin_expect(!!(x), 1)
-#define unlikely(x) __builtin_expect(!!(x), 0)
 
 namespace {
diff --git a/src/args.hpp b/src/args.hpp
index 49104a9..e83b920 100644
--- a/src/args.hpp
+++ b/src/args.hpp
@@ -48,6 +48,8 @@ struct Arguments {
   float unique_load_factor = UNIQUE_LOAD_FACTOR_DEFAULT;
   bool unique_consecutive = false; // after sorting uniqueness
 
+  size_t unique_limit = 0; // 0 indicates unused
+
   bool flip = false;
   bool flush = false;
   bool multiple_selections = false;
@@ -385,11 +387,14 @@ void print_help_message() {
      "        consecutive duplicate elements. requires --sort. ignored by\n"
      "        truncation --out/--tail (use normal -u in these cases instead)\n"
      "--unique-numeric\n"
-     "        apply uniqueness numerically. implies --unique\n"
+     "        apply uniqueness numerically. implies -u\n"
      "--unique-general-numeric\n"
-     "        apply uniqueness general numerically. implies --unique\n"
+     "        apply uniqueness general numerically. implies -u\n"
+     "--unique-limit [<#tokens>]\n"
+     "        implies -u. checks uniqueness against only the most recent\n"
+     "        distinct tokens. older tokens are forgotten\n"
      "--unique-use-set\n"
-     "        apply uniqueness with a tree instead of a hash table\n"
+     "        implies -u. apply uniqueness with a tree instead of a hash table\n"
      "--use-delimiter\n"
      "        don't ignore a delimiter at the end of the input\n"
      "--utf\n"
@@ -502,6 +507,7 @@ Arguments handle_args(int argc, char* const* argv, FILE* input = NULL, FILE* out
       {"load-factor", required_argument, NULL, 0},
       {"locale", required_argument, NULL, 0},
       {"replace", required_argument, NULL, 0},
+      {"unique-limit", required_argument, NULL, 0},
       {"head", optional_argument, NULL, 0},
       {"index", optional_argument, NULL, 0},
       {"out", optional_argument, NULL, 0},
@@ -685,6 +691,9 @@ Arguments handle_args(int argc, char* const* argv, FILE* input = NULL, FILE* out
           }
         }
         uncompiled_output.ordered_ops.push_back(uncompiled::UncompiledReplaceOp(optarg));
+      } else if (strcmp("unique-limit", name) == 0) {
+        ret.unique_limit = num::parse_number(on_num_err, optarg, false);
+        ret.unique = true;
       } else if (strcmp("sub", name) == 0 || strcmp("substitute", name) == 0) {
         // special handing here since getopt doesn't normally support multiple arguments
         if (optind >= argc) {
@@ -773,6 +782,7 @@ Arguments handle_args(int argc, char* const* argv, FILE* input = NULL, FILE* out
       } else if (strcmp("index", name) == 0) {
         index_handler(false);
       } else if (strcmp("unique-use-set", name) == 0) {
+        ret.unique = true;
        ret.unique_use_set = true;
       } else if (strcmp("use-delimiter", name) == 0) {
        ret.use_input_delimiter = true;
diff --git a/src/likely_unlikely.hpp b/src/likely_unlikely.hpp
new file mode 100644
index 0000000..6d3a530
--- /dev/null
+++ b/src/likely_unlikely.hpp
@@ -0,0 +1,4 @@
+#pragma once
+
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
diff --git a/src/string_utils.hpp b/src/string_utils.hpp
index 3a11d11..197af0b 100644
--- a/src/string_utils.hpp
+++ b/src/string_utils.hpp
@@ -11,6 +11,8 @@
 #include <stdexcept>
 #include <vector>
 
+#include "likely_unlikely.hpp"
+
 namespace choose {
 
 namespace str {
@@ -299,9 +301,10 @@ struct QueuedOutput {
   }
 };
 
+// block until n bytes have been made available in the file, or EOF
 size_t get_bytes(FILE* f, size_t n, char* out) {
   size_t read_ret = fread(out, sizeof(char), n, f);
-  if (read_ret == 0) {
+  if (unlikely(read_ret == 0)) {
     if (feof(f)) {
       return read_ret;
     } else if (ferror(f)) {
@@ -312,9 +315,10 @@ size_t get_bytes(FILE* f, size_t n, char* out) {
   return read_ret;
 }
 
+// returns once any positive number of bytes are made available in the file, or EOF
 size_t get_bytes_unbuffered(int fileno, size_t n, char* out) {
   ssize_t read_ret = read(fileno, out, n);
-  if (read_ret == -1) {
+  if (unlikely(read_ret == -1)) {
     const char* err_string = strerror(errno);
     throw std::runtime_error(err_string);
   }
diff --git a/src/test.cpp b/src/test.cpp
index 3dd78d4..5b7d922 100644
--- a/src/test.cpp
+++ b/src/test.cpp
@@ -554,6 +554,18 @@ BOOST_AUTO_TEST_CASE(general_numeric_unique_with_parse_failure) {
   BOOST_REQUIRE_EQUAL(out, correct_output);
 }
 
+BOOST_AUTO_TEST_CASE(unique_limit_set) {
+  choose_output out = run_choose("1\n1\n2\n2\n3\n3\n1\n2\n3\n4\n1\n1\n4\n4\n3\n3", {"--tui", "--unique-use-set", "--unique-limit", "3"});
+  choose_output correct_output{std::vector<choose::Token>{"1", "2", "3", "4", "1"}};
+  BOOST_REQUIRE_EQUAL(out, correct_output);
+}
+
+BOOST_AUTO_TEST_CASE(unique_limit) {
+  choose_output out = run_choose("1\n1\n2\n2\n3\n3\n1\n2\n3\n4\n1\n1\n4\n4\n3\n3", {"--tui", "--unique-limit", "3"});
+  choose_output correct_output{std::vector<choose::Token>{"1", "2", "3", "4", "1"}};
+  BOOST_REQUIRE_EQUAL(out, correct_output);
+}
+
 BOOST_AUTO_TEST_CASE(numeric_unique_use_set) {
   choose_output out = run_choose("-0\n0\n.0\n1\n1.0\n0001.0", {"--unique-numeric", "--unique-use-set"});
   choose_output correct_output{to_vec("-0\n1\n")};
diff --git a/src/token.hpp b/src/token.hpp
index 93b7bb9..b84db61 100644
--- a/src/token.hpp
+++ b/src/token.hpp
@@ -238,7 +238,6 @@ std::vector<Token> create_tokens(choose::Arguments& args) {
   const bool tail = args.tail;
   const bool unique = args.unique;
-  const bool unique_use_set = args.unique_use_set;
   const Comparison unique_type = args.unique_type;
   const bool sort = args.sort;
   const Comparison sort_type = args.sort_type;
 
@@ -346,18 +345,31 @@ std::vector<Token> create_tokens(choose::Arguments& args) {
     }
   };
 
-  using uniqueness_set_T = std::set<indirect, decltype(uniqueness_set_comparison)>;
   using unordered_uniqueness_set_T = std::unordered_set<indirect, decltype(unordered_set_hash), decltype(unordered_set_equals)>;
-  using unique_checker_T = std::variant<std::monostate, unordered_uniqueness_set_T, uniqueness_set_T>;
+  using unordered_uniqueness_limit_set_T = ForgetfulUnorderedSet<indirect, decltype(unordered_set_hash), decltype(unordered_set_equals)>;
+  using uniqueness_set_T = std::set<indirect, decltype(uniqueness_set_comparison)>;
+  using uniqueness_limit_set_T = ForgetfulSet<indirect, decltype(uniqueness_set_comparison)>;
+  using unique_checker_T = std::variant<std::monostate, unordered_uniqueness_set_T, unordered_uniqueness_limit_set_T, uniqueness_set_T, uniqueness_limit_set_T>;
 
   unique_checker_T unique_checker = [&]() -> unique_checker_T {
     if (unique) {
-      if (unique_use_set) {
-        return unique_checker_T(uniqueness_set_T(uniqueness_set_comparison));
+      if (args.unique_use_set) {
+        if (args.unique_limit == 0) {
+          return unique_checker_T(uniqueness_set_T(uniqueness_set_comparison));
+        } else {
+          return unique_checker_T(uniqueness_limit_set_T(uniqueness_set_comparison, args.unique_limit));
+        }
       } else {
-        auto s = unordered_uniqueness_set_T(8, unordered_set_hash, unordered_set_equals);
-        s.max_load_factor(args.unique_load_factor);
-        return unique_checker_T(std::move(s));
+        if (args.unique_limit == 0) {
+          auto s = unordered_uniqueness_set_T(8, unordered_set_hash, unordered_set_equals);
+          s.max_load_factor(args.unique_load_factor);
+          return unique_checker_T(std::move(s));
+        } else {
+          return unique_checker_T(unordered_uniqueness_limit_set_T(unordered_set_hash, //
+                                                                   unordered_set_equals, //
+                                                                   args.unique_load_factor, //
+                                                                   args.unique_limit));
+        }
       }
     } else {
       return unique_checker_T();
@@ -366,10 +378,14 @@ std::vector<Token> create_tokens(choose::Arguments& args) {
 
   // returns true if output[elem] is unique. requires unique == true
   auto uniqueness_check = [&](indirect elem) -> bool { //
-    if (unordered_uniqueness_set_T* set = std::get_if<unordered_uniqueness_set_T>(&unique_checker)) {
+    if (unordered_uniqueness_set_T* unordered_set = std::get_if<unordered_uniqueness_set_T>(&unique_checker)) {
+      return unordered_set->insert(elem).second;
+    } else if (unordered_uniqueness_limit_set_T* unordered_set_limit = std::get_if<unordered_uniqueness_limit_set_T>(&unique_checker)) {
+      return unordered_set_limit->insert(elem).second;
+    } else if (uniqueness_set_T* set = std::get_if<uniqueness_set_T>(&unique_checker)) {
       return set->insert(elem).second;
     } else {
-      return std::get<uniqueness_set_T>(unique_checker).insert(elem).second;
+      return std::get<uniqueness_limit_set_T>(unique_checker).insert(elem).second;
     }
   };
 
@@ -806,8 +822,12 @@ std::vector<Token> create_tokens(choose::Arguments& args) {
 
         if (unordered_uniqueness_set_T* uniqueness_unordered_set = std::get_if<unordered_uniqueness_set_T>(&unique_checker)) {
           uniqueness_unordered_set->clear();
+        } else if (unordered_uniqueness_limit_set_T* uniqueness_limit_unordered_set = std::get_if<unordered_uniqueness_limit_set_T>(&unique_checker)) {
+          uniqueness_limit_unordered_set->clear();
         } else if (uniqueness_set_T* set = std::get_if<uniqueness_set_T>(&unique_checker)) {
           set->clear();
+        } else if (uniqueness_limit_set_T* set_limit = std::get_if<uniqueness_limit_set_T>(&unique_checker)) {
+          set_limit->clear();
         }
 
         if (!args.out_start && !args.out_end) {