Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Unique limit #40

Merged
merged 4 commits into from
Sep 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 98 additions & 5 deletions src/algo_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,18 @@

#include <algorithm>
#include <charconv>
#include <cstring> // memmove
#include <execution>
#include <set>
#include <unordered_set>
#include <vector>

#include "likely_unlikely.hpp"

namespace choose {

template <typename ExecutionPolicy, typename it, typename Comp>
void stable_partial_sort(ExecutionPolicy&& policy, it begin, it middle, it end, Comp comp) {
static_assert(std::is_same_v<typename std::iterator_traits<it>::iterator_category, //
std::random_access_iterator_tag>);
// adapted from https://stackoverflow.com/a/27248519/15534181
std::vector<it> sorted;
sorted.resize(end - begin);
Expand Down Expand Up @@ -43,6 +46,98 @@ void stable_partial_sort(ExecutionPolicy&& policy, it begin, it middle, it end,
}
}

// An ordered set that only remembers the last n distinct elements inserted.
// Once more than n distinct elements have been inserted, the oldest one is
// evicted (FIFO by insertion order). The unused `unordered` template
// parameter is kept for interface compatibility with existing callers.
template <typename Key, typename Compare = std::less<Key>, typename Allocator = std::allocator<Key>, bool unordered = false>
class ForgetfulSet {
  std::set<Key, Compare, Allocator> s;
  const size_t n;                                    // cap for iters
  std::vector<typename decltype(s)::iterator> iters; // point within s, oldest insertion first

 public:
  ForgetfulSet(const Compare& comp, size_t n) : s(comp), n(n == 0 ? 1 : n) {
    // ^ n must be positive.
    // given the context where it is constructed, arg n is never 0 anyways. but this is for safety.
    // required for precondition below
    iters.reserve(this->n);
  }

  void clear() {
    s.clear();
    iters.clear();
  }

  // Inserts k. If this pushes the set past its capacity of n distinct
  // elements, the oldest remembered element is erased first-in-first-out.
  // Returns the pair from std::set::insert (iterator, inserted?).
  auto insert(Key k) {
    auto ret = this->s.insert(k);

    if (likely(this->s.size() > this->n)) {
      // precondition this->iters not empty

      // element was inserted AND it's now exceeding capacity;
      // evict the oldest remembered element
      this->s.erase(this->iters.front());

      // treating this->iters like a fixed size queue: drop the oldest slot and
      // append the new iterator. std::rotate is used instead of std::memmove
      // because std::set iterators are not guaranteed to be trivially
      // copyable, making memmove on them formally undefined behavior.
      std::rotate(this->iters.begin(), this->iters.begin() + 1, this->iters.end());
      this->iters.back() = ret.first;
      return ret;
    }

    if (ret.second) {
      this->iters.push_back(ret.first);
    }

    return ret;
  }
};

// An unordered set that only remembers the last n distinct elements inserted.
// Once more than n distinct elements have been inserted, the oldest one is
// evicted (FIFO by insertion order).
// largely copy paste from ForgetfulSet.
template <typename Key, typename Hash = std::hash<Key>, typename KeyEqual = std::equal_to<Key>, typename Allocator = std::allocator<Key>>
class ForgetfulUnorderedSet {
  std::unordered_set<Key, Hash, KeyEqual, Allocator> s;
  const size_t n;                                    // cap for iters
  std::vector<typename decltype(s)::iterator> iters; // point within s, oldest insertion first

 public:
  ForgetfulUnorderedSet(const Hash& hash, const KeyEqual key_equal, float load_factor, size_t n) : s(0, hash, key_equal), n(n == 0 ? 1 : n) {
    // ^ n must be positive.
    // given the context where it is constructed, arg n is never 0 anyways. but this is for safety.
    // required for precondition below
    iters.reserve(this->n);

    s.max_load_factor(load_factor);
    // prevent rehashing by allocating a large enough bucket size. required to prevent iters invalidation
    s.reserve(this->n);
  }

  void clear() {
    s.clear();
    iters.clear();
  }

  // Inserts k. If this pushes the set past its capacity of n distinct
  // elements, the oldest remembered element is erased first-in-first-out.
  // Returns the pair from std::unordered_set::insert (iterator, inserted?).
  auto insert(Key k) {
    auto ret = this->s.insert(k);

    if (likely(this->s.size() > this->n)) {
      // precondition this->iters not empty

      // element was inserted AND it's now exceeding capacity;
      // evict the oldest remembered element
      this->s.erase(this->iters.front());

      // treating this->iters like a fixed size queue: drop the oldest slot and
      // append the new iterator. std::rotate is used instead of std::memmove
      // because std::unordered_set iterators are not guaranteed to be
      // trivially copyable, making memmove on them formally undefined behavior.
      std::rotate(this->iters.begin(), this->iters.begin() + 1, this->iters.end());
      this->iters.back() = ret.first;
      return ret;
    }

    if (ret.second) {
      this->iters.push_back(ret.first);
    }

    return ret;
  }
};

bool general_numeric_compare(const char* lhs_begin, const char* lhs_end, const char* rhs_begin, const char* rhs_end) { //
float lhs, rhs;
// if from_chars isn't found, get a newer compiler. e.g.
Expand Down Expand Up @@ -96,11 +191,9 @@ size_t general_numeric_hash(const char* begin, const char* end) {
// implementation is based on c strings, versus here ranges were used. so this
// didn't exist yet.

// leveraged under the following assumptions:
// likely and unlikely are leveraged under the following assumptions:
// - end of string has not been reached
// - character frequency. e.g. obtaining any non-zero digit is more likely than zero digit (8/9 vs 1/9)
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

namespace {

Expand Down
16 changes: 13 additions & 3 deletions src/args.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ struct Arguments {
float unique_load_factor = UNIQUE_LOAD_FACTOR_DEFAULT;
bool unique_consecutive = false; // after sorting uniqueness

size_t unique_limit = 0; // 0 indicates unused

bool flip = false;
bool flush = false;
bool multiple_selections = false;
Expand Down Expand Up @@ -385,11 +387,14 @@ void print_help_message() {
" consecutive duplicate elements. requires --sort. ignored by\n"
" truncation --out/--tail (use normal -u in these cases instead)\n"
" --unique-numeric\n"
" apply uniqueness numerically. implies --unique\n"
" apply uniqueness numerically. implies -u\n"
" --unique-general-numeric\n"
" apply uniqueness general numerically. implies --unique\n"
" apply uniqueness general numerically. implies -u\n"
" --unique-limit [<#tokens>]\n"
" implies -u. checks uniqueness against only the most recent\n"
" distinct tokens. older tokens are forgotten\n"
" --unique-use-set\n"
" apply uniqueness with a tree instead of a hash table\n"
" implies -u. apply uniqueness with a tree instead of a hash table\n"
" --use-delimiter\n"
" don't ignore a delimiter at the end of the input\n"
" --utf\n"
Expand Down Expand Up @@ -502,6 +507,7 @@ Arguments handle_args(int argc, char* const* argv, FILE* input = NULL, FILE* out
{"load-factor", required_argument, NULL, 0},
{"locale", required_argument, NULL, 0},
{"replace", required_argument, NULL, 0},
{"unique-limit", required_argument, NULL, 0},
{"head", optional_argument, NULL, 0},
{"index", optional_argument, NULL, 0},
{"out", optional_argument, NULL, 0},
Expand Down Expand Up @@ -685,6 +691,9 @@ Arguments handle_args(int argc, char* const* argv, FILE* input = NULL, FILE* out
}
}
uncompiled_output.ordered_ops.push_back(uncompiled::UncompiledReplaceOp(optarg));
} else if (strcmp("unique-limit", name) == 0) {
ret.unique_limit = num::parse_number<decltype(ret.unique_limit)>(on_num_err, optarg, false);
ret.unique = true;
} else if (strcmp("sub", name) == 0 || strcmp("substitute", name) == 0) {
// special handing here since getopt doesn't normally support multiple arguments
if (optind >= argc) {
Expand Down Expand Up @@ -773,6 +782,7 @@ Arguments handle_args(int argc, char* const* argv, FILE* input = NULL, FILE* out
} else if (strcmp("index", name) == 0) {
index_handler(false);
} else if (strcmp("unique-use-set", name) == 0) {
ret.unique = true;
ret.unique_use_set = true;
} else if (strcmp("use-delimiter", name) == 0) {
ret.use_input_delimiter = true;
Expand Down
4 changes: 4 additions & 0 deletions src/likely_unlikely.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#pragma once

// branch prediction hints (GCC/Clang __builtin_expect).
// !! normalizes the condition to 0 or 1 before passing it to the builtin.
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
8 changes: 6 additions & 2 deletions src/string_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
#include <stdexcept>
#include <vector>

#include "likely_unlikely.hpp"

namespace choose {

namespace str {
Expand Down Expand Up @@ -299,9 +301,10 @@ struct QueuedOutput {
}
};

// block until n bytes have been made available in the file, or EOF
size_t get_bytes(FILE* f, size_t n, char* out) {
size_t read_ret = fread(out, sizeof(char), n, f);
if (read_ret == 0) {
if (unlikely(read_ret == 0)) {
if (feof(f)) {
return read_ret;
} else if (ferror(f)) {
Expand All @@ -312,9 +315,10 @@ size_t get_bytes(FILE* f, size_t n, char* out) {
return read_ret;
}

// returns once any positive number of bytes are made available in the file, or EOF
size_t get_bytes_unbuffered(int fileno, size_t n, char* out) {
ssize_t read_ret = read(fileno, out, n);
if (read_ret == -1) {
if (unlikely(read_ret == -1)) {
const char* err_string = strerror(errno);
throw std::runtime_error(err_string);
}
Expand Down
12 changes: 12 additions & 0 deletions src/test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -554,6 +554,18 @@ BOOST_AUTO_TEST_CASE(general_numeric_unique_with_parse_failure) {
BOOST_REQUIRE_EQUAL(out, correct_output);
}

// --unique-limit 3 with the tree-based set (--unique-use-set): only the 3 most
// recently seen distinct tokens are remembered, so the trailing "1" is emitted
// again after "4" pushed it out of the window.
BOOST_AUTO_TEST_CASE(unique_limit_set) {
choose_output out = run_choose("1\n1\n2\n2\n3\n3\n1\n2\n3\n4\n1\n1\n4\n4\n3\n3", {"--tui", "--unique-use-set", "--unique-limit", "3"});
choose_output correct_output{std::vector<choose::Token>{"1", "2", "3", "4", "1"}};
BOOST_REQUIRE_EQUAL(out, correct_output);
}

// --unique-limit 3 with the default hash-based set: same input and expected
// output as unique_limit_set above, exercising the unordered code path.
BOOST_AUTO_TEST_CASE(unique_limit) {
choose_output out = run_choose("1\n1\n2\n2\n3\n3\n1\n2\n3\n4\n1\n1\n4\n4\n3\n3", {"--tui", "--unique-limit", "3"});
choose_output correct_output{std::vector<choose::Token>{"1", "2", "3", "4", "1"}};
BOOST_REQUIRE_EQUAL(out, correct_output);
}

BOOST_AUTO_TEST_CASE(numeric_unique_use_set) {
choose_output out = run_choose("-0\n0\n.0\n1\n1.0\n0001.0", {"--unique-numeric", "--unique-use-set"});
choose_output correct_output{to_vec("-0\n1\n")};
Expand Down
40 changes: 30 additions & 10 deletions src/token.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,6 @@ std::vector<Token> create_tokens(choose::Arguments& args) {
const bool tail = args.tail;

const bool unique = args.unique;
const bool unique_use_set = args.unique_use_set;
const Comparison unique_type = args.unique_type;
const bool sort = args.sort;
const Comparison sort_type = args.sort_type;
Expand Down Expand Up @@ -346,18 +345,31 @@ std::vector<Token> create_tokens(choose::Arguments& args) {
}
};

using uniqueness_set_T = std::set<indirect, decltype(uniqueness_set_comparison)>;
using unordered_uniqueness_set_T = std::unordered_set<indirect, decltype(unordered_set_hash), decltype(unordered_set_equals)>;
using unique_checker_T = std::variant<std::monostate, uniqueness_set_T, unordered_uniqueness_set_T>;
using unordered_uniqueness_limit_set_T = ForgetfulUnorderedSet<indirect, decltype(unordered_set_hash), decltype(unordered_set_equals)>;
using uniqueness_set_T = std::set<indirect, decltype(uniqueness_set_comparison)>;
using uniqueness_limit_set_T = ForgetfulSet<indirect, decltype(uniqueness_set_comparison)>;
using unique_checker_T = std::variant<std::monostate, unordered_uniqueness_set_T, unordered_uniqueness_limit_set_T, uniqueness_set_T, uniqueness_limit_set_T>;

unique_checker_T unique_checker = [&]() -> unique_checker_T {
if (unique) {
if (unique_use_set) {
return unique_checker_T(uniqueness_set_T(uniqueness_set_comparison));
if (args.unique_use_set) {
if (args.unique_limit == 0) {
return unique_checker_T(uniqueness_set_T(uniqueness_set_comparison));
} else {
return unique_checker_T(uniqueness_limit_set_T(uniqueness_set_comparison, args.unique_limit));
}
} else {
auto s = unordered_uniqueness_set_T(8, unordered_set_hash, unordered_set_equals);
s.max_load_factor(args.unique_load_factor);
return unique_checker_T(std::move(s));
if (args.unique_limit == 0) {
auto s = unordered_uniqueness_set_T(8, unordered_set_hash, unordered_set_equals);
s.max_load_factor(args.unique_load_factor);
return unique_checker_T(std::move(s));
} else {
return unique_checker_T(unordered_uniqueness_limit_set_T(unordered_set_hash, //
unordered_set_equals, //
args.unique_load_factor, //
args.unique_limit));
}
}
} else {
return unique_checker_T();
Expand All @@ -366,10 +378,14 @@ std::vector<Token> create_tokens(choose::Arguments& args) {

// returns true if output[elem] is unique. requires unique == true
auto uniqueness_check = [&](indirect elem) -> bool { //
if (unordered_uniqueness_set_T* set = std::get_if<unordered_uniqueness_set_T>(&unique_checker)) {
if (unordered_uniqueness_set_T* unordered_set = std::get_if<unordered_uniqueness_set_T>(&unique_checker)) {
return unordered_set->insert(elem).second;
} else if (unordered_uniqueness_limit_set_T* unordered_set_limit = std::get_if<unordered_uniqueness_limit_set_T>(&unique_checker)) {
return unordered_set_limit->insert(elem).second;
} else if (uniqueness_set_T* set = std::get_if<uniqueness_set_T>(&unique_checker)) {
return set->insert(elem).second;
} else {
return std::get<uniqueness_set_T>(unique_checker).insert(elem).second;
return std::get<uniqueness_limit_set_T>(unique_checker).insert(elem).second;
}
};

Expand Down Expand Up @@ -806,8 +822,12 @@ std::vector<Token> create_tokens(choose::Arguments& args) {

if (unordered_uniqueness_set_T* uniqueness_unordered_set = std::get_if<unordered_uniqueness_set_T>(&unique_checker)) {
uniqueness_unordered_set->clear();
} else if (unordered_uniqueness_limit_set_T* uniqueness_limit_unordered_set = std::get_if<unordered_uniqueness_limit_set_T>(&unique_checker)) {
uniqueness_limit_unordered_set->clear();
} else if (uniqueness_set_T* set = std::get_if<uniqueness_set_T>(&unique_checker)) {
set->clear();
} else if (uniqueness_limit_set_T* set_limit = std::get_if<uniqueness_limit_set_T>(&unique_checker)) {
set_limit->clear();
}

if (!args.out_start && !args.out_end) {
Expand Down