diff --git a/docs/reference/aggregations/bucket.asciidoc b/docs/reference/aggregations/bucket.asciidoc index b9fbddc65c125..91f8062a96a3b 100644 --- a/docs/reference/aggregations/bucket.asciidoc +++ b/docs/reference/aggregations/bucket.asciidoc @@ -64,3 +64,4 @@ include::bucket/significantterms-aggregation.asciidoc[] include::bucket/significanttext-aggregation.asciidoc[] include::bucket/terms-aggregation.asciidoc[] + diff --git a/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc b/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc new file mode 100644 index 0000000000000..e2537b61aefda --- /dev/null +++ b/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc @@ -0,0 +1,357 @@ +[[search-aggregations-bucket-rare-terms-aggregation]] +=== Rare Terms Aggregation + +A multi-bucket value source based aggregation which finds "rare" terms -- terms that are at the long-tail +of the distribution and are not frequent. Conceptually, this is like a `terms` aggregation that is +sorted by `_count` ascending. As noted in the <>, +actually ordering a `terms` agg by count ascending has unbounded error. Instead, you should use the `rare_terms` +aggregation.
+ +////////////////////////// + +[source,js] +-------------------------------------------------- +PUT /products +{ + "mappings": { + "properties": { + "genre": { + "type": "keyword" + }, + "product": { + "type": "keyword" + } + } + } +} + +POST /products/_doc/_bulk?refresh +{"index":{"_id":0}} +{"genre": "rock", "product": "Product A"} +{"index":{"_id":1}} +{"genre": "rock"} +{"index":{"_id":2}} +{"genre": "rock"} +{"index":{"_id":3}} +{"genre": "jazz", "product": "Product Z"} +{"index":{"_id":4}} +{"genre": "jazz"} +{"index":{"_id":5}} +{"genre": "electronic"} +{"index":{"_id":6}} +{"genre": "electronic"} +{"index":{"_id":7}} +{"genre": "electronic"} +{"index":{"_id":8}} +{"genre": "electronic"} +{"index":{"_id":9}} +{"genre": "electronic"} +{"index":{"_id":10}} +{"genre": "swing"} + +------------------------------------------------- +// NOTCONSOLE +// TESTSETUP + +////////////////////////// 
+ +==== Syntax + +A `rare_terms` aggregation looks like this in isolation: + +[source,js] +-------------------------------------------------- +{ + "rare_terms": { + "field": "the_field", + "max_doc_count": 1 + } +} +-------------------------------------------------- +// NOTCONSOLE + +.`rare_terms` Parameters +|=== +|Parameter Name |Description |Required |Default Value +|`field` |The field we wish to find rare terms in |Required | +|`max_doc_count` |The maximum number of documents a term should appear in. |Optional |`1` +|`precision` |The precision of the internal CuckooFilters. Smaller precision leads to +better approximation, but higher memory usage. Cannot be smaller than `0.00001` |Optional |`0.001` +|`include` |Terms that should be included in the aggregation|Optional | +|`exclude` |Terms that should be excluded from the aggregation|Optional | +|`missing` |The value that should be used if a document does not have the field being aggregated|Optional | +|=== + + +Example: + +[source,js] +-------------------------------------------------- +GET /_search +{ + "aggs" : { + "genres" : { + "rare_terms" : { + "field" : "genre" + } + } + } +} +-------------------------------------------------- +// CONSOLE +// TEST[s/_search/_search\?filter_path=aggregations/] + +Response: + +[source,js] +-------------------------------------------------- +{ + ... 
+ "aggregations" : { + "genres" : { + "buckets" : [ + { + "key" : "swing", + "doc_count" : 1 + } + ] + } + } +} +-------------------------------------------------- +// TESTRESPONSE[s/\.\.\.//] + +In this example, the only bucket that we see is the "swing" bucket, because it is the only term that appears in +one document. If we increase the `max_doc_count` to `2`, we'll see some more buckets: + +[source,js] +-------------------------------------------------- +GET /_search +{ + "aggs" : { + "genres" : { + "rare_terms" : { + "field" : "genre", + "max_doc_count": 2 + } + } + } +} +-------------------------------------------------- +// CONSOLE +// TEST[s/_search/_search\?filter_path=aggregations/] + +This now shows the "jazz" term which has a `doc_count` of 2": + +[source,js] +-------------------------------------------------- +{ + ... + "aggregations" : { + "genres" : { + "buckets" : [ + { + "key" : "swing", + "doc_count" : 1 + }, + { + "key" : "jazz", + "doc_count" : 2 + } + ] + } + } +} +-------------------------------------------------- +// TESTRESPONSE[s/\.\.\.//] + +[[search-aggregations-bucket-rare-terms-aggregation-max-doc-count]] +==== Maximum document count + +The `max_doc_count` parameter is used to control the upper bound of document counts that a term can have. There +is not a size limitation on the `rare_terms` agg like `terms` agg has. This means that terms +which match the `max_doc_count` criteria will be returned. The aggregation functions in this manner to avoid +the order-by-ascending issues that afflict the `terms` aggregation. + +This does, however, mean that a large number of results can be returned if chosen incorrectly. +To limit the danger of this setting, the maximum `max_doc_count` is 100. + +[[search-aggregations-bucket-rare-terms-aggregation-max-buckets]] +==== Max Bucket Limit + +The Rare Terms aggregation is more liable to trip the `search.max_buckets` soft limit than other aggregations due +to how it works. The `max_bucket` soft-limit is evaluated on a per-shard basis while the aggregation is collecting +results. It is possible for a term to be "rare" on a shard but become "not rare" once all the shard results are +merged together. This means that individual shards tend to collect more buckets than are truly rare, because +they only have their own local view. This list is ultimately pruned to the correct, smaller list of rare +terms on the coordinating node... but a shard may have already tripped the `max_buckets` soft limit and aborted +the request. + +When aggregating on fields that have potentially many "rare" terms, you may need to increase the `max_buckets` soft +limit. Alternatively, you might need to find a way to filter the results to return fewer rare values (smaller time +span, filter by category, etc), or re-evaluate your definition of "rare" (e.g. if something +appears 100,000 times, is it truly "rare"?) + +[[search-aggregations-bucket-rare-terms-aggregation-approximate-counts]] +==== Document counts are approximate + +The naive way to determine the "rare" terms in a dataset is to place all the values in a map, incrementing counts +as each document is visited, then return the bottom `n` rows. This does not scale beyond even modestly sized data +sets. A sharded approach where only the "top n" values are retained from each shard (ala the `terms` aggregation) +fails because the long-tail nature of the problem means it is impossible to find the "top n" bottom values without +simply collecting all the values from all shards. 
+ +Instead, the Rare Terms aggregation uses a different approximate algorithm: + +1. Values are placed in a map the first time they are seen. +2. Each additional occurrence of the term increments a counter in the map. +3. If the counter > the `max_doc_count` threshold, the term is removed from the map and placed in a +https://www.cs.cmu.edu/~dga/papers/cuckoo-conext2014.pdf[CuckooFilter] +4. The CuckooFilter is consulted on each term. If the value is inside the filter, it is known to be above the +threshold already and skipped. + +After execution, the map of values is the map of "rare" terms under the `max_doc_count` threshold. This map and CuckooFilter +are then merged with all other shards. If there are terms that are greater than the threshold (or appear in +a different shard's CuckooFilter), the term is removed from the merged list. The final map of values is returned +to the user as the "rare" terms. + +CuckooFilters have the possibility of returning false positives (they can say a value exists in their collection when +it actually does not). Since the CuckooFilter is being used to see if a term is over the threshold, this means a false positive +from the CuckooFilter will mistakenly say a value is common when it is not (and thus exclude it from the final list of buckets). +Practically, this means the aggregation exhibits false-negative behavior since the filter is being used "in reverse" +of how people generally think of approximate set membership sketches. + +CuckooFilters are described in more detail in the paper: + +https://www.cs.cmu.edu/~dga/papers/cuckoo-conext2014.pdf[Fan, Bin, et al. "Cuckoo filter: Practically better than bloom."] +Proceedings of the 10th ACM International on Conference on emerging Networking Experiments and Technologies. ACM, 2014. 
+ +==== Precision + +Although the internal CuckooFilter is approximate in nature, the false-negative rate can be controlled with a +`precision` parameter. This allows the user to trade more runtime memory for more accurate results. + +The default precision is `0.001`, and the smallest (i.e. most accurate, with the largest memory overhead) is `0.00001`. +Below are some charts which demonstrate how the accuracy of the aggregation is affected by precision and number +of distinct terms. + +The X-axis shows the number of distinct values the aggregation has seen, and the Y-axis shows the percent error. +Each line series represents one "rarity" condition (ranging from one rare item to 100,000 rare items). For example, +the orange "10" line means ten of the values were "rare" (`doc_count == 1`), out of 1-20m distinct values (where the +rest of the values had `doc_count > 1`). + +This first chart shows precision `0.01`: + +image:images/rare_terms/accuracy_01.png[] + +And precision `0.001` (the default): + +image:images/rare_terms/accuracy_001.png[] + +And finally precision `0.0001`: + +image:images/rare_terms/accuracy_0001.png[] + +The default precision of `0.001` maintains an accuracy of < 2.5% for the tested conditions, and accuracy slowly +degrades in a controlled, linear fashion as the number of distinct values increases. + +The default precision of `0.001` has a memory profile of approximately `1.748 * n` bytes, where `n` is the number +of distinct values the aggregation has seen (it can also be roughly eyeballed, e.g. 20 million unique values is about +30mb of memory). 
The memory usage is linear to the number of distinct values regardless of which precision is chosen; the precision +only affects the slope of the memory profile, as seen in this chart: + +image:images/rare_terms/memory.png[] + +For comparison, an equivalent terms aggregation at 20 million buckets would be roughly +`20m * 69b == ~1.38gb` (with 69 bytes being a very optimistic estimate of an empty bucket cost, far lower than what +the circuit breaker accounts for). So although the `rare_terms` agg is relatively heavy, it is still orders of +magnitude smaller than the equivalent terms aggregation. + +==== Filtering Values + +It is possible to filter the values for which buckets will be created. This can be done using the `include` and +`exclude` parameters which are based on regular expression strings or arrays of exact values. Additionally, +`include` clauses can filter using `partition` expressions. + +===== Filtering Values with regular expressions + +[source,js] +-------------------------------------------------- +GET /_search +{ + "aggs" : { + "genres" : { + "rare_terms" : { + "field" : "genre", + "include" : "swi*", + "exclude" : "electro*" + } + } + } +} +-------------------------------------------------- +// CONSOLE + +In the above example, buckets will be created for all the genres that start with `swi`, except those starting +with `electro` (so the genre `swing` will be aggregated but not `electro_swing`). The `include` regular expression will determine what +values are "allowed" to be aggregated, while the `exclude` determines the values that should not be aggregated. When +both are defined, the `exclude` has precedence, meaning the `include` is evaluated first and only then the `exclude`. + +The syntax is the same as <>. + +===== Filtering Values with exact values + +For matching based on exact values, the `include` and `exclude` parameters can simply take an array of +strings that represent the terms as they are found in the index: + +[source,js] +-------------------------------------------------- +GET /_search +{ + "aggs" : { + "genres" : { + "rare_terms" : { + "field" : "genre", + "include" : ["swing", "rock"], + "exclude" : ["jazz"] + } + } + } +} +-------------------------------------------------- +// CONSOLE + + +==== Missing value + +The `missing` parameter defines how documents that are missing a value should be treated. +By default, they will be ignored, but it is also possible to treat them as if they +had a value. + +[source,js] +-------------------------------------------------- +GET /_search +{ + "aggs" : { + "genres" : { + "rare_terms" : { + "field" : "genre", + "missing": "N/A" <1> + } + } + } +} +-------------------------------------------------- +// CONSOLE + +<1> Documents without a value in the `genre` field will fall into the same bucket as documents that have the value `N/A`. + +==== Nested, RareTerms, and scoring sub-aggregations + +The RareTerms aggregation has to operate in `breadth_first` mode, since it needs to prune terms as doc count thresholds +are breached. This requirement means the RareTerms aggregation is incompatible with certain combinations of aggregations +that require `depth_first`. In particular, scoring sub-aggregations that are inside a `nested` aggregation force the entire aggregation tree to run +in `depth_first` mode. This will throw an exception since RareTerms is unable to process `depth_first`. 
+ +As a concrete example, if `rare_terms` aggregation is the child of a `nested` aggregation, and one of the child aggregations of `rare_terms` +needs document scores (like a `top_hits` aggregation), this will throw an exception. \ No newline at end of file diff --git a/docs/reference/images/rare_terms/accuracy_0001.png b/docs/reference/images/rare_terms/accuracy_0001.png new file mode 100644 index 0000000000000..0c13a3938cde2 Binary files /dev/null and b/docs/reference/images/rare_terms/accuracy_0001.png differ diff --git a/docs/reference/images/rare_terms/accuracy_001.png b/docs/reference/images/rare_terms/accuracy_001.png new file mode 100644 index 0000000000000..2aa1be316c382 Binary files /dev/null and b/docs/reference/images/rare_terms/accuracy_001.png differ diff --git a/docs/reference/images/rare_terms/accuracy_01.png b/docs/reference/images/rare_terms/accuracy_01.png new file mode 100644 index 0000000000000..7182b7d3c537e Binary files /dev/null and b/docs/reference/images/rare_terms/accuracy_01.png differ diff --git a/docs/reference/images/rare_terms/memory.png b/docs/reference/images/rare_terms/memory.png new file mode 100644 index 0000000000000..e0de5c2163913 Binary files /dev/null and b/docs/reference/images/rare_terms/memory.png differ diff --git a/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/280_rare_terms.yml b/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/280_rare_terms.yml new file mode 100644 index 0000000000000..73c46bc963e12 --- /dev/null +++ b/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/280_rare_terms.yml @@ -0,0 +1,316 @@ +setup: + - skip: + version: " - 8.0.0" # TODO change this after backport + reason: RareTerms added in 7.3.0 + - do: + indices.create: + index: test_1 + body: + settings: + number_of_replicas: 0 + mappings: + properties: + str: + type: keyword + ip: + type: ip + boolean: + type: boolean + integer: + type: long + number: + type: long + date: + type: date + + + - do: + cluster.health: + wait_for_status: green + +--- +"Basic test": + - do: + index: + index: test_1 + id: 1 + body: { "str" : "abc" } + + - do: + index: + index: test_1 + id: 2 + body: { "str": "abc" } + + - do: + index: + index: test_1 + id: 3 + body: { "str": "bcd" } + + - do: + indices.refresh: {} + + - do: + search: + body: { "size" : 0, "aggs" : { "str_terms" : { "rare_terms" : { "field" : "str", "max_doc_count" : 1 } } } } + + - match: { hits.total.value: 3 } + - length: { aggregations.str_terms.buckets: 1 } + - match: { aggregations.str_terms.buckets.0.key: "bcd" } + - is_false: aggregations.str_terms.buckets.0.key_as_string + - match: { aggregations.str_terms.buckets.0.doc_count: 1 } + +--- +"IP test": + - do: + index: + index: test_1 + id: 1 + body: { "ip": "::1" } + + - do: + index: + index: test_1 + id: 2 + body: { "ip": "127.0.0.1" } + + - do: + index: + index: test_1 + id: 3 + body: { "ip": "::1" } + + - do: + indices.refresh: {} + + - do: + search: + body: { "size" : 0, "aggs" : { "ip_terms" : { "rare_terms" : { "field" : "ip" } } } } + + - match: { hits.total.value: 3 } + - length: { aggregations.ip_terms.buckets: 1 } + - match: { aggregations.ip_terms.buckets.0.key: "127.0.0.1" } + - is_false: aggregations.ip_terms.buckets.0.key_as_string + - match: { aggregations.ip_terms.buckets.0.doc_count: 1 } + + - do: + search: + body: { "size" : 0, "aggs" : { "ip_terms" : { "rare_terms" : { "field" : "ip", "include" : [ "127.0.0.1" ] } } } } + + - match: { hits.total.value: 3 } + - length: { 
aggregations.ip_terms.buckets: 1 } + - match: { aggregations.ip_terms.buckets.0.key: "127.0.0.1" } + - is_false: aggregations.ip_terms.buckets.0.key_as_string + - match: { aggregations.ip_terms.buckets.0.doc_count: 1 } + + - do: + search: + body: { "size" : 0, "aggs" : { "ip_terms" : { "rare_terms" : { "field" : "ip", "exclude" : [ "127.0.0.1" ] } } } } + + - match: { hits.total.value: 3 } + - length: { aggregations.ip_terms.buckets: 0 } + + - do: + catch: request + search: + index: test_1 + body: { "size" : 0, "aggs" : { "ip_terms" : { "rare_terms" : { "field" : "ip", "exclude" : "127.*" } } } } + + + +--- +"Boolean test": + - do: + index: + index: test_1 + id: 1 + body: { "boolean": true } + + - do: + index: + index: test_1 + id: 2 + body: { "boolean": false } + + - do: + index: + index: test_1 + id: 3 + body: { "boolean": true } + + - do: + indices.refresh: {} + + - do: + search: + body: { "size" : 0, "aggs" : { "boolean_terms" : { "rare_terms" : { "field" : "boolean" } } } } + + - match: { hits.total.value: 3 } + - length: { aggregations.boolean_terms.buckets: 1 } + - match: { aggregations.boolean_terms.buckets.0.key: 0 } + - match: { aggregations.boolean_terms.buckets.0.key_as_string: "false" } + - match: { aggregations.boolean_terms.buckets.0.doc_count: 1 } + +--- +"Integer test": + - do: + index: + index: test_1 + id: 1 + body: { "integer": 1234 } + + - do: + index: + index: test_1 + id: 2 + body: { "integer": 5678 } + + - do: + index: + index: test_1 + id: 3 + body: { "integer": 1234 } + + - do: + indices.refresh: {} + + - do: + search: + body: { "size" : 0, "aggs" : { "integer_terms" : { "rare_terms" : { "field" : "integer" } } } } + + - match: { hits.total.value: 3 } + + - length: { aggregations.integer_terms.buckets: 1 } + + - match: { aggregations.integer_terms.buckets.0.key: 5678 } + - is_false: aggregations.integer_terms.buckets.0.key_as_string + - match: { aggregations.integer_terms.buckets.0.doc_count: 1 } + +--- +"Date test": + - do: + index: + index: test_1 + id: 1 + body: { "date": "2016-05-03" } + + - do: + index: + index: test_1 + id: 2 + body: { "date": "2014-09-01" } + + - do: + index: + index: test_1 + id: 3 + body: { "date": "2016-05-03" } + + - do: + indices.refresh: {} + + - do: + search: + body: { "size" : 0, "aggs" : { "date_terms" : { "rare_terms" : { "field" : "date" } } } } + + - match: { hits.total.value: 3 } + + - length: { aggregations.date_terms.buckets: 1 } + - match: { aggregations.date_terms.buckets.0.key: 1409529600000 } + - match: { aggregations.date_terms.buckets.0.key_as_string: "2014-09-01T00:00:00.000Z" } + - match: { aggregations.date_terms.buckets.0.doc_count: 1 } + + - do: + search: + body: { "size" : 0, "aggs" : { "date_terms" : { "rare_terms" : { "field" : "date", "include" : [ "2014-09-01" ] } } } } + + - match: { hits.total.value: 3 } + - length: { aggregations.date_terms.buckets: 1 } + - match: { aggregations.date_terms.buckets.0.key_as_string: "2014-09-01T00:00:00.000Z" } + - match: { aggregations.date_terms.buckets.0.doc_count: 1 } + + - do: + search: + body: { "size" : 0, "aggs" : { "date_terms" : { "rare_terms" : { "field" : "date", "exclude" : [ "2014-09-01" ] } } } } + + - match: { hits.total.value: 3 } + - length: { aggregations.date_terms.buckets: 0 } + +--- +"Unmapped strings": + + - do: + index: + index: test_1 + id: 1 + body: {} + + - do: + indices.refresh: {} + + - do: + search: + body: { "size" : 0, "aggs" : { "string_terms" : { "rare_terms" : { "field" : "unmapped_string"} } } } + + - match: { hits.total.value: 1 } + - 
length: { aggregations.string_terms.buckets: 0 } + +--- +"Unmapped booleans": + + - do: + index: + index: test_1 + id: 1 + body: {} + + - do: + indices.refresh: {} + + - do: + search: + body: { "size" : 0, "aggs" : { "boolean_terms" : { "rare_terms" : { "field" : "unmapped_boolean" } } } } + + - match: { hits.total.value: 1 } + - length: { aggregations.boolean_terms.buckets: 0 } + +--- +"Unmapped dates": + + - do: + index: + index: test_1 + id: 1 + body: {} + + - do: + indices.refresh: {} + + - do: + search: + body: { "size" : 0, "aggs" : { "date_terms" : { "rare_terms" : { "field" : "unmapped_date"} } } } + + - match: { hits.total.value: 1 } + - length: { aggregations.date_terms.buckets: 0 } + +--- +"Unmapped longs": + + - do: + index: + index: test_1 + id: 1 + body: {} + + - do: + indices.refresh: {} + + - do: + search: + body: { "size" : 0, "aggs" : { "long_terms" : { "rare_terms" : { "field" : "unmapped_long", "value_type" : "long" } } } } + + - match: { hits.total.value: 1 } + - length: { aggregations.long_terms.buckets: 0 } + + diff --git a/server/src/main/java/org/elasticsearch/common/hash/MurmurHash3.java b/server/src/main/java/org/elasticsearch/common/hash/MurmurHash3.java index a52f0e8acc4ae..a9232e06657ad 100644 --- a/server/src/main/java/org/elasticsearch/common/hash/MurmurHash3.java +++ b/server/src/main/java/org/elasticsearch/common/hash/MurmurHash3.java @@ -21,6 +21,8 @@ import org.elasticsearch.common.util.ByteUtils; +import java.util.Objects; + /** * MurmurHash3 hashing functions. @@ -36,6 +38,24 @@ public static class Hash128 { public long h1; /** higher 64 bits part **/ public long h2; + + @Override + public boolean equals(Object other) { + if (this == other) { + return true; + } + if (other == null || getClass() != other.getClass()) { + return false; + } + Hash128 that = (Hash128) other; + return Objects.equals(this.h1, that.h1) + && Objects.equals(this.h2, that.h2); + } + + @Override + public int hashCode() { + return Objects.hash(h1, h2); + } } private static long C1 = 0x87c37b91114253d5L; @@ -160,4 +180,22 @@ public static Hash128 hash128(byte[] key, int offset, int length, long seed, Has return hash; } + /** + * A 64-bit variant which accepts a long to hash, and returns the 64bit long hash. + * This is useful if the input is already in long (or smaller) format and you don't + * need the full 128b width and flexibility of + * {@link MurmurHash3#hash128(byte[], int, int, long, Hash128)} + * + * Given the limited nature of this variant, it should be faster than the 128b version + * when you only need 128b (many fewer instructions) + */ + public static long murmur64(long h) { + h ^= h >>> 33; + h *= 0xff51afd7ed558ccdL; + h ^= h >>> 33; + h *= 0xc4ceb9fe1a85ec53L; + h ^= h >>> 33; + return h; + } + } diff --git a/server/src/main/java/org/elasticsearch/common/util/CuckooFilter.java b/server/src/main/java/org/elasticsearch/common/util/CuckooFilter.java new file mode 100644 index 0000000000000..54099735fba47 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/common/util/CuckooFilter.java @@ -0,0 +1,521 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.common.util; + +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.packed.PackedInts; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.io.stream.Writeable; + +import java.io.IOException; +import java.util.Iterator; +import java.util.Objects; +import java.util.Random; + +/** + * An approximate set membership datastructure + * + * CuckooFilters are similar to Bloom Filters in usage; values are inserted, and the Cuckoo + * can be asked if it has seen a particular value before. Because the structure is approximate, + * it can return false positives (says it has seen an item when it has not). False negatives + * are not possible though; if the structure says it _has not_ seen an item, that can be + * trusted. + * + * The filter can "saturate" at which point the map has hit it's configured load factor (or near enough + * that a large number of evictions are not able to find a free slot) and will refuse to accept + * any new insertions. + * + * NOTE: this version does not support deletions, and as such does not save duplicate + * fingerprints (e.g. when inserting, if the fingerprint is already present in the + * candidate buckets, it is not inserted). By not saving duplicates, the CuckooFilter + * loses the ability to delete values. But not by allowing deletions, we can save space + * (do not need to waste slots on duplicate fingerprints), and we do not need to worry + * about inserts "overflowing" a bucket because the same item has been repeated repeatedly + * + * NOTE: this CuckooFilter exposes a number of Expert APIs which assume the caller has + * intimate knowledge about how the algorithm works. It is recommended to use + * {@link SetBackedScalingCuckooFilter} instead. + * + * Based on the paper: + * + * Fan, Bin, et al. "Cuckoo filter: Practically better than bloom." + * Proceedings of the 10th ACM International on Conference on emerging Networking Experiments and Technologies. ACM, 2014. + * + * https://www.cs.cmu.edu/~dga/papers/cuckoo-conext2014.pdf + */ +public class CuckooFilter implements Writeable { + + private static final double LN_2 = Math.log(2); + private static final int MAX_EVICTIONS = 500; + static final int EMPTY = 0; + + private final PackedInts.Mutable data; + private final int numBuckets; + private final int bitsPerEntry; + private final int fingerprintMask; + private final int entriesPerBucket; + private final Random rng; + private int count; + private int evictedFingerprint = EMPTY; + + /** + * @param capacity The number of expected inserts. The filter can hold more than this value, it is just an estimate + * @param fpp The desired false positive rate. 
Smaller values will reduce the + * false positives at expense of larger size + * @param rng A random number generator, used with the cuckoo hashing process + */ + CuckooFilter(long capacity, double fpp, Random rng) { + this.rng = rng; + this.entriesPerBucket = entriesPerBucket(fpp); + double loadFactor = getLoadFactor(entriesPerBucket); + this.bitsPerEntry = bitsPerEntry(fpp, entriesPerBucket); + this.numBuckets = getNumBuckets(capacity, loadFactor, entriesPerBucket); + + if ((long) numBuckets * (long) entriesPerBucket > Integer.MAX_VALUE) { + throw new IllegalArgumentException("Attempted to create [" + numBuckets * entriesPerBucket + + "] entries which is > Integer.MAX_VALUE"); + } + this.data = PackedInts.getMutable(numBuckets * entriesPerBucket, bitsPerEntry, PackedInts.COMPACT); + + // puts the bits at the right side of the mask, e.g. `0000000000001111` for bitsPerEntry = 4 + this.fingerprintMask = (0x80000000 >> (bitsPerEntry - 1)) >>> (Integer.SIZE - bitsPerEntry); + } + + /** + * This ctor is likely slow and should only be used for testing + */ + CuckooFilter(CuckooFilter other) { + this.numBuckets = other.numBuckets; + this.bitsPerEntry = other.bitsPerEntry; + this.entriesPerBucket = other.entriesPerBucket; + this.count = other.count; + this.evictedFingerprint = other.evictedFingerprint; + this.rng = other.rng; + this.fingerprintMask = other.fingerprintMask; + + // This shouldn't happen, but as a sanity check + if ((long) numBuckets * (long) entriesPerBucket > Integer.MAX_VALUE) { + throw new IllegalArgumentException("Attempted to create [" + numBuckets * entriesPerBucket + + "] entries which is > Integer.MAX_VALUE"); + } + // TODO this is probably super slow, but just used for testing atm + this.data = PackedInts.getMutable(numBuckets * entriesPerBucket, bitsPerEntry, PackedInts.COMPACT); + for (int i = 0; i < other.data.size(); i++) { + data.set(i, other.data.get(i)); + } + } + + CuckooFilter(StreamInput in, Random rng) throws IOException { + this.numBuckets = in.readVInt(); + this.bitsPerEntry = in.readVInt(); + this.entriesPerBucket = in.readVInt(); + this.count = in.readVInt(); + this.evictedFingerprint = in.readVInt(); + this.rng = rng; + + this.fingerprintMask = (0x80000000 >> (bitsPerEntry - 1)) >>> (Integer.SIZE - bitsPerEntry); + + data = (PackedInts.Mutable) PackedInts.getReader(new DataInput() { + @Override + public byte readByte() throws IOException { + return in.readByte(); + } + + @Override + public void readBytes(byte[] b, int offset, int len) throws IOException { + in.readBytes(b, offset, len); + } + }); + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeVInt(numBuckets); + out.writeVInt(bitsPerEntry); + out.writeVInt(entriesPerBucket); + out.writeVInt(count); + out.writeVInt(evictedFingerprint); + + data.save(new DataOutput() { + @Override + public void writeByte(byte b) throws IOException { + out.writeByte(b); + } + + @Override + public void writeBytes(byte[] b, int offset, int length) throws IOException { + out.writeBytes(b, offset, length); + } + }); + } + + /** + * Get the number of unique items that are being tracked + */ + public int getCount() { + return count; + } + + /** + * Returns the number of buckets that has been chosen based + * on the initial configuration + * + * Expert-level API + */ + int getNumBuckets() { + return numBuckets; + } + + /** + * Returns the number of bits used per entry + * + * Expert-level API + */ + int getBitsPerEntry() { + return bitsPerEntry; + } + + /** + * Returns the cached fingerprint 
mask. This is simply a mask for the + * first bitsPerEntry bits, used by {@link CuckooFilter#fingerprint(int, int, int)} + * to generate the fingerprint of a hash + * + * Expert-level API + */ + int getFingerprintMask() { + return fingerprintMask; + } + + /** + * Returns an iterator that returns the long[] representation of each bucket. The value + * inside each long will be a fingerprint (or 0L, representing empty). + * + * Expert-level API + */ + Iterator getBuckets() { + return new Iterator<>() { + int current = 0; + + @Override + public boolean hasNext() { + return current < numBuckets; + } + + @Override + public long[] next() { + long[] values = new long[entriesPerBucket]; + int offset = getOffset(current, 0); + data.get(offset, values, 0, entriesPerBucket); + current += 1; + return values; + } + }; + } + + /** + * Returns true if the set might contain the provided value, false otherwise. False values are + * 100% accurate, while true values may be a false-positive. + */ + boolean mightContain(long hash) { + int bucket = hashToIndex((int) hash, numBuckets); + int fingerprint = fingerprint((int) (hash >>> 32), bitsPerEntry, fingerprintMask); + int alternateIndex = alternateIndex(bucket, fingerprint, numBuckets); + + return mightContainFingerprint(bucket, fingerprint, alternateIndex); + } + + /** + * Returns true if the bucket or it's alternate bucket contains the fingerprint. + * + * Expert-level API, use {@link CuckooFilter#mightContain(long)} to check if + * a value is in the filter. + */ + boolean mightContainFingerprint(int bucket, int fingerprint, int alternateBucket) { + + // check all entries for both buckets and the evicted slot + return hasFingerprint(bucket, fingerprint) || hasFingerprint(alternateBucket, fingerprint) || evictedFingerprint == fingerprint; + } + + /** + * Return's true if any of the entries in the bucket contain the fingerprint + */ + private boolean hasFingerprint(int bucket, long fingerprint) { + long[] values = new long[entriesPerBucket]; + int offset = getOffset(bucket, 0); + data.get(offset, values, 0, entriesPerBucket); + + for (int i = 0; i < entriesPerBucket; i++) { + if (values[i] == fingerprint) { + return true; + } + } + return false; + } + + /** + * Add's the hash to the bucket or alternate bucket. Returns true if the insertion was + * successful, false if the filter is saturated. + */ + boolean add(long hash) { + // Each bucket needs 32 bits, so we truncate for the first bucket and shift/truncate for second + int bucket = hashToIndex((int) hash, numBuckets); + int fingerprint = fingerprint((int) (hash >>> 32), bitsPerEntry, fingerprintMask); + return mergeFingerprint(bucket, fingerprint); + } + + /** + * Attempts to merge the fingerprint into the specified bucket or it's alternate bucket. + * Returns true if the insertion was successful, false if the filter is saturated. 
+ * + * Expert-level API, use {@link CuckooFilter#add(long)} to insert + * values into the filter + */ + boolean mergeFingerprint(int bucket, int fingerprint) { + // If we already have an evicted fingerprint we are full, no need to try + if (evictedFingerprint != EMPTY) { + return false; + } + + int alternateBucket = alternateIndex(bucket, fingerprint, numBuckets); + if (tryInsert(bucket, fingerprint) || tryInsert(alternateBucket, fingerprint)) { + count += 1; + return true; + } + + for (int i = 0; i < MAX_EVICTIONS; i++) { + // overwrite our alternate bucket, and a random entry + int offset = getOffset(alternateBucket, rng.nextInt(entriesPerBucket - 1)); + int oldFingerprint = (int) data.get(offset); + data.set(offset, fingerprint); + + // replace details and start again + fingerprint = oldFingerprint; + bucket = alternateBucket; + alternateBucket = alternateIndex(bucket, fingerprint, numBuckets); + + // Only try to insert into alternate bucket + if (tryInsert(alternateBucket, fingerprint)) { + count += 1; + return true; + } + } + + // If we get this far, we failed to insert the value after MAX_EVICTION rounds, + // so cache the last evicted value (so we don't lose it) and signal we failed + evictedFingerprint = fingerprint; + return false; + } + + /** + * Low-level insert method. Attempts to write the fingerprint into an empty entry + * at this bucket's position. Returns true if that was sucessful, false if all entries + * were occupied. + * + * If the fingerprint already exists in one of the entries, it will not duplicate the + * fingerprint like the original paper. This means the filter _cannot_ support deletes, + * but is not sensitive to "overflowing" buckets with repeated inserts + */ + private boolean tryInsert(int bucket, int fingerprint) { + long[] values = new long[entriesPerBucket]; + int offset = getOffset(bucket, 0); + data.get(offset, values, 0, entriesPerBucket); + + // TODO implement semi-sorting + for (int i = 0; i < values.length; i++) { + if (values[i] == EMPTY) { + data.set(offset + i, fingerprint); + return true; + } else if (values[i] == fingerprint) { + // Already have the fingerprint, no need to save + return true; + } + } + return false; + } + + /** + * Converts a hash into a bucket index (primary or alternate). + * + * If the hash is negative, this flips the bits. The hash is then modulo numBuckets + * to get the final index. + * + * Expert-level API + */ + static int hashToIndex(int hash, int numBuckets) { + return hash & (numBuckets - 1); + } + + /** + * Calculates the alternate bucket for a given bucket:fingerprint tuple + * + * The alternate bucket is the fingerprint multiplied by a mixing constant, + * then xor'd against the bucket. This new value is modulo'd against + * the buckets via {@link CuckooFilter#hashToIndex(int, int)} to get the final + * index. + * + * Note that the xor makes this operation reversible as long as we have the + * fingerprint and current bucket (regardless of if that bucket was the primary + * or alternate). 
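+ * + * As an illustrative identity (not something the code relies on directly): for any bucket index b that is + * already less than numBuckets, alternateIndex(alternateIndex(b, fingerprint, numBuckets), fingerprint, numBuckets) == b, + * so the same method maps a primary bucket to its alternate and back again. 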
+ * + * Expert-level API + */ + static int alternateIndex(int bucket, int fingerprint, int numBuckets) { + /* + Reference impl uses murmur2 mixing constant: + https://github.com/efficient/cuckoofilter/blob/master/src/cuckoofilter.h#L78 + // NOTE(binfan): originally we use: + // index ^ HashUtil::BobHash((const void*) (&tag), 4)) & table_->INDEXMASK; + // now doing a quick-n-dirty way: + // 0x5bd1e995 is the hash constant from MurmurHash2 + return IndexHash((uint32_t)(index ^ (tag * 0x5bd1e995))); + */ + int index = bucket ^ (fingerprint * 0x5bd1e995); + return hashToIndex(index, numBuckets); + } + + /** + * Given the bucket and entry position, returns the absolute offset + * inside the PackedInts datastructure + */ + private int getOffset(int bucket, int position) { + return (bucket * entriesPerBucket) + position; + } + + /** + * Calculates the fingerprint for a given hash. + * + * The fingerprint is simply the first `bitsPerEntry` number of bits that are non-zero. + * If the entire hash is zero, `(int) 1` is used + * + * Expert-level API + */ + static int fingerprint(int hash, int bitsPerEntry, int fingerprintMask) { + if (hash == 0) { + // we use 0 as "empty" so if the hash actually hashes to zero... return 1 + // Some other impls will re-hash with a salt but this seems simpler + return 1; + } + + for (int i = 0; i + bitsPerEntry <= Long.SIZE; i += bitsPerEntry) { + int v = (hash >> i) & fingerprintMask; + if (v != 0) { + return v; + } + } + return 1; + } + + /** + * Calculate the optimal number of bits per entry + */ + private int bitsPerEntry(double fpp, int numEntriesPerBucket) { + return (int) Math.round(log2((2 * numEntriesPerBucket) / fpp)); + } + + /** + * Calculate the optimal number of entries per bucket. Will return 2, 4 or 8 + * depending on the false positive rate + */ + private int entriesPerBucket(double fpp) { + /* + Empirical constants from paper: + "the space-optimal bucket size depends on the target false positive rate ε: + when ε > 0.002, having two entries per bucket yields slightly better results + than using four entries per bucket; when ε decreases to 0.00001 < ε <= 0.002, + four entries per bucket minimzes space" + */ + + if (fpp > 0.002) { + return 2; + } else if (fpp > 0.00001 && fpp <= 0.002) { + return 4; + } + return 8; + } + + /** + * Calculates the optimal load factor for the filter, given the number of entries + * per bucket. Will return 0.84, 0.955 or 0.98 depending on b + */ + private double getLoadFactor(int b) { + if ((b == 2 || b == 4 || b == 8) == false) { + throw new IllegalArgumentException("b must be one of [2,4,8]"); + } + /* + Empirical constants from the paper: + "With k = 2 hash functions, the load factor α is 50% when bucket size b = 1 (i.e + the hash table is directly mapped), but increases to 84%, 95%, 98% respectively + using bucket size b = 2, 4, 8" + */ + if (b == 2) { + return 0.84D; + } else if (b == 4) { + return 0.955D; + } else { + return 0.98D; + } + } + + /** + * Calculates the optimal number of buckets for this filter. The xor used in the bucketing + * algorithm requires this to be a power of two, so the optimal number of buckets will + * be rounded to the next largest power of two where applicable. 
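+ * + * As a worked example (using the 1,000,000-entry capacity that SetBackedScalingCuckooFilter passes in, with + * 4 entries per bucket and a 0.955 load factor): round(1,000,000 / 0.955 / 4) = 261,780 buckets, which is then + * rounded up to the next power of two, 2^18 = 262,144. 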
+ * + * TODO: there are schemes to avoid powers of two, might want to investigate those + */ + private int getNumBuckets(long capacity, double loadFactor, int b) { + long buckets = Math.round((((double) capacity / loadFactor)) / (double) b); + + // Rounds up to nearest power of 2 + return 1 << -Integer.numberOfLeadingZeros((int)buckets - 1); + } + + private double log2(double x) { + return Math.log(x) / LN_2; + } + + public long getSizeInBytes() { + // (numBuckets, bitsPerEntry, fingerprintMask, entriesPerBucket, count, evictedFingerprint) * 4b == 24b + return data.ramBytesUsed() + 24; + } + + @Override + public int hashCode() { + return Objects.hash(numBuckets, bitsPerEntry, entriesPerBucket, count, evictedFingerprint); + } + + @Override + public boolean equals(Object other) { + if (this == other) { + return true; + } + if (other == null || getClass() != other.getClass()) { + return false; + } + + final CuckooFilter that = (CuckooFilter) other; + return Objects.equals(this.numBuckets, that.numBuckets) + && Objects.equals(this.bitsPerEntry, that.bitsPerEntry) + && Objects.equals(this.entriesPerBucket, that.entriesPerBucket) + && Objects.equals(this.count, that.count) + && Objects.equals(this.evictedFingerprint, that.evictedFingerprint); + } +} diff --git a/server/src/main/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilter.java b/server/src/main/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilter.java new file mode 100644 index 0000000000000..095416e5d9aa5 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilter.java @@ -0,0 +1,408 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.common.util; + +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.common.hash.MurmurHash3; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.io.stream.Writeable; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Objects; +import java.util.Random; +import java.util.Set; +import java.util.function.Consumer; + +/** + * An approximate set membership datastructure that scales as more unique values are inserted. + * Can definitively say if a member does not exist (no false negatives), but may say an item exists + * when it does not (has false positives). Similar in usage to a Bloom Filter. + * + * Internally, the datastructure maintains a Set of hashes up to a specified threshold. This provides + * 100% accurate membership queries. + * + * When the threshold is breached, a list of CuckooFilters is created and used to track membership. 
+ * These filters are approximate similar to Bloom Filters. + * + * This datastructure scales as more values are inserted by growing the list of CuckooFilters. + * Final size is dependent on the cardinality of data inserted, and the precision specified. + */ +public class SetBackedScalingCuckooFilter implements Writeable { + + /** + * This is the estimated insertion capacity for each individual internal CuckooFilter. + */ + private static final int FILTER_CAPACITY = 1000000; + + /** + * This set is used to track the insertions before we convert over to an approximate + * filter. This gives us 100% accuracy for small cardinalities. This will be null + * if isSetMode = false; + * + * package-private for testing + */ + Set hashes; + + /** + * This list holds our approximate filters, after we have migrated out of a set. + * This will be null if isSetMode = true; + */ + List filters; + + private final int threshold; + private final Random rng; + private final int capacity; + private final double fpp; + private Consumer breaker = aLong -> { + //noop + }; + + // cached here for performance reasons + private int numBuckets = 0; + private int bitsPerEntry = 0; + private int fingerprintMask = 0; + private MurmurHash3.Hash128 scratchHash = new MurmurHash3.Hash128(); + + // True if we are tracking inserts with a set, false otherwise + private boolean isSetMode = true; + + /** + * @param threshold The number of distinct values that should be tracked + * before converting to an approximate representation + * @param rng A random number generator needed for the cuckoo hashing process + * @param fpp the false-positive rate that should be used for the cuckoo filters. + */ + public SetBackedScalingCuckooFilter(int threshold, Random rng, double fpp) { + if (threshold <= 0) { + throw new IllegalArgumentException("[threshold] must be a positive integer"); + } + + // We have to ensure that, in the worst case, two full sets can be converted into + // one cuckoo filter without overflowing. 
This keeps merging logic simpler + if (threshold * 2 > FILTER_CAPACITY) { + throw new IllegalArgumentException("[threshold] must be smaller than [" + (FILTER_CAPACITY / 2) + "]"); + } + if (fpp < 0) { + throw new IllegalArgumentException("[fpp] must be a positive double"); + } + this.hashes = new HashSet<>(threshold); + this.threshold = threshold; + this.rng = rng; + this.capacity = FILTER_CAPACITY; + this.fpp = fpp; + } + + public SetBackedScalingCuckooFilter(SetBackedScalingCuckooFilter other) { + this.threshold = other.threshold; + this.isSetMode = other.isSetMode; + this.rng = other.rng; + this.breaker = other.breaker; + this.capacity = other.capacity; + this.fpp = other.fpp; + if (isSetMode) { + this.hashes = new HashSet<>(other.hashes); + } else { + this.filters = new ArrayList<>(other.filters); + this.numBuckets = filters.get(0).getNumBuckets(); + this.fingerprintMask = filters.get(0).getFingerprintMask(); + this.bitsPerEntry = filters.get(0).getBitsPerEntry(); + } + } + + public SetBackedScalingCuckooFilter(StreamInput in, Random rng) throws IOException { + this.threshold = in.readVInt(); + this.isSetMode = in.readBoolean(); + this.rng = rng; + this.capacity = in.readVInt(); + this.fpp = in.readDouble(); + + if (isSetMode) { + this.hashes = in.readSet(StreamInput::readZLong); + } else { + this.filters = in.readList(in12 -> new CuckooFilter(in12, rng)); + this.numBuckets = filters.get(0).getNumBuckets(); + this.fingerprintMask = filters.get(0).getFingerprintMask(); + this.bitsPerEntry = filters.get(0).getBitsPerEntry(); + } + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeVInt(threshold); + out.writeBoolean(isSetMode); + out.writeVInt(capacity); + out.writeDouble(fpp); + if (isSetMode) { + out.writeCollection(hashes, StreamOutput::writeZLong); + } else { + out.writeList(filters); + } + } + + /** + * Registers a circuit breaker with the datastructure. + * + * CuckooFilter's can "saturate" and refuse to accept any new values. When this happens, + * the datastructure scales by adding a new filter. This new filter's bytes will be tracked + * in the registered breaker when configured. + */ + public void registerBreaker(Consumer breaker) { + this.breaker = Objects.requireNonNull(breaker, "Circuit Breaker Consumer cannot be null"); + breaker.accept(getSizeInBytes()); + } + + /** + * Returns true if the set might contain the provided value, false otherwise. False values are + * 100% accurate, while true values may be a false-positive. + */ + public boolean mightContain(BytesRef value) { + MurmurHash3.Hash128 hash = MurmurHash3.hash128(value.bytes, value.offset, value.length, 0, scratchHash); + return mightContainHash(hash.h1); + } + + /** + * Returns true if the set might contain the provided value, false otherwise. False values are + * 100% accurate, while true values may be a false-positive. + */ + public boolean mightContain(long value) { + long hash = MurmurHash3.murmur64(value); + return mightContainHash(hash); + } + + /** + * Returns true if the set might contain the provided value, false otherwise. False values are + * 100% accurate, while true values may be a false-positive. 
+ */ + private boolean mightContainHash(long hash) { + if (isSetMode) { + return hashes.contains(hash); + } + + // We calculate these once up front for all the filters and use the expert API + int bucket = CuckooFilter.hashToIndex((int) hash, numBuckets); + int fingerprint = CuckooFilter.fingerprint((int) (hash >> 32), bitsPerEntry, fingerprintMask); + int alternateIndex = CuckooFilter.alternateIndex(bucket, fingerprint, numBuckets); + + for (CuckooFilter filter : filters) { + if (filter.mightContainFingerprint(bucket, fingerprint, alternateIndex)) { + return true; + } + } + return false; + } + + /** + * Returns true if any of the filters contain this fingerprint at the specified bucket. + * This is an expert-level API since it is dealing with buckets and fingerprints, not raw values + * being hashed. + */ + private boolean mightContainFingerprint(int bucket, int fingerprint) { + int alternateIndex = CuckooFilter.alternateIndex(bucket, fingerprint, numBuckets); + for (CuckooFilter filter : filters) { + if (filter.mightContainFingerprint(bucket, fingerprint, alternateIndex)) { + return true; + } + } + return false; + } + + /** + * Add's the provided value to the set for tracking + */ + public void add(BytesRef value) { + MurmurHash3.Hash128 hash = MurmurHash3.hash128(value.bytes, value.offset, value.length, 0, scratchHash); + addHash(hash.h1); + } + + /** + * Add's the provided value to the set for tracking + */ + public void add(long value) { + addHash(MurmurHash3.murmur64(value)); + } + + private void addHash(long hash) { + if (isSetMode) { + hashes.add(hash); + maybeConvert(); + return; + } + + boolean success = filters.get(filters.size() - 1).add(hash); + if (success == false) { + // filter is full, create a new one and insert there + CuckooFilter t = new CuckooFilter(capacity, fpp, rng); + t.add(hash); + filters.add(t); + breaker.accept(t.getSizeInBytes()); // make sure we account for the new filter + } + } + + private void maybeConvert() { + if (isSetMode && hashes.size() > threshold) { + convert(); + } + } + + /** + * If we still holding values in a set, convert this filter into an approximate, cuckoo-backed filter. + * This will create a list of CuckooFilters, and null out the set of hashes + */ + void convert() { + if (isSetMode == false) { + throw new IllegalStateException("Cannot convert SetBackedScalingCuckooFilter to approximate " + + "when it has already been converted."); + } + long oldSize = getSizeInBytes(); + + filters = new ArrayList<>(); + CuckooFilter t = new CuckooFilter(capacity, fpp, rng); + // Cache the chosen numBuckets for later use + numBuckets = t.getNumBuckets(); + fingerprintMask = t.getFingerprintMask(); + bitsPerEntry = t.getBitsPerEntry(); + + hashes.forEach(t::add); + filters.add(t); + + hashes = null; + isSetMode = false; + + breaker.accept(-oldSize); // this zeros out the overhead of the set + breaker.accept(getSizeInBytes()); // this adds back in the new overhead of the cuckoo filters + + } + + /** + * Get the approximate size of this datastructure. Approximate because only the Set occupants + * are tracked, not the overhead of the Set itself. + */ + public long getSizeInBytes() { + long bytes = 13; // fpp (double), threshold (int), isSetMode (boolean) + if (hashes != null) { + bytes = (hashes.size() * 16); + } + if (filters != null) { + bytes += filters.stream().mapToLong(CuckooFilter::getSizeInBytes).sum(); + } + return bytes; + } + + + /** + * Merge `other` cuckoo filter into this cuckoo. 
After merging, this filter's state will + * be the union of the two. During the merging process, the internal Set may be upgraded + * to a cuckoo if it goes over threshold + */ + public void merge(SetBackedScalingCuckooFilter other) { + // Some basic sanity checks to make sure we can merge + if (this.threshold != other.threshold) { + throw new IllegalStateException("Cannot merge other CuckooFilter because thresholds do not match: [" + + this.threshold + "] vs [" + other.threshold + "]"); + } + if (this.capacity != other.capacity) { + throw new IllegalStateException("Cannot merge other CuckooFilter because capacities do not match: [" + + this.capacity + "] vs [" + other.capacity + "]"); + } + if (this.fpp != other.fpp) { + throw new IllegalStateException("Cannot merge other CuckooFilter because precisions do not match: [" + + this.fpp + "] vs [" + other.fpp + "]"); + } + + if (isSetMode && other.isSetMode) { + // Both in sets, merge collections then see if we need to convert to cuckoo + hashes.addAll(other.hashes); + maybeConvert(); + } else if (isSetMode && other.isSetMode == false) { + // Other is in cuckoo mode, so we convert our set to a cuckoo, then + // call the merge function again. Since both are now in set-mode + // this will fall through to the last conditional and do a cuckoo-cuckoo merge + convert(); + merge(other); + } else if (isSetMode == false && other.isSetMode) { + // Rather than converting the other to a cuckoo first, we can just + // replay the values directly into our filter. + other.hashes.forEach(this::add); + } else { + // Both are in cuckoo mode, merge raw fingerprints + + CuckooFilter currentFilter = filters.get(filters.size() - 1); + + for (CuckooFilter otherFilter : other.filters) { + + // The iterator returns an array of longs corresponding to the + // fingerprints for buckets at the current position + Iterator iter = otherFilter.getBuckets(); + int bucket = 0; + while (iter.hasNext()) { + long[] fingerprints = iter.next(); + + // We check to see if the fingerprint is present in any of the existing filters + // (in the same bucket/alternate bucket), or if the fingerprint is empty. 
In these cases + // we can skip the fingerprint + for (long fingerprint : fingerprints) { + if (fingerprint == CuckooFilter.EMPTY || mightContainFingerprint(bucket, (int) fingerprint)) { + continue; + } + // Try to insert into the last filter in our list + if (currentFilter.mergeFingerprint(bucket, (int) fingerprint) == false) { + // if we failed, the filter is now saturated and we need to create a new one + CuckooFilter t = new CuckooFilter(capacity, fpp, rng); + filters.add(t); + breaker.accept(t.getSizeInBytes()); // make sure we account for the new filter + + currentFilter = filters.get(filters.size() - 1); + } + } + bucket += 1; + } + } + } + } + + + @Override + public int hashCode() { + return Objects.hash(hashes, filters, threshold, isSetMode, capacity, fpp); + } + + @Override + public boolean equals(Object other) { + if (this == other) { + return true; + } + if (other == null || getClass() != other.getClass()) { + return false; + } + + final SetBackedScalingCuckooFilter that = (SetBackedScalingCuckooFilter) other; + return Objects.equals(this.hashes, that.hashes) + && Objects.equals(this.filters, that.filters) + && Objects.equals(this.threshold, that.threshold) + && Objects.equals(this.isSetMode, that.isSetMode) + && Objects.equals(this.capacity, that.capacity) + && Objects.equals(this.fpp, that.fpp); + } +} diff --git a/server/src/main/java/org/elasticsearch/search/SearchModule.java b/server/src/main/java/org/elasticsearch/search/SearchModule.java index fc3bdcfda8ecf..01a4aa66810c8 100644 --- a/server/src/main/java/org/elasticsearch/search/SearchModule.java +++ b/server/src/main/java/org/elasticsearch/search/SearchModule.java @@ -153,9 +153,13 @@ import org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristic; import org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristicParser; import org.elasticsearch.search.aggregations.bucket.terms.DoubleTerms; +import org.elasticsearch.search.aggregations.bucket.terms.LongRareTerms; import org.elasticsearch.search.aggregations.bucket.terms.LongTerms; +import org.elasticsearch.search.aggregations.bucket.terms.RareTermsAggregationBuilder; +import org.elasticsearch.search.aggregations.bucket.terms.StringRareTerms; import org.elasticsearch.search.aggregations.bucket.terms.StringTerms; import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregationBuilder; +import org.elasticsearch.search.aggregations.bucket.terms.UnmappedRareTerms; import org.elasticsearch.search.aggregations.bucket.terms.UnmappedTerms; import org.elasticsearch.search.aggregations.metrics.AvgAggregationBuilder; import org.elasticsearch.search.aggregations.metrics.CardinalityAggregationBuilder; @@ -390,6 +394,11 @@ private void registerAggregations(List plugins) { .addResultReader(UnmappedTerms.NAME, UnmappedTerms::new) .addResultReader(LongTerms.NAME, LongTerms::new) .addResultReader(DoubleTerms.NAME, DoubleTerms::new)); + registerAggregation(new AggregationSpec(RareTermsAggregationBuilder.NAME, RareTermsAggregationBuilder::new, + RareTermsAggregationBuilder::parse) + .addResultReader(StringRareTerms.NAME, StringRareTerms::new) + .addResultReader(UnmappedRareTerms.NAME, UnmappedRareTerms::new) + .addResultReader(LongRareTerms.NAME, LongRareTerms::new)); registerAggregation(new AggregationSpec(SignificantTermsAggregationBuilder.NAME, SignificantTermsAggregationBuilder::new, SignificantTermsAggregationBuilder.getParser(significanceHeuristicParserRegistry)) .addResultReader(SignificantStringTerms.NAME, 
SignificantStringTerms::new) diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/BucketsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/BucketsAggregator.java index 71dacc698bee6..a4ef4286447c1 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/BucketsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/BucketsAggregator.java @@ -90,7 +90,9 @@ public final void mergeBuckets(long[] mergeMap, long newNumBuckets) { docCounts.fill(0, newNumBuckets, 0); for (int i = 0; i < oldDocCounts.size(); i++) { int docCount = oldDocCounts.get(i); - if (docCount != 0) { + + // Skip any in the map which have been "removed", signified with -1 + if (docCount != 0 && mergeMap[i] != -1) { docCounts.increment(mergeMap[i], docCount); } } diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/MergingBucketsDeferringCollector.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/MergingBucketsDeferringCollector.java index b293cc53a3629..bff5015846951 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/MergingBucketsDeferringCollector.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/MergingBucketsDeferringCollector.java @@ -37,27 +37,87 @@ public MergingBucketsDeferringCollector(SearchContext context, boolean isGlobal) super(context, isGlobal); } +/** + * Merges/prunes the existing bucket ordinals and docDeltas according to the provided mergeMap. + * + * The mergeMap is an array where the index position represents the current bucket ordinal, and + * the value at that position represents the ordinal the bucket should be merged with. If + * the value is set to -1 it is removed entirely. + * + * For example, if the mergeMap [1,1,3,-1,3] is provided: + * - Buckets `0` and `1` will be merged to bucket ordinal `1` + * - Buckets `2` and `4` will be merged to ordinal `3` + * - Bucket `3` will be removed entirely + * + * This process rebuilds the ordinals and docDeltas according to the mergeMap, so it should + * not be called unless there are actually changes to be made, to avoid unnecessary work.
+ */ public void mergeBuckets(long[] mergeMap) { List newEntries = new ArrayList<>(entries.size()); for (Entry sourceEntry : entries) { PackedLongValues.Builder newBuckets = PackedLongValues.packedBuilder(PackedInts.DEFAULT); + PackedLongValues.Builder newDocDeltas = PackedLongValues.packedBuilder(PackedInts.DEFAULT); + PackedLongValues.Iterator docDeltasItr = sourceEntry.docDeltas.iterator(); + + long lastGoodDelta = 0; for (PackedLongValues.Iterator itr = sourceEntry.buckets.iterator(); itr.hasNext();) { long bucket = itr.next(); - newBuckets.add(mergeMap[Math.toIntExact(bucket)]); + assert docDeltasItr.hasNext(); + long delta = docDeltasItr.next(); + + // Only merge in the ordinal if it hasn't been "removed", signified with -1 + long ordinal = mergeMap[Math.toIntExact(bucket)]; + + if (ordinal != -1) { + newBuckets.add(ordinal); + newDocDeltas.add(delta + lastGoodDelta); + lastGoodDelta = 0; + } else { + // we are skipping this ordinal, which means we need to accumulate the + // doc delta's since the last "good" delta + lastGoodDelta += delta; + } + } + // Only create an entry if this segment has buckets after merging + if (newBuckets.size() > 0) { + assert newDocDeltas.size() > 0 : "docDeltas was empty but we had buckets"; + newEntries.add(new Entry(sourceEntry.context, newDocDeltas.build(), newBuckets.build())); } - newEntries.add(new Entry(sourceEntry.context, sourceEntry.docDeltas, newBuckets.build())); } entries = newEntries; // if there are buckets that have been collected in the current segment // we need to update the bucket ordinals there too - if (bucketsBuilder.size() > 0) { + if (bucketsBuilder != null && bucketsBuilder.size() > 0) { PackedLongValues currentBuckets = bucketsBuilder.build(); PackedLongValues.Builder newBuckets = PackedLongValues.packedBuilder(PackedInts.DEFAULT); + PackedLongValues.Builder newDocDeltas = PackedLongValues.packedBuilder(PackedInts.DEFAULT); + + // The current segment's deltas aren't built yet, so build to a temp object + PackedLongValues currentDeltas = docDeltasBuilder.build(); + PackedLongValues.Iterator docDeltasItr = currentDeltas.iterator(); + + long lastGoodDelta = 0; for (PackedLongValues.Iterator itr = currentBuckets.iterator(); itr.hasNext();) { long bucket = itr.next(); - newBuckets.add(mergeMap[Math.toIntExact(bucket)]); + assert docDeltasItr.hasNext(); + long delta = docDeltasItr.next(); + long ordinal = mergeMap[Math.toIntExact(bucket)]; + + // Only merge in the ordinal if it hasn't been "removed", signified with -1 + if (ordinal != -1) { + newBuckets.add(ordinal); + newDocDeltas.add(delta + lastGoodDelta); + lastGoodDelta = 0; + } else { + // we are skipping this ordinal, which means we need to accumulate the + // doc delta's since the last "good" delta. + // The first is skipped because the original deltas are stored as offsets from first doc, + // not offsets from 0 + lastGoodDelta += delta; + } } + docDeltasBuilder = newDocDeltas; bucketsBuilder = newBuckets; } } diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java new file mode 100644 index 0000000000000..2bbe3c01988df --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java @@ -0,0 +1,134 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. 
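// Illustrative sketch (not part of this change) of the mergeMap semantics documented
// above: the index is the old bucket ordinal, the value is the ordinal it merges into,
// and -1 means the bucket is pruned. Plain arrays stand in for the packed structures.
class MergeMapSketch {
    public static void main(String[] args) {
        long[] mergeMap  = {1, 1, 3, -1, 3};
        long[] docCounts = {2, 5, 1, 7, 4};      // doc counts for old ordinals 0..4
        long[] merged    = new long[4];          // new ordinal space
        for (int oldOrd = 0; oldOrd < docCounts.length; oldOrd++) {
            long newOrd = mergeMap[oldOrd];
            if (newOrd != -1) {                  // old ordinal 3 is dropped entirely
                merged[(int) newOrd] += docCounts[oldOrd];
            }
        }
        // merged == [0, 7, 0, 5]: ordinals 0+1 fold into 1, ordinals 2+4 fold into 3
        System.out.println(java.util.Arrays.toString(merged));
    }
}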
See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.search.aggregations.bucket.terms; + +import org.elasticsearch.common.util.SetBackedScalingCuckooFilter; +import org.elasticsearch.search.DocValueFormat; +import org.elasticsearch.search.aggregations.Aggregator; +import org.elasticsearch.search.aggregations.AggregatorFactories; +import org.elasticsearch.search.aggregations.BucketOrder; +import org.elasticsearch.search.aggregations.LeafBucketCollector; +import org.elasticsearch.search.aggregations.bucket.DeferableBucketAggregator; +import org.elasticsearch.search.aggregations.bucket.DeferringBucketCollector; +import org.elasticsearch.search.aggregations.bucket.MergingBucketsDeferringCollector; +import org.elasticsearch.search.aggregations.bucket.nested.NestedAggregator; +import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; +import org.elasticsearch.search.aggregations.support.ValuesSource; +import org.elasticsearch.search.internal.SearchContext; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.Random; + +public abstract class AbstractRareTermsAggregator extends DeferableBucketAggregator { + + static final BucketOrder ORDER = BucketOrder.compound(BucketOrder.count(true), BucketOrder.key(true)); // sort by count ascending + + protected final long maxDocCount; + protected final double precision; + protected final DocValueFormat format; + protected final T valuesSource; + protected final U includeExclude; + + MergingBucketsDeferringCollector deferringCollector; + LeafBucketCollector subCollectors; + final SetBackedScalingCuckooFilter filter; + + AbstractRareTermsAggregator(String name, AggregatorFactories factories, SearchContext context, + Aggregator parent, List pipelineAggregators, + Map metaData, long maxDocCount, double precision, + DocValueFormat format, T valuesSource, U includeExclude) throws IOException { + super(name, factories, context, parent, pipelineAggregators, metaData); + + // We seed the rng with the ShardID so results are deterministic and don't change randomly + this.filter = new SetBackedScalingCuckooFilter(10000, new Random(context.indexShard().shardId().hashCode()), precision); + this.filter.registerBreaker(this::addRequestCircuitBreakerBytes); + + this.maxDocCount = maxDocCount; + this.precision = precision; + this.format = format; + this.valuesSource = valuesSource; + this.includeExclude = includeExclude; + String scoringAgg = subAggsNeedScore(); + String nestedAgg = descendsFromNestedAggregator(parent); + if (scoringAgg != null && nestedAgg != null) { + /* + * Terms agg would force the collect mode to depth_first here, because + * we need to access the score of nested documents in a sub-aggregation + * and we are not able to generate this score while replaying deferred documents. 
+ * + * But the RareTerms agg _must_ execute in breadth first since it relies on + * deferring execution, so we just have to throw up our hands and refuse + */ + throw new IllegalStateException("RareTerms agg [" + name() + "] is the child of the nested agg [" + nestedAgg + + "], and also has a scoring child agg [" + scoringAgg + "]. This combination is not supported because " + + "it requires executing in [depth_first] mode, which the RareTerms agg cannot do."); + } + } + + @Override + protected boolean shouldDefer(Aggregator aggregator) { + return true; + } + + @Override + public DeferringBucketCollector getDeferringCollector() { + deferringCollector = new MergingBucketsDeferringCollector(context, descendsFromGlobalAggregator(parent())); + return deferringCollector; + } + + private String subAggsNeedScore() { + for (Aggregator subAgg : subAggregators) { + if (subAgg.scoreMode().needsScores()) { + return subAgg.name(); + } + } + return null; + } + + private String descendsFromNestedAggregator(Aggregator parent) { + while (parent != null) { + if (parent.getClass() == NestedAggregator.class) { + return parent.name(); + } + parent = parent.parent(); + } + return null; + } + + protected void doCollect(V val, int docId) throws IOException { + long bucketOrdinal = addValueToOrds(val); + + if (bucketOrdinal < 0) { // already seen + bucketOrdinal = -1 - bucketOrdinal; + collectExistingBucket(subCollectors, docId, bucketOrdinal); + } else { + collectBucket(subCollectors, docId, bucketOrdinal); + } + } + + /** + * Add's the value to the ordinal map. Return the newly allocated id if it wasn't in the ordinal map yet, + * or -1-id if it was already present + */ + abstract long addValueToOrds(V value); +} diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/IncludeExclude.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/IncludeExclude.java index 8154108f9f0bc..30653f04a355a 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/IncludeExclude.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/IncludeExclude.java @@ -137,10 +137,12 @@ public static IncludeExclude parseExclude(XContentParser parser) throws IOExcept } } + public abstract static class Filter {} + // The includeValue and excludeValue ByteRefs which are the result of the parsing // process are converted into a LongFilter when used on numeric fields // in the index. - public abstract static class LongFilter { + public abstract static class LongFilter extends Filter { public abstract boolean accept(long value); } @@ -183,7 +185,7 @@ private void addReject(long val) { } // Only used for the 'map' execution mode (ie. 
scripts) - public abstract static class StringFilter { + public abstract static class StringFilter extends Filter { public abstract boolean accept(BytesRef value); } @@ -231,7 +233,7 @@ public boolean accept(BytesRef value) { } } - public abstract static class OrdinalsFilter { + public abstract static class OrdinalsFilter extends Filter { public abstract LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) throws IOException; } diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalMappedRareTerms.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalMappedRareTerms.java new file mode 100644 index 0000000000000..d774d09fa1862 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalMappedRareTerms.java @@ -0,0 +1,182 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.search.aggregations.bucket.terms; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.apache.lucene.util.CollectionUtil; +import org.elasticsearch.common.Randomness; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.util.SetBackedScalingCuckooFilter; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.search.DocValueFormat; +import org.elasticsearch.search.aggregations.AggregationExecutionException; +import org.elasticsearch.search.aggregations.BucketOrder; +import org.elasticsearch.search.aggregations.InternalAggregation; +import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.function.Function; +import java.util.stream.Collectors; + +public abstract class InternalMappedRareTerms, B extends InternalRareTerms.Bucket> + extends InternalRareTerms { + + protected DocValueFormat format; + protected List buckets; + protected Map bucketMap; + + final SetBackedScalingCuckooFilter filter; + + protected final Logger logger = LogManager.getLogger(getClass()); + + InternalMappedRareTerms(String name, BucketOrder order, List pipelineAggregators, + Map metaData, DocValueFormat format, + List buckets, long maxDocCount, SetBackedScalingCuckooFilter filter) { + super(name, order, maxDocCount, pipelineAggregators, metaData); + this.format = format; + this.buckets = buckets; + this.filter = filter; + } + + public long getMaxDocCount() { + return maxDocCount; + } + + SetBackedScalingCuckooFilter getFilter() { + return filter; + } + + /** + * Read from a stream. 
+ */ + InternalMappedRareTerms(StreamInput in, Bucket.Reader bucketReader) throws IOException { + super(in); + format = in.readNamedWriteable(DocValueFormat.class); + buckets = in.readList(stream -> bucketReader.read(stream, format)); + filter = new SetBackedScalingCuckooFilter(in, Randomness.get()); + } + + @Override + protected void writeTermTypeInfoTo(StreamOutput out) throws IOException { + out.writeNamedWriteable(format); + out.writeList(buckets); + filter.writeTo(out); + } + + @Override + public InternalAggregation doReduce(List aggregations, ReduceContext reduceContext) { + Map> buckets = new HashMap<>(); + InternalRareTerms referenceTerms = null; + SetBackedScalingCuckooFilter filter = null; + + for (InternalAggregation aggregation : aggregations) { + // Unmapped rare terms don't have a cuckoo filter so we'll skip all this work + // and save some type casting headaches later. + if (aggregation.isMapped() == false) { + continue; + } + + @SuppressWarnings("unchecked") + InternalRareTerms terms = (InternalRareTerms) aggregation; + if (referenceTerms == null && aggregation.getClass().equals(UnmappedRareTerms.class) == false) { + referenceTerms = terms; + } + if (referenceTerms != null && + referenceTerms.getClass().equals(terms.getClass()) == false && + terms.getClass().equals(UnmappedRareTerms.class) == false) { + // control gets into this loop when the same field name against which the query is executed + // is of different types in different indices. + throw new AggregationExecutionException("Merging/Reducing the aggregations failed when computing the aggregation [" + + referenceTerms.getName() + "] because the field you gave in the aggregation query existed as two different " + + "types in two different indices"); + } + for (B bucket : terms.getBuckets()) { + List bucketList = buckets.computeIfAbsent(bucket.getKey(), k -> new ArrayList<>()); + bucketList.add(bucket); + } + + SetBackedScalingCuckooFilter otherFilter = ((InternalMappedRareTerms)aggregation).getFilter(); + if (filter == null) { + filter = new SetBackedScalingCuckooFilter(otherFilter); + } else { + filter.merge(otherFilter); + } + } + + final List rare = new ArrayList<>(); + for (List sameTermBuckets : buckets.values()) { + final B b = sameTermBuckets.get(0).reduce(sameTermBuckets, reduceContext); + if ((b.getDocCount() <= maxDocCount && containsTerm(filter, b) == false)) { + rare.add(b); + reduceContext.consumeBucketsAndMaybeBreak(1); + } else if (b.getDocCount() > maxDocCount) { + // this term has gone over threshold while merging, so add it to the filter. 
+ // Note this may happen during incremental reductions too + addToFilter(filter, b); + } + } + CollectionUtil.introSort(rare, order.comparator(null)); + return createWithFilter(name, rare, filter); + } + + public abstract boolean containsTerm(SetBackedScalingCuckooFilter filter, B bucket); + + public abstract void addToFilter(SetBackedScalingCuckooFilter filter, B bucket); + + @Override + public List getBuckets() { + return buckets; + } + + @Override + public B getBucketByKey(String term) { + if (bucketMap == null) { + bucketMap = buckets.stream().collect(Collectors.toMap(InternalRareTerms.Bucket::getKeyAsString, Function.identity())); + } + return bucketMap.get(term); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) return true; + if (obj == null || getClass() != obj.getClass()) return false; + if (super.equals(obj) == false) return false; + InternalMappedRareTerms that = (InternalMappedRareTerms) obj; + return Objects.equals(buckets, that.buckets) + && Objects.equals(format, that.format) + && Objects.equals(filter, that.filter); + } + + @Override + public int hashCode() { + return Objects.hash(super.hashCode(), buckets, format, filter); + } + + @Override + public final XContentBuilder doXContentBody(XContentBuilder builder, Params params) throws IOException { + return doXContentCommon(builder, params, buckets); + } +} diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalRareTerms.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalRareTerms.java new file mode 100644 index 0000000000000..dd1a0c19200cf --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalRareTerms.java @@ -0,0 +1,205 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
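// Illustrative sketch (not part of this change), approximating the per-term decision made
// in doReduce() above, with a HashSet standing in for the cuckoo filter. maxDocCount = 1
// and the term names are example values only.
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;

class RareTermsReduceSketch {
    public static void main(String[] args) {
        long maxDocCount = 1;
        Set<String> filter = new HashSet<>();            // terms known to be over threshold
        Map<String, Long> mergedCounts = new LinkedHashMap<>();
        mergedCounts.put("swing", 1L);                   // still rare after merging shard results
        mergedCounts.put("jazz", 2L);                    // went over threshold while merging

        for (Map.Entry<String, Long> e : mergedCounts.entrySet()) {
            if (e.getValue() <= maxDocCount && filter.contains(e.getKey()) == false) {
                System.out.println("rare bucket: " + e.getKey());   // only "swing" is returned
            } else if (e.getValue() > maxDocCount) {
                filter.add(e.getKey());                  // remembered so later partial reductions drop it
            }
        }
    }
}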
+ */ +package org.elasticsearch.search.aggregations.bucket.terms; + +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.util.SetBackedScalingCuckooFilter; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.search.DocValueFormat; +import org.elasticsearch.search.aggregations.Aggregations; +import org.elasticsearch.search.aggregations.BucketOrder; +import org.elasticsearch.search.aggregations.InternalAggregation; +import org.elasticsearch.search.aggregations.InternalAggregations; +import org.elasticsearch.search.aggregations.InternalMultiBucketAggregation; +import org.elasticsearch.search.aggregations.InternalOrder; +import org.elasticsearch.search.aggregations.KeyComparable; +import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Objects; + +public abstract class InternalRareTerms, B extends InternalRareTerms.Bucket> + extends InternalMultiBucketAggregation implements RareTerms { + + public abstract static class Bucket> extends InternalMultiBucketAggregation.InternalBucket + implements RareTerms.Bucket, KeyComparable { + /** + * Reads a bucket. Should be a constructor reference. + */ + @FunctionalInterface + public interface Reader> { + B read(StreamInput in, DocValueFormat format) throws IOException; + } + + long bucketOrd; + + protected long docCount; + protected InternalAggregations aggregations; + protected final DocValueFormat format; + + protected Bucket(long docCount, InternalAggregations aggregations, DocValueFormat formatter) { + this.format = formatter; + this.docCount = docCount; + this.aggregations = aggregations; + } + + /** + * Read from a stream. 
+ */ + protected Bucket(StreamInput in, DocValueFormat formatter) throws IOException { + this.format = formatter; + docCount = in.readVLong(); + aggregations = new InternalAggregations(in); + } + + @Override + public final void writeTo(StreamOutput out) throws IOException { + out.writeVLong(getDocCount()); + aggregations.writeTo(out); + writeTermTo(out); + } + + protected abstract void writeTermTo(StreamOutput out) throws IOException; + + @Override + public long getDocCount() { + return docCount; + } + + @Override + public Aggregations getAggregations() { + return aggregations; + } + + abstract B newBucket(long docCount, InternalAggregations aggs); + + public B reduce(List buckets, ReduceContext context) { + long docCount = 0; + List aggregationsList = new ArrayList<>(buckets.size()); + for (B bucket : buckets) { + docCount += bucket.docCount; + aggregationsList.add(bucket.aggregations); + } + InternalAggregations aggs = InternalAggregations.reduce(aggregationsList, context); + return newBucket(docCount, aggs); + } + + @Override + public final XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + builder.startObject(); + keyToXContent(builder); + builder.field(CommonFields.DOC_COUNT.getPreferredName(), getDocCount()); + aggregations.toXContentInternal(builder, params); + builder.endObject(); + return builder; + } + + protected abstract XContentBuilder keyToXContent(XContentBuilder builder) throws IOException; + + @Override + public boolean equals(Object obj) { + if (obj == null || getClass() != obj.getClass()) { + return false; + } + Bucket that = (Bucket) obj; + return Objects.equals(docCount, that.docCount) + && Objects.equals(aggregations, that.aggregations); + } + + @Override + public int hashCode() { + return Objects.hash(getClass(), docCount, aggregations); + } + } + + protected final BucketOrder order; + protected final long maxDocCount; + + protected InternalRareTerms(String name, BucketOrder order, long maxDocCount, + List pipelineAggregators, Map metaData) { + super(name, pipelineAggregators, metaData); + this.order = order; + this.maxDocCount = maxDocCount; + } + + /** + * Read from a stream. + */ + protected InternalRareTerms(StreamInput in) throws IOException { + super(in); + order = InternalOrder.Streams.readOrder(in); + maxDocCount = in.readVLong(); + } + + @Override + protected final void doWriteTo(StreamOutput out) throws IOException { + order.writeTo(out); + out.writeVLong(maxDocCount); + writeTermTypeInfoTo(out); + } + + protected abstract void writeTermTypeInfoTo(StreamOutput out) throws IOException; + + @Override + public abstract List getBuckets(); + + @Override + public abstract B getBucketByKey(String term); + + @Override + public InternalAggregation doReduce(List aggregations, ReduceContext reduceContext) { + throw new UnsupportedOperationException(); + } + + protected abstract A createWithFilter(String name, List buckets, SetBackedScalingCuckooFilter filter); + + /** + * Create an array to hold some buckets. Used in collecting the results. 
+ */ + protected abstract B[] createBucketsArray(int size); + + @Override + public boolean equals(Object obj) { + if (this == obj) return true; + if (obj == null || getClass() != obj.getClass()) return false; + if (super.equals(obj) == false) return false; + InternalRareTerms that = (InternalRareTerms) obj; + return Objects.equals(maxDocCount, that.maxDocCount) + && Objects.equals(order, that.order); + } + + @Override + public int hashCode() { + return Objects.hash(super.hashCode(), maxDocCount, order); + } + + protected static XContentBuilder doXContentCommon(XContentBuilder builder, Params params, + List buckets) throws IOException { + builder.startArray(CommonFields.BUCKETS.getPreferredName()); + for (Bucket bucket : buckets) { + bucket.toXContent(builder, params); + } + builder.endArray(); + return builder; + } +} diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTerms.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTerms.java new file mode 100644 index 0000000000000..29f84fb6030e1 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTerms.java @@ -0,0 +1,156 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.search.aggregations.bucket.terms; + + +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.util.SetBackedScalingCuckooFilter; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.search.DocValueFormat; +import org.elasticsearch.search.aggregations.BucketOrder; +import org.elasticsearch.search.aggregations.InternalAggregations; +import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.Objects; + +/** + * Result of the RareTerms aggregation when the field is some kind of whole number like a integer, long, or a date. + */ +public class LongRareTerms extends InternalMappedRareTerms { + public static final String NAME = "lrareterms"; + + public static class Bucket extends InternalRareTerms.Bucket { + long term; + + public Bucket(long term, long docCount, InternalAggregations aggregations, DocValueFormat format) { + super(docCount, aggregations, format); + this.term = term; + } + + /** + * Read from a stream. 
+ */ + public Bucket(StreamInput in, DocValueFormat format) throws IOException { + super(in, format); + term = in.readLong(); + } + + @Override + protected void writeTermTo(StreamOutput out) throws IOException { + out.writeLong(term); + } + + @Override + public String getKeyAsString() { + return format.format(term).toString(); + } + + @Override + public Object getKey() { + return term; + } + + @Override + public Number getKeyAsNumber() { + return term; + } + + @Override + public int compareKey(Bucket other) { + return Long.compare(term, other.term); + } + + @Override + Bucket newBucket(long docCount, InternalAggregations aggs) { + return new Bucket(term, docCount, aggs, format); + } + + @Override + protected final XContentBuilder keyToXContent(XContentBuilder builder) throws IOException { + builder.field(CommonFields.KEY.getPreferredName(), term); + if (format != DocValueFormat.RAW) { + builder.field(CommonFields.KEY_AS_STRING.getPreferredName(), format.format(term).toString()); + } + return builder; + } + + @Override + public boolean equals(Object obj) { + return super.equals(obj) && Objects.equals(term, ((Bucket) obj).term); + } + + @Override + public int hashCode() { + return Objects.hash(super.hashCode(), term); + } + } + + LongRareTerms(String name, BucketOrder order, List pipelineAggregators, + Map metaData, DocValueFormat format, + List buckets, long maxDocCount, SetBackedScalingCuckooFilter filter) { + super(name, order, pipelineAggregators, metaData, format, buckets, maxDocCount, filter); + } + + /** + * Read from a stream. + */ + public LongRareTerms(StreamInput in) throws IOException { + super(in, LongRareTerms.Bucket::new); + } + + @Override + public String getWriteableName() { + return NAME; + } + + @Override + public LongRareTerms create(List buckets) { + return new LongRareTerms(name, order, pipelineAggregators(), metaData, format, buckets, maxDocCount, filter); + } + + @Override + public LongRareTerms.Bucket createBucket(InternalAggregations aggregations, LongRareTerms.Bucket prototype) { + return new LongRareTerms.Bucket(prototype.term, prototype.getDocCount(), aggregations, prototype.format); + } + + @Override + protected LongRareTerms createWithFilter(String name, List buckets, SetBackedScalingCuckooFilter filter) { + return new LongRareTerms(name, order, pipelineAggregators(), getMetaData(), format, + buckets, maxDocCount, filter); + } + + @Override + protected LongRareTerms.Bucket[] createBucketsArray(int size) { + return new LongRareTerms.Bucket[size]; + } + + @Override + public boolean containsTerm(SetBackedScalingCuckooFilter filter, LongRareTerms.Bucket bucket) { + return filter.mightContain((long) bucket.getKey()); + } + + @Override + public void addToFilter(SetBackedScalingCuckooFilter filter, LongRareTerms.Bucket bucket) { + filter.add((long) bucket.getKey()); + } +} diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTermsAggregator.java new file mode 100644 index 0000000000000..b1d294fefdcf6 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTermsAggregator.java @@ -0,0 +1,169 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. 
Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.search.aggregations.bucket.terms; + +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.SortedNumericDocValues; +import org.apache.lucene.util.CollectionUtil; +import org.elasticsearch.common.lease.Releasables; +import org.elasticsearch.common.util.LongHash; +import org.elasticsearch.search.DocValueFormat; +import org.elasticsearch.search.aggregations.Aggregator; +import org.elasticsearch.search.aggregations.AggregatorFactories; +import org.elasticsearch.search.aggregations.InternalAggregation; +import org.elasticsearch.search.aggregations.LeafBucketCollector; +import org.elasticsearch.search.aggregations.LeafBucketCollectorBase; +import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; +import org.elasticsearch.search.aggregations.support.ValuesSource; +import org.elasticsearch.search.internal.SearchContext; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +import static java.util.Collections.emptyList; + +/** + * An aggregator that finds "rare" string values (e.g. terms agg that orders ascending) + */ +public class LongRareTermsAggregator extends AbstractRareTermsAggregator { + + protected LongHash bucketOrds; + + LongRareTermsAggregator(String name, AggregatorFactories factories, ValuesSource.Numeric valuesSource, DocValueFormat format, + SearchContext aggregationContext, Aggregator parent, IncludeExclude.LongFilter longFilter, + int maxDocCount, double precision, List pipelineAggregators, + Map metaData) throws IOException { + super(name, factories, aggregationContext, parent, pipelineAggregators, metaData, maxDocCount, precision, + format, valuesSource, longFilter); + this.bucketOrds = new LongHash(1, aggregationContext.bigArrays()); + } + + protected SortedNumericDocValues getValues(ValuesSource.Numeric valuesSource, LeafReaderContext ctx) throws IOException { + return valuesSource.longValues(ctx); + } + + @Override + public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, + final LeafBucketCollector sub) throws IOException { + final SortedNumericDocValues values = getValues(valuesSource, ctx); + if (subCollectors == null) { + subCollectors = sub; + } + return new LeafBucketCollectorBase(sub, values) { + + @Override + public void collect(int docId, long owningBucketOrdinal) throws IOException { + if (values.advanceExact(docId)) { + final int valuesCount = values.docValueCount(); + long previous = Long.MAX_VALUE; + for (int i = 0; i < valuesCount; ++i) { + final long val = values.nextValue(); + if (previous != val || i == 0) { + if ((includeExclude == null) || (includeExclude.accept(val))) { + doCollect(val, docId); + } + previous = val; + } + } + } + } + }; + } + + @Override + long addValueToOrds(Long value) { + return bucketOrds.add(value); + } + + /** + * Merges the ordinals to a minimal set, populates the CuckooFilter and + * generates a final set 
of buckets. + * + * If a term is below the maxDocCount, it is turned into a Bucket. Otherwise, + * the term is added to the filter, and pruned from the ordinal map. If + * necessary the ordinal map is merged down to a minimal set to remove deletions + */ + private List buildSketch() { + long deletionCount = 0; + LongHash newBucketOrds = new LongHash(1, context.bigArrays()); + List buckets = new ArrayList<>(); + try (LongHash oldBucketOrds = bucketOrds) { + + long[] mergeMap = new long[(int) oldBucketOrds.size()]; + for (int i = 0; i < oldBucketOrds.size(); i++) { + long oldKey = oldBucketOrds.get(i); + long newBucketOrd = -1; + + long docCount = bucketDocCount(i); + // if the key is below threshold, reinsert into the new ords + if (docCount <= maxDocCount) { + newBucketOrd = newBucketOrds.add(oldKey); + LongRareTerms.Bucket bucket = new LongRareTerms.Bucket(oldKey, docCount, null, format); + bucket.bucketOrd = newBucketOrd; + buckets.add(bucket); + + consumeBucketsAndMaybeBreak(1); + } else { + // Make a note when one of the ords has been deleted + deletionCount += 1; + filter.add(oldKey); + } + mergeMap[i] = newBucketOrd; + } + + // Only merge/delete the ordinals if we have actually deleted one, + // to save on some redundant work + if (deletionCount > 0) { + mergeBuckets(mergeMap, newBucketOrds.size()); + if (deferringCollector != null) { + deferringCollector.mergeBuckets(mergeMap); + } + } + } + bucketOrds = newBucketOrds; + return buckets; + } + + @Override + public InternalAggregation buildAggregation(long owningBucketOrdinal) throws IOException { + assert owningBucketOrdinal == 0; + List buckets = buildSketch(); + runDeferredCollections(buckets.stream().mapToLong(b -> b.bucketOrd).toArray()); + + // Finalize the buckets + for (LongRareTerms.Bucket bucket : buckets) { + bucket.aggregations = bucketAggregations(bucket.bucketOrd); + } + + CollectionUtil.introSort(buckets, ORDER.comparator(this)); + return new LongRareTerms(name, ORDER, pipelineAggregators(), metaData(), format, buckets, maxDocCount, filter); + } + + @Override + public InternalAggregation buildEmptyAggregation() { + return new LongRareTerms(name, ORDER, pipelineAggregators(), metaData(), format, emptyList(), 0, filter); + } + + @Override + public void doClose() { + Releasables.close(bucketOrds); + } +} diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTerms.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTerms.java new file mode 100644 index 0000000000000..2248514783264 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTerms.java @@ -0,0 +1,48 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
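// Illustrative sketch (not part of this change) of the shard-local pruning done by
// buildSketch() above: terms at or below maxDocCount stay as candidate buckets, terms
// above it move into the filter, and the resulting mergeMap (-1 = pruned) is what the
// deferring collector consumes. Plain collections stand in for LongHash and the filter.
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

class BuildSketchPruningSketch {
    public static void main(String[] args) {
        long maxDocCount = 1;
        long[] terms     = {10, 20, 30};                 // keys at old ordinals 0..2
        long[] docCounts = { 1,  5,  1};
        long[] mergeMap  = new long[terms.length];
        List<Long> candidates = new ArrayList<>();
        Set<Long> filter = new HashSet<>();

        long newOrd = 0;
        for (int i = 0; i < terms.length; i++) {
            if (docCounts[i] <= maxDocCount) {
                mergeMap[i] = newOrd++;                  // terms 10 and 30 stay rare candidates
                candidates.add(terms[i]);
            } else {
                mergeMap[i] = -1;                        // term 20 is pruned into the filter
                filter.add(terms[i]);
            }
        }
        // mergeMap == [0, -1, 1]; candidates == [10, 30]; filter == {20}
        System.out.println(java.util.Arrays.toString(mergeMap) + " " + candidates + " " + filter);
    }
}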
+ */ +package org.elasticsearch.search.aggregations.bucket.terms; + +import org.elasticsearch.search.aggregations.bucket.MultiBucketsAggregation; + +import java.util.List; + + +public interface RareTerms extends MultiBucketsAggregation { + + /** + * A bucket that is associated with a single term + */ + interface Bucket extends MultiBucketsAggregation.Bucket { + + Number getKeyAsNumber(); + } + + /** + * Return the sorted list of the buckets in this terms aggregation. + */ + @Override + List getBuckets(); + + /** + * Get the bucket for the given term, or null if there is no such bucket. + */ + Bucket getBucketByKey(String term); + +} + diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregationBuilder.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregationBuilder.java new file mode 100644 index 0000000000000..5772cfa9708d5 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregationBuilder.java @@ -0,0 +1,203 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.elasticsearch.search.aggregations.bucket.terms; + +import org.elasticsearch.common.ParseField; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.xcontent.ObjectParser; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.common.xcontent.XContentParser; +import org.elasticsearch.search.aggregations.AggregationBuilder; +import org.elasticsearch.search.aggregations.AggregatorFactories.Builder; +import org.elasticsearch.search.aggregations.AggregatorFactory; +import org.elasticsearch.search.aggregations.support.ValueType; +import org.elasticsearch.search.aggregations.support.ValuesSource; +import org.elasticsearch.search.aggregations.support.ValuesSourceAggregationBuilder; +import org.elasticsearch.search.aggregations.support.ValuesSourceAggregatorFactory; +import org.elasticsearch.search.aggregations.support.ValuesSourceConfig; +import org.elasticsearch.search.aggregations.support.ValuesSourceParserHelper; +import org.elasticsearch.search.aggregations.support.ValuesSourceType; +import org.elasticsearch.search.internal.SearchContext; + +import java.io.IOException; +import java.util.Map; +import java.util.Objects; + +public class RareTermsAggregationBuilder extends ValuesSourceAggregationBuilder { + public static final String NAME = "rare_terms"; + + private static final ParseField MAX_DOC_COUNT_FIELD_NAME = new ParseField("max_doc_count"); + private static final ParseField PRECISION = new ParseField("precision"); + + private static final int MAX_MAX_DOC_COUNT = 100; + private static final ObjectParser PARSER; + static { + PARSER = new ObjectParser<>(RareTermsAggregationBuilder.NAME); + ValuesSourceParserHelper.declareAnyFields(PARSER, true, true); + PARSER.declareLong(RareTermsAggregationBuilder::maxDocCount, MAX_DOC_COUNT_FIELD_NAME); + + PARSER.declareField((b, v) -> b.includeExclude(IncludeExclude.merge(v, b.includeExclude())), + IncludeExclude::parseInclude, IncludeExclude.INCLUDE_FIELD, ObjectParser.ValueType.OBJECT_ARRAY_OR_STRING); + + PARSER.declareField((b, v) -> b.includeExclude(IncludeExclude.merge(b.includeExclude(), v)), + IncludeExclude::parseExclude, IncludeExclude.EXCLUDE_FIELD, ObjectParser.ValueType.STRING_ARRAY); + + PARSER.declareDouble(RareTermsAggregationBuilder::setPrecision, PRECISION); + } + + public static AggregationBuilder parse(String aggregationName, XContentParser parser) throws IOException { + return PARSER.parse(parser, new RareTermsAggregationBuilder(aggregationName, null), null); + } + + private IncludeExclude includeExclude = null; + private int maxDocCount = 1; + private double precision = 0.001; + + public RareTermsAggregationBuilder(String name, ValueType valueType) { + super(name, ValuesSourceType.ANY, valueType); + } + + private RareTermsAggregationBuilder(RareTermsAggregationBuilder clone, Builder factoriesBuilder, Map metaData) { + super(clone, factoriesBuilder, metaData); + this.includeExclude = clone.includeExclude; + } + + @Override + protected AggregationBuilder shallowCopy(Builder factoriesBuilder, Map metaData) { + return new RareTermsAggregationBuilder(this, factoriesBuilder, metaData); + } + + /** + * Read from a stream. 
+ */ + public RareTermsAggregationBuilder(StreamInput in) throws IOException { + super(in, ValuesSourceType.ANY); + includeExclude = in.readOptionalWriteable(IncludeExclude::new); + maxDocCount = in.readVInt(); + } + + @Override + protected boolean serializeTargetValueType() { + return true; + } + + @Override + protected void innerWriteTo(StreamOutput out) throws IOException { + out.writeOptionalWriteable(includeExclude); + out.writeVInt(maxDocCount); + } + + /** + * Set the maximum document count terms should have in order to appear in + * the response. + */ + public RareTermsAggregationBuilder maxDocCount(long maxDocCount) { + if (maxDocCount <= 0) { + throw new IllegalArgumentException( + "[" + MAX_DOC_COUNT_FIELD_NAME.getPreferredName() + "] must be greater than 0. Found [" + + maxDocCount + "] in [" + name + "]"); + } + //TODO review: what size cap should we put on this? + if (maxDocCount > MAX_MAX_DOC_COUNT) { + throw new IllegalArgumentException("[" + MAX_DOC_COUNT_FIELD_NAME.getPreferredName() + "] must be smaller" + + "than " + MAX_MAX_DOC_COUNT + "in [" + name + "]"); + } + this.maxDocCount = (int) maxDocCount; + return this; + } + + /** + * Set terms to include and exclude from the aggregation results + */ + public RareTermsAggregationBuilder includeExclude(IncludeExclude includeExclude) { + this.includeExclude = includeExclude; + return this; + } + + /** + * Get terms to include and exclude from the aggregation results + */ + public IncludeExclude includeExclude() { + return includeExclude; + } + + /** + * Get the current false positive rate for individual cuckoo filters. + */ + public double getPrecision() { + return precision; + } + + /** + * Set's the false-positive rate for individual cuckoo filters. Does not dictate the overall fpp rate + * since we use a "scaling" cuckoo filter which adds more filters as required, and the overall + * error rate grows differently than individual filters + * + * This value does, however, affect the overall space usage of the filter. Coarser precisions provide + * more compact filters. 
The default is 0.01 + */ + public void setPrecision(double precision) { + if (precision < 0.00001) { + throw new IllegalArgumentException("[precision] must be greater than 0.00001"); + } + this.precision = precision; + } + + @Override + protected ValuesSourceAggregatorFactory innerBuild(SearchContext context, + ValuesSourceConfig config, + AggregatorFactory parent, + Builder subFactoriesBuilder) throws IOException { + return new RareTermsAggregatorFactory(name, config, includeExclude, + context, parent, subFactoriesBuilder, metaData, maxDocCount, precision); + } + + @Override + protected XContentBuilder doXContentBody(XContentBuilder builder, Params params) throws IOException { + if (includeExclude != null) { + includeExclude.toXContent(builder, params); + } + builder.field(MAX_DOC_COUNT_FIELD_NAME.getPreferredName(), maxDocCount); + builder.field(PRECISION.getPreferredName(), precision); + return builder; + } + + @Override + public int hashCode() { + return Objects.hash(super.hashCode(), includeExclude, maxDocCount, precision); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) return true; + if (obj == null || getClass() != obj.getClass()) return false; + if (super.equals(obj) == false) return false; + RareTermsAggregationBuilder other = (RareTermsAggregationBuilder) obj; + return Objects.equals(includeExclude, other.includeExclude) + && Objects.equals(maxDocCount, other.maxDocCount) + && Objects.equals(precision, other.precision); + } + + @Override + public String getType() { + return NAME; + } + +} diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorFactory.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorFactory.java new file mode 100644 index 0000000000000..ddb563e03039d --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorFactory.java @@ -0,0 +1,164 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
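// Illustrative sketch (not part of this change) of how the builder above might be used
// from Java against the classes added in this patch. The field name "genre" and the
// parameter values are examples only; field(), size() and aggregation() are existing APIs.
import org.elasticsearch.search.aggregations.bucket.terms.RareTermsAggregationBuilder;
import org.elasticsearch.search.aggregations.support.ValueType;
import org.elasticsearch.search.builder.SearchSourceBuilder;

class RareTermsRequestSketch {
    static SearchSourceBuilder rareGenres() {
        RareTermsAggregationBuilder rare = new RareTermsAggregationBuilder("genres", ValueType.STRING);
        rare.field("genre");         // the keyword field to find rare terms in
        rare.maxDocCount(2);         // values <= 0 or > 100 are rejected by the builder above
        rare.setPrecision(0.001);    // values below 0.00001 are rejected by the builder above
        return new SearchSourceBuilder().size(0).aggregation(rare);
    }
}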
+ */ + +package org.elasticsearch.search.aggregations.bucket.terms; + +import org.elasticsearch.common.ParseField; +import org.elasticsearch.common.logging.DeprecationLogger; +import org.elasticsearch.search.DocValueFormat; +import org.elasticsearch.search.aggregations.AggregationExecutionException; +import org.elasticsearch.search.aggregations.Aggregator; +import org.elasticsearch.search.aggregations.AggregatorFactories; +import org.elasticsearch.search.aggregations.AggregatorFactory; +import org.elasticsearch.search.aggregations.InternalAggregation; +import org.elasticsearch.search.aggregations.NonCollectingAggregator; +import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; +import org.elasticsearch.search.aggregations.support.ValuesSource; +import org.elasticsearch.search.aggregations.support.ValuesSourceAggregatorFactory; +import org.elasticsearch.search.aggregations.support.ValuesSourceConfig; +import org.elasticsearch.search.internal.SearchContext; + +import java.io.IOException; +import java.util.List; +import java.util.Map; + +public class RareTermsAggregatorFactory extends ValuesSourceAggregatorFactory { + private final IncludeExclude includeExclude; + private final int maxDocCount; + private final double precision; + + RareTermsAggregatorFactory(String name, ValuesSourceConfig config, + IncludeExclude includeExclude, + SearchContext context, + AggregatorFactory parent, AggregatorFactories.Builder subFactoriesBuilder, + Map metaData, int maxDocCount, double precision) throws IOException { + super(name, config, context, parent, subFactoriesBuilder, metaData); + this.includeExclude = includeExclude; + this.maxDocCount = maxDocCount; + this.precision = precision; + } + + @Override + protected Aggregator createUnmapped(Aggregator parent, List pipelineAggregators, Map metaData) + throws IOException { + final InternalAggregation aggregation = new UnmappedRareTerms(name, pipelineAggregators, metaData); + return new NonCollectingAggregator(name, context, parent, factories, pipelineAggregators, metaData) { + @Override + public InternalAggregation buildEmptyAggregation() { + return aggregation; + } + }; + } + + @Override + protected Aggregator doCreateInternal(ValuesSource valuesSource, Aggregator parent, boolean collectsFromSingleBucket, + List pipelineAggregators, Map metaData) throws IOException { + if (collectsFromSingleBucket == false) { + return asMultiBucketAggregator(this, context, parent); + } + if (valuesSource instanceof ValuesSource.Bytes) { + ExecutionMode execution = ExecutionMode.MAP; //TODO global ords not implemented yet, only supports "map" + + DocValueFormat format = config.format(); + if ((includeExclude != null) && (includeExclude.isRegexBased()) && format != DocValueFormat.RAW) { + throw new AggregationExecutionException("Aggregation [" + name + "] cannot support " + + "regular expression style include/exclude settings as they can only be applied to string fields. " + + "Use an array of values for include/exclude clauses"); + } + + return execution.create(name, factories, valuesSource, format, + includeExclude, context, parent, pipelineAggregators, metaData, maxDocCount, precision); + } + + if ((includeExclude != null) && (includeExclude.isRegexBased())) { + throw new AggregationExecutionException("Aggregation [" + name + "] cannot support regular expression style include/exclude " + + "settings as they can only be applied to string fields. 
Use an array of numeric values for include/exclude clauses " + + "used to filter numeric fields"); + } + + if (valuesSource instanceof ValuesSource.Numeric) { + IncludeExclude.LongFilter longFilter = null; + if (((ValuesSource.Numeric) valuesSource).isFloatingPoint()) { + throw new AggregationExecutionException("RareTerms aggregation does not support floating point fields."); + } + if (includeExclude != null) { + longFilter = includeExclude.convertToLongFilter(config.format()); + } + return new LongRareTermsAggregator(name, factories, (ValuesSource.Numeric) valuesSource, config.format(), + context, parent, longFilter, maxDocCount, precision, pipelineAggregators, metaData); + } + + throw new AggregationExecutionException("RareTerms aggregation cannot be applied to field [" + config.fieldContext().field() + + "]. It can only be applied to numeric or string fields."); + } + + public enum ExecutionMode { + + MAP(new ParseField("map")) { + + @Override + Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource, + DocValueFormat format, IncludeExclude includeExclude, + SearchContext context, Aggregator parent, + List pipelineAggregators, + Map metaData, long maxDocCount, double precision) + throws IOException { + final IncludeExclude.StringFilter filter = includeExclude == null ? null : includeExclude.convertToStringFilter(format); + return new StringRareTermsAggregator(name, factories, (ValuesSource.Bytes) valuesSource, format, filter, + context, parent, pipelineAggregators, metaData, maxDocCount, precision); + } + + @Override + boolean needsGlobalOrdinals() { + return false; + } + + }; + + public static ExecutionMode fromString(String value, final DeprecationLogger deprecationLogger) { + switch (value) { + case "map": + return MAP; + default: + throw new IllegalArgumentException("Unknown `execution_hint`: [" + value + "], expected any of [map]"); + } + } + + private final ParseField parseField; + + ExecutionMode(ParseField parseField) { + this.parseField = parseField; + } + + abstract Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource, + DocValueFormat format, IncludeExclude includeExclude, + SearchContext context, Aggregator parent, + List pipelineAggregators, Map metaData, + long maxDocCount, double precision) + throws IOException; + + abstract boolean needsGlobalOrdinals(); + + @Override + public String toString() { + return parseField.getPreferredName(); + } + } + +} diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTerms.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTerms.java new file mode 100644 index 0000000000000..3c3e19664a631 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTerms.java @@ -0,0 +1,159 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
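// Illustrative sketch (not part of this change): exact-value include/exclude, which the
// factory above accepts for any field type, whereas regex-based filters are rejected for
// numeric fields. The IncludeExclude(String[], String[]) constructor is assumed from the
// existing terms aggregation API; the field and values are examples only.
import org.elasticsearch.search.aggregations.bucket.terms.IncludeExclude;
import org.elasticsearch.search.aggregations.bucket.terms.RareTermsAggregationBuilder;
import org.elasticsearch.search.aggregations.support.ValueType;

class RareTermsIncludeExcludeSketch {
    static RareTermsAggregationBuilder rareGenres() {
        RareTermsAggregationBuilder rare = new RareTermsAggregationBuilder("rare_genres", ValueType.STRING);
        rare.field("genre");
        rare.includeExclude(new IncludeExclude(new String[]{"swing", "jazz"}, null));  // exact terms only
        return rare;
    }
}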
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.search.aggregations.bucket.terms; + +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.util.SetBackedScalingCuckooFilter; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.search.DocValueFormat; +import org.elasticsearch.search.aggregations.BucketOrder; +import org.elasticsearch.search.aggregations.InternalAggregations; +import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.Objects; + +public class StringRareTerms extends InternalMappedRareTerms { + public static final String NAME = "srareterms"; + + public static class Bucket extends InternalRareTerms.Bucket { + BytesRef termBytes; + + public Bucket(BytesRef term, long docCount, InternalAggregations aggregations, DocValueFormat format) { + super(docCount, aggregations, format); + this.termBytes = term; + } + + /** + * Read from a stream. + */ + public Bucket(StreamInput in, DocValueFormat format) throws IOException { + super(in, format); + termBytes = in.readBytesRef(); + } + + @Override + protected void writeTermTo(StreamOutput out) throws IOException { + out.writeBytesRef(termBytes); + } + + @Override + public Object getKey() { + return getKeyAsString(); + } + + // this method is needed for scripted numeric aggs + @Override + public Number getKeyAsNumber() { + /* + * If the term is a long greater than 2^52 then parsing as a double would lose accuracy. Therefore, we first parse as a long and + * if this fails then we attempt to parse the term as a double. + */ + try { + return Long.parseLong(termBytes.utf8ToString()); + } catch (final NumberFormatException ignored) { + return Double.parseDouble(termBytes.utf8ToString()); + } + } + + @Override + public String getKeyAsString() { + return format.format(termBytes).toString(); + } + + @Override + public int compareKey(Bucket other) { + return termBytes.compareTo(other.termBytes); + } + + @Override + Bucket newBucket(long docCount, InternalAggregations aggs) { + return new Bucket(termBytes, docCount, aggs, format); + } + + @Override + protected final XContentBuilder keyToXContent(XContentBuilder builder) throws IOException { + return builder.field(CommonFields.KEY.getPreferredName(), getKeyAsString()); + } + + @Override + public boolean equals(Object obj) { + return super.equals(obj) && Objects.equals(termBytes, ((Bucket) obj).termBytes); + } + + @Override + public int hashCode() { + return Objects.hash(super.hashCode(), termBytes); + } + } + + StringRareTerms(String name, BucketOrder order, List pipelineAggregators, + Map metaData, DocValueFormat format, + List buckets, long maxDocCount, SetBackedScalingCuckooFilter filter) { + super(name, order, pipelineAggregators, metaData, format, buckets, maxDocCount, filter); + } + + /** + * Read from a stream. 
+ */ + public StringRareTerms(StreamInput in) throws IOException { + super(in, StringRareTerms.Bucket::new); + } + + @Override + public String getWriteableName() { + return NAME; + } + + @Override + public StringRareTerms create(List buckets) { + return new StringRareTerms(name, order, pipelineAggregators(), metaData, format, buckets, maxDocCount, filter); + } + + @Override + public StringRareTerms.Bucket createBucket(InternalAggregations aggregations, StringRareTerms.Bucket prototype) { + return new StringRareTerms.Bucket(prototype.termBytes, prototype.getDocCount(), aggregations, prototype.format); + } + + @Override + protected StringRareTerms createWithFilter(String name, List buckets, + SetBackedScalingCuckooFilter filterFilter) { + return new StringRareTerms(name, order, pipelineAggregators(), metaData, format, + buckets, maxDocCount, filterFilter); + } + + @Override + protected StringRareTerms.Bucket[] createBucketsArray(int size) { + return new StringRareTerms.Bucket[size]; + } + + @Override + public boolean containsTerm(SetBackedScalingCuckooFilter filter, StringRareTerms.Bucket bucket) { + return filter.mightContain(bucket.termBytes); + } + + @Override + public void addToFilter(SetBackedScalingCuckooFilter filter, StringRareTerms.Bucket bucket) { + filter.add(bucket.termBytes); + } +} diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java new file mode 100644 index 0000000000000..0c200e96b242c --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java @@ -0,0 +1,175 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.elasticsearch.search.aggregations.bucket.terms; + +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.CollectionUtil; +import org.elasticsearch.common.lease.Releasables; +import org.elasticsearch.common.util.BytesRefHash; +import org.elasticsearch.index.fielddata.SortedBinaryDocValues; +import org.elasticsearch.search.DocValueFormat; +import org.elasticsearch.search.aggregations.Aggregator; +import org.elasticsearch.search.aggregations.AggregatorFactories; +import org.elasticsearch.search.aggregations.InternalAggregation; +import org.elasticsearch.search.aggregations.LeafBucketCollector; +import org.elasticsearch.search.aggregations.LeafBucketCollectorBase; +import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; +import org.elasticsearch.search.aggregations.support.ValuesSource; +import org.elasticsearch.search.internal.SearchContext; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +import static java.util.Collections.emptyList; + +/** + * An aggregator that finds "rare" string values (e.g. terms agg that orders ascending) + */ +public class StringRareTermsAggregator extends AbstractRareTermsAggregator { + protected BytesRefHash bucketOrds; + + StringRareTermsAggregator(String name, AggregatorFactories factories, ValuesSource.Bytes valuesSource, + DocValueFormat format, IncludeExclude.StringFilter stringFilter, + SearchContext context, Aggregator parent, List pipelineAggregators, + Map metaData, long maxDocCount, double precision) throws IOException { + super(name, factories, context, parent, pipelineAggregators, metaData, maxDocCount, precision, format, valuesSource, stringFilter); + this.bucketOrds = new BytesRefHash(1, context.bigArrays()); + } + + @Override + public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, + final LeafBucketCollector sub) throws IOException { + final SortedBinaryDocValues values = valuesSource.bytesValues(ctx); + if (subCollectors == null) { + subCollectors = sub; + } + return new LeafBucketCollectorBase(sub, values) { + final BytesRefBuilder previous = new BytesRefBuilder(); + + @Override + public void collect(int docId, long bucket) throws IOException { + assert bucket == 0; + if (values.advanceExact(docId)) { + final int valuesCount = values.docValueCount(); + previous.clear(); + + // SortedBinaryDocValues don't guarantee uniqueness so we + // need to take care of dups + for (int i = 0; i < valuesCount; ++i) { + final BytesRef bytes = values.nextValue(); + if (includeExclude != null && !includeExclude.accept(bytes)) { + continue; + } + if (i > 0 && previous.get().equals(bytes)) { + continue; + } + + doCollect(bytes, docId); + previous.copyBytes(bytes); + } + } + } + }; + } + + @Override + long addValueToOrds(BytesRef value) { + return bucketOrds.add(value); + } + + /** + * Merges the ordinals to a minimal set, populates the CuckooFilter and + * generates a final set of buckets. + * + * If a term is below the maxDocCount, it is turned into a Bucket. Otherwise, + * the term is added to the filter, and pruned from the ordinal map. 
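+     * (For example, with maxDocCount == 1 a term that appears in exactly one document becomes a
+     * bucket, while a term that appears in three documents is added to the filter and pruned.)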
If + * necessary the ordinal map is merged down to a minimal set to remove deletions + */ + private List buildSketch() { + long deletionCount = 0; + BytesRefHash newBucketOrds = new BytesRefHash(1, context.bigArrays()); + List buckets = new ArrayList<>(); + try (BytesRefHash oldBucketOrds = bucketOrds) { + + long[] mergeMap = new long[(int) oldBucketOrds.size()]; + BytesRef scratch = new BytesRef(); + for (int i = 0; i < oldBucketOrds.size(); i++) { + BytesRef oldKey = oldBucketOrds.get(i, scratch); + long newBucketOrd = -1; + long docCount = bucketDocCount(i); + // if the key is below threshold, reinsert into the new ords + if (docCount <= maxDocCount) { + newBucketOrd = newBucketOrds.add(oldKey); + StringRareTerms.Bucket bucket = new StringRareTerms.Bucket(BytesRef.deepCopyOf(oldKey), docCount, null, format); + bucket.bucketOrd = newBucketOrd; + buckets.add(bucket); + + consumeBucketsAndMaybeBreak(1); + } else { + // Make a note when one of the ords has been deleted + deletionCount += 1; + filter.add(oldKey); + } + mergeMap[i] = newBucketOrd; + } + + // Only merge/delete the ordinals if we have actually deleted one, + // to save on some redundant work + if (deletionCount > 0) { + mergeBuckets(mergeMap, newBucketOrds.size()); + if (deferringCollector != null) { + deferringCollector.mergeBuckets(mergeMap); + } + } + } + bucketOrds = newBucketOrds; + return buckets; + } + + @Override + public InternalAggregation buildAggregation(long owningBucketOrdinal) throws IOException { + assert owningBucketOrdinal == 0; + + List buckets = buildSketch(); + runDeferredCollections(buckets.stream().mapToLong(b -> b.bucketOrd).toArray()); + + // Finalize the buckets + for (StringRareTerms.Bucket bucket : buckets) { + bucket.aggregations = bucketAggregations(bucket.bucketOrd); + } + + CollectionUtil.introSort(buckets, ORDER.comparator(this)); + return new StringRareTerms(name, ORDER, pipelineAggregators(), metaData(), format, buckets, maxDocCount, filter); + } + + @Override + public InternalAggregation buildEmptyAggregation() { + return new StringRareTerms(name, LongRareTermsAggregator.ORDER, pipelineAggregators(), metaData(), format, emptyList(), 0, filter); + } + + @Override + public void doClose() { + Releasables.close(bucketOrds); + } +} + diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringTermsAggregator.java index 20162fd1bc78a..446aafa22d36b 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringTermsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringTermsAggregator.java @@ -114,8 +114,9 @@ public void collect(int doc, long bucket) throws IOException { public InternalAggregation buildAggregation(long owningBucketOrdinal) throws IOException { assert owningBucketOrdinal == 0; - if (bucketCountThresholds.getMinDocCount() == 0 && (InternalOrder.isCountDesc(order) == false || - bucketOrds.size() < bucketCountThresholds.getRequiredSize())) { + if (bucketCountThresholds.getMinDocCount() == 0 + && (InternalOrder.isCountDesc(order) == false + || bucketOrds.size() < bucketCountThresholds.getRequiredSize())) { // we need to fill-in the blanks for (LeafReaderContext ctx : context.searcher().getTopReaderContext().leaves()) { final SortedBinaryDocValues values = valuesSource.bytesValues(ctx); @@ -168,11 +169,10 @@ public InternalAggregation buildAggregation(long owningBucketOrdinal) throws 
IOE runDeferredCollections(survivingBucketOrds); // Now build the aggs - for (int i = 0; i < list.length; i++) { - final StringTerms.Bucket bucket = list[i]; - bucket.termBytes = BytesRef.deepCopyOf(bucket.termBytes); - bucket.aggregations = bucketAggregations(bucket.bucketOrd); - bucket.docCountError = 0; + for (final StringTerms.Bucket bucket : list) { + bucket.termBytes = BytesRef.deepCopyOf(bucket.termBytes); + bucket.aggregations = bucketAggregations(bucket.bucketOrd); + bucket.docCountError = 0; } return new StringTerms(name, order, bucketCountThresholds.getRequiredSize(), bucketCountThresholds.getMinDocCount(), diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/UnmappedRareTerms.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/UnmappedRareTerms.java new file mode 100644 index 0000000000000..eff5441a1d7e7 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/UnmappedRareTerms.java @@ -0,0 +1,119 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.search.aggregations.bucket.terms; + +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.util.SetBackedScalingCuckooFilter; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.search.DocValueFormat; +import org.elasticsearch.search.aggregations.InternalAggregation; +import org.elasticsearch.search.aggregations.InternalAggregations; +import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; + +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import java.util.Map; + +import static java.util.Collections.emptyList; + +/** + * Result of the RareTerms aggregation when the field is unmapped. + */ +public class UnmappedRareTerms extends InternalRareTerms { + public static final String NAME = "umrareterms"; + + protected abstract static class Bucket extends InternalRareTerms.Bucket { + private Bucket(long docCount, InternalAggregations aggregations, DocValueFormat formatter) { + super(docCount, aggregations, formatter); + } + } + + UnmappedRareTerms(String name, List pipelineAggregators, Map metaData) { + super(name, LongRareTermsAggregator.ORDER, 0, pipelineAggregators, metaData); + } + + /** + * Read from a stream. 
+ */ + public UnmappedRareTerms(StreamInput in) throws IOException { + super(in); + } + + @Override + protected void writeTermTypeInfoTo(StreamOutput out) throws IOException { + // Nothing to write + } + + @Override + public String getWriteableName() { + return NAME; + } + + @Override + public String getType() { + return StringTerms.NAME; + } + + @Override + public UnmappedRareTerms create(List buckets) { + return new UnmappedRareTerms(name, pipelineAggregators(), metaData); + } + + @Override + public UnmappedRareTerms.Bucket createBucket(InternalAggregations aggregations, UnmappedRareTerms.Bucket prototype) { + throw new UnsupportedOperationException("not supported for UnmappedRareTerms"); + } + + @Override + protected UnmappedRareTerms createWithFilter(String name, List buckets, SetBackedScalingCuckooFilter filter) { + throw new UnsupportedOperationException("not supported for UnmappedRareTerms"); + } + + @Override + public InternalAggregation doReduce(List aggregations, ReduceContext reduceContext) { + return new UnmappedRareTerms(name, pipelineAggregators(), metaData); + } + + @Override + public boolean isMapped() { + return false; + } + + @Override + public final XContentBuilder doXContentBody(XContentBuilder builder, Params params) throws IOException { + return doXContentCommon(builder, params, Collections.emptyList()); + } + + @Override + public List getBuckets() { + return emptyList(); + } + + @Override + public UnmappedRareTerms.Bucket getBucketByKey(String term) { + return null; + } + + @Override + protected UnmappedRareTerms.Bucket[] createBucketsArray(int size) { + return new UnmappedRareTerms.Bucket[size]; + } +} diff --git a/server/src/test/java/org/elasticsearch/common/util/CuckooFilterTests.java b/server/src/test/java/org/elasticsearch/common/util/CuckooFilterTests.java new file mode 100644 index 0000000000000..47e9081d815b6 --- /dev/null +++ b/server/src/test/java/org/elasticsearch/common/util/CuckooFilterTests.java @@ -0,0 +1,135 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.elasticsearch.common.util; + +import org.elasticsearch.common.Randomness; +import org.elasticsearch.common.hash.MurmurHash3; +import org.elasticsearch.common.io.stream.Writeable; +import org.elasticsearch.test.AbstractWireSerializingTestCase; + +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.lessThanOrEqualTo; + +public class CuckooFilterTests extends AbstractWireSerializingTestCase { + + @Override + protected CuckooFilter createTestInstance() { + CuckooFilter filter = new CuckooFilter(randomIntBetween(1, 100000), + ((float)randomIntBetween(1, 50)) / 100.0, Randomness.get()); + + int num = randomIntBetween(0, 10); + for (int i = 0; i < num; i++) { + filter.add(hash(randomLong())); + } + + return filter; + } + + @Override + protected Writeable.Reader instanceReader() { + return in -> new CuckooFilter(in, Randomness.get()); + } + + @Override + protected CuckooFilter mutateInstance(CuckooFilter instance) { + CuckooFilter newInstance = new CuckooFilter(instance); + int num = randomIntBetween(1, 10); + for (int i = 0; i < num; i++) { + newInstance.add(hash(randomLong())); + } + return newInstance; + } + + public void testExact() { + CuckooFilter filter = new CuckooFilter(10000, 0.03, Randomness.get()); + + for (int i = 0; i < 100; i++) { + filter.add(hash(i)); + } + + // Was sized sufficiently large that all of these values should be retained + for (int i = 0; i < 100; i++) { + assertThat(filter.mightContain(hash(i)), equalTo(true)); + } + } + + public void testSaturate() { + CuckooFilter filter = new CuckooFilter(10, 0.03, Randomness.get()); + int counter = 0; + boolean saturated = false; + for (int i = 0; i < 100; i++) { + logger.info("Value: " + i); + if (filter.add(hash(i)) == false) { + saturated = true; + } + counter += 1; + if (saturated) { + break; + } + } + // Unclear when it will saturate exactly, but should be before 100 given the configuration + assertTrue(saturated); + logger.info("Saturated at: " + counter); + + for (int i = 0; i < counter; i++) { + logger.info("Value: " + i); + assertThat(filter.mightContain(hash(i)), equalTo(true)); + } + } + + public void testHash() { + CuckooFilter.hashToIndex(-10, 32); + } + + public void testBig() { + CuckooFilter filter = new CuckooFilter(1000000, 0.001, Randomness.get()); + + for (int i = 0; i < 10000; i++) { + filter.add(hash(i)); + } + + int correct = 0; + int incorrect = 0; + for (int i = 0; i < 10000; i++) { + if (filter.mightContain(hash(i))) { + correct += 1; + } else { + incorrect += 1; + } + } + + assertThat(correct, equalTo(10000)); + assertThat(incorrect, equalTo(0)); + + for (int i = 10000; i < 100000; i++) { + if (filter.mightContain(hash(i))) { + incorrect += 1; + } else { + correct += 1; + } + } + + double fppRate = (double) incorrect / 100000; + assertThat(fppRate, lessThanOrEqualTo(0.001)); + } + + private long hash(long i) { + return MurmurHash3.murmur64(i); + } +} diff --git a/server/src/test/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilterTests.java b/server/src/test/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilterTests.java new file mode 100644 index 0000000000000..20ffaa00998a1 --- /dev/null +++ b/server/src/test/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilterTests.java @@ -0,0 +1,231 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. 
Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.common.util; + +import org.elasticsearch.common.Randomness; +import org.elasticsearch.common.hash.MurmurHash3; +import org.elasticsearch.common.io.stream.Writeable; +import org.elasticsearch.test.AbstractWireSerializingTestCase; + +import java.io.IOException; +import java.util.HashSet; +import java.util.Set; + +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.greaterThan; +import static org.hamcrest.Matchers.lessThanOrEqualTo; + +public class SetBackedScalingCuckooFilterTests extends AbstractWireSerializingTestCase { + + @Override + protected SetBackedScalingCuckooFilter createTestInstance() { + SetBackedScalingCuckooFilter bloom = new SetBackedScalingCuckooFilter(1000, Randomness.get(), 0.01); + + int num = randomIntBetween(0, 10); + for (int i = 0; i < num; i++) { + bloom.add(randomLong()); + } + + return bloom; + } + + @Override + protected Writeable.Reader instanceReader() { + return in -> new SetBackedScalingCuckooFilter(in, Randomness.get()); + } + + @Override + protected SetBackedScalingCuckooFilter mutateInstance(SetBackedScalingCuckooFilter instance) throws IOException { + SetBackedScalingCuckooFilter newInstance = new SetBackedScalingCuckooFilter(instance); + int num = randomIntBetween(1, 10); + for (int i = 0; i < num; i++) { + newInstance.add(randomLong()); + } + return newInstance; + } + + public void testExact() { + int threshold = randomIntBetween(1000, 10000); + SetBackedScalingCuckooFilter filter = new SetBackedScalingCuckooFilter(threshold, Randomness.get(), 0.01); + + int size = 0; + Set values = new HashSet<>(); + Set hashed = new HashSet<>(values.size()); + while (size < threshold - 100) { + long value = randomLong(); + filter.add(value); + boolean newValue = values.add(value); + if (newValue) { + Long hash = MurmurHash3.murmur64(value); + hashed.add(hash); + + size += 16; + } + } + assertThat(filter.hashes.size(), equalTo(hashed.size())); + assertThat(filter.hashes, equalTo(hashed)); + assertNull(filter.filters); + + for (Long value : values) { + assertThat(filter.mightContain(value), equalTo(true)); + } + } + + public void testConvert() { + int threshold = randomIntBetween(1000, 10000); + SetBackedScalingCuckooFilter filter = new SetBackedScalingCuckooFilter(threshold, Randomness.get(), 0.01); + + int counter = 0; + Set values = new HashSet<>(); + while (counter < threshold + 100) { + long value = randomLong(); + filter.add(value); + boolean newValue = values.add(value); + if (newValue) { + counter += 1; + } + } + assertNull(filter.hashes); + assertThat(filter.filters.size(), greaterThan(0)); + + int incorrect = 0; + for (Long v : values) { + if (filter.mightContain(v) == false) { + incorrect += 1; + } + } + double fppRate = (double) incorrect / values.size(); + assertThat(fppRate, lessThanOrEqualTo(0.001)); + } + + public void testConvertTwice() { + int threshold = randomIntBetween(1000, 10000); + 
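+        // Add enough distinct values to cross the threshold so the set-backed tracker converts
+        // itself to approximate CuckooFilters, then verify a second explicit convert() is rejected.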
SetBackedScalingCuckooFilter filter = new SetBackedScalingCuckooFilter(threshold, Randomness.get(), 0.01); + + int counter = 0; + Set values = new HashSet<>(); + while (counter < threshold + 100) { + long value = randomLong(); + filter.add(value); + boolean newValue = values.add(value); + if (newValue) { + counter += 1; + } + } + assertNull(filter.hashes); + assertThat(filter.filters.size(), greaterThan(0)); + IllegalStateException e = expectThrows(IllegalStateException.class, filter::convert); + assertThat(e.getMessage(), equalTo("Cannot convert SetBackedScalingCuckooFilter to approximate " + + "when it has already been converted.")); + } + + public void testMergeSmall() { + int threshold = 1000; + + // Setup the first filter + SetBackedScalingCuckooFilter filter = new SetBackedScalingCuckooFilter(threshold, Randomness.get(), 0.01); + + int counter = 0; + Set values = new HashSet<>(); + while (counter < threshold + 1) { + long value = randomLong(); + filter.add(value); + boolean newValue = values.add(value); + if (newValue) { + counter += 1; + } + } + assertNull(filter.hashes); + assertThat(filter.filters.size(), greaterThan(0)); + + int incorrect = 0; + for (Long v : values) { + if (filter.mightContain(v) == false) { + incorrect += 1; + } + } + double fppRate = (double) incorrect / values.size(); + assertThat(fppRate, lessThanOrEqualTo(0.001)); + + // Setup the second filter + SetBackedScalingCuckooFilter filter2 = new SetBackedScalingCuckooFilter(threshold, Randomness.get(), 0.01); + counter = 0; + Set values2 = new HashSet<>(); + while (counter < threshold + 1) { + long value = randomLong(); + filter2.add(value); + boolean newValue = values2.add(value); + if (newValue) { + counter += 1; + } + } + assertNull(filter2.hashes); + assertThat(filter2.filters.size(), greaterThan(0)); + + incorrect = 0; + for (Long v : values2) { + if (filter2.mightContain(v) == false) { + incorrect += 1; + } + } + fppRate = (double) incorrect / values2.size(); + assertThat(fppRate, lessThanOrEqualTo(0.001)); + + // now merge and verify the combined set + filter.merge(filter2); + incorrect = 0; + for (Long v : values) { + if (filter.mightContain(v) == false) { + incorrect += 1; + } + } + for (Long v : values2) { + if (filter.mightContain(v) == false) { + incorrect += 1; + } + } + fppRate = (double) incorrect / (values.size() + values2.size()); + assertThat(fppRate, lessThanOrEqualTo(0.001)); + } + + public void testMergeIncompatible() { + SetBackedScalingCuckooFilter filter1 = new SetBackedScalingCuckooFilter(100, Randomness.get(), 0.01); + SetBackedScalingCuckooFilter filter2 = new SetBackedScalingCuckooFilter(1000, Randomness.get(), 0.01); + IllegalStateException e = expectThrows(IllegalStateException.class, () -> filter1.merge(filter2)); + assertThat(e.getMessage(), equalTo("Cannot merge other CuckooFilter because thresholds do not match: [100] vs [1000]")); + + SetBackedScalingCuckooFilter filter3 = new SetBackedScalingCuckooFilter(100, Randomness.get(), 0.001); + e = expectThrows(IllegalStateException.class, () -> filter1.merge(filter3)); + assertThat(e.getMessage(), equalTo("Cannot merge other CuckooFilter because precisions do not match: [0.01] vs [0.001]")); + } + + public void testBadParameters() { + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, + () -> new SetBackedScalingCuckooFilter(-1, Randomness.get(), 0.11)); + assertThat(e.getMessage(), equalTo("[threshold] must be a positive integer")); + + e = expectThrows(IllegalArgumentException.class, + () -> new 
SetBackedScalingCuckooFilter(1000000, Randomness.get(), 0.11)); + assertThat(e.getMessage(), equalTo("[threshold] must be smaller than [500000]")); + + e = expectThrows(IllegalArgumentException.class, + () -> new SetBackedScalingCuckooFilter(100, Randomness.get(), -1.0)); + assertThat(e.getMessage(), equalTo("[fpp] must be a positive double")); + } +} diff --git a/server/src/test/java/org/elasticsearch/search/aggregations/bucket/RareTermsTests.java b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/RareTermsTests.java new file mode 100644 index 0000000000000..31382f5df8e5c --- /dev/null +++ b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/RareTermsTests.java @@ -0,0 +1,101 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.search.aggregations.bucket; + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.automaton.RegExp; +import org.elasticsearch.search.aggregations.BaseAggregationTestCase; +import org.elasticsearch.search.aggregations.bucket.terms.IncludeExclude; +import org.elasticsearch.search.aggregations.bucket.terms.RareTermsAggregationBuilder; + +import java.util.SortedSet; +import java.util.TreeSet; + +public class RareTermsTests extends BaseAggregationTestCase { + + @Override + protected RareTermsAggregationBuilder createTestAggregatorBuilder() { + String name = randomAlphaOfLengthBetween(3, 20); + RareTermsAggregationBuilder factory = new RareTermsAggregationBuilder(name, null); + String field = randomAlphaOfLengthBetween(3, 20); + randomFieldOrScript(factory, field); + if (randomBoolean()) { + factory.missing("MISSING"); + } + if (randomBoolean()) { + factory.format("###.##"); + } + if (randomBoolean()) { + IncludeExclude incExc = null; + switch (randomInt(6)) { + case 0: + incExc = new IncludeExclude(new RegExp("foobar"), null); + break; + case 1: + incExc = new IncludeExclude(null, new RegExp("foobaz")); + break; + case 2: + incExc = new IncludeExclude(new RegExp("foobar"), new RegExp("foobaz")); + break; + case 3: + SortedSet includeValues = new TreeSet<>(); + int numIncs = randomIntBetween(1, 20); + for (int i = 0; i < numIncs; i++) { + includeValues.add(new BytesRef(randomAlphaOfLengthBetween(1, 30))); + } + SortedSet excludeValues = null; + incExc = new IncludeExclude(includeValues, excludeValues); + break; + case 4: + SortedSet includeValues2 = null; + SortedSet excludeValues2 = new TreeSet<>(); + int numExcs2 = randomIntBetween(1, 20); + for (int i = 0; i < numExcs2; i++) { + excludeValues2.add(new BytesRef(randomAlphaOfLengthBetween(1, 30))); + } + incExc = new IncludeExclude(includeValues2, excludeValues2); + break; + case 5: + SortedSet includeValues3 = new TreeSet<>(); + int numIncs3 = randomIntBetween(1, 20); + for (int i = 0; i < numIncs3; 
i++) { + includeValues3.add(new BytesRef(randomAlphaOfLengthBetween(1, 30))); + } + SortedSet excludeValues3 = new TreeSet<>(); + int numExcs3 = randomIntBetween(1, 20); + for (int i = 0; i < numExcs3; i++) { + excludeValues3.add(new BytesRef(randomAlphaOfLengthBetween(1, 30))); + } + incExc = new IncludeExclude(includeValues3, excludeValues3); + break; + case 6: + final int numPartitions = randomIntBetween(1, 100); + final int partition = randomIntBetween(0, numPartitions - 1); + incExc = new IncludeExclude(partition, numPartitions); + break; + default: + fail(); + } + factory.includeExclude(incExc); + } + return factory; + } + +} diff --git a/server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorTests.java b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorTests.java new file mode 100644 index 0000000000000..a0d48b7ab778f --- /dev/null +++ b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorTests.java @@ -0,0 +1,600 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.elasticsearch.search.aggregations.bucket.terms; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.LongPoint; +import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.document.SortedDocValuesField; +import org.apache.lucene.document.SortedNumericDocValuesField; +import org.apache.lucene.document.SortedSetDocValuesField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.search.DocValuesFieldExistsQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.MatchNoDocsQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TotalHits; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.Version; +import org.elasticsearch.cluster.metadata.IndexMetaData; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.mapper.IdFieldMapper; +import org.elasticsearch.index.mapper.KeywordFieldMapper; +import org.elasticsearch.index.mapper.MappedFieldType; +import org.elasticsearch.index.mapper.NumberFieldMapper; +import org.elasticsearch.index.mapper.SeqNoFieldMapper; +import org.elasticsearch.index.mapper.TypeFieldMapper; +import org.elasticsearch.index.mapper.Uid; +import org.elasticsearch.search.SearchHit; +import org.elasticsearch.search.aggregations.Aggregation; +import org.elasticsearch.search.aggregations.Aggregations; +import org.elasticsearch.search.aggregations.Aggregator; +import org.elasticsearch.search.aggregations.AggregatorTestCase; +import org.elasticsearch.search.aggregations.InternalAggregation; +import org.elasticsearch.search.aggregations.InternalMultiBucketAggregation; +import org.elasticsearch.search.aggregations.MultiBucketConsumerService; +import org.elasticsearch.search.aggregations.bucket.MultiBucketsAggregation; +import org.elasticsearch.search.aggregations.bucket.global.GlobalAggregationBuilder; +import org.elasticsearch.search.aggregations.bucket.global.InternalGlobal; +import org.elasticsearch.search.aggregations.bucket.nested.InternalNested; +import org.elasticsearch.search.aggregations.bucket.nested.NestedAggregationBuilder; +import org.elasticsearch.search.aggregations.metrics.InternalTopHits; +import org.elasticsearch.search.aggregations.metrics.Max; +import org.elasticsearch.search.aggregations.metrics.MaxAggregationBuilder; +import org.elasticsearch.search.aggregations.metrics.TopHitsAggregationBuilder; +import org.elasticsearch.search.aggregations.support.ValueType; +import org.elasticsearch.search.sort.FieldSortBuilder; +import org.elasticsearch.search.sort.ScoreSortBuilder; +import org.junit.Assert; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.function.Consumer; + +import static org.elasticsearch.index.mapper.SeqNoFieldMapper.PRIMARY_TERM_NAME; +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.greaterThan; + +public class RareTermsAggregatorTests extends AggregatorTestCase { + + private static final String LONG_FIELD = "numeric"; + private static final String KEYWORD_FIELD = "keyword"; + + private static final List dataset; + static { + List d = new ArrayList<>(45); + for (int i = 0; 
i < 10; i++) { + for (int j = 0; j < i; j++) { + d.add((long) i); + } + } + dataset = d; + } + + public void testMatchNoDocs() throws IOException { + testBothCases(new MatchNoDocsQuery(), dataset, + aggregation -> aggregation.field(KEYWORD_FIELD).maxDocCount(1), + agg -> assertEquals(0, agg.getBuckets().size()), ValueType.STRING + ); + testBothCases(new MatchNoDocsQuery(), dataset, + aggregation -> aggregation.field(LONG_FIELD).maxDocCount(1), + agg -> assertEquals(0, agg.getBuckets().size()), ValueType.NUMERIC + ); + } + + public void testMatchAllDocs() throws IOException { + Query query = new MatchAllDocsQuery(); + + testBothCases(query, dataset, + aggregation -> aggregation.field(LONG_FIELD).maxDocCount(1), + agg -> { + assertEquals(1, agg.getBuckets().size()); + LongRareTerms.Bucket bucket = (LongRareTerms.Bucket) agg.getBuckets().get(0); + assertThat(bucket.getKey(), equalTo(1L)); + assertThat(bucket.getDocCount(), equalTo(1L)); + }, ValueType.NUMERIC + ); + testBothCases(query, dataset, + aggregation -> aggregation.field(KEYWORD_FIELD).maxDocCount(1), + agg -> { + assertEquals(1, agg.getBuckets().size()); + StringRareTerms.Bucket bucket = (StringRareTerms.Bucket) agg.getBuckets().get(0); + assertThat(bucket.getKeyAsString(), equalTo("1")); + assertThat(bucket.getDocCount(), equalTo(1L)); + }, ValueType.STRING + ); + } + + public void testManyDocsOneRare() throws IOException { + Query query = new MatchAllDocsQuery(); + + List d = new ArrayList<>(500); + for (int i = 1; i < 500; i++) { + d.add((long) i); + d.add((long) i); + } + + // The one rare term + d.add(0L); + + testSearchAndReduceCase(query, d, + aggregation -> aggregation.field(LONG_FIELD).maxDocCount(1), + agg -> { + assertEquals(1, agg.getBuckets().size()); + LongRareTerms.Bucket bucket = (LongRareTerms.Bucket) agg.getBuckets().get(0); + assertThat(bucket.getKey(), equalTo(0L)); + assertThat(bucket.getDocCount(), equalTo(1L)); + }, ValueType.NUMERIC + ); + testSearchAndReduceCase(query, d, + aggregation -> aggregation.field(KEYWORD_FIELD).maxDocCount(1), + agg -> { + assertEquals(1, agg.getBuckets().size()); + StringRareTerms.Bucket bucket = (StringRareTerms.Bucket) agg.getBuckets().get(0); + assertThat(bucket.getKeyAsString(), equalTo("0")); + assertThat(bucket.getDocCount(), equalTo(1L)); + }, ValueType.STRING + ); + } + + public void testIncludeExclude() throws IOException { + Query query = new MatchAllDocsQuery(); + + testBothCases(query, dataset, + aggregation -> aggregation.field(LONG_FIELD) + .maxDocCount(2) // bump to 2 since we're only including "2" + .includeExclude(new IncludeExclude(new long[]{2}, new long[]{})), + agg -> { + assertEquals(1, agg.getBuckets().size()); + LongRareTerms.Bucket bucket = (LongRareTerms.Bucket) agg.getBuckets().get(0); + assertThat(bucket.getKey(), equalTo(2L)); + assertThat(bucket.getDocCount(), equalTo(2L)); + }, ValueType.NUMERIC + ); + testBothCases(query, dataset, + aggregation -> aggregation.field(KEYWORD_FIELD) + .maxDocCount(2) // bump to 2 since we're only including "2" + .includeExclude(new IncludeExclude(new String[]{"2"}, new String[]{})), + agg -> { + assertEquals(1, agg.getBuckets().size()); + StringRareTerms.Bucket bucket = (StringRareTerms.Bucket) agg.getBuckets().get(0); + assertThat(bucket.getKeyAsString(), equalTo("2")); + assertThat(bucket.getDocCount(), equalTo(2L)); + }, ValueType.STRING + ); + } + + public void testEmbeddedMaxAgg() throws IOException { + Query query = new MatchAllDocsQuery(); + + testBothCases(query, dataset, aggregation -> { + 
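+            // Nest a max sub-aggregation under the rare_terms agg so we can verify that
+            // sub-aggregations are collected for the surviving rare bucket.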
MaxAggregationBuilder max = new MaxAggregationBuilder("the_max").field(LONG_FIELD); + aggregation.field(LONG_FIELD).maxDocCount(1).subAggregation(max); + }, + agg -> { + assertEquals(1, agg.getBuckets().size()); + LongRareTerms.Bucket bucket = (LongRareTerms.Bucket) agg.getBuckets().get(0); + assertThat(bucket.getKey(), equalTo(1L)); + assertThat(bucket.getDocCount(), equalTo(1L)); + + Aggregations children = bucket.getAggregations(); + assertThat(children.asList().size(), equalTo(1)); + assertThat(children.asList().get(0).getName(), equalTo("the_max")); + assertThat(((Max)(children.asList().get(0))).getValue(), equalTo(1.0)); + }, ValueType.NUMERIC + ); + testBothCases(query, dataset, aggregation -> { + MaxAggregationBuilder max = new MaxAggregationBuilder("the_max").field(LONG_FIELD); + aggregation.field(KEYWORD_FIELD).maxDocCount(1).subAggregation(max); + }, + agg -> { + assertEquals(1, agg.getBuckets().size()); + StringRareTerms.Bucket bucket = (StringRareTerms.Bucket) agg.getBuckets().get(0); + assertThat(bucket.getKey(), equalTo("1")); + assertThat(bucket.getDocCount(), equalTo(1L)); + + Aggregations children = bucket.getAggregations(); + assertThat(children.asList().size(), equalTo(1)); + assertThat(children.asList().get(0).getName(), equalTo("the_max")); + assertThat(((Max)(children.asList().get(0))).getValue(), equalTo(1.0)); + }, ValueType.STRING + ); + } + + public void testEmpty() throws IOException { + Query query = new MatchAllDocsQuery(); + + testSearchCase(query, Collections.emptyList(), + aggregation -> aggregation.field(LONG_FIELD).maxDocCount(1), + agg -> assertEquals(0, agg.getBuckets().size()), ValueType.NUMERIC + ); + testSearchCase(query, Collections.emptyList(), + aggregation -> aggregation.field(KEYWORD_FIELD).maxDocCount(1), + agg -> assertEquals(0, agg.getBuckets().size()), ValueType.STRING + ); + + // Note: the search and reduce test will generate no segments (due to no docs) + // and so will return a null agg because the aggs aren't run/reduced + testSearchAndReduceCase(query, Collections.emptyList(), + aggregation -> aggregation.field(LONG_FIELD).maxDocCount(1), + Assert::assertNull, ValueType.NUMERIC + ); + testSearchAndReduceCase(query, Collections.emptyList(), + aggregation -> aggregation.field(KEYWORD_FIELD).maxDocCount(1), + Assert::assertNull, ValueType.STRING + ); + } + + public void testUnmapped() throws Exception { + try (Directory directory = newDirectory()) { + try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) { + Document document = new Document(); + document.add(new SortedDocValuesField("string", new BytesRef("a"))); + document.add(new NumericDocValuesField("long", 0L)); + indexWriter.addDocument(document); + MappedFieldType fieldType1 = new KeywordFieldMapper.KeywordFieldType(); + fieldType1.setName("another_string"); + fieldType1.setHasDocValues(true); + + MappedFieldType fieldType2 = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.LONG); + fieldType2.setName("another_long"); + fieldType2.setHasDocValues(true); + + + try (IndexReader indexReader = maybeWrapReaderEs(indexWriter.getReader())) { + IndexSearcher indexSearcher = newIndexSearcher(indexReader); + ValueType[] valueTypes = new ValueType[]{ValueType.STRING, ValueType.LONG}; + String[] fieldNames = new String[]{"string", "long"}; + for (int i = 0; i < fieldNames.length; i++) { + RareTermsAggregationBuilder aggregationBuilder = new RareTermsAggregationBuilder("_name", valueTypes[i]) + .field(fieldNames[i]); + Aggregator aggregator = 
createAggregator(aggregationBuilder, indexSearcher, fieldType1, fieldType2); + aggregator.preCollection(); + indexSearcher.search(new MatchAllDocsQuery(), aggregator); + aggregator.postCollection(); + RareTerms result = (RareTerms) aggregator.buildAggregation(0L); + assertEquals("_name", result.getName()); + assertEquals(0, result.getBuckets().size()); + } + } + } + } + } + + public void testNestedTerms() throws IOException { + Query query = new MatchAllDocsQuery(); + + testBothCases(query, dataset, aggregation -> { + TermsAggregationBuilder terms = new TermsAggregationBuilder("the_terms", ValueType.STRING).field(KEYWORD_FIELD); + aggregation.field(LONG_FIELD).maxDocCount(1).subAggregation(terms); + }, + agg -> { + assertEquals(1, agg.getBuckets().size()); + LongRareTerms.Bucket bucket = (LongRareTerms.Bucket) agg.getBuckets().get(0); + assertThat(bucket.getKey(), equalTo(1L)); + assertThat(bucket.getDocCount(), equalTo(1L)); + + Aggregations children = bucket.getAggregations(); + assertThat(children.asList().size(), equalTo(1)); + assertThat(children.asList().get(0).getName(), equalTo("the_terms")); + assertThat(((Terms)(children.asList().get(0))).getBuckets().size(), equalTo(1)); + assertThat(((Terms)(children.asList().get(0))).getBuckets().get(0).getKeyAsString(), equalTo("1")); + }, ValueType.NUMERIC + ); + + testBothCases(query, dataset, aggregation -> { + TermsAggregationBuilder terms = new TermsAggregationBuilder("the_terms", ValueType.STRING).field(KEYWORD_FIELD); + aggregation.field(KEYWORD_FIELD).maxDocCount(1).subAggregation(terms); + }, + agg -> { + assertEquals(1, agg.getBuckets().size()); + StringRareTerms.Bucket bucket = (StringRareTerms.Bucket) agg.getBuckets().get(0); + assertThat(bucket.getKey(), equalTo("1")); + assertThat(bucket.getDocCount(), equalTo(1L)); + + Aggregations children = bucket.getAggregations(); + assertThat(children.asList().size(), equalTo(1)); + assertThat(children.asList().get(0).getName(), equalTo("the_terms")); + assertThat(((Terms)(children.asList().get(0))).getBuckets().size(), equalTo(1)); + assertThat(((Terms)(children.asList().get(0))).getBuckets().get(0).getKeyAsString(), equalTo("1")); + }, ValueType.STRING + ); + } + + public void testGlobalAggregationWithScore() throws IOException { + try (Directory directory = newDirectory()) { + try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) { + Document document = new Document(); + document.add(new SortedDocValuesField("keyword", new BytesRef("a"))); + indexWriter.addDocument(document); + document = new Document(); + document.add(new SortedDocValuesField("keyword", new BytesRef("c"))); + indexWriter.addDocument(document); + document = new Document(); + document.add(new SortedDocValuesField("keyword", new BytesRef("e"))); + indexWriter.addDocument(document); + try (IndexReader indexReader = maybeWrapReaderEs(indexWriter.getReader())) { + IndexSearcher indexSearcher = newIndexSearcher(indexReader); + Aggregator.SubAggCollectionMode collectionMode = randomFrom(Aggregator.SubAggCollectionMode.values()); + GlobalAggregationBuilder globalBuilder = new GlobalAggregationBuilder("global") + .subAggregation( + new RareTermsAggregationBuilder("terms", ValueType.STRING) + .field("keyword") + .subAggregation( + new RareTermsAggregationBuilder("sub_terms", ValueType.STRING) + .field("keyword") + .subAggregation( + new TopHitsAggregationBuilder("top_hits") + .storedField("_none_") + ) + ) + ); + + MappedFieldType fieldType = new KeywordFieldMapper.KeywordFieldType(); + 
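+                    // The indexed test documents use a doc-values "keyword" field, so the mapped
+                    // field type passed to searchAndReduce must use the same name and enable doc values.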
fieldType.setName("keyword"); + fieldType.setHasDocValues(true); + + InternalGlobal result = searchAndReduce(indexSearcher, new MatchAllDocsQuery(), globalBuilder, fieldType); + InternalMultiBucketAggregation terms = result.getAggregations().get("terms"); + assertThat(terms.getBuckets().size(), equalTo(3)); + for (MultiBucketsAggregation.Bucket bucket : terms.getBuckets()) { + InternalMultiBucketAggregation subTerms = bucket.getAggregations().get("sub_terms"); + assertThat(subTerms.getBuckets().size(), equalTo(1)); + MultiBucketsAggregation.Bucket subBucket = subTerms.getBuckets().get(0); + InternalTopHits topHits = subBucket.getAggregations().get("top_hits"); + assertThat(topHits.getHits().getHits().length, equalTo(1)); + for (SearchHit hit : topHits.getHits()) { + assertThat(hit.getScore(), greaterThan(0f)); + } + } + } + } + } + } + + public void testWithNestedAggregations() throws IOException { + try (Directory directory = newDirectory()) { + try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) { + for (int i = 0; i < 10; i++) { + int[] nestedValues = new int[i]; + for (int j = 0; j < i; j++) { + nestedValues[j] = j; + } + indexWriter.addDocuments(generateDocsWithNested(Integer.toString(i), i, nestedValues)); + } + indexWriter.commit(); + + NestedAggregationBuilder nested = new NestedAggregationBuilder("nested", "nested_object") + .subAggregation(new RareTermsAggregationBuilder("terms", ValueType.LONG) + .field("nested_value") + .maxDocCount(1) + ); + MappedFieldType fieldType = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.LONG); + fieldType.setHasDocValues(true); + fieldType.setName("nested_value"); + try (IndexReader indexReader = wrap(DirectoryReader.open(directory))) { + InternalNested result = searchAndReduce(newIndexSearcher(indexReader), + // match root document only + new DocValuesFieldExistsQuery(PRIMARY_TERM_NAME), nested, fieldType); + InternalMultiBucketAggregation terms = result.getAggregations().get("terms"); + assertThat(terms.getBuckets().size(), equalTo(1)); + assertThat(terms.getBuckets().get(0).getKeyAsString(), equalTo("8")); + } + + } + } + } + + public void testWithNestedScoringAggregations() throws IOException { + try (Directory directory = newDirectory()) { + try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) { + for (int i = 0; i < 10; i++) { + int[] nestedValues = new int[i]; + for (int j = 0; j < i; j++) { + nestedValues[j] = j; + } + indexWriter.addDocuments(generateDocsWithNested(Integer.toString(i), i, nestedValues)); + } + indexWriter.commit(); + for (boolean withScore : new boolean[]{true, false}) { + NestedAggregationBuilder nested = new NestedAggregationBuilder("nested", "nested_object") + .subAggregation(new RareTermsAggregationBuilder("terms", ValueType.LONG) + .field("nested_value") + .maxDocCount(2) + .subAggregation( + new TopHitsAggregationBuilder("top_hits") + .sort(withScore ? 
new ScoreSortBuilder() : new FieldSortBuilder("_doc")) + .storedField("_none_") + ) + ); + MappedFieldType fieldType = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.LONG); + fieldType.setHasDocValues(true); + fieldType.setName("nested_value"); + try (IndexReader indexReader = wrap(DirectoryReader.open(directory))) { + + if (withScore) { + + IllegalStateException e = expectThrows(IllegalStateException.class, + () -> searchAndReduce(newIndexSearcher(indexReader), + // match root document only + new DocValuesFieldExistsQuery(PRIMARY_TERM_NAME), nested, fieldType)); + assertThat(e.getMessage(), equalTo("RareTerms agg [terms] is the child of the nested agg [nested], " + + "and also has a scoring child agg [top_hits]. This combination is not supported because it requires " + + "executing in [depth_first] mode, which the RareTerms agg cannot do.")); + } else { + InternalNested result = searchAndReduce(newIndexSearcher(indexReader), + // match root document only + new DocValuesFieldExistsQuery(PRIMARY_TERM_NAME), nested, fieldType); + InternalMultiBucketAggregation terms = result.getAggregations().get("terms"); + assertThat(terms.getBuckets().size(), equalTo(2)); + long counter = 1; + for (MultiBucketsAggregation.Bucket bucket : terms.getBuckets()) { + InternalTopHits topHits = bucket.getAggregations().get("top_hits"); + TotalHits hits = topHits.getHits().getTotalHits(); + assertNotNull(hits); + assertThat(hits.value, equalTo(counter)); + assertThat(topHits.getHits().getMaxScore(), equalTo(Float.NaN)); + counter += 1; + } + } + } + } + } + } + } + + private final SeqNoFieldMapper.SequenceIDFields sequenceIDFields = SeqNoFieldMapper.SequenceIDFields.emptySeqID(); + private List generateDocsWithNested(String id, int value, int[] nestedValues) { + List documents = new ArrayList<>(); + + for (int nestedValue : nestedValues) { + Document document = new Document(); + document.add(new Field(IdFieldMapper.NAME, Uid.encodeId(id), IdFieldMapper.Defaults.NESTED_FIELD_TYPE)); + document.add(new Field(TypeFieldMapper.NAME, "__nested_object", TypeFieldMapper.Defaults.FIELD_TYPE)); + document.add(new SortedNumericDocValuesField("nested_value", nestedValue)); + documents.add(document); + } + + Document document = new Document(); + document.add(new Field(IdFieldMapper.NAME, Uid.encodeId(id), IdFieldMapper.Defaults.FIELD_TYPE)); + document.add(new Field(TypeFieldMapper.NAME, "docs", TypeFieldMapper.Defaults.FIELD_TYPE)); + document.add(new SortedNumericDocValuesField("value", value)); + document.add(sequenceIDFields.primaryTerm); + documents.add(document); + + return documents; + } + + + private InternalAggregation buildInternalAggregation(RareTermsAggregationBuilder builder, MappedFieldType fieldType, + IndexSearcher searcher) throws IOException { + AbstractRareTermsAggregator aggregator = createAggregator(builder, searcher, fieldType); + aggregator.preCollection(); + searcher.search(new MatchAllDocsQuery(), aggregator); + aggregator.postCollection(); + return aggregator.buildAggregation(0L); + } + + private void testSearchCase(Query query, List dataset, + Consumer configure, + Consumer verify, ValueType valueType) throws IOException { + executeTestCase(false, query, dataset, configure, verify, valueType); + } + + private void testSearchAndReduceCase(Query query, List dataset, + Consumer configure, + Consumer verify, ValueType valueType) throws IOException { + executeTestCase(true, query, dataset, configure, verify, valueType); + } + + private void testBothCases(Query query, List dataset, + 
Consumer configure, + Consumer verify, ValueType valueType) throws IOException { + testSearchCase(query, dataset, configure, verify, valueType); + testSearchAndReduceCase(query, dataset, configure, verify, valueType); + } + + @Override + protected IndexSettings createIndexSettings() { + Settings nodeSettings = Settings.builder() + .put("search.max_buckets", 100000).build(); + return new IndexSettings( + IndexMetaData.builder("_index").settings(Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)) + .numberOfShards(1) + .numberOfReplicas(0) + .creationDate(System.currentTimeMillis()) + .build(), + nodeSettings + ); + } + + private void executeTestCase(boolean reduced, Query query, List dataset, + Consumer configure, + Consumer verify, ValueType valueType) throws IOException { + + try (Directory directory = newDirectory()) { + try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) { + Document document = new Document(); + for (Long value : dataset) { + if (frequently()) { + indexWriter.commit(); + } + + document.add(new SortedNumericDocValuesField(LONG_FIELD, value)); + document.add(new LongPoint(LONG_FIELD, value)); + document.add(new SortedSetDocValuesField(KEYWORD_FIELD, new BytesRef(Long.toString(value)))); + indexWriter.addDocument(document); + document.clear(); + } + } + + try (IndexReader indexReader = DirectoryReader.open(directory)) { + IndexSearcher indexSearcher = newIndexSearcher(indexReader); + + RareTermsAggregationBuilder aggregationBuilder = new RareTermsAggregationBuilder("_name", valueType); + if (configure != null) { + configure.accept(aggregationBuilder); + } + + MappedFieldType keywordFieldType = new KeywordFieldMapper.KeywordFieldType(); + keywordFieldType.setName(KEYWORD_FIELD); + keywordFieldType.setHasDocValues(true); + + MappedFieldType longFieldType = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.LONG); + longFieldType.setName(LONG_FIELD); + longFieldType.setHasDocValues(true); + + InternalMappedRareTerms rareTerms; + if (reduced) { + rareTerms = searchAndReduce(indexSearcher, query, aggregationBuilder, keywordFieldType, longFieldType); + } else { + rareTerms = search(indexSearcher, query, aggregationBuilder, keywordFieldType, longFieldType); + } + verify.accept(rareTerms); + } + } + } + + @Override + public void doAssertReducedMultiBucketConsumer(Aggregation agg, MultiBucketConsumerService.MultiBucketConsumer bucketConsumer) { + /* + * No-op. + * + * This is used in the aggregator tests to check that after a reduction, we have the correct number of buckets. + * This can be done during incremental reduces, and the final reduce. Unfortunately, the number of buckets + * can _decrease_ during runtime as values are reduced together (e.g. 1 count on each shard, but when + * reduced it becomes 2 and is greater than the threshold). + * + * Because the incremental reduction test picks random subsets to reduce together, it's impossible + * to predict how the buckets will end up, and so this assertion will fail. + * + * If we want to put this assertion back in, we'll need this test to override the incremental reduce + * portion so that we can deterministically know which shards are being reduced together and which + * buckets we should have left after each reduction. 
+ */ + } +} diff --git a/test/framework/src/main/java/org/elasticsearch/search/aggregations/AggregatorTestCase.java b/test/framework/src/main/java/org/elasticsearch/search/aggregations/AggregatorTestCase.java index 3bd3b6838a897..b939f8a9110db 100644 --- a/test/framework/src/main/java/org/elasticsearch/search/aggregations/AggregatorTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/search/aggregations/AggregatorTestCase.java @@ -411,7 +411,7 @@ protected A searchAndReduc new InternalAggregation.ReduceContext(root.context().bigArrays(), null, reduceBucketConsumer, false); A reduced = (A) aggs.get(0).doReduce(toReduce, context); - InternalAggregationTestCase.assertMultiBucketConsumer(reduced, reduceBucketConsumer); + doAssertReducedMultiBucketConsumer(reduced, reduceBucketConsumer); aggs = new ArrayList<>(aggs.subList(r, toReduceSize)); aggs.add(reduced); } @@ -427,12 +427,16 @@ protected A searchAndReduc internalAgg = (A) pipelineAggregator.reduce(internalAgg, context); } } - InternalAggregationTestCase.assertMultiBucketConsumer(internalAgg, reduceBucketConsumer); + doAssertReducedMultiBucketConsumer(internalAgg, reduceBucketConsumer); return internalAgg; } } + protected void doAssertReducedMultiBucketConsumer(Aggregation agg, MultiBucketConsumerService.MultiBucketConsumer bucketConsumer) { + InternalAggregationTestCase.assertMultiBucketConsumer(agg, bucketConsumer); + } + private static class ShardSearcher extends IndexSearcher { private final List ctx;