From 2b29ffaf36e87db83134a1ab4564902d42aec5d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Holger=20Hoffst=C3=A4tte?= Date: Mon, 3 Mar 2014 15:55:35 +0100 Subject: [PATCH 1/6] Makes BytesStreamOutput more efficient by introducing internal paging via a (currently private) non-recycling PageCacheRecycler. This change brought to light a bug in FsTranslog, which used seek() incorrectly/ambiguously; this is fixed so that it works with both the old and new implementation. Fix for bug #5159 --- .../recycler/NonPageCacheRecyclerService.java | 158 ++++++++++ .../common/io/stream/BytesStreamOutput.java | 261 +++++++++++---- .../index/translog/fs/FsTranslog.java | 16 +- .../common/io/streams/BytesStreamsTests.java | 298 ++++++++++++++++-- 4 files changed, 655 insertions(+), 78 deletions(-) create mode 100644 src/main/java/org/elasticsearch/cache/recycler/NonPageCacheRecyclerService.java diff --git a/src/main/java/org/elasticsearch/cache/recycler/NonPageCacheRecyclerService.java b/src/main/java/org/elasticsearch/cache/recycler/NonPageCacheRecyclerService.java new file mode 100644 index 0000000000000..b9c363c3d0287 --- /dev/null +++ b/src/main/java/org/elasticsearch/cache/recycler/NonPageCacheRecyclerService.java @@ -0,0 +1,158 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.cache.recycler; + +import org.elasticsearch.common.recycler.AbstractRecyclerC; +import org.elasticsearch.common.recycler.NoneRecycler; +import org.elasticsearch.common.recycler.Recycler; +import org.elasticsearch.common.recycler.Recycler.V; +import org.elasticsearch.common.util.BigArrays; + +// Should implement PageCacheRecycler once that is made into an interface +public class NonPageCacheRecyclerService { + + private final Recycler byteRecycler; + private final Recycler intRecycler; + private final Recycler longRecycler; + private final Recycler floatRecycler; + private final Recycler doubleRecycler; + private final Recycler objectRecycler; + + public NonPageCacheRecyclerService() { + Recycler.C rcb = new NoneRecyclingByteFactory(); + this.byteRecycler = new NoneRecycler(rcb); + Recycler.C rci = new NoneRecyclingIntFactory(); + this.intRecycler = new NoneRecycler(rci); + Recycler.C rcl = new NoneRecyclingLongFactory(); + this.longRecycler = new NoneRecycler(rcl); + Recycler.C rcf = new NoneRecyclingFloatFactory(); + this.floatRecycler = new NoneRecycler(rcf); + Recycler.C rcd = new NoneRecyclingDoubleFactory(); + this.doubleRecycler = new NoneRecycler(rcd); + Recycler.C rco = new NoneRecyclingObjectFactory(); + this.objectRecycler = new NoneRecycler(rco); + } + + public V bytePage(boolean clear) { + return byteRecycler.obtain(BigArrays.BYTE_PAGE_SIZE); + } + + public V intPage(boolean clear) { + return intRecycler.obtain(BigArrays.BYTE_PAGE_SIZE); + } + + public V longPage(boolean clear) { + return longRecycler.obtain(BigArrays.BYTE_PAGE_SIZE); + } + + public V floatPage(boolean clear) { + return floatRecycler.obtain(BigArrays.BYTE_PAGE_SIZE); + } + + public V doublePage(boolean clear) { + return doubleRecycler.obtain(BigArrays.BYTE_PAGE_SIZE); + } + + public V objectPage() { + return objectRecycler.obtain(BigArrays.BYTE_PAGE_SIZE); + } + + public void close() { + byteRecycler.close(); + intRecycler.close(); + longRecycler.close(); + floatRecycler.close(); + doubleRecycler.close(); + objectRecycler.close(); + } + + private static class NoneRecyclingByteFactory extends AbstractRecyclerC { + @Override + public byte[] newInstance(int sizing) { + return new byte[sizing]; + } + + @Override + public void recycle(byte[] value) { + // don't actually recycle: drop for GC + } + } + + private static class NoneRecyclingIntFactory extends AbstractRecyclerC { + @Override + public int[] newInstance(int sizing) { + return new int[sizing]; + } + + @Override + public void recycle(int[] value) { + // don't actually recycle: drop for GC + } + } + + private static class NoneRecyclingLongFactory extends AbstractRecyclerC { + @Override + public long[] newInstance(int sizing) { + return new long[sizing]; + } + + @Override + public void recycle(long[] value) { + // don't actually recycle: drop for GC + } + } + + private static class NoneRecyclingFloatFactory extends AbstractRecyclerC { + @Override + public float[] newInstance(int sizing) { + return new float[sizing]; + } + + @Override + public void recycle(float[] value) { + // don't actually recycle: drop for GC + } + } + + private static class NoneRecyclingDoubleFactory extends AbstractRecyclerC { + @Override + public double[] newInstance(int sizing) { + return new double[sizing]; + } + + @Override + public void recycle(double[] value) { + // don't actually recycle: drop for GC + } + } + + private static class NoneRecyclingObjectFactory extends AbstractRecyclerC { + @Override + public Object[] newInstance(int sizing) { + 
return new Object[sizing]; + } + + @Override + public void recycle(Object[] value) { + // don't actually recycle: drop for GC + } + } + +} diff --git a/src/main/java/org/elasticsearch/common/io/stream/BytesStreamOutput.java b/src/main/java/org/elasticsearch/common/io/stream/BytesStreamOutput.java index 17b87c719223b..4a437cbf2564a 100644 --- a/src/main/java/org/elasticsearch/common/io/stream/BytesStreamOutput.java +++ b/src/main/java/org/elasticsearch/common/io/stream/BytesStreamOutput.java @@ -19,38 +19,99 @@ package org.elasticsearch.common.io.stream; -import org.apache.lucene.util.ArrayUtil; +import org.elasticsearch.cache.recycler.NonPageCacheRecyclerService; +import org.elasticsearch.cache.recycler.PageCacheRecycler; import org.elasticsearch.common.bytes.BytesArray; import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.io.BytesStream; +import org.elasticsearch.common.recycler.Recycler; +import org.elasticsearch.common.util.BigArrays; import java.io.IOException; +import java.util.ArrayList; +import java.util.List; /** - * + * A @link {@link StreamOutput} that uses a {@link PageCacheRecycler} to acquire pages of + * bytes, which avoids frequent reallocation & copying of the internal data. Pages are + * returned to the recycler on {@link #close()}. */ public class BytesStreamOutput extends StreamOutput implements BytesStream { - public static final int DEFAULT_SIZE = 2 * 1024; + // Shared non-recycler for legacy constructor clients; should be removed eventually. + private static final NonPageCacheRecyclerService NONRECYCLING = new NonPageCacheRecyclerService(); - public static final int OVERSIZE_LIMIT = 256 * 1024; + /** + * PageCacheRecycler for acquiring/releasing individual memory pages + */ + private final NonPageCacheRecyclerService pageRecycler; /** * The buffer where data is stored. */ - protected byte buf[]; + private final List> pages; /** * The number of valid bytes in the buffer. */ - protected int count; + private int count; + /** + * Size of a page taken from the PageCacheRecycler. We assume a constant page size for + * all requests. + */ + private final int pageSize; + + /** + * Create a nonrecycling {@link BytesStreamOutput} with 1 initial page acquired. + */ public BytesStreamOutput() { - this(DEFAULT_SIZE); + this(NONRECYCLING, BigArrays.BYTE_PAGE_SIZE); } - public BytesStreamOutput(int size) { - this.buf = new byte[size]; + /** + * Create a nonrecycling {@link BytesStreamOutput} with enough initial pages acquired + * to satisfy the capacity given by {@link expectedSize}. + * + * @param expectedSize the expected maximum size of the stream in bytes. + */ + public BytesStreamOutput(int expectedSize) { + this(NONRECYCLING, expectedSize); + } + + /** + * Create a {@link BytesStreamOutput} with 1 initial page acquired. + * + * @param pageCacheRecycler the {@link PageCacheRecycler} from which to obtain + * bytes[]s. + */ + private BytesStreamOutput(NonPageCacheRecyclerService pageCacheRecycler) { + // expected size does not matter as long as it's >0 + this(pageCacheRecycler, 1); + } + + /** + * Create a {@link BytesStreamOutput} with enough initial pages acquired to satisfy + * the capacity given by {@link expectedSize}. + * + * @param pageCacheRecycler the {@link PageCacheRecycler} from which to obtain + * bytes[]s. + * @param expectedSize the expected maximum size of the stream in bytes. 
+ */ + private BytesStreamOutput(NonPageCacheRecyclerService pageCacheRecycler, int expectedSize) { + this.pageRecycler = pageCacheRecycler; + // there is no good way to figure out the pageSize used by a PCR, so + // get one page and use its size. + Recycler.V vpage = pageRecycler.bytePage(true); + this.pageSize = vpage.v().length; + // expect 16 pages by default, more if specified + this.pages = new ArrayList>(Math.max(16, expectedSize / pageSize)); + // keep already acquired page + this.pages.add(vpage); + // acquire all other requested pages up front if expectedSize > pageSize + if (expectedSize > pageSize) { + ensureCapacity(expectedSize); + } } @Override @@ -63,88 +124,176 @@ public long position() throws IOException { return count; } - @Override - public void seek(long position) throws IOException { - if (position > Integer.MAX_VALUE) { - throw new UnsupportedOperationException(); - } - count = (int) position; - } - @Override public void writeByte(byte b) throws IOException { - int newcount = count + 1; - if (newcount > buf.length) { - buf = grow(newcount); - } - buf[count] = b; - count = newcount; - } - - public void skip(int length) { - int newcount = count + length; - if (newcount > buf.length) { - buf = grow(newcount); - } - count = newcount; + ensureOpen(); + ensureCapacity(count); + byte[] page = pages.get(count / pageSize).v(); + int offset = count % pageSize; + page[offset] = b; + count++; } @Override public void writeBytes(byte[] b, int offset, int length) throws IOException { + ensureOpen(); + + // nothing to copy if (length == 0) { return; } - int newcount = count + length; - if (newcount > buf.length) { - buf = grow(newcount); + + // illegal args: offset and/or length exceed array size + if (b.length < (offset + length)) { + throw new IllegalArgumentException("Illegal offset " + offset + "/length " + length + " for byte[] of length " + b.length); } - System.arraycopy(b, offset, buf, count, length); - count = newcount; - } - private byte[] grow(int newCount) { - // try and grow faster while we are small... - if (newCount < OVERSIZE_LIMIT) { - newCount = Math.max(buf.length << 1, newCount); + // get enough pages for new size + ensureCapacity(count + length); + + // where are we? + int currentPageIndex = count / pageSize; + byte[] currentPage = pages.get(currentPageIndex).v(); + int initialOffset = count % pageSize; + int remainingPageCapacity = pageSize - initialOffset; + int remainingLength = length; + + // try to fill the current page pointed to by #count in one copy if possible + if (remainingLength <= remainingPageCapacity) { + // simply copy all of length + System.arraycopy(b, offset, currentPage, initialOffset, remainingLength); + count += remainingLength; } - return ArrayUtil.grow(buf, newCount); - } + else { + // first fill remainder of first page + System.arraycopy(b, offset, currentPage, initialOffset, remainingPageCapacity); + count += remainingPageCapacity; + remainingLength -= remainingPageCapacity; + int copied = remainingPageCapacity; + + // if there is enough to copy try to fill adjacent pages directly + while (remainingLength > pageSize) { + // advance & fill next page + currentPage = pages.get(++currentPageIndex).v(); + System.arraycopy(b, offset + copied, currentPage, 0, pageSize); + copied += pageSize; + remainingLength -= pageSize; + count += pageSize; + } - public void seek(int seekTo) { - count = seekTo; + // finally take care of remaining bytes: tricky since the above loop may not + // have run, and #count could point to the middle of a page. 
So figure out + // again where we are. + currentPageIndex = count / pageSize; + currentPage = pages.get(currentPageIndex).v(); + initialOffset = count % pageSize; + System.arraycopy(b, offset + copied, currentPage, initialOffset, remainingLength); + count += remainingLength; + } } public void reset() { + ensureOpen(); + // reset the count but keep all acquired pages count = 0; } - public int bufferSize() { - return buf.length; - } - @Override public void flush() throws IOException { + ensureOpen(); // nothing to do there } @Override - public void close() throws IOException { - // nothing to do here + public void seek(long position) throws IOException { + if (position > Integer.MAX_VALUE) { + throw new IllegalArgumentException("position " + position + " > Integer.MAX_VALUE"); + } + count = (int)position; + ensureCapacity(count); + } + + public void skip(int length) { + count += length; + ensureCapacity(count); } @Override - public BytesReference bytes() { - return new BytesArray(buf, 0, count); + public void close() throws IOException { + // since most callers are broken we don't do anything here. + // a strict implementation would look liek this: + // if (count != -1) { + // // return pages to recycler + // for (Recycler.V vpage : pages) { + // vpage.release(); + // } + // + // // clear refs + // pages.clear(); + // + // // mark as closed + // count = -1; + // } } /** * Returns the current size of the buffer. - * - * @return the value of the count field, which is the number - * of valid bytes in this output stream. + * + * @return the value of the count field, which is the number of valid + * bytes in this output stream. * @see java.io.ByteArrayOutputStream#count */ public int size() { return count; } + + @Override + public BytesReference bytes() { + // for now just create a copy; later we might create a page-aware BytesReference + // and just transfer ownership. + return new BytesArray(toByteArray(), 0, count); + } + + private byte[] toByteArray() { + // stricly speaking this is undefined. 
:/ + // ensureOpen(); + + // result array + byte[] result = new byte[count]; + int resultOffset = 0; + + // copy all full pages + int toCopy = Math.min(count, pageSize); + int numPages = count / pageSize; + for (int i = 0; i < numPages; i++) { + byte[] page = pages.get(i).v(); + System.arraycopy(page, 0, result, resultOffset, toCopy); + resultOffset += toCopy; + } + + // copy any remaining bytes from the last page + int remainder = count % pageSize; + if (remainder > 0) { + byte[] lastPage = pages.get(numPages).v(); + System.arraycopy(lastPage, 0, result, resultOffset, remainder); + } + + return result; + } + + private void ensureOpen() { + if (count < 0) { + throw new IllegalStateException("Stream is already closed."); + } + } + + private void ensureCapacity(int offset) { + int capacity = pageSize * pages.size(); + while (offset >= capacity) { + Recycler.V vpage = pageRecycler.bytePage(true); + pages.add(vpage); + capacity += pageSize; + } + } + } diff --git a/src/main/java/org/elasticsearch/index/translog/fs/FsTranslog.java b/src/main/java/org/elasticsearch/index/translog/fs/FsTranslog.java index 3c92ca0e2e9a1..2d9907cfaa15e 100644 --- a/src/main/java/org/elasticsearch/index/translog/fs/FsTranslog.java +++ b/src/main/java/org/elasticsearch/index/translog/fs/FsTranslog.java @@ -21,7 +21,7 @@ import jsr166y.ThreadLocalRandom; import org.elasticsearch.ElasticsearchException; -import org.elasticsearch.index.translog.TranslogStats; +import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.io.FileSystemUtils; import org.elasticsearch.common.io.stream.BytesStreamOutput; @@ -34,6 +34,7 @@ import org.elasticsearch.index.shard.ShardId; import org.elasticsearch.index.translog.Translog; import org.elasticsearch.index.translog.TranslogException; +import org.elasticsearch.index.translog.TranslogStats; import org.elasticsearch.index.translog.TranslogStreams; import java.io.File; @@ -340,18 +341,25 @@ public Location add(Operation operation) throws TranslogException { TranslogStreams.writeTranslogOperation(out, operation); out.flush(); + // write size to beginning of stream int size = out.size(); out.seek(0); out.writeInt(size - 4); - - Location location = current.add(out.bytes().array(), out.bytes().arrayOffset(), size); + + // seek back to end + out.seek(size); + + BytesReference ref = out.bytes(); + byte[] refBytes = ref.array(); + int refBytesOffset = ref.arrayOffset(); + Location location = current.add(refBytes, refBytesOffset, size); if (syncOnEachOperation) { current.sync(); } FsTranslogFile trans = this.trans; if (trans != null) { try { - location = trans.add(out.bytes().array(), out.bytes().arrayOffset(), size); + location = trans.add(refBytes, refBytesOffset, size); } catch (ClosedChannelException e) { // ignore } diff --git a/src/test/java/org/elasticsearch/common/io/streams/BytesStreamsTests.java b/src/test/java/org/elasticsearch/common/io/streams/BytesStreamsTests.java index 3a34b17ef6ca2..8f558bc1feee3 100644 --- a/src/test/java/org/elasticsearch/common/io/streams/BytesStreamsTests.java +++ b/src/test/java/org/elasticsearch/common/io/streams/BytesStreamsTests.java @@ -22,24 +22,249 @@ import org.apache.lucene.util.Constants; import org.elasticsearch.common.io.stream.BytesStreamInput; import org.elasticsearch.common.io.stream.BytesStreamOutput; +import org.elasticsearch.common.util.BigArrays; import org.elasticsearch.test.ElasticsearchTestCase; +import org.junit.Ignore; import org.junit.Test; import static 
org.hamcrest.Matchers.closeTo; import static org.hamcrest.Matchers.equalTo; /** - * + * Tests for {@link BytesStreamOutput} paging behaviour. */ public class BytesStreamsTests extends ElasticsearchTestCase { + @Test + public void testEmpty() throws Exception { + BytesStreamOutput out = new BytesStreamOutput(); + + // test empty stream to array + assertEquals(0, out.size()); + assertEquals(0, out.bytes().toBytes().length); + + out.close(); + } + + @Test + public void testSingleByte() throws Exception { + BytesStreamOutput out = new BytesStreamOutput(); + assertEquals(0, out.size()); + + int expectedSize = 1; + byte[] expectedData = randomizedByteArrayWithSize(expectedSize); + + // write single byte + out.writeByte(expectedData[0]); + assertEquals(expectedSize, out.size()); + assertArrayEquals(expectedData, out.bytes().toBytes()); + + out.close(); + } + + @Test + public void testSingleShortPage() throws Exception { + BytesStreamOutput out = new BytesStreamOutput(); + + int expectedSize = 10; + byte[] expectedData = randomizedByteArrayWithSize(expectedSize); + + // write byte-by-byte + for (int i = 0; i < expectedSize; i++) { + out.writeByte(expectedData[i]); + } + + assertEquals(expectedSize, out.size()); + assertArrayEquals(expectedData, out.bytes().toBytes()); + + out.close(); + } + + @Test + public void testIllegalBulkWrite() throws Exception { + BytesStreamOutput out = new BytesStreamOutput(); + + // bulk-write with wrong args + try { + out.writeBytes(new byte[]{}, 0, 1); + fail("expected IllegalArgumentException: length > (size-offset)"); + } + catch (IllegalArgumentException iax1) { + // expected + } + + out.close(); + } + + @Test + public void testSingleShortPageBulkWrite() throws Exception { + BytesStreamOutput out = new BytesStreamOutput(); + + // first bulk-write empty array: should not change anything + int expectedSize = 0; + byte[] expectedData = randomizedByteArrayWithSize(expectedSize); + out.writeBytes(expectedData); + assertEquals(expectedSize, out.size()); + assertArrayEquals(expectedData, out.bytes().toBytes()); + + // bulk-write again with actual bytes + expectedSize = 10; + expectedData = randomizedByteArrayWithSize(expectedSize); + out.writeBytes(expectedData); + assertEquals(expectedSize, out.size()); + assertArrayEquals(expectedData, out.bytes().toBytes()); + + out.close(); + } + + @Test + public void testSingleFullPageBulkWrite() throws Exception { + BytesStreamOutput out = new BytesStreamOutput(); + + int expectedSize = BigArrays.BYTE_PAGE_SIZE; + byte[] expectedData = randomizedByteArrayWithSize(expectedSize); + + // write in bulk + out.writeBytes(expectedData); + + assertEquals(expectedSize, out.size()); + assertArrayEquals(expectedData, out.bytes().toBytes()); + + out.close(); + } + + @Test + public void testSingleFullPageBulkWriteWithOffset() throws Exception { + BytesStreamOutput out = new BytesStreamOutput(); + + int initialOffset = 10; + int additionalLength = BigArrays.BYTE_PAGE_SIZE; + byte[] expectedData = randomizedByteArrayWithSize(initialOffset + additionalLength); + + // first create initial offset + out.writeBytes(expectedData, 0, initialOffset); + assertEquals(initialOffset, out.size()); + + // now write the rest - more than fits into the remaining first page + out.writeBytes(expectedData, initialOffset, additionalLength); + assertEquals(expectedData.length, out.size()); + assertArrayEquals(expectedData, out.bytes().toBytes()); + + out.close(); + } + + @Test + public void testSingleFullPageBulkWriteWithOffsetCrossover() throws Exception { + 
BytesStreamOutput out = new BytesStreamOutput(); + + int initialOffset = 10; + int additionalLength = BigArrays.BYTE_PAGE_SIZE * 2; + byte[] expectedData = randomizedByteArrayWithSize(initialOffset + additionalLength); + out.writeBytes(expectedData, 0, initialOffset); + assertEquals(initialOffset, out.size()); + + // now write the rest - more than fits into the remaining page + a full page after + // that, + // ie. we cross over into a third + out.writeBytes(expectedData, initialOffset, additionalLength); + assertEquals(expectedData.length, out.size()); + assertArrayEquals(expectedData, out.bytes().toBytes()); + + out.close(); + } + + @Test + public void testSingleFullPage() throws Exception { + BytesStreamOutput out = new BytesStreamOutput(); + + int expectedSize = BigArrays.BYTE_PAGE_SIZE; + byte[] expectedData = randomizedByteArrayWithSize(expectedSize); + + // write byte-by-byte + for (int i = 0; i < expectedSize; i++) { + out.writeByte(expectedData[i]); + } + + assertEquals(expectedSize, out.size()); + assertArrayEquals(expectedData, out.bytes().toBytes()); + + out.close(); + } + + @Test + public void testOneFullOneShortPage() throws Exception { + BytesStreamOutput out = new BytesStreamOutput(); + + int expectedSize = BigArrays.BYTE_PAGE_SIZE + 10; + byte[] expectedData = randomizedByteArrayWithSize(expectedSize); + + // write byte-by-byte + for (int i = 0; i < expectedSize; i++) { + out.writeByte(expectedData[i]); + } + + assertEquals(expectedSize, out.size()); + assertArrayEquals(expectedData, out.bytes().toBytes()); + + out.close(); + } + + @Test + public void testTwoFullOneShortPage() throws Exception { + BytesStreamOutput out = new BytesStreamOutput(); + + int expectedSize = (BigArrays.BYTE_PAGE_SIZE * 2) + 1; + byte[] expectedData = randomizedByteArrayWithSize(expectedSize); + + // write byte-by-byte + for (int i = 0; i < expectedSize; i++) { + out.writeByte(expectedData[i]); + } + + assertEquals(expectedSize, out.size()); + assertArrayEquals(expectedData, out.bytes().toBytes()); + + out.close(); + } + + @Test + public void testSeek() throws Exception { + BytesStreamOutput out = new BytesStreamOutput(); + + int position = 0; + assertEquals(position, out.position()); + + out.seek(position += 10); + out.seek(position += BigArrays.BYTE_PAGE_SIZE); + out.seek(position += BigArrays.BYTE_PAGE_SIZE + 10); + out.seek(position += BigArrays.BYTE_PAGE_SIZE * 2); + assertEquals(position, out.position()); + assertEquals(position, out.bytes().toBytes().length); + + out.close(); + } + + @Test + public void testSkip() throws Exception { + BytesStreamOutput out = new BytesStreamOutput(); + + int position = 0; + assertEquals(position, out.position()); + + int forward = 100; + out.skip(forward); + assertEquals(position + forward, out.position()); + + out.close(); + } + @Test public void testSimpleStreams() throws Exception { assumeTrue(Constants.JRE_IS_64BIT); BytesStreamOutput out = new BytesStreamOutput(); out.writeBoolean(false); - out.writeByte((byte) 1); - out.writeShort((short) -1); + out.writeByte((byte)1); + out.writeShort((short)-1); out.writeInt(-1); out.writeVInt(2); out.writeLong(-3); @@ -58,13 +283,13 @@ public void testSimpleStreams() throws Exception { out.writeString("goodbye"); BytesStreamInput in = new BytesStreamInput(out.bytes().toBytes(), false); assertThat(in.readBoolean(), equalTo(false)); - assertThat(in.readByte(), equalTo((byte) 1)); - assertThat(in.readShort(), equalTo((short) -1)); + assertThat(in.readByte(), equalTo((byte)1)); + assertThat(in.readShort(), 
equalTo((short)-1)); assertThat(in.readInt(), equalTo(-1)); assertThat(in.readVInt(), equalTo(2)); - assertThat(in.readLong(), equalTo((long) -3)); - assertThat(in.readVLong(), equalTo((long) 4)); - assertThat((double) in.readFloat(), closeTo(1.1, 0.0001)); + assertThat(in.readLong(), equalTo((long)-3)); + assertThat(in.readVLong(), equalTo((long)4)); + assertThat((double)in.readFloat(), closeTo(1.1, 0.0001)); assertThat(in.readDouble(), closeTo(2.2, 0.0001)); assertThat(in.readGenericValue(), equalTo((Object)intArray)); assertThat(in.readGenericValue(), equalTo((Object)longArray)); @@ -72,19 +297,56 @@ public void testSimpleStreams() throws Exception { assertThat(in.readGenericValue(), equalTo((Object)doubleArray)); assertThat(in.readString(), equalTo("hello")); assertThat(in.readString(), equalTo("goodbye")); + in.close(); + out.close(); } + // we ignore this test for now since all existing callers of BytesStreamOutput happily + // call bytes() after close(). + @Ignore @Test - public void testGrowLogic() throws Exception { - assumeTrue(Constants.JRE_IS_64BIT); + public void testAccessAfterClose() throws Exception { BytesStreamOutput out = new BytesStreamOutput(); - out.writeBytes(new byte[BytesStreamOutput.DEFAULT_SIZE - 5]); - assertThat(out.bufferSize(), equalTo(2048)); // remains the default - out.writeBytes(new byte[1 * 1024]); - assertThat(out.bufferSize(), equalTo(4608)); - out.writeBytes(new byte[32 * 1024]); - assertThat(out.bufferSize(), equalTo(40320)); - out.writeBytes(new byte[32 * 1024]); - assertThat(out.bufferSize(), equalTo(90720)); + + // immediately close + out.close(); + + assertEquals(-1, out.size()); + assertEquals(-1, out.position()); + + // writing a single byte must fail + try { + out.writeByte((byte)0); + fail("expected IllegalStateException: stream closed"); + } + catch (IllegalStateException iex1) { + // expected + } + + // writing in bulk must fail + try { + out.writeBytes(new byte[0], 0, 0); + fail("expected IllegalStateException: stream closed"); + } + catch (IllegalStateException iex1) { + // expected + } + + // toByteArray() must fail + try { + out.bytes().toBytes(); + fail("expected IllegalStateException: stream closed"); + } + catch (IllegalStateException iex1) { + // expected + } + + } + + // create & fill byte[] with randomized data + protected byte[] randomizedByteArrayWithSize(int size) { + byte[] data = new byte[size]; + getRandom().nextBytes(data); + return data; } } From d5278df1ca1ce260dd54306bcce74cf12aa02af3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Holger=20Hoffst=C3=A4tte?= Date: Mon, 3 Mar 2014 17:15:38 +0100 Subject: [PATCH 2/6] Minor style fixes, update BlobStoreRepository similar to FsTranslog. 
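Since bytes() now assembles the page-backed buffer into a fresh byte[] on every call,
callers that previously invoked out.bytes() several times per operation are updated to
fetch the BytesReference once and reuse it. The pattern, as applied to the blob-store
writes below (sketch only, surrounding error handling omitted):

    // before: each bytes() call re-assembles the whole buffer
    snapshotsBlobContainer.writeBlob(blobName, bStream.bytes().streamInput(), bStream.bytes().length());

    // after: assemble once, then reuse the reference
    BytesReference bRef = bStream.bytes();
    snapshotsBlobContainer.writeBlob(blobName, bRef.streamInput(), bRef.length());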
--- ...java => NonePageCacheRecyclerService.java} | 7 +++- .../common/io/stream/BytesStreamOutput.java | 37 +++++++------------ .../blobstore/BlobStoreRepository.java | 16 +++++--- 3 files changed, 29 insertions(+), 31 deletions(-) rename src/main/java/org/elasticsearch/cache/recycler/{NonPageCacheRecyclerService.java => NonePageCacheRecyclerService.java} (94%) diff --git a/src/main/java/org/elasticsearch/cache/recycler/NonPageCacheRecyclerService.java b/src/main/java/org/elasticsearch/cache/recycler/NonePageCacheRecyclerService.java similarity index 94% rename from src/main/java/org/elasticsearch/cache/recycler/NonPageCacheRecyclerService.java rename to src/main/java/org/elasticsearch/cache/recycler/NonePageCacheRecyclerService.java index b9c363c3d0287..717675610d961 100644 --- a/src/main/java/org/elasticsearch/cache/recycler/NonPageCacheRecyclerService.java +++ b/src/main/java/org/elasticsearch/cache/recycler/NonePageCacheRecyclerService.java @@ -26,7 +26,10 @@ import org.elasticsearch.common.util.BigArrays; // Should implement PageCacheRecycler once that is made into an interface -public class NonPageCacheRecyclerService { +public class NonePageCacheRecyclerService { + + // Shared non-recycler for legacy constructor clients; should be removed eventually. + public static final NonePageCacheRecyclerService INSTANCE = new NonePageCacheRecyclerService(); private final Recycler byteRecycler; private final Recycler intRecycler; @@ -35,7 +38,7 @@ public class NonPageCacheRecyclerService { private final Recycler doubleRecycler; private final Recycler objectRecycler; - public NonPageCacheRecyclerService() { + private NonePageCacheRecyclerService() { Recycler.C rcb = new NoneRecyclingByteFactory(); this.byteRecycler = new NoneRecycler(rcb); Recycler.C rci = new NoneRecyclingIntFactory(); diff --git a/src/main/java/org/elasticsearch/common/io/stream/BytesStreamOutput.java b/src/main/java/org/elasticsearch/common/io/stream/BytesStreamOutput.java index 4a437cbf2564a..581ecfc8d7c86 100644 --- a/src/main/java/org/elasticsearch/common/io/stream/BytesStreamOutput.java +++ b/src/main/java/org/elasticsearch/common/io/stream/BytesStreamOutput.java @@ -19,7 +19,7 @@ package org.elasticsearch.common.io.stream; -import org.elasticsearch.cache.recycler.NonPageCacheRecyclerService; +import org.elasticsearch.cache.recycler.NonePageCacheRecyclerService; import org.elasticsearch.cache.recycler.PageCacheRecycler; import org.elasticsearch.common.bytes.BytesArray; import org.elasticsearch.common.bytes.BytesReference; @@ -38,13 +38,10 @@ */ public class BytesStreamOutput extends StreamOutput implements BytesStream { - // Shared non-recycler for legacy constructor clients; should be removed eventually. - private static final NonPageCacheRecyclerService NONRECYCLING = new NonPageCacheRecyclerService(); - /** * PageCacheRecycler for acquiring/releasing individual memory pages */ - private final NonPageCacheRecyclerService pageRecycler; + private final NonePageCacheRecyclerService pageRecycler; /** * The buffer where data is stored. @@ -66,7 +63,7 @@ public class BytesStreamOutput extends StreamOutput implements BytesStream { * Create a nonrecycling {@link BytesStreamOutput} with 1 initial page acquired. */ public BytesStreamOutput() { - this(NONRECYCLING, BigArrays.BYTE_PAGE_SIZE); + this(NonePageCacheRecyclerService.INSTANCE, BigArrays.BYTE_PAGE_SIZE); } /** @@ -76,7 +73,7 @@ public BytesStreamOutput() { * @param expectedSize the expected maximum size of the stream in bytes. 
*/ public BytesStreamOutput(int expectedSize) { - this(NONRECYCLING, expectedSize); + this(NonePageCacheRecyclerService.INSTANCE, expectedSize); } /** @@ -85,7 +82,7 @@ public BytesStreamOutput(int expectedSize) { * @param pageCacheRecycler the {@link PageCacheRecycler} from which to obtain * bytes[]s. */ - private BytesStreamOutput(NonPageCacheRecyclerService pageCacheRecycler) { + private BytesStreamOutput(NonePageCacheRecyclerService pageCacheRecycler) { // expected size does not matter as long as it's >0 this(pageCacheRecycler, 1); } @@ -98,7 +95,7 @@ private BytesStreamOutput(NonPageCacheRecyclerService pageCacheRecycler) { * bytes[]s. * @param expectedSize the expected maximum size of the stream in bytes. */ - private BytesStreamOutput(NonPageCacheRecyclerService pageCacheRecycler, int expectedSize) { + private BytesStreamOutput(NonePageCacheRecyclerService pageCacheRecycler, int expectedSize) { this.pageRecycler = pageCacheRecycler; // there is no good way to figure out the pageSize used by a PCR, so // get one page and use its size. @@ -220,20 +217,7 @@ public void skip(int length) { @Override public void close() throws IOException { - // since most callers are broken we don't do anything here. - // a strict implementation would look liek this: - // if (count != -1) { - // // return pages to recycler - // for (Recycler.V vpage : pages) { - // vpage.release(); - // } - // - // // clear refs - // pages.clear(); - // - // // mark as closed - // count = -1; - // } + // empty for now. } /** @@ -258,7 +242,12 @@ private byte[] toByteArray() { // stricly speaking this is undefined. :/ // ensureOpen(); - // result array + if (count <= pageSize) { + // simply return the first page + return pages.get(0).v(); + } + + // create result array byte[] result = new byte[count]; int resultOffset = 0; diff --git a/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java b/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java index 5e663c25dbb15..f26c50b93182a 100644 --- a/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java +++ b/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java @@ -32,6 +32,7 @@ import org.elasticsearch.common.blobstore.BlobPath; import org.elasticsearch.common.blobstore.BlobStore; import org.elasticsearch.common.blobstore.ImmutableBlobContainer; +import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.component.AbstractLifecycleComponent; import org.elasticsearch.common.compress.CompressorFactory; import org.elasticsearch.common.io.stream.BytesStreamOutput; @@ -221,11 +222,13 @@ public void initializeSnapshot(SnapshotId snapshotId, ImmutableList indi // TODO: Can we make it atomic? 
throw new InvalidSnapshotNameException(snapshotId, "snapshot with such name already exists"); } - snapshotsBlobContainer.writeBlob(snapshotBlobName, bStream.bytes().streamInput(), bStream.bytes().length()); + BytesReference bRef = bStream.bytes(); + snapshotsBlobContainer.writeBlob(snapshotBlobName, bRef.streamInput(), bRef.length()); // Write Global MetaData // TODO: Check if metadata needs to be written bStream = writeGlobalMetaData(metaData); - snapshotsBlobContainer.writeBlob(metaDataBlobName(snapshotId), bStream.bytes().streamInput(), bStream.bytes().length()); + bRef = bStream.bytes(); + snapshotsBlobContainer.writeBlob(metaDataBlobName(snapshotId), bRef.streamInput(), bRef.length()); for (String index : indices) { IndexMetaData indexMetaData = metaData.index(index); BlobPath indexPath = basePath().add("indices").add(index); @@ -240,7 +243,8 @@ public void initializeSnapshot(SnapshotId snapshotId, ImmutableList indi IndexMetaData.Builder.toXContent(indexMetaData, builder, ToXContent.EMPTY_PARAMS); builder.endObject(); builder.close(); - indexMetaDataBlobContainer.writeBlob(snapshotBlobName(snapshotId), bStream.bytes().streamInput(), bStream.bytes().length()); + bRef = bStream.bytes(); + indexMetaDataBlobContainer.writeBlob(snapshotBlobName(snapshotId), bRef.streamInput(), bRef.length()); } } catch (IOException ex) { throw new SnapshotCreationException(snapshotId, ex); @@ -314,7 +318,8 @@ public Snapshot finalizeSnapshot(SnapshotId snapshotId, String failure, int tota updatedSnapshot.endTime(System.currentTimeMillis()); snapshot = updatedSnapshot.build(); BytesStreamOutput bStream = writeSnapshot(snapshot); - snapshotsBlobContainer.writeBlob(blobName, bStream.bytes().streamInput(), bStream.bytes().length()); + BytesReference bRef = bStream.bytes(); + snapshotsBlobContainer.writeBlob(blobName, bRef.streamInput(), bRef.length()); ImmutableList snapshotIds = snapshots(); if (!snapshotIds.contains(snapshotId)) { snapshotIds = ImmutableList.builder().addAll(snapshotIds).add(snapshotId).build(); @@ -569,7 +574,8 @@ protected void writeSnapshotList(ImmutableList snapshots) throws IOE builder.endArray(); builder.endObject(); builder.close(); - snapshotsBlobContainer.writeBlob(SNAPSHOTS_FILE, bStream.bytes().streamInput(), bStream.bytes().length()); + BytesReference bRef = bStream.bytes(); + snapshotsBlobContainer.writeBlob(SNAPSHOTS_FILE, bRef.streamInput(), bRef.length()); } /** From 8bf1cad1023c5b9ad4be5868797a7fadfba32bff Mon Sep 17 00:00:00 2001 From: Zachary Tong Date: Mon, 6 Jan 2014 17:35:51 -0500 Subject: [PATCH 3/6] Percentiles aggregation. A new metric aggregation that can compute approximate values of arbitrary percentiles. 
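From the Java API the aggregation is requested through the new
AggregationBuilders.percentiles(name) factory, which returns a PercentilesBuilder.
A rough usage sketch follows; the field() and percentiles() setters are assumed to
mirror the documented "field" and "percents" request parameters, and the index name
is illustrative only:

    SearchResponse response = client.prepareSearch("logs")
            .addAggregation(AggregationBuilders.percentiles("load_time_outlier")
                    .field("load_time")
                    .percentiles(95, 99, 99.9))
            .execute().actionGet();

    Percentiles loadTimes = response.getAggregations().get("load_time_outlier");
    double p99 = loadTimes.percentile(99);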
Close #5323 --- .../search/aggregations/metrics.asciidoc | 2 + .../metrics/percentile-aggregation.asciidoc | 180 +++++ pom.xml | 9 + .../elasticsearch/common/util/ArrayUtils.java | 72 ++ .../elasticsearch/common/util/BigArrays.java | 7 + .../common/util/BigIntArray.java | 17 + .../common/util/BigObjectArray.java | 2 +- .../elasticsearch/common/util/IntArray.java | 5 + .../common/util/ObjectArray.java | 4 +- .../aggregations/AggregationBuilders.java | 5 + .../aggregations/AggregationModule.java | 2 + .../TransportAggregationModule.java | 2 + .../percentiles/InternalPercentiles.java | 190 +++++ .../metrics/percentiles/Percentiles.java | 82 ++ .../percentiles/PercentilesAggregator.java | 125 +++ .../percentiles/PercentilesBuilder.java | 67 ++ .../percentiles/PercentilesEstimator.java | 123 +++ .../percentiles/PercentilesParser.java | 177 +++++ .../tdigest/GroupRedBlackTree.java | 259 ++++++ .../percentiles/tdigest/RedBlackTree.java | 737 ++++++++++++++++++ .../metrics/percentiles/tdigest/TDigest.java | 175 +++++ .../percentiles/tdigest/TDigestState.java | 353 +++++++++ ...PercentilesAggregationSearchBenchmark.java | 209 +++++ .../common/util/ArrayUtilsTests.java | 92 +++ .../search/aggregations/RandomTests.java | 26 + .../metrics/AbstractNumericTests.java | 9 +- .../aggregations/metrics/GroupTree.java | 482 ++++++++++++ .../aggregations/metrics/GroupTreeTests.java | 100 +++ .../metrics/PercentilesTests.java | 391 ++++++++++ .../metrics/RedBlackTreeTests.java | 185 +++++ .../metrics/TDigestStateTests.java | 194 +++++ .../test/cache/recycler/MockBigArrays.java | 5 + 32 files changed, 4284 insertions(+), 4 deletions(-) create mode 100644 docs/reference/search/aggregations/metrics/percentile-aggregation.asciidoc create mode 100644 src/main/java/org/elasticsearch/common/util/ArrayUtils.java create mode 100644 src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/InternalPercentiles.java create mode 100644 src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/Percentiles.java create mode 100644 src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/PercentilesAggregator.java create mode 100644 src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/PercentilesBuilder.java create mode 100644 src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/PercentilesEstimator.java create mode 100644 src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/PercentilesParser.java create mode 100644 src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/tdigest/GroupRedBlackTree.java create mode 100644 src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/tdigest/RedBlackTree.java create mode 100644 src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/tdigest/TDigest.java create mode 100644 src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/tdigest/TDigestState.java create mode 100644 src/test/java/org/elasticsearch/benchmark/search/aggregations/PercentilesAggregationSearchBenchmark.java create mode 100644 src/test/java/org/elasticsearch/common/util/ArrayUtilsTests.java create mode 100644 src/test/java/org/elasticsearch/search/aggregations/metrics/GroupTree.java create mode 100644 src/test/java/org/elasticsearch/search/aggregations/metrics/GroupTreeTests.java create mode 100644 src/test/java/org/elasticsearch/search/aggregations/metrics/PercentilesTests.java create mode 100644 
src/test/java/org/elasticsearch/search/aggregations/metrics/RedBlackTreeTests.java create mode 100644 src/test/java/org/elasticsearch/search/aggregations/metrics/TDigestStateTests.java diff --git a/docs/reference/search/aggregations/metrics.asciidoc b/docs/reference/search/aggregations/metrics.asciidoc index 068fda2b3588e..ebe7664bcda29 100644 --- a/docs/reference/search/aggregations/metrics.asciidoc +++ b/docs/reference/search/aggregations/metrics.asciidoc @@ -13,3 +13,5 @@ include::metrics/stats-aggregation.asciidoc[] include::metrics/extendedstats-aggregation.asciidoc[] include::metrics/valuecount-aggregation.asciidoc[] + +include::metrics/percentile-aggregation.asciidoc[] \ No newline at end of file diff --git a/docs/reference/search/aggregations/metrics/percentile-aggregation.asciidoc b/docs/reference/search/aggregations/metrics/percentile-aggregation.asciidoc new file mode 100644 index 0000000000000..df826d24a3aac --- /dev/null +++ b/docs/reference/search/aggregations/metrics/percentile-aggregation.asciidoc @@ -0,0 +1,180 @@ +[[search-aggregations-metrics-percentile-aggregation]] +=== Percentiles +A `multi-value` metrics aggregation that calculates one or more percentiles +over numeric values extracted from the aggregated documents. These values +can be extracted either from specific numeric fields in the documents, or +be generated by a provided script. + +.Experimental! +[IMPORTANT] +===== +This feature is marked as experimental, and may be subject to change in the +future. If you use this feature, please let us know your experience with it! +===== + +Percentiles show the point at which a certain percentage of observed values +occur. For example, the 95th percentile is the value which is greater than 95% +of the observed values. + +Percentiles are often used to find outliers. In normal distributions, the +0.13th and 99.87th percentiles represents three standard deviations from the +mean. Any data which falls outside three standard deviations is often considered +an anomaly. + +When a range of percentiles are retrieved, they can be used to estimate the +data distribution and determine if the data is skewed, bimodal, etc. + +Assume your data consists of website load times. The average and median +load times are not overly useful to an administrator. The max may be interesting, +but it can be easily skewed by a single slow response. + +Let's look at a range of percentiles representing load time: + +[source,js] +-------------------------------------------------- +{ + "aggs" : { + "load_time_outlier" : { + "percentiles" : { + "field" : "load_time" <1> + } + } + } +} +-------------------------------------------------- +<1> The field `load_time` must be a numeric field + +By default, the `percentile` metric will generate a range of +percentiles: `[ 1, 5, 25, 50, 75, 95, 99 ]`. The response will look like this: + +[source,js] +-------------------------------------------------- +{ + ... + + "aggregations": { + "load_time_outlier": { + "1.0": 15, + "5.0": 20, + "25.0": 23, + "50.0": 25, + "75.0": 29, + "95.0": 60, + "99.0": 150 + } + } +} +-------------------------------------------------- + +As you can see, the aggregation will return a calculated value for each percentile +in the default range. If we assume response times are in milliseconds, it is +immediately obvious that the webpage normally loads in 15-30ms, but occasionally +spikes to 60-150ms. + +Often, administrators are only interested in outliers -- the extreme percentiles. 
+We can specify just the percents we are interested in (requested percentiles +must be a value between 0-100 inclusive): + +[source,js] +-------------------------------------------------- +{ + "aggs" : { + "load_time_outlier" : { + "percentiles" : { + "field" : "load_time", + "percents" : [95, 99, 99.9] <1> + } + } + } +} +-------------------------------------------------- +<1> Use the `percents` parameter to specify particular percentiles to calculate + + + +==== Script + +The percentile metric supports scripting. For example, if our load times +are in milliseconds but we want percentiles calculated in seconds, we could use +a script to convert them on-the-fly: + +[source,js] +-------------------------------------------------- +{ + "aggs" : { + "load_time_outlier" : { + "percentiles" : { + "script" : "doc['load_time'].value / timeUnit", <1> + "params" : { + "timeUnit" : 1000 <2> + } + } + } + } +} +-------------------------------------------------- +<1> The `field` parameter is replaced with a `script` parameter, which uses the +script to generate values which percentiles are calculated on +<2> Scripting supports parameterized input just like any other script + +==== Percentiles are (usually) approximate + +There are many different algorithms to calculate percentiles. The naive +implementation simply stores all the values in a sorted array. To find the 50th +percentile, you simple find the value that is at `my_array[count(my_array) * 0.5]`. + +Clearly, the naive implementation does not scale -- the sorted array grows +linearly with the number of values in your dataset. To calculate percentiles +across potentially billions of values in an Elasticsearch cluster, _approximate_ +percentiles are calculated. + +The algorithm used by the `percentile` metric is called TDigest (introduced by +Ted Dunning in +https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf[Computing Accurate Quantiles using T-Digests]). + +When using this metric, there are a few guidelines to keep in mind: + +- Accuracy is proportional to `q(1-q)`. This means that extreme percentiles (e.g. 99%) +are more accurate than less extreme percentiles, such as the median +- For small sets of values, percentiles are highly accurate (and potentially +100% accurate if the data is small enough). +- As the quantity of values in a bucket grows, the algorithm begins to approximate +the percentiles. It is effectively trading accuracy for memory savings. The +exact level of inaccuracy is difficult to generalize, since it depends on your +data distribution and volume of data being aggregated + +==== Compression + +Approximate algorithms must balance memory utilization with estimation accuracy. +This balance can be controlled using a `compression` parameter: + +[source,js] +-------------------------------------------------- +{ + "aggs" : { + "load_time_outlier" : { + "percentiles" : { + "field" : "load_time", + "compression" : 200 <1> + } + } + } +} +-------------------------------------------------- +<1> Compression controls memory usage and approximation error + +The TDigest algorithm uses a number of "nodes" to approximate percentiles -- the +more nodes available, the higher the accuracy (and large memory footprint) proportional +to the volume of data. The `compression` parameter limits the maximum number of +nodes to `100 * compression`. + +Therefore, by increasing the compression value, you can increase the accuracy of +your percentiles at the cost of more memory. 
Larger compression values also +make the algorithm slower since the underlying tree data structure grows in size, +resulting in more expensive operations. The default compression value is +`100`. + +A "node" uses roughly 48 bytes of memory, so under worst-case scenarios (large amount +of data which arrives sorted and in-order) the default settings will produce a +TDigest roughly 480KB in size. In practice data tends to be more random and +the TDigest will use less memory. diff --git a/pom.xml b/pom.xml index 8598c5e647c46..691299abce718 100644 --- a/pom.xml +++ b/pom.xml @@ -63,6 +63,12 @@ 4.3.1 test + + org.apache.mahout + mahout-core + 0.9 + test + org.apache.lucene @@ -1195,6 +1201,9 @@ src/main/java/org/elasticsearch/common/lucene/search/XFilteredQuery.java src/main/java/org/apache/lucene/queryparser/XSimpleQueryParser.java src/main/java/org/apache/lucene/**/X*.java + + src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/tdigest/TDigestState.java + src/test/java/org/elasticsearch/search/aggregations/metrics/GroupTree.java diff --git a/src/main/java/org/elasticsearch/common/util/ArrayUtils.java b/src/main/java/org/elasticsearch/common/util/ArrayUtils.java new file mode 100644 index 0000000000000..053c9b547ae55 --- /dev/null +++ b/src/main/java/org/elasticsearch/common/util/ArrayUtils.java @@ -0,0 +1,72 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.common.util; + +import java.util.Arrays; + +/** + * + */ +public class ArrayUtils { + + private ArrayUtils() {} + + /** + * Return the index of value in array, or -1 if there is no such index. + * If there are several values that are within tolerance or less of value, this method will return the + * index of the closest value. In case of several values being as close ot value, there is no guarantee which index + * will be returned. + * Results are undefined if the array is not sorted. 
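+     *
+     * A worked example of the tolerance handling (values chosen purely for illustration):
+     * for a sorted array {1.0, 2.0, 3.0}, binarySearch(array, 2.2, 0.5) returns index 1,
+     * since 2.0 is the closest element and lies within the tolerance, whereas
+     * binarySearch(array, 2.2, 0.1) returns -1 because neither neighbouring element is
+     * within 0.1 of the requested value.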
+ */ + public static int binarySearch(double[] array, double value, double tolerance) { + if (array.length == 0) { + return -1; + } + return binarySearch(array, 0, array.length, value, tolerance); + } + + private static int binarySearch(double[] array, int fromIndex, int toIndex, double value, double tolerance) { + int index = Arrays.binarySearch(array, fromIndex, toIndex, value); + if (index < 0) { + final int highIndex = -1 - index; // first index of a value that is > value + final int lowIndex = highIndex - 1; // last index of a value that is < value + + double lowError = Double.POSITIVE_INFINITY; + double highError = Double.POSITIVE_INFINITY; + if (lowIndex >= 0) { + lowError = value - array[lowIndex]; + } + if (highIndex < array.length) { + highError = array[highIndex] - value; + } + + if (highError < lowError) { + if (highError < tolerance) { + index = highIndex; + } + } else if (lowError < tolerance) { + index = lowIndex; + } else { + index = -1; + } + } + return index; + } +} diff --git a/src/main/java/org/elasticsearch/common/util/BigArrays.java b/src/main/java/org/elasticsearch/common/util/BigArrays.java index ae34ac3b332f4..ca42a54fce923 100644 --- a/src/main/java/org/elasticsearch/common/util/BigArrays.java +++ b/src/main/java/org/elasticsearch/common/util/BigArrays.java @@ -171,6 +171,13 @@ public int increment(long index, int inc) { return array[(int) index] += inc; } + @Override + public void fill(long fromIndex, long toIndex, int value) { + assert indexIsInt(fromIndex); + assert indexIsInt(toIndex); + Arrays.fill(array, (int) fromIndex, (int) toIndex, value); + } + } private static class LongArrayWrapper extends AbstractArrayWrapper implements LongArray { diff --git a/src/main/java/org/elasticsearch/common/util/BigIntArray.java b/src/main/java/org/elasticsearch/common/util/BigIntArray.java index 6ab921212d891..52eb3e4f4e304 100644 --- a/src/main/java/org/elasticsearch/common/util/BigIntArray.java +++ b/src/main/java/org/elasticsearch/common/util/BigIntArray.java @@ -19,6 +19,7 @@ package org.elasticsearch.common.util; +import com.google.common.base.Preconditions; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.RamUsageEstimator; import org.elasticsearch.cache.recycler.PageCacheRecycler; @@ -69,6 +70,22 @@ public int increment(long index, int inc) { return pages[pageIndex][indexInPage] += inc; } + @Override + public void fill(long fromIndex, long toIndex, int value) { + Preconditions.checkArgument(fromIndex <= toIndex); + final int fromPage = pageIndex(fromIndex); + final int toPage = pageIndex(toIndex - 1); + if (fromPage == toPage) { + Arrays.fill(pages[fromPage], indexInPage(fromIndex), indexInPage(toIndex - 1) + 1, value); + } else { + Arrays.fill(pages[fromPage], indexInPage(fromIndex), pages[fromPage].length, value); + for (int i = fromPage + 1; i < toPage; ++i) { + Arrays.fill(pages[i], value); + } + Arrays.fill(pages[toPage], 0, indexInPage(toIndex - 1) + 1, value); + } + } + @Override protected int numBytesPerElement() { return RamUsageEstimator.NUM_BYTES_INT; diff --git a/src/main/java/org/elasticsearch/common/util/BigObjectArray.java b/src/main/java/org/elasticsearch/common/util/BigObjectArray.java index 960687c7ce804..9be9963a4d765 100644 --- a/src/main/java/org/elasticsearch/common/util/BigObjectArray.java +++ b/src/main/java/org/elasticsearch/common/util/BigObjectArray.java @@ -85,4 +85,4 @@ public void resize(long newSize) { this.size = newSize; } -} +} \ No newline at end of file diff --git 
a/src/main/java/org/elasticsearch/common/util/IntArray.java b/src/main/java/org/elasticsearch/common/util/IntArray.java index 1ee973beda58f..096b81be49807 100644 --- a/src/main/java/org/elasticsearch/common/util/IntArray.java +++ b/src/main/java/org/elasticsearch/common/util/IntArray.java @@ -39,4 +39,9 @@ public interface IntArray extends BigArray { */ public abstract int increment(long index, int inc); + /** + * Fill slots between fromIndex inclusive to toIndex exclusive with value. + */ + public abstract void fill(long fromIndex, long toIndex, int value); + } diff --git a/src/main/java/org/elasticsearch/common/util/ObjectArray.java b/src/main/java/org/elasticsearch/common/util/ObjectArray.java index c105b729193a9..bd605ae43dd07 100644 --- a/src/main/java/org/elasticsearch/common/util/ObjectArray.java +++ b/src/main/java/org/elasticsearch/common/util/ObjectArray.java @@ -27,11 +27,11 @@ public interface ObjectArray extends BigArray { /** * Get an element given its index. */ - public abstract T get(long index); + T get(long index); /** * Set a value at the given index and return the previous value. */ - public abstract T set(long index, T value); + T set(long index, T value); } diff --git a/src/main/java/org/elasticsearch/search/aggregations/AggregationBuilders.java b/src/main/java/org/elasticsearch/search/aggregations/AggregationBuilders.java index 743d37d2c8d34..be0ad1eaa4861 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/AggregationBuilders.java +++ b/src/main/java/org/elasticsearch/search/aggregations/AggregationBuilders.java @@ -33,6 +33,7 @@ import org.elasticsearch.search.aggregations.metrics.avg.AvgBuilder; import org.elasticsearch.search.aggregations.metrics.max.MaxBuilder; import org.elasticsearch.search.aggregations.metrics.min.MinBuilder; +import org.elasticsearch.search.aggregations.metrics.percentiles.PercentilesBuilder; import org.elasticsearch.search.aggregations.metrics.stats.StatsBuilder; import org.elasticsearch.search.aggregations.metrics.stats.extended.ExtendedStatsBuilder; import org.elasticsearch.search.aggregations.metrics.sum.SumBuilder; @@ -121,4 +122,8 @@ public static IPv4RangeBuilder ipRange(String name) { public static TermsBuilder terms(String name) { return new TermsBuilder(name); } + + public static PercentilesBuilder percentiles(String name) { + return new PercentilesBuilder(name); + } } diff --git a/src/main/java/org/elasticsearch/search/aggregations/AggregationModule.java b/src/main/java/org/elasticsearch/search/aggregations/AggregationModule.java index 25e114eb5c821..3475f3de9a1c4 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/AggregationModule.java +++ b/src/main/java/org/elasticsearch/search/aggregations/AggregationModule.java @@ -36,6 +36,7 @@ import org.elasticsearch.search.aggregations.metrics.avg.AvgParser; import org.elasticsearch.search.aggregations.metrics.max.MaxParser; import org.elasticsearch.search.aggregations.metrics.min.MinParser; +import org.elasticsearch.search.aggregations.metrics.percentiles.PercentilesParser; import org.elasticsearch.search.aggregations.metrics.stats.StatsParser; import org.elasticsearch.search.aggregations.metrics.stats.extended.ExtendedStatsParser; import org.elasticsearch.search.aggregations.metrics.sum.SumParser; @@ -58,6 +59,7 @@ public AggregationModule() { parsers.add(StatsParser.class); parsers.add(ExtendedStatsParser.class); parsers.add(ValueCountParser.class); + parsers.add(PercentilesParser.class); parsers.add(GlobalParser.class); parsers.add(MissingParser.class); diff 
--git a/src/main/java/org/elasticsearch/search/aggregations/TransportAggregationModule.java b/src/main/java/org/elasticsearch/search/aggregations/TransportAggregationModule.java index a2a5b253e42e7..b97145f357ddd 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/TransportAggregationModule.java +++ b/src/main/java/org/elasticsearch/search/aggregations/TransportAggregationModule.java @@ -37,6 +37,7 @@ import org.elasticsearch.search.aggregations.metrics.avg.InternalAvg; import org.elasticsearch.search.aggregations.metrics.max.InternalMax; import org.elasticsearch.search.aggregations.metrics.min.InternalMin; +import org.elasticsearch.search.aggregations.metrics.percentiles.InternalPercentiles; import org.elasticsearch.search.aggregations.metrics.stats.InternalStats; import org.elasticsearch.search.aggregations.metrics.stats.extended.InternalExtendedStats; import org.elasticsearch.search.aggregations.metrics.sum.InternalSum; @@ -58,6 +59,7 @@ protected void configure() { InternalStats.registerStreams(); InternalExtendedStats.registerStreams(); InternalValueCount.registerStreams(); + InternalPercentiles.registerStreams(); // buckets InternalGlobal.registerStreams(); diff --git a/src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/InternalPercentiles.java b/src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/InternalPercentiles.java new file mode 100644 index 0000000000000..ae4e96a703ee6 --- /dev/null +++ b/src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/InternalPercentiles.java @@ -0,0 +1,190 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.elasticsearch.search.aggregations.metrics.percentiles; + +import com.google.common.collect.UnmodifiableIterator; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.search.aggregations.AggregationStreams; +import org.elasticsearch.search.aggregations.InternalAggregation; +import org.elasticsearch.search.aggregations.metrics.MetricsAggregation; +import org.elasticsearch.search.aggregations.support.numeric.ValueFormatterStreams; + +import java.io.IOException; +import java.util.Iterator; +import java.util.List; + +/** +* +*/ +public class InternalPercentiles extends MetricsAggregation.MultiValue implements Percentiles { + + public final static Type TYPE = new Type("percentiles"); + + public final static AggregationStreams.Stream STREAM = new AggregationStreams.Stream() { + @Override + public InternalPercentiles readResult(StreamInput in) throws IOException { + InternalPercentiles result = new InternalPercentiles(); + result.readFrom(in); + return result; + } + }; + + public static void registerStreams() { + AggregationStreams.registerStream(STREAM, TYPE.stream()); + } + + private PercentilesEstimator.Result result; + private boolean keyed; + + InternalPercentiles() {} // for serialization + + public InternalPercentiles(String name, PercentilesEstimator.Result result, boolean keyed) { + super(name); + this.result = result; + this.keyed = keyed; + } + + @Override + public double value(String name) { + return result.estimate(Double.valueOf(name)); + } + + @Override + public Type type() { + return TYPE; + } + + @Override + public double percentile(double percent) { + return result.estimate(percent); + } + + @Override + public Iterator iterator() { + return new Iter(result); + } + + @Override + public InternalPercentiles reduce(ReduceContext reduceContext) { + List aggregations = reduceContext.aggregations(); + InternalPercentiles first = (InternalPercentiles) aggregations.get(0); + if (aggregations.size() == 1) { + return first; + } + PercentilesEstimator.Result.Merger merger = first.result.merger(aggregations.size()); + for (InternalAggregation aggregation : aggregations) { + merger.add(((InternalPercentiles) aggregation).result); + } + first.result = merger.merge(); + return first; + } + + @Override + public void readFrom(StreamInput in) throws IOException { + name = in.readString(); + valueFormatter = ValueFormatterStreams.readOptional(in); + result = PercentilesEstimator.Streams.read(in); + keyed = in.readBoolean(); + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeString(name); + ValueFormatterStreams.writeOptional(valueFormatter, out); + PercentilesEstimator.Streams.write(result, out); + out.writeBoolean(keyed); + } + + @Override + public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + double[] percents = result.percents; + if (keyed) { + builder.startObject(name); + for(int i = 0; i < percents.length; ++i) { + String key = String.valueOf(percents[i]); + double value = result.estimate(i); + builder.field(key, value); + if (valueFormatter != null) { + builder.field(key + "_as_string", valueFormatter.format(value)); + } + } + builder.endObject(); + } else { + builder.startArray(name); + for (int i = 0; i < percents.length; i++) { + double value = result.estimate(i); + builder.startObject(); + builder.field(CommonFields.KEY, percents[i]); + 
builder.field(CommonFields.VALUE, value); + if (valueFormatter != null) { + builder.field(CommonFields.VALUE_AS_STRING, valueFormatter.format(value)); + } + builder.endObject(); + } + builder.endArray(); + } + return builder; + } + + public static class Iter extends UnmodifiableIterator { + + private final PercentilesEstimator.Result result; + private int i; + + public Iter(PercentilesEstimator.Result estimator) { + this.result = estimator; + i = 0; + } + + @Override + public boolean hasNext() { + return i < result.percents.length; + } + + @Override + public Percentiles.Percentile next() { + final Percentiles.Percentile next = new InnerPercentile(result.percents[i], result.estimate(i)); + ++i; + return next; + } + } + + private static class InnerPercentile implements Percentiles.Percentile { + + private final double percent; + private final double value; + + private InnerPercentile(double percent, double value) { + this.percent = percent; + this.value = value; + } + + @Override + public double getPercent() { + return percent; + } + + @Override + public double getValue() { + return value; + } + } +} diff --git a/src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/Percentiles.java b/src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/Percentiles.java new file mode 100644 index 0000000000000..ec70e3c19a62e --- /dev/null +++ b/src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/Percentiles.java @@ -0,0 +1,82 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.elasticsearch.search.aggregations.metrics.percentiles; + +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.search.aggregations.Aggregation; + +import java.io.IOException; + +/** + * + */ +public interface Percentiles extends Aggregation, Iterable { + + public static abstract class Estimator { + + public static TDigest tDigest() { + return new TDigest(); + } + + private final String type; + + protected Estimator(String type) { + this.type = type; + } + + public static class TDigest extends Estimator { + + protected double compression = -1; + + TDigest() { + super("tdigest"); + } + + public TDigest compression(double compression) { + this.compression = compression; + return this; + } + + @Override + void paramsToXContent(XContentBuilder builder) throws IOException { + if (compression > 0) { + builder.field("compression", compression); + } + } + } + + String type() { + return type; + } + + abstract void paramsToXContent(XContentBuilder builder) throws IOException; + + } + + public static interface Percentile { + + double getPercent(); + + double getValue(); + + } + + double percentile(double percent); + +} diff --git a/src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/PercentilesAggregator.java b/src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/PercentilesAggregator.java new file mode 100644 index 0000000000000..2dd034df4ff56 --- /dev/null +++ b/src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/PercentilesAggregator.java @@ -0,0 +1,125 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.elasticsearch.search.aggregations.metrics.percentiles; + +import org.apache.lucene.index.AtomicReaderContext; +import org.elasticsearch.index.fielddata.DoubleValues; +import org.elasticsearch.search.aggregations.Aggregator; +import org.elasticsearch.search.aggregations.InternalAggregation; +import org.elasticsearch.search.aggregations.metrics.MetricsAggregator; +import org.elasticsearch.search.aggregations.support.AggregationContext; +import org.elasticsearch.search.aggregations.support.ValueSourceAggregatorFactory; +import org.elasticsearch.search.aggregations.support.ValuesSourceConfig; +import org.elasticsearch.search.aggregations.support.numeric.NumericValuesSource; + +import java.io.IOException; + +/** + * + */ +public class PercentilesAggregator extends MetricsAggregator.MultiValue { + + private final NumericValuesSource valuesSource; + private DoubleValues values; + + private final PercentilesEstimator estimator; + private final boolean keyed; + + + public PercentilesAggregator(String name, long estimatedBucketsCount, NumericValuesSource valuesSource, AggregationContext context, + Aggregator parent, PercentilesEstimator estimator, boolean keyed) { + super(name, estimatedBucketsCount, context, parent); + this.valuesSource = valuesSource; + this.keyed = keyed; + this.estimator = estimator; + } + + @Override + public boolean shouldCollect() { + return valuesSource != null; + } + + @Override + public void setNextReader(AtomicReaderContext reader) { + values = valuesSource.doubleValues(); + } + + @Override + public void collect(int doc, long owningBucketOrdinal) throws IOException { + final int valueCount = values.setDocument(doc); + for (int i = 0; i < valueCount; i++) { + estimator.offer(values.nextValue(), owningBucketOrdinal); + } + } + + @Override + public boolean hasMetric(String name) { + return PercentilesEstimator.indexOfPercent(estimator.percents, Double.parseDouble(name)) >= 0; + } + + @Override + public double metric(String name, long owningBucketOrd) { + return estimator.result(owningBucketOrd).estimate(Double.parseDouble(name)); + } + + @Override + public InternalAggregation buildAggregation(long owningBucketOrdinal) { + if (valuesSource == null) { + return buildEmptyAggregation(); + } + return new InternalPercentiles(name, estimator.result(owningBucketOrdinal), keyed); + } + + @Override + public InternalAggregation buildEmptyAggregation() { + return new InternalPercentiles(name, estimator.emptyResult(), keyed); + } + + @Override + protected void doRelease() { + estimator.release(); + } + + public static class Factory extends ValueSourceAggregatorFactory.LeafOnly { + + private final PercentilesEstimator.Factory estimatorFactory; + private final double[] percents; + private final boolean keyed; + + public Factory(String name, ValuesSourceConfig valuesSourceConfig, + double[] percents, PercentilesEstimator.Factory estimatorFactory, boolean keyed) { + super(name, InternalPercentiles.TYPE.name(), valuesSourceConfig); + this.estimatorFactory = estimatorFactory; + this.percents = percents; + this.keyed = keyed; + } + + @Override + protected Aggregator createUnmapped(AggregationContext aggregationContext, Aggregator parent) { + return new PercentilesAggregator(name, 0, null, aggregationContext, parent, estimatorFactory.create(percents, 0, aggregationContext), keyed); + } + + @Override + protected Aggregator create(NumericValuesSource valuesSource, long expectedBucketsCount, AggregationContext aggregationContext, Aggregator parent) { + PercentilesEstimator 
estimator = estimatorFactory.create(percents, expectedBucketsCount, aggregationContext); + return new PercentilesAggregator(name, expectedBucketsCount, valuesSource, aggregationContext, parent, estimator, keyed); + } + } + +} diff --git a/src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/PercentilesBuilder.java b/src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/PercentilesBuilder.java new file mode 100644 index 0000000000000..f066f206bd92e --- /dev/null +++ b/src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/PercentilesBuilder.java @@ -0,0 +1,67 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.search.aggregations.metrics.percentiles; + +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.search.aggregations.metrics.ValuesSourceMetricsAggregationBuilder; + +import java.io.IOException; + +/** + * + */ +public class PercentilesBuilder extends ValuesSourceMetricsAggregationBuilder { + + private double[] percentiles; + private Percentiles.Estimator estimator; + + public PercentilesBuilder(String name) { + super(name, InternalPercentiles.TYPE.name()); + } + + public PercentilesBuilder percentiles(double... percentiles) { + for (int i = 0; i < percentiles.length; i++) { + if (percentiles[i] < 0 || percentiles[i] > 100) { + throw new IllegalArgumentException("the percents in the percentiles aggregation [" + + name + "] must be in the [0, 100] range"); + } + } + this.percentiles = percentiles; + return this; + } + + public PercentilesBuilder estimator(Percentiles.Estimator estimator) { + this.estimator = estimator; + return this; + } + + @Override + protected void internalXContent(XContentBuilder builder, Params params) throws IOException { + super.internalXContent(builder, params); + + if (percentiles != null) { + builder.field("percents", percentiles); + } + + if (estimator != null) { + builder.field("estimator", estimator.type()); + estimator.paramsToXContent(builder); + } + } +} diff --git a/src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/PercentilesEstimator.java b/src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/PercentilesEstimator.java new file mode 100644 index 0000000000000..b9cb431976283 --- /dev/null +++ b/src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/PercentilesEstimator.java @@ -0,0 +1,123 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. 
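For reference, a minimal sketch of how the new PercentilesBuilder is meant to be wired into a search request from the Java API. The index name ("logs"), the field name ("load_time") and the client variable are placeholders, not taken from this patch; the rest only uses methods this change introduces (AggregationBuilders.percentiles, PercentilesBuilder.percentiles/estimator, Percentiles.percentile and the Percentile iterator):

    import org.elasticsearch.action.search.SearchResponse;
    import org.elasticsearch.search.aggregations.AggregationBuilders;
    import org.elasticsearch.search.aggregations.metrics.percentiles.Percentiles;

    // given an org.elasticsearch.client.Client named client (placeholder)
    SearchResponse response = client.prepareSearch("logs")
            .addAggregation(AggregationBuilders.percentiles("load_times")
                    .field("load_time")
                    .percentiles(50, 95, 99)
                    .estimator(Percentiles.Estimator.tDigest().compression(200)))
            .execute().actionGet();

    Percentiles percentiles = response.getAggregations().get("load_times");
    double p99 = percentiles.percentile(99);
    for (Percentiles.Percentile percentile : percentiles) {
        System.out.println(percentile.getPercent() + " -> " + percentile.getValue());
    }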
Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.search.aggregations.metrics.percentiles; + +import org.elasticsearch.ElasticsearchIllegalArgumentException; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.io.stream.Streamable; +import org.elasticsearch.common.lease.Releasable; +import org.elasticsearch.common.util.ArrayUtils; +import org.elasticsearch.search.aggregations.metrics.percentiles.tdigest.TDigest; +import org.elasticsearch.search.aggregations.support.AggregationContext; + +import java.io.IOException; +import java.util.Arrays; + +/** +* +*/ +public abstract class PercentilesEstimator implements Releasable { + + protected double[] percents; + + public PercentilesEstimator(double[] percents) { + this.percents = percents; + } + + /** + * @return list of percentile intervals + */ + public double[] percents() { + return percents; + } + + /** + * Offer a new value to the streaming percentile algo. May modify the current + * estimate + * + * @param value Value to stream + */ + public abstract void offer(double value, long bucketOrd); + + public abstract Result result(long bucketOrd); + + public abstract Result emptyResult(); + + static int indexOfPercent(double[] percents, double percent) { + return ArrayUtils.binarySearch(percents, percent, 0.001); + } + + /** + * Responsible for merging multiple estimators into a single one. 
+ */ + public abstract static class Result implements Streamable { + + protected double[] percents; + + protected Result() {} // for serialization + + protected Result(double[] percents) { + this.percents = percents; + } + + protected abstract byte id(); + + public double estimate(double percent) { + int i = indexOfPercent(percents, percent); + assert i >= 0; + return estimate(i); + } + + public abstract double estimate(int index); + + public abstract Merger merger(int estimatedMerges); + + public static interface Merger { + + public abstract void add(F result); + + public abstract Result merge(); + } + } + + public static interface Factory { + + public abstract E create(double[] percents, long estimatedBucketCount, AggregationContext context); + + } + + static class Streams { + + static Result read(StreamInput in) throws IOException { + switch (in.readByte()) { + case TDigest.ID: return TDigest.Result.read(in); + default: + throw new ElasticsearchIllegalArgumentException("Unknown percentile estimator"); + } + } + + static void write(Result estimator, StreamOutput out) throws IOException { + out.writeByte(estimator.id()); + estimator.writeTo(out); + } + + } + +} diff --git a/src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/PercentilesParser.java b/src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/PercentilesParser.java new file mode 100644 index 0000000000000..36f61dadad58e --- /dev/null +++ b/src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/PercentilesParser.java @@ -0,0 +1,177 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.elasticsearch.search.aggregations.metrics.percentiles; + +import com.carrotsearch.hppc.DoubleArrayList; +import org.elasticsearch.common.xcontent.XContentParser; +import org.elasticsearch.index.fielddata.IndexFieldData; +import org.elasticsearch.index.mapper.FieldMapper; +import org.elasticsearch.search.SearchParseException; +import org.elasticsearch.search.aggregations.Aggregator; +import org.elasticsearch.search.aggregations.AggregatorFactory; +import org.elasticsearch.search.aggregations.metrics.percentiles.tdigest.TDigest; +import org.elasticsearch.search.aggregations.support.FieldContext; +import org.elasticsearch.search.aggregations.support.ValuesSourceConfig; +import org.elasticsearch.search.aggregations.support.numeric.NumericValuesSource; +import org.elasticsearch.search.internal.SearchContext; + +import java.io.IOException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; + +/** + * + */ +public class PercentilesParser implements Aggregator.Parser { + + private final static double[] DEFAULT_PERCENTS = new double[] { 1, 5, 25, 50, 75, 95, 99 }; + + @Override + public String type() { + return InternalPercentiles.TYPE.name(); + } + + /** + * We must override the parse method because we need to allow custom parameters + * (execution_hint, etc) which is not possible otherwise + */ + @Override + public AggregatorFactory parse(String aggregationName, XContentParser parser, SearchContext context) throws IOException { + + ValuesSourceConfig config = new ValuesSourceConfig(NumericValuesSource.class); + + String field = null; + String script = null; + String scriptLang = null; + double[] percents = DEFAULT_PERCENTS; + Map scriptParams = null; + boolean assumeSorted = false; + boolean keyed = true; + Map settings = null; + + XContentParser.Token token; + String currentFieldName = null; + while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) { + if (token == XContentParser.Token.FIELD_NAME) { + currentFieldName = parser.currentName(); + } else if (token == XContentParser.Token.VALUE_STRING) { + if ("field".equals(currentFieldName)) { + field = parser.text(); + } else if ("script".equals(currentFieldName)) { + script = parser.text(); + } else if ("lang".equals(currentFieldName)) { + scriptLang = parser.text(); + } else { + if (settings == null) { + settings = new HashMap(); + } + settings.put(currentFieldName, parser.text()); + } + } else if (token == XContentParser.Token.START_ARRAY) { + if ("percents".equals(currentFieldName)) { + DoubleArrayList values = new DoubleArrayList(10); + while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) { + double percent = parser.doubleValue(); + if (percent < 0 || percent > 100) { + throw new SearchParseException(context, "the percents in the percentiles aggregation [" + + aggregationName + "] must be in the [0, 100] range"); + } + values.add(percent); + } + percents = values.toArray(); + // Some impls rely on the fact that percents are sorted + Arrays.sort(percents); + } else { + throw new SearchParseException(context, "Unknown key for a " + token + " in [" + aggregationName + "]: [" + currentFieldName + "]."); + } + } else if (token == XContentParser.Token.START_OBJECT) { + if ("params".equals(currentFieldName)) { + scriptParams = parser.map(); + } else { + throw new SearchParseException(context, "Unknown key for a " + token + " in [" + aggregationName + "]: [" + currentFieldName + "]."); + } + } else if (token == XContentParser.Token.VALUE_BOOLEAN) { + if 
("script_values_sorted".equals(currentFieldName) || "scriptValuesSorted".equals(currentFieldName)) { + assumeSorted = parser.booleanValue(); + } if ("keyed".equals(currentFieldName)) { + keyed = parser.booleanValue(); + } else { + if (settings == null) { + settings = new HashMap(); + } + settings.put(currentFieldName, parser.booleanValue()); + } + } else if (token == XContentParser.Token.VALUE_NUMBER) { + if (settings == null) { + settings = new HashMap(); + } + settings.put(currentFieldName, parser.numberValue()); + } else { + throw new SearchParseException(context, "Unexpected token " + token + " in [" + aggregationName + "]."); + } + } + + PercentilesEstimator.Factory estimatorFactory = EstimatorType.TDIGEST.estimatorFactory(settings); + + if (script != null) { + config.script(context.scriptService().search(context.lookup(), scriptLang, script, scriptParams)); + } + + if (!assumeSorted) { + config.ensureSorted(true); + } + + if (field == null) { + return new PercentilesAggregator.Factory(aggregationName, config, percents, estimatorFactory, keyed); + } + + FieldMapper mapper = context.smartNameFieldMapper(field); + if (mapper == null) { + config.unmapped(true); + return new PercentilesAggregator.Factory(aggregationName, config, percents, estimatorFactory, keyed); + } + + IndexFieldData indexFieldData = context.fieldData().getForField(mapper); + config.fieldContext(new FieldContext(field, indexFieldData)); + return new PercentilesAggregator.Factory(aggregationName, config, percents, estimatorFactory, keyed); + } + + /** + * + */ + public static enum EstimatorType { + TDIGEST() { + @Override + public PercentilesEstimator.Factory estimatorFactory(Map settings) { + return new TDigest.Factory(settings); + } + }; + + public abstract PercentilesEstimator.Factory estimatorFactory(Map settings); + + public static EstimatorType resolve(String name, SearchContext context) { + if (name.equals("tdigest")) { + return TDIGEST; + } + throw new SearchParseException(context, "Unknown percentile estimator [" + name + "]"); + } + + } +} diff --git a/src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/tdigest/GroupRedBlackTree.java b/src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/tdigest/GroupRedBlackTree.java new file mode 100644 index 0000000000000..00829152408cf --- /dev/null +++ b/src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/tdigest/GroupRedBlackTree.java @@ -0,0 +1,259 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.search.aggregations.metrics.percentiles.tdigest; + +import com.google.common.primitives.Longs; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.RamUsageEstimator; + +import java.util.Arrays; + +/** Specialized {@link RedBlackTree} where each node stores aggregated data for its sub-trees. */ +public class GroupRedBlackTree extends RedBlackTree { + + private long nextId; + private double[] centroids; + private long[] counts; + private int[] aggregatedSizes; + private long[] aggregatedCounts; + private long[] ids; + + // used for comparisons + double tmpCentroid; + long tmpCount; + long tmpId; + + public GroupRedBlackTree(int capacity) { + super(capacity); + nextId = 1; + centroids = new double[1+capacity]; + aggregatedSizes = new int[1+capacity]; + counts = new long[1+capacity]; + aggregatedCounts = new long[1+capacity]; + ids = new long[1+capacity]; + } + + // Internal node management + + @Override + protected int compare(int node) { + final double centroid = mean(node); + int cmp = Double.compare(tmpCentroid, centroid); + if (cmp == 0) { + cmp = Longs.compare(tmpId, ids[node]); + } + return cmp; + } + + @Override + protected void copy(int node) { + centroids[node] = tmpCentroid; + counts[node] = tmpCount; + ids[node] = tmpId; + } + + @Override + protected void merge(int node) { + throw new IllegalStateException("compare should never return 0"); + } + + @Override + protected void swap(int node1, int node2) { + super.swap(node1, node2); + for (int n = node1; n != NIL; n = parent(n)) { + fixAggregates(n); + } + } + + @Override + protected int newNode() { + final int newNode = super.newNode(); + if (newNode >= centroids.length) { + final int minSize = ArrayUtil.oversize(newNode + 1, RamUsageEstimator.NUM_BYTES_INT); + centroids = Arrays.copyOf(centroids, minSize); + counts = Arrays.copyOf(counts, minSize); + aggregatedSizes = Arrays.copyOf(aggregatedSizes, minSize); + aggregatedCounts = Arrays.copyOf(aggregatedCounts, minSize); + ids = Arrays.copyOf(ids, minSize); + } + return newNode; + } + + private void fixAggregates(int node) { + final int left = left(node), right = right(node); + aggregatedCounts[node] = counts[node] + aggregatedCounts[left] + aggregatedCounts[right]; + aggregatedSizes[node] = 1 + aggregatedSizes[left] + aggregatedSizes[right]; + } + + private void fixCounts(int node) { + final int left = left(node), right = right(node); + aggregatedCounts[node] = counts[node] + aggregatedCounts[left] + aggregatedCounts[right]; + } + + @Override + protected void rotateLeft(int node) { + super.rotateLeft(node); + fixAggregates(node); + fixAggregates(parent(node)); + } + + @Override + protected void rotateRight(int node) { + super.rotateRight(node); + fixAggregates(node); + fixAggregates(parent(node)); + } + + @Override + protected void beforeRemoval(int node) { + final long count = count(node); + for (int n = node; n != NIL; n = parent(n)) { + aggregatedCounts[n] -= count; + aggregatedSizes[n]--; + } + super.beforeRemoval(node); + } + + @Override + protected void afterInsertion(int node) { + final long count = count(node); + aggregatedCounts[node] = count; + aggregatedSizes[node] = 1; + for (int n = node, p = parent(node); p != NIL; n = p, p = parent(n)) { + aggregatedCounts[p] += count; + aggregatedSizes[p] += 1; + } + super.afterInsertion(node); + } + + // Public API + + public double mean(int node) { + return centroids[node]; + } + + public long count(int node) { + return counts[node]; + } + + public long id(int node) { + return 
ids[node]; + } + + public void addGroup(double centroid, long count, long id) { + tmpCentroid = centroid; + tmpCount = count; + tmpId = id; + if (id >= nextId) { + nextId = id + 1; + } + addNode(); + } + + public void addGroup(double centroid, long count) { + addGroup(centroid, count, nextId++); + } + + public boolean removeGroup(double centroid, int id) { + tmpCentroid = centroid; + tmpId = id; + final int nodeToRemove = getNode(); + if (nodeToRemove != NIL) { + removeNode(nodeToRemove); + return true; + } else { + return false; + } + } + + public void updateGroup(int node, double centroid, long count) { + tmpCentroid = centroid; + tmpId = id(node); + tmpCount = count; + final int prev = prevNode(node); + final int next = nextNode(node); + if ((prev == NIL || compare(prev) > 0) && (next == NIL || compare(next) < 0)) { + // we can update in place + copy(node); + for (int n = node; n != NIL; n = parent(n)) { + fixCounts(n); + } + } else { + removeNode(node); + addNode(); + } + } + + /** Return the last node whose centroid is strictly smaller than centroid. */ + public int floorNode(double centroid) { + int floor = NIL; + for (int node = root(); node != NIL; ) { + final int cmp = Double.compare(centroid, mean(node)); + if (cmp <= 0) { + node = left(node); + } else { + floor = node; + node = right(node); + } + } + return floor; + } + + /** Return the first node that is greater than or equal to centroid. */ + public int ceilingNode(double centroid) { + int ceiling = NIL; + for (int node = root(); node != NIL; ) { + final int cmp = Double.compare(mean(node), centroid); + if (cmp < 0) { + node = right(node); + } else { + ceiling = node; + node = left(node); + } + } + return ceiling; + } + + /** Compute the number of elements and sum of counts for every entry that is strictly before node. */ + public void headSum(int node, SizeAndSum sizeAndSum) { + if (node == NIL) { + sizeAndSum.size = 0; + sizeAndSum.sum = 0; + return; + } + final int left = left(node); + sizeAndSum.size = aggregatedSizes[left]; + sizeAndSum.sum = aggregatedCounts[left]; + for (int n = node, p = parent(node); p != NIL; n = p, p = parent(n)) { + if (n == right(p)) { + final int leftP = left(p); + sizeAndSum.size += 1 + aggregatedSizes[leftP]; + sizeAndSum.sum += counts[p] + aggregatedCounts[leftP]; + } + } + } + + /** Wrapper around a size and a sum. */ + public static class SizeAndSum { + public int size; + public long sum; + } + +} diff --git a/src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/tdigest/RedBlackTree.java b/src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/tdigest/RedBlackTree.java new file mode 100644 index 0000000000000..1d7035095d25d --- /dev/null +++ b/src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/tdigest/RedBlackTree.java @@ -0,0 +1,737 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.search.aggregations.metrics.percentiles.tdigest; + +import com.carrotsearch.hppc.IntArrayDeque; +import com.carrotsearch.hppc.cursors.IntCursor; +import com.google.common.collect.UnmodifiableIterator; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.OpenBitSet; +import org.apache.lucene.util.RamUsageEstimator; + +import java.util.Arrays; +import java.util.Iterator; +import java.util.NoSuchElementException; + +/** + * A red-black tree that identifies every node with a unique dense integer ID to make it easy to store node-specific data into + * parallel arrays. This implementation doesn't support more than 2B nodes. + */ +public abstract class RedBlackTree implements Iterable { + + protected static final int NIL = 0; + private static final boolean BLACK = false, RED = true; + + private int size; + private int maxNode; + private int root; + private final IntArrayDeque unusedSlots; + private int[] leftNodes, rightNodes, parentNodes; + private final OpenBitSet colors; + + /** Create a new instance able to store capacity without resizing. */ + protected RedBlackTree(int capacity) { + size = 0; + maxNode = 1; + root = NIL; + leftNodes = new int[1+capacity]; + rightNodes = new int[1+capacity]; + parentNodes = new int[1+capacity]; + colors = new OpenBitSet(1+capacity); + unusedSlots = new IntArrayDeque(); + } + + /** Return the identifier of the root of the tree. */ + protected final int root() { + assert size > 0 || root == NIL; + return root; + } + + /** Release a node */ + protected void releaseNode(int node) { + unusedSlots.addLast(node); + parent(node, NIL); + left(node, NIL); + right(node, NIL); + } + + /** Create a new node in this tree. */ + protected int newNode() { + if (unusedSlots.isEmpty()) { + final int slot = maxNode++; + if (maxNode > leftNodes.length) { + final int minSize = ArrayUtil.oversize(maxNode, RamUsageEstimator.NUM_BYTES_INT); + leftNodes = Arrays.copyOf(leftNodes, minSize); + rightNodes = Arrays.copyOf(rightNodes, minSize); + parentNodes = Arrays.copyOf(parentNodes, minSize); + colors.ensureCapacity(leftNodes.length); + } + return slot; + } else { + return unusedSlots.removeLast(); + } + } + + /** Return the number of nodes in this tree. */ + public int size() { + assert size == maxNode - unusedSlots.size() - 1 : size + " != " + (maxNode - unusedSlots.size() - 1); + return size; + } + + private boolean color(int node) { + return colors.get(node); + } + + private void color(int node, boolean color) { + assert node != NIL; + if (color) { + colors.fastSet(node); + } else { + colors.fastClear(node); + } + } + + /** Return the parent of the given node. */ + protected final int parent(int node) { + return parentNodes[node]; + } + + private void parent(int node, int parent) { + assert node != NIL; + parentNodes[node] = parent; + } + + /** Return the left child of the given node. */ + protected final int left(int node) { + assert node != NIL; + return leftNodes[node]; + } + + protected void left(int node, int leftNode) { + assert node != NIL; + leftNodes[node] = leftNode; + } + + /** Return the right child of the given node. 
*/ + protected final int right(int node) { + assert node != NIL; + return rightNodes[node]; + } + + private void right(int node, int rightNode) { + assert node != NIL; + rightNodes[node] = rightNode; + } + + // return the number of black nodes to go through up to leaves + // to use within assertions only + private int numBlackNode(int node) { + assert assertConsistent(node); + if (node == NIL) { + return 1; + } else { + final int numLeft = numBlackNode(left(node)); + final int numRight = numBlackNode(right(node)); + assert numLeft == numRight : numLeft + " " + numRight; + int num = numLeft; + if (color(node) == BLACK) { + ++num; + } + return num; + } + } + + // check consistency of parent/left/right + private boolean assertConsistent(int node) { + if (node == NIL) { + return true; + } + final int parent = parent(node); + if (parent == NIL) { + assert node == root; + } else { + assert node == left(parent) || node == right(parent); + } + final int left = left(node); + if (left != NIL) { + assert parent(left) == node; + } + final int right = right(node); + if (right != NIL) { + assert parent(right) == node; + } + return true; + } + + // for testing + public void assertConsistent() { + numBlackNode(root); + } + + /** Rotate left the subtree under n */ + protected void rotateLeft(int n) { + final int r = right(n); + final int lr = left(r); + right(n, left(r)); + if (lr != NIL) { + parent(lr, n); + } + final int p = parent(n); + parent(r, p); + if (p == NIL) { + root = r; + } else if (left(p) == n) { + left(p, r); + } else { + assert right(p) == n; + right(p, r); + } + left(r, n); + parent(n, r); + } + + /** Rotate right the subtree under n */ + protected void rotateRight(int n) { + final int l = left(n); + final int rl = right(l); + left(n, rl); + if (rl != NIL) { + parent(rl, n); + } + final int p = parent(n); + parent(l, p); + if (p == NIL) { + root = l; + } else if (right(p) == n) { + right(p, l); + } else { + assert left(p) == n; + left(p, l); + } + right(l, n); + parent(n, l); + } + + /** Called after insertions. 
Base implementation just balances the tree, + * see http://en.wikipedia.org/wiki/Red%E2%80%93black_tree#Insertion */ + protected void afterInsertion(int node) { + color(node, RED); + insertCase1(node); + } + + private void insertCase1(int node) { + final int parent = parent(node); + if (parent == NIL) { + assert node == root; + color(node, BLACK); + } else { + insertCase2(node, parent); + } + } + + private void insertCase2(int node, int parent) { + if (color(parent) != BLACK) { + insertCase3(node, parent); + } + } + + private void insertCase3(int node, int parent) { + final int grandParent = parent(parent); + assert grandParent != NIL; + final int uncle; + if (parent == left(grandParent)) { + uncle = right(grandParent); + } else { + assert parent == right(grandParent); + uncle = left(grandParent); + } + if (uncle != NIL && color(uncle) == RED) { + color(parent, BLACK); + color(uncle, BLACK); + color(grandParent, RED); + insertCase1(grandParent); + } else { + insertCase4(node, parent, grandParent); + } + } + + private void insertCase4(int node, int parent, int grandParent) { + if (node == right(parent) && parent == left(grandParent)) { + rotateLeft(parent); + node = left(node); + } else if (node == left(parent) && parent == right(grandParent)) { + rotateRight(parent); + node = right(node); + } + insertCase5(node); + } + + private void insertCase5(int node) { + final int parent = parent(node); + final int grandParent = parent(parent); + color(parent, BLACK); + color(grandParent, RED); + if (node == left(parent)) { + rotateRight(grandParent); + } else { + assert node == right(parent); + rotateLeft(grandParent); + } + } + + /** Called before the removal of a node. */ + protected void beforeRemoval(int node) {} + + // see http://en.wikipedia.org/wiki/Red%E2%80%93black_tree#Removal + /** Called after removal. 
Base implementation just balances the tree, + * see http://en.wikipedia.org/wiki/Red%E2%80%93black_tree#Removal */ + protected void afterRemoval(int node) { + assert node != NIL; + if (color(node) == BLACK) { + removeCase1(node); + } + } + + private int sibling(int node, int parent) { + final int left = left(parent); + if (node == left) { + return right(parent); + } else { + assert node == right(parent); + return left; + } + } + + private void removeCase1(int node) { + if (parent(node) != NIL) { + removeCase2(node); + } + } + + private void removeCase2(int node) { + final int parent = parent(node); + final int sibling = sibling(node, parent); + + if (color(sibling) == RED) { + color(sibling, BLACK); + color(parent, RED); + if (node == left(parent)) { + rotateLeft(parent); + } else { + assert node == right(parent); + rotateRight(parent); + } + } + removeCase3(node); + } + + private void removeCase3(int node) { + final int parent = parent(node); + final int sibling = sibling(node, parent); + + if (color(parent) == BLACK && sibling != NIL && color(sibling) == BLACK + && color(left(sibling)) == BLACK && color(right(sibling)) == BLACK) { + color(sibling, RED); + removeCase1(parent); + } else { + removeCase4(node, parent, sibling); + } + } + + private void removeCase4(int node, int parent, int sibling) { + if (color(parent) == RED && sibling != NIL && color(sibling) == BLACK + && color(left(sibling)) == BLACK && color(right(sibling)) == BLACK) { + color(sibling, RED); + color(parent, BLACK); + } else { + removeCase5(node, parent, sibling); + } + } + + private void removeCase5(int node, int parent, int sibling) { + if (color(sibling) == BLACK) { + if (node == left(parent) && sibling != NIL && color(left(sibling)) == RED && color(right(sibling)) == BLACK) { + color(sibling, RED); + color(left(sibling), BLACK); + rotateRight(sibling); + } else if (node == right(parent) && sibling != NIL && color(left(sibling)) == BLACK && color(right(sibling)) == RED) { + color(sibling, RED); + color(right(sibling), BLACK); + rotateLeft(sibling); + } + } + removeCase6(node); + } + + private void removeCase6(int node) { + final int parent = parent(node); + final int sibling = sibling(node, parent); + color(sibling, color(parent)); + color(parent, BLACK); + if (node == left(parent)) { + color(right(sibling), BLACK); + rotateLeft(parent); + } else { + assert node == right(parent); + color(left(sibling), BLACK); + rotateRight(parent); + } + } + + /** Compare to node. */ + protected abstract int compare(int node); + + /** Copy data used for comparison into node. */ + protected abstract void copy(int node); + + /** Merge data used for comparison into node. */ + protected abstract void merge(int node); + + /** Add a node to the tree. 
*/ + public boolean addNode() { + int newNode = NIL; + if (size == 0) { + newNode = root = newNode(); + copy(root); + } else { + int parent = NIL, node = root; + int cmp; + do { + cmp = compare(node); + if (cmp < 0) { + parent = node; + node = left(node); + } else if (cmp > 0) { + parent = node; + node = right(node); + } else { + merge(node); + return false; + } + } while (node != NIL); + + newNode = newNode(); + copy(newNode); + if (cmp < 0) { + left(parent, newNode); + } else { + assert cmp > 0; + right(parent, newNode); + } + parent(newNode, parent); + } + ++size; + afterInsertion(newNode); + return true; + } + + public int getNode() { + if (size() == 0) { + return NIL; + } + int node = root; + do { + final int cmp = compare(node); + if (cmp < 0) { + node = left(node); + } else if (cmp > 0) { + node = right(node); + } else { + return node; + } + } while (node != NIL); + + return NIL; + } + + /** Swap two nodes. */ + protected void swap(int node1, int node2) { + final int parent1 = parent(node1); + final int parent2 = parent(node2); + if (parent1 != NIL) { + if (node1 == left(parent1)) { + left(parent1, node2); + } else { + assert node1 == right(parent1); + right(parent1, node2); + } + } else { + assert root == node1; + root = node2; + } + if (parent2 != NIL) { + if (node2 == left(parent2)) { + left(parent2, node1); + } else { + assert node2 == right(parent2); + right(parent2, node1); + } + } else { + assert root == node2; + root = node1; + } + parent(node1, parent2); + parent(node2, parent1); + final int left1 = left(node1); + final int left2 = left(node2); + left(node1, left2); + if (left2 != NIL) { + parent(left2, node1); + } + left(node2, left1); + if (left1 != NIL) { + parent(left1, node2); + } + final int right1 = right(node1); + final int right2 = right(node2); + right(node1, right2); + if (right2 != NIL) { + parent(right2, node1); + } + right(node2, right1); + if (right1 != NIL) { + parent(right1, node2); + } + final boolean color1 = color(node1); + final boolean color2 = color(node2); + color(node1, color2); + color(node2, color1); + + assertConsistent(node1); + assertConsistent(node2); + } + + /** Remove a node from the tree. */ + public void removeNode(int nodeToRemove) { + assert nodeToRemove != NIL; + if (left(nodeToRemove) != NIL && right(nodeToRemove) != NIL) { + final int next = nextNode(nodeToRemove); + if (next != NIL) { + swap(nodeToRemove, next); + //copy(next, nodeToRemove); + //nodeToRemove = next; + } + } + + assert left(nodeToRemove) == NIL || right(nodeToRemove) == NIL; + + final int left = left(nodeToRemove); + int child = left != NIL ? left : right(nodeToRemove); + + beforeRemoval(nodeToRemove); + + if (child != NIL) { + final int parent = parent(nodeToRemove); + parent(child, parent); + if (parent != NIL) { + if (nodeToRemove == left(parent)) { + left(parent, child); + } else { + assert nodeToRemove == right(parent); + right(parent, child); + } + } else { + assert nodeToRemove == root; + root = child; + } + if (color(nodeToRemove) == BLACK) { + color(child, BLACK); + } else { + afterRemoval(child); + } + } else { + // no children + final int parent = parent(nodeToRemove); + if (parent == NIL) { + assert nodeToRemove == root; + root = NIL; + } else { + afterRemoval(nodeToRemove); + if (nodeToRemove == left(parent)) { + left(parent, NIL); + } else { + assert nodeToRemove == right(parent); + right(parent, NIL); + } + } + } + + releaseNode(nodeToRemove); + --size; + } + + /** Return the least node under node. 
*/ + protected final int first(int node) { + if (node == NIL) { + return NIL; + } + while (true) { + final int left = left(node); + if (left == NIL) { + break; + } + node = left; + } + return node; + } + + /** Return the largest node under node. */ + protected final int last(int node) { + while (true) { + final int right = right(node); + if (right == NIL) { + break; + } + node = right; + } + return node; + } + + /** Return the previous node. */ + public final int prevNode(int node) { + final int left = left(node); + if (left != NIL) { + return last(left); + } else { + int parent = parent(node); + while (parent != NIL && node == left(parent)) { + node = parent; + parent = parent(parent); + } + return parent; + } + } + + /** Return the next node. */ + public final int nextNode(int node) { + final int right = right(node); + if (right != NIL) { + return first(right); + } else { + int parent = parent(node); + while (parent != NIL && node == right(parent)) { + node = parent; + parent = parent(parent); + } + return parent; + } + } + + @Override + public Iterator iterator() { + return iterator(first(root)); + } + + private Iterator iterator(final int startNode) { + return new UnmodifiableIterator() { + + boolean nextSet; + final IntCursor cursor; + + { + cursor = new IntCursor(); + cursor.index = -1; + cursor.value = startNode; + nextSet = cursor.value != NIL; + } + + boolean computeNext() { + if (cursor.value != NIL) { + cursor.value = RedBlackTree.this.nextNode(cursor.value); + } + return nextSet = (cursor.value != NIL); + } + + @Override + public boolean hasNext() { + return nextSet || computeNext(); + } + + @Override + public IntCursor next() { + if (!hasNext()) { + throw new NoSuchElementException(); + } + nextSet = false; + return cursor; + } + }; + } + + public Iterator reverseIterator() { + return reverseIterator(last(root)); + } + + private Iterator reverseIterator(final int startNode) { + return new UnmodifiableIterator() { + + boolean nextSet; + final IntCursor cursor; + + { + cursor = new IntCursor(); + cursor.index = -1; + cursor.value = startNode; + nextSet = cursor.value != NIL; + } + + boolean computeNext() { + if (cursor.value != NIL) { + cursor.value = RedBlackTree.this.prevNode(cursor.value); + } + return nextSet = (cursor.value != NIL); + } + + @Override + public boolean hasNext() { + return nextSet || computeNext(); + } + + @Override + public IntCursor next() { + if (!hasNext()) { + throw new NoSuchElementException(); + } + nextSet = false; + return cursor; + } + }; + } + + /** Return a view over the nodes that are stored after startNode in the tree. */ + public Iterable tailSet(final int startNode) { + return new Iterable() { + @Override + public Iterator iterator() { + return RedBlackTree.this.iterator(startNode); + } + }; + } + + /** Return a reversed view over the elements of this tree. */ + public Iterable reverseSet() { + return new Iterable() { + @Override + public Iterator iterator() { + return RedBlackTree.this.reverseIterator(); + } + }; + } +} diff --git a/src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/tdigest/TDigest.java b/src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/tdigest/TDigest.java new file mode 100644 index 0000000000000..699b6048f9dbe --- /dev/null +++ b/src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/tdigest/TDigest.java @@ -0,0 +1,175 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. 
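GroupRedBlackTree above is the real consumer of this base class; as a smaller, purely hypothetical illustration (not part of the patch) of how the compare()/copy()/merge() contract drives addNode() and getNode(), a plain set of doubles could be built on top of it like this:

    import java.util.Arrays;

    import org.elasticsearch.search.aggregations.metrics.percentiles.tdigest.RedBlackTree;

    /** Toy set of doubles, keeping keys in a parallel array indexed by node id. */
    class DoubleSetTree extends RedBlackTree {

        private double[] keys;
        private double tmp; // pending key, read by compare()/copy()

        DoubleSetTree(int capacity) {
            super(capacity);
            keys = new double[1 + capacity];
        }

        @Override
        protected int newNode() {
            final int node = super.newNode();
            if (node >= keys.length) {
                keys = Arrays.copyOf(keys, Math.max(node + 1, 2 * keys.length));
            }
            return node;
        }

        @Override
        protected int compare(int node) {
            return Double.compare(tmp, keys[node]); // pending key vs. key stored at node
        }

        @Override
        protected void copy(int node) {
            keys[node] = tmp;
        }

        @Override
        protected void merge(int node) {
            // equal key already present: nothing to update for a plain set
        }

        boolean add(double key) {
            tmp = key;
            return addNode(); // false if the key was already there and merge() ran instead
        }

        boolean contains(double key) {
            tmp = key;
            return getNode() != NIL;
        }
    }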
See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.search.aggregations.metrics.percentiles.tdigest; + +import org.elasticsearch.ElasticsearchException; +import org.elasticsearch.ElasticsearchIllegalArgumentException; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.util.BigArrays; +import org.elasticsearch.common.util.ObjectArray; +import org.elasticsearch.search.aggregations.metrics.percentiles.PercentilesEstimator; +import org.elasticsearch.search.aggregations.support.AggregationContext; + +import java.io.IOException; +import java.util.Map; + + +public class TDigest extends PercentilesEstimator { + + public final static byte ID = 0; + + private final BigArrays bigArrays; + private ObjectArray states; + private final double compression; + + public TDigest(double[] percents, double compression, long estimatedBucketsCount, AggregationContext context) { + super(percents); + bigArrays = context.bigArrays(); + states = bigArrays.newObjectArray(estimatedBucketsCount); + this.compression = compression; + } + + @Override + public boolean release() throws ElasticsearchException { + states.release(); + return true; + } + + public void offer(double value, long bucketOrd) { + states = bigArrays.grow(states, bucketOrd + 1); + TDigestState state = states.get(bucketOrd); + if (state == null) { + state = new TDigestState(compression); + states.set(bucketOrd, state); + } + state.add(value); + } + + @Override + public PercentilesEstimator.Result result(long bucketOrd) { + if (bucketOrd >= states.size() || states.get(bucketOrd) == null) { + return emptyResult(); + } + return new Result(percents, states.get(bucketOrd)); + } + + @Override + public PercentilesEstimator.Result emptyResult() { + return new Result(percents, new TDigestState(compression)); + } + + public static class Result extends PercentilesEstimator.Result { + + private TDigestState state; + + public Result() {} // for serialization + + public Result(double[] percents, TDigestState state) { + super(percents); + this.state = state; + } + + @Override + protected byte id() { + return ID; + } + + @Override + public double estimate(int index) { + return state.quantile(percents[index] / 100); + } + + @Override + public Merger merger(int estimatedMerges) { + return new Merger(); + } + + public static Result read(StreamInput in) throws IOException { + Result result = new Result(); + result.readFrom(in); + return result; + } + + @Override + public void readFrom(StreamInput in) throws IOException { + this.percents = new double[in.readInt()]; + for (int i = 0; i < percents.length; i++) { + percents[i] = in.readDouble(); + } + state = TDigestState.read(in); + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeInt(percents.length); + for (int i = 0 
; i < percents.length; ++i) { + out.writeDouble(percents[i]); + } + TDigestState.write(state, out); + } + + private class Merger implements PercentilesEstimator.Result.Merger { + + private Result merged; + + @Override + public void add(Result result) { + if (merged == null || merged.state == null) { + merged = result; + return; + } + if (result.state == null || result.state.size() == 0) { + return; + } + merged.state.add(result.state); + } + + @Override + public Result merge() { + return merged; + } + } + + } + + public static class Factory implements PercentilesEstimator.Factory { + + private final double compression; + + public Factory(Map settings) { + double compression = 100; + if (settings != null) { + Object compressionObject = settings.get("compression"); + if (compressionObject != null) { + if (!(compressionObject instanceof Number)) { + throw new ElasticsearchIllegalArgumentException("tdigest compression must be number, got a " + compressionObject.getClass()); + } + compression = ((Number) compressionObject).doubleValue(); + } + } + this.compression = compression; + } + + public TDigest create(double[] percents, long estimtedBucketCount, AggregationContext context) { + return new TDigest(percents, compression, estimtedBucketCount, context); + } + } + +} \ No newline at end of file diff --git a/src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/tdigest/TDigestState.java b/src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/tdigest/TDigestState.java new file mode 100644 index 0000000000000..cf0bf0f71948f --- /dev/null +++ b/src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/tdigest/TDigestState.java @@ -0,0 +1,353 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +package org.elasticsearch.search.aggregations.metrics.percentiles.tdigest; + +import com.carrotsearch.hppc.cursors.IntCursor; +import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; +import org.elasticsearch.ElasticsearchIllegalArgumentException; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.search.aggregations.metrics.percentiles.tdigest.GroupRedBlackTree.SizeAndSum; + +import java.io.IOException; +import java.util.Iterator; +import java.util.List; +import java.util.Random; + +/** + * Fork of https://github.com/tdunning/t-digest/blob/master/src/main/java/com/tdunning/math/stats/TDigest.java + * Modified for less object allocation, faster estimations and integration with Elasticsearch serialization. 
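For a feel of the data structure in isolation, outside the aggregator plumbing, a small usage sketch; the sample data is invented, and the compression value of 100 simply mirrors the default used by TDigest.Factory:

    import java.util.Random;

    import org.elasticsearch.search.aggregations.metrics.percentiles.tdigest.TDigestState;

    TDigestState digest = new TDigestState(100);
    Random random = new Random(42);
    for (int i = 0; i < 100000; i++) {
        digest.add(random.nextGaussian());
    }
    double median = digest.quantile(0.5);   // quantile() takes a fraction in [0, 1]
    double p99 = digest.quantile(0.99);
    double belowZero = digest.cdf(0);       // fraction of samples <= 0, close to 0.5 for this input
    long sampleCount = digest.size();       // number of samples added, not the number of centroids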
+ */ +public class TDigestState { + + private final Random gen; + private double compression = 100; + private GroupRedBlackTree summary; + private long count = 0; + private final SizeAndSum sizeAndSum = new SizeAndSum(); + + /** + * A histogram structure that will record a sketch of a distribution. + * + * @param compression How should accuracy be traded for size? A value of N here will give quantile errors + * almost always less than 3/N with considerably smaller errors expected for extreme + * quantiles. Conversely, you should expect to track about 5 N centroids for this + * accuracy. + */ + public TDigestState(double compression) { + this.compression = compression; + gen = new Random(); + summary = new GroupRedBlackTree((int) compression); + } + + /** + * Adds a sample to a histogram. + * + * @param x The value to add. + */ + public void add(double x) { + add(x, 1); + } + + /** + * Adds a sample to a histogram. + * + * @param x The value to add. + * @param w The weight of this point. + */ + public void add(double x, long w) { + int startNode = summary.floorNode(x); + if (startNode == RedBlackTree.NIL) { + startNode = summary.ceilingNode(x); + } + + if (startNode == RedBlackTree.NIL) { + summary.addGroup(x, w); + count = w; + } else { + double minDistance = Double.POSITIVE_INFINITY; + int lastNeighbor = 0; + summary.headSum(startNode, sizeAndSum); + final int headSize = sizeAndSum.size; + int i = headSize; + for (int node = startNode; node != RedBlackTree.NIL; node = summary.nextNode(node)) { + double z = Math.abs(summary.mean(node) - x); + if (z <= minDistance) { + minDistance = z; + lastNeighbor = i; + } else { + break; + } + i++; + } + + int closest = RedBlackTree.NIL; + long sum = sizeAndSum.sum; + i = headSize; + double n = 1; + for (int node = startNode; node != RedBlackTree.NIL; node = summary.nextNode(node)) { + if (i > lastNeighbor) { + break; + } + double z = Math.abs(summary.mean(node) - x); + double q = (sum + summary.count(node) / 2.0) / count; + double k = 4 * count * q * (1 - q) / compression; + + // this slightly clever selection method improves accuracy with lots of repeated points + if (z == minDistance && summary.count(node) + w <= k) { + if (gen.nextDouble() < 1 / n) { + closest = node; + } + n++; + } + sum += summary.count(node); + i++; + } + + if (closest == RedBlackTree.NIL) { + summary.addGroup(x, w); + } else { + double centroid = summary.mean(closest); + long count = summary.count(closest); + count += w; + centroid += w * (x - centroid) / count; + summary.updateGroup(closest, centroid, count); + } + count += w; + + if (summary.size() > 100 * compression) { + // something such as sequential ordering of data points + // has caused a pathological expansion of our summary. + // To fight this, we simply replay the current centroids + // in random order. 
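As an illustrative aside (not part of the patch): the insertion loop above only folds the new point into an existing centroid while that centroid's weight stays under a quantile-dependent bound, k = 4 * count * q * (1 - q) / compression, so centroids near the median may absorb many points while centroids in the tails stay tiny, which is what keeps extreme quantiles accurate. The bound on its own:

final class CentroidBoundSketch {
    // maximum weight allowed for a centroid sitting at quantile q of n samples
    static double maxCentroidWeight(double q, long n, double compression) {
        return 4 * n * q * (1 - q) / compression;
    }

    public static void main(String[] args) {
        long n = 1000000;
        double compression = 100;
        System.out.println(maxCentroidWeight(0.5, n, compression));   // 10000.0 near the median
        System.out.println(maxCentroidWeight(0.001, n, compression)); // 39.96 near the 0.1st percentile
    }
}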
+ + // this causes us to forget the diagnostic recording of data points + compress(); + } + } + } + + private int[] shuffleNodes(RedBlackTree tree) { + int[] nodes = new int[tree.size()]; + int i = 0; + for (IntCursor cursor : tree) { + nodes[i++] = cursor.value; + } + assert i == tree.size(); + for (i = tree.size() - 1; i > 0; --i) { + final int slot = gen.nextInt(i + 1); + final int tmp = nodes[slot]; + nodes[slot] = nodes[i]; + nodes[i] = tmp; + } + return nodes; + } + + public void add(TDigestState other) { + final int[] shuffledNodes = shuffleNodes(other.summary); + for (int node : shuffledNodes) { + add(other.summary.mean(node), other.summary.count(node)); + } + } + + public static TDigestState merge(double compression, Iterable subData) { + Preconditions.checkArgument(subData.iterator().hasNext(), "Can't merge 0 digests"); + List elements = Lists.newArrayList(subData); + int n = Math.max(1, elements.size() / 4); + TDigestState r = new TDigestState(compression); + for (int i = 0; i < elements.size(); i += n) { + if (n > 1) { + r.add(merge(compression, elements.subList(i, Math.min(i + n, elements.size())))); + } else { + r.add(elements.get(i)); + } + } + return r; + } + + public void compress() { + compress(summary); + } + + private void compress(GroupRedBlackTree other) { + TDigestState reduced = new TDigestState(compression); + final int[] shuffledNodes = shuffleNodes(other); + for (int node : shuffledNodes) { + reduced.add(other.mean(node), other.count(node)); + } + summary = reduced.summary; + } + + /** + * Returns the number of samples represented in this histogram. If you want to know how many + * centroids are being used, try centroids().size(). + * + * @return the number of samples that have been added. + */ + public long size() { + return count; + } + + public GroupRedBlackTree centroids() { + return summary; + } + + /** + * @param x the value at which the CDF should be evaluated + * @return the approximate fraction of all samples that were less than or equal to x. + */ + public double cdf(double x) { + GroupRedBlackTree values = summary; + if (values.size() == 0) { + return Double.NaN; + } else if (values.size() == 1) { + return x < values.mean(values.root()) ? 0 : 1; + } else { + double r = 0; + + // we scan a across the centroids + Iterator it = values.iterator(); + int a = it.next().value; + + // b is the look-ahead to the next centroid + int b = it.next().value; + + // initially, we set left width equal to right width + double left = (values.mean(b) - values.mean(a)) / 2; + double right = left; + + // scan to next to last element + while (it.hasNext()) { + if (x < values.mean(a) + right) { + return (r + values.count(a) * interpolate(x, values.mean(a) - left, values.mean(a) + right)) / count; + } + r += values.count(a); + + a = b; + b = it.next().value; + + left = right; + right = (values.mean(b) - values.mean(a)) / 2; + } + + // for the last element, assume right width is same as left + left = right; + a = b; + if (x < values.mean(a) + right) { + return (r + values.count(a) * interpolate(x, values.mean(a) - left, values.mean(a) + right)) / count; + } else { + return 1; + } + } + } + + /** + * @param q The quantile desired. Can be in the range [0,1]. + * @return The minimum value x such that we think that the proportion of samples is <= x is q. 
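As an illustrative aside (not part of the patch): compress() and add(TDigestState other) above rebuild the digest by replaying the existing centroids in random order, and shuffleNodes() is a standard Fisher-Yates shuffle over the node ids. The same shuffle on a plain int[] for reference:

import java.util.Random;

final class ShuffleSketch {
    // in-place Fisher-Yates shuffle, matching the loop in shuffleNodes() above
    static void shuffle(int[] nodes, Random gen) {
        for (int i = nodes.length - 1; i > 0; --i) {
            final int slot = gen.nextInt(i + 1);
            final int tmp = nodes[slot];
            nodes[slot] = nodes[i];
            nodes[i] = tmp;
        }
    }
}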
+ */ + public double quantile(double q) { + if (q < 0 || q > 1) { + throw new ElasticsearchIllegalArgumentException("q should be in [0,1], got " + q); + } + GroupRedBlackTree values = summary; + if (values.size() == 0) { + return Double.NaN; + } else if (values.size() == 1) { + return values.mean(values.root()); + } + + // if values were stored in a sorted array, index would be the offset we are interested in + final double index = q * (count - 1); + + double previousMean = Double.NaN, previousIndex = 0; + long total = 0; + int next; + Iterator it = centroids().iterator(); + while (true) { + next = it.next().value; + final double nextIndex = total + (values.count(next) - 1.0) / 2; + if (nextIndex >= index) { + if (Double.isNaN(previousMean)) { + // special case 1: the index we are interested in is before the 1st centroid + if (nextIndex == previousIndex) { + return values.mean(next); + } + // assume values grow linearly between index previousIndex=0 and nextIndex2 + int next2 = it.next().value; + final double nextIndex2 = total + values.count(next) + (values.count(next2) - 1.0) / 2; + previousMean = (nextIndex2 * values.mean(next) - nextIndex * values.mean(next2)) / (nextIndex2 - nextIndex); + } + // common case: we found two centroids previous and next so that the desired quantile is + // after 'previous' but before 'next' + return quantile(previousIndex, index, nextIndex, previousMean, values.mean(next)); + } else if (!it.hasNext()) { + // special case 2: the index we are interested in is beyond the last centroid + // again, assume values grow linearly between index previousIndex and (count - 1) + // which is the highest possible index + final double nextIndex2 = count - 1; + final double nextMean2 = (values.mean(next) * (nextIndex2 - previousIndex) - previousMean * (nextIndex2 - nextIndex)) / (nextIndex - previousIndex); + return quantile(nextIndex, index, nextIndex2, values.mean(next), nextMean2); + } + total += values.count(next); + previousMean = values.mean(next); + previousIndex = nextIndex; + } + } + + private static double quantile(double previousIndex, double index, double nextIndex, double previousMean, double nextMean) { + final double delta = nextIndex - previousIndex; + final double previousWeight = (nextIndex - index) / delta; + final double nextWeight = (index - previousIndex) / delta; + return previousMean * previousWeight + nextMean * nextWeight; + } + + public int centroidCount() { + return summary.size(); + } + + public double compression() { + return compression; + } + + private double interpolate(double x, double x0, double x1) { + return (x - x0) / (x1 - x0); + } + + //===== elastic search serialization ======// + + public static void write(TDigestState state, StreamOutput out) throws IOException { + out.writeDouble(state.compression); + out.writeVInt(state.summary.size()); + for (IntCursor cursor : state.summary) { + final int node = cursor.value; + out.writeDouble(state.summary.mean(node)); + out.writeVLong(state.summary.count(node)); + } + } + + public static TDigestState read(StreamInput in) throws IOException { + double compression = in.readDouble(); + TDigestState state = new TDigestState(compression); + int n = in.readVInt(); + for (int i = 0; i < n; i++) { + state.add(in.readDouble(), in.readVInt()); + } + return state; + } +} diff --git a/src/test/java/org/elasticsearch/benchmark/search/aggregations/PercentilesAggregationSearchBenchmark.java b/src/test/java/org/elasticsearch/benchmark/search/aggregations/PercentilesAggregationSearchBenchmark.java new file 
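As an illustrative aside (not part of the patch): quantile(q) above locates the two centroids whose fractional indexes (derived from the cumulative counts) bracket the requested rank q * (count - 1), then interpolates linearly between their means via the static quantile(...) helper. That interpolation on its own, with a worked call:

final class QuantileInterpolationSketch {
    // linear interpolation of the mean at 'index' between two bracketing centroids
    static double interpolate(double previousIndex, double index, double nextIndex,
                              double previousMean, double nextMean) {
        final double delta = nextIndex - previousIndex;
        final double previousWeight = (nextIndex - index) / delta;
        final double nextWeight = (index - previousIndex) / delta;
        return previousMean * previousWeight + nextMean * nextWeight;
    }

    public static void main(String[] args) {
        // rank 3 lies three quarters of the way from a centroid at index 1.5 (mean 10)
        // to a centroid at index 3.5 (mean 20)
        System.out.println(interpolate(1.5, 3.0, 3.5, 10, 20)); // 17.5
    }
}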
mode 100644 index 0000000000000..e5693b15a2351 --- /dev/null +++ b/src/test/java/org/elasticsearch/benchmark/search/aggregations/PercentilesAggregationSearchBenchmark.java @@ -0,0 +1,209 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.benchmark.search.aggregations; + +import com.google.common.collect.Maps; +import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse; +import org.elasticsearch.action.bulk.BulkRequestBuilder; +import org.elasticsearch.action.bulk.BulkResponse; +import org.elasticsearch.action.search.SearchResponse; +import org.elasticsearch.action.search.SearchType; +import org.elasticsearch.client.Client; +import org.elasticsearch.common.StopWatch; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.unit.SizeValue; +import org.elasticsearch.common.unit.TimeValue; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.common.xcontent.json.JsonXContent; +import org.elasticsearch.node.Node; +import org.elasticsearch.search.aggregations.metrics.percentiles.Percentiles; +import org.elasticsearch.search.aggregations.metrics.percentiles.Percentiles.Percentile; + +import java.util.*; +import java.util.concurrent.TimeUnit; + +import static org.elasticsearch.client.Requests.createIndexRequest; +import static org.elasticsearch.client.Requests.getRequest; +import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_REPLICAS; +import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_SHARDS; +import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder; +import static org.elasticsearch.index.query.QueryBuilders.matchAllQuery; +import static org.elasticsearch.node.NodeBuilder.nodeBuilder; +import static org.elasticsearch.search.aggregations.AggregationBuilders.percentiles; + +public class PercentilesAggregationSearchBenchmark { + + private static final int AMPLITUDE = 10000; + private static final int NUM_DOCS = (int) SizeValue.parseSizeValue("1m").singles(); + private static final int BATCH = 100; + private static final String CLUSTER_NAME = PercentilesAggregationSearchBenchmark.class.getSimpleName(); + private static final double[] PERCENTILES = new double[] { 0, 0.01, 0.1, 1, 10, 25, 50, 75, 90, 99, 99.9, 99.99, 100}; + private static final int QUERY_WARMUP = 10; + private static final int QUERY_COUNT = 20; + + private static Random R = new Random(0); + + // we generate ints to not disadvantage qdigest which only works with integers + private enum Distribution { + UNIFORM { + @Override + int next() { + return (int) (R.nextDouble() * AMPLITUDE); + } + }, + GAUSS { + @Override + int next() { + return (int) (R.nextDouble() * AMPLITUDE); + } + }, + LOG_NORMAL { + @Override + 
int next() { + return (int) Math.exp(R.nextDouble() * Math.log(AMPLITUDE)); + } + }; + String indexName() { + return name().toLowerCase(Locale.ROOT); + } + abstract int next(); + } + + private static double accuratePercentile(double percentile, int[] sortedValues) { + final double index = percentile / 100 * (sortedValues.length - 1); + final int intIndex = (int) index; + final double delta = index - intIndex; + if (delta == 0) { + return sortedValues[intIndex]; + } else { + return sortedValues[intIndex] * (1 - delta) + sortedValues[intIndex + 1] * delta; + } + } + + public static void main(String[] args) throws Exception { + Settings settings = settingsBuilder() + .put("index.refresh_interval", "-1") + .put(SETTING_NUMBER_OF_SHARDS, 100) // to also test performance and accuracy of the reduce phase + .put(SETTING_NUMBER_OF_REPLICAS, 0) + .build(); + + Node[] nodes = new Node[1]; + for (int i = 0; i < nodes.length; i++) { + nodes[i] = nodeBuilder().clusterName(CLUSTER_NAME) + .settings(settingsBuilder().put(settings).put("name", "node" + i)) + .node(); + } + + Node clientNode = nodeBuilder() + .clusterName(CLUSTER_NAME) + .settings(settingsBuilder().put(settings).put("name", "client")).client(true).node(); + + Client client = clientNode.client(); + + for (Distribution d : Distribution.values()) { + try { +// client.admin().indices().prepareDelete(d.indexName()).execute().actionGet(); + client.admin().indices().create(createIndexRequest(d.indexName()).settings(settings)).actionGet(); + } catch (Exception e) { + System.out.println("Index " + d.indexName() + " already exists, skipping index creation"); + continue; + } + + final int[] values = new int[NUM_DOCS]; + for (int i = 0; i < NUM_DOCS; ++i) { + values[i] = d.next(); + } + System.out.println("Indexing " + NUM_DOCS + " documents into " + d.indexName()); + StopWatch stopWatch = new StopWatch().start(); + for (int i = 0; i < NUM_DOCS; ) { + BulkRequestBuilder request = client.prepareBulk(); + for (int j = 0; j < BATCH && i < NUM_DOCS; ++j) { + request.add(client.prepareIndex(d.indexName(), "values", Integer.toString(i)).setSource("v", values[i])); + ++i; + } + BulkResponse response = request.execute().actionGet(); + if (response.hasFailures()) { + System.err.println("--> failures..."); + System.err.println(response.buildFailureMessage()); + } + if ((i % 100000) == 0) { + System.out.println("--> Indexed " + i + " took " + stopWatch.stop().lastTaskTime()); + stopWatch.start(); + } + } + Arrays.sort(values); + XContentBuilder builder = JsonXContent.contentBuilder().startObject(); + for (double percentile : PERCENTILES) { + builder.field(Double.toString(percentile), accuratePercentile(percentile, values)); + } + client.prepareIndex(d.indexName(), "values", "percentiles").setSource(builder.endObject()).execute().actionGet(); + client.admin().indices().prepareRefresh(d.indexName()).execute().actionGet(); + } + + ClusterHealthResponse clusterHealthResponse = client.admin().cluster().prepareHealth().setWaitForGreenStatus().setTimeout("10m").execute().actionGet(); + if (clusterHealthResponse.isTimedOut()) { + System.err.println("--> Timed out waiting for cluster health"); + } + + System.out.println("## Precision"); + for (Distribution d : Distribution.values()) { + System.out.println("#### " + d); + final long count = client.prepareCount(d.indexName()).setQuery(matchAllQuery()).execute().actionGet().getCount(); + if (count != NUM_DOCS + 1) { + throw new Error("Expected " + NUM_DOCS + " documents, got " + (count - 1)); + } + Map percentilesUnsorted = 
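As an illustrative aside (not part of the patch): accuratePercentile() above is the exact reference the benchmark later compares against; it maps the percentile onto a fractional position in the sorted array and interpolates between the two neighbouring values. The same computation with a small worked example:

final class ExactPercentileSketch {
    // exact percentile of an already-sorted array, as in accuratePercentile() above
    static double percentile(double percentile, int[] sortedValues) {
        final double index = percentile / 100 * (sortedValues.length - 1);
        final int intIndex = (int) index;
        final double delta = index - intIndex;
        if (delta == 0) {
            return sortedValues[intIndex];
        }
        return sortedValues[intIndex] * (1 - delta) + sortedValues[intIndex + 1] * delta;
    }

    public static void main(String[] args) {
        int[] values = {1, 2, 3, 4};
        System.out.println(percentile(50, values));  // 2.5
        System.out.println(percentile(100, values)); // 4.0
    }
}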
client.get(getRequest(d.indexName()).type("values").id("percentiles")).actionGet().getSourceAsMap(); + SortedMap percentiles = Maps.newTreeMap(); + for (Map.Entry entry : percentilesUnsorted.entrySet()) { + percentiles.put(Double.parseDouble(entry.getKey()), (Double) entry.getValue()); + } + System.out.println("Expected percentiles: " + percentiles); + System.out.println(); + SearchResponse resp = client.prepareSearch(d.indexName()).setSearchType(SearchType.COUNT).addAggregation(percentiles("pcts").field("v").percentiles(PERCENTILES)).execute().actionGet(); + Percentiles pcts = resp.getAggregations().get("pcts"); + Map asMap = Maps.newLinkedHashMap(); + double sumOfErrorSquares = 0; + for (Percentile percentile : pcts) { + asMap.put(percentile.getPercent(), percentile.getValue()); + double error = percentile.getValue() - percentiles.get(percentile.getPercent()); + sumOfErrorSquares += error * error; + } + System.out.println("Percentiles: " + asMap); + System.out.println("Sum of error squares: " + sumOfErrorSquares); + System.out.println(); + } + + System.out.println("## Performance"); + for (int i = 0; i < 3; ++i) { + for (Distribution d : Distribution.values()) { + System.out.println("#### " + d); + for (int j = 0; j < QUERY_WARMUP; ++j) { + client.prepareSearch(d.indexName()).setSearchType(SearchType.COUNT).addAggregation(percentiles("pcts").field("v").percentiles(PERCENTILES)).execute().actionGet(); + } + long start = System.nanoTime(); + for (int j = 0; j < QUERY_COUNT; ++j) { + client.prepareSearch(d.indexName()).setSearchType(SearchType.COUNT).addAggregation(percentiles("pcts").field("v").percentiles(PERCENTILES)).execute().actionGet(); + } + System.out.println(new TimeValue((System.nanoTime() - start) / QUERY_COUNT, TimeUnit.NANOSECONDS)); + } + } + } + +} diff --git a/src/test/java/org/elasticsearch/common/util/ArrayUtilsTests.java b/src/test/java/org/elasticsearch/common/util/ArrayUtilsTests.java new file mode 100644 index 0000000000000..aef86c2494956 --- /dev/null +++ b/src/test/java/org/elasticsearch/common/util/ArrayUtilsTests.java @@ -0,0 +1,92 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.common.util; + +import org.junit.Test; + +import java.util.Arrays; +import java.util.BitSet; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.is; + +/** + * + */ +public class ArrayUtilsTests { + + @Test + public void binarySearch() throws Exception { + + for (int j = 0; j < 100; j++) { + + int index = Math.min(randomInt(0, 10), 9); + double tolerance = Math.random() * 0.01; + double lookForValue = randomFreq(0.9) ? 
-1 : Double.NaN; // sometimes we'll look for NaN + double[] array = new double[10]; + for (int i = 0; i < array.length; i++) { + double value; + if (randomFreq(0.9)) { + value = Math.random() * 10; + array[i] = value + ((randomFreq(0.5) ? 1 : -1) * Math.random() * tolerance); + + } else { // sometimes we'll have NaN in the array + value = Double.NaN; + array[i] = value; + } + if (i == index && lookForValue < 0) { + lookForValue = value; + } + } + Arrays.sort(array); + + // pick up all the indices that fall within the range of [lookForValue - tolerance, lookForValue + tolerance] + // we need to do this, since we choose the values randomly and we might end up having multiple values in the + // array that will match the looked for value with the random tolerance. In such cases, the binary search will + // return the first one that will match. + BitSet bitSet = new BitSet(10); + for (int i = 0; i < array.length; i++) { + if (Double.isNaN(lookForValue) && Double.isNaN(array[i])) { + bitSet.set(i); + } else if ((array[i] >= lookForValue - tolerance) && (array[i] <= lookForValue + tolerance)) { + bitSet.set(i); + } + } + + int foundIndex = ArrayUtils.binarySearch(array, lookForValue, tolerance); + + if (bitSet.cardinality() == 0) { + assertThat(foundIndex, is(-1)); + } else { + assertThat(bitSet.get(foundIndex), is(true)); + } + } + } + + private boolean randomFreq(double freq) { + return Math.random() < freq; + } + + private int randomInt(int min, int max) { + int delta = (int) (Math.random() * (max - min)); + return min + delta; + } + +} diff --git a/src/test/java/org/elasticsearch/search/aggregations/RandomTests.java b/src/test/java/org/elasticsearch/search/aggregations/RandomTests.java index 6a8df47d391b8..26f187af8a34c 100644 --- a/src/test/java/org/elasticsearch/search/aggregations/RandomTests.java +++ b/src/test/java/org/elasticsearch/search/aggregations/RandomTests.java @@ -20,6 +20,8 @@ package org.elasticsearch.search.aggregations; import com.carrotsearch.hppc.IntOpenHashSet; +import org.elasticsearch.action.bulk.BulkRequestBuilder; +import org.elasticsearch.action.bulk.BulkResponse; import org.elasticsearch.action.search.SearchRequestBuilder; import org.elasticsearch.action.search.SearchResponse; import org.elasticsearch.action.support.IndicesOptions; @@ -273,4 +275,28 @@ public void testDuelTermsHistogram() throws Exception { } } + public void testLargeNumbersOfPercentileBuckets() throws Exception { + // test high numbers of percentile buckets to make sure paging and release work correctly + createIndex("idx"); + + final int numDocs = atLeast(25000); + int t = 0; + for (int i = 0; i < numDocs; ) { + BulkRequestBuilder request = client().prepareBulk(); + final int bulkSize = Math.min(numDocs - i, 100); + client().prepareIndex("idx", "type").setSource("double_value", randomDouble()).execute().actionGet(); + for (int j = 0; j < bulkSize; ++j) { + ++t; + request.add(client().prepareIndex("idx", "type", Integer.toString(i++)).setSource("double_value", randomDouble())); + } + BulkResponse response = request.execute().actionGet(); + assertFalse(response.buildFailureMessage(), response.hasFailures()); + } + assertEquals(numDocs, t); + assertNoFailures(client().admin().indices().prepareRefresh("idx").execute().get()); + + SearchResponse response = client().prepareSearch("idx").addAggregation(terms("terms").field("double_value").subAggregation(percentiles("pcts").field("double_value"))).execute().actionGet(); + assertNoFailures(response); + } + } diff --git 
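As an illustrative aside (not part of the patch): the test above pins down the contract of ArrayUtils.binarySearch only loosely: it must return -1 when no element lies within lookForValue +/- tolerance, and otherwise some index whose value does (the comment notes several indices may qualify). A hypothetical search with those semantics, ignoring the NaN case the test also exercises, might look like:

import java.util.Arrays;

final class ToleranceSearchSketch {
    // returns an index whose value is within +/- tolerance of key, or -1 if none exists;
    // hypothetical helper, not the ArrayUtils implementation from this patch
    static int search(double[] sorted, double key, double tolerance) {
        int i = Arrays.binarySearch(sorted, key);
        if (i >= 0) {
            return i; // exact hit
        }
        i = -i - 1; // insertion point: first element greater than key
        if (i < sorted.length && Math.abs(sorted[i] - key) <= tolerance) {
            return i;
        }
        if (i > 0 && Math.abs(sorted[i - 1] - key) <= tolerance) {
            return i - 1;
        }
        return -1; // nothing within the tolerance window
    }
}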
a/src/test/java/org/elasticsearch/search/aggregations/metrics/AbstractNumericTests.java b/src/test/java/org/elasticsearch/search/aggregations/metrics/AbstractNumericTests.java index df69c9b2a5138..512a09bd6eeb6 100644 --- a/src/test/java/org/elasticsearch/search/aggregations/metrics/AbstractNumericTests.java +++ b/src/test/java/org/elasticsearch/search/aggregations/metrics/AbstractNumericTests.java @@ -42,6 +42,8 @@ public Settings indexSettings() { .build(); } + protected long minValue, maxValue, minValues, maxValues; + @Before public void init() throws Exception { createIndex("idx"); @@ -49,13 +51,18 @@ public void init() throws Exception { List builders = new ArrayList(); - for (int i = 0; i < 10; i++) { // TODO randomize the size and the params in here? + final int numDocs = 10; + for (int i = 0; i < numDocs; i++) { // TODO randomize the size and the params in here? builders.add(client().prepareIndex("idx", "type", ""+i).setSource(jsonBuilder() .startObject() .field("value", i+1) .startArray("values").value(i+2).value(i+3).endArray() .endObject())); } + minValue = 1; + minValues = 2; + maxValue = numDocs; + maxValues = numDocs + 2; indexRandom(true, builders); // creating an index to test the empty buckets functionality. The way it works is by indexing diff --git a/src/test/java/org/elasticsearch/search/aggregations/metrics/GroupTree.java b/src/test/java/org/elasticsearch/search/aggregations/metrics/GroupTree.java new file mode 100644 index 0000000000000..44e00a064bf83 --- /dev/null +++ b/src/test/java/org/elasticsearch/search/aggregations/metrics/GroupTree.java @@ -0,0 +1,482 @@ +package org.elasticsearch.search.aggregations.metrics; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +import com.google.common.base.Preconditions; +import com.google.common.collect.AbstractIterator; +import com.google.common.collect.Lists; + +import java.util.ArrayDeque; +import java.util.Deque; +import java.util.Iterator; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; + +/** + * Upstream: Stream-lib, master @ 704002a2d8fa01fa7e9868dae9d0c8bedd8e9427 + * https://github.com/addthis/stream-lib/blob/master/src/main/java/com/clearspring/analytics/stream/quantile/GroupTree.java + */ + +/** + * A tree containing TDigest.Group. This adds to the normal NavigableSet the + * ability to sum up the size of elements to the left of a particular group. 
+ */ +public class GroupTree implements Iterable { + private int count; + int size; + private int depth; + private Group leaf; + private GroupTree left, right; + + public GroupTree() { + count = size = depth = 0; + leaf = null; + left = right = null; + } + + public GroupTree(Group leaf) { + size = depth = 1; + this.leaf = leaf; + count = leaf.count(); + left = right = null; + } + + public GroupTree(GroupTree left, GroupTree right) { + this.left = left; + this.right = right; + count = left.count + right.count; + size = left.size + right.size; + rebalance(); + leaf = this.right.first(); + } + + public void add(Group group) { + if (size == 0) { + leaf = group; + depth = 1; + count = group.count(); + size = 1; + return; + } else if (size == 1) { + int order = group.compareTo(leaf); + if (order < 0) { + left = new GroupTree(group); + right = new GroupTree(leaf); + } else if (order > 0) { + left = new GroupTree(leaf); + right = new GroupTree(group); + leaf = group; + } + } else if (group.compareTo(leaf) < 0) { + left.add(group); + } else { + right.add(group); + } + count += group.count(); + size++; + depth = Math.max(left.depth, right.depth) + 1; + + rebalance(); + } + + private void rebalance() { + int l = left.depth(); + int r = right.depth(); + if (l > r + 1) { + if (left.left.depth() > left.right.depth()) { + rotate(left.left.left, left.left.right, left.right, right); + } else { + rotate(left.left, left.right.left, left.right.right, right); + } + } else if (r > l + 1) { + if (right.left.depth() > right.right.depth()) { + rotate(left, right.left.left, right.left.right, right.right); + } else { + rotate(left, right.left, right.right.left, right.right.right); + } + } else { + depth = Math.max(left.depth(), right.depth()) + 1; + } + } + + private void rotate(GroupTree a, GroupTree b, GroupTree c, GroupTree d) { + left = new GroupTree(a, b); + right = new GroupTree(c, d); + count = left.count + right.count; + size = left.size + right.size; + depth = Math.max(left.depth(), right.depth()) + 1; + leaf = right.first(); + } + + private int depth() { + return depth; + } + + public int size() { + return size; + } + + /** + * @return the number of items strictly before the current element + */ + public int headCount(Group base) { + if (size == 0) { + return 0; + } else if (left == null) { + return leaf.compareTo(base) < 0 ? 1 : 0; + } else { + if (base.compareTo(leaf) < 0) { + return left.headCount(base); + } else { + return left.size + right.headCount(base); + } + } + } + + /** + * @return the sum of the size() function for all elements strictly before the current element. + */ + public int headSum(Group base) { + if (size == 0) { + return 0; + } else if (left == null) { + return leaf.compareTo(base) < 0 ? count : 0; + } else { + if (base.compareTo(leaf) <= 0) { + return left.headSum(base); + } else { + return left.count + right.headSum(base); + } + } + } + + /** + * @return the first Group in this set + */ + public Group first() { + Preconditions.checkState(size > 0, "No first element of empty set"); + if (left == null) { + return leaf; + } else { + return left.first(); + } + } + + /** + * Iteratres through all groups in the tree. + */ + public Iterator iterator() { + return iterator(null); + } + + /** + * Iterates through all of the Groups in this tree in ascending order of means + * + * @param start The place to start this subset. Remember that Groups are ordered by mean *and* id. 
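As an illustrative aside (not part of the patch): headCount() and headSum() above are what make GroupTree an order-statistic structure; because every subtree carries its size and count totals, the number of samples strictly before a given centroid is available without visiting every element. On a flat array the same quantity is just a prefix sum:

final class HeadSumSketch {
    // sum of counts of all centroids strictly before index i (centroids sorted by mean);
    // the tree answers this in O(log n) instead of O(n)
    static long headSum(long[] countsSortedByMean, int i) {
        long sum = 0;
        for (int j = 0; j < i; ++j) {
            sum += countsSortedByMean[j];
        }
        return sum;
    }
}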
+ * @return An iterator that goes through the groups in order of mean and id starting at or after the + * specified Group. + */ + private Iterator iterator(final Group start) { + return new AbstractIterator() { + { + stack = new ArrayDeque(); + push(GroupTree.this, start); + } + + Deque stack; + + // recurses down to the leaf that is >= start + // pending right hand branches on the way are put on the stack + private void push(GroupTree z, Group start) { + while (z.left != null) { + if (start == null || start.compareTo(z.leaf) < 0) { + // remember we will have to process the right hand branch later + stack.push(z.right); + // note that there is no guarantee that z.left has any good data + z = z.left; + } else { + // if the left hand branch doesn't contain start, then no push + z = z.right; + } + } + // put the leaf value on the stack if it is valid + if (start == null || z.leaf.compareTo(start) >= 0) { + stack.push(z); + } + } + + @Override + protected Group computeNext() { + GroupTree r = stack.poll(); + while (r != null && r.left != null) { + // unpack r onto the stack + push(r, start); + r = stack.poll(); + } + + // at this point, r == null or r.left == null + // if r == null, stack is empty and we are done + // if r != null, then r.left != null and we have a result + if (r != null) { + return r.leaf; + } + return endOfData(); + } + }; + } + + public void remove(Group base) { + Preconditions.checkState(size > 0, "Cannot remove from empty set"); + if (size == 1) { + Preconditions.checkArgument(base.compareTo(leaf) == 0, "Element %s not found", base); + count = size = 0; + leaf = null; + } else { + if (base.compareTo(leaf) < 0) { + if (left.size > 1) { + left.remove(base); + count -= base.count(); + size--; + rebalance(); + } else { + size = right.size; + count = right.count; + depth = right.depth; + leaf = right.leaf; + left = right.left; + right = right.right; + } + } else { + if (right.size > 1) { + right.remove(base); + leaf = right.first(); + count -= base.count(); + size--; + rebalance(); + } else { + size = left.size; + count = left.count; + depth = left.depth; + leaf = left.leaf; + right = left.right; + left = left.left; + } + } + } + } + + /** + * @return the largest element less than or equal to base + */ + public Group floor(Group base) { + if (size == 0) { + return null; + } else { + if (size == 1) { + return base.compareTo(leaf) >= 0 ? leaf : null; + } else { + if (base.compareTo(leaf) < 0) { + return left.floor(base); + } else { + Group floor = right.floor(base); + if (floor == null) { + floor = left.last(); + } + return floor; + } + } + } + } + + public Group last() { + Preconditions.checkState(size > 0, "Cannot find last element of empty set"); + if (size == 1) { + return leaf; + } else { + return right.last(); + } + } + + /** + * @return the smallest element greater than or equal to base. + */ + public Group ceiling(Group base) { + if (size == 0) { + return null; + } else if (size == 1) { + return base.compareTo(leaf) <= 0 ? leaf : null; + } else { + if (base.compareTo(leaf) < 0) { + Group r = left.ceiling(base); + if (r == null) { + r = right.first(); + } + return r; + } else { + return right.ceiling(base); + } + } + } + + /** + * @return the subset of elements equal to or greater than base. 
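As an illustrative aside (not part of the patch): the iterator above avoids recursion by descending towards the first element at or after 'start' while pushing the pending right-hand branches onto an explicit Deque, then unpacking them lazily in computeNext(). The same idea on an ordinary binary search tree (node class hypothetical):

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.List;

final class InOrderTailSketch {
    static final class Node {
        final int value;
        Node left, right;
        Node(int value) { this.value = value; }
    }

    // in-order values >= start, using an explicit stack instead of recursion
    static List<Integer> tailValues(Node root, int start) {
        List<Integer> out = new ArrayList<Integer>();
        Deque<Node> stack = new ArrayDeque<Node>();
        Node n = root;
        while (n != null || !stack.isEmpty()) {
            while (n != null) {
                if (n.value >= start) {
                    stack.push(n);   // revisit this node after its left subtree
                    n = n.left;
                } else {
                    n = n.right;     // everything to the left is below start, skip it
                }
            }
            Node next = stack.pop();
            out.add(next.value);
            n = next.right;
        }
        return out;
    }
}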
+ */ + public Iterable tailSet(final Group start) { + return new Iterable() { + @Override + public Iterator iterator() { + return GroupTree.this.iterator(start); + } + }; + } + + public int sum() { + return count; + } + + public void checkBalance() { + if (left != null) { + Preconditions.checkState(Math.abs(left.depth() - right.depth()) < 2, "Imbalanced"); + int l = left.depth(); + int r = right.depth(); + Preconditions.checkState(depth == Math.max(l, r) + 1, "Depth doesn't match children"); + Preconditions.checkState(size == left.size + right.size, "Sizes don't match children"); + Preconditions.checkState(count == left.count + right.count, "Counts don't match children"); + Preconditions.checkState(leaf.compareTo(right.first()) == 0, "Split is wrong %.5d != %.5d or %d != %d", leaf.mean(), right.first().mean(), leaf.id(), right.first().id()); + left.checkBalance(); + right.checkBalance(); + } + } + + public void print(int depth) { + for (int i = 0; i < depth; i++) { + System.out.printf("| "); + } + int imbalance = Math.abs((left != null ? left.depth : 1) - (right != null ? right.depth : 1)); + System.out.printf("%s%s, %d, %d, %d\n", (imbalance > 1 ? "* " : "") + (right != null && leaf.compareTo(right.first()) != 0 ? "+ " : ""), leaf, size, count, this.depth); + if (left != null) { + left.print(depth + 1); + right.print(depth + 1); + } + } + + + public static class Group implements Comparable { + private static final AtomicInteger uniqueCount = new AtomicInteger(1); + + double centroid = 0; + int count = 0; + private int id; + + private List actualData = null; + + private Group(boolean record) { + id = uniqueCount.incrementAndGet(); + if (record) { + actualData = Lists.newArrayList(); + } + } + + public Group(double x) { + this(false); + start(x, uniqueCount.getAndIncrement()); + } + + public Group(double x, int id) { + this(false); + start(x, id); + } + + public Group(double x, int id, boolean record) { + this(record); + start(x, id); + } + + private void start(double x, int id) { + this.id = id; + add(x, 1); + } + + public void add(double x, int w) { + if (actualData != null) { + actualData.add(x); + } + count += w; + centroid += w * (x - centroid) / count; + } + + public double mean() { + return centroid; + } + + public int count() { + return count; + } + + public int id() { + return id; + } + + @Override + public String toString() { + return "Group{" + + "centroid=" + centroid + + ", count=" + count + + '}'; + } + + @Override + public int hashCode() { + return id; + } + + @Override + public int compareTo(Group o) { + int r = Double.compare(centroid, o.centroid); + if (r == 0) { + r = id - o.id; + } + return r; + } + + public Iterable data() { + return actualData; + } + + public static Group createWeighted(double x, int w, Iterable data) { + Group r = new Group(data != null); + r.add(x, w, data); + return r; + } + + private void add(double x, int w, Iterable data) { + if (actualData != null) { + if (data != null) { + for (Double old : data) { + actualData.add(old); + } + } else { + actualData.add(x); + } + } + count += w; + centroid += w * (x - centroid) / count; + } + } +} \ No newline at end of file diff --git a/src/test/java/org/elasticsearch/search/aggregations/metrics/GroupTreeTests.java b/src/test/java/org/elasticsearch/search/aggregations/metrics/GroupTreeTests.java new file mode 100644 index 0000000000000..5f29ef30b9fe9 --- /dev/null +++ b/src/test/java/org/elasticsearch/search/aggregations/metrics/GroupTreeTests.java @@ -0,0 +1,100 @@ +/* + * Licensed to Elasticsearch under one 
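As an illustrative aside (not part of the patch): Group.add() above folds a weighted point into the centroid without keeping the points around, using the running-mean update count += w; centroid += w * (x - centroid) / count. A tiny standalone check that this matches the plain weighted mean:

final class RunningMeanSketch {
    public static void main(String[] args) {
        double[] xs = {1.0, 4.0, 10.0};
        int[] ws = {2, 1, 3};

        double centroid = 0;
        long count = 0;
        for (int i = 0; i < xs.length; ++i) {
            count += ws[i];
            centroid += ws[i] * (xs[i] - centroid) / count; // same update as Group.add()
        }

        // plain weighted mean: (1*2 + 4*1 + 10*3) / 6 = 36 / 6 = 6
        System.out.println(centroid); // 6.0
    }
}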
or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.search.aggregations.metrics; + +import com.carrotsearch.hppc.cursors.IntCursor; +import com.google.common.collect.Lists; +import org.elasticsearch.search.aggregations.metrics.GroupTree.Group; +import org.elasticsearch.search.aggregations.metrics.percentiles.tdigest.GroupRedBlackTree; +import org.elasticsearch.search.aggregations.metrics.percentiles.tdigest.GroupRedBlackTree.SizeAndSum; +import org.elasticsearch.test.ElasticsearchTestCase; + +import java.util.Collections; +import java.util.Iterator; +import java.util.List; + +public class GroupTreeTests extends ElasticsearchTestCase { + + public void testDuel() { + GroupTree tree1 = new GroupTree(); + GroupRedBlackTree tree2 = new GroupRedBlackTree(randomInt(100)); + + // Add elements + final int elements = atLeast(100); + for (int i = 0; i < elements; ++i) { + final double centroid = randomDouble(); + final int count = randomIntBetween(1, 5); + Group g = new Group(centroid, i); + g.add(centroid, count - 1); + tree1.add(g); + tree2.addGroup(centroid, count, i); + } + assertEquals(tree1, tree2); + + // Remove + List toRemove = Lists.newArrayList(); + for (Group group : tree1) { + if (randomBoolean()) { + toRemove.add(group); + } + } + Collections.shuffle(toRemove, getRandom()); + for (Group group : toRemove) { + tree1.remove(group); + final boolean removed = tree2.removeGroup(group.mean(), group.id()); + assertTrue(removed); + } + assertEquals(tree1, tree2); + } + + public static void assertEquals(GroupTree tree1, GroupRedBlackTree tree2) { + assertEquals(tree1.size(), tree2.size()); + + Iterator groups1 = tree1.iterator(); + Iterator groups2 = tree2.iterator(); + while (true) { + assertEquals(groups1.hasNext(), groups2.hasNext()); + if (!groups1.hasNext()) { + break; + } + final Group next1 = groups1.next(); + final IntCursor next2 = groups2.next(); + assertEquals(next1.mean(), tree2.mean(next2.value), 0.0001); + assertEquals(next1.count(), tree2.count(next2.value)); + } + assertConsistent(tree2); + } + + public static void assertConsistent(GroupRedBlackTree tree) { + int i = 0; + long sum = 0; + for (IntCursor cursor : tree) { + final int node = cursor.value; + + SizeAndSum s = new GroupRedBlackTree.SizeAndSum(); + tree.headSum(node, s); + assertEquals(i, s.size); + assertEquals(sum, s.sum); + + i++; + sum += tree.count(node); + } + } +} diff --git a/src/test/java/org/elasticsearch/search/aggregations/metrics/PercentilesTests.java b/src/test/java/org/elasticsearch/search/aggregations/metrics/PercentilesTests.java new file mode 100644 index 0000000000000..016f657f8aabd --- /dev/null +++ b/src/test/java/org/elasticsearch/search/aggregations/metrics/PercentilesTests.java @@ -0,0 +1,391 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license 
agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.search.aggregations.metrics; + +import com.google.common.collect.Lists; +import org.elasticsearch.action.search.SearchResponse; +import org.elasticsearch.common.logging.Loggers; +import org.elasticsearch.search.aggregations.bucket.histogram.Histogram; +import org.elasticsearch.search.aggregations.bucket.histogram.Histogram.Order; +import org.elasticsearch.search.aggregations.metrics.percentiles.PercentilesBuilder; +import org.elasticsearch.search.aggregations.metrics.percentiles.Percentiles; +import org.elasticsearch.search.aggregations.metrics.percentiles.Percentiles.Estimator.TDigest; +import org.elasticsearch.search.aggregations.metrics.percentiles.Percentiles.Percentile; +import org.junit.Test; + +import java.util.Arrays; +import java.util.List; + +import static org.elasticsearch.index.query.QueryBuilders.matchAllQuery; +import static org.elasticsearch.search.aggregations.AggregationBuilders.histogram; +import static org.elasticsearch.search.aggregations.AggregationBuilders.percentiles; +import static org.hamcrest.Matchers.*; + +/** + * + */ +public class PercentilesTests extends AbstractNumericTests { + + private static double[] randomPercentiles() { + final int length = randomIntBetween(1, 20); + final double[] percentiles = new double[length]; + for (int i = 0; i < percentiles.length; ++i) { + switch (randomInt(20)) { + case 0: + percentiles[i] = 0; + break; + case 1: + percentiles[i] = 100; + break; + default: + percentiles[i] = randomDouble() * 100; + break; + } + } + Arrays.sort(percentiles); + Loggers.getLogger(PercentilesTests.class).info("Using percentiles={}", Arrays.toString(percentiles)); + return percentiles; + } + + private static PercentilesBuilder randomEstimator(PercentilesBuilder builder) { + if (randomBoolean()) { + TDigest estimator = TDigest.tDigest(); + estimator.compression(randomDouble() * 100); + builder.estimator(estimator); + } + return builder; + } + + private void assertConsistent(double[] pcts, Percentiles percentiles, long minValue, long maxValue) { + final List percentileList = Lists.newArrayList(percentiles); + assertEquals(pcts.length, percentileList.size()); + for (int i = 0; i < pcts.length; ++i) { + final Percentile percentile = percentileList.get(i); + assertThat(percentile.getPercent(), equalTo(pcts[i])); + assertThat(percentile.getValue(), greaterThanOrEqualTo((double) minValue)); + assertThat(percentile.getValue(), lessThanOrEqualTo((double) maxValue)); + + if (percentile.getPercent() == 0) { + assertThat(percentile.getValue(), equalTo((double) minValue)); + } + if (percentile.getPercent() == 100) { + assertThat(percentile.getValue(), equalTo((double) maxValue)); + } + } + + for (int i = 1; i < percentileList.size(); ++i) { + assertThat(percentileList.get(i).getValue(), 
greaterThanOrEqualTo(percentileList.get(i - 1).getValue())); + } + } + + @Test + public void testEmptyAggregation() throws Exception { + + SearchResponse searchResponse = client().prepareSearch("empty_bucket_idx") + .setQuery(matchAllQuery()) + .addAggregation(histogram("histo").field("value").interval(1l).minDocCount(0) + .subAggregation(randomEstimator(percentiles("percentiles")) + .percentiles(10, 15))) + .execute().actionGet(); + + assertThat(searchResponse.getHits().getTotalHits(), equalTo(2l)); + Histogram histo = searchResponse.getAggregations().get("histo"); + assertThat(histo, notNullValue()); + Histogram.Bucket bucket = histo.getBucketByKey(1l); + assertThat(bucket, notNullValue()); + + Percentiles percentiles = bucket.getAggregations().get("percentiles"); + assertThat(percentiles, notNullValue()); + assertThat(percentiles.getName(), equalTo("percentiles")); + assertThat(percentiles.percentile(10), equalTo(Double.NaN)); + assertThat(percentiles.percentile(15), equalTo(Double.NaN)); + } + + @Test + public void testUnmapped() throws Exception { + SearchResponse searchResponse = client().prepareSearch("idx_unmapped") + .setQuery(matchAllQuery()) + .addAggregation(randomEstimator(percentiles("percentiles")) + .field("value") + .percentiles(0, 10, 15, 100)) + .execute().actionGet(); + + assertThat(searchResponse.getHits().getTotalHits(), equalTo(0l)); + + Percentiles percentiles = searchResponse.getAggregations().get("percentiles"); + assertThat(percentiles, notNullValue()); + assertThat(percentiles.getName(), equalTo("percentiles")); + assertThat(percentiles.percentile(0), equalTo(Double.NaN)); + assertThat(percentiles.percentile(10), equalTo(Double.NaN)); + assertThat(percentiles.percentile(15), equalTo(Double.NaN)); + assertThat(percentiles.percentile(100), equalTo(Double.NaN)); + } + + @Test + public void testSingleValuedField() throws Exception { + final double[] pcts = randomPercentiles(); + SearchResponse searchResponse = client().prepareSearch("idx") + .setQuery(matchAllQuery()) + .addAggregation(randomEstimator(percentiles("percentiles")) + .field("value") + .percentiles(pcts)) + .execute().actionGet(); + + assertThat(searchResponse.getHits().getTotalHits(), equalTo(10l)); + + final Percentiles percentiles = searchResponse.getAggregations().get("percentiles"); + assertConsistent(pcts, percentiles, minValue, maxValue); + } + + @Test + public void testSingleValuedField_PartiallyUnmapped() throws Exception { + final double[] pcts = randomPercentiles(); + SearchResponse searchResponse = client().prepareSearch("idx", "idx_unmapped") + .setQuery(matchAllQuery()) + .addAggregation(randomEstimator(percentiles("percentiles")) + .field("value") + .percentiles(pcts)) + .execute().actionGet(); + + assertThat(searchResponse.getHits().getTotalHits(), equalTo(10l)); + + final Percentiles percentiles = searchResponse.getAggregations().get("percentiles"); + assertConsistent(pcts, percentiles, minValue, maxValue); + } + + @Test + public void testSingleValuedField_WithValueScript() throws Exception { + final double[] pcts = randomPercentiles(); + SearchResponse searchResponse = client().prepareSearch("idx") + .setQuery(matchAllQuery()) + .addAggregation(randomEstimator(percentiles("percentiles")) + .field("value").script("_value - 1") + .percentiles(pcts)) + .execute().actionGet(); + + assertThat(searchResponse.getHits().getTotalHits(), equalTo(10l)); + + final Percentiles percentiles = searchResponse.getAggregations().get("percentiles"); + assertConsistent(pcts, percentiles, minValue - 1, 
maxValue - 1); + } + + @Test + public void testSingleValuedField_WithValueScript_WithParams() throws Exception { + final double[] pcts = randomPercentiles(); + SearchResponse searchResponse = client().prepareSearch("idx") + .setQuery(matchAllQuery()) + .addAggregation(randomEstimator(percentiles("percentiles")) + .field("value").script("_value - dec").param("dec", 1) + .percentiles(pcts)) + .execute().actionGet(); + + assertThat(searchResponse.getHits().getTotalHits(), equalTo(10l)); + + final Percentiles percentiles = searchResponse.getAggregations().get("percentiles"); + assertConsistent(pcts, percentiles, minValue - 1, maxValue - 1); + } + + @Test + public void testMultiValuedField() throws Exception { + final double[] pcts = randomPercentiles(); + SearchResponse searchResponse = client().prepareSearch("idx") + .setQuery(matchAllQuery()) + .addAggregation(randomEstimator(percentiles("percentiles")) + .field("values") + .percentiles(pcts)) + .execute().actionGet(); + + assertThat(searchResponse.getHits().getTotalHits(), equalTo(10l)); + + final Percentiles percentiles = searchResponse.getAggregations().get("percentiles"); + assertConsistent(pcts, percentiles, minValues, maxValues); + } + + @Test + public void testMultiValuedField_WithValueScript() throws Exception { + final double[] pcts = randomPercentiles(); + SearchResponse searchResponse = client().prepareSearch("idx") + .setQuery(matchAllQuery()) + .addAggregation(randomEstimator(percentiles("percentiles")) + .field("values").script("_value - 1") + .percentiles(pcts)) + .execute().actionGet(); + + assertThat(searchResponse.getHits().getTotalHits(), equalTo(10l)); + + final Percentiles percentiles = searchResponse.getAggregations().get("percentiles"); + assertConsistent(pcts, percentiles, minValues - 1, maxValues - 1); + } + + @Test + public void testMultiValuedField_WithValueScript_Reverse() throws Exception { + final double[] pcts = randomPercentiles(); + SearchResponse searchResponse = client().prepareSearch("idx") + .setQuery(matchAllQuery()) + .addAggregation(randomEstimator(percentiles("percentiles")) + .field("values").script("_value * -1") + .percentiles(pcts)) + .execute().actionGet(); + + assertThat(searchResponse.getHits().getTotalHits(), equalTo(10l)); + + final Percentiles percentiles = searchResponse.getAggregations().get("percentiles"); + assertConsistent(pcts, percentiles, -maxValues, -minValues); + } + + @Test + public void testMultiValuedField_WithValueScript_WithParams() throws Exception { + final double[] pcts = randomPercentiles(); + SearchResponse searchResponse = client().prepareSearch("idx") + .setQuery(matchAllQuery()) + .addAggregation(randomEstimator(percentiles("percentiles")) + .field("values").script("_value - dec").param("dec", 1) + .percentiles(pcts)) + .execute().actionGet(); + + assertThat(searchResponse.getHits().getTotalHits(), equalTo(10l)); + + final Percentiles percentiles = searchResponse.getAggregations().get("percentiles"); + assertConsistent(pcts, percentiles, minValues - 1, maxValues - 1); + } + + @Test + public void testScript_SingleValued() throws Exception { + final double[] pcts = randomPercentiles(); + SearchResponse searchResponse = client().prepareSearch("idx") + .setQuery(matchAllQuery()) + .addAggregation(randomEstimator(percentiles("percentiles")) + .script("doc['value'].value") + .percentiles(pcts)) + .execute().actionGet(); + + assertThat(searchResponse.getHits().getTotalHits(), equalTo(10l)); + + final Percentiles percentiles = 
searchResponse.getAggregations().get("percentiles"); + assertConsistent(pcts, percentiles, minValue, maxValue); + } + + @Test + public void testScript_SingleValued_WithParams() throws Exception { + final double[] pcts = randomPercentiles(); + SearchResponse searchResponse = client().prepareSearch("idx") + .setQuery(matchAllQuery()) + .addAggregation(randomEstimator(percentiles("percentiles")) + .script("doc['value'].value - dec").param("dec", 1) + .percentiles(pcts)) + .execute().actionGet(); + + assertThat(searchResponse.getHits().getTotalHits(), equalTo(10l)); + + final Percentiles percentiles = searchResponse.getAggregations().get("percentiles"); + assertConsistent(pcts, percentiles, minValue - 1, maxValue - 1); + } + + @Test + public void testScript_ExplicitSingleValued_WithParams() throws Exception { + final double[] pcts = randomPercentiles(); + SearchResponse searchResponse = client().prepareSearch("idx") + .setQuery(matchAllQuery()) + .addAggregation(randomEstimator(percentiles("percentiles")) + .script("doc['value'].value - dec").param("dec", 1) + .percentiles(pcts)) + .execute().actionGet(); + + assertThat(searchResponse.getHits().getTotalHits(), equalTo(10l)); + + final Percentiles percentiles = searchResponse.getAggregations().get("percentiles"); + assertConsistent(pcts, percentiles, minValue - 1, maxValue - 1); + } + + @Test + public void testScript_MultiValued() throws Exception { + final double[] pcts = randomPercentiles(); + SearchResponse searchResponse = client().prepareSearch("idx") + .setQuery(matchAllQuery()) + .addAggregation(randomEstimator(percentiles("percentiles")) + .script("doc['values'].values") + .percentiles(pcts)) + .execute().actionGet(); + + assertThat(searchResponse.getHits().getTotalHits(), equalTo(10l)); + + final Percentiles percentiles = searchResponse.getAggregations().get("percentiles"); + assertConsistent(pcts, percentiles, minValues, maxValues); + } + + @Test + public void testScript_ExplicitMultiValued() throws Exception { + final double[] pcts = randomPercentiles(); + SearchResponse searchResponse = client().prepareSearch("idx") + .setQuery(matchAllQuery()) + .addAggregation(randomEstimator(percentiles("percentiles")) + .script("doc['values'].values") + .percentiles(pcts)) + .execute().actionGet(); + + assertThat(searchResponse.getHits().getTotalHits(), equalTo(10l)); + + final Percentiles percentiles = searchResponse.getAggregations().get("percentiles"); + assertConsistent(pcts, percentiles, minValues, maxValues); + } + + @Test + public void testScript_MultiValued_WithParams() throws Exception { + final double[] pcts = randomPercentiles(); + SearchResponse searchResponse = client().prepareSearch("idx") + .setQuery(matchAllQuery()) + .addAggregation(randomEstimator(percentiles("percentiles")) + .script("List values = doc['values'].values; double[] res = new double[values.length]; for (int i = 0; i < res.length; i++) { res[i] = values.get(i) - dec; }; return res;").param("dec", 1) + .percentiles(pcts)) + .execute().actionGet(); + + assertThat(searchResponse.getHits().getTotalHits(), equalTo(10l)); + + final Percentiles percentiles = searchResponse.getAggregations().get("percentiles"); + assertConsistent(pcts, percentiles, minValues - 1, maxValues - 1); + } + + @Test + public void testOrderBySubAggregation() { + boolean asc = randomBoolean(); + SearchResponse searchResponse = client().prepareSearch("idx") + .setQuery(matchAllQuery()) + .addAggregation( + histogram("histo").field("value").interval(2l) + 
.subAggregation(randomEstimator(percentiles("percentiles").percentiles(99))) + .order(Order.aggregation("percentiles", "99", asc))) + .execute().actionGet(); + + assertThat(searchResponse.getHits().getTotalHits(), equalTo(10l)); + + Histogram histo = searchResponse.getAggregations().get("histo"); + double previous = asc ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY; + for (Histogram.Bucket bucket : histo.getBuckets()) { + Percentiles percentiles = bucket.getAggregations().get("percentiles"); + double p99 = percentiles.percentile(99); + if (asc) { + assertThat(p99, greaterThanOrEqualTo(previous)); + } else { + assertThat(p99, lessThanOrEqualTo(previous)); + } + previous = p99; + } + } + +} \ No newline at end of file diff --git a/src/test/java/org/elasticsearch/search/aggregations/metrics/RedBlackTreeTests.java b/src/test/java/org/elasticsearch/search/aggregations/metrics/RedBlackTreeTests.java new file mode 100644 index 0000000000000..4dc14111b2293 --- /dev/null +++ b/src/test/java/org/elasticsearch/search/aggregations/metrics/RedBlackTreeTests.java @@ -0,0 +1,185 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.search.aggregations.metrics; + +import com.carrotsearch.hppc.cursors.IntCursor; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; +import com.google.common.primitives.Ints; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.FixedBitSet; +import org.elasticsearch.search.aggregations.metrics.percentiles.tdigest.RedBlackTree; +import org.elasticsearch.test.ElasticsearchTestCase; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; + +public class RedBlackTreeTests extends ElasticsearchTestCase { + + private static class IntRedBlackTree extends RedBlackTree { + + private int tmpValue; + private int[] values; + private int[] counts; + + IntRedBlackTree() { + super(10); + values = new int[5]; + counts = new int[5]; + } + + @Override + protected int compare(int node) { + return Ints.compare(tmpValue, values[node]); + } + + @Override + protected void merge(int node) { + counts[node] += 1; + } + + @Override + protected void copy(int node) { + values[node] = tmpValue; + counts[node] = 1; + } + + @Override + protected int newNode() { + final int newNode = super.newNode(); + if (values.length <= newNode) { + values = ArrayUtil.grow(values, newNode + 1); + counts = Arrays.copyOf(counts, values.length); + } + return newNode; + } + + public boolean add(int value) { + tmpValue = value; + return super.addNode(); + } + + public boolean remove(int value) { + tmpValue = value; + final int nodeToRemove = getNode(); + if (nodeToRemove == NIL) { + return false; + } else { + super.removeNode(nodeToRemove); + return true; + } + } + + @Override + public String toString() { + StringBuilder b = new StringBuilder(); + b.append("{ "); + for (IntCursor cursor : this) { + b.append(values[cursor.value] + "(" + counts[cursor.value] + "), "); + } + b.setLength(b.length() - 2); + b.append(" } "); + return b.toString(); + } + + } + + public void testAdd() { + Map map = Maps.newHashMap(); + IntRedBlackTree tree = new IntRedBlackTree(); + final int iters = atLeast(1000); + for (int i = 0; i < iters; ++i) { + final int value = randomInt(200); + final boolean added = tree.add(value); + tree.assertConsistent(); + assertEquals(!map.containsKey(value), added); + if (map.containsKey(value)) { + map.put(value, map.get(value) + 1); + } else { + map.put(value, 1); + } + assertEquals(map.size(), tree.size()); + } + + int size = 0; + int previousValue = Integer.MIN_VALUE; + for (IntCursor cursor : tree) { + ++size; + final int value = tree.values[cursor.value]; + assertTrue(previousValue < value); + assertEquals(map.get(value).intValue(), tree.counts[cursor.value]); + previousValue = value; + } + assertEquals(map.size(), size); + } + + public void testRemove() { + final int numValues = atLeast(200); + final FixedBitSet values = new FixedBitSet(numValues); + values.set(0, numValues); + IntRedBlackTree tree = new IntRedBlackTree(); + for (int i = 0; i < numValues; ++i) { + tree.add(i); + } + + final int iters = atLeast(300); + for (int i = 0; i < iters; ++i) { + final int value = randomInt(numValues - 1); + final boolean removed = tree.remove(value); + assertEquals(removed, values.get(value)); + values.clear(value); + assertEquals(values.cardinality(), tree.size()); + tree.assertConsistent(); + } + + int size = 0; + int previousValue = Integer.MIN_VALUE; + for (IntCursor cursor : tree) { + ++size; + final int value = tree.values[cursor.value]; + assertTrue(previousValue < value); + 
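As an illustrative aside (not part of the patch): IntRedBlackTree above keeps its per-node payload in parallel values[]/counts[] arrays indexed by node id and grows them inside newNode(); the tree never allocates a Java object per element. A minimal sketch of that grow-on-demand parallel-array pattern by itself (class and method names hypothetical):

import java.util.Arrays;

final class NodePoolSketch {
    private int size;
    private int[] values = new int[4];
    private int[] counts = new int[4];

    // allocate a node id and make sure the parallel payload arrays can hold it
    int newNode(int value) {
        final int node = size++;
        if (node >= values.length) {
            final int newLength = values.length + (values.length >> 1) + 1; // ~1.5x growth
            values = Arrays.copyOf(values, newLength);
            counts = Arrays.copyOf(counts, newLength);
        }
        values[node] = value;
        counts[node] = 1;
        return node;
    }

    int value(int node) { return values[node]; }
    int count(int node) { return counts[node]; }
}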
assertTrue(values.get(value)); + previousValue = value; + } + assertEquals(values.cardinality(), size); + } + + public void testReverse() { + IntRedBlackTree tree = new IntRedBlackTree(); + final int iters = atLeast(1000); + for (int i = 0; i < iters; ++i) { + final int value = randomInt(2000); + tree.add(value); + } + List sortedNodes = Lists.newArrayList(); + for (IntCursor cursor : tree) { + sortedNodes.add(cursor.value); + } + List reverseNodes = Lists.newArrayList(); + for (IntCursor cursor : tree.reverseSet()) { + reverseNodes.add(cursor.value); + } + Collections.reverse(sortedNodes); + assertEquals(sortedNodes, reverseNodes); + } + +} diff --git a/src/test/java/org/elasticsearch/search/aggregations/metrics/TDigestStateTests.java b/src/test/java/org/elasticsearch/search/aggregations/metrics/TDigestStateTests.java new file mode 100644 index 0000000000000..c898540569f29 --- /dev/null +++ b/src/test/java/org/elasticsearch/search/aggregations/metrics/TDigestStateTests.java @@ -0,0 +1,194 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.search.aggregations.metrics; + +import com.google.common.collect.Lists; +import org.apache.mahout.math.jet.random.AbstractContinousDistribution; +import org.apache.mahout.math.jet.random.Gamma; +import org.apache.mahout.math.jet.random.Normal; +import org.apache.mahout.math.jet.random.Uniform; +import org.elasticsearch.search.aggregations.metrics.percentiles.tdigest.TDigestState; +import org.elasticsearch.test.ElasticsearchTestCase; +import org.junit.Test; + +import java.util.Arrays; +import java.util.List; +import java.util.Locale; +import java.util.Random; + +import static org.hamcrest.Matchers.lessThan; + +// imported tests from upstream's TDigestTest +public class TDigestStateTests extends ElasticsearchTestCase { + + @Test + public void testUniform() { + Random gen = getRandom(); + for (int i = 0; i < 10; i++) { + runTest(new Uniform(0, 1, gen), 100, + new double[]{0.001, 0.01, 0.1, 0.5, 0.9, 0.99, 0.999}, + "uniform"); + } + } + + @Test + public void testGamma() { + // this Gamma distribution is very heavily skewed. The 0.1%-ile is 6.07e-30 while + // the median is 0.006 and the 99.9th %-ile is 33.6 while the mean is 1. + // this severe skew means that we have to have positional accuracy that + // varies by over 11 orders of magnitude. 
+ Random gen = getRandom(); + for (int i = 0; i < 10; i++) { + runTest(new Gamma(0.1, 0.1, gen), 100, +// new double[]{6.0730483624079e-30, 6.0730483624079e-20, 6.0730483627432e-10, 5.9339110446023e-03, +// 2.6615455373884e+00, 1.5884778179295e+01, 3.3636770117188e+01}, + new double[]{0.001, 0.01, 0.1, 0.5, 0.9, 0.99, 0.999}, + "gamma"); + } + } + + @Test + public void testNarrowNormal() { + // this mixture of a uniform and normal distribution has a very narrow peak which is centered + // near the median. Our system should be scale invariant and work well regardless. + final Random gen = getRandom(); + AbstractContinousDistribution mix = new AbstractContinousDistribution() { + AbstractContinousDistribution normal = new Normal(0, 1e-5, gen); + AbstractContinousDistribution uniform = new Uniform(-1, 1, gen); + + @Override + public double nextDouble() { + double x; + if (gen.nextDouble() < 0.5) { + x = uniform.nextDouble(); + } else { + x = normal.nextDouble(); + } + return x; + } + }; + + for (int i = 0; i < 10; i++) { + runTest(mix, 100, new double[]{0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 0.99, 0.999}, "mixture"); + } + } + + @Test + public void testRepeatedValues() { + final Random gen = getRandom(); + + // 5% of samples will be 0 or 1.0. 10% for each of the values 0.1 through 0.9 + AbstractContinousDistribution mix = new AbstractContinousDistribution() { + @Override + public double nextDouble() { + return Math.rint(gen.nextDouble() * 10) / 10.0; + } + }; + + TDigestState dist = new TDigestState((double) 1000); + List data = Lists.newArrayList(); + for (int i1 = 0; i1 < 100000; i1++) { + double x = mix.nextDouble(); + data.add(x); + dist.add(x); + } + + // I would be happier with 5x compression, but repeated values make things kind of weird + assertTrue("Summary is too large", dist.centroidCount() < 10 * (double) 1000); + + // all quantiles should round to nearest actual value + for (int i = 0; i < 10; i++) { + double z = i / 10.0; + // we skip over troublesome points that are nearly halfway between + for (double delta : new double[] {0.01, 0.02, 0.03, 0.07, 0.08, 0.09}) { + double q = z + delta; + double cdf = dist.cdf(q); + // we also relax the tolerances for repeated values + assertEquals(String.format(Locale.ROOT, "z=%.1f, q = %.3f, cdf = %.3f", z, q, cdf), z + 0.05, cdf, 0.01); + + double estimate = dist.quantile(q); + assertEquals(String.format(Locale.ROOT, "z=%.1f, q = %.3f, cdf = %.3f, estimate = %.3f", z, q, cdf, estimate), Math.rint(q * 10) / 10.0, estimate, 0.001); + } + } + } + + @Test + public void testSequentialPoints() { + for (int i = 0; i < 10; i++) { + runTest(new AbstractContinousDistribution() { + double base = 0; + + @Override + public double nextDouble() { + base += Math.PI * 1e-5; + return base; + } + }, 100, new double[]{0.001, 0.01, 0.1, 0.5, 0.9, 0.99, 0.999}, + "sequential"); + } + } + + public void runTest(AbstractContinousDistribution gen, double sizeGuide, double[] qValues, String tag) { + final int len = 100000; + final TDigestState dist = new TDigestState(sizeGuide); + double[] data = new double[len]; + for (int i = 0; i < len; ++i) { + double x = gen.nextDouble(); + data[i] = x; + dist.add(x); + } + dist.compress(); + Arrays.sort(data); + + double[] xValues = qValues.clone(); + for (int i = 0; i < qValues.length; i++) { + double ix = data.length * qValues[i] - 0.5; + int index = (int) Math.floor(ix); + double p = ix - index; + xValues[i] = data[index] * (1 - p) + data[index + 1] * p; + } + + assertThat("Summary is too large", (double) dist.centroidCount(), 
lessThan(sizeGuide * 10)); + int softErrors = 0; + for (int i = 0; i < xValues.length; i++) { + double x = xValues[i]; + double q = qValues[i]; + double estimate = dist.cdf(x); + assertEquals(q, estimate, 0.005); + + estimate = cdf(dist.quantile(q), data); + if (Math.abs(q - estimate) > 0.005) { + softErrors++; + } + assertEquals(q, estimate, 0.012); + } + assertTrue(softErrors < 3); + } + + private double cdf(final double x, double[] data) { + int n1 = 0; + int n2 = 0; + for (double v : data) { + n1 += (v < x) ? 1 : 0; + n2 += (v <= x) ? 1 : 0; + } + return (n1 + n2) / 2.0 / data.length; + } +} diff --git a/src/test/java/org/elasticsearch/test/cache/recycler/MockBigArrays.java b/src/test/java/org/elasticsearch/test/cache/recycler/MockBigArrays.java index 0cefc8dc15c7e..7e56757f6c04d 100644 --- a/src/test/java/org/elasticsearch/test/cache/recycler/MockBigArrays.java +++ b/src/test/java/org/elasticsearch/test/cache/recycler/MockBigArrays.java @@ -336,6 +336,11 @@ public int increment(long index, int inc) { return in.increment(index, inc); } + @Override + public void fill(long fromIndex, long toIndex, int value) { + in.fill(fromIndex, toIndex, value); + } + } private class LongArrayWrapper extends AbstractArrayWrapper implements LongArray { From a0935bc48a90ec5dee0bac4586c091e16592b641 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Mon, 3 Mar 2014 20:31:38 +0100 Subject: [PATCH 4/6] Fix test bug: a too low compression level can make accuracy terrible. --- .../search/aggregations/metrics/PercentilesTests.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/test/java/org/elasticsearch/search/aggregations/metrics/PercentilesTests.java b/src/test/java/org/elasticsearch/search/aggregations/metrics/PercentilesTests.java index 016f657f8aabd..4c85659b0da81 100644 --- a/src/test/java/org/elasticsearch/search/aggregations/metrics/PercentilesTests.java +++ b/src/test/java/org/elasticsearch/search/aggregations/metrics/PercentilesTests.java @@ -23,10 +23,10 @@ import org.elasticsearch.common.logging.Loggers; import org.elasticsearch.search.aggregations.bucket.histogram.Histogram; import org.elasticsearch.search.aggregations.bucket.histogram.Histogram.Order; -import org.elasticsearch.search.aggregations.metrics.percentiles.PercentilesBuilder; import org.elasticsearch.search.aggregations.metrics.percentiles.Percentiles; import org.elasticsearch.search.aggregations.metrics.percentiles.Percentiles.Estimator.TDigest; import org.elasticsearch.search.aggregations.metrics.percentiles.Percentiles.Percentile; +import org.elasticsearch.search.aggregations.metrics.percentiles.PercentilesBuilder; import org.junit.Test; import java.util.Arrays; @@ -66,7 +66,7 @@ private static double[] randomPercentiles() { private static PercentilesBuilder randomEstimator(PercentilesBuilder builder) { if (randomBoolean()) { TDigest estimator = TDigest.tDigest(); - estimator.compression(randomDouble() * 100); + estimator.compression(randomIntBetween(20, 120) + randomDouble()); builder.estimator(estimator); } return builder; From 778ddb2abd52c5b750c33d46bc3b3dda968de7fe Mon Sep 17 00:00:00 2001 From: Binh Ly Date: Wed, 26 Feb 2014 14:01:59 -0500 Subject: [PATCH 5/6] [DOCS] Java API JSON typo --- docs/java-api/index_.asciidoc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/java-api/index_.asciidoc b/docs/java-api/index_.asciidoc index 0ba4542ffe1d2..5ad9ef508c54b 100644 --- a/docs/java-api/index_.asciidoc +++ b/docs/java-api/index_.asciidoc @@ -37,7 +37,7 @@ dates regarding to the 
String json = "{" + "\"user\":\"kimchy\"," + "\"postDate\":\"2013-01-30\"," + - "\"message\":\"trying out Elasticsearch\"," + + "\"message\":\"trying out Elasticsearch\"" + "}"; -------------------------------------------------- @@ -152,7 +152,7 @@ don't have to give an ID: String json = "{" + "\"user\":\"kimchy\"," + "\"postDate\":\"2013-01-30\"," + - "\"message\":\"trying out Elasticsearch\"," + + "\"message\":\"trying out Elasticsearch\"" + "}"; IndexResponse response = client.prepareIndex("twitter", "tweet") From 4c8b076b9aae370ee16e727eb69d5b208a8cebd1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Holger=20Hoffst=C3=A4tte?= Date: Tue, 4 Mar 2014 12:02:16 +0100 Subject: [PATCH 6/6] Rewrite on top of BigArrays/ByteArray. Delete NonePageCacheRecycler. --- .../NonePageCacheRecyclerService.java | 161 ----------------- .../common/io/stream/BytesStreamOutput.java | 171 +++--------------- 2 files changed, 29 insertions(+), 303 deletions(-) delete mode 100644 src/main/java/org/elasticsearch/cache/recycler/NonePageCacheRecyclerService.java diff --git a/src/main/java/org/elasticsearch/cache/recycler/NonePageCacheRecyclerService.java b/src/main/java/org/elasticsearch/cache/recycler/NonePageCacheRecyclerService.java deleted file mode 100644 index 717675610d961..0000000000000 --- a/src/main/java/org/elasticsearch/cache/recycler/NonePageCacheRecyclerService.java +++ /dev/null @@ -1,161 +0,0 @@ -/* - * Licensed to Elasticsearch under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. Elasticsearch licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.elasticsearch.cache.recycler; - -import org.elasticsearch.common.recycler.AbstractRecyclerC; -import org.elasticsearch.common.recycler.NoneRecycler; -import org.elasticsearch.common.recycler.Recycler; -import org.elasticsearch.common.recycler.Recycler.V; -import org.elasticsearch.common.util.BigArrays; - -// Should implement PageCacheRecycler once that is made into an interface -public class NonePageCacheRecyclerService { - - // Shared non-recycler for legacy constructor clients; should be removed eventually. 
- public static final NonePageCacheRecyclerService INSTANCE = new NonePageCacheRecyclerService(); - - private final Recycler byteRecycler; - private final Recycler intRecycler; - private final Recycler longRecycler; - private final Recycler floatRecycler; - private final Recycler doubleRecycler; - private final Recycler objectRecycler; - - private NonePageCacheRecyclerService() { - Recycler.C rcb = new NoneRecyclingByteFactory(); - this.byteRecycler = new NoneRecycler(rcb); - Recycler.C rci = new NoneRecyclingIntFactory(); - this.intRecycler = new NoneRecycler(rci); - Recycler.C rcl = new NoneRecyclingLongFactory(); - this.longRecycler = new NoneRecycler(rcl); - Recycler.C rcf = new NoneRecyclingFloatFactory(); - this.floatRecycler = new NoneRecycler(rcf); - Recycler.C rcd = new NoneRecyclingDoubleFactory(); - this.doubleRecycler = new NoneRecycler(rcd); - Recycler.C rco = new NoneRecyclingObjectFactory(); - this.objectRecycler = new NoneRecycler(rco); - } - - public V bytePage(boolean clear) { - return byteRecycler.obtain(BigArrays.BYTE_PAGE_SIZE); - } - - public V intPage(boolean clear) { - return intRecycler.obtain(BigArrays.BYTE_PAGE_SIZE); - } - - public V longPage(boolean clear) { - return longRecycler.obtain(BigArrays.BYTE_PAGE_SIZE); - } - - public V floatPage(boolean clear) { - return floatRecycler.obtain(BigArrays.BYTE_PAGE_SIZE); - } - - public V doublePage(boolean clear) { - return doubleRecycler.obtain(BigArrays.BYTE_PAGE_SIZE); - } - - public V objectPage() { - return objectRecycler.obtain(BigArrays.BYTE_PAGE_SIZE); - } - - public void close() { - byteRecycler.close(); - intRecycler.close(); - longRecycler.close(); - floatRecycler.close(); - doubleRecycler.close(); - objectRecycler.close(); - } - - private static class NoneRecyclingByteFactory extends AbstractRecyclerC { - @Override - public byte[] newInstance(int sizing) { - return new byte[sizing]; - } - - @Override - public void recycle(byte[] value) { - // don't actually recycle: drop for GC - } - } - - private static class NoneRecyclingIntFactory extends AbstractRecyclerC { - @Override - public int[] newInstance(int sizing) { - return new int[sizing]; - } - - @Override - public void recycle(int[] value) { - // don't actually recycle: drop for GC - } - } - - private static class NoneRecyclingLongFactory extends AbstractRecyclerC { - @Override - public long[] newInstance(int sizing) { - return new long[sizing]; - } - - @Override - public void recycle(long[] value) { - // don't actually recycle: drop for GC - } - } - - private static class NoneRecyclingFloatFactory extends AbstractRecyclerC { - @Override - public float[] newInstance(int sizing) { - return new float[sizing]; - } - - @Override - public void recycle(float[] value) { - // don't actually recycle: drop for GC - } - } - - private static class NoneRecyclingDoubleFactory extends AbstractRecyclerC { - @Override - public double[] newInstance(int sizing) { - return new double[sizing]; - } - - @Override - public void recycle(double[] value) { - // don't actually recycle: drop for GC - } - } - - private static class NoneRecyclingObjectFactory extends AbstractRecyclerC { - @Override - public Object[] newInstance(int sizing) { - return new Object[sizing]; - } - - @Override - public void recycle(Object[] value) { - // don't actually recycle: drop for GC - } - } - -} diff --git a/src/main/java/org/elasticsearch/common/io/stream/BytesStreamOutput.java b/src/main/java/org/elasticsearch/common/io/stream/BytesStreamOutput.java index 581ecfc8d7c86..cbf2753c78a6e 100644 --- 
a/src/main/java/org/elasticsearch/common/io/stream/BytesStreamOutput.java +++ b/src/main/java/org/elasticsearch/common/io/stream/BytesStreamOutput.java @@ -19,51 +19,41 @@ package org.elasticsearch.common.io.stream; -import org.elasticsearch.cache.recycler.NonePageCacheRecyclerService; -import org.elasticsearch.cache.recycler.PageCacheRecycler; +import org.apache.lucene.util.BytesRef; import org.elasticsearch.common.bytes.BytesArray; import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.io.BytesStream; -import org.elasticsearch.common.recycler.Recycler; import org.elasticsearch.common.util.BigArrays; +import org.elasticsearch.common.util.ByteArray; import java.io.IOException; -import java.util.ArrayList; -import java.util.List; /** - * A @link {@link StreamOutput} that uses a {@link PageCacheRecycler} to acquire pages of - * bytes, which avoids frequent reallocation & copying of the internal data. Pages are - * returned to the recycler on {@link #close()}. + * A {@link StreamOutput} that uses {@link BigArrays} to acquire pages of + * bytes, which avoids frequent reallocation & copying of the internal data. */ public class BytesStreamOutput extends StreamOutput implements BytesStream { /** - * PageCacheRecycler for acquiring/releasing individual memory pages + * Factory/manager for our ByteArray */ - private final NonePageCacheRecyclerService pageRecycler; + private final BigArrays bigarrays; /** - * The buffer where data is stored. + * The internal list of pages. */ - private final List<Recycler.V<byte[]>> pages; + private ByteArray bytes; /** * The number of valid bytes in the buffer. */ private int count; - /** - * Size of a page taken from the PageCacheRecycler. We assume a constant page size for - * all requests. - */ - private final int pageSize; - /** * Create a nonrecycling {@link BytesStreamOutput} with 1 initial page acquired. */ public BytesStreamOutput() { - this(NonePageCacheRecyclerService.INSTANCE, BigArrays.BYTE_PAGE_SIZE); + this(BigArrays.PAGE_SIZE_IN_BYTES); } /** @@ -73,42 +63,8 @@ public BytesStreamOutput() { * @param expectedSize the expected maximum size of the stream in bytes. */ public BytesStreamOutput(int expectedSize) { - this(NonePageCacheRecyclerService.INSTANCE, expectedSize); - } - - /** - * Create a {@link BytesStreamOutput} with 1 initial page acquired. - * - * @param pageCacheRecycler the {@link PageCacheRecycler} from which to obtain - * bytes[]s. - */ - private BytesStreamOutput(NonePageCacheRecyclerService pageCacheRecycler) { - // expected size does not matter as long as it's >0 - this(pageCacheRecycler, 1); - } - - /** - * Create a {@link BytesStreamOutput} with enough initial pages acquired to satisfy - * the capacity given by {@link expectedSize}. - * - * @param pageCacheRecycler the {@link PageCacheRecycler} from which to obtain - * bytes[]s. - * @param expectedSize the expected maximum size of the stream in bytes. - */ - private BytesStreamOutput(NonePageCacheRecyclerService pageCacheRecycler, int expectedSize) { - this.pageRecycler = pageCacheRecycler; - // there is no good way to figure out the pageSize used by a PCR, so - // get one page and use its size. 
- Recycler.V vpage = pageRecycler.bytePage(true); - this.pageSize = vpage.v().length; - // expect 16 pages by default, more if specified - this.pages = new ArrayList>(Math.max(16, expectedSize / pageSize)); - // keep already acquired page - this.pages.add(vpage); - // acquire all other requested pages up front if expectedSize > pageSize - if (expectedSize > pageSize) { - ensureCapacity(expectedSize); - } + bigarrays = BigArrays.NON_RECYCLING_INSTANCE; + bytes = bigarrays.newByteArray(expectedSize); } @Override @@ -124,10 +80,8 @@ public long position() throws IOException { @Override public void writeByte(byte b) throws IOException { ensureOpen(); - ensureCapacity(count); - byte[] page = pages.get(count / pageSize).v(); - int offset = count % pageSize; - page[offset] = b; + ensureCapacity(count+1); + bytes.set(count, b); count++; } @@ -146,59 +100,28 @@ public void writeBytes(byte[] b, int offset, int length) throws IOException { } // get enough pages for new size - ensureCapacity(count + length); - - // where are we? - int currentPageIndex = count / pageSize; - byte[] currentPage = pages.get(currentPageIndex).v(); - int initialOffset = count % pageSize; - int remainingPageCapacity = pageSize - initialOffset; - int remainingLength = length; - - // try to fill the current page pointed to by #count in one copy if possible - if (remainingLength <= remainingPageCapacity) { - // simply copy all of length - System.arraycopy(b, offset, currentPage, initialOffset, remainingLength); - count += remainingLength; - } - else { - // first fill remainder of first page - System.arraycopy(b, offset, currentPage, initialOffset, remainingPageCapacity); - count += remainingPageCapacity; - remainingLength -= remainingPageCapacity; - int copied = remainingPageCapacity; + ensureCapacity(count+length); - // if there is enough to copy try to fill adjacent pages directly - while (remainingLength > pageSize) { - // advance & fill next page - currentPage = pages.get(++currentPageIndex).v(); - System.arraycopy(b, offset + copied, currentPage, 0, pageSize); - copied += pageSize; - remainingLength -= pageSize; - count += pageSize; - } + // bulk copy + bytes.set(count, b, offset, length); - // finally take care of remaining bytes: tricky since the above loop may not - // have run, and #count could point to the middle of a page. So figure out - // again where we are. - currentPageIndex = count / pageSize; - currentPage = pages.get(currentPageIndex).v(); - initialOffset = count % pageSize; - System.arraycopy(b, offset + copied, currentPage, initialOffset, remainingLength); - count += remainingLength; - } + // advance + count += length; } public void reset() { ensureOpen(); - // reset the count but keep all acquired pages + + // shrink list of pages + bytes = bigarrays.resize(bytes, BigArrays.PAGE_SIZE_IN_BYTES); + + // go back to start count = 0; } @Override public void flush() throws IOException { ensureOpen(); - // nothing to do there } @Override @@ -206,6 +129,7 @@ public void seek(long position) throws IOException { if (position > Integer.MAX_VALUE) { throw new IllegalArgumentException("position " + position + " > Integer.MAX_VALUE"); } + count = (int)position; ensureCapacity(count); } @@ -233,41 +157,13 @@ public int size() { @Override public BytesReference bytes() { - // for now just create a copy; later we might create a page-aware BytesReference - // and just transfer ownership. 
- return new BytesArray(toByteArray(), 0, count); + BytesRef bref = new BytesRef(); + bytes.get(0, count, bref); + return new BytesArray(bref, false); } - private byte[] toByteArray() { - // stricly speaking this is undefined. :/ - // ensureOpen(); - - if (count <= pageSize) { - // simply return the first page - return pages.get(0).v(); - } - - // create result array - byte[] result = new byte[count]; - int resultOffset = 0; - - // copy all full pages - int toCopy = Math.min(count, pageSize); - int numPages = count / pageSize; - for (int i = 0; i < numPages; i++) { - byte[] page = pages.get(i).v(); - System.arraycopy(page, 0, result, resultOffset, toCopy); - resultOffset += toCopy; - } - - // copy any remaining bytes from the last page - int remainder = count % pageSize; - if (remainder > 0) { - byte[] lastPage = pages.get(numPages).v(); - System.arraycopy(lastPage, 0, result, resultOffset, remainder); - } - - return result; + private void ensureCapacity(int offset) { + bytes = bigarrays.grow(bytes, offset); } private void ensureOpen() { @@ -276,13 +172,4 @@ private void ensureOpen() { } } - private void ensureCapacity(int offset) { - int capacity = pageSize * pages.size(); - while (offset >= capacity) { - Recycler.V vpage = pageRecycler.bytePage(true); - pages.add(vpage); - capacity += pageSize; - } - } - }
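
A minimal usage sketch of the reworked, paged BytesStreamOutput from PATCH 6/6 may help reviewers see the intended behaviour. It assumes the classes from this series are on the classpath; the class name, payload string and loop count below are illustrative only and not taken from the patches.

import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.BytesStreamOutput;

import java.io.IOException;
import java.nio.charset.StandardCharsets;

public class BytesStreamOutputUsageSketch {

    public static void main(String[] args) throws IOException {
        // Starts with a single page; ensureCapacity() grows the backing ByteArray
        // via BigArrays.grow() as more bytes are written.
        BytesStreamOutput out = new BytesStreamOutput();

        byte[] chunk = "some payload".getBytes(StandardCharsets.UTF_8);
        for (int i = 0; i < 100000; i++) {
            // bulk copy into the paged buffer
            out.writeBytes(chunk, 0, chunk.length);
        }

        // number of valid bytes written so far
        System.out.println("size = " + out.size());

        // materialize the written content as a BytesReference
        BytesReference ref = out.bytes();
        System.out.println("reference length = " + ref.length());

        // reset() shrinks the backing storage to one page and rewinds the write position
        out.reset();
        System.out.println("size after reset = " + out.size());
    }
}

The design point this illustrates: capacity grows page by page through BigArrays, so large streams avoid the frequent reallocation and copying called out in the class javadoc, and bytes() hands the written range back as a BytesReference without the caller ever seeing the paging.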