-
Notifications
You must be signed in to change notification settings - Fork 434
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix: allow layer flushes more often (#7927)
As seen with the pgvector 0.7.0 index builds, we can receive large batches of images, leading to very large L0 layers in the range of 1GB. These large layers are produced because we are only able to roll the layer after we have witnessed two different Lsns in a single `DataDirModification::commit`. As the single Lsn batches of images can span over multiple `DataDirModification` lifespans, we will rarely get to write two different Lsns in a single `put_batch` currently. The solution is to remember the TimelineWriterState instead of eagerly forgetting it until we really open the next layer or someone else flushes (while holding the write_guard). Additional changes are test fixes to avoid "initdb image layer optimization" or ignoring initdb layers for assertion. Cc: #7197 because small `checkpoint_distance` will now trigger the "initdb image layer optimization"
- Loading branch information
Showing
9 changed files
with
245 additions
and
37 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,151 @@ | ||
from dataclasses import dataclass | ||
from typing import Iterable, List, Union | ||
|
||
import pytest | ||
from fixtures.log_helper import log | ||
from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn | ||
from fixtures.pageserver.http import HistoricLayerInfo, LayerMapInfo | ||
from fixtures.utils import human_bytes | ||
|
||
|
||
def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder, build_type: str):
    """
    Build a non-small GIN index which includes similarly batched up images in WAL stream as does pgvector
    to show that we no longer create oversized layers.

    Layer sizes are checked twice: right after ingestion (L0s only) and again
    after forcing compaction, by bucketing the historic layers into size ranges
    and asserting no bucket above 2*checkpoint_distance is populated.
    """

    if build_type == "debug":
        pytest.skip("debug run is unnecessarily slow")

    # lower bounds used to build the size histogram buckets below
    minimum_initdb_size = 20 * 1024**2
    checkpoint_distance = 32 * 1024**2
    minimum_good_layer_size = checkpoint_distance * 0.9
    minimum_too_large_layer_size = 2 * checkpoint_distance

    # index size: 99MiB
    rows = 2_500_000

    # bucket lower limits
    buckets = [0, minimum_initdb_size, minimum_good_layer_size, minimum_too_large_layer_size]

    assert (
        minimum_initdb_size < minimum_good_layer_size
    ), "keep checkpoint_distance higher than the initdb size (find it by experimenting)"

    env = neon_env_builder.init_start(
        initial_tenant_conf={
            "checkpoint_distance": f"{checkpoint_distance}",
            "compaction_target_size": f"{checkpoint_distance}",
            # this test is primarily interested in L0 sizes but we'll compact after ingestion to ensure sizes are good even then
            "compaction_period": "0s",
            "gc_period": "0s",
            # disable background compaction/image creation until re-enabled explicitly below
            "compaction_threshold": "255",
            "image_creation_threshold": "99999",
        }
    )

    # build a larger than 3*checkpoint_distance sized gin index.
    # gin index building exhibits the same behaviour as the pgvector with the two phase build
    with env.endpoints.create_start("main") as ep, ep.cursor() as cur:
        cur.execute(
            f"create table int_array_test as select array_agg(g) as int_array from generate_series(1, {rows}) g group by g / 10;"
        )
        cur.execute(
            "create index int_array_test_gin_index on int_array_test using gin (int_array);"
        )
        cur.execute("select pg_table_size('int_array_test_gin_index')")
        size = cur.fetchone()
        assert size is not None
        assert isinstance(size[0], int)
        log.info(f"gin index size: {human_bytes(size[0])}")
        assert (
            size[0] > checkpoint_distance * 3
        ), f"gin index is not large enough: {human_bytes(size[0])}"
        wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline)

    ps_http = env.pageserver.http_client()
    ps_http.timeline_checkpoint(env.initial_tenant, env.initial_timeline)

    infos = ps_http.layer_map_info(env.initial_tenant, env.initial_timeline)
    assert len(infos.in_memory_layers) == 0, "should have flushed open layers"
    post_ingest = histogram_historic_layers(infos, buckets)

    # describe first, assert later for easier debugging
    log.info("non-cumulative layer size distribution after ingestion:")
    print_layer_size_histogram(post_ingest)

    # since all we have are L0s, we should be getting nice L1s and images out of them now
    ps_http.patch_tenant_config_client_side(
        env.initial_tenant,
        {
            "compaction_threshold": 1,
            "image_creation_threshold": 1,
        },
    )

    ps_http.timeline_compact(env.initial_tenant, env.initial_timeline, True, True)

    infos = ps_http.layer_map_info(env.initial_tenant, env.initial_timeline)
    assert len(infos.in_memory_layers) == 0, "no new inmem layers expected"
    post_compact = histogram_historic_layers(infos, buckets)

    log.info("non-cumulative layer size distribution after compaction:")
    print_layer_size_histogram(post_compact)

    assert (
        post_ingest.counts[3] == 0
    ), f"there should be no layers larger than 2*checkpoint_distance ({human_bytes(2*checkpoint_distance)})"
    assert post_ingest.counts[1] == 1, "expect one smaller layer for initdb"
    assert (
        post_ingest.counts[0] <= 1
    ), "expect at most one tiny layer from shutting down the endpoint"

    # just make sure we don't have trouble splitting the layers apart
    assert post_compact.counts[3] == 0
|
||
|
||
@dataclass
class Histogram:
    """Non-cumulative size histogram held as three parallel lists.

    Index i of each list describes one bucket:
    - buckets: inclusive lower size bound of bucket i
    - counts: how many layers landed in bucket i
    - sums: total bytes of the layers in bucket i
    """

    buckets: List[Union[int, float]]
    counts: List[int]
    sums: List[int]
|
||
|
||
def histogram_historic_layers(
    infos: LayerMapInfo, minimum_sizes: List[Union[int, float]]
) -> Histogram:
    """Log every historic layer and bucket their file sizes into a Histogram."""
    layer_sizes = []
    for layer in infos.historic_layers:
        log.info(
            f"{layer.layer_file_name} {human_bytes(layer.layer_file_size)} ({layer.layer_file_size} bytes)"
        )
        layer_sizes.append(layer.layer_file_size)

    return histogram(layer_sizes, minimum_sizes)
|
||
|
||
def histogram(sizes: Iterable[int], minimum_sizes: List[Union[int, float]]) -> Histogram:
    """Bucket each size into the highest bucket whose lower bound it reaches.

    minimum_sizes is the strictly increasing list of inclusive lower bounds;
    a size below the first bound trips an AssertionError.
    """
    # bounds must be strictly increasing for the reverse scan below to work
    assert all(a < b for a, b in zip(minimum_sizes, minimum_sizes[1:]))

    bucket_count = len(minimum_sizes)
    counts = [0] * bucket_count
    sums = [0] * bucket_count

    for size in sizes:
        # scan lower bounds from highest to lowest; first one reached wins
        for index in range(bucket_count - 1, -1, -1):
            if size >= minimum_sizes[index]:
                counts[index] += 1
                sums[index] += size
                break
        else:
            # no bucket accepted this size
            assert False

    return Histogram(minimum_sizes, counts, sums)
|
||
|
||
def print_layer_size_histogram(h: Histogram):
    """Log one line per bucket: its lower bound, layer count, and total bytes."""
    for bucket_index, lower_bound in enumerate(h.buckets):
        log.info(
            f">= {human_bytes(lower_bound)}: {h.counts[bucket_index]} layers total {human_bytes(h.sums[bucket_index])}"
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
b52e31c
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
3286 tests run: 3131 passed, 0 failed, 155 skipped (full report)
Flaky tests (1)
Postgres 15
test_statvfs_pressure_usage
: debugCode coverage* (full report)
functions
:31.5% (6602 of 20962 functions)
lines
:48.5% (51080 of 105408 lines)
* collected from Rust tests only
b52e31c at 2024-06-10T15:07:37.751Z :recycle: