fix(dynamic-sampling): Add query timeout logging for more functions (#…
iambriccardo authored Jul 6, 2023
1 parent f9e82e0 commit a54d212
Showing 6 changed files with 52 additions and 9 deletions.
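All six files apply the same pattern: each paginated Snuba query runs inside a while loop bounded by a wall-clock budget, and the loop's else branch, which Python executes only when the loop ends without a break, logs that the query ran out of time. This commit adds that logging to more functions and records the budget itself via a new timeout_seconds argument. A minimal sketch of the pattern follows; fetch_with_timeout_logging, run_query, and the local CHUNK_SIZE are illustrative stand-ins, not code from this commit.

import time

from sentry.dynamic_sampling.tasks.constants import MAX_SECONDS
from sentry.dynamic_sampling.tasks.logging import log_query_timeout

CHUNK_SIZE = 1000  # illustrative page size, not the project's constant


def fetch_with_timeout_logging(run_query):
    # run_query(limit, offset) -> (rows, more_results) stands in for the real Snuba plumbing.
    start_time = time.time()
    offset = 0
    results = []

    while (time.time() - start_time) < MAX_SECONDS:
        rows, more_results = run_query(limit=CHUNK_SIZE, offset=offset)
        results.extend(rows)
        offset += CHUNK_SIZE
        if not more_results:
            break
    else:
        # Reached only when the time budget ran out before pagination finished.
        log_query_timeout(
            query="fetch_with_timeout_logging", offset=offset, timeout_seconds=MAX_SECONDS
        )

    return results
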
@@ -193,7 +193,9 @@ def fetch_projects_with_total_root_transaction_count_and_rates(
             break
     else:
         log_query_timeout(
-            query="fetch_projects_with_total_root_transaction_count_and_rates", offset=offset
+            query="fetch_projects_with_total_root_transaction_count_and_rates",
+            offset=offset,
+            timeout_seconds=MAX_SECONDS,
         )
 
     return aggregated_projects
15 changes: 13 additions & 2 deletions src/sentry/dynamic_sampling/tasks/boost_low_volume_transactions.py
@@ -290,6 +290,11 @@ def get_orgs_with_project_counts(
             last_result = last_result[first_idx:]
         if not more_results:
             break
+    else:
+        log_query_timeout(
+            query="get_orgs_with_project_counts", offset=offset, timeout_seconds=MAX_SECONDS
+        )
+
     if len(last_result) > 0:
         yield [org_id for org_id, _ in last_result]
 
@@ -369,7 +374,9 @@ def fetch_project_transaction_totals(org_ids: List[int]) -> Iterator[ProjectTran
             }
 
     else:
-        log_query_timeout(query="fetch_project_transaction_totals", offset=offset)
+        log_query_timeout(
+            query="fetch_project_transaction_totals", offset=offset, timeout_seconds=MAX_SECONDS
+        )
 
     return None
 
@@ -495,7 +502,11 @@ def fetch_transactions_with_total_volumes(
             }
             break
     else:
-        log_query_timeout(query="fetch_transactions_with_total_volumes", offset=offset)
+        log_query_timeout(
+            query="fetch_transactions_with_total_volumes",
+            offset=offset,
+            timeout_seconds=MAX_SECONDS,
+        )
 
     return None
 
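For the generator variants (get_orgs_with_project_counts above, get_active_orgs_with_projects_counts in common.py below), hitting the budget is logged but not fatal: the trailing "if len(last_result) > 0: yield ..." still hands back whatever was collected before time ran out. A hedged sketch of that shape, with orgs_in_batches and run_query as illustrative stand-ins rather than the real functions:

import time

from sentry.dynamic_sampling.tasks.constants import MAX_SECONDS
from sentry.dynamic_sampling.tasks.logging import log_query_timeout


def orgs_in_batches(run_query, max_orgs=1000):
    # Yields full batches of org ids while time remains, logs a timeout if the
    # budget runs out, and yields the final partial batch either way.
    start_time = time.time()
    offset = 0
    last_result = []

    while (time.time() - start_time) < MAX_SECONDS:
        rows, more_results = run_query(offset=offset)
        offset += len(rows)
        last_result.extend(org_id for org_id, _count in rows)

        if len(last_result) >= max_orgs:
            yield last_result[:max_orgs]
            last_result = last_result[max_orgs:]
        if not more_results:
            break
    else:
        log_query_timeout(query="orgs_in_batches", offset=offset, timeout_seconds=MAX_SECONDS)

    if len(last_result) > 0:
        yield last_result
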
11 changes: 10 additions & 1 deletion src/sentry/dynamic_sampling/tasks/common.py
@@ -108,6 +108,11 @@ def get_active_orgs_with_projects_counts(
             last_result = last_result[first_idx:]
         if not more_results:
             break
+    else:
+        log_query_timeout(
+            query="get_active_orgs_with_projects_counts", offset=offset, timeout_seconds=MAX_SECONDS
+        )
+
     if len(last_result) > 0:
         yield [org_id for org_id, _ in last_result]
 
@@ -175,7 +180,11 @@ def fetch_orgs_with_total_root_transactions_count(
         if not more_results:
             break
     else:
-        log_query_timeout(query="fetch_orgs_with_total_root_transactions_count", offset=offset)
+        log_query_timeout(
+            query="fetch_orgs_with_total_root_transactions_count",
+            offset=offset,
+            timeout_seconds=MAX_SECONDS,
+        )
 
     return aggregated_projects
 
13 changes: 11 additions & 2 deletions src/sentry/dynamic_sampling/tasks/logging.py
@@ -1,6 +1,8 @@
 import logging
 from typing import Any, Callable, Dict, Optional
 
+from sentry.utils import metrics
+
 logger = logging.getLogger(__name__)
 
 
@@ -37,8 +39,15 @@ def log_sample_rate_source(
     )
 
 
-def log_query_timeout(query: str, offset: int) -> None:
-    logger.error("dynamic_sampling.query_timeout", extra={"query": query, "offset": offset})
+def log_query_timeout(query: str, offset: int, timeout_seconds: int) -> None:
+    logger.error(
+        "dynamic_sampling.query_timeout",
+        extra={"query": query, "offset": offset, "timeout_seconds": timeout_seconds},
+    )
+
+    # We also want to collect a metric, in order to measure how many retries we are having. It may help us to spot
+    # possible problems on the Snuba end that affect query performance.
+    metrics.incr("dynamic_sampling.query_timeout", tags={"query": query})
 
 
 def log_recalibrate_org_error(org_id: int, error: str) -> None:
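The reworked log_query_timeout now records the caller's budget in the log entry and increments a counter tagged by query name. Below is a sketch of a test that pins down both effects; the assertions simply restate the diff above, while the mock targets and the sample values are assumptions for illustration, not part of this commit.

from unittest import mock

from sentry.dynamic_sampling.tasks.logging import log_query_timeout


def test_log_query_timeout_emits_log_and_metric():
    with mock.patch(
        "sentry.dynamic_sampling.tasks.logging.logger.error"
    ) as error, mock.patch(
        "sentry.dynamic_sampling.tasks.logging.metrics.incr"
    ) as incr:
        log_query_timeout(query="get_active_orgs", offset=3000, timeout_seconds=600)

    error.assert_called_once_with(
        "dynamic_sampling.query_timeout",
        extra={"query": "get_active_orgs", "offset": 3000, "timeout_seconds": 600},
    )
    incr.assert_called_once_with(
        "dynamic_sampling.query_timeout", tags={"query": "get_active_orgs"}
    )
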
12 changes: 10 additions & 2 deletions src/sentry/dynamic_sampling/tasks/recalibrate_orgs.py
@@ -19,7 +19,6 @@
 from sentry.dynamic_sampling.tasks.common import get_adjusted_base_rate_from_cache_or_compute
 from sentry.dynamic_sampling.tasks.constants import (
     MAX_REBALANCE_FACTOR,
-    MAX_SECONDS,
     MIN_REBALANCE_FACTOR,
     RECALIBRATE_ORGS_QUERY_INTERVAL,
 )
@@ -31,6 +30,7 @@
 )
 from sentry.dynamic_sampling.tasks.logging import (
     log_action_if,
+    log_query_timeout,
     log_recalibrate_org_error,
     log_recalibrate_org_state,
     log_sample_rate_source,
@@ -43,6 +43,10 @@
 from sentry.tasks.base import instrumented_task
 from sentry.utils.snuba import raw_snql_query
 
+# Since we are using a granularity of 60 (minute granularity), we want to have a higher time upper limit for executing
+# multiple queries on Snuba.
+RECALIBRATE_ORGS_MAX_SECONDS = 600
+
 
 class RecalibrationError(Exception):
     def __init__(self, org_id, message):
@@ -166,7 +170,7 @@ def get_active_orgs(
     metric_id = indexer.resolve_shared_org(str(TransactionMRI.COUNT_PER_ROOT_PROJECT.value))
     offset = 0
 
-    while (time.time() - start_time) < MAX_SECONDS:
+    while (time.time() - start_time) < RECALIBRATE_ORGS_MAX_SECONDS:
         query = (
             Query(
                 match=Entity(EntityKey.GenericOrgMetricsCounters.value),
@@ -212,6 +216,10 @@ def get_active_orgs(
 
         if not more_results:
             return
+    else:
+        log_query_timeout(
+            query="get_active_orgs", offset=offset, timeout_seconds=RECALIBRATE_ORGS_MAX_SECONDS
+        )
 
 
 def fetch_org_volumes(
6 changes: 5 additions & 1 deletion src/sentry/dynamic_sampling/tasks/sliding_window.py
@@ -208,6 +208,10 @@ def fetch_projects_with_total_root_transactions_count(
         if not more_results:
             break
     else:
-        log_query_timeout(query="fetch_projects_with_total_root_transactions_count", offset=offset)
+        log_query_timeout(
+            query="fetch_projects_with_total_root_transactions_count",
+            offset=offset,
+            timeout_seconds=MAX_SECONDS,
+        )
 
     return aggregated_projects
