Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

replace statsd with prometheus client #6897

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@ venv/
/.github/
/netlify.toml
/setup/
.github/
justinclift marked this conversation as resolved.
Show resolved Hide resolved
27 changes: 15 additions & 12 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ sqlalchemy-searchable = "1.2.0"
sqlalchemy-utils = "0.34.2"
sqlparse = "0.5.0"
sshtunnel = "0.1.5"
statsd = "3.3.0"
prometheus-client = "0.20.0"
supervisor = "4.1.0"
supervisor-checks = "0.8.1"
ua-parser = "0.18.0"
Expand Down
7 changes: 5 additions & 2 deletions redash/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from flask_limiter.util import get_remote_address
from flask_mail import Mail
from flask_migrate import Migrate
from statsd import StatsClient
from prometheus_client.bridge.graphite import GraphiteBridge

from redash import settings
from redash.app import create_app # noqa
Expand Down Expand Up @@ -43,11 +43,14 @@ def setup_logging():

setup_logging()

if settings.STATSD_ENABLED:
gb = GraphiteBridge((settings.STATSD_HOST, settings.STATSD_PORT), tags=settings.STATSD_USE_TAGS)
gb.start(settings.STATSD_PERIOD, settings.STATSD_PREFIX)

redis_connection = redis.from_url(settings.REDIS_URL)
rq_redis_connection = redis.from_url(settings.RQ_REDIS_URL)
mail = Mail()
migrate = Migrate(compare_type=True)
statsd_client = StatsClient(host=settings.STATSD_HOST, port=settings.STATSD_PORT, prefix=settings.STATSD_PREFIX)
limiter = Limiter(key_func=get_remote_address, storage_uri=settings.LIMITER_STORAGE)

import_query_runners(settings.QUERY_RUNNERS)
Expand Down
3 changes: 3 additions & 0 deletions redash/app.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from flask import Flask
from prometheus_client import make_wsgi_app
from werkzeug.middleware.dispatcher import DispatcherMiddleware
from werkzeug.middleware.proxy_fix import ProxyFix

from redash import settings
Expand Down Expand Up @@ -39,6 +41,7 @@ def create_app():

sentry.init()
app = Redash()
app.wsgi_app = DispatcherMiddleware(app.wsgi_app, {"/metrics": make_wsgi_app()})

security.init_app(app)
request_metrics.init_app(app)
Expand Down
11 changes: 8 additions & 3 deletions redash/metrics/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,20 @@
import time

from flask import g, has_request_context
from prometheus_client import Histogram
from sqlalchemy.engine import Engine
from sqlalchemy.event import listens_for
from sqlalchemy.orm.util import _ORMJoin
from sqlalchemy.sql.selectable import Alias

from redash import statsd_client

metrics_logger = logging.getLogger("metrics")

dbActionLatencyHistogram = Histogram(
"db_action_latency_milliseconds",
"Database operation duration",
["name", "action"],
)


def _table_name_from_select_element(elt):
t = elt.froms[0]
Expand Down Expand Up @@ -48,7 +53,7 @@ def after_execute(conn, elt, multiparams, params, result):

action = action.lower()

statsd_client.timing("db.{}.{}".format(name, action), duration)
dbActionLatencyHistogram.labels(name, action).observe(duration)
metrics_logger.debug("table=%s query=%s duration=%.2f", name, action, duration)

if has_request_context():
Expand Down
11 changes: 8 additions & 3 deletions redash/metrics/request.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,16 @@
from collections import namedtuple

from flask import g, request

from redash import statsd_client
from prometheus_client import Histogram

metrics_logger = logging.getLogger("metrics")

requestsHistogram = Histogram(
"requests_latency_milliseconds",
"Requests latency",
["endpoint", "method"],
)


def record_request_start_time():
g.start_time = time.time()
Expand Down Expand Up @@ -35,7 +40,7 @@ def calculate_metrics(response):
queries_duration,
)

statsd_client.timing("requests.{}.{}".format(endpoint, request.method.lower()), request_duration)
requestsHistogram.labels(endpoint, request.method.lower()).observe(request_duration)

return response

Expand Down
6 changes: 6 additions & 0 deletions redash/query_runner/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import sqlparse
from dateutil import parser
from prometheus_client import Counter
from rq.timeouts import JobTimeoutException
from sshtunnel import open_tunnel

Expand Down Expand Up @@ -120,6 +121,11 @@ class BaseQueryRunner:
limit_query = " LIMIT 1000"
limit_keywords = ["LIMIT", "OFFSET"]
limit_after_select = False
queryRunnerResultsCounter = Counter(
"query_runner_results",
"Query Runner results counter",
["runner", "name", "status"],
)

def __init__(self, configuration):
self.syntax = "sql"
Expand Down
4 changes: 2 additions & 2 deletions redash/query_runner/databricks.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import logging
import os

from redash import __version__, statsd_client
from redash import __version__
from redash.query_runner import (
TYPE_BOOLEAN,
TYPE_DATE,
Expand Down Expand Up @@ -112,7 +112,7 @@ def run_query(self, query, user):

if len(result_set) >= ROW_LIMIT and cursor.fetchone() is not None:
logger.warning("Truncated result set.")
statsd_client.incr("redash.query_runner.databricks.truncated")
self.queryRunnerResultsCounter.labels("databricks", self.name, "truncated").inc()
data["truncated"] = True
error = None
else:
Expand Down
2 changes: 2 additions & 0 deletions redash/settings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,11 @@
REDIS_URL = add_decode_responses_to_redis_url(_REDIS_URL)
PROXIES_COUNT = int(os.environ.get("REDASH_PROXIES_COUNT", "1"))

STATSD_ENABLED = parse_boolean(os.environ.get("REDASH_STATSD_ENABLED", "false"))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like STATSD was always unconditionally enabled before? Do we need to update the documentation?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It makes no sense to enable statsd by default if no remote endpoint is configured.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Statsd uses a push model, while Prometheus uses a pull model. With statsd you must specify a remote URL to push metrics to, whereas Prometheus exposes metrics at a configured endpoint for scraping.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it, makes sense!

STATSD_HOST = os.environ.get("REDASH_STATSD_HOST", "127.0.0.1")
STATSD_PORT = int(os.environ.get("REDASH_STATSD_PORT", "8125"))
STATSD_PREFIX = os.environ.get("REDASH_STATSD_PREFIX", "redash")
STATSD_PERIOD = float(os.environ.get("REDASH_STATSD_PERIOD", "30.0"))
STATSD_USE_TAGS = parse_boolean(os.environ.get("REDASH_STATSD_USE_TAGS", "false"))

# Connection settings for Redash's own database (where we store the queries, results, etc)
Expand Down
15 changes: 11 additions & 4 deletions redash/tasks/queries/maintenance.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import logging
import time

from prometheus_client import Counter
from rq.timeouts import JobTimeoutException

from redash import models, redis_connection, settings, statsd_client
from redash import models, redis_connection, settings
from redash.models.parameterized_query import (
InvalidParameterError,
QueryDetachedFromDataSourceError,
Expand All @@ -17,6 +18,12 @@

logger = get_job_logger(__name__)

refreshSchemaCounter = Counter(
"refresh_schema",
"Refresh schema operation status counter",
["status"],
)


def empty_schedules():
logger.info("Deleting schedules of past scheduled queries...")
Expand Down Expand Up @@ -169,17 +176,17 @@ def refresh_schema(data_source_id):
ds.id,
time.time() - start_time,
)
statsd_client.incr("refresh_schema.success")
refreshSchemaCounter.labels("success").inc()
except JobTimeoutException:
logger.info(
"task=refresh_schema state=timeout ds_id=%s runtime=%.2f",
ds.id,
time.time() - start_time,
)
statsd_client.incr("refresh_schema.timeout")
refreshSchemaCounter.labels("timeout").inc()
except Exception:
logger.warning("Failed refreshing schema for the data source: %s", ds.name, exc_info=1)
statsd_client.incr("refresh_schema.error")
refreshSchemaCounter.labels("error").inc()
logger.info(
"task=refresh_schema state=failed ds_id=%s runtime=%.2f",
ds.id,
Expand Down
19 changes: 12 additions & 7 deletions redash/tasks/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import signal
import sys

from prometheus_client import Gauge
from rq import Queue as BaseQueue
from rq.job import Job as BaseJob
from rq.job import JobStatus
Expand All @@ -13,14 +14,18 @@
Worker,
)

from redash import statsd_client

# HerokuWorker does not work in OSX https://github.com/getredash/redash/issues/5413
if sys.platform == "darwin":
BaseWorker = Worker
else:
BaseWorker = HerokuWorker

rqJobsCounter = Gauge(
"rq_jobs",
"RQ jobs status count",
["queue", "status"],
)


class CancellableJob(BaseJob):
def cancel(self, pipeline=None):
Expand All @@ -41,7 +46,7 @@ class StatsdRecordingQueue(BaseQueue):

def enqueue_job(self, *args, **kwargs):
job = super().enqueue_job(*args, **kwargs)
statsd_client.incr("rq.jobs.created.{}".format(self.name))
rqJobsCounter.labels(self.name, "created").inc()
return job


Expand All @@ -59,13 +64,13 @@ class StatsdRecordingWorker(BaseWorker):
"""

def execute_job(self, job, queue):
statsd_client.incr("rq.jobs.running.{}".format(queue.name))
statsd_client.incr("rq.jobs.started.{}".format(queue.name))
rqJobsCounter.labels(queue.name, "started").inc()
try:
rqJobsCounter.labels(queue.name, "running").inc()
super().execute_job(job, queue)
finally:
statsd_client.decr("rq.jobs.running.{}".format(queue.name))
statsd_client.incr("rq.jobs.{}.{}".format(job.get_status(), queue.name))
rqJobsCounter.labels(queue.name, "running").dec()
rqJobsCounter.labels(queue.name, job.get_status()).inc()


class HardLimitingWorker(BaseWorker):
Expand Down
3 changes: 2 additions & 1 deletion tests/handlers/test_authentication.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ def test_shows_reset_password_form(self):


class TestInvite(BaseTestCase):
def test_expired_invite_token(self):
@mock.patch("prometheus_client.Histogram.observe")
def test_expired_invite_token(self, inc):
with mock.patch("time.time") as patched_time:
patched_time.return_value = time.time() - (7 * 24 * 3600) - 10
token = invite_token(self.factory.user)
Expand Down
6 changes: 3 additions & 3 deletions tests/metrics/test_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
from tests import BaseTestCase


@patch("statsd.StatsClient.timing")
@patch("prometheus_client.Histogram.observe")
class TestDatabaseMetrics(BaseTestCase):
def test_db_request_records_statsd_metrics(self, timing):
def test_db_request_records_metrics(self, observe):
self.factory.create_query()
timing.assert_called_with("db.changes.insert", ANY)
observe.assert_called_with(ANY)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Feels like we're not testing as much as we were before here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The interface has changed: the `observe` function computes histograms and expects a duration as its input. We cannot predict how long a function takes to run, so the `ANY` value is used here. As for the metric name, it no longer needs to be passed on each call because, unlike with the statsd Python client, it is set once when the metric is initialized.

6 changes: 3 additions & 3 deletions tests/metrics/test_request.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
from tests import BaseTestCase


@patch("statsd.StatsClient.timing")
@patch("prometheus_client.Histogram.observe")
class TestRequestMetrics(BaseTestCase):
def test_flask_request_records_statsd_metrics(self, timing):
def test_flask_request_records_metrics(self, observe):
self.client.get("/ping")
timing.assert_called_once_with("requests.redash_ping.get", ANY)
observe.assert_called_once_with(ANY)
4 changes: 2 additions & 2 deletions tests/tasks/test_schedule.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,6 @@ def foo():

schedule_periodic_jobs([{"func": foo, "interval": 60}])

with patch("statsd.StatsClient.incr") as incr:
with patch("prometheus_client.Gauge.inc") as inc:
rq_scheduler.enqueue_jobs()
incr.assert_called_once_with("rq.jobs.created.periodic")
inc.assert_called_once()
Loading
Loading