From e8980ad9e77b1caac09c5a00008f68b165396e9c Mon Sep 17 00:00:00 2001 From: Eduardo Filho Date: Thu, 23 Nov 2023 17:06:20 -0500 Subject: [PATCH] Glam accounts for sampling when calculating sample_count for windows & release probes (#4581) * Glam - fix legacy windows & release probes' sample count going fwd * Glam FOG accounts for sampling when calculating total_sample for windows & release probes * fog - fix client count and sample count * Add channel filtering for fog --- bigquery_etl/glam/generate.py | 8 ++++- .../glam/templates/probe_counts_v1.sql | 36 +++++++++++-------- .../glam/templates/view_sample_counts_v1.sql | 29 +++++++++++---- .../glam_sample_counts_v1/query.sql | 24 +++++++------ 4 files changed, 66 insertions(+), 31 deletions(-) diff --git a/bigquery_etl/glam/generate.py b/bigquery_etl/glam/generate.py index 02e1a0d9002..b0e0699ffc4 100644 --- a/bigquery_etl/glam/generate.py +++ b/bigquery_etl/glam/generate.py @@ -278,6 +278,7 @@ def main(): source_table=f"glam_etl.{args.prefix}__scalar_bucket_counts_v1", is_scalar=True, ), + channel=channel_prefixes[args.prefix], ), table( "probe_counts_v1", @@ -286,6 +287,7 @@ def main(): source_table=f"glam_etl.{args.prefix}__histogram_bucket_counts_v1", is_scalar=False, ), + channel=channel_prefixes[args.prefix], ), table( "scalar_percentiles_v1", @@ -296,7 +298,11 @@ def main(): table("histogram_percentiles_v1"), view("view_probe_counts_v1"), view("view_user_counts_v1", **models.user_counts()), - view("view_sample_counts_v1", **models.sample_counts()), + view( + "view_sample_counts_v1", + **models.sample_counts(), + channel=channel_prefixes[args.prefix], + ), table("extract_user_counts_v1", **config[args.prefix]), table("extract_probe_counts_v1", **config[args.prefix]), ] diff --git a/bigquery_etl/glam/templates/probe_counts_v1.sql b/bigquery_etl/glam/templates/probe_counts_v1.sql index 49371a58efc..c83ae068360 100644 --- a/bigquery_etl/glam/templates/probe_counts_v1.sql +++ b/bigquery_etl/glam/templates/probe_counts_v1.sql @@ -41,13 +41,17 @@ SELECT {% if is_scalar %} client_agg_type, agg_type, - -- Logic to count clients based on sampled windows release data. - -- If you're changing this, then you'll also need to change - -- clients_daily_[scalar | histogram]_aggregates - IF(os = 'Windows' AND channel = 'release', - SUM(count) * 10, - SUM(count) - ) AS total_users, + {%if channel == "release" %} + -- Logic to count clients based on sampled windows release data, which started in v119. + -- If you're changing this, then you'll also need to change + -- clients_daily_[scalar | histogram]_aggregates + IF(os = 'Windows' AND app_version >= 119, + SUM(count) * 10, + SUM(count) + ) AS total_users, + {% else %} + SUM(count) AS total_users, + {% endif %} mozfun.glam.histogram_fill_buckets_dirichlet( mozfun.map.sum(ARRAY_AGG(STRUCT(bucket, count))), CASE @@ -64,13 +68,17 @@ SELECT {% else %} agg_type AS client_agg_type, 'histogram' as agg_type, - -- Logic to count clients based on sampled windows release data. - -- If you're changing this, then you'll also need to change - -- clients_daily_[scalar | histogram]_aggregates - IF(os = 'Windows' AND channel = 'release', - CAST(ROUND(SUM(record.value)) AS INT64) * 10, - CAST(ROUND(SUM(record.value)) AS INT64) - ) AS total_users, + {% if channel == "release" %} + -- Logic to count clients based on sampled windows release data, which started in v119. + -- If you're changing this, then you'll also need to change + -- clients_daily_[scalar | histogram]_aggregates + IF(os = 'Windows' AND app_version >= 119, + CAST(ROUND(SUM(record.value)) AS INT64) * 10, + CAST(ROUND(SUM(record.value)) AS INT64) + ) AS total_users, + {% else %} + CAST(ROUND(SUM(record.value)) AS INT64) AS total_users, + {% endif %} mozfun.glam.histogram_fill_buckets_dirichlet( mozfun.map.sum(ARRAY_AGG(record)), mozfun.glam.histogram_buckets_cast_string_array( diff --git a/bigquery_etl/glam/templates/view_sample_counts_v1.sql b/bigquery_etl/glam/templates/view_sample_counts_v1.sql index 84e7a4171e8..81141710b33 100644 --- a/bigquery_etl/glam/templates/view_sample_counts_v1.sql +++ b/bigquery_etl/glam/templates/view_sample_counts_v1.sql @@ -14,6 +14,9 @@ WITH histogram_data AS ( app_version, app_build_id, channel, + {% if channel == 'release' %} + IF(os = 'Windows', 10, 1) AS sample_mult, + {% endif %} h1.metric, h1.key, h1.agg_type, @@ -29,6 +32,9 @@ scalars_histogram_data AS ( app_version, app_build_id, channel, + {% if channel == 'release' %} + IF(os = 'Windows', 10, 1) AS sample_mult, + {% endif %} s1.metric, s1.key, agg_type, @@ -36,7 +42,7 @@ scalars_histogram_data AS ( FROM `{{ project }}.{{ dataset }}.{{ prefix }}__clients_scalar_aggregates_v1`, UNNEST(scalar_aggregates) s1 - UNION ALL + UNION ALL SELECT client_id, @@ -45,6 +51,9 @@ scalars_histogram_data AS ( app_version, app_build_id, channel, + {% if channel == 'release' %} + sample_mult, + {% endif %} metric, v1.key, agg_type, @@ -64,16 +73,20 @@ scalars_histogram_data AS ( }} SELECT {{ attributes }}, - metric, + metric, '' AS key, agg_type, - SUM(value) as total_sample + {% if channel == 'release' %} + SUM(value) * MAX(sample_mult) as total_sample + {% else %} + SUM(value) as total_sample + {% endif %} FROM all_combos WHERE agg_type = 'summed_histogram' GROUP BY - {{ attributes }}, - metric, + {{ attributes }}, + metric, key, agg_type @@ -84,7 +97,11 @@ SELECT metric, key, agg_type, - SUM(value) as total_sample + {% if channel == 'release' %} + SUM(value) * MAX(sample_mult) as total_sample + {% else %} + SUM(value) as total_sample + {% endif %} FROM all_combos WHERE agg_type <> 'summed_histogram' diff --git a/sql/moz-fx-data-shared-prod/telemetry_derived/glam_sample_counts_v1/query.sql b/sql/moz-fx-data-shared-prod/telemetry_derived/glam_sample_counts_v1/query.sql index 00304791814..7f26e148dfd 100644 --- a/sql/moz-fx-data-shared-prod/telemetry_derived/glam_sample_counts_v1/query.sql +++ b/sql/moz-fx-data-shared-prod/telemetry_derived/glam_sample_counts_v1/query.sql @@ -8,7 +8,9 @@ WITH histogram_data AS ( process, key, h1.agg_type, - h1.aggregates + h1.aggregates, + IF(os = 'Windows' + AND channel = 'release', 10, 1) AS sample_mult FROM clients_histogram_aggregates_v2, UNNEST(histogram_aggregates) h1 @@ -21,7 +23,9 @@ scalars_data AS ( app_version, app_build_id, channel, - scalar_aggregates + scalar_aggregates, + IF(os = 'Windows' + AND channel = 'release', 10, 1) AS sample_mult FROM clients_scalar_aggregates_v1 WHERE @@ -36,7 +40,7 @@ SELECT process, histogram_data.key, agg_type, - SUM(v1.value) AS total_sample + SUM(v1.value) * MAX(sample_mult) AS total_sample FROM histogram_data, UNNEST(aggregates) v1 @@ -59,7 +63,7 @@ SELECT process, histogram_data.key, agg_type, - SUM(v1.value) AS total_sample + SUM(v1.value) * MAX(sample_mult) AS total_sample FROM histogram_data, UNNEST(aggregates) v1 @@ -81,7 +85,7 @@ SELECT process, histogram_data.key, agg_type, - SUM(v1.value) AS total_sample + SUM(v1.value) * MAX(sample_mult) AS total_sample FROM histogram_data, UNNEST(aggregates) v1 @@ -103,7 +107,7 @@ SELECT process, histogram_data.key, agg_type, - SUM(v1.value) AS total_sample + SUM(v1.value) * MAX(sample_mult) AS total_sample FROM histogram_data, UNNEST(aggregates) v1 @@ -126,7 +130,7 @@ SELECT agg_type, CASE WHEN agg_type IN ('count', 'true', 'false') - THEN SUM(value) + THEN SUM(value) * MAX(sample_mult) ELSE NULL END AS total_sample FROM @@ -153,7 +157,7 @@ SELECT agg_type, CASE WHEN agg_type IN ('count', 'true', 'false') - THEN SUM(value) + THEN SUM(value) * MAX(sample_mult) ELSE NULL END AS total_sample FROM @@ -179,7 +183,7 @@ SELECT agg_type, CASE WHEN agg_type IN ('count', 'true', 'false') - THEN SUM(value) + THEN SUM(value) * MAX(sample_mult) ELSE NULL END AS total_sample FROM @@ -205,7 +209,7 @@ SELECT agg_type, CASE WHEN agg_type IN ('count', 'true', 'false') - THEN SUM(value) + THEN SUM(value) * MAX(sample_mult) ELSE NULL END AS total_sample FROM