Skip to content

Commit

Permalink
Glam accounts for sampling when calculating sample_count for windows …
Browse files Browse the repository at this point in the history
…& release probes (#4581)

* GLAM - fix legacy Windows & release probes' sample count going forward

* GLAM FOG accounts for sampling when calculating total_sample for Windows & release probes

* FOG - fix client count and sample count

* Add channel filtering for FOG
  • Loading branch information
edugfilho authored and irrationalagent committed Dec 11, 2023
1 parent e985859 commit e8980ad
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 31 deletions.
8 changes: 7 additions & 1 deletion bigquery_etl/glam/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,7 @@ def main():
source_table=f"glam_etl.{args.prefix}__scalar_bucket_counts_v1",
is_scalar=True,
),
channel=channel_prefixes[args.prefix],
),
table(
"probe_counts_v1",
Expand All @@ -286,6 +287,7 @@ def main():
source_table=f"glam_etl.{args.prefix}__histogram_bucket_counts_v1",
is_scalar=False,
),
channel=channel_prefixes[args.prefix],
),
table(
"scalar_percentiles_v1",
Expand All @@ -296,7 +298,11 @@ def main():
table("histogram_percentiles_v1"),
view("view_probe_counts_v1"),
view("view_user_counts_v1", **models.user_counts()),
view("view_sample_counts_v1", **models.sample_counts()),
view(
"view_sample_counts_v1",
**models.sample_counts(),
channel=channel_prefixes[args.prefix],
),
table("extract_user_counts_v1", **config[args.prefix]),
table("extract_probe_counts_v1", **config[args.prefix]),
]
Expand Down
36 changes: 22 additions & 14 deletions bigquery_etl/glam/templates/probe_counts_v1.sql
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,17 @@ SELECT
{% if is_scalar %}
client_agg_type,
agg_type,
-- Logic to count clients based on sampled windows release data.
-- If you're changing this, then you'll also need to change
-- clients_daily_[scalar | histogram]_aggregates
IF(os = 'Windows' AND channel = 'release',
SUM(count) * 10,
SUM(count)
) AS total_users,
{%if channel == "release" %}
-- Logic to count clients based on sampled windows release data, which started in v119.
-- If you're changing this, then you'll also need to change
-- clients_daily_[scalar | histogram]_aggregates
IF(os = 'Windows' AND app_version >= 119,
SUM(count) * 10,
SUM(count)
) AS total_users,
{% else %}
SUM(count) AS total_users,
{% endif %}
mozfun.glam.histogram_fill_buckets_dirichlet(
mozfun.map.sum(ARRAY_AGG(STRUCT<key STRING, value FLOAT64>(bucket, count))),
CASE
Expand All @@ -64,13 +68,17 @@ SELECT
{% else %}
agg_type AS client_agg_type,
'histogram' as agg_type,
-- Logic to count clients based on sampled windows release data.
-- If you're changing this, then you'll also need to change
-- clients_daily_[scalar | histogram]_aggregates
IF(os = 'Windows' AND channel = 'release',
CAST(ROUND(SUM(record.value)) AS INT64) * 10,
CAST(ROUND(SUM(record.value)) AS INT64)
) AS total_users,
{% if channel == "release" %}
-- Logic to count clients based on sampled windows release data, which started in v119.
-- If you're changing this, then you'll also need to change
-- clients_daily_[scalar | histogram]_aggregates
IF(os = 'Windows' AND app_version >= 119,
CAST(ROUND(SUM(record.value)) AS INT64) * 10,
CAST(ROUND(SUM(record.value)) AS INT64)
) AS total_users,
{% else %}
CAST(ROUND(SUM(record.value)) AS INT64) AS total_users,
{% endif %}
mozfun.glam.histogram_fill_buckets_dirichlet(
mozfun.map.sum(ARRAY_AGG(record)),
mozfun.glam.histogram_buckets_cast_string_array(
Expand Down
29 changes: 23 additions & 6 deletions bigquery_etl/glam/templates/view_sample_counts_v1.sql
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ WITH histogram_data AS (
app_version,
app_build_id,
channel,
{% if channel == 'release' %}
IF(os = 'Windows', 10, 1) AS sample_mult,
{% endif %}
h1.metric,
h1.key,
h1.agg_type,
Expand All @@ -29,14 +32,17 @@ scalars_histogram_data AS (
app_version,
app_build_id,
channel,
{% if channel == 'release' %}
IF(os = 'Windows', 10, 1) AS sample_mult,
{% endif %}
s1.metric,
s1.key,
agg_type,
s1.value
FROM
`{{ project }}.{{ dataset }}.{{ prefix }}__clients_scalar_aggregates_v1`, UNNEST(scalar_aggregates) s1

UNION ALL
UNION ALL

SELECT
client_id,
Expand All @@ -45,6 +51,9 @@ scalars_histogram_data AS (
app_version,
app_build_id,
channel,
{% if channel == 'release' %}
sample_mult,
{% endif %}
metric,
v1.key,
agg_type,
Expand All @@ -64,16 +73,20 @@ scalars_histogram_data AS (
}}
SELECT
{{ attributes }},
metric,
metric,
'' AS key,
agg_type,
SUM(value) as total_sample
{% if channel == 'release' %}
SUM(value) * MAX(sample_mult) as total_sample
{% else %}
SUM(value) as total_sample
{% endif %}
FROM
all_combos
WHERE agg_type = 'summed_histogram'
GROUP BY
{{ attributes }},
metric,
{{ attributes }},
metric,
key,
agg_type

Expand All @@ -84,7 +97,11 @@ SELECT
metric,
key,
agg_type,
SUM(value) as total_sample
{% if channel == 'release' %}
SUM(value) * MAX(sample_mult) as total_sample
{% else %}
SUM(value) as total_sample
{% endif %}
FROM
all_combos
WHERE agg_type <> 'summed_histogram'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@ WITH histogram_data AS (
process,
key,
h1.agg_type,
h1.aggregates
h1.aggregates,
IF(os = 'Windows'
AND channel = 'release', 10, 1) AS sample_mult
FROM
clients_histogram_aggregates_v2,
UNNEST(histogram_aggregates) h1
Expand All @@ -21,7 +23,9 @@ scalars_data AS (
app_version,
app_build_id,
channel,
scalar_aggregates
scalar_aggregates,
IF(os = 'Windows'
AND channel = 'release', 10, 1) AS sample_mult
FROM
clients_scalar_aggregates_v1
WHERE
Expand All @@ -36,7 +40,7 @@ SELECT
process,
histogram_data.key,
agg_type,
SUM(v1.value) AS total_sample
SUM(v1.value) * MAX(sample_mult) AS total_sample
FROM
histogram_data,
UNNEST(aggregates) v1
Expand All @@ -59,7 +63,7 @@ SELECT
process,
histogram_data.key,
agg_type,
SUM(v1.value) AS total_sample
SUM(v1.value) * MAX(sample_mult) AS total_sample
FROM
histogram_data,
UNNEST(aggregates) v1
Expand All @@ -81,7 +85,7 @@ SELECT
process,
histogram_data.key,
agg_type,
SUM(v1.value) AS total_sample
SUM(v1.value) * MAX(sample_mult) AS total_sample
FROM
histogram_data,
UNNEST(aggregates) v1
Expand All @@ -103,7 +107,7 @@ SELECT
process,
histogram_data.key,
agg_type,
SUM(v1.value) AS total_sample
SUM(v1.value) * MAX(sample_mult) AS total_sample
FROM
histogram_data,
UNNEST(aggregates) v1
Expand All @@ -126,7 +130,7 @@ SELECT
agg_type,
CASE
WHEN agg_type IN ('count', 'true', 'false')
THEN SUM(value)
THEN SUM(value) * MAX(sample_mult)
ELSE NULL
END AS total_sample
FROM
Expand All @@ -153,7 +157,7 @@ SELECT
agg_type,
CASE
WHEN agg_type IN ('count', 'true', 'false')
THEN SUM(value)
THEN SUM(value) * MAX(sample_mult)
ELSE NULL
END AS total_sample
FROM
Expand All @@ -179,7 +183,7 @@ SELECT
agg_type,
CASE
WHEN agg_type IN ('count', 'true', 'false')
THEN SUM(value)
THEN SUM(value) * MAX(sample_mult)
ELSE NULL
END AS total_sample
FROM
Expand All @@ -205,7 +209,7 @@ SELECT
agg_type,
CASE
WHEN agg_type IN ('count', 'true', 'false')
THEN SUM(value)
THEN SUM(value) * MAX(sample_mult)
ELSE NULL
END AS total_sample
FROM
Expand Down

0 comments on commit e8980ad

Please sign in to comment.