Glam accounts for sampling when calculating sample_count for windows & release probes (#4581)

* Glam - fix legacy windows & release probes' sample count going fwd
* Glam FOG accounts for sampling when calculating total_sample for windows & release probes
* fog - fix client count and sample count
* Add channel filtering for fog
mozilla · Dec 11, 2023 · e8980ad · e8980ad
1 parent e985859
commit e8980ad
Show file tree

Hide file tree

Showing 4 changed files with 66 additions and 31 deletions.
diff --git a/bigquery_etl/glam/generate.py b/bigquery_etl/glam/generate.py
@@ -278,6 +278,7 @@ def main():
                 source_table=f"glam_etl.{args.prefix}__scalar_bucket_counts_v1",
                 is_scalar=True,
             ),
+            channel=channel_prefixes[args.prefix],
         ),
         table(
             "probe_counts_v1",
@@ -286,6 +287,7 @@ def main():
                 source_table=f"glam_etl.{args.prefix}__histogram_bucket_counts_v1",
                 is_scalar=False,
             ),
+            channel=channel_prefixes[args.prefix],
         ),
         table(
             "scalar_percentiles_v1",
@@ -296,7 +298,11 @@ def main():
         table("histogram_percentiles_v1"),
         view("view_probe_counts_v1"),
         view("view_user_counts_v1", **models.user_counts()),
-        view("view_sample_counts_v1", **models.sample_counts()),
+        view(
+            "view_sample_counts_v1",
+            **models.sample_counts(),
+            channel=channel_prefixes[args.prefix],
+        ),
         table("extract_user_counts_v1", **config[args.prefix]),
         table("extract_probe_counts_v1", **config[args.prefix]),
     ]

diff --git a/bigquery_etl/glam/templates/probe_counts_v1.sql b/bigquery_etl/glam/templates/probe_counts_v1.sql
@@ -41,13 +41,17 @@ SELECT
     {% if is_scalar %}
         client_agg_type,
         agg_type,
-        -- Logic to count clients based on sampled windows release data.
-        -- If you're changing this, then you'll also need to change
-        -- clients_daily_[scalar | histogram]_aggregates
-        IF(os = 'Windows' AND channel = 'release',
-          SUM(count) * 10,
-          SUM(count)
-        ) AS total_users,
+        {%if channel == "release" %}
+          -- Logic to count clients based on sampled windows release data, which started in v119.
+          -- If you're changing this, then you'll also need to change
+          -- clients_daily_[scalar | histogram]_aggregates
+          IF(os = 'Windows' AND app_version >= 119,
+            SUM(count) * 10,
+            SUM(count)
+          ) AS total_users,
+        {% else %}
+          SUM(count) AS total_users,
+        {% endif %}
         mozfun.glam.histogram_fill_buckets_dirichlet(
             mozfun.map.sum(ARRAY_AGG(STRUCT<key STRING, value FLOAT64>(bucket, count))),
             CASE
@@ -64,13 +68,17 @@ SELECT
     {% else %}
         agg_type AS client_agg_type,
         'histogram' as agg_type,
-        -- Logic to count clients based on sampled windows release data.
-        -- If you're changing this, then you'll also need to change
-        -- clients_daily_[scalar | histogram]_aggregates
-        IF(os = 'Windows' AND channel = 'release',
-          CAST(ROUND(SUM(record.value)) AS INT64) * 10,
-          CAST(ROUND(SUM(record.value)) AS INT64)
-        ) AS total_users,
+        {% if channel == "release" %}
+          -- Logic to count clients based on sampled windows release data, which started in v119.
+          -- If you're changing this, then you'll also need to change
+          -- clients_daily_[scalar | histogram]_aggregates
+          IF(os = 'Windows' AND app_version >= 119,
+            CAST(ROUND(SUM(record.value)) AS INT64) * 10,
+            CAST(ROUND(SUM(record.value)) AS INT64)
+          ) AS total_users,
+        {% else %}
+          CAST(ROUND(SUM(record.value)) AS INT64) AS total_users,
+        {% endif %}
         mozfun.glam.histogram_fill_buckets_dirichlet(
             mozfun.map.sum(ARRAY_AGG(record)),
             mozfun.glam.histogram_buckets_cast_string_array(

diff --git a/bigquery_etl/glam/templates/view_sample_counts_v1.sql b/bigquery_etl/glam/templates/view_sample_counts_v1.sql
@@ -14,6 +14,9 @@ WITH histogram_data AS (
     app_version,
     app_build_id,
     channel,
+    {% if channel == 'release' %}
+      IF(os = 'Windows', 10, 1) AS sample_mult,
+    {% endif %}
     h1.metric,
     h1.key,
     h1.agg_type,
@@ -29,14 +32,17 @@ scalars_histogram_data AS (
     app_version,
     app_build_id,
     channel,
+    {% if channel == 'release' %}
+      IF(os = 'Windows', 10, 1) AS sample_mult,
+    {% endif %}
     s1.metric,
     s1.key,
     agg_type,
     s1.value
   FROM
     `{{ project }}.{{ dataset }}.{{ prefix }}__clients_scalar_aggregates_v1`, UNNEST(scalar_aggregates) s1
 
-  UNION ALL 
+  UNION ALL
 
   SELECT
     client_id,
@@ -45,6 +51,9 @@ scalars_histogram_data AS (
     app_version,
     app_build_id,
     channel,
+    {% if channel == 'release' %}
+      sample_mult,
+    {% endif %}
     metric,
     v1.key,
     agg_type,
@@ -64,16 +73,20 @@ scalars_histogram_data AS (
 }}
 SELECT
     {{ attributes }},
-    metric, 
+    metric,
     '' AS key,
     agg_type,
-    SUM(value) as total_sample
+    {% if channel == 'release' %}
+      SUM(value) * MAX(sample_mult) as total_sample
+    {% else %}
+      SUM(value) as total_sample
+    {% endif %}
 FROM
     all_combos
 WHERE agg_type = 'summed_histogram'
 GROUP BY
-    {{ attributes }}, 
-    metric, 
+    {{ attributes }},
+    metric,
     key,
     agg_type
 
@@ -84,7 +97,11 @@ SELECT
     metric,
     key,
     agg_type,
-    SUM(value) as total_sample
+    {% if channel == 'release' %}
+      SUM(value) * MAX(sample_mult) as total_sample
+    {% else %}
+      SUM(value) as total_sample
+    {% endif %}
 FROM
     all_combos
 WHERE agg_type <> 'summed_histogram'

diff --git a/sql/moz-fx-data-shared-prod/telemetry_derived/glam_sample_counts_v1/query.sql b/sql/moz-fx-data-shared-prod/telemetry_derived/glam_sample_counts_v1/query.sql
@@ -8,7 +8,9 @@ WITH histogram_data AS (
     process,
     key,
     h1.agg_type,
-    h1.aggregates
+    h1.aggregates,
+    IF(os = 'Windows'
+    AND channel = 'release', 10, 1) AS sample_mult
   FROM
     clients_histogram_aggregates_v2,
     UNNEST(histogram_aggregates) h1
@@ -21,7 +23,9 @@ scalars_data AS (
     app_version,
     app_build_id,
     channel,
-    scalar_aggregates
+    scalar_aggregates,
+    IF(os = 'Windows'
+    AND channel = 'release', 10, 1) AS sample_mult
   FROM
     clients_scalar_aggregates_v1
   WHERE
@@ -36,7 +40,7 @@ SELECT
   process,
   histogram_data.key,
   agg_type,
-  SUM(v1.value) AS total_sample
+  SUM(v1.value) * MAX(sample_mult) AS total_sample
 FROM
   histogram_data,
   UNNEST(aggregates) v1
@@ -59,7 +63,7 @@ SELECT
   process,
   histogram_data.key,
   agg_type,
-  SUM(v1.value) AS total_sample
+  SUM(v1.value) * MAX(sample_mult) AS total_sample
 FROM
   histogram_data,
   UNNEST(aggregates) v1
@@ -81,7 +85,7 @@ SELECT
   process,
   histogram_data.key,
   agg_type,
-  SUM(v1.value) AS total_sample
+  SUM(v1.value) * MAX(sample_mult) AS total_sample
 FROM
   histogram_data,
   UNNEST(aggregates) v1
@@ -103,7 +107,7 @@ SELECT
   process,
   histogram_data.key,
   agg_type,
-  SUM(v1.value) AS total_sample
+  SUM(v1.value) * MAX(sample_mult) AS total_sample
 FROM
   histogram_data,
   UNNEST(aggregates) v1
@@ -126,7 +130,7 @@ SELECT
   agg_type,
   CASE
     WHEN agg_type IN ('count', 'true', 'false')
-      THEN SUM(value)
+      THEN SUM(value) * MAX(sample_mult)
     ELSE NULL
   END AS total_sample
 FROM
@@ -153,7 +157,7 @@ SELECT
   agg_type,
   CASE
     WHEN agg_type IN ('count', 'true', 'false')
-      THEN SUM(value)
+      THEN SUM(value) * MAX(sample_mult)
     ELSE NULL
   END AS total_sample
 FROM
@@ -179,7 +183,7 @@ SELECT
   agg_type,
   CASE
     WHEN agg_type IN ('count', 'true', 'false')
-      THEN SUM(value)
+      THEN SUM(value) * MAX(sample_mult)
     ELSE NULL
   END AS total_sample
 FROM
@@ -205,7 +209,7 @@ SELECT
   agg_type,
   CASE
     WHEN agg_type IN ('count', 'true', 'false')
-      THEN SUM(value)
+      THEN SUM(value) * MAX(sample_mult)
     ELSE NULL
   END AS total_sample
 FROM