From e8980ad9e77b1caac09c5a00008f68b165396e9c Mon Sep 17 00:00:00 2001
From: Eduardo Filho <edugomfilho@gmail.com>
Date: Thu, 23 Nov 2023 17:06:20 -0500
Subject: [PATCH] Glam accounts for sampling when calculating sample_count for
 windows & release probes (#4581)

* Glam - fix legacy Windows & release probes' sample count going forward

* Glam FOG accounts for sampling when calculating total_sample for Windows & release probes

* FOG - fix client count and sample count

* Add channel filtering for FOG
---
 bigquery_etl/glam/generate.py                 |  8 ++++-
 .../glam/templates/probe_counts_v1.sql        | 36 +++++++++++--------
 .../glam/templates/view_sample_counts_v1.sql  | 29 +++++++++++----
 .../glam_sample_counts_v1/query.sql           | 24 +++++++------
 4 files changed, 66 insertions(+), 31 deletions(-)
diff --git a/bigquery_etl/glam/generate.py b/bigquery_etl/glam/generate.py
index 02e1a0d9002..b0e0699ffc4 100644
--- a/bigquery_etl/glam/generate.py
+++ b/bigquery_etl/glam/generate.py
@@ -278,6 +278,7 @@ def main():
                 source_table=f"glam_etl.{args.prefix}__scalar_bucket_counts_v1",
                 is_scalar=True,
             ),
+            channel=channel_prefixes[args.prefix],
         ),
         table(
             "probe_counts_v1",
@@ -286,6 +287,7 @@ def main():
                 source_table=f"glam_etl.{args.prefix}__histogram_bucket_counts_v1",
                 is_scalar=False,
             ),
+            channel=channel_prefixes[args.prefix],
         ),
         table(
             "scalar_percentiles_v1",
@@ -296,7 +298,11 @@ def main():
         table("histogram_percentiles_v1"),
         view("view_probe_counts_v1"),
         view("view_user_counts_v1", **models.user_counts()),
-        view("view_sample_counts_v1", **models.sample_counts()),
+        view(
+            "view_sample_counts_v1",
+            **models.sample_counts(),
+            channel=channel_prefixes[args.prefix],
+        ),
         table("extract_user_counts_v1", **config[args.prefix]),
         table("extract_probe_counts_v1", **config[args.prefix]),
     ]
diff --git a/bigquery_etl/glam/templates/probe_counts_v1.sql b/bigquery_etl/glam/templates/probe_counts_v1.sql
index 49371a58efc..c83ae068360 100644
--- a/bigquery_etl/glam/templates/probe_counts_v1.sql
+++ b/bigquery_etl/glam/templates/probe_counts_v1.sql
@@ -41,13 +41,17 @@ SELECT
     {% if is_scalar %}
         client_agg_type,
         agg_type,
-        -- Logic to count clients based on sampled windows release data.
-        -- If you're changing this, then you'll also need to change
-        -- clients_daily_[scalar | histogram]_aggregates
-        IF(os = 'Windows' AND channel = 'release',
-          SUM(count) * 10,
-          SUM(count)
-        ) AS total_users,
+        {% if channel == "release" %}
+          -- Windows release data is sampled starting with v119, so client counts
+          -- for those rows are scaled up by 10. If you're changing this, then
+          -- you'll also need to change clients_daily_[scalar | histogram]_aggregates
+          IF(os = 'Windows' AND app_version >= 119,
+            SUM(count) * 10,
+            SUM(count)
+          ) AS total_users,
+        {% else %}
+          SUM(count) AS total_users,
+        {% endif %}
         mozfun.glam.histogram_fill_buckets_dirichlet(
             mozfun.map.sum(ARRAY_AGG(STRUCT<key STRING, value FLOAT64>(bucket, count))),
             CASE
@@ -64,13 +68,17 @@ SELECT
     {% else %}
         agg_type AS client_agg_type,
         'histogram' as agg_type,
-        -- Logic to count clients based on sampled windows release data.
-        -- If you're changing this, then you'll also need to change
-        -- clients_daily_[scalar | histogram]_aggregates
-        IF(os = 'Windows' AND channel = 'release',
-          CAST(ROUND(SUM(record.value)) AS INT64) * 10,
-          CAST(ROUND(SUM(record.value)) AS INT64)
-        ) AS total_users,
+        {% if channel == "release" %}
+          -- Windows release data is sampled starting with v119, so the rounded
+          -- client count for those rows is multiplied by 10; all other rows
+          -- keep a multiplier of 1. If you're changing this, then you'll also
+          -- need to change clients_daily_[scalar | histogram]_aggregates
+          CAST(ROUND(SUM(record.value)) AS INT64)
+          * IF(os = 'Windows' AND app_version >= 119, 10, 1)
+          AS total_users,
+        {% else %}
+          CAST(ROUND(SUM(record.value)) AS INT64) AS total_users,
+        {% endif %}
         mozfun.glam.histogram_fill_buckets_dirichlet(
             mozfun.map.sum(ARRAY_AGG(record)),
             mozfun.glam.histogram_buckets_cast_string_array(
diff --git a/bigquery_etl/glam/templates/view_sample_counts_v1.sql b/bigquery_etl/glam/templates/view_sample_counts_v1.sql
index 84e7a4171e8..81141710b33 100644
--- a/bigquery_etl/glam/templates/view_sample_counts_v1.sql
+++ b/bigquery_etl/glam/templates/view_sample_counts_v1.sql
@@ -14,6 +14,9 @@ WITH histogram_data AS (
     app_version,
     app_build_id,
     channel,
+    {% if channel == 'release' %}
+      IF(os = 'Windows', 10, 1) AS sample_mult,
+    {% endif %}
     h1.metric,
     h1.key,
     h1.agg_type,
@@ -29,6 +32,9 @@ scalars_histogram_data AS (
     app_version,
     app_build_id,
     channel,
+    {% if channel == 'release' %}
+      IF(os = 'Windows', 10, 1) AS sample_mult,
+    {% endif %}
     s1.metric,
     s1.key,
     agg_type,
@@ -36,7 +42,7 @@ scalars_histogram_data AS (
   FROM
     `{{ project }}.{{ dataset }}.{{ prefix }}__clients_scalar_aggregates_v1`, UNNEST(scalar_aggregates) s1
 
-  UNION ALL 
+  UNION ALL
 
   SELECT
     client_id,
@@ -45,6 +51,9 @@ scalars_histogram_data AS (
     app_version,
     app_build_id,
     channel,
+    {% if channel == 'release' %}
+      sample_mult,
+    {% endif %}
     metric,
     v1.key,
     agg_type,
@@ -64,16 +73,20 @@ scalars_histogram_data AS (
 }}
 SELECT
     {{ attributes }},
-    metric, 
+    metric,
     '' AS key,
     agg_type,
-    SUM(value) as total_sample
+    {% if channel == 'release' %}
+      SUM(value) * MAX(sample_mult) as total_sample
+    {% else %}
+      SUM(value) as total_sample
+    {% endif %}
 FROM
     all_combos
 WHERE agg_type = 'summed_histogram'
 GROUP BY
-    {{ attributes }}, 
-    metric, 
+    {{ attributes }},
+    metric,
     key,
     agg_type
 
@@ -84,7 +97,11 @@ SELECT
     metric,
     key,
     agg_type,
-    SUM(value) as total_sample
+    {% if channel == 'release' %}
+      SUM(value) * MAX(sample_mult) as total_sample
+    {% else %}
+      SUM(value) as total_sample
+    {% endif %}
 FROM
     all_combos
 WHERE agg_type <> 'summed_histogram'
diff --git a/sql/moz-fx-data-shared-prod/telemetry_derived/glam_sample_counts_v1/query.sql b/sql/moz-fx-data-shared-prod/telemetry_derived/glam_sample_counts_v1/query.sql
index 00304791814..7f26e148dfd 100644
--- a/sql/moz-fx-data-shared-prod/telemetry_derived/glam_sample_counts_v1/query.sql
+++ b/sql/moz-fx-data-shared-prod/telemetry_derived/glam_sample_counts_v1/query.sql
@@ -8,7 +8,9 @@ WITH histogram_data AS (
     process,
     key,
     h1.agg_type,
-    h1.aggregates
+    h1.aggregates,
+    IF(os = 'Windows'
+    AND channel = 'release', 10, 1) AS sample_mult
   FROM
     clients_histogram_aggregates_v2,
     UNNEST(histogram_aggregates) h1
@@ -21,7 +23,9 @@ scalars_data AS (
     app_version,
     app_build_id,
     channel,
-    scalar_aggregates
+    scalar_aggregates,
+    IF(os = 'Windows'
+    AND channel = 'release', 10, 1) AS sample_mult
   FROM
     clients_scalar_aggregates_v1
   WHERE
@@ -36,7 +40,7 @@ SELECT
   process,
   histogram_data.key,
   agg_type,
-  SUM(v1.value) AS total_sample
+  SUM(v1.value) * MAX(sample_mult) AS total_sample
 FROM
   histogram_data,
   UNNEST(aggregates) v1
@@ -59,7 +63,7 @@ SELECT
   process,
   histogram_data.key,
   agg_type,
-  SUM(v1.value) AS total_sample
+  SUM(v1.value) * MAX(sample_mult) AS total_sample
 FROM
   histogram_data,
   UNNEST(aggregates) v1
@@ -81,7 +85,7 @@ SELECT
   process,
   histogram_data.key,
   agg_type,
-  SUM(v1.value) AS total_sample
+  SUM(v1.value) * MAX(sample_mult) AS total_sample
 FROM
   histogram_data,
   UNNEST(aggregates) v1
@@ -103,7 +107,7 @@ SELECT
   process,
   histogram_data.key,
   agg_type,
-  SUM(v1.value) AS total_sample
+  SUM(v1.value) * MAX(sample_mult) AS total_sample
 FROM
   histogram_data,
   UNNEST(aggregates) v1
@@ -126,7 +130,7 @@ SELECT
   agg_type,
   CASE
     WHEN agg_type IN ('count', 'true', 'false')
-      THEN SUM(value)
+      THEN SUM(value) * MAX(sample_mult)
     ELSE NULL
   END AS total_sample
 FROM
@@ -153,7 +157,7 @@ SELECT
   agg_type,
   CASE
     WHEN agg_type IN ('count', 'true', 'false')
-      THEN SUM(value)
+      THEN SUM(value) * MAX(sample_mult)
     ELSE NULL
   END AS total_sample
 FROM
@@ -179,7 +183,7 @@ SELECT
   agg_type,
   CASE
     WHEN agg_type IN ('count', 'true', 'false')
-      THEN SUM(value)
+      THEN SUM(value) * MAX(sample_mult)
     ELSE NULL
   END AS total_sample
 FROM
@@ -205,7 +209,7 @@ SELECT
   agg_type,
   CASE
     WHEN agg_type IN ('count', 'true', 'false')
-      THEN SUM(value)
+      THEN SUM(value) * MAX(sample_mult)
     ELSE NULL
   END AS total_sample
 FROM