Skip to content

Commit

Permalink
Merge branch 'main' into review-checker-fix
Browse files Browse the repository at this point in the history
  • Loading branch information
alekhyamoz committed Nov 20, 2023
2 parents cc59448 + 05fed88 commit a1ce8d6
Show file tree
Hide file tree
Showing 5 changed files with 356 additions and 20 deletions.
1 change: 1 addition & 0 deletions bqetl_project.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,7 @@ dry_run:
- sql/moz-fx-data-shared-prod/firefox_desktop/top_sites/view.sql
- sql/moz-fx-data-shared-prod/firefox_desktop/quick_suggest/view.sql
- sql/moz-fx-data-shared-prod/stub_attribution_service_derived/dl_token_ga_attribution_lookup_v1/query.sql
- sql/moz-fx-data-shared-prod/stub_attribution_service/dl_token_ga_attribution_lookup/view.sql
# Materialized views
- sql/moz-fx-data-shared-prod/telemetry_derived/experiment_search_events_live_v1/init.sql
- sql/moz-fx-data-shared-prod/telemetry_derived/experiment_events_live_v1/init.sql
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,39 @@ RETURNS STRING AS (
END
);

WITH daily_sessions AS (
WITH historic_and_intraday AS (
-- Combines rows from the daily `ga_sessions_*` tables with rows from the
-- `ga_sessions_intraday_*` tables, using the same three-day _TABLE_SUFFIX window
-- (the day before @session_date through the day after it) for both sources.
-- NOTE(review): UNION ALL keeps every row from both sources, so a session present in both
-- the daily and the intraday tables shows up twice here -- confirm that the downstream
-- aggregation tolerates this.
SELECT
*
FROM
`moz-fx-data-marketing-prod.65789850.ga_sessions_*`
WHERE
-- This table is partitioned, so we only process the data around session_date.
-- To handle late-arriving data, we process 3 days of data each day (re-processing the past 2)
-- as separate Airflow tasks.
--
-- Here, we need to take data from yesterday, just in case some of our sessions from today
-- actually started yesterday. If they did, they'll be filtered out in the HAVING clause
-- of daily_sessions.
_TABLE_SUFFIX
BETWEEN FORMAT_DATE('%Y%m%d', DATE_SUB(@session_date, INTERVAL 1 DAY))
-- However, we have data for today that will arrive _tomorrow_! Some inter-day sessions
-- will be present in two days, with the same ids. A session should never span more
-- than two days though, see https://sql.telemetry.mozilla.org/queries/95882/source
-- If one does, our uniqueness check will alert us.
AND FORMAT_DATE('%Y%m%d', DATE_ADD(@session_date, INTERVAL 1 DAY))
UNION ALL
-- Intraday sessions are "real-time" exports of the current day's sessions.
-- Usually we wouldn't need these, but sometimes GA is slow in adding the
-- intraday sessions back into ga_sessions.
SELECT
*
FROM
`moz-fx-data-marketing-prod.65789850.ga_sessions_intraday_*`
WHERE
-- Same window as the daily tables above: yesterday through tomorrow, inclusive.
_TABLE_SUFFIX
BETWEEN FORMAT_DATE('%Y%m%d', DATE_SUB(@session_date, INTERVAL 1 DAY))
AND FORMAT_DATE('%Y%m%d', DATE_ADD(@session_date, INTERVAL 1 DAY))
),
daily_sessions AS (
SELECT
mozfun.ga.nullify_string(clientId) AS ga_client_id,
-- visitId (or sessionId in GA4) is only guaranteed to be unique within a single client; see the visitId entry at https://support.google.com/analytics/answer/3437719?hl=en
Expand All @@ -59,9 +91,9 @@ WITH daily_sessions AS (
MIN_BY(trafficSource.medium, visitStartTime) AS medium,
MIN_BY(trafficSource.keyword, visitStartTime) AS term,
MIN_BY(trafficSource.adContent, visitStartTime) AS content,
ARRAY_AGG(
mozfun.ga.nullify_string(trafficSource.adwordsClickInfo.gclId) IGNORE NULLS
)[0] AS gclid,
ARRAY_AGG(mozfun.ga.nullify_string(trafficSource.adwordsClickInfo.gclId) IGNORE NULLS)[
0
] AS gclid,
/* Device */
MIN_BY(device.deviceCategory, visitStartTime) AS device_category,
MIN_BY(device.mobileDeviceModel, visitStartTime) AS mobile_device_model,
Expand All @@ -72,26 +104,12 @@ WITH daily_sessions AS (
MIN_BY(device.browser, visitStartTime) AS browser,
MIN_BY(device.browserVersion, visitStartTime) AS browser_version,
FROM
`moz-fx-data-marketing-prod.65789850.ga_sessions_*`
WHERE
-- This table is partitioned, so we only process the data from session_date
-- To handle late-arriving data, we process 3 days of data each day (re-processing the past 2)
-- as separate Airflow tasks (or via bqetl backfill, I haven't decided yet)
--
-- Here, we need to take data from yesterday, just in case some of our sessions from today
-- actually started yesterday. If they did, they'll be filtered out in the HAVING clause
_TABLE_SUFFIX
BETWEEN FORMAT_DATE('%Y%m%d', DATE_SUB(@session_date, INTERVAL 1 DAY))
-- However, we have data for today that will arrive _tomorrow_! Some inter-day sessions
-- will be present in two days, with the same ids. A session should never span more
-- than two days though, see https://sql.telemetry.mozilla.org/queries/95882/source
-- If one does, our uniqueness check will alert us
AND FORMAT_DATE('%Y%m%d', DATE_ADD(@session_date, INTERVAL 1 DAY))
historic_and_intraday
GROUP BY
ga_client_id,
ga_session_id
HAVING
-- Don't include entries from today that started yesterday
-- Don't include entries that started yesterday or tomorrow
session_date = @session_date
)
SELECT
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
[
{
"mode": "NULLABLE",
"name": "visitId",
"type": "INTEGER"
},
{
"mode": "NULLABLE",
"name": "visitNumber",
"type": "INTEGER"
},
{
"mode": "NULLABLE",
"name": "visitStartTime",
"type": "INTEGER"
},
{
"mode": "NULLABLE",
"name": "date",
"type": "STRING"
},
{
"fields": [
{
"mode": "NULLABLE",
"name": "timeOnSite",
"type": "INTEGER"
},
{
"mode": "NULLABLE",
"name": "pageviews",
"type": "INTEGER"
}
],
"mode": "NULLABLE",
"name": "totals",
"type": "RECORD"
},
{
"fields": [
{
"mode": "NULLABLE",
"name": "campaign",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "source",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "medium",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "adContent",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "keyword",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "adwordsClickInfo",
"type": "RECORD",
"fields": [
{
"mode": "NULLABLE",
"name": "gclId",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "campaignId",
"type": "INTEGER"
}
]
}
],
"mode": "NULLABLE",
"name": "trafficSource",
"type": "RECORD"
},
{
"fields": [
{
"mode": "NULLABLE",
"name": "browser",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "browserVersion",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "operatingSystem",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "operatingSystemVersion",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "language",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "deviceCategory",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "mobileDeviceModel",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "mobileDeviceInfo",
"type": "STRING"
}
],
"mode": "NULLABLE",
"name": "device",
"type": "RECORD"
},
{
"fields": [
{
"mode": "NULLABLE",
"name": "country",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "region",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "city",
"type": "STRING"
}
],
"mode": "NULLABLE",
"name": "geoNetwork",
"type": "RECORD"
},
{
"fields": [
{
"fields": [
{
"mode": "NULLABLE",
"name": "pagePath",
"type": "STRING"
}
],
"mode": "NULLABLE",
"name": "page",
"type": "RECORD"
},
{
"fields": [
{
"mode": "NULLABLE",
"name": "landingScreenName",
"type": "STRING"
}
],
"mode": "NULLABLE",
"name": "appInfo",
"type": "RECORD"
},
{
"fields": [
{
"mode": "NULLABLE",
"name": "eventCategory",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "eventAction",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "eventLabel",
"type": "STRING"
}
],
"mode": "NULLABLE",
"name": "eventInfo",
"type": "RECORD"
},
{
"mode": "NULLABLE",
"name": "type",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "hitNumber",
"type": "INTEGER"
}
],
"mode": "REPEATED",
"name": "hits",
"type": "RECORD"
},
{
"mode": "NULLABLE",
"name": "clientId",
"type": "STRING"
}
]
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,34 @@
last_reported_stub_session_id: "laterStubSessionId"
all_reported_stub_session_ids: ["earlierStubSessionId", "laterStubSessionId"]
landing_screen: first
- ga_client_id: clientIntraday
ga_session_id: clientIntraday1
session_date: 2023-03-31
is_first_session: true
session_number: 1
time_on_site: 11
pageviews: 1
country: earlierCountry
region: earlierRegion
city: earlierCity
campaign_id: "1"
gclid: "earlierGclid"
campaign: "earlierCampaign"
source: "earlierSource"
medium: "earlierMedium"
content: "earlierContent"
term: "earlierKeyword"
device_category: "earlierDeviceCategory"
mobile_device_model: "earlierMobileDeviceModel"
mobile_device_string: "earlierMobileDeviceInfo"
os: "earlierOperatingSystem"
os_version: "earlierOperatingSystemVersion"
language: "earlierLanguage"
browser: "earlierBrowser"
browser_version: "earlierBrowserVersion"
had_download_event: true
last_reported_install_target: "desktop_release"
all_reported_install_targets: ["desktop_release"]
last_reported_stub_session_id: "laterStubSessionId"
all_reported_stub_session_ids: ["earlierStubSessionId", "laterStubSessionId"]
landing_screen: first
Loading

0 comments on commit a1ce8d6

Please sign in to comment.