Skip to content

Commit

Permalink
Merge pull request #47 from tinybirdco/feat/add_column_to_s3_ds/initi…
Browse files Browse the repository at this point in the history
…alization

Add column to S3 project initialization
  • Loading branch information
albertojuan committed Nov 8, 2023
2 parents 290a04c + c87e68a commit d4de697
Show file tree
Hide file tree
Showing 28 changed files with 780 additions and 0 deletions.
24 changes: 24 additions & 0 deletions .github/workflows/add_column_to_s3_ds_cd.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@

##################################################
### Visit https://github.com/tinybirdco/ci ###
### for more details or custom CI/CD ###
##################################################

name: Tinybird - CD Workflow

# Runs on manual dispatch and on pushes to `main` that touch the data project directory.
on:
  workflow_dispatch:
  push:
    paths:
      - 'add_column_to_s3_ds/**'
    branches:
      - main

jobs:
  cd:
    # Delegates to the reusable CD workflow published in tinybirdco/ci.
    uses: tinybirdco/ci/.github/workflows/cd.yml@main
    with:
      tb_deploy: false
      data_project_dir: ./add_column_to_s3_ds
    secrets:
      # Admin token is read from the repository secret of the same name.
      tb_admin_token: ${{ secrets.TB_ADMIN_TOKEN_ADD_COLUMN_TO_S3_DS }}
      tb_host: https://api.tinybird.co
27 changes: 27 additions & 0 deletions .github/workflows/add_column_to_s3_ds_ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@

##################################################
### Visit https://github.com/tinybirdco/ci ###
### for more details or custom CI/CD ###
##################################################

name: Tinybird - CI Workflow

# Runs on manual dispatch and on pull requests against `main` that touch the data project directory.
on:
  workflow_dispatch:
  pull_request:
    paths:
      - 'add_column_to_s3_ds/**'
    branches:
      - main
    types: [opened, reopened, labeled, unlabeled, synchronize, closed]

# One concurrency group per workflow and pull request number.
concurrency: ${{ github.workflow }}-${{ github.event.pull_request.number }}

jobs:
  ci:
    # Delegates to the reusable CI workflow published in tinybirdco/ci.
    uses: tinybirdco/ci/.github/workflows/ci.yml@main
    with:
      data_project_dir: ./add_column_to_s3_ds
    secrets:
      # Admin token is read from the repository secret of the same name.
      tb_admin_token: ${{ secrets.TB_ADMIN_TOKEN_ADD_COLUMN_TO_S3_DS }}
      tb_host: https://api.tinybird.co
1 change: 1 addition & 0 deletions add_column_to_s3_ds/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.tinyb
2 changes: 2 additions & 0 deletions add_column_to_s3_ds/.tinyenv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
TB_VERSION_WARNING=0
VERSION=0.0.0
3 changes: 3 additions & 0 deletions add_column_to_s3_ds/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Tinybird Versions - {{ YOUR USE CASE NAME HERE }}

Work in progress ...
19 changes: 19 additions & 0 deletions add_column_to_s3_ds/datasources/analytics_events.datasource
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
TOKEN "tracker" APPEND

DESCRIPTION >
    Raw analytics events, one row per event, with the event details kept as a JSON string in `payload`.
    Imported from NDJSON files matching `s3://webanalyticstb/v1/*.ndjson` through the `TB-S3` connection.

SCHEMA >
    `timestamp` DateTime `json:$.timestamp`,
    `session_id` String `json:$.session_id`,
    `action` LowCardinality(String) `json:$.action`,
    `version` LowCardinality(String) `json:$.version`,
    `payload` String `json:$.payload`

ENGINE "MergeTree"
ENGINE_PARTITION_KEY "toYear(timestamp)"
ENGINE_SORTING_KEY "timestamp, session_id, payload"

IMPORT_SERVICE 's3'
IMPORT_CONNECTION_NAME 'TB-S3'
IMPORT_BUCKET_URI 's3://webanalyticstb/v1/*.ndjson'
IMPORT_STRATEGY 'append'
IMPORT_SCHEDULE '@auto'
13 changes: 13 additions & 0 deletions add_column_to_s3_ds/datasources/analytics_pages_mv.datasource
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@

DESCRIPTION >
    Aggregation states per date, device, browser, location and pathname.
    `visits` and `hits` hold aggregate states: read them with `uniqMerge(visits)` and `countMerge(hits)`.

SCHEMA >
    `date` Date,
    `device` String,
    `browser` String,
    `location` String,
    `pathname` String,
    `visits` AggregateFunction(uniq, String),
    `hits` AggregateFunction(count)

ENGINE "AggregatingMergeTree"
ENGINE_PARTITION_KEY "toYYYYMM(date)"
ENGINE_SORTING_KEY "date, device, browser, location, pathname"
14 changes: 14 additions & 0 deletions add_column_to_s3_ds/datasources/analytics_sessions_mv.datasource
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@

DESCRIPTION >
    Aggregation states per date and session.
    `hits` holds an aggregate state: read it with `countMerge(hits)`.
    The remaining metric columns are simple aggregate states (`any`, `min`, `max`).

SCHEMA >
    `date` Date,
    `session_id` String,
    `device` SimpleAggregateFunction(any, String),
    `browser` SimpleAggregateFunction(any, String),
    `location` SimpleAggregateFunction(any, String),
    `first_hit` SimpleAggregateFunction(min, DateTime),
    `latest_hit` SimpleAggregateFunction(max, DateTime),
    `hits` AggregateFunction(count)

ENGINE "AggregatingMergeTree"
ENGINE_PARTITION_KEY "toYYYYMM(date)"
ENGINE_SORTING_KEY "date, session_id"
13 changes: 13 additions & 0 deletions add_column_to_s3_ds/datasources/analytics_sources_mv.datasource
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@

DESCRIPTION >
    Aggregation states per date, device, browser, location and referrer.
    `visits` and `hits` hold aggregate states: read them with `uniqMerge(visits)` and `countMerge(hits)`.

SCHEMA >
    `date` Date,
    `device` String,
    `browser` String,
    `location` String,
    `referrer` String,
    `visits` AggregateFunction(uniq, String),
    `hits` AggregateFunction(count)

ENGINE "AggregatingMergeTree"
ENGINE_PARTITION_KEY "toYYYYMM(date)"
ENGINE_SORTING_KEY "date, device, browser, location, referrer"
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{"timestamp":"2023-10-11T07:58:10.324Z","session_id":"3cda774e-adb4-4b3b-88fa-5744ee81ea5f","action":"page_hit","version":"1","payload":"{ \"user-agent\":\"Mozilla/5.0 (Linux; Android 13) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.5249.118 Mobile Safari/537.36\", \"locale\":\"en-US\", \"location\":\"US\", \"referrer\":\"https://www.yandex.com\", \"pathname\":\"/product\", \"href\":\"https://www.tinybird.co/product\"}"}
{"timestamp":"2023-10-11T07:57:36.694Z","session_id":"85c4d468-f480-4610-978d-d64cb665bf5f","action":"page_hit","version":"1","payload":"{ \"user-agent\":\"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:103.0) Gecko/20100101 Firefox/103.0\", \"locale\":\"en-US\", \"location\":\"US\", \"referrer\":\"https://www.google.com\", \"pathname\":\"/\", \"href\":\"https://www.tinybird.co\"}"}
{"timestamp":"2023-10-11T07:57:56.172Z","session_id":"dcc0ca3e-4f9c-435b-97e5-daa5a8c48e2a","action":"page_hit","version":"1","payload":"{ \"user-agent\":\"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36\", \"locale\":\"en-US\", \"location\":\"US\", \"referrer\":\"https://www.google.com\", \"pathname\":\"/product\", \"href\":\"https://www.tinybird.co/product\"}"}
{"timestamp":"2023-10-12T07:57:56.172Z","session_id":"dcc0ca3e-4f9c-435b-97e5-daa5a8c48e2a","action":"page_hit","version":"1","payload":"{ \"user-agent\":\"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36\", \"locale\":\"en-US\", \"location\":\"US\", \"referrer\":\"https://www.google.com\", \"pathname\":\"/product\", \"href\":\"https://www.tinybird.co/product\"}"}
59 changes: 59 additions & 0 deletions add_column_to_s3_ds/pipes/analytics_hits.pipe
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
DESCRIPTION >
Parsed `page_hit` events, implementing `browser` and `device` detection logic.


TOKEN "dashboard" READ

NODE parsed_hits
DESCRIPTION >
    Keep only `page_hit` events from `analytics_events` and extract `locale`, `location`, `referrer`,
    `pathname`, `href` and the lower-cased `user-agent` from the JSON `payload`.

SQL >

    SELECT
        timestamp,
        action,
        version,
        coalesce(session_id, '0') AS session_id,
        JSONExtractString(payload, 'locale') AS locale,
        JSONExtractString(payload, 'location') AS location,
        JSONExtractString(payload, 'referrer') AS referrer,
        JSONExtractString(payload, 'pathname') AS pathname,
        JSONExtractString(payload, 'href') AS href,
        lower(JSONExtractString(payload, 'user-agent')) AS user_agent
    FROM analytics_events
    WHERE action = 'page_hit'



NODE endpoint
DESCRIPTION >
    Classify every parsed hit into a `device` and a `browser` by regex-matching the lower-cased user agent.
    The first matching branch wins, so the order of the `WHEN` clauses matters.

SQL >

    SELECT
        timestamp,
        action,
        version,
        session_id,
        location,
        referrer,
        pathname,
        href,
        CASE
            WHEN match(user_agent, 'wget|ahrefsbot|curl|urllib|bitdiscovery|\+https://|googlebot') THEN 'bot'
            WHEN match(user_agent, 'android') THEN 'mobile-android'
            WHEN match(user_agent, 'ipad|iphone|ipod') THEN 'mobile-ios'
            ELSE 'desktop'
        END AS device,
        CASE
            WHEN match(user_agent, 'firefox') THEN 'firefox'
            WHEN match(user_agent, 'chrome|crios') THEN 'chrome'
            WHEN match(user_agent, 'opera') THEN 'opera'
            WHEN match(user_agent, 'msie|trident') THEN 'ie'
            WHEN match(user_agent, 'iphone|ipad|safari') THEN 'safari'
            ELSE 'Unknown'
        END AS browser
    FROM parsed_hits


26 changes: 26 additions & 0 deletions add_column_to_s3_ds/pipes/analytics_pages.pipe
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
NODE analytics_pages_1
DESCRIPTION >
    Aggregate hits per date, device, browser, location and pathname, storing the unique-session state
    as `visits` and the hit-count state as `hits`. Materialized into `analytics_pages_mv`.

SQL >

    SELECT
        toDate(timestamp) AS date,
        device,
        browser,
        location,
        pathname,
        uniqState(session_id) AS visits,
        countState() AS hits
    FROM analytics_hits
    GROUP BY date, device, browser, location, pathname

TYPE materialized
DATASOURCE analytics_pages_mv


24 changes: 24 additions & 0 deletions add_column_to_s3_ds/pipes/analytics_sessions.pipe
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
NODE analytics_sessions_1
DESCRIPTION >
    Aggregate hits per date and session: any device, browser and location seen, the first and latest
    hit timestamps, and the hit-count state. Materialized into `analytics_sessions_mv`.

SQL >

    SELECT
        toDate(timestamp) AS date,
        session_id,
        anySimpleState(device) AS device,
        anySimpleState(browser) AS browser,
        anySimpleState(location) AS location,
        minSimpleState(timestamp) AS first_hit,
        maxSimpleState(timestamp) AS latest_hit,
        countState() AS hits
    FROM analytics_hits
    GROUP BY date, session_id

TYPE materialized
DATASOURCE analytics_sessions_mv


33 changes: 33 additions & 0 deletions add_column_to_s3_ds/pipes/analytics_sources.pipe
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
NODE analytics_sources_1
DESCRIPTION >
    Aggregate hits per date, device, browser, location and referrer, storing the unique-session state
    as `visits` and the hit-count state as `hits`. Materialized into `analytics_sources_mv`.
    `current_domain` is the domain of the `href` of one hit that has a non-empty `href` (`LIMIT 1` with no ordering).
    Hits whose referrer domain equals it are excluded.

SQL >

    WITH (
        SELECT domainWithoutWWW(href)
        FROM analytics_hits
        WHERE href IS NOT NULL AND href != ''
        LIMIT 1
    ) AS current_domain
    SELECT
        toDate(timestamp) AS date,
        device,
        browser,
        location,
        referrer,
        uniqState(session_id) AS visits,
        countState() AS hits
    FROM analytics_hits
    WHERE domainWithoutWWW(referrer) != current_domain
    GROUP BY date, device, browser, location, referrer

TYPE materialized
DATASOURCE analytics_sources_mv


123 changes: 123 additions & 0 deletions add_column_to_s3_ds/pipes/kpis.pipe
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
DESCRIPTION >
Summary with general KPIs per date, including visits, page views, bounce rate and average session duration.
Accepts `date_from` and `date_to` date filters; when they are not passed, the range defaults to the last 7 days up to today.
Daily granularity, except when filtering one single day (hourly)


TOKEN "dashboard" READ

NODE timeseries
DESCRIPTION >
    Generate a timeseries for the specified time range, so we can fill empty data points.
    `start` defaults to 7 days ago and `end` to today when `date_from` / `date_to` are not passed.
    Steps are hourly when `date_from` and `date_to` are the same day, daily otherwise.
    Filters "future" data points.

SQL >

    %
    {% set _single_day = defined(date_from) and day_diff(date_from, date_to) == 0 %}
    with
        {% if defined(date_from) %}
            toStartOfDay(toDate({{Date(date_from, description="Starting day for filtering a date range", required=False)}})) as start,
        {% else %}
            toStartOfDay(timestampAdd(today(), interval -7 day)) as start,
        {% end %}
        {% if defined(date_to) %}
            toStartOfDay(toDate({{Date(date_to, description="Finishing day for filtering a date range", required=False)}})) as end
        {% else %}
            toStartOfDay(today()) as end
        {% end %}
    {% if _single_day %}
        select arrayJoin(arrayMap(x -> toDateTime(x), range(toUInt32(toDateTime(start)), toUInt32(timestampAdd(end, interval 1 day)), 3600))) as date
    {% else %}
        select arrayJoin(arrayMap(x -> toDate(x), range(toUInt32(start), toUInt32(timestampAdd(end, interval 1 day)), 24 * 3600))) as date
    {% end %}
    where date <= now()



NODE hits
DESCRIPTION >
    Per-session metrics for the requested range.
    When `date_from` and `date_to` are the same day, reads `analytics_hits` and groups by hour and session.
    Otherwise reads `analytics_sessions_mv` and groups by date and session, defaulting to the last 7 days
    up to today when the filters are not passed.

SQL >

    %
    {% if defined(date_from) and day_diff(date_from, date_to) == 0 %}
        select
            toStartOfHour(timestamp) as date,
            session_id,
            uniq(session_id) as visits,
            count() as pageviews,
            case when min(timestamp) = max(timestamp) then 1 else 0 end as is_bounce,
            max(timestamp) as latest_hit_aux,
            min(timestamp) as first_hit_aux
        from analytics_hits
        where toDate(timestamp) = {{Date(date_from)}}
        group by toStartOfHour(timestamp), session_id
    {% else %}
        select
            date,
            session_id,
            uniq(session_id) as visits,
            countMerge(hits) as pageviews,
            case when min(first_hit) = max(latest_hit) then 1 else 0 end as is_bounce,
            max(latest_hit) as latest_hit_aux,
            min(first_hit) as first_hit_aux
        from analytics_sessions_mv
        where
            {% if defined(date_from) %}
                date >= {{Date(date_from)}}
            {% else %}
                date >= timestampAdd(today(), interval -7 day)
            {% end %}
            {% if defined(date_to) %}
                and date <= {{Date(date_to)}}
            {% else %}
                and date <= today()
            {% end %}
        group by date, session_id
    {% end %}



NODE data
DESCRIPTION >
    General KPIs per date, works for both summary metrics and trends charts.
    `bounce_rate` is the share of sessions whose first and latest hit have the same timestamp.
    Bounces are counted with `countIf`, so a date with sessions but no bounces yields 0 instead of NULL.
    The count is wrapped in `toNullable` so the column type stays Nullable, as it was before.
    `avg_session_sec` is the average of `latest_hit_aux - first_hit_aux` over the sessions of the date.

SQL >

    select
        date,
        uniq(session_id) as visits,
        sum(pageviews) as pageviews,
        toNullable(countIf(latest_hit_aux = first_hit_aux)) / visits as bounce_rate,
        avg(latest_hit_aux - first_hit_aux) as avg_session_sec
    from hits
    group by date



NODE endpoint
DESCRIPTION >
    Left-join the generated `timeseries` with the KPIs in `data` on `date`, so every point of the
    timeseries is returned even when no KPI row exists for it.

SQL >

    select
        a.date,
        b.visits,
        b.pageviews,
        b.bounce_rate,
        b.avg_session_sec
    from timeseries as a
    left join data as b using date


Loading

0 comments on commit d4de697

Please sign in to comment.