Skip to content

Commit

Permalink
Merge pull request #33 from Parsely/dev_release_2.3.0.dev1
Browse files Browse the repository at this point in the history
Merging with release 2.3.0 updates
  • Loading branch information
rachelannelise authored Sep 18, 2019
2 parents 699ed17 + 0e7322e commit 303ae23
Show file tree
Hide file tree
Showing 7 changed files with 317 additions and 15 deletions.
59 changes: 50 additions & 9 deletions parsely_raw_data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,58 @@
limitations under the License.
"""

# Package version; normalize_keys() also stamps it onto every event as
# "schema_version".
__version__ = "2.3.0"

from . import bigquery, docgen, redshift, s3, samples, schema, stream, utils
from six import iteritems

# Submodules exported by ``from parsely_raw_data import *``; each name is
# listed exactly once.
__all__ = [
    "bigquery",
    "docgen",
    "redshift",
    "s3",
    "samples",
    "schema",
    "stream",
    "utils",
]

# Schema keys that normalize_keys() fills with False (instead of None) when
# they are missing from the incoming event.
BOOLEAN_FIELDS = {"flags_is_amp"}


def normalize_keys(input_event_dict, schema=None):
    """Conform an event to the public schema: correct keys and value types.

    The input dictionary is left untouched; a new dictionary is returned.

    @param input_event_dict: A dictionary containing a Parse.ly pixel event.
        Keys may use "." as a separator (e.g. "metadata.share_urls"); every
        "." is replaced with "_" before matching against the schema.
    @param schema: Optional iterable of schema key names to normalize the
        event keys against. If not specified (or empty), this defaults to
        the keys of the most recent parsely_raw_data schema.
    @return: A new dictionary with exactly one entry per schema key. Keys
        missing from the event are set to False when listed in
        BOOLEAN_FIELDS and to None otherwise. "schema_version" is always
        set to the package version.
    """
    if not schema:
        # The ``schema`` parameter shadows the ``schema`` submodule, so the
        # submodule has to be re-imported under another name. SCHEMA holds
        # one record per field; only the "key" entries are wanted here.
        from . import schema as schema_module

        schema = [field["key"] for field in schema_module.SCHEMA]

    normalized = {}
    for raw_key, value in input_event_dict.items():
        # fix value types: share URLs arrive as a mapping but are emitted as
        # a list of its values (or None when the mapping is empty)
        if raw_key == "metadata.share_urls" and isinstance(value, dict):
            value = list(value.values()) or None
        # replace all "."s in the key with "_"
        normalized[raw_key.replace(".", "_")] = value

    # emit only public schema items
    # ensure all columns are available and null when needed
    # account for all boolean schema defined fields as this is
    # parsely_raw_data specific
    event_dict = {}
    for key in schema:
        if key in normalized:
            event_dict[key] = normalized[key]
        else:
            event_dict[key] = False if key in BOOLEAN_FIELDS else None

    event_dict["schema_version"] = __version__

    return event_dict
4 changes: 1 addition & 3 deletions parsely_raw_data/schema.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from __future__ import print_function

import json

from tabulate import tabulate

"""
Expand Down Expand Up @@ -78,6 +76,7 @@
{"key": "ref_query", "ex": "", "type": str},
{"key": "ref_scheme", "ex": "http", "type": str, "size": 64},
{"key": "referrer", "ex": "http://mashable.com/", "type": str},
{"key": "schema_version", "ex": "2.3.0", "type": str, "size": 64},
{"key": "session", "ex": True, "type": bool},
{"key": "session_id", "ex": 6, "type": int, "available_with_field": "session"},
{"key": "session_initial_referrer", "ex": "http://mashable.com/", "type": str, "available_with_field": "session"},
Expand Down Expand Up @@ -146,7 +145,6 @@
{"key": "visitor_site_id", "ex": "ab94fd31-a207-4010-8a25-fb4788207b82", "type": str, "size": 128, "req": True, "available_with_field": "visitor"}
]


def mk_sample_event():
sample = {}
for record in SCHEMA:
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ tablib
xlsxwriter
tabulate
oauth2client
pytest
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ logging-clear-handlers = 1
verbosity = 2
detailed-errors = 1

[pytest]
[tool:pytest]
norecursedirs = build docs/_build *.egg .tox *.venv requirements/
addopts =
# Shows a line for every test
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@ def run_setup():
author='Emmett Butler',
author_email='support@parsely.com',
url='https://github.com/Parsely/parsely_raw_data',
description='Utilities for accessing raw Parse.ly data',
description='Utilities for accessing the Parse.ly Data Pipeline',
long_description='Utilities for accessing the Parse.ly Data Pipeline',
keywords='parsely s3 kinesis redshift firehose bigquery',
license='Apache License 2.0',
packages=find_packages(),
Expand Down
261 changes: 261 additions & 0 deletions tests/test_basic.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,269 @@
from six import text_type

import parsely_raw_data

# Fixture: the normalized event that normalize_keys() is expected to return
# for test_view (compared in test_normalize_keys_partially_complete_dict).
# Keys that test_view omits appear here as None, or False for flags_is_amp.
expected_view = {
"action": "pageview",
"apikey": "example.com",
"campaign_id": None,
"channel": "website",
"display": True,
"display_avail_height": 877,
"display_avail_width": 1436,
"display_pixel_depth": 24,
"display_total_height": 900,
"display_total_width": 1440,
"engaged_time_inc": None,
"extra_data": None,
"flags_is_amp": False,
"event_id": "0x32bc4058da8233e6ddd86ba0a8920586",
"ip_city": "Toronto",
"ip_continent": "NA",
"ip_country": "CA",
"ip_lat": 43.7124,
"ip_lon": -79.3644,
"ip_postal": "M4G",
"ip_subdivision": "ON",
"ip_timezone": "America/Toronto",
"ip_market_nielsen": None,
"ip_market_doubleclick": None,
"ip_market_name": None,
"metadata": False,
"metadata_authors": None,
"metadata_canonical_url": None,
"metadata_custom_metadata": None,
"metadata_data_source": None,
"metadata_duration": None,
"metadata_full_content_word_count": None,
"metadata_image_url": None,
"metadata_page_type": None,
"metadata_post_id": None,
"metadata_pub_date_tmsp": None,
"metadata_save_date_tmsp": None,
"metadata_section": None,
"metadata_share_urls": None,
"metadata_tags": None,
"metadata_thumb_url": None,
"metadata_title": None,
"metadata_urls": None,
"pageload_id": None,
"pageview_id": None,
"ref_category": "internal",
"ref_clean": "http://www.example.com/article-123",
"ref_domain": "example.com",
"ref_fragment": "",
"ref_netloc": "www.example.com",
"ref_params": "",
"ref_path": "/article-123",
"ref_query": "",
"ref_scheme": "http",
"referrer": "http://www.example.com/article-123",
"schema_version": parsely_raw_data.__version__,
"session": True,
"session_id": 5,
"session_initial_referrer": "https://www.google.ca/",
"session_initial_url": "http://www.example.com/",
"session_last_session_timestamp": "1_470_045_600_000",
"session_timestamp": "1_471_428_000_000",
"slot": False,
"sref_category": "search",
"sref_clean": "https://www.google.ca/",
"sref_domain": "google",
"sref_fragment": "",
"sref_netloc": "www.google.ca",
"sref_params": "",
"sref_path": "/",
"sref_query": "",
"sref_scheme": "https",
"surl_clean": "http://www.example.com/",
"surl_domain": "example.com",
"surl_fragment": "",
"surl_netloc": "www.example.com",
"surl_params": "",
"surl_path": "/",
"surl_query": "",
"surl_scheme": "http",
"surl_utm_campaign": None,
"surl_utm_content": None,
"surl_utm_medium": None,
"surl_utm_source": None,
"surl_utm_term": None,
"timestamp_info": True,
"timestamp_info_nginx_ms": "1_429_707_722_000",
"timestamp_info_override_ms": None,
"timestamp_info_pixel_ms": None,
"ts_action": "2015-04-22 13:02:02",
"ts_session_current": "2016-08-17 10:00:00",
"ts_session_last": "2016-08-01 10:00:00",
"ua_browser": "Chrome",
"ua_browserversion": "52.0.2743",
"ua_device": "Other",
"ua_devicebrand": None,
"ua_devicemodel": None,
"ua_devicetouchcapable": False,
"ua_devicetype": "desktop",
"ua_os": "Mac OS X",
"ua_osversion": "10.10.5",
"url": "http://www.example.com/",
"url_clean": "http://www.example.com/",
"url_domain": "example.com",
"url_fragment": "",
"url_netloc": "www.example.com",
"url_params": "",
"url_path": "/",
"url_query": "",
"url_scheme": "http",
"user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
"utm_campaign": None,
"utm_content": None,
"utm_medium": None,
"utm_source": None,
"utm_term": None,
"videostart_id": None,
"version": 1,
"visitor": True,
"visitor_ip": "184.149.39.120",
"visitor_network_id": "",
"visitor_site_id": "e71604df-a912-455d-aaf3-a9c72a6dd86c",
}

# Fixture: the input pageview event handed to normalize_keys() in
# test_normalize_keys_partially_complete_dict. It deliberately leaves out
# some keys (e.g. campaign_id, flags_is_amp, ua_devicebrand, utm_*) that
# expected_view lists with None/False values.
test_view = {
"action": "pageview",
"apikey": "example.com",
"channel": "website",
"display": True,
"display_avail_height": 877,
"display_avail_width": 1436,
"display_pixel_depth": 24,
"display_total_height": 900,
"display_total_width": 1440,
"event_id": "0x32bc4058da8233e6ddd86ba0a8920586",
"ip_city": "Toronto",
"ip_continent": "NA",
"ip_country": "CA",
"ip_lat": 43.7124,
"ip_lon": -79.3644,
"ip_postal": "M4G",
"ip_subdivision": "ON",
"ip_timezone": "America/Toronto",
"ip_market_nielsen": None,
"ip_market_doubleclick": None,
"ip_market_name": None,
"metadata": False,
"metadata_authors": None,
"metadata_canonical_url": None,
"metadata_custom_metadata": None,
"metadata_data_source": None,
"metadata_duration": None,
"metadata_full_content_word_count": None,
"metadata_image_url": None,
"metadata_page_type": None,
"metadata_post_id": None,
"metadata_pub_date_tmsp": None,
"metadata_save_date_tmsp": None,
"metadata_section": None,
"metadata_share_urls": None,
"metadata_tags": None,
"metadata_thumb_url": None,
"metadata_title": None,
"metadata_urls": None,
"pageload_id": None,
"pageview_id": None,
"ref_category": "internal",
"ref_clean": "http://www.example.com/article-123",
"ref_domain": "example.com",
"ref_fragment": "",
"ref_netloc": "www.example.com",
"ref_params": "",
"ref_path": "/article-123",
"ref_query": "",
"ref_scheme": "http",
"referrer": "http://www.example.com/article-123",
"schema_version": parsely_raw_data.__version__,
"session": True,
"session_id": 5,
"session_initial_referrer": "https://www.google.ca/",
"session_initial_url": "http://www.example.com/",
"session_last_session_timestamp": "1_470_045_600_000",
"session_timestamp": "1_471_428_000_000",
"slot": False,
"sref_category": "search",
"sref_clean": "https://www.google.ca/",
"sref_domain": "google",
"sref_fragment": "",
"sref_netloc": "www.google.ca",
"sref_params": "",
"sref_path": "/",
"sref_query": "",
"sref_scheme": "https",
"surl_clean": "http://www.example.com/",
"surl_domain": "example.com",
"surl_fragment": "",
"surl_netloc": "www.example.com",
"surl_params": "",
"surl_path": "/",
"surl_query": "",
"surl_scheme": "http",
"surl_utm_campaign": None,
"surl_utm_content": None,
"surl_utm_medium": None,
"surl_utm_source": None,
"surl_utm_term": None,
"timestamp_info": True,
"timestamp_info_nginx_ms": "1_429_707_722_000",
"timestamp_info_override_ms": None,
"timestamp_info_pixel_ms": None,
"ts_action": "2015-04-22 13:02:02",
"ts_session_current": "2016-08-17 10:00:00",
"ts_session_last": "2016-08-01 10:00:00",
"ua_browser": "Chrome",
"ua_browserversion": "52.0.2743",
"ua_device": "Other",
"ua_devicetouchcapable": False,
"ua_devicetype": "desktop",
"ua_os": "Mac OS X",
"ua_osversion": "10.10.5",
"url": "http://www.example.com/",
"url_clean": "http://www.example.com/",
"url_domain": "example.com",
"url_fragment": "",
"url_netloc": "www.example.com",
"url_params": "",
"url_path": "/",
"url_query": "",
"url_scheme": "http",
"user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
"version": 1,
"visitor": True,
"visitor_ip": "184.149.39.120",
"visitor_network_id": "",
"visitor_site_id": "e71604df-a912-455d-aaf3-a9c72a6dd86c",
}


def test_normalize_keys_empty_dict():
    """A near-empty event is padded out to one entry per schema key."""
    # Collect the public schema key names
    schema_keys = set()
    for field in parsely_raw_data.schema.SCHEMA:
        schema_keys.add(text_type(field["key"]))

    # An event carrying only an id should still yield a full-width row
    minimal_event = {"event_id": "1234"}
    result = dict(
        parsely_raw_data.normalize_keys(minimal_event, schema=schema_keys)
    )

    assert len(result) == len(schema_keys)


def test_normalize_keys_partially_complete_dict():
    """Normalizing test_view must reproduce expected_view exactly."""
    schema_keys = set()
    for field in parsely_raw_data.schema.SCHEMA:
        schema_keys.add(text_type(field["key"]))

    # Every schema field must be present in the normalized event
    actual = parsely_raw_data.normalize_keys(test_view, schema=schema_keys)

    assert actual == expected_view


def test_basic():
    """Placeholder test that performs no checks."""


# Allow running this module directly as a script.
if __name__ == "__main__":
    test_basic()
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@ envlist = py27, py34, py35, pypy
commands =
pip install -r test-requirements.txt
pip install -e .
py.test {posargs}
pytest {posargs}

0 comments on commit 303ae23

Please sign in to comment.