From 8e5dcc2f5ca0f66a953274ae19d8b6b31e2b90d3 Mon Sep 17 00:00:00 2001 From: Dan Blanchard Date: Mon, 6 Mar 2017 15:47:11 -0500 Subject: [PATCH] Add data_source field to Metadata to differentiate between in-pixel and crawled metadata. --- parsely_raw_data/event.py | 11 ++++++++--- tests/test_event.py | 5 +++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/parsely_raw_data/event.py b/parsely_raw_data/event.py index 5b0b420..6c29f29 100644 --- a/parsely_raw_data/event.py +++ b/parsely_raw_data/event.py @@ -82,13 +82,15 @@ class Metadata(SlotsMixin): __slots__ = ('authors', 'canonical_url', 'urls', 'page_type', 'post_id', 'pub_date_tmsp', 'custom_metadata', 'section', 'tags', 'save_date_tmsp', 'thumb_url', 'title', 'image_url', - 'full_content_word_count', 'share_urls', 'duration') + 'full_content_word_count', 'share_urls', 'duration', + 'data_source') __version__ = 1 def __init__(self, authors, canonical_url, urls, page_type, post_id, pub_date_tmsp, custom_metadata, section, tags, save_date_tmsp, thumb_url, title, image_url, - full_content_word_count, share_urls, duration): + full_content_word_count, share_urls, duration, + data_source): self.authors = authors self.canonical_url = canonical_url self.urls = urls @@ -105,6 +107,7 @@ def __init__(self, authors, canonical_url, urls, page_type, post_id, self.full_content_word_count = full_content_word_count self.share_urls = share_urls self.duration = duration + self.data_source = data_source class SlotInfo(SlotsMixin): @@ -285,6 +288,7 @@ def to_dict(self): event_dict['metadata.full_content_word_count'] = self.metadata.full_content_word_count event_dict['metadata.share_urls'] = self.metadata.share_urls event_dict['metadata.duration'] = self.metadata.duration + event_dict['metadata.data_source'] = self.metadata.data_source event_dict['metadata.__version__'] = self.metadata.__version__ else: event_dict['metadata'] = False @@ -363,7 +367,8 @@ def from_dict(cls, data): data.get('metadata.image_url'), data.get('metadata.full_content_word_count'), data.get('metadata.share_urls'), - data.get('metadata.duration')) + data.get('metadata.duration'), + data.get('metadata.data_source')) else: metadata = None if data.get('campaign'): diff --git a/tests/test_event.py b/tests/test_event.py index 14a5a60..492d9b4 100644 --- a/tests/test_event.py +++ b/tests/test_event.py @@ -42,7 +42,8 @@ "http://parsely.com/imgurl", 420, ["http://twitter.com/nothing"], - 69 + 69, + 'crawl' ), CampaignInfo('spring_sale', 'email', 'newsletter', 'logolink', 'foo'), EventFlags(False), @@ -62,7 +63,7 @@ def test_to_dict_checker(): assert len(SlotInfo.__slots__) == 4, msg assert len(TimestampInfo.__slots__) == 3, msg assert len(VisitorInfo.__slots__) == 3, msg - assert len(Metadata.__slots__) == 16, msg + assert len(Metadata.__slots__) == 17, msg assert len(CampaignInfo.__slots__) == 5, msg assert len(EventFlags.__slots__) == 1, msg