Skip to content

the-timoye/spark-examples

Repository files navigation

Spark Examples

This project performs data wrangling on a JSON dataset, answering analytical questions from Udacity's Data Engineering Nanodegree program.

Data-set Schema

root
|-- artist: string (nullable = true)
|-- auth: string (nullable = true)
|-- firstName: string (nullable = true)
|-- gender: string (nullable = true)
|-- itemInSession: long (nullable = true)
|-- lastName: string (nullable = true)
|-- length: double (nullable = true)
|-- level: string (nullable = true)
|-- location: string (nullable = true)
|-- method: string (nullable = true)
|-- page: string (nullable = true)
|-- registration: long (nullable = true)
|-- sessionId: long (nullable = true)
|-- song: string (nullable = true)
|-- status: long (nullable = true)
|-- ts: long (nullable = true)
|-- userAgent: string (nullable = true)
|-- userId: string (nullable = true)

Question 1: Which pages did the user with id "" (empty string) not visit?

Steps:

- Unique pages are selected from the dataset using the `dropDuplicates()` method
- Unique pages visited by the user are selected from the dataframe using the `where()` and `dropDuplicates()` methods
- Both results are put into sets, and the pages missing from the user's set are extracted

Question 2: How many females are in the dataset?

Steps:

- `select` all distinct users using the `dropDuplicates()` method
- `filter` by gender 'F'
- `count` the result

Question 3: How many songs were played from the most played artist?

Steps:

- Select the `artist` column and group the dataframe by artist
- Run a `count` aggregate.
- Sort in descending order and display the first item in the results.

Data Lakes Schema

root
|-- data: struct (nullable = true)
|    |-- approved_at_utc: string (nullable = true)
|    |-- approved_by: string (nullable = true)
|    |-- archived: boolean (nullable = true)
|    |-- author: string (nullable = true)
|    |-- author_flair_background_color: string (nullable = true)
|    |-- author_flair_css_class: string (nullable = true)
|    |-- author_flair_richtext: array (nullable = true)
|    |    |-- element: string (containsNull = true)
|    |-- author_flair_template_id: string (nullable = true)
|    |-- author_flair_text: string (nullable = true)
|    |-- author_flair_text_color: string (nullable = true)
|    |-- author_flair_type: string (nullable = true)
|    |-- author_fullname: string (nullable = true)
|    |-- author_patreon_flair: boolean (nullable = true)
|    |-- banned_at_utc: string (nullable = true)
|    |-- banned_by: string (nullable = true)
|    |-- can_gild: boolean (nullable = true)
|    |-- can_mod_post: boolean (nullable = true)
|    |-- category: string (nullable = true)
|    |-- clicked: boolean (nullable = true)
|    |-- content_categories: string (nullable = true)
|    |-- contest_mode: boolean (nullable = true)
|    |-- created: double (nullable = true)
|    |-- created_utc: double (nullable = true)
|    |-- crosspost_parent: string (nullable = true)
|    |-- crosspost_parent_list: array (nullable = true)
|    |    |-- element: struct (containsNull = true)
|    |    |    |-- approved_at_utc: string (nullable = true)
|    |    |    |-- approved_by: string (nullable = true)
|    |    |    |-- archived: boolean (nullable = true)
|    |    |    |-- author: string (nullable = true)
|    |    |    |-- author_flair_background_color: string (nullable = true)
|    |    |    |-- author_flair_css_class: string (nullable = true)
|    |    |    |-- author_flair_richtext: array (nullable = true)
|    |    |    |    |-- element: string (containsNull = true)
|    |    |    |-- author_flair_template_id: string (nullable = true)
|    |    |    |-- author_flair_text: string (nullable = true)
|    |    |    |-- author_flair_text_color: string (nullable = true)
|    |    |    |-- author_flair_type: string (nullable = true)
|    |    |    |-- author_fullname: string (nullable = true)
|    |    |    |-- author_patreon_flair: boolean (nullable = true)
|    |    |    |-- banned_at_utc: string (nullable = true)
|    |    |    |-- banned_by: string (nullable = true)
|    |    |    |-- can_gild: boolean (nullable = true)
|    |    |    |-- can_mod_post: boolean (nullable = true)
|    |    |    |-- category: string (nullable = true)
|    |    |    |-- clicked: boolean (nullable = true)
|    |    |    |-- content_categories: string (nullable = true)
|    |    |    |-- contest_mode: boolean (nullable = true)
|    |    |    |-- created: double (nullable = true)
|    |    |    |-- created_utc: double (nullable = true)
|    |    |    |-- distinguished: string (nullable = true)
|    |    |    |-- domain: string (nullable = true)
|    |    |    |-- downs: long (nullable = true)
|    |    |    |-- edited: boolean (nullable = true)
|    |    |    |-- gilded: long (nullable = true)
|    |    |    |-- gildings: struct (nullable = true)
|    |    |    |    |-- gid_1: long (nullable = true)
|    |    |    |    |-- gid_2: long (nullable = true)
|    |    |    |    |-- gid_3: long (nullable = true)
|    |    |    |-- hidden: boolean (nullable = true)
|    |    |    |-- hide_score: boolean (nullable = true)
|    |    |    |-- id: string (nullable = true)
|    |    |    |-- is_crosspostable: boolean (nullable = true)
|    |    |    |-- is_meta: boolean (nullable = true)
|    |    |    |-- is_original_content: boolean (nullable = true)
|    |    |    |-- is_reddit_media_domain: boolean (nullable = true)
|    |    |    |-- is_robot_indexable: boolean (nullable = true)
|    |    |    |-- is_self: boolean (nullable = true)
|    |    |    |-- is_video: boolean (nullable = true)
|    |    |    |-- likes: string (nullable = true)
|    |    |    |-- link_flair_background_color: string (nullable = true)
|    |    |    |-- link_flair_css_class: string (nullable = true)
|    |    |    |-- link_flair_richtext: array (nullable = true)
|    |    |    |    |-- element: string (containsNull = true)
|    |    |    |-- link_flair_template_id: string (nullable = true)
|    |    |    |-- link_flair_text: string (nullable = true)
|    |    |    |-- link_flair_text_color: string (nullable = true)
|    |    |    |-- link_flair_type: string (nullable = true)
|    |    |    |-- locked: boolean (nullable = true)
|    |    |    |-- media: string (nullable = true)
|    |    |    |-- media_only: boolean (nullable = true)
|    |    |    |-- mod_note: string (nullable = true)
|    |    |    |-- mod_reason_by: string (nullable = true)
|    |    |    |-- mod_reason_title: string (nullable = true)
|    |    |    |-- mod_reports: array (nullable = true)
|    |    |    |    |-- element: string (containsNull = true)
|    |    |    |-- name: string (nullable = true)
|    |    |    |-- no_follow: boolean (nullable = true)
|    |    |    |-- num_comments: long (nullable = true)
|    |    |    |-- num_crossposts: long (nullable = true)
|    |    |    |-- num_reports: string (nullable = true)
|    |    |    |-- over_18: boolean (nullable = true)
|    |    |    |-- parent_whitelist_status: string (nullable = true)
|    |    |    |-- permalink: string (nullable = true)
|    |    |    |-- pinned: boolean (nullable = true)
|    |    |    |-- pwls: long (nullable = true)
|    |    |    |-- quarantine: boolean (nullable = true)
|    |    |    |-- removal_reason: string (nullable = true)
|    |    |    |-- report_reasons: string (nullable = true)
|    |    |    |-- saved: boolean (nullable = true)
|    |    |    |-- score: long (nullable = true)
|    |    |    |-- secure_media: string (nullable = true)
|    |    |    |-- selftext: string (nullable = true)
|    |    |    |-- selftext_html: string (nullable = true)
|    |    |    |-- send_replies: boolean (nullable = true)
|    |    |    |-- spoiler: boolean (nullable = true)
|    |    |    |-- stickied: boolean (nullable = true)
|    |    |    |-- subreddit: string (nullable = true)
|    |    |    |-- subreddit_id: string (nullable = true)
|    |    |    |-- subreddit_name_prefixed: string (nullable = true)
|    |    |    |-- subreddit_subscribers: long (nullable = true)
|    |    |    |-- subreddit_type: string (nullable = true)
|    |    |    |-- suggested_sort: string (nullable = true)
|    |    |    |-- thumbnail: string (nullable = true)
|    |    |    |-- title: string (nullable = true)
|    |    |    |-- ups: long (nullable = true)
|    |    |    |-- url: string (nullable = true)
|    |    |    |-- user_reports: array (nullable = true)
|    |    |    |    |-- element: string (containsNull = true)
|    |    |    |-- view_count: string (nullable = true)
|    |    |    |-- visited: boolean (nullable = true)
|    |    |    |-- whitelist_status: string (nullable = true)
|    |    |    |-- wls: long (nullable = true)
|    |-- distinguished: string (nullable = true)
|    |-- domain: string (nullable = true)
|    |-- downs: long (nullable = true)
|    |-- edited: boolean (nullable = true)
|    |-- gilded: long (nullable = true)
|    |-- gildings: struct (nullable = true)
|    |    |-- gid_1: long (nullable = true)
|    |    |-- gid_2: long (nullable = true)
|    |    |-- gid_3: long (nullable = true)
|    |-- hidden: boolean (nullable = true)
|    |-- hide_score: boolean (nullable = true)
|    |-- id: string (nullable = true)
|    |-- is_crosspostable: boolean (nullable = true)
|    |-- is_meta: boolean (nullable = true)
|    |-- is_original_content: boolean (nullable = true)
|    |-- is_reddit_media_domain: boolean (nullable = true)
|    |-- is_robot_indexable: boolean (nullable = true)
|    |-- is_self: boolean (nullable = true)
|    |-- is_video: boolean (nullable = true)
|    |-- likes: string (nullable = true)
|    |-- link_flair_background_color: string (nullable = true)
|    |-- link_flair_css_class: string (nullable = true)
|    |-- link_flair_richtext: array (nullable = true)
|    |    |-- element: string (containsNull = true)
|    |-- link_flair_template_id: string (nullable = true)
|    |-- link_flair_text: string (nullable = true)
|    |-- link_flair_text_color: string (nullable = true)
|    |-- link_flair_type: string (nullable = true)
|    |-- locked: boolean (nullable = true)
|    |-- media: string (nullable = true)
|    |-- media_only: boolean (nullable = true)
|    |-- mod_note: string (nullable = true)
|    |-- mod_reason_by: string (nullable = true)
|    |-- mod_reason_title: string (nullable = true)
|    |-- mod_reports: array (nullable = true)
|    |    |-- element: string (containsNull = true)
|    |-- name: string (nullable = true)
|    |-- no_follow: boolean (nullable = true)
|    |-- num_comments: long (nullable = true)
|    |-- num_crossposts: long (nullable = true)
|    |-- num_reports: string (nullable = true)
|    |-- over_18: boolean (nullable = true)
|    |-- parent_whitelist_status: string (nullable = true)
|    |-- permalink: string (nullable = true)
|    |-- pinned: boolean (nullable = true)
|    |-- pwls: long (nullable = true)
|    |-- quarantine: boolean (nullable = true)
|    |-- removal_reason: string (nullable = true)
|    |-- report_reasons: string (nullable = true)
|    |-- saved: boolean (nullable = true)
|    |-- score: long (nullable = true)
|    |-- secure_media: string (nullable = true)
|    |-- selftext: string (nullable = true)
|    |-- selftext_html: string (nullable = true)
|    |-- send_replies: boolean (nullable = true)
|    |-- spoiler: boolean (nullable = true)
|    |-- stickied: boolean (nullable = true)
|    |-- subreddit: string (nullable = true)
|    |-- subreddit_id: string (nullable = true)
|    |-- subreddit_name_prefixed: string (nullable = true)
|    |-- subreddit_subscribers: long (nullable = true)
|    |-- subreddit_type: string (nullable = true)
|    |-- suggested_sort: string (nullable = true)
|    |-- thumbnail: string (nullable = true)
|    |-- title: string (nullable = true)
|    |-- ups: long (nullable = true)
|    |-- url: string (nullable = true)
|    |-- user_reports: array (nullable = true)
|    |    |-- element: string (containsNull = true)
|    |-- view_count: string (nullable = true)
|    |-- visited: boolean (nullable = true)
|    |-- whitelist_status: string (nullable = true)
|    |-- wls: long (nullable = true)
|-- kind: string (nullable = true)