-
Notifications
You must be signed in to change notification settings - Fork 98
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Append schema functionality for review & comments. (#86)
* Added files for schema append functionality * Update test_append_if_schema_identical.py * Made the changes as per the review comments * Made the changes as per the review comments & added comments for better readability. * Made the changes as per the review comments & added comments for better readability.
- Loading branch information
1 parent
e1d0a84
commit 04571ef
Showing
3 changed files
with
62 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
from pyspark.sql import DataFrame | ||
|
||
|
||
class SchemaMismatchError(ValueError):
    """Raised when the source and target DataFrame schemas do not match."""
|
||
|
||
def append_if_schema_identical(source_df: DataFrame, target_df: DataFrame) -> DataFrame:
    """Append ``source_df`` onto ``target_df`` if their schemas are identical.

    Schemas are compared by column name and data type, ignoring column
    order; the append itself is done with ``unionByName`` so column order
    never matters.

    :param source_df: DataFrame whose rows are appended.
    :type source_df: pyspark.sql.DataFrame
    :param target_df: DataFrame the rows are appended to.
    :type target_df: pyspark.sql.DataFrame
    :return: ``target_df`` with the rows of ``source_df`` appended.
    :rtype: pyspark.sql.DataFrame
    :raises SchemaMismatchError: if the schemas differ in column names or types.
    """
    # Represent each schema as order-independent, comparable (name, type) pairs.
    source_schema_list = [(field.name, str(field.dataType)) for field in source_df.schema]
    target_schema_list = [(field.name, str(field.dataType)) for field in target_df.schema]

    # A single sorted comparison covers both cases the original code checked
    # separately (name-set mismatch and name/type mismatch): a differing
    # column name also changes the sorted pair lists, and both branches
    # raised the identical error.  Build the message lazily — only on failure.
    if sorted(source_schema_list) != sorted(target_schema_list):
        unmatched_cols = [col for col in source_schema_list if col not in target_schema_list]
        error_message = "The schemas of the source and target dataframes are not identical." \
                        f"From source schema column {unmatched_cols} is missing in target schema"
        raise SchemaMismatchError(error_message)

    # Schemas agree: append the rows, matching columns by name.
    appended_df = target_df.unionByName(source_df)
    return appended_df
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
from pyspark.sql.types import StructType, StructField, IntegerType, StringType | ||
import quinn | ||
from tests.conftest import auto_inject_fixtures | ||
|
||
|
||
@auto_inject_fixtures("spark")
def test_append_if_schema_identical(spark):
    """append_if_schema_identical unions two DataFrames whose schemas match.

    The source and target share column names and types but in a different
    order, so the append must still succeed (union is by name).
    """
    source_data = [(1, "capetown", "Alice"), (2, "delhi", "Bob")]
    target_data = [(3, "Charlie", "New York"), (4, "Dave", "Los Angeles")]

    # Source column order: id, city, name.
    source_df = spark.createDataFrame(source_data, schema=StructType([
        StructField("id", IntegerType()),
        StructField("city", StringType()),
        StructField("name", StringType())
    ]))

    # Target column order: id, name, city — same names/types, different order.
    target_df = spark.createDataFrame(target_data, schema=StructType([
        StructField("id", IntegerType()),
        StructField("name", StringType()),
        StructField("city", StringType())
    ]))

    # Call the append_if_schema_identical function
    appended_df = quinn.append_if_schema_identical(source_df, target_df)

    # Fix: the original test called the function but asserted nothing —
    # verify the append actually produced the combined rows and columns.
    assert appended_df.count() == 4
    assert set(appended_df.columns) == {"id", "name", "city"}