Merge pull request #799 from dyvenia/dev
Release 0.4.21 PR
m-paz committed Oct 26, 2023
2 parents a6ced4d + 6be1ad8 commit 584fa05
Showing 44 changed files with 2,195 additions and 51 deletions.
12 changes: 10 additions & 2 deletions CHANGELOG.md
@@ -5,12 +5,20 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

## [0.4.21] - 2023-10-26
### Added
- Added `validate_df` task to task_utils.
- Added `SharepointList` source class.
- Added `SharepointListToDF` task class.
- Added `SharepointListToADLS` flow class.
- Added tests for `SharepointList`.
- Added `get_nested_dict` to `utils.py`.

### Fixed

### Changed

- Changed `GenesysToCSV` logic for `end_point == "conversations"` and added new fields to the extraction.

## [0.4.20] - 2023-10-12
### Added
@@ -618,4 +626,4 @@ specified in the `SUPERMETRICS_DEFAULT_USER` secret
- Moved from poetry to pip

### Fixed
- Fix `AzureBlobStorage`'s `to_storage()` method missing the final blob upload step
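
For orientation before the test diffs, here is a minimal sketch of the kind of checks the new `validate_df` task appears to perform, inferred from the `validate_df_dict` rules exercised in the tests below (`column_list_to_match`, `column_size`, `column_unique_values`). The function name matches the changelog entry, but the rule semantics and error handling are assumptions, not the PR's actual implementation.

import pandas as pd

from viadot.exceptions import ValidationError  # the exception the tests catch


def validate_df(df: pd.DataFrame, tests: dict) -> None:
    """Hypothetical sketch of the validation rules used in this PR's tests."""
    failed = []

    # column_list_to_match: the DataFrame must expose exactly these columns.
    expected = tests.get("column_list_to_match")
    if expected is not None and list(df.columns) != expected:
        failed.append("column_list_to_match")

    # column_size: every value in the column has the given string length.
    for col, size in tests.get("column_size", {}).items():
        if not df[col].astype(str).str.len().eq(size).all():
            failed.append(f"column_size: {col}")

    # column_unique_values: no duplicate values in the column.
    for col in tests.get("column_unique_values", []):
        if not df[col].is_unique:
            failed.append(f"column_unique_values: {col}")

    if failed:
        raise ValidationError(f"Validation failed for: {failed}")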
69 changes: 68 additions & 1 deletion tests/integration/flows/test_bigquery_to_adls.py
@@ -1,11 +1,16 @@
import os

import pendulum
import pytest
from unittest import mock
import pandas as pd

from prefect.tasks.secrets import PrefectSecret
from viadot.flows import BigQueryToADLS
from viadot.tasks import AzureDataLakeRemove

from viadot.exceptions import ValidationError

ADLS_DIR_PATH = "raw/tests/"
ADLS_FILE_NAME = str(pendulum.now("utc")) + ".parquet"
BIGQ_CREDENTIAL_KEY = "BIGQUERY-TESTS"
@@ -72,6 +77,68 @@ def test_bigquery_to_adls_false():
assert result.is_failed()
os.remove("test_bigquery_to_adls_overwrite_false.parquet")
os.remove("test_bigquery_to_adls_overwrite_false.json")


DATA = {
"type": ["banner", "banner"],
"country": ["PL", "DE"],
}


@mock.patch(
"viadot.tasks.BigQueryToDF.run",
return_value=pd.DataFrame(data=DATA),
)
@pytest.mark.run
def test_bigquery_to_adls_validate_df_fail(mocked_data):
flow_bigquery = BigQueryToADLS(
name="Test BigQuery to ADLS validate df fail",
dataset_name="official_empty",
table_name="space",
credentials_key=BIGQ_CREDENTIAL_KEY,
adls_file_name=ADLS_FILE_NAME,
overwrite_adls=True,
adls_dir_path=ADLS_DIR_PATH,
adls_sp_credentials_secret=ADLS_CREDENTIAL_SECRET,
validate_df_dict={"column_list_to_match": ["type", "country", "test"]},
)
    try:
        flow_bigquery.run()
    except ValidationError:
        pass

os.remove("test_bigquery_to_adls_validate_df_fail.parquet")
os.remove("test_bigquery_to_adls_validate_df_fail.json")


@mock.patch(
"viadot.tasks.BigQueryToDF.run",
return_value=pd.DataFrame(data=DATA),
)
@pytest.mark.run
def test_bigquery_to_adls_validate_df_success(mocked_data):
flow_bigquery = BigQueryToADLS(
name="Test BigQuery to ADLS validate df success",
dataset_name="official_empty",
table_name="space",
credentials_key=BIGQ_CREDENTIAL_KEY,
adls_file_name=ADLS_FILE_NAME,
overwrite_adls=True,
adls_dir_path=ADLS_DIR_PATH,
adls_sp_credentials_secret=ADLS_CREDENTIAL_SECRET,
validate_df_dict={"column_list_to_match": ["type", "country"]},
)
    result = flow_bigquery.run()
assert result.is_successful()

task_results = result.result.values()
assert all([task_result.is_successful() for task_result in task_results])

os.remove("test_bigquery_to_adls_validate_df_success.parquet")
os.remove("test_bigquery_to_adls_validate_df_success.json")

rm = AzureDataLakeRemove(
path=ADLS_DIR_PATH + ADLS_FILE_NAME, vault_name="azuwevelcrkeyv001s"
)
59 changes: 59 additions & 0 deletions tests/integration/flows/test_cloud_for_customers_report_to_adls.py
@@ -1,5 +1,6 @@
from viadot.config import local_config
from viadot.flows import CloudForCustomersReportToADLS
from viadot.exceptions import ValidationError


def test_cloud_for_customers_report_to_adls():
@@ -27,3 +28,61 @@ def test_cloud_for_customers_report_to_adls():

task_results = result.result.values()
assert all([task_result.is_successful() for task_result in task_results])

assert len(flow.tasks) == 6


def test_cloud_for_customers_report_to_adls_validation_fail(caplog):
credentials = local_config.get("CLOUD_FOR_CUSTOMERS")
credentials_prod = credentials["Prod"]
channels = ["VEL_B_AFS", "VEL_B_ASA"]
month = ["01"]
year = ["2021"]
flow = CloudForCustomersReportToADLS(
report_url=credentials_prod["server"],
env="Prod",
channels=channels,
months=month,
years=year,
name="test_c4c_report_to_adls",
        local_file_path="test_c4c_report_to_adls.csv",
adls_sp_credentials_secret=credentials["adls_sp_credentials_secret"],
adls_dir_path=credentials["adls_dir_path"],
validate_df_dict={"column_size": {"ChannelName ID": 10}},
)
    try:
        flow.run()
    except ValidationError:
        pass


def test_cloud_for_customers_report_to_adls_validation_success():
credentials = local_config.get("CLOUD_FOR_CUSTOMERS")
credentials_prod = credentials["Prod"]
channels = ["VEL_B_AFS", "VEL_B_ASA"]
month = ["01"]
year = ["2021"]
flow = CloudForCustomersReportToADLS(
report_url=credentials_prod["server"],
env="Prod",
channels=channels,
months=month,
years=year,
name="test_c4c_report_to_adls",
        local_file_path="test_c4c_report_to_adls.csv",
adls_sp_credentials_secret=credentials["adls_sp_credentials_secret"],
adls_dir_path=credentials["adls_dir_path"],
validate_df_dict={"column_size": {"ChannelName ID": 13}},
)

try:
result = flow.run()
except ValidationError:
assert False, "Validation failed but was expected to pass"

assert result.is_successful()

task_results = result.result.values()
assert all([task_result.is_successful() for task_result in task_results])

assert len(flow.tasks) == 7
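
The task-count assertions (6 tasks without `validate_df_dict`, 7 with it) suggest the flow appends the validation task only when a rule dict is supplied. A hypothetical sketch of that conditional wiring in a Prefect 1.x `gen_flow()`, with assumed task names (`report_to_df`, `df_to_file`):

def gen_flow(self) -> None:
    # Hypothetical sketch, not the PR's actual gen_flow() code.
    df = report_to_df.bind(report_url=self.report_url, flow=self)
    if self.validate_df_dict:
        # Binding this extra task is what bumps the flow's task count from 6 to 7.
        validate_df.bind(df=df, tests=self.validate_df_dict, flow=self)
    df_to_file.bind(df=df, path=self.local_file_path, flow=self)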
53 changes: 53 additions & 0 deletions tests/integration/flows/test_customer_gauge_to_adls.py
@@ -5,6 +5,7 @@
import pytest

from viadot.flows import CustomerGaugeToADLS
from viadot.exceptions import ValidationError

DATA = {
"user_name": ["Jane", "Bob"],
@@ -15,6 +16,7 @@
"user_address_country_name": "United States",
"user_address_country_code": "US",
}

COLUMNS = ["user_name", "user_address_street"]
ADLS_FILE_NAME = "test_customer_gauge.parquet"
ADLS_DIR_PATH = "raw/tests/"
@@ -40,3 +42,54 @@ def test_customer_gauge_to_adls_run_flow(mocked_class):
assert result.is_successful()
os.remove("test_customer_gauge_to_adls_flow_run.parquet")
os.remove("test_customer_gauge_to_adls_flow_run.json")


@mock.patch(
"viadot.tasks.CustomerGaugeToDF.run",
return_value=pd.DataFrame(data=DATA),
)
@pytest.mark.run
def test_customer_gauge_to_adls_run_flow_validation_success(mocked_class):
flow = CustomerGaugeToADLS(
"test_customer_gauge_to_adls_run_flow_validation_success",
endpoint="responses",
total_load=False,
anonymize=True,
columns_to_anonymize=COLUMNS,
adls_dir_path=ADLS_DIR_PATH,
adls_file_name=ADLS_FILE_NAME,
overwrite_adls=True,
validate_df_dict={"column_size": {"user_address_state": 2}},
)
result = flow.run()
assert result.is_successful()
assert len(flow.tasks) == 11

os.remove("test_customer_gauge_to_adls_run_flow_validation_success.parquet")
os.remove("test_customer_gauge_to_adls_run_flow_validation_success.json")


@mock.patch(
"viadot.tasks.CustomerGaugeToDF.run",
return_value=pd.DataFrame(data=DATA),
)
@pytest.mark.run
def test_customer_gauge_to_adls_run_flow_validation_failure(mocked_class):
flow = CustomerGaugeToADLS(
"test_customer_gauge_to_adls_run_flow_validation_failure",
endpoint="responses",
total_load=False,
anonymize=True,
columns_to_anonymize=COLUMNS,
adls_dir_path=ADLS_DIR_PATH,
adls_file_name=ADLS_FILE_NAME,
overwrite_adls=True,
validate_df_dict={"column_size": {"user_name": 5}},
)
try:
flow.run()
except ValidationError:
pass

os.remove("test_customer_gauge_to_adls_run_flow_validation_failure.parquet")
os.remove("test_customer_gauge_to_adls_run_flow_validation_failure.json")
26 changes: 25 additions & 1 deletion tests/integration/flows/test_eurostat_to_adls.py
@@ -6,7 +6,11 @@

from viadot.flows import EurostatToADLS

DATA = {"geo": ["PL", "DE", "NL"], "indicator": [35, 55, 77]}
DATA = {
"geo": ["PL", "DE", "NL"],
"indicator": [35, 55, 77],
"time": ["2023-01", "2023-51", "2023-07"],
}
ADLS_FILE_NAME = "test_eurostat.parquet"
ADLS_DIR_PATH = "raw/tests/"

@@ -28,3 +32,23 @@ def test_eurostat_to_adls_run_flow(mocked_class):
assert result.is_successful()
os.remove("test_eurostat_to_adls_flow_run.parquet")
os.remove("test_eurostat_to_adls_flow_run.json")


@mock.patch(
"viadot.tasks.EurostatToDF.run",
return_value=pd.DataFrame(data=DATA),
)
@pytest.mark.run
def test_validate_df(mocked_class):
flow = EurostatToADLS(
"test_validate_df",
dataset_code="ILC_DI04",
overwrite_adls=True,
validate_df_dict={"column_size": {"time": 7}},
adls_dir_path=ADLS_DIR_PATH,
adls_file_name=ADLS_FILE_NAME,
)
result = flow.run()
assert result.is_successful()
os.remove("test_validate_df.parquet")
os.remove("test_validate_df.json")
79 changes: 79 additions & 0 deletions tests/integration/flows/test_hubspot_to_adls.py
@@ -6,6 +6,7 @@
import pytest

from viadot.flows import HubspotToADLS
from viadot.exceptions import ValidationError

DATA = {
"id": {"0": "820306930"},
@@ -60,3 +61,81 @@ def test_hubspot_to_adls_flow_run(mocked_class):
assert result.is_successful()
os.remove("test_hubspot_to_adls_flow_run.parquet")
os.remove("test_hubspot_to_adls_flow_run.json")


@mock.patch(
"viadot.tasks.HubspotToDF.run",
return_value=pd.DataFrame(data=DATA),
)
@pytest.mark.run
def test_hubspot_to_adls_flow_run_validate_fail(mocked_class):
flow = HubspotToADLS(
"test_hubspot_to_adls_flow_run",
hubspot_credentials_key="HUBSPOT",
endpoint="line_items",
filters=[
{
"filters": [
{
"propertyName": "createdate",
"operator": "BETWEEN",
"highValue": "2021-01-01",
"value": "2021-01-01",
},
{"propertyName": "quantity", "operator": "EQ", "value": "2"},
]
},
{
"filters": [
{"propertyName": "amount", "operator": "EQ", "value": "3744.000"}
]
},
],
overwrite_adls=True,
adls_dir_path=ADLS_DIR_PATH,
adls_file_name=ADLS_FILE_NAME,
validate_df_dict={"column_size": {"id": 0}},
)
try:
flow.run()
except ValidationError:
pass


@mock.patch(
"viadot.tasks.HubspotToDF.run",
return_value=pd.DataFrame(data=DATA),
)
@pytest.mark.run
def test_hubspot_to_adls_flow_run_validate_success(mocked_class):
flow = HubspotToADLS(
"test_hubspot_to_adls_flow_run",
hubspot_credentials_key="HUBSPOT",
endpoint="line_items",
filters=[
{
"filters": [
{
"propertyName": "createdate",
"operator": "BETWEEN",
"highValue": "2021-01-01",
"value": "2021-01-01",
},
{"propertyName": "quantity", "operator": "EQ", "value": "2"},
]
},
{
"filters": [
{"propertyName": "amount", "operator": "EQ", "value": "3744.000"}
]
},
],
overwrite_adls=True,
adls_dir_path=ADLS_DIR_PATH,
adls_file_name=ADLS_FILE_NAME,
validate_df_dict={"column_unique_values": ["id"]},
)
result = flow.run()
assert result.is_successful()
os.remove("test_hubspot_to_adls_flow_run.parquet")
os.remove("test_hubspot_to_adls_flow_run.json")