udacity · ebs-well · Mar 17, 2024 · Mar 17, 2024 · Mar 17, 2024 · Mar 17, 2024
diff --git a/.ipynb_checkpoints/config-checkpoint.yaml b/.ipynb_checkpoints/config-checkpoint.yaml
@@ -0,0 +1,38 @@
+main:
+  components_repository: "https://github.com/udacity/Project-Build-an-ML-Pipeline-Starter.git#components"
+  # All the intermediate files will be copied to this directory at the end of the run.
+  # Set this to null if you are running in prod
+  project_name: nyc_airbnb
+  experiment_name: development
+  steps: all
+etl:
+  sample: "sample1.csv"
+  min_price: 10  # dollars
+  max_price: 350  # dollars
+data_check:
+  kl_threshold: 0.2
+modeling:
+  # Fraction of data to use for test (the remaining will be used for train and validation)
+  test_size: 0.2
+  # Fraction of remaining data to use for validation
+  val_size: 0.2
+  # Fix this for reproducibility, change to have new splits
+  random_seed: 42
+  # Column to use for stratification (use "none" for no stratification)
+  stratify_by: "neighbourhood_group"
+  # Maximum number of features to consider for the TFIDF applied to the title of the
+  # listing (the column called "name")
+  max_tfidf_features: 5
+  # NOTE: you can put here any parameter that is accepted by the constructor of
+  # RandomForestRegressor. This is a subset, but more could be added:
+  random_forest:
+    n_estimators: 100
+    max_depth: 15
+    min_samples_split: 4
+    min_samples_leaf: 3
+    # Here -1 means all available cores
+    n_jobs: -1
+    criterion: mae
+    max_features: 0.5
+    # Do not change the following
+    oob_score: true
diff --git a/.ipynb_checkpoints/main-checkpoint.py b/.ipynb_checkpoints/main-checkpoint.py
@@ -0,0 +1,148 @@
+import json
+
+import mlflow
+import tempfile
+import os
+import wandb
+import hydra
+from omegaconf import DictConfig
+
+# Steps that run when main.steps is set to "all"
+_steps = [
+    "download",
+    "basic_cleaning",
+    "data_check",
+    "data_split",
+    "train_random_forest",
+    # NOTE: "test_regression_model" is left out on purpose so it is not run by mistake;
+    # promote a model export to "prod" first, then request that step explicitly.
+    # "test_regression_model"
+]
+
+
+# This automatically reads in the configuration
+@hydra.main(config_name='config')
+def go(config: DictConfig):
+    """Run the selected pipeline steps as MLflow projects.
+
+    Sets the ``WANDB_PROJECT`` and ``WANDB_RUN_GROUP`` environment variables
+    from ``config["main"]`` and then launches every active step in order.
+
+    Args:
+        config: Hydra configuration. ``config["main"]["steps"]`` is either
+            ``"all"`` (run every entry of ``_steps``) or a comma-separated
+            list of step names.
+    """
+    # Setup the wandb experiment. All runs will be grouped under this name
+    os.environ["WANDB_PROJECT"] = config["main"]["project_name"]
+    os.environ["WANDB_RUN_GROUP"] = config["main"]["experiment_name"]
+
+    # Steps to execute. Names are stripped so that "download, data_check"
+    # selects the same steps as "download,data_check".
+    steps_par = config['main']['steps']
+    if steps_par == "all":
+        active_steps = _steps
+    else:
+        active_steps = [step.strip() for step in steps_par.split(",")]
+
+    # Temporary directory that exists for the duration of the run
+    # (tmp_dir itself is not referenced by any of the steps below)
+    with tempfile.TemporaryDirectory() as tmp_dir:
+
+        if "download" in active_steps:
+            # Download file and load in W&B
+            _ = mlflow.run(
+                f"{config['main']['components_repository']}/get_data",
+                "main",
+                parameters={
+                    "sample": config["etl"]["sample"],
+                    "artifact_name": "sample.csv",
+                    "artifact_type": "raw_data",
+                    "artifact_description": "Raw file as downloaded"
+                },
+            )
+
+        if "basic_cleaning" in active_steps:
+            # Launch the local src/basic_cleaning project, passing the price
+            # bounds from the etl section of the config
+            _ = mlflow.run(
+                os.path.join(hydra.utils.get_original_cwd(), "src", "basic_cleaning"),
+                "main",
+                parameters={
+                    "input_artifact": "sample.csv:latest",
+                    "output_artifact": "clean_sample.csv",
+                    "output_type": "clean_sample",
+                    "output_description": "Data with outliers and null values removed",
+                    "min_price": config['etl']['min_price'],
+                    "max_price": config['etl']['max_price']
+                },
+            )
+
+        if "data_check" in active_steps:
+            # Launch the local src/data_check project, passing the latest clean
+            # sample and the version tagged "reference"
+            _ = mlflow.run(
+                os.path.join(hydra.utils.get_original_cwd(), "src", "data_check"),
+                "main",
+                parameters={
+                    "csv": "clean_sample.csv:latest",
+                    "ref": "clean_sample.csv:reference",
+                    "kl_threshold": config["data_check"]["kl_threshold"],
+                    "min_price": config['etl']['min_price'],
+                    "max_price": config['etl']['max_price']
+                },
+            )
+
+        if "data_split" in active_steps:
+            # Launch the train_val_test_split component from the components
+            # repository on the latest clean sample
+            _ = mlflow.run(
+                f"{config['main']['components_repository']}/train_val_test_split",
+                'main',
+                parameters={
+                    'input': 'clean_sample.csv:latest',
+                    'test_size': config['modeling']['test_size'],
+                    'random_seed': config['modeling']['random_seed'],
+                    'stratify_by': config['modeling']['stratify_by']
+                },
+            )
+
+        if "train_random_forest" in active_steps:
+
+            # NOTE: we need to serialize the random forest configuration into JSON
+            rf_config = os.path.abspath("rf_config.json")
+            with open(rf_config, "w+") as fp:
+                json.dump(dict(config["modeling"]["random_forest"].items()), fp)  # DO NOT TOUCH
+
+            # NOTE: use the rf_config we just created as the rf_config parameter for the train_random_forest
+            # step
+            _ = mlflow.run(
+                os.path.join(hydra.utils.get_original_cwd(), 'src', 'train_random_forest'),
+                'main',
+                parameters={
+                    'trainval_artifact': 'trainval_data.csv:latest',
+                    'val_size': config['modeling']['val_size'],
+                    'random_seed': config['modeling']['random_seed'],
+                    'stratify_by': config['modeling']['stratify_by'],
+                    'rf_config': rf_config,
+                    'max_tfidf_features': config['modeling']['max_tfidf_features'],
+                    'output_artifact': 'random_forest_export'
+                },
+            )
+
+        if "test_regression_model" in active_steps:
+            # Not part of _steps, so this only runs when the step is requested
+            # explicitly; it uses the model export tagged "prod"
+            _ = mlflow.run(
+                f"{config['main']['components_repository']}/test_regression_model",
+                'main',
+                parameters={
+                    'mlflow_model': 'random_forest_export:prod',
+                    'test_dataset': 'test_data.csv:latest'
+                },
+            )
+
+
+
+if __name__ == "__main__":
+    go()  # no argument: the @hydra.main decorator supplies ``config``
diff --git a/README.md b/README.md
@@ -1,3 +1,8 @@
+GitHub link: https://github.com/ebs-well/Project-Build-an-ML-Pipeline-Starter
+
+Weights and Biases link: https://wandb.ai/ebs-stilwell/nyc_airbnb?nw=nwuserebsstilwell
+
+
 # Build an ML Pipeline for Short-Term Rental Prices in NYC
 You are working for a property management company renting rooms and properties for short periods of 
 time on various rental platforms. You need to estimate the typical price for a given property based 

diff --git a/main.py b/main.py
@@ -52,19 +52,52 @@ def go(config: DictConfig):
             ##################
             # Implement here #
             ##################
-            pass
+            # Launch the local src/basic_cleaning project with the etl price bounds
+            _ = mlflow.run(
+                os.path.join(hydra.utils.get_original_cwd(), "src", "basic_cleaning"),
+                "main",
+                parameters={
+                    "input_artifact": "sample.csv:latest",
+                    "output_artifact": "clean_sample.csv",
+                    "output_type": "clean_sample",
+                    "output_description": "Data with outliers and null values removed",
+                    "min_price": config['etl']['min_price'],
+                    "max_price": config['etl']['max_price']
+                },
+            )
 
         if "data_check" in active_steps:
             ##################
             # Implement here #
             ##################
-            pass
+            _ = mlflow.run(
+                os.path.join(hydra.utils.get_original_cwd(), "src", "data_check"),
+                "main",
+                parameters={
+                    "csv": "clean_sample.csv:latest",
+                    "ref": "clean_sample.csv:reference",
+                    "kl_threshold": config["data_check"]["kl_threshold"],
+                    "min_price": config['etl']['min_price'],
+                    "max_price": config['etl']['max_price']
+                }
+            )
+
 
         if "data_split" in active_steps:
             ##################
             # Implement here #
             ##################
-            pass
+            _ = mlflow.run(
+                f"{config['main']['components_repository']}/train_val_test_split",
+                'main',
+                parameters = {
+                    'input': 'clean_sample.csv:latest',
+                    'test_size': config['modeling']['test_size'],
+                    'random_seed': config['modeling']['random_seed'],
+                    'stratify_by': config['modeling']['stratify_by']
+                }
+            )
+
 
         if "train_random_forest" in active_steps:
 
@@ -80,15 +113,35 @@ def go(config: DictConfig):
             # Implement here #
             ##################
 
-            pass
+            _ = mlflow.run(
+                os.path.join(hydra.utils.get_original_cwd(), 'src', 'train_random_forest'),
+                'main',
+                parameters = {
+                    'trainval_artifact': 'trainval_data.csv:latest',
+                    'val_size': config['modeling']['val_size'],
+                    'random_seed': config['modeling']['random_seed'],
+                    'stratify_by': config['modeling']['stratify_by'],
+                    'rf_config': rf_config,
+                    'max_tfidf_features': config['modeling']['max_tfidf_features'],
+                    'output_artifact': 'random_forest_export'
+                }
+            )
 
         if "test_regression_model" in active_steps:
 
             ##################
             # Implement here #
             ##################
 
-            pass
+            _ = mlflow.run(
+                f"{config['main']['components_repository']}/test_regression_model",
+                'main',
+                parameters = {
+                    'mlflow_model': 'random_forest_export:prod',
+                    'test_dataset': 'test_data.csv:latest'
+                }
+            )
+
 
 
 if __name__ == "__main__":

diff --git a/mlruns/0/0ddc1d15b4254255aa54971028dcc094/meta.yaml b/mlruns/0/0ddc1d15b4254255aa54971028dcc094/meta.yaml
@@ -0,0 +1,15 @@
+artifact_uri: file:///home/falcon/Project-Build-an-ML-Pipeline-Starter/mlruns/0/0ddc1d15b4254255aa54971028dcc094/artifacts
+end_time: 1710648182250
+entry_point_name: ''
+experiment_id: '0'
+lifecycle_stage: active
+run_id: 0ddc1d15b4254255aa54971028dcc094
+run_name: bustling-auk-59
+run_uuid: 0ddc1d15b4254255aa54971028dcc094
+source_name: ''
+source_type: 4
+source_version: ''
+start_time: 1710648088707
+status: 3
+tags: []
+user_id: falcon
diff --git a/mlruns/0/0ddc1d15b4254255aa54971028dcc094/params/input b/mlruns/0/0ddc1d15b4254255aa54971028dcc094/params/input
@@ -0,0 +1 @@
+clean_sample.csv:latest
diff --git a/mlruns/0/0ddc1d15b4254255aa54971028dcc094/params/random_seed b/mlruns/0/0ddc1d15b4254255aa54971028dcc094/params/random_seed
@@ -0,0 +1 @@
+42
diff --git a/mlruns/0/0ddc1d15b4254255aa54971028dcc094/params/stratify_by b/mlruns/0/0ddc1d15b4254255aa54971028dcc094/params/stratify_by
@@ -0,0 +1 @@
+neighbourhood_group
diff --git a/mlruns/0/0ddc1d15b4254255aa54971028dcc094/params/test_size b/mlruns/0/0ddc1d15b4254255aa54971028dcc094/params/test_size
@@ -0,0 +1 @@
+0.2
diff --git a/mlruns/0/0ddc1d15b4254255aa54971028dcc094/tags/mlflow.gitRepoURL b/mlruns/0/0ddc1d15b4254255aa54971028dcc094/tags/mlflow.gitRepoURL
@@ -0,0 +1 @@
+https://github.com/udacity/Project-Build-an-ML-Pipeline-Starter.git
diff --git a/mlruns/0/0ddc1d15b4254255aa54971028dcc094/tags/mlflow.project.backend b/mlruns/0/0ddc1d15b4254255aa54971028dcc094/tags/mlflow.project.backend
@@ -0,0 +1 @@
+local
diff --git a/mlruns/0/0ddc1d15b4254255aa54971028dcc094/tags/mlflow.project.entryPoint b/mlruns/0/0ddc1d15b4254255aa54971028dcc094/tags/mlflow.project.entryPoint
@@ -0,0 +1 @@
+main
diff --git a/mlruns/0/0ddc1d15b4254255aa54971028dcc094/tags/mlflow.project.env b/mlruns/0/0ddc1d15b4254255aa54971028dcc094/tags/mlflow.project.env
@@ -0,0 +1 @@
+conda
diff --git a/mlruns/0/0ddc1d15b4254255aa54971028dcc094/tags/mlflow.runName b/mlruns/0/0ddc1d15b4254255aa54971028dcc094/tags/mlflow.runName
@@ -0,0 +1 @@
+bustling-auk-59
diff --git a/mlruns/0/0ddc1d15b4254255aa54971028dcc094/tags/mlflow.source.git.commit b/mlruns/0/0ddc1d15b4254255aa54971028dcc094/tags/mlflow.source.git.commit
@@ -0,0 +1 @@
+077621b0ed54d88e8c2f5ef3dea7b725c4ea7c98
diff --git a/mlruns/0/0ddc1d15b4254255aa54971028dcc094/tags/mlflow.source.git.repoURL b/mlruns/0/0ddc1d15b4254255aa54971028dcc094/tags/mlflow.source.git.repoURL
@@ -0,0 +1 @@
+https://github.com/udacity/Project-Build-an-ML-Pipeline-Starter.git
diff --git a/mlruns/0/0ddc1d15b4254255aa54971028dcc094/tags/mlflow.source.name b/mlruns/0/0ddc1d15b4254255aa54971028dcc094/tags/mlflow.source.name
@@ -0,0 +1 @@
+https://github.com/udacity/Project-Build-an-ML-Pipeline-Starter.git#components/train_val_test_split
diff --git a/mlruns/0/0ddc1d15b4254255aa54971028dcc094/tags/mlflow.source.type b/mlruns/0/0ddc1d15b4254255aa54971028dcc094/tags/mlflow.source.type
@@ -0,0 +1 @@
+PROJECT
diff --git a/mlruns/0/0ddc1d15b4254255aa54971028dcc094/tags/mlflow.user b/mlruns/0/0ddc1d15b4254255aa54971028dcc094/tags/mlflow.user
@@ -0,0 +1 @@
+falcon
diff --git a/mlruns/0/18a1ae29ce954264bdcdbb27cd185d03/meta.yaml b/mlruns/0/18a1ae29ce954264bdcdbb27cd185d03/meta.yaml
@@ -0,0 +1,15 @@
+artifact_uri: file:///home/falcon/Project-Build-an-ML-Pipeline-Starter/mlruns/0/18a1ae29ce954264bdcdbb27cd185d03/artifacts
+end_time: 1710646146028
+entry_point_name: ''
+experiment_id: '0'
+lifecycle_stage: active
+run_id: 18a1ae29ce954264bdcdbb27cd185d03
+run_name: angry-shrike-709
+run_uuid: 18a1ae29ce954264bdcdbb27cd185d03
+source_name: ''
+source_type: 4
+source_version: ''
+start_time: 1710646110734
+status: 3
+tags: []
+user_id: falcon
diff --git a/mlruns/0/18a1ae29ce954264bdcdbb27cd185d03/params/input_artifact b/mlruns/0/18a1ae29ce954264bdcdbb27cd185d03/params/input_artifact
@@ -0,0 +1 @@
+sample.csv:latest
diff --git a/mlruns/0/18a1ae29ce954264bdcdbb27cd185d03/params/max_price b/mlruns/0/18a1ae29ce954264bdcdbb27cd185d03/params/max_price
@@ -0,0 +1 @@
+350
diff --git a/mlruns/0/18a1ae29ce954264bdcdbb27cd185d03/params/min_price b/mlruns/0/18a1ae29ce954264bdcdbb27cd185d03/params/min_price
@@ -0,0 +1 @@
+10
diff --git a/mlruns/0/18a1ae29ce954264bdcdbb27cd185d03/params/output_artifact b/mlruns/0/18a1ae29ce954264bdcdbb27cd185d03/params/output_artifact
@@ -0,0 +1 @@
+clean_sample.csv
diff --git a/mlruns/0/18a1ae29ce954264bdcdbb27cd185d03/params/output_description b/mlruns/0/18a1ae29ce954264bdcdbb27cd185d03/params/output_description
@@ -0,0 +1 @@
+'Data with outliers and null values removed'
diff --git a/mlruns/0/18a1ae29ce954264bdcdbb27cd185d03/params/output_type b/mlruns/0/18a1ae29ce954264bdcdbb27cd185d03/params/output_type
@@ -0,0 +1 @@
+clean_sample
diff --git a/mlruns/0/18a1ae29ce954264bdcdbb27cd185d03/tags/mlflow.gitRepoURL b/mlruns/0/18a1ae29ce954264bdcdbb27cd185d03/tags/mlflow.gitRepoURL
@@ -0,0 +1 @@
+https://github.com/ebs-well/Project-Build-an-ML-Pipeline-Starter.git
diff --git a/mlruns/0/18a1ae29ce954264bdcdbb27cd185d03/tags/mlflow.project.backend b/mlruns/0/18a1ae29ce954264bdcdbb27cd185d03/tags/mlflow.project.backend
@@ -0,0 +1 @@
+local
diff --git a/mlruns/0/18a1ae29ce954264bdcdbb27cd185d03/tags/mlflow.project.entryPoint b/mlruns/0/18a1ae29ce954264bdcdbb27cd185d03/tags/mlflow.project.entryPoint
@@ -0,0 +1 @@
+main
diff --git a/mlruns/0/18a1ae29ce954264bdcdbb27cd185d03/tags/mlflow.project.env b/mlruns/0/18a1ae29ce954264bdcdbb27cd185d03/tags/mlflow.project.env
@@ -0,0 +1 @@
+conda
diff --git a/mlruns/0/18a1ae29ce954264bdcdbb27cd185d03/tags/mlflow.runName b/mlruns/0/18a1ae29ce954264bdcdbb27cd185d03/tags/mlflow.runName
@@ -0,0 +1 @@
+angry-shrike-709