Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updated #4

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
38 changes: 38 additions & 0 deletions .ipynb_checkpoints/config-checkpoint.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
main:
components_repository: "https://github.com/udacity/Project-Build-an-ML-Pipeline-Starter.git#components"
# All the intermediate files will be copied to this directory at the end of the run.
# Set this to null if you are running in prod
project_name: nyc_airbnb
experiment_name: development
steps: all
etl:
sample: "sample1.csv"
min_price: 10 # dollars
max_price: 350 # dollars
data_check:
kl_threshold: 0.2
modeling:
# Fraction of data to use for test (the remaining will be used for train and validation)
test_size: 0.2
# Fraction of remaining data to use for validation
val_size: 0.2
# Fix this for reproducibility, change to have new splits
random_seed: 42
# Column to use for stratification (use "none" for no stratification)
stratify_by: "neighbourhood_group"
# Maximum number of features to consider for the TF-IDF applied to the title of the
# listing (the column called "name")
max_tfidf_features: 5
# NOTE: you can put here any parameter that is accepted by the constructor of
# RandomForestRegressor. This is only a subset; more could be added:
random_forest:
n_estimators: 100
max_depth: 15
min_samples_split: 4
min_samples_leaf: 3
# Here -1 means all available cores
n_jobs: -1
criterion: mae
max_features: 0.5
# Do not change the following
oob_score: true
148 changes: 148 additions & 0 deletions .ipynb_checkpoints/main-checkpoint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
import json

import mlflow
import tempfile
import os
import wandb
import hydra
from omegaconf import DictConfig

# Default pipeline steps, in execution order. go() uses this list as-is when
# the configured `main.steps` value is "all".
_steps = [
"download",
"basic_cleaning",
"data_check",
"data_split",
"train_random_forest",
# NOTE: We do not include this in the steps so it is not run by mistake.
# You first need to promote a model export to "prod" before you can run this,
# then you need to run this step explicitly
# "test_regression_model"
]


# This automatically reads in the configuration
@hydra.main(config_name='config')
def go(config: DictConfig):
    """Run the selected pipeline steps, each as an MLflow project.

    The steps to run come from ``config["main"]["steps"]``: either the
    literal string ``"all"`` (run every step listed in ``_steps``) or a
    comma-separated list of step names. ``test_regression_model`` is not part
    of ``_steps``, so it only runs when it is named explicitly.

    Args:
        config: Hydra configuration. This function reads the ``main``,
            ``etl``, ``data_check`` and ``modeling`` sections.
    """
    # Setup the wandb experiment. All runs will be grouped under this name
    os.environ["WANDB_PROJECT"] = config["main"]["project_name"]
    os.environ["WANDB_RUN_GROUP"] = config["main"]["experiment_name"]

    # Steps to execute. Whitespace around each name is dropped so that a value
    # such as "download, basic_cleaning" selects both steps.
    steps_par = config['main']['steps']
    if steps_par != "all":
        active_steps = [step.strip() for step in steps_par.split(",")]
    else:
        active_steps = _steps

    # A temporary directory is created for the duration of the run and removed
    # afterwards; none of the steps below currently reference tmp_dir.
    with tempfile.TemporaryDirectory() as tmp_dir:

        if "download" in active_steps:
            # Download file and load in W&B
            _ = mlflow.run(
                f"{config['main']['components_repository']}/get_data",
                "main",
                parameters={
                    "sample": config["etl"]["sample"],
                    "artifact_name": "sample.csv",
                    "artifact_type": "raw_data",
                    "artifact_description": "Raw file as downloaded"
                },
            )

        if "basic_cleaning" in active_steps:
            # Local step: filter the raw sample by the configured price range
            _ = mlflow.run(
                os.path.join(hydra.utils.get_original_cwd(), "src", "basic_cleaning"),
                "main",
                parameters={
                    "input_artifact": "sample.csv:latest",
                    "output_artifact": "clean_sample.csv",
                    "output_type": "clean_sample",
                    "output_description": "Data with outliers and null values removed",
                    "min_price": config['etl']['min_price'],
                    "max_price": config['etl']['max_price']
                },
            )

        if "data_check" in active_steps:
            # Local step: compare the latest cleaned data against the artifact
            # tagged "reference"
            _ = mlflow.run(
                os.path.join(hydra.utils.get_original_cwd(), "src", "data_check"),
                "main",
                parameters={
                    "csv": "clean_sample.csv:latest",
                    "ref": "clean_sample.csv:reference",
                    "kl_threshold": config["data_check"]["kl_threshold"],
                    "min_price": config['etl']['min_price'],
                    "max_price": config['etl']['max_price']
                }
            )

        if "data_split" in active_steps:
            # Remote component from the components repository
            _ = mlflow.run(
                f"{config['main']['components_repository']}/train_val_test_split",
                'main',
                parameters={
                    'input': 'clean_sample.csv:latest',
                    'test_size': config['modeling']['test_size'],
                    'random_seed': config['modeling']['random_seed'],
                    'stratify_by': config['modeling']['stratify_by']
                }
            )

        if "train_random_forest" in active_steps:

            # NOTE: we need to serialize the random forest configuration into JSON
            rf_config = os.path.abspath("rf_config.json")
            with open(rf_config, "w+") as fp:
                json.dump(dict(config["modeling"]["random_forest"].items()), fp)  # DO NOT TOUCH

            # The JSON file written above is handed to the training step as
            # its rf_config parameter.
            _ = mlflow.run(
                os.path.join(hydra.utils.get_original_cwd(), 'src', 'train_random_forest'),
                'main',
                parameters={
                    'trainval_artifact': 'trainval_data.csv:latest',
                    'val_size': config['modeling']['val_size'],
                    'random_seed': config['modeling']['random_seed'],
                    'stratify_by': config['modeling']['stratify_by'],
                    'rf_config': rf_config,
                    'max_tfidf_features': config['modeling']['max_tfidf_features'],
                    'output_artifact': 'random_forest_export'
                }
            )

        if "test_regression_model" in active_steps:
            # Only reached when the step is requested explicitly; it evaluates
            # the model export tagged "prod".
            _ = mlflow.run(
                f"{config['main']['components_repository']}/test_regression_model",
                'main',
                parameters={
                    'mlflow_model': 'random_forest_export:prod',
                    'test_dataset': 'test_data.csv:latest'
                }
            )



# Script entry point: run the pipeline when this file is executed directly.
# go() is called without arguments; the @hydra.main decorator on it reads in
# the configuration.
if __name__ == "__main__":
go()
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
GitHub link: https://github.com/ebs-well/Project-Build-an-ML-Pipeline-Starter

Weights and Biases link: https://wandb.ai/ebs-stilwell/nyc_airbnb?nw=nwuserebsstilwell


# Build an ML Pipeline for Short-Term Rental Prices in NYC
You are working for a property management company renting rooms and properties for short periods of
time on various rental platforms. You need to estimate the typical price for a given property based
Expand Down
63 changes: 58 additions & 5 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,19 +52,52 @@ def go(config: DictConfig):
##################
# Implement here #
##################
pass
if "basic_cleaning" in active_steps:
_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "src", "basic_cleaning"),
"main",
parameters={
"input_artifact": "sample.csv:latest",
"output_artifact": "clean_sample.csv",
"output_type": "clean_sample",
"output_description": "Data with outliers and null values removed",
"min_price": config['etl']['min_price'],
"max_price": config['etl']['max_price']
},
)

if "data_check" in active_steps:
##################
# Implement here #
##################
pass
_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "src", "data_check"),
"main",
parameters={
"csv": "clean_sample.csv:latest",
"ref": "clean_sample.csv:reference",
"kl_threshold": config["data_check"]["kl_threshold"],
"min_price": config['etl']['min_price'],
"max_price": config['etl']['max_price']
}
)


if "data_split" in active_steps:
##################
# Implement here #
##################
pass
_ = mlflow.run(
f"{config['main']['components_repository']}/train_val_test_split",
'main',
parameters = {
'input': 'clean_sample.csv:latest',
'test_size': config['modeling']['test_size'],
'random_seed': config['modeling']['random_seed'],
'stratify_by': config['modeling']['stratify_by']
}
)


if "train_random_forest" in active_steps:

Expand All @@ -80,15 +113,35 @@ def go(config: DictConfig):
# Implement here #
##################

pass
_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), 'src', 'train_random_forest'),
'main',
parameters = {
'trainval_artifact': 'trainval_data.csv:latest',
'val_size': config['modeling']['val_size'],
'random_seed': config['modeling']['random_seed'],
'stratify_by': config['modeling']['stratify_by'],
'rf_config': rf_config,
'max_tfidf_features': config['modeling']['max_tfidf_features'],
'output_artifact': 'random_forest_export'
}
)

if "test_regression_model" in active_steps:

##################
# Implement here #
##################

pass
_ = mlflow.run(
f"{config['main']['components_repository']}/test_regression_model",
'main',
parameters = {
'mlflow_model': 'random_forest_export:prod',
'test_dataset': 'test_data.csv:latest'
}
)



if __name__ == "__main__":
Expand Down
15 changes: 15 additions & 0 deletions mlruns/0/0ddc1d15b4254255aa54971028dcc094/meta.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
artifact_uri: file:///home/falcon/Project-Build-an-ML-Pipeline-Starter/mlruns/0/0ddc1d15b4254255aa54971028dcc094/artifacts
end_time: 1710648182250
entry_point_name: ''
experiment_id: '0'
lifecycle_stage: active
run_id: 0ddc1d15b4254255aa54971028dcc094
run_name: bustling-auk-59
run_uuid: 0ddc1d15b4254255aa54971028dcc094
source_name: ''
source_type: 4
source_version: ''
start_time: 1710648088707
status: 3
tags: []
user_id: falcon
1 change: 1 addition & 0 deletions mlruns/0/0ddc1d15b4254255aa54971028dcc094/params/input
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
clean_sample.csv:latest
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
42
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
neighbourhood_group
1 change: 1 addition & 0 deletions mlruns/0/0ddc1d15b4254255aa54971028dcc094/params/test_size
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0.2
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
https://github.com/udacity/Project-Build-an-ML-Pipeline-Starter.git
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
local
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
main
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
conda
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
bustling-auk-59
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
077621b0ed54d88e8c2f5ef3dea7b725c4ea7c98
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
https://github.com/udacity/Project-Build-an-ML-Pipeline-Starter.git
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
https://github.com/udacity/Project-Build-an-ML-Pipeline-Starter.git#components/train_val_test_split
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
PROJECT
1 change: 1 addition & 0 deletions mlruns/0/0ddc1d15b4254255aa54971028dcc094/tags/mlflow.user
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
falcon
15 changes: 15 additions & 0 deletions mlruns/0/18a1ae29ce954264bdcdbb27cd185d03/meta.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
artifact_uri: file:///home/falcon/Project-Build-an-ML-Pipeline-Starter/mlruns/0/18a1ae29ce954264bdcdbb27cd185d03/artifacts
end_time: 1710646146028
entry_point_name: ''
experiment_id: '0'
lifecycle_stage: active
run_id: 18a1ae29ce954264bdcdbb27cd185d03
run_name: angry-shrike-709
run_uuid: 18a1ae29ce954264bdcdbb27cd185d03
source_name: ''
source_type: 4
source_version: ''
start_time: 1710646110734
status: 3
tags: []
user_id: falcon
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
sample.csv:latest
1 change: 1 addition & 0 deletions mlruns/0/18a1ae29ce954264bdcdbb27cd185d03/params/max_price
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
350
1 change: 1 addition & 0 deletions mlruns/0/18a1ae29ce954264bdcdbb27cd185d03/params/min_price
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
10
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
clean_sample.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
'Data with outliers and null values removed'
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
clean_sample
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
https://github.com/ebs-well/Project-Build-an-ML-Pipeline-Starter.git
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
local
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
main
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
conda
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
angry-shrike-709
Loading
Loading