
Regressionusecase

An end-to-end ML regression use case on the Ames housing dataset.

Import the relevant libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

Load the dataset

dataset = pd.read_csv("./ames_unprocessed_data.csv")
# Check the basic info
dataset.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 21 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MSSubClass    1460 non-null   int64  
 1   MSZoning      1460 non-null   object 
 2   LotFrontage   1201 non-null   float64
 3   LotArea       1460 non-null   int64  
 4   Neighborhood  1460 non-null   object 
 5   BldgType      1460 non-null   object 
 6   HouseStyle    1460 non-null   object 
 7   OverallQual   1460 non-null   int64  
 8   OverallCond   1460 non-null   int64  
 9   YearBuilt     1460 non-null   int64  
 10  Remodeled     1460 non-null   int64  
 11  GrLivArea     1460 non-null   int64  
 12  BsmtFullBath  1460 non-null   int64  
 13  BsmtHalfBath  1460 non-null   int64  
 14  FullBath      1460 non-null   int64  
 15  HalfBath      1460 non-null   int64  
 16  BedroomAbvGr  1460 non-null   int64  
 17  Fireplaces    1460 non-null   int64  
 18  GarageArea    1460 non-null   int64  
 19  PavedDrive    1460 non-null   object 
 20  SalePrice     1460 non-null   int64  
dtypes: float64(1), int64(15), object(5)
memory usage: 239.7+ KB

Handle missing values

# Keep an unmodified copy for the Pipeline section later, then check which columns have missing values
data = dataset.copy()
dataset.isna().sum()
MSSubClass        0
MSZoning          0
LotFrontage     259
LotArea           0
Neighborhood      0
BldgType          0
HouseStyle        0
OverallQual       0
OverallCond       0
YearBuilt         0
Remodeled         0
GrLivArea         0
BsmtFullBath      0
BsmtHalfBath      0
FullBath          0
HalfBath          0
BedroomAbvGr      0
Fireplaces        0
GarageArea        0
PavedDrive        0
SalePrice         0
dtype: int64
# Use sklearn's SimpleImputer class to handle all missing values
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# fit the imputer on numerical columns
int_cols = dataset.select_dtypes(include=['float', 'int']).columns.to_list()
imputer.fit(dataset[int_cols])
dataset[int_cols] = imputer.transform(dataset[int_cols])
dataset.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 21 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MSSubClass    1460 non-null   float64
 1   MSZoning      1460 non-null   object 
 2   LotFrontage   1460 non-null   float64
 3   LotArea       1460 non-null   float64
 4   Neighborhood  1460 non-null   object 
 5   BldgType      1460 non-null   object 
 6   HouseStyle    1460 non-null   object 
 7   OverallQual   1460 non-null   float64
 8   OverallCond   1460 non-null   float64
 9   YearBuilt     1460 non-null   float64
 10  Remodeled     1460 non-null   float64
 11  GrLivArea     1460 non-null   float64
 12  BsmtFullBath  1460 non-null   float64
 13  BsmtHalfBath  1460 non-null   float64
 14  FullBath      1460 non-null   float64
 15  HalfBath      1460 non-null   float64
 16  BedroomAbvGr  1460 non-null   float64
 17  Fireplaces    1460 non-null   float64
 18  GarageArea    1460 non-null   float64
 19  PavedDrive    1460 non-null   object 
 20  SalePrice     1460 non-null   float64
dtypes: float64(16), object(5)
memory usage: 239.7+ KB
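
The fitted imputer stores the per-column means it learned, which makes it easy to check what value replaced the 259 missing LotFrontage entries. A minimal sketch, not part of the original notebook:

# Inspect the fill values the imputer learned for each numerical column
fill_values = pd.Series(imputer.statistics_, index=int_cols)
print(fill_values['LotFrontage'])  # the mean LotFrontage used for imputation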

Checking the distribution of continuous variables

import seaborn as sns
sns.set()
dataset[int_cols].hist(figsize=(20,8))
plt.show()

[Figure: histograms of the numerical columns]

Feature Scaling

dataset
MSSubClass MSZoning LotFrontage LotArea Neighborhood BldgType HouseStyle OverallQual OverallCond YearBuilt ... GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr Fireplaces GarageArea PavedDrive SalePrice
0 60.0 RL 65.0 8450.0 CollgCr 1Fam 2Story 7.0 5.0 2003.0 ... 1710.0 1.0 0.0 2.0 1.0 3.0 0.0 548.0 Y 208500.0
1 20.0 RL 80.0 9600.0 Veenker 1Fam 1Story 6.0 8.0 1976.0 ... 1262.0 0.0 1.0 2.0 0.0 3.0 1.0 460.0 Y 181500.0
2 60.0 RL 68.0 11250.0 CollgCr 1Fam 2Story 7.0 5.0 2001.0 ... 1786.0 1.0 0.0 2.0 1.0 3.0 1.0 608.0 Y 223500.0
3 70.0 RL 60.0 9550.0 Crawfor 1Fam 2Story 7.0 5.0 1915.0 ... 1717.0 1.0 0.0 1.0 0.0 3.0 1.0 642.0 Y 140000.0
4 60.0 RL 84.0 14260.0 NoRidge 1Fam 2Story 8.0 5.0 2000.0 ... 2198.0 1.0 0.0 2.0 1.0 4.0 1.0 836.0 Y 250000.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1455 60.0 RL 62.0 7917.0 Gilbert 1Fam 2Story 6.0 5.0 1999.0 ... 1647.0 0.0 0.0 2.0 1.0 3.0 1.0 460.0 Y 175000.0
1456 20.0 RL 85.0 13175.0 NWAmes 1Fam 1Story 6.0 6.0 1978.0 ... 2073.0 1.0 0.0 2.0 0.0 3.0 2.0 500.0 Y 210000.0
1457 70.0 RL 66.0 9042.0 Crawfor 1Fam 2Story 7.0 9.0 1941.0 ... 2340.0 0.0 0.0 2.0 0.0 4.0 2.0 252.0 Y 266500.0
1458 20.0 RL 68.0 9717.0 NAmes 1Fam 1Story 5.0 6.0 1950.0 ... 1078.0 1.0 0.0 1.0 0.0 2.0 0.0 240.0 Y 142125.0
1459 20.0 RL 75.0 9937.0 Edwards 1Fam 1Story 5.0 6.0 1965.0 ... 1256.0 1.0 0.0 1.0 1.0 3.0 0.0 276.0 Y 147500.0

1460 rows × 21 columns

# Create the features and label
X = dataset.iloc[:,:-1]
y = dataset.iloc[:,-1].values
y = y.reshape(len(y), 1)
sc_x = StandardScaler()
sc_y = StandardScaler()
# Standardise the numerical feature columns (int_cols[:-1] drops SalePrice, which is scaled separately as y)
X[int_cols[:-1]] = sc_x.fit_transform(X[int_cols[:-1]])
y = sc_y.fit_transform(y)
X
MSSubClass MSZoning LotFrontage LotArea Neighborhood BldgType HouseStyle OverallQual OverallCond YearBuilt Remodeled GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr Fireplaces GarageArea PavedDrive
0 0.073375 RL -0.229372 -0.207142 CollgCr 1Fam 2Story 0.651479 -0.517200 1.050994 -0.954460 0.370333 1.107810 -0.241061 0.789741 1.227585 0.163779 -0.951226 0.351000 Y
1 -0.872563 RL 0.451936 -0.091886 Veenker 1Fam 1Story -0.071836 2.179628 0.156734 -0.954460 -0.482512 -0.819964 3.948809 0.789741 -0.761621 0.163779 0.600495 -0.060731 Y
2 0.073375 RL -0.093110 0.073480 CollgCr 1Fam 2Story 0.651479 -0.517200 0.984752 1.047712 0.515013 1.107810 -0.241061 0.789741 1.227585 0.163779 0.600495 0.631726 Y
3 0.309859 RL -0.456474 -0.096897 Crawfor 1Fam 2Story 0.651479 -0.517200 -1.863632 1.047712 0.383659 1.107810 -0.241061 -1.026041 -0.761621 0.163779 0.600495 0.790804 Y
4 0.073375 RL 0.633618 0.375148 NoRidge 1Fam 2Story 1.374795 -0.517200 0.951632 -0.954460 1.299326 1.107810 -0.241061 0.789741 1.227585 1.390023 0.600495 1.698485 Y
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1455 0.073375 RL -0.365633 -0.260560 Gilbert 1Fam 2Story -0.071836 -0.517200 0.918511 1.047712 0.250402 -0.819964 -0.241061 0.789741 1.227585 0.163779 0.600495 -0.060731 Y
1456 -0.872563 RL 0.679039 0.266407 NWAmes 1Fam 1Story -0.071836 0.381743 0.222975 1.047712 1.061367 1.107810 -0.241061 0.789741 -0.761621 0.163779 2.152216 0.126420 Y
1457 0.309859 RL -0.183951 -0.147810 Crawfor 1Fam 2Story 0.651479 3.078570 -1.002492 1.047712 1.569647 -0.819964 -0.241061 0.789741 -0.761621 1.390023 2.152216 -1.033914 Y
1458 -0.872563 RL -0.093110 -0.080160 NAmes 1Fam 1Story -0.795151 0.381743 -0.704406 1.047712 -0.832788 1.107810 -0.241061 -1.026041 -0.761621 -1.062465 -0.951226 -1.090059 Y
1459 -0.872563 RL 0.224833 -0.058112 Edwards 1Fam 1Story -0.795151 0.381743 -0.207594 -0.954460 -0.493934 1.107810 -0.241061 -1.026041 1.227585 0.163779 -0.951226 -0.921624 Y

1460 rows × 20 columns
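
Because the target is scaled with its own StandardScaler, any prediction made in the scaled space has to be mapped back before it is interpretable as a price. A minimal sketch, assuming the fitted sc_y from above:

# Map scaled targets (or predictions) back to the original SalePrice units
y_dollars = sc_y.inverse_transform(y[:5])
print(y_dollars.ravel())  # first five sale prices on the original scale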

Encoding Categorical Variables

# Check the number of categories for each categorical column
print("Unique Categories for each categorical columns\n")
for col in dataset.select_dtypes(include=['object']).columns.to_list():
    print(col, '\n--------------')
    print(dataset[col].unique(), '\n')
Unique categories for each categorical column

MSZoning 
--------------
['RL' 'RM' 'C (all)' 'FV' 'RH'] 

Neighborhood 
--------------
['CollgCr' 'Veenker' 'Crawfor' 'NoRidge' 'Mitchel' 'Somerst' 'NWAmes'
 'OldTown' 'BrkSide' 'Sawyer' 'NridgHt' 'NAmes' 'SawyerW' 'IDOTRR'
 'MeadowV' 'Edwards' 'Timber' 'Gilbert' 'StoneBr' 'ClearCr' 'NPkVill'
 'Blmngtn' 'BrDale' 'SWISU' 'Blueste'] 

BldgType 
--------------
['1Fam' '2fmCon' 'Duplex' 'TwnhsE' 'Twnhs'] 

HouseStyle 
--------------
['2Story' '1Story' '1.5Fin' '1.5Unf' 'SFoyer' 'SLvl' '2.5Unf' '2.5Fin'] 

PavedDrive 
--------------
['Y' 'N' 'P'] 
# One-hot encode the categorical columns (indices 1, 4, 5, 6, 19: MSZoning, Neighborhood, BldgType, HouseStyle, PavedDrive)
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 4, 5, 6, 19])], remainder='passthrough')
X = ct.fit_transform(X)
X
array([[ 0.        ,  0.        ,  0.        , ...,  0.16377912,
        -0.95122649,  0.35100032],
       [ 0.        ,  0.        ,  0.        , ...,  0.16377912,
         0.60049493, -0.06073101],
       [ 0.        ,  0.        ,  0.        , ...,  0.16377912,
         0.60049493,  0.63172623],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  1.39002276,
         2.15221636, -1.03391416],
       [ 0.        ,  0.        ,  0.        , ..., -1.06246453,
        -0.95122649, -1.09005935],
       [ 0.        ,  0.        ,  0.        , ...,  0.16377912,
        -0.95122649, -0.9216238 ]])
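
After the ColumnTransformer runs, X is a plain NumPy array and the column-to-feature mapping is easy to lose. On scikit-learn 1.0+ the fitted transformer can report its output column names; a minimal sketch under that version assumption:

# Recover the names of the one-hot and passthrough columns
feature_names = ct.get_feature_names_out()
print(len(feature_names))   # total number of output columns
print(feature_names[:5])    # e.g. the first few MSZoning dummies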

Splitting the dataset into training and testing datasets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
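
Note that this split is unseeded, so the train/test partition (and the error figures below) will differ from run to run. A seeded variant for reproducibility, where random_state=42 is an arbitrary choice:

# Fix the seed so the split, and hence the reported metrics, are repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)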

Use an sklearn Pipeline to preprocess and fit a model

from sklearn.preprocessing import FunctionTransformer
from copy import deepcopy

X = data.iloc[:,:-1]
y = data.iloc[:,-1].values
# Impute missing numerical values with the column median
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
int_cols = X.select_dtypes(include=['float', 'int']).columns.to_list()
imputer.fit(X[int_cols])
X[int_cols] = imputer.transform(X[int_cols])
# OneHot encode the categorical columns
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 4, 5, 6, 19])], remainder='passthrough')
X = ct.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Standardise the whole dataset
std_scaler = StandardScaler().fit(X_train)

def preprocessor(X):
    # Apply the scaler fitted on the training set (closure over std_scaler)
    D = np.copy(X)
    D = std_scaler.transform(D)
    return D
len(X_test[0])  # number of feature columns after one-hot encoding
61
preprocessor_transformer = FunctionTransformer(preprocessor)
preprocessor_transformer
FunctionTransformer(func=<function preprocessor at 0x000001AC94235048>)
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

p1 = Pipeline([('scaler', preprocessor_transformer),
              ('Linear Regression', LinearRegression())])
p1
Pipeline(steps=[('scaler',
                 FunctionTransformer(func=<function preprocessor at 0x000001AC94235048>)),
                ('Linear Regression', LinearRegression())])
from sklearn.metrics import mean_absolute_error

def fit_and_print(p, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test):
    # Fit the transformer
    p.fit(X_train, y_train)
    # Predict the train and test outputs
    training_prediction = p.predict(X_train)
    test_prediction = p.predict(X_test)

    # Print the errors (MAE is symmetric, but y_true conventionally comes first)
    print("Training Error:     " + str(mean_absolute_error(y_train, training_prediction)))
    print("Test Error:         " + str(mean_absolute_error(y_test, test_prediction)))
fit_and_print(p1)
Training Error:     19612.705234115496
Test Error:         20056.787013477147
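
The imputer, encoder, and scaler above are fitted outside the pipeline, on data that includes the test rows. An alternative, not used in this notebook, is to fold all preprocessing into the pipeline itself so each fit learns only from the training portion. A minimal sketch using classes already imported above, where the step names and the handle_unknown='ignore' option are illustrative choices:

# Fold imputation, encoding, and scaling into a single leakage-free pipeline
num_cols = data.select_dtypes(include=['int', 'float']).columns.drop('SalePrice').to_list()
cat_cols = data.select_dtypes(include=['object']).columns.to_list()
preprocess = ColumnTransformer(transformers=[
    ('num', Pipeline([('impute', SimpleImputer(strategy='median')),
                      ('scale', StandardScaler())]), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)])
full_pipe = Pipeline([('prep', preprocess), ('model', LinearRegression())])
full_pipe.fit(data.drop(columns='SalePrice'), data['SalePrice'])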

Fit a KNN regressor

# Perform Grid search
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor as KNR

# Create a KNN Regressor
knn = KNR()

# Define Hyperparameters to tune
param_grid = {
    'n_neighbors': [3,5,7,9,11]
}

# Perform Grid Search
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid,
                          scoring='neg_mean_squared_error', cv=5)
# Fit Grid Search to Data
grid_search.fit(X_train, y_train)
# Retrieve Best Parameters
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_
print("best_params: ", best_params, "\nbest_estimator: ", best_estimator)
best_params:  {'n_neighbors': 7} 
best_estimator:  KNeighborsRegressor(n_neighbors=7)
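GridSearchCV also keeps the cross-validated score of every candidate, which shows how close the other n_neighbors values came to the winner. A minimal sketch, not part of the original notebook:

# Compare the mean CV score of every n_neighbors value tried
cv_results = pd.DataFrame(grid_search.cv_results_)
print(cv_results[['param_n_neighbors', 'mean_test_score']])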
p2 = Pipeline([('Scaler', preprocessor_transformer),
               ('KNN Regression', KNR(n_neighbors=best_params['n_neighbors']))])
fit_and_print(p2)
Training Error:     23518.734874429225
Test Error:         26095.699010654487

Fit a RandomForestRegressor

from sklearn.ensemble import RandomForestRegressor
# Create a random forest regressor
rf_regressor = RandomForestRegressor()

# Define Hyperparameters to tune
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1,2,4,6,8,10],
#     'max_features': ['auto', 'sqrt'],
#     'bootstrap': [True, False]
}

# Perform Grid Search
grid_search = GridSearchCV(estimator=rf_regressor, param_grid=param_grid, 
                           scoring='neg_mean_squared_error', cv=5)
# Fit grid search to the data
grid_search.fit(X_train, y_train)
# Retrieve the best parameters
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_
print("best_params: ", best_params, "\nbest_estimator: ", best_estimator)
best_params:  {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 150} 
best_estimator:  RandomForestRegressor(max_depth=20, min_samples_split=5, n_estimators=150)
p3 = Pipeline([('Scaler', preprocessor_transformer),
               ('RFR Regression', RandomForestRegressor(max_depth=20, min_samples_split=5, n_estimators=150))])
fit_and_print(p3)
Training Error:     8106.180156534265
Test Error:         18495.41643449427
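
A fitted random forest also exposes impurity-based feature importances, which can indicate which inputs drive the price predictions. A minimal sketch, assuming scikit-learn 1.0+ for get_feature_names_out and the fitted p3 and ct from above:

# Rank the encoded features by the forest's impurity-based importance
rf = p3.named_steps['RFR Regression']
importances = pd.Series(rf.feature_importances_, index=ct.get_feature_names_out())
print(importances.sort_values(ascending=False).head(10))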