diff --git a/notebooks/airbnb_tf.ipynb b/notebooks/airbnb_tf.ipynb new file mode 100644 index 0000000..06b8e4a --- /dev/null +++ b/notebooks/airbnb_tf.ipynb @@ -0,0 +1,827 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Load data\n", + "\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "import sage\n", + "import numpy as np\n", + "import pandas as pd\n", + "import gender_guesser.detector as detector\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamehost_idhost_nameneighbourhood_groupneighbourhoodlatitudelongituderoom_typepriceminimum_nightsnumber_of_reviewslast_reviewreviews_per_monthcalculated_host_listings_countavailability_365
02539Clean & quiet apt home by the park2787JohnBrooklynKensington40.64749-73.97237Private room149192018-10-190.216365
12595Skylit Midtown Castle2845JenniferManhattanMidtown40.75362-73.98377Entire home/apt2251452019-05-210.382355
23647THE VILLAGE OF HARLEM....NEW YORK !4632ElisabethManhattanHarlem40.80902-73.94190Private room15030NaTNaN1365
33831Cozy Entire Floor of Brownstone4869LisaRoxanneBrooklynClinton Hill40.68514-73.95976Entire home/apt8912702019-07-054.641194
45022Entire Apt: Spacious Studio/Loft by central park7192LauraManhattanEast Harlem40.79851-73.94399Entire home/apt801092018-11-190.1010
\n", + "
" + ], + "text/plain": [ + " id name host_id \\\n", + "0 2539 Clean & quiet apt home by the park 2787 \n", + "1 2595 Skylit Midtown Castle 2845 \n", + "2 3647 THE VILLAGE OF HARLEM....NEW YORK ! 4632 \n", + "3 3831 Cozy Entire Floor of Brownstone 4869 \n", + "4 5022 Entire Apt: Spacious Studio/Loft by central park 7192 \n", + "\n", + " host_name neighbourhood_group neighbourhood latitude longitude \\\n", + "0 John Brooklyn Kensington 40.64749 -73.97237 \n", + "1 Jennifer Manhattan Midtown 40.75362 -73.98377 \n", + "2 Elisabeth Manhattan Harlem 40.80902 -73.94190 \n", + "3 LisaRoxanne Brooklyn Clinton Hill 40.68514 -73.95976 \n", + "4 Laura Manhattan East Harlem 40.79851 -73.94399 \n", + "\n", + " room_type price minimum_nights number_of_reviews last_review \\\n", + "0 Private room 149 1 9 2018-10-19 \n", + "1 Entire home/apt 225 1 45 2019-05-21 \n", + "2 Private room 150 3 0 NaT \n", + "3 Entire home/apt 89 1 270 2019-07-05 \n", + "4 Entire home/apt 80 10 9 2018-11-19 \n", + "\n", + " reviews_per_month calculated_host_listings_count availability_365 \n", + "0 0.21 6 365 \n", + "1 0.38 2 355 \n", + "2 NaN 1 365 \n", + "3 4.64 1 194 \n", + "4 0.10 1 0 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load data\n", + "df = sage.datasets.airbnb()\n", + "\n", + "# Sample rows\n", + "df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Categorical features\n", + "categorical_columns = ['neighbourhood_group', 'neighbourhood', 'room_type']\n", + "for column in categorical_columns:\n", + " df[column] = pd.Categorical(df[column]).codes" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Exclude outliers (top 0.5%)\n", + "df = df[df['price'] < df['price'].quantile(0.995)]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": 
[ + "# Features derived from name\n", + "df['name_length'] = df['name'].apply(lambda x: len(x))\n", + "df['name_isupper'] = df['name'].apply(lambda x: int(x.isupper()))\n", + "df['name_words'] = df['name'].apply(lambda x: len(re.findall(r'\\w+', x)))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Host gender guess\n", + "guesser = detector.Detector()\n", + "df['host_gender'] = df['host_name'].apply(lambda x: guesser.get_gender(x.split(' ')[0]))\n", + "df['host_gender'] = pd.Categorical(df['host_gender']).codes" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# Number of days since last review\n", + "most_recent = df['last_review'].max()\n", + "df['last_review'] = (most_recent - df['last_review']).dt.days\n", + "df['last_review'] = (df['last_review'] - df['last_review'].mean()) / df['last_review'].std()\n", + "df['last_review'] = df['last_review'].fillna(-5)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# Missing values\n", + "df['reviews_per_month'] = df['reviews_per_month'].fillna(0)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Normalize other numerical features\n", + "df['number_of_reviews'] = (df['number_of_reviews'] - df['number_of_reviews'].mean()) / df['number_of_reviews'].std()\n", + "df['availability_365'] = (df['availability_365'] - df['availability_365'].mean()) / df['availability_365'].std()\n", + "df['name_length'] = (df['name_length'] - df['name_length'].mean()) / df['name_length'].std()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Normalize latitude and longitude\n", + "df['latitude'] = (df['latitude'] - df['latitude'].mean()) / df['latitude'].std()\n", + "df['longitude'] = (df['longitude'] - df['longitude'].mean()) / 
df['longitude'].std()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Drop columns\n", + "df = df.drop(['id', 'host_id', 'host_name', 'name'], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
neighbourhood_groupneighbourhoodlatitudelongituderoom_typepriceminimum_nightsnumber_of_reviewslast_reviewreviews_per_monthcalculated_host_listings_countavailability_365name_lengthname_isuppername_wordshost_gender
01108-1.492033-0.44079511491-0.321948-0.0348520.2161.922931-0.276169072
121270.452497-0.687792022510.484307-0.5523660.3821.846815-1.513431031
22941.4675440.21938111503-0.523512-5.0000000.0011.922931-0.180995161
3141-0.802204-0.16758108915.523401-0.6611884.6410.621350-0.561691055
42611.2749780.17409808010-0.321948-0.1098190.101-0.8552981.056268081
\n", + "
" + ], + "text/plain": [ + " neighbourhood_group neighbourhood latitude longitude room_type price \\\n", + "0 1 108 -1.492033 -0.440795 1 149 \n", + "1 2 127 0.452497 -0.687792 0 225 \n", + "2 2 94 1.467544 0.219381 1 150 \n", + "3 1 41 -0.802204 -0.167581 0 89 \n", + "4 2 61 1.274978 0.174098 0 80 \n", + "\n", + " minimum_nights number_of_reviews last_review reviews_per_month \\\n", + "0 1 -0.321948 -0.034852 0.21 \n", + "1 1 0.484307 -0.552366 0.38 \n", + "2 3 -0.523512 -5.000000 0.00 \n", + "3 1 5.523401 -0.661188 4.64 \n", + "4 10 -0.321948 -0.109819 0.10 \n", + "\n", + " calculated_host_listings_count availability_365 name_length \\\n", + "0 6 1.922931 -0.276169 \n", + "1 2 1.846815 -1.513431 \n", + "2 1 1.922931 -0.180995 \n", + "3 1 0.621350 -0.561691 \n", + "4 1 -0.855298 1.056268 \n", + "\n", + " name_isupper name_words host_gender \n", + "0 0 7 2 \n", + "1 0 3 1 \n", + "2 1 6 1 \n", + "3 0 5 5 \n", + "4 0 8 1 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# New sample rows\n", + "df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# Rearrange columns\n", + "target_col = 'price'\n", + "cols = df.columns.tolist()\n", + "del cols[cols.index(target_col)]\n", + "cols.append(target_col)\n", + "feature_names = cols[:-1]\n", + "df = df[cols]\n", + "\n", + "# Split data\n", + "train, test = train_test_split(\n", + " df.values, test_size=int(0.1 * len(df.values)), random_state=0)\n", + "train, val = train_test_split(\n", + " train, test_size=int(0.1 * len(df.values)), random_state=0)\n", + "Y_train = train[:, -1:].copy()\n", + "Y_val = val[:, -1:].copy()\n", + "Y_test = test[:, -1:].copy()\n", + "train = train[:, :-1].copy()\n", + "val = val[:, :-1].copy()\n", + "test = test[:, :-1].copy()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Train model" + ] + }, + { + "cell_type": "code", + 
"execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "from tensorflow import keras\n", + "from tensorflow.keras import layers" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "batch_size = 32\n", + "\n", + "# Prepare the training dataset.\n", + "train_dataset = tf.data.Dataset.from_tensor_slices((train, Y_train))\n", + "train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size)\n", + "\n", + "# Prepare the validation dataset.\n", + "val_dataset = tf.data.Dataset.from_tensor_slices((val, Y_val))\n", + "val_dataset = val_dataset.batch(batch_size)\n", + "\n", + "# Get model\n", + "model = keras.Sequential(\n", + " [\n", + " layers.Dense(128, activation='relu', input_shape=(train.shape[1],)),\n", + " layers.Dense(256, activation='relu'),\n", + " layers.Dense(64, activation='relu'),\n", + " layers.Dense(1)\n", + " ],\n", + " name='airbnb_model'\n", + ")\n", + "\n", + "# Instantiate an optimizer to train the model.\n", + "optimizer = keras.optimizers.Adam(learning_rate=1e-3)\n", + "\n", + "# Instantiate a loss function.\n", + "loss_fn = keras.losses.MeanSquaredError()\n", + "\n", + "# Prepare the metrics.\n", + "train_acc_metric = keras.metrics.MeanSquaredError()\n", + "val_acc_metric = keras.metrics.MeanSquaredError()\n", + "\n", + "# Utils.\n", + "@tf.function\n", + "def train_step(x, y):\n", + " with tf.GradientTape() as tape:\n", + " preds = model(x)\n", + " loss_value = loss_fn(y, preds)\n", + " grads = tape.gradient(loss_value, model.trainable_weights)\n", + " optimizer.apply_gradients(zip(grads, model.trainable_weights))\n", + " train_acc_metric.update_state(y, preds)\n", + " return loss_value\n", + "\n", + "@tf.function\n", + "def test_step(x, y):\n", + " preds = model(x)\n", + " val_acc_metric.update_state(y, preds)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], 
+ "source": [ + "epochs = 50\n", + "\n", + "for epoch in range(epochs):\n", + "\n", + " # Iterate over the batches of the dataset.\n", + " for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):\n", + " loss_value = train_step(x_batch_train, y_batch_train)\n", + "\n", + " # Read the training metric (MSE) at the end of each epoch.\n", + " train_acc = train_acc_metric.result()\n", + "\n", + " # Reset training metrics at the end of each epoch\n", + " train_acc_metric.reset_states()\n", + "\n", + " # Run a validation loop at the end of each epoch.\n", + " for x_batch_val, y_batch_val in val_dataset:\n", + " test_step(x_batch_val, y_batch_val)\n", + "\n", + " val_acc = val_acc_metric.result()\n", + " val_acc_metric.reset_states()\n", + "\n", + "# For classification (which is not the case here), see\n", + "# https://github.com/iancovert/sage/blob/master/sage/utils.py#L36:\n", + "# output activations must already be applied to the model, e.g.\n", + "# probability_model = tf.keras.Sequential([model, tf.keras.layers.Softmax()])" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training acc: 6417.3271\n", + "Validation acc: 7278.4600\n" + ] + } + ], + "source": [ + "print(\"Training acc: %.4f\" % (float(train_acc),))\n", + "print(\"Validation acc: %.4f\" % (float(val_acc),))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Calculate individual feature importance" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Setting up imputer for Tensorflow model, assuming that any necessary output activations are applied properly. 
If not, please set up keras.Sequential with keras.layers.Softmax()\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1/1 [03:28<00:00, 208.31s/it] \n" + ] + } + ], + "source": [ + "# Setup and calculate\n", + "imputer = sage.MarginalImputer(model, test[:512])\n", + "estimator = sage.PermutationEstimator(imputer, 'mse')\n", + "sage_values = estimator(test, Y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Plot results\n", + "sage_values.plot(feature_names)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Calculate grouped feature importance" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "# Feature groups\n", + "feature_groups = group_names = {\n", + " 'location (grouped)': ['latitude', 'longitude', 'neighbourhood', 'neighbourhood_group'],\n", + " 'name (grouped)': ['name_words', 'name_length', 'name_isupper'],\n", + " 'reviews (grouped)': ['last_review', 'reviews_per_month', 'number_of_reviews'],\n", + " 'host (grouped)': ['host_gender', 'calculated_host_listings_count'],\n", + " 'availability': ['availability_365'],\n", + " 'room_type': ['room_type']\n", + "}\n", + "group_names = [group for group in feature_groups]\n", + "for col in feature_names:\n", + " if np.all([col not in group[1] for group in feature_groups.items()]):\n", + " group_names.append(col)\n", + "\n", + "# Group indices\n", + "groups = []\n", + "for _, group in feature_groups.items():\n", + " ind_list = []\n", + " for feature in group:\n", + " ind_list.append(cols.index(feature))\n", + " groups.append(ind_list)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Setting up imputer for Tensorflow model, assuming that any necessary output activations are applied properly. 
If not, please set up keras.Sequential with keras.layers.Softmax()\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1/1 [01:33<00:00, 93.20s/it] \n" + ] + } + ], + "source": [ + "# Setup and calculate\n", + "imputer = sage.GroupedMarginalImputer(model, test[:512], groups)\n", + "estimator = sage.PermutationEstimator(imputer, 'mse')\n", + "sage_values = estimator(test, Y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Plot results\n", + "sage_values.plot(group_names)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "interpreter": { + "hash": "c4f92193806e2908606a5f23edd55a5282f2f433b73b1c504507f9256ed9f0b4" + }, + "kernelspec": { + "display_name": "Python 3.9.7 ('base')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/sage/utils.py b/sage/utils.py index 6b48f47..e5fe800 100644 --- a/sage/utils.py +++ b/sage/utils.py @@ -29,10 +29,18 @@ def model_conversion(model): 'not, please set up nn.Sequential with nn.Sigmoid or nn.Softmax') import torch + model.eval() device = next(model.parameters()).device return lambda x: model(torch.tensor( x, dtype=torch.float32, device=device)).cpu().data.numpy() + elif safe_isinstance(model, 'keras.Model'): + print('Setting up imputer for keras model, assuming that any ' + 'necessary output activations are applied properly. If not, ' + 'please set up keras.Sequential with keras.layers.Softmax()') + + return lambda x: model(x, training=False).numpy() + elif callable(model): # Assume model is compatible function or callable object. return model