add VW classification sample
serena-ruan committed May 6, 2021
1 parent 56d50c3 commit df83c7a
Showing 1 changed file with 161 additions and 0 deletions.
161 changes: 161 additions & 0 deletions notebooks/samples/Vowpal Wabbit - Heart Disease Detection.ipynb
@@ -0,0 +1,161 @@
{
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": 3
},
"orig_nbformat": 2
},
"nbformat": 4,
"nbformat_minor": 2,
"cells": [
{
"source": [
"## Heart Disease Detection with VowalWabbit Classifier"
],
"cell_type": "markdown",
"metadata": {}
},
{
"source": [
"#### Read dataset"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dataset = spark.read.format(\"csv\")\\\n",
" .option(\"header\", True)\\\n",
" .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/heart_disease_prediction_data.csv\")\n",
"# print dataset size\n",
"print(\"records read: \" + str(dataset.count()))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# convert features to double type\n",
"from pyspark.sql.functions import col\n",
"from pyspark.sql.types import DoubleType\n",
"for colName in dataset.columns:\n",
" dataset = dataset.withColumn(colName, col(colName).cast(DoubleType()))\n",
"print(\"Schema: \")\n",
"dataset.printSchema()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dataset.show(10, truncate=False)"
]
},
{
"source": [
"#### Split the dataset into train and test"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train, test = dataset.randomSplit([0.85, 0.15], seed=1)"
]
},
{
"source": [
"#### Use VowalWabbitFeaturizer to convert data features into vector"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from mmlspark.vw import VowpalWabbitFeaturizer\n",
"featurizer = VowpalWabbitFeaturizer(inputCols=dataset.columns[:-1], outputCol=\"features\")\n",
"train_data = featurizer.transform(train)[\"target\", \"features\"]\n",
"test_data = featurizer.transform(test)[\"target\", \"features\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train_data.groupBy(\"target\").count().show()"
]
},
{
"source": [
"#### Model Training"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from mmlspark.vw import VowpalWabbitClassifier\n",
"model = VowpalWabbitClassifier(numPasses=20, labelCol=\"target\", featuresCol=\"features\").fit(train_data)"
]
},
{
"source": [
"#### Model Prediction"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"predictions = model.transform(test_data)\n",
"predictions.limit(10).toPandas()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from mmlspark.train import ComputeModelStatistics\n",
"metrics = ComputeModelStatistics(evaluationMetric='classification', labelCol='target', scoredLabelsCol='prediction').transform(predictions)\n",
"display(metrics)"
]
}
]
}
