initial version

tblock · Feb 26, 2019 · 8d6e1e0 · 8d6e1e0
commit 8d6e1e0
Show file tree

Hide file tree

Showing 26 changed files with 63,952 additions and 0 deletions.
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2019 Timo Block
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
@@ -0,0 +1,13 @@
+# thesis-data
+
+This repository contains supplementary scripts and data for my thesis.
+I classified the [Ten Thousand German News Articles Dataset](https://tblock.github.io/10kGNAD/) with four text classifiers. Namely a Support Vector Machine, facebook's [fastText](https://fasttext.cc) libary, a TensorFlow neuronal net and the [ULMFiT](https://arxiv.org/abs/1801.06146) method.
+
+
+The scripts can be run in a [Google Colab IPython Notebook](https://colab.research.google.com).
+
+- SVM [[view]](https://github.com/tblock/thesis-data/blob/master/reproduce_SVM.ipynb) [[run]](https://colab.research.google.com/github/tblock/thesis-data/blob/master/reproduce_SVM.ipynb)
+- fastText [[view]](https://github.com/tblock/thesis-data/blob/master/reproduce_fastText.ipynb) [[run]](https://colab.research.google.com/github/tblock/thesis-data/blob/master/reproduce_fastText.ipynb)
+- TensorFlow [[view]](https://github.com/tblock/thesis-data/blob/master/reproduce_TensorFlow.ipynb) [[run]](https://colab.research.google.com/github/tblock/thesis-data/blob/master/reproduce_TensorFlow.ipynb)
+- ULMFiT [[view]](https://github.com/tblock/thesis-data/blob/master/reproduce_ULMFiT.ipynb) [[run]](https://colab.research.google.com/github/tblock/thesis-data/blob/master/reproduce_ULMFiT.ipynb)
+
diff --git a/fastText-data/fastText_test.csv b/fastText-data/fastText_test.csv
diff --git a/reproduce_SVM.ipynb b/reproduce_SVM.ipynb
@@ -0,0 +1,256 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "name": "REP_SVM.ipynb",
+      "version": "0.3.2",
+      "provenance": [],
+      "collapsed_sections": [],
+      "toc_visible": true
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    }
+  },
+  "cells": [
+    {
+      "metadata": {
+        "id": "5_EKIwLlcII7",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "## Support Vector Machine to classify the [Ten Thousand German News Articles Dataset](https://github.com/tblock/10kGNAD)\n",
+        "This Notebook contains the code to reproduce the results in my thesis.\n",
+        "The code reproduces the exact results.\n",
+        "\n",
+        "Run all cells consecutively."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "84eqgmr2RZhY",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "### Enviroment Setup "
+      ]
+    },
+    {
+      "metadata": {
+        "id": "oyzQUOgwcy2W",
+        "colab_type": "code",
+        "outputId": "3a646b13-70f0-4492-9dde-a8141fd261a0",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 53
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "# load the dataset and generate subsets\n",
+        "!rm -rf 10kGNAD lowshot\n",
+        "!git config --global advice.detachedHead false\n",
+        "!git clone -q --branch v1.1 https://github.com/tblock/10kGNAD.git && echo \"downloaded dataset\"\n",
+        "!mkdir lowshot\n",
+        "!cp 10kGNAD/train.csv .\n",
+        "!python 10kGNAD/code/generate_lowshot_sets.py > /dev/null && echo \"generated train subsets\""
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "downloaded dataset\n",
+            "generated train subsets\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "3wuK2dHCcHf9",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "cell_type": "code",
+      "source": [
+        "import glob\n",
+        "import numpy as np\n",
+        "import pandas as pd\n",
+        "from sklearn.svm import LinearSVC\n",
+        "from sklearn.pipeline import Pipeline\n",
+        "from sklearn.feature_extraction.text import CountVectorizer\n",
+        "from sklearn.feature_extraction.text import TfidfTransformer\n",
+        "from sklearn.preprocessing import StandardScaler"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "63NWI1D4RiJb",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "### Train Models"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "9b1C4PVtt4LL",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "cell_type": "code",
+      "source": [
+        "# load test set\n",
+        "df_test = pd.read_csv('10kGNAD/test.csv', header=None, sep=';', quotechar=\"'\", names=['label', 'text'])"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "fIRl3HQvutRJ",
+        "colab_type": "code",
+        "outputId": "67b95baf-1ecb-40b4-991b-90b798bd60d2",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 1637
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "filenames = sorted(glob.glob(\"lowshot/*.csv\"))\n",
+        "\n",
+        "for filename in filenames:  # for each subset\n",
+        "  \n",
+        "  df_train = pd.read_csv(filename, header=None, sep=';', quotechar=\"'\", names=['label', 'text'])\n",
+        "\n",
+        "  # build the classifier pipeline\n",
+        "  lsvc_classifier = Pipeline([\n",
+        "    ('vect', CountVectorizer()),\n",
+        "    ('tfidf', TfidfTransformer(\n",
+        "        sublinear_tf=True # Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).\n",
+        "    )),\n",
+        "    ('clf', LinearSVC(\n",
+        "        dual=False,\n",
+        "        C=1.6,\n",
+        "        class_weight=\"balanced\"\n",
+        "    ))\n",
+        "  ])\n",
+        "\n",
+        "  lsvc_classifier.fit(df_train['text'], df_train['label'])  # train the classifier\n",
+        "  predicted = lsvc_classifier.predict(df_test['text'])  # predict the test set \n",
+        "  acc = np.mean(predicted == df_test['label'])  # calculate the accuracy\n",
+        "  \n",
+        "  print(filename[16:-4],\"%.2f\" % float((100 - acc*100)), sep=\" -> \") # print the error rate "
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "0_0.01_0 -> 40.56\n",
+            "0_0.01_1 -> 39.40\n",
+            "0_0.01_2 -> 36.87\n",
+            "0_0.01_3 -> 38.13\n",
+            "0_0.01_4 -> 42.02\n",
+            "0_0.01_5 -> 40.56\n",
+            "0_0.01_6 -> 39.59\n",
+            "0_0.01_7 -> 38.33\n",
+            "0_0.01_8 -> 38.04\n",
+            "0_0.01_9 -> 39.59\n",
+            "1_0.02_0 -> 32.10\n",
+            "1_0.02_1 -> 27.72\n",
+            "1_0.02_2 -> 30.64\n",
+            "1_0.02_3 -> 32.78\n",
+            "1_0.02_4 -> 28.21\n",
+            "1_0.02_5 -> 29.38\n",
+            "1_0.02_6 -> 30.06\n",
+            "1_0.02_7 -> 28.99\n",
+            "1_0.02_8 -> 31.81\n",
+            "1_0.02_9 -> 32.10\n",
+            "2_0.05_0 -> 22.76\n",
+            "2_0.05_1 -> 24.22\n",
+            "2_0.05_2 -> 20.53\n",
+            "2_0.05_3 -> 21.89\n",
+            "2_0.05_4 -> 22.86\n",
+            "2_0.05_5 -> 22.57\n",
+            "2_0.05_6 -> 22.76\n",
+            "2_0.05_7 -> 24.51\n",
+            "2_0.05_8 -> 21.69\n",
+            "2_0.05_9 -> 22.28\n",
+            "3_0.075_0 -> 18.97\n",
+            "3_0.075_1 -> 19.26\n",
+            "3_0.075_2 -> 20.72\n",
+            "3_0.075_3 -> 20.43\n",
+            "3_0.075_4 -> 20.53\n",
+            "3_0.075_5 -> 19.94\n",
+            "3_0.075_6 -> 19.94\n",
+            "3_0.075_7 -> 19.84\n",
+            "3_0.075_8 -> 19.65\n",
+            "3_0.075_9 -> 20.14\n",
+            "4_0.1_0 -> 19.36\n",
+            "4_0.1_1 -> 18.68\n",
+            "4_0.1_2 -> 17.02\n",
+            "4_0.1_3 -> 18.77\n",
+            "4_0.1_4 -> 17.32\n",
+            "4_0.1_5 -> 17.61\n",
+            "4_0.1_6 -> 18.48\n",
+            "4_0.1_7 -> 18.68\n",
+            "4_0.1_8 -> 17.61\n",
+            "4_0.1_9 -> 17.02\n",
+            "5_0.2_0 -> 14.98\n",
+            "5_0.2_1 -> 15.47\n",
+            "5_0.2_2 -> 14.88\n",
+            "5_0.2_3 -> 15.66\n",
+            "5_0.2_4 -> 15.27\n",
+            "5_0.2_5 -> 15.37\n",
+            "5_0.2_6 -> 16.73\n",
+            "5_0.2_7 -> 15.56\n",
+            "5_0.2_8 -> 16.15\n",
+            "5_0.2_9 -> 15.47\n",
+            "6_0.5_0 -> 13.13\n",
+            "6_0.5_1 -> 13.04\n",
+            "6_0.5_2 -> 13.23\n",
+            "6_0.5_3 -> 12.45\n",
+            "6_0.5_4 -> 13.42\n",
+            "6_0.5_5 -> 12.65\n",
+            "6_0.5_6 -> 13.13\n",
+            "6_0.5_7 -> 13.91\n",
+            "6_0.5_8 -> 13.04\n",
+            "6_0.5_9 -> 12.84\n",
+            "7_0.75_0 -> 11.28\n",
+            "7_0.75_1 -> 11.87\n",
+            "7_0.75_2 -> 12.06\n",
+            "7_0.75_3 -> 12.65\n",
+            "7_0.75_4 -> 12.35\n",
+            "7_0.75_5 -> 11.58\n",
+            "7_0.75_6 -> 12.35\n",
+            "7_0.75_7 -> 12.06\n",
+            "7_0.75_8 -> 12.06\n",
+            "7_0.75_9 -> 12.26\n",
+            "8_1.0_0 -> 11.48\n",
+            "8_1.0_1 -> 11.48\n",
+            "8_1.0_2 -> 11.48\n",
+            "8_1.0_3 -> 11.48\n",
+            "8_1.0_4 -> 11.48\n",
+            "8_1.0_5 -> 11.48\n",
+            "8_1.0_6 -> 11.48\n",
+            "8_1.0_7 -> 11.48\n",
+            "8_1.0_8 -> 11.48\n",
+            "8_1.0_9 -> 11.48\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    }
+  ]
+}