Merge pull request #118 from thunlp/dev

Dev

zhougr18 authored Sep 18, 2020
2 parents 637e81a + 8f8cde3 commit a280a72
Showing 35 changed files with 879 additions and 35 deletions.
25 changes: 25 additions & 0 deletions .github/workflows/deploy.yaml
@@ -0,0 +1,25 @@
name: publish to pypi
on:
  push:
    branches:
      - master
jobs:
  build-n-publish:
    name: Build and publish
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@master
      - uses: actions/setup-python@v2
        with:
          python-version: '3.7'
          architecture: 'x64'
      - name: Run build script
        run: |
          pip install twine --user
          python setup.py sdist bdist_wheel
      - name: Publish distribution 📦 to PyPI
        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
        uses: pypa/gh-action-pypi-publish@master
        with:
          user: __token__
          password: ${{ secrets.pypi_password }}
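A note on this workflow as written: the `on.push.branches: [master]` trigger fires only for branch pushes, where `github.ref` is `refs/heads/master`, so the `startsWith(github.ref, 'refs/tags')` guard on the publish step can never be true under this trigger; tag pushes do not start the workflow at all unless an `on.push.tags` filter is also declared. If publishing on tags is the intent, the trigger presumably needs a `tags` pattern alongside (or instead of) the branch filter.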
41 changes: 12 additions & 29 deletions OpenAttack/attackers/deepwordbug.py
@@ -25,8 +25,6 @@ def __init__(self, **kwargs):
         :param string scoring: Scoring function used to compute word importance, ``["replaceone", "temporal", "tail", "combined"]``. **Default:** replaceone
         :param string transformer: Transform function to modify a word, ``["homoglyph", "swap"]``. **Default:** homoglyph
-        :Package Requirements:
-            * torch
         :Classifier Capacity: Probability
 
         Black-box Generation of Adversarial Text Sequences to Evade Deep Learning Classifiers. Ji Gao, Jack Lanchantin, Mary Lou Soffa, Yanjun Qi. IEEE SPW 2018.
@@ -40,15 +38,14 @@ def __init__(self, **kwargs):
         self.power = self.config["power"]
 
     def __call__(self, clsf, x_orig, target=None):
-        import torch
         """
         * **clsf** : **Classifier** .
         * **x_orig** : Input sentence.
         """
         y_orig = clsf.get_pred([x_orig])[0]
         inputs = x_orig.strip().lower().split(" ")
         losses = self.scorefunc(self.scoring, clsf, inputs, y_orig)  # loss vector with each word removed
-        sorted, indices = torch.sort(losses, descending=True)
+        indices = np.argsort(losses)
 
         advinputs = inputs[:]
         t = 0
@@ -90,46 +87,32 @@ def transform(self, type, word):
 
     # scoring functions
     def replaceone(self, clsf, inputs, y_orig):
-        import torch
-
-        losses = torch.zeros(len(inputs))
+        losses = np.zeros(len(inputs))
         for i in range(len(inputs)):
             tempinputs = inputs[:]  # ##
             tempinputs[i] = self.config['unk']
-            with torch.no_grad():
-                tempoutput = torch.from_numpy(clsf.get_prob([" ".join(tempinputs)]))  # ##
-            softmax = torch.nn.Softmax(dim=1)
-            nll_lossed = -1 * torch.log(softmax(tempoutput))[0][y_orig].item()
-            losses[i] = nll_lossed  # ##
+            tempoutput = clsf.get_prob([" ".join(tempinputs)])
+            losses[i] = 1 - tempoutput[0][y_orig]
         return losses
 
     def temporal(self, clsf, inputs, y_orig):
-        import torch
-        softmax = torch.nn.Softmax(dim=1)
-
-        losses1 = torch.zeros(len(inputs))
-        dloss = torch.zeros(len(inputs))
+        losses1 = np.zeros(len(inputs))
+        dloss = np.zeros(len(inputs))
         for i in range(len(inputs)):
             tempinputs = inputs[: i + 1]
-            with torch.no_grad():
-                tempoutput = torch.from_numpy(clsf.get_prob([self.config["processor"].detokenizer(tempinputs)]))
-            losses1[i] = -1 * torch.log(softmax(tempoutput))[0][y_orig].item()
-            print(self.config["processor"].detokenizer(tempinputs), losses1[i])
+            tempoutput = clsf.get_prob([self.config["processor"].detokenizer(tempinputs)])
+            losses1[i] = 1 - tempoutput[0][y_orig]
         for i in range(1, len(inputs)):
             dloss[i] = abs(losses1[i] - losses1[i - 1])
         return dloss
 
     def temporaltail(self, clsf, inputs, y_orig):
-        import torch
-        softmax = torch.nn.Softmax(dim=1)
-
-        losses1 = torch.zeros(len(inputs))
-        dloss = torch.zeros(len(inputs))
+        losses1 = np.zeros(len(inputs))
+        dloss = np.zeros(len(inputs))
         for i in range(len(inputs)):
             tempinputs = inputs[i:]
-            with torch.no_grad():
-                tempoutput = torch.from_numpy(clsf.get_prob([self.config["processor"].detokenizer(tempinputs)]))
-            losses1[i] = -1 * torch.log(softmax(tempoutput))[0][y_orig].item()
+            tempoutput = clsf.get_prob([self.config["processor"].detokenizer(tempinputs)])
+            losses1[i] = 1 - tempoutput[0][y_orig]
         for i in range(1, len(inputs)):
             dloss[i] = abs(losses1[i] - losses1[i - 1])
         return dloss
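The numpy rewrite above replaces the negative log-probability of the original class with `1 - p(y_orig)`; both decrease monotonically as the classifier grows more confident, so the word-importance ranking is unchanged and the attacker no longer needs torch at call time. One subtlety: `np.argsort` sorts ascending, whereas the removed `torch.sort(..., descending=True)` sorted descending, so downstream code is presumably expected to walk `indices` from the end. A minimal usage sketch, assuming this class is exported as `OpenAttack.attackers.DeepWordBugAttacker` and that a black-box victim only needs the `get_pred`/`get_prob` methods exercised above:

import numpy as np
import OpenAttack

class ToyClassifier(OpenAttack.Classifier):
    # Scores a sentence by the share of words found in a tiny positive lexicon.
    def get_prob(self, input_):
        probs = []
        for sent in input_:
            words = sent.lower().split()
            pos = sum(w in {"good", "great", "fine"} for w in words) / max(len(words), 1)
            probs.append([1.0 - pos, pos])
        return np.array(probs)

    def get_pred(self, input_):
        return self.get_prob(input_).argmax(axis=1)

clsf = ToyClassifier()
attacker = OpenAttack.attackers.DeepWordBugAttacker(power=5)  # `power` is read in __init__ above
adversarial = attacker(clsf, "this movie is good and the acting is great")
print(adversarial)  # perturbed sentence (exact return format depends on the attacker's __call__)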
25 changes: 25 additions & 0 deletions OpenAttack/data/data_ag.py
@@ -0,0 +1,25 @@
"""
:type: a tuple of three :py:class:`.Dataset` s, `(train, valid, test)`.
:Size: 31.0MB

AG News dataset which is used to train victim models.
"""
import pickle

NAME = "Dataset.AG"
# NOTE: this URL points at "sst.pkl"; for the AG News dataset that looks like a
# copy-paste slip from the SST module (the correct archive name is not shown here).
DOWNLOAD = "https://thunlp.oss-cn-qingdao.aliyuncs.com/TAADToolbox/dataset/sst.pkl"


def LOAD(path):
    from OpenAttack.utils import Dataset, DataInstance

    def mapping(data):
        return Dataset([
            DataInstance(
                x=it[0],
                y=it[1]
            ) for it in data
        ], copy=False)

    train, valid, test = pickle.load(open(path, "rb"))
    return mapping(train), mapping(valid), mapping(test)
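Each of these data modules follows the same contract: `NAME` registers the resource, `DOWNLOAD` tells the data manager where to fetch it, and `LOAD` turns the local pickle into `Dataset` objects. A minimal consumption sketch, assuming OpenAttack's `DataManager.load` performs the fetch-then-`LOAD` flow and that `Dataset` supports `len()` and indexing:

import OpenAttack

train, valid, test = OpenAttack.DataManager.load("Dataset.AG")
print(len(train), len(valid), len(test))  # split sizes
inst = train[0]
print(inst.x, inst.y)  # raw text and its integer label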
25 changes: 25 additions & 0 deletions OpenAttack/data/data_imdb.py
@@ -0,0 +1,25 @@
"""
:type: a tuple of three :py:class:`.Dataset` s, `(train, valid, test)`.
:Size: 56.2MB

IMDB dataset which is used to train victim models.
"""
import pickle

NAME = "Dataset.IMDB"
DOWNLOAD = "https://thunlp.oss-cn-qingdao.aliyuncs.com/TAADToolbox/dataset/imdb.pkl"


def LOAD(path):
    from OpenAttack.utils import Dataset, DataInstance

    def mapping(data):
        return Dataset([
            DataInstance(
                x=it[0],
                y=it[1]
            ) for it in data
        ], copy=False)

    train, valid, test = pickle.load(open(path, "rb"))
    return mapping(train), mapping(valid), mapping(test)
26 changes: 26 additions & 0 deletions OpenAttack/data/data_mnli.py
@@ -0,0 +1,26 @@
"""
:type: a tuple of three :py:class:`.Dataset` s, `(train, valid, test)`.
:Size: 77.373MB

MNLI dataset which is used to train victim models.
"""
import pickle

NAME = "Dataset.MNLI"
DOWNLOAD = "https://thunlp.oss-cn-qingdao.aliyuncs.com/TAADToolbox/dataset/mnli.pkl"


def LOAD(path):
    from OpenAttack.utils import Dataset, DataInstance

    def mapping(data):
        return Dataset([
            DataInstance(
                x=it[0],
                y=it[2],
                meta={"reference": it[1]}
            ) for it in data
        ], copy=False)

    train, valid, test = pickle.load(open(path, "rb"))
    return mapping(train), mapping(valid), mapping(test)
26 changes: 26 additions & 0 deletions OpenAttack/data/data_snli.py
@@ -0,0 +1,26 @@
"""
:type: a tuple of three :py:class:`.Dataset` s, `(train, valid, test)`.
:Size: 72.596MB

SNLI dataset which is used to train victim models.
"""
import pickle

NAME = "Dataset.SNLI"
DOWNLOAD = "https://thunlp.oss-cn-qingdao.aliyuncs.com/TAADToolbox/dataset/snli.pkl"


def LOAD(path):
    from OpenAttack.utils import Dataset, DataInstance

    def mapping(data):
        return Dataset([
            DataInstance(
                x=it[0],
                y=it[2],
                meta={"reference": it[1]}
            ) for it in data
        ], copy=False)

    train, valid, test = pickle.load(open(path, "rb"))
    return mapping(train), mapping(valid), mapping(test)
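The two NLI loaders differ from the single-sentence ones only in the mapping: the first element of each triple becomes `x`, the second is kept in `meta["reference"]`, and the label moves to `it[2]`. A short access sketch under the same `DataManager` assumption as above (which element is premise versus hypothesis is not shown here):

import OpenAttack

train, valid, test = OpenAttack.DataManager.load("Dataset.SNLI")
inst = train[0]
print(inst.x)                  # first sentence of the pair
print(inst.meta["reference"])  # second sentence, stored as reference text
print(inst.y)                  # label index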
2 changes: 1 addition & 1 deletion OpenAttack/data/nltk_perceptron_pos_tagger.py
@@ -16,5 +16,5 @@
 
 def LOAD(path):
     ret = __import__("nltk").tag.PerceptronTagger(load=False)
-    ret.load(os.path.join(path, "averaged_perceptron_tagger.pickle"))
+    ret.load("file:" + os.path.join(path, "averaged_perceptron_tagger.pickle"))
     return ret.tag
2 changes: 1 addition & 1 deletion OpenAttack/data/nltk_senttokenizer.py
@@ -14,5 +14,5 @@
 
 
 def LOAD(path):
-    return __import__("nltk").data.load(os.path.join(path, "english.pickle")).tokenize
+    return __import__("nltk").data.load("file:" + os.path.join(path, "english.pickle")).tokenize
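The `"file:"` prefix in both fixes matters because `nltk.data.load` treats a bare string as a resource name to be resolved against NLTK's own data search path; with an explicit `file:` URL it loads the pickle from exactly the given location, so the bundled files work even when they live outside any `nltk_data` directory. A small illustration (the path is hypothetical):

import nltk

# Resolved against NLTK's search path; fails for files outside nltk_data:
#   nltk.data.load("english.pickle")
# Loaded directly from the filesystem, bypassing the search path:
tokenizer = nltk.data.load("file:/opt/openattack/data/english.pickle").tokenize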

20 changes: 20 additions & 0 deletions OpenAttack/data/victim_albert_ag.py
@@ -0,0 +1,20 @@
"""
:type: OpenAttack.utils.AlbertClassifier
:Size: 788.697MB
:Package Requirements:
    * transformers
    * pytorch

Pretrained ALBERT model on AG-4 dataset. See :py:data:`Dataset.AG` for detail.
"""

from OpenAttack.utils import make_zip_downloader, AlbertClassifier

NAME = "Victim.ALBERT.AG"

URL = "https://thunlp.oss-cn-qingdao.aliyuncs.com/TAADToolbox/victim/albert_ag.zip"
DOWNLOAD = make_zip_downloader(URL)

def LOAD(path):
    from OpenAttack import Classifier
    # AG News is a 4-way task; 5 output labels here presumably leaves index 0
    # unused (AG News classes are conventionally numbered 1-4).
    return AlbertClassifier(path, 5)
20 changes: 20 additions & 0 deletions OpenAttack/data/victim_albert_imdb.py
@@ -0,0 +1,20 @@
"""
:type: OpenAttack.utils.AlbertClassifier
:Size: 788.662MB
:Package Requirements:
    * transformers
    * pytorch

Pretrained ALBERT model on IMDB dataset. See :py:data:`Dataset.IMDB` for detail.
"""

from OpenAttack.utils import make_zip_downloader, AlbertClassifier

NAME = "Victim.ALBERT.IMDB"

URL = "https://thunlp.oss-cn-qingdao.aliyuncs.com/TAADToolbox/victim/albert_imdb.zip"
DOWNLOAD = make_zip_downloader(URL)

def LOAD(path):
    from OpenAttack import Classifier
    return AlbertClassifier(path, 2)
20 changes: 20 additions & 0 deletions OpenAttack/data/victim_albert_mnli.py
@@ -0,0 +1,20 @@
"""
:type: OpenAttack.utils.AlbertClassifier
:Size: 788.668MB
:Package Requirements:
    * transformers
    * pytorch

Pretrained ALBERT model on MNLI dataset. See :py:data:`Dataset.MNLI` for detail.
"""

from OpenAttack.utils import make_zip_downloader, AlbertClassifier

NAME = "Victim.ALBERT.MNLI"

URL = "https://thunlp.oss-cn-qingdao.aliyuncs.com/TAADToolbox/victim/albert_mnli.zip"
DOWNLOAD = make_zip_downloader(URL)

def LOAD(path):
    from OpenAttack import Classifier
    # MNLI is conventionally a three-way task; 2 here either reflects a
    # binarized entailment/non-entailment setup or is a slip (the SNLI
    # loaders use 3).
    return AlbertClassifier(path, 2)
20 changes: 20 additions & 0 deletions OpenAttack/data/victim_albert_snli.py
@@ -0,0 +1,20 @@
"""
:type: OpenAttack.utils.AlbertClassifier
:Size: 788.672MB
:Package Requirements:
    * transformers
    * pytorch

Pretrained ALBERT model on SNLI dataset. See :py:data:`Dataset.SNLI` for detail.
"""

from OpenAttack.utils import make_zip_downloader, AlbertClassifier

NAME = "Victim.ALBERT.SNLI"

URL = "https://thunlp.oss-cn-qingdao.aliyuncs.com/TAADToolbox/victim/albert_snli.zip"
DOWNLOAD = make_zip_downloader(URL)

def LOAD(path):
    from OpenAttack import Classifier
    return AlbertClassifier(path, 3)
20 changes: 20 additions & 0 deletions OpenAttack/data/victim_albert_sst.py
@@ -0,0 +1,20 @@
"""
:type: OpenAttack.utils.AlbertClassifier
:Size: 788.66MB
:Package Requirements:
    * transformers
    * pytorch

Pretrained ALBERT model on SST-2 dataset. See :py:data:`Dataset.SST` for detail.
"""

from OpenAttack.utils import make_zip_downloader, AlbertClassifier

NAME = "Victim.ALBERT.SST"

URL = "https://thunlp.oss-cn-qingdao.aliyuncs.com/TAADToolbox/victim/albert_sst.zip"
DOWNLOAD = make_zip_downloader(URL)

def LOAD(path):
    from OpenAttack import Classifier
    return AlbertClassifier(path, 2)
20 changes: 20 additions & 0 deletions OpenAttack/data/victim_bert_mnli.py
@@ -0,0 +1,20 @@
"""
:type: OpenAttack.utils.BertClassifier
:Size: 1.23GB
:Package Requirements:
    * transformers
    * pytorch

Pretrained BERT model on MNLI dataset. See :py:data:`Dataset.MNLI` for detail.
"""

from OpenAttack.utils import make_zip_downloader, BertClassifier

NAME = "Victim.BERT.MNLI"

URL = "https://thunlp.oss-cn-qingdao.aliyuncs.com/TAADToolbox/victim/bert_mnli.zip"
DOWNLOAD = make_zip_downloader(URL)

def LOAD(path):
    from OpenAttack import Classifier
    # As with Victim.ALBERT.MNLI above, 2 labels for a conventionally
    # three-way task may be a binarized setup or a slip.
    return BertClassifier(path, 2)
20 changes: 20 additions & 0 deletions OpenAttack/data/victim_bert_snli.py
@@ -0,0 +1,20 @@
"""
:type: OpenAttack.utils.BertClassifier
:Size: 1.23GB
:Package Requirements:
    * transformers
    * pytorch

Pretrained BERT model on SNLI dataset. See :py:data:`Dataset.SNLI` for detail.
"""

from OpenAttack.utils import make_zip_downloader, BertClassifier

NAME = "Victim.BERT.SNLI"

URL = "https://thunlp.oss-cn-qingdao.aliyuncs.com/TAADToolbox/victim/bert_snli.zip"
DOWNLOAD = make_zip_downloader(URL)

def LOAD(path):
    from OpenAttack import Classifier
    return BertClassifier(path, 3)
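All of these victim modules share one shape: a docstring with size and requirements, a `NAME` for the registry, a zip downloader, and a `LOAD` that builds the classifier with the task's label count. A minimal sketch of loading and querying one, assuming the `DataManager.load` flow and the `get_prob`/`get_pred` interface the attackers rely on:

import OpenAttack

victim = OpenAttack.DataManager.load("Victim.ALBERT.SST")
sentences = ["a gripping and beautifully made film"]
print(victim.get_prob(sentences))  # expected shape (1, 2): one row, two SST-2 labels
print(victim.get_pred(sentences))  # predicted label index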