Skip to content

Commit

Permalink
First two transformers for Polish, trained in the Clarin-Biz project (#1)
Browse files Browse the repository at this point in the history
* add docker

* docker update

* ignore

* test transformers loading

* use our v1 polish roberta from hugging face repo

* update clarin hf repo

* update clarin long former model

* add transformers

* move to experimental

* fix moved init

* fix moved init

* add transformers

move to experimental

fix moved init

fix moved init

* mains

* ignore lib without type in mypy
add tensorflow to poetry
  • Loading branch information
laugustyniak authored Mar 23, 2021
1 parent ea095b7 commit 965be95
Show file tree
Hide file tree
Showing 10 changed files with 891 additions and 6 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -127,3 +127,5 @@ dmypy.json

# Pyre type checker
.pyre/

.idea/
19 changes: 19 additions & 0 deletions docker/embeddings-gpu/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# GPU base image for embeddings experiments: CUDA 11.0 + cuDNN 8 on Ubuntu 18.04.
FROM nvidia/cuda:11.0.3-cudnn8-runtime-ubuntu18.04

# MAINTAINER has been deprecated since Docker 1.13 — use a LABEL instead.
LABEL maintainer="Lukasz Augustyniak <lukasz.augustyniak@pwr.edu.pl>"

# Install Python 3 and build tooling in a single layer, then clean the apt
# cache. Note: remove the *contents* (/var/lib/apt/lists/*), not the
# directory itself, so later `apt update` calls still work.
RUN apt update && \
    apt install -y bash \
    build-essential \
    git \
    curl \
    ca-certificates \
    python3 \
    python3-pip && \
    rm -rf /var/lib/apt/lists/*

# Upgrade pip and install the ML stack; --no-cache-dir keeps the image slim.
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
    python3 -m pip install --no-cache-dir \
    jupyter \
    tensorflow \
    torch \
    transformers
Empty file added experimental/__init__.py
Empty file.
Empty file.
Empty file.
24 changes: 24 additions & 0 deletions experimental/embeddings/language_models/long_former.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from transformers import AutoTokenizer, AutoModel
import tensorflow as tf


def main() -> None:
    """Smoke-test the CLARIN-PL Polish Longformer from the Hugging Face hub.

    Downloads the tokenizer and base model, encodes a prompt/choice batch,
    and runs a single forward pass, printing the embedding tensor shape.

    Raises:
        OSError: if the model cannot be downloaded from the hub.
    """
    tokenizer = AutoTokenizer.from_pretrained("clarin-pl/long-former-polish")
    # AutoModel loads the *PyTorch* base model, so the tokenizer must emit
    # PyTorch tensors ("pt"); the previous "tf" tensors could not be fed
    # into this model.
    model = AutoModel.from_pretrained("clarin-pl/long-former-polish")

    prompt = "Zażółć gęślą jaźń"
    choice0 = "Gęś"
    choice1 = "Jaźń."

    encoding = tokenizer(
        [[prompt, prompt], [choice0, choice1]], return_tensors="pt", padding=True
    )
    # A base model expects 2-D (batch, seq_len) inputs, so no extra batch
    # dimension is added; tensors are passed as keyword arguments.
    outputs = model(**encoding)

    # AutoModel has no classification head, so its output exposes
    # `last_hidden_state` (contextual embeddings) — `outputs.logits` does
    # not exist on a base-model output and raised AttributeError before.
    print(outputs.last_hidden_state.shape)


if __name__ == "__main__":
    main()
13 changes: 13 additions & 0 deletions experimental/embeddings/language_models/roberta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from transformers import AutoTokenizer, AutoModel


def main() -> None:
    """Load the CLARIN-PL Polish RoBERTa and run one forward pass.

    Downloads the tokenizer and base model from the Hugging Face hub and
    encodes a short Polish pangram to verify the pipeline works end to end.
    """
    checkpoint = "clarin-pl/roberta-polish-kgr10"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModel.from_pretrained(checkpoint)

    # Encode as PyTorch tensors and push them through the model once.
    token_ids = tokenizer.encode("Zażółć gęślą jaźń", return_tensors="pt")
    outputs = model(token_ids)  # noqa: F841 — forward pass is the point here


if __name__ == "__main__":
    main()
5 changes: 5 additions & 0 deletions mypy.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[mypy-transformers.*]
ignore_missing_imports = True

[mypy-tensorflow.*]
ignore_missing_imports = True
Loading

0 comments on commit 965be95

Please sign in to comment.