Move the transformer choice to the resources directory and refactor a
method that can be used by each of the processors which use the transformers
AngledLuffa committed Aug 22, 2023
1 parent e4b967a commit c901476
Showing 7 changed files with 281 additions and 275 deletions.
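
The shared "method that can be used by each of the processors" lives in one of the other changed files, not in the hunk shown below. As a minimal sketch of what such a lookup helper could look like (the name default_transformer and its signature are assumptions here, not the actual Stanza API):

    def default_transformer(lang):
        """Return the default transformer for a language, or None if none is chosen."""
        return TRANSFORMERS.get(lang)
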
247 changes: 247 additions & 0 deletions stanza/resources/default_packages.py
@@ -330,3 +330,250 @@
    },
}


"""
Default transformers to use for various languages.
We try to document why we chose a particular model in each case.
"""
TRANSFORMERS = {
    # https://huggingface.co/Maltehb/danish-bert-botxo
    # contrary to normal expectations, this hurts F1
    # on a dev split by about 1 F1
    # "da": "Maltehb/danish-bert-botxo",
    #
    # the multilingual bert is a marginal improvement for conparse
    #
    # December 2022 update:
    # there are quite a few Danish transformers available on HuggingFace
    # here are the results of training a constituency parser with
    # adadelta/adamw on each of them (one column per optimizer):
    #
    #                                        adadelta  adamw
    # no bert                                0.8245    0.8230
    # alexanderfalk/danbert-small-cased      0.8236    0.8286
    # Geotrend/distilbert-base-da-cased      0.8268    0.8306
    # sarnikowski/convbert-small-da-cased    0.8322    0.8341
    # bert-base-multilingual-cased           0.8341    0.8342
    # vesteinn/ScandiBERT-no-faroese         0.8373    0.8408
    # Maltehb/danish-bert-botxo              0.8383    0.8408
    # vesteinn/ScandiBERT                    0.8421    0.8475
    #
    # Also, two models have token windows too short for use with the
    # Danish dataset:
    #   jonfd/electra-small-nordic
    #   Maltehb/aelaectra-danish-electra-small-cased
    #
    "da": "vesteinn/ScandiBERT",

    # As of April 2022, the available bert models have a weird
    # tokenizer issue where a soft hyphen causes them to crash.
    # We attempt to compensate for that in the dev branch.
    #
    # bert-base-german-cased
    #  dev:  2022-04-27 21:21:31 INFO: de_germeval2014 87.59
    #  test: 2022-04-27 21:21:59 INFO: de_germeval2014 86.95
    #
    # dbmdz/bert-base-german-cased
    #  dev:  2022-04-27 22:24:59 INFO: de_germeval2014 88.22
    #  test: 2022-04-27 22:25:27 INFO: de_germeval2014 87.80
    "de": "dbmdz/bert-base-german-cased",

    # experiments on various forms of roberta & electra
    # https://huggingface.co/roberta-base
    # https://huggingface.co/roberta-large
    # https://huggingface.co/google/electra-small-discriminator
    # https://huggingface.co/google/electra-base-discriminator
    # https://huggingface.co/google/electra-large-discriminator
    #
    # experiments using the different models for POS tagging,
    # dev set, including WV and charlm, AllTags score:
    #   roberta-base:  95.67
    #   roberta-large: 95.98
    #   electra-small: 95.31
    #   electra-base:  95.90
    #   electra-large: 96.01
    #
    # depparse scores, dev set, no finetuning, with WV and charlm
    #                  UAS    LAS    CLAS   MLAS   BLEX
    #   roberta-base:  93.16  91.20  89.87  89.38  89.87
    #   roberta-large: 93.47  91.56  90.13  89.71  90.13
    #   electra-small: 92.17  90.02  88.25  87.66  88.25
    #   electra-base:  93.42  91.44  90.10  89.67  90.10
    #   electra-large: 94.07  92.17  90.99  90.53  90.99
    #
    # conparse scores, dev & test set, with WV and charlm
    #   roberta-base:  96.05  95.60
    #   roberta-large: 95.95  95.60
    #   electra-small: 95.33  95.04
    #   electra-base:  96.09  95.98
    #   electra-large: 96.25  96.14
    #
    # conparse scores w/ finetune, dev & test set, with WV and charlm
    #   roberta-base:  96.07  95.81
    #   roberta-large: 96.37  96.41  (!!!)
    #   electra-small: 95.62  95.36
    #   electra-base:  96.21  95.94
    #   electra-large: 96.40  96.32
    #
    "en": "google/electra-large-discriminator",

    # NER scores for a couple of Persian options:
    # none:
    #  dev:  2022-04-23 01:44:53 INFO: fa_arman 79.46
    #  test: 2022-04-23 01:45:03 INFO: fa_arman 80.06
    #
    # HooshvareLab/bert-fa-zwnj-base
    #  dev:  2022-04-23 02:43:44 INFO: fa_arman 80.87
    #  test: 2022-04-23 02:44:07 INFO: fa_arman 80.81
    #
    # HooshvareLab/roberta-fa-zwnj-base
    #  dev:  2022-04-23 16:23:25 INFO: fa_arman 81.23
    #  test: 2022-04-23 16:23:48 INFO: fa_arman 81.11
    #
    # HooshvareLab/bert-base-parsbert-uncased
    #  dev:  2022-04-26 10:42:09 INFO: fa_arman 82.49
    #  test: 2022-04-26 10:42:31 INFO: fa_arman 83.16
    "fa": "HooshvareLab/bert-base-parsbert-uncased",

    # NER scores for a couple of options:
    # none:
    #  dev:  2022-03-04 INFO: fi_turku 83.45
    #  test: 2022-03-04 INFO: fi_turku 86.25
    #
    # bert-base-multilingual-cased
    #  dev:  2022-03-04 INFO: fi_turku 85.23
    #  test: 2022-03-04 INFO: fi_turku 89.00
    #
    # TurkuNLP/bert-base-finnish-cased-v1:
    #  dev:  2022-03-04 INFO: fi_turku 88.41
    #  test: 2022-03-04 INFO: fi_turku 91.36
    "fi": "TurkuNLP/bert-base-finnish-cased-v1",

    # POS dev set tagging results for French
    # (the columns appear to be UPOS, XPOS, UFeats, AllTags):
    # No bert:
    #   98.60  100.00  98.55  98.04
    # dbmdz/electra-base-french-europeana-cased-discriminator
    #   98.70  100.00  98.69  98.24
    # benjamin/roberta-base-wechsel-french
    #   98.71  100.00  98.75  98.26
    # camembert/camembert-large
    #   98.75  100.00  98.75  98.30
    # camembert-base
    #   98.78  100.00  98.77  98.33
    "fr": "camembert-base",

    # a couple of possibilities to experiment with for Hebrew
    # dev scores for POS and depparse
    # https://huggingface.co/imvladikon/alephbertgimmel-base-512
    #   UPOS   XPOS   UFeats  AllTags
    #   97.25  97.25  92.84   91.81
    #   UAS    LAS    CLAS    MLAS   BLEX
    #   94.42  92.47  89.49   88.82  89.49
    #
    # https://huggingface.co/onlplab/alephbert-base
    #   UPOS   XPOS   UFeats  AllTags
    #   97.37  97.37  92.50   91.55
    #   UAS    LAS    CLAS    MLAS   BLEX
    #   94.06  92.12  88.80   88.13  88.80
    #
    # https://huggingface.co/avichr/heBERT
    #   UPOS   XPOS   UFeats  AllTags
    #   97.09  97.09  92.36   91.28
    #   UAS    LAS    CLAS    MLAS   BLEX
    #   94.29  92.30  88.99   88.38  88.99
    "he": "imvladikon/alephbertgimmel-base-512",

    # https://huggingface.co/xlm-roberta-base
    # Scores by entity for armtdp NER on 18 labels:
    #   no bert          : 86.68
    #   xlm-roberta-base : 89.31
    "hy": "xlm-roberta-base",

    # Indonesian POS experiments: dev set of GSD
    #   python3 stanza/utils/training/run_pos.py id_gsd --no_bert
    #   python3 stanza/utils/training/run_pos.py id_gsd --bert_model ...
    # also ran on the ICON constituency dataset
    #   model                                     POS        CON
    #   no_bert                                   89.95      84.74
    #   flax-community/indonesian-roberta-large   89.78 (!)  xxx
    #   flax-community/indonesian-roberta-base    90.14      xxx
    #   indolem/indobert-base-uncased             90.21      88.60
    #   cahya/bert-base-indonesian-1.5G           90.32      88.15
    #   cahya/roberta-base-indonesian-1.5G        90.40      87.27
    "id": "indolem/indobert-base-uncased",

    # from https://github.com/idb-ita/GilBERTo
    # annoyingly, it doesn't handle cased text
    # supposedly there is an argument "do_lower_case",
    # but that still leaves a lot of unk tokens
    # "it": "idb-ita/gilberto-uncased-from-camembert",
    #
    # from https://github.com/musixmatchresearch/umberto
    # on NER, this gets 88.37 dev and 91.02 test
    # another option is dbmdz/bert-base-italian-cased,
    # which gets 87.27 dev and 90.32 test
    #
    # in-order constituency parser on the VIT dev set:
    #   dbmdz/bert-base-italian-cased:                       0.8079
    #   dbmdz/bert-base-italian-xxl-cased:                   0.8195
    #   Musixmatch/umberto-commoncrawl-cased-v1:             0.8256
    #   dbmdz/electra-base-italian-xxl-cased-discriminator:  0.8314
    #
    # FBK NER dev set:
    #   dbmdz/bert-base-italian-cased:                       87.76
    #   Musixmatch/umberto-commoncrawl-cased-v1:             88.62
    #   dbmdz/bert-base-italian-xxl-cased:                   88.84
    #   dbmdz/electra-base-italian-xxl-cased-discriminator:  89.91
    #
    # combined UD POS dev set:
    #                                                        UPOS   XPOS   UFeats  AllTags
    #   dbmdz/bert-base-italian-cased:                       98.62  98.53  98.06   97.49
    #   dbmdz/bert-base-italian-xxl-cased:                   98.61  98.54  98.07   97.58
    #   dbmdz/electra-base-italian-xxl-cased-discriminator:  98.64  98.54  98.14   97.61
    #   Musixmatch/umberto-commoncrawl-cased-v1:             98.56  98.45  98.13   97.62
    "it": "dbmdz/electra-base-italian-xxl-cased-discriminator",

    # experiments on the cintil conparse dataset
    # ran a variety of transformer settings
    # found the following dev set scores after 400 iterations:
    #   Geotrend/distilbert-base-pt-cased:            not plug & play
    #   no bert:                                      0.9082
    #   xlm-roberta-base:                             0.9109
    #   xlm-roberta-large:                            0.9254
    #   adalbertojunior/distilbert-portuguese-cased:  0.9300
    #   neuralmind/bert-base-portuguese-cased:        0.9307
    #   neuralmind/bert-large-portuguese-cased:       0.9343
    "pt": "neuralmind/bert-large-portuguese-cased",

    # https://huggingface.co/dbmdz/bert-base-turkish-128k-cased
    # helps the Turkish model quite a bit
    "tr": "dbmdz/bert-base-turkish-128k-cased",

    # from https://github.com/VinAIResearch/PhoBERT
    # "vi": "vinai/phobert-base",
    # using 6 or 7 layers of phobert-large is slightly
    # more effective for constituency parsing than
    # using 4 layers of phobert-base;
    # going beyond 4 layers of phobert-base
    # does not help the scores
    "vi": "vinai/phobert-large",

    # https://github.com/ymcui/Chinese-BERT-wwm
    # there's also hfl/chinese-roberta-wwm-ext-large,
    # hfl/chinese-electra-base-discriminator, and
    # hfl/chinese-electra-180g-large-discriminator,
    # which works better than the roberta below on constituency parsing
    "zh-hans": "hfl/chinese-roberta-wwm-ext",

    # https://huggingface.co/allegro/herbert-base-cased
    # Scores by entity on the NKJP NER task:
    #   no bert                        (dev/test): 88.64/88.75
    #   herbert-base-cased             (dev/test): 91.48/91.02
    #   herbert-large-cased            (dev/test): 92.25/91.62
    #   sdadas/polish-roberta-large-v2 (dev/test): 92.66/91.22
    "pl": "allegro/herbert-base-cased",
}

TRANSFORMER_LAYERS = {
    # not clear what the best number is without more experiments,
    # but more than 4 layers works better than just 4
    "vi": 7,
}
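
As a rough illustration of how the two tables above might be consumed together, the sketch below loads a language's default transformer through the HuggingFace transformers library and averages its top N hidden layers. The helper name, the fallback of 4 layers, and the choice of the top layers are all assumptions for illustration, not code from this commit:

    from transformers import AutoModel, AutoTokenizer
    import torch

    def load_default_transformer(lang, default_layers=4):
        # default_layers=4 is an assumed fallback, suggested by the
        # phobert-base comment above; it is not a documented constant
        name = TRANSFORMERS.get(lang)
        if name is None:
            return None, None, None
        tokenizer = AutoTokenizer.from_pretrained(name)
        model = AutoModel.from_pretrained(name, output_hidden_states=True)
        return tokenizer, model, TRANSFORMER_LAYERS.get(lang, default_layers)

    # e.g. Vietnamese: phobert-large, using its top 7 hidden layers
    tokenizer, model, n_layers = load_default_transformer("vi")
    inputs = tokenizer("xin chào", return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # hidden_states holds the embedding layer plus one tensor per layer;
    # average the top n_layers of them as contextual word features
    features = torch.stack(outputs.hidden_states[-n_layers:]).mean(dim=0)
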
