Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a constituency model for Icelandic #1389

Open
wants to merge 1 commit into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions stanza/resources/default_packages.py
Original file line number Diff line number Diff line change
Expand Up @@ -635,6 +635,17 @@ def build_default_pretrains(default_treebanks):
# xlm-roberta-base : 89.31
"hy": "xlm-roberta-base",

# https://huggingface.co/mideind/IceBERT
# IceBERT-large is also available:
# https://huggingface.co/mideind/IceBERT-large
# Constituency F1 scores:
# No bert (in-order): 84.40%
# IceBERT (top-down): 88.66%
# IceBERT (finetuning, top-down): 90.38%
# IceBERT-large (top-down): 88.80%
# IceBERT-large (ft, top-down): 90.29%
"is": "mideind/IceBERT"

# Indonesian POS experiments: dev set of GSD
# python3 stanza/utils/training/run_pos.py id_gsd --no_bert
# python3 stanza/utils/training/run_pos.py id_gsd --bert_model ...
Expand Down Expand Up @@ -811,6 +822,10 @@ def build_default_pretrains(default_treebanks):
# hy
"xlm-roberta-base": "xlm-roberta-base",

# is
"mideind/IceBERT": "icebert",
"mideind/IceBERT-large": "icebert-large",

# id
"indolem/indobert-base-uncased": "indobert",
"indobenchmark/indobert-large-p1": "indobenchmark-large-p1",
Expand Down
48 changes: 48 additions & 0 deletions stanza/utils/datasets/constituency/convert_icepahc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from stanza.utils.datasets.constituency import utils

def read_psd_file(input_file):
    """
    Read an IcePaHC .psd file and return its trees as text.

    A tree starts on a line beginning with "(ROOT" and runs until the
    next such line (or end of file).  All internal whitespace,
    including newlines, is collapsed to single spaces so each tree
    comes back as a single line.

    Fixes over the naive accumulate-everything approach:
    - lines before the first "(ROOT" (blank lines, headers) are
      ignored instead of being emitted as a bogus first "tree"
    - every tree gets identical whitespace normalization
    - the file is streamed line by line instead of readlines()

    Returns a list of one-line tree strings.
    """
    output_trees = []
    current_tree = []

    with open(input_file, encoding='utf-8') as fin:
        for line in fin:
            if line.startswith("(ROOT"):
                # a new tree begins: flush the one being collected
                if current_tree:
                    output_trees.append(' '.join(''.join(current_tree).split()))
                current_tree = [line]
            elif current_tree:
                # only collect once we are inside a tree; this skips
                # any preamble before the first "(ROOT"
                current_tree.append(line)

    # Can't forget the last tree
    if current_tree:
        output_trees.append(' '.join(''.join(current_tree).split()))

    return output_trees


def convert_icepahc_treebank(input_file, train_size=0.8, dev_size=0.1):
    """
    Read the IcePaHC trees from input_file and split them into sections.

    train_size and dev_size are fractions of the whole treebank; the
    remainder becomes the test section.

    Returns a (train, dev, test) tuple of lists of tree strings.
    Prints the overall and per-section counts as a side effect.
    """
    trees = read_psd_file(input_file)
    print("Read %d trees" % len(trees))

    train, dev, test = utils.split_treebank(trees, train_size, dev_size)
    print("Split %d trees into %d train %d dev %d test" % (len(trees), len(train), len(dev), len(test)))

    return train, dev, test


def main():
    """
    Convert the simplified IcePaHC treebank in the current directory.

    Expects simpleicepahc24.psd in the working directory; the split
    sizes are printed as a side effect.
    """
    # the return value was previously bound to an unused variable;
    # the call is made for its printed output only
    convert_icepahc_treebank("simpleicepahc24.psd")

if __name__ == '__main__':
    main()
25 changes: 25 additions & 0 deletions stanza/utils/datasets/constituency/prepare_con_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,14 @@
Currently only German is converted, the German version being a
version of the Tiger Treebank
python3 -m stanza.utils.datasets.constituency.prepare_con_dataset de_spmrl

is_icepahc
The Icelandic Parsed Historical Corpus (IcePaHC), available at:
https://clarin.is/en/resources/icepahc/
A simplified/clean version of the IcePaHC treebank is used for the training
of the constituency parser, where for example empty phrases (traces and zero
subjects) and lemmas have been removed. This version is available at:
https://github.com/ingunnjk/IceConParse/tree/main/data
"""

import argparse
Expand Down Expand Up @@ -198,6 +206,7 @@
from stanza.utils.datasets.constituency.utils import SHARDS, write_dataset
import stanza.utils.datasets.constituency.vtb_convert as vtb_convert
import stanza.utils.datasets.constituency.vtb_split as vtb_split
from stanza.utils.datasets.constituency.convert_icepahc import convert_icepahc_treebank

class UnknownDatasetError(ValueError):
def __init__(self, dataset, text):
Expand Down Expand Up @@ -470,6 +479,20 @@ def process_spmrl(paths, dataset_name, *args):

convert_spmrl(input_directory, output_directory, dataset_name)

def process_icepahc(paths, dataset_name, *args):
    """
    Processes the Icelandic dataset, IcePaHC

    Reads simpleicepahc24.psd from CONSTITUENCY_BASE, splits it into
    train/dev/test, and writes the sections to CONSTITUENCY_DATA_DIR.
    Raises FileNotFoundError if the .psd file is missing.
    """
    assert dataset_name == 'is_icepahc'

    psd_path = os.path.join(paths["CONSTITUENCY_BASE"], "simpleicepahc24.psd")
    if not os.path.exists(psd_path):
        raise FileNotFoundError("Unable to find input file for IcePaHC. Expected in {}".format(psd_path))

    splits = convert_icepahc_treebank(psd_path)
    write_dataset(splits, paths["CONSTITUENCY_DATA_DIR"], dataset_name)

DATASET_MAPPING = {
'da_arboretum': process_arboretum,

Expand All @@ -495,6 +518,8 @@ def process_spmrl(paths, dataset_name, *args):

'zh-hans_ctb-51': process_ctb_51,
'zh-hans_ctb-90': process_ctb_90,

'is_icepahc': process_icepahc,
}

def main(dataset_name, *args):
Expand Down