Machine learning drives forward the development in many areas of Natural Language Processing (NLP). Until now, many NLP systems and research are focusing on high-resource languages, i.e. languages for which many data resources exist. Recently, so-called low-resource languages increasingly come into focus. In this context, multi-lingual language models, which are trained on related languages to a target low-resource language, may enable NLP tasks on this low-resource language. In this work, we investigate the use of multi-lingual models for Named Entity Recognition (NER) for low-resource languages. We consider the West Slavic language family and the low-resource languages Upper Sorbian and Kashubian. Three RoBERTa models were trained from scratch, two mono-lingual models for Czech and Polish, and one bi-lingual model for Czech and Polish. These models were evaluated on the NER downstream task for Czech, Polish, Upper Sorbian, and Kashubian, and compared to existing state-of-the-art models such as RobeCzech, HerBERT, and XLM-R. The results indicate that the mono-lingual models perform better on the language they were trained on, and both the mono-lingual and language family models outperform the large multi-lingual model in downstream tasks. Overall, the study shows that low-resource West Slavic languages can benefit from closely related languages and their models.
Please log in to take part in the discussion (add own reviews or comments).
Cite this publication
More citation styles
- please select -
%0 Conference Paper
%1 d5ae2c3f3d6741fd931a4dea1d50ebe8
%A Torge, Sunna
%A Politov, Andrei
%A Lehmann, Christoph
%A Saffar, Bochra
%A Tao, Ziyan
%B EACL 2023 - 9th Workshop on Slavic Natural Language Processing, Proceedings of the SlavicNLP 2023
%D 2023
%I The Association for Computational Linguistics
%K area_architectures unit_transfer FIS_scads imported
%P 1--10
%T Named Entity Recognition for Low-Resource Languages - Profiting from Language Families
%U https://bsnlp.cs.helsinki.fi/
%X Machine learning drives forward the development in many areas of Natural Language Processing (NLP). Until now, many NLP systems and research are focusing on high-resource languages, i.e. languages for which many data resources exist. Recently, so-called low-resource languages increasingly come into focus. In this context, multi-lingual language models, which are trained on related languages to a target low-resource language, may enable NLP tasks on this low-resource language. In this work, we investigate the use of multi-lingual models for Named Entity Recognition (NER) for low-resource languages. We consider the West Slavic language family and the low-resource languages Upper Sorbian and Kashubian. Three RoBERTa models were trained from scratch, two mono-lingual models for Czech and Polish, and one bi-lingual model for Czech and Polish. These models were evaluated on the NER downstream task for Czech, Polish, Upper Sorbian, and Kashubian, and compared to existing state-of-the-art models such as RobeCzech, HerBERT, and XLM-R. The results indicate that the mono-lingual models perform better on the language they were trained on, and both the mono-lingual and language family models outperform the large multi-lingual model in downstream tasks. Overall, the study shows that low-resource West Slavic languages can benefit from closely related languages and their models.
@inproceedings{d5ae2c3f3d6741fd931a4dea1d50ebe8,
  abstract      = {Machine learning drives forward the development in many areas of Natural Language Processing (NLP). Until now, many NLP systems and research are focusing on high-resource languages, i.e. languages for which many data resources exist. Recently, so-called low-resource languages increasingly come into focus. In this context, multi-lingual language models, which are trained on related languages to a target low-resource language, may enable NLP tasks on this low-resource language. In this work, we investigate the use of multi-lingual models for Named Entity Recognition (NER) for low-resource languages. We consider the West Slavic language family and the low-resource languages Upper Sorbian and Kashubian. Three RoBERTa models were trained from scratch, two mono-lingual models for Czech and Polish, and one bi-lingual model for Czech and Polish. These models were evaluated on the NER downstream task for Czech, Polish, Upper Sorbian, and Kashubian, and compared to existing state-of-the-art models such as RobeCzech, HerBERT, and XLM-R. The results indicate that the mono-lingual models perform better on the language they were trained on, and both the mono-lingual and language family models outperform the large multi-lingual model in downstream tasks. Overall, the study shows that low-resource West Slavic languages can benefit from closely related languages and their models.},
  added-at      = {2024-11-28T16:27:18.000+0100},
  author        = {Torge, Sunna and Politov, Andrei and Lehmann, Christoph and Saffar, Bochra and Tao, Ziyan},
  biburl        = {https://puma.scadsai.uni-leipzig.de/bibtex/25dd9604d3e05e770b3466e6878813b82/scadsfct},
  booktitle     = {EACL 2023 - 9th Workshop on Slavic Natural Language Processing, Proceedings of the SlavicNLP 2023},
  interhash     = {c211c9eaad7cf00ad19df5901b391e11},
  intrahash     = {5dd9604d3e05e770b3466e6878813b82},
  keywords      = {area_architectures unit_transfer FIS_scads imported},
  language      = {English},
  note          = {Publisher Copyright: {\textcopyright} 2023 Association for Computational Linguistics.; 9th Workshop on Slavic Natural Language Processing, Slavic NLP 2023 ; Conference date: 06-05-2023 Through 06-05-2023},
  pages         = {1--10},
  publisher     = {The Association for Computational Linguistics},
  series        = {Proceedings of the Workshop (SlavicNLP)},
  timestamp     = {2024-12-06T14:36:51.000+0100},
  title         = {{Named Entity Recognition} for Low-Resource Languages -- Profiting from Language Families},
  url           = {https://bsnlp.cs.helsinki.fi/},
  internal-note = {NOTE(review): url points to the workshop homepage, not the paper itself -- consider adding the ACL Anthology DOI/URL for this paper once confirmed},
  year          = {2023}
}