%0 Generic
%1 laurençon2023bigsciencerootscorpus16tb
%A Laurençon, Hugo
%A Saulnier, Lucile
%A Wang, Thomas
%A Akiki, Christopher
%A Villanova del Moral, Albert
%A Le Scao, Teven
%A von Werra, Leandro
%A Mou, Chenghao
%A González Ponferrada, Eduardo
%A Nguyen, Huu
%A Frohberg, Jörg
%A Šaško, Mario
%A Lhoest, Quentin
%A McMillan-Major, Angelina
%A Dupont, Gerard
%A Biderman, Stella
%A Rogers, Anna
%A Ben Allal, Loubna
%A De Toni, Francesco
%A Pistilli, Giada
%A Nguyen, Olivier
%A Nikpoor, Somaieh
%A Masoud, Maraim
%A Colombo, Pierre
%A de la Rosa, Javier
%A Villegas, Paulo
%A Thrush, Tristan
%A Longpre, Shayne
%A Nagel, Sebastian
%A Weber, Leon
%A Muñoz, Manuel
%A Zhu, Jian
%A van Strien, Daniel
%A Alyafeai, Zaid
%A Almubarak, Khalid
%A Vu, Minh Chien
%A Gonzalez-Dios, Itziar
%A Soroa, Aitor
%A Lo, Kyle
%A Dey, Manan
%A Ortiz Suarez, Pedro
%A Gokaslan, Aaron
%A Bose, Shamik
%A Adelani, David
%A Phan, Long
%A Tran, Hieu
%A Yu, Ian
%A Pai, Suhas
%A Chim, Jenny
%A Lepercq, Violette
%A Ilic, Suzana
%A Mitchell, Margaret
%A Luccioni, Sasha Alexandra
%A Jernite, Yacine
%D 2023
%K topic_language imported
%T The BigScience ROOTS Corpus: A 1.6TB Composite Multilingual Dataset
%U https://arxiv.org/abs/2303.03915
@misc{laurençon2023bigsciencerootscorpus16tb,
  ids           = {laurencon2023bigsciencerootscorpus16tb},
  added-at      = {2024-10-02T10:38:17.000+0200},
  archiveprefix = {arXiv},
  author        = {Laurençon, Hugo
                   and Saulnier, Lucile
                   and Wang, Thomas
                   and Akiki, Christopher
                   and {Villanova del Moral}, Albert
                   and Le Scao, Teven
                   and von Werra, Leandro
                   and Mou, Chenghao
                   and González Ponferrada, Eduardo
                   and Nguyen, Huu
                   and Frohberg, Jörg
                   and Šaško, Mario
                   and Lhoest, Quentin
                   and McMillan-Major, Angelina
                   and Dupont, Gerard
                   and Biderman, Stella
                   and Rogers, Anna
                   and Ben Allal, Loubna
                   and De Toni, Francesco
                   and Pistilli, Giada
                   and Nguyen, Olivier
                   and Nikpoor, Somaieh
                   and Masoud, Maraim
                   and Colombo, Pierre
                   and de la Rosa, Javier
                   and Villegas, Paulo
                   and Thrush, Tristan
                   and Longpre, Shayne
                   and Nagel, Sebastian
                   and Weber, Leon
                   and Muñoz, Manuel
                   and Zhu, Jian
                   and van Strien, Daniel
                   and Alyafeai, Zaid
                   and Almubarak, Khalid
                   and Vu, Minh Chien
                   and Gonzalez-Dios, Itziar
                   and Soroa, Aitor
                   and Lo, Kyle
                   and Dey, Manan
                   and Ortiz Suarez, Pedro
                   and Gokaslan, Aaron
                   and Bose, Shamik
                   and Adelani, David
                   and Phan, Long
                   and Tran, Hieu
                   and Yu, Ian
                   and Pai, Suhas
                   and Chim, Jenny
                   and Lepercq, Violette
                   and Ilic, Suzana
                   and Mitchell, Margaret
                   and Luccioni, Sasha Alexandra
                   and Jernite, Yacine},
  biburl        = {https://puma.scadsai.uni-leipzig.de/bibtex/24fb1333029089c94e1cf1a44cf1c1f38/scadsfct},
  eprint        = {2303.03915},
  interhash     = {5974274ecc0057b20fe9f96b1b82621b},
  intrahash     = {4fb1333029089c94e1cf1a44cf1c1f38},
  keywords      = {topic_language imported},
  primaryclass  = {cs.CL},
  timestamp     = {2024-11-28T17:41:12.000+0100},
  title         = {The {BigScience} {ROOTS} Corpus: A {1.6TB} Composite Multilingual Dataset},
  url           = {https://arxiv.org/abs/2303.03915},
  year          = {2023},
}