Context-sensitive word search engines are writing assistants that support word choice, phrasing, and idiomatic language use by indexing large-scale n-gram collections and implementing a wildcard search. However, search results become unreliable with increasing context size (e.g., n ≥ 5), when observations become sparse. This paper proposes two strategies for word search with larger n, based on masked and conditional language modeling. We build such search engines using BERT and BART and compare their capabilities in answering English context queries with those of the n-gram-based word search engine Netspeak. Our proposed strategies score within 5 percentage points MRR of n-gram collections while answering up to 5 times as many queries.
%0 Conference Paper
%1 wiegmann-etal-2022-language
%A Wiegmann, Matti
%A Völske, Michael
%A Stein, Benno
%A Potthast, Martin
%B Proceedings of the First Workshop on Intelligent and Interactive Writing Assistants (In2Writing 2022)
%C Dublin, Ireland
%D 2022
%I Association for Computational Linguistics
%K
%P 39--45
%R 10.18653/v1/2022.in2writing-1.5
%T Language Models as Context-sensitive Word Search Engines
%U https://aclanthology.org/2022.in2writing-1.5
%X Context-sensitive word search engines are writing assistants that support word choice, phrasing, and idiomatic language use by indexing large-scale n-gram collections and implementing a wildcard search. However, search results become unreliable with increasing context size (e.g., n ≥ 5), when observations become sparse. This paper proposes two strategies for word search with larger n, based on masked and conditional language modeling. We build such search engines using BERT and BART and compare their capabilities in answering English context queries with those of the n-gram-based word search engine Netspeak. Our proposed strategies score within 5 percentage points MRR of n-gram collections while answering up to 5 times as many queries.
@inproceedings{wiegmann-etal-2022-language,
  author    = {Wiegmann, Matti and V{\"o}lske, Michael and Stein, Benno and Potthast, Martin},
  title     = {Language Models as Context-sensitive Word Search Engines},
  booktitle = {Proceedings of the First Workshop on Intelligent and Interactive Writing Assistants ({In2Writing} 2022)},
  year      = 2022,
  month     = may,
  address   = {Dublin, Ireland},
  publisher = {Association for Computational Linguistics},
  pages     = {39--45},
  doi       = {10.18653/v1/2022.in2writing-1.5},
  url       = {https://aclanthology.org/2022.in2writing-1.5},
  abstract  = {Context-sensitive word search engines are writing assistants that support word choice, phrasing, and idiomatic language use by indexing large-scale n-gram collections and implementing a wildcard search. However, search results become unreliable with increasing context size (e.g., n{\textgreater}=5), when observations become sparse. This paper proposes two strategies for word search with larger n, based on masked and conditional language modeling. We build such search engines using BERT and BART and compare their capabilities in answering English context queries with those of the n-gram-based word search engine Netspeak. Our proposed strategies score within 5 percentage points MRR of n-gram collections while answering up to 5 times as many queries.},
  added-at  = {2024-09-10T11:56:37.000+0200},
  biburl    = {https://puma.scadsai.uni-leipzig.de/bibtex/22896b274d7ef7975319d9b52be0556fd/scadsfct},
  interhash = {0b89126a01ffb674648dbfad2d0f56ba},
  intrahash = {2896b274d7ef7975319d9b52be0556fd},
  timestamp = {2024-09-10T15:15:57.000+0200}
}