The Archive Query Log (AQL) is a previously unused, comprehensive query log collected at the Internet Archive over the last 25 years. Its first version includes 356 million queries, 166 million search result pages, and 1.7 billion search results across 550 search providers. Although many query logs have been studied in the literature, the search providers that own them generally do not publish their logs to protect user privacy and vital business data. Of the few query logs publicly available, none combines size, scope, and diversity. The AQL is the first to do so, enabling research on new retrieval models and (diachronic) search engine analyses. Provided in a privacy-preserving manner, it promotes open research as well as more transparency and accountability in the search industry.
%0 Journal Article
%1 Reimer2023-mn
%A Reimer, Jan Heinrich
%A Schmidt, Sebastian
%A Fröbe, Maik
%A Gienapp, Lukas
%A Scells, Harrisen
%A Stein, Benno
%A Hagen, Matthias
%A Potthast, Martin
%D 2023
%I arXiv
%K topic_language topic_lifescience
%T The Archive Query Log: Mining millions of search result pages of hundreds of search engines from 25 years of web archives
%X The Archive Query Log (AQL) is a previously unused, comprehensive query log collected at the Internet Archive over the last 25 years. Its first version includes 356 million queries, 166 million search result pages, and 1.7 billion search results across 550 search providers. Although many query logs have been studied in the literature, the search providers that own them generally do not publish their logs to protect user privacy and vital business data. Of the few query logs publicly available, none combines size, scope, and diversity. The AQL is the first to do so, enabling research on new retrieval models and (diachronic) search engine analyses. Provided in a privacy-preserving manner, it promotes open research as well as more transparency and accountability in the search industry.
% Reimer et al. (2023), "The Archive Query Log" -- arXiv preprint.
% Typed as misc rather than article: the record carries no journal, which the
% article type requires. The venue is given in howpublished so that it is
% actually rendered (standard styles ignore publisher for both types).
% added-at, biburl, interhash, intrahash and timestamp look like bookkeeping
% fields from the exporting repository; citation styles ignore unknown fields.
@misc{Reimer2023-mn,
  author       = {Reimer, Jan Heinrich and Schmidt, Sebastian and Fr{\"o}be, Maik and Gienapp, Lukas and Scells, Harrisen and Stein, Benno and Hagen, Matthias and Potthast, Martin},
  title        = {The {Archive Query Log}: Mining Millions of Search Result Pages of Hundreds of Search Engines from 25 Years of Web Archives},
  howpublished = {arXiv preprint},
  publisher    = {arXiv},
  year         = {2023},
  abstract     = {The Archive Query Log (AQL) is a previously unused, comprehensive query log collected at the Internet Archive over the last 25 years. Its first version includes 356 million queries, 166 million search result pages, and 1.7 billion search results across 550 search providers. Although many query logs have been studied in the literature, the search providers that own them generally do not publish their logs to protect user privacy and vital business data. Of the few query logs publicly available, none combines size, scope, and diversity. The AQL is the first to do so, enabling research on new retrieval models and (diachronic) search engine analyses. Provided in a privacy-preserving manner, it promotes open research as well as more transparency and accountability in the search industry.},
  keywords     = {topic_language topic_lifescience},
  added-at     = {2024-09-10T10:41:24.000+0200},
  timestamp    = {2024-11-22T15:47:47.000+0100},
  biburl       = {https://puma.scadsai.uni-leipzig.de/bibtex/22d9e843e218d5b1eaa62a40ec0577dcd/scadsfct},
  interhash    = {6a131b4108b6fad276c8c88225fed717},
  intrahash    = {2d9e843e218d5b1eaa62a40ec0577dcd},
}