Learning to rank (LTR) is the de facto standard for web search, improving upon classical retrieval models by exploiting (in)direct relevance feedback from user judgments, interaction logs, etc. We investigate for the first time the effect of a sampling bias on LTR models due to the potential presence of near-duplicate web pages in the training data, and how (in)consistent relevance feedback of duplicates influences an LTR model's decisions. To examine this bias, we construct a series of specialized LTR datasets based on the ClueWeb09 corpus with varying amounts of near-duplicates. We devise worst-case and average-case train/test splits that are evaluated on popular pointwise, pairwise, and listwise LTR models. Our experiments demonstrate that duplication causes overfitting and thus less effective models, making a strong case for the benefits of systematic deduplication before training and model evaluation.
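The key methodological point of the abstract is that near-duplicate documents must not leak across the train/test boundary. A minimal, hypothetical sketch of such a duplicate-aware split, using scikit-learn's GroupShuffleSplit on made-up toy data (not the authors' code or the actual ClueWeb09 setup), could look like this:

    # Toy sketch: keep every near-duplicate cluster entirely in train or in test,
    # so duplicated pages cannot leak relevance signals across the split.
    import numpy as np
    from sklearn.model_selection import GroupShuffleSplit

    X = np.random.rand(10, 5)                 # toy LTR feature vectors
    y = np.random.randint(0, 3, size=10)      # toy graded relevance labels
    dup_cluster = np.array([0, 0, 1, 2, 2, 2, 3, 4, 4, 5])  # near-duplicate cluster ID per document

    splitter = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
    train_idx, test_idx = next(splitter.split(X, y, groups=dup_cluster))

    # No duplicate cluster appears on both sides of the split.
    assert not set(dup_cluster[train_idx]) & set(dup_cluster[test_idx])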
@inproceedings{10.1145/3397271.3401212,
abstract = {Learning to rank~(LTR) is the de facto standard for web search, improving upon classical retrieval models by exploiting (in)direct relevance feedback from user judgments, interaction logs, etc. We investigate for the first time the effect of a sampling bias on LTR~models due to the potential presence of near-duplicate web pages in the training data, and how (in)consistent relevance feedback of duplicates influences an LTR~model's decisions. To examine this bias, we construct a series of specialized LTR~datasets based on the ClueWeb09 corpus with varying amounts of near-duplicates. We devise worst-case and average-case train/test splits that are evaluated on popular pointwise, pairwise, and listwise LTR~models. Our experiments demonstrate that duplication causes overfitting and thus less effective models, making a strong case for the benefits of systematic deduplication before training and model evaluation.},
address = {New York, NY, USA},
author = {Fr\"{o}be, Maik and Bevendorff, Janek and Reimer, Jan Heinrich and Potthast, Martin and Hagen, Matthias},
booktitle = {Proceedings of the 43rd International ACM SIGIR Conference on Research and Development in Information Retrieval},
doi = {10.1145/3397271.3401212},
isbn = {9781450380164},
keywords = {learning to rank, near-duplicate detection, novelty principle, selection bias},
location = {Virtual Event, China},
numpages = {4},
pages = {1997--2000},
publisher = {Association for Computing Machinery},
series = {SIGIR '20},
title = {Sampling Bias Due to Near-Duplicates in Learning to Rank},
url = {https://doi.org/10.1145/3397271.3401212},
year = 2020
}