Cross-encoders distilled from large language models (LLMs) are often more effective re-rankers than cross-encoders fine-tuned on manually labeled data. However, the distilled models usually do not reach their teacher LLM's effectiveness. To investigate whether best practices for fine-tuning cross-encoders on manually labeled data (e.g., hard-negative sampling, deep sampling, and listwise loss functions) can help to improve LLM ranker distillation, we construct and release a new distillation dataset: Rank-DistiLLM. In our experiments, cross-encoders trained on Rank-DistiLLM reach the effectiveness of LLMs while being orders of magnitude more efficient. Our code and data are available at https://github.com/webis-de/msmarco-llm-distillation.
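To make the distillation setup concrete, here is a minimal, illustrative sketch (not the authors' implementation and not taken from the linked repository): one common way to distill a teacher LLM's ranking into a cross-encoder is a RankNet-style pairwise loss computed from the teacher's ordering alone, without needing teacher scores. The function name, tensor layout, and toy values below are assumptions chosen for illustration.

# Illustrative sketch (assumption, not the paper's exact training code):
# distill a teacher LLM's ranking into a student cross-encoder with a
# RankNet-style pairwise loss over the teacher's passage ordering.
import torch


def ranknet_distillation_loss(student_scores: torch.Tensor) -> torch.Tensor:
    """Pairwise logistic loss over all passage pairs of one query.

    `student_scores` holds the cross-encoder's relevance scores for the
    passages of a single query, already sorted by the teacher LLM's ranking
    (index 0 = passage the teacher ranked highest).
    """
    # Score differences s_i - s_j for every pair (i, j).
    diff = student_scores.unsqueeze(1) - student_scores.unsqueeze(0)
    # The teacher prefers i over j whenever i comes before j in its ranking.
    teacher_prefers = torch.triu(torch.ones_like(diff, dtype=torch.bool), diagonal=1)
    # Binary cross-entropy on the pairwise preferences: log(1 + exp(-(s_i - s_j))).
    return torch.nn.functional.softplus(-diff[teacher_prefers]).mean()


# Toy usage: four passages for one query, scored by the student cross-encoder.
scores = torch.tensor([2.1, 0.3, 1.7, -0.5], requires_grad=True)
loss = ranknet_distillation_loss(scores)
loss.backward()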
@article{Schlatt2024-os,
author = {Schlatt, Ferdinand and Fr{\"o}be, Maik and Scells, Harrisen and Zhuang, Shengyao and Koopman, Bevan and Zuccon, Guido and Stein, Benno and Potthast, Martin and Hagen, Matthias},
publisher = {arXiv},
title = {A systematic investigation of distilling large language models into cross-encoders for passage re-ranking},
year = 2024
}