The SciQA benchmark for scientific question answering aims to represent a challenging task for next-generation question-answering systems, one on which vanilla large language models fail. In this article, we provide an analysis of the performance of language models on this benchmark, including prompting and fine-tuning techniques to adapt them to the SciQA task. We show that both fine-tuning and prompting techniques with intelligent few-shot selection allow us to obtain excellent results on the SciQA benchmark. We discuss valuable lessons learned and common error categories, and outline their implications for how to optimise large language models for question answering over knowledge graphs.
@inproceedings{lehmann2024large,
abstract = {The SciQA benchmark for scientific question answering aims to represent a challenging task for next-generation question-answering systems on which vanilla large language models fail. In this article, we provide an analysis of the performance of language models on this benchmark including prompting and fine-tuning techniques to adapt them to the SciQA task. We show that both fine-tuning and prompting techniques with intelligent few-shot selection allow us to obtain excellent results on the SciQA benchmark. We discuss the valuable lessons and common error categories, and outline their implications on how to optimise large language models for question answering over knowledge graphs.},
address = {Cham},
author = {Lehmann, Jens and Meloni, Antonello and Motta, Enrico and Osborne, Francesco and Recupero, Diego Reforgiato and Salatino, Angelo Antonio and Vahdati, Sahar},
booktitle = {The Semantic Web -- ESWC 2024},
series = {Lecture Notes in Computer Science},
pages = {199--217},
publisher = {Springer},
title = {Large Language Models for Scientific Question Answering: An Extensive Analysis of the SciQA Benchmark},
url = {https://oro.open.ac.uk/98409/},
volume = 14664,
year = 2024
}