Entity resolution is the data integration task of identifying matching entities (e.g. products, customers) in one or several data sources. Previous approaches for matching and clustering entities between multiple (>2) sources either treated the different sources as a single source or assumed that the individual sources are duplicate-free, so that only matches between sources have to be found. In this work we propose and evaluate a general Multi-Source Clean Dirty (MSCD) scheme with an arbitrary combination of clean (duplicate-free) and dirty sources. For this purpose, we extend a constraint-based clustering algorithm called Affinity Propagation (AP) for entity clustering with clean and dirty sources (MSCD-AP). We also consider a hierarchical version of it for improved scalability. Our evaluation considers a full range of datasets containing 0\% to 100\% of clean sources. We compare our proposed algorithms with other clustering schemes in terms of both match quality and runtime.
%0 Conference Paper
%1 Lerm2021-tr
%A Lerm, Stefan
%A Saeedi, Alieh
%A Rahm, Erhard
%D 2021
%I Gesellschaft für Informatik, Bonn
%K
%T Extended Affinity Propagation clustering for Multi-source entity resolution
%X Entity resolution is the data integration task of identifying matching entities (e.g. products, customers) in one or several data sources. Previous approaches for matching and clustering entities between multiple (>2) sources either treated the different sources as a single source or assumed that the individual sources are duplicate-free, so that only matches between sources have to be found. In this work we propose and evaluate a general Multi-Source Clean Dirty (MSCD) scheme with an arbitrary combination of clean (duplicate-free) and dirty sources. For this purpose, we extend a constraint-based clustering algorithm called Affinity Propagation (AP) for entity clustering with clean and dirty sources (MSCD-AP). We also consider a hierarchical version of it for improved scalability. Our evaluation considers a full range of datasets containing 0\% to 100\% of clean sources. We compare our proposed algorithms with other clustering schemes in terms of both match quality and runtime.
@inproceedings{Lerm2021-tr,
  abstract      = {Entity resolution is the data integration task of identifying matching entities (e.g. products, customers) in one or several data sources. Previous approaches for matching and clustering entities between multiple ($>$2) sources either treated the different sources as a single source or assumed that the individual sources are duplicate-free, so that only matches between sources have to be found. In this work we propose and evaluate a general Multi-Source Clean Dirty (MSCD) scheme with an arbitrary combination of clean (duplicate-free) and dirty sources. For this purpose, we extend a constraint-based clustering algorithm called Affinity Propagation (AP) for entity clustering with clean and dirty sources (MSCD-AP). We also consider a hierarchical version of it for improved scalability. Our evaluation considers a full range of datasets containing 0\% to 100\% of clean sources. We compare our proposed algorithms with other clustering schemes in terms of both match quality and runtime.},
  added-at      = {2024-09-10T11:56:37.000+0200},
  author        = {Lerm, Stefan and Saeedi, Alieh and Rahm, Erhard},
  biburl        = {https://puma.scadsai.uni-leipzig.de/bibtex/2aca2d0304d8a1a8c6fa193149b5d6cc1/scadsfct},
  interhash     = {628fef9c1401aca725c8b4c467a161ef},
  internal-note = {NOTE(review): inproceedings entries require a booktitle field, which is missing here. Presumably this paper appeared in the BTW 2021 proceedings published by GI -- TODO confirm against the publisher record, then add booktitle (and pages/doi if available).},
  intrahash     = {aca2d0304d8a1a8c6fa193149b5d6cc1},
  keywords      = {},
  publisher     = {Gesellschaft f{\"u}r Informatik, Bonn},
  timestamp     = {2024-09-10T15:15:57.000+0200},
  title         = {Extended {Affinity} {Propagation} clustering for Multi-source entity resolution},
  year          = {2021}
}