Schema/ontology matching consists in finding matches between types, properties and entities in heterogeneous sources of data in order to integrate them, which has become increasingly relevant with the development of web technologies and open data initiatives. One of the involved tasks is the matching of data properties, which attempts to try to find correspondences between the attributes of the entities. This is challenging due to the at times different names of equivalent properties. Furthermore, some properties may not be equivalent, but still match in 1..n relationships. These difficulties create the need for varied evaluation datasets for two reasons. First, they are needed to evaluate existing techniques in a variety of scenarios. Second, they enable the training of supervised techniques that may even become context-independent if trained with data from diverse enough contexts. To support the evaluation and training of data property matching techniques, we present a collection dataset consisting of product records from four different contexts. These datasets are the result of transforming two different existing datasets. In one of the datasets, some properties were filtered for being too noisy. The resulting processed dataset consists of json files with a listing of the product records and their properties, and a separate grouping of the properties that determines which ones match. It contains information about 2860 entities, with 4386 properties and 13350 pairwise matches.
%0 Journal Article
%1 Ayala2022-id
%A Ayala, Daniel
%A Hernández, Inma
%A Ruiz, David
%A Rahm, Erhard
%D 2022
%I Elsevier BV
%J Data in Brief
%K area_bigdata integration; engineering; Data Ontology; Property matching
%N 107884
%P 107884
%T Multi-source dataset of e-commerce products with attributes for property matching
%V 41
%X Schema/ontology matching consists in finding matches between types, properties and entities in heterogeneous sources of data in order to integrate them, which has become increasingly relevant with the development of web technologies and open data initiatives. One of the involved tasks is the matching of data properties, which attempts to try to find correspondences between the attributes of the entities. This is challenging due to the at times different names of equivalent properties. Furthermore, some properties may not be equivalent, but still match in 1..n relationships. These difficulties create the need for varied evaluation datasets for two reasons. First, they are needed to evaluate existing techniques in a variety of scenarios. Second, they enable the training of supervised techniques that may even become context-independent if trained with data from diverse enough contexts. To support the evaluation and training of data property matching techniques, we present a collection dataset consisting of product records from four different contexts. These datasets are the result of transforming two different existing datasets. In one of the datasets, some properties were filtered for being too noisy. The resulting processed dataset consists of json files with a listing of the product records and their properties, and a separate grouping of the properties that determines which ones match. It contains information about 2860 entities, with 4386 properties and 13350 pairwise matches.
@comment{Ayala2022-id: the journal identifies articles by article number
  only, so 107884 is stored in `pages`; there is no issue number, hence no
  `number` field.}
@article{Ayala2022-id,
  author     = {Ayala, Daniel and Hern{\'a}ndez, Inma and Ruiz, David and Rahm, Erhard},
  title      = {Multi-source dataset of e-commerce products with attributes for property matching},
  journal    = {Data in Brief},
  volume     = {41},
  pages      = {107884},
  year       = {2022},
  month      = apr,
  publisher  = {Elsevier BV},
  language   = {en},
  copyright  = {http://creativecommons.org/licenses/by/4.0/},
  keywords   = {area_bigdata integration; engineering; Data Ontology; Property matching},
  abstract   = {Schema/ontology matching consists in finding matches between types, properties and entities in heterogeneous sources of data in order to integrate them, which has become increasingly relevant with the development of web technologies and open data initiatives. One of the involved tasks is the matching of data properties, which attempts to try to find correspondences between the attributes of the entities. This is challenging due to the at times different names of equivalent properties. Furthermore, some properties may not be equivalent, but still match in 1..n relationships. These difficulties create the need for varied evaluation datasets for two reasons. First, they are needed to evaluate existing techniques in a variety of scenarios. Second, they enable the training of supervised techniques that may even become context-independent if trained with data from diverse enough contexts. To support the evaluation and training of data property matching techniques, we present a collection dataset consisting of product records from four different contexts. These datasets are the result of transforming two different existing datasets. In one of the datasets, some properties were filtered for being too noisy. The resulting processed dataset consists of json files with a listing of the product records and their properties, and a separate grouping of the properties that determines which ones match. It contains information about 2860 entities, with 4386 properties and 13350 pairwise matches.},
  biburl     = {https://puma.scadsai.uni-leipzig.de/bibtex/2966a18c0cfc5564f9841919ef698c134/scadsfct},
  interhash  = {b09c70792fa7fd35b9715fb94c7ba978},
  intrahash  = {966a18c0cfc5564f9841919ef698c134},
  added-at   = {2024-09-10T11:54:51.000+0200},
  timestamp  = {2024-11-22T15:44:23.000+0100},
}