Description
Entity resolution is a crucial step for data quality and data integration. Learning-based approaches show high effectiveness at the expense of poor efficiency. To reduce the typically high execution times, we investigate how learning-based entity resolution can be realized in a cloud infrastructure using MapReduce. We propose and evaluate two efficient MapReduce-based strategies for pair-wise similarity computation and classifier application on the Cartesian product of two input sources. Our evaluation is based on real-world datasets and shows the high efficiency and effectiveness of the proposed approaches.
Keywords
- MapReduce, Hadoop
- Entity Resolution, Object matching, Similarity Join, Pair-wise comparison
- Cartesian product
- Machine Learning, Classification
BibTeX
@inproceedings{Kolb:2011:LER:2064085.2064087,
  author    = {Kolb, Lars and K{\"o}pcke, Hanna and Thor, Andreas and Rahm, Erhard},
  title     = {Learning-based Entity Resolution with {MapReduce}},
  booktitle = {Proceedings of the Third International Workshop on Cloud Data Management},
  series    = {CloudDB '11},
  year      = {2011},
  isbn      = {978-1-4503-0956-1},
  location  = {Glasgow, Scotland, UK},
  pages     = {1--6},
  numpages  = {6},
  url       = {http://doi.acm.org/10.1145/2064085.2064087},
  doi       = {10.1145/2064085.2064087},
  acmid     = {2064087},
  publisher = {ACM},
  address   = {New York, NY, USA},
  keywords  = {Cartesian product, Entity Resolution, Machine Learning, MapReduce},
}