% Conference paper in the proceedings of the 2016 IEEE International
% Conference on Big Data (pages 3465--3473).
% Review notes:
%   - `language` holds a real language name; the exported value was an
%     unresolved placeholder ("???core.languages.en_GB???").
%   - No `series` field: the exported one repeated `booktitle` verbatim,
%     which makes styles that print `series` show the proceedings title twice.
@inproceedings{017167ec3c9f4221ad1a4eea158852f9,
title = "Probabilistic parallelisation of blocking non-matched records for big data",
abstract = "Blocking is a technique of filtering unlikely matched pairs for record matching, which aims to collect all pairs of records that relate to the same entities across different data sources. Blocking has been broadly adopted in data mining and database. However, for big data, there is no fast and effective blocking algorithm yet, because the number of candidate pairs is tremendous between large data sets. In this paper, we report on a probabilistic parallelisation of a recently proposed blocking that is a sequential algorithm for efficient record matching in single machines. Our approach runs blocking processes distributedly on partitioned input data. In order to reduce data exchange among those blocking processes, we adopt a probabilistic technique to assure that the processes can run independently and meanwhile the aggregated result is correct with respect to common metrics. Our experimental analysis endorses the advantage of our technique and shows its novel scalability on a Hadoop map-reduce system deployed physically in a cloud.",
author = "Chenxiao Dou and Daniel Sun and Chen, {Yi Cheng} and Guoqiang Li and Jianquan Liu",
note = "Publisher Copyright: {\textcopyright} 2016 IEEE.; 4th IEEE International Conference on Big Data, Big Data 2016 ; Conference date: 05-12-2016 Through 08-12-2016",
year = "2016",
doi = "10.1109/BigData.2016.7841009",
language = "english",
publisher = "Institute of Electrical and Electronics Engineers Inc.",
pages = "3465--3473",
editor = "Ronay Ak and George Karypis and Yinglong Xia and Hu, {Xiaohua Tony} and Yu, {Philip S.} and James Joshi and Lyle Ungar and Ling Liu and Aki-Hiro Sato and Toyotaro Suzumura and Sudarsan Rachuri and Rama Govindaraju and Weijia Xu",
booktitle = "Proceedings - 2016 IEEE International Conference on Big Data, Big Data 2016",
}