% Conference paper published in the TAAI 2016 proceedings.
% The acronyms in title/booktitle are brace-protected so that sentence-casing
% bibliography styles keep them upper-case.
% The year/month/day fields and the conference dates in the note differ
% (2017 vs. 2016); both are kept as given -- NOTE(review): confirm which date
% the target style should cite.
@inproceedings{e6b70ad0dd8c44f0af1a2f1669cbc5b5,
title = "{AFIS}: Aligning detail-pages for full schema induction",
abstract = "Web data extraction is an essential task for web data integration. Most researches focus on data extraction from list-pages by detecting data-rich section and record boundary segmentation. However, in detail-pages which contain all-inclusive product information in each page, so the number of data attributes need to be aligned is much larger. In this paper, we formulate data extraction problem as alignment of leaf nodes from DOM Trees. We propose AFIS, Annotation-Free Induction of Full Schema for detail pages in this paper. AFIS applies Divide-and-Conquer and Longest Increasing Sequence (LIS) algorithms to mine landmarks from input. The experiments show that AFIS outperforms RoadRunner, FivaTech and TEX (F1 0.990) in terms of selected data. For full schema evaluation (all data), AFIS also represents the highest average performance (F1 0.937) compared with TEX and RoadRunner.",
keywords = "detail-pages alignment, divide-conquer alignment, landmark equivalence class, semi-structured data, web data extraction",
author = "Yuliana, {Oviliani Yenty} and Chang, {Chia Hui}",
note = "Publisher Copyright: {\textcopyright} 2016 IEEE.; Conference date: 25-11-2016 Through 27-11-2016",
year = "2017",
month = mar,
day = "16",
doi = "10.1109/TAAI.2016.7880164",
language = "English",
publisher = "Institute of Electrical and Electronics Engineers Inc.",
pages = "220--227",
booktitle = "{TAAI} 2016 - 2016 Conference on Technologies and Applications of Artificial Intelligence, Proceedings",
}