@comment{
  Conference paper from the WISE 2013 proceedings (Part 1), exported from a
  research-information system and cleaned by hand. The part designation is
  kept in the booktitle; it is neither an issue number nor an edition.
  TODO(review): the series volume number and the publisher are not recorded
  in this entry -- confirm them against the publisher record for the DOI and
  add the volume and publisher fields.
}
@inproceedings{885d8a11864245bf875846bbc0d0be23,
  author    = {Chang, {Chia Hui} and Lin, {Yen Ling} and Lin, {Kuan Chen} and Kayed, Mohammed},
  title     = {Page-level wrapper verification for unsupervised web data extraction},
  booktitle = {Web Information Systems Engineering, {WISE} 2013 -- 14th International Conference, Proceedings, Part 1},
  series    = {Lecture Notes in Computer Science},
  pages     = {454--467},
  year      = {2013},
  doi       = {10.1007/978-3-642-41230-1_38},
  isbn      = {9783642412295},
  language  = {English},
  keywords  = {Extractor, Unsupervised Information Extraction, Wrapper Induction, Wrapper Verification},
  note      = {14th International Conference on Web Information Systems Engineering, WISE 2013; Conference date: 13-10-2013 Through 15-10-2013},
  abstract  = {Unsupervised information extraction has been studied a lot in the past decade. However, not much attention has been paid to its wrapper maintenance. In this paper, we study wrapper construction and verification problem based on the given schema and template which is induced from unsupervised page-level wrapper induction system. We model the verification problem as a constraint satisfaction problem (CSP) for leaf node label assignment with respect to constraints specified by a finite state machine (FSM) which is constructed from previous learned schema and template. If there exists no solution to the CSP, i.e. no valid label sequence exists, we say the test page fails the verification; otherwise, we rank all valid label sequences by measuring the fitness of each label sequence for extraction. We evaluate the FSM based approach with XML validation via false positive rate and false negative rate and measure the extraction performance through extraction accuracy. The experimental result shows the proposed method can effectively filter invalid pages (zero false positive rate) and rank the correct label sequence with the highest score with 96.5\% accuracy.},
}