% Conference paper "IEPAD" by Chang and Lui, from the WWW 2001 proceedings.
% The citation key is kept exactly as exported so existing citations keep resolving.
% The abstract is reproduced verbatim from the exported record.
% NOTE(review): month/day give 1 April 2001, while the conference date in the
% note field reads as 1-5 May 2001 -- confirm the publication date with the publisher record.
@inproceedings{d5059be2601e48ad845a6cba51d87f74,
title = "{IEPAD}: Information Extraction Based on Pattern Discovery",
abstract = "The research in information extraction (IE) regards the generation of wrappers that can extract particular information from semistructured Web documents. Similar to compiler generation, the extractor is actually a driver program, which is accompanied with the generated extraction rule. Previous work in this field aims to learn extraction rules from users' training example. In this paper, we propose IEPAD, a system that automatically discovers extraction rules from Web pages. The system can automatically identify record boundary by repeated pattern mining and multiple sequence alignment. The discovery of repeated patterns are realized through a data structure call PAT trees. Additionally, repeated patterns are further extended by pattern alignment to comprehend all record instances. This new track to IE involves no human effort and content-dependent heuristics. Experimental results show that the constructed extraction rules can achieve 97 percent extraction over fourteen popular search engines.",
keywords = "Extraction rule, Information extraction, Multiple string alignment, PAT tree",
author = "Chang, {Chia Hui} and Lui, {Shao Chen}",
note = "Publisher Copyright: {\textcopyright} 2001 ACM.; 10th International Conference on World Wide Web, WWW 2001; Conference date: 01-05-2001 Through 05-05-2001",
year = "2001",
month = apr,
day = "1",
doi = "10.1145/371920.372182",
language = "english",
isbn = "1581133480",
publisher = "Association for Computing Machinery, Inc",
pages = "681--688",
booktitle = "Proceedings of the 10th International Conference on World Wide Web, {WWW} 2001",
}