% Conference paper in the LREC 2004 proceedings (pages 1855--1858).
% The citation key is kept as-is so that existing \cite commands keep resolving.
@inproceedings{b561e3e67550474c977ffb8da54b27de,
  author    = {Wible, David and Kuo, {Chin Hwa} and Tsao, {Nai Lung}},
  title     = {Improving collocation extraction for high frequency words},
  booktitle = {Proceedings of the 4th International Conference on Language Resources and Evaluation, {LREC} 2004},
  editor    = {Xavier, {Maria Francisca} and Costa, Rute and Ferreira, Fatima and Lino, {Maria Teresa} and Silva, Raquel},
  pages     = {1855--1858},
  publisher = {European Language Resources Association (ELRA)},
  year      = {2004},
  language  = {english},
  note      = {4th International Conference on Language Resources and Evaluation, LREC 2004; Conference date: 26-05-2004 Through 28-05-2004},
  abstract  = {The purpose of this paper is to introduce an alternative word association measure aimed at addressing the under-extraction collocations that contain high frequency words. While measures such as MI provide the important contribution of filtering out sheer high frequency of words in the detection of collocations in large corpora, one side effect of this filtering is that it becomes correspondingly difficult for such measures to detect true collocations involving high frequency words. As an alternative, we propose normalizing the MI measure by dividing the frequency of a candidate lexeme by the number of senses of that lexeme. We premise this alternative approach on the one sense per collocation assumption of Yarowsky (1992; 1995). Ten verb-noun collocations involving three high frequency verbs (make, take, run) are used to compare the extraction results of traditional MI and the proposed normalized MI. Results show the ranking of these high-frequency verbs as candidate collocates with the target focal nouns is raised by normalizing MI as proposed. Side effects of these improved rankings are discussed, such as increase in false positives resulting from higher recall. It is found that overall rank precision remains quite stable even with the increased recall of normalized MI.},
}