@inproceedings{3375c687557c4845b88cee20ccd5662c,
  title     = {Syntactic Chunking Across Different Corpora},
  author    = {Xu, Weiqun and Carletta, Jean and Moore, Johanna},
  editor    = {Renals, Steve and Bengio, Samy and Fiscus, {Jonathan G.}},
  booktitle = {Machine Learning for Multimodal Interaction},
  series    = {Lecture Notes in Computer Science},
  publisher = {Springer},
  pages     = {166--177},
  year      = {2006},
  doi       = {10.1007/11965152_15},
  isbn      = {978-3-540-69267-6},
  language  = {English},
  abstract  = {Syntactic chunking has been a well-defined and well-studied task since its introduction in 2000 as the CONLL shared task. Though some efforts have been further spent on chunking performance improvement, the experimental data has been restricted, with few exceptions, to (part of) the Wall Street Journal data, as adopted in the shared task. It remains open how those successful chunking technologies could be extended to other data, which may differ in genre/domain and/or amount of annotation. In this paper we first train chunkers with three classifiers on three different data sets and test on four data sets. We also vary the size of training data systematically to show data requirements for chunkers. It turns out that there is no significant difference between those state-of-the-art classifiers; training on plentiful data from the same corpus (switchboard) yields comparable results to Wall Street Journal chunkers even when the underlying material is spoken; the results from a large amount of unmatched training data can be obtained by using a very modest amount of matched training data.},
}