% Conference paper in HistoInformatics 2019 (CEUR Workshop Proceedings, vol. 2461).
% The citation key is an export-generated identifier and is kept so existing citations resolve.
@inproceedings{4f0eab00d19c4fc4b46dec7d05ba872b,
  author        = {Casey, Arlene and Bennett, Mike and Tobin, Richard and Grover, Claire and Engelmann, Lukas and Alex, Beatrice},
  title         = {{Plague Dot Text}: Text mining and annotation of outbreak reports of the {Third Plague Pandemic} (1894--1952)},
  booktitle     = {HistoInformatics 2019},
  editor        = {Wevers, Melvin and Hasanuzzaman, Mohammed and Dias, Ga{\"e}l and D{\"u}ring, Marten and Jatowt, Adam},
  series        = {CEUR Workshop Proceedings},
  volume        = {2461},
  publisher     = {CEUR-WS.org},
  pages         = {50--59},
  year          = {2019},
  month         = sep,
  day           = {12},
  language      = {English},
  abstract      = {The design of models that govern diseases in their relation to population is built on information and data gathered from past outbreaks. However, epidemic outbreaks are never captured in statistical data alone but are communicated by narratives, built on empirical observations. Outbreak reports discuss correlations between populations, locations and the disease to infer insights into causes, vectors and potential interventions. The problem with these narratives is usually the lack of consistent structure that allows for exploration of their collection as a whole. Our interdisciplinary research investigates more than 100 reports from the third plague pandemic (1894-1952) evaluating ways of building a corpus to extract and structure information through text mining and manual annotation. In this paper we discuss the progress of our exploratory project, how we enhance optical character recognition (OCR) methods to improve text capture, our approach to structure the narratives and identify relevant entities in the reports. The structured corpus is made available via Solr enabling search and analysis across the whole collection for future research dedicated e.g. to the identification of concepts. The corpus will enable researchers to analyse the reports collectively and allows for deep insights into the global epidemiological consideration of plague in the early 20th century.},
  internal-note = {Export note, not printed; superseded by the volume and pages fields: To be published in 12 Sep 2019},
}