<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1d3 20150301//EN" "http://jats.nlm.nih.gov/publishing/1.1d3/JATS-journalpublishing1.dtd">
<article article-type="research-article" dtd-version="1.1d3" xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="nlm-ta">PLoS ONE</journal-id>
<journal-id journal-id-type="publisher-id">plos</journal-id>
<journal-id journal-id-type="pmc">plosone</journal-id>
<journal-title-group>
<journal-title>PLOS ONE</journal-title>
</journal-title-group>
<issn pub-type="epub">1932-6203</issn>
<publisher>
<publisher-name>Public Library of Science</publisher-name>
<publisher-loc>San Francisco, CA USA</publisher-loc>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.1371/journal.pone.0157734</article-id>
<article-id pub-id-type="publisher-id">PONE-D-15-45457</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Research Article</subject>
</subj-group>
<subj-group subj-group-type="Discipline-v3"><subject>Social sciences</subject><subj-group><subject>Sociology</subject><subj-group><subject>Communications</subject><subj-group><subject>Social communication</subject><subj-group><subject>Social media</subject><subj-group><subject>Twitter</subject></subj-group></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Computer and information sciences</subject><subj-group><subject>Network analysis</subject><subj-group><subject>Social networks</subject><subj-group><subject>Social media</subject><subj-group><subject>Twitter</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Social sciences</subject><subj-group><subject>Sociology</subject><subj-group><subject>Social networks</subject><subj-group><subject>Social media</subject><subj-group><subject>Twitter</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Medicine and health sciences</subject><subj-group><subject>Infectious diseases</subject><subj-group><subject>Viral diseases</subject><subj-group><subject>Influenza</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Computer and information sciences</subject><subj-group><subject>Geoinformatics</subject><subj-group><subject>Geographic information systems</subject></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Earth sciences</subject><subj-group><subject>Geography</subject><subj-group><subject>Geoinformatics</subject><subj-group><subject>Geographic information systems</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Biology and life sciences</subject><subj-group><subject>Neuroscience</subject><subj-group><subject>Cognitive 
science</subject><subj-group><subject>Artificial intelligence</subject><subj-group><subject>Machine learning</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Computer and information sciences</subject><subj-group><subject>Artificial intelligence</subject><subj-group><subject>Machine learning</subject></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Physical sciences</subject><subj-group><subject>Mathematics</subject><subj-group><subject>Applied mathematics</subject><subj-group><subject>Algorithms</subject><subj-group><subject>Machine learning algorithms</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Research and analysis methods</subject><subj-group><subject>Simulation and modeling</subject><subj-group><subject>Algorithms</subject><subj-group><subject>Machine learning algorithms</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Biology and life sciences</subject><subj-group><subject>Neuroscience</subject><subj-group><subject>Cognitive science</subject><subj-group><subject>Artificial intelligence</subject><subj-group><subject>Machine learning</subject><subj-group><subject>Machine learning algorithms</subject></subj-group></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Computer and information sciences</subject><subj-group><subject>Artificial intelligence</subject><subj-group><subject>Machine learning</subject><subj-group><subject>Machine learning algorithms</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Social sciences</subject><subj-group><subject>Sociology</subject><subj-group><subject>Communications</subject><subj-group><subject>Social 
communication</subject><subj-group><subject>Social media</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Computer and information sciences</subject><subj-group><subject>Network analysis</subject><subj-group><subject>Social networks</subject><subj-group><subject>Social media</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Social sciences</subject><subj-group><subject>Sociology</subject><subj-group><subject>Social networks</subject><subj-group><subject>Social media</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Biology and life sciences</subject><subj-group><subject>Neuroscience</subject><subj-group><subject>Cognitive science</subject><subj-group><subject>Artificial intelligence</subject><subj-group><subject>Machine learning</subject><subj-group><subject>Support vector machines</subject></subj-group></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Computer and information sciences</subject><subj-group><subject>Artificial intelligence</subject><subj-group><subject>Machine learning</subject><subj-group><subject>Support vector machines</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Medicine and health sciences</subject><subj-group><subject>Public and occupational health</subject></subj-group></subj-group></article-categories>
<title-group>
<article-title>Applying GIS and Machine Learning Methods to Twitter Data for Multiscale Surveillance of Influenza</article-title>
<alt-title alt-title-type="running-head">Applying GIS and Machine Learning Methods to Twitter Data for Multiscale Surveillance of Influenza</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" equal-contrib="yes" xlink:type="simple">
<contrib-id contrib-id-type="orcid">http://orcid.org/0000-0002-3336-2601</contrib-id>
<name name-style="western">
<surname>Allen</surname>
<given-names>Chris</given-names>
</name>
<xref ref-type="aff" rid="aff001"><sup>1</sup></xref>
</contrib>
<contrib contrib-type="author" corresp="yes" equal-contrib="yes" xlink:type="simple">
<name name-style="western">
<surname>Tsou</surname>
<given-names>Ming-Hsiang</given-names>
</name>
<xref ref-type="aff" rid="aff001"><sup>1</sup></xref>
<xref ref-type="corresp" rid="cor001">*</xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<name name-style="western">
<surname>Aslam</surname>
<given-names>Anoshe</given-names>
</name>
<xref ref-type="aff" rid="aff002"><sup>2</sup></xref>
<xref ref-type="fn" rid="econtrib001"><sup>‡</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<name name-style="western">
<surname>Nagel</surname>
<given-names>Anna</given-names>
</name>
<xref ref-type="aff" rid="aff002"><sup>2</sup></xref>
<xref ref-type="fn" rid="econtrib001"><sup>‡</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<name name-style="western">
<surname>Gawron</surname>
<given-names>Jean-Mark</given-names>
</name>
<xref ref-type="aff" rid="aff003"><sup>3</sup></xref>
<xref ref-type="fn" rid="econtrib001"><sup>‡</sup></xref>
</contrib>
</contrib-group>
<aff id="aff001"><label>1</label> <addr-line>Department of Geography, San Diego State University, San Diego, California, United States of America</addr-line></aff>
<aff id="aff002"><label>2</label> <addr-line>Graduate School of Public Health, San Diego State University, San Diego, California, United States of America</addr-line></aff>
<aff id="aff003"><label>3</label> <addr-line>Department of Linguistics, San Diego State University, San Diego, California, United States of America</addr-line></aff>
<contrib-group>
<contrib contrib-type="editor" xlink:type="simple">
<name name-style="western">
<surname>Ebrahimi</surname>
<given-names>Mansour</given-names>
</name>
<role>Editor</role>
<xref ref-type="aff" rid="edit1"/>
</contrib>
</contrib-group>
<aff id="edit1"><addr-line>Qom University, ISLAMIC REPUBLIC OF IRAN</addr-line></aff>
<author-notes>
<fn fn-type="conflict" id="coi001">
<p>The authors have declared that no competing interests exist.</p>
</fn>
<fn fn-type="con" id="contrib001">
<p>Conceived and designed the experiments: CA AA AN. Analyzed the data: CA AA AN. Contributed reagents/materials/analysis tools: CA JMG MT. Wrote the paper: CA JMG MT.</p>
</fn>
<fn fn-type="other" id="econtrib001">
<p>‡ These authors also contributed equally to this work.</p>
</fn>
<corresp id="cor001">* E-mail: <email xlink:type="simple">mtsou@mail.sdsu.edu</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>25</day>
<month>7</month>
<year>2016</year>
</pub-date>
<pub-date pub-type="collection">
<year>2016</year>
</pub-date>
<volume>11</volume>
<issue>7</issue>
<elocation-id>e0157734</elocation-id>
<history>
<date date-type="received">
<day>22</day>
<month>10</month>
<year>2015</year>
</date>
<date date-type="accepted">
<day>4</day>
<month>6</month>
<year>2016</year>
</date>
</history>
<permissions>
<copyright-year>2016</copyright-year>
<copyright-holder>Allen et al</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">
<license-p>This is an open access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">Creative Commons Attribution License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.</license-p>
</license>
</permissions>
<self-uri content-type="pdf" xlink:href="info:doi/10.1371/journal.pone.0157734"/>
<abstract>
<p>Traditional methods for monitoring influenza are haphazard and lack fine-grained details regarding the spatial and temporal dynamics of outbreaks. Twitter gives researchers and public health officials an opportunity to examine the spread of influenza in real-time and at multiple geographical scales. In this paper, we introduce an improved framework for monitoring influenza outbreaks using the social media platform Twitter. Relying upon techniques from geographic information science (GIS) and data mining, Twitter messages were collected, filtered, and analyzed for the thirty most populated cities in the United States during the 2013–2014 flu season. The results of this procedure are compared with national, regional, and local flu outbreak reports, revealing a statistically significant correlation between the two data sources. The main contribution of this paper is to introduce a comprehensive data mining process that enhances previous attempts to accurately identify tweets related to influenza. Additionally, geographical information systems allow us to target, filter, and normalize Twitter messages.</p>
</abstract>
<funding-group>
<award-group id="award001">
<funding-source>
<institution-wrap>
<institution-id institution-id-type="funder-id">http://dx.doi.org/10.13039/100005716</institution-id>
<institution>National Science Board</institution>
</institution-wrap>
</funding-source>
<award-id>1028177</award-id>
<principal-award-recipient>
<name name-style="western">
<surname>Tsou</surname>
<given-names>Ming-Hsiang</given-names>
</name>
</principal-award-recipient>
</award-group>
<funding-statement>This research was conducted under a National Science Foundation Cyber-enabled Discovery and Innovation (CDI) grant (award #1028177). This research was also conducted under National Science Foundation grant #1416509. The funders had no role in study design, data collection and analysis, decision to publish, or preparation of the manuscript.</funding-statement>
</funding-group>
<counts>
<fig-count count="3"/>
<table-count count="2"/>
<page-count count="10"/>
</counts>
<custom-meta-group>
<custom-meta id="data-availability">
<meta-name>Data Availability</meta-name>
<meta-value>Individual Twitter messages are protected by the Twitter Terms of Use and therefore cannot be redistributed. However, we are able to provide aggregated tweet counts at various geographic scales. We are uploading the data on tweet rates and flu rates as Supporting Information. Webmap is available online at <ext-link ext-link-type="uri" xlink:href="http://vision.sdsu.edu/hdma/smart/" xlink:type="simple">http://vision.sdsu.edu/hdma/smart/</ext-link>.</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="sec001" sec-type="intro">
<title>Introduction</title>
<p>Public health scholars have long studied the ways in which disease outbreaks can be monitored. Accurate information about the intensity and geographic distribution of illnesses can allow officials to allocate resources effectively and respond appropriately in order to combat these health threats. However, surveillance of influenza has traditionally been problematic due to uneven and haphazard reporting, as well as the time and cost overhead associated with gathering this information.</p>
<p>This paper introduces a geo-targeted data mining approach to influenza surveillance that relies upon data from the social media platform Twitter. Messages from Twitter are gathered and analyzed using geographic information system (GIS) methods such as spatial filtering, population normalization, and multi-scale analysis. Additionally, this work also turns to machine learning techniques to improve the filtering process in order to better distinguish tweets that appear to describe real-world cases of influenza from those that do not. For this purpose, a support vector machine (SVM) classifier was trained using manually tagged data collected from the 2012–2013 flu season. Comparing our results for the 2013–2014 flu season with official reports of influenza-like illness (ILI) on national, regional, and local levels demonstrates that the two signals are closely correlated, suggesting that Twitter holds great potential for monitoring influenza outbreaks.</p>
</sec>
<sec id="sec002">
<title>Background and Literature Review</title>
<p>This study builds on the flurry of recent research activity seeking to understand the dynamics of disease outbreak by analyzing Internet and social media data. Though remarkable progress has been made in the past few years, there remain significant obstacles facing researchers wanting to use these new sources of data to inform public health decisions.</p>
<p>Many researchers have turned to Google Flu Trends (GFT) for user-generated data on influenza. This site publishes information on flu-related web searches that can be used to monitor changes in Internet activity over time and space. Dukic et al. [<xref ref-type="bibr" rid="pone.0157734.ref001">1</xref>] modified the classical particle learning epidemiology model using GFT data to develop a method for monitoring the spread of flu. More recently, Dugas et al. [<xref ref-type="bibr" rid="pone.0157734.ref002">2</xref>] have relied upon historical outbreak data along with patterns in Flu Trends to create a forecast model that allows the team to predict flu cases one week in advance with reasonable precision. In the past few years, however, many have grown skeptical of the accuracy of Google Flu Trends. Olson et al. [<xref ref-type="bibr" rid="pone.0157734.ref003">3</xref>], for example, argue that GFT predictions are not accurately tuned to yearly variations in flu patterns. Similarly, Declan Butler has commented in <italic>Nature</italic> [<xref ref-type="bibr" rid="pone.0157734.ref004">4</xref>] that Google’s significant miscalculation in predicting the intensity of flu outbreaks for the 2013 season casts serious doubt on the reliability of Flu Trends data to monitor illness.</p>
<p>In the past few years, the micro-blogging service Twitter has provided researchers with a rich new source of data, allowing for detailed examinations of complex diffusion processes, human behaviors, and collective attitudes around the world [<xref ref-type="bibr" rid="pone.0157734.ref005">5</xref>–<xref ref-type="bibr" rid="pone.0157734.ref007">7</xref>]. Scholars from divergent fields have similarly turned to Twitter to collect, filter, and analyze messages (called tweets) in order to gain new insights into the spread of flu. Nagel et al. [<xref ref-type="bibr" rid="pone.0157734.ref008">8</xref>], for instance, have developed a methodology for collecting and filtering tweets that demonstrates a high correlation with local and national reports of ILI cases. Other researchers have delved deeper into text analysis to more accurately remove tweets that are unrelated to actual cases of influenza. Notably, Lamb et al. [<xref ref-type="bibr" rid="pone.0157734.ref009">9</xref>] used manually-defined word feature classes to filter out tweets that did not appear to reflect personal infection.</p>
<p>Though these efforts are valuable contributions to the fields of public health and Big Data analysis, this paper suggests that a greater reliance on geographic information system (GIS) and machine learning methods can shed new light on the role these exciting new data sources (particularly Twitter) can play in studying disease outbreak.</p>
</sec>
<sec id="sec003">
<title>Data Collection</title>
<p>The data collection procedure for this research was based upon the Visualizing Information Space in Ontological Networks (VISION) framework developed by Tsou et al. [<xref ref-type="bibr" rid="pone.0157734.ref010">10</xref>–<xref ref-type="bibr" rid="pone.0157734.ref011">11</xref>] to examine the interrelationships between cyberspace message, space, and time. Unlike most previous applications of the VISION framework, this study focuses exclusively on data collected from Twitter due to the real-time and dynamic nature of this platform.</p>
<p>In the past few years Twitter has emerged as one of the leading social media platforms, boasting more than 140 million active users. Each day, Twitter users produce millions of tweets (messages of 140 characters or less), which can be collected through official Twitter APIs. To assist with our data collection process, our research group has developed search tools which query the Twitter Search API based on spatial constraints and keyword filters. To allow researchers within our group to visually explore this data, a web map was developed that displays the search locations related to each keyword, as well as the intensity for that keyword. Using this interface, it is also possible to download the Twitter data in Excel form, which allows for a more detailed analysis of the various attributes associated with each tweet, such as the time it was published, the user’s location, the GPS coordinates of the tweet, and any URLs or hashtags contained within the message (see <xref ref-type="supplementary-material" rid="pone.0157734.s003">S3 File</xref>).</p>
<p>This study builds on the work of Nagel et al. [<xref ref-type="bibr" rid="pone.0157734.ref008">8</xref>] by using keywords that have been identified as effective indicators of influenza outbreak. The Twitter search tool collected tweets by the keywords “flu” and “influenza.” Tweets were collected from 31 major cities in the United States: Atlanta, GA; Austin, TX; Baltimore, MD; Boston, MA; Chicago, IL; Cleveland, OH; Columbus, OH; Dallas, TX; Denver, CO; Detroit, MI; El Paso, TX; Fort Worth, TX; Houston, TX; Indianapolis, IN; Jacksonville, FL; Los Angeles, CA; Memphis, TN; Milwaukee, WI; Nashville, TN; New York, NY; Oklahoma City, OK; Philadelphia, PA; Phoenix, AZ; Portland, OR; San Diego, CA; Seattle, WA; and Washington D.C. Unlike Nagel et al., this research uses variable search radiuses for each of these cities, which were determined by the researchers.</p>
</sec>
<sec id="sec004" sec-type="materials|methods">
<title>Methods</title>
<p>For this study, geographic information system (GIS) methods were relied upon for data collection and normalization. To filter out noise from the dataset, a machine learning procedure was adopted that allowed our group to better distinguish tweets that appeared to indicate a real-world instance of influenza from those that did not.</p>
<p>In order to target specific locations for data collection, this project took advantage of the spatial filtering methods provided by the Twitter Search Application Programming Interface (API) (see <xref ref-type="supplementary-material" rid="pone.0157734.s003">S3 File</xref>). Most research focusing on Twitter data relies upon the Streaming API, which allows users to retrieve GPS-tagged tweets within a specified bounding box. However, the main disadvantage of this method is that the Streaming API only gives access to a one percent sampling of all tweets. By polling the Search API continually, we can access a much larger dataset for specific geographic areas, which allows for detailed analysis of the data at municipal, regional, and national scales.</p>
<p>Normalization of population is essential to understanding quantitative geographic data, and for this study a novel approach was taken to carry out this task. The Twitter Search API requires that geo-targeted searches specify a latitude/longitude pair as well as a radius, which can essentially be thought of as a point buffer. Each of these city point buffers was joined with census tract centroids to determine which tracts should be included in our population calculations. Using the fine-grained census data allows us to gain a more accurate estimation of population, which greatly improves our ability to accurately normalize tweet counts for individual cities.</p>
<p>With respect to noise filtering, Nagel et al. [<xref ref-type="bibr" rid="pone.0157734.ref008">8</xref>] have shown that excluding retweets and tweets containing URL links produces a much higher correlation coefficient with ground-truth data, and we have borrowed that strategy in this study. However, to further filter our data, we have developed a machine learning classification procedure. The goal of this procedure was to identify tweets that do not appear to indicate real-world cases of influenza so that they can be omitted from the statistical analysis. The following are example tweets containing the keyword “flu” and the determination of their validity that we hoped to accomplish with the classification task:</p>
<list list-type="bullet">
<list-item><p><italic>Flu medicine kicked in</italic>… <italic>Time for bed</italic> → Valid</p></list-item>
<list-item><p><italic>I gotta get over this flu</italic>!! → Valid</p></list-item>
<list-item><p><italic>Who gets the stomach flu the day before class starts</italic>? → Invalid</p></list-item>
<list-item><p><italic>I'm getting the flu shot today</italic>. <italic>#scared</italic> → Invalid</p></list-item>
<list-item><p><italic>Arm is killing from flu jab</italic> → Invalid</p></list-item>
<list-item><p><italic>this flu feels like death</italic> → Valid</p></list-item>
</list>
<p>As these examples show, in many cases, one can succeed at classifying tweets by recognizing one positive indicator (e.g., “get over”, “medicine”) or one negative indicator (e.g., “shot”, “jab”). This property strongly suggested to our team that using a linear learning algorithm would be an effective strategy.</p>
<p>In machine learning terms, the problem of assigning the labels “no” or “yes” to a tweet is a simple binary classification task. The features relevant to classification are words (such as “medicine” or “bed”) and n-grams (such as “stomach flu”, “flu shot”). The learning algorithm is given a training set of tweets represented as features with numerical values and labeled with their classes. With a linear classifier, the output of the learning algorithm is simply a set of feature weights; unseen examples can be classified using a linear combination of the weighted features. The primary advantage of using a linear classifier is that learning is cheap and scalable. Additional positive and negative indicators can be learned with more labeled data. Furthermore, the problem of what to do when indicators conflict is solved automatically by the feature weights. For instance, consider a tweet that contains both the word “stomach” and the word “medicine.” The learner classifies such tweets based on the weighting that was optimal in correctly classifying the training data. The particular kind of linear learner used here is called a support vector machine (SVM) [<xref ref-type="bibr" rid="pone.0157734.ref012">12</xref>].</p>
<p>In this context, an SVM was used to classify tweets that appeared to be indicators of real-world cases of influenza and those messages that appeared to be irrelevant to actual illness (see above examples). To train the SVM classifier, 1,500 randomly sampled tweets from the 2012–2013 season containing the keyword “flu” were used. Each of these tweets was manually inspected and classified as valid or invalid according to the likelihood that the message indicated an actual case of influenza. In order to assign numerical values to unigram, bi-gram, and tri-gram features, we use the term frequency—inverse document frequency (TF-IDF) scores for each word or word-pair. TF-IDF is a measure of the statistical significance of each term in a message that also accounts for the prevalence of that term in the overall data set. Essentially, this score assigns a higher weight to words that are more important to a specific message in comparison to the entire set of messages [<xref ref-type="bibr" rid="pone.0157734.ref013">13</xref>]. It should be noted that using statistical transformations such as TF-IDF allows the researcher to avoid the task of manually identifying specific keywords indicating influenza; rather, the classifier is able to automatically recognize important keywords based on patterns in the manually classified training set. Consequently, applying this procedure to other illnesses (for example, Ebola) would simply require a new set of manually classified tweets with which to train the classifier.</p>
<p>To evaluate the classification model, we used standard machine learning measures of quality: <italic>recall</italic>, <italic>precision</italic> and the resulting <italic>F1 score</italic>. Using 1,000 randomly sampled tweets as a test set, we determined that the classifier has a precision score of 0.671 and a recall score of 0.949, resulting in an F1 score of 0.786. This essentially means that the model was able to classify most valid tweets correctly (indicated by the high recall score), but as the precision score indicates, it occasionally incorrectly categorizes invalid tweets as valid.</p>
</sec>
<sec id="sec005" sec-type="results">
<title>Results</title>
<p>The results demonstrate that this new procedure provides significant advantages over previous studies when comparing tweet rates to local, regional, and national ILI reports. These outcomes are summarized in <xref ref-type="table" rid="pone.0157734.t001">Table 1</xref>, which shows the Pearson coefficient between each city’s tweet rate and both the regional and national ILI, as well as the coefficients between the tweet rate and local ILI, for cities where this data was available. Note that local ILI data is displayed from both emergency providers and sentinel providers. Although Aslam et al. [<xref ref-type="bibr" rid="pone.0157734.ref014">14</xref>] have shown that emergency reports produce a stronger signal, data from these sources are difficult to obtain, so sentinel reports are also included.</p>
<table-wrap id="pone.0157734.t001" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0157734.t001</object-id>
<label>Table 1</label> <caption><title>Summary of correlations between tweet rates and ILI rates (local, regional, and national).</title></caption>
<alternatives>
<graphic id="pone.0157734.t001g" mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0157734.t001" xlink:type="simple"/>
<table>
<colgroup>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
</colgroup>
<thead>
<tr>
<th align="left">CITY</th>
<th align="center">CORR. WITH LOCAL EMERGENCY ILI</th>
<th align="center">CORR. WITH LOCAL SENTINEL ILI</th>
<th align="center">CORR. WITH REGIONAL ILI</th>
<th align="center">CORR. WITH NATIONAL ILI</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left">Atlanta</td>
<td align="center">NA</td>
<td align="center">NA</td>
<td align="char" char=".">0.657</td>
<td align="char" char=".">0.679</td>
</tr>
<tr>
<td align="left">Austin</td>
<td align="center">NA</td>
<td align="center">NA</td>
<td align="char" char=".">0.919</td>
<td align="char" char=".">0.830</td>
</tr>
<tr>
<td align="left">Baltimore</td>
<td align="center">NA</td>
<td align="center">NA</td>
<td align="char" char=".">0.031</td>
<td align="char" char=".">-0.116</td>
</tr>
<tr>
<td align="left">Boston</td>
<td align="center">0.804<xref ref-type="table-fn" rid="t001fn001">*</xref></td>
<td align="center">0.105</td>
<td align="char" char=".">0.395</td>
<td align="char" char=".">0.433</td>
</tr>
<tr>
<td align="left">Chicago</td>
<td align="center">0.804<xref ref-type="table-fn" rid="t001fn001">*</xref></td>
<td align="center">0.636</td>
<td align="char" char=".">0.771</td>
<td align="char" char=".">0.782</td>
</tr>
<tr>
<td align="left">Cleveland</td>
<td align="center">0.784<xref ref-type="table-fn" rid="t001fn001">*</xref></td>
<td align="center">0.605</td>
<td align="char" char=".">0.819</td>
<td align="char" char=".">0.822</td>
</tr>
<tr>
<td align="left">Columbus</td>
<td align="center">0.877<xref ref-type="table-fn" rid="t001fn001">*</xref></td>
<td align="center">-0.235</td>
<td align="char" char=".">0.771</td>
<td align="char" char=".">0.776</td>
</tr>
<tr>
<td align="left">Dallas</td>
<td align="center">NA</td>
<td align="center">NA</td>
<td align="char" char=".">0.702</td>
<td align="char" char=".">0.797</td>
</tr>
<tr>
<td align="left">Denver</td>
<td align="center">NA</td>
<td align="center">0.690</td>
<td align="char" char=".">0.599</td>
<td align="char" char=".">0.589</td>
</tr>
<tr>
<td align="left">Detroit</td>
<td align="center">NA</td>
<td align="center">0.757</td>
<td align="char" char=".">0.846</td>
<td align="char" char=".">0.878</td>
</tr>
<tr>
<td align="left">El Paso</td>
<td align="center">NA</td>
<td align="center">NA</td>
<td align="char" char=".">0.422</td>
<td align="char" char=".">0.563</td>
</tr>
<tr>
<td align="left">Fort Worth</td>
<td align="center">NA</td>
<td align="center">0.855</td>
<td align="char" char=".">0.659</td>
<td align="char" char=".">0.734</td>
</tr>
<tr>
<td align="left">Houston</td>
<td align="center">NA</td>
<td align="center">NA</td>
<td align="char" char=".">0.845</td>
<td align="char" char=".">0.663</td>
</tr>
<tr>
<td align="left">Indianapolis</td>
<td align="center">NA</td>
<td align="center">NA</td>
<td align="char" char=".">0.750</td>
<td align="char" char=".">0.777</td>
</tr>
<tr>
<td align="left">Jacksonville</td>
<td align="center">NA</td>
<td align="center">NA</td>
<td align="char" char=".">0.787</td>
<td align="char" char=".">0.778</td>
</tr>
<tr>
<td align="left">Los Angeles</td>
<td align="center">NA</td>
<td align="center">NA</td>
<td align="char" char=".">0.793</td>
<td align="char" char=".">0.690</td>
</tr>
<tr>
<td align="left">Memphis</td>
<td align="center">NA</td>
<td align="center">NA</td>
<td align="char" char=".">0.850</td>
<td align="char" char=".">0.854</td>
</tr>
<tr>
<td align="left">Milwaukee</td>
<td align="center">NA</td>
<td align="center">NA</td>
<td align="char" char=".">0.761</td>
<td align="char" char=".">0.779</td>
</tr>
<tr>
<td align="left">Nashville</td>
<td align="center">NA</td>
<td align="center">0.827</td>
<td align="char" char=".">0.869</td>
<td align="char" char=".">0.875</td>
</tr>
<tr>
<td align="left">New Orleans</td>
<td align="center">NA</td>
<td align="center">NA</td>
<td align="char" char=".">0.858</td>
<td align="char" char=".">0.886</td>
</tr>
<tr>
<td align="left">New York</td>
<td align="center">NA</td>
<td align="center">0.555</td>
<td align="char" char=".">0.630</td>
<td align="char" char=".">0.639</td>
</tr>
<tr>
<td align="left">Oklahoma City</td>
<td align="center">NA</td>
<td align="center">NA</td>
<td align="char" char=".">0.463</td>
<td align="char" char=".">0.658</td>
</tr>
<tr>
<td align="left">Philadelphia</td>
<td align="center">NA</td>
<td align="center">NA</td>
<td align="char" char=".">0.718</td>
<td align="char" char=".">0.624</td>
</tr>
<tr>
<td align="left">Phoenix</td>
<td align="center">NA</td>
<td align="center">NA</td>
<td align="char" char=".">0.820</td>
<td align="char" char=".">0.685</td>
</tr>
<tr>
<td align="left">Portland</td>
<td align="center">NA</td>
<td align="center">NA</td>
<td align="char" char=".">0.837</td>
<td align="char" char=".">0.725</td>
</tr>
<tr>
<td align="left">San Antonio</td>
<td align="center">NA</td>
<td align="center">NA</td>
<td align="char" char=".">0.824</td>
<td align="char" char=".">0.809</td>
</tr>
<tr>
<td align="left">San Diego</td>
<td align="center">0.916<xref ref-type="table-fn" rid="t001fn001">*</xref></td>
<td align="center">0.693</td>
<td align="char" char=".">0.750</td>
<td align="char" char=".">0.626</td>
</tr>
<tr>
<td align="left">San Francisco</td>
<td align="center">NA</td>
<td align="center">NA</td>
<td align="char" char=".">0.707</td>
<td align="char" char=".">0.616</td>
</tr>
<tr>
<td align="left">San Jose</td>
<td align="center">NA</td>
<td align="center">NA</td>
<td align="char" char=".">0.715</td>
<td align="char" char=".">0.653</td>
</tr>
<tr>
<td align="left">Seattle</td>
<td align="center">NA</td>
<td align="center">0.830</td>
<td align="char" char=".">0.807</td>
<td align="char" char=".">0.665</td>
</tr>
<tr>
<td align="left">Washington DC</td>
<td align="center">NA</td>
<td align="center">NA</td>
<td align="char" char=".">0.756</td>
<td align="char" char=".">0.578</td>
</tr>
</tbody>
</table>
</alternatives>
<table-wrap-foot>
<fn id="t001fn001"><p>* Emergency ILI reports were incomplete and thus the correlation only compares tweet rates with available ILI data. Boston is missing weeks 36–46, 48, 50, and 6–10. Chicago is missing weeks 36–40 and 6–10. Cleveland is missing weeks 36–39 and 4–10. Columbus is missing weeks 36–39 and 6–10. San Diego is missing weeks 6–10.</p></fn>
</table-wrap-foot>
</table-wrap>
<p>In order to understand how flu tweets are related to national ILI percentages, we aggregated data for each city to formulate a national tweet rate. As shown in <xref ref-type="fig" rid="pone.0157734.g001">Fig 1</xref>, comparing the national ILI to the aggregated tweet rate reveals that the two numbers are highly correlated (r = 0.845). Though this national comparison ignores possible spatial variability in flu outbreaks, it nonetheless serves as a good baseline, because national ILI reports are much more reliable than data collected from local agencies.</p>
<fig id="pone.0157734.g001" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0157734.g001</object-id>
<label>Fig 1</label>
<caption>
<title>National ILI compared to the aggregated tweet rates for all study cities.</title>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0157734.g001" xlink:type="simple"/>
</fig>
<p>Comparisons between local ILI percentages and tweet rates for individual cities had mixed results, but many of the coefficients were significant. For instance, <xref ref-type="fig" rid="pone.0157734.g002">Fig 2a</xref> shows that Fort Worth had a coefficient of 0.855. Similarly, the Nashville-Davidson region (<xref ref-type="fig" rid="pone.0157734.g002">Fig 2b</xref>) demonstrates a close relationship between flu-related tweets and local ILI reports (r = 0.827), despite an apparent gap in ILI data for week 52.</p>
<fig id="pone.0157734.g002" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0157734.g002</object-id>
<label>Fig 2</label>
<caption>
<title>Local sentinel-provided ILI compared to the tweet rate for Fort Worth (a), Nashville (b), Cleveland (c), and Boston (d).</title>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0157734.g002" xlink:type="simple"/>
</fig>
<p>A number of cities produced results that were less satisfactory. In certain cases, this may be due to unreliable reporting of local ILI. As shown in <xref ref-type="fig" rid="pone.0157734.g002">Fig 2c</xref>, the coefficient for Cleveland is not as high as that of other cities; however, the curve for local ILI (in red) is very jagged, often dropping abruptly to 0.0%, which suggests that ILI is not reported reliably in this area. Nonetheless, other cities such as Boston (<xref ref-type="fig" rid="pone.0157734.g002">Fig 2d</xref>) appear to have a more consistent ILI curve but still reveal little correlation between tweet rate and local ILI, suggesting that the current filtering and classification methods may require fine-tuning to account for spatial variability.</p>
<p>Since local ILI reporting can often be problematic, this study also analyzed Twitter activity by taking advantage of regional ILI reports that are made available by the CDC. <xref ref-type="table" rid="pone.0157734.t002">Table 2</xref> shows each region (and the cities that are contained within) as well as the coefficients between the aggregated tweet rates and the ILI percentages for that region. In <xref ref-type="fig" rid="pone.0157734.g003">Fig 3</xref>, the regional correlation results have been mapped to show the geographic variation in the performance of our methods. Unsurprisingly, most regions consisting primarily of cities that showed poor correlations with national or local ILI reports also did not perform well at the regional level. However, aggregating the Twitter data into regions seemed to produce exceptional results in a number of cases. For instance, the correlation coefficient for Region 10 was significantly better (r = 0.927) than the results for the individual cities of Portland and Seattle. This finding suggests that regional ILI may be used to better evaluate the data collection and filtering methods, as it appears more reliable than local ILI reports, but still accounts for regional differences in the spread of influenza.</p>
<table-wrap id="pone.0157734.t002" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0157734.t002</object-id>
<label>Table 2</label> <caption><title>Correlation coefficients aggregated by region.</title> <p>Regional ILI data provided by CDC.</p></caption>
<alternatives>
<graphic id="pone.0157734.t002g" mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0157734.t002" xlink:type="simple"/>
<table>
<colgroup>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
</colgroup>
<thead>
<tr>
<th align="left">REGION</th>
<th align="left">CORRELATION WITH REGIONAL ILI</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left">Region 1 (Boston)</td>
<td align="left">0.445283886</td>
</tr>
<tr>
<td align="left">Region 2 (New York)</td>
<td align="left">0.643321552</td>
</tr>
<tr>
<td align="left">Region 3 (Baltimore, Philadelphia, Washington DC)</td>
<td align="left">0.503859481</td>
</tr>
<tr>
<td align="left">Region 4 (Atlanta, Jacksonville, Memphis, Nashville)</td>
<td align="left">0.899332773</td>
</tr>
<tr>
<td align="left">Region 5 (Chicago, Columbus, Cleveland, Detroit, Indianapolis, Milwaukee)</td>
<td align="left">0.903099689</td>
</tr>
<tr>
<td align="left">Region 6 (Austin, Dallas, El Paso, Fort Worth, Houston, Oklahoma City, New Orleans, San Antonio)</td>
<td align="left">0.891701735</td>
</tr>
<tr>
<td align="left">Region 7 (No data)</td>
<td align="left">NA</td>
</tr>
<tr>
<td align="left">Region 8 (Denver)</td>
<td align="left">0.541016527</td>
</tr>
<tr>
<td align="left">Region 9 (Los Angeles, Phoenix, San Diego, San Francisco, San Jose)</td>
<td align="left">0.887259347</td>
</tr>
<tr>
<td align="left">Region 10 (Portland, Seattle)</td>
<td align="left">0.927950078</td>
</tr>
</tbody>
</table>
</alternatives>
</table-wrap>
<fig id="pone.0157734.g003" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0157734.g003</object-id>
<label>Fig 3</label>
<caption>
<title>Map showing the correlation rank for each region.</title>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0157734.g003" xlink:type="simple"/>
</fig>
</sec>
<sec id="sec006">
<title>Limitations</title>
<p>Though this study improves upon previous attempts to analyze the relationship between flu-related tweets and real-world outbreaks of influenza, there are remaining issues that might be addressed in future work.</p>
<p>Currently the support vector classifier used to filter tweets was trained with data from the 2012–2013 season. While this training set produced satisfactory results with respect to the filtering process, this approach to classification ignores the dynamic nature of social media data. For instance, our team discovered that the signal for flu-related tweets in Baltimore for the 2013–2014 season was anomalous in that it did not closely correspond to local ILI reports. After manually analyzing individual tweets for the city, we identified many messages in Baltimore mentioning the phrase “fresher’s flu”, which is a colloquial term to describe illness that sometimes accompanies the start of a college semester. Though these tweets may be indicators of real-world illness, the term “fresher’s flu” generally does not refer to actual influenza. After adjusting our filtering process to account for this anomaly, the correlation coefficients for Baltimore were similar to other cities. This incident demonstrates the need to develop a framework for continually re-training the tweet classifier so that it can account for the temporal and spatial dynamics of messages in cyberspace.</p>
<p>Additionally, the regional analysis of flu tweets presented in this paper might be further improved by including a greater number of search areas. Since Twitter messages were only collected for 31 major U.S. cities, regions with few large urban areas contained sparse data. Three regions (1, 2, and 8) contained tweets from only one city, and one region (7) was not covered by any of our search areas. Including more search areas to cover less densely populated areas could address this issue.</p>
<p>Finally, it should be noted that if social media is used to systematically monitor influenza in the future, there are potential concerns that may need to be considered in order to maintain the reliability of this method. For instance, public awareness of these methods could influence behavior and consequently lead to false reporting. It is not hard to imagine a scenario where Twitter users would falsify flu-related tweets in order to garner more attention from public health officials and receive more resources such as vaccination supplies. However, as discussed by Petróczi and Haugen [<xref ref-type="bibr" rid="pone.0157734.ref015">15</xref>], such false reporting may be counterbalanced by further understanding the motivations individuals may have for distorting the truth, and these insights may allow researchers to identify Twitter messages that are likely to be insincere.</p>
</sec>
<sec id="sec007" sec-type="conclusions">
<title>Conclusions</title>
<p>This study demonstrates that social media holds great potential for monitoring the outbreak of flu and other illnesses. It builds on previous work but suggests that GIS methods can augment existing approaches. Additionally, this paper introduces a machine learning procedure to filter out noise from collected tweets—a task that has been a long-standing hurdle preventing researchers from taking Twitter seriously as a source of data. The outcome demonstrates that this procedure yields results that correlate more strongly with national and local ILI reports.</p>
</sec>
<sec id="sec008">
<title>Supporting Information</title>
<supplementary-material id="pone.0157734.s001" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" position="float" xlink:href="info:doi/10.1371/journal.pone.0157734.s001" xlink:type="simple">
<label>S1 File</label>
<caption>
<title>A Microsoft Excel file containing tables that list influenza-like illness (ILI) rates at local, regional, and national levels.</title>
<p>(XLSX)</p>
</caption>
</supplementary-material>
<supplementary-material id="pone.0157734.s002" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" position="float" xlink:href="info:doi/10.1371/journal.pone.0157734.s002" xlink:type="simple">
<label>S2 File</label>
<caption>
<title>A Microsoft Excel file containing tables that list flu-related tweet counts at local, regional, and national levels.</title>
<p>(XLSX)</p>
</caption>
</supplementary-material>
<supplementary-material id="pone.0157734.s003" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document" position="float" xlink:href="info:doi/10.1371/journal.pone.0157734.s003" xlink:type="simple">
<label>S3 File</label>
<caption>
<title>A document containing links to example source code for collecting geo-targeted Twitter data and the web interface that is used to view and download Twitter data.</title>
<p>(DOCX)</p>
</caption>
</supplementary-material>
</sec>
</body>
<back>
<ref-list>
<title>References</title>
<ref id="pone.0157734.ref001"><label>1</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Dukic</surname> <given-names>VM</given-names></name>, <name name-style="western"><surname>Hedibert</surname> <given-names>FL</given-names></name>, <name name-style="western"><surname>Polson</surname> <given-names>N</given-names></name>. <article-title>Tracking Flu Epidemics Using Google Flu Trends and Particle Learning</article-title>. <source>Social Science Research Network</source>. <year>2009</year> <month>Nov</month> <day>25</day>.</mixed-citation></ref>
<ref id="pone.0157734.ref002"><label>2</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Dugas</surname> <given-names>AF</given-names></name>, <name name-style="western"><surname>Mehdi</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Yulia</surname> <given-names>G</given-names></name>, <name name-style="western"><surname>Levin</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Torcaso</surname> <given-names>F</given-names></name>, <name name-style="western"><surname>Igusa</surname> <given-names>T</given-names></name>, <etal>et al</etal>. <article-title>Influenza Forecasting with Google Flu Trends</article-title>. <source>PLoS ONE</source>. <year>2013</year>; <volume>8</volume>(<issue>2</issue>): <fpage>e56176</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.1371/journal.pone.0056176" xlink:type="simple">10.1371/journal.pone.0056176</ext-link></comment> <object-id pub-id-type="pmid">23457520</object-id></mixed-citation></ref>
<ref id="pone.0157734.ref003"><label>3</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Olson</surname> <given-names>D</given-names></name>, <name name-style="western"><surname>Konty</surname> <given-names>K</given-names></name>, <name name-style="western"><surname>Paladini</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Viboud</surname> <given-names>C</given-names></name>, <name name-style="western"><surname>Simonsen</surname> <given-names>L</given-names></name>. <article-title>Reassessing Google Flu Trends Data for Detection of Seasonal and Pandemic Influenza: A Comparative Epidemiological Study at Three Geographic Scales</article-title>. <source>PLoS Computational Biology</source>. <year>2013</year>; <volume>9</volume>(<issue>10</issue>): <fpage>e1003256</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.1371/journal.pcbi.1003256" xlink:type="simple">10.1371/journal.pcbi.1003256</ext-link></comment> <object-id pub-id-type="pmid">24146603</object-id></mixed-citation></ref>
<ref id="pone.0157734.ref004"><label>4</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Butler</surname> <given-names>D</given-names></name>. <article-title>When Google got Flu Wrong</article-title>. <source>Nature</source>. <year>2013</year> <month>Feb</month> <day>14</day>; <volume>494</volume>(<issue>7436</issue>): <fpage>155</fpage>–<lpage>6</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.1038/494155a" xlink:type="simple">10.1038/494155a</ext-link></comment> <object-id pub-id-type="pmid">23407515</object-id></mixed-citation></ref>
<ref id="pone.0157734.ref005"><label>5</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Newsam</surname> <given-names>S</given-names></name>. <article-title>Crowdsourcing What Is Where: Community-Contributed Photos as Volunteered Geographic Information</article-title>. <source>IEEE Multimedia: Special Issue on Mining Community-Contributed Multimedia</source>. <year>2010</year>; <volume>17</volume> (<issue>4</issue>): <fpage>36</fpage>–<lpage>45</lpage>.</mixed-citation></ref>
<ref id="pone.0157734.ref006"><label>6</label><mixed-citation publication-type="other" xlink:type="simple">Perreault M, Ruths D. The Effect of Mobile Platforms on Twitter Content Generation. In Proc. of the International Conference on Social Media and Weblogs. 2011.</mixed-citation></ref>
<ref id="pone.0157734.ref007"><label>7</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Golder</surname> <given-names>SA</given-names></name>, <name name-style="western"><surname>Macy</surname> <given-names>MW</given-names></name>. <article-title>Diurnal and Seasonal Mood Vary with Work, Sleep, and Daylength across Diverse Cultures</article-title>. <source>Science</source>. <year>2011</year>; <volume>333</volume>: <fpage>1878</fpage>–<lpage>1881</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.1126/science.1202775" xlink:type="simple">10.1126/science.1202775</ext-link></comment> <object-id pub-id-type="pmid">21960633</object-id></mixed-citation></ref>
<ref id="pone.0157734.ref008"><label>8</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Nagel</surname> <given-names>AC</given-names></name>, <name name-style="western"><surname>Tsou</surname> <given-names>MH</given-names></name>, <name name-style="western"><surname>Spitzberg</surname> <given-names>B</given-names></name>, <name name-style="western"><surname>An</surname> <given-names>L</given-names></name>, <name name-style="western"><surname>Gawron</surname> <given-names>JM</given-names></name>, <name name-style="western"><surname>Gupta</surname> <given-names>D</given-names></name>, <etal>et al</etal>. <article-title>The Complex Relationship of Realspace Events and Messages in Cyberspace: Case Study of Influenza and Pertussis Using Tweets</article-title>. <source>Journal of Medical Internet Research</source>. <year>2013</year> <month>Oct</month> <day>26</day>; <volume>15</volume>(<issue>10</issue>): <fpage>e237</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.2196/jmir.2705" xlink:type="simple">10.2196/jmir.2705</ext-link></comment> <object-id pub-id-type="pmid">24158773</object-id></mixed-citation></ref>
<ref id="pone.0157734.ref009"><label>9</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Lamb</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Paul</surname> <given-names>MJ</given-names></name>, <name name-style="western"><surname>Dredze</surname> <given-names>M</given-names></name>. <article-title>Separating Fact from Fear: Tracking Flu Infections on Twitter</article-title>. <source>In Proc. of NAACL-HLT</source>. <year>2013</year>; <fpage>789</fpage>–<lpage>795</lpage>.</mixed-citation></ref>
<ref id="pone.0157734.ref010"><label>10</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Tsou</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Leitner</surname> <given-names>M</given-names></name>. <article-title>Editorial: Visualization of Social Media: Seeing a Mirage or a Message? In Special Content Issue: Mapping Cyberspace and Social Media</article-title>. <source>Cartography and Geographic Information Science</source>. <year>2013</year>; <volume>40</volume>(<issue>2</issue>): <fpage>55</fpage>–<lpage>60</lpage>.</mixed-citation></ref>
<ref id="pone.0157734.ref011"><label>11</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Tsou</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Yang</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Lusher</surname> <given-names>D</given-names></name>, <name name-style="western"><surname>Han</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Spitzberg</surname> <given-names>B</given-names></name>, <name name-style="western"><surname>Gawron</surname> <given-names>JM</given-names></name>, <etal>et al</etal>. <article-title>Mapping Social Activities and Concepts with Social Media (Twitter) and Web Search Engines (Yahoo and Bing): A Case Study in 2012 U.S. Presidential Election</article-title>. <source>Cartography and Geographic Information Science</source>. <year>2013</year>; <volume>40</volume>(<issue>4</issue>): <fpage>337</fpage>–<lpage>348</lpage>.</mixed-citation></ref>
<ref id="pone.0157734.ref012"><label>12</label><mixed-citation publication-type="other" xlink:type="simple">Joachims T. Text Categorization with Support Vector Machines: Learning with Many Relevant Features. In Proc. of the European Conference on Machine Learning (ECML). Springer. 1998.</mixed-citation></ref>
<ref id="pone.0157734.ref013"><label>13</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Salton</surname> <given-names>G</given-names></name>, <name name-style="western"><surname>Buckley</surname> <given-names>C</given-names></name>. <article-title>Term-Weighting Approaches in Automatic Text Retrieval</article-title>. <source>Information Processing &amp; Management</source>. <year>1988</year>; <volume>24</volume>(<issue>5</issue>): <fpage>513</fpage>–<lpage>23</lpage>.</mixed-citation></ref>
<ref id="pone.0157734.ref014"><label>14</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Aslam</surname> <given-names>AA</given-names></name>, <name name-style="western"><surname>Tsou</surname> <given-names>MH</given-names></name>, <name name-style="western"><surname>Spitzberg</surname> <given-names>BH</given-names></name>, <name name-style="western"><surname>An</surname> <given-names>Li</given-names></name>, <name name-style="western"><surname>Gawron</surname> <given-names>JM</given-names></name>, <name name-style="western"><surname>Gupta</surname> <given-names>DK</given-names></name>, <etal>et al</etal>. <article-title>The Reliability of Tweets as a Supplementary Method of Seasonal Influenza Surveillance</article-title>. <source>J Med Internet Res</source>. <year>2014</year>; <volume>16</volume>(<issue>11</issue>): <fpage>e250</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.2196/jmir.3532" xlink:type="simple">10.2196/jmir.3532</ext-link></comment> <object-id pub-id-type="pmid">25406040</object-id></mixed-citation></ref>
<ref id="pone.0157734.ref015"><label>15</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Petróczi</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Haugen</surname> <given-names>K</given-names></name>. “<article-title>The Doping Self-Reporting Game: The Paradox of a ‘false-Telling’ Mechanism and Its Potential Research and Policy Implications</article-title>.” <source><italic>Sport Management Review</italic></source> <volume>15</volume>, <issue>no. 4</issue> (<month>November</month> <year>2012</year>): <fpage>513</fpage>–<lpage>17</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.1016/j.smr.2012.04.002" xlink:type="simple">10.1016/j.smr.2012.04.002</ext-link></comment></mixed-citation></ref>
</ref-list>
</back>
</article>