<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article
  PUBLIC "-//NLM//DTD Journal Publishing DTD v3.0 20080202//EN" "http://dtd.nlm.nih.gov/publishing/3.0/journalpublishing3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="3.0" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">plos</journal-id>
<journal-id journal-id-type="nlm-ta">PLoS Comput Biol</journal-id>
<journal-id journal-id-type="pmc">ploscomp</journal-id><journal-title-group>
<journal-title>PLoS Computational Biology</journal-title></journal-title-group>
<issn pub-type="ppub">1553-734X</issn>
<issn pub-type="epub">1553-7358</issn>
<publisher>
<publisher-name>Public Library of Science</publisher-name>
<publisher-loc>San Francisco, USA</publisher-loc></publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">PCOMPBIOL-D-14-00285</article-id>
<article-id pub-id-type="doi">10.1371/journal.pcbi.1003692</article-id>
<article-categories><subj-group subj-group-type="heading"><subject>Research Article</subject></subj-group><subj-group subj-group-type="Discipline-v2"><subject>Biology and life sciences</subject><subj-group><subject>Agriculture</subject><subj-group><subject>Agricultural production</subject></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v2"><subject>Computer and information sciences</subject><subj-group><subject>Computerized simulations</subject></subj-group></subj-group><subj-group subj-group-type="Discipline-v2"><subject>Medicine and health sciences</subject><subj-group><subject>Epidemiology</subject></subj-group><subj-group><subject>Health care</subject><subj-group><subject>Environmental health</subject></subj-group></subj-group><subj-group><subject>Infectious diseases</subject><subj-group><subject>Infectious disease control</subject></subj-group></subj-group><subj-group><subject>Public and occupational health</subject></subj-group></subj-group><subj-group subj-group-type="Discipline-v2"><subject>Physical sciences</subject><subj-group><subject>Mathematics</subject><subj-group><subject>Applied mathematics</subject><subj-group><subject>Algorithms</subject></subj-group></subj-group></subj-group></subj-group></article-categories>
<title-group>
<article-title>A Likelihood-Based Approach to Identifying Contaminated Food Products Using Sales Data: Performance and Challenges</article-title>
<alt-title alt-title-type="running-head">Sales Data Can Speed Investigation of Contaminated Food</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" xlink:type="simple"><name name-style="western"><surname>Kaufman</surname><given-names>James</given-names></name><xref ref-type="aff" rid="aff1"><sup>1</sup></xref><xref ref-type="corresp" rid="cor1"><sup>*</sup></xref></contrib>
<contrib contrib-type="author" xlink:type="simple"><name name-style="western"><surname>Lessler</surname><given-names>Justin</given-names></name><xref ref-type="aff" rid="aff2"><sup>2</sup></xref></contrib>
<contrib contrib-type="author" xlink:type="simple"><name name-style="western"><surname>Harry</surname><given-names>April</given-names></name><xref ref-type="aff" rid="aff1"><sup>1</sup></xref><xref ref-type="fn" rid="fn1"><sup>¤</sup></xref></contrib>
<contrib contrib-type="author" xlink:type="simple"><name name-style="western"><surname>Edlund</surname><given-names>Stefan</given-names></name><xref ref-type="aff" rid="aff1"><sup>1</sup></xref></contrib>
<contrib contrib-type="author" xlink:type="simple"><name name-style="western"><surname>Hu</surname><given-names>Kun</given-names></name><xref ref-type="aff" rid="aff1"><sup>1</sup></xref></contrib>
<contrib contrib-type="author" xlink:type="simple"><name name-style="western"><surname>Douglas</surname><given-names>Judith</given-names></name><xref ref-type="aff" rid="aff1"><sup>1</sup></xref></contrib>
<contrib contrib-type="author" xlink:type="simple"><name name-style="western"><surname>Thoens</surname><given-names>Christian</given-names></name><xref ref-type="aff" rid="aff3"><sup>3</sup></xref></contrib>
<contrib contrib-type="author" xlink:type="simple"><name name-style="western"><surname>Appel</surname><given-names>Bernd</given-names></name><xref ref-type="aff" rid="aff3"><sup>3</sup></xref></contrib>
<contrib contrib-type="author" xlink:type="simple"><name name-style="western"><surname>Käsbohrer</surname><given-names>Annemarie</given-names></name><xref ref-type="aff" rid="aff3"><sup>3</sup></xref></contrib>
<contrib contrib-type="author" xlink:type="simple"><name name-style="western"><surname>Filter</surname><given-names>Matthias</given-names></name><xref ref-type="aff" rid="aff3"><sup>3</sup></xref></contrib>
</contrib-group>
<aff id="aff1"><label>1</label><addr-line>IBM Almaden Research Center, San Jose, California, United States of America</addr-line></aff>
<aff id="aff2"><label>2</label><addr-line>Department of Epidemiology, Johns Hopkins Bloomberg School of Public Health, Baltimore, Maryland, United States of America</addr-line></aff>
<aff id="aff3"><label>3</label><addr-line>Federal Institute for Risk Assessment, Berlin, Germany</addr-line></aff>
<contrib-group>
<contrib contrib-type="editor" xlink:type="simple"><name name-style="western"><surname>Salathé</surname><given-names>Marcel</given-names></name>
<role>Editor</role>
<xref ref-type="aff" rid="edit1"/></contrib>
</contrib-group>
<aff id="edit1"><addr-line>Pennsylvania State University, United States of America</addr-line></aff>
<author-notes>
<corresp id="cor1">* E-mail: <email xlink:type="simple">jhkauf@us.ibm.com</email></corresp>
<fn fn-type="conflict"><p>SE, KH, JD, and JK are paid employees of IBM Almaden Research Center. All other authors have declared that no competing interests exist.</p></fn>
<fn fn-type="con"><p>Conceived and designed the experiments: SE KH JK. Performed the experiments: KH MF CT SE AH. Analyzed the data: BA CT KH AH AK JL. Contributed reagents/materials/analysis tools: SE AH JK MF CT JL. Wrote the paper: SE KH JD JK. Preprocessed/organized raw food sales data: CT BA AK. Designed/wrote algorithm: SE. Planned/clustered data: AH CT. Proposed use of retail data with public health data: JK.</p></fn>
<fn id="fn1" fn-type="current-aff"><label>¤</label><p>Current address: Department of Statistics, Purdue University, West Lafayette, Indiana, United States of America</p></fn>
</author-notes>
<pub-date pub-type="collection"><month>7</month><year>2014</year></pub-date>
<pub-date pub-type="epub"><day>3</day><month>7</month><year>2014</year></pub-date>
<volume>10</volume>
<issue>7</issue>
<elocation-id>e1003692</elocation-id>
<history>
<date date-type="received"><day>14</day><month>2</month><year>2014</year></date>
<date date-type="accepted"><day>8</day><month>5</month><year>2014</year></date>
</history>
<permissions>
<copyright-year>2014</copyright-year>
<copyright-holder>Kaufman et al</copyright-holder><license xlink:type="simple"><license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">Creative Commons Attribution License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.</license-p></license></permissions>
<abstract>
<p>Foodborne disease outbreaks of recent years demonstrate that due to increasingly interconnected supply chains these types of crisis situations have the potential to affect thousands of people, leading to significant healthcare costs, loss of revenue for food companies, and—in the worst cases—death. When a disease outbreak is detected, identifying the contaminated food quickly is vital to minimize suffering and limit economic losses. Here we present a likelihood-based approach that has the potential to reduce the time needed to identify possibly contaminated food products, which is based on exploitation of food product sales data and the distribution of foodborne illness case reports. Using a real world food sales data set and artificially generated outbreak scenarios, we show that this method performs very well for contamination scenarios originating from a single “guilty” food product. As it is neither always possible nor necessary to identify the <italic>single</italic> offending product, the method has been extended such that it can be used as a binary classifier. With this extension it is possible to generate a set of potentially “guilty” products that contains the real outbreak source with very high accuracy. Furthermore we explore the patterns of food distributions that lead to “hard-to-identify” foods, the possibility of identifying these food groups <italic>a priori</italic>, and the extent to which the likelihood-based method can be used to quantify uncertainty. We find that high spatial correlation of sales data between products may be a useful indicator for “hard-to-identify” products.</p>
</abstract>
<abstract abstract-type="summary"><title>Author Summary</title>
<p>Response to foodborne disease outbreaks is complicated by globalization of our food supply chains. Rapid identification of contaminated products is essential to limit the damage caused by foodborne disease. Worldwide, foodborne disease outbreaks are responsible for $9B a year in medical costs and over $75B in economic losses. Yet relevant data required to accelerate the identification of suspicious food already exists as part of the inventory control systems used by retailers and distributors today. Combining this retail data with public health case reports has the potential to hasten outbreak investigations and provide public health investigators with better information on suspected products to test. This paper demonstrates the feasibility of the principle and efficiency of this approach. Based on these findings it can be concluded that in foodborne disease outbreaks retail data could be used to speed and target public health investigations and consequently reduce numbers of sick/dead people as well as reduce economic losses to the industry.</p>
</abstract>
<funding-group><funding-statement>Research carried out by the Federal Institute for Risk Assessment (BfR), Germany, has been supported by Federal Ministry of Education and Research (BMBF) research grant 13N11202. JL's work on this project was funded in part by grants from the NIH National Institute of Allergy and Infectious Disease (K22 AI092150-01 and R01 AI102939-01A1). The funders had no role in study design, data collection and analysis, decision to publish, or preparation of the manuscript.</funding-statement></funding-group><counts><page-count count="10"/></counts></article-meta>
</front>
<body><sec id="s1">
<title>Introduction</title>
<p>In recent years global trade has significantly altered the topology of food supply chains <xref ref-type="bibr" rid="pcbi.1003692-European1">[1]</xref>. As a result, the potential impact of contamination events has increased <xref ref-type="bibr" rid="pcbi.1003692-Marvin1">[2]</xref>. Worldwide, foodborne illness causes billions of dollars in healthcare related costs each year <xref ref-type="bibr" rid="pcbi.1003692-Hoffmann1">[3]</xref>, and more in economic losses to farmers, distributors, and food retailers <xref ref-type="bibr" rid="pcbi.1003692-Gadiel1">[4]</xref>, <xref ref-type="bibr" rid="pcbi.1003692-Abe1">[5]</xref>. In case of a foodborne disease outbreak, rapid identification of contaminated products is essential, since the medical and economic damages incurred grow with the duration of the outbreak. Currently public health investigators must reconstruct the relevant food distribution network in order to identify the contaminated food product or contaminated product groups during an outbreak <xref ref-type="bibr" rid="pcbi.1003692-World1">[6]</xref>. Lab-based analytical methods frequently provide the “gold standard” in verifying the source of foodborne illness outbreaks. These methods verify or cast doubt on epidemiological findings originating from case control studies with food consumption questionnaires <xref ref-type="bibr" rid="pcbi.1003692-Pastore1">[7]</xref>. In addition, the ability to track food through different stages of production, processing, and distribution (traceability) has been the subject of extensive study <xref ref-type="bibr" rid="pcbi.1003692-Regattieri1">[8]</xref>, <xref ref-type="bibr" rid="pcbi.1003692-Greig1">[9]</xref>. Nevertheless the time required to accomplish such investigations usually ranges from weeks to months. Accelerating this process may reduce the number of people sickened and help to restore consumer confidence in the safety of food products <xref ref-type="bibr" rid="pcbi.1003692-Filter1">[10]</xref>.</p>
<p>In a previous study, as a possible strategy to achieve this goal, we proposed a likelihood-based method that could be applied as an early response system to help determine the product most likely to be associated with a foodborne disease outbreak <xref ref-type="bibr" rid="pcbi.1003692-Doerr1">[11]</xref>. The method was tested with synthetic food sales data, but real data is readily available from retail sales companies. Proactive analysis of this retail data could complement and guide laboratory testing and trace back analysis.</p>
<p>In the work reported here, we test our likelihood-based method using raw food sales data. As a simplifying assumption, we model food consumption at the point of sale region. In future work, we will test this assumption by applying Huff's “gravity model” for retail shopping to smooth the sales distribution over other regions <xref ref-type="bibr" rid="pcbi.1003692-Huff1">[12]</xref>. Smoothing the sales distribution will also allow sensitivity analysis to spatial noise in the case reports.</p>
<p>In applying the likelihood-based method to real world sales data, we use a ROC (receiver operating characteristics) analysis to quantify the performance of the method, comparing two different classifiers. This analysis also identifies the optimal discrimination threshold to maximize performance as a function of both the sensitivity and specificity for the likelihood-based analysis. Additionally we explore how the method's performance may depend on “structural” properties of the sales data distribution, as this understanding is essential for efforts to proactively predict which contaminated foods/food groups might be hard to pinpoint in the event of an outbreak.</p>
</sec><sec id="s2" sec-type="methods">
<title>Methods</title>
<sec id="s2a">
<title>Food Sales Data</title>
<p>We apply product specific retail sales data from stores of a German food retail company covering 3,513 of Germany's 8,235 postal zones. The dataset lists the weekly sales of 580 anonymous food products (N = 580). For application in this analysis, sales data were aggregated per postal zone and product over the three-year period 01/2008 to 12/2010. Let <italic>sales(n, r)</italic> represent the number of units of food product <italic>n</italic> sold in region <italic>r</italic> over this three-year period. We can now define a function <italic>f<sub>s</sub>(n, r)</italic> representing the probability that product <italic>n</italic> is sold in region <italic>r</italic> as:<disp-formula id="pcbi.1003692.e001"><graphic position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1003692.e001" xlink:type="simple"/><label>(1)</label></disp-formula>where <italic>R</italic> is the set of all regions included in the analysis.</p>
</sec><sec id="s2b">
<title>Outbreak Pattern Generation</title>
<p>The underlying assumption of outbreak pattern generation is that for each product the distribution of sales across the postal codes reflects the true consumption pattern for that food <xref ref-type="bibr" rid="pcbi.1003692-Huff1">[12]</xref>. Hence, the function <italic>f<sub>c</sub>(n, r)</italic> represents the probability that product <italic>n</italic> is <italic>consumed</italic> in region <italic>r</italic> and in this paper we simply assume probability of consumption equals probability of sale:<disp-formula id="pcbi.1003692.e002"><graphic position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1003692.e002" xlink:type="simple"/><label>(2)</label></disp-formula></p>
<p>Notice that for a given product <italic>n</italic>, <italic>f<sub>c</sub> (n, r)</italic> is a discrete probability mass function representing the probability that product <italic>n</italic> is consumed in location <italic>r</italic>, and that:<disp-formula id="pcbi.1003692.e003"><graphic position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1003692.e003" xlink:type="simple"/><label>(3)</label></disp-formula></p>
<p>We take advantage of this when generating synthetic outbreak case reports for a selected “contaminated” product <italic>x</italic> (where we use <italic>x</italic> instead of <italic>n</italic> to indicate a single contaminated product). Using A. J. Walker's alias method <xref ref-type="bibr" rid="pcbi.1003692-Walker1">[13]</xref>, we draw <italic>M</italic> random locations by sampling from <italic>f<sub>c</sub> (x, r)</italic> over all locations <italic>r</italic> in <italic>R</italic>. In separate trials, synthetic case report data are generated assuming each of the 580 products, in turn, as the source of contamination. We assume the products are independent so <italic>f<sub>c</sub> (x, r)</italic> also defines the probability of a case report at location <italic>r</italic> due to contaminated product <italic>x</italic>. It is true that two “products” with different local “brands” or “ids” could in fact be the same food item simply rebranded when repackaged locally. Conversely, a product sold on a national scale under one single brand could become contaminated at a single point of sale retail site (e.g., a butcher shop). For the purposes of this study, the simulated case reports were generated self consistently from the retail data using the assumption that the data provided to us by product id were independent. Depending upon the spatial distribution of product <italic>x</italic>, it is likely that, during one simulated outbreak of 100 cases, multiple case reports will come from the same postal code. <xref ref-type="fig" rid="pcbi-1003692-g001">Figure 1</xref> plots the number of case reports per location for several different outbreaks each generated based on a different product. Distributions generated from widely distributed products (shown in blue) are flatter than distributions generated from products sold only locally or regionally (shown in red).</p>
<fig id="pcbi-1003692-g001" position="float"><object-id pub-id-type="doi">10.1371/journal.pcbi.1003692.g001</object-id><label>Figure 1</label><caption>
<title>Number of case reports per location for several different outbreaks each generated based on a different product.</title>
<p>For each product the results are averaged over 50 trials. For each trial, the x axis is sorted from most to least frequently occurring location to show the outbreak pattern.</p>
</caption><graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1003692.g001" position="float" xlink:type="simple"/></fig></sec><sec id="s2c">
<title>Identifying Implicated Products</title>
<p>An outbreak can be described by the set of locations {<italic>R</italic>} of all reported cases where <italic>r<sub>i</sub></italic> is the location of the <italic>i<sup>th</sup></italic> case. Note that there is no limit or constraint on how many cases may come from a particular location. In order to identify implicated products we describe two estimation methods below.</p>
<sec id="s2c1">
<title>Method 1: The likelihood based method</title>
<p>Let <inline-formula><inline-graphic xlink:href="info:doi/10.1371/journal.pcbi.1003692.e004" xlink:type="simple"/></inline-formula> be a parameter vector of length N, such that <inline-formula><inline-graphic xlink:href="info:doi/10.1371/journal.pcbi.1003692.e005" xlink:type="simple"/></inline-formula> is 1 if a product <italic>k</italic> is contaminated and zero otherwise. Here we assume there is a single contaminated product in a given outbreak so only one element of vector <inline-formula><inline-graphic xlink:href="info:doi/10.1371/journal.pcbi.1003692.e006" xlink:type="simple"/></inline-formula>. If we consider <inline-formula><inline-graphic xlink:href="info:doi/10.1371/journal.pcbi.1003692.e007" xlink:type="simple"/></inline-formula> to be the parameter vector designating <italic>k</italic> as the contaminated product, then the likelihood of <inline-formula><inline-graphic xlink:href="info:doi/10.1371/journal.pcbi.1003692.e008" xlink:type="simple"/></inline-formula> after observing <italic>m</italic> case reports is:<disp-formula id="pcbi.1003692.e009"><graphic position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1003692.e009" xlink:type="simple"/><label>(4)</label></disp-formula>where <inline-formula><inline-graphic xlink:href="info:doi/10.1371/journal.pcbi.1003692.e010" xlink:type="simple"/></inline-formula> is the probability that an individual living in location <italic>r<sub>i</sub></italic> consumed product <italic>k</italic> (see <xref ref-type="supplementary-material" rid="pcbi.1003692.s005">Text S1</xref> on the derivation of likelihood). Hence each element <italic>P<sub>k</sub>(m)</italic> of the vector <italic>P(m)</italic> is proportional to the likelihood that product <italic>k</italic> is the contaminated product. Dividing each element of <italic>P(m)</italic> by the largest element in <italic>P(m)</italic> yields the likelihood ratio for each product being the contaminated product given the first m elements of <italic>R</italic>. 
We denote this as <inline-formula><inline-graphic xlink:href="info:doi/10.1371/journal.pcbi.1003692.e011" xlink:type="simple"/></inline-formula>. The product <italic>k</italic> that corresponds to the maximal element of <inline-formula><inline-graphic xlink:href="info:doi/10.1371/journal.pcbi.1003692.e012" xlink:type="simple"/></inline-formula> is our maximum likelihood estimate for the contaminated product.</p>
</sec><sec id="s2c2">
<title>Method 2: The pair-wise Spearman's rank correlation method</title>
<p>Let <italic>sales(k)</italic> be a vector of length 3,513 (number of postal zones or locations used) where each element represents the total number of units of product <italic>k</italic> sold in a given location. Also, assume <italic>outbreak(R)</italic> is a vector of length 3,513 where each element represents the number of times a location was drawn in <italic>R</italic>. Now a pair-wise Spearman's rank correlation coefficient is calculated for element <italic>k</italic> in positive definite vector <italic>P(m)</italic> by:<disp-formula id="pcbi.1003692.e013"><graphic position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1003692.e013" xlink:type="simple"/><label>(5)</label></disp-formula></p>
<p>In this method, normalization of <italic>P(m)</italic> is done by setting its <italic>k<sup>th</sup></italic> element to:<disp-formula id="pcbi.1003692.e014"><graphic position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1003692.e014" xlink:type="simple"/><label>(6)</label></disp-formula></p>
</sec></sec><sec id="s2d">
<title>Performance Estimation</title>
<p>We run the analysis varying the contaminated product, <italic>x</italic>, over all <italic>N</italic> = 580 products, and up to <italic>M</italic> = 100 synthetic case reports ending up with 58,000 <inline-formula><inline-graphic xlink:href="info:doi/10.1371/journal.pcbi.1003692.e015" xlink:type="simple"/></inline-formula> vectors. Next, we repeat the experiment over <italic>S</italic> = 100 randomly seeded runs, denoting <inline-formula><inline-graphic xlink:href="info:doi/10.1371/journal.pcbi.1003692.e016" xlink:type="simple"/></inline-formula> the outcome of <inline-formula><inline-graphic xlink:href="info:doi/10.1371/journal.pcbi.1003692.e017" xlink:type="simple"/></inline-formula> in the <italic>s<sup>th</sup></italic> experiment. (See <xref ref-type="supplementary-material" rid="pcbi.1003692.s001">Dataset S1</xref> and <xref ref-type="supplementary-material" rid="pcbi.1003692.s002">Dataset S2</xref>.) Now we define a statistic <italic>A<sub>x,m</sub></italic> as follows:<disp-formula id="pcbi.1003692.e018"><graphic position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1003692.e018" xlink:type="simple"/><label>(7)</label></disp-formula><inline-formula><inline-graphic xlink:href="info:doi/10.1371/journal.pcbi.1003692.e019" xlink:type="simple"/></inline-formula> is a function that returns 1 if the index of maximum element in vector <italic>v</italic> is <italic>i</italic>; if not it returns 0. We call statistic <italic>A</italic> the success rate <xref ref-type="bibr" rid="pcbi.1003692-Doerr1">[11]</xref>.</p>
<p>Statistic <italic>B</italic> is based upon an ROC analysis. In an ROC analysis, we compute the average true positive rate and false positive rate (the former is also called sensitivity; the latter equals 1 − specificity). The average true positive rate (TPR) for a discrimination threshold <bold><italic>t</italic></bold> is defined as:<disp-formula id="pcbi.1003692.e020"><graphic position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1003692.e020" xlink:type="simple"/><label>(8)</label></disp-formula></p>
<p>Here we assume the ≥ test returns 1 when satisfied, 0 otherwise. Essentially we sum the total number of outcomes where the ratio of “guilty” product <italic>x</italic> is above the threshold and then average over the <italic>S</italic> runs.</p>
<p>To define the false positive rate for a contaminated product <italic>x</italic> after <italic>m</italic> case reports, we first compute the number of true negatives in run <italic>s</italic>:<disp-formula id="pcbi.1003692.e021"><graphic position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1003692.e021" xlink:type="simple"/><label>(9)</label></disp-formula></p>
<p>Next we compute the number of false positives:<disp-formula id="pcbi.1003692.e022"><graphic position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1003692.e022" xlink:type="simple"/><label>(10)</label></disp-formula></p>
<p>The average false positive rate is now defined as:<disp-formula id="pcbi.1003692.e023"><graphic position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1003692.e023" xlink:type="simple"/><label>(11)</label></disp-formula></p>
<p>In the analysis, we use the thresholds <italic>t</italic> of 1/256, 1/128, 1/64, 1/32, 1/16, 1/8, 1/4, 1/2 and 1 to generate the Area Under Curve (AUC) statistic. As some food distributions within the data set had no overlap with the generated outbreak pattern, and to avoid overestimation of specificity, we exclude so-called “zero probability” products from the average in the corresponding scenario. A product belongs to the zero probability category, by definition, when after 100 trials and 100 case reports for each trial, that product is <italic>never</italic> sold in any sampled location. Failure to exclude the zero probability set would artificially exaggerate the specificity of the method.</p>
</sec><sec id="s2e">
<title>Clustering of Food Products</title>
<p>In order to analyze how different food distribution patterns can influence the performance of the likelihood-based method, the similarity of the distribution patterns of the food products was measured by calculating the pair-wise Spearman's rank correlation coefficient, <inline-formula><inline-graphic xlink:href="info:doi/10.1371/journal.pcbi.1003692.e024" xlink:type="simple"/></inline-formula>, on the basis of sales distribution data of all food products <xref ref-type="bibr" rid="pcbi.1003692-Clifford1">[14]</xref>. Similar to the estimation technique described in method 2 above, let <italic>sales(k)</italic> be a vector of length 3,513 (number of locations) where each element is the total sales of product <italic>k</italic> in a given location. The pair-wise Spearman's rank correlation between two products, <italic>k</italic> and <italic>l</italic>, becomes:<disp-formula id="pcbi.1003692.e025"><graphic position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1003692.e025" xlink:type="simple"/><label>(12)</label></disp-formula></p>
<p>Since Spearman's <inline-formula><inline-graphic xlink:href="info:doi/10.1371/journal.pcbi.1003692.e026" xlink:type="simple"/></inline-formula> provides a measure of pair-wise association between food distributions, the value of 1−<inline-formula><inline-graphic xlink:href="info:doi/10.1371/journal.pcbi.1003692.e027" xlink:type="simple"/></inline-formula> served as a <italic>dissimilarity</italic> measure describing the “distance” between each pair of food products. This measure was used as input for a hierarchical clustering algorithm [function hclust()] using the complete linkage method provided by the R-Manual <xref ref-type="bibr" rid="pcbi.1003692-ETH1">[15]</xref>.</p>
</sec></sec><sec id="s3">
<title>Results/Discussion</title>
<sec id="s3a">
<title>Performance of the Likelihood-Based Method</title>
<p>In order to evaluate its performance the method has been applied to a real world dataset of 580 food products with known distribution patterns across Germany <xref ref-type="bibr" rid="pcbi.1003692-Doerr1">[11]</xref>. In this analysis the simplifying assumption has been made that exactly one of the known food products is responsible for a disease outbreak, which was generated based on the corresponding “guilty” food product distribution. The number of sampled cases defining the outbreak size has been varied from 1 to 100.</p>
<p>To assess the performance of the likelihood-based method statistics <italic>A</italic> and <italic>B</italic> were used. Each statistic describes the capability of the method to correctly identify the source of infection back from the comparison of the artificial outbreak pattern with each of the 580 food products under investigation. In <xref ref-type="fig" rid="pcbi-1003692-g002">Figure 2</xref> the green curve shows the success rate (statistic <italic>A</italic>) averaged over the 580 food products as contamination source. The average success rate of the algorithm rises steeply with the number of case reports reaching a level above 80% with only 50 case reports. However there are outbreak patterns for which the likelihood method is not effective with many more case reports required for unique identification of the correct “guilty” product. This is in line with the expectation that the highest likelihood criteria are hard to accomplish for similarly distributed products.</p>
<fig id="pcbi-1003692-g002" position="float"><object-id pub-id-type="doi">10.1371/journal.pcbi.1003692.g002</object-id><label>Figure 2</label><caption>
<title>Average success rate, sensitivity with 1/32 and 1/8 threshold, and average suspect product set size vs. number of case reports.</title>
</caption><graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1003692.g002" position="float" xlink:type="simple"/></fig>
<p>Taking advantage of the likelihood-based approach we can also assess the relative probability for <italic>all products</italic>. Selecting a <italic>discrimination threshold</italic>, we can then identify the group or subset of all products with likelihood ratio greater than that threshold, which we call the “suspect product set”. In <xref ref-type="fig" rid="pcbi-1003692-g002">Figure 2</xref> we also show the average probability that the contaminated product is found within this set for thresholds of 1/8 (cyan) and 1/32 (red). Also shown in <xref ref-type="fig" rid="pcbi-1003692-g002">Figure 2</xref> is the number of products found (on average) within the suspect product set, as a function of the number of case reports, for the same choices of threshold. Even for a threshold of 1/32, the average set size falls to as few as a dozen suspect products within only ten case reports <xref ref-type="bibr" rid="pcbi.1003692-Royall1">[16]</xref>.</p>
<p>To visualize the performance statistic <italic>B</italic> of this likelihood-based approach, we plot in <xref ref-type="fig" rid="pcbi-1003692-g003">Figure 3a</xref> the “receiver operating characteristic” or ROC curves for outbreak patterns with different numbers of cases. The ROC analysis characterizes the performance of the algorithm when the calculated likelihood ratio is applied as a binary classifier. The curve shows the “sensitivity” of the classifier as a measure of the fraction of true positives vs. the fraction of false positives (1-specificity). An ideal or perfect classifier would have a sensitivity of 1.0 at (1-specificity) = 0 (no false positives). The area under the ROC curve (AUC) provides a measure of overall performance. A perfect classifier has an AUC = 1.0. A useless classifier (e.g., with a linear ROC curve and slope of ½) would have an AUC of 0.5. As expected, this type of performance measure illustrates that the results of the likelihood-based approach depend on the number of case reports. Thus separate curves are shown for outbreaks with 1, 2, 3, 5, 10, and 50 cases. (The ROC curve is defined for only one case report. However, from a public health perspective an “outbreak” of foodborne illness is declared only after two or more cases.) As <xref ref-type="fig" rid="pcbi-1003692-g003">Figure 3a</xref> shows, the area under the curve approaches 1 for outbreak patterns with as few as 50 case reports. In <xref ref-type="fig" rid="pcbi-1003692-g003">Figures 3b and 3c</xref>, we compare the performance of the likelihood-based approach with a simple classifier based on the Spearman rank correlation coefficient <inline-formula><inline-graphic xlink:href="info:doi/10.1371/journal.pcbi.1003692.e028" xlink:type="simple"/></inline-formula>. As these three figures make evident, the likelihood-based method outperforms the correlation-based approach. 
In a real world application, these performance improvements are of utmost importance to avoid false accusations of food manufacturers, unjustified product recalls, and a waste of limited analytical resources.</p>
<fig id="pcbi-1003692-g003" position="float"><object-id pub-id-type="doi">10.1371/journal.pcbi.1003692.g003</object-id><label>Figure 3a–c</label><caption>
<title>Performance measures of the likelihood-based approach.</title>
<p><xref ref-type="fig" rid="pcbi-1003692-g003">Figure 3a</xref> shows the ROC curve as a function of the number of case reports (see legend). <xref ref-type="fig" rid="pcbi-1003692-g003">Figure 3b</xref> shows a comparison of ROC curves generated with the likelihood-based method vs. a Spearman rank correlation based measure for outbreak patterns with 10 case reports. <xref ref-type="fig" rid="pcbi-1003692-g003">Figure 3c</xref> provides the area under the curve (AUC) as a function of the number of case reports for both classifiers.</p>
</caption><graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1003692.g003" position="float" xlink:type="simple"/></fig>
<p>Using the Spearman's rank correlation coefficient <inline-formula><inline-graphic xlink:href="info:doi/10.1371/journal.pcbi.1003692.e029" xlink:type="simple"/></inline-formula>, we explore how the performance of the likelihood-based method is related to associations between distinct product sales distributions. As <xref ref-type="fig" rid="pcbi-1003692-g004">Figure 4</xref> and <xref ref-type="supplementary-material" rid="pcbi.1003692.s003">Figures S1</xref> and <xref ref-type="supplementary-material" rid="pcbi.1003692.s004">S2</xref> confirm, the algorithm's performance is strongly influenced by associations between food sales distributions. Plotting the maximum Spearman's <inline-formula><inline-graphic xlink:href="info:doi/10.1371/journal.pcbi.1003692.e030" xlink:type="simple"/></inline-formula> for each product against success rate, we assess how the magnitude of the association between the contaminated product and the food to which it is most similarly distributed affects the suspect product set size determined by the likelihood-based approach.</p>
<fig id="pcbi-1003692-g004" position="float"><object-id pub-id-type="doi">10.1371/journal.pcbi.1003692.g004</object-id><label>Figure 4a–c</label><caption>
<title>Suspect product set size vs. maximum pair-wise product correlation after observing 10, 20, and 50 simulated cases.</title>
<p>For large correlation, the contaminated product cannot always be uniquely determined.</p>
</caption><graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1003692.g004" position="float" xlink:type="simple"/></fig>
<p>The data in <xref ref-type="fig" rid="pcbi-1003692-g004">Figure 4</xref> demonstrates that the number of suspect products increases steeply if the contaminated food and the product most related to it have high correlation. The knee of the curve shifts with set size increasing sharply for correlation ≳ 0.8 given 10 case reports, ≳ 0.9 given 20 case reports, etc. Comparing <xref ref-type="fig" rid="pcbi-1003692-g004">Figure 4c to 4a</xref>, it is clear that as the maximum pairwise correlation between a contaminated product and another product increases, the number of cases required to <italic>reduce</italic> the suspect product set size to a manageable number (e.g., below 10) increases. In <xref ref-type="supplementary-material" rid="pcbi.1003692.s004">Figure S2</xref> we also show the corresponding decrease in the “success rate” measure.</p>
<p>Consider ‘Y’ products with <italic>identical</italic> sales distributions. When the rank ordered distribution patterns of the contaminated food and at least one of these foods are equal, then the value of the likelihood for those products will remain the same. In this limit, the size of the suspect product set will never fall below Y, independent of the number of case reports. Understanding the maximum number of highly correlated products is therefore important given the larger goal of accelerating foodborne disease investigations, as it forewarns public health investigators of the largest number of products that may have to be tested together (in a worst case scenario).</p>
</sec><sec id="s3b">
<title>Clustering</title>
<p>As noted, a high degree of similarity between the distribution patterns of the food products under investigation and the spatial pattern of the contaminated “guilty” product implies that it is (will be) difficult to correctly identify the causative food item. To describe and visualize this property of the food data set, we calculate the correlation matrix and apply hierarchical clustering algorithms. <xref ref-type="fig" rid="pcbi-1003692-g005">Figure 5</xref> is a graphical representation of the pair-wise Spearman's correlation coefficient matrix as a so-called heat map. In this representation, products were sorted by the hierarchical clustering indicated in <xref ref-type="fig" rid="pcbi-1003692-g006">Figure 6</xref>. The colors indicate the degree of similarity between food products as measured by the Spearman's <inline-formula><inline-graphic xlink:href="info:doi/10.1371/journal.pcbi.1003692.e031" xlink:type="simple"/></inline-formula>. This representation supports the finding that there is a large cluster of highly similar distributed food products within the given data set. Products belonging to this cluster make the biggest contribution to rapid decrease in classifier performance when the number of case reports falls below 10 (data not shown). The figure shows a distribution of cluster sizes within the retail sales data.</p>
<fig id="pcbi-1003692-g005" position="float"><object-id pub-id-type="doi">10.1371/journal.pcbi.1003692.g005</object-id><label>Figure 5</label><caption>
<title>Heat map of the pair-wise Spearman's <inline-formula><inline-graphic xlink:href="info:doi/10.1371/journal.pcbi.1003692.e032" xlink:type="simple"/></inline-formula> matrix.</title>
<p>This figure depicts the correlation matrix map sorted by clusters.</p>
</caption><graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1003692.g005" position="float" xlink:type="simple"/></fig><fig id="pcbi-1003692-g006" position="float"><object-id pub-id-type="doi">10.1371/journal.pcbi.1003692.g006</object-id><label>Figure 6</label><caption>
<title>Hierarchical clustering diagram of 580 food products.</title>
<p>Different colors indicate different clusters, defined by a cut-off value of 0.25. (Note that colors were used multiple times, i.e., non-adjacent clusters of the same color are not related in any special way.)</p>
</caption><graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1003692.g006" position="float" xlink:type="simple"/></fig>
<p><xref ref-type="fig" rid="pcbi-1003692-g006">Figure 6</xref> shows a dendrogram visualizing the dissimilarity of the spatial distribution patterns of the 580 food products under investigation. Most similarly distributed food products are grouped at the bottom of the tree with a dissimilarity score close to 0 (i.e., the spatial distribution pattern is almost identical). Clusters of similarly distributed food products are connected according to the dissimilarity score generated by the complete linkage method. For further investigations distinct clusters were generated (indicated by different colors) by cutting the tree horizontally at the dissimilarity level of 0.25. This ensures that within each cluster, all pair-wise Spearman's correlation coefficients <inline-formula><inline-graphic xlink:href="info:doi/10.1371/journal.pcbi.1003692.e033" xlink:type="simple"/></inline-formula> are at least 0.75. This choice of threshold was inspired by the observation reported above, namely that the suspect product set size increases rapidly when the maximum Spearman's <inline-formula><inline-graphic xlink:href="info:doi/10.1371/journal.pcbi.1003692.e034" xlink:type="simple"/></inline-formula> is above ∼0.8.</p>
<p>The <italic>product</italic> data used in this study was provided as point of sale retail data by anonymized product id. After completion of the study, the products were identified as various dairy products. The 580 food items include some items that are locally branded (and sold) and some very widely distributed products sold nationally. The only factor we could identify as important to the product clustering shown in <xref ref-type="fig" rid="pcbi-1003692-g006">Figure 6</xref> was the spatial pattern of the food distribution including whether the food item was sold locally, regionally, or nationally. Categories, such as fresh or frozen, do not affect the observed clustering (and those factors were not used in generating the simulated outbreaks as they were not known to the authors before the study and not built into the simulation).</p>
<p>To characterize the clusters observed in the dendrogram in <xref ref-type="fig" rid="pcbi-1003692-g006">Figure 6</xref>, we show in <xref ref-type="fig" rid="pcbi-1003692-g007">Figure 7</xref>, for all clusters containing three food products, a series of small images showing color coded product sales volume in each of the 3,513 postal code regions where food is sold by the food retail company. The images are organized according to the product grouping generated by the clustering algorithm. The figure clearly shows that product clustering strongly depends on how widely spread or how localized is the spatial sale pattern of the product for each cluster. Products with similar sales distributions are placed in common clusters by the pair-wise Spearman's rank correlation method.</p>
<fig id="pcbi-1003692-g007" position="float"><object-id pub-id-type="doi">10.1371/journal.pcbi.1003692.g007</object-id><label>Figure 7</label><caption>
<title>A series of small images illustrating distribution patterns of food products sold by a German food retail company stratified by zip codes.</title>
<p>For illustration purposes, all product clusters containing exactly three products are displayed. Clusters are arranged in two columns of seven clusters each. Other cluster sizes exhibit similar correlations between product distribution patterns. This image is published with permission from Esri and its data providers, and from Michael Bauer Research GmbH, Nürnberg, Germany; Data Source: Microm 2013.</p>
</caption><graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1003692.g007" position="float" xlink:type="simple"/></fig></sec><sec id="s3c">
<title>Performance of the Likelihood-Based Method in Case of Food Product Clusters</title>
<p><xref ref-type="fig" rid="pcbi-1003692-g008">Figures 8 a–c</xref> show that the average success rate for identification of contaminated foods within a cluster of a certain size is linearly related to the log of cluster size for (a) 10 case reports, (b) 20 case reports, and (c) 50 case reports. It can be stated, that the <italic>absolute magnitude</italic> of the slope of this linear relationship <italic>decreases</italic> in the presence of larger numbers of cases. This confirms, that even for highly correlated food distribution patterns the performance of a likelihood-based classifier will increase with additional information on case reports.</p>
<fig id="pcbi-1003692-g008" position="float"><object-id pub-id-type="doi">10.1371/journal.pcbi.1003692.g008</object-id><label>Figure 8a–c</label><caption>
<title>Average success rate after observing 10(a), 20(b), and 50(c) simulated cases vs. Log Number of foods in each cluster.</title>
</caption><graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pcbi.1003692.g008" position="float" xlink:type="simple"/></fig></sec><sec id="s3d">
<title>Conclusions</title>
<p>This analysis shows how, when information on the food distribution channels is available, likelihood-based methods can quickly identify those products likely to be causing an outbreak using the geographic locations for even relatively few cases. However, these methods assume that food distribution channels are well characterized, which may rarely be the case. Nevertheless, our methods could be extremely useful for retail companies that want to assess which of their own products could potentially be involved in an ongoing disease outbreak, or identifying chains or individual stores that should be prioritized for investigation in an ongoing outbreak. In practice, multiple products may be contaminated by a single food ingredient. Here we use a very simple model of the probability of individuals consuming food from particular shops, which may be quite different from real consumption patterns.</p>
<p>In this paper we also make the simplifying assumption that food is consumed where it is sold. In fact, people travel. In the future, it is possible to extend the current work by adding Huff's “gravity model” for retail shopping behavior <xref ref-type="bibr" rid="pcbi.1003692-Huff1">[12]</xref>. This will effectively smooth the sales distribution over nearby regions. It will also make it possible to test the addition of noise in the case report generator. In the simplified model, any case report occurring in a region where a product is never sold (probability 0) immediately excludes that product from consideration. The performance of the likelihood-based method in these more challenging scenarios will be explored in future research.</p>
<p>This analysis also provided some fundamental insights into the relationship between the method's performance and inherent properties of the analyzed food sales data. We could confirm that the degree of similarity of the spatial food distribution pattern determines how quickly the likelihood method will converge on a finite suspect product set size. Generally, the maximum pair-wise correlation with the actual contaminated product is negatively related to success rate, and positively related to the number of cases required for a perfect prediction. This suggests that it may be beneficial to consider identifying groups of products as likely to contain the tainted food, rather than focusing on finding one product.</p>
<p>Additionally it has been shown that relevant intrinsic properties of the food sales data can be visualized by performing hierarchical clustering algorithms. This method provides a helpful graphical summary of the spatial similarity of food distributions. Further, on the basis of clusters generated by this algorithm, it is shown that log cluster size has a negative, linear relationship with success rate. This suggests that, as the number of products similarly distributed as the contaminated product increases, our ability to consistently identify the contaminated food in a small number of cases decreases. Highly correlated food product distributions are associated with products that are (and will be) harder to identify than uncorrelated product distributions. Since correlated product clusters can be identified proactively, suspect products can also be grouped for analysis accelerating an outbreak investigation.</p>
</sec></sec><sec id="s4">
<title>Supporting Information</title>
<supplementary-material id="pcbi.1003692.s001" mimetype="text/csv" xlink:href="info:doi/10.1371/journal.pcbi.1003692.s001" position="float" xlink:type="simple"><label>Dataset S1</label><caption>
<p><bold>This csv file contains normalized sales data for 580 anonymous food products across 3518 postal code areas in Germany.</bold> Retail sales data were provided by SymphonyIRI Group GmbH, Germany.</p>
<p>(CSV)</p>
</caption></supplementary-material><supplementary-material id="pcbi.1003692.s002" mimetype="application/x-rar-compressed" xlink:href="info:doi/10.1371/journal.pcbi.1003692.s002" position="float" xlink:type="simple"><label>Dataset S2</label><caption>
<p><bold>The first column in this data file indicates anonymous food product IDs (the number before the underscore in the naming convention) and the index number for experimental runs (the number after the underscore) up to 50 replications.</bold> In each simulation for a “suspect” product, we sampled 100 postal code areas shown in column B to CW used to represent the locale of case reports identified for a synthetic outbreak. Retail sales data were provided by SymphonyIRI Group GmbH, Germany.</p>
<p>(RAR)</p>
</caption></supplementary-material><supplementary-material id="pcbi.1003692.s003" mimetype="image/tiff" xlink:href="info:doi/10.1371/journal.pcbi.1003692.s003" position="float" xlink:type="simple"><label>Figure S1 a–b</label><caption>
<p><bold>Success rate and likelihood ratio for individual products (as contaminated product).</bold></p>
<p>(TIF)</p>
</caption></supplementary-material><supplementary-material id="pcbi.1003692.s004" mimetype="image/tiff" xlink:href="info:doi/10.1371/journal.pcbi.1003692.s004" position="float" xlink:type="simple"><label>Figure S2 a–c</label><caption>
<p><bold>Success rate as a function of maximum correlation (Spearman's</bold> <inline-formula><inline-graphic xlink:href="info:doi/10.1371/journal.pcbi.1003692.e035" xlink:type="simple"/></inline-formula><bold>) for (a) 10 case reports, (b) 20 case reports, and (c) 50 case reports.</bold> For large correlations, the contaminated product cannot always be uniquely determined.</p>
<p>(TIF)</p>
</caption></supplementary-material><supplementary-material id="pcbi.1003692.s005" mimetype="application/msword" xlink:href="info:doi/10.1371/journal.pcbi.1003692.s005" position="float" xlink:type="simple"><label>Text S1</label><caption>
<p><bold>Derivation of likelihood.</bold></p>
<p>(DOC)</p>
</caption></supplementary-material></sec></body>
<back>
<ack>
<p>Retail sales data were provided by SymphonyIRI Group GmbH, Germany.</p>
</ack>
<ref-list>
<title>References</title>
<ref id="pcbi.1003692-European1"><label>1</label>
<mixed-citation publication-type="other" xlink:type="simple">European Commission (2011) MAP - Monitoring Agri-trade Policy - Global and EU agricultural exports rebound. Available: <ext-link ext-link-type="uri" xlink:href="http://ec.europa.eu/agriculture/trade-analysis/map/01_11_en.pdf" xlink:type="simple">http://ec.europa.eu/agriculture/trade-analysis/map/01_11_en.pdf</ext-link>. Accessed: 21 May 2014.</mixed-citation>
</ref>
<ref id="pcbi.1003692-Marvin1"><label>2</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Marvin</surname><given-names>HJP</given-names></name>, <name name-style="western"><surname>Kleter</surname><given-names>GA</given-names></name>, <name name-style="western"><surname>Frewer</surname><given-names>LJ</given-names></name>, <name name-style="western"><surname>Cope</surname><given-names>S</given-names></name>, <name name-style="western"><surname>Wentholt</surname><given-names>MTA</given-names></name>, <etal>et al</etal>. (<year>2009</year>) <article-title>A working procedure for identifying emerging food safety issues at an early stage: implications for European and international risk management practices</article-title>. <source>Food Control</source> <volume>20</volume> (<issue>4</issue>) <fpage>345</fpage>–<lpage>356</lpage>.</mixed-citation>
</ref>
<ref id="pcbi.1003692-Hoffmann1"><label>3</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Hoffmann</surname><given-names>S</given-names></name>, <name name-style="western"><surname>Batz</surname><given-names>MB</given-names></name>, <name name-style="western"><surname>Morris</surname><given-names>JG</given-names><suffix>Jr</suffix></name> (<year>2012</year>) <article-title>Annual cost of illness and quality-adjusted life year losses in the United States due to 14 foodborne pathogens</article-title>. <source>J Food Prot</source> <volume>75</volume> (<issue>7</issue>) <fpage>1292</fpage>–<lpage>302</lpage> <comment>doi: <ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.4315/0362-028X.JFP-11-417" xlink:type="simple">10.4315/0362-028X.JFP-11-417</ext-link></comment></mixed-citation>
</ref>
<ref id="pcbi.1003692-Gadiel1"><label>4</label>
<mixed-citation publication-type="other" xlink:type="simple">Gadiel D (2010) The economic cost of foodborne disease in New Zealand. Prepared for: New Zealand Food Safety Authority. Available: <ext-link ext-link-type="uri" xlink:href="http://www.foodsafety.govt.nz/elibrary/industry/economic-cost-foodborne-disease/foodborne-disease.pdf" xlink:type="simple">http://www.foodsafety.govt.nz/elibrary/industry/economic-cost-foodborne-disease/foodborne-disease.pdf</ext-link>. Accessed 19 May 2014.</mixed-citation>
</ref>
<ref id="pcbi.1003692-Abe1"><label>5</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Abe</surname><given-names>K</given-names></name>, <name name-style="western"><surname>Yamamoto</surname><given-names>S</given-names></name>, <name name-style="western"><surname>Shinagawa</surname><given-names>K</given-names></name> (<year>2002</year>) <article-title>Economic impact of an <italic>Escherichia coli</italic> O157: H7 outbreak in Japan</article-title>. <source>J Food Prot</source> <volume>65</volume> (<issue>1</issue>) <fpage>66</fpage>–<lpage>72</lpage>.</mixed-citation>
</ref>
<ref id="pcbi.1003692-World1"><label>6</label>
<mixed-citation publication-type="other" xlink:type="simple">World Health Organization (2008) Foodborne disease outbreaks: guidelines for investigation and control. Available: <ext-link ext-link-type="uri" xlink:href="http://www.who.int/foodsafety/publications/foodborne_disease/outbreak_guidelines.pdf" xlink:type="simple">http://www.who.int/foodsafety/publications/foodborne_disease/outbreak_guidelines.pdf</ext-link>. Accessed 19 May 2014.</mixed-citation>
</ref>
<ref id="pcbi.1003692-Pastore1"><label>7</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Pastore</surname><given-names>R</given-names></name>, <name name-style="western"><surname>Altpeter</surname><given-names>E</given-names></name>, <name name-style="western"><surname>Baumgartner</surname><given-names>A</given-names></name>, <name name-style="western"><surname>Hächler</surname><given-names>H</given-names></name>, <name name-style="western"><surname>Imhof</surname><given-names>R</given-names></name>, <etal>et al</etal>. (<year>2008</year>) <article-title>Outbreak of Salmonella serovar Stanley infections in Switzerland linked to locally produced soft cheese, September 2006–February 2007</article-title>. <source>Euro Surveill</source> <volume>13</volume>: <fpage>pii: 18979</fpage>.</mixed-citation>
</ref>
<ref id="pcbi.1003692-Regattieri1"><label>8</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Regattieri</surname><given-names>A</given-names></name>, <name name-style="western"><surname>Gamberi</surname><given-names>M</given-names></name>, <name name-style="western"><surname>Manzini</surname><given-names>R</given-names></name> (<year>2007</year>) <article-title>Traceability of food products: general framework and experimental evidence</article-title>. <source>J Food Eng</source> <volume>81</volume> (<issue>2</issue>) <fpage>347</fpage>–<lpage>356</lpage>.</mixed-citation>
</ref>
<ref id="pcbi.1003692-Greig1"><label>9</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Greig</surname><given-names>JD</given-names></name>, <name name-style="western"><surname>Ravel</surname><given-names>A</given-names></name> (<year>2009</year>) <article-title>Analysis of foodborne outbreak data reported internationally for source attribution</article-title>. <source>Int J Food Microbiol</source> <volume>130</volume> (<issue>2</issue>) <fpage>77</fpage>–<lpage>87</lpage>.</mixed-citation>
</ref>
<ref id="pcbi.1003692-Filter1"><label>10</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Filter</surname><given-names>M</given-names></name>, <name name-style="western"><surname>Thoens</surname><given-names>C</given-names></name>, <name name-style="western"><surname>Käsbohrer</surname><given-names>A</given-names></name>, <name name-style="western"><surname>Appel</surname><given-names>B</given-names></name> (<year>2012</year>) <article-title>Exploitation of commercial B2B data for risk assessment tasks in foodborne crisis events</article-title>. <source>In: Future Security: Comm Com Inf S</source> <volume>318</volume>: <fpage>471</fpage>–<lpage>474</lpage>.</mixed-citation>
</ref>
<ref id="pcbi.1003692-Doerr1"><label>11</label>
<mixed-citation publication-type="other" xlink:type="simple">Doerr D, Hu K, Renly S, Edlund S, Davis M, <etal>et al</etal>. (2012) Accelerating investigation of food-borne disease outbreaks using pro-active geospatial modeling of food supply chains. In: Proceedings of the First ACM SIGSPATIAL International Workshop on Use of GIS in Public Health. ACM: Redondo Beach, California: 44–47. doi 10.1145/2452516.2452525.</mixed-citation>
</ref>
<ref id="pcbi.1003692-Huff1"><label>12</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Huff</surname><given-names>DL</given-names></name> (<year>1963</year>) <article-title>A probabilistic analysis of shopping center trade areas</article-title>. <source>Land Economics</source> <volume>39</volume> (<issue>1</issue>) <fpage>81</fpage>–<lpage>90</lpage>.</mixed-citation>
</ref>
<ref id="pcbi.1003692-Walker1"><label>13</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Walker</surname><given-names>AJ</given-names></name> (<year>1977</year>) <article-title>An efficient method for generating discrete random variables with general distributions</article-title>. <source>ACM Trans Math Software</source> <volume>3</volume>: <fpage>253</fpage>–<lpage>256</lpage>.</mixed-citation>
</ref>
<ref id="pcbi.1003692-Clifford1"><label>14</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Clifford</surname><given-names>P</given-names></name>, <name name-style="western"><surname>Richardson</surname><given-names>S</given-names></name>, <name name-style="western"><surname>Hemon</surname><given-names>D</given-names></name> (<year>1989</year>) <article-title>Assessing the significance of the correlation between two spatial processes</article-title>. <source>Biometrics</source> <volume>45</volume> (<issue>1</issue>) <fpage>123</fpage>–<lpage>134</lpage>.</mixed-citation>
</ref>
<ref id="pcbi.1003692-ETH1"><label>15</label>
<mixed-citation publication-type="other" xlink:type="simple">ETH Zurich. R: Hierarchical clustering (Undated) The R-software package ‘stats’ version 2.15.3. Available: <ext-link ext-link-type="uri" xlink:href="http://stat.ethz.ch/R-manual/R-patched/library/stats/html/hclust.html" xlink:type="simple">http://stat.ethz.ch/R-manual/R-patched/library/stats/html/hclust.html</ext-link>. Accessed 19 May 2014.</mixed-citation>
</ref>
<ref id="pcbi.1003692-Royall1"><label>16</label>
<mixed-citation publication-type="book" xlink:type="simple">Royall R (1997) Statistical evidence: a likelihood paradigm. Boca Raton, FL: Chapman and Hall/CRC Monographs on Statistics and Applied Probability 71. 171 p.</mixed-citation>
</ref>
</ref-list></back>
</article>