<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1d3 20150301//EN" "http://jats.nlm.nih.gov/publishing/1.1d3/JATS-journalpublishing1.dtd">
<article article-type="research-article" dtd-version="1.1d3" xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="nlm-ta">PLoS Comput Biol</journal-id>
<journal-id journal-id-type="publisher-id">plos</journal-id>
<journal-id journal-id-type="pmc">ploscomp</journal-id>
<journal-title-group>
<journal-title>PLOS Computational Biology</journal-title>
</journal-title-group>
<issn pub-type="ppub">1553-734X</issn>
<issn pub-type="epub">1553-7358</issn>
<publisher>
<publisher-name>Public Library of Science</publisher-name>
<publisher-loc>San Francisco, CA USA</publisher-loc>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">PCOMPBIOL-D-17-00170</article-id>
<article-id pub-id-type="doi">10.1371/journal.pcbi.1005564</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Research Article</subject>
</subj-group>
<subj-group subj-group-type="Discipline-v3"><subject>Medicine and health sciences</subject><subj-group><subject>Infectious diseases</subject><subj-group><subject>Zoonoses</subject></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Medicine and health sciences</subject><subj-group><subject>Infectious diseases</subject><subj-group><subject>Bacterial diseases</subject><subj-group><subject>Campylobacteriosis</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Medicine and health sciences</subject><subj-group><subject>Epidemiology</subject></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>People and places</subject><subj-group><subject>Geographical locations</subject><subj-group><subject>Oceania</subject><subj-group><subject>New Zealand</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Biology and life sciences</subject><subj-group><subject>Organisms</subject><subj-group><subject>Bacteria</subject><subj-group><subject>Campylobacter</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Biology and life sciences</subject><subj-group><subject>Microbiology</subject><subj-group><subject>Medical microbiology</subject><subj-group><subject>Microbial pathogens</subject><subj-group><subject>Bacterial pathogens</subject><subj-group><subject>Campylobacter</subject></subj-group></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Medicine and health sciences</subject><subj-group><subject>Pathology and laboratory medicine</subject><subj-group><subject>Pathogens</subject><subj-group><subject>Microbial pathogens</subject><subj-group><subject>Bacterial 
pathogens</subject><subj-group><subject>Campylobacter</subject></subj-group></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Medicine and health sciences</subject><subj-group><subject>Epidemiology</subject><subj-group><subject>Genetic epidemiology</subject></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Medicine and health sciences</subject><subj-group><subject>Epidemiology</subject><subj-group><subject>Disease surveillance</subject><subj-group><subject>Infectious disease surveillance</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Medicine and health sciences</subject><subj-group><subject>Infectious diseases</subject><subj-group><subject>Infectious disease control</subject><subj-group><subject>Infectious disease surveillance</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Medicine and health sciences</subject><subj-group><subject>Pathology and laboratory medicine</subject><subj-group><subject>Pathogenesis</subject></subj-group></subj-group></subj-group></article-categories>
<title-group>
<article-title><monospace>sourceR</monospace>: Classification and source attribution of infectious agents among heterogeneous populations</article-title>
<alt-title alt-title-type="running-head"><monospace>sourceR</monospace>: Classification and source attribution of infectious agents among heterogeneous populations</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">http://orcid.org/0000-0002-7644-6549</contrib-id>
<name name-style="western">
<surname>Miller</surname> <given-names>Poppy</given-names></name>
<xref ref-type="aff" rid="aff001"><sup>1</sup></xref>
<xref ref-type="corresp" rid="cor001">*</xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">http://orcid.org/0000-0003-0758-9658</contrib-id>
<name name-style="western">
<surname>Marshall</surname> <given-names>Jonathan</given-names></name>
<xref ref-type="aff" rid="aff002"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff003"><sup>3</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<name name-style="western">
<surname>French</surname> <given-names>Nigel</given-names></name>
<xref ref-type="aff" rid="aff003"><sup>3</sup></xref>
<xref ref-type="aff" rid="aff004"><sup>4</sup></xref>
<xref ref-type="aff" rid="aff005"><sup>5</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">http://orcid.org/0000-0002-7902-2178</contrib-id>
<name name-style="western">
<surname>Jewell</surname> <given-names>Chris</given-names></name>
<xref ref-type="aff" rid="aff001"><sup>1</sup></xref>
</contrib>
</contrib-group>
<aff id="aff001">
<label>1</label>
<addr-line>CHICAS, Faculty of Health and Medicine, Lancaster University, Lancaster, England, United Kingdom</addr-line>
</aff>
<aff id="aff002">
<label>2</label>
<addr-line>Institute of Fundamental Sciences, Massey University, Palmerston North, New Zealand</addr-line>
</aff>
<aff id="aff003">
<label>3</label>
<addr-line>mEpiLab, Massey University, Palmerston North, New Zealand</addr-line>
</aff>
<aff id="aff004">
<label>4</label>
<addr-line>New Zealand Food Safety Science and Research Centre, Palmerston North, New Zealand</addr-line>
</aff>
<aff id="aff005">
<label>5</label>
<addr-line>New Zealand Institute for Advanced Studies, Auckland, New Zealand</addr-line>
</aff>
<contrib-group>
<contrib contrib-type="editor" xlink:type="simple">
<name name-style="western">
<surname>Poisot</surname> <given-names>Timothée</given-names></name>
<role>Editor</role>
<xref ref-type="aff" rid="edit1"/>
</contrib>
</contrib-group>
<aff id="edit1">
<addr-line>Université de Montréal, CANADA</addr-line>
</aff>
<author-notes>
<fn fn-type="conflict" id="coi001">
<p>The authors have declared that no competing interests exist.</p>
</fn>
<fn fn-type="con">
<p>
<list list-type="simple">
<list-item>
<p><bold>Conceptualization:</bold> CJ.</p>
</list-item>
<list-item>
<p><bold>Data curation:</bold> JM.</p>
</list-item>
<list-item>
<p><bold>Formal analysis:</bold> PM CJ.</p>
</list-item>
<list-item>
<p><bold>Funding acquisition:</bold> NF.</p>
</list-item>
<list-item>
<p><bold>Methodology:</bold> PM CJ JM.</p>
</list-item>
<list-item>
<p><bold>Project administration:</bold> CJ.</p>
</list-item>
<list-item>
<p><bold>Software:</bold> PM CJ.</p>
</list-item>
<list-item>
<p><bold>Supervision:</bold> CJ JM NF.</p>
</list-item>
<list-item>
<p><bold>Validation:</bold> PM CJ JM NF.</p>
</list-item>
<list-item>
<p><bold>Visualization:</bold> PM.</p>
</list-item>
<list-item>
<p><bold>Writing – original draft:</bold> PM.</p>
</list-item>
<list-item>
<p><bold>Writing – review &amp; editing:</bold> PM CJ JM NF.</p>
</list-item>
</list>
</p>
</fn>
<corresp id="cor001">* E-mail: <email xlink:type="simple">p.miller@lancaster.ac.uk</email></corresp>
</author-notes>
<pub-date pub-type="collection">
<month>5</month>
<year>2017</year>
</pub-date>
<pub-date pub-type="epub">
<day>30</day>
<month>5</month>
<year>2017</year>
</pub-date>
<volume>13</volume>
<issue>5</issue>
<elocation-id>e1005564</elocation-id>
<history>
<date date-type="received">
<day>30</day>
<month>1</month>
<year>2017</year>
</date>
<date date-type="accepted">
<day>10</day>
<month>5</month>
<year>2017</year>
</date>
</history>
<permissions>
<copyright-year>2017</copyright-year>
<copyright-holder>Miller et al</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">
<license-p>This is an open access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">Creative Commons Attribution License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.</license-p>
</license>
</permissions>
<self-uri content-type="pdf" xlink:href="info:doi/10.1371/journal.pcbi.1005564"/>
<abstract>
<p>Zoonotic diseases are a major cause of morbidity and productivity losses in both human and animal populations. Identifying the source of food-borne zoonoses (e.g. an animal reservoir or food product) is crucial for the identification and prioritisation of food safety interventions. For many zoonotic diseases it is difficult to attribute human cases to sources of infection because there is little epidemiological information on the cases. However, microbial strain typing allows zoonotic pathogens to be categorised, and the relative frequencies of the strain types among the sources and in human cases allow inference on the likely source of each infection. We introduce <monospace>sourceR</monospace>, an <monospace>R</monospace> package for quantitative source attribution, aimed at food-borne diseases. It implements a Bayesian model using strain-typed surveillance data from both human cases and source samples, capable of identifying important sources of infection. The model measures the force of infection from each source, allowing for varying survivability, pathogenicity and virulence of pathogen strains, and varying abilities of the sources to act as vehicles of infection. A Bayesian non-parametric (Dirichlet process) approach is used to cluster pathogen strain types by epidemiological behaviour, avoiding model overfitting and allowing detection of strain types associated with potentially high “virulence”. <monospace>sourceR</monospace> is demonstrated using <italic>Campylobacter jejuni</italic> isolate data collected in New Zealand between 2005 and 2008. Chicken from a particular poultry supplier was identified as the major source of campylobacteriosis, which is qualitatively similar to results of previous studies using the same dataset. Additionally, the software identifies a cluster of 9 multilocus sequence types with abnormally high “virulence” in humans. 
<monospace>sourceR</monospace> enables straightforward attribution of cases of zoonotic infection to putative sources of infection. As <monospace>sourceR</monospace> develops, we intend it to become an important and flexible resource for food-borne disease attribution studies.</p>
</abstract>
<funding-group>
<funding-statement>The authors received no specific funding for this work.</funding-statement>
</funding-group>
<counts>
<fig-count count="3"/>
<table-count count="0"/>
<page-count count="13"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>PLOS Publication Stage</meta-name>
<meta-value>vor-update-to-uncorrected-proof</meta-value>
</custom-meta>
<custom-meta>
<meta-name>Publication Update</meta-name>
<meta-value>2017-06-16</meta-value>
</custom-meta>
<custom-meta id="data-availability">
<meta-name>Data Availability</meta-name>
<meta-value>Data are available as part of the sourceR package on CRAN (<ext-link ext-link-type="uri" xlink:href="https://CRAN.R-project.org/package=sourceR" xlink:type="simple">https://CRAN.R-project.org/package=sourceR</ext-link>). The motivating campylobacteriosis dataset is named "campy".</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<disp-quote><p>This is a <italic>PLOS Computational Biology</italic> software paper.</p></disp-quote>
<sec id="sec001" sec-type="intro">
<title>Introduction</title>
<p>Zoonotic diseases are a major source of human morbidity worldwide. In 2010, there were an estimated 600 million cases globally [<xref ref-type="bibr" rid="pcbi.1005564.ref001">1</xref>], of which 96 million were <italic>Campylobacter spp</italic>. resulting in 21,000 deaths [<xref ref-type="bibr" rid="pcbi.1005564.ref002">2</xref>]. Attributing cases of food-borne disease to putative sources of infection is crucial for identifying and prioritising food safety interventions, prompting routine national recording of human cases and surveillance of high-risk sources in many countries—for example FoodNet in the US [<xref ref-type="bibr" rid="pcbi.1005564.ref003">3</xref>], the Danish Zoonosis Centre (<ext-link ext-link-type="uri" xlink:href="http://food.dtu.dk" xlink:type="simple">food.dtu.dk</ext-link>), and the Ministry for Primary Industries in New Zealand (<ext-link ext-link-type="uri" xlink:href="http://foodsafety.govt.nz" xlink:type="simple">foodsafety.govt.nz</ext-link>).</p>
<p>Traditional approaches to source attribution include observational risk assessment, extrapolation of surveillance or outbreak data, and epidemiological field studies [<xref ref-type="bibr" rid="pcbi.1005564.ref004">4</xref>]. The results of such direct observational methods may be highly uncertain due to long and variable disease incubation times, and many exposures of an individual to multiple sources of infection. Nevertheless, statistical modelling of human case count data, incorporating molecular strain typing of pathogen isolates from national surveillance programmes, has shown promise for identifying important sources of food-borne illness [<xref ref-type="bibr" rid="pcbi.1005564.ref005">5</xref>, <xref ref-type="bibr" rid="pcbi.1005564.ref006">6</xref>].</p>
<p>The aim of this paper is to extend current approaches to statistical source attribution, and to provide a standard software package, <monospace>sourceR</monospace>, providing an intuitive interface to source attribution models for epidemiological domain specialists. Our principal innovation is a novel class of Bayesian non-parametric source attribution model which classifies strain types by differential epidemiological behaviour and accurately quantifies uncertainty. Furthermore, we allow for spatial and temporal heterogeneity in case and source data with the aim of detecting differential exposures to infection sources across space and time. <monospace>sourceR</monospace> represents the first standard software for source attribution, and is designed for use by epidemiologists and public health decision makers. It is written as an add-on package to <monospace>R</monospace>, the open-source lingua-franca for modern epidemiological analysis, and incorporates an object-orientated style to facilitate further model development and future maintainability.</p>
<p>The paper is structured as follows. We first introduce a motivating example and review existing source attribution models. The new model is described in the Design and Implementation section followed by a demonstration of model fitting using <monospace>sourceR</monospace> in the Materials and Methods section. Results and Discussion sections follow, and the paper concludes with details of Availability and Future directions.</p>
<sec id="sec002">
<title>Example: <italic>Campylobacter</italic> food-poisoning in Manawatu, New Zealand</title>
<p>In 2006, New Zealand had one of the highest incidences of campylobacteriosis in the developed world, with an annual incidence in excess of 400 cases per 100,000 people [<xref ref-type="bibr" rid="pcbi.1005564.ref007">7</xref>]. Our motivating data set was collected between 2005 and 2008 in the Manawatu region of New Zealand with the aim of identifying the most important sources of campylobacteriosis and implementing interventions. A campaign to change poultry processing procedures, supported in part by results from previous quantitative source attribution approaches, was successful in leading to a sharp decline in campylobacteriosis incidence after 2007 [<xref ref-type="bibr" rid="pcbi.1005564.ref006">6</xref>].</p>
<p><italic>Campylobacter</italic> has many subtypes which are usually defined using Multilocus Sequence Typing (MLST), a commonly used genotyping method providing a relatively rapid method of characterising isolates. An MLST sequence type is a unique combination of alleles at specified gene loci, typically located in conserved regions of the genome [<xref ref-type="bibr" rid="pcbi.1005564.ref008">8</xref>, <xref ref-type="bibr" rid="pcbi.1005564.ref009">9</xref>]. The data set consists of the dominant MLST-genotype <italic>Campylobacter</italic> isolated from each source (potential food and environmental sources) and human sample. The data were first published in [<xref ref-type="bibr" rid="pcbi.1005564.ref010">10</xref>], and are described in detail (including data collection methods) in [<xref ref-type="bibr" rid="pcbi.1005564.ref011">11</xref>] and [<xref ref-type="bibr" rid="pcbi.1005564.ref012">12</xref>]. These data are included in our <monospace>sourceR</monospace> package (named <monospace>campy</monospace>). We use this data set as a case study, and compare our results with previously published statistical approaches.</p>
</sec>
<sec id="sec003">
<title>Existing methods of source attribution</title>
<p>The general structure of the source attribution model is that the observed case-counts <italic>y</italic><sub><italic>i</italic></sub> for strain <italic>i</italic> (occurring in a defined surveillance period) are mutually independent Poisson distributed with means
<disp-formula id="pcbi.1005564.e001"><alternatives><graphic id="pcbi.1005564.e001g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1005564.e001" xlink:type="simple"/><mml:math display="block" id="M1"><mml:mtable displaystyle="true"><mml:mtr><mml:mtd columnalign="right"><mml:mrow><mml:msub><mml:mi>λ</mml:mi> <mml:mi>i</mml:mi></mml:msub> <mml:mo>=</mml:mo> <mml:munderover><mml:mo>∑</mml:mo> <mml:mrow><mml:mi>j</mml:mi> <mml:mo>=</mml:mo> <mml:mn>1</mml:mn></mml:mrow> <mml:mi>m</mml:mi></mml:munderover> <mml:msub><mml:mi>α</mml:mi> <mml:mi>j</mml:mi></mml:msub> <mml:msub><mml:mi>p</mml:mi> <mml:mrow><mml:mi>i</mml:mi> <mml:mi>j</mml:mi></mml:mrow></mml:msub> <mml:mo>.</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></alternatives> <label>(1)</label></disp-formula>
where <italic>p</italic><sub><italic>ij</italic></sub> is the prevalence of strain <italic>i</italic> in source <italic>j</italic>, and “source effects” <bold><italic>α</italic></bold> measure each source’s capacity to act as a vehicle of infection. The estimated number of cases attributed to a particular source <italic>j</italic> is
<disp-formula id="pcbi.1005564.e002"><alternatives><graphic id="pcbi.1005564.e002g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1005564.e002" xlink:type="simple"/><mml:math display="block" id="M2"><mml:mtable displaystyle="true"><mml:mtr><mml:mtd columnalign="right"><mml:mrow><mml:msub><mml:mover accent="true"><mml:mi>ξ</mml:mi> <mml:mo>^</mml:mo></mml:mover> <mml:mi>j</mml:mi></mml:msub> <mml:mo>=</mml:mo> <mml:msub><mml:mover accent="true"><mml:mi>α</mml:mi> <mml:mo>^</mml:mo></mml:mover> <mml:mi>j</mml:mi></mml:msub> <mml:munderover><mml:mo>∑</mml:mo> <mml:mrow><mml:mi>i</mml:mi> <mml:mo>=</mml:mo> <mml:mn>1</mml:mn></mml:mrow> <mml:mi>n</mml:mi></mml:munderover> <mml:msub><mml:mi>p</mml:mi> <mml:mrow><mml:mi>i</mml:mi> <mml:mi>j</mml:mi></mml:mrow></mml:msub> <mml:mo>.</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></alternatives> <label>(2)</label></disp-formula></p>
<p>Comparing the relative magnitudes of <inline-formula id="pcbi.1005564.e003"><alternatives><graphic id="pcbi.1005564.e003g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1005564.e003" xlink:type="simple"/><mml:math display="inline" id="M3"><mml:msub><mml:mover accent="true"><mml:mi>ξ</mml:mi> <mml:mo>^</mml:mo></mml:mover> <mml:mi>j</mml:mi></mml:msub></mml:math></alternatives></inline-formula> provides a statistical method to prioritise intervention strategies to the most important sources of infection. The model is fitted in a Bayesian framework as posteriors for functions of parameters (such as <italic>ξ</italic>) are easily calculated, and to allow previous knowledge to be incorporated via informative priors.</p>
<p>A significant problem is that this model does not allow for some strain types having differential affinities for human infection, resulting in over-dispersion of <bold><italic>y</italic></bold>. Additionally, it does not allow for uncertainty in <bold><italic>P</italic></bold>, inherent in sample based source data. In the rest of this section, we review current extensions to <xref ref-type="disp-formula" rid="pcbi.1005564.e001">Eq 1</xref> aimed at accounting for the Poisson over-dispersion in observed case numbers, and incorporating uncertainty in source surveillance data. In particular, the preliminary developments made by Hald <italic>et al</italic>. [<xref ref-type="bibr" rid="pcbi.1005564.ref005">5</xref>] and Müllner <italic>et al</italic>. [<xref ref-type="bibr" rid="pcbi.1005564.ref006">6</xref>] form an ontology on which we base our innovations.</p>
<sec id="sec004">
<title>Over-dispersion</title>
<p>Hald <italic>et al</italic>. [<xref ref-type="bibr" rid="pcbi.1005564.ref005">5</xref>] address the issue of Poisson over-dispersion in <xref ref-type="disp-formula" rid="pcbi.1005564.e001">Eq 1</xref> by introducing a “type effect” <bold><italic>q</italic></bold> accounting for some strain types being more adapted to human infection than others.
<disp-formula id="pcbi.1005564.e004"><alternatives><graphic id="pcbi.1005564.e004g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1005564.e004" xlink:type="simple"/><mml:math display="block" id="M4"><mml:mtable displaystyle="true"><mml:mtr><mml:mtd columnalign="right"><mml:msub><mml:mi>λ</mml:mi> <mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:msub><mml:mi>q</mml:mi> <mml:mi>i</mml:mi></mml:msub> <mml:munderover><mml:mo>∑</mml:mo> <mml:mrow><mml:mi>j</mml:mi> <mml:mo>=</mml:mo> <mml:mn>1</mml:mn></mml:mrow> <mml:mi>m</mml:mi></mml:munderover> <mml:msub><mml:mi>α</mml:mi> <mml:mi>j</mml:mi></mml:msub> <mml:msub><mml:mi>c</mml:mi> <mml:mi>j</mml:mi></mml:msub> <mml:msub><mml:mi>p</mml:mi> <mml:mrow><mml:mi>i</mml:mi> <mml:mi>j</mml:mi></mml:mrow></mml:msub> <mml:mo>.</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></alternatives> <label>(3)</label></disp-formula></p>
<p>Additionally, they include an offset <bold><italic>c</italic></bold> representing known rates of consumption of each source foodstuff, allowing <bold><italic>α</italic></bold> to be interpreted as a source-specific factor independent of exposure. However, the addition of <bold><italic>q</italic></bold> as a vector of uncorrelated unknowns over-specifies the model, with <italic>m</italic> + <italic>n</italic> parameters but only <italic>n</italic> independent disease case count observations. Hald <italic>et al</italic>. therefore reduce the number of parameters by heuristic <italic>a priori</italic> grouping of the elements of <bold><italic>q</italic></bold>, albeit with the generally undesirable property that quantification of uncertainty in the most appropriate choice of grouping is not readily permissible.</p>
<p>The “Modified Hald” model of Müllner <italic>et al</italic>. [<xref ref-type="bibr" rid="pcbi.1005564.ref006">6</xref>] treats <bold><italic>q</italic></bold> as a log-Normally distributed random effect, with unit mean and unknown variance <italic>τ</italic><sup>2</sup> <disp-formula id="pcbi.1005564.e005"><alternatives><graphic id="pcbi.1005564.e005g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1005564.e005" xlink:type="simple"/><mml:math display="block" id="M5"><mml:mtable displaystyle="true"><mml:mtr><mml:mtd columnalign="right"><mml:mrow><mml:msub><mml:mi>q</mml:mi> <mml:mi>i</mml:mi></mml:msub> <mml:mo>∼</mml:mo> <mml:mtext>logNormal</mml:mtext> <mml:mrow><mml:mo>(</mml:mo> <mml:mn>1</mml:mn> <mml:mo>,</mml:mo> <mml:msup><mml:mi>τ</mml:mi> <mml:mn>2</mml:mn></mml:msup> <mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></alternatives> <label>(4)</label></disp-formula>
with a Gamma-distributed prior distribution imposed on <italic>τ</italic><sup>2</sup>. However, this approach suffers from <italic>a posteriori</italic> non-identifiability of <bold><italic>q</italic></bold> and <italic>τ</italic><sup>2</sup>, hindering the performance of MCMC algorithms used to fit the model [<xref ref-type="bibr" rid="pcbi.1005564.ref013">13</xref>]. Though this may be ameliorated by choosing an informative prior for <italic>τ</italic><sup>2</sup> with small mean, it results in severe shrinkage of <bold><italic>q</italic></bold> and inference which is sensitive to the choice of prior.</p>
</sec>
<sec id="sec005">
<title>Uncertainty in source sampling</title>
<p>The Modified Hald model introduces uncertainty into the prevalences <italic>p</italic><sub><italic>ij</italic></sub> by modelling the source sampling process. Let <italic>s</italic><sub><italic>j</italic></sub> denote the total number of source samples collected from source <italic>j</italic> = 1, …, <italic>m</italic>, of which <italic>x</italic><sub><italic>ij</italic></sub> are positive for pathogen type <italic>i</italic>. Normalisation of the number of positive samples <italic>x</italic><sub><italic>ij</italic></sub> gives the relative prevalence <inline-formula id="pcbi.1005564.e006"><alternatives><graphic id="pcbi.1005564.e006g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1005564.e006" xlink:type="simple"/><mml:math display="inline" id="M6"><mml:mrow><mml:msub><mml:mi>r</mml:mi> <mml:mrow><mml:mi>i</mml:mi> <mml:mi>j</mml:mi></mml:mrow></mml:msub> <mml:mo>=</mml:mo> <mml:msub><mml:mi>x</mml:mi> <mml:mrow><mml:mi>i</mml:mi> <mml:mi>j</mml:mi></mml:mrow></mml:msub> <mml:mo>/</mml:mo> <mml:msubsup><mml:mo>∑</mml:mo> <mml:mrow><mml:mi>i</mml:mi> <mml:mo>=</mml:mo> <mml:mn>1</mml:mn></mml:mrow> <mml:mi>n</mml:mi></mml:msubsup> <mml:msub><mml:mi>x</mml:mi> <mml:mrow><mml:mi>i</mml:mi> <mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></alternatives></inline-formula> of type <italic>i</italic> in source <italic>j</italic>. 
The relative prevalence <italic>r</italic><sub><italic>ij</italic></sub> is then combined with the prevalence of positive samples <inline-formula id="pcbi.1005564.e007"><alternatives><graphic id="pcbi.1005564.e007g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1005564.e007" xlink:type="simple"/><mml:math display="inline" id="M7"><mml:mrow><mml:msub><mml:mi>k</mml:mi> <mml:mi>j</mml:mi></mml:msub> <mml:mo>=</mml:mo> <mml:msubsup><mml:mo>∑</mml:mo> <mml:mrow><mml:mi>i</mml:mi> <mml:mo>=</mml:mo> <mml:mn>1</mml:mn></mml:mrow> <mml:mi>n</mml:mi></mml:msubsup> <mml:msub><mml:mi>x</mml:mi> <mml:mrow><mml:mi>i</mml:mi> <mml:mi>j</mml:mi></mml:mrow></mml:msub> <mml:mo>/</mml:mo> <mml:msub><mml:mi>s</mml:mi> <mml:mi>j</mml:mi></mml:msub></mml:mrow></mml:math></alternatives></inline-formula> to calculate the absolute prevalence <italic>p</italic><sub><italic>ij</italic></sub> = <italic>r</italic><sub><italic>ij</italic></sub> × <italic>k</italic><sub><italic>j</italic></sub> of strain <italic>i</italic> in source <italic>j</italic>. The Modified Hald model was fitted in WinBUGS using an approximate two stage process [<xref ref-type="bibr" rid="pcbi.1005564.ref006">6</xref>]. First, a posterior distribution was estimated for the absolute prevalence of source types <bold><italic>p</italic></bold>, using the model specified in Eqs (<xref ref-type="disp-formula" rid="pcbi.1005564.e008">5</xref>) and (<xref ref-type="disp-formula" rid="pcbi.1005564.e009">6</xref>):
<disp-formula id="pcbi.1005564.e008"><alternatives><graphic id="pcbi.1005564.e008g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1005564.e008" xlink:type="simple"/><mml:math display="block" id="M8"><mml:mtable displaystyle="true"><mml:mtr><mml:mtd columnalign="right"><mml:mrow><mml:msub><mml:mi>r</mml:mi> <mml:mrow><mml:mo>·</mml:mo> <mml:mi>j</mml:mi></mml:mrow></mml:msub> <mml:mo>∼</mml:mo> <mml:mstyle mathvariant="sans-serif"><mml:mtext>Dirichlet</mml:mtext></mml:mstyle> <mml:mrow><mml:mo>(</mml:mo> <mml:mn mathvariant="bold">1</mml:mn> <mml:mo>)</mml:mo></mml:mrow> <mml:mspace width="0.277778em"/><mml:mo>∀</mml:mo> <mml:mspace width="0.277778em"/><mml:mi>j</mml:mi></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></alternatives> <label>(5)</label></disp-formula>
<disp-formula id="pcbi.1005564.e009"><alternatives><graphic id="pcbi.1005564.e009g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1005564.e009" xlink:type="simple"/><mml:math display="block" id="M9"><mml:mtable displaystyle="true"><mml:mtr><mml:mtd columnalign="right"><mml:mrow><mml:msub><mml:mi>k</mml:mi> <mml:mi>j</mml:mi></mml:msub> <mml:mo>∼</mml:mo> <mml:mstyle mathvariant="sans-serif"><mml:mtext>Beta</mml:mtext></mml:mstyle> <mml:mrow><mml:mo>(</mml:mo> <mml:mn>1</mml:mn> <mml:mo>,</mml:mo> <mml:mn>1</mml:mn> <mml:mo>)</mml:mo></mml:mrow> <mml:mspace width="0.277778em"/><mml:mo>∀</mml:mo> <mml:mspace width="0.277778em"/><mml:mi>j</mml:mi></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></alternatives> <label>(6)</label></disp-formula></p>
<p>The marginal posterior for each element of <bold><italic>p</italic></bold> was then approximated by a Beta distribution
<disp-formula id="pcbi.1005564.e010"><alternatives><graphic id="pcbi.1005564.e010g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1005564.e010" xlink:type="simple"/><mml:math display="block" id="M10"><mml:mtable displaystyle="true"><mml:mtr><mml:mtd columnalign="right"><mml:mrow><mml:msub><mml:mi>p</mml:mi> <mml:mrow><mml:mi>i</mml:mi> <mml:mi>j</mml:mi></mml:mrow></mml:msub> <mml:mo>∼</mml:mo> <mml:mstyle mathvariant="sans-serif"><mml:mtext>Beta</mml:mtext></mml:mstyle> <mml:mrow><mml:mo>(</mml:mo> <mml:msub><mml:mi>w</mml:mi> <mml:mrow><mml:mi>i</mml:mi> <mml:mi>j</mml:mi></mml:mrow></mml:msub> <mml:mo>,</mml:mo> <mml:msub><mml:mi>v</mml:mi> <mml:mrow><mml:mi>i</mml:mi> <mml:mi>j</mml:mi></mml:mrow></mml:msub> <mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></alternatives></disp-formula>
(using the method of moments to calculate <italic>w</italic><sub><italic>ij</italic></sub> and <italic>v</italic><sub><italic>ij</italic></sub>) which was then used as an independent prior in step 2. Since each isolate is assigned to only one type, we must observe <inline-formula id="pcbi.1005564.e011"><alternatives><graphic id="pcbi.1005564.e011g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1005564.e011" xlink:type="simple"/><mml:math display="inline" id="M11"><mml:mrow><mml:msubsup><mml:mo>∑</mml:mo> <mml:mrow><mml:mi>i</mml:mi> <mml:mo>=</mml:mo> <mml:mn>1</mml:mn></mml:mrow> <mml:mi>n</mml:mi></mml:msubsup> <mml:msub><mml:mi>r</mml:mi> <mml:mrow><mml:mi>i</mml:mi> <mml:mi>j</mml:mi></mml:mrow></mml:msub> <mml:mo>=</mml:mo> <mml:mn>1</mml:mn></mml:mrow></mml:math></alternatives></inline-formula>, and therefore <inline-formula id="pcbi.1005564.e012"><alternatives><graphic id="pcbi.1005564.e012g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1005564.e012" xlink:type="simple"/><mml:math display="inline" id="M12"><mml:mrow><mml:msubsup><mml:mo>∑</mml:mo> <mml:mrow><mml:mi>i</mml:mi> <mml:mo>=</mml:mo> <mml:mn>1</mml:mn></mml:mrow> <mml:mi>n</mml:mi></mml:msubsup> <mml:msub><mml:mi>p</mml:mi> <mml:mrow><mml:mi>i</mml:mi> <mml:mi>j</mml:mi></mml:mrow></mml:msub> <mml:mo>=</mml:mo> <mml:msub><mml:mi>k</mml:mi> <mml:mi>j</mml:mi></mml:msub></mml:mrow></mml:math></alternatives></inline-formula>. This is not enforced when using independent Beta priors for each <italic>p</italic><sub><italic>ij</italic></sub> which results in <italic>k</italic><sub><italic>j</italic></sub> (the probability of a sample being positive given the sample is from source <italic>j</italic>) no longer being constrained to be between 0 and 1.</p>
</sec>
</sec>
</sec>
<sec id="sec006" sec-type="materials|methods">
<title>Design and implementation</title>
<p>Our approach addresses the deficiencies inherent in both the Hald and Modified Hald models by fitting a joint model for both source and human case sampling with non-parametric clustering of the type effects. This allows integration over uncertainty in the source sampling process without resorting to an approximate marginal probability distribution on <bold><italic>p</italic></bold>. The overdispersion is solved by non-parametrically clustering the pathogen types using a Dirichlet process (DP) on the type effect vector <bold><italic>q</italic></bold>. This is a data driven, automatic method which reduces the effective number of parameters in the model without requiring strong assumptions about <italic>τ</italic><sup>2</sup> in <xref ref-type="disp-formula" rid="pcbi.1005564.e005">Eq 4</xref>. Additionally, it quantifies the similarity between epidemiological characteristics (virulence, pathogenicity and survivability) of the subtypes forming the basis of future research on the genetic determinants of this behaviour. Often, human case data is associated with location such as urban/rural, or GPS coordinates whilst food samples are likely to be less spatially constrained (due to distances between production and sale locations). Both human and source data may exist for multiple time-periods. Therefore, we allow for spatial and temporal heterogeneity in the data.</p>
<sec id="sec007">
<title>HaldDP model</title>
<p>As with the Hald model, we assume the number of human cases <italic>y</italic><sub><italic>itl</italic></sub> identified by isolation of subtype <italic>i</italic> in time-period <italic>t</italic> at location <italic>l</italic> is Poisson distributed
<disp-formula id="pcbi.1005564.e013"><alternatives><graphic id="pcbi.1005564.e013g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1005564.e013" xlink:type="simple"/><mml:math display="block" id="M13"><mml:mtable displaystyle="true"><mml:mtr><mml:mtd columnalign="right"><mml:mrow><mml:msub><mml:mi>y</mml:mi> <mml:mrow><mml:mi>i</mml:mi> <mml:mi>t</mml:mi> <mml:mi>l</mml:mi></mml:mrow></mml:msub> <mml:mo>∼</mml:mo> <mml:mstyle mathvariant="sans-serif"><mml:mtext>Poisson</mml:mtext></mml:mstyle> <mml:mrow><mml:mo>(</mml:mo> <mml:msub><mml:mi>λ</mml:mi> <mml:mrow><mml:mi>i</mml:mi> <mml:mi>t</mml:mi> <mml:mi>l</mml:mi></mml:mrow></mml:msub> <mml:mo>=</mml:mo> <mml:msub><mml:mi>q</mml:mi> <mml:mi>i</mml:mi></mml:msub> <mml:munderover><mml:mo>∑</mml:mo> <mml:mrow><mml:mi>j</mml:mi> <mml:mo>=</mml:mo> <mml:mn>1</mml:mn></mml:mrow> <mml:mi>m</mml:mi></mml:munderover> <mml:msub><mml:mi>α</mml:mi> <mml:mrow><mml:mi>j</mml:mi> <mml:mi>t</mml:mi> <mml:mi>l</mml:mi></mml:mrow></mml:msub> <mml:msub><mml:mi>p</mml:mi> <mml:mrow><mml:mi>i</mml:mi> <mml:mi>j</mml:mi> <mml:mi>t</mml:mi></mml:mrow></mml:msub> <mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></alternatives> <label>(7)</label></disp-formula></p>
<p>We allow for different exposures of humans to sources in different locations and times, by allowing the source effects to vary between times and locations, <italic>α</italic><sub><italic>jtl</italic></sub>.</p>
<p>For each source <italic>j</italic>, we model the number of positive source samples
<disp-formula id="pcbi.1005564.e014"><alternatives><graphic id="pcbi.1005564.e014g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1005564.e014" xlink:type="simple"/><mml:math display="block" id="M14"><mml:mtable displaystyle="true"><mml:mtr><mml:mtd columnalign="right"><mml:mrow><mml:msub><mml:mi mathvariant="bold-italic">x</mml:mi> <mml:mrow><mml:mi>j</mml:mi> <mml:mi>t</mml:mi></mml:mrow></mml:msub> <mml:mo>∼</mml:mo> <mml:mtext>Multinomial</mml:mtext> <mml:mrow><mml:mo>(</mml:mo> <mml:msubsup><mml:mi>s</mml:mi> <mml:mrow><mml:mi>j</mml:mi> <mml:mi>t</mml:mi></mml:mrow> <mml:mo>+</mml:mo></mml:msubsup> <mml:mo>,</mml:mo> <mml:msub><mml:mi mathvariant="bold-italic">r</mml:mi> <mml:mrow><mml:mi>j</mml:mi> <mml:mi>t</mml:mi></mml:mrow></mml:msub> <mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></alternatives> <label>(8)</label></disp-formula>
where <bold><italic>x</italic></bold><sub><italic>jt</italic></sub> = (<italic>x</italic><sub><italic>ijt</italic></sub>, <italic>i</italic> = 1,…,<italic>n</italic>)<sup><italic>T</italic></sup> denotes the vector of type-counts in source <italic>j</italic> in time-period <italic>t</italic>, <inline-formula id="pcbi.1005564.e015"><alternatives><graphic id="pcbi.1005564.e015g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1005564.e015" xlink:type="simple"/><mml:math display="inline" id="M15"><mml:mrow><mml:msubsup><mml:mi>s</mml:mi> <mml:mrow><mml:mi>j</mml:mi> <mml:mi>t</mml:mi></mml:mrow> <mml:mo>+</mml:mo></mml:msubsup> <mml:mo>=</mml:mo> <mml:msubsup><mml:mo>∑</mml:mo> <mml:mrow><mml:mi>i</mml:mi> <mml:mo>=</mml:mo> <mml:mn>1</mml:mn></mml:mrow> <mml:mi>n</mml:mi></mml:msubsup> <mml:msub><mml:mi>x</mml:mi> <mml:mrow><mml:mi>i</mml:mi> <mml:mi>j</mml:mi> <mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></alternatives></inline-formula> denotes the number of positive samples obtained, and <bold><italic>r</italic></bold><sub><italic>jt</italic></sub> denotes a vector of relative prevalences <italic>Pr</italic>(type<sub><italic>i</italic></sub>|source<sub><italic>j</italic></sub>, time<sub><italic>t</italic></sub>). This automatically places the constraint <inline-formula id="pcbi.1005564.e016"><alternatives><graphic id="pcbi.1005564.e016g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1005564.e016" xlink:type="simple"/><mml:math display="inline" id="M16"><mml:mrow><mml:msubsup><mml:mo>∑</mml:mo> <mml:mrow><mml:mi>i</mml:mi> <mml:mo>=</mml:mo> <mml:mn>1</mml:mn></mml:mrow> <mml:mi>n</mml:mi></mml:msubsup> <mml:msub><mml:mi>r</mml:mi> <mml:mrow><mml:mi>i</mml:mi> <mml:mi>j</mml:mi> <mml:mi>t</mml:mi></mml:mrow></mml:msub> <mml:mo>=</mml:mo> <mml:mn>1</mml:mn></mml:mrow></mml:math></alternatives></inline-formula>. 
The source case model is then coupled to the human case model through the simple relationship
<disp-formula id="pcbi.1005564.e017"><alternatives><graphic id="pcbi.1005564.e017g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1005564.e017" xlink:type="simple"/><mml:math display="block" id="M17"><mml:mtable displaystyle="true"><mml:mtr><mml:mtd columnalign="right"><mml:mrow><mml:msub><mml:mi>p</mml:mi> <mml:mrow><mml:mi>i</mml:mi> <mml:mi>j</mml:mi> <mml:mi>t</mml:mi></mml:mrow></mml:msub> <mml:mo>=</mml:mo> <mml:msub><mml:mi>r</mml:mi> <mml:mrow><mml:mi>i</mml:mi> <mml:mi>j</mml:mi> <mml:mi>t</mml:mi></mml:mrow></mml:msub> <mml:msub><mml:mi>k</mml:mi> <mml:mrow><mml:mi>j</mml:mi> <mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></alternatives> <label>(9)</label></disp-formula>
where <italic>k</italic><sub><italic>jt</italic></sub> is the prevalence of any isolate in source <italic>j</italic> in time-period <italic>t</italic>.</p>
<p>In principle, a Beta distribution could be used to model <italic>k</italic><sub><italic>jt</italic></sub>, arising as the conjugate posterior distribution of a Binomial sampling model for <inline-formula id="pcbi.1005564.e018"><alternatives><graphic id="pcbi.1005564.e018g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1005564.e018" xlink:type="simple"/><mml:math display="inline" id="M18"><mml:msubsup><mml:mi>s</mml:mi> <mml:mrow><mml:mi>j</mml:mi> <mml:mi>t</mml:mi></mml:mrow> <mml:mo>+</mml:mo></mml:msubsup></mml:math></alternatives></inline-formula> positive samples from <italic>s</italic><sub><italic>jt</italic></sub> tested, and a Beta prior on <italic>k</italic><sub><italic>jt</italic></sub>. We instead choose to fix the source prevalences at their empirical estimates (<inline-formula id="pcbi.1005564.e019"><alternatives><graphic id="pcbi.1005564.e019g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1005564.e019" xlink:type="simple"/><mml:math display="inline" id="M19"><mml:mrow><mml:msub><mml:mi>k</mml:mi> <mml:mrow><mml:mi>j</mml:mi> <mml:mi>t</mml:mi></mml:mrow></mml:msub> <mml:mo>=</mml:mo> <mml:msubsup><mml:mi>s</mml:mi> <mml:mrow><mml:mi>j</mml:mi> <mml:mi>t</mml:mi></mml:mrow> <mml:mo>+</mml:mo></mml:msubsup> <mml:mo>/</mml:mo> <mml:msub><mml:mi>s</mml:mi> <mml:mrow><mml:mi>j</mml:mi> <mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></alternatives></inline-formula>) because the number of source samples is typically high.</p>
<p>The type effects <bold><italic>q</italic></bold>, which are assumed invariant across time or location, are drawn from a DP with base distribution <italic>Q</italic><sub>0</sub> and a concentration parameter <italic>a</italic><sub><italic>q</italic></sub> <disp-formula id="pcbi.1005564.e020"><alternatives><graphic id="pcbi.1005564.e020g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1005564.e020" xlink:type="simple"/><mml:math display="block" id="M20"><mml:mtable displaystyle="true"><mml:mtr><mml:mtd columnalign="right"><mml:mrow><mml:msub><mml:mi>q</mml:mi> <mml:mi>i</mml:mi></mml:msub> <mml:mo>∼</mml:mo> <mml:mtext>DP</mml:mtext> <mml:mfenced close=")" open="(" separators=""><mml:msub><mml:mi>a</mml:mi> <mml:mi>q</mml:mi></mml:msub> <mml:mo>,</mml:mo> <mml:msub><mml:mi>Q</mml:mi> <mml:mn>0</mml:mn></mml:msub></mml:mfenced> <mml:mo>.</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></alternatives> <label>(10)</label></disp-formula></p>
<p>The Dirichlet process is a probability distribution whose range is a set of probability distributions and is defined by a base distribution and concentration parameter [<xref ref-type="bibr" rid="pcbi.1005564.ref014">14</xref>]. The concentration parameter of the DP <italic>a</italic><sub><italic>q</italic></sub> encodes prior information on the number of groups <italic>K</italic> to which the pathogen types are assigned. The Gamma base distribution of the DP <italic>Q</italic><sub>0</sub> induces a prior for the cluster locations. The DP groups the elements of <bold><italic>q</italic></bold> into a finite set of clusters 1: <italic>κ</italic> (unknown <italic>a priori</italic>) with values <italic>θ</italic><sub>1</sub>,…,<italic>θ</italic><sub><italic>κ</italic></sub> which addresses the inevitable over-dispersion in the case counts <bold><italic>y</italic></bold> robustly and clusters subtypes into groups with similar epidemiological behaviour.</p>
<p>Heterogeneity in the source matrix <italic>x</italic> is required to identify clusters from sources, which may not be guaranteed <italic>a priori</italic> due to the observational nature of the data collection.</p>
</sec>
<sec id="sec008">
<title>Inference</title>
<p>This section describes how the model is fitted in a Bayesian context by first describing the McMC algorithm used to fit this model, then developing the prior model.</p>
<sec id="sec009">
<title>McMC algorithm</title>
<p>The joint model over all unobserved and observed quantities is fitted using Markov chain Monte Carlo (McMC, full details in <xref ref-type="supplementary-material" rid="pcbi.1005564.s001">S1 Appendix</xref>). The source effects and relative prevalence parameters are updated using independent adaptive Metropolis-Hastings updates [<xref ref-type="bibr" rid="pcbi.1005564.ref015">15</xref>]. The type effects <bold><italic>q</italic></bold> are modelled using a DP (<xref ref-type="disp-formula" rid="pcbi.1005564.e020">Eq 10</xref>) with a Gamma base distribution <italic>Q</italic><sub>0</sub> ∼ <italic>Gamma</italic>(<italic>a</italic><sub><italic>θ</italic></sub>,<italic>b</italic><sub><italic>θ</italic></sub>). The choice of a Gamma base distribution with the Poisson likelihood (<xref ref-type="disp-formula" rid="pcbi.1005564.e013">Eq 7</xref>) permits the use of a marginal Gibbs strategy for efficient sampling from the posterior distribution of <bold><italic>q</italic></bold>. Each observation <italic>i</italic> is assigned to a cluster <italic>k</italic> with value <italic>θ</italic><sub><italic>k</italic></sub>, such that <italic>q</italic><sub><italic>i</italic></sub> ↦ <italic>θ</italic><sub><italic>k</italic></sub>. The algorithm proceeds by alternately sampling from the posterior of the group assignments (adding new clusters or deleting empty clusters as necessary), and the posterior of <italic>θ</italic><sub><italic>k</italic></sub> for each cluster.</p>
</sec>
<sec id="sec010">
<title>Priors</title>
<p>The parameters <bold><italic>α</italic></bold><sub><italic>tl</italic></sub> and <bold><italic>q</italic></bold> account for a multitude of source and type specific factors which are difficult to quantify <italic>a priori</italic>. Therefore, with no single real-world interpretation, the distributional forms of the priors were chosen for their flexibility. A Dirichlet prior is placed on each <bold>r</bold><sub><italic>jt</italic></sub> which suitably constrains the individual <italic>r</italic><sub><italic>ijt</italic></sub> such that <inline-formula id="pcbi.1005564.e021"><alternatives><graphic id="pcbi.1005564.e021g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1005564.e021" xlink:type="simple"/><mml:math display="inline" id="M21"><mml:mrow><mml:msubsup><mml:mo>∑</mml:mo> <mml:mrow><mml:mi>i</mml:mi> <mml:mo>=</mml:mo> <mml:mn>1</mml:mn></mml:mrow> <mml:mi>n</mml:mi></mml:msubsup> <mml:msub><mml:mi>r</mml:mi> <mml:mrow><mml:mi>i</mml:mi> <mml:mi>j</mml:mi> <mml:mi>t</mml:mi></mml:mrow></mml:msub> <mml:mo>=</mml:mo> <mml:mn>1</mml:mn></mml:mrow></mml:math></alternatives></inline-formula>. A Dirichlet prior is also placed on each <italic>α</italic><sub><italic>tl</italic></sub>, with the constraint <inline-formula id="pcbi.1005564.e022"><alternatives><graphic id="pcbi.1005564.e022g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1005564.e022" xlink:type="simple"/><mml:math display="inline" id="M22"><mml:mrow><mml:msubsup><mml:mo>∑</mml:mo> <mml:mrow><mml:mi>j</mml:mi> <mml:mo>=</mml:mo> <mml:mn>1</mml:mn></mml:mrow> <mml:mi>m</mml:mi></mml:msubsup> <mml:msub><mml:mi>α</mml:mi> <mml:mrow><mml:mi>j</mml:mi> <mml:mi>t</mml:mi> <mml:mi>l</mml:mi></mml:mrow></mml:msub> <mml:mo>=</mml:mo> <mml:mn>1</mml:mn></mml:mrow></mml:math></alternatives></inline-formula> aiding identifiability between the mean of the source and type effect parameters. 
In <monospace>sourceR</monospace>, the concentration parameter of the DP <italic>a</italic><sub><italic>q</italic></sub> is specified by the analyst as a modelling decision.</p>
<p>We note that the choice of base distribution <italic>Q</italic><sub>0</sub> may have a stronger effect than anticipated due to the small size of the relative prevalence and source effect parameters. This can be seen by considering the marginal posterior for <italic>θ</italic><sub><italic>k</italic></sub> <disp-formula id="pcbi.1005564.e023"><alternatives><graphic id="pcbi.1005564.e023g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1005564.e023" xlink:type="simple"/><mml:math display="block" id="M23"><mml:mtable displaystyle="true"><mml:mtr><mml:mtd columnalign="right"><mml:mrow><mml:msub><mml:mi>θ</mml:mi> <mml:mi>k</mml:mi></mml:msub> <mml:mo>∼</mml:mo> <mml:mstyle mathvariant="sans-serif"><mml:mtext>Gamma</mml:mtext></mml:mstyle> <mml:mfenced close=")" open="(" separators=""><mml:msub><mml:mi>a</mml:mi> <mml:mi>θ</mml:mi></mml:msub> <mml:mo>+</mml:mo> <mml:munder><mml:mo>∑</mml:mo> <mml:mrow><mml:mi>i</mml:mi> <mml:mo>:</mml:mo> <mml:msub><mml:mi>S</mml:mi> <mml:mi>i</mml:mi></mml:msub> <mml:mo>=</mml:mo> <mml:mi>k</mml:mi></mml:mrow></mml:munder> <mml:msub><mml:mi>y</mml:mi> <mml:mi>i</mml:mi></mml:msub> <mml:mo>,</mml:mo> <mml:msub><mml:mi>b</mml:mi> <mml:mi>θ</mml:mi></mml:msub> <mml:mo>+</mml:mo> <mml:munder><mml:mo>∑</mml:mo> <mml:mrow><mml:mi>i</mml:mi> <mml:mo>:</mml:mo> <mml:msub><mml:mi>S</mml:mi> <mml:mi>i</mml:mi></mml:msub> <mml:mo>=</mml:mo> <mml:mi>k</mml:mi></mml:mrow></mml:munder> <mml:munderover><mml:mo>∑</mml:mo> <mml:mrow><mml:mi>j</mml:mi> <mml:mo>=</mml:mo> <mml:mn>1</mml:mn></mml:mrow> <mml:mi>m</mml:mi></mml:munderover> <mml:msub><mml:mi>α</mml:mi> <mml:mi>j</mml:mi></mml:msub> <mml:mo>·</mml:mo> <mml:msub><mml:mi>p</mml:mi> <mml:mrow><mml:mi>i</mml:mi> <mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mfenced></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></alternatives></disp-formula></p>
<p>The term <inline-formula id="pcbi.1005564.e024"><alternatives><graphic id="pcbi.1005564.e024g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1005564.e024" xlink:type="simple"/><mml:math display="inline" id="M24"><mml:mrow><mml:msub><mml:mo>∑</mml:mo> <mml:mrow><mml:mi>i</mml:mi> <mml:mo>:</mml:mo> <mml:msub><mml:mi>S</mml:mi> <mml:mi>i</mml:mi></mml:msub> <mml:mo>=</mml:mo> <mml:mi>k</mml:mi></mml:mrow></mml:msub> <mml:msubsup><mml:mo>∑</mml:mo> <mml:mrow><mml:mi>j</mml:mi> <mml:mo>=</mml:mo> <mml:mn>1</mml:mn></mml:mrow> <mml:mi>m</mml:mi></mml:msubsup> <mml:msub><mml:mi>α</mml:mi> <mml:mi>j</mml:mi></mml:msub> <mml:mo>·</mml:mo> <mml:msub><mml:mi>p</mml:mi> <mml:mrow><mml:mi>i</mml:mi> <mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></alternatives></inline-formula> is very small (due to the Dirichlet priors on <italic>α</italic> and <bold>r</bold><sub><italic>j</italic></sub>), which can result in even a fairly small rate parameter (<italic>b</italic><sub><italic>θ</italic></sub>) dominating.</p>
</sec>
</sec>
<sec id="sec011">
<title>Code implementation</title>
<p>Standard McMC packages (e.g. WinBUGS, Stan, PyMC3) cannot implement marginal Gibbs sampling for Dirichlet processes, necessitating a custom McMC framework (see section ‘Extensibility’). We chose R as a platform because of its ubiquity in epidemiology, and advanced support for post-processing of McMC samples. Dependencies on other R packages are required, but these are installed automatically by R’s package manager.</p>
<p>
<monospace>sourceR</monospace> uses an object-oriented design, which allows separation of the model from the McMC algorithm. Internally, the model is represented as a directed acyclic graph (DAG) in which nodes are represented by an R6 class hierarchy. Generic adaptive Metropolis Hastings algorithms are attached to each parameter node, with the conditional independence properties of the DAG allowing automatic computation of the required (log) conditional posterior densities.</p>
<p>A difficulty with the DAG setup is the representation of the DP model on the type effects <bold><italic>q</italic></bold>, since each update of the marginal Gibbs sampler requires structural alterations. Therefore, we subsume the entire DP into a single node, with a bespoke marginal Gibbs sampling algorithm written for our Gamma base-distribution and Poisson likelihood model.</p>
</sec>
</sec>
<sec id="sec012" sec-type="materials|methods">
<title>Materials and methods</title>
<p>The case study below illustrates how the <monospace>sourceR</monospace> package is used in practice. We compare the results of our approach with results from the Modified Hald, Asymmetric Island (see <xref ref-type="supplementary-material" rid="pcbi.1005564.s002">S2 Appendix</xref> and [<xref ref-type="bibr" rid="pcbi.1005564.ref016">16</xref>, <xref ref-type="bibr" rid="pcbi.1005564.ref017">17</xref>]), and the “Dutch” model (see <xref ref-type="supplementary-material" rid="pcbi.1005564.s003">S3 Appendix</xref> and [<xref ref-type="bibr" rid="pcbi.1005564.ref018">18</xref>]). The priors for our model were selected to be minimally informative. The prevalence <italic>k</italic><sub><italic>j</italic></sub> is calculated by dividing the number of positive samples by the total number of samples for each source. In the data below, we note that for several samples the MLST typing failed, with the number of positive samples exceeding the apparent total number of MLST-typed isolates. Assuming MLST typing fails independently of pathogen type, this does not bias our results.</p>
<p>The model fitting process begins by formatting the data, constructing the HaldDP model and setting the McMC parameters before running the algorithm using the <monospace>update()</monospace> method.</p>
<p specific-use="line"><monospace><italic>## Format data</italic></monospace></p>
<p specific-use="line"><monospace>y &lt;- Y(data = campy$cases, <italic># Cases</italic></monospace></p>
<p specific-use="line"><monospace> y = “Human”, type = “Type”, time = “Time”, location = “Location”)</monospace></p>
<p specific-use="line"><monospace>x &lt;- X(data = campy$sources, <italic># Sources</italic></monospace></p>
<p specific-use="line"><monospace> x = “Count”, type = “Type”, time = “Time”, source = “Source”)</monospace></p>
<p specific-use="line"><monospace>k &lt;- Prev(data = campy$prev, <italic># Prevalences</italic></monospace></p>
<p specific-use="line"><monospace> prev = “Value”, time = “Time”, source = “Source”)</monospace></p>
<p specific-use="line"><monospace><italic>## Set priors</italic></monospace></p>
<p specific-use="line"><monospace>priors = list(a_theta = 0.01, b_theta = 0.00001, a_alpha = 1, a_r = 0.1)</monospace></p>
<p specific-use="line"><monospace><italic>## Construct model</italic></monospace></p>
<p specific-use="line"><monospace>my_model &lt;- HaldDP(y = y, x = x, k = k, priors = priors, a_q = 0.1)</monospace></p>
<p specific-use="line"><monospace><italic>## Set mcmc parameters</italic></monospace></p>
<p specific-use="line"><monospace>my_model$mcmc_params(n_iter = 1000, burn_in = 10000, thin = 500)</monospace></p>
<p specific-use="line"><monospace><italic>## Run model</italic></monospace></p>
<p specific-use="line"><monospace>my_model$update()</monospace></p>
<p>The <monospace>sourceR</monospace> package provides methods to extract and subset the complex posterior, calculate medians and credible intervals (with three possible methods percentile, SPIn [<xref ref-type="bibr" rid="pcbi.1005564.ref019">19</xref>], or Chen-Shao [<xref ref-type="bibr" rid="pcbi.1005564.ref020">20</xref>]) and plot a heatmap with a dendrogram showing the clustering of the type effects.</p>
<p specific-use="line"><monospace>my_model$extract()</monospace></p>
<p specific-use="line"><monospace>my_model$summary(alpha = 0.05, CI_type = “percentiles”)</monospace></p>
<p specific-use="line"><monospace>my_model$plot_heatmap()</monospace></p>
</sec>
<sec id="sec013" sec-type="results">
<title>Results</title>
<p>
<xref ref-type="fig" rid="pcbi.1005564.g001">Fig 1</xref> shows the proportion of cases attributed to each source. The HaldDP model identified the highest proportion of human campylobacteriosis cases as coming from chicken produced by supplier A (a median of 67 percent of cases attributed). A further 11 percent were attributed to Chicken from poultry supplier B and 17 percent to Ovine. The median values for the proportion of cases attributed to each source are qualitatively similar between all models except the Dutch method.</p>
<fig id="pcbi.1005564.g001" position="float">
<object-id pub-id-type="doi">10.1371/journal.pcbi.1005564.g001</object-id>
<label>Fig 1</label>
<caption>
<title>Comparison of the proportion of human campylobacteriosis cases attributable to each source.</title>
<p>The models compared are: M1 (Dutch model), M2 (Modified Hald model), M3 (Island model) and M4 (HaldDP model). Error bars represent 95% percentile confidence or credible intervals with medians shown as a cross. Violin plots show the marginal posteriors of the <italic>ξ</italic><sub><italic>j</italic></sub> parameters.</p>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1005564.g001" xlink:type="simple"/>
</fig>
<p>To visualise how the DP has clustered the type effects, Gower’s distance [<xref ref-type="bibr" rid="pcbi.1005564.ref021">21</xref>] is used to compute a dissimilarity matrix between all pairs of types. <xref ref-type="fig" rid="pcbi.1005564.g002">Fig 2</xref> shows that the DP identified four main type clusters (from 91 types). The violin plots of the marginal posterior distributions for each type effect (<xref ref-type="fig" rid="pcbi.1005564.g003">Fig 3</xref>) show the largest group of types has very small type effects and wide credible intervals compared to the other groups.</p>
<fig id="pcbi.1005564.g002" position="float">
<object-id pub-id-type="doi">10.1371/journal.pcbi.1005564.g002</object-id>
<label>Fig 2</label>
<caption>
<title>Heatmap showing the grouping of the type effects (q).</title>
<p>A white pixel represents a dissimilarity value of 1 between a pair of subtypes, whilst dark blue (see pixels on the diagonal) gives a value of zero. The grey coloured bar shows the groupings if the dendrogram is cut at 4 groups.</p>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1005564.g002" xlink:type="simple"/>
</fig>
<fig id="pcbi.1005564.g003" position="float">
<object-id pub-id-type="doi">10.1371/journal.pcbi.1005564.g003</object-id>
<label>Fig 3</label>
<caption>
<title>Violin plots of the marginal distributions of the type effects (q).</title>
<p>Note that the y axis uses a log scale. The fill colour matches the coloured grouping bar on the heatmap.</p>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1005564.g003" xlink:type="simple"/>
</fig>
<p>Model fit and convergence were assessed visually using trace and autocorrelation plots (see Fig A and Fig B in <xref ref-type="supplementary-material" rid="pcbi.1005564.s004">S4 Appendix</xref>).</p>
</sec>
<sec id="sec014" sec-type="conclusions">
<title>Discussion</title>
<p>
<monospace>sourceR</monospace> represents a significant advance in source attribution modelling, and translation of advanced statistical methods into mainstream epidemiological use. In particular, the DP clustering results in a large decrease in the effective number of parameters in the model and allows detection of unusually virulent subtypes (group 2 in <xref ref-type="fig" rid="pcbi.1005564.g003">Fig 3</xref>) by epidemiological behaviour. The subtypes in each cluster have similar epidemiological traits (such as virulence, pathogenicity and survivability) which forms the basis for future research on genetic determinants of those traits. Additionally, if a particular type moved into the high virulence group when repeating the analysis with further data from a later time period, it would flag that type as possibly evolving to become more risky for humans. The type effects for group 3 subtypes have very wide credible intervals due to the sparsity of source samples and human cases for those types.</p>
<p>The relatively large uncertainty for the disease origin (the credible intervals of <bold><italic>ξ</italic></bold>) is likely due to <italic>C. jejuni’s</italic> complex epidemiology [<xref ref-type="bibr" rid="pcbi.1005564.ref006">6</xref>] giving rise to <italic>a posteriori</italic> correlations between components of <bold><italic>α</italic></bold> and <bold><italic>q</italic></bold>. This is expected due to bias/variance trade-off: the Dutch and Island models both lack type effects risking biased results due to not all types being equally likely to infect humans. The Island model also possesses inherently strong and difficult to verify <italic>a priori</italic> assumptions (see [<xref ref-type="bibr" rid="pcbi.1005564.ref016">16</xref>] and <xref ref-type="supplementary-material" rid="pcbi.1005564.s002">S2 Appendix</xref>) which are not subject to uncertainty quantification. Moreover, by removing the approximation inherent in the Modified Hald model, we expect the HaldDP model to more accurately reflect inferential uncertainty—this is particularly important for decision making in food hygiene policy, especially when commercial interests must be supported by rigorous scientific advice.</p>
<p>Mixing and <italic>a posteriori</italic> correlations of the HaldDP model are significantly decreased in comparison to the Modified Hald model, if not entirely resolved. Although heterogeneity in <italic>X</italic> is required to fit the models, a sparse or highly unbalanced source matrix increases posterior correlations between some source and type effects. In our experience, the algorithm works best when the source matrix has a moderate amount of heterogeneity.</p>
<p>Whilst the HaldDP results for <bold><italic>ξ</italic></bold> are qualitatively similar to those from the other models (<xref ref-type="fig" rid="pcbi.1005564.g001">Fig 1</xref>), we note an interesting disagreement between the Island and Hald model derivatives when comparing the number of cases attributed to Ovine and Bovine. We conjecture that this may be due to some non-identifiability between Bovine and Ovine sources as both sources have high contamination from the same types increasing the sensitivity of <bold><italic>ξ</italic></bold> to sampling error. It may also be due to lack of explicit source and type effects in the Island model. Resolving this disparity is the subject of ongoing research.</p>
</sec>
<sec id="sec015">
<title>Availability and future directions</title>
<p>The stable release version of <monospace>sourceR</monospace> is available from the Comprehensive R Archive Network, released under a GPL-3 licence. The development version is available at <ext-link ext-link-type="uri" xlink:href="http://fhm-chicas-code.lancs.ac.uk/millerp/sourceR" xlink:type="simple">http://fhm-chicas-code.lancs.ac.uk/millerp/sourceR</ext-link>. As this package develops, we intend <monospace>sourceR</monospace> to become a platform for new source attribution model development, providing a central analytic resource for public health professionals.</p>
<p>The main focus of extending <monospace>sourceR</monospace> will be on modelling spatiotemporal correlation in the time and location dependent parameters. A spatiotemporal correlation model on <bold><italic>α</italic></bold><sub><italic>tl</italic></sub> could be used to identify particular foci of source contamination, enabling targeted investigation of particular food supply regions. Implementation of time varying type effects may be appropriate as <italic>Campylobacter</italic> can evolve quickly and genetic variation conferring virulence may not be apparent from coarse-scale MLST typing [<xref ref-type="bibr" rid="pcbi.1005564.ref022">22</xref>]. Interaction terms between some sources and types would allow for the biologically plausible possibility that certain types are differentially likely to survive and cause disease, dependent on the food source they appear in. Additionally, water/environmental samples could be attributed to the other sources of infection allowing estimation of the proportion of cases attributed to different paths of infection (direct infection from the source versus infection via the environment).</p>
<p>However, including interaction terms and additional paths of infection would significantly increase the number of parameters and the number and strength of posterior correlations. With higher posterior correlations, the current Metropolis-Hastings based fitting algorithm would suffer from a loss of efficiency. This could be addressed with gradient-based fitting algorithms such as Hamiltonian Monte Carlo (HMC) [<xref ref-type="bibr" rid="pcbi.1005564.ref023">23</xref>] which are designed to converge to high-dimensional, non-orthogonal target distributions much more quickly. In particular, the No-U-Turn Sampler (NUTS) presents an attractive method for tuning HMC adaptively, a quality which we consider necessary to minimise user intervention and maximise research productivity [<xref ref-type="bibr" rid="pcbi.1005564.ref024">24</xref>].</p>
<p>With increased interest in source attribution models for food-borne pathogens, <monospace>sourceR</monospace> has been written with extensibility in mind. In particular, the DAG representation allows for rapid construction of modified and new models. The package routines are written in R (as opposed to C or C++) to aid readability, with the node class hierarchy and three stage workflow designed to aid the addition of new model classes. All internal classes and methods are documented to enable prospective developers to familiarise themselves with the source code quickly, and an extensive test suite is provided. We note that the DAG framework is not limited solely to source attribution models and may be used for other Bayesian applications, particularly those for which a Dirichlet process is required.</p>
</sec>
<sec id="sec016" sec-type="conclusions">
<title>Conclusions</title>
<p>We have presented a novel source attribution model which builds upon, and unites, the Hald and Modified Hald approaches. It is widely applicable, fully joint, and does not require approximations or a large number of assumptions. Mixing is significantly improved, and <italic>a posteriori</italic> correlations are significantly reduced, in comparison to the Modified Hald model. Furthermore, it allows the data to inform type effect clustering
using a Bayesian non-parametric model which identifies groups of subtypes with similar
putative virulence, pathogenicity and survivability. This is a significant improvement over the previous attempts to improve model identifiability (fixing some source and type effects <italic>a priori</italic>, or modelling the type effects as random using a two-stage model). Like the Modified Hald model, the new model incorporates uncertainty in the prevalence matrix into the model; however, it does this by fitting a fully joint model rather than a two-step model. This
has the advantage of allowing the human cases to influence the uncertainty in the source data and preserves the restriction on the sum of the prevalences for each source. The <monospace>sourceR</monospace> package implements this model to enable straightforward attribution of cases of zoonotic infection to putative sources of infection by epidemiologists and public health decision makers.</p>
</sec>
<sec id="sec017">
<title>Supporting information</title>
<supplementary-material id="pcbi.1005564.s001" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1005564.s001" xlink:type="simple">
<label>S1 Appendix</label>
<caption>
<title>Full McMC algorithm.</title>
<p>(PDF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1005564.s002" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1005564.s002" xlink:type="simple">
<label>S2 Appendix</label>
<caption>
<title>Island model overview.</title>
<p>(PDF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1005564.s003" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1005564.s003" xlink:type="simple">
<label>S3 Appendix</label>
<caption>
<title>Dutch model overview.</title>
<p>(PDF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1005564.s004" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1005564.s004" xlink:type="simple">
<label>S4 Appendix</label>
<caption>
<title>Model fit and convergence diagnostic plots.</title>
<p>(PDF)</p>
</caption>
</supplementary-material>
</sec>
</body>
<back>
<ack>
<p>We thank all members of the Hopkirk Molecular Epidemiology Team (Massey University), Environmental Science and Research, MidCentral Health, Public Health Services, MedLab Central, the New Zealand Food Safety Authority, Petra Müllner (for the Manawatu data set) and Geoff Jones (for his helpful input on automatic clustering methods).</p>
</ack>
<ref-list>
<title>References</title>
<ref id="pcbi.1005564.ref001">
<label>1</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Havelaar</surname> <given-names>AH</given-names></name>, <name name-style="western"><surname>Kirk</surname> <given-names>MD</given-names></name>, <name name-style="western"><surname>Torgerson</surname> <given-names>PR</given-names></name>, <name name-style="western"><surname>Gibb</surname> <given-names>HJ</given-names></name>, <name name-style="western"><surname>Hald</surname> <given-names>T</given-names></name>, <name name-style="western"><surname>Lake</surname> <given-names>RJ</given-names></name>, <etal>et al</etal>. <article-title>World Health Organization Global Estimates and Regional Comparisons of the Burden of Foodborne Disease in 2010</article-title>. <source>PLoS Med</source>. <year>2015</year>;<volume>12</volume>(<issue>12</issue>):<fpage>1</fpage>–<lpage>23</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.1371/journal.pmed.1001923" xlink:type="simple">10.1371/journal.pmed.1001923</ext-link></comment></mixed-citation>
</ref>
<ref id="pcbi.1005564.ref002">
<label>2</label>
<mixed-citation publication-type="other" xlink:type="simple">World Health Organization. WHO estimates of the global burden of foodborne diseases: foodborne disease burden epidemiology reference group 2007–2015; 2015. available on the WHO web site (<ext-link ext-link-type="uri" xlink:href="http://www.who.int" xlink:type="simple">www.who.int</ext-link>) or can be purchased from WHO Press, World Health Organization, 20 Avenue Appia, 1211 Geneva 27, Switzerland. Available from: <ext-link ext-link-type="uri" xlink:href="http://apps.who.int/iris/bitstream/10665/199350/1/9789241565165_eng.pdf?ua=1" xlink:type="simple">http://apps.who.int/iris/bitstream/10665/199350/1/9789241565165_eng.pdf?ua=1</ext-link>.</mixed-citation>
</ref>
<ref id="pcbi.1005564.ref003">
<label>3</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Pires</surname> <given-names>SM</given-names></name>, <name name-style="western"><surname>Evers</surname> <given-names>EG</given-names></name>, <name name-style="western"><surname>van Pelt</surname> <given-names>W</given-names></name>, <name name-style="western"><surname>Ayers</surname> <given-names>T</given-names></name>, <name name-style="western"><surname>Scallan</surname> <given-names>E</given-names></name>, <name name-style="western"><surname>Angulo</surname> <given-names>FJ</given-names></name>, <etal>et al</etal>. <article-title>Attributing the human disease burden of foodborne infections to specific sources</article-title>. <source>Foodborne Pathogens and Disease</source>. <year>2009</year>;<volume>6</volume>(<issue>4</issue>):<fpage>417</fpage>–<lpage>24</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.1089/fpd.2008.0208" xlink:type="simple">10.1089/fpd.2008.0208</ext-link></comment> <object-id pub-id-type="pmid">19415971</object-id></mixed-citation>
</ref>
<ref id="pcbi.1005564.ref004">
<label>4</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Crump</surname> <given-names>JA</given-names></name>, <name name-style="western"><surname>Griffin</surname> <given-names>PM</given-names></name>, <name name-style="western"><surname>Angulo</surname> <given-names>FJ</given-names></name>. <article-title>Bacterial Contamination of Animal Feed and Its Relationship to Human Foodborne Illness</article-title>. <source>Clinical Infectious Diseases</source>. <year>2002</year>;<volume>35</volume>(<issue>7</issue>):<fpage>859</fpage>–<lpage>865</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.1086/342885" xlink:type="simple">10.1086/342885</ext-link></comment> <object-id pub-id-type="pmid">12228823</object-id></mixed-citation>
</ref>
<ref id="pcbi.1005564.ref005">
<label>5</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Hald</surname> <given-names>T</given-names></name>, <name name-style="western"><surname>Vose</surname> <given-names>D</given-names></name>, <name name-style="western"><surname>Wegener</surname> <given-names>H</given-names></name>, <name name-style="western"><surname>Koupeev</surname> <given-names>T</given-names></name>. <article-title>A Bayesian Approach to Quantify the Contribution of Animal-Food Sources to Human Salmonellosis</article-title>. <source>Risk Analysis</source>. <year>2004</year>;<volume>24</volume>(<issue>1</issue>):<fpage>255</fpage>–<lpage>269</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.1111/j.0272-4332.2004.00427.x" xlink:type="simple">10.1111/j.0272-4332.2004.00427.x</ext-link></comment> <object-id pub-id-type="pmid">15028016</object-id></mixed-citation>
</ref>
<ref id="pcbi.1005564.ref006">
<label>6</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Müllner</surname> <given-names>P</given-names></name>, <name name-style="western"><surname>Jones</surname> <given-names>G</given-names></name>, <name name-style="western"><surname>Noble</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Spencer</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Hathaway</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>French</surname> <given-names>N</given-names></name>. <article-title>Source Attribution of Food Borne Zoonoses in New Zealand: A Modified Hald Model</article-title>. <source>Risk Analysis</source>. <year>2009</year>;<volume>29</volume>(<issue>7</issue>). <object-id pub-id-type="pmid">19486473</object-id></mixed-citation>
</ref>
<ref id="pcbi.1005564.ref007">
<label>7</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Baker</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Wilson</surname> <given-names>R</given-names></name>, <name name-style="western"><surname>Ikram</surname> <given-names>R</given-names></name>, <name name-style="western"><surname>Chambers</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Shoemack</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Cook</surname> <given-names>G</given-names></name>. <article-title>Regulation of Chicken Contamination Urgently Needed to Control New Zealand’s Serious Campylobacteriosis Epidemic</article-title>. <source>The New Zealand Medical Journal</source>. <year>2006</year>;.</mixed-citation>
</ref>
<ref id="pcbi.1005564.ref008">
<label>8</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Dingle</surname> <given-names>K</given-names></name>, <name name-style="western"><surname>Colles</surname> <given-names>F</given-names></name>, <name name-style="western"><surname>Wareing</surname> <given-names>D</given-names></name>, <name name-style="western"><surname>Ure</surname> <given-names>R</given-names></name>, <name name-style="western"><surname>Fox</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Bolton</surname> <given-names>F</given-names></name>, <etal>et al</etal>. <article-title>Multilocus sequence typing system for Campylobacter jejuni</article-title>. <source>Journal of Clinical Microbiology</source>. <year>2001</year>;. <comment>doi: <ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.1128/JCM.39.1.14-23.2001" xlink:type="simple">10.1128/JCM.39.1.14-23.2001</ext-link></comment></mixed-citation>
</ref>
<ref id="pcbi.1005564.ref009">
<label>9</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Urwin</surname> <given-names>R</given-names></name>, <name name-style="western"><surname>Maiden</surname> <given-names>M</given-names></name>. <article-title>Multi-locus Sequence Typing: A Tool for Global Epidemiology</article-title>. <source>Trends in Microbiology</source>. <year>2003</year>;. <comment>doi: <ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.1016/j.tim.2003.08.006" xlink:type="simple">10.1016/j.tim.2003.08.006</ext-link></comment> <object-id pub-id-type="pmid">14557031</object-id></mixed-citation>
</ref>
<ref id="pcbi.1005564.ref010">
<label>10</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Müllner</surname> <given-names>P</given-names></name>, <name name-style="western"><surname>Collins-Emerson</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Midwinter</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Carter</surname> <given-names>P</given-names></name>, <name name-style="western"><surname>Spencer</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>van der Logt</surname> <given-names>P</given-names></name>, <etal>et al</etal>. <article-title>Molecular Epidemiology of Campylobacter jejuni in a Geographically Isolated Country with a Uniquely Structured Poultry Industry</article-title>. <source>Applied and Environmental Microbiology</source>. <year>2010</year>;<volume>76</volume>(<issue>7</issue>):<fpage>2145</fpage>–<lpage>2154</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.1128/AEM.00862-09" xlink:type="simple">10.1128/AEM.00862-09</ext-link></comment> <object-id pub-id-type="pmid">20154115</object-id></mixed-citation>
</ref>
<ref id="pcbi.1005564.ref011">
<label>11</label>
<mixed-citation publication-type="book" xlink:type="simple">
<name name-style="western"><surname>French</surname> <given-names>N</given-names></name>, <name name-style="western"><surname>Marshall</surname> <given-names>J</given-names></name>. <source>Dynamic Modelling of Campylobacter Sources in the Manawatu</source>. <publisher-name>Hopkirk Institute, Massey University</publisher-name>; <year>2009</year>.</mixed-citation>
</ref>
<ref id="pcbi.1005564.ref012">
<label>12</label>
<mixed-citation publication-type="book" xlink:type="simple">
<name name-style="western"><surname>French</surname> <given-names>N</given-names></name>, <name name-style="western"><surname>Marshall</surname> <given-names>J</given-names></name>. <source>Completion of Sequence Typing of Human and Poultry Isolates and Source Attribution Modelling</source>. <publisher-name>Hopkirk Institute, Massey University</publisher-name>; <year>2013</year>.</mixed-citation>
</ref>
<ref id="pcbi.1005564.ref013">
<label>13</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Gelfand</surname> <given-names>AE</given-names></name>, <name name-style="western"><surname>Sahu</surname> <given-names>SK</given-names></name>, <name name-style="western"><surname>Carlin</surname> <given-names>BP</given-names></name>. <article-title>Efficient parameterisations for normal linear mixed models</article-title>. <source>Biometrika</source>. <year>1995</year>;<volume>82</volume>(<issue>3</issue>):<fpage>479</fpage>–<lpage>488</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.1093/biomet/82.3.479" xlink:type="simple">10.1093/biomet/82.3.479</ext-link></comment></mixed-citation>
</ref>
<ref id="pcbi.1005564.ref014">
<label>14</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Ferguson</surname> <given-names>T</given-names></name>. <article-title>Bayesian Analysis of some Nonparametric Problems</article-title>. <source>Ann Stat</source>. <year>1973</year>;<volume>1</volume>:<fpage>209</fpage>–<lpage>230</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.1214/aos/1176342360" xlink:type="simple">10.1214/aos/1176342360</ext-link></comment></mixed-citation>
</ref>
<ref id="pcbi.1005564.ref015">
<label>15</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Roberts</surname> <given-names>G</given-names></name>, <name name-style="western"><surname>Rosenthal</surname> <given-names>J</given-names></name>. <article-title>Examples of Adaptive MCMC</article-title>. <source>University of Toronto Department of Statistics</source>; <year>2006</year>.</mixed-citation>
</ref>
<ref id="pcbi.1005564.ref016">
<label>16</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Wilson</surname> <given-names>D</given-names></name>, <name name-style="western"><surname>Gabriel</surname> <given-names>E</given-names></name>, <name name-style="western"><surname>Leatherbarrow</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Cheesbrough</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Hart</surname> <given-names>C</given-names></name>, <name name-style="western"><surname>Diggle</surname> <given-names>P</given-names></name>. <article-title>Tracing the Source of Campylobacteriosis</article-title>. <source>PLoS Genetics</source>. <year>2008</year>;. <comment>doi: <ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.1371/journal.pgen.1000203" xlink:type="simple">10.1371/journal.pgen.1000203</ext-link></comment> <object-id pub-id-type="pmid">18818764</object-id></mixed-citation>
</ref>
<ref id="pcbi.1005564.ref017">
<label>17</label>
<mixed-citation publication-type="other" xlink:type="simple">Wilson D. <monospace>iSource</monospace>; 2016. Available from: <ext-link ext-link-type="uri" xlink:href="http://www.danielwilson.me.uk/iSource.html" xlink:type="simple">http://www.danielwilson.me.uk/iSource.html</ext-link>.</mixed-citation>
</ref>
<ref id="pcbi.1005564.ref018">
<label>18</label>
<mixed-citation publication-type="other" xlink:type="simple">van Pelt W, van de Giessen A, van Leeuwen W, Wannet W, Henken A, Evers E. Oorsprong, Omvang en Kosten van Humane Salmonellose. Deel1. Oorsprong van Humane Salmonellose met Betrekking tot Varken, Rund, Kip, ei en Overige Bronnen. Infectieziekten Bull. 1999;.</mixed-citation>
</ref>
<ref id="pcbi.1005564.ref019">
<label>19</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Liu</surname> <given-names>Y</given-names></name>, <name name-style="western"><surname>Gelman</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Zheng</surname> <given-names>T</given-names></name>. <article-title>Simulation-efficient Shortest Probability Intervals</article-title>. <source>Statistics and Computing</source>. <year>2015</year>;. <comment>doi: <ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.1007/s11222-015-9563-8" xlink:type="simple">10.1007/s11222-015-9563-8</ext-link></comment></mixed-citation>
</ref>
<ref id="pcbi.1005564.ref020">
<label>20</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Chen</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Shao</surname> <given-names>Q</given-names></name>. <article-title>Monte Carlo Estimation of Bayesian Credible and HPD Intervals</article-title>. <source>Journal of Computational and Graphical Statistics</source>. <year>1991</year>;.</mixed-citation>
</ref>
<ref id="pcbi.1005564.ref021">
<label>21</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Gower</surname> <given-names>JC</given-names></name>. <article-title>A general coefficient of similarity and some of its properties</article-title>. <source>Biometrics</source>. <year>1971</year>;. <comment>doi: <ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.2307/2528823" xlink:type="simple">10.2307/2528823</ext-link></comment></mixed-citation>
</ref>
<ref id="pcbi.1005564.ref022">
<label>22</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Wilson</surname> <given-names>DJ</given-names></name>, <name name-style="western"><surname>Gabriel</surname> <given-names>E</given-names></name>, <name name-style="western"><surname>Leatherbarrow</surname> <given-names>AJH</given-names></name>, <name name-style="western"><surname>Cheesbrough</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Gee</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Bolton</surname> <given-names>E</given-names></name>, <etal>et al</etal>. <article-title>Rapid Evolution and the Importance of Recombination to the Gastroenteric Pathogen Campylobacter jejuni</article-title>. <source>Molecular Biology and Evolution</source>. <year>2009</year>;<volume>26</volume>(<issue>2</issue>):<fpage>385</fpage>–<lpage>397</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.1093/molbev/msn264" xlink:type="simple">10.1093/molbev/msn264</ext-link></comment> <object-id pub-id-type="pmid">19008526</object-id></mixed-citation>
</ref>
<ref id="pcbi.1005564.ref023">
<label>23</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Duane</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Kennedy</surname> <given-names>AD</given-names></name>, <name name-style="western"><surname>Pendleton</surname> <given-names>BJ</given-names></name>, <name name-style="western"><surname>Roweth</surname> <given-names>D</given-names></name>. <article-title>Hybrid Monte Carlo</article-title>. <source>Physics Letters B</source>. <year>1987</year>;<volume>195</volume>(<issue>2</issue>):<fpage>216</fpage>–<lpage>222</lpage>. <ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.1016/0370-2693(87)91197-X" xlink:type="simple">http://dx.doi.org/10.1016/0370-2693(87)91197-X</ext-link>.</mixed-citation>
</ref>
<ref id="pcbi.1005564.ref024">
<label>24</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Hoffman</surname> <given-names>MD</given-names></name>, <name name-style="western"><surname>Gelman</surname> <given-names>A</given-names></name>. <article-title>The No-U-turn Sampler: Adaptively Setting Path Lengths in Hamiltonian Monte Carlo</article-title>. <source>J Mach Learn Res</source>. <year>2014</year>;.</mixed-citation>
</ref>
</ref-list>
</back>
</article>