<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1d3 20150301//EN" "http://jats.nlm.nih.gov/publishing/1.1d3/JATS-journalpublishing1.dtd">
<article article-type="research-article" dtd-version="1.1d3" xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="nlm-ta">PLoS Comput Biol</journal-id>
<journal-id journal-id-type="publisher-id">plos</journal-id>
<journal-id journal-id-type="pmc">ploscomp</journal-id>
<journal-title-group>
<journal-title>PLOS Computational Biology</journal-title>
</journal-title-group>
<issn pub-type="ppub">1553-734X</issn>
<issn pub-type="epub">1553-7358</issn>
<publisher>
<publisher-name>Public Library of Science</publisher-name>
<publisher-loc>San Francisco, CA USA</publisher-loc>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.1371/journal.pcbi.1009182</article-id>
<article-id pub-id-type="publisher-id">PCOMPBIOL-D-20-02147</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Research Article</subject>
</subj-group>
<subj-group subj-group-type="Discipline-v3">
<subject>Medicine and health sciences</subject><subj-group><subject>Pathology and laboratory medicine</subject><subj-group><subject>Pathogens</subject></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Heredity</subject><subj-group><subject>Genetic linkage</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Medicine and health sciences</subject><subj-group><subject>Epidemiology</subject></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Medicine and health sciences</subject><subj-group><subject>Epidemiology</subject><subj-group><subject>Genetic epidemiology</subject></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Genomics</subject></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Evolutionary biology</subject><subj-group><subject>Evolutionary systematics</subject><subj-group><subject>Phylogenetics</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Taxonomy</subject><subj-group><subject>Evolutionary systematics</subject><subj-group><subject>Phylogenetics</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Computer and information sciences</subject><subj-group><subject>Data management</subject><subj-group><subject>Taxonomy</subject><subj-group><subject>Evolutionary systematics</subject><subj-group><subject>Phylogenetics</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Mutation</subject><subj-group><subject>Substitution mutation</subject></subj-group></subj-group></subj-group></subj-group></article-categories>
<title-group>
<article-title>Sample size calculation for phylogenetic case linkage</article-title>
<alt-title alt-title-type="running-head">Sample size calculation for phylogenetic studies</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" xlink:type="simple">
<name name-style="western">
<surname>Wohl</surname>
<given-names>Shirlee</given-names>
</name>
<role content-type="https://casrai.org/credit/">Formal analysis</role>
<role content-type="https://casrai.org/credit/">Methodology</role>
<role content-type="https://casrai.org/credit/">Software</role>
<role content-type="https://casrai.org/credit/">Validation</role>
<role content-type="https://casrai.org/credit/">Visualization</role>
<role content-type="https://casrai.org/credit/">Writing – original draft</role>
<role content-type="https://casrai.org/credit/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"/>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">https://orcid.org/0000-0002-0954-4093</contrib-id>
<name name-style="western">
<surname>Giles</surname>
<given-names>John R.</given-names>
</name>
<role content-type="https://casrai.org/credit/">Formal analysis</role>
<role content-type="https://casrai.org/credit/">Methodology</role>
<role content-type="https://casrai.org/credit/">Software</role>
<role content-type="https://casrai.org/credit/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"/>
</contrib>
<contrib contrib-type="author" corresp="yes" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">https://orcid.org/0000-0002-9741-8109</contrib-id>
<name name-style="western">
<surname>Lessler</surname>
<given-names>Justin</given-names>
</name>
<role content-type="https://casrai.org/credit/">Conceptualization</role>
<role content-type="https://casrai.org/credit/">Formal analysis</role>
<role content-type="https://casrai.org/credit/">Funding acquisition</role>
<role content-type="https://casrai.org/credit/">Methodology</role>
<role content-type="https://casrai.org/credit/">Supervision</role>
<role content-type="https://casrai.org/credit/">Visualization</role>
<role content-type="https://casrai.org/credit/">Writing – original draft</role>
<role content-type="https://casrai.org/credit/">Writing – review &amp; editing</role>
<xref ref-type="corresp" rid="cor001">*</xref>
<xref ref-type="aff" rid="aff001"/>
</contrib>
</contrib-group>
<aff id="aff001"><addr-line>Johns Hopkins Bloomberg School of Public Health, Department of Epidemiology, Baltimore, Maryland, United States of America</addr-line></aff>
<contrib-group>
<contrib contrib-type="editor" xlink:type="simple">
<name name-style="western">
<surname>Pitzer</surname>
<given-names>Virginia E.</given-names>
</name>
<role>Editor</role>
<xref ref-type="aff" rid="edit1"/>
</contrib>
</contrib-group>
<aff id="edit1"><addr-line>Yale School of Public Health, UNITED STATES</addr-line></aff>
<author-notes>
<fn fn-type="conflict" id="coi001">
<p>The authors have declared that no competing interests exist.</p>
</fn>
<corresp id="cor001">* E-mail: <email xlink:type="simple">justin@jhu.edu</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>6</day>
<month>7</month>
<year>2021</year>
</pub-date>
<pub-date pub-type="collection">
<month>7</month>
<year>2021</year>
</pub-date>
<volume>17</volume>
<issue>7</issue>
<elocation-id>e1009182</elocation-id>
<history>
<date date-type="received">
<day>2</day>
<month>12</month>
<year>2020</year>
</date>
<date date-type="accepted">
<day>14</day>
<month>6</month>
<year>2021</year>
</date>
</history>
<permissions>
<copyright-year>2021</copyright-year>
<copyright-holder>Wohl et al</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">
<license-p>This is an open access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">Creative Commons Attribution License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.</license-p>
</license>
</permissions>
<self-uri content-type="pdf" xlink:href="info:doi/10.1371/journal.pcbi.1009182"/>
<abstract>
<p>Sample size calculations are an essential component of the design and evaluation of scientific studies. However, there is a lack of clear guidance for determining the sample size needed for phylogenetic studies, which are becoming an essential part of studying pathogen transmission. We introduce a statistical framework for determining the number of true infector-infectee transmission pairs identified by a phylogenetic study, given the size and population coverage of that study. We then show how characteristics of the criteria used to determine linkage and aspects of the study design can influence our ability to correctly identify transmission links, in sometimes counterintuitive ways. We test the overall approach using outbreak simulations and provide guidance for calculating the sensitivity and specificity of the linkage criteria, the key inputs to our approach. The framework is freely available as the R package <italic>phylosamp</italic>, and is broadly applicable to designing and evaluating a wide array of pathogen phylogenetic studies.</p>
</abstract>
<abstract abstract-type="summary">
<title>Author summary</title>
<p>Sequencing the genetic material of viral and bacterial pathogens has become an important part of tracking and combating human infectious diseases. Specifically, comparing the pathogen DNA or RNA sequences collected from infected individuals can allow researchers and public health experts to determine who infected whom, or detect when a pathogen entered a specific country or geographic area. However, it is often impossible to collect samples from every single infected person, and these missing sequences can pose problems for this type of analysis, especially if there is some bias behind which samples were selected for sequencing. We have developed a mathematical framework that allows users to determine the probability their conclusions about pathogen transmission are correct given the number and proportion of samples from a pathogen outbreak they have sequenced. This framework is freely available, easy to use, and broadly generalizable to any pathogen, and we hope that it can be used to inform the design and sampling strategies behind future sequencing-based studies.</p>
</abstract>
<funding-group>
<award-group id="award001">
<funding-source>
<institution-wrap>
<institution-id institution-id-type="funder-id">http://dx.doi.org/10.13039/100000865</institution-id>
<institution>Bill and Melinda Gates Foundation</institution>
</institution-wrap>
</funding-source>
<award-id>OPP1195157</award-id>
<principal-award-recipient>
<contrib-id authenticated="true" contrib-id-type="orcid">https://orcid.org/0000-0002-9741-8109</contrib-id>
<name name-style="western">
<surname>Lessler</surname>
<given-names>Justin</given-names>
</name>
</principal-award-recipient>
</award-group>
<funding-statement>Funding was provided by Bill and Melinda Gates Foundation OPP1195157 (S.W. and J.L.). The funders had no role in study design, data collection and analysis, decision to publish, or preparation of the manuscript.</funding-statement>
</funding-group>
<counts>
<fig-count count="6"/>
<table-count count="3"/>
<page-count count="22"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>PLOS Publication Stage</meta-name>
<meta-value>vor-update-to-uncorrected-proof</meta-value>
</custom-meta>
<custom-meta>
<meta-name>Publication Update</meta-name>
<meta-value>2021-07-16</meta-value>
</custom-meta>
<custom-meta id="data-availability">
<meta-name>Data Availability</meta-name>
<meta-value>All code and simulation data are available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/HopkinsIDD/phylosamplesize" xlink:type="simple">https://github.com/HopkinsIDD/phylosamplesize</ext-link>.</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="sec001" sec-type="intro">
<title>Introduction</title>
<p>As the cost of pathogen sequencing has declined, the number and size of studies based on pathogen sequence analysis have increased dramatically [<xref ref-type="bibr" rid="pcbi.1009182.ref001">1</xref>]. Traditionally, researchers have sequenced convenience samples collected as part of routine clinical or public health activities (e.g., diagnostic specimens collected as part of an outbreak response), or as part of studies where specimens are collected for other purposes. However, the analysis of pathogen genomic sequences is increasingly becoming a primary goal of both research studies and public health surveillance efforts [<xref ref-type="bibr" rid="pcbi.1009182.ref002">2</xref>–<xref ref-type="bibr" rid="pcbi.1009182.ref005">5</xref>]. This shift has been driven by the apparent utility of pathogen sequence data for understanding aspects of pathogen spread ranging from the frequency and source of introductions into a region [<xref ref-type="bibr" rid="pcbi.1009182.ref006">6</xref>–<xref ref-type="bibr" rid="pcbi.1009182.ref010">10</xref>], to identifying endogenous spread of emerging diseases [<xref ref-type="bibr" rid="pcbi.1009182.ref011">11</xref>,<xref ref-type="bibr" rid="pcbi.1009182.ref012">12</xref>], to understanding the role of “hotspots” in maintaining broader community epidemics [<xref ref-type="bibr" rid="pcbi.1009182.ref013">13</xref>], to understanding transmission patterns at an individual or “microscale” level [<xref ref-type="bibr" rid="pcbi.1009182.ref003">3</xref>,<xref ref-type="bibr" rid="pcbi.1009182.ref014">14</xref>].</p>
<p>Despite these many examples, there is a lack of clear and accessible guidance for appropriately designing and sizing studies aimed at understanding pathogen transmission, or for evaluating the design and conclusions of past studies. Without such guidance, it is difficult for researchers to design studies in a way that maximizes the chances of success, and difficult for reviewers to appropriately evaluate papers and grant applications centered around molecular or phylogenetic outcomes [<xref ref-type="bibr" rid="pcbi.1009182.ref015">15</xref>,<xref ref-type="bibr" rid="pcbi.1009182.ref016">16</xref>]. In particular, undersampling or biased sampling can lead to poorly supported inferences about patterns of disease spread [<xref ref-type="bibr" rid="pcbi.1009182.ref017">17</xref>,<xref ref-type="bibr" rid="pcbi.1009182.ref018">18</xref>]. While there are examples of researchers conducting careful <italic>a priori</italic> analyses of sampling strategies [<xref ref-type="bibr" rid="pcbi.1009182.ref019">19</xref>–<xref ref-type="bibr" rid="pcbi.1009182.ref021">21</xref>], these have largely relied on sophisticated techniques that are not broadly generalizable. Hence, there is a need for broadly accepted and accessible guidance for the selection of specimens for sequencing and phylogenetic analyses.</p>
<p>As noted above, pathogen sequences have been used to understand multiple aspects of infectious disease transmission at scales ranging from the global (e.g., movement of pathogens between countries) to the individual (e.g., reconstruction of individual transmission chains). Arguably, all such analyses can be reduced to the basic question of whether pairs of infected units (individuals, locations, etc.) are related or connected within a particular number of generations of transmission. Therefore, developing tools for assessing the number of sequences needed to confidently identify linked individuals (infections separated by no more than a specific number of generations of transmission) is a good place to start building a theory for power calculations for phylogenetic inference that can later be applied to questions at vastly different spatial or temporal scales. In this paper, we present a framework for making critical decisions about study design when the goal is to identify infector-infectee pairs, and we illustrate this approach with simulation studies.</p>
</sec>
<sec id="sec002" sec-type="materials|methods">
<title>Methods</title>
<sec id="sec003">
<title>General principles</title>
<p>In this paper we will focus on studies that aim to identify infector-infectee pairs from phylogenetic analysis of pathogen sequence data collected from infected individuals. We assume the study aims to achieve some level of certainty that identified infector-infectee pairs are correct, and may also require identification of some minimum number of pairs. Below we lay out a precise terminology (<bold><xref ref-type="table" rid="pcbi.1009182.t001">Table 1</xref></bold>) and general principles.</p>
<table-wrap id="pcbi.1009182.t001" position="float">
<object-id pub-id-type="doi">10.1371/journal.pcbi.1009182.t001</object-id>
<label>Table 1</label> <caption><title>Parameters used in calculations and simulations.</title></caption>
<alternatives>
<graphic id="pcbi.1009182.t001g" mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1009182.t001" xlink:type="simple"/>
<table>
<colgroup>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
</colgroup>
<thead>
<tr>
<th align="left" style="background-color:#EFEFEF">Parameter</th>
<th align="left" style="background-color:#EFEFEF">Description</th>
</tr>
</thead>
<tbody>
<tr>
<td align="center"><italic>M</italic></td>
<td align="left">Number of infections sampled</td>
</tr>
<tr>
<td align="center"><italic>N</italic></td>
<td align="left">Total number of (relevant) infected individuals in an outbreak</td>
</tr>
<tr>
<td align="center"><italic>ρ</italic></td>
<td align="left">Proportion of outbreak infections sampled (<italic>M</italic>/<italic>N</italic>)</td>
</tr>
<tr>
<td align="center"><italic>η</italic></td>
<td align="left">Sensitivity of the linkage criteria</td>
</tr>
<tr>
<td align="center"><italic>χ</italic></td>
<td align="left">Specificity of the linkage criteria</td>
</tr>
<tr>
<td align="center"><italic>ϕ</italic></td>
<td align="left">Probability that an identified link represents a true transmission event (1-False Discovery Rate)</td>
</tr>
<tr>
<td align="center"><italic>R</italic></td>
<td align="left">Reproductive number of a pathogen</td>
</tr>
<tr>
<td align="center"><italic>R</italic><sub>pop</sub></td>
<td align="left">Average reproductive number of a pathogen in a finite population (always &lt;1)</td>
</tr>
<tr>
<td align="center"><italic>μ</italic></td>
<td align="left">Substitution rate of the pathogen (in substitutions observed per genome per transmission event)</td>
</tr>
</tbody>
</table>
</alternatives>
</table-wrap>
<p>To start, we define the term <italic>linkage criteria</italic> to represent all the criteria used to infer whether a set of infected individuals are linked to one another by direct transmission. The <italic>linkage criteria</italic> can be derived from a combination of genetic distance between pathogens isolated from different individuals, tree structure (e.g., clade support), and epidemiologic information (e.g., relative dates of symptom onset). We refer to infections inferred to be connected by transmission using these criteria as <italic>linked pairs</italic>. Some of these linked pairs will represent actual transmission events (<italic>true transmission pairs</italic>) and some will be false positives. We want to determine the sample size (<italic>M</italic>) and proportion of the population (<italic>ρ</italic>) required to recover a predetermined number of linked pairs, while keeping the <italic>false discovery rate</italic> (the proportion of these linked pairs that are false positives) below a predetermined threshold. When applied to a study where design was dictated by other factors (e.g., specimen availability), the same methods can be used to determine the <italic>false discovery rate</italic>, which will inform the confidence we have in any conclusions about disease transmission in that study.</p>
<p>To capture <italic>true transmission pairs</italic>, the infector and their partner infectee must both be in the sample. Therefore, correctly identifying direct transmission links (and, conversely, calculating the false discovery rate) depends on the sampling fraction (<italic>ρ</italic>), which is equal to the sample size (<italic>M</italic>) divided by the total number of infected individuals in the relevant population (<italic>N</italic>). Identification of these links will further depend on the <italic>sensitivity</italic> (<italic>η</italic>) and <italic>specificity</italic> (<italic>χ</italic>) of the criteria used to define linkage. We define sensitivity as the probability that the linkage criteria will identify a true transmission pair as a linked pair given that both the infector and infectee are in the sample. Similarly, the specificity is the probability that two infections not linked by transmission are not linked by the linkage criteria.</p>
<p>Here we show that, if we have reasonable estimates of the sampling fraction, sensitivity, and specificity, we can, for a sample of size <italic>M</italic>, estimate the false discovery rate. The relationship between these parameters can then be used to design studies with a sample size and sampling fraction that minimize the false discovery rate and therefore maximize our ability to draw inferences from identified infections.</p>
</sec>
<sec id="sec004">
<title>Calculating sample size and false discovery rate</title>
<sec id="sec005">
<title>Multiple links and multiple true transmissions</title>
<p>In most transmission scenarios, we will be interested in linking an infected individual to both their infector and anyone they infect. Therefore, we must account for the fact that each infection in an outbreak may be linked by transmission to multiple other infections, only some of which may have been sampled. If the goal is to identify all true transmission pairs in the sample, the linkage criteria used must similarly allow for each infection to be linked to multiple other infections. Given this, we can calculate the probability of correctly identifying a true transmission pair, <italic>ϕ</italic> (equal to one minus the false discovery rate), as a function of just the sensitivity and specificity of the linkage criteria, the proportion sampled, and the sample size. Conceptually, this probability of correctly identifying a transmission pair is equal to the number of true positives (correctly identified true transmission pairs) divided by the total number of positives (linked pairs, regardless of true transmission status):
<disp-formula id="pcbi.1009182.e001">
<alternatives>
<graphic id="pcbi.1009182.e001g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1009182.e001" xlink:type="simple"/>
<mml:math display="block" id="M1">
<mml:mi>ϕ</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi mathvariant="normal">True</mml:mi><mml:mspace width="0.25em"/><mml:mi mathvariant="normal">Positives</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">True</mml:mi><mml:mspace width="0.25em"/><mml:mi mathvariant="normal">Positives</mml:mi><mml:mo>+</mml:mo><mml:mi mathvariant="normal">False</mml:mi><mml:mspace width="0.25em"/><mml:mi mathvariant="normal">Positives</mml:mi></mml:mrow></mml:mfrac>
</mml:math>
</alternatives>
</disp-formula></p>
<p>Because we allow each infection to have multiple transmission partners, this probability will also depend on the average number of transmission links per infection, which is determined by the epidemiological parameter <italic>R</italic>, the expected number of other individuals each infected individual infects in a fully susceptible population. However, sampling infections over a finite period of time produces a bounded sampling frame, in which the average number of infectees per infector, denoted <italic>R</italic><sub>pop</sub>, may differ from <italic>R</italic>. This is because terminal nodes in the transmission network within this finite sampling frame are presumed to have no known child infections. These nodes (which may or may not have child infections outside the sampling frame) therefore contribute an <italic>R</italic> value of zero, decreasing the average number of infectees per infector. In fact, <italic>R</italic><sub>pop</sub> must be less than 1 (see ‘Estimating the average reproductive number’ below). Because each infection is linked to, on average, <italic>R</italic><sub>pop</sub> infectees as well as its infector, each infection has <italic>R</italic><sub>pop</sub>+1 true transmission partners. If we assume that the number of transmission partners per infection is Poisson distributed, we get the following equation for the true discovery rate, <italic>ϕ</italic> (see <bold><xref ref-type="supplementary-material" rid="pcbi.1009182.s015">S1 Text</xref></bold> for full derivation):
<disp-formula id="pcbi.1009182.e002">
<alternatives>
<graphic id="pcbi.1009182.e002g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1009182.e002" xlink:type="simple"/>
<mml:math display="block" id="M2">
<mml:mi>ϕ</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>η</mml:mi><mml:mi>ρ</mml:mi><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">pop</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mi>η</mml:mi><mml:mi>ρ</mml:mi><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">pop</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo>)</mml:mo><mml:mo>+</mml:mo><mml:mo>(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>−</mml:mo><mml:mi>χ</mml:mi></mml:mrow><mml:mo>)</mml:mo><mml:mo>(</mml:mo><mml:mrow><mml:mi>M</mml:mi><mml:mo>−</mml:mo><mml:mi>ρ</mml:mi><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">pop</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo>)</mml:mo><mml:mo>−</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mfrac>
</mml:math>
</alternatives>
<label>(1)</label>
</disp-formula></p>
<p>Under the same assumptions, we show that the total number of sampled true pairs, <inline-formula id="pcbi.1009182.e003"><alternatives><graphic id="pcbi.1009182.e003g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1009182.e003" xlink:type="simple"/><mml:math display="inline" id="M3"><mml:mi mathvariant="double-struck">E</mml:mi><mml:mo>[</mml:mo><mml:mi mathvariant="normal">number</mml:mi><mml:mspace width="0.25em"/><mml:mi mathvariant="normal">of</mml:mi><mml:mspace width="0.25em"/><mml:mi mathvariant="normal">true</mml:mi><mml:mspace width="0.25em"/><mml:mi mathvariant="normal">pairs</mml:mi><mml:mo>]</mml:mo></mml:math></alternatives></inline-formula>, can be calculated as:
<disp-formula id="pcbi.1009182.e004">
<alternatives>
<graphic id="pcbi.1009182.e004g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1009182.e004" xlink:type="simple"/>
<mml:math display="block" id="M4">
<mml:mi mathvariant="double-struck">E</mml:mi><mml:mo>[</mml:mo><mml:mrow><mml:mi mathvariant="normal">number</mml:mi><mml:mspace width="0.25em"/><mml:mi mathvariant="normal">of</mml:mi><mml:mspace width="0.25em"/><mml:mi mathvariant="normal">true</mml:mi><mml:mspace width="0.25em"/><mml:mi mathvariant="normal">pairs</mml:mi></mml:mrow><mml:mo>]</mml:mo><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>M</mml:mi><mml:mi>ρ</mml:mi><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">pop</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo>)</mml:mo><mml:mi>η</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:mfrac>
</mml:math>
</alternatives>
</disp-formula></p>
<p>Through algebraic rearrangement of these equations we can determine the expected number of pairs observed in this sample, <inline-formula id="pcbi.1009182.e005"><alternatives><graphic id="pcbi.1009182.e005g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1009182.e005" xlink:type="simple"/><mml:math display="inline" id="M5"><mml:mi mathvariant="double-struck">E</mml:mi><mml:mo>[</mml:mo><mml:mi mathvariant="normal">number</mml:mi><mml:mspace width="0.25em"/><mml:mi mathvariant="normal">of</mml:mi><mml:mspace width="0.25em"/><mml:mi mathvariant="normal">pairs</mml:mi><mml:mspace width="0.25em"/><mml:mi mathvariant="normal">observed</mml:mi><mml:mo>]</mml:mo></mml:math></alternatives></inline-formula>:
<disp-formula id="pcbi.1009182.e006">
<alternatives>
<graphic id="pcbi.1009182.e006g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1009182.e006" xlink:type="simple"/>
<mml:math display="block" id="M6">
<mml:mi mathvariant="double-struck">E</mml:mi><mml:mo>[</mml:mo><mml:mrow><mml:mi mathvariant="normal">number</mml:mi><mml:mspace width="0.25em"/><mml:mi mathvariant="normal">of</mml:mi><mml:mspace width="0.25em"/><mml:mi mathvariant="normal">pairs</mml:mi><mml:mspace width="0.25em"/><mml:mi mathvariant="normal">observed</mml:mi></mml:mrow><mml:mo>]</mml:mo><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:mfrac><mml:mo>[</mml:mo><mml:mrow><mml:mi>η</mml:mi><mml:mi>ρ</mml:mi><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">pop</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo>)</mml:mo><mml:mo>+</mml:mo><mml:mo>(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>−</mml:mo><mml:mi>χ</mml:mi></mml:mrow><mml:mo>)</mml:mo><mml:mo>(</mml:mo><mml:mrow><mml:mi>M</mml:mi><mml:mo>−</mml:mo><mml:mi>ρ</mml:mi><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">pop</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo>)</mml:mo><mml:mo>−</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>]</mml:mo>
</mml:math>
</alternatives>
</disp-formula></p>
<p>These equations can be used to determine the false discovery rate (1−<italic>ϕ</italic>) and the expected number of linked pairs given particular linkage criteria, sample size, and sampling proportion. Additionally, we can use these equations to observe how the expected number of links and the true discovery rate vary with the proportion sampled and the sample size (<bold><xref ref-type="fig" rid="pcbi.1009182.g001">Fig 1A</xref></bold>). For a given sensitivity and specificity of the linkage criteria, we observe that the false discovery rate <italic>increases</italic> with sample size if the proportion sampled remains constant, suggesting that studies aimed at correctly identifying the highest proportion of transmission links should prioritize sampling proportion over an arbitrary number of samples. Additionally, the relationship between false discovery rate and sampling proportion is dependent on the sample size needed to obtain that sampling proportion such that the impact of sampling proportion increases with sample size. We also observe the effects of changing sensitivity and specificity on the false discovery rate and find that the specificity of the linkage criteria is of key importance when attempting to minimize the false discovery rate of transmission pairs (<bold><xref ref-type="fig" rid="pcbi.1009182.g001">Fig 1B</xref></bold>).</p>
<fig id="pcbi.1009182.g001" position="float">
<object-id pub-id-type="doi">10.1371/journal.pcbi.1009182.g001</object-id>
<label>Fig 1</label>
<caption>
<title>Sample size and false discovery rate given multiple linkage and multiple transmissions.</title>
<p>(<bold>A</bold>) Effect of sample size (red lines) or proportion sampled (blue lines) on the expected number of linked pairs (upper plots) or the false discovery rate of linked pairs (lower plots). The specificity and sensitivity are held constant. (<bold>B</bold>) Effect of varying the sensitivity and specificity of the linkage criteria on the false discovery rate (FDR). White dots: theoretical sensitivity and specificity values at different genetic distance thresholds (1–10 substitutions between infections; leftmost white dot represents a threshold of 1 substitution) for a hypothetical pathogen with substitution rate = 1 substitution/genome/transmission and <italic>R</italic> = 2 (see ‘Determining sensitivity and specificity’ below for details). In both panels, <italic>R</italic><sub><italic>pop</italic></sub> = 1.</p>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1009182.g001" xlink:type="simple"/>
</fig>
</sec>
<sec id="sec006">
<title>Single link and single true transmission</title>
<p>We can also derive the relationship between the sample size and false discovery rate for the special case where each infection is the transmission pair of exactly one other sample, relevant when we are only interested in identifying the correct infector of a given infection. In this case, the linkage criteria will similarly identify exactly one probable link for each infection [<xref ref-type="bibr" rid="pcbi.1009182.ref015">15</xref>]. These assumptions about transmission simplify the relationship between sample size and false discovery rate. Here, we calculate the false discovery rate for transmission pairs under these assumptions (see <bold><xref ref-type="supplementary-material" rid="pcbi.1009182.s015">S1 Text</xref></bold> for full derivation).</p>
<p>The probability of correctly identifying a true transmission pair (<italic>ϕ</italic>) under the assumptions of single transmission and single linkage is:
<disp-formula id="pcbi.1009182.e007">
<alternatives>
<graphic id="pcbi.1009182.e007g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1009182.e007" xlink:type="simple"/>
<mml:math display="block" id="M7">
<mml:mi>ϕ</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>η</mml:mi><mml:mi>ρ</mml:mi></mml:mrow><mml:mrow><mml:mi>η</mml:mi><mml:mi>ρ</mml:mi><mml:mo>+</mml:mo><mml:mo>(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>−</mml:mo><mml:msup><mml:mrow><mml:mi>χ</mml:mi></mml:mrow><mml:mrow><mml:mi>M</mml:mi><mml:mo>−</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow><mml:mo>)</mml:mo><mml:mo>(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>−</mml:mo><mml:mi>η</mml:mi></mml:mrow><mml:mo>)</mml:mo><mml:mi>ρ</mml:mi><mml:mo>+</mml:mo><mml:mo>(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>−</mml:mo><mml:msup><mml:mrow><mml:mi>χ</mml:mi></mml:mrow><mml:mrow><mml:mi>M</mml:mi><mml:mo>−</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup></mml:mrow><mml:mo>)</mml:mo><mml:mo>(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>−</mml:mo><mml:mi>ρ</mml:mi></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mfrac>
</mml:math>
</alternatives>
<label>(2)</label>
</disp-formula></p>
<p>Under the same assumptions, we can also calculate the expected total number of true transmission pairs that will be identified in our sample, <inline-formula id="pcbi.1009182.e008"><alternatives><graphic id="pcbi.1009182.e008g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1009182.e008" xlink:type="simple"/><mml:math display="inline" id="M8"><mml:mi mathvariant="double-struck">E</mml:mi><mml:mo>[</mml:mo><mml:mi mathvariant="normal">number</mml:mi><mml:mspace width="0.25em"/><mml:mi mathvariant="normal">of</mml:mi><mml:mspace width="0.25em"/><mml:mi mathvariant="normal">true</mml:mi><mml:mspace width="0.25em"/><mml:mi mathvariant="normal">pairs</mml:mi><mml:mo>]</mml:mo></mml:math></alternatives></inline-formula>, as:
<disp-formula id="pcbi.1009182.e009">
<alternatives>
<graphic id="pcbi.1009182.e009g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1009182.e009" xlink:type="simple"/>
<mml:math display="block" id="M9">
<mml:mi mathvariant="double-struck">E</mml:mi><mml:mo>[</mml:mo><mml:mrow><mml:mi mathvariant="normal">number</mml:mi><mml:mspace width="0.25em"/><mml:mi mathvariant="normal">of</mml:mi><mml:mspace width="0.25em"/><mml:mi mathvariant="normal">true</mml:mi><mml:mspace width="0.25em"/><mml:mi mathvariant="normal">pairs</mml:mi></mml:mrow><mml:mo>]</mml:mo><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:mfrac><mml:mi>η</mml:mi><mml:mi>ρ</mml:mi>
</mml:math>
</alternatives>
</disp-formula></p>
<p>Through algebraic rearrangement of these equations, we can determine the expected number of linked pairs (identified with the linkage criteria) observed in this sample (<inline-formula id="pcbi.1009182.e010"><alternatives><graphic id="pcbi.1009182.e010g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1009182.e010" xlink:type="simple"/><mml:math display="inline" id="M10"><mml:mi mathvariant="double-struck">E</mml:mi><mml:mo>[</mml:mo><mml:mi mathvariant="normal">number</mml:mi><mml:mspace width="0.25em"/><mml:mi mathvariant="normal">of</mml:mi><mml:mspace width="0.25em"/><mml:mi mathvariant="normal">pairs</mml:mi><mml:mspace width="0.25em"/><mml:mi mathvariant="normal">observed</mml:mi><mml:mo>]</mml:mo></mml:math></alternatives></inline-formula>):
<disp-formula id="pcbi.1009182.e011">
<alternatives>
<graphic id="pcbi.1009182.e011g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1009182.e011" xlink:type="simple"/>
<mml:math display="block" id="M11">
<mml:mi mathvariant="double-struck">E</mml:mi><mml:mo>[</mml:mo><mml:mrow><mml:mi mathvariant="normal">number</mml:mi><mml:mspace width="0.25em"/><mml:mi mathvariant="normal">of</mml:mi><mml:mspace width="0.25em"/><mml:mi mathvariant="normal">pairs</mml:mi><mml:mspace width="0.25em"/><mml:mi mathvariant="normal">observed</mml:mi></mml:mrow><mml:mo>]</mml:mo><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:mfrac><mml:mo>[</mml:mo><mml:mrow><mml:mi>η</mml:mi><mml:mi>ρ</mml:mi><mml:mo>+</mml:mo><mml:mo>(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>−</mml:mo><mml:msup><mml:mrow><mml:mi>χ</mml:mi></mml:mrow><mml:mrow><mml:mi>M</mml:mi><mml:mo>−</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow><mml:mo>)</mml:mo><mml:mo>(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>−</mml:mo><mml:mi>η</mml:mi></mml:mrow><mml:mo>)</mml:mo><mml:mi>ρ</mml:mi><mml:mo>+</mml:mo><mml:mo>(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>−</mml:mo><mml:msup><mml:mrow><mml:mi>χ</mml:mi></mml:mrow><mml:mrow><mml:mi>M</mml:mi><mml:mo>−</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup></mml:mrow><mml:mo>)</mml:mo><mml:mo>(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>−</mml:mo><mml:mi>ρ</mml:mi></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>]</mml:mo>
</mml:math>
</alternatives>
</disp-formula></p>
<p>As in the multiple links and multiple transmissions case, we observe that the false discovery rate increases with the sample size, but decreases with the proportion sampled. We also again see the important effect of the specificity of the linkage criteria on the false discovery rate (<bold><xref ref-type="supplementary-material" rid="pcbi.1009182.s001">S1 Fig</xref></bold>). The relationships between these parameters and our ability to correctly identify transmission links are clearly robust to transmission model specification.</p>
</sec>
</sec>
<sec id="sec007">
<title>Estimating the average reproductive number</title>
<p>In the previous section, we distinguished <italic>R</italic>, the basic reproductive number of a pathogen, from <italic>R</italic><sub>pop</sub>, the <italic>average</italic> reproductive number in a bounded sampling frame. This is an important distinction because we can show that the average reproductive number (<italic>R</italic><sub>pop</sub>) is at most one. This is because any sampling frame contains a finite number of infected individuals, and individuals on terminal nodes of the captured transmission chain have not, by definition, infected any other individuals within the sampling frame (though they may have passed the infection to others outside the finite sample). Averaging the <italic>R</italic> value from these terminal nodes (which is zero, because they are terminal nodes) with the R value from all other nodes is what allows the <italic>R</italic><sub>pop</sub> average to drop below one, even when the true value of R is significantly greater than one. In other words, there will always be more infections (at minimum, all infectees in a transmission chain plus a single index case) than infection events (see <bold><xref ref-type="supplementary-material" rid="pcbi.1009182.s002">S2A Fig</xref></bold>). Hence, <italic>R</italic><sub>pop</sub>, which is equal to the number of actual transmission events divided by the number of infections, will be at most one.</p>
<p>In epidemic situations where there is a single introduction, <italic>R</italic><sub>pop</sub> will be close to one, as the number of infections will exceed the number of infection events by precisely one. In situations where there are multiple introductions (e.g., transmission chains that are persistently seeded from sources outside the sampling frame) then <italic>R</italic><sub>pop</sub> may be substantially less than one (<bold><xref ref-type="supplementary-material" rid="pcbi.1009182.s002">S2B Fig</xref></bold>). Specifically:
<disp-formula id="pcbi.1009182.e012">
<alternatives>
<graphic id="pcbi.1009182.e012g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1009182.e012" xlink:type="simple"/>
<mml:math display="block" id="M12">
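<mml:msub><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">pop</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi mathvariant="normal">cases</mml:mi><mml:mo>−</mml:mo><mml:mi mathvariant="normal">introductions</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">cases</mml:mi></mml:mrow></mml:mfrac>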
</mml:math>
</alternatives>
</disp-formula></p>
<p>The examples shown in this paper focus on epidemics seeded by a single introduction, where <italic>R</italic><sub>pop</sub> is approximately equal to one.</p>
</sec>
<sec id="sec008">
<title>Determining sensitivity and specificity</title>
<p>In the framework presented here, the sensitivity and specificity of the linkage criteria are needed to estimate the false discovery rate from sample size and vice versa. This criteria can be based on a number of phylogenetic and epidemiological metrics, and may depend on the data available for a particular study. In this section, we outline two methods for approximating the sensitivity and specificity of a simple genomic metric: genetic distance.</p>
<p>Both methods involve determining these parameters from the discrete distributions of genetic distances between linked and unlinked infections, but they differ in how these distributions are obtained. Given the distributions, we can consider a number of different genetic distance thresholds (e.g., 1 or 2 mutations observed between sequences) that could be used as the criteria for differentiating between linked and unlinked pairs, and we can calculate the sensitivity and specificity at each. The optimal threshold and its associated sensitivity and specificity can be selected in a variety of ways [<xref ref-type="bibr" rid="pcbi.1009182.ref022">22</xref>–<xref ref-type="bibr" rid="pcbi.1009182.ref025">25</xref>] based on the specific study goals.</p>
<p>Below, we describe two ways to obtain the genetic distance distributions of linked and unlinked infection pairs for a hypothetical pathogen with <italic>R</italic> = 2 and a substitution rate (<italic>μ</italic>) of 1 substitution per genome per generation. We use the substitution rate rather than the pathogen mutation rate because our method concerns mutations <italic>observed</italic> between pathogen transmission events. We then use these genetic distance distributions to determine sensitivity and specificity, and ultimately to calculate the false discovery rate given a specific sample size and proportion. Here and henceforth, “generation” refers to a generation of transmission (not viral replication time).</p>
</sec>
<sec id="sec009">
<title>Empirical method</title>
<p>One way to estimate the relevant genetic distance distributions is to use existing data. Specifically, we need a subsample of infections for which sequencing data is available and for which we have a high degree of confidence—based on epidemiological data—in the true transmission relationships between included infections. For example, we might contrast infected individuals who share a household with community members who have no known relationship. We can compute the genetic distance between every pair of pathogen sequences from this subsample and use the results to approximate the underlying genetic distance distributions between linked and unlinked infections in the population.</p>
<p>We illustrate this method on a simulated outbreak of approximately 1500 infections (data available at <ext-link ext-link-type="uri" xlink:href="https://github.com/HopkinsIDD/phylosamplesize" xlink:type="simple">https://github.com/HopkinsIDD/phylosamplesize</ext-link>), created using the <italic>outbreaker</italic> R package [<xref ref-type="bibr" rid="pcbi.1009182.ref026">26</xref>,<xref ref-type="bibr" rid="pcbi.1009182.ref027">27</xref>] (see ‘Outbreak simulations’ below). To create our known subsample, we selected a small number of infections from early in the outbreak and extracted their true transmission links and simulated genomes. We then calculated the genetic distance matrix of sequences in this subsample and determined the genetic distance distributions for linked and unlinked infection pairs (<bold><xref ref-type="fig" rid="pcbi.1009182.g002">Fig 2A</xref></bold>). Next, we estimated the sensitivity and specificity at every mutation threshold (0 mutations, 1 mutation, etc.) and used the point closest to the (0,1) corner to determine the optimal threshold for differentiating between linked and unlinked infections. In this case, the optimal threshold was 3 mutations, which had a sensitivity of 0.95 and a specificity of 0.88.</p>
<fig id="pcbi.1009182.g002" position="float">
<object-id pub-id-type="doi">10.1371/journal.pcbi.1009182.g002</object-id>
<label>Fig 2</label>
<caption>
<title>Determining the sensitivity and specificity of a genetic distance threshold.</title>
<p>(<bold>A</bold>) Empirical distribution of genetic distances for linked (purple) and unlinked (yellow) infections for 50 infections selected from early in a simulated outbreak (<italic>μ</italic> = 1 substitution/genome/generation, <italic>R</italic> = 2). Inset: receiver operating characteristic (ROC) for all possible genetic distance thresholds. Optimal threshold shown as green dot (ROC) and dashed vertical line (distribution). (<bold>B</bold>) Estimated distribution of genetic distances for linked and unlinked infections generated by the substitution rate method. Parameters and plots are as in (A).</p>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1009182.g002" xlink:type="simple"/>
</fig>
<sec id="sec010">
<title>Substitution rate method</title>
<p>Observed pathogen substitution rates can also be used to estimate the genetic distance distributions, especially when a subsample of infections with known transmission histories is not available. If we assume that the number of mutations observed between two linked infections is Poisson distributed around the substitution rate and that we know the distribution of the number of generations between infections in the population, the probability of observing a specific genetic distance (<italic>d</italic>) between the sequences from any two infected individuals linked by transmission is:
<disp-formula id="pcbi.1009182.e013">
<alternatives>
<graphic id="pcbi.1009182.e013g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1009182.e013" xlink:type="simple"/>
<mml:math display="block" id="M13">
<mml:mrow><mml:mfrac><mml:mn>1</mml:mn><mml:mrow><mml:mstyle displaystyle="true"><mml:munderover><mml:mo>∑</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mi>g</mml:mi><mml:mrow><mml:mi mathvariant="normal">link</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:munderover><mml:mrow><mml:mi>g</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mstyle></mml:mrow></mml:mfrac><mml:mstyle displaystyle="true"><mml:munderover><mml:mo>∑</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mi>g</mml:mi><mml:mrow><mml:mi mathvariant="normal">link</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:munderover><mml:mrow><mml:mi>g</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>⋅</mml:mo><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>d</mml:mi><mml:mo>;</mml:mo><mml:mi>i</mml:mi><mml:mo>⋅</mml:mo><mml:mi>μ</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mstyle></mml:mrow>
</mml:math>
</alternatives>
<label>(3)</label>
</disp-formula>
where <italic>g</italic>(<italic>i</italic>) is the probability of observing <italic>i</italic> generations between infections, <italic>g</italic><sub>link</sub> is the maximum number of generations between infections considered linked, <italic>f</italic>(<italic>d</italic>;<italic>i</italic>∙<italic>μ</italic>) is the probability of observing <italic>d</italic> mutations between two infections separated by <italic>i</italic> generations, and <italic>μ</italic> is the substitution rate per genome per generation (see <bold><xref ref-type="supplementary-material" rid="pcbi.1009182.s016">S2 Text</xref></bold>).</p>
<p>Similarly, the probability of observing a genetic distance <italic>d</italic> between two infections not linked by transmission is:
<disp-formula id="pcbi.1009182.e014">
<alternatives>
<graphic id="pcbi.1009182.e014g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1009182.e014" xlink:type="simple"/>
<mml:math display="block" id="M14">
<mml:mrow><mml:mfrac><mml:mn>1</mml:mn><mml:mrow><mml:mstyle displaystyle="true"><mml:munderover><mml:mo>∑</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:msub><mml:mi>g</mml:mi><mml:mrow><mml:mi mathvariant="normal">link</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mi>g</mml:mi><mml:mrow><mml:mi>max</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:munderover><mml:mrow><mml:mi>g</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mstyle></mml:mrow></mml:mfrac><mml:mstyle displaystyle="true"><mml:munderover><mml:mo>∑</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:msub><mml:mi>g</mml:mi><mml:mrow><mml:mi mathvariant="normal">link</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mi>g</mml:mi><mml:mrow><mml:mi>max</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:munderover><mml:mrow><mml:mi>g</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mstyle><mml:mo>⋅</mml:mo><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>d</mml:mi><mml:mo>;</mml:mo><mml:mi>i</mml:mi><mml:mo>⋅</mml:mo><mml:mi>μ</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow>
</mml:math>
</alternatives>
<label>(4)</label>
</disp-formula></p>
<p>Where <italic>g</italic><sub>max</sub> is the maximum number of generations considered.</p>
<p>Since we assume that the number of substitutions between two linked infections is Poisson distributed, <italic>f</italic>(<italic>d</italic>;<italic>i</italic>∙<italic>μ</italic>) is simply the probability mass function of a Poisson distribution with mean <italic>i</italic>×<italic>μ</italic>. Determining the distribution of generations between infections, however, is a non-trivial task [<xref ref-type="bibr" rid="pcbi.1009182.ref028">28</xref>–<xref ref-type="bibr" rid="pcbi.1009182.ref030">30</xref>] and depends on several factors, including the shape of the epidemic and the period of time from which infections are sampled (<bold><xref ref-type="supplementary-material" rid="pcbi.1009182.s003">S3 Fig</xref></bold>). In the examples included herein, we use simulations to empirically approximate this distribution (see <bold><xref ref-type="supplementary-material" rid="pcbi.1009182.s016">S2 Text</xref></bold>), but it is likely that adequate approximations can be obtained by other means—or that more sophisticated approaches can be employed to directly estimate the necessary genetic distance distributions [<xref ref-type="bibr" rid="pcbi.1009182.ref031">31</xref>].</p>
<p>Given the approximate generation distribution between infections, we calculated the genetic distance distributions for linked and unlinked infections for the pathogen described above. The optimal genetic distance threshold for distinguishing between linked and unlinked infections was 4 mutations (sensitivity = 0.98, specificity = 0.99) (<bold><xref ref-type="fig" rid="pcbi.1009182.g002">Fig 2B</xref></bold>). The empirical and substitution rate methods result in a similar, but not identical, optimal threshold for the pathogen in this example, likely due to sparse sampling in the empirical case.</p>
<p>Regardless of which method we choose, we can use the sensitivity and specificity values to calculate the probability of correctly identifying a true transmission pair (<italic>ϕ</italic>) for this pathogen. We use <xref ref-type="disp-formula" rid="pcbi.1009182.e002"><bold>Eq</bold> 1</xref>, allowing for each infection to have multiple transmission partners. We will also assume that we are able to sample 50% of the cases in this hypothetical outbreak of 1500 infections:
<disp-formula id="pcbi.1009182.e015">
<alternatives>
<graphic id="pcbi.1009182.e015g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1009182.e015" xlink:type="simple"/>
<mml:math display="block" id="M15">
<mml:mi>ϕ</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>η</mml:mi><mml:mi>ρ</mml:mi><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">pop</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mi>η</mml:mi><mml:mi>ρ</mml:mi><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">pop</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo>)</mml:mo><mml:mo>+</mml:mo><mml:mo>(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>−</mml:mo><mml:mi>χ</mml:mi></mml:mrow><mml:mo>)</mml:mo><mml:mo>(</mml:mo><mml:mrow><mml:mi>M</mml:mi><mml:mo>−</mml:mo><mml:mi>ρ</mml:mi><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">pop</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo>)</mml:mo><mml:mo>−</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mfrac><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>0.98</mml:mn><mml:mo>*</mml:mo><mml:mn>0.5</mml:mn><mml:mo>*</mml:mo><mml:mo>(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mn>0.98</mml:mn><mml:mo>*</mml:mo><mml:mn>0.5</mml:mn><mml:mo>*</mml:mo><mml:mo>(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo>)</mml:mo><mml:mo>+</mml:mo><mml:mo>(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>−</mml:mo><mml:mn>0.99</mml:mn></mml:mrow><mml:mo>)</mml:mo><mml:mo>(</mml:mo><mml:mrow><mml:mn>750</mml:mn><mml:mo>−</mml:mo><mml:mn>0.5</mml:mn><mml:mo>*</mml:mo><mml:mo>(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo>)</mml:mo><mml:mo>−</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mfrac><mml:mo>=</mml:mo><mml:mn>0.116</mml:mn>
</mml:math>
</alternatives>
</disp-formula></p>
<p>We note that, despite a reproductive number (R) of 2, a single introduction into this outbreak means we should use <italic>R</italic><sub>pop</sub> = 1. Given our assumptions, we find that under 12% of our inferred linked infections—using a genetic distance threshold of 4 mutations—are likely to reflect true transmission relationships. A better specificity value is needed to achieve more confidence in direct transmission links, which can occur for pathogens that incur a significant number of mutations between infections considered linked [<xref ref-type="bibr" rid="pcbi.1009182.ref032">32</xref>]. For pathogens that do not meet these criteria (as in the example here), it may not be possible to use genetic distance alone to distinguish between linked and unlinked infections (<bold><xref ref-type="supplementary-material" rid="pcbi.1009182.s004">S4 Fig</xref></bold>).</p>
</sec>
</sec>
<sec id="sec011">
<title>Outbreak simulations</title>
<p>We used outbreak simulations to validate our approach. We simulated outbreaks using the ‘simOutbreak’ function implemented in the <italic>outbreaker</italic> R package [<xref ref-type="bibr" rid="pcbi.1009182.ref026">26</xref>]. For all simulations we assumed a large number of susceptible individuals in the population (n.hosts = 100,000), a genome length of 1,000 nucleotides, and no importation events (single source outbreak). We also assumed every infected individual transmitted their infection exactly one time step after infection, and ran the simulation for the number of generations needed to achieve a final outbreak size of approximately 1,000 infections (ln(1000)/ln(R)). We discarded simulations with an outbreak size of less than 100 or more than 2000 infected individuals; these discarded simulations did not count towards the total number of simulations for a given set of parameters. After simulating the source population, we randomly selected a predetermined proportion of infections from that population.</p>
<p>For each sampling proportion, we simulated outbreaks over a variety of substitution rates and reproductive numbers. We allowed the substitution rate to vary between 0.0001–4 mutations per genome per generation, and allowed the reproductive number to vary between 1.3–18. We chose these ranges to encompass substitution rates [<xref ref-type="bibr" rid="pcbi.1009182.ref033">33</xref>,<xref ref-type="bibr" rid="pcbi.1009182.ref034">34</xref>] and reproductive numbers [<xref ref-type="bibr" rid="pcbi.1009182.ref035">35</xref>] observed in actual human pathogens, and set the transition rate to be equal to the transversion rate for the purposes of this simulation. We note that, while pathogens can have reproductive numbers below 1.3, this was the minimum value that produced enough outbreaks with greater than 100 individuals in a reasonable amount of time. We divided each parameter range into 100 discrete values and ran simulations with all combinations of substitution rate and reproductive number, for a total of 10,000 simulations for each sampling proportion. We required simulated outbreaks to contain at least 100 and no more than 2000 infections for analysis. Validation plots were made in R using ggplot2 [<xref ref-type="bibr" rid="pcbi.1009182.ref036">36</xref>], and smoothed conditional means were calculated with the geom_smooth function from this package.</p>
</sec>
<sec id="sec012">
<title>Implementation</title>
<p>Functions for calculating the false discovery rate for a specific sample size or proportion are implemented in the R package <italic>phylosamp</italic>, freely available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/HopkinsIDD/phylosamp" xlink:type="simple">https://github.com/HopkinsIDD/phylosamp</ext-link>. This package also includes functions for calculating the necessary sample size based on a desired false discovery rate (inverse of <bold>Eqs <xref ref-type="disp-formula" rid="pcbi.1009182.e002">1</xref> and <xref ref-type="disp-formula" rid="pcbi.1009182.e007">2</xref></bold>), and functions to estimate the number of transmission pairs that will be observed given a sample size and a set of assumptions (e.g., multiple links and multiple transmissions, single link and single transmission, etc.). We also provide generation distributions for values of <italic>R</italic> between 1.3–18, derived from the simulations described in <bold><xref ref-type="supplementary-material" rid="pcbi.1009182.s016">S2 Text</xref></bold>.</p>
</sec>
<sec id="sec013">
<title>Applications to existing datasets</title>
<p>We used the phylosamp package to apply our method to an existing mumps virus dataset. We converted the reported substitution rate of 4.76×10<sup>−4</sup> substitutions/site/year [<xref ref-type="bibr" rid="pcbi.1009182.ref037">37</xref>] to 0.36 substitutions/genome/generation as follows:
<disp-formula id="pcbi.1009182.e016">
<alternatives>
<graphic id="pcbi.1009182.e016g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1009182.e016" xlink:type="simple"/>
<mml:math display="block" id="M16">
<mml:mfrac><mml:mrow><mml:mn>4.76</mml:mn><mml:mo>×</mml:mo><mml:msup><mml:mrow><mml:mn>10</mml:mn></mml:mrow><mml:mrow><mml:mo>−</mml:mo><mml:mn>4</mml:mn></mml:mrow></mml:msup><mml:mi mathvariant="normal">substitutions</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">site</mml:mi><mml:mo>·</mml:mo><mml:mi mathvariant="normal">year</mml:mi></mml:mrow></mml:mfrac><mml:mo>×</mml:mo><mml:mfrac><mml:mrow><mml:mn>15384</mml:mn><mml:mspace width="0.25em"/><mml:mi mathvariant="normal">sites</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">genome</mml:mi></mml:mrow></mml:mfrac><mml:mo>×</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn><mml:mspace width="0.25em"/><mml:mi mathvariant="normal">year</mml:mi></mml:mrow><mml:mrow><mml:mn>365</mml:mn><mml:mspace width="0.25em"/><mml:mi mathvariant="normal">days</mml:mi></mml:mrow></mml:mfrac><mml:mo>×</mml:mo><mml:mfrac><mml:mrow><mml:mn>18</mml:mn><mml:mspace width="0.25em"/><mml:mi mathvariant="normal">days</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">generation</mml:mi></mml:mrow></mml:mfrac><mml:mo>=</mml:mo><mml:mn>0.36</mml:mn><mml:mspace width="0.25em"/><mml:mi mathvariant="normal">subs</mml:mi><mml:mo>/</mml:mo><mml:mi mathvariant="normal">genome</mml:mi><mml:mo>/</mml:mo><mml:mi mathvariant="normal">generation</mml:mi>
</mml:math>
</alternatives>
</disp-formula></p>
<p>We used a sampling proportion of 0.93, which is the fraction of samples from patients affiliated with Harvard University (71) that resulted in complete genomes. We also noted that the original mumps manuscript reports multiple lineages circulating within Harvard University, which would reduce the average reproductive number (<italic>R</italic><sub>pop</sub>) used to calculate the true discovery rate. However, decreasing this value again only decreases confidence in identified links, so we used <italic>R</italic><sub>pop</sub> = 1 to again calculate the upper bound of this estimate.</p>
<p>When applying the methods to a hypothetical SARS-CoV-2 outbreak, we converted a substitution rate of 24.896 substitutions/genome/year [<xref ref-type="bibr" rid="pcbi.1009182.ref038">38</xref>–<xref ref-type="bibr" rid="pcbi.1009182.ref040">40</xref>] to 0.34 substitutions/genome/generation using a generation time of 5 days [<xref ref-type="bibr" rid="pcbi.1009182.ref041">41</xref>]. The samplesize function in the phylosamp package gave the following error message when used with the optimum sensitivity and specificity (along with an outbreak size of 120 and true discovery rate of 0.9), indicating no amount of sampling would lead to high confidence in identified links: “Input values do not produce a viable solution.”</p>
</sec>
</sec>
<sec id="sec014" sec-type="results">
<title>Results</title>
<sec id="sec015">
<title>Method performance with known sensitivity and specificity</title>
<p>We used simulated outbreaks to validate the relationship between sample size and false discovery rate using genetic distance as our linkage criteria. We subsampled each outbreak and, using the known transmission relationships and genetic distances between simulated infections, calculated the false discovery rate at each possible genetic distance threshold in the subsample (“simulated FDR”). For each simulation (before subsampling), we also calculated the actual specificity and sensitivity at every relevant genetic distance threshold. We used these values and the observed <italic>R</italic><sub>pop</sub> (roughly equal to one in most simulations) to then calculate the theoretical false discovery rate at a particular sampling proportion using <xref ref-type="disp-formula" rid="pcbi.1009182.e002"><bold>Eq</bold> 1</xref>. We find that the theoretical false discovery rate is consistent with the simulated value for a wide array of pathogen substitution rates and reproductive numbers (<bold><xref ref-type="fig" rid="pcbi.1009182.g003">Fig 3</xref></bold>).</p>
<fig id="pcbi.1009182.g003" position="float">
<object-id pub-id-type="doi">10.1371/journal.pcbi.1009182.g003</object-id>
<label>Fig 3</label>
<caption>
<title>Predicted versus observed false discovery rate in outbreak simulations.</title>
<p>Theoretical versus simulated false discovery rate (FDR) for each genetic distance threshold in 10,000 simulations of varying substitution rate and reproductive number (approximately 260,000 points per plot, see <bold>Tables <xref ref-type="table" rid="pcbi.1009182.t002">2</xref> and <xref ref-type="table" rid="pcbi.1009182.t003">3</xref></bold>). Outbreak sizes range from 100–2000, as described in <bold>Methods</bold>. White line: smoothed conditional mean; grey dashed line: y = x line. Increasing values of the sample size (<italic>M</italic>) are plotted in darker color; because the maximum outbreak size is fixed at 2000, the maximum sample size differs for each sampling proportion. Increasing both the sample size and proportion reduces bias and error, see <bold>Tables <xref ref-type="table" rid="pcbi.1009182.t002">2</xref> and <xref ref-type="table" rid="pcbi.1009182.t003">3</xref></bold>.</p>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1009182.g003" xlink:type="simple"/>
</fig>
<p>Overall, the bias of our estimate of the false discovery rate approached zero for all sampling proportions (<bold><xref ref-type="table" rid="pcbi.1009182.t002">Table 2</xref></bold>). The average error was less than 0.04 in each case (i.e., false discovery rate estimate is off by no more than 4%), decreasing significantly with increased sample size or proportion sampled (<bold>Tables <xref ref-type="table" rid="pcbi.1009182.t003">3</xref></bold> and <bold><xref ref-type="supplementary-material" rid="pcbi.1009182.s011">S1</xref></bold>). We note that special care should be taken with low sample sizes and low theoretical false discovery rates, as error rates can be particularly high in this range. Additionally, while our method is an unbiased estimator and overall correct in expectation, it is always possible for performance in a particular set of individuals sampled from a population to deviate substantially from expectation. As an example, in a small fraction of simulations, there were by chance no true transmission links (or, in some cases, no false positives) in our subsample. This fixes the simulated false discovery rate at 1 (or 0, when there are no false positives), which may not be representative of the overall relationship between sample size and false discovery rate and highlights how the specific infections sampled can affect results, particularly when sample sizes are low.</p>
<table-wrap id="pcbi.1009182.t002" position="float">
<object-id pub-id-type="doi">10.1371/journal.pcbi.1009182.t002</object-id>
<label>Table 2</label> <caption><title>Bias of calculated false discovery rate for simulations with fixed sampling proportion.</title></caption>
<alternatives>
<graphic id="pcbi.1009182.t002g" mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1009182.t002" xlink:type="simple"/>
<table>
<colgroup>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
</colgroup>
<thead>
<tr>
<th align="center"/>
<th align="center">⍴ = 0.10</th>
<th align="center">⍴ = 0.25</th>
<th align="center">⍴ = 0.50</th>
<th align="center">⍴ = 0.75</th>
<th align="center">All ⍴ values</th>
<th align="center">N</th>
</tr>
</thead>
<tbody>
<tr>
<td align="center" style="background-color:#EFEFEF">FDR = 0.00–0.25</td>
<td align="center">-0.0006</td>
<td align="center">0.0045</td>
<td align="center">0.0001</td>
<td align="center">0.0036</td>
<td align="center"><bold>0.0022</bold></td>
<td align="center">17,900</td>
</tr>
<tr>
<td align="center" style="background-color:#EFEFEF">FDR = 0.25–0.50</td>
<td align="center">0.0044</td>
<td align="center">0.0045</td>
<td align="center">0.0009</td>
<td align="center">0.0032</td>
<td align="center"><bold>0.0032</bold></td>
<td align="center">31,633</td>
</tr>
<tr>
<td align="center" style="background-color:#EFEFEF">FDR = 0.50–0.75</td>
<td align="center">0.0064</td>
<td align="center">0.0039</td>
<td align="center">0.0006</td>
<td align="center">0.001</td>
<td align="center"><bold>0.0029</bold></td>
<td align="center">51,069</td>
</tr>
<tr>
<td align="center" style="background-color:#EFEFEF">FDR = 0.75–1.00</td>
<td align="center">0.0001</td>
<td align="center">0.0001</td>
<td align="center">&lt;0.0001</td>
<td align="center">&lt;0.0001</td>
<td align="center"><bold>0.0001</bold></td>
<td align="center">965,125</td>
</tr>
<tr>
<td align="center" style="background-color:#EFEFEF">All FDR Values</td>
<td align="center"><bold>0.0005</bold></td>
<td align="center"><bold>0.0005</bold></td>
<td align="center"><bold>0.0001</bold></td>
<td align="center"><bold>0.0002</bold></td>
<td align="center"><bold>0.0003</bold></td>
<td align="center">1,065,727</td>
</tr>
<tr>
<td align="center" style="background-color:#EFEFEF">N</td>
<td align="center">261,360</td>
<td align="center">267,239</td>
<td align="center">268,900</td>
<td align="center">268,228</td>
<td align="center">1,065,727</td>
<td align="center"/>
</tr>
</tbody>
</table>
</alternatives>
</table-wrap>
<table-wrap id="pcbi.1009182.t003" position="float">
<object-id pub-id-type="doi">10.1371/journal.pcbi.1009182.t003</object-id>
<label>Table 3</label> <caption><title>Error of calculated false discovery rate for simulations with fixed sampling proportion.</title></caption>
<alternatives>
<graphic id="pcbi.1009182.t003g" mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1009182.t003" xlink:type="simple"/>
<table>
<colgroup>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
</colgroup>
<thead>
<tr>
<th align="center"/>
<th align="center" style="background-color:#EFEFEF">⍴ = 0.10</th>
<th align="center" style="background-color:#EFEFEF">⍴ = 0.25</th>
<th align="center" style="background-color:#EFEFEF">⍴ = 0.50</th>
<th align="center" style="background-color:#EFEFEF">⍴ = 0.75</th>
<th align="center" style="background-color:#EFEFEF">All ⍴ values</th>
<th align="center" style="background-color:#EFEFEF">N</th>
</tr>
</thead>
<tbody>
<tr>
<td align="center" style="background-color:#EFEFEF">FDR = 0.00–0.25</td>
<td align="center">0.2135</td>
<td align="center">0.1359</td>
<td align="center">0.0799</td>
<td align="center">0.0401</td>
<td align="center"><bold>0.098</bold></td>
<td align="center">17,900</td>
</tr>
<tr>
<td align="center" style="background-color:#EFEFEF">FDR = 0.25–0.50</td>
<td align="center">0.2751</td>
<td align="center">0.1583</td>
<td align="center">0.079</td>
<td align="center">0.0416</td>
<td align="center"><bold>0.1275</bold></td>
<td align="center">31,633</td>
</tr>
<tr>
<td align="center" style="background-color:#EFEFEF">FDR = 0.50–0.75</td>
<td align="center">0.2057</td>
<td align="center">0.0979</td>
<td align="center">0.0478</td>
<td align="center">0.0259</td>
<td align="center"><bold>0.092</bold></td>
<td align="center">51,069</td>
</tr>
<tr>
<td align="center" style="background-color:#EFEFEF">FDR = 0.75–1.00</td>
<td align="center">0.0155</td>
<td align="center">0.0069</td>
<td align="center">0.0035</td>
<td align="center">0.002</td>
<td align="center"><bold>0.007</bold></td>
<td align="center">965,125</td>
</tr>
<tr>
<td align="center" style="background-color:#EFEFEF">All FDR Values</td>
<td align="center"><bold>0.032</bold></td>
<td align="center"><bold>0.0181</bold></td>
<td align="center"><bold>0.0097</bold></td>
<td align="center"><bold>0.0052</bold></td>
<td align="center"><bold>0.0161</bold></td>
<td align="center">1,065,727</td>
</tr>
<tr>
<td align="center" style="background-color:#EFEFEF">N</td>
<td align="center">261,360</td>
<td align="center">267,239</td>
<td align="center">268,900</td>
<td align="center">268,228</td>
<td align="center">1,065,727</td>
<td align="center"/>
</tr>
</tbody>
</table>
</alternatives>
</table-wrap>
<p>To better understand why the error rate of our estimator increases as the false discovery rate decreases, we stratified the simulation data by the sensitivity and specificity given a particular genetic distance threshold. We found that the error is highest when sensitivity is low and specificity is high (<bold><xref ref-type="supplementary-material" rid="pcbi.1009182.s005">S5A and S5B Fig</xref></bold>), which occurs when a high genetic distance threshold is used. This combination often produces low false discovery rates, but is highly dependent on sampling (namely, if any true positives or false positives are sampled). This leads to highly variable simulated false discovery rates and consequently higher error rates. Unsurprisingly, this analysis also highlights that a discrete threshold like genetic distance produces a limited number of possible sensitivity and specificity combinations (<bold><xref ref-type="supplementary-material" rid="pcbi.1009182.s005">S5C</xref></bold> and <bold><xref ref-type="supplementary-material" rid="pcbi.1009182.s005">S5D Fig</xref></bold>). Therefore, obtaining reasonable estimates for these values in tandem is of key importance when using our method to estimate the false discovery rate of a phylogenetic study.</p>
</sec>
<sec id="sec016">
<title>Method performance with estimated sensitivity and specificity</title>
<p>We repeated the false discovery rate comparison described above, but instead of using the actual sensitivity and specificity observed in each simulation, we calculated these parameters from the substitution rate used to generate that simulated outbreak (<bold><xref ref-type="fig" rid="pcbi.1009182.g004">Fig 4</xref></bold>). To reduce reliance on simulation data to calculate necessary parameters, we used <italic>R</italic><sub>pop</sub> = 1 rather than the empirical value.</p>
<fig id="pcbi.1009182.g004" position="float">
<object-id pub-id-type="doi">10.1371/journal.pcbi.1009182.g004</object-id>
<label>Fig 4</label>
<caption>
<title>Validation of substitution rate method to calculate sensitivity and specificity.</title>
<p>Theoretical versus simulated false discovery rate (FDR) for each genetic distance threshold in 10,000 simulations of varying substitution rate and reproductive number (approximately 260,000 points per plot, see <bold>Tables <xref ref-type="table" rid="pcbi.1009182.t002">2</xref> and <xref ref-type="table" rid="pcbi.1009182.t003">3</xref></bold>). Outbreak sizes range from 100–2000, as described in <bold>Methods</bold>. White line: smoothed conditional mean; grey dashed line: <italic>y</italic> = <italic>x</italic> line. Increasing values of the sample size (<italic>M</italic>) are plotted in darker color; increasing both the sample size and proportion reduces bias and error, see <bold><xref ref-type="supplementary-material" rid="pcbi.1009182.s012">S2</xref></bold> and <bold><xref ref-type="supplementary-material" rid="pcbi.1009182.s013">S3</xref> Tables</bold>.</p>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1009182.g004" xlink:type="simple"/>
</fig>
<p>Under this more realistic set of assumptions, we observe a slight bias, though overall values remain less than one percent (<bold><xref ref-type="supplementary-material" rid="pcbi.1009182.s012">S2</xref> and <xref ref-type="supplementary-material" rid="pcbi.1009182.s013">S3</xref> Tables</bold>). However, while mean bias is very low on average, it is greater when the theoretical false discovery rate is low, reaching an average of nearly 8% off the simulated value for predicted false discovery rates less than 25%. Average error rates were similarly slightly increased, but remained less than 4% overall. Despite these trends, the vast majority of false discovery rate estimates (as well as sensitivity and specificity estimates) fall very close to their true values (<bold><xref ref-type="fig" rid="pcbi.1009182.g005">Fig 5</xref></bold>). This observation holds true when only examining the optimal genetic distance threshold (using the closest to the (0,1) corner method, as described in <bold>Methods</bold>) (<bold><xref ref-type="supplementary-material" rid="pcbi.1009182.s006">S6 Fig</xref></bold>) rather than estimated values at all thresholds shown in <bold>Figs <xref ref-type="fig" rid="pcbi.1009182.g004">4</xref> and <xref ref-type="fig" rid="pcbi.1009182.g005">5</xref></bold>.</p>
<fig id="pcbi.1009182.g005" position="float">
<object-id pub-id-type="doi">10.1371/journal.pcbi.1009182.g005</object-id>
<label>Fig 5</label>
<caption>
<title>Histogram of raw parameter error using substitution rate method.</title>
<p>Theoretical minus simulated parameter values for each genetic distance threshold in 10,000 simulations of varying substitution rate and reproductive number for a given sampling proportion (see <bold><xref ref-type="fig" rid="pcbi.1009182.g004">Fig 4</xref></bold>). Top row: theoretical minus simulated false discovery rate; middle row: theoretical minus simulated sensitivity; bottom row: theoretical minus simulated specificity. Colors correspond to sampling proportion as in <bold><xref ref-type="fig" rid="pcbi.1009182.g004">Fig 4</xref></bold>.</p>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1009182.g005" xlink:type="simple"/>
</fig>
<p>Given that correct sensitivity and specificity values are an important component of calculating the theoretical false discovery rate, we looked at the specific estimates for these parameters generated by our substitution rate method. When considering only direct transmissions as linked (as we do throughout these simulations), <xref ref-type="disp-formula" rid="pcbi.1009182.e013"><bold>Eq</bold> 3</xref> simplifies to simply a Poisson distribution around the substitution rate, resulting in highly accurate and precise sensitivity estimates (<bold>Figs <xref ref-type="fig" rid="pcbi.1009182.g005">5</xref></bold> and <bold><xref ref-type="supplementary-material" rid="pcbi.1009182.s007">S7</xref></bold>). However, we find that our estimates for specificity have a positive bias regardless of sample size or proportion (<bold>Figs <xref ref-type="fig" rid="pcbi.1009182.g005">5</xref></bold> and <bold><xref ref-type="supplementary-material" rid="pcbi.1009182.s008">S8</xref> and <xref ref-type="supplementary-material" rid="pcbi.1009182.s009">S9</xref></bold>). We hypothesized that inaccuracies in the estimated specificity cause the bias observed in the false discovery rate estimate and were due to the distribution of generations between infections used in our calculation; as discussed in <bold>Methods</bold>, this is a non-trivial distribution that we estimated by averaging over many simulations (see <bold><xref ref-type="supplementary-material" rid="pcbi.1009182.s016">S2 Text</xref></bold> for details).</p>
<p>To test this hypothesis, we used the actual distribution of generations between infections from each simulation in our calculation of specificity (sensitivity estimates are unaffected by this distribution when considering only direct transmissions, as described above). We find that this does in fact reduce bias in our specificity estimates (<bold><xref ref-type="fig" rid="pcbi.1009182.g006">Fig 6</xref></bold>) and leads to largely unbiased (&lt;2%) estimates of the false discovery rate, even at low theoretical false discovery rate values (<bold><xref ref-type="supplementary-material" rid="pcbi.1009182.s010">S10 Fig</xref></bold> and <bold><xref ref-type="supplementary-material" rid="pcbi.1009182.s014">S4 Table</xref></bold>).</p>
<fig id="pcbi.1009182.g006" position="float">
<object-id pub-id-type="doi">10.1371/journal.pcbi.1009182.g006</object-id>
<label>Fig 6</label>
<caption>
<title>Effect of the generation distribution on specificity of the linkage criteria.</title>
<p>Theoretical versus simulated specificity for each genetic distance threshold in 10,000 simulations of varying substitution rate and reproductive number (proportion sampled = 0.75). White line: smoothed conditional mean; grey dashed line: y = x line. Increasing values of the sample size (<italic>M</italic>) are plotted in darker color. (<bold>A</bold>) Theoretical sensitivity and specificity calculated using average distribution of generations between infections from simulations (see <bold><xref ref-type="supplementary-material" rid="pcbi.1009182.s016">S2 Text</xref></bold>). (<bold>B</bold>) Theoretical sensitivity and specificity calculated using the actual distribution of generations between infections from that simulated outbreak.</p>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1009182.g006" xlink:type="simple"/>
</fig>
</sec>
<sec id="sec017">
<title>Application of the sampling framework</title>
<sec id="sec018">
<title>Illustrative retrospective example</title>
<p>To illustrate our sample size calculation framework, we used a publicly available dataset from an outbreak caused by a well characterized pathogen (mumps virus) that had been subject to both genomic and epidemiological analysis [<xref ref-type="bibr" rid="pcbi.1009182.ref037">37</xref>]. We first used the substitution rate method described above to calculate the sensitivity and specificity of genetic distance as a linkage criteria using the substitution rate reported in the study (molecular clock rate = 4.76×10<sup>−4</sup> substitutions per site per year). We converted this substitution rate to 0.36 substitutions per genome per generation using the mean generation interval estimated in the study (18 days), which falls within previous estimates of this parameter [<xref ref-type="bibr" rid="pcbi.1009182.ref042">42</xref>–<xref ref-type="bibr" rid="pcbi.1009182.ref044">44</xref>]. We used the effective reproductive number reported for Harvard University (1.70) to estimate the generation time distribution using our <italic>phylosamp</italic> package, as shown in the R code below:</p>
<p specific-use="line">library(phylosamp)</p>
<p specific-use="line">data("gen_dist_sim")</p>
<p specific-use="line">mgd &lt;- as.numeric(gen_dist_sim[gen_dist_sim$R == 1.70, -(1:2)])</p>
<p specific-use="line">get_optim_roc(sens_spec_roc(cutoff = 1:20,mut_rate = 0.36,mean_gens_pdf = mgd))</p>
<p>This method results in an optimal sensitivity of 0.95 and specificity of 0.95 using a cutoff of two mutations.</p>
<p>We then used these parameter values to calculate the true discovery rate of our linkage criteria, i.e., the proportion of identified links (whole mumps genomes differing by &lt;2 mutations) that represent actual transmission pairs. We focused on the part of the mumps outbreak within Harvard University, for which 66 whole genome sequences were generated from 71 unique patient samples. While the true number of cases at Harvard was likely significantly higher, this provides a maximum sampling proportion of 93% of infections. Using the <italic>phylosamp</italic> package, we calculated the true discovery rate as follows:</p>
<p specific-use="line">truediscoveryrate(eta = optim$sensitivity,chi = 1-optim$specificity,rho = 0.93,M = 66,R = 1)</p>
<p>Using our method, we calculated a true discovery rate of 0.35. This low value suggests that genetic distance alone would not be sufficient to identify specific transmission links within the Harvard community during this mumps outbreak. This is in line with the findings of the original paper, which demonstrates the need for both genomic and epidemiological data to understand transmission, and emphasizes the frequent need for such epidemiological data to achieve the required specificity for high confidence estimation of transmission links.</p>
</sec>
<sec id="sec019">
<title>Illustrative prospective example</title>
<p>To demonstrate how our method could be used to estimate the sample size needed to identify transmission links with 90% confidence (i.e., a true discovery rate of 0.9), we applied our method to a hypothetical COVID-19 outbreak in an unvaccinated community with 120 infections. We calculated the sensitivity and specificity of genetic distance using a substitution rate of 0.34 mutations per genome per generation [<xref ref-type="bibr" rid="pcbi.1009182.ref038">38</xref>–<xref ref-type="bibr" rid="pcbi.1009182.ref041">41</xref>] and an <italic>R</italic> value of 3, consistent with many efforts [<xref ref-type="bibr" rid="pcbi.1009182.ref045">45</xref>,<xref ref-type="bibr" rid="pcbi.1009182.ref046">46</xref>]:</p>
<p specific-use="line">mgd &lt;- as.numeric(gen_dist_sim[gen_dist_sim$R == 3, -(1:2)])</p>
<p specific-use="line">get_optim_roc(sens_spec_roc(cutoff = 1:20,mut_rate = 0.34,mean_gens_pdf = mgd))</p>
<p>This method results in an optimal sensitivity of 0.95 and a specificity of 0.84 using a cutoff of two mutations. Using these parameters, we found that not even perfect sampling could lead to a true discovery rate of at least 0.9:</p>
<p specific-use="line">samplesize(eta = optim$sensitivity,chi = 1-optim$specificity,N = 120,R = 1,phi = 0.9)</p>
<p>This suggests that genetic distance alone is not sufficient to differentiate linked and unlinked SARS-CoV-2 infections at high confidence. However, if we could identify additional phylogenetic or epidemiological criteria that would increase the specificity to 0.999 (keeping the sensitivity at 0.95), a sample size of 11 would achieve our desired confidence in direct transmission links. Additionally, it may be more fruitful to focus on cases linked within several generations of transmission, during which additional mutations would have time to accumulate.</p>
</sec>
</sec>
</sec>
<sec id="sec020" sec-type="conclusions">
<title>Discussion</title>
<p>We have developed a mathematical framework for making informed sampling decisions in pathogen genome sequencing studies. Specifically, this framework allows for easy calculation of the relationship between the number or proportion of infections sampled during an outbreak and the ability of some phylogenetic or epidemiological criteria to correctly identify infections within this sample that are linked by direct transmission. Understanding this relationship is crucial to making correct inferences about pathogen transmission patterns, especially as genomic studies are becoming more feasible and widely used to answer both scientific and public health questions.</p>
<p>This framework is broadly applicable to a variety of phylogenetic or epidemiological approaches, as long as the sensitivity and specificity of the criteria can be approximated. With a basic understanding of the pathogen and the criteria being used, researchers can more effectively design studies that correctly identify transmission pairs with a known level of confidence. Additionally, this generalizable method (available as free software, the R package <italic>phylosamp</italic>) provides a metric by which reviewers of these studies can evaluate their conclusions. We apply our method to simulated outbreaks using genetic distance as the linkage criteria and find that we can effectively estimate the false discovery rate for a variety of pathogen substitution rates, reproductive numbers, and relevant genetic distance thresholds. It is important to note, however, that for a given sensitivity and specificity, there may not always be a study design that achieves the desired false discovery rate.</p>
<p>Performance of the method presented depends on our ability to estimate the sensitivity and specificity of a particular linkage criteria. While we present two methods for doing this—empirically and theoretically using the substitution rate of the pathogen—implementing either in practice is not without challenges, and improved estimation of these values may be a fruitful area for future research. For instance, the substitution rate based approach also depends on the distribution of the number of generations of transmission between infections in the underlying population. Although distributions derived from simulations (provided as part of the <italic>phylosamp</italic> package) provide a reasonable proxy, estimates of sensitivity and specificity are much improved when using the exact generation distribution, which currently can only be determined from complete knowledge of all transmission events. Further research into all the factors affecting this distribution will be necessary to improve its estimation. Likewise, there are challenges to the empirical approach, particularly for novel pathogens.</p>
<p>Better performance can likely be obtained by not restricting ourselves to genetic distance alone when determining a linkage criteria. Genetic distance is easy to determine from sequence data, but this simple metric does not take into account ancestral relationships or uncertainty around these relationships, and is limited to discrete mutational changes. Applying more complex phylogenetic criteria may allow us to learn more about transmission relationships, though there is a limit to the extent to which genetic data can be used to distinguish infections in fast-spreading (or slow-mutating) pathogen outbreaks. There are several examples of outbreaks in which multiple infected individuals have the same consensus viral genome [<xref ref-type="bibr" rid="pcbi.1009182.ref032">32</xref>]. In this case, incorporating epidemiological data (e.g., location, time of symptom onset) may be important in determining which infections are unlikely to be linked. This incorporation of additional data may complicate calculation of the sensitivity and specificity, so developing the methodology around calculating these parameters will be important to further development of our method. This will likely build on a larger effort to better integrate epidemiological and genomic data into pathogen transmission studies [<xref ref-type="bibr" rid="pcbi.1009182.ref026">26</xref>,<xref ref-type="bibr" rid="pcbi.1009182.ref047">47</xref>–<xref ref-type="bibr" rid="pcbi.1009182.ref049">49</xref>].</p>
<p>The application of our methodology to a previous mumps outbreak and a hypothetical COVID-19 outbreak highlights the need to move beyond genetic distance as a linkage criteria; for pathogens with a substitution rate similar to that of mumps virus, genetic distance is not enough to differentiate between linked and unlinked cases even in densely sampled outbreaks. In trying to apply this method to other outbreaks, it also became clear that well-characterized substitution rates and reproductive numbers are essential for calculating sensitivity and specificity using our method, and that these parameters are less clearly defined for pathogens with long and variable generation times, such as bacterial infections. Variable periods of replication within a host make it difficult to characterize a per-generation substitution rate that is broadly applicable over the entire outbreak and can be used to estimate sensitivity and specificity. In these cases, more nuanced criteria such as phylogenetic relatedness will likely be more informative than the number of mutations between sequenced infections; while we provide instructions for using genetic distance as a linkage criteria in order to give a concrete example of calculating sensitivity and specificity, the primary focus of this manuscript is to demonstrate how they can be used to calculate or evaluate sample sizes.</p>
<p>While in this manuscript we have focused on direct transmission pairs, our framework is designed to be extensible to alternative definitions of linkage; for example, infections connected within a specified number of transmission events. Expanding the definition of linkage to include such indirect transmissions has a number of useful applications in outbreak research, such as identifying and connecting transmission clusters. This method could also be extended to more complex direct transmission relationships, for example when within-host evolution results in the existence of viral quasispecies within infected individuals, each of which has some potential of being transmitted. In all of these scenarios, it is equally important to understand the sample size needed to make the desired inferences.</p>
<p>We hope that this work represents a step towards developing a larger theory of study design for making inferences from pathogen sequence data, but recognize it is only a step. The focus of this paper is sample size and the impact of undersampling, but spatial and/or temporal biases are also important for determining which infections are sampled [<xref ref-type="bibr" rid="pcbi.1009182.ref050">50</xref>–<xref ref-type="bibr" rid="pcbi.1009182.ref052">52</xref>]. For example, understanding routes of direct transmission may require dense sampling of a small group of highly-connected individuals, while understanding general transmission trends over the course of a geographically-dispersed outbreak may require us to sample broadly over space and time. Additionally, it will be important to take into account the contact network underlying pathogen transmission, since some individuals may be more likely to transmit their infection to others. Finally, the goal of linking infections is seldom the linkages themselves, but the larger inferences about risk and transmission derived from those linkages. Adapting the techniques here to more directly link sample size calculations to these outcomes is an important next step.</p>
</sec>
<sec id="sec021" sec-type="supplementary-material">
<title>Supporting information</title>
<supplementary-material id="pcbi.1009182.s001" mimetype="image/tiff" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1009182.s001" xlink:type="simple">
<label>S1 Fig</label>
<caption>
<title>Sample size and false discovery rate given single linkage and single transmission.</title>
<p>(<bold>A</bold>) Effect of sample size (red lines) or proportion sampled (blue lines) on the expected number of linked pairs (upper plots) or the false discovery rate of linked pairs (lower plots). The specificity and sensitivity are held constant. (<bold>B</bold>) Effect of varying the sensitivity and specificity of the linkage criteria on the false discovery rate (FDR).</p>
<p>(TIF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1009182.s002" mimetype="image/tiff" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1009182.s002" xlink:type="simple">
<label>S2 Fig</label>
<caption>
<title>Estimating the average reproductive number in a population.</title>
<p>Two hypothetical outbreaks with a pathogen reproductive number (<italic>R</italic>) equal to 2 and a total of 15 infections. Black circles represent infections; blue circles represent infections who have not yet infected others, or whose descendents are outside the sampling frame. (<bold>A</bold>) Outbreak caused by a single introduction, meaning there were 14 transmission events and 15 total infections. In other words, <inline-formula id="pcbi.1009182.e017"><alternatives><graphic id="pcbi.1009182.e017g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1009182.e017" xlink:type="simple"/><mml:math display="inline" id="M17"><mml:msub><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">pop</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>14</mml:mn></mml:mrow><mml:mrow><mml:mn>15</mml:mn></mml:mrow></mml:mfrac><mml:mo>=</mml:mo><mml:mn>0.933</mml:mn></mml:math></alternatives></inline-formula>. (<bold>B</bold>) Outbreak caused by two separate introductions, meaning there were only 13 infection events in the sampling frame, resulting in <inline-formula id="pcbi.1009182.e018"><alternatives><graphic id="pcbi.1009182.e018g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1009182.e018" xlink:type="simple"/><mml:math display="inline" id="M18"><mml:msub><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">pop</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>13</mml:mn></mml:mrow><mml:mrow><mml:mn>15</mml:mn></mml:mrow></mml:mfrac><mml:mo>=</mml:mo><mml:mn>0.867</mml:mn></mml:math></alternatives></inline-formula>.</p>
<p>(TIF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1009182.s003" mimetype="image/tiff" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1009182.s003" xlink:type="simple">
<label>S3 Fig</label>
<caption>
<title>Effects of R and G on the distribution of generations between cases.</title>
<p>Distribution of the number of generations between infections averaged over 1000 simulated outbreaks with reproduction number R and number of generations of transmission G. Distributions are shown for three values of R (rows). Left column: distribution of generations between infections after 3 generations of transmission; middle column: distribution after <italic>ln</italic>(1000)/<italic>ln</italic>(<italic>R</italic>) generations of transmission (see <xref ref-type="sec" rid="sec002">Methods</xref>); right column: distribution after <italic>ln</italic>(1000)/<italic>ln</italic>(<italic>R</italic>)+2 generations of transmission.</p>
<p>(TIF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1009182.s004" mimetype="image/tiff" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1009182.s004" xlink:type="simple">
<label>S4 Fig</label>
<caption>
<title>Genetic distance distributions for different types of pathogens.</title>
<p>(<bold>A</bold>) Distribution of genetic distances for linked (purple) and unlinked (yellow) infections for a hypothetical pathogen with substitution rate = 1 substitution/genome/generation and <italic>R</italic> = 1.5. Inset: receiver operating characteristic (ROC) curve for all possible genetic distance cutoff values. Optimal threshold shown as green dot (ROC) and dashed vertical line (distribution). (<bold>B</bold>) Distribution of genetic distances for linked and unlinked cases for a hypothetical pathogen with substitution rate = 0.2 substitutions/genome/generation and <italic>R</italic> = 3. Inset: ROC curve for all possible genetic distance cutoff values for this pathogen. The optimal threshold is shown as in (A).</p>
<p>(TIF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1009182.s005" mimetype="image/tiff" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1009182.s005" xlink:type="simple">
<label>S5 Fig</label>
<caption>
<title>Error of false discovery rate calculation by sensitivity and specificity.</title>
<p>(<bold>A</bold>) Average false discovery from 10,000 simulated outbreaks (proportion sampled = 0.75) binned by sensitivity and specificity (bin size = 0.02). Grey = no genetic distance thresholds in simulation produced this combination of sensitivity and specificity. (<bold>B</bold>) Zoom view of (A), with specificity ranging from 0.9–1 (bin size = 0.002). (<bold>C</bold>) Number of data points with sensitivity and specificity in the desired bins (i.e., number of data points used to calculate average error in panel (A)). (<bold>D</bold>) Zoom view of (C), with specificity ranging from 0.9–1.</p>
<p>(TIF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1009182.s006" mimetype="image/tiff" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1009182.s006" xlink:type="simple">
<label>S6 Fig</label>
<caption>
<title>Histogram of raw parameter error using substitution rate method (optimal threshold only).</title>
<p>Theoretical minus simulated parameter values for the optimal genetic distance threshold (determined by selecting the threshold for which the point at (1-specificity, sensitivity) is closest to the (0,1) corner) in 10,000 simulations of varying substitution rate and reproductive number for a given sampling proportion. Top row: theoretical minus simulated false discovery rate; middle row: theoretical minus simulated sensitivity; bottom row: theoretical minus simulated specificity. Colors correspond to sampling proportion as in <bold><xref ref-type="fig" rid="pcbi.1009182.g004">Fig 4</xref></bold>.</p>
<p>(TIF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1009182.s007" mimetype="image/tiff" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1009182.s007" xlink:type="simple">
<label>S7 Fig</label>
<caption>
<title>Predicted versus observed sensitivity using substitution rate method.</title>
<p>Theoretical versus simulated sensitivity for each genetic distance threshold in 10,000 simulations of varying substitution rate and reproductive number. White line: smoothed conditional mean; grey dashed line: <italic>y</italic> = <italic>x</italic> line. Increasing values of the sample size (<italic>M</italic>) are plotted in darker color.</p>
<p>(TIF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1009182.s008" mimetype="image/tiff" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1009182.s008" xlink:type="simple">
<label>S8 Fig</label>
<caption>
<title>Predicted versus observed specificity using substitution rate method.</title>
<p>Theoretical versus simulated specificity for each genetic distance threshold in 10,000 simulations of varying substitution rate and reproductive number. Outbreak sizes range from 100–2000, as described in <bold>Methods</bold>. White line: smoothed conditional mean; grey dashed line: <italic>y</italic> = <italic>x</italic> line. Increasing values of the sample size (<italic>M</italic>) are plotted in darker color.</p>
<p>(TIF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1009182.s009" mimetype="image/tiff" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1009182.s009" xlink:type="simple">
<label>S9 Fig</label>
<caption>
<title>Histogram of raw specificity error using substitution rate method by sample size and proportion.</title>
<p>Theoretical minus simulated specificity for each genetic distance threshold in 10,000 simulations of varying substitution rate and reproductive number for a given sampling proportion. Each column represents 10,000 simulations with a specific sampling proportion (colors as in <bold><xref ref-type="fig" rid="pcbi.1009182.g004">Fig 4</xref></bold>) and sample size within each proportion (determined by the final outbreak size) goes from low (top row) to high (bottom row).</p>
<p>(TIF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1009182.s010" mimetype="image/tiff" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1009182.s010" xlink:type="simple">
<label>S10 Fig</label>
<caption>
<title>Predicted versus observed false discovery rate using actual generation distribution.</title>
<p>Theoretical versus simulated false discovery rate (FDR) for each genetic distance threshold in 10,000 simulations of varying substitution rate and reproductive number. Theoretical FDR is calculated using the actual distribution of generations between infections from the corresponding simulated outbreak. White line: smoothed conditional mean; grey dashed line: <italic>y</italic> = <italic>x</italic> line. Increasing values of the sample size (<italic>M</italic>) are plotted in darker color.</p>
<p>(TIF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1009182.s011" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1009182.s011" xlink:type="simple">
<label>S1 Table</label>
<caption>
<title>Error of false discovery rate calculation by sample size.</title>
<p>(PDF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1009182.s012" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1009182.s012" xlink:type="simple">
<label>S2 Table</label>
<caption>
<title>Bias and error of false discovery rate calculation using substitution rate method.</title>
<p>(PDF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1009182.s013" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1009182.s013" xlink:type="simple">
<label>S3 Table</label>
<caption>
<title>Error of false discovery rate calculation using substitution rate method by sample size.</title>
<p>(PDF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1009182.s014" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1009182.s014" xlink:type="simple">
<label>S4 Table</label>
<caption>
<title>Bias and error of false discovery rate using actual generation distribution.</title>
<p>(PDF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1009182.s015" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1009182.s015" xlink:type="simple">
<label>S1 Text</label>
<caption>
<title>Deriving probability of transmission given linkage.</title>
<p>(PDF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1009182.s016" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1009182.s016" xlink:type="simple">
<label>S2 Text</label>
<caption>
<title>Determining sensitivity and specificity of genetic distance as a linkage criterion.</title>
<p>(PDF)</p>
</caption>
</supplementary-material>
</sec>
</body>
<back>
<ack>
<p>We thank Stuart Ray for his insightful comments on the manuscript.</p>
</ack>
<ref-list>
<title>References</title>
<ref id="pcbi.1009182.ref001"><label>1</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Neher</surname> <given-names>RA</given-names></name>, <name name-style="western"><surname>Bedford</surname> <given-names>T</given-names></name>. <article-title>Real-Time Analysis and Visualization of Pathogen Sequence Data</article-title>. <source>J Clin Microbiol</source>. <year>2018</year>;<volume>56</volume>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1128/JCM.00480-18" xlink:type="simple">10.1128/JCM.00480-18</ext-link></comment> <object-id pub-id-type="pmid">30135232</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref002"><label>2</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Quick</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Loman</surname> <given-names>NJ</given-names></name>, <name name-style="western"><surname>Duraffour</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Simpson</surname> <given-names>JT</given-names></name>, <name name-style="western"><surname>Severi</surname> <given-names>E</given-names></name>, <name name-style="western"><surname>Cowley</surname> <given-names>L</given-names></name>, <etal>et al</etal>. <article-title>Real-time, portable genome sequencing for Ebola surveillance</article-title>. <source>Nature</source>. <year>2016</year>;<volume>530</volume>: <fpage>228</fpage>–<lpage>232</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1038/nature16996" xlink:type="simple">10.1038/nature16996</ext-link></comment> <object-id pub-id-type="pmid">26840485</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref003"><label>3</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Gardy</surname> <given-names>JL</given-names></name>, <name name-style="western"><surname>Johnston</surname> <given-names>JC</given-names></name>, <name name-style="western"><surname>Ho Sui</surname> <given-names>SJ</given-names></name>, <name name-style="western"><surname>Cook</surname> <given-names>VJ</given-names></name>, <name name-style="western"><surname>Shah</surname> <given-names>L</given-names></name>, <name name-style="western"><surname>Brodkin</surname> <given-names>E</given-names></name>, <etal>et al</etal>. <article-title>Whole-genome sequencing and social-network analysis of a tuberculosis outbreak</article-title>. <source>N Engl J Med</source>. <year>2011</year>;<volume>364</volume>: <fpage>730</fpage>–<lpage>739</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1056/NEJMoa1003176" xlink:type="simple">10.1056/NEJMoa1003176</ext-link></comment> <object-id pub-id-type="pmid">21345102</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref004"><label>4</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Jackson</surname> <given-names>BR</given-names></name>, <name name-style="western"><surname>Tarr</surname> <given-names>C</given-names></name>, <name name-style="western"><surname>Strain</surname> <given-names>E</given-names></name>, <name name-style="western"><surname>Jackson</surname> <given-names>KA</given-names></name>, <name name-style="western"><surname>Conrad</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Carleton</surname> <given-names>H</given-names></name>, <etal>et al</etal>. <article-title>Implementation of Nationwide Real-time Whole-genome Sequencing to Enhance Listeriosis Outbreak Detection and Investigation</article-title>. <source>Clin Infect Dis</source>. <year>2016</year>;<volume>63</volume>: <fpage>380</fpage>–<lpage>386</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1093/cid/ciw242" xlink:type="simple">10.1093/cid/ciw242</ext-link></comment> <object-id pub-id-type="pmid">27090985</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref005"><label>5</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Snider</surname> <given-names>CJ</given-names></name>, <name name-style="western"><surname>Diop</surname> <given-names>OM</given-names></name>, <name name-style="western"><surname>Burns</surname> <given-names>CC</given-names></name>, <name name-style="western"><surname>Tangermann</surname> <given-names>RH</given-names></name>, <name name-style="western"><surname>Wassilak</surname> <given-names>SGF</given-names></name>. <article-title>Surveillance Systems to Track Progress Toward Polio Eradication—Worldwide, 2014–2015</article-title>. <source>MMWR Morb Mortal Wkly Rep</source>. <year>2016</year>;<volume>65</volume>: <fpage>346</fpage>–<lpage>351</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.15585/mmwr.mm6513a3" xlink:type="simple">10.15585/mmwr.mm6513a3</ext-link></comment> <object-id pub-id-type="pmid">27054558</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref006"><label>6</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Lei</surname> <given-names>F</given-names></name>, <name name-style="western"><surname>Shi</surname> <given-names>W</given-names></name>. <article-title>Prospective of Genomics in Revealing Transmission, Reassortment and Evolution of Wildlife-Borne Avian Influenza A (H5N1) Viruses</article-title>. <source>Curr Genomics</source>. <year>2011</year>;<volume>12</volume>: <fpage>466</fpage>–<lpage>474</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.2174/138920211797904052" xlink:type="simple">10.2174/138920211797904052</ext-link></comment> <object-id pub-id-type="pmid">22547954</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref007"><label>7</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Nelson</surname> <given-names>MI</given-names></name>, <name name-style="western"><surname>Simonsen</surname> <given-names>L</given-names></name>, <name name-style="western"><surname>Viboud</surname> <given-names>C</given-names></name>, <name name-style="western"><surname>Miller</surname> <given-names>MA</given-names></name>, <name name-style="western"><surname>Holmes</surname> <given-names>EC</given-names></name>. <article-title>Phylogenetic analysis reveals the global migration of seasonal influenza A viruses</article-title>. <source>PLoS Pathog</source>. <year>2007</year>;<volume>3</volume>: <fpage>1220</fpage>–<lpage>1228</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1371/journal.ppat.0030131" xlink:type="simple">10.1371/journal.ppat.0030131</ext-link></comment> <object-id pub-id-type="pmid">17941707</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref008"><label>8</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Gonzalez-Reiche</surname> <given-names>AS</given-names></name>, <name name-style="western"><surname>Hernandez</surname> <given-names>MM</given-names></name>, <name name-style="western"><surname>Sullivan</surname> <given-names>MJ</given-names></name>, <name name-style="western"><surname>Ciferri</surname> <given-names>B</given-names></name>, <name name-style="western"><surname>Alshammary</surname> <given-names>H</given-names></name>, <name name-style="western"><surname>Obla</surname> <given-names>A</given-names></name>, <etal>et al</etal>. <article-title>Introductions and early spread of SARS-CoV-2 in the New York City area</article-title>. <source>Science</source>. <year>2020</year>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1126/science.abc1917" xlink:type="simple">10.1126/science.abc1917</ext-link></comment> <object-id pub-id-type="pmid">32471856</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref009"><label>9</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Thézé</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Li</surname> <given-names>T</given-names></name>, <name name-style="western"><surname>du Plessis</surname> <given-names>L</given-names></name>, <name name-style="western"><surname>Bouquet</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Kraemer</surname> <given-names>MUG</given-names></name>, <name name-style="western"><surname>Somasekar</surname> <given-names>S</given-names></name>, <etal>et al</etal>. <article-title>Genomic Epidemiology Reconstructs the Introduction and Spread of Zika Virus in Central America and Mexico</article-title>. <source>Cell Host Microbe</source>. <year>2018</year>;<volume>23</volume>: <fpage>855</fpage>–<lpage>864.e7</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1016/j.chom.2018.04.017" xlink:type="simple">10.1016/j.chom.2018.04.017</ext-link></comment> <object-id pub-id-type="pmid">29805095</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref010"><label>10</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Weill</surname> <given-names>F-X</given-names></name>, <name name-style="western"><surname>Domman</surname> <given-names>D</given-names></name>, <name name-style="western"><surname>Njamkepo</surname> <given-names>E</given-names></name>, <name name-style="western"><surname>Almesbahi</surname> <given-names>AA</given-names></name>, <name name-style="western"><surname>Naji</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Nasher</surname> <given-names>SS</given-names></name>, <etal>et al</etal>. <article-title>Genomic insights into the 2016–2017 cholera epidemic in Yemen</article-title>. <source>Nature</source>. <year>2019</year>;<volume>565</volume>: <fpage>230</fpage>–<lpage>233</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1038/s41586-018-0818-3" xlink:type="simple">10.1038/s41586-018-0818-3</ext-link></comment> <object-id pub-id-type="pmid">30602788</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref011"><label>11</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Carroll</surname> <given-names>MW</given-names></name>, <name name-style="western"><surname>Matthews</surname> <given-names>DA</given-names></name>, <name name-style="western"><surname>Hiscox</surname> <given-names>JA</given-names></name>, <name name-style="western"><surname>Elmore</surname> <given-names>MJ</given-names></name>, <name name-style="western"><surname>Pollakis</surname> <given-names>G</given-names></name>, <name name-style="western"><surname>Rambaut</surname> <given-names>A</given-names></name>, <etal>et al</etal>. <article-title>Temporal and spatial analysis of the 2014–2015 Ebola virus outbreak in West Africa</article-title>. <source>Nature</source>. <year>2015</year>;<volume>524</volume>: <fpage>97</fpage>–<lpage>101</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1038/nature14594" xlink:type="simple">10.1038/nature14594</ext-link></comment> <object-id pub-id-type="pmid">26083749</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref012"><label>12</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Park</surname> <given-names>DJ</given-names></name>, <name name-style="western"><surname>Dudas</surname> <given-names>G</given-names></name>, <name name-style="western"><surname>Wohl</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Goba</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Whitmer</surname> <given-names>SLM</given-names></name>, <name name-style="western"><surname>Andersen</surname> <given-names>KG</given-names></name>, <etal>et al</etal>. <article-title>Ebola Virus Epidemiology, Transmission, and Evolution during Seven Months in Sierra Leone</article-title>. <source>Cell</source>. <year>2015</year>;<volume>161</volume>: <fpage>1516</fpage>–<lpage>1526</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1016/j.cell.2015.06.007" xlink:type="simple">10.1016/j.cell.2015.06.007</ext-link></comment> <object-id pub-id-type="pmid">26091036</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref013"><label>13</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Ratmann</surname> <given-names>O</given-names></name>, <name name-style="western"><surname>Kagaayi</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Hall</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Golubchick</surname> <given-names>T</given-names></name>, <name name-style="western"><surname>Kigozi</surname> <given-names>G</given-names></name>, <name name-style="western"><surname>Xi</surname> <given-names>X</given-names></name>, <etal>et al</etal>. <article-title>Quantifying HIV transmission flow between high-prevalence hotspots and surrounding communities: a population-based study in Rakai, Uganda</article-title>. <source>Lancet HIV</source>. <year>2020</year>;<volume>7</volume>: <fpage>e173</fpage>–<lpage>e183</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1016/S2352-3018%2819%2930378-9" xlink:type="simple">10.1016/S2352-3018(19)30378-9</ext-link></comment> <object-id pub-id-type="pmid">31953184</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref014"><label>14</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Salje</surname> <given-names>H</given-names></name>, <name name-style="western"><surname>Lessler</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Endy</surname> <given-names>TP</given-names></name>, <name name-style="western"><surname>Curriero</surname> <given-names>FC</given-names></name>, <name name-style="western"><surname>Gibbons</surname> <given-names>RV</given-names></name>, <name name-style="western"><surname>Nisalak</surname> <given-names>A</given-names></name>, <etal>et al</etal>. <article-title>Revealing the microscale spatial signature of dengue transmission and immunity in an urban population</article-title>. <source>Proc Natl Acad Sci U S A</source>. <year>2012</year>;<volume>109</volume>: <fpage>9535</fpage>–<lpage>9538</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1073/pnas.1120621109" xlink:type="simple">10.1073/pnas.1120621109</ext-link></comment> <object-id pub-id-type="pmid">22645364</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref015"><label>15</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Volz</surname> <given-names>EM</given-names></name>, <name name-style="western"><surname>Frost</surname> <given-names>SDW</given-names></name>. <article-title>Inferring the source of transmission with phylogenetic data</article-title>. <source>PLoS Comput Biol</source>. <year>2013</year>;<volume>9</volume>: <fpage>e1003397</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1371/journal.pcbi.1003397" xlink:type="simple">10.1371/journal.pcbi.1003397</ext-link></comment> <object-id pub-id-type="pmid">24367249</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref016"><label>16</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Frost</surname> <given-names>SDW</given-names></name>, <name name-style="western"><surname>Pybus</surname> <given-names>OG</given-names></name>, <name name-style="western"><surname>Gog</surname> <given-names>JR</given-names></name>, <name name-style="western"><surname>Viboud</surname> <given-names>C</given-names></name>, <name name-style="western"><surname>Bonhoeffer</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Bedford</surname> <given-names>T</given-names></name>. <article-title>Eight challenges in phylodynamic inference</article-title>. <source>Epidemics</source>. <year>2015</year>;<volume>10</volume>: <fpage>88</fpage>–<lpage>92</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1016/j.epidem.2014.09.001" xlink:type="simple">10.1016/j.epidem.2014.09.001</ext-link></comment> <object-id pub-id-type="pmid">25843391</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref017"><label>17</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Grabowski</surname> <given-names>MK</given-names></name>, <name name-style="western"><surname>Lessler</surname> <given-names>J</given-names></name>. <article-title>Phylogenetic insights into age-disparate partnerships and HIV</article-title>. <source>Lancet HIV</source>. <year>2017</year>. pp. <fpage>e8</fpage>–<lpage>e9</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1016/S2352-3018%2816%2930184-9" xlink:type="simple">10.1016/S2352-3018(16)30184-9</ext-link></comment> <object-id pub-id-type="pmid">27914876</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref018"><label>18</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Mavian</surname> <given-names>C</given-names></name>, <name name-style="western"><surname>Marini</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Manes</surname> <given-names>C</given-names></name>, <name name-style="western"><surname>Capua</surname> <given-names>I</given-names></name>, <name name-style="western"><surname>Prosperi</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Salemi</surname> <given-names>M</given-names></name>. <article-title>Regaining perspective on SARS-CoV-2 molecular tracing and its implications</article-title>. <source>medRxiv</source>. <year>2020</year>; 2020.03.16.20034470.</mixed-citation></ref>
<ref id="pcbi.1009182.ref019"><label>19</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Farhat</surname> <given-names>MR</given-names></name>, <name name-style="western"><surname>Shapiro</surname> <given-names>BJ</given-names></name>, <name name-style="western"><surname>Sheppard</surname> <given-names>SK</given-names></name>, <name name-style="western"><surname>Colijn</surname> <given-names>C</given-names></name>, <name name-style="western"><surname>Murray</surname> <given-names>M</given-names></name>. <article-title>A phylogeny-based sampling strategy and power calculator informs genome-wide associations study design for microbial pathogens</article-title>. <source>Genome Med</source>. <year>2014</year>;<volume>6</volume>: <fpage>101</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1186/s13073-014-0101-7" xlink:type="simple">10.1186/s13073-014-0101-7</ext-link></comment> <object-id pub-id-type="pmid">25484920</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref020"><label>20</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Kelly</surname> <given-names>BJ</given-names></name>, <name name-style="western"><surname>Gross</surname> <given-names>R</given-names></name>, <name name-style="western"><surname>Bittinger</surname> <given-names>K</given-names></name>, <name name-style="western"><surname>Sherrill-Mix</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Lewis</surname> <given-names>JD</given-names></name>, <name name-style="western"><surname>Collman</surname> <given-names>RG</given-names></name>, <etal>et al</etal>. <article-title>Power and sample-size estimation for microbiome studies using pairwise distances and PERMANOVA</article-title>. <source>Bioinformatics</source>. <year>2015</year>;<volume>31</volume>: <fpage>2461</fpage>–<lpage>2468</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1093/bioinformatics/btv183" xlink:type="simple">10.1093/bioinformatics/btv183</ext-link></comment> <object-id pub-id-type="pmid">25819674</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref021"><label>21</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Network</surname> <given-names>HPT</given-names></name>, Others. <source>HPTN 071: population effects of antiretroviral therapy to reduce HIV transmission (PopART): a cluster-randomized trial of the impact of a combination prevention package on population-level HIV incidence in Zambia and South Africa</source>. <year>2013</year>.</mixed-citation></ref>
<ref id="pcbi.1009182.ref022"><label>22</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Youden</surname> <given-names>WJ</given-names></name>. <article-title>Index for rating diagnostic tests</article-title>. <source>Cancer</source>. <year>1950</year>;<volume>3</volume>: <fpage>32</fpage>–<lpage>35</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1002/1097-0142%281950%293%3A1%26lt%3B32%3A%3Aaid-cncr2820030106%26gt%3B3.0.co%3B2-3" xlink:type="simple">10.1002/1097-0142(1950)3:1&lt;32::aid-cncr2820030106&gt;3.0.co;2-3</ext-link></comment> <object-id pub-id-type="pmid">15405679</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref023"><label>23</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Perkins</surname> <given-names>NJ</given-names></name>, <name name-style="western"><surname>Schisterman</surname> <given-names>EF</given-names></name>. <article-title>The inconsistency of “optimal” cutpoints obtained using two criteria based on the receiver operating characteristic curve</article-title>. <source>Am J Epidemiol</source>. <year>2006</year>;<volume>163</volume>: <fpage>670</fpage>–<lpage>675</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1093/aje/kwj063" xlink:type="simple">10.1093/aje/kwj063</ext-link></comment> <object-id pub-id-type="pmid">16410346</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref024"><label>24</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Liu</surname> <given-names>X</given-names></name>. <article-title>Classification accuracy and cut point selection</article-title>. <source>Stat Med</source>. <year>2012</year>;<volume>31</volume>: <fpage>2676</fpage>–<lpage>2686</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1002/sim.4509" xlink:type="simple">10.1002/sim.4509</ext-link></comment> <object-id pub-id-type="pmid">22307964</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref025"><label>25</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Zou</surname> <given-names>KH</given-names></name>, <name name-style="western"><surname>Yu</surname> <given-names>C-R</given-names></name>, <name name-style="western"><surname>Liu</surname> <given-names>K</given-names></name>, <name name-style="western"><surname>Carlsson</surname> <given-names>MO</given-names></name>, <name name-style="western"><surname>Cabrera</surname> <given-names>J</given-names></name>. <article-title>Optimal thresholds by maximizing or minimizing various metrics via ROC-type analysis</article-title>. <source>Acad Radiol</source>. <year>2013</year>;<volume>20</volume>: <fpage>807</fpage>–<lpage>815</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1016/j.acra.2013.02.004" xlink:type="simple">10.1016/j.acra.2013.02.004</ext-link></comment> <object-id pub-id-type="pmid">23582776</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref026"><label>26</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Jombart</surname> <given-names>T</given-names></name>, <name name-style="western"><surname>Cori</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Didelot</surname> <given-names>X</given-names></name>, <name name-style="western"><surname>Cauchemez</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Fraser</surname> <given-names>C</given-names></name>, <name name-style="western"><surname>Ferguson</surname> <given-names>N</given-names></name>. <article-title>Bayesian reconstruction of disease outbreaks by combining epidemiologic and genomic data</article-title>. <source>PLoS Comput Biol</source>. <year>2014</year>;<volume>10</volume>: <fpage>e1003457</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1371/journal.pcbi.1003457" xlink:type="simple">10.1371/journal.pcbi.1003457</ext-link></comment> <object-id pub-id-type="pmid">24465202</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref027"><label>27</label><mixed-citation publication-type="other" xlink:type="simple">R Core Team. R: A language and environment for statistical computing. 2013. Available: <ext-link ext-link-type="uri" xlink:href="http://finzi.psych.upenn.edu/R/library/dplR/doc/intro-dplR.pdf" xlink:type="simple">http://finzi.psych.upenn.edu/R/library/dplR/doc/intro-dplR.pdf</ext-link></mixed-citation></ref>
<ref id="pcbi.1009182.ref028"><label>28</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Dobrow</surname> <given-names>RP</given-names></name>. <article-title>On the distribution of distances in recursive trees</article-title>. <source>J Appl Probab</source>. <year>1996</year>;<volume>33</volume>: <fpage>749</fpage>–<lpage>757</lpage>.</mixed-citation></ref>
<ref id="pcbi.1009182.ref029"><label>29</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Mahmoud</surname> <given-names>HM</given-names></name>, <name name-style="western"><surname>Neininger</surname> <given-names>R</given-names></name>. <article-title>Distribution of distances in random binary search trees</article-title>. <source>Ann Appl Probab</source>. <year>2003</year>;<volume>13</volume>: <fpage>253</fpage>–<lpage>276</lpage>.</mixed-citation></ref>
<ref id="pcbi.1009182.ref030"><label>30</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Salje</surname> <given-names>H</given-names></name>, <name name-style="western"><surname>Cummings</surname> <given-names>DAT</given-names></name>, <name name-style="western"><surname>Lessler</surname> <given-names>J</given-names></name>. <article-title>Estimating infectious disease transmission distances using the overall distribution of cases</article-title>. <source>Epidemics</source>. <year>2016</year>;<volume>17</volume>: <fpage>10</fpage>–<lpage>18</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1016/j.epidem.2016.10.001" xlink:type="simple">10.1016/j.epidem.2016.10.001</ext-link></comment> <object-id pub-id-type="pmid">27744095</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref031"><label>31</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Worby</surname> <given-names>CJ</given-names></name>, <name name-style="western"><surname>Chang</surname> <given-names>H-H</given-names></name>, <name name-style="western"><surname>Hanage</surname> <given-names>WP</given-names></name>, <name name-style="western"><surname>Lipsitch</surname> <given-names>M</given-names></name>. <article-title>The distribution of pairwise genetic distances: a tool for investigating disease transmission</article-title>. <source>Genetics</source>. <year>2014</year>;<volume>198</volume>: <fpage>1395</fpage>–<lpage>1404</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1534/genetics.114.171538" xlink:type="simple">10.1534/genetics.114.171538</ext-link></comment> <object-id pub-id-type="pmid">25313129</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref032"><label>32</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Campbell</surname> <given-names>F</given-names></name>, <name name-style="western"><surname>Strang</surname> <given-names>C</given-names></name>, <name name-style="western"><surname>Ferguson</surname> <given-names>N</given-names></name>, <name name-style="western"><surname>Cori</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Jombart</surname> <given-names>T</given-names></name>. <article-title>When are pathogen genome sequences informative of transmission events?</article-title> <source>PLoS Pathog</source>. <year>2018</year>;<volume>14</volume>: <fpage>e1006885</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1371/journal.ppat.1006885" xlink:type="simple">10.1371/journal.ppat.1006885</ext-link></comment> <object-id pub-id-type="pmid">29420641</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref033"><label>33</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Jenkins</surname> <given-names>GM</given-names></name>, <name name-style="western"><surname>Rambaut</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Pybus</surname> <given-names>OG</given-names></name>, <name name-style="western"><surname>Holmes</surname> <given-names>EC</given-names></name>. <article-title>Rates of molecular evolution in RNA viruses: a quantitative phylogenetic analysis</article-title>. <source>J Mol Evol</source>. <year>2002</year>;<volume>54</volume>: <fpage>156</fpage>–<lpage>165</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1007/s00239-001-0064-3" xlink:type="simple">10.1007/s00239-001-0064-3</ext-link></comment> <object-id pub-id-type="pmid">11821909</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref034"><label>34</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Duchêne</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Holt</surname> <given-names>KE</given-names></name>, <name name-style="western"><surname>Weill</surname> <given-names>F-X</given-names></name>, <name name-style="western"><surname>Le Hello</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Hawkey</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Edwards</surname> <given-names>DJ</given-names></name>, <etal>et al</etal>. <article-title>Genome-scale rates of evolutionary change in bacteria.</article-title> <source>Microb Genom</source>. <year>2016</year>;<volume>2</volume>: <fpage>e000094</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1099/mgen.0.000094" xlink:type="simple">10.1099/mgen.0.000094</ext-link></comment> <object-id pub-id-type="pmid">28348834</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref035"><label>35</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>van den Driessche</surname> <given-names>P</given-names></name>. <article-title>Reproduction numbers of infectious disease models</article-title>. <source>Infect Dis Model</source>. <year>2017</year>;<volume>2</volume>: <fpage>288</fpage>–<lpage>303</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1016/j.idm.2017.06.002" xlink:type="simple">10.1016/j.idm.2017.06.002</ext-link></comment> <object-id pub-id-type="pmid">29928743</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref036"><label>36</label><mixed-citation publication-type="book" xlink:type="simple"><name name-style="western"><surname>Wickham</surname> <given-names>H</given-names></name>. <source>ggplot2: Elegant Graphics for Data Analysis</source>. <publisher-name>Springer-Verlag</publisher-name> <publisher-loc>New York</publisher-loc>; <year>2016</year>. Available: <ext-link ext-link-type="uri" xlink:href="https://ggplot2.tidyverse.org" xlink:type="simple">https://ggplot2.tidyverse.org</ext-link></mixed-citation></ref>
<ref id="pcbi.1009182.ref037"><label>37</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Wohl</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Metsky</surname> <given-names>HC</given-names></name>, <name name-style="western"><surname>Schaffner</surname> <given-names>SF</given-names></name>, <name name-style="western"><surname>Piantadosi</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Burns</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Lewnard</surname> <given-names>JA</given-names></name>, <etal>et al</etal>. <article-title>Combining genomics and epidemiology to track mumps virus transmission in the United States</article-title>. <source>PLoS Biol</source>. <year>2020</year>;<volume>18</volume>: <fpage>e3000611</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1371/journal.pbio.3000611" xlink:type="simple">10.1371/journal.pbio.3000611</ext-link></comment> <object-id pub-id-type="pmid">32045407</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref038"><label>38</label><mixed-citation publication-type="other" xlink:type="simple">Genomic epidemiology of novel coronavirus—Global subsampling. [cited 20 Mar 2021]. Available: <ext-link ext-link-type="uri" xlink:href="https://nextstrain.org/ncov/global?l=clock" xlink:type="simple">https://nextstrain.org/ncov/global?l=clock</ext-link></mixed-citation></ref>
<ref id="pcbi.1009182.ref039"><label>39</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Hadfield</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Megill</surname> <given-names>C</given-names></name>, <name name-style="western"><surname>Bell</surname> <given-names>SM</given-names></name>, <name name-style="western"><surname>Huddleston</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Potter</surname> <given-names>B</given-names></name>, <name name-style="western"><surname>Callender</surname> <given-names>C</given-names></name>, <etal>et al</etal>. <article-title>Nextstrain: real-time tracking of pathogen evolution</article-title>. <source>Bioinformatics</source>. <year>2018</year>;<volume>34</volume>: <fpage>4121</fpage>–<lpage>4123</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1093/bioinformatics/bty407" xlink:type="simple">10.1093/bioinformatics/bty407</ext-link></comment> <object-id pub-id-type="pmid">29790939</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref040"><label>40</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Sagulenko</surname> <given-names>P</given-names></name>, <name name-style="western"><surname>Puller</surname> <given-names>V</given-names></name>, <name name-style="western"><surname>Neher</surname> <given-names>RA</given-names></name>. <article-title>TreeTime: Maximum-likelihood phylodynamic analysis</article-title>. <source>Virus Evol</source>. <year>2018</year>;<volume>4</volume>: <fpage>vex042</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1093/ve/vex042" xlink:type="simple">10.1093/ve/vex042</ext-link></comment> <object-id pub-id-type="pmid">29340210</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref041"><label>41</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Ferretti</surname> <given-names>L</given-names></name>, <name name-style="western"><surname>Wymant</surname> <given-names>C</given-names></name>, <name name-style="western"><surname>Kendall</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Zhao</surname> <given-names>L</given-names></name>, <name name-style="western"><surname>Nurtay</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Abeler-Dörner</surname> <given-names>L</given-names></name>, <etal>et al</etal>. <article-title>Quantifying SARS-CoV-2 transmission suggests epidemic control with digital contact tracing</article-title>. <source>bioRxiv</source>. medRxiv; <year>2020</year>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1126/science.abb6936" xlink:type="simple">10.1126/science.abb6936</ext-link></comment> <object-id pub-id-type="pmid">32234805</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref042"><label>42</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Vink</surname> <given-names>MA</given-names></name>, <name name-style="western"><surname>Bootsma</surname> <given-names>MCJ</given-names></name>, <name name-style="western"><surname>Wallinga</surname> <given-names>J</given-names></name>. <article-title>Serial intervals of respiratory infectious diseases: a systematic review and analysis</article-title>. <source>Am J Epidemiol</source>. <year>2014</year>;<volume>180</volume>: <fpage>865</fpage>–<lpage>875</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1093/aje/kwu209" xlink:type="simple">10.1093/aje/kwu209</ext-link></comment> <object-id pub-id-type="pmid">25294601</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref043"><label>43</label><mixed-citation publication-type="book" xlink:type="simple"><name name-style="western"><surname>Anderson</surname> <given-names>RM</given-names></name>, <name name-style="western"><surname>May</surname> <given-names>RM</given-names></name>. <source>Infectious Diseases of Humans: Dynamics and Control</source>. <publisher-name>OUP Oxford</publisher-name>; <year>1992</year>.</mixed-citation></ref>
<ref id="pcbi.1009182.ref044"><label>44</label><mixed-citation publication-type="book" xlink:type="simple"><name name-style="western"><surname>Vynnycky</surname> <given-names>E</given-names></name>, <name name-style="western"><surname>White</surname> <given-names>R</given-names></name>. <source>An Introduction to Infectious Disease Modelling</source>. <publisher-name>OUP Oxford</publisher-name>; <year>2010</year>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1093/aje/kwp394" xlink:type="simple">10.1093/aje/kwp394</ext-link></comment> <object-id pub-id-type="pmid">20007674</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref045"><label>45</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Billah</surname> <given-names>MA</given-names></name>, <name name-style="western"><surname>Miah</surname> <given-names>MM</given-names></name>, <name name-style="western"><surname>Khan</surname> <given-names>MN</given-names></name>. <article-title>Reproductive number of coronavirus: A systematic review and meta-analysis based on global level evidence</article-title>. <source>PLoS One</source>. <year>2020</year>;<volume>15</volume>: <fpage>e0242128</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1371/journal.pone.0242128" xlink:type="simple">10.1371/journal.pone.0242128</ext-link></comment> <object-id pub-id-type="pmid">33175914</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref046"><label>46</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Katul</surname> <given-names>GG</given-names></name>, <name name-style="western"><surname>Mrad</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Bonetti</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Manoli</surname> <given-names>G</given-names></name>, <name name-style="western"><surname>Parolari</surname> <given-names>AJ</given-names></name>. <article-title>Global convergence of COVID-19 basic reproduction number and estimation from early-time SIR dynamics</article-title>. <source>PLoS One</source>. <year>2020</year>;<volume>15</volume>: <fpage>e0239800</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1371/journal.pone.0239800" xlink:type="simple">10.1371/journal.pone.0239800</ext-link></comment> <object-id pub-id-type="pmid">32970786</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref047"><label>47</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Klinkenberg</surname> <given-names>D</given-names></name>, <name name-style="western"><surname>Backer</surname> <given-names>JA</given-names></name>, <name name-style="western"><surname>Didelot</surname> <given-names>X</given-names></name>, <name name-style="western"><surname>Colijn</surname> <given-names>C</given-names></name>, <name name-style="western"><surname>Wallinga</surname> <given-names>J</given-names></name>. <article-title>Simultaneous inference of phylogenetic and transmission trees in infectious disease outbreaks</article-title>. <source>PLoS Comput Biol</source>. <year>2017</year>;<volume>13</volume>: <fpage>e1005495</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1371/journal.pcbi.1005495" xlink:type="simple">10.1371/journal.pcbi.1005495</ext-link></comment> <object-id pub-id-type="pmid">28545083</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref048"><label>48</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Ypma</surname> <given-names>RJF</given-names></name>, <name name-style="western"><surname>Bataille</surname> <given-names>AMA</given-names></name>, <name name-style="western"><surname>Stegeman</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Koch</surname> <given-names>G</given-names></name>, <name name-style="western"><surname>Wallinga</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>van Ballegooijen</surname> <given-names>WM</given-names></name>. <article-title>Unravelling transmission trees of infectious diseases by combining genetic and epidemiological data</article-title>. <source>Proc Biol Sci</source>. <year>2012</year>;<volume>279</volume>: <fpage>444</fpage>–<lpage>450</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1098/rspb.2011.0913" xlink:type="simple">10.1098/rspb.2011.0913</ext-link></comment> <object-id pub-id-type="pmid">21733899</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref049"><label>49</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Morelli</surname> <given-names>MJ</given-names></name>, <name name-style="western"><surname>Thébaud</surname> <given-names>G</given-names></name>, <name name-style="western"><surname>Chadœuf</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>King</surname> <given-names>DP</given-names></name>, <name name-style="western"><surname>Haydon</surname> <given-names>DT</given-names></name>, <name name-style="western"><surname>Soubeyrand</surname> <given-names>S</given-names></name>. <article-title>A Bayesian inference framework to reconstruct transmission trees using epidemiological and genetic data</article-title>. <source>PLoS Comput Biol</source>. <year>2012</year>;<volume>8</volume>: <fpage>e1002768</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1371/journal.pcbi.1002768" xlink:type="simple">10.1371/journal.pcbi.1002768</ext-link></comment> <object-id pub-id-type="pmid">23166481</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref050"><label>50</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Stack</surname> <given-names>JC</given-names></name>, <name name-style="western"><surname>Welch</surname> <given-names>JD</given-names></name>, <name name-style="western"><surname>Ferrari</surname> <given-names>MJ</given-names></name>, <name name-style="western"><surname>Shapiro</surname> <given-names>BU</given-names></name>, <name name-style="western"><surname>Grenfell</surname> <given-names>BT</given-names></name>. <article-title>Protocols for sampling viral sequences to study epidemic dynamics</article-title>. <source>J R Soc Interface</source>. <year>2010</year>;<volume>7</volume>: <fpage>1119</fpage>–<lpage>1127</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1098/rsif.2009.0530" xlink:type="simple">10.1098/rsif.2009.0530</ext-link></comment> <object-id pub-id-type="pmid">20147314</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref051"><label>51</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>de Silva</surname> <given-names>E</given-names></name>, <name name-style="western"><surname>Ferguson</surname> <given-names>NM</given-names></name>, <name name-style="western"><surname>Fraser</surname> <given-names>C</given-names></name>. <article-title>Inferring pandemic growth rates from sequence data</article-title>. <source>J R Soc Interface</source>. <year>2012</year>;<volume>9</volume>: <fpage>1797</fpage>–<lpage>1808</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1098/rsif.2011.0850" xlink:type="simple">10.1098/rsif.2011.0850</ext-link></comment> <object-id pub-id-type="pmid">22337627</object-id></mixed-citation></ref>
<ref id="pcbi.1009182.ref052"><label>52</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Hall</surname> <given-names>MD</given-names></name>, <name name-style="western"><surname>Woolhouse</surname> <given-names>MEJ</given-names></name>, <name name-style="western"><surname>Rambaut</surname> <given-names>A</given-names></name>. <article-title>The effects of sampling strategy on the quality of reconstruction of viral population dynamics using Bayesian skyline family coalescent methods: A simulation study</article-title>. <source>Virus Evol</source>. <year>2016</year>;<volume>2</volume>: <fpage>vew003</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1093/ve/vew003" xlink:type="simple">10.1093/ve/vew003</ext-link></comment> <object-id pub-id-type="pmid">27774296</object-id></mixed-citation></ref>
</ref-list>
</back>
<sub-article article-type="author-comment" id="pcbi.1009182.r001" specific-use="rebutted-decision-letter-unavailable">
<front-stub>
<article-id pub-id-type="doi">10.1371/journal.pcbi.1009182.r001</article-id>
<title-group>
<article-title>Author response to previous submission</article-title>
</title-group>
<custom-meta-group>
<custom-meta>
<meta-name>Submission Version</meta-name>
<meta-value>0</meta-value>
</custom-meta>
</custom-meta-group>
</front-stub>
<body>
<p>
<named-content content-type="author-response-date">2 Dec 2020</named-content>
</p>
<supplementary-material id="pcbi.1009182.s017" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1009182.s017" xlink:type="simple">
<label>Attachment</label>
<caption>
<p>Submitted filename: <named-content content-type="submitted-filename">phylosamp_elife_reviewerresponse.pdf</named-content></p>
</caption>
</supplementary-material>
</body>
</sub-article>
<sub-article article-type="aggregated-review-documents" id="pcbi.1009182.r002" specific-use="decision-letter">
<front-stub>
<article-id pub-id-type="doi">10.1371/journal.pcbi.1009182.r002</article-id>
<title-group>
<article-title>Decision Letter 0</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name name-style="western">
<surname>Pitzer</surname>
<given-names>Virginia E.</given-names>
</name>
<role>Deputy Editor</role>
</contrib>
</contrib-group>
<permissions>
<copyright-year>2021</copyright-year>
<copyright-holder>Virginia E. Pitzer</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<license-p>This is an open access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">Creative Commons Attribution License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.</license-p>
</license>
</permissions>
<related-object document-id="10.1371/journal.pcbi.1009182" document-id-type="doi" document-type="article" id="rel-obj002" link-type="peer-reviewed-article"/>
<custom-meta-group>
<custom-meta>
<meta-name>Submission Version</meta-name>
<meta-value>0</meta-value>
</custom-meta>
</custom-meta-group>
</front-stub>
<body>
<p>
<named-content content-type="letter-date">20 Jan 2021</named-content>
</p>
<p>Dear Dr. Lessler,</p>
<p>Thank you very much for submitting your manuscript "Sample Size Calculation for Phylogenetic Case Linkage" for consideration at PLOS Computational Biology.</p>
<p>As with all papers reviewed by the journal, your manuscript was reviewed by members of the editorial board and by several independent reviewers. In light of the reviews (below this email), we would like to invite the resubmission of a significantly-revised version that takes into account the reviewers' comments.</p>
<p>As you'll see, the reviews are mixed. While Reviewer 2 has only minor suggestions for improvements in the clarity of the text, Reviewers 1 and 3 have more substantive comments. In particular, both would like to see the methods illustrated using an openly available real data set, and I strongly encourage the authors to do so. Reviewer 3 (who previously reviewed the manuscript at eLife) still has some substantive methodological concerns, although I think that these can be addressed.</p>
<p>We cannot make any decision about publication until we have seen the revised manuscript and your response to the reviewers' comments. Your revised manuscript is also likely to be sent to reviewers for further evaluation.</p>
<p>When you are ready to resubmit, please upload the following:</p>
<p>[1] A letter containing a detailed list of your responses to the review comments and a description of the changes you have made in the manuscript. Please note while forming your response, if your article is accepted, you may have the opportunity to make the peer review history publicly available. The record will include editor decision letters (with reviews) and your responses to reviewer comments. If eligible, we will contact you to opt in or out.</p>
<p>[2] Two versions of the revised manuscript: one with either highlights or tracked changes denoting where the text has been changed; the other a clean version (uploaded as the manuscript file).</p>
<p>Important additional instructions are given below your reviewer comments.</p>
<p>Please prepare and submit your revised manuscript within 60 days. If you anticipate any delay, please let us know the expected resubmission date by replying to this email. Please note that revised manuscripts received after the 60-day due date may require evaluation and peer review similar to newly submitted manuscripts.</p>
<p>Thank you again for your submission. We hope that our editorial process has been constructive so far, and we welcome your feedback at any time. Please don't hesitate to contact us if you have any questions or comments.</p>
<p>Sincerely,</p>
<p>Virginia E. Pitzer, Sc.D.</p>
<p>Deputy Editor-in-Chief</p>
<p>PLOS Computational Biology</p>
<p>Virginia Pitzer</p>
<p>Deputy Editor-in-Chief</p>
<p>PLOS Computational Biology</p>
<p>***********************</p>
<p>As you'll see, the reviews are mixed. While Reviewer 2 has only minor suggestions for improvements in the clarity of the text, Reviewers 1 and 3 have more substantive comments. In particular, both would like to see the methods illustrated using an openly available real data set, and I strongly encourage the authors to do so. Reviewer 3 (who previously reviewed the manuscript at eLife) still has some substantive methodological concerns, although I think that these can be addressed.</p>
<p>Reviewer's Responses to Questions</p>
<p><bold>Comments to the Authors:</bold></p>
<p><bold>Please note here if the review is uploaded as an attachment.</bold></p>
<p>Reviewer #1: Wohl et al present a statistical framework for calculating sample sizes for robust determinations of the infector-infectee pairs within transmission chains of pathogen genomic epidemiology studies. Their framework also provides methods for calculating FDR and the expected number of true transmission pairs from the specificity and sensitivity of the linkage criteria (genetic distances), sample size, the proportion of samples sequenced, and the effective reproductive number of the pathogen analysed. The authors demonstrate the utility of this framework with simulation data and developed the R package “phylosamp” to provide an implementation of their framework.</p>
<p>This manuscript addresses a neglected problem in many genetic epidemiology studies regarding the level of sequencing required to be carried out in order for robust conclusions to be made when reconstructing transmission chains of pathogen outbreaks using WGS data. The work is novel as there is a lack of current formal agreed-upon standards for carrying out this aspect of study design, and is both relevant and timely given the increasing widespread adoption of genetic epidemiology techniques for understanding pathogen transmission dynamics. Further, the manuscript is well written, the underlying methodology well described, and the use cases of the software and limitations are appropriately discussed.</p>
<p>Please find my comments below, divided into different sections for (a) the manuscript describing the framework and (b) the R package phylosamp. I hope these are useful to the authors.</p>
<p>A. Manuscript comments:</p>
<p>(1) The reliance of phylosamp at present on genetic distances alone as the linkage criteria presents a key limitation in calculating appropriate sample sizes and other parameters for a study concerning slowly evolving pathogens where there is limited genetic variation accumulating between transmission pairs/generations which prohibits their detection from WGS alone. I recognise that the focus of this manuscript is a first step towards more comprehensive approaches, and that these concerns are discussed in both the manuscript, and in previous supplied reviews from a submission to eLife, but also believe that this limits the utility of the software for many genetic epidemiology studies.</p>
<p>(2) While the simulation data provide a useful and convincing illustration of the framework, it would be excellent to also see an example application of phylosamp to an existing published pathogen dataset to further demonstrate its utility. Again, I recognise that this has been discussed in previous reviews from a submission to eLife, but the inclusion of such data would present a substantial improvement to the work and encourage further adoption of the framework.</p>
<p>(3) The definition of Rpop provided from line 100, where it is first introduced, requires rephrasing for clarity. While this is better described later in the manuscript from line 149, the earlier text could be clarified to avoid the reader having to scroll back and forth throughout the paper. I recognise that this text has already been refined based on the reviewer comments from the previous submission to eLife, however, it could benefit from further refinement for improved clarity and flow.</p>
<p>(4) Figure 1B: Does each white dot indicate the sensitivity and 1-specificity for a SNP/genetic distance increased in increments of 1? i.e. 0, 1, 2, 3, 4, … SNPs? If so, it would be helpful to indicate the values of these increments either by annotation of the figure itself or expansion of the figure legend to improve clarity.</p>
<p>(5) Line 242: There don’t appear to be any citations for the range of effective reproductive numbers of human pathogens explored in simulation studies.</p>
<p>(6) Figure S5: It appears that either the figure panels or the legend descriptions might be inverted for A and B, as well as C and D.</p>
<p>(7) The authors have put substantial effort into making their work openly available by submitting a preprint on medrxiv and providing all code and data files required to reproduce their analyses and manuscript figures via github (available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/HopkinsIDD/phylosamplesize" xlink:type="simple">https://github.com/HopkinsIDD/phylosamplesize</ext-link>). I was able to reproduce all figures and analyses until line #113 of figures.Rmd at which point I was unable to proceed further.</p>
<p>i.e.</p>
<p># first time only: calculate tfdr from simulations and save to file</p>
<p>calc.tfdr(simdata="data/simdata_var_N10000",rho_values=c(0.1,0.25,0.5,0.75),max_sim_size=2000,</p>
<p>sens_spec_method="sim",mgd=mgd,outdir="data/full_data_sim.Rdata")</p>
<p>I think this might be due to the files being specified by the prefix “simdata_var_N10000” where it might need to be instead specified as “simdata_var_gen_N10000”, but the authors may need to look into this further.</p>
<p>B. Phylosamp R package and documentation comments:</p>
<p>Code from the R package was clearly structured and generally well commented. The package is freely available and easily installed via the devtools library. I was able to reproduce the results from the vignette code easily and without issue, and found the explanations very clear and informative. I have provided some comments on the R package and documentation below that I hope are useful to the authors, but do not regard any of these to be critical changes, nor do I require that these suggested changes be made for the publication of this manuscript.</p>
<p>- In the vignettes it may be worth providing a simple reiteration of what each argument provided to the function is in the vignette (e.g. for eta, chi, rho)</p>
<p>- There appears to be a typo at the top of the ‘Illustrated examples’ vignette page, I think “this vignette…” should perhaps be “In this vignette…”.</p>
<p>- When using the help operator in R, I found the package to be well documented for all functions, but at times it was a little unclear to me which defaults were used when these are not supplied explicitly by the user i.e. the assumption argument. I think based on the manuscript and example function provided via the help operation in R this is mtml for ‘multiple-transmission multiple-linkage’, but perhaps this could be further clarified in the package documentation.</p>
<p>Reviewer #2: Wohl et al. present a method for understanding how sampling, both in terms of overall depth and in terms of proportion, influences how accurately we can identify true infector-infectee pairs (linked cases) from a phylogeny of pathogen genomes. This theoretical area of genomic epidemiology is sorely underdeveloped, especially when compared to the rigorous theoretical framework for sampling design available for traditional epidemiological studies. This work is the first real step I’ve seen to develop sample size calculations for genomic epidemiological studies. The manuscript is clearly written, and I am satisfied by how the authors have addressed previous reviewer comments. While this work should be accepted, I do have some minor comments that should be addressed to avoid reader confusion and position this paper in the appropriate context. These comments do not require further analytic work; they are only textual changes.</p>
<p>1. In the Introduction the authors draw on many examples of how pathogen genomic information can be used to investigate public health questions (lines 34-37) at multiple scales (lines 47-49), and declare that all of those questions can be boiled down to a question of asking whether pairs of infections are related. I disagree with this, especially within the context of sampling. Sampling considerations within phylogeographic studies, which seek to infer patterns of spatial linkage, center on the assumption that sampling must be sufficiently broad and random to have fully sampled all circulating genetic lineages, generally at an intensity that is proportional to a lineage’s prevalence. For those questions I don’t see how it’s important that linked pairs are captured, and thus I don’t see how this method would help me to design better phylogeographic studies. I would recommend that the authors pivot their introduction to orient this work towards phylogenetic studies of “Who Infected Whom” or phylogenetic birth-death processes, where this method seems most useful.</p>
<p>2. In the section “Determining sensitivity and specificity” the discussion of “mutation rate” is confusing. Given that the generation time is the serial interval between infections, the rate at which changes in the genome would accrue AND be observed at the consensus level should be referred to as the pathogen “substitution rate” rather than the “mutation rate”. I realize that may sound pedantic, but this actually caused some confusion for me given that the selected example rate of 1 mutation/genome/generation is actually a reasonable expectation of the biological mutation rate per pathogen replication cycle.</p>
<p>3. I presume that the high substitution rate was selected such that differences in the distributions of expected mutations between linked and unlinked cases (Fig 2B) would appear more distinct. Using genetic distance as the sole basis for distinguishing linked and unlinked cases gets significantly murkier for “natural” substitution rates, as the authors have shown nicely in Fig S4, mentioned on lines 229-230, and discussed in the Discussion. I appreciate those efforts, and I want to stress that I do not feel that this rate selection is disingenuous in any way. However, in the Discussion the authors’ solution to this issue is to incorporate epidemiological data (such as location data, symptom onset date, contact history etc) to improve resolution of linked versus unlinked cases. Again, I don’t deny that multiple data sources would improve these designations, but it is unclear to me how one would then calculate sensitivity and specificity. Given that this method relies upon knowing those values, this solution actually seems quite challenging to implement and at least mentioning that in the Discussion is important.</p>
<p>4. I find the R_pop quantity to be highly unintuitive. While we generally discuss R_eff as changing over an outbreak given depletion of susceptibles, I’ve never seen a formulation where the average R is calculated across the population with terminal samples presumed to be 0 because their child infections are not sampled. I will say that Figure S2 helped to clarify this concept greatly, and I’m thankful for that addition. However, I still find the in-text explanation (lines 145-157) very confusing. I think the key to making this clearer is to explicitly say that, within the bounded sampling frame, any terminal nodes (leaves) in the tree/transmission network are presumed to have no known child infections, and thus contribute an R value of 0, which is what allows R_pop to drop below one even for diseases where R_eff is easily greater than one.</p>
<p>Reviewer #3: In this work the authors seek to provide guidance to understand how sampling impacts the discovery of transmission events using genomic data. The question is interesting and important but the exploration here is limited to the simplest transmission scenario, with a single introduction, uniform random sampling, a known sensitivity and specificity of the genetic linkage system used (or this can be estimated but again it requires some strong assumptions) and Poisson distributed secondary infections. There is no application to real data, either for a sequenced (or partially sequenced) outbreak with analysis of the study design, or for the exploration of the linkage criteria.</p>
<p>The "single linkage" assumption seems hard to justify and the authors give a derivation of the main result in S1 Text part D, so it's not clear why this assumption merits so much discussion earlier.</p>
<p>On page 16 of SI Text, k_i is the number of i's true transmission links that are in the sample. So k_i has to add to something less than M, the number of samples. This means that K (sum_i k_i) is not a sum of *independent* Poisson distributed random variables with rate parameter lambda - they are dependent because their sum is constrained. This impacts the expected number of pairs. It would be approximately correct if the sampling fraction is very small, because the sum of k_i would not approach M so the constraint would have minimal impact. But particularly in this paper, something whose bias gets more severe in a way that depends on the sampling fraction is not good. Also the distribution of the number of pairs is important (not just the expectation).</p>
<p>On the same page I don't get the E(number of true pairs) / Pr(pair is true) - could this be a typo?</p>
<p>- Chi, not X, should be in Table 1</p>
<p>**********</p>
<p><bold>Have all data underlying the figures and results presented in the manuscript been provided?</bold></p>
<p>Large-scale datasets should be made available via a public repository as described in the <italic>PLOS Computational Biology</italic> <ext-link ext-link-type="uri" xlink:href="http://journals.plos.org/ploscompbiol/s/data-availability" xlink:type="simple">data availability policy</ext-link>, and numerical data that underlies graphs or summary statistics should be provided in spreadsheet form as supporting information.</p>
<p>Reviewer #1: Yes</p>
<p>Reviewer #2: None</p>
<p>Reviewer #3: Yes</p>
<p>**********</p>
<p>PLOS authors have the option to publish the peer review history of their article (<ext-link ext-link-type="uri" xlink:href="https://journals.plos.org/ploscompbiol/s/editorial-and-peer-review-process#loc-peer-review-history" xlink:type="simple">what does this mean?</ext-link>). If published, this will include your full peer review and any attached files.</p>
<p>If you choose “no”, your identity will remain anonymous but your review may still be made public.</p>
<p><bold>Do you want your identity to be public for this peer review?</bold> For information about this choice, including consent withdrawal, please see our <ext-link ext-link-type="uri" xlink:href="https://www.plos.org/privacy-policy" xlink:type="simple">Privacy Policy</ext-link>.</p>
<p>Reviewer #1: No</p>
<p>Reviewer #2: No</p>
<p>Reviewer #3: No</p>
<p><underline>Figure Files:</underline></p>
<p>While revising your submission, please upload your figure files to the Preflight Analysis and Conversion Engine (PACE) digital diagnostic tool, <underline><ext-link ext-link-type="uri" xlink:href="https://pacev2.apexcovantage.com/" xlink:type="simple">https://pacev2.apexcovantage.com</ext-link></underline>. PACE helps ensure that figures meet PLOS requirements. To use PACE, you must first register as a user. Then, login and navigate to the UPLOAD tab, where you will find detailed instructions on how to use the tool. If you encounter any issues or have any questions when using PACE, please email us at <underline><email xlink:type="simple">figures@plos.org</email></underline>.</p>
<p><underline>Data Requirements:</underline></p>
<p>Please note that, as a condition of publication, PLOS' data policy requires that you make available all data used to draw the conclusions outlined in your manuscript. Data must be deposited in an appropriate repository, included within the body of the manuscript, or uploaded as supporting information. This includes all numerical values that were used to generate graphs, histograms etc. For an example in PLOS Biology see here: <ext-link ext-link-type="uri" xlink:href="http://www.plosbiology.org/article/info%3Adoi%2F10.1371%2Fjournal.pbio.1001908#s5" xlink:type="simple">http://www.plosbiology.org/article/info%3Adoi%2F10.1371%2Fjournal.pbio.1001908#s5</ext-link>.</p>
<p><underline>Reproducibility:</underline></p>
<p>To enhance the reproducibility of your results, PLOS recommends that you deposit laboratory protocols in protocols.io, where a protocol can be assigned its own identifier (DOI) such that it can be cited independently in the future. For instructions, please see <underline><ext-link ext-link-type="uri" xlink:href="http://journals.plos.org/ploscompbiol/s/submission-guidelines#loc-materials-and-methods" xlink:type="simple">http://journals.plos.org/ploscompbiol/s/submission-guidelines#loc-materials-and-methods</ext-link></underline>.</p>
</body>
</sub-article>
<sub-article article-type="author-comment" id="pcbi.1009182.r003">
<front-stub>
<article-id pub-id-type="doi">10.1371/journal.pcbi.1009182.r003</article-id>
<title-group>
<article-title>Author response to Decision Letter 0</article-title>
</title-group>
<related-object document-id="10.1371/journal.pcbi.1009182" document-id-type="doi" document-type="peer-reviewed-article" id="rel-obj003" link-type="rebutted-decision-letter" object-id="10.1371/journal.pcbi.1009182.r002" object-id-type="doi" object-type="decision-letter"/>
<custom-meta-group>
<custom-meta>
<meta-name>Submission Version</meta-name>
<meta-value>1</meta-value>
</custom-meta>
</custom-meta-group>
</front-stub>
<body>
<p>
<named-content content-type="author-response-date">9 Apr 2021</named-content>
</p>
<supplementary-material id="pcbi.1009182.s018" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1009182.s018" xlink:type="simple">
<label>Attachment</label>
<caption>
<p>Submitted filename: <named-content content-type="submitted-filename">phylosamp_ploscompbio_reviewerresponse.pdf</named-content></p>
</caption>
</supplementary-material>
</body>
</sub-article>
<sub-article article-type="aggregated-review-documents" id="pcbi.1009182.r004" specific-use="decision-letter">
<front-stub>
<article-id pub-id-type="doi">10.1371/journal.pcbi.1009182.r004</article-id>
<title-group>
<article-title>Decision Letter 1</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name name-style="western">
<surname>Pitzer</surname>
<given-names>Virginia E.</given-names>
</name>
<role>Deputy Editor-in-Chief</role>
</contrib>
</contrib-group>
<permissions>
<copyright-year>2021</copyright-year>
<copyright-holder>Virginia E. Pitzer</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<license-p>This is an open access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">Creative Commons Attribution License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.</license-p>
</license>
</permissions>
<related-object document-id="10.1371/journal.pcbi.1009182" document-id-type="doi" document-type="article" id="rel-obj004" link-type="peer-reviewed-article"/>
<custom-meta-group>
<custom-meta>
<meta-name>Submission Version</meta-name>
<meta-value>1</meta-value>
</custom-meta>
</custom-meta-group>
</front-stub>
<body>
<p>
<named-content content-type="letter-date">20 May 2021</named-content>
</p>
<p>Dear Dr. Lessler,</p>
<p>Thank you very much for submitting your manuscript "Sample Size Calculation for Phylogenetic Case Linkage" for consideration at PLOS Computational Biology. As with all papers reviewed by the journal, your manuscript was reviewed by members of the editorial board and by several independent reviewers. The reviewers appreciated the attention to an important topic. Based on the reviews, we are likely to accept this manuscript for publication, providing that you modify the manuscript according to the review recommendations.</p>
<p>Please address the very minor points raised by the reviewer. Also, note that some of the variables did not render correctly in the pdf of the main text (at least not on my computer). Please check the final submission and ensure that it looks correct. Once these minor points have been addressed, we should be able to accept the manuscript without further review.</p>
<p>Please prepare and submit your revised manuscript within 30 days. If you anticipate any delay, please let us know the expected resubmission date by replying to this email.</p>
<p>When you are ready to resubmit, please upload the following:</p>
<p>[1] A letter containing a detailed list of your responses to all review comments, and a description of the changes you have made in the manuscript. Please note while forming your response, if your article is accepted, you may have the opportunity to make the peer review history publicly available. The record will include editor decision letters (with reviews) and your responses to reviewer comments. If eligible, we will contact you to opt in or out.</p>
<p>[2] Two versions of the revised manuscript: one with either highlights or tracked changes denoting where the text has been changed; the other a clean version (uploaded as the manuscript file).</p>
<p>Important additional instructions are given below your reviewer comments.</p>
<p>Thank you again for your submission to our journal. We hope that our editorial process has been constructive so far, and we welcome your feedback at any time. Please don't hesitate to contact us if you have any questions or comments.</p>
<p>Sincerely,</p>
<p>Virginia E. Pitzer, Sc.D.</p>
<p>Deputy Editor-in-Chief</p>
<p>PLOS Computational Biology</p>
<p>Virginia Pitzer</p>
<p>Deputy Editor-in-Chief</p>
<p>PLOS Computational Biology</p>
<p>***********************</p>
<p>A link appears below if there are any accompanying review attachments. If you believe any reviews to be missing, please contact <email xlink:type="simple">ploscompbiol@plos.org</email> immediately:</p>
<p>[LINK]</p>
<p>Please address the very minor points raised by the reviewer. Also, note that some of the variables did not render correctly in the pdf of the main text (at least not on my computer). Please check the final submission and ensure that it looks correct. Once these minor points have been addressed, we should be able to accept the manuscript without further review.</p>
<p>Reviewer's Responses to Questions</p>
<p><bold>Comments to the Authors:</bold></p>
<p><bold>Please note here if the review is uploaded as an attachment.</bold></p>
<p>Reviewer #1: Many thanks to the authors for considering the points outlined in my previous review. I am satisfied that the authors have adequately addressed all points raised and include only minor typographical feedback below.</p>
<p>Line 136 (marked up version): It may be worth changing mutation to substitution here "rate = 1 mutation/genome/transmission"</p>
<p>Line 281 (marked up version): It might be worth changing the section heading to reflect that it contains multiple examples i.e. "Application to existing datasets"</p>
<p>Lines 386 and 413 (marked up version): The same subheading is used twice for each of the examples, it may be worth making them more specific to the example detailed in each section.</p>
<p>**********</p>
<p><bold>Have the authors made all data and (if applicable) computational code underlying the findings in their manuscript fully available?</bold></p>
<p>The <ext-link ext-link-type="uri" xlink:href="https://journals.plos.org/ploscompbiol/s/materials-and-software-sharing" xlink:type="simple">PLOS Data policy</ext-link> requires authors to make all data and code underlying the findings described in their manuscript fully available without restriction, with rare exception (please refer to the Data Availability Statement in the manuscript PDF file). The data and code should be provided as part of the manuscript or its supporting information, or deposited to a public repository. For example, in addition to summary statistics, the data points behind means, medians and variance measures should be available. If there are restrictions on publicly sharing data or code —e.g. participant privacy or use of data from a third party—those must be specified.</p>
<p>Reviewer #1: Yes</p>
<p>**********</p>
<p>PLOS authors have the option to publish the peer review history of their article (<ext-link ext-link-type="uri" xlink:href="https://journals.plos.org/ploscompbiol/s/editorial-and-peer-review-process#loc-peer-review-history" xlink:type="simple">what does this mean?</ext-link>). If published, this will include your full peer review and any attached files.</p>
<p>If you choose “no”, your identity will remain anonymous but your review may still be made public.</p>
<p><bold>Do you want your identity to be public for this peer review?</bold> For information about this choice, including consent withdrawal, please see our <ext-link ext-link-type="uri" xlink:href="https://www.plos.org/privacy-policy" xlink:type="simple">Privacy Policy</ext-link>.</p>
<p>Reviewer #1: No</p>
<p>Figure Files:</p>
<p>While revising your submission, please upload your figure files to the Preflight Analysis and Conversion Engine (PACE) digital diagnostic tool, <ext-link ext-link-type="uri" xlink:href="https://pacev2.apexcovantage.com" xlink:type="simple">https://pacev2.apexcovantage.com</ext-link>. PACE helps ensure that figures meet PLOS requirements. To use PACE, you must first register as a user. Then, login and navigate to the UPLOAD tab, where you will find detailed instructions on how to use the tool. If you encounter any issues or have any questions when using PACE, please email us at <email xlink:type="simple">figures@plos.org</email>.</p>
<p>Data Requirements:</p>
<p>Please note that, as a condition of publication, PLOS' data policy requires that you make available all data used to draw the conclusions outlined in your manuscript. Data must be deposited in an appropriate repository, included within the body of the manuscript, or uploaded as supporting information. This includes all numerical values that were used to generate graphs, histograms etc. For an example in PLOS Biology see here: <ext-link ext-link-type="uri" xlink:href="http://www.plosbiology.org/article/info%3Adoi%2F10.1371%2Fjournal.pbio.1001908#s5" xlink:type="simple">http://www.plosbiology.org/article/info%3Adoi%2F10.1371%2Fjournal.pbio.1001908#s5</ext-link>.</p>
<p>Reproducibility:</p>
<p>To enhance the reproducibility of your results, we recommend that you deposit your laboratory protocols in protocols.io, where a protocol can be assigned its own identifier (DOI) such that it can be cited independently in the future. Additionally, PLOS ONE offers an option to publish peer-reviewed clinical study protocols. Read more information on sharing protocols at <ext-link ext-link-type="uri" xlink:href="https://plos.org/protocols?utm_medium=editorial-email&amp;utm_source=authorletters&amp;utm_campaign=protocols" xlink:type="simple">https://plos.org/protocols?utm_medium=editorial-email&amp;utm_source=authorletters&amp;utm_campaign=protocols</ext-link></p>
<p>References:</p>
<p>Review your reference list to ensure that it is complete and correct. If you have cited papers that have been retracted, please include the rationale for doing so in the manuscript text, or remove these references and replace them with relevant current references. Any changes to the reference list should be mentioned in the rebuttal letter that accompanies your revised manuscript.</p>
<p><italic>If you need to cite a retracted article, indicate the article’s retracted status in the References list and also include a citation and full reference for the retraction notice.</italic></p>
</body>
</sub-article>
<sub-article article-type="author-comment" id="pcbi.1009182.r005">
<front-stub>
<article-id pub-id-type="doi">10.1371/journal.pcbi.1009182.r005</article-id>
<title-group>
<article-title>Author response to Decision Letter 1</article-title>
</title-group>
<related-object document-id="10.1371/journal.pcbi.1009182" document-id-type="doi" document-type="peer-reviewed-article" id="rel-obj005" link-type="rebutted-decision-letter" object-id="10.1371/journal.pcbi.1009182.r004" object-id-type="doi" object-type="decision-letter"/>
<custom-meta-group>
<custom-meta>
<meta-name>Submission Version</meta-name>
<meta-value>2</meta-value>
</custom-meta>
</custom-meta-group>
</front-stub>
<body>
<p>
<named-content content-type="author-response-date">2 Jun 2021</named-content>
</p>
<supplementary-material id="pcbi.1009182.s019" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1009182.s019" xlink:type="simple">
<label>Attachment</label>
<caption>
<p>Submitted filename: <named-content content-type="submitted-filename">phylosamp_ploscompbio_reviewerresponse2.docx</named-content></p>
</caption>
</supplementary-material>
</body>
</sub-article>
<sub-article article-type="editor-report" id="pcbi.1009182.r006" specific-use="decision-letter">
<front-stub>
<article-id pub-id-type="doi">10.1371/journal.pcbi.1009182.r006</article-id>
<title-group>
<article-title>Decision Letter 2</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name name-style="western">
<surname>Pitzer</surname>
<given-names>Virginia E.</given-names>
</name>
<role>Deputy Editor-in-Chief</role>
</contrib>
</contrib-group>
<permissions>
<copyright-year>2021</copyright-year>
<copyright-holder>Virginia E. Pitzer</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<license-p>This is an open access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">Creative Commons Attribution License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.</license-p>
</license>
</permissions>
<related-object document-id="10.1371/journal.pcbi.1009182" document-id-type="doi" document-type="article" id="rel-obj006" link-type="peer-reviewed-article"/>
<custom-meta-group>
<custom-meta>
<meta-name>Submission Version</meta-name>
<meta-value>2</meta-value>
</custom-meta>
</custom-meta-group>
</front-stub>
<body>
<p>
<named-content content-type="letter-date">14 Jun 2021</named-content>
</p>
<p>Dear Dr. Lessler,</p>
<p>We are pleased to inform you that your manuscript 'Sample Size Calculation for Phylogenetic Case Linkage' has been provisionally accepted for publication in PLOS Computational Biology.</p>
<p>Before your manuscript can be formally accepted you will need to complete some formatting changes, which you will receive in a follow up email. A member of our team will be in touch with a set of requests.</p>
<p>Please note that your manuscript will not be scheduled for publication until you have made the required changes, so a swift response is appreciated.</p>
<p>IMPORTANT: The editorial review process is now complete. PLOS will only permit corrections to spelling, formatting or significant scientific errors from this point onwards. Requests for major changes, or any which affect the scientific understanding of your work, will cause delays to the publication date of your manuscript.</p>
<p>Should you, your institution's press office or the journal office choose to press release your paper, you will automatically be opted out of early publication. We ask that you notify us now if you or your institution is planning to press release the article. All press must be co-ordinated with PLOS.</p>
<p>Thank you again for supporting Open Access publishing; we are looking forward to publishing your work in PLOS Computational Biology. </p>
<p>Best regards,</p>
<p>Virginia E. Pitzer, Sc.D.</p>
<p>Deputy Editor-in-Chief</p>
<p>PLOS Computational Biology</p>
<p>Virginia Pitzer</p>
<p>Deputy Editor-in-Chief</p>
<p>PLOS Computational Biology</p>
<p>***********************************************************</p>
</body>
</sub-article>
<sub-article article-type="editor-report" id="pcbi.1009182.r007" specific-use="acceptance-letter">
<front-stub>
<article-id pub-id-type="doi">10.1371/journal.pcbi.1009182.r007</article-id>
<title-group>
<article-title>Acceptance letter</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name name-style="western">
<surname>Pitzer</surname>
<given-names>Virginia E.</given-names>
</name>
<role>Deputy Editor-in-Chief</role>
</contrib>
</contrib-group>
<permissions>
<copyright-year>2021</copyright-year>
<copyright-holder>Virginia E. Pitzer</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<license-p>This is an open access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">Creative Commons Attribution License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.</license-p>
</license>
</permissions>
<related-object document-id="10.1371/journal.pcbi.1009182" document-id-type="doi" document-type="article" id="rel-obj007" link-type="peer-reviewed-article"/>
</front-stub>
<body>
<p>
<named-content content-type="letter-date">30 Jun 2021</named-content>
</p>
<p>PCOMPBIOL-D-20-02147R2 </p>
<p>Sample Size Calculation for Phylogenetic Case Linkage</p>
<p>Dear Dr Lessler,</p>
<p>I am pleased to inform you that your manuscript has been formally accepted for publication in PLOS Computational Biology. Your manuscript is now with our production department and you will be notified of the publication date in due course.</p>
<p>The corresponding author will soon be receiving a typeset proof for review, to ensure errors have not been introduced during production. Please review the PDF proof of your manuscript carefully, as this is the last chance to correct any errors. Please note that major changes, or those which affect the scientific understanding of the work, will likely cause delays to the publication date of your manuscript. </p>
<p>Soon after your final files are uploaded, unless you have opted out, the early version of your manuscript will be published online. The date of the early version will be your article's publication date. The final article will be published to the same URL, and all versions of the paper will be accessible to readers.</p>
<p>Thank you again for supporting PLOS Computational Biology and open-access publishing. We are looking forward to publishing your work! </p>
<p>With kind regards,</p>
<p>Katalin Szabo</p>
<p>PLOS Computational Biology | Carlyle House, Carlyle Road, Cambridge CB4 3DN | United Kingdom <email xlink:type="simple">ploscompbiol@plos.org</email> | Phone +44 (0) 1223-442824 | <ext-link ext-link-type="uri" xlink:href="http://ploscompbiol.org" xlink:type="simple">ploscompbiol.org</ext-link> | @PLOSCompBiol</p>
</body>
</sub-article>
</article>