<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1d3 20150301//EN" "http://jats.nlm.nih.gov/publishing/1.1d3/JATS-journalpublishing1.dtd">
<article article-type="research-article" dtd-version="1.1d3" xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="nlm-ta">PLoS Genet</journal-id>
<journal-id journal-id-type="publisher-id">plos</journal-id>
<journal-id journal-id-type="pmc">plosgen</journal-id>
<journal-title-group>
<journal-title>PLOS Genetics</journal-title>
</journal-title-group>
<issn pub-type="ppub">1553-7390</issn>
<issn pub-type="epub">1553-7404</issn>
<publisher>
<publisher-name>Public Library of Science</publisher-name>
<publisher-loc>San Francisco, CA USA</publisher-loc>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">PGENETICS-D-19-00848</article-id>
<article-id pub-id-type="doi">10.1371/journal.pgen.1008619</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Research Article</subject>
</subj-group>
<subj-group subj-group-type="Discipline-v3">
<subject>Research and analysis methods</subject><subj-group><subject>Simulation and modeling</subject></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>DNA</subject><subj-group><subject>DNA recombination</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Biochemistry</subject><subj-group><subject>Nucleic acids</subject><subj-group><subject>DNA</subject><subj-group><subject>DNA recombination</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Population biology</subject><subj-group><subject>Population metrics</subject><subj-group><subject>Population size</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Evolutionary biology</subject><subj-group><subject>Population genetics</subject><subj-group><subject>Genetic polymorphism</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Population genetics</subject><subj-group><subject>Genetic polymorphism</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Population biology</subject><subj-group><subject>Population genetics</subject><subj-group><subject>Genetic polymorphism</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Evolutionary biology</subject><subj-group><subject>Population genetics</subject></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Population genetics</subject></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Population biology</subject><subj-group><subject>Population genetics</subject></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Evolutionary biology</subject><subj-group><subject>Population genetics</subject><subj-group><subject>Effective population size</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Population genetics</subject><subj-group><subject>Effective population size</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Population biology</subject><subj-group><subject>Population genetics</subject><subj-group><subject>Effective population size</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Population biology</subject><subj-group><subject>Population metrics</subject><subj-group><subject>Effective population size</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Population biology</subject><subj-group><subject>Population metrics</subject><subj-group><subject>Population size</subject><subj-group><subject>Effective population size</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Computational biology</subject><subj-group><subject>Genome evolution</subject></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Genomics</subject><subj-group><subject>Genome evolution</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Evolutionary biology</subject><subj-group><subject>Molecular evolution</subject><subj-group><subject>Genome evolution</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Heredity</subject><subj-group><subject>Linkage disequilibrium</subject></subj-group></subj-group></subj-group></subj-group></article-categories>
<title-group>
<article-title>Accounting for long-range correlations in genome-wide simulations of large cohorts</article-title>
<alt-title alt-title-type="running-head">Accounting for long-range correlations in genome-wide simulations</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">http://orcid.org/0000-0003-2496-8588</contrib-id>
<name name-style="western">
<surname>Nelson</surname> <given-names>Dominic</given-names></name>
<role content-type="http://credit.casrai.org/">Conceptualization</role>
<role content-type="http://credit.casrai.org/">Formal analysis</role>
<role content-type="http://credit.casrai.org/">Investigation</role>
<role content-type="http://credit.casrai.org/">Methodology</role>
<role content-type="http://credit.casrai.org/">Software</role>
<role content-type="http://credit.casrai.org/">Validation</role>
<role content-type="http://credit.casrai.org/">Writing – original draft</role>
<role content-type="http://credit.casrai.org/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"><sup>1</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">http://orcid.org/0000-0002-7894-5253</contrib-id>
<name name-style="western">
<surname>Kelleher</surname> <given-names>Jerome</given-names></name>
<role content-type="http://credit.casrai.org/">Conceptualization</role>
<role content-type="http://credit.casrai.org/">Formal analysis</role>
<role content-type="http://credit.casrai.org/">Investigation</role>
<role content-type="http://credit.casrai.org/">Methodology</role>
<role content-type="http://credit.casrai.org/">Software</role>
<role content-type="http://credit.casrai.org/">Validation</role>
<role content-type="http://credit.casrai.org/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff002"><sup>2</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">http://orcid.org/0000-0003-0715-3432</contrib-id>
<name name-style="western">
<surname>Ragsdale</surname> <given-names>Aaron P.</given-names></name>
<role content-type="http://credit.casrai.org/">Formal analysis</role>
<role content-type="http://credit.casrai.org/">Methodology</role>
<role content-type="http://credit.casrai.org/">Writing – original draft</role>
<role content-type="http://credit.casrai.org/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"><sup>1</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">http://orcid.org/0000-0002-1480-3045</contrib-id>
<name name-style="western">
<surname>Moreau</surname> <given-names>Claudia</given-names></name>
<role content-type="http://credit.casrai.org/">Conceptualization</role>
<role content-type="http://credit.casrai.org/">Data curation</role>
<role content-type="http://credit.casrai.org/">Formal analysis</role>
<role content-type="http://credit.casrai.org/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff003"><sup>3</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">http://orcid.org/0000-0002-5012-4162</contrib-id>
<name name-style="western">
<surname>McVean</surname> <given-names>Gil</given-names></name>
<role content-type="http://credit.casrai.org/">Conceptualization</role>
<role content-type="http://credit.casrai.org/">Investigation</role>
<role content-type="http://credit.casrai.org/">Methodology</role>
<role content-type="http://credit.casrai.org/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff002"><sup>2</sup></xref>
</contrib>
<contrib contrib-type="author" corresp="yes" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">http://orcid.org/0000-0002-9183-964X</contrib-id>
<name name-style="western">
<surname>Gravel</surname> <given-names>Simon</given-names></name>
<role content-type="http://credit.casrai.org/">Conceptualization</role>
<role content-type="http://credit.casrai.org/">Formal analysis</role>
<role content-type="http://credit.casrai.org/">Funding acquisition</role>
<role content-type="http://credit.casrai.org/">Investigation</role>
<role content-type="http://credit.casrai.org/">Methodology</role>
<role content-type="http://credit.casrai.org/">Writing – original draft</role>
<role content-type="http://credit.casrai.org/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"><sup>1</sup></xref>
<xref ref-type="corresp" rid="cor001">*</xref>
</contrib>
</contrib-group>
<aff id="aff001">
<label>1</label>
<addr-line>McGill University and Genome Québec Innovation Centre, McGill University, Montréal, Québec, Canada</addr-line>
</aff>
<aff id="aff002">
<label>2</label>
<addr-line>Big Data Institute, Li Ka Shing Centre for Health Information and Discovery, University of Oxford, Oxford, United Kingdom</addr-line>
</aff>
<aff id="aff003">
<label>3</label>
<addr-line>Centre Intersectoriel en Santé Durable, Université du Québec à Chicoutimi, Saguenay, Québec, Canada</addr-line>
</aff>
<contrib-group>
<contrib contrib-type="editor" xlink:type="simple">
<name name-style="western">
<surname>Williams</surname> <given-names>Amy L.</given-names></name>
<role>Editor</role>
<xref ref-type="aff" rid="edit1"/>
</contrib>
</contrib-group>
<aff id="edit1">
<addr-line>Cornell University, UNITED STATES</addr-line>
</aff>
<author-notes>
<fn fn-type="conflict" id="coi001">
<p>The authors have declared that no competing interests exist.</p>
</fn>
<corresp id="cor001">* E-mail: <email xlink:type="simple">simon.gravel@mcgill.ca</email></corresp>
</author-notes>
<pub-date pub-type="collection">
<month>5</month>
<year>2020</year>
</pub-date>
<pub-date pub-type="epub">
<day>5</day>
<month>5</month>
<year>2020</year>
</pub-date>
<volume>16</volume>
<issue>5</issue>
<elocation-id>e1008619</elocation-id>
<history>
<date date-type="received">
<day>23</day>
<month>5</month>
<year>2019</year>
</date>
<date date-type="accepted">
<day>21</day>
<month>1</month>
<year>2020</year>
</date>
</history>
<permissions>
<copyright-year>2020</copyright-year>
<copyright-holder>Nelson et al</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">
<license-p>This is an open access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">Creative Commons Attribution License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.</license-p>
</license>
</permissions>
<self-uri content-type="pdf" xlink:href="info:doi/10.1371/journal.pgen.1008619"/>
<abstract>
<p>Coalescent simulations are widely used to examine the effects of evolution and demographic history on the genetic makeup of populations. Thanks to recent progress in algorithms and data structures, simulators such as the widely-used <monospace>msprime</monospace> now provide genome-wide simulations for millions of individuals. However, this software relies on classic coalescent theory and its assumptions that sample sizes are small and that the region being simulated is short. Here we show that coalescent simulations of long regions of the genome exhibit large biases in identity-by-descent (IBD), long-range linkage disequilibrium (LD), and ancestry patterns, particularly when the sample size is large. We present a Wright-Fisher extension to <monospace>msprime</monospace>, and show that it produces more realistic distributions of IBD, LD, and ancestry proportions, while also addressing more subtle biases of the coalescent. Further, these extensions are more computationally efficient than state-of-the-art coalescent simulations when simulating long regions, including whole-genome data. For shorter regions, efficiency can be maintained via a hybrid model which simulates the recent past under the Wright-Fisher model and uses coalescent simulations in the distant past.</p>
</abstract>
<abstract abstract-type="summary">
<title>Author summary</title>
<p>Coalescent theory has provided deep theoretical insight into patterns of human diversity. Implementations of coalescent models in simulation software such as <monospace>ms</monospace> have further provided tools to interpret thousands of genomic studies. Recent technical progress has allowed for a dramatic increase in the scale at which genomes can be both measured and simulated, opening up opportunities for a finer understanding of evolutionary biology. However, we show that coalescent simulations of long regions of the genome exhibit large biases in sample relatedness, distorting haplotype sharing and ancestry patterns in simulated cohorts. We trace these biases to basic assumptions of the coalescent model, and show how the assumptions can be relaxed to provide a better description of the observed patterns of genetic polymorphism at a fraction of the computational cost.</p>
</abstract>
<funding-group>
<award-group id="award001">
<funding-source>
<institution-wrap>
<institution-id institution-id-type="funder-id">http://dx.doi.org/10.13039/501100000024</institution-id>
<institution>Canadian Institutes of Health Research</institution>
</institution-wrap>
</funding-source>
<award-id>MOP-136855</award-id>
<principal-award-recipient>
<contrib-id authenticated="true" contrib-id-type="orcid">http://orcid.org/0000-0002-9183-964X</contrib-id>
<name name-style="western">
<surname>Gravel</surname> <given-names>Simon</given-names></name>
</principal-award-recipient>
</award-group>
<award-group id="award002">
<funding-source>
<institution-wrap>
<institution-id institution-id-type="funder-id">http://dx.doi.org/10.13039/100010269</institution-id>
<institution>Wellcome Trust</institution>
</institution-wrap>
</funding-source>
<award-id>100956/Z/13/Z</award-id>
<principal-award-recipient>
<contrib-id authenticated="true" contrib-id-type="orcid">http://orcid.org/0000-0002-5012-4162</contrib-id>
<name name-style="western">
<surname>McVean</surname> <given-names>Gil</given-names></name>
</principal-award-recipient>
</award-group>
<funding-statement>This research was undertaken, in part, thanks to funding from the Canada Research Chairs program (<ext-link ext-link-type="uri" xlink:href="http://www.chairs-chaires.gc.ca/" xlink:type="simple">http://www.chairs-chaires.gc.ca/</ext-link>) (SG), NSERC discovery grant (<ext-link ext-link-type="uri" xlink:href="http://www.nserc-crsng.gc.ca/" xlink:type="simple">http://www.nserc-crsng.gc.ca/</ext-link>) (SG), CIHR Discovery grant MOP-136855 (<ext-link ext-link-type="uri" xlink:href="http://cihr-irsc.gc.ca/" xlink:type="simple">http://cihr-irsc.gc.ca/</ext-link>) (SG), the Robertson Foundation (JK), the Li Ka Shing Foundation (GM), and Wellcome Trust grant 100956/Z/13/Z (<ext-link ext-link-type="uri" xlink:href="https://wellcome.ac.uk/" xlink:type="simple">https://wellcome.ac.uk/</ext-link>) (GM). The funders had no role in study design, data collection and analysis, decision to publish, or preparation of the manuscript.</funding-statement>
</funding-group>
<counts>
<fig-count count="5"/>
<table-count count="0"/>
<page-count count="12"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>PLOS Publication Stage</meta-name>
<meta-value>vor-update-to-uncorrected-proof</meta-value>
</custom-meta>
<custom-meta>
<meta-name>Publication Update</meta-name>
<meta-value>2020-06-02</meta-value>
</custom-meta>
<custom-meta id="data-availability">
<meta-name>Data Availability</meta-name>
<meta-value>Software available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/tskit-dev/msprime" xlink:type="simple">https://github.com/tskit-dev/msprime</ext-link>.</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="sec001" sec-type="intro">
<title>Introduction</title>
<p>Simulations of genome evolution are widely used in the development of computational tools for statistical and population genetics research (e.g., [<xref ref-type="bibr" rid="pgen.1008619.ref001">1</xref>, <xref ref-type="bibr" rid="pgen.1008619.ref002">2</xref>, <xref ref-type="bibr" rid="pgen.1008619.ref003">3</xref>, <xref ref-type="bibr" rid="pgen.1008619.ref004">4</xref>, <xref ref-type="bibr" rid="pgen.1008619.ref005">5</xref>, <xref ref-type="bibr" rid="pgen.1008619.ref006">6</xref>]). Coalescent theory has been used extensively for this purpose, with Hudson’s <monospace>ms</monospace> simulation program [<xref ref-type="bibr" rid="pgen.1008619.ref007">7</xref>] having been cited over two thousand times since its publication in 2002. The more recent <monospace>msprime</monospace> coalescent simulation software [<xref ref-type="bibr" rid="pgen.1008619.ref008">8</xref>] implements Hudson’s original algorithm [<xref ref-type="bibr" rid="pgen.1008619.ref009">9</xref>], but with a performance increase of several orders of magnitude. This is achieved largely through the introduction of a new data structure, the succinct tree sequence [<xref ref-type="bibr" rid="pgen.1008619.ref010">10</xref>, <xref ref-type="bibr" rid="pgen.1008619.ref011">11</xref>], which is extremely efficient at storing genetic variation. For example, simulating a 100 megabase region in a sample of 100,000 individuals generates an 88MB uncompressed succinct tree sequence, whereas the Newick tree format used by <monospace>ms</monospace> takes approximately 3.5TB of space [<xref ref-type="bibr" rid="pgen.1008619.ref008">8</xref>].</p>
<p>Simulated data are useful to the extent that they accurately reflect real genetic variation. However, the coalescent is known to be biased relative to the Wright-Fisher model when the sample size is large [<xref ref-type="bibr" rid="pgen.1008619.ref012">12</xref>] or for events in the recent past [<xref ref-type="bibr" rid="pgen.1008619.ref013">13</xref>]. Until recently, these biases had limited practical impact because collecting such large empirical datasets was prohibitively costly and the simulation of such large samples was computationally overwhelming. Both limitations have now been lifted: sequencing datasets now regularly include thousands of sequenced genomes, and <monospace>msprime</monospace> can simulate hundreds of thousands of genomes on a laptop computer. The assumptions of the underlying coalescent models should be carefully reexamined in this context.</p>
<p>We highlight qualitative and quantitative inaccuracies in coalescent simulations of long regions, due to violated assumptions of the underlying genealogical model. We implement an extension to <monospace>msprime</monospace> which corrects the majority of these biases via a backwards-in-time Wright-Fisher model within <monospace>msprime</monospace> (see overview in Methods section and <xref ref-type="supplementary-material" rid="pgen.1008619.s001">S1 Appendix</xref>), which generates biologically plausible genealogies regardless of sample size (a separate implementation of such a model, without using succinct tree sequences, can also be found in [<xref ref-type="bibr" rid="pgen.1008619.ref014">14</xref>]). Our backwards-in-time Wright-Fisher simulations are also much faster than coalescent simulations for large samples and long regions. For shorter regions, the coalescent is slightly faster. Using a hybrid approach with Wright-Fisher dynamics in the recent past and coalescent dynamics further back in time (as was done in [<xref ref-type="bibr" rid="pgen.1008619.ref013">13</xref>]) preserves the computational advantages of the coalescent with the long-range accuracy of the Wright-Fisher model for shorter genomic regions.</p>
<sec id="sec002">
<title>Motivation</title>
<p>This work was motivated by our observation that large-scale coalescent simulations resulted in unrealistic relatedness among samples, where nearly every pair of simulated individuals were second- or third-degree cousins according to the time to their most recent common ancestor. This is because individuals had too many simulated ancestors: whereas diploid individuals carry at most 2<sup><italic>t</italic></sup> ancestors at generation <italic>t</italic> in the past, coalescent simulations allow for many more ancestors.</p>
<p>This excess of ancestors is a side effect of how Hudson’s coalescent algorithm models recombination. Hudson’s coalescent model assumes a small region being simulated [<xref ref-type="bibr" rid="pgen.1008619.ref015">15</xref>], and so does not account for multiple simultaneous recombinations during meiosis. The per-generation recombination rate in long genomic regions is maintained by multiple recombinations occurring at different times, with each recombination introducing a new ancestral lineage. This can lead to more than two ancestors within one generation (<xref ref-type="fig" rid="pgen.1008619.g001">Fig 1</xref>).</p>
<fig id="pgen.1008619.g001" position="float">
<object-id pub-id-type="doi">10.1371/journal.pgen.1008619.g001</object-id>
<label>Fig 1</label>
<caption>
<title>Comparing coalescent and Wright-Fisher lineages one generation in the past.</title>
<p>A schematic of simulated lineages for a haploid sample with a single long chromosome. In the coalescent, each recombination event creates a new, independent lineage, leading to an unrealistic number of simulated parents. The Wright-Fisher model allows for back-and-forth recombination, so recombination events alternately assign genetic material between only two parental lineages. Multiple chromosomes exaggerate the difference, segregating as expected in the Wright-Fisher model but adding extra lineages under the coalescent.</p>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pgen.1008619.g001" xlink:type="simple"/>
</fig>
<p>This property of the coalescent recombination model is often innocuous when regions simulated are too short for back-and-forth recombinations to occur, or when the number of lineages is small enough that long range correlations are practically negligible [<xref ref-type="bibr" rid="pgen.1008619.ref013">13</xref>, <xref ref-type="bibr" rid="pgen.1008619.ref016">16</xref>]. In larger samples, or under migration models, recent events induce long-range correlations along the genome [<xref ref-type="bibr" rid="pgen.1008619.ref012">12</xref>, <xref ref-type="bibr" rid="pgen.1008619.ref017">17</xref>, <xref ref-type="bibr" rid="pgen.1008619.ref018">18</xref>, <xref ref-type="bibr" rid="pgen.1008619.ref019">19</xref>]. For example, individuals with a recent migrant ancestor are likely to have migrant ancestry in several chromosomes, and this is not accounted for by Hudson’s coalescent. Significant differences have further been observed between the simulated genealogies of coalescent and Wright-Fisher models at a single locus [<xref ref-type="bibr" rid="pgen.1008619.ref013">13</xref>, <xref ref-type="bibr" rid="pgen.1008619.ref014">14</xref>], such as the more rapid decay in the number of lineages over time in the Wright-Fisher model when sample size is large. Model differences become even more pronounced over long regions, where correlations between distant gene genealogies must be taken into account.</p>
<p>To highlight the magnitude of the genealogical distortions which can occur, we first use both the coalescent and Wright-Fisher models to simulate haploid sample sizes from 500 to 10,000 in a diploid population with size 10,000 and growth rate 0.001. Each sample contains 22 chromosomes of realistic lengths. <xref ref-type="fig" rid="pgen.1008619.g002">Fig 2</xref> shows that for 10,000 samples the number of lineages in the coalescent simulation increases very rapidly to reach 10 times the haploid population size 2<italic>N</italic> (this issue was also raised in [<xref ref-type="bibr" rid="pgen.1008619.ref020">20</xref>, <xref ref-type="bibr" rid="pgen.1008619.ref021">21</xref>]). Simulations with smaller sample sizes also show a rapid growth in number of lineages to beyond the haploid population size, but the growth is slower and the excess is less pronounced than in larger samples. In the Wright-Fisher simulation, the initial growth in number of lineages is much slower and can never exceed the haploid population size, regardless of sample size.</p>
<fig id="pgen.1008619.g002" position="float">
<object-id pub-id-type="doi">10.1371/journal.pgen.1008619.g002</object-id>
<label>Fig 2</label>
<caption>
<title>Number of surviving lineages over time in coalescent and backwards-in-time Wright-Fisher dynamics.</title>
<p>We simulated a varying number of haploid whole genomes with 22 chromosomes of realistic lengths in a population of 10,000 diploid individuals. Dotted line shows effective population size. The implementation for simulations with multiple chromosomes is described in <xref ref-type="supplementary-material" rid="pgen.1008619.s001">S1 Appendix</xref>.</p>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pgen.1008619.g002" xlink:type="simple"/>
</fig>
<p>While genealogical distortions are most clear in the first few generations, this explosion of lineages also affects genealogies in the more distant past. <xref ref-type="fig" rid="pgen.1008619.g002">Fig 2</xref> also shows that, despite rapid coalescence lowering the initial spike in the number of lineages, their number remains above the population size for hundreds of generations into the past. The effect is even more dramatic within a constant-sized population, with <xref ref-type="supplementary-material" rid="pgen.1008619.s007">S2 Fig</xref> showing a case where the number of lineages remains above the effective population size for more than 100,000 generations in the past.</p>
<p>The number of lineages cannot be observed directly from genetic data, but these genealogical distortions have consequences for commonly used measures of genetic diversity.</p>
</sec>
</sec>
<sec id="sec003" sec-type="results">
<title>Results</title>
<p>In this section, we first highlight qualitative differences in multi-locus statistics between the coalescent and backwards Wright-Fisher models, and we then show that the Wright-Fisher models provide a better description of the data while increasing tractability.</p>
<sec id="sec004">
<title>Distribution of IBD</title>
<p>Under the Wright-Fisher model, diploid inheritance constrains the possible gene genealogies [<xref ref-type="bibr" rid="pgen.1008619.ref012">12</xref>] and introduces correlations in IBD sharing along long simulated regions: two samples with a recent common ancestor may be IBD at several distant positions of their genome (for example on different chromosomes). In the coalescent, gene genealogies of unlinked loci are constructed independently, and do not capture this effect [<xref ref-type="bibr" rid="pgen.1008619.ref012">12</xref>].</p>
<p>Modelling relatedness patterns is important in large cohorts, where cryptic relatives are common [<xref ref-type="bibr" rid="pgen.1008619.ref022">22</xref>, <xref ref-type="bibr" rid="pgen.1008619.ref023">23</xref>]. To illustrate the significance of explicitly modelling diploid inheritance in a sample with close relatives, we compared simulated cohorts to genotype data from participants of the Genizon Biobank containing 8,435 individuals from the province of Quebec, Canada [<xref ref-type="bibr" rid="pgen.1008619.ref024">24</xref>]. A description of this biobank and IBD detection methods is given in <xref ref-type="supplementary-material" rid="pgen.1008619.s004">S4 Appendix</xref>. Pairwise IBD patterns observed in this cohort are shown in <xref ref-type="fig" rid="pgen.1008619.g003">Fig 3</xref>.</p>
<fig id="pgen.1008619.g003" position="float">
<object-id pub-id-type="doi">10.1371/journal.pgen.1008619.g003</object-id>
<label>Fig 3</label>
<caption>
<title>Number of IBD segments between pairs of individuals versus total length of shared IBD segments.</title>
<p>22 chromosomes of realistic lengths, simulated under Wright-Fisher model (middle) and coalescent (bottom), compared to data from 8,435 individuals from the Genizon Biobank (top), as well as the analytical expectation under Eqs (1), (2), (3), and (4) in <xref ref-type="supplementary-material" rid="pgen.1008619.s003">S3 Appendix</xref> (white circles). Siblings were filtered from the Genizon cohort, as explained in <xref ref-type="supplementary-material" rid="pgen.1008619.s004">S4 Appendix</xref>. Simulations contained 5,000 haploid samples with a diploid population size of 10,000. The isolated cluster in the Wright-Fisher simulations reflects the discrete nature of possible genealogical relationships (siblings, cousins, etc.) in the Wright-Fisher model.</p>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pgen.1008619.g003" xlink:type="simple"/>
</fig>
<p>We simulated 5,000 human haploid whole genomes (chromosome lengths and recombination rates are described in <xref ref-type="supplementary-material" rid="pgen.1008619.s001">S1 Appendix</xref>) in a diploid population of constant size 10,000 under the coalescent and Wright-Fisher models, and used the simulated genealogies to extract IBD segments inherited from common ancestors up to 5 generations in the past. Closer relatedness means more IBD segments and longer average length, leading to a relationship between number of segments and total length of IBD which is typically used in identifying relative status [<xref ref-type="bibr" rid="pgen.1008619.ref022">22</xref>]. Since the detection of very short IBD segments is challenging in practice, we counted only simulated IBD segments greater than 5 centimorgans, in both simulations and the data.</p>
<p>
<xref ref-type="fig" rid="pgen.1008619.g003">Fig 3</xref> shows the difference between the two models, with the Wright-Fisher model showing excellent qualitative agreement with the Genizon data. Quantitative differences are expected since simulations were performed in a non-monogamous randomly-mating population. By contrast, the coalescent model exhibits far too few IBD segments for closely related individuals and poor clustering by TMRCA. An analytical model for the expected number and length of shared ancestry segments (shown as white dots in <xref ref-type="fig" rid="pgen.1008619.g003">Fig 3</xref>) is provided in <xref ref-type="supplementary-material" rid="pgen.1008619.s003">S3 Appendix</xref>. The separated cluster predicted by the Wright-Fisher model represents simulated half-siblings: neither full- nor half-siblings are present in the Genizon data. Other relationships also form clusters that overlap due to variance in amounts of genetic material shared IBD. Residual differences between Wright-Fisher simulations and theoretical predictions in <xref ref-type="fig" rid="pgen.1008619.g003">Fig 3</xref> have to do with the requirement that IBD segments be at least 5cM to be detected. Better agreement could be achieved by using a cutoff of 1cM in simulations (see <xref ref-type="supplementary-material" rid="pgen.1008619.s008">S3 Fig</xref>).</p>
<p>The distribution of long IBD segments between related individuals is primarily determined by their degree of recent relatedness. For example, even though the population history and sampling process affects the <italic>number</italic> of sampled first cousins, the recent IBD relatedness <italic>among</italic> first cousins in large outbred populations is relatively independent of history and sampling: This is why the simulated and empirical distributions observed in <xref ref-type="fig" rid="pgen.1008619.g003">Fig 3</xref> are in good agreement despite differences in population sizes, and why the theoretical predictions that describe both are independent of the population demography. Because the number of close relatives changes with sampling and population size, the discrepancy between coalescent and Wright-Fisher models is more acute for large sample sizes (see <xref ref-type="supplementary-material" rid="pgen.1008619.s008">S3 Fig</xref> and <xref ref-type="supplementary-material" rid="pgen.1008619.s009">S4 Fig</xref> for simulations under different models). Yet <xref ref-type="supplementary-material" rid="pgen.1008619.s008">S3 Fig</xref> shows clear differences between Wright-Fisher and coalescent models with <italic>N</italic><sub><italic>e</italic></sub> = 10,000 and 500 samples. More generally, Shchur et al. (2018) [<xref ref-type="bibr" rid="pgen.1008619.ref023">23</xref>] calculated the expected number of <italic>p</italic>-th cousins in a sample of size <italic>K</italic> taken from a population of effective size <italic>N</italic>. In a monogamous Wright-Fisher population, when <italic>K</italic>/<italic>N</italic> = 0.2, we expect approximately 55% of samples to have a first cousin, and 95% to have a second cousin within the cohort.</p>
<p>The long-range correlations induced by genealogical relatedness can also be measured as linkage disequilibrium between distant loci. This LD is used to estimate sizes of small populations in conservation genetics [<xref ref-type="bibr" rid="pgen.1008619.ref025">25</xref>, <xref ref-type="bibr" rid="pgen.1008619.ref026">26</xref>]. Hudson’s coalescent does not capture such LD patterns [<xref ref-type="bibr" rid="pgen.1008619.ref017">17</xref>], whereas the Wright-Fisher extension to <monospace>msprime</monospace> predicts the patterns of LD expected under diploid mating (see <xref ref-type="supplementary-material" rid="pgen.1008619.s002">S2 Appendix</xref>).</p>
</sec>
<sec id="sec005">
<title>Ancestry variance following admixture</title>
<p>In admixed populations, simulations should capture patterns of ancestry variation among present-day samples. The distribution of ancestry within recently admixed populations can be strongly dependent on pedigree structure [<xref ref-type="bibr" rid="pgen.1008619.ref018">18</xref>], making coalescent simulations of these scenarios problematic.</p>
<p>We consider the variance of ancestry proportions following a single pulse of migration. Ancestry variance can be divided into genealogical variance and recombination variance [<xref ref-type="bibr" rid="pgen.1008619.ref027">27</xref>]. In the first few generations after admixture, variance is driven by genealogical differences in the number of migrant ancestors of each individual. As time goes on, each present-day individual has more ancestors from the admixed generation, exponentially reducing this source of variance. After roughly 10 generations, variation in the amount of genetic material received from each migrant ancestor becomes a stronger source of variance [<xref ref-type="bibr" rid="pgen.1008619.ref027">27</xref>].</p>
<p>We performed whole-genome simulations to evaluate how well the Wright-Fisher and coalescent models capture variance in ancestry. <xref ref-type="fig" rid="pgen.1008619.g004">Fig 4</xref> shows ancestry variance from simulations of 80 haploid samples in a diploid population of size 80, and a single event of 30% admixture at varying times in the past. These parameters were chosen to match those in [<xref ref-type="bibr" rid="pgen.1008619.ref027">27</xref>], but here again the qualitative patterns depend weakly on the sample size and older demographic history. The approximate expected values are derived from an argument similar to the one presented for IBD sharing in <xref ref-type="supplementary-material" rid="pgen.1008619.s003">S3 Appendix</xref> and outlined in [<xref ref-type="bibr" rid="pgen.1008619.ref027">27</xref>].</p>
<fig id="pgen.1008619.g004" position="float">
<object-id pub-id-type="doi">10.1371/journal.pgen.1008619.g004</object-id>
<label>Fig 4</label>
<caption>
<title>Variance in ancestry after a single admixture event, as a function of time since admixture.</title>
<p>Calculated from 80 haploid samples in a diploid population of size 80, with 30% admixture proportions. Error bars show 95% confidence intervals over 50 simulations.</p>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pgen.1008619.g004" xlink:type="simple"/>
</fig>
<p>The Wright-Fisher model captures both short- and long-term variance in ancestry, as expected. In the coalescent simulations the initial phase of genealogical variance is not present, leading to a 20-fold underestimate of the variance in ancestry. Lacking a diploid population pedigree, whole-genome coalescent simulations of recently admixed populations do not reflect the distribution of ancestry expected in a large cohort, even under an idealized random-mating scenario.</p>
</sec>
<sec id="sec006">
<title>Other genealogical effects</title>
<p>Bhaskar et al. [<xref ref-type="bibr" rid="pgen.1008619.ref013">13</xref>] showed that simultaneous coalescences in the Wright-Fisher model lead to more singletons and fewer doubletons than in the coalescent, which was verified in [<xref ref-type="bibr" rid="pgen.1008619.ref014">14</xref>]. <xref ref-type="supplementary-material" rid="pgen.1008619.s006">S1 Fig</xref> and <xref ref-type="supplementary-material" rid="pgen.1008619.s005">S1 Table</xref> replicate these single-locus results. King et al. [<xref ref-type="bibr" rid="pgen.1008619.ref017">17</xref>] pointed out correlation patterns among unlinked loci induced by genealogical relatedness—these results correspond to the infinite-recombination distance in <xref ref-type="supplementary-material" rid="pgen.1008619.s002">S2 Appendix</xref>.</p>
</sec>
<sec id="sec007">
<title>Performance</title>
<p>The main advantage of <monospace>msprime</monospace> over alternate simulators is speed and scalability. This is achieved by efficient algorithms and, especially, new data structures for storing and manipulating ancestral states throughout a simulation. We therefore need to ensure that the present modification preserves these advantages.</p>
<p>Hudson’s coalescent algorithm avoids simulating recombination and coalescent events that do not affect genetic variation in the present sample. Whereas our Wright-Fisher implementation must iterate over all discrete generations, Hudson’s coalescent can traverse long stretches of time in a single step if there are no such events. The Hudson model is therefore more efficient than the Wright-Fisher model when the number of lineages is small, as can happen in small samples and short genomic regions, or in the distant past. However, <xref ref-type="fig" rid="pgen.1008619.g002">Fig 2</xref> shows that the number of lineages in whole-genome coalescent simulations is so high that the time between events is on average much less than a single generation. Furthermore, these lineages come at an additional memory and computational cost for the coalescent model. This naturally suggests using a hybrid approach with Wright-Fisher dynamics in the recent past and coalescent dynamics in the more distant past, following the approach of Bhaskar et al. [<xref ref-type="bibr" rid="pgen.1008619.ref013">13</xref>].</p>
<p>Our Wright-Fisher extension is integrated with <monospace>msprime</monospace>’s core simulation framework, and can easily be combined with coalescent simulations as part of a hybrid model. Since the optimal switching time depends on the number of extant lineages and total length of uncoalesced ancestral material, it will vary between different demographic models.</p>
<p>
<xref ref-type="fig" rid="pgen.1008619.g005">Fig 5</xref> shows computation times for Wright-Fisher, Hudson coalescent, and hybrid simulations of 1,000 haploid samples within a population of constant size 10,000. The pure Wright-Fisher simulations are fastest at whole-genome scale, whereas pure coalescent simulations and hybrid approaches are slightly faster for shorter regions. There is a small performance cost to switching models, which explains the slightly longer runtime for the hybrid model with 100 Wright-Fisher generations versus pure coalescent simulations. The hybrid model with 1,000 Wright-Fisher generations compares favourably in terms of performance and accuracy to the coalescent for a wide range of simulated lengths.</p>
<fig id="pgen.1008619.g005" position="float">
<object-id pub-id-type="doi">10.1371/journal.pgen.1008619.g005</object-id>
<label>Fig 5</label>
<caption>
<title>Computation time of Hudson coalescent, Wright-Fisher, and hybrid models.</title>
<p>Hybrid models used 100 and 1000 Wright-Fisher generations before switching to the coalescent. Simulations contain from 1 to 22 chromosomes of realistic lengths (using the method described in <xref ref-type="supplementary-material" rid="pgen.1008619.s001">S1 Appendix</xref>) in 1,000 haploid samples drawn from a diploid population of constant size 10,000. Results for other population sizes are shown in <xref ref-type="supplementary-material" rid="pgen.1008619.s010">S5 Fig</xref>.</p>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pgen.1008619.g005" xlink:type="simple"/>
</fig>
</sec>
</sec>
<sec id="sec008" sec-type="materials|methods">
<title>Methods</title>
<sec id="sec009">
<title>Implementation</title>
<p>To understand the modifications needed to turn <monospace>msprime</monospace> into a back-in-time Wright-Fisher simulator, we first outline Hudson’s original algorithm to simulate samples under the coalescent model. This brief overview is intended to give context to the modifications which enable Wright-Fisher simulations to be performed in the same framework. More details of how Hudson’s algorithm is implemented in <monospace>msprime</monospace> are given in [<xref ref-type="bibr" rid="pgen.1008619.ref008">8</xref>].</p>
<p>First, a number of randomly-mating populations are specified, including effective sizes and migration rates over time. Samples are introduced as haploid lineages within the populations, and the region of the genome being simulated is specified. The algorithm then constructs the genealogy of each locus within this region by tracing its lineages backwards in time and tracking genomic segments that are ancestral to the sample.</p>
<p>To begin, each lineage contains a single ancestral segment spanning the whole simulated genomic region of a sample. As time proceeds backwards, lineages can be split by recombination events (leaving the amount of ancestral material unchanged), or participate in common ancestor events, where any overlapping regions coalesce (reducing the amount of ancestral material). The rate of recombination events depends on the sum of the genetic map distance spanned by ancestral segments carried by all extant lineages, and common ancestor events occur at a rate determined by the number of uncoalesced lineages and the effective population size. Migration events move haploid lineages between randomly-mating populations, and demographic events modify the number of populations or their size and growth rate parameters. Recombination and common ancestor events are generated at rates depending on the amount of extant ancestral material, and the simulation terminates when every position on the genome has a most recent common ancestor.</p>
<p>Implementing a back-in-time Wright-Fisher model requires two important changes to Hudson’s algorithm. First, rather than drawing a time to the next event from an exponential distribution, we iterate through discrete generations and draw the events which occur at each time. Second, we modify the way recombination events are carried out, to account for the possibility of multiple recombinations in a single transmission: we model the number and spatial distribution of breakpoints as a Poisson process, with rate equal to the per-generation recombination rate (i.e., the distance in Morgans). This model ensures that each gamete has a unique diploid parent. An overview of this model is illustrated in <xref ref-type="fig" rid="pgen.1008619.g001">Fig 1</xref> and the detailed order of events occurring at each generation is given in <xref ref-type="supplementary-material" rid="pgen.1008619.s001">S1 Appendix</xref>.</p>
</sec>
<sec id="sec010">
<title>Ethics statement</title>
<p>Access to the Genizon cohort genotyping data was granted under study number A07-M42-15B of the McGill University IRB. Third party data were analysed anonymously so consent was not obtained.</p>
</sec>
</sec>
<sec id="sec011" sec-type="conclusions">
<title>Discussion</title>
<p>While the Wright-Fisher model may generate a more realistic pedigree than the coalescent model in the recent past, it was recognized early on as an idealized model [<xref ref-type="bibr" rid="pgen.1008619.ref028">28</xref>, <xref ref-type="bibr" rid="pgen.1008619.ref029">29</xref>]. Our implementation does not track monogamous couples, for example, and therefore will vastly overestimate the prevalence of half-sibs and underestimate full sibs compared to a realistic human cohort. Assortative mating and inbreeding are not accounted for, and the migration model, while biologically plausible, is a simplification of the real migration process (see implementation details in <xref ref-type="supplementary-material" rid="pgen.1008619.s001">S1 Appendix</xref>). Care should be taken in applications which are particularly sensitive to fine-scale mating or migration patterns.</p>
<p>Many of these issues can be addressed by allowing simulations to take place within a pre-specified pedigree, which is a natural extension to our backwards-in-time Wright-Fisher implementation. Rather than drawing genealogical links at random according to demographic parameters, lineages can simply follow a known pedigree. When reaching a pedigree founder, simulations can then continue by reverting to either the Wright-Fisher or the coalescent models. Real pedigrees of any size could then be used, from extended families up to population-scale [<xref ref-type="bibr" rid="pgen.1008619.ref030">30</xref>], or they could be generated with the desired patterns of monogamy or assortative mating in a separate step. While conceptually straightforward, maintaining efficiency while simulating within population-scale pedigrees is non-trivial. We leave such an implementation for future work.</p>
<p>Improving recombination models is also a natural extension of the present approach. Assigning sexes to parents would allow simulation of the X-chromosome and sex-biased migration. Recombination can be extended to model crossover interference and sex-biased recombination, which have effects on the distribution of IBD [<xref ref-type="bibr" rid="pgen.1008619.ref031">31</xref>], as well as non-crossover events.</p>
<p>Finally, the performance of the hybrid model could also be improved. If the number of Wright-Fisher generations were chosen optimally, the hybrid model would likely be more efficient than pure Wright-Fisher simulations in nearly all scenarios. Better guidelines for finding this optimal value could be developed, or possibly built into the simulation framework itself.</p>
<p>The limitations of the coalescent model have been well-studied, but were generally tied to modest effects except in very large cohorts [<xref ref-type="bibr" rid="pgen.1008619.ref013">13</xref>]. We have shown significant qualitative and quantitative biases in whole-genome simulations of large, complex cohorts. Analysis of such cohorts is challenging, and simulations are a valuable tool for evaluating disease associations and the effects of demography in this context. We have presented here an extension to <monospace>msprime</monospace> which corrects major biases and increases performance at whole-genome scale, allowing simulations to continue supporting modern large-scale sequencing efforts.</p>
</sec>
<sec id="sec012">
<title>Supporting information</title>
<supplementary-material id="pgen.1008619.s001" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pgen.1008619.s001" xlink:type="simple">
<label>S1 Appendix</label>
<caption>
<title>Wright-Fisher implementation details.</title>
<p>(PDF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pgen.1008619.s002" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pgen.1008619.s002" xlink:type="simple">
<label>S2 Appendix</label>
<caption>
<title>Long-range linkage disequilibrium.</title>
<p>(PDF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pgen.1008619.s003" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pgen.1008619.s003" xlink:type="simple">
<label>S3 Appendix</label>
<caption>
<title>An approximate model for IBD sharing.</title>
<p>(PDF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pgen.1008619.s004" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pgen.1008619.s004" xlink:type="simple">
<label>S4 Appendix</label>
<caption>
<title>The Genizon Biobank.</title>
<p>(PDF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pgen.1008619.s005" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pgen.1008619.s005" xlink:type="simple">
<label>S1 Table</label>
<caption>
<title>Relative difference in mean number of singletons, doubletons, and tripletons under the Wright-Fisher (<italic>N</italic><sub><italic>WF</italic></sub>) and Hudson (<italic>N</italic><sub><italic>H</italic></sub>) models.</title>
<p>(PDF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pgen.1008619.s006" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pgen.1008619.s006" xlink:type="simple">
<label>S1 Fig</label>
<caption>
<title>Number of singletons, doubletons, and tripletons simulated under Wright-Fisher and Hudson coalescent models.</title>
<p>A 1Mb region was simulated 100 times in 20,000 haploid lineages in a diploid population of 10,000 individuals.</p>
<p>(PDF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pgen.1008619.s007" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pgen.1008619.s007" xlink:type="simple">
<label>S2 Fig</label>
<caption>
<title>Number of surviving lineages over time in coalescent and back-in-time Wright-Fisher dynamics.</title>
<p>We simulated 10,000 haploid whole genomes with 22 chromosomes of realistic lengths in a population of 10,000 diploid individuals. The method for simulating multiple chromosomes is described in <xref ref-type="supplementary-material" rid="pgen.1008619.s001">S1 Appendix</xref>. Similar results were shown in [<xref ref-type="bibr" rid="pgen.1008619.ref021">21</xref>].</p>
<p>(PDF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pgen.1008619.s008" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pgen.1008619.s008" xlink:type="simple">
<label>S3 Fig</label>
<caption>
<title>Number of IBD segments between pairs of individuals versus total length of shared IBD segments.</title>
<p>22 chromosomes of realistic lengths, simulated under Wright-Fisher model (top) and coalescent (bottom), compared to the analytical expectation under Eqs (1) and (2) in <xref ref-type="supplementary-material" rid="pgen.1008619.s003">S3 Appendix</xref>. Effective population size 10,000, sample size A) 5000, B) 2500, C) 1000, D) 500. Minimum IBD segment length of 1 centimorgan.</p>
<p>(PDF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pgen.1008619.s009" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pgen.1008619.s009" xlink:type="simple">
<label>S4 Fig</label>
<caption>
<title>Number of IBD segments between pairs of individuals versus total length of shared IBD segments, under the Gutenkunst et al. (2009) [<xref ref-type="bibr" rid="pgen.1008619.ref003">3</xref>] out-of-Africa model.</title>
<p>22 chromosomes of realistic lengths, simulated under Wright-Fisher model (top) and coalescent (bottom), compared to the analytical expectation under Eqs (1) and (2) in <xref ref-type="supplementary-material" rid="pgen.1008619.s003">S3 Appendix</xref>. The African, European, and Asian populations had 1000 haploid samples each.</p>
<p>(PDF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pgen.1008619.s010" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pgen.1008619.s010" xlink:type="simple">
<label>S5 Fig</label>
<caption>
<title>Computation time of Hudson coalescent, Wright-Fisher, and hybrid models with 100 and 1000 Wright-Fisher generations before switching to the coalescent.</title>
<p>Simulations contain from 1 to 22 chromosomes of realistic lengths, using the method described in <xref ref-type="supplementary-material" rid="pgen.1008619.s001">S1 Appendix</xref>, in 500 haploid samples within a diploid population of size 500.</p>
<p>(PDF)</p>
</caption>
</supplementary-material>
</sec>
</body>
<back>
<ref-list>
<title>References</title>
<ref id="pgen.1008619.ref001">
<label>1</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Carlson</surname> <given-names>CS</given-names></name>, <name name-style="western"><surname>Eberle</surname> <given-names>MA</given-names></name>, <name name-style="western"><surname>Rieder</surname> <given-names>MJ</given-names></name>, <name name-style="western"><surname>Yi</surname> <given-names>Q</given-names></name>, <name name-style="western"><surname>Kruglyak</surname> <given-names>L</given-names></name>, <name name-style="western"><surname>Nickerson</surname> <given-names>DA</given-names></name>. <article-title>Selecting a Maximally Informative Set of Single-Nucleotide Polymorphisms for Association Analyses Using Linkage Disequilibrium</article-title>. <source>The American Journal of Human Genetics</source>. <year>2004</year>;<volume>74</volume>(<issue>1</issue>):<fpage>106</fpage>–<lpage>120</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1086/381000" xlink:type="simple">10.1086/381000</ext-link></comment> <object-id pub-id-type="pmid">14681826</object-id></mixed-citation>
</ref>
<ref id="pgen.1008619.ref002">
<label>2</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Voight</surname> <given-names>BF</given-names></name>, <name name-style="western"><surname>Kudaravalli</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Wen</surname> <given-names>X</given-names></name>, <name name-style="western"><surname>Pritchard</surname> <given-names>JK</given-names></name>. <article-title>A map of recent positive selection in the human genome</article-title>. <source>PLoS biology</source>. <year>2006</year>;<volume>4</volume>(<issue>3</issue>):<fpage>e72</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1371/journal.pbio.0040072" xlink:type="simple">10.1371/journal.pbio.0040072</ext-link></comment> <object-id pub-id-type="pmid">16494531</object-id></mixed-citation>
</ref>
<ref id="pgen.1008619.ref003">
<label>3</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Gutenkunst</surname> <given-names>RN</given-names></name>, <name name-style="western"><surname>Hernandez</surname> <given-names>RD</given-names></name>, <name name-style="western"><surname>Williamson</surname> <given-names>SH</given-names></name>, <name name-style="western"><surname>Bustamante</surname> <given-names>CD</given-names></name>. <article-title>Inferring the joint demographic history of multiple populations from multidimensional SNP frequency data</article-title>. <source>PLoS Genetics</source>. <year>2009</year>;<volume>5</volume>(<issue>10</issue>). <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1371/journal.pgen.1000695" xlink:type="simple">10.1371/journal.pgen.1000695</ext-link></comment> <object-id pub-id-type="pmid">19851460</object-id></mixed-citation>
</ref>
<ref id="pgen.1008619.ref004">
<label>4</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Li</surname> <given-names>H</given-names></name>, <name name-style="western"><surname>Durbin</surname> <given-names>R</given-names></name>. <article-title>Inference of human population history from individual whole-genome sequences</article-title>. <source>Nature</source>. <year>2011</year>;<volume>475</volume>(<issue>7357</issue>):<fpage>493</fpage>–<lpage>6</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1038/nature10231" xlink:type="simple">10.1038/nature10231</ext-link></comment> <object-id pub-id-type="pmid">21753753</object-id></mixed-citation>
</ref>
<ref id="pgen.1008619.ref005">
<label>5</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Li</surname> <given-names>N</given-names></name>, <name name-style="western"><surname>Stephens</surname> <given-names>M</given-names></name>. <article-title>Modeling Linkage Disequilibrium and Identifying Recombination Hotspots Using Single-Nucleotide Polymorphism Data</article-title>. <source>Genetics</source>. <year>2003</year>;<volume>165</volume>(<issue>4</issue>):<fpage>2213</fpage>–<lpage>2233</lpage>. <object-id pub-id-type="pmid">14704198</object-id></mixed-citation>
</ref>
<ref id="pgen.1008619.ref006">
<label>6</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Nielsen</surname> <given-names>R</given-names></name>, <name name-style="western"><surname>Williamson</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Kim</surname> <given-names>Y</given-names></name>, <name name-style="western"><surname>Hubisz</surname> <given-names>MJ</given-names></name>, <name name-style="western"><surname>Clark</surname> <given-names>AG</given-names></name>, <name name-style="western"><surname>Bustamante</surname> <given-names>C</given-names></name>. <article-title>Genomic scans for selective sweeps using SNP data</article-title>. <source>Genome Research</source>. <year>2005</year>;<volume>15</volume>(<issue>11</issue>):<fpage>1566</fpage>–<lpage>1575</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1101/gr.4252305" xlink:type="simple">10.1101/gr.4252305</ext-link></comment> <object-id pub-id-type="pmid">16251466</object-id></mixed-citation>
</ref>
<ref id="pgen.1008619.ref007">
<label>7</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Hudson</surname> <given-names>RR</given-names></name>. <article-title>Generating samples under a Wright-Fisher neutral model of genetic variation</article-title>. <source>Bioinformatics</source>. <year>2002</year>;<volume>18</volume>(<issue>2</issue>):<fpage>337</fpage>–<lpage>338</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1093/bioinformatics/18.2.337" xlink:type="simple">10.1093/bioinformatics/18.2.337</ext-link></comment> <object-id pub-id-type="pmid">11847089</object-id></mixed-citation>
</ref>
<ref id="pgen.1008619.ref008">
<label>8</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Kelleher</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Etheridge</surname> <given-names>AM</given-names></name>, <name name-style="western"><surname>McVean</surname> <given-names>G</given-names></name>. <article-title>Efficient Coalescent Simulation and Genealogical Analysis for Large Sample Sizes</article-title>. <source>PLoS Comput Biol</source>. <year>2016</year>;<volume>12</volume>(<issue>5</issue>):<fpage>1</fpage>–<lpage>22</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1371/journal.pcbi.1004842" xlink:type="simple">10.1371/journal.pcbi.1004842</ext-link></comment></mixed-citation>
</ref>
<ref id="pgen.1008619.ref009">
<label>9</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Hudson</surname> <given-names>RR</given-names></name>. <article-title>Properties of a neutral allele model with intragenic recombination</article-title>. <source>Theoretical Population Biology</source>. <year>1983</year>;<volume>23</volume>(<issue>2</issue>):<fpage>183</fpage>–<lpage>201</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1016/0040-5809(83)90013-8" xlink:type="simple">10.1016/0040-5809(83)90013-8</ext-link></comment> <object-id pub-id-type="pmid">6612631</object-id></mixed-citation>
</ref>
<ref id="pgen.1008619.ref010">
<label>10</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Kelleher</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Thornton</surname> <given-names>KR</given-names></name>, <name name-style="western"><surname>Ashander</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Ralph</surname> <given-names>PL</given-names></name>. <article-title>Efficient pedigree recording for fast population genetics simulation</article-title>. <source>PLoS computational biology</source>. <year>2018</year>;<volume>14</volume>(<issue>11</issue>):<fpage>e1006581</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1371/journal.pcbi.1006581" xlink:type="simple">10.1371/journal.pcbi.1006581</ext-link></comment> <object-id pub-id-type="pmid">30383757</object-id></mixed-citation>
</ref>
<ref id="pgen.1008619.ref011">
<label>11</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Kelleher</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Wong</surname> <given-names>Y</given-names></name>, <name name-style="western"><surname>Wohns</surname> <given-names>AW</given-names></name>, <name name-style="western"><surname>Fadil</surname> <given-names>C</given-names></name>, <name name-style="western"><surname>Albers</surname> <given-names>PK</given-names></name>, <name name-style="western"><surname>McVean</surname> <given-names>G</given-names></name>. <article-title>Inferring whole-genome histories in large population datasets</article-title>. <source>Nature Genetics</source>. <year>2019</year>;<volume>51</volume>(<issue>9</issue>):<fpage>1330</fpage>–<lpage>1338</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1038/s41588-019-0483-y" xlink:type="simple">10.1038/s41588-019-0483-y</ext-link></comment> <object-id pub-id-type="pmid">31477934</object-id></mixed-citation>
</ref>
<ref id="pgen.1008619.ref012">
<label>12</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Wakeley</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>King</surname> <given-names>L</given-names></name>, <name name-style="western"><surname>Low</surname> <given-names>BS</given-names></name>, <name name-style="western"><surname>Ramachandran</surname> <given-names>S</given-names></name>. <article-title>Gene genealogies within a fixed pedigree, and the robustness of Kingman’s coalescent</article-title>. <source>Genetics</source>. <year>2012</year>;<volume>190</volume>(<issue>4</issue>):<fpage>1433</fpage>–<lpage>1445</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1534/genetics.111.135574" xlink:type="simple">10.1534/genetics.111.135574</ext-link></comment> <object-id pub-id-type="pmid">22234858</object-id></mixed-citation>
</ref>
<ref id="pgen.1008619.ref013">
<label>13</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Bhaskar</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Clark</surname> <given-names>AG</given-names></name>, <name name-style="western"><surname>Song</surname> <given-names>YS</given-names></name>. <article-title>Distortion of genealogical properties when the sample is very large</article-title>. <source>Proceedings of the National Academy of Sciences of the United States of America</source>. <year>2014</year>;<volume>111</volume>(<issue>6</issue>):<fpage>2385</fpage>–<lpage>90</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1073/pnas.1322709111" xlink:type="simple">10.1073/pnas.1322709111</ext-link></comment> <object-id pub-id-type="pmid">24469801</object-id></mixed-citation>
</ref>
<ref id="pgen.1008619.ref014">
<label>14</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Palamara</surname> <given-names>PF</given-names></name>. <article-title>ARGON: fast, whole-genome simulation of the discrete time Wright-Fisher process</article-title>. <source>Bioinformatics</source>. <year>2016</year>;<volume>32</volume>(<issue>19</issue>):<fpage>3032</fpage>–<lpage>3034</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1093/bioinformatics/btw355" xlink:type="simple">10.1093/bioinformatics/btw355</ext-link></comment> <object-id pub-id-type="pmid">27312410</object-id></mixed-citation>
</ref>
<ref id="pgen.1008619.ref015">
<label>15</label>
<mixed-citation publication-type="book" xlink:type="simple">
<name name-style="western"><surname>Hudson</surname> <given-names>RR</given-names></name>. <chapter-title>Gene genealogies and the coalescent process</chapter-title>. In: <name name-style="western"><surname>Futuyma</surname> <given-names>D</given-names></name>. and <name name-style="western"><surname>Antonovics</surname> <given-names>J</given-names></name>. (eds), <source>Oxford Surveys in Evolutionary Biology</source>. <volume>vol. 7</volume>; <year>1990</year>. p. <fpage>1</fpage>–<lpage>44</lpage>.</mixed-citation>
</ref>
<ref id="pgen.1008619.ref016">
<label>16</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Wilton</surname> <given-names>PR</given-names></name>, <name name-style="western"><surname>Baduel</surname> <given-names>P</given-names></name>, <name name-style="western"><surname>Landon</surname> <given-names>MM</given-names></name>, <name name-style="western"><surname>Wakeley</surname> <given-names>J</given-names></name>. <article-title>Population structure and coalescence in pedigrees: Comparisons to the structured coalescent and a framework for inference</article-title>. <source>Theoretical Population Biology</source>. <year>2017</year>;<volume>115</volume>:<fpage>1</fpage>–<lpage>12</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1016/j.tpb.2017.01.004" xlink:type="simple">10.1016/j.tpb.2017.01.004</ext-link></comment> <object-id pub-id-type="pmid">28143695</object-id></mixed-citation>
</ref>
<ref id="pgen.1008619.ref017">
<label>17</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>King</surname> <given-names>L</given-names></name>, <name name-style="western"><surname>Wakeley</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Carmi</surname> <given-names>S</given-names></name>. <article-title>A non-zero variance of Tajima’s estimator for two sequences even for infinitely many unlinked loci</article-title>. <source>Theoretical Population Biology</source>. <year>2018</year>;<volume>122</volume>:<fpage>22</fpage>–<lpage>29</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1016/j.tpb.2017.03.002" xlink:type="simple">10.1016/j.tpb.2017.03.002</ext-link></comment> <object-id pub-id-type="pmid">28341209</object-id></mixed-citation>
</ref>
<ref id="pgen.1008619.ref018">
<label>18</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Liang</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Nielsen</surname> <given-names>R</given-names></name>. <article-title>The lengths of admixture tracts</article-title>. <source>Genetics</source>. <year>2014</year>;<volume>197</volume>(<issue>3</issue>):<fpage>953</fpage>–<lpage>967</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1534/genetics.114.162362" xlink:type="simple">10.1534/genetics.114.162362</ext-link></comment> <object-id pub-id-type="pmid">24770332</object-id></mixed-citation>
</ref>
<ref id="pgen.1008619.ref019">
<label>19</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Ball</surname> <given-names>RM</given-names></name>, <name name-style="western"><surname>Neigel</surname> <given-names>JE</given-names></name>, <name name-style="western"><surname>Avise</surname> <given-names>JC</given-names></name>. <article-title>Gene Genealogies within the Organismal Pedigrees of Random-Mating Populations</article-title>. <source>Evolution</source>. <year>1990</year>;<volume>44</volume>(<issue>2</issue>):<fpage>360</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1111/j.1558-5646.1990.tb05205.x" xlink:type="simple">10.1111/j.1558-5646.1990.tb05205.x</ext-link></comment> <object-id pub-id-type="pmid">28564387</object-id></mixed-citation>
</ref>
<ref id="pgen.1008619.ref020">
<label>20</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Verhoeven</surname> <given-names>KJF</given-names></name>, <name name-style="western"><surname>Simonsen</surname> <given-names>KL</given-names></name>. <article-title>Genomic haplotype blocks may not accurately reflect spatial variation in historic recombination intensity</article-title>. <source>Molecular Biology and Evolution</source>. <year>2005</year>;<volume>22</volume>(<issue>3</issue>):<fpage>735</fpage>–<lpage>740</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1093/molbev/msi058" xlink:type="simple">10.1093/molbev/msi058</ext-link></comment> <object-id pub-id-type="pmid">15563716</object-id></mixed-citation>
</ref>
<ref id="pgen.1008619.ref021">
<label>21</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Davies</surname> <given-names>JL</given-names></name>, <name name-style="western"><surname>Simančík</surname> <given-names>F</given-names></name>, <name name-style="western"><surname>Lyngsø</surname> <given-names>R</given-names></name>, <name name-style="western"><surname>Mailund</surname> <given-names>T</given-names></name>, <name name-style="western"><surname>Hein</surname> <given-names>J</given-names></name>. <article-title>On recombination-induced multiple and simultaneous coalescent events</article-title>. <source>Genetics</source>. <year>2007</year>;<volume>177</volume>(<issue>4</issue>):<fpage>2151</fpage>–<lpage>2160</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1534/genetics.107.071126" xlink:type="simple">10.1534/genetics.107.071126</ext-link></comment> <object-id pub-id-type="pmid">17947442</object-id></mixed-citation>
</ref>
<ref id="pgen.1008619.ref022">
<label>22</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Henn</surname> <given-names>BM</given-names></name>, <name name-style="western"><surname>Hon</surname> <given-names>L</given-names></name>, <name name-style="western"><surname>Macpherson</surname> <given-names>JM</given-names></name>, <name name-style="western"><surname>Eriksson</surname> <given-names>N</given-names></name>, <name name-style="western"><surname>Saxonov</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Pe’er</surname> <given-names>I</given-names></name>, <etal>et al</etal>. <article-title>Cryptic distant relatives are common in both isolated and cosmopolitan genetic samples</article-title>. <source>PLoS ONE</source>. <year>2012</year>;<volume>7</volume>(<issue>4</issue>). <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1371/journal.pone.0034267" xlink:type="simple">10.1371/journal.pone.0034267</ext-link></comment></mixed-citation>
</ref>
<ref id="pgen.1008619.ref023">
<label>23</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Shchur</surname> <given-names>V</given-names></name>, <name name-style="western"><surname>Nielsen</surname> <given-names>R</given-names></name>. <article-title>On the number of siblings and p-th cousins in a large population sample</article-title>. <source>Journal of Mathematical Biology</source>. <year>2018</year>;<volume>77</volume>(<issue>5</issue>):<fpage>1</fpage>–<lpage>20</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1007/s00285-018-1252-8" xlink:type="simple">10.1007/s00285-018-1252-8</ext-link></comment></mixed-citation>
</ref>
<ref id="pgen.1008619.ref024">
<label>24</label>
<mixed-citation publication-type="other" xlink:type="simple">Genome Quebec. Genizon Biobank; (2020). <ext-link ext-link-type="uri" xlink:href="http://www.genomequebec.com/genizon-biobank/" xlink:type="simple">http://www.genomequebec.com/genizon-biobank/</ext-link>.</mixed-citation>
</ref>
<ref id="pgen.1008619.ref025">
<label>25</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Waples</surname> <given-names>RS</given-names></name>. <article-title>A bias correction for estimates of effective population size based on linkage disequilibrium at unlinked gene loci</article-title>. <source>Conservation Genetics</source>. <year>2006</year>;<volume>7</volume>(<issue>2</issue>):<fpage>167</fpage>–<lpage>184</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1007/s10592-005-9100-y" xlink:type="simple">10.1007/s10592-005-9100-y</ext-link></comment></mixed-citation>
</ref>
<ref id="pgen.1008619.ref026">
<label>26</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Ragsdale</surname> <given-names>AP</given-names></name>, <name name-style="western"><surname>Gravel</surname> <given-names>S</given-names></name>. <article-title>Unbiased Estimation of Linkage Disequilibrium from Unphased Data</article-title>. <source>Molecular Biology and Evolution</source>. <year>2019</year>.</mixed-citation>
</ref>
<ref id="pgen.1008619.ref027">
<label>27</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Gravel</surname> <given-names>S</given-names></name>. <article-title>Population genetics models of local ancestry</article-title>. <source>Genetics</source>. <year>2012</year>;<volume>191</volume>(<issue>2</issue>):<fpage>607</fpage>–<lpage>619</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1534/genetics.112.139808" xlink:type="simple">10.1534/genetics.112.139808</ext-link></comment> <object-id pub-id-type="pmid">22491189</object-id></mixed-citation>
</ref>
<ref id="pgen.1008619.ref028">
<label>28</label>
<mixed-citation publication-type="book" xlink:type="simple">
<name name-style="western"><surname>Fisher</surname> <given-names>R</given-names></name>. <source>The genetical theory of natural selection</source>. <publisher-name>Clarendon Press</publisher-name>; <year>1930</year>.</mixed-citation>
</ref>
<ref id="pgen.1008619.ref029">
<label>29</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Wright</surname> <given-names>S</given-names></name>. <article-title>Evolution in Mendelian populations</article-title>. <source>Genetics</source>. <year>1931</year>;<volume>16</volume>(<issue>2</issue>):<fpage>97</fpage>. <object-id pub-id-type="pmid">17246615</object-id></mixed-citation>
</ref>
<ref id="pgen.1008619.ref030">
<label>30</label>
<mixed-citation publication-type="other" xlink:type="simple">BALSAC. BALSAC Population Database: 2016-2017 Annual Report; 2018. <ext-link ext-link-type="uri" xlink:href="http://balsac.uqac.ca/english/files/2018/01/BALSAC_RA2017_EN_page_WEB_v2-1.pdf" xlink:type="simple">http://balsac.uqac.ca/english/files/2018/01/BALSAC_RA2017_EN_page_WEB_v2-1.pdf</ext-link>.</mixed-citation>
</ref>
<ref id="pgen.1008619.ref031">
<label>31</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Caballero</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Seidman</surname> <given-names>DN</given-names></name>, <name name-style="western"><surname>Qiao</surname> <given-names>Y</given-names></name>, <name name-style="western"><surname>Sannerud</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Dyer</surname> <given-names>TD</given-names></name>, <name name-style="western"><surname>Lehman</surname> <given-names>DM</given-names></name>, <etal>et al</etal>. <article-title>Crossover interference and sex-specific genetic maps shape identical by descent sharing in close relatives</article-title>. <source>PLOS Genetics</source>. <year>2019</year>;<volume>15</volume>(<issue>12</issue>):<fpage>1</fpage>–<lpage>29</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1371/journal.pgen.1007979" xlink:type="simple">10.1371/journal.pgen.1007979</ext-link></comment></mixed-citation>
</ref>
</ref-list>
</back>
<sub-article article-type="aggregated-review-documents" id="pgen.1008619.r001" specific-use="decision-letter">
<front-stub>
<article-id pub-id-type="doi">10.1371/journal.pgen.1008619.r001</article-id>
<title-group>
<article-title>Decision Letter 0</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name name-style="western">
<surname>Tang</surname>
<given-names>Hua</given-names>
</name>
<role>Section Editor: Natural Variation</role>
</contrib>
<contrib contrib-type="author">
<name name-style="western">
<surname>Williams</surname>
<given-names>Amy L.</given-names>
</name>
<role>Guest Editor</role>
</contrib>
</contrib-group>
<permissions>
<copyright-year>2020</copyright-year>
<copyright-holder>Tang, Williams</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<license-p>This is an open access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">Creative Commons Attribution License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.</license-p>
</license>
</permissions>
<related-object document-id="10.1371/journal.pgen.1008619" document-id-type="doi" document-type="article" id="rel-obj001" link-type="peer-reviewed-article"/>
<custom-meta-group>
<custom-meta>
<meta-name>Submission Version</meta-name>
<meta-value>0</meta-value>
</custom-meta>
</custom-meta-group>
</front-stub>
<body>
<p>
<named-content content-type="letter-date">4 Jul 2019</named-content>
</p>
<p>Dear Dr Nelson,</p>
<p>Thank you very much for submitting your Research Article entitled 'Coupling Wright-Fisher and coalescent dynamics for realistic simulation of population-scale datasets' to PLOS Genetics. Your manuscript was fully evaluated at the editorial level and by independent peer reviewers. The reviewers appreciated the attention to an important problem, but raised some substantial concerns about the current manuscript. Based on the reviews, we will not be able to accept this version of the manuscript, but we would be willing to review again a much-revised version. We cannot, of course, promise publication at that time.</p>
<p>Should you decide to revise the manuscript for further consideration here, your revisions should address the specific points made by each reviewer. Of note, both reviewers asked for a more detailed discussion and analysis of the impact of Wright-Fisher modeling. As noted by reviewer 2, performing simulations in a pedigree addresses the issues raised in this manuscript and should be explored. We will also require a detailed list of your responses to the review comments and a description of the changes you have made in the manuscript.</p>
<p>If you decide to revise the manuscript for further consideration at PLOS Genetics, please aim to resubmit within the next 60 days, unless it will take extra time to address the concerns of the reviewers, in which case we would appreciate an expected resubmission date by email to <email xlink:type="simple">plosgenetics@plos.org</email>.</p>
<p>If present, accompanying reviewer attachments are included with this email; please notify the journal office if any appear to be missing. They will also be available for download from the link below. You can use this link to log into the system when you are ready to submit a revised version, having first consulted our <ext-link ext-link-type="uri" xlink:href="http://journals.plos.org/plosgenetics/s/submit-now#loc-submission-checklist" xlink:type="simple">Submission Checklist</ext-link>.</p>
<p>To enhance the reproducibility of your results, we recommend that you deposit your laboratory protocols in protocols.io, where a protocol can be assigned its own identifier (DOI) such that it can be cited independently in the future. For instructions see our <ext-link ext-link-type="uri" xlink:href="http://journals.plos.org/plosgenetics/s/submission-guidelines#loc-materials-and-methods" xlink:type="simple">guidelines</ext-link>.</p>
<p>Please be aware that our <ext-link ext-link-type="uri" xlink:href="http://journals.plos.org/plosgenetics/s/data-availability" xlink:type="simple">data availability policy</ext-link> requires that all numerical data underlying graphs or summary statistics are included with the submission, and you will need to provide this upon resubmission if not already present. In addition, we do not permit the inclusion of phrases such as "data not shown" or "unpublished results" in manuscripts. All points should be backed up by data provided with the submission.</p>
<p>While revising your submission, please upload your figure files to the <ext-link ext-link-type="uri" xlink:href="http://pace.apexcovantage.com/" xlink:type="simple">Preflight Analysis and Conversion Engine</ext-link> (PACE) digital diagnostic tool.  PACE helps ensure that figures meet PLOS requirements. To use PACE, you must first register as a user. Then, login and navigate to the UPLOAD tab, where you will find detailed instructions on how to use the tool. If you encounter any issues or have any questions when using PACE, please email us at <email xlink:type="simple">figures@plos.org</email>.</p>
<p>PLOS has incorporated <ext-link ext-link-type="uri" xlink:href="http://www.crossref.org/crosscheck.html" xlink:type="simple">Similarity Check</ext-link>, powered by iThenticate, into its journal-wide submission system in order to screen submitted content for originality before publication. Each PLOS journal undertakes screening on a proportion of submitted articles. You will be contacted if needed following the screening process.</p>
<p>To resubmit, use the link below and 'Revise Submission' in the 'Submissions Needing Revision' folder.</p>
<p>[LINK]</p>
<p>We are sorry that we cannot be more positive about your manuscript at this stage. Please do not hesitate to contact us if you have any concerns or questions.</p>
<p>Yours sincerely,</p>
<p>Amy L. Williams</p>
<p>Guest Editor</p>
<p>PLOS Genetics</p>
<p>Hua Tang</p>
<p>Section Editor: Natural Variation</p>
<p>PLOS Genetics</p>
<p>Reviewer's Responses to Questions</p>
<p><bold>Comments to the Authors:</bold></p>
<p><bold>Please note here if the review is uploaded as an attachment.</bold></p>
<p>Reviewer #1: Review of Nelson et al</p>
<p>In this manuscript the authors describe an implementation of a discrete-time Wright-Fisher model as part of the msprime package. The authors show how the coalescent approximation breaks down when the sample size, call it n, approaches the effective population size, Ne, and when large regions of the genome are simulated. While this result has been known for some time, the authors describe new facets to this issue and provide an implemented solution. The DTWF model that they implement compares favorably to the Hudson coalescent with respect to runtime and adequately captures features of the genealogical process that the coalescent approximation cannot.</p>
<p>Generally I believe this to be a significant contribution, however the manuscript as written needs some substantial revision. I have point by point criticisms and suggestions that follow.</p>
<p>1) Generally it would be helpful to explore at what ratio of n/Ne these issues manifest. I would suggest revising the figures 2 and 3 to include a number of n/Ne ratios to show how this scaling affects the fit of the coalescent approximation.</p>
<p>2) Figure 1A is very hard to follow. It certainly does not clarify what is going on. I’d suggest the authors create a new figure to describe things.</p>
<p>3) Lines 87-90 in the motivation section. Is this issue purely a consequence of diploidy not being modeled? If the authors could explain the rationale a bit here it would be helpful.</p>
<p>4) Also with respect to motivation—the authors currently look at IBD tract lengths and the variance in ancestry as potential issues with the coalescent approximation. While this is great, both of those essentially are two facets of the same issue—recombination not being adequately captured by the coalescent. Can the authors look at different aspects of the data? For instance is the SFS perturbed in this regime under the coalescent?</p>
<p>5) Line 134—typo here. No section given.</p>
<p>6) Fig 3—unclear what the units of TMRCA (shown in colors) are. Generations? Also in that figure—why is there a large gap in the data points in the top panel?</p>
<p>7) Figure 5 caption—the caption says that the sample was 1000 haploids but Ne=10000 diploids. Is this a typo? Was the actual sample size 10000 haploids?</p>
<p>8) With respect to hybrid models—it would be good to show how IBD and LD are affected by the hybrid model – are these features faithfully captured by using the hybrid models?</p>
<p>9) With respect to the performance analysis—it looks like the DTWF outperforms the coalescent model starting at 1e9 bp. While this is fine, we almost never have to simulate a billion bp chromosome and instead we can simulate unlinked chromosomes as separate, one from another. The authors should probably point this out.</p>
<p>10) In the Supplement the authors should spend more time describing the implementation. It is very non-technical at this point. Also the authors might point the reader to the code.</p>
<p>11) Line 382—typo “underestimated”</p>
<p>12) Last point—the authors should show how the issue of large samples not being adequately modeled under the coalescent is realized in empirical data. For instance the authors could analyse IBD tract lengths in the UK Biobank dataset and show that the distribution observed does not square with a coalescent process. As written the paper feels more like a technical computing note than a genetics paper.</p>
<p>Reviewer #2: This manuscript proposes a Wright-Fisher extension of msprime, a well-used coalescent simulator. Clearly this is a useful extension, but I feel that further work is needed for publication in PLoS Genetics.</p>
<p>First, it is disappointing to see only simulation results under a constant population size model. The authors should explore more realistic demographic models (e.g., previously inferred human population histories with two phases of exponential growth in the recent past) and study the accuracy of the standard coalescent model under those scenarios.</p>
<p>The authors have not directly demonstrated that using the WF model produces a better fit to real data. For example, it would be interesting to compare the IBD length distribution estimated from real data with simulation results from msprime (WF) and msprime (Hudson) under an inferred demographic model (e.g., inferred using the site frequency spectrum).</p>
<p>Co-author Kelleher has done interesting work on simulating pedigrees. It would be natural to think of a hybrid model where a pre-specified pedigree or a probabilistic pedigree model is used for the recent past, followed by the standard coalescent in the distant past. This would be a welcome extension and could be more useful than the WF extension. After all, the WF model is rather simple and idealized, while the actual mating pattern in real populations is much more complicated. Related to this point, would it be possible to incorporate other random mating models (e.g., general Cannings exchangeable models) into msprime?</p>
<p>Since one of the main motivations for the WF extension concerns IBD sharing, it seems important to implement crossover interference. If this is not an overly difficult extension, I would strongly recommend implementing it.</p>
<p>Please explain why msprime (WF) is faster than the previous version of msprime, as shown in Figure 5. Is it because the number of lineages is bounded by the population size in "msprime (WF)", as shown in Figure 2? It would be good to discuss Figure 5 in the context of Figure 2. Related to this point, please explain why "hybrid (100 WF generations)" is slower than "msprime (Hudson)", while "hybrid (1000 WF generations)" is faster than "msprime (Hudson)". To determine the optimal switch time in the hybrid model, it seems that one should investigate the trade-off between the computational overhead for using the WF model and the reduction in the number of lineages. This suggests that the optimal switch time would depend on the demographic model. This point should be clarified. Similarly, the authors should explain why "msprime (WF)" is less efficient than "msprime (Hudson)" for shorter regions, by discussing the trade-off mentioned above.</p>
<p>My understanding is that Bhaskar et al. (PNAS 2014, 111:2385-2390) first proposed the hybrid model, but this is not clearly acknowledged in the manuscript. The first three pages of the manuscript (including the title) give the impression that the idea is being proposed here for the first time.</p>
<p>Minor comments:</p>
<p>- Figure 1A: This figure is difficult to understand. Please explain it more clearly in the caption.</p>
<p>- Figure 2: Perhaps this should be plotted with the x-axis in log scale? Also, it would not hurt to mention that the x-axis is in "Generations (backwards in time)".</p>
<p>- Figure 3: In the top figure, please explain why there are few IBD segments of length between ~7*10^8 and ~10^9.</p>
<p>- Line 81-82: "We traced this phenomenon to samples having more than 2^t simulated ancestors at generation t in the past" is ambiguous. I think you meant, "We traced this phenomenon to some individuals in the sample having..."</p>
<p>- Line 95-96: It would help the reader to explain here why recent events in migration models induce long-range correlations along the genome.</p>
<p>- Lines 101-105: Bhaskar et al. (2014) compared the WF and the coalescent models with respect to the number of lineages at a single site. It would be good to discuss this result in relation to your result.</p>
<p>- There are blank references to sections throughout the manuscript. For example, Figure 2 caption ends with "described in Section ."</p>
<p>- Line 140: Replace "closely related samples" with "closely related individuals".</p>
<p>**********</p>
<p><bold>Have all data underlying the figures and results presented in the manuscript been provided?</bold></p>
<p>Large-scale datasets should be made available via a public repository as described in the <italic>PLOS Genetics</italic> <ext-link ext-link-type="uri" xlink:href="http://journals.plos.org/plosgenetics/s/data-availability" xlink:type="simple">data availability policy</ext-link>, and numerical data that underlies graphs or summary statistics should be provided in spreadsheet form as supporting information.</p>
<p>Reviewer #1: Yes</p>
<p>Reviewer #2: None</p>
<p>**********</p>
<p>PLOS authors have the option to publish the peer review history of their article (<ext-link ext-link-type="uri" xlink:href="https://journals.plos.org/plosgenetics/s/editorial-and-peer-review-process#loc-peer-review-history" xlink:type="simple">what does this mean?</ext-link>). If published, this will include your full peer review and any attached files.</p>
<p>If you choose “no”, your identity will remain anonymous but your review may still be made public.</p>
<p><bold>Do you want your identity to be public for this peer review?</bold> For information about this choice, including consent withdrawal, please see our <ext-link ext-link-type="uri" xlink:href="https://www.plos.org/privacy-policy" xlink:type="simple">Privacy Policy</ext-link>.</p>
<p>Reviewer #1: No</p>
<p>Reviewer #2: No</p>
</body>
</sub-article>
<sub-article article-type="author-comment" id="pgen.1008619.r002">
<front-stub>
<article-id pub-id-type="doi">10.1371/journal.pgen.1008619.r002</article-id>
<title-group>
<article-title>Author response to Decision Letter 0</article-title>
</title-group>
<related-object document-id="10.1371/journal.pgen.1008619" document-id-type="doi" document-type="peer-reviewed-article" id="rel-obj002" link-type="rebutted-decision-letter" object-id="10.1371/journal.pgen.1008619.r001" object-id-type="doi" object-type="decision-letter"/>
<custom-meta-group>
<custom-meta>
<meta-name>Submission Version</meta-name>
<meta-value>1</meta-value>
</custom-meta>
</custom-meta-group>
</front-stub>
<body>
<p>
<named-content content-type="author-response-date">4 Oct 2019</named-content>
</p>
<supplementary-material id="pgen.1008619.s011" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pgen.1008619.s011" xlink:type="simple">
<label>Attachment</label>
<caption>
<p>Submitted filename: <named-content content-type="submitted-filename">Response to reviewers.pdf</named-content></p>
</caption>
</supplementary-material>
</body>
</sub-article>
<sub-article article-type="aggregated-review-documents" id="pgen.1008619.r003" specific-use="decision-letter">
<front-stub>
<article-id pub-id-type="doi">10.1371/journal.pgen.1008619.r003</article-id>
<title-group>
<article-title>Decision Letter 1</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name name-style="western">
<surname>Tang</surname>
<given-names>Hua</given-names>
</name>
<role>Section Editor: Natural Variation</role>
</contrib>
<contrib contrib-type="author">
<name name-style="western">
<surname>Williams</surname>
<given-names>Amy L.</given-names>
</name>
<role>Guest Editor</role>
</contrib>
</contrib-group>
<permissions>
<copyright-year>2020</copyright-year>
<copyright-holder>Tang, Williams</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<license-p>This is an open access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">Creative Commons Attribution License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.</license-p>
</license>
</permissions>
<related-object document-id="10.1371/journal.pgen.1008619" document-id-type="doi" document-type="article" id="rel-obj003" link-type="peer-reviewed-article"/>
<custom-meta-group>
<custom-meta>
<meta-name>Submission Version</meta-name>
<meta-value>1</meta-value>
</custom-meta>
</custom-meta-group>
</front-stub>
<body>
<p>
<named-content content-type="letter-date">10 Dec 2019</named-content>
</p>
<p>* Please note while forming your response that, if your article is accepted, you may have the opportunity to make the peer review history publicly available. The record will include editor decision letters (with reviews) and your responses to reviewer comments. If eligible, we will contact you to opt in or out. *</p>
<p>Dear Dr Nelson,</p>
<p>Thank you very much for submitting your Research Article entitled 'Accounting for long-range correlations in genome-wide simulations of large cohorts' to PLOS Genetics. Your manuscript was fully evaluated at the editorial level and by independent peer reviewers. Only one comment from Reviewer 2 remains to be addressed, and this should likely be possible quickly. One possibility is to simply make a textual change.</p>
<p>We therefore ask you to modify the manuscript according to the review recommendations before we can consider your manuscript for acceptance. Your revisions should address the specific points made by each reviewer.</p>
<p>In addition we ask that you:</p>
<p>1) Provide a detailed list of your responses to the review comments and a description of the changes you have made in the manuscript.</p>
<p>2) Upload a Striking Image with a corresponding caption to accompany your manuscript if one is available (either a new image or an existing one from within your manuscript). If this image is judged to be suitable, it may be featured on our website. Images should ideally be high resolution, eye-catching, single panel square images. For examples, please browse our <ext-link ext-link-type="uri" xlink:href="http://www.plosgenetics.org/article/browse/volume" xlink:type="simple">archive</ext-link>. If your image is from someone other than yourself, please ensure that the artist has read and agreed to the terms and conditions of the Creative Commons Attribution License. Note: we cannot publish copyrighted images.</p>
<p>We hope to receive your revised manuscript within the next 30 days. If you anticipate any delay in its return, we would ask you to let us know the expected resubmission date by email to <email xlink:type="simple">plosgenetics@plos.org</email>.</p>
<p>If present, accompanying reviewer attachments should be included with this email; please notify the journal office if any appear to be missing. They will also be available for download from the link below. You can use this link to log into the system when you are ready to submit a revised version, having first consulted our <ext-link ext-link-type="uri" xlink:href="http://journals.plos.org/plosgenetics/s/submit-now#loc-submission-checklist" xlink:type="simple">Submission Checklist</ext-link>.</p>
<p>While revising your submission, please upload your figure files to the <ext-link ext-link-type="uri" xlink:href="http://pace.apexcovantage.com/" xlink:type="simple">Preflight Analysis and Conversion Engine</ext-link> (PACE) digital diagnostic tool. PACE helps ensure that figures meet PLOS requirements. To use PACE, you must first register as a user. Then, login and navigate to the UPLOAD tab, where you will find detailed instructions on how to use the tool. If you encounter any issues or have any questions when using PACE, please email us at <email xlink:type="simple">figures@plos.org</email>.</p>
<p>Please be aware that our <ext-link ext-link-type="uri" xlink:href="http://journals.plos.org/plosgenetics/s/data-availability" xlink:type="simple">data availability policy</ext-link> requires that all numerical data underlying graphs or summary statistics are included with the submission, and you will need to provide this upon resubmission if not already present. In addition, we do not permit the inclusion of phrases such as "data not shown" or "unpublished results" in manuscripts. All points should be backed up by data provided with the submission.</p>
<p>PLOS has incorporated <ext-link ext-link-type="uri" xlink:href="http://www.crossref.org/crosscheck.html" xlink:type="simple">Similarity Check</ext-link>, powered by iThenticate, into its journal-wide submission system in order to screen submitted content for originality before publication. Each PLOS journal undertakes screening on a proportion of submitted articles. You will be contacted if needed following the screening process.</p>
<p>To resubmit, you will need to go to the link below and 'Revise Submission' in the 'Submissions Needing Revision' folder.</p>
<p>[LINK]</p>
<p>Please let us know if you have any questions while making these revisions.</p>
<p>Yours sincerely,</p>
<p>Amy L. Williams</p>
<p>Guest Editor</p>
<p>PLOS Genetics</p>
<p>Hua Tang</p>
<p>Section Editor: Natural Variation</p>
<p>PLOS Genetics</p>
<p>Reviewers' Responses to Questions</p>
<p><bold>Comments to the Authors:</bold></p>
<p><bold>Please note here if the review is uploaded as an attachment.</bold></p>
<p>Reviewer #1: I'm pleased with the edits made to this revision. This is an excellent contribution.</p>
<p>Reviewer #2: Overall the authors have done a good job of revising the paper and I am generally satisfied with all the changes. One exception is their response to my first major comment regarding the effect of demography on the distribution of pairwise IBD length. The authors have done simulations using the Out-of-Africa model from Gutenkunst et al. (2009), but my understanding is that in that model the present effective population sizes of YRI, CEU, and CHB are 7300, 29524, and 53403, respectively. What would happen if the present effective population size were much larger, say 1 million or 10 million, while the sample size is held at 1000? The authors claim, "the overall relationship between IBD counts and IBD length ... does not depend on the details of the demographic history or sample sizes." To me, this seems like a strong claim which warrants more rigorous justification, as it might send an incorrect message to the reader. To what extent does it not depend on the demographic model? Could you be more quantitative?</p>
<p>**********</p>
<p><bold>Have all data underlying the figures and results presented in the manuscript been provided?</bold></p>
<p>Large-scale datasets should be made available via a public repository as described in the <italic>PLOS Genetics</italic> <ext-link ext-link-type="uri" xlink:href="http://journals.plos.org/plosgenetics/s/data-availability" xlink:type="simple">data availability policy</ext-link>, and numerical data that underlies graphs or summary statistics should be provided in spreadsheet form as supporting information.</p>
<p>Reviewer #1: Yes</p>
<p>Reviewer #2: None</p>
<p>**********</p>
<p>PLOS authors have the option to publish the peer review history of their article (<ext-link ext-link-type="uri" xlink:href="https://journals.plos.org/plosgenetics/s/editorial-and-peer-review-process#loc-peer-review-history" xlink:type="simple">what does this mean?</ext-link>). If published, this will include your full peer review and any attached files.</p>
<p>If you choose “no”, your identity will remain anonymous but your review may still be made public.</p>
<p><bold>Do you want your identity to be public for this peer review?</bold> For information about this choice, including consent withdrawal, please see our <ext-link ext-link-type="uri" xlink:href="https://www.plos.org/privacy-policy" xlink:type="simple">Privacy Policy</ext-link>.</p>
<p>Reviewer #1: No</p>
<p>Reviewer #2: No</p>
</body>
</sub-article>
<sub-article article-type="author-comment" id="pgen.1008619.r004">
<front-stub>
<article-id pub-id-type="doi">10.1371/journal.pgen.1008619.r004</article-id>
<title-group>
<article-title>Author response to Decision Letter 1</article-title>
</title-group>
<related-object document-id="10.1371/journal.pgen.1008619" document-id-type="doi" document-type="peer-reviewed-article" id="rel-obj004" link-type="rebutted-decision-letter" object-id="10.1371/journal.pgen.1008619.r003" object-id-type="doi" object-type="decision-letter"/>
<custom-meta-group>
<custom-meta>
<meta-name>Submission Version</meta-name>
<meta-value>2</meta-value>
</custom-meta>
</custom-meta-group>
</front-stub>
<body>
<p>
<named-content content-type="author-response-date">12 Jan 2020</named-content>
</p>
<supplementary-material id="pgen.1008619.s012" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pgen.1008619.s012" xlink:type="simple">
<label>Attachment</label>
<caption>
<p>Submitted filename: <named-content content-type="submitted-filename">Response to reviewers #2.pdf</named-content></p>
</caption>
</supplementary-material>
</body>
</sub-article>
<sub-article article-type="editor-report" id="pgen.1008619.r005" specific-use="decision-letter">
<front-stub>
<article-id pub-id-type="doi">10.1371/journal.pgen.1008619.r005</article-id>
<title-group>
<article-title>Decision Letter 2</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name name-style="western">
<surname>Tang</surname>
<given-names>Hua</given-names>
</name>
<role>Section Editor: Natural Variation</role>
</contrib>
<contrib contrib-type="author">
<name name-style="western">
<surname>Williams</surname>
<given-names>Amy L.</given-names>
</name>
<role>Guest Editor</role>
</contrib>
</contrib-group>
<permissions>
<copyright-year>2020</copyright-year>
<copyright-holder>Tang, Williams</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<license-p>This is an open access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">Creative Commons Attribution License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.</license-p>
</license>
</permissions>
<related-object document-id="10.1371/journal.pgen.1008619" document-id-type="doi" document-type="article" id="rel-obj005" link-type="peer-reviewed-article"/>
<custom-meta-group>
<custom-meta>
<meta-name>Submission Version</meta-name>
<meta-value>2</meta-value>
</custom-meta>
</custom-meta-group>
</front-stub>
<body>
<p>
<named-content content-type="letter-date">21 Jan 2020</named-content>
</p>
<p>Dear Dr Nelson,</p>
<p>We are pleased to inform you that your manuscript entitled "Accounting for long-range correlations in genome-wide simulations of large cohorts" has been editorially accepted for publication in PLOS Genetics. Congratulations!</p>
<p>Before your submission can be formally accepted and sent to production you will need to complete our formatting changes, which you will receive in a follow up email. Please be aware that it may take several days for you to receive this email; during this time no action is required by you. Please note: the accept date on your published article will reflect the date of this provisional accept, but your manuscript will not be scheduled for publication until the required changes have been made.</p>
<p>Once your paper is formally accepted, an uncorrected proof of your manuscript will be published online ahead of the final version, unless you’ve already opted out via the online submission form. If, for any reason, you do not want an earlier version of your manuscript published online or are unsure if you have already indicated as such, please let the journal staff know immediately at <email xlink:type="simple">plosgenetics@plos.org</email>.</p>
<p>In the meantime, please log into Editorial Manager at <ext-link ext-link-type="uri" xlink:href="https://www.editorialmanager.com/pgenetics/" xlink:type="simple">https://www.editorialmanager.com/pgenetics/</ext-link>, click the "Update My Information" link at the top of the page, and update your user information to ensure an efficient production and billing process. Note that PLOS requires an ORCID iD for all corresponding authors. Therefore, please ensure that you have an ORCID iD and that it is validated in Editorial Manager. To do this, go to "Update My Information" (in the upper left-hand corner of the main menu), and click on the Fetch/Validate link next to the ORCID field. This will take you to the ORCID site and allow you to create a new iD or authenticate a pre-existing iD in Editorial Manager.</p>
<p>If you have a press-related query, or would like to know about one way to make your underlying data available (as you will be aware, this is required for publication), please see the end of this email. If your institution or institutions have a press office, please notify them about your upcoming article at this point, to enable them to help maximise its impact. Inform journal staff as soon as possible if you are preparing a press release for your article and need a publication date.</p>
<p>Thank you again for supporting open-access publishing; we are looking forward to publishing your work in PLOS Genetics!</p>
<p>Yours sincerely,</p>
<p>Amy L. Williams</p>
<p>Guest Editor</p>
<p>PLOS Genetics</p>
<p>Hua Tang</p>
<p>Section Editor: Natural Variation</p>
<p>PLOS Genetics</p>
<p><ext-link ext-link-type="uri" xlink:href="http://www.plosgenetics.org" xlink:type="simple">www.plosgenetics.org</ext-link></p>
<p>Twitter: @PLOSGenetics</p>
<p>----------------------------------------------------</p>
<p>Comments from the reviewers (if applicable):</p>
<p>----------------------------------------------------</p>
<p><bold>Data Deposition</bold></p>
<p>If you have submitted a Research Article or Front Matter that has associated data that are not suitable for deposition in a subject-specific public repository (such as GenBank or ArrayExpress), one way to make that data available is to deposit it in the <ext-link ext-link-type="uri" xlink:href="http://www.datadryad.org" xlink:type="simple">Dryad Digital Repository</ext-link>. As you may recall, we ask all authors to agree to make data available; this is one way to achieve that. A full list of recommended repositories can be found on our <ext-link ext-link-type="uri" xlink:href="http://journals.plos.org/plosgenetics/s/data-availability#loc-recommended-repositories" xlink:type="simple">website</ext-link>.</p>
<p>The following link will take you to the Dryad record for your article, so you won't have to re‐enter its bibliographic information, and can upload your files directly: </p>
<p><ext-link ext-link-type="uri" xlink:href="http://datadryad.org/submit?journalID=pgenetics&amp;manu=PGENETICS-D-19-00848R2" xlink:type="simple">http://datadryad.org/submit?journalID=pgenetics&amp;manu=PGENETICS-D-19-00848R2</ext-link></p>
<p>More information about depositing data in Dryad is available at <ext-link ext-link-type="uri" xlink:href="http://www.datadryad.org/depositing" xlink:type="simple">http://www.datadryad.org/depositing</ext-link>. If you experience any difficulties in submitting your data, please contact <email xlink:type="simple">help@datadryad.org</email> for support.</p>
<p>Additionally, please be aware that our <ext-link ext-link-type="uri" xlink:href="http://journals.plos.org/plosgenetics/s/data-availability" xlink:type="simple">data availability policy</ext-link> requires that all numerical data underlying display items are included with the submission, and you will need to provide this before we can formally accept your manuscript, if not already present.</p>
<p>----------------------------------------------------</p>
<p><bold>Press Queries</bold></p>
<p>If you or your institution will be preparing press materials for this manuscript, or if you need to know your paper's publication date for media purposes, please inform the journal staff as soon as possible so that your submission can be scheduled accordingly. Your manuscript will remain under a strict press embargo until the publication date and time. This means an early version of your manuscript will not be published ahead of your final version. PLOS Genetics may also choose to issue a press release for your article. If there's anything the journal should know or you'd like more information, please get in touch via <email xlink:type="simple">plosgenetics@plos.org</email>.</p>
</body>
</sub-article>
<sub-article article-type="editor-report" id="pgen.1008619.r006" specific-use="acceptance-letter">
<front-stub>
<article-id pub-id-type="doi">10.1371/journal.pgen.1008619.r006</article-id>
<title-group>
<article-title>Acceptance letter</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name name-style="western">
<surname>Tang</surname>
<given-names>Hua</given-names>
</name>
<role>Section Editor: Natural Variation</role>
</contrib>
<contrib contrib-type="author">
<name name-style="western">
<surname>Williams</surname>
<given-names>Amy L.</given-names>
</name>
<role>Guest Editor</role>
</contrib>
</contrib-group>
<permissions>
<copyright-year>2020</copyright-year>
<copyright-holder>Tang, Williams</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<license-p>This is an open access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">Creative Commons Attribution License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.</license-p>
</license>
</permissions>
<related-object document-id="10.1371/journal.pgen.1008619" document-id-type="doi" document-type="article" id="rel-obj006" link-type="peer-reviewed-article"/>
</front-stub>
<body>
<p>
<named-content content-type="letter-date">28 Apr 2020</named-content>
</p>
<p>PGENETICS-D-19-00848R2 </p>
<p>Accounting for long-range correlations in genome-wide simulations of large cohorts </p>
<p>Dear Dr Nelson, </p>
<p>We are pleased to inform you that your manuscript entitled "Accounting for long-range correlations in genome-wide simulations of large cohorts" has been formally accepted for publication in PLOS Genetics! Your manuscript is now with our production department and you will be notified of the publication date in due course.</p>
<p>The corresponding author will soon be receiving a typeset proof for review, to ensure errors have not been introduced during production. Please review the PDF proof of your manuscript carefully, as this is the last chance to correct any errors. Please note that major changes, or those which affect the scientific understanding of the work, will likely cause delays to the publication date of your manuscript. </p>
<p>Soon after your final files are uploaded, unless you have opted out or your manuscript is a front-matter piece, the early version of your manuscript will be published online. The date of the early version will be your article's publication date. The final article will be published to the same URL, and all versions of the paper will be accessible to readers.</p>
<p>Thank you again for supporting PLOS Genetics and open-access publishing. We are looking forward to publishing your work! </p>
<p>With kind regards,</p>
<p>Matt Lyles</p>
<p>PLOS Genetics</p>
<p>On behalf of:</p>
<p>The PLOS Genetics Team</p>
<p>Carlyle House, Carlyle Road, Cambridge CB4 3DN | United Kingdom</p>
<p><email xlink:type="simple">plosgenetics@plos.org</email> | +44 (0) 1223-442823</p>
<p><ext-link ext-link-type="uri" xlink:href="http://plosgenetics.org" xlink:type="simple">plosgenetics.org</ext-link> | Twitter: @PLOSGenetics</p>
</body>
</sub-article>
</article>