<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1d3 20150301//EN" "http://jats.nlm.nih.gov/publishing/1.1d3/JATS-journalpublishing1.dtd">
<article article-type="research-article" dtd-version="1.1d3" xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="nlm-ta">PLoS ONE</journal-id>
<journal-id journal-id-type="publisher-id">plos</journal-id>
<journal-id journal-id-type="pmc">plosone</journal-id>
<journal-title-group>
<journal-title>PLOS ONE</journal-title>
</journal-title-group>
<issn pub-type="epub">1932-6203</issn>
<publisher>
<publisher-name>Public Library of Science</publisher-name>
<publisher-loc>San Francisco, CA USA</publisher-loc>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.1371/journal.pone.0280951</article-id>
<article-id pub-id-type="publisher-id">PONE-D-22-24527</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Research Article</subject>
</subj-group>
<subj-group subj-group-type="Discipline-v3">
<subject>Research and analysis methods</subject><subj-group><subject>Computational techniques</subject><subj-group><subject>Computational pipelines</subject></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Heredity</subject><subj-group><subject>Genetic mapping</subject><subj-group><subject>Variant genotypes</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Genetic loci</subject><subj-group><subject>Alleles</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Medicine and health sciences</subject><subj-group><subject>Oncology</subject><subj-group><subject>Cancers and neoplasms</subject><subj-group><subject>Gynecological tumors</subject><subj-group><subject>Ovarian cancer</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Heredity</subject><subj-group><subject>Heterozygosity</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Genomics</subject></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Medicine and health sciences</subject><subj-group><subject>Oncology</subject><subj-group><subject>Cancers and neoplasms</subject><subj-group><subject>Genitourinary tract tumors</subject><subj-group><subject>Prostate cancer</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Medicine and health sciences</subject><subj-group><subject>Urology</subject><subj-group><subject>Prostate diseases</subject><subj-group><subject>Prostate cancer</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Science policy</subject><subj-group><subject>Open science</subject><subj-group><subject>Open data</subject></subj-group></subj-group></subj-group></article-categories>
<title-group>
<article-title>Inflated expectations: Rare-variant association analysis using public controls</article-title>
<alt-title alt-title-type="running-head">Inflation using public controls</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">https://orcid.org/0000-0001-6274-2841</contrib-id>
<name name-style="western">
<surname>Kim</surname>
<given-names>Jung</given-names>
</name>
<role content-type="http://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role content-type="http://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role content-type="http://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role content-type="http://credit.niso.org/contributor-roles/validation/">Validation</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-original-draft/">Writing – original draft</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-review-editing/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"><sup>1</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<name name-style="western">
<surname>Karyadi</surname>
<given-names>Danielle M.</given-names>
</name>
<role content-type="http://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role content-type="http://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-original-draft/">Writing – original draft</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-review-editing/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"><sup>1</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">https://orcid.org/0000-0001-6584-005X</contrib-id>
<name name-style="western">
<surname>Hartley</surname>
<given-names>Stephen W.</given-names>
</name>
<role content-type="http://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role content-type="http://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role content-type="http://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-original-draft/">Writing – original draft</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-review-editing/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"><sup>1</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<name name-style="western">
<surname>Zhu</surname>
<given-names>Bin</given-names>
</name>
<role content-type="http://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role content-type="http://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role content-type="http://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role content-type="http://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-review-editing/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"><sup>1</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">https://orcid.org/0000-0001-9419-4384</contrib-id>
<name name-style="western">
<surname>Wang</surname>
<given-names>Mingyi</given-names>
</name>
<role content-type="http://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role content-type="http://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role content-type="http://credit.niso.org/contributor-roles/resources/">Resources</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-review-editing/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff002"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff003"><sup>3</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<name name-style="western">
<surname>Wu</surname>
<given-names>Dongjing</given-names>
</name>
<role content-type="http://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role content-type="http://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-review-editing/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff002"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff003"><sup>3</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<name name-style="western">
<surname>Song</surname>
<given-names>Lei</given-names>
</name>
<role content-type="http://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-review-editing/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"><sup>1</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<name name-style="western">
<surname>Armstrong</surname>
<given-names>Gregory T.</given-names>
</name>
<role content-type="http://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-review-editing/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff004"><sup>4</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">https://orcid.org/0000-0002-7755-5683</contrib-id>
<name name-style="western">
<surname>Bhatia</surname>
<given-names>Smita</given-names>
</name>
<role content-type="http://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-review-editing/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff005"><sup>5</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<name name-style="western">
<surname>Robison</surname>
<given-names>Leslie L.</given-names>
</name>
<role content-type="http://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-review-editing/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff004"><sup>4</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<name name-style="western">
<surname>Yasui</surname>
<given-names>Yutaka</given-names>
</name>
<role content-type="http://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-review-editing/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff004"><sup>4</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<name name-style="western">
<surname>Carter</surname>
<given-names>Brian</given-names>
</name>
<role content-type="http://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-review-editing/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff006"><sup>6</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<name name-style="western">
<surname>Sampson</surname>
<given-names>Joshua N.</given-names>
</name>
<role content-type="http://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role content-type="http://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-review-editing/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"><sup>1</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">https://orcid.org/0000-0003-0074-1098</contrib-id>
<name name-style="western">
<surname>Freedman</surname>
<given-names>Neal D.</given-names>
</name>
<role content-type="http://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-review-editing/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"><sup>1</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<name name-style="western">
<surname>Goldstein</surname>
<given-names>Alisa M.</given-names>
</name>
<role content-type="http://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role content-type="http://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-review-editing/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"><sup>1</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<name name-style="western">
<surname>Mirabello</surname>
<given-names>Lisa</given-names>
</name>
<role content-type="http://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role content-type="http://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-review-editing/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"><sup>1</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<name name-style="western">
<surname>Chanock</surname>
<given-names>Stephen J.</given-names>
</name>
<role content-type="http://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role content-type="http://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-review-editing/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"><sup>1</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<name name-style="western">
<surname>Morton</surname>
<given-names>Lindsay M.</given-names>
</name>
<role content-type="http://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role content-type="http://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-review-editing/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"><sup>1</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">https://orcid.org/0000-0001-6006-0740</contrib-id>
<name name-style="western">
<surname>Savage</surname>
<given-names>Sharon A.</given-names>
</name>
<role content-type="http://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-review-editing/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"><sup>1</sup></xref>
</contrib>
<contrib contrib-type="author" corresp="yes" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">https://orcid.org/0000-0001-8193-1488</contrib-id>
<name name-style="western">
<surname>Stewart</surname>
<given-names>Douglas R.</given-names>
</name>
<role content-type="http://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role content-type="http://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-review-editing/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"><sup>1</sup></xref>
<xref ref-type="corresp" rid="cor001">*</xref>
</contrib>
</contrib-group>
<aff id="aff001"><label>1</label> <addr-line>Division of Cancer Epidemiology and Genetics, National Cancer Institute, Rockville, Maryland, United States of America</addr-line></aff>
<aff id="aff002"><label>2</label> <addr-line>Cancer Genomics Research Laboratory, Division of Cancer Epidemiology and Genetics, National Cancer Institute, Rockville, Maryland, United States of America</addr-line></aff>
<aff id="aff003"><label>3</label> <addr-line>Leidos Biomedical Research, Inc., Frederick National Laboratory for Cancer Research, Frederick, Maryland, United States of America</addr-line></aff>
<aff id="aff004"><label>4</label> <addr-line>Department of Epidemiology and Cancer Control, St. Jude Children’s Research Hospital, Memphis, Tennessee, United States of America</addr-line></aff>
<aff id="aff005"><label>5</label> <addr-line>Institute for Cancer Outcomes and Survivorship, University of Alabama at Birmingham, Birmingham, Alabama, United States of America</addr-line></aff>
<aff id="aff006"><label>6</label> <addr-line>Department of Population Science, American Cancer Society, Atlanta, Georgia, United States of America</addr-line></aff>
<contrib-group>
<contrib contrib-type="editor" xlink:type="simple">
<name name-style="western">
<surname>Galli</surname>
<given-names>Alvaro</given-names>
</name>
<role>Editor</role>
<xref ref-type="aff" rid="edit1"/>
</contrib>
</contrib-group>
<aff id="edit1"><addr-line>CNR, ITALY</addr-line></aff>
<author-notes>
<fn fn-type="conflict" id="coi001">
<p>All authors have declared no competing interests.</p>
</fn>
<corresp id="cor001">* E-mail: <email xlink:type="simple">drstewart@mail.nih.gov</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>25</day>
<month>1</month>
<year>2023</year>
</pub-date>
<pub-date pub-type="collection">
<year>2023</year>
</pub-date>
<volume>18</volume>
<issue>1</issue>
<elocation-id>e0280951</elocation-id>
<history>
<date date-type="received">
<day>2</day>
<month>9</month>
<year>2022</year>
</date>
<date date-type="accepted">
<day>12</day>
<month>1</month>
<year>2023</year>
</date>
</history>
<permissions>
<license xlink:href="https://creativecommons.org/publicdomain/zero/1.0/" xlink:type="simple">
<license-p>This is an open access article, free of all copyright, and may be freely reproduced, distributed, transmitted, modified, built upon, or otherwise used by anyone for any lawful purpose. The work is made available under the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/publicdomain/zero/1.0/" xlink:type="simple">Creative Commons CC0</ext-link> public domain dedication.</license-p>
</license>
</permissions>
<self-uri content-type="pdf" xlink:href="info:doi/10.1371/journal.pone.0280951"/>
<abstract>
<p>The use of publicly available sequencing datasets as controls (hereafter, “public controls”) in studies of rare variant disease associations has great promise but can increase the risk of false-positive discovery. The specific factors that could contribute to an inflated distribution of test statistics have not been systematically examined. Here, we leveraged both public controls (gnomAD v2.1) and several datasets sequenced in our laboratory to systematically investigate factors that could contribute to the false-positive discovery, as measured by λ<sub>Δ95</sub>, a measure to quantify the degree of inflation in statistical significance. Analyses of datasets in this investigation found that 1) the significantly inflated distribution of test statistics decreased substantially when the same variant caller and filtering pipelines were employed, 2) differences in library prep kits and sequencers did not affect the false-positive discovery rate, and 3) joint <italic>vs</italic>. separate variant-calling of cases and controls did not contribute to the inflation of test statistics. Currently available methods do not adequately adjust for the high false-positive discovery. These results, especially if replicated, emphasize the risks of using public controls for rare-variant association tests in which individual-level data and the computational pipeline are not readily accessible, which prevents the use of the same variant-calling and filtering pipelines on both cases and controls. A plausible solution exists with the emergence of cloud-based computing, which can make it possible to bring containerized analytical pipelines to the data (rather than the data to the pipeline) and could avert or minimize these issues. It is suggested that future reports account for this issue and provide this as a limitation in reporting new findings based on studies that cannot practically analyze all data on a single pipeline.</p>
</abstract>
<funding-group>
<award-group id="award001">
<funding-source>
<institution-wrap>
<institution-id institution-id-type="funder-id">http://dx.doi.org/10.13039/100011541</institution-id>
<institution>Division of Cancer Epidemiology and Genetics, National Cancer Institute</institution>
</institution-wrap>
</funding-source>
</award-group>
<award-group id="award002">
<funding-source>
<institution-wrap>
<institution-id institution-id-type="funder-id">http://dx.doi.org/10.13039/100000009</institution-id>
<institution>Foundation for the National Institutes of Health</institution>
</institution-wrap>
</funding-source>
<award-id>CA55727</award-id>
<principal-award-recipient>
<name name-style="western">
<surname>Armstrong</surname>
<given-names>Gregory T.</given-names>
</name>
</principal-award-recipient>
</award-group>
<award-group id="award003">
<funding-source>
<institution-wrap>
<institution-id institution-id-type="funder-id">http://dx.doi.org/10.13039/100008746</institution-id>
<institution>National Cancer Center</institution>
</institution-wrap>
</funding-source>
<award-id>CA21765</award-id>
</award-group>
<funding-statement>This work was supported by the Intramural Research Program of the Division of Cancer Epidemiology and Genetics of the National Cancer Institute, Bethesda, MD. CCSS also is supported by the National Cancer Institute (CA55727, GT Armstrong, principal investigator) and St. Jude Children’s Research Hospital through the National Cancer Institute Cancer Center Support (CORE) grant (CA21765, C. Roberts, principal investigator) and the American Lebanese-Syrian Associated Charities (ALSAC).</funding-statement>
</funding-group>
<counts>
<fig-count count="4"/>
<table-count count="1"/>
<page-count count="13"/>
</counts>
<custom-meta-group>
<custom-meta id="data-availability">
<meta-name>Data Availability</meta-name>
<meta-value>Exome data are available from dbGAP (accession number phs001327.v2.p1 and phs001286.v2.p2).</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="sec001" sec-type="intro">
<title>Introduction</title>
<p>Large-scale, publicly available germline exome and genome sequencing datasets have emerged as invaluable tools for investigating associations between genetic variants and disease. These datasets are frequently used as controls to substantially increase the statistical power for investigation of rare genetic variants that could contribute to specific diseases. Although the method of variant-calling in each resource is generally described (<italic>e</italic>.<italic>g</italic>., Exome Variant Server [<xref ref-type="bibr" rid="pone.0280951.ref001">1</xref>], 1000 Genomes [<xref ref-type="bibr" rid="pone.0280951.ref002">2</xref>], The Exome Aggregation Consortium/The Genome Aggregation Database [gnomAD] [<xref ref-type="bibr" rid="pone.0280951.ref003">3</xref>]), the raw data files and/or pipeline methods typically are not readily accessible. Previous studies have reported that using public controls in rare-variant association analyses can lead to a marked increase in false-positive findings [<xref ref-type="bibr" rid="pone.0280951.ref004">4</xref>, <xref ref-type="bibr" rid="pone.0280951.ref005">5</xref>]. Although methods have been developed to adjust for this inflation (<italic>e</italic>.<italic>g</italic>., TRAPD [<xref ref-type="bibr" rid="pone.0280951.ref006">6</xref>], ProxECAT [<xref ref-type="bibr" rid="pone.0280951.ref007">7</xref>], iECAT [<xref ref-type="bibr" rid="pone.0280951.ref008">8</xref>]), the performance of these methods in larger datasets and the specific factors that contribute to the inflated distribution of test statistics have not been systematically examined.</p>
</sec>
<sec id="sec002" sec-type="results">
<title>Results</title>
<sec id="sec003">
<title>Overview of λ<sub>Δ95</sub>, analytic approach and sample sets</title>
<p><xref ref-type="table" rid="pone.0280951.t001">Table 1</xref> and Figs <xref ref-type="fig" rid="pone.0280951.g001">1</xref>–<xref ref-type="fig" rid="pone.0280951.g004">4</xref> summarize the analyses performed to systematically investigate factors that could contribute to false-positive findings by determining λ<sub>Δ95</sub>, which quantifies the degree of inflation in statistical significance. λ is a metric developed for measuring p-value inflation in genome-wide association studies. Guo <italic>et al</italic>. [<xref ref-type="bibr" rid="pone.0280951.ref006">6</xref>] adapted λ for use in rare-variant association studies to calculate λ<sub>Δ95</sub>, which adjusts for many results with p = 1.00. However, λ<sub>Δ95</sub> does not fully capture the inflated distribution of test statistics if the observed p-values deviate from expected p-values greater than the median. Thus, deviation of the line from the 95% confidence interval (CI, gray area in figures) was also evaluated by visual inspection.</p>
<fig id="pone.0280951.g001" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0280951.g001</object-id>
<label>Fig 1</label>
<caption>
<title>Demonstration of increased false-positive findings with expected-null findings using public controls.</title>
<p>Quantile-quantile plot (synonymous variants only) of non-Finnish European non-TCGA (The Cancer Genome Atlas) gnomAD (serving as a public control) versus an experimental dataset. We observed highly inflated p-values deviating from the 95% confidence interval.</p>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0280951.g001" xlink:type="simple"/>
</fig>
<fig id="pone.0280951.g002" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0280951.g002</object-id>
<label>Fig 2</label>
<caption>
<title>Evaluation of laboratory factors contributing to elevated false-positive findings.</title>
<p>Quantile-quantile plot of two experimental datasets (sub-sampled CCSS data) that used the same capture kit (EZ Exome+UTR PE) and differ in the use of library prep kit (BiooNextFlex vs. KapaHyper Plus) and sequencer (HiSeq 2000/2500 vs. HiSeq 4000). Variants in both cohorts were called using HaplotypeCaller and UnifiedGenotyper and/or FreeBayes.</p>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0280951.g002" xlink:type="simple"/>
</fig>
<fig id="pone.0280951.g003" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0280951.g003</object-id>
<label>Fig 3</label>
<caption>
<title>Evaluation of the effects of joint vs separate variant calling on elevated false-positive findings.</title>
<p>Quantile-quantile plot of sub-sampled CCSS data that were called jointly or separately. Red shows the two cohorts variant-called jointly; blue shows the two cohorts variant-called separately.</p>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0280951.g003" xlink:type="simple"/>
</fig>
<fig id="pone.0280951.g004" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0280951.g004</object-id>
<label>Fig 4</label>
<caption>
<title>Use of different (A) and same (B) variant-calling pipelines.</title>
<p>Quantile-quantile plot of distribution of p-values from synonymous variants in sub-sampled CCSS data (n = 4000) randomly divided (n = 2000 each) and called with (A) different callers (Ensemble vs. HaplotypeCaller) or (B) same caller (HaplotypeCaller) and post-variant filtering. In panel A, we observed inflated p-values deviating from the 95% confidence interval (shading), while in panel B, we observed no deviation from the 95% confidence interval (shading), consistent with minimal or no inflation of p-values.</p>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0280951.g004" xlink:type="simple"/>
</fig>
<table-wrap id="pone.0280951.t001" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0280951.t001</object-id>
<label>Table 1</label> <caption><title>Summary of data analyses.</title></caption>
<alternatives>
<graphic id="pone.0280951.t001g" mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0280951.t001" xlink:type="simple"/>
<table>
<colgroup>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
</colgroup>
<thead>
<tr>
<th align="center"/>
<th align="center">Analysis</th>
<th align="center">Sample Set #1 (n = participants)</th>
<th align="center">Caller for Sample Set #1</th>
<th align="center">Sample Set #2 (n = participants)</th>
<th align="center">Caller for Sample Set #2</th>
<th align="center">Number of Genes Tested</th>
<th align="center">λ<sub>Δ95</sub></th>
</tr>
</thead>
<tbody>
<tr>
<td align="center"><xref ref-type="fig" rid="pone.0280951.g001">Fig 1</xref></td>
<td align="center">Demonstration of increased false-positive findings with expected-null findings using public controls</td>
<td align="center">Dataset #1: NFE non-TCGA gnomAD (n = 51,377)</td>
<td align="center">HaplotypeCaller</td>
<td align="center">Dataset #2: CPSII/PLCO (n = 597) 97.4% samples have &gt; 95%CEU</td>
<td align="center">HaplotypeCaller</td>
<td align="center">17,482</td>
<td align="center">1.09</td>
</tr>
<tr>
<td align="center"><xref ref-type="fig" rid="pone.0280951.g002">Fig 2</xref></td>
<td align="center">Evaluation of laboratory factors contributing to elevated false-positive findings <sup>1</sup></td>
<td align="center">Dataset #2: CPSII/PLCO (n = 597)<sup>2</sup> 97.4% samples have &gt; 95%CEU</td>
<td align="center">Ensemble</td>
<td align="center">Datasets #3 and 4 combined: CCSS (n = 4,000)<sup>3</sup> 94.9% samples have &gt; 95%CEU</td>
<td align="center">Ensemble</td>
<td align="center">10,461</td>
<td align="center">1.00</td>
</tr>
<tr>
<td align="center"><xref ref-type="fig" rid="pone.0280951.g003">Fig 3</xref></td>
<td align="center">Evaluation of the effects of joint vs separate variant calling on elevated false-positive findings</td>
<td align="center">Dataset #2: CPSII/PLCO (n = 597) 97.4% samples have &gt; 95%CEU</td>
<td align="center">Ensemble</td>
<td align="center">Dataset #3: CCSS (n = 2,000) 94.5% samples have &gt; 95%CEU</td>
<td align="center">Ensemble</td>
<td align="center">Joint: 10,244 Separate: 10,224</td>
<td align="center">Joint<sup>4</sup>: 0.91 Separate<sup>5</sup>: 0.94</td>
</tr>
<tr>
<td align="center"><xref ref-type="fig" rid="pone.0280951.g004">Fig 4A</xref></td>
<td align="center">Use of different variant-calling pipelines</td>
<td align="center">Dataset #3: CCSS (n = 2,000) 94.5% samples have &gt; 95%CEU</td>
<td align="center">Ensemble</td>
<td align="center">Dataset #4: CCSS (n = 2,000) 95.3% samples have &gt; 95%CEU</td>
<td align="center">HaplotypeCaller</td>
<td align="center">16,281</td>
<td align="center">1.16</td>
</tr>
<tr>
<td align="center"><xref ref-type="fig" rid="pone.0280951.g004">Fig 4B</xref></td>
<td align="center">Use of same variant-calling and post-variant filtering</td>
<td align="center">Dataset #3: CCSS (n = 2,000) 94.5% samples have &gt; 95%CEU</td>
<td align="center">HaplotypeCaller</td>
<td align="center">Dataset #4: CCSS (n = 2,000) 95.3% samples have &gt; 95%CEU</td>
<td align="center">HaplotypeCaller</td>
<td align="center">16,327</td>
<td align="center">0.99</td>
</tr>
<tr>
<td align="center"><xref ref-type="supplementary-material" rid="pone.0280951.s001">S1 Fig</xref></td>
<td align="center">Evaluation of published methods to correct an elevated false positive rate</td>
<td align="center">Dataset #1: NFE non-TCGA gnomAD (n = 51,377)</td>
<td align="center">HaplotypeCaller</td>
<td align="center">Datasets vary: CCSS (n = 4,300, n = 1,000, n = 400) 94.9% samples have &gt; 95% CEU in 4,300 and 1,000 cases and 95.3% samples have &gt; 95% CEU in 400 cases</td>
<td align="center">Ensemble</td>
<td align="center">For each method (n = 4,300, n = 1,000, n = 400): TRAPD (14,972, 14,714, 14,710); ProxECAT (3,987, 441, 65); iECAT (12,906, 7,234, 4,898)</td>
<td align="center">For each method (n = 4,300, n = 1,000, n = 400): TRAPD (1.59, 1.00, 1.35); ProxECAT (2.79, 2.13, 1.90); iECAT (1.20, 0.45, 0.43)</td>
</tr>
</tbody>
</table>
</alternatives>
</table-wrap>
<p>“Ensemble” caller refers to the use of HaplotypeCaller <italic>and</italic> UnifiedGenotyper <italic>and/or</italic> FreeBayes. CPSII: Cancer Prevention Study II (dataset); CCSS: Childhood Cancer Survivor Study (dataset); gnomAD: The Genome Aggregation Database; NFE: Non-Finnish European; PLCO: Prostate, Lung, Colorectal, Ovarian Cancer (dataset); TCGA: The Cancer Genome Atlas. The “Number of Genes Tested” varies because at least five variants per gene are required for a gene to be considered.</p>
<p>For each analysis, we evaluated the distribution of rare, synonymous variants from two different sample sets (listed as “Sample Set #1” and “Sample Set #2” which vary depending on the study) collapsed by gene from individuals of non-Finnish European (NFE) ancestry. The caller for the two sample sets also varied, depending on the investigation but was either HaplotypeCaller or “Ensemble,” a combination of three different callers (see <xref ref-type="supplementary-material" rid="pone.0280951.s002">S1 File</xref>). The number of genes tested varies due to differences in datasets and filtering. We did not perform burden tests on genes with fewer than 3 observed variant carriers. This means that some genes that were testable on larger datasets or with more permissive filtering may no longer be testable with smaller samples or stricter filtering.</p>
<p>We used the exome sequencing component of gnomAD, excluding cases from The Cancer Genome Atlas (TCGA) (n = 51,377) [<xref ref-type="bibr" rid="pone.0280951.ref003">3</xref>], as a public control dataset. The three exome sequencing datasets from our laboratory were the Prostate, Lung, Colorectal and Ovarian Cancer Screening Trial (PLCO) (n = 374) [<xref ref-type="bibr" rid="pone.0280951.ref009">9</xref>], Cancer Prevention Study II (CPSII) (n = 223) [<xref ref-type="bibr" rid="pone.0280951.ref010">10</xref>], and the Childhood Cancer Survivor Study (CCSS) (n = 5,105) [<xref ref-type="bibr" rid="pone.0280951.ref011">11</xref>], all of which are available through dbGaP.</p>
<p>For each figure, the datasets, callers and number of genes in the analysis are listed at the top, which matches the information for each analysis in <xref ref-type="table" rid="pone.0280951.t001">Table 1</xref>. Each figure is a QQ plot, which compares the expected with the observed p-value (on a log scale) for rare, synonymous variants from two different sample sets. Since the evaluated variants are synonymous, we do not expect any deviation from a uniform distribution of p-values under the null hypothesis of no association between phenotypes and rare variants. Thus we expect the slope of the plot to approximate 1. Deviations from the slope = 1 (as measured by visual inspection or λ<sub>Δ95</sub>) suggest systematic noise or error (<italic>e</italic>.<italic>g</italic>., laboratory processes or factors in variant-calling or annotation) with “inflated” p-values. Factors (e.g., use of identical processes for datasets) that reduce inflation restore the compared distributions to a slope that approximates 1.</p>
</sec>
<sec id="sec004">
<title>Demonstration of increased false-positive findings with expected-null findings using public controls (<xref ref-type="table" rid="pone.0280951.t001">Table 1</xref>, row 2, and <xref ref-type="fig" rid="pone.0280951.g001">Fig 1</xref>)</title>
<p>To illustrate the increase in false positive findings using public controls, we compared two ancestry-matched non-disease cohorts using a rare-variant association (“burden”) test of synonymous variants (only) that would be expected to be null. We analyzed variants from 17,482 genes from the 51,377 individuals in the NFE non-TCGA gnomAD dataset and compared them with variants from 597 cancer-free individuals in the CPSII/PLCO cohort. HaplotypeCaller was used for both sample sets, but different post-variant filtering methods were applied. We observed significantly inflated p-values (λ<sub>Δ95</sub> = 1.09) with a distribution that was highly deviated from the 95% CI in the quantile-quantile (QQ) plot (<xref ref-type="fig" rid="pone.0280951.g001">Fig 1</xref>).</p>
</sec>
<sec id="sec005">
<title>Evaluation of laboratory factors contributing to elevated false-positive findings (<xref ref-type="table" rid="pone.0280951.t001">Table 1</xref>, row 3, and <xref ref-type="fig" rid="pone.0280951.g002">Fig 2</xref>)</title>
<p>We next investigated the possible origins of the inflated p-values by focusing on factors that could differ between public controls and an experimental dataset such as laboratory processes (<italic>e</italic>.<italic>g</italic>., capture kit, library prep kit, sequencing platform). To do this, we compared the distribution of rare synonymous variants from 10,461 genes in a dataset from CPSII/PLCO (n = 597) with a dataset from CCSS (n = 4,000) that shared the same capture kit, calling and post-variant filtering but differed in library prep kit and sequencing platform. Although there was some deviation from the 95% CI (41 genes), <xref ref-type="fig" rid="pone.0280951.g002">Fig 2</xref> shows minimal deviation (λ<sub>Δ95</sub> = 1.00) from the expected null distribution in this comparison.</p>
</sec>
<sec id="sec006">
<title>Evaluation of the effects of joint vs separate variant calling on elevated false-positive findings (<xref ref-type="table" rid="pone.0280951.t001">Table 1</xref>, row 4, and <xref ref-type="fig" rid="pone.0280951.g003">Fig 3</xref>)</title>
<p>We next investigated the possible origins of the inflated p-values by focusing on factors that could differ between public controls and an experimental dataset such as variant-calling differences (<italic>e</italic>.<italic>g</italic>., single <italic>vs</italic>. multiple callers, joint <italic>vs</italic>. separate calling, same <italic>vs</italic>. different callers). To do this, we evaluated the effects of joint <italic>vs</italic>. separate variant-calling on the inflated distribution of test statistics by comparing the distribution of rare synonymous variants from 10,244 genes in a dataset from CPSII/PLCO (n = 597) with a dataset from CCSS (n = 2,000) using the same Ensemble (HaplotypeCaller plus at least one other caller) variant-calling pipeline. <xref ref-type="fig" rid="pone.0280951.g003">Fig 3</xref> shows minimal deviation from the null distribution with variant-calling performed either jointly (λ<sub>Δ95</sub> = 0.91; both sample sets variant-called together) or separately (λ<sub>Δ95</sub> = 0.94; each sample set variant-called separately). Taken together, these results suggest that joint <italic>vs</italic>. separate variant-calling does not contribute to the observed inflation.</p>
</sec>
<sec id="sec007">
<title>Use of different variant-calling pipelines (<xref ref-type="table" rid="pone.0280951.t001">Table 1</xref>, row 5, and <xref ref-type="fig" rid="pone.0280951.g004">Fig 4A</xref>)</title>
<p>We next considered the use of different variant-calling pipelines. We randomly separated an experimental dataset derived from CCSS (n = 4,000, samples sequenced at the same time in our laboratory) into two groups (each n = 2,000). Dataset #3, specified in <xref ref-type="table" rid="pone.0280951.t001">Table 1</xref>, row 5, was called using the Ensemble caller, whereas Dataset #4 was called using just HaplotypeCaller. There was a deviation from the null distribution in the QQ plot (λ<sub>Δ95</sub> = 1.16; <xref ref-type="fig" rid="pone.0280951.g004">Fig 4A</xref>) when these two different variant-calling pipelines were used in these datasets.</p>
</sec>
<sec id="sec008">
<title>Use of same variant-calling and post-variant filtering (<xref ref-type="table" rid="pone.0280951.t001">Table 1</xref>, row 6, and <xref ref-type="fig" rid="pone.0280951.g004">Fig 4B</xref>)</title>
<p>To evaluate the use of the same variant-calling pipeline, we used Dataset #3 and Dataset #4 (<xref ref-type="fig" rid="pone.0280951.g004">Fig 4B</xref>, <xref ref-type="table" rid="pone.0280951.t001">Table 1</xref>, row 6), both called using HaplotypeCaller. There was minimal deviation from the null distribution in the QQ plot (λ<sub>Δ95</sub> = 0.99; <xref ref-type="fig" rid="pone.0280951.g004">Fig 4B</xref>) when the same variant caller with the same post-variant filters was used on both datasets, illustrating the importance of applying the same variant-calling pipeline and post-variant filtering to compared cohorts.</p>
</sec>
<sec id="sec009">
<title>Evaluation of published methods to correct an elevated false positive rate (<xref ref-type="table" rid="pone.0280951.t001">Table 1</xref>, row 7, and <xref ref-type="supplementary-material" rid="pone.0280951.s001">S1 Fig</xref>)</title>
<p>To determine the ability of three published methods (TRAPD [<xref ref-type="bibr" rid="pone.0280951.ref006">6</xref>], ProxECAT [<xref ref-type="bibr" rid="pone.0280951.ref007">7</xref>], and iECAT [<xref ref-type="bibr" rid="pone.0280951.ref008">8</xref>]) to adjust inflated p-values in larger datasets, we analyzed the distribution of rare variants in NFE non-TCGA gnomAD (n = 51,377) with sub-sampled CCSS data of varying sizes (n = 4,300, n = 1,000, n = 400) thus mimicking the methods that were presented in each tool. For the largest dataset (n = 4,300, red lines), we observed highly inflated p-values (λ<sub>Δ95</sub> = 1.59 [TRAPD]; λ<sub>Δ95</sub> = 2.79 [ProxECAT]; λ<sub>Δ95</sub> = 1.20 [iECAT]: <xref ref-type="table" rid="pone.0280951.t001">Table 1</xref> and <xref ref-type="supplementary-material" rid="pone.0280951.s001">S1 Fig</xref>). Since the size of the sub-sampled CCSS data examined was larger than in the previously published studies (range n = 393 to 927), [<xref ref-type="bibr" rid="pone.0280951.ref006">6</xref>–<xref ref-type="bibr" rid="pone.0280951.ref008">8</xref>] we decreased the sub-sampled CCSS data from 4,300 to 1,000 and 400. A reduction of the inflated p-values was observed with decreasing sub-sampled dataset size, despite retaining the same set of gnomAD controls (<xref ref-type="supplementary-material" rid="pone.0280951.s001">S1 Fig</xref>, blue and black lines). This observation suggests that smaller sub-sampled datasets are not powered to detect inflated p-values and that, unfortunately, the currently available methods do not always sufficiently adjust for the increased false-positive findings.</p>
</sec>
</sec>
<sec id="sec010" sec-type="conclusions">
<title>Discussion</title>
<p>Our analyses of a limited number of datasets show that false-positive results occur if rare-variant association tests are conducted using cases and controls that have different variant-calling and post-variant-calling filtering pipelines. Differences in laboratory components (<italic>e</italic>.<italic>g</italic>., capture kit, library prep kit and/or sequencing platform) and joint <italic>vs</italic>. separate variant-calling did not substantially inflate the distribution of test statistics, a finding reported by other groups [<xref ref-type="bibr" rid="pone.0280951.ref012">12</xref>]. Occult population stratification is not a likely explanation for our findings given the very high percentages of European (CEU) ancestry in both case and control cohorts (<xref ref-type="table" rid="pone.0280951.t001">Table 1</xref>). These results, especially if replicated, emphasize the risks of using public controls for association tests in which individual-level data and the computational pipeline are not readily accessible, which prevents the use of the same variant-calling and filtering pipelines.</p>
<p>Possible options to effectively utilize publicly available genomic datasets without introducing substantial biases include: 1) obtaining individual-level data from a publicly available dataset and processing them using the experimental dataset’s variant-calling pipeline through a portal that protects identifying information as per the ethical oversight of the study; 2) accessing sufficiently detailed variant-calling and filtering pipeline documentation on publicly available datasets and applying this to the experimental dataset; or 3) sequencing controls in-house and matching the variant-calling pipeline elements with the experimental dataset. However, each of these proposed solutions has limitations, including: 1) lack of adequate consent and/or data-sharing agreements to provide individual-level data from public resources; 2) inadequate computational resources (<italic>e</italic>.<italic>g</italic>., storage and/or processors) needed to process experimental datasets and publicly available resources using the same bioinformatic pipelines; and 3) absence of available in-house controls and/or insufficient resources to sequence and process the resultant data.</p>
<p>Another option is the development and use of a standard variant-calling pipeline by all investigators. However, this poses significant, practical obstacles including the need for continual adjustments to improve accuracy and performance. Moreover, the rapid dissemination of next-generation sequencing technologies has led to many local solutions, making it difficult to develop an academic standard. Until there is a stable solution to compare a dataset to public controls, investigators should carefully evaluate the use of publicly available data for biases and implement strategies and methods to minimize such biases particularly when using a statistical test (<italic>e</italic>.<italic>g</italic>., Fisher’s exact test). At a minimum, public controls should not be the sole dataset in rare-variant association tests.</p>
<p>In summary, public controls are important tools for rare-variant analyses (<italic>e</italic>.<italic>g</italic>., population filtering and variant frequency) but their use for direct statistical tests (<italic>e</italic>.<italic>g</italic>., rare-variant association tests) without the same variant-calling and post-calling variant filtering pipeline is problematic. Importantly, the currently published methods do not adequately adjust for the likely high false-positive findings. A plausible solution exists with the emergence of cloud-based computing, which can make it possible to bring containerized analytical pipelines to the data (rather than the data to the pipeline) and could avert the issues mentioned above. It is suggested that future reports account for this issue and provide this as a limitation in reporting new findings based on studies that cannot practically analyze all data on a single pipeline.</p>
</sec>
<sec id="sec011" sec-type="materials|methods">
<title>Materials and methods</title>
<p>(See also <xref ref-type="supplementary-material" rid="pone.0280951.s002">S1 File</xref> for additional details on selection of datasets, calling and filtering overview, and rare-variant association (burden) testing and assessment.)</p>
<sec id="sec012">
<title>Datasets</title>
<p>Analyses were performed on datasets from previously published large, exome-sequenced cancer cohorts. A dataset of 4,300 long-term cancer survivors was utilized from the Childhood Cancer Survivor Study (CCSS) [<xref ref-type="bibr" rid="pone.0280951.ref011">11</xref>]. Additionally, an in-house control dataset was composed of the combined control sets from the Cancer Prevention Study II (CPSII) [<xref ref-type="bibr" rid="pone.0280951.ref010">10</xref>], and the Prostate, Lung, Colorectal, Ovarian Cancer (PLCO) [<xref ref-type="bibr" rid="pone.0280951.ref009">9</xref>] datasets. To ensure homogeneous ancestry, the CPSII [<xref ref-type="bibr" rid="pone.0280951.ref010">10</xref>], CCSS [<xref ref-type="bibr" rid="pone.0280951.ref011">11</xref>] and PLCO [<xref ref-type="bibr" rid="pone.0280951.ref009">9</xref>] datasets were restricted to samples that were estimated to be at least 80% European (CEU) ancestry as determined by industry-standard methods detailed elsewhere [<xref ref-type="bibr" rid="pone.0280951.ref013">13</xref>]. For CCSS, we also restricted samples to those that were not whole-genome-amplified.</p>
<p>Our public control set was composed of publicly available data from the Genome Aggregation Database (gnomAD) [<xref ref-type="bibr" rid="pone.0280951.ref003">3</xref>] v2.1, including only non-Finnish European (NFE) individuals after excluding data from individuals from The Cancer Genome Atlas (TCGA) (n = 51,377). QQ plots were used to visually demonstrate p-value inflation and the λ<sub>Δ95</sub> statistic was used for quantitative assessment of this inflation. Details of the λ<sub>Δ95</sub> statistic calculation are in <xref ref-type="supplementary-material" rid="pone.0280951.s002">S1 File</xref>.</p>
</sec>
<sec id="sec013">
<title>Variant calling</title>
<p>For datasets called by HaplotypeCaller, the following additional filters were applied (these are the standard hard filters recommended by GATK): QD≥2, FS≤60, MQ≥40, MQRankSum≥-12.5, ReadPosRankSum≥-8, SOR≤3.</p>
<p>For datasets called by Ensemble, the following additional filters were applied: at the genotype level: 1) variants required a GQ &gt; 20 and the alternate allele depth (AD) to be greater than 1, and 2) variant must be called by HaplotypeCaller and either FreeBayes or UnifiedGenotyper. Among heterozygous genotype calls, the total ratio between alternate AD and total depth (DP) must be greater than 0.3. If there were 3 or fewer heterozygous genotype calls, the depth must be greater than 0.2, the observed carrier frequency must be less than 10%, and there must not be any multiallelic heterozygous genotype calls (no individuals with a genotype containing two different alternate alleles).</p>
</sec>
<sec id="sec014">
<title>Variant filtering and annotation</title>
<p>Variants used in the analyses were required to 1) be classified as synonymous (coding) for at least one gene, 2) not be SnpEff HIGH or MODERATE for any gene, 3) have an allele frequency less than 0.01 in the population databases (all populations in 1000 Genomes, ESP, and all populations other than NFE in ExAC and gnomAD-exome), 4) be within 5bp of the target region and called by HaplotypeCaller and either FreeBayes or UnifiedGenotyper, and 5) not be a duplicate variant (due to indel alignment issues). At the genotype level, variants were required to have a GQ score greater than 20 and the alternate allele depth to be greater than 1. Among heterozygous genotype calls, the total ratio between an alternate AD and DP must be greater than 0.3, or if there are 3 or fewer heterozygous genotype calls the depth must be greater than 0.2. The observed carrier frequency must be less than 10%. There must not be any multiallelic heterozygous genotype calls (<italic>i</italic>.<italic>e</italic>., no individuals with a genotype containing two different alternate alleles).</p>
</sec>
<sec id="sec015">
<title>Analyses performed</title>
<p>Five sets of analyses were performed, corresponding to Figs <xref ref-type="fig" rid="pone.0280951.g001">1</xref>–<xref ref-type="fig" rid="pone.0280951.g004">4</xref> plus <xref ref-type="supplementary-material" rid="pone.0280951.s001">S1 Fig</xref>, as listed in <xref ref-type="table" rid="pone.0280951.t001">Table 1</xref> and corresponding to section headers in the Results section:</p>
</sec>
<sec id="sec016">
<title>Demonstration of increased false-positive findings with expected-null findings using public controls</title>
<p>To demonstrate the inflated p-values present in a presumed-null analysis, we compared the Non-Finnish European (NFE) and non-TCGA subset of gnomAD (n = 51,377) with an in-house control dataset (CPSII/PLCO, n = 597; 97.4% of samples have &gt;95% CEU [European] ancestry) using Fisher’s exact test. The following filters were applied: the variant must 1) be called by HaplotypeCaller, 2) be within 5 base pairs of the CCSS target region, 3) be synonymous and within a coding exon, 4) have an allele frequency less than 0.001 in the population databases (all populations in 1000 Genomes and ESP, and all populations other than NFE in ExAC and gnomAD-exome; 1000 Genomes and ESP were included as filters because they make up only a small proportion of the full gnomAD-exome dataset), 5) exist in both the CCSS dataset and the gnomAD dataset, 6) pass the HaplotypeCaller hard filters recommended by the Broad Institute, 7) not be a duplicate variant (due to indel alignment issues), and 8) not be on a RepeatMasker SimpleRepeat or a 5-base-pair (or longer) homopolymer run. In addition, 90% of all samples in both CCSS and gnomAD must have coverage depth greater than 10.</p>
</sec>
<sec id="sec017">
<title>Evaluation of laboratory factors contributing to elevated false-positive findings</title>
<p>To determine whether laboratory factors contributed to p-value inflation, we tested Dataset #2 (CPSII/PLCO control dataset, n = 597) against Datasets #3 and #4 of the CCSS dataset (n = 4,000), again restricting to synonymous coding variants (which presumably would not vary significantly between the two groups). The CPSII/PLCO dataset used the BiooNextFlex library prep kit and was sequenced on a combination of the Illumina HiSeq 2000 and HiSeq 2500 sequencers. The CCSS dataset used the KapaHyper Plus library prep kit and the HiSeq 4000 sequencer.</p>
</sec>
<sec id="sec018">
<title>Evaluation of the effects of joint vs separate variant calling on elevated false-positive findings</title>
<p>In joint calling, all samples in a dataset are called simultaneously, using information from across all samples to assist in assessing and calling variant loci. Obviously, our datasets and the gnomAD external control dataset were called separately, so we developed a test to determine whether this could be a source of the inflation. A subset of the CCSS dataset (Dataset #3, n = 2,000) and CPSII/PLCO (Dataset #2, n = 597) were called jointly and separately followed by rare-variant association (burden) tests. The same filter set was used as in Analysis set 3 (above). Variants used in this analysis must 1) be classified as synonymous (coding) for at least one gene and not be SnpEff HIGH or MODERATE for any gene, 2) have an allele frequency less than 0.001 in the population databases (all populations in 1000 Genomes, ESP, ExAC, gnomAD exome and gnomAD genome), 3) be within 5bp of the target region, 4) be called by HaplotypeCaller and either FreeBayes or UnifiedGenotyper, and 5) not be a duplicate variant (due to indel alignment issues). At the genotype level, variants were required to have a genotype quality (GQ) score greater than 20 and the alternate allele depth to be greater than 1. Among heterozygous genotype calls, the total ratio between alternate allele depth and total depth must be greater than 0.3, or if there are 3 or fewer heterozygous genotype calls the depth must be greater than 0.2. The observed carrier frequency must be less than 10%. There must not be any multiallelic heterozygous genotype calls (<italic>i</italic>.<italic>e</italic>., no individuals with a genotype containing two different alternate alleles).</p>
</sec>
<sec id="sec019">
<title>Use of different and same variant-calling pipelines</title>
<p>To determine whether differences in variant-calling methodology could introduce p-value inflation, we split the CCSS dataset into two equally sized subsets (Datasets #3 and #4, n = 2,000 each) and ran rare-variant association (burden) tests in which the calling methods differed (<xref ref-type="fig" rid="pone.0280951.g004">Fig 4A</xref>: Ensemble and HaplotypeCaller) and in which the calling methods were the same (<xref ref-type="fig" rid="pone.0280951.g004">Fig 4B</xref>: HaplotypeCaller only). Variants used in these analyses must 1) be classified as synonymous for at least one gene, 2) not be SnpEff HIGH or MODERATE for any gene, 3) have an allele frequency less than 0.001 in the population databases (all populations in 1000 Genomes, ESP, ExAC, gnomAD exome and gnomAD genome), 4) be within 5bp of the target region, and 5) not be a duplicate variant (due to indel alignment issues).</p>
</sec>
<sec id="sec020">
<title>Evaluation of published methods to correct an elevated false positive rate</title>
<p>TRAPD [<xref ref-type="bibr" rid="pone.0280951.ref006">6</xref>], ProxECAT [<xref ref-type="bibr" rid="pone.0280951.ref007">7</xref>], and iECAT [<xref ref-type="bibr" rid="pone.0280951.ref008">8</xref>] were used as per each method’s published reference. For each method, three analyses were performed on case (CCSS) and public control (gnomAD) data: 1) the full 4,300-sample CCSS set (95.1% of samples have &gt;95% CEU [European] ancestry) vs. gnomAD, 2) a random 1,000-sample subset of CCSS vs. gnomAD, and 3) a random 400-sample subset of CCSS vs. gnomAD. No genotype-level filtering was performed because there is no way to implement such filters on the gnomAD dataset since we can only access aggregate frequency-level data. Both cases (CCSS) and public controls (gnomAD) were restricted to European ancestry. For gnomAD, specifically the “non-Finnish European” (NFE) subset without TCGA was used. The following filters were applied to both the CCSS and gnomAD datasets for all analyses: 1) variant must be within 5 base pairs of the CCSS target region, 2) must not be a duplicate read, 3) must not be on RepeatMasker, SimpleRepeat or a 5-base-pair (or longer) homopolymer run, and 4) must be called by HaplotypeCaller.</p>
<p>Specific details for each of the three methods are provided in S1 Text in <xref ref-type="supplementary-material" rid="pone.0280951.s002">S1 File</xref>.</p>
</sec>
</sec>
<sec id="sec021" sec-type="supplementary-material">
<title>Supporting information</title>
<supplementary-material id="pone.0280951.s001" mimetype="image/tiff" position="float" xlink:href="info:doi/10.1371/journal.pone.0280951.s001" xlink:type="simple">
<label>S1 Fig</label>
<caption>
<title>Evaluation of published methods to correct an elevated false positive rate.</title>
<p>Quantile-quantile plot of non-Finnish European non-TCGA (The Cancer Genome Atlas) gnomAD subjects (n = 51,377) versus a sub-sampled CCSS dataset showing greatly inflated p-values, which diminishes with decreasing dataset size. Filtered to include rare variants using methods described in A) TRAPD, B) ProxECAT, C) iECAT.</p>
<p>(TIF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pone.0280951.s002" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pone.0280951.s002" xlink:type="simple">
<label>S1 File</label>
<caption>
<title>Supplemental method.</title>
<p>(PDF)</p>
</caption>
</supplementary-material>
</sec>
</body>
<back>
<ack>
<p>This work utilized the computational resources of the NIH High-Performance Computing Biowulf cluster. We thank the CCSS participants and referring physicians for their valuable contributions.</p>
</ack>
<ref-list>
<title>References</title>
<ref id="pone.0280951.ref001"><label>1</label><mixed-citation publication-type="other" xlink:type="simple">NHLBI GO Exome Sequencing Project (ESP). Exome Variant Server. Seattle, WA [<ext-link ext-link-type="uri" xlink:href="http://evs.gs.washington.edu/EVS/" xlink:type="simple">http://evs.gs.washington.edu/EVS/</ext-link>].</mixed-citation></ref>
<ref id="pone.0280951.ref002"><label>2</label><mixed-citation publication-type="journal" xlink:type="simple"><collab>1000 Genomes Project Consortium</collab>, <name name-style="western"><surname>Auton</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Brooks</surname> <given-names>LD</given-names></name>, <name name-style="western"><surname>Durbin</surname> <given-names>RM</given-names></name>, <name name-style="western"><surname>Garrison</surname> <given-names>EP</given-names></name>, <name name-style="western"><surname>Kang</surname> <given-names>HM</given-names></name>, <etal>et al</etal>. <article-title>A global reference for human genetic variation</article-title>. <source>Nature</source>. <year>2015</year>;<volume>526</volume>(<issue>7571</issue>):<fpage>68</fpage>–<lpage>74</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1038/nature15393" xlink:type="simple">10.1038/nature15393</ext-link></comment> <object-id pub-id-type="pmid">26432245</object-id></mixed-citation></ref>
<ref id="pone.0280951.ref003"><label>3</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Karczewski</surname> <given-names>KJ</given-names></name>, <name name-style="western"><surname>Francioli</surname> <given-names>LC</given-names></name>, <name name-style="western"><surname>Tiao</surname> <given-names>G</given-names></name>, <name name-style="western"><surname>Cummings</surname> <given-names>BB</given-names></name>, <name name-style="western"><surname>Alfoldi</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Wang</surname> <given-names>Q</given-names></name>, <etal>et al</etal>. <article-title>The mutational constraint spectrum quantified from variation in 141,456 humans</article-title>. <source>Nature</source>. <year>2020</year>;<volume>581</volume>(<issue>7809</issue>):<fpage>434</fpage>–<lpage>43</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1038/s41586-020-2308-7" xlink:type="simple">10.1038/s41586-020-2308-7</ext-link></comment> <object-id pub-id-type="pmid">32461654</object-id></mixed-citation></ref>
<ref id="pone.0280951.ref004"><label>4</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Gudmundsson</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Singer-Berk</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Watts</surname> <given-names>NA</given-names></name>, <name name-style="western"><surname>Phu</surname> <given-names>W</given-names></name>, <name name-style="western"><surname>Goodrich</surname> <given-names>JK</given-names></name>, <name name-style="western"><surname>Solomonson</surname> <given-names>M</given-names></name>, <etal>et al</etal>. <article-title>Variant interpretation using population databases: Lessons from gnomAD</article-title>. <source>Hum Mutat</source>. <year>2022</year>;<volume>43</volume>(<issue>8</issue>):<fpage>1012</fpage>–<lpage>30</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1002/humu.24309" xlink:type="simple">10.1002/humu.24309</ext-link></comment> <object-id pub-id-type="pmid">34859531</object-id></mixed-citation></ref>
<ref id="pone.0280951.ref005"><label>5</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Wojcik</surname> <given-names>GL</given-names></name>, <name name-style="western"><surname>Murphy</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Edelson</surname> <given-names>JL</given-names></name>, <name name-style="western"><surname>Gignoux</surname> <given-names>CR</given-names></name>, <name name-style="western"><surname>Ioannidis</surname> <given-names>AG</given-names></name>, <name name-style="western"><surname>Manning</surname> <given-names>A</given-names></name>, <etal>et al</etal>. <article-title>Opportunities and challenges for the use of common controls in sequencing studies</article-title>. <source>Nat Rev Genet</source>. <year>2022</year>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1038/s41576-022-00487-4" xlink:type="simple">10.1038/s41576-022-00487-4</ext-link></comment> <object-id pub-id-type="pmid">35581355</object-id></mixed-citation></ref>
<ref id="pone.0280951.ref006"><label>6</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Guo</surname> <given-names>MH</given-names></name>, <name name-style="western"><surname>Plummer</surname> <given-names>L</given-names></name>, <name name-style="western"><surname>Chan</surname> <given-names>YM</given-names></name>, <name name-style="western"><surname>Hirschhorn</surname> <given-names>JN</given-names></name>, <name name-style="western"><surname>Lippincott</surname> <given-names>MF</given-names></name>. <article-title>Burden Testing of Rare Variants Identified through Exome Sequencing via Publicly Available Control Data</article-title>. <source>Am J Hum Genet</source>. <year>2018</year>;<volume>103</volume>(<issue>4</issue>):<fpage>522</fpage>–<lpage>34</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1016/j.ajhg.2018.08.016" xlink:type="simple">10.1016/j.ajhg.2018.08.016</ext-link></comment> <object-id pub-id-type="pmid">30269813</object-id></mixed-citation></ref>
<ref id="pone.0280951.ref007"><label>7</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Hendricks</surname> <given-names>AE</given-names></name>, <name name-style="western"><surname>Billups</surname> <given-names>SC</given-names></name>, <name name-style="western"><surname>Pike</surname> <given-names>HNC</given-names></name>, <name name-style="western"><surname>Farooqi</surname> <given-names>IS</given-names></name>, <name name-style="western"><surname>Zeggini</surname> <given-names>E</given-names></name>, <name name-style="western"><surname>Santorico</surname> <given-names>SA</given-names></name>, <etal>et al</etal>. <article-title>ProxECAT: Proxy External Controls Association Test. A new case-control gene region association test using allele frequencies from public controls</article-title>. <source>PLoS Genet</source>. <year>2018</year>;<volume>14</volume>(<issue>10</issue>):<fpage>e1007591</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1371/journal.pgen.1007591" xlink:type="simple">10.1371/journal.pgen.1007591</ext-link></comment> <object-id pub-id-type="pmid">30325923</object-id></mixed-citation></ref>
<ref id="pone.0280951.ref008"><label>8</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Lee</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Kim</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Fuchsberger</surname> <given-names>C</given-names></name>. <article-title>Improving power for rare-variant tests by integrating external controls</article-title>. <source>Genet Epidemiol</source>. <year>2017</year>;<volume>41</volume>(<issue>7</issue>):<fpage>610</fpage>–<lpage>9</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1002/gepi.22057" xlink:type="simple">10.1002/gepi.22057</ext-link></comment> <object-id pub-id-type="pmid">28657150</object-id></mixed-citation></ref>
<ref id="pone.0280951.ref009"><label>9</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Prorok</surname> <given-names>PC</given-names></name>, <name name-style="western"><surname>Andriole</surname> <given-names>GL</given-names></name>, <name name-style="western"><surname>Bresalier</surname> <given-names>RS</given-names></name>, <name name-style="western"><surname>Buys</surname> <given-names>SS</given-names></name>, <name name-style="western"><surname>Chia</surname> <given-names>D</given-names></name>, <name name-style="western"><surname>Crawford</surname> <given-names>ED</given-names></name>, <etal>et al</etal>. <article-title>Design of the Prostate, Lung, Colorectal and Ovarian (PLCO) Cancer Screening Trial</article-title>. <source>Control Clin Trials</source>. <year>2000</year>;<volume>21</volume>(<issue>6 Suppl</issue>):<fpage>273S</fpage>–<lpage>309S</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1016/s0197-2456%2800%2900098-2" xlink:type="simple">10.1016/s0197-2456(00)00098-2</ext-link></comment> <object-id pub-id-type="pmid">11189684</object-id></mixed-citation></ref>
<ref id="pone.0280951.ref010"><label>10</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Calle</surname> <given-names>EE</given-names></name>, <name name-style="western"><surname>Rodriguez</surname> <given-names>C</given-names></name>, <name name-style="western"><surname>Jacobs</surname> <given-names>EJ</given-names></name>, <name name-style="western"><surname>Almon</surname> <given-names>ML</given-names></name>, <name name-style="western"><surname>Chao</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>McCullough</surname> <given-names>ML</given-names></name>, <etal>et al</etal>. <article-title>The American Cancer Society Cancer Prevention Study II Nutrition Cohort: rationale, study design, and baseline characteristics</article-title>. <source>Cancer</source>. <year>2002</year>;<volume>94</volume>(<issue>9</issue>):<fpage>2490</fpage>–<lpage>501</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1002/cncr.101970" xlink:type="simple">10.1002/cncr.101970</ext-link></comment> <object-id pub-id-type="pmid">12015775</object-id></mixed-citation></ref>
<ref id="pone.0280951.ref011"><label>11</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Kim</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Gianferante</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Karyadi</surname> <given-names>DM</given-names></name>, <name name-style="western"><surname>Hartley</surname> <given-names>SW</given-names></name>, <name name-style="western"><surname>Frone</surname> <given-names>MN</given-names></name>, <name name-style="western"><surname>Luo</surname> <given-names>W</given-names></name>, <etal>et al</etal>. <article-title>Frequency of Pathogenic Germline Variants in Cancer-Susceptibility Genes in the Childhood Cancer Survivor Study</article-title>. <source>JNCI Cancer Spectr</source>. <year>2021</year>;<volume>5</volume>(<issue>2</issue>):<fpage>pkab007</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1093/jncics/pkab007" xlink:type="simple">10.1093/jncics/pkab007</ext-link></comment> <object-id pub-id-type="pmid">34308104</object-id></mixed-citation></ref>
<ref id="pone.0280951.ref012"><label>12</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Chen</surname> <given-names>Z</given-names></name>, <name name-style="western"><surname>Boehnke</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Fuchsberger</surname> <given-names>C</given-names></name>. <article-title>Combining sequence data from multiple studies: Impact of analysis strategies on rare variant calling and association results</article-title>. <source>Genet Epidemiol</source>. <year>2020</year>;<volume>44</volume>(<issue>1</issue>):<fpage>41</fpage>–<lpage>51</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1002/gepi.22261" xlink:type="simple">10.1002/gepi.22261</ext-link></comment> <object-id pub-id-type="pmid">31520493</object-id></mixed-citation></ref>
<ref id="pone.0280951.ref013"><label>13</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Pritchard</surname> <given-names>JK</given-names></name>, <name name-style="western"><surname>Stephens</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Donnelly</surname> <given-names>P</given-names></name>. <article-title>Inference of population structure using multilocus genotype data</article-title>. <source>Genetics</source>. <year>2000</year>;<volume>155</volume>(<issue>2</issue>):<fpage>945</fpage>–<lpage>59</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1093/genetics/155.2.945" xlink:type="simple">10.1093/genetics/155.2.945</ext-link></comment> <object-id pub-id-type="pmid">10835412</object-id></mixed-citation></ref>
</ref-list>
</back>
</article>