<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1d3 20150301//EN" "http://jats.nlm.nih.gov/publishing/1.1d3/JATS-journalpublishing1.dtd">
<article article-type="research-article" dtd-version="1.1d3" xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="nlm-ta">PLoS Comput Biol</journal-id>
<journal-id journal-id-type="publisher-id">plos</journal-id>
<journal-id journal-id-type="pmc">ploscomp</journal-id>
<journal-title-group>
<journal-title>PLOS Computational Biology</journal-title>
</journal-title-group>
<issn pub-type="ppub">1553-734X</issn>
<issn pub-type="epub">1553-7358</issn>
<publisher>
<publisher-name>Public Library of Science</publisher-name>
<publisher-loc>San Francisco, CA USA</publisher-loc>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">PCOMPBIOL-D-19-00410</article-id>
<article-id pub-id-type="doi">10.1371/journal.pcbi.1007613</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Research Article</subject>
</subj-group>
<subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Heredity</subject><subj-group><subject>Genetic mapping</subject><subj-group><subject>Haplotypes</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Research and analysis methods</subject><subj-group><subject>Simulation and modeling</subject></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Genetic loci</subject><subj-group><subject>Alleles</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Heredity</subject><subj-group><subject>Genetic mapping</subject><subj-group><subject>Variant genotypes</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Molecular biology</subject><subj-group><subject>Molecular biology techniques</subject><subj-group><subject>Sequencing techniques</subject><subj-group><subject>RNA sequencing</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Research and analysis methods</subject><subj-group><subject>Molecular biology techniques</subject><subj-group><subject>Sequencing techniques</subject><subj-group><subject>RNA sequencing</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Medicine and health sciences</subject><subj-group><subject>Diagnostic medicine</subject></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Human genetics</subject></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Computational biology</subject><subj-group><subject>Genome analysis</subject></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Genomics</subject><subj-group><subject>Genome analysis</subject></subj-group></subj-group></subj-group></subj-group></article-categories>
<title-group>
<article-title>SmartPhase: Accurate and fast phasing of heterozygous variant pairs for genetic diagnosis of rare diseases</article-title>
<alt-title alt-title-type="running-head">SmartPhase: Accurate and fast phasing of heterozygous variant pairs</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" xlink:type="simple">
<name name-style="western">
<surname>Hager</surname> <given-names>Paul</given-names></name>
<role content-type="http://credit.casrai.org/">Conceptualization</role>
<role content-type="http://credit.casrai.org/">Formal analysis</role>
<role content-type="http://credit.casrai.org/">Investigation</role>
<role content-type="http://credit.casrai.org/">Methodology</role>
<role content-type="http://credit.casrai.org/">Resources</role>
<role content-type="http://credit.casrai.org/">Software</role>
<role content-type="http://credit.casrai.org/">Validation</role>
<role content-type="http://credit.casrai.org/">Visualization</role>
<role content-type="http://credit.casrai.org/">Writing – original draft</role>
<role content-type="http://credit.casrai.org/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"><sup>1</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">http://orcid.org/0000-0002-9713-6398</contrib-id>
<name name-style="western">
<surname>Mewes</surname> <given-names>Hans-Werner</given-names></name>
<role content-type="http://credit.casrai.org/">Conceptualization</role>
<role content-type="http://credit.casrai.org/">Project administration</role>
<role content-type="http://credit.casrai.org/">Supervision</role>
<role content-type="http://credit.casrai.org/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff002"><sup>2</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<name name-style="western">
<surname>Rohlfs</surname> <given-names>Meino</given-names></name>
<role content-type="http://credit.casrai.org/">Data curation</role>
<role content-type="http://credit.casrai.org/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff003"><sup>3</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<name name-style="western">
<surname>Klein</surname> <given-names>Christoph</given-names></name>
<role content-type="http://credit.casrai.org/">Funding acquisition</role>
<role content-type="http://credit.casrai.org/">Project administration</role>
<role content-type="http://credit.casrai.org/">Supervision</role>
<role content-type="http://credit.casrai.org/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff003"><sup>3</sup></xref>
</contrib>
<contrib contrib-type="author" corresp="yes" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">http://orcid.org/0000-0003-2511-4699</contrib-id>
<name name-style="western">
<surname>Jeske</surname> <given-names>Tim</given-names></name>
<role content-type="http://credit.casrai.org/">Conceptualization</role>
<role content-type="http://credit.casrai.org/">Data curation</role>
<role content-type="http://credit.casrai.org/">Formal analysis</role>
<role content-type="http://credit.casrai.org/">Investigation</role>
<role content-type="http://credit.casrai.org/">Methodology</role>
<role content-type="http://credit.casrai.org/">Project administration</role>
<role content-type="http://credit.casrai.org/">Resources</role>
<role content-type="http://credit.casrai.org/">Software</role>
<role content-type="http://credit.casrai.org/">Supervision</role>
<role content-type="http://credit.casrai.org/">Validation</role>
<role content-type="http://credit.casrai.org/">Visualization</role>
<role content-type="http://credit.casrai.org/">Writing – original draft</role>
<role content-type="http://credit.casrai.org/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff003"><sup>3</sup></xref>
<xref ref-type="corresp" rid="cor001">*</xref>
</contrib>
</contrib-group>
<aff id="aff001">
<label>1</label>
<addr-line>Institute of Bioinformatics and Systems Biology, Helmholtz Zentrum München GmbH, Neuherberg, Germany</addr-line>
</aff>
<aff id="aff002">
<label>2</label>
<addr-line>Technische Universität München, School of Life Sciences, Weihenstephan, Freising, Germany</addr-line>
</aff>
<aff id="aff003">
<label>3</label>
<addr-line>Department of Pediatrics, Dr. von Hauner Children’s Hospital, University Hospital, LMU Munich, München, Germany</addr-line>
</aff>
<contrib-group>
<contrib contrib-type="editor" xlink:type="simple">
<name name-style="western">
<surname>Pertea</surname> <given-names>Mihaela</given-names></name>
<role>Editor</role>
<xref ref-type="aff" rid="edit1"/>
</contrib>
</contrib-group>
<aff id="edit1">
<addr-line>Johns Hopkins University, UNITED STATES</addr-line>
</aff>
<author-notes>
<fn fn-type="conflict" id="coi001">
<p>The authors have declared that no competing interests exist.</p>
</fn>
<corresp id="cor001">* E-mail: <email xlink:type="simple">tim.jeske@helmholtz-muenchen.de</email></corresp>
</author-notes>
<pub-date pub-type="collection">
<month>2</month>
<year>2020</year>
</pub-date>
<pub-date pub-type="epub">
<day>7</day>
<month>2</month>
<year>2020</year>
</pub-date>
<volume>16</volume>
<issue>2</issue>
<elocation-id>e1007613</elocation-id>
<history>
<date date-type="received">
<day>4</day>
<month>4</month>
<year>2019</year>
</date>
<date date-type="accepted">
<day>17</day>
<month>12</month>
<year>2019</year>
</date>
</history>
<permissions>
<copyright-year>2020</copyright-year>
<copyright-holder>Hager et al</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">
<license-p>This is an open access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">Creative Commons Attribution License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.</license-p>
</license>
</permissions>
<self-uri content-type="pdf" xlink:href="info:doi/10.1371/journal.pcbi.1007613"/>
<abstract>
<p>There is an increasing need to use genome and transcriptome sequencing to genetically diagnose patients suffering from suspected monogenic rare diseases. The proper detection of compound heterozygous variant combinations as disease-causing candidates is a challenge in diagnostic workflows as haplotype information is lost by currently used next-generation sequencing technologies. Consequently, computational tools are required to phase, or resolve the haplotype of, the high number of heterozygous variants in the exome or genome of each patient. Here we present SmartPhase, a phasing tool designed to efficiently reduce the set of potential compound heterozygous variant pairs in genetic diagnosis pipelines. The phasing algorithm of SmartPhase creates haplotypes using both parental genotype information and reads generated by DNA or RNA sequencing and is thus well suited to resolve the phase of rare variants. To inform the user about the reliability of a phasing prediction, it computes a confidence score which is essential to select error-free predictions. It incorporates existing haplotype information and applies logical rules to determine variants that can be excluded as causing a recessive, monogenic disease. SmartPhase can phase either all possible variant pairs in predefined genetic loci or preselected variant pairs of interest, thus keeping the focus on clinically relevant results. We compared SmartPhase to WhatsHap, one of the leading comparable phasing tools, using simulated data and a real clinical cohort of 921 patients. On both data sets, SmartPhase generated error-free predictions using our derived confidence score threshold. It outperformed WhatsHap with regard to the percentage of resolved pairs when parental genotype information is available. On the cohort data, SmartPhase enabled on average the exclusion of approximately 22% of the input variant pairs in each singleton patient and 44% in each trio patient. 
SmartPhase is implemented as an open-source Java tool and freely available at <ext-link ext-link-type="uri" xlink:href="http://ibis.helmholtz-muenchen.de/smartphase/" xlink:type="simple">http://ibis.helmholtz-muenchen.de/smartphase/</ext-link>.</p>
</abstract>
<funding-group>
<funding-statement>The work was supported by funding of The Leona M. and Harry B. Helmsley Charitable Trust, the Care-for-Rare Foundation, and BMBF (PID-NET, 01GM1517A). The funders had no role in study design, data collection and analysis, decision to publish, or preparation of the manuscript.</funding-statement>
</funding-group>
<counts>
<fig-count count="3"/>
<table-count count="1"/>
<page-count count="12"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>PLOS Publication Stage</meta-name>
<meta-value>vor-update-to-uncorrected-proof</meta-value>
</custom-meta>
<custom-meta>
<meta-name>Publication Update</meta-name>
<meta-value>2020-02-20</meta-value>
</custom-meta>
<custom-meta id="data-availability">
<meta-name>Data Availability</meta-name>
<meta-value>Data cannot be shared publicly because of European and national data protection regulations that require that genomic data must not be made public to protect the privacy of patients. Data are available from the Dr. von Hauner Children’s Hospital, Medical Center of the LMU Munich for researchers who meet the criteria for access to confidential patient data. Please direct your requests to the Dr. von Hauner Children’s Hospital (<email xlink:type="simple">sekretariat.kinderklinik@med.uni-muenchen.de</email>) or the Institutional Review Board of the LMU (<email xlink:type="simple">ethikkommission@med.uni-muenchen.de</email>).</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<disp-quote>
<p>This is a <italic>PLOS Computational Biology</italic> Software paper.</p>
</disp-quote>
<sec id="sec001" sec-type="intro">
<title>Introduction</title>
<p>Genetic defects are the source of a wide and diverse range of monogenic or Mendelian diseases that are individually rare but collectively common. So far, more than 5,000 different disorders and traits are known that are caused by mutations in only one gene [<xref ref-type="bibr" rid="pcbi.1007613.ref001">1</xref>]. Genome as well as transcriptome sequencing is increasingly used to genetically diagnose patients suffering from a suspected monogenic rare disease [<xref ref-type="bibr" rid="pcbi.1007613.ref002">2</xref>–<xref ref-type="bibr" rid="pcbi.1007613.ref004">4</xref>]. However, detecting disease-causing variants among thousands of benign variants is a great challenge. Widely-used strategies and guidelines for variant prioritization are based on the predicted or known deleteriousness of a variant, its frequency in large-scale sequencing studies and its segregation with the disease phenotype [<xref ref-type="bibr" rid="pcbi.1007613.ref005">5</xref>, <xref ref-type="bibr" rid="pcbi.1007613.ref006">6</xref>]. Assuming autosomal recessive monogenic inheritance, the disease-causing variants are either homozygous or compound heterozygous with two heterozygous mutations together affecting both parental alleles of a gene locus [<xref ref-type="bibr" rid="pcbi.1007613.ref007">7</xref>]. Consequently, clinical workflows aim to preferentially detect rare variants that are predicted to be harmful and are homozygous or compound heterozygous in the patient. The accurate determination of whether two heterozygous variants are located on the same or different parental alleles is a challenge that is faced by all diagnostic pipelines in the context of recessive monogenic diseases.</p>
<p>Haplotypes can either be resolved experimentally during sequencing or inferred computationally afterwards [<xref ref-type="bibr" rid="pcbi.1007613.ref008">8</xref>]. Several technologies for haplotype-resolved genome sequencing have been developed but are seldom used in a clinical setting because of their prohibitive cost and complexity. Computational tools for phasing use sequencing data of family members, reads spanning multiple variants or reference haplotype panels. Sequencing data of parents or other family members is most informative for phasing but might not always be available and cannot be used for variants that are heterozygous in both parents and the child. Using reads spanning multiple variants requires no additional data, but the length of the underlying reads limits the number of variants that can be phased. Panel-based phasing methods are useful for common variants but fail for rare variants which are the focus when diagnosing rare diseases. The combination of different phasing strategies is promising as it can compensate for the disadvantages of the individual approaches.</p>
<p>Existing phasing tools offer limited utility for clinical purposes because they are designed to phase complete chromosomes instead of genetic loci of interest or incorporate only one phasing strategy. phASER improves the phasing range of read-based phasing by incorporating RNA sequencing reads in addition to DNA sequencing reads, but it does not perform pedigree-based phasing [<xref ref-type="bibr" rid="pcbi.1007613.ref009">9</xref>]. WhatsHap combines read-based phasing with pedigree-based phasing but offers no options to restrict phasing to pre-selected variants or genomic regions [<xref ref-type="bibr" rid="pcbi.1007613.ref010">10</xref>]. The user would either have to accept unnecessarily long runtimes for phasing complete chromosomes or trim the sequencing data to the regions of interest before each execution which would require additional time and storage resources. Neither option is feasible in a clinical setting especially when dealing with large cohorts of thousands of patients with few regions of interest. Furthermore, none of these phasing tools are able to label pairs of heterozygous variants as clinically irrelevant by using the fact that the genotypes of healthy parents contradict the potential pathogenicity of the pair.</p>
<p>To overcome these limitations, we developed SmartPhase, a ready-to-use phasing tool tailored for clinical workflows to improve the analysis of potential compound heterozygous variant pairs in terms of simplicity, speed and accuracy. SmartPhase is able to flexibly use available trio sequencing information and read information of DNA as well as RNA sequencing data. Additionally, it informs about the confidence of its predictions and implements rules to logically exclude variant constellations that cannot be disease-causing.</p>
</sec>
<sec id="sec002" sec-type="materials|methods">
<title>Design and implementation</title>
<p>To fully take advantage of the breadth of phasing informative data generated in clinical research, SmartPhase is able to combine trio phasing, read-based phasing, additional logical rules and GATK physical phasing to resolve as many variant combinations as non-pathogenic as possible. Furthermore, SmartPhase focuses on diagnostically relevant genomic loci in its input while providing a comprehensive bitflag and confidence score system to intuitively represent its results in its output.</p>
<sec id="sec003">
<title>Trio phasing</title>
<p>If parental genotypes are provided, all patient heterozygous variants are examined for the possibility of using the parental variant calls to allocate a variant to either the maternal or paternal chromosome. If the pedigree information allows the phase to be determined, a confidence score of 1.0 is given.</p>
<p>Generally, it is not possible to assign <italic>de novo</italic> mutations to the correct allele by trio information. However, if the inherited allele is present in both parents, with one being heterozygous and the other being homozygous for said allele, it is probabilistically assigned to the homozygous parent with a confidence score of 0.66.</p>
</sec>
<sec id="sec004">
<title>Read-based phasing</title>
<p>If DNA or RNA sequencing reads are provided, SmartPhase uses multi-variant spanning reads to aid in the resolution of local haplotypes. If a read spans two variant positions and contains both variants, this constitutes evidence that both variants lie on the same allele in <italic>cis</italic> configuration. If two reads span both variant positions, each containing only one of the variants, this is evidence that the variants lie on opposing chromosomes in <italic>trans</italic> configuration. To ensure the creation of accurate haplotypes, SmartPhase ignores reads that are not mapped, not part of a proper pair in case of paired-end reads, marked as duplicate reads, or not part of a primary alignment. Further, the user can choose to ignore reads whose mapping quality is lower than a defined threshold.</p>
<p>In order to inform about the quality of the inferred haplotype for two variants <italic>v</italic><sub>1</sub> and <italic>v</italic><sub>2</sub>, a confidence score, Confidence(<italic>v</italic><sub>1</sub>, <italic>v</italic><sub>2</sub>), is computed by the formula
<disp-formula id="pcbi.1007613.e001"><alternatives><graphic id="pcbi.1007613.e001g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1007613.e001" xlink:type="simple"/><mml:math display="block" id="M1"><mml:mrow><mml:mfrac><mml:mrow><mml:mrow><mml:mo>|</mml:mo> <mml:mrow><mml:mover><mml:mover><mml:mrow><mml:mstyle displaystyle="true"><mml:munderover><mml:mo>∑</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mi>n</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>n</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:munderover><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mn>1</mml:mn><mml:mo>−</mml:mo><mml:munder><mml:munder><mml:mrow><mml:mfrac><mml:mrow><mml:mover><mml:mover><mml:mrow><mml:mfrac><mml:mrow><mml:mstyle displaystyle="true"><mml:msubsup><mml:mo>∑</mml:mo><mml:mrow><mml:mi>k</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mi>l</mml:mi><mml:mrow><mml:msub><mml:mi>v</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow></mml:msub></mml:mrow></mml:msubsup><mml:mn>1</mml:mn></mml:mstyle><mml:msup><mml:mn>0</mml:mn><mml:mrow><mml:mo>−</mml:mo><mml:msub><mml:mi>q</mml:mi><mml:mi>k</mml:mi></mml:msub><mml:mo>/</mml:mo><mml:mn>10</mml:mn></mml:mrow></mml:msup></mml:mrow><mml:mrow><mml:msub><mml:mi>l</mml:mi><mml:mrow><mml:msub><mml:mi>v</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:mrow><mml:mo stretchy="true">︷</mml:mo></mml:mover><mml:mrow><mml:mtext>Corrected</mml:mtext><mml:mspace width="2pt"/><mml:msub><mml:mi>v</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow></mml:mover><mml:mo>+</mml:mo><mml:mover><mml:mover><mml:mrow><mml:mfrac><mml:mrow><mml:mstyle 
displaystyle="true"><mml:msubsup><mml:mo>∑</mml:mo><mml:mrow><mml:mi>k</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mi>l</mml:mi><mml:mrow><mml:msub><mml:mi>v</mml:mi><mml:mn>2</mml:mn></mml:msub></mml:mrow></mml:msub></mml:mrow></mml:msubsup><mml:mn>1</mml:mn></mml:mstyle><mml:msup><mml:mn>0</mml:mn><mml:mrow><mml:mo>−</mml:mo><mml:msub><mml:mi>q</mml:mi><mml:mi>k</mml:mi></mml:msub><mml:mo>/</mml:mo><mml:mn>10</mml:mn></mml:mrow></mml:msup></mml:mrow><mml:mrow><mml:msub><mml:mi>l</mml:mi><mml:mrow><mml:msub><mml:mi>v</mml:mi><mml:mn>2</mml:mn></mml:msub></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:mrow><mml:mo stretchy="true">︷</mml:mo></mml:mover><mml:mrow><mml:mtext>Corrected</mml:mtext><mml:mspace width="2pt"/><mml:msub><mml:mi>v</mml:mi><mml:mn>2</mml:mn></mml:msub></mml:mrow></mml:mover></mml:mrow><mml:mn>2</mml:mn></mml:mfrac></mml:mrow><mml:mo stretchy="true">︸</mml:mo></mml:munder><mml:mrow><mml:mtext>Average</mml:mtext><mml:mspace width="2pt"/><mml:mtext>Inverse</mml:mtext><mml:mspace width="2pt"/><mml:mtext>Phred</mml:mtext></mml:mrow></mml:munder><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mstyle></mml:mrow><mml:mo stretchy="true">︷</mml:mo></mml:mover><mml:mrow><mml:mtext>Trans</mml:mtext><mml:mspace width="2pt"/><mml:mtext>Subscore</mml:mtext></mml:mrow></mml:mover><mml:mo>−</mml:mo><mml:mover><mml:mover><mml:mrow><mml:mtext>min</mml:mtext><mml:mo stretchy="false">(</mml:mo><mml:mn>2</mml:mn><mml:mstyle displaystyle="true"><mml:munderover><mml:mo>∑</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mi>n</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:munderover><mml:mo stretchy="false">(</mml:mo></mml:mstyle><mml:mn>1</mml:mn><mml:mo>−</mml:mo><mml:munder><mml:munder><mml:mrow><mml:mfrac><mml:mrow><mml:mover><mml:mover><mml:mrow><mml:mfrac><mml:mrow><mml:mstyle 
displaystyle="true"><mml:msubsup><mml:mo>∑</mml:mo><mml:mrow><mml:mi>k</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mi>l</mml:mi><mml:mrow><mml:msub><mml:mi>v</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow></mml:msub></mml:mrow></mml:msubsup><mml:mn>1</mml:mn></mml:mstyle><mml:msup><mml:mn>0</mml:mn><mml:mrow><mml:mo>−</mml:mo><mml:msub><mml:mi>q</mml:mi><mml:mi>k</mml:mi></mml:msub><mml:mo>/</mml:mo><mml:mn>10</mml:mn></mml:mrow></mml:msup></mml:mrow><mml:mrow><mml:msub><mml:mi>l</mml:mi><mml:mrow><mml:msub><mml:mi>v</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:mrow><mml:mo stretchy="true">︷</mml:mo></mml:mover><mml:mrow><mml:mtext>Corrected</mml:mtext><mml:mspace width="2pt"/><mml:msub><mml:mi>v</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow></mml:mover><mml:mo>+</mml:mo><mml:mover><mml:mover><mml:mrow><mml:mfrac><mml:mrow><mml:mstyle displaystyle="true"><mml:msubsup><mml:mo>∑</mml:mo><mml:mrow><mml:mi>k</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mi>l</mml:mi><mml:mrow><mml:msub><mml:mi>v</mml:mi><mml:mn>2</mml:mn></mml:msub></mml:mrow></mml:msub></mml:mrow></mml:msubsup><mml:mn>1</mml:mn></mml:mstyle><mml:msup><mml:mn>0</mml:mn><mml:mrow><mml:mo>−</mml:mo><mml:msub><mml:mi>q</mml:mi><mml:mi>k</mml:mi></mml:msub><mml:mo>/</mml:mo><mml:mn>10</mml:mn></mml:mrow></mml:msup></mml:mrow><mml:mrow><mml:msub><mml:mi>l</mml:mi><mml:mrow><mml:msub><mml:mi>v</mml:mi><mml:mn>2</mml:mn></mml:msub></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:mrow><mml:mo stretchy="true">︷</mml:mo></mml:mover><mml:mrow><mml:mtext>Corrected</mml:mtext><mml:mspace width="2pt"/><mml:msub><mml:mi>v</mml:mi><mml:mn>2</mml:mn></mml:msub></mml:mrow></mml:mover></mml:mrow><mml:mn>2</mml:mn></mml:mfrac></mml:mrow><mml:mo stretchy="true">︸</mml:mo></mml:munder><mml:mrow><mml:mtext>Average</mml:mtext><mml:mspace width="2pt"/><mml:mtext>Inverse</mml:mtext><mml:mspace 
width="2pt"/><mml:mtext>Phred</mml:mtext></mml:mrow></mml:munder><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:mi>n</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo stretchy="true">︷</mml:mo></mml:mover><mml:mrow><mml:mtext>Cis</mml:mtext><mml:mspace width="2pt"/><mml:mtext>Subscore</mml:mtext></mml:mrow></mml:mover></mml:mrow> <mml:mo>|</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>+</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:mfrac></mml:mrow></mml:math></alternatives> <label>(1)</label></disp-formula>
where <italic>n</italic> is the number of reads overlapping two variant positions, <italic>n</italic><sub><italic>trans</italic></sub> is the number of reads supporting a <italic>trans</italic> configuration where each variant is present in only half of the reads, <italic>n</italic><sub><italic>cis</italic></sub> is the number of reads containing both variants, <italic>q</italic><sub><italic>k</italic></sub> is the Phred quality score of a read at a particular position <italic>k</italic>, and <italic>l</italic> represents the length of the variant allele being examined in this read (either <italic>v</italic><sub>1</sub> or <italic>v</italic><sub>2</sub>). The confidence score was designed to summarize the strength of the evidence behind a read-based phasing call. For a thorough explanation of how the confidence score is calculated, see Section 1.1 of <xref ref-type="supplementary-material" rid="pcbi.1007613.s001">S1 Appendix</xref>. In order to differentiate high quality phasing calls from low quality phasing calls, we derived 0.34 as a threshold for the confidence score as explained in Section 1.2 of <xref ref-type="supplementary-material" rid="pcbi.1007613.s001">S1 Appendix</xref>.</p>
<p>Variants are directly phased with their immediate neighbor using the <italic>cis</italic> and <italic>trans</italic> subscores, calculated by summing the inverse Phred score corrected evidence counts. This basic strategy permits the creation of seed haplotypes that can be locally extended to neighboring variants if overlapping reads exist. SmartPhase can elongate these haplotypes by applying this strategy within paired-end reads as the two paired-end reads must come from the same haplotype, regardless of physical distance or disjointedness. SmartPhase also leverages RNA sequencing reads that connect distant variants due to the read spanning exon-exon boundaries. To calculate the confidence of non-directly phased variants, the confidence scores of all directly phased variants on the shortest path are multiplied together. This helps to represent the growing uncertainty of phase calls as haplotype blocks increase in size and the distance between variants increases.</p>
</sec>
<sec id="sec005">
<title>Phasing intervals</title>
<p>If both reads and parental variants are provided, the local haplotype blocks created by read-based phasing are combined using variants that were pedigree phased. Any contradictions between pedigree phasing and read-based phasing are resolved according to their confidence scores. All variant pairs not phased by direct evidence again have their confidence scores calculated by taking the product of the confidence scores of all directly phased linking variants on the shortest possible path between the variants in the pair.</p>
</sec>
<sec id="sec006">
<title>Innocuous labeling</title>
<p>If parental genotype information is given, certain variant pair constellations can be designated as <italic>innocuous</italic> based on the assumption that the parents of the patient are healthy. <italic>Innocuous</italic> variants are those variants that are deemed to be clinically irrelevant as all variant combinations they partake in have been deduced to be non-disease-causing in Section 1.3 of <xref ref-type="supplementary-material" rid="pcbi.1007613.s001">S1 Appendix</xref>. Variant pairs are labeled as <italic>innocuous</italic> if one of the variants is homozygous in a parent or if mother, father, and child all possess the same heterozygous genotype for one of the variants in the pair.</p>
</sec>
<sec id="sec007">
<title>GATK physical phasing</title>
<p>If a variant is not visible in the alignment of the reads as given by the provided mapping file, this variant was most likely called as a result of read realignment performed by the variant calling program used. As a consequence, these variants are designated as not found by SmartPhase. As the HaplotypeCaller (HC) tool of the Genome Analysis Toolkit (GATK) [<xref ref-type="bibr" rid="pcbi.1007613.ref011">11</xref>] is currently one of the most widely-used variant calling algorithms, we implemented the ability to incorporate phasing information returned by HC when no evidence for the variant was found within the reads and the variant could not be phased by trio phasing. If HC calls variants through read rearrangement, these variants are usually physically phased at the same time and the local haplotype information used is provided in the resulting variant file. The phase of otherwise missing variants is adopted from the variant files and given a confidence score of 1.0.</p>
</sec>
<sec id="sec008">
<title>Input &amp; output</title>
<p>SmartPhase resolves haplotypes in genomic intervals of interest. Genomic intervals can either be directly defined by the user, or are generated by creating regions enveloping potential compound heterozygous variant pairs of interest. SmartPhase accepts up to two variant specifying files encompassing all variants and those that have been filtered to be deemed clinically relevant. The all variants file is used to create haplotype blocks and usually corresponds to the result of variant calling. The filtered variants file is optional, but can be used to narrow the scope of the output as only those variants specified in this file are printed in the final result. These variants generally constitute the set of variants that were filtered for clinical relevance according to allele frequency, predicted functional impact and other criteria. As many mapping files containing DNA or RNA sequencing reads as desired can be provided to be used during read-based phasing.</p>
<p>The phases and confidence scores of all heterozygous variant combinations within the given or created intervals are the results of SmartPhase. To fully capture the complexity of phased variant pairs, <italic>innocuous</italic> pairs, and variants that were not found in the mapping data, we developed a bitwise flag system to efficiently store all necessary information in a single number. The classification of variant pairs according to the defined criteria is visualized in <xref ref-type="fig" rid="pcbi.1007613.g001">Fig 1</xref>. Table A of <xref ref-type="supplementary-material" rid="pcbi.1007613.s001">S1 Appendix</xref> shows the possible combinations of bits and the corresponding final flag.</p>
<fig id="pcbi.1007613.g001" position="float">
<object-id pub-id-type="doi">10.1371/journal.pcbi.1007613.g001</object-id>
<label>Fig 1</label>
<caption>
<title>Visualization of the bit flag system.</title>
<p>If a variant pair could be phased, it is either labeled as <italic>cis</italic> or <italic>trans</italic>. Additionally, it can be labeled as <italic>innocuous</italic>. If a variant pair could not be phased, there was either too little evidence for calling <italic>cis</italic> or <italic>trans</italic> or at least one of the two variant alleles could not be found in the mapped reads.</p>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1007613.g001" xlink:type="simple"/>
</fig>
</sec>
</sec>
<sec id="sec009" sec-type="results">
<title>Results</title>
<p>We validated SmartPhase on simulated as well as real clinical whole-exome sequencing (WES) data to show its accuracy and compare its performance to WhatsHap. We benchmarked both tools using the runtime, the number of <italic>innocuous</italic> and phased pairs, and the proportion of incorrect predictions. As variant pairs phased or labeled as <italic>innocuous</italic> are equally informative for diagnostic workflows, we summarize these by the term “<italic>cleared</italic> pairs”. We refer to them as <italic>confidently cleared</italic> pairs after removing low quality calls with a confidence score below 0.34 as derived in Section 1.2 of <xref ref-type="supplementary-material" rid="pcbi.1007613.s001">S1 Appendix</xref>. In the context of clinical diagnosis, we are especially interested in variant pairs that can be excluded as being non-disease-causing. Variant pairs can be designated as being clinically non-relevant by <italic>innocuous</italic> labeling or by being on the same allele through confident <italic>cis</italic> calls which we sum up as <italic>non-pathogenic</italic> pairs.</p>
<sec id="sec010">
<title>Comparison of SmartPhase to WhatsHap on simulated data</title>
<p>As described in Section 2 of <xref ref-type="supplementary-material" rid="pcbi.1007613.s001">S1 Appendix</xref>, we simulated WES data of the widely used CEU and YRI trios and phased their heterozygous variants in genes on chromosomes 1 and 19 using SmartPhase and WhatsHap. We generated a set of 26,638 potential heterozygous variant pairs distributed over 2,922 genes with 4.21 heterozygous variants per gene on average (see Table B of <xref ref-type="supplementary-material" rid="pcbi.1007613.s001">S1 Appendix</xref>). <xref ref-type="table" rid="pcbi.1007613.t001">Table 1</xref> shows an overview of the main results of the benchmark (complete data in <xref ref-type="supplementary-material" rid="pcbi.1007613.s002">S1 Table</xref>).</p>
<table-wrap id="pcbi.1007613.t001" position="float">
<object-id pub-id-type="doi">10.1371/journal.pcbi.1007613.t001</object-id>
<label>Table 1</label>
<caption>
<title>Benchmark results of SmartPhase and WhatsHap.</title>
</caption>
<alternatives>
<graphic id="pcbi.1007613.t001g" mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1007613.t001" xlink:type="simple"/>
<table border="0" frame="box" rules="all">
<colgroup>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
</colgroup>
<thead>
<tr>
<th align="left" colspan="4">Scenario</th>
<th align="left" colspan="6">SmartPhase</th>
<th align="left" colspan="4">WhatsHap</th>
</tr>
<tr>
<th align="left">Trio</th>
<th align="left">Chr</th>
<th align="left">Pairs</th>
<th align="left">Mode</th>
<th align="right"><italic>Confidently cleared</italic> in %</th>
<th align="right"><italic>Innocuous</italic></th>
<th align="right">Phased</th>
<th align="right">Err</th>
<th align="right">LQ</th>
<th align="right">Time in s</th>
<th align="right">Phased in %</th>
<th align="right">Phased</th>
<th align="right">Err</th>
<th align="right">Time in s</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left">CEU</td>
<td align="left">1</td>
<td align="left">6,783</td>
<td align="left"><italic>read only</italic></td>
<td align="char" char=".">12.1</td>
<td align="right">0</td>
<td align="right">953</td>
<td align="right">14</td>
<td align="right">130</td>
<td align="char" char=".">49.3</td>
<td align="char" char=".">14.9</td>
<td align="right">1,010</td>
<td align="right">5</td>
<td align="char" char=".">274.3</td>
</tr>
<tr>
<td align="left">CEU</td>
<td align="left">19</td>
<td align="left">4,531</td>
<td align="left"><italic>read only</italic></td>
<td align="char" char=".">17.2</td>
<td align="right">0</td>
<td align="right">891</td>
<td align="right">4</td>
<td align="right">110</td>
<td align="char" char=".">31.0</td>
<td align="char" char=".">20.8</td>
<td align="right">940</td>
<td align="right">2</td>
<td align="char" char=".">157.6</td>
</tr>
<tr>
<td align="left">YRI</td>
<td align="left">1</td>
<td align="left">9,186</td>
<td align="left"><italic>read only</italic></td>
<td align="char" char=".">14.3</td>
<td align="right">0</td>
<td align="right">1,462</td>
<td align="right">4</td>
<td align="right">146</td>
<td align="char" char=".">62.4</td>
<td align="char" char=".">17.4</td>
<td align="right">1,595</td>
<td align="right">7</td>
<td align="char" char=".">288.0</td>
</tr>
<tr>
<td align="left">YRI</td>
<td align="left">19</td>
<td align="left">6,138</td>
<td align="left"><italic>read only</italic></td>
<td align="char" char=".">17.2</td>
<td align="right">0</td>
<td align="right">1,198</td>
<td align="right">3</td>
<td align="right">141</td>
<td align="char" char=".">37.2</td>
<td align="char" char=".">20.3</td>
<td align="right">1,248</td>
<td align="right">3</td>
<td align="char" char=".">169.6</td>
</tr>
<tr>
<td align="left">CEU</td>
<td align="left">1</td>
<td align="left">6,783</td>
<td align="left"><italic>read &amp; trio</italic></td>
<td align="char" char=".">100.0</td>
<td align="right">4,700</td>
<td align="right">2,083</td>
<td align="right">0</td>
<td align="right">0</td>
<td align="char" char=".">51.3</td>
<td align="char" char=".">79.6</td>
<td align="right">5,399</td>
<td align="right">74</td>
<td align="char" char=".">270.2</td>
</tr>
<tr>
<td align="left">CEU</td>
<td align="left">19</td>
<td align="left">4,531</td>
<td align="left"><italic>read &amp; trio</italic></td>
<td align="char" char=".">100.0</td>
<td align="right">3,087</td>
<td align="right">1,444</td>
<td align="right">0</td>
<td align="right">0</td>
<td align="char" char=".">31.1</td>
<td align="char" char=".">85.2</td>
<td align="right">3,860</td>
<td align="right">66</td>
<td align="char" char=".">159.7</td>
</tr>
<tr>
<td align="left">YRI</td>
<td align="left">1</td>
<td align="left">9,186</td>
<td align="left"><italic>read &amp; trio</italic></td>
<td align="char" char=".">100.0</td>
<td align="right">5,618</td>
<td align="right">3,568</td>
<td align="right">0</td>
<td align="right">0</td>
<td align="char" char=".">63.0</td>
<td align="char" char=".">88.0</td>
<td align="right">8,085</td>
<td align="right">153</td>
<td align="char" char=".">282.7</td>
</tr>
<tr>
<td align="left">YRI</td>
<td align="left">19</td>
<td align="left">6,138</td>
<td align="left"><italic>read &amp; trio</italic></td>
<td align="char" char=".">100.0</td>
<td align="right">3,739</td>
<td align="right">2,399</td>
<td align="right">0</td>
<td align="right">0</td>
<td align="char" char=".">38.3</td>
<td align="char" char=".">89.1</td>
<td align="right">5,470</td>
<td align="right">101</td>
<td align="char" char=".">168.1</td>
</tr>
</tbody>
</table>
</alternatives>
<table-wrap-foot>
<fn id="t001fn001">
<p>We compared SmartPhase (SP) to WhatsHap (WH) using the number of phased pairs, the number of incorrectly phased pairs (Err), and the runtime on the same processing node, measured in seconds, as benchmark parameters. Variant pairs that were both labeled as <italic>innocuous</italic> and phased were only counted as <italic>innocuous</italic>. Pairs that were phased with a confidence score below 0.34 are counted as low quality (LQ) pairs.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>Phasing in <italic>read only</italic> mode results in low numbers of phased pairs for both phasing tools. In comparison to WhatsHap, SmartPhase clears 2.8%–3.6% fewer pairs because variant alleles are often not found in the mapped reads as reported in the variant file. Variant calling tools, like the GATK HaplotypeCaller, rearrange read alignments internally before calling variants. As SmartPhase does not realign reads in contrast to WhatsHap, it performs worse in areas of uncertain mapping when only read information is provided. The results for phasing based on read and trio information demonstrate the power of SmartPhase, as the combination of <italic>innocuous</italic> labeling and phasing clears all input variant pairs. This corresponds to 10.9%–20.4% more variant pairs cleared in comparison to WhatsHap.</p>
<p>SmartPhase generated error-free predictions in the combined <italic>read &amp; trio</italic> mode and all errors in <italic>read only</italic> mode are labeled as low quality. WhatsHap has an average error rate of 1.03% (0.21%–1.89%) with a remarkable increase of the error rate in combined <italic>read &amp; trio</italic> mode. Although this number is quite low, it can have detrimental consequences if even only one pair is wrongly predicted as being in <italic>cis</italic> configuration when in reality it is disease-causing. This emphasizes how crucial a confidence score is in generating trustworthy and accurate predictions.</p>
<p>Another advantage of SmartPhase is its speed: it runs on average five times faster than WhatsHap, independent of using only read or both read and trio information. While the absolute difference is minor for the limited, simulated data, the runtime becomes particularly relevant in a clinical setting where variant pairs from all chromosomes of hundreds of patients must be phased.</p>
</sec>
<sec id="sec011">
<title>Validation of SmartPhase on clinical WES data</title>
<p>We validated SmartPhase on a cohort of clinical WES data that consists of 121 trio and 800 singleton patients without parental genotype information. As detailed in Section 3 of <xref ref-type="supplementary-material" rid="pcbi.1007613.s001">S1 Appendix</xref>, we selected a set of 116,613 potential compound heterozygous variant pairs after filtering for rare and protein-altering heterozygous autosomal variants. On average, we identified 126.62 ± 161.25 variant pairs per individual.</p>
<sec id="sec012">
<title>Overall performance of SmartPhase</title>
<p>To evaluate the overall performance of SmartPhase on real data, we applied it to all 116,613 variant pairs identified in the 921 individuals of the cohort with a runtime of 190 minutes for all patients or 12 seconds per patient. The results with and without physical phasing for both singleton and trio patients are shown in <xref ref-type="fig" rid="pcbi.1007613.g002">Fig 2</xref> (complete data in <xref ref-type="supplementary-material" rid="pcbi.1007613.s003">S2 Table</xref>).</p>
<fig id="pcbi.1007613.g002" position="float">
<object-id pub-id-type="doi">10.1371/journal.pcbi.1007613.g002</object-id>
<label>Fig 2</label>
<caption>
<title>Boxplots showing the distribution of relative amounts of pairs labeled as <italic>cis</italic>, <italic>trans</italic>, and <italic>innocuous</italic> (only for trio phasing) as well as the percentages of pairs that are <italic>cleared</italic>, <italic>confidently cleared</italic> after removing low quality phasing predictions, and pairs that can be excluded as being <italic>non-pathogenic</italic>.</title>
<p>The plots show results for SmartPhase using only read information for 800 singleton patients (a), using both trio and read phasing for 121 trio patients (b) and the results for the same individuals using physical phasing information provided by the HaplotypeCaller of GATK (c) &amp; (d).</p>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1007613.g002" xlink:type="simple"/>
</fig>
<p>For singleton patients, a median of 25.91% of all variant pairs can be cleared by being phased (see <xref ref-type="fig" rid="pcbi.1007613.g002">Fig 2(a)</xref>). Ignoring low quality phase predictions with a confidence score below 0.34 results in 2.88% phased pairs. The median fraction of <italic>non-pathogenic</italic> variant pairs is slightly lower at 2.62%.</p>
<p>For trio patients, a median of 60.00% of all input pairs are cleared (see <xref ref-type="fig" rid="pcbi.1007613.g002">Fig 2(b)</xref>). The percentage is lower than in the simulated data where all variant pairs could be cleared. As real WES data is imperfect due to failures in exome capturing or low coverage, some genotypes are missing in one or both of the parents which may make a variant pair impossible to resolve. In our simulated data, the largest fraction of pairs is cleared due to <italic>innocuous</italic> labeling. In our clinical data, it corresponds to the smallest fraction of <italic>cleared</italic> pairs. This is due to the preceding filtering for rare variants which are unlikely to be homozygous in one parent or heterozygous in both parents (see Section 3.2 of <xref ref-type="supplementary-material" rid="pcbi.1007613.s001">S1 Appendix</xref>). Removing low quality phase predictions results in 31.03% of the pairs being cleared. A median of 27.06% of all input variant pairs are <italic>non-pathogenic</italic>.</p>
<p>To enhance its power, SmartPhase is able to incorporate phasing information generated by the HaplotypeCaller of GATK. For both phasing of singleton and trio patients, <xref ref-type="fig" rid="pcbi.1007613.g002">Fig 2(c) and 2(d)</xref> show a noticeable increase in <italic>cis</italic> calls indicating that GATK physical phasing mostly informs about variants being on the same allele, as expected. The percentage of pairs that can be considered clinically irrelevant increases from 2.62% to 21.64% for singletons and from 27.06% to 43.91% for trio patients.</p>
</sec>
<sec id="sec013">
<title>Comparison of SmartPhase to WhatsHap</title>
<p>In order to extend the comparison of SmartPhase and WhatsHap to real data, we applied both tools in <italic>read only</italic> and in combined <italic>read &amp; trio</italic> mode to 21,066 variant pairs in 121 trio patients of the clinical WES cohort. On the reduced data set, WhatsHap required 90 hours to phase the patients, making an analysis of the entire cohort of 921 patients unrealistic, as it would take approximately 685 hours or more than 28 days. Reducing the input files to the regions of interest would shorten the runtime but requires additional preprocessing steps that consume time and storage resources, which also results in an unrealistic scenario. <xref ref-type="fig" rid="pcbi.1007613.g003">Fig 3</xref> shows the distribution of the percentages of <italic>cleared</italic> pairs for each trio patient for both tools in both modes (complete data in <xref ref-type="supplementary-material" rid="pcbi.1007613.s004">S3 Table</xref>).</p>
<fig id="pcbi.1007613.g003" position="float">
<object-id pub-id-type="doi">10.1371/journal.pcbi.1007613.g003</object-id>
<label>Fig 3</label>
<caption>
<title>Boxplots showing the percentage of <italic>cleared</italic> pairs for SmartPhase (SP) and WhatsHap (WH) in <italic>read only</italic> and in combined <italic>read &amp; trio</italic> mode on the 21,066 variant pairs identified in the 121 trio patients of the clinical WES data cohort.</title>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1007613.g003" xlink:type="simple"/>
</fig>
<p>The performance in <italic>read only</italic> mode is in the same range for SmartPhase and WhatsHap, with medians of 50.0% and 62.5%, respectively. When removing low confidence predictions of SmartPhase with a confidence score below 0.34, the percentage of <italic>cleared</italic> pairs decreases to a median of 19.4% with some outliers still showing a similar performance to WhatsHap.</p>
<p>Examining a subset of 4,701 variant pairs that can be phased using parental genotype information, SmartPhase generated phase predictions for 1,577 pairs with 246 pairs being phased confidently. All of these confident phasing predictions are consistent with parental genotypes. For the 1,331 pairs with a confidence score below 0.34, only 63 predictions are inconsistent with parental genotypes. For 1,491 pairs, the aligned reads do not contain at least one of the two variants. WhatsHap in comparison generated phase predictions for 3,058 pairs with 34 erroneous calls. Even though WhatsHap phased markedly more variant pairs, it is not possible to filter out incorrect calls. This can have serious clinical effects, especially when considering that 15 of the 34 errors are <italic>cis</italic> calls that are in fact compound heterozygous according to the parental genotypes.</p>
<p>In combined <italic>read &amp; trio</italic> mode SmartPhase increased the percentage of <italic>cleared</italic> variant pairs considerably to a median value of 75.8% before and 48.7% after confidence score filtering. WhatsHap performed markedly worse when parental genotype data were provided. The phasing rate drops from 62.5% to 25.0% because WhatsHap ignores variants when genotypes are missing in the parents or when a violation of Mendelian inheritance is observed.</p>
</sec>
<sec id="sec014">
<title>Summary</title>
<p>The validation on simulated and clinical data confirms that SmartPhase is characterized by a fast and highly accurate performance. We demonstrated that using a confidence score threshold of 0.34 generates error-free predictions for the complete simulated data set and a subset of the clinical data. We showed that SmartPhase outperforms WhatsHap when both read and trio information is provided due to <italic>innocuous</italic> labeling and WhatsHap’s strict handling of variants with missing genotypes in the parents or those violating Mendelian rules. If only read-based phasing is possible, SmartPhase clears fewer pairs than WhatsHap as SmartPhase takes the provided read alignments as is without performing any realignment. However, as SmartPhase compensates by incorporating phasing information generated by GATK HC, it can approximate WhatsHap’s read-based phasing performance. As GATK HC is the currently prevailing variant caller, physical phasing information of realigned reads can be exploited in most pipelines. Not performing computationally expensive read realignment allows SmartPhase to be markedly faster which is crucial when considering rapidly growing clinical patient cohorts. As the average read length of sequencing techniques is constantly increasing, read mapping will become more and more accurate without the need for realignment, putting SmartPhase at a clear advantage.</p>
<p>As a stand-alone Java application, SmartPhase can be seamlessly incorporated into any clinical workflow without requiring further installations or downloads. Taken together, SmartPhase greatly simplifies the selection of potential compound heterozygous variant pairs as disease candidates and reduces the search space for pathogenic compound heterozygous variant pairs considerably. The resulting speed-up of the analysis of clinical sequence data is helpful for all patients, even if their disease is not caused by a compound heterozygous variant.</p>
</sec>
</sec>
</sec>
<sec id="sec015">
<title>Availability and future directions</title>
<p>The source code of SmartPhase, its documentation, a minimum test data set, and the complete validation pipeline that generates and evaluates simulated data are provided in <xref ref-type="supplementary-material" rid="pcbi.1007613.s005">S1 Code</xref> and <xref ref-type="supplementary-material" rid="pcbi.1007613.s006">S1 Text</xref>. Additionally, all files together with the simulated data can be found at <ext-link ext-link-type="uri" xlink:href="http://ibis.helmholtz-muenchen.de/smartphase/" xlink:type="simple">http://ibis.helmholtz-muenchen.de/smartphase/</ext-link>. Comprehensive results of SmartPhase using simulated data and for each of the 921 individuals of the cohort are available in <xref ref-type="supplementary-material" rid="pcbi.1007613.s002">S1</xref>, <xref ref-type="supplementary-material" rid="pcbi.1007613.s003">S2</xref> and <xref ref-type="supplementary-material" rid="pcbi.1007613.s004">S3</xref> Tables.</p>
<p>Besides the demonstrated use of SmartPhase in detecting and filtering compound heterozygous variants, it can further be used for the analysis of multi-nucleotide variants that have been shown to play an important role in human diseases [<xref ref-type="bibr" rid="pcbi.1007613.ref012">12</xref>]. As the detection of multi-nucleotide variants requires phasing of nearby variants, SmartPhase is perfectly suited for this clinical task. SmartPhase was designed to be easy to use in any existing clinical sequencing data workflow. To increase the usability even more, we plan to make SmartPhase available as a module for analysis platforms like Galaxy [<xref ref-type="bibr" rid="pcbi.1007613.ref013">13</xref>] or KNIME4NGS [<xref ref-type="bibr" rid="pcbi.1007613.ref014">14</xref>]. Beyond that, the phasing efficacy of SmartPhase can be improved by the integration of panel-based phasing methods enabling the connection of haplotype blocks by phasing of common variants.</p>
<sec id="sec016">
<title>Ethics statement</title>
<p>The underlying studies that generated human exome sequencing data that were used for the manuscript were approved by the Ethics Commission of the Medical Faculty of the LMU Munich. The reference numbers of the ethics votes are 346-11, 381-11, 387-11, 438-11, 486-11, 303-12, 187-13 BB, 353-13, 66-14, 501-14 and 806-16. The consent was obtained in written form.</p>
</sec>
</sec>
<sec id="sec017">
<title>Supporting information</title>
<supplementary-material id="pcbi.1007613.s001" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1007613.s001" xlink:type="simple">
<label>S1 Appendix</label>
<caption>
<title>Additional information on methodology and validation.</title>
<p>The first part provides details on the confidence score formula, the derivation of the confidence score threshold, <italic>innocuous</italic> labeling and the list of all possible bitflags generated by SmartPhase. The second part describes the generation of simulated WES data, the configuration of SmartPhase and WhatsHap, and the selection of candidate variant pairs. The third part describes the processing of clinical WES data, the filtering of candidate variant pairs and gives general information on the performed validations.</p>
<p>(PDF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1007613.s002" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1007613.s002" xlink:type="simple">
<label>S1 Table</label>
<caption>
<title>Results of SmartPhase and WhatsHap on simulated data.</title>
<p>Results of our benchmark of SmartPhase on simulated data.</p>
<p>(XLSX)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1007613.s003" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1007613.s003" xlink:type="simple">
<label>S2 Table</label>
<caption>
<title>Results of SmartPhase on clinical WES data.</title>
<p>Results of our application of SmartPhase to the cohort of clinical WES data.</p>
<p>(XLSX)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1007613.s004" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1007613.s004" xlink:type="simple">
<label>S3 Table</label>
<caption>
<title>Results of SmartPhase and WhatsHap on trio patients.</title>
<p>Results of our comparison of SmartPhase to WhatsHap on trio patients of the clinical WES data cohort.</p>
<p>(XLSX)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1007613.s005" mimetype="application/zip" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1007613.s005" xlink:type="simple">
<label>S1 Code</label>
<caption>
<title>Source code files of SmartPhase.</title>
<p>In addition to the source code, the archive file contains the scripts of the validation pipeline and a minimum test data set.</p>
<p>(ZIP)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1007613.s006" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1007613.s006" xlink:type="simple">
<label>S1 Text</label>
<caption>
<title>Documentation of SmartPhase.</title>
<p>The documentation gives instructions on how to run SmartPhase and how to interpret its results.</p>
<p>(PDF)</p>
</caption>
</supplementary-material>
</sec>
</body>
<back>
<ref-list>
<title>References</title>
<ref id="pcbi.1007613.ref001">
<label>1</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Amberger</surname> <given-names>JS</given-names></name>, <name name-style="western"><surname>Bocchini</surname> <given-names>CA</given-names></name>, <name name-style="western"><surname>Schiettecatte</surname> <given-names>F</given-names></name>, <name name-style="western"><surname>Scott</surname> <given-names>AF</given-names></name>, <name name-style="western"><surname>Hamosh</surname> <given-names>A</given-names></name>. <article-title>OMIM.org: Online Mendelian Inheritance in Man (OMIM<sup>®</sup>), an online catalog of human genes and genetic disorders</article-title>. <source>Nucleic Acids Research</source>. <year>2015</year>;<volume>43</volume>(<issue>D1</issue>):<fpage>D789</fpage>–<lpage>D798</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1093/nar/gku1205" xlink:type="simple">10.1093/nar/gku1205</ext-link></comment> <object-id pub-id-type="pmid">25428349</object-id></mixed-citation>
</ref>
<ref id="pcbi.1007613.ref002">
<label>2</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Wright</surname> <given-names>CF</given-names></name>, <name name-style="western"><surname>FitzPatrick</surname> <given-names>DR</given-names></name>, <name name-style="western"><surname>Firth</surname> <given-names>HV</given-names></name>. <article-title>Paediatric genomics: diagnosing rare disease in children</article-title>. <source>Nature Reviews Genetics</source>. <year>2018</year>;<volume>19</volume>(<issue>5</issue>):<fpage>253</fpage>–<lpage>268</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1038/nrg.2017.116" xlink:type="simple">10.1038/nrg.2017.116</ext-link></comment> <object-id pub-id-type="pmid">29398702</object-id></mixed-citation>
</ref>
<ref id="pcbi.1007613.ref003">
<label>3</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Cummings</surname> <given-names>BB</given-names></name>, <name name-style="western"><surname>Marshall</surname> <given-names>JL</given-names></name>, <name name-style="western"><surname>Tukiainen</surname> <given-names>T</given-names></name>, <name name-style="western"><surname>Lek</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Donkervoort</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Foley</surname> <given-names>AR</given-names></name>, <etal>et al</etal>. <article-title>Improving genetic diagnosis in Mendelian disease with transcriptome sequencing</article-title>. <source>Science Translational Medicine</source>. <year>2017</year>;<volume>9</volume>(<issue>386</issue>):<fpage>eaal5209</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1126/scitranslmed.aal5209" xlink:type="simple">10.1126/scitranslmed.aal5209</ext-link></comment> <object-id pub-id-type="pmid">28424332</object-id></mixed-citation>
</ref>
<ref id="pcbi.1007613.ref004">
<label>4</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Kremer</surname> <given-names>LS</given-names></name>, <name name-style="western"><surname>Bader</surname> <given-names>DM</given-names></name>, <name name-style="western"><surname>Mertes</surname> <given-names>C</given-names></name>, <name name-style="western"><surname>Kopajtich</surname> <given-names>R</given-names></name>, <name name-style="western"><surname>Pichler</surname> <given-names>G</given-names></name>, <name name-style="western"><surname>Iuso</surname> <given-names>A</given-names></name>, <etal>et al</etal>. <article-title>Genetic diagnosis of Mendelian disorders via RNA sequencing</article-title>. <source>Nature Communications</source>. <year>2017</year>;<volume>8</volume>:<fpage>15824</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1038/ncomms15824" xlink:type="simple">10.1038/ncomms15824</ext-link></comment> <object-id pub-id-type="pmid">28604674</object-id></mixed-citation>
</ref>
<ref id="pcbi.1007613.ref005">
<label>5</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Ng</surname> <given-names>SB</given-names></name>, <name name-style="western"><surname>Nickerson</surname> <given-names>DA</given-names></name>, <name name-style="western"><surname>Bamshad</surname> <given-names>MJ</given-names></name>, <name name-style="western"><surname>Shendure</surname> <given-names>J</given-names></name>. <article-title>Massively parallel sequencing and rare disease</article-title>. <source>Human Molecular Genetics</source>. <year>2010</year>;<volume>19</volume>(<issue>R2</issue>):<fpage>R119</fpage>–<lpage>R124</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1093/hmg/ddq390" xlink:type="simple">10.1093/hmg/ddq390</ext-link></comment> <object-id pub-id-type="pmid">20846941</object-id></mixed-citation>
</ref>
<ref id="pcbi.1007613.ref006">
<label>6</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Richards</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Aziz</surname> <given-names>N</given-names></name>, <name name-style="western"><surname>Bale</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Bick</surname> <given-names>D</given-names></name>, <name name-style="western"><surname>Das</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Gastier-Foster</surname> <given-names>J</given-names></name>, <etal>et al</etal>. <article-title>Standards and guidelines for the interpretation of sequence variants: a joint consensus recommendation of the American College of Medical Genetics and Genomics and the Association for Molecular Pathology</article-title>. <source>Genetics in Medicine</source>. <year>2015</year>;<volume>17</volume>(<issue>5</issue>):<fpage>405</fpage>–<lpage>423</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1038/gim.2015.30" xlink:type="simple">10.1038/gim.2015.30</ext-link></comment> <object-id pub-id-type="pmid">25741868</object-id></mixed-citation>
</ref>
<ref id="pcbi.1007613.ref007">
<label>7</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Boycott</surname> <given-names>KM</given-names></name>, <name name-style="western"><surname>Vanstone</surname> <given-names>MR</given-names></name>, <name name-style="western"><surname>Bulman</surname> <given-names>DE</given-names></name>, <name name-style="western"><surname>MacKenzie</surname> <given-names>AE</given-names></name>. <article-title>Rare-disease genetics in the era of next-generation sequencing: discovery to translation</article-title>. <source>Nature Reviews Genetics</source>. <year>2013</year>;<volume>14</volume>(<issue>10</issue>):<fpage>681</fpage>–<lpage>691</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1038/nrg3555" xlink:type="simple">10.1038/nrg3555</ext-link></comment> <object-id pub-id-type="pmid">23999272</object-id></mixed-citation>
</ref>
<ref id="pcbi.1007613.ref008">
<label>8</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Choi</surname> <given-names>Y</given-names></name>, <name name-style="western"><surname>Chan</surname> <given-names>AP</given-names></name>, <name name-style="western"><surname>Kirkness</surname> <given-names>E</given-names></name>, <name name-style="western"><surname>Telenti</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Schork</surname> <given-names>NJ</given-names></name>. <article-title>Comparison of phasing strategies for whole human genomes</article-title>. <source>PLOS Genetics</source>. <year>2018</year>;<volume>14</volume>(<issue>4</issue>):<fpage>e1007308</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1371/journal.pgen.1007308" xlink:type="simple">10.1371/journal.pgen.1007308</ext-link></comment> <object-id pub-id-type="pmid">29621242</object-id></mixed-citation>
</ref>
<ref id="pcbi.1007613.ref009">
<label>9</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Castel</surname> <given-names>SE</given-names></name>, <name name-style="western"><surname>Mohammadi</surname> <given-names>P</given-names></name>, <name name-style="western"><surname>Chung</surname> <given-names>WK</given-names></name>, <name name-style="western"><surname>Shen</surname> <given-names>Y</given-names></name>, <name name-style="western"><surname>Lappalainen</surname> <given-names>T</given-names></name>. <article-title>Rare variant phasing and haplotypic expression from RNA sequencing with phASER</article-title>. <source>Nature Communications</source>. <year>2016</year>;<volume>7</volume>:<fpage>12817</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1038/ncomms12817" xlink:type="simple">10.1038/ncomms12817</ext-link></comment> <object-id pub-id-type="pmid">27605262</object-id></mixed-citation>
</ref>
<ref id="pcbi.1007613.ref010">
<label>10</label>
<mixed-citation publication-type="other" xlink:type="simple">Martin M, Patterson M, Garg S, Fischer SO, Pisanti N, Klau GW, et al. WhatsHap: fast and accurate read-based phasing. bioRxiv. 2016; p. 085050.</mixed-citation>
</ref>
<ref id="pcbi.1007613.ref011">
<label>11</label>
<mixed-citation publication-type="other" xlink:type="simple">Poplin R, Ruano-Rubio V, DePristo MA, Fennell TJ, Carneiro MO, Van der Auwera GA, et al. Scaling accurate genetic variant discovery to tens of thousands of samples. bioRxiv. 2018; p. 201178.</mixed-citation>
</ref>
<ref id="pcbi.1007613.ref012">
<label>12</label>
<mixed-citation publication-type="other" xlink:type="simple">Kaplanis J, Akawi N, Gallone G, McRae JF, Prigmore E, Wright CF, et al. Exome-wide assessment of the functional impact and pathogenicity of multi-nucleotide mutations. bioRxiv. 2018; p. 258723.</mixed-citation>
</ref>
<ref id="pcbi.1007613.ref013">
<label>13</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Afgan</surname> <given-names>E</given-names></name>, <name name-style="western"><surname>Baker</surname> <given-names>D</given-names></name>, <name name-style="western"><surname>Batut</surname> <given-names>B</given-names></name>, <name name-style="western"><surname>van den Beek</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Bouvier</surname> <given-names>D</given-names></name>, <name name-style="western"><surname>Čech</surname> <given-names>M</given-names></name>, <etal>et al</etal>. <article-title>The Galaxy platform for accessible, reproducible and collaborative biomedical analyses: 2018 update</article-title>. <source>Nucleic Acids Research</source>. <year>2018</year>;<volume>46</volume>(<issue>W1</issue>):<fpage>W537</fpage>–<lpage>W544</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1093/nar/gky379" xlink:type="simple">10.1093/nar/gky379</ext-link></comment> <object-id pub-id-type="pmid">29790989</object-id></mixed-citation>
</ref>
<ref id="pcbi.1007613.ref014">
<label>14</label>
<mixed-citation publication-type="other" xlink:type="simple">Hastreiter M, Jeske T, Hoser J, Kluge M, Ahomaa K, Friedl MS, et al. KNIME4NGS: a comprehensive toolbox for Next Generation Sequencing analysis. Bioinformatics. 2017; p. btx003.</mixed-citation>
</ref>
</ref-list>
</back>
</article>