<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1d3 20150301//EN" "http://jats.nlm.nih.gov/publishing/1.1d3/JATS-journalpublishing1.dtd">
<article article-type="research-article" dtd-version="1.1d3" xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="nlm-ta">PLoS ONE</journal-id>
<journal-id journal-id-type="publisher-id">plos</journal-id>
<journal-id journal-id-type="pmc">plosone</journal-id>
<journal-title-group>
<journal-title>PLOS ONE</journal-title>
</journal-title-group>
<issn pub-type="epub">1932-6203</issn>
<publisher>
<publisher-name>Public Library of Science</publisher-name>
<publisher-loc>San Francisco, CA USA</publisher-loc>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.1371/journal.pone.0185056</article-id>
<article-id pub-id-type="publisher-id">PONE-D-17-13379</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Research Article</subject>
</subj-group>
<subj-group subj-group-type="Discipline-v3"><subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Genomics</subject><subj-group><subject>Metagenomics</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Research and analysis methods</subject><subj-group><subject>Database and informatics methods</subject><subj-group><subject>Bioinformatics</subject><subj-group><subject>Sequence analysis</subject><subj-group><subject>Sequence alignment</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Biology and life sciences</subject><subj-group><subject>Molecular biology</subject><subj-group><subject>Molecular biology techniques</subject><subj-group><subject>Cloning</subject><subj-group><subject>DNA cloning</subject><subj-group><subject>Shotgun sequencing</subject></subj-group></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Research and analysis methods</subject><subj-group><subject>Molecular biology techniques</subject><subj-group><subject>Cloning</subject><subj-group><subject>DNA cloning</subject><subj-group><subject>Shotgun sequencing</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Biology and life sciences</subject><subj-group><subject>Molecular biology</subject><subj-group><subject>Molecular biology techniques</subject><subj-group><subject>Sequencing techniques</subject><subj-group><subject>Shotgun sequencing</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Research and analysis methods</subject><subj-group><subject>Molecular biology techniques</subject><subj-group><subject>Sequencing techniques</subject><subj-group><subject>Shotgun 
sequencing</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Biology and life sciences</subject><subj-group><subject>Computational biology</subject><subj-group><subject>Genome analysis</subject><subj-group><subject>Sequence assembly tools</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Genomics</subject><subj-group><subject>Genome analysis</subject><subj-group><subject>Sequence assembly tools</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Social sciences</subject><subj-group><subject>Sociology</subject><subj-group><subject>Social systems</subject></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Computer and information sciences</subject><subj-group><subject>Computer software</subject></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Biology and life sciences</subject><subj-group><subject>Organisms</subject><subj-group><subject>Bacteria</subject></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Biology and life sciences</subject><subj-group><subject>Computational biology</subject><subj-group><subject>Genome analysis</subject><subj-group><subject>Genome annotation</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Genomics</subject><subj-group><subject>Genome analysis</subject><subj-group><subject>Genome annotation</subject></subj-group></subj-group></subj-group></subj-group></subj-group></article-categories>
<title-group>
<article-title>BBMerge – Accurate paired shotgun read merging via overlap</article-title>
<alt-title alt-title-type="running-head">BBMerge – Accurate paired shotgun read merging via overlap</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" xlink:type="simple">
<name name-style="western">
<surname>Bushnell</surname>
<given-names>Brian</given-names>
</name>
<role content-type="http://credit.casrai.org/">Conceptualization</role>
<role content-type="http://credit.casrai.org/">Data curation</role>
<role content-type="http://credit.casrai.org/">Formal analysis</role>
<role content-type="http://credit.casrai.org/">Investigation</role>
<role content-type="http://credit.casrai.org/">Methodology</role>
<role content-type="http://credit.casrai.org/">Project administration</role>
<role content-type="http://credit.casrai.org/">Software</role>
<role content-type="http://credit.casrai.org/">Supervision</role>
<role content-type="http://credit.casrai.org/">Validation</role>
<role content-type="http://credit.casrai.org/">Writing – original draft</role>
<role content-type="http://credit.casrai.org/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"><sup>1</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<name name-style="western">
<surname>Rood</surname>
<given-names>Jonathan</given-names>
</name>
<role content-type="http://credit.casrai.org/">Software</role>
<xref ref-type="aff" rid="aff002"><sup>2</sup></xref>
</contrib>
<contrib contrib-type="author" corresp="yes" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">http://orcid.org/0000-0002-3126-2199</contrib-id>
<name name-style="western">
<surname>Singer</surname>
<given-names>Esther</given-names>
</name>
<role content-type="http://credit.casrai.org/">Supervision</role>
<role content-type="http://credit.casrai.org/">Visualization</role>
<role content-type="http://credit.casrai.org/">Writing – original draft</role>
<role content-type="http://credit.casrai.org/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"><sup>1</sup></xref>
<xref ref-type="corresp" rid="cor001">*</xref>
</contrib>
</contrib-group>
<aff id="aff001"><label>1</label> <addr-line>DOE Joint Genome Institute, Walnut Creek, CA, United States of America</addr-line></aff>
<aff id="aff002"><label>2</label> <addr-line>National Renewable Energy Laboratory, Golden, CO, United States of America</addr-line></aff>
<contrib-group>
<contrib contrib-type="editor" xlink:type="simple">
<name name-style="western">
<surname>Biggs</surname>
<given-names>Patrick Jon</given-names>
</name>
<role>Editor</role>
<xref ref-type="aff" rid="edit1"/>
</contrib>
</contrib-group>
<aff id="edit1"><addr-line>Massey University, NEW ZEALAND</addr-line></aff>
<author-notes>
<fn fn-type="conflict" id="coi001">
<p>The authors have declared that no competing interests exist.</p>
</fn>
<corresp id="cor001">* E-mail: <email xlink:type="simple">esinger@lbl.gov</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>26</day>
<month>10</month>
<year>2017</year>
</pub-date>
<pub-date pub-type="collection">
<year>2017</year>
</pub-date>
<volume>12</volume>
<issue>10</issue>
<elocation-id>e0185056</elocation-id>
<history>
<date date-type="received">
<day>6</day>
<month>4</month>
<year>2017</year>
</date>
<date date-type="accepted">
<day>6</day>
<month>9</month>
<year>2017</year>
</date>
</history>
<permissions>
<license xlink:href="https://creativecommons.org/publicdomain/zero/1.0/" xlink:type="simple">
<license-p>This is an open access article, free of all copyright, and may be freely reproduced, distributed, transmitted, modified, built upon, or otherwise used by anyone for any lawful purpose. The work is made available under the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/publicdomain/zero/1.0/" xlink:type="simple">Creative Commons CC0</ext-link> public domain dedication.</license-p>
</license>
</permissions>
<self-uri content-type="pdf" xlink:href="info:doi/10.1371/journal.pone.0185056"/>
<abstract>
<p>Merging paired-end shotgun reads generated on high-throughput sequencing platforms can substantially improve various subsequent bioinformatics processes, including genome assembly, binning, mapping, annotation, and clustering for taxonomic analysis. With the inexorable growth of sequence data volume and CPU core counts, the speed and scalability of read-processing tools become ever more important. The accuracy of shotgun read merging is crucial as well, as errors introduced by incorrect merging percolate through to reduce the quality of downstream analysis. Thus, we designed a new tool to maximize accuracy and minimize processing time, allowing the use of read merging on larger datasets, and in analyses highly sensitive to errors. We present BBMerge, a new merging tool for paired-end shotgun sequence data. We benchmark BBMerge by comparison with eight other widely used merging tools, assessing speed, accuracy and scalability. Evaluations of both synthetic and real-world datasets demonstrate that BBMerge produces merged shotgun reads with greater accuracy and at higher speed than any existing merging tool examined. BBMerge also provides the ability to merge non-overlapping shotgun read pairs by using <italic>k</italic>-mer frequency information to assemble the unsequenced gap between reads, achieving a significantly higher merge rate while maintaining or increasing accuracy.</p>
</abstract>
<funding-group>
<award-group id="award001">
<funding-source>
<institution-wrap>
<institution-id institution-id-type="funder-id">http://dx.doi.org/10.13039/100000015</institution-id>
<institution>U.S. Department of Energy</institution>
</institution-wrap>
</funding-source>
<award-id>DE-AC02-05CH11231</award-id>
</award-group>
<funding-statement>This work was conducted by the U.S. Department of Energy Joint Genome Institute, a DOE Office of Science User Facility, and is supported under Contract No. DE-AC02-05CH11231. The funders had no role in study design, data collection and analysis, decision to publish, or preparation of the manuscript.</funding-statement>
</funding-group>
<counts>
<fig-count count="6"/>
<table-count count="3"/>
<page-count count="15"/>
</counts>
<custom-meta-group>
<custom-meta id="data-availability">
<meta-name>Data Availability</meta-name>
<meta-value>Mock community data are available from <ext-link ext-link-type="uri" xlink:href="http://genome.jgi.doe.gov/MeCorS/MeCorS.home.html" xlink:type="simple">http://genome.jgi.doe.gov/MeCorS/MeCorS.home.html</ext-link>. Synthetic data generated from the genome of <italic>Chlamydomonas reinhardtii</italic> (v3.0) is available at <ext-link ext-link-type="uri" xlink:href="https://genome.jgi.doe.gov/Chlre3/Chlre3.home.html" xlink:type="simple">https://genome.jgi.doe.gov/Chlre3/Chlre3.home.html</ext-link> and <ext-link ext-link-type="uri" xlink:href="ftp://ftp.jgi-psf.org/pub/JGI_data/Chlamy/v3.0/Chlre3.fasta.gz" xlink:type="simple">ftp://ftp.jgi-psf.org/pub/JGI_data/Chlamy/v3.0/Chlre3.fasta.gz</ext-link>.</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="sec001" sec-type="intro">
<title>Introduction</title>
<p>Many sequencing platforms—including Illumina and Ion Torrent, which comprise the majority of sequencing capacity at many institutions—produce relatively short reads with tens to low hundreds of bases. Short read lengths result from the decline of signal intensity and integrity with each subsequent base during the sequencing process. To compensate for this, paired-end reads are generated by sequencing two end regions of a nucleic acid fragment [<xref ref-type="bibr" rid="pone.0185056.ref001">1</xref>].</p>
<p>Although many advances have been achieved using paired-end sequencing, there remain situations in which single, longer reads are preferable to paired shorter reads, such as de novo assembly contig-building, read binning or clustering, gene annotation, and small-variant calling. To address this need, several programs have been designed to merge paired short reads into single longer reads; however, most of these are designed to primarily merge 16S rRNA gene amplicon sequences rather than shotgun sequence data.</p>
<p>In this study, we describe BBMerge, a new overlap-based tool for merging short high-throughput shotgun sequencing reads. BBMerge allows simple adjustment of merging sensitivity to accurately and efficiently process large datasets from a variety of sequence types. We designed BBMerge to address common difficulties associated with paired-end shotgun read merging, <italic>i</italic>.<italic>e</italic>. reducing incorrect merge rates, increasing scalability, and handling non-overlapping pairs from longer fragments, which most tools cannot merge. BBMerge’s performance is compared to existing read merging tools that allow shotgun read input using both synthetic and real-world data from <italic>Chlamydomonas reinhardtii</italic> and a defined microbial community with bacterial and archaeal members (MBARC-26) [<xref ref-type="bibr" rid="pone.0185056.ref002">2</xref>], respectively.</p>
</sec>
<sec id="sec002" sec-type="materials|methods">
<title>Materials and methods</title>
<sec id="sec003">
<title>Synthetic and real-world sequence data</title>
<p>In order to evaluate merging performance, we used synthetically generated data from a eukaryotic genome to allow precise evaluation of merging accuracy as well as real-world shotgun metagenome data from a prokaryotic community. These two datasets include eukaryotic, bacterial and archaeal organisms with complete reference genomes spanning a large spectrum of %GC.</p>
<p>We synthetically generated 20 million reads based on the <italic>Chlamydomonas reinhardtii</italic> genome (v3.0), which was retrieved from the JGI Plant Genomics Resource Phytozome (<ext-link ext-link-type="uri" xlink:href="ftp://ftp.jgi-psf.org/pub/JGI_data/Chlamy/v3.0/Chlre3.fasta.gz" xlink:type="simple">ftp://ftp.jgi-psf.org/pub/JGI_data/Chlamy/v3.0/Chlre3.fasta.gz</ext-link>). Synthetic reads were generated using BBMap (<ext-link ext-link-type="uri" xlink:href="https://sourceforge.net/projects/bbmap/" xlink:type="simple">https://sourceforge.net/projects/bbmap/</ext-link>) as follows: first, reference sequences were indexed (<xref ref-type="table" rid="pone.0185056.t001">Table 1A</xref>). Second, synthetic reads were generated (<xref ref-type="table" rid="pone.0185056.t001">Table 1B</xref>). Third, read headers were renamed according to their known insert size, to allow subsequent grading (<xref ref-type="table" rid="pone.0185056.t001">Table 1C</xref>). Fourth, reads were decompressed and moved to ramdisk (<xref ref-type="table" rid="pone.0185056.t001">Table 1D</xref>).</p>
<table-wrap id="pone.0185056.t001" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0185056.t001</object-id>
<label>Table 1</label> <caption><title>Test data setup.</title></caption>
<alternatives>
<graphic id="pone.0185056.t001g" mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0185056.t001" xlink:type="simple"/>
<table>
<colgroup>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
</colgroup>
<thead>
<tr>
<th align="left">Step</th>
<th align="left">Note</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left">A</td>
<td align="left">bbmap.sh ref = chlamy_reference.fasta.gz</td>
</tr>
<tr>
<td align="left">B</td>
<td align="left">randomreads.sh reads = 20m out = synth20m.fq.gz len = 150 paired int pigz = 32 zl = 6 minq = 14 midq = 24 maxq = 34 qv = 6 adderrors = t nrate = 0.00 maxns = 2 maxnlen = 8 ow mininsert = 100 maxinsert = 400 gaussian overlap = 150 banns fragadapter = <monospace>GACGCTGCCGACGAATAGAGAGGTGTAGATCTCGGTGGTCGCCGTATCATT</monospace> fragadapter2 = <monospace>CCGAGCCCACGAGACTAAGGCGAATCTCGTATGCCGTCTTCTGCTTG</monospace></td>
</tr>
<tr>
<td align="left">C</td>
<td align="left">rename.sh in = synth20m.fq.gz out = renamed20m.fq.gz renamebyinsert int</td>
</tr>
<tr>
<td align="left">D</td>
<td align="left">reformat.sh in = renamed20m.fq.gz out = /dev/shm/r#.fq</td>
</tr>
<tr>
<td align="left">E</td>
<td align="left">bbmap.sh ref = mock_ref.fa</td>
</tr>
<tr>
<td align="left">F</td>
<td align="left">bbmap.sh in = mock_raw.fq.gz outm = mapped_renamed_noindels.fq.gz ow po indelfilter = 0 renamebyinsert maxindel = 20 minid = 0.75</td>
</tr>
<tr>
<td align="left">G</td>
<td align="left">reformat.sh in = clean.fq.gz out = 20m.fq.gz srt = 20m</td>
</tr>
<tr>
<td align="left">H</td>
<td align="left">grademerge.sh in = merged.fq</td>
</tr>
<tr>
<td align="left">I</td>
<td align="left">reformat.sh in = mock_raw.fq.gz out = r#.fq srt = 20m</td>
</tr>
<tr>
<td align="left">J</td>
<td align="left">spades.py --meta -k25,55,95,125 --phred-offset 33 -s merged.fq -1 unmerged1.fq -2 unmerged2.fq -o spades_out</td>
</tr>
<tr>
<td align="left">K</td>
<td align="left">quast.py -o quast_out -R mock_ref.fa -f spades_*/contigs.fasta</td>
</tr>
</tbody>
</table>
</alternatives>
<table-wrap-foot>
<fn id="t001fn001"><p>BBMap, BBMerge, RandomReads, Rename, Reformat, and GradeMerge are part of the open-source BBMap package (<ext-link ext-link-type="uri" xlink:href="https://sourceforge.net/projects/bbmap/" xlink:type="simple">https://sourceforge.net/projects/bbmap/</ext-link>).</p></fn>
</table-wrap-foot>
</table-wrap>
<p>Real-world data consists of shotgun metagenomic sequence data from MBARC-26, a microbial mock community consisting of 23 bacterial and 3 archaeal strains [<xref ref-type="bibr" rid="pone.0185056.ref003">3</xref>–<xref ref-type="bibr" rid="pone.0185056.ref010">10</xref>]. DNA extraction from MBARC-26, Illumina metagenome library creation, and shotgun sequencing were performed as described in [<xref ref-type="bibr" rid="pone.0185056.ref004">4</xref>], yielding 2x150 bp reads.</p>
<p>Reference genomes for MBARC-26 were retrieved from JGI’s IMG [<xref ref-type="bibr" rid="pone.0185056.ref011">11</xref>] and used for mapping as described in the following: Reference genomes were first indexed (<xref ref-type="table" rid="pone.0185056.t001">Table 1E</xref>). Second, shotgun metagenome reads were mapped to reference sequences to a) determine insert sizes, and b) remove reads that mapped with indels or that did not map in a properly paired orientation (<xref ref-type="table" rid="pone.0185056.t001">Table 1F</xref>) using BBMap’s default settings. This filtering step ensured the correct determination of the insert size for each read pair for subsequent grading; insert sizes of unpaired reads cannot be determined, and reads mapped with indels yield a different insert size as calculated by mapping versus merging. Mapping was not necessary for the synthetic data as the true insert size was known <italic>a priori</italic>. The remaining shotgun metagenome reads were subsampled to 20 million read pairs (<xref ref-type="table" rid="pone.0185056.t001">Table 1G</xref>).</p>
<p>Grading was performed using GradeMerge (<xref ref-type="table" rid="pone.0185056.t001">Table 1H</xref>) to obtain the number of correctly and incorrectly merged reads. A merged read was considered correct if its length exactly matched the insert size indicated by its header. The reported percentage values and signal-to-noise ratio (SNR) are defined as:
<disp-formula id="pone.0185056.e001">
<alternatives>
<graphic id="pone.0185056.e001g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0185056.e001" xlink:type="simple"/>
<mml:math display="block" id="M1">
<mml:mi mathvariant="bold-italic">C</mml:mi><mml:mi>%</mml:mi><mml:mo>=</mml:mo><mml:mn mathvariant="bold">100</mml:mn><mml:mo>*</mml:mo><mml:mfrac><mml:mrow><mml:mi mathvariant="bold-italic">C</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="bold-italic">P</mml:mi></mml:mrow></mml:mfrac>
</mml:math>
</alternatives>
<label>(1)</label>
</disp-formula>
<disp-formula id="pone.0185056.e002">
<alternatives>
<graphic id="pone.0185056.e002g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0185056.e002" xlink:type="simple"/>
<mml:math display="block" id="M2">
<mml:mi mathvariant="bold-italic">I</mml:mi><mml:mi>%</mml:mi><mml:mo>=</mml:mo><mml:mn mathvariant="bold">100</mml:mn><mml:mo>*</mml:mo><mml:mfrac><mml:mrow><mml:mi mathvariant="bold-italic">I</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="bold-italic">P</mml:mi></mml:mrow></mml:mfrac>
</mml:math>
</alternatives>
<label>(2)</label>
</disp-formula>
<disp-formula id="pone.0185056.e003">
<alternatives>
<graphic id="pone.0185056.e003g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0185056.e003" xlink:type="simple"/>
<mml:math display="block" id="M3">
<mml:mi mathvariant="bold-italic">S</mml:mi><mml:mi mathvariant="bold-italic">N</mml:mi><mml:mi mathvariant="bold-italic">R</mml:mi><mml:mo>=</mml:mo><mml:mn mathvariant="bold">10</mml:mn><mml:mo>⋅</mml:mo><mml:msub><mml:mrow><mml:mi mathvariant="bold-italic">l</mml:mi><mml:mi mathvariant="bold-italic">o</mml:mi><mml:mi mathvariant="bold-italic">g</mml:mi></mml:mrow><mml:mrow><mml:mn mathvariant="bold">10</mml:mn></mml:mrow></mml:msub><mml:mfrac><mml:mrow><mml:mi mathvariant="bold-italic">C</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="bold-italic">I</mml:mi></mml:mrow></mml:mfrac>
</mml:math>
</alternatives>
<label>(3)</label>
</disp-formula>
, where:</p>
<list list-type="simple">
<list-item><label>a</label><p><bold><italic>C</italic></bold> is the number of correctly merged reads.</p></list-item>
<list-item><label>b</label><p><bold><italic>I</italic></bold> is the number of incorrectly merged reads.</p></list-item>
<list-item><label>c</label><p><bold><italic>C%</italic></bold> is the percent of correctly merged reads.</p></list-item>
<list-item><label>d</label><p><bold><italic>I%</italic></bold> is the percent of incorrectly merged reads.</p></list-item>
<list-item><label>e</label><p><bold><italic>P</italic></bold> is the number of input read pairs.</p></list-item>
</list>
<p>Assembly quality was evaluated using raw shotgun metagenomic reads from MBARC-26 subsampled to 20 million read pairs (<xref ref-type="table" rid="pone.0185056.t001">Table 1I</xref>). To eliminate potential impact originating from pre-processing, reads were not filtered or trimmed. Reads were merged with each tool, then both the merged and unmerged output was passed to SPAdes v. 3.8.2 [<xref ref-type="bibr" rid="pone.0185056.ref012">12</xref>] for assembly in metagenome mode (<xref ref-type="table" rid="pone.0185056.t001">Table 1J</xref>). Assembled contigs were compared to the metagenome reference using QUAST v. 4.2 [<xref ref-type="bibr" rid="pone.0185056.ref013">13</xref>] for evaluation (<xref ref-type="table" rid="pone.0185056.t001">Table 1K</xref>). Global and local misassemblies as defined in [<xref ref-type="bibr" rid="pone.0185056.ref013">13</xref>] were combined and are reported as “total misassemblies”.</p>
<sec id="sec004">
<title>Paired-end read merging tools</title>
<p>All algorithms for read merging compared here (<xref ref-type="table" rid="pone.0185056.t002">Table 2</xref>) are based on overlap detection [<xref ref-type="bibr" rid="pone.0185056.ref014">14</xref>–<xref ref-type="bibr" rid="pone.0185056.ref019">19</xref>], with the exception of leeHom [<xref ref-type="bibr" rid="pone.0185056.ref020">20</xref>] and BBMerge, which additionally use adapter-sequence detection; and COPE [<xref ref-type="bibr" rid="pone.0185056.ref018">18</xref>] and BBMerge, which additionally use <italic>k</italic>-mer counts in non-default modes. All tools were executed as described in <xref ref-type="supplementary-material" rid="pone.0185056.s001">S1 Table</xref>.</p>
<table-wrap id="pone.0185056.t002" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0185056.t002</object-id>
<label>Table 2</label> <caption><title>Read merging tools compared in this study in alphabetical order.</title></caption>
<alternatives>
<graphic id="pone.0185056.t002g" mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0185056.t002" xlink:type="simple"/>
<table>
<colgroup>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
</colgroup>
<thead>
<tr>
<th align="left">Program</th>
<th align="center">Language</th>
<th align="center">Open-Source</th>
<th align="center">Multi-threaded</th>
<th align="center">gzip I/O</th>
<th align="center">Reference</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left">BBMerge v.36.20</td>
<td align="center">Java</td>
<td align="center">Yes</td>
<td align="center">Yes</td>
<td align="center">Yes</td>
<td align="center"><xref ref-type="table-fn" rid="t002fn001"><sup>a</sup></xref></td>
</tr>
<tr>
<td align="left">COPE v.1.1.3</td>
<td align="center">C++</td>
<td align="center">Yes</td>
<td align="center">No</td>
<td align="center">Partial</td>
<td align="center">[<xref ref-type="bibr" rid="pone.0185056.ref018">18</xref>]</td>
</tr>
<tr>
<td align="left">fastq-join v.1.1.2–537</td>
<td align="center">C++</td>
<td align="center">Yes</td>
<td align="center">No</td>
<td align="center">Yes</td>
<td align="center">[<xref ref-type="bibr" rid="pone.0185056.ref019">19</xref>]</td>
</tr>
<tr>
<td align="left">FLASH v.1.2.11</td>
<td align="center">C</td>
<td align="center">Yes</td>
<td align="center">Yes</td>
<td align="center">Yes</td>
<td align="center">[<xref ref-type="bibr" rid="pone.0185056.ref014">14</xref>]</td>
</tr>
<tr>
<td align="left">leeHom (retrieved July 15, 2016)</td>
<td align="center">C++</td>
<td align="center">Yes</td>
<td align="center">No</td>
<td align="center">Yes</td>
<td align="center">[<xref ref-type="bibr" rid="pone.0185056.ref020">20</xref>]</td>
</tr>
<tr>
<td align="left">PEAR v.0.9.6</td>
<td align="center">C</td>
<td align="center">Yes</td>
<td align="center">Yes</td>
<td align="center">Yes</td>
<td align="center">[<xref ref-type="bibr" rid="pone.0185056.ref017">17</xref>]</td>
</tr>
<tr>
<td align="left">Stitch (retrieved July 15, 2016)</td>
<td align="center">Python</td>
<td align="center">Yes</td>
<td align="center">Yes</td>
<td align="center">No</td>
<td align="center">[<xref ref-type="bibr" rid="pone.0185056.ref021">21</xref>]</td>
</tr>
<tr>
<td align="left">USEARCH v.8.1.1861</td>
<td align="center">Unknown</td>
<td align="center">No</td>
<td align="center">Yes</td>
<td align="center">No</td>
<td align="center">[<xref ref-type="bibr" rid="pone.0185056.ref015">15</xref>]</td>
</tr>
<tr>
<td align="left">XORRO v.0.98</td>
<td align="center">C</td>
<td align="center">Yes</td>
<td align="center">No</td>
<td align="center">No</td>
<td align="center">[<xref ref-type="bibr" rid="pone.0185056.ref016">16</xref>]</td>
</tr>
</tbody>
</table>
</alternatives>
<table-wrap-foot>
<fn id="t002fn001"><p><sup>a</sup>: <ext-link ext-link-type="uri" xlink:href="https://sourceforge.net/projects/bbmap/" xlink:type="simple">https://sourceforge.net/projects/bbmap/</ext-link></p></fn>
</table-wrap-foot>
</table-wrap>
<p>Although an effort was made to compare all available overlap-based read merging tools for a comprehensive evaluation in this study, the testing methodology precluded the use of PANDAseq [<xref ref-type="bibr" rid="pone.0185056.ref022">22</xref>], which cannot process reads with renamed headers. Eloper [<xref ref-type="bibr" rid="pone.0185056.ref023">23</xref>] was tested, but not included, as it was unable to produce fastq files or retain the original read headers.</p>
</sec>
<sec id="sec005">
<title>Parameters and testing</title>
<p>Each program was tested for speed, accuracy, and scalability. All testing was executed on the NERSC Genepool cluster (<ext-link ext-link-type="uri" xlink:href="http://www.nersc.gov/users/computational-systems/genepool/" xlink:type="simple">http://www.nersc.gov/users/computational-systems/genepool/</ext-link>), using a 1 TB, 32-core node based on Intel Xeon E5-4650L CPUs @ 2.60GHz. Reads and writes were all performed using a ramdisk to eliminate any impact of contention for the cluster’s shared file system.</p>
<p>Execution of merging tools was performed according to each program’s defaults, except as noted (<xref ref-type="supplementary-material" rid="pone.0185056.s001">S1 Table</xref>). For accuracy testing, each program was run multiple times; the single parameter that was identified to impact the respective tool’s sensitivity most was varied between runs (if available) (<xref ref-type="supplementary-material" rid="pone.0185056.s002">S2 Table</xref>). After each run, the resulting output was graded, <italic>i</italic>.<italic>e</italic>. each merged read’s length was compared to the true insert size noted in that read’s header.</p>
<p>Speed and scalability testing was executed using the Linux “time” command, <italic>e</italic>.<italic>g</italic>. “time bbmerge.sh <italic>&lt;other options&gt;</italic>”, with default parameters and varying numbers of threads. For BBMerge, three modes were included in this study: default, REM, and RSEM, as described in 2.2.3 and 2.2.4. For COPE, two modes were included: default (M0), using simple overlap only, and M3, using <italic>k</italic>-mers to join non-overlapping pairs. COPE’s M1 mode was not found to differ substantially from M0, and M2 did not produce output, so neither is included. Speed tests were performed on both synthetic and real-world shotgun metagenome reads. Since no significant difference was found, we only report test results for the real-world metagenome data.</p>
</sec>
</sec>
</sec>
<sec id="sec006" sec-type="results">
<title>Results</title>
<sec id="sec007">
<title>BBMerge overlap-detection</title>
<p>Overlap-detection involves multiple heuristics, controlled by constants denoted C<sub>i</sub>. These have already been optimized through extensive empirical testing and do not need to be adjusted by the user; they are only presented to describe the algorithm. For each read pair:</p>
<list id="list1" list-type="simple">
<list-item><label>1</label><p>Read 2 is reverse-complemented, because read 1 and read 2 are produced from opposite strands of the initial DNA fragment.</p></list-item>
<list-item><label>2</label><p>Read 1 and read 2 are aligned at every possible offset.
<list list-type="alpha-lower">
<list-item><p>An “offset” is defined by the relative start position of the reads. For offset <bold><italic>O</italic></bold> = 0, each base number <bold><italic>X</italic></bold><sub><bold><italic>i</italic></bold></sub> of read 1 aligns to base number <bold><italic>X</italic></bold><sub><bold><italic>i</italic></bold></sub> of read 2. In general, each base <bold><italic>X</italic></bold><sub><bold><italic>i</italic></bold></sub> in read 1 aligns to base <bold><italic>X</italic></bold><sub><bold><italic>i+O</italic></bold></sub> in read 2.</p></list-item>
<list-item><p>This alignment only counts matches and mismatches; indels are not allowed.</p></list-item>
</list></p></list-item>
<list-item><label>3</label><p>The standard mode for determining the offset is called “ratio mode”. For each offset, a ratio <bold><italic>R</italic></bold> is calculated:
<disp-formula id="pone.0185056.e004">
<alternatives>
<graphic id="pone.0185056.e004g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0185056.e004" xlink:type="simple"/>
<mml:math display="block" id="M4">
<mml:mi mathvariant="bold-italic">R</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi mathvariant="bold-italic">B</mml:mi><mml:mo>+</mml:mo><mml:msub><mml:mrow><mml:mi mathvariant="bold-italic">C</mml:mi></mml:mrow><mml:mrow><mml:mn mathvariant="bold">0</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mi mathvariant="bold-italic">B</mml:mi><mml:mo>+</mml:mo><mml:mi mathvariant="bold-italic">G</mml:mi></mml:mrow></mml:mfrac>
</mml:math>
</alternatives>
<label>(4)</label>
</disp-formula>
, where:</p></list-item></list>
<p><bold><italic>B</italic></bold> is the number of mismatches, <bold><italic>G</italic></bold> is the number of matches, <bold><italic>C</italic></bold><sub><bold><italic>0</italic></bold></sub> is a constant. An optional flag, “ouq”, allows <bold><italic>B</italic></bold> and <bold><italic>G</italic></bold> to be calculated using quality scores, but this is only helpful if the quality scores are accurate.</p>
<list continued-from="list1" list-type="simple">
<list-item><label>4</label><p>The two best (lowest) ratios, <bold><italic>R</italic></bold><sub><bold><italic>1</italic></bold></sub> and <bold><italic>R</italic></bold><sub><bold><italic>2</italic></bold></sub>, are tracked throughout the process.</p></list-item>
<list-item><label>5</label><p>Once the alignments finish, <bold><italic>R</italic></bold><sub><bold><italic>1</italic></bold></sub> and <bold><italic>R</italic></bold><sub><bold><italic>2</italic></bold></sub> are examined to decide whether an alignment will be accepted (<xref ref-type="fig" rid="pone.0185056.g001">Fig 1A</xref>) or discarded (<xref ref-type="fig" rid="pone.0185056.g001">Fig 1B</xref>), using heuristics with different constants.
<list list-type="alpha-lower">
<list-item><p>If <bold><italic>R</italic></bold><sub><bold><italic>1</italic></bold></sub><bold>&gt;<italic>C</italic></bold><sub><bold><italic>1</italic></bold></sub>, the alignment will be rejected as invalid.</p></list-item>
<list-item><p>If <bold><italic>R</italic></bold><sub><bold><italic>1</italic></bold></sub>*<bold><italic>C</italic></bold><sub><bold><italic>2</italic></bold></sub><bold>&gt;<italic>R</italic></bold><sub><bold><italic>2</italic></bold></sub>, the alignment will be rejected as ambiguous.</p></list-item>
<list-item><p>If <bold><italic>R</italic></bold><sub><bold><italic>2</italic></bold></sub><bold>&lt;<italic>C</italic></bold><sub><bold><italic>3</italic></bold></sub>, the alignment will be rejected as ambiguous.</p></list-item>
<list-item><p>If <bold><italic>G</italic>&lt;max(<italic>C</italic></bold><sub><bold><italic>4</italic></bold></sub>, <bold><italic>V</italic></bold>), the alignment will be rejected as having too short an overlap. <bold><italic>V</italic></bold> is derived from the sequence complexity of a given pair, decreasing as complexity increases.</p></list-item>
<list-item><p>If <bold><italic>S</italic></bold>&lt;<bold><italic>C</italic></bold><sub><bold><italic>6</italic></bold></sub>, the alignment will be rejected as too short. <bold><italic>S</italic></bold> is the insert size implied by the alignment.</p></list-item>
<list-item><p>Otherwise, the best alignment will be reported for further consideration.</p></list-item>
</list></p></list-item>
<list-item><label>6</label><p>At extreme sensitivity settings, an additional algorithm–“flat mode”–is used. This mode determines the best overlap by minimizing the number of mismatching bases.
<list list-type="alpha-lower">
<list-item><p>At the “xstrict” and “ustrict” settings, the alignment is only accepted if the best offset from flat mode matches the best offset from ratio mode.</p></list-item>
<list-item><p>At the “xloose” setting, an alignment produced by flat mode will be accepted if no alignment was produced by ratio mode.</p></list-item>
<list-item><p>Otherwise, flat mode is not used.</p></list-item>
</list></p></list-item>
<list-item><label>7</label><p>If the pair has an alignment reported in 5) or 6), it is subjected to further scrutiny.
<list list-type="alpha-lower">
<list-item><p>If the implied insert size is shorter than the read length, and adapter sequences have been specified, non-overlapping portions of the reads are aligned to their respective expected adapter sequences. If they do not match, the alignment is rejected.</p></list-item>
<list-item><p>The number of expected mismatches (<bold><italic>E</italic></bold>) in the overlap is calculated using quality scores. If <bold><italic>B</italic>&gt;<italic>E</italic></bold>*<bold><italic>C</italic></bold><sub><bold><italic>5</italic></bold></sub>, the alignment is rejected.</p></list-item>
<list-item><p>The probability (<bold><italic>P</italic></bold>) of the specific pattern of matches and mismatches is calculated. If <bold><italic>P</italic></bold>&lt;<bold><italic>C</italic></bold><sub><bold><italic>6</italic></bold></sub>, the alignment is rejected.</p></list-item>
</list></p></list-item>
<list-item><label>8</label><p>If, at this point, the alignment has not been rejected, the read pair is merged to create a new read of size equal to the insert size implied by the overlap.
<list list-type="alpha-lower">
<list-item><p>The overlapping portions of the reads are represented in the resulting read as a consensus of the two parent sequences. Matching bases are assigned an increased quality score; for non-matching bases, the base with the higher quality score is used, and is assigned a quality score equal to the difference between the two parent qualities. Where both quality scores are equal and the bases mismatch, the resulting base is N.</p></list-item>
<list-item><p>If only the tail ends of the reads overlap, the insert size (and thus resulting read) is longer than the original read length. The merged read will be composed of the non-overlapping portion of read 1; the consensus of the overlapping sequence; and the non-overlapping portion of read 2, respectively.</p></list-item>
<list-item><p>If the tail ends of the reads do not overlap, the insert size is shorter than the initial read length, and the non-overlapping portion is non-genomic sequencing adapter read-through. In this case the resulting read is trimmed to the insert size, and will be 100% consensus sequence.</p></list-item>
</list></p></list-item>
</list>
<fig id="pone.0185056.g001" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0185056.g001</object-id>
<label>Fig 1</label>
<caption>
<title/>
<p><bold>Merging scenarios in BBMerge modes: default (A-B), REM (C-E), and RSEM (F-H).</bold> The left column (Fig 1A,C,D,F) displays scenarios resulting in successfully merged reads, while the right column (Fig 1B,E,G,H) displays scenarios resulting in discarded unmerged pairs.</p>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0185056.g001" xlink:type="simple"/>
</fig>
</sec>
<sec id="sec008">
<title>BBMerge <italic>k</italic>-mer-based modes</title>
<p>BBMerge has the ability to improve merging accuracy or merge non-overlapping reads using <italic>k</italic>-mer frequency information, if the sequencing depth is sufficient (<xref ref-type="fig" rid="pone.0185056.g002">Fig 2</xref>) and the library is randomly sheared. There are two <italic>k</italic>-mer-using modes described in this paper, REM and RSEM, which stand for “Require Extension Match” and “Require Strict Extension Match”. In each case, the default BBMerge algorithm is used with an additional <italic>k</italic>-mer-based extension step. To summarize: The input read file is processed once, to build a table of <italic>k</italic>-mer counts. The file is then processed a second time to perform merging. Steps performed during the merging phase for each read pair include:</p>
<list list-type="order">
<list-item><p>The standard BBMerge algorithm is used to determine the insert size <bold><italic>S</italic></bold><sub><bold><italic>0</italic></bold></sub> based purely on overlap (<xref ref-type="fig" rid="pone.0185056.g001">Fig 1A and 1B</xref>).</p></list-item>
<list-item><p>Each read is extended by a fixed length on the tail end only, using the Tadpole assembler (<ext-link ext-link-type="uri" xlink:href="https://sourceforge.net/projects/bbmap/" xlink:type="simple">https://sourceforge.net/projects/bbmap/</ext-link>). When not specified, as in this study, extension defaults to 50 bp. Extension will stop prematurely if a branch <italic>k</italic>-mer is encountered, or <italic>k</italic>-mer depth drops below a set threshold, so extension may not reach the full length specified by the user.
<list list-type="alpha-lower">
<list-item><p>A “branch <italic>k</italic>-mer” is a <italic>k</italic>-mer with more than one possible next <italic>k</italic>-mer. They are identified based on BBMerge’s optional Tadpole-specific parameters.</p></list-item>
<list-item><p>If extension completely fails such that neither read is extended by at least one base, insert size <bold><italic>S</italic></bold><sub><bold><italic>0</italic></bold></sub> is used regardless of mode and subsequent steps are skipped.</p></list-item>
</list></p></list-item>
<list-item><p>If extension was successful, the BBMerge algorithm is applied to the extended reads to obtain a new insert size <bold><italic>S</italic></bold><sub><bold><italic>1</italic></bold></sub>.</p></list-item>
<list-item><p>In REM mode, the alignment is accepted if <bold><italic>S</italic></bold><sub><bold><italic>0</italic></bold></sub> <bold>= <italic>S</italic></bold><sub><bold><italic>1</italic></bold></sub> (<xref ref-type="fig" rid="pone.0185056.g001">Fig 1C</xref>). If there is no <bold><italic>S</italic></bold><sub><bold><italic>0</italic></bold></sub> because overlap failed in step 1, <bold><italic>S</italic></bold><sub><bold><italic>1</italic></bold></sub> will be used (<xref ref-type="fig" rid="pone.0185056.g001">Fig 1D</xref>). If <bold><italic>S</italic></bold><sub><bold><italic>0</italic></bold></sub> and <bold><italic>S</italic></bold><sub><bold><italic>1</italic></bold></sub> exist and <bold><italic>S</italic></bold><sub><bold><italic>0</italic></bold></sub><bold><italic>≠S</italic></bold><sub><bold><italic>1</italic></bold></sub>, the alignment is rejected (<xref ref-type="fig" rid="pone.0185056.g001">Fig 1E</xref>).</p></list-item>
<list-item><p>In RSEM mode, the alignment is exclusively accepted if <bold><italic>S</italic></bold><sub><bold><italic>0</italic></bold></sub> <bold>= <italic>S</italic></bold><sub><bold><italic>1</italic></bold></sub> (<xref ref-type="fig" rid="pone.0185056.g001">Fig 1F</xref>). If <bold><italic>S</italic></bold><sub><bold><italic>0</italic></bold></sub><bold><italic>≠S</italic></bold><sub><bold><italic>1</italic></bold></sub> (<xref ref-type="fig" rid="pone.0185056.g001">Fig 1G</xref>), or if there was no initial overlap detected (<xref ref-type="fig" rid="pone.0185056.g001">Fig 1H</xref>), the alignment is rejected.</p></list-item>
</list>
<fig id="pone.0185056.g002" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0185056.g002</object-id>
<label>Fig 2</label>
<caption>
<title>Relationship between % merged reads and genome coverage.</title>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0185056.g002" xlink:type="simple"/>
</fig>
<p>In practice, REM mode can produce merged reads from initially non-overlapping pairs, with insert size &gt; sum of the read lengths. RSEM will only produce merged reads &lt; sum of the read lengths–a strict subset of the merged reads produced by BBMerge run in pure overlap mode. Requiring that the overlap after extension matches the initial overlap reduces false-positive merges caused by short repeats.</p>
<p>Although <italic>k</italic>-mer-based modes can increase accuracy and merge rates, read processing requires more time and memory in these modes. This memory constraint may hence render <italic>k</italic>-mer modes impractical on very large datasets. Though not evaluated in this study, BBMerge also has additional <italic>k</italic>-mer-related options, “ecct” and “kfilter”. “ecct” enables <italic>k</italic>-mer-based error correction of reads that initially fail to merge; if the reads still fail to merge after correction, the changes are rolled back. This can increase the merge rate in data with many sequencing errors. “kfilter” is a setting applied after a potential overlap is found; if the merged read contains any <italic>k</italic>-mers that were not already present at a specified depth in the original file, the overlap is assumed to be wrong and will be rejected. All <italic>k</italic>-mer-using modes use the same <italic>k</italic>-mer count table, so they can be enabled concurrently without using additional memory, and with little speed impact.</p>
</sec>
<sec id="sec009">
<title>BBMerge threading</title>
<p>BBMerge uses both pipelined and parallel threads to achieve a high degree of scalability. Data is streamed from and to disk during execution, so that BBMerge’s memory requirements (in default overlap mode) are unrelated to the amount of input data. Data is read by one thread per file and packaged into lists of <bold><italic>P</italic></bold> read pairs each (<bold><italic>P</italic></bold> = 200 by default). These lists are added to an ArrayBlockingQueue, a data structure that allows safe concurrent read/write access. A number of parallel worker threads is spawned (controlled by the “t” flag). Each worker fetches a list of reads from the queue; if the queue is empty, it will block until a new list is added. The worker thread will then iterate through the list and attempt to merge each of the read pairs, tracking statistics in thread-local variables, and adding merged reads to a new list. The finished list of merged reads is added to an output ArrayBlockingQueue, which is being fed by all of the worker threads. An output thread pulls lists from this output queue, and writes the reads to disk. The worker threads finish when all reads have been processed. Finally, the master thread summarizes and prints the statistics from the worker threads. As a result, the worker threads do not interfere with memory used by any other thread except when pulling lists from the input queue, or sending lists to the output queue; this means shared memory is only mutated twice per <bold><italic>P</italic></bold> read pairs. Furthermore, <bold><italic>P</italic></bold> can be set to an arbitrarily high value on the command line (with the “readbufferlength” flag), so that distributing and gathering work has minimal negative impact on scalability. Most tools in the BBMap package share this threading design.</p>
</sec>
<sec id="sec010">
<title>Deployment and use</title>
<p>BBMerge is written in Java, with no other dependencies. It is distributed with both the source and precompiled class files, allowing simple deployment and use on any computer supporting Java, from Windows laptops to HPC Linux-based clusters. BBMerge is designed for production use, so to simplify pipeline integration, it supports a wide variety of input and output formats–fasta or fastq; interleaved or dual-file; raw or compressed; encoded in ASCII-33 or ASCII-64, with input format autodetection. It also provides alternative processing modes such as insert-size histogram generation, adapter-sequence detection, and overlap-based error-correction (without merging), allowing its use in situations when paired reads are preferred over merged reads.</p>
</sec>
</sec>
<sec id="sec011" sec-type="conclusions">
<title>Discussion</title>
<p>We tested BBMerge in three modes (default, REM, RSEM) and compared its merging performance with eight other read merging programs (<xref ref-type="table" rid="pone.0185056.t002">Table 2</xref>) using synthetically generated reads from an algae genome, and real-world shotgun metagenomic reads from a prokaryotic mock community (MBARC-26). Merging performance was evaluated based on accuracy, speed and computing efficiency.</p>
<sec id="sec012">
<title>Accuracy of paired-end read merging</title>
<p>BBMerge outperformed all other tools in merging accuracy across the sensitivity curve, with the lowest rate of incorrectly merged reads for any given rate of correctly merged reads, though this difference was more pronounced in the synthetic (<xref ref-type="fig" rid="pone.0185056.g003">Fig 3A</xref>) compared to the real-world data (<xref ref-type="fig" rid="pone.0185056.g003">Fig 3B</xref>). Similarly, BBMerge resulted in the highest correct merge rate (<xref ref-type="fig" rid="pone.0185056.g003">Fig 3</xref>) of all non-<italic>k</italic>-mer-using tools.</p>
<fig id="pone.0185056.g003" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0185056.g003</object-id>
<label>Fig 3</label>
<caption>
<title/>
<p><bold>Comparison of merging accuracy by program using synthetic (A) and shotgun metagenome sequences (B).</bold> Correctly merged reads are defined as % of total input pairs. Program performance at default sensitivity is indicated by a triangle.</p>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0185056.g003" xlink:type="simple"/>
</fig>
<p>Results from the three discussed <italic>k</italic>-mer-utilizing modes are clearly distinguishable from those of the purely overlap-based tools and modes (<xref ref-type="fig" rid="pone.0185056.g003">Fig 3</xref>). BBMerge’s RSEM mode substantially reduced the rate of incorrectly merged reads, while slightly reducing the rate of correctly merged reads. BBMerge’s REM mode, and COPE’s M3 mode, substantially increased correct merge rates compared to the programs’ default modes by merging initially non-overlapping reads (<xref ref-type="fig" rid="pone.0185056.g003">Fig 3</xref>). BBMerge-REM achieved the highest rate of correctly merged reads in the real-world data (77.5%) followed by COPE-M3 (62.1%), and COPE-M3 achieved the highest merge rate in the synthetic data (94.4%) followed by BBMerge-REM (93.8%). Stitch yielded 69.2% incorrectly and 0.8% correctly merged reads in the synthetic data, and 49.1% incorrectly and 0.64% correctly merged reads in the real-world data (<xref ref-type="supplementary-material" rid="pone.0185056.s003">S3 Table</xref>).</p>
</sec>
<sec id="sec013">
<title>Speed and scalability of paired-end read merging</title>
<p>Merging speeds were evaluated using the real-world metagenome reads and programs set to default sensitivity. Multi-threaded programs were allowed to use all 32 available threads. Compared to the other merging tools, BBMerge and FLASH were substantially faster, although we found that USEARCH, PEAR, BBMerge REM/RSEM, and fastq-join can all merge large datasets within reasonable timescales (<xref ref-type="fig" rid="pone.0185056.g004">Fig 4</xref>). Based on the performance on our shotgun sequence datasets, XORRO, COPE, leeHom and Stitch were projected to require &gt;1 day to process a 500 Gbp dataset.</p>
<fig id="pone.0185056.g004" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0185056.g004</object-id>
<label>Fig 4</label>
<caption>
<title>Speed comparison by program of shotgun metagenome sequences.</title>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0185056.g004" xlink:type="simple"/>
</fig>
<p>BBMerge variants, PEAR, and Stitch exhibited near-perfect scaling in these tests, and are expected to continue scaling past 32 threads if run on a system with more CPU cores (<xref ref-type="fig" rid="pone.0185056.g005">Fig 5</xref>). FLASH scaled linearly to 6 threads, at which point speed plateaued. leeHom scaled to a peak at 4 threads, after which speed slightly declined. USEARCH also reached a peak at ~4 threads, but did not scale as well; 4-threaded speed was only 150% of single-threaded speed, rather than an ideal 400%. Subsequently, USEARCH’s performance declined, ending at 85% of its peak speed at the maximum of 32 threads. Single-threaded programs (fastq-join, XORRO, and COPE) are each represented by a single point.</p>
<fig id="pone.0185056.g005" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0185056.g005</object-id>
<label>Fig 5</label>
<caption>
<title>Scalability of each program, determined by measuring speed using various numbers of threads.</title>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0185056.g005" xlink:type="simple"/>
</fig>
</sec>
<sec id="sec014">
<title>Assembly quality following read merging</title>
<p>Assembly quality was evaluated with QUAST; we report here assembly continuity (NA50), genome completeness, misassemblies, and indels as defined in [<xref ref-type="bibr" rid="pone.0185056.ref013">13</xref>] (<xref ref-type="table" rid="pone.0185056.t003">Table 3</xref>, <xref ref-type="supplementary-material" rid="pone.0185056.s004">S4 Table</xref>). Gurevich <italic>et al</italic>. [<xref ref-type="bibr" rid="pone.0185056.ref013">13</xref>] defined NA50 as the length at which the collection of all reference-aligned contigs, of that length or longer, contains at least half of the assembled bases. Merged reads were generally characterized by substantially improved assembly continuity compared to the raw data (<xref ref-type="table" rid="pone.0185056.t003">Table 3</xref>, <xref ref-type="fig" rid="pone.0185056.g006">Fig 6A</xref>), with BBMerge-REM reaching a nearly two-fold increase in NA50 (119 kbp compared to 60 kbp). BBMerge-RSEM, BBMerge, USEARCH, and leeHom resulted in similar NA50 metrics (101–104 kbp). The NA50 achieved with the remaining programs ranged from 61 kbp (PEAR) to 98 kbp (COPE-M3), aside from Stitch at 5.6 kbp. The raw data resulted in a total misassembly count of 119. Only BBMerge-RSEM and BBMerge-REM reduced this count, to 115 and 117, respectively (<xref ref-type="table" rid="pone.0185056.t003">Table 3</xref>, <xref ref-type="fig" rid="pone.0185056.g006">Fig 6B</xref>). The studied merge tools fell into 3 misassembly-count clusters: BBMerge variants and USEARCH ranged from 115 to 131; XORRO, fastq-join, COPE-M3, FLASH, leeHom, and COPE ranged from 158 to 294; and PEAR and Stitch resulted in 660 and 20,986 misassemblies, respectively.</p>
<fig id="pone.0185056.g006" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0185056.g006</object-id>
<label>Fig 6</label>
<caption>
<title>NA50 length and misassembly rates for a SPAdes assembly of each program’s output at default settings.</title>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0185056.g006" xlink:type="simple"/>
</fig>
<table-wrap id="pone.0185056.t003" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0185056.t003</object-id>
<label>Table 3</label> <caption><title>Assembly metrics reported by QUAST for SPAdes metagenomic assemblies.</title></caption>
<alternatives>
<graphic id="pone.0185056.t003g" mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0185056.t003" xlink:type="simple"/>
<table>
<colgroup>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
</colgroup>
<thead>
<tr>
<th align="left">Program</th>
<th align="left">NA50 (bp)</th>
<th align="left">Total Misassemblies</th>
<th align="left">Indels/ 100 kbp</th>
<th align="left">Genome Completeness (%)</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left">Raw Data</td>
<td align="left">60007</td>
<td align="left">119</td>
<td align="left">1.13</td>
<td align="left">84.5</td>
</tr>
<tr>
<td align="left">BBMerge</td>
<td align="left">102577</td>
<td align="left">127</td>
<td align="left">0.84</td>
<td align="left">84.88</td>
</tr>
<tr>
<td align="left">BBMerge-REM</td>
<td align="left">119328</td>
<td align="left">117</td>
<td align="left">0.81</td>
<td align="left">85.18</td>
</tr>
<tr>
<td align="left">BBMerge-RSEM</td>
<td align="left">104441</td>
<td align="left">115</td>
<td align="left">0.84</td>
<td align="left">84.88</td>
</tr>
<tr>
<td align="left">COPE</td>
<td align="left">89603</td>
<td align="left">294</td>
<td align="left">1.52</td>
<td align="left">85.17</td>
</tr>
<tr>
<td align="left">COPE-M3</td>
<td align="left">98240</td>
<td align="left">227</td>
<td align="left">1.24</td>
<td align="left">83.92</td>
</tr>
<tr>
<td align="left">fastq-join</td>
<td align="left">80672</td>
<td align="left">183</td>
<td align="left">1.17</td>
<td align="left">84.74</td>
</tr>
<tr>
<td align="left">FLASH</td>
<td align="left">94846</td>
<td align="left">282</td>
<td align="left">1.41</td>
<td align="left">85.20</td>
</tr>
<tr>
<td align="left">leeHom</td>
<td align="left">101992</td>
<td align="left">290</td>
<td align="left">1.1</td>
<td align="left">84.91</td>
</tr>
<tr>
<td align="left">PEAR</td>
<td align="left">60937</td>
<td align="left">660</td>
<td align="left">1.46</td>
<td align="left">84.28</td>
</tr>
<tr>
<td align="left">Stitch</td>
<td align="left">5623</td>
<td align="left">20986</td>
<td align="left">47.78</td>
<td align="left">68.38</td>
</tr>
<tr>
<td align="left">USEARCH</td>
<td align="left">102156</td>
<td align="left">131</td>
<td align="left">0.88</td>
<td align="left">84.77</td>
</tr>
<tr>
<td align="left">XORRO</td>
<td align="left">97403</td>
<td align="left">158</td>
<td align="left">1.08</td>
<td align="left">84.85</td>
</tr>
</tbody>
</table>
</alternatives>
</table-wrap>
<p>Indel rates are noted because they can induce frameshifts, which disrupt gene annotation. BBMerge variants and USEARCH clustered together closely, with rates ranging from 0.81 (BBMerge-REM) to 0.88 (USEARCH) indels per 100 kbp (<xref ref-type="table" rid="pone.0185056.t003">Table 3</xref>). The other tools yielded rates ranging from 1.08 (XORRO) to 1.52 (COPE), except for Stitch (47.78 per 100 kbp). The raw data yielded 1.13 indels per 100 kbp. The fraction of reference bases covered by assemblies exhibited a narrow range from 83.9% (COPE-M3) to 85.2% (FLASH), aside from Stitch at 68.4% (<xref ref-type="table" rid="pone.0185056.t003">Table 3</xref>). All tools except PEAR, COPE-M3, and Stitch exceeded the 84.5% genome coverage of the raw read assembly. BBMerge-REM outperformed BBMerge in every assembly metric, but COPE-M3’s performance relative to COPE was more nuanced: COPE-M3 had a greater NA50 and fewer misassemblies and indels, but a 1.2% lower genome recovery than COPE.</p>
</sec>
</sec>
<sec id="sec015" sec-type="conclusions">
<title>Conclusion</title>
<p>Correctly merged shotgun reads can improve the performance of applications that benefit from longer reads, yet erroneously merged reads can create serious issues due to the introduction of new errors, a concern that is not present for other common preprocessing steps such as quality-trimming. Even at a low rate, the addition of incorrectly merged reads can cause misassemblies and reduced assembly contiguity compared to unmerged or correctly merged data (<xref ref-type="fig" rid="pone.0185056.g006">Fig 6</xref>). It is this possibility of introducing new errors that renders merging especially sensitive to accuracy.</p>
<p>Since BBMerge has been developed primarily as a tool to aid in clustering and de-novo assembly of shotgun metagenome sequence data, minimizing the false-positive merge rate has been considered paramount. Our data indicates that BBMerge successfully minimized the false-positive rate when merging shotgun reads from synthetic and real-world datasets, and was able to improve assembly quality by increasing continuity while reducing the number of misassemblies. Its ability to achieve maximal accuracy while scaling near-linearly to reach the highest speed of the compared software makes BBMerge a promising tool for improving the assembly of large datasets such as shotgun metagenomes.</p>
</sec>
<sec id="sec016">
<title>Supporting information</title>
<supplementary-material id="pone.0185056.s001" mimetype="application/msword" position="float" xlink:href="info:doi/10.1371/journal.pone.0185056.s001" xlink:type="simple">
<label>S1 Table</label>
<caption>
<title>Program command lines.</title>
<p>Non-default parameters are stated in bold letters.</p>
<p>(DOC)</p>
</caption>
</supplementary-material>
<supplementary-material id="pone.0185056.s002" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document" position="float" xlink:href="info:doi/10.1371/journal.pone.0185056.s002" xlink:type="simple">
<label>S2 Table</label>
<caption>
<title>Program sensitivity parameters.</title>
<p>Default settings are stated in bold letters.</p>
<p>(DOCX)</p>
</caption>
</supplementary-material>
<supplementary-material id="pone.0185056.s003" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document" position="float" xlink:href="info:doi/10.1371/journal.pone.0185056.s003" xlink:type="simple">
<label>S3 Table</label>
<caption>
<title/>
<p>Number of correctly and incorrectly merged read pairs, and Signal-Noise Ratio (SNR), from the synthetic <bold>(A)</bold> and real-world <bold>(B)</bold> shotgun datasets by program and sensitivity. All numbers are out of 20,000,000 input read pairs. Defaults are in bold.</p>
<p>(DOCX)</p>
</caption>
</supplementary-material>
<supplementary-material id="pone.0185056.s004" mimetype="application/msword" position="float" xlink:href="info:doi/10.1371/journal.pone.0185056.s004" xlink:type="simple">
<label>S4 Table</label>
<caption>
<title>Assembly report by program.</title>
<p>(DOC)</p>
</caption>
</supplementary-material>
</sec>
</body>
<back>
<ack>
<p>We thank Bill Andreopoulos, Alex Copeland, Robert Egan, Bryce Foster, Douglas Jacobsen, Elmar Pruesse, Adam Rivers, Axel Visel, Zhong Wang, and Tanja Woyke for valuable comments and suggestions. This work was conducted by the U.S. Department of Energy Joint Genome Institute, a DOE Office of Science User Facility, and is supported under Contract No. DE-AC02-05CH11231.</p>
</ack>
<ref-list>
<title>References</title>
<ref id="pone.0185056.ref001"><label>1</label><mixed-citation publication-type="book" xlink:type="simple"><name name-style="western"><surname>Berka</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Chen</surname> <given-names>Z</given-names></name>, <name name-style="western"><surname>Egholm</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Godwin</surname> <given-names>BC</given-names></name>. <source>Paired end sequencing</source>. <publisher-name>US Patent Office</publisher-name>; <year>2009</year>.</mixed-citation></ref>
<ref id="pone.0185056.ref002"><label>2</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Singer</surname> <given-names>E</given-names></name>, <name name-style="western"><surname>Andreopoulos</surname> <given-names>B</given-names></name>, <name name-style="western"><surname>Bowers</surname> <given-names>RM</given-names></name>, <name name-style="western"><surname>Lee</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Deshpande</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Chiniquy</surname> <given-names>J</given-names></name>, <etal>et al</etal>. <article-title>Next generation sequencing data of a defined microbial mock community</article-title>. <source>Scientific Data</source>. <year>2016</year>;<volume>3</volume>: <fpage>160081</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1038/sdata.2016.81" xlink:type="simple">10.1038/sdata.2016.81</ext-link></comment> <object-id pub-id-type="pmid">27673566</object-id></mixed-citation></ref>
<ref id="pone.0185056.ref003"><label>3</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Lander</surname> <given-names>ES</given-names></name>, <name name-style="western"><surname>Linton</surname> <given-names>LM</given-names></name>, <name name-style="western"><surname>Birren</surname> <given-names>B</given-names></name>, <name name-style="western"><surname>Nusbaum</surname> <given-names>C</given-names></name>, <name name-style="western"><surname>Zody</surname> <given-names>MC</given-names></name>, <name name-style="western"><surname>Baldwin</surname> <given-names>J</given-names></name>, <etal>et al</etal>. <article-title>Initial sequencing and analysis of the human genome</article-title>. <source>Nature</source>. Nature Publishing Group; <year>2001</year>;<volume>409</volume>: <fpage>860</fpage>–<lpage>921</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1038/35057062" xlink:type="simple">10.1038/35057062</ext-link></comment> <object-id pub-id-type="pmid">11237011</object-id></mixed-citation></ref>
<ref id="pone.0185056.ref004"><label>4</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Singer</surname> <given-names>E</given-names></name>, <name name-style="western"><surname>Andreopoulos</surname> <given-names>B</given-names></name>, <name name-style="western"><surname>Bowers</surname> <given-names>RM</given-names></name>, <name name-style="western"><surname>Lee</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Deshpande</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Chiniquy</surname> <given-names>J</given-names></name>, <etal>et al</etal>. <article-title>Next generation sequencing data of a defined microbial mock community</article-title>. <source>Scientific Data</source>. <year>2016</year>;<volume>3</volume>: <fpage>160081</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1038/sdata.2016.81" xlink:type="simple">10.1038/sdata.2016.81</ext-link></comment> <object-id pub-id-type="pmid">27673566</object-id></mixed-citation></ref>
<ref id="pone.0185056.ref005"><label>5</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Ng</surname> <given-names>P</given-names></name>, <name name-style="western"><surname>Wei</surname> <given-names>C-L</given-names></name>, <name name-style="western"><surname>Sung</surname> <given-names>W-K</given-names></name>, <name name-style="western"><surname>Chiu</surname> <given-names>KP</given-names></name>, <name name-style="western"><surname>Lipovich</surname> <given-names>L</given-names></name>, <name name-style="western"><surname>Ang</surname> <given-names>CC</given-names></name>, <etal>et al</etal>. <article-title>Gene identification signature (GIS) analysis for transcriptome characterization and genome annotation</article-title>. <source>Nat Meth</source>. <year>2005</year>;<volume>2</volume>: <fpage>105</fpage>–<lpage>111</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1038/nmeth733" xlink:type="simple">10.1038/nmeth733</ext-link></comment> <object-id pub-id-type="pmid">15782207</object-id></mixed-citation></ref>
<ref id="pone.0185056.ref006"><label>6</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Shendure</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Porreca</surname> <given-names>GJ</given-names></name>, <name name-style="western"><surname>Reppas</surname> <given-names>NB</given-names></name>, <name name-style="western"><surname>Lin</surname> <given-names>X</given-names></name>. <article-title>Accurate multiplex polony sequencing of an evolved bacterial genome</article-title>. <source>Science</source>. <year>2005</year>;<volume>309</volume>: <fpage>1728</fpage>–<lpage>1732</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1126/science.1117389" xlink:type="simple">10.1126/science.1117389</ext-link></comment> <object-id pub-id-type="pmid">16081699</object-id></mixed-citation></ref>
<ref id="pone.0185056.ref007"><label>7</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Dunn</surname> <given-names>JJ</given-names></name>, <name name-style="western"><surname>McCorkle</surname> <given-names>SR</given-names></name>, <name name-style="western"><surname>Everett</surname> <given-names>L</given-names></name>, <name name-style="western"><surname>Anderson</surname> <given-names>CW</given-names></name>. <article-title>Paired-end genomic signature tags: a method for the functional analysis of genomes and epigenomes</article-title>. <source>Genet Eng (NY)</source>. <year>2007</year>;<volume>28</volume>: <fpage>159</fpage>–<lpage>173</lpage>.</mixed-citation></ref>
<ref id="pone.0185056.ref008"><label>8</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Korbel</surname> <given-names>JO</given-names></name>, <name name-style="western"><surname>Urban</surname> <given-names>AE</given-names></name>, <name name-style="western"><surname>Affourtit</surname> <given-names>JP</given-names></name>, <name name-style="western"><surname>Godwin</surname> <given-names>B</given-names></name>, <name name-style="western"><surname>Grubert</surname> <given-names>F</given-names></name>. <article-title>Paired-end mapping reveals extensive structural variation in the human genome</article-title>. <source>Science</source>. <year>2007</year>;<volume>318</volume>: <fpage>420</fpage>–<lpage>426</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1126/science.1149504" xlink:type="simple">10.1126/science.1149504</ext-link></comment> <object-id pub-id-type="pmid">17901297</object-id></mixed-citation></ref>
<ref id="pone.0185056.ref009"><label>9</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Chen</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Kim</surname> <given-names>YC</given-names></name>, <name name-style="western"><surname>Jung</surname> <given-names>YC</given-names></name>, <name name-style="western"><surname>Xuan</surname> <given-names>Z</given-names></name>, <name name-style="western"><surname>Dworkin</surname> <given-names>G</given-names></name>, <name name-style="western"><surname>Zhang</surname> <given-names>Y</given-names></name>, <etal>et al</etal>. <article-title>Scanning the human genome at kilobase resolution</article-title>. <source>Genome Research</source>. <year>2008</year>;<volume>18</volume>: <fpage>751</fpage>–<lpage>762</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1101/gr.068304.107" xlink:type="simple">10.1101/gr.068304.107</ext-link></comment> <object-id pub-id-type="pmid">18292219</object-id></mixed-citation></ref>
<ref id="pone.0185056.ref010"><label>10</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Holt</surname> <given-names>RA</given-names></name>, <name name-style="western"><surname>Jones</surname> <given-names>SJM</given-names></name>. <article-title>The new paradigm of flow cell sequencing</article-title>. <source>Genome Research</source>. <year>2008</year>;<volume>18</volume>: <fpage>839</fpage>–<lpage>846</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1101/gr.073262.107" xlink:type="simple">10.1101/gr.073262.107</ext-link></comment> <object-id pub-id-type="pmid">18519653</object-id></mixed-citation></ref>
<ref id="pone.0185056.ref011"><label>11</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Markowitz</surname> <given-names>VM</given-names></name>, <name name-style="western"><surname>Ivanova</surname> <given-names>NN</given-names></name>, <name name-style="western"><surname>Szeto</surname> <given-names>E</given-names></name>, <name name-style="western"><surname>Palaniappan</surname> <given-names>K</given-names></name>, <name name-style="western"><surname>Chu</surname> <given-names>K</given-names></name>, <name name-style="western"><surname>Dalevi</surname> <given-names>D</given-names></name>, <etal>et al</etal>. <article-title>IMG/M: a data management and analysis system for metagenomes</article-title>. <source>Nucleic Acids Research</source>. <year>2007</year>;<volume>36</volume>: <fpage>D534</fpage>–<lpage>D538</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1093/nar/gkm869" xlink:type="simple">10.1093/nar/gkm869</ext-link></comment> <object-id pub-id-type="pmid">17932063</object-id></mixed-citation></ref>
<ref id="pone.0185056.ref012"><label>12</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Bankevich</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Nurk</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Antipov</surname> <given-names>D</given-names></name>, <name name-style="western"><surname>Gurevich</surname> <given-names>AA</given-names></name>, <name name-style="western"><surname>Dvorkin</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Kulikov</surname> <given-names>AS</given-names></name>, <etal>et al</etal>. <article-title>SPAdes: A New Genome Assembly Algorithm and Its Applications to Single-Cell Sequencing</article-title>. <source>Journal of Computational Biology</source>. <year>2012</year>;<volume>19</volume>: <fpage>455</fpage>–<lpage>477</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1089/cmb.2012.0021" xlink:type="simple">10.1089/cmb.2012.0021</ext-link></comment> <object-id pub-id-type="pmid">22506599</object-id></mixed-citation></ref>
<ref id="pone.0185056.ref013"><label>13</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Gurevich</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Saveliev</surname> <given-names>V</given-names></name>, <name name-style="western"><surname>Vyahhi</surname> <given-names>N</given-names></name>, <name name-style="western"><surname>Tesler</surname> <given-names>G</given-names></name>. <article-title>QUAST: quality assessment tool for genome assemblies</article-title>. <source>Bioinformatics</source>. <year>2013</year>;<volume>29</volume>: <fpage>1072</fpage>–<lpage>1075</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1093/bioinformatics/btt086" xlink:type="simple">10.1093/bioinformatics/btt086</ext-link></comment> <object-id pub-id-type="pmid">23422339</object-id></mixed-citation></ref>
<ref id="pone.0185056.ref014"><label>14</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Magoc</surname> <given-names>T</given-names></name>, <name name-style="western"><surname>Salzberg</surname> <given-names>SL</given-names></name>. <article-title>FLASH: fast length adjustment of short reads to improve genome assemblies</article-title>. <source>Bioinformatics</source>. <year>2011</year>;<volume>27</volume>: <fpage>2957</fpage>–<lpage>2963</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1093/bioinformatics/btr507" xlink:type="simple">10.1093/bioinformatics/btr507</ext-link></comment> <object-id pub-id-type="pmid">21903629</object-id></mixed-citation></ref>
<ref id="pone.0185056.ref015"><label>15</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Edgar</surname> <given-names>RC</given-names></name>. <article-title>Search and clustering orders of magnitude faster than BLAST</article-title>. <source>Bioinformatics</source>. <year>2010</year>;<volume>26</volume>: <fpage>2460</fpage>–<lpage>2461</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1093/bioinformatics/btq461" xlink:type="simple">10.1093/bioinformatics/btq461</ext-link></comment> <object-id pub-id-type="pmid">20709691</object-id></mixed-citation></ref>
<ref id="pone.0185056.ref016"><label>16</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Dickson</surname> <given-names>RJ</given-names></name>, <name name-style="western"><surname>Gloor</surname> <given-names>GB</given-names></name>. <article-title>XORRO: Rapid Paired-End Read Overlapper</article-title>. <source>arXiv</source>. <year>2013</year>;<volume>1304</volume>.<fpage>4620</fpage>.</mixed-citation></ref>
<ref id="pone.0185056.ref017"><label>17</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Zhang</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Kobert</surname> <given-names>K</given-names></name>, <name name-style="western"><surname>Flouri</surname> <given-names>T</given-names></name>, <name name-style="western"><surname>Stamatakis</surname> <given-names>A</given-names></name>. <article-title>PEAR: a fast and accurate Illumina Paired-End reAd mergeR</article-title>. <source>Bioinformatics</source>. <year>2014</year>;<volume>30</volume>: <fpage>614</fpage>–<lpage>620</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1093/bioinformatics/btt593" xlink:type="simple">10.1093/bioinformatics/btt593</ext-link></comment> <object-id pub-id-type="pmid">24142950</object-id></mixed-citation></ref>
<ref id="pone.0185056.ref018"><label>18</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Liu</surname> <given-names>B</given-names></name>, <name name-style="western"><surname>Yuan</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Yiu</surname> <given-names>SM</given-names></name>, <name name-style="western"><surname>Li</surname> <given-names>Z</given-names></name>, <name name-style="western"><surname>Xie</surname> <given-names>Y</given-names></name>, <name name-style="western"><surname>Chen</surname> <given-names>Y</given-names></name>, <etal>et al</etal>. <article-title>COPE: an accurate k-mer-based pair-end reads connection tool to facilitate genome assembly</article-title>. <source>Bioinformatics</source>. <year>2012</year>;<volume>28</volume>: <fpage>2870</fpage>–<lpage>2874</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1093/bioinformatics/bts563" xlink:type="simple">10.1093/bioinformatics/bts563</ext-link></comment> <object-id pub-id-type="pmid">23044551</object-id></mixed-citation></ref>
<ref id="pone.0185056.ref019"><label>19</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Aronesty</surname> <given-names>E</given-names></name>. <article-title>Comparison of sequencing utility programs</article-title>. <source>The Open Bioinformatics Journal</source>. <year>2013</year>.</mixed-citation></ref>
<ref id="pone.0185056.ref020"><label>20</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Renaud</surname> <given-names>G</given-names></name>, <name name-style="western"><surname>Stenzel</surname> <given-names>U</given-names></name>, <name name-style="western"><surname>Kelso</surname> <given-names>J</given-names></name>. <article-title>leeHom: adaptor trimming and merging for Illumina sequencing reads</article-title>. <source>Nucleic Acids Research</source>. <year>2014</year>;<volume>42</volume>: <fpage>e141</fpage>–<lpage>e141</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1093/nar/gku699" xlink:type="simple">10.1093/nar/gku699</ext-link></comment> <object-id pub-id-type="pmid">25100869</object-id></mixed-citation></ref>
<ref id="pone.0185056.ref021"><label>21</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Brown</surname> <given-names>CT</given-names></name>, <name name-style="western"><surname>Davis-Richardson</surname> <given-names>AG</given-names></name>, <name name-style="western"><surname>Giongo</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Gano</surname> <given-names>KA</given-names></name>, <name name-style="western"><surname>Crabb</surname> <given-names>DB</given-names></name>, <name name-style="western"><surname>Mukherjee</surname> <given-names>N</given-names></name>, <etal>et al</etal>. <article-title>Gut Microbiome Metagenomics Analysis Suggests a Functional Model for the Development of Autoimmunity for Type 1 Diabetes</article-title>. <name name-style="western"><surname>Roop</surname> <given-names>RM</given-names></name>, editor. <source>PLoS ONE</source>. <year>2011</year>;<volume>6</volume>: <fpage>e25792</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1371/journal.pone.0025792" xlink:type="simple">10.1371/journal.pone.0025792</ext-link></comment> <object-id pub-id-type="pmid">22043294</object-id></mixed-citation></ref>
<ref id="pone.0185056.ref022"><label>22</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Masella</surname> <given-names>AP</given-names></name>, <name name-style="western"><surname>Bartram</surname> <given-names>AK</given-names></name>, <name name-style="western"><surname>Truszkowski</surname> <given-names>JM</given-names></name>, <name name-style="western"><surname>Brown</surname> <given-names>DG</given-names></name>, <name name-style="western"><surname>Neufeld</surname> <given-names>JD</given-names></name>. <article-title>PANDAseq: PAired-eND Assembler for Illumina sequences</article-title>. <source>BMC Bioinformatics</source>. BioMed Central Ltd; <year>2012</year>;<volume>13</volume>: <fpage>1</fpage>–<lpage>7</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1186/1471-2105-13-31" xlink:type="simple">10.1186/1471-2105-13-31</ext-link></comment></mixed-citation></ref>
<ref id="pone.0185056.ref023"><label>23</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Silver</surname> <given-names>DH</given-names></name>, <name name-style="western"><surname>Ben-Elazar</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Bogoslavsky</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Yanai</surname> <given-names>I</given-names></name>. <article-title>ELOPER: elongation of paired-end reads as a pre-processing tool for improved de novo genome assembly</article-title>. <source>Bioinformatics</source>. <year>2013</year>;<volume>29</volume>: <fpage>1455</fpage>–<lpage>1457</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1093/bioinformatics/btt169" xlink:type="simple">10.1093/bioinformatics/btt169</ext-link></comment> <object-id pub-id-type="pmid">23603334</object-id></mixed-citation></ref>
</ref-list>
</back>
</article>