<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1d3 20150301//EN" "http://jats.nlm.nih.gov/publishing/1.1d3/JATS-journalpublishing1.dtd">
<article article-type="research-article" dtd-version="1.1d3" xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="nlm-ta">PLoS Comput Biol</journal-id>
<journal-id journal-id-type="publisher-id">plos</journal-id>
<journal-id journal-id-type="pmc">ploscomp</journal-id>
<journal-title-group>
<journal-title>PLOS Computational Biology</journal-title>
</journal-title-group>
<issn pub-type="ppub">1553-734X</issn>
<issn pub-type="epub">1553-7358</issn>
<publisher>
<publisher-name>Public Library of Science</publisher-name>
<publisher-loc>San Francisco, CA USA</publisher-loc>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">PCOMPBIOL-D-18-00852</article-id>
<article-id pub-id-type="doi">10.1371/journal.pcbi.1006583</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Research Article</subject>
</subj-group>
<subj-group subj-group-type="Discipline-v3"><subject>Computer and information sciences</subject><subj-group><subject>Neural networks</subject></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Biology and life sciences</subject><subj-group><subject>Neuroscience</subject><subj-group><subject>Neural networks</subject></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Biology and life sciences</subject><subj-group><subject>Molecular biology</subject><subj-group><subject>Molecular biology techniques</subject><subj-group><subject>DNA barcoding</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Research and analysis methods</subject><subj-group><subject>Molecular biology techniques</subject><subj-group><subject>DNA barcoding</subject></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Biology and life sciences</subject><subj-group><subject>Evolutionary biology</subject><subj-group><subject>Evolutionary systematics</subject><subj-group><subject>Molecular systematics</subject><subj-group><subject>DNA barcoding</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Biology and life sciences</subject><subj-group><subject>Taxonomy</subject><subj-group><subject>Evolutionary systematics</subject><subj-group><subject>Molecular systematics</subject><subj-group><subject>DNA barcoding</subject></subj-group></subj-group></subj-group></subj-group></subj-group>
<subj-group subj-group-type="Discipline-v3"><subject>Computer and information sciences</subject><subj-group><subject>Data management</subject><subj-group><subject>Taxonomy</subject><subj-group><subject>Evolutionary systematics</subject><subj-group><subject>Molecular systematics</subject><subj-group><subject>DNA barcoding</subject></subj-group></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Physical sciences</subject><subj-group><subject>Mathematics</subject><subj-group><subject>Statistics</subject><subj-group><subject>Statistical noise</subject><subj-group><subject>Gaussian noise</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Biology and life sciences</subject><subj-group><subject>Computational biology</subject><subj-group><subject>Genome analysis</subject><subj-group><subject>Genomic libraries</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Genomics</subject><subj-group><subject>Genome analysis</subject><subj-group><subject>Genomic libraries</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Research and analysis methods</subject><subj-group><subject>Database and informatics methods</subject><subj-group><subject>Bioinformatics</subject><subj-group><subject>Sequence analysis</subject><subj-group><subject>Sequence alignment</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Biology and life sciences</subject><subj-group><subject>Molecular biology</subject><subj-group><subject>Molecular biology techniques</subject><subj-group><subject>Sequencing techniques</subject><subj-group><subject>DNA sequencing</subject></subj-group></subj-group></subj-group></subj-group></subj-group>
<subj-group subj-group-type="Discipline-v3"><subject>Research and analysis methods</subject><subj-group><subject>Molecular biology techniques</subject><subj-group><subject>Sequencing techniques</subject><subj-group><subject>DNA sequencing</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Biology and life sciences</subject><subj-group><subject>Molecular biology</subject><subj-group><subject>Molecular biology techniques</subject><subj-group><subject>Sequencing techniques</subject><subj-group><subject>Genome sequencing</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Research and analysis methods</subject><subj-group><subject>Molecular biology techniques</subject><subj-group><subject>Sequencing techniques</subject><subj-group><subject>Genome sequencing</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>DNA</subject><subj-group><subject>DNA libraries</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3"><subject>Biology and life sciences</subject><subj-group><subject>Biochemistry</subject><subj-group><subject>Nucleic acids</subject><subj-group><subject>DNA</subject><subj-group><subject>DNA libraries</subject></subj-group></subj-group></subj-group></subj-group></subj-group></article-categories>
<title-group>
<article-title>Deepbinner: Demultiplexing barcoded Oxford Nanopore reads with deep convolutional neural networks</article-title>
<alt-title alt-title-type="running-head">Deepbinner: Demultiplexing Nanopore reads with deep convolutional neural networks</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">http://orcid.org/0000-0001-8349-0778</contrib-id>
<name name-style="western">
<surname>Wick</surname> <given-names>Ryan R.</given-names></name>
<role content-type="http://credit.casrai.org/">Conceptualization</role>
<role content-type="http://credit.casrai.org/">Data curation</role>
<role content-type="http://credit.casrai.org/">Formal analysis</role>
<role content-type="http://credit.casrai.org/">Investigation</role>
<role content-type="http://credit.casrai.org/">Methodology</role>
<role content-type="http://credit.casrai.org/">Software</role>
<role content-type="http://credit.casrai.org/">Writing – original draft</role>
<role content-type="http://credit.casrai.org/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"/>
<xref ref-type="corresp" rid="cor001">*</xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">http://orcid.org/0000-0003-3613-4839</contrib-id>
<name name-style="western">
<surname>Judd</surname> <given-names>Louise M.</given-names></name>
<role content-type="http://credit.casrai.org/">Conceptualization</role>
<role content-type="http://credit.casrai.org/">Investigation</role>
<role content-type="http://credit.casrai.org/">Methodology</role>
<role content-type="http://credit.casrai.org/">Resources</role>
<role content-type="http://credit.casrai.org/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"/>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">http://orcid.org/0000-0003-3949-2471</contrib-id>
<name name-style="western">
<surname>Holt</surname> <given-names>Kathryn E.</given-names></name>
<role content-type="http://credit.casrai.org/">Conceptualization</role>
<role content-type="http://credit.casrai.org/">Funding acquisition</role>
<role content-type="http://credit.casrai.org/">Supervision</role>
<role content-type="http://credit.casrai.org/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"/>
</contrib>
</contrib-group>
<aff id="aff001">
<addr-line>Department of Biochemistry and Molecular Biology, Bio21 Molecular Science and Biotechnology Institute, University of Melbourne, Parkville, Victoria, Australia</addr-line>
</aff>
<contrib-group>
<contrib contrib-type="editor" xlink:type="simple">
<name name-style="western">
<surname>Pertea</surname> <given-names>Mihaela</given-names></name>
<role>Editor</role>
<xref ref-type="aff" rid="edit1"/>
</contrib>
</contrib-group>
<aff id="edit1">
<addr-line>Johns Hopkins University, UNITED STATES</addr-line>
</aff>
<author-notes>
<fn fn-type="conflict" id="coi001">
<p>The authors have declared that no competing interests exist.</p>
</fn>
<corresp id="cor001">* E-mail: <email xlink:type="simple">rrwick@gmail.com</email></corresp>
</author-notes>
<pub-date pub-type="collection">
<month>11</month>
<year>2018</year>
</pub-date>
<pub-date pub-type="epub">
<day>20</day>
<month>11</month>
<year>2018</year>
</pub-date>
<volume>14</volume>
<issue>11</issue>
<elocation-id>e1006583</elocation-id>
<history>
<date date-type="received">
<day>25</day>
<month>5</month>
<year>2018</year>
</date>
<date date-type="accepted">
<day>15</day>
<month>10</month>
<year>2018</year>
</date>
</history>
<permissions>
<copyright-year>2018</copyright-year>
<copyright-holder>Wick et al</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">
<license-p>This is an open access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">Creative Commons Attribution License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.</license-p>
</license>
</permissions>
<self-uri content-type="pdf" xlink:href="info:doi/10.1371/journal.pcbi.1006583"/>
<abstract>
<p>Multiplexing, the simultaneous sequencing of multiple barcoded DNA samples on a single flow cell, has made Oxford Nanopore sequencing cost-effective for small genomes. However, it depends on the ability to sort the resulting sequencing reads by barcode, and current demultiplexing tools fail to classify many reads. Here we present Deepbinner, a tool for Oxford Nanopore demultiplexing that uses a deep neural network to classify reads based on the raw electrical read signal. This ‘signal-space’ approach allows for greater accuracy than existing ‘base-space’ tools (Albacore and Porechop) for which signals must first be converted to DNA base calls, itself a complex problem that can introduce noise into the barcode sequence. To assess Deepbinner and existing tools, we performed multiplex sequencing on 12 amplicons chosen for their distinguishability. This allowed us to establish a ground truth classification for each read based on internal sequence alone. Deepbinner had the lowest rate of unclassified reads (7.8%) and the highest demultiplexing precision (98.5% of classified reads were correctly assigned). It can be used alone (to maximise the number of classified reads) or in conjunction with other demultiplexers (to maximise precision and minimise false positive classifications). We also found cross-sample chimeric reads (0.3%) and evidence of barcode switching (0.3%) in our dataset, which likely arise during library preparation and may be detrimental for quantitative studies that use multiplexing. Deepbinner is open source (GPLv3) and available at <ext-link ext-link-type="uri" xlink:href="https://github.com/rrwick/Deepbinner" xlink:type="simple">https://github.com/rrwick/Deepbinner</ext-link>.</p>
</abstract>
<funding-group>
<award-group id="award001">
<funding-source>
<institution-wrap>
<institution-id institution-id-type="funder-id">http://dx.doi.org/10.13039/100000865</institution-id>
<institution>Bill and Melinda Gates Foundation</institution>
</institution-wrap>
</funding-source>
<award-id>OPP1175797</award-id>
<principal-award-recipient>
<contrib-id authenticated="true" contrib-id-type="orcid">http://orcid.org/0000-0001-8349-0778</contrib-id>
<name name-style="western">
<surname>Wick</surname> <given-names>Ryan R.</given-names></name>
</principal-award-recipient>
</award-group>
<award-group id="award002">
<funding-source>
<institution-wrap>
<institution-id institution-id-type="funder-id">http://dx.doi.org/10.13039/100008717</institution-id>
<institution>Sylvia and Charles Viertel Charitable Foundation</institution>
</institution-wrap>
</funding-source>
<principal-award-recipient>
<name name-style="western">
<surname>Holt</surname> <given-names>Kathryn E.</given-names></name>
</principal-award-recipient>
</award-group>
<funding-statement>This work was supported by the Bill and Melinda Gates Foundation, Seattle (grant number OPP1175797). KEH is a Viertel Foundation of Australia Senior Medical Research Fellow. The funders had no role in study design, data collection and analysis, decision to publish, or preparation of the manuscript.</funding-statement>
</funding-group>
<counts>
<fig-count count="1"/>
<table-count count="1"/>
<page-count count="11"/>
</counts>
<custom-meta-group>
<custom-meta id="data-availability">
<meta-name>Data Availability</meta-name>
<meta-value>Supporting data is published on ENA and figshare and is accessible via the following links: <ext-link ext-link-type="uri" xlink:href="https://www.ebi.ac.uk/ena/data/view/PRJEB28450" xlink:type="simple">https://www.ebi.ac.uk/ena/data/view/PRJEB28450</ext-link>, <ext-link ext-link-type="uri" xlink:href="https://figshare.com/projects/Deepbinner/34223" xlink:type="simple">https://figshare.com/projects/Deepbinner/34223</ext-link>.</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<disp-quote><p>This is a <italic>PLOS Computational Biology</italic> Software paper.</p></disp-quote>
<sec id="sec001" sec-type="intro">
<title>Introduction</title>
<sec id="sec002">
<title>Oxford Nanopore barcoding</title>
<p>Multiplexing (barcoding) is a common strategy used to distribute high-throughput DNA sequencing capacity over multiple samples [<xref ref-type="bibr" rid="pcbi.1006583.ref001">1</xref>]. For each input DNA sample, a unique barcode is incorporated into the library of DNA molecules prepared for sequencing. Multiple barcoded DNA libraries can then be combined and sequenced simultaneously on the same flow cell. The resulting reads must then be demultiplexed: sorted into bins according to the barcode sequence. Barcoding has obvious economic advantages, allowing users to divide the fixed cost of a sequencer flow cell over multiple input samples.</p>
<p>When the Oxford Nanopore Technologies (ONT) MinION sequencer was first released, its yield was measured in hundreds of Megabases (Mbp) and effective sequencing of a bacterial genome required an entire flow cell [<xref ref-type="bibr" rid="pcbi.1006583.ref002">2</xref>]. The last four years have seen a nearly 100-fold increase in yield, with 10 Gbp or more now possible from a MinION sequencing run [<xref ref-type="bibr" rid="pcbi.1006583.ref003">3</xref>]. ONT’s native barcoding kit for 1D ligation sequencing (EXP-NBD103) provides 12 barcodes which are ligated onto both ends of the DNA molecules to be sequenced. This kit allows the sequencing capacity of a single MinION run to be distributed across 12 bacterial genomes which can thus be simultaneously sequenced on a single flow cell [<xref ref-type="bibr" rid="pcbi.1006583.ref004">4</xref>].</p>
<p>Each ONT sequencing read is generated as a signal composed of variations in electrical current as the DNA molecule moves through the nanopore. The MinION sequencer measures the current at 4 kHz and the DNA advances at a rate of 450 bases/sec, equating to ∼9 measurements per base, on average. These ‘signal-space’ reads (a.k.a. raw signal) are translated into ‘base-space’ nucleotide sequences by basecalling software [<xref ref-type="bibr" rid="pcbi.1006583.ref005">5</xref>–<xref ref-type="bibr" rid="pcbi.1006583.ref007">7</xref>]. Basecalling is an inexact process and the resulting reads have a per-base error rate of 5–25% [<xref ref-type="bibr" rid="pcbi.1006583.ref008">8</xref>]. This error rate can be a problem for downstream analyses, including current ONT barcode demultiplexing tools, such as Albacore and Porechop, where it is common for more than 20% of the barcoded reads to be unassigned to a bin and therefore unusable [<xref ref-type="bibr" rid="pcbi.1006583.ref004">4</xref>]. Other types of ONT read analyses often achieve better performance by working with the raw signal instead [<xref ref-type="bibr" rid="pcbi.1006583.ref009">9</xref>, <xref ref-type="bibr" rid="pcbi.1006583.ref010">10</xref>].</p>
</sec>
<sec id="sec003">
<title>Convolutional neural networks</title>
<p>In the last decade, neural networks—specifically convolutional neural networks (CNNs)—have revolutionised the field of image classification, achieving record high accuracies for detecting and localising objects within images [<xref ref-type="bibr" rid="pcbi.1006583.ref011">11</xref>, <xref ref-type="bibr" rid="pcbi.1006583.ref012">12</xref>]. This progress has been fuelled by general-purpose computing on graphics processing units (GPUs) which allow much faster performance when training and classifying and have in turn allowed for more complex CNNs than were previously feasible. While early CNNs used fewer than 10 layers of neurons [<xref ref-type="bibr" rid="pcbi.1006583.ref013">13</xref>], modern varieties can be very ‘deep’, containing 100 layers or more [<xref ref-type="bibr" rid="pcbi.1006583.ref014">14</xref>]. Despite their impressive accuracy, deep CNNs have been criticised for their incomprehensibility—it can be difficult to tell how or why a CNN classifier made a particular decision [<xref ref-type="bibr" rid="pcbi.1006583.ref015">15</xref>].</p>
<p>Barcode classification using ONT raw signal is conceptually similar to image classification, but it is a simpler problem in two key aspects. First, ONT raw signal is a one-dimensional array of values whereas images typically have three dimensions (height, width and colour channels). Second, there are fewer possible barcode classes (12 to 96, depending on the kit used) than possible image classes (often more than 1000) [<xref ref-type="bibr" rid="pcbi.1006583.ref016">16</xref>].</p>
</sec>
<sec id="sec004">
<title>Deepbinner</title>
<p>Here we present Deepbinner, a tool for ONT barcode demultiplexing using a deep CNN to classify reads into barcode bins using the raw read signal. We compare its performance with that of other ONT demultiplexing tools, Albacore and Porechop, which work in base-space. Operating in signal-space gives Deepbinner more power to demultiplex reads and the ability to sort raw reads for downstream uses such as Nanopolish [<xref ref-type="bibr" rid="pcbi.1006583.ref009">9</xref>]. We demonstrate its use with the EXP-NBD103 set of 12 barcodes, but it could equally be trained on any barcode set.</p>
</sec>
</sec>
<sec id="sec005" sec-type="materials|methods">
<title>Design and implementation</title>
<sec id="sec006">
<title>Deepbinner</title>
<sec id="sec007">
<title>Network architecture</title>
<p>Deepbinner is implemented using the TensorFlow [<xref ref-type="bibr" rid="pcbi.1006583.ref017">17</xref>] and Keras [<xref ref-type="bibr" rid="pcbi.1006583.ref018">18</xref>] code libraries. Its neural network architecture was based on elements developed in the field of image classification: groups of convolutional layers followed by max pooling layers [<xref ref-type="bibr" rid="pcbi.1006583.ref019">19</xref>]; parallel ‘inception’ modules and low dimension bottlenecks [<xref ref-type="bibr" rid="pcbi.1006583.ref020">20</xref>]; noise, dropout and batch normalisation layers [<xref ref-type="bibr" rid="pcbi.1006583.ref021">21</xref>]; and global average pooling [<xref ref-type="bibr" rid="pcbi.1006583.ref022">22</xref>].</p>
<p>Using these elements, we trialled hundreds of randomised network architectures to search for an effective design. Networks were assessed on their loss (categorical cross-entropy) and classification accuracy on a validation set. To discourage overfitting, we preferred models with fewer parameters and a small ratio of validation set loss to training set loss. The best performing architecture was subsequently refined to produce the final Deepbinner network shown in <xref ref-type="fig" rid="pcbi.1006583.g001">Fig 1</xref>. One notable way that Deepbinner’s architecture differs from image classification networks is the number of filters. Deepbinner uses a constant number of filters (48, except where the parallel module increases the filter count) whereas image classification networks commonly use a smaller number of filters in early layers and a larger number in later layers.</p>
<fig id="pcbi.1006583.g001" position="float">
<object-id pub-id-type="doi">10.1371/journal.pcbi.1006583.g001</object-id>
<label>Fig 1</label>
<caption>
<title>Neural network architecture.</title>
<p>Layers in the network are drawn as coloured blocks and data as groups of vertical lines. Data dimensions are shown for each step of the process as <italic>data length</italic> × <italic>filter count</italic>. Gaussian noise and dropout layers are only active during network training, not during classification.</p>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1006583.g001" xlink:type="simple"/>
</fig>
<p>Deepbinner’s input array size of 1024 was chosen for two reasons. First, it is a power of two, allowing the data to halve in length as it progresses through the network’s dimension-reducing layers. Second, an ONT native barcode is 40 bp in length (24 bp for the barcode itself plus 8 bp of flanking sequence on each side) which has a typical raw signal length of 250 to 600 values, making 1024 the smallest power of two which can reliably capture the entire signal.</p>
</sec>
<sec id="sec008">
<title>Network training</title>
<p>Our training data came from eight R9.4 flow cells (designed for 1D sequencing) and six R9.5 flow cells (designed for 1D<sup>2</sup> sequencing but also compatible with 1D sequencing), all used with the EXP-NBD103 barcoding kit. We basecalled the reads using Albacore (v2.3.1) and searched for adapters and barcodes in the read sequences using the Edlib library [<xref ref-type="bibr" rid="pcbi.1006583.ref023">23</xref>]. Both read-start signal (raw signal from the beginning of the read) and read-end signal (raw signal from the end of the read) were extracted from each fast5 file and any open-pore signals (high current values corresponding to the absence of a DNA molecule in the nanopore [<xref ref-type="bibr" rid="pcbi.1006583.ref005">5</xref>]) were trimmed off. We then conducted semi-global dynamic time warping [<xref ref-type="bibr" rid="pcbi.1006583.ref024">24</xref>] between read signals and the expected signals for adapters and barcodes to find those elements in the read signals. Instances with a clear barcode signal were used to generate training samples of length 1024 (to match Deepbinner’s input array size).</p>
<p>In addition to training Deepbinner on barcoded signals, we also included a variety of barcode-less signals in the training set that were assigned a corresponding no-barcode class. These included signals from real sequences that lacked barcodes (taken from non-barcode parts of the reads), adapter signals from non-barcoded sequencing runs and multiple types of simulated signals: flat signal, Gaussian noise and Perlin noise (a coherent noise function which generates a smoother signal than Gaussian noise [<xref ref-type="bibr" rid="pcbi.1006583.ref025">25</xref>]) (<xref ref-type="supplementary-material" rid="pcbi.1006583.s005">S1 Fig</xref>). Their presence in the training set ensures that Deepbinner can actively assign reads to a no-barcode class, not just fail to find a strong match. These no-barcode samples were included at a rate to make up approximately <inline-formula id="pcbi.1006583.e001"><alternatives><graphic id="pcbi.1006583.e001g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1006583.e001" xlink:type="simple"/><mml:math display="inline" id="M1"><mml:mfrac><mml:mn>1</mml:mn> <mml:mn>4</mml:mn></mml:mfrac></mml:math></alternatives></inline-formula> of the training samples, resulting in a total of 3 308 789 read-start samples and 1 029 684 read-end samples, each an array of 1024 electrical current values with a corresponding barcode label. There were fewer read-end training samples due to the fact that barcodes more reliably occur at the starts of reads than the ends.</p>
<p>Data augmentation is a method of artificially expanding a training set by duplicating samples with transformations [<xref ref-type="bibr" rid="pcbi.1006583.ref026">26</xref>]. Deepbinner applies data augmentation during network training by distorting signals in the temporal dimension (<xref ref-type="supplementary-material" rid="pcbi.1006583.s006">S2 Fig</xref>). This is carried out by duplicating current values at <inline-formula id="pcbi.1006583.e002"><alternatives><graphic id="pcbi.1006583.e002g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1006583.e002" xlink:type="simple"/><mml:math display="inline" id="M2"><mml:mfrac><mml:mn>1</mml:mn> <mml:mn>4</mml:mn></mml:mfrac></mml:math></alternatives></inline-formula> of the signal’s positions (randomly chosen) and deleting values at <inline-formula id="pcbi.1006583.e003"><alternatives><graphic id="pcbi.1006583.e003g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pcbi.1006583.e003" xlink:type="simple"/><mml:math display="inline" id="M3"><mml:mfrac><mml:mn>1</mml:mn> <mml:mn>4</mml:mn></mml:mfrac></mml:math></alternatives></inline-formula> of the positions. The result is a signal of equal length but elongated in some places and shortened in others. We did not implement distortion of the signal amplitude because a similar role is carried out by the network’s Gaussian noise layer (<xref ref-type="fig" rid="pcbi.1006583.g001">Fig 1</xref>). For our dataset, we found that an augmentation factor of two (one augmented signal for each unmodified signal) resulted in the best classification accuracy. However, this factor is adjustable, and datasets with fewer training samples may benefit from more data augmentation.</p>
<p>We refined our training datasets by conducting a four-way split, classifying each quarter of the data using a model trained on the other three quarters. Samples which were misclassified (assigned a label which disagreed with the training label) were discarded. This served to remove ambiguous training samples (approximately 0.25%), leaving final training sets of 3 300 075 and 1 027 469 samples for the starts and ends of reads, respectively (totalling 16.5 GB in size).</p>
<p>We produced the final trained Deepbinner models by training the network for 1000 epochs (100 000 training samples per epoch) using a data augmentation factor of two and a random 95:5 training:validation split (<xref ref-type="supplementary-material" rid="pcbi.1006583.s007">S3 Fig</xref>). Data augmentation was only performed on the training data which, combined with the network’s Gaussian noise and dropout layers (only active during training), explains why validation loss and accuracy were superior to training loss and accuracy. Each model was trained on a single NVIDIA P100 GPU and took approximately 40 hours to complete.</p>
</sec>
<sec id="sec009">
<title>Read classification</title>
<p>When classifying raw ONT signal, Deepbinner will generate a probability for each possible barcode, along with a no-barcode probability (<xref ref-type="fig" rid="pcbi.1006583.g001">Fig 1</xref>). Deepbinner will assign a barcode to the signal if the highest probability is sufficiently larger (a difference of &gt;0.5) than the second-highest probability. If the best and second-best matches are too close or if the best-match is ‘no barcode’, then Deepbinner assigns the ‘none’ label to the signal.</p>
<p>Deepbinner can classify each read using the read-start signal, the read-end signal or both. Using both start and end is appropriate for library preparations which add barcodes to both sides of a read, such as EXP-NBD103. If both are used, Deepbinner will independently perform classification using the read start and read end. Reads will be binned if a sufficient match was found on either end, but if the two ends match different barcodes, the read will be considered a chimera and put in the ‘none’ bin. These reads can be identified by running Deepbinner in verbose mode which provides detailed information on barcode calls. Deepbinner can optionally require a positive match on both the start and end to bin a read, enabling stringent demultiplexing. Other library preparations, such as the SQK-RBK004 rapid barcoding kit, only add a barcode to the start of the read, in which case Deepbinner’s start-only classification is appropriate.</p>
<p>An ONT barcode signal often appears in the first 1024 values of a read’s raw signal, but this may not be the case for reads which contain a longer-than-normal amount of open-pore signal. Deepbinner therefore examines multiple 1024-length signal windows overlapping by 512 samples. By default, it examines 11 such windows (covering 6144 samples in total), but this can be configured. Deepbinner merges the results across these windows by retaining the maximum probability for each barcode, then renormalising the probabilities.</p>
</sec>
</sec>
<sec id="sec010">
<title>Evaluation</title>
<sec id="sec011">
<title>Study design</title>
<p>In order to assess the accuracy with which Deepbinner can assign reads to bins based on barcode signals, we aimed to sequence a library of barcoded DNA molecules for which the input source of each molecule could be verified independently of the barcode. To do this, we chose 12 bacterial isolates of different species that had been sequenced via Illumina HiSeq, which produces high accuracy short reads. To identify a region from each that was entirely unique to that genome, we performed a co-assembly of the pooled Illumina reads for all samples with SPAdes (v3.11.1) [<xref ref-type="bibr" rid="pcbi.1006583.ref027">27</xref>] using a small k-mer (k = 23). The longest contigs from this assembly were matched to their source genome by comparison to individual (non-pooled) genome assemblies, and we chose a single contig per source genome. This produced 12 sequences, one from each genome, each composed entirely of unique 23-mers, making them easily distinguishable from each other.</p>
</sec>
<sec id="sec012">
<title>Amplicon library preparation and MinION sequencing</title>
<p>The 12 bacterial isolates were grown overnight at 37°C on LB agar plates. Single colonies were then grown overnight at 37°C in Luria broth. Bacterial cell pellets from 1.5 ml of broth culture were generated by centrifugation at 15 000×g for 5 minutes. DNA was extracted from these pellets using Agencourt GenFind v2 (Beckman Coulter) with minor modifications as follows. Cell pellets were resuspended in 400 μl lysis buffer containing 9 μl Proteinase K (96 mg/ml, Beckman Coulter) and 1 μl RNase A (100 mg/ml, Sigma Aldrich R6513) by gentle tip mixing. Samples were lysed at 37°C for 30 minutes. gDNA was extracted from the lysed samples by completing the remaining steps of the GenFind v2 protocol for 200 μl of blood/serum from the binding step onwards.</p>
<p>We used the Primer3web tool (v4.1.0) to choose 25 bp long-range PCR primers from each of the 12 unique sequences to define amplicons which ranged from 9 to 11 kbp (<xref ref-type="supplementary-material" rid="pcbi.1006583.s004">S4 File</xref>). PCR was performed using LongAMP <italic>Taq</italic> 2X Master Mix (New England Biolabs) with 150–450 ng gDNA as input template, primers at 1 μM, an annealing temperature of 56°C and 35 cycles of amplification. Following the PCR, size selection purification was performed with Agencourt AMPure beads (Beckman Coulter) at 0.6× ratio. The size and specificity of the PCR product were confirmed by capillary electrophoresis (Fragment Analyser AATI). A sequencing library was prepared from the purified amplicons using the Nanopore 1D ligation sequencing kit (SQK-LSK108) with the native barcoding expansion kit (EXP-NBD103) as per the manufacturer’s instructions. The run was performed on a MinION MK1b device using an R9.4 flow cell (FLO-MIN106), MinKNOW v18.03.1 and the <monospace>NC_48Hr_Sequencing_Run_FLO-MIN106_SQK-LSK108</monospace> protocol.</p>
</sec>
<sec id="sec013">
<title>Demultiplexing</title>
<p>The reads were basecalled with Albacore (v2.3.1) using the following options: <monospace>barcoding</monospace> (to enable demultiplexing) and <monospace>disable_filtering</monospace> (to include low-quality reads). The resulting FASTQ files were pooled and shuffled, and then given to Porechop (v0.2.3) to be independently demultiplexed. The pre-basecalled fast5 files were demultiplexed with Deepbinner (v0.2.0).</p>
<p>In addition to running the tools with default parameters, we also tested Porechop and Deepbinner with parameters to increase or decrease their demultiplexing stringency. Porechop’s lenient settings reduce the barcode and difference thresholds from their default values (75% and 5%, respectively) to 60% and 1%. This makes Porechop willing to consider very low-quality alignments and assign a barcode even when there is a close second-best match. Porechop’s stringent settings increase the barcode threshold to 85% (so it will only consider high-quality alignments) and require a barcode match on both the start and end of reads. The only difference between Deepbinner’s default and stringent settings is that the latter requires a barcode match on both the start and end of reads.</p>
<p>We assigned ground truth classifications to the ONT reads by aligning their basecalled sequences to the amplicon reference sequences with minimap2 (v2.12) [<xref ref-type="bibr" rid="pcbi.1006583.ref028">28</xref>] and binning with the <monospace>assign_reads_to_reference.py</monospace> script (included with Deepbinner). Reads which failed to meet an alignment threshold (100 bp or 10% of the read length, whichever is smaller) were classified as ‘unknown’. Reads which exceeded an alignment threshold to a secondary amplicon (50 bp or 5% of the read length, whichever is larger) were classified as ‘chimera’. This method is only able to detect cross-bin chimeras, i.e. reads with separate components from two different amplicons. It cannot detect within-bin chimeras, e.g. two copies of the same amplicon concatenated in a single read.</p>
</sec>
<sec id="sec014">
<title>Additional test sets</title>
<p>We also performed a whole genome sequencing (WGS) run using 12 different bacterial species, as it represents a more realistic sequencing scenario for many users. We followed the same preparation and analysis described above for the amplicon sequencing but with genomic DNA and excluding the PCR-specific steps. The reads were aligned with minimap2 (v2.12) [<xref ref-type="bibr" rid="pcbi.1006583.ref028">28</xref>] to all twelve complete genome sequences (produced via hybrid Illumina-Nanopore assembly using Unicycler v0.4.6 [<xref ref-type="bibr" rid="pcbi.1006583.ref004">4</xref>]) and assigned ground truth classifications with the same script and thresholds used for binning the amplicon reads. Due to shared sequences between the genomes, confidently assigning ground truth was more challenging than for the amplicon set, resulting in more reads being classified as ‘unknown’.</p>
<p>Based on examination of basecalled sequences, we estimate approximately 0–2% of reads in a barcoded read set may be genuinely lacking a barcode, i.e. one was not ligated during preparation. These reads can be difficult to distinguish from reads which do have a barcode but where no classification was possible, so to examine the demultiplexing tools’ behaviour on non-barcoded reads, we additionally assessed them using a non-barcoded ONT read set (SQK-LSK108, R9.4 flow cell). This is referred to as the ‘negative control’ set and the ground truth classification for all its reads was ‘none’.</p>
<p>To ensure a fair assessment, no reads from any of the three test sets (amplicon, WGS and negative control) were used in the training of Deepbinner’s models.</p>
</sec>
</sec>
</sec>
<sec id="sec015" sec-type="results">
<title>Results</title>
<sec id="sec016">
<title>Performance on evaluation sets</title>
<p>The barcoded amplicon MinION sequencing run produced 1 893 881 reads (9.69 Gbp), 1 642 052 of which (87%) could be reliably assigned to an amplicon based on the internal read sequence alone, producing our ground truth classifications. The WGS run produced 1 300 656 reads (12.2 Gbp), 968 137 of which (74%) could be assigned to a genome for ground truth. Our negative control set contained 101 993 reads (945 Mbp).</p>
<p>Of all three demultiplexers tested (using default parameters), Deepbinner performed best on both precision (a.k.a. positive predictive value, proportion of binned reads correctly assigned) and recall (a.k.a. accuracy, proportion of all reads correctly assigned) (<xref ref-type="table" rid="pcbi.1006583.t001">Table 1</xref>, <xref ref-type="supplementary-material" rid="pcbi.1006583.s001">S1 File</xref>). It was particularly strong on recall, correctly binning 253 300 more reads (1265 Mbp) than Albacore and 176 740 (836 Mbp) more than Porechop in the amplicon set. The same patterns were evident on the WGS set, with Deepbinner displaying improvements of ∼10% in recall and 1–2% in precision, as compared to Albacore and Porechop (<xref ref-type="supplementary-material" rid="pcbi.1006583.s002">S2 File</xref>).</p>
<table-wrap id="pcbi.1006583.t001" position="float">
<object-id pub-id-type="doi">10.1371/journal.pcbi.1006583.t001</object-id>
<label>Table 1</label>
<caption>
<title>Classification performance of demultiplexing tools.</title>
</caption>
<alternatives>
<graphic id="pcbi.1006583.t001g" mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1006583.t001" xlink:type="simple"/>
<table border="0" frame="box" rules="all">
<colgroup>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
</colgroup>
<thead>
<tr>
<th align="left" rowspan="2"/>
<th align="center" rowspan="2">Binned<break/>reads</th>
<th align="center" colspan="3">Reads with known ground truth<break/>(N = 1642052)</th>
<th align="center" colspan="3">Other reads</th>
</tr>
<tr>
<th align="center">Precision<break/>(PPV)</th>
<th align="center">Recall<break/>(accuracy)</th>
<th align="center">Q score<break/>range</th>
<th align="center">Binned<break/>unknown</th>
<th align="center">Binned<break/>chimeric</th>
<th align="center">Binned neg<break/>control</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left">Albacore</td>
<td align="char" char=".">73.8%</td>
<td align="char" char=".">97.3%</td>
<td align="char" char=".">78.9%</td>
<td align="center">6.8–11.3</td>
<td align="char" char=".">23.5%</td>
<td align="char" char=".">84.3%</td>
<td align="char" char="."><bold>0.0</bold>%</td>
</tr>
<tr>
<td align="left">Porechop</td>
<td align="char" char=".">79.0%</td>
<td align="char" char=".">97.5%</td>
<td align="char" char=".">83.6%</td>
<td align="center">6.1–11.3</td>
<td align="char" char=".">33.5%</td>
<td align="char" char=".">72.4%</td>
<td align="char" char=".">1.8%</td>
</tr>
<tr>
<td align="left">Deepbinner</td>
<td align="char" char=".">92.2%</td>
<td align="char" char="."><bold>98.5</bold>%</td>
<td align="char" char="."><bold>94.3</bold>%</td>
<td align="center">5.0–11.3</td>
<td align="char" char=".">68.8%</td>
<td align="char" char="."><bold>43.4</bold>%</td>
<td align="char" char=".">0.3%</td>
</tr>
<tr>
<td align="left">Porechop (lenient)</td>
<td align="char" char=".">95.9%</td>
<td align="char" char=".">92.3%</td>
<td align="char" char=".">89.9%</td>
<td align="center">4.9–11.3</td>
<td align="char" char=".">85.8%</td>
<td align="char" char=".">92.0%</td>
<td align="char" char=".">73.5%</td>
</tr>
<tr>
<td align="left">Porechop (stringent)</td>
<td align="char" char=".">17.2%</td>
<td align="char" char=".">99.7%</td>
<td align="char" char=".">18.8%</td>
<td align="center">7.8–11.5</td>
<td align="char" char=".">5.8%</td>
<td align="char" char=".">1.0%</td>
<td align="char" char=".">0.0%</td>
</tr>
<tr>
<td align="left">Deepbinner (stringent)</td>
<td align="char" char=".">53.4%</td>
<td align="char" char=".">99.4%</td>
<td align="char" char=".">56.8%</td>
<td align="center">5.7–11.3</td>
<td align="char" char=".">28.7%</td>
<td align="char" char=".">3.8%</td>
<td align="char" char=".">0.0%</td>
</tr>
</tbody>
</table>
</alternatives>
<table-wrap-foot>
<fn id="t001fn001">
<p>Classification metrics for the three tested demultiplexers using the amplicon read set. The first three rows show results using the tools’ default parameters. The last three rows show results where parameters were changed to increase or decrease stringency.</p>
</fn>
<fn id="t001fn002">
<p>Binned reads = proportion of all reads assigned to a barcode. Precision (positive predictive value) = proportion of binned reads correctly assigned. Recall (accuracy) = proportion of all reads correctly assigned. Q score range = mean Phred quality scores of binned reads (2.5<sup>th</sup>–97.5<sup>th</sup> percentile). Binned unknown = proportion of unknown reads (those unable to be assigned to any amplicon reference) assigned to a barcode. Binned chimeric = proportion of chimeric reads (those assigned to more than one amplicon reference) assigned to a barcode. Binned negative control = proportion of negative control reads (those from a separate barcode-less library preparation) assigned to a barcode.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>Deepbinner’s superior recall is primarily due to its ability to bin low-quality reads. All three demultiplexers perform similarly on very high-quality reads, but Albacore’s and Porechop’s recall falls below 50% for low-quality reads (<xref ref-type="supplementary-material" rid="pcbi.1006583.s008">S4 Fig</xref>). Deepbinner, however, maintains a recall of over 85% for the entire quality range.</p>
<p>Deepbinner was much more likely than other demultiplexers to put ‘unknown’ reads (those which we could not assign to a reference sequence, 12.7% of the amplicon reads) into a barcode bin. Some are likely reads that were too low-quality to be assigned to a source based on the internal sequence. The negative control test shows that both Porechop and Deepbinner can suffer from over-sensitivity, putting barcode-less reads in a bin. For chimeric reads (0.31% of the amplicon reads), Deepbinner was less likely than other tools to assign a barcode.</p>
<p>There is often a trade-off between precision and recall which can be adjusted via demultiplexer parameters (<xref ref-type="table" rid="pcbi.1006583.t001">Table 1</xref>). When Porechop is given lenient settings, it has lower precision (92.3%) but improved recall (89.9%). When given stringent settings, Porechop achieves very high precision (99.7%) but at great cost to recall (18.8%). When Deepbinner is given stringent settings, its precision and recall become 99.4% and 56.8%, respectively. Combining multiple demultiplexers (only binning reads where multiple tools agree) improves precision at the cost of recall (<xref ref-type="supplementary-material" rid="pcbi.1006583.s001">S1 File</xref>). However, precision never exceeded 99.7%, even when we used multiple demultiplexers and very stringent settings.</p>
<p>Deepbinner requires as little as 1 CPU and 1 GB of RAM, though it can take advantage of additional resources to run faster, benefitting from up to about 16 CPUs and 8 GB of RAM. It is difficult to make a speed performance comparison between Deepbinner and the other tools, as they use computer resources differently. Albacore is likely the fastest for many users, as its demultiplexing requires little extra time over basecalling. If Deepbinner is run on a CPU, it is the slowest demultiplexer tested—classifying the amplicon set took 29.5 hours to complete (∼18 reads/sec, using a total of 347 CPU hours). When run on a GPU (NVIDIA P100), its performance is comparable to Porechop, with the amplicon set classification taking 4.6 hours (∼114 reads/sec).</p>
</sec>
<sec id="sec017">
<title>Implications</title>
<p>Many of Deepbinner’s advantages stem from the fact that it operates not on basecalled sequences, but on the more informative raw signal. This may account for Deepbinner’s high recall (including the ability to bin low-quality reads) and precision. By demultiplexing fast5 files before basecalling, it simplifies downstream analyses such as Nanopolish which require raw reads [<xref ref-type="bibr" rid="pcbi.1006583.ref009">9</xref>]. Deepbinner also opens up the possibility of constructing barcodes with modified DNA bases to increase the size of the genomic alphabet. Such high-alphabet barcodes could be easier to differentiate at the signal level, and if so would allow for a greater number of unique barcodes in a given sequence length.</p>
<p>The disadvantages of Deepbinner are similar to those experienced by CNN classifiers in other contexts. Training the network is computationally intensive, and a large volume of training data is required. Trained networks may not generalise well across different flow cells and library preparation kits, necessitating a separate trained network for each. When Deepbinner makes an error during classification, the ‘black box’ nature of neural networks makes it difficult to understand why.</p>
</sec>
<sec id="sec018">
<title>Recommendations</title>
<p>Deepbinner’s high recall is well suited to applications where the most important factor is maximising the number of classified reads and therefore the amount of useable data. A user can run Deepbinner during sequencing, binning fast5 files as they are produced, and then run Albacore on each resulting directory of reads. While other demultiplexing approaches can discard (i.e. fail to classify) over 20% of the data, this proportion will likely be less than 10% with Deepbinner. This greater yield of reads may improve assemblies, even when the additional reads are low-quality [<xref ref-type="bibr" rid="pcbi.1006583.ref029">29</xref>].</p>
<p>For applications where precision is paramount (i.e. incorrectly binned reads must be minimised), Porechop with stringent settings may be the best choice, though its very low recall means that most reads (&gt;80%) will be lost. Deepbinner with stringent settings (requiring both start and end barcodes) has slightly worse precision than stringent Porechop but can classify more than half the reads.</p>
<p>Our tests showed that demultiplexing precision plateaued at about 99.7%, i.e. 0.3% of reads with a ground truth label are consistently assigned to the wrong bin by all demultiplexing tools. This implies these reads may have the wrong barcode ligated to the DNA, a problem that no demultiplexing tool could fix. This may arise in the library preparation, whereby unligated barcodes could be carried through after sample pooling and then be available for ligation to incorrect DNA fragments in the adapter ligation step. If so, a bead clean-up with size selection after barcode ligation (but before adapter ligation) may mitigate the issue by reducing the number of free barcode sequences. This small amount of barcode crosstalk, along with cross-barcode chimeric reads, is likely inconsequential for isolate sequencing but could be a serious problem in some quantitative applications such as metagenomics or transcriptomics [<xref ref-type="bibr" rid="pcbi.1006583.ref030">30</xref>].</p>
</sec>
</sec>
<sec id="sec019">
<title>Availability and future directions</title>
<p>Deepbinner, documentation and pre-trained models are available on GitHub: <ext-link ext-link-type="uri" xlink:href="https://github.com/rrwick/Deepbinner" xlink:type="simple">https://github.com/rrwick/Deepbinner</ext-link>. The raw fast5 read sets used in this manuscript are available on ENA: <ext-link ext-link-type="uri" xlink:href="https://www.ebi.ac.uk/ena/data/view/PRJEB28450" xlink:type="simple">https://www.ebi.ac.uk/ena/data/view/PRJEB28450</ext-link>. The basecalled reads and full classification results are available on figshare: <ext-link ext-link-type="uri" xlink:href="https://figshare.com/projects/Deepbinner/34223" xlink:type="simple">https://figshare.com/projects/Deepbinner/34223</ext-link>.</p>
<p>Future development of Deepbinner will involve training models for other library preparations, including larger barcode sets (more than 12 barcodes) as they become available. Improved performance and parallelism are also a focus, to ensure that Deepbinner can keep up with high-yield sequencing runs, such as those from ONT’s higher throughput GridION X5 and PromethION sequencers.</p>
</sec>
<sec id="sec020">
<title>Supporting information</title>
<supplementary-material id="pcbi.1006583.s001" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1006583.s001" xlink:type="simple">
<label>S1 File</label>
<caption>
<title>Confusion matrices (amplicon).</title>
<p>Classification counts per reference bin and predicted bin, for each demultiplexer tested, using the amplicon read set.</p>
<p>(XLSX)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1006583.s002" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1006583.s002" xlink:type="simple">
<label>S2 File</label>
<caption>
<title>Confusion matrices (WGS).</title>
<p>Classification counts per reference bin and predicted bin, for each demultiplexer tested, using the whole genome read set.</p>
<p>(XLSX)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1006583.s003" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1006583.s003" xlink:type="simple">
<label>S3 File</label>
<caption>
<title>Negative control results.</title>
<p>Classification counts per predicted bin, for each demultiplexer tested, using the negative control read set.</p>
<p>(XLSX)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1006583.s004" mimetype="application-x/ext-file" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1006583.s004" xlink:type="simple">
<label>S4 File</label>
<caption>
<title>Amplicon sequences.</title>
<p>FASTA file of the 12 amplicon sequences.</p>
<p>(FASTA)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1006583.s005" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1006583.s005" xlink:type="simple">
<label>S1 Fig</label>
<caption>
<title>No-barcode training signals.</title>
<p>Multiple types of signals were included in the training set to explicitly teach the neural network what a barcode-free signal looks like. The signal amplitude has been normalised to a mean of 0 and a variance of 1.</p>
<p>(PDF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1006583.s006" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1006583.s006" xlink:type="simple">
<label>S2 Fig</label>
<caption>
<title>Data augmentation via temporal distortion.</title>
<p>One real training sample can yield multiple additional training samples by distorting the signal along the temporal axis. The signal amplitude has been normalised to a mean of 0 and a variance of 1.</p>
<p>(PDF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1006583.s007" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1006583.s007" xlink:type="simple">
<label>S3 Fig</label>
<caption>
<title>Training metrics for the read-start and read-end models.</title>
<p>Generated using a random 95:5 training:validation split. Training data has poorer performance than validation data due to data augmentation and training-only layers (Gaussian noise and dropout) in the network.</p>
<p>(PDF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1006583.s008" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1006583.s008" xlink:type="simple">
<label>S4 Fig</label>
<caption>
<title>Classification metrics by q score.</title>
<p>Precision and recall for each tool’s demultiplexing of the amplicon read set as a function of read q score.</p>
<p>(PDF)</p>
</caption>
</supplementary-material>
</sec>
</body>
<back>
<ack>
<p>We would like to thank Wendy Wilson for conducting the long-range PCR amplification.</p>
</ack>
<ref-list>
<title>References</title>
<ref id="pcbi.1006583.ref001">
<label>1</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Church</surname> <given-names>GM</given-names></name>, <name name-style="western"><surname>Kieffer-Higgins</surname> <given-names>S</given-names></name>. <article-title>Multiplex DNA sequencing</article-title>. <source>Science</source>. <year>1988</year>;<volume>240</volume>(<issue>4849</issue>):<fpage>185</fpage>–<lpage>188</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1126/science.3353714" xlink:type="simple">10.1126/science.3353714</ext-link></comment> <object-id pub-id-type="pmid">3353714</object-id></mixed-citation>
</ref>
<ref id="pcbi.1006583.ref002">
<label>2</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Quick</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Quinlan</surname> <given-names>AR</given-names></name>, <name name-style="western"><surname>Loman</surname> <given-names>NJ</given-names></name>. <article-title>A reference bacterial genome dataset generated on the MinION portable single-molecule nanopore sequencer</article-title>. <source>GigaScience</source>. <year>2014</year>;<volume>3</volume>(<issue>1</issue>):<fpage>1</fpage>–<lpage>6</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1186/2047-217X-3-22" xlink:type="simple">10.1186/2047-217X-3-22</ext-link></comment></mixed-citation>
</ref>
<ref id="pcbi.1006583.ref003">
<label>3</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Jansen</surname> <given-names>HJ</given-names></name>, <name name-style="western"><surname>Liem</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Jong-Raadsen</surname> <given-names>SA</given-names></name>, <name name-style="western"><surname>Dufour</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Weltzien</surname> <given-names>FA</given-names></name>, <name name-style="western"><surname>Swinkels</surname> <given-names>W</given-names></name>, <etal>et al</etal>. <article-title>Rapid de novo assembly of the European eel genome from nanopore sequencing reads</article-title>. <source>Scientific Reports</source>. <year>2017</year>;<volume>7</volume>(<issue>1</issue>):<fpage>1</fpage>–<lpage>13</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1038/s41598-017-07650-6" xlink:type="simple">10.1038/s41598-017-07650-6</ext-link></comment></mixed-citation>
</ref>
<ref id="pcbi.1006583.ref004">
<label>4</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Wick</surname> <given-names>RR</given-names></name>, <name name-style="western"><surname>Judd</surname> <given-names>LM</given-names></name>, <name name-style="western"><surname>Gorrie</surname> <given-names>CL</given-names></name>, <name name-style="western"><surname>Holt</surname> <given-names>KE</given-names></name>. <article-title>Completing bacterial genome assemblies with multiplex MinION sequencing</article-title>. <source>Microbial Genomics</source>. <year>2017</year>;<volume>3</volume>(<issue>10</issue>):<fpage>1</fpage>–<lpage>7</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1099/mgen.0.000132" xlink:type="simple">10.1099/mgen.0.000132</ext-link></comment></mixed-citation>
</ref>
<ref id="pcbi.1006583.ref005">
<label>5</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Stoiber</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Brown</surname> <given-names>J</given-names></name>. <article-title>BasecRAWller: streaming nanopore basecalling directly from raw signal</article-title>. <source>bioRxiv</source>. <year>2017</year>; p. <fpage>1</fpage>–<lpage>15</lpage>.</mixed-citation>
</ref>
<ref id="pcbi.1006583.ref006">
<label>6</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Teng</surname> <given-names>HH</given-names></name>, <name name-style="western"><surname>Hall</surname> <given-names>MB</given-names></name>, <name name-style="western"><surname>Duarte</surname> <given-names>T</given-names></name>, <name name-style="western"><surname>Cao</surname> <given-names>MD</given-names></name>, <name name-style="western"><surname>Coin</surname> <given-names>LJM</given-names></name>. <article-title>Chiron: translating nanopore raw signal directly into nucleotide sequence using deep learning</article-title>. <source>bioRxiv</source>. <year>2017</year>; p. <fpage>1</fpage>–<lpage>10</lpage>.</mixed-citation>
</ref>
<ref id="pcbi.1006583.ref007">
<label>7</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Boža</surname> <given-names>V</given-names></name>, <name name-style="western"><surname>Brejová</surname> <given-names>B</given-names></name>, <name name-style="western"><surname>Vinař</surname> <given-names>T</given-names></name>. <article-title>DeepNano: Deep recurrent neural networks for base calling in MinION Nanopore reads</article-title>. <source>PLOS ONE</source>. <year>2017</year>;<volume>12</volume>(<issue>6</issue>):<fpage>1</fpage>–<lpage>13</lpage>.</mixed-citation>
</ref>
<ref id="pcbi.1006583.ref008">
<label>8</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Wick</surname> <given-names>R</given-names></name>, <name name-style="western"><surname>Judd</surname> <given-names>LM</given-names></name>, <name name-style="western"><surname>Holt</surname> <given-names>KE</given-names></name>. <source>Comparison of Oxford Nanopore basecalling tools</source>; <year>2018</year>. Available from: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.5281/zenodo.1188469" xlink:type="simple">https://doi.org/10.5281/zenodo.1188469</ext-link>.</mixed-citation>
</ref>
<ref id="pcbi.1006583.ref009">
<label>9</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Loman</surname> <given-names>NJ</given-names></name>, <name name-style="western"><surname>Quick</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Simpson</surname> <given-names>JT</given-names></name>. <article-title>A complete bacterial genome assembled de novo using only nanopore sequencing data</article-title>. <source>Nature Methods</source>. <year>2015</year>;<volume>12</volume>(<issue>8</issue>):<fpage>733</fpage>–<lpage>735</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1038/nmeth.3444" xlink:type="simple">10.1038/nmeth.3444</ext-link></comment> <object-id pub-id-type="pmid">26076426</object-id></mixed-citation>
</ref>
<ref id="pcbi.1006583.ref010">
<label>10</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Loose</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Malla</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Stout</surname> <given-names>M</given-names></name>. <article-title>Real time selective sequencing using nanopore technology</article-title>. <source>bioRxiv</source>. <year>2016</year>;<volume>13</volume>(<issue>9</issue>):<fpage>038760</fpage>.</mixed-citation>
</ref>
<ref id="pcbi.1006583.ref011">
<label>11</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Krizhevsky</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Sutskever</surname> <given-names>I</given-names></name>, <name name-style="western"><surname>Hinton</surname> <given-names>GE</given-names></name>. <article-title>ImageNet classification with deep convolutional neural networks</article-title>. <source>Advances in Neural Information Processing Systems</source>. <year>2012</year>; p. <fpage>1097</fpage>–<lpage>1105</lpage>. <ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.1016/j.protcy.2014.09.007" xlink:type="simple">http://dx.doi.org/10.1016/j.protcy.2014.09.007</ext-link>.</mixed-citation>
</ref>
<ref id="pcbi.1006583.ref012">
<label>12</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Szegedy</surname> <given-names>C</given-names></name>, <name name-style="western"><surname>Ioffe</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Vanhoucke</surname> <given-names>V</given-names></name>, <name name-style="western"><surname>Alemi</surname> <given-names>A</given-names></name>. <article-title>Inception-v4, Inception-ResNet and the impact of residual connections on learning</article-title>. <source>CoRR</source>. <year>2016</year>;abs/1602.07261.</mixed-citation>
</ref>
<ref id="pcbi.1006583.ref013">
<label>13</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>LeCun</surname> <given-names>Y</given-names></name>, <name name-style="western"><surname>Bottou</surname> <given-names>L</given-names></name>, <name name-style="western"><surname>Bengio</surname> <given-names>Y</given-names></name>, <name name-style="western"><surname>Haffner</surname> <given-names>P</given-names></name>. <article-title>Gradient based learning applied to document recognition</article-title>. <source>Proceedings of IEEE</source>. <year>1998</year>;<volume>86</volume>(<issue>11</issue>):<fpage>2278</fpage>–<lpage>2324</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1109/5.726791" xlink:type="simple">10.1109/5.726791</ext-link></comment></mixed-citation>
</ref>
<ref id="pcbi.1006583.ref014">
<label>14</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>He</surname> <given-names>K</given-names></name>, <name name-style="western"><surname>Zhang</surname> <given-names>X</given-names></name>, <name name-style="western"><surname>Ren</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Sun</surname> <given-names>J</given-names></name>. <article-title>Deep Residual Learning for Image Recognition</article-title>. <source>ArxivOrg</source>. <year>2015</year>;<volume>7</volume>(<issue>3</issue>):<fpage>171</fpage>–<lpage>180</lpage>.</mixed-citation>
</ref>
<ref id="pcbi.1006583.ref015">
<label>15</label>
<mixed-citation publication-type="book" xlink:type="simple">
<name name-style="western"><surname>Zeiler</surname> <given-names>MD</given-names></name>, <name name-style="western"><surname>Fergus</surname> <given-names>R</given-names></name>. <chapter-title>Visualizing and understanding convolutional networks</chapter-title>. In: <name name-style="western"><surname>Fleet</surname> <given-names>D</given-names></name>, <name name-style="western"><surname>Pajdla</surname> <given-names>T</given-names></name>, <name name-style="western"><surname>Schiele</surname> <given-names>B</given-names></name>, <name name-style="western"><surname>Tuytelaars</surname> <given-names>T</given-names></name>, editors. <source>Computer Vision—ECCV 2014</source>. <publisher-loc>Cham</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name>; <year>2014</year>. p. <fpage>818</fpage>–<lpage>833</lpage>.</mixed-citation>
</ref>
<ref id="pcbi.1006583.ref016">
<label>16</label>
<mixed-citation publication-type="other" xlink:type="simple">Deng J, Dong W, Socher R, Li LJ, Li K, Fei-Fei L. ImageNet: a large-scale hierarchical image database. 2009 IEEE Conference on Computer Vision and Pattern Recognition. 2009; p. 248–255.</mixed-citation>
</ref>
<ref id="pcbi.1006583.ref017">
<label>17</label>
<mixed-citation publication-type="other" xlink:type="simple">Abadi M, Barham P, Chen J, Chen Z, Davis A, Dean J, et al. TensorFlow: a system for large-scale machine learning. 12th USENIX Symposium on Operating Systems Design and Implementation. 2016; p. 265–284.</mixed-citation>
</ref>
<ref id="pcbi.1006583.ref018">
<label>18</label>
<mixed-citation publication-type="other" xlink:type="simple">Chollet F. Keras; 2015. Available from: <ext-link ext-link-type="uri" xlink:href="https://github.com/keras-team/keras" xlink:type="simple">https://github.com/keras-team/keras</ext-link>.</mixed-citation>
</ref>
<ref id="pcbi.1006583.ref019">
<label>19</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Simonyan</surname> <given-names>K</given-names></name>, <name name-style="western"><surname>Zisserman</surname> <given-names>A</given-names></name>. <article-title>Very deep convolutional networks for large-scale image recognition</article-title>. <source>International Conference on Learning Representations (ICLR)</source>. <year>2015</year>; p. <fpage>1</fpage>–<lpage>14</lpage>.</mixed-citation>
</ref>
<ref id="pcbi.1006583.ref020">
<label>20</label>
<mixed-citation publication-type="other" xlink:type="simple">Szegedy C, Liu W, Jia Y, Sermanet P, Reed S, Anguelov D, et al. Going deeper with convolutions. Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition. 2015;07-12-June:1–9.</mixed-citation>
</ref>
<ref id="pcbi.1006583.ref021">
<label>21</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Ioffe</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Szegedy</surname> <given-names>C</given-names></name>. <article-title>Batch normalization: accelerating deep network training by reducing internal covariate shift</article-title>. <source>CoRR</source>. <year>2015</year>;abs/1502.03167.</mixed-citation>
</ref>
<ref id="pcbi.1006583.ref022">
<label>22</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Lin</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Chen</surname> <given-names>Q</given-names></name>, <name name-style="western"><surname>Yan</surname> <given-names>S</given-names></name>. <article-title>Network in network</article-title>. <source>CoRR</source>. <year>2013</year>;abs/1312.4400:<fpage>1</fpage>–<lpage>10</lpage>.</mixed-citation>
</ref>
<ref id="pcbi.1006583.ref023">
<label>23</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Šošić</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Šikić</surname> <given-names>M</given-names></name>. <article-title>Edlib: A C/C++ library for fast, exact sequence alignment using edit distance</article-title>. <source>Bioinformatics</source>. <year>2017</year>;<volume>33</volume>(<issue>9</issue>):<fpage>1394</fpage>–<lpage>1395</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1093/bioinformatics/btw753" xlink:type="simple">10.1093/bioinformatics/btw753</ext-link></comment> <object-id pub-id-type="pmid">28453688</object-id></mixed-citation>
</ref>
<ref id="pcbi.1006583.ref024">
<label>24</label>
<mixed-citation publication-type="other" xlink:type="simple">Boža V, Brejová B, Vinař T. Improving Nanopore Reads Raw Signal Alignment. arXiv. 2017.</mixed-citation>
</ref>
<ref id="pcbi.1006583.ref025">
<label>25</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Perlin</surname> <given-names>K</given-names></name>. <article-title>An image synthesizer</article-title>. <source>ACM SIGGRAPH Computer Graphics</source>. <year>1985</year>;<volume>19</volume>(<issue>3</issue>):<fpage>287</fpage>–<lpage>296</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1145/325165.325247" xlink:type="simple">10.1145/325165.325247</ext-link></comment></mixed-citation>
</ref>
<ref id="pcbi.1006583.ref026">
<label>26</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Perez</surname> <given-names>L</given-names></name>, <name name-style="western"><surname>Wang</surname> <given-names>J</given-names></name>. <article-title>The Effectiveness of Data Augmentation in Image Classification using Deep Learning</article-title>. <source>CoRR</source>. <year>2017</year>;abs/1712.04621.</mixed-citation>
</ref>
<ref id="pcbi.1006583.ref027">
<label>27</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Bankevich</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Nurk</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Antipov</surname> <given-names>D</given-names></name>, <name name-style="western"><surname>Gurevich</surname> <given-names>AA</given-names></name>, <name name-style="western"><surname>Dvorkin</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Kulikov</surname> <given-names>AS</given-names></name>, <etal>et al</etal>. <article-title>SPAdes: a new genome assembly algorithm and its applications to single-cell sequencing</article-title>. <source>Journal of Computational Biology</source>. <year>2012</year>;<volume>19</volume>(<issue>5</issue>):<fpage>455</fpage>–<lpage>477</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1089/cmb.2012.0021" xlink:type="simple">10.1089/cmb.2012.0021</ext-link></comment> <object-id pub-id-type="pmid">22506599</object-id></mixed-citation>
</ref>
<ref id="pcbi.1006583.ref028">
<label>28</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Li</surname> <given-names>H</given-names></name>. <article-title>Minimap2: pairwise alignment for nucleotide sequences</article-title>. <source>Bioinformatics</source>. <year>2018</year>; p. bty191. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1093/bioinformatics/bty191" xlink:type="simple">10.1093/bioinformatics/bty191</ext-link></comment></mixed-citation>
</ref>
<ref id="pcbi.1006583.ref029">
<label>29</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Wick</surname> <given-names>RR</given-names></name>, <name name-style="western"><surname>Judd</surname> <given-names>LM</given-names></name>, <name name-style="western"><surname>Gorrie</surname> <given-names>CL</given-names></name>, <name name-style="western"><surname>Holt</surname> <given-names>KE</given-names></name>. <article-title>Unicycler: resolving bacterial genome assemblies from short and long sequencing reads</article-title>. <source>PLOS Computational Biology</source>. <year>2017</year>;<volume>13</volume>(<issue>6</issue>):<fpage>e1005595</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1371/journal.pcbi.1005595" xlink:type="simple">10.1371/journal.pcbi.1005595</ext-link></comment> <object-id pub-id-type="pmid">28594827</object-id></mixed-citation>
</ref>
<ref id="pcbi.1006583.ref030">
<label>30</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Sinha</surname> <given-names>R</given-names></name>, <name name-style="western"><surname>Stanley</surname> <given-names>G</given-names></name>, <name name-style="western"><surname>Gulati</surname> <given-names>GS</given-names></name>, <name name-style="western"><surname>Ezran</surname> <given-names>C</given-names></name>, <name name-style="western"><surname>Travaglini</surname> <given-names>KJ</given-names></name>, <name name-style="western"><surname>Wei</surname> <given-names>E</given-names></name>, <etal>et al</etal>. <article-title>Index switching causes “spreading-of-signal” among multiplexed samples in Illumina HiSeq 4000 DNA sequencing</article-title>. <source>bioRxiv</source>. <year>2017</year>; p. <fpage>125724</fpage>.</mixed-citation>
</ref>
</ref-list>
</back>
</article>