<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1d3 20150301//EN" "http://jats.nlm.nih.gov/publishing/1.1d3/JATS-journalpublishing1.dtd">
<article article-type="research-article" dtd-version="1.1d3" xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="nlm-ta">PLoS Comput Biol</journal-id>
<journal-id journal-id-type="publisher-id">plos</journal-id>
<journal-id journal-id-type="pmc">ploscomp</journal-id>
<journal-title-group>
<journal-title>PLOS Computational Biology</journal-title>
</journal-title-group>
<issn pub-type="ppub">1553-734X</issn>
<issn pub-type="epub">1553-7358</issn>
<publisher>
<publisher-name>Public Library of Science</publisher-name>
<publisher-loc>San Francisco, CA USA</publisher-loc>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">PCOMPBIOL-D-19-02219</article-id>
<article-id pub-id-type="doi">10.1371/journal.pcbi.1007781</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Research Article</subject>
</subj-group>
<subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Computational biology</subject><subj-group><subject>Genome analysis</subject><subj-group><subject>Sequence assembly tools</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Genomics</subject><subj-group><subject>Genome analysis</subject><subj-group><subject>Sequence assembly tools</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Research and analysis methods</subject><subj-group><subject>Database and informatics methods</subject><subj-group><subject>Biological databases</subject><subj-group><subject>Sequence databases</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Research and analysis methods</subject><subj-group><subject>Database and informatics methods</subject><subj-group><subject>Bioinformatics</subject><subj-group><subject>Sequence analysis</subject><subj-group><subject>Sequence databases</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Genomics</subject><subj-group><subject>Metagenomics</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Microbiology</subject><subj-group><subject>Bacteriology</subject><subj-group><subject>Bacterial genetics</subject><subj-group><subject>Bacterial genomics</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Microbial genetics</subject><subj-group><subject>Bacterial genetics</subject><subj-group><subject>Bacterial genomics</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Genomics</subject><subj-group><subject>Microbial genomics</subject><subj-group><subject>Bacterial genomics</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Microbiology</subject><subj-group><subject>Microbial genomics</subject><subj-group><subject>Bacterial genomics</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Molecular biology</subject><subj-group><subject>Molecular biology techniques</subject><subj-group><subject>DNA construction</subject><subj-group><subject>Plasmid construction</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Research and analysis methods</subject><subj-group><subject>Molecular biology techniques</subject><subj-group><subject>DNA construction</subject><subj-group><subject>Plasmid construction</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Neuroscience</subject><subj-group><subject>Cognitive science</subject><subj-group><subject>Cognition</subject><subj-group><subject>Memory</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Neuroscience</subject><subj-group><subject>Learning and memory</subject><subj-group><subject>Memory</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Research and analysis methods</subject><subj-group><subject>Database and informatics methods</subject><subj-group><subject>Biological databases</subject><subj-group><subject>Genomic databases</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Computational biology</subject><subj-group><subject>Genome analysis</subject><subj-group><subject>Genomic databases</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Genomics</subject><subj-group><subject>Genome analysis</subject><subj-group><subject>Genomic databases</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Microbiology</subject><subj-group><subject>Medical microbiology</subject><subj-group><subject>Microbiome</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Genomics</subject><subj-group><subject>Microbial genomics</subject><subj-group><subject>Microbiome</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Microbiology</subject><subj-group><subject>Microbial genomics</subject><subj-group><subject>Microbiome</subject></subj-group></subj-group></subj-group></subj-group></article-categories>
<title-group>
<article-title>PlasClass improves plasmid sequence classification</article-title>
<alt-title alt-title-type="running-head">PlasClass: Plasmid sequence classification</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">http://orcid.org/0000-0002-6296-5209</contrib-id>
<name name-style="western">
<surname>Pellow</surname> <given-names>David</given-names></name>
<role content-type="http://credit.casrai.org/">Conceptualization</role>
<role content-type="http://credit.casrai.org/">Data curation</role>
<role content-type="http://credit.casrai.org/">Formal analysis</role>
<role content-type="http://credit.casrai.org/">Methodology</role>
<role content-type="http://credit.casrai.org/">Software</role>
<role content-type="http://credit.casrai.org/">Validation</role>
<role content-type="http://credit.casrai.org/">Writing – original draft</role>
<role content-type="http://credit.casrai.org/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"><sup>1</sup></xref>
<xref ref-type="corresp" rid="cor001">*</xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">http://orcid.org/0000-0001-6636-8818</contrib-id>
<name name-style="western">
<surname>Mizrahi</surname> <given-names>Itzik</given-names></name>
<role content-type="http://credit.casrai.org/">Conceptualization</role>
<role content-type="http://credit.casrai.org/">Funding acquisition</role>
<xref ref-type="aff" rid="aff002"><sup>2</sup></xref>
</contrib>
<contrib contrib-type="author" corresp="yes" xlink:type="simple">
<name name-style="western">
<surname>Shamir</surname> <given-names>Ron</given-names></name>
<role content-type="http://credit.casrai.org/">Conceptualization</role>
<role content-type="http://credit.casrai.org/">Funding acquisition</role>
<role content-type="http://credit.casrai.org/">Project administration</role>
<role content-type="http://credit.casrai.org/">Supervision</role>
<role content-type="http://credit.casrai.org/">Writing – original draft</role>
<role content-type="http://credit.casrai.org/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"><sup>1</sup></xref>
<xref ref-type="corresp" rid="cor001">*</xref>
</contrib>
</contrib-group>
<aff id="aff001">
<label>1</label>
<addr-line>Blavatnik School of Computer Science, Tel Aviv University, Tel Aviv, Israel</addr-line>
</aff>
<aff id="aff002">
<label>2</label>
<addr-line>Department of Life Sciences, Ben-Gurion University of the Negev and the National Institute for Biotechnology in the Negev, Marcus Family Campus, Beer-Sheva, Israel</addr-line>
</aff>
<contrib-group>
<contrib contrib-type="editor" xlink:type="simple">
<name name-style="western">
<surname>Pertea</surname> <given-names>Mihaela</given-names></name>
<role>Editor</role>
<xref ref-type="aff" rid="edit1"/>
</contrib>
</contrib-group>
<aff id="edit1">
<addr-line>Johns Hopkins University, UNITED STATES</addr-line>
</aff>
<author-notes>
<fn fn-type="conflict" id="coi001">
<p>The authors have declared that no competing interests exist.</p>
</fn>
<corresp id="cor001">* E-mail: <email xlink:type="simple">dpellow@tau.ac.il</email> (DP); <email xlink:type="simple">rshamir@tau.ac.il</email> (RS)</corresp>
</author-notes>
<pub-date pub-type="collection">
<month>4</month>
<year>2020</year>
</pub-date>
<pub-date pub-type="epub">
<day>3</day>
<month>4</month>
<year>2020</year>
</pub-date>
<volume>16</volume>
<issue>4</issue>
<elocation-id>e1007781</elocation-id>
<history>
<date date-type="received">
<day>24</day>
<month>12</month>
<year>2019</year>
</date>
<date date-type="accepted">
<day>8</day>
<month>3</month>
<year>2020</year>
</date>
</history>
<permissions>
<copyright-year>2020</copyright-year>
<copyright-holder>Pellow et al</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">
<license-p>This is an open access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">Creative Commons Attribution License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.</license-p>
</license>
</permissions>
<self-uri content-type="pdf" xlink:href="info:doi/10.1371/journal.pcbi.1007781"/>
<abstract>
<p>Many bacteria contain plasmids, but distinguishing between contigs that originate from plasmids and those that are part of the bacterial genome can be difficult. This is especially true in metagenomic assembly, which yields many contigs of unknown origin. Existing tools for classifying sequences of plasmid origin give less reliable results for shorter sequences, are trained using a fraction of the known plasmids, and can be difficult to use in practice. We present PlasClass, a new plasmid classifier. It uses a set of standard classifiers trained on the most current set of known plasmid sequences for different sequence lengths. We tested PlasClass sequence classification on held-out data and simulations, as well as publicly available bacterial isolates and plasmidome samples and plasmids assembled from metagenomic samples. PlasClass outperforms the state-of-the-art plasmid classification tool on shorter sequences, which constitute the majority of assembly contigs, allowing it to achieve higher F1 scores in classifying sequences from a wide range of datasets. PlasClass also uses significantly less time and memory. PlasClass can be used to easily classify plasmid and bacterial genome sequences in metagenomic or isolate assemblies. It is available under the MIT license from: <ext-link ext-link-type="uri" xlink:href="https://github.com/Shamir-Lab/PlasClass" xlink:type="simple">https://github.com/Shamir-Lab/PlasClass</ext-link>.</p>
</abstract>
<funding-group>
<award-group id="award001">
<funding-source>
<institution-wrap>
<institution-id institution-id-type="funder-id">http://dx.doi.org/10.13039/501100003289</institution-id>
<institution>Ministry of Aliyah and Immigrant Absorption</institution>
</institution-wrap>
</funding-source>
<principal-award-recipient>
<contrib-id authenticated="true" contrib-id-type="orcid">http://orcid.org/0000-0002-6296-5209</contrib-id>
<name name-style="western">
<surname>Pellow</surname> <given-names>David</given-names></name>
</principal-award-recipient>
</award-group>
<award-group id="award002">
<funding-source>
<institution-wrap>
<institution-id institution-id-type="funder-id">http://dx.doi.org/10.13039/501100003977</institution-id>
<institution>Israel Science Foundation</institution>
</institution-wrap>
</funding-source>
<award-id>1339/18</award-id>
<principal-award-recipient>
<name name-style="western">
<surname>Shamir</surname> <given-names>Ron</given-names></name>
</principal-award-recipient>
</award-group>
<award-group id="award003">
<funding-source>
<institution-wrap>
<institution-id institution-id-type="funder-id">http://dx.doi.org/10.13039/501100003977</institution-id>
<institution>Israel Science Foundation</institution>
</institution-wrap>
</funding-source>
<award-id>1947/19</award-id>
<principal-award-recipient>
<contrib-id authenticated="true" contrib-id-type="orcid">http://orcid.org/0000-0001-6636-8818</contrib-id>
<name name-style="western">
<surname>Mizrahi</surname> <given-names>Itzik</given-names></name>
</principal-award-recipient>
</award-group>
<award-group id="award004">
<funding-source>
<institution-wrap>
<institution-id institution-id-type="funder-id">http://dx.doi.org/10.13039/501100001742</institution-id>
<institution>United States-Israel Binational Science Foundation</institution>
</institution-wrap>
</funding-source>
<award-id>2016694</award-id>
<principal-award-recipient>
<name name-style="western">
<surname>Shamir</surname> <given-names>Ron</given-names></name>
</principal-award-recipient>
</award-group>
<award-group id="award005">
<funding-source>
<institution-wrap>
<institution-id institution-id-type="funder-id">http://dx.doi.org/10.13039/100010663</institution-id>
<institution>H2020 European Research Council</institution>
</institution-wrap>
</funding-source>
<award-id>640384</award-id>
<principal-award-recipient>
<contrib-id authenticated="true" contrib-id-type="orcid">http://orcid.org/0000-0001-6636-8818</contrib-id>
<name name-style="western">
<surname>Mizrahi</surname> <given-names>Itzik</given-names></name>
</principal-award-recipient>
</award-group>
<funding-statement>DP is supported in part by an Edmond J. Safra PhD Fellowship (<ext-link ext-link-type="uri" xlink:href="https://safrabio.cs.tau.ac.il/" xlink:type="simple">https://safrabio.cs.tau.ac.il/</ext-link>), and in part by an Israel Ministry of Immigrant Absorption PhD fellowship (<ext-link ext-link-type="uri" xlink:href="https://www.gov.il/en/departments/general/research_students_scholarship" xlink:type="simple">https://www.gov.il/en/departments/general/research_students_scholarship</ext-link>). RS is supported in part by grants from the Israel Science Foundation (ISF - <ext-link ext-link-type="uri" xlink:href="https://www.isf.org.il/#/" xlink:type="simple">https://www.isf.org.il/#/</ext-link>) grant 1339/18, the US - Israel Binational Science Foundation (BSF - <ext-link ext-link-type="uri" xlink:href="https://www.bsf.org.il/" xlink:type="simple">https://www.bsf.org.il/</ext-link>), and the US National Science Foundation (NSF - <ext-link ext-link-type="uri" xlink:href="https://www.nsf.gov/" xlink:type="simple">https://www.nsf.gov/</ext-link>) grant 2016694. IM is supported in part by ISF grant 1947/19 (ISF - <ext-link ext-link-type="uri" xlink:href="https://www.isf.org.il/#/" xlink:type="simple">https://www.isf.org.il/#/</ext-link>) and ERC Horizon 2020 research and innovation program grant 640384 (<ext-link ext-link-type="uri" xlink:href="https://ec.europa.eu/programmes/horizon2020/en" xlink:type="simple">https://ec.europa.eu/programmes/horizon2020/en</ext-link>). The funders had no role in study design, data collection and analysis, decision to publish, or preparation of the manuscript.</funding-statement>
</funding-group>
<counts>
<fig-count count="1"/>
<table-count count="6"/>
<page-count count="9"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>PLOS Publication Stage</meta-name>
<meta-value>vor-update-to-uncorrected-proof</meta-value>
</custom-meta>
<custom-meta>
<meta-name>Publication Update</meta-name>
<meta-value>2020-04-15</meta-value>
</custom-meta>
<custom-meta id="data-availability">
<meta-name>Data Availability</meta-name>
<meta-value>The reported results are for publicly available datasets. The waste-water plasmidome and human gut metagenomes are available through the SRA (accessions ERR1538272, ERR1297700, ERR1297720, ERR1297770, ERR1297796, ERR1297822, ERR1297834). The bacterial isolates dataset was previously curated from publicly available data by Arredondo-Alonso et al. It can be accessed from <ext-link ext-link-type="uri" xlink:href="https://gitlab.com/sirarredondo/Plasmid_Assembly" xlink:type="simple">https://gitlab.com/sirarredondo/Plasmid_Assembly</ext-link>.</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<p>This is a <italic>PLOS Computational Biology</italic> Software paper.</p>
<sec id="sec001" sec-type="intro">
<title>Introduction</title>
<p>When using high-throughput sequencing to study the presence and dynamics of plasmids in their bacterial hosts, it is often necessary to classify sequences as being of plasmid or chromosomal origin. This is especially true in the case of metagenomic sequencing, which can include many sequences of unknown origin and varying lengths. We focus on the challenge of classifying contigs in a metagenomic assembly in order to identify which are of plasmid origin.</p>
<p>The current state-of-the-art classifier of plasmid sequences is PlasFlow [<xref ref-type="bibr" rid="pcbi.1007781.ref001">1</xref>], a neural-network-based algorithm that was shown to perform better than previous tools such as cBar [<xref ref-type="bibr" rid="pcbi.1007781.ref002">2</xref>]. While PlasFlow is successful in classifying small sets of long sequences, it produces less reliable results for short sequences and requires large memory on very large metagenomic datasets.</p>
<p>Here we present PlasClass, a new plasmid sequence classifier implemented as an easy-to-use Python package. It uses a set of logistic regression classifiers, each trained on sequences of a different length sampled from plasmid and bacterial genome reference sequences. When applied to a set of sequences, the appropriate length-specific classifier is used for each sequence.</p>
<p>We tested PlasClass on simulated data, on bacterial isolates, on a wastewater plasmidome, and on plasmids assembled from human gut microbiome samples. For shorter sequences, which are the majority of contigs in an assembly, PlasClass achieved better F1 scores than PlasFlow. This resulted in better overall performance across all the datasets tested. PlasClass also used significantly less RAM and disk memory than PlasFlow, and can be run much faster by using multiprocessing.</p>
<p>PlasClass is provided at <ext-link ext-link-type="uri" xlink:href="https://github.com/Shamir-Lab/PlasClass" xlink:type="simple">https://github.com/Shamir-Lab/PlasClass</ext-link>.</p>
</sec>
<sec id="sec002" sec-type="materials|methods">
<title>Design and implementation</title>
<sec id="sec003">
<title>Training databases</title>
<p>We used reference sequence databases to obtain the training sequences for our classifiers. For the plasmid references we used plasmid sequences listed in PLSDB [<xref ref-type="bibr" rid="pcbi.1007781.ref003">3</xref>] (v.2018_12_05), an up-to-date curated plasmid database. After filtering out duplicate sequences this database contained 13469 reference plasmids (median length: 53.8kb).</p>
<p>For the bacterial chromosome references we downloaded all complete bacterial genome assemblies from NCBI (download date January 9, 2019). We removed sequences annotated as being plasmids and filtered out duplicates, leaving 13491 reference chromosomes (median length: 3.7Mbp).</p>
<p>One quarter of the sequences were randomly removed from the databases before training in order to provide a held-out test set for validation. After validation, PlasClass was then retrained on the full databases, and this version was used for testing on assembled data.</p>
</sec>
<sec id="sec004">
<title>Training the classifiers</title>
<p>We sampled sequence fragments of different lengths from the reference sequences with replacement and constructed a k-mer frequency vector for each fragment. Canonical k-mers of lengths 3–7 were used, resulting in a feature vector of length 10952 for each fragment. Fragment lengths were 500k, 100k, 10k, and 1k. For the two shorter lengths, 90,000 training fragments were used from each class. For the lengths 500k and 100k, since there were not enough long plasmids to do the same, we sampled enough fragments to cover all of the sufficiently long plasmids to a depth of 5. This resulted in 1934 and 45525 plasmid fragments of length 500k and 100k, respectively, on the full plasmid database.</p>
<p>For each length, a logistic regression classifier was trained on the plasmid and chromosomal fragments’ k-mer frequency vectors using the scikit-learn [<xref ref-type="bibr" rid="pcbi.1007781.ref004">4</xref>] machine learning library in Python. Code is provided to retrain the models on user-supplied reference sequence databases.</p>
</sec>
<sec id="sec005">
<title>Length-specific classification</title>
<p>PlasClass uses four logistic regression models to classify sequences of different length. Each sequence is assigned to the closest length from among 1kb, 10kb, 100kb, and 500kb. Equivalently, this defines four length ranges: (0,5.5kb], (5.5kb,55kb], (55kb,300kb], (300kb, ∞). Given a sequence, its k-mers are counted, the canonical k-mer frequency vector is calculated and used to classify it with the classifier for the range it falls into. k-mer counting can be performed in parallel for different sequences. Finally, all classification results are concatenated into a single output in the same order as the input sequences.</p>
</sec>
<sec id="sec006">
<title>Classification with PlasClass</title>
<p>PlasClass is available at <ext-link ext-link-type="uri" xlink:href="https://github.com/Shamir-Lab/PlasClass" xlink:type="simple">https://github.com/Shamir-Lab/PlasClass</ext-link>. It has been retrained using the full set of database references. PlasClass can be used as a command-line tool to classify sequences in an input fasta file or it can be imported as a module into the user’s code to classify sequences in the user’s program. It can be run in parallel mode to achieve faster runtimes. PlasClass is fully documented in <xref ref-type="supplementary-material" rid="pcbi.1007781.s001">S1 File</xref> and at the url provided above.</p>
</sec>
</sec>
<sec id="sec007" sec-type="results">
<title>Results</title>
<p>We tested performance of PlasClass on both simulated and real data and compared it to PlasFlow.</p>
<sec id="sec008">
<title>Experimental settings</title>
<p>PlasClass and PlasFlow both assign class probabilities to each sequence. We say a sequence is classified as having plasmid origin if the probability that it belongs to the plasmid class is &gt; 0.5. When running PlasFlow, this probability was summed over all plasmid classes, and we set the parameter <monospace>--threshold = 0.5</monospace> to ensure each sequence is classified as either plasmid or bacterial. All assemblies were performed using the <monospace>--meta</monospace> option of SPAdes [<xref ref-type="bibr" rid="pcbi.1007781.ref005">5</xref>] v3.12.</p>
</sec>
<sec id="sec009">
<title>Performance metrics</title>
<p>We calculated the precision, recall and F1 scores counting the <italic>number</italic> of true positive and false positive predictions. Some previous works [<xref ref-type="bibr" rid="pcbi.1007781.ref001">1</xref>, <xref ref-type="bibr" rid="pcbi.1007781.ref006">6</xref>] calculated performance based on the <italic>lengths</italic> of the sequences classified as plasmids and the total length of the plasmids in a sample. A length-weighted metric is appropriate in the context of plasmid sequence assembly, but in the context of contig classification this makes little sense. (Consider the extreme case of one extremely long sequence and 999 very short ones. Classifying the long contig is easy, but a classifier that only identifies it correctly will have weighted precision and recall near 1 even though only 1/1000 of the sequences are correctly classified.) For this reason we used the numbers of correctly classified sequences.</p>
<p>On the assembled contigs we follow the previous works [<xref ref-type="bibr" rid="pcbi.1007781.ref001">1</xref>, <xref ref-type="bibr" rid="pcbi.1007781.ref006">6</xref>] and consider a contig to be from the plasmid class if it matches a plasmid reference sequence—even if it also matches a chromosomal reference sequence. This is appropriate for classifying all sequences in an assembly to determine their origin. However, when constructing a benchmark for a classifier, it may be more suitable to filter ambiguous sequences that may belong to both classes out of the test set. For this reason, we also report results with all ambiguous sequences filtered out in <xref ref-type="supplementary-material" rid="pcbi.1007781.s002">S2 File</xref>.</p>
</sec>
<sec id="sec010">
<title>Classifying sequences from held-out references</title>
<p>We sampled overlapping <italic>L</italic>-long fragments covering the held-out plasmids with an overlap of <italic>L</italic>/2 for <italic>L</italic> = 100k, 10k and 1k. A matching number of <italic>L</italic>-long fragments were sampled from the held-out bacterial genomes for each length <italic>L</italic>. (Note that this creates a balanced classification scenario.) <xref ref-type="table" rid="pcbi.1007781.t001">Table 1</xref> summarizes the classification results. PlasClass improved precision at the cost of slightly lower recall and had better overall F1 on the shorter sequence lengths. These short sequences can make up the majority of contigs in metagenomic assemblies, allowing PlasClass to outperform PlasFlow in many settings, as shown below.</p>
<table-wrap id="pcbi.1007781.t001" position="float">
<object-id pub-id-type="doi">10.1371/journal.pcbi.1007781.t001</object-id>
<label>Table 1</label>
<caption>
<title>Performance on held out data.</title>
</caption>
<alternatives>
<graphic id="pcbi.1007781.t001g" mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1007781.t001" xlink:type="simple"/>
<table border="0" frame="box" rules="all">
<colgroup>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
</colgroup>
<thead>
<tr>
<th align="left" rowspan="2">Length (bp)</th>
<th align="left" rowspan="2"># fragments per class</th>
<th align="center" colspan="3">PlasClass</th>
<th align="center" colspan="3">PlasFlow</th>
</tr>
<tr>
<th align="center">Precision</th>
<th align="center">Recall</th>
<th align="center">F1</th>
<th align="center">Precision</th>
<th align="center">Recall</th>
<th align="center">F1</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left">100k</td>
<td align="left">2979</td>
<td align="char" char=".">96.9</td>
<td align="char" char=".">85.4</td>
<td align="char" char=".">90.8</td>
<td align="char" char=".">95.6</td>
<td align="char" char=".">88.4</td>
<td align="char" char=".">91.9</td>
</tr>
<tr>
<td align="left">10k</td>
<td align="left">56583</td>
<td align="char" char=".">88.7</td>
<td align="char" char=".">86.4</td>
<td align="char" char=".">87.6</td>
<td align="char" char=".">83.1</td>
<td align="char" char=".">87.7</td>
<td align="char" char=".">85.3</td>
</tr>
<tr>
<td align="left">1k</td>
<td align="left">607656</td>
<td align="char" char=".">75.1</td>
<td align="char" char=".">74.6</td>
<td align="char" char=".">74.8</td>
<td align="char" char=".">59.7</td>
<td align="char" char=".">79.1</td>
<td align="char" char=".">68.1</td>
</tr>
</tbody>
</table>
</alternatives>
<table-wrap-foot>
<fn id="t001fn001">
<p>Performance of PlasClass and PlasFlow on fixed length sequence fragments sampled from the held out references.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="sec011">
<title>Performance on a benchmark of bacterial isolates</title>
<p>We compared the performance of PlasClass to PlasFlow on the isolate assemblies from the benchmark in [<xref ref-type="bibr" rid="pcbi.1007781.ref006">6</xref>]. Specifically, we downloaded the assemblies and all bacterial and plasmid reference sequences used in the benchmarking experiment of [<xref ref-type="bibr" rid="pcbi.1007781.ref006">6</xref>] (available from: <ext-link ext-link-type="uri" xlink:href="https://gitlab.com/sirarredondo/Plasmid_Assembly" xlink:type="simple">https://gitlab.com/sirarredondo/Plasmid_Assembly</ext-link>). Assembled contigs were mapped to the references using BLAST and contigs with matches (&gt;95% mapping identity along &gt;95% of the contig length) were assigned to the plasmid or chromosome class as described. There were 60579 contigs across all the assemblies, of which 36172 matched one of the classes (8569 plasmid and 27603 chromosome) and were used in this test. As seen in <xref ref-type="table" rid="pcbi.1007781.t002">Table 2</xref>, the majority of these sequences were extremely short (68% of the 36172 contigs were &lt;500bp). We examined the impact of these short sequences by filtering out contigs below a certain length; the results of both methods improved when shorter sequences were filtered out. In all cases, PlasClass had higher F1.</p>
<table-wrap id="pcbi.1007781.t002" position="float">
<object-id pub-id-type="doi">10.1371/journal.pcbi.1007781.t002</object-id>
<label>Table 2</label>
<caption>
<title>Performance on bacterial isolates.</title>
</caption>
<alternatives>
<graphic id="pcbi.1007781.t002g" mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1007781.t002" xlink:type="simple"/>
<table border="0" frame="box" rules="all">
<colgroup>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
</colgroup>
<thead>
<tr>
<th align="left" rowspan="2">Contig length (bp)</th>
<th align="left" rowspan="2"># of contigs</th>
<th align="center" colspan="3">PlasClass</th>
<th align="center" colspan="3">PlasFlow</th>
</tr>
<tr>
<th align="center">Precision</th>
<th align="center">Recall</th>
<th align="center">F1</th>
<th align="center">Precision</th>
<th align="center">Recall</th>
<th align="center">F1</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left">All</td>
<td align="left">36172</td>
<td align="char" char=".">43.65</td>
<td align="char" char=".">77.58</td>
<td align="char" char=".">55.87</td>
<td align="char" char=".">31.16</td>
<td align="char" char=".">87.77</td>
<td align="char" char=".">46.00</td>
</tr>
<tr>
<td align="left">&gt;500</td>
<td align="left">11659</td>
<td align="char" char=".">53.15</td>
<td align="char" char=".">91.30</td>
<td align="char" char=".">67.18</td>
<td align="char" char=".">37.68</td>
<td align="char" char=".">89.23</td>
<td align="char" char=".">52.99</td>
</tr>
<tr>
<td align="left">&gt;1000</td>
<td align="left">7414</td>
<td align="char" char=".">59.95</td>
<td align="char" char=".">91.82</td>
<td align="char" char=".">72.54</td>
<td align="char" char=".">47.54</td>
<td align="char" char=".">90.04</td>
<td align="char" char=".">62.23</td>
</tr>
<tr>
<td align="left">&gt;5000</td>
<td align="left">3999</td>
<td align="char" char=".">61.84</td>
<td align="char" char=".">92.12</td>
<td align="char" char=".">74.00</td>
<td align="char" char=".">50.05</td>
<td align="char" char=".">92.31</td>
<td align="char" char=".">64.91</td>
</tr>
</tbody>
</table>
</alternatives>
<table-wrap-foot>
<fn id="t002fn001">
<p>Performance on bacterial isolates from [<xref ref-type="bibr" rid="pcbi.1007781.ref006">6</xref>], as a function of the minimum contig length.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="sec012">
<title>Performance on simulated metagenome assemblies</title>
<p>We simulated metagenomes by randomly selecting bacterial genome references from the NCBI along with their associated plasmids and using realistic distributions for genome abundance and plasmid copy number. For genome abundance we used the log-normal distribution, normalized so that the relative abundances sum to 1. For plasmid copy number we used a geometric distribution with parameter <italic>p</italic> = <italic>min</italic>(1, <italic>log</italic>(<italic>L</italic>)/7) where <italic>L</italic> is the plasmid length. This makes it much less likely for a long plasmid to have a copy number above 1, while shorter plasmids can have higher copy numbers. Short reads were simulated from the genome references using InSilicoSeq [<xref ref-type="bibr" rid="pcbi.1007781.ref007">7</xref>] and assembled.</p>
<p>We then classified the assembled contigs. Classification was performed on the assembled contigs that had a match to either a reference plasmid or reference chromosome sequence used in the simulation (1641 plasmid contigs, 32451 chromosome contigs in Sim1, and 14272 plasmid contigs, 374397 chromosome contigs in Sim2). F1 results are shown in <xref ref-type="table" rid="pcbi.1007781.t003">Table 3</xref>. PlasClass outperformed PlasFlow by more than 17%. Scores were low for both methods due to the many short contigs in the assembly (50% and 73% of the contigs &lt;500 bp in Sim1 and Sim2 respectively) and the class imbalance. We show the impact of short sequences on performance in <xref ref-type="table" rid="pcbi.1007781.t004">Table 4</xref>. PlasClass consistently outperformed PlasFlow, and both methods performed better as more short sequences were filtered out.</p>
<table-wrap id="pcbi.1007781.t003" position="float">
<object-id pub-id-type="doi">10.1371/journal.pcbi.1007781.t003</object-id>
<label>Table 3</label>
<caption>
<title>Performance on simulated metagenomes.</title>
</caption>
<alternatives>
<graphic id="pcbi.1007781.t003g" mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1007781.t003" xlink:type="simple"/>
<table border="0" frame="box" rules="all">
<colgroup>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
</colgroup>
<thead>
<tr>
<th align="center"/>
<th align="center"># chromosomes</th>
<th align="center"># plasmids</th>
<th align="center"># unique</th>
<th align="center"># contigs</th>
<th align="center">PlasClass F1</th>
<th align="center">PlasFlow F1</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left">Sim1</td>
<td align="center">34</td>
<td align="center">82</td>
<td align="center">56</td>
<td align="center">34092</td>
<td align="char" char=".">15.79</td>
<td align="char" char=".">13.49</td>
</tr>
<tr>
<td align="left">Sim2</td>
<td align="center">198</td>
<td align="center">333</td>
<td align="center">219</td>
<td align="center">388669</td>
<td align="char" char=".">12.08</td>
<td align="char" char=".">8.79</td>
</tr>
</tbody>
</table>
</alternatives>
<table-wrap-foot>
<fn id="t003fn001">
<p>Summary of the simulated metagenome datasets and comparison of F1 scores. # unique is the number of distinct plasmids, ignoring multiple copies.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<table-wrap id="pcbi.1007781.t004" position="float">
<object-id pub-id-type="doi">10.1371/journal.pcbi.1007781.t004</object-id>
<label>Table 4</label>
<caption>
<title>Simulated metagenome performance by length.</title>
</caption>
<alternatives>
<graphic id="pcbi.1007781.t004g" mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1007781.t004" xlink:type="simple"/>
<table border="0" frame="box" rules="all">
<colgroup>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
</colgroup>
<thead>
<tr>
<th align="left" rowspan="2"/>
<th align="left" rowspan="2">Contig length (bp)</th>
<th align="left" rowspan="2"># of contigs</th>
<th align="center" colspan="3">PlasClass</th>
<th align="center" colspan="3">PlasFlow</th>
</tr>
<tr>
<th align="center">Precision</th>
<th align="center">Recall</th>
<th align="center">F1</th>
<th align="center">Precision</th>
<th align="center">Recall</th>
<th align="center">F1</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" rowspan="4">Sim1</td>
<td align="left">All</td>
<td align="left">34092</td>
<td align="char" char=".">8.94</td>
<td align="char" char=".">67.40</td>
<td align="char" char=".">15.79</td>
<td align="char" char=".">7.30</td>
<td align="char" char=".">87.75</td>
<td align="char" char=".">13.49</td>
</tr>
<tr>
<td align="left">&gt;500</td>
<td align="left">17023</td>
<td align="char" char=".">11.22</td>
<td align="char" char=".">78.55</td>
<td align="char" char=".">19.64</td>
<td align="char" char=".">8.20</td>
<td align="char" char=".">85.05</td>
<td align="char" char=".">14.95</td>
</tr>
<tr>
<td align="left">&gt;1000</td>
<td align="left">11696</td>
<td align="char" char=".">15.67</td>
<td align="char" char=".">80.96</td>
<td align="char" char=".">26.26</td>
<td align="char" char=".">10.92</td>
<td align="char" char=".">85.00</td>
<td align="char" char=".">19.36</td>
</tr>
<tr>
<td align="left">&gt;5000</td>
<td align="left">4032</td>
<td align="char" char=".">36.11</td>
<td align="char" char=".">86.80</td>
<td align="char" char=".">51.00</td>
<td align="char" char=".">28.09</td>
<td align="char" char=".">90.80</td>
<td align="char" char=".">42.91</td>
</tr>
<tr>
<td align="left" rowspan="4">Sim2</td>
<td align="left">All</td>
<td align="left">388669</td>
<td align="char" char=".">6.64</td>
<td align="char" char=".">66.98</td>
<td align="char" char=".">12.08</td>
<td align="char" char=".">4.64</td>
<td align="char" char=".">84.31</td>
<td align="char" char=".">8.79</td>
</tr>
<tr>
<td align="left">&gt;500</td>
<td align="left">106814</td>
<td align="char" char=".">13.76</td>
<td align="char" char=".">76.00</td>
<td align="char" char=".">23.29</td>
<td align="char" char=".">8.42</td>
<td align="char" char=".">84.23</td>
<td align="char" char=".">15.32</td>
</tr>
<tr>
<td align="left">&gt;1000</td>
<td align="left">45597</td>
<td align="char" char=".">22.42</td>
<td align="char" char=".">79.20</td>
<td align="char" char=".">34.95</td>
<td align="char" char=".">14.01</td>
<td align="char" char=".">86.52</td>
<td align="char" char=".">24.11</td>
</tr>
<tr>
<td align="left">&gt;5000</td>
<td align="left">5642</td>
<td align="char" char=".">46.50</td>
<td align="char" char=".">81.18</td>
<td align="char" char=".">59.13</td>
<td align="char" char=".">38.48</td>
<td align="char" char=".">88.49</td>
<td align="char" char=".">53.63</td>
</tr>
</tbody>
</table>
</alternatives>
<table-wrap-foot>
<fn id="t004fn001">
<p>Performance on simulated metagenomes as a function of the minimum contig length.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="sec013">
<title>Performance on a plasmidome sample</title>
<p>We assembled the wastewater plasmidome sample ERR1538272 from the study by Shi et al. [<xref ref-type="bibr" rid="pcbi.1007781.ref008">8</xref>]. It is a metagenomic sample that was enriched for plasmid sequences. Each contig in the assembly was matched to the plasmid and bacterial reference databases using BLAST. The set of 9854 contigs (out of 35285) that matched the reference sequences (1888 plasmid contigs, 7966 chromosome contigs) was used as the gold standard to test the classifiers (contig length distribution is presented in <xref ref-type="supplementary-material" rid="pcbi.1007781.s003">S3 File</xref>; see also <xref ref-type="supplementary-material" rid="pcbi.1007781.s004">S1 Fig</xref>). Although the plasmid-enriched setting favors PlasFlow, which sacrifices precision for higher recall, PlasClass still had a higher combined F1 as shown in <xref ref-type="table" rid="pcbi.1007781.t005">Table 5</xref>.</p>
<table-wrap id="pcbi.1007781.t005" position="float">
<object-id pub-id-type="doi">10.1371/journal.pcbi.1007781.t005</object-id>
<label>Table 5</label>
<caption>
<title>Performance on a plasmidome sample.</title>
</caption>
<alternatives>
<graphic id="pcbi.1007781.t005g" mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1007781.t005" xlink:type="simple"/>
<table border="0" frame="box" rules="all">
<colgroup>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
</colgroup>
<thead>
<tr>
<th align="left"/>
<th align="left">Precision</th>
<th align="left">Recall</th>
<th align="left">F1 score</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left">PlasClass</td>
<td align="char" char="."><bold>32.32</bold></td>
<td align="char" char=".">64.25</td>
<td align="char" char="."><bold>43.01</bold></td>
</tr>
<tr>
<td align="left">PlasFlow</td>
<td align="char" char=".">23.72</td>
<td align="char" char="."><bold>86.49</bold></td>
<td align="char" char=".">37.23</td>
</tr>
</tbody>
</table>
</alternatives>
<table-wrap-foot>
<fn id="t005fn001">
<p>Performance of PlasClass and PlasFlow on the plasmidome sample from [<xref ref-type="bibr" rid="pcbi.1007781.ref008">8</xref>].</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>We computed the precision-recall curve for the classification of the gold standard contigs in this sample by PlasClass, shown in <xref ref-type="supplementary-material" rid="pcbi.1007781.s005">S2 Fig</xref> (see also <xref ref-type="supplementary-material" rid="pcbi.1007781.s003">S3 File</xref>). The area under the curve is 0.41, more than double the baseline of 0.19 (the fraction of the contigs that are of plasmid origin).</p>
</sec>
<sec id="sec014">
<title>Classifying plasmids assembled from metagenomic samples</title>
<p>We assembled six publicly available human gut microbiome samples (accessions: ERR1297700, ERR1297720, ERR1297770, ERR1297796, ERR1297822, ERR1297834) and found plasmid sequences in the assemblies using Recycler [<xref ref-type="bibr" rid="pcbi.1007781.ref009">9</xref>]. Recycler assembles plasmid sequences based on coverage and circularity—features that are not used by the classifiers. 16–27 plasmids were assembled per sample (median length: 3.4kb). We classified each of the plasmids generated by Recycler to determine the extent of agreement between the sequence classifiers and this orthogonal approach. As seen in <xref ref-type="fig" rid="pcbi.1007781.g001">Fig 1</xref>, PlasClass agreed with Recycler on the same number or more plasmids than PlasFlow in all samples. This suggests that PlasClass can correctly identify more plasmids in real datasets, which contain many previously unknown plasmid sequences.</p>
<fig id="pcbi.1007781.g001" position="float">
<object-id pub-id-type="doi">10.1371/journal.pcbi.1007781.g001</object-id>
<label>Fig 1</label>
<caption>
<title>Classifying plasmids assembled from metagenomic samples.</title>
<p>Agreement of PlasClass and PlasFlow classifications with the plasmids generated by Recycler.</p>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1007781.g001" xlink:type="simple"/>
</fig>
</sec>
<sec id="sec015">
<title>Resource usage</title>
<p>In <xref ref-type="table" rid="pcbi.1007781.t006">Table 6</xref>, we compare the runtime and memory usage of PlasClass and PlasFlow on the full plasmidome, simulated metagenome, and isolate bacterial datasets. PlasClass (running with a single process) was faster than PlasFlow on the most time-consuming sample and was significantly faster in all cases when using multiprocessing. It used less than half the RAM of PlasFlow and the RAM usage was not increased significantly when using multiprocessing. PlasFlow writes the feature matrices to disk while PlasClass does not. Performance was measured on a 44-core, 2.2 GHz server with 792 GB of RAM.</p>
<table-wrap id="pcbi.1007781.t006" position="float">
<object-id pub-id-type="doi">10.1371/journal.pcbi.1007781.t006</object-id>
<label>Table 6</label>
<caption>
<title>Resource usage.</title>
</caption>
<alternatives>
<graphic id="pcbi.1007781.t006g" mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1007781.t006" xlink:type="simple"/>
<table border="0" frame="box" rules="all">
<colgroup>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
</colgroup>
<thead>
<tr>
<th align="left" rowspan="2">Dataset</th>
<th align="center" colspan="3">PlasFlow</th>
<th align="center" colspan="2">PlasClass</th>
<th align="center" colspan="2">PlasClass—8 processes</th>
</tr>
<tr>
<th align="center">Runtime</th>
<th align="center">RAM</th>
<th align="center">Disk</th>
<th align="center">Runtime</th>
<th align="center">RAM</th>
<th align="center">Runtime</th>
<th align="center">RAM</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left">Isolates</td>
<td align="char" char=".">12.8</td>
<td align="char" char=".">47.8</td>
<td align="char" char=".">21.4</td>
<td align="char" char=".">36.3</td>
<td align="char" char=".">17.2</td>
<td align="char" char=".">6.8</td>
<td align="char" char=".">17.2</td>
</tr>
<tr>
<td align="left">Sim1</td>
<td align="char" char=".">7.1</td>
<td align="char" char=".">28.3</td>
<td align="char" char=".">12.1</td>
<td align="char" char=".">16.2</td>
<td align="char" char=".">12.0</td>
<td align="char" char=".">3.0</td>
<td align="char" char=".">12.0</td>
</tr>
<tr>
<td align="left">Sim2</td>
<td align="char" char=".">89.3</td>
<td align="char" char=".">291.3</td>
<td align="char" char=".">137.5</td>
<td align="char" char=".">54.8</td>
<td align="char" char=".">17.3</td>
<td align="char" char=".">17.1</td>
<td align="char" char=".">17.3</td>
</tr>
<tr>
<td align="left">Plasmidome</td>
<td align="char" char=".">7.9</td>
<td align="char" char=".">28.8</td>
<td align="char" char=".">12.2</td>
<td align="char" char=".">4.2</td>
<td align="char" char=".">12.2</td>
<td align="char" char=".">5.2</td>
<td align="char" char=".">17.3</td>
</tr>
</tbody>
</table>
</alternatives>
<table-wrap-foot>
<fn id="t006fn001">
<p>Runtime (wall clock time, in minutes) and memory usage (in GB) of PlasClass and PlasFlow.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
</sec>
<sec id="sec016" sec-type="conclusions">
<title>Discussion</title>
<p>We presented the PlasClass algorithm for classifying plasmid sequences. We applied the algorithm across a wide range of contexts and showed that in most cases PlasClass outperformed the state-of-the-art algorithm PlasFlow. It was also faster and required less memory.</p>
<p>Classifying plasmid sequences in the real-world context of metagenomic data is a difficult task due to the nature of the assembled sequences: the sequences are mostly short (60–90% are shorter than 1 kbp, see Tables <xref ref-type="table" rid="pcbi.1007781.t002">2</xref> and <xref ref-type="table" rid="pcbi.1007781.t004">4</xref>), and there is an imbalance between the number of plasmid and bacterial sequences (1:3 in the bacterial isolates, and 1:4 in the plasmid-enriched plasmidome samples presented). Given the constraints, the quality of classification is naturally limited, but the task is of high importance for understanding the role of plasmids in horizontal transfer, antibiotic resistance and ecology. We also showed that classification quality improves when focusing on longer sequences and when plasmid sequences are enriched.</p>
</sec>
<sec id="sec017">
<title>Availability and future directions</title>
<p>PlasClass is open-source and freely available under the MIT license. PlasClass is maintained on GitHub, enabling bug-reporting and community collaboration in extending the tool to meet the needs of users as they arise. It can be found at <ext-link ext-link-type="uri" xlink:href="https://github.com/Shamir-Lab/PlasClass" xlink:type="simple">https://github.com/Shamir-Lab/PlasClass</ext-link>.</p>
<p>We plan to use PlasClass in order to improve plasmid assembly from metagenomic samples, by utilizing the classification scores of contigs. Another possible future direction is to tailor the plasmid training data to the problem at hand: Currently we use all known plasmids for training, which creates a bias towards clinically relevant samples. By using training datasets tailored to other specific environments one can create a classifier that would fit those environments better.</p>
</sec>
<sec id="sec018">
<title>Supporting information</title>
<supplementary-material id="pcbi.1007781.s001" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1007781.s001" xlink:type="simple">
<label>S1 File</label>
<caption>
<title>PlasClass documentation.</title>
<p>Complete documentation for using PlasClass.</p>
<p>(PDF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1007781.s002" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1007781.s002" xlink:type="simple">
<label>S2 File</label>
<caption>
<title>Results with ambiguous sequences filtered.</title>
<p>Extended results reporting performance with ambiguous sequences filtered out.</p>
<p>(PDF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1007781.s003" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1007781.s003" xlink:type="simple">
<label>S3 File</label>
<caption>
<title>Plasmidome dataset extended results.</title>
<p>Extended results reporting the contig lengths and precision-recall curve for the plasmidome sample.</p>
<p>(PDF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1007781.s004" mimetype="image/tiff" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1007781.s004" xlink:type="simple">
<label>S1 Fig</label>
<caption>
<title>Plasmidome contig lengths.</title>
<p>Histogram of the contig lengths in the plasmidome assembly. Note that the y-axis uses log-scale.</p>
<p>(TIF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1007781.s005" mimetype="image/tiff" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1007781.s005" xlink:type="simple">
<label>S2 Fig</label>
<caption>
<title>Plasmidome precision-recall curve.</title>
<p>Precision-recall curve for the classification of contigs in the plasmidome sample.</p>
<p>(TIF)</p>
</caption>
</supplementary-material>
</sec>
</body>
<back>
<ack>
<p>We thank the members of the Shamir and Mizrahi Labs for their help and advice.</p>
</ack>
<ref-list>
<title>References</title>
<ref id="pcbi.1007781.ref001">
<label>1</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Krawczyk</surname> <given-names>PS</given-names></name>, <name name-style="western"><surname>Lipinski</surname> <given-names>L</given-names></name>, <name name-style="western"><surname>Dziembowski</surname> <given-names>A</given-names></name>. <article-title>PlasFlow: predicting plasmid sequences in metagenomic data using genome signatures</article-title>. <source>Nucleic Acids Research</source>. <year>2018</year>;<volume>46</volume>(<issue>6</issue>):<fpage>e35</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1093/nar/gkx1321" xlink:type="simple">10.1093/nar/gkx1321</ext-link></comment> <object-id pub-id-type="pmid">29346586</object-id></mixed-citation>
</ref>
<ref id="pcbi.1007781.ref002">
<label>2</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Zhou</surname> <given-names>F</given-names></name>, <name name-style="western"><surname>Xu</surname> <given-names>Y</given-names></name>. <article-title>cBar: a computer program to distinguish plasmid-derived from chromosome-derived sequence fragments in metagenomics data</article-title>. <source>Bioinformatics</source>. <year>2010</year>;<volume>26</volume>(<issue>16</issue>):<fpage>2051</fpage>–<lpage>2052</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1093/bioinformatics/btq299" xlink:type="simple">10.1093/bioinformatics/btq299</ext-link></comment> <object-id pub-id-type="pmid">20538725</object-id></mixed-citation>
</ref>
<ref id="pcbi.1007781.ref003">
<label>3</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Galata</surname> <given-names>V</given-names></name>, <name name-style="western"><surname>Fehlmann</surname> <given-names>T</given-names></name>, <name name-style="western"><surname>Backes</surname> <given-names>C</given-names></name>, <name name-style="western"><surname>Keller</surname> <given-names>A</given-names></name>. <article-title>PLSDB: a resource of complete bacterial plasmids</article-title>. <source>Nucleic Acids Research</source>. <year>2018</year>;<volume>47</volume>(<issue>D1</issue>):<fpage>D195</fpage>–<lpage>D202</lpage>.</mixed-citation>
</ref>
<ref id="pcbi.1007781.ref004">
<label>4</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Pedregosa</surname> <given-names>F</given-names></name>, <name name-style="western"><surname>Varoquaux</surname> <given-names>G</given-names></name>, <name name-style="western"><surname>Gramfort</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Michel</surname> <given-names>V</given-names></name>, <name name-style="western"><surname>Thirion</surname> <given-names>B</given-names></name>, <name name-style="western"><surname>Grisel</surname> <given-names>O</given-names></name>, <etal>et al</etal>. <article-title>Scikit-learn: Machine Learning in Python</article-title>. <source>Journal of Machine Learning Research</source>. <year>2011</year>;<volume>12</volume>:<fpage>2825</fpage>–<lpage>2830</lpage>.</mixed-citation>
</ref>
<ref id="pcbi.1007781.ref005">
<label>5</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Bankevich</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Nurk</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Antipov</surname> <given-names>D</given-names></name>, <name name-style="western"><surname>Gurevich</surname> <given-names>AA</given-names></name>, <name name-style="western"><surname>Dvorkin</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Kulikov</surname> <given-names>AS</given-names></name>, <etal>et al</etal>. <article-title>SPAdes: a new genome assembly algorithm and its applications to single-cell sequencing</article-title>. <source>Journal of Computational Biology</source>. <year>2012</year>;<volume>19</volume>(<issue>5</issue>):<fpage>455</fpage>–<lpage>477</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1089/cmb.2012.0021" xlink:type="simple">10.1089/cmb.2012.0021</ext-link></comment> <object-id pub-id-type="pmid">22506599</object-id></mixed-citation>
</ref>
<ref id="pcbi.1007781.ref006">
<label>6</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Arredondo-Alonso</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Willems</surname> <given-names>RJ</given-names></name>, <name name-style="western"><surname>van Schaik</surname> <given-names>W</given-names></name>, <name name-style="western"><surname>Schürch</surname> <given-names>AC</given-names></name>. <article-title>On the (im)possibility of reconstructing plasmids from whole-genome short-read sequencing data</article-title>. <source>Microbial genomics</source>. <year>2017</year>;<volume>3</volume>(<issue>10</issue>):<fpage>e000128</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1099/mgen.0.000128" xlink:type="simple">10.1099/mgen.0.000128</ext-link></comment> <object-id pub-id-type="pmid">29177087</object-id></mixed-citation>
</ref>
<ref id="pcbi.1007781.ref007">
<label>7</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Gourlé</surname> <given-names>H</given-names></name>, <name name-style="western"><surname>Karlsson-Lindsjö</surname> <given-names>O</given-names></name>, <name name-style="western"><surname>Hayer</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Bongcam-Rudloff</surname> <given-names>E</given-names></name>. <article-title>Simulating Illumina metagenomic data with InSilicoSeq</article-title>. <source>Bioinformatics</source>. <year>2018</year>;<volume>35</volume>(<issue>3</issue>):<fpage>521</fpage>–<lpage>522</lpage>.</mixed-citation>
</ref>
<ref id="pcbi.1007781.ref008">
<label>8</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Shi</surname> <given-names>Y</given-names></name>, <name name-style="western"><surname>Zhang</surname> <given-names>H</given-names></name>, <name name-style="western"><surname>Tian</surname> <given-names>Z</given-names></name>, <name name-style="western"><surname>Yang</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Zhang</surname> <given-names>Y</given-names></name>. <article-title>Characteristics of ARG-carrying plasmidome in the cultivable microbial community from wastewater treatment system under high oxytetracycline concentration</article-title>. <source>Applied microbiology and biotechnology</source>. <year>2018</year>;<volume>102</volume>(<issue>4</issue>):<fpage>1847</fpage>–<lpage>1858</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1007/s00253-018-8738-6" xlink:type="simple">10.1007/s00253-018-8738-6</ext-link></comment> <object-id pub-id-type="pmid">29332216</object-id></mixed-citation>
</ref>
<ref id="pcbi.1007781.ref009">
<label>9</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Rozov</surname> <given-names>R</given-names></name>, <name name-style="western"><surname>Brown Kav</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Bogumil</surname> <given-names>D</given-names></name>, <name name-style="western"><surname>Shterzer</surname> <given-names>N</given-names></name>, <name name-style="western"><surname>Halperin</surname> <given-names>E</given-names></name>, <name name-style="western"><surname>Mizrahi</surname> <given-names>I</given-names></name>, <etal>et al</etal>. <article-title>Recycler: an algorithm for detecting plasmids from de novo assembly graphs</article-title>. <source>Bioinformatics</source>. <year>2017</year>;<volume>33</volume>(<issue>4</issue>):<fpage>475</fpage>–<lpage>482</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1093/bioinformatics/btw651" xlink:type="simple">10.1093/bioinformatics/btw651</ext-link></comment> <object-id pub-id-type="pmid">28003256</object-id></mixed-citation>
</ref>
</ref-list>
</back>
</article>