<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1d3 20150301//EN" "http://jats.nlm.nih.gov/publishing/1.1d3/JATS-journalpublishing1.dtd">
<article article-type="research-article" dtd-version="1.1d3" xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="nlm-ta">PLoS Comput Biol</journal-id>
<journal-id journal-id-type="publisher-id">plos</journal-id>
<journal-id journal-id-type="pmc">ploscomp</journal-id>
<journal-title-group>
<journal-title>PLOS Computational Biology</journal-title>
</journal-title-group>
<issn pub-type="ppub">1553-734X</issn>
<issn pub-type="epub">1553-7358</issn>
<publisher>
<publisher-name>Public Library of Science</publisher-name>
<publisher-loc>San Francisco, CA USA</publisher-loc>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.1371/journal.pcbi.1008984</article-id>
<article-id pub-id-type="publisher-id">PCOMPBIOL-D-21-00825</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Research Article</subject>
</subj-group>
<subj-group subj-group-type="Discipline-v3">
<subject>Computer and information sciences</subject><subj-group><subject>Software engineering</subject><subj-group><subject>Computer software</subject></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Engineering and technology</subject><subj-group><subject>Software engineering</subject><subj-group><subject>Computer software</subject></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Genomics</subject></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Gene identification and analysis</subject><subj-group><subject>Genetic screens</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Genomics</subject><subj-group><subject>Animal genomics</subject><subj-group><subject>Mammalian genomics</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Genomics</subject><subj-group><subject>Animal genomics</subject><subj-group><subject>Invertebrate genomics</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Genomics</subject><subj-group><subject>Plant genomics</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Bioengineering</subject><subj-group><subject>Biotechnology</subject><subj-group><subject>Plant biotechnology</subject><subj-group><subject>Plant genomics</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Engineering and technology</subject><subj-group><subject>Bioengineering</subject><subj-group><subject>Biotechnology</subject><subj-group><subject>Plant biotechnology</subject><subj-group><subject>Plant genomics</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Plant science</subject><subj-group><subject>Plant biotechnology</subject><subj-group><subject>Plant genomics</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Plant genetics</subject><subj-group><subject>Plant genomics</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Plant science</subject><subj-group><subject>Plant genetics</subject><subj-group><subject>Plant genomics</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Neuroscience</subject><subj-group><subject>Cognitive science</subject><subj-group><subject>Cognitive psychology</subject><subj-group><subject>Language</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Psychology</subject><subj-group><subject>Cognitive psychology</subject><subj-group><subject>Language</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Social sciences</subject><subj-group><subject>Psychology</subject><subj-group><subject>Cognitive psychology</subject><subj-group><subject>Language</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Research and analysis methods</subject><subj-group><subject>Animal studies</subject><subj-group><subject>Experimental organism systems</subject><subj-group><subject>Model organisms</subject><subj-group><subject>Arabidopsis thaliana</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Research and analysis methods</subject><subj-group><subject>Model organisms</subject><subj-group><subject>Arabidopsis thaliana</subject></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Organisms</subject><subj-group><subject>Eukaryota</subject><subj-group><subject>Plants</subject><subj-group><subject>Brassica</subject><subj-group><subject>Arabidopsis thaliana</subject></subj-group></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Research and analysis methods</subject><subj-group><subject>Animal studies</subject><subj-group><subject>Experimental organism systems</subject><subj-group><subject>Plant and algal models</subject><subj-group><subject>Arabidopsis thaliana</subject></subj-group></subj-group></subj-group></subj-group></subj-group></article-categories>
<title-group>
<article-title>Gene name errors: Lessons not learned</article-title>
<alt-title alt-title-type="running-head">Gene name errors: Lessons not learned</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">https://orcid.org/0000-0003-2163-6203</contrib-id>
<name name-style="western">
<surname>Abeysooriya</surname>
<given-names>Mandhri</given-names>
</name>
<role content-type="https://casrai.org/credit/">Data curation</role>
<role content-type="https://casrai.org/credit/">Formal analysis</role>
<role content-type="https://casrai.org/credit/">Investigation</role>
<role content-type="https://casrai.org/credit/">Methodology</role>
<role content-type="https://casrai.org/credit/">Software</role>
<role content-type="https://casrai.org/credit/">Validation</role>
<role content-type="https://casrai.org/credit/">Visualization</role>
<role content-type="https://casrai.org/credit/">Writing – original draft</role>
<role content-type="https://casrai.org/credit/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"/>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">https://orcid.org/0000-0002-8715-6854</contrib-id>
<name name-style="western">
<surname>Soria</surname>
<given-names>Megan</given-names>
</name>
<role content-type="https://casrai.org/credit/">Investigation</role>
<role content-type="https://casrai.org/credit/">Validation</role>
<role content-type="https://casrai.org/credit/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"/>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">https://orcid.org/0000-0002-7891-836X</contrib-id>
<name name-style="western">
<surname>Kasu</surname>
<given-names>Mary Sravya</given-names>
</name>
<role content-type="https://casrai.org/credit/">Investigation</role>
<role content-type="https://casrai.org/credit/">Validation</role>
<role content-type="https://casrai.org/credit/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"/>
</contrib>
<contrib contrib-type="author" corresp="yes" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">https://orcid.org/0000-0002-7688-6974</contrib-id>
<name name-style="western">
<surname>Ziemann</surname>
<given-names>Mark</given-names>
</name>
<role content-type="https://casrai.org/credit/">Conceptualization</role>
<role content-type="https://casrai.org/credit/">Data curation</role>
<role content-type="https://casrai.org/credit/">Formal analysis</role>
<role content-type="https://casrai.org/credit/">Investigation</role>
<role content-type="https://casrai.org/credit/">Methodology</role>
<role content-type="https://casrai.org/credit/">Project administration</role>
<role content-type="https://casrai.org/credit/">Resources</role>
<role content-type="https://casrai.org/credit/">Software</role>
<role content-type="https://casrai.org/credit/">Supervision</role>
<role content-type="https://casrai.org/credit/">Validation</role>
<role content-type="https://casrai.org/credit/">Visualization</role>
<role content-type="https://casrai.org/credit/">Writing – original draft</role>
<role content-type="https://casrai.org/credit/">Writing – review &amp; editing</role>
<xref ref-type="corresp" rid="cor001">*</xref>
<xref ref-type="aff" rid="aff001"/>
</contrib>
</contrib-group>
<aff id="aff001"><addr-line>Deakin University, School of Life and Environmental Sciences, Geelong, Australia</addr-line></aff>
<contrib-group>
<contrib contrib-type="editor" xlink:type="simple">
<name name-style="western">
<surname>Ouzounis</surname>
<given-names>Christos A.</given-names>
</name>
<role>Editor</role>
<xref ref-type="aff" rid="edit1"/>
</contrib>
</contrib-group>
<aff id="edit1"><addr-line>CPERI, GREECE</addr-line></aff>
<author-notes>
<fn fn-type="conflict" id="coi001">
<p>The authors have declared that no competing interests exist.</p>
</fn>
<corresp id="cor001">* E-mail: <email xlink:type="simple">m.ziemann@deakin.edu.au</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>30</day>
<month>7</month>
<year>2021</year>
</pub-date>
<pub-date pub-type="collection">
<month>7</month>
<year>2021</year>
</pub-date>
<volume>17</volume>
<issue>7</issue>
<elocation-id>e1008984</elocation-id>
<history>
<date date-type="received">
<day>4</day>
<month>5</month>
<year>2021</year>
</date>
<date date-type="accepted">
<day>1</day>
<month>7</month>
<year>2021</year>
</date>
</history>
<permissions>
<copyright-year>2021</copyright-year>
<copyright-holder>Abeysooriya et al</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">
<license-p>This is an open access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">Creative Commons Attribution License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.</license-p>
</license>
</permissions>
<self-uri content-type="pdf" xlink:href="info:doi/10.1371/journal.pcbi.1008984"/>
<abstract>
<p>Erroneous conversion of gene names into dates and other data types has been a frustration for computational biologists for years. We hypothesized that such errors in supplementary files might diminish after a report in 2016 highlighting the extent of the problem. To assess this, we performed a scan of supplementary files published in PubMed Central from 2014 to 2020. Overall, gene name errors continued to accumulate unabated in the period after 2016. An improved scanning software we developed identified gene name errors in 30.9% (3,436/11,117) of articles with supplementary Excel gene lists; a figure significantly higher than previously estimated. This is due to gene names being converted not just to dates and floating-point numbers, but also to the internal date format (five-digit numbers). These findings further reinforce that spreadsheets are ill-suited to use with large genomic data.</p>
</abstract>
<abstract abstract-type="summary">
<title>Author summary</title>
<p>Autocorrection is a feature of modern software including messaging apps, word processors and spreadsheets. These are designed to avoid data entry errors but “autocorrect fails” can lead to information being distorted in undesired and sometimes humorous ways. What is not funny though is having genomics spreadsheets suffer from auto-conversion of gene names like <italic>SEPT8</italic>, <italic>DEC1</italic> and <italic>MARCH3</italic> into dates, a problem first characterised in 2004. A 2016 article on this topic led the HUGO Gene Nomenclature Committee to change many of these gene names to be less susceptible to autocorrect. Despite this, our work here shows that gene name autocorrect errors continue to accumulate in supplementary genomics spreadsheet files at a rapid pace. To avoid this and other reproducibility problems with spreadsheets, big changes are required in the way genomics scientists analyse and share data. We provide several practical steps researchers can take to avoid gene name errors and reiterate that big genomics data analysis is better suited to Python/R notebooks rather than spreadsheets.</p>
</abstract>
<funding-group>
<funding-statement>The author(s) received no specific funding for this work.</funding-statement>
</funding-group>
<counts>
<fig-count count="3"/>
<table-count count="5"/>
<page-count count="13"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>PLOS Publication Stage</meta-name>
<meta-value>vor-update-to-uncorrected-proof</meta-value>
</custom-meta>
<custom-meta>
<meta-name>Publication Update</meta-name>
<meta-value>2021-08-11</meta-value>
</custom-meta>
<custom-meta id="data-availability">
<meta-name>Data Availability</meta-name>
<meta-value>Code availability: Bash and R scripts supporting this paper are available in the GitHub repository (<ext-link ext-link-type="uri" xlink:href="https://github.com/markziemann/GeneNameErrors2020" xlink:type="simple">https://github.com/markziemann/GeneNameErrors2020</ext-link>).</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="sec001" sec-type="intro">
<title>Background</title>
<p>It is a well-documented problem that spreadsheet software inadvertently converts gene symbols to dates and floating-point numbers, with these errors propagating downstream to annotation sets and other databases [<xref ref-type="bibr" rid="pcbi.1008984.ref001">1</xref>]. Previous work shows that gene name errors are made while researchers analyse and prepare supplementary files for publication [<xref ref-type="bibr" rid="pcbi.1008984.ref002">2</xref>]. A screen of 18 journals found that one fifth of publications with supplementary Excel gene lists contained errors (704/3597). It remains unknown how frequent gene name errors are outside of these 18 journals, and whether the attention of previous publications has resulted in the mitigation of the problem.</p>
<p>Notably, software developers are beginning to remedy the problem at their end, with some packages like LibreOffice now resisting the conversion of gene symbols to dates (Version: 6.4.6.2). In addition, a recent announcement by the HUGO Gene Nomenclature Committee (HGNC) outlined plans for specifically changing gene symbols to avoid auto-correction [<xref ref-type="bibr" rid="pcbi.1008984.ref003">3</xref>]. For example, <italic>SEPT1</italic> becomes <italic>SEPTIN1</italic> and <italic>MARCH1</italic> becomes <italic>MARCHF1</italic>. It will likely take months and perhaps years for the new gene symbols to appear in publications.</p>
<p>Although changes to gene names and software will help, they won’t solve the overarching problem with spreadsheets: that (i) errors occur silently, (ii) errors can be hidden amongst thousands of rows of data, and (iii) they are difficult to audit. Research shows that errors are surprisingly common in the business setting [<xref ref-type="bibr" rid="pcbi.1008984.ref004">4</xref>], which raises the question as to how common such errors are in science. The difficulty in auditing spreadsheets makes them generally incompatible with the principles of computational reproducibility [<xref ref-type="bibr" rid="pcbi.1008984.ref005">5</xref>].</p>
<p>Our main goal here is to examine whether gene name errors have diminished since 2016 or whether they continue to be a problem. We also assess the behaviour of current spreadsheet software in converting gene names to dates and identify Excel date genes across Eukarya. We follow this up with a screen of supplementary files from genomics-related PubMed Central (PMC) publications in the period 2014 to 2020.</p>
</sec>
<sec id="sec002" sec-type="results">
<title>Results</title>
<sec id="sec003">
<title>Testing spreadsheet software</title>
<p>We tested the propensity of various spreadsheet software to convert gene names into dates after importing a set of strings containing human gene names by (i) opening a text file, (ii) pasting data, and (iii) directly typing (<xref ref-type="table" rid="pcbi.1008984.t001">Table 1</xref>). We found that Microsoft Excel and Google Sheets converted this data to dates in all three modes of import. LibreOffice and Gnumeric did not convert gene names to dates in our tests here. The date conversion behaviour of Excel and Google Sheets could be circumvented by formatting the destination cells as “plain text” prior to pasting or typing. Nevertheless, this result shows that using LibreOffice and Gnumeric is safer than using Excel and Google Sheets.</p>
<table-wrap id="pcbi.1008984.t001" position="float">
<object-id pub-id-type="doi">10.1371/journal.pcbi.1008984.t001</object-id>
<label>Table 1</label> <caption><title>Gene name conversion behaviour of spreadsheet software.</title></caption>
<alternatives>
<graphic id="pcbi.1008984.t001g" mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1008984.t001" xlink:type="simple"/>
<table>
<colgroup>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
</colgroup>
<thead>
<tr>
<th align="center"/>
<th align="center" colspan="4">Gene name conversion</th>
</tr>
<tr>
<th align="center">Software</th>
<th align="center">Microsoft Excel</th>
<th align="center">Google Sheets</th>
<th align="center">LibreOffice</th>
<th align="center">Gnumeric</th>
</tr>
</thead>
<tbody>
<tr>
<td align="center">Text file open</td>
<td align="center">Yes</td>
<td align="center">Yes</td>
<td align="center">No</td>
<td align="center">No</td>
</tr>
<tr>
<td align="center">Pasting data</td>
<td align="center">Yes</td>
<td align="center">Yes</td>
<td align="center">No</td>
<td align="center">No</td>
</tr>
<tr>
<td align="center">Typing</td>
<td align="center">Yes</td>
<td align="center">Yes</td>
<td align="center">No</td>
<td align="center">No</td>
</tr>
</tbody>
</table>
</alternatives>
</table-wrap>
</sec>
<sec id="sec004">
<title>Identifying Excel date genes across kingdoms</title>
<p>Although recent changes have been made to human and mouse gene names to prevent conversion to dates [<xref ref-type="bibr" rid="pcbi.1008984.ref003">3</xref>], it is uncertain whether such changes have propagated through to other species. To assess this, we downloaded all eukaryotic gene names available in Ensembl, imported them into Excel and collected all genes that were converted to dates. In total there were 1,544 gene names converted to dates, from 104 taxa (Tables <xref ref-type="table" rid="pcbi.1008984.t002">2</xref> and S1). Although most affected gene names were vertebrate in origin, there were gene names affected in all groups.</p>
<table-wrap id="pcbi.1008984.t002" position="float">
<object-id pub-id-type="doi">10.1371/journal.pcbi.1008984.t002</object-id>
<label>Table 2</label> <caption><title>Gene names vulnerable to date conversion across Eukarya.</title></caption>
<alternatives>
<graphic id="pcbi.1008984.t002g" mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1008984.t002" xlink:type="simple"/>
<table>
<colgroup>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
</colgroup>
<thead>
<tr>
<th align="center"/>
<th align="center">Taxa</th>
<th align="center">Genes</th>
<th align="center">Genes affected</th>
<th align="center">Taxa affected</th>
</tr>
</thead>
<tbody>
<tr>
<td align="center">Vertebrates</td>
<td align="center">310</td>
<td align="center">5,263,175</td>
<td align="center">1,325</td>
<td align="center">76</td>
</tr>
<tr>
<td align="center">Metazoa</td>
<td align="center">59</td>
<td align="center">525,867</td>
<td align="center">17</td>
<td align="center">3</td>
</tr>
<tr>
<td align="center">Plants</td>
<td align="center">60</td>
<td align="center">244,101</td>
<td align="center">35</td>
<td align="center">4</td>
</tr>
<tr>
<td align="center">Fungi</td>
<td align="center">59</td>
<td align="center">788,221</td>
<td align="center">140</td>
<td align="center">12</td>
</tr>
<tr>
<td align="center">Protists</td>
<td align="center">39</td>
<td align="center">163,026</td>
<td align="center">27</td>
<td align="center">9</td>
</tr>
</tbody>
</table>
</alternatives>
</table-wrap>
</sec>
<sec id="sec005">
<title>Gene name errors by year</title>
<p>In order to determine whether gene name errors in supplementary files remain a problem, we undertook a screen of genomics-related publications in PMC. We collated a list of 166,139 genomics articles published between 2014 and 2020, and screened them using an enhanced script. In addition to identifying conversions to standard date formats (eg: 3/1/2016, Mar-3, 3-Mar) and floating-point numbers (eg: 9.33E+22), this script also recognises five-digit numbers as likely to be the result of gene name errors as this is the internal date format used by spreadsheets [<xref ref-type="bibr" rid="pcbi.1008984.ref001">1</xref>].</p>
<p>The results of this screen are shown in <xref ref-type="table" rid="pcbi.1008984.t003">Table 3</xref>. From this set of publications, 32,841 had supplementary files in Excel format (with “xls” or “xlsx” suffixes). Of these, 11,117 publications were detected to contain at least one list of gene symbols. The software detected 3,470 publications with suspected gene name errors. After manually opening each spreadsheet file (5,136 files), we identified 34 publications as being false positives, leaving 3,436 publications with confirmed gene name errors (<xref ref-type="supplementary-material" rid="pcbi.1008984.s002">S2</xref> and <xref ref-type="supplementary-material" rid="pcbi.1008984.s003">S3</xref> Tables). These publications contain a total of 5,086 spreadsheets with gene name errors. The proportion of publications with Excel gene lists that contain errors was 30.9%; substantially higher than previously reported [<xref ref-type="bibr" rid="pcbi.1008984.ref002">2</xref>].</p>
<table-wrap id="pcbi.1008984.t003" position="float">
<object-id pub-id-type="doi">10.1371/journal.pcbi.1008984.t003</object-id>
<label>Table 3</label> <caption><title>Results of a screen for gene name errors in PMC.</title></caption>
<alternatives>
<graphic id="pcbi.1008984.t003g" mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1008984.t003" xlink:type="simple"/>
<table>
<colgroup>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
</colgroup>
<thead>
<tr>
<th align="center"/>
<th align="center">2014</th>
<th align="center">2015</th>
<th align="center">2016</th>
<th align="center">2017</th>
<th align="center">2018</th>
<th align="center">2019</th>
<th align="center">2020</th>
<th align="center">Total</th>
</tr>
</thead>
<tbody>
<tr>
<td align="center">Publications screened</td>
<td align="center">19976</td>
<td align="center">21204</td>
<td align="center">22261</td>
<td align="center">23976</td>
<td align="center">24986</td>
<td align="center">26046</td>
<td align="center">27690</td>
<td align="center">166139</td>
</tr>
<tr>
<td align="center">Excel files screened</td>
<td align="center">2948</td>
<td align="center">4318</td>
<td align="center">4472</td>
<td align="center">4355</td>
<td align="center">4824</td>
<td align="center">5481</td>
<td align="center">6443</td>
<td align="center">32841</td>
</tr>
<tr>
<td align="center">Excel files with gene lists</td>
<td align="center">2286</td>
<td align="center">3037</td>
<td align="center">3331</td>
<td align="center">3021</td>
<td align="center">3566</td>
<td align="center">3342</td>
<td align="center">4496</td>
<td align="center">23670</td>
</tr>
<tr>
<td align="center">Publications with Excel gene lists</td>
<td align="center">936</td>
<td align="center">1491</td>
<td align="center">1579</td>
<td align="center">1412</td>
<td align="center">1653</td>
<td align="center">1823</td>
<td align="center">2223</td>
<td align="center">11117</td>
</tr>
<tr>
<td align="center">Publications with suspected gene name errors</td>
<td align="center">284</td>
<td align="center">490</td>
<td align="center">477</td>
<td align="center">443</td>
<td align="center">475</td>
<td align="center">594</td>
<td align="center">707</td>
<td align="center">3470</td>
</tr>
<tr>
<td align="center">False positive Excel files</td>
<td align="center">8</td>
<td align="center">0</td>
<td align="center">7</td>
<td align="center">5</td>
<td align="center">15</td>
<td align="center">4</td>
<td align="center">11</td>
<td align="center">50</td>
</tr>
<tr>
<td align="center">False positive publications</td>
<td align="center">2</td>
<td align="center">0</td>
<td align="center">6</td>
<td align="center">3</td>
<td align="center">11</td>
<td align="center">3</td>
<td align="center">9</td>
<td align="center">34</td>
</tr>
<tr>
<td align="center">Affected Excel files</td>
<td align="center">429</td>
<td align="center">701</td>
<td align="center">653</td>
<td align="center">648</td>
<td align="center">703</td>
<td align="center">914</td>
<td align="center">1038</td>
<td align="center">5086</td>
</tr>
<tr>
<td align="center">Affected publications</td>
<td align="center">282</td>
<td align="center">490</td>
<td align="center">471</td>
<td align="center">440</td>
<td align="center">464</td>
<td align="center">591</td>
<td align="center">698</td>
<td align="center">3436</td>
</tr>
<tr>
<td align="center">Proportion of publications affected (%)</td>
<td align="center">30.1%</td>
<td align="center">32.9%</td>
<td align="center">29.8%</td>
<td align="center">31.2%</td>
<td align="center">28.1%</td>
<td align="center">32.4%</td>
<td align="center">31.4%</td>
<td align="center">30.9%</td>
</tr>
</tbody>
</table>
</alternatives>
</table-wrap>
<p>In the period 2014–2020, both the number of publications with Excel gene lists and the number of publications affected by gene name errors increased, with a pause in the period 2016–2018 (<xref ref-type="fig" rid="pcbi.1008984.g001">Fig 1A and 1B</xref>). On the other hand, the proportion of papers with Excel gene lists affected by errors remained stable over this period (<xref ref-type="fig" rid="pcbi.1008984.g001">Fig 1C</xref>). This result suggests gene name errors did not substantially reduce in the period after 2016 as we had hypothesized.</p>
<fig id="pcbi.1008984.g001" position="float">
<object-id pub-id-type="doi">10.1371/journal.pcbi.1008984.g001</object-id>
<label>Fig 1</label>
<caption>
<title>Prevalence of gene name errors in the period 2014–2020.</title>
<p>(A) Publications with supplementary Excel gene lists. (B) Publications affected by gene name errors. (C) Proportion of affected publications.</p>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1008984.g001" xlink:type="simple"/>
</fig>
<p>Next, to determine whether five-digit numbers explain the higher observed proportion of errors, we investigated a subset of 2,160 affected spreadsheet files to determine the frequency of error types. Dates in Mar-1 or 1-Mar format accounted for 1,797 files (83.2%). Errors in DD/MM/YYYY format accounted for 19 files (0.88%) and floating-point numbers for 4 files (0.18%). Five-digit numbers accounted for 340 files (15.7%) indicating that this error type is sufficiently common to account for the discrepancy between this and the previous report (<xref ref-type="supplementary-material" rid="pcbi.1008984.s004">S4 Table</xref>). When these five-digit numbers are formatted as standard dates, 292 (85.9%) appear in the months of March and September which is consistent with gene name errors.</p>
</sec>
<sec id="sec006">
<title>Gene name errors by organism</title>
<p>Next, we investigated whether the rate of gene name errors was dependent on the organism under study. We found that the frequency of gene name errors was highest for mouse and human datasets, while lower for Arabidopsis, chicken and rice (<xref ref-type="table" rid="pcbi.1008984.t004">Table 4</xref>).</p>
<table-wrap id="pcbi.1008984.t004" position="float">
<object-id pub-id-type="doi">10.1371/journal.pcbi.1008984.t004</object-id>
<label>Table 4</label> <caption><title>Gene name errors stratified by organism under study.</title></caption>
<alternatives>
<graphic id="pcbi.1008984.t004g" mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1008984.t004" xlink:type="simple"/>
<table>
<colgroup>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
</colgroup>
<thead>
<tr>
<th align="center">Species</th>
<th align="center">Publications with Excel gene lists</th>
<th align="center">Affected publications</th>
<th align="center">Proportion of publications affected</th>
</tr>
</thead>
<tbody>
<tr>
<td align="center"><italic>M</italic>. <italic>musculus</italic></td>
<td align="center">1577</td>
<td align="center">609</td>
<td align="center">38.6%</td>
</tr>
<tr>
<td align="center"><italic>H</italic>. <italic>sapiens</italic></td>
<td align="center">7936</td>
<td align="center">2419</td>
<td align="center">30.5%</td>
</tr>
<tr>
<td align="center"><italic>C</italic>. <italic>elegans</italic></td>
<td align="center">124</td>
<td align="center">31</td>
<td align="center">25.0%</td>
</tr>
<tr>
<td align="center"><italic>D</italic>. <italic>melanogaster</italic></td>
<td align="center">607</td>
<td align="center">142</td>
<td align="center">23.4%</td>
</tr>
<tr>
<td align="center"><italic>S</italic>. <italic>cerevisiae</italic></td>
<td align="center">443</td>
<td align="center">93</td>
<td align="center">21.0%</td>
</tr>
<tr>
<td align="center"><italic>R</italic>. <italic>norvegicus</italic></td>
<td align="center">327</td>
<td align="center">68</td>
<td align="center">20.8%</td>
</tr>
<tr>
<td align="center"><italic>D</italic>. <italic>rerio</italic></td>
<td align="center">251</td>
<td align="center">48</td>
<td align="center">19.1%</td>
</tr>
<tr>
<td align="center"><italic>A</italic>. <italic>thaliana</italic></td>
<td align="center">511</td>
<td align="center">76</td>
<td align="center">14.9%</td>
</tr>
<tr>
<td align="center"><italic>G</italic>. <italic>gallus</italic></td>
<td align="center">1827</td>
<td align="center">172</td>
<td align="center">9.4%</td>
</tr>
<tr>
<td align="center"><italic>O</italic>. <italic>sativa</italic></td>
<td align="center">10</td>
<td align="center">0</td>
<td align="center">0.0%</td>
</tr>
</tbody>
</table>
</alternatives>
</table-wrap>
</sec>
<sec id="sec007">
<title>Gene name errors by journal</title>
<p>In this sample of PMC articles 4,581 journals were represented. Of these, 741 journals published one or more supplementary Excel gene lists. There were 414 journals with at least one supplementary file with gene name errors. Next, we focused on journals that published at least 50 articles with supplementary Excel gene lists, finding 37 journals that accounted for 67.9% of affected publications (<xref ref-type="table" rid="pcbi.1008984.t005">Table 5</xref>). The journals with the most affected articles included <italic>Nature Communications</italic>, <italic>PLOS ONE</italic>, <italic>Scientific Reports</italic>, <italic>BMC Genomics</italic>, <italic>PLoS Genetics</italic> and <italic>Oncotarget</italic>, with at least 100 affected publications each.</p>
<table-wrap id="pcbi.1008984.t005" position="float">
<object-id pub-id-type="doi">10.1371/journal.pcbi.1008984.t005</object-id>
<label>Table 5</label> <caption><title>Prevalence of gene name errors across journals.</title> <p>Only journals with ≥50 articles with supplementary Excel gene lists are shown.</p></caption>
<alternatives>
<graphic id="pcbi.1008984.t005g" mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1008984.t005" xlink:type="simple"/>
<table>
<colgroup>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
</colgroup>
<thead>
<tr>
<th align="center">Journal name as it appears in PMC</th>
<th align="center">Number of articles with Excel gene lists</th>
<th align="center">Number of affected articles</th>
<th align="center">Proportion of articles affected (%)</th>
</tr>
</thead>
<tbody>
<tr>
<td align="center"><italic>Nat Commun</italic></td>
<td align="center">920</td>
<td align="center">345</td>
<td align="center">37.5%</td>
</tr>
<tr>
<td align="center"><italic>PLoS One</italic></td>
<td align="center">946</td>
<td align="center">244</td>
<td align="center">25.8%</td>
</tr>
<tr>
<td align="center"><italic>Sci Rep</italic></td>
<td align="center">767</td>
<td align="center">227</td>
<td align="center">29.6%</td>
</tr>
<tr>
<td align="center"><italic>BMC Genomics</italic></td>
<td align="center">660</td>
<td align="center">166</td>
<td align="center">25.2%</td>
</tr>
<tr>
<td align="center"><italic>PLoS Genet</italic></td>
<td align="center">448</td>
<td align="center">134</td>
<td align="center">29.9%</td>
</tr>
<tr>
<td align="center"><italic>Oncotarget</italic></td>
<td align="center">326</td>
<td align="center">107</td>
<td align="center">32.8%</td>
</tr>
<tr>
<td align="center"><italic>Front Genet</italic></td>
<td align="center">313</td>
<td align="center">94</td>
<td align="center">30.0%</td>
</tr>
<tr>
<td align="center"><italic>eLife</italic></td>
<td align="center">243</td>
<td align="center">89</td>
<td align="center">36.6%</td>
</tr>
<tr>
<td align="center"><italic>Proc Natl Acad Sci USA</italic></td>
<td align="center">155</td>
<td align="center">73</td>
<td align="center">47.1%</td>
</tr>
<tr>
<td align="center"><italic>Cell Rep</italic></td>
<td align="center">158</td>
<td align="center">71</td>
<td align="center">44.9%</td>
</tr>
<tr>
<td align="center"><italic>Genome Biol</italic></td>
<td align="center">193</td>
<td align="center">66</td>
<td align="center">34.2%</td>
</tr>
<tr>
<td align="center"><italic>Nature</italic></td>
<td align="center">118</td>
<td align="center">52</td>
<td align="center">44.1%</td>
</tr>
<tr>
<td align="center"><italic>Nat Genet</italic></td>
<td align="center">140</td>
<td align="center">48</td>
<td align="center">34.3%</td>
</tr>
<tr>
<td align="center"><italic>Genome Med</italic></td>
<td align="center">137</td>
<td align="center">44</td>
<td align="center">32.1%</td>
</tr>
<tr>
<td align="center"><italic>PeerJ</italic></td>
<td align="center">137</td>
<td align="center">39</td>
<td align="center">28.5%</td>
</tr>
<tr>
<td align="center"><italic>Cell</italic></td>
<td align="center">74</td>
<td align="center">39</td>
<td align="center">52.7%</td>
</tr>
<tr>
<td align="center"><italic>Clin Epigenetics</italic></td>
<td align="center">109</td>
<td align="center">38</td>
<td align="center">34.9%</td>
</tr>
<tr>
<td align="center"><italic>Nucleic Acids Res</italic></td>
<td align="center">120</td>
<td align="center">36</td>
<td align="center">30.0%</td>
</tr>
<tr>
<td align="center"><italic>BMC Med Genomics</italic></td>
<td align="center">117</td>
<td align="center">31</td>
<td align="center">26.5%</td>
</tr>
<tr>
<td align="center"><italic>Front Oncol</italic></td>
<td align="center">85</td>
<td align="center">31</td>
<td align="center">36.5%</td>
</tr>
<tr>
<td align="center"><italic>Transl Psychiatry</italic></td>
<td align="center">73</td>
<td align="center">29</td>
<td align="center">39.7%</td>
</tr>
<tr>
<td align="center"><italic>BMC Cancer</italic></td>
<td align="center">105</td>
<td align="center">28</td>
<td align="center">26.7%</td>
</tr>
<tr>
<td align="center"><italic>PLoS Pathog</italic></td>
<td align="center">80</td>
<td align="center">27</td>
<td align="center">33.8%</td>
</tr>
<tr>
<td align="center"><italic>Commun Biol</italic></td>
<td align="center">74</td>
<td align="center">27</td>
<td align="center">36.5%</td>
</tr>
<tr>
<td align="center"><italic>PLoS Biol</italic></td>
<td align="center">66</td>
<td align="center">26</td>
<td align="center">39.4%</td>
</tr>
<tr>
<td align="center"><italic>Aging</italic></td>
<td align="center">56</td>
<td align="center">26</td>
<td align="center">46.4%</td>
</tr>
<tr>
<td align="center"><italic>EBioMedicine</italic></td>
<td align="center">51</td>
<td align="center">26</td>
<td align="center">51.0%</td>
</tr>
<tr>
<td align="center"><italic>Epigenetics Chromatin</italic></td>
<td align="center">64</td>
<td align="center">25</td>
<td align="center">39.1%</td>
</tr>
<tr>
<td align="center"><italic>PLoS Comput Biol</italic></td>
<td align="center">97</td>
<td align="center">24</td>
<td align="center">24.7%</td>
</tr>
<tr>
<td align="center"><italic>Oncogene</italic></td>
<td align="center">53</td>
<td align="center">22</td>
<td align="center">41.5%</td>
</tr>
<tr>
<td align="center"><italic>iScience</italic></td>
<td align="center">58</td>
<td align="center">20</td>
<td align="center">34.5%</td>
</tr>
<tr>
<td align="center"><italic>Sci Adv</italic></td>
<td align="center">56</td>
<td align="center">20</td>
<td align="center">35.7%</td>
</tr>
<tr>
<td align="center"><italic>BMC Bioinformatics</italic></td>
<td align="center">77</td>
<td align="center">19</td>
<td align="center">24.7%</td>
</tr>
<tr>
<td align="center"><italic>G3</italic></td>
<td align="center">74</td>
<td align="center">15</td>
<td align="center">20.3%</td>
</tr>
<tr>
<td align="center"><italic>Hum Mol Genet</italic></td>
<td align="center">53</td>
<td align="center">15</td>
<td align="center">28.3%</td>
</tr>
<tr>
<td align="center"><italic>BMC Plant Biol</italic></td>
<td align="center">52</td>
<td align="center">6</td>
<td align="center">11.5%</td>
</tr>
<tr>
<td align="center"><italic>Front Plant Sci</italic></td>
<td align="center">75</td>
<td align="center">5</td>
<td align="center">6.7%</td>
</tr>
</tbody>
</table>
</alternatives>
</table-wrap>
<p>From the set of 37 journals, those with the lowest rate of affected articles (&lt;25%) included <italic>Frontiers in Plant Science</italic>, <italic>BMC Plant Biology</italic>, <italic>G3</italic>, <italic>BMC Bioinformatics</italic> and <italic>PLoS Computational Biology</italic>, while the journals with the highest rate (&gt;40%) included <italic>Cell</italic>, <italic>EBioMedicine</italic>, <italic>PNAS</italic>, <italic>Aging</italic>, <italic>Cell Reports</italic>, <italic>Nature</italic> and <italic>Oncogene</italic>.</p>
<p>Next, we assessed whether there was any correlation between error proportion and the journal impact factor (JIF) for the set of 37 journals. A scatterplot of JIF and proportion of affected articles is shown in <xref ref-type="fig" rid="pcbi.1008984.g002">Fig 2</xref>. A correlation analysis indicated a statistically significant association using the Pearson (p = 0.0052, r = 0.462) and Spearman (p = 1.95E-04; <italic>ρ</italic> = 0.589) methods.</p>
<fig id="pcbi.1008984.g002" position="float">
<object-id pub-id-type="doi">10.1371/journal.pcbi.1008984.g002</object-id>
<label>Fig 2</label>
<caption>
<title>A scatterplot of JIF and proportion of articles with supplementary Excel gene lists affected by gene name errors.</title>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1008984.g002" xlink:type="simple"/>
</fig>
<p>Next we assessed the temporal trends for the three journals with most gene name errors (<xref ref-type="fig" rid="pcbi.1008984.g003">Fig 3</xref>). <italic>Nature Communications</italic> showed a strong increase in articles with Excel gene lists and gene name errors over the period, while the proportion of affected articles recorded an increase from 33.3% to 39.5% in the period 2014–2020. <italic>PLOS ONE</italic> showed a trend of decreasing numbers of articles with supplementary Excel gene lists and number of affected articles but the proportion of affected articles was relatively flat over this time. <italic>Scientific Reports</italic> recorded a strong increase in articles with supplementary Excel files in the period 2014–2017 but has since remained stable. The proportion of affected articles in this journal did not show any consistent trend over this period.</p>
<fig id="pcbi.1008984.g003" position="float">
<object-id pub-id-type="doi">10.1371/journal.pcbi.1008984.g003</object-id>
<label>Fig 3</label>
<caption>
<title>Gene name errors in supplementary files for three dominant journals in the period 2014–2020.</title>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1008984.g003" xlink:type="simple"/>
</fig>
</sec>
<sec id="sec008">
<title>Novel error types</title>
<p>While we are familiar with common <italic>SEPT</italic> and <italic>MARCH</italic> conversions, we observed a variety of additional novel error modes. Some of these were likely related to locale language settings. In a few cases, the human gene <italic>AGO2</italic> was converted to Aug-02 (eg: PMC5537504 &amp; PMC6244004), which may be due to Excel working in languages such as Italian, Spanish or Portuguese. Similarly, the gene <italic>MEI1</italic> was seen to be converted to May-01 (eg: PMC6065148 &amp; PMC5877863) and could be due to the similarity with the Dutch word for May (mei). In one article (PMC5908809), <italic>TAMM41</italic> was apparently converted to “Jan-41” due to similarity with the month of January in Finnish (tammikuu).</p>
<p>There were also several cases where the dates appeared to be unrelated to Excel date genes. For example, article PMC6330011 S4 Table contained the following: “’Feb-97, Aug-97, Nov-97, Feb-98, Aug-98”. Information in other columns of the spreadsheet indicated that these originated from <italic>SEPT</italic>, <italic>MARCH</italic> and <italic>DEC</italic> gene names. Cells containing Aug-97 through Aug-11 corresponded to <italic>SEPT2</italic> to <italic>SEPT14</italic> and <italic>SEP15</italic>. Article PMC5989470 showed evidence that the protein name “jun-1” was converted to “May-31”. We posit that this type of error is caused by the spreadsheet evaluating protein names like “jun-1” as the month of June minus 1.</p>
<p>Other observations were more puzzling. There were two papers where it appears the <italic>P2RY1</italic> gene (Ensembl identifier ENSGALG00000016687) was converted to “7”; possibly a problem in an upstream database. In one sheet (Table S5 of article PMC6506828), the numeric value “3002” was observed in the gene symbol column beside “NM_198411”, corresponding to <italic>Inverted Formin 2</italic> (<italic>Inf2</italic>). Perhaps the spreadsheet interpreted “Inf2” as a numerical value.</p>
</sec>
</sec>
<sec id="sec009" sec-type="conclusions">
<title>Discussion</title>
<p>We hypothesized that after a previous publication in 2016 received substantial attention in technology and social media spheres, researchers and publishers would be aware of the issue of gene name conversion in spreadsheets and the prevalence of such errors would decline. On the contrary, this work demonstrates that overall there has been no substantial change in the rate of gene name errors in the period 2014 to 2020. Indeed the proportion of articles with Excel gene lists containing gene name errors was significantly higher here as compared to a previous report (30.9% and 19.6% respectively) [<xref ref-type="bibr" rid="pcbi.1008984.ref002">2</xref>]. This is due to two main contributors. Firstly, the articles here were sampled from PMC as compared to a set of 18 major genomics journals. Secondly this work identified gene names becoming converted to internal date format, which accounts for ~15% of such errors detected here. These numbers correspond to the number of days since 1st January 1900; indeed, this is how spreadsheet software stores date information internally. Gene names can become converted to five-digit numbers by first converting them to dates upon import, followed by changing the cell formatting to “number” or “text”, becoming permanent when the spreadsheet is saved.</p>
<p>Another take-away from this study is that articles with supplementary Excel gene lists in highly reputable journals like <italic>Cell</italic>, <italic>Nature</italic> and <italic>Proc Natl Acad Sci USA</italic> more frequently contained gene name errors as compared to their counterparts with lower JIF scores. This may seem counterintuitive, but is consistent with previous analysis [<xref ref-type="bibr" rid="pcbi.1008984.ref002">2</xref>]. Although it has been suggested that articles in highly prestigious journals are of an inferior methodological quality [<xref ref-type="bibr" rid="pcbi.1008984.ref006">6</xref>], the simpler explanation is that the number and size of supplementary gene lists accompanying articles is the main contributor to this trend (although we have not examined this hypothesis quantitatively). This is likely a contributing factor to why so many gene name errors were identified in <italic>Nature Communications</italic>. This journal recommends authors provide source data which contain the raw data underlying any graphs and charts, resulting in more data in attached Excel files. Additionally, this is a prolific and fast-growing multidisciplinary journal with 6,448 published articles in 2020 and ~15% year-on-year growth since 2014. Concerningly, the proportion of papers in <italic>Nature Communications</italic> with supplementary Excel gene lists affected by gene name errors also increased in the period 2014–2020 (<xref ref-type="fig" rid="pcbi.1008984.g003">Fig 3</xref>).</p>
<p>There are limitations to this study that need to be pointed out. For convenience, we only screened open access articles in PMC and so this might not be representative of the work in paywalled articles. Moreover, we screened a subset of PMC articles that contained the keyword “genom*” in the abstract or title. Out of 3,291,704 articles in PMC published in the period 2014–2020, we included only 116,139 (~5.0%). There are likely many gene name errors outside of this sample of articles and there is a chance that such errors appear at varying rates in the articles not analysed here. The updated screening software yielded a slightly higher fraction of false positives, but this was circumvented by systematically opening each file manually for verification. Our script only identified vertical gene lists, so there were likely some in the horizontal orientation that were missed.</p>
<p>There has been a great deal of discussion around who is responsible for the persistence in gene name errors over time. The software developers surely must take some blame because these conversions occur without any user notifications, and the date conversion feature is not one that can be disabled. In their defence, we must understand that Excel and other spreadsheet software were designed only for lightweight data entry and calculation, not for analysis of data containing many thousands of rows. Reviewers are doing their best with limited time but can do better with regards to quality checking supplementary files. Journal editors have yet to put in place systems to identify gene name errors before they are published. Surely some blame rests on the researchers who inadvertently make these mistakes. In particular, senior authors need to take leadership in picking up such errors when they arise, but more importantly, they need to provide training opportunities and promote a culture of reproducibility in the groups they lead. Academic faculty need to ensure that biology graduates are trained in contemporary skills to conduct data-driven research that goes beyond appropriate use of spreadsheets. This needs to include competence in scripted computer languages, statistical analysis and computational reproducibility [<xref ref-type="bibr" rid="pcbi.1008984.ref005">5</xref>].</p>
<p>From the researcher’s perspective, there are several practical ways that such errors can be avoided (<xref ref-type="boxed-text" rid="pcbi.1008984.box001">Box 1</xref>).</p>
<boxed-text id="pcbi.1008984.box001" position="float">
<sec id="sec010">
<title>Box 1. Tips to avoid gene name errors</title>
<list list-type="bullet">
<list-item><p>Scripted analyses are preferred over spreadsheets. Gene name to date conversion is a bug specific to spreadsheets and doesn’t occur in scripted computer languages like Python or R. In addition, analyses conducted with Python and R notebooks (eg: Jupyter or Rmarkdown) capture computational methods and results in a stepwise fashion meaning these workflows can be more readily audited. These notebooks can therefore achieve a higher level of computational reproducibility than spreadsheets. Although this requires a big investment in learning a computer language, this investment pays off in the longer term.</p></list-item>
<list-item><p>If a spreadsheet must be used, then LibreOffice is recommended because it will prevent such errors from occurring. This will not remedy other error types.</p></list-item>
<list-item><p>If using Excel is unavoidable, then take great care importing the data. If opening a TSV or CSV file, use the data import wizard to ensure that each column of data is formatted appropriately. For example, columns containing gene names should be formatted as “free text”, genomic coordinates formatted as “integers” and gene expression measurements as “numeric”.</p></list-item>
<list-item><p>Instead of spreadsheets, share genomic data as “flat text” files. These typically have the suffixes “csv”, “tsv” or “txt”. These are native formats for computer languages and suitable for long term data archiving. Excel formats such as “xls” or “xlsx” are proprietary and future development is decided by Microsoft.</p></list-item>
<list-item><p>If it is unavoidable to use a spreadsheet with genomic data, verify that gene names are intact. To do this, sort columns containing gene names in ascending order. This will bring dates and numbers to the top of the column so it is obvious whether any gene symbols have been converted. Alternatively, use the Truke web tool to identify such errors (<ext-link ext-link-type="uri" xlink:href="http://maplab.imppc.org/truke/" xlink:type="simple">http://maplab.imppc.org/truke/</ext-link>).</p></list-item>
<list-item><p>Assume that there are Excel date gene names in your organism of interest. Although human and mouse SEPT and MARCH gene names have been changed to avoid such errors, there are many taxa across Eukarya that are yet to see similar changes. Excel gene names may also be prevalent in Prokarya.</p></list-item>
</list>
</sec>
</boxed-text>
<p>The HGNC has taken the initiative to change the most susceptible gene names, but this will not entirely solve the problem. There are a number of gene names that could be converted if the user computer is set up to use a non-English language. While human, mouse, and rat gene names have been changed, such changes are yet to take place for other species such as <italic>D</italic>. <italic>rerio</italic>, <italic>C</italic>. <italic>elegans</italic>, <italic>D</italic>. <italic>melanogaster</italic> and <italic>A</italic>. <italic>thaliana</italic> (See <xref ref-type="supplementary-material" rid="pcbi.1008984.s001">S1 Table</xref>). Open-source tools are being developed to circumvent these errors. Truke is a web service that identifies and corrects corrupted gene names in affected files [<xref ref-type="bibr" rid="pcbi.1008984.ref007">7</xref>], while EscapeExcel is a tool designed to prevent gene name conversions from happening by protecting strings before import [<xref ref-type="bibr" rid="pcbi.1008984.ref008">8</xref>]. HGNChelper is an R package that recognises and fixes human gene symbols converted to dates [<xref ref-type="bibr" rid="pcbi.1008984.ref009">9</xref>]. It appears that these developments are not having a major impact yet because gene name errors continue to grow year-on-year and the proportion of affected articles has remained stable since 2014 (<xref ref-type="table" rid="pcbi.1008984.t003">Table 3</xref> and <xref ref-type="fig" rid="pcbi.1008984.g001">Fig 1</xref>).</p>
<p>It has been argued that gene name errors are of little consequence to the conclusions of a scientific publication [<xref ref-type="bibr" rid="pcbi.1008984.ref010">10</xref>]; however, our view is that they are a symptom of a larger problem—that overreliance on spreadsheets leads to errors occurring silently in large data files and that such errors are exceedingly difficult for researchers, reviewers, and editorial staff to identify. Previous spreadsheet research in the business setting indicates that errors exist in 0.9% to 1.9% of formula cells and from a sample of 50 spreadsheets, only seven were error-free [<xref ref-type="bibr" rid="pcbi.1008984.ref011">11</xref>]. In the healthcare sector, an analysis of data entry errors into a clinical pathology spreadsheet found errors in 0.5% to 6.4% of cells [<xref ref-type="bibr" rid="pcbi.1008984.ref012">12</xref>], while a systematic analysis of spreadsheet errors in a hospital setting found critical errors in 11 of 12 spreadsheets analyzed [<xref ref-type="bibr" rid="pcbi.1008984.ref013">13</xref>]. In the biomedical research setting we know that spreadsheet errors can occur and impact downstream work involving clinical drug trials [<xref ref-type="bibr" rid="pcbi.1008984.ref014">14</xref>]. Despite this potential risk, there has yet to be a systematic assessment of the full taxonomy of spreadsheet errors in biomedical research, so we don’t know how frequently they occur.</p>
<p>It must be noted that a blanket ban on spreadsheets as supplementary files is unlikely to mitigate gene name errors entirely, as many researchers might simply export their working spreadsheets to flat text files (errors included). Rather, raising standards around computer code sharing, code review, and reproducibility measures is more likely to deliver lasting improvements in the quality of published research.</p>
<p>In summary, this work demonstrates that gene name errors in supplementary data files of research articles are more frequent than previously appreciated and are not declining over time. Eliminating gene name errors will require major changes to researcher practices which are unlikely to happen in the near term. To monitor gene name errors in PMC we have set up an automated reporting system that will be updated monthly (URL: <ext-link ext-link-type="uri" xlink:href="http://ziemann-lab.net/public/gene_name_errors/" xlink:type="simple">http://ziemann-lab.net/public/gene_name_errors/</ext-link>).</p>
</sec>
<sec id="sec011" sec-type="materials|methods">
<title>Methods</title>
<sec id="sec012">
<title>Characterising spreadsheet software behaviours</title>
<p>We tested the default behaviour of four different spreadsheet programs (Microsoft Excel 365 MSO version, Google Sheets (accessed 4th June 2021), LibreOffice v6.4.6.2, and Gnumeric v1.12.46) by entering the list of strings shown in <xref ref-type="boxed-text" rid="pcbi.1008984.box002">Box 2</xref>. These data were entered into spreadsheets by (i) opening directly from a text file with csv or tsv suffix, (ii) typing directly into cells, and (iii) pasting from a separate text file. We then observed and recorded the propensity of these programs to perform date conversion of the gene symbols.</p>
<boxed-text id="pcbi.1008984.box002" position="float">
<sec id="sec013">
<title>Box 2. Strings used to test date conversion of spreadsheet programs</title>
<p>1/1/2001</p>
<p>2/3/2001</p>
<p>5/4/2008</p>
<p>SEPT7</p>
<p>DEC1</p>
<p>OCT4</p>
<p>MARCH3</p>
<p>5/1/2010</p>
<p>TP53</p>
<p>NCF1</p>
<p>Inf2</p>
</sec>
</boxed-text>
</sec>
<sec id="sec014">
<title>Screening gene names that get converted to dates</title>
<p>All eukaryotic gene annotation files were downloaded from Ensembl (Vertebrates v102, Metazoa v49, Plants v49, Fungi v49 and Protists v49). Gene names were extracted from the GTF files and imported into Excel together with the taxa name (species/strain). The gene name column was sorted to bring cells containing dates to the top of the sheet, where we counted the number of date conversions per taxon.</p>
</sec>
<sec id="sec015">
<title>Searching PMC</title>
<p>PMC (URL: <ext-link ext-link-type="uri" xlink:href="https://www.ncbi.nlm.nih.gov/pmc/" xlink:type="simple">https://www.ncbi.nlm.nih.gov/pmc/</ext-link>) was our starting point for shortlisting open-access publications to screen. We did not screen every publication in PMC because most do not include genomic data. By searching for publications with the keyword “genom*” in the title or abstract, we were able to reduce the number of articles screened by ~95%. For example, in the year 2015, there were 405,251 articles published, but only 21,213 had the keyword “genom*” in the title or abstract. We used this approach to create lists of PMC identifiers by year for the period 2014 to 2020.</p>
</sec>
<sec id="sec016">
<title>Updated software for scanning for gene name errors</title>
<p>A shell script was used to perform the following. Each PMC publication in the shortlist was downloaded as an HTML file. Links to files with .xls or .xlsx suffixes in the HTML were extracted; these were assumed to be supplementary Excel files. Each Excel file was downloaded, and file metadata was scanned to confirm it is an Excel file and not simply a tabular text file with an incorrect suffix. True Excel files were extracted with an R script (R v4.0.0) using the readxl package v1.3.1 (<ext-link ext-link-type="uri" xlink:href="https://cran.r-project.org/package=readxl" xlink:type="simple">https://CRAN.R-project.org/package=readxl</ext-link>) into tabular files. Other text-based files with .xls or .xlsx suffixes were processed with ssconvert v1.12.46 to tabular files. As per a previous study [<xref ref-type="bibr" rid="pcbi.1008984.ref002">2</xref>], these tabular data underwent screening for columns that contained gene symbols. Those columns with five or more gene symbols were considered to be gene lists and underwent screening for erroneous conversions, such as date formats and scientific numbers. The main difference is that this script also recognises five-digit numbers (internal date format). Analysis logs were processed and brought together with the corresponding journal name to yield a list of supplementary files suspected to contain a gene name error.</p>
</sec>
<sec id="sec017">
<title>Verification and data visualisation</title>
<p>Each of these suspect files was downloaded and opened with either Excel or LibreOffice Calc to confirm the presence of gene name errors. To do this, columns appearing to contain gene names were sorted such that numeric values (dates) were brought to the top of the sheet. Summary data were loaded into R v4.1.0 for analysis and visualisation. The two-sided Pearson and Spearman correlation tests were executed in R.</p>
</sec>
</sec>
<sec id="sec018" sec-type="supplementary-material">
<title>Supporting information</title>
<supplementary-material id="pcbi.1008984.s001" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1008984.s001" xlink:type="simple">
<label>S1 Table</label>
<caption>
<title>Ensembl eukaryotic gene names converted to dates by Excel.</title>
<p>(XLSX)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1008984.s002" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1008984.s002" xlink:type="simple">
<label>S2 Table</label>
<caption>
<title>Articles and supplementary Excel files affected by gene name errors.</title>
<p>(XLSX)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1008984.s003" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1008984.s003" xlink:type="simple">
<label>S3 Table</label>
<caption>
<title>Excel files suspected to contain gene name errors by the screening software but turned out to be false positives.</title>
<p>(XLSX)</p>
</caption>
</supplementary-material>
<supplementary-material id="pcbi.1008984.s004" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1008984.s004" xlink:type="simple">
<label>S4 Table</label>
<caption>
<title>Classification of gene name error types in a sample of 2160 affected spreadsheets.</title>
<p>(XLSX)</p>
</caption>
</supplementary-material>
</sec>
</body>
<back>
<ack>
<p>We thank Dr Nicholas C. Wong from Monash University for comments on the manuscript.</p>
</ack>
<ref-list>
<title>References</title>
<ref id="pcbi.1008984.ref001"><label>1</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Zeeberg</surname> <given-names>B</given-names></name>, <name name-style="western"><surname>Riss</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Kane</surname> <given-names>D</given-names></name>, <name name-style="western"><surname>Bussey</surname> <given-names>K</given-names></name>, <name name-style="western"><surname>Uchio</surname> <given-names>E</given-names></name>, <name name-style="western"><surname>Linehan</surname> <given-names>W</given-names></name>, <etal>et al</etal>. <article-title>Mistaken identifiers: gene name errors can be introduced inadvertently when using Excel in bioinformatics</article-title>. <source>BMC Bioinformatics</source>. <year>2004</year>;<volume>5</volume>: <fpage>80</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1186/1471-2105-5-80" xlink:type="simple">10.1186/1471-2105-5-80</ext-link></comment> <object-id pub-id-type="pmid">15214961</object-id></mixed-citation></ref>
<ref id="pcbi.1008984.ref002"><label>2</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Ziemann</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Eren</surname> <given-names>Y</given-names></name>, <name name-style="western"><surname>El-Osta</surname> <given-names>A</given-names></name>. <article-title>Gene name errors are widespread in the scientific literature</article-title>. <source>Genome Biol</source>. <year>2016</year>;<volume>17</volume>: <fpage>177</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1186/s13059-016-1044-7" xlink:type="simple">10.1186/s13059-016-1044-7</ext-link></comment> <object-id pub-id-type="pmid">27552985</object-id></mixed-citation></ref>
<ref id="pcbi.1008984.ref003"><label>3</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Bruford</surname> <given-names>E</given-names></name>, <name name-style="western"><surname>Braschi</surname> <given-names>B</given-names></name>, <name name-style="western"><surname>Denny</surname> <given-names>P</given-names></name>, <name name-style="western"><surname>Jones</surname> <given-names>T</given-names></name>, <name name-style="western"><surname>Seal</surname> <given-names>R</given-names></name>, <name name-style="western"><surname>Tweedie</surname> <given-names>S</given-names></name>. <article-title>Guidelines for human gene nomenclature</article-title>. <source>Nat Genet</source>. <year>2020</year>;<volume>52</volume>: <fpage>754</fpage>–<lpage>758</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1038/s41588-020-0669-3" xlink:type="simple">10.1038/s41588-020-0669-3</ext-link></comment> <object-id pub-id-type="pmid">32747822</object-id></mixed-citation></ref>
<ref id="pcbi.1008984.ref004"><label>4</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Panko</surname> <given-names>R</given-names></name>. <article-title>What we know about spreadsheet errors</article-title>. <source>Journal of Organizational and End User Computing</source>. <year>1998</year>;<volume>10</volume>: <fpage>15</fpage>–<lpage>21</lpage>.</mixed-citation></ref>
<ref id="pcbi.1008984.ref005"><label>5</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Peng</surname> <given-names>R</given-names></name>. <article-title>Reproducible research in computational science</article-title>. <source>Science</source>. <year>2011</year>;<volume>334</volume>: <fpage>1226</fpage>–<lpage>1227</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1126/science.1213847" xlink:type="simple">10.1126/science.1213847</ext-link></comment> <object-id pub-id-type="pmid">22144613</object-id></mixed-citation></ref>
<ref id="pcbi.1008984.ref006"><label>6</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Brembs</surname> <given-names>B</given-names></name>. <article-title>Prestigious science journals struggle to reach even average reliability</article-title>. <source>Frontiers in Human Neuroscience</source>. <year>2018</year>;<volume>12</volume>: <fpage>37</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fnhum.2018.00037" xlink:type="simple">10.3389/fnhum.2018.00037</ext-link></comment> <object-id pub-id-type="pmid">29515380</object-id></mixed-citation></ref>
<ref id="pcbi.1008984.ref007"><label>7</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Mallona</surname> <given-names>I</given-names></name>, <name name-style="western"><surname>Peinado</surname> <given-names>M</given-names></name>. <article-title>Truke, a web tool to check for and handle Excel misidentified gene symbols</article-title>. <source>BMC Genomics</source>. <year>2017</year>;<volume>18</volume>: <fpage>242</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1186/s12864-017-3631-8" xlink:type="simple">10.1186/s12864-017-3631-8</ext-link></comment> <object-id pub-id-type="pmid">28327106</object-id></mixed-citation></ref>
<ref id="pcbi.1008984.ref008"><label>8</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Welsh</surname> <given-names>E</given-names></name>, <name name-style="western"><surname>Stewart</surname> <given-names>P</given-names></name>, <name name-style="western"><surname>Kuenzi</surname> <given-names>B</given-names></name>, <name name-style="western"><surname>Eschrich</surname> <given-names>J</given-names></name>. <article-title>Escape Excel: A tool for preventing gene symbol and accession conversion errors</article-title>. <source>PLOS ONE</source>. <year>2017</year>;<volume>12</volume>: <fpage>e0185207</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1371/journal.pone.0185207" xlink:type="simple">10.1371/journal.pone.0185207</ext-link></comment> <object-id pub-id-type="pmid">28953918</object-id></mixed-citation></ref>
<ref id="pcbi.1008984.ref009"><label>9</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Oh</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Abdelnabi</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Al-Dulaimi</surname> <given-names>R</given-names></name>, <name name-style="western"><surname>Aggarwal</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Ramos</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Davis</surname> <given-names>S</given-names></name>, <etal>et al</etal>. <article-title>HGNChelper: identification and correction of invalid gene symbols for human and mouse [version 1].</article-title> <source>F1000Research</source>. <year>2020</year>;<volume>9</volume>: <fpage>1493</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.12688/f1000research.28033.1" xlink:type="simple">10.12688/f1000research.28033.1</ext-link></comment> <object-id pub-id-type="pmid">33564398</object-id></mixed-citation></ref>
<ref id="pcbi.1008984.ref010"><label>10</label><mixed-citation publication-type="journal" xlink:type="simple"><article-title>Legible ledgers</article-title>. <source>Nat Genet</source>. <year>2016</year>;<volume>48</volume>: <fpage>1101</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1038/ng.3690" xlink:type="simple">10.1038/ng.3690</ext-link></comment> <object-id pub-id-type="pmid">27681286</object-id></mixed-citation></ref>
<ref id="pcbi.1008984.ref011"><label>11</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Powell</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Baker</surname> <given-names>K</given-names></name>, <name name-style="western"><surname>Lawson</surname> <given-names>B</given-names></name>. <article-title>Errors in Operational Spreadsheets</article-title>. <source>Journal of Organizational and End User Computing</source>. <year>2009</year>;<volume>21</volume>: <fpage>24</fpage>–<lpage>36</lpage>.</mixed-citation></ref>
<ref id="pcbi.1008984.ref012"><label>12</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Hong</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Yao</surname> <given-names>H</given-names></name>, <name name-style="western"><surname>Pedersen</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Peters</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Costello</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Murphy</surname> <given-names>D</given-names></name>, <etal>et al</etal>. <article-title>Error rates in a clinical data repository: lessons from the transition to electronic data transfer—a descriptive study</article-title>. <source>BMJ Open</source>. <year>2013</year>;<volume>3</volume>: <fpage>e002406</fpage>.</mixed-citation></ref>
<ref id="pcbi.1008984.ref013"><label>13</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Dobell</surname> <given-names>E</given-names></name>, <name name-style="western"><surname>Herold</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Buckley</surname> <given-names>J</given-names></name>. <article-title>Spreadsheet Error Types and Their Prevalence in a Healthcare Context</article-title>. <source>Journal of Organizational and End User Computing</source>. <year>2018</year>;<volume>30</volume>: <fpage>20</fpage>–<lpage>42</lpage>.</mixed-citation></ref>
<ref id="pcbi.1008984.ref014"><label>14</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Baggerly</surname> <given-names>K</given-names></name>, <name name-style="western"><surname>Coombes</surname> <given-names>K</given-names></name>. <article-title>Deriving chemosensitivity from cell lines: Forensic bioinformatics and reproducible research in high-throughput biology</article-title>. <source>Ann Appl Stat</source>. <year>2009</year>;<volume>3</volume>: <fpage>1309</fpage>–<lpage>1334</lpage>.</mixed-citation></ref>
</ref-list>
</back>
<sub-article article-type="aggregated-review-documents" id="pcbi.1008984.r001" specific-use="decision-letter">
<front-stub>
<article-id pub-id-type="doi">10.1371/journal.pcbi.1008984.r001</article-id>
<title-group>
<article-title>Decision Letter 0</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name name-style="western">
<surname>Ouzounis</surname>
<given-names>Christos A.</given-names>
</name>
<role>Associate Editor</role>
</contrib>
<contrib contrib-type="author">
<name name-style="western">
<surname>Ioshikhes</surname>
<given-names>Ilya</given-names>
</name>
<role>Deputy Editor</role>
</contrib>
</contrib-group>
<permissions>
<copyright-year>2021</copyright-year>
<copyright-holder>Ouzounis, Ioshikhes</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<license-p>This is an open access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">Creative Commons Attribution License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.</license-p>
</license>
</permissions>
<related-object document-id="10.1371/journal.pcbi.1008984" document-id-type="doi" document-type="article" id="rel-obj001" link-type="peer-reviewed-article"/>
<custom-meta-group>
<custom-meta>
<meta-name>Submission Version</meta-name>
<meta-value>0</meta-value>
</custom-meta>
</custom-meta-group>
</front-stub>
<body>
<p>
<named-content content-type="letter-date">2 Jun 2021</named-content>
</p>
<p>Dear Dr Ziemann,</p>
<p>Thank you very much for submitting your manuscript "Gene name errors: lessons not learned" for consideration at PLOS Computational Biology. As with all papers reviewed by the journal, your manuscript was reviewed by members of the editorial board and by several independent reviewers. The reviewers appreciated the attention to an important topic. Based on the reviews, we are likely to accept this manuscript for publication, providing that you modify the manuscript according to the review recommendations.</p>
<p>Please prepare and submit your revised manuscript within 30 days. If you anticipate any delay, please let us know the expected resubmission date by replying to this email.</p>
<p>When you are ready to resubmit, please upload the following:</p>
<p>[1] A letter containing a detailed list of your responses to all review comments, and a description of the changes you have made in the manuscript. Please note while forming your response, if your article is accepted, you may have the opportunity to make the peer review history publicly available. The record will include editor decision letters (with reviews) and your responses to reviewer comments. If eligible, we will contact you to opt in or out.</p>
<p>[2] Two versions of the revised manuscript: one with either highlights or tracked changes denoting where the text has been changed; the other a clean version (uploaded as the manuscript file).</p>
<p>Important additional instructions are given below your reviewer comments.</p>
<p>Thank you again for your submission to our journal. We hope that our editorial process has been constructive so far, and we welcome your feedback at any time. Please don't hesitate to contact us if you have any questions or comments.</p>
<p>Sincerely,</p>
<p>Christos A. Ouzounis</p>
<p>Associate Editor</p>
<p>PLOS Computational Biology</p>
<p>Ilya Ioshikhes</p>
<p>Deputy Editor</p>
<p>PLOS Computational Biology</p>
<p>***********************</p>
<p>A link appears below if there are any accompanying review attachments. If you believe any reviews to be missing, please contact <email xlink:type="simple">ploscompbiol@plos.org</email> immediately:</p>
<p>[LINK]</p>
<p>Reviewer's Responses to Questions</p>
<p><bold>Comments to the Authors:</bold></p>
<p><bold>Please note here if the review is uploaded as an attachment.</bold></p>
<p>Reviewer #1: The authors have written a clearly readable paper with all relevant data and code made available.</p>
<p>It is beneficial to highlight how incorrect gene names continue to be published and how this can be an issue as they are difficult to check for.</p>
<p>It is unclear how useful the comparison between different publications is. It is unclear how many other users will see value in the provided software. It is also unclear whether it was necessary to spend time focusing on which organisms have gene names at risk, and the text devoted to gene name errors by year. Calling a correlation coefficient of 0.589 a strong association is somewhat optimistic.</p>
<p>This paper could be strengthened by offering more solutions to this problem, for example by emphasising using .csv rather than .txt format, and alternatives to spreadsheet software, such as RStudio.</p>
<p>A couple of minor points, it is possible to paste directly into Google Sheets without errors occurring. There appears to be an error with the automated reporting system, with four identical reports currently shown.</p>
<p>Reviewer #2: I applaud the hard work of the authors. Opening several thousand files by hand is a heroic task that nobody envies them for. My hat is off!</p>
<p>I liked the analysis and the writing very much. I have only four minor points, the last one is optional:</p>
<p>1. Ordering table 4 by highest to lowest „Proportion of publications affected” makes grasping the results much easier.</p>
<p>2. Show graph for the correlation between JIF and error rate and report R2. Such a graph is useful for visualizing the negative correlation of journal rank with reliability, consistent with similar graphs from other publications (see next point).</p>
<p>3. In the discussion (p14), the authors mention that the correlation of error rate with journal prestige may seem counterintuitive, but it is not only the 2016 analysis that their results are consistent with. See, e.g., <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fnhum.2018.00037" xlink:type="simple">https://doi.org/10.3389/fnhum.2018.00037</ext-link> for a list of many other analyses that all suggest that work published in prestigious journals is more error prone than that of other publication venues. In fact, it would have been counter-intuitive had the authors found less errors in highly prestigious journals: the evidence predicts more errors in more prestigious journals.</p>
<p>4. The authors may consider inserting a few sentences into their discussion about a solution to this (and many other, related) problems with the data and code underlying our narratives: an infrastructure solution that integrates research data and code in a way that makes supplemental files unnecessary. Such a solution has been technically feasible for some years now and many people have suggested to use such solutions, e.g.:</p>
<p><ext-link ext-link-type="uri" xlink:href="https://journals.plos.org/plosbiology/article?id=10.1371/journal.pbio.3000117" xlink:type="simple">https://journals.plos.org/plosbiology/article?id=10.1371/journal.pbio.3000117</ext-link></p>
<p><ext-link ext-link-type="uri" xlink:href="http://www.theguardian.com/science/head-quarters/2015/may/12/will-traditional-science-journals-disappear" xlink:type="simple">http://www.theguardian.com/science/head-quarters/2015/may/12/will-traditional-science-journals-disappear</ext-link></p>
<p><ext-link ext-link-type="uri" xlink:href="https://neuroneurotic.net/2015/07/17/revolutionise-the-publication-process/" xlink:type="simple">https://neuroneurotic.net/2015/07/17/revolutionise-the-publication-process/</ext-link></p>
<p><ext-link ext-link-type="uri" xlink:href="https://micahallen.org/2015/03/20/short-post-my-science-fiction-vision-of-how-science-could-work-in-the-future/" xlink:type="simple">https://micahallen.org/2015/03/20/short-post-my-science-fiction-vision-of-how-science-could-work-in-the-future/</ext-link></p>
<p>Björn Brembs</p>
<p>**********</p>
<p><bold>Have the authors made all data and (if applicable) computational code underlying the findings in their manuscript fully available?</bold></p>
<p>The <ext-link ext-link-type="uri" xlink:href="https://journals.plos.org/ploscompbiol/s/materials-and-software-sharing" xlink:type="simple">PLOS Data policy</ext-link> requires authors to make all data and code underlying the findings described in their manuscript fully available without restriction, with rare exception (please refer to the Data Availability Statement in the manuscript PDF file). The data and code should be provided as part of the manuscript or its supporting information, or deposited to a public repository. For example, in addition to summary statistics, the data points behind means, medians and variance measures should be available. If there are restrictions on publicly sharing data or code —e.g. participant privacy or use of data from a third party—those must be specified.</p>
<p>Reviewer #1: Yes</p>
<p>Reviewer #2: Yes</p>
<p>**********</p>
<p>PLOS authors have the option to publish the peer review history of their article (<ext-link ext-link-type="uri" xlink:href="https://journals.plos.org/ploscompbiol/s/editorial-and-peer-review-process#loc-peer-review-history" xlink:type="simple">what does this mean?</ext-link>). If published, this will include your full peer review and any attached files.</p>
<p>If you choose “no”, your identity will remain anonymous but your review may still be made public.</p>
<p><bold>Do you want your identity to be public for this peer review?</bold> For information about this choice, including consent withdrawal, please see our <ext-link ext-link-type="uri" xlink:href="https://www.plos.org/privacy-policy" xlink:type="simple">Privacy Policy</ext-link>.</p>
<p>Reviewer #1: No</p>
<p>Reviewer #2: <bold>Yes: </bold>Björn Brembs</p>
<p>Figure Files:</p>
<p>While revising your submission, please upload your figure files to the Preflight Analysis and Conversion Engine (PACE) digital diagnostic tool, <ext-link ext-link-type="uri" xlink:href="https://pacev2.apexcovantage.com" xlink:type="simple">https://pacev2.apexcovantage.com</ext-link>. PACE helps ensure that figures meet PLOS requirements. To use PACE, you must first register as a user. Then, login and navigate to the UPLOAD tab, where you will find detailed instructions on how to use the tool. If you encounter any issues or have any questions when using PACE, please email us at <email xlink:type="simple">figures@plos.org</email>.</p>
<p>Data Requirements:</p>
<p>Please note that, as a condition of publication, PLOS' data policy requires that you make available all data used to draw the conclusions outlined in your manuscript. Data must be deposited in an appropriate repository, included within the body of the manuscript, or uploaded as supporting information. This includes all numerical values that were used to generate graphs, histograms etc. For an example in PLOS Biology see here: <ext-link ext-link-type="uri" xlink:href="http://www.plosbiology.org/article/info%3Adoi%2F10.1371%2Fjournal.pbio.1001908#s5" xlink:type="simple">http://www.plosbiology.org/article/info%3Adoi%2F10.1371%2Fjournal.pbio.1001908#s5</ext-link>.</p>
<p>Reproducibility:</p>
<p>To enhance the reproducibility of your results, we recommend that you deposit your laboratory protocols in protocols.io, where a protocol can be assigned its own identifier (DOI) such that it can be cited independently in the future. Additionally, PLOS ONE offers an option to publish peer-reviewed clinical study protocols. Read more information on sharing protocols at <ext-link ext-link-type="uri" xlink:href="https://plos.org/protocols?utm_medium=editorial-email&amp;utm_source=authorletters&amp;utm_campaign=protocols" xlink:type="simple">https://plos.org/protocols?utm_medium=editorial-email&amp;utm_source=authorletters&amp;utm_campaign=protocols</ext-link></p>
<p>References:</p>
<p>Review your reference list to ensure that it is complete and correct. If you have cited papers that have been retracted, please include the rationale for doing so in the manuscript text, or remove these references and replace them with relevant current references. Any changes to the reference list should be mentioned in the rebuttal letter that accompanies your revised manuscript.</p>
<p><italic>If you need to cite a retracted article, indicate the article’s retracted status in the References list and also include a citation and full reference for the retraction notice.</italic></p>
</body>
</sub-article>
<sub-article article-type="author-comment" id="pcbi.1008984.r002">
<front-stub>
<article-id pub-id-type="doi">10.1371/journal.pcbi.1008984.r002</article-id>
<title-group>
<article-title>Author response to Decision Letter 0</article-title>
</title-group>
<related-object document-id="10.1371/journal.pcbi.1008984" document-id-type="doi" document-type="peer-reviewed-article" id="rel-obj002" link-type="rebutted-decision-letter" object-id="10.1371/journal.pcbi.1008984.r001" object-id-type="doi" object-type="decision-letter"/>
<custom-meta-group>
<custom-meta>
<meta-name>Submission Version</meta-name>
<meta-value>1</meta-value>
</custom-meta>
</custom-meta-group>
</front-stub>
<body>
<p>
<named-content content-type="author-response-date">14 Jun 2021</named-content>
</p>
<supplementary-material id="pcbi.1008984.s005" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document" position="float" xlink:href="info:doi/10.1371/journal.pcbi.1008984.s005" xlink:type="simple">
<label>Attachment</label>
<caption>
<p>Submitted filename: <named-content content-type="submitted-filename">GeneNames_Point-by-point response to reviewer.docx</named-content></p>
</caption>
</supplementary-material>
</body>
</sub-article>
<sub-article article-type="aggregated-review-documents" id="pcbi.1008984.r003" specific-use="decision-letter">
<front-stub>
<article-id pub-id-type="doi">10.1371/journal.pcbi.1008984.r003</article-id>
<title-group>
<article-title>Decision Letter 1</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name name-style="western">
<surname>Ouzounis</surname>
<given-names>Christos A.</given-names>
</name>
<role>Associate Editor</role>
</contrib>
<contrib contrib-type="author">
<name name-style="western">
<surname>Ioshikhes</surname>
<given-names>Ilya</given-names>
</name>
<role>Deputy Editor</role>
</contrib>
</contrib-group>
<permissions>
<copyright-year>2021</copyright-year>
<copyright-holder>Ouzounis, Ioshikhes</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<license-p>This is an open access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">Creative Commons Attribution License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.</license-p>
</license>
</permissions>
<related-object document-id="10.1371/journal.pcbi.1008984" document-id-type="doi" document-type="article" id="rel-obj003" link-type="peer-reviewed-article"/>
<custom-meta-group>
<custom-meta>
<meta-name>Submission Version</meta-name>
<meta-value>1</meta-value>
</custom-meta>
</custom-meta-group>
</front-stub>
<body>
<p>
<named-content content-type="letter-date">1 Jul 2021</named-content>
</p>
<p>Dear Dr Ziemann,</p>
<p>We are pleased to inform you that your manuscript 'Gene name errors: lessons not learned' has been provisionally accepted for publication in PLOS Computational Biology.</p>
<p>Before your manuscript can be formally accepted you will need to complete some formatting changes, which you will receive in a follow up email. A member of our team will be in touch with a set of requests.</p>
<p>Please note that your manuscript will not be scheduled for publication until you have made the required changes, so a swift response is appreciated.</p>
<p>IMPORTANT: The editorial review process is now complete. PLOS will only permit corrections to spelling, formatting or significant scientific errors from this point onwards. Requests for major changes, or any which affect the scientific understanding of your work, will cause delays to the publication date of your manuscript.</p>
<p>Should you, your institution's press office or the journal office choose to press release your paper, you will automatically be opted out of early publication. We ask that you notify us now if you or your institution is planning to press release the article. All press must be co-ordinated with PLOS.</p>
<p>Thank you again for supporting Open Access publishing; we are looking forward to publishing your work in PLOS Computational Biology. </p>
<p>Best regards,</p>
<p>Christos A. Ouzounis</p>
<p>Associate Editor</p>
<p>PLOS Computational Biology</p>
<p>Ilya Ioshikhes</p>
<p>Deputy Editor</p>
<p>PLOS Computational Biology</p>
<p>***********************************************************</p>
<p>Reviewer's Responses to Questions</p>
<p><bold>Comments to the Authors:</bold></p>
<p><bold>Please note here if the review is uploaded as an attachment.</bold></p>
<p>Reviewer #1: I thank the Authors for addressing all of the comments. I am pleased with the additions, particularly Box 1. This article helps continue to strengthen the case for reproducible and transparent science, thank you.</p>
<p>Reviewer #2: The reviewers have addressed all my concerns adequately.</p>
<p>**********</p>
<p><bold>Have the authors made all data and (if applicable) computational code underlying the findings in their manuscript fully available?</bold></p>
<p>The <ext-link ext-link-type="uri" xlink:href="https://journals.plos.org/ploscompbiol/s/materials-and-software-sharing" xlink:type="simple">PLOS Data policy</ext-link> requires authors to make all data and code underlying the findings described in their manuscript fully available without restriction, with rare exception (please refer to the Data Availability Statement in the manuscript PDF file). The data and code should be provided as part of the manuscript or its supporting information, or deposited to a public repository. For example, in addition to summary statistics, the data points behind means, medians and variance measures should be available. If there are restrictions on publicly sharing data or code —e.g. participant privacy or use of data from a third party—those must be specified.</p>
<p>Reviewer #1: Yes</p>
<p>Reviewer #2: Yes</p>
<p>**********</p>
<p>PLOS authors have the option to publish the peer review history of their article (<ext-link ext-link-type="uri" xlink:href="https://journals.plos.org/ploscompbiol/s/editorial-and-peer-review-process#loc-peer-review-history" xlink:type="simple">what does this mean?</ext-link>). If published, this will include your full peer review and any attached files.</p>
<p>If you choose “no”, your identity will remain anonymous but your review may still be made public.</p>
<p><bold>Do you want your identity to be public for this peer review?</bold> For information about this choice, including consent withdrawal, please see our <ext-link ext-link-type="uri" xlink:href="https://www.plos.org/privacy-policy" xlink:type="simple">Privacy Policy</ext-link>.</p>
<p>Reviewer #1: No</p>
<p>Reviewer #2: <bold>Yes: </bold>Björn Brembs</p>
</body>
</sub-article>
<sub-article article-type="editor-report" id="pcbi.1008984.r004" specific-use="acceptance-letter">
<front-stub>
<article-id pub-id-type="doi">10.1371/journal.pcbi.1008984.r004</article-id>
<title-group>
<article-title>Acceptance letter</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name name-style="western">
<surname>Ouzounis</surname>
<given-names>Christos A.</given-names>
</name>
<role>Associate Editor</role>
</contrib>
<contrib contrib-type="author">
<name name-style="western">
<surname>Ioshikhes</surname>
<given-names>Ilya</given-names>
</name>
<role>Deputy Editor</role>
</contrib>
</contrib-group>
<permissions>
<copyright-year>2021</copyright-year>
<copyright-holder>Ouzounis, Ioshikhes</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<license-p>This is an open access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">Creative Commons Attribution License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.</license-p>
</license>
</permissions>
<related-object document-id="10.1371/journal.pcbi.1008984" document-id-type="doi" document-type="article" id="rel-obj004" link-type="peer-reviewed-article"/>
</front-stub>
<body>
<p>
<named-content content-type="letter-date">22 Jul 2021</named-content>
</p>
<p>PCOMPBIOL-D-21-00825R1 </p>
<p>Gene name errors: lessons not learned</p>
<p>Dear Dr Ziemann,</p>
<p>I am pleased to inform you that your manuscript has been formally accepted for publication in PLOS Computational Biology. Your manuscript is now with our production department and you will be notified of the publication date in due course.</p>
<p>The corresponding author will soon be receiving a typeset proof for review, to ensure errors have not been introduced during production. Please review the PDF proof of your manuscript carefully, as this is the last chance to correct any errors. Please note that major changes, or those which affect the scientific understanding of the work, will likely cause delays to the publication date of your manuscript. </p>
<p>Soon after your final files are uploaded, unless you have opted out, the early version of your manuscript will be published online. The date of the early version will be your article's publication date. The final article will be published to the same URL, and all versions of the paper will be accessible to readers.</p>
<p>Thank you again for supporting PLOS Computational Biology and open-access publishing. We are looking forward to publishing your work! </p>
<p>With kind regards,</p>
<p>Zsofi Zombor</p>
<p>PLOS Computational Biology | Carlyle House, Carlyle Road, Cambridge CB4 3DN | United Kingdom <email xlink:type="simple">ploscompbiol@plos.org</email> | Phone +44 (0) 1223-442824 | <ext-link ext-link-type="uri" xlink:href="http://ploscompbiol.org" xlink:type="simple">ploscompbiol.org</ext-link> | @PLOSCompBiol</p>
</body>
</sub-article>
</article>