<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article
  PUBLIC "-//NLM//DTD Journal Publishing DTD v3.0 20080202//EN" "http://dtd.nlm.nih.gov/publishing/3.0/journalpublishing3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="3.0" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="nlm-ta">PLoS ONE</journal-id>
<journal-id journal-id-type="publisher-id">plos</journal-id>
<journal-id journal-id-type="pmc">plosone</journal-id><journal-title-group>
<journal-title>PLoS ONE</journal-title></journal-title-group>
<issn pub-type="epub">1932-6203</issn>
<publisher>
<publisher-name>Public Library of Science</publisher-name>
<publisher-loc>San Francisco, USA</publisher-loc></publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">PONE-D-14-22865</article-id>
<article-id pub-id-type="doi">10.1371/journal.pone.0107801</article-id>
<article-categories><subj-group subj-group-type="heading"><subject>Research Article</subject></subj-group><subj-group subj-group-type="Discipline-v2"><subject>Physical sciences</subject><subj-group><subject>Mathematics</subject><subj-group><subject>Applied mathematics</subject><subj-group><subject>Algorithms</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v2"><subject>Biology and life sciences</subject><subj-group><subject>Biochemistry</subject><subj-group><subject>Biomarkers</subject></subj-group></subj-group><subj-group><subject>Computational biology</subject><subj-group><subject>Genome analysis</subject><subj-group><subject>Transcriptome analysis</subject></subj-group></subj-group></subj-group><subj-group><subject>Neuroscience</subject><subj-group><subject>Cognitive science</subject><subj-group><subject>Artificial intelligence</subject><subj-group><subject>Machine learning</subject><subj-group><subject>Machine learning algorithms</subject></subj-group></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v2"><subject>Computer and information sciences</subject></subj-group><subj-group subj-group-type="Discipline-v2"><subject>Research and analysis methods</subject><subj-group><subject>Database and informatics methods</subject><subj-group><subject>Bioinformatics</subject></subj-group></subj-group><subj-group><subject>Simulation and modeling</subject></subj-group></subj-group></article-categories>
<title-group>
<article-title>A Robust and Accurate Method for Feature Selection and Prioritization from Multi-Class OMICs Data</article-title>
<alt-title alt-title-type="running-head">Feature Selection and Prioritisation from OMICs Data</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" xlink:type="simple"><name name-style="western"><surname>Fortino</surname><given-names>Vittorio</given-names></name><xref ref-type="aff" rid="aff1"><sup>1</sup></xref><xref ref-type="aff" rid="aff2"><sup>2</sup></xref></contrib>
<contrib contrib-type="author" xlink:type="simple"><name name-style="western"><surname>Kinaret</surname><given-names>Pia</given-names></name><xref ref-type="aff" rid="aff1"><sup>1</sup></xref><xref ref-type="aff" rid="aff2"><sup>2</sup></xref></contrib>
<contrib contrib-type="author" xlink:type="simple"><name name-style="western"><surname>Fyhrquist</surname><given-names>Nanna</given-names></name><xref ref-type="aff" rid="aff1"><sup>1</sup></xref><xref ref-type="aff" rid="aff2"><sup>2</sup></xref></contrib>
<contrib contrib-type="author" xlink:type="simple"><name name-style="western"><surname>Alenius</surname><given-names>Harri</given-names></name><xref ref-type="aff" rid="aff1"><sup>1</sup></xref><xref ref-type="aff" rid="aff2"><sup>2</sup></xref></contrib>
<contrib contrib-type="author" xlink:type="simple"><name name-style="western"><surname>Greco</surname><given-names>Dario</given-names></name><xref ref-type="aff" rid="aff1"><sup>1</sup></xref><xref ref-type="aff" rid="aff2"><sup>2</sup></xref><xref ref-type="corresp" rid="cor1"><sup>*</sup></xref></contrib>
</contrib-group>
<aff id="aff1"><label>1</label><addr-line>Unit of Systems Toxicology, Finnish Institute of Occupational Health (FIOH), Helsinki, Finland</addr-line></aff>
<aff id="aff2"><label>2</label><addr-line>Nanosafety Centre, Finnish Institute of Occupational Health (FIOH), Helsinki, Finland</addr-line></aff>
<contrib-group>
<contrib contrib-type="editor" xlink:type="simple"><name name-style="western"><surname>Arthur</surname><given-names>Jonathan</given-names></name>
<role>Editor</role>
<xref ref-type="aff" rid="edit1"/></contrib>
</contrib-group>
<aff id="edit1"><addr-line>Children's Medical Research Institute, Australia</addr-line></aff>
<author-notes>
<corresp id="cor1">* E-mail: <email xlink:type="simple">dario.greco@ttl.fi</email></corresp>
<fn fn-type="conflict"><p>The authors have declared that no competing interests exist.</p></fn>
<fn fn-type="con"><p>Conceived and designed the experiments: DG VF. Performed the experiments: VF PK DG. Analyzed the data: VF DG PK NF HA. Contributed reagents/materials/analysis tools: DG VF. Wrote the paper: DG VF NF PK HA.</p></fn>
</author-notes>
<pub-date pub-type="collection"><year>2014</year></pub-date>
<pub-date pub-type="epub"><day>23</day><month>9</month><year>2014</year></pub-date>
<volume>9</volume>
<issue>9</issue>
<elocation-id>e107801</elocation-id>
<history>
<date date-type="received"><day>22</day><month>5</month><year>2014</year></date>
<date date-type="accepted"><day>22</day><month>8</month><year>2014</year></date>
</history>
<permissions>
<copyright-year>2014</copyright-year>
<copyright-holder>Fortino et al</copyright-holder><license xlink:type="simple"><license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">Creative Commons Attribution License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.</license-p></license></permissions>
<abstract>
<p>Selecting relevant features is a common task in most OMICs data analysis, where the aim is to identify a small set of key features to be used as biomarkers. To this end, two alternative but equally valid methods are mainly available, namely the univariate (filter) or the multivariate (wrapper) approach. The stability of the selected lists of features is an often neglected but very important requirement. If the same features are selected in multiple independent iterations, they more likely are reliable biomarkers. In this study, we developed and evaluated the performance of a novel method for feature selection and prioritization, aiming at generating robust and stable sets of features with high predictive power. The proposed method uses the fuzzy logic for a first unbiased feature selection and a Random Forest built from conditional inference trees to prioritize the candidate discriminant features. Analyzing several multi-class gene expression microarray data sets, we demonstrate that our technique provides equal or better classification performance and a greater stability as compared to other Random Forest-based feature selection methods.</p>
</abstract>
<funding-group><funding-statement>This work has been supported by the European Commission, under grant agreement FP7-309329 (NANOSOLUTIONS). The funders had no role in study design, data collection and analysis, decision to publish, or preparation of the manuscript.</funding-statement></funding-group><counts><page-count count="9"/></counts><custom-meta-group><custom-meta id="data-availability" xlink:type="simple"><meta-name>Data Availability</meta-name><meta-value>The authors confirm that all data underlying the findings are fully available without restriction. All relevant data are within the paper and its Supporting Information files.</meta-value></custom-meta></custom-meta-group></article-meta>
</front>
<body><sec id="s1">
<title>Introduction</title>
<p>Identifying discriminant features, for instance from transcriptomics experiments, and modelling classifiers based on them are fundamental tasks when the aim is to highlight biomarkers (<italic>e.g.</italic> genes or transcripts discriminating healthy from diseased samples). Indeed, clinical classification based on high throughput molecular profiling has been already explored for a number of complex diseases, such as cancer <xref ref-type="bibr" rid="pone.0107801-Weinstein1">[1]</xref>, <xref ref-type="bibr" rid="pone.0107801-Virtanen1">[2]</xref>. These studies become crucial also in terms of public health when such approaches are considered for clinical practice <xref ref-type="bibr" rid="pone.0107801-Tezak1">[3]</xref>. On the other hand, concerns about the stability and reproducibility of microarray results have been raised, despite the huge propagation of the gene selection methods in biomarker discovery <xref ref-type="bibr" rid="pone.0107801-Saeys1">[4]</xref> and interest on this topic seems to be increasing <xref ref-type="bibr" rid="pone.0107801-He1">[5]</xref>, <xref ref-type="bibr" rid="pone.0107801-Abeel1">[6]</xref>. The most frequently used feature selection techniques include univariate (filter), and multivariate (wrapper), approaches. Univariate techniques, such as the formal statistical hypothesis testing or, more in general, the ranking methods, test each feature separately. Multivariate techniques assess the relevance of groups of features simultaneously, by using selection methods (<italic>e.g.</italic> forward or backward selection) coupled with machine learning techniques such as logistic regression, support vector machines (SVM) or random forests (RF) <xref ref-type="bibr" rid="pone.0107801-Guyon1">[7]</xref>–<xref ref-type="bibr" rid="pone.0107801-Kursa1">[9]</xref>. 
Unfortunately, multivariate methods tend to identify different subsets of candidate biomarkers with equal accuracy, even when feature selection algorithms are used on the same data <xref ref-type="bibr" rid="pone.0107801-He1">[5]</xref>, <xref ref-type="bibr" rid="pone.0107801-Abeel1">[6]</xref>. This is particularly true for feature selection problems in OMICs data analysis, where the number of investigated features is much larger than the number of samples. Multiple stability issues can in fact affect these data sets, and the data sets can contain a large number of redundant features <xref ref-type="bibr" rid="pone.0107801-Kursa2">[10]</xref>.</p>
<p>The aim of this study was to develop a feature selection and prioritization framework capable of guaranteeing high stability as well as high classification performance. First, an unsupervised fuzzy pattern discovery method <xref ref-type="bibr" rid="pone.0107801-GlezPea1">[11]</xref> is used to discretize the gene expression data and to identify fuzzy-based feature signatures called fuzzy patterns (FP). Each FP summarizes the most relevant features of each class. Next, a Random Forest (RF) procedure, where the prior knowledge of the FPs is used to enhance the performance of each tree within the classifier, is run on the original data. Last, a permutation based variable importance measure is used to rank the selected features and produce the final prioritized feature list.</p>
<p>We tested our fuzzy pattern – random forest (FPRF) procedure, implemented in R language <xref ref-type="bibr" rid="pone.0107801-R1">[12]</xref>, as well as two widely used methods for feature selection based on RFs and also implemented in R - varSelRF <xref ref-type="bibr" rid="pone.0107801-DazUriarte1">[8]</xref> and Boruta <xref ref-type="bibr" rid="pone.0107801-Kursa1">[9]</xref>, on several gene expression microarray data sets investigating human samples in different pathophysiological conditions. The basic idea of varSelRF is to execute a backward iterative selection process that exploits the measures of variable importance, computed by RF based on CART trees <xref ref-type="bibr" rid="pone.0107801-Breiman1">[13]</xref>, whereas Boruta uses an iteration process that removes, at each run, the features with less contribution to classification accuracy by introducing random variables for the competition <xref ref-type="bibr" rid="pone.0107801-Kursa2">[10]</xref>.</p>
</sec><sec id="s2" sec-type="materials|methods">
<title>Materials and Methods</title>
<sec id="s2a">
<title>The FPRF algorithm</title>
<p><xref ref-type="fig" rid="pone-0107801-g001">Figure 1</xref> depicts the proposed schema of feature selection and prioritization, when dealing with multiclass high-throughput data. The first step consists of a fuzzy pattern discovery method, implemented in the R package <italic>DFP</italic> <xref ref-type="bibr" rid="pone.0107801-GlezPea1">[11]</xref>, which is used to select large subsets of relevant and independent class-specific features indicated as FPs (see <xref ref-type="supplementary-material" rid="pone.0107801.s001">Figure S1</xref> for details). First, the membership function for each feature is computed and each value is consequently transformed into a linguistic label (<italic>i.e.</italic> “Low”, “Medium”, “High” and their intersections “Low-Medium” and “Medium-High”). The output is a discretized (fuzzyfied) dataset that is only used to generate the FPs. A FP is a large set of features whose fuzzyfied pattern is correlated with a specific class (or a biological state of interest). The union of all FPs forms the set of selected features (step 1 in <xref ref-type="fig" rid="pone-0107801-g001">Figure 1</xref>). At the second step, the selected features are prioritized using a RF-based classifier. This is achieved by using a modified RF algorithm that helps to reduce the risk of considering redundant features for the node splitting process, improving the accuracy of the decision trees and the final rank of the features.</p>
<fig id="pone-0107801-g001" position="float"><object-id pub-id-type="doi">10.1371/journal.pone.0107801.g001</object-id><label>Figure 1</label><caption>
<title>Feature selection and prioritization schema.</title>
<p>The feature selection step is based on a fuzzy pattern discovery method implemented into the R-package <italic>DFP</italic>. This method is able to identify the most relevant class-specific features, forming a fuzzy pattern, for each class (A). The selected features are then ranked using a modified RF procedure that exploits the fuzzy patterns to improve the performance of the decision trees grown from large subset of samples and features (B). The RF algorithm works on the gene expression dataset given by the union of all features selected in the first step. Furthermore, the knowledge of the fuzzy-based feature signatures is exploited in order to have a different random selection procedure. For each node, the subset of features used for the splitting process is composed by a random selected feature from each fuzzy feature pattern.</p>
</caption><graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pone.0107801.g001" position="float" xlink:type="simple"/></fig></sec><sec id="s2b">
<title>The discretization and selection steps</title>
<p>As shown in <xref ref-type="supplementary-material" rid="pone.0107801.s001">Figure S1</xref>, the full dataset is discretized into fuzzy labels through the application of membership functions (MFs). A MF is a curve that defines, for each feature in the data set, how each numerical value in the input space is mapped to a membership value or linguistic label (<italic>i.e.</italic>, “Low”, “Medium” and “High”). In the <italic>DFP</italic> package, two types of membership functions are used: i) the polynomial approximation of a Gaussian membership function to model the range of ‘normal’ expression levels of a gene and ii) the polynomial approximation of two sigmoidal membership functions, which are able to specify asymmetric membership functions for the ‘low’ and ‘high’ expression levels. After the discretization step, the FP for each class (or outcome) is computed by selecting the genes with highly frequent discretized label in at least one class (<xref ref-type="supplementary-material" rid="pone.0107801.s001">Figure S1</xref>). The discretization and selection step is based on two parameters (<italic>zeta</italic>, <italic>piVal</italic>). The <italic>zeta</italic> parameter is a threshold used in the membership functions to label the float values with a discrete value, and is thus important for the fuzzy discretization process. The parameter <italic>piVal</italic>, on the other hand, specifies the percentage of values of a class to determine the FPs. Essentially, these two parameters influence the number of features included in each FP. Smaller values of <italic>zeta</italic> and bigger values of <italic>piVal</italic> result in smaller FPs containing fewer features. While too small FPs can result in some empty FPs, bigger FPs might include less relevant features. In our experiments, we have preferred to work with bigger FPs by using the parameter configurations shown in <xref ref-type="supplementary-material" rid="pone.0107801.s002">Table S1</xref>. 
Since in FPRF the predictive power of the features is evaluated in the RF-based classifier, the less informative features will rank low in the final list. Nevertheless, nominally less predictive features that become important in combination with others will have a higher rank in the output.</p>
</sec><sec id="s2c">
<title>The RF-based feature ranking method</title>
<p>We used a RF-based classifier to rank the features selected by the fuzzy pattern discovery method. The RF implementation utilized is based on unbiased classification trees, as implemented in the <monospace>ctree</monospace> function in the R package <italic>party</italic> <xref ref-type="bibr" rid="pone.0107801-Hothorn1">[14]</xref>, <xref ref-type="bibr" rid="pone.0107801-Strobl1">[15]</xref>. The feature importance is usually evaluated through methods such as the Gini importance and the “mean decrease in accuracy” or “permutation” test, available in the package <italic>randomForest</italic> <xref ref-type="bibr" rid="pone.0107801-Breiman2">[16]</xref>. Similarly, a permutation importance measure for <monospace>cforest</monospace> is available in <italic>party</italic>. Since the Gini importance criterion may lead to biased results <xref ref-type="bibr" rid="pone.0107801-Altmann1">[17]</xref>, <xref ref-type="bibr" rid="pone.0107801-Strobl2">[18]</xref>, we used the permutation accuracy importance score to evaluate the selected features.</p>
<p>Furthermore, the package <italic>party</italic> was modified in order to introduce a new RF procedure that exploits the information of FPs to improve the accuracy of the decision trees. At each node, the standard RF procedure selects M variables at random and searches for the best split over them. This procedure is referred to as node splitting process. The new RF procedure simply replaces the random selection of M features with a new process that picks one random relevant feature from each fuzzy pattern. The basic idea is to increase the number of relevant features selected for the node splitting process, restricting the random feature selection from the FPs. Indeed, we observed by internal studies that the RF models built with the proposed RF procedure are significantly better than those obtained with the standard procedure. In more detail, the random subset of features at each node splitting is built as follows:<disp-formula id="pone.0107801.e001"><graphic position="anchor" xlink:href="info:doi/10.1371/journal.pone.0107801.e001" xlink:type="simple"/><label>(1)</label></disp-formula></p>
<p>Where <inline-formula><inline-graphic xlink:href="info:doi/10.1371/journal.pone.0107801.e002" xlink:type="simple"/></inline-formula> is the feature randomly selected from the <inline-formula><inline-graphic xlink:href="info:doi/10.1371/journal.pone.0107801.e003" xlink:type="simple"/></inline-formula> and the index <inline-formula><inline-graphic xlink:href="info:doi/10.1371/journal.pone.0107801.e004" xlink:type="simple"/></inline-formula>. <italic>k</italic> is the number of genes in <inline-formula><inline-graphic xlink:href="info:doi/10.1371/journal.pone.0107801.e005" xlink:type="simple"/></inline-formula> and <italic>n</italic> is the number of FPs ( =  #number of classes). The number of trees used by the RF classifiers to rank the selected features is a relevant parameter, and we used 1000 trees for all the data sets. This value was also used to evaluate the accuracy of all trained RF-based classifiers. The methods varSelRF and Boruta were used with default parameters.</p>
</sec><sec id="s2d">
<title>Datasets analyzed</title>
<p>Four multi-class gene expression microarray data sets were analyzed to evaluate the performances of FPRF. The first data set is a compendium of human peripheral blood mononuclear cells (PBMC) samples, consisting of seven classes, which were generated by integrating multiple independent publicly available series (<xref ref-type="supplementary-material" rid="pone.0107801.s003">Table S2</xref>) from the NCBI GEO repository (<ext-link ext-link-type="uri" xlink:href="http://www.ncbi.nlm.nih.gov/geo/" xlink:type="simple">http://www.ncbi.nlm.nih.gov/geo/</ext-link>). The second data set is a series of transcriptomics profiles of bone marrow cells (BM) from patients with different subtypes of acute lymphoblastic leukemia (ALL) <xref ref-type="bibr" rid="pone.0107801-Yeoh1">[19]</xref> (<ext-link ext-link-type="uri" xlink:href="http://www.stjuderesearch.org/site/data/ALL1" xlink:type="simple">http://www.stjuderesearch.org/site/data/ALL1</ext-link>). In the original study, the dataset Leukemia has been divided into six diagnostic groups (BCR-ABL, E2A-PBX1, Hyperdiploid&gt;50, MLL, T-ALL and TEL-AML1), and one that contains samples that did not fit into any one of the above groups. But, in our study we preferred to consider only those samples that were belonging to one of 6 categories (276). In addition, two data sets from the IMPROVER challenge <xref ref-type="bibr" rid="pone.0107801-Tarca1">[20]</xref>, profiling lung cancer (four classes, GSE43580) and psoriasis specimens (three classes, GSE13355 and GSE14905), respectively, were also analyzed. The preprocessing of all the data sets has followed a similar procedure. First, the raw data (.CEL files) were imported into R v. 
3.0.0 and their quality was checked with the packages <italic>AffyQCReport</italic> (<ext-link ext-link-type="uri" xlink:href="http://www.bioconductor.org/packages/2.12/bioc/html/affyQCReport.html" xlink:type="simple">http://www.bioconductor.org/packages/2.12/bioc/html/affyQCReport.html</ext-link>) and <italic>affyPLM</italic> (<ext-link ext-link-type="uri" xlink:href="http://www.bioconductor.org/packages/2.12/bioc/html/affyPLM.html" xlink:type="simple">http://www.bioconductor.org/packages/2.12/bioc/html/affyPLM.html</ext-link>). Subsequently, the background correction, normalization and summarization of the gene expression values were performed according to the RMA algorithm implemented in the package <italic>affy</italic> <xref ref-type="bibr" rid="pone.0107801-Irizarry1">[21]</xref>. Alternative CDFs v.18 (<ext-link ext-link-type="uri" xlink:href="http://brainarray.mbni.med.umich.edu/brainarray/default.asp" xlink:type="simple">http://brainarray.mbni.med.umich.edu/brainarray/default.asp</ext-link>) were used for probe annotations based on the Ensembl Gene (<ext-link ext-link-type="uri" xlink:href="http://www.ensembl.org/index.html" xlink:type="simple">http://www.ensembl.org/index.html</ext-link>) or the Entrez Gene (<ext-link ext-link-type="uri" xlink:href="http://www.ncbi.nlm.nih.gov/gene" xlink:type="simple">http://www.ncbi.nlm.nih.gov/gene</ext-link>) databases. For the PBMC compendium, the technical biases associated with the series and the platform (different Affymetrix chipsets) were corrected using the ComBat algorithm <xref ref-type="bibr" rid="pone.0107801-Johnson1">[22]</xref> implemented in the R <italic>sva</italic> package <xref ref-type="bibr" rid="pone.0107801-Leek1">[23]</xref>, <xref ref-type="bibr" rid="pone.0107801-Leek2">[24]</xref>. Finally, the genes with low variability across the samples were eliminated.</p>
</sec><sec id="s2e">
<title>Testing and assessment of the results</title>
<p><xref ref-type="fig" rid="pone-0107801-g002">Figure 2</xref> depicts the general schema used to evaluate and compare our method with varSelRF and Boruta. After having randomly selected and left aside the 30% of the data, the remaining 70% of the data was used to generate the sets of selected, relevant features (or genes). Since our method runs feature selection and prioritization, the generated ranked lists were cut-off at different points (or lengths) in order to produce lists of features of different sizes, referred to as n-top ranked feature lists. Next, the classification accuracy of each n-top ranked feature list and the feature lists obtained with varSelRF and Boruta were evaluated. For each list of features, a RF classifier was first trained on the randomly selected 70% of the data set, and then tested on the remaining 30% of the data, in order to get the post-selection classification accuracy. This process was repeated 30 times in order to assess the resulting classification metrics and the stability of the selected features.</p>
<fig id="pone-0107801-g002" position="float"><object-id pub-id-type="doi">10.1371/journal.pone.0107801.g002</object-id><label>Figure 2</label><caption>
<title>Evaluation process.</title>
<p>The full dataset is a matrix with thousands of features (<italic>e.g.</italic> genes) in rows and tens or hundreds of samples (belonging to different classes) in columns. For each sample, the outcome (class) is given. The dataset is randomly divided into training and test sets using a stratified random selection (1). Within the training set, relevant features are selected using the compared methods (2). The FPRF method identifies a wide set of relevant features using a fuzzy pattern discovery technique and ranks them applying a RF-based procedure (3). The most n-relevant features are then selected with n = 30, 50, 100, 150 and 200 (4). The different sets of features are used to evaluate the stability and the corresponding classification performance. For each set of selected features an RF-based classifier is trained on the training set (5). After training, the classifiers are asked to predict the outcome of the test set patients (6). The predicted outcome is compared with the true outcome and the number of correctly classified samples is noted. Steps 1–6 are repeated 30 times, and the resulting evaluation metrics are obtained by averaging over the 30 runs.</p>
</caption><graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pone.0107801.g002" position="float" xlink:type="simple"/></fig>
<p>The execution time for each run was recorded. All the analyses were performed with a quad-core Intel Core i7 3.4 GHz and 32GB DDR3 RAM running Apple Mac OS X v.10.9.2 and R 3.0.2.</p>
</sec><sec id="s2f">
<title>Evaluation Metrics</title>
<p>The evaluation metrics used in this study aimed at assessing the classification accuracy and the stability of the compared methods. The stability was evaluated by comparing the lists of features selected by each method over 30 bootstrap iterations. From these, we looked for significantly self-consistent features <xref ref-type="bibr" rid="pone.0107801-Kursa2">[10]</xref>, that were selected more frequently by bootstrap iterations than what would be expected at random. For each method, we computed the ratio between the number of self-consistent features and the total number of features selected over the 30 bootstrap iterations.</p>
<p>The post-classification accuracy was estimated by the F-score and the G-mean metrics, which are particularly appropriate when unbalanced multi-class problems are considered <xref ref-type="bibr" rid="pone.0107801-Yu1">[25]</xref>. The F-score is based on the F-measure, which is calculated as follows: <disp-formula id="pone.0107801.e006"><graphic position="anchor" xlink:href="info:doi/10.1371/journal.pone.0107801.e006" xlink:type="simple"/><label>(2)</label></disp-formula></p>
<p>Where <inline-formula><inline-graphic xlink:href="info:doi/10.1371/journal.pone.0107801.e007" xlink:type="simple"/></inline-formula> and <inline-formula><inline-graphic xlink:href="info:doi/10.1371/journal.pone.0107801.e008" xlink:type="simple"/></inline-formula> are the recall and the precision, respectively, for the <italic>i<sup>th</sup></italic> class, and <italic>k</italic> is the number of class labels. A high F-measure value guarantees that both recall and precision are reasonably high. The extended F-measure metric for the multi-class case is described as follows: <disp-formula id="pone.0107801.e009"><graphic position="anchor" xlink:href="info:doi/10.1371/journal.pone.0107801.e009" xlink:type="simple"/><label>(3)</label></disp-formula></p>
<p>The G-mean function is instead defined as the geometric mean of the recalls across all the classes.<disp-formula id="pone.0107801.e010"><graphic position="anchor" xlink:href="info:doi/10.1371/journal.pone.0107801.e010" xlink:type="simple"/><label>(4)</label></disp-formula></p>
<p>In addition to these two main metrics, we also report the overall accuracy.</p>
</sec></sec><sec id="s3">
<title>Results and Discussion</title>
<p>We compared the novel feature selection and prioritization method FPRF with two common RF-based feature selection methods, varSelRF and Boruta. We used publicly available gene expression datasets, and evaluated the classification performance and the stability of the selected features following the schema illustrated in <xref ref-type="fig" rid="pone-0107801-g002">Figure 2</xref>.</p>
<sec id="s3a">
<title>Empirical evaluation - accuracy</title>
<p>A common method for the assessment and tuning of feature selection methods consists in evaluating the accuracy/error of a classifier that is trained using only the selected features. At each bootstrap iteration, the features that were selected from the training set were used to train a RF-based classifier, which was then evaluated on the test set. The test set contained samples that were not present in the corresponding training set, and therefore not considered during the feature selection step. Since our method selects and ranks the features, we trained different RF-based classifiers on the training set, selecting different cut points: the first 2, 3, 4, 5, 10, 20, 30, 50, 150, 200 and 250 top-ranked features. We then compared the classification performance of the classifiers FPRF.2–250, with those obtained using the features selected from varSelRF and Boruta. The idea was to compare our method with Boruta and varSelRF, and simultaneously find the best cut points for each analyzed dataset. To impartially assess the classification performance, we used three extended measures, namely, Acc, G-mean (<italic>G</italic>) and F-score (<italic>F</italic>), as defined in the Materials and Methods.</p>
<p><xref ref-type="table" rid="pone-0107801-t001">Table 1</xref> reports the results of varSelRF and Boruta, along with the different n-top ranked feature lists generated by FPRF on the four analyzed data sets. The evaluation metrics were averaged over 30 bootstraps. The RF-based classifiers that were trained with less than or equal to 50-top ranked feature lists exhibited the best classification performance among the different n-top ranked feature lists (<xref ref-type="table" rid="pone-0107801-t002">Table 2</xref>). In the Leukemia dataset the Accuracy, F-score and G-mean values compiled for FPRF.10 and FPRF.20 are almost always bigger than the same values compiled for FPRF.3–5 and FPRF.30–250. Of note, the Leukemia dataset was sensitive to class imbalance, while other datasets were not, as shown by the difference between the Accuracy as well as the F-score and G-mean values. An Accuracy value larger than the F-score or the G-mean value indicates that the classifier is significantly affected by imbalanced class distribution, which is evident with the RF-based classifiers obtained from varSelRF and Boruta. However, this class imbalance problem does not apparently affect the classification performance of the RF-classifiers obtained using FPRF.10 and FPRF.20. In the Lung Cancer dataset, Boruta performs better than our RF-based classifiers achieving 99% of accuracy. However, using only the first four top ranked features, the accuracy of the FPRF.4 classifier reaches 98%.</p>
<table-wrap id="pone-0107801-t001" position="float"><object-id pub-id-type="doi">10.1371/journal.pone.0107801.t001</object-id><label>Table 1</label><caption>
<title>Overview of the analyzed datasets.</title>
</caption><alternatives><graphic id="pone-0107801-t001-1" position="float" mimetype="image" xlink:href="info:doi/10.1371/journal.pone.0107801.t001" xlink:type="simple"/>
<table><colgroup span="1"><col align="left" span="1"/><col align="center" span="1"/><col align="center" span="1"/><col align="center" span="1"/><col align="center" span="1"/></colgroup>
<thead>
<tr>
<td align="left" rowspan="1" colspan="1">Dataset</td>
<td align="left" rowspan="1" colspan="1">Sample Size</td>
<td align="left" rowspan="1" colspan="1"># of genes</td>
<td align="left" rowspan="1" colspan="1"># of classes</td>
<td align="left" rowspan="1" colspan="1">Class list (# of samples)</td>
</tr>
</thead>
<tbody>
<tr>
<td align="left" rowspan="1" colspan="1">Leukemia</td>
<td align="left" rowspan="1" colspan="1">276</td>
<td align="left" rowspan="1" colspan="1">4199</td>
<td align="left" rowspan="1" colspan="1">6</td>
<td align="left" rowspan="1" colspan="1">MLL-rearrangement (21), BCR-ABL (16), T-ALL (45), TEL-AML1 (79), E2A-PBX1 (27) and Hyperdiploid&gt;50 (88).</td>
</tr>
<tr>
<td align="left" rowspan="1" colspan="1">Lung Cancer</td>
<td align="left" rowspan="1" colspan="1">150</td>
<td align="left" rowspan="1" colspan="1">9480</td>
<td align="left" rowspan="1" colspan="1">4</td>
<td align="left" rowspan="1" colspan="1">AC1 (41), AC2 (36), SCC1 (34), SCC2 (39)</td>
</tr>
<tr>
<td align="left" rowspan="1" colspan="1">Psoriasis</td>
<td align="left" rowspan="1" colspan="1">262</td>
<td align="left" rowspan="1" colspan="1">9480</td>
<td align="left" rowspan="1" colspan="1">3</td>
<td align="left" rowspan="1" colspan="1">Involved-skin (91), Normal-skin (85), Uninvolved-skin (86)</td>
</tr>
<tr>
<td align="left" rowspan="1" colspan="1">PBMC</td>
<td align="left" rowspan="1" colspan="1">978</td>
<td align="left" rowspan="1" colspan="1">3700</td>
<td align="left" rowspan="1" colspan="1">7</td>
<td align="left" rowspan="1" colspan="1">Asthma quiet (521), HCV (60), Healthy (170), HIV (60), Measles (15), S. aureus (44) and SLE (108).</td>
</tr>
</tbody>
</table>
</alternatives><table-wrap-foot><fn id="nt101"><label/><p>For each dataset, the number of samples, the number of features/genes after pre-processing the data, the number of classes and samples specified for each class are reported.</p></fn></table-wrap-foot></table-wrap><table-wrap id="pone-0107801-t002" position="float"><object-id pub-id-type="doi">10.1371/journal.pone.0107801.t002</object-id><label>Table 2</label><caption>
<title>Classification performance.</title>
</caption><alternatives><graphic id="pone-0107801-t002-2" position="float" mimetype="image" xlink:href="info:doi/10.1371/journal.pone.0107801.t002" xlink:type="simple"/>
<table><colgroup span="1"><col align="left" span="1"/><col align="center" span="1"/><col align="center" span="1"/><col align="center" span="1"/><col align="center" span="1"/><col align="center" span="1"/><col align="center" span="1"/><col align="center" span="1"/><col align="center" span="1"/><col align="center" span="1"/><col align="center" span="1"/><col align="center" span="1"/><col align="center" span="1"/></colgroup>
<thead>
<tr>
<td align="left" rowspan="1" colspan="1">Method</td>
<td colspan="3" align="left" rowspan="1">Leukemia</td>
<td colspan="3" align="left" rowspan="1">Lung Cancer</td>
<td colspan="3" align="left" rowspan="1">Psoriasis</td>
<td colspan="3" align="left" rowspan="1">PBMC</td>
</tr>
<tr>
<td align="left" rowspan="1" colspan="1"/>
<td align="left" rowspan="1" colspan="1">Acc</td>
<td align="left" rowspan="1" colspan="1">F</td>
<td align="left" rowspan="1" colspan="1">G</td>
<td align="left" rowspan="1" colspan="1">Acc</td>
<td align="left" rowspan="1" colspan="1">F</td>
<td align="left" rowspan="1" colspan="1">G</td>
<td align="left" rowspan="1" colspan="1">Acc</td>
<td align="left" rowspan="1" colspan="1">F</td>
<td align="left" rowspan="1" colspan="1">G</td>
<td align="left" rowspan="1" colspan="1">Acc</td>
<td align="left" rowspan="1" colspan="1">F</td>
<td align="left" rowspan="1" colspan="1">G</td>
</tr>
</thead>
<tbody>
<tr>
<td align="left" rowspan="1" colspan="1">varSelRF</td>
<td align="left" rowspan="1" colspan="1">0.98</td>
<td align="left" rowspan="1" colspan="1">0.95</td>
<td align="left" rowspan="1" colspan="1">0.95</td>
<td align="left" rowspan="1" colspan="1">0.91</td>
<td align="left" rowspan="1" colspan="1">0.9</td>
<td align="left" rowspan="1" colspan="1">0.9</td>
<td align="left" rowspan="1" colspan="1">0.98</td>
<td align="left" rowspan="1" colspan="1">0.98</td>
<td align="left" rowspan="1" colspan="1">0.98</td>
<td align="left" rowspan="1" colspan="1">0.98</td>
<td align="left" rowspan="1" colspan="1">0.97</td>
<td align="left" rowspan="1" colspan="1">0.96</td>
</tr>
<tr>
<td align="left" rowspan="1" colspan="1">Boruta</td>
<td align="left" rowspan="1" colspan="1">0.98</td>
<td align="left" rowspan="1" colspan="1">0.95</td>
<td align="left" rowspan="1" colspan="1">0.94</td>
<td align="left" rowspan="1" colspan="1">0.99</td>
<td align="left" rowspan="1" colspan="1">0.99</td>
<td align="left" rowspan="1" colspan="1">0.99</td>
<td align="left" rowspan="1" colspan="1">0.99</td>
<td align="left" rowspan="1" colspan="1">0.99</td>
<td align="left" rowspan="1" colspan="1">0.99</td>
<td align="left" rowspan="1" colspan="1">0.98</td>
<td align="left" rowspan="1" colspan="1">0.97</td>
<td align="left" rowspan="1" colspan="1">0.96</td>
</tr>
<tr>
<td align="left" rowspan="1" colspan="1">FPRF.2</td>
<td align="left" rowspan="1" colspan="1">0.96</td>
<td align="left" rowspan="1" colspan="1">0.94</td>
<td align="left" rowspan="1" colspan="1">0.95</td>
<td align="left" rowspan="1" colspan="1">0.96</td>
<td align="left" rowspan="1" colspan="1">0.96</td>
<td align="left" rowspan="1" colspan="1">0.96</td>
<td align="left" rowspan="1" colspan="1">0.97</td>
<td align="left" rowspan="1" colspan="1">0.97</td>
<td align="left" rowspan="1" colspan="1">0.97</td>
<td align="left" rowspan="1" colspan="1">0.96</td>
<td align="left" rowspan="1" colspan="1">0.92</td>
<td align="left" rowspan="1" colspan="1">0.94</td>
</tr>
<tr>
<td align="left" rowspan="1" colspan="1">FPRF.3</td>
<td align="left" rowspan="1" colspan="1">0.96</td>
<td align="left" rowspan="1" colspan="1">0.94</td>
<td align="left" rowspan="1" colspan="1">0.95</td>
<td align="left" rowspan="1" colspan="1">0.97</td>
<td align="left" rowspan="1" colspan="1">0.97</td>
<td align="left" rowspan="1" colspan="1">0.97</td>
<td align="left" rowspan="1" colspan="1">0.98</td>
<td align="left" rowspan="1" colspan="1">0.98</td>
<td align="left" rowspan="1" colspan="1">0.98</td>
<td align="left" rowspan="1" colspan="1">0.97</td>
<td align="left" rowspan="1" colspan="1">0.94</td>
<td align="left" rowspan="1" colspan="1">0.94</td>
</tr>
<tr>
<td align="left" rowspan="1" colspan="1">FPRF.4</td>
<td align="left" rowspan="1" colspan="1">0.97</td>
<td align="left" rowspan="1" colspan="1">0.96</td>
<td align="left" rowspan="1" colspan="1">0.96</td>
<td align="left" rowspan="1" colspan="1">0.98</td>
<td align="left" rowspan="1" colspan="1">0.98</td>
<td align="left" rowspan="1" colspan="1">0.98</td>
<td align="left" rowspan="1" colspan="1">0.98</td>
<td align="left" rowspan="1" colspan="1">0.98</td>
<td align="left" rowspan="1" colspan="1">0.98</td>
<td align="left" rowspan="1" colspan="1">0.98</td>
<td align="left" rowspan="1" colspan="1">0.95</td>
<td align="left" rowspan="1" colspan="1">0.96</td>
</tr>
<tr>
<td align="left" rowspan="1" colspan="1">FPRF.5</td>
<td align="left" rowspan="1" colspan="1">0.98</td>
<td align="left" rowspan="1" colspan="1">0.97</td>
<td align="left" rowspan="1" colspan="1">0.97</td>
<td align="left" rowspan="1" colspan="1">0.97</td>
<td align="left" rowspan="1" colspan="1">0.97</td>
<td align="left" rowspan="1" colspan="1">0.97</td>
<td align="left" rowspan="1" colspan="1">0.99</td>
<td align="left" rowspan="1" colspan="1">0.99</td>
<td align="left" rowspan="1" colspan="1">0.99</td>
<td align="left" rowspan="1" colspan="1">0.99</td>
<td align="left" rowspan="1" colspan="1">0.96</td>
<td align="left" rowspan="1" colspan="1">0.96</td>
</tr>
<tr>
<td align="left" rowspan="1" colspan="1">FPRF.10</td>
<td align="left" rowspan="1" colspan="1">0.99</td>
<td align="left" rowspan="1" colspan="1">0.99</td>
<td align="left" rowspan="1" colspan="1">0.99</td>
<td align="left" rowspan="1" colspan="1">0.96</td>
<td align="left" rowspan="1" colspan="1">0.95</td>
<td align="left" rowspan="1" colspan="1">0.95</td>
<td align="left" rowspan="1" colspan="1">0.99</td>
<td align="left" rowspan="1" colspan="1">0.99</td>
<td align="left" rowspan="1" colspan="1">0.99</td>
<td align="left" rowspan="1" colspan="1">0.99</td>
<td align="left" rowspan="1" colspan="1">0.98</td>
<td align="left" rowspan="1" colspan="1">0.98</td>
</tr>
<tr>
<td align="left" rowspan="1" colspan="1">FPRF.20</td>
<td align="left" rowspan="1" colspan="1">0.99</td>
<td align="left" rowspan="1" colspan="1">0.99</td>
<td align="left" rowspan="1" colspan="1">0.99</td>
<td align="left" rowspan="1" colspan="1">0.94</td>
<td align="left" rowspan="1" colspan="1">0.93</td>
<td align="left" rowspan="1" colspan="1">0.93</td>
<td align="left" rowspan="1" colspan="1">0.99</td>
<td align="left" rowspan="1" colspan="1">0.99</td>
<td align="left" rowspan="1" colspan="1">0.99</td>
<td align="left" rowspan="1" colspan="1">0.99</td>
<td align="left" rowspan="1" colspan="1">0.98</td>
<td align="left" rowspan="1" colspan="1">0.97</td>
</tr>
<tr>
<td align="left" rowspan="1" colspan="1">FPRF.30</td>
<td align="left" rowspan="1" colspan="1">0.98</td>
<td align="left" rowspan="1" colspan="1">0.98</td>
<td align="left" rowspan="1" colspan="1">0.98</td>
<td align="left" rowspan="1" colspan="1">0.92</td>
<td align="left" rowspan="1" colspan="1">0.92</td>
<td align="left" rowspan="1" colspan="1">0.92</td>
<td align="left" rowspan="1" colspan="1">0.99</td>
<td align="left" rowspan="1" colspan="1">0.99</td>
<td align="left" rowspan="1" colspan="1">0.99</td>
<td align="left" rowspan="1" colspan="1">0.99</td>
<td align="left" rowspan="1" colspan="1">0.98</td>
<td align="left" rowspan="1" colspan="1">0.97</td>
</tr>
<tr>
<td align="left" rowspan="1" colspan="1">FPRF.50</td>
<td align="left" rowspan="1" colspan="1">0.98</td>
<td align="left" rowspan="1" colspan="1">0.98</td>
<td align="left" rowspan="1" colspan="1">0.97</td>
<td align="left" rowspan="1" colspan="1">0.92</td>
<td align="left" rowspan="1" colspan="1">0.92</td>
<td align="left" rowspan="1" colspan="1">0.91</td>
<td align="left" rowspan="1" colspan="1">0.99</td>
<td align="left" rowspan="1" colspan="1">0.99</td>
<td align="left" rowspan="1" colspan="1">0.99</td>
<td align="left" rowspan="1" colspan="1">0.99</td>
<td align="left" rowspan="1" colspan="1">0.98</td>
<td align="left" rowspan="1" colspan="1">0.98</td>
</tr>
<tr>
<td align="left" rowspan="1" colspan="1">FPRF.100</td>
<td align="left" rowspan="1" colspan="1">0.98</td>
<td align="left" rowspan="1" colspan="1">0.97</td>
<td align="left" rowspan="1" colspan="1">0.98</td>
<td align="left" rowspan="1" colspan="1">0.88</td>
<td align="left" rowspan="1" colspan="1">0.87</td>
<td align="left" rowspan="1" colspan="1">0.87</td>
<td align="left" rowspan="1" colspan="1">0.97</td>
<td align="left" rowspan="1" colspan="1">0.97</td>
<td align="left" rowspan="1" colspan="1">0.97</td>
<td align="left" rowspan="1" colspan="1">0.98</td>
<td align="left" rowspan="1" colspan="1">0.97</td>
<td align="left" rowspan="1" colspan="1">0.96</td>
</tr>
<tr>
<td align="left" rowspan="1" colspan="1">FPRF.150</td>
<td align="left" rowspan="1" colspan="1">0.97</td>
<td align="left" rowspan="1" colspan="1">0.97</td>
<td align="left" rowspan="1" colspan="1">0.96</td>
<td align="left" rowspan="1" colspan="1">0.88</td>
<td align="left" rowspan="1" colspan="1">0.86</td>
<td align="left" rowspan="1" colspan="1">0.85</td>
<td align="left" rowspan="1" colspan="1">0.96</td>
<td align="left" rowspan="1" colspan="1">0.96</td>
<td align="left" rowspan="1" colspan="1">0.96</td>
<td align="left" rowspan="1" colspan="1">0.98</td>
<td align="left" rowspan="1" colspan="1">0.97</td>
<td align="left" rowspan="1" colspan="1">0.96</td>
</tr>
<tr>
<td align="left" rowspan="1" colspan="1">FPRF.200</td>
<td align="left" rowspan="1" colspan="1">0.97</td>
<td align="left" rowspan="1" colspan="1">0.96</td>
<td align="left" rowspan="1" colspan="1">0.96</td>
<td align="left" rowspan="1" colspan="1">0.87</td>
<td align="left" rowspan="1" colspan="1">0.85</td>
<td align="left" rowspan="1" colspan="1">0.85</td>
<td align="left" rowspan="1" colspan="1">0.96</td>
<td align="left" rowspan="1" colspan="1">0.95</td>
<td align="left" rowspan="1" colspan="1">0.95</td>
<td align="left" rowspan="1" colspan="1">0.98</td>
<td align="left" rowspan="1" colspan="1">0.97</td>
<td align="left" rowspan="1" colspan="1">0.96</td>
</tr>
<tr>
<td align="left" rowspan="1" colspan="1">FPRF.250</td>
<td align="left" rowspan="1" colspan="1">0.96</td>
<td align="left" rowspan="1" colspan="1">0.96</td>
<td align="left" rowspan="1" colspan="1">0.96</td>
<td align="left" rowspan="1" colspan="1">0.76</td>
<td align="left" rowspan="1" colspan="1">0.81</td>
<td align="left" rowspan="1" colspan="1">0.81</td>
<td align="left" rowspan="1" colspan="1">0.94</td>
<td align="left" rowspan="1" colspan="1">0.94</td>
<td align="left" rowspan="1" colspan="1">0.94</td>
<td align="left" rowspan="1" colspan="1">0.98</td>
<td align="left" rowspan="1" colspan="1">0.97</td>
<td align="left" rowspan="1" colspan="1">0.95</td>
</tr>
</tbody>
</table>
</alternatives><table-wrap-foot><fn id="nt102"><label/><p>The mean accuracy values obtained over the 30 bootstrap iterations. Acc – is the overall accuracy, F – is the F-score, G – is the G-mean. The highest values are highlighted in bold. NOTE: all the corresponding standard deviations are less than 0.02.</p></fn></table-wrap-foot></table-wrap>
<p>Our results confirm the recent observations of Kursa <xref ref-type="bibr" rid="pone.0107801-Kursa2">[10]</xref>, where the robustness of different RF-based gene selection methods was evaluated in terms of accuracy and stability. The high accuracy of the generated gene sets is due to the nature of RF-based classifiers. They can handle a large number of noisy features without a significant increase in error. Therefore, although the classification performance is relevant for evaluating the quality of the selected feature sets, it is alone not sufficient to provide a reliable assessment of the selection quality.</p>
</sec><sec id="s3b">
<title>Empirical evaluation - stability</title>
<p>For each method, the feature stability was assessed using the sets of selected features over 30 bootstrap iterations. The stability of the different n-top ranked feature sets was compared with that obtained with varSelRF and Boruta. For each method, we identified the set of stable features by applying a recently described self-consistency-based method <xref ref-type="bibr" rid="pone.0107801-Kursa2">[10]</xref>.</p>
<p><xref ref-type="table" rid="pone-0107801-t003">Table 3</xref> reports the ratios of the self-consistent features to the total number of different features selected over the 30 bootstrap iterations for each data set and method considered. The 2, 3, 4, 5, 10 and 20 top-ranked features provided by our method exhibit better robustness for all data sets. This is particularly true in the case of the Lung Cancer and Psoriasis datasets, where the observed stability values are much higher than those achieved by Boruta.</p>
<table-wrap id="pone-0107801-t003" position="float"><object-id pub-id-type="doi">10.1371/journal.pone.0107801.t003</object-id><label>Table 3</label><caption>
<title>Selection consistency analysis.</title>
</caption><alternatives><graphic id="pone-0107801-t003-3" position="float" mimetype="image" xlink:href="info:doi/10.1371/journal.pone.0107801.t003" xlink:type="simple"/>
<table><colgroup span="1"><col align="left" span="1"/><col align="center" span="1"/><col align="center" span="1"/><col align="center" span="1"/><col align="center" span="1"/><col align="center" span="1"/><col align="center" span="1"/><col align="center" span="1"/><col align="center" span="1"/><col align="center" span="1"/><col align="center" span="1"/><col align="center" span="1"/><col align="center" span="1"/></colgroup>
<thead>
<tr>
<td align="left" rowspan="1" colspan="1">Method</td>
<td colspan="3" align="left" rowspan="1">Leukemia</td>
<td colspan="3" align="left" rowspan="1">Lung Cancer</td>
<td colspan="3" align="left" rowspan="1">Psoriasis</td>
<td colspan="3" align="left" rowspan="1">PBMC</td>
</tr>
<tr>
<td align="left" rowspan="1" colspan="1"/>
<td align="left" rowspan="1" colspan="1">ns</td>
<td align="left" rowspan="1" colspan="1">tot</td>
<td align="left" rowspan="1" colspan="1">ns/tot</td>
<td align="left" rowspan="1" colspan="1">ns</td>
<td align="left" rowspan="1" colspan="1">tot</td>
<td align="left" rowspan="1" colspan="1">ns/tot</td>
<td align="left" rowspan="1" colspan="1">ns</td>
<td align="left" rowspan="1" colspan="1">tot</td>
<td align="left" rowspan="1" colspan="1">ns/tot</td>
<td align="left" rowspan="1" colspan="1">ns</td>
<td align="left" rowspan="1" colspan="1">tot</td>
<td align="left" rowspan="1" colspan="1">ns/tot</td>
</tr>
</thead>
<tbody>
<tr>
<td align="left" rowspan="1" colspan="1">varSelRF</td>
<td align="left" rowspan="1" colspan="1">125</td>
<td align="left" rowspan="1" colspan="1">232</td>
<td align="left" rowspan="1" colspan="1">54%</td>
<td align="left" rowspan="1" colspan="1">241</td>
<td align="left" rowspan="1" colspan="1">1158</td>
<td align="left" rowspan="1" colspan="1">21%</td>
<td align="left" rowspan="1" colspan="1">291</td>
<td align="left" rowspan="1" colspan="1">734</td>
<td align="left" rowspan="1" colspan="1">40%</td>
<td align="left" rowspan="1" colspan="1">174</td>
<td align="left" rowspan="1" colspan="1">389</td>
<td align="left" rowspan="1" colspan="1">45%</td>
</tr>
<tr>
<td align="left" rowspan="1" colspan="1">Boruta</td>
<td align="left" rowspan="1" colspan="1">193</td>
<td align="left" rowspan="1" colspan="1">328</td>
<td align="left" rowspan="1" colspan="1">58%</td>
<td align="left" rowspan="1" colspan="1">20</td>
<td align="left" rowspan="1" colspan="1">93</td>
<td align="left" rowspan="1" colspan="1">22%</td>
<td align="left" rowspan="1" colspan="1">62</td>
<td align="left" rowspan="1" colspan="1">179</td>
<td align="left" rowspan="1" colspan="1">35%</td>
<td align="left" rowspan="1" colspan="1">343</td>
<td align="left" rowspan="1" colspan="1">667</td>
<td align="left" rowspan="1" colspan="1">51%</td>
</tr>
<tr>
<td align="left" rowspan="1" colspan="1">FPRF.2</td>
<td align="left" rowspan="1" colspan="1">4</td>
<td align="left" rowspan="1" colspan="1">5</td>
<td align="left" rowspan="1" colspan="1">80%</td>
<td align="left" rowspan="1" colspan="1">8</td>
<td align="left" rowspan="1" colspan="1">16</td>
<td align="left" rowspan="1" colspan="1">50%</td>
<td align="left" rowspan="1" colspan="1">2</td>
<td align="left" rowspan="1" colspan="1">4</td>
<td align="left" rowspan="1" colspan="1">50%</td>
<td align="left" rowspan="1" colspan="1">4</td>
<td align="left" rowspan="1" colspan="1">7</td>
<td align="left" rowspan="1" colspan="1">57%</td>
</tr>
<tr>
<td align="left" rowspan="1" colspan="1">FPRF.3</td>
<td align="left" rowspan="1" colspan="1">4</td>
<td align="left" rowspan="1" colspan="1">7</td>
<td align="left" rowspan="1" colspan="1">57%</td>
<td align="left" rowspan="1" colspan="1">10</td>
<td align="left" rowspan="1" colspan="1">22</td>
<td align="left" rowspan="1" colspan="1">50%</td>
<td align="left" rowspan="1" colspan="1">6</td>
<td align="left" rowspan="1" colspan="1">8</td>
<td align="left" rowspan="1" colspan="1">75%</td>
<td align="left" rowspan="1" colspan="1">5</td>
<td align="left" rowspan="1" colspan="1">12</td>
<td align="left" rowspan="1" colspan="1">42%</td>
</tr>
<tr>
<td align="left" rowspan="1" colspan="1">FPRF.4</td>
<td align="left" rowspan="1" colspan="1">6</td>
<td align="left" rowspan="1" colspan="1">12</td>
<td align="left" rowspan="1" colspan="1">50%</td>
<td align="left" rowspan="1" colspan="1">13</td>
<td align="left" rowspan="1" colspan="1">27</td>
<td align="left" rowspan="1" colspan="1">50%</td>
<td align="left" rowspan="1" colspan="1">7</td>
<td align="left" rowspan="1" colspan="1">9</td>
<td align="left" rowspan="1" colspan="1">78%</td>
<td align="left" rowspan="1" colspan="1">6</td>
<td align="left" rowspan="1" colspan="1">13</td>
<td align="left" rowspan="1" colspan="1">46%</td>
</tr>
<tr>
<td align="left" rowspan="1" colspan="1">FPRF.5</td>
<td align="left" rowspan="1" colspan="1">9</td>
<td align="left" rowspan="1" colspan="1">13</td>
<td align="left" rowspan="1" colspan="1">69%</td>
<td align="left" rowspan="1" colspan="1">13</td>
<td align="left" rowspan="1" colspan="1">32</td>
<td align="left" rowspan="1" colspan="1">41%</td>
<td align="left" rowspan="1" colspan="1">8</td>
<td align="left" rowspan="1" colspan="1">13</td>
<td align="left" rowspan="1" colspan="1">62%</td>
<td align="left" rowspan="1" colspan="1">10</td>
<td align="left" rowspan="1" colspan="1">14</td>
<td align="left" rowspan="1" colspan="1">71%</td>
</tr>
<tr>
<td align="left" rowspan="1" colspan="1">FPRF.10</td>
<td align="left" rowspan="1" colspan="1">17</td>
<td align="left" rowspan="1" colspan="1">25</td>
<td align="left" rowspan="1" colspan="1">68%</td>
<td align="left" rowspan="1" colspan="1">20</td>
<td align="left" rowspan="1" colspan="1">61</td>
<td align="left" rowspan="1" colspan="1">33%</td>
<td align="left" rowspan="1" colspan="1">20</td>
<td align="left" rowspan="1" colspan="1">31</td>
<td align="left" rowspan="1" colspan="1">65%</td>
<td align="left" rowspan="1" colspan="1">12</td>
<td align="left" rowspan="1" colspan="1">26</td>
<td align="left" rowspan="1" colspan="1">46%</td>
</tr>
<tr>
<td align="left" rowspan="1" colspan="1">FPRF.20</td>
<td align="left" rowspan="1" colspan="1">30</td>
<td align="left" rowspan="1" colspan="1">48</td>
<td align="left" rowspan="1" colspan="1">62%</td>
<td align="left" rowspan="1" colspan="1">39</td>
<td align="left" rowspan="1" colspan="1">126</td>
<td align="left" rowspan="1" colspan="1">31%</td>
<td align="left" rowspan="1" colspan="1">31</td>
<td align="left" rowspan="1" colspan="1">65</td>
<td align="left" rowspan="1" colspan="1">48%</td>
<td align="left" rowspan="1" colspan="1">29</td>
<td align="left" rowspan="1" colspan="1">50</td>
<td align="left" rowspan="1" colspan="1">58%</td>
</tr>
<tr>
<td align="left" rowspan="1" colspan="1">FPRF.30</td>
<td align="left" rowspan="1" colspan="1">47</td>
<td align="left" rowspan="1" colspan="1">78</td>
<td align="left" rowspan="1" colspan="1">60%</td>
<td align="left" rowspan="1" colspan="1">45</td>
<td align="left" rowspan="1" colspan="1">178</td>
<td align="left" rowspan="1" colspan="1">25%</td>
<td align="left" rowspan="1" colspan="1">44</td>
<td align="left" rowspan="1" colspan="1">104</td>
<td align="left" rowspan="1" colspan="1">42%</td>
<td align="left" rowspan="1" colspan="1">41</td>
<td align="left" rowspan="1" colspan="1">86</td>
<td align="left" rowspan="1" colspan="1">48%</td>
</tr>
<tr>
<td align="left" rowspan="1" colspan="1">FPRF.50</td>
<td align="left" rowspan="1" colspan="1">71</td>
<td align="left" rowspan="1" colspan="1">134</td>
<td align="left" rowspan="1" colspan="1">53%</td>
<td align="left" rowspan="1" colspan="1">84</td>
<td align="left" rowspan="1" colspan="1">319</td>
<td align="left" rowspan="1" colspan="1">26%</td>
<td align="left" rowspan="1" colspan="1">86</td>
<td align="left" rowspan="1" colspan="1">190</td>
<td align="left" rowspan="1" colspan="1">45%</td>
<td align="left" rowspan="1" colspan="1">71</td>
<td align="left" rowspan="1" colspan="1">136</td>
<td align="left" rowspan="1" colspan="1">52%</td>
</tr>
<tr>
<td align="left" rowspan="1" colspan="1">FPRF.100</td>
<td align="left" rowspan="1" colspan="1">130</td>
<td align="left" rowspan="1" colspan="1">256</td>
<td align="left" rowspan="1" colspan="1">51%</td>
<td align="left" rowspan="1" colspan="1">160</td>
<td align="left" rowspan="1" colspan="1">631</td>
<td align="left" rowspan="1" colspan="1">25%</td>
<td align="left" rowspan="1" colspan="1">158</td>
<td align="left" rowspan="1" colspan="1">554</td>
<td align="left" rowspan="1" colspan="1">29%</td>
<td align="left" rowspan="1" colspan="1">123</td>
<td align="left" rowspan="1" colspan="1">268</td>
<td align="left" rowspan="1" colspan="1">46%</td>
</tr>
<tr>
<td align="left" rowspan="1" colspan="1">FPRF.150</td>
<td align="left" rowspan="1" colspan="1">181</td>
<td align="left" rowspan="1" colspan="1">364</td>
<td align="left" rowspan="1" colspan="1">50%</td>
<td align="left" rowspan="1" colspan="1">201</td>
<td align="left" rowspan="1" colspan="1">914</td>
<td align="left" rowspan="1" colspan="1">22%</td>
<td align="left" rowspan="1" colspan="1">206</td>
<td align="left" rowspan="1" colspan="1">835</td>
<td align="left" rowspan="1" colspan="1">25%</td>
<td align="left" rowspan="1" colspan="1">185</td>
<td align="left" rowspan="1" colspan="1">387</td>
<td align="left" rowspan="1" colspan="1">48%</td>
</tr>
<tr>
<td align="left" rowspan="1" colspan="1">FPRF.200</td>
<td align="left" rowspan="1" colspan="1">243</td>
<td align="left" rowspan="1" colspan="1">541</td>
<td align="left" rowspan="1" colspan="1">45%</td>
<td align="left" rowspan="1" colspan="1">231</td>
<td align="left" rowspan="1" colspan="1">1166</td>
<td align="left" rowspan="1" colspan="1">20%</td>
<td align="left" rowspan="1" colspan="1">264</td>
<td align="left" rowspan="1" colspan="1">989</td>
<td align="left" rowspan="1" colspan="1">27%</td>
<td align="left" rowspan="1" colspan="1">232</td>
<td align="left" rowspan="1" colspan="1">509</td>
<td align="left" rowspan="1" colspan="1">46%</td>
</tr>
<tr>
<td align="left" rowspan="1" colspan="1">FPRF.250</td>
<td align="left" rowspan="1" colspan="1">274</td>
<td align="left" rowspan="1" colspan="1">619</td>
<td align="left" rowspan="1" colspan="1">44%</td>
<td align="left" rowspan="1" colspan="1">301</td>
<td align="left" rowspan="1" colspan="1">1328</td>
<td align="left" rowspan="1" colspan="1">23%</td>
<td align="left" rowspan="1" colspan="1">373</td>
<td align="left" rowspan="1" colspan="1">1082</td>
<td align="left" rowspan="1" colspan="1">34%</td>
<td align="left" rowspan="1" colspan="1">272</td>
<td align="left" rowspan="1" colspan="1">642</td>
<td align="left" rowspan="1" colspan="1">42%</td>
</tr>
</tbody>
</table>
</alternatives><table-wrap-foot><fn id="nt103"><label/><p>The number of significantly self-consistent and all the selected genes by a given method during the 30 bootstrap iterations. <italic>ns</italic> – the number of significantly self-consistent genes found, <italic>tot</italic> – the number of different features selected over the 30 bootstrap iterations, mnsf – the mean number of selected features. The highest values are highlighted in bold.</p></fn></table-wrap-foot></table-wrap>
<p>In <xref ref-type="fig" rid="pone-0107801-g003">Figure 3</xref>, we evaluated the stability and accuracy metrics together rather than individually. This figure makes it easy to identify the methods providing the best trade-offs with respect to the two selected metrics. The methods providing the best trade-offs can be found in the top right corner; here there are all the methods which are strictly better on at least one metric. For instance, in the case of the Lung Cancer dataset, we can observe that Boruta provides high accuracy (99%) and very low stability (22%), whereas FPRF.4 provides high accuracy (98%) and better stability (50%).</p>
<fig id="pone-0107801-g003" position="float"><object-id pub-id-type="doi">10.1371/journal.pone.0107801.g003</object-id><label>Figure 3</label><caption>
<title>Accuracy <italic>vs</italic> Stability.</title>
<p>The accuracy and the stability are evaluated together rather than individually for each data set and each analysis method. The x-axis shows the percentage stability while the y-axis shows the percentage accuracy (G-mean). Moreover, the size of the circles denotes the mean number of selected features over the 30 runs. Each subplot makes it possible to analyze and identify the methods having the best trade-off between accuracy and stability for a specific dataset. (<bold>.a</bold>) trade-offs in Leukemia; (<bold>.b</bold>) trade-offs in Lung Cancer; (<bold>.c</bold>) trade-offs in Psoriasis; (<bold>.d</bold>) trade-offs in PBMC.</p>
</caption><graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pone.0107801.g003" position="float" xlink:type="simple"/></fig></sec><sec id="s3c">
<title>Execution time</title>
<p>The average running time of the methods benchmarked in this study is reported in <xref ref-type="table" rid="pone-0107801-t004">Table 4</xref>. The fastest method is FPRF, when considering the sum of the execution time for both the feature selection and the prioritization steps. Instead, the slowest method is Boruta, especially for the PBMC data set, which consists of many samples. The varSelRF algorithm required less execution time than Boruta, but it is systematically slower than FPRF.</p>
<table-wrap id="pone-0107801-t004" position="float"><object-id pub-id-type="doi">10.1371/journal.pone.0107801.t004</object-id><label>Table 4</label><caption>
<title>Running time.</title>
</caption><alternatives><graphic id="pone-0107801-t004-4" position="float" mimetype="image" xlink:href="info:doi/10.1371/journal.pone.0107801.t004" xlink:type="simple"/>
<table><colgroup span="1"><col align="left" span="1"/><col align="center" span="1"/><col align="center" span="1"/><col align="center" span="1"/><col align="center" span="1"/></colgroup>
<thead>
<tr>
<td align="left" rowspan="1" colspan="1">Method</td>
<td align="left" rowspan="1" colspan="1">Leukemia</td>
<td align="left" rowspan="1" colspan="1">Lung Cancer</td>
<td align="left" rowspan="1" colspan="1">Psoriasis</td>
<td align="left" rowspan="1" colspan="1">PBMC</td>
</tr>
</thead>
<tbody>
<tr>
<td align="left" rowspan="1" colspan="1">varSelRF</td>
<td align="left" rowspan="1" colspan="1">5′</td>
<td align="left" rowspan="1" colspan="1">6′</td>
<td align="left" rowspan="1" colspan="1">9′</td>
<td align="left" rowspan="1" colspan="1">18′</td>
</tr>
<tr>
<td align="left" rowspan="1" colspan="1">Boruta</td>
<td align="left" rowspan="1" colspan="1">19′</td>
<td align="left" rowspan="1" colspan="1">20′</td>
<td align="left" rowspan="1" colspan="1">31′</td>
<td align="left" rowspan="1" colspan="1">120′</td>
</tr>
<tr>
<td align="left" rowspan="1" colspan="1">FPRF</td>
<td align="left" rowspan="1" colspan="1"><bold>1</bold>′</td>
<td align="left" rowspan="1" colspan="1"><bold>2</bold>′</td>
<td align="left" rowspan="1" colspan="1"><bold>3</bold>′</td>
<td align="left" rowspan="1" colspan="1"><bold>6</bold>′</td>
</tr>
<tr>
<td align="left" rowspan="1" colspan="1"># features</td>
<td align="left" rowspan="1" colspan="1">4199</td>
<td align="left" rowspan="1" colspan="1">9480</td>
<td align="left" rowspan="1" colspan="1">9480</td>
<td align="left" rowspan="1" colspan="1">3700</td>
</tr>
<tr>
<td align="left" rowspan="1" colspan="1"># samples</td>
<td align="left" rowspan="1" colspan="1">276</td>
<td align="left" rowspan="1" colspan="1">150</td>
<td align="left" rowspan="1" colspan="1">262</td>
<td align="left" rowspan="1" colspan="1">978</td>
</tr>
</tbody>
</table>
</alternatives><table-wrap-foot><fn id="nt104"><label/><p>Evaluation of the running time represented as the mean over 30 bootstrap iterations. All methods investigated in this study were run single-threaded. For the proposed method the running time is compiled considering the sum of the execution times spent for the feature selection and prioritization steps.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3d">
<title>Biological significance of the selected feature lists</title>
<p>Next, we evaluated the ability of FPRF to select biologically relevant features. The lists of genes selected from each dataset are given as <xref ref-type="supplementary-material" rid="pone.0107801.s005">Data S1</xref>. Leukemia is a hematological neoplasm, in which the bone marrow generates abnormal, rapidly growing white blood cells. As expected, genes involved in acute myeloid leukemia, along with genes related to other cancers, such as breast and ovarian cancers, were among the top ranked genes. Moreover, genes related to T- and B-cell activation and differentiation, as well as leukocyte activation, were also retrieved. Interestingly, FPRF selected several genes coding for proteins located in the cell membrane, suggesting that this approach might provide a valuable additional tool in leukemia differential diagnosis, which is currently dependent on flow cytometric analysis of patterns and intensity of antigen expression on the cell membrane to reach a definitive diagnosis <xref ref-type="bibr" rid="pone.0107801-Jennings1">[26]</xref>. The genes selected in the lung cancer data set are involved in epithelial differentiation. In addition, a number of known genes currently used for lung cancer subtyping were retrieved in the top positions of the ranked output list, such as KRT5 and TP63 <xref ref-type="bibr" rid="pone.0107801-Mukhopadhyay1">[27]</xref>. Psoriasis is a chronic inflammatory skin disease characterized by highly inflamed and sharply demarcated scaly skin lesions or plaques, which histologically show marked epidermal hyperplasia, prominent inflammatory infiltrate and increased vascularization. T-cells play a key role, and it is becoming increasingly apparent that a pathogenic crosstalk between innate and adaptive cells underlies the dysregulated immune response that leads to abnormal epidermal proliferation <xref ref-type="bibr" rid="pone.0107801-DiMeglio1">[28]</xref>. 
As expected, the top ranked genes in the psoriasis data include members of relevant immune signaling pathways such as the IL-1 family of cytokines (IL-36G), the IFN-gamma (OASL, STAT1, CXCL1) and IL-17 (CCL20, CXCL8) signaling pathways. Furthermore, antimicrobial peptides (AMPs, S100A7A, S100A12) that bridge the immune system and the epidermal component, and kallikrein-related peptidases (KLK9) that induce AMPs were retrieved. FPRF also selected epidermal differentiation markers (members of the small proline-rich protein family (SPRR2) and the late cornified envelope family (LCE3D)), and genes associated with keratin regulation (KRT77), similar to previous studies <xref ref-type="bibr" rid="pone.0107801-SurezFarias1">[29]</xref>, <xref ref-type="bibr" rid="pone.0107801-Quaranta1">[30]</xref>. Peripheral blood mononuclear cells (PBMC) represent a heterogeneous set of circulating immune cells, mainly including lymphocytes (T-, B- and NK-cells), monocytes and macrophages. The selected genes in this data set indeed highlighted relevant pathways including immune response, T-cell activation and differentiation, and host-virus interactions.</p>
</sec></sec><sec id="s4">
<title>Conclusions</title>
<p>We have developed a new method, FPRF, for fast feature selection and prioritization that ensures the identification of relevant and stable sets of features from high-throughput transcriptomics data. By evaluating FPRF on different multi-class microarray data sets, we show that it is able to reach a high classification power, while gaining stability over other popular algorithms.</p>
</sec><sec id="s5">
<title>Supporting Information</title>
<supplementary-material id="pone.0107801.s001" mimetype="image/tiff" xlink:href="info:doi/10.1371/journal.pone.0107801.s001" position="float" xlink:type="simple"><label>Figure S1</label><caption>
<p><bold>The fuzzy pattern discovery method implemented in the R package </bold><bold><italic>DFP</italic></bold><bold> <xref ref-type="bibr" rid="pone.0107801-GlezPea1">[11]</xref></bold><bold> is described in detail.</bold></p>
<p>(TIF)</p>
</caption></supplementary-material><supplementary-material id="pone.0107801.s002" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" xlink:href="info:doi/10.1371/journal.pone.0107801.s002" position="float" xlink:type="simple"><label>Table S1</label><caption>
<p><bold>Supplementary table reporting the values of the </bold><bold><italic>zeta</italic></bold><bold> and </bold><bold><italic>piVal</italic></bold><bold> parameters used for each analysed dataset.</bold></p>
<p>(XLSX)</p>
</caption></supplementary-material><supplementary-material id="pone.0107801.s003" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" xlink:href="info:doi/10.1371/journal.pone.0107801.s003" position="float" xlink:type="simple"><label>Table S2</label><caption>
<p><bold>Supplementary Table summarizing the publicly available GEO series integrated to form the PBMC dataset.</bold></p>
<p>(XLSX)</p>
</caption></supplementary-material><supplementary-material id="pone.0107801.s004" mimetype="application/zip" xlink:href="info:doi/10.1371/journal.pone.0107801.s004" position="float" xlink:type="simple"><label>Code S1</label><caption>
<p><bold>Supplementary methods and the implementation/evaluation in R of the FPRF method.</bold></p>
<p>(ZIP)</p>
</caption></supplementary-material><supplementary-material id="pone.0107801.s005" mimetype="application/vnd.ms-excel" xlink:href="info:doi/10.1371/journal.pone.0107801.s005" position="float" xlink:type="simple"><label>Data S1</label><caption>
<p><bold>Supplementary Data containing the list of genes selected for each dataset and their ranks.</bold></p>
<p>(XLS)</p>
</caption></supplementary-material></sec></body>
<back>
<ack>
<p>The authors wish to thank Dr. Kai Puolamäki, Jussi Korpela and Andreas Henelius for fruitful discussions and critical reading of the manuscript. This work has been supported by the European Commission, under grant agreement FP7-309329 (NANOSOLUTIONS). The funders had no role in study design, data collection and analysis, decision to publish, or preparation of the manuscript.</p>
</ack>
<ref-list>
<title>References</title>
<ref id="pone.0107801-Weinstein1"><label>1</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Weinstein</surname><given-names>JN</given-names></name>, <name name-style="western"><surname>Collisson</surname><given-names>EA</given-names></name>, <name name-style="western"><surname>Mills</surname><given-names>GB</given-names></name>, <name name-style="western"><surname>Shaw</surname><given-names>KRM</given-names></name>, <name name-style="western"><surname>Ozenberger</surname><given-names>BA</given-names></name>, <etal>et al</etal>. (<year>2013</year>) <article-title>The Cancer Genome Atlas Pan-Cancer analysis project</article-title>. <source>Nat Genet</source> <volume>45</volume>: <fpage>1113</fpage>–<lpage>1120</lpage> Available: <ext-link ext-link-type="uri" xlink:href="http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=3919969&amp;tool=pmcentrez&amp;rendertype=abstract" xlink:type="simple">http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=3919969&amp;tool=pmcentrez&amp;rendertype=abstract</ext-link>.</mixed-citation>
</ref>
<ref id="pone.0107801-Virtanen1"><label>2</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Virtanen</surname><given-names>C</given-names></name>, <name name-style="western"><surname>Woodgett</surname><given-names>J</given-names></name> (<year>2008</year>) <article-title>Clinical uses of microarrays in cancer research</article-title>. <source>Methods Mol Med</source> <volume>141</volume>: <fpage>87</fpage>–<lpage>113</lpage>.</mixed-citation>
</ref>
<ref id="pone.0107801-Tezak1"><label>3</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Tezak</surname><given-names>Z</given-names></name>, <name name-style="western"><surname>Ranamukhaarachchi</surname><given-names>D</given-names></name>, <name name-style="western"><surname>Russek-Cohen</surname><given-names>E</given-names></name>, <name name-style="western"><surname>Gutman</surname><given-names>SI</given-names></name> (<year>2006</year>) <article-title>FDA perspectives on potential microarray-based clinical diagnostics</article-title>. <source>Hum Genomics</source> <volume>2</volume>: <fpage>236</fpage>–<lpage>243</lpage> <comment>doi:<ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.1186/1479-7364-2-4-236" xlink:type="simple">10.1186/1479-7364-2-4-236</ext-link></comment></mixed-citation>
</ref>
<ref id="pone.0107801-Saeys1"><label>4</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Saeys</surname><given-names>Y</given-names></name>, <name name-style="western"><surname>Inza</surname><given-names>I</given-names></name>, <name name-style="western"><surname>Larrañaga</surname><given-names>P</given-names></name> (<year>2007</year>) <article-title>A review of feature selection techniques in bioinformatics</article-title>. <source>Bioinformatics</source> <volume>23</volume>: <fpage>2507</fpage>–<lpage>2517</lpage>.</mixed-citation>
</ref>
<ref id="pone.0107801-He1"><label>5</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>He</surname><given-names>Z</given-names></name>, <name name-style="western"><surname>Yu</surname><given-names>W</given-names></name> (<year>2010</year>) <article-title>Stable feature selection for biomarker discovery</article-title>. <source>Comput Biol Chem</source> <volume>34</volume>: <fpage>215</fpage>–<lpage>225</lpage>.</mixed-citation>
</ref>
<ref id="pone.0107801-Abeel1"><label>6</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Abeel</surname><given-names>T</given-names></name>, <name name-style="western"><surname>Helleputte</surname><given-names>T</given-names></name>, <name name-style="western"><surname>Van de Peer</surname><given-names>Y</given-names></name>, <name name-style="western"><surname>Dupont</surname><given-names>P</given-names></name>, <name name-style="western"><surname>Saeys</surname><given-names>Y</given-names></name> (<year>2010</year>) <article-title>Robust biomarker identification for cancer diagnosis with ensemble feature selection methods</article-title>. <source>Bioinformatics</source> <volume>26</volume>: <fpage>392</fpage>–<lpage>398</lpage>.</mixed-citation>
</ref>
<ref id="pone.0107801-Guyon1"><label>7</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Guyon</surname><given-names>I</given-names></name>, <name name-style="western"><surname>Weston</surname><given-names>J</given-names></name>, <name name-style="western"><surname>Barnhill</surname><given-names>S</given-names></name>, <name name-style="western"><surname>Vapnik</surname><given-names>V</given-names></name> (<year>2002</year>) <article-title>Gene selection for cancer classification using Support Vector Machines</article-title>. <source>Mach Learn</source> <volume>46</volume>: <fpage>389</fpage>–<lpage>422</lpage> Available: <ext-link ext-link-type="uri" xlink:href="http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=3740816&amp;tool=pmcentrez&amp;rendertype=abstract" xlink:type="simple">http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=3740816&amp;tool=pmcentrez&amp;rendertype=abstract</ext-link>.</mixed-citation>
</ref>
<ref id="pone.0107801-DazUriarte1"><label>8</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Díaz-Uriarte</surname><given-names>R</given-names></name>, <name name-style="western"><surname>Alvarez de Andrés</surname><given-names>S</given-names></name> (<year>2006</year>) <article-title>Gene selection and classification of microarray data using random forest</article-title>. <source>BMC Bioinformatics</source> <volume>7</volume>: <fpage>3</fpage>.</mixed-citation>
</ref>
<ref id="pone.0107801-Kursa1"><label>9</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Kursa</surname><given-names>MB</given-names></name>, <name name-style="western"><surname>Rudnicki</surname><given-names>WR</given-names></name> (<year>2010</year>) <article-title>Feature Selection with the Boruta Package</article-title>. <source>J Stat Softw</source> <volume>36</volume>: <fpage>1</fpage>–<lpage>13</lpage> Available: <ext-link ext-link-type="uri" xlink:href="http://www.jstatsoft.org/v36/i11/paper" xlink:type="simple">http://www.jstatsoft.org/v36/i11/paper</ext-link>.</mixed-citation>
</ref>
<ref id="pone.0107801-Kursa2"><label>10</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Kursa</surname><given-names>MB</given-names></name> (<year>2014</year>) <article-title>Robustness of Random Forest-based gene selection methods</article-title>. <source>BMC Bioinformatics</source> <volume>15</volume>: <fpage>8</fpage> Available: <ext-link ext-link-type="uri" xlink:href="http://www.ncbi.nlm.nih.gov/pubmed/24410865" xlink:type="simple">http://www.ncbi.nlm.nih.gov/pubmed/24410865</ext-link>.</mixed-citation>
</ref>
<ref id="pone.0107801-GlezPea1"><label>11</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Glez-Peña</surname><given-names>D</given-names></name>, <name name-style="western"><surname>Alvarez</surname><given-names>R</given-names></name>, <name name-style="western"><surname>Díaz</surname><given-names>F</given-names></name>, <name name-style="western"><surname>Fdez-Riverola</surname><given-names>F</given-names></name> (<year>2009</year>) <article-title>DFP: a Bioconductor package for fuzzy profile identification and gene reduction of microarray data</article-title>. <source>BMC Bioinformatics</source> <volume>10</volume>: <fpage>37</fpage>.</mixed-citation>
</ref>
<ref id="pone.0107801-R1"><label>12</label>
<mixed-citation publication-type="other" xlink:type="simple">R Development Core Team (2012) R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. ISBN 3-900051-07-0, URL <ext-link ext-link-type="uri" xlink:href="http://www.R-project.org/" xlink:type="simple">http://www.R-project.org/</ext-link>. R Found Stat Comput Vienna, Austria.</mixed-citation>
</ref>
<ref id="pone.0107801-Breiman1"><label>13</label>
<mixed-citation publication-type="other" xlink:type="simple">Breiman L, Friedman JH, Olshen RA, Stone CJ (1984) Classification and Regression Trees.</mixed-citation>
</ref>
<ref id="pone.0107801-Hothorn1"><label>14</label>
<mixed-citation publication-type="other" xlink:type="simple">Hothorn T, Hornik K, Zeileis A (2006) party: A Laboratory for Recursive Part(y)itioning. R package version 0.9-0, URL <ext-link ext-link-type="uri" xlink:href="http://CRAN.R-project.org" xlink:type="simple">http://CRAN.R-project.org</ext-link>.</mixed-citation>
</ref>
<ref id="pone.0107801-Strobl1"><label>15</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Strobl</surname><given-names>C</given-names></name>, <name name-style="western"><surname>Zeileis</surname><given-names>A</given-names></name> (<year>2008</year>) <article-title>Danger: high power! - Exploring the statistical properties of a test for random forest variable importance</article-title>. <source>Univ Munich, Dep Stat Tech Rep</source> <volume>017</volume>: <fpage>1</fpage>–<lpage>8</lpage> Available: <ext-link ext-link-type="uri" xlink:href="http://epub.ub.uni-muenchen.de/2111/1/techreport.pdf" xlink:type="simple">http://epub.ub.uni-muenchen.de/2111/1/techreport.pdf</ext-link>.</mixed-citation>
</ref>
<ref id="pone.0107801-Breiman2"><label>16</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Breiman</surname><given-names>L</given-names></name> (<year>2001</year>) <article-title>Random forests</article-title>. <source>Mach Learn</source> <volume>45</volume>: <fpage>5</fpage>–<lpage>32</lpage> <comment>doi:<ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.1023/A:1010933404324" xlink:type="simple">10.1023/A:1010933404324</ext-link></comment></mixed-citation>
</ref>
<ref id="pone.0107801-Altmann1"><label>17</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Altmann</surname><given-names>A</given-names></name>, <name name-style="western"><surname>Toloşi</surname><given-names>L</given-names></name>, <name name-style="western"><surname>Sander</surname><given-names>O</given-names></name>, <name name-style="western"><surname>Lengauer</surname><given-names>T</given-names></name> (<year>2010</year>) <article-title>Permutation importance: a corrected feature importance measure</article-title>. <source>Bioinformatics</source> <volume>26</volume>: <fpage>1340</fpage>–<lpage>1347</lpage>.</mixed-citation>
</ref>
<ref id="pone.0107801-Strobl2"><label>18</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Strobl</surname><given-names>C</given-names></name>, <name name-style="western"><surname>Boulesteix</surname><given-names>A-L</given-names></name>, <name name-style="western"><surname>Zeileis</surname><given-names>A</given-names></name>, <name name-style="western"><surname>Hothorn</surname><given-names>T</given-names></name> (<year>2007</year>) <article-title>Bias in random forest variable importance measures: illustrations, sources and a solution</article-title>. <source>BMC Bioinformatics</source> <volume>8</volume>: <fpage>25</fpage>.</mixed-citation>
</ref>
<ref id="pone.0107801-Yeoh1"><label>19</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Yeoh</surname><given-names>E-J</given-names></name>, <name name-style="western"><surname>Ross</surname><given-names>ME</given-names></name>, <name name-style="western"><surname>Shurtleff</surname><given-names>SA</given-names></name>, <name name-style="western"><surname>Williams</surname><given-names>WK</given-names></name>, <name name-style="western"><surname>Patel</surname><given-names>D</given-names></name>, <etal>et al</etal>. (<year>2002</year>) <article-title>Classification, subtype discovery, and prediction of outcome in pediatric acute lymphoblastic leukemia by gene expression profiling</article-title>. <source>Cancer Cell</source> <volume>1</volume>: <fpage>133</fpage>–<lpage>143</lpage> <comment>doi:<ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.1016/S1535-6108(02)00032-6" xlink:type="simple">10.1016/S1535-6108(02)00032-6</ext-link></comment></mixed-citation>
</ref>
<ref id="pone.0107801-Tarca1"><label>20</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Tarca</surname><given-names>AL</given-names></name>, <name name-style="western"><surname>Lauria</surname><given-names>M</given-names></name>, <name name-style="western"><surname>Unger</surname><given-names>M</given-names></name>, <name name-style="western"><surname>Bilal</surname><given-names>E</given-names></name>, <name name-style="western"><surname>Boue</surname><given-names>S</given-names></name>, <etal>et al</etal>. (<year>2013</year>) <article-title>Strengths and limitations of microarray-based phenotype prediction: lessons learned from the IMPROVER Diagnostic Signature Challenge</article-title>. <source>Bioinformatics</source> <volume>29</volume>: <fpage>2892</fpage>–<lpage>2899</lpage> Available: <ext-link ext-link-type="uri" xlink:href="http://www.ncbi.nlm.nih.gov/pubmed/23966112" xlink:type="simple">http://www.ncbi.nlm.nih.gov/pubmed/23966112</ext-link>.</mixed-citation>
</ref>
<ref id="pone.0107801-Irizarry1"><label>21</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Irizarry</surname><given-names>RA</given-names></name>, <name name-style="western"><surname>Hobbs</surname><given-names>B</given-names></name>, <name name-style="western"><surname>Collin</surname><given-names>F</given-names></name>, <name name-style="western"><surname>Beazer-Barclay</surname><given-names>YD</given-names></name>, <name name-style="western"><surname>Antonellis</surname><given-names>KJ</given-names></name>, <etal>et al</etal>. (<year>2003</year>) <article-title>Exploration, normalization, and summaries of high density oligonucleotide array probe level data</article-title>. <source>Biostatistics</source> <volume>4</volume>: <fpage>249</fpage>–<lpage>264</lpage>.</mixed-citation>
</ref>
<ref id="pone.0107801-Johnson1"><label>22</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Johnson</surname><given-names>WE</given-names></name>, <name name-style="western"><surname>Li</surname><given-names>C</given-names></name>, <name name-style="western"><surname>Rabinovic</surname><given-names>A</given-names></name> (<year>2007</year>) <article-title>Adjusting batch effects in microarray expression data using empirical Bayes methods</article-title>. <source>Biostatistics</source> <volume>8</volume>: <fpage>118</fpage>–<lpage>127</lpage>.</mixed-citation>
</ref>
<ref id="pone.0107801-Leek1"><label>23</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Leek</surname><given-names>JT</given-names></name>, <name name-style="western"><surname>Johnson</surname><given-names>WE</given-names></name>, <name name-style="western"><surname>Parker</surname><given-names>HS</given-names></name>, <name name-style="western"><surname>Jaffe</surname><given-names>AE</given-names></name>, <name name-style="western"><surname>Storey</surname><given-names>JD</given-names></name> (<year>2012</year>) <article-title>The sva package for removing batch effects and other unwanted variation in high-throughput experiments</article-title>. <source>Bioinformatics</source> <volume>28</volume>: <fpage>882</fpage>–<lpage>883</lpage> Available: <ext-link ext-link-type="uri" xlink:href="http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=3307112&amp;tool=pmcentrez&amp;rendertype=abstract" xlink:type="simple">http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=3307112&amp;tool=pmcentrez&amp;rendertype=abstract</ext-link>.</mixed-citation>
</ref>
<ref id="pone.0107801-Leek2"><label>24</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Leek</surname><given-names>JT</given-names></name>, <name name-style="western"><surname>Storey</surname><given-names>JD</given-names></name> (<year>2007</year>) <article-title>Capturing heterogeneity in gene expression studies by surrogate variable analysis</article-title>. <source>PLoS Genet</source> <volume>3</volume>: <fpage>1724</fpage>–<lpage>1735</lpage>.</mixed-citation>
</ref>
<ref id="pone.0107801-Yu1"><label>25</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Yu</surname><given-names>H</given-names></name>, <name name-style="western"><surname>Hong</surname><given-names>S</given-names></name>, <name name-style="western"><surname>Yang</surname><given-names>X</given-names></name>, <name name-style="western"><surname>Ni</surname><given-names>J</given-names></name>, <name name-style="western"><surname>Dan</surname><given-names>Y</given-names></name>, <etal>et al</etal>. (<year>2013</year>) <article-title>Recognition of multiple imbalanced cancer types based on DNA microarray data using ensemble classifiers</article-title>. <source>Biomed Res Int</source> <volume>2013</volume>: <fpage>239628</fpage> Available: <ext-link ext-link-type="uri" xlink:href="http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=3770038&amp;tool=pmcentrez&amp;rendertype=abstract" xlink:type="simple">http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=3770038&amp;tool=pmcentrez&amp;rendertype=abstract</ext-link>.</mixed-citation>
</ref>
<ref id="pone.0107801-Jennings1"><label>26</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Jennings</surname><given-names>CD</given-names></name>, <name name-style="western"><surname>Foon</surname><given-names>KA</given-names></name> (<year>1997</year>) <article-title>Recent advances in flow cytometry: application to the diagnosis of hematologic malignancy</article-title>. <source>Blood</source> <volume>90</volume>: <fpage>2863</fpage>–<lpage>2892</lpage>.</mixed-citation>
</ref>
<ref id="pone.0107801-Mukhopadhyay1"><label>27</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Mukhopadhyay</surname><given-names>S</given-names></name>, <name name-style="western"><surname>Katzenstein</surname><given-names>A-LA</given-names></name> (<year>2011</year>) <article-title>Subclassification of non-small cell lung carcinomas lacking morphologic differentiation on biopsy specimens: Utility of an immunohistochemical panel containing TTF-1, napsin A, p63, and CK5/6</article-title>. <source>Am J Surg Pathol</source> <volume>35</volume>: <fpage>15</fpage>–<lpage>25</lpage> <comment>doi:<ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.1097/PAS.0b013e3182036d05" xlink:type="simple">10.1097/PAS.0b013e3182036d05</ext-link></comment></mixed-citation>
</ref>
<ref id="pone.0107801-DiMeglio1"><label>28</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Di Meglio</surname><given-names>P</given-names></name>, <name name-style="western"><surname>Perera</surname><given-names>GK</given-names></name>, <name name-style="western"><surname>Nestle</surname><given-names>FO</given-names></name> (<year>2011</year>) <article-title>The Multitasking Organ: Recent Insights into Skin Immune Function</article-title>. <source>Immunity</source> <volume>35</volume>: <fpage>857</fpage>–<lpage>869</lpage>.</mixed-citation>
</ref>
<ref id="pone.0107801-SurezFarias1"><label>29</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Suárez-Fariñas</surname><given-names>M</given-names></name>, <name name-style="western"><surname>Li</surname><given-names>K</given-names></name>, <name name-style="western"><surname>Fuentes-Duculan</surname><given-names>J</given-names></name>, <name name-style="western"><surname>Hayden</surname><given-names>K</given-names></name>, <name name-style="western"><surname>Brodmerkel</surname><given-names>C</given-names></name>, <etal>et al</etal>. (<year>2012</year>) <article-title>Expanding the Psoriasis Disease Profile: Interrogation of the Skin and Serum of Patients with Moderate-to-Severe Psoriasis</article-title>. <source>J Invest Dermatol</source> <volume>132</volume>: <fpage>2552</fpage>–<lpage>2564</lpage> <comment>doi:<ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.1038/jid.2012.184" xlink:type="simple">10.1038/jid.2012.184</ext-link></comment></mixed-citation>
</ref>
<ref id="pone.0107801-Quaranta1"><label>30</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Quaranta</surname><given-names>M</given-names></name>, <name name-style="western"><surname>Knapp</surname><given-names>B</given-names></name>, <name name-style="western"><surname>Garzorz</surname><given-names>N</given-names></name>, <name name-style="western"><surname>Mattii</surname><given-names>M</given-names></name>, <name name-style="western"><surname>Pullabhatla</surname><given-names>V</given-names></name>, <etal>et al</etal>. (<year>2014</year>) <article-title>Intraindividual genome expression analysis reveals a specific molecular signature of psoriasis and eczema</article-title>. <source>Sci Transl Med</source> <volume>6</volume>: <fpage>244ra90</fpage>–<lpage>244ra90</lpage> Available: <ext-link ext-link-type="uri" xlink:href="http://stm.sciencemag.org/cgi/doi/10.1126/scitranslmed.3008946" xlink:type="simple">http://stm.sciencemag.org/cgi/doi/10.1126/scitranslmed.3008946</ext-link>.</mixed-citation>
</ref>
</ref-list></back>
</article>