<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article
  PUBLIC "-//NLM//DTD Journal Publishing DTD v3.0 20080202//EN" "http://dtd.nlm.nih.gov/publishing/3.0/journalpublishing3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="3.0" xml:lang="en">
<front>
   <journal-meta>
      <journal-id journal-id-type="nlm-ta">PLoS ONE</journal-id>
      <journal-id journal-id-type="publisher-id">plos</journal-id>
      <journal-id journal-id-type="pmc">plosone</journal-id>
      <journal-title-group>
         <journal-title>PLoS ONE</journal-title>
      </journal-title-group>
      <issn pub-type="epub">1932-6203</issn>
      <publisher>
         <publisher-name>Public Library of Science</publisher-name>
         <publisher-loc>San Francisco, USA</publisher-loc>
      </publisher>
   </journal-meta>
   <article-meta>
      <article-id pub-id-type="publisher-id">PONE-D-12-34087</article-id>
      <article-id pub-id-type="doi">10.1371/journal.pone.0061318</article-id>
      <article-categories>
         <subj-group subj-group-type="heading">
            <subject>Research Article</subject>
         </subj-group>
<subj-group subj-group-type="Discipline-v2"><subject>Biology</subject>
<subj-group>
<subject>Biochemistry</subject>
<subj-group>
<subject>Drug discovery</subject>
<subject>Small molecules</subject>
</subj-group>
</subj-group>
<subj-group>
<subject>Biotechnology</subject>
<subj-group>
<subject>Drug discovery</subject>
</subj-group>
</subj-group>
<subj-group>
<subject>Computational biology</subject>
<subj-group>
<subject>Genomics</subject>
</subj-group>
</subj-group>
<subj-group>
<subject>Genetics</subject>
<subj-group>
<subject>Human genetics</subject>
<subj-group>
<subject>Personalized medicine</subject>
</subj-group>
</subj-group>
</subj-group>
<subj-group>
<subject>Systems biology</subject>
</subj-group>
</subj-group>
<subj-group subj-group-type="Discipline-v2"><subject>Computer science</subject>
<subj-group>
<subject>Computer modeling</subject>
</subj-group>
</subj-group>
<subj-group subj-group-type="Discipline-v2"><subject>Medicine</subject>
<subj-group>
<subject>Clinical genetics</subject>
<subj-group>
<subject>Personalized medicine</subject>
</subj-group>
</subj-group>
<subj-group>
<subject>Oncology</subject>
<subj-group>
<subject>Basic cancer research</subject>
</subj-group>
</subj-group>
</subj-group>
      </article-categories>
      <title-group>
         <article-title>Machine Learning Prediction of Cancer Cell Sensitivity to Drugs Based on Genomic and Chemical Properties</article-title>
         <alt-title alt-title-type="running-head">Predicting Drug Effect from Genomics and Chemistry</alt-title>
      </title-group>
      <contrib-group>
         <contrib contrib-type="author" xlink:type="simple">
            <name name-style="western"><surname>Menden</surname>
<given-names>Michael P.</given-names>
            </name><xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
         </contrib>
         <contrib contrib-type="author" xlink:type="simple">
            <name name-style="western"><surname>Iorio</surname>
<given-names>Francesco</given-names>
            </name><xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
         </contrib>
         <contrib contrib-type="author" xlink:type="simple">
            <name name-style="western"><surname>Garnett</surname>
<given-names>Mathew</given-names>
            </name><xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
         </contrib>
         <contrib contrib-type="author" xlink:type="simple">
            <name name-style="western"><surname>McDermott</surname>
<given-names>Ultan</given-names>
            </name><xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
         </contrib>
         <contrib contrib-type="author" xlink:type="simple">
            <name name-style="western"><surname>Benes</surname>
<given-names>Cyril H.</given-names>
            </name><xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
         </contrib>
         <contrib contrib-type="author" xlink:type="simple">
            <name name-style="western"><surname>Ballester</surname><given-names>Pedro J.</given-names></name><xref ref-type="aff" rid="aff1"><sup>1</sup></xref><xref ref-type="corresp" rid="cor1"><sup>*</sup></xref>
         </contrib>
         <contrib contrib-type="author" xlink:type="simple">
            <name name-style="western"><surname>Saez-Rodriguez</surname><given-names>Julio</given-names></name><xref ref-type="aff" rid="aff1"><sup>1</sup></xref><xref ref-type="corresp" rid="cor1"><sup>*</sup></xref>
         </contrib>
      </contrib-group>
      <aff id="aff1"><label>1</label><addr-line>European Bioinformatics Institute, Wellcome Trust Genome Campus–Cambridge, Cambridge, United Kingdom</addr-line></aff>
      <aff id="aff2"><label>2</label><addr-line>Cancer Genome Project, Wellcome Trust Sanger Institute, Wellcome Trust Genome Campus–Cambridge, Cambridge, United Kingdom</addr-line></aff>
      <aff id="aff3"><label>3</label><addr-line>Center for Molecular Therapeutics, Massachusetts General Hospital Cancer Center and Harvard Medical School, Charlestown, Massachusetts, United States of America</addr-line></aff>
      <contrib-group>
         <contrib contrib-type="editor" xlink:type="simple">
            <name name-style="western"><surname>Raghava</surname>
<given-names>Gajendra P. S.</given-names>
            </name><role>Editor</role>
<xref ref-type="aff" rid="edit1"/>
         </contrib>
      </contrib-group>
      <aff id="edit1"><addr-line>CSIR-Institute of Microbial Technology, India</addr-line></aff>
      <author-notes>
         <corresp id="cor1">* E-mail: <email xlink:type="simple">pedro.ballester@ebi.ac.uk</email> (PJB); <email xlink:type="simple">saezrodriguez@ebi.ac.uk</email> (JS-R)</corresp>
         <fn fn-type="conflict">
            <p>The authors have declared that no competing interests exist.</p>
         </fn>
         <fn fn-type="con">
            <p>Designed the software and implementation of different approaches: MPM FI PJB. Conceived and designed the experiments: MPM PJB JSR. Performed the experiments: MPM FI PJB. Analyzed the data: MPM FI MG UM CHB PJB JSR. Contributed reagents/materials/analysis tools: MPM FI MG UM CHB PJB. Wrote the paper: MPM CHB PJB JSR.</p>
         </fn>
      </author-notes>
      <pub-date pub-type="collection">
         <year>2013</year>
      </pub-date>
      <pub-date pub-type="epub">
         <day>30</day>
         <month>4</month>
         <year>2013</year>
      </pub-date>
      <volume>8</volume>
      <issue>4</issue>
      <elocation-id>e61318</elocation-id>
      <history>
         <date date-type="received">
            <day>26</day>
            <month>10</month>
            <year>2012</year>
         </date>
         <date date-type="accepted">
            <day>7</day>
            <month>3</month>
            <year>2013</year>
         </date>
      </history>
      <permissions>
         <copyright-year>2013</copyright-year>
         <copyright-holder>Menden et al</copyright-holder><license xlink:type="simple"><license-p>This is an open-access article distributed under the terms of the Creative Commons Attribution License, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.</license-p></license></permissions>
      <abstract>
         <p>Predicting the response of a specific cancer to a therapy is a major goal in modern oncology that should ultimately lead to a personalised treatment. High-throughput screenings of potentially active compounds against a panel of genomically heterogeneous cancer cell lines have unveiled multiple relationships between genomic alterations and drug responses. Various computational approaches have been proposed to predict sensitivity based on genomic features, while others have used the chemical properties of the drugs to ascertain their effect. In an effort to integrate these complementary approaches, we developed machine learning models to predict the response of cancer cell lines to drug treatment, quantified through IC<sub>50</sub> values, based on both the genomic features of the cell lines and the chemical properties of the considered drugs. Models predicted IC<sub>50</sub> values in an 8-fold cross-validation and an independent <italic>blind</italic> test with coefficient of determination R<sup>2</sup> of 0.72 and 0.64, respectively. Furthermore, models were able to predict with comparable accuracy (R<sup>2</sup> of 0.61) IC<sub>50</sub>s of cell lines from a tissue not used in the training stage. Our <italic>in silico</italic> models can be used to optimise the experimental design of drug-cell screenings by estimating a large proportion of missing IC<sub>50</sub> values rather than experimentally measuring them. The implications of our results go beyond <italic>virtual</italic> drug screening design: potentially thousands of drugs could be probed <italic>in silico</italic> to systematically test their potential efficacy as anti-tumour agents based on their structure, thus providing a computational framework to identify new drug repositioning opportunities as well as ultimately be useful for personalized medicine by linking the genomic traits of patients to drug sensitivity.</p>
               </abstract>
      <funding-group>
         <funding-statement>This work was funded by the European Molecular Biology Laboratory PhD programme to MPM, the Sanger/European Bioinformatics Institute ESPOD programme to FI, a Medical Research Council Methodology Research Fellowship to PJB, a Wellcome Trust grant to MG and CHB, and Cancer Research UK to UM. The funders had no role in study design, data collection and analysis, decision to publish, or preparation of the manuscript.</funding-statement>
      </funding-group>
<counts>
<page-count count="7"/>
</counts>
</article-meta>
</front>
<body>
   <sec id="s1">
      <title>Introduction</title>
      <p>High-throughput screening of a large number of molecules is a widely used approach to identify lead compounds exerting a beneficial effect on a given phenotype. In the context of cancer, libraries of chemical entities have been tested in this way against panels of cell lines grown in different conditions and with heterogeneous genomic backgrounds <xref ref-type="bibr" rid="pone.0061318-Sharma1">[1]</xref>. Following the pioneering work of the “NCI-60”, a collection of 59 human cancer cell lines developed by the National Cancer Institute for <italic>in vitro</italic> drug screening <xref ref-type="bibr" rid="pone.0061318-Grever1">[2]</xref>, recent hallmark studies have shown that screening very large cell line collections can recapitulate known and identify novel molecular genomic determinants of drug sensitivity <xref ref-type="bibr" rid="pone.0061318-Sharma1">[1]</xref>, <xref ref-type="bibr" rid="pone.0061318-Garnett1">[3]</xref>–<xref ref-type="bibr" rid="pone.0061318-Heiser1">[5]</xref>.</p>
      <p>In these studies, using systematic statistical inference and regression methods, determinants such as oncogenic lesions, high or low levels of basal gene expression and other genotypic traits have been associated with profiles of increased sensitivity/resistance to specific compounds. For instance, by applying a multivariate analysis of variance <xref ref-type="bibr" rid="pone.0061318-Stevens1">[6]</xref> and the ‘Elastic Net’ regression framework <xref ref-type="bibr" rid="pone.0061318-HuiZou1">[7]</xref>, established drug-genotype associations have been confirmed and complemented with markers of tissue-specificity and novel connections, e.g. the <italic>EWS-FLI1</italic> translocation in Ewing’s sarcoma and sensitivity to <italic>PARP</italic> inhibitors, have been identified and further experimentally validated. Results of these studies have been made publicly available, providing unique resources that support the discovery of new predictive biomarkers for personalised cancer therapy.</p>
      <p>Increasing further the size of the considered cell-line/compound panels would be very beneficial, as it provides the basis to improve the accuracy and predictive power of the inferred associations. However, this requires larger infrastructures and the cost grows with the screening size. In addition, due to various technical and logistical reasons in a high-throughput screen <xref ref-type="bibr" rid="pone.0061318-HuiZou1">[7]</xref>, the resulting compound-by-cell line matrix of drug efficacy (typically summarised in their IC<sub>50</sub>, the half maximal (50%) inhibitory concentration of a substance with respect to cell viability) is often not complete. Although many steps are automated, filling experimentally each gap could be expensive and laborious <xref ref-type="bibr" rid="pone.0061318-Stevens1">[6]</xref>. Hence, an accurate tool to impute missing IC<sub>50</sub>s and estimate them for novel cell lines would be of great value for drug screening design.</p>
      <p>Furthermore, a robust prediction tool for <italic>in silico</italic> identification of potentially effective drugs for treating a specific cancer could be used for drug repositioning <xref ref-type="bibr" rid="pone.0061318-Ashburn1">[8]</xref>, <xref ref-type="bibr" rid="pone.0061318-Sanseau1">[9]</xref>. An approach of this kind is represented by the COMPARE algorithm <xref ref-type="bibr" rid="pone.0061318-Paull1">[10]</xref>, <xref ref-type="bibr" rid="pone.0061318-Zaharevitz1">[11]</xref> that uses drug response profiles of the NCI-60 screening, through a ‘guilt-by-association’ paradigm. Following this principle, drugs eliciting a similar drug-response profile across the cell lines in the NCI-60 panel are hypothesized to share a common mode of action (MoA), thus enabling MoA discovery for novel drugs (if their tumour-suppression profile is similar to that of a known and well characterized drug) as well as the discovery of novel or secondary effects for established drugs.</p>
      <p>Ultimately, <italic>in silico</italic> methods to accurately predict the effectiveness of drugs based on the molecular making of tumours (i.e. genome, transcriptome) would be a major milestone towards personalized therapies for cancer patients based on molecular biomarkers <xref ref-type="bibr" rid="pone.0061318-Kelloff1">[12]</xref>.</p>
   </sec>
   <sec id="s2">
      <title>Results</title>
      <p>We therefore investigated whether it is possible to build machine learning models (for details see “Materials and Methods” section, “Machine learning” subsection) that can predict drug sensitivity using cell line screening experimental data, where cell lines are treated with variable concentrations of a given drug and the resulting dose-response curve is summarized by an IC<sub>50</sub>. We focused on the most comprehensive cancer drug screening dataset available to date, from the “Genomics of Drug Sensitivity in Cancer” (GDSC) project <xref ref-type="bibr" rid="pone.0061318-Garnett1">[3]</xref>. For each drug, a neural network model was trained to predict its IC<sub>50</sub> profile across the panel of cell lines based on the genomic background of each cell, as characterised by microsatellite instability status (1 =  unstable or 0 =  stable), somatic coding variants in the coding sequence of 77 cancer genes (1 =  any change in protein sequence and 0 =  wild type) and copy number alterations denoting gene amplification and deletion of those cancer genes (1 =  amplification/more than 7 copy numbers, 0 =  wild type/between 1 and 7 copy numbers, and –1 =  deletion/no copy number). However, the predictive power of these initial models was limited, especially for those drugs without a well-known oncogene-to-drug response dependency.</p>
      <p>We reasoned that cancer cell sensitivity to drug molecules is driven by features from both cells and drugs. Whereas cell features are ultimately connected to the inner workings of the cell, drug features include physicochemical properties that are correlated with the ability of the molecule to cross the cell membrane (e.g. lipophilicity) or its selectivity to intracellular targets (e.g. fingerprints encoding the chemical structure).</p>
      <p>Indeed, extensive work has been done on Quantitative Structure-Activity Relationship (QSAR) approaches to predicting whole-cell activity of molecules based on their chemical properties <xref ref-type="bibr" rid="pone.0061318-Kubinyi1">[13]</xref>–<xref ref-type="bibr" rid="pone.0061318-Swamidass1">[16]</xref>, including applications to predicting anti-cancer activity in drugs <xref ref-type="bibr" rid="pone.0061318-Shi1">[17]</xref>, <xref ref-type="bibr" rid="pone.0061318-Shi2">[18]</xref>. However, such QSAR approaches exclusively based on chemical features cannot distinguish between resistant and sensitive cell lines. For instance, a model built without any information on the cell lines will not be capable of predicting cell line A to be more resistant than cell line B to drug C, which is the main aim of integrating chemical and genomic features in our models.</p>
      <p>We therefore extended our machine learning models to include as input chemical features from the drugs, besides the molecular characterization of the cell lines (see <xref ref-type="fig" rid="pone-0061318-g001">Fig 1</xref>). This integrative approach not only integrates two complementary streams of information, but also allows the model to be trained with much larger amounts of data, which is often a key factor to improve predictive performance (see <xref ref-type="fig" rid="pone-0061318-g002">Fig 2</xref>). Consequently, data was pre-processed to include 689 chemical descriptors of the drugs and 138 genomic features for differentiating the cell lines, resulting in an input space of 827 features.</p>
      <fig id="pone-0061318-g001" position="float"><object-id pub-id-type="doi">10.1371/journal.pone.0061318.g001</object-id><label>Figure 1</label>
         <caption>
            <title>IC<sub>50</sub> prediction workflow.</title>
            <p>Our method is based on two different input streams: (1) cell line features of 77 oncogenes and their mutation state, (2) drug features that are generated with PaDEL software <xref ref-type="bibr" rid="pone.0061318-Yap1">[19]</xref> from the simplified molecular-input line entry system (SMILES), see method section for details. The continuous IC<sub>50</sub> value is predicted with state-of-the-art machine learning algorithms (neural networks and random forests).</p>
         </caption><graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pone.0061318.g001" position="float" xlink:type="simple"/></fig>
      <fig id="pone-0061318-g002" position="float"><object-id pub-id-type="doi">10.1371/journal.pone.0061318.g002</object-id><label>Figure 2</label>
         <caption>
            <title>Comparison of single-drug models and the multi-drug model.</title>
            <p>The performance of the multi-drug model (red asterisk) and the family of 111 single-drug models (blue histogram) is represented using three different metrics: (A) Pearson correlation R<sub>p</sub>, (B) coefficient of determination R<sup>2</sup>, and (C) root mean square error RMSE.</p>
         </caption><graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pone.0061318.g002" position="float" xlink:type="simple"/></fig>
      <p>Chemical descriptors were generated with PaDEL software <xref ref-type="bibr" rid="pone.0061318-Yap1">[19]</xref> from simplified molecular-input line entry system (SMILES) structures. Descriptors include physicochemical features such as weight, lipophilicity, rule of five, and additionally fingerprints of the drugs (for details see “Materials and Methods” section, “Features” subsection, and <ext-link ext-link-type="uri" xlink:href="http://padel.nus.edu.sg/software/padeldescriptor/" xlink:type="simple">http://padel.nus.edu.sg/software/padeldescriptor/</ext-link>).</p>
      <p>For building our model, we used GDSC screening data from 608 genomically characterised cell lines and 111 drugs for which chemical information was available (see <xref ref-type="fig" rid="pone-0061318-g002">Fig 2</xref> and Methods for details). The published version of this matrix holds 38,930 IC<sub>50</sub> values (∼58% of the total, due to technical and logistic reasons).</p>
      <p>We performed an 8-fold cross-validation, where the test set of each fold was not used for training so as to measure the predictive power of the resulting models across all drugs rather than for each drug separately. Neural networks were able to impute missing log(IC<sub>50</sub>) values on the test sets with an averaged Pearson correlation coefficient (R<sub>p</sub>), coefficient of determination (<italic>R<sup>2</sup></italic>) and root mean square error (RMSE) (<xref ref-type="supplementary-material" rid="pone.0061318.s005">Text S1</xref>) of 0.85, 0.72 and 0.83 across all 111 drugs, respectively (<xref ref-type="fig" rid="pone-0061318-g003">Fig 3A</xref>). Alternatively, random forests achieved comparable performances (R<sub>p</sub> of 0.85, <italic>R<sup>2</sup></italic> of 0.72 and RMSE of 0.84; full details in supplementary materials). Furthermore, we conducted a blind test using 13,565 new experimental IC<sub>50</sub> values only received after training our models in order to verify our cross-validation results (drug-to-cell line matrix updated by ∼18%, with these newly generated IC<sub>50</sub>s exclusively used as the blind test set). The results on the blind test were almost as good as in the cross-validation, obtaining an R<sub>p</sub> of 0.79, <italic>R<sup>2</sup></italic> of 0.64 and an RMSE of 0.97 (Fig S1, <xref ref-type="supplementary-material" rid="pone.0061318.s006">Text S2</xref>). The accuracy of the predictions encouraged us to train the networks with fewer IC<sub>50</sub> values. Remarkably, the predictive power of the models did not fall appreciably off in quality, even if the amount of training data was reduced to 20% of the total (<xref ref-type="fig" rid="pone-0061318-g003">Fig 3B</xref>).</p>
      <fig id="pone-0061318-g003" position="float"><object-id pub-id-type="doi">10.1371/journal.pone.0061318.g003</object-id><label>Figure 3</label>
         <caption>
            <title>IC<sub>50</sub> prediction.</title>
            <p>Predictions are achieved with 8-fold cross-validations. Performance values are exclusively calculated on the test sets. (A) Correlation between predicted and experimentally observed log(IC<sub>50</sub>) values (Pearson correlation R<sub>p</sub> = 0.85; coefficient of determination R<sup>2</sup> = 0.72, root mean square error RMSE = 0.83). Although there is an enrichment of resistant cell lines, which tend to have higher log(IC<sub>50</sub>) values than sensitive cell lines, the lower log(IC<sub>50</sub>) values are still decently predicted. (B) Expected improvement of the IC<sub>50</sub> prediction by experimentally filling gaps in the cell-to-drug matrix. The vertical grey line corresponds to the published data set (filled to ∼58%, due to logistic reasons), which corresponds to the results in panel (A). However, similar accuracies (R<sub>p</sub> of 0.84 instead of 0.85, R<sup>2</sup> of 0.70 instead of 0.72) can be achieved using exclusively 20% of the whole matrix.</p>
         </caption><graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pone.0061318.g003" position="float" xlink:type="simple"/></fig>
      <p>Using an analysis of variance (ANOVA) to identify drug-to-oncogene associations, we investigated how well the IC<sub>50</sub> values predicted for the test set using our model recapitulate associations manifested in the experimental data, for instance, whether a given mutation is causing sensitivity or resistance against a drug <xref ref-type="bibr" rid="pone.0061318-Garnett1">[3]</xref>. Using only predicted IC<sub>50</sub> values, we correctly captured 79% (168/213) of the significant observations with the same t-test tendency (positive or negative effect on drug sensitivity) identified with the experimental IC<sub>50</sub>s. When only considering significant associations from our model (p-value adjusted with Benjamini-Hochberg, FDR = 0.2), we correctly predicted 28% (59/213) of all experimentally identified associations. Where we failed to detect an association the ANOVA effect size is often small, or the experimental correlation is associated with a mutation either not or infrequently represented within the subset of cell lines with predicted IC<sub>50</sub> values. Notably, as an example of the utility of this approach, using only predicted IC<sub>50</sub> values we identified known drug-to-oncogene associations such as sensitivity of <italic>BRAF</italic>-mutated cell lines to <italic>MEK1/2</italic>-inhibitors (<xref ref-type="fig" rid="pone-0061318-g004">Fig 4B</xref>) <xref ref-type="bibr" rid="pone.0061318-Solit1">[20]</xref>. The range of predicted IC<sub>50</sub> values for a drug is typically narrower than that of the observed values, likely because currently available genomic datasets are insufficient to explain the observed range of drug responses across the cell lines.</p>
      <fig id="pone-0061318-g004" position="float"><object-id pub-id-type="doi">10.1371/journal.pone.0061318.g004</object-id><label>Figure 4</label>
         <caption>
            <title>Comparing ANOVA with prediction.</title>
            <p>(A) Analysis of variance (ANOVA) of experimental data and predicted output for drug-to-oncogene associations (20% FDR). The size of each association (dot) is proportional to the number of treated cell lines containing the particular mutated oncogene. Blue dots indicate the same t-test tendency in our predictions, and red ones the opposite. (B) Predicted and measured IC<sub>50</sub>s of <italic>BRAF</italic>-mutated vs. wild-type cell lines exposed to the <italic>MEK1/2</italic>-inhibitor PD-0325901 (p-value of prediction = 1.91×10<sup>−05</sup>, t-test multiple hypothesis corrected with Benjamini &amp; Hochberg).</p>
         </caption><graphic mimetype="image" xlink:href="info:doi/10.1371/journal.pone.0061318.g004" position="float" xlink:type="simple"/></fig>
      <p>In addition, we assessed the predictive power of our model for unknown cell lines. Therefore, we applied a more stringent 8-fold cross-validation, where a cell line was included in either the training or the test set. These models achieved an R<sub>p</sub> of 0.82, <italic>R<sup>2</sup></italic> of 0.68 and an RMSE of 0.89 (Fig S2), demonstrating the accuracy of our model to predict IC<sub>50</sub> values for completely new cell lines. In an additional simulation, we left out all cancer cell lines from a specific tissue, e.g. we removed all lung cancer cell lines (106 out of 608 cell lines) and still obtained an R<sub>p</sub> of 0.79, <italic>R<sup>2</sup></italic> of 0.61 and RMSE of 0.99 (Fig S3).</p>
   </sec>
   <sec id="s3">
      <title>Discussion</title>
      <p>Our results show that by using genomic features from the cell lines and chemical information from drugs, it is possible to build <italic>in silico</italic> multi-drug models to impute missing IC<sub>50</sub> values with non-parametric machine learning algorithms such as neural networks and random forests. As output for our method, we chose to explore IC<sub>50</sub> values as generated by Garnett et al. <xref ref-type="bibr" rid="pone.0061318-Garnett1">[3]</xref>, which enables us to compare our results to theirs; however, other metrics (such as a capped IC<sub>50</sub> or area under the curve) might provide additional insight and potentially lead to more robust models.</p>
      <p>The Pearson correlation (<xref ref-type="fig" rid="pone-0061318-g002">Fig. 2A</xref>) and coefficient of determination (<xref ref-type="fig" rid="pone-0061318-g002">Fig. 2B</xref>) of the multi-drug model are significantly better than the single-drug models, while the RMSE error is similar (<xref ref-type="fig" rid="pone-0061318-g002">Fig 2C</xref>). This means that the error (on average) of predicting a given IC50 value is the same in the multi-drug and single-drug models (RMSE) and, since some drugs are active at different concentration ranges, the model is able to cover a much larger dynamic range with a similar precision. The coefficient of determination balances these two terms, and thus a broader range with the same RMSE increases R<sup>2</sup>. Thanks to the use of chemical descriptors, multi-drug models are trained with a volume of data that is two orders of magnitude bigger than the data to train each single-drug model. This larger dataset weights the difficulty in training heterogeneous response values across drugs.</p>
      <p>In several instances, the use of multi-drug models permitted the <italic>in silico</italic> identification of genomic events associated with altered drug sensitivity, which is only possible when genomic properties are considered.</p>
      <p>Although our models did not capture all known gene to drug associations, we anticipate that as larger drug sensitivity and genomic datasets become available in coming years the predictive power of these models will increase. We believe that the predictive power of our models is due to the large number of cell lines and broad range of drugs in the GDSC panel that samples intensively the chemical space of common cancer drugs (chemotherapeutic and kinase inhibitors). It remains to be determined how these models will predict completely unknown families of therapeutic agents.</p>
      <p>The predictive ability of our methods for individual values is still limited and could be further improved by extending the set of input features with additional layers of molecular characterization of the cell lines, such as basal transcriptional profiles and phosphoproteomic data. These data types have been used to predict drug responses in various contexts <xref ref-type="bibr" rid="pone.0061318-Chen1">[21]</xref>–<xref ref-type="bibr" rid="pone.0061318-Reinhold1">[24]</xref>. Another valuable extension could be the inclusion of gene expression data following drug treatment, a powerful <italic>in silico</italic> resource for predicting treatment outcomes and elucidating compound mode of action <xref ref-type="bibr" rid="pone.0061318-Kutalik1">[25]</xref>, <xref ref-type="bibr" rid="pone.0061318-Lamb1">[26]</xref>, as well as a promising gateway to the identification of new drug repositioning opportunities <xref ref-type="bibr" rid="pone.0061318-Iorio1">[27]</xref>. Additionally, epigenetics data could enhance the prediction capabilities of future methods <xref ref-type="bibr" rid="pone.0061318-RodriguezParedes1">[28]</xref>.</p>
      <p>Our method uses purely experimental data, but additional predictive power can be expected from including knowledge of the underlying network <xref ref-type="bibr" rid="pone.0061318-Jorgensen1">[29]</xref>. It has been shown that the prediction of drug response and mode of action by transcriptional profiling is significantly enhanced when paired with known a priori gene and protein networks <xref ref-type="bibr" rid="pone.0061318-Torkamani1">[30]</xref>, <xref ref-type="bibr" rid="pone.0061318-Mani1">[31]</xref> and drug similarities have been inferred based on the corresponding <italic>in silico</italic> predicted impinged pathway <xref ref-type="bibr" rid="pone.0061318-Silberberg1">[32]</xref>. Prior knowledge could also increase the interpretability of the results. Known regulatory relationships between genes and transcriptional data <xref ref-type="bibr" rid="pone.0061318-Liu1">[33]</xref> and protein networks <xref ref-type="bibr" rid="pone.0061318-Ulitsky1">[34]</xref> can be used to identify deregulated pathways, and be further linked to the genomic alterations that drive them <xref ref-type="bibr" rid="pone.0061318-Kim1">[35]</xref>, highlighting subnetworks of importance for drug response.</p>
      <p>Incorporation of these additional features will require a scheme to prioritize the input features based on their impact on the final trained model. Associations between features and outcomes could be explicitly unveiled by integrating feature selection criteria and dimensionality reduction techniques into our models.</p>
      <p>In terms of predictive models, we have used standard machine learning methods (neural networks and random forests), given their flexibility and robustness as predictive models. A fertile ground for further research is investigating the application of other modeling techniques, including linear regression methods (e.g. LASSO, ElasticNets).</p>
      <p>Our results also show that one can estimate the accuracy of prediction for different degrees of sparseness in the data, which may have utility when designing experiments where coverage has to be balanced with accuracy. Furthermore, because models are able to predict IC<sub>50</sub> on cell lines not screened yet, predictions from these models can be used to decide whether it is worthwhile expanding the panel of cell lines, or rather focus on a few selected ones.</p>
      <p>The implications of our results go beyond their utility to optimise the experimental design of drug screenings. Once a model is built, it could be used to systematically test the potential effect of novel drugs <italic>in silico</italic>, based on their chemical features and similarity. These predictions can help to evaluate the potential activity of new drugs, e.g. from large chemical libraries, to be screened. Furthermore, predictions on clinically approved drugs are expected to reveal candidates for drug repurposing and potentially identify specific disease sub-types that would be most responsive <xref ref-type="bibr" rid="pone.0061318-Ashburn1">[8]</xref>. Although cell lines are not an exact replica of real tumours, comprehensive predictive models such as ours together with expanded genomic and epigenomic datasets may be a good proxy to facilitate the development of new therapeutic strategies tailored to individual patients <xref ref-type="bibr" rid="pone.0061318-Kelloff1">[12]</xref>.</p>
   </sec>
   <sec id="s4" sec-type="materials|methods">
      <title>Materials and Methods</title>
      <sec id="s4a">
         <title>Training dataset</title>
         <p>We used the data from the Genomics of Drug Sensitivity in Cancer project <xref ref-type="bibr" rid="pone.0061318-Garnett1">[3]</xref>, which contains 639 cancer cell lines, each of them characterised by a set of genomic features (details in the next section). The characterisation is not complete for every cell line, and therefore we filtered out cell lines with more than 15 missing genomic features, which reduced the set of selected cell lines from 639 to 608. The dataset contains 131 drugs. As our method exploits the chemical structure of each drug, this information in simplified molecular-input line entry system (SMILES) format is required. Therefore, we did not consider the 20 drugs for which SMILES were not available, and built our model for the remaining 111 drugs.</p>
         <p>The resulting matrix of 608 cell lines by 111 drugs will have 67,488 possible drug response curves, each summarised by its IC<sub>50</sub> value (drug concentration in μM units required to eradicate 50% of the cancer cells). Currently, the dataset contains 38,930 IC<sub>50</sub> values out of these 67,488 (58%), with missing values mostly due to logistical reasons such as co-ordinating measurements from various screening centres. The log IC<sub>50</sub> ranges from –7.40 (IC<sub>50</sub>∼4•10<sup>−8</sup> M; the most sensitive drug-cell combination) to 6.91 (IC<sub>50</sub>∼8•10<sup>6</sup> M; the most resistant). Note that extremely large and small values are extrapolations in the IC<sub>50</sub> that have no clinical relevance. We use these ranges in this study as those are the ones used in the paper by Garnett et al. <xref ref-type="bibr" rid="pone.0061318-Garnett1">[3]</xref> that we compare our results against.</p>
      </sec>
      <sec id="s4b">
         <title>Blind test dataset</title>
         <p>We generated test sets during the cross-validation for estimating the expected error (details in cross-validation section). However, even cross-validation can overestimate the prospective performance of machine learning methods. Therefore, we conducted a truly blind test in order to demonstrate the prospective capabilities of our cross-validated models to impute missing IC<sub>50</sub> values in the 608 cell lines by 111 drugs matrix (Fig S1). Our blind test contains 13,565 newly generated IC<sub>50</sub> values, which were obtained after training took place, or, to put it differently, a batch of new experimental data was generated to independently validate our models. To sum up, 58% of the IC<sub>50</sub> values are in the original dataset (used for cross-validation), and an additional 18% are used for the blind test (independent test).</p>
      </sec>
      <sec id="s4c">
         <title>Features</title>
         <p>There are two different input data streams in our method: the genomic background for each cancer cell line, and the chemical properties of a drug. For the first input data stream, cancer cell lines are characterised by the mutational status of 77 oncogenes, where each of them is further described by copy number variation (any high grade amplification or homozygous deletion of a cancer gene) and sequence variation (changes in the protein sequence, e.g. non-synonymous single nucleotide polymorphism). Additionally, there is one binary feature for the microsatellite stability status of each cell line. The cell line features were encoded as follows:</p>
         <p>Microsatellite instability status <inline-formula><inline-graphic xlink:href="info:doi/10.1371/journal.pone.0061318.e001" xlink:type="simple"/></inline-formula></p>
         <p>Sequence variation <inline-formula><inline-graphic xlink:href="info:doi/10.1371/journal.pone.0061318.e002" xlink:type="simple"/></inline-formula></p>
         <p>Copy number variation <inline-formula><inline-graphic xlink:href="info:doi/10.1371/journal.pone.0061318.e003" xlink:type="simple"/></inline-formula></p>
         <p>All mutations considered, we have 77 possible copy number variations plus 77 possible sequence variations and one microsatellite stability value, which sums up to 155 possible cell line features. However, a few mutational features are missing for some cell lines, and we conservatively removed a feature in case it was missing for any cell line. This led to a final set of 138 genomic features characterising each cancer cell line.</p>
         <p>The second input data stream incorporates 1D and 2D chemical properties of each drug. We generated these chemical features using the PaDEL software (v2.11, downloaded from the project website, <ext-link ext-link-type="uri" xlink:href="http://padel.nus.edu.sg/software/padeldescriptor/" xlink:type="simple">http://padel.nus.edu.sg/software/padeldescriptor/</ext-link>) <xref ref-type="bibr" rid="pone.0061318-Yap1">[19]</xref> from the SMILES with default settings. 722 features are physicochemical descriptors and 881 are obtained from the fingerprints, leading to a total of 1603 chemical features. We only included chemical features that could be calculated for all drugs. Furthermore, we removed any feature with the same value across all drugs, obtaining a final set of 689 chemical features for each drug (e.g. atom count, bond count, molecular weight, xlogP or PubChem fingerprint, to name a few). The list of drugs is available in the Supplementary material (<xref ref-type="supplementary-material" rid="pone.0061318.s004">Table S1</xref>).</p>
         <p>Taking together the cancer cell line and drug stream, we used 827 features to build our predictive models of the log IC<sub>50</sub> value of a given cell line in the presence of a given drug.</p>
      </sec>
      <sec id="s4d">
         <title>Cross-validation</title>
         <p>We used an 8-fold cross-validation for building our models. Therefore, we separated the original dataset into eight equally sized sets of IC<sub>50</sub> values, obtained by randomly distributing all IC<sub>50</sub>s of the matrix into 8 bins. One of them was exclusively used for testing (never involved in any training), another six were used for training the model and the remaining one was used for cross-training. Cross-validation is a process used to avoid under- and overfitting <xref ref-type="bibr" rid="pone.0061318-Mitchell1">[36]</xref>, e.g. by identifying the optimal number of hidden units and training iterations for a neural network (details in “Machine learning” section). We iteratively rotated the sets so that each data point was used at least once for training, cross-training or testing. Finally, we obtained 8 models, which were equally predictive.</p>
         <p>Furthermore, we used a more stringent version of the above described 8-fold cross-validation. We ensured that the test, train and cross-train sets do not share any cell line, which might occur in the non-stringent version (described above). For instance, assume cell line C1 is treated with the drugs D1, D2 and D3. For the non-stringent cross-validation, the combinations C1–D1, C1–D2 and C1–D3 might be distributed over the test, train and cross-train sets; for the stringent cross-validation, every combination with C1 occurs exclusively in one of those three sets.</p>
      </sec>
      <sec id="s4e">
         <title>Machine learning</title>
         <p>For the neural networks, we used the Java implementation from Encog 3.0.1 (<ext-link ext-link-type="uri" xlink:href="http://www.heatonresearch.com/encog" xlink:type="simple">http://www.heatonresearch.com/encog</ext-link>) <xref ref-type="bibr" rid="pone.0061318-Jeff1">[37]</xref>, <xref ref-type="bibr" rid="pone.0061318-Jeff2">[38]</xref> of a feed-forward multilayer perceptron, where we defined three different layers: input, hidden (or middle) and output layer. Every perceptron of a layer is completely connected to each perceptron of the upper layer. The number of features determined the number of input units, or, to put it differently, the required perceptrons in the first layer. The number of hidden units was explored during the training for determining the correct model complexity, which was between 1 and 30 hidden units. Furthermore, each input and hidden unit also had a bias, which is a permanent activation input for those perceptrons. We used a single output unit for predicting the continuous log(IC<sub>50</sub>) value.</p>
         <p>As perceptron activation function for enabling the network to predict non-linear behaviour, we used the sigmoid function, which returns values in an interval from 0 to 1. Therefore, we had to normalise the IC<sub>50</sub> values (raw IC<sub>50</sub> values, not in log space) also into a range from 0 to 1, which was done with the following logistic-like function:<disp-formula id="pone.0061318.e004"><graphic position="anchor" xlink:href="info:doi/10.1371/journal.pone.0061318.e004" xlink:type="simple"/></disp-formula></p>
         <p><inline-formula><inline-graphic xlink:href="info:doi/10.1371/journal.pone.0061318.e005" xlink:type="simple"/></inline-formula>: Observed/expected IC<sub>50</sub> value, which has to be a positive number greater than zero.</p>
         <p>We trained the network with the resilient error backpropagation implementation from Encog with default parameters <xref ref-type="bibr" rid="pone.0061318-Riedmiller1">[39]</xref>. For exploring the final model complexity, which is described by the number of hidden units and the number of training iterations, we examined different neural network architectures from 1 up to 30 hidden units and trained them for a maximum of 400 iterations. We searched for the global minimum in that cross-training landscape (minimizing the root mean square error of the cross-training set) to avoid under- or overfitting (usually, between 21 and 27 hidden units were chosen as best model after approximately 300 iterations).</p>
         <p>We also carried out random forest <xref ref-type="bibr" rid="pone.0061318-Breiman1">[40]</xref> regression models to investigate whether there was any significant performance gain using an alternative non-parametric machine learning methodology (<xref ref-type="supplementary-material" rid="pone.0061318.s007">Text S3</xref>). A random forest is an ensemble of many different regression trees randomly generated from the same training data (recommended value of n = 500 trees was used).</p>
      </sec>
      <sec id="s4f">
         <title>Data access</title>
         <p>The dataset of the Genomics of Drug Sensitivity in Cancer project <xref ref-type="bibr" rid="pone.0061318-Garnett1">[3]</xref> is fully accessible and can be downloaded from the project website, <ext-link ext-link-type="uri" xlink:href="http://www.cancerrxgene.org/" xlink:type="simple">http://www.cancerrxgene.org/</ext-link>. The training set is based on release v1.0 from June 2012. Newly generated IC<sub>50</sub> values of the blind test, which are not part of release v1.0, are published in release v1.1 from July 2012.</p>
      </sec>
      <sec id="s4g">
         <title>Software access</title>
         <p>The Encog Machine Learning Framework (version 3.0.1) <xref ref-type="bibr" rid="pone.0061318-Jeff1">[37]</xref>, <xref ref-type="bibr" rid="pone.0061318-Jeff2">[38]</xref> containing the neural network implementation is freely available and open source (Apache License 2.0), and can be downloaded from the Heaton Research webpage (<ext-link ext-link-type="uri" xlink:href="http://www.heatonresearch.com/encog" xlink:type="simple">http://www.heatonresearch.com/encog</ext-link>). For the random forest model, the R package randomForest (version 4.6–6) <xref ref-type="bibr" rid="pone.0061318-Wiener1">[41]</xref> is also freely available under GPL licence from the CRAN webpage (<ext-link ext-link-type="uri" xlink:href="http://cran.r-project.org/web/packages/randomForest/index.html" xlink:type="simple">http://cran.r-project.org/web/packages/randomForest/index.html</ext-link>).</p>
      </sec>
   </sec>
   <sec id="s5">
      <title>Supporting Information</title>
<supplementary-material id="pone.0061318.s001" mimetype="image/tiff" xlink:href="info:doi/10.1371/journal.pone.0061318.s001" position="float" xlink:type="simple">
<label>Figure S1</label>
<caption><p><bold>Blind test of multi-drug model.</bold> The training dataset holds 38,930 IC<sub>50</sub> values, that is ∼58% of all possible drug-to-cell line combinations. For the blind test, 13,565 novel IC<sub>50</sub> values were generated, an additional ∼18% of data points, which were not included in the training dataset. For obtaining the predicted log(IC<sub>50</sub>) values, we averaged the output of each model (8 different models resulting from the 8-fold cross-validation procedure). The prediction on the blind test was slightly worse than that estimated by cross-validation (<xref ref-type="fig" rid="pone-0061318-g003">Fig 3A</xref>): root mean square error (<italic>RMSE</italic>) was increased from 0.83 to 0.97, coefficient of determination (R<sup>2</sup>) declined from 0.72 to 0.64 and the Pearson correlation coefficient (<italic>R<sub>p</sub></italic>) was decreased from 0.85 to 0.79. This small performance decrease is due to the fact that blind test data points are not selected at random: these tend to come from drug-cell combinations that are not optimally represented in the training set (i.e. those cell lines in the training set that have been probed against every drug in the panel will not have further IC<sub>50</sub> values in the test set, as all training and test sets in this study are non-overlapping).</p>
      <p>(TIFF)</p>
</caption></supplementary-material>
<supplementary-material id="pone.0061318.s002" mimetype="image/tiff" xlink:href="info:doi/10.1371/journal.pone.0061318.s002" position="float" xlink:type="simple">
<label>Figure S2</label>
<caption><p><bold>Correlation between predicted and experimentally observed log(IC<sub>50</sub>) values leaving out cell lines.</bold> The stringent 8-fold cross-validation was performed on distinct sets of cell lines, so that a cell line used for testing was never involved in the training. The figure shows values obtained solely on the test sets. The prediction quality is slightly worse than the normal cross-validation (<xref ref-type="fig" rid="pone-0061318-g003">Figure 3A</xref>): <italic>RMSE</italic> increased from 0.83 to 0.89, R<sup>2</sup> decreased from 0.72 to 0.68 and the <italic>R<sub>p</sub></italic> decreased from 0.85 to 0.82.</p>
      <p>(TIFF)</p>
</caption></supplementary-material>
<supplementary-material id="pone.0061318.s003" mimetype="image/tiff" xlink:href="info:doi/10.1371/journal.pone.0061318.s003" position="float" xlink:type="simple">
<label>Figure S3</label>
<caption><p><bold>Correlation between predicted and experimentally observed log(IC<sub>50</sub>) values leaving out all lung cell lines.</bold> To further challenge our model and our hypothesis that it is possible to leave out several cell lines, we removed all lung cell lines and used them exclusively for testing. 106 out of 608 cell lines are from lung tissue (∼17% of the data), which we were able to predict with minor performance reduction compared to including all cell lines (<xref ref-type="fig" rid="pone-0061318-g003">Figure 3A</xref>): root mean square error (<italic>RMSE</italic>) increased from 0.83 to 0.99, coefficient of determination (R<sup>2</sup>) declined from 0.72 to 0.61 and the Pearson correlation coefficient (<italic>R<sub>p</sub></italic>) decreased from 0.85 to 0.79.</p>
      <p>(TIFF)</p>
</caption></supplementary-material>
<supplementary-material id="pone.0061318.s004" mimetype="text/comma-separated-values" xlink:href="info:doi/10.1371/journal.pone.0061318.s004" position="float" xlink:type="simple">
<label>Table S1</label>
<caption><p><bold>Drug list.</bold></p>
      <p>(CSV)</p>
</caption></supplementary-material>
<supplementary-material id="pone.0061318.s005" mimetype="application/msword" xlink:href="info:doi/10.1371/journal.pone.0061318.s005" position="float" xlink:type="simple">
<label>Text S1</label>
<caption><p><bold>Performance measurements.</bold></p>
      <p>(DOC)</p>
</caption></supplementary-material>
<supplementary-material id="pone.0061318.s006" mimetype="application/msword" xlink:href="info:doi/10.1371/journal.pone.0061318.s006" position="float" xlink:type="simple">
<label>Text S2</label>
<caption><p><bold>Comparison of imputation methods and machine learning approach.</bold></p>
      <p>(DOC)</p>
</caption></supplementary-material>
<supplementary-material id="pone.0061318.s007" mimetype="application/msword" xlink:href="info:doi/10.1371/journal.pone.0061318.s007" position="float" xlink:type="simple">
<label>Text S3</label>
<caption><p><bold>Random Forest.</bold></p>
      <p>(DOC)</p>
</caption></supplementary-material>
   </sec>
</body>
<back>
   <ack>
      <p>We thank King Wai Lau, David Wedge and Jorge Soares for helping with data, Marc Hafner, Mario Niepel, John Marioni, Theo Knijnenburg, Lodewyk Wessels for useful discussions, and Clare Pacini and Maja Köhn for feedback on the manuscript.</p>
   </ack>
   <ref-list>
      <title>References</title>
      <ref id="pone.0061318-Sharma1"><label>1</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Sharma</surname><given-names>SV</given-names></name>, <name name-style="western"><surname>Haber</surname><given-names>DA</given-names></name>, <name name-style="western"><surname>Settleman</surname><given-names>J</given-names></name> (<year>2010</year>) <article-title>Cell line-based platforms to evaluate the therapeutic efficacy of candidate anticancer agents</article-title>. <source>Nat Rev Cancer</source> <volume>10</volume>: <fpage>241</fpage>–<lpage>253</lpage>.</mixed-citation></ref>
      <ref id="pone.0061318-Grever1"><label>2</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Grever</surname><given-names>MR</given-names></name>, <name name-style="western"><surname>Schepartz</surname><given-names>SA</given-names></name>, <name name-style="western"><surname>Chabner</surname><given-names>BA</given-names></name> (<year>1992</year>) <article-title>The National Cancer Institute: cancer drug discovery and development program</article-title>. <source>Semin Oncol</source> <volume>19</volume>: <fpage>622</fpage>–<lpage>638</lpage>.</mixed-citation></ref>
      <ref id="pone.0061318-Garnett1"><label>3</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Garnett</surname><given-names>MJ</given-names></name>, <name name-style="western"><surname>Edelman</surname><given-names>EJ</given-names></name>, <name name-style="western"><surname>Heidorn</surname><given-names>SJ</given-names></name>, <name name-style="western"><surname>Greenman</surname><given-names>CD</given-names></name>, <name name-style="western"><surname>Dastur</surname><given-names>A</given-names></name>, <etal>et al</etal>. (<year>2012</year>) <article-title>Systematic identification of genomic markers of drug sensitivity in cancer cells</article-title>. <source>Nature</source> <volume>483</volume>: <fpage>570</fpage>–<lpage>575</lpage>.</mixed-citation></ref>
      <ref id="pone.0061318-Barretina1"><label>4</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Barretina</surname><given-names>J</given-names></name>, <name name-style="western"><surname>Caponigro</surname><given-names>G</given-names></name>, <name name-style="western"><surname>Stransky</surname><given-names>N</given-names></name>, <name name-style="western"><surname>Venkatesan</surname><given-names>K</given-names></name>, <name name-style="western"><surname>Margolin</surname><given-names>AA</given-names></name>, <etal>et al</etal>. (<year>2012</year>) <article-title>The Cancer Cell Line Encyclopedia enables predictive modelling of anticancer drug sensitivity</article-title>. <source>Nature</source> <volume>483</volume>: <fpage>603</fpage>–<lpage>607</lpage>.</mixed-citation></ref>
      <ref id="pone.0061318-Heiser1"><label>5</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Heiser</surname><given-names>LM</given-names></name>, <name name-style="western"><surname>Sadanandam</surname><given-names>A</given-names></name>, <name name-style="western"><surname>Kuo</surname><given-names>WL</given-names></name>, <name name-style="western"><surname>Benz</surname><given-names>SC</given-names></name>, <name name-style="western"><surname>Goldstein</surname><given-names>TC</given-names></name>, <etal>et al</etal>. (<year>2012</year>) <article-title>Subtype and pathway specific responses to anticancer compounds in breast cancer</article-title>. <source>Proc Natl Acad Sci U S A</source> <volume>109</volume>: <fpage>2724</fpage>–<lpage>2729</lpage>.</mixed-citation></ref>
      <ref id="pone.0061318-Stevens1"><label>6</label><mixed-citation publication-type="other" xlink:type="simple">Stevens JP (2002) Applied multivariate statistics for the social sciences; Riegert D, editor. Mahwah, NJ: Lawrence Erlbaum Associates, Inc.</mixed-citation></ref>
      <ref id="pone.0061318-HuiZou1"><label>7</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Zou</surname><given-names>H</given-names></name>, <name name-style="western"><surname>Hastie</surname><given-names>T</given-names></name> (<year>2005</year>) <article-title>Regularization and variable selection via the elastic net</article-title>. <source>Journal of the Royal Statistical Society: Series B (Statistical Methodology)</source> <volume>67</volume>: <fpage>301</fpage>–<lpage>320</lpage>.</mixed-citation></ref>
      <ref id="pone.0061318-Ashburn1"><label>8</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Ashburn</surname><given-names>TT</given-names></name>, <name name-style="western"><surname>Thor</surname><given-names>KB</given-names></name> (<year>2004</year>) <article-title>Drug repositioning: identifying and developing new uses for existing drugs</article-title>. <source>Nat Rev Drug Discov</source> <volume>3</volume>: <fpage>673</fpage>–<lpage>683</lpage>.</mixed-citation></ref>
      <ref id="pone.0061318-Sanseau1"><label>9</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Sanseau</surname><given-names>P</given-names></name>, <name name-style="western"><surname>Koehler</surname><given-names>J</given-names></name> (<year>2011</year>) <article-title>Editorial: computational methods for drug repurposing</article-title>. <source>Brief Bioinform</source> <volume>12</volume>: <fpage>301</fpage>–<lpage>302</lpage>.</mixed-citation></ref>
      <ref id="pone.0061318-Paull1"><label>10</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Paull</surname><given-names>KD</given-names></name>, <name name-style="western"><surname>Shoemaker</surname><given-names>RH</given-names></name>, <name name-style="western"><surname>Hodes</surname><given-names>L</given-names></name>, <name name-style="western"><surname>Monks</surname><given-names>A</given-names></name>, <name name-style="western"><surname>Scudiero</surname><given-names>DA</given-names></name>, <etal>et al</etal>. (<year>1989</year>) <article-title>Display and analysis of patterns of differential activity of drugs against human tumor cell lines: development of mean graph and COMPARE algorithm</article-title>. <source>J Natl Cancer Inst</source> <volume>81</volume>: <fpage>1088</fpage>–<lpage>1092</lpage>.</mixed-citation></ref>
      <ref id="pone.0061318-Zaharevitz1"><label>11</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Zaharevitz</surname><given-names>DW</given-names></name>, <name name-style="western"><surname>Holbeck</surname><given-names>SL</given-names></name>, <name name-style="western"><surname>Bowerman</surname><given-names>C</given-names></name>, <name name-style="western"><surname>Svetlik</surname><given-names>PA</given-names></name> (<year>2002</year>) <article-title>COMPARE: a web accessible tool for investigating mechanisms of cell growth inhibition</article-title>. <source>J Mol Graph Model</source> <volume>20</volume>: <fpage>297</fpage>–<lpage>303</lpage>.</mixed-citation></ref>
      <ref id="pone.0061318-Kelloff1"><label>12</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Kelloff</surname><given-names>GJ</given-names></name>, <name name-style="western"><surname>Sigman</surname><given-names>CC</given-names></name> (<year>2012</year>) <article-title>Cancer biomarkers: selecting the right drug for the right patient</article-title>. <source>Nat Rev Drug Discov</source> <volume>11</volume>: <fpage>201</fpage>–<lpage>214</lpage>.</mixed-citation></ref>
      <ref id="pone.0061318-Kubinyi1"><label>13</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Kubinyi</surname><given-names>H</given-names></name> (<year>1990</year>) <article-title>Quantitative structure-activity relationships (QSAR) and molecular modelling in cancer research</article-title>. <source>J Cancer Res Clin Oncol</source> <volume>116</volume>: <fpage>529</fpage>–<lpage>537</lpage>.</mixed-citation></ref>
      <ref id="pone.0061318-Li1"><label>14</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Li</surname><given-names>GH</given-names></name>, <name name-style="western"><surname>Huang</surname><given-names>JF</given-names></name> (<year>2012</year>) <article-title>CDRUG: a web server for predicting anticancer activity of chemical compounds</article-title>. <source>Bioinformatics</source> <volume>28</volume>: <fpage>3334</fpage>–<lpage>3335</lpage>.</mixed-citation></ref>
      <ref id="pone.0061318-Agarwal1"><label>15</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Agarwal</surname><given-names>S</given-names></name>, <name name-style="western"><surname>Dugar</surname><given-names>D</given-names></name>, <name name-style="western"><surname>Sengupta</surname><given-names>S</given-names></name> (<year>2010</year>) <article-title>Ranking chemical structures for drug discovery: a new machine learning approach</article-title>. <source>J Chem Inf Model</source> <volume>50</volume>: <fpage>716</fpage>–<lpage>731</lpage>.</mixed-citation></ref>
      <ref id="pone.0061318-Swamidass1"><label>16</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Swamidass</surname><given-names>SJ</given-names></name>, <name name-style="western"><surname>Chen</surname><given-names>J</given-names></name>, <name name-style="western"><surname>Bruand</surname><given-names>J</given-names></name>, <name name-style="western"><surname>Phung</surname><given-names>P</given-names></name>, <name name-style="western"><surname>Ralaivola</surname><given-names>L</given-names></name>, <etal>et al</etal>. (<year>2005</year>) <article-title>Kernels for small molecules and the prediction of mutagenicity, toxicity and anti-cancer activity</article-title>. <source>Bioinformatics</source> <volume>21</volume> Suppl 1: <fpage>i359</fpage>–<lpage>368</lpage>.</mixed-citation></ref>
      <ref id="pone.0061318-Shi1"><label>17</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Shi</surname><given-names>LM</given-names></name>, <name name-style="western"><surname>Fan</surname><given-names>Y</given-names></name>, <name name-style="western"><surname>Myers</surname><given-names>TG</given-names></name>, <name name-style="western"><surname>O'Connor</surname><given-names>PM</given-names></name>, <name name-style="western"><surname>Paull</surname><given-names>KD</given-names></name>, <etal>et al</etal>. (<year>1998</year>) <article-title>Mining the NCI anticancer drug discovery databases: genetic function approximation for the QSAR study of anticancer ellipticine analogues</article-title>. <source>J Chem Inf Comput Sci</source> <volume>38</volume>: <fpage>189</fpage>–<lpage>199</lpage>.</mixed-citation></ref>
      <ref id="pone.0061318-Shi2"><label>18</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Shi</surname><given-names>LM</given-names></name>, <name name-style="western"><surname>Fan</surname><given-names>Y</given-names></name>, <name name-style="western"><surname>Lee</surname><given-names>JK</given-names></name>, <name name-style="western"><surname>Waltham</surname><given-names>M</given-names></name>, <name name-style="western"><surname>Andrews</surname><given-names>DT</given-names></name>, <etal>et al</etal>. (<year>2000</year>) <article-title>Mining and visualizing large anticancer drug discovery databases</article-title>. <source>J Chem Inf Comput Sci</source> <volume>40</volume>: <fpage>367</fpage>–<lpage>379</lpage>.</mixed-citation></ref>
      <ref id="pone.0061318-Yap1"><label>19</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Yap</surname><given-names>CW</given-names></name> (<year>2011</year>) <article-title>PaDEL-descriptor: an open source software to calculate molecular descriptors and fingerprints</article-title>. <source>J Comput Chem</source> <volume>32</volume>: <fpage>1466</fpage>–<lpage>1474</lpage>.</mixed-citation></ref>
      <ref id="pone.0061318-Solit1"><label>20</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Solit</surname><given-names>DB</given-names></name>, <name name-style="western"><surname>Garraway</surname><given-names>LA</given-names></name>, <name name-style="western"><surname>Pratilas</surname><given-names>CA</given-names></name>, <name name-style="western"><surname>Sawai</surname><given-names>A</given-names></name>, <name name-style="western"><surname>Getz</surname><given-names>G</given-names></name>, <etal>et al</etal>. (<year>2006</year>) <article-title>BRAF mutation predicts sensitivity to MEK inhibition</article-title>. <source>Nature</source> <volume>439</volume>: <fpage>358</fpage>–<lpage>362</lpage>.</mixed-citation></ref>
      <ref id="pone.0061318-Chen1"><label>21</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Chen</surname><given-names>BJ</given-names></name>, <name name-style="western"><surname>Causton</surname><given-names>HC</given-names></name>, <name name-style="western"><surname>Mancenido</surname><given-names>D</given-names></name>, <name name-style="western"><surname>Goddard</surname><given-names>NL</given-names></name>, <name name-style="western"><surname>Perlstein</surname><given-names>EO</given-names></name>, <etal>et al</etal>. (<year>2009</year>) <article-title>Harnessing gene expression to identify the genetic basis of drug resistance</article-title>. <source>Mol Syst Biol</source> <volume>5</volume>: <fpage>310</fpage>.</mixed-citation></ref>
      <ref id="pone.0061318-Park1"><label>22</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Park</surname><given-names>ES</given-names></name>, <name name-style="western"><surname>Rabinovsky</surname><given-names>R</given-names></name>, <name name-style="western"><surname>Carey</surname><given-names>M</given-names></name>, <name name-style="western"><surname>Hennessy</surname><given-names>BT</given-names></name>, <name name-style="western"><surname>Agarwal</surname><given-names>R</given-names></name>, <etal>et al</etal>. (<year>2010</year>) <article-title>Integrative analysis of proteomic signatures, mutations, and drug responsiveness in the NCI 60 cancer cell line set</article-title>. <source>Mol Cancer Ther</source> <volume>9</volume>: <fpage>257</fpage>–<lpage>267</lpage>.</mixed-citation></ref>
      <ref id="pone.0061318-Ruderfer1"><label>23</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Ruderfer</surname><given-names>DM</given-names></name>, <name name-style="western"><surname>Roberts</surname><given-names>DC</given-names></name>, <name name-style="western"><surname>Schreiber</surname><given-names>SL</given-names></name>, <name name-style="western"><surname>Perlstein</surname><given-names>EO</given-names></name>, <name name-style="western"><surname>Kruglyak</surname><given-names>L</given-names></name> (<year>2009</year>) <article-title>Using expression and genotype to predict drug response in yeast</article-title>. <source>PLoS One</source> <volume>4</volume>: <fpage>e6907</fpage>.</mixed-citation></ref>
      <ref id="pone.0061318-Reinhold1"><label>24</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Reinhold</surname><given-names>WC</given-names></name>, <name name-style="western"><surname>Sunshine</surname><given-names>M</given-names></name>, <name name-style="western"><surname>Liu</surname><given-names>H</given-names></name>, <name name-style="western"><surname>Varma</surname><given-names>S</given-names></name>, <name name-style="western"><surname>Kohn</surname><given-names>KW</given-names></name>, <etal>et al</etal>. (<year>2012</year>) <article-title>CellMiner: A Web-Based Suite of Genomic and Pharmacologic Tools to Explore Transcript and Drug Patterns in the NCI-60 Cell Line Set</article-title>. <source>Cancer Res</source> <volume>72</volume>: <fpage>3499</fpage>–<lpage>3511</lpage>.</mixed-citation></ref>
      <ref id="pone.0061318-Kutalik1"><label>25</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Kutalik</surname><given-names>Z</given-names></name>, <name name-style="western"><surname>Beckmann</surname><given-names>JS</given-names></name>, <name name-style="western"><surname>Bergmann</surname><given-names>S</given-names></name> (<year>2008</year>) <article-title>A modular approach for integrative analysis of large-scale gene-expression and drug-response data</article-title>. <source>Nat Biotechnol</source> <volume>26</volume>: <fpage>531</fpage>–<lpage>539</lpage>.</mixed-citation></ref>
      <ref id="pone.0061318-Lamb1"><label>26</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Lamb</surname><given-names>J</given-names></name>, <name name-style="western"><surname>Crawford</surname><given-names>ED</given-names></name>, <name name-style="western"><surname>Peck</surname><given-names>D</given-names></name>, <name name-style="western"><surname>Modell</surname><given-names>JW</given-names></name>, <name name-style="western"><surname>Blat</surname><given-names>IC</given-names></name>, <etal>et al</etal>. (<year>2006</year>) <article-title>The Connectivity Map: using gene-expression signatures to connect small molecules, genes, and disease</article-title>. <source>Science</source> <volume>313</volume>: <fpage>1929</fpage>–<lpage>1935</lpage>.</mixed-citation></ref>
      <ref id="pone.0061318-Iorio1"><label>27</label><mixed-citation publication-type="other" xlink:type="simple">Iorio F, Rittman T, Ge H, Menden M, Saez-Rodriguez J (2012) Transcriptional data: a new gateway to drug repositioning? Drug Discov Today.</mixed-citation></ref>
      <ref id="pone.0061318-RodriguezParedes1"><label>28</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Rodriguez-Paredes</surname><given-names>M</given-names></name>, <name name-style="western"><surname>Esteller</surname><given-names>M</given-names></name> (<year>2011</year>) <article-title>Cancer epigenetics reaches mainstream oncology</article-title>. <source>Nat Med</source> <volume>17</volume>: <fpage>330</fpage>–<lpage>339</lpage>.</mixed-citation></ref>
      <ref id="pone.0061318-Jorgensen1"><label>29</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Jorgensen</surname><given-names>C</given-names></name>, <name name-style="western"><surname>Linding</surname><given-names>R</given-names></name> (<year>2010</year>) <article-title>Simplistic pathways or complex networks?</article-title> <source>Curr Opin Genet Dev</source> <volume>20</volume>: <fpage>15</fpage>–<lpage>22</lpage>.</mixed-citation></ref>
      <ref id="pone.0061318-Torkamani1"><label>30</label><mixed-citation publication-type="other" xlink:type="simple">Torkamani A, Schork NJ (2011) Background gene expression networks significantly enhance drug response prediction by transcriptional profiling. Pharmacogenomics J.</mixed-citation></ref>
      <ref id="pone.0061318-Mani1"><label>31</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Mani</surname><given-names>KM</given-names></name>, <name name-style="western"><surname>Lefebvre</surname><given-names>C</given-names></name>, <name name-style="western"><surname>Wang</surname><given-names>K</given-names></name>, <name name-style="western"><surname>Lim</surname><given-names>WK</given-names></name>, <name name-style="western"><surname>Basso</surname><given-names>K</given-names></name>, <etal>et al</etal>. (<year>2008</year>) <article-title>A systems biology approach to prediction of oncogenes and molecular perturbation targets in B-cell lymphomas</article-title>. <source>Mol Syst Biol</source> <volume>4</volume>: <fpage>169</fpage>.</mixed-citation></ref>
      <ref id="pone.0061318-Silberberg1"><label>32</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Silberberg</surname><given-names>Y</given-names></name>, <name name-style="western"><surname>Gottlieb</surname><given-names>A</given-names></name>, <name name-style="western"><surname>Kupiec</surname><given-names>M</given-names></name>, <name name-style="western"><surname>Ruppin</surname><given-names>E</given-names></name>, <name name-style="western"><surname>Sharan</surname><given-names>R</given-names></name> (<year>2012</year>) <article-title>Large-scale elucidation of drug response pathways in humans</article-title>. <source>J Comput Biol</source> <volume>19</volume>: <fpage>163</fpage>–<lpage>174</lpage>.</mixed-citation></ref>
      <ref id="pone.0061318-Liu1"><label>33</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Liu</surname><given-names>Y</given-names></name>, <name name-style="western"><surname>Ringner</surname><given-names>M</given-names></name> (<year>2007</year>) <article-title>Revealing signaling pathway deregulation by using gene expression signatures and regulatory motif analysis</article-title>. <source>Genome Biol</source> <volume>8</volume>: <fpage>R77</fpage>.</mixed-citation></ref>
      <ref id="pone.0061318-Ulitsky1"><label>34</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Ulitsky</surname><given-names>I</given-names></name>, <name name-style="western"><surname>Krishnamurthy</surname><given-names>A</given-names></name>, <name name-style="western"><surname>Karp</surname><given-names>RM</given-names></name>, <name name-style="western"><surname>Shamir</surname><given-names>R</given-names></name> (<year>2010</year>) <article-title>DEGAS: de novo discovery of dysregulated pathways in human diseases</article-title>. <source>PLoS One</source> <volume>5</volume>: <fpage>e13367</fpage>.</mixed-citation></ref>
      <ref id="pone.0061318-Kim1"><label>35</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Kim</surname><given-names>YA</given-names></name>, <name name-style="western"><surname>Wuchty</surname><given-names>S</given-names></name>, <name name-style="western"><surname>Przytycka</surname><given-names>TM</given-names></name> (<year>2011</year>) <article-title>Identifying causal genes and dysregulated pathways in complex diseases</article-title>. <source>PLoS Comput Biol</source> <volume>7</volume>: <fpage>e1001095</fpage>.</mixed-citation></ref>
      <ref id="pone.0061318-Mitchell1"><label>36</label><mixed-citation publication-type="other" xlink:type="simple">Mitchell TM (1997) Machine Learning. New York: McGraw-Hill.</mixed-citation></ref>
      <ref id="pone.0061318-Jeff1"><label>37</label><mixed-citation publication-type="other" xlink:type="simple">Heaton J (2008) Introduction to Neural Networks for Java. Heaton Research, Inc.</mixed-citation></ref>
      <ref id="pone.0061318-Jeff2"><label>38</label><mixed-citation publication-type="other" xlink:type="simple">Heaton J (2011) Programming Neural Networks with Encog 3 in Java. Heaton Research, Inc.</mixed-citation></ref>
      <ref id="pone.0061318-Riedmiller1"><label>39</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Riedmiller</surname><given-names>M</given-names></name>, <name name-style="western"><surname>Braun</surname><given-names>H</given-names></name> (<year>1993</year>) <article-title>A Direct Adaptive Method for Faster Backpropagation Learning - the Rprop Algorithm</article-title>. <source>1993 IEEE International Conference on Neural Networks, Vols</source> <volume>1–3</volume>: <fpage>586</fpage>–<lpage>591</lpage>.</mixed-citation></ref>
      <ref id="pone.0061318-Breiman1"><label>40</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Breiman</surname><given-names>L</given-names></name> (<year>2001</year>) <article-title>Random forests</article-title>. <source>Machine Learning</source> <volume>45</volume>: <fpage>5</fpage>–<lpage>32</lpage>.</mixed-citation></ref>
      <ref id="pone.0061318-Wiener1"><label>41</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Liaw</surname><given-names>A</given-names></name>, <name name-style="western"><surname>Wiener</surname><given-names>M</given-names></name> (<year>2002</year>) <article-title>Classification and Regression by randomForest</article-title>. <source>R News</source> <volume>2</volume>: <fpage>18</fpage>–<lpage>22</lpage>.</mixed-citation></ref>
   </ref-list>
</back>
</article>