<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1d3 20150301//EN" "http://jats.nlm.nih.gov/publishing/1.1d3/JATS-journalpublishing1.dtd">
<article article-type="research-article" dtd-version="1.1d3" xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="nlm-ta">PLoS ONE</journal-id>
<journal-id journal-id-type="publisher-id">plos</journal-id>
<journal-id journal-id-type="pmc">plosone</journal-id>
<journal-title-group>
<journal-title>PLOS ONE</journal-title>
</journal-title-group>
<issn pub-type="epub">1932-6203</issn>
<publisher>
<publisher-name>Public Library of Science</publisher-name>
<publisher-loc>San Francisco, CA USA</publisher-loc>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.1371/journal.pone.0243251</article-id>
<article-id pub-id-type="publisher-id">PONE-D-20-19021</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Research Article</subject>
</subj-group>
<subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Single nucleotide polymorphisms</subject></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Medicine and health sciences</subject><subj-group><subject>Oncology</subject><subj-group><subject>Cancers and neoplasms</subject><subj-group><subject>Breast tumors</subject><subj-group><subject>Breast cancer</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Physical sciences</subject><subj-group><subject>Mathematics</subject><subj-group><subject>Algebra</subject><subj-group><subject>Linear algebra</subject><subj-group><subject>Eigenvectors</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Gene expression</subject></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Cell biology</subject><subj-group><subject>Chromosome biology</subject><subj-group><subject>Chromatin</subject><subj-group><subject>Chromatin modification</subject><subj-group><subject>DNA methylation</subject></subj-group></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Epigenetics</subject><subj-group><subject>Chromatin</subject><subj-group><subject>Chromatin modification</subject><subj-group><subject>DNA methylation</subject></subj-group></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Gene expression</subject><subj-group><subject>Chromatin</subject><subj-group><subject>Chromatin modification</subject><subj-group><subject>DNA methylation</subject></subj-group></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>DNA</subject><subj-group><subject>DNA modification</subject><subj-group><subject>DNA methylation</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Biochemistry</subject><subj-group><subject>Nucleic acids</subject><subj-group><subject>DNA</subject><subj-group><subject>DNA modification</subject><subj-group><subject>DNA methylation</subject></subj-group></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Epigenetics</subject><subj-group><subject>DNA modification</subject><subj-group><subject>DNA methylation</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Gene expression</subject><subj-group><subject>DNA modification</subject><subj-group><subject>DNA methylation</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Research and analysis methods</subject><subj-group><subject>Simulation and modeling</subject></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Genomics</subject></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Genetics</subject><subj-group><subject>Heredity</subject><subj-group><subject>Genetic mapping</subject><subj-group><subject>Variant genotypes</subject></subj-group></subj-group></subj-group></subj-group></subj-group></article-categories>
<title-group>
<article-title>ANOVA-HD: Analysis of variance when both input and output layers are high-dimensional</article-title>
<alt-title alt-title-type="running-head">ANOVA-HD: Analysis of variance for multi-layer high-dimensional data</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">https://orcid.org/0000-0001-5692-7129</contrib-id>
<name name-style="western">
<surname>de los Campos</surname>
<given-names>Gustavo</given-names>
</name>
<role content-type="https://casrai.org/credit/">Conceptualization</role>
<role content-type="https://casrai.org/credit/">Data curation</role>
<role content-type="https://casrai.org/credit/">Formal analysis</role>
<role content-type="https://casrai.org/credit/">Funding acquisition</role>
<role content-type="https://casrai.org/credit/">Investigation</role>
<role content-type="https://casrai.org/credit/">Methodology</role>
<role content-type="https://casrai.org/credit/">Project administration</role>
<role content-type="https://casrai.org/credit/">Resources</role>
<role content-type="https://casrai.org/credit/">Software</role>
<role content-type="https://casrai.org/credit/">Supervision</role>
<role content-type="https://casrai.org/credit/">Validation</role>
<role content-type="https://casrai.org/credit/">Visualization</role>
<role content-type="https://casrai.org/credit/">Writing – original draft</role>
<role content-type="https://casrai.org/credit/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff002"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff003"><sup>3</sup></xref>
<xref ref-type="corresp" rid="cor001">*</xref>
</contrib>
<contrib contrib-type="author" corresp="yes" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">https://orcid.org/0000-0001-7874-8500</contrib-id>
<name name-style="western">
<surname>Pook</surname>
<given-names>Torsten</given-names>
</name>
<role content-type="https://casrai.org/credit/">Conceptualization</role>
<role content-type="https://casrai.org/credit/">Data curation</role>
<role content-type="https://casrai.org/credit/">Formal analysis</role>
<role content-type="https://casrai.org/credit/">Validation</role>
<role content-type="https://casrai.org/credit/">Writing – original draft</role>
<role content-type="https://casrai.org/credit/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff004"><sup>4</sup></xref>
<xref ref-type="corresp" rid="cor001">*</xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">https://orcid.org/0000-0002-2249-8335</contrib-id>
<name name-style="western">
<surname>Gonzalez-Reymundez</surname>
<given-names>Agustin</given-names>
</name>
<role content-type="https://casrai.org/credit/">Formal analysis</role>
<role content-type="https://casrai.org/credit/">Visualization</role>
<role content-type="https://casrai.org/credit/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff005"><sup>5</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<name name-style="western">
<surname>Simianer</surname>
<given-names>Henner</given-names>
</name>
<role content-type="https://casrai.org/credit/">Conceptualization</role>
<role content-type="https://casrai.org/credit/">Funding acquisition</role>
<role content-type="https://casrai.org/credit/">Resources</role>
<role content-type="https://casrai.org/credit/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff004"><sup>4</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">https://orcid.org/0000-0002-9083-1216</contrib-id>
<name name-style="western">
<surname>Mias</surname>
<given-names>George</given-names>
</name>
<role content-type="https://casrai.org/credit/">Conceptualization</role>
<role content-type="https://casrai.org/credit/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff003"><sup>3</sup></xref>
<xref ref-type="aff" rid="aff006"><sup>6</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<name name-style="western">
<surname>Vazquez</surname>
<given-names>Ana I.</given-names>
</name>
<role content-type="https://casrai.org/credit/">Conceptualization</role>
<role content-type="https://casrai.org/credit/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff003"><sup>3</sup></xref>
</contrib>
</contrib-group>
<aff id="aff001"><label>1</label> <addr-line>Epidemiology &amp; Biostatistics, Michigan State University, East Lansing, MI, United States of America</addr-line></aff>
<aff id="aff002"><label>2</label> <addr-line>Statistics &amp; Probability, Michigan State University, East Lansing, MI, United States of America</addr-line></aff>
<aff id="aff003"><label>3</label> <addr-line>Institute for Quantitative Health Science and Engineering, East Lansing, MI, United States of America</addr-line></aff>
<aff id="aff004"><label>4</label> <addr-line>Department of Animal Sciences, Center for Integrated Breeding Research, University of Goettingen, Goettingen, Germany</addr-line></aff>
<aff id="aff005"><label>5</label> <addr-line>Genetics and Genome Sciences Graduate Program, Michigan State University, East Lansing, MI, United States of America</addr-line></aff>
<aff id="aff006"><label>6</label> <addr-line>Biochemistry and Molecular Biology, Michigan State University, East Lansing, MI, United States of America</addr-line></aff>
<contrib-group>
<contrib contrib-type="editor" xlink:type="simple">
<name name-style="western">
<surname>Fritsche-Neto</surname>
<given-names>Roberto</given-names>
</name>
<role>Editor</role>
<xref ref-type="aff" rid="edit1"/>
</contrib>
</contrib-group>
<aff id="edit1"><addr-line>University of Sao Paulo/Luiz de Queiroz Agriculture College, BRAZIL</addr-line></aff>
<author-notes>
<fn fn-type="conflict" id="coi001">
<p>AIV, AGR, and GDLC received financial support from Zoetis. The funders did not have any additional role in the study design, data collection and analysis, decision to publish, or preparation of the manuscript. The funding provided by Zoetis did not alter our adherence to PLOS ONE policies on sharing data and materials.</p>
</fn>
<corresp id="cor001">* E-mail: <email xlink:type="simple">gustavoc@msu.edu</email> (GC); <email xlink:type="simple">torsten.pook@agr.uni-goettingen.de</email> (TP)</corresp>
</author-notes>
<pub-date pub-type="epub">
<day>14</day>
<month>12</month>
<year>2020</year>
</pub-date>
<pub-date pub-type="collection">
<year>2020</year>
</pub-date>
<volume>15</volume>
<issue>12</issue>
<elocation-id>e0243251</elocation-id>
<history>
<date date-type="received">
<day>22</day>
<month>6</month>
<year>2020</year>
</date>
<date date-type="accepted">
<day>17</day>
<month>11</month>
<year>2020</year>
</date>
</history>
<permissions>
<copyright-year>2020</copyright-year>
<copyright-holder>de los Campos et al</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">
<license-p>This is an open access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">Creative Commons Attribution License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.</license-p>
</license>
</permissions>
<self-uri content-type="pdf" xlink:href="info:doi/10.1371/journal.pone.0243251"/>
<abstract>
<p>Modern genomic data sets often involve multiple data-layers (e.g., DNA-sequence, gene expression), each of which itself can be high-dimensional. The biological processes underlying these data-layers can lead to intricate multivariate association patterns. We propose and evaluate two methods to determine the proportion of variance of an output data set that can be explained by an input data set when both data panels are high dimensional. Our approach uses random-effects models to estimate the proportion of variance of vectors in the linear span of the output set that can be explained by regression on the input set. We consider a method based on an orthogonal basis (Eigen-ANOVA) and one that uses random vectors (Monte Carlo ANOVA, MC-ANOVA) in the linear span of the output set. Using simulations, we show that the MC-ANOVA method gave nearly unbiased estimates. Estimates produced by Eigen-ANOVA were also nearly unbiased, except when the shared variance was very high (e.g., &gt;0.9). We demonstrate the potential insight that can be obtained from the use of MC-ANOVA and Eigen-ANOVA by applying these two methods to the study of multi-locus linkage disequilibrium in chicken (<italic>Gallus gallus</italic>) genomes and to the assessment of inter-dependencies between gene expression, methylation, and copy-number-variants in data from breast cancer tumors from humans (<italic>Homo sapiens</italic>). Our analyses reveal that in chicken breeding populations ~50,000 evenly-spaced SNPs are enough to fully capture the span of whole-genome-sequencing genomes. In the study of multi-omic breast cancer data, we found that the span of copy-number-variants can be fully explained using either methylation or gene expression data and that roughly 74% of the variance in gene expression can be predicted from methylation data.</p>
</abstract>
<funding-group>
<award-group id="award001">
<funding-source>
<institution-wrap>
<institution-id institution-id-type="funder-id">http://dx.doi.org/10.13039/501100001659</institution-id>
<institution>Deutsche Forschungsgemeinschaft</institution>
</institution-wrap>
</funding-source>
<award-id>152112243</award-id>
<principal-award-recipient>
<name name-style="western">
<surname>Simianer</surname>
<given-names>Henner</given-names>
</name>
</principal-award-recipient>
</award-group>
<award-group id="award002">
<funding-source>
<institution-wrap>
<institution-id institution-id-type="funder-id">http://dx.doi.org/10.13039/100012895</institution-id>
<institution>Zoetis</institution>
</institution-wrap>
</funding-source>
<principal-award-recipient>
<name name-style="western">
<surname>Vazquez</surname>
<given-names>Ana I.</given-names>
</name>
</principal-award-recipient>
</award-group>
<funding-statement>GDLC was supported by a Mercator-fellowship of the German Research Foundation (DFG) within the Research Training Group 1644, “Scaling problems in statistics” (grant no. 152112243) at the University of Goettingen. AIV, AGR and GDLC received support from a research grant sponsored by Zoetis. The funder provided support in the form of a research grant; the funds were used to cover university salaries for GDLC, AIV, and AGR. The funders did not have any additional role in the study design, data collection and analysis, decision to publish, or preparation of the manuscript. The specific roles of these authors are articulated in the ‘author contributions’ section.</funding-statement>
</funding-group>
<counts>
<fig-count count="3"/>
<table-count count="3"/>
<page-count count="18"/>
</counts>
<custom-meta-group>
<custom-meta id="data-availability">
<meta-name>Data Availability</meta-name>
<meta-value>- The wheat data set is publicly available with the BGLR R-package (doi:<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1534/genetics.114.164442" xlink:type="simple">10.1534/genetics.114.164442</ext-link>) - The Breast Cancer Data can be obtained from The Cancer Genome Atlas (<ext-link ext-link-type="uri" xlink:href="https://www.cancer.gov/tcga" xlink:type="simple">https://www.cancer.gov/tcga</ext-link>) - The whole-genome chicken sequence data was previously used by Ni et al. (2015) and is available at DOI: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.6070/H47H1GKK" xlink:type="simple">10.6070/H47H1GKK</ext-link> - Array data from chromosome 1 of the chicken genomes was previously used by Pook et al. (Genetics, 2019) and is available at <ext-link ext-link-type="uri" xlink:href="https://github.com/tpook92/HaploBlocker/blob/master/Data/chicken_breeding_chromo1.RData" xlink:type="simple">https://github.com/tpook92/HaploBlocker/blob/master/Data/chicken_breeding_chromo1.RData</ext-link>.</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="sec001" sec-type="intro">
<title>Introduction</title>
<p>Modern genomic data often combine information from multiple data-layers, each of which itself can be high-dimensional. Examples of this include data sets comprising information from several omics, or those combining genomic information with high-throughput phenotyping (e.g., crop-imaging, milk infrared spectra data). The biological processes underlying each of the data-layers can induce complex dependencies between features within each layer (e.g., linkage disequilibrium among single nucleotide polymorphisms, SNPs) as well as between layers (e.g., the association between DNA and gene expression, GE). The main goal of this study is to develop and to evaluate methods to quantify multivariate-associations in settings in which both the input and output sets are high dimensional.</p>
<p>The methods proposed in this study can be used to answer ubiquitous questions such as: How much of the inter-individual differences in whole-genome sequence genotypes can be predicted using a low-density SNP array? What proportion of variance in GE can be explained by differences in DNA methylation (ME)? How much of the variance in image-derived phenotypes can be predicted from DNA genotypes?</p>
<p>Canonical Correlation Analysis (CCA, [<xref ref-type="bibr" rid="pone.0243251.ref001">1</xref>]), Multivariate Analysis of Variance (MANOVA, [<xref ref-type="bibr" rid="pone.0243251.ref002">2</xref>]), and Reduced-Rank Regressions (e.g., Partial Least Squares, PLS, [<xref ref-type="bibr" rid="pone.0243251.ref003">3</xref>]) are three methodologies often used to assess associations in multi-dimensional problems. However, these approaches have limitations that make some of them inadequate for estimating the proportion of variance explained when both the output and input layers are high-dimensional.</p>
<p>Canonical Correlation Analysis extends the concept of the correlation between two random variables to a multivariate context. However, correlation is symmetric by nature. Therefore, CCA cannot address questions regarding the proportion of variance explained when the proportion of variance of one set (e.g., <bold><italic>X</italic></bold>) that is explained by another set (<bold><italic>W</italic></bold>) is not equal to the converse (i.e., the proportion of variance of <bold><italic>W</italic></bold> that can be explained by <bold><italic>X</italic></bold>). Many multi-layered data sets are not expected to have a symmetric variance-decomposition (we will illustrate this using simulated and experimental data).</p>
<p>Multivariate Analysis of Variance (MANOVA, [<xref ref-type="bibr" rid="pone.0243251.ref004">4</xref>]) is often used for ANOVA when both the response and the explanatory data sets are multi-dimensional. However, MANOVA is based on least-squares projections; therefore, the methodology is not well-suited for cases when the data are high-dimensional, including rank-deficient cases. Most of the problems that we focus on involve high-dimensional data where the number of features exceeds sample size, thus making least-squares methods such as MANOVA inadequate.</p>
<p>Reduced-rank regressions [<xref ref-type="bibr" rid="pone.0243251.ref005">5</xref>] and penalized multivariate analysis methods [<xref ref-type="bibr" rid="pone.0243251.ref006">6</xref>] are often used to analyze high-dimensional data. However, the results that one can obtain using regularized methods rely on regularization decisions (e.g., the number of dimensions used in PLS or CCA, or the sparsity parameters in sparse CCA) which cannot be made using fitness (e.g., the likelihood function) or lack-of-fit measures (e.g., residual sum of squares) evaluated in the training data. Thus, these parameters are often tuned to maximize prediction accuracy in testing sets. However, solutions derived by maximizing cross-validation prediction accuracy are not necessarily optimal for inferences because prediction accuracy is highly dependent on the relationship between sample size (<italic>n</italic>) and model complexity (e.g., number of parameters, <italic>p</italic>). Thus, in cases where <italic>p&gt;&gt;n</italic>, optimal prediction accuracy may be achieved with a highly parsimonious model (e.g., a principal component regression based on a few axes) that can produce severely biased estimates of effects. Therefore, to overcome the limitations of existing methods, in this study, we developed and evaluated approaches for estimating the proportion of variance explained when both the input and output sets are high-dimensional.</p>
</sec>
<sec id="sec002" sec-type="results">
<title>Results</title>
<p>We developed two methods that use random-effects models to estimate the proportion of variance of independent vectors in the linear span of an output layer that can be explained by regression on an input layer. We considered two approaches for generating a sequence of independent vectors in the linear span of the output layer: A Monte Carlo method (MC-ANOVA) which uses random vectors, and one based on eigenvectors (Eigen-ANOVA).</p>
<sec id="sec003">
<title>Setting the stage</title>
<p>Consider a data set consisting of two numeric matrices, <bold><italic>X</italic></bold><sub><italic>n</italic>×<italic>p</italic></sub> and <bold><italic>W</italic></bold><sub><italic>n</italic>×<italic>q</italic></sub>, holding data for <italic>n</italic> individuals (rows) and <italic>p</italic> (<bold><italic>X</italic></bold>) and <italic>q</italic> (<bold><italic>W</italic></bold>) features in columns, respectively. For instance, <bold><italic>X</italic></bold> may be a matrix with genotype codes at <italic>p</italic> SNPs and <bold><italic>W</italic></bold> may be a matrix providing GE levels assessed at <italic>q</italic> genes. The columns of <bold><italic>X</italic></bold> = {<bold><italic>x</italic></bold><sub>1</sub>,<bold><italic>x</italic></bold><sub>2</sub>,…,<bold><italic>x</italic></bold><sub><italic>p</italic></sub>} and of <bold><italic>W</italic></bold> = {<bold><italic>w</italic></bold><sub>1</sub>,<bold><italic>w</italic></bold><sub>2</sub>,…,<bold><italic>w</italic></bold><sub><italic>q</italic></sub>} can be viewed as axes spanning two linear spaces (<italic>L</italic><sub><italic>X</italic></sub> and <italic>L</italic><sub><italic>W</italic></sub>, respectively). 
The linear spans (extensions to nonlinear settings will be addressed in the discussion) of <bold><italic>X</italic></bold> and <bold><italic>W</italic></bold> consist of all the vectors that can be obtained by forming linear combinations of the columns of each of these sets, that is <inline-formula id="pone.0243251.e001"><alternatives><graphic id="pone.0243251.e001g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e001" xlink:type="simple"/><mml:math display="inline" id="M1"><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mi>X</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo>{</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi mathvariant="bold-italic">x</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>:</mml:mo><mml:msub><mml:mrow><mml:mi mathvariant="bold-italic">x</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi mathvariant="bold-italic">X</mml:mi><mml:msub><mml:mrow><mml:mi mathvariant="bold-italic">α</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:msubsup><mml:mo stretchy="false">∑</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:msub><mml:mrow><mml:mi mathvariant="bold-italic">x</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>α</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mrow></mml:mrow><mml:mo>}</mml:mo></mml:math></alternatives></inline-formula> and <inline-formula id="pone.0243251.e002"><alternatives><graphic id="pone.0243251.e002g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e002" xlink:type="simple"/><mml:math display="inline" 
id="M2"><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mi>W</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo>{</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi mathvariant="bold-italic">w</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>:</mml:mo><mml:msub><mml:mrow><mml:mi mathvariant="bold-italic">w</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi mathvariant="bold-italic">W</mml:mi><mml:msub><mml:mrow><mml:mi mathvariant="bold-italic">δ</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:msubsup><mml:mo stretchy="false">∑</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>q</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:msub><mml:mrow><mml:mi mathvariant="bold-italic">w</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>δ</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mrow></mml:mrow><mml:mo>}</mml:mo></mml:math></alternatives></inline-formula>, for all real-valued vectors <bold><italic>α</italic></bold><sub><italic>s</italic></sub> = {<italic>α</italic><sub><italic>s</italic>1</sub>,…,<italic>α</italic><sub><italic>sp</italic></sub>} and <bold><italic>δ</italic></bold><sub><italic>s</italic></sub> = {<italic>δ</italic><sub><italic>s</italic>1</sub>,…,<italic>δ</italic><sub><italic>sq</italic></sub>}. In the following, we will use <bold><italic>W</italic></bold> as the input set and <bold><italic>X</italic></bold> as the output set; however, the methods proposed are not symmetric and additional knowledge can be gained by switching the roles of <bold><italic>X</italic></bold> and <bold><italic>W</italic></bold>.</p>
<p>For each vector <bold><italic>x</italic></bold><sub><italic>s</italic></sub>∈<italic>L</italic><sub><italic>X</italic></sub>, one can estimate the proportion of variance that can be explained by linear regression on <italic>L</italic><sub><italic>W</italic></sub> using a model of the form
<disp-formula id="pone.0243251.e003">
<alternatives>
<graphic id="pone.0243251.e003g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e003" xlink:type="simple"/>
<mml:math display="block" id="M3">
<mml:msub><mml:mrow><mml:mi mathvariant="bold-italic">x</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi mathvariant="bold-italic">W</mml:mi><mml:mi mathvariant="bold-italic">β</mml:mi><mml:mo>+</mml:mo><mml:mi mathvariant="bold-italic">ε</mml:mi><mml:mo>.</mml:mo>
</mml:math>
</alternatives>
<label>(1)</label>
</disp-formula></p>
<p>For cases where <italic>q</italic> is large, the proportion of variance of <bold><italic>x</italic></bold><sub><italic>s</italic></sub> that can be explained by regression on <italic>L</italic><sub><italic>W</italic></sub> <inline-formula id="pone.0243251.e004"><alternatives><graphic id="pone.0243251.e004g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e004" xlink:type="simple"/><mml:math display="inline" id="M4"><mml:mo>(</mml:mo><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:mo>)</mml:mo></mml:math></alternatives></inline-formula> can be estimated by regarding both <bold><italic>β</italic></bold> and <bold><italic>ε</italic></bold> as Gaussian independent random variables, <inline-formula id="pone.0243251.e005"><alternatives><graphic id="pone.0243251.e005g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e005" xlink:type="simple"/><mml:math display="inline" id="M5"><mml:mi>β</mml:mi><mml:mtable><mml:mtr><mml:mtd><mml:mi>i</mml:mi><mml:mi>i</mml:mi><mml:mi>d</mml:mi></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>∼</mml:mo></mml:mtd></mml:mtr></mml:mtable><mml:mi>N</mml:mi><mml:mo>(</mml:mo><mml:mrow><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mi>σ</mml:mi></mml:mrow><mml:mrow><mml:mi>β</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:mrow><mml:mo>)</mml:mo></mml:math></alternatives></inline-formula> and, <inline-formula id="pone.0243251.e006"><alternatives><graphic id="pone.0243251.e006g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e006" xlink:type="simple"/><mml:math display="inline" 
id="M6"><mml:mi>ε</mml:mi><mml:mtable><mml:mtr><mml:mtd><mml:mi>i</mml:mi><mml:mi>i</mml:mi><mml:mi>d</mml:mi></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>∼</mml:mo></mml:mtd></mml:mtr></mml:mtable><mml:mi>N</mml:mi><mml:mo>(</mml:mo><mml:mrow><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mi>σ</mml:mi></mml:mrow><mml:mrow><mml:mi>ε</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:mrow><mml:mo>)</mml:mo></mml:math></alternatives></inline-formula>. Upon appropriate scaling of the columns of <bold><italic>X</italic></bold> (see <xref ref-type="sec" rid="sec014">Materials and Methods</xref> for details) <inline-formula id="pone.0243251.e007"><alternatives><graphic id="pone.0243251.e007g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e007" xlink:type="simple"/><mml:math display="inline" id="M7"><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msubsup><mml:mrow><mml:mi>σ</mml:mi></mml:mrow><mml:mrow><mml:mi>β</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:mrow><mml:mrow><mml:msubsup><mml:mrow><mml:mi>σ</mml:mi></mml:mrow><mml:mrow><mml:mi>β</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:mo>+</mml:mo><mml:msubsup><mml:mrow><mml:mi>σ</mml:mi></mml:mrow><mml:mrow><mml:mi>ε</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:mrow></mml:mfrac></mml:math></alternatives></inline-formula> can be interpreted as the proportion of variance of <bold><italic>x</italic></bold><sub><italic>s</italic></sub> that could be explained by regression on the features included in <bold><italic>W</italic></bold>. 
The variance parameters involved (<inline-formula id="pone.0243251.e008"><alternatives><graphic id="pone.0243251.e008g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e008" xlink:type="simple"/><mml:math display="inline" id="M8"><mml:msubsup><mml:mrow><mml:mi>σ</mml:mi></mml:mrow><mml:mrow><mml:mi>β</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:math></alternatives></inline-formula> and <inline-formula id="pone.0243251.e009"><alternatives><graphic id="pone.0243251.e009g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e009" xlink:type="simple"/><mml:math display="inline" id="M9"><mml:msubsup><mml:mrow><mml:mi>σ</mml:mi></mml:mrow><mml:mrow><mml:mi>ε</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:math></alternatives></inline-formula>) can be estimated using Bayesian or Likelihood methods (e.g., restricted maximum likelihood, REML, [<xref ref-type="bibr" rid="pone.0243251.ref007">7</xref>]), as such methods were designed to handle common challenges of overfitting and collinearity in high dimensional data.</p>
<p>In the preceding paragraph we describe how one can estimate the proportion of variance of a vector in <italic>L</italic><sub><italic>X</italic></sub> (<bold><italic>x</italic></bold><sub><italic>s</italic></sub>) that can be explained by regression on <bold><italic>W</italic></bold>. Next, we generalize the idea to all vectors in <italic>L</italic><sub><italic>X</italic></sub>. However, <italic>L</italic><sub><italic>X</italic></sub> contains an infinite number of vectors; therefore, some approximation is needed. Perhaps the most natural approach for estimating the proportion of variance of vectors in <italic>L</italic><sub><italic>X</italic></sub> that can be explained by regression on <italic>L</italic><sub><italic>W</italic></sub> is to regress each of the columns of <bold><italic>X</italic></bold> on <bold><italic>W</italic></bold>. Such an analysis would produce a sequence of R<sup>2</sup> estimates <inline-formula id="pone.0243251.e010"><alternatives><graphic id="pone.0243251.e010g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e010" xlink:type="simple"/><mml:math display="inline" 
id="M10"><mml:mo>{</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:mo>…</mml:mo><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:mrow><mml:mo>}</mml:mo></mml:math></alternatives></inline-formula>, and the average R<sup>2</sup>, <inline-formula id="pone.0243251.e011"><alternatives><graphic id="pone.0243251.e011g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e011" xlink:type="simple"/><mml:math display="inline" id="M11"><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>X</mml:mi><mml:mo>∼</mml:mo><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mo>−</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup><mml:mrow><mml:msubsup><mml:mo stretchy="false">∑</mml:mo><mml:mrow><mml:mi>s</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:mrow></mml:mrow></mml:math></alternatives></inline-formula>, could be used to estimate the overall proportion of variance of 
<bold><italic>X</italic></bold> that could be explained by regression on <bold><italic>W</italic></bold>. However, one limitation of this approach is that the columns of <bold><italic>X</italic></bold> are not necessarily independent. Many features may cluster (e.g., genes may be co-expressed, or SNPs may be in high linkage-disequilibrium) leading to groups of highly unbalanced sizes. When some features are highly correlated, the simple average of individual R<sup>2</sup>-values may be driven by a few clusters of the columns of <bold><italic>X</italic></bold>. Furthermore, when <bold><italic>X</italic></bold> is ultra-high dimensional (e.g., hundreds of thousands or millions of features) estimating <inline-formula id="pone.0243251.e012"><alternatives><graphic id="pone.0243251.e012g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e012" xlink:type="simple"/><mml:math display="inline" id="M12"><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:math></alternatives></inline-formula> (<italic>s = 1</italic>,<italic>…</italic>,<italic>p</italic>) one-feature-at-a-time will be computationally challenging. Therefore, to address these problems, we propose two methods that use independent vectors from the span of the output set; each of these methods is explained next.</p>
</sec>
<sec id="sec004">
<title>Monte Carlo analysis of variance (MC-ANOVA)</title>
<p>Since <italic>L</italic><sub><italic>X</italic></sub> is infinite, one cannot estimate <inline-formula id="pone.0243251.e013"><alternatives><graphic id="pone.0243251.e013g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e013" xlink:type="simple"/><mml:math display="inline" id="M13"><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:math></alternatives></inline-formula> for all vectors in <italic>L</italic><sub><italic>X</italic></sub>. However, one can ‘explore’ the linear span of the output set by generating random vectors in <italic>L</italic><sub><italic>X</italic></sub> of the form <bold><italic>x</italic></bold><sub><italic>s</italic></sub> = <bold><italic>Xα</italic></bold><sub><italic>s</italic></sub>, where <bold><italic>α</italic></bold><sub><italic>s</italic></sub> is sampled from some distribution. This can be repeated for a large number of vectors in <italic>L</italic><sub><italic>X</italic></sub> to produce a sequence of estimates <inline-formula id="pone.0243251.e014"><alternatives><graphic id="pone.0243251.e014g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e014" xlink:type="simple"/><mml:math display="inline" id="M14"><mml:mo>{</mml:mo><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:mo>}</mml:mo></mml:math></alternatives></inline-formula>, and the resulting sequence can be used to estimate the average proportion of variance explained as well as other features of the distribution of the sequence. The method is summarized in <xref ref-type="boxed-text" rid="pone.0243251.box001">Box 1</xref>. 
Importantly, if <bold><italic>α</italic></bold><sub><italic>s</italic></sub> and <bold><italic>α</italic></bold><sub><italic>s</italic>′</sub> are independent, so are <bold><italic>x</italic></bold><sub><italic>s</italic></sub> and <bold><italic>x</italic></bold><sub><italic>s</italic>′</sub>. Indeed, noting that <bold><italic>X</italic></bold> is not random and assuming that <bold><italic>α</italic></bold><sub><italic>s</italic></sub> and <bold><italic>α</italic></bold><sub><italic>s</italic>′</sub> are sampled independently, we have that <inline-formula id="pone.0243251.e015"><alternatives><graphic id="pone.0243251.e015g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e015" xlink:type="simple"/><mml:math display="inline" id="M15"><mml:mi>p</mml:mi><mml:mo>(</mml:mo><mml:mrow><mml:mi mathvariant="bold-italic">X</mml:mi><mml:msub><mml:mrow><mml:mi mathvariant="bold-italic">α</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi mathvariant="bold-italic">X</mml:mi><mml:msub><mml:mrow><mml:mi mathvariant="bold-italic">α</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mo>′</mml:mo></mml:mrow></mml:msub></mml:mrow><mml:mo>)</mml:mo><mml:mo>=</mml:mo><mml:mi>p</mml:mi><mml:mo>(</mml:mo><mml:mrow><mml:mi mathvariant="bold-italic">X</mml:mi><mml:msub><mml:mrow><mml:mi mathvariant="bold-italic">α</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>|</mml:mo><mml:mi mathvariant="bold-italic">X</mml:mi><mml:msub><mml:mrow><mml:mi mathvariant="bold-italic">α</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mo>′</mml:mo></mml:mrow></mml:msub></mml:mrow><mml:mo>)</mml:mo><mml:mi>p</mml:mi><mml:mo>(</mml:mo><mml:mrow><mml:mi mathvariant="bold-italic">X</mml:mi><mml:msub><mml:mrow><mml:mi 
mathvariant="bold-italic">α</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mo>′</mml:mo></mml:mrow></mml:msub></mml:mrow><mml:mo>)</mml:mo><mml:mo>=</mml:mo><mml:mi>p</mml:mi><mml:mo>(</mml:mo><mml:mrow><mml:mi mathvariant="bold-italic">X</mml:mi><mml:msub><mml:mrow><mml:mi mathvariant="bold-italic">α</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>)</mml:mo><mml:mi>p</mml:mi><mml:mo>(</mml:mo><mml:mrow><mml:mi mathvariant="bold-italic">X</mml:mi><mml:msub><mml:mrow><mml:mi mathvariant="bold-italic">α</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mo>′</mml:mo></mml:mrow></mml:msub><mml:mo>|</mml:mo><mml:mi mathvariant="bold-italic">X</mml:mi></mml:mrow><mml:mo>)</mml:mo></mml:math></alternatives></inline-formula>; therefore, <bold><italic>x</italic></bold><sub><italic>s</italic></sub> and <bold><italic>x</italic></bold><sub><italic>s</italic>′</sub> are independent.</p>
<boxed-text id="pone.0243251.box001" position="float">
<sec id="sec005">
<title>Box 1. Monte Carlo analysis of variance (MC-ANOVA)</title>
<list list-type="order">
<list-item><p>Draw a random vector <bold><italic>α</italic></bold><sub><italic>s</italic></sub> from a proper multivariate distribution.</p></list-item>
<list-item><p>Form the linear combination <bold><italic>x</italic></bold><sub><italic>s</italic></sub> = <bold><italic>Xα</italic></bold><sub><italic>s</italic></sub>.</p></list-item>
<list-item><p>Estimate the proportion of variance of <bold><italic>x</italic></bold><sub><italic>s</italic></sub> <inline-formula id="pone.0243251.e016"><alternatives><graphic id="pone.0243251.e016g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e016" xlink:type="simple"/><mml:math display="inline" id="M16"><mml:mo>(</mml:mo><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:mo>)</mml:mo></mml:math></alternatives></inline-formula> using a random-effects model (expression [<xref ref-type="bibr" rid="pone.0243251.ref001">1</xref>]) with variance parameters estimated using either Bayesian or likelihood-type methods.</p></list-item>
<list-item><p>Repeat 1–3 <italic>B</italic> times (e.g., <italic>B = 10</italic>,<italic>000</italic>).</p></list-item>
<list-item><p>Use the sequence of estimated R-squared <inline-formula id="pone.0243251.e017"><alternatives><graphic id="pone.0243251.e017g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e017" xlink:type="simple"/><mml:math display="inline" id="M17"><mml:mo>{</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:mo>…</mml:mo><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>B</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:mrow><mml:mo>}</mml:mo></mml:math></alternatives></inline-formula> to approximate the distribution of the <inline-formula id="pone.0243251.e018"><alternatives><graphic id="pone.0243251.e018g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e018" xlink:type="simple"/><mml:math display="inline" id="M18"><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:math></alternatives></inline-formula>. 
An estimate of <inline-formula id="pone.0243251.e019"><alternatives><graphic id="pone.0243251.e019g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e019" xlink:type="simple"/><mml:math display="inline" id="M19"><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>X</mml:mi><mml:mo>∼</mml:mo><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:math></alternatives></inline-formula> can be obtained as the median or the average, <inline-formula id="pone.0243251.e020"><alternatives><graphic id="pone.0243251.e020g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e020" xlink:type="simple"/><mml:math display="inline" id="M20"><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>X</mml:mi><mml:mo>∼</mml:mo><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mi>B</mml:mi></mml:mrow><mml:mrow><mml:mo>−</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup><mml:mrow><mml:msubsup><mml:mo stretchy="false">∑</mml:mo><mml:mrow><mml:mi>s</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>B</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:mrow></mml:mrow></mml:math></alternatives></inline-formula>, of the R<sup>2</sup> values in the sequence.</p></list-item>
</list>
</sec>
</boxed-text>
<p>In <xref ref-type="boxed-text" rid="pone.0243251.box001">Box 1</xref> we did not specify how the <bold><italic>α</italic></bold><sub><italic>s</italic></sub> are generated; for this aspect of the algorithm there are countless possibilities: weights can be sampled from distributions with continuous support (e.g., <italic>p</italic>-variate Gaussian) or from mixture models with a point mass at zero. The weights may be independent or correlated, and the distributions may be symmetric or skewed. We will show later on (using data from chicken genomes) that the process used to generate the weights may affect some features of the distribution of the R<sup>2</sup> values, albeit not necessarily the mean or the median R<sup>2</sup>. The possibility of using different processes for generating random vectors in <italic>L</italic><sub><italic>X</italic></sub> gives the MC-ANOVA a great deal of flexibility. For example, this method could be used to assess how the distribution of the proportion of variance explained may change for different trait architectures–we will explore that flexibility in greater detail in one of the case studies presented below.</p>
</sec>
<sec id="sec006">
<title>Regression using orthogonal basis (Eigen-ANOVA)</title>
<p>An orthogonal basis for the row-space of <bold><italic>X</italic></bold> can be obtained from the singular-value decomposition of <inline-formula id="pone.0243251.e021"><alternatives><graphic id="pone.0243251.e021g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e021" xlink:type="simple"/><mml:math display="inline" id="M21"><mml:mi mathvariant="bold-italic">X</mml:mi><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi mathvariant="bold-italic">U</mml:mi></mml:mrow><mml:mrow><mml:mi>X</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi mathvariant="bold-italic">D</mml:mi></mml:mrow><mml:mrow><mml:mi>X</mml:mi></mml:mrow></mml:msub><mml:msubsup><mml:mrow><mml:mi mathvariant="bold-italic">V</mml:mi></mml:mrow><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mo>′</mml:mo></mml:mrow></mml:msubsup></mml:math></alternatives></inline-formula>, where <bold><italic>U</italic></bold><sub><italic>X</italic></sub> and <bold><italic>V</italic></bold><sub><italic>X</italic></sub> are the left- and right-singular vectors of <bold><italic>X</italic></bold> respectively, and <bold><italic>D</italic></bold><sub><italic>X</italic></sub> is a diagonal matrix with the singular values of <bold><italic>X</italic></bold> in the diagonal. 
Both <bold><italic>U</italic></bold><sub><italic>X</italic></sub> and <bold><italic>V</italic></bold><sub><italic>X</italic></sub> are orthonormal, thus <inline-formula id="pone.0243251.e022"><alternatives><graphic id="pone.0243251.e022g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e022" xlink:type="simple"/><mml:math display="inline" id="M22"><mml:msubsup><mml:mrow><mml:mi mathvariant="bold-italic">U</mml:mi></mml:mrow><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mo>′</mml:mo></mml:mrow></mml:msubsup><mml:msub><mml:mrow><mml:mi mathvariant="bold-italic">U</mml:mi></mml:mrow><mml:mrow><mml:mi>X</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi mathvariant="bold-italic">I</mml:mi></mml:math></alternatives></inline-formula> and <inline-formula id="pone.0243251.e023"><alternatives><graphic id="pone.0243251.e023g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e023" xlink:type="simple"/><mml:math display="inline" id="M23"><mml:msubsup><mml:mrow><mml:mi mathvariant="bold-italic">V</mml:mi></mml:mrow><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mo>′</mml:mo></mml:mrow></mml:msubsup><mml:msub><mml:mrow><mml:mi mathvariant="bold-italic">V</mml:mi></mml:mrow><mml:mrow><mml:mi>X</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi mathvariant="bold-italic">I</mml:mi></mml:math></alternatives></inline-formula>. Each vector in <italic>L</italic><sub><italic>X</italic></sub> can be represented as a linear combination of the left-singular vectors of <bold><italic>X</italic></bold>. 
Therefore, our second method estimates the proportion of variance of each of the left-singular vectors of <bold><italic>X</italic></bold> that can be explained by regression on <bold><italic>W</italic></bold>, and produces a global R<sup>2</sup> estimate using a weighted sum of the R<sup>2</sup> estimated for each singular vector (<xref ref-type="boxed-text" rid="pone.0243251.box002">Box 2</xref>, note that <inline-formula id="pone.0243251.e024"><alternatives><graphic id="pone.0243251.e024g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e024" xlink:type="simple"/><mml:math display="inline" id="M24"><mml:msubsup><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:math></alternatives></inline-formula> and <bold><italic>U</italic></bold><sub><italic>X</italic></sub> in <xref ref-type="boxed-text" rid="pone.0243251.box002">Box 2</xref> are also the non-zero eigenvalues and the eigenvectors of <bold><italic>XX</italic></bold>′, respectively).</p>
<boxed-text id="pone.0243251.box002" position="float">
<sec id="sec007">
<title>Box 2. Eigen-ANOVA</title>
<list list-type="order">
<list-item><p>Generate an orthogonal basis for <italic>L</italic><sub><italic>X</italic></sub>; for instance, compute the singular-value decomposition of <inline-formula id="pone.0243251.e025"><alternatives><graphic id="pone.0243251.e025g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e025" xlink:type="simple"/><mml:math display="inline" id="M25"><mml:mi mathvariant="bold-italic">X</mml:mi><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi mathvariant="bold-italic">U</mml:mi></mml:mrow><mml:mrow><mml:mi>X</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi mathvariant="bold-italic">D</mml:mi></mml:mrow><mml:mrow><mml:mi>X</mml:mi></mml:mrow></mml:msub><mml:msubsup><mml:mrow><mml:mi mathvariant="bold-italic">V</mml:mi></mml:mrow><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mo>′</mml:mo></mml:mrow></mml:msubsup></mml:math></alternatives></inline-formula> where <inline-formula id="pone.0243251.e026"><alternatives><graphic id="pone.0243251.e026g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e026" xlink:type="simple"/><mml:math display="inline" id="M26"><mml:msubsup><mml:mrow><mml:mi mathvariant="bold-italic">U</mml:mi></mml:mrow><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mo>′</mml:mo></mml:mrow></mml:msubsup><mml:msub><mml:mrow><mml:mi mathvariant="bold-italic">U</mml:mi></mml:mrow><mml:mrow><mml:mi>X</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi mathvariant="bold-italic">I</mml:mi></mml:math></alternatives></inline-formula> and <inline-formula id="pone.0243251.e027"><alternatives><graphic id="pone.0243251.e027g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e027" xlink:type="simple"/><mml:math display="inline" id="M27"><mml:msubsup><mml:mrow><mml:mi mathvariant="bold-italic">V</mml:mi></mml:mrow><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mo>′</mml:mo></mml:mrow></mml:msubsup><mml:msub><mml:mrow><mml:mi 
mathvariant="bold-italic">V</mml:mi></mml:mrow><mml:mrow><mml:mi>X</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi mathvariant="bold-italic">I</mml:mi></mml:math></alternatives></inline-formula> form an orthonormal basis for the row- and column space of <bold><italic>X</italic></bold> respectively, and <bold><italic>D</italic></bold><sub><italic>X</italic></sub> = <italic>Diag</italic>{<italic>d</italic><sub><italic>i</italic></sub>} is a diagonal matrix with the singular values of <bold><italic>X</italic></bold> in its diagonal (<italic>i = 1</italic>,<italic>…</italic>,<italic>r</italic>, where <italic>r</italic> is the rank of <bold><italic>X</italic></bold>).</p></list-item>
<list-item><p>Regress each of the left-singular vectors on <italic>L</italic><sub><italic>W</italic></sub> using a linear model such as that in expression [<xref ref-type="bibr" rid="pone.0243251.ref001">1</xref>] with <bold><italic>u</italic></bold><sub><italic>i</italic></sub> = <bold><italic>x</italic></bold><sub><italic>s</italic></sub>, and estimate the proportion of variance of each vector that can be explained by regression on <italic>L</italic><sub><italic>W</italic></sub>, <inline-formula id="pone.0243251.e028"><alternatives><graphic id="pone.0243251.e028g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e028" xlink:type="simple"/><mml:math display="inline" id="M28"><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>u</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:math></alternatives></inline-formula>.</p></list-item>
<list-item><p>Estimate the global proportion of variance of vectors in <italic>L</italic><sub><italic>X</italic></sub> that can be explained by regression on <italic>L</italic><sub><italic>W</italic></sub> using <inline-formula id="pone.0243251.e029"><alternatives><graphic id="pone.0243251.e029g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e029" xlink:type="simple"/><mml:math display="inline" id="M29"><mml:msup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mrow><mml:msubsup><mml:mo stretchy="false">∑</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>r</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:msubsup><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>u</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:mrow></mml:mrow></mml:mrow><mml:mrow><mml:mrow><mml:msubsup><mml:mo stretchy="false">∑</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>r</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:msubsup><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:mrow></mml:mrow></mml:mrow></mml:mfrac></mml:math></alternatives></inline-formula>.</p></list-item>
</list>
</sec>
</boxed-text>
</sec>
<sec id="sec008">
<title>Statistical properties assessed via simulations</title>
<p>We evaluated the statistical performance of the two methods described above using simulated data panels with a known proportion of variance shared between input and output data sets. We also compared the performance of the two proposed methods with that of Partial Least Squares (PLS, [<xref ref-type="bibr" rid="pone.0243251.ref003">3</xref>])–a method commonly used to analyze high dimensional data. We considered two simulation settings. In both cases, the input set was obtained from a wheat (<italic>Triticum</italic>) genotype data set generated by the International Maize and Wheat Improvement Center (CIMMYT) which contains genotypes at 1,279 DNA-markers assessed in 599 wheat inbred lines (see <xref ref-type="sec" rid="sec014">Materials and Methods</xref> for further details on this data set).</p>
<p>We note here that while a genotype matrix contains strictly discrete values (0/1 or -1/1 for inbred lines and 0/1/2 or -1/0/1 for outbred diploid individuals), its linear span includes vectors in R<sup>n</sup>. The vectors in the linear span of the genotypes can be thought of as ‘breeding values’ formed as linear combinations of genotypes.</p>
<p>In our <bold><italic>first simulation setting</italic></bold>, <bold><italic>W</italic></bold><sub>599×1,279</sub> was the genotype matrix and <bold><italic>X</italic></bold><sub>599×1,279</sub> was obtained by adding <italic>iid</italic> (independent and identically distributed) Gaussian noise to the genotype matrix. We tuned the variance of the noise to generate scenarios of the proportion of variance of <bold><italic>X</italic></bold> explained by <bold><italic>W</italic></bold> ranging from 0 (<bold><italic>X</italic></bold> was pure noise) to 1 (<bold><italic>X = W</italic></bold>). For each simulated data set we then estimated the proportion of variance of <bold><italic>X</italic></bold> explained by regression on <bold><italic>W</italic></bold> using random-effects models, with variance parameters estimated using REML [<xref ref-type="bibr" rid="pone.0243251.ref007">7</xref>] (see <xref ref-type="sec" rid="sec014">Materials and Methods</xref> for details).</p>
<p>The Monte Carlo method estimated the proportion of variance of <bold><italic>X</italic></bold> explained by <bold><italic>W</italic></bold> without any noticeable bias (<bold><xref ref-type="table" rid="pone.0243251.t001">Table 1</xref></bold>). However, the regression of the left-singular vectors of <bold><italic>X</italic></bold> on <bold><italic>W</italic></bold> in Eigen-ANOVA produced estimates that were downwardly biased when the true proportion of variance of <bold><italic>X</italic></bold> explained by <bold><italic>W</italic></bold> was large (e.g., &gt;0.5). Further inspection of the results for individual MC replicates suggested that the bias of the Eigen-ANOVA method was likely due to a relatively large number of ‘corner’ solutions (zero estimated proportion of variance) which were common for high-order eigenvectors (i.e., those with small eigenvalue)–we illustrate this in an analysis of multi-omic cancer data further below. The use of PLS led to downwardly biased estimates in cases where the proportion of variance of <bold><italic>X</italic></bold> explained by <bold><italic>W</italic></bold> was low (0.1, 0.3, 0.5) and to upwardly biased estimates when the proportion of variance explained was high (0.8, 0.9).</p>
<table-wrap id="pone.0243251.t001" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0243251.t001</object-id>
<label>Table 1</label> <caption><title>Average (SD) estimate of the proportion of variance explained by simulation scenario (first column) and estimation method (simulation 1).</title></caption>
<alternatives>
<graphic id="pone.0243251.t001g" mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0243251.t001" xlink:type="simple"/>
<table>
<colgroup>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
</colgroup>
<thead>
<tr>
<th align="center" rowspan="2">True proportion of variance explained</th>
<th align="center" colspan="3">Estimates</th>
</tr>
<tr>
<th align="center">MC-ANOVA</th>
<th align="center">Eigen-ANOVA</th>
<th align="center">PLS</th>
</tr>
</thead>
<tbody>
<tr>
<td align="center">0.0</td>
<td align="center">0.0082 (0.0028)</td>
<td align="center">0.0081 (0.0006)</td>
<td align="center">0.0017 (0.0001)</td>
</tr>
<tr>
<td align="center" style="background-color:#E7E6E6">0.1</td>
<td align="center" style="background-color:#E7E6E6">0.1002 (0.0083)</td>
<td align="center" style="background-color:#E7E6E6">0.0983 (0.0019)</td>
<td align="center" style="background-color:#E7E6E6">0.0478 (0.0034)</td>
</tr>
<tr>
<td align="center">0.3</td>
<td align="center">0.2991 (0.0108)</td>
<td align="center">0.3020 (0.0028)</td>
<td align="center">0.2412 (0.0075)</td>
</tr>
<tr>
<td align="center" style="background-color:#E7E6E6">0.5</td>
<td align="center" style="background-color:#E7E6E6">0.4992 (0.0102)</td>
<td align="center" style="background-color:#E7E6E6">0.5054 (0.0028)</td>
<td align="center" style="background-color:#E7E6E6">0.4865 (0.0076)</td>
</tr>
<tr>
<td align="center">0.8</td>
<td align="center">0.8006 (0.0055)</td>
<td align="center">0.7857 (0.0017)</td>
<td align="center">0.8451 (0.0036)</td>
</tr>
<tr>
<td align="center" style="background-color:#E7E6E6">0.9</td>
<td align="center" style="background-color:#E7E6E6">0.9012 (0.0033)</td>
<td align="center" style="background-color:#E7E6E6">0.8685 (0.0011)</td>
<td align="center" style="background-color:#E7E6E6">0.9403 (0.0016)</td>
</tr>
<tr>
<td align="center">1.0</td>
<td align="center">1.0000 (&lt; .0001)</td>
<td align="center">0.9377 (&lt; .0001)</td>
<td align="center">0.9988 (&lt; .0001)</td>
</tr>
</tbody>
</table>
</alternatives>
</table-wrap>
<p>We then considered a <bold><italic>second simulation setting</italic></bold> to consider cases involving asymmetric proportions of variance explained. To achieve this, we formed <bold><italic>X</italic></bold> using a subset of the wheat marker genotypes (5%, 10%, 30%, 50%, 80%, 90%, 95%) and formed <bold><italic>W</italic></bold> by binding the columns of <bold><italic>X</italic></bold> with additional columns filled with <italic>iid</italic> Gaussian noise (<bold><italic>Z</italic></bold>), that is <bold><italic>W</italic></bold><sub>599×1,279</sub> = [<bold><italic>X</italic></bold><sub>599×<italic>p</italic></sub>, <bold><italic>Z</italic></bold><sub>599×(1,279−<italic>p</italic>)</sub>] (<italic>p</italic> &lt; 1,279). The columns of <bold><italic>X</italic></bold> and <bold><italic>W</italic></bold> were centered and scaled to unit variance; therefore the share of the variance of <bold><italic>W</italic></bold> explained by <bold><italic>X</italic></bold> (<inline-formula id="pone.0243251.e030"><alternatives><graphic id="pone.0243251.e030g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e030" xlink:type="simple"/><mml:math display="inline" id="M30"><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>W</mml:mi><mml:mo>∼</mml:mo><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:math></alternatives></inline-formula>) is known and equal to <inline-formula id="pone.0243251.e031"><alternatives><graphic id="pone.0243251.e031g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e031" xlink:type="simple"/><mml:math display="inline" id="M31"><mml:mfrac><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mn>1,279</mml:mn></mml:mrow></mml:mfrac></mml:math></alternatives></inline-formula>. 
Similarly, the proportion of variance of <bold><italic>X</italic></bold> explained by <bold><italic>W</italic></bold> (<inline-formula id="pone.0243251.e032"><alternatives><graphic id="pone.0243251.e032g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e032" xlink:type="simple"/><mml:math display="inline" id="M32"><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>X</mml:mi><mml:mo>∼</mml:mo><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:math></alternatives></inline-formula>) is 1 because <bold><italic>X</italic></bold> is included in <bold><italic>W</italic></bold>.</p>
<p>In our second simulation study, the MC-ANOVA method rendered nearly unbiased estimates of the proportion of variance of one set explained by the other (<bold><xref ref-type="table" rid="pone.0243251.t002">Table 2</xref></bold>). However, the Eigen-ANOVA method and the PLS produced noticeable biases, with Eigen-ANOVA method again generating downwardly biased estimates in cases where the true proportion of variance explained was high, and the PLS generating downwardly (upwardly) biased estimates whenever the true proportion of variance was low (high).</p>
<table-wrap id="pone.0243251.t002" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0243251.t002</object-id>
<label>Table 2</label> <caption><title>Average (SD) REML estimates of the proportion of variance explained by simulation scenario (first column) and estimation method (simulation 2).</title></caption>
<alternatives>
<graphic id="pone.0243251.t002g" mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0243251.t002" xlink:type="simple"/>
<table>
<colgroup>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
</colgroup>
<thead>
<tr>
<th align="center">Scenario</th>
<th align="center" colspan="3"><italic>X</italic> regressed on <italic>W</italic></th>
<th align="center" colspan="3"><italic>W</italic> regressed on <italic>X</italic></th>
</tr>
<tr>
<th align="center"><inline-formula id="pone.0243251.e033"><alternatives><graphic id="pone.0243251.e033g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e033" xlink:type="simple"/><mml:math display="inline" id="M33"><mml:mfrac><mml:mrow><mml:mo>#</mml:mo><mml:mi mathvariant="bold-italic">C</mml:mi><mml:mi mathvariant="bold-italic">o</mml:mi><mml:mi mathvariant="bold-italic">l</mml:mi><mml:mi mathvariant="bold-italic">u</mml:mi><mml:mi mathvariant="bold-italic">m</mml:mi><mml:mi mathvariant="bold-italic">n</mml:mi><mml:mi mathvariant="bold-italic">s</mml:mi><mml:mspace width="0.25em"/><mml:mi mathvariant="bold-italic">o</mml:mi><mml:mi mathvariant="bold-italic">f</mml:mi><mml:mspace width="0.25em"/><mml:mi mathvariant="bold-italic">X</mml:mi></mml:mrow><mml:mrow><mml:mo>#</mml:mo><mml:mi mathvariant="bold-italic">C</mml:mi><mml:mi mathvariant="bold-italic">o</mml:mi><mml:mi mathvariant="bold-italic">l</mml:mi><mml:mi mathvariant="bold-italic">u</mml:mi><mml:mi mathvariant="bold-italic">m</mml:mi><mml:mi mathvariant="bold-italic">n</mml:mi><mml:mi mathvariant="bold-italic">s</mml:mi><mml:mspace width="0.25em"/><mml:mi mathvariant="bold-italic">o</mml:mi><mml:mi mathvariant="bold-italic">f</mml:mi><mml:mspace width="0.25em"/><mml:mi mathvariant="bold-italic">W</mml:mi></mml:mrow></mml:mfrac></mml:math></alternatives></inline-formula></th>
<th align="center">MC-ANOVA</th>
<th align="center">Eigen-ANOVA</th>
<th align="center">PLS</th>
<th align="center">MC-ANOVA</th>
<th align="center">Eigen-ANOVA</th>
<th align="center">PLS</th>
</tr>
</thead>
<tbody>
<tr>
<td align="center">0.05</td>
<td align="center">0.9960 (0.0039)</td>
<td align="center">0.9085 (0.0051)</td>
<td align="center">0.8885 (0.0069)</td>
<td align="center">0.0505 (0.0050)</td>
<td align="center">0.0548 (0.0012)</td>
<td align="center">0.0244 (0.0029)</td>
</tr>
<tr>
<td align="center" style="background-color:#E7E6E6">0.10</td>
<td align="center" style="background-color:#E7E6E6">0.9972 (0.0030)</td>
<td align="center" style="background-color:#E7E6E6">0.8891 (0.0041)</td>
<td align="center" style="background-color:#E7E6E6">0.9193 (0.0036)</td>
<td align="center" style="background-color:#E7E6E6">0.1000 (0.0072)</td>
<td align="center" style="background-color:#E7E6E6">0.1061 (0.0018)</td>
<td align="center" style="background-color:#E7E6E6">0.0652 (0.0038)</td>
</tr>
<tr>
<td align="center" style="background-color:#FFFFFF">0.30</td>
<td align="center" style="background-color:#FFFFFF">0.9964 (0.0025)</td>
<td align="center" style="background-color:#FFFFFF">0.8835 (0.0024)</td>
<td align="center" style="background-color:#FFFFFF">0.9781 (&lt; .0001)</td>
<td align="center" style="background-color:#FFFFFF">0.2999 (0.0106)</td>
<td align="center" style="background-color:#FFFFFF">0.3060 (0.0028)</td>
<td align="center" style="background-color:#FFFFFF">0.2656 (0.0068)</td>
</tr>
<tr>
<td align="center" style="background-color:#E7E6E6">0.50</td>
<td align="center" style="background-color:#E7E6E6">0.9943 (0.0028)</td>
<td align="center" style="background-color:#E7E6E6">0.8989 (0.0019)</td>
<td align="center" style="background-color:#E7E6E6">0.9954 (&lt; .0001)</td>
<td align="center" style="background-color:#E7E6E6">0.4996 (0.0102)</td>
<td align="center" style="background-color:#E7E6E6">0.4977 (0.0030)</td>
<td align="center" style="background-color:#E7E6E6">0.4902 (0.0072)</td>
</tr>
<tr>
<td align="center">0.80</td>
<td align="center">0.9965 (0.0013)</td>
<td align="center">0.9223 (0.0010)</td>
<td align="center">0.997 (&lt; .0001)</td>
<td align="center">0.8000 (0.0061)</td>
<td align="center">0.7714 (0.0025)</td>
<td align="center">0.8259 (0.0047)</td>
</tr>
<tr>
<td align="center" style="background-color:#E7E6E6">0.90</td>
<td align="center" style="background-color:#E7E6E6">0.9992 (0.0005)</td>
<td align="center" style="background-color:#E7E6E6">0.9302 (0.0008)</td>
<td align="center" style="background-color:#E7E6E6">0.9979 (&lt; .0001)</td>
<td align="center" style="background-color:#E7E6E6">0.9008 (0.0039)</td>
<td align="center" style="background-color:#E7E6E6">0.8593 (0.0019)</td>
<td align="center" style="background-color:#E7E6E6">0.9277 (0.0035)</td>
</tr>
<tr>
<td align="center">0.95</td>
<td align="center">0.9998 (0.0002)</td>
<td align="center">0.9345 (0.0008)</td>
<td align="center">0.9984 (&lt; .0001)</td>
<td align="center">0.9511 (0.0026)</td>
<td align="center">0.9016 (0.0013)</td>
<td align="center">0.9746 (0.0025)</td>
</tr>
</tbody>
</table>
</alternatives>
</table-wrap>
</sec>
<sec id="sec009">
<title>Applications to experimental data</title>
<p>We used the MC-ANOVA and Eigen-ANOVA to quantify the proportion of variance explained in two experimental data sets. The first one contains a set of ultra-high-density (UHD) SNPs from chicken (<italic>Gallus gallus</italic>) genomes derived from a combination of whole-genome sequencing (WGS) and imputation. We used this data set to assess the proportion of variance of UHD genotypes that can be captured and predicted using low-density SNP sets. The second data set involved three omic-layers (gene expression [GE], methylation [ME], and copy-number-variants [CNVs]) of human (<italic>Homo sapiens</italic>) female breast cancer patients. We used this data set to assess the proportion of variance at one omic that can be explained by another omic.</p>
<sec id="sec010">
<title>Quantifying multi-locus linkage disequilibrium between SNP panels</title>
<p>The continued reduction of genotyping and sequencing costs has led to a sustained increase in the number of loci that can be genotyped. In plant and animal breeding four typical genotyping options include customized low-density arrays with hundreds to a few thousand SNPs [<xref ref-type="bibr" rid="pone.0243251.ref008">8</xref>], commercial arrays of common SNPs with tens of thousands of SNPs [<xref ref-type="bibr" rid="pone.0243251.ref009">9</xref>], high-density SNP arrays with hundreds of thousands of SNPs [<xref ref-type="bibr" rid="pone.0243251.ref010">10</xref>, <xref ref-type="bibr" rid="pone.0243251.ref011">11</xref>], and whole-genome sequence-derived SNP genotypes. The number of SNPs that can be derived from WGS varies between populations and sequencing depth but is usually of the order of tens of millions (UHD SNP genotypes). In recent years, several projects have produced large volumes of fully sequenced genomes for various agricultural species and model organisms. However, generating, storing, and fitting models with UHD-genotypes can be logistically, economically, and computationally challenging. Moreover, empirical evidence seems to suggest that using UHD SNP-genotypes does not lead to substantial gains in prediction accuracy relative to models trained using tens of thousands of SNPs [<xref ref-type="bibr" rid="pone.0243251.ref012">12</xref>–<xref ref-type="bibr" rid="pone.0243251.ref015">15</xref>]. This often leads to researchers wondering: <italic>How many SNPs are needed to capture (almost all) the information contained in UHD SNP genotypes</italic>? We used the MC- and Eigen-ANOVA methods to address precisely this question.</p>
<p>Data consisted of 1.79 million SNP-genotypes for 892 (female and male) chickens from six generations of a purebred commercial brown layer line of Lohmann Tierzucht GmbH. These genotypes originated from a combination of whole-genome sequencing of 25 layers and imputation to UHD of the genomes of 867 that were genotyped using a high-density (~600,000 SNPs) Affymetrix Axiom Chicken Genotyping Array [<xref ref-type="bibr" rid="pone.0243251.ref016">16</xref>]. Further details about this data set can be found in the Materials and Methods section.</p>
<p>In a first analysis, the output space was the linear space (<italic>L</italic><sub><italic>X</italic></sub>) spanned by the UHD SNP genotypes. The input set, (<italic>L</italic><sub><italic>W</italic></sub>), consisted of low-density genotypes obtained by selecting <italic>p</italic> (<italic>p</italic> = 500, 1K, 2K, 3K, 5K, 10K, and 50K) evenly-spaced (in variant counts) SNPs. We estimated the proportion of variance captured by low-density panels using the MC- and Eigen-ANOVA methods. For the MC method we sampled weights from <italic>iid</italic> standard normal distribution, <inline-formula id="pone.0243251.e034"><alternatives><graphic id="pone.0243251.e034g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e034" xlink:type="simple"/><mml:math display="inline" id="M34"><mml:msub><mml:mrow><mml:mi>α</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mtable><mml:mtr><mml:mtd><mml:mi>i</mml:mi><mml:mi>i</mml:mi><mml:mi>d</mml:mi></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>∼</mml:mo></mml:mtd></mml:mtr></mml:mtable><mml:mi>N</mml:mi><mml:mo>(</mml:mo><mml:mrow><mml:mn>0,1</mml:mn></mml:mrow><mml:mo>)</mml:mo></mml:math></alternatives></inline-formula>, and then formed a random vector in <italic>L</italic><sub><italic>X</italic></sub> using <bold><italic>x</italic></bold><sub><italic>s</italic></sub> = <bold><italic>Xα</italic></bold><sub><italic>s</italic></sub>, where <bold><italic>X</italic></bold> is the matrix of UHD SNP-genotypes. These random vectors were then regressed on the lower-density SNP sets, and the proportion of variance explained was estimated using REML. This was repeated 1,000 times to estimate the distribution of the proportion of variance of vectors in <italic>L</italic><sub><italic>X</italic></sub> explained by each of the low-density SNP-sets. 
For the Eigen-ANOVA method, we regressed each of the left-singular vectors of the UHD SNP genotypes on the low-density panels.</p>
<p>According to the MC-ANOVA method, the panel containing 500 evenly-spaced SNPs captured about two-thirds of the variance spanned by the UHD SNP genotypes (<bold><xref ref-type="fig" rid="pone.0243251.g001">Fig 1</xref></bold>). The proportion of variance of the UHD SNPs explained by low-density panels increased with the number of SNPs in the low-density panels reaching 100% with <italic>p</italic> ≥ 10K SNPs. The variance in the proportion of variance captured by low-density panels also decreased with the number of SNPs in the array (<bold><xref ref-type="fig" rid="pone.0243251.g001">Fig 1</xref></bold>). Small sample size and small effective population size are further factors that may make 10K SNPs sufficient to achieve a very high R-sq.</p>
<fig id="pone.0243251.g001" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0243251.g001</object-id>
<label>Fig 1</label>
<caption>
<title>Proportion of the variance of whole-genome-sequence-derived SNPs (1.79 million) explained by SNP-panels consisting of 500 to 50K (K = 1000) evenly-spaced SNPs.</title>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0243251.g001" xlink:type="simple"/>
</fig>
<p>The Eigen-ANOVA yielded a very similar estimate of the proportion of variance explained as the MC-ANOVA for <italic>p</italic> = 500. However, for SNP-panels with more than 500 SNPs, the estimated proportion of variance obtained with the Eigen-ANOVA was systematically lower than the one obtained with MC-ANOVA. This agrees with what we found in the simulations where for high R<sup>2</sup> the Eigen-ANOVA method gave downwardly biased estimates. (Note that while the MC-ANOVA yields both a point estimate and measures of dispersion (across random vectors) of R<sup>2</sup>, the Eigen-ANOVA only yields the point-estimates which are shown in <bold><xref ref-type="fig" rid="pone.0243251.g001">Fig 1</xref></bold>.)</p>
<p>In the previous application of the MC method, we drew random effect vectors that had weights (drawn from a normal distribution) on all the SNPs of the UHD set. When <bold><italic>X</italic></bold> contains whole-genome sequence genomes, one can think of the random vectors in <italic>L</italic><sub><italic>X</italic></sub> (<bold><italic>x</italic></bold><sub><italic>s</italic></sub> = <bold><italic>Xα</italic></bold><sub><italic>s</italic></sub>) as additive-genetic traits and of the MC method as exploring many such possible traits. However, for any trait, the vast majority of variants in the genome are expected to have no effect. The number of variants affecting any trait could vary from very few (simple traits) to hundreds or thousands (complex traits). Therefore, to explore the effect of the trait architecture on the distribution of the proportion of genetic variance of those traits that could be captured by low-density SNP panels, we repeated the previous analyses using random vectors that had <italic>5</italic>,<italic>10</italic>,<italic>50</italic>,<italic>500</italic> non-zero weights–the set of SNPs with non-zero weight were randomly sampled from the UHD-genotypes, and the weights of those SNPs were <italic>iid</italic> normal (see <xref ref-type="sec" rid="sec014">Materials and Methods</xref> for details).</p>
<p>The estimated proportion of variance explained by regression on lower-density SNP panels was, on average, the same across “trait-architectures” (<bold><xref ref-type="fig" rid="pone.0243251.g002">Fig 2</xref></bold>). However, the dispersion of the estimated means was, as expected, much larger for simple traits (e.g., 5 ‘causal variants’). For "complex traits" with 500 "causal variants," the proportion of variance explained by regression on 10K or more SNPs was greater than 95% for all MC replicates. However, for simpler traits we had some random vectors with a proportion of variance explained smaller than 0.8. This suggests that, while for highly complex traits low-density SNP arrays of 10K-50K SNPs may be enough to span the variance of the whole genome, for some simple traits, such arrays may not contain enough SNPs in high LD with the causal variants.</p>
<fig id="pone.0243251.g002" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0243251.g002</object-id>
<label>Fig 2</label>
<caption>
<title>Proportion of variance of random vectors derived from ultra-high-density SNP-panel explained by regression on low-density SNP-panels, by number of loci used to form “genetic traits”.</title>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0243251.g002" xlink:type="simple"/>
</fig>
</sec>
<sec id="sec011">
<title>Using MC-ANOVA and Eigen-ANOVA to study shared variance in multi-omic data sets</title>
<p>Cancerous processes involve the deregulation of signaling pathways controlling cell fate and progression, arising from the accumulation of genomic and epigenomics alterations across multiple genes [<xref ref-type="bibr" rid="pone.0243251.ref017">17</xref>, <xref ref-type="bibr" rid="pone.0243251.ref018">18</xref>]. Genetic and epigenetic modifications can lead to changes in GE, which in turn can lead to changes in downstream (e.g., protein expression) and upstream (e.g., DNA, ME) processes, thus resulting in complex multivariate association patterns between multiple omic-layers.</p>
<p>We used GE, ME and CNV data from breast cancer tumors (<italic>n = 593</italic>) from The Cancer Genome Atlas (TCGA) to study multivariate associations between those three omics. Details of the technologies used to generate these data, as well as the data quality controls (QC) and edits are described in the Materials and Methods. After QC and edits, data consisted of the (log-transformed) expression of 20,319 genes, counts at 11,552 CNV-sites, and ME intensity at 28,241 ME CpG islands. We used the MC- and the Eigen-ANOVA methods to estimate the proportion of variance of one omic that can be explained by regression on another omic; we did this for all pairwise omics combinations (GE~ME, GE~CNV, ME~GE, ME~CNV, CNV~GE, and CNV~ME).</p>
<p>Our results with the MC-ANOVA method indicate that the CNV data were completely explained by both GE and ME (<bold><xref ref-type="table" rid="pone.0243251.t003">Table 3</xref></bold>). About 70% of the variance spanned by ME was explained by GE and vice versa. Finally, CNV explained a relatively small fraction of the variance spanned by either GE or ME. These results suggest that most CNVs have effects in both ME and GE and therefore, variation in CNV can be predicted by ME and GE. However, although there is an association between CNV and both ME and GE, many other factors (e.g., environmental effects) seem to intervene, thus making the proportion of GE or ME explained by CNV relatively small (~20%). Overall the MC- and Eigen-ANOVA methods yielded similar results. However, in cases involving high R<sup>2</sup> (CNV~ME, CNV~GE, GE~ME and ME~GE) the Eigen-ANOVA method gave R<sup>2</sup> estimates that were lower than those of the MC method. This pattern is consistent with what we observed in the simulation and in the analyses of chicken genomes.</p>
<table-wrap id="pone.0243251.t003" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0243251.t003</object-id>
<label>Table 3</label> <caption><title>Proportion of variance of one omic explained (posterior standard deviation) by regression of the omic in each row on the omic in each column obtained with MC-ANOVA (Eigen-ANOVA).</title></caption>
<alternatives>
<graphic id="pone.0243251.t003g" mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0243251.t003" xlink:type="simple"/>
<table>
<colgroup>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
</colgroup>
<thead>
<tr>
<th align="justify" rowspan="2">Dependent</th>
<th align="center" colspan="3">Explanatory</th>
</tr>
<tr>
<th align="center">CNV</th>
<th align="center">Methylation</th>
<th align="center">Gene Expression</th>
</tr>
</thead>
<tbody>
<tr>
<td align="justify" style="background-color:#FFFFFF">CNV</td>
<td align="center" style="background-color:#FFFFFF">---</td>
<td align="center" style="background-color:#FFFFFF">1.00 (0.929)</td>
<td align="center" style="background-color:#FFFFFF">1.00 (0.904)</td>
</tr>
<tr>
<td align="justify" style="background-color:#F2F2F2">Methylation</td>
<td align="center" style="background-color:#F2F2F2">0.164 (0.228)</td>
<td align="center" style="background-color:#F2F2F2">---</td>
<td align="center" style="background-color:#F2F2F2">0.715 (0.685)</td>
</tr>
<tr>
<td align="justify" style="background-color:#FFFFFF">Gene Expression</td>
<td align="center" style="background-color:#FFFFFF">0.204 (0.238)</td>
<td align="center" style="background-color:#FFFFFF">0.738 (0.660)</td>
<td align="center" style="background-color:#FFFFFF">---</td>
</tr>
</tbody>
</table>
</alternatives>
</table-wrap>
<p>Eigen-vector-specific R<sup>2</sup> values obtained with the Eigen-ANOVA method (<bold><xref ref-type="fig" rid="pone.0243251.g003">Fig 3</xref></bold>) showed that the R<sup>2</sup> values were, in most cases (except GE~CNV and ME~CNV) very high (and in many cases very close to one) for the top-eigenvectors (i.e., those with high eigenvalue), and very small for eigenvectors associated with low eigenvalues. The R<sup>2</sup> profile of individual eigenvectors showed a relatively sharp phase transition from R<sup>2</sup> values near one to near-zero values. Overall, our results suggest a relatively good agreement in the patterns captured by the top-eigenvectors across omics.</p>
<fig id="pone.0243251.g003" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0243251.g003</object-id>
<label>Fig 3</label>
<caption>
<title>Proportion of variance of omic-derived eigenvectors of an omic-set explained by regression on a different omic-set.</title>
<p>Points give the proportion of variance for individual eigenvectors. GE = Gene Expression, ME = Methylation, CNV = Copy-number variants (global R<sup>2</sup> estimates, derived from random vectors and from the Eigen-ANOVA method are shown in <bold><xref ref-type="table" rid="pone.0243251.t003">Table 3</xref></bold>).</p>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0243251.g003" xlink:type="simple"/>
</fig>
</sec>
</sec>
</sec>
<sec id="sec012" sec-type="conclusions">
<title>Discussion</title>
<p>Modern genomic data sets often combine information from multiple non-independent data-layers. Quantifying multivariate associations between data layers can shed light on many important aspects of the data. In this study, we developed two procedures to estimate the proportion of variance explained in settings where both the input and output sets are high-dimensional. The proposed approach uses random effects Gaussian models to estimate the proportion of variance of (independent) vectors in the linear span of an output set (<bold><italic>X</italic></bold>) that can be explained by regression on an input set (<bold><italic>W</italic></bold>). The resulting R<sup>2</sup> estimate is a weighted average of the R<sup>2</sup> values obtained from independent vectors. To generate independent vectors, we considered two approaches: The first one (MC-ANOVA) is a Monte Carlo method that uses randomly generated vectors in the linear span of the output set. The second one (Eigen-ANOVA) uses an orthogonal basis for the linear span of <bold><italic>X</italic></bold>.</p>
<p>The proposed methods share four important features. First, both methods can be used to perform analysis of variance when both explanatory and dependent data are high-dimensional. Second, estimates are entirely based on the likelihood function and there is no need to make regularization decisions (number of dimensions, penalty parameters). Third, for any pair of information sets, the analysis of variance is not necessarily symmetric; therefore, the approach accommodates cases where the proportion of variance of <bold><italic>W</italic></bold> explained by <bold><italic>X</italic></bold> is not equal to the reciprocal. Finally, in addition to producing an R<sup>2</sup> estimate, the proposed methods can shed light on important aspects of the underlying association patterns (e.g., decomposition of the global R<sup>2</sup> on eigen-vector specific R<sup>2</sup>’s, distribution of R<sup>2</sup> over possible vectors in the linear span of the output set).</p>
<p>Our simulations suggest that MC-ANOVA renders nearly unbiased estimates of the proportion of the variance of one set that can be explained by another. However, the Eigen-ANOVA exhibited systematic biases in scenarios in which the true proportion of variance was high. We also evaluated the PLS regression method, and our simulations suggest that PLS leads to upwardly (downwardly) biased estimates whenever the true proportion of variance is high (low). Therefore, for estimation of the proportion of variance explained we recommend using MC-ANOVA. The Eigen-ANOVA method seems to be a valid alternative, provided that the proportion of variance of one set explained by the other is not too high.</p>
<sec id="sec013">
<title>Computational considerations</title>
<p>The Eigen-ANOVA requires computing all the eigenvectors of the response matrix (say <bold><italic>X</italic></bold>) and then estimating the proportion of variance of each of the eigenvectors explained by the explanatory matrix (e.g., <bold><italic>W</italic></bold>). The computational complexity of standard algorithms for singular-value decomposition is <italic>O</italic>(<italic>n</italic><sup>3</sup>) (assuming <italic>n~p</italic>). On the other hand, the MC-ANOVA requires forming <italic>B</italic> random vectors of the form <bold><italic>x</italic></bold><sub><italic>s</italic></sub> = <bold><italic>Xα</italic></bold><sub><italic>s</italic></sub>; ignoring the cost of sampling the weights, the computational complexity of forming each of these vectors is <italic>O</italic>(<italic>n</italic><sup>2</sup>), again assuming <italic>n~p</italic>; thus, in general the MC-ANOVA will be computationally less involved as long as the number of vectors required for accurate estimation is smaller than <italic>n</italic>. In our experience, a few hundred random vectors (say 300) are enough to estimate the average, median, and SD of the R<sup>2</sup>. Therefore, whenever the rank of the response matrix is high, the MC-ANOVA has clear computational advantages. These advantages would be particularly clear for very large rank matrices. Finally, we note that the estimation process of both Eigen-ANOVA and MC-ANOVA is 'embarrassingly' parallel since the R<sup>2</sup> of each of the vectors (either eigenvectors or random vectors) can be computed independently of each other.</p>
<p>Consistent with our simulation results, the analyses of experimental data showed that in problems involving a high R<sup>2</sup> the Eigen-ANOVA method yielded lower estimates of the proportion of variance explained than those obtained with the MC-ANOVA (e.g., see <bold><xref ref-type="fig" rid="pone.0243251.g001">Fig 1</xref></bold> and <bold><xref ref-type="table" rid="pone.0243251.t003">Table 3</xref></bold>). Inspection of the results of the Eigen-ANOVA for individual eigenvectors suggests that the downward bias of the method may originate from ‘corner’ solutions (zero-estimates of R<sup>2</sup>) for eigenvectors associated with small eigenvalues. Therefore, if the only goal is to estimate the proportion of variance of one set explained by another set, we recommend using the MC-ANOVA method.</p>
<p>The Eigen-ANOVA method yields R<sup>2</sup>-values for each of the eigenvectors of the output set. This information can help elucidate whether global patterns (e.g., those associated with the top-eigenvectors) in one information set can be predicted from information contained in another information set. For instance, our analysis of the multi-omic breast cancer data revealed that the patterns described in the top-eigenvectors derived from GE and ME are very similar; therefore, one should not expect big differences in tumor classifications that are based on the top-eigenvectors derived from either set. Interestingly, we found that in the analyses of omic data the R<sup>2</sup> of individual eigenvectors showed a very sharp phase transition, suggesting that eigenvectors associated with intermediate and small eigenvalues may describe omic-specific patterns, or perhaps measurement error associated with each of the techniques.</p>
<p>The MC-ANOVA method can be used to characterize the distribution of the R<sup>2</sup> estimates across vectors in the linear span of the output set. We used this feature to study the effect of the trait-architecture on the distribution of the R<sup>2</sup> estimates. Our results indicate that while the average R<sup>2</sup> does not seem to be affected by the sparsity of the coefficients used to form random vectors (i.e., the <italic>α</italic><sub><italic>s</italic></sub>′), the dispersion and the shape of the distribution highly depend on the process used to generate the weights (<xref ref-type="fig" rid="pone.0243251.g002">Fig 2</xref>). Highly sparse weights lead to a distribution of the R<sup>2</sup> values that, compared with vectors that were less sparse, had higher dispersion and in some cases (e.g., when the proportion of variance explained was close to 1) was skewed (<xref ref-type="fig" rid="pone.0243251.g002">Fig 2</xref>).</p>
<p>An important feature of the methods proposed in this study is that the R<sup>2</sup> measure is not symmetric, in contrast to CCA. Our simulation study shows that if the underlying patterns are non-symmetric (e.g., when one of the linear spaces is a subspace of the other) the proposed estimation methods (in particular the MC-ANOVA) can detect the lack of symmetry very well (see <bold><xref ref-type="table" rid="pone.0243251.t002">Table 2</xref></bold>). Interestingly, our analysis of multi-omic data from breast cancer patients exhibited cases where R<sup>2</sup> was rather symmetric (e.g., the regression ME~GE and the regression GE~ME) and others that were highly asymmetric (e.g., CNV~GE and GE~CNV). The asymmetric cases suggest that almost all the variability in CNV can be predicted from GE (and ME as well); however, only a fraction of the GE variance can be explained by differences in CNV patterns. This result is consistent with the hypothesis that most CNVs have an impact on GE, but GE is also affected by factors other than CNV (e.g., methylation, environmental effects).</p>
<p>In this study, we focused on the application of the Eigen- and MC-ANOVA for problems involving two input sets (e.g., a low- and a high-dimensional SNP array, or two different omics) evaluated on the same set of individuals. However, with slight modifications, the MC-ANOVA method will be useful for evaluating the proportion of variance of vectors in the span of a training set (e.g., all the available genotypes/phenotypes) that could be captured/predicted by regression on a subset of it, e.g., founders, or “proven” individuals, e.g., [<xref ref-type="bibr" rid="pone.0243251.ref019">19</xref>, <xref ref-type="bibr" rid="pone.0243251.ref020">20</xref>].</p>
<p>The methods discussed in this study are entirely based on linear models. However, both MC-ANOVA and Eigen-ANOVA can easily be extended to consider non-linear relationships by embedding each set using a non-linear mapping. For instance, in the case of SNPs, one could generate a linear space that accounts for additive and non-additive effects by considering contrasts for additive, dominance, and epistatic interactions [<xref ref-type="bibr" rid="pone.0243251.ref021">21</xref>]. More generally, one can consider embedding either <bold><italic>X</italic></bold> or <bold><italic>W</italic></bold> by transforming one or both sets using a non-linear mapping <italic>f</italic>(.) (e.g., Gaussian kernels). Then, the methods presented here could be applied using <inline-formula id="pone.0243251.e035"><alternatives><graphic id="pone.0243251.e035g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e035" xlink:type="simple"/><mml:math display="inline" id="M35"><mml:mover accent="true"><mml:mrow><mml:mi mathvariant="bold-italic">X</mml:mi></mml:mrow><mml:mo>˜</mml:mo></mml:mover><mml:mo>=</mml:mo><mml:mi>f</mml:mi><mml:mo>(</mml:mo><mml:mi mathvariant="bold-italic">X</mml:mi><mml:mo>)</mml:mo></mml:math></alternatives></inline-formula> and <inline-formula id="pone.0243251.e036"><alternatives><graphic id="pone.0243251.e036g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e036" xlink:type="simple"/><mml:math display="inline" id="M36"><mml:mover accent="true"><mml:mrow><mml:mi mathvariant="bold-italic">W</mml:mi></mml:mrow><mml:mo>˜</mml:mo></mml:mover><mml:mo>=</mml:mo><mml:mi>f</mml:mi><mml:mo>(</mml:mo><mml:mi mathvariant="bold-italic">W</mml:mi><mml:mo>)</mml:mo></mml:math></alternatives></inline-formula> as information sets within the context of Reproducing Kernel Hilbert Spaces regressions (e.g., [<xref ref-type="bibr" rid="pone.0243251.ref022">22</xref>, <xref ref-type="bibr" 
rid="pone.0243251.ref023">23</xref>]).</p>
<p>In our applications, we considered one dependent and one explanatory set; however, the methodology presented in this study could be easily adapted to accommodate cases with one output set (e.g., Y) and multiple explanatory sets (e.g., X, and W). This can be done by expanding the model used to estimate R<sup>2</sup> [<xref ref-type="bibr" rid="pone.0243251.ref001">1</xref>] by including two random effects, each with its own variance parameter. Such methods could be used to answer potential questions such as what proportion of variance of gene expression may be explained by joint regression on methylation and copy-number-variants.</p>
<p>In conclusion, we developed two methods for estimating the proportion of variance explained in problems in which both the input and output sets are high-dimensional. The MC-ANOVA method provided nearly unbiased estimates across a range of simulation scenarios. In addition to providing estimates of the proportion of variance explained, the two methods can yield useful insight into the association patterns underlying multi-layered high-dimensional data.</p>
</sec>
</sec>
<sec id="sec014" sec-type="materials|methods">
<title>Materials and methods</title>
<sec id="sec015">
<title>Data sets</title>
<p>The <bold>wheat data set</bold> used in <bold>Simulations 1 and 2</bold> was generated by the International Maize and Wheat Improvement Center (CIMMYT). This data set provides genotypes at 1,279 molecular markers (Diversity Array Technology DNA markers) assessed in 599 wheat inbred lines. Further details about this data set can be found in [<xref ref-type="bibr" rid="pone.0243251.ref024">24</xref>]. The data set is available with the BGLR R-package [<xref ref-type="bibr" rid="pone.0243251.ref025">25</xref>].</p>
<p>The <bold>chicken data set</bold> used in <bold>Case Study 1</bold> consisted of UHD SNP genotypes of 892 female and male chickens from six generations of a purebred commercial brown layer line of Lohmann Tierzucht GmbH. The genomes of 25 layers were sequenced at 8x read-depth. A total of 4.92M (M = million) SNPs were derived from these 25 genome sequences. The remaining layers (n = 867) were genotyped using the Affymetrix Axiom Chicken Genotyping Array [<xref ref-type="bibr" rid="pone.0243251.ref016">16</xref>] which contains ~600K (580,961) SNPs. Ni et al. [<xref ref-type="bibr" rid="pone.0243251.ref028">28</xref>] imputed the SNP-genotypes of those 867 layers to the whole-genome sequence (4.92M SNPs) using BEAGLE 3.3.2 [<xref ref-type="bibr" rid="pone.0243251.ref026">26</xref>] for phasing and MiniMac3 [<xref ref-type="bibr" rid="pone.0243251.ref027">27</xref>] for imputation. For details on the imputing procedure we refer to Ni et al. [<xref ref-type="bibr" rid="pone.0243251.ref028">28</xref>]. This produced a combined genotype file consisting of 4.92M SNPs from 892 = 25+867 genomes. We further filtered the combined genotype file by removing SNPs with minor-allele-frequency smaller than 0.005 (0.5%) and pruning adjacent SNPs that were in (almost) perfect LD (i.e., R<sup>2</sup> ≥0.99). A total of 1.79M SNPs passed these last two filters.</p>
<p>The <bold>breast cancer data set</bold> used in <bold>Case Study 2</bold> was from the Cancer Genome Atlas (TCGA <ext-link ext-link-type="uri" xlink:href="https://www.cancer.gov/about-nci/organization/ccg/research/structural-genomics/tcga" xlink:type="simple">https://www.cancer.gov/tcga</ext-link>) and consisted of gene expression (GE), methylation (ME), and copy-number-variants (CNV) data from (n = 593) breast cancer tumors from female breast cancer patients.</p>
<p>Gene expression data (RNA-Sequencing counts) were generated using the Illumina HiSeq RNA V2 platform and DNA methylation profiles were determined using the Illumina HM450 platform. RNA-sequencing data were transformed using the natural logarithm and individual CpG site β-values were summarized at the CpG island level, using the maximum connectivity approach implemented in the WGCNA R package [<xref ref-type="bibr" rid="pone.0243251.ref029">29</xref>]. The CpG island summaries were transformed into M-values (M = log<sub>2</sub>(β/(1-β)) [<xref ref-type="bibr" rid="pone.0243251.ref030">30</xref>]). CNV profiles corresponded to gene-level copy number intensity derived from Affymetrix SNP Array 6.0 platform, using hg19 as reference.</p>
<p>From each of the three omics we removed features with a coefficient of variation smaller than 1% and those with a proportion of missing values greater than 20%. The missing values that remained were imputed using the clustering algorithm described in [<xref ref-type="bibr" rid="pone.0243251.ref031">31</xref>]. After imputation, each feature was adjusted for batch effects using ComBat [<xref ref-type="bibr" rid="pone.0243251.ref032">32</xref>]. After applying the steps described above, the data set used in the analyses consisted of the (log-transformed) expression of 20,319 genes, 11,552 CNV-sites, and ME intensity at 28,241 ME CpG islands.</p>
</sec>
<sec id="sec016">
<title>Restricted maximum likelihood estimation of variance components</title>
<p>For the <bold>MC-ANOVA and EIGEN-ANOVA</bold> methods, the proportion of variance of one set (e.g., <bold><italic>X</italic></bold>) explained by the other set (<bold><italic>W</italic></bold>) was estimated using a random-effects model of the form
<disp-formula id="pone.0243251.e037">
<alternatives>
<graphic id="pone.0243251.e037g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e037" xlink:type="simple"/>
<mml:math display="block" id="M37">
<mml:mi mathvariant="bold-italic">z</mml:mi><mml:mo>=</mml:mo><mml:mn mathvariant="bold">1</mml:mn><mml:mi>μ</mml:mi><mml:mo>+</mml:mo><mml:mi mathvariant="bold-italic">W</mml:mi><mml:mi mathvariant="bold-italic">β</mml:mi><mml:mo>+</mml:mo><mml:mi mathvariant="bold-italic">ε</mml:mi><mml:mo>,</mml:mo>
</mml:math>
</alternatives>
</disp-formula>
where <bold><italic>z</italic></bold> was either a random linear combination of the columns of <bold><italic>X</italic></bold> (MC-ANOVA, see <xref ref-type="boxed-text" rid="pone.0243251.box001">Box 1</xref>) or one of the eigenvectors of <bold><italic>XX</italic></bold>′ (Eigen-ANOVA, see <xref ref-type="boxed-text" rid="pone.0243251.box002">Box 2</xref>), <italic>μ</italic> is an intercept, <bold><italic>β</italic></bold> is a vector of Gaussian random effects, <inline-formula id="pone.0243251.e038"><alternatives><graphic id="pone.0243251.e038g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e038" xlink:type="simple"/><mml:math display="inline" id="M38"><mml:mi>β</mml:mi><mml:mtable><mml:mtr><mml:mtd><mml:mi>i</mml:mi><mml:mi>i</mml:mi><mml:mi>d</mml:mi></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>∼</mml:mo></mml:mtd></mml:mtr></mml:mtable><mml:mi>N</mml:mi><mml:mo>(</mml:mo><mml:mrow><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mi>σ</mml:mi></mml:mrow><mml:mrow><mml:mi>β</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:mrow><mml:mo>)</mml:mo></mml:math></alternatives></inline-formula>, and <bold><italic>ε</italic></bold> is a vector containing error terms, which were also assumed to be Gaussian, <inline-formula id="pone.0243251.e039"><alternatives><graphic id="pone.0243251.e039g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e039" xlink:type="simple"/><mml:math display="inline" 
id="M39"><mml:mi>ε</mml:mi><mml:mtable><mml:mtr><mml:mtd><mml:mi>i</mml:mi><mml:mi>i</mml:mi><mml:mi>d</mml:mi></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>∼</mml:mo></mml:mtd></mml:mtr></mml:mtable><mml:mi>N</mml:mi><mml:mo>(</mml:mo><mml:mrow><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mi>σ</mml:mi></mml:mrow><mml:mrow><mml:mi>ε</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:mrow><mml:mo>)</mml:mo></mml:math></alternatives></inline-formula>. For computational convenience and without loss of generality, we reparametrized the above model in terms of a random-effects model, of the form
<disp-formula id="pone.0243251.e040">
<alternatives>
<graphic id="pone.0243251.e040g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e040" xlink:type="simple"/>
<mml:math display="block" id="M40">
<mml:mi mathvariant="bold-italic">z</mml:mi><mml:mo>=</mml:mo><mml:mn mathvariant="bold">1</mml:mn><mml:mi>μ</mml:mi><mml:mo>+</mml:mo><mml:mi mathvariant="bold-italic">u</mml:mi><mml:mo>+</mml:mo><mml:mi mathvariant="bold-italic">ε</mml:mi><mml:mo>,</mml:mo>
</mml:math>
</alternatives>
</disp-formula>
where <inline-formula id="pone.0243251.e041"><alternatives><graphic id="pone.0243251.e041g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e041" xlink:type="simple"/><mml:math display="inline" id="M41"><mml:mi mathvariant="bold-italic">u</mml:mi><mml:mo>=</mml:mo><mml:mi mathvariant="bold-italic">W</mml:mi><mml:mi mathvariant="bold-italic">β</mml:mi><mml:mo>∼</mml:mo><mml:mi>M</mml:mi><mml:mi>V</mml:mi><mml:mi>N</mml:mi><mml:mo>(</mml:mo><mml:mrow><mml:mn mathvariant="bold">0</mml:mn><mml:mo>,</mml:mo><mml:mi mathvariant="bold-italic">W</mml:mi><mml:msup><mml:mrow><mml:mi mathvariant="bold-italic">W</mml:mi></mml:mrow><mml:mrow><mml:mo>′</mml:mo></mml:mrow></mml:msup><mml:msubsup><mml:mrow><mml:mi>σ</mml:mi></mml:mrow><mml:mrow><mml:mi>β</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:mrow><mml:mo>)</mml:mo></mml:math></alternatives></inline-formula>, where <italic>MVN</italic>() stands for Multivariate Normal Distribution. 
We centered and scaled the columns of <bold><italic>W</italic></bold> to a standard deviation equal to <inline-formula id="pone.0243251.e042"><alternatives><graphic id="pone.0243251.e042g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e042" xlink:type="simple"/><mml:math display="inline" id="M42"><mml:mn>1</mml:mn><mml:mo>/</mml:mo><mml:msqrt><mml:mi>n</mml:mi><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>l</mml:mi><mml:mo>(</mml:mo><mml:mi mathvariant="bold-italic">W</mml:mi><mml:mo>)</mml:mo></mml:msqrt></mml:math></alternatives></inline-formula>; this leads to a covariance structure, <bold><italic>WW</italic></bold>′ with an average diagonal equal to one; therefore, with this scaling, <inline-formula id="pone.0243251.e043"><alternatives><graphic id="pone.0243251.e043g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e043" xlink:type="simple"/><mml:math display="inline" id="M43"><mml:msubsup><mml:mrow><mml:mi>σ</mml:mi></mml:mrow><mml:mrow><mml:mi>β</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:math></alternatives></inline-formula> can be interpreted as the amount of variance of <bold><italic>z</italic></bold> captured by regression on <bold><italic>W</italic></bold> and the ratio <inline-formula id="pone.0243251.e044"><alternatives><graphic id="pone.0243251.e044g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e044" xlink:type="simple"/><mml:math display="inline" 
id="M44"><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msubsup><mml:mrow><mml:mi>σ</mml:mi></mml:mrow><mml:mrow><mml:mi>β</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:mrow><mml:mrow><mml:msubsup><mml:mrow><mml:mi>σ</mml:mi></mml:mrow><mml:mrow><mml:mi>β</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:mo>+</mml:mo><mml:msubsup><mml:mrow><mml:mi>σ</mml:mi></mml:mrow><mml:mrow><mml:mi>ε</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:mrow></mml:mfrac></mml:math></alternatives></inline-formula> can be interpreted as the proportion of variance of <bold><italic>z</italic></bold> that can be explained by <bold><italic>W</italic></bold>.</p>
<p>We estimated the variance components of the above model using restricted maximum likelihood (REML [<xref ref-type="bibr" rid="pone.0243251.ref007">7</xref>]), implemented in a custom R-script that uses the <monospace>bobyqa</monospace> function of the minqa R-package [<xref ref-type="bibr" rid="pone.0243251.ref033">33</xref>] for optimization. The scripts used to fit variance components using REML are provided in the <xref ref-type="supplementary-material" rid="pone.0243251.s001">S1 File</xref> (see function <monospace>fitREML</monospace>).</p>
</sec>
<sec id="sec017">
<title>Partial least squares</title>
<p>We also estimated the proportion of variance of <bold><italic>X</italic></bold> explained by <bold><italic>W</italic></bold> (and the reciprocal when needed) by regressing <bold><italic>X</italic></bold> on <bold><italic>W</italic></bold> using the pls R-package [<xref ref-type="bibr" rid="pone.0243251.ref034">34</xref>].</p>
<disp-formula id="pone.0243251.e045">
<alternatives>
<graphic id="pone.0243251.e045g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e045" xlink:type="simple"/>
<mml:math display="block" id="M45">
<mml:mi mathvariant="bold-italic">X</mml:mi><mml:mo>=</mml:mo><mml:mn mathvariant="bold">1</mml:mn><mml:mi>μ</mml:mi><mml:mo>+</mml:mo><mml:mi mathvariant="bold-italic">W</mml:mi><mml:mi mathvariant="bold-italic">β</mml:mi><mml:mo>+</mml:mo><mml:mi mathvariant="bold-italic">ε</mml:mi><mml:mo>.</mml:mo>
</mml:math>
</alternatives>
</disp-formula>
<p>The ability of the PLS regression to fit <bold><italic>X</italic></bold> depends on the number of components used. To determine the number of components, we first fitted the PLS regressions with <italic>1</italic>, <italic>2</italic>, <italic>…</italic>, <italic>100</italic> components in 10-fold cross-validation and evaluated the cross-validation prediction mean-squared error of each of the resulting models. We then selected the number of components that led to the smallest mean-squared prediction error and fitted a PLS regression with that number of components to the entire data set. The R<sup>2</sup> of the fitted model in the training data was used as an estimate of the proportion of variance of <bold><italic>X</italic></bold> that could be explained by <bold><italic>W</italic></bold>. The <monospace>fitPLS</monospace> function provided in the <xref ref-type="supplementary-material" rid="pone.0243251.s001">S1 File</xref> is a wrapper around the <monospace>plsr</monospace> function that implements the procedure described above.</p>
</sec>
<sec id="sec018">
<title>Simulations</title>
<p>Both simulations were implemented using genotypes from the wheat data set.</p>
<sec id="sec019">
<title>Simulation 1</title>
<p>In the first simulation setting the input set was the wheat genotypes <bold><italic>W</italic></bold><sub>599×1,279</sub> = {<bold><italic>w</italic></bold><sub>1</sub>,…,<bold><italic>w</italic></bold><sub>1,279</sub>}, and <bold><italic>X</italic></bold> = {<bold><italic>x</italic></bold><sub>1</sub>,…,<bold><italic>x</italic></bold><sub>1,279</sub>} was a noisy version of <bold><italic>W</italic></bold> obtained by adding Gaussian <italic>iid</italic> noise, <inline-formula id="pone.0243251.e046"><alternatives><graphic id="pone.0243251.e046g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e046" xlink:type="simple"/><mml:math display="inline" id="M46"><mml:msub><mml:mrow><mml:mi mathvariant="bold-italic">δ</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>∼</mml:mo><mml:mi>N</mml:mi><mml:mo>(</mml:mo><mml:mrow><mml:mn mathvariant="bold">0</mml:mn><mml:mo>,</mml:mo><mml:mi mathvariant="bold-italic">I</mml:mi><mml:msubsup><mml:mrow><mml:mi>σ</mml:mi></mml:mrow><mml:mrow><mml:mi>δ</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:mrow><mml:mo>)</mml:mo></mml:math></alternatives></inline-formula>, to the genotypes, <bold><italic>x</italic></bold><sub><italic>i</italic></sub> = <bold><italic>w</italic></bold><sub><italic>i</italic></sub>+<bold><italic>δ</italic></bold><sub><italic>i</italic></sub> where <italic>i = 1</italic>,<italic>…</italic>, <italic>1</italic>,<italic>279</italic>. 
The columns of <bold><italic>W</italic></bold> were standardized to unit variance, and the noise variance (<inline-formula id="pone.0243251.e047"><alternatives><graphic id="pone.0243251.e047g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e047" xlink:type="simple"/><mml:math display="inline" id="M47"><mml:msubsup><mml:mrow><mml:mi>σ</mml:mi></mml:mrow><mml:mrow><mml:mi>δ</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:math></alternatives></inline-formula>) was set such that the proportion of variance of <bold><italic>x</italic></bold><sub><italic>i</italic></sub> explained by <bold><italic>w</italic></bold><sub><italic>i</italic></sub> was equal to 0.1, 0.3, 0.5, 0.8, 0.9 and 1:
<disp-formula id="pone.0243251.e048">
<alternatives>
<graphic id="pone.0243251.e048g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e048" xlink:type="simple"/>
<mml:math display="block" id="M48">
<mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>X</mml:mi><mml:mo>∼</mml:mo><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>v</mml:mi><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi mathvariant="bold-italic">x</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi mathvariant="bold-italic">w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mi>V</mml:mi><mml:mi>a</mml:mi><mml:mi>r</mml:mi><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi mathvariant="bold-italic">x</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mfrac><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>V</mml:mi><mml:mi>a</mml:mi><mml:mi>r</mml:mi><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi mathvariant="bold-italic">w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mi>V</mml:mi><mml:mi>a</mml:mi><mml:mi>r</mml:mi><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi mathvariant="bold-italic">x</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mfrac><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>+</mml:mo><mml:msubsup><mml:mrow><mml:mi>σ</mml:mi></mml:mrow><mml:mrow><mml:mi>δ</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:mrow></mml:mfrac><mml:mo>.</mml:mo>
</mml:math>
</alternatives>
</disp-formula>
We also considered a scenario where <inline-formula id="pone.0243251.e049"><alternatives><graphic id="pone.0243251.e049g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e049" xlink:type="simple"/><mml:math display="inline" id="M49"><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>X</mml:mi><mml:mo>∼</mml:mo><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:math></alternatives></inline-formula> (i.e., <bold><italic>X</italic></bold> was purely random noise).</p>
<p>We conducted 1,000 MC simulations (the input set (<bold><italic>W</italic></bold>) did not change across MC samples; however, the output set (<bold><italic>X</italic></bold>) varied across MC replicates due to the noise term), and, for each simulated data set, estimated the proportion of variance of <bold><italic>X</italic></bold> explained by regression on <bold><italic>W</italic></bold> using MC-ANOVA, Eigen-ANOVA, and the PLS method.</p>
</sec>
<sec id="sec020">
<title>Simulation 2</title>
<p>We designed a second simulation to consider the case where one of the sets (<bold><italic>X</italic></bold>) was included in the other set (<bold><italic>W</italic></bold>). In this setting <bold><italic>X</italic></bold><sub>599×<italic>p</italic></sub> was generated by including <italic>p (≤1,279)</italic> of the 1,279 DNA-markers; we used values of <italic>p</italic> that led to the inclusion of 5%, 10%, 30%, 50%, 80%, 90%, and 95% of all the available DNA markers. Subsequently, <bold><italic>W</italic></bold> was formed by combining <bold><italic>X</italic></bold> with (1,279−<italic>p</italic>) columns filled with <italic>iid</italic> Gaussian random variables (<bold><italic>Z</italic></bold>): <bold><italic>W</italic></bold><sub>599×1,279</sub> = [<bold><italic>X</italic></bold><sub>599×<italic>p</italic></sub>,<bold><italic>Z</italic></bold><sub>599×(1,279−<italic>p</italic>)</sub>].</p>
<p>The columns of <bold><italic>X</italic></bold> and <bold><italic>W</italic></bold> were all centered and scaled to unit variance. Since <bold><italic>Z</italic></bold> is independent of <bold><italic>X</italic></bold>, the proportion of variance of <bold><italic>W</italic></bold> explained by <bold><italic>X</italic></bold> equals <italic>p</italic>/1,279. On the other hand, the proportion of variance of <bold><italic>X</italic></bold> explained by <bold><italic>W</italic></bold> is one because <bold><italic>X</italic></bold> is included in <bold><italic>W</italic></bold>. We conducted 1,000 MC simulations and, for each simulated data set, we estimated <inline-formula id="pone.0243251.e050"><alternatives><graphic id="pone.0243251.e050g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e050" xlink:type="simple"/><mml:math display="inline" id="M50"><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>X</mml:mi><mml:mo>∼</mml:mo><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:math></alternatives></inline-formula> and <inline-formula id="pone.0243251.e051"><alternatives><graphic id="pone.0243251.e051g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0243251.e051" xlink:type="simple"/><mml:math display="inline" id="M51"><mml:msubsup><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>W</mml:mi><mml:mo>∼</mml:mo><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:math></alternatives></inline-formula> by regressing <bold><italic>X</italic></bold> on <bold><italic>W</italic></bold> and <bold><italic>W</italic></bold> on <bold><italic>X</italic></bold>, respectively, using MC-ANOVA, Eigen-ANOVA, and the PLS method.</p>
</sec>
</sec>
</sec>
<sec id="sec021" sec-type="supplementary-material">
<title>Supporting information</title>
<supplementary-material id="pone.0243251.s001" mimetype="text/html" position="float" xlink:href="info:doi/10.1371/journal.pone.0243251.s001" xlink:type="simple">
<label>S1 File</label>
<caption>
<title>Contains the scripts used to carry out the simulations and data analyses.</title>
<p>(HTML)</p>
</caption>
</supplementary-material>
</sec>
</body>
<back>
<ack>
<p>We thank The Cancer Genome Atlas Research Network (<ext-link ext-link-type="uri" xlink:href="https://www.cancer.gov/tcga" xlink:type="simple">https://www.cancer.gov/tcga</ext-link>), CIMMYT, and Lohmann Tierzucht GmbH for making their data available.</p>
</ack>
<ref-list>
<title>References</title>
<ref id="pone.0243251.ref001"><label>1</label><mixed-citation publication-type="book" xlink:type="simple"><name name-style="western"><surname>Mardia</surname> <given-names>KV</given-names></name>, <name name-style="western"><surname>Kent</surname> <given-names>JT</given-names></name>, <name name-style="western"><surname>Bibby</surname> <given-names>JM</given-names></name>. <source>Multivariate Analysis</source>. <publisher-name>Academic Press</publisher-name>; <year>1979</year>.</mixed-citation></ref>
<ref id="pone.0243251.ref002"><label>2</label><mixed-citation publication-type="book" xlink:type="simple"><name name-style="western"><surname>Rencher</surname> <given-names>AC</given-names></name>, <name name-style="western"><surname>Christensen</surname> <given-names>WF</given-names></name>. <source>Methods of multivariate analysis</source> [Internet]. <publisher-loc>Hoboken, NJ, USA</publisher-loc>: <publisher-name>John Wiley &amp; Sons, Inc.</publisher-name>; <year>2012</year>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1002/9781118391686" xlink:type="simple">10.1002/9781118391686</ext-link></comment></mixed-citation></ref>
<ref id="pone.0243251.ref003"><label>3</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Wold</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Sjöström</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Eriksson</surname> <given-names>L</given-names></name>. <article-title>PLS-regression: a basic tool of chemometrics</article-title>. <source>Chemom Intell Lab Syst</source>. Elsevier; <year>2001</year>;<volume>58</volume>: <fpage>109</fpage>–<lpage>130</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1016/S0169-7439%2801%2900155-1" xlink:type="simple">10.1016/S0169-7439(01)00155-1</ext-link></comment></mixed-citation></ref>
<ref id="pone.0243251.ref004"><label>4</label><mixed-citation publication-type="book" xlink:type="simple"><name name-style="western"><surname>Krzanowski</surname> <given-names>WJ</given-names></name>. <source>Principles of multivariate analysis: a user’s perspective</source> [Internet]. <publisher-name>Clarendon Press</publisher-name>; <year>1988</year>. Available: <ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/citation.cfm?id=59560" xlink:type="simple">https://dl.acm.org/citation.cfm?id=59560</ext-link></mixed-citation></ref>
<ref id="pone.0243251.ref005"><label>5</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Izenman</surname> <given-names>AJ</given-names></name>. <article-title>Reduced-rank regression for the multivariate linear model</article-title>. <source>J Multivar Anal</source>. Academic Press; <year>1975</year>;<volume>5</volume>: <fpage>248</fpage>–<lpage>264</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1016/0047-259X%2875%2990042-1" xlink:type="simple">10.1016/0047-259X(75)90042-1</ext-link></comment></mixed-citation></ref>
<ref id="pone.0243251.ref006"><label>6</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Witten</surname> <given-names>DM</given-names></name>, <name name-style="western"><surname>Tibshirani</surname> <given-names>R</given-names></name>, <name name-style="western"><surname>Hastie</surname> <given-names>T</given-names></name>. <article-title>A penalized matrix decomposition, with applications to sparse principal components and canonical correlation analysis</article-title>. <source>Biostatistics</source>. <year>2009</year>;<volume>10</volume>: <fpage>515</fpage>–<lpage>534</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1093/biostatistics/kxp008" xlink:type="simple">10.1093/biostatistics/kxp008</ext-link></comment> <object-id pub-id-type="pmid">19377034</object-id></mixed-citation></ref>
<ref id="pone.0243251.ref007"><label>7</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Patterson</surname> <given-names>HD</given-names></name>, <name name-style="western"><surname>Thompson</surname> <given-names>R</given-names></name>. <article-title>Recovery of Inter-Block Information When Block Sizes are Unequal</article-title>. <source>Biometrika</source>. <year>1971</year>;<volume>58</volume>: <fpage>545</fpage>–<lpage>554</lpage>.</mixed-citation></ref>
<ref id="pone.0243251.ref008"><label>8</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Boichard</surname> <given-names>D</given-names></name>, <name name-style="western"><surname>Chung</surname> <given-names>H</given-names></name>, <name name-style="western"><surname>Dassonneville</surname> <given-names>R</given-names></name>, <name name-style="western"><surname>David</surname> <given-names>X</given-names></name>, <name name-style="western"><surname>Eggen</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Fritz</surname> <given-names>S</given-names></name>, <etal>et al</etal>. <article-title>Design of a Bovine Low-Density SNP Array Optimized for Imputation</article-title>. <name name-style="western"><surname>Liu</surname> <given-names>Z</given-names></name>, editor. <source>PLoS One</source>. <year>2012</year>;<volume>7</volume>: <fpage>e34130</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1371/journal.pone.0034130" xlink:type="simple">10.1371/journal.pone.0034130</ext-link></comment> <object-id pub-id-type="pmid">22470530</object-id></mixed-citation></ref>
<ref id="pone.0243251.ref009"><label>9</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Matukumalli</surname> <given-names>LK</given-names></name>, <name name-style="western"><surname>Lawley</surname> <given-names>CT</given-names></name>, <name name-style="western"><surname>Schnabel</surname> <given-names>RD</given-names></name>, <name name-style="western"><surname>Taylor</surname> <given-names>JF</given-names></name>, <name name-style="western"><surname>Allan</surname> <given-names>MF</given-names></name>, <name name-style="western"><surname>Heaton</surname> <given-names>MP</given-names></name>, <etal>et al</etal>. <article-title>Development and Characterization of a High Density SNP Genotyping Assay for Cattle</article-title>. <name name-style="western"><surname>Toland</surname> <given-names>AE</given-names></name>, editor. <source>PLoS One</source>. <year>2009</year>;<volume>4</volume>: <fpage>e5350</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1371/journal.pone.0005350" xlink:type="simple">10.1371/journal.pone.0005350</ext-link></comment> <object-id pub-id-type="pmid">19390634</object-id></mixed-citation></ref>
<ref id="pone.0243251.ref010"><label>10</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Kranis</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Gheyas</surname> <given-names>AA</given-names></name>, <name name-style="western"><surname>Boschiero</surname> <given-names>C</given-names></name>, <name name-style="western"><surname>Turner</surname> <given-names>F</given-names></name>, <name name-style="western"><surname>Yu</surname> <given-names>L</given-names></name>, <name name-style="western"><surname>Smith</surname> <given-names>S</given-names></name>, <etal>et al</etal>. <article-title>Development of a high density 600K SNP genotyping array for chicken</article-title>. <source>BMC Genomics</source>. <year>2013</year>;<volume>14</volume>: <fpage>59</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1186/1471-2164-14-59" xlink:type="simple">10.1186/1471-2164-14-59</ext-link></comment> <object-id pub-id-type="pmid">23356797</object-id></mixed-citation></ref>
<ref id="pone.0243251.ref011"><label>11</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Unterseer</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Bauer</surname> <given-names>E</given-names></name>, <name name-style="western"><surname>Haberer</surname> <given-names>G</given-names></name>, <name name-style="western"><surname>Seidel</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Knaak</surname> <given-names>C</given-names></name>, <name name-style="western"><surname>Ouzunova</surname> <given-names>M</given-names></name>, <etal>et al</etal>. <article-title>A powerful tool for genome analysis in maize: Development and evaluation of the high density 600 k SNP genotyping array</article-title>. <source>BMC Genomics</source>. BioMed Central Ltd.; <year>2014</year>;<volume>15</volume>: <fpage>823</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1186/1471-2164-15-823" xlink:type="simple">10.1186/1471-2164-15-823</ext-link></comment> <object-id pub-id-type="pmid">25266061</object-id></mixed-citation></ref>
<ref id="pone.0243251.ref012"><label>12</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Erbe</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Gredler</surname> <given-names>B</given-names></name>, <name name-style="western"><surname>Seefried</surname> <given-names>FR</given-names></name>, <name name-style="western"><surname>Bapst</surname> <given-names>B</given-names></name>, <name name-style="western"><surname>Simianer</surname> <given-names>H</given-names></name>. <article-title>A Function Accounting for Training Set Size and Marker Density to Model the Average Accuracy of Genomic Prediction</article-title>. <name name-style="western"><surname>Liu</surname> <given-names>Z</given-names></name>, editor. <source>PLoS One</source>. Public Library of Science; <year>2013</year>;<volume>8</volume>: <fpage>e81046</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1371/journal.pone.0081046" xlink:type="simple">10.1371/journal.pone.0081046</ext-link></comment> <object-id pub-id-type="pmid">24339895</object-id></mixed-citation></ref>
<ref id="pone.0243251.ref013"><label>13</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Ober</surname> <given-names>U</given-names></name>, <name name-style="western"><surname>Ayroles</surname> <given-names>JF</given-names></name>, <name name-style="western"><surname>Stone</surname> <given-names>EA</given-names></name>, <name name-style="western"><surname>Richards</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Zhu</surname> <given-names>D</given-names></name>, <name name-style="western"><surname>Gibbs</surname> <given-names>RA</given-names></name>, <etal>et al</etal>. <article-title>Using Whole-Genome Sequence Data to Predict Quantitative Trait Phenotypes in Drosophila melanogaster</article-title>. <name name-style="western"><surname>Wray</surname> <given-names>NR</given-names></name>, editor. <source>PLoS Genet</source>. Public Library of Science; <year>2012</year>;<volume>8</volume>: <fpage>e1002685</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1371/journal.pgen.1002685" xlink:type="simple">10.1371/journal.pgen.1002685</ext-link></comment> <object-id pub-id-type="pmid">22570636</object-id></mixed-citation></ref>
<ref id="pone.0243251.ref014"><label>14</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Vazquez</surname> <given-names>AI</given-names></name>, <name name-style="western"><surname>Rosa</surname> <given-names>GJM</given-names></name>, <name name-style="western"><surname>Weigel</surname> <given-names>KA</given-names></name>, <name name-style="western"><surname>de los Campos</surname> <given-names>G</given-names></name>, <name name-style="western"><surname>Gianola</surname> <given-names>D</given-names></name>, <name name-style="western"><surname>Allison</surname> <given-names>DB</given-names></name>. <article-title>Predictive ability of subsets of single nucleotide polymorphisms with and without parent average in US Holsteins</article-title>. <source>J Dairy Sci</source>. <year>2010</year>;<volume>93</volume>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3168/jds.2010-3335" xlink:type="simple">10.3168/jds.2010-3335</ext-link></comment> <object-id pub-id-type="pmid">21094768</object-id></mixed-citation></ref>
<ref id="pone.0243251.ref015"><label>15</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Weigel</surname> <given-names>KA</given-names></name>, <name name-style="western"><surname>de los Campos</surname> <given-names>G</given-names></name>, <name name-style="western"><surname>Vazquez</surname> <given-names>AI</given-names></name>, <name name-style="western"><surname>Rosa</surname> <given-names>GJM</given-names></name>, <name name-style="western"><surname>Gianola</surname> <given-names>D</given-names></name>, <name name-style="western"><surname>Van Tassell</surname> <given-names>CP</given-names></name>. <article-title>Accuracy of direct genomic values derived from imputed single nucleotide polymorphism genotypes in Jersey cattle</article-title>. <source>J Dairy Sci</source>. <year>2010</year>;<volume>93</volume>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3168/jds.2010-3149" xlink:type="simple">10.3168/jds.2010-3149</ext-link></comment> <object-id pub-id-type="pmid">20965358</object-id></mixed-citation></ref>
<ref id="pone.0243251.ref016"><label>16</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Kranis</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Gheyas</surname> <given-names>AA</given-names></name>, <name name-style="western"><surname>Boschiero</surname> <given-names>C</given-names></name>, <name name-style="western"><surname>Turner</surname> <given-names>F</given-names></name>, <name name-style="western"><surname>Yu</surname> <given-names>L</given-names></name>, <name name-style="western"><surname>Smith</surname> <given-names>S</given-names></name>, <etal>et al</etal>. <article-title>Development of a high density 600K SNP genotyping array for chicken</article-title>. <source>BMC Genomics</source>. BioMed Central; <year>2013</year>;<volume>14</volume>: <fpage>59</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1186/1471-2164-14-59" xlink:type="simple">10.1186/1471-2164-14-59</ext-link></comment> <object-id pub-id-type="pmid">23356797</object-id></mixed-citation></ref>
<ref id="pone.0243251.ref017"><label>17</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Vogelstein</surname> <given-names>B</given-names></name>, <name name-style="western"><surname>Papadopoulos</surname> <given-names>N</given-names></name>, <name name-style="western"><surname>Velculescu</surname> <given-names>VE</given-names></name>, <name name-style="western"><surname>Zhou</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Diaz</surname> <given-names>LA</given-names></name>, <name name-style="western"><surname>Kinzler</surname> <given-names>KW</given-names></name>. <article-title>Cancer genome landscapes</article-title>. <source>Science</source>. <year>2013</year>;<volume>339</volume>: <fpage>1546</fpage>–<lpage>58</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1126/science.1235122" xlink:type="simple">10.1126/science.1235122</ext-link></comment> <object-id pub-id-type="pmid">23539594</object-id></mixed-citation></ref>
<ref id="pone.0243251.ref018"><label>18</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Witte</surname> <given-names>T</given-names></name>, <name name-style="western"><surname>Plass</surname> <given-names>C</given-names></name>, <name name-style="western"><surname>Gerhauser</surname> <given-names>C</given-names></name>. <source>Pan-cancer patterns of DNA methylation</source>. <year>2014</year>; <fpage>1</fpage>–<lpage>18</lpage>.</mixed-citation></ref>
<ref id="pone.0243251.ref019"><label>19</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Misztal</surname> <given-names>I.</given-names></name> <article-title>Inexpensive computation of the inverse of the genomic relationship matrix in populations with small effective population size</article-title>. <source>Genetics</source>. Genetics; <year>2016</year>;<volume>202</volume>: <fpage>401</fpage>–<lpage>409</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1534/genetics.115.182089" xlink:type="simple">10.1534/genetics.115.182089</ext-link></comment> <object-id pub-id-type="pmid">26584903</object-id></mixed-citation></ref>
<ref id="pone.0243251.ref020"><label>20</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Pocrnic</surname> <given-names>I</given-names></name>, <name name-style="western"><surname>Lourenco</surname> <given-names>DAL</given-names></name>, <name name-style="western"><surname>Masuda</surname> <given-names>Y</given-names></name>, <name name-style="western"><surname>Legarra</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Misztal</surname> <given-names>I</given-names></name>. <article-title>The dimensionality of genomic information and its effect on genomic prediction</article-title>. <source>Genetics</source>. Genetics; <year>2016</year>;<volume>203</volume>: <fpage>573</fpage>–<lpage>581</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1534/genetics.116.187013" xlink:type="simple">10.1534/genetics.116.187013</ext-link></comment> <object-id pub-id-type="pmid">26944916</object-id></mixed-citation></ref>
<ref id="pone.0243251.ref021"><label>21</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Martini</surname> <given-names>JWR</given-names></name>, <name name-style="western"><surname>Gao</surname> <given-names>N</given-names></name>, <name name-style="western"><surname>Cardoso</surname> <given-names>DF</given-names></name>, <name name-style="western"><surname>Wimmer</surname> <given-names>V</given-names></name>, <name name-style="western"><surname>Erbe</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Cantet</surname> <given-names>RJC</given-names></name>, <etal>et al</etal>. <article-title>Genomic prediction with epistasis models: on the marker-coding-dependent performance of the extended GBLUP and properties of the categorical epistasis model (CE)</article-title>. <source>BMC Bioinformatics</source>. BioMed Central Ltd.; <year>2017</year>;<volume>18</volume>: <fpage>3</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1186/s12859-016-1439-1" xlink:type="simple">10.1186/s12859-016-1439-1</ext-link></comment> <object-id pub-id-type="pmid">28049412</object-id></mixed-citation></ref>
<ref id="pone.0243251.ref022"><label>22</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>de los Campos</surname> <given-names>G</given-names></name>, <name name-style="western"><surname>Gianola</surname> <given-names>D</given-names></name>, <name name-style="western"><surname>Rosa</surname> <given-names>GJM</given-names></name>. <article-title>Reproducing kernel Hilbert spaces regression: a general framework for genetic evaluation</article-title>. <source>J Anim Sci</source>. <year>2009</year>;<volume>87</volume>: <fpage>1883</fpage>–<lpage>1887</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.2527/jas.2008-1259" xlink:type="simple">10.2527/jas.2008-1259</ext-link></comment> <object-id pub-id-type="pmid">19213705</object-id></mixed-citation></ref>
<ref id="pone.0243251.ref023"><label>23</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>de los Campos</surname> <given-names>G</given-names></name>, <name name-style="western"><surname>Gianola</surname> <given-names>D</given-names></name>, <name name-style="western"><surname>Rosa</surname> <given-names>GJM</given-names></name>, <name name-style="western"><surname>Weigel</surname> <given-names>KA</given-names></name>, <name name-style="western"><surname>Crossa</surname> <given-names>J</given-names></name>. <article-title>Semi-parametric Genomic-Enabled Prediction of Genetic Values Using Reproducing Kernel Hilbert Spaces Methods</article-title>. <source>Genet Res (Camb)</source>. Cambridge University Press; <year>2010</year>;<volume>92</volume>: <fpage>295</fpage>–<lpage>308</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1017/S0016672310000285" xlink:type="simple">10.1017/S0016672310000285</ext-link></comment> <object-id pub-id-type="pmid">20943010</object-id></mixed-citation></ref>
<ref id="pone.0243251.ref024"><label>24</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Crossa</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>de los Campos</surname> <given-names>G</given-names></name>, <name name-style="western"><surname>Perez</surname> <given-names>P</given-names></name>, <name name-style="western"><surname>Gianola</surname> <given-names>D</given-names></name>, <name name-style="western"><surname>Burgueño</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Araus</surname> <given-names>JL</given-names></name>, <etal>et al</etal>. <article-title>Prediction of genetic values of quantitative traits in plant breeding using pedigree and molecular markers</article-title>. <source>Genetics</source>. <year>2010</year>;<volume>186</volume>: <fpage>713</fpage>–<lpage>724</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1534/genetics.110.118521" xlink:type="simple">10.1534/genetics.110.118521</ext-link></comment> <object-id pub-id-type="pmid">20813882</object-id></mixed-citation></ref>
<ref id="pone.0243251.ref025"><label>25</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Pérez</surname> <given-names>P</given-names></name>, <name name-style="western"><surname>de los Campos</surname> <given-names>G</given-names></name>. <article-title>Genome-wide regression and prediction with the BGLR statistical package</article-title>. <source>Genetics</source>. Genetics; <year>2014</year>;<volume>198</volume>: <fpage>483</fpage>–<lpage>95</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1534/genetics.114.164442" xlink:type="simple">10.1534/genetics.114.164442</ext-link></comment> <object-id pub-id-type="pmid">25009151</object-id></mixed-citation></ref>
<ref id="pone.0243251.ref026"><label>26</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Browning</surname> <given-names>BL</given-names></name>, <name name-style="western"><surname>Browning</surname> <given-names>SR</given-names></name>. <article-title>A unified approach to genotype imputation and haplotype-phase inference for large data sets of trios and unrelated individuals</article-title>. <source>Am J Hum Genet</source>. Elsevier; <year>2009</year>;<volume>84</volume>: <fpage>210</fpage>–<lpage>223</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1016/j.ajhg.2009.01.005" xlink:type="simple">10.1016/j.ajhg.2009.01.005</ext-link></comment> <object-id pub-id-type="pmid">19200528</object-id></mixed-citation></ref>
<ref id="pone.0243251.ref027"><label>27</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Howie</surname> <given-names>B</given-names></name>, <name name-style="western"><surname>Fuchsberger</surname> <given-names>C</given-names></name>, <name name-style="western"><surname>Stephens</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Marchini</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Abecasis</surname> <given-names>GR</given-names></name>. <article-title>Fast and accurate genotype imputation in genome-wide association studies through pre-phasing</article-title>. <source>Nat Genet</source>. <year>2012</year>;<volume>44</volume>: <fpage>955</fpage>–<lpage>959</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1038/ng.2354" xlink:type="simple">10.1038/ng.2354</ext-link></comment> <object-id pub-id-type="pmid">22820512</object-id></mixed-citation></ref>
<ref id="pone.0243251.ref028"><label>28</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Ni</surname> <given-names>G</given-names></name>, <name name-style="western"><surname>Strom</surname> <given-names>TM</given-names></name>, <name name-style="western"><surname>Pausch</surname> <given-names>H</given-names></name>, <name name-style="western"><surname>Reimer</surname> <given-names>C</given-names></name>, <name name-style="western"><surname>Preisinger</surname> <given-names>R</given-names></name>, <name name-style="western"><surname>Simianer</surname> <given-names>H</given-names></name>, <etal>et al</etal>. <article-title>Comparison among three variant callers and assessment of the accuracy of imputation from SNP array data to whole-genome sequence level in chicken</article-title>. <source>BMC Genomics</source>. <year>2015</year>;<volume>16</volume>: <fpage>824</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1186/s12864-015-2059-2" xlink:type="simple">10.1186/s12864-015-2059-2</ext-link></comment> <object-id pub-id-type="pmid">26486989</object-id></mixed-citation></ref>
<ref id="pone.0243251.ref029"><label>29</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Langfelder</surname> <given-names>P</given-names></name>, <name name-style="western"><surname>Horvath</surname> <given-names>S</given-names></name>. <article-title>WGCNA: an R package for weighted correlation network analysis</article-title>. <source>BMC Bioinformatics</source>. BioMed Central; <year>2008</year>;<volume>9</volume>: <fpage>559</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1186/1471-2105-9-559" xlink:type="simple">10.1186/1471-2105-9-559</ext-link></comment> <object-id pub-id-type="pmid">19114008</object-id></mixed-citation></ref>
<ref id="pone.0243251.ref030"><label>30</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Du</surname> <given-names>P</given-names></name>, <name name-style="western"><surname>Zhang</surname> <given-names>X</given-names></name>, <name name-style="western"><surname>Huang</surname> <given-names>C-C</given-names></name>, <name name-style="western"><surname>Jafari</surname> <given-names>N</given-names></name>, <name name-style="western"><surname>Kibbe</surname> <given-names>WA</given-names></name>, <name name-style="western"><surname>Hou</surname> <given-names>L</given-names></name>, <etal>et al</etal>. <article-title>Comparison of Beta-value and M-value methods for quantifying methylation levels by microarray analysis</article-title>. <source>BMC Bioinformatics</source>. BioMed Central; <year>2010</year>;<volume>11</volume>: <fpage>587</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1186/1471-2105-11-587" xlink:type="simple">10.1186/1471-2105-11-587</ext-link></comment> <object-id pub-id-type="pmid">21118553</object-id></mixed-citation></ref>
<ref id="pone.0243251.ref031"><label>31</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Hastie</surname> <given-names>T</given-names></name>, <name name-style="western"><surname>Tibshirani</surname> <given-names>R</given-names></name>, <name name-style="western"><surname>Narasimhan</surname> <given-names>B</given-names></name>, <name name-style="western"><surname>Chu</surname> <given-names>G</given-names></name>. <source>Impute: Imputation for microarray data</source>. <year>2016</year>;<volume>17</volume>: <fpage>520</fpage>–<lpage>525</lpage>.</mixed-citation></ref>
<ref id="pone.0243251.ref032"><label>32</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Lazar</surname> <given-names>C</given-names></name>, <name name-style="western"><surname>Meganck</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Taminau</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Steenhoff</surname> <given-names>D</given-names></name>, <name name-style="western"><surname>Coletta</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Molter</surname> <given-names>C</given-names></name>, <etal>et al</etal>. <article-title>Batch effect removal methods for microarray gene expression data integration: a survey</article-title>. <source>Brief Bioinform</source>. Oxford University Press; <year>2013</year>;<volume>14</volume>: <fpage>469</fpage>–<lpage>490</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1093/bib/bbs037" xlink:type="simple">10.1093/bib/bbs037</ext-link></comment> <object-id pub-id-type="pmid">22851511</object-id></mixed-citation></ref>
<ref id="pone.0243251.ref033"><label>33</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Bates</surname> <given-names>D</given-names></name>, <name name-style="western"><surname>Mullen</surname> <given-names>KM</given-names></name>, <name name-style="western"><surname>Nash</surname> <given-names>JC</given-names></name>, <name name-style="western"><surname>Varadhan</surname> <given-names>R</given-names></name>. <source>minqa: Derivative-free optimization algorithms by quadratic approximation</source> [Internet]. <year>2014</year>. Available: <ext-link ext-link-type="uri" xlink:href="https://cran.r-project.org/package=minqa" xlink:type="simple">https://cran.r-project.org/package=minqa</ext-link></mixed-citation></ref>
<ref id="pone.0243251.ref034"><label>34</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Mevik</surname> <given-names>B-H</given-names></name>, <name name-style="western"><surname>Wehrens</surname> <given-names>R</given-names></name>, <name name-style="western"><surname>Hovde</surname> <given-names>K</given-names></name>. <article-title>pls: Partial Least Squares and Principal Component Regression</article-title>. <source>R package version 2.7-2</source>. <year>2019</year>; <ext-link ext-link-type="uri" xlink:href="https://CRAN.R-project.org/package=pls" xlink:type="simple">https://CRAN.R-project.org/package=pls</ext-link></mixed-citation></ref>
</ref-list>
</back>
</article>