<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1d3 20150301//EN" "http://jats.nlm.nih.gov/publishing/1.1d3/JATS-journalpublishing1.dtd">
<article article-type="research-article" dtd-version="1.1d3" xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="nlm-ta">PLoS ONE</journal-id>
<journal-id journal-id-type="publisher-id">plos</journal-id>
<journal-id journal-id-type="pmc">plosone</journal-id>
<journal-title-group>
<journal-title>PLOS ONE</journal-title>
</journal-title-group>
<issn pub-type="epub">1932-6203</issn>
<publisher>
<publisher-name>Public Library of Science</publisher-name>
<publisher-loc>San Francisco, CA USA</publisher-loc>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">PONE-D-24-04044</article-id>
<article-id pub-id-type="doi">10.1371/journal.pone.0308543</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Study Protocol</subject>
</subj-group>
<subj-group subj-group-type="Discipline-v3">
<subject>Research and analysis methods</subject><subj-group><subject>Simulation and modeling</subject></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Research and analysis methods</subject><subj-group><subject>Mathematical and statistical techniques</subject><subj-group><subject>Statistical methods</subject><subj-group><subject>Regression analysis</subject><subj-group><subject>Linear regression analysis</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Physical sciences</subject><subj-group><subject>Mathematics</subject><subj-group><subject>Statistics</subject><subj-group><subject>Statistical methods</subject><subj-group><subject>Regression analysis</subject><subj-group><subject>Linear regression analysis</subject></subj-group></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Research and analysis methods</subject><subj-group><subject>Mathematical and statistical techniques</subject><subj-group><subject>Statistical methods</subject><subj-group><subject>Monte Carlo method</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Physical sciences</subject><subj-group><subject>Mathematics</subject><subj-group><subject>Statistics</subject><subj-group><subject>Statistical methods</subject><subj-group><subject>Monte Carlo method</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Research and analysis methods</subject><subj-group><subject>Mathematical and statistical techniques</subject><subj-group><subject>Statistical methods</subject><subj-group><subject>Forecasting</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Physical sciences</subject><subj-group><subject>Mathematics</subject><subj-group><subject>Statistics</subject><subj-group><subject>Statistical methods</subject><subj-group><subject>Forecasting</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Research and analysis methods</subject><subj-group><subject>Research assessment</subject><subj-group><subject>Reproducibility</subject></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Research and analysis methods</subject><subj-group><subject>Research assessment</subject><subj-group><subject>Research errors</subject></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Research and analysis methods</subject><subj-group><subject>Mathematical and statistical techniques</subject><subj-group><subject>Statistical methods</subject><subj-group><subject>Regression analysis</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Physical sciences</subject><subj-group><subject>Mathematics</subject><subj-group><subject>Statistics</subject><subj-group><subject>Statistical methods</subject><subj-group><subject>Regression analysis</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Nutrition</subject></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Medicine and health sciences</subject><subj-group><subject>Nutrition</subject></subj-group></subj-group></article-categories>
<title-group>
<article-title>Evaluating variable selection methods for multivariable regression models: A simulation study protocol</article-title>
<alt-title alt-title-type="running-head">Evaluating variable selection methods for multivariable regression models</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">https://orcid.org/0000-0003-1215-8561</contrib-id>
<name name-style="western">
<surname>Ullmann</surname> <given-names>Theresa</given-names></name>
<role content-type="http://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role content-type="http://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role content-type="http://credit.niso.org/contributor-roles/software/">Software</role>
<role content-type="http://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-original-draft/">Writing – original draft</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-review-editing/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"><sup>1</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<name name-style="western">
<surname>Heinze</surname> <given-names>Georg</given-names></name>
<role content-type="http://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role content-type="http://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role content-type="http://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role content-type="http://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-original-draft/">Writing – original draft</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-review-editing/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"><sup>1</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<name name-style="western">
<surname>Hafermann</surname> <given-names>Lorena</given-names></name>
<role content-type="http://credit.niso.org/contributor-roles/writing-review-editing/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff002"><sup>2</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">https://orcid.org/0000-0003-3943-6234</contrib-id>
<name name-style="western">
<surname>Schilhart-Wallisch</surname> <given-names>Christine</given-names></name>
<role content-type="http://credit.niso.org/contributor-roles/software/">Software</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-review-editing/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff003"><sup>3</sup></xref>
</contrib>
<contrib contrib-type="author" corresp="yes" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">https://orcid.org/0000-0003-1339-0311</contrib-id>
<name name-style="western">
<surname>Dunkler</surname> <given-names>Daniela</given-names></name>
<role content-type="http://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role content-type="http://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role content-type="http://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role content-type="http://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-original-draft/">Writing – original draft</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-review-editing/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"><sup>1</sup></xref>
<xref ref-type="corresp" rid="cor001">*</xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<collab>for TG2 of the STRATOS initiative</collab>
<xref ref-type="fn" rid="fn001"><sup>¶</sup></xref>
</contrib>
</contrib-group>
<aff id="aff001">
<label>1</label>
<addr-line>Institute of Clinical Biometrics, Center for Medical Data Science, Medical University of Vienna, Vienna, Austria</addr-line>
</aff>
<aff id="aff002">
<label>2</label>
<addr-line>Institute of Biometry and Clinical Epidemiology, Charité – Universitätsmedizin Berlin, corporate member of Freie Universität Berlin and Humboldt-Universität zu Berlin, Berlin, Germany</addr-line>
</aff>
<aff id="aff003">
<label>3</label>
<addr-line>Austrian Agency for Health and Food Safety (AGES), Vienna, Austria</addr-line>
</aff>
<contrib-group>
<contrib contrib-type="editor" xlink:type="simple">
<name name-style="western">
<surname>Tian</surname> <given-names>Suyan</given-names></name>
<role>Editor</role>
<xref ref-type="aff" rid="edit1"/>
</contrib>
</contrib-group>
<aff id="edit1">
<addr-line>The First Hospital of Jilin University, CHINA</addr-line>
</aff>
<author-notes>
<fn fn-type="conflict" id="coi001">
<p>The authors have declared that no competing interests exist.</p>
</fn>
<fn fn-type="other" id="fn001">
<p>¶ Membership list can be found in the Acknowledgments section.</p>
</fn>
<corresp id="cor001">* E-mail: <email xlink:type="simple">daniela.dunkler@meduniwien.ac.at</email></corresp>
</author-notes>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<pub-date pub-type="epub">
<day>9</day>
<month>8</month>
<year>2024</year>
</pub-date>
<volume>19</volume>
<issue>8</issue>
<elocation-id>e0308543</elocation-id>
<history>
<date date-type="received">
<day>7</day>
<month>2</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>25</day>
<month>7</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-year>2024</copyright-year>
<copyright-holder>Ullmann et al</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">
<license-p>This is an open access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">Creative Commons Attribution License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.</license-p>
</license>
</permissions>
<self-uri content-type="pdf" xlink:href="info:doi/10.1371/journal.pone.0308543"/>
<abstract>
<p>Researchers often perform data-driven variable selection when modeling the associations between an outcome and multiple independent variables in regression analysis. Variable selection may improve the interpretability, parsimony and/or predictive accuracy of a model. Yet variable selection can also have negative consequences, such as false exclusion of important variables or inclusion of noise variables, biased estimation of regression coefficients, underestimated standard errors and invalid confidence intervals, as well as model instability. While the potential advantages and disadvantages of variable selection have been discussed in the literature for decades, few large-scale simulation studies have neutrally compared data-driven variable selection methods with respect to their consequences for the resulting models. We present the protocol for a simulation study that will evaluate different variable selection methods: forward selection, stepwise forward selection, backward elimination, augmented backward elimination, univariable selection, univariable selection followed by backward elimination, and penalized likelihood approaches (Lasso, relaxed Lasso, adaptive Lasso). These methods will be compared with respect to false inclusion and/or exclusion of variables, consequences on bias and variance of the estimated regression coefficients, the validity of the confidence intervals for the coefficients, the accuracy of the estimated variable importance ranking, and the predictive performance of the selected models. We consider both linear and logistic regression in a low-dimensional setting (20 independent variables with 10 true predictors and 10 noise variables). The simulation will be based on real-world data from the National Health and Nutrition Examination Survey (NHANES). Publishing this study protocol ahead of performing the simulation increases transparency and allows integrating the perspective of other experts into the study design.</p>
</abstract>
<funding-group>
<award-group id="award001">
<funding-source>
<institution-wrap>
<institution-id institution-id-type="funder-id">http://dx.doi.org/10.13039/501100002428</institution-id>
<institution>Austrian Science Fund</institution>
</institution-wrap>
</funding-source>
<award-id>I-4739-B</award-id>
<principal-award-recipient>
<contrib-id authenticated="true" contrib-id-type="orcid">https://orcid.org/0000-0003-1339-0311</contrib-id>
<name name-style="western">
<surname>Dunkler</surname> <given-names>Daniela</given-names></name>
</principal-award-recipient>
</award-group>
<award-group id="award002">
<funding-source>
<institution-wrap>
<institution-id institution-id-type="funder-id">http://dx.doi.org/10.13039/501100001659</institution-id>
<institution>Deutsche Forschungsgemeinschaft</institution>
</institution-wrap>
</funding-source>
<award-id>RA 2347/8-1</award-id>
</award-group>
<funding-statement>This research was funded in part by the Austrian Science Fund (FWF, <ext-link ext-link-type="uri" xlink:href="https://www.fwf.ac.at/en/" xlink:type="simple">https://www.fwf.ac.at/en/</ext-link>) [I-4739-B] (for T.U. and C.W.) and by the German Research Foundation (DFG, <ext-link ext-link-type="uri" xlink:href="https://www.dfg.de/en" xlink:type="simple">https://www.dfg.de/en</ext-link>) [RA 2347/8-1] (for L. H.). For open access purposes, the author has applied a CC BY public copyright license to any author accepted manuscript version arising from this submission. The funders did not and will not have any role in study design, data collection and analysis, decision to publish, or preparation of the manuscript.</funding-statement>
</funding-group>
<counts>
<fig-count count="1"/>
<table-count count="4"/>
<page-count count="19"/>
</counts>
<custom-meta-group>
<custom-meta id="data-availability">
<meta-name>Data Availability</meta-name>
<meta-value>This manuscript is a protocol of a simulation study. We intend to share the software code after the study has been conducted and published. This will allow recreating our data and reproducing our simulation study.</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="sec001" sec-type="intro">
<title>1 Introduction</title>
<p>Data-driven variable selection is frequently performed when modeling the associations between an outcome and multiple independent variables (sometimes also referred to as explanatory variables, covariates or predictors). Variable selection may help to generate parsimonious and interpretable models, and may also yield models with increased predictive accuracy. Despite these potential advantages, data-driven variable selection can also have unintended negative consequences that many researchers are not fully aware of. Variable selection induces additional uncertainty in the estimation process and may cause biased estimation of regression coefficients, model instability (i.e., models that are not robust with respect to small perturbations of the data set), and issues with post-selection inference such as underestimated standard errors and invalid confidence intervals [<xref ref-type="bibr" rid="pone.0308543.ref001">1</xref>–<xref ref-type="bibr" rid="pone.0308543.ref005">5</xref>].</p>
<p>A recent review [<xref ref-type="bibr" rid="pone.0308543.ref001">1</xref>] provided guidance about variable selection and gave an overview of possible consequences of variable selection. However, there are few systematic simulation studies that compare different variable selection methods with respect to their consequences for the resulting models (for some exceptions, see [<xref ref-type="bibr" rid="pone.0308543.ref006">6</xref>–<xref ref-type="bibr" rid="pone.0308543.ref010">10</xref>]). While many articles proposing new variable selection methods include a comparison with existing methods (based on simulated or real data), these comparisons are typically somewhat limited, often comparing the new method to only one to three competitors, even though there are many more existing methods. Moreover, these articles are inherently biased towards demonstrating superiority of the new methods. In particular, such studies cannot be considered as <italic>neutral</italic>. A neutral comparison study is a study whose authors do not have a vested interest in one of the competing methods, and are (as a group) approximately equally familiar with all considered methods [<xref ref-type="bibr" rid="pone.0308543.ref011">11</xref>, <xref ref-type="bibr" rid="pone.0308543.ref012">12</xref>]. More neutral comparison studies about existing variable selection methods are needed to better understand their properties, a viewpoint that aligns with the goals of the STRATOS initiative (STRengthening Analytical Thinking for Observational Studies [<xref ref-type="bibr" rid="pone.0308543.ref013">13</xref>]). The STRATOS initiative is an international consortium of biostatistical experts, and aims to provide guidance in the design and analysis of observational studies for specialist and non-specialist audiences. This perspective motivates our comprehensive simulation study.</p>
<p>We will focus on <italic>descriptive</italic> modeling (i.e., describing the relationship between the outcome and the independent variables in a parsimonious manner) and <italic>predictive</italic> modeling (i.e., predicting the outcome as accurately as possible) [<xref ref-type="bibr" rid="pone.0308543.ref014">14</xref>]. Our setting is multivariable regression analysis with one outcome variable. The outcome is either continuous (linear regression) or binary (logistic regression). We simulate data in a low-dimensional scenario (20 variables consisting of 10 true predictors and 10 noise variables). Different variable selection methods with multiple parameter settings are compared: forward selection, stepwise forward selection, backward elimination, augmented backward elimination [<xref ref-type="bibr" rid="pone.0308543.ref015">15</xref>], univariable selection, univariable selection followed by backward elimination, the Lasso [<xref ref-type="bibr" rid="pone.0308543.ref016">16</xref>], the relaxed Lasso [<xref ref-type="bibr" rid="pone.0308543.ref009">9</xref>, <xref ref-type="bibr" rid="pone.0308543.ref017">17</xref>], and the adaptive Lasso [<xref ref-type="bibr" rid="pone.0308543.ref018">18</xref>]. We compare the performances of these methods with respect to false inclusion and/or exclusion of variables, consequences on bias and variance of the estimated regression coefficients, the validity of the confidence intervals for the coefficients, the accuracy of the estimated variable importance ranking, and finally the predictive performance of the selected models.</p>
<p>Using simulated instead of real data allows us to a) know the true data generating process and b) systematically vary several data characteristics [<xref ref-type="bibr" rid="pone.0308543.ref019">19</xref>, <xref ref-type="bibr" rid="pone.0308543.ref020">20</xref>]. For example, we will include varying sample sizes and <italic>R</italic><sup>2</sup>, as the consequences of variable selection depend on these parameters. To ensure that the simulation results are practically relevant, we use real data as the starting point for our simulation. The distributions and correlation structure of the variables are based on data from the National Health and Nutrition Examination Survey (NHANES) [<xref ref-type="bibr" rid="pone.0308543.ref021">21</xref>]. The choice of variables and true regression coefficients is inspired by an applied study about predicting the difference between ambulatory/home and clinic blood pressure readings [<xref ref-type="bibr" rid="pone.0308543.ref022">22</xref>]. Our simulated data thus mimics real cardiovascular data.</p>
<p>Our focus is on low-dimensional data, which is reflected in our simulation setting with twenty independent variables. Data of this type frequently appears in medicine and other application fields, and researchers often apply variable selection in this context. For example, a systematic review of models for COVID-19 prognosis [<xref ref-type="bibr" rid="pone.0308543.ref023">23</xref>, <xref ref-type="bibr" rid="pone.0308543.ref024">24</xref>] identified 236 newly developed regression models for prediction. Data-driven variable selection was applied (and reported) for 196 models. In 165 models both the number of candidate predictors (i.e., the predictors considered at the start of data-driven selection) and the number of predictors in the final model were reported; the median numbers were 28 (range 4–130), and 6 (range 1–38), respectively. This demonstrates that low- to medium-dimensional data played an important role in COVID-19 prediction research. Of course, data-driven variable selection is also relevant for high-dimensional data. Comparing variable selection methods for high-dimensional data would require a different study design and is not the purpose of this planned simulation study.</p>
<p>As mentioned above, neutrality is an important goal when conducting systematic comparison studies. “Perfect” neutrality may be the ultimate goal, but this ideal can be difficult to achieve in practice. While we aim to be as neutral as possible, we disclose (for the purpose of full transparency) that one of the methods for variable selection included in our comparison, namely augmented backward elimination, was originally proposed by two authors of the present study protocol [<xref ref-type="bibr" rid="pone.0308543.ref015">15</xref>]. Our goal was to not let this fact influence our choice of study design, though unconscious biases can never be fully excluded. Striving for as much neutrality as possible motivated us to publish this study protocol. This will allow us to integrate the comments of reviewers before performing the simulation. For the design of our study, results from previous smaller simulation studies and pilot studies were taken into account [<xref ref-type="bibr" rid="pone.0308543.ref001">1</xref>]; however, the study outlined in this protocol has not yet been run and analyzed. Preregistration of study protocols for simulation studies/methodological studies is still very rare (for an exception, see [<xref ref-type="bibr" rid="pone.0308543.ref025">25</xref>]). However, this practice could offer similar advantages to those discussed for preregistration in applied research, such as increased transparency and prevention of “hindsight bias” [<xref ref-type="bibr" rid="pone.0308543.ref026">26</xref>]. Potential advantages of preregistering protocols for simulation studies, but also possible limitations and challenges, are discussed more extensively elsewhere [<xref ref-type="bibr" rid="pone.0308543.ref027">27</xref>].</p>
<p>A specific goal of our simulation study is to evaluate previously published recommendations about variable selection [<xref ref-type="bibr" rid="pone.0308543.ref001">1</xref>], which we discuss in Section 2. We then describe our simulation design in Section 3, explain the planned code review in Section 4, and conclude the protocol with some final remarks in Section 5.</p>
</sec>
<sec id="sec002">
<title>2 Previous variable selection recommendations</title>
<p>Varied viewpoints exist in the literature as to whether researchers should apply data-driven variable selection, and, if so, which methods and parameters are deemed preferable. Some authors generally caution against data-driven variable selection, stressing potential negative consequences [<xref ref-type="bibr" rid="pone.0308543.ref005">5</xref>]. Other authors put more focus on potential advantages of variable selection and are more optimistic about using selection methods, at least if the sample size is large enough and if selection is accompanied by a stability analysis [<xref ref-type="bibr" rid="pone.0308543.ref028">28</xref>]. In a review conducted by three co-authors of the present study protocol, Heinze et al. [<xref ref-type="bibr" rid="pone.0308543.ref001">1</xref>] summarized different perspectives from the literature. Drawing upon existing recommendations, but also taking their own experience and a small simulation study into account, they derived recommendations for the usage of variable selection methods. These recommendations consider both benefits and drawbacks of variable selection, thereby reconciling different viewpoints on the matter. The recommendations depend on the “events-per-variable” (EPV) in the data. The EPV is the ratio between sample size (in linear regression) or the number of the less frequent outcome (in logistic regression) and the number of independent variables. Data-driven variable selection is applied on a carefully designed “global” model which includes all independent variables relevant for the research question. The denominator of EPV refers to the number of design variables (including possible dummy variables and other constructed variables) in this global model. The following bullet points list the recommendations, and how we plan to evaluate them.</p>
<list list-type="bullet">
<list-item>
<p>EPV &gt; 25: While variable selection may generally work well for a large EPV value, the selection of independent variables with small effect size can still be unstable. If backward elimination is used, a stringent threshold of <italic>α</italic> = 0.05 or selection with the BIC may lead to a more accurate selection of variables than milder thresholds.</p>
<p><italic>In our study</italic>: We will check whether selection rates of variables with small standardized regression coefficients (e.g., ±0.25) are notably different from either 0 or 1 (which indicates instability). For backward elimination, we will evaluate whether the selection of variables is more accurate when using the threshold <italic>α</italic> = 0.05 or the BIC (which corresponds to even stricter thresholds for our considered sample sizes [<xref ref-type="bibr" rid="pone.0308543.ref001">1</xref>]), compared to using larger <italic>α</italic> values.</p>
</list-item>
<list-item>
<p>10 &lt; EPV ≤ 25: In general, the selection of variables might be unstable with such an EPV. When variables with unclear effect size are selected, their effects might be over-estimated. Penalized estimation (Lasso) or postestimation shrinkage is thus recommended. If backward elimination is used, a threshold corresponding to selection with the AIC (approximately <italic>α</italic> = 0.157) is recommended, but not smaller <italic>α</italic> values.</p>
<p><italic>In our study</italic>: Again, we will evaluate stability by checking whether selection rates of variables, particularly those with small standardized regression coefficients, are notably different from either 0 or 1. We will also calculate the conditional bias (i.e., bias conditioned on selection) of the variables and analyze whether variables with small standardized regression coefficients have large conditional bias away from zero. For backward elimination, we will evaluate to which extent a threshold of <italic>α</italic> = 0.157 (or an even milder threshold of <italic>α</italic> = 0.5) selects the true predictors more frequently than smaller thresholds (i.e., a fixed threshold of <italic>α</italic> = 0.05 or selection with the BIC) [<xref ref-type="bibr" rid="pone.0308543.ref003">3</xref>].</p>
</list-item>
<list-item>
<p>EPV ≤ 10: Data-driven variable selection is generally not recommended.</p>
<p><italic>In our study</italic>: We will analyze whether variable selection has negative consequences with respect to the different performance criteria.</p>
</list-item>
</list>
<p>The results of variable selection are not only influenced by EPV, but also by other aspects such as the <italic>R</italic><sup>2</sup> of the model. We will thus consider different <italic>R</italic><sup>2</sup> values in our simulation study. The recommendations above do not take <italic>R</italic><sup>2</sup> into account, as the <italic>R</italic><sup>2</sup> of the model is typically not known prior to the data analysis.</p>
</sec>
<sec id="sec003">
<title>3 Simulation design</title>
<p>Morris et al. [<xref ref-type="bibr" rid="pone.0308543.ref019">19</xref>] proposed to describe the following components when reporting a simulation study: the aims of the study (A), the data-generating mechanisms (D), the estimands (i.e., the population quantities which are estimated) and other targets of interest (E), the methods to be compared (M), and the performance measures used for evaluating the methods (P). The ADEMP components of our study are briefly summarized in Tables <xref ref-type="table" rid="pone.0308543.t001">1</xref> and <xref ref-type="table" rid="pone.0308543.t002">2</xref>. We now describe the components in more detail.</p>
<table-wrap id="pone.0308543.t001" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0308543.t001</object-id>
<label>Table 1</label>
<caption>
<title>Summary of the simulation design, part 1: Aims and data-generating mechanisms.</title>
</caption>
<alternatives>
<graphic id="pone.0308543.t001g" mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0308543.t001" xlink:type="simple"/>
<table border="0" frame="box" rules="all">
<colgroup>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
</colgroup>
<tbody>
<tr>
<td align="left">Aims (Section 3.1)</td>
<td align="left">Comparison of popular data-driven variable selection methods for multivariable linear or logistic regression, with respect to their consequences for the resulting models.</td>
</tr>
<tr>
<td align="left">Data-generating mechanisms (Section 3.2)</td>
<td align="left">
<list list-type="bullet">
<list-item>
<p>20 variables: 10 predictors <italic>X</italic><sub>1</sub>, …, <italic>X</italic><sub>10</sub> and 10 noise variables <italic>X</italic><sub>11</sub>, …, <italic>X</italic><sub>20</sub> (mixture of binary and continuous variables)</p>
</list-item>
<list-item>
<p>Distributions and correlation structure of the variables are based on NHANES data [<xref ref-type="bibr" rid="pone.0308543.ref021">21</xref>] (see <xref ref-type="fig" rid="pone.0308543.g001">Fig 1</xref>, <xref ref-type="supplementary-material" rid="pone.0308543.s001">S1 Fig</xref> and <xref ref-type="supplementary-material" rid="pone.0308543.s005">S1 Table</xref>).</p>
</list-item>
<list-item>
<p>Standardized regression coefficients for the predictors <italic>X</italic><sub>1</sub>, …, <italic>X</italic><sub>10</sub>: <inline-formula id="pone.0308543.e001"><alternatives><graphic id="pone.0308543.e001g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e001" xlink:type="simple"/><mml:math display="inline" id="M1"><mml:mrow><mml:mrow><mml:mo>(</mml:mo> <mml:msubsup><mml:mi>β</mml:mi> <mml:mn>1</mml:mn> <mml:mrow><mml:mi>s</mml:mi> <mml:mi>d</mml:mi></mml:mrow></mml:msubsup> <mml:mo>,</mml:mo> <mml:mo>…</mml:mo> <mml:mo>,</mml:mo> <mml:msubsup><mml:mi>β</mml:mi> <mml:mrow><mml:mn>10</mml:mn></mml:mrow> <mml:mrow><mml:mi>s</mml:mi> <mml:mi>d</mml:mi></mml:mrow></mml:msubsup> <mml:mo>)</mml:mo></mml:mrow> <mml:mo>=</mml:mo> <mml:mrow><mml:mo>(</mml:mo> <mml:mn>1</mml:mn> <mml:mo>.</mml:mo> <mml:mn>5</mml:mn> <mml:mo>,</mml:mo> <mml:mo>-</mml:mo> <mml:mn>1</mml:mn> <mml:mo>,</mml:mo> <mml:mn>1</mml:mn> <mml:mo>,</mml:mo> <mml:mn>0</mml:mn> <mml:mo>.</mml:mo> <mml:mn>75</mml:mn> <mml:mo>,</mml:mo> <mml:mn>0</mml:mn> <mml:mo>.</mml:mo> <mml:mn>5</mml:mn> <mml:mo>,</mml:mo> <mml:mn>0</mml:mn> <mml:mo>.</mml:mo> <mml:mn>5</mml:mn> <mml:mo>,</mml:mo> <mml:mn>0</mml:mn> <mml:mo>.</mml:mo> <mml:mn>5</mml:mn> <mml:mo>,</mml:mo> <mml:mo>-</mml:mo> <mml:mn>0</mml:mn> <mml:mo>.</mml:mo> <mml:mn>5</mml:mn> <mml:mo>,</mml:mo> <mml:mo>-</mml:mo> <mml:mn>0</mml:mn> <mml:mo>.</mml:mo> <mml:mn>25</mml:mn> <mml:mo>,</mml:mo> <mml:mo>-</mml:mo> <mml:mn>0</mml:mn> <mml:mo>.</mml:mo> <mml:mn>25</mml:mn> <mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:math></alternatives></inline-formula> The regression coefficients for <italic>X</italic><sub>11</sub>, …, <italic>X</italic><sub>20</sub> are set to zero.</p>
</list-item>
<list-item>
<p>For settings with linear effects, the outcome <italic>Y</italic> is simulated as follows:
<list list-type="bullet"><list-item><p>For linear regression: <italic>Y</italic> = <bold><italic>xβ</italic></bold> + <italic>ϵ</italic> with <italic>ϵ</italic> ∼ <italic>N</italic>(0, <italic>σ</italic><sup>2</sup>), and <italic>σ</italic><sup>2</sup> chosen such that <italic>R</italic><sup>2</sup> = 0.45 (setting 1), <italic>R</italic><sup>2</sup> = 0.15 (setting 2), or <italic>R</italic><sup>2</sup> = 0.7 (setting 3). The intercept <italic>β</italic><sub>0</sub> is set to 36.</p></list-item> <list-item><p>For logistic regression: outcomes <italic>Y</italic> are drawn from a Bernoulli distribution with event probability 1/(1 + exp(−<italic>c</italic> <bold><italic>xβ</italic></bold>)). The intercept <italic>β</italic><sub>0</sub> and the constant <italic>c</italic> &gt; 0 are adjusted such that
<list list-type="bullet"><list-item><p>the expected event probability equals 0.3 with Cox-Snell <inline-formula id="pone.0308543.e002"><alternatives><graphic id="pone.0308543.e002g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e002" xlink:type="simple"/><mml:math display="inline" id="M2"><mml:mrow><mml:msubsup><mml:mi>R</mml:mi> <mml:mrow><mml:mi>C</mml:mi> <mml:mi>S</mml:mi></mml:mrow> <mml:mn>2</mml:mn></mml:msubsup> <mml:mo>=</mml:mo> <mml:mn>0</mml:mn> <mml:mo>.</mml:mo> <mml:mn>40</mml:mn></mml:mrow></mml:math></alternatives></inline-formula> (setting 4) or <inline-formula id="pone.0308543.e003"><alternatives><graphic id="pone.0308543.e003g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e003" xlink:type="simple"/><mml:math display="inline" id="M3"><mml:mrow><mml:msubsup><mml:mi>R</mml:mi> <mml:mrow><mml:mi>C</mml:mi> <mml:mi>S</mml:mi></mml:mrow> <mml:mn>2</mml:mn></mml:msubsup> <mml:mo>=</mml:mo> <mml:mn>0</mml:mn> <mml:mo>.</mml:mo> <mml:mn>13</mml:mn></mml:mrow></mml:math></alternatives></inline-formula> (setting 5)</p></list-item> <list-item><p>the expected event probability equals 0.05 with Cox-Snell <inline-formula id="pone.0308543.e004"><alternatives><graphic id="pone.0308543.e004g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e004" xlink:type="simple"/><mml:math display="inline" id="M4"><mml:mrow><mml:msubsup><mml:mi>R</mml:mi> <mml:mrow><mml:mi>C</mml:mi> <mml:mi>S</mml:mi></mml:mrow> <mml:mn>2</mml:mn></mml:msubsup> <mml:mo>=</mml:mo> <mml:mn>0</mml:mn> <mml:mo>.</mml:mo> <mml:mn>16</mml:mn></mml:mrow></mml:math></alternatives></inline-formula> (setting 6) or <inline-formula id="pone.0308543.e005"><alternatives><graphic id="pone.0308543.e005g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e005" xlink:type="simple"/><mml:math display="inline" id="M5"><mml:mrow><mml:msubsup><mml:mi>R</mml:mi> 
<mml:mrow><mml:mi>C</mml:mi> <mml:mi>S</mml:mi></mml:mrow> <mml:mn>2</mml:mn></mml:msubsup> <mml:mo>=</mml:mo> <mml:mn>0</mml:mn> <mml:mo>.</mml:mo> <mml:mn>05</mml:mn></mml:mrow></mml:math></alternatives></inline-formula> (setting 7)</p></list-item></list></p></list-item></list></p>
</list-item>
<list-item>
<p>To evaluate models with mildly misspecified functional forms (a frequent situation in practice), each of the 7 settings is also considered with <italic>Y</italic> generated with nonlinear effects, yielding 14 settings in total (see <xref ref-type="table" rid="pone.0308543.t003">Table 3</xref>).</p>
</list-item>
<list-item>
<p>Additionally, we consider three simplified scenarios with all variables <inline-formula id="pone.0308543.e006"><alternatives><graphic id="pone.0308543.e006g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e006" xlink:type="simple"/><mml:math display="inline" id="M6"><mml:mrow><mml:mi mathvariant="script">N</mml:mi> <mml:mo>(</mml:mo> <mml:mn>0</mml:mn> <mml:mo>,</mml:mo> <mml:mn>1</mml:mn> <mml:mo>)</mml:mo></mml:mrow></mml:math></alternatives></inline-formula>-distributed and/or uncorrelated (see Section 3.2.5 for details).</p>
</list-item>
<list-item>
<p>Each setting is considered with varying sample sizes (see <xref ref-type="table" rid="pone.0308543.t004">Table 4</xref>).</p>
</list-item>
</list>
</td>
</tr>
</tbody>
</table>
</alternatives>
</table-wrap>
<table-wrap id="pone.0308543.t002" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0308543.t002</object-id>
<label>Table 2</label>
<caption>
<title>Summary of the simulation design, part 2: Estimands, methods, and performance measures.</title>
</caption>
<alternatives>
<graphic id="pone.0308543.t002g" mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0308543.t002" xlink:type="simple"/>
<table border="0" frame="box" rules="all">
<colgroup>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
</colgroup>
<tbody>
<tr>
<td align="left">Estimands and other targets (Section 3.3)</td>
<td align="left">Regression coefficients, model selection, predictions</td>
</tr>
<tr>
<td align="left">Methods (Section 3.4)</td>
<td align="left">
<list list-type="bullet">
<list-item>
<p>Forward selection with AIC</p>
</list-item>
<list-item>
<p>Stepwise forward selection with AIC (forward selection with backward elimination steps)</p>
</list-item>
<list-item>
<p>Backward elimination with <italic>α</italic> = 0.05, <italic>α</italic> = 0.5, AIC, or BIC</p>
</list-item>
<list-item>
<p>Augmented backward elimination (ABE) with AIC and <italic>τ</italic> = 0.05</p>
</list-item>
<list-item>
<p>Univariable selection with <italic>α</italic> = 0.05 or <italic>α</italic> = 0.20</p>
</list-item>
<list-item>
<p>Univariable selection with <italic>α</italic> = 0.20 followed by backward elimination with <italic>α</italic> = 0.05</p>
</list-item>
<list-item>
<p>Lasso with <italic>λ</italic> tuned with 10-fold cross-validation</p>
</list-item>
<list-item>
<p>Relaxed Lasso with <italic>λ</italic> tuned with 10-fold cross-validation or BIC</p>
</list-item>
<list-item>
<p>Adaptive Lasso with <italic>λ</italic> tuned with 10-fold cross-validation</p>
</list-item>
</list>
We also consider the global model with all variables.</td>
</tr>
<tr>
<td align="left">Performance measures (Section 3.5)</td>
<td align="left">Mainly for descriptive models: <list list-type="bullet"><list-item><p>Bias and root mean squared error of the coefficients</p></list-item> <list-item><p>Coverage and width of the 95% confidence intervals for the coefficients</p></list-item> <list-item><p>Type 1 error rate/power</p></list-item> <list-item><p>False positive rate/true positive rate</p></list-item> <list-item><p>Kendall’s <italic>τ</italic><sub><italic>B</italic></sub> for variable rankings</p></list-item> <list-item><p>Selection rate of the true model, of an over-selection model, and of an under-selection model</p></list-item></list>
Mainly for prediction models:
<list list-type="bullet"><list-item><p>Local bias and local root mean squared error w.r.t. estimated vs. true linear predictor</p></list-item> <list-item><p>Root mean squared error and median absolute error of estimated vs. true linear predictor, additionally AUC for logistic regression</p></list-item> <list-item><p>Integrated calibration index (ICI)</p></list-item></list></td>
</tr>
</tbody>
</table>
</alternatives>
</table-wrap>
<sec id="sec004">
<title>3.1 Aims (A)</title>
<p>We aim to compare different variable selection methods for multivariable linear or logistic regression, with respect to their consequences for the resulting models. We consider consequences on bias and variance of the estimated regression coefficients, validity of confidence intervals for the coefficients, false inclusion or exclusion of variables, and predictive performance. We analyze the behavior of variable selection methods…</p>
<list list-type="bullet">
<list-item>
<p>… depending on sample size/EPV, with particular focus on evaluating the recommendations of Heinze et al. [<xref ref-type="bibr" rid="pone.0308543.ref001">1</xref>],</p>
</list-item>
<list-item>
<p>… depending on the <italic>R</italic><sup>2</sup> of the population model,</p>
</list-item>
<list-item>
<p>… depending on the modeling goal (description or prediction),</p>
</list-item>
<list-item>
<p>… when functional forms are misspecified (i.e., when fitting models assuming linear functional forms of continuous predictors even though the true functional forms are nonlinear),</p>
</list-item>
<list-item>
<p>… when switching from our realistic scenario that mimics cardiovascular data to simplified scenarios (i.e., all variables are normally distributed and/or uncorrelated).</p>
</list-item>
</list>
</sec>
<sec id="sec005">
<title>3.2 Data-generating mechanisms (D)</title>
<sec id="sec006">
<title>3.2.1 Simulation of independent variables (predictors and noise variables)</title>
<p>We simulate 20 independent variables: 10 true predictors (from now on just called “predictors”) and 10 noise variables. The correlation structure and distributions are based on real-world data from the 2013–2014 and 2015–2016 cycles of the National Health and Nutrition Examination Survey (NHANES) [<xref ref-type="bibr" rid="pone.0308543.ref021">21</xref>]. To choose suitable variables in the NHANES data, we drew inspiration from a regression model reported by Sheppard et al. [<xref ref-type="bibr" rid="pone.0308543.ref022">22</xref>] for predicting the difference between diastolic blood pressure readings as measured ambulatory/at home versus in the clinic. The variables are described in detail in <xref ref-type="supplementary-material" rid="pone.0308543.s004">S1 Appendix</xref>. The correlation matrix Σ for the simulation is based on the empirical correlation matrix of the variables. For better interpretability, we set correlations below 0.15 to zero and round all values to the closest multiple of 0.05 (see <xref ref-type="supplementary-material" rid="pone.0308543.s001">S1 Fig</xref> and <xref ref-type="supplementary-material" rid="pone.0308543.s005">S1 Table</xref> for the resulting correlation matrix).</p>
<p>To obtain distributions from the NHANES data, we fit Bernoulli distributions for the binary variables, and normal distributions, log-normal distributions, or approximations of the empirical cumulative distribution function (CDF) for the continuous variables. For each continuous variable, we truncate its distribution with the minimum of the variable in the NHANES data as the lower bound and the maximum as the upper bound. The resulting distributions are as follows (see also <xref ref-type="fig" rid="pone.0308543.g001">Fig 1</xref>):</p>
<list list-type="bullet">
<list-item>
<p>predictors: <italic>X</italic><sub>1</sub> (log-normal), <italic>X</italic><sub>2</sub> (continuous with approximated CDF), <italic>X</italic><sub>3</sub> (log-normal), <italic>X</italic><sub>4</sub> (binary, <italic>p</italic> = 0.50), <italic>X</italic><sub>5</sub> (normal), <italic>X</italic><sub>6</sub> (binary, <italic>p</italic> = 0.29), <italic>X</italic><sub>7</sub> (log-normal), <italic>X</italic><sub>8</sub> (log-normal), <italic>X</italic><sub>9</sub> (normal), <italic>X</italic><sub>10</sub> (binary, <italic>p</italic> = 0.11)</p>
</list-item>
<list-item>
<p>noise variables: <italic>X</italic><sub>11</sub> (log-normal), <italic>X</italic><sub>12</sub> (normal), <italic>X</italic><sub>13</sub> (log-normal), <italic>X</italic><sub>14</sub> (binary, <italic>p</italic> = 0.61), <italic>X</italic><sub>15</sub> (normal), <italic>X</italic><sub>16</sub> (binary, <italic>p</italic> = 0.20), <italic>X</italic><sub>17</sub> (log-normal), <italic>X</italic><sub>18</sub> (normal), <italic>X</italic><sub>19</sub> (normal), <italic>X</italic><sub>20</sub> (binary, <italic>p</italic> = 0.20)</p>
</list-item>
</list>
<fig id="pone.0308543.g001" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0308543.g001</object-id>
<label>Fig 1</label>
<caption>
<title>Distributions and pre-specified standardized regression coefficients of predictors and noise variables.</title>
<p>Predictors are ordered by absolute values of standardized regression coefficients. Histograms are based on a large simulated dataset (<italic>n</italic> = 100,000).</p>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0308543.g001" xlink:type="simple"/>
</fig>
<p>The distributions, together with the correlation matrix Σ, are then used as input for the normal-to-anything (NORTA) method for simulation [<xref ref-type="bibr" rid="pone.0308543.ref029">29</xref>, <xref ref-type="bibr" rid="pone.0308543.ref030">30</xref>].</p>
</sec>
<sec id="sec007">
<title>3.2.2 Choice of regression coefficients</title>
<p>For choosing the standardized regression coefficients of the predictors <italic>X</italic><sub>1</sub>, …, <italic>X</italic><sub>10</sub>, we drew inspiration from the coefficients reported for the regression model of Sheppard et al. [<xref ref-type="bibr" rid="pone.0308543.ref022">22</xref>]:
<disp-formula id="pone.0308543.e007"><alternatives><graphic id="pone.0308543.e007g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e007" xlink:type="simple"/><mml:math display="block" id="M7"><mml:mrow><mml:mo>(</mml:mo> <mml:msubsup><mml:mi>β</mml:mi> <mml:mn>1</mml:mn> <mml:mrow><mml:mi>s</mml:mi> <mml:mi>d</mml:mi></mml:mrow></mml:msubsup> <mml:mo>,</mml:mo> <mml:mo>…</mml:mo> <mml:mo>,</mml:mo> <mml:msubsup><mml:mi>β</mml:mi> <mml:mrow><mml:mn>10</mml:mn></mml:mrow> <mml:mrow><mml:mi>s</mml:mi> <mml:mi>d</mml:mi></mml:mrow></mml:msubsup><mml:msup><mml:mo>)</mml:mo> <mml:mi>t</mml:mi></mml:msup> <mml:mo>=</mml:mo> <mml:mrow><mml:mo>(</mml:mo> <mml:mn>1</mml:mn> <mml:mo>.</mml:mo> <mml:mn>5</mml:mn> <mml:mo>,</mml:mo> <mml:mo>-</mml:mo> <mml:mn>1</mml:mn> <mml:mo>,</mml:mo> <mml:mn>1</mml:mn> <mml:mo>,</mml:mo> <mml:mn>0</mml:mn> <mml:mo>.</mml:mo> <mml:mn>75</mml:mn> <mml:mo>,</mml:mo> <mml:mn>0</mml:mn> <mml:mo>.</mml:mo> <mml:mn>5</mml:mn> <mml:mo>,</mml:mo> <mml:mn>0</mml:mn> <mml:mo>.</mml:mo> <mml:mn>5</mml:mn> <mml:mo>,</mml:mo> <mml:mn>0</mml:mn> <mml:mo>.</mml:mo> <mml:mn>5</mml:mn> <mml:mo>,</mml:mo> <mml:mo>-</mml:mo> <mml:mn>0</mml:mn> <mml:mo>.</mml:mo> <mml:mn>5</mml:mn> <mml:mo>,</mml:mo> <mml:mo>-</mml:mo> <mml:mn>0</mml:mn> <mml:mo>.</mml:mo> <mml:mn>25</mml:mn> <mml:mo>,</mml:mo> <mml:mo>-</mml:mo> <mml:mn>0</mml:mn> <mml:mo>.</mml:mo> <mml:mn>25</mml:mn> <mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:math></alternatives></disp-formula></p>
<p>This choice reflects a mixture of stronger and weaker effects, a situation typical for many applications in biology and medicine. We would expect different behaviors of the predictors during variable selection depending on their effects.</p>
<p>The standardized coefficients <inline-formula id="pone.0308543.e008"><alternatives><graphic id="pone.0308543.e008g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e008" xlink:type="simple"/><mml:math display="inline" id="M8"><mml:msubsup><mml:mi>β</mml:mi> <mml:mi>j</mml:mi> <mml:mrow><mml:mi>s</mml:mi> <mml:mi>d</mml:mi></mml:mrow></mml:msubsup></mml:math></alternatives></inline-formula> are transformed into non-standardized coefficients <italic>β</italic><sub><italic>j</italic></sub> as follows: standard deviations (SDs) of the variables are calculated based on a single large simulated dataset <italic>D</italic><sub><italic>P</italic></sub> to approximate the population (<italic>n</italic> = 100, 000) and the standardized coefficients are divided by these SDs.</p>
<p>The regression coefficients for the noise variables <italic>X</italic><sub>11</sub>, …, <italic>X</italic><sub>20</sub> are set to zero.</p>
<p>As intended, there is no systematic association between the absolute values <inline-formula id="pone.0308543.e009"><alternatives><graphic id="pone.0308543.e009g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e009" xlink:type="simple"/><mml:math display="inline" id="M9"><mml:mrow><mml:mrow><mml:mo>|</mml:mo></mml:mrow> <mml:msubsup><mml:mi>β</mml:mi> <mml:mi>j</mml:mi> <mml:mrow><mml:mi>s</mml:mi> <mml:mi>d</mml:mi></mml:mrow></mml:msubsup> <mml:mrow><mml:mo>|</mml:mo></mml:mrow></mml:mrow></mml:math></alternatives></inline-formula> and the coefficients of determination <inline-formula id="pone.0308543.e010"><alternatives><graphic id="pone.0308543.e010g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e010" xlink:type="simple"/><mml:math display="inline" id="M10"><mml:msubsup><mml:mi>R</mml:mi> <mml:mi>j</mml:mi> <mml:mn>2</mml:mn></mml:msubsup></mml:math></alternatives></inline-formula> for the regression of each variable <italic>X</italic><sub><italic>j</italic></sub> on all other respective variables <italic>X</italic><sub><italic>l</italic></sub>, <italic>l</italic> = 1, …, 20, <italic>l</italic> ≠ <italic>j</italic> (see <xref ref-type="supplementary-material" rid="pone.0308543.s002">S2 Fig</xref>). Moreover, five out of ten predictors have univariable effects that are larger than their multivariable effects. (Here, multivariable effects are obtained by fitting a model with all predictors and noise variables).</p>
</sec>
<sec id="sec008">
<title>3.2.3 Simulation of outcome <italic>Y</italic></title>
<p>The outcome <italic>Y</italic> is simulated as follows:</p>
<list list-type="bullet">
<list-item>
<p>For linear regression: <italic>Y</italic> = <bold><italic>xβ</italic></bold> + <italic>ϵ</italic>, with <italic>ϵ</italic> ∼ <italic>N</italic>(0, <italic>σ</italic><sup>2</sup>), and <italic>σ</italic><sup>2</sup> chosen such that <italic>R</italic><sup>2</sup> = 0.45 (setting 1, main scenario), <italic>R</italic><sup>2</sup> = 0.15 (setting 2, low <italic>R</italic><sup>2</sup> scenario), or <italic>R</italic><sup>2</sup> = 0.7 (setting 3, high <italic>R</italic><sup>2</sup> scenario). The intercept <italic>β</italic><sub>0</sub> is set to 36. The vector <bold><italic>x</italic></bold> = (1, <italic>x</italic><sub>1</sub>, …, <italic>x</italic><sub>10</sub>) denotes a simulated realization of the variables <italic>X</italic><sub>1</sub>, …, <italic>X</italic><sub>10</sub> with an added constant for the intercept.</p>
<p>The required <italic>σ</italic><sup>2</sup> values for obtaining <italic>R</italic><sup>2</sup> = 0.45, <italic>R</italic><sup>2</sup> = 0.15, or <italic>R</italic><sup>2</sup> = 0.7 can be calculated as follows [<xref ref-type="bibr" rid="pone.0308543.ref009">9</xref>]:
<disp-formula id="pone.0308543.e011"><alternatives><graphic id="pone.0308543.e011g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e011" xlink:type="simple"/><mml:math display="block" id="M11"><mml:mrow><mml:msup><mml:mi>σ</mml:mi> <mml:mn>2</mml:mn></mml:msup> <mml:mo>=</mml:mo> <mml:mi>V</mml:mi> <mml:mi>a</mml:mi> <mml:mi>r</mml:mi> <mml:mrow><mml:mo>(</mml:mo> <mml:msub><mml:mi mathvariant="bold-italic">X</mml:mi> <mml:mi>P</mml:mi></mml:msub> <mml:mi mathvariant="bold-italic">β</mml:mi> <mml:mo>)</mml:mo></mml:mrow> <mml:mfrac><mml:mrow><mml:mn>1</mml:mn> <mml:mo>-</mml:mo> <mml:msup><mml:mi>R</mml:mi> <mml:mn>2</mml:mn></mml:msup></mml:mrow> <mml:msup><mml:mi>R</mml:mi> <mml:mn>2</mml:mn></mml:msup></mml:mfrac> <mml:mo>,</mml:mo></mml:mrow></mml:math></alternatives></disp-formula>
where <italic>Var</italic>(<bold><italic>X</italic></bold><sub><italic>P</italic></sub><bold><italic>β</italic></bold>) is calculated with the design matrix <bold><italic>X</italic></bold><sub><italic>P</italic></sub> obtained from the approximate “population dataset” <italic>D</italic><sub><italic>P</italic></sub>.</p>
</list-item>
<list-item>
<p>For logistic regression: outcomes <italic>Y</italic> are drawn from a Bernoulli distribution with event probability
<disp-formula id="pone.0308543.e012"><alternatives><graphic id="pone.0308543.e012g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e012" xlink:type="simple"/><mml:math display="block" id="M12"><mml:mrow><mml:mfrac><mml:mn>1</mml:mn> <mml:mrow><mml:mn>1</mml:mn> <mml:mo>+</mml:mo> <mml:mtext>exp</mml:mtext> <mml:mo>(</mml:mo> <mml:mo>-</mml:mo> <mml:mi>c</mml:mi> <mml:mi mathvariant="bold-italic">x</mml:mi> <mml:mi mathvariant="bold-italic">β</mml:mi> <mml:mo>)</mml:mo></mml:mrow></mml:mfrac> <mml:mo>,</mml:mo></mml:mrow></mml:math></alternatives></disp-formula>
with a constant <italic>c</italic> &gt; 0.</p>
<p>First, we set <italic>c</italic> = 1 and adjust the intercept <italic>β</italic><sub>0</sub> manually such that the overall expected event probability equals either 0.3 or 0.05. The resulting Cox-Snell <inline-formula id="pone.0308543.e013"><alternatives><graphic id="pone.0308543.e013g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e013" xlink:type="simple"/><mml:math display="inline" id="M13"><mml:msubsup><mml:mi>R</mml:mi> <mml:mrow><mml:mi>C</mml:mi> <mml:mi>S</mml:mi></mml:mrow> <mml:mn>2</mml:mn></mml:msubsup></mml:math></alternatives></inline-formula> values are 0.40 for event rate 0.3 and 0.16 for event rate 0.05. These values constitute the main settings for logistic regression, but note that they are different from the <italic>R</italic><sup>2</sup> value of 0.45 in the main setting for linear regression (setting 1). Because setting 2 for linear regression considers 1/3 of the <italic>R</italic><sup>2</sup> value in setting 1, we add analogous “low <inline-formula id="pone.0308543.e014"><alternatives><graphic id="pone.0308543.e014g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e014" xlink:type="simple"/><mml:math display="inline" id="M14"><mml:msubsup><mml:mi>R</mml:mi> <mml:mrow><mml:mi>C</mml:mi> <mml:mi>S</mml:mi></mml:mrow> <mml:mn>2</mml:mn></mml:msubsup></mml:math></alternatives></inline-formula> settings” for logistic regression: <italic>c</italic> and <italic>β</italic><sub>0</sub> are adjusted to obtain 1/3 of the original <inline-formula id="pone.0308543.e015"><alternatives><graphic id="pone.0308543.e015g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e015" xlink:type="simple"/><mml:math display="inline" id="M15"><mml:msubsup><mml:mi>R</mml:mi> <mml:mrow><mml:mi>C</mml:mi> <mml:mi>S</mml:mi></mml:mrow> <mml:mn>2</mml:mn></mml:msubsup></mml:math></alternatives></inline-formula> values (0.40/3 = 0.13 for event rate 0.3, and 
0.16/3 = 0.05 for event rate 0.05). In contrast to linear regression (setting 3), we do not include an additional high <italic>R</italic><sup>2</sup> setting: the maximum Cox-Snell <italic>R</italic><sup>2</sup> values that are possible in theory are less than 1 (for event rate 0.3: approx. 0.71, for event rate 0.05: approx. 0.33), thus the <italic>R</italic><sup>2</sup> values in the main settings can already be considered relatively high.</p>
<p>In summary, this yields the following settings 4–7, for which we also estimated the corresponding population areas under the receiver operating characteristic curve (AUC) based on the “population dataset” <italic>D</italic><sub><italic>P</italic></sub>:
<list list-type="bullet"><list-item><p>the overall expected event probability equals 0.3 with <inline-formula id="pone.0308543.e016"><alternatives><graphic id="pone.0308543.e016g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e016" xlink:type="simple"/><mml:math display="inline" id="M16"><mml:mrow><mml:msubsup><mml:mi>R</mml:mi> <mml:mrow><mml:mi>C</mml:mi> <mml:mi>S</mml:mi></mml:mrow> <mml:mn>2</mml:mn></mml:msubsup> <mml:mo>=</mml:mo> <mml:mn>0</mml:mn> <mml:mo>.</mml:mo> <mml:mn>40</mml:mn></mml:mrow></mml:math></alternatives></inline-formula>, AUC = 0.90 (setting 4) or <inline-formula id="pone.0308543.e017"><alternatives><graphic id="pone.0308543.e017g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e017" xlink:type="simple"/><mml:math display="inline" id="M17"><mml:mrow><mml:msubsup><mml:mi>R</mml:mi> <mml:mrow><mml:mi>C</mml:mi> <mml:mi>S</mml:mi></mml:mrow> <mml:mn>2</mml:mn></mml:msubsup> <mml:mo>=</mml:mo> <mml:mn>0</mml:mn> <mml:mo>.</mml:mo> <mml:mn>13</mml:mn></mml:mrow></mml:math></alternatives></inline-formula>, AUC = 0.73 (setting 5),</p></list-item> <list-item><p>the overall expected event probability equals 0.05 with <inline-formula id="pone.0308543.e018"><alternatives><graphic id="pone.0308543.e018g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e018" xlink:type="simple"/><mml:math display="inline" id="M18"><mml:mrow><mml:msubsup><mml:mi>R</mml:mi> <mml:mrow><mml:mi>C</mml:mi> <mml:mi>S</mml:mi></mml:mrow> <mml:mn>2</mml:mn></mml:msubsup> <mml:mo>=</mml:mo> <mml:mn>0</mml:mn> <mml:mo>.</mml:mo> <mml:mn>16</mml:mn></mml:mrow></mml:math></alternatives></inline-formula>, AUC = 0.94 (setting 6) or <inline-formula id="pone.0308543.e019"><alternatives><graphic id="pone.0308543.e019g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e019" xlink:type="simple"/><mml:math display="inline" 
id="M19"><mml:mrow><mml:msubsup><mml:mi>R</mml:mi> <mml:mrow><mml:mi>C</mml:mi> <mml:mi>S</mml:mi></mml:mrow> <mml:mn>2</mml:mn></mml:msubsup> <mml:mo>=</mml:mo> <mml:mn>0</mml:mn> <mml:mo>.</mml:mo> <mml:mn>05</mml:mn></mml:mrow></mml:math></alternatives></inline-formula>, AUC = 0.78 (setting 7).</p></list-item></list></p>
</list-item>
</list>
</sec>
<sec id="sec009">
<title>3.2.4 Nonlinear functional forms</title>
<p>So far, we assumed that the functional forms of the effects of continuous predictors on <italic>Y</italic> are linear. In applied studies in biology and medicine, the actual functional forms of such variables might often be nonlinear, but researchers nonetheless fit a model with linear functional forms, e.g., because they are not aware that some functional forms might be nonlinear, or because they prefer a simpler model. To analyze the behavior of variable selection methods in this scenario, we include settings 1b–7b (corresponding to settings 1–7) where all predictors have <italic>nonlinear</italic> functional forms. The models that we consider for analyzing the simulated data (linear/logistic regression) will not take the nonlinear functional forms into account and will thus be misspecified.</p>
<p>For each continuous predictor <italic>X</italic><sub><italic>j</italic></sub>, we define a function <italic>g</italic><sub><italic>j</italic></sub>(<italic>x</italic>) that describes the nonlinear functional form of the effect of the predictor on <italic>Y</italic>. We choose various functional forms: quadratic, log-quadratic, exponential and sigmoid. The functions are depicted in <xref ref-type="supplementary-material" rid="pone.0308543.s003">S3 Fig</xref>; exact definitions are given in <xref ref-type="supplementary-material" rid="pone.0308543.s004">S1 Appendix</xref>.</p>
<p>The nonlinear composite predictor is then simulated as
<disp-formula id="pone.0308543.e020"><alternatives><graphic id="pone.0308543.e020g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e020" xlink:type="simple"/><mml:math display="block" id="M20"><mml:mrow><mml:msubsup><mml:mi>β</mml:mi> <mml:mn>0</mml:mn> <mml:mrow><mml:mo>(</mml:mo> <mml:mi>g</mml:mi> <mml:mo>)</mml:mo></mml:mrow></mml:msubsup> <mml:mo>+</mml:mo> <mml:msubsup><mml:mi>β</mml:mi> <mml:mn>1</mml:mn> <mml:mrow><mml:mo>(</mml:mo> <mml:mi>g</mml:mi> <mml:mo>)</mml:mo></mml:mrow></mml:msubsup> <mml:msub><mml:mi>g</mml:mi> <mml:mn>1</mml:mn></mml:msub> <mml:mrow><mml:mo>(</mml:mo> <mml:msub><mml:mi>X</mml:mi> <mml:mn>1</mml:mn></mml:msub> <mml:mo>)</mml:mo></mml:mrow> <mml:mo>+</mml:mo> <mml:mo>…</mml:mo> <mml:mo>+</mml:mo> <mml:msubsup><mml:mi>β</mml:mi> <mml:mn>10</mml:mn> <mml:mrow><mml:mo>(</mml:mo> <mml:mi>g</mml:mi> <mml:mo>)</mml:mo></mml:mrow></mml:msubsup> <mml:msub><mml:mi>g</mml:mi> <mml:mn>10</mml:mn></mml:msub> <mml:mrow><mml:mo>(</mml:mo> <mml:msub><mml:mi>X</mml:mi> <mml:mn>10</mml:mn></mml:msub> <mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:math></alternatives></disp-formula></p>
<p>Here, the coefficients <inline-formula id="pone.0308543.e021"><alternatives><graphic id="pone.0308543.e021g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e021" xlink:type="simple"/><mml:math display="inline" id="M21"><mml:msubsup><mml:mi>β</mml:mi> <mml:mi>j</mml:mi> <mml:mrow><mml:mo>(</mml:mo> <mml:mi>g</mml:mi> <mml:mo>)</mml:mo></mml:mrow></mml:msubsup></mml:math></alternatives></inline-formula> (the letter <italic>g</italic> alludes to the nonlinear transformations <italic>g</italic><sub><italic>j</italic></sub>) are chosen for continuous predictors such that
<disp-formula id="pone.0308543.e022"><alternatives><graphic id="pone.0308543.e022g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e022" xlink:type="simple"/><mml:math display="block" id="M22"><mml:mrow><mml:msubsup><mml:mi>β</mml:mi> <mml:mi>j</mml:mi> <mml:mrow><mml:mo>(</mml:mo> <mml:mi>g</mml:mi> <mml:mo>)</mml:mo></mml:mrow></mml:msubsup> <mml:mi>S</mml:mi> <mml:mi>D</mml:mi> <mml:mrow><mml:mo>(</mml:mo> <mml:msub><mml:mi>g</mml:mi> <mml:mi>j</mml:mi></mml:msub> <mml:mrow><mml:mo>(</mml:mo> <mml:msub><mml:mi>X</mml:mi> <mml:mi>j</mml:mi></mml:msub> <mml:mo>)</mml:mo></mml:mrow> <mml:mo>)</mml:mo></mml:mrow> <mml:mover accent="true"><mml:mo>=</mml:mo> <mml:mo>!</mml:mo></mml:mover> <mml:mrow><mml:mo>|</mml:mo></mml:mrow> <mml:msubsup><mml:mi>β</mml:mi> <mml:mi>j</mml:mi> <mml:mrow><mml:mi>s</mml:mi> <mml:mi>d</mml:mi></mml:mrow></mml:msubsup> <mml:mrow><mml:mo>|</mml:mo> <mml:mo>=</mml:mo> <mml:mo>|</mml:mo></mml:mrow> <mml:msub><mml:mi>β</mml:mi> <mml:mi>j</mml:mi></mml:msub> <mml:mrow><mml:mo>|</mml:mo> <mml:mi>S</mml:mi> <mml:mi>D</mml:mi></mml:mrow> <mml:mrow><mml:mo>(</mml:mo> <mml:msub><mml:mi>X</mml:mi> <mml:mi>j</mml:mi></mml:msub> <mml:mo>)</mml:mo></mml:mrow> <mml:mo>,</mml:mo></mml:mrow></mml:math></alternatives></disp-formula>
to obtain effects that are comparable in magnitude to the linear effects. The standard deviations <italic>SD</italic>(<italic>X</italic><sub><italic>j</italic></sub>), <italic>SD</italic>(<italic>g</italic><sub><italic>j</italic></sub>(<italic>X</italic><sub><italic>j</italic></sub>)) are calculated based on the approximate “population dataset” <italic>D</italic><sub><italic>P</italic></sub>.</p>
<p>After determining <bold><italic>β</italic></bold><sup>(<italic>g</italic>)</sup>, the outcome <italic>Y</italic> is simulated as previously described in Section 3.2.3, with <bold><italic>xβ</italic></bold> replaced by the nonlinear composite predictor.</p>
<p>The simulation settings 1–7 with linear effects and settings 1b–7b with nonlinear effects are summarized in <xref ref-type="table" rid="pone.0308543.t003">Table 3</xref>. For settings 1b–7b, the <italic>R</italic><sup>2</sup> values hold for the true model with nonlinear functional forms; the achieved <italic>R</italic><sup>2</sup> values when functional forms are misspecified are expected to be lower. For logistic regression, using the coefficients <inline-formula id="pone.0308543.e023"><alternatives><graphic id="pone.0308543.e023g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e023" xlink:type="simple"/><mml:math display="inline" id="M23"><mml:msubsup><mml:mi>β</mml:mi> <mml:mi>j</mml:mi> <mml:mrow><mml:mo>(</mml:mo> <mml:mi>g</mml:mi> <mml:mo>)</mml:mo></mml:mrow></mml:msubsup></mml:math></alternatives></inline-formula> for nonlinear effects in the main scenario yields slightly larger Cox-Snell <inline-formula id="pone.0308543.e024"><alternatives><graphic id="pone.0308543.e024g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e024" xlink:type="simple"/><mml:math display="inline" id="M24"><mml:msubsup><mml:mi>R</mml:mi> <mml:mrow><mml:mi>C</mml:mi> <mml:mi>S</mml:mi></mml:mrow> <mml:mn>2</mml:mn></mml:msubsup></mml:math></alternatives></inline-formula> values compared to the analogous settings with linear effects: 0.43 for event rate 0.3 and 0.20 for event rate 0.05. 
For the low <inline-formula id="pone.0308543.e025"><alternatives><graphic id="pone.0308543.e025g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e025" xlink:type="simple"/><mml:math display="inline" id="M25"><mml:msubsup><mml:mi>R</mml:mi> <mml:mrow><mml:mi>C</mml:mi> <mml:mi>S</mml:mi></mml:mrow> <mml:mn>2</mml:mn></mml:msubsup></mml:math></alternatives></inline-formula> scenario, <italic>c</italic> and <inline-formula id="pone.0308543.e026"><alternatives><graphic id="pone.0308543.e026g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e026" xlink:type="simple"/><mml:math display="inline" id="M26"><mml:msubsup><mml:mi>β</mml:mi> <mml:mn>0</mml:mn> <mml:mrow><mml:mo>(</mml:mo> <mml:mi>g</mml:mi> <mml:mo>)</mml:mo></mml:mrow></mml:msubsup></mml:math></alternatives></inline-formula> are adjusted to obtain 1/3 of the <inline-formula id="pone.0308543.e027"><alternatives><graphic id="pone.0308543.e027g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e027" xlink:type="simple"/><mml:math display="inline" id="M27"><mml:msubsup><mml:mi>R</mml:mi> <mml:mrow><mml:mi>C</mml:mi> <mml:mi>S</mml:mi></mml:mrow> <mml:mn>2</mml:mn></mml:msubsup></mml:math></alternatives></inline-formula> values (0.43/3 = 0.14 for event rate 0.3, and 0.20/3 = 0.07 for event rate 0.05), analogously to the procedure for settings with linear effects.</p>
<table-wrap id="pone.0308543.t003" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0308543.t003</object-id>
<label>Table 3</label>
<caption>
<title>Overview of simulation settings.</title>
</caption>
<alternatives>
<graphic id="pone.0308543.t003g" mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0308543.t003" xlink:type="simple"/>
<table border="0" frame="box" rules="all">
<colgroup>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
</colgroup>
<thead>
<tr>
<th align="left" rowspan="2"/>
<th align="center" rowspan="2">linear regression</th>
<th align="center" colspan="2">logistic regression</th>
</tr>
<tr>
<th align="center">event rate 0.3</th>
<th align="center">event rate 0.05</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left">main scenario…</td>
<td align="center"/>
<td align="center"/>
<td align="center"/>
</tr>
<tr>
<td align="left" rowspan="2">…with linear effects</td>
<td align="center">setting 1</td>
<td align="center">setting 4</td>
<td align="center">setting 6</td>
</tr>
<tr>
<td align="center"><italic>R</italic><sup>2</sup> = 0.45</td>
<td align="center">
<inline-formula id="pone.0308543.e028">
<alternatives>
<graphic id="pone.0308543.e028g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e028" xlink:type="simple"/>
<mml:math display="inline" id="M28">
<mml:mrow>
<mml:msubsup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>.</mml:mo>
<mml:mn>40</mml:mn>
</mml:mrow>
</mml:math>
</alternatives>
</inline-formula>
</td>
<td align="center">
<inline-formula id="pone.0308543.e029">
<alternatives>
<graphic id="pone.0308543.e029g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e029" xlink:type="simple"/>
<mml:math display="inline" id="M29">
<mml:mrow>
<mml:msubsup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>.</mml:mo>
<mml:mn>16</mml:mn>
</mml:mrow>
</mml:math>
</alternatives>
</inline-formula>
</td>
</tr>
<tr>
<td align="left" rowspan="2">…with nonlinear effects</td>
<td align="center">setting 1b</td>
<td align="center">setting 4b</td>
<td align="center">setting 6b</td>
</tr>
<tr>
<td align="center"><italic>R</italic><sup>2</sup> = 0.45</td>
<td align="center">
<inline-formula id="pone.0308543.e030">
<alternatives>
<graphic id="pone.0308543.e030g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e030" xlink:type="simple"/>
<mml:math display="inline" id="M30">
<mml:mrow>
<mml:msubsup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>.</mml:mo>
<mml:mn>43</mml:mn>
</mml:mrow>
</mml:math>
</alternatives>
</inline-formula>
</td>
<td align="center">
<inline-formula id="pone.0308543.e031">
<alternatives>
<graphic id="pone.0308543.e031g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e031" xlink:type="simple"/>
<mml:math display="inline" id="M31">
<mml:mrow>
<mml:msubsup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>.</mml:mo>
<mml:mn>20</mml:mn>
</mml:mrow>
</mml:math>
</alternatives>
</inline-formula>
</td>
</tr>
<tr>
<td align="left">low <italic>R</italic><sup>2</sup> scenario…</td>
<td align="center"/>
<td align="center"/>
<td align="center"/>
</tr>
<tr>
<td align="left" rowspan="2">…with linear effects</td>
<td align="center">setting 2</td>
<td align="center">setting 5</td>
<td align="center">setting 7</td>
</tr>
<tr>
<td align="center"><italic>R</italic><sup>2</sup> = 0.15</td>
<td align="center">
<inline-formula id="pone.0308543.e032">
<alternatives>
<graphic id="pone.0308543.e032g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e032" xlink:type="simple"/>
<mml:math display="inline" id="M32">
<mml:mrow>
<mml:msubsup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>.</mml:mo>
<mml:mn>13</mml:mn>
</mml:mrow>
</mml:math>
</alternatives>
</inline-formula>
</td>
<td align="center">
<inline-formula id="pone.0308543.e033">
<alternatives>
<graphic id="pone.0308543.e033g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e033" xlink:type="simple"/>
<mml:math display="inline" id="M33">
<mml:mrow>
<mml:msubsup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>.</mml:mo>
<mml:mn>05</mml:mn>
</mml:mrow>
</mml:math>
</alternatives>
</inline-formula>
</td>
</tr>
<tr>
<td align="left" rowspan="2">…with nonlinear effects</td>
<td align="center">setting 2b</td>
<td align="center">setting 5b</td>
<td align="center">setting 7b</td>
</tr>
<tr>
<td align="center"><italic>R</italic><sup>2</sup> = 0.15</td>
<td align="center">
<inline-formula id="pone.0308543.e034">
<alternatives>
<graphic id="pone.0308543.e034g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e034" xlink:type="simple"/>
<mml:math display="inline" id="M34">
<mml:mrow>
<mml:msubsup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>.</mml:mo>
<mml:mn>14</mml:mn>
</mml:mrow>
</mml:math>
</alternatives>
</inline-formula>
</td>
<td align="center">
<inline-formula id="pone.0308543.e035">
<alternatives>
<graphic id="pone.0308543.e035g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e035" xlink:type="simple"/>
<mml:math display="inline" id="M35">
<mml:mrow>
<mml:msubsup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>.</mml:mo>
<mml:mn>07</mml:mn>
</mml:mrow>
</mml:math>
</alternatives>
</inline-formula>
</td>
</tr>
<tr>
<td align="left">high <italic>R</italic><sup>2</sup> scenario…</td>
<td align="center"/>
<td align="center"/>
<td align="center"/>
</tr>
<tr>
<td align="left" rowspan="2">…with linear effects</td>
<td align="center">setting 3</td>
<td align="center"/>
<td align="center"/>
</tr>
<tr>
<td align="center"><italic>R</italic><sup>2</sup> = 0.7</td>
<td align="center"/>
<td align="center"/>
</tr>
<tr>
<td align="left" rowspan="2">…with nonlinear effects</td>
<td align="center">setting 3b</td>
<td align="center"/>
<td align="center"/>
</tr>
<tr>
<td align="center"><italic>R</italic><sup>2</sup> = 0.7</td>
<td align="center"/>
<td align="center"/>
</tr>
</tbody>
</table>
</alternatives>
</table-wrap>
<p>For the global model in the settings with nonlinear effects, we will not only calculate the usual standard errors of the regression coefficients, but also robust standard errors [<xref ref-type="bibr" rid="pone.0308543.ref031">31</xref>], to check whether robust SEs improve the coverage of the confidence intervals. If robust SEs improve the coverage for the global model, it would be interesting to analyze whether this is also the case for models obtained by variable selection; however, combining robust standard errors with variable selection requires some further work and would go beyond the scope of the proposed study. For now, we will restrict the investigation of robust SEs to the global model for linear regression.</p>
</sec>
<sec id="sec010">
<title>3.2.5 Simplified settings</title>
<p>While our main focus is on simulating variables of various distribution types (e.g., Bernoulli, normal, and log-normal) and with correlation matrix Σ based on the empirical correlation matrix from the NHANES data (<xref ref-type="supplementary-material" rid="pone.0308543.s005">S1 Table</xref>), we are also interested in the behavior of the variable selection methods for data with simpler distribution-correlation structures. We thus consider the three following simplified scenarios:</p>
<list list-type="order">
<list-item>
<p>The variables are multivariate normal and independent: <inline-formula id="pone.0308543.e036"><alternatives><graphic id="pone.0308543.e036g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e036" xlink:type="simple"/><mml:math display="inline" id="M36"><mml:mrow><mml:mi>X</mml:mi> <mml:mo>∼</mml:mo> <mml:msub><mml:mi mathvariant="script">N</mml:mi> <mml:mn>20</mml:mn></mml:msub> <mml:mrow><mml:mo>(</mml:mo> <mml:mn mathvariant="bold">0</mml:mn> <mml:mo>,</mml:mo> <mml:msub><mml:mi>I</mml:mi> <mml:mn>20</mml:mn></mml:msub> <mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:math></alternatives></inline-formula>, with <italic>I</italic><sub>20</sub> denoting the 20 × 20 identity matrix.</p>
</list-item>
<list-item>
<p>The variables are multivariate normal and correlated: <inline-formula id="pone.0308543.e037"><alternatives><graphic id="pone.0308543.e037g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e037" xlink:type="simple"/><mml:math display="inline" id="M37"><mml:mrow><mml:mi>X</mml:mi> <mml:mo>∼</mml:mo> <mml:msub><mml:mi mathvariant="script">N</mml:mi> <mml:mn>20</mml:mn></mml:msub> <mml:mrow><mml:mo>(</mml:mo> <mml:mn mathvariant="bold">0</mml:mn> <mml:mo>,</mml:mo> <mml:mo>Σ</mml:mo> <mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:math></alternatives></inline-formula>, with Σ denoting the correlation matrix as described above.</p>
</list-item>
<list-item>
<p>The variables have the same individual distributions as described in Section 3.2.1 (<xref ref-type="fig" rid="pone.0308543.g001">Fig 1</xref>), but are not correlated.</p>
</list-item>
</list>
<p>For each of these three scenarios, we will consider the settings 1–2 and 4–7 (i.e., the main scenario and low <italic>R</italic><sup>2</sup> scenario) with <italic>linear</italic> effects. This yields 3 * 6 = 18 simplified settings. For logistic regression, using the coefficients <inline-formula id="pone.0308543.e038"><alternatives><graphic id="pone.0308543.e038g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e038" xlink:type="simple"/><mml:math display="inline" id="M38"><mml:msubsup><mml:mi>β</mml:mi> <mml:mi>j</mml:mi> <mml:mrow><mml:mi>s</mml:mi> <mml:mi>d</mml:mi></mml:mrow></mml:msubsup></mml:math></alternatives></inline-formula> from Section 3.2.2 will yield Cox-Snell <inline-formula id="pone.0308543.e039"><alternatives><graphic id="pone.0308543.e039g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e039" xlink:type="simple"/><mml:math display="inline" id="M39"><mml:msubsup><mml:mi>R</mml:mi> <mml:mrow><mml:mi>C</mml:mi> <mml:mi>S</mml:mi></mml:mrow> <mml:mn>2</mml:mn></mml:msubsup></mml:math></alternatives></inline-formula> values that are slightly different from those given in <xref ref-type="table" rid="pone.0308543.t003">Table 3</xref>.</p>
<p>Depending on the results for settings 1b-2b and 4b-7b with nonlinear effects, we might additionally consider nonlinear effects for the simplified scenario 3 (variables not multivariate normal and not correlated).</p>
</sec>
<sec id="sec011">
<title>3.2.6 Sample sizes</title>
<p>For linear regression, we consider eight different sample sizes: 100, 200, 400, 500, 800, 1600, 3200, and 6400. These sample sizes result when doubling sample size six times from 100. Additionally, the sample size 500 is included because it corresponds to EPV = 25, and this EPV value was specifically mentioned in the recommendations of Heinze et al. [<xref ref-type="bibr" rid="pone.0308543.ref001">1</xref>].</p>
<p>For logistic regression, we first choose sample sizes corresponding to EPV = 25: <italic>n</italic> = 1667 (event rate 0.3) and <italic>n</italic> = 10,000 (event rate 0.05).
The other sample sizes for logistic regression are chosen differently depending on the event rate.
For event rate 0.3, sample sizes are “aligned” to the sample sizes <inline-formula id="pone.0308543.e040"><alternatives><graphic id="pone.0308543.e040g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e040" xlink:type="simple"/><mml:math display="inline" id="M40"><mml:mrow><mml:mrow><mml:mo>{</mml:mo> <mml:msubsup><mml:mi>n</mml:mi> <mml:mn>1</mml:mn> <mml:mrow><mml:mo>(</mml:mo> <mml:mi>l</mml:mi> <mml:mi>i</mml:mi> <mml:mi>n</mml:mi> <mml:mo>)</mml:mo></mml:mrow></mml:msubsup> <mml:mo>,</mml:mo> <mml:mo>…</mml:mo> <mml:mo>,</mml:mo> <mml:msubsup><mml:mi>n</mml:mi> <mml:mn>7</mml:mn> <mml:mrow><mml:mo>(</mml:mo> <mml:mi>l</mml:mi> <mml:mi>i</mml:mi> <mml:mi>n</mml:mi> <mml:mo>)</mml:mo></mml:mrow></mml:msubsup> <mml:mo>}</mml:mo></mml:mrow> <mml:mo>=</mml:mo> <mml:mrow><mml:mo>{</mml:mo> <mml:mn>100</mml:mn> <mml:mo>,</mml:mo> <mml:mn>200</mml:mn> <mml:mo>,</mml:mo> <mml:mn>400</mml:mn> <mml:mo>,</mml:mo> <mml:mn>800</mml:mn> <mml:mo>,</mml:mo> <mml:mn>1600</mml:mn> <mml:mo>,</mml:mo> <mml:mn>3200</mml:mn> <mml:mo>,</mml:mo> <mml:mn>6400</mml:mn> <mml:mo>}</mml:mo></mml:mrow></mml:mrow></mml:math></alternatives></inline-formula> of linear regression as follows: sample sizes <inline-formula id="pone.0308543.e041"><alternatives><graphic id="pone.0308543.e041g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e041" xlink:type="simple"/><mml:math display="inline" id="M41"><mml:msubsup><mml:mi>n</mml:mi> <mml:mi>k</mml:mi> <mml:mrow><mml:mo>(</mml:mo> <mml:mi>l</mml:mi> <mml:mi>o</mml:mi> <mml:mi>g</mml:mi> <mml:mo>)</mml:mo></mml:mrow></mml:msubsup></mml:math></alternatives></inline-formula> for logistic regression are chosen such that at sample size <inline-formula id="pone.0308543.e042"><alternatives><graphic id="pone.0308543.e042g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e042" xlink:type="simple"/><mml:math display="inline" 
id="M42"><mml:mrow><mml:mi>n</mml:mi> <mml:mo>=</mml:mo> <mml:msubsup><mml:mi>n</mml:mi> <mml:mi>k</mml:mi> <mml:mrow><mml:mo>(</mml:mo> <mml:mi>l</mml:mi> <mml:mi>o</mml:mi> <mml:mi>g</mml:mi> <mml:mo>)</mml:mo></mml:mrow></mml:msubsup></mml:mrow></mml:math></alternatives></inline-formula>, the regression coefficients in the logistic regression have approximately the same standard errors as the regression coefficients in the linear regression at <inline-formula id="pone.0308543.e043"><alternatives><graphic id="pone.0308543.e043g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e043" xlink:type="simple"/><mml:math display="inline" id="M43"><mml:mrow><mml:mi>n</mml:mi> <mml:mo>=</mml:mo> <mml:msubsup><mml:mi>n</mml:mi> <mml:mi>k</mml:mi> <mml:mrow><mml:mo>(</mml:mo> <mml:mi>l</mml:mi> <mml:mi>i</mml:mi> <mml:mi>n</mml:mi> <mml:mo>)</mml:mo></mml:mrow></mml:msubsup></mml:mrow></mml:math></alternatives></inline-formula>. Our procedure for aligning the sample sizes is described in detail in <xref ref-type="supplementary-material" rid="pone.0308543.s004">S1 Appendix</xref>.</p>
<p>Because this procedure is unstable for small event rates, we do not use the alignment based on standard errors for event rate 0.05. Instead, we choose sample sizes corresponding to the EPV values in linear regression.</p>
<p>The resulting sample sizes are displayed in <xref ref-type="table" rid="pone.0308543.t004">Table 4</xref>. The numbers below the sample sizes indicate the corresponding EPV values. For event rate 0.05, we will first include sample sizes only up to 10,000 (EPV = 25) to save computation time. We expect the variable selection methods to behave similarly for both event rates (0.3 and 0.05). If we observe different behaviors for event rate 0.05, we will include the additional sample sizes.</p>
<table-wrap id="pone.0308543.t004" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0308543.t004</object-id>
<label>Table 4</label>
<caption>
<title>Sample sizes and EPV values for linear and logistic regression.</title>
</caption>
<alternatives>
<graphic id="pone.0308543.t004g" mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0308543.t004" xlink:type="simple"/>
<table border="0" frame="box" rules="all">
<colgroup>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
</colgroup>
<tbody>
<tr>
<td align="left" rowspan="2">linear regression</td>
<td align="left"><italic>n</italic></td>
<td align="right">100</td>
<td align="right">200</td>
<td align="right">400</td>
<td align="right"><italic>500</italic></td>
<td align="right">800</td>
<td align="right">1600</td>
<td align="right">3200</td>
<td align="right">6400</td>
</tr>
<tr>
<td align="left">EPV</td>
<td align="right">5</td>
<td align="right">10</td>
<td align="right">20</td>
<td align="right"><italic>25</italic></td>
<td align="right">40</td>
<td align="right">80</td>
<td align="right">160</td>
<td align="right">320</td>
</tr>
<tr>
<td align="left" rowspan="2">logistic regression, event rate 0.3</td>
<td align="left"><italic>n</italic></td>
<td align="right">183</td>
<td align="right">365</td>
<td align="right">730</td>
<td align="right"><italic>1667</italic></td>
<td align="right">1461</td>
<td align="right">2922</td>
<td align="right">5844</td>
<td align="right">11,687</td>
</tr>
<tr>
<td align="left">EPV</td>
<td align="right">2.75</td>
<td align="right">5.48</td>
<td align="right">10.95</td>
<td align="right"><italic>25</italic></td>
<td align="right">21.92</td>
<td align="right">43.83</td>
<td align="right">87.66</td>
<td align="right">175.31</td>
</tr>
<tr>
<td align="left" rowspan="2">logistic regression, event rate 0.05</td>
<td align="left"><italic>n</italic></td>
<td align="right">2000</td>
<td align="right">4000</td>
<td align="right">8000</td>
<td align="right"><italic>10,000</italic></td>
<td align="right">–</td>
<td align="right">–</td>
<td align="right">–</td>
<td align="right">–</td>
</tr>
<tr>
<td align="left">EPV</td>
<td align="right">5</td>
<td align="right">10</td>
<td align="right">20</td>
<td align="right"><italic>25</italic></td>
<td align="right">–</td>
<td align="right">–</td>
<td align="right">–</td>
<td align="right">–</td>
</tr>
</tbody>
</table>
</alternatives>
</table-wrap>
<p>In <xref ref-type="supplementary-material" rid="pone.0308543.s004">S1 Appendix</xref>, we additionally report expected shrinkage factors for each setting, based on sample size and <italic>R</italic><sup>2</sup> [<xref ref-type="bibr" rid="pone.0308543.ref032">32</xref>, <xref ref-type="bibr" rid="pone.0308543.ref033">33</xref>].</p>
</sec>
</sec>
<sec id="sec012">
<title>3.3 Estimands and other targets (E)</title>
<p>As estimands, we consider the true <italic>regression coefficients</italic> of the data generating models. As further targets, we are interested in <italic>model selection</italic> (e.g., whether the true model is selected) and <italic>predictive performance</italic> of the selected models.</p>
<p>For the settings with linear functional forms, the regression coefficient estimands are the coefficients <bold><italic>β</italic></bold> (respectively <italic>c</italic> <bold><italic>β</italic></bold> for logistic regression) as described in Sections 3.2.2 and 3.2.3. For the settings with nonlinear effects, we cannot take the coefficients <bold><italic>β</italic></bold><sup>(<italic>g</italic>)</sup> as defined in 3.2.4 as estimands, because our linear/logistic regression models will not take nonlinear functional forms into account and will thus be misspecified.</p>
<p>Instead, we consider two alternative versions of estimands based on two different linear approximations of the nonlinear functions. Recall that we simulate the nonlinear composite predictor as
<disp-formula id="pone.0308543.e044"><alternatives><graphic id="pone.0308543.e044g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e044" xlink:type="simple"/><mml:math display="block" id="M44"><mml:mrow><mml:msubsup><mml:mi>β</mml:mi> <mml:mn>0</mml:mn> <mml:mrow><mml:mo>(</mml:mo> <mml:mi>g</mml:mi> <mml:mo>)</mml:mo></mml:mrow></mml:msubsup> <mml:mo>+</mml:mo> <mml:msubsup><mml:mi>β</mml:mi> <mml:mn>1</mml:mn> <mml:mrow><mml:mo>(</mml:mo> <mml:mi>g</mml:mi> <mml:mo>)</mml:mo></mml:mrow></mml:msubsup> <mml:msub><mml:mi>g</mml:mi> <mml:mn>1</mml:mn></mml:msub> <mml:mrow><mml:mo>(</mml:mo> <mml:msub><mml:mi>X</mml:mi> <mml:mn>1</mml:mn></mml:msub> <mml:mo>)</mml:mo></mml:mrow> <mml:mo>+</mml:mo> <mml:mo>…</mml:mo> <mml:mo>+</mml:mo> <mml:msubsup><mml:mi>β</mml:mi> <mml:mn>10</mml:mn> <mml:mrow><mml:mo>(</mml:mo> <mml:mi>g</mml:mi> <mml:mo>)</mml:mo></mml:mrow></mml:msubsup> <mml:msub><mml:mi>g</mml:mi> <mml:mn>10</mml:mn></mml:msub> <mml:mrow><mml:mo>(</mml:mo> <mml:msub><mml:mi>X</mml:mi> <mml:mn>10</mml:mn></mml:msub> <mml:mo>)</mml:mo></mml:mrow> <mml:mo>.</mml:mo></mml:mrow></mml:math></alternatives></disp-formula></p>
<p>Our first option for the estimands is <bold><italic>β</italic></bold><sup>(<italic>proj</italic>)</sup>:
<disp-formula id="pone.0308543.e045"><alternatives><graphic id="pone.0308543.e045g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e045" xlink:type="simple"/><mml:math display="block" id="M45"><mml:mtable><mml:mtr><mml:mtd><mml:msubsup><mml:mi>β</mml:mi> <mml:mn>0</mml:mn> <mml:mrow><mml:mo>(</mml:mo> <mml:mi>g</mml:mi> <mml:mo>)</mml:mo></mml:mrow></mml:msubsup> <mml:mo>+</mml:mo> <mml:msubsup><mml:mi>β</mml:mi> <mml:mn>1</mml:mn> <mml:mrow><mml:mo>(</mml:mo> <mml:mi>g</mml:mi> <mml:mo>)</mml:mo></mml:mrow></mml:msubsup> <mml:msub><mml:mi>g</mml:mi> <mml:mn>1</mml:mn></mml:msub> <mml:mrow><mml:mo>(</mml:mo> <mml:msub><mml:mi>X</mml:mi> <mml:mn>1</mml:mn></mml:msub> <mml:mo>)</mml:mo></mml:mrow> <mml:mo>+</mml:mo> <mml:mo>…</mml:mo> <mml:mo>+</mml:mo> <mml:msubsup><mml:mi>β</mml:mi> <mml:mn>10</mml:mn> <mml:mrow><mml:mo>(</mml:mo> <mml:mi>g</mml:mi> <mml:mo>)</mml:mo></mml:mrow></mml:msubsup> <mml:msub><mml:mi>g</mml:mi> <mml:mn>10</mml:mn></mml:msub> <mml:mrow><mml:mo>(</mml:mo> <mml:msub><mml:mi>X</mml:mi> <mml:mn>10</mml:mn></mml:msub> <mml:mo>)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>≈</mml:mo> <mml:mspace width="3.33333pt"/><mml:msubsup><mml:mi>β</mml:mi> <mml:mn>0</mml:mn> <mml:mrow><mml:mo>(</mml:mo> <mml:mi>p</mml:mi> <mml:mi>r</mml:mi> <mml:mi>o</mml:mi> <mml:mi>j</mml:mi> <mml:mo>)</mml:mo></mml:mrow></mml:msubsup> <mml:mo>+</mml:mo> <mml:msubsup><mml:mi>β</mml:mi> <mml:mn>1</mml:mn> <mml:mrow><mml:mo>(</mml:mo> <mml:mi>p</mml:mi> <mml:mi>r</mml:mi> <mml:mi>o</mml:mi> <mml:mi>j</mml:mi> <mml:mo>)</mml:mo></mml:mrow></mml:msubsup> <mml:msub><mml:mi>X</mml:mi> <mml:mn>1</mml:mn></mml:msub> <mml:mo>+</mml:mo> <mml:mo>…</mml:mo> <mml:mo>+</mml:mo> <mml:msubsup><mml:mi>β</mml:mi> <mml:mn>10</mml:mn> <mml:mrow><mml:mo>(</mml:mo> <mml:mi>p</mml:mi> <mml:mi>r</mml:mi> <mml:mi>o</mml:mi> <mml:mi>j</mml:mi> <mml:mo>)</mml:mo></mml:mrow></mml:msubsup> <mml:msub><mml:mi>X</mml:mi> 
<mml:mn>10</mml:mn></mml:msub> <mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></alternatives></disp-formula>
where <bold><italic>β</italic></bold><sup>(<italic>proj</italic>)</sup> are the coefficients obtained by projecting the true model with nonlinear functional forms onto one with linear functional forms. This projection is approximated by using the dataset <italic>D</italic><sub><italic>P</italic></sub> as a “surrogate” for the population and fitting a linear/logistic regression model with linear functional forms to the nonlinear composite predictor (for linear regression) or to the outcome <italic>Y</italic> that was simulated based on nonlinear functional forms (for logistic regression).</p>
<p>As the second option, we consider <bold><italic>β</italic></bold><sup>(<italic>AS</italic>)</sup>:
<disp-formula id="pone.0308543.e046"><alternatives><graphic id="pone.0308543.e046g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e046" xlink:type="simple"/><mml:math display="block" id="M46"><mml:mtable><mml:mtr><mml:mtd><mml:msubsup><mml:mi>β</mml:mi> <mml:mn>0</mml:mn> <mml:mrow><mml:mo>(</mml:mo> <mml:mi>g</mml:mi> <mml:mo>)</mml:mo></mml:mrow></mml:msubsup> <mml:mo>+</mml:mo> <mml:msubsup><mml:mi>β</mml:mi> <mml:mn>1</mml:mn> <mml:mrow><mml:mo>(</mml:mo> <mml:mi>g</mml:mi> <mml:mo>)</mml:mo></mml:mrow></mml:msubsup> <mml:msub><mml:mi>g</mml:mi> <mml:mn>1</mml:mn></mml:msub> <mml:mrow><mml:mo>(</mml:mo> <mml:msub><mml:mi>X</mml:mi> <mml:mn>1</mml:mn></mml:msub> <mml:mo>)</mml:mo></mml:mrow> <mml:mo>+</mml:mo> <mml:mo>…</mml:mo> <mml:mo>+</mml:mo> <mml:msubsup><mml:mi>β</mml:mi> <mml:mn>10</mml:mn> <mml:mrow><mml:mo>(</mml:mo> <mml:mi>g</mml:mi> <mml:mo>)</mml:mo></mml:mrow></mml:msubsup> <mml:msub><mml:mi>g</mml:mi> <mml:mn>10</mml:mn></mml:msub> <mml:mrow><mml:mo>(</mml:mo> <mml:msub><mml:mi>X</mml:mi> <mml:mn>10</mml:mn></mml:msub> <mml:mo>)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>≈</mml:mo> <mml:mspace width="3.33333pt"/><mml:msubsup><mml:mi>β</mml:mi> <mml:mn>0</mml:mn> <mml:mrow><mml:mo>(</mml:mo> <mml:mi>A</mml:mi> <mml:mi>S</mml:mi> <mml:mo>)</mml:mo></mml:mrow></mml:msubsup> <mml:mo>+</mml:mo> <mml:msubsup><mml:mi>β</mml:mi> <mml:mn>1</mml:mn> <mml:mrow><mml:mo>(</mml:mo> <mml:mi>A</mml:mi> <mml:mi>S</mml:mi> <mml:mo>)</mml:mo></mml:mrow></mml:msubsup> <mml:msub><mml:mi>X</mml:mi> <mml:mn>1</mml:mn></mml:msub> <mml:mo>+</mml:mo> <mml:mo>…</mml:mo> <mml:mo>+</mml:mo> <mml:msubsup><mml:mi>β</mml:mi> <mml:mn>10</mml:mn> <mml:mrow><mml:mo>(</mml:mo> <mml:mi>A</mml:mi> <mml:mi>S</mml:mi> <mml:mo>)</mml:mo></mml:mrow></mml:msubsup> <mml:msub><mml:mi>X</mml:mi> <mml:mn>10</mml:mn></mml:msub> <mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></alternatives></disp-formula>
where <bold><italic>β</italic></bold><sup>(<italic>AS</italic>)</sup> are the “average slope” coefficients. In contrast to the “projected” regression coefficients, here each variable is considered individually. Each nonlinear effect <italic>g</italic><sub><italic>j</italic></sub>(<italic>x</italic>) is approximated as <italic>α</italic><sub><italic>j</italic></sub><italic>x</italic>, where <italic>α</italic><sub><italic>j</italic></sub> is the average slope of <italic>g</italic><sub><italic>j</italic></sub> weighted by the density of the <italic>j</italic>-th predictor <italic>X</italic><sub><italic>j</italic></sub>.</p>
<p>More precisely, let <italic>f</italic><sub><italic>j</italic></sub> be the density function of <italic>X</italic><sub><italic>j</italic></sub> as estimated from the NHANES data (i.e., the density function that is used as input for the simulation, see Section 3.2.1). We aim to approximate the “average slope” integral
<disp-formula id="pone.0308543.e047"><alternatives><graphic id="pone.0308543.e047g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e047" xlink:type="simple"/><mml:math display="block" id="M47"><mml:mrow><mml:msubsup><mml:mo>∫</mml:mo> <mml:mrow><mml:mi>m</mml:mi> <mml:mi>i</mml:mi> <mml:mi>n</mml:mi> <mml:mo>(</mml:mo> <mml:msub><mml:mi>X</mml:mi> <mml:mi>j</mml:mi></mml:msub> <mml:mo>)</mml:mo></mml:mrow> <mml:mrow><mml:mi>m</mml:mi> <mml:mi>a</mml:mi> <mml:mi>x</mml:mi> <mml:mo>(</mml:mo> <mml:msub><mml:mi>X</mml:mi> <mml:mi>j</mml:mi></mml:msub> <mml:mo>)</mml:mo></mml:mrow></mml:msubsup> <mml:msubsup><mml:mi>g</mml:mi> <mml:mi>j</mml:mi> <mml:mo>′</mml:mo></mml:msubsup> <mml:mrow><mml:mo>(</mml:mo> <mml:mi>x</mml:mi> <mml:mo>)</mml:mo></mml:mrow> <mml:msub><mml:mi>f</mml:mi> <mml:mi>j</mml:mi></mml:msub> <mml:mrow><mml:mo>(</mml:mo> <mml:mi>x</mml:mi> <mml:mo>)</mml:mo></mml:mrow> <mml:mi>d</mml:mi> <mml:mi>x</mml:mi></mml:mrow></mml:math></alternatives></disp-formula></p>
<p>For this purpose, we construct a partition <inline-formula id="pone.0308543.e048"><alternatives><graphic id="pone.0308543.e048g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e048" xlink:type="simple"/><mml:math display="inline" id="M48"><mml:mrow><mml:msubsup><mml:mi>x</mml:mi> <mml:mn>1</mml:mn> <mml:mi>j</mml:mi></mml:msubsup> <mml:mo>=</mml:mo> <mml:mtext>min</mml:mtext> <mml:mrow><mml:mo>(</mml:mo> <mml:msub><mml:mi>X</mml:mi> <mml:mi>j</mml:mi></mml:msub> <mml:mo>)</mml:mo></mml:mrow> <mml:mo>≤</mml:mo> <mml:mo>…</mml:mo> <mml:mo>≤</mml:mo> <mml:msubsup><mml:mi>x</mml:mi> <mml:mn>1001</mml:mn> <mml:mi>j</mml:mi></mml:msubsup> <mml:mo>=</mml:mo> <mml:mtext>max</mml:mtext> <mml:mrow><mml:mo>(</mml:mo> <mml:msub><mml:mi>X</mml:mi> <mml:mi>j</mml:mi></mml:msub> <mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:math></alternatives></inline-formula> of the range of <italic>X</italic><sub><italic>j</italic></sub> with equal sub-interval lengths <inline-formula id="pone.0308543.e049"><alternatives><graphic id="pone.0308543.e049g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e049" xlink:type="simple"/><mml:math display="inline" id="M49"><mml:mrow><mml:msub><mml:mi>d</mml:mi> <mml:mi>j</mml:mi></mml:msub> <mml:mo>=</mml:mo> <mml:msubsup><mml:mi>x</mml:mi> <mml:mrow><mml:mi>k</mml:mi> <mml:mo>+</mml:mo> <mml:mn>1</mml:mn></mml:mrow> <mml:mi>j</mml:mi></mml:msubsup> <mml:mo>-</mml:mo> <mml:msubsup><mml:mi>x</mml:mi> <mml:mi>k</mml:mi> <mml:mi>j</mml:mi></mml:msubsup></mml:mrow></mml:math></alternatives></inline-formula>, where min(<italic>X</italic><sub><italic>j</italic></sub>), max(<italic>X</italic><sub><italic>j</italic></sub>) are obtained from the NHANES data. Then the integral is approximated by
<disp-formula id="pone.0308543.e050"><alternatives><graphic id="pone.0308543.e050g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e050" xlink:type="simple"/><mml:math display="block" id="M50"><mml:mrow><mml:msub><mml:mi>α</mml:mi> <mml:mi>j</mml:mi></mml:msub> <mml:mo>=</mml:mo> <mml:munderover><mml:mo>∑</mml:mo> <mml:mrow><mml:mi>k</mml:mi> <mml:mo>=</mml:mo> <mml:mn>1</mml:mn></mml:mrow> <mml:mn>1000</mml:mn></mml:munderover> <mml:msub><mml:mi>d</mml:mi> <mml:mi>j</mml:mi></mml:msub> <mml:msubsup><mml:mi>g</mml:mi> <mml:mi>j</mml:mi> <mml:mo>′</mml:mo></mml:msubsup> <mml:mrow><mml:mo>(</mml:mo> <mml:msubsup><mml:mi>x</mml:mi> <mml:mi>k</mml:mi> <mml:mi>j</mml:mi></mml:msubsup> <mml:mo>)</mml:mo></mml:mrow> <mml:msub><mml:mi>f</mml:mi> <mml:mi>j</mml:mi></mml:msub> <mml:mrow><mml:mo>(</mml:mo> <mml:msubsup><mml:mi>x</mml:mi> <mml:mi>k</mml:mi> <mml:mi>j</mml:mi></mml:msubsup> <mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:math></alternatives></disp-formula>
Finally, <bold><italic>β</italic></bold><sup>(<italic>AS</italic>)</sup> is obtained by setting <inline-formula id="pone.0308543.e051"><alternatives><graphic id="pone.0308543.e051g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e051" xlink:type="simple"/><mml:math display="inline" id="M51"><mml:mrow><mml:msubsup><mml:mi>β</mml:mi> <mml:mi>j</mml:mi> <mml:mrow><mml:mo>(</mml:mo> <mml:mi>A</mml:mi> <mml:mi>S</mml:mi> <mml:mo>)</mml:mo></mml:mrow></mml:msubsup> <mml:mo>=</mml:mo> <mml:msubsup><mml:mi>β</mml:mi> <mml:mi>j</mml:mi> <mml:mrow><mml:mo>(</mml:mo> <mml:mi>g</mml:mi> <mml:mo>)</mml:mo></mml:mrow></mml:msubsup> <mml:msub><mml:mi>α</mml:mi> <mml:mi>j</mml:mi></mml:msub></mml:mrow></mml:math></alternatives></inline-formula>.</p>
</sec>
<sec id="sec013">
<title>3.4 Methods (M)</title>
<sec id="sec014">
<title>3.4.1 Overview of variable selection methods</title>
<p>We include the following methods:</p>
<list list-type="bullet">
<list-item>
<p>Forward selection with AIC: starting from the model containing only the intercept, variables are iteratively added to the model based on their capability to decrease the AIC when included.</p>
</list-item>
<list-item>
<p>Stepwise forward selection with AIC (i.e., forward selection with backward elimination steps): like simple forward selection, this method starts from the intercept model and adds variables based on the AIC. However, in each step, re-exclusion of already selected variables is allowed, based on the capability to decrease the AIC when removed.</p>
</list-item>
<list-item>
<p>Backward elimination with <italic>α</italic> = 0.05, with BIC, with AIC, and with <italic>α</italic> = 0.5: starting from the global model, variables are iteratively removed, either based on their capability to decrease the BIC/AIC when removed, or based on the <italic>p</italic>-values of their coefficients. We do not consider a stepwise variant of backward elimination with forward selection steps, following the recommendations of Royston and Sauerbrei [<xref ref-type="bibr" rid="pone.0308543.ref028">28</xref>, p. 32] who argue that allowing re-inclusion of removed variables in backward elimination is rarely relevant, while allowing re-exclusion of included variables may cause a notable difference for forward selection.</p>
</list-item>
<list-item>
<p>Augmented backward elimination (ABE) with AIC [<xref ref-type="bibr" rid="pone.0308543.ref015">15</xref>]: backward elimination is combined with the change-in-estimate criterion [<xref ref-type="bibr" rid="pone.0308543.ref034">34</xref>, <xref ref-type="bibr" rid="pone.0308543.ref035">35</xref>]. A variable that would be removed in backward elimination based on AIC may stay in the model if its removal would induce a large change in the estimated regression coefficients of the other variables that are currently in the model. As threshold for the standardized change-in-estimate, we choose <italic>τ</italic> = 0.05. We will use the R package <monospace specific-use="no-wrap">abe</monospace> [<xref ref-type="bibr" rid="pone.0308543.ref036">36</xref>].</p>
</list-item>
<list-item>
<p>Univariable selection with <italic>α</italic> = 0.05 and <italic>α</italic> = 0.20: a variable is selected if its regression coefficient in a univariable model is significant at level <italic>α</italic>. While many authors have advised against using univariable selection [<xref ref-type="bibr" rid="pone.0308543.ref005">5</xref>, <xref ref-type="bibr" rid="pone.0308543.ref037">37</xref>, <xref ref-type="bibr" rid="pone.0308543.ref038">38</xref>], the method is still often used in practice, which is why we include it in our simulation study.</p>
</list-item>
<list-item>
<p>Univariable selection with <italic>α</italic> = 0.20, followed by backward elimination with <italic>α</italic> = 0.05: frequently, researchers use this combination instead of using only univariable selection or only backward elimination [<xref ref-type="bibr" rid="pone.0308543.ref039">39</xref>, <xref ref-type="bibr" rid="pone.0308543.ref040">40</xref>]. However, the warnings against univariable selection still apply to the combination method.</p>
</list-item>
<list-item>
<p>Lasso [<xref ref-type="bibr" rid="pone.0308543.ref016">16</xref>]: a penalty on the coefficients is added to the OLS criterion (linear regression) or the negative log-likelihood (logistic regression), causing shrinkage of the coefficients toward zero and setting some of them to exactly zero.</p>
</list-item>
<list-item>
<p>Relaxed Lasso [<xref ref-type="bibr" rid="pone.0308543.ref009">9</xref>, <xref ref-type="bibr" rid="pone.0308543.ref017">17</xref>]: variables are selected with the Lasso, but the shrinkage of the coefficients of the selected variables is relaxed by refitting the model with the selected variables without penalty.</p>
</list-item>
<list-item>
<p>Adaptive Lasso [<xref ref-type="bibr" rid="pone.0308543.ref018">18</xref>]: first, the global linear/logistic model is fit, then a Lasso with variable-specific weights for the penalty is estimated. The estimates from the first step serve to get the variable-specific weights for the second step: the weights are calculated such that a variable with larger regression coefficient in the first step is penalized less than a variable with smaller regression coefficient.</p>
<p>For all variants of the Lasso, we will use the R package <monospace specific-use="no-wrap">glmnet</monospace> [<xref ref-type="bibr" rid="pone.0308543.ref041">41</xref>]. The complexity parameter λ will be tuned with 10-fold cross-validation (CV). As performance criterion for the prediction on test sets during CV, we use the mean squared error for linear regression and deviance for logistic regression. For the relaxed Lasso, we additionally consider tuning λ with the BIC.</p>
</list-item>
</list>
<p>We also consider the global model with all variables.</p>
</sec>
<sec id="sec015">
<title>3.4.2 Firth correction in logistic regression</title>
<p>In the models for logistic regression, separation may occur (i.e., perfect separation of events and non-events by a linear combination of covariates), particularly for small to medium sample sizes and low event rates [<xref ref-type="bibr" rid="pone.0308543.ref042">42</xref>]. In this case, at least one parameter estimate is infinite. While separation can be detected by linear programming [<xref ref-type="bibr" rid="pone.0308543.ref043">43</xref>], we found that in practice, a simple and robust check can be performed by inspecting the model standard errors of the regression coefficients. If at least one standard error is extremely large, this indicates separation. A possible solution to the problem of separation is to apply the Firth correction to obtain finite parameter estimates [<xref ref-type="bibr" rid="pone.0308543.ref042">42</xref>, <xref ref-type="bibr" rid="pone.0308543.ref044">44</xref>].</p>
<p>In the simulation settings for logistic regression, we check for each individual simulated dataset whether separation occurs. In the case of separation, we apply the Firth correction (with the FLIC intercept correction [<xref ref-type="bibr" rid="pone.0308543.ref045">45</xref>] to obtain unbiased predictions), otherwise we use the standard logistic regression. When Firth correction is applied, confidence intervals for the regression coefficients are calculated based on the profile penalized likelihood, otherwise based on the profile likelihood.</p>
<p>We describe our procedure to check for separation based on the model standard errors of the coefficients in <xref ref-type="supplementary-material" rid="pone.0308543.s004">S1 Appendix</xref>.</p>
</sec>
</sec>
<sec id="sec016">
<title>3.5 Performance measures (P)</title>
<p>We organize the performance measures into three categories, based on which estimands/targets they pertain to. Formulas for all performance measures are given in <xref ref-type="supplementary-material" rid="pone.0308543.s004">S1 Appendix</xref>.</p>
<p>Performance measures for the <bold>regression coefficients</bold> as estimands include <italic>bias</italic> and <inline-formula id="pone.0308543.e052"><alternatives><graphic id="pone.0308543.e052g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e052" xlink:type="simple"/><mml:math display="inline" id="M52"><mml:mrow><mml:mi>R</mml:mi> <mml:mi>M</mml:mi> <mml:mi>S</mml:mi> <mml:mi>E</mml:mi> <mml:mo>·</mml:mo> <mml:msqrt><mml:mi>n</mml:mi></mml:msqrt></mml:mrow></mml:math></alternatives></inline-formula> (root of expected mean squared error multiplied by <inline-formula id="pone.0308543.e053"><alternatives><graphic id="pone.0308543.e053g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e053" xlink:type="simple"/><mml:math display="inline" id="M53"><mml:msqrt><mml:mi>n</mml:mi></mml:msqrt></mml:math></alternatives></inline-formula>) of the estimated regression coefficients, and <italic>coverage</italic> and <inline-formula id="pone.0308543.e054"><alternatives><graphic id="pone.0308543.e054g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e054" xlink:type="simple"/><mml:math display="inline" id="M54"><mml:mrow><mml:mi>w</mml:mi> <mml:mi>i</mml:mi> <mml:mi>d</mml:mi> <mml:mi>t</mml:mi> <mml:mi>h</mml:mi> <mml:mo>·</mml:mo> <mml:msqrt><mml:mi>n</mml:mi></mml:msqrt></mml:mrow></mml:math></alternatives></inline-formula> of the 95% confidence intervals of the coefficients. (The RMSE of coefficients and width of confidence intervals are multiplied by <inline-formula id="pone.0308543.e055"><alternatives><graphic id="pone.0308543.e055g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e055" xlink:type="simple"/><mml:math display="inline" id="M55"><mml:msqrt><mml:mi>n</mml:mi></mml:msqrt></mml:math></alternatives></inline-formula> for better comparability across sample sizes.) 
Moreover, we consider the <italic>power</italic> for predictors and the <italic>type 1 error</italic> for noise variables (i.e., whether the confidence interval for the respective regression coefficient contains zero), as well as the selection rates of the variables (i.e., whether the regression coefficients are zero): the <italic>true positive rate</italic> for predictors and the <italic>false positive rate</italic> for noise variables. We also include Kendall’s <italic>τ</italic><sub><italic>B</italic></sub> [<xref ref-type="bibr" rid="pone.0308543.ref046">46</xref>] to measure the agreement of the estimated ranking of variables (defined by ordering the variables based on absolute values of the estimated standardized regression coefficients) with the “true” ranking of the variables (defined by ordering the variables based on absolute values of the true standardized regression coefficients).</p>
<p>For bias and <inline-formula id="pone.0308543.e056"><alternatives><graphic id="pone.0308543.e056g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e056" xlink:type="simple"/><mml:math display="inline" id="M56"><mml:mrow><mml:mtext>RMSE</mml:mtext> <mml:mo>·</mml:mo> <mml:msqrt><mml:mi>n</mml:mi></mml:msqrt></mml:mrow></mml:math></alternatives></inline-formula> of coefficients, coverage and width <inline-formula id="pone.0308543.e057"><alternatives><graphic id="pone.0308543.e057g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e057" xlink:type="simple"/><mml:math display="inline" id="M57"><mml:mrow><mml:mo>·</mml:mo> <mml:msqrt><mml:mi>n</mml:mi></mml:msqrt></mml:mrow></mml:math></alternatives></inline-formula> of confidence intervals, and type 1 error/power for variables, the calculation can be performed unconditionally or conditionally on selection. In the unconditional approach, the coefficients and their confidence limits for non-selected variables are set to zero, while the conditional approach includes only simulation runs where the specific variable is selected.</p>
<p>Performance measures for <bold>model selection</bold> as target include the <italic>selection rate of the true model</italic> consisting exactly of the ten predictors, the <italic>selection rate of an “over-selection” model</italic> which we define as a model including all predictors as well as at least one noise variable (previously called an “inflated” model [<xref ref-type="bibr" rid="pone.0308543.ref015">15</xref>]), and the <italic>selection rate of any “under-selection” model</italic> defined as a model not containing all predictors but possibly including noise variables (previously called a “biased” model [<xref ref-type="bibr" rid="pone.0308543.ref015">15</xref>]).</p>
<p>Finally, we use multiple performance measures for <bold>prediction</bold>. Predictive performance is evaluated on a large test dataset (<italic>n</italic><sub><italic>test</italic></sub> = 10,000). One test dataset is simulated for each simulation setting. Prediction is assessed locally, i.e., at each value of the true linear predictor (<italic>local bias</italic> and <italic>local</italic> <inline-formula id="pone.0308543.e058"><alternatives><graphic id="pone.0308543.e058g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e058" xlink:type="simple"/><mml:math display="inline" id="M58"><mml:mrow><mml:mi>R</mml:mi> <mml:mi>M</mml:mi> <mml:mi>S</mml:mi> <mml:mi>E</mml:mi> <mml:mo>·</mml:mo> <mml:msqrt><mml:mi>n</mml:mi></mml:msqrt></mml:mrow></mml:math></alternatives></inline-formula>), as well as globally with the <italic>global</italic> <inline-formula id="pone.0308543.e059"><alternatives><graphic id="pone.0308543.e059g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e059" xlink:type="simple"/><mml:math display="inline" id="M59"><mml:mrow><mml:mi>R</mml:mi> <mml:mi>M</mml:mi> <mml:mi>S</mml:mi> <mml:mi>E</mml:mi> <mml:mo>·</mml:mo> <mml:msqrt><mml:mi>n</mml:mi></mml:msqrt></mml:mrow></mml:math></alternatives></inline-formula> (i.e., the root of the expected mean squared error of the estimated vs. true linear predictor multiplied by <inline-formula id="pone.0308543.e060"><alternatives><graphic id="pone.0308543.e060g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e060" xlink:type="simple"/><mml:math display="inline" id="M60"><mml:msqrt><mml:mi>n</mml:mi></mml:msqrt></mml:math></alternatives></inline-formula>) and the <italic>global MAE</italic> (i.e., the expected median absolute error of the estimated vs. true linear predictor). 
For logistic regression, global predictive performance is additionally evaluated with the <italic>AUC</italic>, i.e., area under the receiver operating characteristic curve. For both linear and logistic regression, the calibration of the predictions is measured with the <italic>integrated calibration index</italic> (ICI) [<xref ref-type="bibr" rid="pone.0308543.ref047">47</xref>]. The ICI is defined as the mean distance of the predicted outcomes/probabilities to the corresponding points on the calibration curve.</p>
<p>The performance measures for the regression coefficients and for model selection are primarily relevant for descriptive models, while performance measures for predictive performance are mainly relevant for prediction models. However, a descriptive model may also be suitable for prediction; therefore, performance measures for prediction could also be relevant for descriptive modeling. Vice versa, in prediction models, aspects such as interpretability, fairness etc. often play an important role; researchers might thus consider performance measures such as bias of coefficients also for prediction models.</p>
</sec>
<sec id="sec017">
<title>3.6 Monte Carlo errors and number of simulation runs</title>
<p>The number of simulation repetitions <italic>n</italic><sub><italic>sim</italic></sub> must be chosen large enough to estimate the performance measures with suitable accuracy, i.e., the Monte Carlo errors of the measures must be acceptable. We use the coverage of the confidence intervals as reference measure. The Monte Carlo standard error for the coverage can be calculated with the formula
<disp-formula id="pone.0308543.e061"><alternatives><graphic id="pone.0308543.e061g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e061" xlink:type="simple"/><mml:math display="block" id="M61"><mml:mrow><mml:msqrt><mml:mfrac><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>c</mml:mi> <mml:mi>o</mml:mi> <mml:mi>v</mml:mi> <mml:mi>e</mml:mi> <mml:mi>r</mml:mi></mml:mrow> <mml:mo>^</mml:mo></mml:mover> <mml:mrow><mml:mo>(</mml:mo> <mml:mn>1</mml:mn> <mml:mo>-</mml:mo> <mml:mover accent="true"><mml:mrow><mml:mi>c</mml:mi> <mml:mi>o</mml:mi> <mml:mi>v</mml:mi> <mml:mi>e</mml:mi> <mml:mi>r</mml:mi></mml:mrow> <mml:mo>^</mml:mo></mml:mover> <mml:mo>)</mml:mo></mml:mrow></mml:mrow> <mml:msub><mml:mi>n</mml:mi> <mml:mrow><mml:mi>s</mml:mi> <mml:mi>i</mml:mi> <mml:mi>m</mml:mi></mml:mrow></mml:msub></mml:mfrac></mml:msqrt> <mml:mo>,</mml:mo></mml:mrow></mml:math></alternatives></disp-formula>
where <inline-formula id="pone.0308543.e062"><alternatives><graphic id="pone.0308543.e062g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e062" xlink:type="simple"/><mml:math display="inline" id="M62"><mml:mover accent="true"><mml:mrow><mml:mi>c</mml:mi> <mml:mi>o</mml:mi> <mml:mi>v</mml:mi> <mml:mi>e</mml:mi> <mml:mi>r</mml:mi></mml:mrow> <mml:mo>^</mml:mo></mml:mover></mml:math></alternatives></inline-formula> is the coverage estimated via simulation [<xref ref-type="bibr" rid="pone.0308543.ref019">19</xref>]. If <italic>n</italic><sub><italic>sim</italic></sub> = 2000 and <inline-formula id="pone.0308543.e063"><alternatives><graphic id="pone.0308543.e063g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e063" xlink:type="simple"/><mml:math display="inline" id="M63"><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>c</mml:mi> <mml:mi>o</mml:mi> <mml:mi>v</mml:mi> <mml:mi>e</mml:mi> <mml:mi>r</mml:mi></mml:mrow> <mml:mo>^</mml:mo></mml:mover> <mml:mo>=</mml:mo> <mml:mn>95</mml:mn> <mml:mo>%</mml:mo></mml:mrow></mml:math></alternatives></inline-formula>, the Monte Carlo SE is about 0.5%. If <italic>n</italic><sub><italic>sim</italic></sub> = 2000 and <inline-formula id="pone.0308543.e064"><alternatives><graphic id="pone.0308543.e064g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e064" xlink:type="simple"/><mml:math display="inline" id="M64"><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>c</mml:mi> <mml:mi>o</mml:mi> <mml:mi>v</mml:mi> <mml:mi>e</mml:mi> <mml:mi>r</mml:mi></mml:mrow> <mml:mo>^</mml:mo></mml:mover> <mml:mo>=</mml:mo> <mml:mn>50</mml:mn> <mml:mo>%</mml:mo></mml:mrow></mml:math></alternatives></inline-formula> (the worst-case scenario), the Monte Carlo SE is about 1%, which is still acceptable. 
Therefore, we plan to use <italic>n</italic><sub><italic>sim</italic></sub> = 2000 in our simulation for all settings (provided that this will be computationally feasible). We will then calculate the Monte Carlo SEs for all performance measures.</p>
</sec>
</sec>
<sec id="sec018">
<title>4 Code review</title>
<p>To ensure reproducibility, as well as readability, the code will be checked by another researcher (a “code reviewer”) who works at the same institute as the first, second and last author of this protocol, but was not involved in planning the study. After writing the code, the first author (T.U.) will hand over the code to the code reviewer, together with instructions for running the code as well as some partial results (using less than the full <italic>n</italic><sub><italic>sim</italic></sub> = 2000 repetitions). The code reviewer will then check the plausibility of the partial results and provide feedback on the simulation code, focusing on a) data generation, b) the implementation of the compared models, and c) the implementation of the performance measures applied to these models. Once T.U. and the code reviewer have agreed upon a final version of the code, T.U. will re-run the partial results, and the code reviewer will check the complete computational reproducibility by re-running the code on another machine. This check for reproducibility is done on the partial results as the generation of the final results is expected to require large amounts of computational resources. Once the reproducibility check has successfully concluded, T.U. will perform the full <italic>n</italic><sub><italic>sim</italic></sub> = 2000 repetitions to generate the final results.</p>
</sec>
<sec id="sec019">
<title>5 Final remarks</title>
<p>Our simulation study will enable researchers to better understand the consequences of variable selection, and will clarify differences in the performance of different selection methods depending on the considered scenarios. To make the results of the study more accessible and interpretable, we plan to display all results in an interactive web app (Shiny app) that will be published alongside the main paper. We will also make our code available on a Git repository, and will specify random seeds to ensure reproducibility of the results.</p>
<p>The performance measures for our study (Section 3.5) are defined as expected values and probabilities. Their estimation by simulation thus always involves taking the mean over (a part of) the simulation repetitions. However, if one only calculates the mean over the repetitions, one might miss relevant properties of the distribution of the values over the simulation repetitions. We will thus use distribution plots and correlation analyses to evaluate the simulation results in more detail [<xref ref-type="bibr" rid="pone.0308543.ref019">19</xref>]. Moreover, we will analyze how many variables were selected by each variable selection method. We did not include model size as a performance measure in Section 3.5 because there is no clear target value and smaller/larger values are not automatically better/worse (a smaller model size is preferable in some applications, but might be less relevant in others). A specific focus on model size (e.g., comparing different variable selection methods under constraints w.r.t. the number of chosen variables) would require a different study design.</p>
<p>Multicollinearity is an important topic in the context of variable selection. Data-driven variable selection methods tend to perform worse if there is a high degree of correlation between the predictors, and their performance will improve the less the predictors are correlated with each other. Before regression analysis is performed in an applied study, the correlations between the independent variables should be checked during initial data analysis [<xref ref-type="bibr" rid="pone.0308543.ref048">48</xref>]. For our simulation study, we have carefully chosen true correlations between the independent variables based on a real correlation matrix from NHANES data. As mentioned in Section 3.2.1, <xref ref-type="supplementary-material" rid="pone.0308543.s002">S2 Fig</xref> shows the coefficients of determination <inline-formula id="pone.0308543.e065"><alternatives><graphic id="pone.0308543.e065g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e065" xlink:type="simple"/><mml:math display="inline" id="M65"><mml:msubsup><mml:mi>R</mml:mi> <mml:mi>j</mml:mi> <mml:mn>2</mml:mn></mml:msubsup></mml:math></alternatives></inline-formula> for the regression of each variable <italic>X</italic><sub><italic>j</italic></sub> on all other respective variables <italic>X</italic><sub><italic>l</italic></sub>, <italic>l</italic> = 1, …, 20, <italic>l</italic> ≠ <italic>j</italic>. The <inline-formula id="pone.0308543.e066"><alternatives><graphic id="pone.0308543.e066g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0308543.e066" xlink:type="simple"/><mml:math display="inline" id="M66"><mml:msubsup><mml:mi>R</mml:mi> <mml:mi>j</mml:mi> <mml:mn>2</mml:mn></mml:msubsup></mml:math></alternatives></inline-formula> values range from 0 to 0.56, demonstrating differing degrees of dependence between the predictors. These values will be considered when interpreting the simulation results.</p>
<p>In future work, it would be interesting to consider various extensions of our simulation. For example, while we focus on linear and logistic regression in the present protocol, data-driven variable selection is also often used in the context of survival analysis. We plan to conduct a further simulation study comparing different data-driven variable selection methods for Cox regression and the accelerated failure time model.</p>
<p>In the present study, we include several settings where all predictors have true <italic>nonlinear</italic> functional forms, but we nevertheless fit all models with <italic>linear</italic> functional forms; this mimics the frequent misspecification of models in practice. Generally, when fitting a regression model with linear effects, it is advisable to check for misspecification by analyzing the residuals. If misspecification is only mild, then a model with linear effects might still be justifiable. If misspecification is too severe, functional form selection can be performed to account for nonlinear effects, e.g., with spline-based approaches. In future work, our study could be extended by considering the <italic>combination</italic> of variable selection and functional form selection, which is a complex issue [<xref ref-type="bibr" rid="pone.0308543.ref039">39</xref>].</p>
<p>We focus on low-dimensional data in our study. Future studies could compare variable selection methods for high-dimensional data. Finally, our study considers variable selection in a frequentist framework. Future simulation studies could also evaluate Bayesian methods for variable selection.</p>
</sec>
<sec id="sec020" sec-type="supplementary-material">
<title>Supporting information</title>
<supplementary-material id="pone.0308543.s001" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pone.0308543.s001" xlink:type="simple">
<label>S1 Fig</label>
<caption>
<title>Correlation network graph.</title>
<p>(PDF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pone.0308543.s002" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pone.0308543.s002" xlink:type="simple">
<label>S2 Fig</label>
<caption>
<title>Absolute standardized regression coefficients plotted against coefficients of determination for each independent variable.</title>
<p>(PDF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pone.0308543.s003" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pone.0308543.s003" xlink:type="simple">
<label>S3 Fig</label>
<caption>
<title>Nonlinear effects.</title>
<p>(PDF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pone.0308543.s004" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pone.0308543.s004" xlink:type="simple">
<label>S1 Appendix</label>
<caption>
<title>Details of the simulation design.</title>
<p>(PDF)</p>
</caption>
</supplementary-material>
<supplementary-material id="pone.0308543.s005" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pone.0308543.s005" xlink:type="simple">
<label>S1 Table</label>
<caption>
<title>Correlation table.</title>
<p>(PDF)</p>
</caption>
</supplementary-material>
</sec>
</body>
<back>
<ack>
<p>We would like to thank the members of Topic Group (TG) 2 and the Publications Panel of the STRengthening Analytical Thinking for Observational Studies (STRATOS) initiative for helpful comments. In particular, we thank Willi Sauerbrei, Frank Harrell, Nadja Klein and Harald Binder.</p>
<p>At the time of submission, STRATOS TG2 consisted of the following members (in alphabetical order): Michal Abrahamowicz, Harald Binder, Daniela Dunkler, Frank Harrell, Georg Heinze, Marc Henrion, Michael Kammer, Aris Perperoglou, Willi Sauerbrei, and Matthias Schmid. The group is co-chaired by Georg Heinze (<email xlink:type="simple">georg.heinze@meduniwien.ac.at</email>), Aris Perperoglou, and Willi Sauerbrei.</p>
</ack>
<ref-list>
<title>References</title>
<ref id="pone.0308543.ref001">
<label>1</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Heinze</surname> <given-names>G</given-names></name>, <name name-style="western"><surname>Wallisch</surname> <given-names>C</given-names></name>, <name name-style="western"><surname>Dunkler</surname> <given-names>D</given-names></name>. <article-title>Variable selection–a review and recommendations for the practicing statistician</article-title>. <source>Biometrical Journal</source>. <year>2018</year>;<volume>60</volume>(<issue>3</issue>):<fpage>431</fpage>–<lpage>449</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1002/bimj.201700067" xlink:type="simple">10.1002/bimj.201700067</ext-link></comment> <object-id pub-id-type="pmid">29292533</object-id></mixed-citation>
</ref>
<ref id="pone.0308543.ref002">
<label>2</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Sauerbrei</surname> <given-names>W</given-names></name>, <name name-style="western"><surname>Buchholz</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Boulesteix</surname> <given-names>AL</given-names></name>, <name name-style="western"><surname>Binder</surname> <given-names>H</given-names></name>. <article-title>On stability issues in deriving multivariable regression models</article-title>. <source>Biometrical Journal</source>. <year>2015</year>;<volume>57</volume>(<issue>4</issue>):<fpage>531</fpage>–<lpage>555</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1002/bimj.201300222" xlink:type="simple">10.1002/bimj.201300222</ext-link></comment> <object-id pub-id-type="pmid">25501529</object-id></mixed-citation>
</ref>
<ref id="pone.0308543.ref003">
<label>3</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Steyerberg</surname> <given-names>EW</given-names></name>, <name name-style="western"><surname>Eijkemans</surname> <given-names>MJ</given-names></name>, <name name-style="western"><surname>Habbema</surname> <given-names>JDF</given-names></name>. <article-title>Stepwise selection in small data sets: a simulation study of bias in logistic regression analysis</article-title>. <source>Journal of Clinical Epidemiology</source>. <year>1999</year>;<volume>52</volume>(<issue>10</issue>):<fpage>935</fpage>–<lpage>942</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1016/S0895-4356(99)00103-1" xlink:type="simple">10.1016/S0895-4356(99)00103-1</ext-link></comment> <object-id pub-id-type="pmid">10513756</object-id></mixed-citation>
</ref>
<ref id="pone.0308543.ref004">
<label>4</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Hurvich</surname> <given-names>CM</given-names></name>, <name name-style="western"><surname>Tsai</surname> <given-names>C</given-names></name>. <article-title>The impact of model selection on inference in linear regression</article-title>. <source>The American Statistician</source>. <year>1990</year>;<volume>44</volume>(<issue>3</issue>):<fpage>214</fpage>–<lpage>217</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1080/00031305.1990.10475722" xlink:type="simple">10.1080/00031305.1990.10475722</ext-link></comment></mixed-citation>
</ref>
<ref id="pone.0308543.ref005">
<label>5</label>
<mixed-citation publication-type="book" xlink:type="simple">
<name name-style="western"><surname>Harrell</surname> <given-names>FE</given-names></name>. <chapter-title>Regression Modeling Strategies: With Applications to Linear Models, Logistic and Ordinal Regression, and Survival Analysis</chapter-title>. <source>Springer Series in Statistics</source>. <publisher-loc>Cham</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name>; <year>2015</year>.</mixed-citation>
</ref>
<ref id="pone.0308543.ref006">
<label>6</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Mickey</surname> <given-names>RM</given-names></name>, <name name-style="western"><surname>Greenland</surname> <given-names>S</given-names></name>. <article-title>The impact of confounder selection criteria on effect estimation</article-title>. <source>American Journal of Epidemiology</source>. <year>1989</year>;<volume>129</volume>(<issue>1</issue>):<fpage>125</fpage>–<lpage>137</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1093/oxfordjournals.aje.a115101" xlink:type="simple">10.1093/oxfordjournals.aje.a115101</ext-link></comment> <object-id pub-id-type="pmid">2910056</object-id></mixed-citation>
</ref>
<ref id="pone.0308543.ref007">
<label>7</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Maldonado</surname> <given-names>G</given-names></name>, <name name-style="western"><surname>Greenland</surname> <given-names>S</given-names></name>. <article-title>Simulation study of confounder-selection strategies</article-title>. <source>American Journal of Epidemiology</source>. <year>1993</year>;<volume>138</volume>(<issue>11</issue>):<fpage>923</fpage>–<lpage>936</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1093/oxfordjournals.aje.a116813" xlink:type="simple">10.1093/oxfordjournals.aje.a116813</ext-link></comment> <object-id pub-id-type="pmid">8256780</object-id></mixed-citation>
</ref>
<ref id="pone.0308543.ref008">
<label>8</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Derksen</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Keselman</surname> <given-names>HJ</given-names></name>. <article-title>Backward, forward and stepwise automated subset selection algorithms: Frequency of obtaining authentic and noise variables</article-title>. <source>British Journal of Mathematical and Statistical Psychology</source>. <year>1992</year>;<volume>45</volume>(<issue>2</issue>):<fpage>265</fpage>–<lpage>282</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1111/j.2044-8317.1992.tb00992.x" xlink:type="simple">10.1111/j.2044-8317.1992.tb00992.x</ext-link></comment></mixed-citation>
</ref>
<ref id="pone.0308543.ref009">
<label>9</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Hastie</surname> <given-names>T</given-names></name>, <name name-style="western"><surname>Tibshirani</surname> <given-names>R</given-names></name>, <name name-style="western"><surname>Tibshirani</surname> <given-names>R</given-names></name>. <article-title>Best Subset, Forward Stepwise or Lasso? Analysis and Recommendations Based on Extensive Comparisons</article-title>. <source>Statistical Science</source>. <year>2020</year>;<volume>35</volume>(<issue>4</issue>):<fpage>579</fpage>–<lpage>592</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1214/19-STS733" xlink:type="simple">10.1214/19-STS733</ext-link></comment></mixed-citation>
</ref>
<ref id="pone.0308543.ref010">
<label>10</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Hanke</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Dijkstra</surname> <given-names>L</given-names></name>, <name name-style="western"><surname>Foraita</surname> <given-names>R</given-names></name>, <name name-style="western"><surname>Didelez</surname> <given-names>V</given-names></name>. <article-title>Variable selection in linear regression models: choosing the best subset is not always the best choice</article-title>. <source>Biometrical Journal</source>. <year>2023</year>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1002/bimj.202200209" xlink:type="simple">10.1002/bimj.202200209</ext-link></comment> <object-id pub-id-type="pmid">37643390</object-id></mixed-citation>
</ref>
<ref id="pone.0308543.ref011">
<label>11</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Boulesteix</surname> <given-names>AL</given-names></name>, <name name-style="western"><surname>Lauer</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Eugster</surname> <given-names>MJ</given-names></name>. <article-title>A plea for neutral comparison studies in computational sciences</article-title>. <source>PloS One</source>. <year>2013</year>;<volume>8</volume>(<issue>4</issue>):<fpage>e61562</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1371/journal.pone.0061562" xlink:type="simple">10.1371/journal.pone.0061562</ext-link></comment> <object-id pub-id-type="pmid">23637855</object-id></mixed-citation>
</ref>
<ref id="pone.0308543.ref012">
<label>12</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Boulesteix</surname> <given-names>AL</given-names></name>, <name name-style="western"><surname>Wilson</surname> <given-names>R</given-names></name>, <name name-style="western"><surname>Hapfelmeier</surname> <given-names>A</given-names></name>. <article-title>Towards evidence-based computational statistics: lessons from clinical research on the role and design of real-data benchmark studies</article-title>. <source>BMC Medical Research Methodology</source>. <year>2017</year>;<volume>17</volume>:<fpage>138</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1186/s12874-017-0417-2" xlink:type="simple">10.1186/s12874-017-0417-2</ext-link></comment> <object-id pub-id-type="pmid">28888225</object-id></mixed-citation>
</ref>
<ref id="pone.0308543.ref013">
<label>13</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Sauerbrei</surname> <given-names>W</given-names></name>, <name name-style="western"><surname>Abrahamowicz</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Altman</surname> <given-names>DG</given-names></name>, <name name-style="western"><surname>le Cessie</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Carpenter</surname> <given-names>J</given-names></name>, <collab>STRATOS initiative</collab>. <article-title>STRengthening analytical thinking for observational studies: the STRATOS initiative</article-title>. <source>Statistics in Medicine</source>. <year>2014</year>;<volume>33</volume>(<issue>30</issue>):<fpage>5413</fpage>–<lpage>5432</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1002/sim.6265" xlink:type="simple">10.1002/sim.6265</ext-link></comment> <object-id pub-id-type="pmid">25074480</object-id></mixed-citation>
</ref>
<ref id="pone.0308543.ref014">
<label>14</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Shmueli</surname> <given-names>G</given-names></name>. <article-title>To Explain or to Predict?</article-title> <source>Statistical Science</source>. <year>2010</year>;<volume>25</volume>(<issue>3</issue>). <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1214/10-STS330" xlink:type="simple">10.1214/10-STS330</ext-link></comment></mixed-citation>
</ref>
<ref id="pone.0308543.ref015">
<label>15</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Dunkler</surname> <given-names>D</given-names></name>, <name name-style="western"><surname>Plischke</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Leffondré</surname> <given-names>K</given-names></name>, <name name-style="western"><surname>Heinze</surname> <given-names>G</given-names></name>. <article-title>Augmented backward elimination: a pragmatic and purposeful way to develop statistical models</article-title>. <source>PloS One</source>. <year>2014</year>;<volume>9</volume>(<issue>11</issue>):<fpage>e113677</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1371/journal.pone.0113677" xlink:type="simple">10.1371/journal.pone.0113677</ext-link></comment> <object-id pub-id-type="pmid">25415265</object-id></mixed-citation>
</ref>
<ref id="pone.0308543.ref016">
<label>16</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Tibshirani</surname> <given-names>R</given-names></name>. <article-title>Regression shrinkage and selection via the lasso</article-title>. <source>Journal of the Royal Statistical Society: Series B (Methodological)</source>. <year>1996</year>;<volume>58</volume>(<issue>1</issue>):<fpage>267</fpage>–<lpage>288</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1111/j.2517-6161.1996.tb02080.x" xlink:type="simple">10.1111/j.2517-6161.1996.tb02080.x</ext-link></comment></mixed-citation>
</ref>
<ref id="pone.0308543.ref017">
<label>17</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Meinshausen</surname> <given-names>N</given-names></name>. <article-title>Relaxed lasso</article-title>. <source>Computational Statistics &amp; Data Analysis</source>. <year>2007</year>;<volume>52</volume>(<issue>1</issue>):<fpage>374</fpage>–<lpage>393</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1016/j.csda.2006.12.019" xlink:type="simple">10.1016/j.csda.2006.12.019</ext-link></comment></mixed-citation>
</ref>
<ref id="pone.0308543.ref018">
<label>18</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Zou</surname> <given-names>H</given-names></name>. <article-title>The adaptive lasso and its oracle properties</article-title>. <source>Journal of the American Statistical Association</source>. <year>2006</year>;<volume>101</volume>(<issue>476</issue>):<fpage>1418</fpage>–<lpage>1429</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1198/016214506000000735" xlink:type="simple">10.1198/016214506000000735</ext-link></comment></mixed-citation>
</ref>
<ref id="pone.0308543.ref019">
<label>19</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Morris</surname> <given-names>TP</given-names></name>, <name name-style="western"><surname>White</surname> <given-names>IR</given-names></name>, <name name-style="western"><surname>Crowther</surname> <given-names>MJ</given-names></name>. <article-title>Using simulation studies to evaluate statistical methods</article-title>. <source>Statistics in Medicine</source>. <year>2019</year>;<volume>38</volume>(<issue>11</issue>):<fpage>2074</fpage>–<lpage>2102</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1002/sim.8086" xlink:type="simple">10.1002/sim.8086</ext-link></comment> <object-id pub-id-type="pmid">30652356</object-id></mixed-citation>
</ref>
<ref id="pone.0308543.ref020">
<label>20</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Boulesteix</surname> <given-names>AL</given-names></name>, <name name-style="western"><surname>Groenwold</surname> <given-names>RH</given-names></name>, <name name-style="western"><surname>Abrahamowicz</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Binder</surname> <given-names>H</given-names></name>, <name name-style="western"><surname>Briel</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Hornung</surname> <given-names>R</given-names></name>, <etal>et al</etal>. <article-title>Introduction to statistical simulations in health research</article-title>. <source>BMJ Open</source>. <year>2020</year>;<volume>10</volume>(<issue>12</issue>):<fpage>e039921</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1136/bmjopen-2020-039921" xlink:type="simple">10.1136/bmjopen-2020-039921</ext-link></comment> <object-id pub-id-type="pmid">33318113</object-id></mixed-citation>
</ref>
<ref id="pone.0308543.ref021">
<label>21</label>
<mixed-citation publication-type="other" xlink:type="simple">Centers for Disease Control and Prevention (CDC), National Center for Health Statistics (NCHS). National Health and Nutrition Examination Survey Data; 2023. Available from: <ext-link ext-link-type="uri" xlink:href="https://www.cdc.gov/nchs/nhanes/" xlink:type="simple">https://www.cdc.gov/nchs/nhanes/</ext-link>.</mixed-citation>
</ref>
<ref id="pone.0308543.ref022">
<label>22</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Sheppard</surname> <given-names>JP</given-names></name>, <name name-style="western"><surname>Stevens</surname> <given-names>R</given-names></name>, <name name-style="western"><surname>Gill</surname> <given-names>P</given-names></name>, <name name-style="western"><surname>Martin</surname> <given-names>U</given-names></name>, <name name-style="western"><surname>Godwin</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Hanley</surname> <given-names>J</given-names></name>, <etal>et al</etal>. <article-title>Predicting Out-of-Office Blood Pressure in the Clinic (PROOF-BP): Derivation and Validation of a Tool to Improve the Accuracy of Blood Pressure Measurement in Clinical Practice</article-title>. <source>Hypertension</source>. <year>2016</year>;<volume>67</volume>(<issue>5</issue>):<fpage>941</fpage>–<lpage>950</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1161/HYPERTENSIONAHA.115.07108" xlink:type="simple">10.1161/HYPERTENSIONAHA.115.07108</ext-link></comment> <object-id pub-id-type="pmid">27001299</object-id></mixed-citation>
</ref>
<ref id="pone.0308543.ref023">
<label>23</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Wynants</surname> <given-names>L</given-names></name>, <name name-style="western"><surname>Calster</surname> <given-names>BV</given-names></name>, <name name-style="western"><surname>Collins</surname> <given-names>GS</given-names></name>, <name name-style="western"><surname>Riley</surname> <given-names>RD</given-names></name>, <name name-style="western"><surname>Heinze</surname> <given-names>G</given-names></name>, <name name-style="western"><surname>Schuit</surname> <given-names>E</given-names></name>, <etal>et al</etal>. <article-title>Prediction models for diagnosis and prognosis of covid-19: systematic review and critical appraisal [update 4]</article-title>. <source>BMJ</source>. <year>2020</year>;<volume>369</volume>:<fpage>m1328</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1136/bmj.m1328" xlink:type="simple">10.1136/bmj.m1328</ext-link></comment> <object-id pub-id-type="pmid">32265220</object-id></mixed-citation>
</ref>
<ref id="pone.0308543.ref024">
<label>24</label>
<mixed-citation publication-type="other" xlink:type="simple">COVID-19 living review, summary details per model;. <ext-link ext-link-type="uri" xlink:href="https://www.covprecise.org/living-review/" xlink:type="simple">https://www.covprecise.org/living-review/</ext-link> [Accessed: 2024-05-13].</mixed-citation>
</ref>
<ref id="pone.0308543.ref025">
<label>25</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Kipruto</surname> <given-names>E</given-names></name>, <name name-style="western"><surname>Sauerbrei</surname> <given-names>W</given-names></name>. <article-title>Comparison of variable selection procedures and investigation of the role of shrinkage in linear regression-protocol of a simulation study in low-dimensional data</article-title>. <source>PloS One</source>. <year>2022</year>;<volume>17</volume>(<issue>10</issue>):<fpage>e0271240</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1371/journal.pone.0271240" xlink:type="simple">10.1371/journal.pone.0271240</ext-link></comment> <object-id pub-id-type="pmid">36191290</object-id></mixed-citation>
</ref>
<ref id="pone.0308543.ref026">
<label>26</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Nosek</surname> <given-names>BA</given-names></name>, <name name-style="western"><surname>Ebersole</surname> <given-names>CR</given-names></name>, <name name-style="western"><surname>DeHaven</surname> <given-names>AC</given-names></name>, <name name-style="western"><surname>Mellor</surname> <given-names>DT</given-names></name>. <article-title>The preregistration revolution</article-title>. <source>Proceedings of the National Academy of Sciences</source>. <year>2018</year>;<volume>115</volume>(<issue>11</issue>):<fpage>2600</fpage>–<lpage>2606</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1073/pnas.1708274114" xlink:type="simple">10.1073/pnas.1708274114</ext-link></comment> <object-id pub-id-type="pmid">29531091</object-id></mixed-citation>
</ref>
<ref id="pone.0308543.ref027">
<label>27</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Pawel</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Kook</surname> <given-names>L</given-names></name>, <name name-style="western"><surname>Reeve</surname> <given-names>K</given-names></name>. <article-title>Pitfalls and potentials in simulation studies: Questionable research practices in comparative simulation studies allow for spurious claims of superiority of any method</article-title>. <source>Biometrical Journal</source>. <year>2023</year>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1002/bimj.202200091" xlink:type="simple">10.1002/bimj.202200091</ext-link></comment> <object-id pub-id-type="pmid">36890629</object-id></mixed-citation>
</ref>
<ref id="pone.0308543.ref028">
<label>28</label>
<mixed-citation publication-type="book" xlink:type="simple">
<name name-style="western"><surname>Royston</surname> <given-names>P</given-names></name>, <name name-style="western"><surname>Sauerbrei</surname> <given-names>W</given-names></name>. <source>Multivariable model-building: a pragmatic approach to regression analysis based on fractional polynomials for modelling continuous variables</source>. <publisher-name>John Wiley &amp; Sons</publisher-name>; <year>2008</year>.</mixed-citation>
</ref>
<ref id="pone.0308543.ref029">
<label>29</label>
<mixed-citation publication-type="other" xlink:type="simple">Cario MC, Nelson BL. Modeling and generating random vectors with arbitrary marginal distributions and correlation matrix. Department of Industrial Engineering and Management, Northwestern University; 1997.</mixed-citation>
</ref>
<ref id="pone.0308543.ref030">
<label>30</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Ghosh</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Henderson</surname> <given-names>SG</given-names></name>. <article-title>Behavior of the NORTA method for correlated random vector generation as the dimension increases</article-title>. <source>ACM Transactions on Modeling and Computer Simulation (TOMACS)</source>. <year>2003</year>;<volume>13</volume>(<issue>3</issue>):<fpage>276</fpage>–<lpage>294</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1145/937332.937336" xlink:type="simple">10.1145/937332.937336</ext-link></comment></mixed-citation>
</ref>
<ref id="pone.0308543.ref031">
<label>31</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>White</surname> <given-names>H</given-names></name>. <article-title>A heteroskedasticity-consistent covariance matrix estimator and a direct test for heteroskedasticity</article-title>. <source>Econometrica</source>. <year>1980</year>;<volume>48</volume>(<issue>4</issue>):<fpage>817</fpage>–<lpage>838</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.2307/1912934" xlink:type="simple">10.2307/1912934</ext-link></comment></mixed-citation>
</ref>
<ref id="pone.0308543.ref032">
<label>32</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Riley</surname> <given-names>RD</given-names></name>, <name name-style="western"><surname>Snell</surname> <given-names>KI</given-names></name>, <name name-style="western"><surname>Ensor</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Burke</surname> <given-names>DL</given-names></name>, <name name-style="western"><surname>Harrell</surname> <given-names>FE</given-names> <suffix>Jr</suffix></name>, <name name-style="western"><surname>Moons</surname> <given-names>KG</given-names></name>, <etal>et al</etal>. <article-title>Minimum sample size for developing a multivariable prediction model: Part I–Continuous outcomes</article-title>. <source>Statistics in Medicine</source>. <year>2019</year>;<volume>38</volume>(<issue>7</issue>):<fpage>1262</fpage>–<lpage>1275</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1002/sim.7992" xlink:type="simple">10.1002/sim.7992</ext-link></comment> <object-id pub-id-type="pmid">30347470</object-id></mixed-citation>
</ref>
<ref id="pone.0308543.ref033">
<label>33</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Riley</surname> <given-names>RD</given-names></name>, <name name-style="western"><surname>Snell</surname> <given-names>KI</given-names></name>, <name name-style="western"><surname>Ensor</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Burke</surname> <given-names>DL</given-names></name>, <name name-style="western"><surname>Harrell</surname> <given-names>FE</given-names> <suffix>Jr</suffix></name>, <name name-style="western"><surname>Moons</surname> <given-names>KG</given-names></name>, <etal>et al</etal>. <article-title>Minimum sample size for developing a multivariable prediction model: PART II-binary and time-to-event outcomes</article-title>. <source>Statistics in Medicine</source>. <year>2019</year>;<volume>38</volume>(<issue>7</issue>):<fpage>1276</fpage>–<lpage>1296</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1002/sim.7993" xlink:type="simple">10.1002/sim.7993</ext-link></comment> <object-id pub-id-type="pmid">30357870</object-id></mixed-citation>
</ref>
<ref id="pone.0308543.ref034">
<label>34</label>
<mixed-citation publication-type="book" xlink:type="simple">
<name name-style="western"><surname>Hosmer</surname> <given-names>DW</given-names> <suffix>Jr</suffix></name>, <name name-style="western"><surname>Lemeshow</surname> <given-names>S</given-names></name>. <source>Applied logistic regression</source>. <publisher-loc>New York</publisher-loc>: <publisher-name>John Wiley &amp; Sons</publisher-name>; <year>2000</year>.</mixed-citation>
</ref>
<ref id="pone.0308543.ref035">
<label>35</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Bursac</surname> <given-names>Z</given-names></name>, <name name-style="western"><surname>Gauss</surname> <given-names>CH</given-names></name>, <name name-style="western"><surname>Williams</surname> <given-names>DK</given-names></name>, <name name-style="western"><surname>Hosmer</surname> <given-names>DW</given-names></name>. <article-title>Purposeful selection of variables in logistic regression</article-title>. <source>Source Code for Biology and Medicine</source>. <year>2008</year>;<volume>3</volume>:<fpage>17</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1186/1751-0473-3-17" xlink:type="simple">10.1186/1751-0473-3-17</ext-link></comment> <object-id pub-id-type="pmid">19087314</object-id></mixed-citation>
</ref>
<ref id="pone.0308543.ref036">
<label>36</label>
<mixed-citation publication-type="other" xlink:type="simple">Blagus R. abe: Augmented Backward Elimination. R package version 5.1.1; 2022.</mixed-citation>
</ref>
<ref id="pone.0308543.ref037">
<label>37</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Sun</surname> <given-names>GW</given-names></name>, <name name-style="western"><surname>Shook</surname> <given-names>TL</given-names></name>, <name name-style="western"><surname>Kay</surname> <given-names>GL</given-names></name>. <article-title>Inappropriate use of bivariable analysis to screen risk factors for use in multivariable analysis</article-title>. <source>Journal of Clinical Epidemiology</source>. <year>1996</year>;<volume>49</volume>(<issue>8</issue>):<fpage>907</fpage>–<lpage>916</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1016/0895-4356(96)00025-X" xlink:type="simple">10.1016/0895-4356(96)00025-X</ext-link></comment> <object-id pub-id-type="pmid">8699212</object-id></mixed-citation>
</ref>
<ref id="pone.0308543.ref038">
<label>38</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Royston</surname> <given-names>P</given-names></name>, <name name-style="western"><surname>Moons</surname> <given-names>KG</given-names></name>, <name name-style="western"><surname>Altman</surname> <given-names>DG</given-names></name>, <name name-style="western"><surname>Vergouwe</surname> <given-names>Y</given-names></name>. <article-title>Prognosis and prognostic research: developing a prognostic model</article-title>. <source>BMJ</source>. <year>2009</year>;<volume>338</volume>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1136/bmj.b604" xlink:type="simple">10.1136/bmj.b604</ext-link></comment> <object-id pub-id-type="pmid">19336487</object-id></mixed-citation>
</ref>
<ref id="pone.0308543.ref039">
<label>39</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Sauerbrei</surname> <given-names>W</given-names></name>, <name name-style="western"><surname>Perperoglou</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Schmid</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Abrahamowicz</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Becher</surname> <given-names>H</given-names></name>, <name name-style="western"><surname>Binder</surname> <given-names>H</given-names></name>, <etal>et al</etal>. <article-title>State of the art in selection of variables and functional forms in multivariable analysis—outstanding issues</article-title>. <source>Diagnostic and Prognostic Research</source>. <year>2020</year>;<volume>4</volume>(<issue>3</issue>). <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1186/s41512-020-00074-3" xlink:type="simple">10.1186/s41512-020-00074-3</ext-link></comment> <object-id pub-id-type="pmid">32266321</object-id></mixed-citation>
</ref>
<ref id="pone.0308543.ref040">
<label>40</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Mallett</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Royston</surname> <given-names>P</given-names></name>, <name name-style="western"><surname>Dutton</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Waters</surname> <given-names>R</given-names></name>, <name name-style="western"><surname>Altman</surname> <given-names>DG</given-names></name>. <article-title>Reporting methods in studies developing prognostic models in cancer: a review</article-title>. <source>BMC Medicine</source>. <year>2010</year>;<volume>8</volume>:<fpage>20</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1186/1741-7015-8-20" xlink:type="simple">10.1186/1741-7015-8-20</ext-link></comment> <object-id pub-id-type="pmid">20353578</object-id></mixed-citation>
</ref>
<ref id="pone.0308543.ref041">
<label>41</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Friedman</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Tibshirani</surname> <given-names>R</given-names></name>, <name name-style="western"><surname>Hastie</surname> <given-names>T</given-names></name>. <article-title>Regularization Paths for Generalized Linear Models via Coordinate Descent</article-title>. <source>Journal of Statistical Software</source>. <year>2010</year>;<volume>33</volume>(<issue>1</issue>):<fpage>1</fpage>–<lpage>22</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.18637/jss.v033.i01" xlink:type="simple">10.18637/jss.v033.i01</ext-link></comment> <object-id pub-id-type="pmid">20808728</object-id></mixed-citation>
</ref>
<ref id="pone.0308543.ref042">
<label>42</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Heinze</surname> <given-names>G</given-names></name>, <name name-style="western"><surname>Schemper</surname> <given-names>M</given-names></name>. <article-title>A solution to the problem of separation in logistic regression</article-title>. <source>Statistics in Medicine</source>. <year>2002</year>;<volume>21</volume>(<issue>16</issue>):<fpage>2409</fpage>–<lpage>2419</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1002/sim.1047" xlink:type="simple">10.1002/sim.1047</ext-link></comment> <object-id pub-id-type="pmid">12210625</object-id></mixed-citation>
</ref>
<ref id="pone.0308543.ref043">
<label>43</label>
<mixed-citation publication-type="other" xlink:type="simple">Konis K. Linear programming algorithms for detecting separated data in binary logistic regression models [PhD thesis]. University of Oxford; 2007.</mixed-citation>
</ref>
<ref id="pone.0308543.ref044">
<label>44</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Firth</surname> <given-names>D</given-names></name>. <article-title>Bias reduction of maximum likelihood estimates</article-title>. <source>Biometrika</source>. <year>1993</year>;<volume>80</volume>(<issue>1</issue>):<fpage>27</fpage>–<lpage>38</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1093/biomet/80.1.27" xlink:type="simple">10.1093/biomet/80.1.27</ext-link></comment></mixed-citation>
</ref>
<ref id="pone.0308543.ref045">
<label>45</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Puhr</surname> <given-names>R</given-names></name>, <name name-style="western"><surname>Heinze</surname> <given-names>G</given-names></name>, <name name-style="western"><surname>Nold</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Lusa</surname> <given-names>L</given-names></name>, <name name-style="western"><surname>Geroldinger</surname> <given-names>A</given-names></name>. <article-title>Firth’s logistic regression with rare events: accurate effect estimates and predictions?</article-title> <source>Statistics in Medicine</source>. <year>2017</year>;<volume>36</volume>(<issue>14</issue>):<fpage>2302</fpage>–<lpage>2317</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1002/sim.7273" xlink:type="simple">10.1002/sim.7273</ext-link></comment> <object-id pub-id-type="pmid">28295456</object-id></mixed-citation>
</ref>
<ref id="pone.0308543.ref046">
<label>46</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Kendall</surname> <given-names>MG</given-names></name>. <article-title>The treatment of ties in ranking problems</article-title>. <source>Biometrika</source>. <year>1945</year>;<volume>33</volume>(<issue>3</issue>):<fpage>239</fpage>–<lpage>251</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1093/biomet/33.3.239" xlink:type="simple">10.1093/biomet/33.3.239</ext-link></comment> <object-id pub-id-type="pmid">21006841</object-id></mixed-citation>
</ref>
<ref id="pone.0308543.ref047">
<label>47</label>
<mixed-citation publication-type="journal" xlink:type="simple">
<name name-style="western"><surname>Austin</surname> <given-names>PC</given-names></name>, <name name-style="western"><surname>Steyerberg</surname> <given-names>EW</given-names></name>. <article-title>The Integrated Calibration Index (ICI) and related metrics for quantifying the calibration of logistic regression models</article-title>. <source>Statistics in Medicine</source>. <year>2019</year>;<volume>38</volume>(<issue>21</issue>):<fpage>4051</fpage>–<lpage>4065</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1002/sim.8281" xlink:type="simple">10.1002/sim.8281</ext-link></comment> <object-id pub-id-type="pmid">31270850</object-id></mixed-citation>
</ref>
<ref id="pone.0308543.ref048">
<label>48</label>
<mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Heinze</surname> <given-names>G</given-names></name>, <name name-style="western"><surname>Baillie</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Lusa</surname> <given-names>L</given-names></name>, <name name-style="western"><surname>Sauerbrei</surname> <given-names>W</given-names></name>, <name name-style="western"><surname>Schmidt</surname> <given-names>CO</given-names></name>, <name name-style="western"><surname>Harrell</surname> <given-names>FE</given-names></name>, <etal>et al</etal>. <source>Regression without regrets—initial data analysis is an essential prerequisite to multivariable regression</source>. <year>2023</year>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.21203/rs.3.rs-3580334/v1" xlink:type="simple">10.21203/rs.3.rs-3580334/v1</ext-link></comment></mixed-citation>
</ref>
</ref-list>
</back>
<sub-article article-type="aggregated-review-documents" id="pone.0308543.r001" specific-use="decision-letter">
<front-stub>
<article-id pub-id-type="doi">10.1371/journal.pone.0308543.r001</article-id>
<title-group>
<article-title>Decision Letter 0</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name name-style="western">
<surname>Tian</surname>
<given-names>Suyan</given-names>
</name>
<role>Academic Editor</role>
</contrib>
</contrib-group>
<permissions>
<copyright-year>2024</copyright-year>
<copyright-holder>Suyan Tian</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<license-p>This is an open access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">Creative Commons Attribution License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.</license-p>
</license>
</permissions>
<related-object document-id="10.1371/journal.pone.0308543" document-id-type="doi" document-type="article" id="rel-obj001" link-type="peer-reviewed-article"/>
<custom-meta-group>
<custom-meta>
<meta-name>Submission Version</meta-name>
<meta-value>0</meta-value>
</custom-meta>
</custom-meta-group>
</front-stub>
<body>
<p>
<named-content content-type="letter-date">16 Apr 2024</named-content>
</p>
<p><!-- <div> -->PONE-D-24-04044<!-- </div> --><!-- <div> -->Evaluating variable selection methods for multivariable regression models: a simulation study protocol<!-- </div> --><!-- <div> -->PLOS ONE</p>
<p>Dear Dr. Dunkler,</p>
<p>Thank you for submitting your manuscript to PLOS ONE. After careful consideration, we feel that it has merit but does not fully meet PLOS ONE’s publication criteria as it currently stands. Therefore, we invite you to submit a revised version of the manuscript that addresses the points raised during the review process.<!-- </div> --><!-- <div> --> <!-- </div> --><!-- <div> --><bold>As pointed out by the reviewer, the setting for the simulations is comparably simple compared to the real-world cases, please address this issue seriously.</bold><!-- </div> --><!-- <div> --><bold>Also, the authors mentioned when other relevant studies carried out a comparison between their proposed method and the existing methods, the number of the existing methods considered is very limited. However, the authors themselves have not carried out any comprehensive comparison with relevant methods. Please revise the corresponding paragraphs. </bold><!-- </div> --><!-- <div> --><bold>Lastly, when I was a student, my professor told us that the coefficients before g(x) are still called linear coefficients, but the authors named them as nonlinear ones. Please double check it. If I am correct, please revise the corresponding equations and sentences.</bold></p>
<p>Please submit your revised manuscript by May 31 2024 11:59PM. If you will need more time than this to complete your revisions, please reply to this message or contact the journal office at <email xlink:type="simple">plosone@plos.org</email>. When you're ready to submit your revision, log on to <ext-link ext-link-type="uri" xlink:href="https://www.editorialmanager.com/pone/" xlink:type="simple">https://www.editorialmanager.com/pone/</ext-link> and select the 'Submissions Needing Revision' folder to locate your manuscript file.</p>
<p>Please include the following items when submitting your revised manuscript:<!-- </div> --><list list-type="bullet"><list-item><p>A rebuttal letter that responds to each point raised by the academic editor and reviewer(s). You should upload this letter as a separate file labeled 'Response to Reviewers'.</p></list-item><list-item><p>A marked-up copy of your manuscript that highlights changes made to the original version. You should upload this as a separate file labeled 'Revised Manuscript with Track Changes'.</p></list-item><list-item><p>An unmarked version of your revised paper without tracked changes. You should upload this as a separate file labeled 'Manuscript'.</p></list-item></list></p>
<p>If you would like to make changes to your financial disclosure, please include your updated statement in your cover letter. Guidelines for resubmitting your figure files are available below the reviewer comments at the end of this letter.</p>
<p>If applicable, we recommend that you deposit your laboratory protocols in protocols.io to enhance the reproducibility of your results. Protocols.io assigns your protocol its own identifier (DOI) so that it can be cited independently in the future. For instructions see: <ext-link ext-link-type="uri" xlink:href="https://journals.plos.org/plosone/s/submission-guidelines#loc-laboratory-protocols" xlink:type="simple">https://journals.plos.org/plosone/s/submission-guidelines#loc-laboratory-protocols</ext-link>. Additionally, PLOS ONE offers an option for publishing peer-reviewed Lab Protocol articles, which describe protocols hosted on protocols.io. Read more information on sharing protocols at <ext-link ext-link-type="uri" xlink:href="https://plos.org/protocols?utm_medium=editorial-email&amp;utm_source=authorletters&amp;utm_campaign=protocols" xlink:type="simple">https://plos.org/protocols?utm_medium=editorial-email&amp;utm_source=authorletters&amp;utm_campaign=protocols</ext-link>.</p>
<p>We look forward to receiving your revised manuscript.</p>
<p>Kind regards,</p>
<p>Suyan Tian</p>
<p>Academic Editor</p>
<p>PLOS ONE</p>
<p>Journal Requirements:</p>
<p>1. When submitting your revision, we need you to address these additional requirements.</p>
<p>Please ensure that your manuscript meets PLOS ONE's style requirements, including those for file naming. The PLOS ONE style templates can be found at </p>
<p><ext-link ext-link-type="uri" xlink:href="https://journals.plos.org/plosone/s/file?id=wjVg/PLOSOne_formatting_sample_main_body.pdf" xlink:type="simple">https://journals.plos.org/plosone/s/file?id=wjVg/PLOSOne_formatting_sample_main_body.pdf</ext-link> and </p>
<p><ext-link ext-link-type="uri" xlink:href="https://journals.plos.org/plosone/s/file?id=ba62/PLOSOne_formatting_sample_title_authors_affiliations.pdf" xlink:type="simple">https://journals.plos.org/plosone/s/file?id=ba62/PLOSOne_formatting_sample_title_authors_affiliations.pdf</ext-link></p>
<p>2. One of the noted authors is a group or consortium TG2 of the STRATOS initiative. In addition to naming the author group, please list the individual authors and affiliations within this group in the acknowledgments section of your manuscript. Please also indicate clearly a lead author for this group along with a contact email address.</p>
<p>[Note: HTML markup is below. Please do not edit.]</p>
<p>Reviewers' comments:</p>
<p>Reviewer's Responses to Questions</p>
<p><!-- <font color="black"> --><bold>Comments to the Author</bold></p>
<p>1. Does the manuscript provide a valid rationale for the proposed study, with clearly identified and justified research questions?</p>
<p>The research question outlined is expected to address a valid academic problem or topic and contribute to the base of knowledge in the field.<!-- </font> --></p>
<p>Reviewer #1: No</p>
<p>**********</p>
<p><!-- <font color="black"> -->2. Is the protocol technically sound and planned in a manner that will lead to a meaningful outcome and allow testing the stated hypotheses?</p>
<p>The manuscript should describe the methods in sufficient detail to prevent undisclosed flexibility in the experimental procedure or analysis pipeline, including sufficient outcome-neutral conditions (e.g. necessary controls, absence of floor or ceiling effects) to test the proposed hypotheses and a statistical power analysis where applicable. As there may be aspects of the methodology and analysis which can only be refined once the work is undertaken, authors should outline potential assumptions and explicitly describe what aspects of the proposed analyses, if any, are exploratory.<!-- </font> --></p>
<p>Reviewer #1: No</p>
<p>**********</p>
<p><!-- <font color="black"> -->3. Is the methodology feasible and described in sufficient detail to allow the work to be replicable?</p>
<p>Descriptions of methods and materials in the protocol should be reported in sufficient detail for another researcher to reproduce all experiments and analyses. The protocol should describe the appropriate controls, sample size calculations, and replication needed to ensure that the data are robust and reproducible.<!-- </font> --></p>
<p>Reviewer #1: No</p>
<p>**********</p>
<p><!-- <font color="black"> -->4. Have the authors described where all data underlying the findings will be made available when the study is complete?</p>
<p>The <ext-link ext-link-type="uri" xlink:href="https://journals.plos.org/plosone/s/materials-and-software-sharing" xlink:type="simple">PLOS Data policy</ext-link> requires authors to make all data underlying the findings described in their manuscript fully available without restriction, with rare exception, at the time of publication. The data should be provided as part of the manuscript or its supporting information, or deposited to a public repository. For example, in addition to summary statistics, the data points behind means, medians and variance measures should be available. If there are restrictions on publicly sharing data—e.g. participant privacy or use of data from a third party—those must be specified.<!-- </font> --></p>
<p>Reviewer #1: Yes</p>
<p>**********</p>
<p><!-- <font color="black"> -->5. Is the manuscript presented in an intelligible fashion and written in standard English?</p>
<p>PLOS ONE does not copyedit accepted manuscripts, so the language in submitted articles must be clear, correct, and unambiguous. Any typographical or grammatical errors should be corrected at revision, so please note any specific errors here.<!-- </font> --></p>
<p>Reviewer #1: Yes</p>
<p>**********</p>
<p><!-- <font color="black"> -->6. Review Comments to the Author</p>
<p>Please use the space provided to explain your answers to the questions above and, if applicable, provide comments about issues authors must address before this protocol can be accepted for publication. You may also include additional comments for the author, including concerns about research or publication ethics.</p>
<p>You may also provide optional suggestions and comments to authors that they might find helpful in planning their study.</p>
<p>(Please upload your review as an attachment if it exceeds 20,000 characters)<!-- </font> --></p>
<p><bold>Reviewer #1:</bold> The authors consider variable selection in multivariate regression modeling from a simulation perspective. Although the topic is very important and interesting, I am afraid the contribution does not reflect the importance and novelty. I have some specific comments for further consideration:</p>
<p>1- Variable selection first came to notice in (ultra)high-dimensional (HD) cases. In the simulations, I do not see this is highlighted and thus the simulation cannot be reflective of real-world problems and is not practical. In the revision, the authors must exactly specify the relation between the sample size and dimension.</p>
<p>2- In this paper, I did not see any trace of screening. Why did the authors not consider selecting important variables based on the marginal correlation or conduct sure screening?</p>
<p>3- The problem of multicollinearity is not considered in variable selection which is absolutely important.</p>
<p>4- The measure R-squared is not a good measure of model fit in HD problems.</p>
<p>5- Form of explanatory variables does not play a substantial role; however, the dependent variable does. I do not see variant responses. Furthermore, in such studies, it is of essential need to consider additive structures in the predictive component. Further investigations for multivariate additive models are needed.</p>
<p>**********</p>
<p><!-- <font color="black"> -->7. PLOS authors have the option to publish the peer review history of their article (<ext-link ext-link-type="uri" xlink:href="https://journals.plos.org/plosone/s/editorial-and-peer-review-process#loc-peer-review-history" xlink:type="simple">what does this mean?</ext-link>). If published, this will include your full peer review and any attached files.</p>
<p>If you choose “no”, your identity will remain anonymous but your review may still be made public.</p>
<p><bold>Do you want your identity to be public for this peer review?</bold> For information about this choice, including consent withdrawal, please see our <ext-link ext-link-type="uri" xlink:href="https://www.plos.org/privacy-policy" xlink:type="simple">Privacy Policy</ext-link>.<!-- </font> --></p>
<p>Reviewer #1: No</p>
<p>**********</p>
<p>[NOTE: If reviewer comments were submitted as an attachment file, they will be attached to this email and accessible via the submission site. Please log into your account, locate the manuscript record, and check for the action link "View Attachments". If this link does not appear, there are no attachment files.]</p>
<p>While revising your submission, please upload your figure files to the Preflight Analysis and Conversion Engine (PACE) digital diagnostic tool, <ext-link ext-link-type="uri" xlink:href="https://pacev2.apexcovantage.com/" xlink:type="simple">https://pacev2.apexcovantage.com/</ext-link>. PACE helps ensure that figures meet PLOS requirements. To use PACE, you must first register as a user. Registration is free. Then, login and navigate to the UPLOAD tab, where you will find detailed instructions on how to use the tool. If you encounter any issues or have any questions when using PACE, please email PLOS at <email xlink:type="simple">figures@plos.org</email>. Please note that Supporting Information files do not need this step.</p>
</body>
</sub-article>
<sub-article article-type="author-comment" id="pone.0308543.r002">
<front-stub>
<article-id pub-id-type="doi">10.1371/journal.pone.0308543.r002</article-id>
<title-group>
<article-title>Author response to Decision Letter 0</article-title>
</title-group>
<related-object document-id="10.1371/journal.pone.0308543" document-id-type="doi" document-type="peer-reviewed-article" id="rel-obj002" link-type="rebutted-decision-letter" object-id="10.1371/journal.pone.0308543.r001" object-id-type="doi" object-type="decision-letter"/>
<custom-meta-group>
<custom-meta>
<meta-name>Submission Version</meta-name>
<meta-value>1</meta-value>
</custom-meta>
</custom-meta-group>
</front-stub>
<body>
<p>
<named-content content-type="author-response-date">29 May 2024</named-content>
</p>
<p>All comments of the editor and the reviewer are addressed in the PDF file "Response to Reviewers".</p>
<supplementary-material id="pone.0308543.s006" mimetype="application/pdf" position="float" xlink:href="info:doi/10.1371/journal.pone.0308543.s006" xlink:type="simple">
<label>Attachment</label>
<caption>
<p>Submitted filename: <named-content content-type="submitted-filename">Response to reviewers.pdf</named-content></p>
</caption>
</supplementary-material>
</body>
</sub-article>
<sub-article article-type="aggregated-review-documents" id="pone.0308543.r003" specific-use="decision-letter">
<front-stub>
<article-id pub-id-type="doi">10.1371/journal.pone.0308543.r003</article-id>
<title-group>
<article-title>Decision Letter 1</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name name-style="western">
<surname>Tian</surname>
<given-names>Suyan</given-names>
</name>
<role>Academic Editor</role>
</contrib>
</contrib-group>
<permissions>
<copyright-year>2024</copyright-year>
<copyright-holder>Suyan Tian</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<license-p>This is an open access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">Creative Commons Attribution License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.</license-p>
</license>
</permissions>
<related-object document-id="10.1371/journal.pone.0308543" document-id-type="doi" document-type="article" id="rel-obj003" link-type="peer-reviewed-article"/>
<custom-meta-group>
<custom-meta>
<meta-name>Submission Version</meta-name>
<meta-value>1</meta-value>
</custom-meta>
</custom-meta-group>
</front-stub>
<body>
<p>
<named-content content-type="letter-date">26 Jul 2024</named-content>
</p>
<p>Evaluating variable selection methods for multivariable regression models: a simulation study protocol</p>
<p>PONE-D-24-04044R1</p>
<p>Dear Dr. Dunkler,</p>
<p>We’re pleased to inform you that your manuscript has been judged scientifically suitable for publication and will be formally accepted for publication once it meets all outstanding technical requirements.</p>
<p>Within one week, you’ll receive an e-mail detailing the required amendments. When these have been addressed, you’ll receive a formal acceptance letter and your manuscript will be scheduled for publication.</p>
<p>An invoice will be generated when your article is formally accepted. Please note, if your institution has a publishing partnership with PLOS and your article meets the relevant criteria, all or part of your publication costs will be covered. Please make sure your user information is up-to-date by logging into Editorial Manager at <ext-link ext-link-type="uri" xlink:href="https://www.editorialmanager.com/pone/" xlink:type="simple">Editorial Manager®</ext-link> and clicking the ‘Update My Information' link at the top of the page. If you have any questions relating to publication charges, please contact our Author Billing department directly at <email xlink:type="simple">authorbilling@plos.org</email>.</p>
<p>If your institution or institutions have a press office, please notify them about your upcoming paper to help maximize its impact. If they’ll be preparing press materials, please inform our press team as soon as possible -- no later than 48 hours after receiving the formal acceptance. Your manuscript will remain under strict press embargo until 2 pm Eastern Time on the date of publication. For more information, please contact <email xlink:type="simple">onepress@plos.org</email>.</p>
<p>Kind regards,</p>
<p>Suyan Tian</p>
<p>Academic Editor</p>
<p>PLOS ONE</p>
<p>Additional Editor Comments (optional):</p>
<p>Reviewers' comments:</p>
<p>Reviewer's Responses to Questions</p>
<p><!-- <font color="black"> --><bold>Comments to the Author</bold></p>
<p>1. Does the manuscript provide a valid rationale for the proposed study, with clearly identified and justified research questions?</p>
<p>The research question outlined is expected to address a valid academic problem or topic and contribute to the base of knowledge in the field.<!-- </font> --></p>
<p>Reviewer #1: Yes</p>
<p>**********</p>
<p><!-- <font color="black"> -->2. Is the protocol technically sound and planned in a manner that will lead to a meaningful outcome and allow testing the stated hypotheses?</p>
<p>The manuscript should describe the methods in sufficient detail to prevent undisclosed flexibility in the experimental procedure or analysis pipeline, including sufficient outcome-neutral conditions (e.g. necessary controls, absence of floor or ceiling effects) to test the proposed hypotheses and a statistical power analysis where applicable. As there may be aspects of the methodology and analysis which can only be refined once the work is undertaken, authors should outline potential assumptions and explicitly describe what aspects of the proposed analyses, if any, are exploratory.<!-- </font> --></p>
<p>Reviewer #1: Yes</p>
<p>**********</p>
<p><!-- <font color="black"> -->3. Is the methodology feasible and described in sufficient detail to allow the work to be replicable?</p>
<p>Descriptions of methods and materials in the protocol should be reported in sufficient detail for another researcher to reproduce all experiments and analyses. The protocol should describe the appropriate controls, sample size calculations, and replication needed to ensure that the data are robust and reproducible.<!-- </font> --></p>
<p>Reviewer #1: Yes</p>
<p>**********</p>
<p><!-- <font color="black"> -->4. Have the authors described where all data underlying the findings will be made available when the study is complete?</p>
<p>The <ext-link ext-link-type="uri" xlink:href="https://journals.plos.org/plosone/s/materials-and-software-sharing" xlink:type="simple">PLOS Data policy</ext-link> requires authors to make all data underlying the findings described in their manuscript fully available without restriction, with rare exception, at the time of publication. The data should be provided as part of the manuscript or its supporting information, or deposited to a public repository. For example, in addition to summary statistics, the data points behind means, medians and variance measures should be available. If there are restrictions on publicly sharing data—e.g. participant privacy or use of data from a third party—those must be specified.<!-- </font> --></p>
<p>Reviewer #1: No</p>
<p>**********</p>
<p><!-- <font color="black"> -->5. Is the manuscript presented in an intelligible fashion and written in standard English?</p>
<p>PLOS ONE does not copyedit accepted manuscripts, so the language in submitted articles must be clear, correct, and unambiguous. Any typographical or grammatical errors should be corrected at revision, so please note any specific errors here.<!-- </font> --></p>
<p>Reviewer #1: Yes</p>
<p>**********</p>
<p><!-- <font color="black"> -->6. Review Comments to the Author</p>
<p>Please use the space provided to explain your answers to the questions above and, if applicable, provide comments about issues authors must address before this protocol can be accepted for publication. You may also include additional comments for the author, including concerns about research or publication ethics.</p>
<p>You may also provide optional suggestions and comments to authors that they might find helpful in planning their study.</p>
<p>(Please upload your review as an attachment if it exceeds 20,000 characters)<!-- </font> --></p>
<p>Reviewer #1: I am happy with the revised version. The authors have clearly stated the underlying design of their simulation.</p>
<p>**********</p>
<p><!-- <font color="black"> -->7. PLOS authors have the option to publish the peer review history of their article (<ext-link ext-link-type="uri" xlink:href="https://journals.plos.org/plosone/s/editorial-and-peer-review-process#loc-peer-review-history" xlink:type="simple">what does this mean?</ext-link>). If published, this will include your full peer review and any attached files.</p>
<p>If you choose “no”, your identity will remain anonymous but your review may still be made public.</p>
<p><bold>Do you want your identity to be public for this peer review?</bold> For information about this choice, including consent withdrawal, please see our <ext-link ext-link-type="uri" xlink:href="https://www.plos.org/privacy-policy" xlink:type="simple">Privacy Policy</ext-link>.<!-- </font> --></p>
<p>Reviewer #1: No</p>
<p>**********</p>
</body>
</sub-article>
<sub-article article-type="editor-report" id="pone.0308543.r004" specific-use="acceptance-letter">
<front-stub>
<article-id pub-id-type="doi">10.1371/journal.pone.0308543.r004</article-id>
<title-group>
<article-title>Acceptance letter</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name name-style="western">
<surname>Tian</surname>
<given-names>Suyan</given-names>
</name>
<role>Academic Editor</role>
</contrib>
</contrib-group>
<permissions>
<copyright-year>2024</copyright-year>
<copyright-holder>Suyan Tian</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<license-p>This is an open access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">Creative Commons Attribution License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.</license-p>
</license>
</permissions>
<related-object document-id="10.1371/journal.pone.0308543" document-id-type="doi" document-type="article" id="rel-obj004" link-type="peer-reviewed-article"/>
</front-stub>
<body>
<p>
<named-content content-type="letter-date">31 Jul 2024</named-content>
</p>
<p>PONE-D-24-04044R1 </p>
<p>PLOS ONE</p>
<p>Dear Dr.  Dunkler, </p>
<p>I'm pleased to inform you that your manuscript has been deemed suitable for publication in PLOS ONE. Congratulations! Your manuscript is now being handed over to our production team.</p>
<p>At this stage, our production department will prepare your paper for publication. This includes ensuring the following:</p>
<p>* All references, tables, and figures are properly cited</p>
<p>* All relevant supporting information is included in the manuscript submission,</p>
<p>* There are no issues that prevent the paper from being properly typeset</p>
<p>If revisions are needed, the production department will contact you directly to resolve them. If no revisions are needed, you will receive an email when the publication date has been set. At this time, we do not offer pre-publication proofs to authors during production of the accepted work. Please keep in mind that we are working through a large volume of accepted articles, so please give us a few weeks to review your paper and let you know the next and final steps. </p>
<p>Lastly, if your institution or institutions have a press office, please let them know about your upcoming paper now to help maximize its impact. If they'll be preparing press materials, please inform our press team within the next 48 hours. Your manuscript will remain under strict press embargo until 2 pm Eastern Time on the date of publication. For more information, please contact <email xlink:type="simple">onepress@plos.org</email>.</p>
<p>If we can help with anything else, please email us at <email xlink:type="simple">customercare@plos.org</email>.</p>
<p>Thank you for submitting your work to PLOS ONE and supporting open access. </p>
<p>Kind regards, </p>
<p>PLOS ONE Editorial Office Staff</p>
<p>on behalf of</p>
<p>Dr. Suyan Tian </p>
<p>Academic Editor</p>
<p>PLOS ONE</p>
</body>
</sub-article>
</article>