<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1d3 20150301//EN" "http://jats.nlm.nih.gov/publishing/1.1d3/JATS-journalpublishing1.dtd">
<article article-type="research-article" dtd-version="1.1d3" xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="nlm-ta">PLoS ONE</journal-id>
<journal-id journal-id-type="publisher-id">plos</journal-id>
<journal-id journal-id-type="pmc">plosone</journal-id>
<journal-title-group>
<journal-title>PLOS ONE</journal-title>
</journal-title-group>
<issn pub-type="epub">1932-6203</issn>
<publisher>
<publisher-name>Public Library of Science</publisher-name>
<publisher-loc>San Francisco, CA USA</publisher-loc>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.1371/journal.pone.0302741</article-id>
<article-id pub-id-type="publisher-id">PONE-D-24-02193</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Research Article</subject>
</subj-group>
<subj-group subj-group-type="Discipline-v3">
<subject>Physical sciences</subject><subj-group><subject>Mathematics</subject><subj-group><subject>Applied mathematics</subject><subj-group><subject>Algorithms</subject><subj-group><subject>Clustering algorithms</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Research and analysis methods</subject><subj-group><subject>Simulation and modeling</subject><subj-group><subject>Algorithms</subject><subj-group><subject>Clustering algorithms</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Medicine and health sciences</subject><subj-group><subject>Health care</subject><subj-group><subject>Geriatric care</subject></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Physical sciences</subject><subj-group><subject>Mathematics</subject><subj-group><subject>Applied mathematics</subject><subj-group><subject>Algorithms</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Research and analysis methods</subject><subj-group><subject>Simulation and modeling</subject><subj-group><subject>Algorithms</subject></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Medicine and health sciences</subject><subj-group><subject>Sports and exercise medicine</subject></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Sports science</subject><subj-group><subject>Sports and exercise medicine</subject></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Computer and information sciences</subject><subj-group><subject>Data management</subject></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Biology and life sciences</subject><subj-group><subject>Evolutionary biology</subject><subj-group><subject>Evolutionary processes</subject><subj-group><subject>Convergent evolution</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Physical sciences</subject><subj-group><subject>Mathematics</subject><subj-group><subject>Optimization</subject></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Physical sciences</subject><subj-group><subject>Mathematics</subject><subj-group><subject>Probability theory</subject><subj-group><subject>Random variables</subject><subj-group><subject>Covariance</subject></subj-group></subj-group></subj-group></subj-group></subj-group></article-categories>
<title-group>
<article-title>Design of health information management model for elderly care using an advanced higher-order hybrid clustering algorithm from the perspective of sports and medicine integration</article-title>
<alt-title alt-title-type="running-head">Design of health information management model</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">https://orcid.org/0009-0007-7521-5275</contrib-id>
<name name-style="western">
<surname>Zhao</surname>
<given-names>Ning</given-names>
</name>
<role content-type="http://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role content-type="http://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-original-draft/">Writing – original draft</role>
<xref ref-type="aff" rid="aff001"><sup>1</sup></xref>
<xref ref-type="corresp" rid="cor001">*</xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<name name-style="western">
<surname>Zhao</surname>
<given-names>Wenkai</given-names>
</name>
<role content-type="http://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role content-type="http://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role content-type="http://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<xref ref-type="aff" rid="aff002"><sup>2</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<name name-style="western">
<surname>Tang</surname>
<given-names>Xiaoliang</given-names>
</name>
<role content-type="http://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role content-type="http://credit.niso.org/contributor-roles/resources/">Resources</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-review-editing/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"><sup>1</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<name name-style="western">
<surname>Jiao</surname>
<given-names>Chuanming</given-names>
</name>
<role content-type="http://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role content-type="http://credit.niso.org/contributor-roles/validation/">Validation</role>
<xref ref-type="aff" rid="aff001"><sup>1</sup></xref>
</contrib>
<contrib contrib-type="author" xlink:type="simple">
<name name-style="western">
<surname>Zhang</surname>
<given-names>Zhong</given-names>
</name>
<role content-type="http://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role content-type="http://credit.niso.org/contributor-roles/validation/">Validation</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-review-editing/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"><sup>1</sup></xref>
</contrib>
</contrib-group>
<aff id="aff001"><label>1</label> <addr-line>Physical Education Department, Qiqihar Medical University, Qiqihar, Heilongjiang, China</addr-line></aff>
<aff id="aff002"><label>2</label> <addr-line>The Third Affiliated Hospital of Qiqihar Medical University, Qiqihar, Heilongjiang, China</addr-line></aff>
<contrib-group>
<contrib contrib-type="editor" xlink:type="simple">
<name name-style="western">
<surname>Ghadiri Nejad</surname>
<given-names>Mazyar</given-names>
</name>
<role>Editor</role>
<xref ref-type="aff" rid="edit1"/>
</contrib>
</contrib-group>
<aff id="edit1"><addr-line>Cyprus International University Faculty of Engineering: Uluslararasi Kibris Universitesi Muhendislik Fakultesi, TURKEY</addr-line></aff>
<author-notes>
<fn fn-type="conflict" id="coi001">
<p>The authors have declared that no competing interests exist.</p>
</fn>
<corresp id="cor001">* E-mail: <email xlink:type="simple">zhaoning1026@qmu.edu.cn</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>17</day>
<month>5</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>19</volume>
<issue>5</issue>
<elocation-id>e0302741</elocation-id>
<history>
<date date-type="received">
<day>17</day>
<month>1</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>11</day>
<month>4</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-year>2024</copyright-year>
<copyright-holder>Zhao et al</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">
<license-p>This is an open access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">Creative Commons Attribution License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.</license-p>
</license>
</permissions>
<self-uri content-type="pdf" xlink:href="info:doi/10.1371/journal.pone.0302741"/>
<abstract>
<p>In the context of integrating the sports and medicine domains, the urgent problem of elderly health supervision requires effective data clustering algorithms. This paper introduces a novel higher-order hybrid clustering algorithm that combines density values and the particle swarm optimization (PSO) algorithm. Initially, the traditional PSO algorithm is enhanced by integrating the Global Evolution Dynamic Model (GEDM) of the Estimation of Distribution Algorithm (EDA) into it, constructing a weighted covariance matrix-based GEDM. This adapted PSO algorithm dynamically selects between the Global Evolution Dynamic Model and the standard PSO algorithm to update population information, significantly enhancing convergence speed while mitigating the risk of local optima entrapment. Subsequently, the higher-order hybrid clustering algorithm is formulated based on the density value and the refined PSO algorithm. The PSO clustering algorithm is adopted in the initial clustering phase, culminating in class clusters after a finite number of iterations. These clusters then undergo the application of the density peak search algorithm to identify candidate centroids. The final centroids are determined through a fusion of the initial class clusters and the identified candidate centroids. Results showcase remarkable improvements: the proposed algorithm achieves 99.13%, 82.22%, and 99.22% for F-measure, recall, and precision on dataset S1, and 75.22%, 64.0%, and 64.4% on dataset CMC. Notably, the proposed algorithm yields a 75.22%, 64.4%, and 64.6% rate on dataset S, significantly surpassing the comparative schemes’ performance. Moreover, employing the text vector representation of the Latent Dirichlet Allocation (LDA) topic vector model underscores the efficacy of the higher-order hybrid clustering algorithm in efficiently clustering text information. This innovative approach facilitates swift and accurate clustering of elderly health data from the perspective of sports and medicine integration. 
It enables the identification of patterns and regularities within the data, facilitating the formulation of personalized health management strategies and addressing latent health concerns among the elderly population.</p>
</abstract>
<funding-group>
<award-group id="award001">
<funding-source>
<institution>Research and Planning Project of Philosophy and Social Sciences in Qiqihar —— Research on the Implementation Path of Community Elderly Health Service in Qiqihar from the Perspective of "Sports Integration"</institution>
</funding-source>
<award-id>QSX2023-24YB</award-id>
</award-group>
<funding-statement>This work was supported by the Research and Planning Project of Philosophy and Social Sciences in Qiqihar —— Research on the Implementation Path of Community Elderly Health Service in Qiqihar from the Perspective of "Sports Integration" (General Project). The project number is QSX2023-24YB.</funding-statement>
</funding-group>
<counts>
<fig-count count="8"/>
<table-count count="1"/>
<page-count count="16"/>
</counts>
<custom-meta-group>
<custom-meta id="data-availability">
<meta-name>Data Availability</meta-name>
<meta-value>The dataset address and DOI provided in this paper are as follows: <ext-link ext-link-type="uri" xlink:href="https://zenodo.org/records/4341443" xlink:type="simple">https://zenodo.org/records/4341443</ext-link> doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.5281/zenodo.4341443" xlink:type="simple">10.5281/zenodo.4341443</ext-link>.</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="sec001" sec-type="intro">
<title>Introduction</title>
<p>In the contemporary landscape, health emerges as a pivotal facet in the quest for an improved quality of life. Nevertheless, the escalating prominence of health issues stemming from inadequate physical activity poses a substantial menace to well-being. Of particular concern is the surge in chronic ailments among the elderly, burdening healthcare services and precipitating a public health crisis. To combat these challenges, the government has initiated a strategy that amalgamates sports and medicine—melding medical expertise with sports science—to employ exercise and fitness as preventative and therapeutic measures against diseases.</p>
<p>The overarching objective of this integration is to optimize both physical and mental well-being by leveraging exercise, sports, and fitness to enhance health and stave off diseases. This approach enables the real-time assessment of health metrics during physical activity, facilitating the formulation of tailored exercise regimens for the elderly. These programs aim to bolster physical health, mitigate disease risks, and elevate their overall quality of life. As data volumes burgeon, manual management proves increasingly inefficient. Artificial intelligence (AI) methodologies offer a solution by enabling efficient and precise analysis, processing, and prognostication of data. Leveraging advanced sensor, computer, and internet technologies holds the promise of elevating the intelligence quotient of community medical and health services. The focal point of ongoing research revolves around utilizing body-health integration data to cater to the specific medical and health needs of the elderly.</p>
<p>Recent advancements in scholarly research have revealed breakthroughs in clustering algorithms [<xref ref-type="bibr" rid="pone.0302741.ref001">1</xref>], particularly in the domain of unsupervised learning. These algorithms adeptly handle diverse data types—continuous, discrete, ordered, or nominal—alongside large datasets. Automating the determination of optimal cluster numbers and outcomes eliminates the need for human intervention. This ensures that similar entities or data points coalesce, promoting intra-group homogeneity and inter-group heterogeneity, thereby enhancing the efficient management of elderly health data. Concurrently, researchers have engaged in sensor-based somatosensory data collection, expanding the technological toolkit for health data management within the body-medicine integration paradigm. However, challenges persist in elderly health data management using clustering algorithms within the body-medicine nexus. Notably, the sensitivity of clustering algorithms to initial centroid selection poses a challenge, potentially leading to suboptimal clustering outcomes. Additionally, as unsupervised learning methods, clustering algorithms often involve non-convex functions with multiple local extrema in their objective functions, increasing susceptibility to converging on local optima [<xref ref-type="bibr" rid="pone.0302741.ref002">2</xref>].</p>
<p>To address these concerns, the Particle Swarm Optimization (PSO) algorithm [<xref ref-type="bibr" rid="pone.0302741.ref003">3</xref>] emerges as a powerful global optimization technique, mimicking natural bird foraging to navigate complex data spaces. Recent applications underscore its effectiveness in resolving clustering predicaments. Nevertheless, inherent in existing PSO algorithms is the risk of converging toward local optima, necessitating further exploration and refinement.</p>
<p>This paper tackles the aforementioned challenges by optimizing the PSO algorithm, treating clustering as an optimization problem. Utilizing an intelligent optimization algorithm enhances the stability of the clustering process. To this end, a higher-order hybrid clustering algorithm is proposed to design a comprehensive health information management model for elderly individuals within the realm of sports and medicine integration. The primary contributions of this study are outlined below:</p>
<list list-type="order">
<list-item><p>Enhancement of the PSO algorithm: Introducing a Weighted Covariance Matrix-based Global Evolution Dynamic Model (GEDM) refines the depiction of population distribution by employing weighted covariance calculations. Strategic utilization of GEDM or the conventional PSO algorithm at distinct stages strengthens algorithmic adaptability and resilience, augmenting convergence speed and preventing entrapment in local optima.</p></list-item>
<list-item><p>Development of a higher-order hybrid clustering algorithm: This approach integrates the density peak algorithm with the PSO clustering algorithm. Initially, the PSO clustering algorithm iteratively forms class clusters, followed by applying the density peak search algorithm to ascertain candidate centroids based on data attributes. The final centroids are derived by amalgamating the initial class clusters with the identified candidate centroids.</p></list-item>
<list-item><p>Efficient clustering of elderly health text information using the higher-order hybrid clustering algorithm with the LDA topic vector model: Employing this algorithm for text clustering on a synthetic dataset, utilizing three distinct text vector formation methods—Vector Space Model (VSM), Doc2vec, and LDA Topic Model—underscores the efficacy of the LDA Topic Model-based approach for optimal text clustering within elderly health data analysis.</p></list-item>
</list>
<p>The remainder of this paper is organized as follows. Section 2 reviews the state of the art in clustering algorithms and the current applications of PSO algorithms. Section 3 presents the improved PSO algorithm constructed in this paper and the higher-order hybrid clustering algorithm based on density values and the improved PSO algorithm. Section 4 describes the experimental results and discusses the performance of the scheme, comparing it with classical schemes and conducting ablation experiments to explore the role of each module of the model; it also explores the clustering of text data by the proposed higher-order hybrid clustering algorithm under different text vector formation methods, with the aim of improving the management of elderly health data in the context of sports and medicine integration. Section 5 concludes by summarizing the performance of the proposed higher-order hybrid clustering algorithm and its application to elderly health data management.</p>
</sec>
<sec id="sec002">
<title>Related works</title>
<sec id="sec003">
<title>Hybrid clustering algorithm</title>
<p>The K-Means algorithm, well-known for its simplicity, has limitations that impede its ability to produce flawless clustering results [<xref ref-type="bibr" rid="pone.0302741.ref004">4</xref>]. In response, literature [<xref ref-type="bibr" rid="pone.0302741.ref005">5</xref>] introduced the Fuzzy C-Means (FCM) algorithm, allowing points to belong to multiple clusters, a departure from K-Means’ strict partitioning. However, sensitivity to initial values persisted, leading to subsequent enhancements. The peak method proposed in literature [<xref ref-type="bibr" rid="pone.0302741.ref006">6</xref>] was employed to estimate cluster centers for initial partitioning, while literature [<xref ref-type="bibr" rid="pone.0302741.ref007">7</xref>] advocated adjusting proximity distances to fortify FCM against outliers. Further advancements aimed to enhance clustering algorithms’ efficacy. Literature [<xref ref-type="bibr" rid="pone.0302741.ref008">8</xref>] presented a probabilistic perspective on clustering, extracting data objects from specific probability distributions and assuming the overall data distribution as a blend of several distributions. Similarly, literature [<xref ref-type="bibr" rid="pone.0302741.ref009">9</xref>] showcased Density-Based Spatial Clustering of Applications with Noise (DBSCAN), a density-based algorithm capable of identifying arbitrarily shaped clusters without constraints, demonstrating robust performance on sizable datasets. Building upon DBSCAN, literature [<xref ref-type="bibr" rid="pone.0302741.ref010">10</xref>] introduced density peak clustering, which excels in discerning non-spherical clusters and accurately delineating intricate data distributions.</p>
<p>These algorithms face challenges related to initial clustering center selection and the tendency to converge toward local optima, potentially hindering the attainment of globally optimal clustering outcomes. Recent research has focused on leveraging PSO-based data clustering to devise hybrid clustering algorithms, combining PSO with conventional methods to elevate clustering efficacy [<xref ref-type="bibr" rid="pone.0302741.ref011">11</xref>]. These PSO-based algorithms dynamically fine-tune parameters like inertia weights, using population fitness variance to ascertain the convergence timing of PSO and K-Means algorithms [<xref ref-type="bibr" rid="pone.0302741.ref012">12</xref>]. Additionally, they incorporate real-time monitoring of optimal values within individual particles and particle clusters, promptly executing mutation operations on prematurely converging particles to explore globally optimal initial clustering centers for K-Means algorithms.</p>
<p>Hybrid clustering algorithms provide a more comprehensive understanding of data, combining the global search and optimization capabilities of PSO with the expertise of traditional clustering algorithms tailored to specific data structures and features. Literature [<xref ref-type="bibr" rid="pone.0302741.ref013">13</xref>] introduces the PSO-K-means algorithm, which initializes a particle in the particle swarm using clustering results derived from K-Means. Alternatively, literature [<xref ref-type="bibr" rid="pone.0302741.ref014">14</xref>] presents a hybrid algorithm that combines PSO and K-Means, utilizing PSO for global search at the optimization’s outset and leveraging K-Means for accelerated convergence near the optimal solution. In addressing nonlinear division clustering challenges, literature [<xref ref-type="bibr" rid="pone.0302741.ref015">15</xref>] pioneers a hybrid clustering algorithm that integrates fuzzy adaptive PSO and K-Means methodologies.</p>
</sec>
<sec id="sec004">
<title>PSO optimization model</title>
<p>To enhance the effectiveness of hybrid clustering algorithms, researchers have worked to refine both clustering and PSO methodologies. Literature [<xref ref-type="bibr" rid="pone.0302741.ref016">16</xref>] introduces the Evolutionary Particle Swarm Optimization (EPSO) algorithm, rooted in particle swarm evolution. This approach initiates particles uniformly across the input data space, with subsequent generations dynamically adjusting to pursue optimal positions. However, relying on inter-particle information exchange limits its ability to break free from local optima, curtailing performance on intricate problems and constraining global search capability.</p>
<p>In pursuit of reduced computational complexity, literature [<xref ref-type="bibr" rid="pone.0302741.ref017">17</xref>] proposes an enhanced Particle Swarm Clustering (PSC) algorithm, termed mPSC, aiming to streamline PSC for improved efficiency with large-scale datasets. Despite simplifying computational processes and reducing manual input parameters, mPSC faces challenges related to dataset characteristics and initial parameter choices. These challenges result in susceptibility to local optima, diminished convergence efficiency, and compromised clustering accuracy, highlighting the need for more comprehensive optimization. Literature [<xref ref-type="bibr" rid="pone.0302741.ref018">18</xref>] introduces the PSC-RCE algorithm, which incorporates Rapid Centroid Estimation (RCE) [<xref ref-type="bibr" rid="pone.0302741.ref019">19</xref>] to simplify PSC’s update rules and thereby streamline the clustering process. While PSO-based clustering mitigates the risk of local optima, challenges persist. The update mechanism in PSO algorithms, reliant on historical optimal solutions, leads to diminished convergence efficiency during initial search stages and a subsequent decline in population diversity. This inefficiency poses a substantial challenge, particularly in scenarios involving large-scale datasets and stringent real-time requirements, due to prolonged convergence times and heightened computational resource consumption.</p>
<p>The PSO algorithm encounters limitations when handling high-dimensional, intricately shaped, and noisy data, often yielding clustering outcomes lacking accuracy and stability. These limitations directly impact the algorithm’s reliability and practical utility, constraining its applicability in real-world scenarios. Consequently, this paper aims to enhance the PSO methodology by integrating it with density algorithms, forging a higher-order hybrid clustering algorithm. The objective is to craft a health data management model tailored for the elderly within the framework of sports and medicine integration.</p>
</sec>
<sec id="sec005">
<title>Model design</title>
<p>The model diagram in <xref ref-type="fig" rid="pone.0302741.g001">Fig 1</xref> illustrates the framework for elderly health information management based on the higher-order hybrid clustering algorithm developed in this paper. Our approach involves refining the PSO algorithm and integrating density values to construct an efficient higher-order hybrid clustering algorithm. This algorithm processes sensor-collected data, representing it in vector form, and concludes with the effective clustering management of elderly health data.</p>
<fig id="pone.0302741.g001" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0302741.g001</object-id>
<label>Fig 1</label>
<caption>
<title>Model frame drawing.</title>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0302741.g001" xlink:type="simple"/>
</fig>
</sec>
<sec id="sec006">
<title>Optimization of PSO</title>
<p>Within the n-dimensional search space of the PSO algorithm, each particle is characterized by two vectors: the position vector (<italic>P</italic>) and the velocity vector (<italic>V</italic>). Initially, every particle possesses randomized initial velocity and position values. The particle’s position denotes a potential solution within the problem space. Throughout the search phase, the PSO algorithm iteratively updates particle information. Let the individual history optimal solution <italic>pbest</italic><sub><italic>i</italic></sub> for particle i and the global history optimal solution <italic>gbest</italic><sub><italic>i</italic></sub> for the whole population be represented as follows:
<disp-formula id="pone.0302741.e001">
<alternatives>
<graphic id="pone.0302741.e001g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0302741.e001" xlink:type="simple"/>
<mml:math display="block" id="M1">
<mml:mrow><mml:mi>p</mml:mi><mml:mi>b</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi><mml:msub><mml:mi>t</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mo stretchy="false">[</mml:mo><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>…</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">]</mml:mo></mml:mrow>
</mml:math>
</alternatives>
<label>(1)</label>
</disp-formula>
<disp-formula id="pone.0302741.e002">
<alternatives>
<graphic id="pone.0302741.e002g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0302741.e002" xlink:type="simple"/>
<mml:math display="block" id="M2">
<mml:mrow><mml:mi>g</mml:mi><mml:mi>b</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi><mml:msub><mml:mi>t</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mo stretchy="false">[</mml:mo><mml:msub><mml:mi>g</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>g</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>…</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>g</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">]</mml:mo></mml:mrow>
</mml:math>
</alternatives>
<label>(2)</label>
</disp-formula></p>
<p>When a particle moves in the search space, the velocity <italic>v</italic><sub><italic>id</italic></sub> and position <italic>x</italic><sub><italic>id</italic></sub> of each particle <italic>i</italic> are updated according to Formulas (<xref ref-type="disp-formula" rid="pone.0302741.e003">3</xref>) and (<xref ref-type="disp-formula" rid="pone.0302741.e004">4</xref>).</p>
<disp-formula id="pone.0302741.e003">
<alternatives>
<graphic id="pone.0302741.e003g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0302741.e003" xlink:type="simple"/>
<mml:math display="block" id="M3">
<mml:mrow><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mi>w</mml:mi><mml:mi>*</mml:mi><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:mi>c</mml:mi><mml:mn>1</mml:mn><mml:mi>*</mml:mi><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>d</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>p</mml:mi><mml:mi>b</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi><mml:msub><mml:mi>t</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo>−</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:mi>c</mml:mi><mml:mn>2</mml:mn><mml:mi>*</mml:mi><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>d</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mi>*</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>g</mml:mi><mml:mi>b</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi><mml:msub><mml:mi>t</mml:mi><mml:mi>d</mml:mi></mml:msub><mml:mo>−</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow>
</mml:math>
</alternatives>
<label>(3)</label>
</disp-formula>
<disp-formula id="pone.0302741.e004">
<alternatives>
<graphic id="pone.0302741.e004g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0302741.e004" xlink:type="simple"/>
<mml:math display="block" id="M4">
<mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:msub><mml:mi>v</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:mrow>
</mml:math>
</alternatives>
<label>(4)</label>
</disp-formula>
<p>Where <italic>w</italic> denotes the inertia weights, <italic>c</italic>1 and <italic>c</italic>2 denote the acceleration constants, <italic>v</italic><sub><italic>id</italic></sub>(<italic>t</italic>) denotes the velocity of the ith particle in the dth dimension at the t-th iteration, and rand1 and rand2 are random numbers in the range [0,1]. It is established that the efficacy of PSO is heightened as the inertia weights progressively decrease in tandem with the number of iterations. The subsequent representation illustrates the linear descent of inertia weights:
<disp-formula id="pone.0302741.e005">
<alternatives>
<graphic id="pone.0302741.e005g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0302741.e005" xlink:type="simple"/>
<mml:math display="block" id="M5">
<mml:mrow><mml:mi>w</mml:mi><mml:mo>=</mml:mo><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mi mathvariant="normal">max</mml:mi></mml:mrow></mml:msub><mml:mo>−</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mi mathvariant="normal">max</mml:mi></mml:mrow></mml:msub><mml:mo>−</mml:mo><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mi mathvariant="normal">min</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mi>*</mml:mi><mml:mfrac><mml:mi>t</mml:mi><mml:mrow><mml:msub><mml:mi>t</mml:mi><mml:mrow><mml:mi mathvariant="normal">max</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:mrow>
</mml:math>
</alternatives>
<label>(5)</label>
</disp-formula></p>
<p>To further enhance the convergence efficacy of the PSO algorithm and mitigate local optima, our investigation reveals EDA [<xref ref-type="bibr" rid="pone.0302741.ref020">20</xref>] as a robust evolutionary algorithm. EDA estimates population evolution through probabilistic model sampling and learning, exhibiting exceptional performance in addressing intricate optimization problems. Within this framework, GEDM [<xref ref-type="bibr" rid="pone.0302741.ref021">21</xref>] assumes a pivotal role as a core constituent of EDA. Our decision was to integrate GEDM into the PSO algorithm, depicted in <xref ref-type="fig" rid="pone.0302741.g002">Fig 2</xref>. This integration involves constructing a GEDM model utilizing a weighted covariance matrix. This approach enables a more precise depiction of the population’s distribution through weighted covariance matrix calculations, thereby furnishing the algorithm with an enhanced estimation of the evolutionary trajectory. The computational methodology for the GEDM model based on the weighted covariance matrix is delineated below:
<disp-formula id="pone.0302741.e006">
<alternatives>
<graphic id="pone.0302741.e006g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0302741.e006" xlink:type="simple"/>
<mml:math display="block" id="M6">
<mml:mrow><mml:msub><mml:mi>w</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi mathvariant="normal">ln</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>m</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo><mml:mo>−</mml:mo><mml:mi mathvariant="normal">ln</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mstyle displaystyle="true"><mml:munderover><mml:mo>∑</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>m</mml:mi></mml:munderover><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi mathvariant="normal">ln</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>m</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo><mml:mo>−</mml:mo><mml:mi mathvariant="normal">ln</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mstyle></mml:mrow></mml:mfrac></mml:mrow>
</mml:math>
</alternatives>
<label>(6)</label>
</disp-formula>
<disp-formula id="pone.0302741.e007">
<alternatives>
<graphic id="pone.0302741.e007g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0302741.e007" xlink:type="simple"/>
<mml:math display="block" id="M7">
<mml:mrow><mml:mi>X</mml:mi><mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mi>m</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:munderover><mml:mo>∑</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>m</mml:mi></mml:munderover><mml:mrow><mml:msub><mml:mi>w</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mi>*</mml:mi><mml:msub><mml:mi>X</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mstyle></mml:mrow>
</mml:math>
</alternatives>
<label>(7)</label>
</disp-formula>
<disp-formula id="pone.0302741.e008">
<alternatives>
<graphic id="pone.0302741.e008g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0302741.e008" xlink:type="simple"/>
<mml:math display="block" id="M8">
<mml:mrow><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>v</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mstyle scriptlevel="+1"><mml:mfrac><mml:mn>1</mml:mn><mml:mrow><mml:mi>m</mml:mi><mml:mo>−</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:mfrac></mml:mstyle><mml:mi>*</mml:mi><mml:mstyle displaystyle="true"><mml:munderover><mml:mo>∑</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>m</mml:mi></mml:munderover><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>−</mml:mo><mml:mi>X</mml:mi><mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mi>m</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mi>*</mml:mi><mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>−</mml:mo><mml:mi>X</mml:mi><mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mi>m</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mi>T</mml:mi></mml:msup></mml:mrow></mml:mstyle></mml:mrow>
</mml:math>
</alternatives>
<label>(8)</label>
</disp-formula></p>
<fig id="pone.0302741.g002" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0302741.g002</object-id>
<label>Fig 2</label>
<caption>
<title>Weighted covariance matrix based GEDM model.</title>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0302741.g002" xlink:type="simple"/>
</fig>
<p>Here, <italic>m</italic> denotes the number of solutions selected as dominant, and {<italic>X</italic><sub>1</sub>,<italic>X</italic><sub>2</sub>,<italic>X</italic><sub>3</sub>,⋯,<italic>X</italic><sub><italic>m</italic></sub>} denotes the set of dominant solutions, where <italic>X</italic><sub>1</sub> denotes the optimal solution. <italic>w</italic><sub><italic>i</italic></sub> denotes the weight of the <italic>i</italic>th dominant solution; the higher the ranking of a solution, the greater its corresponding weight.</p>
<p><italic>Cov</italic>(<italic>t</italic>) represents the weighted covariance matrix of the dominant solutions in the proposed model. When using GEDM to update the population information, we use Eq (<xref ref-type="disp-formula" rid="pone.0302741.e009">9</xref>) to update the position of particles in the population.</p>
<disp-formula id="pone.0302741.e009">
<alternatives>
<graphic id="pone.0302741.e009g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0302741.e009" xlink:type="simple"/>
<mml:math display="block" id="M9">
<mml:mrow><mml:msub><mml:mi>X</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mi>G</mml:mi><mml:mi>a</mml:mi><mml:mi>u</mml:mi><mml:mi>s</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>a</mml:mi><mml:mi>n</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>X</mml:mi><mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mi>m</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>v</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>n</mml:mi><mml:mi>d</mml:mi><mml:mi>*</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>X</mml:mi><mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mi>m</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>−</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow>
</mml:math>
</alternatives>
<label>(9)</label>
</disp-formula>
<p>To effectively merge GEDM and PSO, a selection process determines whether GEDM or PSO updates the population information at various stages of the optimization process, guided by distinct probabilities. During the initial phase, GEDM receives a higher probability for updating population information. This prioritization leverages the strengths of advantageous particles to ascertain a superior evolutionary trajectory, thereby accelerating the convergence rate. Conversely, in the latter stages of optimization, PSO garners a higher probability for updating population information. This strategic shift capitalizes on PSO’s robust local search capabilities to enhance convergence accuracy. The probability model governing the selection between GEDM and PSO to update population information is depicted below:
<disp-formula id="pone.0302741.e010">
<alternatives>
<graphic id="pone.0302741.e010g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0302741.e010" xlink:type="simple"/>
<mml:math display="block" id="M10">
<mml:mrow><mml:mi>y</mml:mi><mml:mo>=</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi mathvariant="normal">max</mml:mi></mml:mrow></mml:msub><mml:mo>−</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi mathvariant="normal">max</mml:mi></mml:mrow></mml:msub><mml:mo>−</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi mathvariant="normal">min</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mi>*</mml:mi><mml:mfrac><mml:mi>t</mml:mi><mml:mrow><mml:msub><mml:mi>t</mml:mi><mml:mrow><mml:mi mathvariant="normal">max</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:mrow>
</mml:math>
</alternatives>
<label>(10)</label>
</disp-formula></p>
<p>Where <italic>y</italic><sub>max</sub> and <italic>y</italic><sub>min</sub> denote the maximum and minimum probability of selecting GEDM to update population information, respectively.</p>
</sec>
<sec id="sec007">
<title>Higher order hybrid clustering algorithm</title>
<p>Drawing from the density peak clustering method established in literature [<xref ref-type="bibr" rid="pone.0302741.ref009">9</xref>, <xref ref-type="bibr" rid="pone.0302741.ref010">10</xref>], this approach adeptly manages clusters of varying shapes, identifies noise points within the data for exclusion from clustering, and offers an accurate depiction of data distribution. To enhance clustering precision, we introduce the density peak algorithm into the hybrid clustering framework based on the PSO algorithm in this section, thereby constructing a higher order hybrid clustering algorithm for performance enhancement. The PSO algorithm utilized within this higher-order framework is the refined PSO algorithm outlined in Section 3.1.</p>
<p>The final constructed higher-order hybrid clustering algorithm comprises three key components: initial data clustering, centroid acquisition, and class merging. The PSO clustering algorithm initially engages in data clustering, amalgamating multiple class clusters from datasets after a finite number of iterations. Candidate centroids are derived from the data attributes via the density peak search algorithm, as illustrated in <xref ref-type="fig" rid="pone.0302741.g003">Fig 3</xref>. Subsequently, the initial class clusters and candidate centroids facilitate centroid determination. Finally, a division-based approach allocates data points to the class corresponding to the nearest centroid. Given scenarios with multiple identified class centers, a class merging operation is employed at the algorithm’s conclusion to consolidate class clusters.</p>
<fig id="pone.0302741.g003" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0302741.g003</object-id>
<label>Fig 3</label>
<caption>
<title>Flow chart of density peak clustering algorithm.</title>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0302741.g003" xlink:type="simple"/>
</fig>
<p>After inputting the dataset to be clustered, the algorithm proceeds as follows:</p>
<p>Step 1: Calculate the distance <italic>d</italic><sub><italic>ij</italic></sub> (<italic>i</italic>≠<italic>j</italic>) between every pair of data points.</p>
<p>Step 2: Calculate the truncation distance <italic>d</italic><sub><italic>c</italic></sub>, where <italic>d</italic><sub><italic>c</italic></sub> = <italic>d</italic><sub><italic>f(Mt)</italic></sub>, <italic>f</italic>(<italic>Mt</italic>) is obtained by rounding Mt.</p>
<p>Step 3: Calculate the local density <italic>p</italic><sub><italic>i</italic></sub> and the distance <italic>φ</italic><sub><italic>i</italic></sub>. We use the truncated kernel, which is the common way to calculate <italic>p</italic><sub><italic>i</italic></sub>. Thus <inline-formula id="pone.0302741.e011"><alternatives><graphic id="pone.0302741.e011g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0302741.e011" xlink:type="simple"/><mml:math display="inline" id="M11"><mml:mrow><mml:msub><mml:mi>p</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:munder><mml:mo>∑</mml:mo><mml:mi>j</mml:mi></mml:munder><mml:mrow><mml:mi>χ</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>d</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>−</mml:mo><mml:msub><mml:mi>d</mml:mi><mml:mi>c</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mstyle></mml:mrow></mml:math></alternatives></inline-formula>, where <italic>χ</italic> is defined as follows:
<disp-formula id="pone.0302741.e012">
<alternatives>
<graphic id="pone.0302741.e012g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0302741.e012" xlink:type="simple"/>
<mml:math display="block" id="M12">
<mml:mrow><mml:mi>χ</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mo>{</mml:mo><mml:mrow><mml:mtable equalcolumns="true" equalrows="true"><mml:mtr><mml:mtd><mml:mrow><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mi>x</mml:mi><mml:mo>&lt;</mml:mo><mml:mn>0</mml:mn></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mrow><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mi>x</mml:mi><mml:mo>≥</mml:mo><mml:mn>0</mml:mn></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:mrow>
</mml:math>
</alternatives>
<label>(11)</label>
</disp-formula></p>
<p>Distance <italic>φ</italic><sub><italic>i</italic></sub> is measured by calculating the minimum distance between point i and any other point with a higher density:
<disp-formula id="pone.0302741.e013">
<alternatives>
<graphic id="pone.0302741.e013g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0302741.e013" xlink:type="simple"/>
<mml:math display="block" id="M13">
<mml:mrow><mml:msub><mml:mi>φ</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi mathvariant="normal">min</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>:</mml:mo><mml:msub><mml:mi>p</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo>&gt;</mml:mo><mml:msub><mml:mi>p</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>d</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow>
</mml:math>
</alternatives>
<label>(12)</label>
</disp-formula></p>
<p>For the point with the highest density, which has no higher-density neighbour:
<disp-formula id="pone.0302741.e014">
<alternatives>
<graphic id="pone.0302741.e014g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0302741.e014" xlink:type="simple"/>
<mml:math display="block" id="M14">
<mml:mrow><mml:msub><mml:mi>φ</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi mathvariant="normal">max</mml:mi></mml:mrow><mml:mi>j</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>d</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow>
</mml:math>
</alternatives>
<label>(13)</label>
</disp-formula></p>
<p>It can be seen that only points whose densities are locally or globally maximal have a <italic>φ</italic><sub><italic>i</italic></sub> much larger than the typical nearest-neighbour distance, so points with abnormally large values of <italic>φ</italic><sub><italic>i</italic></sub> may be clustering centres. The clustering process of the algorithm takes the points with both a large local density <italic>p</italic><sub><italic>i</italic></sub> and a large <italic>φ</italic><sub><italic>i</italic></sub> as the class centres, whereas points with a small local density but a large <italic>φ</italic><sub><italic>i</italic></sub> are considered anomalous points. Once the class centres are identified, the other points are assigned to the classes represented by the closest class centres.</p>
<p>Step 4: Calculate <italic>y</italic><sub><italic>i</italic></sub> and sort it in descending order according to the following formulae to get the set of candidate centroids <italic>CEN</italic><sub><italic>sus</italic></sub> = (<italic>cen</italic><sub>1</sub>,<italic>cen</italic><sub>2</sub>,⋯,<italic>cen</italic><sub><italic>s</italic></sub>), <italic>y</italic><sub><italic>i</italic></sub> is calculated as follows:
<disp-formula id="pone.0302741.e015">
<alternatives>
<graphic id="pone.0302741.e015g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0302741.e015" xlink:type="simple"/>
<mml:math display="block" id="M15">
<mml:mrow><mml:msub><mml:mi>y</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>p</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mi>*</mml:mi><mml:msub><mml:mi>φ</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow>
</mml:math>
</alternatives>
<label>(14)</label>
</disp-formula></p>
<p>Step 5: Run the improved PSO algorithm from Section 3.1 to get the initial clustering of the data <italic>Int</italic> = (<italic>c</italic><sub>1</sub>,<italic>c</italic><sub>2</sub>,⋯,<italic>c</italic><sub><italic>n</italic></sub>).</p>
<p>Step 6: Determine the class affiliation of the candidate centroids within the initial clustering to get the exact set of cluster centroids <italic>CEN</italic><sub><italic>sure</italic></sub>.</p>
<p>Step 7: Divide the remaining data points to their nearest class centroid to get the final class cluster.</p>
<p>In Step 5, the PSO algorithm stops iterating after a certain number of iterations. At this point the data have formed preliminary classes, but not all data points are well assigned to a fixed class, so the existing results require further processing. If there is no data in the neighbourhood of an object, or the number of points in its neighbourhood is less than a certain constant, it is considered to be an isolated point and is not counted in the initial clustering. Otherwise, the point and its neighbourhood are considered as a class; the multiple classes formed in this way constitute the initial clustering <italic>Int</italic>.</p>
<p>The formation of the candidate centroid set is carried out next. The candidate centroid set is defined in the algorithm in order to eliminate the manual selection process. First, calculate <italic>y</italic><sub><italic>i</italic></sub> = <italic>p</italic><sub><italic>i</italic></sub> * <italic>φ</italic><sub><italic>i</italic></sub>; then calculate the second-order difference of the sorted <italic>y</italic><sub><italic>i</italic></sub> values, find the index position of its minimum point, and take the data points that precede this index position. In most cases, however, the number of points intercepted by this method is larger than the real number of class centres, so these points form the candidate centroid set used in the next steps.</p>
<p>To determine the centroids from the set of candidate points, we mainly use the initial clustering <italic>Int</italic> formed in Step 5. The centroids are determined as follows:</p>
<p>For each class, we first calculate the sum of the distances S from the data points in the class to the candidate centre <italic>cent</italic><sub><italic>i</italic></sub>, S is calculated as follows:
<disp-formula id="pone.0302741.e016">
<alternatives>
<graphic id="pone.0302741.e016g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0302741.e016" xlink:type="simple"/>
<mml:math display="block" id="M16">
<mml:mrow><mml:mi>S</mml:mi><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:msub><mml:mo>∑</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>∈</mml:mo><mml:msub><mml:mi>c</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:msub><mml:mrow><mml:mi>d</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:mi>c</mml:mi><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:msub><mml:mi>t</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mstyle></mml:mrow>
</mml:math>
</alternatives>
<label>(15)</label>
</disp-formula></p>
<p>The candidate centroid with the smallest S is subsequently used as the exact centroid.</p>
</sec>
</sec>
<sec id="sec008">
<title>Experiments and analysis</title>
<p>To evaluate the effectiveness of the proposed methodologies, we conducted experiments using one synthetic dataset, S1, and one real dataset from the UCI repository, CMC. The dataset parameters are detailed in <xref ref-type="table" rid="pone.0302741.t001">Table 1</xref>. Our comparative analysis includes K-means [<xref ref-type="bibr" rid="pone.0302741.ref004">4</xref>], HPSOK-Means [<xref ref-type="bibr" rid="pone.0302741.ref013">13</xref>], and PSC-RCE [<xref ref-type="bibr" rid="pone.0302741.ref018">18</xref>]. HPSOK-Means utilizes the PSO algorithm to determine the centroids of K clusters, initializing particles using the clustering outcomes from K-means. PSC-RCE is a notable particle swarm clustering algorithm, streamlining particle swarm clustering update rules to significantly reduce computational overhead by enhancing trajectory efficiency.</p>
<table-wrap id="pone.0302741.t001" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0302741.t001</object-id>
<label>Table 1</label> <caption><title>Data set parameters.</title></caption>
<alternatives>
<graphic id="pone.0302741.t001g" mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0302741.t001" xlink:type="simple"/>
<table>
<colgroup>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
<col align="left" valign="middle"/>
</colgroup>
<thead>
<tr>
<th align="justify"/>
<th align="justify">Data set</th>
<th align="justify">Scale</th>
<th align="justify">Number of features</th>
<th align="justify">Number of clusters</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left">Synthetic data set</td>
<td align="left">S<sub>1</sub></td>
<td align="left">1200</td>
<td align="left">2</td>
<td align="left">4</td>
</tr>
<tr>
<td align="left">Real data set</td>
<td align="left">CMC</td>
<td align="left">1473</td>
<td align="left">9</td>
<td align="left">3</td>
</tr>
</tbody>
</table>
</alternatives>
</table-wrap>
<p>To ensure fair comparisons, the maximum number of fitness evaluations for the PSO-based clustering algorithms is capped at 3000, while the maximum number of objective function calculations for K-means is set to 30000, with no additional parameters. Other parameters for the comparison algorithms, excluding K-means, align with their respective original papers. To mitigate experimental bias, each algorithm underwent 30 independent runs on each dataset.</p>
<sec id="sec009">
<title>Experimental indicators</title>
<p>The F-measure serves as a pivotal metric for evaluating clustering performance comprehensively, amalgamating precision and recall as vital indicators. Within clustering analysis, the F-measure value directly mirrors the efficacy of the clustering outcome.</p>
<p>Precision primarily quantifies the fraction of positive example samples correctly identified within the clustering results, indicating the ratio of true examples to the samples predicted as positive examples.</p>
<p>Conversely, Recall assesses the proportion of all actual positive example samples correctly identified, representing the ratio of true examples to the total number of actual positive example samples.</p>
<p>Formally, each benchmark classification <italic>C</italic><sub><italic>i</italic></sub> (given by the true labels of the input dataset) corresponds to a collection of <italic>n</italic><sub><italic>i</italic></sub> objects required for a query. Each cluster <italic>C<sub>j</sub></italic>' obtained by the clustering algorithm corresponds to the set of <italic>n</italic><sub><italic>j</italic></sub> objects retrieved from a query. <italic>n</italic><sub><italic>ij</italic></sub> denotes the number of objects of the benchmark classification <italic>C</italic><sub><italic>i</italic></sub> in the cluster <italic>C<sub>j</sub></italic>'. For each benchmark classification <italic>C</italic><sub><italic>i</italic></sub> and cluster <italic>C<sub>j</sub></italic>', F-measure (F), Precision and Recall are defined as follows:
<disp-formula id="pone.0302741.e017">
<alternatives>
<graphic id="pone.0302741.e017g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0302741.e017" xlink:type="simple"/>
<mml:math display="block" id="M17">
<mml:mrow><mml:mi mathvariant="normal">Precision</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msub><mml:mi>n</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:msub><mml:mi>n</mml:mi><mml:mi>j</mml:mi></mml:msub></mml:mrow></mml:mfrac></mml:mrow>
</mml:math>
</alternatives>
<label>(16)</label>
</disp-formula>
<disp-formula id="pone.0302741.e018">
<alternatives>
<graphic id="pone.0302741.e018g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0302741.e018" xlink:type="simple"/>
<mml:math display="block" id="M18">
<mml:mrow><mml:mi mathvariant="normal">Recall</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msub><mml:mi>n</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:msub><mml:mi>n</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:mfrac></mml:mrow>
</mml:math>
</alternatives>
<label>(17)</label>
</disp-formula>
<disp-formula id="pone.0302741.e019">
<alternatives>
<graphic id="pone.0302741.e019g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0302741.e019" xlink:type="simple"/>
<mml:math display="block" id="M19">
<mml:mrow><mml:mi>F</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:msup><mml:mi>b</mml:mi><mml:mn>2</mml:mn></mml:msup><mml:mo>+</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">)</mml:mo><mml:mo>×</mml:mo><mml:mi>P</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>×</mml:mo><mml:mi>R</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:msup><mml:mi>b</mml:mi><mml:mn>2</mml:mn></mml:msup><mml:mo>×</mml:mo><mml:mi>P</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:mi>R</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mfrac></mml:mrow>
</mml:math>
</alternatives>
<label>(18)</label>
</disp-formula>
<disp-formula id="pone.0302741.e020">
<alternatives>
<graphic id="pone.0302741.e020g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0302741.e020" xlink:type="simple"/>
<mml:math display="block" id="M20">
<mml:mrow><mml:mi>F</mml:mi><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:munder><mml:mo>∑</mml:mo><mml:mi>i</mml:mi></mml:munder><mml:mrow><mml:mfrac><mml:mrow><mml:msub><mml:mi>n</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mi>N</mml:mi></mml:mfrac><mml:mo>×</mml:mo><mml:msub><mml:mrow><mml:mi mathvariant="normal">max</mml:mi></mml:mrow><mml:mi>j</mml:mi></mml:msub><mml:mo>{</mml:mo><mml:mi>F</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>}</mml:mo></mml:mrow></mml:mstyle></mml:mrow>
</mml:math>
</alternatives>
<label>(19)</label>
</disp-formula>
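where <italic>P</italic>(<italic>i</italic>,<italic>j</italic>) and <italic>R</italic>(<italic>i</italic>,<italic>j</italic>) denote Precision(<italic>i</italic>,<italic>j</italic>) and Recall(<italic>i</italic>,<italic>j</italic>), respectively, <italic>N</italic> is the total number of objects in the dataset, and <italic>b</italic> in Eq (<xref ref-type="disp-formula" rid="pone.0302741.e019">18</xref>) is equal to 1, so that precision and recall are equally weighted.</p>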
</sec>
<sec id="sec010" sec-type="results">
<title>Results</title>
<p>In order to comprehensively assess the performance and stability of the algorithms presented in this paper alongside the comparative algorithms, this subsection presents experimental comparison results across both datasets. <xref ref-type="fig" rid="pone.0302741.g004">Fig 4</xref> illustrates the average F-measure and recall values for each scheme on datasets S1 and CMC. The algorithm introduced in this paper exhibits superior performance compared to the three comparative algorithms. Notably, on dataset S1, this paper’s algorithm attains an average F-measure and recall above 99%, surpassing all others. The best-performing comparative algorithm, PSC-RCE, achieves only 88.57% and 88.92% for F-measure and recall, respectively, trailing this paper’s scheme by 10.65 and 10.35 percentage points.</p>
<fig id="pone.0302741.g004" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0302741.g004</object-id>
<label>Fig 4</label>
<caption>
<title>Experimental comparison of different clustering algorithms.</title>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0302741.g004" xlink:type="simple"/>
</fig>
<p>Furthermore, the experimental results obtained by the proposed model display a highly concentrated overall distribution with minimal outliers. This observation indicates robust stability across both datasets, affirming the strength of this paper’s algorithm.</p>
<p>To scrutinize the stability factors behind the algorithms presented in this paper, we conducted ablation experiments, considering variations such as the original PSO algorithm, the improved PSO algorithm, and the inclusion or exclusion of density values for clustering. We denoted E1 as the clustering algorithm devoid of density values and lacking the improved PSO algorithm, E2 as the clustering algorithm devoid of density values but integrating the improved PSO algorithm, E3 as the clustering algorithm incorporating density values without the improved PSO algorithm, and E4 as the clustering algorithm integrating both density values and the improved PSO algorithm. Experimental evaluations of F-measure, recall, and precision for each model are depicted in <xref ref-type="fig" rid="pone.0302741.g005">Fig 5</xref>.</p>
<fig id="pone.0302741.g005" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0302741.g005</object-id>
<label>Fig 5</label>
<caption>
<title>Results of ablation experiment.</title>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0302741.g005" xlink:type="simple"/>
</fig>
<p>Comparing E2 with E1 highlights a substantial enhancement in F-measure, precision, and recall (73.91%, 71.91%, and 60.91%, respectively, on dataset CMC) upon employing the improved PSO algorithm. Comparing E3 with E1, it is evident that introducing density values without a higher-order clustering algorithm does not efficiently enhance performance. Moreover, on dataset S1, the recall values for E1, E2, E3, and E4 reach 60.92%, 80.62%, 73.23%, and 82.22%, respectively. These metrics underscore that the higher-order hybrid clustering algorithm devised in this paper augments traditional PSO clustering algorithm performance by approximately 34.96%.</p>
<p><xref ref-type="fig" rid="pone.0302741.g006">Fig 6</xref> illustrates the iterative training’s impact on the clustering algorithm’s ability to identify truly positive samples, as measured by recall and precision metrics. Initially, as the algorithm learns, it gradually identifies more truly positive samples, leading to a steady increase in both recall and precision. Notably, due to our mitigation of overfitting concerns, recall and precision remain steady without declining over the course of iterations.</p>
<fig id="pone.0302741.g006" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0302741.g006</object-id>
<label>Fig 6</label>
<caption>
<title>Training process.</title>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0302741.g006" xlink:type="simple"/>
</fig>
<p>Moreover, the relatively balanced distribution of positive and negative samples in both datasets, coupled with the algorithm’s high stability, results in moderate fluctuations in recall and precision. Ultimately, the recall stabilizes at 82.22% and 62.87%, while precision stabilizes at 98.89% and 73.87%, on datasets S1 and CMC, respectively.</p>
<p>To further explore the efficacy of the algorithm presented in this study for text data clustering, we conducted experiments on synthetic datasets employing various text vectorization methods. This investigation aimed to ascertain the algorithm’s consistency across diverse text vector representations and identify the representations that optimize its performance. <xref ref-type="fig" rid="pone.0302741.g007">Fig 7</xref> illustrates the impact of the algorithm on both synthetic and CMC datasets, depicting accuracy and recall concerning Doc2vec and LDA topic models. Given the flexibility to train vectors to varying dimensions, our experimentation with Doc2vec involved dimensions of 100 and 200, while LDA utilized dimensions of 30 and 50. The VSM dimension corresponds to the lexicon, encompassing 75,307 feature items.</p>
<fig id="pone.0302741.g007" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0302741.g007</object-id>
<label>Fig 7</label>
<caption>
<title>Comparison of clustering performance under different text vector formation methods.</title>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0302741.g007" xlink:type="simple"/>
</fig>
<p>Observing minimal differences in clustering outcomes across varying dimensions of the same representation, it is evident that LDA, leveraging topic vectors, crafts semantic-level textual representations, yielding the highest accuracy and recall post-clustering. Specifically, employing LDA topic vectors with a dimensionality of 50, this paper’s approach achieves an accuracy rate of 79.2% and 83.5% on datasets S1 and CMC, respectively, alongside recall rates of 84.4% and 86.2%. These rates notably surpass other text data representations.</p>
<p>Contrarily, while LDA demonstrates superior performance, few studies adopt LDA topic vectors for text vectorization. To address this, we introduced Doc2vec, a document representation model necessitating iterative runs. Results indicate that with Doc2vec dimensions set at 100, this paper’s approach achieves accuracies of 73.1% and 75.1% on datasets S1 and CMC, respectively. With Doc2vec dimensions set at 200, accuracies reach 73.6% and 75.8% on the same datasets. Notably akin to the LDA model, Doc2vec generates text vectors at the semantic level via neural network-based formation, yet it lacks interpretability, warranting further investigation.</p>
<p>Furthermore, our findings underscore VSM’s notably inferior results. This can be attributed to VSM employing lexical items as vector dimensions and TF-IDF features as word weights, resulting in excessively high-dimensional and sparse representations, thereby exhibiting poor algorithmic performance.</p>
<p>The LDA topic model’s dimensionality reduction yields a more concise and meaningful data representation, capturing semantic information within documents. This enhancement aids clustering algorithms in comprehending document content more effectively. Iterative experiments conducted on the higher-order hybrid clustering algorithm (<xref ref-type="fig" rid="pone.0302741.g008">Fig 8</xref>) reveal that employing LDA’s dimensionality reduction enables accurate clustering of similar data, enhancing clustering quality to achieve stability with fewer iterations.</p>
<fig id="pone.0302741.g008" position="float">
<object-id pub-id-type="doi">10.1371/journal.pone.0302741.g008</object-id>
<label>Fig 8</label>
<caption>
<title>Recall curve of higher-order hybrid clustering algorithm incorporating LDA model.</title>
</caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0302741.g008" xlink:type="simple"/>
</fig>
<p>Remarkably, on dataset S1, after 250 iterations, a recall of 93.41% is achieved, while on dataset CMC, a recall of 79.85% is reached at 130 iterations. This notably enhances cluster recall. Consequently, integrating the LDA Topic Vector Model into the design of health information management models for the elderly promises improved data input and representation for clustering algorithms, thereby enhancing clustering efficacy and efficiency.</p>
</sec>
</sec>
<sec id="sec011" sec-type="conclusions">
<title>Discussion</title>
<p>By integrating the perspective of sports and medicine, this paper successfully incorporates GEDM into the EDA of the PSO algorithm, effectively improving the convergence speed of PSO and avoiding local optima. Additionally, a high-order hybrid clustering algorithm combines the improved PSO clustering with the density peak search algorithm’s candidate centroids, optimizing solution performance. Our analysis highlights the outstanding accuracy and stability of this hybrid algorithm across various datasets.</p>
<p>Using the LDA topic model’s vector representation in text data enhances the performance of text data clustering by revealing latent topic information through dimensionality reduction. The high-order hybrid clustering algorithm, based on density values and the improved PSO algorithm, enables the grouping of similar data points in elderly health data, further revealing patterns in the data. Employing LDA topic vectors for text representation maximizes their advantages in dimensionality reduction, semantic richness, sparse handling, interpretability, flexibility, and scalability. This significantly improves the data representation of elderly health data clustering, enhancing clustering effectiveness and efficiency.</p>
<p>Constructing a high-order hybrid clustering algorithm that integrates the perspectives of sports and medicine holds important practical significance in understanding the health status of the elderly and identifying potential health issues. This comprehensive approach not only aids in a profound understanding of the physiological and movement characteristics of the elderly but also provides more comprehensive and accurate health assessments. Firstly, by integrating physiological indicators from the medical field and activity data from the sports domain, the algorithm can more comprehensively depict an individual’s health, providing healthcare professionals with more information and clues.</p>
<p>Secondly, the application of the high-order hybrid clustering algorithm can effectively identify potential health issues in the elderly population. By combining medical and sports data, the algorithm can identify potential health risks and signs of diseases, assisting healthcare personnel in early-stage intervention and management. This ability to detect potential health issues early has a significant positive impact on improving the quality of life and extending healthy lifespans for the elderly.</p>
<p>Moreover, the integrated perspective of sports and medicine can also provide a scientific basis for developing personalized rehabilitation plans and health management strategies. By understanding the movement characteristics and physiological conditions of the elderly, medical teams can design rehabilitation plans tailored to each individual’s specific needs, thereby improving rehabilitation outcomes and quality of life.</p>
<p>Therefore, integrating the perspective of sports and medicine through the high-order hybrid clustering algorithm not only aids in a comprehensive understanding of the health status of the elderly but also provides a scientific basis for personalized healthcare and rehabilitation, thus holding important practical application prospects in the field of elderly health management.</p>
</sec>
<sec id="sec012" sec-type="conclusions">
<title>Conclusion</title>
<p>In this paper, we devised a higher-order hybrid clustering algorithm integrating peak density and PSO methodologies to address geriatric health information management within sports integration. Enhancements to the PSO algorithm involve integrating GEDM into EDA, accelerating convergence and preventing local optima. In the initial clustering phase of our hybrid algorithm, based on density values and the improved PSO, we merge class clusters and candidate centroids to determine final centroids. Our experiments on datasets S1 and CMC demonstrate remarkable algorithm performance. Specifically, this paper’s algorithm achieves outstanding F-measure, recall, and precision of 99.13%, 82.22%, and 99.22% on dataset S1 and 75.22%, 64.22%, and 74.22% on dataset CMC, surpassing comparison schemes significantly. Moreover, leveraging the text vector representation of the LDA topic vector model facilitates efficient dimensionality reduction, particularly notable when the LDA topic vector dimension is set to 50. Ultimately, our higher-order hybrid clustering algorithm attains accuracy rates of 79.2% and 83.5%, alongside recall rates of 84.4% and 86.2% on datasets S1 and CMC, respectively. Consequently, within elderly health information management, employing text vector representation with LDA topic vectors significantly bolsters health data clustering performance. It expedites the detection of elderly health data, enabling swift and efficient formulation of corresponding treatment plans, thereby advancing the convergence of sports and medicine integration.</p>
</sec>
</body>
<back>
<ack>
<p>We thank the anonymous reviewers whose comments and suggestions helped to improve the manuscript.</p>
</ack>
<ref-list>
<title>References</title>
<ref id="pone.0302741.ref001"><label>1</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Ikotun</surname> <given-names>AM</given-names></name>, <name name-style="western"><surname>Ezugwu</surname> <given-names>AE</given-names></name>, <name name-style="western"><surname>Abualigah</surname> <given-names>L</given-names></name>, <etal>et al</etal>. <article-title>K-means clustering algorithms: a comprehensive review, variants analysis, and advances in the era of big data[J].</article-title> <source>Information Sciences</source>, <year>2023</year>, <volume>622</volume>: <fpage>178</fpage>–<lpage>210</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1016/j.ins.2022.11.139" xlink:type="simple">10.1016/j.ins.2022.11.139</ext-link></comment></mixed-citation></ref>
<ref id="pone.0302741.ref002"><label>2</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Wang</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Allen</surname> <given-names>GI</given-names></name>. <article-title>Integrative generalized convex clustering optimisation and feature selection for mixed multi-view data[J].</article-title> <source>The Journal of Machine Learning Research</source>, <year>2021</year>, <volume>22</volume>(<issue>1</issue>): <fpage>2498</fpage>–<lpage>2571</lpage>.</mixed-citation></ref>
<ref id="pone.0302741.ref003"><label>3</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Wang</surname> <given-names>D</given-names></name>, <name name-style="western"><surname>Tan</surname> <given-names>D</given-names></name>, <name name-style="western"><surname>Liu</surname> <given-names>L</given-names></name>. <article-title>Particle swarm optimisation algorithm: an overview[J].</article-title> <source>Soft computing</source>, <year>2018</year>, <volume>22</volume>: <fpage>387</fpage>–<lpage>408</lpage>.</mixed-citation></ref>
<ref id="pone.0302741.ref004"><label>4</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Ahmed</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Seraj</surname> <given-names>R</given-names></name>, <name name-style="western"><surname>Islam</surname> <given-names>SMS</given-names></name>. <article-title>The k-means algorithm: a comprehensive survey and performance evaluation[J]</article-title>. <source>Electronics</source>, <year>2020</year>, <volume>9</volume>(<issue>8</issue>): <fpage>1295</fpage>.</mixed-citation></ref>
<ref id="pone.0302741.ref005"><label>5</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Borlea</surname> <given-names>ID</given-names></name>, <name name-style="western"><surname>Precup</surname> <given-names>RE</given-names></name>, <name name-style="western"><surname>Borlea</surname> <given-names>AB</given-names></name>, <etal>et al</etal>. <article-title>A unified form of fuzzy C-means and K-means algorithms and its partitional implementation[J].</article-title> <source>Knowledge-Based Systems</source>, <year>2021</year>, <volume>214</volume>: <fpage>106731</fpage>.</mixed-citation></ref>
<ref id="pone.0302741.ref006"><label>6</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Sinaga</surname> <given-names>KP</given-names></name>, <name name-style="western"><surname>Yang</surname> <given-names>MS</given-names></name>. <article-title>Unsupervised K-means clustering algorithm[J].</article-title> <source>IEEE access</source>, <year>2020</year>, <volume>8</volume>: <fpage>80716</fpage>–<lpage>80727</lpage>.</mixed-citation></ref>
<ref id="pone.0302741.ref007"><label>7</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Askari</surname> <given-names>S.</given-names></name> <article-title>Fuzzy C-Means clustering algorithm for data with unequal cluster sizes and contaminated with noise and outliers: review and development[J]</article-title>. <source>Expert Systems with Applications</source>, <year>2021</year>, <volume>165</volume>: <fpage>113856</fpage>.</mixed-citation></ref>
<ref id="pone.0302741.ref008"><label>8</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Zhang</surname> <given-names>R</given-names></name>, <name name-style="western"><surname>Zhang</surname> <given-names>H</given-names></name>, <name name-style="western"><surname>Li</surname> <given-names>X</given-names></name>. <article-title>Maximum joint probability with multiple representations for clustering[J]</article-title>. <source>IEEE transactions on neural networks and learning systems</source>, <year>2021</year>, <volume>33</volume>(<issue>9</issue>): <fpage>4300</fpage>–<lpage>4310</lpage>.</mixed-citation></ref>
<ref id="pone.0302741.ref009"><label>9</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Bushra</surname> <given-names>AA</given-names></name>, <name name-style="western"><surname>Yi</surname> <given-names>G</given-names></name>. <article-title>Comparative analysis review of pioneering DBSCAN and successive density-based clustering algorithms[J].</article-title> <source>IEEE Access</source>, <year>2021</year>, <volume>9</volume>: <fpage>87918</fpage>–<lpage>87935</lpage>.</mixed-citation></ref>
<ref id="pone.0302741.ref010"><label>10</label><mixed-citation publication-type="book" xlink:type="simple"><name name-style="western"><surname>Sudharkar</surname> <given-names>B</given-names></name>, <name name-style="western"><surname>Narsimha</surname> <given-names>VB</given-names></name>, <name name-style="western"><surname>Narsimha</surname> <given-names>G</given-names></name>. <source>An Ensemble Deep Closest Count and Density Peak Clustering Technique for Intrusion Detection System for Cloud Computing[C]//International Conference on Innovations in Computer Science and Engineering.</source> <publisher-loc>Singapore</publisher-loc>: <publisher-name>Springer Nature Singapore</publisher-name>, <year>2022</year>: <fpage>403</fpage>–<lpage>414</lpage>.</mixed-citation></ref>
<ref id="pone.0302741.ref011"><label>11</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Rathore</surname> <given-names>P</given-names></name>, <name name-style="western"><surname>Kumar</surname> <given-names>D</given-names></name>, <name name-style="western"><surname>Bezdek</surname> <given-names>JC</given-names></name>, <etal>et al</etal>. <article-title>A rapid hybrid clustering algorithm for large volumes of high dimensional data[J]</article-title>. <source>IEEE Transactions on Knowledge and Data Engineering</source>, <year>2018</year>, <volume>31</volume>(<issue>4</issue>): <fpage>641</fpage>–<lpage>654</lpage>.</mixed-citation></ref>
<ref id="pone.0302741.ref012"><label>12</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Zhang</surname> <given-names>R</given-names></name>, <name name-style="western"><surname>Zhang</surname> <given-names>H</given-names></name>, <name name-style="western"><surname>Li</surname> <given-names>X</given-names></name>. <article-title>Maximum joint probability with multiple representations for clustering[J]</article-title>. <source>IEEE transactions on neural networks and learning systems</source>, <year>2021</year>, <volume>33</volume>(<issue>9</issue>): <fpage>4300</fpage>–<lpage>4310</lpage>.</mixed-citation></ref>
<ref id="pone.0302741.ref013"><label>13</label><mixed-citation publication-type="other" xlink:type="simple">Paul S, De S, Dey S. A novel approach of data clustering using an improved particle swarm optimisation based k-means clustering algorithm[C]//2020 IEEE International Conference on Electronics, Computing and Communication Technologies (CONECCT). IEEE, 2020: 1–6.</mixed-citation></ref>
<ref id="pone.0302741.ref014"><label>14</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Pu</surname> <given-names>Q</given-names></name>, <name name-style="western"><surname>Gan</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Qiu</surname> <given-names>L</given-names></name>, <etal>et al</etal>. <article-title>An efficient hybrid approach based on PSO, ABC and k-means for cluster analysis[J]</article-title>. <source>Multimedia Tools and Applications</source>, <year>2022</year>, <volume>81</volume>(<issue>14</issue>): <fpage>19321</fpage>–<lpage>19339</lpage>.</mixed-citation></ref>
<ref id="pone.0302741.ref015"><label>15</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Gao</surname> <given-names>H</given-names></name>, <name name-style="western"><surname>Li</surname> <given-names>Y</given-names></name>, <name name-style="western"><surname>Kabalyants</surname> <given-names>P</given-names></name>, <etal>et al</etal>. <article-title>A novel hybrid PSO-K-means clustering algorithm using Gaussian estimation of distribution method and Lévy flight[J].</article-title> <source>IEEE access</source>, <year>2020</year>, <volume>8</volume>: <fpage>122848</fpage>–<lpage>122863</lpage>.</mixed-citation></ref>
<ref id="pone.0302741.ref016"><label>16</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Parouha</surname> <given-names>RP</given-names></name>, <name name-style="western"><surname>Verma</surname> <given-names>P</given-names></name>. <article-title>A systematic overview of developments in differential evolution and particle swarm optimisation with their advanced suggestion[J].</article-title> <source>Applied Intelligence</source>, <year>2022</year>, <volume>52</volume>(<issue>9</issue>): <fpage>10448</fpage>–<lpage>10492</lpage>.</mixed-citation></ref>
<ref id="pone.0302741.ref017"><label>17</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Wang</surname> <given-names>Y</given-names></name>, <name name-style="western"><surname>Ding</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Wang</surname> <given-names>L</given-names></name>, <etal>et al</etal>. <article-title>A manifold p-spectral clustering with sparrow search algorithm[J].</article-title> <source>Soft Computing</source>, <year>2022</year>, <volume>26</volume>(<issue>4</issue>): <fpage>1765</fpage>–<lpage>1777</lpage>.</mixed-citation></ref>
<ref id="pone.0302741.ref018"><label>18</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Gao</surname> <given-names>H</given-names></name>, <name name-style="western"><surname>Li</surname> <given-names>Y</given-names></name>, <name name-style="western"><surname>Kabalyants</surname> <given-names>P</given-names></name>, <etal>et al</etal>. <article-title>A novel hybrid PSO-K-means clustering algorithm using Gaussian estimation of distribution method and Lévy flight[J].</article-title> <source>IEEE access</source>, <year>2020</year>, <volume>8</volume>: <fpage>122848</fpage>–<lpage>122863</lpage>.</mixed-citation></ref>
<ref id="pone.0302741.ref019"><label>19</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Liu</surname> <given-names>T</given-names></name>, <name name-style="western"><surname>Zhang</surname> <given-names>W</given-names></name>, <name name-style="western"><surname>Yuwono</surname> <given-names>M</given-names></name>, <etal>et al</etal>. <article-title>A data-driven meat freshness monitoring and evaluation method using rapid centroid estimation and hidden Markov models[J]</article-title>. <source>Sensors and Actuators B: Chemical</source>, <year>2020</year>, <volume>311</volume>: <fpage>127868</fpage>.</mixed-citation></ref>
<ref id="pone.0302741.ref020"><label>20</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Du</surname> <given-names>Y</given-names></name>, <name name-style="western"><surname>Li</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Luo</surname> <given-names>C</given-names></name>, <etal>et al</etal>. <article-title>A hybrid estimation of distribution algorithm for distributed flexible job shop scheduling with crane transportations[J].</article-title> <source>Swarm and Evolutionary Computation</source>, <year>2021</year>, <volume>62</volume>: <fpage>100861</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1016/j.swevo.2021.100861" xlink:type="simple">10.1016/j.swevo.2021.100861</ext-link></comment></mixed-citation></ref>
<ref id="pone.0302741.ref021"><label>21</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Hie</surname> <given-names>BL</given-names></name>, <name name-style="western"><surname>Yang</surname> <given-names>KK</given-names></name>, <name name-style="western"><surname>Kim</surname> <given-names>PS</given-names></name>. <article-title>Evolutionary velocity with protein language models predicts evolutionary dynamics of diverse proteins[J].</article-title> <source>Cell Systems</source>, <year>2022</year>, <volume>13</volume>(<issue>4</issue>): <fpage>274</fpage>–<lpage>285.e6</lpage>.</mixed-citation></ref>
</ref-list>
</back>
</article>