<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "http://jats.nlm.nih.gov/publishing/1.3/JATS-journalpublishing1-3.dtd">
<article article-type="research-article" dtd-version="1.3" xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<processing-meta>
<custom-meta-group content-type="composition">
<custom-meta specific-use="newgen" xlink:href="https://www.newgen.co/">
<meta-name>Composition Vendor</meta-name>
<meta-value>Newgen KnowledgeWorks (P) Ltd.</meta-value>
</custom-meta>
</custom-meta-group>
</processing-meta>
<front>
<journal-meta>
<journal-id journal-id-type="nlm-ta">PLoS One</journal-id>
<journal-id journal-id-type="publisher-id">plos</journal-id>
<journal-id journal-id-type="pmc">plosone</journal-id>
<journal-title-group>
<journal-title>PLOS One</journal-title>
</journal-title-group>
<issn pub-type="epub">1932-6203</issn>
<publisher>
<publisher-name>Public Library of Science</publisher-name>
<publisher-loc>San Francisco, CA USA</publisher-loc>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.1371/journal.pone.0340191</article-id>
<article-id pub-id-type="publisher-id">PONE-D-25-38916</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Research Article</subject>
</subj-group>
<subj-group subj-group-type="Discipline-v3">
<subject>Ecology and environmental sciences</subject><subj-group><subject>Pollution</subject><subj-group><subject>Air pollution</subject></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>People and places</subject><subj-group><subject>Geographical locations</subject><subj-group><subject>Europe</subject><subj-group><subject>European Union</subject><subj-group><subject>Poland</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Earth sciences</subject><subj-group><subject>Atmospheric science</subject><subj-group><subject>Atmospheric chemistry</subject><subj-group><subject>Air quality</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Physical sciences</subject><subj-group><subject>Chemistry</subject><subj-group><subject>Environmental chemistry</subject><subj-group><subject>Atmospheric chemistry</subject><subj-group><subject>Air quality</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Ecology and environmental sciences</subject><subj-group><subject>Environmental chemistry</subject><subj-group><subject>Atmospheric chemistry</subject><subj-group><subject>Air quality</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>People and places</subject><subj-group><subject>Geographical locations</subject><subj-group><subject>Europe</subject><subj-group><subject>European Union</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Earth sciences</subject><subj-group><subject>Geomorphology</subject><subj-group><subject>Topography</subject></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Physical sciences</subject><subj-group><subject>Mathematics</subject><subj-group><subject>Applied mathematics</subject><subj-group><subject>Algorithms</subject><subj-group><subject>Machine learning algorithms</subject></subj-group></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Research and analysis methods</subject><subj-group><subject>Simulation and modeling</subject><subj-group><subject>Algorithms</subject><subj-group><subject>Machine learning algorithms</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Computer and information sciences</subject><subj-group><subject>Artificial intelligence</subject><subj-group><subject>Machine learning</subject><subj-group><subject>Machine learning algorithms</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Earth sciences</subject><subj-group><subject>Geography</subject></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Earth sciences</subject><subj-group><subject>Marine and aquatic sciences</subject><subj-group><subject>Bodies of water</subject><subj-group><subject>Rivers</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Ecology and environmental sciences</subject><subj-group><subject>Aquatic environments</subject><subj-group><subject>Freshwater environments</subject><subj-group><subject>Rivers</subject></subj-group></subj-group></subj-group></subj-group><subj-group subj-group-type="Discipline-v3">
<subject>Earth sciences</subject><subj-group><subject>Marine and aquatic sciences</subject><subj-group><subject>Aquatic environments</subject><subj-group><subject>Freshwater environments</subject><subj-group><subject>Rivers</subject></subj-group></subj-group></subj-group></subj-group></subj-group></article-categories>
<title-group>
<article-title>Air pollution macro-regions identification using machine learning and spatio-temporal analysis</article-title>
<alt-title alt-title-type="running-head">Air pollution macro-regions identification using machine learning and spatio-temporal analysis</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" xlink:type="simple">
<name name-style="western">
<surname>Morawiec</surname>
<given-names>Tymoteusz</given-names>
</name>
<role content-type="http://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role content-type="http://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role content-type="http://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-original-draft/">Writing – original draft</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-review-editing/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"/></contrib>
<contrib contrib-type="author" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">https://orcid.org/0000-0002-9663-6593</contrib-id>
<name name-style="western">
<surname>Zareba</surname>
<given-names>Mateusz</given-names>
</name>
<role content-type="http://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role content-type="http://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role content-type="http://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role content-type="http://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-original-draft/">Writing – original draft</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-review-editing/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"/></contrib>
<contrib contrib-type="author" xlink:type="simple">
<name name-style="western">
<surname>Danek</surname>
<given-names>Tomasz</given-names>
</name>
<role content-type="http://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role content-type="http://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role content-type="http://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role content-type="http://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-original-draft/">Writing – original draft</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-review-editing/">Writing – review &amp; editing</role>
<xref ref-type="aff" rid="aff001"/></contrib>
<contrib contrib-type="author" corresp="yes" xlink:type="simple">
<contrib-id authenticated="true" contrib-id-type="orcid">https://orcid.org/0000-0002-0381-4697</contrib-id>
<name name-style="western">
<surname>Chuchro</surname>
<given-names>Monika</given-names>
</name>
<role content-type="http://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role content-type="http://credit.niso.org/contributor-roles/validation/">Validation</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-original-draft/">Writing – original draft</role>
<role content-type="http://credit.niso.org/contributor-roles/writing-review-editing/">Writing – review &amp; editing</role>
<xref ref-type="corresp" rid="cor001">*</xref>
<xref ref-type="aff" rid="aff001"/></contrib>
</contrib-group>
<aff id="aff001"><addr-line>Department of Geoinformatics and Applied Computer Science, Faculty of Geology, Geophysics and Environmental Protection, AGH University of Krakow, Krakow, Malopolska, Poland</addr-line></aff>
<contrib-group>
<contrib contrib-type="editor" xlink:type="simple">
<name name-style="western">
<surname>Calka</surname>
<given-names>Beata</given-names>
</name>
<role>Editor</role>
<xref ref-type="aff" rid="edit1"/></contrib>
</contrib-group>
<aff id="edit1"><addr-line>Military University of Technology Faculty of Civil Engineering and Geodesy: Wojskowa Akademia Techniczna im Jaroslawa Dabrowskiego Wydzial Inzynierii Ladowej i Geodezji, POLAND</addr-line></aff>
<author-notes>
<fn fn-type="conflict" id="coi001">
<p>The authors have declared that no competing interests exist.</p>
</fn>
<corresp id="cor001">* E-mail: <email xlink:type="simple">chuchro@agh.edu.pl</email></corresp>
</author-notes>
<pub-date pub-type="epub"><day>12</day><month>1</month><year>2026</year></pub-date>
<pub-date pub-type="collection"><year>2026</year></pub-date>
<volume>21</volume>
<issue>1</issue>
<elocation-id>e0340191</elocation-id>
<history>
<date date-type="received"><day>25</day><month>7</month><year>2025</year></date>
<date date-type="accepted"><day>17</day><month>12</month><year>2025</year></date>
</history>
<permissions>
<copyright-year>2026</copyright-year>
<copyright-holder>Morawiec et al</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">
<license-p>This is an open access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">Creative Commons Attribution License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.</license-p></license>
</permissions>
<self-uri content-type="pdf" xlink:href="info:doi/10.1371/journal.pone.0340191"/>
<abstract>
<p>Air pollution caused by suspended particulate matter (PM) remains one of the key environmental challenges in Poland, particularly in the context of public health and spatial planning. This study presents a spatio-temporal analysis based on data from 173 air quality monitoring stations collected between 2015 and 2023. Advanced unsupervised clustering methods based on the Dynamic Time Warping (DTW) metric were applied to identify spatial patterns of pollution at both daily and annual timescales. Based on over 13 million observations, four macroregions were delineated, along with a set of sixteen clusters allowing for assessment of local anomalies. A significant variation in median <italic>PM</italic><sub>10</sub> concentrations was observed across macroregions, ranging from 19.7 to 27.18 <inline-formula id="pone.0340191.e001"><alternatives><graphic id="pone.0340191.e001g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e001" xlink:type="simple"/><mml:math display="inline" id="M1"><mml:mrow><mml:mi>μ</mml:mi><mml:mrow><mml:mi mathvariant="normal">g</mml:mi><mml:mo>/</mml:mo><mml:msup><mml:mi mathvariant="normal">m</mml:mi><mml:mn>3</mml:mn></mml:msup></mml:mrow></mml:mrow></mml:math></alternatives></inline-formula>. The results revealed the significant role of topographic, urban, and microclimatic factors in shaping the spatial distribution of particulate matter. Urbanized areas in southern Poland (Silesia, Lesser Poland) formed distinctly isolated clusters with high PM levels, in contrast to the stable, low-emission northern lowlands. The analysis further demonstrated that regional policies may be ineffective without supra-regional coordination. These findings show the need to include high-resolution data analyses in environmental and public health planning to effectively limit the impacts of air pollution.</p>
</abstract>
<funding-group>
<funding-statement>The author(s) received no specific funding for this work.</funding-statement>
</funding-group>
<counts>
<fig-count count="12"/>
<table-count count="0"/>
<page-count count="26"/>
</counts>
<custom-meta-group>
<custom-meta id="data-availability">
<meta-name>Data Availability</meta-name>
<meta-value>The data underlying the results presented in the study are available from <ext-link ext-link-type="uri" xlink:href="https://powietrze.gios.gov.pl/pjp/archives" xlink:type="simple">https://powietrze.gios.gov.pl/pjp/archives</ext-link>.</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="sec001" sec-type="intro">
<title>Introduction</title>
<p>Air pollution caused by particulate matter (PM) represents a significant challenge, both in terms of public health and in the context of developing future urban environments with particular focus on concepts such as smart cities [<xref ref-type="bibr" rid="pone.0340191.ref001">1</xref>]. Clean air, efficient transportation, access to education, and recreational spaces are the basis of the concept of a smart city [<xref ref-type="bibr" rid="pone.0340191.ref002">2</xref>]. The research proved links between PM exposure and the prevalence of neurodegenerative diseases, particularly Alzheimer’s and Parkinson’s [<xref ref-type="bibr" rid="pone.0340191.ref003">3</xref>]. This concern is especially relevant for countries in the European Union (EU), including Poland, where aging populations are predicted to lead to higher proportions of elderly individuals compared to the younger group [<xref ref-type="bibr" rid="pone.0340191.ref004">4</xref>]. Addressing air pollution is thus critical not only for immediate urban planning and management but also for the long-term planning of healthcare systems and social frameworks within the smart city concept.</p>
<p>Monitoring air quality according to EU regulations usually involves advanced reference sensors, which employ gravimetric methods to measure <italic>PM</italic><sub>1</sub>, <italic>PM</italic><sub>2.5</sub>, and <italic>PM</italic><sub>10</sub> concentrations with high accuracy. These methods are considered the standard for official reporting but are associated with significant installation and maintenance costs [<xref ref-type="bibr" rid="pone.0340191.ref005">5</xref>]. Consequently, their deployment is often spatially limited, restricting the scope of sampling and analysis. In contrast, lower-cost laser-based sensors (LCS) offer a more affordable alternative, but their reduced accuracy renders them unsuitable for regulatory compliance [<xref ref-type="bibr" rid="pone.0340191.ref006">6</xref>].</p>
<p>The EU has established directives 2004/107/EC [<xref ref-type="bibr" rid="pone.0340191.ref007">7</xref>] and 2008/50/EC [<xref ref-type="bibr" rid="pone.0340191.ref008">8</xref>] to define air quality standards and mandate that member states designate zones where air quality is assessed and made publicly accessible. When air pollution levels exceed permissible thresholds, member states are required to implement corrective measures. The European Council emphasizes that <italic>PM</italic><sub>2.5</sub> poses the greatest health risk to Europeans due to its ability to penetrate deeply into the respiratory system and bloodstream. Although air pollution levels across Europe have shown a clear downward trend, 97% of the EU population remains exposed to <italic>PM</italic><sub>2.5</sub> concentrations that exceed World Health Organization (WHO) guidelines. The European Environment Agency (EEA) identifies energy production, agriculture, manufacturing, road transport, and waste management as the primary sources of <italic>PM</italic><sub>2.5</sub>, with similar contributions observed for <italic>PM</italic><sub>10</sub>. Poland faces particular challenges related to PM pollution. Despite national and EU regulatory efforts, the country remains among the most polluted in Europe. Two of the four EU cities that exceeded <italic>PM</italic><sub>2.5</sub> limits between 2021 and 2022 are in Poland [<xref ref-type="bibr" rid="pone.0340191.ref009">9</xref>]. Many Polish cities - including Krakow - rank among the world’s most polluted urban areas due to the country’s energy mix based on coal production and geographic conditions that can intensify the retention of pollution [<xref ref-type="bibr" rid="pone.0340191.ref010">10</xref>]. This phenomenon is particularly noticeable during the cold period when the air temperature hovers around 0 degrees Celsius [<xref ref-type="bibr" rid="pone.0340191.ref011">11</xref>].</p>
<p>According to the Central Statistical Office of Poland, the country’s natural population growth rate is –3.9 per 1,000 inhabitants, significantly lower than the EU average of –2.9. This demographic trend suggests potential future strain on healthcare and social systems, particularly given the health impacts of PM exposure and the increasing prevalence of age-related diseases such as Alzheimer’s [<xref ref-type="bibr" rid="pone.0340191.ref012">12</xref>]. Poland’s average life expectancy of 81.1 years for women and 73.4 years for men highlights the urgency of addressing long-term health risks associated with air pollution. In 2022, over 40% of Poland’s greenhouse gas emissions originated from coal-based energy production, while transport and other energy-related sectors each contributed 18%. Poland ranks 21st among 27 EU countries in the share of renewable energy in its energy mix [<xref ref-type="bibr" rid="pone.0340191.ref013">13</xref>].</p>
<p>Research examining urban air quality in Poland indicates that while pollutant concentrations in general have decreased significantly - by 16 to 34% between 2005 and 2021 - the reductions remain insufficient to meet EU standards, particularly for <italic>PM</italic><sub>2.5</sub> and <italic>PM</italic><sub>10</sub>. Progress has been made since 2011, but overall levels of air pollution remain high (besides ozone). The slow implementation of air quality improvement strategies underscores the need for more decisive actions to align with EU directives and mitigate public health risks [<xref ref-type="bibr" rid="pone.0340191.ref014">14</xref>]. The geographical characteristics of Poland, including its lowlands and industrialized urban regions, play a crucial role in the persistence of high PM concentrations. Combined with its coal-dependent energy system, these conditions create substantial challenges for reducing pollution levels. While stricter industrial regulations and cleaner energy sources are essential, enhanced air quality monitoring networks are also critical for effective spatial and temporal analysis of pollution trends [<xref ref-type="bibr" rid="pone.0340191.ref011">11</xref>].</p>
<p>This study focuses on the spatio-temporal analysis of PM pollution in Poland which is a high-exposure country within the EU. The findings aim to help form strategies for improving urban air quality, supporting public health, and integrating clean environment policies into the broader framework. In this study, data from 173 reference monitoring stations were used. Sensors located throughout Poland were analyzed for the 2015-2023 period to perform a spatio-temporal analysis of <italic>PM</italic><sub>10</sub> and <italic>PM</italic><sub>2.5</sub> pollution. Although <italic>PM</italic><sub>10</sub> data from earlier years are also available, the period from 2015 onward was selected as representative. In 2014, the EU Air Quality Directive (2008/50/EC) was implemented in Poland, standardizing measurement and reporting procedures in accordance with European law. Additionally, studies indicate that 2014 marked a slowdown in the rate of air pollution reduction at the national scale, providing a more stable baseline for assessing the current spatial structure of <italic>PM</italic><sub>10</sub> macroregions.</p>
<p>The novelty of this research lies in the first-time application of big data and machine learning techniques to establish air pollution macroregions (APMR) within the country. In this paper, the term air pollution macroregion refers to a geographical subdivision identified through the spatial clustering of areas exhibiting similar air pollution characteristics. A macroregion encompasses an area larger than a single city, county, or even voivodeship, defined by common patterns in air pollutant concentrations that may be related to regional emission sources, prevailing meteorological conditions, and topographic influences. These interrelated factors create a coherent, large-scale spatial unit that reflects the regional dynamics of air quality and atmospheric processes. In contrast, an air pollution microregion represents a smaller, more localized subdivision within a macroregion, characterized by finer-scale variations. A representative example of an air pollution microregion is the Kraków metropolitan area, which displays well-documented, spatially confined pollution patterns influenced by both urban morphology and surrounding terrain [<xref ref-type="bibr" rid="pone.0340191.ref005">5</xref>,<xref ref-type="bibr" rid="pone.0340191.ref010">10</xref>,<xref ref-type="bibr" rid="pone.0340191.ref015">15</xref>]. Specifically, the study included an evaluation of the utility of individual sensor data, which ultimately narrowed the data set to 127 <italic>PM</italic><sub>10</sub> and 56 <italic>PM</italic><sub>2.5</sub> stations that met the data reliability criteria, forming the basis of a reference database. Advanced imputation of missing values in the time series was performed using hierarchical clustering methods to account for spatial dependencies. Unsupervised machine learning algorithms were employed for regional analysis, leveraging spatio-temporal clustering with dynamic time warping (DTW) to capture both spatial and temporal variations. 
The analyses utilized multiple temporal resolutions of the input data, including daily and annual averages, to identify patterns across different time scales. Furthermore, multi-year analyses of <italic>PM</italic><sub>2.5</sub> / <italic>PM</italic><sub>10</sub> ratios were performed, providing information on the sources and composition of particulate matter.</p>
</sec>
<sec id="sec002" sec-type="materials|methods">
<title>Materials and methods</title>
<sec id="sec003">
<title>Localization</title>
<p>Poland is a central European country, located on the North European Plain. To the North extends the Baltic Sea; to the South lie the Carpathian Mountains and the Sudetes. Both ranges act as natural borders with Poland’s southern neighbors. The major part of Poland consists of lowlands, which take up over 75 percent of the country’s surface and are located in the central and northern regions of the area. The rest of the surface is filled with highlands and mountain ranges in the South. The average altitude is 173 meters above sea level. The total area is 322 575 square kilometers. Poland is located in a temperate warm transitional climate. It is surrounded by different climates, such as marine in the West, continental in the East, cool temperate in the North and Mediterranean in the South. The movement of large air masses that flow into Poland from all directions has an influence both on climate and on levels of <italic>PM</italic><sub>10</sub> and <italic>PM</italic><sub>2.5</sub> [<xref ref-type="bibr" rid="pone.0340191.ref016">16</xref>]. On the one hand, it cannot be excluded that these air masses bring in significant amounts of air pollutants. On the other hand, they can also carry them away. On a micro-scale (one city and its surrounding terrain), it has been shown that the topography of the studied terrain may have a major impact on <italic>PM</italic><sub>10</sub> and <italic>PM</italic><sub>2.5</sub> levels. For example, Krakow is located in a depression surrounded by higher elevation to the North and South. This unfortunate position, along with unfavorable meteorological conditions, may cause entrapment of air pollutants [<xref ref-type="bibr" rid="pone.0340191.ref015">15</xref>]. Such disadvantageous circumstances are likely to occur in many more places all over the country, and may cause extremely high levels of pollution. It is therefore worth considering these variables on a much larger scale when analyzing levels of air pollution. 
Topography has a major influence in shaping wind patterns by creating natural barriers and corridors, which in turn affect the inflow and outflow of air masses [<xref ref-type="bibr" rid="pone.0340191.ref017">17</xref>].</p>
<p>The data used in this study were collected from 173 stations in total [<xref ref-type="bibr" rid="pone.0340191.ref018">18</xref>]. Of these, 109 collected only <italic>PM</italic><sub>10</sub> data, 15 gathered only <italic>PM</italic><sub>2.5</sub> data, and the remaining 49 collected both. Even though measurements have been conducted since January 2000, the time frame of the analyzed data begins in January 2015 and ends in December 2023. This particular time frame was chosen because a large proportion of stations did not conduct measurements continuously. For various reasons, stations have halted and resumed measurements irregularly over the years. When longer time periods were chosen, this resulted in little to no data over large segments of the studied area in later workflow phases.</p>
</sec>
<sec id="sec004">
<title>Data pipeline</title>
<p>There were certain steps involved with the machine-learning pipeline to provide reliable results based on big data used in this study. Data were collected from 173 stations over 9 years in hourly intervals. In total there were 78840 unique observations with 173 features which gave 13,639,320 cells of data. The big-data machine learning pipeline begins with data collection from GIOŚ (Główny Inspektorat Ochrony Środowiska) archives website [<xref ref-type="bibr" rid="pone.0340191.ref018">18</xref>]. It is publicly available data, shared every year in packages of .xlsx files. Besides <italic>PM</italic><sub>10</sub> <inline-formula id="pone.0340191.e002"><alternatives><graphic id="pone.0340191.e002g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e002" xlink:type="simple"/><mml:math display="inline" id="M2"><mml:mrow><mml:mi>μ</mml:mi><mml:mrow><mml:mi mathvariant="normal">g</mml:mi><mml:mo>/</mml:mo><mml:msup><mml:mi mathvariant="normal">m</mml:mi><mml:mn>3</mml:mn></mml:msup></mml:mrow></mml:mrow></mml:math></alternatives></inline-formula> and <italic>PM</italic><sub>2.5</sub> <inline-formula id="pone.0340191.e003"><alternatives><graphic id="pone.0340191.e003g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e003" xlink:type="simple"/><mml:math display="inline" id="M3"><mml:mrow><mml:mi>μ</mml:mi><mml:mrow><mml:mi mathvariant="normal">g</mml:mi><mml:mo>/</mml:mo><mml:msup><mml:mi mathvariant="normal">m</mml:mi><mml:mn>3</mml:mn></mml:msup></mml:mrow></mml:mrow></mml:math></alternatives></inline-formula>, other measurements can be found, like carbon monoxide <italic>mg</italic>/<italic>m</italic><sup>3</sup>, benzene <italic>ng</italic>/<italic>m</italic><sup>3</sup>, or sulfur dioxide <inline-formula id="pone.0340191.e004"><alternatives><graphic id="pone.0340191.e004g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e004" xlink:type="simple"/><mml:math 
display="inline" id="M4"><mml:mrow><mml:mi>μ</mml:mi><mml:mrow><mml:mi mathvariant="normal">g</mml:mi><mml:mo>/</mml:mo><mml:msup><mml:mi mathvariant="normal">m</mml:mi><mml:mn>3</mml:mn></mml:msup></mml:mrow></mml:mrow></mml:math></alternatives></inline-formula>. There are two types of measurement frequencies shared with the public: every 24 hours and 1 hour. Through data transformation, different intervals can be calculated using upscaling or downscaling methods. Final results were calculated at yearly and daily intervals.</p>
<p>An observation is a measurement of a specific type of air pollution taken by a given station at a particular moment. Several challenges were recognized during preprocessing, related to data ingestion from the original source and to non-unique names. Also, before loading into the final database, data cleaning was applied to make sure that all observations were in numeric format. The cleaning process focused on aligning the data into the correct tabular format with consistent naming across all sources (see <xref ref-type="fig" rid="pone.0340191.g001">Fig 1</xref>).</p>
<fig id="pone.0340191.g001" position="float"><object-id pub-id-type="doi">10.1371/journal.pone.0340191.g001</object-id><label>Fig 1</label><caption><title>Data preparation pipeline.</title></caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0340191.g001" xlink:type="simple"/></fig>
<p>NaN values represent data that were unavailable at a given time and location. Stations with more than 50% of NaN values were excluded from the analysis (see <xref ref-type="fig" rid="pone.0340191.g001">Fig 1</xref>). To clarify the rationale behind selecting the 50% cutoff threshold, consider two alternative scenarios:</p>
<list list-type="order">
<list-item>
<p><bold>A 10% cutoff</bold> - meaning that only stations with up to 10% of NaN values (at least 90% of valid data) would be retained. This approach would ensure very high data quality; however, it would significantly reduce the number of available stations, potentially increasing spatial bias and lowering representativeness.</p>
</list-item>
<list-item>
<p><bold>A 90% cutoff</bold> - meaning that stations with up to 90% of NaN values (as little as 10% of valid data) would still be included. While this would retain a much larger number of stations, many would contain excessive missing data, introducing noise and likely distorting the final results.</p>
</list-item>
</list>
<p>In conclusion, the 50% threshold represents a balanced compromise between retaining a sufficient number of stations and maintaining acceptable data quality.</p>
<p>The remaining 140 stations (see <xref ref-type="fig" rid="pone.0340191.g002">Fig 2</xref>) still contained a significant number of NaN values. This problem was solved using the hierarchical clustering imputation method (see <xref ref-type="fig" rid="pone.0340191.g003">Fig 3</xref>). This approach was significantly better than conventional methods, such as interpolation based on a single station. The spatiotemporal nature of the data requires a solution that minimizes potential spatial bias; for example, air pollution profiles may differ between the southern and northern parts of the country.</p>
<fig id="pone.0340191.g002" position="float"><object-id pub-id-type="doi">10.1371/journal.pone.0340191.g002</object-id><label>Fig 2</label><caption><title>Geographic distribution of air pollution monitoring stations included in the study (January 2015 – December 2023) – <italic>PM</italic><sub>10</sub> (dark blue), <italic>PM</italic><sub>2.5</sub> (orange), <inline-formula id="pone.0340191.e005"><alternatives><graphic id="pone.0340191.e005g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e005" xlink:type="simple"/><mml:math display="inline" id="M5"><mml:mrow><mml:mi>P</mml:mi><mml:msub><mml:mi>M</mml:mi><mml:mrow><mml:mn>10</mml:mn></mml:mrow></mml:msub><mml:mspace width="0.167em"/><mml:mrow><mml:mo>+</mml:mo></mml:mrow><mml:mspace width="0.167em"/><mml:mi>P</mml:mi><mml:msub><mml:mi>M</mml:mi><mml:mrow><mml:mn>2.5</mml:mn></mml:mrow></mml:msub></mml:mrow></mml:math></alternatives></inline-formula> (light blue).</title><p>The background map is based on OpenStreetMap data [<xref ref-type="bibr" rid="pone.0340191.ref019">19</xref>]. (hypsometric map from WMTS: [<xref ref-type="bibr" rid="pone.0340191.ref020">20</xref>]; Ref. System: EPSG 2180).</p></caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0340191.g002" xlink:type="simple"/></fig>
<fig id="pone.0340191.g003" position="float"><object-id pub-id-type="doi">10.1371/journal.pone.0340191.g003</object-id><label>Fig 3</label><caption><title>Data imputation and clustering pipeline.</title></caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0340191.g003" xlink:type="simple"/></fig>
<p>The procedure began with calculating Pearson’s [<xref ref-type="bibr" rid="pone.0340191.ref021">21</xref>] correlation matrix for each voivodeship individually, which helped identify local similarities between stations (Eq (<xref ref-type="disp-formula" rid="pone.0340191.e006">1</xref>)).</p>
<disp-formula id="pone.0340191.e006"><alternatives><graphic id="pone.0340191.e006g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e006" xlink:type="simple"/><mml:math display="block" id="M6"><mml:mrow><mml:mrow><mml:mi>r</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msubsup><mml:mo>∑</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msubsup><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>−</mml:mo><mml:mover><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mo stretchy="true">¯</mml:mo></mml:mover><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>−</mml:mo><mml:mover><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mo stretchy="true">¯</mml:mo></mml:mover><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:msqrt><mml:mrow><mml:msubsup><mml:mo>∑</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msubsup><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>−</mml:mo><mml:mover><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mo stretchy="true">¯</mml:mo></mml:mover><mml:msup><mml:mo stretchy="false">)</mml:mo><mml:mn>2</mml:mn></mml:msup><mml:msubsup><mml:mo>∑</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msubsup><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>−</mml:mo><mml:mover><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mo stretchy="true">¯</mml:mo></mml:mover><mml:msup><mml:mo stretchy="false">)</mml:mo><mml:mn>2</mml:mn></mml:msup></mml:mrow></mml:msqrt></mml:mrow></mml:mfrac></mml:mrow></mml:mrow></mml:math></alternatives> <label>(1)</label></disp-formula>
<p>where <italic>n</italic> is the number of observations, <italic>x</italic><sub><italic>i</italic></sub> and <italic>y</italic><sub><italic>i</italic></sub> represent the values of the respective variables, and <inline-formula id="pone.0340191.e007"><alternatives><graphic id="pone.0340191.e007g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e007" xlink:type="simple"/><mml:math display="inline" id="M7"><mml:mrow><mml:mover><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mo stretchy="true">¯</mml:mo></mml:mover></mml:mrow></mml:math></alternatives></inline-formula> and <inline-formula id="pone.0340191.e008"><alternatives><graphic id="pone.0340191.e008g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e008" xlink:type="simple"/><mml:math display="inline" id="M8"><mml:mrow><mml:mover><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mo stretchy="true">¯</mml:mo></mml:mover></mml:mrow></mml:math></alternatives></inline-formula> are their arithmetic means.</p>
<p>Then, hierarchical clustering using Ward’s method [<xref ref-type="bibr" rid="pone.0340191.ref022">22</xref>] was applied to the correlation matrix of each voivodeship to identify groups of similar time series within the region.</p>
<p>This method is an unsupervised machine learning technique, utilizing an agglomerative approach. Initially, each time series is treated as its own cluster. Next, the algorithm searches for the two most similar clusters and merges them into one. This step is repeated until all data points are in one big cluster. Finally, the dendrogram (<xref ref-type="fig" rid="pone.0340191.g004">Fig 4</xref>) is created. It is a visual representation of the clustering process. Upon review, one can assess the level of similarity between clusters and how they were merged in each iteration. Even though a dendrogram was available for each voivodeship, only one was reviewed manually.</p>
<fig id="pone.0340191.g004" position="float"><object-id pub-id-type="doi">10.1371/journal.pone.0340191.g004</object-id><label>Fig 4</label><caption><title>Dendrogram of clusters from Lesser Poland Voivodeship.</title></caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0340191.g004" xlink:type="simple"/></fig>
<p>The decision was made to set the cutoff distance at half of the maximum tree height automatically for every region. This solution gave satisfactory results in a relatively short amount of time; however, better results could have been achieved through exploratory data analysis. Once the algorithm chose groups in a given region, the within-cluster average value was calculated for each row and used for imputation. However, this implies having more than one station in each cluster, which wasn’t always the case. Two edge cases have to be considered. First, the cluster consists of one time series, but there are other stations in the region. Second, there aren’t any other stations in the region. The first case is handled by calculating the within-region mean value by row and imputing that for standalone clusters. The second is imputed with the mean value of the whole column for a given station. This method of filling in the gaps introduced a spatial dimension into the analysis, as the data was imputed only geo-locally. The potential spatial bias, which could negatively affect the final results, has been minimized. For example, the measurements in the North may differ in nature from those in the South. Therefore, it would be unwise to use data from one to impute in the other. The main difference between hierarchical clustering and k-means clustering is how the number of clusters is chosen. K-means [<xref ref-type="bibr" rid="pone.0340191.ref023">23</xref>] takes an a priori approach and hierarchical clustering an a posteriori one: the former needs the number of clusters before execution, while the latter does not.</p>
</sec>
<sec id="sec005">
<title>Unsupervised machine learning</title>
<p>Machine learning has seen a significant gain in recognition in the last few years, due to the rise in popularity of artificial intelligence [<xref ref-type="bibr" rid="pone.0340191.ref024">24</xref>]. Along with the rapid increase in data availability and quantity in general [<xref ref-type="bibr" rid="pone.0340191.ref025">25</xref>], machine learning became essential for any sensible data analysis and modeling, both in scientific [<xref ref-type="bibr" rid="pone.0340191.ref026">26</xref>] and commercial [<xref ref-type="bibr" rid="pone.0340191.ref027">27</xref>] workflows. Machine learning serves various purposes in data science [<xref ref-type="bibr" rid="pone.0340191.ref028">28</xref>]. It is a helpful tool in every part of the data pipeline, ranging from data cleaning and imputation to final business intelligence results [<xref ref-type="bibr" rid="pone.0340191.ref029">29</xref>].</p>
<p>Machine learning can be divided into unsupervised and supervised learning. Supervised learning is most commonly used for classification and regression problems. It needs labeled data to train the model. Labeling often has to be done by humans. Thus, obtaining data for supervised learning can be very expensive and time-consuming. It uses different metrics for model evaluation, due to the nature of the data it works on. This enables assessment of the model’s accuracy, and how well it explains a given feature. In turn, unsupervised learning trains on unlabeled data and its primary task is to find unknown patterns within the data. These patterns are often imperceptible to the human eye, usually because of the vast amounts of data. Thanks to this, it can be a very valuable input in scientific research. One of its main strengths is minimal human intervention. The algorithm only needs clean and representative data. It works out the hidden structure of the data based on the similarities and dissimilarities within and yields the desired results. The outcome can then be verified by researchers, who are given the choice to either accept the result or adjust the data and parameters. The biggest drawback of unsupervised machine learning is that it is virtually impossible to decide whether the returned results are correct. Since there is no labeled data, there is nothing to constitute correctness, as in the case of supervised learning.</p>
<p>The K-means algorithm is an unsupervised learning algorithm. It categorizes data points into clusters based on the distance between data points and a cluster center. The most commonly used distance metric is the Euclidean distance. However, due to the nature of the data in this study, which is time series data, we employed Dynamic Time Warping (DTW). From a performance perspective, the Euclidean distance is computationally efficient, with linear time complexity <inline-formula id="pone.0340191.e009"><alternatives><graphic id="pone.0340191.e009g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e009" xlink:type="simple"/><mml:math display="inline" id="M9"><mml:mrow><mml:mi>𝒪</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>n</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></alternatives></inline-formula>, but it lacks robustness to temporal misalignments and variations in sequence length. In contrast, DTW provides higher accuracy in matching time series that exhibit phase shifts or non-linear temporal variations by allowing flexible alignments, though at the cost of increased computational complexity <inline-formula id="pone.0340191.e010"><alternatives><graphic id="pone.0340191.e010g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e010" xlink:type="simple"/><mml:math display="inline" id="M10"><mml:mrow><mml:mi>𝒪</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msup><mml:mi>n</mml:mi><mml:mn>2</mml:mn></mml:msup><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></alternatives></inline-formula> and higher memory consumption. K-means is a well-established machine learning algorithm. It has many advantages over other methods. Speed: it is very fast, e.g. it is faster than the previously mentioned hierarchical clustering, especially in its simplest form (using Euclidean distance).
Adaptability: there are many variations of the original algorithm, such as fuzzy c-means [<xref ref-type="bibr" rid="pone.0340191.ref030">30</xref>] or k-means++ [<xref ref-type="bibr" rid="pone.0340191.ref031">31</xref>].</p>
<p>Simplicity: the algorithm can be broken down into 5 steps.</p>
<list list-type="order">
<list-item>
<p>Select the number of clusters ‘k’ to identify within the data. That is k in k-means.</p>
</list-item>
<list-item>
<p>‘Randomly’ initialize centroids.</p>
</list-item>
<list-item>
<p>Assign each data point to the closest cluster. Assignment of new cluster is given as Eq (<xref ref-type="disp-formula" rid="pone.0340191.e011">2</xref>) solely for Euclidean Distance.<disp-formula id="pone.0340191.e011"><alternatives><graphic id="pone.0340191.e011g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e011" xlink:type="simple"/><mml:math display="block" id="M11"><mml:mrow><mml:mrow><mml:msub><mml:mi>c</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>arg</mml:mi><mml:munderover><mml:mo>min</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:munderover><mml:mo stretchy="false">|</mml:mo><mml:mo stretchy="false">|</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>−</mml:mo><mml:msub><mml:mi>μ</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo stretchy="false">|</mml:mo><mml:msup><mml:mo stretchy="false">|</mml:mo><mml:mn>2</mml:mn></mml:msup><mml:mo>,</mml:mo></mml:mrow></mml:mrow></mml:math></alternatives> <label>(2)</label></disp-formula></p>
<p><italic>c</italic><sub><italic>i</italic></sub> is the cluster newly assigned to the point <italic>x</italic><sub><italic>i</italic></sub>. <inline-formula id="pone.0340191.e012"><alternatives><graphic id="pone.0340191.e012g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e012" xlink:type="simple"/><mml:math display="inline" id="M12"><mml:mrow><mml:msub><mml:mi>μ</mml:mi><mml:mi>j</mml:mi></mml:msub></mml:mrow></mml:math></alternatives></inline-formula> is the centroid point. <inline-formula id="pone.0340191.e013"><alternatives><graphic id="pone.0340191.e013g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e013" xlink:type="simple"/><mml:math display="inline" id="M13"><mml:mrow><mml:mo stretchy="false">|</mml:mo><mml:mo stretchy="false">|</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mspace width="0.167em"/><mml:mrow><mml:mo>−</mml:mo></mml:mrow><mml:mspace width="0.167em"/><mml:msub><mml:mi>μ</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo stretchy="false">|</mml:mo><mml:msup><mml:mo stretchy="false">|</mml:mo><mml:mn>2</mml:mn></mml:msup></mml:mrow></mml:math></alternatives></inline-formula> is the squared Euclidean distance between the point and the centroid. <italic>k</italic> is the a priori set number of clusters from step 1.</p>
</list-item>
<list-item>
<p>Calculate the mean value of each cluster and set that as a new centroid. This is expressed by Eq (<xref ref-type="disp-formula" rid="pone.0340191.e014">3</xref>)<disp-formula id="pone.0340191.e014"><alternatives><graphic id="pone.0340191.e014g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e014" xlink:type="simple"/><mml:math display="block" id="M14"><mml:mrow><mml:mrow><mml:msub><mml:mi>u</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mi>n</mml:mi><mml:mi>j</mml:mi></mml:msub></mml:mrow></mml:mfrac><mml:munderover><mml:mo>∑</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:munderover><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">[</mml:mo><mml:msub><mml:mi>c</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy="false">]</mml:mo><mml:mo>,</mml:mo></mml:mrow></mml:mrow></mml:math></alternatives> <label>(3)</label></disp-formula></p>
<p><italic>u</italic><sub><italic>j</italic></sub> is the centroid’s new location, <inline-formula id="pone.0340191.e015"><alternatives><graphic id="pone.0340191.e015g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e015" xlink:type="simple"/><mml:math display="inline" id="M15"><mml:mrow><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mi>n</mml:mi><mml:mi>j</mml:mi></mml:msub></mml:mrow></mml:mfrac><mml:msubsup><mml:mo>∑</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msubsup><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">[</mml:mo><mml:msub><mml:mi>c</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:math></alternatives></inline-formula> is the mean value of the <italic>j</italic>th cluster, because <italic>n</italic><sub><italic>j</italic></sub> is the count of points in the <italic>j</italic>th cluster and <inline-formula id="pone.0340191.e016"><alternatives><graphic id="pone.0340191.e016g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e016" xlink:type="simple"/><mml:math display="inline" id="M16"><mml:mrow><mml:msubsup><mml:mo>∑</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msubsup><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">[</mml:mo><mml:msub><mml:mi>c</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:math></alternatives></inline-formula> is the sum of the point values inside the <italic>j</italic>th cluster.</p>
</list-item>
<list-item>
<p>Repeat steps 3 and 4 until the clusters stop changing or max-iteration value is reached.</p>
</list-item>
</list>
<p>It should be emphasized that the equations above hold only if k-means uses Euclidean distance as the similarity measure. Although it is, in general, the most common metric, in this study the Dynamic Time Warping (DTW) metric was used. According to [<xref ref-type="bibr" rid="pone.0340191.ref032">32</xref>], DTW can find the optimal global alignment between the time series. It can capture the similarity between two temporal sequences varying in speed or with time shifts, unlike Euclidean distance. Thus, it is much better at pattern recognition in the temporal sequences domain. It is especially important in studies like this to recognize trends and global patterns over long periods. DTW, as the name suggests, ‘warps’ time sequences so as to locally minimize the pairwise distance between two moments in time. Given two time series, let us compare it with the simpler similarity measure.</p>
<disp-formula id="pone.0340191.e017"><alternatives><graphic id="pone.0340191.e017g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e017" xlink:type="simple"/><mml:math display="block" id="M17"><mml:mrow><mml:mrow><mml:mi>X</mml:mi><mml:mo>=</mml:mo><mml:mo stretchy="false">{</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:mi>…</mml:mi><mml:mo>,</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mi>n</mml:mi></mml:msub><mml:mo stretchy="false">}</mml:mo></mml:mrow></mml:mrow></mml:math></alternatives> <label>(4)</label></disp-formula>
<disp-formula id="pone.0340191.e018"><alternatives><graphic id="pone.0340191.e018g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e018" xlink:type="simple"/><mml:math display="block" id="M18"><mml:mrow><mml:mrow><mml:mi>Y</mml:mi><mml:mo>=</mml:mo><mml:mo stretchy="false">{</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:mi>…</mml:mi><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>n</mml:mi></mml:msub><mml:mo stretchy="false">}</mml:mo></mml:mrow></mml:mrow></mml:math></alternatives> <label>(5)</label></disp-formula>
<p>The Euclidean Distance between them would be expressed with Eq (<xref ref-type="disp-formula" rid="pone.0340191.e019">6</xref>)</p>
<disp-formula id="pone.0340191.e019"><alternatives><graphic id="pone.0340191.e019g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e019" xlink:type="simple"/><mml:math display="block" id="M19"><mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>X</mml:mi><mml:mo>,</mml:mo><mml:mi>Y</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:msqrt><mml:mrow><mml:munderover><mml:mo>∑</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:munderover><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>−</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:msup><mml:mo stretchy="false">)</mml:mo><mml:mn>2</mml:mn></mml:msup></mml:mrow></mml:msqrt></mml:mrow></mml:mrow></mml:math></alternatives> <label>(6)</label></disp-formula>
<p>Where <inline-formula id="pone.0340191.e020"><alternatives><graphic id="pone.0340191.e020g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e020" xlink:type="simple"/><mml:math display="inline" id="M20"><mml:mrow><mml:msqrt><mml:mrow><mml:munderover><mml:mo>∑</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:munderover><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>−</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:msup><mml:mo stretchy="false">)</mml:mo><mml:mn>2</mml:mn></mml:msup></mml:mrow></mml:msqrt></mml:mrow></mml:math></alternatives></inline-formula> is the square root of sum of squared differences for each <italic>i</italic> pair of the sequence.</p>
<p>DTW is a bit more complicated, because of the optimization process that synchronizes two originally unsynchronized sequences. In simple terms, the algorithm seeks out the best possible match for each point in the two time series in order to minimize the distance between them. The distance measure does not have to be Euclidean; any distance measure can be used.</p>
<p>The DTW distance between two time series <italic>X</italic> and <italic>Y</italic> can be expressed by Eq (<xref ref-type="disp-formula" rid="pone.0340191.e021">7</xref>)</p>
<disp-formula id="pone.0340191.e021"><alternatives><graphic id="pone.0340191.e021g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e021" xlink:type="simple"/><mml:math display="block" id="M21"><mml:mrow><mml:mrow><mml:msub><mml:mi>d</mml:mi><mml:mrow><mml:mi>D</mml:mi><mml:mi>T</mml:mi><mml:mi>W</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>X</mml:mi><mml:mo>,</mml:mo><mml:mi>Y</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:munder><mml:mo>min</mml:mo><mml:mrow><mml:mi>α</mml:mi><mml:mo>,</mml:mo><mml:mi>β</mml:mi></mml:mrow></mml:munder><mml:munderover><mml:mo>∑</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:munderover><mml:mo stretchy="false">|</mml:mo><mml:mo stretchy="false">|</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>−</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>α</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msub><mml:mo stretchy="false">|</mml:mo><mml:mo stretchy="false">|</mml:mo><mml:mo>+</mml:mo><mml:mo stretchy="false">|</mml:mo><mml:mo stretchy="false">|</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>−</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>β</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:msub><mml:mo stretchy="false">|</mml:mo><mml:mo stretchy="false">|</mml:mo><mml:mo>,</mml:mo></mml:mrow></mml:mrow></mml:math></alternatives> <label>(7)</label></disp-formula>
<p>where <inline-formula id="pone.0340191.e022"><alternatives><graphic id="pone.0340191.e022g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e022" xlink:type="simple"/><mml:math display="inline" id="M22"><mml:mrow><mml:mi>α</mml:mi></mml:mrow></mml:math></alternatives></inline-formula> and <inline-formula id="pone.0340191.e023"><alternatives><graphic id="pone.0340191.e023g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e023" xlink:type="simple"/><mml:math display="inline" id="M23"><mml:mrow><mml:mi>β</mml:mi></mml:mrow></mml:math></alternatives></inline-formula> are warping functions that specify how the time indexes of the series should be transformed, and <italic>n</italic> is the length of the series.</p>
<p>Using the formula above, the K-means algorithm can be modified by replacing the Euclidean distance with the DTW distance. The process of assigning points to the clusters using DTW is given as Eq (<xref ref-type="disp-formula" rid="pone.0340191.e024">8</xref>)</p>
<disp-formula id="pone.0340191.e024"><alternatives><graphic id="pone.0340191.e024g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e024" xlink:type="simple"/><mml:math display="block" id="M24"><mml:mrow><mml:mrow><mml:msub><mml:mi>c</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>arg</mml:mi><mml:munderover><mml:mo>min</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:munderover><mml:msub><mml:mi>d</mml:mi><mml:mrow><mml:mi>D</mml:mi><mml:mi>T</mml:mi><mml:mi>W</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>μ</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></alternatives> <label>(8)</label></disp-formula>
</sec>
<sec id="sec006">
<title>Clustering evaluation metrics</title>
<p>The first and arguably the most important step involved with k-means clustering is choosing the ‘k’ number of clusters. Sometimes, a researcher can decide on the number of distinct patterns within the data, which would be equivalent to the number ‘k’. However, when dealing with large-scale datasets it is almost impossible to decide just based on exploratory data analysis done by hand. That is why many useful evaluation metrics have been developed to help recognize the ‘right’ number of clusters based on traits like within-cluster similarity or total dispersion. In this study, four metrics were used: the elbow method, Calinski-Harabasz index, Davies-Bouldin index and Silhouette coefficient. Although time-series clustering was applied, the primary objective was to identify spatial and temporal dependencies rather than to analyze temporal similarity per se. For this reason, DTW was employed to account for temporal misalignments and amplitude variations within daily PM profiles, while classical clustering validity indices such as the Davies–Bouldin (DB), Calinski–Harabasz (CH), and Within-Cluster Inertia were subsequently used to evaluate the compactness and separability of the resulting groups. This approach allowed the integration of time-series sensitivity with interpretable, geometry-based evaluation metrics, ensuring both temporal flexibility and spatial coherence of the macroregional clusters. In cluster analysis, the formation of clusters is guided not only by quantitative metrics but also by the domain knowledge of the interpreter. Selecting a single, “optimal” number of clusters in an entirely objective manner is extremely challenging. Therefore, in this study, multiple metrics were employed, which may occasionally yield conflicting results.
However, when combined with expert knowledge, the analysis of clustering results on maps and their subsequent interpretation can serve as a reliable and meaningful identifier [<xref ref-type="bibr" rid="pone.0340191.ref033">33</xref>].</p>
<sec id="sec007">
<title>Elbow method.</title>
<p>The elbow method [<xref ref-type="bibr" rid="pone.0340191.ref034">34</xref>] is a method to recognize a significant turning point in the sum of squared errors (SSE) between the centroid and the points inside the cluster. The SSE is plotted against the number of clusters. The goal is to choose the <italic>k</italic> at which the SSE drops suddenly and then plateaus.</p>
<disp-formula id="pone.0340191.e025"><alternatives><graphic id="pone.0340191.e025g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e025" xlink:type="simple"/><mml:math display="block" id="M25"><mml:mrow><mml:mrow><mml:mrow><mml:mi>S</mml:mi><mml:mi>S</mml:mi><mml:mi>E</mml:mi></mml:mrow><mml:mo>=</mml:mo><mml:munderover><mml:mo>∑</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:munderover><mml:munder><mml:mo>∑</mml:mo><mml:mrow><mml:mi>x</mml:mi><mml:mo>∈</mml:mo><mml:msub><mml:mi>C</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:munder><mml:mo stretchy="false">|</mml:mo><mml:mo stretchy="false">|</mml:mo><mml:mi>x</mml:mi><mml:mo>−</mml:mo><mml:msub><mml:mi>μ</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">|</mml:mo><mml:msup><mml:mo stretchy="false">|</mml:mo><mml:mn>2</mml:mn></mml:msup><mml:mo>,</mml:mo></mml:mrow></mml:mrow></mml:math></alternatives> <label>(9)</label></disp-formula>
<p>where <italic>k</italic> is the number of clusters, <italic>C</italic><sub><italic>i</italic></sub> is the <italic>i</italic>th cluster, <inline-formula id="pone.0340191.e026"><alternatives><graphic id="pone.0340191.e026g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e026" xlink:type="simple"/><mml:math display="inline" id="M26"><mml:mrow><mml:msub><mml:mi>μ</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:math></alternatives></inline-formula> is the centroid of the <italic>i</italic>th cluster, and <inline-formula id="pone.0340191.e027"><alternatives><graphic id="pone.0340191.e027g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e027" xlink:type="simple"/><mml:math display="inline" id="M27"><mml:mrow><mml:mo stretchy="false">|</mml:mo><mml:mo stretchy="false">|</mml:mo><mml:mi>x</mml:mi><mml:mspace width="0.167em"/><mml:mrow><mml:mo>−</mml:mo></mml:mrow><mml:mspace width="0.167em"/><mml:msub><mml:mi>μ</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">|</mml:mo><mml:msup><mml:mo stretchy="false">|</mml:mo><mml:mn>2</mml:mn></mml:msup></mml:mrow></mml:math></alternatives></inline-formula> is the squared distance between point <italic>x</italic> and the centroid <inline-formula id="pone.0340191.e028"><alternatives><graphic id="pone.0340191.e028g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e028" xlink:type="simple"/><mml:math display="inline" id="M28"><mml:mrow><mml:msub><mml:mi>μ</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:math></alternatives></inline-formula>.</p>
</sec>
<sec id="sec008">
<title>Calinski-Harabasz index.</title>
<p>The Calinski-Harabasz index (CH) [<xref ref-type="bibr" rid="pone.0340191.ref035">35</xref>] is a ratio of between-cluster separation and individual cluster cohesiveness. Basically, it’s an index of how well separated the clusters are and how ‘dense’ they are. It is calculated by the equation below (<xref ref-type="disp-formula" rid="pone.0340191.e029">10</xref>):</p>
<disp-formula id="pone.0340191.e029"><alternatives><graphic id="pone.0340191.e029g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e029" xlink:type="simple"/><mml:math display="block" id="M29"><mml:mrow><mml:mrow><mml:mi>C</mml:mi><mml:mi>H</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>t</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>B</mml:mi><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>W</mml:mi><mml:msub><mml:mi>I</mml:mi><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mfrac><mml:mi>×</mml:mi><mml:mfrac><mml:mrow><mml:msub><mml:mi>n</mml:mi><mml:mrow><mml:mi>W</mml:mi></mml:mrow></mml:msub><mml:mo>−</mml:mo><mml:mi>k</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi><mml:mo>−</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:mfrac><mml:mo>,</mml:mo></mml:mrow></mml:mrow></mml:math></alternatives> <label>(10)</label></disp-formula>
<p>where <italic>t</italic>(<italic>B</italic><sub><italic>k</italic></sub>) is the trace of the covariance matrix between clusters, <italic>t</italic>(<italic>WI</italic><sub><italic>k</italic></sub>) is the trace of the covariance matrix within the cluster, <italic>n</italic><sub><italic>W</italic></sub> is the total number of data points, and <italic>k</italic> is the number of clusters.</p>
</sec>
<sec id="sec009">
<title>Silhouette index.</title>
<p>The Silhouette Coefficient (SC) evaluates how well data points fit their assigned clusters by considering both their internal cohesion within the cluster and their separation from other clusters. It ranges from –1 to 1, where –1 indicates that the point would fit better in a different cluster, 0 suggests that the point lies on or near the boundary between two clusters, and 1 signifies a perfect fit to the assigned cluster.</p>
<disp-formula id="pone.0340191.e030"><alternatives><graphic id="pone.0340191.e030g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e030" xlink:type="simple"/><mml:math display="block" id="M30"><mml:mrow><mml:mrow><mml:mi>S</mml:mi><mml:mi>C</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>b</mml:mi><mml:mo>−</mml:mo><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>a</mml:mi><mml:mo>,</mml:mo><mml:mi>b</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mfrac><mml:mo>,</mml:mo></mml:mrow></mml:mrow></mml:math></alternatives> <label>(11)</label></disp-formula>
<p>where <italic>a</italic> is the mean distance between the point and other points inside the cluster and <italic>b</italic> is the distance between a point and the nearest cluster (to which this point does not belong).</p>
</sec>
<sec id="sec010">
<title>Davies Bouldin index.</title>
<p>The Davies-Bouldin Index (DB) evaluates the internal compactness of clusters and their separation from one another. The DB ranges from 0 to infinity, where lower values indicate better clustering quality [<xref ref-type="bibr" rid="pone.0340191.ref036">36</xref>]. The DB is defined by the equation below (<xref ref-type="disp-formula" rid="pone.0340191.e031">12</xref>).</p>
<disp-formula id="pone.0340191.e031"><alternatives><graphic id="pone.0340191.e031g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e031" xlink:type="simple"/><mml:math display="block" id="M31"><mml:mrow><mml:mrow><mml:mi>D</mml:mi><mml:mi>B</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:mfrac><mml:munderover><mml:mo>∑</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:munderover><mml:munder><mml:mo>max</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>≠</mml:mo><mml:mi>j</mml:mi></mml:mrow></mml:munder><mml:mfrac><mml:mrow><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:msub><mml:mi>d</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac><mml:mo>,</mml:mo><mml:mspace width="0.278em"/><mml:mspace width="0.278em"/><mml:mspace width="0.278em"/><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mi>…</mml:mi><mml:mi>…</mml:mi><mml:mo>,</mml:mo><mml:mi>k</mml:mi><mml:mo>,</mml:mo></mml:mrow></mml:mrow></mml:math></alternatives> <label>(12)</label></disp-formula>
<p>where <italic>s</italic><sub><italic>i</italic></sub> is the average distance between the <italic>i</italic>-cluster centroid and points in this cluster, <italic>s</italic><sub><italic>j</italic></sub> is the average distance between the <italic>j</italic>-cluster centroid and points in this cluster and <italic>d</italic><sub><italic>ij</italic></sub> is the distance between <italic>i</italic> and <italic>j</italic> cluster centroids.</p>
</sec>
</sec>
</sec>
<sec id="sec011" sec-type="results">
<title>Results</title>
<sec id="sec012">
<title>Optimal cluster choice</title>
<p>The optimal cluster choice was based on the previously mentioned four metrics. The focus was primarily on local extrema. For the sake of improved interpretability, the metrics were scaled using the min-max method. Moreover, the optimal ‘k’ has to be an intersection of metrics concerning all three types of results: <italic>PM</italic><sub>10</sub>, <italic>PM</italic><sub>2.5</sub>, and the <italic>PM</italic><sub>2.5</sub> to <italic>PM</italic><sub>10</sub> ratio. The metrics were calculated on the daily interval data. The possible k’s were within the range of 2 to 20. A key assumption is that k in the range of 2 to 5 was not considered during the analysis of the metrics, because the aim was to identify macroregions.</p>
<p>The analysis of the within-cluster sum of squares against the number of clusters involves recognizing a significant turning point called the elbow. The results shown in (<xref ref-type="fig" rid="pone.0340191.g005">Fig 5</xref>) are in a steady decline; thus, it is impossible to find an elbow, especially across all three types. This metric is therefore not helpful in this case.</p>
<fig id="pone.0340191.g005" position="float"><object-id pub-id-type="doi">10.1371/journal.pone.0340191.g005</object-id><label>Fig 5</label><caption><title>Within cluster sum of squares metric for cluster number validation for <italic>PM</italic><sub>10</sub> (blue line), <italic>PM</italic><sub>2.5</sub> (orange line), <italic>PM</italic><sub>2.5</sub>/<italic>PM</italic><sub>10</sub> (green line).</title></caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0340191.g005" xlink:type="simple"/></fig>
<p>Calinski-Harabasz index (<xref ref-type="fig" rid="pone.0340191.g006">Fig 6</xref>) provides more valuable information. The k equal to 8 is a local maximum, that indicates better clustering for both <italic>PM</italic><sub>10</sub> and PM ratio. Also <italic>k</italic> equal to 12 is interesting because of the sudden decline in PM ratio index value. The index stabilizes for <italic>k</italic> in the range of 13 to 20, where slight fluctuations occur. In general the Calinski-Harabasz index is almost in a monotonic decline, which suggests worse clustering with the k value increase.</p>
<fig id="pone.0340191.g006" position="float"><object-id pub-id-type="doi">10.1371/journal.pone.0340191.g006</object-id><label>Fig 6</label><caption><title>Calinski-Harabasz Index for cluster number validation for <italic>PM</italic><sub>10</sub> (blue line), <italic>PM</italic><sub>2.5</sub> (orange line), <italic>PM</italic><sub>2.5</sub>/<italic>PM</italic><sub>10</sub> (green line).</title></caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0340191.g006" xlink:type="simple"/></fig>
<p>The silhouette (<xref ref-type="fig" rid="pone.0340191.g007">Fig 7</xref>) exhibits significant variance with a fairly consistent pattern across the three measurements in the range of k 2 to 12. Similarly to the Calinski-Harabasz index, there is a significant drop in the PM ratio between k 12 and 13. There is a local minimum at k 7 and a local maximum at k 9 for all three types. In the range of k 13 to 20, there are no consistent local extrema. One noteworthy feature is a local maximum at k 16 for <italic>PM</italic><sub>2.5</sub>.</p>
<fig id="pone.0340191.g007" position="float"><object-id pub-id-type="doi">10.1371/journal.pone.0340191.g007</object-id><label>Fig 7</label><caption><title>Silhouette for cluster number validation for <italic>PM</italic><sub>10</sub> (blue line), <italic>PM</italic><sub>2.5</sub> (orange line), <italic>PM</italic><sub>2.5</sub>/<italic>PM</italic><sub>10</sub> (green line).</title></caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0340191.g007" xlink:type="simple"/></fig>
<p>When analyzing the Davies-Bouldin index (<xref ref-type="fig" rid="pone.0340191.g008">Fig 8</xref>), the focus was on identifying local minima, which indicate better clusterings. A highly distinct local minimum occurs at k 7. It is a good clustering indicator for all types of measurements. On the other hand, k 8 indicates poor clustering, which goes against the Calinski-Harabasz index.</p>
<fig id="pone.0340191.g008" position="float"><object-id pub-id-type="doi">10.1371/journal.pone.0340191.g008</object-id><label>Fig 8</label><caption><title>Davies-Bouldin Index for cluster number validation for <italic>PM</italic><sub>10</sub> (blue line), <italic>PM</italic><sub>2.5</sub> (orange line), <italic>PM</italic><sub>2.5</sub>/<italic>PM</italic><sub>10</sub> (green line).</title></caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0340191.g008" xlink:type="simple"/></fig>
<p>A comprehensive evaluation of all clustering metrics led to the selection of k = 7 as the most well-justified cluster size. This choice was predominantly influenced by the Davies-Bouldin index, which strongly indicated a high-quality partitioning of the data. The second configuration, k = 16 was considered based on the assumption that the resulting cluster structure would closely correspond to the distribution of voivodeships. The choice of 16 clusters was guided by external knowledge and expert judgment, aiming to balance granularity with interpretability.</p>
</sec>
<sec id="sec013">
<title>The cluster maps</title>
<p>To identify the macroregions of air pollution in Poland, clustering results were visualized across 12 maps. These include two sets of maps, each based on daily and annual data, corresponding to the previously selected cluster sizes of <italic>k</italic> = 7 and <italic>k</italic> = 16.</p>
<p>The cluster distribution (<xref ref-type="fig" rid="pone.0340191.g009">Fig 9a</xref>) (<italic>PM</italic><sub>10</sub>, daily, k = 7) is dominated by two major clusters: 0 and 2. Cluster 0 covers most of northern and western Poland, where the terrain is predominantly lowland. Cluster 2 spans the southeastern part of the country and partially overlaps with clusters in Silesia and Małopolska, where the terrain is mostly highland. Overall, the clusters divide Poland along a northeast–southwest axis. Clusters 3 and 6 are located in central Poland but differ geographically: Cluster 3 corresponds mainly to the transition zone between the Przedborska and South Masurian Highlands and the central lowlands, while Cluster 6 is associated with river valleys (e.g., the Noteć Valley) and the lake district (Pojezierze region). Clusters 1, 4, and 5 are concentrated in Silesia and Małopolska, primarily around major urban areas. In summary, daily <italic>PM</italic><sub>10</sub> clusters exhibit distinct spatial patterns and clear separation between regions.</p>
<fig id="pone.0340191.g009" position="float"><object-id pub-id-type="doi">10.1371/journal.pone.0340191.g009</object-id><label>Fig 9</label><caption><title><italic>PM</italic><sub>10</sub> station cluster distribution (hypsometric map from WMTS: [<xref ref-type="bibr" rid="pone.0340191.ref020">20</xref>]; Ref. System: EPSG 2180).</title><p>a) <italic>PM</italic><sub>10</sub>, daily, k = 7. b) <italic>PM</italic><sub>10</sub>, yearly, k = 7. c) <italic>PM</italic><sub>10</sub>, daily, k = 16. d) <italic>PM</italic><sub>10</sub>, yearly, k = 16.</p></caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0340191.g009" xlink:type="simple"/></fig>
<p>The cluster distribution shown in (<xref ref-type="fig" rid="pone.0340191.g009">Fig 9b</xref>) (<italic>PM</italic><sub>10</sub>, yearly, k = 7) appears more chaotic than its daily counterpart. Most of the country is dominated by clusters 0, 2, and 5, which are highly interwoven and encircle the remaining smaller clusters. Cluster 0 is primarily associated with lowland areas and the lake districts in the north. In contrast, clusters 2 and 5 do not correspond to specific geographical regions, suggesting that their grouping may be influenced by factors other than topography. Cluster 1 forms a compact group centered around the Łódź Voivodeship, particularly in the Wyżyna Przedborska region. It also includes six additional stations located to the south and west, most of which are situated in upland areas. Clusters 3 and 6 are concentrated around major urban centers such as Kraków and Katowice—regions typically characterized by elevated air pollution levels, especially during the colder months. Cluster 4 consists of three stations, all sharing a common feature: close proximity to bodies of water.</p>
<p>Notable similarities in cluster distribution can be observed between (<xref ref-type="fig" rid="pone.0340191.g009">Fig 9c</xref>) (<italic>PM</italic><sub>10</sub>, daily, k = 16) and (<xref ref-type="fig" rid="pone.0340191.g009">Fig 9b</xref>). In particular, clusters 13 and 10 in (<xref ref-type="fig" rid="pone.0340191.g009">Fig 9c</xref>) closely resemble clusters 4 and 1 in (<xref ref-type="fig" rid="pone.0340191.g009">Fig 9b</xref>), respectively. This resemblance suggests not only analogous local variations but also shared long-term trends among the corresponding stations. Overall, the cluster distribution in (<xref ref-type="fig" rid="pone.0340191.g009">Fig 9c</xref>) is relatively compact, with each cluster largely confined to a specific geographical region. Clusters 6 and 4 are located in highland areas near the southeastern border. Cluster 15 is situated along the Odra river basin, with the exception of station 126, which deviates from this pattern. Clusters 14, 5, and 11 are composed of stations situated in the northern and central lowlands.</p>
<p>The long-term trends captured in the annual <italic>PM</italic><sub>10</sub> data do not result in many spatially coherent clusters. The distribution presented in (<xref ref-type="fig" rid="pone.0340191.g009">Fig 9d</xref>) (<italic>PM</italic><sub>10</sub>, yearly, k = 16) appears disorganized, making it difficult to analyze or identify consistent characteristics within individual clusters. Notable exceptions are clusters 4 and 10, which show similarities to clusters identified in the previous maps. Most clusters in this dataset are small (comprising only 2 to 4 stations) and/or widely dispersed across the country, further complicating spatial interpretation.</p>
<p>Cluster 4 in (<xref ref-type="fig" rid="pone.0340191.g010">Fig 10a</xref>) (<italic>PM</italic><sub>2.5</sub>, daily, k = 7) largely follows the course of major river basins—primarily the Wisła—with the exception of stations 93, 89 and 150, which deviate from this pattern. Cluster 6 consists of a single station (127), potentially indicating unique <italic>PM</italic><sub>2.5</sub> pollution levels at that location. Cluster 2 is associated with elevated terrain in the south, with most of its stations situated in the Pogórze Środkowobeskidzkie and Kotlina Sandomierska. Cluster 5 includes stations distributed across all major geographical regions; however, these stations are generally located near terrain depressions or low-lying areas. Cluster 0 is concentrated on the Wyżyna Śląska, with two outliers—stations 43 and 65—also located in upland regions. Clusters 1 and 3 dominate the northwestern part of the country, an area characterized by lowlands interspersed with forests and numerous lakes.</p>
<fig id="pone.0340191.g010" position="float"><object-id pub-id-type="doi">10.1371/journal.pone.0340191.g010</object-id><label>Fig 10</label><caption><title><italic>PM</italic><sub>2.5</sub> station cluster distribution (hypsometric map from WMTS: [<xref ref-type="bibr" rid="pone.0340191.ref020">20</xref>]; Ref. System: EPSG 2180).</title><p>a) <italic>PM</italic><sub>2.5</sub>, daily, k = 7. b) <italic>PM</italic><sub>2.5</sub>, yearly, k = 7. c) <italic>PM</italic><sub>2.5</sub>, daily, k = 16. d) <italic>PM</italic><sub>2.5</sub>, yearly, k = 16.</p></caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0340191.g010" xlink:type="simple"/></fig>
<p>Cluster 0 exhibits in (<xref ref-type="fig" rid="pone.0340191.g010">Fig 10b</xref>) (<italic>PM</italic><sub>2.5</sub>, yearly, k = 7) a distribution similar to its counterpart in (<xref ref-type="fig" rid="pone.0340191.g010">Fig 10a</xref>), though it now slightly overlaps with cluster 0 in the Wyżyna Śląska region. Cluster 1 is distributed across upland and basin areas in the southeastern part of the country. Clusters 0, 1, 2, and 5 are primarily located in the southern highlands, gradually extending into the central lowlands and, to a lesser extent, overlapping with northern regions. Compared to (<xref ref-type="fig" rid="pone.0340191.g010">Fig 10a</xref>), the clusters in the south are less compact and more fragmented. In contrast, cluster 6 forms an almost perfectly delineated region, with the exception of station 6, which appears to be an outlier. This cluster comprises stations situated near renewable energy sources, such as wind and solar power plants. Cluster 3 is predominantly distributed across the central lowlands in a vertical pattern. Due to its large spatial extent, identifying a single unifying characteristic for this cluster is challenging. Finally, cluster 4 is a standalone group consisting of a single station—station 152—which, as previously noted, tends to form small, isolated clusters.</p>
<p>The cluster distribution presented in (<xref ref-type="fig" rid="pone.0340191.g010">Fig 10c</xref>) (<italic>PM</italic><sub>2.5</sub>, daily, k = 16) reveals several notable features and exhibits clear spatial separation across the country. Most clusters are geographically compact, with the exception of clusters 10, 2, and 5, which are more dispersed. It is particularly noteworthy that cluster 2 closely mirrors its distribution from (<xref ref-type="fig" rid="pone.0340191.g010">Fig 10a</xref>), suggesting stable spatial characteristics reflected in local temporal changes. Cluster 3 shows strong alignment with the course of the Wisła River, although station 93 stands out as an outlier. In the southern highlands, numerous clusters have formed in close proximity. Clusters 0 and 6 are situated near the Górnośląsko-Zagłębiowska Metropolia, while clusters 13, 4, and 15 are entirely contained within the Małopolska region. As observed in previous maps, the northwestern parts of the country continue to exhibit well-defined and spatially distinct clusters.</p>
<p>(<xref ref-type="fig" rid="pone.0340191.g010">Fig 10d</xref>) (<italic>PM</italic><sub>2.5</sub>, yearly, k = 16) illustrates that spatial separation is primarily influenced by similarities in local temporal variations among stations. The overall distribution is visibly chaotic, with many clusters overlapping, often within the boundaries of just one or two geographical regions. Clusters 2, 5, 8, 11, and 15 exemplify this lack of a clear spatial pattern, highlighting the irregularity in cluster formation. Once again, station 152 forms a standalone cluster, consistent with its behavior observed in (<xref ref-type="fig" rid="pone.0340191.g010">Fig 10b</xref>).</p>
<p>The cluster distribution presented in (<xref ref-type="fig" rid="pone.0340191.g011">Fig 11a</xref>) (<italic>PM</italic><sub>2.5</sub>/<italic>PM</italic><sub>10</sub>, daily, k = 7) is notably distinct from those in the other maps. It features a dominant, large cluster that spans the entire country and serves as a background or baseline layer. In contrast, the remaining clusters are much smaller, each comprising between one and four stations. Cluster 4 exhibits some similarity to Cluster 2 in (<xref ref-type="fig" rid="pone.0340191.g010">Fig 10a</xref> and <xref ref-type="fig" rid="pone.0340191.g010">10c</xref>). Cluster 6 consists solely of station 152, which frequently appears as a standalone or part of a small cluster across different maps. Clusters 1, 3, and 5 are each composed of individual stations, located primarily in the central and northern regions of the country.</p>
<fig id="pone.0340191.g011" position="float"><object-id pub-id-type="doi">10.1371/journal.pone.0340191.g011</object-id><label>Fig 11</label><caption><title><italic>PM</italic><sub>2.5</sub>/<italic>PM</italic><sub>10</sub> station cluster distribution (hypsometric map from WMTS: [<xref ref-type="bibr" rid="pone.0340191.ref020">20</xref>]; Ref. System: EPSG 2180).</title><p>a) <italic>PM</italic><sub>2.5</sub>/<italic>PM</italic><sub>10</sub>, daily, k = 7. b) <italic>PM</italic><sub>2.5</sub>/<italic>PM</italic><sub>10</sub>, yearly, k = 7. c) <italic>PM</italic><sub>2.5</sub>/<italic>PM</italic><sub>10</sub>, daily, k = 16. d) <italic>PM</italic><sub>2.5</sub>/<italic>PM</italic><sub>10</sub>, yearly, k = 16.</p></caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0340191.g011" xlink:type="simple"/></fig>
<p>The clusters shown in (<xref ref-type="fig" rid="pone.0340191.g011">Fig 11b</xref>) (<italic>PM</italic><sub>2.5</sub>/<italic>PM</italic><sub>10</sub>, yearly, k = 7) exhibit an unusual spatial pattern. Rather than being concentrated in compact regions, they extend over large areas in curved formations, with several noticeable outliers per cluster. Cluster 3 forms a linear pattern that delineates the Pojezierza region in the northwestern part of Poland. Cluster 2 outlines the central lowlands, effectively encircling this geographical area. Cluster 0 is primarily located near the southern border with the exception of stations 52 and 159, which deviate from this pattern. Clusters 1 and 5 appear more randomly distributed, emerging across various geographical zones without a clear or consistent spatial structure.</p>
<p>The cluster distribution in (<xref ref-type="fig" rid="pone.0340191.g011">Fig 11c</xref>) (<italic>PM</italic><sub>2.5</sub>/<italic>PM</italic><sub>10</sub>, daily, k = 16) follows a pattern similar to that observed in (<xref ref-type="fig" rid="pone.0340191.g011">Fig 11a</xref>), albeit on a smaller scale. The dominant cluster is reduced in size but still functions as a background group, encompassing stations that lack a clear geographical association. The remaining clusters are composed of only one to three stations each, making it challenging to identify any distinct spatial characteristics or regional coherence within them.</p>
<p>The cluster distribution in (<xref ref-type="fig" rid="pone.0340191.g011">Fig 11d</xref>) (<italic>PM</italic><sub>2.5</sub>/<italic>PM</italic><sub>10</sub>, yearly, k = 16) appears highly irregular. Clusters are either very small, comprising one to three stations, or extend across broad areas of the country. Even the small clusters are widely dispersed, with examples such as clusters 4, 11, and 2 located far apart from one another. Based solely on the terrain and known geographical regions, it is difficult to discern any consistent spatial patterns or logic in the clustering.</p>
<p>(Fig <xref ref-type="fig" rid="pone.0340191.g012">12</xref>) presents the distribution of <italic>PM</italic><sub>10</sub> values across clusters identified in (<xref ref-type="fig" rid="pone.0340191.g009">Fig 9a</xref>) (<italic>PM</italic><sub>10</sub> daily, k = 7). The y-axis is truncated at 100 <inline-formula id="pone.0340191.e032"><alternatives><graphic id="pone.0340191.e032g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e032" xlink:type="simple"/><mml:math display="inline" id="M32"><mml:mrow><mml:mi>μ</mml:mi><mml:mrow><mml:mi mathvariant="normal">g</mml:mi><mml:mo>/</mml:mo><mml:msup><mml:mi mathvariant="normal">m</mml:mi><mml:mn>3</mml:mn></mml:msup></mml:mrow></mml:mrow></mml:math></alternatives></inline-formula>, with values exceeding this threshold considered outliers. For each cluster, the maximum value is indicated above the corresponding boxplot, along with the percentage of observations classified as outliers. Cluster 0 (northern lowlands) exhibits a notably lower concentration distribution, with a median below 20 <inline-formula id="pone.0340191.e033"><alternatives><graphic id="pone.0340191.e033g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e033" xlink:type="simple"/><mml:math display="inline" id="M33"><mml:mrow><mml:mi>μ</mml:mi><mml:mrow><mml:mi mathvariant="normal">g</mml:mi><mml:mo>/</mml:mo><mml:msup><mml:mi mathvariant="normal">m</mml:mi><mml:mn>3</mml:mn></mml:msup></mml:mrow></mml:mrow></mml:math></alternatives></inline-formula> and the fewest outliers among all clusters. Similarly, Cluster 6 (central lowlands) shows a low median and the lowest maximum outlier value. In contrast, Clusters 1, 3, 4, and 5 (southern highlands) demonstrate significantly higher <italic>PM</italic><sub>10</sub> concentrations, particularly between the median and third quartile. 
These clusters also contain extremely high maximum outlier values ranging from 321.4 to 508.4 <inline-formula id="pone.0340191.e034"><alternatives><graphic id="pone.0340191.e034g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e034" xlink:type="simple"/><mml:math display="inline" id="M34"><mml:mrow><mml:mi>μ</mml:mi><mml:mrow><mml:mi mathvariant="normal">g</mml:mi><mml:mo>/</mml:mo><mml:msup><mml:mi mathvariant="normal">m</mml:mi><mml:mn>3</mml:mn></mml:msup></mml:mrow></mml:mrow></mml:math></alternatives></inline-formula>, with outlier shares reaching up to 5.8%. Cluster 2 (southeastern terrains) represents an intermediate case, with a distribution situated between the low and high concentration groups.</p>
<fig id="pone.0340191.g012" position="float"><object-id pub-id-type="doi">10.1371/journal.pone.0340191.g012</object-id><label>Fig 12</label><caption><title><italic>PM</italic><sub>10</sub> daily, k = 7 data distribution within clusters.</title></caption>
<graphic mimetype="image" position="float" xlink:href="info:doi/10.1371/journal.pone.0340191.g012" xlink:type="simple"/></fig>
</sec>
</sec>
<sec id="sec014" sec-type="conclusions">
<title>Discussion</title>
<p>The selection of the optimal number of clusters (k) was informed by a comparative assessment of four established validation metrics: WCSS, Calinski-Harabasz Index, Silhouette Coefficient, and Davies-Bouldin Index. To enhance interpretability and comparability across <italic>PM</italic><sub>10</sub>, <italic>PM</italic><sub>2.5</sub>, and <italic>PM</italic><sub>2.5</sub>/<italic>PM</italic><sub>10</sub> ratio datasets, all metrics were normalized using the min-max scaling method. The analysis was conducted on daily-resolution data within the range of k = 6 to 20.</p>
<p>WCSS exhibited a monotonic decline, lacking a clear elbow point, and was thus uninformative. The Calinski-Harabasz Index peaked at k = 8 but declined thereafter, stabilizing beyond k = 13. The Silhouette Coefficient showed consistent structure up to k = 12, with a maximum at k = 9 and a local minimum at k = 7. The Davies-Bouldin Index indicated k = 7 as the most favorable, reflecting optimal cluster compactness and separation. Given the divergence between metrics, a multi-criteria approach was adopted, with k = 7 emerging as the most robust solution. Additionally, k = 16 was selected to explore potential alignment between pollution patterns and Poland’s administrative divisions (voivodeships), aiming to assess whether regional governance, policies, or awareness campaigns may manifest in spatially distinct pollution clusters.</p>
<p>The daily <italic>PM</italic><sub>10</sub> cluster map revealed strong spatial structuring, dominated by two major clusters: one spanning northern and western Poland, and the other encompassing the southeast, including Silesia and Małopolska. These clusters align along a northeast–southwest axis. Central Poland was divided between clusters associated with topographic features such as upland transition zones and river valleys. Urban-industrial centers in Silesia and Małopolska formed distinct, well-separated clusters, highlighting the influence of terrain and anthropogenic factors on daily <italic>PM</italic><sub>10</sub> levels.</p>
<p>This observation underscores the importance of topography and urban-industrial density in shaping pollution patterns at daily resolution. In contrast, annual mean <italic>PM</italic><sub>10</sub> data exhibit less pronounced spatial differentiation. For instance, in Małopolska, stations 60 (Kraków–Bujaka), 61 (Kraków–Bulwarowa), and 64 (Niepołomice), all geographically proximate, belonged to the same cluster in the daily data but diverged significantly in the annual data. Notably, station 61 became part of a geographically isolated southern cluster (cluster 6). The divergence arises directly from the differences in time-series comparisons between annual and daily data. The daily data capture short-term dynamic patterns that account for a substantial portion of the similarities between time series. When the data are aggregated to an annual resolution, these short-term variations are lost, and the focus shifts primarily to long-term trends, disregarding local temporal peaks and shifts. Consequently, this leads to divergence in the clustering results.</p>
<p>A similar divergence is observed in the Silesian Voivodeship, where stations 142 (Zabrze), 136 (Knurów), 140 (Rybnik), and 133 (Godów) form a coherent group in the daily data but separate in the annual data, with station 140 isolated into cluster 6. Interestingly, both stations 61 and 140 are located near small water bodies. Likewise, stations 138 (Myszków) and 139 (Pszczyna) exhibit similar patterns.</p>
<p>These findings suggest that local water bodies may influence long-term <italic>PM</italic><sub>10</sub> measurements, potentially due to microclimatic effects or the presence of water vapor acting as a precursor in secondary particle formation. This observation is in line with Tyrso study [<xref ref-type="bibr" rid="pone.0340191.ref037">37</xref>]. However, the impact of water bodies on PM concentrations is not uniform and depends strongly on their spatial relation to urban structures. Evidence from Wuhan suggests a generally positive influence of large water bodies, driven by their scale, absence of emission sources, and restrictions on dense construction [<xref ref-type="bibr" rid="pone.0340191.ref038">38</xref>]. Other research reports both positive and negative effects of rivers depending on environmental conditions [<xref ref-type="bibr" rid="pone.0340191.ref039">39</xref>]. Specifically, proximity to rivers may reduce <italic>PM</italic><sub>2.5</sub> through humidity-driven particle deposition once a critical threshold is exceeded. Still, when humidity remains lower, particles are less likely to settle and may persist in suspension. Moreover, under moderately humid conditions, gaseous precursors can enhance secondary particle formation. Thus, the role of rivers in urban air quality is context-dependent, shaped by both morphology and meteorology.</p>
<sec id="sec015">
<title>Macroregional patterns of daily <italic>PM</italic><sub>10</sub> clusters in Poland</title>
<p>Daily-resolution data were selected for macroregional classification of <italic>PM</italic><sub>10</sub> as annual averages tend to suppress short-term variability associated with anthropogenic activities. In contrast, daily data preserve finer-scale temporal fluctuations, allowing for a more nuanced identification of pollution regimes shaped by both human and environmental drivers. The spatial distribution of daily <italic>PM</italic><sub>10</sub> clusters across Poland, as illustrated in <xref ref-type="fig" rid="pone.0340191.g009">Fig 9a</xref>, reveals distinct macroregional structures aligned with topographic, climatic, and urban-industrial characteristics. Based on the dominant cluster presence and geographical coherence, five macroregions can be delineated:</p>
<list list-type="order">
<list-item>
<p><bold>Western-Baltic Lowland (Majority of Cluster 0)</bold>: This region encompasses the majority of northern and western Poland, including the voivodeships of West Pomerania, Lubusz, Greater Poland, and parts of Kuyavia-Pomerania, Podlaskie and parts of Warmian-Masurian voivodeship. It is characterized by low population density, relatively flat terrain, and limited industrial activity. The dominance of Cluster 0 indicates generally lower <italic>PM</italic><sub>10</sub> levels, likely influenced by favorable dispersion conditions and a lack of large urban emission sources. Cluster 0 is characterized by the lowest average concentration values, with medians below 20 <inline-formula id="pone.0340191.e035"><alternatives><graphic id="pone.0340191.e035g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e035" xlink:type="simple"/><mml:math display="inline" id="M35"><mml:mrow><mml:mi>μ</mml:mi></mml:mrow></mml:math></alternatives></inline-formula>g/m<sup>3</sup>. In the boxplot, it displays the narrowest interquartile range and only 0.4% of outliers, indicating a relatively minor influence of sudden external factors. This cluster is located in regions with the highest forest cover in the country—Lubusz Voivodeship with nearly 50% forested area, as well as Pomeranian and West Pomeranian Voivodeships, each with approximately 35% [<xref ref-type="bibr" rid="pone.0340191.ref040">40</xref>].</p>
</list-item>
<list-item>
<p><bold>Carpathian Foothill-Upland (Cluster 2)</bold>: Spanning the Carpathian foothills and uplands of Lesser Poland and Subcarpathia, this macroregion exhibits higher <italic>PM</italic><sub>10</sub> concentrations associated with Cluster 2. The distribution aligns with complex topography, frequent thermal inversions, and dense urbanization in valleys, contributing to pollutant accumulation. Cluster 2 is characterized by a relatively narrow boxplot, with the upper quartile around 35 <inline-formula id="pone.0340191.e036"><alternatives><graphic id="pone.0340191.e036g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e036" xlink:type="simple"/><mml:math display="inline" id="M36"><mml:mrow><mml:mi>μ</mml:mi><mml:msup><mml:mtext>g/m</mml:mtext><mml:mn>3</mml:mn></mml:msup></mml:mrow></mml:math></alternatives></inline-formula> and the lower quartile at approximately 17 <inline-formula id="pone.0340191.e037"><alternatives><graphic id="pone.0340191.e037g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e037" xlink:type="simple"/><mml:math display="inline" id="M37"><mml:mrow><mml:mi>μ</mml:mi><mml:msup><mml:mtext>g/m</mml:mtext><mml:mn>3</mml:mn></mml:msup></mml:mrow></mml:math></alternatives></inline-formula>, and a median shifted toward 20 <inline-formula id="pone.0340191.e038"><alternatives><graphic id="pone.0340191.e038g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e038" xlink:type="simple"/><mml:math display="inline" id="M38"><mml:mrow><mml:mi>μ</mml:mi><mml:msup><mml:mtext>g/m</mml:mtext><mml:mn>3</mml:mn></mml:msup></mml:mrow></mml:math></alternatives></inline-formula>. The proportion of outliers is relatively low (1.2%), suggesting greater variability and a stronger influence of topographic factors compared to the lowland-dominated Cluster 0.</p>
</list-item>
<list-item>
<p><bold>Central Transitional Zone (Clusters 3 and 6)</bold>: This central zone includes parts of Mazovia, Łódź, and Kuyavia, and is subdivided by topographic and hydrological characteristics. We propose to divide it into two subzones:<list list-type="bullet"><list-item><p><bold>Upland-Lowland Transition Zone within Cluster 3.</bold> This cluster is characterized by a boxplot ranging from 20 <inline-formula id="pone.0340191.e039"><alternatives><graphic id="pone.0340191.e039g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e039" xlink:type="simple"/><mml:math display="inline" id="M39"><mml:mrow><mml:mi>μ</mml:mi><mml:msup><mml:mtext>g/m</mml:mtext><mml:mn>3</mml:mn></mml:msup></mml:mrow></mml:math></alternatives></inline-formula> to 40 <inline-formula id="pone.0340191.e040"><alternatives><graphic id="pone.0340191.e040g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e040" xlink:type="simple"/><mml:math display="inline" id="M40"><mml:mrow><mml:mi>μ</mml:mi><mml:msup><mml:mtext>g/m</mml:mtext><mml:mn>3</mml:mn></mml:msup></mml:mrow></mml:math></alternatives></inline-formula>, with the median shifted toward lower values. The maximum concentration reaches 399 <inline-formula id="pone.0340191.e041"><alternatives><graphic id="pone.0340191.e041g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e041" xlink:type="simple"/><mml:math display="inline" id="M41"><mml:mrow><mml:mi>μ</mml:mi><mml:msup><mml:mtext>g/m</mml:mtext><mml:mn>3</mml:mn></mml:msup></mml:mrow></mml:math></alternatives></inline-formula>, and the proportion of outliers is 2.2%. This region encompasses the transitional belt between central uplands and northern lowlands, including areas such as the Przedbórz Upland, the South Masurian Upland and adjacent plains. 
The clustering pattern suggests that <italic>PM</italic><sub>10</sub> dynamics here are shaped by the interplay of elevation gradients, orographic influences, and regional air circulation.</p>
</list-item>
<list-item>
<p><bold>Lake and River District Zone within Cluster 6.</bold> This cluster also represents a transitional area; however, due to its geographic location, it clearly differs in terms of average concentration values. The box width is comparable to that of Cluster 3, but it is shifted toward lower values. The maximum concentrations are also lower, which may be associated with a stronger influence of unrestricted airflow dynamics. Spanning the river valleys of central Poland—such as the Noteć basin—and extending into the Pojezierze lake districts, this macroregion is shaped by terrain depressions and hydrological features. These local geographical and meteorological conditions appear to modulate pollutant dispersion and retention, contributing to the distinctive <italic>PM</italic><sub>10</sub> cluster.</p>
</list-item>
</list>
</p>
</list-item>
<list-item>
<p><bold>Urban-Industrial Southern (Clusters 1, 4 and 5)</bold>: Concentrated in the Upper Silesian Industrial Area and urban parts of Lesser Poland, this macroregion includes heavily urbanized and industrialized areas such as Katowice, Kraków, Rybnik, and Zabrze. These clusters reflect localized emission sources and complex urban topography, leading to higher and more variable <italic>PM</italic><sub>10</sub> concentrations. Clusters 1, 4, and 5 exhibit very similar characteristics, with Cluster 4 standing out due to significantly poorer air quality indicators. It has the highest proportion of outliers—over 5.8%—and extremely high maximum concentrations, reaching nearly 510 <inline-formula id="pone.0340191.e042"><alternatives><graphic id="pone.0340191.e042g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e042" xlink:type="simple"/><mml:math display="inline" id="M42"><mml:mrow><mml:mi>μ</mml:mi><mml:msup><mml:mtext>g/m</mml:mtext><mml:mn>3</mml:mn></mml:msup></mml:mrow></mml:math></alternatives></inline-formula>. This cluster is located in the industrial region of Poland, adjacent to the Upper Silesian Industrial Area, where both anthropogenic activity and local topography contribute to elevated values. Clusters 1 and 5 also show substantially high maximum concentrations, exceeding 300 <inline-formula id="pone.0340191.e043"><alternatives><graphic id="pone.0340191.e043g" mimetype="image" position="anchor" xlink:href="info:doi/10.1371/journal.pone.0340191.e043" xlink:type="simple"/><mml:math display="inline" id="M43"><mml:mrow><mml:mi>μ</mml:mi><mml:msup><mml:mtext>g/m</mml:mtext><mml:mn>3</mml:mn></mml:msup></mml:mrow></mml:math></alternatives></inline-formula>.</p>
</list-item>
</list>
<p>A comparable approach was adopted in a study on <italic>PM</italic><sub>2.5</sub> pollution in China [<xref ref-type="bibr" rid="pone.0340191.ref041">41</xref>]. China, much like Poland, has a highly diverse topography, which makes it a valuable case for comparison. In their analysis, the authors employed a combination of frequent itemset mining and agglomerative hierarchical clustering to identify recurrent patterns of air pollution and to delineate groups of regions with shared characteristics. This procedure resulted in 13 clusters, which were subsequently consolidated into three broader divisions. Each division was distinguished by a unique configuration of climate, degree of urbanization, topography and air pollution patterns. On the basis of these findings, the authors concluded that air pollution management and policy design should move beyond administrative boundaries and instead focus on regions defined by common pollution profiles. This perspective is closely aligned with the rationale of the present study.</p>
<p>These macroregional divisions highlight the strong spatial heterogeneity of daily <italic>PM</italic><sub>10</sub> concentrations in Poland and emphasize the influence of both physiographic and anthropogenic factors. Importantly, such spatial structure is markedly less pronounced in annual-mean data, underscoring the value of high-resolution temporal analyses. Furthermore, discrepancies observed between daily and annual cluster assignments—for example, in Kraków (Bujaka vs. Bulwarowa) or the Silesian region (Zabrze vs. Rybnik)—suggest that local microclimatic conditions and proximity to small water bodies may significantly affect <italic>PM</italic><sub>10</sub> variability. These findings align with previous studies suggesting that water vapor can act as a precursor in particulate matter formation, particularly in the presence of stagnant atmospheric conditions.</p>
<p>Spatiotemporal analysis for 16 clusters of <italic>PM</italic><sub>10</sub> concentrations does not confirm the initial hypothesis that regional policies at the voivodeship level influence the formation of clusters. However, this does not imply that such policies have no impact on pollution levels. The case of Kraków demonstrates that implementing a policy limited to a single subregion does not lead to a significant overall improvement, due to the inflow of pollutants from neighboring areas [<xref ref-type="bibr" rid="pone.0340191.ref015">15</xref>]. This phenomenon is largely driven by the local topography. This observation is further supported by a study on <italic>PM</italic><sub>2.5</sub> in China [<xref ref-type="bibr" rid="pone.0340191.ref041">41</xref>], which found that air pollution clusters do not correspond to administrative boundaries or the policies associated with them. The authors highlight that, due to the transboundary transport of <italic>PM</italic><sub>2.5</sub>, effective mitigation requires coordinated control measures across multiple administrative regions. Consequently, regions defined based on air pollution data extend beyond administrative boundaries and are largely independent of them. Together, these findings reinforce the conclusion from our analysis of <italic>PM</italic><sub>10</sub> clusters: policies limited to a single administrative unit may be insufficient to control air pollution effectively. In the data segmented into 16 clusters, it is evident that in the southern, mountainous regions of Poland—particularly in Silesia and Lesser Poland—there is a tendency for isolated clusters of daily averages to form. This is likely linked to local accumulation in topographical depressions. In contrast, in the northern lowlands, the situation appears to be much more stable.</p>
<p>Annual average data show cluster transitions, but the overall spatial stability of the groupings remains high. Due to the limited number of spatial locations with <italic>PM</italic><sub>2.5</sub> data, the high-resolution spatial macro-analysis did not include these results for macroregion delineation, both for k = 7 and k = 16. An interesting observation from the <italic>PM</italic><sub>2.5</sub> dataset is that annual averaging shows greater spatial variability compared to <italic>PM</italic><sub>10</sub>, possibly due to differences in particle weight, transport mechanisms, and the formation of secondary pollutants in the atmosphere. However, a similar north-south division, as observed with <italic>PM</italic><sub>10</sub>, is still noticeable. For 16 clusters, the conclusions are analogous: Silesia remains strongly isolated, while Lesser Poland shows significant spatial granularity.</p>
<p>A particularly intriguing result arises from the clustering of daily data and <italic>PM</italic><sub>2.5</sub>/<italic>PM</italic><sub>10</sub> ratios—nearly the entire country falls into a single cluster, with the exception of a small grouping in the southeast (cluster 4). The remaining clusters consist of isolated individual stations. This suggests that, at high temporal resolution, the origin of particulate matter across Poland is relatively uniform. The differences in concentration appear to be driven less by emission sources and more by transport, removal, and dispersion mechanisms, which affect the spatial density of particulate matter. In the annual perspective with k = 7, there is greater spatial variability: strongly isolated southern clusters and more dispersed northern clusters. This directly reflects the capacity of annual data to reveal dominant trends, such as industrial or natural emissions that may be influenced by the energy consumption [<xref ref-type="bibr" rid="pone.0340191.ref042">42</xref>]. Clustering into 16 regions again shows relatively high heterogeneity across Poland, although naturally lower than in the case of k = 7 clusters. Still, annual averaging reveals increased spatial heterogeneity, with significantly greater differentiation in the south and continued strong distinctiveness of the Silesian voivodeship, along with additional internal diversity within Lesser Poland.</p>
</sec>
<sec id="sec016">
<title>Future research</title>
<p>Future research will involve a more in-depth analysis incorporating additional features, such as meteorological variables and, where possible, indicators related to human activity, in order to further refine regionalization and the analysis of feature importance, following the methodology proposed in [<xref ref-type="bibr" rid="pone.0340191.ref043">43</xref>]. Subsequently, the results of these studies should be integrated with air quality management and funding allocation plans to enable more effective and rapid improvement of atmospheric conditions through targeted management and resource transfer to areas that most critically require intervention—either because they are highly polluted or are significant sources of particulate matter that disperses to surrounding regions. Regarding transitions between clusters, seasonal studies are planned along with the augmentation of reference stations and low-cost sensor (LCS) networks to more accurately delineate regional boundaries. Such a broad, interdisciplinary, and data-driven approach, combined with social and environmental considerations, will allow for the design of more effective national and local policies and support efforts to protect public health.</p>
</sec>
<sec id="sec017">
<title>Urban planning and policies</title>
<p>Continuous data collection and analysis represent essential components in the pursuit of improved air quality in Poland. The application of machine learning techniques enhances analytical capabilities and facilitates the identification of underlying patterns in air pollution. Insights derived from such analyses may serve as a foundation for the development of urban planning strategies consistent with the principles of smart city design. Furthermore, local governments could leverage these findings to secure targeted funding aimed at mitigating region-specific air quality challenges. For instance, areas characterized by elevated <italic>PM</italic><sub>2.5</sub> emissions resulting from vehicular traffic could prioritize traffic optimization measures to reduce pollution levels, while regions dominated by heavy industry might focus on implementing technologies that limit the dispersion of particulate matter.</p>
<p><bold>A framework for developing nationwide policies.</bold> The Polish government is actively implementing new policies and regulatory guidelines aimed at mitigating air pollution. The results of our current and future studies may significantly contribute to the formulation of these documents by ensuring that policy measures are data-driven, evidence-based, and tailored to the specific environmental and socio-economic conditions of individual regions and are in line with the country-level source analysis [<xref ref-type="bibr" rid="pone.0340191.ref042">42</xref>]. An alternative approach, similar to what was proposed by Zhang et al. [<xref ref-type="bibr" rid="pone.0340191.ref041">41</xref>], involves establishing pollution control zones that transcend administrative boundaries. In the context of Poland, this would entail implementing air quality management measures extending beyond individual voivodeships. Such a system would introduce joint inter-voivodeship control areas, delineated according to macroregions identified through data-driven analysis.</p>
</sec>
<sec id="sec018">
<title>Limitations</title>
<p>This study has several limitations. Large areas of the investigated region are underrepresented, with southern Poland being characterized by a higher station density compared to the northern part of the country.</p>
<p>Another limitation concerns the presence of substantial data gaps in some stations, occasionally spanning long time periods. These missing data were unevenly distributed both temporally and spatially, which may have influenced the clustering procedure and its subsequent division into macroregions. Although advanced imputation methods were applied to reduce spatial bias, imputation is inherently imperfect and certain relevant information may not have been fully captured in the grouping process.</p>
<p>The study is constrained by the use of the k-means clustering algorithm, which is associated with several methodological limitations [<xref ref-type="bibr" rid="pone.0340191.ref044">44</xref>]. One issue concerns the initialization procedure, as the algorithm begins by randomly selecting the initial positions of the cluster centroids. This stochastic component may introduce variability in the results, depending on the choice of the random state parameter. Furthermore, k-means inherently assumes the presence of compact, hyper-spherical clusters, which limits its effectiveness when applied to datasets exhibiting more complex or irregular shapes.</p>
</sec>
</sec>
<sec id="sec019" sec-type="conclusions">
<title>Conclusions</title>
<p>This study presents the first large-scale, high-resolution spatiotemporal cluster analysis of <italic>PM</italic><sub>10</sub> and <italic>PM</italic><sub>2.5</sub> in Poland, based on over 13 million observations from reference-grade monitoring stations over 9 years. By combining absolute concentrations and <italic>PM</italic><sub>2.5</sub>/<italic>PM</italic><sub>10</sub> ratios, it provides novel insights into aerosol behavior across diverse physiogeographic regions. Key findings include:</p>
<list list-type="bullet">
<list-item>
<p>Dual-resolution clustering:<list list-type="simple"><list-item><p>– <italic>k</italic> = 7 was applied to capture broad macroregional differentiation.</p>
</list-item>
<list-item>
<p>– <italic>k</italic> = 16 was originally intended to explore alignment with administrative boundaries (voivodeships) and local urban-industrial dynamics. However, the resulting clusters largely do not correspond to voivodeship borders, highlighting the dominance of physiographic and emission-driven patterns over administrative divisions.</p>
</list-item>
</list>
</p>
</list-item>
<list-item>
<p>Physiographic patterns:<list list-type="simple"><list-item><p>– Southern mountains (Silesia, Lesser Poland) show high granularity and persistent isolation due to topography and thermal inversions.</p>
</list-item>
<list-item>
<p>– Northern lowlands form stable, homogeneous clusters shaped by open airflow and lower emissions.</p>
</list-item>
</list>
</p>
</list-item>
<list-item>
<p><italic>PM</italic><sub>10</sub> shows clear daily clustering influenced by topography and microclimate, whereas annual means smooth short-term accumulation events.</p>
</list-item>
<list-item>
<p><italic>PM</italic><sub>2.5</sub> exhibits more homogeneous daily patterns but stronger spatial heterogeneity annually, reflecting secondary aerosol formation and regional chemical regimes.</p>
</list-item>
<list-item>
<p>Urban-industrial hotspots (Katowice, Kraków, Rybnik) are distinguished by both intensive emissions and geomorphological constraints.</p>
</list-item>
<list-item>
<p>Lack of systematic alignment with voivodeship boundaries highlights the need for supra-regional coordination, including harmonized standards, joint monitoring, and mitigation strategies.</p>
</list-item>
</list>
<p>Based on these results, four macroregions are proposed for PM pollution management in Poland: Western-Baltic Lowland, Carpathian Foothill-Upland, Central Transitional Zone (with sub-divisions), and Urban-Industrial Southern. The framework emphasizes the need for targeted interventions coordinated at a supra-regional level, possibly with centralized allocation of resources from the national level to sub-regions. The current voivodeship-based governance does not align with actual pollution patterns, which may result in inefficiencies in funding and mitigation efforts. A macroregional approach can potentially enable a more effective distribution of financial and technical resources, ensuring interventions are proportional to emission burdens and physiographic constraints, rather than administrative boundaries.</p>
</sec>
<sec id="sec020" sec-type="supplementary-material">
<title>Supporting information</title>
<supplementary-material id="pone.0340191.s001" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document" position="float" xlink:href="info:doi/10.1371/journal.pone.0340191.s001" xlink:type="simple">
<label>S1 File</label>
<caption>
<title/>
<p>(DOCX)</p>
</caption>
</supplementary-material>
</sec>
</body>
<back>
<ack>
<p>This research was supported as a part of the statutory project by AGH University of Science and Technology, Faculty of Geology, Geophysics and Environmental Protection. Research project partly supported by program “Excellence initiative – research university” for the AGH University.</p>
<p>Use of Artificial Intelligence tools</p>
<p>The authors used the AI language model ChatGPT (developed by OpenAI) to assist with English language editing during the preparation of this manuscript. The tool was used solely to improve grammar, clarity, and style of the text. All scientific content, including hypotheses, interpretations, results, conclusions, and implications, reflects the authors’ original ideas and has been critically reviewed and verified by the authors to ensure accuracy and validity. No content was generated by AI in relation to data analysis, interpretation, or primary research findings.</p>
</ack>
<ref-list>
<title>References</title>
<ref id="pone.0340191.ref001"><label>1</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Jonek-Kowalska</surname> <given-names>I</given-names></name>. <article-title>Assessing the effectiveness of air quality improvements in Polish cities aspiring to be sustainably smart</article-title>. <source>Smart Cities.</source> <year>2023</year>;<volume>6</volume>(<issue>1</issue>):<fpage>510</fpage>–<lpage>30</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3390/smartcities6010024" xlink:type="simple">10.3390/smartcities6010024</ext-link></comment></mixed-citation></ref>
<ref id="pone.0340191.ref002"><label>2</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Wesz</surname> <given-names>JGB</given-names></name>, <name name-style="western"><surname>Miron</surname> <given-names>LIG</given-names></name>, <name name-style="western"><surname>Delsante</surname> <given-names>I</given-names></name>, <name name-style="western"><surname>Tzortzopoulos</surname> <given-names>P</given-names></name>. <article-title>Urban quality of life: a systematic literature review</article-title>. <source>Urban Science.</source> <year>2023</year>;<volume>7</volume>(<issue>2</issue>):<fpage>56</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3390/urbansci7020056" xlink:type="simple">10.3390/urbansci7020056</ext-link></comment></mixed-citation></ref>
<ref id="pone.0340191.ref003"><label>3</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Thurston</surname> <given-names>GD</given-names></name>, <name name-style="western"><surname>Kipen</surname> <given-names>H</given-names></name>, <name name-style="western"><surname>Annesi-Maesano</surname> <given-names>I</given-names></name>, <name name-style="western"><surname>Balmes</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Brook</surname> <given-names>RD</given-names></name>, <name name-style="western"><surname>Cromar</surname> <given-names>K</given-names></name>, <etal>et al</etal>. <article-title>A joint ERS/ATS policy statement: what constitutes an adverse health effect of air pollution? An analytical framework</article-title>. <source>Eur Respir J.</source> <year>2017</year>;<volume>49</volume>(<issue>1</issue>):<fpage>1600419</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1183/13993003.00419-2016" xlink:type="simple">10.1183/13993003.00419-2016</ext-link></comment> <object-id pub-id-type="pmid">28077473</object-id></mixed-citation></ref>
<ref id="pone.0340191.ref004"><label>4</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Jurek</surname> <given-names>Ł</given-names></name>. <article-title>Ageing of the working-age population in Poland: a theoretical perspective and practical implications</article-title>. <source>Polityka Społeczna.</source> <year>2024</year>;<volume>602</volume>(<issue>7</issue>):<fpage>1</fpage>–<lpage>7</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.5604/01.3001.0054.7577" xlink:type="simple">10.5604/01.3001.0054.7577</ext-link></comment></mixed-citation></ref>
<ref id="pone.0340191.ref005"><label>5</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Danek</surname> <given-names>T</given-names></name>, <name name-style="western"><surname>Zaręba</surname> <given-names>M</given-names></name>. <article-title>The use of public data from low-cost sensors for the geospatial analysis of air pollution from solid fuel heating during the COVID-19 pandemic spring period in Krakow, Poland</article-title>. <source>Sensors (Basel).</source> <year>2021</year>;<volume>21</volume>(<issue>15</issue>):<fpage>5208</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3390/s21155208" xlink:type="simple">10.3390/s21155208</ext-link></comment> <object-id pub-id-type="pmid">34372442</object-id></mixed-citation></ref>
<ref id="pone.0340191.ref006"><label>6</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>da Costa</surname> <given-names>AZ</given-names></name>, <name name-style="western"><surname>Aniceto</surname> <given-names>JPS</given-names></name>, <name name-style="western"><surname>Lopes</surname> <given-names>M</given-names></name>. <article-title>Low-cost sensor network for air quality assessment in Cabo Verde Islands</article-title>. <source>Sensors (Basel).</source> <year>2024</year>;<volume>24</volume>(<issue>23</issue>):<fpage>7656</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3390/s24237656" xlink:type="simple">10.3390/s24237656</ext-link></comment> <object-id pub-id-type="pmid">39686191</object-id></mixed-citation></ref>
<ref id="pone.0340191.ref007"><label>7</label><mixed-citation publication-type="other" xlink:type="simple">European Parliament. Directive 2004/107/EC of the European Parliament and of the Council relating to arsenic, cadmium, mercury, nickel and polycyclic aromatic hydrocarbons in ambient air. 2004. <ext-link ext-link-type="uri" xlink:href="https://eur-lex.europa.eu/legal-content/en/ALL/?uri=CELEX:32008L0050" xlink:type="simple">https://eur-lex.europa.eu/legal-content/en/ALL/?uri=CELEX:32008L0050</ext-link></mixed-citation></ref>
<ref id="pone.0340191.ref008"><label>8</label><mixed-citation publication-type="other" xlink:type="simple">European Parliament. Directive 2008/50/EC of the European Parliament and of the Council on Ambient Air Quality and Cleaner Air for Europe. 2008. <ext-link ext-link-type="uri" xlink:href="http://eur-lex.europa.eu/legal-content/en/ALL/?uri=CELEX:32008L0050" xlink:type="simple">http://eur-lex.europa.eu/legal-content/en/ALL/?uri=CELEX:32008L0050</ext-link></mixed-citation></ref>
<ref id="pone.0340191.ref009"><label>9</label><mixed-citation publication-type="other" xlink:type="simple">Council of the EU and the European Council. Air Pollution in the EU: Facts and Figures. 2024. <ext-link ext-link-type="uri" xlink:href="https://www.consilium.europa.eu/en/infographics/air-pollution-in-the-eu/" xlink:type="simple">https://www.consilium.europa.eu/en/infographics/air-pollution-in-the-eu/</ext-link></mixed-citation></ref>
<ref id="pone.0340191.ref010"><label>10</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Zaręba</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Danek</surname> <given-names>T</given-names></name>. <article-title>Analysis of air pollution migration during COVID-19 lockdown in Krakow, Poland</article-title>. <source>Aerosol Air Qual Res.</source> <year>2022</year>;<volume>22</volume>(<issue>3</issue>):<fpage>210275</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.4209/aaqr.210275" xlink:type="simple">10.4209/aaqr.210275</ext-link></comment></mixed-citation></ref>
<ref id="pone.0340191.ref011"><label>11</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Zareba</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Weglinska</surname> <given-names>E</given-names></name>, <name name-style="western"><surname>Danek</surname> <given-names>T</given-names></name>. <article-title>Air pollution seasons in urban moderate climate areas through big data analytics</article-title>. <source>Sci Rep.</source> <year>2024</year>;<volume>14</volume>(<issue>1</issue>):<fpage>3058</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1038/s41598-024-52733-w" xlink:type="simple">10.1038/s41598-024-52733-w</ext-link></comment> <object-id pub-id-type="pmid">38321084</object-id></mixed-citation></ref>
<ref id="pone.0340191.ref012"><label>12</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Nandi</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Counts</surname> <given-names>N</given-names></name>, <name name-style="western"><surname>Bröker</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Malik</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Chen</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Han</surname> <given-names>R</given-names></name>, <etal>et al</etal>. <article-title>Cost of care for Alzheimer’s disease and related dementias in the United States: 2016 to 2060</article-title>. <source>NPJ Aging.</source> <year>2024</year>;<volume>10</volume>(<issue>1</issue>):<fpage>13</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1038/s41514-024-00136-6" xlink:type="simple">10.1038/s41514-024-00136-6</ext-link></comment> <object-id pub-id-type="pmid">38331952</object-id></mixed-citation></ref>
<ref id="pone.0340191.ref013"><label>13</label><mixed-citation publication-type="other" xlink:type="simple">Statistics Poland. Poland in the European Union 2024. Statistics Poland; 2024. <ext-link ext-link-type="uri" xlink:href="https://stat.gov.pl/en/topics/other-studies/other-aggregated-studies/poland-in-the-european-union-2024.10.18.html" xlink:type="simple">https://stat.gov.pl/en/topics/other-studies/other-aggregated-studies/poland-in-the-european-union-2024.10.18.html</ext-link></mixed-citation></ref>
<ref id="pone.0340191.ref014"><label>14</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Zgłobicki</surname> <given-names>W</given-names></name>, <name name-style="western"><surname>Baran-Zgłobicka</surname> <given-names>B</given-names></name>. <article-title>Air pollution in major Polish cities in the period 2005–2021: intensity, effects and attempts to reduce it</article-title>. <source>Environ Res.</source> <year>2024</year>;<volume>240</volume>(Pt 2):<fpage>117497</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1016/j.envres.2023.117497" xlink:type="simple">10.1016/j.envres.2023.117497</ext-link></comment> <object-id pub-id-type="pmid">37914007</object-id></mixed-citation></ref>
<ref id="pone.0340191.ref015"><label>15</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Danek</surname> <given-names>T</given-names></name>, <name name-style="western"><surname>Weglinska</surname> <given-names>E</given-names></name>, <name name-style="western"><surname>Zareba</surname> <given-names>M</given-names></name>. <article-title>The influence of meteorological factors and terrain on air pollution concentration and migration: a geostatistical case study from Krakow, Poland</article-title>. <source>Sci Rep.</source> <year>2022</year>;<volume>12</volume>(<issue>1</issue>):<fpage>11050</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1038/s41598-022-15160-3" xlink:type="simple">10.1038/s41598-022-15160-3</ext-link></comment> <object-id pub-id-type="pmid">35773386</object-id></mixed-citation></ref>
<ref id="pone.0340191.ref016"><label>16</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Czernecki</surname> <given-names>B</given-names></name>, <name name-style="western"><surname>Półrolniczak</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Kolendowicz</surname> <given-names>L</given-names></name>, <name name-style="western"><surname>Marosz</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Kendzierski</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Pilguj</surname> <given-names>N</given-names></name>. <article-title>Influence of the atmospheric conditions on PM10 concentrations in Poznań, Poland</article-title>. <source>J Atmos Chem.</source> <year>2016</year>;<volume>74</volume>(<issue>1</issue>):<fpage>115</fpage>–<lpage>39</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1007/s10874-016-9345-5" xlink:type="simple">10.1007/s10874-016-9345-5</ext-link></comment></mixed-citation></ref>
<ref id="pone.0340191.ref017"><label>17</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Zhang</surname> <given-names>L</given-names></name>, <name name-style="western"><surname>Guo</surname> <given-names>X</given-names></name>, <name name-style="western"><surname>Zhao</surname> <given-names>T</given-names></name>, <name name-style="western"><surname>Xu</surname> <given-names>X</given-names></name>, <name name-style="western"><surname>Zheng</surname> <given-names>X</given-names></name>, <name name-style="western"><surname>Li</surname> <given-names>Y</given-names></name>, <etal>et al</etal>. <article-title>Effect of large topography on atmospheric environment in Sichuan Basin: A climate analysis based on changes in atmospheric visibility</article-title>. <source>Front Earth Sci.</source> <year>2022</year>;<volume>10</volume>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/feart.2022.997586" xlink:type="simple">10.3389/feart.2022.997586</ext-link></comment></mixed-citation></ref>
<ref id="pone.0340191.ref018"><label>18</label><mixed-citation publication-type="other" xlink:type="simple">Chief Inspectorate of Environmental Protection. Measurement data archives. Chief Inspectorate of Environmental Protection. 2024. <ext-link ext-link-type="uri" xlink:href="https://powietrze.gios.gov.pl/pjp/archives" xlink:type="simple">https://powietrze.gios.gov.pl/pjp/archives</ext-link></mixed-citation></ref>
<ref id="pone.0340191.ref019"><label>19</label><mixed-citation publication-type="other" xlink:type="simple">OpenStreetMap contributors. OpenStreetMap. 2025. <ext-link ext-link-type="uri" xlink:href="https://www.openstreetmap.org" xlink:type="simple">https://www.openstreetmap.org</ext-link></mixed-citation></ref>
<ref id="pone.0340191.ref020"><label>20</label><mixed-citation publication-type="other" xlink:type="simple">General Directorate for Geodesy and Cartography (GUGiK). Geoportal.gov.pl. 2025. <ext-link ext-link-type="uri" xlink:href="https://mapy.geoportal.gov.pl" xlink:type="simple">https://mapy.geoportal.gov.pl</ext-link></mixed-citation></ref>
<ref id="pone.0340191.ref021"><label>21</label><mixed-citation publication-type="book" xlink:type="simple"><name name-style="western"><surname>Freedman</surname> <given-names>D</given-names></name>, <name name-style="western"><surname>Pisani</surname> <given-names>R</given-names></name>, <name name-style="western"><surname>Purves</surname> <given-names>R</given-names></name>. <source>Statistics (International Student Edition)</source>. 4th ed. <publisher-loc>New York</publisher-loc>: <publisher-name>W.W. Norton and Company</publisher-name>; <year>2007</year>.</mixed-citation></ref>
<ref id="pone.0340191.ref022"><label>22</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Murtagh</surname> <given-names>F</given-names></name>, <name name-style="western"><surname>Legendre</surname> <given-names>P</given-names></name>. <article-title>Ward’s hierarchical agglomerative clustering method: which algorithms implement Ward’s criterion?</article-title>. <source>J Classif.</source> <year>2014</year>;<volume>31</volume>(<issue>3</issue>):<fpage>274</fpage>–<lpage>95</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1007/s00357-014-9161-z" xlink:type="simple">10.1007/s00357-014-9161-z</ext-link></comment></mixed-citation></ref>
<ref id="pone.0340191.ref023"><label>23</label><mixed-citation publication-type="other" xlink:type="simple">Sapovadia D. K-means clustering. 2021. <ext-link ext-link-type="uri" xlink:href="https://darrsheni-sapovadia26.medium.com/k-means-clustering-96711652a0e9" xlink:type="simple">https://darrsheni-sapovadia26.medium.com/k-means-clustering-96711652a0e9</ext-link></mixed-citation></ref>
<ref id="pone.0340191.ref024"><label>24</label><mixed-citation publication-type="other" xlink:type="simple">Howarth J. 54 New Artificial Intelligence Statistics. 2024. <ext-link ext-link-type="uri" xlink:href="https://explodingtopics.com/blog/ai-statistics" xlink:type="simple">https://explodingtopics.com/blog/ai-statistics</ext-link></mixed-citation></ref>
<ref id="pone.0340191.ref025"><label>25</label><mixed-citation publication-type="other" xlink:type="simple">Bartley K. Big Data Statistics: How Much Data is There in the World? 2024. <ext-link ext-link-type="uri" xlink:href="https://rivery.io/blog/big-data-statistics-how-much-data-is-there-in-the-world/" xlink:type="simple">https://rivery.io/blog/big-data-statistics-how-much-data-is-there-in-the-world/</ext-link></mixed-citation></ref>
<ref id="pone.0340191.ref026"><label>26</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Deelman</surname> <given-names>E</given-names></name>, <name name-style="western"><surname>Mandal</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Jiang</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Sakellariou</surname> <given-names>R</given-names></name>. <article-title>The role of machine learning in scientific workflows</article-title>. <source>The International Journal of High Performance Computing Applications.</source> <year>2019</year>;<volume>33</volume>(<issue>6</issue>):<fpage>1128</fpage>–<lpage>39</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1177/1094342019852127" xlink:type="simple">10.1177/1094342019852127</ext-link></comment></mixed-citation></ref>
<ref id="pone.0340191.ref027"><label>27</label><mixed-citation publication-type="other" xlink:type="simple">Chen M. Machine learning and analytics: an expert guide. 2024. <ext-link ext-link-type="uri" xlink:href="https://www.oracle.com/business-analytics/machine-learning/" xlink:type="simple">https://www.oracle.com/business-analytics/machine-learning/</ext-link></mixed-citation></ref>
<ref id="pone.0340191.ref028"><label>28</label><mixed-citation publication-type="other" xlink:type="simple">Sen J. Machine learning - algorithms, models and applications. Rijeka: IntechOpen; 2021.</mixed-citation></ref>
<ref id="pone.0340191.ref029"><label>29</label><mixed-citation publication-type="other" xlink:type="simple">Jäger S, Biessmann F. From data imputation to data cleaning—automated cleaning of tabular data improves downstream predictive performance. In: Proceedings of the 27th International Conference on Artificial Intelligence and Statistics. 2024. p. 3394–402. <ext-link ext-link-type="uri" xlink:href="https://proceedings.mlr.press/v238/jager24a.html" xlink:type="simple">https://proceedings.mlr.press/v238/jager24a.html</ext-link></mixed-citation></ref>
<ref id="pone.0340191.ref030"><label>30</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Bezdek</surname> <given-names>JC</given-names></name>, <name name-style="western"><surname>Ehrlich</surname> <given-names>R</given-names></name>, <name name-style="western"><surname>Full</surname> <given-names>W</given-names></name>. <article-title>FCM: the fuzzy c-means clustering algorithm</article-title>. <source>Computers &amp; Geosciences.</source> <year>1984</year>;<volume>10</volume>(2–3):<fpage>191</fpage>–<lpage>203</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1016/0098-3004(84)90020-7" xlink:type="simple">10.1016/0098-3004(84)90020-7</ext-link></comment></mixed-citation></ref>
<ref id="pone.0340191.ref031"><label>31</label><mixed-citation publication-type="other" xlink:type="simple">Arthur D, Vassilvitskii S. K-means++: the advantages of careful seeding. In: Proceedings of the eighteenth annual ACM-SIAM symposium on discrete algorithms. 2007. p. 1027–35.</mixed-citation></ref>
<ref id="pone.0340191.ref032"><label>32</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Petitjean</surname> <given-names>F</given-names></name>, <name name-style="western"><surname>Ketterlin</surname> <given-names>A</given-names></name>, <name name-style="western"><surname>Gançarski</surname> <given-names>P</given-names></name>. <article-title>A global averaging method for dynamic time warping, with applications to clustering</article-title>. <source>Pattern Recognition.</source> <year>2011</year>;<volume>44</volume>(<issue>3</issue>):<fpage>678</fpage>–<lpage>93</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1016/j.patcog.2010.09.013" xlink:type="simple">10.1016/j.patcog.2010.09.013</ext-link></comment></mixed-citation></ref>
<ref id="pone.0340191.ref033"><label>33</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Jain</surname> <given-names>AK</given-names></name>. <article-title>Data clustering: 50 years beyond K-means</article-title>. <source>Pattern Recognition Letters.</source> <year>2010</year>;<volume>31</volume>(<issue>8</issue>):<fpage>651</fpage>–<lpage>66</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1016/j.patrec.2009.09.011" xlink:type="simple">10.1016/j.patrec.2009.09.011</ext-link></comment></mixed-citation></ref>
<ref id="pone.0340191.ref034"><label>34</label><mixed-citation publication-type="other" xlink:type="simple">Marutho D, Hendra Handaka S, Wijaya E, Muljono. The determination of cluster number at k-mean using elbow method and purity evaluation on headline news. In: 2018 International Seminar on Application for Technology of Information and Communication. 2018. p. 533–8. <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1109/isemantic.2018.8549751" xlink:type="simple">https://doi.org/10.1109/isemantic.2018.8549751</ext-link></mixed-citation></ref>
<ref id="pone.0340191.ref035"><label>35</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Calinski</surname> <given-names>T</given-names></name>, <name name-style="western"><surname>Harabasz</surname> <given-names>J</given-names></name>. <article-title>A dendrite method for cluster analysis</article-title>. <source>Comm in Stats - Theory &amp; Methods.</source> <year>1974</year>;<volume>3</volume>(<issue>1</issue>):<fpage>1</fpage>–<lpage>27</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1080/03610927408827101" xlink:type="simple">10.1080/03610927408827101</ext-link></comment></mixed-citation></ref>
<ref id="pone.0340191.ref036"><label>36</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Davies</surname> <given-names>DL</given-names></name>, <name name-style="western"><surname>Bouldin</surname> <given-names>DW</given-names></name>. <article-title>A cluster separation measure</article-title>. <source>IEEE Trans Pattern Anal Mach Intell.</source> <year>1979</year>;<volume>PAMI-1</volume>(<issue>2</issue>):<fpage>224</fpage>–<lpage>7</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1109/tpami.1979.4766909" xlink:type="simple">10.1109/tpami.1979.4766909</ext-link></comment></mixed-citation></ref>
<ref id="pone.0340191.ref037"><label>37</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Tsyro</surname> <given-names>SG</given-names></name>. <article-title>To what extent can aerosol water explain the discrepancy between model calculated and gravimetric PM10 and PM2.5?</article-title>. <source>Atmos Chem Phys.</source> <year>2005</year>;<volume>5</volume>(<issue>2</issue>):<fpage>515</fpage>–<lpage>32</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.5194/acp-5-515-2005" xlink:type="simple">10.5194/acp-5-515-2005</ext-link></comment></mixed-citation></ref>
<ref id="pone.0340191.ref038"><label>38</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Zhou</surname> <given-names>X</given-names></name>, <name name-style="western"><surname>Zhang</surname> <given-names>S</given-names></name>, <name name-style="western"><surname>Zhu</surname> <given-names>D</given-names></name>. <article-title>Impact of urban water networks on microclimate and PM2.5 distribution in downtown areas: a case study of Wuhan</article-title>. <source>Building and Environment.</source> <year>2021</year>;<volume>203</volume>:<fpage>108073</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1016/j.buildenv.2021.108073" xlink:type="simple">10.1016/j.buildenv.2021.108073</ext-link></comment></mixed-citation></ref>
<ref id="pone.0340191.ref039"><label>39</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Xu</surname> <given-names>J</given-names></name>, <name name-style="western"><surname>Liu</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Chen</surname> <given-names>H</given-names></name>. <article-title>Spatial heterogeneity of river effects on PM2.5 pollutants in waterfront neighborhoods based on mobile monitoring</article-title>. <source>Atmospheric Pollution Research.</source> <year>2022</year>;<volume>13</volume>(<issue>9</issue>):<fpage>101539</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1016/j.apr.2022.101539" xlink:type="simple">10.1016/j.apr.2022.101539</ext-link></comment></mixed-citation></ref>
<ref id="pone.0340191.ref040"><label>40</label><mixed-citation publication-type="other" xlink:type="simple">State Forests National Forest Holding. Report on the State of Forests in Poland 2022. Warsaw, Poland: Information Center of State Forests; 2023. <ext-link ext-link-type="uri" xlink:href="https://www.lasy.gov.pl" xlink:type="simple">https://www.lasy.gov.pl</ext-link></mixed-citation></ref>
<ref id="pone.0340191.ref041"><label>41</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Zhang</surname> <given-names>L</given-names></name>, <name name-style="western"><surname>Yang</surname> <given-names>G</given-names></name>. <article-title>Cluster analysis of PM2.5 pollution in China using the frequent itemset clustering approach</article-title>. <source>Environ Res.</source> <year>2022</year>;<volume>204</volume>(Pt B):<fpage>112009</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1016/j.envres.2021.112009" xlink:type="simple">10.1016/j.envres.2021.112009</ext-link></comment> <object-id pub-id-type="pmid">34534521</object-id></mixed-citation></ref>
<ref id="pone.0340191.ref042"><label>42</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Zareba</surname> <given-names>M</given-names></name>. <article-title>Assessing the role of energy mix in long-term air pollution trends: initial evidence from Poland</article-title>. <source>Energies.</source> <year>2025</year>;<volume>18</volume>(<issue>5</issue>):<fpage>1211</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3390/en18051211" xlink:type="simple">10.3390/en18051211</ext-link></comment></mixed-citation></ref>
<ref id="pone.0340191.ref043"><label>43</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Zareba</surname> <given-names>M</given-names></name>, <name name-style="western"><surname>Danek</surname> <given-names>T</given-names></name>. <article-title>A novel methodology for Explainable Artificial Intelligence integrated with geostatistics for air pollution control and environmental management</article-title>. <source>Ecological Informatics.</source> <year>2025</year>;<volume>92</volume>:<fpage>103450</fpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1016/j.ecoinf.2025.103450" xlink:type="simple">10.1016/j.ecoinf.2025.103450</ext-link></comment></mixed-citation></ref>
<ref id="pone.0340191.ref044"><label>44</label><mixed-citation publication-type="journal" xlink:type="simple"><name name-style="western"><surname>Ikotun</surname> <given-names>AM</given-names></name>, <name name-style="western"><surname>Ezugwu</surname> <given-names>AE</given-names></name>, <name name-style="western"><surname>Abualigah</surname> <given-names>L</given-names></name>, <name name-style="western"><surname>Abuhaija</surname> <given-names>B</given-names></name>, <name name-style="western"><surname>Heming</surname> <given-names>J</given-names></name>. <article-title>K-means clustering algorithms: a comprehensive review, variants analysis, and advances in the era of big data</article-title>. <source>Information Sciences.</source> <year>2023</year>;<volume>622</volume>:<fpage>178</fpage>–<lpage>210</lpage>. <comment>doi: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1016/j.ins.2022.11.139" xlink:type="simple">10.1016/j.ins.2022.11.139</ext-link></comment></mixed-citation></ref>
</ref-list>
</back>
</article>