Coverage Report

Created: 2025-12-31 06:48

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/gdal/gcore/gdal_matrix.hpp
Line
Count
Source
1
/******************************************************************************
2
 * Project:  GDAL Core
3
 * Purpose:  Utility functions for matrix multiplication
4
 * Author:   Even Rouault, <even dot rouault at spatialys.com>
5
 *
6
 ******************************************************************************
7
 * Copyright (c) 2025, Even Rouault <even dot rouault at spatialys.com>
8
 *
9
 * SPDX-License-Identifier: MIT
10
 ****************************************************************************/
11
12
#ifndef GDAL_MATRIX_HPP
13
#define GDAL_MATRIX_HPP
14
15
#include "cpl_port.h"
16
#include <algorithm>
17
18
/************************************************************************/
19
/*               GDALMatrixMultiplyAByTransposeAUpperTriangle()         */
20
/************************************************************************/
21
22
// Compute res = A * A.transpose(), by filling only the upper triangle.
23
// Do that in a cache friendly way.
24
// We accumulate values into the output array, so generally the caller must
25
// have made sure to zero-initialize it before (unless they want to add into it)
26
template <class T>
27
// CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW because it seems the use of openmp-simd
28
// causes that to happen
29
static CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW void
30
GDALMatrixMultiplyAByTransposeAUpperTriangle([[maybe_unused]] int nNumThreads,
31
                                             const T *A,  // rows * cols
32
                                             T *res,      // rows * rows
33
                                             int rows, size_t cols)
34
0
{
35
0
    constexpr int BLOCK_SIZE = 64;
36
0
    constexpr size_t BLOCK_SIZE_COLS = 256;
37
0
    constexpr T ZERO{0};
38
39
#ifdef HAVE_OPENMP
40
#pragma omp parallel for schedule(static) num_threads(nNumThreads)
41
#endif
42
0
    for (int64_t ii64 = 0; ii64 < rows; ii64 += BLOCK_SIZE)
43
0
    {
44
0
        const int ii = static_cast<int>(ii64);
45
0
        const int i_end = ii + std::min(BLOCK_SIZE, rows - ii);
46
0
        for (int jj = ii; jj < rows;
47
0
             jj += std::min(BLOCK_SIZE, rows - jj))  // upper triangle
48
0
        {
49
0
            const int j_end = jj + std::min(BLOCK_SIZE, rows - jj);
50
0
            for (size_t cc = 0; cc < cols; /* increment done at end of loop */)
51
0
            {
52
0
                const size_t c_end = cc + std::min(BLOCK_SIZE_COLS, cols - cc);
53
54
0
                for (int i = ii; i < i_end; ++i)
55
0
                {
56
0
                    const T *const Ai = &A[i * cols];
57
0
                    int j = std::max(i, jj);
58
0
                    for (; j < j_end - 7; j += 8)
59
0
                    {
60
0
                        const T *const Ajp0 = A + (j + 0) * cols;
61
0
                        const T *const Ajp1 = A + (j + 1) * cols;
62
0
                        const T *const Ajp2 = A + (j + 2) * cols;
63
0
                        const T *const Ajp3 = A + (j + 3) * cols;
64
0
                        const T *const Ajp4 = A + (j + 4) * cols;
65
0
                        const T *const Ajp5 = A + (j + 5) * cols;
66
0
                        const T *const Ajp6 = A + (j + 6) * cols;
67
0
                        const T *const Ajp7 = A + (j + 7) * cols;
68
69
0
                        T dfSum0 = ZERO, dfSum1 = ZERO, dfSum2 = ZERO,
70
0
                          dfSum3 = ZERO, dfSum4 = ZERO, dfSum5 = ZERO,
71
0
                          dfSum6 = ZERO, dfSum7 = ZERO;
72
0
#ifdef HAVE_OPENMP_SIMD
73
0
#pragma omp simd reduction(+ : dfSum0, dfSum1, dfSum2, dfSum3, dfSum4, dfSum5, dfSum6, dfSum7)
74
0
#endif
75
0
                        for (size_t c = cc; c < c_end; ++c)
76
0
                        {
77
0
                            dfSum0 += Ai[c] * Ajp0[c];
78
0
                            dfSum1 += Ai[c] * Ajp1[c];
79
0
                            dfSum2 += Ai[c] * Ajp2[c];
80
0
                            dfSum3 += Ai[c] * Ajp3[c];
81
0
                            dfSum4 += Ai[c] * Ajp4[c];
82
0
                            dfSum5 += Ai[c] * Ajp5[c];
83
0
                            dfSum6 += Ai[c] * Ajp6[c];
84
0
                            dfSum7 += Ai[c] * Ajp7[c];
85
0
                        }
86
87
0
                        const auto nResOffset =
88
0
                            static_cast<size_t>(i) * rows + j;
89
0
                        res[nResOffset + 0] += dfSum0;
90
0
                        res[nResOffset + 1] += dfSum1;
91
0
                        res[nResOffset + 2] += dfSum2;
92
0
                        res[nResOffset + 3] += dfSum3;
93
0
                        res[nResOffset + 4] += dfSum4;
94
0
                        res[nResOffset + 5] += dfSum5;
95
0
                        res[nResOffset + 6] += dfSum6;
96
0
                        res[nResOffset + 7] += dfSum7;
97
0
                    }
98
0
                    for (; j < j_end; ++j)
99
0
                    {
100
0
                        const T *const Aj = A + j * cols;
101
102
0
                        T dfSum = ZERO;
103
0
#ifdef HAVE_OPENMP_SIMD
104
0
#pragma omp simd reduction(+ : dfSum)
105
0
#endif
106
0
                        for (size_t c = cc; c < c_end; ++c)
107
0
                        {
108
0
                            dfSum += Ai[c] * Aj[c];
109
0
                        }
110
111
0
                        res[static_cast<size_t>(i) * rows + j] += dfSum;
112
0
                    }
113
0
                }
114
115
0
                cc = c_end;
116
0
            }
117
0
        }
118
0
    }
119
0
}
Unexecuted instantiation: gdaldataset.cpp:void GDALMatrixMultiplyAByTransposeAUpperTriangle<double>(int, double const*, double*, int, unsigned long)
Unexecuted instantiation: gdal_matrix_avx2_fma.cpp:void GDALMatrixMultiplyAByTransposeAUpperTriangle_AVX2_FMA_internal<double>(int, double const*, double*, int, unsigned long)
120
121
#endif