/src/gdal/gcore/gdal_matrix.hpp
/******************************************************************************
 * Project: GDAL Core
 * Purpose: Utility functions for matrix multiplication
 * Author: Even Rouault, <even dot rouault at spatialys.com>
 *
 ******************************************************************************
 * Copyright (c) 2025, Even Rouault <even dot rouault at spatialys.com>
 *
 * SPDX-License-Identifier: MIT
 ****************************************************************************/

#ifndef GDAL_MATRIX_HPP
#define GDAL_MATRIX_HPP

#include "cpl_port.h"
#include <algorithm>

/************************************************************************/
/*            GDALMatrixMultiplyAByTransposeAUpperTriangle()            */
/************************************************************************/

// Compute res = A * A.transpose(), filling only the upper triangle.
// Do that in a cache-friendly way.
// Values are accumulated into the output array, so the caller must generally
// zero-initialize it beforehand (unless they want to add into existing
// values). A usage sketch follows the function body below.
template <class T>
// CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW because the use of openmp-simd seems
// to cause such overflows to be reported
static CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW void
GDALMatrixMultiplyAByTransposeAUpperTriangle([[maybe_unused]] int nNumThreads,
                                             const T *A,  // rows * cols
                                             T *res,      // rows * rows
                                             int rows, size_t cols)
{
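    // Tile sizes for cache blocking: rows are processed in BLOCK_SIZE x
    // BLOCK_SIZE tiles and the shared dimension in chunks of BLOCK_SIZE_COLS
    // elements, so that each inner kernel works on a small, cache-friendly
    // slice of A.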
    constexpr int BLOCK_SIZE = 64;
    constexpr size_t BLOCK_SIZE_COLS = 256;
    constexpr T ZERO{0};

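    // Parallelize over blocks of output rows when OpenMP is available; each
    // iteration of the outer loop writes a disjoint range of rows of res, so
    // no synchronization between threads is needed.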
#ifdef HAVE_OPENMP
#pragma omp parallel for schedule(static) num_threads(nNumThreads)
#endif
    for (int64_t ii64 = 0; ii64 < rows; ii64 += BLOCK_SIZE)
    {
        const int ii = static_cast<int>(ii64);
        const int i_end = ii + std::min(BLOCK_SIZE, rows - ii);
        for (int jj = ii; jj < rows;
             jj += std::min(BLOCK_SIZE, rows - jj))  // upper triangle
        {
            const int j_end = jj + std::min(BLOCK_SIZE, rows - jj);
            for (size_t cc = 0; cc < cols; /* increment done at end of loop */)
            {
                const size_t c_end = cc + std::min(BLOCK_SIZE_COLS, cols - cc);

                for (int i = ii; i < i_end; ++i)
                {
                    const T *const Ai = &A[i * cols];
                    int j = std::max(i, jj);
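                    // 8-way unrolled kernel: compute eight dot products of
                    // row i against rows j..j+7 at once, using independent
                    // accumulators. Starting j at std::max(i, jj) restricts
                    // the work to the upper triangle (j >= i).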
                    for (; j < j_end - 7; j += 8)
                    {
                        const T *const Ajp0 = A + (j + 0) * cols;
                        const T *const Ajp1 = A + (j + 1) * cols;
                        const T *const Ajp2 = A + (j + 2) * cols;
                        const T *const Ajp3 = A + (j + 3) * cols;
                        const T *const Ajp4 = A + (j + 4) * cols;
                        const T *const Ajp5 = A + (j + 5) * cols;
                        const T *const Ajp6 = A + (j + 6) * cols;
                        const T *const Ajp7 = A + (j + 7) * cols;

                        T dfSum0 = ZERO, dfSum1 = ZERO, dfSum2 = ZERO,
                          dfSum3 = ZERO, dfSum4 = ZERO, dfSum5 = ZERO,
                          dfSum6 = ZERO, dfSum7 = ZERO;
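                        // Vectorize the dot-product loop when OpenMP SIMD is
                        // available; the eight independent reductions map
                        // naturally onto SIMD lanes.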
#ifdef HAVE_OPENMP_SIMD
#pragma omp simd reduction(+ : dfSum0, dfSum1, dfSum2, dfSum3, dfSum4, dfSum5, dfSum6, dfSum7)
#endif
                        for (size_t c = cc; c < c_end; ++c)
                        {
                            dfSum0 += Ai[c] * Ajp0[c];
                            dfSum1 += Ai[c] * Ajp1[c];
                            dfSum2 += Ai[c] * Ajp2[c];
                            dfSum3 += Ai[c] * Ajp3[c];
                            dfSum4 += Ai[c] * Ajp4[c];
                            dfSum5 += Ai[c] * Ajp5[c];
                            dfSum6 += Ai[c] * Ajp6[c];
                            dfSum7 += Ai[c] * Ajp7[c];
                        }

                        const auto nResOffset =
                            static_cast<size_t>(i) * rows + j;
                        res[nResOffset + 0] += dfSum0;
                        res[nResOffset + 1] += dfSum1;
                        res[nResOffset + 2] += dfSum2;
                        res[nResOffset + 3] += dfSum3;
                        res[nResOffset + 4] += dfSum4;
                        res[nResOffset + 5] += dfSum5;
                        res[nResOffset + 6] += dfSum6;
                        res[nResOffset + 7] += dfSum7;
                    }
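                    // Scalar remainder: handle the fewer than eight columns
                    // left at the end of the block, one dot product at a time.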
                    for (; j < j_end; ++j)
                    {
                        const T *const Aj = A + j * cols;

                        T dfSum = ZERO;
#ifdef HAVE_OPENMP_SIMD
#pragma omp simd reduction(+ : dfSum)
#endif
                        for (size_t c = cc; c < c_end; ++c)
                        {
                            dfSum += Ai[c] * Aj[c];
                        }

                        res[static_cast<size_t>(i) * rows + j] += dfSum;
                    }
                }

                cc = c_end;
            }
        }
    }
}
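
// Usage sketch (illustrative only; variable names such as adfA/adfATA are
// hypothetical and not part of any GDAL API). The caller owns both buffers
// and, as noted above, must zero-initialize res unless accumulation into
// existing values is intended. Only the upper triangle (j >= i) of res is
// written.
//
//     std::vector<double> adfA(static_cast<size_t>(rows) * cols);  // filled by caller
//     std::vector<double> adfATA(static_cast<size_t>(rows) * rows, 0.0);
//     GDALMatrixMultiplyAByTransposeAUpperTriangle(
//         /* nNumThreads = */ 4, adfA.data(), adfATA.data(), rows, cols);
//     // adfATA[i * rows + j], for j >= i, now holds the dot product of rows
//     // i and j of A; mirror it to the lower triangle if the full symmetric
//     // matrix is needed.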

#endif