/src/gdal/gcore/gdal_matrix.hpp
/******************************************************************************
 * Project: GDAL Core
 * Purpose: Utility functions for matrix multiplication
 * Author: Even Rouault, <even dot rouault at spatialys.com>
 *
 ******************************************************************************
 * Copyright (c) 2025, Even Rouault <even dot rouault at spatialys.com>
 *
 * SPDX-License-Identifier: MIT
 ****************************************************************************/

#ifndef GDAL_MATRIX_HPP
#define GDAL_MATRIX_HPP

#include "cpl_port.h"
#include <algorithm>

/************************************************************************/
/*            GDALMatrixMultiplyAByTransposeAUpperTriangle()            */
/************************************************************************/

// Compute res = A * A.transpose(), filling only the upper triangle.
// Do that in a cache-friendly way.
// Values are accumulated into the output array, so the caller must generally
// zero-initialize it beforehand (unless they want to add into existing
// values). A usage sketch follows the function body below.
template <class T>
// CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW because the use of openmp-simd seems
// to cause such overflows to be reported
static CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW void
GDALMatrixMultiplyAByTransposeAUpperTriangle([[maybe_unused]] int nNumThreads,
                                             const T *A,  // rows * cols
                                             T *res,      // rows * rows
                                             int rows, size_t cols)
{
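    // Tile sizes for cache blocking: rows are processed in BLOCK_SIZE x
    // BLOCK_SIZE tiles and the shared dimension in chunks of BLOCK_SIZE_COLS
    // elements, so that each inner kernel works on a small, cache-friendly
    // slice of A.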
    constexpr int BLOCK_SIZE = 64;
    constexpr size_t BLOCK_SIZE_COLS = 256;
    constexpr T ZERO{0};

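    // Parallelize over blocks of output rows when OpenMP is available; each
    // iteration of the outer loop writes a disjoint range of rows of res, so
    // no synchronization between threads is needed.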
#ifdef HAVE_OPENMP
#pragma omp parallel for schedule(static) num_threads(nNumThreads)
#endif
    for (int64_t ii64 = 0; ii64 < rows; ii64 += BLOCK_SIZE)
    {
        const int ii = static_cast<int>(ii64);
        const int i_end = ii + std::min(BLOCK_SIZE, rows - ii);
        for (int jj = ii; jj < rows;
             jj += std::min(BLOCK_SIZE, rows - jj))  // upper triangle
        {
            const int j_end = jj + std::min(BLOCK_SIZE, rows - jj);
            for (size_t cc = 0; cc < cols; /* increment done at end of loop */)
            {
                const size_t c_end = cc + std::min(BLOCK_SIZE_COLS, cols - cc);

                for (int i = ii; i < i_end; ++i)
                {
                    const T *const Ai = &A[i * cols];
                    int j = std::max(i, jj);
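                    // 8-way unrolled kernel: compute eight dot products of
                    // row i against rows j..j+7 at once, using independent
                    // accumulators. Starting j at std::max(i, jj) restricts
                    // the work to the upper triangle (j >= i).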
                    for (; j < j_end - 7; j += 8)
                    {
                        const T *const Ajp0 = A + (j + 0) * cols;
                        const T *const Ajp1 = A + (j + 1) * cols;
                        const T *const Ajp2 = A + (j + 2) * cols;
                        const T *const Ajp3 = A + (j + 3) * cols;
                        const T *const Ajp4 = A + (j + 4) * cols;
                        const T *const Ajp5 = A + (j + 5) * cols;
                        const T *const Ajp6 = A + (j + 6) * cols;
                        const T *const Ajp7 = A + (j + 7) * cols;

                        T dfSum0 = ZERO, dfSum1 = ZERO, dfSum2 = ZERO,
                          dfSum3 = ZERO, dfSum4 = ZERO, dfSum5 = ZERO,
                          dfSum6 = ZERO, dfSum7 = ZERO;
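                        // Vectorize the dot-product loop when OpenMP SIMD is
                        // available; the eight independent reductions map
                        // naturally onto SIMD lanes.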
#ifdef HAVE_OPENMP_SIMD
#pragma omp simd reduction(+ : dfSum0, dfSum1, dfSum2, dfSum3, dfSum4, dfSum5, dfSum6, dfSum7)
#endif
                        for (size_t c = cc; c < c_end; ++c)
                        {
                            dfSum0 += Ai[c] * Ajp0[c];
                            dfSum1 += Ai[c] * Ajp1[c];
                            dfSum2 += Ai[c] * Ajp2[c];
                            dfSum3 += Ai[c] * Ajp3[c];
                            dfSum4 += Ai[c] * Ajp4[c];
                            dfSum5 += Ai[c] * Ajp5[c];
                            dfSum6 += Ai[c] * Ajp6[c];
                            dfSum7 += Ai[c] * Ajp7[c];
                        }

                        const auto nResOffset =
                            static_cast<size_t>(i) * rows + j;
                        res[nResOffset + 0] += dfSum0;
                        res[nResOffset + 1] += dfSum1;
                        res[nResOffset + 2] += dfSum2;
                        res[nResOffset + 3] += dfSum3;
                        res[nResOffset + 4] += dfSum4;
                        res[nResOffset + 5] += dfSum5;
                        res[nResOffset + 6] += dfSum6;
                        res[nResOffset + 7] += dfSum7;
                    }
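                    // Scalar remainder: handle the fewer than eight columns
                    // left at the end of the block, one dot product at a time.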
                    for (; j < j_end; ++j)
                    {
                        const T *const Aj = A + j * cols;

                        T dfSum = ZERO;
#ifdef HAVE_OPENMP_SIMD
#pragma omp simd reduction(+ : dfSum)
#endif
                        for (size_t c = cc; c < c_end; ++c)
                        {
                            dfSum += Ai[c] * Aj[c];
                        }

                        res[static_cast<size_t>(i) * rows + j] += dfSum;
                    }
                }

                cc = c_end;
            }
        }
    }
}
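
// Usage sketch (illustrative only; variable names such as adfA/adfATA are
// hypothetical and not part of any GDAL API). The caller owns both buffers
// and, as noted above, must zero-initialize res unless accumulation into
// existing values is intended. Only the upper triangle (j >= i) of res is
// written.
//
//     std::vector<double> adfA(static_cast<size_t>(rows) * cols);  // filled by caller
//     std::vector<double> adfATA(static_cast<size_t>(rows) * rows, 0.0);
//     GDALMatrixMultiplyAByTransposeAUpperTriangle(
//         /* nNumThreads = */ 4, adfA.data(), adfATA.data(), rows, cols);
//     // adfATA[i * rows + j], for j >= i, now holds the dot product of rows
//     // i and j of A; mirror it to the lower triangle if the full symmetric
//     // matrix is needed.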

#endif