/src/mpdecimal-4.0.0/libmpdec/transpose.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2008-2024 Stefan Krah. All rights reserved. |
3 | | * |
4 | | * Redistribution and use in source and binary forms, with or without |
5 | | * modification, are permitted provided that the following conditions |
6 | | * are met: |
7 | | * |
8 | | * 1. Redistributions of source code must retain the above copyright |
9 | | * notice, this list of conditions and the following disclaimer. |
10 | | * 2. Redistributions in binary form must reproduce the above copyright |
11 | | * notice, this list of conditions and the following disclaimer in the |
12 | | * documentation and/or other materials provided with the distribution. |
13 | | * |
14 | | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
15 | | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
16 | | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
17 | | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
18 | | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
19 | | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
20 | | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
21 | | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
22 | | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
23 | | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
24 | | * SUCH DAMAGE. |
25 | | */ |
26 | | |
27 | | |
28 | | #include <assert.h> |
29 | | #include <limits.h> |
30 | | #include <stdio.h> |
31 | | #include <stdlib.h> |
32 | | #include <string.h> |
33 | | |
34 | | |
35 | | #include "bits.h" |
36 | | #include "constants.h" |
37 | | #include "mpdecimal.h" |
38 | | #include "transpose.h" |
39 | | #include "typearith.h" |
40 | | |
41 | | |
42 | 0 | #define BUFSIZE 4096 |
43 | 0 | #define SIDE 128 |
44 | | |
45 | | |
46 | | /* Bignum: The transpose functions are used for very large transforms |
47 | | in sixstep.c and fourstep.c. */ |
48 | | |
49 | | |
50 | | /* Definition of the matrix transpose */ |
51 | | void |
52 | | std_trans(mpd_uint_t dest[], mpd_uint_t src[], mpd_size_t rows, mpd_size_t cols) |
53 | 0 | { |
54 | 0 | mpd_size_t idest, isrc; |
55 | 0 | mpd_size_t r, c; |
56 | |
|
57 | 0 | for (r = 0; r < rows; r++) { |
58 | 0 | isrc = r * cols; |
59 | 0 | idest = r; |
60 | 0 | for (c = 0; c < cols; c++) { |
61 | 0 | dest[idest] = src[isrc]; |
62 | 0 | isrc += 1; |
63 | 0 | idest += rows; |
64 | 0 | } |
65 | 0 | } |
66 | 0 | } |
67 | | |
68 | | /* |
69 | | * Swap half-rows of 2^n * (2*2^n) matrix. |
70 | | * FORWARD_CYCLE: even/odd permutation of the halfrows. |
71 | | * BACKWARD_CYCLE: reverse the even/odd permutation. |
72 | | */ |
73 | | static int |
74 | | swap_halfrows_pow2(mpd_uint_t *matrix, mpd_size_t rows, mpd_size_t cols, int dir) |
75 | 0 | { |
76 | 0 | mpd_uint_t buf1[BUFSIZE]; |
77 | 0 | mpd_uint_t buf2[BUFSIZE]; |
78 | 0 | mpd_uint_t *readbuf, *writebuf, *hp; |
79 | 0 | mpd_size_t *done, dbits; |
80 | 0 | mpd_size_t b = BUFSIZE, stride; |
81 | 0 | mpd_size_t hn, hmax; /* halfrow number */ |
82 | 0 | mpd_size_t m, r=0; |
83 | 0 | mpd_size_t offset; |
84 | 0 | mpd_size_t next; |
85 | | |
86 | |
|
87 | 0 | assert(cols == mul_size_t(2, rows)); |
88 | |
|
89 | 0 | if (dir == FORWARD_CYCLE) { |
90 | 0 | r = rows; |
91 | 0 | } |
92 | 0 | else if (dir == BACKWARD_CYCLE) { |
93 | 0 | r = 2; |
94 | 0 | } |
95 | 0 | else { |
96 | 0 | abort(); /* GCOV_NOT_REACHED */ |
97 | 0 | } |
98 | | |
99 | 0 | m = cols - 1; |
100 | 0 | hmax = rows; /* cycles start at odd halfrows */ |
101 | 0 | dbits = 8 * sizeof *done; |
102 | 0 | if ((done = mpd_calloc(hmax/(sizeof *done) + 1, sizeof *done)) == NULL) { |
103 | 0 | return 0; |
104 | 0 | } |
105 | | |
106 | 0 | for (hn = 1; hn <= hmax; hn += 2) { |
107 | |
|
108 | 0 | if (done[hn/dbits] & mpd_bits[hn%dbits]) { |
109 | 0 | continue; |
110 | 0 | } |
111 | | |
112 | 0 | readbuf = buf1; writebuf = buf2; |
113 | |
|
114 | 0 | for (offset = 0; offset < cols/2; offset += b) { |
115 | |
|
116 | 0 | stride = (offset + b < cols/2) ? b : cols/2-offset; |
117 | |
|
118 | 0 | hp = matrix + hn*cols/2; |
119 | 0 | memcpy(readbuf, hp+offset, stride*(sizeof *readbuf)); |
120 | 0 | pointerswap(&readbuf, &writebuf); |
121 | |
|
122 | 0 | next = mulmod_size_t(hn, r, m); |
123 | 0 | hp = matrix + next*cols/2; |
124 | |
|
125 | 0 | while (next != hn) { |
126 | |
|
127 | 0 | memcpy(readbuf, hp+offset, stride*(sizeof *readbuf)); |
128 | 0 | memcpy(hp+offset, writebuf, stride*(sizeof *writebuf)); |
129 | 0 | pointerswap(&readbuf, &writebuf); |
130 | |
|
131 | 0 | done[next/dbits] |= mpd_bits[next%dbits]; |
132 | |
|
133 | 0 | next = mulmod_size_t(next, r, m); |
134 | 0 | hp = matrix + next*cols/2; |
135 | |
|
136 | 0 | } |
137 | |
|
138 | 0 | memcpy(hp+offset, writebuf, stride*(sizeof *writebuf)); |
139 | |
|
140 | 0 | done[hn/dbits] |= mpd_bits[hn%dbits]; |
141 | 0 | } |
142 | 0 | } |
143 | |
|
144 | 0 | mpd_free(done); |
145 | 0 | return 1; |
146 | 0 | } |
147 | | |
148 | | /* In-place transpose of a square matrix */ |
149 | | static inline void |
150 | | squaretrans(mpd_uint_t *buf, mpd_size_t cols) |
151 | 0 | { |
152 | 0 | mpd_uint_t tmp; |
153 | 0 | mpd_size_t idest, isrc; |
154 | 0 | mpd_size_t r, c; |
155 | |
|
156 | 0 | for (r = 0; r < cols; r++) { |
157 | 0 | c = r+1; |
158 | 0 | isrc = r*cols + c; |
159 | 0 | idest = c*cols + r; |
160 | 0 | for (c = r+1; c < cols; c++) { |
161 | 0 | tmp = buf[isrc]; |
162 | 0 | buf[isrc] = buf[idest]; |
163 | 0 | buf[idest] = tmp; |
164 | 0 | isrc += 1; |
165 | 0 | idest += cols; |
166 | 0 | } |
167 | 0 | } |
168 | 0 | } |
169 | | |
170 | | /* |
171 | | * Transpose 2^n * 2^n matrix. For cache efficiency, the matrix is split into |
172 | | * square blocks with side length 'SIDE'. First, the blocks are transposed, |
173 | | * then a square transposition is done on each individual block. |
174 | | */ |
175 | | static void |
176 | | squaretrans_pow2(mpd_uint_t *matrix, mpd_size_t size) |
177 | 0 | { |
178 | 0 | mpd_uint_t buf1[SIDE*SIDE]; |
179 | 0 | mpd_uint_t buf2[SIDE*SIDE]; |
180 | 0 | mpd_uint_t *to, *from; |
181 | 0 | mpd_size_t b = size; |
182 | 0 | mpd_size_t r, c; |
183 | 0 | mpd_size_t i; |
184 | |
|
185 | 0 | while (b > SIDE) b >>= 1; |
186 | |
|
187 | 0 | for (r = 0; r < size; r += b) { |
188 | |
|
189 | 0 | for (c = r; c < size; c += b) { |
190 | |
|
191 | 0 | from = matrix + r*size + c; |
192 | 0 | to = buf1; |
193 | 0 | for (i = 0; i < b; i++) { |
194 | 0 | memcpy(to, from, b*(sizeof *to)); |
195 | 0 | from += size; |
196 | 0 | to += b; |
197 | 0 | } |
198 | 0 | squaretrans(buf1, b); |
199 | |
|
200 | 0 | if (r == c) { |
201 | 0 | to = matrix + r*size + c; |
202 | 0 | from = buf1; |
203 | 0 | for (i = 0; i < b; i++) { |
204 | 0 | memcpy(to, from, b*(sizeof *to)); |
205 | 0 | from += b; |
206 | 0 | to += size; |
207 | 0 | } |
208 | 0 | continue; |
209 | 0 | } |
210 | 0 | else { |
211 | 0 | from = matrix + c*size + r; |
212 | 0 | to = buf2; |
213 | 0 | for (i = 0; i < b; i++) { |
214 | 0 | memcpy(to, from, b*(sizeof *to)); |
215 | 0 | from += size; |
216 | 0 | to += b; |
217 | 0 | } |
218 | 0 | squaretrans(buf2, b); |
219 | |
|
220 | 0 | to = matrix + c*size + r; |
221 | 0 | from = buf1; |
222 | 0 | for (i = 0; i < b; i++) { |
223 | 0 | memcpy(to, from, b*(sizeof *to)); |
224 | 0 | from += b; |
225 | 0 | to += size; |
226 | 0 | } |
227 | |
|
228 | 0 | to = matrix + r*size + c; |
229 | 0 | from = buf2; |
230 | 0 | for (i = 0; i < b; i++) { |
231 | 0 | memcpy(to, from, b*(sizeof *to)); |
232 | 0 | from += b; |
233 | 0 | to += size; |
234 | 0 | } |
235 | 0 | } |
236 | 0 | } |
237 | 0 | } |
238 | |
|
239 | 0 | } |
240 | | |
241 | | /* |
242 | | * In-place transposition of a 2^n x 2^n or a 2^n x (2*2^n) |
243 | | * or a (2*2^n) x 2^n matrix. |
244 | | */ |
245 | | int |
246 | | transpose_pow2(mpd_uint_t *matrix, mpd_size_t rows, mpd_size_t cols) |
247 | 0 | { |
248 | 0 | mpd_size_t size = mul_size_t(rows, cols); |
249 | |
|
250 | 0 | assert(ispower2(rows)); |
251 | 0 | assert(ispower2(cols)); |
252 | |
|
253 | 0 | if (cols == rows) { |
254 | 0 | squaretrans_pow2(matrix, rows); |
255 | 0 | } |
256 | 0 | else if (cols == mul_size_t(2, rows)) { |
257 | 0 | if (!swap_halfrows_pow2(matrix, rows, cols, FORWARD_CYCLE)) { |
258 | 0 | return 0; |
259 | 0 | } |
260 | 0 | squaretrans_pow2(matrix, rows); |
261 | 0 | squaretrans_pow2(matrix+(size/2), rows); |
262 | 0 | } |
263 | 0 | else if (rows == mul_size_t(2, cols)) { |
264 | 0 | squaretrans_pow2(matrix, cols); |
265 | 0 | squaretrans_pow2(matrix+(size/2), cols); |
266 | 0 | if (!swap_halfrows_pow2(matrix, cols, rows, BACKWARD_CYCLE)) { |
267 | 0 | return 0; |
268 | 0 | } |
269 | 0 | } |
270 | 0 | else { |
271 | 0 | abort(); /* GCOV_NOT_REACHED */ |
272 | 0 | } |
273 | | |
274 | 0 | return 1; |
275 | 0 | } |