/src/fftw3/kernel/cpy2d.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2003, 2007-14 Matteo Frigo |
3 | | * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology |
4 | | * |
5 | | * This program is free software; you can redistribute it and/or modify |
6 | | * it under the terms of the GNU General Public License as published by |
7 | | * the Free Software Foundation; either version 2 of the License, or |
8 | | * (at your option) any later version. |
9 | | * |
10 | | * This program is distributed in the hope that it will be useful, |
11 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | | * GNU General Public License for more details. |
14 | | * |
15 | | * You should have received a copy of the GNU General Public License |
16 | | * along with this program; if not, write to the Free Software |
17 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
18 | | * |
19 | | */ |
20 | | |
21 | | /* out of place 2D copy routines */ |
#include "kernel/ifftw.h"
#include <string.h> /* memcpy */
23 | | |
/* Select WIDE_TYPE, the type used to move two R's at once in cpy2d.
   On x86-64 builds that have <xmmintrin.h>, use the 128-bit __m128. */
#if (defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64)) \
     && defined(HAVE_XMMINTRIN_H)
#  include <xmmintrin.h>
#  define WIDE_TYPE __m128
#endif

/* Otherwise fall back to double; the size tests in cpy2d then never
   select the WIDE_TYPE path, so WIDE_TYPE is effectively unused. */
#if !defined(WIDE_TYPE)
#  define WIDE_TYPE double
#endif
35 | | |
36 | | void X(cpy2d)(R *I, R *O, |
37 | | INT n0, INT is0, INT os0, |
38 | | INT n1, INT is1, INT os1, |
39 | | INT vl) |
40 | 0 | { |
41 | 0 | INT i0, i1, v; |
42 | |
|
43 | 0 | switch (vl) { |
44 | 0 | case 1: |
45 | 0 | for (i1 = 0; i1 < n1; ++i1) |
46 | 0 | for (i0 = 0; i0 < n0; ++i0) { |
47 | 0 | R x0 = I[i0 * is0 + i1 * is1]; |
48 | 0 | O[i0 * os0 + i1 * os1] = x0; |
49 | 0 | } |
50 | 0 | break; |
51 | 0 | case 2: |
52 | 0 | if (1 |
53 | 0 | && (2 * sizeof(R) == sizeof(WIDE_TYPE)) |
54 | 0 | && (sizeof(WIDE_TYPE) > sizeof(double)) |
55 | 0 | && (((size_t)I) % sizeof(WIDE_TYPE) == 0) |
56 | 0 | && (((size_t)O) % sizeof(WIDE_TYPE) == 0) |
57 | 0 | && ((is0 & 1) == 0) |
58 | 0 | && ((is1 & 1) == 0) |
59 | 0 | && ((os0 & 1) == 0) |
60 | 0 | && ((os1 & 1) == 0)) { |
61 | | /* copy R[2] as WIDE_TYPE if WIDE_TYPE is large |
62 | | enough to hold R[2], and if the input is |
63 | | properly aligned. This is a win when R==double |
64 | | and WIDE_TYPE is 128 bits. */ |
65 | 0 | for (i1 = 0; i1 < n1; ++i1) |
66 | 0 | for (i0 = 0; i0 < n0; ++i0) { |
67 | 0 | *(WIDE_TYPE *)&O[i0 * os0 + i1 * os1] = |
68 | 0 | *(WIDE_TYPE *)&I[i0 * is0 + i1 * is1]; |
69 | 0 | } |
70 | 0 | } else if (1 |
71 | 0 | && (2 * sizeof(R) == sizeof(double)) |
72 | 0 | && (((size_t)I) % sizeof(double) == 0) |
73 | 0 | && (((size_t)O) % sizeof(double) == 0) |
74 | 0 | && ((is0 & 1) == 0) |
75 | 0 | && ((is1 & 1) == 0) |
76 | 0 | && ((os0 & 1) == 0) |
77 | 0 | && ((os1 & 1) == 0)) { |
78 | | /* copy R[2] as double if double is large enough to |
79 | | hold R[2], and if the input is properly aligned. |
80 | | This case applies when R==float */ |
81 | 0 | for (i1 = 0; i1 < n1; ++i1) |
82 | 0 | for (i0 = 0; i0 < n0; ++i0) { |
83 | 0 | *(double *)&O[i0 * os0 + i1 * os1] = |
84 | 0 | *(double *)&I[i0 * is0 + i1 * is1]; |
85 | 0 | } |
86 | 0 | } else { |
87 | 0 | for (i1 = 0; i1 < n1; ++i1) |
88 | 0 | for (i0 = 0; i0 < n0; ++i0) { |
89 | 0 | R x0 = I[i0 * is0 + i1 * is1]; |
90 | 0 | R x1 = I[i0 * is0 + i1 * is1 + 1]; |
91 | 0 | O[i0 * os0 + i1 * os1] = x0; |
92 | 0 | O[i0 * os0 + i1 * os1 + 1] = x1; |
93 | 0 | } |
94 | 0 | } |
95 | 0 | break; |
96 | 0 | default: |
97 | 0 | for (i1 = 0; i1 < n1; ++i1) |
98 | 0 | for (i0 = 0; i0 < n0; ++i0) |
99 | 0 | for (v = 0; v < vl; ++v) { |
100 | 0 | R x0 = I[i0 * is0 + i1 * is1 + v]; |
101 | 0 | O[i0 * os0 + i1 * os1 + v] = x0; |
102 | 0 | } |
103 | 0 | break; |
104 | 0 | } |
105 | 0 | } |
106 | | |
107 | | /* like cpy2d, but read input contiguously if possible */ |
108 | | void X(cpy2d_ci)(R *I, R *O, |
109 | | INT n0, INT is0, INT os0, |
110 | | INT n1, INT is1, INT os1, |
111 | | INT vl) |
112 | 0 | { |
113 | 0 | if (IABS(is0) < IABS(is1)) /* inner loop is for n0 */ |
114 | 0 | X(cpy2d) (I, O, n0, is0, os0, n1, is1, os1, vl); |
115 | 0 | else |
116 | 0 | X(cpy2d) (I, O, n1, is1, os1, n0, is0, os0, vl); |
117 | 0 | } |
118 | | |
119 | | /* like cpy2d, but write output contiguously if possible */ |
120 | | void X(cpy2d_co)(R *I, R *O, |
121 | | INT n0, INT is0, INT os0, |
122 | | INT n1, INT is1, INT os1, |
123 | | INT vl) |
124 | 0 | { |
125 | 0 | if (IABS(os0) < IABS(os1)) /* inner loop is for n0 */ |
126 | 0 | X(cpy2d) (I, O, n0, is0, os0, n1, is1, os1, vl); |
127 | 0 | else |
128 | 0 | X(cpy2d) (I, O, n1, is1, os1, n0, is0, os0, vl); |
129 | 0 | } |
130 | | |
131 | | |
132 | | /* tiled copy routines */ |
133 | | struct cpy2d_closure { |
134 | | R *I, *O; |
135 | | INT is0, os0, is1, os1, vl; |
136 | | R *buf; |
137 | | }; |
138 | | |
139 | | static void dotile(INT n0l, INT n0u, INT n1l, INT n1u, void *args) |
140 | 0 | { |
141 | 0 | struct cpy2d_closure *k = (struct cpy2d_closure *)args; |
142 | 0 | X(cpy2d)(k->I + n0l * k->is0 + n1l * k->is1, |
143 | 0 | k->O + n0l * k->os0 + n1l * k->os1, |
144 | 0 | n0u - n0l, k->is0, k->os0, |
145 | 0 | n1u - n1l, k->is1, k->os1, |
146 | 0 | k->vl); |
147 | 0 | } |
148 | | |
149 | | static void dotile_buf(INT n0l, INT n0u, INT n1l, INT n1u, void *args) |
150 | 0 | { |
151 | 0 | struct cpy2d_closure *k = (struct cpy2d_closure *)args; |
152 | | |
153 | | /* copy from I to buf */ |
154 | 0 | X(cpy2d_ci)(k->I + n0l * k->is0 + n1l * k->is1, |
155 | 0 | k->buf, |
156 | 0 | n0u - n0l, k->is0, k->vl, |
157 | 0 | n1u - n1l, k->is1, k->vl * (n0u - n0l), |
158 | 0 | k->vl); |
159 | | |
160 | | /* copy from buf to O */ |
161 | 0 | X(cpy2d_co)(k->buf, |
162 | 0 | k->O + n0l * k->os0 + n1l * k->os1, |
163 | 0 | n0u - n0l, k->vl, k->os0, |
164 | 0 | n1u - n1l, k->vl * (n0u - n0l), k->os1, |
165 | 0 | k->vl); |
166 | 0 | } |
167 | | |
168 | | |
169 | | void X(cpy2d_tiled)(R *I, R *O, |
170 | | INT n0, INT is0, INT os0, |
171 | | INT n1, INT is1, INT os1, INT vl) |
172 | 0 | { |
173 | 0 | INT tilesz = X(compute_tilesz)(vl, |
174 | 0 | 1 /* input array */ |
175 | 0 | + 1 /* output array */); |
176 | 0 | struct cpy2d_closure k; |
177 | 0 | k.I = I; |
178 | 0 | k.O = O; |
179 | 0 | k.is0 = is0; |
180 | 0 | k.os0 = os0; |
181 | 0 | k.is1 = is1; |
182 | 0 | k.os1 = os1; |
183 | 0 | k.vl = vl; |
184 | 0 | k.buf = 0; /* unused */ |
185 | 0 | X(tile2d)(0, n0, 0, n1, tilesz, dotile, &k); |
186 | 0 | } |
187 | | |
188 | | void X(cpy2d_tiledbuf)(R *I, R *O, |
189 | | INT n0, INT is0, INT os0, |
190 | | INT n1, INT is1, INT os1, INT vl) |
191 | 0 | { |
192 | 0 | R buf[CACHESIZE / (2 * sizeof(R))]; |
193 | | /* input and buffer in cache, or |
194 | | output and buffer in cache */ |
195 | 0 | INT tilesz = X(compute_tilesz)(vl, 2); |
196 | 0 | struct cpy2d_closure k; |
197 | 0 | k.I = I; |
198 | 0 | k.O = O; |
199 | 0 | k.is0 = is0; |
200 | 0 | k.os0 = os0; |
201 | 0 | k.is1 = is1; |
202 | 0 | k.os1 = os1; |
203 | 0 | k.vl = vl; |
204 | 0 | k.buf = buf; |
205 | 0 | A(tilesz * tilesz * vl * sizeof(R) <= sizeof(buf)); |
206 | 0 | X(tile2d)(0, n0, 0, n1, tilesz, dotile_buf, &k); |
207 | 0 | } |