/src/c-blosc2/internal-complibs/zlib-ng-2.0.7/chunkset_tpl.h
Line | Count | Source |
1 | | /* chunkset_tpl.h -- inline functions to copy small data chunks. |
2 | | * For conditions of distribution and use, see copyright notice in zlib.h |
3 | | */ |
4 | | |
5 | | /* Returns the chunk size */ |
6 | 6.59k | Z_INTERNAL uint32_t CHUNKSIZE(void) { |
7 | 6.59k | return sizeof(chunk_t); |
8 | 6.59k | } |
9 | | |
10 | | /* Behave like memcpy, but assume that it's OK to overwrite at least |
11 | | chunk_t bytes of output even if the length is shorter than this, |
12 | | that the length is non-zero, and that `from` lags `out` by at least |
13 | | sizeof chunk_t bytes (or that they don't overlap at all or simply that |
14 | | the distance is less than the length of the copy). |
15 | | |
16 | | Aside from better memory bus utilisation, this means that short copies |
17 | | (chunk_t bytes or fewer) will fall straight through the loop |
18 | | without iteration, which will hopefully make the branch prediction more |
19 | | reliable. */ |
20 | 198k | Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) { |
21 | 198k | Assert(len > 0, "chunkcopy should never have a length 0"); |
22 | 198k | chunk_t chunk; |
23 | 198k | int32_t align = (--len % sizeof(chunk_t)) + 1; |
24 | 198k | loadchunk(from, &chunk); |
25 | 198k | storechunk(out, &chunk); |
26 | 198k | out += align; |
27 | 198k | from += align; |
28 | 198k | len /= sizeof(chunk_t); |
29 | 423k | while (len > 0) { |
30 | 224k | loadchunk(from, &chunk); |
31 | 224k | storechunk(out, &chunk); |
32 | 224k | out += sizeof(chunk_t); |
33 | 224k | from += sizeof(chunk_t); |
34 | 224k | --len; |
35 | 224k | } |
36 | 198k | return out; |
37 | 198k | } |
38 | | |
39 | | /* Behave like chunkcopy, but avoid writing beyond of legal output. */ |
40 | 558k | Z_INTERNAL uint8_t* CHUNKCOPY_SAFE(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe) { |
41 | 558k | unsigned safelen = (unsigned)((safe - out) + 1); |
42 | 558k | len = MIN(len, safelen); |
43 | | #if CHUNK_SIZE >= 32 |
44 | | while (len >= 32) { |
45 | | memcpy(out, from, 32); |
46 | | out += 32; |
47 | | from += 32; |
48 | | len -= 32; |
49 | | } |
50 | | #endif |
51 | | #if CHUNK_SIZE >= 16 |
52 | | while (len >= 16) { |
53 | | memcpy(out, from, 16); |
54 | | out += 16; |
55 | | from += 16; |
56 | | len -= 16; |
57 | | } |
58 | | #endif |
59 | 558k | #if CHUNK_SIZE >= 8 |
60 | 558k | while (len >= 8) { |
61 | 0 | memcpy(out, from, 8); |
62 | 0 | out += 8; |
63 | 0 | from += 8; |
64 | 0 | len -= 8; |
65 | 0 | } |
66 | 558k | #endif |
67 | 558k | if (len >= 4) { |
68 | 24.8k | memcpy(out, from, 4); |
69 | 24.8k | out += 4; |
70 | 24.8k | from += 4; |
71 | 24.8k | len -= 4; |
72 | 24.8k | } |
73 | 558k | if (len >= 2) { |
74 | 316k | memcpy(out, from, 2); |
75 | 316k | out += 2; |
76 | 316k | from += 2; |
77 | 316k | len -= 2; |
78 | 316k | } |
79 | 558k | if (len == 1) { |
80 | 381k | *out++ = *from++; |
81 | 381k | } |
82 | 558k | return out; |
83 | 558k | } |
84 | | |
85 | | /* Perform short copies until distance can be rewritten as being at least |
86 | | sizeof chunk_t. |
87 | | |
88 | | This assumes that it's OK to overwrite at least the first |
89 | | 2*sizeof(chunk_t) bytes of output even if the copy is shorter than this. |
90 | | This assumption holds because inflate_fast() starts every iteration with at |
91 | | least 258 bytes of output space available (258 being the maximum length |
92 | | output from a single token; see inflate_fast()'s assumptions below). */ |
93 | 733 | Z_INTERNAL uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len) { |
94 | 733 | unsigned char const *from = out - *dist; |
95 | 733 | chunk_t chunk; |
96 | 733 | while (*dist < *len && *dist < sizeof(chunk_t)) { |
97 | 0 | loadchunk(from, &chunk); |
98 | 0 | storechunk(out, &chunk); |
99 | 0 | out += *dist; |
100 | 0 | *len -= *dist; |
101 | 0 | *dist += *dist; |
102 | 0 | } |
103 | 733 | return out; |
104 | 733 | } |
105 | | |
106 | | /* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST. |
107 | | Return OUT + LEN. */ |
108 | 102k | Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) { |
109 | | /* Debug performance related issues when len < sizeof(uint64_t): |
110 | | Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */ |
111 | 102k | Assert(dist > 0, "chunkmemset cannot have a distance 0"); |
112 | | |
113 | 102k | unsigned char *from = out - dist; |
114 | 102k | chunk_t chunk; |
115 | 102k | unsigned sz = sizeof(chunk); |
116 | 102k | if (len < sz) { |
117 | 233k | while (len != 0) { |
118 | 193k | *out++ = *from++; |
119 | 193k | --len; |
120 | 193k | } |
121 | 39.2k | return out; |
122 | 39.2k | } |
123 | | |
124 | | #ifdef HAVE_CHUNKMEMSET_1 |
125 | | if (dist == 1) { |
126 | | chunkmemset_1(from, &chunk); |
127 | | } else |
128 | | #endif |
129 | | #ifdef HAVE_CHUNKMEMSET_2 |
130 | | if (dist == 2) { |
131 | | chunkmemset_2(from, &chunk); |
132 | | } else |
133 | | #endif |
134 | 63.2k | #ifdef HAVE_CHUNKMEMSET_4 |
135 | 63.2k | if (dist == 4) { |
136 | 8.23k | chunkmemset_4(from, &chunk); |
137 | 8.23k | } else |
138 | 55.0k | #endif |
139 | 55.0k | #ifdef HAVE_CHUNKMEMSET_8 |
140 | 55.0k | if (dist == 8) { |
141 | 173 | chunkmemset_8(from, &chunk); |
142 | 173 | } else |
143 | 54.8k | #endif |
144 | 54.8k | if (dist == sz) { |
145 | 0 | loadchunk(from, &chunk); |
146 | 54.8k | } else if (dist < sz) { |
147 | 54.1k | unsigned char *end = out + len - 1; |
148 | 558k | while (len > dist) { |
149 | 504k | out = CHUNKCOPY_SAFE(out, from, dist, end); |
150 | 504k | len -= dist; |
151 | 504k | } |
152 | 54.1k | if (len > 0) { |
153 | 54.1k | out = CHUNKCOPY_SAFE(out, from, len, end); |
154 | 54.1k | } |
155 | 54.1k | return out; |
156 | 54.1k | } else { |
157 | 733 | out = CHUNKUNROLL(out, &dist, &len); |
158 | 733 | return CHUNKCOPY(out, out - dist, len); |
159 | 733 | } |
160 | | |
161 | 8.40k | unsigned rem = len % sz; |
162 | 8.40k | len -= rem; |
163 | 29.3k | while (len) { |
164 | 20.9k | storechunk(out, &chunk); |
165 | 20.9k | out += sz; |
166 | 20.9k | len -= sz; |
167 | 20.9k | } |
168 | | |
169 | | /* Last, deal with the case when LEN is not a multiple of SZ. */ |
170 | 8.40k | if (rem) { |
171 | 5.72k | memcpy(out, from, rem); |
172 | 5.72k | out += rem; |
173 | 5.72k | } |
174 | | |
175 | 8.40k | return out; |
176 | 63.2k | } |
177 | | |
178 | 17.5k | Z_INTERNAL uint8_t* CHUNKMEMSET_SAFE(uint8_t *out, unsigned dist, unsigned len, unsigned left) { |
179 | 17.5k | #if !defined(UNALIGNED64_OK) |
180 | 17.5k | # if !defined(UNALIGNED_OK) |
181 | 17.5k | static const uint32_t align_mask = 7; |
182 | | # else |
183 | | static const uint32_t align_mask = 3; |
184 | | # endif |
185 | 17.5k | #endif |
186 | | |
187 | 17.5k | len = MIN(len, left); |
188 | 17.5k | uint8_t *from = out - dist; |
189 | 17.5k | #if !defined(UNALIGNED64_OK) |
190 | 69.8k | while (((uintptr_t)out & align_mask) && (len > 0)) { |
191 | 52.3k | *out++ = *from++; |
192 | 52.3k | --len; |
193 | 52.3k | --left; |
194 | 52.3k | } |
195 | 17.5k | #endif |
196 | 17.5k | if (left < (unsigned)(3 * sizeof(chunk_t))) { |
197 | 7.67k | while (len > 0) { |
198 | 5.61k | *out++ = *from++; |
199 | 5.61k | --len; |
200 | 5.61k | } |
201 | 2.05k | return out; |
202 | 2.05k | } |
203 | 15.5k | if (len) |
204 | 9.93k | return CHUNKMEMSET(out, dist, len); |
205 | | |
206 | 5.56k | return out; |
207 | 15.5k | } |