/src/croaring/src/containers/run.c
Line | Count | Source |
1 | | #include <stdio.h> |
2 | | #include <stdlib.h> |
3 | | |
4 | | #include <roaring/containers/run.h> |
5 | | #include <roaring/memory.h> |
6 | | #include <roaring/portability.h> |
7 | | |
8 | | #if CROARING_IS_X64 |
9 | | #ifndef CROARING_COMPILER_SUPPORTS_AVX512 |
10 | | #error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined." |
11 | | #endif // CROARING_COMPILER_SUPPORTS_AVX512 |
12 | | #endif |
13 | | #if defined(__GNUC__) && !defined(__clang__) |
14 | | #pragma GCC diagnostic push |
15 | | #pragma GCC diagnostic ignored "-Wuninitialized" |
16 | | #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" |
17 | | #endif |
18 | | #ifdef __cplusplus |
19 | | extern "C" { |
20 | | namespace roaring { |
21 | | namespace internal { |
22 | | #endif |
23 | | |
24 | | extern inline uint16_t run_container_minimum(const run_container_t *run); |
25 | | extern inline uint16_t run_container_maximum(const run_container_t *run); |
26 | | extern inline int32_t interleavedBinarySearch(const rle16_t *array, |
27 | | int32_t lenarray, uint16_t ikey); |
28 | | extern inline bool run_container_contains(const run_container_t *run, |
29 | | uint16_t pos); |
30 | | extern inline int run_container_index_equalorlarger(const run_container_t *arr, |
31 | | uint16_t x); |
32 | | extern inline bool run_container_is_full(const run_container_t *run); |
33 | | extern inline bool run_container_nonzero_cardinality(const run_container_t *rc); |
34 | | extern inline int32_t run_container_serialized_size_in_bytes(int32_t num_runs); |
35 | | extern inline run_container_t *run_container_create_range(uint32_t start, |
36 | | uint32_t stop); |
37 | | extern inline int run_container_cardinality(const run_container_t *run); |
38 | | |
39 | 155k | bool run_container_add(run_container_t *run, uint16_t pos) { |
40 | 155k | int32_t index = interleavedBinarySearch(run->runs, run->n_runs, pos); |
41 | 155k | if (index >= 0) return false; // already there |
42 | 155k | index = -index - 2; // points to preceding value, possibly -1 |
43 | 155k | if (index >= 0) { // possible match |
44 | 155k | int32_t offset = pos - run->runs[index].value; |
45 | 155k | int32_t le = run->runs[index].length; |
46 | 155k | if (offset <= le) return false; // already there |
47 | 155k | if (offset == le + 1) { |
48 | | // we may need to fuse |
49 | 155k | if (index + 1 < run->n_runs) { |
50 | 82.4k | if (run->runs[index + 1].value == pos + 1) { |
51 | | // indeed fusion is needed |
52 | 174 | run->runs[index].length = run->runs[index + 1].value + |
53 | 174 | run->runs[index + 1].length - |
54 | 174 | run->runs[index].value; |
55 | 174 | recoverRoomAtIndex(run, (uint16_t)(index + 1)); |
56 | 174 | return true; |
57 | 174 | } |
58 | 82.4k | } |
59 | 155k | run->runs[index].length++; |
60 | 155k | return true; |
61 | 155k | } |
62 | 67 | if (index + 1 < run->n_runs) { |
63 | | // we may need to fuse |
64 | 43 | if (run->runs[index + 1].value == pos + 1) { |
65 | | // indeed fusion is needed |
66 | 1 | run->runs[index + 1].value = pos; |
67 | 1 | run->runs[index + 1].length = run->runs[index + 1].length + 1; |
68 | 1 | return true; |
69 | 1 | } |
70 | 43 | } |
71 | 67 | } |
72 | 186 | if (index == -1) { |
73 | | // we may need to extend the first run |
74 | 120 | if (0 < run->n_runs) { |
75 | 120 | if (run->runs[0].value == pos + 1) { |
76 | 4 | run->runs[0].length++; |
77 | 4 | run->runs[0].value--; |
78 | 4 | return true; |
79 | 4 | } |
80 | 120 | } |
81 | 120 | } |
82 | 182 | makeRoomAtIndex(run, (uint16_t)(index + 1)); |
83 | 182 | run->runs[index + 1].value = pos; |
84 | 182 | run->runs[index + 1].length = 0; |
85 | 182 | return true; |
86 | 186 | } |
87 | | |
88 | | /* Create a new run container. Return NULL in case of failure. */ |
89 | 52.5k | run_container_t *run_container_create_given_capacity(int32_t size) { |
90 | 52.5k | run_container_t *run; |
91 | | /* Allocate the run container itself. */ |
92 | 52.5k | if ((run = (run_container_t *)roaring_malloc(sizeof(run_container_t))) == |
93 | 52.5k | NULL) { |
94 | 0 | return NULL; |
95 | 0 | } |
96 | 52.5k | if (size <= 0) { // we don't want to rely on malloc(0) |
97 | 52.5k | run->runs = NULL; |
98 | 52.5k | } else if ((run->runs = (rle16_t *)roaring_malloc(sizeof(rle16_t) * |
99 | 0 | size)) == NULL) { |
100 | 0 | roaring_free(run); |
101 | 0 | return NULL; |
102 | 0 | } |
103 | 52.5k | run->capacity = size; |
104 | 52.5k | run->n_runs = 0; |
105 | 52.5k | return run; |
106 | 52.5k | } |
107 | | |
108 | 0 | int run_container_shrink_to_fit(run_container_t *src) { |
109 | 0 | if (src->n_runs == src->capacity) return 0; // nothing to do |
110 | 0 | int savings = src->capacity - src->n_runs; |
111 | 0 | src->capacity = src->n_runs; |
112 | 0 | rle16_t *oldruns = src->runs; |
113 | 0 | src->runs = |
114 | 0 | (rle16_t *)roaring_realloc(oldruns, src->capacity * sizeof(rle16_t)); |
115 | 0 | if (src->runs == NULL) roaring_free(oldruns); // should never happen? |
116 | 0 | return savings; |
117 | 0 | } |
118 | | /* Create a new run container. Return NULL in case of failure. */ |
119 | 52.5k | run_container_t *run_container_create(void) { |
120 | 52.5k | return run_container_create_given_capacity(RUN_DEFAULT_INIT_SIZE); |
121 | 52.5k | } |
122 | | |
123 | | CROARING_ALLOW_UNALIGNED |
124 | 0 | run_container_t *run_container_clone(const run_container_t *src) { |
125 | 0 | run_container_t *run = run_container_create_given_capacity(src->capacity); |
126 | 0 | if (run == NULL) return NULL; |
127 | 0 | run->capacity = src->capacity; |
128 | 0 | run->n_runs = src->n_runs; |
129 | 0 | memcpy(run->runs, src->runs, src->n_runs * sizeof(rle16_t)); |
130 | 0 | return run; |
131 | 0 | } |
132 | | |
133 | | void run_container_offset(const run_container_t *c, container_t **loc, |
134 | 0 | container_t **hic, uint16_t offset) { |
135 | 0 | run_container_t *lo = NULL, *hi = NULL; |
136 | |
|
137 | 0 | bool split; |
138 | 0 | unsigned int lo_cap, hi_cap; |
139 | 0 | int top, pivot; |
140 | |
|
141 | 0 | top = (1 << 16) - offset; |
142 | 0 | pivot = run_container_index_equalorlarger(c, top); |
143 | | // pivot is the index of the first run that is >= top or -1 if no such run |
144 | |
|
145 | 0 | if (pivot >= 0) { |
146 | 0 | split = c->runs[pivot].value < top; |
147 | 0 | lo_cap = pivot + (split ? 1 : 0); |
148 | 0 | hi_cap = c->n_runs - pivot; |
149 | 0 | } else { |
150 | | // here pivot < 0 |
151 | 0 | split = false; |
152 | 0 | lo_cap = c->n_runs; |
153 | 0 | hi_cap = 0; |
154 | 0 | } |
155 | |
|
156 | 0 | if (loc && lo_cap) { |
157 | 0 | lo = run_container_create_given_capacity(lo_cap); |
158 | 0 | memcpy(lo->runs, c->runs, lo_cap * sizeof(rle16_t)); |
159 | 0 | lo->n_runs = lo_cap; |
160 | 0 | for (unsigned int i = 0; i < lo_cap; ++i) { |
161 | 0 | lo->runs[i].value += offset; |
162 | 0 | } |
163 | 0 | *loc = (container_t *)lo; |
164 | 0 | } |
165 | |
|
166 | 0 | if (hic && hi_cap) { |
167 | 0 | hi = run_container_create_given_capacity(hi_cap); |
168 | 0 | memcpy(hi->runs, c->runs + pivot, hi_cap * sizeof(rle16_t)); |
169 | 0 | hi->n_runs = hi_cap; |
170 | 0 | for (unsigned int i = 0; i < hi_cap; ++i) { |
171 | 0 | hi->runs[i].value += offset; |
172 | 0 | } |
173 | 0 | *hic = (container_t *)hi; |
174 | 0 | } |
175 | | |
176 | | // Fix the split. |
177 | 0 | if (split) { |
178 | 0 | if (lo != NULL) { |
179 | | // Add the missing run to 'lo', exhausting length. |
180 | 0 | lo->runs[lo->n_runs - 1].length = |
181 | 0 | (1 << 16) - lo->runs[lo->n_runs - 1].value - 1; |
182 | 0 | } |
183 | |
|
184 | 0 | if (hi != NULL) { |
185 | | // Fix the first run in 'hi'. |
186 | 0 | hi->runs[0].length -= UINT16_MAX - hi->runs[0].value + 1; |
187 | 0 | hi->runs[0].value = 0; |
188 | 0 | } |
189 | 0 | } |
190 | 0 | } |
191 | | |
192 | | /* Free memory. */ |
193 | 52.5k | void run_container_free(run_container_t *run) { |
194 | 52.5k | if (run == NULL) return; |
195 | 52.5k | roaring_free(run->runs); |
196 | 52.5k | roaring_free(run); |
197 | 52.5k | } |
198 | | |
199 | 4.50k | void run_container_grow(run_container_t *run, int32_t min, bool copy) { |
200 | 4.50k | int32_t newCapacity = (run->capacity == 0) ? RUN_DEFAULT_INIT_SIZE |
201 | 4.50k | : run->capacity < 64 ? run->capacity * 2 |
202 | 182 | : run->capacity < 1024 ? run->capacity * 3 / 2 |
203 | 0 | : run->capacity * 5 / 4; |
204 | 4.50k | if (newCapacity < min) newCapacity = min; |
205 | 4.50k | run->capacity = newCapacity; |
206 | 4.50k | assert(run->capacity >= min); |
207 | 4.50k | if (copy) { |
208 | 182 | rle16_t *oldruns = run->runs; |
209 | 182 | run->runs = (rle16_t *)roaring_realloc(oldruns, |
210 | 182 | run->capacity * sizeof(rle16_t)); |
211 | 182 | if (run->runs == NULL) roaring_free(oldruns); |
212 | 4.32k | } else { |
213 | 4.32k | roaring_free(run->runs); |
214 | 4.32k | run->runs = (rle16_t *)roaring_malloc(run->capacity * sizeof(rle16_t)); |
215 | 4.32k | } |
216 | | // We may have run->runs == NULL. |
217 | 4.50k | } |
218 | | |
219 | | /* copy one container into another */ |
220 | 0 | void run_container_copy(const run_container_t *src, run_container_t *dst) { |
221 | 0 | const int32_t n_runs = src->n_runs; |
222 | 0 | if (src->n_runs > dst->capacity) { |
223 | 0 | run_container_grow(dst, n_runs, false); |
224 | 0 | } |
225 | 0 | dst->n_runs = n_runs; |
226 | 0 | memcpy(dst->runs, src->runs, sizeof(rle16_t) * n_runs); |
227 | 0 | } |
228 | | |
229 | | /* Compute the union of `src_1' and `src_2' and write the result to `dst' |
230 | | * It is assumed that `dst' is distinct from both `src_1' and `src_2'. */ |
231 | | void run_container_union(const run_container_t *src_1, |
232 | 0 | const run_container_t *src_2, run_container_t *dst) { |
233 | | // TODO: this could be a lot more efficient |
234 | | |
235 | | // we start out with inexpensive checks |
236 | 0 | const bool if1 = run_container_is_full(src_1); |
237 | 0 | const bool if2 = run_container_is_full(src_2); |
238 | 0 | if (if1 || if2) { |
239 | 0 | if (if1) { |
240 | 0 | run_container_copy(src_1, dst); |
241 | 0 | return; |
242 | 0 | } |
243 | 0 | if (if2) { |
244 | 0 | run_container_copy(src_2, dst); |
245 | 0 | return; |
246 | 0 | } |
247 | 0 | } |
248 | 0 | const int32_t neededcapacity = src_1->n_runs + src_2->n_runs; |
249 | 0 | if (dst->capacity < neededcapacity) |
250 | 0 | run_container_grow(dst, neededcapacity, false); |
251 | 0 | dst->n_runs = 0; |
252 | 0 | int32_t rlepos = 0; |
253 | 0 | int32_t xrlepos = 0; |
254 | |
|
255 | 0 | rle16_t previousrle; |
256 | 0 | if (src_1->runs[rlepos].value <= src_2->runs[xrlepos].value) { |
257 | 0 | previousrle = run_container_append_first(dst, src_1->runs[rlepos]); |
258 | 0 | rlepos++; |
259 | 0 | } else { |
260 | 0 | previousrle = run_container_append_first(dst, src_2->runs[xrlepos]); |
261 | 0 | xrlepos++; |
262 | 0 | } |
263 | |
|
264 | 0 | while ((xrlepos < src_2->n_runs) && (rlepos < src_1->n_runs)) { |
265 | 0 | rle16_t newrl; |
266 | 0 | if (src_1->runs[rlepos].value <= src_2->runs[xrlepos].value) { |
267 | 0 | newrl = src_1->runs[rlepos]; |
268 | 0 | rlepos++; |
269 | 0 | } else { |
270 | 0 | newrl = src_2->runs[xrlepos]; |
271 | 0 | xrlepos++; |
272 | 0 | } |
273 | 0 | run_container_append(dst, newrl, &previousrle); |
274 | 0 | } |
275 | 0 | while (xrlepos < src_2->n_runs) { |
276 | 0 | run_container_append(dst, src_2->runs[xrlepos], &previousrle); |
277 | 0 | xrlepos++; |
278 | 0 | } |
279 | 0 | while (rlepos < src_1->n_runs) { |
280 | 0 | run_container_append(dst, src_1->runs[rlepos], &previousrle); |
281 | 0 | rlepos++; |
282 | 0 | } |
283 | 0 | } |
284 | | |
285 | | /* Compute the union of `src_1' and `src_2' and write the result to `src_1' |
286 | | */ |
287 | | void run_container_union_inplace(run_container_t *src_1, |
288 | 0 | const run_container_t *src_2) { |
289 | | // TODO: this could be a lot more efficient |
290 | | |
291 | | // we start out with inexpensive checks |
292 | 0 | const bool if1 = run_container_is_full(src_1); |
293 | 0 | const bool if2 = run_container_is_full(src_2); |
294 | 0 | if (if1 || if2) { |
295 | 0 | if (if1) { |
296 | 0 | return; |
297 | 0 | } |
298 | 0 | if (if2) { |
299 | 0 | run_container_copy(src_2, src_1); |
300 | 0 | return; |
301 | 0 | } |
302 | 0 | } |
303 | | // we move the data to the end of the current array |
304 | 0 | const int32_t maxoutput = src_1->n_runs + src_2->n_runs; |
305 | 0 | const int32_t neededcapacity = maxoutput + src_1->n_runs; |
306 | 0 | if (src_1->capacity < neededcapacity) |
307 | 0 | run_container_grow(src_1, neededcapacity, true); |
308 | 0 | memmove(src_1->runs + maxoutput, src_1->runs, |
309 | 0 | src_1->n_runs * sizeof(rle16_t)); |
310 | 0 | rle16_t *inputsrc1 = src_1->runs + maxoutput; |
311 | 0 | const int32_t input1nruns = src_1->n_runs; |
312 | 0 | src_1->n_runs = 0; |
313 | 0 | int32_t rlepos = 0; |
314 | 0 | int32_t xrlepos = 0; |
315 | |
|
316 | 0 | rle16_t previousrle; |
317 | 0 | if (inputsrc1[rlepos].value <= src_2->runs[xrlepos].value) { |
318 | 0 | previousrle = run_container_append_first(src_1, inputsrc1[rlepos]); |
319 | 0 | rlepos++; |
320 | 0 | } else { |
321 | 0 | previousrle = run_container_append_first(src_1, src_2->runs[xrlepos]); |
322 | 0 | xrlepos++; |
323 | 0 | } |
324 | 0 | while ((xrlepos < src_2->n_runs) && (rlepos < input1nruns)) { |
325 | 0 | rle16_t newrl; |
326 | 0 | if (inputsrc1[rlepos].value <= src_2->runs[xrlepos].value) { |
327 | 0 | newrl = inputsrc1[rlepos]; |
328 | 0 | rlepos++; |
329 | 0 | } else { |
330 | 0 | newrl = src_2->runs[xrlepos]; |
331 | 0 | xrlepos++; |
332 | 0 | } |
333 | 0 | run_container_append(src_1, newrl, &previousrle); |
334 | 0 | } |
335 | 0 | while (xrlepos < src_2->n_runs) { |
336 | 0 | run_container_append(src_1, src_2->runs[xrlepos], &previousrle); |
337 | 0 | xrlepos++; |
338 | 0 | } |
339 | 0 | while (rlepos < input1nruns) { |
340 | 0 | run_container_append(src_1, inputsrc1[rlepos], &previousrle); |
341 | 0 | rlepos++; |
342 | 0 | } |
343 | 0 | } |
344 | | |
345 | | /* Compute the symmetric difference of `src_1' and `src_2' and write the result |
346 | | * to `dst' |
347 | | * It is assumed that `dst' is distinct from both `src_1' and `src_2'. */ |
348 | | void run_container_xor(const run_container_t *src_1, |
349 | 0 | const run_container_t *src_2, run_container_t *dst) { |
350 | | // don't bother to convert xor with full range into negation |
351 | | // since negation is implemented similarly |
352 | |
|
353 | 0 | const int32_t neededcapacity = src_1->n_runs + src_2->n_runs; |
354 | 0 | if (dst->capacity < neededcapacity) |
355 | 0 | run_container_grow(dst, neededcapacity, false); |
356 | |
|
357 | 0 | int32_t pos1 = 0; |
358 | 0 | int32_t pos2 = 0; |
359 | 0 | dst->n_runs = 0; |
360 | |
|
361 | 0 | while ((pos1 < src_1->n_runs) && (pos2 < src_2->n_runs)) { |
362 | 0 | if (src_1->runs[pos1].value <= src_2->runs[pos2].value) { |
363 | 0 | run_container_smart_append_exclusive(dst, src_1->runs[pos1].value, |
364 | 0 | src_1->runs[pos1].length); |
365 | 0 | pos1++; |
366 | 0 | } else { |
367 | 0 | run_container_smart_append_exclusive(dst, src_2->runs[pos2].value, |
368 | 0 | src_2->runs[pos2].length); |
369 | 0 | pos2++; |
370 | 0 | } |
371 | 0 | } |
372 | 0 | while (pos1 < src_1->n_runs) { |
373 | 0 | run_container_smart_append_exclusive(dst, src_1->runs[pos1].value, |
374 | 0 | src_1->runs[pos1].length); |
375 | 0 | pos1++; |
376 | 0 | } |
377 | |
|
378 | 0 | while (pos2 < src_2->n_runs) { |
379 | 0 | run_container_smart_append_exclusive(dst, src_2->runs[pos2].value, |
380 | 0 | src_2->runs[pos2].length); |
381 | 0 | pos2++; |
382 | 0 | } |
383 | 0 | } |
384 | | |
385 | | /* Compute the intersection of src_1 and src_2 and write the result to |
386 | | * dst. It is assumed that dst is distinct from both src_1 and src_2. */ |
387 | | void run_container_intersection(const run_container_t *src_1, |
388 | | const run_container_t *src_2, |
389 | 0 | run_container_t *dst) { |
390 | 0 | const bool if1 = run_container_is_full(src_1); |
391 | 0 | const bool if2 = run_container_is_full(src_2); |
392 | 0 | if (if1 || if2) { |
393 | 0 | if (if1) { |
394 | 0 | run_container_copy(src_2, dst); |
395 | 0 | return; |
396 | 0 | } |
397 | 0 | if (if2) { |
398 | 0 | run_container_copy(src_1, dst); |
399 | 0 | return; |
400 | 0 | } |
401 | 0 | } |
402 | | // TODO: this could be a lot more efficient, could use SIMD optimizations |
403 | 0 | const int32_t neededcapacity = src_1->n_runs + src_2->n_runs; |
404 | 0 | if (dst->capacity < neededcapacity) |
405 | 0 | run_container_grow(dst, neededcapacity, false); |
406 | 0 | dst->n_runs = 0; |
407 | 0 | int32_t rlepos = 0; |
408 | 0 | int32_t xrlepos = 0; |
409 | 0 | int32_t start = src_1->runs[rlepos].value; |
410 | 0 | int32_t end = start + src_1->runs[rlepos].length + 1; |
411 | 0 | int32_t xstart = src_2->runs[xrlepos].value; |
412 | 0 | int32_t xend = xstart + src_2->runs[xrlepos].length + 1; |
413 | 0 | while ((rlepos < src_1->n_runs) && (xrlepos < src_2->n_runs)) { |
414 | 0 | if (end <= xstart) { |
415 | 0 | ++rlepos; |
416 | 0 | if (rlepos < src_1->n_runs) { |
417 | 0 | start = src_1->runs[rlepos].value; |
418 | 0 | end = start + src_1->runs[rlepos].length + 1; |
419 | 0 | } |
420 | 0 | } else if (xend <= start) { |
421 | 0 | ++xrlepos; |
422 | 0 | if (xrlepos < src_2->n_runs) { |
423 | 0 | xstart = src_2->runs[xrlepos].value; |
424 | 0 | xend = xstart + src_2->runs[xrlepos].length + 1; |
425 | 0 | } |
426 | 0 | } else { // they overlap |
427 | 0 | const int32_t lateststart = start > xstart ? start : xstart; |
428 | 0 | int32_t earliestend; |
429 | 0 | if (end == xend) { // improbable |
430 | 0 | earliestend = end; |
431 | 0 | rlepos++; |
432 | 0 | xrlepos++; |
433 | 0 | if (rlepos < src_1->n_runs) { |
434 | 0 | start = src_1->runs[rlepos].value; |
435 | 0 | end = start + src_1->runs[rlepos].length + 1; |
436 | 0 | } |
437 | 0 | if (xrlepos < src_2->n_runs) { |
438 | 0 | xstart = src_2->runs[xrlepos].value; |
439 | 0 | xend = xstart + src_2->runs[xrlepos].length + 1; |
440 | 0 | } |
441 | 0 | } else if (end < xend) { |
442 | 0 | earliestend = end; |
443 | 0 | rlepos++; |
444 | 0 | if (rlepos < src_1->n_runs) { |
445 | 0 | start = src_1->runs[rlepos].value; |
446 | 0 | end = start + src_1->runs[rlepos].length + 1; |
447 | 0 | } |
448 | |
|
449 | 0 | } else { // end > xend |
450 | 0 | earliestend = xend; |
451 | 0 | xrlepos++; |
452 | 0 | if (xrlepos < src_2->n_runs) { |
453 | 0 | xstart = src_2->runs[xrlepos].value; |
454 | 0 | xend = xstart + src_2->runs[xrlepos].length + 1; |
455 | 0 | } |
456 | 0 | } |
457 | 0 | dst->runs[dst->n_runs].value = (uint16_t)lateststart; |
458 | 0 | dst->runs[dst->n_runs].length = |
459 | 0 | (uint16_t)(earliestend - lateststart - 1); |
460 | 0 | dst->n_runs++; |
461 | 0 | } |
462 | 0 | } |
463 | 0 | } |
464 | | |
465 | | /* Compute the size of the intersection of src_1 and src_2 . */ |
466 | | int run_container_intersection_cardinality(const run_container_t *src_1, |
467 | 0 | const run_container_t *src_2) { |
468 | 0 | const bool if1 = run_container_is_full(src_1); |
469 | 0 | const bool if2 = run_container_is_full(src_2); |
470 | 0 | if (if1 || if2) { |
471 | 0 | if (if1) { |
472 | 0 | return run_container_cardinality(src_2); |
473 | 0 | } |
474 | 0 | if (if2) { |
475 | 0 | return run_container_cardinality(src_1); |
476 | 0 | } |
477 | 0 | } |
478 | 0 | int answer = 0; |
479 | 0 | int32_t rlepos = 0; |
480 | 0 | int32_t xrlepos = 0; |
481 | 0 | int32_t start = src_1->runs[rlepos].value; |
482 | 0 | int32_t end = start + src_1->runs[rlepos].length + 1; |
483 | 0 | int32_t xstart = src_2->runs[xrlepos].value; |
484 | 0 | int32_t xend = xstart + src_2->runs[xrlepos].length + 1; |
485 | 0 | while ((rlepos < src_1->n_runs) && (xrlepos < src_2->n_runs)) { |
486 | 0 | if (end <= xstart) { |
487 | 0 | ++rlepos; |
488 | 0 | if (rlepos < src_1->n_runs) { |
489 | 0 | start = src_1->runs[rlepos].value; |
490 | 0 | end = start + src_1->runs[rlepos].length + 1; |
491 | 0 | } |
492 | 0 | } else if (xend <= start) { |
493 | 0 | ++xrlepos; |
494 | 0 | if (xrlepos < src_2->n_runs) { |
495 | 0 | xstart = src_2->runs[xrlepos].value; |
496 | 0 | xend = xstart + src_2->runs[xrlepos].length + 1; |
497 | 0 | } |
498 | 0 | } else { // they overlap |
499 | 0 | const int32_t lateststart = start > xstart ? start : xstart; |
500 | 0 | int32_t earliestend; |
501 | 0 | if (end == xend) { // improbable |
502 | 0 | earliestend = end; |
503 | 0 | rlepos++; |
504 | 0 | xrlepos++; |
505 | 0 | if (rlepos < src_1->n_runs) { |
506 | 0 | start = src_1->runs[rlepos].value; |
507 | 0 | end = start + src_1->runs[rlepos].length + 1; |
508 | 0 | } |
509 | 0 | if (xrlepos < src_2->n_runs) { |
510 | 0 | xstart = src_2->runs[xrlepos].value; |
511 | 0 | xend = xstart + src_2->runs[xrlepos].length + 1; |
512 | 0 | } |
513 | 0 | } else if (end < xend) { |
514 | 0 | earliestend = end; |
515 | 0 | rlepos++; |
516 | 0 | if (rlepos < src_1->n_runs) { |
517 | 0 | start = src_1->runs[rlepos].value; |
518 | 0 | end = start + src_1->runs[rlepos].length + 1; |
519 | 0 | } |
520 | |
|
521 | 0 | } else { // end > xend |
522 | 0 | earliestend = xend; |
523 | 0 | xrlepos++; |
524 | 0 | if (xrlepos < src_2->n_runs) { |
525 | 0 | xstart = src_2->runs[xrlepos].value; |
526 | 0 | xend = xstart + src_2->runs[xrlepos].length + 1; |
527 | 0 | } |
528 | 0 | } |
529 | 0 | answer += earliestend - lateststart; |
530 | 0 | } |
531 | 0 | } |
532 | 0 | return answer; |
533 | 0 | } |
534 | | |
535 | | bool run_container_intersect(const run_container_t *src_1, |
536 | 0 | const run_container_t *src_2) { |
537 | 0 | const bool if1 = run_container_is_full(src_1); |
538 | 0 | const bool if2 = run_container_is_full(src_2); |
539 | 0 | if (if1 || if2) { |
540 | 0 | if (if1) { |
541 | 0 | return !run_container_empty(src_2); |
542 | 0 | } |
543 | 0 | if (if2) { |
544 | 0 | return !run_container_empty(src_1); |
545 | 0 | } |
546 | 0 | } |
547 | 0 | int32_t rlepos = 0; |
548 | 0 | int32_t xrlepos = 0; |
549 | 0 | int32_t start = src_1->runs[rlepos].value; |
550 | 0 | int32_t end = start + src_1->runs[rlepos].length + 1; |
551 | 0 | int32_t xstart = src_2->runs[xrlepos].value; |
552 | 0 | int32_t xend = xstart + src_2->runs[xrlepos].length + 1; |
553 | 0 | while ((rlepos < src_1->n_runs) && (xrlepos < src_2->n_runs)) { |
554 | 0 | if (end <= xstart) { |
555 | 0 | ++rlepos; |
556 | 0 | if (rlepos < src_1->n_runs) { |
557 | 0 | start = src_1->runs[rlepos].value; |
558 | 0 | end = start + src_1->runs[rlepos].length + 1; |
559 | 0 | } |
560 | 0 | } else if (xend <= start) { |
561 | 0 | ++xrlepos; |
562 | 0 | if (xrlepos < src_2->n_runs) { |
563 | 0 | xstart = src_2->runs[xrlepos].value; |
564 | 0 | xend = xstart + src_2->runs[xrlepos].length + 1; |
565 | 0 | } |
566 | 0 | } else { // they overlap |
567 | 0 | return true; |
568 | 0 | } |
569 | 0 | } |
570 | 0 | return false; |
571 | 0 | } |
572 | | |
573 | | /* Compute the difference of src_1 and src_2 and write the result to |
574 | | * dst. It is assumed that dst is distinct from both src_1 and src_2. */ |
575 | | void run_container_andnot(const run_container_t *src_1, |
576 | 0 | const run_container_t *src_2, run_container_t *dst) { |
577 | | // following Java implementation as of June 2016 |
578 | |
|
579 | 0 | if (dst->capacity < src_1->n_runs + src_2->n_runs) |
580 | 0 | run_container_grow(dst, src_1->n_runs + src_2->n_runs, false); |
581 | |
|
582 | 0 | dst->n_runs = 0; |
583 | |
|
584 | 0 | int rlepos1 = 0; |
585 | 0 | int rlepos2 = 0; |
586 | 0 | int32_t start = src_1->runs[rlepos1].value; |
587 | 0 | int32_t end = start + src_1->runs[rlepos1].length + 1; |
588 | 0 | int32_t start2 = src_2->runs[rlepos2].value; |
589 | 0 | int32_t end2 = start2 + src_2->runs[rlepos2].length + 1; |
590 | |
|
591 | 0 | while ((rlepos1 < src_1->n_runs) && (rlepos2 < src_2->n_runs)) { |
592 | 0 | if (end <= start2) { |
593 | | // output the first run |
594 | 0 | dst->runs[dst->n_runs++] = |
595 | 0 | CROARING_MAKE_RLE16(start, end - start - 1); |
596 | 0 | rlepos1++; |
597 | 0 | if (rlepos1 < src_1->n_runs) { |
598 | 0 | start = src_1->runs[rlepos1].value; |
599 | 0 | end = start + src_1->runs[rlepos1].length + 1; |
600 | 0 | } |
601 | 0 | } else if (end2 <= start) { |
602 | | // exit the second run |
603 | 0 | rlepos2++; |
604 | 0 | if (rlepos2 < src_2->n_runs) { |
605 | 0 | start2 = src_2->runs[rlepos2].value; |
606 | 0 | end2 = start2 + src_2->runs[rlepos2].length + 1; |
607 | 0 | } |
608 | 0 | } else { |
609 | 0 | if (start < start2) { |
610 | 0 | dst->runs[dst->n_runs++] = |
611 | 0 | CROARING_MAKE_RLE16(start, start2 - start - 1); |
612 | 0 | } |
613 | 0 | if (end2 < end) { |
614 | 0 | start = end2; |
615 | 0 | } else { |
616 | 0 | rlepos1++; |
617 | 0 | if (rlepos1 < src_1->n_runs) { |
618 | 0 | start = src_1->runs[rlepos1].value; |
619 | 0 | end = start + src_1->runs[rlepos1].length + 1; |
620 | 0 | } |
621 | 0 | } |
622 | 0 | } |
623 | 0 | } |
624 | 0 | if (rlepos1 < src_1->n_runs) { |
625 | 0 | dst->runs[dst->n_runs++] = CROARING_MAKE_RLE16(start, end - start - 1); |
626 | 0 | rlepos1++; |
627 | 0 | if (rlepos1 < src_1->n_runs) { |
628 | 0 | memcpy(dst->runs + dst->n_runs, src_1->runs + rlepos1, |
629 | 0 | sizeof(rle16_t) * (src_1->n_runs - rlepos1)); |
630 | 0 | dst->n_runs += src_1->n_runs - rlepos1; |
631 | 0 | } |
632 | 0 | } |
633 | 0 | } |
634 | | |
635 | | /* |
636 | | * Print this container using printf (useful for debugging). |
637 | | */ |
638 | 0 | void run_container_printf(const run_container_t *cont) { |
639 | 0 | for (int i = 0; i < cont->n_runs; ++i) { |
640 | 0 | uint16_t run_start = cont->runs[i].value; |
641 | 0 | uint16_t le = cont->runs[i].length; |
642 | 0 | printf("[%d,%d]", run_start, run_start + le); |
643 | 0 | } |
644 | 0 | } |
645 | | |
646 | | /* |
647 | | * Print this container using printf as a comma-separated list of 32-bit |
648 | | * integers starting at base. |
649 | | */ |
650 | | void run_container_printf_as_uint32_array(const run_container_t *cont, |
651 | 0 | uint32_t base) { |
652 | 0 | if (cont->n_runs == 0) return; |
653 | 0 | { |
654 | 0 | uint32_t run_start = base + cont->runs[0].value; |
655 | 0 | uint16_t le = cont->runs[0].length; |
656 | 0 | printf("%u", run_start); |
657 | 0 | for (uint32_t j = 1; j <= le; ++j) printf(",%u", run_start + j); |
658 | 0 | } |
659 | 0 | for (int32_t i = 1; i < cont->n_runs; ++i) { |
660 | 0 | uint32_t run_start = base + cont->runs[i].value; |
661 | 0 | uint16_t le = cont->runs[i].length; |
662 | 0 | for (uint32_t j = 0; j <= le; ++j) printf(",%u", run_start + j); |
663 | 0 | } |
664 | 0 | } |
665 | | |
666 | | /* |
667 | | * Validate the container. Returns true if valid. |
668 | | */ |
669 | 4.42k | bool run_container_validate(const run_container_t *run, const char **reason) { |
670 | 4.42k | if (run->n_runs < 0) { |
671 | 0 | *reason = "negative run count"; |
672 | 0 | return false; |
673 | 0 | } |
674 | 4.42k | if (run->capacity < 0) { |
675 | 0 | *reason = "negative run capacity"; |
676 | 0 | return false; |
677 | 0 | } |
678 | 4.42k | if (run->capacity < run->n_runs) { |
679 | 0 | *reason = "capacity less than run count"; |
680 | 0 | return false; |
681 | 0 | } |
682 | | |
683 | 4.42k | if (run->n_runs == 0) { |
684 | 193 | *reason = "zero run count"; |
685 | 193 | return false; |
686 | 193 | } |
687 | 4.23k | if (run->runs == NULL) { |
688 | 0 | *reason = "NULL runs"; |
689 | 0 | return false; |
690 | 0 | } |
691 | | |
692 | | // Use uint32_t to avoid overflow issues on ranges that contain UINT16_MAX. |
693 | 4.23k | uint32_t last_end = 0; |
694 | 9.67k | for (int i = 0; i < run->n_runs; ++i) { |
695 | 5.48k | uint32_t start = run->runs[i].value; |
696 | 5.48k | uint32_t end = start + run->runs[i].length + 1; |
697 | 5.48k | if (end <= start) { |
698 | 0 | *reason = "run start + length overflow"; |
699 | 0 | return false; |
700 | 0 | } |
701 | 5.48k | if (end > (1 << 16)) { |
702 | 7 | *reason = "run start + length too large"; |
703 | 7 | return false; |
704 | 7 | } |
705 | 5.48k | if (start < last_end) { |
706 | 25 | *reason = "run start less than last end"; |
707 | 25 | return false; |
708 | 25 | } |
709 | 5.45k | if (start == last_end && last_end != 0) { |
710 | 15 | *reason = "run start equal to last end, should have combined"; |
711 | 15 | return false; |
712 | 15 | } |
713 | 5.44k | last_end = end; |
714 | 5.44k | } |
715 | 4.18k | return true; |
716 | 4.23k | } |
717 | | |
718 | 0 | int32_t run_container_write(const run_container_t *container, char *buf) { |
719 | 0 | uint16_t cast_16 = container->n_runs; |
720 | 0 | memcpy(buf, &cast_16, sizeof(uint16_t)); |
721 | 0 | memcpy(buf + sizeof(uint16_t), container->runs, |
722 | 0 | container->n_runs * sizeof(rle16_t)); |
723 | 0 | return run_container_size_in_bytes(container); |
724 | 0 | } |
725 | | |
726 | | int32_t run_container_read(int32_t cardinality, run_container_t *container, |
727 | 52.5k | const char *buf) { |
728 | 52.5k | (void)cardinality; |
729 | 52.5k | uint16_t cast_16; |
730 | 52.5k | memcpy(&cast_16, buf, sizeof(uint16_t)); |
731 | 52.5k | container->n_runs = cast_16; |
732 | 52.5k | if (container->n_runs > container->capacity) |
733 | 4.32k | run_container_grow(container, container->n_runs, false); |
734 | 52.5k | if (container->n_runs > 0) { |
735 | 4.32k | memcpy(container->runs, buf + sizeof(uint16_t), |
736 | 4.32k | container->n_runs * sizeof(rle16_t)); |
737 | 4.32k | } |
738 | 52.5k | return run_container_size_in_bytes(container); |
739 | 52.5k | } |
740 | | |
741 | | bool run_container_iterate(const run_container_t *cont, uint32_t base, |
742 | 0 | roaring_iterator iterator, void *ptr) { |
743 | 0 | for (int i = 0; i < cont->n_runs; ++i) { |
744 | 0 | uint32_t run_start = base + cont->runs[i].value; |
745 | 0 | uint16_t le = cont->runs[i].length; |
746 | |
|
747 | 0 | for (int j = 0; j <= le; ++j) |
748 | 0 | if (!iterator(run_start + j, ptr)) return false; |
749 | 0 | } |
750 | 0 | return true; |
751 | 0 | } |
752 | | |
753 | | bool run_container_iterate64(const run_container_t *cont, uint32_t base, |
754 | | roaring_iterator64 iterator, uint64_t high_bits, |
755 | 0 | void *ptr) { |
756 | 0 | for (int i = 0; i < cont->n_runs; ++i) { |
757 | 0 | uint32_t run_start = base + cont->runs[i].value; |
758 | 0 | uint16_t le = cont->runs[i].length; |
759 | |
|
760 | 0 | for (int j = 0; j <= le; ++j) |
761 | 0 | if (!iterator(high_bits | (uint64_t)(run_start + j), ptr)) |
762 | 0 | return false; |
763 | 0 | } |
764 | 0 | return true; |
765 | 0 | } |
766 | | |
767 | | bool run_container_is_subset(const run_container_t *container1, |
768 | 0 | const run_container_t *container2) { |
769 | 0 | int i1 = 0, i2 = 0; |
770 | 0 | while (i1 < container1->n_runs && i2 < container2->n_runs) { |
771 | 0 | int start1 = container1->runs[i1].value; |
772 | 0 | int stop1 = start1 + container1->runs[i1].length; |
773 | 0 | int start2 = container2->runs[i2].value; |
774 | 0 | int stop2 = start2 + container2->runs[i2].length; |
775 | 0 | if (start1 < start2) { |
776 | 0 | return false; |
777 | 0 | } else { // start1 >= start2 |
778 | 0 | if (stop1 < stop2) { |
779 | 0 | i1++; |
780 | 0 | } else if (stop1 == stop2) { |
781 | 0 | i1++; |
782 | 0 | i2++; |
783 | 0 | } else { // stop1 > stop2 |
784 | 0 | i2++; |
785 | 0 | } |
786 | 0 | } |
787 | 0 | } |
788 | 0 | if (i1 == container1->n_runs) { |
789 | 0 | return true; |
790 | 0 | } else { |
791 | 0 | return false; |
792 | 0 | } |
793 | 0 | } |
794 | | |
795 | | // TODO: write smart_append_exclusive version to match the overloaded 1 param |
796 | | // Java version (or is it even used?) |
797 | | |
798 | | // follows the Java implementation closely |
799 | | // length is the rle-value. Ie, run [10,12) uses a length value 1. |
800 | | void run_container_smart_append_exclusive(run_container_t *src, |
801 | | const uint16_t start, |
802 | 0 | const uint16_t length) { |
803 | 0 | int old_end; |
804 | 0 | rle16_t *last_run = src->n_runs ? src->runs + (src->n_runs - 1) : NULL; |
805 | 0 | rle16_t *appended_last_run = src->runs + src->n_runs; |
806 | |
|
807 | 0 | if (!src->n_runs || |
808 | 0 | (start > (old_end = last_run->value + last_run->length + 1))) { |
809 | 0 | *appended_last_run = CROARING_MAKE_RLE16(start, length); |
810 | 0 | src->n_runs++; |
811 | 0 | return; |
812 | 0 | } |
813 | 0 | if (old_end == start) { |
814 | | // we merge |
815 | 0 | last_run->length += (length + 1); |
816 | 0 | return; |
817 | 0 | } |
818 | 0 | int new_end = start + length + 1; |
819 | |
|
820 | 0 | if (start == last_run->value) { |
821 | | // wipe out previous |
822 | 0 | if (new_end < old_end) { |
823 | 0 | *last_run = CROARING_MAKE_RLE16(new_end, old_end - new_end - 1); |
824 | 0 | return; |
825 | 0 | } else if (new_end > old_end) { |
826 | 0 | *last_run = CROARING_MAKE_RLE16(old_end, new_end - old_end - 1); |
827 | 0 | return; |
828 | 0 | } else { |
829 | 0 | src->n_runs--; |
830 | 0 | return; |
831 | 0 | } |
832 | 0 | } |
833 | 0 | last_run->length = start - last_run->value - 1; |
834 | 0 | if (new_end < old_end) { |
835 | 0 | *appended_last_run = |
836 | 0 | CROARING_MAKE_RLE16(new_end, old_end - new_end - 1); |
837 | 0 | src->n_runs++; |
838 | 0 | } else if (new_end > old_end) { |
839 | 0 | *appended_last_run = |
840 | 0 | CROARING_MAKE_RLE16(old_end, new_end - old_end - 1); |
841 | 0 | src->n_runs++; |
842 | 0 | } |
843 | 0 | } |
844 | | |
845 | | bool run_container_select(const run_container_t *container, |
846 | | uint32_t *start_rank, uint32_t rank, |
847 | 0 | uint32_t *element) { |
848 | 0 | for (int i = 0; i < container->n_runs; i++) { |
849 | 0 | uint16_t length = container->runs[i].length; |
850 | 0 | if (rank <= *start_rank + length) { |
851 | 0 | uint16_t value = container->runs[i].value; |
852 | 0 | *element = value + rank - (*start_rank); |
853 | 0 | return true; |
854 | 0 | } else |
855 | 0 | *start_rank += length + 1; |
856 | 0 | } |
857 | 0 | return false; |
858 | 0 | } |
859 | | |
860 | 0 | int run_container_rank(const run_container_t *container, uint16_t x) { |
861 | 0 | int sum = 0; |
862 | 0 | uint32_t x32 = x; |
863 | 0 | for (int i = 0; i < container->n_runs; i++) { |
864 | 0 | uint32_t startpoint = container->runs[i].value; |
865 | 0 | uint32_t length = container->runs[i].length; |
866 | 0 | uint32_t endpoint = length + startpoint; |
867 | 0 | if (x <= endpoint) { |
868 | 0 | if (x < startpoint) break; |
869 | 0 | return sum + (x32 - startpoint) + 1; |
870 | 0 | } else { |
871 | 0 | sum += length + 1; |
872 | 0 | } |
873 | 0 | } |
874 | 0 | return sum; |
875 | 0 | } |
876 | | uint32_t run_container_rank_many(const run_container_t *container, |
877 | | uint64_t start_rank, const uint32_t *begin, |
878 | 0 | const uint32_t *end, uint64_t *ans) { |
879 | 0 | const uint16_t high = (uint16_t)((*begin) >> 16); |
880 | 0 | const uint32_t *iter = begin; |
881 | 0 | int sum = 0; |
882 | 0 | int i = 0; |
883 | 0 | for (; iter != end; iter++) { |
884 | 0 | uint32_t x = *iter; |
885 | 0 | uint16_t xhigh = (uint16_t)(x >> 16); |
886 | 0 | if (xhigh != high) return iter - begin; // stop at next container |
887 | | |
888 | 0 | uint32_t x32 = x & 0xFFFF; |
889 | 0 | while (i < container->n_runs) { |
890 | 0 | uint32_t startpoint = container->runs[i].value; |
891 | 0 | uint32_t length = container->runs[i].length; |
892 | 0 | uint32_t endpoint = length + startpoint; |
893 | 0 | if (x32 <= endpoint) { |
894 | 0 | if (x32 < startpoint) { |
895 | 0 | *(ans++) = start_rank + sum; |
896 | 0 | } else { |
897 | 0 | *(ans++) = start_rank + sum + (x32 - startpoint) + 1; |
898 | 0 | } |
899 | 0 | break; |
900 | 0 | } else { |
901 | 0 | sum += length + 1; |
902 | 0 | i++; |
903 | 0 | } |
904 | 0 | } |
905 | 0 | if (i >= container->n_runs) *(ans++) = start_rank + sum; |
906 | 0 | } |
907 | | |
908 | 0 | return iter - begin; |
909 | 0 | } |
910 | | |
911 | 0 | int run_container_get_index(const run_container_t *container, uint16_t x) { |
912 | 0 | if (run_container_contains(container, x)) { |
913 | 0 | int sum = 0; |
914 | 0 | uint32_t x32 = x; |
915 | 0 | for (int i = 0; i < container->n_runs; i++) { |
916 | 0 | uint32_t startpoint = container->runs[i].value; |
917 | 0 | uint32_t length = container->runs[i].length; |
918 | 0 | uint32_t endpoint = length + startpoint; |
919 | 0 | if (x <= endpoint) { |
920 | 0 | if (x < startpoint) break; |
921 | 0 | return sum + (x32 - startpoint); |
922 | 0 | } else { |
923 | 0 | sum += length + 1; |
924 | 0 | } |
925 | 0 | } |
926 | 0 | return sum - 1; |
927 | 0 | } else { |
928 | 0 | return -1; |
929 | 0 | } |
930 | 0 | } |
931 | | |
932 | | #if defined(CROARING_IS_X64) && CROARING_COMPILER_SUPPORTS_AVX512 |
933 | | |
934 | | CROARING_TARGET_AVX512 |
935 | | CROARING_ALLOW_UNALIGNED |
936 | | /* Get the cardinality of `run'. Requires an actual computation. */ |
937 | | static inline int _avx512_run_container_cardinality( |
938 | 0 | const run_container_t *run) { |
939 | 0 | const int32_t n_runs = run->n_runs; |
940 | 0 | const rle16_t *runs = run->runs; |
941 | 0 |
|
942 | 0 | int32_t k = 0; |
943 | 0 | const int32_t step512 = sizeof(__m512i) / sizeof(rle16_t); |
944 | 0 | __m512i total = _mm512_setzero_si512(); |
945 | 0 | for (; k + step512 <= n_runs; k += step512) { |
946 | 0 | __m512i ymm1 = _mm512_loadu_si512((const __m512i *)(runs + k)); |
947 | 0 | __m512i justlengths = _mm512_srli_epi32(ymm1, 16); |
948 | 0 | total = _mm512_add_epi32(total, justlengths); |
949 | 0 | } |
950 | 0 | if (k < n_runs) { |
951 | 0 | __m512i ymm1 = _mm512_maskz_loadu_epi32((1 << (n_runs - k)) - 1, |
952 | 0 | (const __m512i *)(runs + k)); |
953 | 0 | __m512i justlengths = _mm512_srli_epi32(ymm1, 16); |
954 | 0 | total = _mm512_add_epi32(total, justlengths); |
955 | 0 | } |
956 | 0 | return _mm512_reduce_add_epi32(total) + n_runs; |
957 | 0 | } |
958 | | |
959 | | CROARING_UNTARGET_AVX512 |
960 | | |
961 | | CROARING_TARGET_AVX2 |
962 | | CROARING_ALLOW_UNALIGNED |
963 | | /* Get the cardinality of `run'. Requires an actual computation. */ |
964 | 7.08k | static inline int _avx2_run_container_cardinality(const run_container_t *run) { |
965 | 7.08k | const int32_t n_runs = run->n_runs; |
966 | 7.08k | const rle16_t *runs = run->runs; |
967 | | |
968 | | /* by initializing with n_runs, we omit counting the +1 for each pair. */ |
969 | 7.08k | int sum = n_runs; |
970 | 7.08k | int32_t k = 0; |
971 | 7.08k | const int32_t step = sizeof(__m256i) / sizeof(rle16_t); |
972 | 7.08k | if (n_runs > step) { |
973 | 124 | __m256i total = _mm256_setzero_si256(); |
974 | 346 | for (; k + step <= n_runs; k += step) { |
975 | 222 | __m256i ymm1 = _mm256_lddqu_si256((const __m256i *)(runs + k)); |
976 | 222 | __m256i justlengths = _mm256_srli_epi32(ymm1, 16); |
977 | 222 | total = _mm256_add_epi32(total, justlengths); |
978 | 222 | } |
979 | | // a store might be faster than extract? |
980 | 124 | uint32_t buffer[sizeof(__m256i) / sizeof(rle16_t)]; |
981 | 124 | _mm256_storeu_si256((__m256i *)buffer, total); |
982 | 124 | sum += (buffer[0] + buffer[1]) + (buffer[2] + buffer[3]) + |
983 | 124 | (buffer[4] + buffer[5]) + (buffer[6] + buffer[7]); |
984 | 124 | } |
985 | 14.5k | for (; k < n_runs; ++k) { |
986 | 7.48k | sum += runs[k].length; |
987 | 7.48k | } |
988 | | |
989 | 7.08k | return sum; |
990 | 7.08k | } |
991 | | |
992 | | CROARING_ALLOW_UNALIGNED |
993 | | int _avx2_run_container_to_uint32_array(void *vout, const run_container_t *cont, |
994 | 0 | uint32_t base) { |
995 | 0 | int outpos = 0; |
996 | 0 | uint32_t *out = (uint32_t *)vout; |
997 | |
|
998 | 0 | for (int i = 0; i < cont->n_runs; ++i) { |
999 | 0 | uint32_t run_start = base + cont->runs[i].value; |
1000 | 0 | uint16_t le = cont->runs[i].length; |
1001 | 0 | if (le < 8) { |
1002 | 0 | for (int j = 0; j <= le; ++j) { |
1003 | 0 | uint32_t val = run_start + j; |
1004 | 0 | memcpy(out + outpos, &val, |
1005 | 0 | sizeof(uint32_t)); // should be compiled as a MOV on x64 |
1006 | 0 | outpos++; |
1007 | 0 | } |
1008 | 0 | } else { |
1009 | 0 | int j = 0; |
1010 | 0 | __m256i run_start_v = _mm256_set1_epi32(run_start); |
1011 | | // [8,8,8,8....] |
1012 | 0 | __m256i inc = _mm256_set1_epi32(8); |
1013 | | // used for generate sequence: |
1014 | | // [0, 1, 2, 3...], [8, 9, 10,...] |
1015 | 0 | __m256i delta = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); |
1016 | 0 | for (j = 0; j + 8 <= le; j += 8) { |
1017 | 0 | __m256i val_v = _mm256_add_epi32(run_start_v, delta); |
1018 | 0 | _mm256_storeu_si256((__m256i *)(out + outpos), val_v); |
1019 | 0 | delta = _mm256_add_epi32(inc, delta); |
1020 | 0 | outpos += 8; |
1021 | 0 | } |
1022 | 0 | for (; j <= le; ++j) { |
1023 | 0 | uint32_t val = run_start + j; |
1024 | 0 | memcpy(out + outpos, &val, |
1025 | 0 | sizeof(uint32_t)); // should be compiled as a MOV on x64 |
1026 | 0 | outpos++; |
1027 | 0 | } |
1028 | 0 | } |
1029 | 0 | } |
1030 | 0 | return outpos; |
1031 | 0 | } |
1032 | | |
1033 | | CROARING_UNTARGET_AVX2 |
1034 | | |
1035 | | /* Get the cardinality of `run'. Requires an actual computation. */ |
1036 | | static inline int _scalar_run_container_cardinality( |
1037 | 0 | const run_container_t *run) { |
1038 | 0 | const int32_t n_runs = run->n_runs; |
1039 | 0 | const rle16_t *runs = run->runs; |
1040 | | |
1041 | | /* by initializing with n_runs, we omit counting the +1 for each pair. */ |
1042 | 0 | int sum = n_runs; |
1043 | 0 | for (int k = 0; k < n_runs; ++k) { |
1044 | 0 | sum += runs[k].length; |
1045 | 0 | } |
1046 | |
|
1047 | 0 | return sum; |
1048 | 0 | } |
1049 | | |
1050 | 7.08k | int run_container_cardinality(const run_container_t *run) { |
1051 | | // Empirically AVX-512 is not always faster than AVX2 |
1052 | 7.08k | #define CROARING_ENABLE_AVX512_RUN_CONTAINER_CARDINALITY 0 |
1053 | | #if CROARING_COMPILER_SUPPORTS_AVX512 && \ |
1054 | | CROARING_ENABLE_AVX512_RUN_CONTAINER_CARDINALITY |
1055 | | if (croaring_hardware_support() & ROARING_SUPPORTS_AVX512) { |
1056 | | return _avx512_run_container_cardinality(run); |
1057 | | } else |
1058 | | #endif |
1059 | 7.08k | if (croaring_hardware_support() & ROARING_SUPPORTS_AVX2) { |
1060 | 7.08k | return _avx2_run_container_cardinality(run); |
1061 | 7.08k | } else { |
1062 | 0 | return _scalar_run_container_cardinality(run); |
1063 | 0 | } |
1064 | 7.08k | } |
1065 | | |
1066 | | int _scalar_run_container_to_uint32_array(void *vout, |
1067 | | const run_container_t *cont, |
1068 | 0 | uint32_t base) { |
1069 | 0 | int outpos = 0; |
1070 | 0 | uint32_t *out = (uint32_t *)vout; |
1071 | 0 | for (int i = 0; i < cont->n_runs; ++i) { |
1072 | 0 | uint32_t run_start = base + cont->runs[i].value; |
1073 | 0 | uint16_t le = cont->runs[i].length; |
1074 | 0 | for (int j = 0; j <= le; ++j) { |
1075 | 0 | uint32_t val = run_start + j; |
1076 | 0 | memcpy(out + outpos, &val, |
1077 | 0 | sizeof(uint32_t)); // should be compiled as a MOV on x64 |
1078 | 0 | outpos++; |
1079 | 0 | } |
1080 | 0 | } |
1081 | 0 | return outpos; |
1082 | 0 | } |
1083 | | |
1084 | | int run_container_to_uint32_array(void *vout, const run_container_t *cont, |
1085 | 0 | uint32_t base) { |
1086 | 0 | if (croaring_hardware_support() & ROARING_SUPPORTS_AVX2) { |
1087 | 0 | return _avx2_run_container_to_uint32_array(vout, cont, base); |
1088 | 0 | } else { |
1089 | 0 | return _scalar_run_container_to_uint32_array(vout, cont, base); |
1090 | 0 | } |
1091 | 0 | } |
1092 | | |
1093 | | #else |
1094 | | |
1095 | | /* Get the cardinality of `run'. Requires an actual computation. */ |
1096 | | CROARING_ALLOW_UNALIGNED |
1097 | | int run_container_cardinality(const run_container_t *run) { |
1098 | | const int32_t n_runs = run->n_runs; |
1099 | | const rle16_t *runs = run->runs; |
1100 | | |
1101 | | /* by initializing with n_runs, we omit counting the +1 for each pair. */ |
1102 | | int sum = n_runs; |
1103 | | for (int k = 0; k < n_runs; ++k) { |
1104 | | sum += runs[k].length; |
1105 | | } |
1106 | | |
1107 | | return sum; |
1108 | | } |
1109 | | |
1110 | | CROARING_ALLOW_UNALIGNED |
1111 | | int run_container_to_uint32_array(void *vout, const run_container_t *cont, |
1112 | | uint32_t base) { |
1113 | | int outpos = 0; |
1114 | | uint32_t *out = (uint32_t *)vout; |
1115 | | for (int i = 0; i < cont->n_runs; ++i) { |
1116 | | uint32_t run_start = base + cont->runs[i].value; |
1117 | | uint16_t le = cont->runs[i].length; |
1118 | | for (int j = 0; j <= le; ++j) { |
1119 | | uint32_t val = run_start + j; |
1120 | | memcpy(out + outpos, &val, |
1121 | | sizeof(uint32_t)); // should be compiled as a MOV on x64 |
1122 | | outpos++; |
1123 | | } |
1124 | | } |
1125 | | return outpos; |
1126 | | } |
1127 | | |
1128 | | #endif |
1129 | | |
1130 | | #ifdef __cplusplus |
1131 | | } |
1132 | | } |
1133 | | } // extern "C" { namespace roaring { namespace internal { |
1134 | | #endif |
1135 | | #if defined(__GNUC__) && !defined(__clang__) |
1136 | | #pragma GCC diagnostic pop |
1137 | | #endif |