/src/croaring/src/containers/run.c
Line | Count | Source |
1 | | #include <stdio.h> |
2 | | #include <stdlib.h> |
3 | | |
4 | | #include <roaring/containers/run.h> |
5 | | #include <roaring/memory.h> |
6 | | #include <roaring/portability.h> |
7 | | |
8 | | #if CROARING_IS_X64 |
9 | | #ifndef CROARING_COMPILER_SUPPORTS_AVX512 |
10 | | #error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined." |
11 | | #endif // CROARING_COMPILER_SUPPORTS_AVX512 |
12 | | #endif |
13 | | #if defined(__GNUC__) && !defined(__clang__) |
14 | | #pragma GCC diagnostic push |
15 | | #pragma GCC diagnostic ignored "-Wuninitialized" |
16 | | #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" |
17 | | #endif |
18 | | #ifdef __cplusplus |
19 | | extern "C" { |
20 | | namespace roaring { |
21 | | namespace internal { |
22 | | #endif |
23 | | |
24 | | extern inline uint16_t run_container_minimum(const run_container_t *run); |
25 | | extern inline uint16_t run_container_maximum(const run_container_t *run); |
26 | | extern inline int32_t interleavedBinarySearch(const rle16_t *array, |
27 | | int32_t lenarray, uint16_t ikey); |
28 | | extern inline bool run_container_contains(const run_container_t *run, |
29 | | uint16_t pos); |
30 | | extern inline int run_container_index_equalorlarger(const run_container_t *arr, |
31 | | uint16_t x); |
32 | | extern inline bool run_container_is_full(const run_container_t *run); |
33 | | extern inline bool run_container_nonzero_cardinality(const run_container_t *rc); |
34 | | extern inline int32_t run_container_serialized_size_in_bytes(int32_t num_runs); |
35 | | extern inline run_container_t *run_container_create_range(uint32_t start, |
36 | | uint32_t stop); |
37 | | extern inline int run_container_cardinality(const run_container_t *run); |
38 | | |
39 | 0 | bool run_container_add(run_container_t *run, uint16_t pos) { |
40 | 0 | int32_t index = interleavedBinarySearch(run->runs, run->n_runs, pos); |
41 | 0 | if (index >= 0) return false; // already there |
42 | 0 | index = -index - 2; // points to preceding value, possibly -1 |
43 | 0 | if (index >= 0) { // possible match |
44 | 0 | int32_t offset = pos - run->runs[index].value; |
45 | 0 | int32_t le = run->runs[index].length; |
46 | 0 | if (offset <= le) return false; // already there |
47 | 0 | if (offset == le + 1) { |
48 | | // we may need to fuse |
49 | 0 | if (index + 1 < run->n_runs) { |
50 | 0 | if (run->runs[index + 1].value == pos + 1) { |
51 | | // indeed fusion is needed |
52 | 0 | run->runs[index].length = run->runs[index + 1].value + |
53 | 0 | run->runs[index + 1].length - |
54 | 0 | run->runs[index].value; |
55 | 0 | recoverRoomAtIndex(run, (uint16_t)(index + 1)); |
56 | 0 | return true; |
57 | 0 | } |
58 | 0 | } |
59 | 0 | run->runs[index].length++; |
60 | 0 | return true; |
61 | 0 | } |
62 | 0 | if (index + 1 < run->n_runs) { |
63 | | // we may need to fuse |
64 | 0 | if (run->runs[index + 1].value == pos + 1) { |
65 | | // indeed fusion is needed |
66 | 0 | run->runs[index + 1].value = pos; |
67 | 0 | run->runs[index + 1].length = run->runs[index + 1].length + 1; |
68 | 0 | return true; |
69 | 0 | } |
70 | 0 | } |
71 | 0 | } |
72 | 0 | if (index == -1) { |
73 | | // we may need to extend the first run |
74 | 0 | if (0 < run->n_runs) { |
75 | 0 | if (run->runs[0].value == pos + 1) { |
76 | 0 | run->runs[0].length++; |
77 | 0 | run->runs[0].value--; |
78 | 0 | return true; |
79 | 0 | } |
80 | 0 | } |
81 | 0 | } |
82 | 0 | makeRoomAtIndex(run, (uint16_t)(index + 1)); |
83 | 0 | run->runs[index + 1].value = pos; |
84 | 0 | run->runs[index + 1].length = 0; |
85 | 0 | return true; |
86 | 0 | } |
87 | | |
88 | | /* Create a new run container. Return NULL in case of failure. */ |
89 | 0 | run_container_t *run_container_create_given_capacity(int32_t size) { |
90 | 0 | run_container_t *run; |
91 | | /* Allocate the run container itself. */ |
92 | 0 | if ((run = (run_container_t *)roaring_malloc(sizeof(run_container_t))) == |
93 | 0 | NULL) { |
94 | 0 | return NULL; |
95 | 0 | } |
96 | 0 | if (size <= 0) { // we don't want to rely on malloc(0) |
97 | 0 | run->runs = NULL; |
98 | 0 | } else if ((run->runs = (rle16_t *)roaring_malloc(sizeof(rle16_t) * |
99 | 0 | size)) == NULL) { |
100 | 0 | roaring_free(run); |
101 | 0 | return NULL; |
102 | 0 | } |
103 | 0 | run->capacity = size; |
104 | 0 | run->n_runs = 0; |
105 | 0 | return run; |
106 | 0 | } |
107 | | |
108 | 0 | int run_container_shrink_to_fit(run_container_t *src) { |
109 | 0 | if (src->n_runs == src->capacity) return 0; // nothing to do |
110 | 0 | int savings = src->capacity - src->n_runs; |
111 | 0 | src->capacity = src->n_runs; |
112 | 0 | rle16_t *oldruns = src->runs; |
113 | 0 | src->runs = |
114 | 0 | (rle16_t *)roaring_realloc(oldruns, src->capacity * sizeof(rle16_t)); |
115 | 0 | if (src->runs == NULL) roaring_free(oldruns); // should never happen? |
116 | 0 | return savings; |
117 | 0 | } |
118 | | /* Create a new run container. Return NULL in case of failure. */ |
119 | 0 | run_container_t *run_container_create(void) { |
120 | 0 | return run_container_create_given_capacity(RUN_DEFAULT_INIT_SIZE); |
121 | 0 | } |
122 | | |
123 | | CROARING_ALLOW_UNALIGNED |
124 | 0 | run_container_t *run_container_clone(const run_container_t *src) { |
125 | 0 | run_container_t *run = run_container_create_given_capacity(src->capacity); |
126 | 0 | if (run == NULL) return NULL; |
127 | 0 | run->capacity = src->capacity; |
128 | 0 | run->n_runs = src->n_runs; |
129 | 0 | memcpy(run->runs, src->runs, src->n_runs * sizeof(rle16_t)); |
130 | 0 | return run; |
131 | 0 | } |
132 | | |
133 | | void run_container_offset(const run_container_t *c, container_t **loc, |
134 | 0 | container_t **hic, uint16_t offset) { |
135 | 0 | run_container_t *lo = NULL, *hi = NULL; |
136 | |
|
137 | 0 | bool split; |
138 | 0 | unsigned int lo_cap, hi_cap; |
139 | 0 | int top, pivot; |
140 | |
|
141 | 0 | top = (1 << 16) - offset; |
142 | 0 | pivot = run_container_index_equalorlarger(c, top); |
143 | | // pivot is the index of the first run that is >= top or -1 if no such run |
144 | |
|
145 | 0 | if (pivot >= 0) { |
146 | 0 | split = c->runs[pivot].value < top; |
147 | 0 | lo_cap = pivot + (split ? 1 : 0); |
148 | 0 | hi_cap = c->n_runs - pivot; |
149 | 0 | } else { |
150 | | // here pivot < 0 |
151 | 0 | split = false; |
152 | 0 | lo_cap = c->n_runs; |
153 | 0 | hi_cap = 0; |
154 | 0 | } |
155 | |
|
156 | 0 | if (loc && lo_cap) { |
157 | 0 | lo = run_container_create_given_capacity(lo_cap); |
158 | 0 | memcpy(lo->runs, c->runs, lo_cap * sizeof(rle16_t)); |
159 | 0 | lo->n_runs = lo_cap; |
160 | 0 | for (unsigned int i = 0; i < lo_cap; ++i) { |
161 | 0 | lo->runs[i].value += offset; |
162 | 0 | } |
163 | 0 | *loc = (container_t *)lo; |
164 | 0 | } |
165 | |
|
166 | 0 | if (hic && hi_cap) { |
167 | 0 | hi = run_container_create_given_capacity(hi_cap); |
168 | 0 | memcpy(hi->runs, c->runs + pivot, hi_cap * sizeof(rle16_t)); |
169 | 0 | hi->n_runs = hi_cap; |
170 | 0 | for (unsigned int i = 0; i < hi_cap; ++i) { |
171 | 0 | hi->runs[i].value += offset; |
172 | 0 | } |
173 | 0 | *hic = (container_t *)hi; |
174 | 0 | } |
175 | | |
176 | | // Fix the split. |
177 | 0 | if (split) { |
178 | 0 | if (lo != NULL) { |
179 | | // Add the missing run to 'lo', exhausting length. |
180 | 0 | lo->runs[lo->n_runs - 1].length = |
181 | 0 | (1 << 16) - lo->runs[lo->n_runs - 1].value - 1; |
182 | 0 | } |
183 | |
|
184 | 0 | if (hi != NULL) { |
185 | | // Fix the first run in 'hi'. |
186 | 0 | hi->runs[0].length -= UINT16_MAX - hi->runs[0].value + 1; |
187 | 0 | hi->runs[0].value = 0; |
188 | 0 | } |
189 | 0 | } |
190 | 0 | } |
191 | | |
192 | | /* Free memory. */ |
193 | 0 | void run_container_free(run_container_t *run) { |
194 | 0 | if (run == NULL) return; |
195 | 0 | roaring_free(run->runs); |
196 | 0 | roaring_free(run); |
197 | 0 | } |
198 | | |
199 | 0 | void run_container_grow(run_container_t *run, int32_t min, bool copy) { |
200 | 0 | int32_t newCapacity = (run->capacity == 0) ? RUN_DEFAULT_INIT_SIZE |
201 | 0 | : run->capacity < 64 ? run->capacity * 2 |
202 | 0 | : run->capacity < 1024 ? run->capacity * 3 / 2 |
203 | 0 | : run->capacity * 5 / 4; |
204 | 0 | if (newCapacity < min) newCapacity = min; |
205 | 0 | run->capacity = newCapacity; |
206 | 0 | assert(run->capacity >= min); |
207 | 0 | if (copy) { |
208 | 0 | rle16_t *oldruns = run->runs; |
209 | 0 | run->runs = (rle16_t *)roaring_realloc(oldruns, |
210 | 0 | run->capacity * sizeof(rle16_t)); |
211 | 0 | if (run->runs == NULL) roaring_free(oldruns); |
212 | 0 | } else { |
213 | 0 | roaring_free(run->runs); |
214 | 0 | run->runs = (rle16_t *)roaring_malloc(run->capacity * sizeof(rle16_t)); |
215 | 0 | } |
216 | | // We may have run->runs == NULL. |
217 | 0 | } |
218 | | |
219 | | /* copy one container into another */ |
220 | 0 | void run_container_copy(const run_container_t *src, run_container_t *dst) { |
221 | 0 | const int32_t n_runs = src->n_runs; |
222 | 0 | if (src->n_runs > dst->capacity) { |
223 | 0 | run_container_grow(dst, n_runs, false); |
224 | 0 | } |
225 | 0 | dst->n_runs = n_runs; |
226 | 0 | memcpy(dst->runs, src->runs, sizeof(rle16_t) * n_runs); |
227 | 0 | } |
228 | | |
229 | | /* Compute the union of `src_1' and `src_2' and write the result to `dst' |
230 | | * It is assumed that `dst' is distinct from both `src_1' and `src_2'. */ |
231 | | void run_container_union(const run_container_t *src_1, |
232 | 0 | const run_container_t *src_2, run_container_t *dst) { |
233 | | // TODO: this could be a lot more efficient |
234 | | |
235 | | // we start out with inexpensive checks |
236 | 0 | const bool if1 = run_container_is_full(src_1); |
237 | 0 | const bool if2 = run_container_is_full(src_2); |
238 | 0 | if (if1 || if2) { |
239 | 0 | if (if1) { |
240 | 0 | run_container_copy(src_1, dst); |
241 | 0 | return; |
242 | 0 | } |
243 | 0 | if (if2) { |
244 | 0 | run_container_copy(src_2, dst); |
245 | 0 | return; |
246 | 0 | } |
247 | 0 | } |
248 | 0 | const int32_t neededcapacity = src_1->n_runs + src_2->n_runs; |
249 | 0 | if (dst->capacity < neededcapacity) |
250 | 0 | run_container_grow(dst, neededcapacity, false); |
251 | 0 | dst->n_runs = 0; |
252 | 0 | int32_t rlepos = 0; |
253 | 0 | int32_t xrlepos = 0; |
254 | |
|
255 | 0 | rle16_t previousrle; |
256 | 0 | if (src_1->runs[rlepos].value <= src_2->runs[xrlepos].value) { |
257 | 0 | previousrle = run_container_append_first(dst, src_1->runs[rlepos]); |
258 | 0 | rlepos++; |
259 | 0 | } else { |
260 | 0 | previousrle = run_container_append_first(dst, src_2->runs[xrlepos]); |
261 | 0 | xrlepos++; |
262 | 0 | } |
263 | |
|
264 | 0 | while ((xrlepos < src_2->n_runs) && (rlepos < src_1->n_runs)) { |
265 | 0 | rle16_t newrl; |
266 | 0 | if (src_1->runs[rlepos].value <= src_2->runs[xrlepos].value) { |
267 | 0 | newrl = src_1->runs[rlepos]; |
268 | 0 | rlepos++; |
269 | 0 | } else { |
270 | 0 | newrl = src_2->runs[xrlepos]; |
271 | 0 | xrlepos++; |
272 | 0 | } |
273 | 0 | run_container_append(dst, newrl, &previousrle); |
274 | 0 | } |
275 | 0 | while (xrlepos < src_2->n_runs) { |
276 | 0 | run_container_append(dst, src_2->runs[xrlepos], &previousrle); |
277 | 0 | xrlepos++; |
278 | 0 | } |
279 | 0 | while (rlepos < src_1->n_runs) { |
280 | 0 | run_container_append(dst, src_1->runs[rlepos], &previousrle); |
281 | 0 | rlepos++; |
282 | 0 | } |
283 | 0 | } |
284 | | |
285 | | /* Compute the union of `src_1' and `src_2' and write the result to `src_1' |
286 | | */ |
287 | | void run_container_union_inplace(run_container_t *src_1, |
288 | 0 | const run_container_t *src_2) { |
289 | | // TODO: this could be a lot more efficient |
290 | | |
291 | | // we start out with inexpensive checks |
292 | 0 | const bool if1 = run_container_is_full(src_1); |
293 | 0 | const bool if2 = run_container_is_full(src_2); |
294 | 0 | if (if1 || if2) { |
295 | 0 | if (if1) { |
296 | 0 | return; |
297 | 0 | } |
298 | 0 | if (if2) { |
299 | 0 | run_container_copy(src_2, src_1); |
300 | 0 | return; |
301 | 0 | } |
302 | 0 | } |
303 | | // we move the data to the end of the current array |
304 | 0 | const int32_t maxoutput = src_1->n_runs + src_2->n_runs; |
305 | 0 | const int32_t neededcapacity = maxoutput + src_1->n_runs; |
306 | 0 | if (src_1->capacity < neededcapacity) |
307 | 0 | run_container_grow(src_1, neededcapacity, true); |
308 | 0 | memmove(src_1->runs + maxoutput, src_1->runs, |
309 | 0 | src_1->n_runs * sizeof(rle16_t)); |
310 | 0 | rle16_t *inputsrc1 = src_1->runs + maxoutput; |
311 | 0 | const int32_t input1nruns = src_1->n_runs; |
312 | 0 | src_1->n_runs = 0; |
313 | 0 | int32_t rlepos = 0; |
314 | 0 | int32_t xrlepos = 0; |
315 | |
|
316 | 0 | rle16_t previousrle; |
317 | 0 | if (inputsrc1[rlepos].value <= src_2->runs[xrlepos].value) { |
318 | 0 | previousrle = run_container_append_first(src_1, inputsrc1[rlepos]); |
319 | 0 | rlepos++; |
320 | 0 | } else { |
321 | 0 | previousrle = run_container_append_first(src_1, src_2->runs[xrlepos]); |
322 | 0 | xrlepos++; |
323 | 0 | } |
324 | 0 | while ((xrlepos < src_2->n_runs) && (rlepos < input1nruns)) { |
325 | 0 | rle16_t newrl; |
326 | 0 | if (inputsrc1[rlepos].value <= src_2->runs[xrlepos].value) { |
327 | 0 | newrl = inputsrc1[rlepos]; |
328 | 0 | rlepos++; |
329 | 0 | } else { |
330 | 0 | newrl = src_2->runs[xrlepos]; |
331 | 0 | xrlepos++; |
332 | 0 | } |
333 | 0 | run_container_append(src_1, newrl, &previousrle); |
334 | 0 | } |
335 | 0 | while (xrlepos < src_2->n_runs) { |
336 | 0 | run_container_append(src_1, src_2->runs[xrlepos], &previousrle); |
337 | 0 | xrlepos++; |
338 | 0 | } |
339 | 0 | while (rlepos < input1nruns) { |
340 | 0 | run_container_append(src_1, inputsrc1[rlepos], &previousrle); |
341 | 0 | rlepos++; |
342 | 0 | } |
343 | 0 | } |
344 | | |
345 | | /* Compute the symmetric difference of `src_1' and `src_2' and write the result |
346 | | * to `dst' |
347 | | * It is assumed that `dst' is distinct from both `src_1' and `src_2'. */ |
348 | | void run_container_xor(const run_container_t *src_1, |
349 | 0 | const run_container_t *src_2, run_container_t *dst) { |
350 | | // don't bother to convert xor with full range into negation |
351 | | // since negation is implemented similarly |
352 | |
|
353 | 0 | const int32_t neededcapacity = src_1->n_runs + src_2->n_runs; |
354 | 0 | if (dst->capacity < neededcapacity) |
355 | 0 | run_container_grow(dst, neededcapacity, false); |
356 | |
|
357 | 0 | int32_t pos1 = 0; |
358 | 0 | int32_t pos2 = 0; |
359 | 0 | dst->n_runs = 0; |
360 | |
|
361 | 0 | while ((pos1 < src_1->n_runs) && (pos2 < src_2->n_runs)) { |
362 | 0 | if (src_1->runs[pos1].value <= src_2->runs[pos2].value) { |
363 | 0 | run_container_smart_append_exclusive(dst, src_1->runs[pos1].value, |
364 | 0 | src_1->runs[pos1].length); |
365 | 0 | pos1++; |
366 | 0 | } else { |
367 | 0 | run_container_smart_append_exclusive(dst, src_2->runs[pos2].value, |
368 | 0 | src_2->runs[pos2].length); |
369 | 0 | pos2++; |
370 | 0 | } |
371 | 0 | } |
372 | 0 | while (pos1 < src_1->n_runs) { |
373 | 0 | run_container_smart_append_exclusive(dst, src_1->runs[pos1].value, |
374 | 0 | src_1->runs[pos1].length); |
375 | 0 | pos1++; |
376 | 0 | } |
377 | |
|
378 | 0 | while (pos2 < src_2->n_runs) { |
379 | 0 | run_container_smart_append_exclusive(dst, src_2->runs[pos2].value, |
380 | 0 | src_2->runs[pos2].length); |
381 | 0 | pos2++; |
382 | 0 | } |
383 | 0 | } |
384 | | |
385 | | /* Compute the intersection of src_1 and src_2 and write the result to |
386 | | * dst. It is assumed that dst is distinct from both src_1 and src_2. */ |
387 | | void run_container_intersection(const run_container_t *src_1, |
388 | | const run_container_t *src_2, |
389 | 0 | run_container_t *dst) { |
390 | 0 | const bool if1 = run_container_is_full(src_1); |
391 | 0 | const bool if2 = run_container_is_full(src_2); |
392 | 0 | if (if1 || if2) { |
393 | 0 | if (if1) { |
394 | 0 | run_container_copy(src_2, dst); |
395 | 0 | return; |
396 | 0 | } |
397 | 0 | if (if2) { |
398 | 0 | run_container_copy(src_1, dst); |
399 | 0 | return; |
400 | 0 | } |
401 | 0 | } |
402 | | // TODO: this could be a lot more efficient, could use SIMD optimizations |
403 | 0 | const int32_t neededcapacity = src_1->n_runs + src_2->n_runs; |
404 | 0 | if (dst->capacity < neededcapacity) |
405 | 0 | run_container_grow(dst, neededcapacity, false); |
406 | 0 | dst->n_runs = 0; |
407 | 0 | int32_t rlepos = 0; |
408 | 0 | int32_t xrlepos = 0; |
409 | 0 | int32_t start = src_1->runs[rlepos].value; |
410 | 0 | int32_t end = start + src_1->runs[rlepos].length + 1; |
411 | 0 | int32_t xstart = src_2->runs[xrlepos].value; |
412 | 0 | int32_t xend = xstart + src_2->runs[xrlepos].length + 1; |
413 | 0 | while ((rlepos < src_1->n_runs) && (xrlepos < src_2->n_runs)) { |
414 | 0 | if (end <= xstart) { |
415 | 0 | ++rlepos; |
416 | 0 | if (rlepos < src_1->n_runs) { |
417 | 0 | start = src_1->runs[rlepos].value; |
418 | 0 | end = start + src_1->runs[rlepos].length + 1; |
419 | 0 | } |
420 | 0 | } else if (xend <= start) { |
421 | 0 | ++xrlepos; |
422 | 0 | if (xrlepos < src_2->n_runs) { |
423 | 0 | xstart = src_2->runs[xrlepos].value; |
424 | 0 | xend = xstart + src_2->runs[xrlepos].length + 1; |
425 | 0 | } |
426 | 0 | } else { // they overlap |
427 | 0 | const int32_t lateststart = start > xstart ? start : xstart; |
428 | 0 | int32_t earliestend; |
429 | 0 | if (end == xend) { // improbable |
430 | 0 | earliestend = end; |
431 | 0 | rlepos++; |
432 | 0 | xrlepos++; |
433 | 0 | if (rlepos < src_1->n_runs) { |
434 | 0 | start = src_1->runs[rlepos].value; |
435 | 0 | end = start + src_1->runs[rlepos].length + 1; |
436 | 0 | } |
437 | 0 | if (xrlepos < src_2->n_runs) { |
438 | 0 | xstart = src_2->runs[xrlepos].value; |
439 | 0 | xend = xstart + src_2->runs[xrlepos].length + 1; |
440 | 0 | } |
441 | 0 | } else if (end < xend) { |
442 | 0 | earliestend = end; |
443 | 0 | rlepos++; |
444 | 0 | if (rlepos < src_1->n_runs) { |
445 | 0 | start = src_1->runs[rlepos].value; |
446 | 0 | end = start + src_1->runs[rlepos].length + 1; |
447 | 0 | } |
448 | |
|
449 | 0 | } else { // end > xend |
450 | 0 | earliestend = xend; |
451 | 0 | xrlepos++; |
452 | 0 | if (xrlepos < src_2->n_runs) { |
453 | 0 | xstart = src_2->runs[xrlepos].value; |
454 | 0 | xend = xstart + src_2->runs[xrlepos].length + 1; |
455 | 0 | } |
456 | 0 | } |
457 | 0 | dst->runs[dst->n_runs].value = (uint16_t)lateststart; |
458 | 0 | dst->runs[dst->n_runs].length = |
459 | 0 | (uint16_t)(earliestend - lateststart - 1); |
460 | 0 | dst->n_runs++; |
461 | 0 | } |
462 | 0 | } |
463 | 0 | } |
464 | | |
465 | | /* Compute the size of the intersection of src_1 and src_2 . */ |
466 | | int run_container_intersection_cardinality(const run_container_t *src_1, |
467 | 0 | const run_container_t *src_2) { |
468 | 0 | const bool if1 = run_container_is_full(src_1); |
469 | 0 | const bool if2 = run_container_is_full(src_2); |
470 | 0 | if (if1 || if2) { |
471 | 0 | if (if1) { |
472 | 0 | return run_container_cardinality(src_2); |
473 | 0 | } |
474 | 0 | if (if2) { |
475 | 0 | return run_container_cardinality(src_1); |
476 | 0 | } |
477 | 0 | } |
478 | 0 | int answer = 0; |
479 | 0 | int32_t rlepos = 0; |
480 | 0 | int32_t xrlepos = 0; |
481 | 0 | int32_t start = src_1->runs[rlepos].value; |
482 | 0 | int32_t end = start + src_1->runs[rlepos].length + 1; |
483 | 0 | int32_t xstart = src_2->runs[xrlepos].value; |
484 | 0 | int32_t xend = xstart + src_2->runs[xrlepos].length + 1; |
485 | 0 | while ((rlepos < src_1->n_runs) && (xrlepos < src_2->n_runs)) { |
486 | 0 | if (end <= xstart) { |
487 | 0 | ++rlepos; |
488 | 0 | if (rlepos < src_1->n_runs) { |
489 | 0 | start = src_1->runs[rlepos].value; |
490 | 0 | end = start + src_1->runs[rlepos].length + 1; |
491 | 0 | } |
492 | 0 | } else if (xend <= start) { |
493 | 0 | ++xrlepos; |
494 | 0 | if (xrlepos < src_2->n_runs) { |
495 | 0 | xstart = src_2->runs[xrlepos].value; |
496 | 0 | xend = xstart + src_2->runs[xrlepos].length + 1; |
497 | 0 | } |
498 | 0 | } else { // they overlap |
499 | 0 | const int32_t lateststart = start > xstart ? start : xstart; |
500 | 0 | int32_t earliestend; |
501 | 0 | if (end == xend) { // improbable |
502 | 0 | earliestend = end; |
503 | 0 | rlepos++; |
504 | 0 | xrlepos++; |
505 | 0 | if (rlepos < src_1->n_runs) { |
506 | 0 | start = src_1->runs[rlepos].value; |
507 | 0 | end = start + src_1->runs[rlepos].length + 1; |
508 | 0 | } |
509 | 0 | if (xrlepos < src_2->n_runs) { |
510 | 0 | xstart = src_2->runs[xrlepos].value; |
511 | 0 | xend = xstart + src_2->runs[xrlepos].length + 1; |
512 | 0 | } |
513 | 0 | } else if (end < xend) { |
514 | 0 | earliestend = end; |
515 | 0 | rlepos++; |
516 | 0 | if (rlepos < src_1->n_runs) { |
517 | 0 | start = src_1->runs[rlepos].value; |
518 | 0 | end = start + src_1->runs[rlepos].length + 1; |
519 | 0 | } |
520 | |
|
521 | 0 | } else { // end > xend |
522 | 0 | earliestend = xend; |
523 | 0 | xrlepos++; |
524 | 0 | if (xrlepos < src_2->n_runs) { |
525 | 0 | xstart = src_2->runs[xrlepos].value; |
526 | 0 | xend = xstart + src_2->runs[xrlepos].length + 1; |
527 | 0 | } |
528 | 0 | } |
529 | 0 | answer += earliestend - lateststart; |
530 | 0 | } |
531 | 0 | } |
532 | 0 | return answer; |
533 | 0 | } |
534 | | |
535 | | bool run_container_intersect(const run_container_t *src_1, |
536 | 0 | const run_container_t *src_2) { |
537 | 0 | const bool if1 = run_container_is_full(src_1); |
538 | 0 | const bool if2 = run_container_is_full(src_2); |
539 | 0 | if (if1 || if2) { |
540 | 0 | if (if1) { |
541 | 0 | return !run_container_empty(src_2); |
542 | 0 | } |
543 | 0 | if (if2) { |
544 | 0 | return !run_container_empty(src_1); |
545 | 0 | } |
546 | 0 | } |
547 | 0 | int32_t rlepos = 0; |
548 | 0 | int32_t xrlepos = 0; |
549 | 0 | int32_t start = src_1->runs[rlepos].value; |
550 | 0 | int32_t end = start + src_1->runs[rlepos].length + 1; |
551 | 0 | int32_t xstart = src_2->runs[xrlepos].value; |
552 | 0 | int32_t xend = xstart + src_2->runs[xrlepos].length + 1; |
553 | 0 | while ((rlepos < src_1->n_runs) && (xrlepos < src_2->n_runs)) { |
554 | 0 | if (end <= xstart) { |
555 | 0 | ++rlepos; |
556 | 0 | if (rlepos < src_1->n_runs) { |
557 | 0 | start = src_1->runs[rlepos].value; |
558 | 0 | end = start + src_1->runs[rlepos].length + 1; |
559 | 0 | } |
560 | 0 | } else if (xend <= start) { |
561 | 0 | ++xrlepos; |
562 | 0 | if (xrlepos < src_2->n_runs) { |
563 | 0 | xstart = src_2->runs[xrlepos].value; |
564 | 0 | xend = xstart + src_2->runs[xrlepos].length + 1; |
565 | 0 | } |
566 | 0 | } else { // they overlap |
567 | 0 | return true; |
568 | 0 | } |
569 | 0 | } |
570 | 0 | return false; |
571 | 0 | } |
572 | | |
573 | | /* Compute the difference of src_1 and src_2 and write the result to |
574 | | * dst. It is assumed that dst is distinct from both src_1 and src_2. */ |
575 | | void run_container_andnot(const run_container_t *src_1, |
576 | 0 | const run_container_t *src_2, run_container_t *dst) { |
577 | | // following Java implementation as of June 2016 |
578 | |
|
579 | 0 | if (dst->capacity < src_1->n_runs + src_2->n_runs) |
580 | 0 | run_container_grow(dst, src_1->n_runs + src_2->n_runs, false); |
581 | |
|
582 | 0 | dst->n_runs = 0; |
583 | |
|
584 | 0 | int rlepos1 = 0; |
585 | 0 | int rlepos2 = 0; |
586 | 0 | int32_t start = src_1->runs[rlepos1].value; |
587 | 0 | int32_t end = start + src_1->runs[rlepos1].length + 1; |
588 | 0 | int32_t start2 = src_2->runs[rlepos2].value; |
589 | 0 | int32_t end2 = start2 + src_2->runs[rlepos2].length + 1; |
590 | |
|
591 | 0 | while ((rlepos1 < src_1->n_runs) && (rlepos2 < src_2->n_runs)) { |
592 | 0 | if (end <= start2) { |
593 | | // output the first run |
594 | 0 | dst->runs[dst->n_runs++] = |
595 | 0 | CROARING_MAKE_RLE16(start, end - start - 1); |
596 | 0 | rlepos1++; |
597 | 0 | if (rlepos1 < src_1->n_runs) { |
598 | 0 | start = src_1->runs[rlepos1].value; |
599 | 0 | end = start + src_1->runs[rlepos1].length + 1; |
600 | 0 | } |
601 | 0 | } else if (end2 <= start) { |
602 | | // exit the second run |
603 | 0 | rlepos2++; |
604 | 0 | if (rlepos2 < src_2->n_runs) { |
605 | 0 | start2 = src_2->runs[rlepos2].value; |
606 | 0 | end2 = start2 + src_2->runs[rlepos2].length + 1; |
607 | 0 | } |
608 | 0 | } else { |
609 | 0 | if (start < start2) { |
610 | 0 | dst->runs[dst->n_runs++] = |
611 | 0 | CROARING_MAKE_RLE16(start, start2 - start - 1); |
612 | 0 | } |
613 | 0 | if (end2 < end) { |
614 | 0 | start = end2; |
615 | 0 | } else { |
616 | 0 | rlepos1++; |
617 | 0 | if (rlepos1 < src_1->n_runs) { |
618 | 0 | start = src_1->runs[rlepos1].value; |
619 | 0 | end = start + src_1->runs[rlepos1].length + 1; |
620 | 0 | } |
621 | 0 | } |
622 | 0 | } |
623 | 0 | } |
624 | 0 | if (rlepos1 < src_1->n_runs) { |
625 | 0 | dst->runs[dst->n_runs++] = CROARING_MAKE_RLE16(start, end - start - 1); |
626 | 0 | rlepos1++; |
627 | 0 | if (rlepos1 < src_1->n_runs) { |
628 | 0 | memcpy(dst->runs + dst->n_runs, src_1->runs + rlepos1, |
629 | 0 | sizeof(rle16_t) * (src_1->n_runs - rlepos1)); |
630 | 0 | dst->n_runs += src_1->n_runs - rlepos1; |
631 | 0 | } |
632 | 0 | } |
633 | 0 | } |
634 | | |
635 | | /* |
636 | | * Print this container using printf (useful for debugging). |
637 | | */ |
638 | 0 | void run_container_printf(const run_container_t *cont) { |
639 | 0 | for (int i = 0; i < cont->n_runs; ++i) { |
640 | 0 | uint16_t run_start = cont->runs[i].value; |
641 | 0 | uint16_t le = cont->runs[i].length; |
642 | 0 | printf("[%d,%d]", run_start, run_start + le); |
643 | 0 | } |
644 | 0 | } |
645 | | |
646 | | /* |
647 | | * Print this container using printf as a comma-separated list of 32-bit |
648 | | * integers starting at base. |
649 | | */ |
650 | | void run_container_printf_as_uint32_array(const run_container_t *cont, |
651 | 0 | uint32_t base) { |
652 | 0 | if (cont->n_runs == 0) return; |
653 | 0 | { |
654 | 0 | uint32_t run_start = base + cont->runs[0].value; |
655 | 0 | uint16_t le = cont->runs[0].length; |
656 | 0 | printf("%u", run_start); |
657 | 0 | for (uint32_t j = 1; j <= le; ++j) printf(",%u", run_start + j); |
658 | 0 | } |
659 | 0 | for (int32_t i = 1; i < cont->n_runs; ++i) { |
660 | 0 | uint32_t run_start = base + cont->runs[i].value; |
661 | 0 | uint16_t le = cont->runs[i].length; |
662 | 0 | for (uint32_t j = 0; j <= le; ++j) printf(",%u", run_start + j); |
663 | 0 | } |
664 | 0 | } |
665 | | |
666 | | /* |
667 | | * Validate the container. Returns true if valid. |
668 | | */ |
669 | 0 | bool run_container_validate(const run_container_t *run, const char **reason) { |
670 | 0 | if (run->n_runs < 0) { |
671 | 0 | *reason = "negative run count"; |
672 | 0 | return false; |
673 | 0 | } |
674 | 0 | if (run->capacity < 0) { |
675 | 0 | *reason = "negative run capacity"; |
676 | 0 | return false; |
677 | 0 | } |
678 | 0 | if (run->capacity < run->n_runs) { |
679 | 0 | *reason = "capacity less than run count"; |
680 | 0 | return false; |
681 | 0 | } |
682 | | |
683 | 0 | if (run->n_runs == 0) { |
684 | 0 | *reason = "zero run count"; |
685 | 0 | return false; |
686 | 0 | } |
687 | 0 | if (run->runs == NULL) { |
688 | 0 | *reason = "NULL runs"; |
689 | 0 | return false; |
690 | 0 | } |
691 | | |
692 | | // Use uint32_t to avoid overflow issues on ranges that contain UINT16_MAX. |
693 | 0 | uint32_t last_end = 0; |
694 | 0 | for (int i = 0; i < run->n_runs; ++i) { |
695 | 0 | uint32_t start = run->runs[i].value; |
696 | 0 | uint32_t end = start + run->runs[i].length + 1; |
697 | 0 | if (end <= start) { |
698 | 0 | *reason = "run start + length overflow"; |
699 | 0 | return false; |
700 | 0 | } |
701 | 0 | if (end > (1 << 16)) { |
702 | 0 | *reason = "run start + length too large"; |
703 | 0 | return false; |
704 | 0 | } |
705 | 0 | if (start < last_end) { |
706 | 0 | *reason = "run start less than last end"; |
707 | 0 | return false; |
708 | 0 | } |
709 | 0 | if (start == last_end && last_end != 0) { |
710 | 0 | *reason = "run start equal to last end, should have combined"; |
711 | 0 | return false; |
712 | 0 | } |
713 | 0 | last_end = end; |
714 | 0 | } |
715 | 0 | return true; |
716 | 0 | } |
717 | | |
718 | 0 | int32_t run_container_write(const run_container_t *container, char *buf) { |
719 | 0 | uint16_t cast_16 = container->n_runs; |
720 | 0 | uint16_t n_runs_le = croaring_htole16(cast_16); |
721 | 0 | memcpy(buf, &n_runs_le, sizeof(uint16_t)); |
722 | | #if CROARING_IS_BIG_ENDIAN |
723 | | char *out = buf + sizeof(uint16_t); |
724 | | for (int32_t i = 0; i < container->n_runs; ++i) { |
725 | | uint16_t v_le = croaring_htole16(container->runs[i].value); |
726 | | uint16_t l_le = croaring_htole16(container->runs[i].length); |
727 | | memcpy(out, &v_le, sizeof(uint16_t)); |
728 | | memcpy(out + sizeof(uint16_t), &l_le, sizeof(uint16_t)); |
729 | | out += sizeof(rle16_t); |
730 | | } |
731 | | #else |
732 | 0 | memcpy(buf + sizeof(uint16_t), container->runs, |
733 | 0 | container->n_runs * sizeof(rle16_t)); |
734 | 0 | #endif |
735 | 0 | return run_container_size_in_bytes(container); |
736 | 0 | } |
737 | | |
738 | | int32_t run_container_read(int32_t cardinality, run_container_t *container, |
739 | 0 | const char *buf) { |
740 | 0 | (void)cardinality; |
741 | 0 | uint16_t cast_16; |
742 | 0 | memcpy(&cast_16, buf, sizeof(uint16_t)); |
743 | 0 | container->n_runs = croaring_letoh16(cast_16); |
744 | 0 | if (container->n_runs > container->capacity) |
745 | 0 | run_container_grow(container, container->n_runs, false); |
746 | 0 | if (container->n_runs > 0) { |
747 | | #if CROARING_IS_BIG_ENDIAN |
748 | | const char *in = buf + sizeof(uint16_t); |
749 | | for (int32_t i = 0; i < container->n_runs; ++i) { |
750 | | uint16_t v_le, l_le; |
751 | | memcpy(&v_le, in, sizeof(uint16_t)); |
752 | | memcpy(&l_le, in + sizeof(uint16_t), sizeof(uint16_t)); |
753 | | container->runs[i].value = croaring_letoh16(v_le); |
754 | | container->runs[i].length = croaring_letoh16(l_le); |
755 | | in += sizeof(rle16_t); |
756 | | } |
757 | | #else |
758 | 0 | memcpy(container->runs, buf + sizeof(uint16_t), |
759 | 0 | container->n_runs * sizeof(rle16_t)); |
760 | 0 | #endif |
761 | 0 | } |
762 | 0 | return run_container_size_in_bytes(container); |
763 | 0 | } |
764 | | |
765 | | bool run_container_iterate(const run_container_t *cont, uint32_t base, |
766 | 0 | roaring_iterator iterator, void *ptr) { |
767 | 0 | for (int i = 0; i < cont->n_runs; ++i) { |
768 | 0 | uint32_t run_start = base + cont->runs[i].value; |
769 | 0 | uint16_t le = cont->runs[i].length; |
770 | |
|
771 | 0 | for (int j = 0; j <= le; ++j) |
772 | 0 | if (!iterator(run_start + j, ptr)) return false; |
773 | 0 | } |
774 | 0 | return true; |
775 | 0 | } |
776 | | |
777 | | bool run_container_iterate64(const run_container_t *cont, uint32_t base, |
778 | | roaring_iterator64 iterator, uint64_t high_bits, |
779 | 0 | void *ptr) { |
780 | 0 | for (int i = 0; i < cont->n_runs; ++i) { |
781 | 0 | uint32_t run_start = base + cont->runs[i].value; |
782 | 0 | uint16_t le = cont->runs[i].length; |
783 | |
|
784 | 0 | for (int j = 0; j <= le; ++j) |
785 | 0 | if (!iterator(high_bits | (uint64_t)(run_start + j), ptr)) |
786 | 0 | return false; |
787 | 0 | } |
788 | 0 | return true; |
789 | 0 | } |
790 | | |
791 | | bool run_container_is_subset(const run_container_t *container1, |
792 | 0 | const run_container_t *container2) { |
793 | 0 | int i1 = 0, i2 = 0; |
794 | 0 | while (i1 < container1->n_runs && i2 < container2->n_runs) { |
795 | 0 | int start1 = container1->runs[i1].value; |
796 | 0 | int stop1 = start1 + container1->runs[i1].length; |
797 | 0 | int start2 = container2->runs[i2].value; |
798 | 0 | int stop2 = start2 + container2->runs[i2].length; |
799 | 0 | if (start1 < start2) { |
800 | 0 | return false; |
801 | 0 | } else { // start1 >= start2 |
802 | 0 | if (stop1 < stop2) { |
803 | 0 | i1++; |
804 | 0 | } else if (stop1 == stop2) { |
805 | 0 | i1++; |
806 | 0 | i2++; |
807 | 0 | } else { // stop1 > stop2 |
808 | 0 | i2++; |
809 | 0 | } |
810 | 0 | } |
811 | 0 | } |
812 | 0 | if (i1 == container1->n_runs) { |
813 | 0 | return true; |
814 | 0 | } else { |
815 | 0 | return false; |
816 | 0 | } |
817 | 0 | } |
818 | | |
819 | | // TODO: write smart_append_exclusive version to match the overloaded 1 param |
820 | | // Java version (or is it even used?) |
821 | | |
822 | | // follows the Java implementation closely |
823 | | // length is the rle-value. Ie, run [10,12) uses a length value 1. |
824 | | void run_container_smart_append_exclusive(run_container_t *src, |
825 | | const uint16_t start, |
826 | 0 | const uint16_t length) { |
827 | 0 | int old_end; |
828 | 0 | rle16_t *last_run = src->n_runs ? src->runs + (src->n_runs - 1) : NULL; |
829 | 0 | rle16_t *appended_last_run = src->runs + src->n_runs; |
830 | |
|
831 | 0 | if (!src->n_runs || |
832 | 0 | (start > (old_end = last_run->value + last_run->length + 1))) { |
833 | 0 | *appended_last_run = CROARING_MAKE_RLE16(start, length); |
834 | 0 | src->n_runs++; |
835 | 0 | return; |
836 | 0 | } |
837 | 0 | if (old_end == start) { |
838 | | // we merge |
839 | 0 | last_run->length += (length + 1); |
840 | 0 | return; |
841 | 0 | } |
842 | 0 | int new_end = start + length + 1; |
843 | |
|
844 | 0 | if (start == last_run->value) { |
845 | | // wipe out previous |
846 | 0 | if (new_end < old_end) { |
847 | 0 | *last_run = CROARING_MAKE_RLE16(new_end, old_end - new_end - 1); |
848 | 0 | return; |
849 | 0 | } else if (new_end > old_end) { |
850 | 0 | *last_run = CROARING_MAKE_RLE16(old_end, new_end - old_end - 1); |
851 | 0 | return; |
852 | 0 | } else { |
853 | 0 | src->n_runs--; |
854 | 0 | return; |
855 | 0 | } |
856 | 0 | } |
857 | 0 | last_run->length = start - last_run->value - 1; |
858 | 0 | if (new_end < old_end) { |
859 | 0 | *appended_last_run = |
860 | 0 | CROARING_MAKE_RLE16(new_end, old_end - new_end - 1); |
861 | 0 | src->n_runs++; |
862 | 0 | } else if (new_end > old_end) { |
863 | 0 | *appended_last_run = |
864 | 0 | CROARING_MAKE_RLE16(old_end, new_end - old_end - 1); |
865 | 0 | src->n_runs++; |
866 | 0 | } |
867 | 0 | } |
868 | | |
869 | | bool run_container_select(const run_container_t *container, |
870 | | uint32_t *start_rank, uint32_t rank, |
871 | 0 | uint32_t *element) { |
872 | 0 | for (int i = 0; i < container->n_runs; i++) { |
873 | 0 | uint16_t length = container->runs[i].length; |
874 | 0 | if (rank <= *start_rank + length) { |
875 | 0 | uint16_t value = container->runs[i].value; |
876 | 0 | *element = value + rank - (*start_rank); |
877 | 0 | return true; |
878 | 0 | } else |
879 | 0 | *start_rank += length + 1; |
880 | 0 | } |
881 | 0 | return false; |
882 | 0 | } |
883 | | |
884 | 0 | int run_container_rank(const run_container_t *container, uint16_t x) { |
885 | 0 | int sum = 0; |
886 | 0 | uint32_t x32 = x; |
887 | 0 | for (int i = 0; i < container->n_runs; i++) { |
888 | 0 | uint32_t startpoint = container->runs[i].value; |
889 | 0 | uint32_t length = container->runs[i].length; |
890 | 0 | uint32_t endpoint = length + startpoint; |
891 | 0 | if (x <= endpoint) { |
892 | 0 | if (x < startpoint) break; |
893 | 0 | return sum + (x32 - startpoint) + 1; |
894 | 0 | } else { |
895 | 0 | sum += length + 1; |
896 | 0 | } |
897 | 0 | } |
898 | 0 | return sum; |
899 | 0 | } |
900 | | uint32_t run_container_rank_many(const run_container_t *container, |
901 | | uint64_t start_rank, const uint32_t *begin, |
902 | 0 | const uint32_t *end, uint64_t *ans) { |
903 | 0 | const uint16_t high = (uint16_t)((*begin) >> 16); |
904 | 0 | const uint32_t *iter = begin; |
905 | 0 | int sum = 0; |
906 | 0 | int i = 0; |
907 | 0 | for (; iter != end; iter++) { |
908 | 0 | uint32_t x = *iter; |
909 | 0 | uint16_t xhigh = (uint16_t)(x >> 16); |
910 | 0 | if (xhigh != high) return iter - begin; // stop at next container |
911 | | |
912 | 0 | uint32_t x32 = x & 0xFFFF; |
913 | 0 | while (i < container->n_runs) { |
914 | 0 | uint32_t startpoint = container->runs[i].value; |
915 | 0 | uint32_t length = container->runs[i].length; |
916 | 0 | uint32_t endpoint = length + startpoint; |
917 | 0 | if (x32 <= endpoint) { |
918 | 0 | if (x32 < startpoint) { |
919 | 0 | *(ans++) = start_rank + sum; |
920 | 0 | } else { |
921 | 0 | *(ans++) = start_rank + sum + (x32 - startpoint) + 1; |
922 | 0 | } |
923 | 0 | break; |
924 | 0 | } else { |
925 | 0 | sum += length + 1; |
926 | 0 | i++; |
927 | 0 | } |
928 | 0 | } |
929 | 0 | if (i >= container->n_runs) *(ans++) = start_rank + sum; |
930 | 0 | } |
931 | | |
932 | 0 | return iter - begin; |
933 | 0 | } |
934 | | |
935 | 0 | int run_container_get_index(const run_container_t *container, uint16_t x) { |
936 | 0 | if (run_container_contains(container, x)) { |
937 | 0 | int sum = 0; |
938 | 0 | uint32_t x32 = x; |
939 | 0 | for (int i = 0; i < container->n_runs; i++) { |
940 | 0 | uint32_t startpoint = container->runs[i].value; |
941 | 0 | uint32_t length = container->runs[i].length; |
942 | 0 | uint32_t endpoint = length + startpoint; |
943 | 0 | if (x <= endpoint) { |
944 | 0 | if (x < startpoint) break; |
945 | 0 | return sum + (x32 - startpoint); |
946 | 0 | } else { |
947 | 0 | sum += length + 1; |
948 | 0 | } |
949 | 0 | } |
950 | 0 | return sum - 1; |
951 | 0 | } else { |
952 | 0 | return -1; |
953 | 0 | } |
954 | 0 | } |
955 | | #define CROARING_ENABLE_AVX512_RUN_CONTAINER_CARDINALITY 0 |
956 | | |
957 | | #if defined(CROARING_IS_X64) && CROARING_COMPILER_SUPPORTS_AVX512 |
958 | | |
959 | | #if CROARING_ENABLE_AVX512_RUN_CONTAINER_CARDINALITY |
960 | | CROARING_TARGET_AVX512 |
961 | | CROARING_ALLOW_UNALIGNED |
962 | | /* Get the cardinality of `run'. Requires an actual computation. */ |
963 | | static inline int _avx512_run_container_cardinality( |
964 | | const run_container_t *run) { |
965 | | const int32_t n_runs = run->n_runs; |
966 | | const rle16_t *runs = run->runs; |
967 | | |
968 | | int32_t k = 0; |
969 | | const int32_t step512 = sizeof(__m512i) / sizeof(rle16_t); |
970 | | __m512i total = _mm512_setzero_si512(); |
971 | | for (; k + step512 <= n_runs; k += step512) { |
972 | | __m512i ymm1 = _mm512_loadu_si512((const __m512i *)(runs + k)); |
973 | | __m512i justlengths = _mm512_srli_epi32(ymm1, 16); |
974 | | total = _mm512_add_epi32(total, justlengths); |
975 | | } |
976 | | if (k < n_runs) { |
977 | | __m512i ymm1 = _mm512_maskz_loadu_epi32((1 << (n_runs - k)) - 1, |
978 | | (const __m512i *)(runs + k)); |
979 | | __m512i justlengths = _mm512_srli_epi32(ymm1, 16); |
980 | | total = _mm512_add_epi32(total, justlengths); |
981 | | } |
982 | | return _mm512_reduce_add_epi32(total) + n_runs; |
983 | | } |
984 | | |
985 | | CROARING_UNTARGET_AVX512 |
986 | | #endif // CROARING_ENABLE_AVX512_RUN_CONTAINER_CARDINALITY |
987 | | |
988 | | CROARING_TARGET_AVX2 |
989 | | CROARING_ALLOW_UNALIGNED |
990 | | /* Get the cardinality of `run'. Requires an actual computation. */ |
991 | 0 | static inline int _avx2_run_container_cardinality(const run_container_t *run) { |
992 | 0 | const int32_t n_runs = run->n_runs; |
993 | 0 | const rle16_t *runs = run->runs; |
994 | | |
995 | | /* by initializing with n_runs, we omit counting the +1 for each pair. */ |
996 | 0 | int sum = n_runs; |
997 | 0 | int32_t k = 0; |
998 | 0 | const int32_t step = sizeof(__m256i) / sizeof(rle16_t); |
999 | 0 | if (n_runs > step) { |
1000 | 0 | __m256i total = _mm256_setzero_si256(); |
1001 | 0 | for (; k + step <= n_runs; k += step) { |
1002 | 0 | __m256i ymm1 = _mm256_lddqu_si256((const __m256i *)(runs + k)); |
1003 | 0 | __m256i justlengths = _mm256_srli_epi32(ymm1, 16); |
1004 | 0 | total = _mm256_add_epi32(total, justlengths); |
1005 | 0 | } |
1006 | | // a store might be faster than extract? |
1007 | 0 | uint32_t buffer[sizeof(__m256i) / sizeof(rle16_t)]; |
1008 | 0 | _mm256_storeu_si256((__m256i *)buffer, total); |
1009 | 0 | sum += (buffer[0] + buffer[1]) + (buffer[2] + buffer[3]) + |
1010 | 0 | (buffer[4] + buffer[5]) + (buffer[6] + buffer[7]); |
1011 | 0 | } |
1012 | 0 | for (; k < n_runs; ++k) { |
1013 | 0 | sum += runs[k].length; |
1014 | 0 | } |
1015 | |
|
1016 | 0 | return sum; |
1017 | 0 | } |
1018 | | |
1019 | | CROARING_ALLOW_UNALIGNED |
1020 | | int _avx2_run_container_to_uint32_array(void *vout, const run_container_t *cont, |
1021 | 0 | uint32_t base) { |
1022 | 0 | int outpos = 0; |
1023 | 0 | uint32_t *out = (uint32_t *)vout; |
1024 | |
|
1025 | 0 | for (int i = 0; i < cont->n_runs; ++i) { |
1026 | 0 | uint32_t run_start = base + cont->runs[i].value; |
1027 | 0 | uint16_t le = cont->runs[i].length; |
1028 | 0 | if (le < 8) { |
1029 | 0 | for (int j = 0; j <= le; ++j) { |
1030 | 0 | uint32_t val = run_start + j; |
1031 | 0 | memcpy(out + outpos, &val, |
1032 | 0 | sizeof(uint32_t)); // should be compiled as a MOV on x64 |
1033 | 0 | outpos++; |
1034 | 0 | } |
1035 | 0 | } else { |
1036 | 0 | int j = 0; |
1037 | 0 | __m256i run_start_v = _mm256_set1_epi32(run_start); |
1038 | | // [8,8,8,8....] |
1039 | 0 | __m256i inc = _mm256_set1_epi32(8); |
1040 | | // used for generate sequence: |
1041 | | // [0, 1, 2, 3...], [8, 9, 10,...] |
1042 | 0 | __m256i delta = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); |
1043 | 0 | for (j = 0; j + 8 <= le; j += 8) { |
1044 | 0 | __m256i val_v = _mm256_add_epi32(run_start_v, delta); |
1045 | 0 | _mm256_storeu_si256((__m256i *)(out + outpos), val_v); |
1046 | 0 | delta = _mm256_add_epi32(inc, delta); |
1047 | 0 | outpos += 8; |
1048 | 0 | } |
1049 | 0 | for (; j <= le; ++j) { |
1050 | 0 | uint32_t val = run_start + j; |
1051 | 0 | memcpy(out + outpos, &val, |
1052 | 0 | sizeof(uint32_t)); // should be compiled as a MOV on x64 |
1053 | 0 | outpos++; |
1054 | 0 | } |
1055 | 0 | } |
1056 | 0 | } |
1057 | 0 | return outpos; |
1058 | 0 | } |
1059 | | |
1060 | | CROARING_UNTARGET_AVX2 |
1061 | | |
1062 | | /* Get the cardinality of `run'. Requires an actual computation. */ |
1063 | | static inline int _scalar_run_container_cardinality( |
1064 | 0 | const run_container_t *run) { |
1065 | 0 | const int32_t n_runs = run->n_runs; |
1066 | 0 | const rle16_t *runs = run->runs; |
1067 | | |
1068 | | /* by initializing with n_runs, we omit counting the +1 for each pair. */ |
1069 | 0 | int sum = n_runs; |
1070 | 0 | for (int k = 0; k < n_runs; ++k) { |
1071 | 0 | sum += runs[k].length; |
1072 | 0 | } |
1073 | |
|
1074 | 0 | return sum; |
1075 | 0 | } |
1076 | | |
1077 | 0 | int run_container_cardinality(const run_container_t *run) { |
1078 | | // Empirically AVX-512 is not always faster than AVX2 |
1079 | | #if CROARING_COMPILER_SUPPORTS_AVX512 && \ |
1080 | | CROARING_ENABLE_AVX512_RUN_CONTAINER_CARDINALITY |
1081 | | if (croaring_hardware_support() & ROARING_SUPPORTS_AVX512) { |
1082 | | return _avx512_run_container_cardinality(run); |
1083 | | } else |
1084 | | #endif |
1085 | 0 | if (croaring_hardware_support() & ROARING_SUPPORTS_AVX2) { |
1086 | 0 | return _avx2_run_container_cardinality(run); |
1087 | 0 | } else { |
1088 | 0 | return _scalar_run_container_cardinality(run); |
1089 | 0 | } |
1090 | 0 | } |
1091 | | |
1092 | | int _scalar_run_container_to_uint32_array(void *vout, |
1093 | | const run_container_t *cont, |
1094 | 0 | uint32_t base) { |
1095 | 0 | int outpos = 0; |
1096 | 0 | uint32_t *out = (uint32_t *)vout; |
1097 | 0 | for (int i = 0; i < cont->n_runs; ++i) { |
1098 | 0 | uint32_t run_start = base + cont->runs[i].value; |
1099 | 0 | uint16_t le = cont->runs[i].length; |
1100 | 0 | for (int j = 0; j <= le; ++j) { |
1101 | 0 | uint32_t val = run_start + j; |
1102 | 0 | memcpy(out + outpos, &val, |
1103 | 0 | sizeof(uint32_t)); // should be compiled as a MOV on x64 |
1104 | 0 | outpos++; |
1105 | 0 | } |
1106 | 0 | } |
1107 | 0 | return outpos; |
1108 | 0 | } |
1109 | | |
1110 | | int run_container_to_uint32_array(void *vout, const run_container_t *cont, |
1111 | 0 | uint32_t base) { |
1112 | 0 | if (croaring_hardware_support() & ROARING_SUPPORTS_AVX2) { |
1113 | 0 | return _avx2_run_container_to_uint32_array(vout, cont, base); |
1114 | 0 | } else { |
1115 | 0 | return _scalar_run_container_to_uint32_array(vout, cont, base); |
1116 | 0 | } |
1117 | 0 | } |
1118 | | |
1119 | | #else |
1120 | | |
1121 | | /* Get the cardinality of `run'. Requires an actual computation. */ |
1122 | | CROARING_ALLOW_UNALIGNED |
1123 | | int run_container_cardinality(const run_container_t *run) { |
1124 | | const int32_t n_runs = run->n_runs; |
1125 | | const rle16_t *runs = run->runs; |
1126 | | |
1127 | | /* by initializing with n_runs, we omit counting the +1 for each pair. */ |
1128 | | int sum = n_runs; |
1129 | | for (int k = 0; k < n_runs; ++k) { |
1130 | | sum += runs[k].length; |
1131 | | } |
1132 | | |
1133 | | return sum; |
1134 | | } |
1135 | | |
1136 | | CROARING_ALLOW_UNALIGNED |
1137 | | int run_container_to_uint32_array(void *vout, const run_container_t *cont, |
1138 | | uint32_t base) { |
1139 | | int outpos = 0; |
1140 | | uint32_t *out = (uint32_t *)vout; |
1141 | | for (int i = 0; i < cont->n_runs; ++i) { |
1142 | | uint32_t run_start = base + cont->runs[i].value; |
1143 | | uint16_t le = cont->runs[i].length; |
1144 | | for (int j = 0; j <= le; ++j) { |
1145 | | uint32_t val = run_start + j; |
1146 | | memcpy(out + outpos, &val, |
1147 | | sizeof(uint32_t)); // should be compiled as a MOV on x64 |
1148 | | outpos++; |
1149 | | } |
1150 | | } |
1151 | | return outpos; |
1152 | | } |
1153 | | |
1154 | | #endif |
1155 | | |
1156 | | #ifdef __cplusplus |
1157 | | } |
1158 | | } |
1159 | | } // extern "C" { namespace roaring { namespace internal { |
1160 | | #endif |
1161 | | #if defined(__GNUC__) && !defined(__clang__) |
1162 | | #pragma GCC diagnostic pop |
1163 | | #endif |