Coverage Report

Created: 2026-04-12 06:36

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/c-blosc/blosc/fastcopy.c
Line
Count
Source
1
/*********************************************************************
2
  Blosc - Blocked Shuffling and Compression Library
3
4
  Author: Francesc Alted <francesc@blosc.org>
5
  Creation date: 2018-01-03
6
7
  See LICENSE.txt for details about copyright and rights to use.
8
**********************************************************************/
9
10
/*********************************************************************
11
  The code in this file is heavily based on memcopy.h, from the
12
13
  zlib-ng compression library.  See LICENSES/ZLIB-NG.txt for details.
14
  See also: https://github.com/zlib-ng/zlib-ng/blob/develop/zlib.h.in
15
16
  New implementations by Francesc Alted:
17
    * fast_copy() and copy_run() functions
18
    * Support for SSE2/AVX2 copy instructions for these routines
19
**********************************************************************/
20
21
#include <assert.h>
22
#include "blosc-common.h"
23
24
/*
25
 * Use inlined functions for supported systems.
26
 */
27
#if defined(_MSC_VER) && !defined(__cplusplus)   /* Visual Studio */
28
#define inline __inline  /* Visual C is not C99, but supports some kind of inline */
29
#endif
30
31
32
1.41k
/* Copy a single byte from FROM to OUT; return OUT advanced past it. */
static inline unsigned char *copy_1_bytes(unsigned char *out, const unsigned char *from) {
  out[0] = from[0];
  return out + 1;
}
36
37
441k
/* Copy exactly 2 bytes from FROM to OUT; return OUT + 2.
 * memcpy replaces the previous `*(uint16_t *)out = *(uint16_t *)from`
 * cast, which was undefined behavior (strict aliasing + possibly
 * misaligned access).  A fixed-size memcpy compiles to a single
 * load/store on every mainstream compiler, so it is safe on all
 * platforms and the BLOSC_STRICT_ALIGN special case is no longer needed. */
static inline unsigned char *copy_2_bytes(unsigned char *out, const unsigned char *from) {
  memcpy(out, from, 2);
  return out + 2;
}
47
48
490
/* Copy exactly 3 bytes from FROM to OUT (1-byte then 2-byte copy);
   return OUT + 3. */
static inline unsigned char *copy_3_bytes(unsigned char *out, const unsigned char *from) {
  unsigned char *dst = copy_1_bytes(out, from);
  return copy_2_bytes(dst, from + 1);
}
52
53
94.9k
/* Copy exactly 4 bytes from FROM to OUT; return OUT + 4.
 * memcpy replaces the previous `*(uint32_t *)out = *(uint32_t *)from`
 * cast, which was undefined behavior (strict aliasing + possibly
 * misaligned access).  Compilers lower a fixed 4-byte memcpy to one
 * load/store, so the BLOSC_STRICT_ALIGN branch is no longer needed. */
static inline unsigned char *copy_4_bytes(unsigned char *out, const unsigned char *from) {
  memcpy(out, from, 4);
  return out + 4;
}
63
64
388
/* Copy exactly 5 bytes from FROM to OUT (1-byte then 4-byte copy);
   return OUT + 5. */
static inline unsigned char *copy_5_bytes(unsigned char *out, const unsigned char *from) {
  unsigned char *dst = copy_1_bytes(out, from);
  return copy_4_bytes(dst, from + 1);
}
68
69
330
/* Copy exactly 6 bytes from FROM to OUT (2-byte then 4-byte copy);
   return OUT + 6. */
static inline unsigned char *copy_6_bytes(unsigned char *out, const unsigned char *from) {
  unsigned char *dst = copy_2_bytes(out, from);
  return copy_4_bytes(dst, from + 2);
}
73
74
226
/* Copy exactly 7 bytes from FROM to OUT (3-byte then 4-byte copy);
   return OUT + 7. */
static inline unsigned char *copy_7_bytes(unsigned char *out, const unsigned char *from) {
  unsigned char *dst = copy_3_bytes(out, from);
  return copy_4_bytes(dst, from + 3);
}
78
79
3.37k
/* Copy exactly 8 bytes from FROM to OUT; return OUT + 8.
 * memcpy replaces the previous `*(uint64_t *)out = *(uint64_t *)from`
 * cast, which was undefined behavior (strict aliasing + possibly
 * misaligned access).  A fixed 8-byte memcpy compiles to a single
 * load/store, so the BLOSC_STRICT_ALIGN branch is no longer needed. */
static inline unsigned char *copy_8_bytes(unsigned char *out, const unsigned char *from) {
  memcpy(out, from, 8);
  return out + 8;
}
89
90
91
20.0M
/* Copy exactly 16 bytes from FROM to OUT; return OUT + 16.
 * SSE2 uses one unaligned 128-bit load/store pair.  The fallback uses
 * memcpy, which replaces both previous fallbacks: the uint64_t
 * pointer-cast path was undefined behavior (strict aliasing, possible
 * misalignment) and the byte loop was needlessly slow — a fixed
 * 16-byte memcpy is safe on any alignment and is expanded by the
 * compiler into a couple of wide moves. */
static inline unsigned char *copy_16_bytes(unsigned char *out, const unsigned char *from) {
#if defined(__SSE2__)
  __m128i chunk = _mm_loadu_si128((const __m128i*)from);  /* const-correct cast */
  _mm_storeu_si128((__m128i*)out, chunk);
#else
  memcpy(out, from, 16);
#endif
  return out + 16;
}
110
111
312
/* Copy exactly 32 bytes from FROM to OUT; return OUT + 32.
 * AVX2 uses one unaligned 256-bit load/store; SSE2 uses two unaligned
 * 128-bit pairs.  The fallback uses memcpy, which replaces both
 * previous fallbacks: the uint64_t pointer-cast path was undefined
 * behavior (strict aliasing, possible misalignment) and the byte loop
 * was needlessly slow — a fixed 32-byte memcpy is safe on any
 * alignment and is expanded by the compiler into wide moves. */
static inline unsigned char *copy_32_bytes(unsigned char *out, const unsigned char *from) {
#if defined(__AVX2__)
  __m256i chunk = _mm256_loadu_si256((const __m256i*)from);  /* const-correct cast */
  _mm256_storeu_si256((__m256i*)out, chunk);
#elif defined(__SSE2__)
  __m128i chunk = _mm_loadu_si128((const __m128i*)from);
  _mm_storeu_si128((__m128i*)out, chunk);
  chunk = _mm_loadu_si128((const __m128i*)(from + 16));
  _mm_storeu_si128((__m128i*)(out + 16), chunk);
#else
  memcpy(out, from, 32);
#endif
  return out + 32;
}
142
143
// This is never used, so comment it out
144
//#if defined(__AVX2__)
145
//static inline unsigned char *copy_32_bytes_aligned(unsigned char *out, const unsigned char *from) {
146
//  __m256i chunk;
147
//  chunk = _mm256_load_si256((__m256i*)from);
148
//  _mm256_storeu_si256((__m256i*)out, chunk);
149
//  return out + 32;
150
//}
151
//#endif  // __AVX2__
152
153
/* Copy LEN bytes (7 or fewer) from FROM into OUT. Return OUT + LEN. */
154
3.46k
/* Copy LEN bytes (7 or fewer) from FROM into OUT; byte-by-byte
   semantics.  Returns OUT + LEN. */
static inline unsigned char *copy_bytes(unsigned char *out, const unsigned char *from, unsigned len) {
  assert(len < 8);

#ifdef BLOSC_STRICT_ALIGN
  /* Plain byte loop for platforms that trap on unaligned access. */
  while (len--) {
    *out++ = *from++;
  }
  return out;
#else
  /* Dispatch to the fixed-size copier matching LEN. */
  switch (len) {
    case 0:
      return out;
    case 1:
      return copy_1_bytes(out, from);
    case 2:
      return copy_2_bytes(out, from);
    case 3:
      return copy_3_bytes(out, from);
    case 4:
      return copy_4_bytes(out, from);
    case 5:
      return copy_5_bytes(out, from);
    case 6:
      return copy_6_bytes(out, from);
    case 7:
      return copy_7_bytes(out, from);
    default:
      assert(0);  /* unreachable: len < 8 asserted above */
      return out;
  }
#endif /* BLOSC_STRICT_ALIGN */
}
185
186
// Define a symbol for avoiding fall-through warnings emitted by gcc >= 7.0
187
#if ((defined(__GNUC__) && BLOSC_GCC_VERSION >= 700) && !defined(__clang__) && \
188
     !defined(__ICC) && !defined(__ICL))
189
#define AVOID_FALLTHROUGH_WARNING
190
#endif
191
192
/* Byte by byte semantics: copy LEN bytes from FROM and write them to OUT. Return OUT + LEN. */
193
449
/* Copy LEN bytes (LEN >= 8) from FROM into OUT using 8-byte chunks.
 * Byte-by-byte semantics; returns OUT + LEN.
 * Strategy: one unconditional 8-byte copy covers the leading LEN % 8
 * bytes (safe because LEN >= 8), then the remaining multiple-of-8 tail
 * is copied via a partially unrolled loop (fall-through switch handles
 * the chunk count modulo 8, the while loop handles groups of 8 chunks).
 * NOTE(review): assumes OUT and FROM do not overlap within 8 bytes —
 * callers (fastcopy / copy_match with overlap_dist > sz) appear to
 * guarantee this; confirm before reusing elsewhere. */
static inline unsigned char *chunk_memcpy(unsigned char *out, const unsigned char *from, unsigned len) {
  unsigned sz = sizeof(uint64_t);
  unsigned rem = len % sz;   /* leading bytes not a multiple of 8 */
  unsigned by8;              /* number of 8-byte chunks modulo 8 */

  assert(len >= sz);

  /* Copy a few bytes to make sure the loop below has a multiple of SZ bytes to be copied. */
  copy_8_bytes(out, from);

  /* From here on, LEN counts 8-byte chunks, not bytes. */
  len /= sz;
  out += rem;
  from += rem;

  by8 = len % 8;
  len -= by8;
  /* Unrolled remainder: each case falls through to the next. */
  switch (by8) {
    case 7:
      out = copy_8_bytes(out, from);
      from += sz;
      #ifdef AVOID_FALLTHROUGH_WARNING
      __attribute__ ((fallthrough));  // Shut-up -Wimplicit-fallthrough warning in GCC
      #endif
    case 6:
      out = copy_8_bytes(out, from);
      from += sz;
      #ifdef AVOID_FALLTHROUGH_WARNING
      __attribute__ ((fallthrough));
      #endif
    case 5:
      out = copy_8_bytes(out, from);
      from += sz;
      #ifdef AVOID_FALLTHROUGH_WARNING
      __attribute__ ((fallthrough));
      #endif
    case 4:
      out = copy_8_bytes(out, from);
      from += sz;
      #ifdef AVOID_FALLTHROUGH_WARNING
      __attribute__ ((fallthrough));
      #endif
    case 3:
      out = copy_8_bytes(out, from);
      from += sz;
      #ifdef AVOID_FALLTHROUGH_WARNING
      __attribute__ ((fallthrough));
      #endif
    case 2:
      out = copy_8_bytes(out, from);
      from += sz;
      #ifdef AVOID_FALLTHROUGH_WARNING
      __attribute__ ((fallthrough));
      #endif
    case 1:
      out = copy_8_bytes(out, from);
      from += sz;
      #ifdef AVOID_FALLTHROUGH_WARNING
      __attribute__ ((fallthrough));
      #endif
    default:
      break;
  }

  /* Main loop: 8 chunks (64 bytes) per iteration. */
  while (len) {
    out = copy_8_bytes(out, from);
    from += sz;
    out = copy_8_bytes(out, from);
    from += sz;
    out = copy_8_bytes(out, from);
    from += sz;
    out = copy_8_bytes(out, from);
    from += sz;
    out = copy_8_bytes(out, from);
    from += sz;
    out = copy_8_bytes(out, from);
    from += sz;
    out = copy_8_bytes(out, from);
    from += sz;
    out = copy_8_bytes(out, from);
    from += sz;

    len -= 8;
  }

  return out;
}
279
280
#if (defined(__SSE2__) && defined(__AVX2__))
281
/* 16-byte version of chunk_memcpy() */
282
static inline unsigned char *chunk_memcpy_16(unsigned char *out, const unsigned char *from, unsigned len) {
  /* Copy LEN bytes (LEN >= 16) in 16-byte chunks; byte-by-byte
     semantics, returns OUT + LEN.  One unconditional 16-byte copy
     covers the leading LEN % 16 bytes, then whole chunks follow. */
  const unsigned sz = 16;
  unsigned rem = len % sz;
  unsigned i;

  assert(len >= sz);

  /* Prime the head so the loop below only handles whole chunks. */
  copy_16_bytes(out, from);

  out += rem;
  from += rem;

  for (i = 0, len /= sz; i < len; i++) {
    copy_16_bytes(out, from);
    out += sz;
    from += sz;
  }

  return out;
}
304
#endif
305
306
307
// NOTE: chunk_memcpy_32() and chunk_memcpy_32_unrolled() are not used, so commenting them
308
309
///* 32-byte version of chunk_memcpy() */
310
//static inline unsigned char *chunk_memcpy_32(unsigned char *out, const unsigned char *from, unsigned len) {
311
//  unsigned sz = 32;
312
//  unsigned rem = len % sz;
313
//  unsigned ilen;
314
//
315
//  assert(len >= sz);
316
//
317
//  /* Copy a few bytes to make sure the loop below has a multiple of SZ bytes to be copied. */
318
//  copy_32_bytes(out, from);
319
//
320
//  len /= sz;
321
//  out += rem;
322
//  from += rem;
323
//
324
//  for (ilen = 0; ilen < len; ilen++) {
325
//    copy_32_bytes(out, from);
326
//    out += sz;
327
//    from += sz;
328
//  }
329
//
330
//  return out;
331
//}
332
//
333
///* 32-byte *unrolled* version of chunk_memcpy() */
334
//static inline unsigned char *chunk_memcpy_32_unrolled(unsigned char *out, const unsigned char *from, unsigned len) {
335
//  unsigned sz = 32;
336
//  unsigned rem = len % sz;
337
//  unsigned by8;
338
//
339
//  assert(len >= sz);
340
//
341
//  /* Copy a few bytes to make sure the loop below has a multiple of SZ bytes to be copied. */
342
//  copy_32_bytes(out, from);
343
//
344
//  len /= sz;
345
//  out += rem;
346
//  from += rem;
347
//
348
//  by8 = len % 8;
349
//  len -= by8;
350
//  switch (by8) {
351
//    case 7:
352
//      out = copy_32_bytes(out, from);
353
//      from += sz;
354
//    case 6:
355
//      out = copy_32_bytes(out, from);
356
//      from += sz;
357
//    case 5:
358
//      out = copy_32_bytes(out, from);
359
//      from += sz;
360
//    case 4:
361
//      out = copy_32_bytes(out, from);
362
//      from += sz;
363
//    case 3:
364
//      out = copy_32_bytes(out, from);
365
//      from += sz;
366
//    case 2:
367
//      out = copy_32_bytes(out, from);
368
//      from += sz;
369
//    case 1:
370
//      out = copy_32_bytes(out, from);
371
//      from += sz;
372
//    default:
373
//      break;
374
//  }
375
//
376
//  while (len) {
377
//    out = copy_32_bytes(out, from);
378
//    from += sz;
379
//    out = copy_32_bytes(out, from);
380
//    from += sz;
381
//    out = copy_32_bytes(out, from);
382
//    from += sz;
383
//    out = copy_32_bytes(out, from);
384
//    from += sz;
385
//    out = copy_32_bytes(out, from);
386
//    from += sz;
387
//    out = copy_32_bytes(out, from);
388
//    from += sz;
389
//    out = copy_32_bytes(out, from);
390
//    from += sz;
391
//    out = copy_32_bytes(out, from);
392
//    from += sz;
393
//
394
//    len -= 8;
395
//  }
396
//
397
//  return out;
398
//}
399
400
401
/* SSE2/AVX2 *unaligned* version of chunk_memcpy() */
402
#if defined(__SSE2__) || defined(__AVX2__)
403
68.2k
/* SIMD-chunked copy of LEN bytes (LEN >= SZ) from FROM into OUT, using
 * unaligned vector loads/stores.  Byte-by-byte semantics; returns
 * OUT + LEN.  Chunk size SZ is 32 bytes under AVX2, 16 under SSE2.
 * One unconditional SZ-byte copy covers the leading LEN % SZ bytes
 * (safe since LEN >= SZ), then whole chunks are copied in a loop. */
static inline unsigned char *chunk_memcpy_unaligned(unsigned char *out, const unsigned char *from, unsigned len) {
#if defined(__AVX2__)
  unsigned sz = sizeof(__m256i);
#elif defined(__SSE2__)
  unsigned sz = sizeof(__m128i);
#endif
  unsigned rem = len % sz;  /* leading bytes not a multiple of SZ */
  unsigned ilen;

  assert(len >= sz);

  /* Copy a few bytes to make sure the loop below has a multiple of SZ bytes to be copied. */
#if defined(__AVX2__)
  copy_32_bytes(out, from);
#elif defined(__SSE2__)
  copy_16_bytes(out, from);
#endif

  /* From here on, LEN counts SZ-byte chunks, not bytes. */
  len /= sz;
  out += rem;
  from += rem;

  for (ilen = 0; ilen < len; ilen++) {
#if defined(__AVX2__)
    copy_32_bytes(out, from);
#elif defined(__SSE2__)
    copy_16_bytes(out, from);
#endif
    out += sz;
    from += sz;
  }

  return out;
}
437
#endif // __SSE2__ || __AVX2__
438
439
440
// NOTE: chunk_memcpy_aligned() is not used, so commenting it
441
442
//#if defined(__SSE2__) || defined(__AVX2__)
443
///* SSE2/AVX2 *aligned* version of chunk_memcpy() */
444
//static inline unsigned char *chunk_memcpy_aligned(unsigned char *out, const unsigned char *from, unsigned len) {
445
//#if defined(__AVX2__)
446
//  unsigned sz = sizeof(__m256i);
447
//  __m256i chunk;
448
//#elif defined(__SSE2__)
449
//  unsigned sz = sizeof(__m128i);
450
//  __m128i chunk;
451
//#endif
452
//  unsigned bytes_to_align = sz - (unsigned)(((uintptr_t)(const void *)(from)) % sz);
453
//  unsigned corrected_len = len - bytes_to_align;
454
//  unsigned rem = corrected_len % sz;
455
//  unsigned ilen;
456
//
457
//  assert(len >= sz);
458
//
459
//  /* Copy a few bytes to make sure the loop below has aligned access. */
460
//#if defined(__AVX2__)
461
//  chunk = _mm256_loadu_si256((__m256i *) from);
462
//  _mm256_storeu_si256((__m256i *) out, chunk);
463
//#elif defined(__SSE2__)
464
//  chunk = _mm_loadu_si128((__m128i *) from);
465
//  _mm_storeu_si128((__m128i *) out, chunk);
466
//#endif
467
//  out += bytes_to_align;
468
//  from += bytes_to_align;
469
//
470
//  len = corrected_len / sz;
471
//  for (ilen = 0; ilen < len; ilen++) {
472
//#if defined(__AVX2__)
473
//    chunk = _mm256_load_si256((__m256i *) from);  /* *aligned* load */
474
//    _mm256_storeu_si256((__m256i *) out, chunk);
475
//#elif defined(__SSE2__)
476
//    chunk = _mm_load_si128((__m128i *) from);  /* *aligned* load */
477
//    _mm_storeu_si128((__m128i *) out, chunk);
478
//#endif
479
//    out += sz;
480
//    from += sz;
481
//  }
482
//
483
//  /* Copy remaining bytes */
484
//  if (rem < 8) {
485
//    out = copy_bytes(out, from, rem);
486
//  }
487
//  else {
488
//    out = chunk_memcpy(out, from, rem);
489
//  }
490
//
491
//  return out;
492
//}
493
//#endif // __AVX2__ || __SSE2__
494
495
496
/* Byte by byte semantics: copy LEN bytes from FROM and write them to OUT. Return OUT + LEN. */
497
73.2k
/* Byte-by-byte semantics: copy LEN bytes from FROM and write them to
   OUT.  Returns OUT + LEN.  Dispatches on LEN to the fastest copier
   compiled in. */
unsigned char *fastcopy(unsigned char *out, const unsigned char *from, unsigned len) {
  /* Exact power-of-two sizes get a single fixed-size copy. */
  if (len == 32) {
    return copy_32_bytes(out, from);
  }
  if (len == 16) {
    return copy_16_bytes(out, from);
  }
  if (len == 8) {
    return copy_8_bytes(out, from);
  }
  /* Short copies go byte/word-wise. */
  if (len < 8) {
    return copy_bytes(out, from, len);
  }
#if defined(__SSE2__)
  if (len < 16) {
    return chunk_memcpy(out, from, len);
  }
#if !defined(__AVX2__)
  return chunk_memcpy_unaligned(out, from, len);
#else
  if (len < 32) {
    return chunk_memcpy_16(out, from, len);
  }
  return chunk_memcpy_unaligned(out, from, len);
#endif  // !__AVX2__
#else
  return chunk_memcpy(out, from, len);
#endif  // __SSE2__
}
527
528
529
/* Copy a run */
530
28.9k
/* Copy a run of LEN bytes from FROM into OUT where the two regions may
 * overlap (LZ77-style match copy).  Byte-by-byte semantics; returns
 * OUT + LEN.
 * NOTE(review): assumes out >= from (the match source precedes the
 * destination) — the unsigned cast of (out - from) would wrap
 * otherwise; confirm against callers. */
unsigned char* copy_match(unsigned char *out, const unsigned char *from, unsigned len) {
#if defined(__AVX2__)
  unsigned sz = sizeof(__m256i);
#elif defined(__SSE2__)
  unsigned sz = sizeof(__m128i);
#else
  unsigned sz = sizeof(uint64_t);
#endif

#if ((defined(__GNUC__) && BLOSC_GCC_VERSION < 800) && !defined(__clang__) && !defined(__ICC) && !defined(__ICL))
  // GCC < 8 in fully optimization mode seems to have problems with the code further below so stop here
  for (; len > 0; len--) {
    *out++ = *from++;
  }
  return out;
#endif

  // If out and from are away more than the size of the copy, then a fastcopy is safe
  unsigned overlap_dist = (unsigned) (out - from);
  if (overlap_dist > sz) {
    return fastcopy(out, from, len);
  }

  // Otherwise we need to be more careful so as not to overwrite destination
  // Each case below stamps the OVERLAP_DIST-byte source pattern repeatedly:
  // OUT advances every iteration while FROM stays fixed, which replicates
  // the periodic run without reading bytes written in the same iteration.
  switch (overlap_dist) {
    case 32:
      for (; len >= 32; len -= 32) {
        out = copy_32_bytes(out, from);
      }
      break;
    case 30:
      for (; len >= 30; len -= 30) {
        out = copy_16_bytes(out, from);
        out = copy_8_bytes(out, from + 16);
        out = copy_4_bytes(out, from + 24);
        out = copy_2_bytes(out, from + 28);
      }
      break;
    case 28:
      for (; len >= 28; len -= 28) {
        out = copy_16_bytes(out, from);
        out = copy_8_bytes(out, from + 16);
        out = copy_4_bytes(out, from + 24);
      }
      break;
    case 26:
      for (; len >= 26; len -= 26) {
        out = copy_16_bytes(out, from);
        out = copy_8_bytes(out, from + 16);
        out = copy_2_bytes(out, from + 24);
      }
      break;
    case 24:
      for (; len >= 24; len -= 24) {
        out = copy_16_bytes(out, from);
        out = copy_8_bytes(out, from + 16);
      }
      break;
    case 22:
      for (; len >= 22; len -= 22) {
        out = copy_16_bytes(out, from);
        out = copy_4_bytes(out, from + 16);
        out = copy_2_bytes(out, from + 20);
      }
      break;
    case 20:
      for (; len >= 20; len -= 20) {
        out = copy_16_bytes(out, from);
        out = copy_4_bytes(out, from + 16);
      }
      break;
    case 18:
      for (; len >= 18; len -= 18) {
        out = copy_16_bytes(out, from);
        out = copy_2_bytes(out, from + 16);
      }
      break;
    case 16:
      for (; len >= 16; len -= 16) {
        out = copy_16_bytes(out, from);
      }
      break;
    case 8:
      for (; len >= 8; len -= 8) {
        out = copy_8_bytes(out, from);
      }
      break;
    case 4:
      for (; len >= 4; len -= 4) {
        out = copy_4_bytes(out, from);
      }
      break;
    case 2:
      for (; len >= 2; len -= 2) {
        out = copy_2_bytes(out, from);
      }
      break;
    default:
      // Odd or unhandled distances: plain byte loop (always overlap-safe).
      for (; len > 0; len--) {
        *out++ = *from++;
      }
  }

  // Copy the leftovers
  for (; len > 0; len--) {
    *out++ = *from++;
  }

  return out;
}