Coverage Report

Created: 2024-07-27 06:20

/src/c-blosc2/blosc/fastcopy.c
Line
Count
Source
1
/*********************************************************************
2
  Blosc - Blocked Shuffling and Compression Library
3
4
  Copyright (c) 2021  Blosc Development Team <blosc@blosc.org>
5
  https://blosc.org
6
  License: BSD 3-Clause (see LICENSE.txt)
7
8
  See LICENSE.txt for details about copyright and rights to use.
9
**********************************************************************/
10
11
/*********************************************************************
12
  The code in this file is heavily based on memcopy.h, from the
13
  zlib-ng compression library.  See LICENSES/ZLIB.txt for details.
14
  See also: https://github.com/Dead2/zlib-ng/blob/develop/zlib.h
15
16
  New implementations by Francesc Alted:
17
    * fast_copy() and copy_run() functions
18
    * Support for SSE2/AVX2 copy instructions for these routines
19
**********************************************************************/
20
21
#include "blosc2/blosc2-common.h"
22
23
#include <assert.h>
24
#include <stdint.h>
25
#if defined(BLOSC_STRICT_ALIGN)
26
#include <string.h>
27
#endif
28
29
/*
30
 * Use inlined functions for supported systems.
31
 */
32
#if defined(_MSC_VER) && !defined(__cplusplus)   /* Visual Studio */
33
#define inline __inline  /* Visual C does not support C99 inline, but provides __inline */
34
#endif
35
36
37
340
static inline unsigned char *copy_1_bytes(unsigned char *out, const unsigned char *from) {
38
340
  *out++ = *from;
39
340
  return out;
40
340
}
41
42
1.33M
static inline unsigned char *copy_2_bytes(unsigned char *out, const unsigned char *from) {
43
#if defined(BLOSC_STRICT_ALIGN)
44
  uint16_t chunk;
45
  memcpy(&chunk, from, 2);
46
  memcpy(out, &chunk, 2);
47
#else
48
1.33M
  *(uint16_t *) out = *(uint16_t *) from;
49
1.33M
#endif
50
1.33M
  return out + 2;
51
1.33M
}
52
53
315
static inline unsigned char *copy_3_bytes(unsigned char *out, const unsigned char *from) {
54
315
  out = copy_1_bytes(out, from);
55
315
  return copy_2_bytes(out, from + 1);
56
315
}
57
58
541k
static inline unsigned char *copy_4_bytes(unsigned char *out, const unsigned char *from) {
59
#if defined(BLOSC_STRICT_ALIGN)
60
  uint32_t chunk;
61
  memcpy(&chunk, from, 4);
62
  memcpy(out, &chunk, 4);
63
#else
64
541k
  *(uint32_t *) out = *(uint32_t *) from;
65
541k
#endif
66
541k
  return out + 4;
67
541k
}
68
69
25
static inline unsigned char *copy_5_bytes(unsigned char *out, const unsigned char *from) {
70
25
  out = copy_1_bytes(out, from);
71
25
  return copy_4_bytes(out, from + 1);
72
25
}
73
74
291
static inline unsigned char *copy_6_bytes(unsigned char *out, const unsigned char *from) {
75
291
  out = copy_2_bytes(out, from);
76
291
  return copy_4_bytes(out, from + 2);
77
291
}
78
79
230
static inline unsigned char *copy_7_bytes(unsigned char *out, const unsigned char *from) {
80
230
  out = copy_3_bytes(out, from);
81
230
  return copy_4_bytes(out, from + 3);
82
230
}
83
84
52.1k
static inline unsigned char *copy_8_bytes(unsigned char *out, const unsigned char *from) {
85
#if defined(BLOSC_STRICT_ALIGN)
86
  uint64_t chunk;
87
  memcpy(&chunk, from, 8);
88
  memcpy(out, &chunk, 8);
89
#else
90
52.1k
  *(uint64_t *) out = *(uint64_t *) from;
91
52.1k
#endif
92
52.1k
  return out + 8;
93
52.1k
}
94
95
96
102k
static inline unsigned char *copy_16_bytes(unsigned char *out, const unsigned char *from) {
97
102k
#if defined(__SSE2__)
98
102k
  __m128i chunk;
99
102k
  chunk = _mm_loadu_si128((__m128i*)from);
100
102k
  _mm_storeu_si128((__m128i*)out, chunk);
101
102k
  out += 16;
102
#elif !defined(BLOSC_STRICT_ALIGN)
103
  *(uint64_t*)out = *(uint64_t*)from;
104
   from += 8; out += 8;
105
   *(uint64_t*)out = *(uint64_t*)from;
106
   from += 8; out += 8;
107
#else
108
   int i;
109
   for (i = 0; i < 16; i++) {
110
     *out++ = *from++;
111
   }
112
#endif
113
102k
  return out;
114
102k
}
115
116
221
static inline unsigned char *copy_32_bytes(unsigned char *out, const unsigned char *from) {
117
#if defined(__AVX2__)
118
  __m256i chunk;
119
  chunk = _mm256_loadu_si256((__m256i*)from);
120
  _mm256_storeu_si256((__m256i*)out, chunk);
121
  out += 32;
122
#elif defined(__SSE2__)
123
  __m128i chunk;
124
221
  chunk = _mm_loadu_si128((__m128i*)from);
125
221
  _mm_storeu_si128((__m128i*)out, chunk);
126
221
  from += 16; out += 16;
127
221
  chunk = _mm_loadu_si128((__m128i*)from);
128
221
  _mm_storeu_si128((__m128i*)out, chunk);
129
221
  out += 16;
130
#elif !defined(BLOSC_STRICT_ALIGN)
131
  *(uint64_t*)out = *(uint64_t*)from;
132
  from += 8; out += 8;
133
  *(uint64_t*)out = *(uint64_t*)from;
134
  from += 8; out += 8;
135
  *(uint64_t*)out = *(uint64_t*)from;
136
  from += 8; out += 8;
137
  *(uint64_t*)out = *(uint64_t*)from;
138
  from += 8; out += 8;
139
#else
140
  int i;
141
  for (i = 0; i < 32; i++) {
142
    *out++ = *from++;
143
  }
144
#endif
145
221
  return out;
146
221
}
147
148
// This is never used, so it is commented out
149
//#if defined(__AVX2__)
150
//static inline unsigned char *copy_32_bytes_aligned(unsigned char *out, const unsigned char *from) {
151
//  __m256i chunk;
152
//  chunk = _mm256_load_si256((__m256i*)from);
153
//  _mm256_storeu_si256((__m256i*)out, chunk);
154
//  return out + 32;
155
//}
156
//#endif  // __AVX2__
157
158
/* Copy LEN bytes (7 or fewer) from FROM into OUT. Return OUT + LEN. */
159
652
static inline unsigned char *copy_bytes(unsigned char *out, const unsigned char *from, unsigned len) {
160
652
  assert(len < 8);
161
162
#ifdef BLOSC_STRICT_ALIGN
163
  while (len--) {
164
    *out++ = *from++;
165
  }
166
#else
167
652
  switch (len) {
168
230
    case 7:
169
230
      return copy_7_bytes(out, from);
170
291
    case 6:
171
291
      return copy_6_bytes(out, from);
172
25
    case 5:
173
25
      return copy_5_bytes(out, from);
174
21
    case 4:
175
21
      return copy_4_bytes(out, from);
176
85
    case 3:
177
85
      return copy_3_bytes(out, from);
178
0
    case 2:
179
0
      return copy_2_bytes(out, from);
180
0
    case 1:
181
0
      return copy_1_bytes(out, from);
182
0
    case 0:
183
0
      return out;
184
0
    default:
185
0
      assert(0);
186
652
  }
187
0
#endif /* BLOSC_STRICT_ALIGN */
188
0
  return out;
189
652
}
190
191
// Define a symbol to avoid fall-through warnings emitted by gcc >= 7.0
192
#if ((defined(__GNUC__) && BLOSC_GCC_VERSION >= 700) && !defined(__clang__) && \
193
     !defined(__ICC) && !defined(__ICL))
194
#define AVOID_FALLTHROUGH_WARNING
195
#endif
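The guard above is needed because __attribute__ ((fallthrough)) is a statement attribute that only newer GCC understands; other compilers must not see it at all. A minimal standalone sketch of the same idea (EXAMPLE_FALLTHROUGH and example_steps are hypothetical names, not the macro or code used in this file):

#include <stdio.h>

#if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ >= 7)
#define EXAMPLE_FALLTHROUGH __attribute__ ((fallthrough))
#else
#define EXAMPLE_FALLTHROUGH ((void)0)
#endif

/* Intentional fall-through: each case adds one more step. */
static int example_steps(int n) {
  int acc = 0;
  switch (n) {
    case 2:
      acc += 2;
      EXAMPLE_FALLTHROUGH;
    case 1:
      acc += 1;
      break;
    default:
      break;
  }
  return acc;
}

int main(void) {
  printf("%d\n", example_steps(2));  /* prints 3, with no -Wimplicit-fallthrough warning */
  return 0;
}

The file itself guards a bare __attribute__ ((fallthrough)); with #ifdef AVOID_FALLTHROUGH_WARNING instead of wrapping it in a macro, but the effect is the same.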
196
197
/* Byte by byte semantics: copy LEN bytes from FROM and write them to OUT. Return OUT + LEN. */
198
386
static inline unsigned char *chunk_memcpy(unsigned char *out, const unsigned char *from, unsigned len) {
199
386
  unsigned sz = sizeof(uint64_t);
200
386
  unsigned rem = len % sz;
201
386
  unsigned by8;
202
203
386
  assert(len >= sz);
204
205
  /* Copy a few bytes to make sure the loop below has a multiple of SZ bytes to be copied. */
206
386
  copy_8_bytes(out, from);
207
208
386
  len /= sz;
209
386
  out += rem;
210
386
  from += rem;
211
212
386
  by8 = len % 8;
213
386
  len -= by8;
214
386
  switch (by8) {
215
0
    case 7:
216
0
      out = copy_8_bytes(out, from);
217
0
      from += sz;
218
      #ifdef AVOID_FALLTHROUGH_WARNING
219
__attribute__ ((fallthrough));  // Silence the -Wimplicit-fallthrough warning in GCC
220
      #endif
221
0
    case 6:
222
0
      out = copy_8_bytes(out, from);
223
0
      from += sz;
224
      #ifdef AVOID_FALLTHROUGH_WARNING
225
      __attribute__ ((fallthrough));
226
      #endif
227
0
    case 5:
228
0
      out = copy_8_bytes(out, from);
229
0
      from += sz;
230
      #ifdef AVOID_FALLTHROUGH_WARNING
231
      __attribute__ ((fallthrough));
232
      #endif
233
0
    case 4:
234
0
      out = copy_8_bytes(out, from);
235
0
      from += sz;
236
      #ifdef AVOID_FALLTHROUGH_WARNING
237
      __attribute__ ((fallthrough));
238
      #endif
239
0
    case 3:
240
0
      out = copy_8_bytes(out, from);
241
0
      from += sz;
242
      #ifdef AVOID_FALLTHROUGH_WARNING
243
      __attribute__ ((fallthrough));
244
      #endif
245
0
    case 2:
246
0
      out = copy_8_bytes(out, from);
247
0
      from += sz;
248
      #ifdef AVOID_FALLTHROUGH_WARNING
249
      __attribute__ ((fallthrough));
250
      #endif
251
386
    case 1:
252
386
      out = copy_8_bytes(out, from);
253
386
      from += sz;
254
      #ifdef AVOID_FALLTHROUGH_WARNING
255
      __attribute__ ((fallthrough));
256
      #endif
257
386
    default:
258
386
      break;
259
386
  }
260
261
386
  while (len) {
262
0
    out = copy_8_bytes(out, from);
263
0
    from += sz;
264
0
    out = copy_8_bytes(out, from);
265
0
    from += sz;
266
0
    out = copy_8_bytes(out, from);
267
0
    from += sz;
268
0
    out = copy_8_bytes(out, from);
269
0
    from += sz;
270
0
    out = copy_8_bytes(out, from);
271
0
    from += sz;
272
0
    out = copy_8_bytes(out, from);
273
0
    from += sz;
274
0
    out = copy_8_bytes(out, from);
275
0
    from += sz;
276
0
    out = copy_8_bytes(out, from);
277
0
    from += sz;
278
279
0
    len -= 8;
280
0
  }
281
282
386
  return out;
283
386
}
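The prologue trick in chunk_memcpy() is easy to miss: the first copy_8_bytes() covers the leading len % 8 bytes, and after advancing both pointers by that remainder the rest of the routine only deals with whole 8-byte chunks; the bytes between rem and 8 are simply written twice with the same data, which is harmless for non-overlapping buffers. A minimal sketch of the same arithmetic using plain memcpy (chunk8_copy_sketch is a hypothetical name, not part of this file):

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Copy len (>= 8) bytes in 8-byte chunks after a small prologue. */
static unsigned char *chunk8_copy_sketch(unsigned char *out, const unsigned char *from, unsigned len) {
  unsigned sz = sizeof(uint64_t);
  unsigned rem = len % sz;
  unsigned i;

  assert(len >= sz);
  memcpy(out, from, sz);      /* prologue: covers the first rem bytes (plus a few extra) */
  out += rem; from += rem;    /* what is left is now an exact multiple of sz */
  for (i = 0; i < len / sz; i++) {
    memcpy(out, from, sz);
    out += sz; from += sz;
  }
  return out;                 /* == original out + len */
}

For example, with len = 13 (rem = 5) the prologue writes bytes 0..7, the loop then writes bytes 5..12, and bytes 5..7 are harmlessly written twice.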
284
285
#if (defined(__SSE2__) && defined(__AVX2__))
286
/* 16-byte version of chunk_memcpy() */
287
static inline unsigned char *chunk_memcpy_16(unsigned char *out, const unsigned char *from, unsigned len) {
288
  unsigned sz = 16;
289
  unsigned rem = len % sz;
290
  unsigned ilen;
291
292
  assert(len >= sz);
293
294
  /* Copy a few bytes to make sure the loop below has a multiple of SZ bytes to be copied. */
295
  copy_16_bytes(out, from);
296
297
  len /= sz;
298
  out += rem;
299
  from += rem;
300
301
  for (ilen = 0; ilen < len; ilen++) {
302
    copy_16_bytes(out, from);
303
    out += sz;
304
    from += sz;
305
  }
306
307
  return out;
308
}
309
#endif
310
311
312
// NOTE: chunk_memcpy_32() and chunk_memcpy_32_unrolled() are not used, so they are commented out
313
314
///* 32-byte version of chunk_memcpy() */
315
//static inline unsigned char *chunk_memcpy_32(unsigned char *out, const unsigned char *from, unsigned len) {
316
//  unsigned sz = 32;
317
//  unsigned rem = len % sz;
318
//  unsigned ilen;
319
//
320
//  assert(len >= sz);
321
//
322
//  /* Copy a few bytes to make sure the loop below has a multiple of SZ bytes to be copied. */
323
//  copy_32_bytes(out, from);
324
//
325
//  len /= sz;
326
//  out += rem;
327
//  from += rem;
328
//
329
//  for (ilen = 0; ilen < len; ilen++) {
330
//    copy_32_bytes(out, from);
331
//    out += sz;
332
//    from += sz;
333
//  }
334
//
335
//  return out;
336
//}
337
//
338
///* 32-byte *unrolled* version of chunk_memcpy() */
339
//static inline unsigned char *chunk_memcpy_32_unrolled(unsigned char *out, const unsigned char *from, unsigned len) {
340
//  unsigned sz = 32;
341
//  unsigned rem = len % sz;
342
//  unsigned by8;
343
//
344
//  assert(len >= sz);
345
//
346
//  /* Copy a few bytes to make sure the loop below has a multiple of SZ bytes to be copied. */
347
//  copy_32_bytes(out, from);
348
//
349
//  len /= sz;
350
//  out += rem;
351
//  from += rem;
352
//
353
//  by8 = len % 8;
354
//  len -= by8;
355
//  switch (by8) {
356
//    case 7:
357
//      out = copy_32_bytes(out, from);
358
//      from += sz;
359
//    case 6:
360
//      out = copy_32_bytes(out, from);
361
//      from += sz;
362
//    case 5:
363
//      out = copy_32_bytes(out, from);
364
//      from += sz;
365
//    case 4:
366
//      out = copy_32_bytes(out, from);
367
//      from += sz;
368
//    case 3:
369
//      out = copy_32_bytes(out, from);
370
//      from += sz;
371
//    case 2:
372
//      out = copy_32_bytes(out, from);
373
//      from += sz;
374
//    case 1:
375
//      out = copy_32_bytes(out, from);
376
//      from += sz;
377
//    default:
378
//      break;
379
//  }
380
//
381
//  while (len) {
382
//    out = copy_32_bytes(out, from);
383
//    from += sz;
384
//    out = copy_32_bytes(out, from);
385
//    from += sz;
386
//    out = copy_32_bytes(out, from);
387
//    from += sz;
388
//    out = copy_32_bytes(out, from);
389
//    from += sz;
390
//    out = copy_32_bytes(out, from);
391
//    from += sz;
392
//    out = copy_32_bytes(out, from);
393
//    from += sz;
394
//    out = copy_32_bytes(out, from);
395
//    from += sz;
396
//    out = copy_32_bytes(out, from);
397
//    from += sz;
398
//
399
//    len -= 8;
400
//  }
401
//
402
//  return out;
403
//}
404
405
406
/* SSE2/AVX2 *unaligned* version of chunk_memcpy() */
407
#if defined(__SSE2__) || defined(__AVX2__)
408
4.10k
static inline unsigned char *chunk_memcpy_unaligned(unsigned char *out, const unsigned char *from, unsigned len) {
409
#if defined(__AVX2__)
410
  unsigned sz = sizeof(__m256i);
411
#elif defined(__SSE2__)
412
  unsigned sz = sizeof(__m128i);
413
4.10k
#endif
414
4.10k
  unsigned rem = len % sz;
415
4.10k
  unsigned ilen;
416
417
4.10k
  assert(len >= sz);
418
419
  /* Copy a few bytes to make sure the loop below has a multiple of SZ bytes to be copied. */
420
#if defined(__AVX2__)
421
  copy_32_bytes(out, from);
422
#elif defined(__SSE2__)
423
  copy_16_bytes(out, from);
424
4.10k
#endif
425
426
4.10k
  len /= sz;
427
4.10k
  out += rem;
428
4.10k
  from += rem;
429
430
86.6k
  for (ilen = 0; ilen < len; ilen++) {
431
#if defined(__AVX2__)
432
    copy_32_bytes(out, from);
433
#elif defined(__SSE2__)
434
    copy_16_bytes(out, from);
435
82.5k
#endif
436
82.5k
    out += sz;
437
82.5k
    from += sz;
438
82.5k
  }
439
440
4.10k
  return out;
441
4.10k
}
442
#endif // __SSE2__ || __AVX2__
443
444
445
// NOTE: chunk_memcpy_aligned() is not used, so it is commented out
446
447
//#if defined(__SSE2__) || defined(__AVX2__)
448
///* SSE2/AVX2 *aligned* version of chunk_memcpy() */
449
//static inline unsigned char *chunk_memcpy_aligned(unsigned char *out, const unsigned char *from, unsigned len) {
450
//#if defined(__AVX2__)
451
//  unsigned sz = sizeof(__m256i);
452
//  __m256i chunk;
453
//#elif defined(__SSE2__)
454
//  unsigned sz = sizeof(__m128i);
455
//  __m128i chunk;
456
//#endif
457
//  unsigned bytes_to_align = sz - (unsigned)(((uintptr_t)(const void *)(from)) % sz);
458
//  unsigned corrected_len = len - bytes_to_align;
459
//  unsigned rem = corrected_len % sz;
460
//  unsigned ilen;
461
//
462
//  assert(len >= sz);
463
//
464
//  /* Copy a few bytes to make sure the loop below has aligned access. */
465
//#if defined(__AVX2__)
466
//  chunk = _mm256_loadu_si256((__m256i *) from);
467
//  _mm256_storeu_si256((__m256i *) out, chunk);
468
//#elif defined(__SSE2__)
469
//  chunk = _mm_loadu_si128((__m128i *) from);
470
//  _mm_storeu_si128((__m128i *) out, chunk);
471
//#endif
472
//  out += bytes_to_align;
473
//  from += bytes_to_align;
474
//
475
//  len = corrected_len / sz;
476
//  for (ilen = 0; ilen < len; ilen++) {
477
//#if defined(__AVX2__)
478
//    chunk = _mm256_load_si256((__m256i *) from);  /* *aligned* load */
479
//    _mm256_storeu_si256((__m256i *) out, chunk);
480
//#elif defined(__SSE2__)
481
//    chunk = _mm_load_si128((__m128i *) from);  /* *aligned* load */
482
//    _mm_storeu_si128((__m128i *) out, chunk);
483
//#endif
484
//    out += sz;
485
//    from += sz;
486
//  }
487
//
488
//  /* Copy remaining bytes */
489
//  if (rem < 8) {
490
//    out = copy_bytes(out, from, rem);
491
//  }
492
//  else {
493
//    out = chunk_memcpy(out, from, rem);
494
//  }
495
//
496
//  return out;
497
//}
498
//#endif // __AVX2__ || __SSE2__
499
500
501
/* Byte by byte semantics: copy LEN bytes from FROM and write them to OUT. Return OUT + LEN. */
502
5.91k
unsigned char *fastcopy(unsigned char *out, const unsigned char *from, unsigned len) {
503
5.91k
  switch (len) {
504
221
    case 32:
505
221
      return copy_32_bytes(out, from);
506
258
    case 16:
507
258
      return copy_16_bytes(out, from);
508
298
    case 8:
509
298
      return copy_8_bytes(out, from);
510
5.13k
    default: {
511
5.13k
    }
512
5.91k
  }
513
5.13k
  if (len < 8) {
514
652
    return copy_bytes(out, from, len);
515
652
  }
516
4.48k
#if defined(__SSE2__)
517
4.48k
  if (len < 16) {
518
386
    return chunk_memcpy(out, from, len);
519
386
  }
520
4.10k
#if !defined(__AVX2__)
521
4.10k
  return chunk_memcpy_unaligned(out, from, len);
522
#else
523
  if (len < 32) {
524
    return chunk_memcpy_16(out, from, len);
525
  }
526
  return chunk_memcpy_unaligned(out, from, len);
527
#endif  // !__AVX2__
528
#else
529
  return chunk_memcpy(out, from, len);
530
#endif  // __SSE2__
531
4.48k
}
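fastcopy() is the non-overlapping entry point: exact lengths of 8, 16 and 32 hit the dedicated helpers via the switch, lengths below 8 go through copy_bytes(), lengths of 8..15 through chunk_memcpy(), and longer copies take the SSE2/AVX2 chunk_memcpy_unaligned() path when those instruction sets are available. A minimal usage sketch (hypothetical driver, assuming this file is linked in and the buffers do not overlap):

#include <stdio.h>
#include <string.h>

unsigned char *fastcopy(unsigned char *out, const unsigned char *from, unsigned len);

int main(void) {
  unsigned char src[64], dst[64] = {0};
  unsigned char *end;
  int i;

  for (i = 0; i < 64; i++) src[i] = (unsigned char)i;
  end = fastcopy(dst, src, 37);   /* lands in the chunk_memcpy_unaligned() bucket on SSE2 builds */
  printf("copied %u bytes, equal=%d\n", (unsigned)(end - dst), memcmp(dst, src, 37) == 0);
  return 0;
}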
532
533
534
/* Copy a run */
535
101k
unsigned char* copy_match(unsigned char *out, const unsigned char *from, unsigned len) {
536
#if defined(__AVX2__)
537
  unsigned sz = sizeof(__m256i);
538
#elif defined(__SSE2__)
539
  unsigned sz = sizeof(__m128i);
540
#else
541
  unsigned sz = sizeof(uint64_t);
542
#endif
543
544
#if ((defined(__GNUC__) && BLOSC_GCC_VERSION < 800) && !defined(__clang__) && !defined(__ICC) && !defined(__ICL))
545
  // GCC < 8 at full optimization seems to have problems with the code further below, so stop here
546
  for (; len > 0; len--) {
547
    *out++ = *from++;
548
  }
549
  return out;
550
#endif
551
552
  // If out and from are more than the chunk size (sz) apart, a fastcopy is safe
553
101k
  unsigned overlap_dist = (unsigned) (out - from);
554
101k
  if (overlap_dist > sz) {
555
5.91k
    return fastcopy(out, from, len);
556
5.91k
  }
557
558
  // Otherwise we need to be more careful so as not to overwrite the destination
559
95.9k
  switch (overlap_dist) {
560
0
    case 32:
561
0
      for (; len >= 32; len -= 32) {
562
0
        out = copy_32_bytes(out, from);
563
0
      }
564
0
      break;
565
0
    case 30:
566
0
      for (; len >= 30; len -= 30) {
567
0
        out = copy_16_bytes(out, from);
568
0
        out = copy_8_bytes(out, from + 16);
569
0
        out = copy_4_bytes(out, from + 24);
570
0
        out = copy_2_bytes(out, from + 28);
571
0
      }
572
0
      break;
573
0
    case 28:
574
0
      for (; len >= 28; len -= 28) {
575
0
        out = copy_16_bytes(out, from);
576
0
        out = copy_8_bytes(out, from + 16);
577
0
        out = copy_4_bytes(out, from + 24);
578
0
      }
579
0
      break;
580
0
    case 26:
581
0
      for (; len >= 26; len -= 26) {
582
0
        out = copy_16_bytes(out, from);
583
0
        out = copy_8_bytes(out, from + 16);
584
0
        out = copy_2_bytes(out, from + 24);
585
0
      }
586
0
      break;
587
0
    case 24:
588
0
      for (; len >= 24; len -= 24) {
589
0
        out = copy_16_bytes(out, from);
590
0
        out = copy_8_bytes(out, from + 16);
591
0
      }
592
0
      break;
593
0
    case 22:
594
0
      for (; len >= 22; len -= 22) {
595
0
        out = copy_16_bytes(out, from);
596
0
        out = copy_4_bytes(out, from + 16);
597
0
        out = copy_2_bytes(out, from + 20);
598
0
      }
599
0
      break;
600
0
    case 20:
601
0
      for (; len >= 20; len -= 20) {
602
0
        out = copy_16_bytes(out, from);
603
0
        out = copy_4_bytes(out, from + 16);
604
0
      }
605
0
      break;
606
0
    case 18:
607
0
      for (; len >= 18; len -= 18) {
608
0
        out = copy_16_bytes(out, from);
609
0
        out = copy_2_bytes(out, from + 16);
610
0
      }
611
0
      break;
612
982
    case 16:
613
16.1k
      for (; len >= 16; len -= 16) {
614
15.1k
        out = copy_16_bytes(out, from);
615
15.1k
      }
616
982
      break;
617
1.31k
    case 8:
618
52.4k
      for (; len >= 8; len -= 8) {
619
51.0k
        out = copy_8_bytes(out, from);
620
51.0k
      }
621
1.31k
      break;
622
17.6k
    case 4:
623
558k
      for (; len >= 4; len -= 4) {
624
540k
        out = copy_4_bytes(out, from);
625
540k
      }
626
17.6k
      break;
627
49.0k
    case 2:
628
1.37M
      for (; len >= 2; len -= 2) {
629
1.32M
        out = copy_2_bytes(out, from);
630
1.32M
      }
631
49.0k
      break;
632
26.9k
    default:
633
768k
      for (; len > 0; len--) {
634
742k
        *out++ = *from++;
635
742k
      }
636
95.9k
  }
637
638
  // Copy the leftovers
639
164k
  for (; len > 0; len--) {
640
68.6k
    *out++ = *from++;
641
68.6k
  }
642
643
95.9k
  return out;
644
95.9k
}
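The overlap handling in copy_match() reproduces byte-by-byte semantics (out[i] = from[i]) even when source and destination overlap, which is how an LZ77-style decoder expands a short pattern into a long run: when out - from equals the chunk width used in a given case, repeatedly copying from the unchanged from pointer is equivalent to the byte loop, because the output is just that pattern repeated. A minimal sketch of that behaviour (hypothetical buffer contents, assuming this file is linked in):

#include <stdio.h>

unsigned char *copy_match(unsigned char *out, const unsigned char *from, unsigned len);

int main(void) {
  unsigned char buf[16] = "ab";        /* 2-byte pattern followed by zeros */

  /* Distance 2, length 7: takes the copy_2_bytes() loop plus a 1-byte leftover. */
  copy_match(buf + 2, buf, 7);
  printf("%.9s\n", (char *)buf);       /* prints "ababababa" */
  return 0;
}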