Coverage Report

Created: 2024-07-27 06:20

/src/c-blosc2/blosc/fastcopy.c
(A Count of 0 marks an executable line that was never executed; lines with an empty Count column are not executable.)

Line |  Count | Source
   1 |        | /*********************************************************************
   2 |        |   Blosc - Blocked Shuffling and Compression Library
   3 |        |
   4 |        |   Copyright (c) 2021  Blosc Development Team <blosc@blosc.org>
   5 |        |   https://blosc.org
   6 |        |   License: BSD 3-Clause (see LICENSE.txt)
   7 |        |
   8 |        |   See LICENSE.txt for details about copyright and rights to use.
   9 |        | **********************************************************************/
  10 |        |
  11 |        | /*********************************************************************
  12 |        |   The code in this file is heavily based on memcopy.h, from the
  13 |        |   zlib-ng compression library.  See LICENSES/ZLIB.txt for details.
  14 |        |   See also: https://github.com/Dead2/zlib-ng/blob/develop/zlib.h
  15 |        |
  16 |        |   New implementations by Francesc Alted:
  17 |        |     * fast_copy() and copy_run() functions
  18 |        |     * Support for SSE2/AVX2 copy instructions for these routines
  19 |        | **********************************************************************/
  20 |        |
  21 |        | #include "blosc2/blosc2-common.h"
  22 |        |
  23 |        | #include <assert.h>
  24 |        | #include <stdint.h>
  25 |        | #if defined(BLOSC_STRICT_ALIGN)
  26 |        | #include <string.h>
  27 |        | #endif
  28 |        |
  29 |        | /*
  30 |        |  * Use inlined functions for supported systems.
  31 |        |  */
  32 |        | #if defined(_MSC_VER) && !defined(__cplusplus)   /* Visual Studio */
  33 |        | #define inline __inline  /* Visual C is not C99, but supports some kind of inline */
  34 |        | #endif
  35 |        |
  36 |        |
  37 |      1 | static inline unsigned char *copy_1_bytes(unsigned char *out, const unsigned char *from) {
  38 |      1 |   *out++ = *from;
  39 |      1 |   return out;
  40 |      1 | }
  41 |        |
  42 |    594 | static inline unsigned char *copy_2_bytes(unsigned char *out, const unsigned char *from) {
  43 |        | #if defined(BLOSC_STRICT_ALIGN)
  44 |        |   uint16_t chunk;
  45 |        |   memcpy(&chunk, from, 2);
  46 |        |   memcpy(out, &chunk, 2);
  47 |        | #else
  48 |    594 |   *(uint16_t *) out = *(uint16_t *) from;
  49 |    594 | #endif
  50 |    594 |   return out + 2;
  51 |    594 | }
  52 |        |
  53 |      1 | static inline unsigned char *copy_3_bytes(unsigned char *out, const unsigned char *from) {
  54 |      1 |   out = copy_1_bytes(out, from);
  55 |      1 |   return copy_2_bytes(out, from + 1);
  56 |      1 | }
  57 |        |
  58 |    280 | static inline unsigned char *copy_4_bytes(unsigned char *out, const unsigned char *from) {
  59 |        | #if defined(BLOSC_STRICT_ALIGN)
  60 |        |   uint32_t chunk;
  61 |        |   memcpy(&chunk, from, 4);
  62 |        |   memcpy(out, &chunk, 4);
  63 |        | #else
  64 |    280 |   *(uint32_t *) out = *(uint32_t *) from;
  65 |    280 | #endif
  66 |    280 |   return out + 4;
  67 |    280 | }
  68 |        |
  69 |      0 | static inline unsigned char *copy_5_bytes(unsigned char *out, const unsigned char *from) {
  70 |      0 |   out = copy_1_bytes(out, from);
  71 |      0 |   return copy_4_bytes(out, from + 1);
  72 |      0 | }
  73 |        |
  74 |      3 | static inline unsigned char *copy_6_bytes(unsigned char *out, const unsigned char *from) {
  75 |      3 |   out = copy_2_bytes(out, from);
  76 |      3 |   return copy_4_bytes(out, from + 2);
  77 |      3 | }
  78 |        |
  79 |      1 | static inline unsigned char *copy_7_bytes(unsigned char *out, const unsigned char *from) {
  80 |      1 |   out = copy_3_bytes(out, from);
  81 |      1 |   return copy_4_bytes(out, from + 3);
  82 |      1 | }
  83 |        |
  84 |     17 | static inline unsigned char *copy_8_bytes(unsigned char *out, const unsigned char *from) {
  85 |        | #if defined(BLOSC_STRICT_ALIGN)
  86 |        |   uint64_t chunk;
  87 |        |   memcpy(&chunk, from, 8);
  88 |        |   memcpy(out, &chunk, 8);
  89 |        | #else
  90 |     17 |   *(uint64_t *) out = *(uint64_t *) from;
  91 |     17 | #endif
  92 |     17 |   return out + 8;
  93 |     17 | }
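
A note on the BLOSC_STRICT_ALIGN branches above (a minimal sketch, not part of fastcopy.c): on targets that trap on misaligned accesses, dereferencing a cast such as *(uint64_t *)p is undefined behavior, so the strict-align paths go through memcpy() with a compile-time-constant size, which makes no alignment assumption and is typically lowered by optimizing compilers to a single unaligned load/store where the ISA allows it.

    #include <stdint.h>
    #include <string.h>

    /* Alignment-safe 8-byte load/store, the same idiom as the
       BLOSC_STRICT_ALIGN path of copy_8_bytes(). */
    static inline uint64_t load_u64(const unsigned char *p) {
      uint64_t v;
      memcpy(&v, p, sizeof v);  /* no alignment assumption on p */
      return v;
    }

    static inline void store_u64(unsigned char *p, uint64_t v) {
      memcpy(p, &v, sizeof v);
    }
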
  94 |        |
  95 |        |
  96 |    413 | static inline unsigned char *copy_16_bytes(unsigned char *out, const unsigned char *from) {
  97 |    413 | #if defined(__SSE2__)
  98 |    413 |   __m128i chunk;
  99 |    413 |   chunk = _mm_loadu_si128((__m128i*)from);
 100 |    413 |   _mm_storeu_si128((__m128i*)out, chunk);
 101 |    413 |   out += 16;
 102 |        | #elif !defined(BLOSC_STRICT_ALIGN)
 103 |        |   *(uint64_t*)out = *(uint64_t*)from;
 104 |        |    from += 8; out += 8;
 105 |        |    *(uint64_t*)out = *(uint64_t*)from;
 106 |        |    from += 8; out += 8;
 107 |        | #else
 108 |        |    int i;
 109 |        |    for (i = 0; i < 16; i++) {
 110 |        |      *out++ = *from++;
 111 |        |    }
 112 |        | #endif
 113 |    413 |   return out;
 114 |    413 | }
 115 |        |
 116 |      3 | static inline unsigned char *copy_32_bytes(unsigned char *out, const unsigned char *from) {
 117 |        | #if defined(__AVX2__)
 118 |        |   __m256i chunk;
 119 |        |   chunk = _mm256_loadu_si256((__m256i*)from);
 120 |        |   _mm256_storeu_si256((__m256i*)out, chunk);
 121 |        |   out += 32;
 122 |        | #elif defined(__SSE2__)
 123 |        |   __m128i chunk;
 124 |      3 |   chunk = _mm_loadu_si128((__m128i*)from);
 125 |      3 |   _mm_storeu_si128((__m128i*)out, chunk);
 126 |      3 |   from += 16; out += 16;
 127 |      3 |   chunk = _mm_loadu_si128((__m128i*)from);
 128 |      3 |   _mm_storeu_si128((__m128i*)out, chunk);
 129 |      3 |   out += 16;
 130 |        | #elif !defined(BLOSC_STRICT_ALIGN)
 131 |        |   *(uint64_t*)out = *(uint64_t*)from;
 132 |        |   from += 8; out += 8;
 133 |        |   *(uint64_t*)out = *(uint64_t*)from;
 134 |        |   from += 8; out += 8;
 135 |        |   *(uint64_t*)out = *(uint64_t*)from;
 136 |        |   from += 8; out += 8;
 137 |        |   *(uint64_t*)out = *(uint64_t*)from;
 138 |        |   from += 8; out += 8;
 139 |        | #else
 140 |        |   int i;
 141 |        |   for (i = 0; i < 32; i++) {
 142 |        |     *out++ = *from++;
 143 |        |   }
 144 |        | #endif
 145 |      3 |   return out;
 146 |      3 | }
 147 |        |
 148 |        | // This is never used, so comment it out
 149 |        | //#if defined(__AVX2__)
 150 |        | //static inline unsigned char *copy_32_bytes_aligned(unsigned char *out, const unsigned char *from) {
 151 |        | //  __m256i chunk;
 152 |        | //  chunk = _mm256_load_si256((__m256i*)from);
 153 |        | //  _mm256_storeu_si256((__m256i*)out, chunk);
 154 |        | //  return out + 32;
 155 |        | //}
 156 |        | //#endif  // __AVX2__
 157 |        |
 158 |        | /* Copy LEN bytes (7 or fewer) from FROM into OUT. Return OUT + LEN. */
 159 |      4 | static inline unsigned char *copy_bytes(unsigned char *out, const unsigned char *from, unsigned len) {
 160 |      4 |   assert(len < 8);
 161 |        |
 162 |        | #ifdef BLOSC_STRICT_ALIGN
 163 |        |   while (len--) {
 164 |        |     *out++ = *from++;
 165 |        |   }
 166 |        | #else
 167 |      4 |   switch (len) {
 168 |      1 |     case 7:
 169 |      1 |       return copy_7_bytes(out, from);
 170 |      3 |     case 6:
 171 |      3 |       return copy_6_bytes(out, from);
 172 |      0 |     case 5:
 173 |      0 |       return copy_5_bytes(out, from);
 174 |      0 |     case 4:
 175 |      0 |       return copy_4_bytes(out, from);
 176 |      0 |     case 3:
 177 |      0 |       return copy_3_bytes(out, from);
 178 |      0 |     case 2:
 179 |      0 |       return copy_2_bytes(out, from);
 180 |      0 |     case 1:
 181 |      0 |       return copy_1_bytes(out, from);
 182 |      0 |     case 0:
 183 |      0 |       return out;
 184 |      0 |     default:
 185 |      0 |       assert(0);
 186 |      4 |   }
 187 |      0 | #endif /* BLOSC_STRICT_ALIGN */
 188 |      0 |   return out;
 189 |      4 | }
 190 |        |
 191 |        | // Define a symbol for avoiding fall-through warnings emitted by gcc >= 7.0
 192 |        | #if ((defined(__GNUC__) && BLOSC_GCC_VERSION >= 700) && !defined(__clang__) && \
 193 |        |      !defined(__ICC) && !defined(__ICL))
 194 |        | #define AVOID_FALLTHROUGH_WARNING
 195 |        | #endif
 196 |        |
 197 |        | /* Byte by byte semantics: copy LEN bytes from FROM and write them to OUT. Return OUT + LEN. */
 198 |      7 | static inline unsigned char *chunk_memcpy(unsigned char *out, const unsigned char *from, unsigned len) {
 199 |      7 |   unsigned sz = sizeof(uint64_t);
 200 |      7 |   unsigned rem = len % sz;
 201 |      7 |   unsigned by8;
 202 |        |
 203 |      7 |   assert(len >= sz);
 204 |        |
 205 |        |   /* Copy a few bytes to make sure the loop below has a multiple of SZ bytes to be copied. */
 206 |      7 |   copy_8_bytes(out, from);
 207 |        |
 208 |      7 |   len /= sz;
 209 |      7 |   out += rem;
 210 |      7 |   from += rem;
 211 |        |
 212 |      7 |   by8 = len % 8;
 213 |      7 |   len -= by8;
 214 |      7 |   switch (by8) {
 215 |      0 |     case 7:
 216 |      0 |       out = copy_8_bytes(out, from);
 217 |      0 |       from += sz;
 218 |        |       #ifdef AVOID_FALLTHROUGH_WARNING
 219 |        |       __attribute__ ((fallthrough));  // Shut-up -Wimplicit-fallthrough warning in GCC
 220 |        |       #endif
 221 |      0 |     case 6:
 222 |      0 |       out = copy_8_bytes(out, from);
 223 |      0 |       from += sz;
 224 |        |       #ifdef AVOID_FALLTHROUGH_WARNING
 225 |        |       __attribute__ ((fallthrough));
 226 |        |       #endif
 227 |      0 |     case 5:
 228 |      0 |       out = copy_8_bytes(out, from);
 229 |      0 |       from += sz;
 230 |        |       #ifdef AVOID_FALLTHROUGH_WARNING
 231 |        |       __attribute__ ((fallthrough));
 232 |        |       #endif
 233 |      0 |     case 4:
 234 |      0 |       out = copy_8_bytes(out, from);
 235 |      0 |       from += sz;
 236 |        |       #ifdef AVOID_FALLTHROUGH_WARNING
 237 |        |       __attribute__ ((fallthrough));
 238 |        |       #endif
 239 |      0 |     case 3:
 240 |      0 |       out = copy_8_bytes(out, from);
 241 |      0 |       from += sz;
 242 |        |       #ifdef AVOID_FALLTHROUGH_WARNING
 243 |        |       __attribute__ ((fallthrough));
 244 |        |       #endif
 245 |      0 |     case 2:
 246 |      0 |       out = copy_8_bytes(out, from);
 247 |      0 |       from += sz;
 248 |        |       #ifdef AVOID_FALLTHROUGH_WARNING
 249 |        |       __attribute__ ((fallthrough));
 250 |        |       #endif
 251 |      7 |     case 1:
 252 |      7 |       out = copy_8_bytes(out, from);
 253 |      7 |       from += sz;
 254 |        |       #ifdef AVOID_FALLTHROUGH_WARNING
 255 |        |       __attribute__ ((fallthrough));
 256 |        |       #endif
 257 |      7 |     default:
 258 |      7 |       break;
 259 |      7 |   }
 260 |        |
 261 |      7 |   while (len) {
 262 |      0 |     out = copy_8_bytes(out, from);
 263 |      0 |     from += sz;
 264 |      0 |     out = copy_8_bytes(out, from);
 265 |      0 |     from += sz;
 266 |      0 |     out = copy_8_bytes(out, from);
 267 |      0 |     from += sz;
 268 |      0 |     out = copy_8_bytes(out, from);
 269 |      0 |     from += sz;
 270 |      0 |     out = copy_8_bytes(out, from);
 271 |      0 |     from += sz;
 272 |      0 |     out = copy_8_bytes(out, from);
 273 |      0 |     from += sz;
 274 |      0 |     out = copy_8_bytes(out, from);
 275 |      0 |     from += sz;
 276 |      0 |     out = copy_8_bytes(out, from);
 277 |      0 |     from += sz;
 278 |        |
 279 |      0 |     len -= 8;
 280 |      0 |   }
 281 |        |
 282 |      7 |   return out;
 283 |      7 | }
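
A worked example of the arithmetic in chunk_memcpy() (the numbers here are illustrative, not taken from the report): with len = 27, sz = 8 and rem = 27 % 8 = 3. The head copy_8_bytes() writes bytes 0..7, then out and from advance by only rem = 3 bytes, len becomes 27 / 8 = 3 words and by8 = 3, so the fall-through switch performs three more 8-byte copies covering bytes 3..26. Every byte is written at least once; bytes 3..7 are simply written twice, which avoids a scalar tail loop. chunk_memcpy_16() and chunk_memcpy_unaligned() below apply the same head-copy trick with sz = 16 or 32.
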
 284 |        |
 285 |        | #if (defined(__SSE2__) && defined(__AVX2__))
 286 |        | /* 16-byte version of chunk_memcpy() */
 287 |        | static inline unsigned char *chunk_memcpy_16(unsigned char *out, const unsigned char *from, unsigned len) {
 288 |        |   unsigned sz = 16;
 289 |        |   unsigned rem = len % sz;
 290 |        |   unsigned ilen;
 291 |        |
 292 |        |   assert(len >= sz);
 293 |        |
 294 |        |   /* Copy a few bytes to make sure the loop below has a multiple of SZ bytes to be copied. */
 295 |        |   copy_16_bytes(out, from);
 296 |        |
 297 |        |   len /= sz;
 298 |        |   out += rem;
 299 |        |   from += rem;
 300 |        |
 301 |        |   for (ilen = 0; ilen < len; ilen++) {
 302 |        |     copy_16_bytes(out, from);
 303 |        |     out += sz;
 304 |        |     from += sz;
 305 |        |   }
 306 |        |
 307 |        |   return out;
 308 |        | }
 309 |        | #endif
 310 |        |
 311 |        |
 312 |        | // NOTE: chunk_memcpy_32() and chunk_memcpy_32_unrolled() are not used, so commenting them
 313 |        |
 314 |        | ///* 32-byte version of chunk_memcpy() */
 315 |        | //static inline unsigned char *chunk_memcpy_32(unsigned char *out, const unsigned char *from, unsigned len) {
 316 |        | //  unsigned sz = 32;
 317 |        | //  unsigned rem = len % sz;
 318 |        | //  unsigned ilen;
 319 |        | //
 320 |        | //  assert(len >= sz);
 321 |        | //
 322 |        | //  /* Copy a few bytes to make sure the loop below has a multiple of SZ bytes to be copied. */
 323 |        | //  copy_32_bytes(out, from);
 324 |        | //
 325 |        | //  len /= sz;
 326 |        | //  out += rem;
 327 |        | //  from += rem;
 328 |        | //
 329 |        | //  for (ilen = 0; ilen < len; ilen++) {
 330 |        | //    copy_32_bytes(out, from);
 331 |        | //    out += sz;
 332 |        | //    from += sz;
 333 |        | //  }
 334 |        | //
 335 |        | //  return out;
 336 |        | //}
 337 |        | //
 338 |        | ///* 32-byte *unrolled* version of chunk_memcpy() */
 339 |        | //static inline unsigned char *chunk_memcpy_32_unrolled(unsigned char *out, const unsigned char *from, unsigned len) {
 340 |        | //  unsigned sz = 32;
 341 |        | //  unsigned rem = len % sz;
 342 |        | //  unsigned by8;
 343 |        | //
 344 |        | //  assert(len >= sz);
 345 |        | //
 346 |        | //  /* Copy a few bytes to make sure the loop below has a multiple of SZ bytes to be copied. */
 347 |        | //  copy_32_bytes(out, from);
 348 |        | //
 349 |        | //  len /= sz;
 350 |        | //  out += rem;
 351 |        | //  from += rem;
 352 |        | //
 353 |        | //  by8 = len % 8;
 354 |        | //  len -= by8;
 355 |        | //  switch (by8) {
 356 |        | //    case 7:
 357 |        | //      out = copy_32_bytes(out, from);
 358 |        | //      from += sz;
 359 |        | //    case 6:
 360 |        | //      out = copy_32_bytes(out, from);
 361 |        | //      from += sz;
 362 |        | //    case 5:
 363 |        | //      out = copy_32_bytes(out, from);
 364 |        | //      from += sz;
 365 |        | //    case 4:
 366 |        | //      out = copy_32_bytes(out, from);
 367 |        | //      from += sz;
 368 |        | //    case 3:
 369 |        | //      out = copy_32_bytes(out, from);
 370 |        | //      from += sz;
 371 |        | //    case 2:
 372 |        | //      out = copy_32_bytes(out, from);
 373 |        | //      from += sz;
 374 |        | //    case 1:
 375 |        | //      out = copy_32_bytes(out, from);
 376 |        | //      from += sz;
 377 |        | //    default:
 378 |        | //      break;
 379 |        | //  }
 380 |        | //
 381 |        | //  while (len) {
 382 |        | //    out = copy_32_bytes(out, from);
 383 |        | //    from += sz;
 384 |        | //    out = copy_32_bytes(out, from);
 385 |        | //    from += sz;
 386 |        | //    out = copy_32_bytes(out, from);
 387 |        | //    from += sz;
 388 |        | //    out = copy_32_bytes(out, from);
 389 |        | //    from += sz;
 390 |        | //    out = copy_32_bytes(out, from);
 391 |        | //    from += sz;
 392 |        | //    out = copy_32_bytes(out, from);
 393 |        | //    from += sz;
 394 |        | //    out = copy_32_bytes(out, from);
 395 |        | //    from += sz;
 396 |        | //    out = copy_32_bytes(out, from);
 397 |        | //    from += sz;
 398 |        | //
 399 |        | //    len -= 8;
 400 |        | //  }
 401 |        | //
 402 |        | //  return out;
 403 |        | //}
 404 |        |
 405 |        |
 406 |        | /* SSE2/AVX2 *unaligned* version of chunk_memcpy() */
 407 |        | #if defined(__SSE2__) || defined(__AVX2__)
 408 |     19 | static inline unsigned char *chunk_memcpy_unaligned(unsigned char *out, const unsigned char *from, unsigned len) {
 409 |        | #if defined(__AVX2__)
 410 |        |   unsigned sz = sizeof(__m256i);
 411 |        | #elif defined(__SSE2__)
 412 |        |   unsigned sz = sizeof(__m128i);
 413 |     19 | #endif
 414 |     19 |   unsigned rem = len % sz;
 415 |     19 |   unsigned ilen;
 416 |        |
 417 |     19 |   assert(len >= sz);
 418 |        |
 419 |        |   /* Copy a few bytes to make sure the loop below has a multiple of SZ bytes to be copied. */
 420 |        | #if defined(__AVX2__)
 421 |        |   copy_32_bytes(out, from);
 422 |        | #elif defined(__SSE2__)
 423 |        |   copy_16_bytes(out, from);
 424 |     19 | #endif
 425 |        |
 426 |     19 |   len /= sz;
 427 |     19 |   out += rem;
 428 |     19 |   from += rem;
 429 |        |
 430 |    406 |   for (ilen = 0; ilen < len; ilen++) {
 431 |        | #if defined(__AVX2__)
 432 |        |     copy_32_bytes(out, from);
 433 |        | #elif defined(__SSE2__)
 434 |        |     copy_16_bytes(out, from);
 435 |    387 | #endif
 436 |    387 |     out += sz;
 437 |    387 |     from += sz;
 438 |    387 |   }
 439 |        |
 440 |     19 |   return out;
 441 |     19 | }
 442 |        | #endif // __SSE2__ || __AVX2__
 443 |        |
 444 |        |
 445 |        | // NOTE: chunk_memcpy_aligned() is not used, so commenting it
 446 |        |
 447 |        | //#if defined(__SSE2__) || defined(__AVX2__)
 448 |        | ///* SSE2/AVX2 *aligned* version of chunk_memcpy() */
 449 |        | //static inline unsigned char *chunk_memcpy_aligned(unsigned char *out, const unsigned char *from, unsigned len) {
 450 |        | //#if defined(__AVX2__)
 451 |        | //  unsigned sz = sizeof(__m256i);
 452 |        | //  __m256i chunk;
 453 |        | //#elif defined(__SSE2__)
 454 |        | //  unsigned sz = sizeof(__m128i);
 455 |        | //  __m128i chunk;
 456 |        | //#endif
 457 |        | //  unsigned bytes_to_align = sz - (unsigned)(((uintptr_t)(const void *)(from)) % sz);
 458 |        | //  unsigned corrected_len = len - bytes_to_align;
 459 |        | //  unsigned rem = corrected_len % sz;
 460 |        | //  unsigned ilen;
 461 |        | //
 462 |        | //  assert(len >= sz);
 463 |        | //
 464 |        | //  /* Copy a few bytes to make sure the loop below has aligned access. */
 465 |        | //#if defined(__AVX2__)
 466 |        | //  chunk = _mm256_loadu_si256((__m256i *) from);
 467 |        | //  _mm256_storeu_si256((__m256i *) out, chunk);
 468 |        | //#elif defined(__SSE2__)
 469 |        | //  chunk = _mm_loadu_si128((__m128i *) from);
 470 |        | //  _mm_storeu_si128((__m128i *) out, chunk);
 471 |        | //#endif
 472 |        | //  out += bytes_to_align;
 473 |        | //  from += bytes_to_align;
 474 |        | //
 475 |        | //  len = corrected_len / sz;
 476 |        | //  for (ilen = 0; ilen < len; ilen++) {
 477 |        | //#if defined(__AVX2__)
 478 |        | //    chunk = _mm256_load_si256((__m256i *) from);  /* *aligned* load */
 479 |        | //    _mm256_storeu_si256((__m256i *) out, chunk);
 480 |        | //#elif defined(__SSE2__)
 481 |        | //    chunk = _mm_load_si128((__m128i *) from);  /* *aligned* load */
 482 |        | //    _mm_storeu_si128((__m128i *) out, chunk);
 483 |        | //#endif
 484 |        | //    out += sz;
 485 |        | //    from += sz;
 486 |        | //  }
 487 |        | //
 488 |        | //  /* Copy remaining bytes */
 489 |        | //  if (rem < 8) {
 490 |        | //    out = copy_bytes(out, from, rem);
 491 |        | //  }
 492 |        | //  else {
 493 |        | //    out = chunk_memcpy(out, from, rem);
 494 |        | //  }
 495 |        | //
 496 |        | //  return out;
 497 |        | //}
 498 |        | //#endif // __AVX2__ || __SSE2__
 499 |        |
 500 |        |
 501 |        | /* Byte by byte semantics: copy LEN bytes from FROM and write them to OUT. Return OUT + LEN. */
 502 |     43 | unsigned char *fastcopy(unsigned char *out, const unsigned char *from, unsigned len) {
 503 |     43 |   switch (len) {
 504 |      3 |     case 32:
 505 |      3 |       return copy_32_bytes(out, from);
 506 |      7 |     case 16:
 507 |      7 |       return copy_16_bytes(out, from);
 508 |      3 |     case 8:
 509 |      3 |       return copy_8_bytes(out, from);
 510 |     30 |     default: {
 511 |     30 |     }
 512 |     43 |   }
 513 |     30 |   if (len < 8) {
 514 |      4 |     return copy_bytes(out, from, len);
 515 |      4 |   }
 516 |     26 | #if defined(__SSE2__)
 517 |     26 |   if (len < 16) {
 518 |      7 |     return chunk_memcpy(out, from, len);
 519 |      7 |   }
 520 |     19 | #if !defined(__AVX2__)
 521 |     19 |   return chunk_memcpy_unaligned(out, from, len);
 522 |        | #else
 523 |        |   if (len < 32) {
 524 |        |     return chunk_memcpy_16(out, from, len);
 525 |        |   }
 526 |        |   return chunk_memcpy_unaligned(out, from, len);
 527 |        | #endif  // !__AVX2__
 528 |        | #else
 529 |        |   return chunk_memcpy(out, from, len);
 530 |        | #endif  // __SSE2__
 531 |     26 | }
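
A minimal usage sketch for fastcopy(). Note that fastcopy() and copy_match() are internal helpers of the library rather than part of the public Blosc2 API, so this assumes fastcopy.c is compiled and linked into the program directly; the prototype is restated below only for illustration.

    #include <stdio.h>

    unsigned char *fastcopy(unsigned char *out, const unsigned char *from, unsigned len);

    int main(void) {
      unsigned char src[40] = "the quick brown fox jumps over the dog";
      unsigned char dst[40] = {0};
      /* Non-overlapping buffers: fastcopy() dispatches to the widest copy
         routine compiled in (SSE2/AVX2 vectors or 8-byte words). */
      unsigned char *end = fastcopy(dst, src, 38);
      printf("copied %d bytes: %s\n", (int)(end - dst), dst);
      return 0;
    }
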
 532 |        |
 533 |        |
 534 |        | /* Copy a run */
 535 |  1.00k | unsigned char* copy_match(unsigned char *out, const unsigned char *from, unsigned len) {
 536 |        | #if defined(__AVX2__)
 537 |        |   unsigned sz = sizeof(__m256i);
 538 |        | #elif defined(__SSE2__)
 539 |        |   unsigned sz = sizeof(__m128i);
 540 |        | #else
 541 |        |   unsigned sz = sizeof(uint64_t);
 542 |        | #endif
 543 |        |
 544 |        | #if ((defined(__GNUC__) && BLOSC_GCC_VERSION < 800) && !defined(__clang__) && !defined(__ICC) && !defined(__ICL))
 545 |        |   // GCC < 8 in fully optimization mode seems to have problems with the code further below so stop here
 546 |        |   for (; len > 0; len--) {
 547 |        |     *out++ = *from++;
 548 |        |   }
 549 |        |   return out;
 550 |        | #endif
 551 |        |
 552 |        |   // If out and from are away more than the size of the copy, then a fastcopy is safe
 553 |  1.00k |   unsigned overlap_dist = (unsigned) (out - from);
 554 |  1.00k |   if (overlap_dist > sz) {
 555 |     43 |     return fastcopy(out, from, len);
 556 |     43 |   }
 557 |        |
 558 |        |   // Otherwise we need to be more careful so as not to overwrite destination
 559 |    964 |   switch (overlap_dist) {
 560 |      0 |     case 32:
 561 |      0 |       for (; len >= 32; len -= 32) {
 562 |      0 |         out = copy_32_bytes(out, from);
 563 |      0 |       }
 564 |      0 |       break;
 565 |      0 |     case 30:
 566 |      0 |       for (; len >= 30; len -= 30) {
 567 |      0 |         out = copy_16_bytes(out, from);
 568 |      0 |         out = copy_8_bytes(out, from + 16);
 569 |      0 |         out = copy_4_bytes(out, from + 24);
 570 |      0 |         out = copy_2_bytes(out, from + 28);
 571 |      0 |       }
 572 |      0 |       break;
 573 |      0 |     case 28:
 574 |      0 |       for (; len >= 28; len -= 28) {
 575 |      0 |         out = copy_16_bytes(out, from);
 576 |      0 |         out = copy_8_bytes(out, from + 16);
 577 |      0 |         out = copy_4_bytes(out, from + 24);
 578 |      0 |       }
 579 |      0 |       break;
 580 |      0 |     case 26:
 581 |      0 |       for (; len >= 26; len -= 26) {
 582 |      0 |         out = copy_16_bytes(out, from);
 583 |      0 |         out = copy_8_bytes(out, from + 16);
 584 |      0 |         out = copy_2_bytes(out, from + 24);
 585 |      0 |       }
 586 |      0 |       break;
 587 |      0 |     case 24:
 588 |      0 |       for (; len >= 24; len -= 24) {
 589 |      0 |         out = copy_16_bytes(out, from);
 590 |      0 |         out = copy_8_bytes(out, from + 16);
 591 |      0 |       }
 592 |      0 |       break;
 593 |      0 |     case 22:
 594 |      0 |       for (; len >= 22; len -= 22) {
 595 |      0 |         out = copy_16_bytes(out, from);
 596 |      0 |         out = copy_4_bytes(out, from + 16);
 597 |      0 |         out = copy_2_bytes(out, from + 20);
 598 |      0 |       }
 599 |      0 |       break;
 600 |      0 |     case 20:
 601 |      0 |       for (; len >= 20; len -= 20) {
 602 |      0 |         out = copy_16_bytes(out, from);
 603 |      0 |         out = copy_4_bytes(out, from + 16);
 604 |      0 |       }
 605 |      0 |       break;
 606 |      0 |     case 18:
 607 |      0 |       for (; len >= 18; len -= 18) {
 608 |      0 |         out = copy_16_bytes(out, from);
 609 |      0 |         out = copy_2_bytes(out, from + 16);
 610 |      0 |       }
 611 |      0 |       break;
 612 |      0 |     case 16:
 613 |      0 |       for (; len >= 16; len -= 16) {
 614 |      0 |         out = copy_16_bytes(out, from);
 615 |      0 |       }
 616 |      0 |       break;
 617 |      0 |     case 8:
 618 |      0 |       for (; len >= 8; len -= 8) {
 619 |      0 |         out = copy_8_bytes(out, from);
 620 |      0 |       }
 621 |      0 |       break;
 622 |    196 |     case 4:
 623 |    472 |       for (; len >= 4; len -= 4) {
 624 |    276 |         out = copy_4_bytes(out, from);
 625 |    276 |       }
 626 |    196 |       break;
 627 |    127 |     case 2:
 628 |    717 |       for (; len >= 2; len -= 2) {
 629 |    590 |         out = copy_2_bytes(out, from);
 630 |    590 |       }
 631 |    127 |       break;
 632 |    641 |     default:
 633 |  5.23k |       for (; len > 0; len--) {
 634 |  4.59k |         *out++ = *from++;
 635 |  4.59k |       }
 636 |    964 |   }
 637 |        |
 638 |        |   // Copy the leftovers
 639 |  1.28k |   for (; len > 0; len--) {
 640 |    322 |     *out++ = *from++;
 641 |    322 |   }
 642 |        |
 643 |    964 |   return out;
 644 |    964 | }
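
And a sketch of the overlapping case that copy_match() handles (same assumption as above: internal helper, prototype restated only for illustration). With an overlap distance of 2 the routine replicates the 2-byte pattern itself instead of deferring to fastcopy(), whose wide copies would read source bytes that the byte-by-byte semantics has not produced yet.

    #include <stdio.h>

    unsigned char *copy_match(unsigned char *out, const unsigned char *from, unsigned len);

    int main(void) {
      unsigned char buf[16] = { 'a', 'b' };   /* 2-byte seed pattern */
      /* Destination starts 2 bytes past the source, so overlap_dist == 2:
         the case 2 loop copies the pair repeatedly, yielding "abab...". */
      copy_match(buf + 2, buf, 14);
      fwrite(buf, 1, sizeof buf, stdout);     /* prints "abababababababab" */
      putchar('\n');
      return 0;
    }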