Coverage Report

Created: 2025-07-11 06:53

/src/htslib/simd.c
Line
Count
Source
1
/*  simd.c -- SIMD optimised versions of various internal functions.
2
3
    Copyright (C) 2024 Genome Research Ltd.
4
5
Permission is hereby granted, free of charge, to any person obtaining a copy
6
of this software and associated documentation files (the "Software"), to deal
7
in the Software without restriction, including without limitation the rights
8
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
copies of the Software, and to permit persons to whom the Software is
10
furnished to do so, subject to the following conditions:
11
12
The above copyright notice and this permission notice shall be included in
13
all copies or substantial portions of the Software.
14
15
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21
DEALINGS IN THE SOFTWARE.  */
22
23
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
24
#include <config.h>
25
26
// These must be defined before the first system include to ensure that legacy
27
// BSD types needed by <sys/sysctl.h> remain defined when _XOPEN_SOURCE is set.
28
#if defined __APPLE__
29
#define _DARWIN_C_SOURCE
30
#elif defined __NetBSD__
31
#define _NETBSD_SOURCE
32
#endif
33
34
#include "htslib/sam.h"
35
#include "sam_internal.h"
36
37
#if defined __x86_64__
38
#include <immintrin.h>
39
#elif defined __ARM_NEON
40
#include <arm_neon.h>
41
#endif
42
43
#if defined __arm__ || defined __aarch64__
44
45
#ifdef HAVE_OPENBSD
46
/*
47
 * Extra check for elf_aux_info() on configure-less OpenBSD builds.  Once
48
 * version 7.5 has dropped off support, this can be changed to an assumption
49
 * that the function exists in the Makefile-generated config.h.
50
 */
51
#include <sys/param.h>
52
#if OpenBSD >= 202409
53
#define HAVE_ELF_AUX_INFO
54
#endif
55
#endif
56
57
#if defined HAVE_GETAUXVAL || defined HAVE_ELF_AUX_INFO
58
#include <sys/auxv.h>
59
#elif defined __APPLE__
60
#include <sys/types.h>
61
#include <sys/sysctl.h>
62
#elif defined __NetBSD__
63
#include <stddef.h>
64
#include <sys/param.h>
65
#include <sys/sysctl.h>
66
#ifdef __aarch64__
67
#include <aarch64/armreg.h>
68
#else
69
#include <arm/armreg.h>
70
#endif
71
#elif defined _WIN32
72
#include <processthreadsapi.h>
73
#endif
74
75
static inline int cpu_supports_neon(void) {
76
#if defined HAVE_GETAUXVAL && defined __arm__ && defined HWCAP_NEON
77
    return (getauxval(AT_HWCAP) & HWCAP_NEON) != 0;
78
#elif defined HAVE_GETAUXVAL && defined __arm__ && defined HWCAP_ARM_NEON
79
    return (getauxval(AT_HWCAP) & HWCAP_ARM_NEON) != 0;
80
#elif defined HAVE_GETAUXVAL && defined __aarch64__ && defined HWCAP_ASIMD
81
    return (getauxval(AT_HWCAP) & HWCAP_ASIMD) != 0;
82
#elif defined __APPLE__ && defined __aarch64__
83
    int32_t ctl;
84
    size_t ctlsize = sizeof ctl;
85
    if (sysctlbyname("hw.optional.AdvSIMD", &ctl, &ctlsize, NULL, 0) != 0) return 0;
86
    if (ctlsize != sizeof ctl) return 0;
87
    return ctl;
88
#elif defined HAVE_ELF_AUX_INFO && defined __arm__ && defined HWCAP_NEON
89
    unsigned long cap;
90
    if (elf_aux_info(AT_HWCAP, &cap, sizeof cap) != 0) return 0;
91
    return (cap & HWCAP_NEON) != 0;
92
#elif defined HAVE_ELF_AUX_INFO && defined __aarch64__ && defined HWCAP_ASIMD
93
    unsigned long cap;
94
    if (elf_aux_info(AT_HWCAP, &cap, sizeof cap) != 0) return 0;
95
    return (cap & HWCAP_ASIMD) != 0;
96
#elif defined __NetBSD__ && defined __arm__ && defined ARM_MVFR0_ASIMD_MASK
97
    uint32_t buf[16];
98
    size_t buflen = sizeof buf;
99
    if (sysctlbyname("machdep.id_mvfr", buf, &buflen, NULL, 0) != 0) return 0;
100
    if (buflen < sizeof(uint32_t)) return 0;
101
    return (buf[0] & ARM_MVFR0_ASIMD_MASK) == 0x00000002;
102
#elif defined __NetBSD__ && defined __aarch64__ && defined ID_AA64PFR0_EL1_ADVSIMD
103
    struct aarch64_sysctl_cpu_id buf;
104
    size_t buflen = sizeof buf;
105
    if (sysctlbyname("machdep.cpu0.cpu_id", &buf, &buflen, NULL, 0) != 0) return 0;
106
    if (buflen < offsetof(struct aarch64_sysctl_cpu_id, ac_aa64pfr0) + sizeof(uint64_t)) return 0;
107
    return (buf.ac_aa64pfr0 & ID_AA64PFR0_EL1_ADVSIMD & 0x00e00000) == 0;
108
#elif defined _WIN32
109
    return IsProcessorFeaturePresent(PF_ARM_V8_INSTRUCTIONS_AVAILABLE) != 0;
110
#else
111
    return 0;
112
#endif
113
}
114
115
#endif
116
117
#ifdef BUILDING_SIMD_NIBBLE2BASE
118
119
void (*htslib_nibble2base)(uint8_t *nib, char *seq, int len) = nibble2base_default;
120
121
#if defined __x86_64__
122
123
/*
124
 * Convert a nibble encoded BAM sequence to a string of bases.
125
 *
126
 * Using SSSE3 instructions, 16 codepoints that hold 2 bases each can be
127
 * unpacked into 32 indexes from 0-15. Using the pshufb instruction these can
128
 * be converted to the IUPAC characters.
129
 * It falls back on the nibble2base_default function for the remainder.
130
 */
131
132
__attribute__((target("ssse3")))
133
1.70M
static void nibble2base_ssse3(uint8_t *nib, char *seq, int len) {
134
1.70M
    const char *seq_end_ptr = seq + len;
135
1.70M
    char *seq_cursor = seq;
136
1.70M
    uint8_t *nibble_cursor = nib;
137
1.70M
    const char *seq_vec_end_ptr = seq_end_ptr - (2 * sizeof(__m128i) - 1);
138
1.70M
    __m128i nuc_lookup_vec = _mm_lddqu_si128((__m128i *)seq_nt16_str);
139
    /* Nucleotides are encoded 4-bits per nucleotide and stored in 8-bit bytes
140
       as follows: |AB|CD|EF|GH|. The 4-bit codes (going from 0-15) can be used
141
       together with the pshufb instruction as a lookup table. The most efficient
142
       way is to use bitwise AND and shift to create two vectors. One with all
143
       the upper codes (|A|C|E|G|) and one with the lower codes (|B|D|F|H|).
144
       The lookup can then be performed and the resulting vectors can be
145
       interleaved again using the unpack instructions. */
146
11.6M
    while (seq_cursor < seq_vec_end_ptr) {
147
9.96M
        __m128i encoded = _mm_lddqu_si128((__m128i *)nibble_cursor);
148
9.96M
        __m128i encoded_upper = _mm_srli_epi64(encoded, 4);
149
9.96M
        encoded_upper = _mm_and_si128(encoded_upper, _mm_set1_epi8(15));
150
9.96M
        __m128i encoded_lower = _mm_and_si128(encoded, _mm_set1_epi8(15));
151
9.96M
        __m128i nucs_upper = _mm_shuffle_epi8(nuc_lookup_vec, encoded_upper);
152
9.96M
        __m128i nucs_lower = _mm_shuffle_epi8(nuc_lookup_vec, encoded_lower);
153
9.96M
        __m128i first_nucleotides = _mm_unpacklo_epi8(nucs_upper, nucs_lower);
154
9.96M
        __m128i second_nucleotides = _mm_unpackhi_epi8(nucs_upper, nucs_lower);
155
9.96M
        _mm_storeu_si128((__m128i *)seq_cursor, first_nucleotides);
156
9.96M
        _mm_storeu_si128((__m128i *)(seq_cursor + sizeof(__m128i)),
157
9.96M
                         second_nucleotides);
158
9.96M
        nibble_cursor += sizeof(__m128i);
159
9.96M
        seq_cursor += 2 * sizeof(__m128i);
160
9.96M
    }
161
1.70M
    nibble2base_default(nibble_cursor, seq_cursor, seq_end_ptr - seq_cursor);
162
1.70M
}
163
164
__attribute__((constructor))
165
2
static void nibble2base_resolve(void) {
166
2
    if (__builtin_cpu_supports("ssse3")) {
167
2
        htslib_nibble2base = nibble2base_ssse3;
168
2
    }
169
2
}
170
171
#elif defined __ARM_NEON
172
173
static void nibble2base_neon(uint8_t *nib, char *seq0, int len) {
174
    uint8x16_t low_nibbles_mask = vdupq_n_u8(0x0f);
175
    uint8x16_t nuc_lookup_vec = vld1q_u8((const uint8_t *) seq_nt16_str);
176
#ifndef __aarch64__
177
    uint8x8x2_t nuc_lookup_vec2 = {{ vget_low_u8(nuc_lookup_vec), vget_high_u8(nuc_lookup_vec) }};
178
#endif
179
180
    uint8_t *seq = (uint8_t *) seq0;
181
    int blocks;
182
183
    for (blocks = len / 32; blocks > 0; --blocks) {
184
        uint8x16_t encoded = vld1q_u8(nib);
185
        nib += 16;
186
187
        /* Translate the high and low nibbles to nucleotide letters separately,
188
           then interleave them back together via vzipq for writing. */
189
190
        uint8x16_t high_nibbles = vshrq_n_u8(encoded, 4);
191
        uint8x16_t low_nibbles  = vandq_u8(encoded, low_nibbles_mask);
192
193
#ifdef __aarch64__
194
        uint8x16_t high_nucleotides = vqtbl1q_u8(nuc_lookup_vec, high_nibbles);
195
        uint8x16_t low_nucleotides  = vqtbl1q_u8(nuc_lookup_vec, low_nibbles);
196
#else
197
        uint8x8_t high_low  = vtbl2_u8(nuc_lookup_vec2, vget_low_u8(high_nibbles));
198
        uint8x8_t high_high = vtbl2_u8(nuc_lookup_vec2, vget_high_u8(high_nibbles));
199
        uint8x16_t high_nucleotides = vcombine_u8(high_low, high_high);
200
201
        uint8x8_t low_low  = vtbl2_u8(nuc_lookup_vec2, vget_low_u8(low_nibbles));
202
        uint8x8_t low_high = vtbl2_u8(nuc_lookup_vec2, vget_high_u8(low_nibbles));
203
        uint8x16_t low_nucleotides = vcombine_u8(low_low, low_high);
204
#endif
205
206
#ifdef __aarch64__
207
        vst1q_u8_x2(seq, vzipq_u8(high_nucleotides, low_nucleotides));
208
#else
209
        // Avoid vst1q_u8_x2 as GCC erroneously omits it on 32-bit ARM
210
        uint8x16x2_t nucleotides = {{ high_nucleotides, low_nucleotides }};
211
        vst2q_u8(seq, nucleotides);
212
#endif
213
        seq += 32;
214
    }
215
216
    if (len % 32 != 0)
217
        nibble2base_default(nib, (char *) seq, len % 32);
218
}
219
220
static __attribute__((constructor)) void nibble2base_resolve(void) {
221
    if (cpu_supports_neon()) htslib_nibble2base = nibble2base_neon;
222
}
223
224
#endif
225
226
#endif // BUILDING_SIMD_NIBBLE2BASE
227
228
// Potentially useful diagnostic, and prevents "empty translation unit" errors
229
const char htslib_simd[] =
230
    "SIMD functions present:"
231
#ifdef BUILDING_SIMD_NIBBLE2BASE
232
    " nibble2base"
233
#endif
234
    ".";