Line | Count | Source |
1 | | /* simd.c -- SIMD optimised versions of various internal functions. |
2 | | |
3 | | Copyright (C) 2024 Genome Research Ltd. |
4 | | |
5 | | Permission is hereby granted, free of charge, to any person obtaining a copy |
6 | | of this software and associated documentation files (the "Software"), to deal |
7 | | in the Software without restriction, including without limitation the rights |
8 | | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
9 | | copies of the Software, and to permit persons to whom the Software is |
10 | | furnished to do so, subject to the following conditions: |
11 | | |
12 | | The above copyright notice and this permission notice shall be included in |
13 | | all copies or substantial portions of the Software. |
14 | | |
15 | | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
16 | | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
17 | | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
18 | | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
19 | | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
20 | | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
21 | | DEALINGS IN THE SOFTWARE. */ |
22 | | |
23 | | #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h |
24 | | #include <config.h> |
25 | | |
26 | | // These must be defined before the first system include to ensure that legacy |
27 | | // BSD types needed by <sys/sysctl.h> remain defined when _XOPEN_SOURCE is set. |
28 | | #if defined __APPLE__ |
29 | | #define _DARWIN_C_SOURCE |
30 | | #elif defined __NetBSD__ |
31 | | #define _NETBSD_SOURCE |
32 | | #endif |
33 | | |
34 | | #include "htslib/sam.h" |
35 | | #include "sam_internal.h" |
36 | | |
37 | | #if defined __x86_64__ |
38 | | #include <immintrin.h> |
39 | | #elif defined __ARM_NEON |
40 | | #include <arm_neon.h> |
41 | | #endif |
42 | | |
43 | | #if defined __arm__ || defined __aarch64__ |
44 | | |
45 | | #ifdef HAVE_OPENBSD |
46 | | /* |
47 | | * Extra check for elf_aux_info() on configure-less OpenBSD builds. Once |
48 | | * version 7.5 has dropped off support, this can be changed to an assumption |
49 | | * that the function exists in the Makefile-generated config.h. |
50 | | */ |
51 | | #include <sys/param.h> |
52 | | #if OpenBSD >= 202409 |
53 | | #define HAVE_ELF_AUX_INFO |
54 | | #endif |
55 | | #endif |
56 | | |
57 | | #if defined HAVE_GETAUXVAL || defined HAVE_ELF_AUX_INFO |
58 | | #include <sys/auxv.h> |
59 | | #elif defined __APPLE__ |
60 | | #include <sys/types.h> |
61 | | #include <sys/sysctl.h> |
62 | | #elif defined __NetBSD__ |
63 | | #include <stddef.h> |
64 | | #include <sys/param.h> |
65 | | #include <sys/sysctl.h> |
66 | | #ifdef __aarch64__ |
67 | | #include <aarch64/armreg.h> |
68 | | #else |
69 | | #include <arm/armreg.h> |
70 | | #endif |
71 | | #elif defined _WIN32 |
72 | | #include <processthreadsapi.h> |
73 | | #endif |
74 | | |
75 | | static inline int cpu_supports_neon(void) { |
76 | | #if defined HAVE_GETAUXVAL && defined __arm__ && defined HWCAP_NEON |
77 | | return (getauxval(AT_HWCAP) & HWCAP_NEON) != 0; |
78 | | #elif defined HAVE_GETAUXVAL && defined __arm__ && defined HWCAP_ARM_NEON |
79 | | return (getauxval(AT_HWCAP) & HWCAP_ARM_NEON) != 0; |
80 | | #elif defined HAVE_GETAUXVAL && defined __aarch64__ && defined HWCAP_ASIMD |
81 | | return (getauxval(AT_HWCAP) & HWCAP_ASIMD) != 0; |
82 | | #elif defined __APPLE__ && defined __aarch64__ |
83 | | int32_t ctl; |
84 | | size_t ctlsize = sizeof ctl; |
85 | | if (sysctlbyname("hw.optional.AdvSIMD", &ctl, &ctlsize, NULL, 0) != 0) return 0; |
86 | | if (ctlsize != sizeof ctl) return 0; |
87 | | return ctl; |
88 | | #elif defined HAVE_ELF_AUX_INFO && defined __arm__ && defined HWCAP_NEON |
89 | | unsigned long cap; |
90 | | if (elf_aux_info(AT_HWCAP, &cap, sizeof cap) != 0) return 0; |
91 | | return (cap & HWCAP_NEON) != 0; |
92 | | #elif defined HAVE_ELF_AUX_INFO && defined __aarch64__ && defined HWCAP_ASIMD |
93 | | unsigned long cap; |
94 | | if (elf_aux_info(AT_HWCAP, &cap, sizeof cap) != 0) return 0; |
95 | | return (cap & HWCAP_ASIMD) != 0; |
96 | | #elif defined __NetBSD__ && defined __arm__ && defined ARM_MVFR0_ASIMD_MASK |
97 | | uint32_t buf[16]; |
98 | | size_t buflen = sizeof buf; |
99 | | if (sysctlbyname("machdep.id_mvfr", buf, &buflen, NULL, 0) != 0) return 0; |
100 | | if (buflen < sizeof(uint32_t)) return 0; |
101 | | return (buf[0] & ARM_MVFR0_ASIMD_MASK) == 0x00000002; |
102 | | #elif defined __NetBSD__ && defined __aarch64__ && defined ID_AA64PFR0_EL1_ADVSIMD |
103 | | struct aarch64_sysctl_cpu_id buf; |
104 | | size_t buflen = sizeof buf; |
105 | | if (sysctlbyname("machdep.cpu0.cpu_id", &buf, &buflen, NULL, 0) != 0) return 0; |
106 | | if (buflen < offsetof(struct aarch64_sysctl_cpu_id, ac_aa64pfr0) + sizeof(uint64_t)) return 0; |
107 | | return (buf.ac_aa64pfr0 & ID_AA64PFR0_EL1_ADVSIMD & 0x00e00000) == 0; |
108 | | #elif defined _WIN32 |
109 | | return IsProcessorFeaturePresent(PF_ARM_V8_INSTRUCTIONS_AVAILABLE) != 0; |
110 | | #else |
111 | | return 0; |
112 | | #endif |
113 | | } |
114 | | |
115 | | #endif |
116 | | |
117 | | #ifdef BUILDING_SIMD_NIBBLE2BASE |
118 | | |
119 | | void (*htslib_nibble2base)(uint8_t *nib, char *seq, int len) = nibble2base_default; |
120 | | |
121 | | #if defined __x86_64__ |
122 | | |
123 | | /* |
124 | | * Convert a nibble encoded BAM sequence to a string of bases. |
125 | | * |
126 | | * Using SSSE3 instructions, 16 codepoints that hold 2 bases each can be |
127 | | * unpacked into 32 indexes from 0-15. Using the pshufb instruction these can |
128 | | * be converted to the IUPAC characters. |
129 | | * It falls back on the nibble2base_default function for the remainder. |
130 | | */ |
131 | | |
132 | | __attribute__((target("ssse3"))) |
133 | 1.70M | static void nibble2base_ssse3(uint8_t *nib, char *seq, int len) { |
134 | 1.70M | const char *seq_end_ptr = seq + len; |
135 | 1.70M | char *seq_cursor = seq; |
136 | 1.70M | uint8_t *nibble_cursor = nib; |
137 | 1.70M | const char *seq_vec_end_ptr = seq_end_ptr - (2 * sizeof(__m128i) - 1); |
138 | 1.70M | __m128i nuc_lookup_vec = _mm_lddqu_si128((__m128i *)seq_nt16_str); |
139 | | /* Nucleotides are encoded 4-bits per nucleotide and stored in 8-bit bytes |
140 | | as follows: |AB|CD|EF|GH|. The 4-bit codes (going from 0-15) can be used |
141 | | together with the pshufb instruction as a lookup table. The most efficient |
142 | | way is to use bitwise AND and shift to create two vectors. One with all |
143 | | the upper codes (|A|C|E|G|) and one with the lower codes (|B|D|F|H|). |
144 | | The lookup can then be performed and the resulting vectors can be |
145 | | interleaved again using the unpack instructions. */ |
146 | 11.6M | while (seq_cursor < seq_vec_end_ptr) { |
147 | 9.96M | __m128i encoded = _mm_lddqu_si128((__m128i *)nibble_cursor); |
148 | 9.96M | __m128i encoded_upper = _mm_srli_epi64(encoded, 4); |
149 | 9.96M | encoded_upper = _mm_and_si128(encoded_upper, _mm_set1_epi8(15)); |
150 | 9.96M | __m128i encoded_lower = _mm_and_si128(encoded, _mm_set1_epi8(15)); |
151 | 9.96M | __m128i nucs_upper = _mm_shuffle_epi8(nuc_lookup_vec, encoded_upper); |
152 | 9.96M | __m128i nucs_lower = _mm_shuffle_epi8(nuc_lookup_vec, encoded_lower); |
153 | 9.96M | __m128i first_nucleotides = _mm_unpacklo_epi8(nucs_upper, nucs_lower); |
154 | 9.96M | __m128i second_nucleotides = _mm_unpackhi_epi8(nucs_upper, nucs_lower); |
155 | 9.96M | _mm_storeu_si128((__m128i *)seq_cursor, first_nucleotides); |
156 | 9.96M | _mm_storeu_si128((__m128i *)(seq_cursor + sizeof(__m128i)), |
157 | 9.96M | second_nucleotides); |
158 | 9.96M | nibble_cursor += sizeof(__m128i); |
159 | 9.96M | seq_cursor += 2 * sizeof(__m128i); |
160 | 9.96M | } |
161 | 1.70M | nibble2base_default(nibble_cursor, seq_cursor, seq_end_ptr - seq_cursor); |
162 | 1.70M | } |
163 | | |
164 | | __attribute__((constructor)) |
165 | 2 | static void nibble2base_resolve(void) { |
166 | 2 | if (__builtin_cpu_supports("ssse3")) { |
167 | 2 | htslib_nibble2base = nibble2base_ssse3; |
168 | 2 | } |
169 | 2 | } |
170 | | |
171 | | #elif defined __ARM_NEON |
172 | | |
173 | | static void nibble2base_neon(uint8_t *nib, char *seq0, int len) { |
174 | | uint8x16_t low_nibbles_mask = vdupq_n_u8(0x0f); |
175 | | uint8x16_t nuc_lookup_vec = vld1q_u8((const uint8_t *) seq_nt16_str); |
176 | | #ifndef __aarch64__ |
177 | | uint8x8x2_t nuc_lookup_vec2 = {{ vget_low_u8(nuc_lookup_vec), vget_high_u8(nuc_lookup_vec) }}; |
178 | | #endif |
179 | | |
180 | | uint8_t *seq = (uint8_t *) seq0; |
181 | | int blocks; |
182 | | |
183 | | for (blocks = len / 32; blocks > 0; --blocks) { |
184 | | uint8x16_t encoded = vld1q_u8(nib); |
185 | | nib += 16; |
186 | | |
187 | | /* Translate the high and low nibbles to nucleotide letters separately, |
188 | | then interleave them back together via vzipq for writing. */ |
189 | | |
190 | | uint8x16_t high_nibbles = vshrq_n_u8(encoded, 4); |
191 | | uint8x16_t low_nibbles = vandq_u8(encoded, low_nibbles_mask); |
192 | | |
193 | | #ifdef __aarch64__ |
194 | | uint8x16_t high_nucleotides = vqtbl1q_u8(nuc_lookup_vec, high_nibbles); |
195 | | uint8x16_t low_nucleotides = vqtbl1q_u8(nuc_lookup_vec, low_nibbles); |
196 | | #else |
197 | | uint8x8_t high_low = vtbl2_u8(nuc_lookup_vec2, vget_low_u8(high_nibbles)); |
198 | | uint8x8_t high_high = vtbl2_u8(nuc_lookup_vec2, vget_high_u8(high_nibbles)); |
199 | | uint8x16_t high_nucleotides = vcombine_u8(high_low, high_high); |
200 | | |
201 | | uint8x8_t low_low = vtbl2_u8(nuc_lookup_vec2, vget_low_u8(low_nibbles)); |
202 | | uint8x8_t low_high = vtbl2_u8(nuc_lookup_vec2, vget_high_u8(low_nibbles)); |
203 | | uint8x16_t low_nucleotides = vcombine_u8(low_low, low_high); |
204 | | #endif |
205 | | |
206 | | #ifdef __aarch64__ |
207 | | vst1q_u8_x2(seq, vzipq_u8(high_nucleotides, low_nucleotides)); |
208 | | #else |
209 | | // Avoid vst1q_u8_x2 as GCC erroneously omits it on 32-bit ARM |
210 | | uint8x16x2_t nucleotides = {{ high_nucleotides, low_nucleotides }}; |
211 | | vst2q_u8(seq, nucleotides); |
212 | | #endif |
213 | | seq += 32; |
214 | | } |
215 | | |
216 | | if (len % 32 != 0) |
217 | | nibble2base_default(nib, (char *) seq, len % 32); |
218 | | } |
219 | | |
220 | | static __attribute__((constructor)) void nibble2base_resolve(void) { |
221 | | if (cpu_supports_neon()) htslib_nibble2base = nibble2base_neon; |
222 | | } |
223 | | |
224 | | #endif |
225 | | |
226 | | #endif // BUILDING_SIMD_NIBBLE2BASE |
227 | | |
228 | | // Potentially useful diagnostic, and prevents "empty translation unit" errors |
229 | | const char htslib_simd[] = |
230 | | "SIMD functions present:" |
231 | | #ifdef BUILDING_SIMD_NIBBLE2BASE |
232 | | " nibble2base" |
233 | | #endif |
234 | | "."; |