Coverage Report

Created: 2025-11-15 07:36

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/duckdb/third_party/fsst/libfsst.hpp
Line
Count
Source
1
// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT):
2
// 
3
// Copyright 2018-2020, CWI, TU Munich, FSU Jena
4
// 
5
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files   
6
// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify,   
7
// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is   
8
// furnished to do so, subject to the following conditions:
9
// 
10
// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
11
// 
12
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
13
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 
14
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 
15
// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
16
//                 
17
// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst 
18
#include <algorithm>
19
#include <cassert>
20
#include <cstdint>
21
#include <cstring>
22
#include <fstream>
23
#include <iostream>
24
#include <numeric>
25
#include <memory>
26
#include <queue>
27
#include <string>
28
#include <unordered_set>
29
#include <vector>
30
#include <sys/types.h>
31
#include <sys/stat.h>
32
#include <fcntl.h>
33
#include <stddef.h>
34
35
using namespace std;
36
37
#include "fsst.h" // the official FSST API -- also usable by C mortals
38
39
/* unsigned integers */
40
typedef uint8_t u8;
41
typedef uint16_t u16;
42
typedef uint32_t u32;
43
typedef uint64_t u64;
44
45
0
inline uint64_t fsst_unaligned_load(u8 const* V) {
46
0
  uint64_t Ret;
47
0
  memcpy(&Ret, V, sizeof(uint64_t)); // compiler will generate efficient code (unaligned load, where possible)
48
0
  return Ret;
49
0
}
50
51
0
#define FSST_ENDIAN_MARKER ((u64) 1)
52
0
#define FSST_VERSION_20190218 20190218
53
0
#define FSST_VERSION ((u64) FSST_VERSION_20190218)
54
55
// "symbols" are character sequences (up to 8 bytes)
56
// A symbol is compressed into a "code" of, in principle, one byte. But, we added an exception mechanism:
57
// byte 255 followed by byte X represents the single-byte symbol X. Its code is 256+X.
58
59
// we represent codes in u16 (not u8). 12 bits code (of which 10 are used), 4 bits length
60
0
#define FSST_LEN_BITS       12
61
0
#define FSST_CODE_BITS      9 
62
0
#define FSST_CODE_BASE      256UL /* first 256 codes [0,255] are pseudo codes: escaped bytes */
63
0
#define FSST_CODE_MAX       (1UL<<FSST_CODE_BITS) /* all bits set: indicating a symbol that has not been assigned a code yet */
64
0
#define FSST_CODE_MASK      (FSST_CODE_MAX-1UL)   /* all bits set: indicating a symbol that has not been assigned a code yet */
65
66
struct Symbol {
67
   static const unsigned maxLength = 8;
68
69
   // the byte sequence that this symbol stands for
70
   union { char str[maxLength]; u64 num; } val; // usually we process it as a num(ber), as this is fast
71
72
   // icl = u64 ignoredBits:16,code:12,length:4,unused:32 -- but we avoid exposing this bit-field notation
73
   u64 icl;  // use a single u64 to be sure "code" is accessed with one load and can be compared with one comparison
74
75
0
   Symbol() : icl(0) { val.num = 0; }
76
77
0
   explicit Symbol(u8 c, u16 code) : icl((1<<28)|(code<<16)|56) { val.num = c; } // single-char symbol
78
0
   explicit Symbol(const char* begin, const char* end) : Symbol(begin, (u32) (end-begin)) {}
79
0
   explicit Symbol(u8* begin, u8* end) : Symbol((const char*)begin, (u32) (end-begin)) {}
80
0
   explicit Symbol(const char* input, u32 len) {
81
0
      val.num = 0;
82
0
      if (len>=8) {
83
0
          len = 8;
84
0
          memcpy(val.str, input, 8);
85
0
      } else {
86
0
          memcpy(val.str, input, len);
87
0
      }
88
0
      set_code_len(FSST_CODE_MAX, len);
89
0
   }
90
0
   void set_code_len(u32 code, u32 len) { icl = (len<<28)|(code<<16)|((8-len)*8); }
91
92
0
   u32 length() const { return (u32) (icl >> 28); }
93
0
   u16 code() const { return (icl >> 16) & FSST_CODE_MASK; }
94
0
   u32 ignoredBits() const { return (u32) icl; }
95
96
0
   u8 first() const { assert( length() >= 1); return 0xFF & val.num; }
97
0
   u16 first2() const { assert( length() >= 2); return 0xFFFF & val.num; }
98
99
#define FSST_HASH_LOG2SIZE 10 
100
0
#define FSST_HASH_PRIME 2971215073LL
101
0
#define FSST_SHIFT 15
102
0
#define FSST_HASH(w) (((w)*FSST_HASH_PRIME)^(((w)*FSST_HASH_PRIME)>>FSST_SHIFT))
103
0
   size_t hash() const { size_t v = 0xFFFFFF & val.num; return FSST_HASH(v); } // hash on the next 3 bytes
104
};
105
106
// Symbol that can be put in a queue, ordered on gain
107
struct QSymbol{
108
   Symbol symbol;
109
   mutable u32 gain; // mutable because gain value should be ignored in find() on unordered_set of QSymbols
110
0
   bool operator==(const QSymbol& other) const { return symbol.val.num == other.symbol.val.num && symbol.length() == other.symbol.length(); }
111
};
112
113
// we construct FSST symbol tables using a random sample of about 16KB (1<<14) 
114
0
#define FSST_SAMPLETARGET (1<<14)
115
0
#define FSST_SAMPLEMAXSZ ((long) 2*FSST_SAMPLETARGET)
116
117
// two phases of compression, before and after optimize():
118
//
119
// (1) to encode values we probe (and maintain) three datastructures:
120
// - u16 byteCodes[65536] array at the position of the next byte  (s.length==1)
121
// - u16 shortCodes[65536] array at the position of the next twobyte pattern (s.length==2)
122
// - Symbol hashtable[1024] (keyed by the next three bytes, ie for s.length>2), 
123
// this search will yield a u16 code, it points into Symbol symbols[]. You always find a hit, because the first 256 codes are 
124
// pseudo codes representing a single byte these will become escapes)
125
//
126
// (2) when we finished looking for the best symbol table we call optimize() to reshape it:
127
// - it renumbers the codes by length (first symbols of length 2,3,4,5,6,7,8; then 1 (starting from byteLim are symbols of length 1)
128
//   length 2 codes for which no longer suffix symbol exists (< suffixLim) come first among the 2-byte codes 
129
//   (allows shortcut during compression)
130
// - for each two-byte combination, in all unused slots of shortCodes[], it enters the byteCode[] of the symbol corresponding 
131
//   to the first byte (if such a single-byte symbol exists). This allows us to just probe the next two bytes (if there is only one
132
//   byte left in the string, there is still a terminator-byte added during compression) in shortCodes[]. That is, byteCodes[]
133
//   and its codepath is no longer required. This makes compression faster. The reason we use byteCodes[] during symbolTable construction
134
//   is that adding a new code/symbol is expensive (you have to touch shortCodes[] in 256 places). This optimization was
135
//   hence added to make symbolTable construction faster.
136
//
137
// this final layout allows for the fastest compression code, only currently present in compressBulk
138
139
// in the hash table, the icl field contains (low-to-high) ignoredBits:16,code:12,length:4
140
0
#define FSST_ICL_FREE ((15<<28)|(((u32)FSST_CODE_MASK)<<16)) // high bits of icl (len=8,code=FSST_CODE_MASK) indicates free bucket
141
142
// ignoredBits is (8-length)*8, which is the amount of high bits to zero in the input word before comparing with the hashtable key
143
//             ..it could of course be computed from len during lookup, but storing it precomputed in some loose bits is faster
144
//
145
// the gain field is only used in the symbol queue that sorts symbols on gain
146
147
struct SymbolTable {
148
   static const u32 hashTabSize = 1<<FSST_HASH_LOG2SIZE; // smallest size that incurs no precision loss
149
150
   // lookup table using the next two bytes (65536 codes), or just the next single byte
151
   u16 shortCodes[65536]; // contains code for 2-byte symbol, otherwise code for pseudo byte (escaped byte)
152
153
   // lookup table (only used during symbolTable construction, not during normal text compression)
154
   u16 byteCodes[256]; // contains code for every 1-byte symbol, otherwise code for pseudo byte (escaped byte)
155
156
   // 'symbols' is the current symbol  table symbol[code].symbol is the max 8-byte 'symbol' for single-byte 'code'
157
   Symbol symbols[FSST_CODE_MAX]; // x in [0,255]: pseudo symbols representing escaped byte x; x in [FSST_CODE_BASE=256,256+nSymbols]: real symbols   
158
159
   // replicate long symbols in hashTab (avoid indirection). 
160
   Symbol hashTab[hashTabSize]; // used for all symbols of 3 and more bytes
161
162
   u16 nSymbols;          // amount of symbols in the map (max 255)
163
   u16 suffixLim;         // codes higher than this do not have a longer suffix
164
   u16 terminator;        // code of 1-byte symbol, that can be used as a terminator during compression
165
   bool zeroTerminated;   // whether we are expecting zero-terminated strings (we then also produce zero-terminated compressed strings)
166
   u16 lenHisto[FSST_CODE_BITS]; // lenHisto[x] is the amount of symbols of byte-length (x+1) in this SymbolTable
167
168
0
   SymbolTable() : nSymbols(0), suffixLim(FSST_CODE_MAX), terminator(0), zeroTerminated(false) {
169
      // stuff done once at startup
170
0
      for (u32 i=0; i<256; i++) {
171
0
         symbols[i] = Symbol(i,i|(1<<FSST_LEN_BITS)); // pseudo symbols
172
0
      }
173
0
      Symbol unused = Symbol((u8) 0,FSST_CODE_MASK); // single-char symbol, exception code
174
0
      for (u32 i=256; i<FSST_CODE_MAX; i++) {
175
0
         symbols[i] = unused; // we start with all symbols unused
176
0
      }
177
      // empty hash table
178
0
      Symbol s;
179
0
      s.val.num = 0;
180
0
      s.icl = FSST_ICL_FREE; //marks empty in hashtab
181
0
      for(u32 i=0; i<hashTabSize; i++)
182
0
         hashTab[i] = s;
183
184
      // fill byteCodes[] with the pseudo code all bytes (escaped bytes)
185
0
      for(u32 i=0; i<256; i++)
186
0
         byteCodes[i] = (1<<FSST_LEN_BITS) | i;
187
188
      // fill shortCodes[] with the pseudo code for the first byte of each two-byte pattern
189
0
      for(u32 i=0; i<65536; i++)
190
0
         shortCodes[i] = (1<<FSST_LEN_BITS) | (i&255);
191
192
0
      memset(lenHisto, 0, sizeof(lenHisto)); // all unused
193
0
   }
194
195
0
   void clear() {
196
      // clear a symbolTable with minimal effort (only erase the used positions in it)
197
0
      memset(lenHisto, 0, sizeof(lenHisto)); // all unused
198
0
      for(u32 i=FSST_CODE_BASE; i<FSST_CODE_BASE+nSymbols; i++) {
199
0
          if (symbols[i].length() == 1) {
200
0
              u16 val = symbols[i].first();
201
0
              byteCodes[val] = (1<<FSST_LEN_BITS) | val;
202
0
          } else if (symbols[i].length() == 2) {
203
0
              u16 val = symbols[i].first2();
204
0
              shortCodes[val] = (1<<FSST_LEN_BITS) | (val&255);
205
0
          } else {
206
0
              u32 idx = symbols[i].hash() & (hashTabSize-1);
207
0
              hashTab[idx].val.num = 0;
208
0
              hashTab[idx].icl = FSST_ICL_FREE; //marks empty in hashtab
209
0
          }           
210
0
      } 
211
0
      nSymbols = 0; // no need to clean symbols[] as no symbols are used
212
0
   }
213
0
   bool hashInsert(Symbol s) {
214
0
      u32 idx = s.hash() & (hashTabSize-1);
215
0
      bool taken = (hashTab[idx].icl < FSST_ICL_FREE);
216
0
      if (taken) return false; // collision in hash table
217
0
      hashTab[idx].icl = s.icl;
218
0
      hashTab[idx].val.num = s.val.num & (0xFFFFFFFFFFFFFFFF >> (u8) s.icl);
219
0
      return true;
220
0
   }
221
0
   bool add(Symbol s) {
222
0
      assert(FSST_CODE_BASE + nSymbols < FSST_CODE_MAX);
223
0
      u32 len = s.length();
224
0
      s.set_code_len(FSST_CODE_BASE + nSymbols, len);
225
0
      if (len == 1) {
226
0
         byteCodes[s.first()] = FSST_CODE_BASE + nSymbols + (1<<FSST_LEN_BITS); // len=1 (<<FSST_LEN_BITS)
227
0
      } else if (len == 2) {
228
0
         shortCodes[s.first2()] = FSST_CODE_BASE + nSymbols + (2<<FSST_LEN_BITS); // len=2 (<<FSST_LEN_BITS)
229
0
      } else if (!hashInsert(s)) {
230
0
         return false;
231
0
      }
232
0
      symbols[FSST_CODE_BASE + nSymbols++] = s;
233
0
      lenHisto[len-1]++;
234
0
      return true;
235
0
   }
236
   /// Find longest expansion, return code (= position in symbol table)
237
0
   u16 findLongestSymbol(Symbol s) const {
238
0
      size_t idx = s.hash() & (hashTabSize-1);
239
0
      if (hashTab[idx].icl <= s.icl && hashTab[idx].val.num == (s.val.num & (0xFFFFFFFFFFFFFFFF >> ((u8) hashTab[idx].icl)))) {
240
0
         return (hashTab[idx].icl>>16) & FSST_CODE_MASK; // matched a long symbol 
241
0
      }
242
0
      if (s.length() >= 2) {
243
0
         u16 code =  shortCodes[s.first2()] & FSST_CODE_MASK;
244
0
         if (code >= FSST_CODE_BASE) return code; 
245
0
      }
246
0
      return byteCodes[s.first()] & FSST_CODE_MASK;
247
0
   }
248
0
   u16 findLongestSymbol(u8* cur, u8* end) const {
249
0
      return findLongestSymbol(Symbol(cur,end)); // represent the string as a temporary symbol
250
0
   }
251
252
   // rationale for finalize:
253
   // - during symbol table construction, we may create more than 256 codes, but bring it down to max 255 in the last makeTable()
254
   //   consequently we needed more than 8 bits during symbol table contruction, but can simplify the codes to single bytes in finalize()
255
   //   (this feature is in fact lo longer used, but could still be exploited: symbol construction creates no more than 255 symbols in each pass)
256
   // - we not only reduce the amount of codes to <255, but also *reorder* the symbols and renumber their codes, for higher compression perf.
257
   //   we renumber codes so they are grouped by length, to allow optimized scalar string compression (byteLim and suffixLim optimizations). 
258
   // - we make the use of byteCode[] no longer necessary by inserting single-byte codes in the free spots of shortCodes[]
259
   //   Using shortCodes[] only makes compression faster. When creating the symbolTable, however, using shortCodes[] for the single-byte
260
   //   symbols is slow, as each insert touches 256 positions in it. This optimization was added when optimizing symbolTable construction time.
261
   //
262
   // In all, we change the layout and coding, as follows..
263
   //
264
   // before finalize(): 
265
   // - The real symbols are symbols[256..256+nSymbols>. As we may have nSymbols > 255
266
   // - The first 256 codes are pseudo symbols (all escaped bytes)
267
   //
268
   // after finalize(): 
269
   // - table layout is symbols[0..nSymbols>, with nSymbols < 256. 
270
   // - Real codes are [0,nSymbols>. 8-th bit not set. 
271
   // - Escapes in shortCodes have the 8th bit set (value: 256+255=511). 255 because the code to be emitted is the escape byte 255
272
   // - symbols are grouped by length: 2,3,4,5,6,7,8, then 1 (single-byte codes last)
273
   // the two-byte codes are split in two sections: 
274
   // - first section contains codes for symbols for which there is no longer symbol (no suffix). It allows an early-out during compression
275
   //
276
   // finally, shortCodes[] is modified to also encode all single-byte symbols (hence byteCodes[] is not required on a critical path anymore).
277
   //
278
0
   void finalize(u8 zeroTerminated) {
279
0
       assert(nSymbols <= 255);
280
0
       u8 newCode[256], rsum[8], byteLim = nSymbols - (lenHisto[0] - zeroTerminated);
281
282
       // compute running sum of code lengths (starting offsets for each length) 
283
0
       rsum[0] = byteLim; // 1-byte codes are highest
284
0
       rsum[1] = zeroTerminated;
285
0
       for(u32 i=1; i<7; i++)
286
0
          rsum[i+1] = rsum[i] + lenHisto[i];
287
288
       // determine the new code for each symbol, ordered by length (and splitting 2byte symbols into two classes around suffixLim)
289
0
       suffixLim = rsum[1];
290
0
       symbols[newCode[0] = 0] = symbols[256]; // keep symbol 0 in place (for zeroTerminated cases only)
291
292
0
       for(u32 i=zeroTerminated, j=rsum[2]; i<nSymbols; i++) {  
293
0
          Symbol s1 = symbols[FSST_CODE_BASE+i];
294
0
          u32 len = s1.length(), opt = (len == 2)*nSymbols;
295
0
          if (opt) {
296
0
              u16 first2 = s1.first2();
297
0
              for(u32 k=0; k<opt; k++) {  
298
0
                 Symbol s2 = symbols[FSST_CODE_BASE+k];
299
0
                 if (k != i && s2.length() > 1 && first2 == s2.first2()) // test if symbol k is a suffix of s
300
0
                    opt = 0;
301
0
              }
302
0
              newCode[i] = opt?suffixLim++:--j; // symbols without a larger suffix have a code < suffixLim 
303
0
          } else 
304
0
              newCode[i] = rsum[len-1]++;
305
0
          s1.set_code_len(newCode[i],len);
306
0
          symbols[newCode[i]] = s1; 
307
0
       }
308
       // renumber the codes in byteCodes[] 
309
0
       for(u32 i=0; i<256; i++) 
310
0
          if ((byteCodes[i] & FSST_CODE_MASK) >= FSST_CODE_BASE)
311
0
             byteCodes[i] = newCode[(u8) byteCodes[i]] + (1 << FSST_LEN_BITS);
312
0
          else 
313
0
             byteCodes[i] = 511 + (1 << FSST_LEN_BITS);
314
       
315
       // renumber the codes in shortCodes[] 
316
0
       for(u32 i=0; i<65536; i++)
317
0
          if ((shortCodes[i] & FSST_CODE_MASK) >= FSST_CODE_BASE)
318
0
             shortCodes[i] = newCode[(u8) shortCodes[i]] + (shortCodes[i] & (15 << FSST_LEN_BITS));
319
0
          else 
320
0
             shortCodes[i] = byteCodes[i&0xFF];
321
322
       // replace the symbols in the hash table
323
0
       for(u32 i=0; i<hashTabSize; i++)
324
0
          if (hashTab[i].icl < FSST_ICL_FREE)
325
0
             hashTab[i] = symbols[newCode[(u8) hashTab[i].code()]];
326
0
   }
327
};
328
329
#ifdef NONOPT_FSST
330
struct Counters {
331
   u16 count1[FSST_CODE_MAX];   // array to count frequency of symbols as they occur in the sample 
332
   u16 count2[FSST_CODE_MAX][FSST_CODE_MAX]; // array to count subsequent combinations of two symbols in the sample 
333
334
   void count1Set(u32 pos1, u16 val) { 
335
      count1[pos1] = val;
336
   }
337
   void count1Inc(u32 pos1) { 
338
      count1[pos1]++;
339
   }
340
   void count2Inc(u32 pos1, u32 pos2) {  
341
      count2[pos1][pos2]++;
342
   }
343
   u32 count1GetNext(u32 &pos1) { 
344
      return count1[pos1];
345
   }
346
   u32 count2GetNext(u32 pos1, u32 &pos2) { 
347
      return count2[pos1][pos2];
348
   }
349
   void backup1(u8 *buf) {
350
      memcpy(buf, count1, FSST_CODE_MAX*sizeof(u16));
351
   }
352
   void restore1(u8 *buf) {
353
      memcpy(count1, buf, FSST_CODE_MAX*sizeof(u16));
354
   }
355
};
356
#else
357
// we keep two counters count1[pos] and count2[pos1][pos2] of resp 16 and 12-bits. Both are split into two columns for performance reasons
358
// first reason is to make the column we update the most during symbolTable construction (the low bits) thinner, thus reducing CPU cache pressure.
359
// second reason is that when scanning the array, after seeing a 64-bits 0 in the high bits column, we can quickly skip over many codes (15 or 7)
360
struct Counters {
361
   // high arrays come before low arrays, because our GetNext() methods may overrun their 64-bits reads a few bytes
362
   u8 count1High[FSST_CODE_MAX];   // array to count frequency of symbols as they occur in the sample (16-bits)
363
   u8 count1Low[FSST_CODE_MAX];    // it is split in a low and high byte: cnt = count1High*256 + count1Low
364
   u8 count2High[FSST_CODE_MAX][FSST_CODE_MAX/2]; // array to count subsequent combinations of two symbols in the sample (12-bits: 8-bits low, 4-bits high)
365
   u8 count2Low[FSST_CODE_MAX][FSST_CODE_MAX];    // its value is (count2High*256+count2Low) -- but high is 4-bits (we put two numbers in one, hence /2)
366
   // 385KB  -- but hot area likely just 10 + 30*4 = 130 cache lines (=8KB)
367
   
368
0
   void count1Set(u32 pos1, u16 val) { 
369
0
      count1Low[pos1] = val&255;
370
0
      count1High[pos1] = val>>8;
371
0
   }
372
0
   void count1Inc(u32 pos1) { 
373
0
      if (!count1Low[pos1]++) // increment high early (when low==0, not when low==255). This means (high > 0) <=> (cnt > 0)
374
0
         count1High[pos1]++; //(0,0)->(1,1)->..->(255,1)->(0,1)->(1,2)->(2,2)->(3,2)..(255,2)->(0,2)->(1,3)->(2,3)...
375
0
   }
376
0
   void count2Inc(u32 pos1, u32 pos2) {  
377
0
       if (!count2Low[pos1][pos2]++) // increment high early (when low==0, not when low==255). This means (high > 0) <=> (cnt > 0)
378
          // inc 4-bits high counter with 1<<0 (1) or 1<<4 (16) -- depending on whether pos2 is even or odd, repectively
379
0
          count2High[pos1][(pos2)>>1] += 1 << (((pos2)&1)<<2); // we take our chances with overflow.. (4K maxval, on a 8K sample)
380
0
   }
381
0
   u32 count1GetNext(u32 &pos1) { // note: we will advance pos1 to the next nonzero counter in register range
382
      // read 16-bits single symbol counter, split into two 8-bits numbers (count1Low, count1High), while skipping over zeros
383
0
     u64 high = fsst_unaligned_load(&count1High[pos1]);
384
385
0
      u32 zero = high?(__builtin_ctzll(high)>>3):7UL; // number of zero bytes
386
0
      high = (high >> (zero << 3)) & 255; // advance to nonzero counter
387
0
      if (((pos1 += zero) >= FSST_CODE_MAX) || !high) // SKIP! advance pos2
388
0
         return 0; // all zero
389
390
0
      u32 low = count1Low[pos1];
391
0
      if (low) high--; // high is incremented early and low late, so decrement high (unless low==0)
392
0
      return (u32) ((high << 8) + low);
393
0
   }
394
0
   u32 count2GetNext(u32 pos1, u32 &pos2) { // note: we will advance pos2 to the next nonzero counter in register range
395
      // read 12-bits pairwise symbol counter, split into low 8-bits and high 4-bits number while skipping over zeros
396
0
    u64 high = fsst_unaligned_load(&count2High[pos1][pos2>>1]);
397
0
      high >>= ((pos2&1) << 2); // odd pos2: ignore the lowest 4 bits & we see only 15 counters
398
399
0
      u32 zero = high?(__builtin_ctzll(high)>>2):(15UL-(pos2&1UL)); // number of zero 4-bits counters
400
0
      high = (high >> (zero << 2)) & 15;  // advance to nonzero counter
401
0
      if (((pos2 += zero) >= FSST_CODE_MAX) || !high) // SKIP! advance pos2
402
0
         return 0UL; // all zero
403
404
0
      u32 low = count2Low[pos1][pos2];
405
0
      if (low) high--; // high is incremented early and low late, so decrement high (unless low==0)
406
0
      return (u32) ((high << 8) + low);
407
0
   }
408
0
   void backup1(u8 *buf) {
409
0
      memcpy(buf, count1High, FSST_CODE_MAX);
410
0
      memcpy(buf+FSST_CODE_MAX, count1Low, FSST_CODE_MAX);
411
0
   }
412
0
   void restore1(u8 *buf) {
413
0
      memcpy(count1High, buf, FSST_CODE_MAX);
414
0
      memcpy(count1Low, buf+FSST_CODE_MAX, FSST_CODE_MAX);
415
0
   }
416
}; 
417
#endif
418
419
420
#define FSST_BUFSZ (3<<19) // 768KB
421
422
// an encoder is a symbolmap plus some bufferspace, needed during map construction as well as compression 
423
struct Encoder {
424
   shared_ptr<SymbolTable> symbolTable; // symbols, plus metadata and data structures for quick compression (shortCode,hashTab, etc)
425
   union {
426
      Counters counters;     // for counting symbol occurences during map construction
427
      u8 simdbuf[FSST_BUFSZ]; // for compression: SIMD string staging area 768KB = 256KB in + 512KB out (worst case for 256KB in) 
428
   };
429
};
430
431
// job control integer representable in one 64bits SIMD lane: cur/end=input, out=output, pos=which string (2^9=512 per call)
432
struct SIMDjob {
433
   u64 out:19,pos:9,end:18,cur:18; // cur/end is input offsets (2^18=256KB), out is output offset (2^19=512KB)  
434
};
435
436
// C++ fsst-compress function with some more control of how the compression happens (algorithm flavor, simd unroll degree)
437
size_t compressImpl(Encoder *encoder, size_t n, size_t lenIn[], u8 *strIn[], size_t size, u8 * output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd);
438
size_t compressAuto(Encoder *encoder, size_t n, size_t lenIn[], u8 *strIn[], size_t size, u8 * output, size_t *lenOut, u8 *strOut[], int simd);