Coverage Report

Created: 2025-06-24 06:43

/src/hermes/lib/BCGen/HBC/ConsecutiveStringStorage.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) Meta Platforms, Inc. and affiliates.
3
 *
4
 * This source code is licensed under the MIT license found in the
5
 * LICENSE file in the root directory of this source tree.
6
 */
7
8
#define DEBUG_TYPE "stringstorage"
9
#include "hermes/BCGen/HBC/ConsecutiveStringStorage.h"
10
11
#include "hermes/BCGen/HBC/BytecodeDataProvider.h"
12
#include "hermes/Support/HashString.h"
13
#include "hermes/Support/JenkinsHash.h"
14
#include "hermes/Support/Statistic.h"
15
#include "hermes/Support/Timer.h"
16
#include "hermes/Support/UTF8.h"
17
18
#include "llvh/ADT/DenseMap.h"
19
#include "llvh/Support/Endian.h"
20
21
#include <algorithm>
22
#include <climits>
23
#include <deque>
24
25
using namespace hermes;
26
using llvh::ArrayRef;
27
using llvh::MutableArrayRef;
28
29
STATISTIC(StringTableSize, "Size of the packed String table");
30
STATISTIC(
31
    StringTableSavings,
32
    "Byte savings in length of String table versus naive packing");
33
34
namespace {
35
36
/// Group Name for any timers we create
37
constexpr const char *const StringTableTimersName = "StringTableBuilder Timers";
38
39
constexpr size_t npos = ~size_t(0);
40
41
/// Longest string that we'll attempt to pack.
42
constexpr size_t kMaximumPackableStringLength = 24 * 1024;
43
44
/// A helper class responsible for deciding how to "pack" strings, that is, lay
45
/// out strings in a linear array suitable for ConsecutiveStringStorage. It is
46
/// templated on the character type (char or char16_t).
47
template <typename CharT>
48
class StringPacker {
49
  StringPacker() = default;
50
51
 public:
52
  /// Represents a single string to be packed.
53
  struct StringEntry {
54
    /// Index of the string in the original array.
55
    uint32_t stringID_;
56
57
    /// Text of the string.
58
    ArrayRef<CharT> chars_;
59
60
    /// Position of the string in the output storage.
61
    /// This is set by the packer.
62
    size_t offsetInStorage_ = npos;
63
64
    // The remaining fields are used only by the optimizing packer.
65
66
    /// If parent_ is set, then this string is fully contained within its
67
    /// parent_, at the given offset.
68
    StringEntry *parent_ = nullptr;
69
    size_t offsetInParent_ = npos;
70
71
    /// The string that must come after us, if any.
72
    StringEntry *next_ = nullptr;
73
74
    /// The string that must come before us, if any.
75
    StringEntry *prev_ = nullptr;
76
77
    /// The amount that our chars_ overlaps with prev_->chars_.
78
    size_t overlapAmount_ = 0;
79
80
    /// Entries that may not be set as our next_, because they would introduce
81
    /// a cycle.
82
    llvh::DenseSet<const StringEntry *> potentialCycles_;
83
84
    StringEntry(uint32_t stringID, ArrayRef<CharT> chars)
85
2.21k
        : stringID_(stringID), chars_(chars) {}
ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<unsigned char>::StringEntry::StringEntry(unsigned int, llvh::ArrayRef<unsigned char>)
Line
Count
Source
85
2.00k
        : stringID_(stringID), chars_(chars) {}
ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<char16_t>::StringEntry::StringEntry(unsigned int, llvh::ArrayRef<char16_t>)
Line
Count
Source
85
219
        : stringID_(stringID), chars_(chars) {}
86
  };
87
88
  /// A Trigram represents three packed characters.
89
  /// Note that Trigrams are keys in an llvh::DenseSet, which reserves the
90
  /// values ~0 and ~0+1 as sentinel values. We must not use the MSB.
91
  using Trigram =
92
      typename std::conditional<sizeof(CharT) == 1, uint32_t, uint64_t>::type;
93
94
  static constexpr size_t TrigramCharCount = 3;
95
96
  /// \return a trigram representing the first three characters in \p chars.
97
0
  static inline Trigram makeTrigram(const CharT chars[TrigramCharCount]) {
98
0
    unsigned shift = CHAR_BIT * sizeof(CharT);
99
0
    return (Trigram(chars[0]) << (2 * shift)) |
100
0
        (Trigram(chars[1]) << (1 * shift)) | (Trigram(chars[2]) << (0 * shift));
101
0
  }
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<unsigned char>::makeTrigram(unsigned char const*)
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<char16_t>::makeTrigram(char16_t const*)
102
103
  /// \return a set of all Trigrams of all 3 character prefixes of \p strings.
104
  static llvh::DenseSet<Trigram> buildPrefixTrigramSet(
105
0
      ArrayRef<StringEntry> strings) {
106
    // A rough experiment showed 8 strings per prefix is a reasonable estimate.
107
0
    llvh::DenseSet<Trigram> result(strings.size() / 8);
108
0
    for (const StringEntry &entry : strings) {
109
0
      ArrayRef<CharT> chars = entry.chars_;
110
0
      if (chars.size() >= TrigramCharCount) {
111
0
        result.insert(makeTrigram(chars.data()));
112
0
      }
113
0
    }
114
0
    return result;
115
0
  }
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<unsigned char>::buildPrefixTrigramSet(llvh::ArrayRef<(anonymous namespace)::StringPacker<unsigned char>::StringEntry>)
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<char16_t>::buildPrefixTrigramSet(llvh::ArrayRef<(anonymous namespace)::StringPacker<char16_t>::StringEntry>)
116
117
  /// HashedSuffix is a model of DenseMapInfo suitable for hashing string
118
  /// suffixes. When iterating over the suffixes, it is N^2 to hash each suffix
119
  /// separately. We can do better by retaining the hash of the previous suffix
120
  /// and simply incorporating the next character into it.
121
  struct HashedSuffix {
122
    /// The string that has been hashed.
123
    ArrayRef<CharT> chars_;
124
125
    /// The hash value of chars_.
126
    JenkinsHash hash_;
127
128
    /// DenseMapInfo interface. \return the empty key, deferring to ArrayRef.
129
0
    static HashedSuffix getEmptyKey() {
130
0
      return {llvh::DenseMapInfo<ArrayRef<CharT>>::getEmptyKey(), 0};
131
0
    }
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<unsigned char>::HashedSuffix::getEmptyKey()
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<char16_t>::HashedSuffix::getEmptyKey()
132
133
    /// DenseMapInfo interface. \return the tombstone key, deferring to
134
    /// ArrayRef.
135
0
    static HashedSuffix getTombstoneKey() {
136
0
      return {llvh::DenseMapInfo<ArrayRef<CharT>>::getTombstoneKey(), 0};
137
0
    }
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<unsigned char>::HashedSuffix::getTombstoneKey()
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<char16_t>::HashedSuffix::getTombstoneKey()
138
139
    /// DenseMapInfo interface. \return the hash, which we have cached.
140
0
    static unsigned getHashValue(const HashedSuffix &s) {
141
0
      return s.hash_;
142
0
    }
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<unsigned char>::HashedSuffix::getHashValue((anonymous namespace)::StringPacker<unsigned char>::HashedSuffix const&)
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<char16_t>::HashedSuffix::getHashValue((anonymous namespace)::StringPacker<char16_t>::HashedSuffix const&)
143
144
    /// DenseMapInfo interface. \return whether two suffixes are equal.
145
0
    static bool isEqual(const HashedSuffix &lhs, const HashedSuffix &rhs) {
146
0
      return lhs.hash_ == rhs.hash_ && lhs.chars_ == rhs.chars_;
147
0
    }
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<unsigned char>::HashedSuffix::isEqual((anonymous namespace)::StringPacker<unsigned char>::HashedSuffix const&, (anonymous namespace)::StringPacker<unsigned char>::HashedSuffix const&)
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<char16_t>::HashedSuffix::isEqual((anonymous namespace)::StringPacker<char16_t>::HashedSuffix const&, (anonymous namespace)::StringPacker<char16_t>::HashedSuffix const&)
148
  };
149
150
  /// Represents an entry in a generalized suffix array.
151
  struct SuffixArrayEntry {
152
    /// The suffix represented by this entry.
153
    ArrayRef<CharT> suffix_;
154
155
    /// The list of StringEntries that have this suffix.
156
    std::vector<StringEntry *> entries_;
157
158
    /// Convenience move constructor from a std::pair.
159
    /// This is used to extract SuffixArrayEntries from our uniquing map.
160
    SuffixArrayEntry(std::pair<HashedSuffix, std::vector<StringEntry *>> &&kv)
161
0
        : suffix_(std::move(kv.first.chars_)), entries_(std::move(kv.second)) {}
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<unsigned char>::SuffixArrayEntry::SuffixArrayEntry(std::__1::pair<(anonymous namespace)::StringPacker<unsigned char>::HashedSuffix, std::__1::vector<(anonymous namespace)::StringPacker<unsigned char>::StringEntry*, std::__1::allocator<(anonymous namespace)::StringPacker<unsigned char>::StringEntry*> > >&&)
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<char16_t>::SuffixArrayEntry::SuffixArrayEntry(std::__1::pair<(anonymous namespace)::StringPacker<char16_t>::HashedSuffix, std::__1::vector<(anonymous namespace)::StringPacker<char16_t>::StringEntry*, std::__1::allocator<(anonymous namespace)::StringPacker<char16_t>::StringEntry*> > >&&)
162
163
    /// \return the character at index pos, or -1 if pos >= our length.
164
0
    int extCharAt(size_t pos) const {
165
0
      return pos >= suffix_.size() ? -1 : suffix_[pos];
166
0
    }
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<unsigned char>::SuffixArrayEntry::extCharAt(unsigned long) const
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<char16_t>::SuffixArrayEntry::extCharAt(unsigned long) const
167
  };
168
169
  // Given pointers \p begin and \p end, sort the range [begin, end) starting at
170
  // the character index \p charIdx. This is a recursive function.
171
  static void radixQuicksort(
172
      SuffixArrayEntry *begin,
173
      SuffixArrayEntry *end,
174
0
      size_t charIdx) {
175
0
    for (;;) {
176
0
      if (end - begin <= 1) {
177
        // Already sorted.
178
0
        return;
179
0
      }
180
181
      // Partition via Hoare scheme. Invariants are:
182
      //  [begin, lower) < pivot
183
      //  [upper, end) > pivot
184
      // Final state adds:
185
      //  [lower, upper) == pivot
186
0
      int pivotChar = begin->extCharAt(charIdx);
187
0
      SuffixArrayEntry *lower = begin;
188
0
      SuffixArrayEntry *upper = end;
189
0
      for (SuffixArrayEntry *cursor = begin + 1; cursor < upper;) {
190
0
        int testChar = cursor->extCharAt(charIdx);
191
0
        if (testChar < pivotChar) {
192
0
          std::swap(*lower++, *cursor++);
193
0
        } else if (testChar > pivotChar) {
194
0
          std::swap(*--upper, *cursor);
195
0
        } else {
196
          // testChar == pivotChar
197
0
          cursor++;
198
0
        }
199
0
      }
200
      // We have partitioned [begin, end) according to the character at charIdx.
201
      // Sort left and right, and then recurse on our equal range (unless we've
202
      // exhausted it).
203
0
      radixQuicksort(begin, lower, charIdx);
204
0
      radixQuicksort(upper, end, charIdx);
205
206
      // If we reached the end of the pivot, we're done. Otherwise allow
207
      // ourselves to loop again to inspect the next character.
208
      // Note this is effectively a tail call.
209
0
      if (pivotChar == -1)
210
0
        return;
211
212
0
      begin = lower;
213
0
      end = upper;
214
0
      charIdx += 1;
215
0
    }
216
0
  }
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<unsigned char>::radixQuicksort((anonymous namespace)::StringPacker<unsigned char>::SuffixArrayEntry*, (anonymous namespace)::StringPacker<unsigned char>::SuffixArrayEntry*, unsigned long)
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<char16_t>::radixQuicksort((anonymous namespace)::StringPacker<char16_t>::SuffixArrayEntry*, (anonymous namespace)::StringPacker<char16_t>::SuffixArrayEntry*, unsigned long)
217
218
  /// \return a generalized suffix array over the given \p strings.
219
  /// Only suffixes that begin with an element of \p prefixesOfInterest (or are
220
  /// shorter than TrigramCharCount) are included.
221
  static std::vector<SuffixArrayEntry> buildSuffixArray(
222
      MutableArrayRef<StringEntry> strings,
223
0
      const llvh::DenseSet<Trigram> &prefixesOfInterest) {
224
    // Unique each suffix, that is, for each suffix that begins with
225
    // prefixOfInterest construct the list of strings containing it. A rough
226
    // test showed 8 suffixes per string is a reasonable initial capacity.
227
0
    llvh::DenseMap<HashedSuffix, std::vector<StringEntry *>, HashedSuffix>
228
0
        suffixMap(8 * strings.size());
229
0
    for (StringEntry &entry : strings) {
230
0
      size_t charsSize = entry.chars_.size();
231
      // Skip excessively long strings.
232
0
      if (charsSize > kMaximumPackableStringLength) {
233
0
        continue;
234
0
      }
235
0
      const CharT *chars = entry.chars_.data();
236
0
      JenkinsHash hash = JenkinsHashInit;
237
0
      size_t i = charsSize;
238
0
      while (i--) {
239
0
        hash = updateJenkinsHash(hash, chars[i]);
240
0
        if (i + TrigramCharCount <= charsSize &&
241
0
            !prefixesOfInterest.count(makeTrigram(&chars[i])))
242
0
          continue;
243
0
        ArrayRef<CharT> suffix(&chars[i], charsSize - i);
244
0
        suffixMap[HashedSuffix{suffix, hash}].push_back(&entry);
245
0
      }
246
0
    }
247
248
    // Move our suffixes to an array, and sort it.
249
0
    std::vector<SuffixArrayEntry> result;
250
0
    if (!suffixMap.empty()) {
251
0
      result.reserve(suffixMap.size());
252
0
      std::move(suffixMap.begin(), suffixMap.end(), std::back_inserter(result));
253
0
      SuffixArrayEntry *entries = &result[0];
254
0
      radixQuicksort(entries, entries + result.size(), 0);
255
0
    }
256
0
    return result;
257
0
  }
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<unsigned char>::buildSuffixArray(llvh::MutableArrayRef<(anonymous namespace)::StringPacker<unsigned char>::StringEntry>, llvh::DenseSet<unsigned int, llvh::DenseMapInfo<unsigned int> > const&)
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<char16_t>::buildSuffixArray(llvh::MutableArrayRef<(anonymous namespace)::StringPacker<char16_t>::StringEntry>, llvh::DenseSet<unsigned long, llvh::DenseMapInfo<unsigned long> > const&)
258
259
  /// Overlap represents an overlap relationship. There is an overlap
260
  /// relationship between src and dst if some suffix of src is equal to a
261
  /// prefix of dst. For example, in the strings "splitpea" and "peasoup", we
262
  /// have src = "splitpea" and dst = "peasoup" with overlap amount 3. Note we
263
  /// don't track the amount of overlap here; that's stored externally.
264
  /// Also note overlap is directed: there is no overlap from "peasoup" to
265
  /// "splitpea" because no suffix of "peasoup" is a prefix of "splitpea".
266
  struct Overlap {
267
    ArrayRef<StringEntry *> srcs_;
268
    StringEntry *dst_;
269
  };
270
271
  /// A list of Overlaps, indexed by the amount of overlap.
272
  /// For example, WeightIndexedOverlaps[3] is the list of Overlaps with
273
  /// overlap amount 3.
274
  using WeightIndexedOverlaps = std::vector<std::vector<Overlap>>;
275
276
  /// Given a list of StringEntries, determine overlapping strings as follows:
277
  /// 1. Set rightString.parent_ to an entry that contains rightString, if any.
278
  /// 2. If an entry leftString overlaps with \p rightString, add an Overlap
279
  /// leftString->rightString to \p overlaps
280
  static void computeOverlapsAndParentForEntry(
281
      StringEntry *rightString,
282
      ArrayRef<SuffixArrayEntry> suffixArray,
283
0
      WeightIndexedOverlaps *overlaps) {
284
    // This is a subtle function. We want to compute Overlaps, indexed by
285
    // overlap amount, and simultaneously identify parents. Iterate over
286
    // rightString's prefixes. Perform a binary search on our suffix array to
287
    // find suffixes that have that prefix. If any suffix is exactly equal to
288
    // that prefix, we have Arcs (suffix owners, rightString). See the
289
    // documentation "StringPacking.md" for an extended discussion.
290
0
    using std::partition_point;
291
0
    ArrayRef<CharT> rightChars = rightString->chars_;
292
0
    auto lower = suffixArray.begin();
293
0
    auto upper = suffixArray.end();
294
0
    for (size_t index = 0, rightCharsLen = rightChars.size();
295
0
         index < rightCharsLen;
296
0
         index++) {
297
0
      CharT testChar = rightChars[index];
298
      // Binary search on [lower, upper) to find all suffixes that have
299
      // testChar at index.
300
0
      lower = partition_point(lower, upper, [=](const SuffixArrayEntry &ent) {
301
0
        return ent.extCharAt(index) < testChar;
302
0
      });
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<unsigned char>::computeOverlapsAndParentForEntry((anonymous namespace)::StringPacker<unsigned char>::StringEntry*, llvh::ArrayRef<(anonymous namespace)::StringPacker<unsigned char>::SuffixArrayEntry>, std::__1::vector<std::__1::vector<(anonymous namespace)::StringPacker<unsigned char>::Overlap, std::__1::allocator<(anonymous namespace)::StringPacker<unsigned char>::Overlap> >, std::__1::allocator<std::__1::vector<(anonymous namespace)::StringPacker<unsigned char>::Overlap, std::__1::allocator<(anonymous namespace)::StringPacker<unsigned char>::Overlap> > > >*)::{lambda((anonymous namespace)::StringPacker<unsigned char>::SuffixArrayEntry const&)#1}::operator()((anonymous namespace)::StringPacker<unsigned char>::SuffixArrayEntry const&) const
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<char16_t>::computeOverlapsAndParentForEntry((anonymous namespace)::StringPacker<char16_t>::StringEntry*, llvh::ArrayRef<(anonymous namespace)::StringPacker<char16_t>::SuffixArrayEntry>, std::__1::vector<std::__1::vector<(anonymous namespace)::StringPacker<char16_t>::Overlap, std::__1::allocator<(anonymous namespace)::StringPacker<char16_t>::Overlap> >, std::__1::allocator<std::__1::vector<(anonymous namespace)::StringPacker<char16_t>::Overlap, std::__1::allocator<(anonymous namespace)::StringPacker<char16_t>::Overlap> > > >*)::{lambda((anonymous namespace)::StringPacker<char16_t>::SuffixArrayEntry const&)#1}::operator()((anonymous namespace)::StringPacker<char16_t>::SuffixArrayEntry const&) const
303
0
      upper = partition_point(lower, upper, [=](const SuffixArrayEntry &ent) {
304
0
        return ent.extCharAt(index) == testChar;
305
0
      });
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<unsigned char>::computeOverlapsAndParentForEntry((anonymous namespace)::StringPacker<unsigned char>::StringEntry*, llvh::ArrayRef<(anonymous namespace)::StringPacker<unsigned char>::SuffixArrayEntry>, std::__1::vector<std::__1::vector<(anonymous namespace)::StringPacker<unsigned char>::Overlap, std::__1::allocator<(anonymous namespace)::StringPacker<unsigned char>::Overlap> >, std::__1::allocator<std::__1::vector<(anonymous namespace)::StringPacker<unsigned char>::Overlap, std::__1::allocator<(anonymous namespace)::StringPacker<unsigned char>::Overlap> > > >*)::{lambda((anonymous namespace)::StringPacker<unsigned char>::SuffixArrayEntry const&)#2}::operator()((anonymous namespace)::StringPacker<unsigned char>::SuffixArrayEntry const&) const
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<char16_t>::computeOverlapsAndParentForEntry((anonymous namespace)::StringPacker<char16_t>::StringEntry*, llvh::ArrayRef<(anonymous namespace)::StringPacker<char16_t>::SuffixArrayEntry>, std::__1::vector<std::__1::vector<(anonymous namespace)::StringPacker<char16_t>::Overlap, std::__1::allocator<(anonymous namespace)::StringPacker<char16_t>::Overlap> >, std::__1::allocator<std::__1::vector<(anonymous namespace)::StringPacker<char16_t>::Overlap, std::__1::allocator<(anonymous namespace)::StringPacker<char16_t>::Overlap> > > >*)::{lambda((anonymous namespace)::StringPacker<char16_t>::SuffixArrayEntry const&)#2}::operator()((anonymous namespace)::StringPacker<char16_t>::SuffixArrayEntry const&) const
306
0
      assert(lower <= upper && "Binary search crossed the streams");
307
0
      if (lower == upper) {
308
        // No suffixes remaining.
309
0
        break;
310
0
      }
311
312
      // [lower, upper) is now the range of suffixes that start with
313
      // rightString.slice(0, overlapAmount).
314
0
      size_t overlapAmount = index + 1;
315
0
      if (overlapAmount < rightCharsLen) {
316
        // Not the last iteration. We are looking for a prefix match.
317
        // *lower is a suffix that shares a prefix of length overlapAmount with
318
        // rightString. If it is equal to that prefix, then we found a suffix
319
        // equal to the prefix we're looking for. It is sufficient to test for
320
        // equality by checking the length.
321
0
        if (lower->suffix_.size() == overlapAmount) {
322
          // Add an Overlap at index overlapAmount.
323
0
          if (overlaps->size() <= overlapAmount) {
324
0
            overlaps->resize(overlapAmount + 1);
325
0
          }
326
0
          Overlap ov = {lower->entries_, rightString};
327
0
          (*overlaps)[overlapAmount].push_back(ov);
328
0
        }
329
0
      } else {
330
        // overlapAmount == rightCharsLen. Final iteration! [lower, upper) is
331
        // now the range of suffixes that have rightEntry as a prefix.
332
        // That means that rightEntry is wholly contained within some string.
333
        // Of course it is wholly contained within itself; if it's also
334
        // contained within some OTHER string, we found a parent.
335
        // For compressibility, choose the parent with the lowest stringID.
336
0
        for (auto cursor = lower; cursor < upper; ++cursor) {
337
0
          for (StringEntry *parent : cursor->entries_) {
338
            // Can't parent ourselves.
339
0
            if (parent == rightString)
340
0
              continue;
341
342
            // Don't parent if we have an existing parent with a lower ID.
343
            // This means that we prefer parents that tend to end up early in
344
            // the string table.
345
0
            if (rightString->parent_ &&
346
0
                rightString->parent_->stringID_ < parent->stringID_)
347
0
              continue;
348
349
            // We found a parent.
350
            // rightEntry is a prefix of one of parent's suffixes.
351
            // Therefore rightEntry appears in the parent at the same offset of
352
            // the suffix within the parent.
353
            // A parent should always be longer than its child; otherwise we
354
            // must have duplicate strings.
355
0
            assert(
356
0
                parent->chars_.size() > rightString->chars_.size() &&
357
0
                "non-unique strings passed to StringPacker");
358
0
            rightString->parent_ = parent;
359
0
            rightString->offsetInParent_ =
360
0
                parent->chars_.size() - cursor->suffix_.size();
361
0
          }
362
0
        }
363
0
      }
364
0
    }
365
0
  }
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<unsigned char>::computeOverlapsAndParentForEntry((anonymous namespace)::StringPacker<unsigned char>::StringEntry*, llvh::ArrayRef<(anonymous namespace)::StringPacker<unsigned char>::SuffixArrayEntry>, std::__1::vector<std::__1::vector<(anonymous namespace)::StringPacker<unsigned char>::Overlap, std::__1::allocator<(anonymous namespace)::StringPacker<unsigned char>::Overlap> >, std::__1::allocator<std::__1::vector<(anonymous namespace)::StringPacker<unsigned char>::Overlap, std::__1::allocator<(anonymous namespace)::StringPacker<unsigned char>::Overlap> > > >*)
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<char16_t>::computeOverlapsAndParentForEntry((anonymous namespace)::StringPacker<char16_t>::StringEntry*, llvh::ArrayRef<(anonymous namespace)::StringPacker<char16_t>::SuffixArrayEntry>, std::__1::vector<std::__1::vector<(anonymous namespace)::StringPacker<char16_t>::Overlap, std::__1::allocator<(anonymous namespace)::StringPacker<char16_t>::Overlap> >, std::__1::allocator<std::__1::vector<(anonymous namespace)::StringPacker<char16_t>::Overlap, std::__1::allocator<(anonymous namespace)::StringPacker<char16_t>::Overlap> > > >*)
366
367
  /// For each entry, compute its overlaps and parent as per
368
  /// computeOverlapsOrParentForEntry.
369
  /// \p return the list of Overlaps indexed by weight (amount of overlap).
370
  static WeightIndexedOverlaps computeOverlapsAndParents(
371
      MutableArrayRef<StringEntry> stringEntries,
372
0
      ArrayRef<SuffixArrayEntry> suffixArray) {
373
0
    WeightIndexedOverlaps result;
374
0
    for (StringEntry &entry : stringEntries) {
375
0
      computeOverlapsAndParentForEntry(&entry, suffixArray, &result);
376
0
    }
377
0
    return result;
378
0
  }
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<unsigned char>::computeOverlapsAndParents(llvh::MutableArrayRef<(anonymous namespace)::StringPacker<unsigned char>::StringEntry>, llvh::ArrayRef<(anonymous namespace)::StringPacker<unsigned char>::SuffixArrayEntry>)
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<char16_t>::computeOverlapsAndParents(llvh::MutableArrayRef<(anonymous namespace)::StringPacker<char16_t>::StringEntry>, llvh::ArrayRef<(anonymous namespace)::StringPacker<char16_t>::SuffixArrayEntry>)
379
380
  /// Indicate if we can add the edge from \p src to \p dst to our Hamiltonian
381
  /// path, that is, whether we can take advantage of the overlap between src
382
  /// and dst by positioning dst to overlap a suffix of src.
383
0
  static bool canOverlap(const StringEntry *src, const StringEntry *dst) {
384
    // Are we trying to overlap ourself?
385
0
    if (src == dst)
386
0
      return false;
387
388
    // Are src or dst going to be substrings of another string?
389
0
    if (src->parent_ || dst->parent_)
390
0
      return false;
391
392
    // Did we already pick a string to come after src or before dst?
393
0
    if (src->next_ || dst->prev_)
394
0
      return false;
395
396
    // Would forming src->dst create a cycle?
397
0
    if (src->potentialCycles_.count(dst))
398
0
      return false;
399
400
    // This edge is OK!
401
0
    return true;
402
0
  }
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<unsigned char>::canOverlap((anonymous namespace)::StringPacker<unsigned char>::StringEntry const*, (anonymous namespace)::StringPacker<unsigned char>::StringEntry const*)
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<char16_t>::canOverlap((anonymous namespace)::StringPacker<char16_t>::StringEntry const*, (anonymous namespace)::StringPacker<char16_t>::StringEntry const*)
403
404
  /// Plan how we are going to lay out our strings by applying the weightiest
405
  /// Overlaps. That is, for each entry, determine the (at most 1) entry
406
  /// to come before it, and the (at most 1) entry to come after it.
407
  /// This uses the greedy heuristic. Walk our Overlaps from greatest to least
408
  /// amount, and plan to layout src directly before dst if we can.
409
  /// We must be careful to not produce a cycle like `a->b->c->a`.
410
  /// This is equivalent to computing Hamiltonian path in the graph of strings
411
  /// while attempting to maximize the weight of its edges.
412
0
  static void planLayout(const WeightIndexedOverlaps &overlapsByWeight) {
413
0
    size_t overlapAmount = overlapsByWeight.size();
414
0
    while (overlapAmount--) {
415
0
      for (const Overlap &overlap : overlapsByWeight[overlapAmount]) {
416
0
        StringEntry *dst = overlap.dst_;
417
0
        if (dst->prev_ || dst->parent_) {
418
          // dst is already spoken for, no need to consider it further.
419
0
          continue;
420
0
        }
421
0
        for (StringEntry *src : overlap.srcs_) {
422
0
          if (canOverlap(src, dst)) {
423
            // Apply the Overlap.
424
0
            src->next_ = dst;
425
0
            dst->prev_ = src;
426
0
            dst->overlapAmount_ = overlapAmount;
427
428
            // Traverse to the start and end of our new chain, and mark the fact
429
            // that end->start is prohibited because it would produce a cycle.
430
0
            StringEntry *end = dst;
431
0
            while (end->next_)
432
0
              end = end->next_;
433
0
            StringEntry *start = src;
434
0
            while (start->prev_)
435
0
              start = start->prev_;
436
0
            end->potentialCycles_.insert(start);
437
438
            // We picked an entry to come before dst, so we're done with dst.
439
0
            break;
440
0
          }
441
0
        }
442
0
      }
443
0
    }
444
0
  }
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<unsigned char>::planLayout(std::__1::vector<std::__1::vector<(anonymous namespace)::StringPacker<unsigned char>::Overlap, std::__1::allocator<(anonymous namespace)::StringPacker<unsigned char>::Overlap> >, std::__1::allocator<std::__1::vector<(anonymous namespace)::StringPacker<unsigned char>::Overlap, std::__1::allocator<(anonymous namespace)::StringPacker<unsigned char>::Overlap> > > > const&)
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<char16_t>::planLayout(std::__1::vector<std::__1::vector<(anonymous namespace)::StringPacker<char16_t>::Overlap, std::__1::allocator<(anonymous namespace)::StringPacker<char16_t>::Overlap> >, std::__1::allocator<std::__1::vector<(anonymous namespace)::StringPacker<char16_t>::Overlap, std::__1::allocator<(anonymous namespace)::StringPacker<char16_t>::Overlap> > > > const&)
445
446
  /// Given a string \p entry, lay it out if it has not already been laid out,
447
  /// appending any data necessaery to the \p storage blob.
448
  /// "Laying out" an entry means positioning it within the storage, by setting
449
  /// its offsetInStorage_ field. Laying out an entry may mean laying out other
450
  /// entries: its parent, or its prev and next entries.
451
0
  static void layoutIfNeeded(StringEntry *entry, std::vector<CharT> *storage) {
452
0
    if (entry->offsetInStorage_ != npos) {
453
0
      return; // already layed out
454
0
    }
455
456
    // Empty string is trivial
457
0
    if (entry->chars_.empty()) {
458
0
      entry->offsetInStorage_ = 0;
459
0
      return;
460
0
    }
461
462
    // If we have a parent, lay it out position ourself within it
463
0
    if (entry->parent_) {
464
0
      assert(entry->parent_ != entry && "entry cannot parent itself");
465
0
      assert(entry->offsetInParent_ != npos && "parent with no offset");
466
0
      assert(
467
0
          entry->prev_ == nullptr && entry->next_ == nullptr &&
468
0
          "Cannot have a parent and be in the path");
469
0
      layoutIfNeeded(entry->parent_, storage);
470
0
      entry->offsetInStorage_ =
471
0
          entry->parent_->offsetInStorage_ + entry->offsetInParent_;
472
0
      return;
473
0
    }
474
475
    // Go to the beginning and then layout forwards
476
0
    StringEntry *cursor = entry;
477
0
    while (cursor->prev_) {
478
0
      cursor = cursor->prev_;
479
0
    }
480
0
    assert(cursor->overlapAmount_ == 0 && "Cannot have overlap with no prev");
481
0
    while (cursor) {
482
0
      const auto &str = cursor->chars_;
483
0
      assert(
484
0
          cursor->offsetInStorage_ == npos &&
485
0
          "Cannot have already laid out an entry in this path");
486
0
      assert(
487
0
          cursor->overlapAmount_ <= str.size() &&
488
0
          "overlap cannot be larger than string size");
489
0
      assert(
490
0
          cursor->overlapAmount_ <= storage->size() &&
491
0
          "overlap cannot exceed the storage laid out so far ");
492
0
      cursor->offsetInStorage_ = storage->size() - cursor->overlapAmount_;
493
0
      storage->insert(
494
0
          storage->end(), str.begin() + cursor->overlapAmount_, str.end());
495
0
      cursor = cursor->next_;
496
0
    }
497
0
  }
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<unsigned char>::layoutIfNeeded((anonymous namespace)::StringPacker<unsigned char>::StringEntry*, std::__1::vector<unsigned char, std::__1::allocator<unsigned char> >*)
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<char16_t>::layoutIfNeeded((anonymous namespace)::StringPacker<char16_t>::StringEntry*, std::__1::vector<char16_t, std::__1::allocator<char16_t> >*)
498
499
  /// Packs \p strings in an optimized way.
500
  /// \return a storage blob, with the property that each entry appears at its
501
  /// offsetInStorage_ within the blob.
502
  static std::vector<CharT> optimizingPackStrings(
503
0
      MutableArrayRef<StringEntry> strings) {
504
0
    auto prefixSet = buildPrefixTrigramSet(strings);
505
0
    auto suffixes = buildSuffixArray(strings, prefixSet);
506
0
    auto overlaps = computeOverlapsAndParents(strings, suffixes);
507
0
    planLayout(overlaps);
508
0
    std::vector<CharT> storage;
509
0
    for (StringEntry &entry : strings) {
510
0
      layoutIfNeeded(&entry, &storage);
511
0
    }
512
0
    return storage; // efficient move
513
0
  }
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<unsigned char>::optimizingPackStrings(llvh::MutableArrayRef<(anonymous namespace)::StringPacker<unsigned char>::StringEntry>)
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<char16_t>::optimizingPackStrings(llvh::MutableArrayRef<(anonymous namespace)::StringPacker<char16_t>::StringEntry>)
514
515
  /// Packs \p strings naively, in their original order.
516
  /// \return a storage blob with the property that each string appears at its
517
  /// offsetInStorage_ within the blob.
518
  static std::vector<CharT> fastPackStrings(
519
2.34k
      MutableArrayRef<StringEntry> strings) {
520
2.34k
    std::vector<CharT> result;
521
2.34k
    for (StringEntry &str : strings) {
522
2.21k
      str.offsetInStorage_ = result.size();
523
2.21k
      result.insert(result.end(), str.chars_.begin(), str.chars_.end());
524
2.21k
    }
525
    // Note efficient move-semantics on return.
526
2.34k
    return result;
527
2.34k
  }
ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<unsigned char>::fastPackStrings(llvh::MutableArrayRef<(anonymous namespace)::StringPacker<unsigned char>::StringEntry>)
Line
Count
Source
519
1.17k
      MutableArrayRef<StringEntry> strings) {
520
1.17k
    std::vector<CharT> result;
521
2.00k
    for (StringEntry &str : strings) {
522
2.00k
      str.offsetInStorage_ = result.size();
523
2.00k
      result.insert(result.end(), str.chars_.begin(), str.chars_.end());
524
2.00k
    }
525
    // Note efficient move-semantics on return.
526
1.17k
    return result;
527
1.17k
  }
ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<char16_t>::fastPackStrings(llvh::MutableArrayRef<(anonymous namespace)::StringPacker<char16_t>::StringEntry>)
Line
Count
Source
519
1.17k
      MutableArrayRef<StringEntry> strings) {
520
1.17k
    std::vector<CharT> result;
521
1.17k
    for (StringEntry &str : strings) {
522
219
      str.offsetInStorage_ = result.size();
523
219
      result.insert(result.end(), str.chars_.begin(), str.chars_.end());
524
219
    }
525
    // Note efficient move-semantics on return.
526
1.17k
    return result;
527
1.17k
  }
528
529
#ifndef NDEBUG
530
  /// Validate a string packing. asserts that each string in \p strings does
531
  /// indeed appear at its claimed offset within \p storage.
532
  static void validateStringPacking(
533
      ArrayRef<StringEntry> strings,
534
2.34k
      ArrayRef<CharT> storage) {
535
2.34k
    for (const auto &entry : strings) {
536
2.21k
      auto offset = entry.offsetInStorage_;
537
2.21k
      auto size = entry.chars_.size();
538
2.21k
      assert(
539
2.21k
          offset + size >= offset && offset + size <= storage.size() &&
540
2.21k
          "Invalid offset or size for string entry");
541
2.21k
      assert(
542
2.21k
          entry.chars_ == storage.slice(offset, size) &&
543
2.21k
          "String does not appear at claimed offset in storage");
544
2.21k
    }
545
2.34k
  }
ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<unsigned char>::validateStringPacking(llvh::ArrayRef<(anonymous namespace)::StringPacker<unsigned char>::StringEntry>, llvh::ArrayRef<unsigned char>)
Line
Count
Source
534
1.17k
      ArrayRef<CharT> storage) {
535
2.00k
    for (const auto &entry : strings) {
536
2.00k
      auto offset = entry.offsetInStorage_;
537
2.00k
      auto size = entry.chars_.size();
538
2.00k
      assert(
539
2.00k
          offset + size >= offset && offset + size <= storage.size() &&
540
2.00k
          "Invalid offset or size for string entry");
541
2.00k
      assert(
542
2.00k
          entry.chars_ == storage.slice(offset, size) &&
543
2.00k
          "String does not appear at claimed offset in storage");
544
2.00k
    }
545
1.17k
  }
ConsecutiveStringStorage.cpp:(anonymous namespace)::StringPacker<char16_t>::validateStringPacking(llvh::ArrayRef<(anonymous namespace)::StringPacker<char16_t>::StringEntry>, llvh::ArrayRef<char16_t>)
Line
Count
Source
534
1.17k
      ArrayRef<CharT> storage) {
535
1.17k
    for (const auto &entry : strings) {
536
219
      auto offset = entry.offsetInStorage_;
537
219
      auto size = entry.chars_.size();
538
219
      assert(
539
219
          offset + size >= offset && offset + size <= storage.size() &&
540
219
          "Invalid offset or size for string entry");
541
219
      assert(
542
219
          entry.chars_ == storage.slice(offset, size) &&
543
219
          "String does not appear at claimed offset in storage");
544
219
    }
545
1.17k
  }
546
#endif
547
};
548
549
/// Class responsible for building a string table and associated storage blob.
550
/// This is constructed from a list of StringRefs. Strings which are in ASCII
551
/// are separated from strings that require a Unicode representation.
552
/// Internally we represent strings as StringEntries, which associates some
553
/// metadata about the string for use by the StringPacker.
554
class StringTableBuilder {
555
 private:
556
  /// We work in ArrayRefs, but we need something to own the storage for our u16
557
  /// strings. Each u16 string is owned by a vector<char16_t>.
558
  /// Note this cannot be a vector-of-vectors because vector invalidates
559
  /// its contents (i.e. strings are moved). deque and list will both work since
560
  /// push_back() is guaranteed to not invalidate references to the elements.
561
  std::deque<std::vector<char16_t>> u16StringStorage_;
562
563
 public:
564
  // Entries which are in ASCII.
565
  std::vector<StringPacker<unsigned char>::StringEntry> asciiStrings_;
566
567
  // Entries which contain non-ASCII characters.
568
  std::vector<StringPacker<char16_t>::StringEntry> u16Strings_;
569
570
  /// Constructs from a list of strings, given as a pair of iterators: begin
571
  /// and end.  Note that we do not always copy the underlying string data so
572
  /// the resulting builder must not outlive these strings.  In delta
573
  /// optimizing mode, only new strings are added here and packed.
574
  template <typename I, typename Force8Bit>
575
1.17k
  StringTableBuilder(I begin, I end, Force8Bit) {
576
    // Generate and store a StringEntry for each string.
577
    // Remember the index of each string in our StringEntry, so that we can
578
    // later output the table in the correct order.
579
    // Place the entry in our ASCII entries if possible; otherwise we have to
580
    // convert to u16.
581
    // In delta optimizing mode, the ids of new strings in the string table
582
    // are adjusted to reflect that they appear after the old strings.
583
1.17k
    uint32_t index = 0;
584
3.38k
    for (auto it = begin; it != end; ++it) {
585
2.21k
      auto &str = *it;
586
2.21k
      static_assert(sizeof(str.data()[0]) == 1, "strings must be UTF8");
587
2.21k
      const unsigned char *begin = (const unsigned char *)str.data();
588
2.21k
      const unsigned char *end = begin + str.size();
589
2.21k
      if (Force8Bit::value || isAllASCII(begin, end)) {
590
2.00k
        ArrayRef<unsigned char> astr(begin, end);
591
2.00k
        asciiStrings_.emplace_back(index, astr);
592
2.00k
      } else {
593
219
        u16StringStorage_.emplace_back();
594
219
        std::vector<char16_t> &ustr = u16StringStorage_.back();
595
219
        convertUTF8WithSurrogatesToUTF16(
596
219
            std::back_inserter(ustr), (const char *)begin, (const char *)end);
597
219
        u16Strings_.emplace_back(index, ustr);
598
219
      }
599
2.21k
      index++;
600
2.21k
    }
601
1.17k
  }
Unexecuted instantiation: ConsecutiveStringStorage.cpp:(anonymous namespace)::StringTableBuilder::StringTableBuilder<std::__1::__deque_iterator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const* const*, long, 170l>, std::__1::integral_constant<bool, false> >(std::__1::__deque_iterator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const* const*, long, 170l>, std::__1::__deque_iterator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const* const*, long, 170l>, std::__1::integral_constant<bool, false>)
ConsecutiveStringStorage.cpp:(anonymous namespace)::StringTableBuilder::StringTableBuilder<std::__1::__deque_iterator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >**, long, 170l>, std::__1::integral_constant<bool, false> >(std::__1::__deque_iterator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >**, long, 170l>, std::__1::__deque_iterator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >**, long, 170l>, std::__1::integral_constant<bool, false>)
Line
Count
Source
575
234
  StringTableBuilder(I begin, I end, Force8Bit) {
576
    // Generate and store a StringEntry for each string.
577
    // Remember the index of each string in our StringEntry, so that we can
578
    // later output the table in the correct order.
579
    // Place the entry in our ASCII entries if possible; otherwise we have to
580
    // convert to u16.
581
    // In delta optimizing mode, the ids of new strings in the string table
582
    // are adjusted to reflect that they appear after the old strings.
583
234
    uint32_t index = 0;
584
470
    for (auto it = begin; it != end; ++it) {
585
236
      auto &str = *it;
586
236
      static_assert(sizeof(str.data()[0]) == 1, "strings must be UTF8");
587
236
      const unsigned char *begin = (const unsigned char *)str.data();
588
236
      const unsigned char *end = begin + str.size();
589
236
      if (Force8Bit::value || isAllASCII(begin, end)) {
590
234
        ArrayRef<unsigned char> astr(begin, end);
591
234
        asciiStrings_.emplace_back(index, astr);
592
234
      } else {
593
2
        u16StringStorage_.emplace_back();
594
2
        std::vector<char16_t> &ustr = u16StringStorage_.back();
595
2
        convertUTF8WithSurrogatesToUTF16(
596
2
            std::back_inserter(ustr), (const char *)begin, (const char *)end);
597
2
        u16Strings_.emplace_back(index, ustr);
598
2
      }
599
236
      index++;
600
236
    }
601
234
  }
ConsecutiveStringStorage.cpp:(anonymous namespace)::StringTableBuilder::StringTableBuilder<llvh::StringRef const*, std::__1::integral_constant<bool, false> >(llvh::StringRef const*, llvh::StringRef const*, std::__1::integral_constant<bool, false>)
Line
Count
Source
575
234
  StringTableBuilder(I begin, I end, Force8Bit) {
576
    // Generate and store a StringEntry for each string.
577
    // Remember the index of each string in our StringEntry, so that we can
578
    // later output the table in the correct order.
579
    // Place the entry in our ASCII entries if possible; otherwise we have to
580
    // convert to u16.
581
    // In delta optimizing mode, the ids of new strings in the string table
582
    // are adjusted to reflect that they appear after the old strings.
583
234
    uint32_t index = 0;
584
2.21k
    for (auto it = begin; it != end; ++it) {
585
1.97k
      auto &str = *it;
586
1.97k
      static_assert(sizeof(str.data()[0]) == 1, "strings must be UTF8");
587
1.97k
      const unsigned char *begin = (const unsigned char *)str.data();
588
1.97k
      const unsigned char *end = begin + str.size();
589
1.97k
      if (Force8Bit::value || isAllASCII(begin, end)) {
590
1.76k
        ArrayRef<unsigned char> astr(begin, end);
591
1.76k
        asciiStrings_.emplace_back(index, astr);
592
1.76k
      } else {
593
217
        u16StringStorage_.emplace_back();
594
217
        std::vector<char16_t> &ustr = u16StringStorage_.back();
595
217
        convertUTF8WithSurrogatesToUTF16(
596
217
            std::back_inserter(ustr), (const char *)begin, (const char *)end);
597
217
        u16Strings_.emplace_back(index, ustr);
598
217
      }
599
1.97k
      index++;
600
1.97k
    }
601
234
  }
ConsecutiveStringStorage.cpp:(anonymous namespace)::StringTableBuilder::StringTableBuilder<std::__1::__deque_iterator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const* const*, long, 170l>, std::__1::integral_constant<bool, true> >(std::__1::__deque_iterator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const* const*, long, 170l>, std::__1::__deque_iterator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const* const*, long, 170l>, std::__1::integral_constant<bool, true>)
Line
Count
Source
575
702
  StringTableBuilder(I begin, I end, Force8Bit) {
576
    // Generate and store a StringEntry for each string.
577
    // Remember the index of each string in our StringEntry, so that we can
578
    // later output the table in the correct order.
579
    // Place the entry in our ASCII entries if possible; otherwise we have to
580
    // convert to u16.
581
    // In delta optimizing mode, the ids of new strings in the string table
582
    // are adjusted to reflect that they appear after the old strings.
583
702
    uint32_t index = 0;
584
708
    for (auto it = begin; it != end; ++it) {
585
6
      auto &str = *it;
586
6
      static_assert(sizeof(str.data()[0]) == 1, "strings must be UTF8");
587
6
      const unsigned char *begin = (const unsigned char *)str.data();
588
6
      const unsigned char *end = begin + str.size();
589
6
      if (Force8Bit::value || isAllASCII(begin, end)) {
590
6
        ArrayRef<unsigned char> astr(begin, end);
591
6
        asciiStrings_.emplace_back(index, astr);
592
6
      } else {
593
0
        u16StringStorage_.emplace_back();
594
0
        std::vector<char16_t> &ustr = u16StringStorage_.back();
595
0
        convertUTF8WithSurrogatesToUTF16(
596
0
            std::back_inserter(ustr), (const char *)begin, (const char *)end);
597
0
        u16Strings_.emplace_back(index, ustr);
598
0
      }
599
6
      index++;
600
6
    }
601
702
  }
602
603
  /// \return the size our strings would occupy with no packing applied.
604
1.17k
  size_t unpackedSize() const {
605
1.17k
    size_t result = 0;
606
2.00k
    for (const auto &entry : asciiStrings_) {
607
2.00k
      result += sizeof(char) * entry.chars_.size();
608
2.00k
    }
609
1.17k
    for (const auto &entry : u16Strings_) {
610
219
      result += sizeof(char16_t) * entry.chars_.size();
611
219
    }
612
1.17k
    return result;
613
1.17k
  }
614
615
  /// "Pack" our strings into the given storages. This means setting the
616
  /// offsetInStorage_ field of each string entry, and then generating a
617
  /// corresponding storage blob such that each string is at that offset in the
618
  /// blob. Returns packed \p asciiStorage and \p u16Storage, by reference.
619
  /// If \p optimize is set, attempt to find a more efficient packing by reusing
620
  /// substrings and prefix-suffix overlaps.
621
  void packIntoStorage(
622
      std::vector<unsigned char> *asciiStorage,
623
      std::vector<char16_t> *u16Storage,
624
1.17k
      bool optimize) {
625
1.17k
    NamedRegionTimer timer(
626
1.17k
        "",
627
1.17k
        optimize ? "optimizingPackStrings" : "fastPackStrings",
628
1.17k
        "",
629
1.17k
        StringTableTimersName,
630
1.17k
        AreStatisticsEnabled());
631
    // Note these assignments use efficient move-assignment, not copying.
632
1.17k
    if (optimize) {
633
0
      *asciiStorage =
634
0
          StringPacker<unsigned char>::optimizingPackStrings(asciiStrings_);
635
0
      *u16Storage = StringPacker<char16_t>::optimizingPackStrings(u16Strings_);
636
1.17k
    } else {
637
1.17k
      *asciiStorage =
638
1.17k
          StringPacker<unsigned char>::fastPackStrings(asciiStrings_);
639
1.17k
      *u16Storage = StringPacker<char16_t>::fastPackStrings(u16Strings_);
640
1.17k
    }
641
642
1.17k
#ifndef NDEBUG
643
    // Ensure that our packing was correct.
644
1.17k
    StringPacker<unsigned char>::validateStringPacking(
645
1.17k
        asciiStrings_, *asciiStorage);
646
1.17k
    StringPacker<char16_t>::validateStringPacking(u16Strings_, *u16Storage);
647
1.17k
#endif
648
1.17k
  }
649
650
  /// Given that our strings have been packed into some storage, builds a string
651
  /// table from our stored string entries.
652
  /// \return a string table representing the offset and length of each string.
653
  /// Offsets are taken from the offsetInStorage_ property of each string entry.
654
  /// u16 string offsets are adjusted by \p u16OffsetAdjust and have their size
655
  /// negated to indicate they are Unicode. If a string is in the
656
  /// \p identifiers, we also mark the second most significant bit.
657
  std::vector<StringTableEntry> generateStringTable(
658
      ArrayRef<unsigned char> storage,
659
1.17k
      size_t u16OffsetAdjust) {
660
    // Each of our StringEntries remembers its original index in the initial
661
    // array. Create a table large enough, and set the corresponding index in
662
    // the table.
663
1.17k
    std::vector<StringTableEntry> table;
664
1.17k
    table.resize(asciiStrings_.size() + u16Strings_.size());
665
666
2.00k
    for (const auto &asciiStr : asciiStrings_) {
667
2.00k
      table.at(asciiStr.stringID_) = {
668
2.00k
          static_cast<uint32_t>(asciiStr.offsetInStorage_),
669
2.00k
          static_cast<uint32_t>(asciiStr.chars_.size()),
670
2.00k
          false /* isUTF16 */};
671
2.00k
    }
672
1.17k
    for (const auto &u16Str : u16Strings_) {
673
219
      table.at(u16Str.stringID_) = {
674
219
          static_cast<uint32_t>(
675
219
              u16Str.offsetInStorage_ * sizeof(char16_t) + u16OffsetAdjust),
676
219
          static_cast<uint32_t>(u16Str.chars_.size()),
677
219
          true /* isUTF16 */};
678
219
    }
679
1.17k
    return table;
680
1.17k
  }
681
682
  /// Appends \p u16Storage to the given byte array \p output.
683
  /// \return the offset of the u16 storage in that byte array.
684
  static size_t appendU16Storage(
685
      ArrayRef<char16_t> u16Storage,
686
1.17k
      std::vector<unsigned char> *output) {
687
1.17k
    using namespace llvh::support;
688
1.17k
    static_assert(sizeof(char16_t) == 2, "sizeof char16_t unexpectedly not 2");
689
1.17k
    if (u16Storage.empty()) {
690
      // Nothing to do, don't even bother aligning.
691
1.14k
      return 0;
692
1.14k
    }
693
694
    // Ensure 2-byte alignment.
695
29
    if (output->size() % sizeof(char16_t)) {
696
13
      output->push_back('\0');
697
13
    }
698
699
    // Make space, and write as little endian.
700
29
    size_t offset = output->size();
701
29
    output->resize(output->size() + sizeof(char16_t) * u16Storage.size());
702
29
    unsigned char *cursor = &output->at(offset);
703
2.80M
    for (char16_t s : u16Storage) {
704
2.80M
      endian::write<char16_t, little, 0>(cursor, s);
705
2.80M
      cursor += sizeof(s);
706
2.80M
    }
707
29
    return offset;
708
1.17k
  }
709
};
710
711
} // namespace
712
713
namespace hermes {
714
namespace hbc {
715
716
template <typename I, typename Force8Bit>
717
ConsecutiveStringStorage::ConsecutiveStringStorage(
718
    I begin,
719
    I end,
720
    Force8Bit,
721
1.17k
    bool optimize) {
722
  // Prepare to build our string table.
723
  // Generate storage for our ASCII and u16 strings.
724
1.17k
  StringTableBuilder builder(begin, end, Force8Bit{});
725
1.17k
  std::vector<unsigned char> asciiStorage;
726
1.17k
  std::vector<char16_t> u16Storage;
727
1.17k
  builder.packIntoStorage(&asciiStorage, &u16Storage, optimize);
728
729
  // Append the u16 storage to the ASCII storage, to form our combined storage.
730
1.17k
  storage_.insert(storage_.end(), asciiStorage.begin(), asciiStorage.end());
731
1.17k
  auto u16Offset = StringTableBuilder::appendU16Storage(u16Storage, &storage_);
732
733
  // Build our table over the storage.
734
1.17k
  strTable_ = builder.generateStringTable(storage_, u16Offset);
735
736
  // Record some stats.
737
1.17k
  size_t unpackedSize = builder.unpackedSize();
738
1.17k
  StringTableSize += unpackedSize;
739
1.17k
  StringTableSavings += unpackedSize - storage_.size();
740
1.17k
}
Unexecuted instantiation: hermes::hbc::ConsecutiveStringStorage::ConsecutiveStringStorage<std::__1::__deque_iterator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const* const*, long, 170l>, std::__1::integral_constant<bool, false> >(std::__1::__deque_iterator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const* const*, long, 170l>, std::__1::__deque_iterator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const* const*, long, 170l>, std::__1::integral_constant<bool, false>, bool)
hermes::hbc::ConsecutiveStringStorage::ConsecutiveStringStorage<std::__1::__deque_iterator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >**, long, 170l>, std::__1::integral_constant<bool, false> >(std::__1::__deque_iterator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >**, long, 170l>, std::__1::__deque_iterator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >**, long, 170l>, std::__1::integral_constant<bool, false>, bool)
Line
Count
Source
721
234
    bool optimize) {
722
  // Prepare to build our string table.
723
  // Generate storage for our ASCII and u16 strings.
724
234
  StringTableBuilder builder(begin, end, Force8Bit{});
725
234
  std::vector<unsigned char> asciiStorage;
726
234
  std::vector<char16_t> u16Storage;
727
234
  builder.packIntoStorage(&asciiStorage, &u16Storage, optimize);
728
729
  // Append the u16 storage to the ASCII storage, to form our combined storage.
730
234
  storage_.insert(storage_.end(), asciiStorage.begin(), asciiStorage.end());
731
234
  auto u16Offset = StringTableBuilder::appendU16Storage(u16Storage, &storage_);
732
733
  // Build our table over the storage.
734
234
  strTable_ = builder.generateStringTable(storage_, u16Offset);
735
736
  // Record some stats.
737
234
  size_t unpackedSize = builder.unpackedSize();
738
234
  StringTableSize += unpackedSize;
739
234
  StringTableSavings += unpackedSize - storage_.size();
740
234
}
hermes::hbc::ConsecutiveStringStorage::ConsecutiveStringStorage<llvh::StringRef const*, std::__1::integral_constant<bool, false> >(llvh::StringRef const*, llvh::StringRef const*, std::__1::integral_constant<bool, false>, bool)
Line
Count
Source
721
234
    bool optimize) {
722
  // Prepare to build our string table.
723
  // Generate storage for our ASCII and u16 strings.
724
234
  StringTableBuilder builder(begin, end, Force8Bit{});
725
234
  std::vector<unsigned char> asciiStorage;
726
234
  std::vector<char16_t> u16Storage;
727
234
  builder.packIntoStorage(&asciiStorage, &u16Storage, optimize);
728
729
  // Append the u16 storage to the ASCII storage, to form our combined storage.
730
234
  storage_.insert(storage_.end(), asciiStorage.begin(), asciiStorage.end());
731
234
  auto u16Offset = StringTableBuilder::appendU16Storage(u16Storage, &storage_);
732
733
  // Build our table over the storage.
734
234
  strTable_ = builder.generateStringTable(storage_, u16Offset);
735
736
  // Record some stats.
737
234
  size_t unpackedSize = builder.unpackedSize();
738
234
  StringTableSize += unpackedSize;
739
234
  StringTableSavings += unpackedSize - storage_.size();
740
234
}
hermes::hbc::ConsecutiveStringStorage::ConsecutiveStringStorage<std::__1::__deque_iterator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const* const*, long, 170l>, std::__1::integral_constant<bool, true> >(std::__1::__deque_iterator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const* const*, long, 170l>, std::__1::__deque_iterator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const*, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const* const*, long, 170l>, std::__1::integral_constant<bool, true>, bool)
Line
Count
Source
721
702
    bool optimize) {
722
  // Prepare to build our string table.
723
  // Generate storage for our ASCII and u16 strings.
724
702
  StringTableBuilder builder(begin, end, Force8Bit{});
725
702
  std::vector<unsigned char> asciiStorage;
726
702
  std::vector<char16_t> u16Storage;
727
702
  builder.packIntoStorage(&asciiStorage, &u16Storage, optimize);
728
729
  // Append the u16 storage to the ASCII storage, to form our combined storage.
730
702
  storage_.insert(storage_.end(), asciiStorage.begin(), asciiStorage.end());
731
702
  auto u16Offset = StringTableBuilder::appendU16Storage(u16Storage, &storage_);
732
733
  // Build our table over the storage.
734
702
  strTable_ = builder.generateStringTable(storage_, u16Offset);
735
736
  // Record some stats.
737
702
  size_t unpackedSize = builder.unpackedSize();
738
702
  StringTableSize += unpackedSize;
739
702
  StringTableSavings += unpackedSize - storage_.size();
740
702
}
741
742
template ConsecutiveStringStorage::ConsecutiveStringStorage(
743
    StringSetVector::const_iterator begin,
744
    StringSetVector::const_iterator end,
745
    std::false_type,
746
    bool optimize);
747
748
template ConsecutiveStringStorage::ConsecutiveStringStorage(
749
    StringSetVector::iterator begin,
750
    StringSetVector::iterator end,
751
    std::false_type,
752
    bool optimize);
753
754
template ConsecutiveStringStorage::ConsecutiveStringStorage(
755
    ArrayRef<llvh::StringRef>::const_iterator begin,
756
    ArrayRef<llvh::StringRef>::const_iterator end,
757
    std::false_type,
758
    bool optimize);
759
760
template ConsecutiveStringStorage::ConsecutiveStringStorage(
761
    StringSetVector::const_iterator begin,
762
    StringSetVector::const_iterator end,
763
    std::true_type,
764
    bool optimize);
765
766
906
uint32_t ConsecutiveStringStorage::getEntryHash(size_t i) const {
767
906
  ensureTableValid();
768
906
  ensureStorageValid();
769
770
906
  auto &entry = strTable_[i];
771
906
  uint32_t length = entry.getLength();
772
906
  assert(
773
906
      entry.getOffset() + (entry.isUTF16() ? length * 2 : length) <=
774
906
          storage_.size() &&
775
906
      "entry past end");
776
906
  const unsigned char *data = storage_.data() + entry.getOffset();
777
906
  if (entry.isUTF16()) {
778
64
    const char16_t *u16data = reinterpret_cast<const char16_t *>(data);
779
64
    return hermes::hashString(ArrayRef<char16_t>{u16data, length});
780
842
  } else {
781
842
    return hermes::hashString(ArrayRef<char>{(const char *)data, length});
782
842
  }
783
906
}
784
785
234
void ConsecutiveStringStorage::appendStorage(ConsecutiveStringStorage &&rhs) {
786
234
  ensureTableValid();
787
234
  ensureStorageValid();
788
  // If we have not yet been written, just acquire the rhs.
789
234
  if (strTable_.empty()) {
790
234
    *this = std::move(rhs);
791
234
    return;
792
234
  }
793
  // Offset incoming string entries by the size of our storage, and append the
794
  // incoming storage. Don't bother to offset if the string is empty; this
795
  // ensures that the empty string doesn't get pushed to strange places.
796
0
  uint32_t storageDelta = storage_.size();
797
0
  strTable_.reserve(strTable_.size() + rhs.strTable_.size());
798
0
  for (const StringTableEntry &entry : rhs.strTable_) {
799
0
    uint32_t length = entry.getLength();
800
0
    uint32_t offset = entry.getOffset() + (length ? storageDelta : 0);
801
0
    strTable_.emplace_back(offset, length, entry.isUTF16());
802
0
  }
803
0
  storage_.insert(storage_.end(), rhs.storage_.begin(), rhs.storage_.end());
804
0
}
805
806
llvh::StringRef ConsecutiveStringStorage::getStringAtIndex(
807
    uint32_t idx,
808
1.97k
    std::string &utf8ConversionStorage) const {
809
1.97k
  ensureTableValid();
810
1.97k
  ensureStorageValid();
811
1.97k
  assert(idx < strTable_.size() && "getStringAtIndex: invalid index");
812
1.97k
  return getStringFromEntry(strTable_[idx], storage_, utf8ConversionStorage);
813
1.97k
}
814
815
llvh::StringRef getStringFromEntry(
816
    const StringTableEntry &entry,
817
    llvh::ArrayRef<unsigned char> storage,
818
1.99k
    std::string &utf8ConversionStorage) {
819
1.99k
  uint32_t offset = entry.getOffset();
820
1.99k
  uint32_t length = entry.getLength();
821
1.99k
  assert(
822
1.99k
      offset + length <= storage.size() && offset + length >= offset &&
823
1.99k
      "Invalid entry");
824
1.99k
  if (!entry.isUTF16()) {
825
1.78k
    return llvh::StringRef{(const char *)storage.data() + offset, length};
826
1.78k
  } else {
827
217
    const char16_t *s =
828
217
        reinterpret_cast<const char16_t *>(storage.data() + offset);
829
217
    llvh::ArrayRef<char16_t> u16String(s, length);
830
217
    convertUTF16ToUTF8WithSingleSurrogates(utf8ConversionStorage, u16String);
831
217
    return utf8ConversionStorage;
832
217
  }
833
1.99k
}
834
835
} // namespace hbc
836
} // namespace hermes
837
838
#undef DEBUG_TYPE