/src/hermes/include/hermes/VM/StringView.h
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) Meta Platforms, Inc. and affiliates. |
3 | | * |
4 | | * This source code is licensed under the MIT license found in the |
5 | | * LICENSE file in the root directory of this source tree. |
6 | | */ |
7 | | |
8 | | #ifndef HERMES_VM_STRINGVIEW_H |
9 | | #define HERMES_VM_STRINGVIEW_H |
10 | | |
11 | | #include "SmallXString.h" |
12 | | #include "hermes/VM/Runtime.h" |
13 | | #include "hermes/VM/StringPrimitive.h" |
14 | | #include "hermes/VM/StringRefUtils.h" |
15 | | #include "hermes/VM/TwineChar16.h" |
16 | | #pragma GCC diagnostic push |
17 | | |
18 | | #ifdef HERMES_COMPILER_SUPPORTS_WSHORTEN_64_TO_32 |
19 | | #pragma GCC diagnostic ignored "-Wshorten-64-to-32" |
20 | | #endif |
21 | | namespace hermes { |
22 | | namespace vm { |
23 | | |
24 | | /// StringView is a view to the string content from StringPrimitive. |
25 | | /// It hides the difference between ASCII strings and UTF16 strings, and hence |
26 | | /// allows you to iterate through a string without worrying about the type. |
27 | | /// Internally, it's a char pointer and a char16 pointer (only one is valid). |
28 | | /// |
29 | | /// Performance: Iterating from StringView is slightly slower than normal |
30 | | /// iterations: every operation has one extra conditional check on the type. |
31 | | /// If you are in an extremely performance-sensitive setting, consider getting |
32 | | /// raw pointers directly out of StringPrimitive and explicitly duplicate code |
33 | | /// to handle char and char16 strings separately. |
34 | | /// |
35 | | /// Alternatively, if you know the string is very likely to be UTF16, or the |
36 | | /// string is short, consider calling getUTF16Ref (which may invoke a string copy |
37 | | /// if it turns out to be an ASCII string). |
38 | | class StringView { |
39 | | friend class StringPrimitive; |
40 | | friend class IdentifierTable; |
41 | | |
42 | | union { |
43 | | /// StringView can be used to represent a view to a non-GC-managed string, |
44 | | /// a.k.a persistent identifiers whose string content is from a static |
45 | | /// memory address (either a C++ literal or a persistent bytecode module). |
46 | | const void *nonManagedStringPtr_; |
47 | | |
48 | | /// Handle pointing to the actual string. We need a handle to allow a |
49 | | /// StringView to survive allocations, so that we can have multiple |
50 | | /// StringViews around at the same time. Note that the StringPrimitive |
51 | | /// must have been resolved if it's a rope, i.e. we should be able to obtain |
52 | | /// a char/char16 pointer directly from strPrim_. |
53 | | /// |
54 | | /// NOTE: we are using \c llvh::AlignedCharArrayUnion to avoid constructing |
55 | | /// the handle (which doesn't have a default constructor). |
56 | | llvh::AlignedCharArrayUnion<Handle<StringPrimitive>> strPrim_; |
57 | | }; |
58 | | |
59 | | /// Starting index in the StringPrimitive as the beginning of this view. |
60 | | uint32_t startIndex_ : 30; |
61 | | |
62 | | /// Whether we are storing a handle or a non-managed pointer. |
63 | | uint32_t isHandle_ : 1; |
64 | | |
65 | | /// Whether the string is ASCII. |
66 | | uint32_t isASCII_ : 1; |
67 | | |
68 | | /// Length of the string. |
69 | | uint32_t length_; |
70 | | |
71 | | public: |
72 | | /// Iterator for StringView. It's mostly standard except operator* does not |
73 | | /// return a reference, which disables certain things such as creating a |
74 | | /// reverse_iterator using std::reverse_iterator. |
75 | | class const_iterator { |
76 | | friend class StringView; |
77 | | |
78 | | /// Current pointer position if the underlying string is char string. |
79 | | const char *charPtr_{nullptr}; |
80 | | |
81 | | /// Current pointer position if the underlying string is char16 string. |
82 | | const char16_t *char16Ptr_{nullptr}; |
83 | | |
84 | | const_iterator(const char *charPtr, const char16_t *char16Ptr) |
85 | 252k | : charPtr_(charPtr), char16Ptr_(char16Ptr) { |
86 | 252k | assert( |
87 | 252k | ((!charPtr_) ^ (!char16Ptr_)) && |
88 | 252k | "Must provide one of char or char16 pointer"); |
89 | 252k | } |
90 | | |
91 | 2.05k | explicit const_iterator(const char *ptr) : const_iterator(ptr, nullptr) {} |
92 | | |
93 | | explicit const_iterator(const char16_t *ptr) |
94 | 249k | : const_iterator(nullptr, ptr) {} |
95 | | |
96 | | public: |
97 | | using iterator_category = std::random_access_iterator_tag; |
98 | | using value_type = char16_t; |
99 | | using pointer = char16_t *; |
100 | | using difference_type = std::ptrdiff_t; |
101 | | using reference = char16_t; |
102 | | |
103 | | const_iterator() = default; |
104 | | |
105 | | /// Allows for copying. |
106 | | const_iterator(const const_iterator &other) = default; |
107 | | const_iterator &operator=(const const_iterator &other) = default; |
108 | | |
109 | 1.80M | const_iterator &operator++() { |
110 | 1.80M | if (charPtr_) { |
111 | 104k | ++charPtr_; |
112 | 1.69M | } else { |
113 | 1.69M | ++char16Ptr_; |
114 | 1.69M | } |
115 | 1.80M | return *this; |
116 | 1.80M | } |
117 | 0 | const_iterator &operator--() { |
118 | 0 | if (charPtr_) { |
119 | 0 | --charPtr_; |
120 | 0 | } else { |
121 | 0 | --char16Ptr_; |
122 | 0 | } |
123 | 0 | return *this; |
124 | 0 | } |
125 | 0 | const_iterator &operator+=(difference_type rhs) { |
126 | 0 | if (charPtr_) { |
127 | 0 | charPtr_ += rhs; |
128 | 0 | } else { |
129 | 0 | char16Ptr_ += rhs; |
130 | 0 | } |
131 | 0 | return *this; |
132 | 0 | } |
133 | 0 | const_iterator &operator-=(difference_type rhs) { |
134 | 0 | if (charPtr_) { |
135 | 0 | charPtr_ -= rhs; |
136 | 0 | } else { |
137 | 0 | char16Ptr_ -= rhs; |
138 | 0 | } |
139 | 0 | return *this; |
140 | 0 | } |
141 | 0 | const_iterator operator++(int) { |
142 | 0 | const_iterator tmp(charPtr_, char16Ptr_); |
143 | 0 | if (charPtr_) { |
144 | 0 | ++charPtr_; |
145 | 0 | } else { |
146 | 0 | ++char16Ptr_; |
147 | 0 | } |
148 | 0 | return tmp; |
149 | 0 | } |
150 | 0 | const_iterator operator--(int) { |
151 | 0 | const_iterator tmp(charPtr_, char16Ptr_); |
152 | 0 | if (charPtr_) { |
153 | 0 | --charPtr_; |
154 | 0 | } else { |
155 | 0 | --char16Ptr_; |
156 | 0 | } |
157 | 0 | return tmp; |
158 | 0 | } |
159 | | |
160 | 287 | difference_type operator-(const const_iterator &rhs) const { |
161 | 287 | if (charPtr_) { |
162 | 287 | return charPtr_ - rhs.charPtr_; |
163 | 287 | } |
164 | 0 | return char16Ptr_ - rhs.char16Ptr_; |
165 | 287 | } |
166 | | |
167 | 174 | const_iterator operator-(difference_type rhs) const { |
168 | 174 | if (charPtr_) { |
169 | 174 | return const_iterator(charPtr_ - rhs, char16Ptr_); |
170 | 174 | } |
171 | 0 | return const_iterator(charPtr_, char16Ptr_ - rhs); |
172 | 174 | } |
173 | 0 | const_iterator operator+(difference_type rhs) const { |
174 | 0 | if (charPtr_) { |
175 | 0 | return const_iterator(charPtr_ + rhs, char16Ptr_); |
176 | 0 | } |
177 | 0 | return const_iterator(charPtr_, char16Ptr_ + rhs); |
178 | 0 | } |
179 | | |
180 | | /// Const dereference. Note that we cannot return a reference here (without |
181 | | /// losing efficiency), hence making this iterator non-standard. |
182 | 1.80M | char16_t operator*() const { |
183 | 1.80M | return charPtr_ ? *charPtr_ : *char16Ptr_; |
184 | 1.80M | } |
185 | | |
186 | | /// Comparisons. |
187 | 1.92M | bool operator==(const const_iterator &rhs) const { |
188 | 1.92M | if (charPtr_) { |
189 | 106k | return charPtr_ == rhs.charPtr_; |
190 | 106k | } |
191 | 1.82M | return char16Ptr_ == rhs.char16Ptr_; |
192 | 1.92M | } |
193 | 1.92M | bool operator!=(const const_iterator &rhs) const { |
194 | 1.92M | return !(*this == rhs); |
195 | 1.92M | } |
196 | 0 | bool operator>(const const_iterator &rhs) const { |
197 | 0 | if (charPtr_) { |
198 | 0 | return charPtr_ > rhs.charPtr_; |
199 | 0 | } |
200 | 0 | return char16Ptr_ > rhs.char16Ptr_; |
201 | 0 | } |
202 | 0 | bool operator<(const const_iterator &rhs) const { |
203 | 0 | if (charPtr_) { |
204 | 0 | return charPtr_ < rhs.charPtr_; |
205 | 0 | } |
206 | 0 | return char16Ptr_ < rhs.char16Ptr_; |
207 | 0 | } |
208 | 0 | bool operator>=(const const_iterator &rhs) const { |
209 | 0 | return !(*this < rhs); |
210 | 0 | } |
211 | 0 | bool operator<=(const const_iterator &rhs) const { |
212 | 0 | return !(*this > rhs); |
213 | 0 | } |
214 | | }; |
215 | | |
216 | | /// Reverse iterator type. |
217 | | using const_reverse_iterator = std::reverse_iterator<const_iterator>; |
218 | | |
219 | | // In debug mode the handle is non-trivial, which makes us non-trivial too and |
220 | | // we need to invoke its copy constructor and destructor. |
221 | | // We could also deal with this using templates, by inheriting from a different |
222 | | // base class depending on std::is_trivially_copyable<>, but the complexity is |
223 | | // probably not worth it. |
224 | | #ifndef NDEBUG |
225 | 511k | StringView(const StringView &other) { |
226 | 511k | ::memcpy(this, &other, sizeof(*this)); |
227 | 511k | if (isHandle_) |
228 | 47.4k | new (strPrim_.buffer) Handle<StringPrimitive>(other.strPrim()); |
229 | 511k | } |
230 | | |
231 | 0 | StringView &operator=(const StringView &other) { |
232 | 0 | if (this != &other) { |
233 | 0 | if (isHandle_) |
234 | 0 | strPrim().~Handle<StringPrimitive>(); |
235 | 0 | ::memcpy(this, &other, sizeof(*this)); |
236 | 0 | if (isHandle_) |
237 | 0 | new (strPrim_.buffer) Handle<StringPrimitive>(other.strPrim()); |
238 | 0 | } |
239 | 0 | return *this; |
240 | 0 | } |
241 | | |
242 | 1.53M | ~StringView() { |
243 | 1.53M | if (isHandle_) |
244 | 581k | strPrim().~Handle<StringPrimitive>(); |
245 | 1.53M | } |
246 | | #else |
247 | | StringView(const StringView &other) = default; |
248 | | ~StringView() = default; |
249 | | #endif |
250 | | |
251 | 0 | StringView(const char *ptr) : StringView(ASCIIRef(ptr, strlen(ptr))) {} |
252 | | |
253 | | /// \return an iterator pointing at the beginning of the string. |
254 | 126k | const_iterator begin() const { |
255 | 126k | if (isASCII()) { |
256 | 1.07k | return const_iterator(castToCharPtr()); |
257 | 1.07k | } |
258 | 124k | return const_iterator(castToChar16Ptr()); |
259 | 126k | } |
260 | | |
261 | | /// \return an iterator pointing at one past the end of the string. |
262 | 125k | const_iterator end() const { |
263 | 125k | if (isASCII()) { |
264 | 984 | return const_iterator(castToCharPtr() + length_); |
265 | 984 | } |
266 | 124k | return const_iterator(castToChar16Ptr() + length_); |
267 | 125k | } |
268 | | |
269 | | /// \return a reverse iterator pointing at the end of the string. |
270 | 0 | const_reverse_iterator rbegin() const { |
271 | 0 | return const_reverse_iterator(end()); |
272 | 0 | } |
273 | | |
274 | | /// \return a reverse iterator pointing at one past the beginning of the string. |
275 | 0 | const_reverse_iterator rend() const { |
276 | 0 | return const_reverse_iterator(begin()); |
277 | 0 | } |
278 | | |
279 | | /// \return the length. |
280 | 1.12M | size_t length() const { |
281 | 1.12M | return length_; |
282 | 1.12M | } |
283 | | |
284 | | /// \return whether this string is empty. |
285 | 0 | bool empty() const { |
286 | 0 | return !length_; |
287 | 0 | } |
288 | | |
289 | | /// \return whether this is a char string. |
290 | 2.30M | bool isASCII() const { |
291 | 2.30M | return isASCII_; |
292 | 2.30M | } |
293 | | |
294 | | /// Direct indexing, \return character at \p index. |
295 | 255 | char16_t operator[](uint32_t index) const { |
296 | 255 | assert(index < length_ && "Out of bound indexing"); |
297 | 255 | if (isASCII()) { |
298 | 255 | return castToCharPtr()[index]; |
299 | 255 | } |
300 | 0 | return castToChar16Ptr()[index]; |
301 | 255 | } |
302 | | |
303 | | /// \return a new StringView with the string sliced from \p start with |
304 | | /// length \p length. |
305 | 87 | StringView slice(uint32_t start, uint32_t length) const { |
306 | 87 | assert(start + length <= length_ && "Out of bound slicing"); |
307 | 87 | auto newStringView = *this; |
308 | 87 | newStringView.startIndex_ += start; |
309 | 87 | newStringView.length_ = length; |
310 | 87 | return newStringView; |
311 | 87 | } |
312 | | |
313 | | /// \return a new StringView with the string sliced from \p start till |
314 | | /// the end of the string. |
315 | 0 | StringView slice(uint32_t start) const { |
316 | 0 | assert(start <= length_ && "Out of bound slicing"); |
317 | 0 | return slice(start, length_ - start); |
318 | 0 | } |
319 | | |
320 | | /// \return a new StringView with the string sliced between [first, last). |
321 | 87 | StringView slice(const_iterator first, const_iterator last) const { |
322 | 87 | return slice(first - begin(), last - first); |
323 | 87 | } |
324 | | |
325 | | /// \return a UTF16Ref pointing at the beginning of the string. |
326 | | /// If the string is already UTF16, we return the pointer directly; |
327 | | /// otherwise (it's ASCII) we copy the string into the end of \p allocator, |
328 | | /// and \return a pointer to the beginning of this string in the allocator. |
329 | | /// \pre allocator must be empty when passed in. |
330 | 104k | UTF16Ref getUTF16Ref(llvh::SmallVectorImpl<char16_t> &allocator) const { |
331 | 104k | assert(allocator.empty() && "Shouldn't use a non-empty allocator"); |
332 | 104k | return getUTF16Ref(allocator, false); |
333 | 104k | } |
334 | | |
335 | | /// Append the string into \p allocator, even though the string may already be |
336 | | /// UTF16. |
337 | 0 | void appendUTF16String(llvh::SmallVectorImpl<char16_t> &allocator) const { |
338 | 0 | (void)getUTF16Ref(allocator, true); |
339 | 0 | } |
340 | | |
341 | | /// Assuming the StringView represents a char string, \return the pointer. |
342 | 899k | const char *castToCharPtr() const { |
343 | 899k | assert(isASCII() && "Cannot cast char16_t pointer to char pointer"); |
344 | 899k | if (!isHandle_) { |
345 | 488k | return static_cast<const char *>(nonManagedStringPtr_) + startIndex_; |
346 | 488k | } |
347 | 899k | assert(isHandle_ && "StringView does not contain a valid string"); |
348 | 411k | return (*strPrim())->castToASCIIPointer() + startIndex_; |
349 | 411k | } |
350 | | |
351 | | /// Assuming the StringView represents a char16 string, \return the pointer. |
352 | 250k | const char16_t *castToChar16Ptr() const { |
353 | 250k | assert(!isASCII() && "Cannot cast char pointer to char16 pointer"); |
354 | 250k | if (!isHandle_) { |
355 | 0 | return static_cast<const char16_t *>(nonManagedStringPtr_) + startIndex_; |
356 | 0 | } |
357 | 250k | assert(isHandle_ && "StringView does not contain a valid string"); |
358 | 250k | return (*strPrim())->castToUTF16Pointer() + startIndex_; |
359 | 250k | } |
360 | | |
361 | | /// Check if two StringViews are equal. |
362 | 348 | bool equals(const StringView &other) const { |
363 | 348 | if (other.isASCII()) { |
364 | 348 | return equals(ASCIIRef(other.castToCharPtr(), other.length())); |
365 | 348 | } |
366 | 0 | return equals(UTF16Ref(other.castToChar16Ptr(), other.length())); |
367 | 348 | } |
368 | | |
369 | | /// Check if a StringView is equal to an ArrayRef. |
370 | | template <typename T> |
371 | 1.67k | bool equals(const llvh::ArrayRef<T> &other) const { |
372 | 1.67k | if (isASCII()) { |
373 | 1.40k | return stringRefEquals(ASCIIRef(castToCharPtr(), length()), other); |
374 | 1.40k | } |
375 | 263 | return stringRefEquals(UTF16Ref(castToChar16Ptr(), length()), other); |
376 | 1.67k | } |
bool hermes::vm::StringView::equals<char>(llvh::ArrayRef<char> const&) const
Line | Count | Source |
371 | 704 | bool equals(const llvh::ArrayRef<T> &other) const { |
372 | 704 | if (isASCII()) { |
373 | 704 | return stringRefEquals(ASCIIRef(castToCharPtr(), length()), other); |
374 | 704 | } |
375 | 0 | return stringRefEquals(UTF16Ref(castToChar16Ptr(), length()), other); |
376 | 704 | } |
bool hermes::vm::StringView::equals<char16_t>(llvh::ArrayRef<char16_t> const&) const
Line | Count | Source |
371 | 966 | bool equals(const llvh::ArrayRef<T> &other) const { |
372 | 966 | if (isASCII()) { |
373 | 703 | return stringRefEquals(ASCIIRef(castToCharPtr(), length()), other); |
374 | 703 | } |
375 | 263 | return stringRefEquals(UTF16Ref(castToChar16Ptr(), length()), other); |
376 | 966 | } |
Unexecuted instantiation: bool hermes::vm::StringView::equals<unsigned char>(llvh::ArrayRef<unsigned char> const&) const |
377 | | |
378 | 36 | TwineChar16 toTwine() const { |
379 | 36 | if (isASCII()) { |
380 | 36 | return TwineChar16(llvh::StringRef(castToCharPtr(), length())); |
381 | 36 | } |
382 | 0 | return TwineChar16(UTF16Ref(castToChar16Ptr(), length())); |
383 | 36 | } |
384 | | |
385 | 36 | operator TwineChar16() const { |
386 | 36 | return toTwine(); |
387 | 36 | } |
388 | | |
389 | | private: |
390 | | /// These constructors should only be called from self or from |
391 | | /// StringPrimitive. |
392 | | |
393 | | // Create a StringView from a StringPrimitive |
394 | | explicit StringView(Handle<StringPrimitive> str) |
395 | 534k | : startIndex_(0), |
396 | 534k | isHandle_(true), |
397 | 534k | isASCII_(str->isASCII()), |
398 | 534k | length_(str->getStringLength()) { |
399 | 534k | new (strPrim_.buffer) Handle<StringPrimitive>(str); |
400 | 534k | } |
401 | | |
402 | | /// Create a StringView from lazy identifier. |
403 | | explicit StringView(ASCIIRef asciiRef) |
404 | 488k | : nonManagedStringPtr_(asciiRef.data()), |
405 | 488k | startIndex_(0), |
406 | 488k | isHandle_(false), |
407 | 488k | isASCII_(true), |
408 | 488k | length_(asciiRef.size()) {} |
409 | | explicit StringView(UTF16Ref utf16Ref) |
410 | 0 | : nonManagedStringPtr_(utf16Ref.data()), |
411 | 0 | startIndex_(0), |
412 | 0 | isHandle_(false), |
413 | 0 | isASCII_(false), |
414 | 0 | length_(utf16Ref.size()) {} |
415 | | |
416 | | /// Helper function for getUTF16Ref and appendUTF16String. |
417 | | UTF16Ref getUTF16Ref( |
418 | | llvh::SmallVectorImpl<char16_t> &allocator, |
419 | | bool alwaysCopy) const; |
420 | | |
421 | 581k | Handle<StringPrimitive> &strPrim() { |
422 | 581k | assert(isHandle_ && "must be a handle"); |
423 | | // Need to go through a variable to placate gcc4.9. |
424 | 581k | char *buffer = strPrim_.buffer; |
425 | 581k | return *reinterpret_cast<Handle<StringPrimitive> *>(buffer); |
426 | 581k | } |
427 | 709k | const Handle<StringPrimitive> &strPrim() const { |
428 | 709k | assert(isHandle_ && "must be a handle"); |
429 | | // Need to go through a variable to placate gcc4.9. |
430 | 709k | const char *buffer = strPrim_.buffer; |
431 | 709k | return *reinterpret_cast<const Handle<StringPrimitive> *>(buffer); |
432 | 709k | } |
433 | | }; |
434 | | |
435 | | llvh::raw_ostream &operator<<(llvh::raw_ostream &os, const StringView &sv); |
436 | | |
437 | | } // namespace vm |
438 | | } // namespace hermes |
439 | | |
440 | | #pragma GCC diagnostic pop |
441 | | #endif // HERMES_VM_STRINGVIEW_H |