Line data Source code
1 : // Copyright 2017 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #ifndef V8_OBJECTS_STRING_H_
6 : #define V8_OBJECTS_STRING_H_
7 :
8 : #include "src/base/bits.h"
9 : #include "src/objects/instance-type.h"
10 : #include "src/objects/name.h"
11 : #include "src/objects/smi.h"
12 : #include "src/unicode-decoder.h"
13 :
14 : // Has to be the last include (doesn't have include guards):
15 : #include "src/objects/object-macros.h"
16 :
17 : namespace v8 {
18 : namespace internal {
19 :
20 : enum InstanceType : uint16_t;  // Opaque forward declaration with a fixed uint16_t underlying type.
21 :
22 : enum AllowNullsFlag { ALLOW_NULLS, DISALLOW_NULLS };  // Argument of String::ToCString(); the defaulted overload there uses DISALLOW_NULLS.
23 : enum RobustnessFlag { ROBUST_STRING_TRAVERSAL, FAST_STRING_TRAVERSAL };  // Argument of String::ToCString(); the comment there describes the ROBUST_STRING_TRAVERSAL mode.
24 :
25 : // The characteristics of a string are stored in its map. Retrieving these
26 : // few bits of information is moderately expensive, involving two memory
27 : // loads where the second is dependent on the first. To improve efficiency
28 : // the shape of the string is given its own class so that it can be retrieved
29 : // once and used for several string operations. A StringShape is small enough
30 : // to be passed by value and is immutable, but be aware that flattening a
31 : // string can potentially alter its shape. Also be aware that a GC caused by
32 : // something else can alter the shape of a string due to ConsString
33 : // shortcutting. Keeping these restrictions in mind has proven to be error-
34 : // prone and so we no longer put StringShapes in variables unless there is a
35 : // concrete performance benefit at that particular point in the code.
36 : class StringShape {
37 : public:
38 : inline explicit StringShape(const String s);  // All three constructors are explicit and only declared here; their bodies are not in this header.
39 : inline explicit StringShape(Map s);
40 : inline explicit StringShape(InstanceType t);
41 : inline bool IsSequential();  // The predicates from here through IsInternalized() are declared only; definitions live elsewhere.
42 : inline bool IsExternal();
43 : inline bool IsCons();
44 : inline bool IsSliced();
45 : inline bool IsThin();
46 : inline bool IsIndirect();
47 : inline bool IsExternalOneByte();
48 : inline bool IsExternalTwoByte();
49 : inline bool IsSequentialOneByte();
50 : inline bool IsSequentialTwoByte();
51 : inline bool IsInternalized();
52 : inline StringRepresentationTag representation_tag();
53 : inline uint32_t encoding_tag();
54 : inline uint32_t full_representation_tag();
55 : #ifdef DEBUG
56 : inline uint32_t type() { return type_; }  // DEBUG only: exposes the raw type_ word.
57 : inline void invalidate() { valid_ = false; }  // DEBUG only: clears the validity flag.
58 : inline bool valid() { return valid_; }  // DEBUG only: reads the validity flag.
59 : #else
60 : inline void invalidate() {}  // Non-DEBUG builds: no-op, because valid_ does not exist there.
61 : #endif
62 :
63 : private:
64 : uint32_t type_;  // Presumably the type bits the predicates test -- confirm against the inline definitions.
65 : #ifdef DEBUG
66 : inline void set_valid() { valid_ = true; }
67 : bool valid_;  // Exists only in DEBUG builds; written by set_valid() and invalidate().
68 : #else
69 1064 : inline void set_valid() {}
70 : #endif
71 : };
72 :
73 : // The String abstract class captures JavaScript string values:
74 : //
75 : // Ecma-262:
76 : // 4.3.16 String Value
77 : // A string value is a member of the type String and is a finite
78 : // ordered sequence of zero or more 16-bit unsigned integer values.
79 : //
80 : // All string values have a length field.
81 : class String : public Name {
82 : public:
83 : enum Encoding { ONE_BYTE_ENCODING, TWO_BYTE_ENCODING };
84 :
85 : // Representation of the flat content of a String.
86 : // A non-flat string doesn't have flat content.
87 : // A flat string has content that's encoded as a sequence of either
88 : // one-byte chars or two-byte UC16.
89 : // Returned by String::GetFlatContent().
90 : class FlatContent {
91 : public:
92 : // Returns true if the string is flat and this structure contains content.
93 : bool IsFlat() const { return state_ != NON_FLAT; }
94 : // Returns true if the structure contains one-byte content.
95 1503228 : bool IsOneByte() const { return state_ == ONE_BYTE; }
96 : // Returns true if the structure contains two-byte content.
97 : bool IsTwoByte() const { return state_ == TWO_BYTE; }
98 :
99 : // Return the one byte content of the string. Only use if IsOneByte()
100 : // returns true.
101 : Vector<const uint8_t> ToOneByteVector() const {
102 : DCHECK_EQ(ONE_BYTE, state_);
103 32325499 : return Vector<const uint8_t>(onebyte_start, length_);
104 : }
105 : // Return the two-byte content of the string. Only use if IsTwoByte()
106 : // returns true.
107 : Vector<const uc16> ToUC16Vector() const {
108 : DCHECK_EQ(TWO_BYTE, state_);
109 6077803 : return Vector<const uc16>(twobyte_start, length_);
110 : }
111 :
112 : uc16 Get(int i) const {  // Returns code unit i, reading through whichever pointer matches state_.
113 : DCHECK(i < length_);  // Only the upper bound is asserted; a negative i is not checked here.
114 : DCHECK(state_ != NON_FLAT);
115 62191680 : if (state_ == ONE_BYTE) return onebyte_start[i];
116 15286027 : return twobyte_start[i];
117 : }
118 :
119 : bool UsesSameString(const FlatContent& other) const {  // Compares start pointers only; length_ and state_ are not compared.
120 : return onebyte_start == other.onebyte_start;
121 : }
122 :
123 : private:
124 : enum State { NON_FLAT, ONE_BYTE, TWO_BYTE };
125 :
126 : // Constructors only used by String::GetFlatContent().
127 : explicit FlatContent(const uint8_t* start, int length)
128 : : onebyte_start(start), length_(length), state_(ONE_BYTE) {}
129 : explicit FlatContent(const uc16* start, int length)
130 : : twobyte_start(start), length_(length), state_(TWO_BYTE) {}
131 : FlatContent() : onebyte_start(nullptr), length_(0), state_(NON_FLAT) {}  // The NON_FLAT value: null start, zero length.
132 :
133 : union {  // Both members alias the same start pointer; state_ records which one each constructor set.
134 : const uint8_t* onebyte_start;
135 : const uc16* twobyte_start;
136 : };
137 : int length_;
138 : State state_;
139 :
140 : friend class String;
141 : friend class IterableSubString;
142 : };
143 :
144 : template <typename Char>
145 : V8_INLINE Vector<const Char> GetCharVector(
146 : const DisallowHeapAllocation& no_gc);
147 :
148 : // Get and set the length of the string.
149 : inline int length() const;
150 : inline void set_length(int value);
151 :
152 : // Get and set the length of the string using acquire loads and release
153 : // stores.
154 : inline int synchronized_length() const;
155 : inline void synchronized_set_length(int value);
156 :
157 : // Returns whether this string has only one-byte chars, i.e. all of them can
158 : // be one-byte encoded. This might be the case even if the string is
159 : // two-byte. Such strings may appear when the embedder prefers
160 : // two-byte external representations even for one-byte data.
161 : inline bool IsOneByteRepresentation() const;
162 : inline bool IsTwoByteRepresentation() const;
163 :
164 : // Cons and slices have an encoding flag that may not represent the actual
165 : // encoding of the underlying string. This is taken into account here.
166 : // This function is static because that helps it get inlined.
167 : // Requires: string.IsFlat()
168 : static inline bool IsOneByteRepresentationUnderneath(String string);
169 :
170 : // Get and set individual two byte chars in the string.
171 : inline void Set(int index, uint16_t value);
172 : // Get individual two byte char in the string. Repeated calls
173 : // to this method are not efficient unless the string is flat.
174 : V8_INLINE uint16_t Get(int index);
175 :
176 : // ES6 section 7.1.3.1 ToNumber Applied to the String Type
177 : static Handle<Object> ToNumber(Isolate* isolate, Handle<String> subject);
178 :
179 : // Flattens the string. Checks first inline to see if it is
180 : // necessary. Does nothing if the string is not a cons string.
181 : // Flattening allocates a sequential string with the same data as
182 : // the given string and mutates the cons string to a degenerate
183 : // form, where the first component is the new sequential string and
184 : // the second component is the empty string. If allocation fails,
185 : // this function returns a failure. If flattening succeeds, this
186 : // function returns the sequential string that is now the first
187 : // component of the cons string.
188 : //
189 : // Degenerate cons strings are handled specially by the garbage
190 : // collector (see IsShortcutCandidate).
191 :
192 : static inline Handle<String> Flatten(
193 : Isolate* isolate, Handle<String> string,
194 : AllocationType allocation = AllocationType::kYoung);
195 :
196 : // Tries to return the content of a flat string as a structure holding either
197 : // a flat vector of char or of uc16.
198 : // If the string isn't flat, and therefore doesn't have flat content, the
199 : // returned structure will report so, and can't provide a vector of either
200 : // kind.
201 : FlatContent GetFlatContent(const DisallowHeapAllocation& no_gc);
202 :
203 : // Returns the parent of a sliced string or first part of a flat cons string.
204 : // Requires: StringShape(this).IsIndirect() && this->IsFlat()
205 : inline String GetUnderlying();
206 :
207 : // String relational comparison, implemented according to ES6 section 7.2.11
208 : // Abstract Relational Comparison (step 5): The comparison of Strings uses a
209 : // simple lexicographic ordering on sequences of code unit values. There is no
210 : // attempt to use the more complex, semantically oriented definitions of
211 : // character or string equality and collating order defined in the Unicode
212 : // specification. Therefore String values that are canonically equal according
213 : // to the Unicode standard could test as unequal. In effect this algorithm
214 : // assumes that both Strings are already in normalized form. Also, note that
215 : // for strings containing supplementary characters, lexicographic ordering on
216 : // sequences of UTF-16 code unit values differs from that on sequences of code
217 : // point values.
218 : V8_WARN_UNUSED_RESULT static ComparisonResult Compare(Isolate* isolate,
219 : Handle<String> x,
220 : Handle<String> y);
221 :
222 : // Perform ES6 21.1.3.8, including checking arguments.
223 : static Object IndexOf(Isolate* isolate, Handle<Object> receiver,
224 : Handle<Object> search, Handle<Object> position);
225 : // Perform string match of pattern on subject, starting at start index.
226 : // Caller must ensure that 0 <= start_index <= sub->length(), as this does not
227 : // check any arguments.
228 : static int IndexOf(Isolate* isolate, Handle<String> receiver,
229 : Handle<String> search, int start_index);
230 :
231 : static Object LastIndexOf(Isolate* isolate, Handle<Object> receiver,
232 : Handle<Object> search, Handle<Object> position);
233 :
234 : // Encapsulates logic related to a match and its capture groups as required
235 : // by GetSubstitution.
236 3894 : class Match {  // Abstract interface: every accessor below is pure virtual.
237 : public:
238 : virtual Handle<String> GetMatch() = 0;
239 : virtual Handle<String> GetPrefix() = 0;
240 : virtual Handle<String> GetSuffix() = 0;
241 :
242 : // A named capture can be invalid (if it is not specified in the pattern),
243 : // unmatched (specified but not matched in the current string), and matched.
244 : enum CaptureState { INVALID, UNMATCHED, MATCHED };
245 :
246 : virtual int CaptureCount() = 0;
247 : virtual bool HasNamedCaptures() = 0;
248 : virtual MaybeHandle<String> GetCapture(int i, bool* capture_exists) = 0;
249 : virtual MaybeHandle<String> GetNamedCapture(Handle<String> name,
250 : CaptureState* state) = 0;
251 :
252 3894 : virtual ~Match() = default;  // Virtual so implementations can be destroyed through a Match*.
253 : };
254 :
255 : // ES#sec-getsubstitution
256 : // GetSubstitution(matched, str, position, captures, replacement)
257 : // Expand the $-expressions in the string and return a new string with
258 : // the result.
259 : // A {start_index} can be passed to specify where to start scanning the
260 : // replacement string.
261 : V8_WARN_UNUSED_RESULT static MaybeHandle<String> GetSubstitution(
262 : Isolate* isolate, Match* match, Handle<String> replacement,
263 : int start_index = 0);
264 :
265 : // String equality operations.
266 : inline bool Equals(String other);
267 : inline static bool Equals(Isolate* isolate, Handle<String> one,
268 : Handle<String> two);
269 : bool IsUtf8EqualTo(Vector<const char> str, bool allow_prefix_match = false);
270 :
271 : // Dispatches to Is{One,Two}ByteEqualTo.
272 : template <typename Char>
273 : bool IsEqualTo(Vector<const Char> str);
274 :
275 : bool IsOneByteEqualTo(Vector<const uint8_t> str);
276 : bool IsTwoByteEqualTo(Vector<const uc16> str);
277 :
278 : // Return a UTF8 representation of the string. The string is null
279 : // terminated but may optionally contain nulls. Length is returned
280 : // in length_output if length_output is not a null pointer. The string
281 : // should be nearly flat, otherwise the performance of this method may
282 : // be very slow (quadratic in the length). Setting robustness_flag to
283 : // ROBUST_STRING_TRAVERSAL invokes behaviour that is robust. This means it
284 : // handles unexpected data without causing assert failures and it does not
285 : // do any heap allocations. This is useful when printing stack traces.
286 : std::unique_ptr<char[]> ToCString(AllowNullsFlag allow_nulls,
287 : RobustnessFlag robustness_flag, int offset,
288 : int length, int* length_output = nullptr);
289 : std::unique_ptr<char[]> ToCString(
290 : AllowNullsFlag allow_nulls = DISALLOW_NULLS,
291 : RobustnessFlag robustness_flag = FAST_STRING_TRAVERSAL,
292 : int* length_output = nullptr);
293 :
294 : bool ComputeArrayIndex(uint32_t* index);
295 :
296 : // Externalization.
297 : bool MakeExternal(v8::String::ExternalStringResource* resource);
298 : bool MakeExternal(v8::String::ExternalOneByteStringResource* resource);
299 : bool SupportsExternalization();
300 :
301 : // Conversion.
302 : inline bool AsArrayIndex(uint32_t* index);
303 : uint32_t inline ToValidIndex(Object number);
304 :
305 : // Trimming.
306 : enum TrimMode { kTrim, kTrimStart, kTrimEnd };
307 : static Handle<String> Trim(Isolate* isolate, Handle<String> string,
308 : TrimMode mode);
309 :
310 : DECL_CAST(String)
311 :
312 : void PrintOn(FILE* out);
313 :
314 : // For use during stack traces. Performs rudimentary sanity check.
315 : bool LooksValid();
316 :
317 : // Dispatched behavior.
318 : void StringShortPrint(StringStream* accumulator, bool show_details = true);
319 : void PrintUC16(std::ostream& os, int start = 0, int end = -1); // NOLINT
320 : #if defined(DEBUG) || defined(OBJECT_PRINT)
321 : char* ToAsciiArray();
322 : #endif
323 : DECL_PRINTER(String)
324 : DECL_VERIFIER(String)
325 :
326 : inline bool IsFlat();
327 :
328 : DEFINE_FIELD_OFFSET_CONSTANTS(Name::kHeaderSize,
329 : TORQUE_GENERATED_STRING_FIELDS)
330 :
331 : static const int kHeaderSize = kSize;  // kSize is not declared in visible code; presumably produced by the macro above -- confirm.
332 :
333 : // Max char codes.
334 : static const int32_t kMaxOneByteCharCode = unibrow::Latin1::kMaxChar;
335 : static const uint32_t kMaxOneByteCharCodeU = unibrow::Latin1::kMaxChar;  // Unsigned twin of the constant above.
336 : static const int kMaxUtf16CodeUnit = 0xffff;
337 : static const uint32_t kMaxUtf16CodeUnitU = kMaxUtf16CodeUnit;
338 : static const uc32 kMaxCodePoint = 0x10ffff;
339 :
340 : // Maximal string length.
341 : // The max length is different on 32 and 64 bit platforms. Max length for a
342 : // 32-bit platform is ~268.4M chars. On 64-bit platforms, max length is
343 : // ~1.073B chars. The limit on 64-bit is so that SeqTwoByteString::kMaxSize
344 : // can fit in a 32bit int: 2^31 - 1 is the max positive int, minus one bit as
345 : // each char needs two bytes, subtract 24 bytes for the string header size.
346 :
347 : // See include/v8.h for the definition.
348 : static const int kMaxLength = v8::String::kMaxLength;
349 : static_assert(kMaxLength <= (Smi::kMaxValue / 2 - kHeaderSize),
350 : "Unexpected max String length");
351 :
352 : // Max length for computing hash. For strings longer than this limit the
353 : // string length is used as the hash value.
354 : static const int kMaxHashCalcLength = 16383;
355 :
356 : // Limit for truncation in short printing.
357 : static const int kMaxShortPrintLength = 1024;
358 :
359 : // Helper function for flattening strings.
360 : template <typename sinkchar>
361 : static void WriteToFlat(String source, sinkchar* sink, int from, int to);
362 :
363 : // The return value may point to the first aligned word containing the first
364 : // non-one-byte character, rather than directly to the non-one-byte character.
365 : // If the return value is >= the passed length, the entire string was
366 : // one-byte.
367 10328879 : static inline int NonAsciiStart(const char* chars, int length) {
368 : const char* start = chars;
369 10328879 : const char* limit = chars + length;
370 :
371 10328879 : if (length >= kIntptrSize) {  // Take the word-at-a-time path only when the input is at least kIntptrSize bytes long.
372 : // Check leading unaligned bytes one at a time until chars is word-aligned.
373 9509328 : while (!IsAligned(reinterpret_cast<intptr_t>(chars), sizeof(uintptr_t))) {  // No limit check in this loop; presumably safe because of the length guard above (assumes kIntptrSize == sizeof(uintptr_t)).
374 3937153 : if (static_cast<uint8_t>(*chars) > unibrow::Utf8::kMaxOneByteChar) {
375 280 : return static_cast<int>(chars - start);
376 : }
377 3936873 : ++chars;
378 : }
379 : // Check aligned words.
380 : DCHECK_EQ(unibrow::Utf8::kMaxOneByteChar, 0x7F);
381 : const uintptr_t non_one_byte_mask = kUintptrAllBitsSet / 0xFF * 0x80;  // 0x80 in every byte of the word, given kUintptrAllBitsSet is all ones.
382 382444372 : while (chars + sizeof(uintptr_t) <= limit) {
383 380810634 : if (*reinterpret_cast<const uintptr_t*>(chars) & non_one_byte_mask) {
384 1564 : return static_cast<int>(chars - start);  // Offset of the word, not of the offending byte inside it.
385 : }
386 : chars += sizeof(uintptr_t);
387 : }
388 : }
389 : // Check remaining unaligned bytes.
390 83062257 : while (chars < limit) {
391 36371167 : if (static_cast<uint8_t>(*chars) > unibrow::Utf8::kMaxOneByteChar) {
392 3556 : return static_cast<int>(chars - start);
393 : }
394 36367611 : ++chars;
395 : }
396 :
397 10323479 : return static_cast<int>(chars - start);  // Reached limit without a hit, so this equals length.
398 : }
399 :
400 : static inline bool IsAscii(const char* chars, int length) {  // True when NonAsciiStart() reports no hit inside [0, length).
401 120409 : return NonAsciiStart(chars, length) >= length;
402 : }
403 :
404 : static inline bool IsAscii(const uint8_t* chars, int length) {  // uint8_t overload; reinterprets the buffer as char and reuses NonAsciiStart().
405 0 : return NonAsciiStart(reinterpret_cast<const char*>(chars), length) >=
406 : length;
407 : }
408 :
409 : static inline int NonOneByteStart(const uc16* chars, int length) {  // Index of the first code unit above kMaxOneByteCharCodeU, or length if there is none.
410 1865057 : const uc16* limit = chars + length;
411 : const uc16* start = chars;
412 1527894094 : while (chars < limit) {
413 1526643568 : if (*chars > kMaxOneByteCharCodeU) return static_cast<int>(chars - start);
414 1526029037 : ++chars;
415 : }
416 1250526 : return static_cast<int>(chars - start);
417 : }
418 :
419 : static inline bool IsOneByte(const uc16* chars, int length) {  // True when no code unit in [0, length) exceeds kMaxOneByteCharCodeU.
420 : return NonOneByteStart(chars, length) >= length;
421 : }
422 :
423 : template <class Visitor>
424 : static inline ConsString VisitFlat(Visitor* visitor, String string,
425 : int offset = 0);
426 :
427 : static Handle<FixedArray> CalculateLineEnds(Isolate* isolate,
428 : Handle<String> string,
429 : bool include_ending_line);
430 :
431 : private:
432 : friend class Name;
433 : friend class StringTableInsertionKey;
434 : friend class InternalizedStringKey;
435 :
436 : static Handle<String> SlowFlatten(Isolate* isolate, Handle<ConsString> cons,
437 : AllocationType allocation);
438 :
439 : // Slow case of String::Equals. This implementation works on any strings
440 : // but it is most efficient on strings that are almost flat.
441 : bool SlowEquals(String other);
442 :
443 : static bool SlowEquals(Isolate* isolate, Handle<String> one,
444 : Handle<String> two);
445 :
446 : // Slow case of AsArrayIndex.
447 : V8_EXPORT_PRIVATE bool SlowAsArrayIndex(uint32_t* index);
448 :
449 : // Compute and set the hash code.
450 : uint32_t ComputeAndSetHash();
451 :
452 14772841 : OBJECT_CONSTRUCTORS(String, Name);
453 : };
454 :
455 : class SubStringRange {  // Exposes begin()/end() iterators over a String; the iterator class is only forward-declared here.
456 : public:
457 : inline SubStringRange(String string, const DisallowHeapAllocation& no_gc,
458 : int first = 0, int length = -1);  // Meaning of the default length of -1 is not visible here -- confirm in the inline definition.
459 : class iterator;
460 : inline iterator begin();
461 : inline iterator end();
462 :
463 : private:
464 : String string_;
465 : int first_;
466 : int length_;
467 : const DisallowHeapAllocation& no_gc_;  // Held by reference; presumably bound to the constructor argument, which must then outlive this object.
468 : };
469 :
470 : // The SeqString abstract class captures sequential string values.
471 : class SeqString : public String {
472 : public:
473 : DECL_CAST(SeqString)
474 :
475 : // Truncate the string in-place if possible and return the result.
476 : // In case of new_length == 0, the empty string is returned without
477 : // truncating the original string.
478 : V8_WARN_UNUSED_RESULT static Handle<String> Truncate(Handle<SeqString> string,
479 : int new_length);  // Callers must use the returned handle, which is not always the input string (see the new_length == 0 case above).
480 :
481 : OBJECT_CONSTRUCTORS(SeqString, String);
482 : };
483 :
484 : class InternalizedString : public String {  // Adds no members of its own beyond the cast and constructor macros below.
485 : public:
486 : DECL_CAST(InternalizedString)
487 : // TODO(neis): Possibly move some stuff from String here.
488 :
489 : OBJECT_CONSTRUCTORS(InternalizedString, String);
490 : };
491 :
492 : // The SeqOneByteString class captures sequential one-byte string objects.
493 : // Each character in a SeqOneByteString is a one-byte character.
494 : class SeqOneByteString : public SeqString {
495 : public:
496 : static const bool kHasOneByteEncoding = true;
497 :
498 : // Dispatched behavior.
499 : inline uint16_t SeqOneByteStringGet(int index);
500 : inline void SeqOneByteStringSet(int index, uint16_t value);
501 :
502 : // Get the address of the characters in this string.
503 : inline Address GetCharsAddress();
504 :
505 : inline uint8_t* GetChars(const DisallowHeapAllocation& no_gc);  // Takes a DisallowHeapAllocation witness from the caller.
506 :
507 : // Clear uninitialized padding space. This ensures that the snapshot content
508 : // is deterministic.
509 : void clear_padding();
510 :
511 : DECL_CAST(SeqOneByteString)
512 :
513 : // Garbage collection support. This method is called by the
514 : // garbage collector to compute the actual size of a SeqOneByteString
515 : // instance.
516 : inline int SeqOneByteStringSize(InstanceType instance_type);
517 :
518 : // Computes the size for a SeqOneByteString instance of a given length.
519 784 : static int SizeFor(int length) {
520 377922406 : return OBJECT_POINTER_ALIGN(kHeaderSize + length * kCharSize);  // Header plus length * kCharSize bytes, passed through OBJECT_POINTER_ALIGN.
521 : }
522 :
523 : // Maximal memory usage for a single sequential one-byte string.
524 : static const int kMaxCharsSize = kMaxLength;
525 : static const int kMaxSize = OBJECT_POINTER_ALIGN(kMaxCharsSize + kHeaderSize);
526 : STATIC_ASSERT((kMaxSize - kHeaderSize) >= String::kMaxLength);  // The payload area of a maximal object must hold kMaxLength characters.
527 :
528 : class BodyDescriptor;
529 :
530 0 : OBJECT_CONSTRUCTORS(SeqOneByteString, SeqString);
531 : };
532 :
533 : // The SeqTwoByteString class captures sequential two-byte string objects.
534 : // Each character in a SeqTwoByteString is a two-byte uint16_t.
535 : class SeqTwoByteString : public SeqString {
536 : public:
537 : static const bool kHasOneByteEncoding = false;
538 :
539 : // Dispatched behavior.
540 : inline uint16_t SeqTwoByteStringGet(int index);
541 : inline void SeqTwoByteStringSet(int index, uint16_t value);
542 :
543 : // Get the address of the characters in this string.
544 : inline Address GetCharsAddress();
545 :
546 : inline uc16* GetChars(const DisallowHeapAllocation& no_gc);  // Takes a DisallowHeapAllocation witness from the caller.
547 :
548 : // Clear uninitialized padding space. This ensures that the snapshot content
549 : // is deterministic.
550 : void clear_padding();
551 :
552 : DECL_CAST(SeqTwoByteString)
553 :
554 : // Garbage collection support. This method is called by the
555 : // garbage collector to compute the actual size of a SeqTwoByteString
556 : // instance.
557 : inline int SeqTwoByteStringSize(InstanceType instance_type);
558 :
559 : // Computes the size for a SeqTwoByteString instance of a given length.
560 896 : static int SizeFor(int length) {
561 138304476 : return OBJECT_POINTER_ALIGN(kHeaderSize + length * kShortSize);  // Header plus length * kShortSize bytes, passed through OBJECT_POINTER_ALIGN.
562 : }
563 :
564 : // Maximal memory usage for a single sequential two-byte string.
565 : static const int kMaxCharsSize = kMaxLength * 2;
566 : static const int kMaxSize = OBJECT_POINTER_ALIGN(kMaxCharsSize + kHeaderSize);
567 : STATIC_ASSERT(static_cast<int>((kMaxSize - kHeaderSize) / sizeof(uint16_t)) >=
568 : String::kMaxLength);  // The payload area of a maximal object must hold kMaxLength uint16_t units.
569 :
570 : class BodyDescriptor;
571 :
572 0 : OBJECT_CONSTRUCTORS(SeqTwoByteString, SeqString);
573 : };
574 :
575 : // The ConsString class describes string values built by using the
576 : // addition operator on strings. A ConsString is a pair where the
577 : // first and second components are pointers to other string values.
578 : // One or both components of a ConsString can be pointers to other
579 : // ConsStrings, creating a binary tree of ConsStrings where the leaves
580 : // are non-ConsString string values. The string value represented by
581 : // a ConsString can be obtained by concatenating the leaf string
582 : // values in a left-to-right depth-first traversal of the tree.
583 : class ConsString : public String {
584 : public:
585 : // First string of the cons cell.
586 : inline String first();
587 : // Doesn't check that the result is a string, even in debug mode. This is
588 : // useful during GC where the mark bits confuse the checks.
589 : inline Object unchecked_first();
590 : inline void set_first(Isolate* isolate, String first,
591 : WriteBarrierMode mode = UPDATE_WRITE_BARRIER);  // The write-barrier mode defaults to UPDATE_WRITE_BARRIER.
592 :
593 : // Second string of the cons cell.
594 : inline String second();
595 : // Doesn't check that the result is a string, even in debug mode. This is
596 : // useful during GC where the mark bits confuse the checks.
597 : inline Object unchecked_second();
598 : inline void set_second(Isolate* isolate, String second,
599 : WriteBarrierMode mode = UPDATE_WRITE_BARRIER);  // The write-barrier mode defaults to UPDATE_WRITE_BARRIER.
600 :
601 : // Dispatched behavior.
602 : V8_EXPORT_PRIVATE uint16_t ConsStringGet(int index);
603 :
604 : DECL_CAST(ConsString)
605 :
606 : DEFINE_FIELD_OFFSET_CONSTANTS(String::kHeaderSize,
607 : TORQUE_GENERATED_CONS_STRING_FIELDS)
608 :
609 : // Minimum length for a cons string.
610 : static const int kMinLength = 13;
611 :
612 : typedef FixedBodyDescriptor<kFirstOffset, kSize, kSize> BodyDescriptor;  // kFirstOffset and kSize are presumably produced by the field-offset macro above -- confirm.
613 :
614 : DECL_VERIFIER(ConsString)
615 :
616 0 : OBJECT_CONSTRUCTORS(ConsString, String);
617 : };
618 :
619 : // The ThinString class describes string objects that are just references
620 : // to another string object. They are used for in-place internalization when
621 : // the original string cannot actually be internalized in-place: in these
622 : // cases, the original string is converted to a ThinString pointing at its
623 : // internalized version (which is allocated as a new object).
624 : // In terms of memory layout and most algorithms operating on strings,
625 : // ThinStrings can be thought of as "one-part cons strings".
626 : class ThinString : public String {
627 : public:
628 : // Actual string that this ThinString refers to.
629 : inline String actual() const;
630 : inline HeapObject unchecked_actual() const;  // Same slot typed as HeapObject; presumably skips the String type check -- confirm in the inline definition.
631 : inline void set_actual(String s,
632 : WriteBarrierMode mode = UPDATE_WRITE_BARRIER);
633 :
634 : V8_EXPORT_PRIVATE uint16_t ThinStringGet(int index);
635 :
636 : DECL_CAST(ThinString)
637 : DECL_VERIFIER(ThinString)
638 :
639 : DEFINE_FIELD_OFFSET_CONSTANTS(String::kHeaderSize,
640 : TORQUE_GENERATED_THIN_STRING_FIELDS)
641 :
642 : typedef FixedBodyDescriptor<kActualOffset, kSize, kSize> BodyDescriptor;  // kActualOffset and kSize are presumably produced by the field-offset macro above -- confirm.
643 :
644 0 : OBJECT_CONSTRUCTORS(ThinString, String);
645 : };
646 :
647 : // The SlicedString class describes strings that are substrings of another
648 : // sequential string. The motivation is to save time and memory when creating
649 : // a substring. A SlicedString is described as a pointer to the parent,
650 : // the offset from the start of the parent string and the length. Using
651 : // a SlicedString therefore requires unpacking of the parent string and
652 : // adding the offset to the start address. Substrings of a SlicedString
653 : // are not nested, since the double indirection is simplified when creating
654 : // such a substring.
655 : // Currently missing features are:
656 : // - handling externalized parent strings
657 : // - external strings as parent
658 : // - truncating sliced string to enable otherwise unneeded parent to be GC'ed.
659 : class SlicedString : public String {
660 : public:
661 : inline String parent();
662 : inline void set_parent(Isolate* isolate, String parent,
663 : WriteBarrierMode mode = UPDATE_WRITE_BARRIER);
664 : inline int offset() const;
665 : inline void set_offset(int offset);
666 :
667 : // Dispatched behavior.
668 : V8_EXPORT_PRIVATE uint16_t SlicedStringGet(int index);
669 :
670 : DECL_CAST(SlicedString)
671 :
672 : DEFINE_FIELD_OFFSET_CONSTANTS(String::kHeaderSize,
673 : TORQUE_GENERATED_SLICED_STRING_FIELDS)
674 :
675 : // Minimum length for a sliced string.
676 : static const int kMinLength = 13;
677 :
678 : typedef FixedBodyDescriptor<kParentOffset, kSize, kSize> BodyDescriptor;  // kParentOffset and kSize are presumably produced by the field-offset macro above -- confirm.
679 :
680 : DECL_VERIFIER(SlicedString)
681 :
682 0 : OBJECT_CONSTRUCTORS(SlicedString, String);
683 : };
684 :
685 : // The ExternalString class describes string values that are backed by
686 : // a string resource that lies outside the V8 heap. ExternalStrings
687 : // consist of the length field common to all strings and a pointer to the
688 : // external resource. It is important to ensure (externally) that the
689 : // resource is not deallocated while the ExternalString is live in the
690 : // V8 heap.
691 : //
692 : // The API expects that all ExternalStrings are created through the
693 : // API. Therefore, ExternalStrings should not be used internally.
694 : class ExternalString : public String {
695 : public:
696 : DECL_CAST(ExternalString)
697 :
698 : DEFINE_FIELD_OFFSET_CONSTANTS(String::kHeaderSize,
699 : TORQUE_GENERATED_EXTERNAL_STRING_FIELDS)
700 :
701 : // Size of uncached external strings.
702 : static const int kUncachedSize =
703 : kResourceOffset + FIELD_SIZE(kResourceOffset);  // Presumably the offset just past the resource field -- confirm the FIELD_SIZE semantics.
704 :
705 : // Return whether the external string data pointer is not cached.
706 : inline bool is_uncached() const;
707 : // Size in bytes of the external payload.
708 : int ExternalPayloadSize() const;
709 :
710 : // Used in the serializer/deserializer.
711 : inline Address resource_as_address();
712 : inline void set_address_as_resource(Address address);
713 : inline uint32_t resource_as_uint32();
714 : inline void set_uint32_as_resource(uint32_t value);
715 :
716 : // Disposes string's resource object if it has not already been disposed.
717 : inline void DisposeResource();
718 :
719 : STATIC_ASSERT(kResourceOffset == Internals::kStringResourceOffset);  // Compile-time check that this layout matches the offset constant in Internals.
720 :
721 : OBJECT_CONSTRUCTORS(ExternalString, String);
722 : };
723 :
724 : // The ExternalOneByteString class is an external string backed by a
725 : // one-byte string.
726 : class ExternalOneByteString : public ExternalString {
727 : public:
728 : static const bool kHasOneByteEncoding = true;
729 :
730 : typedef v8::String::ExternalOneByteStringResource Resource;
731 :
732 : // The underlying resource.
733 : inline const Resource* resource();
734 :
735 : // It is assumed that the previous resource is null. If it is not null, then
736 : // it is the responsibility of the caller to handle the previous resource.
737 : inline void SetResource(Isolate* isolate, const Resource* buffer);
738 : // Used only during serialization.
739 : inline void set_resource(const Resource* buffer);
740 :
741 : // Update the pointer cache to the external character array.
742 : // The cached pointer is always valid, as the external character array does
743 : // not move during lifetime. Deserialization is the only exception, after
744 : // which the pointer cache has to be refreshed.
745 : inline void update_data_cache();
746 :
747 : inline const uint8_t* GetChars();  // Unlike the sequential string classes' GetChars(), this takes no DisallowHeapAllocation argument.
748 :
749 : // Dispatched behavior.
750 : inline uint16_t ExternalOneByteStringGet(int index);
751 :
752 : DECL_CAST(ExternalOneByteString)
753 :
754 : class BodyDescriptor;
755 :
756 0 : OBJECT_CONSTRUCTORS(ExternalOneByteString, ExternalString);
757 : };
758 :
759 : // The ExternalTwoByteString class is an external string backed by a UTF-16
760 : // encoded string.
761 : class ExternalTwoByteString : public ExternalString {
762 : public:
763 : static const bool kHasOneByteEncoding = false;
764 :
765 : typedef v8::String::ExternalStringResource Resource;
766 :
767 : // The underlying string resource.
768 : inline const Resource* resource();
769 :
770 : // It is assumed that the previous resource is null. If it is not null, then
771 : // it is the responsibility of the caller to handle the previous resource.
772 : inline void SetResource(Isolate* isolate, const Resource* buffer);
773 : // Used only during serialization.
774 : inline void set_resource(const Resource* buffer);
775 :
776 : // Update the pointer cache to the external character array.
777 : // The cached pointer is always valid, as the external character array does
778 : // not move during lifetime. Deserialization is the only exception, after
779 : // which the pointer cache has to be refreshed.
780 : inline void update_data_cache();
781 :
782 : inline const uint16_t* GetChars();  // Unlike the sequential string classes' GetChars(), this takes no DisallowHeapAllocation argument.
783 :
784 : // Dispatched behavior.
785 : inline uint16_t ExternalTwoByteStringGet(int index);
786 :
787 : // For regexp code.
788 : inline const uint16_t* ExternalTwoByteStringGetData(unsigned start);
789 :
790 : DECL_CAST(ExternalTwoByteString)
791 :
792 : class BodyDescriptor;
793 :
794 0 : OBJECT_CONSTRUCTORS(ExternalTwoByteString, ExternalString);
795 : };
796 :
797 : // A flat string reader provides random access to the contents of a
798 : // string independent of the character width of the string. The handle
799 : // must be valid as long as the reader is being used.
800 3007358 : class FlatStringReader : public Relocatable {
801 : public:
802 : FlatStringReader(Isolate* isolate, Handle<String> str);
803 : FlatStringReader(Isolate* isolate, Vector<const char> input);  // Takes a char vector instead of a String handle.
804 : void PostGarbageCollection() override;  // NOTE(review): presumably refreshes start_ after a GC -- confirm.
805 : inline uc32 Get(int index);
806 : template <typename Char>
807 : inline Char Get(int index);  // Variant of Get() whose result type is chosen by the caller.
808 : int length() { return length_; }
809 :
810 : private:
811 : Address* str_;  // NOTE(review): presumably the location of the String handle -- confirm.
812 : bool is_one_byte_;  // NOTE(review): presumably selects how start_ is interpreted -- confirm.
813 : int length_;
814 : const void* start_;  // Untyped pointer. NOTE(review): presumably the first character -- confirm.
815 : };
816 :
817 : // This maintains an off-stack representation of the stack frames required
818 : // to traverse a ConsString, allowing an entirely iterative and restartable
819 : // traversal of the entire string.
820 : class ConsStringIterator {
821 : public:
822 6430331 : inline ConsStringIterator() = default;  // NOTE(review): the int members have no initializers; callers presumably Reset() before Next() -- confirm.
823 66924 : inline explicit ConsStringIterator(ConsString cons_string, int offset = 0) {
824 : Reset(cons_string, offset);
825 66924 : }
826 : inline void Reset(ConsString cons_string, int offset = 0) {
827 11784827 : depth_ = 0;
828 : // With depth_ left at 0, Next() will always return a default-constructed String.
829 11749160 : if (cons_string.is_null()) return;
830 108627 : Initialize(cons_string, offset);
831 : }
832 : // Returns a default-constructed String() when complete (depth_ == 0).
833 : inline String Next(int* offset_out) {
834 71466319 : *offset_out = 0;
835 71466319 : if (depth_ == 0) return String();
836 60940358 : return Continue(offset_out);
837 : }
838 :
839 : private:
840 : static const int kStackSize = 32;
841 : // Use a mask instead of doing modulo operations for stack wrapping.
842 : static const int kDepthMask = kStackSize - 1;
843 : static_assert(base::bits::IsPowerOfTwo(kStackSize),
844 : "kStackSize must be power of two");
845 : static inline int OffsetForDepth(int depth);
846 :
847 : inline void PushLeft(ConsString string);
848 : inline void PushRight(ConsString string);
849 : inline void AdjustMaximumDepth();
850 : inline void Pop();
851 122115196 : inline bool StackBlown() { return maximum_depth_ - depth_ == kStackSize; }  // True when maximum_depth_ is exactly kStackSize ahead of depth_.
852 : void Initialize(ConsString cons_string, int offset);
853 : String Continue(int* offset_out);
854 : String NextLeaf(bool* blew_stack);
855 : String Search(int* offset_out);
856 :
857 : // Stack must always contain only frames for which right traversal
858 : // has not yet been performed.
859 : ConsString frames_[kStackSize];
860 : ConsString root_;
861 : int depth_;  // Reset() sets this to 0; Next() treats 0 as "nothing left".
862 : int maximum_depth_;
863 : int consumed_;
864 : DISALLOW_COPY_AND_ASSIGN(ConsStringIterator);
865 : };
866 :
867 : class StringCharacterStream {  // Sequential reader over a String: GetNext() yields uint16_t values, HasMore() tests for more.
868 : public:
869 : inline explicit StringCharacterStream(String string, int offset = 0);
870 : inline uint16_t GetNext();
871 : inline bool HasMore();
872 : inline void Reset(String string, int offset = 0);  // Same parameters as the constructor.
873 : inline void VisitOneByteString(const uint8_t* chars, int length);  // NOTE(review): presumably a visitor callback for one-byte content -- confirm.
874 : inline void VisitTwoByteString(const uint16_t* chars, int length);  // NOTE(review): presumably a visitor callback for two-byte content -- confirm.
875 :
876 : private:
877 : ConsStringIterator iter_;
878 : bool is_one_byte_;
879 : union {  // Both pointers share storage. NOTE(review): presumably is_one_byte_ selects the active one -- confirm.
880 : const uint8_t* buffer8_;
881 : const uint16_t* buffer16_;
882 : };
883 : const uint8_t* end_;  // NOTE(review): presumably the end of the current buffer, held as a byte pointer -- confirm.
884 : DISALLOW_COPY_AND_ASSIGN(StringCharacterStream);
885 : };
886 :
887 : } // namespace internal
888 : } // namespace v8
889 :
890 : #include "src/objects/object-macros-undef.h"
891 :
892 : #endif // V8_OBJECTS_STRING_H_
|