Line data Source code
1 : // Copyright 2017 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #ifndef V8_OBJECTS_STRING_H_
6 : #define V8_OBJECTS_STRING_H_
7 :
8 : #include "src/base/bits.h"
9 : #include "src/objects/instance-type.h"
10 : #include "src/objects/name.h"
11 : #include "src/objects/smi.h"
12 : #include "src/unicode-decoder.h"
13 :
14 : // Has to be the last include (doesn't have include guards):
15 : #include "src/objects/object-macros.h"
16 :
17 : namespace v8 {
18 : namespace internal {
19 :
20 : enum InstanceType : uint16_t;
21 :
22 : enum AllowNullsFlag { ALLOW_NULLS, DISALLOW_NULLS };
23 : enum RobustnessFlag { ROBUST_STRING_TRAVERSAL, FAST_STRING_TRAVERSAL };
24 :
25 : // The characteristics of a string are stored in its map. Retrieving these
26 : // few bits of information is moderately expensive, involving two memory
27 : // loads where the second is dependent on the first. To improve efficiency
28 : // the shape of the string is given its own class so that it can be retrieved
29 : // once and used for several string operations. A StringShape is small enough
30 : // to be passed by value and is immutable, but be aware that flattening a
31 : // string can potentially alter its shape. Also be aware that a GC caused by
32 : // something else can alter the shape of a string due to ConsString
33 : // shortcutting. Keeping these restrictions in mind has proven to be error-
34 : // prone and so we no longer put StringShapes in variables unless there is a
35 : // concrete performance benefit at that particular point in the code.
36 : class StringShape {
37 : public:
38 : inline explicit StringShape(const String s);  // Shape of a given string.
39 : inline explicit StringShape(Map s);  // Shape built from a map.
40 : inline explicit StringShape(InstanceType t);  // Shape built from an instance type.
41 : inline bool IsSequential();
42 : inline bool IsExternal();
43 : inline bool IsCons();
44 : inline bool IsSliced();
45 : inline bool IsThin();
46 : inline bool IsIndirect();
47 : inline bool IsExternalOneByte();
48 : inline bool IsExternalTwoByte();
49 : inline bool IsSequentialOneByte();
50 : inline bool IsSequentialTwoByte();
51 : inline bool IsInternalized();
52 : inline StringRepresentationTag representation_tag();
53 : inline uint32_t encoding_tag();
54 : inline uint32_t full_representation_tag();
55 : inline bool HasOnlyOneByteChars();
56 : #ifdef DEBUG
57 : inline uint32_t type() { return type_; }  // DEBUG only: raw type_ value.
58 : inline void invalidate() { valid_ = false; }  // DEBUG only: clears valid_.
59 : inline bool valid() { return valid_; }  // DEBUG only: reads valid_.
60 : #else
61 : inline void invalidate() {}  // No-op; keeps invalidate() callable in both build modes.
62 : #endif
63 :
64 : private:
65 : uint32_t type_;  // Value returned by type() in DEBUG builds.
66 : #ifdef DEBUG
67 : inline void set_valid() { valid_ = true; }
68 : bool valid_;  // DEBUG-only flag, set by set_valid() and cleared by invalidate().
69 : #else
70 : inline void set_valid() {}  // No-op outside DEBUG builds (there is no valid_ member).
71 : #endif
72 : };
73 :
74 : // The String abstract class captures JavaScript string values:
75 : //
76 : // Ecma-262:
77 : // 4.3.16 String Value
78 : // A string value is a member of the type String and is a finite
79 : // ordered sequence of zero or more 16-bit unsigned integer values.
80 : //
81 : // All string values have a length field.
82 : class String : public Name {
83 : public:
84 : enum Encoding { ONE_BYTE_ENCODING, TWO_BYTE_ENCODING };
85 :
86 : // Representation of the flat content of a String.
87 : // A non-flat string doesn't have flat content.
88 : // A flat string has content that's encoded as a sequence of either
89 : // one-byte chars or two-byte UC16.
90 : // Returned by String::GetFlatContent().
91 : class FlatContent {
92 : public:
93 : // Returns true if the string is flat and this structure contains content.
94 : bool IsFlat() const { return state_ != NON_FLAT; }
95 : // Returns true if the structure contains one-byte content.
96 1606530 : bool IsOneByte() const { return state_ == ONE_BYTE; }
97 : // Returns true if the structure contains two-byte content.
98 : bool IsTwoByte() const { return state_ == TWO_BYTE; }
99 :
100 : // Return the one-byte content of the string. Only use if IsOneByte()
101 : // returns true (DCHECKed).
102 1440376 : Vector<const uint8_t> ToOneByteVector() const {
103 : DCHECK_EQ(ONE_BYTE, state_);
104 30697501 : return Vector<const uint8_t>(onebyte_start, length_);
105 : }
106 : // Return the two-byte content of the string. Only use if IsTwoByte()
107 : // returns true (DCHECKed).
108 860966 : Vector<const uc16> ToUC16Vector() const {
109 : DCHECK_EQ(TWO_BYTE, state_);
110 3241197 : return Vector<const uc16>(twobyte_start, length_);
111 : }
112 :
113 : uc16 Get(int i) const {  // Code unit at index i; requires flat content and i < length_ (both DCHECKed).
114 : DCHECK(i < length_);
115 : DCHECK(state_ != NON_FLAT);
116 62283282 : if (state_ == ONE_BYTE) return onebyte_start[i];
117 15287507 : return twobyte_start[i];
118 : }
119 :
120 : bool UsesSameString(const FlatContent& other) const {  // Same start address; lengths and states are not compared.
121 : return onebyte_start == other.onebyte_start;
122 : }
123 :
124 : private:
125 : enum State { NON_FLAT, ONE_BYTE, TWO_BYTE };
126 :
127 : // Constructors only used by String::GetFlatContent(). Each one sets state_
128 : explicit FlatContent(const uint8_t* start, int length)  // to match the pointer it stores.
129 : : onebyte_start(start), length_(length), state_(ONE_BYTE) {}
130 : explicit FlatContent(const uc16* start, int length)
131 : : twobyte_start(start), length_(length), state_(TWO_BYTE) {}
132 : FlatContent() : onebyte_start(nullptr), length_(0), state_(NON_FLAT) {}  // Non-flat: null start, zero length.
133 :
134 : union {  // Start of the characters; state_ records which member the constructor initialized.
135 : const uint8_t* onebyte_start;
136 : const uc16* twobyte_start;
137 : };
138 : int length_;  // Number of characters, as passed to the constructor (0 when non-flat).
139 : State state_;
140 :
141 : friend class String;
142 : friend class IterableSubString;
143 : };
144 :
145 : template <typename Char>
146 : V8_INLINE Vector<const Char> GetCharVector(
147 : const DisallowHeapAllocation& no_gc);
148 :
149 : // Get and set the length of the string.
150 : inline int length() const;
151 : inline void set_length(int value);
152 :
153 : // Get and set the length of the string using acquire loads and release
154 : // stores.
155 : inline int synchronized_length() const;
156 : inline void synchronized_set_length(int value);
157 :
158 : // Returns whether this string has only one-byte chars, i.e. all of them can
159 : // be one-byte encoded. This might be the case even if the string is
160 : // two-byte. Such strings may appear when the embedder prefers
161 : // two-byte external representations even for one-byte data.
162 : inline bool IsOneByteRepresentation() const;
163 : inline bool IsTwoByteRepresentation() const;
164 :
165 : // Cons and slices have an encoding flag that may not represent the actual
166 : // encoding of the underlying string. This is taken into account here.
167 : // This function is static because that helps it get inlined.
168 : // Requires: string.IsFlat()
169 : static inline bool IsOneByteRepresentationUnderneath(String string);
170 :
171 : // NOTE: this should be considered only a hint. False negatives are
172 : // possible.
173 : inline bool HasOnlyOneByteChars();
174 :
175 : // Get and set individual two byte chars in the string.
176 : inline void Set(int index, uint16_t value);
177 : // Get individual two byte char in the string. Repeated calls
178 : // to this method are not efficient unless the string is flat.
179 : V8_INLINE uint16_t Get(int index);
180 :
181 : // ES6 section 7.1.3.1 ToNumber Applied to the String Type
182 : static Handle<Object> ToNumber(Isolate* isolate, Handle<String> subject);
183 :
184 : // Flattens the string. Checks first inline to see if it is
185 : // necessary. Does nothing if the string is not a cons string.
186 : // Flattening allocates a sequential string with the same data as
187 : // the given string and mutates the cons string to a degenerate
188 : // form, where the first component is the new sequential string and
189 : // the second component is the empty string. If allocation fails,
190 : // this function returns a failure. If flattening succeeds, this
191 : // function returns the sequential string that is now the first
192 : // component of the cons string.
193 : //
194 : // Degenerate cons strings are handled specially by the garbage
195 : // collector (see IsShortcutCandidate).
196 :
197 : static inline Handle<String> Flatten(Isolate* isolate, Handle<String> string,
198 : PretenureFlag pretenure = NOT_TENURED);
199 :
200 : // Tries to return the content of a flat string as a structure holding either
201 : // a flat vector of char or of uc16.
202 : // If the string isn't flat, and therefore doesn't have flat content, the
203 : // returned structure will report so, and can't provide a vector of either
204 : // kind.
205 : FlatContent GetFlatContent(const DisallowHeapAllocation& no_gc);
206 :
207 : // Returns the parent of a sliced string or first part of a flat cons string.
208 : // Requires: StringShape(this).IsIndirect() && this->IsFlat()
209 : inline String GetUnderlying();
210 :
211 : // String relational comparison, implemented according to ES6 section 7.2.11
212 : // Abstract Relational Comparison (step 5): The comparison of Strings uses a
213 : // simple lexicographic ordering on sequences of code unit values. There is no
214 : // attempt to use the more complex, semantically oriented definitions of
215 : // character or string equality and collating order defined in the Unicode
216 : // specification. Therefore String values that are canonically equal according
217 : // to the Unicode standard could test as unequal. In effect this algorithm
218 : // assumes that both Strings are already in normalized form. Also, note that
219 : // for strings containing supplementary characters, lexicographic ordering on
220 : // sequences of UTF-16 code unit values differs from that on sequences of code
221 : // point values.
222 : V8_WARN_UNUSED_RESULT static ComparisonResult Compare(Isolate* isolate,
223 : Handle<String> x,
224 : Handle<String> y);
225 :
226 : // Perform ES6 21.1.3.8, including checking arguments.
227 : static Object IndexOf(Isolate* isolate, Handle<Object> receiver,
228 : Handle<Object> search, Handle<Object> position);
229 : // Perform string match of pattern on subject, starting at start index.
230 : // Caller must ensure that 0 <= start_index <= sub->length(), as this does not
231 : // check any arguments.
232 : static int IndexOf(Isolate* isolate, Handle<String> receiver,
233 : Handle<String> search, int start_index);
234 :
235 : static Object LastIndexOf(Isolate* isolate, Handle<Object> receiver,
236 : Handle<Object> search, Handle<Object> position);
237 :
238 : // Encapsulates logic related to a match and its capture groups as required
239 : // by GetSubstitution.
240 3894 : class Match {
241 : public:
242 : virtual Handle<String> GetMatch() = 0;
243 : virtual Handle<String> GetPrefix() = 0;
244 : virtual Handle<String> GetSuffix() = 0;
245 :
246 : // A named capture can be invalid (if it is not specified in the pattern),
247 : // unmatched (specified but not matched in the current string), or matched.
248 : enum CaptureState { INVALID, UNMATCHED, MATCHED };
249 :
250 : virtual int CaptureCount() = 0;
251 : virtual bool HasNamedCaptures() = 0;
252 : virtual MaybeHandle<String> GetCapture(int i, bool* capture_exists) = 0;  // NOTE(review): capture_exists looks like an out-parameter - confirm in implementers.
253 : virtual MaybeHandle<String> GetNamedCapture(Handle<String> name,
254 : CaptureState* state) = 0;
255 :
256 3894 : virtual ~Match() = default;  // Virtual so implementations can be destroyed through a Match*.
257 : };
258 :
259 : // ES#sec-getsubstitution
260 : // GetSubstitution(matched, str, position, captures, replacement)
261 : // Expand the $-expressions in the string and return a new string with
262 : // the result.
263 : // A {start_index} can be passed to specify where to start scanning the
264 : // replacement string.
265 : V8_WARN_UNUSED_RESULT static MaybeHandle<String> GetSubstitution(
266 : Isolate* isolate, Match* match, Handle<String> replacement,
267 : int start_index = 0);
268 :
269 : // String equality operations.
270 : inline bool Equals(String other);
271 : inline static bool Equals(Isolate* isolate, Handle<String> one,
272 : Handle<String> two);
273 : bool IsUtf8EqualTo(Vector<const char> str, bool allow_prefix_match = false);
274 :
275 : // Dispatches to Is{One,Two}ByteEqualTo.
276 : template <typename Char>
277 : bool IsEqualTo(Vector<const Char> str);
278 :
279 : bool IsOneByteEqualTo(Vector<const uint8_t> str);
280 : bool IsTwoByteEqualTo(Vector<const uc16> str);
281 :
282 : // Return a UTF8 representation of the string. The string is null
283 : // terminated but may optionally contain nulls. Length is returned
284 : // in length_output if length_output is not a null pointer. The string
285 : // should be nearly flat, otherwise the performance of this method may
286 : // be very slow (quadratic in the length). Setting robustness_flag to
287 : // ROBUST_STRING_TRAVERSAL invokes behaviour that is robust. This means it
288 : // handles unexpected data without causing assert failures and it does not
289 : // do any heap allocations. This is useful when printing stack traces.
290 : std::unique_ptr<char[]> ToCString(AllowNullsFlag allow_nulls,
291 : RobustnessFlag robustness_flag, int offset,
292 : int length, int* length_output = nullptr);
293 : std::unique_ptr<char[]> ToCString(
294 : AllowNullsFlag allow_nulls = DISALLOW_NULLS,
295 : RobustnessFlag robustness_flag = FAST_STRING_TRAVERSAL,
296 : int* length_output = nullptr);
297 :
298 : bool ComputeArrayIndex(uint32_t* index);
299 :
300 : // Externalization.
301 : bool MakeExternal(v8::String::ExternalStringResource* resource);
302 : bool MakeExternal(v8::String::ExternalOneByteStringResource* resource);
303 : bool SupportsExternalization();
304 :
305 : // Conversion.
306 : inline bool AsArrayIndex(uint32_t* index);
307 : uint32_t inline ToValidIndex(Object number);
308 :
309 : // Trimming.
310 : enum TrimMode { kTrim, kTrimStart, kTrimEnd };
311 : static Handle<String> Trim(Isolate* isolate, Handle<String> string,
312 : TrimMode mode);
313 :
314 : DECL_CAST(String)
315 :
316 : void PrintOn(FILE* out);
317 :
318 : // For use during stack traces. Performs rudimentary sanity check.
319 : bool LooksValid();
320 :
321 : // Dispatched behavior.
322 : void StringShortPrint(StringStream* accumulator, bool show_details = true);
323 : void PrintUC16(std::ostream& os, int start = 0, int end = -1); // NOLINT
324 : #if defined(DEBUG) || defined(OBJECT_PRINT)
325 : char* ToAsciiArray();
326 : #endif
327 : DECL_PRINTER(String)
328 : DECL_VERIFIER(String)
329 :
330 : inline bool IsFlat();
331 :
332 : // Layout description.
333 : static const int kLengthOffset = Name::kHeaderSize;
334 : static const int kHeaderSize = kLengthOffset + kInt32Size;
335 :
336 : // Max char codes.
337 : static const int32_t kMaxOneByteCharCode = unibrow::Latin1::kMaxChar;
338 : static const uint32_t kMaxOneByteCharCodeU = unibrow::Latin1::kMaxChar;
339 : static const int kMaxUtf16CodeUnit = 0xffff;
340 : static const uint32_t kMaxUtf16CodeUnitU = kMaxUtf16CodeUnit;
341 : static const uc32 kMaxCodePoint = 0x10ffff;
342 :
343 : // Maximal string length.
344 : // The max length is different on 32 and 64 bit platforms. Max length for a
345 : // 32-bit platform is ~268.4M chars. On 64-bit platforms, max length is
346 : // ~1.073B chars. The limit on 64-bit is so that SeqTwoByteString::kMaxSize
347 : // can fit in a 32bit int: 2^31 - 1 is the max positive int, minus one bit as
348 : // each char needs two bytes, subtract 24 bytes for the string header size.
349 :
350 : // See include/v8.h for the definition.
351 : static const int kMaxLength = v8::String::kMaxLength;
352 : static_assert(kMaxLength <= (Smi::kMaxValue / 2 - kHeaderSize),
353 : "Unexpected max String length");
354 :
355 : // Max length for computing hash. For strings longer than this limit the
356 : // string length is used as the hash value.
357 : static const int kMaxHashCalcLength = 16383;
358 :
359 : // Limit for truncation in short printing.
360 : static const int kMaxShortPrintLength = 1024;
361 :
362 : // Helper function for flattening strings.
363 : template <typename sinkchar>
364 : static void WriteToFlat(String source, sinkchar* sink, int from, int to);
365 :
366 : // The return value may point to the first aligned word containing the first
367 : // non-one-byte character, rather than directly to the non-one-byte character.
368 : // If the return value is >= the passed length, the entire string was
369 : // one-byte.
370 10164602 : static inline int NonAsciiStart(const char* chars, int length) {
371 : const char* start = chars;  // Kept so that every return can report an offset from the beginning.
372 10164602 : const char* limit = chars + length;
373 :
374 10164602 : if (length >= kIntptrSize) {  // Word-at-a-time scanning is only attempted for inputs this long.
375 : // Check leading bytes one at a time until chars is uintptr_t-aligned.
376 5300938 : while (!IsAligned(reinterpret_cast<intptr_t>(chars), sizeof(uintptr_t))) {
377 3724785 : if (static_cast<uint8_t>(*chars) > unibrow::Utf8::kMaxOneByteChar) {
378 286 : return static_cast<int>(chars - start);
379 : }
380 3724499 : ++chars;
381 : }
382 : // Check aligned words: any set bit under the mask means some byte is > 0x7F.
383 : DCHECK_EQ(unibrow::Utf8::kMaxOneByteChar, 0x7F);
384 : const uintptr_t non_one_byte_mask = kUintptrAllBitsSet / 0xFF * 0x80;  // 0x80 in every byte, given kUintptrAllBitsSet is all ones.
385 874769382 : while (chars + sizeof(uintptr_t) <= limit) {
386 873194801 : if (*reinterpret_cast<const uintptr_t*>(chars) & non_one_byte_mask) {
387 1572 : return static_cast<int>(chars - start);  // Offset of the word, not of the exact offending byte.
388 : }
389 : chars += sizeof(uintptr_t);
390 : }
391 : }
392 : // Check remaining bytes: the tail after the last full word, or the whole input when it was too short for the word loop.
393 45515989 : while (chars < limit) {
394 35357217 : if (static_cast<uint8_t>(*chars) > unibrow::Utf8::kMaxOneByteChar) {
395 3972 : return static_cast<int>(chars - start);
396 : }
397 35353245 : ++chars;
398 : }
399 :
400 10158772 : return static_cast<int>(chars - start);  // No byte above kMaxOneByteChar was found; chars has reached limit.
401 : }
402 :
403 : static inline bool IsAscii(const char* chars, int length) {  // True when NonAsciiStart reports no offending byte within length.
404 10875 : return NonAsciiStart(chars, length) >= length;
405 : }
406 :
407 : static inline bool IsAscii(const uint8_t* chars, int length) {  // uint8_t overload; reinterprets the buffer as char and applies the same NonAsciiStart test.
408 0 : return NonAsciiStart(reinterpret_cast<const char*>(chars), length) >=
409 : length;
410 : }
411 :
412 : static inline int NonOneByteStart(const uc16* chars, int length) {  // Index of the first code unit above kMaxOneByteCharCodeU, or length if there is none.
413 1879730 : const uc16* limit = chars + length;
414 : const uc16* start = chars;
415 1499402760 : while (chars < limit) {
416 1498139846 : if (*chars > kMaxOneByteCharCodeU) return static_cast<int>(chars - start);
417 1497523030 : ++chars;
418 : }
419 1262914 : return static_cast<int>(chars - start);  // Scanned to limit without a hit.
420 : }
421 :
422 : static inline bool IsOneByte(const uc16* chars, int length) {  // True when no code unit in the buffer exceeds kMaxOneByteCharCodeU.
423 : return NonOneByteStart(chars, length) >= length;
424 : }
425 :
426 : template <class Visitor>
427 : static inline ConsString VisitFlat(Visitor* visitor, String string,
428 : int offset = 0);
429 :
430 : static Handle<FixedArray> CalculateLineEnds(Isolate* isolate,
431 : Handle<String> string,
432 : bool include_ending_line);
433 :
434 : private:
435 : friend class Name;
436 : friend class StringTableInsertionKey;
437 : friend class InternalizedStringKey;
438 :
439 : static Handle<String> SlowFlatten(Isolate* isolate, Handle<ConsString> cons,
440 : PretenureFlag tenure);
441 :
442 : // Slow case of String::Equals. This implementation works on any strings
443 : // but it is most efficient on strings that are almost flat.
444 : bool SlowEquals(String other);
445 :
446 : static bool SlowEquals(Isolate* isolate, Handle<String> one,
447 : Handle<String> two);
448 :
449 : // Slow case of AsArrayIndex.
450 : V8_EXPORT_PRIVATE bool SlowAsArrayIndex(uint32_t* index);
451 :
452 : // Compute and set the hash code.
453 : uint32_t ComputeAndSetHash(Isolate* isolate);
454 :
455 109778593 : OBJECT_CONSTRUCTORS(String, Name);
456 : };
457 :
458 : class SubStringRange {
459 : public:
460 : inline SubStringRange(String string, const DisallowHeapAllocation& no_gc,
461 : int first = 0, int length = -1);  // NOTE(review): length == -1 presumably means "to the end of the string" - confirm in the inline definition.
462 : class iterator;
463 : inline iterator begin();
464 : inline iterator end();
465 :
466 : private:
467 : String string_;
468 : int first_;
469 : int length_;
470 : const DisallowHeapAllocation& no_gc_;  // Held by reference: the DisallowHeapAllocation object must outlive this range.
471 : };
472 :
473 : // The SeqString abstract class captures sequential string values.
474 : class SeqString : public String {
475 : public:
476 : DECL_CAST(SeqString)
477 :
478 : // Truncate the string in-place if possible and return the result.
479 : // In case of new_length == 0, the empty string is returned without
480 : // truncating the original string, so callers must use the returned handle.
481 : V8_WARN_UNUSED_RESULT static Handle<String> Truncate(Handle<SeqString> string,
482 : int new_length);
483 :
484 : OBJECT_CONSTRUCTORS(SeqString, String);
485 : };
486 :
487 : class InternalizedString : public String {
488 : public:
489 : DECL_CAST(InternalizedString)
490 : // TODO(neis): Possibly move some stuff from String here. For now this class
491 : // declares nothing beyond the cast and constructor macros.
492 : OBJECT_CONSTRUCTORS(InternalizedString, String);
493 : };
494 :
495 : // The SeqOneByteString class captures sequential one-byte string objects.
496 : // Each character in a SeqOneByteString is a one-byte character.
497 : class SeqOneByteString : public SeqString {
498 : public:
499 : static const bool kHasOneByteEncoding = true;
500 :
501 : // Dispatched behavior.
502 : inline uint16_t SeqOneByteStringGet(int index);
503 : inline void SeqOneByteStringSet(int index, uint16_t value);
504 :
505 : // Get the address of the characters in this string.
506 : inline Address GetCharsAddress();
507 :
508 : inline uint8_t* GetChars(const DisallowHeapAllocation& no_gc);
509 :
510 : // Clear uninitialized padding space. This ensures that the snapshot content
511 : // is deterministic.
512 : void clear_padding();
513 :
514 : DECL_CAST(SeqOneByteString)
515 :
516 : // Garbage collection support. This method is called by the
517 : // garbage collector to compute the actual size of a SeqOneByteString
518 : // instance.
519 : inline int SeqOneByteStringSize(InstanceType instance_type);
520 :
521 : // Computes the size for a SeqOneByteString instance of a given length:
522 : static int SizeFor(int length) {  // header plus length * kCharSize, passed through OBJECT_POINTER_ALIGN.
523 422684054 : return OBJECT_POINTER_ALIGN(kHeaderSize + length * kCharSize);
524 : }
525 :
526 : // Maximal memory usage for a single sequential one-byte string.
527 : static const int kMaxCharsSize = kMaxLength;
528 : static const int kMaxSize = OBJECT_POINTER_ALIGN(kMaxCharsSize + kHeaderSize);
529 : STATIC_ASSERT((kMaxSize - kHeaderSize) >= String::kMaxLength);  // kMaxSize leaves room for String::kMaxLength characters.
530 :
531 : class BodyDescriptor;
532 :
533 9480967857 : OBJECT_CONSTRUCTORS(SeqOneByteString, SeqString);
534 : };
535 :
536 : // The TwoByteString class captures sequential unicode string objects.
537 : // Each character in the TwoByteString is a two-byte uint16_t.
538 : class SeqTwoByteString : public SeqString {
539 : public:
540 : static const bool kHasOneByteEncoding = false;
541 :
542 : // Dispatched behavior.
543 : inline uint16_t SeqTwoByteStringGet(int index);
544 : inline void SeqTwoByteStringSet(int index, uint16_t value);
545 :
546 : // Get the address of the characters in this string.
547 : inline Address GetCharsAddress();
548 :
549 : inline uc16* GetChars(const DisallowHeapAllocation& no_gc);
550 :
551 : // Clear uninitialized padding space. This ensures that the snapshot content
552 : // is deterministic.
553 : void clear_padding();
554 :
555 : DECL_CAST(SeqTwoByteString)
556 :
557 : // Garbage collection support. This method is called by the
558 : // garbage collector to compute the actual size of a SeqTwoByteString
559 : // instance.
560 : inline int SeqTwoByteStringSize(InstanceType instance_type);
561 :
562 : // Computes the size for a SeqTwoByteString instance of a given length:
563 : static int SizeFor(int length) {  // header plus length * kShortSize, passed through OBJECT_POINTER_ALIGN.
564 226580474 : return OBJECT_POINTER_ALIGN(kHeaderSize + length * kShortSize);
565 : }
566 :
567 : // Maximal memory usage for a single sequential two-byte string.
568 : static const int kMaxCharsSize = kMaxLength * 2;  // Two bytes per character.
569 : static const int kMaxSize = OBJECT_POINTER_ALIGN(kMaxCharsSize + kHeaderSize);
570 : STATIC_ASSERT(static_cast<int>((kMaxSize - kHeaderSize) / sizeof(uint16_t)) >=
571 : String::kMaxLength);  // kMaxSize leaves room for String::kMaxLength uint16_t characters.
572 :
573 : class BodyDescriptor;
574 :
575 227062933 : OBJECT_CONSTRUCTORS(SeqTwoByteString, SeqString);
576 : };
577 :
578 : // The ConsString class describes string values built by using the
579 : // addition operator on strings. A ConsString is a pair where the
580 : // first and second components are pointers to other string values.
581 : // One or both components of a ConsString can be pointers to other
582 : // ConsStrings, creating a binary tree of ConsStrings where the leaves
583 : // are non-ConsString string values. The string value represented by
584 : // a ConsString can be obtained by concatenating the leaf string
585 : // values in a left-to-right depth-first traversal of the tree.
586 : class ConsString : public String {
587 : public:
588 : // First string of the cons cell.
589 : inline String first();
590 : // Doesn't check that the result is a string, even in debug mode. This is
591 : // useful during GC where the mark bits confuse the checks.
592 : inline Object unchecked_first();
593 : inline void set_first(Isolate* isolate, String first,
594 : WriteBarrierMode mode = UPDATE_WRITE_BARRIER);
595 :
596 : // Second string of the cons cell.
597 : inline String second();
598 : // Doesn't check that the result is a string, even in debug mode. This is
599 : // useful during GC where the mark bits confuse the checks.
600 : inline Object unchecked_second();
601 : inline void set_second(Isolate* isolate, String second,
602 : WriteBarrierMode mode = UPDATE_WRITE_BARRIER);
603 :
604 : // Dispatched behavior.
605 : V8_EXPORT_PRIVATE uint16_t ConsStringGet(int index);
606 :
607 : DECL_CAST(ConsString)
608 :
609 : // Layout description: two kTaggedSize fields (first, second), then kSize.
610 : #define CONS_STRING_FIELDS(V) \
611 : V(kFirstOffset, kTaggedSize) \
612 : V(kSecondOffset, kTaggedSize) \
613 : /* Total size. */ \
614 : V(kSize, 0)
615 :
616 : DEFINE_FIELD_OFFSET_CONSTANTS(String::kHeaderSize, CONS_STRING_FIELDS)  // String::kHeaderSize is passed as the starting offset.
617 : #undef CONS_STRING_FIELDS
618 :
619 : // Minimum length for a cons string.
620 : static const int kMinLength = 13;
621 :
622 : typedef FixedBodyDescriptor<kFirstOffset, kSize, kSize> BodyDescriptor;
623 :
624 : DECL_VERIFIER(ConsString)
625 :
626 273526 : OBJECT_CONSTRUCTORS(ConsString, String);
627 : };
628 :
629 : // The ThinString class describes string objects that are just references
630 : // to another string object. They are used for in-place internalization when
631 : // the original string cannot actually be internalized in-place: in these
632 : // cases, the original string is converted to a ThinString pointing at its
633 : // internalized version (which is allocated as a new object).
634 : // In terms of memory layout and most algorithms operating on strings,
635 : // ThinStrings can be thought of as "one-part cons strings".
636 : class ThinString : public String {
637 : public:
638 : // Actual string that this ThinString refers to.
639 : inline String actual() const;
640 : inline HeapObject unchecked_actual() const;  // Same reference, typed as HeapObject rather than String.
641 : inline void set_actual(String s,
642 : WriteBarrierMode mode = UPDATE_WRITE_BARRIER);
643 :
644 : V8_EXPORT_PRIVATE uint16_t ThinStringGet(int index);
645 :
646 : DECL_CAST(ThinString)
647 : DECL_VERIFIER(ThinString)
648 :
649 : // Layout description: a single kTaggedSize field (actual), then kSize.
650 : #define THIN_STRING_FIELDS(V) \
651 : V(kActualOffset, kTaggedSize) \
652 : /* Total size. */ \
653 : V(kSize, 0)
654 :
655 : DEFINE_FIELD_OFFSET_CONSTANTS(String::kHeaderSize, THIN_STRING_FIELDS)  // String::kHeaderSize is passed as the starting offset.
656 : #undef THIN_STRING_FIELDS
657 :
658 : typedef FixedBodyDescriptor<kActualOffset, kSize, kSize> BodyDescriptor;
659 :
660 2624 : OBJECT_CONSTRUCTORS(ThinString, String);
661 : };
662 :
663 : // The Sliced String class describes strings that are substrings of another
664 : // sequential string. The motivation is to save time and memory when creating
665 : // a substring. A Sliced String is described as a pointer to the parent,
666 : // the offset from the start of the parent string and the length. Using
667 : // a Sliced String therefore requires unpacking of the parent string and
668 : // adding the offset to the start address. Substrings of a Sliced String
669 : // are not nested, since the double indirection is simplified when creating
670 : // such a substring.
671 : // Currently missing features are:
672 : // - handling externalized parent strings
673 : // - external strings as parent
674 : // - truncating sliced string to enable otherwise unneeded parent to be GC'ed.
675 : class SlicedString : public String {
676 : public:
677 : inline String parent();  // The string this slice points into.
678 : inline void set_parent(Isolate* isolate, String parent,
679 : WriteBarrierMode mode = UPDATE_WRITE_BARRIER);
680 : inline int offset() const;  // Offset of the slice from the start of the parent string.
681 : inline void set_offset(int offset);
682 :
683 : // Dispatched behavior.
684 : V8_EXPORT_PRIVATE uint16_t SlicedStringGet(int index);
685 :
686 : DECL_CAST(SlicedString)
687 :
688 : // Layout description: two kTaggedSize fields (parent, offset), then kSize.
689 : #define SLICED_STRING_FIELDS(V) \
690 : V(kParentOffset, kTaggedSize) \
691 : V(kOffsetOffset, kTaggedSize) \
692 : /* Total size. */ \
693 : V(kSize, 0)
694 :
695 : DEFINE_FIELD_OFFSET_CONSTANTS(String::kHeaderSize, SLICED_STRING_FIELDS)  // String::kHeaderSize is passed as the starting offset.
696 : #undef SLICED_STRING_FIELDS
697 :
698 : // Minimum length for a sliced string.
699 : static const int kMinLength = 13;
700 :
701 : typedef FixedBodyDescriptor<kParentOffset, kSize, kSize> BodyDescriptor;
702 :
703 : DECL_VERIFIER(SlicedString)
704 :
705 1095402 : OBJECT_CONSTRUCTORS(SlicedString, String);
706 : };
707 :
708 : // The ExternalString class describes string values that are backed by
709 : // a string resource that lies outside the V8 heap. ExternalStrings
710 : // consist of the length field common to all strings and a pointer to the
711 : // external resource. It is important to ensure (externally) that the
712 : // resource is not deallocated while the ExternalString is live in the
713 : // V8 heap.
714 : //
715 : // The API expects that all ExternalStrings are created through the
716 : // API. Therefore, ExternalStrings should not be used internally.
717 : class ExternalString : public String {
718 : public:
719 : DECL_CAST(ExternalString)
720 :
721 : // Layout description: kUncachedSize is recorded before the resource-data field, kSize after it.
722 : #define EXTERNAL_STRING_FIELDS(V) \
723 : V(kResourceOffset, kSystemPointerSize) \
724 : /* Size of uncached external strings. */ \
725 : V(kUncachedSize, 0) \
726 : V(kResourceDataOffset, kSystemPointerSize) \
727 : /* Total size. */ \
728 : V(kSize, 0)
729 :
730 : DEFINE_FIELD_OFFSET_CONSTANTS(String::kHeaderSize, EXTERNAL_STRING_FIELDS)  // String::kHeaderSize is passed as the starting offset.
731 : #undef EXTERNAL_STRING_FIELDS
732 :
733 : // Return whether the external string data pointer is not cached.
734 : inline bool is_uncached() const;
735 : // Size in bytes of the external payload.
736 : int ExternalPayloadSize() const;
737 :
738 : // Used in the serializer/deserializer.
739 : inline Address resource_as_address();
740 : inline void set_address_as_resource(Address address);
741 : inline uint32_t resource_as_uint32();
742 : inline void set_uint32_as_resource(uint32_t value);
743 :
744 : STATIC_ASSERT(kResourceOffset == Internals::kStringResourceOffset);  // Compile-time check that this offset matches the constant declared in Internals.
745 :
746 : OBJECT_CONSTRUCTORS(ExternalString, String);
747 : };
748 :
749 : // The ExternalOneByteString class is an external string backed by an
750 : // one-byte string.
751 : class ExternalOneByteString : public ExternalString {
752 : public:
753 : static const bool kHasOneByteEncoding = true;
754 :
755 : typedef v8::String::ExternalOneByteStringResource Resource;
756 :
757 : // The underlying resource.
758 : inline const Resource* resource();
759 :
760 : // It is assumed that the previous resource is null. If it is not null, then
761 : // it is the responsibility of the caller to handle the previous resource.
762 : inline void SetResource(Isolate* isolate, const Resource* buffer);
763 : // Used only during serialization.
764 : inline void set_resource(const Resource* buffer);
765 :
766 : // Update the pointer cache to the external character array.
767 : // The cached pointer is always valid, as the external character array does
768 : // not move during lifetime. Deserialization is the only exception, after
769 : // which the pointer cache has to be refreshed.
770 : inline void update_data_cache();
771 :
772 : inline const uint8_t* GetChars();
773 :
774 : // Dispatched behavior.
775 : inline uint16_t ExternalOneByteStringGet(int index);
776 :
777 : DECL_CAST(ExternalOneByteString)
778 :
779 : class BodyDescriptor;
780 :
781 8882173 : OBJECT_CONSTRUCTORS(ExternalOneByteString, ExternalString);
782 : };
783 :
784 : // The ExternalTwoByteString class is an external string backed by a UTF-16
785 : // encoded string (resource type: v8::String::ExternalStringResource).
786 : class ExternalTwoByteString : public ExternalString {
787 : public:
788 : static const bool kHasOneByteEncoding = false;  // Counterpart in ExternalOneByteString is true.
789 :
790 : typedef v8::String::ExternalStringResource Resource;  // Public-API resource type.
791 :
792 : // The underlying string resource.
793 : inline const Resource* resource();
794 :
795 : // It is assumed that the previous resource is null. If it is not null, then
796 : // it is the responsibility of the caller to handle the previous resource.
797 : inline void SetResource(Isolate* isolate, const Resource* buffer);
798 : // Used only during serialization.
799 : inline void set_resource(const Resource* buffer);
800 :
801 : // Update the pointer cache to the external character array.
802 : // The cached pointer is always valid, as the external character array does
803 : // not move during lifetime. Deserialization is the only exception, after
804 : // which the pointer cache has to be refreshed.
805 : inline void update_data_cache();
806 :
807 : inline const uint16_t* GetChars();  // NOTE(review): presumably the external character array; confirm in the inline definition.
808 :
809 : // Dispatched behavior. Takes an int index and returns a uint16_t code unit.
810 : inline uint16_t ExternalTwoByteStringGet(int index);
811 :
812 : // For regexp code. NOTE(review): `start` is presumably a character offset into the data; confirm.
813 : inline const uint16_t* ExternalTwoByteStringGetData(unsigned start);
814 :
815 : DECL_CAST(ExternalTwoByteString)
816 :
817 : class BodyDescriptor;  // Nested type; only forward-declared in this header section.
818 :
819 54823 : OBJECT_CONSTRUCTORS(ExternalTwoByteString, ExternalString);
820 : };
821 :
822 : // A flat string reader provides random access to the contents of a
823 : // string independent of the character width of the string. The handle
824 : // must be valid as long as the reader is being used.
825 2808790 : class FlatStringReader : public Relocatable {
826 : public:
827 : FlatStringReader(Isolate* isolate, Handle<String> str);  // Reads from a heap string handle.
828 : FlatStringReader(Isolate* isolate, Vector<const char> input);  // Reads from a raw char vector.
829 : void PostGarbageCollection() override;  // Relocatable hook. NOTE(review): presumably refreshes start_ after a GC; confirm in the .cc file.
830 : inline uc32 Get(int index);  // Width-independent accessor; returns a uc32.
831 : template <typename Char>
832 : inline Char Get(int index);  // Typed accessor; the caller chooses the character type.
833 2443120408 : int length() { return length_; }
834 :
835 : private:
836 : Address* str_;  // NOTE(review): presumably the slot behind the Handle<String> passed to the constructor; confirm.
837 : bool is_one_byte_;  // NOTE(review): presumably selects how start_ is interpreted; confirm.
838 : int length_;  // Value returned by length().
839 : const void* start_;  // Untyped pointer. NOTE(review): presumably the first character of the flat content; confirm.
840 : };
841 :
842 : // This maintains an off-stack representation of the stack frames required
843 : // to traverse a ConsString, allowing an entirely iterative and restartable
844 : // traversal of the entire string.
845 : class ConsStringIterator {
846 : public:
847 209707767 : inline ConsStringIterator() = default;  // NOTE(review): depth_, maximum_depth_ and consumed_ have no initializers, so they stay unset until Reset().
848 2610072 : inline explicit ConsStringIterator(ConsString cons_string, int offset = 0) {
849 : Reset(cons_string, offset);
850 1720 : }
851 : inline void Reset(ConsString cons_string, int offset = 0) {
852 11644471 : depth_ = 0;
853 : // With depth_ == 0, Next() always returns a default-constructed (null) String.
854 11608837 : if (cons_string.is_null()) return;
855 42896 : Initialize(cons_string, offset);
856 : }
857 : // Returns a default-constructed (null) String when complete; *offset_out is reset to 0 first.
858 : inline String Next(int* offset_out) {
859 21018702 : *offset_out = 0;
860 21018702 : if (depth_ == 0) return String();
861 10637895 : return Continue(offset_out);
862 : }
863 :
864 : private:
865 : static const int kStackSize = 32;  // Number of slots in frames_.
866 : // Use a mask instead of doing modulo operations for stack wrapping.
867 : static const int kDepthMask = kStackSize - 1;
868 : static_assert(base::bits::IsPowerOfTwo(kStackSize),
869 : "kStackSize must be power of two");
870 : static inline int OffsetForDepth(int depth);
871 :
872 : inline void PushLeft(ConsString string);
873 : inline void PushRight(ConsString string);
874 : inline void AdjustMaximumDepth();
875 : inline void Pop();
876 21495955 : inline bool StackBlown() { return maximum_depth_ - depth_ == kStackSize; }  // True when the depth span equals the stack capacity.
877 : void Initialize(ConsString cons_string, int offset);  // Called by Reset() for a non-null cons_string.
878 : String Continue(int* offset_out);  // Called by Next() whenever depth_ != 0.
879 : String NextLeaf(bool* blew_stack);
880 : String Search(int* offset_out);
881 :
882 : // Stack must always contain only frames for which right traversal
883 : // has not yet been performed.
884 : ConsString frames_[kStackSize];
885 : ConsString root_;
886 : int depth_;  // Zero means Next() has nothing to return.
887 : int maximum_depth_;
888 : int consumed_;
889 : DISALLOW_COPY_AND_ASSIGN(ConsStringIterator);
890 : };
891 : // Forward reader yielding a String's characters one uint16_t at a time, starting at an offset.
892 : class StringCharacterStream {
893 : public:
894 : inline explicit StringCharacterStream(String string, int offset = 0);
895 : inline uint16_t GetNext();
896 : inline bool HasMore();
897 : inline void Reset(String string, int offset = 0);
898 : inline void VisitOneByteString(const uint8_t* chars, int length);  // NOTE(review): presumably a visitor callback that installs a one-byte buffer; confirm.
899 : inline void VisitTwoByteString(const uint16_t* chars, int length);  // NOTE(review): presumably a visitor callback that installs a two-byte buffer; confirm.
900 :
901 : private:
902 : ConsStringIterator iter_;  // NOTE(review): presumably supplies successive segments of cons strings; confirm.
903 : bool is_one_byte_;  // NOTE(review): presumably selects which union member below is active; confirm.
904 : union {
905 : const uint8_t* buffer8_;
906 : const uint16_t* buffer16_;
907 : };
908 : const uint8_t* end_;  // Stored as a byte pointer regardless of character width.
909 : DISALLOW_COPY_AND_ASSIGN(StringCharacterStream);
910 : };
911 :
912 : } // namespace internal
913 : } // namespace v8
914 :
915 : #include "src/objects/object-macros-undef.h"
916 :
917 : #endif // V8_OBJECTS_STRING_H_
|