Line data Source code
1 : // Copyright 2017 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #ifndef V8_OBJECTS_STRING_H_
6 : #define V8_OBJECTS_STRING_H_
7 :
8 : #include "src/base/bits.h"
9 : #include "src/objects/name.h"
10 :
11 : // Has to be the last include (doesn't have include guards):
12 : #include "src/objects/object-macros.h"
13 :
14 : namespace v8 {
15 : namespace internal {
16 :
17 : class BigInt;
18 :
19 : enum AllowNullsFlag { ALLOW_NULLS, DISALLOW_NULLS }; // Taken by String::ToCString().
20 : enum RobustnessFlag { ROBUST_STRING_TRAVERSAL, FAST_STRING_TRAVERSAL }; // Taken by String::ToCString().
21 :
22 : // The characteristics of a string are stored in its map. Retrieving these
23 : // few bits of information is moderately expensive, involving two memory
24 : // loads where the second is dependent on the first. To improve efficiency
25 : // the shape of the string is given its own class so that it can be retrieved
26 : // once and used for several string operations. A StringShape is small enough
27 : // to be passed by value and is immutable, but be aware that flattening a
28 : // string can potentially alter its shape. Also be aware that a GC caused by
29 : // something else can alter the shape of a string due to ConsString
30 : // shortcutting. Keeping these restrictions in mind has proven to be error-
31 : // prone and so we no longer put StringShapes in variables unless there is a
32 : // concrete performance benefit at that particular point in the code.
33 : class StringShape BASE_EMBEDDED {
34 : public:
35 : inline explicit StringShape(const String* s); // A shape can be built from a string, a map, or an instance type.
36 : inline explicit StringShape(Map* s);
37 : inline explicit StringShape(InstanceType t);
38 : inline bool IsSequential();
39 : inline bool IsExternal();
40 : inline bool IsCons();
41 : inline bool IsSliced();
42 : inline bool IsThin();
43 : inline bool IsIndirect();
44 : inline bool IsExternalOneByte();
45 : inline bool IsExternalTwoByte();
46 : inline bool IsSequentialOneByte();
47 : inline bool IsSequentialTwoByte();
48 : inline bool IsInternalized();
49 : inline StringRepresentationTag representation_tag();
50 : inline uint32_t encoding_tag();
51 : inline uint32_t full_representation_tag();
52 : inline bool HasOnlyOneByteChars();
53 : #ifdef DEBUG
54 : inline uint32_t type() { return type_; } // DEBUG-only accessor for the raw type_ value.
55 : inline void invalidate() { valid_ = false; } // DEBUG-only: marks this shape as no longer valid.
56 : inline bool valid() { return valid_; }
57 : #else
58 : inline void invalidate() {} // No-op when DEBUG is not defined, so callers need no #ifdef.
59 : #endif
60 :
61 : private:
62 : uint32_t type_;
63 : #ifdef DEBUG
64 : inline void set_valid() { valid_ = true; }
65 : bool valid_; // Exists only in DEBUG builds; toggled by set_valid() / invalidate().
66 : #else
67 : inline void set_valid() {} // No-op when DEBUG is not defined.
68 : #endif
69 : };
70 :
71 : // The String abstract class captures JavaScript string values:
72 : //
73 : // Ecma-262:
74 : // 4.3.16 String Value
75 : // A string value is a member of the type String and is a finite
76 : // ordered sequence of zero or more 16-bit unsigned integer values.
77 : //
78 : // All string values have a length field.
79 : class String : public Name {
80 : public:
81 : enum Encoding { ONE_BYTE_ENCODING, TWO_BYTE_ENCODING };
82 :
83 : class SubStringRange { // Range over a String that exposes begin()/end() iterators.
84 : public:
85 : explicit inline SubStringRange(String* string, int first = 0,
86 : int length = -1); // NOTE(review): -1 presumably means "up to the end of the string" - confirm in the inline definition.
87 : class iterator;
88 : inline iterator begin();
89 : inline iterator end();
90 :
91 : private:
92 : String* string_;
93 : int first_;
94 : int length_;
95 : };
96 :
97 : // Representation of the flat content of a String.
98 : // A non-flat string doesn't have flat content.
99 : // A flat string has content that's encoded as a sequence of either
100 : // one-byte chars or two-byte UC16.
101 : // Returned by String::GetFlatContent().
102 : class FlatContent {
103 : public:
104 : // Returns true if the string is flat and this structure contains content.
105 : bool IsFlat() const { return state_ != NON_FLAT; }
106 : // Returns true if the structure contains one-byte content.
107 1528327 : bool IsOneByte() const { return state_ == ONE_BYTE; }
108 : // Returns true if the structure contains two-byte content.
109 : bool IsTwoByte() const { return state_ == TWO_BYTE; }
110 :
111 : // Returns the one-byte content of the string. Only use if IsOneByte()
112 : // returns true (checked by a DCHECK only).
113 1384561 : Vector<const uint8_t> ToOneByteVector() const {
114 : DCHECK_EQ(ONE_BYTE, state_);
115 5274823 : return Vector<const uint8_t>(onebyte_start, length_);
116 : }
117 : // Returns the two-byte content of the string. Only use if IsTwoByte()
118 : // returns true (checked by a DCHECK only).
119 903271 : Vector<const uc16> ToUC16Vector() const {
120 : DCHECK_EQ(TWO_BYTE, state_);
121 3808536 : return Vector<const uc16>(twobyte_start, length_);
122 : }
123 :
124 1260 : uc16 Get(int i) const { // Returns the code unit at index i; one-byte content is widened to uc16.
125 : DCHECK(i < length_); // Only the upper bound is checked here; a negative i is not caught.
126 : DCHECK(state_ != NON_FLAT);
127 83481841 : if (state_ == ONE_BYTE) return onebyte_start[i];
128 26025135 : return twobyte_start[i];
129 : }
130 :
131 : bool UsesSameString(const FlatContent& other) const { // Compares start pointers only; length_ and state_ are not compared.
132 : return onebyte_start == other.onebyte_start; // Reads the onebyte_start union member whatever state_ is.
133 : }
134 :
135 : private:
136 : enum State { NON_FLAT, ONE_BYTE, TWO_BYTE };
137 :
138 : // Constructors only used by String::GetFlatContent().
139 : explicit FlatContent(const uint8_t* start, int length)
140 : : onebyte_start(start), length_(length), state_(ONE_BYTE) {}
141 : explicit FlatContent(const uc16* start, int length)
142 : : twobyte_start(start), length_(length), state_(TWO_BYTE) {}
143 : FlatContent() : onebyte_start(nullptr), length_(0), state_(NON_FLAT) {} // Non-flat: null start pointer, zero length.
144 :
145 : union { // Which member is meaningful is recorded in state_.
146 : const uint8_t* onebyte_start;
147 : const uc16* twobyte_start;
148 : };
149 : int length_;
150 : State state_;
151 :
152 : friend class String;
153 : friend class IterableSubString;
154 : };
155 :
156 : template <typename Char>
157 : INLINE(Vector<const Char> GetCharVector());
158 :
159 : // Get and set the length of the string.
160 : inline int length() const;
161 : inline void set_length(int value);
162 :
163 : // Get and set the length of the string using acquire loads and release
164 : // stores.
165 : inline int synchronized_length() const;
166 : inline void synchronized_set_length(int value);
167 :
168 : // Returns whether this string has only one-byte chars, i.e. all of them can
169 : // be one-byte encoded. This might be the case even if the string is
170 : // two-byte. Such strings may appear when the embedder prefers
171 : // two-byte external representations even for one-byte data.
172 : inline bool IsOneByteRepresentation() const;
173 : inline bool IsTwoByteRepresentation() const;
174 :
175 : // Cons and slices have an encoding flag that may not represent the actual
176 : // encoding of the underlying string. This is taken into account here.
177 : // Requires: this->IsFlat()
178 : inline bool IsOneByteRepresentationUnderneath();
179 : inline bool IsTwoByteRepresentationUnderneath();
180 :
181 : // NOTE: this should be considered only a hint. False negatives are
182 : // possible.
183 : inline bool HasOnlyOneByteChars();
184 :
185 : // Get and set individual two byte chars in the string.
186 : inline void Set(int index, uint16_t value);
187 : // Get individual two byte char in the string. Repeated calls
188 : // to this method are not efficient unless the string is flat.
189 : INLINE(uint16_t Get(int index));
190 :
191 : // ES6 section 7.1.3.1 ToNumber Applied to the String Type
192 : static Handle<Object> ToNumber(Handle<String> subject);
193 :
194 : // Flattens the string. Checks first inline to see if it is
195 : // necessary. Does nothing if the string is not a cons string.
196 : // Flattening allocates a sequential string with the same data as
197 : // the given string and mutates the cons string to a degenerate
198 : // form, where the first component is the new sequential string and
199 : // the second component is the empty string. If allocation fails,
200 : // this function returns a failure. If flattening succeeds, this
201 : // function returns the sequential string that is now the first
202 : // component of the cons string.
203 : //
204 : // Degenerate cons strings are handled specially by the garbage
205 : // collector (see IsShortcutCandidate).
206 :
207 : static inline Handle<String> Flatten(Handle<String> string,
208 : PretenureFlag pretenure = NOT_TENURED);
209 :
210 : // Tries to return the content of a flat string as a structure holding either
211 : // a flat vector of char or of uc16.
212 : // If the string isn't flat, and therefore doesn't have flat content, the
213 : // returned structure will report so, and can't provide a vector of either
214 : // kind.
215 : FlatContent GetFlatContent();
216 :
217 : // Returns the parent of a sliced string or first part of a flat cons string.
218 : // Requires: StringShape(this).IsIndirect() && this->IsFlat()
219 : inline String* GetUnderlying();
220 :
221 : // String relational comparison, implemented according to ES6 section 7.2.11
222 : // Abstract Relational Comparison (step 5): The comparison of Strings uses a
223 : // simple lexicographic ordering on sequences of code unit values. There is no
224 : // attempt to use the more complex, semantically oriented definitions of
225 : // character or string equality and collating order defined in the Unicode
226 : // specification. Therefore String values that are canonically equal according
227 : // to the Unicode standard could test as unequal. In effect this algorithm
228 : // assumes that both Strings are already in normalized form. Also, note that
229 : // for strings containing supplementary characters, lexicographic ordering on
230 : // sequences of UTF-16 code unit values differs from that on sequences of code
231 : // point values.
232 : MUST_USE_RESULT static ComparisonResult Compare(Handle<String> x,
233 : Handle<String> y);
234 :
235 : // Perform ES6 21.1.3.8, including checking arguments.
236 : static Object* IndexOf(Isolate* isolate, Handle<Object> receiver,
237 : Handle<Object> search, Handle<Object> position);
238 : // Perform string match of pattern on subject, starting at start index.
239 : // Caller must ensure that 0 <= start_index <= sub->length(), as this does not
240 : // check any arguments.
241 : static int IndexOf(Isolate* isolate, Handle<String> receiver,
242 : Handle<String> search, int start_index);
243 :
244 : static Object* LastIndexOf(Isolate* isolate, Handle<Object> receiver,
245 : Handle<Object> search, Handle<Object> position);
246 :
247 : // Encapsulates logic related to a match and its capture groups as required
248 : // by GetSubstitution.
249 3976 : class Match { // Abstract interface: every accessor below is pure virtual.
250 : public:
251 : virtual Handle<String> GetMatch() = 0;
252 : virtual Handle<String> GetPrefix() = 0;
253 : virtual Handle<String> GetSuffix() = 0;
254 :
255 : // A named capture can be invalid (if it is not specified in the pattern),
256 : // unmatched (specified but not matched in the current string), or matched.
257 : enum CaptureState { INVALID, UNMATCHED, MATCHED };
258 :
259 : virtual int CaptureCount() = 0;
260 : virtual bool HasNamedCaptures() = 0;
261 : virtual MaybeHandle<String> GetCapture(int i, bool* capture_exists) = 0; // NOTE(review): capture_exists looks like an out-parameter - confirm with implementers.
262 : virtual MaybeHandle<String> GetNamedCapture(Handle<String> name,
263 : CaptureState* state) = 0; // NOTE(review): |state| presumably receives one of the CaptureState values - confirm.
264 :
265 3976 : virtual ~Match() {} // Virtual so that implementations can be destroyed through a Match*.
266 : };
267 :
268 : // ES#sec-getsubstitution
269 : // GetSubstitution(matched, str, position, captures, replacement)
270 : // Expand the $-expressions in the string and return a new string with
271 : // the result.
272 : // A {start_index} can be passed to specify where to start scanning the
273 : // replacement string.
274 : MUST_USE_RESULT static MaybeHandle<String> GetSubstitution(
275 : Isolate* isolate, Match* match, Handle<String> replacement,
276 : int start_index = 0);
277 :
278 : // String equality operations.
279 : inline bool Equals(String* other);
280 : inline static bool Equals(Handle<String> one, Handle<String> two);
281 : bool IsUtf8EqualTo(Vector<const char> str, bool allow_prefix_match = false);
282 :
283 : // Dispatches to Is{One,Two}ByteEqualTo.
284 : template <typename Char>
285 : bool IsEqualTo(Vector<const Char> str);
286 :
287 : bool IsOneByteEqualTo(Vector<const uint8_t> str);
288 : bool IsTwoByteEqualTo(Vector<const uc16> str);
289 :
290 : // Return a UTF8 representation of the string. The string is null
291 : // terminated but may optionally contain nulls. Length is returned
292 : // in length_output if length_output is not a null pointer. The string
293 : // should be nearly flat, otherwise the performance of this method may
294 : // be very slow (quadratic in the length). Setting robustness_flag to
295 : // ROBUST_STRING_TRAVERSAL invokes behaviour that is robust. This means it
296 : // handles unexpected data without causing assert failures and it does not
297 : // do any heap allocations. This is useful when printing stack traces.
298 : std::unique_ptr<char[]> ToCString(AllowNullsFlag allow_nulls,
299 : RobustnessFlag robustness_flag, int offset,
300 : int length, int* length_output = 0);
301 : std::unique_ptr<char[]> ToCString(
302 : AllowNullsFlag allow_nulls = DISALLOW_NULLS,
303 : RobustnessFlag robustness_flag = FAST_STRING_TRAVERSAL,
304 : int* length_output = 0);
305 :
306 : bool ComputeArrayIndex(uint32_t* index);
307 :
308 : // Externalization.
309 : bool MakeExternal(v8::String::ExternalStringResource* resource);
310 : bool MakeExternal(v8::String::ExternalOneByteStringResource* resource);
311 :
312 : // Conversion.
313 : inline bool AsArrayIndex(uint32_t* index);
314 : uint32_t inline ToValidIndex(Object* number);
315 :
316 : // Trimming.
317 : enum TrimMode { kTrim, kTrimLeft, kTrimRight };
318 : static Handle<String> Trim(Handle<String> string, TrimMode mode);
319 :
320 : DECL_CAST(String)
321 :
322 : void PrintOn(FILE* out);
323 :
324 : // For use during stack traces. Performs rudimentary sanity check.
325 : bool LooksValid();
326 :
327 : // Dispatched behavior.
328 : void StringShortPrint(StringStream* accumulator, bool show_details = true);
329 : void PrintUC16(std::ostream& os, int start = 0, int end = -1); // NOLINT
330 : #if defined(DEBUG) || defined(OBJECT_PRINT)
331 : char* ToAsciiArray();
332 : #endif
333 : DECL_PRINTER(String)
334 : DECL_VERIFIER(String)
335 :
336 : inline bool IsFlat();
337 :
338 : // Layout description.
339 : static const int kLengthOffset = Name::kSize;
340 : static const int kSize = kLengthOffset + kPointerSize;
341 :
342 : // Max char codes.
343 : static const int32_t kMaxOneByteCharCode = unibrow::Latin1::kMaxChar;
344 : static const uint32_t kMaxOneByteCharCodeU = unibrow::Latin1::kMaxChar;
345 : static const int kMaxUtf16CodeUnit = 0xffff;
346 : static const uint32_t kMaxUtf16CodeUnitU = kMaxUtf16CodeUnit;
347 : static const uc32 kMaxCodePoint = 0x10ffff;
348 :
349 : // Maximal string length.
350 : // The max length is different on 32 and 64 bit platforms. Max length for a
351 : // 32-bit platform is ~268.4M chars. On 64-bit platforms, max length is
352 : // ~1.073B chars. The limit on 64-bit is so that SeqTwoByteString::kMaxSize
353 : // can fit in a 32bit int: 2^31 - 1 is the max positive int, minus one bit as
354 : // each char needs two bytes, subtract 24 bytes for the string header size.
355 :
356 : // See include/v8.h for the definition.
357 : static const int kMaxLength = v8::String::kMaxLength;
358 :
359 : // Max length for computing hash. For strings longer than this limit the
360 : // string length is used as the hash value.
361 : static const int kMaxHashCalcLength = 16383;
362 :
363 : // Limit for truncation in short printing.
364 : static const int kMaxShortPrintLength = 1024;
365 :
366 : // Support for regular expressions.
367 : const uc16* GetTwoByteData(unsigned start);
368 :
369 : // Helper function for flattening strings.
370 : template <typename sinkchar>
371 : static void WriteToFlat(String* source, sinkchar* sink, int from, int to);
372 :
373 : // The return value may point to the first aligned word containing the first
374 : // non-one-byte character, rather than directly to the non-one-byte character.
375 : // If the return value is >= the passed length, the entire string was
376 : // one-byte.
377 9721953 : static inline int NonAsciiStart(const char* chars, int length) {
378 : const char* start = chars;
379 9721953 : const char* limit = chars + length;
380 :
381 9721953 : if (length >= kIntptrSize) { // Shorter inputs skip the word-at-a-time scan and use the byte loop below.
382 : // Check unaligned bytes one at a time until chars is word-aligned.
383 4800247 : while (!IsAligned(reinterpret_cast<intptr_t>(chars), sizeof(uintptr_t))) {
384 3454780 : if (static_cast<uint8_t>(*chars) > unibrow::Utf8::kMaxOneByteChar) {
385 310 : return static_cast<int>(chars - start);
386 : }
387 3454470 : ++chars;
388 : }
389 : // Check aligned words: any set high bit means a byte above 0x7F.
390 : DCHECK_EQ(unibrow::Utf8::kMaxOneByteChar, 0x7F);
391 : const uintptr_t non_one_byte_mask = kUintptrAllBitsSet / 0xFF * 0x80; // 0x80 in every byte, assuming kUintptrAllBitsSet is all ones.
392 855609207 : while (chars + sizeof(uintptr_t) <= limit) {
393 854265039 : if (*reinterpret_cast<const uintptr_t*>(chars) & non_one_byte_mask) {
394 1299 : return static_cast<int>(chars - start); // Offset of the word holding the byte, not of the byte itself.
395 : }
396 : chars += sizeof(uintptr_t);
397 : }
398 : }
399 : // Check remaining unaligned bytes (or the whole input if it was short).
400 41519960 : while (chars < limit) {
401 31803956 : if (static_cast<uint8_t>(*chars) > unibrow::Utf8::kMaxOneByteChar) {
402 4340 : return static_cast<int>(chars - start);
403 : }
404 31799616 : ++chars;
405 : }
406 :
407 9716004 : return static_cast<int>(chars - start); // Scanned up to limit without finding a byte above 0x7F.
408 : }
409 :
410 : static inline bool IsAscii(const char* chars, int length) { // True when NonAsciiStart() reports no offending byte within length.
411 5 : return NonAsciiStart(chars, length) >= length;
412 : }
413 :
414 : static inline bool IsAscii(const uint8_t* chars, int length) { // uint8_t overload; forwards to NonAsciiStart() through a char* cast.
415 0 : return NonAsciiStart(reinterpret_cast<const char*>(chars), length) >=
416 : length;
417 : }
418 :
419 : static inline int NonOneByteStart(const uc16* chars, int length) { // Index of the first code unit above kMaxOneByteCharCodeU.
420 2055416 : const uc16* limit = chars + length;
421 : const uc16* start = chars;
422 1426985818 : while (chars < limit) {
423 1425599141 : if (*chars > kMaxOneByteCharCodeU) return static_cast<int>(chars - start);
424 1424930402 : ++chars;
425 : }
426 1386677 : return static_cast<int>(chars - start); // Reached limit: no such code unit was found.
427 : }
428 :
429 : static inline bool IsOneByte(const uc16* chars, int length) { // True when NonOneByteStart() finds nothing within length.
430 : return NonOneByteStart(chars, length) >= length;
431 : }
432 :
433 : template <class Visitor>
434 : static inline ConsString* VisitFlat(Visitor* visitor, String* string,
435 : int offset = 0);
436 :
437 : static Handle<FixedArray> CalculateLineEnds(Handle<String> string,
438 : bool include_ending_line);
439 :
440 : // Use the hash field to forward to the canonical internalized string
441 : // when deserializing an internalized string.
442 : inline void SetForwardedInternalizedString(String* string);
443 : inline String* GetForwardedInternalizedString();
444 :
445 : private:
446 : friend class Name;
447 : friend class StringTableInsertionKey;
448 : friend class InternalizedStringKey;
449 :
450 : static Handle<String> SlowFlatten(Handle<ConsString> cons,
451 : PretenureFlag tenure);
452 :
453 : // Slow case of String::Equals. This implementation works on any strings
454 : // but it is most efficient on strings that are almost flat.
455 : bool SlowEquals(String* other);
456 :
457 : static bool SlowEquals(Handle<String> one, Handle<String> two);
458 :
459 : // Slow case of AsArrayIndex.
460 : V8_EXPORT_PRIVATE bool SlowAsArrayIndex(uint32_t* index);
461 :
462 : // Compute and set the hash code.
463 : uint32_t ComputeAndSetHash();
464 :
465 : DISALLOW_IMPLICIT_CONSTRUCTORS(String);
466 : };
467 :
468 : // The SeqString abstract class captures sequential string values.
469 : class SeqString : public String {
470 : public:
471 : DECL_CAST(SeqString)
472 :
473 : // Layout description.
474 : static const int kHeaderSize = String::kSize; // Same as String::kSize: no extra header fields are declared here.
475 :
476 : // Truncate the string in-place if possible and return the result.
477 : // In case of new_length == 0, the empty string is returned without
478 : // truncating the original string.
479 : MUST_USE_RESULT static Handle<String> Truncate(Handle<SeqString> string,
480 : int new_length);
481 :
482 : private:
483 : DISALLOW_IMPLICIT_CONSTRUCTORS(SeqString);
484 : };
485 :
486 : // The OneByteString class captures sequential one-byte string objects.
487 : // Each character in the OneByteString is a one-byte character.
488 : class SeqOneByteString : public SeqString {
489 : public:
490 : static const bool kHasOneByteEncoding = true;
491 :
492 : // Dispatched behavior.
493 : inline uint16_t SeqOneByteStringGet(int index);
494 : inline void SeqOneByteStringSet(int index, uint16_t value);
495 :
496 : // Get the address of the characters in this string.
497 : inline Address GetCharsAddress();
498 :
499 : inline uint8_t* GetChars();
500 :
501 : // Clear uninitialized padding space. This ensures that the snapshot content
502 : // is deterministic.
503 : void clear_padding();
504 :
505 : DECL_CAST(SeqOneByteString)
506 :
507 : // Garbage collection support. This method is called by the
508 : // garbage collector to compute the actual size of a OneByteString
509 : // instance.
510 : inline int SeqOneByteStringSize(InstanceType instance_type);
511 :
512 : // Computes the size for a OneByteString instance of a given length.
513 : static int SizeFor(int length) {
514 561910868 : return OBJECT_POINTER_ALIGN(kHeaderSize + length * kCharSize); // Header plus length * kCharSize bytes, passed through OBJECT_POINTER_ALIGN.
515 : }
516 :
517 : // Maximal memory usage for a single sequential one-byte string.
518 : static const int kMaxSize = OBJECT_POINTER_ALIGN(kMaxLength + kHeaderSize);
519 : STATIC_ASSERT((kMaxSize - kHeaderSize) >= String::kMaxLength); // kMaxSize must leave room for kMaxLength characters after the header.
520 :
521 : class BodyDescriptor;
522 : // No weak fields.
523 : typedef BodyDescriptor BodyDescriptorWeak;
524 :
525 : private:
526 : DISALLOW_IMPLICIT_CONSTRUCTORS(SeqOneByteString);
527 : };
528 :
529 : // The TwoByteString class captures sequential unicode string objects.
530 : // Each character in the TwoByteString is a two-byte uint16_t.
531 : class SeqTwoByteString : public SeqString {
532 : public:
533 : static const bool kHasOneByteEncoding = false;
534 :
535 : // Dispatched behavior.
536 : inline uint16_t SeqTwoByteStringGet(int index);
537 : inline void SeqTwoByteStringSet(int index, uint16_t value);
538 :
539 : // Get the address of the characters in this string.
540 : inline Address GetCharsAddress();
541 :
542 : inline uc16* GetChars();
543 :
544 : // Clear uninitialized padding space. This ensures that the snapshot content
545 : // is deterministic.
546 : void clear_padding();
547 :
548 : // For regexp code.
549 : const uint16_t* SeqTwoByteStringGetData(unsigned start);
550 :
551 : DECL_CAST(SeqTwoByteString)
552 :
553 : // Garbage collection support. This method is called by the
554 : // garbage collector to compute the actual size of a TwoByteString
555 : // instance.
556 : inline int SeqTwoByteStringSize(InstanceType instance_type);
557 :
558 : // Computes the size for a TwoByteString instance of a given length.
559 : static int SizeFor(int length) {
560 80759506 : return OBJECT_POINTER_ALIGN(kHeaderSize + length * kShortSize); // Header plus length * kShortSize bytes, passed through OBJECT_POINTER_ALIGN.
561 : }
562 :
563 : // Maximal memory usage for a single sequential two-byte string.
564 : static const int kMaxSize =
565 : OBJECT_POINTER_ALIGN(kMaxLength * 2 + kHeaderSize); // Two bytes per character.
566 : STATIC_ASSERT(static_cast<int>((kMaxSize - kHeaderSize) / sizeof(uint16_t)) >=
567 : String::kMaxLength); // kMaxSize must leave room for kMaxLength uint16_t characters after the header.
568 :
569 : class BodyDescriptor;
570 : // No weak fields.
571 : typedef BodyDescriptor BodyDescriptorWeak;
572 :
573 : private:
574 : DISALLOW_IMPLICIT_CONSTRUCTORS(SeqTwoByteString);
575 : };
576 :
577 : // The ConsString class describes string values built by using the
578 : // addition operator on strings. A ConsString is a pair where the
579 : // first and second components are pointers to other string values.
580 : // One or both components of a ConsString can be pointers to other
581 : // ConsStrings, creating a binary tree of ConsStrings where the leaves
582 : // are non-ConsString string values. The string value represented by
583 : // a ConsString can be obtained by concatenating the leaf string
584 : // values in a left-to-right depth-first traversal of the tree.
585 : class ConsString : public String {
586 : public:
587 : // First string of the cons cell.
588 : inline String* first();
589 : // Doesn't check that the result is a string, even in debug mode. This is
590 : // useful during GC where the mark bits confuse the checks.
591 : inline Object* unchecked_first();
592 : inline void set_first(String* first,
593 : WriteBarrierMode mode = UPDATE_WRITE_BARRIER);
594 :
595 : // Second string of the cons cell.
596 : inline String* second();
597 : // Doesn't check that the result is a string, even in debug mode. This is
598 : // useful during GC where the mark bits confuse the checks.
599 : inline Object* unchecked_second();
600 : inline void set_second(String* second,
601 : WriteBarrierMode mode = UPDATE_WRITE_BARRIER);
602 :
603 : // Dispatched behavior.
604 : V8_EXPORT_PRIVATE uint16_t ConsStringGet(int index);
605 :
606 : DECL_CAST(ConsString)
607 :
608 : // Layout description: two pointer-sized fields, first then second.
609 : static const int kFirstOffset = POINTER_SIZE_ALIGN(String::kSize);
610 : static const int kSecondOffset = kFirstOffset + kPointerSize;
611 : static const int kSize = kSecondOffset + kPointerSize;
612 :
613 : // Minimum length for a cons string.
614 : static const int kMinLength = 13; // Same value as SlicedString::kMinLength.
615 :
616 : typedef FixedBodyDescriptor<kFirstOffset, kSecondOffset + kPointerSize, kSize>
617 : BodyDescriptor; // NOTE(review): presumably (start offset, end offset, object size), covering both fields - confirm against FixedBodyDescriptor.
618 : // No weak fields.
619 : typedef BodyDescriptor BodyDescriptorWeak;
620 :
621 : DECL_VERIFIER(ConsString)
622 :
623 : private:
624 : DISALLOW_IMPLICIT_CONSTRUCTORS(ConsString);
625 : };
626 :
627 : // The ThinString class describes string objects that are just references
628 : // to another string object. They are used for in-place internalization when
629 : // the original string cannot actually be internalized in-place: in these
630 : // cases, the original string is converted to a ThinString pointing at its
631 : // internalized version (which is allocated as a new object).
632 : // In terms of memory layout and most algorithms operating on strings,
633 : // ThinStrings can be thought of as "one-part cons strings".
634 : class ThinString : public String {
635 : public:
636 : // Actual string that this ThinString refers to.
637 : inline String* actual() const;
638 : inline void set_actual(String* s,
639 : WriteBarrierMode mode = UPDATE_WRITE_BARRIER);
640 :
641 : V8_EXPORT_PRIVATE uint16_t ThinStringGet(int index);
642 :
643 : DECL_CAST(ThinString)
644 : DECL_VERIFIER(ThinString)
645 :
646 : // Layout description: a single pointer-sized field after the String header.
647 : static const int kActualOffset = String::kSize;
648 : static const int kSize = kActualOffset + kPointerSize;
649 :
650 : typedef FixedBodyDescriptor<kActualOffset, kSize, kSize> BodyDescriptor;
651 : // No weak fields.
652 : typedef BodyDescriptor BodyDescriptorWeak;
653 :
654 : private:
655 : DISALLOW_COPY_AND_ASSIGN(ThinString); // NOTE(review): the sibling string classes in this file use DISALLOW_IMPLICIT_CONSTRUCTORS here - consider aligning.
656 : };
657 :
658 : // The Sliced String class describes strings that are substrings of another
659 : // sequential string. The motivation is to save time and memory when creating
660 : // a substring. A Sliced String is described as a pointer to the parent,
661 : // the offset from the start of the parent string and the length. Using
662 : // a Sliced String therefore requires unpacking of the parent string and
663 : // adding the offset to the start address. Substrings of a Sliced String
664 : // are not nested since the double indirection is simplified when creating
665 : // such a substring.
666 : // Currently missing features are:
667 : // - handling externalized parent strings
668 : // - external strings as parent
669 : // - truncating sliced string to enable otherwise unneeded parent to be GC'ed.
670 : class SlicedString : public String {
671 : public:
672 : inline String* parent();
673 : inline void set_parent(String* parent,
674 : WriteBarrierMode mode = UPDATE_WRITE_BARRIER);
675 : inline int offset() const;
676 : inline void set_offset(int offset);
677 :
678 : // Dispatched behavior.
679 : V8_EXPORT_PRIVATE uint16_t SlicedStringGet(int index);
680 :
681 : DECL_CAST(SlicedString)
682 :
683 : // Layout description: parent pointer followed by the offset field.
684 : static const int kParentOffset = POINTER_SIZE_ALIGN(String::kSize);
685 : static const int kOffsetOffset = kParentOffset + kPointerSize;
686 : static const int kSize = kOffsetOffset + kPointerSize; // The offset field is given a full pointer-sized slot.
687 :
688 : // Minimum length for a sliced string.
689 : static const int kMinLength = 13; // Same value as ConsString::kMinLength.
690 :
691 : typedef FixedBodyDescriptor<kParentOffset, kOffsetOffset + kPointerSize,
692 : kSize>
693 : BodyDescriptor;
694 : // No weak fields.
695 : typedef BodyDescriptor BodyDescriptorWeak;
696 :
697 : DECL_VERIFIER(SlicedString)
698 :
699 : private:
700 : DISALLOW_IMPLICIT_CONSTRUCTORS(SlicedString);
701 : };
702 :
703 : // The ExternalString class describes string values that are backed by
704 : // a string resource that lies outside the V8 heap. ExternalStrings
705 : // consist of the length field common to all strings, and a pointer to the
706 : // external resource. It is important to ensure (externally) that the
707 : // resource is not deallocated while the ExternalString is live in the
708 : // V8 heap.
709 : //
710 : // The API expects that all ExternalStrings are created through the
711 : // API. Therefore, ExternalStrings should not be used internally.
712 : class ExternalString : public String {
713 : public:
714 : DECL_CAST(ExternalString)
715 :
716 : // Layout description.
717 : static const int kResourceOffset = POINTER_SIZE_ALIGN(String::kSize);
718 : static const int kShortSize = kResourceOffset + kPointerSize; // Ends right after the resource pointer, i.e. before the slot at kResourceDataOffset.
719 : static const int kResourceDataOffset = kResourceOffset + kPointerSize;
720 : static const int kSize = kResourceDataOffset + kPointerSize;
721 :
722 : // Return whether external string is short (data pointer is not cached).
723 : inline bool is_short();
724 :
725 : STATIC_ASSERT(kResourceOffset == Internals::kStringResourceOffset); // Keeps this layout in sync with Internals::kStringResourceOffset.
726 :
727 : private:
728 : DISALLOW_IMPLICIT_CONSTRUCTORS(ExternalString);
729 : };
730 :
731 : // The ExternalOneByteString class is an external string backed by a
732 : // one-byte string.
733 : class ExternalOneByteString : public ExternalString {
734 : public:
735 : static const bool kHasOneByteEncoding = true;
736 :
737 : typedef v8::String::ExternalOneByteStringResource Resource;
738 :
739 : // The underlying resource.
740 : inline const Resource* resource();
741 : inline void set_resource(const Resource* buffer);
742 :
743 : // Update the pointer cache to the external character array.
744 : // The cached pointer is always valid, as the external character array does
745 : // not move during lifetime. Deserialization is the only exception, after
746 : // which the pointer cache has to be refreshed.
747 : inline void update_data_cache();
748 :
749 : inline const uint8_t* GetChars();
750 :
751 : // Dispatched behavior.
752 : inline uint16_t ExternalOneByteStringGet(int index);
753 :
754 : DECL_CAST(ExternalOneByteString)
755 :
756 : class BodyDescriptor;
757 : // No weak fields.
758 : typedef BodyDescriptor BodyDescriptorWeak;
759 :
760 : private:
761 : DISALLOW_IMPLICIT_CONSTRUCTORS(ExternalOneByteString);
762 : };
763 :
764 : // The ExternalTwoByteString class is an external string backed by a UTF-16
765 : // encoded string.
766 : class ExternalTwoByteString : public ExternalString {
767 : public:
768 : static const bool kHasOneByteEncoding = false;
769 :
770 : typedef v8::String::ExternalStringResource Resource;
771 :
772 : // The underlying string resource.
773 : inline const Resource* resource();
774 : inline void set_resource(const Resource* buffer);
775 :
776 : // Update the pointer cache to the external character array.
777 : // The cached pointer is always valid, as the external character array does
778 : // not move during lifetime. Deserialization is the only exception, after
779 : // which the pointer cache has to be refreshed.
780 : inline void update_data_cache();
781 :
782 : inline const uint16_t* GetChars();
783 :
784 : // Dispatched behavior.
785 : inline uint16_t ExternalTwoByteStringGet(int index);
786 :
787 : // For regexp code.
788 : inline const uint16_t* ExternalTwoByteStringGetData(unsigned start);
789 :
790 : DECL_CAST(ExternalTwoByteString)
791 :
792 : class BodyDescriptor;
793 : // No weak fields.
794 : typedef BodyDescriptor BodyDescriptorWeak;
795 :
796 : private:
797 : DISALLOW_IMPLICIT_CONSTRUCTORS(ExternalTwoByteString);
798 : };
799 :
800 : // A flat string reader provides random access to the contents of a
801 : // string independent of the character width of the string. The handle
802 : // must be valid as long as the reader is being used.
803 2634567 : class FlatStringReader : public Relocatable {
804 : public:
805 : FlatStringReader(Isolate* isolate, Handle<String> str);
806 : FlatStringReader(Isolate* isolate, Vector<const char> input);
807 : void PostGarbageCollection(); // NOTE(review): presumably refreshes start_ after a GC may have moved the string - confirm in the definition.
808 : inline uc32 Get(int index);
809 : template <typename Char>
810 : inline Char Get(int index);
811 2711419590 : int length() { return length_; }
812 :
813 : private:
814 : String** str_;
815 : bool is_one_byte_;
816 : int length_;
817 : const void* start_; // NOTE(review): untyped, presumably because the content may be one- or two-byte (see is_one_byte_) - confirm.
818 : };
819 :
820 : // This maintains an off-stack representation of the stack frames required
821 : // to traverse a ConsString, allowing an entirely iterative and restartable
822 : // traversal of the entire string
823 : class ConsStringIterator {
824 : public:
825 : inline ConsStringIterator() {} // Leaves every member uninitialized, including depth_ which Next() reads; call Reset() first.
826 : inline explicit ConsStringIterator(ConsString* cons_string, int offset = 0) {
827 : Reset(cons_string, offset);
828 : }
829 : inline void Reset(ConsString* cons_string, int offset = 0) {
830 13936302 : depth_ = 0;
831 : // With depth_ == 0 and no string to walk, Next() will always return nullptr.
832 13929225 : if (cons_string == nullptr) return;
833 47184 : Initialize(cons_string, offset);
834 : }
835 : // Returns nullptr when complete. *offset_out is reset to 0 on every call.
836 : inline String* Next(int* offset_out) {
837 51427127 : *offset_out = 0;
838 51427127 : if (depth_ == 0) return nullptr;
839 43511997 : return Continue(offset_out);
840 : }
841 :
842 : private:
843 : static const int kStackSize = 32;
844 : // Use a mask instead of doing modulo operations for stack wrapping.
845 : static const int kDepthMask = kStackSize - 1;
846 : static_assert(base::bits::IsPowerOfTwo(kStackSize),
847 : "kStackSize must be power of two");
848 : static inline int OffsetForDepth(int depth);
849 :
850 : inline void PushLeft(ConsString* string);
851 : inline void PushRight(ConsString* string);
852 : inline void AdjustMaximumDepth();
853 : inline void Pop();
854 83427706 : inline bool StackBlown() { return maximum_depth_ - depth_ == kStackSize; } // True when maximum_depth_ is exactly kStackSize ahead of depth_.
855 : void Initialize(ConsString* cons_string, int offset);
856 : String* Continue(int* offset_out);
857 : String* NextLeaf(bool* blew_stack);
858 : String* Search(int* offset_out);
859 :
860 : // Stack must always contain only frames for which right traversal
861 : // has not yet been performed.
862 : ConsString* frames_[kStackSize];
863 : ConsString* root_;
864 : int depth_;
865 : int maximum_depth_;
866 : int consumed_;
867 : DISALLOW_COPY_AND_ASSIGN(ConsStringIterator);
868 : };
869 :
870 : class StringCharacterStream {
871 : public:
872 : inline explicit StringCharacterStream(String* string, int offset = 0);
873 : inline uint16_t GetNext();
874 : inline bool HasMore();
875 : inline void Reset(String* string, int offset = 0);
876 : inline void VisitOneByteString(const uint8_t* chars, int length);
877 : inline void VisitTwoByteString(const uint16_t* chars, int length);
878 :
879 : private:
880 : ConsStringIterator iter_;
881 : bool is_one_byte_; // NOTE(review): presumably selects which union member below is active - confirm in the inline definitions.
882 : union {
883 : const uint8_t* buffer8_;
884 : const uint16_t* buffer16_;
885 : };
886 : const uint8_t* end_; // Declared as uint8_t* even though the buffer may be the two-byte buffer16_.
887 : DISALLOW_COPY_AND_ASSIGN(StringCharacterStream);
888 : };
889 :
890 : } // namespace internal
891 : } // namespace v8
892 :
893 : #include "src/objects/object-macros-undef.h"
894 :
895 : #endif // V8_OBJECTS_STRING_H_
|