Line data Source code
1 : // Copyright 2017 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #ifndef V8_OBJECTS_STRING_H_
6 : #define V8_OBJECTS_STRING_H_
7 :
8 : #include "src/base/bits.h"
9 : #include "src/base/export-template.h"
10 : #include "src/objects/instance-type.h"
11 : #include "src/objects/name.h"
12 : #include "src/objects/smi.h"
13 : #include "src/unicode-decoder.h"
14 :
15 : // Has to be the last include (doesn't have include guards):
16 : #include "src/objects/object-macros.h"
17 :
18 : namespace v8 {
19 : namespace internal {
20 :
21 : enum InstanceType : uint16_t;
22 :
23 : enum AllowNullsFlag { ALLOW_NULLS, DISALLOW_NULLS };  // Type of the allow_nulls argument of String::ToCString().
24 : enum RobustnessFlag { ROBUST_STRING_TRAVERSAL, FAST_STRING_TRAVERSAL };  // Type of the robustness_flag argument of String::ToCString().
25 :
26 : // The characteristics of a string are stored in its map. Retrieving these
27 : // few bits of information is moderately expensive, involving two memory
28 : // loads where the second is dependent on the first. To improve efficiency
29 : // the shape of the string is given its own class so that it can be retrieved
30 : // once and used for several string operations. A StringShape is small enough
31 : // to be passed by value and is immutable, but be aware that flattening a
32 : // string can potentially alter its shape. Also be aware that a GC caused by
33 : // something else can alter the shape of a string due to ConsString
34 : // shortcutting. Keeping these restrictions in mind has proven to be error-
35 : // prone and so we no longer put StringShapes in variables unless there is a
36 : // concrete performance benefit at that particular point in the code.
37 : class StringShape {
38 : public:
39 : inline explicit StringShape(const String s);  // All three constructors are explicit: no implicit conversion
40 : inline explicit StringShape(Map s);  // from String, Map or InstanceType to StringShape.
41 : inline explicit StringShape(InstanceType t);
42 : inline bool IsSequential();  // The predicates and tag accessors below are only declared here;
43 : inline bool IsExternal();  // their bodies are not part of this listing.
44 : inline bool IsCons();
45 : inline bool IsSliced();
46 : inline bool IsThin();
47 : inline bool IsIndirect();
48 : inline bool IsExternalOneByte();
49 : inline bool IsExternalTwoByte();
50 : inline bool IsSequentialOneByte();
51 : inline bool IsSequentialTwoByte();
52 : inline bool IsInternalized();
53 : inline StringRepresentationTag representation_tag();
54 : inline uint32_t encoding_tag();
55 : inline uint32_t full_representation_tag();
56 : #ifdef DEBUG
57 : inline uint32_t type() { return type_; }  // Raw type_ value; only available in DEBUG builds.
58 : inline void invalidate() { valid_ = false; }  // Clears the debug-only validity flag.
59 : inline bool valid() { return valid_; }
60 : #else
61 : inline void invalidate() {}  // No valid_ member outside DEBUG, so this does nothing.
62 : #endif
63 :
64 : private:
65 : uint32_t type_;
66 : #ifdef DEBUG
67 : inline void set_valid() { valid_ = true; }
68 : bool valid_;  // Exists only in DEBUG builds; toggled by set_valid()/invalidate().
69 : #else
70 : inline void set_valid() {}  // No-op counterpart so callers need no #ifdef.
71 : #endif
72 : };
73 :
74 : // The String abstract class captures JavaScript string values:
75 : //
76 : // Ecma-262:
77 : // 4.3.16 String Value
78 : // A string value is a member of the type String and is a finite
79 : // ordered sequence of zero or more 16-bit unsigned integer values.
80 : //
81 : // All string values have a length field.
82 : class String : public Name {
83 : public:
84 : enum Encoding { ONE_BYTE_ENCODING, TWO_BYTE_ENCODING };
85 :
86 : // Representation of the flat content of a String.
87 : // A non-flat string doesn't have flat content.
88 : // A flat string has content that's encoded as a sequence of either
89 : // one-byte chars or two-byte UC16.
90 : // Returned by String::GetFlatContent().
91 : class FlatContent {
92 : public:
93 : // Returns true if the string is flat and this structure contains content.
94 : bool IsFlat() const { return state_ != NON_FLAT; }
95 : // Returns true if the structure contains one-byte content.
96 1495262 : bool IsOneByte() const { return state_ == ONE_BYTE; }
97 : // Returns true if the structure contains two-byte content.
98 : bool IsTwoByte() const { return state_ == TWO_BYTE; }
99 :
100 : // Returns the one-byte content of the string as a (start, length_) vector.
101 : // Only use if IsOneByte() returns true; this is enforced by DCHECK only.
102 : Vector<const uint8_t> ToOneByteVector() const {
103 : DCHECK_EQ(ONE_BYTE, state_);
104 32249155 : return Vector<const uint8_t>(onebyte_start, length_);
105 : }
106 : // Returns the two-byte content of the string as a (start, length_) vector.
107 : // Only use if IsTwoByte() returns true; this is enforced by DCHECK only.
108 : Vector<const uc16> ToUC16Vector() const {
109 : DCHECK_EQ(TWO_BYTE, state_);
110 6051705 : return Vector<const uc16>(twobyte_start, length_);
111 : }
112 :
113 : uc16 Get(int i) const {  // Element i of the content; one-byte elements are returned as uc16.
114 : DCHECK(i < length_);  // Only the upper bound is checked, and only via DCHECK.
115 : DCHECK(state_ != NON_FLAT);
116 62215639 : if (state_ == ONE_BYTE) return onebyte_start[i];
117 15286027 : return twobyte_start[i];
118 : }
119 :
120 : bool UsesSameString(const FlatContent& other) const {  // Compares start pointers only; length_ and state_ are not compared.
121 : return onebyte_start == other.onebyte_start;  // Read through the one-byte union member regardless of state_.
122 : }
123 :
124 : private:
125 : enum State { NON_FLAT, ONE_BYTE, TWO_BYTE };
126 :
127 : // Constructors only used by String::GetFlatContent().
128 : explicit FlatContent(const uint8_t* start, int length)
129 : : onebyte_start(start), length_(length), state_(ONE_BYTE) {}
130 : explicit FlatContent(const uc16* start, int length)
131 : : twobyte_start(start), length_(length), state_(TWO_BYTE) {}
132 : FlatContent() : onebyte_start(nullptr), length_(0), state_(NON_FLAT) {}  // Non-flat: null start, zero length.
133 :
134 : union {  // state_ records which member the constructor initialized.
135 : const uint8_t* onebyte_start;
136 : const uc16* twobyte_start;
137 : };
138 : int length_;
139 : State state_;
140 :
141 : friend class String;  // Friends can reach the private constructors above.
142 : friend class IterableSubString;
143 : };
144 :
145 : template <typename Char>
146 : V8_INLINE Vector<const Char> GetCharVector(
147 : const DisallowHeapAllocation& no_gc);
148 :
149 : // Get and set the length of the string.
150 : inline int length() const;
151 : inline void set_length(int value);
152 :
153 : // Get and set the length of the string using acquire loads and release
154 : // stores.
155 : inline int synchronized_length() const;
156 : inline void synchronized_set_length(int value);
157 :
158 : // Returns whether this string has only one-byte chars, i.e. all of them can
159 : // be one-byte encoded. This might be the case even if the string is
160 : // two-byte. Such strings may appear when the embedder prefers
161 : // two-byte external representations even for one-byte data.
162 : inline bool IsOneByteRepresentation() const;
163 : inline bool IsTwoByteRepresentation() const;
164 :
165 : // Cons and slices have an encoding flag that may not represent the actual
166 : // encoding of the underlying string. This is taken into account here.
167 : // This function is static because that helps it get inlined.
168 : // Requires: string.IsFlat()
169 : static inline bool IsOneByteRepresentationUnderneath(String string);
170 :
171 : // Get and set individual two byte chars in the string.
172 : inline void Set(int index, uint16_t value);
173 : // Get individual two byte char in the string. Repeated calls
174 : // to this method are not efficient unless the string is flat.
175 : V8_INLINE uint16_t Get(int index);
176 :
177 : // ES6 section 7.1.3.1 ToNumber Applied to the String Type
178 : static Handle<Object> ToNumber(Isolate* isolate, Handle<String> subject);
179 :
180 : // Flattens the string. Checks first inline to see if it is
181 : // necessary. Does nothing if the string is not a cons string.
182 : // Flattening allocates a sequential string with the same data as
183 : // the given string and mutates the cons string to a degenerate
184 : // form, where the first component is the new sequential string and
185 : // the second component is the empty string. If allocation fails,
186 : // this function returns a failure. If flattening succeeds, this
187 : // function returns the sequential string that is now the first
188 : // component of the cons string.
189 : //
190 : // Degenerate cons strings are handled specially by the garbage
191 : // collector (see IsShortcutCandidate).
192 :
193 : static inline Handle<String> Flatten(
194 : Isolate* isolate, Handle<String> string,
195 : AllocationType allocation = AllocationType::kYoung);
196 :
197 : // Tries to return the content of a flat string as a structure holding either
198 : // a flat vector of char or of uc16.
199 : // If the string isn't flat, and therefore doesn't have flat content, the
200 : // returned structure will report so, and can't provide a vector of either
201 : // kind.
202 : V8_EXPORT_PRIVATE FlatContent
203 : GetFlatContent(const DisallowHeapAllocation& no_gc);
204 :
205 : // Returns the parent of a sliced string or first part of a flat cons string.
206 : // Requires: StringShape(this).IsIndirect() && this->IsFlat()
207 : inline String GetUnderlying();
208 :
209 : // String relational comparison, implemented according to ES6 section 7.2.11
210 : // Abstract Relational Comparison (step 5): The comparison of Strings uses a
211 : // simple lexicographic ordering on sequences of code unit values. There is no
212 : // attempt to use the more complex, semantically oriented definitions of
213 : // character or string equality and collating order defined in the Unicode
214 : // specification. Therefore String values that are canonically equal according
215 : // to the Unicode standard could test as unequal. In effect this algorithm
216 : // assumes that both Strings are already in normalized form. Also, note that
217 : // for strings containing supplementary characters, lexicographic ordering on
218 : // sequences of UTF-16 code unit values differs from that on sequences of code
219 : // point values.
220 : V8_WARN_UNUSED_RESULT static ComparisonResult Compare(Isolate* isolate,
221 : Handle<String> x,
222 : Handle<String> y);
223 :
224 : // Perform ES6 21.1.3.8, including checking arguments.
225 : static Object IndexOf(Isolate* isolate, Handle<Object> receiver,
226 : Handle<Object> search, Handle<Object> position);
227 : // Perform string match of pattern on subject, starting at start index.
228 : // Caller must ensure that 0 <= start_index <= sub->length(), as this does not
229 : // check any arguments.
230 : static int IndexOf(Isolate* isolate, Handle<String> receiver,
231 : Handle<String> search, int start_index);
232 :
233 : static Object LastIndexOf(Isolate* isolate, Handle<Object> receiver,
234 : Handle<Object> search, Handle<Object> position);
235 :
236 : // Encapsulates logic related to a match and its capture groups as required
237 : // by GetSubstitution.
238 3912 : class Match {  // Abstract interface: every member function except the destructor is pure virtual.
239 : public:
240 : virtual Handle<String> GetMatch() = 0;
241 : virtual Handle<String> GetPrefix() = 0;
242 : virtual Handle<String> GetSuffix() = 0;
243 :
244 : // A named capture can be invalid (if it is not specified in the pattern),
245 : // unmatched (specified but not matched in the current string), and matched.
246 : enum CaptureState { INVALID, UNMATCHED, MATCHED };
247 :
248 : virtual int CaptureCount() = 0;
249 : virtual bool HasNamedCaptures() = 0;
250 : virtual MaybeHandle<String> GetCapture(int i, bool* capture_exists) = 0;  // capture_exists is presumably an out-parameter - confirm with implementers.
251 : virtual MaybeHandle<String> GetNamedCapture(Handle<String> name,
252 : CaptureState* state) = 0;  // state presumably receives one of the CaptureState values - confirm.
253 :
254 3912 : virtual ~Match() = default;  // Virtual so implementations can be destroyed through a Match*.
255 : };
256 :
257 : // ES#sec-getsubstitution
258 : // GetSubstitution(matched, str, position, captures, replacement)
259 : // Expand the $-expressions in the string and return a new string with
260 : // the result.
261 : // A {start_index} can be passed to specify where to start scanning the
262 : // replacement string.
263 : V8_WARN_UNUSED_RESULT static MaybeHandle<String> GetSubstitution(
264 : Isolate* isolate, Match* match, Handle<String> replacement,
265 : int start_index = 0);
266 :
267 : // String equality operations.
268 : inline bool Equals(String other);
269 : inline static bool Equals(Isolate* isolate, Handle<String> one,
270 : Handle<String> two);
271 : V8_EXPORT_PRIVATE bool IsUtf8EqualTo(Vector<const char> str,
272 : bool allow_prefix_match = false);
273 :
274 : // Dispatches to Is{One,Two}ByteEqualTo.
275 : template <typename Char>
276 : bool IsEqualTo(Vector<const Char> str);
277 :
278 : V8_EXPORT_PRIVATE bool IsOneByteEqualTo(Vector<const uint8_t> str);
279 : bool IsTwoByteEqualTo(Vector<const uc16> str);
280 :
281 : // Return a UTF8 representation of the string. The string is null
282 : // terminated but may optionally contain nulls. Length is returned
283 : // in length_output if length_output is not a null pointer. The string
284 : // should be nearly flat, otherwise the performance of this method may
285 : // be very slow (quadratic in the length). Setting robustness_flag to
286 : // ROBUST_STRING_TRAVERSAL invokes behaviour that is robust. This means it
287 : // handles unexpected data without causing assert failures and it does not
288 : // do any heap allocations. This is useful when printing stack traces.
289 : std::unique_ptr<char[]> ToCString(AllowNullsFlag allow_nulls,
290 : RobustnessFlag robustness_flag, int offset,
291 : int length, int* length_output = nullptr);
292 : V8_EXPORT_PRIVATE std::unique_ptr<char[]> ToCString(
293 : AllowNullsFlag allow_nulls = DISALLOW_NULLS,
294 : RobustnessFlag robustness_flag = FAST_STRING_TRAVERSAL,
295 : int* length_output = nullptr);
296 :
297 : bool ComputeArrayIndex(uint32_t* index);
298 :
299 : // Externalization.
300 : V8_EXPORT_PRIVATE bool MakeExternal(
301 : v8::String::ExternalStringResource* resource);
302 : V8_EXPORT_PRIVATE bool MakeExternal(
303 : v8::String::ExternalOneByteStringResource* resource);
304 : bool SupportsExternalization();
305 :
306 : // Conversion.
307 : inline bool AsArrayIndex(uint32_t* index);
308 : uint32_t inline ToValidIndex(Object number);
309 :
310 : // Trimming.
311 : enum TrimMode { kTrim, kTrimStart, kTrimEnd };
312 : static Handle<String> Trim(Isolate* isolate, Handle<String> string,
313 : TrimMode mode);
314 :
315 : DECL_CAST(String)
316 :
317 : V8_EXPORT_PRIVATE void PrintOn(FILE* out);
318 :
319 : // For use during stack traces. Performs rudimentary sanity check.
320 : bool LooksValid();
321 :
322 : // Dispatched behavior.
323 : void StringShortPrint(StringStream* accumulator, bool show_details = true);
324 : void PrintUC16(std::ostream& os, int start = 0, int end = -1); // NOLINT
325 : #if defined(DEBUG) || defined(OBJECT_PRINT)
326 : char* ToAsciiArray();
327 : #endif
328 : DECL_PRINTER(String)
329 : DECL_VERIFIER(String)
330 :
331 : inline bool IsFlat();
332 :
333 : DEFINE_FIELD_OFFSET_CONSTANTS(Name::kHeaderSize,
334 : TORQUE_GENERATED_STRING_FIELDS)
335 :
336 : static const int kHeaderSize = kSize;
337 :
338 : // Max char codes.
339 : static const int32_t kMaxOneByteCharCode = unibrow::Latin1::kMaxChar;
340 : static const uint32_t kMaxOneByteCharCodeU = unibrow::Latin1::kMaxChar;
341 : static const int kMaxUtf16CodeUnit = 0xffff;
342 : static const uint32_t kMaxUtf16CodeUnitU = kMaxUtf16CodeUnit;
343 : static const uc32 kMaxCodePoint = 0x10ffff;
344 :
345 : // Maximal string length.
346 : // The max length is different on 32 and 64 bit platforms. Max length for a
347 : // 32-bit platform is ~268.4M chars. On 64-bit platforms, max length is
348 : // ~1.073B chars. The limit on 64-bit is so that SeqTwoByteString::kMaxSize
349 : // can fit in a 32bit int: 2^31 - 1 is the max positive int, minus one bit as
350 : // each char needs two bytes, subtract 24 bytes for the string header size.
351 :
352 : // See include/v8.h for the definition.
353 : static const int kMaxLength = v8::String::kMaxLength;
354 : static_assert(kMaxLength <= (Smi::kMaxValue / 2 - kHeaderSize),
355 : "Unexpected max String length");
356 :
357 : // Max length for computing hash. For strings longer than this limit the
358 : // string length is used as the hash value.
359 : static const int kMaxHashCalcLength = 16383;
360 :
361 : // Limit for truncation in short printing.
362 : static const int kMaxShortPrintLength = 1024;
363 :
364 : // Helper function for flattening strings.
365 : template <typename sinkchar>
366 : EXPORT_TEMPLATE_DECLARE(V8_EXPORT_PRIVATE)
367 : static void WriteToFlat(String source, sinkchar* sink, int from, int to);
368 :
369 : // Returns the offset of the first byte above unibrow::Utf8::kMaxOneByteChar.
370 : // If the hit is found by the word-wise loop, the offset is that of the word
371 : // containing the byte rather than of the byte itself. A return value that is
372 : // >= the passed length means no such byte was found.
373 10403179 : static inline int NonAsciiStart(const char* chars, int length) {
374 : const char* start = chars;
375 10403179 : const char* limit = chars + length;
376 :
377 10403179 : if (length >= kIntptrSize) {
378 : // Check leading bytes one at a time until chars is word-aligned.
379 10601970 : while (!IsAligned(reinterpret_cast<intptr_t>(chars), sizeof(uintptr_t))) {
380 4452491 : if (static_cast<uint8_t>(*chars) > unibrow::Utf8::kMaxOneByteChar) {
381 280 : return static_cast<int>(chars - start);
382 : }
383 4452211 : ++chars;
384 : }
385 : // Check aligned words, one uintptr_t at a time.
386 : DCHECK_EQ(unibrow::Utf8::kMaxOneByteChar, 0x7F);
387 : const uintptr_t non_one_byte_mask = kUintptrAllBitsSet / 0xFF * 0x80;  // 0x80 in every byte, assuming kUintptrAllBitsSet is all ones - see its definition.
388 888537138 : while (chars + sizeof(uintptr_t) <= limit) {
389 886841479 : if (*reinterpret_cast<const uintptr_t*>(chars) & non_one_byte_mask) {
390 1609 : return static_cast<int>(chars - start);  // Offset of the word, not of the exact byte.
391 : }
392 : chars += sizeof(uintptr_t);
393 : }
394 : }
395 : // Check the remaining bytes (all of them when length < kIntptrSize).
396 83541162 : while (chars < limit) {
397 36573492 : if (static_cast<uint8_t>(*chars) > unibrow::Utf8::kMaxOneByteChar) {
398 3556 : return static_cast<int>(chars - start);
399 : }
400 36569936 : ++chars;
401 : }
402 :
403 10397734 : return static_cast<int>(chars - start);  // Nothing found: chars has reached limit.
404 : }
405 :
406 : static inline bool IsAscii(const char* chars, int length) {  // True when NonAsciiStart() finds no offending byte in the first length bytes.
407 120571 : return NonAsciiStart(chars, length) >= length;
408 : }
409 :
410 : static inline bool IsAscii(const uint8_t* chars, int length) {  // uint8_t overload; forwards to NonAsciiStart() via a pointer cast.
411 0 : return NonAsciiStart(reinterpret_cast<const char*>(chars), length) >=
412 : length;
413 : }
414 :
415 : static inline int NonOneByteStart(const uc16* chars, int length) {  // Index of the first element above kMaxOneByteCharCodeU, or length if there is none.
416 1867743 : const uc16* limit = chars + length;
417 : const uc16* start = chars;
418 1527882154 : while (chars < limit) {  // Plain element-by-element scan; the index returned is exact.
419 1526629287 : if (*chars > kMaxOneByteCharCodeU) return static_cast<int>(chars - start);
420 1526014411 : ++chars;
421 : }
422 1252867 : return static_cast<int>(chars - start);  // Reached limit, so this equals length for length >= 0.
423 : }
424 :
425 : static inline bool IsOneByte(const uc16* chars, int length) {  // True when no element in the first length elements exceeds kMaxOneByteCharCodeU.
426 : return NonOneByteStart(chars, length) >= length;
427 : }
428 :
429 : template <class Visitor>
430 : static inline ConsString VisitFlat(Visitor* visitor, String string,
431 : int offset = 0);
432 :
433 : static Handle<FixedArray> CalculateLineEnds(Isolate* isolate,
434 : Handle<String> string,
435 : bool include_ending_line);
436 :
437 : private:
438 : friend class Name;
439 : friend class StringTableInsertionKey;
440 : friend class InternalizedStringKey;
441 :
442 : V8_EXPORT_PRIVATE static Handle<String> SlowFlatten(
443 : Isolate* isolate, Handle<ConsString> cons, AllocationType allocation);
444 :
445 : // Slow case of String::Equals. This implementation works on any strings
446 : // but it is most efficient on strings that are almost flat.
447 : V8_EXPORT_PRIVATE bool SlowEquals(String other);
448 :
449 : V8_EXPORT_PRIVATE static bool SlowEquals(Isolate* isolate, Handle<String> one,
450 : Handle<String> two);
451 :
452 : // Slow case of AsArrayIndex.
453 : V8_EXPORT_PRIVATE bool SlowAsArrayIndex(uint32_t* index);
454 :
455 : // Compute and set the hash code.
456 : V8_EXPORT_PRIVATE uint32_t ComputeAndSetHash();
457 :
458 0 : OBJECT_CONSTRUCTORS(String, Name);
459 : };
460 :
461 : // clang-format off
462 : extern template EXPORT_TEMPLATE_DECLARE(V8_EXPORT_PRIVATE)
463 : void String::WriteToFlat(String source, uint16_t* sink, int from, int to);
464 : // clang-format on
465 :
466 : class SubStringRange {  // Exposes begin()/end() over a String, starting at first.
467 : public:
468 : inline SubStringRange(String string, const DisallowHeapAllocation& no_gc,
469 : int first = 0, int length = -1);  // NOTE(review): length == -1 presumably means "to the end of the string" - confirm in the inline definition.
470 : class iterator;  // Only forward-declared here.
471 : inline iterator begin();
472 : inline iterator end();
473 :
474 : private:
475 : String string_;
476 : int first_;
477 : int length_;
478 : const DisallowHeapAllocation& no_gc_;  // Reference member: the DisallowHeapAllocation object must outlive this range.
479 : };
480 :
481 : // The SeqString abstract class captures sequential string values.
482 : class SeqString : public String {  // Base class of SeqOneByteString and SeqTwoByteString.
483 : public:
484 : DECL_CAST(SeqString)
485 :
486 : // Truncate the string in-place if possible and return the result.
487 : // In case of new_length == 0, the empty string is returned without
488 : // truncating the original string.
489 : V8_WARN_UNUSED_RESULT static Handle<String> Truncate(Handle<SeqString> string,
490 : int new_length);  // Returns Handle<String>, not Handle<SeqString>; the result is marked as must-use.
491 :
492 : OBJECT_CONSTRUCTORS(SeqString, String);
493 : };
494 :
495 : class InternalizedString : public String {  // Declares only a cast and constructors at this point; see the TODO below.
496 : public:
497 : DECL_CAST(InternalizedString)
498 : // TODO(neis): Possibly move some stuff from String here.
499 :
500 : OBJECT_CONSTRUCTORS(InternalizedString, String);
501 : };
502 :
503 : // The SeqOneByteString class captures sequential one-byte string objects.
504 : // Each character in a SeqOneByteString is a one-byte character.
505 : class SeqOneByteString : public SeqString {
506 : public:
507 : static const bool kHasOneByteEncoding = true;
508 :
509 : // Dispatched behavior.
510 : inline uint16_t SeqOneByteStringGet(int index);
511 : inline void SeqOneByteStringSet(int index, uint16_t value);
512 :
513 : // Get the address of the characters in this string.
514 : inline Address GetCharsAddress();
515 :
516 : inline uint8_t* GetChars(const DisallowHeapAllocation& no_gc);  // Requires a DisallowHeapAllocation witness from the caller.
517 :
518 : // Clear uninitialized padding space. This ensures that the snapshot content
519 : // is deterministic.
520 : void clear_padding();
521 :
522 : DECL_CAST(SeqOneByteString)
523 :
524 : // Garbage collection support. This method is called by the
525 : // garbage collector to compute the actual size of a SeqOneByteString
526 : // instance.
527 : inline int SeqOneByteStringSize(InstanceType instance_type);
528 :
529 : // Computes the size for a SeqOneByteString instance of a given length.
530 : static int SizeFor(int length) {
531 354532294 : return OBJECT_POINTER_ALIGN(kHeaderSize + length * kCharSize);  // Header plus length * kCharSize, passed through OBJECT_POINTER_ALIGN.
532 : }
533 :
534 : // Maximal memory usage for a single sequential one-byte string.
535 : static const int kMaxCharsSize = kMaxLength;
536 : static const int kMaxSize = OBJECT_POINTER_ALIGN(kMaxCharsSize + kHeaderSize);
537 : STATIC_ASSERT((kMaxSize - kHeaderSize) >= String::kMaxLength);  // kMaxSize must leave room for String::kMaxLength characters.
538 :
539 : class BodyDescriptor;  // Only forward-declared here.
540 :
541 0 : OBJECT_CONSTRUCTORS(SeqOneByteString, SeqString);
542 : };
543 :
544 : // The SeqTwoByteString class captures sequential two-byte string objects.
545 : // Each character in a SeqTwoByteString is a two-byte uint16_t.
546 : class SeqTwoByteString : public SeqString {
547 : public:
548 : static const bool kHasOneByteEncoding = false;
549 :
550 : // Dispatched behavior.
551 : inline uint16_t SeqTwoByteStringGet(int index);
552 : inline void SeqTwoByteStringSet(int index, uint16_t value);
553 :
554 : // Get the address of the characters in this string.
555 : inline Address GetCharsAddress();
556 :
557 : inline uc16* GetChars(const DisallowHeapAllocation& no_gc);  // Requires a DisallowHeapAllocation witness from the caller.
558 :
559 : // Clear uninitialized padding space. This ensures that the snapshot content
560 : // is deterministic.
561 : void clear_padding();
562 :
563 : DECL_CAST(SeqTwoByteString)
564 :
565 : // Garbage collection support. This method is called by the
566 : // garbage collector to compute the actual size of a SeqTwoByteString
567 : // instance.
568 : inline int SeqTwoByteStringSize(InstanceType instance_type);
569 :
570 : // Computes the size for a SeqTwoByteString instance of a given length.
571 : static int SizeFor(int length) {
572 199003715 : return OBJECT_POINTER_ALIGN(kHeaderSize + length * kShortSize);  // Header plus length * kShortSize, passed through OBJECT_POINTER_ALIGN.
573 : }
574 :
575 : // Maximal memory usage for a single sequential two-byte string.
576 : static const int kMaxCharsSize = kMaxLength * 2;  // Two bytes per character.
577 : static const int kMaxSize = OBJECT_POINTER_ALIGN(kMaxCharsSize + kHeaderSize);
578 : STATIC_ASSERT(static_cast<int>((kMaxSize - kHeaderSize) / sizeof(uint16_t)) >=
579 : String::kMaxLength);
580 :
581 : class BodyDescriptor;  // Only forward-declared here.
582 :
583 0 : OBJECT_CONSTRUCTORS(SeqTwoByteString, SeqString);
584 : };
585 :
586 : // The ConsString class describes string values built by using the
587 : // addition operator on strings. A ConsString is a pair where the
588 : // first and second components are pointers to other string values.
589 : // One or both components of a ConsString can be pointers to other
590 : // ConsStrings, creating a binary tree of ConsStrings where the leaves
591 : // are non-ConsString string values. The string value represented by
592 : // a ConsString can be obtained by concatenating the leaf string
593 : // values in a left-to-right depth-first traversal of the tree.
594 : class ConsString : public String {
595 : public:
596 : // First string of the cons cell.
597 : inline String first();
598 : // Doesn't check that the result is a string, even in debug mode. This is
599 : // useful during GC where the mark bits confuse the checks.
600 : inline Object unchecked_first();  // Returns Object rather than String.
601 : inline void set_first(Isolate* isolate, String first,
602 : WriteBarrierMode mode = UPDATE_WRITE_BARRIER);
603 :
604 : // Second string of the cons cell.
605 : inline String second();
606 : // Doesn't check that the result is a string, even in debug mode. This is
607 : // useful during GC where the mark bits confuse the checks.
608 : inline Object unchecked_second();  // Returns Object rather than String.
609 : inline void set_second(Isolate* isolate, String second,
610 : WriteBarrierMode mode = UPDATE_WRITE_BARRIER);
611 :
612 : // Dispatched behavior.
613 : V8_EXPORT_PRIVATE uint16_t ConsStringGet(int index);
614 :
615 : DECL_CAST(ConsString)
616 :
617 : DEFINE_FIELD_OFFSET_CONSTANTS(String::kHeaderSize,
618 : TORQUE_GENERATED_CONS_STRING_FIELDS)  // NOTE(review): kFirstOffset and kSize used below presumably come from this macro - confirm.
619 :
620 : // Minimum length for a cons string.
621 : static const int kMinLength = 13;
622 :
623 : using BodyDescriptor = FixedBodyDescriptor<kFirstOffset, kSize, kSize>;
624 :
625 : DECL_VERIFIER(ConsString)
626 :
627 0 : OBJECT_CONSTRUCTORS(ConsString, String);
628 : };
629 :
630 : // The ThinString class describes string objects that are just references
631 : // to another string object. They are used for in-place internalization when
632 : // the original string cannot actually be internalized in-place: in these
633 : // cases, the original string is converted to a ThinString pointing at its
634 : // internalized version (which is allocated as a new object).
635 : // In terms of memory layout and most algorithms operating on strings,
636 : // ThinStrings can be thought of as "one-part cons strings".
637 : class ThinString : public String {
638 : public:
639 : // Actual string that this ThinString refers to.
640 : inline String actual() const;
641 : inline HeapObject unchecked_actual() const;  // Typed as HeapObject; presumably skips the String check - confirm in the inline definition.
642 : inline void set_actual(String s,
643 : WriteBarrierMode mode = UPDATE_WRITE_BARRIER);
644 :
645 : V8_EXPORT_PRIVATE uint16_t ThinStringGet(int index);
646 :
647 : DECL_CAST(ThinString)
648 : DECL_VERIFIER(ThinString)
649 :
650 : DEFINE_FIELD_OFFSET_CONSTANTS(String::kHeaderSize,
651 : TORQUE_GENERATED_THIN_STRING_FIELDS)  // NOTE(review): kActualOffset and kSize used below presumably come from this macro - confirm.
652 :
653 : using BodyDescriptor = FixedBodyDescriptor<kActualOffset, kSize, kSize>;
654 :
655 0 : OBJECT_CONSTRUCTORS(ThinString, String);
656 : };
657 :
658 : // The Sliced String class describes strings that are substrings of another
659 : // sequential string. The motivation is to save time and memory when creating
660 : // a substring. A Sliced String is described as a pointer to the parent,
661 : // the offset from the start of the parent string and the length. Using
662 : // a Sliced String therefore requires unpacking of the parent string and
663 : // adding the offset to the start address. Substrings of a Sliced String
664 : // are not nested since the double indirection is simplified when creating
665 : // such a substring.
666 : // Currently missing features are:
667 : // - handling externalized parent strings
668 : // - external strings as parent
669 : // - truncating sliced string to enable otherwise unneeded parent to be GC'ed.
670 : class SlicedString : public String {
671 : public:
672 : inline String parent();  // The string this slice refers into.
673 : inline void set_parent(Isolate* isolate, String parent,
674 : WriteBarrierMode mode = UPDATE_WRITE_BARRIER);
675 : inline int offset() const;  // Offset from the start of the parent string.
676 : inline void set_offset(int offset);
677 :
678 : // Dispatched behavior.
679 : V8_EXPORT_PRIVATE uint16_t SlicedStringGet(int index);
680 :
681 : DECL_CAST(SlicedString)
682 :
683 : DEFINE_FIELD_OFFSET_CONSTANTS(String::kHeaderSize,
684 : TORQUE_GENERATED_SLICED_STRING_FIELDS)  // NOTE(review): kParentOffset and kSize used below presumably come from this macro - confirm.
685 :
686 : // Minimum length for a sliced string.
687 : static const int kMinLength = 13;
688 :
689 : using BodyDescriptor = FixedBodyDescriptor<kParentOffset, kSize, kSize>;
690 :
691 : DECL_VERIFIER(SlicedString)
692 :
693 0 : OBJECT_CONSTRUCTORS(SlicedString, String);
694 : };
695 :
696 : // The ExternalString class describes string values that are backed by
697 : // a string resource that lies outside the V8 heap. ExternalStrings
698 : // consist of the length field common to all strings and a pointer to the
699 : // external resource. It is important to ensure (externally) that the
700 : // resource is not deallocated while the ExternalString is live in the
701 : // V8 heap.
702 : //
703 : // The API expects that all ExternalStrings are created through the
704 : // API. Therefore, ExternalStrings should not be used internally.
705 : class ExternalString : public String {
706 : public:
707 : DECL_CAST(ExternalString)
708 :
709 : DEFINE_FIELD_OFFSET_CONSTANTS(String::kHeaderSize,
710 : TORQUE_GENERATED_EXTERNAL_STRING_FIELDS)  // NOTE(review): kResourceOffset used below presumably comes from this macro - confirm.
711 :
712 : // Size of uncached external strings.
713 : static const int kUncachedSize =
714 : kResourceOffset + FIELD_SIZE(kResourceOffset);  // Ends right after the resource field, assuming FIELD_SIZE yields that field's size.
715 :
716 : // Return whether the external string data pointer is not cached.
717 : inline bool is_uncached() const;
718 : // Size in bytes of the external payload.
719 : int ExternalPayloadSize() const;
720 :
721 : // Used in the serializer/deserializer.
722 : inline Address resource_as_address();
723 : inline void set_address_as_resource(Address address);
724 : inline uint32_t resource_as_uint32();
725 : inline void set_uint32_as_resource(uint32_t value);
726 :
727 : // Disposes string's resource object if it has not already been disposed.
728 : inline void DisposeResource();
729 :
730 : STATIC_ASSERT(kResourceOffset == Internals::kStringResourceOffset);  // Internal layout must match the offset declared in Internals.
731 :
732 : OBJECT_CONSTRUCTORS(ExternalString, String);
733 : };
734 :
735 : // The ExternalOneByteString class is an external string backed by a
736 : // one-byte string.
737 : class ExternalOneByteString : public ExternalString {
738 : public:
739 : static const bool kHasOneByteEncoding = true;
740 :
741 : using Resource = v8::String::ExternalOneByteStringResource;
742 :
743 : // The underlying resource.
744 : inline const Resource* resource();
745 :
746 : // It is assumed that the previous resource is null. If it is not null, then
747 : // it is the responsibility of the caller to handle the previous resource.
748 : inline void SetResource(Isolate* isolate, const Resource* buffer);
749 : // Used only during serialization.
750 : inline void set_resource(const Resource* buffer);
751 :
752 : // Update the pointer cache to the external character array.
753 : // The cached pointer is always valid, as the external character array does
754 : // not move during lifetime. Deserialization is the only exception, after
755 : // which the pointer cache has to be refreshed.
756 : inline void update_data_cache();
757 :
758 : inline const uint8_t* GetChars();  // Read-only pointer to the one-byte characters.
759 :
760 : // Dispatched behavior.
761 : inline uint16_t ExternalOneByteStringGet(int index);
762 :
763 : DECL_CAST(ExternalOneByteString)
764 :
765 : class BodyDescriptor;  // Only forward-declared here.
766 :
767 0 : OBJECT_CONSTRUCTORS(ExternalOneByteString, ExternalString);
768 : };
769 :
770 : // The ExternalTwoByteString class is an external string backed by a UTF-16
771 : // encoded string.
772 : class ExternalTwoByteString : public ExternalString {
773 : public:
774 : static const bool kHasOneByteEncoding = false;
775 :
776 : using Resource = v8::String::ExternalStringResource;
777 :
778 : // The underlying string resource.
779 : inline const Resource* resource();
780 :
781 : // It is assumed that the previous resource is null. If it is not null, then
782 : // it is the responsibility of the caller to handle the previous resource.
783 : inline void SetResource(Isolate* isolate, const Resource* buffer);
784 : // Used only during serialization.
785 : inline void set_resource(const Resource* buffer);
786 :
787 : // Update the pointer cache to the external character array.
788 : // The cached pointer is always valid, as the external character array does
789 : // not move during lifetime. Deserialization is the only exception, after
790 : // which the pointer cache has to be refreshed.
791 : inline void update_data_cache();
792 :
793 : inline const uint16_t* GetChars();  // Read-only pointer to the two-byte characters.
794 :
795 : // Dispatched behavior.
796 : inline uint16_t ExternalTwoByteStringGet(int index);
797 :
798 : // For regexp code.
799 : inline const uint16_t* ExternalTwoByteStringGetData(unsigned start);
800 :
801 : DECL_CAST(ExternalTwoByteString)
802 :
803 : class BodyDescriptor;  // Only forward-declared here.
804 :
805 0 : OBJECT_CONSTRUCTORS(ExternalTwoByteString, ExternalString);
806 : };
807 :
808 : // A flat string reader provides random access to the contents of a
809 : // string independent of the character width of the string. The handle
810 : // must be valid as long as the reader is being used.
811 2990744 : class V8_EXPORT_PRIVATE FlatStringReader : public Relocatable {
812 : public:
813 : FlatStringReader(Isolate* isolate, Handle<String> str);  // Reads from a heap string held through a handle.
814 : FlatStringReader(Isolate* isolate, Vector<const char> input);  // Reads from a char vector rather than a string handle.
815 : void PostGarbageCollection() override;  // Relocatable hook. NOTE(review): presumably refreshes start_ after a GC - confirm in the .cc file.
816 : inline uc32 Get(int index);  // Width-independent accessor; presumably the character at `index`.
817 : template <typename Char>
818 : inline Char Get(int index);  // Same accessor, typed by the caller-chosen Char - TODO confirm width requirements.
819 : int length() { return length_; }  // Returns the stored length_.
820 :
821 : private:
822 : Address* str_;  // NOTE(review): presumably the handle location of the string; unclear for the Vector constructor - confirm.
823 : bool is_one_byte_;  // Character-width flag for the data behind start_.
824 : int length_;  // Value reported by length().
825 : const void* start_;  // Untyped data pointer; presumably interpreted according to is_one_byte_.
826 : };
827 :
828 : // This maintains an off-stack representation of the stack frames required
829 : // to traverse a ConsString, allowing an entirely iterative and restartable
830 : // traversal of the entire string.
831 : class ConsStringIterator {
832 : public:
833 6459412 : inline ConsStringIterator() = default;  // NOTE(review): no member initializers are visible for depth_/maximum_depth_/consumed_; callers appear expected to Reset() before Next() - confirm.
834 67010 : inline explicit ConsStringIterator(ConsString cons_string, int offset = 0) {
835 : Reset(cons_string, offset);
836 67010 : }
837 : inline void Reset(ConsString cons_string, int offset = 0) {
838 11830687 : depth_ = 0;
839 : // With depth_ == 0, Next() will always return a default-constructed String().
840 11795045 : if (cons_string.is_null()) return;
841 108701 : Initialize(cons_string, offset);
842 : }
843 : // Returns a default-constructed String() when complete; *offset_out is zeroed on every call.
844 : inline String Next(int* offset_out) {
845 222495059 : *offset_out = 0;
846 222495059 : if (depth_ == 0) return String();
847 211935450 : return Continue(offset_out);
848 : }
849 :
850 : private:
851 : static const int kStackSize = 32;  // Capacity of frames_.
852 : // Use a mask instead of doing modulo operations for stack wrapping.
853 : static const int kDepthMask = kStackSize - 1;
854 : static_assert(base::bits::IsPowerOfTwo(kStackSize),
855 : "kStackSize must be power of two");
856 : static inline int OffsetForDepth(int depth);  // Presumably maps a depth to a frames_ index via kDepthMask - confirm.
857 :
858 : inline void PushLeft(ConsString string);
859 : inline void PushRight(ConsString string);
860 : inline void AdjustMaximumDepth();
861 : inline void Pop();
862 424105312 : inline bool StackBlown() { return maximum_depth_ - depth_ == kStackSize; }  // True once maximum_depth_ is exactly kStackSize above depth_.
863 : V8_EXPORT_PRIVATE void Initialize(ConsString cons_string, int offset);  // Called by Reset() for a non-null cons string.
864 : V8_EXPORT_PRIVATE String Continue(int* offset_out);  // Called by Next() whenever depth_ != 0.
865 : String NextLeaf(bool* blew_stack);
866 : String Search(int* offset_out);
867 :
868 : // Stack must always contain only frames for which right traversal
869 : // has not yet been performed.
870 : ConsString frames_[kStackSize];
871 : ConsString root_;
872 : int depth_;  // Zero means Next() reports completion.
873 : int maximum_depth_;
874 : int consumed_;
875 : DISALLOW_COPY_AND_ASSIGN(ConsStringIterator);
876 : };
877 :
878 : class StringCharacterStream {  // Forward-only reader over a String via GetNext()/HasMore(), starting at an optional offset.
879 : public:
880 : inline explicit StringCharacterStream(String string, int offset = 0);
881 : inline uint16_t GetNext();  // Presumably returns the next character and advances - confirm in the -inl header.
882 : inline bool HasMore();
883 : inline void Reset(String string, int offset = 0);  // Re-targets the stream at `string`, beginning at `offset`.
884 : inline void VisitOneByteString(const uint8_t* chars, int length);  // NOTE(review): looks like a visitor callback for one-byte data - confirm caller.
885 : inline void VisitTwoByteString(const uint16_t* chars, int length);  // NOTE(review): looks like a visitor callback for two-byte data - confirm caller.
886 :
887 : private:
888 : ConsStringIterator iter_;
889 : bool is_one_byte_;  // Presumably selects which union member below is active.
890 : union {
891 : const uint8_t* buffer8_;
892 : const uint16_t* buffer16_;
893 : };
894 : const uint8_t* end_;  // Stored as a byte pointer for both widths; presumably the end of the current buffer.
895 : DISALLOW_COPY_AND_ASSIGN(StringCharacterStream);
896 : };
897 :
898 : } // namespace internal
899 : } // namespace v8
900 :
901 : #include "src/objects/object-macros-undef.h"
902 :
903 : #endif // V8_OBJECTS_STRING_H_
|