Coverage Report

Created: 2025-06-24 06:43

/src/icu/source/common/unisetspan.h
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
******************************************************************************
5
*
6
*   Copyright (C) 2007, International Business Machines
7
*   Corporation and others.  All Rights Reserved.
8
*
9
******************************************************************************
10
*   file name:  unisetspan.h
11
*   encoding:   UTF-8
12
*   tab size:   8 (not used)
13
*   indentation:4
14
*
15
*   created on: 2007mar01
16
*   created by: Markus W. Scherer
17
*/
18
19
#ifndef __UNISETSPAN_H__
20
#define __UNISETSPAN_H__
21
22
#include "unicode/utypes.h"
23
#include "unicode/uniset.h"
24
25
U_NAMESPACE_BEGIN
26
27
/*
28
 * Implement span() etc. for a set with strings.
29
 * Avoid recursion because of its exponential complexity.
30
 * Instead, try multiple paths at once and track them with an IndexList.
31
 */
32
// Declares the span helper for a set with strings. It derives from UMemory,
// exposes the four span entry points plus three inline queries, and keeps
// all of its data members private.
class UnicodeSetStringSpan : public UMemory {
33
public:
34
    /*
35
     * Which span() variant will be used?
36
     * The object is either built for one variant and used once,
37
     * or built for all and may be used many times.
     *
     * The six basic values are distinct single bits. Each named variant
     * below ORs together one direction bit (FWD/BACK), one encoding bit
     * (UTF16/UTF8) and one containment bit (CONTAINED/NOT_CONTAINED).
     * ALL is the OR of all six bits.
38
     */
39
    enum {
40
        FWD             = 0x20,  // direction bit
41
        BACK            = 0x10,  // direction bit
42
        UTF16           = 8,     // encoding bit
43
        UTF8            = 4,     // encoding bit
44
        CONTAINED       = 2,     // containment bit
45
        NOT_CONTAINED   = 1,     // containment bit
46
47
        ALL             = 0x3f,  // == FWD|BACK|UTF16|UTF8|CONTAINED|NOT_CONTAINED
48
49
        FWD_UTF16_CONTAINED     = FWD  | UTF16 |     CONTAINED,
50
        FWD_UTF16_NOT_CONTAINED = FWD  | UTF16 | NOT_CONTAINED,
51
        FWD_UTF8_CONTAINED      = FWD  | UTF8  |     CONTAINED,
52
        FWD_UTF8_NOT_CONTAINED  = FWD  | UTF8  | NOT_CONTAINED,
53
        BACK_UTF16_CONTAINED    = BACK | UTF16 |     CONTAINED,
54
        BACK_UTF16_NOT_CONTAINED= BACK | UTF16 | NOT_CONTAINED,
55
        BACK_UTF8_CONTAINED     = BACK | UTF8  |     CONTAINED,
56
        BACK_UTF8_NOT_CONTAINED = BACK | UTF8  | NOT_CONTAINED
57
    };
58
59
    // Constructs the helper from a set and a vector of strings.
    // NOTE(review): `which` is presumably one of the variant values above
    // (or ALL); the implementation is not visible here -- confirm.
    UnicodeSetStringSpan(const UnicodeSet &set, const UVector &setStrings, uint32_t which);
60
61
    // Copy constructor. Assumes which==ALL for a frozen set.
    // NOTE(review): newParentSetStrings is presumably the strings vector of
    // the set that will use the copy -- confirm against the caller.
62
    UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan, const UVector &newParentSetStrings);
63
64
    ~UnicodeSetStringSpan();
65
66
    /*
67
     * Do the strings need to be checked in span() etc.?
68
     * @return true if strings need to be checked (call span() here),
69
     *         false if not (use a BMPSet for best performance).
     *
     * NOTE(review): unlike contains(), these two are not declared const.
70
     */
71
    inline UBool needsStringSpanUTF16();
72
    inline UBool needsStringSpanUTF8();
73
74
    // For fast UnicodeSet::contains(c).
75
    inline UBool contains(UChar32 c) const;
76
77
    // Span entry points: UTF-16 text (UChar) and UTF-8 text (uint8_t),
    // each in a forward and a "Back" flavor. All take a pointer, a length
    // and a USetSpanCondition, and are const.
    // NOTE(review): the meaning of the int32_t result presumably mirrors
    // UnicodeSet::span()/spanBack() -- confirm in the implementation.
    int32_t span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
78
79
    int32_t spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
80
81
    int32_t spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;
82
83
    int32_t spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;
84
85
private:
86
    // Special spanLength byte values.
87
    enum {
88
        // The spanLength is >=0xfe.
89
        LONG_SPAN=0xfe,
90
        // All code points in the string are contained in the parent set.
91
        ALL_CP_CONTAINED=0xff
92
    };
93
94
    // Add a starting or ending string character to the spanNotSet
95
    // so that a character span ends before any string.
96
    void addToSpanNotSet(UChar32 c);
97
98
    // Private counterparts of the four public span functions that take no
    // span condition.
    // NOTE(review): presumably these handle the "not contained" case using
    // pSpanNotSet -- confirm in the implementation.
    int32_t spanNot(const UChar *s, int32_t length) const;
99
    int32_t spanNotBack(const UChar *s, int32_t length) const;
100
    int32_t spanNotUTF8(const uint8_t *s, int32_t length) const;
101
    int32_t spanNotBackUTF8(const uint8_t *s, int32_t length) const;
102
103
    // Set for span(). Same as parent but without strings.
104
    UnicodeSet spanSet;
105
106
    // Set for span(not contained).
107
    // Same as spanSet, plus characters that start or end strings.
    // NOTE(review): held through a raw pointer; ownership is not visible
    // in this header -- confirm in the constructor/destructor.
108
    UnicodeSet *pSpanNotSet;
109
110
    // The strings of the parent set.
    // Stored as a reference, so the vector itself is not copied here.
111
    const UVector &strings;
112
113
    // Pointer to the UTF-8 string lengths.
114
    // Also pointer to further allocated storage for meta data and
115
    // UTF-8 string contents as necessary.
116
    int32_t *utf8Lengths;
117
118
    // Pointer to the part of the (utf8Lengths) memory block that stores
119
    // the lengths of span(), spanBack() etc. for each string.
120
    uint8_t *spanLengths;
121
122
    // Pointer to the part of the (utf8Lengths) memory block that stores
123
    // the UTF-8 versions of the parent set's strings.
124
    uint8_t *utf8;
125
126
    // Number of bytes for all UTF-8 versions of strings together.
127
    int32_t utf8Length;
128
129
    // Maximum lengths of relevant strings.
    // The inline needsStringSpanUTF16()/needsStringSpanUTF8() queries
    // report whether the corresponding value is nonzero.
130
    int32_t maxLength16;
131
    int32_t maxLength8;
132
133
    // Set up for all variants of span()?
134
    UBool all;
135
136
    // Memory for small numbers and lengths of strings.
137
    // For example, for 8 strings:
138
    // 8 UTF-8 lengths, 8*4 bytes span lengths, 8*2 3-byte UTF-8 characters
139
    // = 112 bytes = int32_t[28].
    // The array itself has 32 int32_t elements.
140
    int32_t staticLengths[32];
141
};
142
143
0
// Tells the caller whether strings have to be checked for UTF-16 spans:
// the answer is true exactly when maxLength16 is nonzero.
UBool UnicodeSetStringSpan::needsStringSpanUTF16() {
144
0
    return static_cast<UBool>(0 != maxLength16);
145
0
}
146
147
0
// Tells the caller whether strings have to be checked for UTF-8 spans:
// the answer is true exactly when maxLength8 is nonzero.
UBool UnicodeSetStringSpan::needsStringSpanUTF8() {
148
0
    return static_cast<UBool>(0 != maxLength8);
149
0
}
150
151
0
// Single code point membership test. The result comes straight from
// spanSet.contains(c); the strings member is not consulted here.
UBool UnicodeSetStringSpan::contains(UChar32 c) const {
152
0
    return this->spanSet.contains(c);
153
0
}
154
155
U_NAMESPACE_END
156
157
#endif