/src/icu/icu4c/source/common/emojiprops.cpp
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | // © 2021 and later: Unicode, Inc. and others.  | 
2  |  | // License & terms of use: https://www.unicode.org/copyright.html  | 
3  |  |  | 
4  |  | // emojiprops.cpp  | 
5  |  | // created: 2021sep04 Markus W. Scherer  | 
6  |  |  | 
7  |  | #include "unicode/utypes.h"  | 
8  |  | #include "unicode/uchar.h"  | 
9  |  | #include "unicode/ucharstrie.h"  | 
10  |  | #include "unicode/ucptrie.h"  | 
11  |  | #include "unicode/udata.h"  | 
12  |  | #include "unicode/ustringtrie.h"  | 
13  |  | #include "unicode/utf16.h"  | 
14  |  | #include "emojiprops.h"  | 
15  |  | #include "ucln.h"  | 
16  |  | #include "ucln_cmn.h"  | 
17  |  | #include "umutex.h"  | 
18  |  | #include "uset_imp.h"  | 
19  |  |  | 
20  |  | U_NAMESPACE_BEGIN  | 
21  |  |  | 
22  |  | namespace { | 
23  |  |  | 
// File-local pointer to the shared EmojiProps instance.
// Assigned by initSingleton() and deleted/reset to nullptr by emojiprops_cleanup().
24  |  | EmojiProps *singleton = nullptr;  | 
// Init-once guard handed to umtx_initOnce() in EmojiProps::getSingleton();
// emojiprops_cleanup() resets it.
25  |  | icu::UInitOnce emojiInitOnce {}; | 
26  |  |  | 
27  | 0  | UBool U_CALLCONV emojiprops_cleanup() { | 
28  | 0  |     delete singleton;  | 
29  | 0  |     singleton = nullptr;  | 
30  | 0  |     emojiInitOnce.reset();  | 
31  | 0  |     return true;  | 
32  | 0  | }  | 
33  |  |  | 
34  | 0  | void U_CALLCONV initSingleton(UErrorCode &errorCode) { | 
35  | 0  |     if (U_FAILURE(errorCode)) { return; } | 
36  | 0  |     singleton = new EmojiProps(errorCode);  | 
37  | 0  |     if (singleton == nullptr) { | 
38  | 0  |         errorCode = U_MEMORY_ALLOCATION_ERROR;  | 
39  | 0  |     } else if (U_FAILURE(errorCode)) { | 
40  | 0  |         delete singleton;  | 
41  | 0  |         singleton = nullptr;  | 
42  | 0  |     }  | 
43  | 0  |     ucln_common_registerCleanup(UCLN_COMMON_EMOJIPROPS, emojiprops_cleanup);  | 
44  | 0  | }  | 
45  |  |  | 
46  |  | // TODO: turn this into a shared helper function  | 
47  |  | // Requires the major version to match, and then requires at least the minor version.  | 
48  |  | UBool udata_isAcceptableMajorMinor(  | 
49  | 0  |         const UDataInfo &info, const char16_t *dataFormat, uint8_t major, uint8_t minor) { | 
50  | 0  |     return  | 
51  | 0  |         info.size >= 20 &&  | 
52  | 0  |         info.isBigEndian == U_IS_BIG_ENDIAN &&  | 
53  | 0  |         info.charsetFamily == U_CHARSET_FAMILY &&  | 
54  | 0  |         info.dataFormat[0] == dataFormat[0] &&  | 
55  | 0  |         info.dataFormat[1] == dataFormat[1] &&  | 
56  | 0  |         info.dataFormat[2] == dataFormat[2] &&  | 
57  | 0  |         info.dataFormat[3] == dataFormat[3] &&  | 
58  | 0  |         info.formatVersion[0] == major &&  | 
59  | 0  |         info.formatVersion[1] >= minor;  | 
60  | 0  | }  | 
61  |  |  | 
62  |  | }  // namespace  | 
63  |  |  | 
64  | 0  | EmojiProps::~EmojiProps() { | 
65  | 0  |     udata_close(memory);  | 
66  | 0  |     ucptrie_close(cpTrie);  | 
67  | 0  | }  | 
68  |  |  | 
69  |  | const EmojiProps *  | 
70  | 0  | EmojiProps::getSingleton(UErrorCode &errorCode) { | 
71  | 0  |     if (U_FAILURE(errorCode)) { return nullptr; } | 
72  | 0  |     umtx_initOnce(emojiInitOnce, &initSingleton, errorCode);  | 
73  | 0  |     return singleton;  | 
74  | 0  | }  | 
75  |  |  | 
76  |  | UBool U_CALLCONV  | 
77  |  | EmojiProps::isAcceptable(void * /*context*/, const char * /*type*/, const char * /*name*/,  | 
78  | 0  |                          const UDataInfo *pInfo) { | 
79  | 0  |     return udata_isAcceptableMajorMinor(*pInfo, u"Emoj", 1, 0);  | 
80  | 0  | }  | 
81  |  |  | 
82  |  | void  | 
83  | 0  | EmojiProps::load(UErrorCode &errorCode) { | 
84  | 0  |     memory = udata_openChoice(nullptr, "icu", "uemoji", isAcceptable, this, &errorCode);  | 
85  | 0  |     if (U_FAILURE(errorCode)) { return; } | 
86  | 0  |     const uint8_t* inBytes = static_cast<const uint8_t*>(udata_getMemory(memory));  | 
87  | 0  |     const int32_t* inIndexes = reinterpret_cast<const int32_t*>(inBytes);  | 
88  | 0  |     int32_t indexesLength = inIndexes[IX_CPTRIE_OFFSET] / 4;  | 
89  | 0  |     if (indexesLength <= IX_RGI_EMOJI_ZWJ_SEQUENCE_TRIE_OFFSET) { | 
90  | 0  |         errorCode = U_INVALID_FORMAT_ERROR;  // Not enough indexes.  | 
91  | 0  |         return;  | 
92  | 0  |     }  | 
93  |  |  | 
94  | 0  |     int32_t i = IX_CPTRIE_OFFSET;  | 
95  | 0  |     int32_t offset = inIndexes[i++];  | 
96  | 0  |     int32_t nextOffset = inIndexes[i];  | 
97  | 0  |     cpTrie = ucptrie_openFromBinary(UCPTRIE_TYPE_FAST, UCPTRIE_VALUE_BITS_8,  | 
98  | 0  |                                     inBytes + offset, nextOffset - offset, nullptr, &errorCode);  | 
99  | 0  |     if (U_FAILURE(errorCode)) { | 
100  | 0  |         return;  | 
101  | 0  |     }  | 
102  |  |  | 
103  | 0  |     for (i = IX_BASIC_EMOJI_TRIE_OFFSET; i <= IX_RGI_EMOJI_ZWJ_SEQUENCE_TRIE_OFFSET; ++i) { | 
104  | 0  |         offset = inIndexes[i];  | 
105  | 0  |         nextOffset = inIndexes[i + 1];  | 
106  |  |         // Set/leave nullptr if there is no UCharsTrie.  | 
107  | 0  |         const char16_t* p = nextOffset > offset ? reinterpret_cast<const char16_t*>(inBytes + offset) : nullptr;  | 
108  | 0  |         stringTries[getStringTrieIndex(i)] = p;  | 
109  | 0  |     }  | 
110  | 0  | }  | 
111  |  |  | 
112  |  | void  | 
113  | 0  | EmojiProps::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const { | 
114  |  |     // Add the start code point of each same-value range of the trie.  | 
115  | 0  |     UChar32 start = 0, end;  | 
116  | 0  |     uint32_t value;  | 
117  | 0  |     while ((end = ucptrie_getRange(cpTrie, start, UCPMAP_RANGE_NORMAL, 0,  | 
118  | 0  |                                    nullptr, nullptr, &value)) >= 0) { | 
119  | 0  |         sa->add(sa->set, start);  | 
120  | 0  |         start = end + 1;  | 
121  | 0  |     }  | 
122  | 0  | }  | 
123  |  |  | 
124  |  | UBool  | 
125  | 0  | EmojiProps::hasBinaryProperty(UChar32 c, UProperty which) { | 
126  | 0  |     UErrorCode errorCode = U_ZERO_ERROR;  | 
127  | 0  |     const EmojiProps *ep = getSingleton(errorCode);  | 
128  | 0  |     return U_SUCCESS(errorCode) && ep->hasBinaryPropertyImpl(c, which);  | 
129  | 0  | }  | 
130  |  |  | 
131  |  | UBool  | 
132  | 0  | EmojiProps::hasBinaryPropertyImpl(UChar32 c, UProperty which) const { | 
133  | 0  |     if (which < UCHAR_EMOJI || UCHAR_RGI_EMOJI < which) { | 
134  | 0  |         return false;  | 
135  | 0  |     }  | 
136  |  |     // Note: UCHAR_REGIONAL_INDICATOR is a single, hardcoded range implemented elsewhere.  | 
137  | 0  |     static constexpr int8_t bitFlags[] = { | 
138  | 0  |         BIT_EMOJI,                  // UCHAR_EMOJI=57  | 
139  | 0  |         BIT_EMOJI_PRESENTATION,     // UCHAR_EMOJI_PRESENTATION=58  | 
140  | 0  |         BIT_EMOJI_MODIFIER,         // UCHAR_EMOJI_MODIFIER=59  | 
141  | 0  |         BIT_EMOJI_MODIFIER_BASE,    // UCHAR_EMOJI_MODIFIER_BASE=60  | 
142  | 0  |         BIT_EMOJI_COMPONENT,        // UCHAR_EMOJI_COMPONENT=61  | 
143  | 0  |         -1,                         // UCHAR_REGIONAL_INDICATOR=62  | 
144  | 0  |         -1,                         // UCHAR_PREPENDED_CONCATENATION_MARK=63  | 
145  | 0  |         BIT_EXTENDED_PICTOGRAPHIC,  // UCHAR_EXTENDED_PICTOGRAPHIC=64  | 
146  | 0  |         BIT_BASIC_EMOJI,            // UCHAR_BASIC_EMOJI=65  | 
147  | 0  |         -1,                         // UCHAR_EMOJI_KEYCAP_SEQUENCE=66  | 
148  | 0  |         -1,                         // UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE=67  | 
149  | 0  |         -1,                         // UCHAR_RGI_EMOJI_FLAG_SEQUENCE=68  | 
150  | 0  |         -1,                         // UCHAR_RGI_EMOJI_TAG_SEQUENCE=69  | 
151  | 0  |         -1,                         // UCHAR_RGI_EMOJI_ZWJ_SEQUENCE=70  | 
152  | 0  |         BIT_BASIC_EMOJI,            // UCHAR_RGI_EMOJI=71  | 
153  | 0  |     };  | 
154  | 0  |     int32_t bit = bitFlags[which - UCHAR_EMOJI];  | 
155  | 0  |     if (bit < 0) { | 
156  | 0  |         return false;  // not a property that we support in this function  | 
157  | 0  |     }  | 
158  | 0  |     uint8_t bits = UCPTRIE_FAST_GET(cpTrie, UCPTRIE_8, c);  | 
159  | 0  |     return (bits >> bit) & 1;  | 
160  | 0  | }  | 
161  |  |  | 
162  |  | UBool  | 
163  | 0  | EmojiProps::hasBinaryProperty(const char16_t *s, int32_t length, UProperty which) { | 
164  | 0  |     UErrorCode errorCode = U_ZERO_ERROR;  | 
165  | 0  |     const EmojiProps *ep = getSingleton(errorCode);  | 
166  | 0  |     return U_SUCCESS(errorCode) && ep->hasBinaryPropertyImpl(s, length, which);  | 
167  | 0  | }  | 
168  |  |  | 
169  |  | UBool  | 
170  | 0  | EmojiProps::hasBinaryPropertyImpl(const char16_t *s, int32_t length, UProperty which) const { | 
171  | 0  |     if (s == nullptr && length != 0) { return false; } | 
172  | 0  |     if (length <= 0 && (length == 0 || *s == 0)) { return false; }  // empty string | 
173  |  |     // The caller should have delegated single code points to hasBinaryProperty(c, which).  | 
174  | 0  |     if (which < UCHAR_BASIC_EMOJI || UCHAR_RGI_EMOJI < which) { | 
175  | 0  |         return false;  | 
176  | 0  |     }  | 
177  | 0  |     UProperty firstProp = which, lastProp = which;  | 
178  | 0  |     if (which == UCHAR_RGI_EMOJI) { | 
179  |  |         // RGI_Emoji is the union of the other emoji properties of strings.  | 
180  | 0  |         firstProp = UCHAR_BASIC_EMOJI;  | 
181  | 0  |         lastProp = UCHAR_RGI_EMOJI_ZWJ_SEQUENCE;  | 
182  | 0  |     }  | 
183  | 0  |     for (int32_t prop = firstProp; prop <= lastProp; ++prop) { | 
184  | 0  |         const char16_t *trieUChars = stringTries[prop - UCHAR_BASIC_EMOJI];  | 
185  | 0  |         if (trieUChars != nullptr) { | 
186  | 0  |             UCharsTrie trie(trieUChars);  | 
187  | 0  |             UStringTrieResult result = trie.next(s, length);  | 
188  | 0  |             if (USTRINGTRIE_HAS_VALUE(result)) { | 
189  | 0  |                 return true;  | 
190  | 0  |             }  | 
191  | 0  |         }  | 
192  | 0  |     }  | 
193  | 0  |     return false;  | 
194  | 0  | }  | 
195  |  |  | 
196  |  | void  | 
197  | 0  | EmojiProps::addStrings(const USetAdder *sa, UProperty which, UErrorCode &errorCode) const { | 
198  | 0  |     if (U_FAILURE(errorCode)) { return; } | 
199  | 0  |     if (which < UCHAR_BASIC_EMOJI || UCHAR_RGI_EMOJI < which) { | 
200  | 0  |         return;  | 
201  | 0  |     }  | 
202  | 0  |     UProperty firstProp = which, lastProp = which;  | 
203  | 0  |     if (which == UCHAR_RGI_EMOJI) { | 
204  |  |         // RGI_Emoji is the union of the other emoji properties of strings.  | 
205  | 0  |         firstProp = UCHAR_BASIC_EMOJI;  | 
206  | 0  |         lastProp = UCHAR_RGI_EMOJI_ZWJ_SEQUENCE;  | 
207  | 0  |     }  | 
208  | 0  |     for (int32_t prop = firstProp; prop <= lastProp; ++prop) { | 
209  | 0  |         const char16_t *trieUChars = stringTries[prop - UCHAR_BASIC_EMOJI];  | 
210  | 0  |         if (trieUChars != nullptr) { | 
211  | 0  |             UCharsTrie::Iterator iter(trieUChars, 0, errorCode);  | 
212  | 0  |             while (iter.next(errorCode)) { | 
213  | 0  |                 const UnicodeString &s = iter.getString();  | 
214  | 0  |                 sa->addString(sa->set, s.getBuffer(), s.length());  | 
215  | 0  |             }  | 
216  | 0  |         }  | 
217  | 0  |     }  | 
218  | 0  | }  | 
219  |  |  | 
220  |  | U_NAMESPACE_END  |