/src/icu/icu4c/source/common/rbbinode.cpp
Line | Count | Source |
1 | | // © 2016 and later: Unicode, Inc. and others. |
2 | | // License & terms of use: http://www.unicode.org/copyright.html |
3 | | /* |
4 | | *************************************************************************** |
5 | | * Copyright (C) 2002-2016 International Business Machines Corporation * |
6 | | * and others. All rights reserved. * |
7 | | *************************************************************************** |
8 | | */ |
9 | | |
10 | | // |
11 | | // File: rbbinode.cpp |
12 | | // |
13 | | // Implementation of class RBBINode, which represents a node in the |
14 | | // tree generated when parsing the Rules Based Break Iterator rules. |
15 | | // |
16 | | // This "Class" is actually closer to a struct. |
17 | | // Code using it is expected to directly access fields much of the time. |
18 | | // |
19 | | |
20 | | #include "unicode/utypes.h" |
21 | | |
22 | | #if !UCONFIG_NO_BREAK_ITERATION |
23 | | |
24 | | #include "unicode/unistr.h" |
25 | | #include "unicode/uniset.h" |
26 | | #include "unicode/uchar.h" |
27 | | #include "unicode/parsepos.h" |
28 | | |
29 | | #include "cstr.h" |
30 | | #include "uvector.h" |
31 | | |
32 | | #include "rbbirb.h" |
33 | | #include "rbbinode.h" |
34 | | |
35 | | #include "uassert.h" |
36 | | |
37 | | |
38 | | U_NAMESPACE_BEGIN |
39 | | |
40 | | #ifdef RBBI_DEBUG |
41 | | static int gLastSerial = 0; |
42 | | #endif |
43 | | |
44 | | |
45 | | //------------------------------------------------------------------------- |
46 | | // |
47 | | // Constructor. Just set the fields to reasonable default values. |
48 | | // |
49 | | //------------------------------------------------------------------------- |
50 | 11.5M | RBBINode::RBBINode(NodeType t, UErrorCode& status) : UMemory() { |
51 | 11.5M | if (U_FAILURE(status)) { |
52 | 0 | return; |
53 | 0 | } |
54 | | #ifdef RBBI_DEBUG |
55 | | fSerialNum = ++gLastSerial; |
56 | | #endif |
57 | 11.5M | fType = t; |
58 | 11.5M | fParent = nullptr; |
59 | 11.5M | fLeftChild = nullptr; |
60 | 11.5M | fRightChild = nullptr; |
61 | 11.5M | fInputSet = nullptr; |
62 | 11.5M | fFirstPos = 0; |
63 | 11.5M | fLastPos = 0; |
64 | 11.5M | fNullable = false; |
65 | 11.5M | fLookAheadEnd = false; |
66 | 11.5M | fRuleRoot = false; |
67 | 11.5M | fChainIn = false; |
68 | 11.5M | fVal = 0; |
69 | 11.5M | fPrecedence = precZero; |
70 | | |
71 | 11.5M | fFirstPosSet = new UVector(status); |
72 | 11.5M | fLastPosSet = new UVector(status); |
73 | 11.5M | fFollowPos = new UVector(status); |
74 | 11.5M | if (U_SUCCESS(status) && |
75 | 11.5M | (fFirstPosSet == nullptr || fLastPosSet == nullptr || fFollowPos == nullptr)) { |
76 | 0 | status = U_MEMORY_ALLOCATION_ERROR; |
77 | 0 | } |
78 | 11.5M | if (t==opCat) {fPrecedence = precOpCat;} |
79 | 6.08M | else if (t==opOr) {fPrecedence = precOpOr;} |
80 | 5.94M | else if (t==opStart) {fPrecedence = precStart;} |
81 | 5.90M | else if (t==opLParen) {fPrecedence = precLParen;} |
82 | | |
83 | 11.5M | } |
84 | | |
85 | | |
86 | 907k | RBBINode::RBBINode(const RBBINode &other, UErrorCode& status) : UMemory(other) { |
87 | 907k | if (U_FAILURE(status)) { |
88 | 0 | return; |
89 | 0 | } |
90 | | #ifdef RBBI_DEBUG |
91 | | fSerialNum = ++gLastSerial; |
92 | | #endif |
93 | 907k | fType = other.fType; |
94 | 907k | fParent = nullptr; |
95 | 907k | fLeftChild = nullptr; |
96 | 907k | fRightChild = nullptr; |
97 | 907k | fInputSet = other.fInputSet; |
98 | 907k | fPrecedence = other.fPrecedence; |
99 | 907k | fText = other.fText; |
100 | 907k | fFirstPos = other.fFirstPos; |
101 | 907k | fLastPos = other.fLastPos; |
102 | 907k | fNullable = other.fNullable; |
103 | 907k | fVal = other.fVal; |
104 | 907k | fRuleRoot = false; |
105 | 907k | fChainIn = other.fChainIn; |
106 | 907k | fFirstPosSet = new UVector(status); // TODO - get a real status from somewhere |
107 | 907k | fLastPosSet = new UVector(status); |
108 | 907k | fFollowPos = new UVector(status); |
109 | 907k | if (U_SUCCESS(status) && |
110 | 907k | (fFirstPosSet == nullptr || fLastPosSet == nullptr || fFollowPos == nullptr)) { |
111 | 0 | status = U_MEMORY_ALLOCATION_ERROR; |
112 | 0 | } |
113 | 907k | } |
114 | | |
115 | | |
116 | | //------------------------------------------------------------------------- |
117 | | // |
118 | | // Destructor. Deletes both this node AND any child nodes, |
119 | | // except in the case of variable reference nodes. For |
120 | | // these, the l. child points back to the definition, which |
121 | | // is common for all references to the variable, meaning |
122 | | // it can't be deleted here. |
123 | | // |
124 | | //------------------------------------------------------------------------- |
125 | 12.4M | RBBINode::~RBBINode() { |
126 | | // printf("deleting node %8x serial %4d\n", this, this->fSerialNum); |
127 | 12.4M | delete fInputSet; |
128 | 12.4M | fInputSet = nullptr; |
129 | | |
130 | 12.4M | switch (this->fType) { |
131 | 2.25k | case varRef: |
132 | 5.54M | case setRef: |
133 | | // for these node types, multiple instances point to the same "children" |
134 | | // Storage ownership of children handled elsewhere. Don't delete here. |
135 | 5.54M | break; |
136 | | |
137 | 6.93M | default: |
138 | | // Avoid using a recursive implementation because of stack overflow problems. |
139 | | // See bug ICU-22584. |
140 | | // delete fLeftChild; |
141 | 6.93M | NRDeleteNode(fLeftChild); |
142 | 6.93M | fLeftChild = nullptr; |
143 | | // delete fRightChild; |
144 | 6.93M | NRDeleteNode(fRightChild); |
145 | 6.93M | fRightChild = nullptr; |
146 | 12.4M | } |
147 | | |
148 | 12.4M | delete fFirstPosSet; |
149 | 12.4M | delete fLastPosSet; |
150 | 12.4M | delete fFollowPos; |
151 | 12.4M | } |
152 | | |
153 | | /** |
154 | | * Non-recursive delete of a node + its children. Used from the node destructor |
155 | | * instead of the more obvious recursive implementation to avoid problems with |
156 | | * stack overflow with some perverse test rule data (from fuzzing). |
157 | | */ |
158 | 13.8M | void RBBINode::NRDeleteNode(RBBINode *node) { |
159 | 13.8M | if (node == nullptr) { |
160 | 13.7M | return; |
161 | 13.7M | } |
162 | | |
163 | 111k | RBBINode *stopNode = node->fParent; |
164 | 111k | RBBINode *nextNode = node; |
165 | 24.0M | while (nextNode != stopNode && nextNode != nullptr) { |
166 | 23.9M | RBBINode *currentNode = nextNode; |
167 | | |
168 | 23.9M | if ((currentNode->fLeftChild == nullptr && currentNode->fRightChild == nullptr) || |
169 | 17.2M | currentNode->fType == varRef || // varRef and setRef nodes do not |
170 | 17.2M | currentNode->fType == setRef) { // own their children nodes. |
171 | | // CurrentNode is effectively a leaf node; it's safe to go ahead and delete it. |
172 | 12.0M | nextNode = currentNode->fParent; |
173 | 12.0M | if (nextNode) { |
174 | 12.0M | if (nextNode->fLeftChild == currentNode) { |
175 | 6.05M | nextNode->fLeftChild = nullptr; |
176 | 6.05M | } else if (nextNode->fRightChild == currentNode) { |
177 | 5.95M | nextNode->fRightChild = nullptr; |
178 | 5.95M | } |
179 | 12.0M | } |
180 | 12.0M | delete currentNode; |
181 | 12.0M | } else if (currentNode->fLeftChild) { |
182 | 5.94M | nextNode = currentNode->fLeftChild; |
183 | 5.94M | if (nextNode->fParent == nullptr) { |
184 | 480 | nextNode->fParent = currentNode; |
185 | | // fParent isn't always set; do it now if not. |
186 | 480 | } |
187 | 5.94M | U_ASSERT(nextNode->fParent == currentNode); |
188 | 5.94M | } else if (currentNode->fRightChild) { |
189 | 5.94M | nextNode = currentNode->fRightChild; |
190 | 5.94M | if (nextNode->fParent == nullptr) { |
191 | 537 | nextNode->fParent = currentNode; |
192 | | // fParent isn't always set; do it now if not. |
193 | 537 | } |
194 | 5.94M | U_ASSERT(nextNode->fParent == currentNode); |
195 | 5.94M | } |
196 | 23.9M | } |
197 | 111k | } |
198 | | |
199 | | //------------------------------------------------------------------------- |
200 | | // |
201 | | // cloneTree Make a copy of the subtree rooted at this node. |
202 | | // Discard any variable references encountered along the way, |
203 | | // and replace with copies of the variable's definitions. |
204 | | // Used to replicate the expression underneath variable |
205 | | // references in preparation for generating the DFA tables. |
206 | | // |
207 | | //------------------------------------------------------------------------- |
208 | | constexpr int kRecursiveDepthLimit = 3500; |
209 | 924k | RBBINode *RBBINode::cloneTree(UErrorCode &status, int depth) { |
210 | 924k | if (U_FAILURE(status)) { |
211 | 0 | return nullptr; |
212 | 0 | } |
213 | | // If the depth of the stack is too deep, we return U_INPUT_TOO_LONG_ERROR |
214 | | // to avoid stack overflow crash. |
215 | 924k | if (depth > kRecursiveDepthLimit) { |
216 | 22 | status = U_INPUT_TOO_LONG_ERROR; |
217 | 22 | return nullptr; |
218 | 22 | } |
219 | 924k | RBBINode *n; |
220 | | |
221 | 924k | if (fType == RBBINode::varRef) { |
222 | | // If the current node is a variable reference, skip over it |
223 | | // and clone the definition of the variable instead. |
224 | 344 | n = fLeftChild->cloneTree(status, depth+1); |
225 | 344 | if (U_FAILURE(status)) { |
226 | 22 | return nullptr; |
227 | 22 | } |
228 | 924k | } else if (fType == RBBINode::uset) { |
229 | 17.0k | n = this; |
230 | 907k | } else { |
231 | 907k | n = new RBBINode(*this, status); |
232 | 907k | if (U_FAILURE(status)) { |
233 | 0 | delete n; |
234 | 0 | return nullptr; |
235 | 0 | } |
236 | | // Check for null pointer. |
237 | 907k | if (n == nullptr) { |
238 | 0 | status = U_MEMORY_ALLOCATION_ERROR; |
239 | 0 | return nullptr; |
240 | 0 | } |
241 | 907k | if (fLeftChild != nullptr) { |
242 | 393k | n->fLeftChild = fLeftChild->cloneTree(status, depth+1); |
243 | 393k | if (U_FAILURE(status)) { |
244 | 49.5k | delete n; |
245 | 49.5k | return nullptr; |
246 | 49.5k | } |
247 | 343k | n->fLeftChild->fParent = n; |
248 | 343k | } |
249 | 857k | if (fRightChild != nullptr) { |
250 | 326k | n->fRightChild = fRightChild->cloneTree(status, depth+1); |
251 | 326k | if (U_FAILURE(status)) { |
252 | 75 | delete n; |
253 | 75 | return nullptr; |
254 | 75 | } |
255 | 326k | n->fRightChild->fParent = n; |
256 | 326k | } |
257 | 857k | } |
258 | 875k | return n; |
259 | 924k | } |
260 | | |
261 | | |
262 | | |
263 | | //------------------------------------------------------------------------- |
264 | | // |
265 | | // flattenVariables Walk a parse tree, replacing any variable |
266 | | // references with a copy of the variable's definition. |
267 | | // Aside from variables, the tree is not changed. |
268 | | // |
269 | | // Return the root of the tree. If the root was not a variable |
270 | | // reference, it remains unchanged - the root we started with |
271 | | // is the root we return. If, however, the root was a variable |
272 | | // reference, the root of the newly cloned replacement tree will |
273 | | // be returned, and the original tree deleted. |
274 | | // |
275 | | // This function works by recursively walking the tree |
276 | | // without doing anything until a variable reference is |
277 | | // found, then calling cloneTree() at that point. Any |
278 | | // nested references are handled by cloneTree(), not here. |
279 | | // |
280 | | //------------------------------------------------------------------------- |
281 | 1.59M | RBBINode *RBBINode::flattenVariables(UErrorCode& status, int depth) { |
282 | 1.59M | if (U_FAILURE(status)) { |
283 | 0 | return this; |
284 | 0 | } |
285 | | // If the depth of the stack is too deep, we return U_INPUT_TOO_LONG_ERROR |
286 | | // to avoid stack overflow crash. |
287 | 1.59M | if (depth > kRecursiveDepthLimit) { |
288 | 24 | status = U_INPUT_TOO_LONG_ERROR; |
289 | 24 | return this; |
290 | 24 | } |
291 | 1.59M | if (fType == varRef) { |
292 | 344 | RBBINode *retNode = fLeftChild->cloneTree(status, depth+1); |
293 | 344 | if (U_FAILURE(status)) { |
294 | 22 | return this; |
295 | 22 | } |
296 | 322 | retNode->fRuleRoot = this->fRuleRoot; |
297 | 322 | retNode->fChainIn = this->fChainIn; |
298 | 322 | delete this; // TODO: undefined behavior. Fix. |
299 | 322 | return retNode; |
300 | 344 | } |
301 | | |
302 | 1.59M | if (fLeftChild != nullptr) { |
303 | 1.05M | fLeftChild = fLeftChild->flattenVariables(status, depth+1); |
304 | 1.05M | if (fLeftChild == nullptr) { |
305 | 0 | status = U_MEMORY_ALLOCATION_ERROR; |
306 | 0 | } |
307 | 1.05M | if (U_FAILURE(status)) { |
308 | 111k | return this; |
309 | 111k | } |
310 | 941k | fLeftChild->fParent = this; |
311 | 941k | } |
312 | 1.48M | if (fRightChild != nullptr) { |
313 | 542k | fRightChild = fRightChild->flattenVariables(status, depth+1); |
314 | 542k | if (fRightChild == nullptr) { |
315 | 0 | status = U_MEMORY_ALLOCATION_ERROR; |
316 | 0 | } |
317 | 542k | if (U_FAILURE(status)) { |
318 | 96 | return this; |
319 | 96 | } |
320 | 542k | fRightChild->fParent = this; |
321 | 542k | } |
322 | 1.48M | return this; |
323 | 1.48M | } |
324 | | |
325 | | |
326 | | //------------------------------------------------------------------------- |
327 | | // |
328 | | // flattenSets Walk the parse tree, replacing any nodes of type setRef |
329 | | // with a copy of the expression tree for the set. A set's |
330 | | // equivalent expression tree is precomputed and saved as |
331 | | // the left child of the uset node. |
332 | | // |
333 | | //------------------------------------------------------------------------- |
334 | 246k | void RBBINode::flattenSets(UErrorCode &status, int depth) { |
335 | 246k | if (U_FAILURE(status)) { |
336 | 0 | return; |
337 | 0 | } |
338 | | // If the depth of the stack is too deep, we return U_INPUT_TOO_LONG_ERROR |
339 | | // to avoid stack overflow crash. |
340 | 246k | if (depth > kRecursiveDepthLimit) { |
341 | 0 | status = U_INPUT_TOO_LONG_ERROR; |
342 | 0 | return; |
343 | 0 | } |
344 | 246k | U_ASSERT(fType != setRef); |
345 | | |
346 | 246k | if (fLeftChild != nullptr) { |
347 | 225k | if (fLeftChild->fType==setRef) { |
348 | 22.2k | RBBINode *setRefNode = fLeftChild; |
349 | 22.2k | RBBINode *usetNode = setRefNode->fLeftChild; |
350 | 22.2k | RBBINode *replTree = usetNode->fLeftChild; |
351 | 22.2k | fLeftChild = replTree->cloneTree(status, depth+1); |
352 | 22.2k | if (U_FAILURE(status)) { |
353 | 0 | delete setRefNode; |
354 | 0 | return; |
355 | 0 | } |
356 | 22.2k | fLeftChild->fParent = this; |
357 | 22.2k | delete setRefNode; |
358 | 203k | } else { |
359 | 203k | fLeftChild->flattenSets(status, depth+1); |
360 | 203k | } |
361 | 225k | } |
362 | | |
363 | 246k | if (fRightChild != nullptr) { |
364 | 222k | if (fRightChild->fType==setRef) { |
365 | 181k | RBBINode *setRefNode = fRightChild; |
366 | 181k | RBBINode *usetNode = setRefNode->fLeftChild; |
367 | 181k | RBBINode *replTree = usetNode->fLeftChild; |
368 | 181k | fRightChild = replTree->cloneTree(status, depth+1); |
369 | 181k | if (U_FAILURE(status)) { |
370 | 0 | delete setRefNode; |
371 | 0 | return; |
372 | 0 | } |
373 | 181k | fRightChild->fParent = this; |
374 | 181k | delete setRefNode; |
375 | 181k | } else { |
376 | 40.9k | fRightChild->flattenSets(status, depth+1); |
377 | 40.9k | } |
378 | 222k | } |
379 | 246k | } |
380 | | |
381 | | |
382 | | |
383 | | //------------------------------------------------------------------------- |
384 | | // |
385 | | // findNodes() Locate all the nodes of the specified type, starting |
386 | | // at the specified root. |
387 | | // |
388 | | //------------------------------------------------------------------------- |
389 | 3.47M | void RBBINode::findNodes(UVector *dest, RBBINode::NodeType kind, UErrorCode &status) { |
390 | | /* test for buffer overflows */ |
391 | 3.47M | if (U_FAILURE(status)) { |
392 | 0 | return; |
393 | 0 | } |
394 | 3.47M | U_ASSERT(!dest->hasDeleter()); |
395 | 3.47M | if (fType == kind) { |
396 | 147k | dest->addElement(this, status); |
397 | 147k | } |
398 | 3.47M | if (fLeftChild != nullptr) { |
399 | 1.73M | fLeftChild->findNodes(dest, kind, status); |
400 | 1.73M | } |
401 | 3.47M | if (fRightChild != nullptr) { |
402 | 1.72M | fRightChild->findNodes(dest, kind, status); |
403 | 1.72M | } |
404 | 3.47M | } |
405 | | |
406 | | |
407 | | //------------------------------------------------------------------------- |
408 | | // |
409 | | // print. Print out a single node, for debugging. |
410 | | // |
411 | | //------------------------------------------------------------------------- |
412 | | #ifdef RBBI_DEBUG |
413 | | |
414 | | static int32_t serial(const RBBINode *node) { |
415 | | return (node == nullptr? -1 : node->fSerialNum); |
416 | | } |
417 | | |
418 | | |
419 | | void RBBINode::printNode(const RBBINode *node) { |
420 | | static const char * const nodeTypeNames[] = { |
421 | | "setRef", |
422 | | "uset", |
423 | | "varRef", |
424 | | "leafChar", |
425 | | "lookAhead", |
426 | | "tag", |
427 | | "endMark", |
428 | | "opStart", |
429 | | "opCat", |
430 | | "opOr", |
431 | | "opStar", |
432 | | "opPlus", |
433 | | "opQuestion", |
434 | | "opBreak", |
435 | | "opReverse", |
436 | | "opLParen" |
437 | | }; |
438 | | |
439 | | if (node==nullptr) { |
440 | | RBBIDebugPrintf("%10p", (void *)node); |
441 | | } else { |
442 | | RBBIDebugPrintf("%10p %5d %12s %c%c %5d %5d %5d %6d %d ", |
443 | | (void *)node, node->fSerialNum, nodeTypeNames[node->fType], |
444 | | node->fRuleRoot?'R':' ', node->fChainIn?'C':' ', |
445 | | serial(node->fLeftChild), serial(node->fRightChild), serial(node->fParent), |
446 | | node->fFirstPos, node->fVal); |
447 | | if (node->fType == varRef) { |
448 | | RBBI_DEBUG_printUnicodeString(node->fText); |
449 | | } |
450 | | } |
451 | | RBBIDebugPrintf("\n"); |
452 | | } |
453 | | #endif |
454 | | |
455 | | |
456 | | #ifdef RBBI_DEBUG |
457 | | U_CFUNC void RBBI_DEBUG_printUnicodeString(const UnicodeString &s, int minWidth) { |
458 | | RBBIDebugPrintf("%*s", minWidth, CStr(s)()); |
459 | | } |
460 | | #endif |
461 | | |
462 | | |
463 | | //------------------------------------------------------------------------- |
464 | | // |
465 | | // print. Print out the tree of nodes rooted at "this" |
466 | | // |
467 | | //------------------------------------------------------------------------- |
468 | | #ifdef RBBI_DEBUG |
469 | | void RBBINode::printNodeHeader() { |
470 | | RBBIDebugPrintf(" Address serial type LeftChild RightChild Parent position value\n"); |
471 | | } |
472 | | |
473 | | void RBBINode::printTree(const RBBINode *node, UBool printHeading) { |
474 | | if (printHeading) { |
475 | | printNodeHeader(); |
476 | | } |
477 | | printNode(node); |
478 | | if (node != nullptr) { |
479 | | // Only dump the definition under a variable reference if asked to. |
480 | | // Unconditionally dump children of all other node types. |
481 | | if (node->fType != varRef) { |
482 | | if (node->fLeftChild != nullptr) { |
483 | | printTree(node->fLeftChild, false); |
484 | | } |
485 | | |
486 | | if (node->fRightChild != nullptr) { |
487 | | printTree(node->fRightChild, false); |
488 | | } |
489 | | } |
490 | | } |
491 | | } |
492 | | #endif |
493 | | |
494 | | |
495 | | |
496 | | U_NAMESPACE_END |
497 | | |
498 | | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |