/src/icu/source/common/rbbiscan.h
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | // © 2016 and later: Unicode, Inc. and others.  | 
2  |  | // License & terms of use: http://www.unicode.org/copyright.html  | 
3  |  | //  | 
4  |  | //  rbbiscan.h  | 
5  |  | //  | 
6  |  | //  Copyright (C) 2002-2016, International Business Machines Corporation and others.  | 
7  |  | //  All Rights Reserved.  | 
8  |  | //  | 
9  |  | //  This file contains declarations for class RBBIRuleScanner  | 
10  |  | //  | 
11  |  |  | 
12  |  |  | 
13  |  | #ifndef RBBISCAN_H  | 
14  |  | #define RBBISCAN_H  | 
15  |  |  | 
16  |  | #include "unicode/utypes.h"  | 
17  |  | #include "unicode/uobject.h"  | 
18  |  | #include "unicode/rbbi.h"  | 
19  |  | #include "unicode/uniset.h"  | 
20  |  | #include "unicode/parseerr.h"  | 
21  |  | #include "uhash.h"  | 
22  |  | #include "uvector.h"  | 
23  |  | #include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that  | 
24  |  |                           //    looks up references to $variables within a set.  | 
25  |  | #include "rbbinode.h"  | 
26  |  | #include "rbbirpt.h"  | 
27  |  |  | 
28  |  | U_NAMESPACE_BEGIN  | 
29  |  |  | 
30  |  | class   RBBIRuleBuilder;  | 
31  |  | class   RBBISymbolTable;  | 
32  |  |  | 
33  |  |  | 
34  |  | //--------------------------------------------------------------------------------  | 
35  |  | //  | 
36  |  | //  class RBBIRuleScanner does the lowest level, character-at-a-time  | 
37  |  | //                        scanning of break iterator rules.    | 
38  |  | //  | 
39  |  | //                        The output of the scanner is parse trees for  | 
40  |  | //                        the rule expressions and a list of all Unicode Sets  | 
41  |  | //                        encountered.  | 
42  |  | //  | 
43  |  | //--------------------------------------------------------------------------------  | 
44  |  |  | 
45  |  | class RBBIRuleScanner : public UMemory { | 
46  |  | public:  | 
47  |  |  | 
48  |  |     enum { | 
49  |  |         kStackSize = 100            // The size of the state stack for  | 
50  |  |     };                              //   rules parsing.  Corresponds roughly  | 
51  |  |                                     //   to the depth of parentheses nesting  | 
52  |  |                                     //   that is allowed in the rules.  Also sizes fNodeStack.  | 
53  |  |  | 
54  |  |     struct RBBIRuleChar { | 
55  |  |         UChar32             fChar;  | 
56  |  |         UBool               fEscaped;  | 
57  | 0  |         RBBIRuleChar() : fChar(0), fEscaped(false) {} | 
58  |  |     };  | 
59  |  |  | 
60  |  |     RBBIRuleScanner(RBBIRuleBuilder  *rb);  | 
61  |  |  | 
62  |  |  | 
63  |  |     virtual    ~RBBIRuleScanner();  | 
64  |  |  | 
65  |  |     void        nextChar(RBBIRuleChar &c);          // Get the next char from the input stream.  | 
66  |  |                                                     //   Returns void; the char is passed back via c.  | 
67  |  |  | 
68  |  |     UBool       push(const RBBIRuleChar &c);        // Push (unget) one character.  | 
69  |  |                                                     //   Only a single character may be pushed.  | 
70  |  |  | 
71  |  |     void        parse();                            // Parse the rules, generating two parse  | 
72  |  |                                                     //   trees, one each for the forward and  | 
73  |  |                                                     //   reverse rules,  | 
74  |  |                                                     //   and a list of UnicodeSets encountered.  | 
75  |  |  | 
76  |  |     int32_t     numRules();                         // Return the number of rules that have been seen.  | 
77  |  |  | 
78  |  |     /**  | 
79  |  |      * Return a rules string without unnecessary  | 
80  |  |      * characters.  | 
81  |  |      */  | 
82  |  |     static UnicodeString stripRules(const UnicodeString &rules);  | 
83  |  | private:  | 
84  |  |  | 
85  |  |     UBool       doParseActions(int32_t a);             // NOTE(review): presumably runs parse action a from rbbirpt.h -- confirm.  | 
86  |  |     void        error(UErrorCode e);                   // error reporting convenience function.  | 
87  |  |     void        fixOpStack(RBBINode::OpPrecedence p);  | 
88  |  |                                                        // NOTE(review): presumably adjusts the node stack for precedence p -- confirm.  | 
89  |  |     void        findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = NULL);  | 
90  |  |  | 
91  |  |     UChar32     nextCharLL();                          // NOTE(review): presumably the low-level fetch beneath nextChar() -- confirm.  | 
92  |  | #ifdef RBBI_DEBUG  | 
93  |  |     void        printNodeStack(const char *title);  | 
94  |  | #endif  | 
95  |  |     RBBINode    *pushNewNode(RBBINode::NodeType  t);  | 
96  |  |     void        scanSet();  | 
97  |  |  | 
98  |  |  | 
99  |  |     RBBIRuleBuilder               *fRB;              // The rule builder that we are part of.  | 
100  |  |  | 
101  |  |     int32_t                       fScanIndex;        // Index of current character being processed  | 
102  |  |                                                      //   in the rule input string.  | 
103  |  |     int32_t                       fNextIndex;        // Index of the next character, which  | 
104  |  |                                                      //   is the first character not yet scanned.  | 
105  |  |     UBool                         fQuoteMode;        // Scan is in a 'quoted region'  | 
106  |  |     int32_t                       fLineNum;          // Line number in input file.  | 
107  |  |     int32_t                       fCharNum;          // Char position within the line.  | 
108  |  |     UChar32                       fLastChar;         // Previous char, needed to count CR-LF  | 
109  |  |                                                      //   as a single line, not two.  | 
110  |  |  | 
111  |  |     RBBIRuleChar                  fC;                // Current char for parse state machine  | 
112  |  |                                                      //   processing.  | 
113  |  |     UnicodeString                 fVarName;          // $variableName, valid when we've just  | 
114  |  |                                                      //   scanned one.  | 
115  |  |  | 
116  |  |     RBBIRuleTableEl               **fStateTable;     // State Transition Table for RBBI Rule  | 
117  |  |                                                      //   parsing.  index by p[state][char-class]  | 
118  |  |  | 
119  |  |     uint16_t                      fStack[kStackSize];  // State stack, holds state pushes  | 
120  |  |     int32_t                       fStackPtr;           //  and pops as specified in the state  | 
121  |  |                                                        //  transition rules.  | 
122  |  |  | 
123  |  |     RBBINode                      *fNodeStack[kStackSize]; // Node stack, holds nodes created  | 
124  |  |                                                            //  during the parse of a rule  | 
125  |  |     int32_t                        fNodeStackPtr;          // NOTE(review): presumably the index of the top of fNodeStack -- confirm.  | 
126  |  |  | 
127  |  |  | 
128  |  |     UBool                          fReverseRule;     // True if the rule currently being scanned  | 
129  |  |                                                      //  is a reverse direction rule (if it  | 
130  |  |                                                      //  starts with a '!')  | 
131  |  |  | 
132  |  |     UBool                          fLookAheadRule;   // True if the rule includes a '/'  | 
133  |  |                                                      //   somewhere within it.  | 
134  |  |  | 
135  |  |     UBool                          fNoChainInRule;   // True if the current rule starts with a '^'.  | 
136  |  |  | 
137  |  |     RBBISymbolTable               *fSymbolTable;     // symbol table, holds definitions of  | 
138  |  |                                                      //   $variable symbols.  | 
139  |  |  | 
140  |  |     UHashtable                    *fSetTable;        // UnicodeSet hash table, holds indexes to  | 
141  |  |                                                      //   the sets created while parsing rules.  | 
142  |  |                                                      //   The key is the string used for creating  | 
143  |  |                                                      //   the set.  | 
144  |  |  | 
145  |  |     UnicodeSet                     fRuleSets[10];    // Unicode Sets that are needed during  | 
146  |  |                                                      //  the scanning of RBBI rules.  The  | 
147  |  |                                                      //  indices for these are assigned by the  | 
148  |  |                                                      //  perl script that builds the state tables.  | 
149  |  |                                                      //  See rbbirpt.h.  | 
150  |  |  | 
151  |  |     int32_t                        fRuleNum;         // Counts each rule as it is scanned.  | 
152  |  |  | 
153  |  |     int32_t                        fOptionStart;     // Input index of start of a !!option  | 
154  |  |                                                      //   keyword, while being scanned.  | 
155  |  |  | 
156  |  |     UnicodeSet *gRuleSet_rule_char;                  // NOTE(review): despite the g prefix, these four are per-instance members.  | 
157  |  |     UnicodeSet *gRuleSet_white_space;  | 
158  |  |     UnicodeSet *gRuleSet_name_char;  | 
159  |  |     UnicodeSet *gRuleSet_name_start_char;  | 
160  |  |  | 
161  |  |     RBBIRuleScanner(const RBBIRuleScanner &other); // forbid copying of this class  | 
162  |  |     RBBIRuleScanner &operator=(const RBBIRuleScanner &other); // forbid copying of this class  | 
163  |  | };  | 
164  |  |  | 
165  |  | U_NAMESPACE_END  | 
166  |  |  | 
167  |  | #endif  |