/src/icu/source/common/dictionarydata.h
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | // © 2016 and later: Unicode, Inc. and others.  | 
2  |  | // License & terms of use: http://www.unicode.org/copyright.html  | 
3  |  | /*  | 
4  |  | *******************************************************************************  | 
5  |  | * Copyright (C) 2014, International Business Machines  | 
6  |  | * Corporation and others.  All Rights Reserved.  | 
7  |  | *******************************************************************************  | 
8  |  | * dictionarydata.h  | 
9  |  | *  | 
10  |  | * created on: 2012may31  | 
11  |  | * created by: Markus W. Scherer & Maxime Serrano  | 
12  |  | */  | 
13  |  |  | 
14  |  | #ifndef __DICTIONARYDATA_H__  | 
15  |  | #define __DICTIONARYDATA_H__  | 
16  |  |  | 
17  |  | #include "unicode/utypes.h"  | 
18  |  |  | 
19  |  | #if !UCONFIG_NO_BREAK_ITERATION  | 
20  |  |  | 
21  |  | #include "unicode/utext.h"  | 
22  |  | #include "unicode/udata.h"  | 
23  |  | #include "udataswp.h"  | 
24  |  | #include "unicode/uobject.h"  | 
25  |  | #include "unicode/ustringtrie.h"  | 
26  |  |  | 
27  |  | U_NAMESPACE_BEGIN  | 
28  |  |  | 
29  |  | class UCharsTrie;  | 
30  |  | class BytesTrie;  | 
31  |  |  | 
32  |  | class U_COMMON_API DictionaryData : public UMemory { /* Constants for reading a .dict file: trie-type and transform flags plus the slots of the indexes[] array. The static constants are only declared here; the values in the trailing comments come from their out-of-line definitions. */ | 
33  |  | public:  | 
34  |  |     static const int32_t TRIE_TYPE_BYTES; // = 0; trie type code for a serialized BytesTrie  | 
35  |  |     static const int32_t TRIE_TYPE_UCHARS; // = 1; trie type code for a serialized UCharsTrie  | 
36  |  |     static const int32_t TRIE_TYPE_MASK; // = 7; low three bits of indexes[IX_TRIE_TYPE], which carry the TRIE_TYPE_ code  | 
37  |  |     static const int32_t TRIE_HAS_VALUES; // = 8; flag bit: strings map to specific values (unset: every string maps to 0)  | 
38  |  |  | 
39  |  |     static const int32_t TRANSFORM_NONE; // = 0; no transformation, going by the name  | 
40  |  |     static const int32_t TRANSFORM_TYPE_OFFSET; // = 0x1000000; code point c maps to byte b = (c - offset), offset held in the low 21 bits  | 
41  |  |     static const int32_t TRANSFORM_TYPE_MASK; // = 0x7f000000; bits of indexes[IX_TRANSFORM] that hold the transform type  | 
42  |  |     static const int32_t TRANSFORM_OFFSET_MASK; // = 0x1fffff; the low 21 bits, i.e. the offset code point  | 
43  |  |  | 
44  |  |     enum { /* slot numbers into indexes[]; enumerators are implicitly numbered from 0 */ | 
45  |  |         // Byte offsets from the start of the data, after the generic header.  | 
46  |  |         IX_STRING_TRIE_OFFSET,  /* 0: start of the serialized trie; divided by 4 it gives the length of indexes[] */  | 
47  |  |         IX_RESERVED1_OFFSET,  /* 1 */  | 
48  |  |         IX_RESERVED2_OFFSET,  /* 2 */  | 
49  |  |         IX_TOTAL_SIZE,  /* 3: last of the four byte offsets */  | 
50  |  |  | 
51  |  |         // Trie type: TRIE_HAS_VALUES | TRIE_TYPE_BYTES etc.  | 
52  |  |         IX_TRIE_TYPE,  /* 4 */  | 
53  |  |         // Transform specification: TRANSFORM_TYPE_OFFSET | 0xe00 etc.  | 
54  |  |         IX_TRANSFORM,  /* 5 */  | 
55  |  |  | 
56  |  |         IX_RESERVED6,  /* 6 */  | 
57  |  |         IX_RESERVED7,  /* 7 */  | 
58  |  |         IX_COUNT  /* 8: number of index slots listed above */  | 
59  |  |     };  | 
60  |  | };  | 
61  |  |  | 
62  |  | /**  | 
63  |  |  * Abstract wrapper around generic dictionaries; concrete subclasses implement matches().  | 
64  |  |  * getType() should return a TRIE_TYPE_??? constant from DictionaryData.  | 
65  |  |  *   | 
66  |  |  * All implementations of this interface must be thread-safe if they are to be used inside of the  | 
67  |  |  * dictionary-based break iteration code.  | 
68  |  |  */  | 
69  |  | class U_COMMON_API DictionaryMatcher : public UMemory { /* Interface only: matches() and getType() are pure virtual, so objects are created through subclasses. */ | 
70  |  | public:  | 
71  | 0  |     DictionaryMatcher() {} /* nothing to set up: this class declares no data members */ | 
72  |  |     virtual ~DictionaryMatcher(); /* virtual so subclasses can be destroyed through a DictionaryMatcher pointer; defined outside this header */  | 
73  |  |     // Looks for dictionary words starting at the current position of the text; this should emulate CompactTrieDictionary::matches()  | 
74  |  |     /*  @param text      The text in which to look for matching words. Matching begins  | 
75  |  |      *                   at the current position of the UText.  | 
76  |  |      *  @param maxLength The max length of match to consider. Units are the native indexing  | 
77  |  |      *                   units of the UText.  | 
78  |  |      *  @param limit     Capacity of output arrays, which is also the maximum number of  | 
79  |  |      *                   matching words to be found.  | 
80  |  |      *  @param lengths   Output array, filled with the lengths of the matches, in order,  | 
81  |  |      *                   from shortest to longest. Lengths are in native indexing units  | 
82  |  |      *                   of the UText. May be NULL.  | 
83  |  |      *  @param cpLengths Output array, filled with the lengths of the matches, in order,  | 
84  |  |      *                   from shortest to longest. Lengths are the number of Unicode code points.  | 
85  |  |      *                   May be NULL.  | 
86  |  |      *  @param values    Output array, filled with the values associated with the words found.  | 
87  |  |      *                   May be NULL.  | 
88  |  |      *  @param prefix    Output parameter, the code point length of the prefix match, even if that  | 
89  |  |      *                   prefix didn't lead to a complete word. Will always be >= the cpLength  | 
90  |  |      *                   of the longest complete word matched. May be NULL.  | 
91  |  |      *  @return          Number of matching words found.  | 
92  |  |      */  | 
93  |  |     virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,  | 
94  |  |                             int32_t *lengths, int32_t *cpLengths, int32_t *values,  | 
95  |  |                             int32_t *prefix) const = 0;  | 
96  |  |  | 
97  |  |     /** @return a DictionaryData::TRIE_TYPE_XYZ constant, e.g. TRIE_TYPE_BYTES or TRIE_TYPE_UCHARS */  | 
98  |  |     virtual int32_t getType() const = 0;  | 
99  |  | };  | 
100  |  |  | 
101  |  | // Implementation of the DictionaryMatcher interface for a UCharsTrie dictionary  | 
102  |  | class U_COMMON_API UCharsDictionaryMatcher : public DictionaryMatcher { | 
103  |  | public:  | 
104  |  |     // Constructs a new UCharsDictionaryMatcher that stores both arguments as given.  | 
105  |  |     // The UDataMemory * will be closed on this object's destruction.  | 
106  | 0  |     UCharsDictionaryMatcher(const UChar *c, UDataMemory *f) : characters(c), file(f) { } | 
107  |  |     virtual ~UCharsDictionaryMatcher(); /* defined outside this header */  | 
108  |  |     virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,  | 
109  |  |                             int32_t *lengths, int32_t *cpLengths, int32_t *values,  | 
110  |  |                             int32_t *prefix) const; /* overrides DictionaryMatcher::matches(); same parameter contract */  | 
111  |  |     virtual int32_t getType() const; /* overrides DictionaryMatcher::getType() */  | 
112  |  | private:  | 
113  |  |     const UChar *characters; /* pointer handed to the constructor; presumably the serialized UCharsTrie units — TODO confirm */  | 
114  |  |     UDataMemory *file; /* closed on destruction per the constructor comment. NOTE(review): copying is not disabled in this class body, so a copy would share this pointer — confirm instances are never copied */  | 
115  |  | };  | 
116  |  |  | 
117  |  | // Implementation of the DictionaryMatcher interface for a BytesTrie dictionary  | 
118  |  | class U_COMMON_API BytesDictionaryMatcher : public DictionaryMatcher { | 
119  |  | public:  | 
120  |  |     // constructs a new BytesDictionaryMatcher that stores all three arguments as given  | 
121  |  |     // the transform constant should be the constant read from the file, not a masked version!  | 
122  |  |     // the UDataMemory * fed in here will be closed on this object's destruction  | 
123  |  |     BytesDictionaryMatcher(const char *c, int32_t t, UDataMemory *f)  | 
124  | 0  |             : characters(c), transformConstant(t), file(f) { } | 
125  |  |     virtual ~BytesDictionaryMatcher(); /* defined outside this header */  | 
126  |  |     virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,  | 
127  |  |                             int32_t *lengths, int32_t *cpLengths, int32_t *values,  | 
128  |  |                             int32_t *prefix) const; /* overrides DictionaryMatcher::matches(); same parameter contract */  | 
129  |  |     virtual int32_t getType() const; /* overrides DictionaryMatcher::getType() */  | 
130  |  | private:  | 
131  |  |     UChar32 transform(UChar32 c) const; /* private helper defined elsewhere; presumably applies transformConstant to map a code point toward its byte value — TODO confirm */  | 
132  |  |  | 
133  |  |     const char *characters; /* pointer handed to the constructor; presumably the serialized BytesTrie bytes — TODO confirm */  | 
134  |  |     int32_t transformConstant; /* unmasked transform word as read from the file (see constructor comment) */  | 
135  |  |     UDataMemory *file; /* closed on destruction per the constructor comment. NOTE(review): copying is not disabled in this class body, so a copy would share this pointer — confirm instances are never copied */  | 
136  |  | };  | 
137  |  |  | 
138  |  | U_NAMESPACE_END  | 
139  |  |  | 
140  |  | U_CAPI int32_t U_EXPORT2  /* Swap routine for dictionary data, declared outside the ICU namespace; the parameter list looks like the UDataSwapFn shape from udataswp.h — TODO confirm. Errors are reported through pErrorCode. */ | 
141  |  | udict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, UErrorCode *pErrorCode);  | 
142  |  |  | 
143  |  | /**  | 
144  |  |  * Format of dictionary .dict data files.  | 
145  |  |  * Format version 1.0.  | 
146  |  |  *  | 
147  |  |  * A dictionary .dict data file contains a byte-serialized BytesTrie or  | 
148  |  |  * a UChars-serialized UCharsTrie.  | 
149  |  |  * Such files are used in dictionary-based break iteration (DBBI).  | 
150  |  |  *  | 
151  |  |  * For a BytesTrie, a transformation type is specified for  | 
152  |  |  * transforming Unicode strings into byte sequences.  | 
153  |  |  *  | 
154  |  |  * A .dict file begins with a standard ICU data file header  | 
155  |  |  * (DataHeader, see ucmndata.h and unicode/udata.h).  | 
156  |  |  * The UDataInfo.dataVersion field is currently unused (set to 0.0.0.0).  | 
157  |  |  *  | 
158  |  |  * After the header, the file contains the following parts.  | 
159  |  |  * Constants are defined in the DictionaryData class.  | 
160  |  |  *  | 
161  |  |  * For the data structure of BytesTrie & UCharsTrie see  | 
162  |  |  * http://site.icu-project.org/design/struct/tries  | 
163  |  |  * and the bytestrie.h and ucharstrie.h header files.  | 
164  |  |  *  | 
165  |  |  * int32_t indexes[indexesLength]; -- indexesLength=indexes[IX_STRING_TRIE_OFFSET]/4;  | 
166  |  |  *  | 
167  |  |  *      The first four indexes are byte offsets in ascending order.  | 
168  |  |  *      Each byte offset marks the start of the next part in the data file,  | 
169  |  |  *      and the end of the previous one.  | 
170  |  |  *      When two consecutive byte offsets are the same, then the corresponding part is empty.  | 
171  |  |  *      Byte offsets are offsets from after the header,  | 
172  |  |  *      that is, from the beginning of the indexes[].  | 
173  |  |  *      Each part starts at an offset with proper alignment for its data.  | 
174  |  |  *      If necessary, the previous part may include padding bytes to achieve this alignment.  | 
175  |  |  *  | 
176  |  |  *      trieType=indexes[IX_TRIE_TYPE] defines the trie type.  | 
177  |  |  *      transform=indexes[IX_TRANSFORM] defines the Unicode-to-bytes transformation.  | 
178  |  |  *          If the transformation type is TRANSFORM_TYPE_OFFSET,  | 
179  |  |  *          then the lower 21 bits contain the offset code point.  | 
180  |  |  *          Each code point c is mapped to byte b = (c - offset).  | 
181  |  |  *          Code points outside the range offset..(offset+0xff) cannot be mapped  | 
182  |  |  *          and do not occur in the dictionary.  | 
183  |  |  *  | 
184  |  |  * stringTrie; -- a serialized BytesTrie or UCharsTrie  | 
185  |  |  *  | 
186  |  |  *      The dictionary maps strings to specific values (TRIE_HAS_VALUES bit set in trieType),  | 
187  |  |  *      or it maps all strings to 0 (TRIE_HAS_VALUES bit not set).  | 
188  |  |  */  | 
189  |  |  | 
190  |  | #endif  /* !UCONFIG_NO_BREAK_ITERATION */  | 
191  |  | #endif  /* __DICTIONARYDATA_H__ */  |