/src/icu/source/common/dictionarydata.cpp

Source (jump to first uncovered line)
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2014-2016, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* dictionarydata.cpp
*
* created on: 2012may31
* created by: Markus W. Scherer & Maxime Serrano
*/

#include "dictionarydata.h"
#include "unicode/ucharstrie.h"
#include "unicode/bytestrie.h"
#include "unicode/udata.h"
#include "cmemory.h"

#if !UCONFIG_NO_BREAK_ITERATION

U_NAMESPACE_BEGIN

// Out-of-line definitions of the DictionaryData static constants.
//
// Trie type field: udict_swap() in this file masks indexes[IX_TRIE_TYPE] with
// TRIE_TYPE_MASK and compares the result against the TRIE_TYPE_* values.
// TRIE_TYPE_BYTES is what BytesDictionaryMatcher::getType() returns; its trie
// needs no swapping. TRIE_TYPE_UCHARS is what UCharsDictionaryMatcher::getType()
// returns; its trie is swapped as 16-bit units.
const int32_t  DictionaryData::TRIE_TYPE_BYTES = 0;
const int32_t  DictionaryData::TRIE_TYPE_UCHARS = 1;
const int32_t  DictionaryData::TRIE_TYPE_MASK = 7;
// Bit outside TRIE_TYPE_MASK; not referenced in this file.
// NOTE(review): presumably flags that the trie stores meaningful values — confirm in dictionarydata.h.
const int32_t  DictionaryData::TRIE_HAS_VALUES = 8;

// Transform field, as used by BytesDictionaryMatcher::transform():
// (transformConstant & TRANSFORM_TYPE_MASK) selects the transform type, and for
// TRANSFORM_TYPE_OFFSET the bits under TRANSFORM_OFFSET_MASK are the value
// subtracted from each code point.
// TRANSFORM_NONE is not referenced in this file.
// NOTE(review): presumably the "no transform" type value — confirm against callers.
const int32_t  DictionaryData::TRANSFORM_NONE = 0;
const int32_t  DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000;
const int32_t  DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000;
const int32_t  DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff;
    
// Out-of-line definition of the DictionaryMatcher destructor; the body is
// intentionally empty.
DictionaryMatcher::~DictionaryMatcher() {
}

// Destructor: passes the `file` member to udata_close().
// NOTE(review): presumably `file` is the data handle that backs `characters` —
// confirm in dictionarydata.h.
UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
    udata_close(file);
}

// Reports the trie type of this matcher: always DictionaryData::TRIE_TYPE_UCHARS.
int32_t UCharsDictionaryMatcher::getType() const {
    return DictionaryData::TRIE_TYPE_UCHARS;
}

/**
 * Feeds code points read from `text` (starting at its current position) into a
 * UCharsTrie built over `characters` and records every word the trie reports.
 *
 * @param text      Input text; it is advanced by utext_next32() and is not
 *                  rewound by this function.
 * @param maxLength The walk stops once the native-index distance from the
 *                  starting position reaches this value. The check is made
 *                  after a code point has been processed.
 * @param limit     Maximum number of matches to record; also caps the return value.
 * @param lengths   Optional (may be NULL): native-index length of each match.
 * @param cpLengths Optional (may be NULL): code point length of each match.
 * @param values    Optional (may be NULL): trie value of each match.
 * @param prefix    Optional (may be NULL): receives the number of code points
 *                  fed to the trie, including a final one that did not match.
 * @return Number of matches recorded.
 */
int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
                            int32_t *lengths, int32_t *cpLengths, int32_t *values,
                            int32_t *prefix) const {
    UCharsTrie trie(characters);
    const int32_t originIndex = static_cast<int32_t>(utext_getNativeIndex(text));
    int32_t found = 0;
    int32_t consumed = 0;

    UChar32 cp;
    // A negative code point from utext_next32() ends the walk.
    while ((cp = utext_next32(text)) >= 0) {
        // The very first code point restarts the trie; later ones continue from its state.
        const UStringTrieResult step = (consumed != 0) ? trie.next(cp) : trie.first(cp);
        const int32_t nativeSpan = static_cast<int32_t>(utext_getNativeIndex(text)) - originIndex;
        ++consumed;

        if (USTRINGTRIE_HAS_VALUE(step)) {
            // A word ends here; record it only while there is room left.
            if (found < limit) {
                if (values != NULL) {
                    values[found] = trie.getValue();
                }
                if (lengths != NULL) {
                    lengths[found] = nativeSpan;
                }
                if (cpLengths != NULL) {
                    cpLengths[found] = consumed;
                }
                ++found;
            }
            // A final value means no longer word continues from this state.
            if (step == USTRINGTRIE_FINAL_VALUE) {
                break;
            }
        } else if (step == USTRINGTRIE_NO_MATCH) {
            break;
        }

        if (nativeSpan >= maxLength) {
            break;
        }
    }

    if (prefix != NULL) {
        *prefix = consumed;
    }
    return found;
}

// Destructor: passes the `file` member to udata_close().
// NOTE(review): presumably `file` is the data handle that backs `characters` —
// confirm in dictionarydata.h.
BytesDictionaryMatcher::~BytesDictionaryMatcher() {
    udata_close(file);
}

/**
 * Maps a code point to the value that is fed to the bytes trie.
 *
 * When the transform type stored in `transformConstant` is
 * TRANSFORM_TYPE_OFFSET:
 *   - U+200D maps to 0xFF and U+200C maps to 0xFE;
 *   - any other code point maps to (c - offset), where offset is the part of
 *     `transformConstant` under TRANSFORM_OFFSET_MASK, provided the result
 *     lies in 0..0xFD; otherwise U_SENTINEL is returned.
 * For every other transform type the code point is returned unchanged.
 */
UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
    const bool usesOffset =
        (transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET;
    if (!usesOffset) {
        return c;
    }

    // Two code points have fixed slots at the top of the byte range.
    switch (c) {
    case 0x200D:
        return 0xFF;
    case 0x200C:
        return 0xFE;
    default:
        break;
    }

    const int32_t base = transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK;
    const int32_t shifted = c - base;
    // 0xFE and 0xFF are taken by the special cases above, so the usable range ends at 0xFD.
    return (0 <= shifted && shifted <= 0xFD) ? (UChar32)shifted : U_SENTINEL;
}

// Reports the trie type of this matcher: always DictionaryData::TRIE_TYPE_BYTES.
int32_t BytesDictionaryMatcher::getType() const {
    return DictionaryData::TRIE_TYPE_BYTES;
}

/**
 * Feeds code points read from `text` (starting at its current position),
 * each mapped through transform(), into a BytesTrie built over `characters`
 * and records every word the trie reports.
 *
 * @param text      Input text; it is advanced by utext_next32() and is not
 *                  rewound by this function.
 * @param maxLength The walk stops once the native-index distance from the
 *                  starting position reaches this value. The check is made
 *                  after a code point has been processed.
 * @param limit     Maximum number of matches to record; also caps the return value.
 * @param lengths   Optional (may be NULL): native-index length of each match.
 * @param cpLengths Optional (may be NULL): code point length of each match.
 * @param values    Optional (may be NULL): trie value of each match.
 * @param prefix    Optional (may be NULL): receives the number of code points
 *                  fed to the trie, including a final one that did not match.
 * @return Number of matches recorded.
 */
int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
                            int32_t *lengths, int32_t *cpLengths, int32_t *values,
                            int32_t *prefix) const {
    BytesTrie trie(characters);
    const int32_t originIndex = static_cast<int32_t>(utext_getNativeIndex(text));
    int32_t found = 0;
    int32_t consumed = 0;

    UChar32 cp;
    // A negative code point from utext_next32() ends the walk.
    while ((cp = utext_next32(text)) >= 0) {
        // The trie is keyed by the transformed code point, not the raw one.
        const UChar32 mapped = transform(cp);
        // The very first code point restarts the trie; later ones continue from its state.
        const UStringTrieResult step = (consumed != 0) ? trie.next(mapped) : trie.first(mapped);
        const int32_t nativeSpan = static_cast<int32_t>(utext_getNativeIndex(text)) - originIndex;
        ++consumed;

        if (USTRINGTRIE_HAS_VALUE(step)) {
            // A word ends here; record it only while there is room left.
            if (found < limit) {
                if (values != NULL) {
                    values[found] = trie.getValue();
                }
                if (lengths != NULL) {
                    lengths[found] = nativeSpan;
                }
                if (cpLengths != NULL) {
                    cpLengths[found] = consumed;
                }
                ++found;
            }
            // A final value means no longer word continues from this state.
            if (step == USTRINGTRIE_FINAL_VALUE) {
                break;
            }
        } else if (step == USTRINGTRIE_NO_MATCH) {
            break;
        }

        if (nativeSpan >= maxLength) {
            break;
        }
    }

    if (prefix != NULL) {
        *prefix = consumed;
    }
    return found;
}


U_NAMESPACE_END

U_NAMESPACE_USE

/**
 * Swaps the byte order (via the UDataSwapper callbacks) of ICU dictionary data.
 *
 * Layout handled here, as read by this function: a data header (swapped by
 * udata_swapDataHeader()), followed by DictionaryData::IX_COUNT int32_t
 * indexes, followed by the string trie. indexes[IX_TOTAL_SIZE] is the byte
 * size of everything after the header (indexes included), and
 * indexes[IX_RESERVED1_OFFSET] is the offset, counted from the start of the
 * indexes, at which the trie ends.
 *
 * @param ds         Swapper providing readInt32/swapArray16/swapArray32.
 * @param inData     Input data, starting at the header.
 * @param length     Number of input bytes, or a negative value to only compute
 *                   the size: in that case only the header and indexes are
 *                   read and this function swaps no payload.
 * @param outData    Output buffer; may be the same memory as inData.
 * @param pErrorCode In/out ICU error code. Set to U_UNSUPPORTED_ERROR for an
 *                   unrecognized data format or trie type and to
 *                   U_INDEX_OUTOFBOUNDS_ERROR when the input is too short or
 *                   its indexes point outside the data.
 * @return headerSize + indexes[IX_TOTAL_SIZE], or 0 when this function detects
 *         an error.
 */
U_CAPI int32_t U_EXPORT2
udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
           void *outData, UErrorCode *pErrorCode) {
    const UDataInfo *pInfo;
    int32_t headerSize;
    const uint8_t *inBytes;
    uint8_t *outBytes;
    const int32_t *inIndexes;
    int32_t indexes[DictionaryData::IX_COUNT];
    int32_t i, size;

    headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
    if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0;
    pInfo = (const UDataInfo *)((const char *)inData + 4);
    // dataFormat must spell "Dict" (0x44 0x69 0x63 0x74), format version 1.
    if (!(pInfo->dataFormat[0] == 0x44 && 
          pInfo->dataFormat[1] == 0x69 && 
          pInfo->dataFormat[2] == 0x63 && 
          pInfo->dataFormat[3] == 0x74 && 
          pInfo->formatVersion[0] == 1)) {
        udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
                         pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
        *pErrorCode = U_UNSUPPORTED_ERROR;
        return 0;
    }

    inBytes = (const uint8_t *)inData + headerSize;
    outBytes = (uint8_t *)outData + headerSize;

    inIndexes = (const int32_t *)inBytes;
    if (length >= 0) {
        length -= headerSize;
        if (length < (int32_t)(sizeof(indexes))) {
            udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
            *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }
    }

    // Read the indexes in the input's byte order so they can be interpreted here.
    for (i = 0; i < DictionaryData::IX_COUNT; i++) {
        indexes[i] = udata_readInt32(ds, inIndexes[i]);
    }

    size = indexes[DictionaryData::IX_TOTAL_SIZE];

    // The total size comes straight from the data and is used below as a copy
    // length; it must at least cover the indexes themselves. Rejecting it here
    // also keeps a negative value from reaching uprv_memcpy().
    if (size < (int32_t)(sizeof(indexes))) {
        udata_printError(ds, "udict_swap(): invalid total size %d in dictionary data indexes\n", size);
        *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
    }

    if (length >= 0) {
        if (length < size) {
            udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
            *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }

        // Start from a verbatim copy so that parts which need no swapping
        // (a bytes trie) are already in place.
        if (inBytes != outBytes) {
            uprv_memcpy(outBytes, inBytes, size);
        }

        ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);

        const int32_t trieStart = (int32_t)sizeof(indexes);
        const int32_t trieEnd = indexes[DictionaryData::IX_RESERVED1_OFFSET];
        int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;

        if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
            // The trie range also comes from the data: keep the 16-bit swap
            // inside [end of indexes, total size].
            if (trieEnd < trieStart || size < trieEnd) {
                udata_printError(ds, "udict_swap(): trie end offset %d is outside the dictionary data (size %d)\n", trieEnd, size);
                *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
                return 0;
            }
            ds->swapArray16(ds, inBytes + trieStart, trieEnd - trieStart, outBytes + trieStart, pErrorCode);
        } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
            // nothing to do
        } else {
            udata_printError(ds, "udict_swap(): unknown trie type!\n");
            *pErrorCode = U_UNSUPPORTED_ERROR;
            return 0;
        }

        // The two reserved sections that follow the trie (up to
        // IX_RESERVED2_OFFSET and IX_TOTAL_SIZE) are empty in the current
        // format, so there is nothing further to swap.
    }
    return headerSize + size;
}
#endif

Coverage Report

Created: 2025-01-28 06:38

Line	Count	Source (jump to first uncovered line)
1		// © 2016 and later: Unicode, Inc. and others.
2		// License & terms of use: http://www.unicode.org/copyright.html
3		/*
4		*******************************************************************************
5		* Copyright (C) 2014-2016, International Business Machines
6		* Corporation and others. All Rights Reserved.
7		*******************************************************************************
8		* dictionarydata.h
9		*
10		* created on: 2012may31
11		* created by: Markus W. Scherer & Maxime Serrano
12		*/
13
14		#include "dictionarydata.h"
15		#include "unicode/ucharstrie.h"
16		#include "unicode/bytestrie.h"
17		#include "unicode/udata.h"
18		#include "cmemory.h"
19
20		#if !UCONFIG_NO_BREAK_ITERATION
21
22		U_NAMESPACE_BEGIN
23
24		const int32_t DictionaryData::TRIE_TYPE_BYTES = 0;
25		const int32_t DictionaryData::TRIE_TYPE_UCHARS = 1;
26		const int32_t DictionaryData::TRIE_TYPE_MASK = 7;
27		const int32_t DictionaryData::TRIE_HAS_VALUES = 8;
28
29		const int32_t DictionaryData::TRANSFORM_NONE = 0;
30		const int32_t DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000;
31		const int32_t DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000;
32		const int32_t DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff;
33
34	0	DictionaryMatcher::~DictionaryMatcher() {
35	0	}
36
37	0	UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
38	0	udata_close(file);
39	0	}
40
41	0	int32_t UCharsDictionaryMatcher::getType() const {
42	0	return DictionaryData::TRIE_TYPE_UCHARS;
43	0	}
44
45		int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
46		int32_t *lengths, int32_t *cpLengths, int32_t *values,
47	0	int32_t *prefix) const {
48
49	0	UCharsTrie uct(characters);
50	0	int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
51	0	int32_t wordCount = 0;
52	0	int32_t codePointsMatched = 0;
53
54	0	for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
55	0	UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c);
56	0	int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
57	0	codePointsMatched += 1;
58	0	if (USTRINGTRIE_HAS_VALUE(result)) {
59	0	if (wordCount < limit) {
60	0	if (values != NULL) {
61	0	values[wordCount] = uct.getValue();
62	0	}
63	0	if (lengths != NULL) {
64	0	lengths[wordCount] = lengthMatched;
65	0	}
66	0	if (cpLengths != NULL) {
67	0	cpLengths[wordCount] = codePointsMatched;
68	0	}
69	0	++wordCount;
70	0	}
71	0	if (result == USTRINGTRIE_FINAL_VALUE) {
72	0	break;
73	0	}
74	0	}
75	0	else if (result == USTRINGTRIE_NO_MATCH) {
76	0	break;
77	0	}
78	0	if (lengthMatched >= maxLength) {
79	0	break;
80	0	}
81	0	}
82
83	0	if (prefix != NULL) {
84	0	*prefix = codePointsMatched;
85	0	}
86	0	return wordCount;
87	0	}
88
89	0	BytesDictionaryMatcher::~BytesDictionaryMatcher() {
90	0	udata_close(file);
91	0	}
92
93	0	UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
94	0	if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) {
95	0	if (c == 0x200D) {
96	0	return 0xFF;
97	0	} else if (c == 0x200C) {
98	0	return 0xFE;
99	0	}
100	0	int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
101	0	if (delta < 0 || 0xFD < delta) {
102	0	return U_SENTINEL;
103	0	}
104	0	return (UChar32)delta;
105	0	}
106	0	return c;
107	0	}
108
109	0	int32_t BytesDictionaryMatcher::getType() const {
110	0	return DictionaryData::TRIE_TYPE_BYTES;
111	0	}
112
113		int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
114		int32_t *lengths, int32_t *cpLengths, int32_t *values,
115	0	int32_t *prefix) const {
116	0	BytesTrie bt(characters);
117	0	int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
118	0	int32_t wordCount = 0;
119	0	int32_t codePointsMatched = 0;
120
121	0	for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
122	0	UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c));
123	0	int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
124	0	codePointsMatched += 1;
125	0	if (USTRINGTRIE_HAS_VALUE(result)) {
126	0	if (wordCount < limit) {
127	0	if (values != NULL) {
128	0	values[wordCount] = bt.getValue();
129	0	}
130	0	if (lengths != NULL) {
131	0	lengths[wordCount] = lengthMatched;
132	0	}
133	0	if (cpLengths != NULL) {
134	0	cpLengths[wordCount] = codePointsMatched;
135	0	}
136	0	++wordCount;
137	0	}
138	0	if (result == USTRINGTRIE_FINAL_VALUE) {
139	0	break;
140	0	}
141	0	}
142	0	else if (result == USTRINGTRIE_NO_MATCH) {
143	0	break;
144	0	}
145	0	if (lengthMatched >= maxLength) {
146	0	break;
147	0	}
148	0	}
149
150	0	if (prefix != NULL) {
151	0	*prefix = codePointsMatched;
152	0	}
153	0	return wordCount;
154	0	}
155
156
157		U_NAMESPACE_END
158
159		U_NAMESPACE_USE
160
161		U_CAPI int32_t U_EXPORT2
162		udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
163	0	void *outData, UErrorCode *pErrorCode) {
164	0	const UDataInfo *pInfo;
165	0	int32_t headerSize;
166	0	const uint8_t *inBytes;
167	0	uint8_t *outBytes;
168	0	const int32_t *inIndexes;
169	0	int32_t indexes[DictionaryData::IX_COUNT];
170	0	int32_t i, offset, size;
171
172	0	headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
173	0	if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0;
174	0	pInfo = (const UDataInfo *)((const char *)inData + 4);
175	0	if (!(pInfo->dataFormat[0] == 0x44 &&
176	0	pInfo->dataFormat[1] == 0x69 &&
177	0	pInfo->dataFormat[2] == 0x63 &&
178	0	pInfo->dataFormat[3] == 0x74 &&
179	0	pInfo->formatVersion[0] == 1)) {
180	0	udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
181	0	pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
182	0	*pErrorCode = U_UNSUPPORTED_ERROR;
183	0	return 0;
184	0	}
185
186	0	inBytes = (const uint8_t *)inData + headerSize;
187	0	outBytes = (uint8_t *)outData + headerSize;
188
189	0	inIndexes = (const int32_t *)inBytes;
190	0	if (length >= 0) {
191	0	length -= headerSize;
192	0	if (length < (int32_t)(sizeof(indexes))) {
193	0	udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
194	0	*pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
195	0	return 0;
196	0	}
197	0	}
198
199	0	for (i = 0; i < DictionaryData::IX_COUNT; i++) {
200	0	indexes[i] = udata_readInt32(ds, inIndexes[i]);
201	0	}
202
203	0	size = indexes[DictionaryData::IX_TOTAL_SIZE];
204
205	0	if (length >= 0) {
206	0	if (length < size) {
207	0	udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
208	0	*pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
209	0	return 0;
210	0	}
211
212	0	if (inBytes != outBytes) {
213	0	uprv_memcpy(outBytes, inBytes, size);
214	0	}
215
216	0	offset = 0;
217	0	ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);
218	0	offset = (int32_t)sizeof(indexes);
219	0	int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
220	0	int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET];
221
222	0	if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
223	0	ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode);
224	0	} else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
225		// nothing to do
226	0	} else {
227	0	udata_printError(ds, "udict_swap(): unknown trie type!\n");
228	0	*pErrorCode = U_UNSUPPORTED_ERROR;
229	0	return 0;
230	0	}
231
232		// these next two sections are empty in the current format,
233		// but may be used later.
234	0	offset = nextOffset;
235	0	nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET];
236	0	offset = nextOffset;
237	0	nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE];
238	0	offset = nextOffset;
239	0	}
240	0	return headerSize + size;
241	0	}
242		#endif