File: | out/../deps/icu-small/source/common/umutablecptrie.cpp |
Warning: | line 847, column 9 Assigned value is garbage or undefined |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | // © 2017 and later: Unicode, Inc. and others. | |||
2 | // License & terms of use: http://www.unicode.org/copyright.html | |||
3 | ||||
4 | // umutablecptrie.cpp (inspired by utrie2_builder.cpp) | |||
5 | // created: 2017dec29 Markus W. Scherer | |||
6 | ||||
7 | // #define UCPTRIE_DEBUG | |||
8 | #ifdef UCPTRIE_DEBUG | |||
9 | # include <stdio.h> | |||
10 | #endif | |||
11 | ||||
12 | #include "unicode/utypes.h" | |||
13 | #include "unicode/ucptrie.h" | |||
14 | #include "unicode/umutablecptrie.h" | |||
15 | #include "unicode/uobject.h" | |||
16 | #include "unicode/utf16.h" | |||
17 | #include "cmemory.h" | |||
18 | #include "uassert.h" | |||
19 | #include "ucptrie_impl.h" | |||
20 | ||||
21 | // ICU-20235 In case Microsoft math.h has defined this, undefine it. | |||
22 | #ifdef OVERFLOW | |||
23 | #undef OVERFLOW | |||
24 | #endif | |||
25 | ||||
26 | U_NAMESPACE_BEGINnamespace icu_71 { | |||
27 | ||||
28 | namespace { | |||
29 | ||||
30 | constexpr int32_t MAX_UNICODE = 0x10ffff; | |||
31 | ||||
32 | constexpr int32_t UNICODE_LIMIT = 0x110000; | |||
33 | constexpr int32_t BMP_LIMIT = 0x10000; | |||
34 | constexpr int32_t ASCII_LIMIT = 0x80; | |||
35 | ||||
36 | constexpr int32_t I_LIMIT = UNICODE_LIMIT >> UCPTRIE_SHIFT_3; | |||
37 | constexpr int32_t BMP_I_LIMIT = BMP_LIMIT >> UCPTRIE_SHIFT_3; | |||
38 | constexpr int32_t ASCII_I_LIMIT = ASCII_LIMIT >> UCPTRIE_SHIFT_3; | |||
39 | ||||
40 | constexpr int32_t SMALL_DATA_BLOCKS_PER_BMP_BLOCK = (1 << (UCPTRIE_FAST_SHIFT - UCPTRIE_SHIFT_3)); | |||
41 | ||||
42 | // Flag values for data blocks. | |||
43 | constexpr uint8_t ALL_SAME = 0; | |||
44 | constexpr uint8_t MIXED = 1; | |||
45 | constexpr uint8_t SAME_AS = 2; | |||
46 | ||||
47 | /** Start with allocation of 16k data entries. */ | |||
48 | constexpr int32_t INITIAL_DATA_LENGTH = ((int32_t)1 << 14); | |||
49 | ||||
50 | /** Grow about 8x each time. */ | |||
51 | constexpr int32_t MEDIUM_DATA_LENGTH = ((int32_t)1 << 17); | |||
52 | ||||
53 | /** | |||
54 | * Maximum length of the build-time data array. | |||
55 | * One entry per 0x110000 code points. | |||
56 | */ | |||
57 | constexpr int32_t MAX_DATA_LENGTH = UNICODE_LIMIT; | |||
58 | ||||
59 | // Flag values for index-3 blocks while compacting/building. | |||
60 | constexpr uint8_t I3_NULL = 0; | |||
61 | constexpr uint8_t I3_BMP = 1; | |||
62 | constexpr uint8_t I3_16 = 2; | |||
63 | constexpr uint8_t I3_18 = 3; | |||
64 | ||||
65 | constexpr int32_t INDEX_3_18BIT_BLOCK_LENGTH = UCPTRIE_INDEX_3_BLOCK_LENGTH + UCPTRIE_INDEX_3_BLOCK_LENGTH / 8; | |||
66 | ||||
67 | class AllSameBlocks; | |||
68 | class MixedBlocks; | |||
69 | ||||
70 | class MutableCodePointTrie : public UMemory { | |||
71 | public: | |||
72 | MutableCodePointTrie(uint32_t initialValue, uint32_t errorValue, UErrorCode &errorCode); | |||
73 | MutableCodePointTrie(const MutableCodePointTrie &other, UErrorCode &errorCode); | |||
74 | MutableCodePointTrie(const MutableCodePointTrie &other) = delete; | |||
75 | ~MutableCodePointTrie(); | |||
76 | ||||
77 | MutableCodePointTrie &operator=(const MutableCodePointTrie &other) = delete; | |||
78 | ||||
79 | static MutableCodePointTrie *fromUCPMap(const UCPMap *map, UErrorCode &errorCode); | |||
80 | static MutableCodePointTrie *fromUCPTrie(const UCPTrie *trie, UErrorCode &errorCode); | |||
81 | ||||
82 | uint32_t get(UChar32 c) const; | |||
83 | int32_t getRange(UChar32 start, UCPMapValueFilter *filter, const void *context, | |||
84 | uint32_t *pValue) const; | |||
85 | ||||
86 | void set(UChar32 c, uint32_t value, UErrorCode &errorCode); | |||
87 | void setRange(UChar32 start, UChar32 end, uint32_t value, UErrorCode &errorCode); | |||
88 | ||||
89 | UCPTrie *build(UCPTrieType type, UCPTrieValueWidth valueWidth, UErrorCode &errorCode); | |||
90 | ||||
91 | private: | |||
92 | void clear(); | |||
93 | ||||
94 | bool ensureHighStart(UChar32 c); | |||
95 | int32_t allocDataBlock(int32_t blockLength); | |||
96 | int32_t getDataBlock(int32_t i); | |||
97 | ||||
98 | void maskValues(uint32_t mask); | |||
99 | UChar32 findHighStart() const; | |||
100 | int32_t compactWholeDataBlocks(int32_t fastILimit, AllSameBlocks &allSameBlocks); | |||
101 | int32_t compactData( | |||
102 | int32_t fastILimit, uint32_t *newData, int32_t newDataCapacity, | |||
103 | int32_t dataNullIndex, MixedBlocks &mixedBlocks, UErrorCode &errorCode); | |||
104 | int32_t compactIndex(int32_t fastILimit, MixedBlocks &mixedBlocks, UErrorCode &errorCode); | |||
105 | int32_t compactTrie(int32_t fastILimit, UErrorCode &errorCode); | |||
106 | ||||
107 | uint32_t *index = nullptr; | |||
108 | int32_t indexCapacity = 0; | |||
109 | int32_t index3NullOffset = -1; | |||
110 | uint32_t *data = nullptr; | |||
111 | int32_t dataCapacity = 0; | |||
112 | int32_t dataLength = 0; | |||
113 | int32_t dataNullOffset = -1; | |||
114 | ||||
115 | uint32_t origInitialValue; | |||
116 | uint32_t initialValue; | |||
117 | uint32_t errorValue; | |||
118 | UChar32 highStart; | |||
119 | uint32_t highValue; | |||
120 | #ifdef UCPTRIE_DEBUG | |||
121 | public: | |||
122 | const char *name; | |||
123 | #endif | |||
124 | private: | |||
125 | /** Temporary array while building the final data. */ | |||
126 | uint16_t *index16 = nullptr; | |||
127 | uint8_t flags[UNICODE_LIMIT >> UCPTRIE_SHIFT_3]; | |||
128 | }; | |||
129 | ||||
130 | MutableCodePointTrie::MutableCodePointTrie(uint32_t iniValue, uint32_t errValue, UErrorCode &errorCode) : | |||
131 | origInitialValue(iniValue), initialValue(iniValue), errorValue(errValue), | |||
132 | highStart(0), highValue(initialValue) | |||
133 | #ifdef UCPTRIE_DEBUG | |||
134 | , name("open") | |||
135 | #endif | |||
136 | { | |||
137 | if (U_FAILURE(errorCode)) { return; } | |||
138 | index = (uint32_t *)uprv_mallocuprv_malloc_71(BMP_I_LIMIT * 4); | |||
139 | data = (uint32_t *)uprv_mallocuprv_malloc_71(INITIAL_DATA_LENGTH * 4); | |||
140 | if (index == nullptr || data == nullptr) { | |||
141 | errorCode = U_MEMORY_ALLOCATION_ERROR; | |||
142 | return; | |||
143 | } | |||
144 | indexCapacity = BMP_I_LIMIT; | |||
145 | dataCapacity = INITIAL_DATA_LENGTH; | |||
146 | } | |||
147 | ||||
148 | MutableCodePointTrie::MutableCodePointTrie(const MutableCodePointTrie &other, UErrorCode &errorCode) : | |||
149 | index3NullOffset(other.index3NullOffset), | |||
150 | dataNullOffset(other.dataNullOffset), | |||
151 | origInitialValue(other.origInitialValue), initialValue(other.initialValue), | |||
152 | errorValue(other.errorValue), | |||
153 | highStart(other.highStart), highValue(other.highValue) | |||
154 | #ifdef UCPTRIE_DEBUG | |||
155 | , name("mutable clone") | |||
156 | #endif | |||
157 | { | |||
158 | if (U_FAILURE(errorCode)) { return; } | |||
159 | int32_t iCapacity = highStart <= BMP_LIMIT ? BMP_I_LIMIT : I_LIMIT; | |||
160 | index = (uint32_t *)uprv_mallocuprv_malloc_71(iCapacity * 4); | |||
161 | data = (uint32_t *)uprv_mallocuprv_malloc_71(other.dataCapacity * 4); | |||
162 | if (index == nullptr || data == nullptr) { | |||
163 | errorCode = U_MEMORY_ALLOCATION_ERROR; | |||
164 | return; | |||
165 | } | |||
166 | indexCapacity = iCapacity; | |||
167 | dataCapacity = other.dataCapacity; | |||
168 | ||||
169 | int32_t iLimit = highStart >> UCPTRIE_SHIFT_3; | |||
170 | uprv_memcpy(flags, other.flags, iLimit)do { clang diagnostic push
clang diagnostic ignored "-Waddress" (void)0; (void)0; clang diagnostic pop :: memcpy(flags, other .flags, iLimit); } while (false); | |||
171 | uprv_memcpy(index, other.index, iLimit * 4)do { clang diagnostic push
clang diagnostic ignored "-Waddress" (void)0; (void)0; clang diagnostic pop :: memcpy(index, other .index, iLimit * 4); } while (false); | |||
172 | uprv_memcpy(data, other.data, (size_t)other.dataLength * 4)do { clang diagnostic push
clang diagnostic ignored "-Waddress" (void)0; (void)0; clang diagnostic pop :: memcpy(data, other .data, (size_t)other.dataLength * 4); } while (false); | |||
173 | dataLength = other.dataLength; | |||
174 | U_ASSERT(other.index16 == nullptr)(void)0; | |||
175 | } | |||
176 | ||||
177 | MutableCodePointTrie::~MutableCodePointTrie() { | |||
178 | uprv_freeuprv_free_71(index); | |||
179 | uprv_freeuprv_free_71(data); | |||
180 | uprv_freeuprv_free_71(index16); | |||
181 | } | |||
182 | ||||
183 | MutableCodePointTrie *MutableCodePointTrie::fromUCPMap(const UCPMap *map, UErrorCode &errorCode) { | |||
184 | // Use the highValue as the initialValue to reduce the highStart. | |||
185 | uint32_t errorValue = ucpmap_getucpmap_get_71(map, -1); | |||
186 | uint32_t initialValue = ucpmap_getucpmap_get_71(map, 0x10ffff); | |||
187 | LocalPointer<MutableCodePointTrie> mutableTrie( | |||
188 | new MutableCodePointTrie(initialValue, errorValue, errorCode), | |||
189 | errorCode); | |||
190 | if (U_FAILURE(errorCode)) { | |||
191 | return nullptr; | |||
192 | } | |||
193 | UChar32 start = 0, end; | |||
194 | uint32_t value; | |||
195 | while ((end = ucpmap_getRangeucpmap_getRange_71(map, start, UCPMAP_RANGE_NORMAL, 0, | |||
196 | nullptr, nullptr, &value)) >= 0) { | |||
197 | if (value != initialValue) { | |||
198 | if (start == end) { | |||
199 | mutableTrie->set(start, value, errorCode); | |||
200 | } else { | |||
201 | mutableTrie->setRange(start, end, value, errorCode); | |||
202 | } | |||
203 | } | |||
204 | start = end + 1; | |||
205 | } | |||
206 | if (U_SUCCESS(errorCode)) { | |||
207 | return mutableTrie.orphan(); | |||
208 | } else { | |||
209 | return nullptr; | |||
210 | } | |||
211 | } | |||
212 | ||||
213 | MutableCodePointTrie *MutableCodePointTrie::fromUCPTrie(const UCPTrie *trie, UErrorCode &errorCode) { | |||
214 | // Use the highValue as the initialValue to reduce the highStart. | |||
215 | uint32_t errorValue; | |||
216 | uint32_t initialValue; | |||
217 | switch (trie->valueWidth) { | |||
218 | case UCPTRIE_VALUE_BITS_16: | |||
219 | errorValue = trie->data.ptr16[trie->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET]; | |||
220 | initialValue = trie->data.ptr16[trie->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET]; | |||
221 | break; | |||
222 | case UCPTRIE_VALUE_BITS_32: | |||
223 | errorValue = trie->data.ptr32[trie->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET]; | |||
224 | initialValue = trie->data.ptr32[trie->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET]; | |||
225 | break; | |||
226 | case UCPTRIE_VALUE_BITS_8: | |||
227 | errorValue = trie->data.ptr8[trie->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET]; | |||
228 | initialValue = trie->data.ptr8[trie->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET]; | |||
229 | break; | |||
230 | default: | |||
231 | // Unreachable if the trie is properly initialized. | |||
232 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; | |||
233 | return nullptr; | |||
234 | } | |||
235 | LocalPointer<MutableCodePointTrie> mutableTrie( | |||
236 | new MutableCodePointTrie(initialValue, errorValue, errorCode), | |||
237 | errorCode); | |||
238 | if (U_FAILURE(errorCode)) { | |||
239 | return nullptr; | |||
240 | } | |||
241 | UChar32 start = 0, end; | |||
242 | uint32_t value; | |||
243 | while ((end = ucptrie_getRangeucptrie_getRange_71(trie, start, UCPMAP_RANGE_NORMAL, 0, | |||
244 | nullptr, nullptr, &value)) >= 0) { | |||
245 | if (value != initialValue) { | |||
246 | if (start == end) { | |||
247 | mutableTrie->set(start, value, errorCode); | |||
248 | } else { | |||
249 | mutableTrie->setRange(start, end, value, errorCode); | |||
250 | } | |||
251 | } | |||
252 | start = end + 1; | |||
253 | } | |||
254 | if (U_SUCCESS(errorCode)) { | |||
255 | return mutableTrie.orphan(); | |||
256 | } else { | |||
257 | return nullptr; | |||
258 | } | |||
259 | } | |||
260 | ||||
261 | void MutableCodePointTrie::clear() { | |||
262 | index3NullOffset = dataNullOffset = -1; | |||
263 | dataLength = 0; | |||
264 | highValue = initialValue = origInitialValue; | |||
265 | highStart = 0; | |||
266 | uprv_freeuprv_free_71(index16); | |||
267 | index16 = nullptr; | |||
268 | } | |||
269 | ||||
270 | uint32_t MutableCodePointTrie::get(UChar32 c) const { | |||
271 | if ((uint32_t)c > MAX_UNICODE) { | |||
272 | return errorValue; | |||
273 | } | |||
274 | if (c >= highStart) { | |||
275 | return highValue; | |||
276 | } | |||
277 | int32_t i = c >> UCPTRIE_SHIFT_3; | |||
278 | if (flags[i] == ALL_SAME) { | |||
279 | return index[i]; | |||
280 | } else { | |||
281 | return data[index[i] + (c & UCPTRIE_SMALL_DATA_MASK)]; | |||
282 | } | |||
283 | } | |||
284 | ||||
285 | inline uint32_t maybeFilterValue(uint32_t value, uint32_t initialValue, uint32_t nullValue, | |||
286 | UCPMapValueFilter *filter, const void *context) { | |||
287 | if (value == initialValue) { | |||
288 | value = nullValue; | |||
289 | } else if (filter != nullptr) { | |||
290 | value = filter(context, value); | |||
291 | } | |||
292 | return value; | |||
293 | } | |||
294 | ||||
295 | UChar32 MutableCodePointTrie::getRange( | |||
296 | UChar32 start, UCPMapValueFilter *filter, const void *context, | |||
297 | uint32_t *pValue) const { | |||
298 | if ((uint32_t)start > MAX_UNICODE) { | |||
299 | return U_SENTINEL(-1); | |||
300 | } | |||
301 | if (start >= highStart) { | |||
302 | if (pValue != nullptr) { | |||
303 | uint32_t value = highValue; | |||
304 | if (filter != nullptr) { value = filter(context, value); } | |||
305 | *pValue = value; | |||
306 | } | |||
307 | return MAX_UNICODE; | |||
308 | } | |||
309 | uint32_t nullValue = initialValue; | |||
310 | if (filter != nullptr) { nullValue = filter(context, nullValue); } | |||
311 | UChar32 c = start; | |||
312 | uint32_t trieValue, value; | |||
313 | bool haveValue = false; | |||
314 | int32_t i = c >> UCPTRIE_SHIFT_3; | |||
315 | do { | |||
316 | if (flags[i] == ALL_SAME) { | |||
317 | uint32_t trieValue2 = index[i]; | |||
318 | if (haveValue) { | |||
319 | if (trieValue2 != trieValue) { | |||
320 | if (filter == nullptr || | |||
321 | maybeFilterValue(trieValue2, initialValue, nullValue, | |||
322 | filter, context) != value) { | |||
323 | return c - 1; | |||
324 | } | |||
325 | trieValue = trieValue2; // may or may not help | |||
326 | } | |||
327 | } else { | |||
328 | trieValue = trieValue2; | |||
329 | value = maybeFilterValue(trieValue2, initialValue, nullValue, filter, context); | |||
330 | if (pValue != nullptr) { *pValue = value; } | |||
331 | haveValue = true; | |||
332 | } | |||
333 | c = (c + UCPTRIE_SMALL_DATA_BLOCK_LENGTH) & ~UCPTRIE_SMALL_DATA_MASK; | |||
334 | } else /* MIXED */ { | |||
335 | int32_t di = index[i] + (c & UCPTRIE_SMALL_DATA_MASK); | |||
336 | uint32_t trieValue2 = data[di]; | |||
337 | if (haveValue) { | |||
338 | if (trieValue2 != trieValue) { | |||
339 | if (filter == nullptr || | |||
340 | maybeFilterValue(trieValue2, initialValue, nullValue, | |||
341 | filter, context) != value) { | |||
342 | return c - 1; | |||
343 | } | |||
344 | trieValue = trieValue2; // may or may not help | |||
345 | } | |||
346 | } else { | |||
347 | trieValue = trieValue2; | |||
348 | value = maybeFilterValue(trieValue2, initialValue, nullValue, filter, context); | |||
349 | if (pValue != nullptr) { *pValue = value; } | |||
350 | haveValue = true; | |||
351 | } | |||
352 | while ((++c & UCPTRIE_SMALL_DATA_MASK) != 0) { | |||
353 | trieValue2 = data[++di]; | |||
354 | if (trieValue2 != trieValue) { | |||
355 | if (filter == nullptr || | |||
356 | maybeFilterValue(trieValue2, initialValue, nullValue, | |||
357 | filter, context) != value) { | |||
358 | return c - 1; | |||
359 | } | |||
360 | } | |||
361 | trieValue = trieValue2; // may or may not help | |||
362 | } | |||
363 | } | |||
364 | ++i; | |||
365 | } while (c < highStart); | |||
366 | U_ASSERT(haveValue)(void)0; | |||
367 | if (maybeFilterValue(highValue, initialValue, nullValue, | |||
368 | filter, context) != value) { | |||
369 | return c - 1; | |||
370 | } else { | |||
371 | return MAX_UNICODE; | |||
372 | } | |||
373 | } | |||
374 | ||||
375 | void | |||
376 | writeBlock(uint32_t *block, uint32_t value) { | |||
377 | uint32_t *limit = block + UCPTRIE_SMALL_DATA_BLOCK_LENGTH; | |||
378 | while (block < limit) { | |||
379 | *block++ = value; | |||
380 | } | |||
381 | } | |||
382 | ||||
383 | bool MutableCodePointTrie::ensureHighStart(UChar32 c) { | |||
384 | if (c >= highStart) { | |||
385 | // Round up to a UCPTRIE_CP_PER_INDEX_2_ENTRY boundary to simplify compaction. | |||
386 | c = (c + UCPTRIE_CP_PER_INDEX_2_ENTRY) & ~(UCPTRIE_CP_PER_INDEX_2_ENTRY - 1); | |||
387 | int32_t i = highStart >> UCPTRIE_SHIFT_3; | |||
388 | int32_t iLimit = c >> UCPTRIE_SHIFT_3; | |||
389 | if (iLimit > indexCapacity) { | |||
390 | uint32_t *newIndex = (uint32_t *)uprv_mallocuprv_malloc_71(I_LIMIT * 4); | |||
391 | if (newIndex == nullptr) { return false; } | |||
392 | uprv_memcpy(newIndex, index, i * 4)do { clang diagnostic push
clang diagnostic ignored "-Waddress" (void)0; (void)0; clang diagnostic pop :: memcpy(newIndex, index, i * 4); } while (false); | |||
393 | uprv_freeuprv_free_71(index); | |||
394 | index = newIndex; | |||
395 | indexCapacity = I_LIMIT; | |||
396 | } | |||
397 | do { | |||
398 | flags[i] = ALL_SAME; | |||
399 | index[i] = initialValue; | |||
400 | } while(++i < iLimit); | |||
401 | highStart = c; | |||
402 | } | |||
403 | return true; | |||
404 | } | |||
405 | ||||
406 | int32_t MutableCodePointTrie::allocDataBlock(int32_t blockLength) { | |||
407 | int32_t newBlock = dataLength; | |||
408 | int32_t newTop = newBlock + blockLength; | |||
409 | if (newTop > dataCapacity) { | |||
410 | int32_t capacity; | |||
411 | if (dataCapacity < MEDIUM_DATA_LENGTH) { | |||
412 | capacity = MEDIUM_DATA_LENGTH; | |||
413 | } else if (dataCapacity < MAX_DATA_LENGTH) { | |||
414 | capacity = MAX_DATA_LENGTH; | |||
415 | } else { | |||
416 | // Should never occur. | |||
417 | // Either MAX_DATA_LENGTH is incorrect, | |||
418 | // or the code writes more values than should be possible. | |||
419 | return -1; | |||
420 | } | |||
421 | uint32_t *newData = (uint32_t *)uprv_mallocuprv_malloc_71(capacity * 4); | |||
422 | if (newData == nullptr) { | |||
423 | return -1; | |||
424 | } | |||
425 | uprv_memcpy(newData, data, (size_t)dataLength * 4)do { clang diagnostic push
clang diagnostic ignored "-Waddress" (void)0; (void)0; clang diagnostic pop :: memcpy(newData, data , (size_t)dataLength * 4); } while (false); | |||
426 | uprv_freeuprv_free_71(data); | |||
427 | data = newData; | |||
428 | dataCapacity = capacity; | |||
429 | } | |||
430 | dataLength = newTop; | |||
431 | return newBlock; | |||
432 | } | |||
433 | ||||
434 | /** | |||
435 | * No error checking for illegal arguments. | |||
436 | * | |||
437 | * @return -1 if no new data block available (out of memory in data array) | |||
438 | * @internal | |||
439 | */ | |||
440 | int32_t MutableCodePointTrie::getDataBlock(int32_t i) { | |||
441 | if (flags[i] == MIXED) { | |||
442 | return index[i]; | |||
443 | } | |||
444 | if (i < BMP_I_LIMIT) { | |||
445 | int32_t newBlock = allocDataBlock(UCPTRIE_FAST_DATA_BLOCK_LENGTH); | |||
446 | if (newBlock < 0) { return newBlock; } | |||
447 | int32_t iStart = i & ~(SMALL_DATA_BLOCKS_PER_BMP_BLOCK -1); | |||
448 | int32_t iLimit = iStart + SMALL_DATA_BLOCKS_PER_BMP_BLOCK; | |||
449 | do { | |||
450 | U_ASSERT(flags[iStart] == ALL_SAME)(void)0; | |||
451 | writeBlock(data + newBlock, index[iStart]); | |||
452 | flags[iStart] = MIXED; | |||
453 | index[iStart++] = newBlock; | |||
454 | newBlock += UCPTRIE_SMALL_DATA_BLOCK_LENGTH; | |||
455 | } while (iStart < iLimit); | |||
456 | return index[i]; | |||
457 | } else { | |||
458 | int32_t newBlock = allocDataBlock(UCPTRIE_SMALL_DATA_BLOCK_LENGTH); | |||
459 | if (newBlock < 0) { return newBlock; } | |||
460 | writeBlock(data + newBlock, index[i]); | |||
461 | flags[i] = MIXED; | |||
462 | index[i] = newBlock; | |||
463 | return newBlock; | |||
464 | } | |||
465 | } | |||
466 | ||||
467 | void MutableCodePointTrie::set(UChar32 c, uint32_t value, UErrorCode &errorCode) { | |||
468 | if (U_FAILURE(errorCode)) { | |||
469 | return; | |||
470 | } | |||
471 | if ((uint32_t)c > MAX_UNICODE) { | |||
472 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; | |||
473 | return; | |||
474 | } | |||
475 | ||||
476 | int32_t block; | |||
477 | if (!ensureHighStart(c) || (block = getDataBlock(c >> UCPTRIE_SHIFT_3)) < 0) { | |||
478 | errorCode = U_MEMORY_ALLOCATION_ERROR; | |||
479 | return; | |||
480 | } | |||
481 | ||||
482 | data[block + (c & UCPTRIE_SMALL_DATA_MASK)] = value; | |||
483 | } | |||
484 | ||||
485 | void | |||
486 | fillBlock(uint32_t *block, UChar32 start, UChar32 limit, uint32_t value) { | |||
487 | uint32_t *pLimit = block + limit; | |||
488 | block += start; | |||
489 | while (block < pLimit) { | |||
490 | *block++ = value; | |||
491 | } | |||
492 | } | |||
493 | ||||
494 | void MutableCodePointTrie::setRange(UChar32 start, UChar32 end, uint32_t value, UErrorCode &errorCode) { | |||
495 | if (U_FAILURE(errorCode)) { | |||
496 | return; | |||
497 | } | |||
498 | if ((uint32_t)start > MAX_UNICODE || (uint32_t)end > MAX_UNICODE || start > end) { | |||
499 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; | |||
500 | return; | |||
501 | } | |||
502 | if (!ensureHighStart(end)) { | |||
503 | errorCode = U_MEMORY_ALLOCATION_ERROR; | |||
504 | return; | |||
505 | } | |||
506 | ||||
507 | UChar32 limit = end + 1; | |||
508 | if (start & UCPTRIE_SMALL_DATA_MASK) { | |||
509 | // Set partial block at [start..following block boundary[. | |||
510 | int32_t block = getDataBlock(start >> UCPTRIE_SHIFT_3); | |||
511 | if (block < 0) { | |||
512 | errorCode = U_MEMORY_ALLOCATION_ERROR; | |||
513 | return; | |||
514 | } | |||
515 | ||||
516 | UChar32 nextStart = (start + UCPTRIE_SMALL_DATA_MASK) & ~UCPTRIE_SMALL_DATA_MASK; | |||
517 | if (nextStart <= limit) { | |||
518 | fillBlock(data + block, start & UCPTRIE_SMALL_DATA_MASK, UCPTRIE_SMALL_DATA_BLOCK_LENGTH, | |||
519 | value); | |||
520 | start = nextStart; | |||
521 | } else { | |||
522 | fillBlock(data + block, start & UCPTRIE_SMALL_DATA_MASK, limit & UCPTRIE_SMALL_DATA_MASK, | |||
523 | value); | |||
524 | return; | |||
525 | } | |||
526 | } | |||
527 | ||||
528 | // Number of positions in the last, partial block. | |||
529 | int32_t rest = limit & UCPTRIE_SMALL_DATA_MASK; | |||
530 | ||||
531 | // Round down limit to a block boundary. | |||
532 | limit &= ~UCPTRIE_SMALL_DATA_MASK; | |||
533 | ||||
534 | // Iterate over all-value blocks. | |||
535 | while (start < limit) { | |||
536 | int32_t i = start >> UCPTRIE_SHIFT_3; | |||
537 | if (flags[i] == ALL_SAME) { | |||
538 | index[i] = value; | |||
539 | } else /* MIXED */ { | |||
540 | fillBlock(data + index[i], 0, UCPTRIE_SMALL_DATA_BLOCK_LENGTH, value); | |||
541 | } | |||
542 | start += UCPTRIE_SMALL_DATA_BLOCK_LENGTH; | |||
543 | } | |||
544 | ||||
545 | if (rest > 0) { | |||
546 | // Set partial block at [last block boundary..limit[. | |||
547 | int32_t block = getDataBlock(start >> UCPTRIE_SHIFT_3); | |||
548 | if (block < 0) { | |||
549 | errorCode = U_MEMORY_ALLOCATION_ERROR; | |||
550 | return; | |||
551 | } | |||
552 | ||||
553 | fillBlock(data + block, 0, rest, value); | |||
554 | } | |||
555 | } | |||
556 | ||||
557 | /* compaction --------------------------------------------------------------- */ | |||
558 | ||||
559 | void MutableCodePointTrie::maskValues(uint32_t mask) { | |||
560 | initialValue &= mask; | |||
561 | errorValue &= mask; | |||
562 | highValue &= mask; | |||
563 | int32_t iLimit = highStart >> UCPTRIE_SHIFT_3; | |||
564 | for (int32_t i = 0; i < iLimit; ++i) { | |||
565 | if (flags[i] == ALL_SAME) { | |||
566 | index[i] &= mask; | |||
567 | } | |||
568 | } | |||
569 | for (int32_t i = 0; i < dataLength; ++i) { | |||
570 | data[i] &= mask; | |||
571 | } | |||
572 | } | |||
573 | ||||
574 | template<typename UIntA, typename UIntB> | |||
575 | bool equalBlocks(const UIntA *s, const UIntB *t, int32_t length) { | |||
576 | while (length > 0 && *s == *t) { | |||
577 | ++s; | |||
578 | ++t; | |||
579 | --length; | |||
580 | } | |||
581 | return length == 0; | |||
582 | } | |||
583 | ||||
584 | bool allValuesSameAs(const uint32_t *p, int32_t length, uint32_t value) { | |||
585 | const uint32_t *pLimit = p + length; | |||
586 | while (p < pLimit && *p == value) { ++p; } | |||
587 | return p == pLimit; | |||
588 | } | |||
589 | ||||
590 | /** Search for an identical block. */ | |||
591 | int32_t findSameBlock(const uint16_t *p, int32_t pStart, int32_t length, | |||
592 | const uint16_t *q, int32_t qStart, int32_t blockLength) { | |||
593 | // Ensure that we do not even partially get past length. | |||
594 | length -= blockLength; | |||
595 | ||||
596 | q += qStart; | |||
597 | while (pStart <= length) { | |||
598 | if (equalBlocks(p + pStart, q, blockLength)) { | |||
599 | return pStart; | |||
600 | } | |||
601 | ++pStart; | |||
602 | } | |||
603 | return -1; | |||
604 | } | |||
605 | ||||
606 | int32_t findAllSameBlock(const uint32_t *p, int32_t start, int32_t limit, | |||
607 | uint32_t value, int32_t blockLength) { | |||
608 | // Ensure that we do not even partially get past limit. | |||
609 | limit -= blockLength; | |||
610 | ||||
611 | for (int32_t block = start; block <= limit; ++block) { | |||
612 | if (p[block] == value) { | |||
613 | for (int32_t i = 1;; ++i) { | |||
614 | if (i == blockLength) { | |||
615 | return block; | |||
616 | } | |||
617 | if (p[block + i] != value) { | |||
618 | block += i; | |||
619 | break; | |||
620 | } | |||
621 | } | |||
622 | } | |||
623 | } | |||
624 | return -1; | |||
625 | } | |||
626 | ||||
627 | /** | |||
628 | * Look for maximum overlap of the beginning of the other block | |||
629 | * with the previous, adjacent block. | |||
630 | */ | |||
631 | template<typename UIntA, typename UIntB> | |||
632 | int32_t getOverlap(const UIntA *p, int32_t length, | |||
633 | const UIntB *q, int32_t qStart, int32_t blockLength) { | |||
634 | int32_t overlap = blockLength - 1; | |||
635 | U_ASSERT(overlap <= length)(void)0; | |||
636 | q += qStart; | |||
637 | while (overlap > 0 && !equalBlocks(p + (length - overlap), q, overlap)) { | |||
638 | --overlap; | |||
639 | } | |||
640 | return overlap; | |||
641 | } | |||
642 | ||||
643 | int32_t getAllSameOverlap(const uint32_t *p, int32_t length, uint32_t value, | |||
644 | int32_t blockLength) { | |||
645 | int32_t min = length - (blockLength - 1); | |||
646 | int32_t i = length; | |||
647 | while (min < i && p[i - 1] == value) { --i; } | |||
648 | return length - i; | |||
649 | } | |||
650 | ||||
651 | bool isStartOfSomeFastBlock(uint32_t dataOffset, const uint32_t index[], int32_t fastILimit) { | |||
652 | for (int32_t i = 0; i < fastILimit; i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK) { | |||
653 | if (index[i] == dataOffset) { | |||
654 | return true; | |||
655 | } | |||
656 | } | |||
657 | return false; | |||
658 | } | |||
659 | ||||
660 | /** | |||
661 | * Finds the start of the last range in the trie by enumerating backward. | |||
662 | * Indexes for code points higher than this will be omitted. | |||
663 | */ | |||
664 | UChar32 MutableCodePointTrie::findHighStart() const { | |||
665 | int32_t i = highStart >> UCPTRIE_SHIFT_3; | |||
666 | while (i > 0) { | |||
667 | bool match; | |||
668 | if (flags[--i] == ALL_SAME) { | |||
669 | match = index[i] == highValue; | |||
670 | } else /* MIXED */ { | |||
671 | const uint32_t *p = data + index[i]; | |||
672 | for (int32_t j = 0;; ++j) { | |||
673 | if (j == UCPTRIE_SMALL_DATA_BLOCK_LENGTH) { | |||
674 | match = true; | |||
675 | break; | |||
676 | } | |||
677 | if (p[j] != highValue) { | |||
678 | match = false; | |||
679 | break; | |||
680 | } | |||
681 | } | |||
682 | } | |||
683 | if (!match) { | |||
684 | return (i + 1) << UCPTRIE_SHIFT_3; | |||
685 | } | |||
686 | } | |||
687 | return 0; | |||
688 | } | |||
689 | ||||
690 | class AllSameBlocks { | |||
691 | public: | |||
692 | static constexpr int32_t NEW_UNIQUE = -1; | |||
693 | static constexpr int32_t OVERFLOW = -2; | |||
694 | ||||
695 | AllSameBlocks() : length(0), mostRecent(-1) {} | |||
696 | ||||
697 | int32_t findOrAdd(int32_t index, int32_t count, uint32_t value) { | |||
698 | if (mostRecent >= 0 && values[mostRecent] == value) { | |||
699 | refCounts[mostRecent] += count; | |||
700 | return indexes[mostRecent]; | |||
701 | } | |||
702 | for (int32_t i = 0; i < length; ++i) { | |||
703 | if (values[i] == value) { | |||
704 | mostRecent = i; | |||
705 | refCounts[i] += count; | |||
706 | return indexes[i]; | |||
707 | } | |||
708 | } | |||
709 | if (length == CAPACITY) { | |||
710 | return OVERFLOW; | |||
711 | } | |||
712 | mostRecent = length; | |||
713 | indexes[length] = index; | |||
714 | values[length] = value; | |||
715 | refCounts[length++] = count; | |||
716 | return NEW_UNIQUE; | |||
717 | } | |||
718 | ||||
719 | /** Replaces the block which has the lowest reference count. */ | |||
720 | void add(int32_t index, int32_t count, uint32_t value) { | |||
721 | U_ASSERT(length == CAPACITY)(void)0; | |||
722 | int32_t least = -1; | |||
723 | int32_t leastCount = I_LIMIT; | |||
724 | for (int32_t i = 0; i < length; ++i) { | |||
725 | U_ASSERT(values[i] != value)(void)0; | |||
726 | if (refCounts[i] < leastCount) { | |||
727 | least = i; | |||
728 | leastCount = refCounts[i]; | |||
729 | } | |||
730 | } | |||
731 | U_ASSERT(least >= 0)(void)0; | |||
732 | mostRecent = least; | |||
733 | indexes[least] = index; | |||
734 | values[least] = value; | |||
735 | refCounts[least] = count; | |||
736 | } | |||
737 | ||||
738 | int32_t findMostUsed() const { | |||
739 | if (length == 0) { return -1; } | |||
740 | int32_t max = -1; | |||
741 | int32_t maxCount = 0; | |||
742 | for (int32_t i = 0; i < length; ++i) { | |||
743 | if (refCounts[i] > maxCount) { | |||
744 | max = i; | |||
745 | maxCount = refCounts[i]; | |||
746 | } | |||
747 | } | |||
748 | return indexes[max]; | |||
749 | } | |||
750 | ||||
751 | private: | |||
752 | static constexpr int32_t CAPACITY = 32; | |||
753 | ||||
754 | int32_t length; | |||
755 | int32_t mostRecent; | |||
756 | ||||
757 | int32_t indexes[CAPACITY]; | |||
758 | uint32_t values[CAPACITY]; | |||
759 | int32_t refCounts[CAPACITY]; | |||
760 | }; | |||
761 | ||||
762 | // Custom hash table for mixed-value blocks to be found anywhere in the | |||
763 | // compacted data or index so far. | |||
764 | class MixedBlocks { | |||
765 | public: | |||
766 | MixedBlocks() {} | |||
767 | ~MixedBlocks() { | |||
768 | uprv_freeuprv_free_71(table); | |||
769 | } | |||
770 | ||||
771 | bool init(int32_t maxLength, int32_t newBlockLength) { | |||
772 | // We store actual data indexes + 1 to reserve 0 for empty entries. | |||
773 | int32_t maxDataIndex = maxLength - newBlockLength + 1; | |||
774 | int32_t newLength; | |||
775 | if (maxDataIndex <= 0xfff) { // 4k | |||
776 | newLength = 6007; | |||
777 | shift = 12; | |||
778 | mask = 0xfff; | |||
779 | } else if (maxDataIndex <= 0x7fff) { // 32k | |||
780 | newLength = 50021; | |||
781 | shift = 15; | |||
782 | mask = 0x7fff; | |||
783 | } else if (maxDataIndex <= 0x1ffff) { // 128k | |||
784 | newLength = 200003; | |||
785 | shift = 17; | |||
786 | mask = 0x1ffff; | |||
787 | } else { | |||
788 | // maxDataIndex up to around MAX_DATA_LENGTH, ca. 1.1M | |||
789 | newLength = 1500007; | |||
790 | shift = 21; | |||
791 | mask = 0x1fffff; | |||
792 | } | |||
793 | if (newLength > capacity) { | |||
794 | uprv_freeuprv_free_71(table); | |||
795 | table = (uint32_t *)uprv_mallocuprv_malloc_71(newLength * 4); | |||
796 | if (table == nullptr) { | |||
797 | return false; | |||
798 | } | |||
799 | capacity = newLength; | |||
800 | } | |||
801 | length = newLength; | |||
802 | uprv_memset(table, 0, length * 4):: memset(table, 0, length * 4); | |||
803 | ||||
804 | blockLength = newBlockLength; | |||
805 | return true; | |||
806 | } | |||
807 | ||||
808 | template<typename UInt> | |||
809 | void extend(const UInt *data, int32_t minStart, int32_t prevDataLength, int32_t newDataLength) { | |||
810 | int32_t start = prevDataLength - blockLength; | |||
811 | if (start
| |||
812 | ++start; // Skip the last block that we added last time. | |||
813 | } else { | |||
814 | start = minStart; // Begin with the first full block. | |||
815 | } | |||
816 | for (int32_t end = newDataLength - blockLength; start <= end; ++start) { | |||
817 | uint32_t hashCode = makeHashCode(data, start); | |||
818 | addEntry(data, start, hashCode, start); | |||
819 | } | |||
820 | } | |||
821 | ||||
822 | template<typename UIntA, typename UIntB> | |||
823 | int32_t findBlock(const UIntA *data, const UIntB *blockData, int32_t blockStart) const { | |||
824 | uint32_t hashCode = makeHashCode(blockData, blockStart); | |||
825 | int32_t entryIndex = findEntry(data, blockData, blockStart, hashCode); | |||
826 | if (entryIndex >= 0) { | |||
827 | return (table[entryIndex] & mask) - 1; | |||
828 | } else { | |||
829 | return -1; | |||
830 | } | |||
831 | } | |||
832 | ||||
833 | int32_t findAllSameBlock(const uint32_t *data, uint32_t blockValue) const { | |||
834 | uint32_t hashCode = makeHashCode(blockValue); | |||
835 | int32_t entryIndex = findEntry(data, blockValue, hashCode); | |||
836 | if (entryIndex >= 0) { | |||
837 | return (table[entryIndex] & mask) - 1; | |||
838 | } else { | |||
839 | return -1; | |||
840 | } | |||
841 | } | |||
842 | ||||
843 | private: | |||
844 | template<typename UInt> | |||
845 | uint32_t makeHashCode(const UInt *blockData, int32_t blockStart) const { | |||
846 | int32_t blockLimit = blockStart + blockLength; | |||
847 | uint32_t hashCode = blockData[blockStart++]; | |||
| ||||
848 | do { | |||
849 | hashCode = 37 * hashCode + blockData[blockStart++]; | |||
850 | } while (blockStart < blockLimit); | |||
851 | return hashCode; | |||
852 | } | |||
853 | ||||
854 | uint32_t makeHashCode(uint32_t blockValue) const { | |||
855 | uint32_t hashCode = blockValue; | |||
856 | for (int32_t i = 1; i < blockLength; ++i) { | |||
857 | hashCode = 37 * hashCode + blockValue; | |||
858 | } | |||
859 | return hashCode; | |||
860 | } | |||
861 | ||||
862 | template<typename UInt> | |||
863 | void addEntry(const UInt *data, int32_t blockStart, uint32_t hashCode, int32_t dataIndex) { | |||
864 | U_ASSERT(0 <= dataIndex && dataIndex < (int32_t)mask)(void)0; | |||
865 | int32_t entryIndex = findEntry(data, data, blockStart, hashCode); | |||
866 | if (entryIndex < 0) { | |||
867 | table[~entryIndex] = (hashCode << shift) | (dataIndex + 1); | |||
868 | } | |||
869 | } | |||
870 | ||||
871 | template<typename UIntA, typename UIntB> | |||
872 | int32_t findEntry(const UIntA *data, const UIntB *blockData, int32_t blockStart, | |||
873 | uint32_t hashCode) const { | |||
874 | uint32_t shiftedHashCode = hashCode << shift; | |||
875 | int32_t initialEntryIndex = (hashCode % (length - 1)) + 1; // 1..length-1 | |||
876 | for (int32_t entryIndex = initialEntryIndex;;) { | |||
877 | uint32_t entry = table[entryIndex]; | |||
878 | if (entry == 0) { | |||
879 | return ~entryIndex; | |||
880 | } | |||
881 | if ((entry & ~mask) == shiftedHashCode) { | |||
882 | int32_t dataIndex = (entry & mask) - 1; | |||
883 | if (equalBlocks(data + dataIndex, blockData + blockStart, blockLength)) { | |||
884 | return entryIndex; | |||
885 | } | |||
886 | } | |||
887 | entryIndex = nextIndex(initialEntryIndex, entryIndex); | |||
888 | } | |||
889 | } | |||
890 | ||||
891 | int32_t findEntry(const uint32_t *data, uint32_t blockValue, uint32_t hashCode) const { | |||
892 | uint32_t shiftedHashCode = hashCode << shift; | |||
893 | int32_t initialEntryIndex = (hashCode % (length - 1)) + 1; // 1..length-1 | |||
894 | for (int32_t entryIndex = initialEntryIndex;;) { | |||
895 | uint32_t entry = table[entryIndex]; | |||
896 | if (entry == 0) { | |||
897 | return ~entryIndex; | |||
898 | } | |||
899 | if ((entry & ~mask) == shiftedHashCode) { | |||
900 | int32_t dataIndex = (entry & mask) - 1; | |||
901 | if (allValuesSameAs(data + dataIndex, blockLength, blockValue)) { | |||
902 | return entryIndex; | |||
903 | } | |||
904 | } | |||
905 | entryIndex = nextIndex(initialEntryIndex, entryIndex); | |||
906 | } | |||
907 | } | |||
908 | ||||
909 | inline int32_t nextIndex(int32_t initialEntryIndex, int32_t entryIndex) const { | |||
910 | // U_ASSERT(0 < initialEntryIndex && initialEntryIndex < length); | |||
911 | return (entryIndex + initialEntryIndex) % length; | |||
912 | } | |||
913 | ||||
914 | // Hash table. | |||
915 | // The length is a prime number, larger than the maximum data length. | |||
916 | // The "shift" lower bits store a data index + 1. | |||
917 | // The remaining upper bits store a partial hashCode of the block data values. | |||
918 | uint32_t *table = nullptr; | |||
919 | int32_t capacity = 0; | |||
920 | int32_t length = 0; | |||
921 | int32_t shift = 0; | |||
922 | uint32_t mask = 0; | |||
923 | ||||
924 | int32_t blockLength = 0; | |||
925 | }; | |||
926 | ||||
927 | int32_t MutableCodePointTrie::compactWholeDataBlocks(int32_t fastILimit, AllSameBlocks &allSameBlocks) { | |||
928 | #ifdef UCPTRIE_DEBUG | |||
929 | bool overflow = false; | |||
930 | #endif | |||
931 | ||||
932 | // ASCII data will be stored as a linear table, even if the following code | |||
933 | // does not yet count it that way. | |||
934 | int32_t newDataCapacity = ASCII_LIMIT; | |||
935 | // Add room for a small data null block in case it would match the start of | |||
936 | // a fast data block where dataNullOffset must not be set in that case. | |||
937 | newDataCapacity += UCPTRIE_SMALL_DATA_BLOCK_LENGTH; | |||
938 | // Add room for special values (errorValue, highValue) and padding. | |||
939 | newDataCapacity += 4; | |||
940 | int32_t iLimit = highStart >> UCPTRIE_SHIFT_3; | |||
941 | int32_t blockLength = UCPTRIE_FAST_DATA_BLOCK_LENGTH; | |||
942 | int32_t inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK; | |||
943 | for (int32_t i = 0; i < iLimit; i += inc) { | |||
944 | if (i == fastILimit) { | |||
945 | blockLength = UCPTRIE_SMALL_DATA_BLOCK_LENGTH; | |||
946 | inc = 1; | |||
947 | } | |||
948 | uint32_t value = index[i]; | |||
949 | if (flags[i] == MIXED) { | |||
950 | // Really mixed? | |||
951 | const uint32_t *p = data + value; | |||
952 | value = *p; | |||
953 | if (allValuesSameAs(p + 1, blockLength - 1, value)) { | |||
954 | flags[i] = ALL_SAME; | |||
955 | index[i] = value; | |||
956 | // Fall through to ALL_SAME handling. | |||
957 | } else { | |||
958 | newDataCapacity += blockLength; | |||
959 | continue; | |||
960 | } | |||
961 | } else { | |||
962 | U_ASSERT(flags[i] == ALL_SAME)(void)0; | |||
963 | if (inc > 1) { | |||
964 | // Do all of the fast-range data block's ALL_SAME parts have the same value? | |||
965 | bool allSame = true; | |||
966 | int32_t next_i = i + inc; | |||
967 | for (int32_t j = i + 1; j < next_i; ++j) { | |||
968 | U_ASSERT(flags[j] == ALL_SAME)(void)0; | |||
969 | if (index[j] != value) { | |||
970 | allSame = false; | |||
971 | break; | |||
972 | } | |||
973 | } | |||
974 | if (!allSame) { | |||
975 | // Turn it into a MIXED block. | |||
976 | if (getDataBlock(i) < 0) { | |||
977 | return -1; | |||
978 | } | |||
979 | newDataCapacity += blockLength; | |||
980 | continue; | |||
981 | } | |||
982 | } | |||
983 | } | |||
984 | // Is there another ALL_SAME block with the same value? | |||
985 | int32_t other = allSameBlocks.findOrAdd(i, inc, value); | |||
986 | if (other == AllSameBlocks::OVERFLOW) { | |||
987 | // The fixed-size array overflowed. Slow check for a duplicate block. | |||
988 | #ifdef UCPTRIE_DEBUG | |||
989 | if (!overflow) { | |||
990 | puts("UCPTrie AllSameBlocks overflow"); | |||
991 | overflow = true; | |||
992 | } | |||
993 | #endif | |||
994 | int32_t jInc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK; | |||
995 | for (int32_t j = 0;; j += jInc) { | |||
996 | if (j == i) { | |||
997 | allSameBlocks.add(i, inc, value); | |||
998 | break; | |||
999 | } | |||
1000 | if (j == fastILimit) { | |||
1001 | jInc = 1; | |||
1002 | } | |||
1003 | if (flags[j] == ALL_SAME && index[j] == value) { | |||
1004 | allSameBlocks.add(j, jInc + inc, value); | |||
1005 | other = j; | |||
1006 | break; | |||
1007 | // We could keep counting blocks with the same value | |||
1008 | // before we add the first one, which may improve compaction in rare cases, | |||
1009 | // but it would make it slower. | |||
1010 | } | |||
1011 | } | |||
1012 | } | |||
1013 | if (other >= 0) { | |||
1014 | flags[i] = SAME_AS; | |||
1015 | index[i] = other; | |||
1016 | } else { | |||
1017 | // New unique same-value block. | |||
1018 | newDataCapacity += blockLength; | |||
1019 | } | |||
1020 | } | |||
1021 | return newDataCapacity; | |||
1022 | } | |||
1023 | ||||
1024 | #ifdef UCPTRIE_DEBUG | |||
1025 | # define DEBUG_DO(expr) expr | |||
1026 | #else | |||
1027 | # define DEBUG_DO(expr) | |||
1028 | #endif | |||
1029 | ||||
1030 | #ifdef UCPTRIE_DEBUG | |||
1031 | // Braille symbols: U+28xx = UTF-8 E2 A0 80..E2 A3 BF | |||
1032 | int32_t appendValue(char s[], int32_t length, uint32_t value) { | |||
1033 | value ^= value >> 16; | |||
1034 | value ^= value >> 8; | |||
1035 | s[length] = 0xE2; | |||
1036 | s[length + 1] = (char)(0xA0 + ((value >> 6) & 3)); | |||
1037 | s[length + 2] = (char)(0x80 + (value & 0x3F)); | |||
1038 | return length + 3; | |||
1039 | } | |||
1040 | ||||
1041 | void printBlock(const uint32_t *block, int32_t blockLength, uint32_t value, | |||
1042 | UChar32 start, int32_t overlap, uint32_t initialValue) { | |||
1043 | char s[UCPTRIE_FAST_DATA_BLOCK_LENGTH * 3 + 3]; | |||
1044 | int32_t length = 0; | |||
1045 | int32_t i; | |||
1046 | for (i = 0; i < overlap; ++i) { | |||
1047 | length = appendValue(s, length, 0); // Braille blank | |||
1048 | } | |||
1049 | s[length++] = '|'; | |||
1050 | for (; i < blockLength; ++i) { | |||
1051 | if (block != nullptr) { | |||
1052 | value = block[i]; | |||
1053 | } | |||
1054 | if (value == initialValue) { | |||
1055 | value = 0x40; // Braille lower left dot | |||
1056 | } | |||
1057 | length = appendValue(s, length, value); | |||
1058 | } | |||
1059 | s[length] = 0; | |||
1060 | start += overlap; | |||
1061 | if (start <= 0xffff) { | |||
1062 | printf(" %04lX %s|\n", (long)start, s); | |||
1063 | } else if (start <= 0xfffff) { | |||
1064 | printf(" %5lX %s|\n", (long)start, s); | |||
1065 | } else { | |||
1066 | printf(" %6lX %s|\n", (long)start, s); | |||
1067 | } | |||
1068 | } | |||
1069 | #endif | |||
1070 | ||||
1071 | /** | |||
1072 | * Compacts a build-time trie. | |||
1073 | * | |||
1074 | * The compaction | |||
1075 | * - removes blocks that are identical with earlier ones | |||
1076 | * - overlaps each new non-duplicate block as much as possible with the previously-written one | |||
1077 | * - works with fast-range data blocks whose length is a multiple of that of | |||
1078 | * higher-code-point data blocks | |||
1079 | * | |||
1080 | * It does not try to find an optimal order of writing, deduplicating, and overlapping blocks. | |||
1081 | */ | |||
1082 | int32_t MutableCodePointTrie::compactData( | |||
1083 | int32_t fastILimit, uint32_t *newData, int32_t newDataCapacity, | |||
1084 | int32_t dataNullIndex, MixedBlocks &mixedBlocks, UErrorCode &errorCode) { | |||
1085 | #ifdef UCPTRIE_DEBUG | |||
1086 | int32_t countSame=0, sumOverlaps=0; | |||
1087 | bool printData = dataLength == 29088 /* line.brk */ || | |||
1088 | // dataLength == 30048 /* CanonIterData */ || | |||
1089 | dataLength == 50400 /* zh.txt~stroke */; | |||
1090 | #endif | |||
1091 | ||||
1092 | // The linear ASCII data has been copied into newData already. | |||
1093 | int32_t newDataLength = 0; | |||
1094 | for (int32_t i = 0; newDataLength < ASCII_LIMIT; | |||
1095 | newDataLength += UCPTRIE_FAST_DATA_BLOCK_LENGTH, i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK) { | |||
1096 | index[i] = newDataLength; | |||
1097 | #ifdef UCPTRIE_DEBUG | |||
1098 | if (printData) { | |||
1099 | printBlock(newData + newDataLength, UCPTRIE_FAST_DATA_BLOCK_LENGTH, 0, newDataLength, 0, initialValue); | |||
1100 | } | |||
1101 | #endif | |||
1102 | } | |||
1103 | ||||
1104 | int32_t blockLength = UCPTRIE_FAST_DATA_BLOCK_LENGTH; | |||
1105 | if (!mixedBlocks.init(newDataCapacity, blockLength)) { | |||
1106 | errorCode = U_MEMORY_ALLOCATION_ERROR; | |||
1107 | return 0; | |||
1108 | } | |||
1109 | mixedBlocks.extend(newData, 0, 0, newDataLength); | |||
1110 | ||||
1111 | int32_t iLimit = highStart >> UCPTRIE_SHIFT_3; | |||
1112 | int32_t inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK; | |||
1113 | int32_t fastLength = 0; | |||
1114 | for (int32_t i = ASCII_I_LIMIT; i < iLimit; i += inc) { | |||
1115 | if (i == fastILimit) { | |||
1116 | blockLength = UCPTRIE_SMALL_DATA_BLOCK_LENGTH; | |||
1117 | inc = 1; | |||
1118 | fastLength = newDataLength; | |||
1119 | if (!mixedBlocks.init(newDataCapacity, blockLength)) { | |||
1120 | errorCode = U_MEMORY_ALLOCATION_ERROR; | |||
1121 | return 0; | |||
1122 | } | |||
1123 | mixedBlocks.extend(newData, 0, 0, newDataLength); | |||
1124 | } | |||
1125 | if (flags[i] == ALL_SAME) { | |||
1126 | uint32_t value = index[i]; | |||
1127 | // Find an earlier part of the data array of length blockLength | |||
1128 | // that is filled with this value. | |||
1129 | int32_t n = mixedBlocks.findAllSameBlock(newData, value); | |||
1130 | // If we find a match, and the current block is the data null block, | |||
1131 | // and it is not a fast block but matches the start of a fast block, | |||
1132 | // then we need to continue looking. | |||
1133 | // This is because this small block is shorter than the fast block, | |||
1134 | // and not all of the rest of the fast block is filled with this value. | |||
1135 | // Otherwise trie.getRange() would detect that the fast block starts at | |||
1136 | // dataNullOffset and assume incorrectly that it is filled with the null value. | |||
1137 | while (n >= 0 && i == dataNullIndex && i >= fastILimit && n < fastLength && | |||
1138 | isStartOfSomeFastBlock(n, index, fastILimit)) { | |||
1139 | n = findAllSameBlock(newData, n + 1, newDataLength, value, blockLength); | |||
1140 | } | |||
1141 | if (n >= 0) { | |||
1142 | DEBUG_DO(++countSame); | |||
1143 | index[i] = n; | |||
1144 | } else { | |||
1145 | n = getAllSameOverlap(newData, newDataLength, value, blockLength); | |||
1146 | DEBUG_DO(sumOverlaps += n); | |||
1147 | #ifdef UCPTRIE_DEBUG | |||
1148 | if (printData) { | |||
1149 | printBlock(nullptr, blockLength, value, i << UCPTRIE_SHIFT_3, n, initialValue); | |||
1150 | } | |||
1151 | #endif | |||
1152 | index[i] = newDataLength - n; | |||
1153 | int32_t prevDataLength = newDataLength; | |||
1154 | while (n < blockLength) { | |||
1155 | newData[newDataLength++] = value; | |||
1156 | ++n; | |||
1157 | } | |||
1158 | mixedBlocks.extend(newData, 0, prevDataLength, newDataLength); | |||
1159 | } | |||
1160 | } else if (flags[i] == MIXED) { | |||
1161 | const uint32_t *block = data + index[i]; | |||
1162 | int32_t n = mixedBlocks.findBlock(newData, block, 0); | |||
1163 | if (n >= 0) { | |||
1164 | DEBUG_DO(++countSame); | |||
1165 | index[i] = n; | |||
1166 | } else { | |||
1167 | n = getOverlap(newData, newDataLength, block, 0, blockLength); | |||
1168 | DEBUG_DO(sumOverlaps += n); | |||
1169 | #ifdef UCPTRIE_DEBUG | |||
1170 | if (printData) { | |||
1171 | printBlock(block, blockLength, 0, i << UCPTRIE_SHIFT_3, n, initialValue); | |||
1172 | } | |||
1173 | #endif | |||
1174 | index[i] = newDataLength - n; | |||
1175 | int32_t prevDataLength = newDataLength; | |||
1176 | while (n < blockLength) { | |||
1177 | newData[newDataLength++] = block[n++]; | |||
1178 | } | |||
1179 | mixedBlocks.extend(newData, 0, prevDataLength, newDataLength); | |||
1180 | } | |||
1181 | } else /* SAME_AS */ { | |||
1182 | uint32_t j = index[i]; | |||
1183 | index[i] = index[j]; | |||
1184 | } | |||
1185 | } | |||
1186 | ||||
1187 | #ifdef UCPTRIE_DEBUG | |||
1188 | /* we saved some space */ | |||
1189 | printf("compacting UCPTrie: count of 32-bit data words %lu->%lu countSame=%ld sumOverlaps=%ld\n", | |||
1190 | (long)dataLength, (long)newDataLength, (long)countSame, (long)sumOverlaps); | |||
1191 | #endif | |||
1192 | return newDataLength; | |||
1193 | } | |||
1194 | ||||
1195 | int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, MixedBlocks &mixedBlocks, | |||
1196 | UErrorCode &errorCode) { | |||
1197 | int32_t fastIndexLength = fastILimit >> (UCPTRIE_FAST_SHIFT - UCPTRIE_SHIFT_3); | |||
1198 | if ((highStart >> UCPTRIE_FAST_SHIFT) <= fastIndexLength) { | |||
| ||||
1199 | // Only the linear fast index, no multi-stage index tables. | |||
1200 | index3NullOffset = UCPTRIE_NO_INDEX3_NULL_OFFSET; | |||
1201 | return fastIndexLength; | |||
1202 | } | |||
1203 | ||||
1204 | // Condense the fast index table. | |||
1205 | // Also, does it contain an index-3 block with all dataNullOffset? | |||
1206 | uint16_t fastIndex[UCPTRIE_BMP_INDEX_LENGTH]; // fastIndexLength | |||
1207 | int32_t i3FirstNull = -1; | |||
1208 | for (int32_t i = 0, j = 0; i < fastILimit; ++j) { | |||
1209 | uint32_t i3 = index[i]; | |||
1210 | fastIndex[j] = (uint16_t)i3; | |||
1211 | if (i3 == (uint32_t)dataNullOffset) { | |||
1212 | if (i3FirstNull < 0) { | |||
1213 | i3FirstNull = j; | |||
1214 | } else if (index3NullOffset < 0 && | |||
1215 | (j - i3FirstNull + 1) == UCPTRIE_INDEX_3_BLOCK_LENGTH) { | |||
1216 | index3NullOffset = i3FirstNull; | |||
1217 | } | |||
1218 | } else { | |||
1219 | i3FirstNull = -1; | |||
1220 | } | |||
1221 | // Set the index entries that compactData() skipped. | |||
1222 | // Needed when the multi-stage index covers the fast index range as well. | |||
1223 | int32_t iNext = i + SMALL_DATA_BLOCKS_PER_BMP_BLOCK; | |||
1224 | while (++i < iNext) { | |||
1225 | i3 += UCPTRIE_SMALL_DATA_BLOCK_LENGTH; | |||
1226 | index[i] = i3; | |||
1227 | } | |||
1228 | } | |||
1229 | ||||
1230 | if (!mixedBlocks.init(fastIndexLength, UCPTRIE_INDEX_3_BLOCK_LENGTH)) { | |||
1231 | errorCode = U_MEMORY_ALLOCATION_ERROR; | |||
1232 | return 0; | |||
1233 | } | |||
1234 | mixedBlocks.extend(fastIndex, 0, 0, fastIndexLength); | |||
1235 | ||||
1236 | // Examine index-3 blocks. For each determine one of: | |||
1237 | // - same as the index-3 null block | |||
1238 | // - same as a fast-index block | |||
1239 | // - 16-bit indexes | |||
1240 | // - 18-bit indexes | |||
1241 | // We store this in the first flags entry for the index-3 block. | |||
1242 | // | |||
1243 | // Also determine an upper limit for the index-3 table length. | |||
1244 | int32_t index3Capacity = 0; | |||
1245 | i3FirstNull = index3NullOffset; | |||
1246 | bool hasLongI3Blocks = false; | |||
1247 | // If the fast index covers the whole BMP, then | |||
1248 | // the multi-stage index is only for supplementary code points. | |||
1249 | // Otherwise, the multi-stage index covers all of Unicode. | |||
1250 | int32_t iStart = fastILimit < BMP_I_LIMIT ? 0 : BMP_I_LIMIT; | |||
1251 | int32_t iLimit = highStart >> UCPTRIE_SHIFT_3; | |||
1252 | for (int32_t i = iStart; i < iLimit;) { | |||
1253 | int32_t j = i; | |||
1254 | int32_t jLimit = i + UCPTRIE_INDEX_3_BLOCK_LENGTH; | |||
1255 | uint32_t oredI3 = 0; | |||
1256 | bool isNull = true; | |||
1257 | do { | |||
1258 | uint32_t i3 = index[j]; | |||
1259 | oredI3 |= i3; | |||
1260 | if (i3 != (uint32_t)dataNullOffset) { | |||
1261 | isNull = false; | |||
1262 | } | |||
1263 | } while (++j < jLimit); | |||
1264 | if (isNull) { | |||
1265 | flags[i] = I3_NULL; | |||
1266 | if (i3FirstNull < 0) { | |||
1267 | if (oredI3 <= 0xffff) { | |||
1268 | index3Capacity += UCPTRIE_INDEX_3_BLOCK_LENGTH; | |||
1269 | } else { | |||
1270 | index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH; | |||
1271 | hasLongI3Blocks = true; | |||
1272 | } | |||
1273 | i3FirstNull = 0; | |||
1274 | } | |||
1275 | } else { | |||
1276 | if (oredI3 <= 0xffff) { | |||
1277 | int32_t n = mixedBlocks.findBlock(fastIndex, index, i); | |||
1278 | if (n >= 0) { | |||
1279 | flags[i] = I3_BMP; | |||
1280 | index[i] = n; | |||
1281 | } else { | |||
1282 | flags[i] = I3_16; | |||
1283 | index3Capacity += UCPTRIE_INDEX_3_BLOCK_LENGTH; | |||
1284 | } | |||
1285 | } else { | |||
1286 | flags[i] = I3_18; | |||
1287 | index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH; | |||
1288 | hasLongI3Blocks = true; | |||
1289 | } | |||
1290 | } | |||
1291 | i = j; | |||
1292 | } | |||
1293 | ||||
1294 | int32_t index2Capacity = (iLimit - iStart) >> UCPTRIE_SHIFT_2_3; | |||
1295 | ||||
1296 | // Length of the index-1 table, rounded up. | |||
1297 | int32_t index1Length = (index2Capacity + UCPTRIE_INDEX_2_MASK) >> UCPTRIE_SHIFT_1_2; | |||
1298 | ||||
1299 | // Index table: Fast index, index-1, index-3, index-2. | |||
1300 | // +1 for possible index table padding. | |||
1301 | int32_t index16Capacity = fastIndexLength + index1Length + index3Capacity + index2Capacity + 1; | |||
1302 | index16 = (uint16_t *)uprv_mallocuprv_malloc_71(index16Capacity * 2); | |||
1303 | if (index16 == nullptr) { | |||
1304 | errorCode = U_MEMORY_ALLOCATION_ERROR; | |||
1305 | return 0; | |||
1306 | } | |||
1307 | uprv_memcpy(index16, fastIndex, fastIndexLength * 2)do { clang diagnostic push
clang diagnostic ignored "-Waddress" (void)0; (void)0; clang diagnostic pop :: memcpy(index16, fastIndex , fastIndexLength * 2); } while (false); | |||
1308 | ||||
1309 | if (!mixedBlocks.init(index16Capacity, UCPTRIE_INDEX_3_BLOCK_LENGTH)) { | |||
1310 | errorCode = U_MEMORY_ALLOCATION_ERROR; | |||
1311 | return 0; | |||
1312 | } | |||
1313 | MixedBlocks longI3Blocks; | |||
1314 | if (hasLongI3Blocks) { | |||
1315 | if (!longI3Blocks.init(index16Capacity, INDEX_3_18BIT_BLOCK_LENGTH)) { | |||
1316 | errorCode = U_MEMORY_ALLOCATION_ERROR; | |||
1317 | return 0; | |||
1318 | } | |||
1319 | } | |||
1320 | ||||
1321 | // Compact the index-3 table and write an uncompacted version of the index-2 table. | |||
1322 | uint16_t index2[UNICODE_LIMIT >> UCPTRIE_SHIFT_2]; // index2Capacity | |||
1323 | int32_t i2Length = 0; | |||
1324 | i3FirstNull = index3NullOffset; | |||
1325 | int32_t index3Start = fastIndexLength + index1Length; | |||
1326 | int32_t indexLength = index3Start; | |||
1327 | for (int32_t i = iStart; i < iLimit; i += UCPTRIE_INDEX_3_BLOCK_LENGTH) { | |||
1328 | int32_t i3; | |||
1329 | uint8_t f = flags[i]; | |||
1330 | if (f == I3_NULL && i3FirstNull < 0) { | |||
1331 | // First index-3 null block. Write & overlap it like a normal block, then remember it. | |||
1332 | f = dataNullOffset <= 0xffff ? I3_16 : I3_18; | |||
1333 | i3FirstNull = 0; | |||
1334 | } | |||
1335 | if (f == I3_NULL) { | |||
1336 | i3 = index3NullOffset; | |||
1337 | } else if (f == I3_BMP) { | |||
1338 | i3 = index[i]; | |||
1339 | } else if (f == I3_16) { | |||
1340 | int32_t n = mixedBlocks.findBlock(index16, index, i); | |||
1341 | if (n >= 0) { | |||
1342 | i3 = n; | |||
1343 | } else { | |||
1344 | if (indexLength == index3Start) { | |||
1345 | // No overlap at the boundary between the index-1 and index-3 tables. | |||
1346 | n = 0; | |||
1347 | } else { | |||
1348 | n = getOverlap(index16, indexLength, | |||
1349 | index, i, UCPTRIE_INDEX_3_BLOCK_LENGTH); | |||
1350 | } | |||
1351 | i3 = indexLength - n; | |||
1352 | int32_t prevIndexLength = indexLength; | |||
1353 | while (n < UCPTRIE_INDEX_3_BLOCK_LENGTH) { | |||
1354 | index16[indexLength++] = index[i + n++]; | |||
1355 | } | |||
1356 | mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength); | |||
1357 | if (hasLongI3Blocks) { | |||
1358 | longI3Blocks.extend(index16, index3Start, prevIndexLength, indexLength); | |||
1359 | } | |||
1360 | } | |||
1361 | } else { | |||
1362 | U_ASSERT(f == I3_18)(void)0; | |||
1363 | U_ASSERT(hasLongI3Blocks)(void)0; | |||
1364 | // Encode an index-3 block that contains one or more data indexes exceeding 16 bits. | |||
1365 | int32_t j = i; | |||
1366 | int32_t jLimit = i + UCPTRIE_INDEX_3_BLOCK_LENGTH; | |||
1367 | int32_t k = indexLength; | |||
1368 | do { | |||
1369 | ++k; | |||
1370 | uint32_t v = index[j++]; | |||
1371 | uint32_t upperBits = (v & 0x30000) >> 2; | |||
1372 | index16[k++] = v; | |||
1373 | v = index[j++]; | |||
1374 | upperBits |= (v & 0x30000) >> 4; | |||
1375 | index16[k++] = v; | |||
1376 | v = index[j++]; | |||
1377 | upperBits |= (v & 0x30000) >> 6; | |||
1378 | index16[k++] = v; | |||
1379 | v = index[j++]; | |||
1380 | upperBits |= (v & 0x30000) >> 8; | |||
1381 | index16[k++] = v; | |||
1382 | v = index[j++]; | |||
1383 | upperBits |= (v & 0x30000) >> 10; | |||
1384 | index16[k++] = v; | |||
1385 | v = index[j++]; | |||
1386 | upperBits |= (v & 0x30000) >> 12; | |||
1387 | index16[k++] = v; | |||
1388 | v = index[j++]; | |||
1389 | upperBits |= (v & 0x30000) >> 14; | |||
1390 | index16[k++] = v; | |||
1391 | v = index[j++]; | |||
1392 | upperBits |= (v & 0x30000) >> 16; | |||
1393 | index16[k++] = v; | |||
1394 | index16[k - 9] = upperBits; | |||
1395 | } while (j < jLimit); | |||
1396 | int32_t n = longI3Blocks.findBlock(index16, index16, indexLength); | |||
1397 | if (n >= 0) { | |||
1398 | i3 = n | 0x8000; | |||
1399 | } else { | |||
1400 | if (indexLength == index3Start) { | |||
1401 | // No overlap at the boundary between the index-1 and index-3 tables. | |||
1402 | n = 0; | |||
1403 | } else { | |||
1404 | n = getOverlap(index16, indexLength, | |||
1405 | index16, indexLength, INDEX_3_18BIT_BLOCK_LENGTH); | |||
1406 | } | |||
1407 | i3 = (indexLength - n) | 0x8000; | |||
1408 | int32_t prevIndexLength = indexLength; | |||
1409 | if (n > 0) { | |||
1410 | int32_t start = indexLength; | |||
1411 | while (n < INDEX_3_18BIT_BLOCK_LENGTH) { | |||
1412 | index16[indexLength++] = index16[start + n++]; | |||
1413 | } | |||
1414 | } else { | |||
1415 | indexLength += INDEX_3_18BIT_BLOCK_LENGTH; | |||
1416 | } | |||
1417 | mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength); | |||
1418 | if (hasLongI3Blocks) { | |||
1419 | longI3Blocks.extend(index16, index3Start, prevIndexLength, indexLength); | |||
1420 | } | |||
1421 | } | |||
1422 | } | |||
1423 | if (index3NullOffset < 0 && i3FirstNull >= 0) { | |||
1424 | index3NullOffset = i3; | |||
1425 | } | |||
1426 | // Set the index-2 table entry. | |||
1427 | index2[i2Length++] = i3; | |||
1428 | } | |||
1429 | U_ASSERT(i2Length == index2Capacity)(void)0; | |||
1430 | U_ASSERT(indexLength <= index3Start + index3Capacity)(void)0; | |||
1431 | ||||
1432 | if (index3NullOffset < 0) { | |||
1433 | index3NullOffset = UCPTRIE_NO_INDEX3_NULL_OFFSET; | |||
1434 | } | |||
1435 | if (indexLength >= (UCPTRIE_NO_INDEX3_NULL_OFFSET + UCPTRIE_INDEX_3_BLOCK_LENGTH)) { | |||
1436 | // The index-3 offsets exceed 15 bits, or | |||
1437 | // the last one cannot be distinguished from the no-null-block value. | |||
1438 | errorCode = U_INDEX_OUTOFBOUNDS_ERROR; | |||
1439 | return 0; | |||
1440 | } | |||
1441 | ||||
1442 | // Compact the index-2 table and write the index-1 table. | |||
1443 | static_assert(UCPTRIE_INDEX_2_BLOCK_LENGTH == UCPTRIE_INDEX_3_BLOCK_LENGTH, | |||
1444 | "must re-init mixedBlocks"); | |||
1445 | int32_t blockLength = UCPTRIE_INDEX_2_BLOCK_LENGTH; | |||
1446 | int32_t i1 = fastIndexLength; | |||
1447 | for (int32_t i = 0; i < i2Length; i += blockLength) { | |||
1448 | int32_t n; | |||
1449 | if ((i2Length - i) >= blockLength) { | |||
1450 | // normal block | |||
1451 | U_ASSERT(blockLength == UCPTRIE_INDEX_2_BLOCK_LENGTH)(void)0; | |||
1452 | n = mixedBlocks.findBlock(index16, index2, i); | |||
1453 | } else { | |||
1454 | // highStart is inside the last index-2 block. Shorten it. | |||
1455 | blockLength = i2Length - i; | |||
1456 | n = findSameBlock(index16, index3Start, indexLength, | |||
1457 | index2, i, blockLength); | |||
1458 | } | |||
1459 | int32_t i2; | |||
1460 | if (n >= 0) { | |||
1461 | i2 = n; | |||
1462 | } else { | |||
1463 | if (indexLength == index3Start) { | |||
1464 | // No overlap at the boundary between the index-1 and index-3/2 tables. | |||
1465 | n = 0; | |||
1466 | } else { | |||
1467 | n = getOverlap(index16, indexLength, index2, i, blockLength); | |||
1468 | } | |||
1469 | i2 = indexLength - n; | |||
1470 | int32_t prevIndexLength = indexLength; | |||
1471 | while (n < blockLength) { | |||
1472 | index16[indexLength++] = index2[i + n++]; | |||
1473 | } | |||
1474 | mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength); | |||
1475 | } | |||
1476 | // Set the index-1 table entry. | |||
1477 | index16[i1++] = i2; | |||
1478 | } | |||
1479 | U_ASSERT(i1 == index3Start)(void)0; | |||
1480 | U_ASSERT(indexLength <= index16Capacity)(void)0; | |||
1481 | ||||
1482 | #ifdef UCPTRIE_DEBUG | |||
1483 | /* we saved some space */ | |||
1484 | printf("compacting UCPTrie: count of 16-bit index words %lu->%lu\n", | |||
1485 | (long)iLimit, (long)indexLength); | |||
1486 | #endif | |||
1487 | ||||
1488 | return indexLength; | |||
1489 | } | |||
1490 | ||||
1491 | int32_t MutableCodePointTrie::compactTrie(int32_t fastILimit, UErrorCode &errorCode) { | |||
1492 | // Find the real highStart and round it up. | |||
1493 | U_ASSERT((highStart & (UCPTRIE_CP_PER_INDEX_2_ENTRY - 1)) == 0)(void)0; | |||
1494 | highValue = get(MAX_UNICODE); | |||
1495 | int32_t realHighStart = findHighStart(); | |||
1496 | realHighStart = (realHighStart + (UCPTRIE_CP_PER_INDEX_2_ENTRY - 1)) & | |||
1497 | ~(UCPTRIE_CP_PER_INDEX_2_ENTRY - 1); | |||
1498 | if (realHighStart == UNICODE_LIMIT) { | |||
1499 | highValue = initialValue; | |||
1500 | } | |||
1501 | ||||
1502 | #ifdef UCPTRIE_DEBUG | |||
1503 | printf("UCPTrie: highStart U+%06lx highValue 0x%lx initialValue 0x%lx\n", | |||
1504 | (long)realHighStart, (long)highValue, (long)initialValue); | |||
1505 | #endif | |||
1506 | ||||
1507 | // We always store indexes and data values for the fast range. | |||
1508 | // Pin highStart to the top of that range while building. | |||
1509 | UChar32 fastLimit = fastILimit << UCPTRIE_SHIFT_3; | |||
1510 | if (realHighStart < fastLimit) { | |||
1511 | for (int32_t i = (realHighStart >> UCPTRIE_SHIFT_3); i < fastILimit; ++i) { | |||
1512 | flags[i] = ALL_SAME; | |||
1513 | index[i] = highValue; | |||
1514 | } | |||
1515 | highStart = fastLimit; | |||
1516 | } else { | |||
1517 | highStart = realHighStart; | |||
1518 | } | |||
1519 | ||||
1520 | uint32_t asciiData[ASCII_LIMIT]; | |||
1521 | for (int32_t i = 0; i < ASCII_LIMIT; ++i) { | |||
1522 | asciiData[i] = get(i); | |||
1523 | } | |||
1524 | ||||
1525 | // First we look for which data blocks have the same value repeated over the whole block, | |||
1526 | // deduplicate such blocks, find a good null data block (for faster enumeration), | |||
1527 | // and get an upper bound for the necessary data array length. | |||
1528 | AllSameBlocks allSameBlocks; | |||
1529 | int32_t newDataCapacity = compactWholeDataBlocks(fastILimit, allSameBlocks); | |||
1530 | if (newDataCapacity < 0) { | |||
1531 | errorCode = U_MEMORY_ALLOCATION_ERROR; | |||
1532 | return 0; | |||
1533 | } | |||
1534 | uint32_t *newData = (uint32_t *)uprv_mallocuprv_malloc_71(newDataCapacity * 4); | |||
1535 | if (newData == nullptr) { | |||
1536 | errorCode = U_MEMORY_ALLOCATION_ERROR; | |||
1537 | return 0; | |||
1538 | } | |||
1539 | uprv_memcpy(newData, asciiData, sizeof(asciiData))do { clang diagnostic push
clang diagnostic ignored "-Waddress" (void)0; (void)0; clang diagnostic pop :: memcpy(newData, asciiData , sizeof(asciiData)); } while (false); | |||
1540 | ||||
1541 | int32_t dataNullIndex = allSameBlocks.findMostUsed(); | |||
1542 | ||||
1543 | MixedBlocks mixedBlocks; | |||
1544 | int32_t newDataLength = compactData(fastILimit, newData, newDataCapacity, | |||
1545 | dataNullIndex, mixedBlocks, errorCode); | |||
1546 | if (U_FAILURE(errorCode)) { return 0; } | |||
1547 | U_ASSERT(newDataLength <= newDataCapacity)(void)0; | |||
1548 | uprv_freeuprv_free_71(data); | |||
1549 | data = newData; | |||
1550 | dataCapacity = newDataCapacity; | |||
1551 | dataLength = newDataLength; | |||
1552 | if (dataLength > (0x3ffff + UCPTRIE_SMALL_DATA_BLOCK_LENGTH)) { | |||
1553 | // The offset of the last data block is too high to be stored in the index table. | |||
1554 | errorCode = U_INDEX_OUTOFBOUNDS_ERROR; | |||
1555 | return 0; | |||
1556 | } | |||
1557 | ||||
1558 | if (dataNullIndex >= 0) { | |||
1559 | dataNullOffset = index[dataNullIndex]; | |||
1560 | #ifdef UCPTRIE_DEBUG | |||
1561 | if (data[dataNullOffset] != initialValue) { | |||
1562 | printf("UCPTrie initialValue %lx -> more common nullValue %lx\n", | |||
1563 | (long)initialValue, (long)data[dataNullOffset]); | |||
1564 | } | |||
1565 | #endif | |||
1566 | initialValue = data[dataNullOffset]; | |||
1567 | } else { | |||
1568 | dataNullOffset = UCPTRIE_NO_DATA_NULL_OFFSET; | |||
1569 | } | |||
1570 | ||||
1571 | int32_t indexLength = compactIndex(fastILimit, mixedBlocks, errorCode); | |||
1572 | highStart = realHighStart; | |||
1573 | return indexLength; | |||
1574 | } | |||
1575 | ||||
1576 | UCPTrie *MutableCodePointTrie::build(UCPTrieType type, UCPTrieValueWidth valueWidth, UErrorCode &errorCode) { | |||
1577 | if (U_FAILURE(errorCode)) { | |||
1578 | return nullptr; | |||
1579 | } | |||
1580 | if (type < UCPTRIE_TYPE_FAST || UCPTRIE_TYPE_SMALL < type || | |||
1581 | valueWidth < UCPTRIE_VALUE_BITS_16 || UCPTRIE_VALUE_BITS_8 < valueWidth) { | |||
1582 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; | |||
1583 | return nullptr; | |||
1584 | } | |||
1585 | ||||
1586 | // The mutable trie always stores 32-bit values. | |||
1587 | // When we build a UCPTrie for a smaller value width, we first mask off unused bits | |||
1588 | // before compacting the data. | |||
1589 | switch (valueWidth) { | |||
1590 | case UCPTRIE_VALUE_BITS_32: | |||
1591 | break; | |||
1592 | case UCPTRIE_VALUE_BITS_16: | |||
1593 | maskValues(0xffff); | |||
1594 | break; | |||
1595 | case UCPTRIE_VALUE_BITS_8: | |||
1596 | maskValues(0xff); | |||
1597 | break; | |||
1598 | default: | |||
1599 | break; | |||
1600 | } | |||
1601 | ||||
1602 | UChar32 fastLimit = type == UCPTRIE_TYPE_FAST ? BMP_LIMIT : UCPTRIE_SMALL_LIMIT; | |||
1603 | int32_t indexLength = compactTrie(fastLimit >> UCPTRIE_SHIFT_3, errorCode); | |||
1604 | if (U_FAILURE(errorCode)) { | |||
1605 | clear(); | |||
1606 | return nullptr; | |||
1607 | } | |||
1608 | ||||
1609 | // Ensure data table alignment: The index length must be even for uint32_t data. | |||
1610 | if (valueWidth == UCPTRIE_VALUE_BITS_32 && (indexLength & 1) != 0) { | |||
1611 | index16[indexLength++] = 0xffee; // arbitrary value | |||
1612 | } | |||
1613 | ||||
1614 | // Make the total trie structure length a multiple of 4 bytes by padding the data table, | |||
1615 | // and store special values as the last two data values. | |||
1616 | int32_t length = indexLength * 2; | |||
1617 | if (valueWidth == UCPTRIE_VALUE_BITS_16) { | |||
1618 | if (((indexLength ^ dataLength) & 1) != 0) { | |||
1619 | // padding | |||
1620 | data[dataLength++] = errorValue; | |||
1621 | } | |||
1622 | if (data[dataLength - 1] != errorValue || data[dataLength - 2] != highValue) { | |||
1623 | data[dataLength++] = highValue; | |||
1624 | data[dataLength++] = errorValue; | |||
1625 | } | |||
1626 | length += dataLength * 2; | |||
1627 | } else if (valueWidth == UCPTRIE_VALUE_BITS_32) { | |||
1628 | // 32-bit data words never need padding to a multiple of 4 bytes. | |||
1629 | if (data[dataLength - 1] != errorValue || data[dataLength - 2] != highValue) { | |||
1630 | if (data[dataLength - 1] != highValue) { | |||
1631 | data[dataLength++] = highValue; | |||
1632 | } | |||
1633 | data[dataLength++] = errorValue; | |||
1634 | } | |||
1635 | length += dataLength * 4; | |||
1636 | } else { | |||
1637 | int32_t and3 = (length + dataLength) & 3; | |||
1638 | if (and3 == 0 && data[dataLength - 1] == errorValue && data[dataLength - 2] == highValue) { | |||
1639 | // all set | |||
1640 | } else if(and3 == 3 && data[dataLength - 1] == highValue) { | |||
1641 | data[dataLength++] = errorValue; | |||
1642 | } else { | |||
1643 | while (and3 != 2) { | |||
1644 | data[dataLength++] = highValue; | |||
1645 | and3 = (and3 + 1) & 3; | |||
1646 | } | |||
1647 | data[dataLength++] = highValue; | |||
1648 | data[dataLength++] = errorValue; | |||
1649 | } | |||
1650 | length += dataLength; | |||
1651 | } | |||
1652 | ||||
1653 | // Calculate the total length of the UCPTrie as a single memory block. | |||
1654 | length += sizeof(UCPTrie); | |||
1655 | U_ASSERT((length & 3) == 0)(void)0; | |||
1656 | ||||
1657 | uint8_t *bytes = (uint8_t *)uprv_mallocuprv_malloc_71(length); | |||
1658 | if (bytes == nullptr) { | |||
1659 | errorCode = U_MEMORY_ALLOCATION_ERROR; | |||
1660 | clear(); | |||
1661 | return nullptr; | |||
1662 | } | |||
1663 | UCPTrie *trie = reinterpret_cast<UCPTrie *>(bytes); | |||
1664 | uprv_memset(trie, 0, sizeof(UCPTrie)):: memset(trie, 0, sizeof(UCPTrie)); | |||
1665 | trie->indexLength = indexLength; | |||
1666 | trie->dataLength = dataLength; | |||
1667 | ||||
1668 | trie->highStart = highStart; | |||
1669 | // Round up shifted12HighStart to a multiple of 0x1000 for easy testing from UTF-8 lead bytes. | |||
1670 | // Runtime code needs to then test for the real highStart as well. | |||
1671 | trie->shifted12HighStart = (highStart + 0xfff) >> 12; | |||
1672 | trie->type = type; | |||
1673 | trie->valueWidth = valueWidth; | |||
1674 | ||||
1675 | trie->index3NullOffset = index3NullOffset; | |||
1676 | trie->dataNullOffset = dataNullOffset; | |||
1677 | trie->nullValue = initialValue; | |||
1678 | ||||
1679 | bytes += sizeof(UCPTrie); | |||
1680 | ||||
1681 | // Fill the index and data arrays. | |||
1682 | uint16_t *dest16 = (uint16_t *)bytes; | |||
1683 | trie->index = dest16; | |||
1684 | ||||
1685 | if (highStart <= fastLimit) { | |||
1686 | // Condense only the fast index from the mutable-trie index. | |||
1687 | for (int32_t i = 0, j = 0; j < indexLength; i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK, ++j) { | |||
1688 | *dest16++ = (uint16_t)index[i]; // dest16[j] | |||
1689 | } | |||
1690 | } else { | |||
1691 | uprv_memcpy(dest16, index16, indexLength * 2)do { clang diagnostic push
clang diagnostic ignored "-Waddress" (void)0; (void)0; clang diagnostic pop :: memcpy(dest16, index16 , indexLength * 2); } while (false); | |||
1692 | dest16 += indexLength; | |||
1693 | } | |||
1694 | bytes += indexLength * 2; | |||
1695 | ||||
1696 | // Write the data array. | |||
1697 | const uint32_t *p = data; | |||
1698 | switch (valueWidth) { | |||
1699 | case UCPTRIE_VALUE_BITS_16: | |||
1700 | // Write 16-bit data values. | |||
1701 | trie->data.ptr16 = dest16; | |||
1702 | for (int32_t i = dataLength; i > 0; --i) { | |||
1703 | *dest16++ = (uint16_t)*p++; | |||
1704 | } | |||
1705 | break; | |||
1706 | case UCPTRIE_VALUE_BITS_32: | |||
1707 | // Write 32-bit data values. | |||
1708 | trie->data.ptr32 = (uint32_t *)bytes; | |||
1709 | uprv_memcpy(bytes, p, (size_t)dataLength * 4)do { clang diagnostic push
clang diagnostic ignored "-Waddress" (void)0; (void)0; clang diagnostic pop :: memcpy(bytes, p, (size_t)dataLength * 4); } while (false); | |||
1710 | break; | |||
1711 | case UCPTRIE_VALUE_BITS_8: | |||
1712 | // Write 8-bit data values. | |||
1713 | trie->data.ptr8 = bytes; | |||
1714 | for (int32_t i = dataLength; i > 0; --i) { | |||
1715 | *bytes++ = (uint8_t)*p++; | |||
1716 | } | |||
1717 | break; | |||
1718 | default: | |||
1719 | // Will not occur, valueWidth checked at the beginning. | |||
1720 | break; | |||
1721 | } | |||
1722 | ||||
1723 | #ifdef UCPTRIE_DEBUG | |||
1724 | trie->name = name; | |||
1725 | ||||
1726 | ucptrie_printLengths(trie, ""); | |||
1727 | #endif | |||
1728 | ||||
1729 | clear(); | |||
1730 | return trie; | |||
1731 | } | |||
1732 | ||||
1733 | } // namespace | |||
1734 | ||||
1735 | U_NAMESPACE_END} | |||
1736 | ||||
1737 | U_NAMESPACE_USEusing namespace icu_71; | |||
1738 | ||||
1739 | U_CAPIextern "C" UMutableCPTrie * U_EXPORT2 | |||
1740 | umutablecptrie_openumutablecptrie_open_71(uint32_t initialValue, uint32_t errorValue, UErrorCode *pErrorCode) { | |||
1741 | if (U_FAILURE(*pErrorCode)) { | |||
1742 | return nullptr; | |||
1743 | } | |||
1744 | LocalPointer<MutableCodePointTrie> trie( | |||
1745 | new MutableCodePointTrie(initialValue, errorValue, *pErrorCode), *pErrorCode); | |||
1746 | if (U_FAILURE(*pErrorCode)) { | |||
1747 | return nullptr; | |||
1748 | } | |||
1749 | return reinterpret_cast<UMutableCPTrie *>(trie.orphan()); | |||
1750 | } | |||
1751 | ||||
1752 | U_CAPIextern "C" UMutableCPTrie * U_EXPORT2 | |||
1753 | umutablecptrie_cloneumutablecptrie_clone_71(const UMutableCPTrie *other, UErrorCode *pErrorCode) { | |||
1754 | if (U_FAILURE(*pErrorCode)) { | |||
1755 | return nullptr; | |||
1756 | } | |||
1757 | if (other == nullptr) { | |||
1758 | return nullptr; | |||
1759 | } | |||
1760 | LocalPointer<MutableCodePointTrie> clone( | |||
1761 | new MutableCodePointTrie(*reinterpret_cast<const MutableCodePointTrie *>(other), *pErrorCode), *pErrorCode); | |||
1762 | if (U_FAILURE(*pErrorCode)) { | |||
1763 | return nullptr; | |||
1764 | } | |||
1765 | return reinterpret_cast<UMutableCPTrie *>(clone.orphan()); | |||
1766 | } | |||
1767 | ||||
1768 | U_CAPIextern "C" void U_EXPORT2 | |||
1769 | umutablecptrie_closeumutablecptrie_close_71(UMutableCPTrie *trie) { | |||
1770 | delete reinterpret_cast<MutableCodePointTrie *>(trie); | |||
1771 | } | |||
1772 | ||||
1773 | U_CAPIextern "C" UMutableCPTrie * U_EXPORT2 | |||
1774 | umutablecptrie_fromUCPMapumutablecptrie_fromUCPMap_71(const UCPMap *map, UErrorCode *pErrorCode) { | |||
1775 | if (U_FAILURE(*pErrorCode)) { | |||
1776 | return nullptr; | |||
1777 | } | |||
1778 | if (map == nullptr) { | |||
1779 | *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; | |||
1780 | return nullptr; | |||
1781 | } | |||
1782 | return reinterpret_cast<UMutableCPTrie *>(MutableCodePointTrie::fromUCPMap(map, *pErrorCode)); | |||
1783 | } | |||
1784 | ||||
1785 | U_CAPIextern "C" UMutableCPTrie * U_EXPORT2 | |||
1786 | umutablecptrie_fromUCPTrieumutablecptrie_fromUCPTrie_71(const UCPTrie *trie, UErrorCode *pErrorCode) { | |||
1787 | if (U_FAILURE(*pErrorCode)) { | |||
1788 | return nullptr; | |||
1789 | } | |||
1790 | if (trie == nullptr) { | |||
1791 | *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; | |||
1792 | return nullptr; | |||
1793 | } | |||
1794 | return reinterpret_cast<UMutableCPTrie *>(MutableCodePointTrie::fromUCPTrie(trie, *pErrorCode)); | |||
1795 | } | |||
1796 | ||||
1797 | U_CAPIextern "C" uint32_t U_EXPORT2 | |||
1798 | umutablecptrie_getumutablecptrie_get_71(const UMutableCPTrie *trie, UChar32 c) { | |||
1799 | return reinterpret_cast<const MutableCodePointTrie *>(trie)->get(c); | |||
1800 | } | |||
1801 | ||||
1802 | namespace { | |||
1803 | ||||
1804 | UChar32 getRange(const void *trie, UChar32 start, | |||
1805 | UCPMapValueFilter *filter, const void *context, uint32_t *pValue) { | |||
1806 | return reinterpret_cast<const MutableCodePointTrie *>(trie)-> | |||
1807 | getRange(start, filter, context, pValue); | |||
1808 | } | |||
1809 | ||||
1810 | } // namespace | |||
1811 | ||||
1812 | U_CAPIextern "C" UChar32 U_EXPORT2 | |||
1813 | umutablecptrie_getRangeumutablecptrie_getRange_71(const UMutableCPTrie *trie, UChar32 start, | |||
1814 | UCPMapRangeOption option, uint32_t surrogateValue, | |||
1815 | UCPMapValueFilter *filter, const void *context, uint32_t *pValue) { | |||
1816 | return ucptrie_internalGetRangeucptrie_internalGetRange_71(getRange, trie, start, | |||
1817 | option, surrogateValue, | |||
1818 | filter, context, pValue); | |||
1819 | } | |||
1820 | ||||
1821 | U_CAPIextern "C" void U_EXPORT2 | |||
1822 | umutablecptrie_setumutablecptrie_set_71(UMutableCPTrie *trie, UChar32 c, uint32_t value, UErrorCode *pErrorCode) { | |||
1823 | if (U_FAILURE(*pErrorCode)) { | |||
1824 | return; | |||
1825 | } | |||
1826 | reinterpret_cast<MutableCodePointTrie *>(trie)->set(c, value, *pErrorCode); | |||
1827 | } | |||
1828 | ||||
1829 | U_CAPIextern "C" void U_EXPORT2 | |||
1830 | umutablecptrie_setRangeumutablecptrie_setRange_71(UMutableCPTrie *trie, UChar32 start, UChar32 end, | |||
1831 | uint32_t value, UErrorCode *pErrorCode) { | |||
1832 | if (U_FAILURE(*pErrorCode)) { | |||
1833 | return; | |||
1834 | } | |||
1835 | reinterpret_cast<MutableCodePointTrie *>(trie)->setRange(start, end, value, *pErrorCode); | |||
1836 | } | |||
1837 | ||||
1838 | /* Compact and internally serialize the trie. */ | |||
1839 | U_CAPIextern "C" UCPTrie * U_EXPORT2 | |||
1840 | umutablecptrie_buildImmutableumutablecptrie_buildImmutable_71(UMutableCPTrie *trie, UCPTrieType type, UCPTrieValueWidth valueWidth, | |||
1841 | UErrorCode *pErrorCode) { | |||
1842 | if (U_FAILURE(*pErrorCode)) { | |||
1843 | return nullptr; | |||
1844 | } | |||
1845 | return reinterpret_cast<MutableCodePointTrie *>(trie)->build(type, valueWidth, *pErrorCode); | |||
1846 | } | |||
1847 | ||||
1848 | #ifdef UCPTRIE_DEBUG | |||
1849 | U_CFUNCextern "C" void umutablecptrie_setName(UMutableCPTrie *trie, const char *name) { | |||
1850 | reinterpret_cast<MutableCodePointTrie *>(trie)->name = name; | |||
1851 | } | |||
1852 | #endif |