File: | out/../deps/icu-small/source/common/rbbi.cpp |
Warning: | line 1270, column 13 Value stored to 'status' is never read |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | *************************************************************************** |
5 | * Copyright (C) 1999-2016 International Business Machines Corporation |
6 | * and others. All rights reserved. |
7 | *************************************************************************** |
8 | */ |
9 | // |
10 | // file: rbbi.cpp Contains the implementation of the rule based break iterator |
11 | // runtime engine and the API implementation for |
12 | // class RuleBasedBreakIterator |
13 | // |
14 | |
15 | #include "utypeinfo.h" // for 'typeid' to work |
16 | |
17 | #include "unicode/utypes.h" |
18 | |
19 | #if !UCONFIG_NO_BREAK_ITERATION0 |
20 | |
21 | #include <cinttypes> |
22 | |
23 | #include "unicode/rbbi.h" |
24 | #include "unicode/schriter.h" |
25 | #include "unicode/uchriter.h" |
26 | #include "unicode/uclean.h" |
27 | #include "unicode/udata.h" |
28 | |
29 | #include "brkeng.h" |
30 | #include "ucln_cmn.h" |
31 | #include "cmemory.h" |
32 | #include "cstring.h" |
33 | #include "localsvc.h" |
34 | #include "rbbidata.h" |
35 | #include "rbbi_cache.h" |
36 | #include "rbbirb.h" |
37 | #include "uassert.h" |
38 | #include "umutex.h" |
39 | #include "uvectr32.h" |
40 | |
#ifdef RBBI_DEBUG
// Debug-only trace switch. init() turns it on when the U_RBBIDEBUG
// environment variable contains "trace".
static UBool gTrace{FALSE};
#endif
44 | |
45 | U_NAMESPACE_BEGINnamespace icu_71 { |
46 | |
// State number of the state machine's starting state.
constexpr int32_t START_STATE{1};

// State-transition value that means "stop".
constexpr int32_t STOP_STATE{0};
52 | |
53 | |
// ICU RTTI boilerplate: the macro supplies getStaticClassID(), which returns the
// address of a function-local static as the class ID, and getDynamicClassID(),
// which forwards to getStaticClassID(). (The analyzer report shows the macro
// followed by its expansion on the same line.)
54 | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)UClassID RuleBasedBreakIterator::getStaticClassID() { static char classID = 0; return (UClassID)&classID; } UClassID RuleBasedBreakIterator ::getDynamicClassID() const { return RuleBasedBreakIterator:: getStaticClassID(); } |
55 | |
56 | |
57 | //======================================================================= |
58 | // constructors |
59 | //======================================================================= |
60 | |
61 | /** |
62 | * Constructs a RuleBasedBreakIterator that uses the already-created |
63 | * tables object that is passed in as a parameter. |
64 | */ |
65 | RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status) |
66 | : fSCharIter(UnicodeString()) |
67 | { |
68 | init(status); |
69 | fData = new RBBIDataWrapper(data, status); // status checked in constructor |
70 | if (U_FAILURE(status)) {return;} |
71 | if(fData == nullptr) { |
72 | status = U_MEMORY_ALLOCATION_ERROR; |
73 | return; |
74 | } |
75 | if (fData->fForwardTable->fLookAheadResultsSize > 0) { |
76 | fLookAheadMatches = static_cast<int32_t *>( |
77 | uprv_mallocuprv_malloc_71(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t))); |
78 | if (fLookAheadMatches == nullptr) { |
79 | status = U_MEMORY_ALLOCATION_ERROR; |
80 | return; |
81 | } |
82 | } |
83 | } |
84 | |
85 | //------------------------------------------------------------------------------- |
86 | // |
87 | // Constructor from a UDataMemory handle to precompiled break rules |
88 | // stored in an ICU data file. This constructor is private API, |
89 | // only for internal use. |
90 | // |
91 | //------------------------------------------------------------------------------- |
92 | RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UBool isPhraseBreaking, |
93 | UErrorCode &status) : RuleBasedBreakIterator(udm, status) |
94 | { |
95 | fIsPhraseBreaking = isPhraseBreaking; |
96 | } |
97 | |
98 | // |
99 | // Construct from precompiled binary rules (tables). This constructor is public API, |
100 | // taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules(). |
101 | // |
102 | RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules, |
103 | uint32_t ruleLength, |
104 | UErrorCode &status) |
105 | : fSCharIter(UnicodeString()) |
106 | { |
107 | init(status); |
108 | if (U_FAILURE(status)) { |
109 | return; |
110 | } |
111 | if (compiledRules == NULL__null || ruleLength < sizeof(RBBIDataHeader)) { |
112 | status = U_ILLEGAL_ARGUMENT_ERROR; |
113 | return; |
114 | } |
115 | const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules; |
116 | if (data->fLength > ruleLength) { |
117 | status = U_ILLEGAL_ARGUMENT_ERROR; |
118 | return; |
119 | } |
120 | fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); |
121 | if (U_FAILURE(status)) {return;} |
122 | if(fData == nullptr) { |
123 | status = U_MEMORY_ALLOCATION_ERROR; |
124 | return; |
125 | } |
126 | if (fData->fForwardTable->fLookAheadResultsSize > 0) { |
127 | fLookAheadMatches = static_cast<int32_t *>( |
128 | uprv_mallocuprv_malloc_71(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t))); |
129 | if (fLookAheadMatches == nullptr) { |
130 | status = U_MEMORY_ALLOCATION_ERROR; |
131 | return; |
132 | } |
133 | } |
134 | } |
135 | |
136 | |
137 | //------------------------------------------------------------------------------- |
138 | // |
139 | // Constructor from a UDataMemory handle to precompiled break rules |
140 | // stored in an ICU data file. |
141 | // |
142 | //------------------------------------------------------------------------------- |
143 | RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status) |
144 | : fSCharIter(UnicodeString()) |
145 | { |
146 | init(status); |
147 | fData = new RBBIDataWrapper(udm, status); // status checked in constructor |
148 | if (U_FAILURE(status)) {return;} |
149 | if(fData == nullptr) { |
150 | status = U_MEMORY_ALLOCATION_ERROR; |
151 | return; |
152 | } |
153 | if (fData->fForwardTable->fLookAheadResultsSize > 0) { |
154 | fLookAheadMatches = static_cast<int32_t *>( |
155 | uprv_mallocuprv_malloc_71(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t))); |
156 | if (fLookAheadMatches == nullptr) { |
157 | status = U_MEMORY_ALLOCATION_ERROR; |
158 | return; |
159 | } |
160 | } |
161 | } |
162 | |
163 | |
164 | |
165 | //------------------------------------------------------------------------------- |
166 | // |
167 | // Constructor from a set of rules supplied as a string. |
168 | // |
169 | //------------------------------------------------------------------------------- |
170 | RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules, |
171 | UParseError &parseError, |
172 | UErrorCode &status) |
173 | : fSCharIter(UnicodeString()) |
174 | { |
175 | init(status); |
176 | if (U_FAILURE(status)) {return;} |
177 | RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *) |
178 | RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status); |
179 | // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that |
180 | // creates and returns a complete RBBI. From here, in a constructor, we |
181 | // can't just return the object created by the builder factory, hence |
182 | // the assignment of the factory created object to "this". |
183 | if (U_SUCCESS(status)) { |
184 | *this = *bi; |
185 | delete bi; |
186 | } |
187 | } |
188 | |
189 | |
190 | //------------------------------------------------------------------------------- |
191 | // |
192 | // Default Constructor. Create an empty shell that can be set up later. |
193 | // Used when creating a RuleBasedBreakIterator from a set |
194 | // of rules. |
195 | //------------------------------------------------------------------------------- |
196 | RuleBasedBreakIterator::RuleBasedBreakIterator() |
197 | : fSCharIter(UnicodeString()) |
198 | { |
199 | UErrorCode status = U_ZERO_ERROR; |
200 | init(status); |
201 | } |
202 | |
203 | |
204 | //------------------------------------------------------------------------------- |
205 | // |
206 | // Copy constructor. Will produce a break iterator with the same behavior, |
207 | // and which iterates over the same text, as the one passed in. |
208 | // |
209 | //------------------------------------------------------------------------------- |
210 | RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other) |
211 | : BreakIterator(other), |
212 | fSCharIter(UnicodeString()) |
213 | { |
214 | UErrorCode status = U_ZERO_ERROR; |
215 | this->init(status); |
216 | *this = other; |
217 | } |
218 | |
219 | |
220 | /** |
221 | * Destructor |
222 | */ |
223 | RuleBasedBreakIterator::~RuleBasedBreakIterator() { |
224 | if (fCharIter != &fSCharIter) { |
225 | // fCharIter was adopted from the outside. |
226 | delete fCharIter; |
227 | } |
228 | fCharIter = nullptr; |
229 | |
230 | utext_closeutext_close_71(&fText); |
231 | |
232 | if (fData != nullptr) { |
233 | fData->removeReference(); |
234 | fData = nullptr; |
235 | } |
236 | delete fBreakCache; |
237 | fBreakCache = nullptr; |
238 | |
239 | delete fDictionaryCache; |
240 | fDictionaryCache = nullptr; |
241 | |
242 | delete fLanguageBreakEngines; |
243 | fLanguageBreakEngines = nullptr; |
244 | |
245 | delete fUnhandledBreakEngine; |
246 | fUnhandledBreakEngine = nullptr; |
247 | |
248 | uprv_freeuprv_free_71(fLookAheadMatches); |
249 | fLookAheadMatches = nullptr; |
250 | } |
251 | |
252 | /** |
253 | * Assignment operator. Sets this iterator to have the same behavior, |
254 | * and iterate over the same text, as the one passed in. |
255 | * TODO: needs better handling of memory allocation errors. |
256 | */ |
257 | RuleBasedBreakIterator& |
258 | RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { |
259 | if (this == &that) { |
260 | return *this; |
261 | } |
262 | BreakIterator::operator=(that); |
263 | |
264 | if (fLanguageBreakEngines != NULL__null) { |
265 | delete fLanguageBreakEngines; |
266 | fLanguageBreakEngines = NULL__null; // Just rebuild for now |
267 | } |
268 | // TODO: clone fLanguageBreakEngines from "that" |
269 | UErrorCode status = U_ZERO_ERROR; |
270 | utext_cloneutext_clone_71(&fText, &that.fText, FALSE0, TRUE1, &status); |
271 | |
272 | if (fCharIter != &fSCharIter) { |
273 | delete fCharIter; |
274 | } |
275 | fCharIter = &fSCharIter; |
276 | |
277 | if (that.fCharIter != NULL__null && that.fCharIter != &that.fSCharIter) { |
278 | // This is a little bit tricky - it will initially appear that |
279 | // this->fCharIter is adopted, even if that->fCharIter was |
280 | // not adopted. That's ok. |
281 | fCharIter = that.fCharIter->clone(); |
282 | } |
283 | fSCharIter = that.fSCharIter; |
284 | if (fCharIter == NULL__null) { |
285 | fCharIter = &fSCharIter; |
286 | } |
287 | |
288 | if (fData != NULL__null) { |
289 | fData->removeReference(); |
290 | fData = NULL__null; |
291 | } |
292 | if (that.fData != NULL__null) { |
293 | fData = that.fData->addReference(); |
294 | } |
295 | |
296 | uprv_freeuprv_free_71(fLookAheadMatches); |
297 | fLookAheadMatches = nullptr; |
298 | if (fData && fData->fForwardTable->fLookAheadResultsSize > 0) { |
299 | fLookAheadMatches = static_cast<int32_t *>( |
300 | uprv_mallocuprv_malloc_71(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t))); |
301 | } |
302 | |
303 | |
304 | fPosition = that.fPosition; |
305 | fRuleStatusIndex = that.fRuleStatusIndex; |
306 | fDone = that.fDone; |
307 | |
308 | // TODO: both the dictionary and the main cache need to be copied. |
309 | // Current position could be within a dictionary range. Trying to continue |
310 | // the iteration without the caches present would go to the rules, with |
311 | // the assumption that the current position is on a rule boundary. |
312 | fBreakCache->reset(fPosition, fRuleStatusIndex); |
313 | fDictionaryCache->reset(); |
314 | |
315 | return *this; |
316 | } |
317 | |
318 | |
319 | |
320 | //----------------------------------------------------------------------------- |
321 | // |
322 | // init() Shared initialization routine. Used by all the constructors. |
323 | // Initializes all fields, leaving the object in a consistent state. |
324 | // |
325 | //----------------------------------------------------------------------------- |
326 | void RuleBasedBreakIterator::init(UErrorCode &status) { |
327 | fCharIter = nullptr; |
328 | fData = nullptr; |
329 | fPosition = 0; |
330 | fRuleStatusIndex = 0; |
331 | fDone = false; |
332 | fDictionaryCharCount = 0; |
333 | fLanguageBreakEngines = nullptr; |
334 | fUnhandledBreakEngine = nullptr; |
335 | fBreakCache = nullptr; |
336 | fDictionaryCache = nullptr; |
337 | fLookAheadMatches = nullptr; |
338 | fIsPhraseBreaking = false; |
339 | |
340 | // Note: IBM xlC is unable to assign or initialize member fText from UTEXT_INITIALIZER. |
341 | // fText = UTEXT_INITIALIZER; |
342 | static const UText initializedUText = UTEXT_INITIALIZER{ UTEXT_MAGIC, 0, 0, sizeof(UText), 0, 0, 0, 0, 0, 0, __null, __null, __null, __null, __null, __null, __null, __null, 0, 0 , 0, 0, 0, 0 }; |
343 | uprv_memcpy(&fText, &initializedUText, sizeof(UText))do { clang diagnostic push
clang diagnostic ignored "-Waddress" (void)0; (void)0; clang diagnostic pop :: memcpy(&fText , &initializedUText, sizeof(UText)); } while (false); |
344 | |
345 | if (U_FAILURE(status)) { |
346 | return; |
347 | } |
348 | |
349 | utext_openUCharsutext_openUChars_71(&fText, NULL__null, 0, &status); |
350 | fDictionaryCache = new DictionaryCache(this, status); |
351 | fBreakCache = new BreakCache(this, status); |
352 | if (U_SUCCESS(status) && (fDictionaryCache == NULL__null || fBreakCache == NULL__null)) { |
353 | status = U_MEMORY_ALLOCATION_ERROR; |
354 | } |
355 | |
356 | #ifdef RBBI_DEBUG |
357 | static UBool debugInitDone = FALSE0; |
358 | if (debugInitDone == FALSE0) { |
359 | char *debugEnv = getenv("U_RBBIDEBUG"); |
360 | if (debugEnv && uprv_strstr(debugEnv, "trace"):: strstr(debugEnv, "trace")) { |
361 | gTrace = TRUE1; |
362 | } |
363 | debugInitDone = TRUE1; |
364 | } |
365 | #endif |
366 | } |
367 | |
368 | |
369 | |
370 | //----------------------------------------------------------------------------- |
371 | // |
372 | // clone - Returns a newly-constructed RuleBasedBreakIterator with the same |
373 | // behavior, and iterating over the same text, as this one. |
374 | // Virtual function: does the right thing with subclasses. |
375 | // |
376 | //----------------------------------------------------------------------------- |
377 | RuleBasedBreakIterator* |
378 | RuleBasedBreakIterator::clone() const { |
379 | return new RuleBasedBreakIterator(*this); |
380 | } |
381 | |
382 | /** |
383 | * Equality operator. Returns true if both BreakIterators are of the |
384 | * same class, have the same behavior, and iterate over the same text. |
385 | */ |
386 | bool |
387 | RuleBasedBreakIterator::operator==(const BreakIterator& that) const { |
388 | if (typeid(*this) != typeid(that)) { |
389 | return false; |
390 | } |
391 | if (this == &that) { |
392 | return true; |
393 | } |
394 | |
395 | // The base class BreakIterator carries no state that participates in equality, |
396 | // and does not implement an equality function that would otherwise be |
397 | // checked at this point. |
398 | |
399 | const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that; |
400 | |
401 | if (!utext_equalsutext_equals_71(&fText, &that2.fText)) { |
402 | // The two break iterators are operating on different text, |
403 | // or have a different iteration position. |
404 | // Note that fText's position is always the same as the break iterator's position. |
405 | return false; |
406 | } |
407 | |
408 | if (!(fPosition == that2.fPosition && |
409 | fRuleStatusIndex == that2.fRuleStatusIndex && |
410 | fDone == that2.fDone)) { |
411 | return false; |
412 | } |
413 | |
414 | if (that2.fData == fData || |
415 | (fData != NULL__null && that2.fData != NULL__null && *that2.fData == *fData)) { |
416 | // The two break iterators are using the same rules. |
417 | return true; |
418 | } |
419 | return false; |
420 | } |
421 | |
422 | /** |
423 | * Compute a hash code for this BreakIterator |
424 | * @return A hash code |
425 | */ |
426 | int32_t |
427 | RuleBasedBreakIterator::hashCode(void) const { |
428 | int32_t hash = 0; |
429 | if (fData != NULL__null) { |
430 | hash = fData->hashCode(); |
431 | } |
432 | return hash; |
433 | } |
434 | |
435 | |
436 | void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) { |
437 | if (U_FAILURE(status)) { |
438 | return; |
439 | } |
440 | fBreakCache->reset(); |
441 | fDictionaryCache->reset(); |
442 | utext_cloneutext_clone_71(&fText, ut, FALSE0, TRUE1, &status); |
443 | |
444 | // Set up a dummy CharacterIterator to be returned if anyone |
445 | // calls getText(). With input from UText, there is no reasonable |
446 | // way to return a characterIterator over the actual input text. |
447 | // Return one over an empty string instead - this is the closest |
448 | // we can come to signaling a failure. |
449 | // (GetText() is obsolete, this failure is sort of OK) |
450 | fSCharIter.setText(UnicodeString()); |
451 | |
452 | if (fCharIter != &fSCharIter) { |
453 | // existing fCharIter was adopted from the outside. Delete it now. |
454 | delete fCharIter; |
455 | } |
456 | fCharIter = &fSCharIter; |
457 | |
458 | this->first(); |
459 | } |
460 | |
461 | |
462 | UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const { |
463 | UText *result = utext_cloneutext_clone_71(fillIn, &fText, FALSE0, TRUE1, &status); |
464 | return result; |
465 | } |
466 | |
467 | |
468 | //======================================================================= |
469 | // BreakIterator overrides |
470 | //======================================================================= |
471 | |
472 | /** |
473 | * Return a CharacterIterator over the text being analyzed. |
474 | */ |
475 | CharacterIterator& |
476 | RuleBasedBreakIterator::getText() const { |
477 | return *fCharIter; |
478 | } |
479 | |
480 | /** |
481 | * Set the iterator to analyze a new piece of text. This function resets |
482 | * the current iteration position to the beginning of the text. |
483 | * @param newText An iterator over the text to analyze. |
484 | */ |
485 | void |
486 | RuleBasedBreakIterator::adoptText(CharacterIterator* newText) { |
487 | // If we are holding a CharacterIterator adopted from a |
488 | // previous call to this function, delete it now. |
489 | if (fCharIter != &fSCharIter) { |
490 | delete fCharIter; |
491 | } |
492 | |
493 | fCharIter = newText; |
494 | UErrorCode status = U_ZERO_ERROR; |
495 | fBreakCache->reset(); |
496 | fDictionaryCache->reset(); |
497 | if (newText==NULL__null || newText->startIndex() != 0) { |
498 | // startIndex !=0 wants to be an error, but there's no way to report it. |
499 | // Make the iterator text be an empty string. |
500 | utext_openUCharsutext_openUChars_71(&fText, NULL__null, 0, &status); |
501 | } else { |
502 | utext_openCharacterIteratorutext_openCharacterIterator_71(&fText, newText, &status); |
503 | } |
504 | this->first(); |
505 | } |
506 | |
507 | /** |
508 | * Set the iterator to analyze a new piece of text. This function resets |
509 | * the current iteration position to the beginning of the text. |
510 | * @param newText An iterator over the text to analyze. |
511 | */ |
512 | void |
513 | RuleBasedBreakIterator::setText(const UnicodeString& newText) { |
514 | UErrorCode status = U_ZERO_ERROR; |
515 | fBreakCache->reset(); |
516 | fDictionaryCache->reset(); |
517 | utext_openConstUnicodeStringutext_openConstUnicodeString_71(&fText, &newText, &status); |
518 | |
519 | // Set up a character iterator on the string. |
520 | // Needed in case someone calls getText(). |
521 | // Can not, unfortunately, do this lazily on the (probably never) |
522 | // call to getText(), because getText is const. |
523 | fSCharIter.setText(newText); |
524 | |
525 | if (fCharIter != &fSCharIter) { |
526 | // old fCharIter was adopted from the outside. Delete it. |
527 | delete fCharIter; |
528 | } |
529 | fCharIter = &fSCharIter; |
530 | |
531 | this->first(); |
532 | } |
533 | |
534 | |
535 | /** |
536 | * Provide a new UText for the input text. Must reference text with contents identical |
537 | * to the original. |
538 | * Intended for use with text data originating in Java (garbage collected) environments |
539 | * where the data may be moved in memory at arbitrary times. |
540 | */ |
541 | RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) { |
542 | if (U_FAILURE(status)) { |
543 | return *this; |
544 | } |
545 | if (input == NULL__null) { |
546 | status = U_ILLEGAL_ARGUMENT_ERROR; |
547 | return *this; |
548 | } |
549 | int64_t pos = utext_getNativeIndexutext_getNativeIndex_71(&fText); |
550 | // Shallow read-only clone of the new UText into the existing input UText |
551 | utext_cloneutext_clone_71(&fText, input, FALSE0, TRUE1, &status); |
552 | if (U_FAILURE(status)) { |
553 | return *this; |
554 | } |
555 | utext_setNativeIndexutext_setNativeIndex_71(&fText, pos); |
556 | if (utext_getNativeIndexutext_getNativeIndex_71(&fText) != pos) { |
557 | // Sanity check. The new input utext is supposed to have the exact same |
558 | // contents as the old. If we can't set to the same position, it doesn't. |
559 | // The contents underlying the old utext might be invalid at this point, |
560 | // so it's not safe to check directly. |
561 | status = U_ILLEGAL_ARGUMENT_ERROR; |
562 | } |
563 | return *this; |
564 | } |
565 | |
566 | |
567 | /** |
568 | * Sets the current iteration position to the beginning of the text, position zero. |
569 | * @return The new iterator position, which is zero. |
570 | */ |
571 | int32_t RuleBasedBreakIterator::first(void) { |
572 | UErrorCode status = U_ZERO_ERROR; |
573 | if (!fBreakCache->seek(0)) { |
574 | fBreakCache->populateNear(0, status); |
575 | } |
576 | fBreakCache->current(); |
577 | U_ASSERT(fPosition == 0)(void)0; |
578 | return 0; |
579 | } |
580 | |
581 | /** |
582 | * Sets the current iteration position to the end of the text. |
583 | * @return The text's past-the-end offset. |
584 | */ |
585 | int32_t RuleBasedBreakIterator::last(void) { |
586 | int32_t endPos = (int32_t)utext_nativeLengthutext_nativeLength_71(&fText); |
587 | UBool endShouldBeBoundary = isBoundary(endPos); // Has side effect of setting iterator position. |
588 | (void)endShouldBeBoundary; |
589 | U_ASSERT(endShouldBeBoundary)(void)0; |
590 | U_ASSERT(fPosition == endPos)(void)0; |
591 | return endPos; |
592 | } |
593 | |
594 | /** |
595 | * Advances the iterator either forward or backward the specified number of steps. |
596 | * Negative values move backward, and positive values move forward. This is |
597 | * equivalent to repeatedly calling next() or previous(). |
598 | * @param n The number of steps to move. The sign indicates the direction |
599 | * (negative is backwards, and positive is forwards). |
600 | * @return The character offset of the boundary position n boundaries away from |
601 | * the current one. |
602 | */ |
603 | int32_t RuleBasedBreakIterator::next(int32_t n) { |
604 | int32_t result = 0; |
605 | if (n > 0) { |
606 | for (; n > 0 && result != UBRK_DONE((int32_t) -1); --n) { |
607 | result = next(); |
608 | } |
609 | } else if (n < 0) { |
610 | for (; n < 0 && result != UBRK_DONE((int32_t) -1); ++n) { |
611 | result = previous(); |
612 | } |
613 | } else { |
614 | result = current(); |
615 | } |
616 | return result; |
617 | } |
618 | |
619 | /** |
620 | * Advances the iterator to the next boundary position. |
621 | * @return The position of the first boundary after this one. |
622 | */ |
623 | int32_t RuleBasedBreakIterator::next(void) { |
624 | fBreakCache->next(); |
625 | return fDone ? UBRK_DONE((int32_t) -1) : fPosition; |
626 | } |
627 | |
628 | /** |
629 | * Move the iterator backwards, to the boundary preceding the current one. |
630 | * |
631 | * Starts from the current position within fText. |
632 | * Starting position need not be on a boundary. |
633 | * |
634 | * @return The position of the boundary position immediately preceding the starting position. |
635 | */ |
636 | int32_t RuleBasedBreakIterator::previous(void) { |
637 | UErrorCode status = U_ZERO_ERROR; |
638 | fBreakCache->previous(status); |
639 | return fDone ? UBRK_DONE((int32_t) -1) : fPosition; |
640 | } |
641 | |
642 | /** |
643 | * Sets the iterator to refer to the first boundary position following |
644 | * the specified position. |
645 | * @param startPos The position from which to begin searching for a break position. |
646 | * @return The position of the first break after the current position. |
647 | */ |
648 | int32_t RuleBasedBreakIterator::following(int32_t startPos) { |
649 | // if the supplied position is before the beginning, return the |
650 | // text's starting offset |
651 | if (startPos < 0) { |
652 | return first(); |
653 | } |
654 | |
655 | // Move requested offset to a code point start. It might be on a trail surrogate, |
656 | // or on a trail byte if the input is UTF-8. Or it may be beyond the end of the text. |
657 | utext_setNativeIndexutext_setNativeIndex_71(&fText, startPos); |
658 | startPos = (int32_t)utext_getNativeIndexutext_getNativeIndex_71(&fText); |
659 | |
660 | UErrorCode status = U_ZERO_ERROR; |
661 | fBreakCache->following(startPos, status); |
662 | return fDone ? UBRK_DONE((int32_t) -1) : fPosition; |
663 | } |
664 | |
665 | /** |
666 | * Sets the iterator to refer to the last boundary position before the |
667 | * specified position. |
668 | * @param offset The position to begin searching for a break from. |
669 | * @return The position of the last boundary before the starting position. |
670 | */ |
671 | int32_t RuleBasedBreakIterator::preceding(int32_t offset) { |
672 | if (offset > utext_nativeLengthutext_nativeLength_71(&fText)) { |
673 | return last(); |
674 | } |
675 | |
676 | // Move requested offset to a code point start. It might be on a trail surrogate, |
677 | // or on a trail byte if the input is UTF-8. |
678 | |
679 | utext_setNativeIndexutext_setNativeIndex_71(&fText, offset); |
680 | int32_t adjustedOffset = static_cast<int32_t>(utext_getNativeIndexutext_getNativeIndex_71(&fText)); |
681 | |
682 | UErrorCode status = U_ZERO_ERROR; |
683 | fBreakCache->preceding(adjustedOffset, status); |
684 | return fDone ? UBRK_DONE((int32_t) -1) : fPosition; |
685 | } |
686 | |
687 | /** |
688 | * Returns true if the specified position is a boundary position. As a side |
689 | * effect, leaves the iterator pointing to the first boundary position at |
690 | * or after "offset". |
691 | * |
692 | * @param offset the offset to check. |
693 | * @return True if "offset" is a boundary position. |
694 | */ |
695 | UBool RuleBasedBreakIterator::isBoundary(int32_t offset) { |
696 | // out-of-range indexes are never boundary positions |
697 | if (offset < 0) { |
698 | first(); // For side effects on current position, tag values. |
699 | return FALSE0; |
700 | } |
701 | |
702 | // Adjust offset to be on a code point boundary and not beyond the end of the text. |
703 | // Note that isBoundary() is always false for offsets that are not on code point boundaries. |
704 | // But we still need the side effect of leaving iteration at the following boundary. |
705 | |
706 | utext_setNativeIndexutext_setNativeIndex_71(&fText, offset); |
707 | int32_t adjustedOffset = static_cast<int32_t>(utext_getNativeIndexutext_getNativeIndex_71(&fText)); |
708 | |
709 | bool result = false; |
710 | UErrorCode status = U_ZERO_ERROR; |
711 | if (fBreakCache->seek(adjustedOffset) || fBreakCache->populateNear(adjustedOffset, status)) { |
712 | result = (fBreakCache->current() == offset); |
713 | } |
714 | |
715 | if (result && adjustedOffset < offset && utext_char32Atutext_char32At_71(&fText, offset) == U_SENTINEL(-1)) { |
716 | // Original offset is beyond the end of the text. Return FALSE, it's not a boundary, |
717 | // but the iteration position remains set to the end of the text, which is a boundary. |
718 | return FALSE0; |
719 | } |
720 | if (!result) { |
721 | // Not on a boundary. isBoundary() must leave iterator on the following boundary. |
722 | // Cache->seek(), above, left us on the preceding boundary, so advance one. |
723 | next(); |
724 | } |
725 | return result; |
726 | } |
727 | |
728 | |
729 | /** |
730 | * Returns the current iteration position. |
731 | * @return The current iteration position. |
732 | */ |
733 | int32_t RuleBasedBreakIterator::current(void) const { |
734 | return fPosition; |
735 | } |
736 | |
737 | |
738 | //======================================================================= |
739 | // implementation |
740 | //======================================================================= |
741 | |
//
// RBBIRunMode - the state machine runs one extra iteration at the beginning and
//               one at the end of user text. A variable of this enum type tracks
//               which of those phases is active. User input is only fetched
//               while in the RUN mode.
//
enum RBBIRunMode {
    RBBI_START = 0,   // processing is before the first char of input
    RBBI_RUN   = 1,   // processing is in the user text
    RBBI_END   = 2    // processing is after the end of user text
};
752 | |
753 | |
754 | // Wrapper functions to select the appropriate handleNext() or handleSafePrevious() |
755 | // instantiation, based on whether an 8 or 16 bit table is required. |
756 | // |
757 | // These Trie access functions will be inlined within the handleNext()/Previous() instantiations. |
758 | static inline uint16_t TrieFunc8(const UCPTrie *trie, UChar32 c) { |
759 | return UCPTRIE_FAST_GET(trie, UCPTRIE_8, c)((trie)->data.ptr8[((uint32_t)(c) <= (uint32_t)(0xffff) ? ((int32_t)(trie)->index[(c) >> UCPTRIE_FAST_SHIFT ] + ((c) & UCPTRIE_FAST_DATA_MASK)) : (uint32_t)(c) <= 0x10ffff ? ((c) >= (trie)->highStart ? (trie)->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET : ucptrie_internalSmallIndex_71 (trie, c)) : (trie)->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET )]); |
760 | } |
761 | |
762 | static inline uint16_t TrieFunc16(const UCPTrie *trie, UChar32 c) { |
763 | return UCPTRIE_FAST_GET(trie, UCPTRIE_16, c)((trie)->data.ptr16[((uint32_t)(c) <= (uint32_t)(0xffff ) ? ((int32_t)(trie)->index[(c) >> UCPTRIE_FAST_SHIFT ] + ((c) & UCPTRIE_FAST_DATA_MASK)) : (uint32_t)(c) <= 0x10ffff ? ((c) >= (trie)->highStart ? (trie)->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET : ucptrie_internalSmallIndex_71 (trie, c)) : (trie)->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET )]); |
764 | } |
765 | |
766 | int32_t RuleBasedBreakIterator::handleNext() { |
767 | const RBBIStateTable *statetable = fData->fForwardTable; |
768 | bool use8BitsTrie = ucptrie_getValueWidthucptrie_getValueWidth_71(fData->fTrie) == UCPTRIE_VALUE_BITS_8; |
769 | if (statetable->fFlags & RBBI_8BITS_ROWS) { |
770 | if (use8BitsTrie) { |
771 | return handleNext<RBBIStateTableRow8, TrieFunc8>(); |
772 | } else { |
773 | return handleNext<RBBIStateTableRow8, TrieFunc16>(); |
774 | } |
775 | } else { |
776 | if (use8BitsTrie) { |
777 | return handleNext<RBBIStateTableRow16, TrieFunc8>(); |
778 | } else { |
779 | return handleNext<RBBIStateTableRow16, TrieFunc16>(); |
780 | } |
781 | } |
782 | } |
783 | |
784 | int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) { |
785 | const RBBIStateTable *statetable = fData->fReverseTable; |
786 | bool use8BitsTrie = ucptrie_getValueWidthucptrie_getValueWidth_71(fData->fTrie) == UCPTRIE_VALUE_BITS_8; |
787 | if (statetable->fFlags & RBBI_8BITS_ROWS) { |
788 | if (use8BitsTrie) { |
789 | return handleSafePrevious<RBBIStateTableRow8, TrieFunc8>(fromPosition); |
790 | } else { |
791 | return handleSafePrevious<RBBIStateTableRow8, TrieFunc16>(fromPosition); |
792 | } |
793 | } else { |
794 | if (use8BitsTrie) { |
795 | return handleSafePrevious<RBBIStateTableRow16, TrieFunc8>(fromPosition); |
796 | } else { |
797 | return handleSafePrevious<RBBIStateTableRow16, TrieFunc16>(fromPosition); |
798 | } |
799 | } |
800 | } |
801 | |
802 | |
803 | //----------------------------------------------------------------------------------- |
804 | // |
805 | // handleNext() |
806 | // Run the state machine to find a boundary |
807 | // |
808 | //----------------------------------------------------------------------------------- |
809 | template <typename RowType, RuleBasedBreakIterator::PTrieFunc trieFunc> |
810 | int32_t RuleBasedBreakIterator::handleNext() { |
811 | int32_t state; |
812 | uint16_t category = 0; |
813 | RBBIRunMode mode; |
814 | |
815 | RowType *row; |
816 | UChar32 c; |
817 | int32_t result = 0; |
818 | int32_t initialPosition = 0; |
819 | const RBBIStateTable *statetable = fData->fForwardTable; |
820 | const char *tableData = statetable->fTableData; |
821 | uint32_t tableRowLen = statetable->fRowLen; |
822 | uint32_t dictStart = statetable->fDictCategoriesStart; |
823 | #ifdef RBBI_DEBUG |
824 | if (gTrace) { |
825 | RBBIDebugPuts("Handle Next pos char state category"); |
826 | } |
827 | #endif |
828 | |
829 | // handleNext always sets the break tag value. |
830 | // Set the default for it. |
831 | fRuleStatusIndex = 0; |
832 | |
833 | fDictionaryCharCount = 0; |
834 | |
835 | // if we're already at the end of the text, return DONE. |
836 | initialPosition = fPosition; |
837 | UTEXT_SETNATIVEINDEX(&fText, initialPosition)do { int64_t __offset = (initialPosition) - (&fText)-> chunkNativeStart; if (__offset>=0 && __offset<( int64_t)(&fText)->nativeIndexingLimit && (& fText)->chunkContents[__offset]<0xdc00) { (&fText)-> chunkOffset=(int32_t)__offset; } else { utext_setNativeIndex_71 ((&fText), (initialPosition)); } } while (false); |
838 | result = initialPosition; |
839 | c = UTEXT_NEXT32(&fText)((&fText)->chunkOffset < (&fText)->chunkLength && ((&fText)->chunkContents)[(&fText)-> chunkOffset]<0xd800 ? ((&fText)->chunkContents)[((& fText)->chunkOffset)++] : utext_next32_71(&fText)); |
840 | if (c==U_SENTINEL(-1)) { |
841 | fDone = TRUE1; |
842 | return UBRK_DONE((int32_t) -1); |
843 | } |
844 | |
845 | // Set the initial state for the state machine |
846 | state = START_STATE; |
847 | row = (RowType *) |
848 | //(statetable->fTableData + (statetable->fRowLen * state)); |
849 | (tableData + tableRowLen * state); |
850 | |
851 | |
852 | mode = RBBI_RUN; |
853 | if (statetable->fFlags & RBBI_BOF_REQUIRED) { |
854 | category = 2; |
855 | mode = RBBI_START; |
856 | } |
857 | |
858 | |
859 | // loop until we reach the end of the text or transition to state 0 |
860 | // |
861 | for (;;) { |
862 | if (c == U_SENTINEL(-1)) { |
863 | // Reached end of input string. |
864 | if (mode == RBBI_END) { |
865 | // We have already run the loop one last time with the |
866 | // character set to the psueudo {eof} value. Now it is time |
867 | // to unconditionally bail out. |
868 | break; |
869 | } |
870 | // Run the loop one last time with the fake end-of-input character category. |
871 | mode = RBBI_END; |
872 | category = 1; |
873 | } |
874 | |
875 | // |
876 | // Get the char category. An incoming category of 1 or 2 means that |
877 | // we are preset for doing the beginning or end of input, and |
878 | // that we shouldn't get a category from an actual text input character. |
879 | // |
880 | if (mode == RBBI_RUN) { |
881 | // look up the current character's character category, which tells us |
882 | // which column in the state table to look at. |
883 | category = trieFunc(fData->fTrie, c); |
884 | fDictionaryCharCount += (category >= dictStart); |
885 | } |
886 | |
887 | #ifdef RBBI_DEBUG |
888 | if (gTrace) { |
889 | RBBIDebugPrintf(" %4" PRId64"l" "d" " ", utext_getNativeIndexutext_getNativeIndex_71(&fText)); |
890 | if (0x20<=c && c<0x7f) { |
891 | RBBIDebugPrintf("\"%c\" ", c); |
892 | } else { |
893 | RBBIDebugPrintf("%5x ", c); |
894 | } |
895 | RBBIDebugPrintf("%3d %3d\n", state, category); |
896 | } |
897 | #endif |
898 | |
899 | // State Transition - move machine to its next state |
900 | // |
901 | |
902 | // fNextState is a variable-length array. |
903 | U_ASSERT(category<fData->fHeader->fCatCount)(void)0; |
904 | state = row->fNextState[category]; /*Not accessing beyond memory*/ |
905 | row = (RowType *) |
906 | // (statetable->fTableData + (statetable->fRowLen * state)); |
907 | (tableData + tableRowLen * state); |
908 | |
909 | |
910 | uint16_t accepting = row->fAccepting; |
911 | if (accepting == ACCEPTING_UNCONDITIONAL) { |
912 | // Match found, common case. |
913 | if (mode != RBBI_START) { |
914 | result = (int32_t)UTEXT_GETNATIVEINDEX(&fText)((&fText)->chunkOffset <= (&fText)->nativeIndexingLimit ? (&fText)->chunkNativeStart+(&fText)->chunkOffset : (&fText)->pFuncs->mapOffsetToNative(&fText)); |
915 | } |
916 | fRuleStatusIndex = row->fTagsIdx; // Remember the break status (tag) values. |
917 | } else if (accepting > ACCEPTING_UNCONDITIONAL) { |
918 | // Lookahead match is completed. |
919 | U_ASSERT(accepting < fData->fForwardTable->fLookAheadResultsSize)(void)0; |
920 | int32_t lookaheadResult = fLookAheadMatches[accepting]; |
921 | if (lookaheadResult >= 0) { |
922 | fRuleStatusIndex = row->fTagsIdx; |
923 | fPosition = lookaheadResult; |
924 | return lookaheadResult; |
925 | } |
926 | } |
927 | |
928 | // If we are at the position of the '/' in a look-ahead (hard break) rule; |
929 | // record the current position, to be returned later, if the full rule matches. |
930 | // TODO: Move this check before the previous check of fAccepting. |
931 | // This would enable hard-break rules with no following context. |
932 | // But there are line break test failures when trying this. Investigate. |
933 | // Issue ICU-20837 |
934 | uint16_t rule = row->fLookAhead; |
935 | U_ASSERT(rule == 0 || rule > ACCEPTING_UNCONDITIONAL)(void)0; |
936 | U_ASSERT(rule == 0 || rule < fData->fForwardTable->fLookAheadResultsSize)(void)0; |
937 | if (rule > ACCEPTING_UNCONDITIONAL) { |
938 | int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(&fText)((&fText)->chunkOffset <= (&fText)->nativeIndexingLimit ? (&fText)->chunkNativeStart+(&fText)->chunkOffset : (&fText)->pFuncs->mapOffsetToNative(&fText)); |
939 | fLookAheadMatches[rule] = pos; |
940 | } |
941 | |
942 | if (state == STOP_STATE) { |
943 | // This is the normal exit from the lookup state machine. |
944 | // We have advanced through the string until it is certain that no |
945 | // longer match is possible, no matter what characters follow. |
946 | break; |
947 | } |
948 | |
949 | // Advance to the next character. |
950 | // If this is a beginning-of-input loop iteration, don't advance |
951 | // the input position. The next iteration will be processing the |
952 | // first real input character. |
953 | if (mode == RBBI_RUN) { |
954 | c = UTEXT_NEXT32(&fText)((&fText)->chunkOffset < (&fText)->chunkLength && ((&fText)->chunkContents)[(&fText)-> chunkOffset]<0xd800 ? ((&fText)->chunkContents)[((& fText)->chunkOffset)++] : utext_next32_71(&fText)); |
955 | } else { |
956 | if (mode == RBBI_START) { |
957 | mode = RBBI_RUN; |
958 | } |
959 | } |
960 | } |
961 | |
962 | // The state machine is done. Check whether it found a match... |
963 | |
964 | // If the iterator failed to advance in the match engine, force it ahead by one. |
965 | // (This really indicates a defect in the break rules. They should always match |
966 | // at least one character.) |
967 | if (result == initialPosition) { |
968 | utext_setNativeIndexutext_setNativeIndex_71(&fText, initialPosition); |
969 | utext_next32utext_next32_71(&fText); |
970 | result = (int32_t)utext_getNativeIndexutext_getNativeIndex_71(&fText); |
971 | fRuleStatusIndex = 0; |
972 | } |
973 | |
974 | // Leave the iterator at our result position. |
975 | fPosition = result; |
976 | #ifdef RBBI_DEBUG |
977 | if (gTrace) { |
978 | RBBIDebugPrintf("result = %d\n\n", result); |
979 | } |
980 | #endif |
981 | return result; |
982 | } |
983 | |
984 | |
985 | //----------------------------------------------------------------------------------- |
986 | // |
987 | // handleSafePrevious() |
988 | // |
989 | // Iterate backwards using the safe reverse rules. |
990 | // The logic of this function is similar to handleNext(), but simpler |
991 | // because the safe table does not require as many options. |
992 | // |
993 | //----------------------------------------------------------------------------------- |
994 | template <typename RowType, RuleBasedBreakIterator::PTrieFunc trieFunc> |
995 | int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) { |
996 | |
997 | int32_t state; |
998 | uint16_t category = 0; |
999 | RowType *row; |
1000 | UChar32 c; |
1001 | int32_t result = 0; |
1002 | |
1003 | const RBBIStateTable *stateTable = fData->fReverseTable; |
1004 | UTEXT_SETNATIVEINDEX(&fText, fromPosition)do { int64_t __offset = (fromPosition) - (&fText)->chunkNativeStart ; if (__offset>=0 && __offset<(int64_t)(&fText )->nativeIndexingLimit && (&fText)->chunkContents [__offset]<0xdc00) { (&fText)->chunkOffset=(int32_t )__offset; } else { utext_setNativeIndex_71((&fText), (fromPosition )); } } while (false); |
1005 | #ifdef RBBI_DEBUG |
1006 | if (gTrace) { |
1007 | RBBIDebugPuts("Handle Previous pos char state category"); |
1008 | } |
1009 | #endif |
1010 | |
1011 | // if we're already at the start of the text, return DONE. |
1012 | if (fData == NULL__null || UTEXT_GETNATIVEINDEX(&fText)((&fText)->chunkOffset <= (&fText)->nativeIndexingLimit ? (&fText)->chunkNativeStart+(&fText)->chunkOffset : (&fText)->pFuncs->mapOffsetToNative(&fText))==0) { |
1013 | return BreakIterator::DONE; |
1014 | } |
1015 | |
1016 | // Set the initial state for the state machine |
1017 | c = UTEXT_PREVIOUS32(&fText)((&fText)->chunkOffset > 0 && (&fText)-> chunkContents[(&fText)->chunkOffset-1] < 0xd800 ? ( &fText)->chunkContents[--((&fText)->chunkOffset )] : utext_previous32_71(&fText)); |
1018 | state = START_STATE; |
1019 | row = (RowType *) |
1020 | (stateTable->fTableData + (stateTable->fRowLen * state)); |
1021 | |
1022 | // loop until we reach the start of the text or transition to state 0 |
1023 | // |
1024 | for (; c != U_SENTINEL(-1); c = UTEXT_PREVIOUS32(&fText)((&fText)->chunkOffset > 0 && (&fText)-> chunkContents[(&fText)->chunkOffset-1] < 0xd800 ? ( &fText)->chunkContents[--((&fText)->chunkOffset )] : utext_previous32_71(&fText))) { |
1025 | |
1026 | // look up the current character's character category, which tells us |
1027 | // which column in the state table to look at. |
1028 | // |
1029 | // Off the dictionary flag bit. For reverse iteration it is not used. |
1030 | category = trieFunc(fData->fTrie, c); |
1031 | |
1032 | #ifdef RBBI_DEBUG |
1033 | if (gTrace) { |
1034 | RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndexutext_getNativeIndex_71(&fText)); |
1035 | if (0x20<=c && c<0x7f) { |
1036 | RBBIDebugPrintf("\"%c\" ", c); |
1037 | } else { |
1038 | RBBIDebugPrintf("%5x ", c); |
1039 | } |
1040 | RBBIDebugPrintf("%3d %3d\n", state, category); |
1041 | } |
1042 | #endif |
1043 | |
1044 | // State Transition - move machine to its next state |
1045 | // |
1046 | // fNextState is a variable-length array. |
1047 | U_ASSERT(category<fData->fHeader->fCatCount)(void)0; |
1048 | state = row->fNextState[category]; /*Not accessing beyond memory*/ |
1049 | row = (RowType *) |
1050 | (stateTable->fTableData + (stateTable->fRowLen * state)); |
1051 | |
1052 | if (state == STOP_STATE) { |
1053 | // This is the normal exit from the lookup state machine. |
1054 | // Transition to state zero means we have found a safe point. |
1055 | break; |
1056 | } |
1057 | } |
1058 | |
1059 | // The state machine is done. Check whether it found a match... |
1060 | result = (int32_t)UTEXT_GETNATIVEINDEX(&fText)((&fText)->chunkOffset <= (&fText)->nativeIndexingLimit ? (&fText)->chunkNativeStart+(&fText)->chunkOffset : (&fText)->pFuncs->mapOffsetToNative(&fText)); |
1061 | #ifdef RBBI_DEBUG |
1062 | if (gTrace) { |
1063 | RBBIDebugPrintf("result = %d\n\n", result); |
1064 | } |
1065 | #endif |
1066 | return result; |
1067 | } |
1068 | |
1069 | |
1070 | //------------------------------------------------------------------------------- |
1071 | // |
1072 | // getRuleStatus() Return the break rule tag associated with the current |
1073 | // iterator position. If the iterator arrived at its current |
1074 | // position by iterating forwards, the value will have been |
1075 | // cached by the handleNext() function. |
1076 | // |
1077 | //------------------------------------------------------------------------------- |
1078 | |
1079 | int32_t RuleBasedBreakIterator::getRuleStatus() const { |
1080 | |
1081 | // fLastRuleStatusIndex indexes to the start of the appropriate status record |
1082 | // (the number of status values.) |
1083 | // This function returns the last (largest) of the array of status values. |
1084 | int32_t idx = fRuleStatusIndex + fData->fRuleStatusTable[fRuleStatusIndex]; |
1085 | int32_t tagVal = fData->fRuleStatusTable[idx]; |
1086 | |
1087 | return tagVal; |
1088 | } |
1089 | |
1090 | |
1091 | int32_t RuleBasedBreakIterator::getRuleStatusVec( |
1092 | int32_t *fillInVec, int32_t capacity, UErrorCode &status) { |
1093 | if (U_FAILURE(status)) { |
1094 | return 0; |
1095 | } |
1096 | |
1097 | int32_t numVals = fData->fRuleStatusTable[fRuleStatusIndex]; |
1098 | int32_t numValsToCopy = numVals; |
1099 | if (numVals > capacity) { |
1100 | status = U_BUFFER_OVERFLOW_ERROR; |
1101 | numValsToCopy = capacity; |
1102 | } |
1103 | int i; |
1104 | for (i=0; i<numValsToCopy; i++) { |
1105 | fillInVec[i] = fData->fRuleStatusTable[fRuleStatusIndex + i + 1]; |
1106 | } |
1107 | return numVals; |
1108 | } |
1109 | |
1110 | |
1111 | |
1112 | //------------------------------------------------------------------------------- |
1113 | // |
1114 | // getBinaryRules Access to the compiled form of the rules, |
1115 | // for use by build system tools that save the data |
1116 | // for standard iterator types. |
1117 | // |
1118 | //------------------------------------------------------------------------------- |
1119 | const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) { |
1120 | const uint8_t *retPtr = NULL__null; |
1121 | length = 0; |
1122 | |
1123 | if (fData != NULL__null) { |
1124 | retPtr = (const uint8_t *)fData->fHeader; |
1125 | length = fData->fHeader->fLength; |
1126 | } |
1127 | return retPtr; |
1128 | } |
1129 | |
1130 | |
1131 | RuleBasedBreakIterator *RuleBasedBreakIterator::createBufferClone( |
1132 | void * /*stackBuffer*/, int32_t &bufferSize, UErrorCode &status) { |
1133 | if (U_FAILURE(status)){ |
1134 | return NULL__null; |
1135 | } |
1136 | |
1137 | if (bufferSize == 0) { |
1138 | bufferSize = 1; // preflighting for deprecated functionality |
1139 | return NULL__null; |
1140 | } |
1141 | |
1142 | BreakIterator *clonedBI = clone(); |
1143 | if (clonedBI == NULL__null) { |
1144 | status = U_MEMORY_ALLOCATION_ERROR; |
1145 | } else { |
1146 | status = U_SAFECLONE_ALLOCATED_WARNING; |
1147 | } |
1148 | return (RuleBasedBreakIterator *)clonedBI; |
1149 | } |
1150 | |
1151 | U_NAMESPACE_END} |
1152 | |
1153 | |
1154 | static icu::UStack *gLanguageBreakFactories = nullptr; |
1155 | static const icu::UnicodeString *gEmptyString = nullptr; |
1156 | static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER{{ 0 }, U_ZERO_ERROR}; |
1157 | static icu::UInitOnce gRBBIInitOnce = U_INITONCE_INITIALIZER{{ 0 }, U_ZERO_ERROR}; |
1158 | |
1159 | /** |
1160 | * Release all static memory held by breakiterator. |
1161 | */ |
1162 | U_CDECL_BEGINextern "C" { |
1163 | UBool U_CALLCONV rbbi_cleanuprbbi_cleanup_71(void) { |
1164 | delete gLanguageBreakFactories; |
1165 | gLanguageBreakFactories = nullptr; |
1166 | delete gEmptyString; |
1167 | gEmptyString = nullptr; |
1168 | gLanguageBreakFactoriesInitOnce.reset(); |
1169 | gRBBIInitOnce.reset(); |
1170 | return TRUE1; |
1171 | } |
1172 | U_CDECL_END} |
1173 | |
1174 | U_CDECL_BEGINextern "C" { |
1175 | static void U_CALLCONV _deleteFactory(void *obj) { |
1176 | delete (icu::LanguageBreakFactory *) obj; |
1177 | } |
1178 | U_CDECL_END} |
1179 | U_NAMESPACE_BEGINnamespace icu_71 { |
1180 | |
1181 | static void U_CALLCONV rbbiInit() { |
1182 | gEmptyString = new UnicodeString(); |
1183 | ucln_common_registerCleanupucln_common_registerCleanup_71(UCLN_COMMON_RBBI, rbbi_cleanuprbbi_cleanup_71); |
1184 | } |
1185 | |
1186 | static void U_CALLCONV initLanguageFactories() { |
1187 | UErrorCode status = U_ZERO_ERROR; |
1188 | U_ASSERT(gLanguageBreakFactories == NULL)(void)0; |
1189 | gLanguageBreakFactories = new UStack(_deleteFactory, NULL__null, status); |
1190 | if (gLanguageBreakFactories != NULL__null && U_SUCCESS(status)) { |
1191 | ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status); |
1192 | gLanguageBreakFactories->push(builtIn, status); |
1193 | #ifdef U_LOCAL_SERVICE_HOOK |
1194 | LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status); |
1195 | if (extra != NULL__null) { |
1196 | gLanguageBreakFactories->push(extra, status); |
1197 | } |
1198 | #endif |
1199 | } |
1200 | ucln_common_registerCleanupucln_common_registerCleanup_71(UCLN_COMMON_RBBI, rbbi_cleanuprbbi_cleanup_71); |
1201 | } |
1202 | |
1203 | |
1204 | static const LanguageBreakEngine* |
1205 | getLanguageBreakEngineFromFactory(UChar32 c) |
1206 | { |
1207 | umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories); |
1208 | if (gLanguageBreakFactories == NULL__null) { |
1209 | return NULL__null; |
1210 | } |
1211 | |
1212 | int32_t i = gLanguageBreakFactories->size(); |
1213 | const LanguageBreakEngine *lbe = NULL__null; |
1214 | while (--i >= 0) { |
1215 | LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i)); |
1216 | lbe = factory->getEngineFor(c); |
1217 | if (lbe != NULL__null) { |
1218 | break; |
1219 | } |
1220 | } |
1221 | return lbe; |
1222 | } |
1223 | |
1224 | |
1225 | //------------------------------------------------------------------------------- |
1226 | // |
1227 | // getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the |
1228 | // the character c. |
1229 | // |
1230 | //------------------------------------------------------------------------------- |
1231 | const LanguageBreakEngine * |
1232 | RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) { |
1233 | const LanguageBreakEngine *lbe = NULL__null; |
1234 | UErrorCode status = U_ZERO_ERROR; |
1235 | |
1236 | if (fLanguageBreakEngines == NULL__null) { |
1237 | fLanguageBreakEngines = new UStack(status); |
1238 | if (fLanguageBreakEngines == NULL__null || U_FAILURE(status)) { |
1239 | delete fLanguageBreakEngines; |
1240 | fLanguageBreakEngines = 0; |
1241 | return NULL__null; |
1242 | } |
1243 | } |
1244 | |
1245 | int32_t i = fLanguageBreakEngines->size(); |
1246 | while (--i >= 0) { |
1247 | lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i)); |
1248 | if (lbe->handles(c)) { |
1249 | return lbe; |
1250 | } |
1251 | } |
1252 | |
1253 | // No existing dictionary took the character. See if a factory wants to |
1254 | // give us a new LanguageBreakEngine for this character. |
1255 | lbe = getLanguageBreakEngineFromFactory(c); |
1256 | |
1257 | // If we got one, use it and push it on our stack. |
1258 | if (lbe != NULL__null) { |
1259 | fLanguageBreakEngines->push((void *)lbe, status); |
1260 | // Even if we can't remember it, we can keep looking it up, so |
1261 | // return it even if the push fails. |
1262 | return lbe; |
1263 | } |
1264 | |
1265 | // No engine is forthcoming for this character. Add it to the |
1266 | // reject set. Create the reject break engine if needed. |
1267 | if (fUnhandledBreakEngine == NULL__null) { |
1268 | fUnhandledBreakEngine = new UnhandledEngine(status); |
1269 | if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL__null) { |
1270 | status = U_MEMORY_ALLOCATION_ERROR; |
Value stored to 'status' is never read | |
1271 | return nullptr; |
1272 | } |
1273 | // Put it last so that scripts for which we have an engine get tried |
1274 | // first. |
1275 | fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status); |
1276 | // If we can't insert it, or creation failed, get rid of it |
1277 | U_ASSERT(!fLanguageBreakEngines->hasDeleter())(void)0; |
1278 | if (U_FAILURE(status)) { |
1279 | delete fUnhandledBreakEngine; |
1280 | fUnhandledBreakEngine = 0; |
1281 | return NULL__null; |
1282 | } |
1283 | } |
1284 | |
1285 | // Tell the reject engine about the character; at its discretion, it may |
1286 | // add more than just the one character. |
1287 | fUnhandledBreakEngine->handleCharacter(c); |
1288 | |
1289 | return fUnhandledBreakEngine; |
1290 | } |
1291 | |
1292 | void RuleBasedBreakIterator::dumpCache() { |
1293 | fBreakCache->dumpCache(); |
1294 | } |
1295 | |
1296 | void RuleBasedBreakIterator::dumpTables() { |
1297 | fData->printData(); |
1298 | } |
1299 | |
1300 | /** |
1301 | * Returns the description used to create this iterator |
1302 | */ |
1303 | |
1304 | const UnicodeString& |
1305 | RuleBasedBreakIterator::getRules() const { |
1306 | if (fData != NULL__null) { |
1307 | return fData->getRuleSourceString(); |
1308 | } else { |
1309 | umtx_initOnce(gRBBIInitOnce, &rbbiInit); |
1310 | return *gEmptyString; |
1311 | } |
1312 | } |
1313 | |
1314 | U_NAMESPACE_END} |
1315 | |
1316 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |