| File: | out/../deps/icu-small/source/i18n/alphaindex.cpp |
| Warning: | line 888, column 26 Forming reference to null pointer |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
| 1 | // © 2016 and later: Unicode, Inc. and others. | |||
| 2 | // License & terms of use: http://www.unicode.org/copyright.html | |||
| 3 | /* | |||
| 4 | ******************************************************************************* | |||
| 5 | * Copyright (C) 2009-2014, International Business Machines Corporation and | |||
| 6 | * others. All Rights Reserved. | |||
| 7 | ******************************************************************************* | |||
| 8 | */ | |||
| 9 | ||||
| 10 | #include "unicode/utypes.h" | |||
| 11 | ||||
| 12 | #if !UCONFIG_NO_COLLATION0 | |||
| 13 | ||||
| 14 | #include "unicode/alphaindex.h" | |||
| 15 | #include "unicode/coll.h" | |||
| 16 | #include "unicode/localpointer.h" | |||
| 17 | #include "unicode/normalizer2.h" | |||
| 18 | #include "unicode/tblcoll.h" | |||
| 19 | #include "unicode/uchar.h" | |||
| 20 | #include "unicode/ulocdata.h" | |||
| 21 | #include "unicode/uniset.h" | |||
| 22 | #include "unicode/uobject.h" | |||
| 23 | #include "unicode/usetiter.h" | |||
| 24 | #include "unicode/utf16.h" | |||
| 25 | ||||
| 26 | #include "cmemory.h" | |||
| 27 | #include "cstring.h" | |||
| 28 | #include "uassert.h" | |||
| 29 | #include "uvector.h" | |||
| 30 | #include "uvectr64.h" | |||
| 31 | ||||
| 32 | //#include <string> | |||
| 33 | //#include <iostream> | |||
| 34 | ||||
| 35 | U_NAMESPACE_BEGINnamespace icu_71 { | |||
| 36 | ||||
| 37 | namespace { | |||
| 38 | ||||
| 39 | /** | |||
| 40 | * Prefix string for Chinese index buckets. | |||
| 41 | * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes | |||
| 42 | */ | |||
| 43 | const UChar BASE[1] = { 0xFDD0 }; | |||
| 44 | const int32_t BASE_LENGTH = 1; | |||
| 45 | ||||
| 46 | UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer, | |||
| 47 | const UnicodeString &one, const UnicodeString &other); | |||
| 48 | ||||
| 49 | } // namespace | |||
| 50 | ||||
| 51 | static int32_t U_CALLCONV | |||
| 52 | collatorComparator(const void *context, const void *left, const void *right); | |||
| 53 | ||||
| 54 | static int32_t U_CALLCONV | |||
| 55 | recordCompareFn(const void *context, const void *left, const void *right); | |||
| 56 | ||||
| 57 | // UVector<Record *> support function, delete a Record. | |||
| 58 | static void U_CALLCONV | |||
| 59 | alphaIndex_deleteRecord(void *obj) { | |||
| 60 | delete static_cast<AlphabeticIndex::Record *>(obj); | |||
| 61 | } | |||
| 62 | ||||
| 63 | namespace { | |||
| 64 | ||||
| 65 | UnicodeString *ownedString(const UnicodeString &s, LocalPointer<UnicodeString> &owned, | |||
| 66 | UErrorCode &errorCode) { | |||
| 67 | if (U_FAILURE(errorCode)) { return NULL__null; } | |||
| 68 | if (owned.isValid()) { | |||
| 69 | return owned.orphan(); | |||
| 70 | } | |||
| 71 | UnicodeString *p = new UnicodeString(s); | |||
| 72 | if (p == NULL__null) { | |||
| 73 | errorCode = U_MEMORY_ALLOCATION_ERROR; | |||
| 74 | } | |||
| 75 | return p; | |||
| 76 | } | |||
| 77 | ||||
| 78 | inline UnicodeString *getString(const UVector &list, int32_t i) { | |||
| 79 | return static_cast<UnicodeString *>(list[i]); | |||
| 80 | } | |||
| 81 | ||||
| 82 | inline AlphabeticIndex::Bucket *getBucket(const UVector &list, int32_t i) { | |||
| 83 | return static_cast<AlphabeticIndex::Bucket *>(list[i]); | |||
| 84 | } | |||
| 85 | ||||
| 86 | inline AlphabeticIndex::Record *getRecord(const UVector &list, int32_t i) { | |||
| 87 | return static_cast<AlphabeticIndex::Record *>(list[i]); | |||
| 88 | } | |||
| 89 | ||||
| 90 | /** | |||
| 91 | * Like Java Collections.binarySearch(List, String, Comparator). | |||
| 92 | * | |||
| 93 | * @return the index>=0 where the item was found, | |||
| 94 | * or the index<0 for inserting the string at ~index in sorted order | |||
| 95 | */ | |||
| 96 | int32_t binarySearch(const UVector &list, const UnicodeString &s, const Collator &coll) { | |||
| 97 | if (list.size() == 0) { return ~0; } | |||
| 98 | int32_t start = 0; | |||
| 99 | int32_t limit = list.size(); | |||
| 100 | for (;;) { | |||
| 101 | int32_t i = (start + limit) / 2; | |||
| 102 | const UnicodeString *si = static_cast<UnicodeString *>(list.elementAt(i)); | |||
| 103 | UErrorCode errorCode = U_ZERO_ERROR; | |||
| 104 | UCollationResult cmp = coll.compare(s, *si, errorCode); | |||
| 105 | if (cmp == UCOL_EQUAL) { | |||
| 106 | return i; | |||
| 107 | } else if (cmp < 0) { | |||
| 108 | if (i == start) { | |||
| 109 | return ~start; // insert s before *si | |||
| 110 | } | |||
| 111 | limit = i; | |||
| 112 | } else { | |||
| 113 | if (i == start) { | |||
| 114 | return ~(start + 1); // insert s after *si | |||
| 115 | } | |||
| 116 | start = i; | |||
| 117 | } | |||
| 118 | } | |||
| 119 | } | |||
| 120 | ||||
| 121 | } // namespace | |||
| 122 | ||||
| 123 | // The BucketList is not in the anonymous namespace because only Clang | |||
| 124 | // seems to support its use in other classes from there. | |||
| 125 | // However, we also don't need U_I18N_API because it is not used from outside the i18n library. | |||
| 126 | class BucketList : public UObject { | |||
| 127 | public: | |||
| 128 | BucketList(UVector *bucketList, UVector *publicBucketList) | |||
| 129 | : bucketList_(bucketList), immutableVisibleList_(publicBucketList) { | |||
| 130 | int32_t displayIndex = 0; | |||
| 131 | for (int32_t i = 0; i < publicBucketList->size(); ++i) { | |||
| 132 | getBucket(*publicBucketList, i)->displayIndex_ = displayIndex++; | |||
| 133 | } | |||
| 134 | } | |||
| 135 | ||||
| 136 | // The virtual destructor must not be inline. | |||
| 137 | // See ticket #8454 for details. | |||
| 138 | virtual ~BucketList(); | |||
| 139 | ||||
| 140 | int32_t getBucketCount() const { | |||
| 141 | return immutableVisibleList_->size(); | |||
| 142 | } | |||
| 143 | ||||
| 144 | int32_t getBucketIndex(const UnicodeString &name, const Collator &collatorPrimaryOnly, | |||
| 145 | UErrorCode &errorCode) { | |||
| 146 | // binary search | |||
| 147 | int32_t start = 0; | |||
| 148 | int32_t limit = bucketList_->size(); | |||
| 149 | while ((start + 1) < limit) { | |||
| 150 | int32_t i = (start + limit) / 2; | |||
| 151 | const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, i); | |||
| 152 | UCollationResult nameVsBucket = | |||
| 153 | collatorPrimaryOnly.compare(name, bucket->lowerBoundary_, errorCode); | |||
| 154 | if (nameVsBucket < 0) { | |||
| 155 | limit = i; | |||
| 156 | } else { | |||
| 157 | start = i; | |||
| 158 | } | |||
| 159 | } | |||
| 160 | const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, start); | |||
| 161 | if (bucket->displayBucket_ != NULL__null) { | |||
| 162 | bucket = bucket->displayBucket_; | |||
| 163 | } | |||
| 164 | return bucket->displayIndex_; | |||
| 165 | } | |||
| 166 | ||||
| 167 | /** All of the buckets, visible and invisible. */ | |||
| 168 | UVector *bucketList_; | |||
| 169 | /** Just the visible buckets. */ | |||
| 170 | UVector *immutableVisibleList_; | |||
| 171 | }; | |||
| 172 | ||||
| 173 | BucketList::~BucketList() { | |||
| 174 | delete bucketList_; | |||
| 175 | if (immutableVisibleList_ != bucketList_) { | |||
| 176 | delete immutableVisibleList_; | |||
| 177 | } | |||
| 178 | } | |||
| 179 | ||||
| 180 | AlphabeticIndex::ImmutableIndex::~ImmutableIndex() { | |||
| 181 | delete buckets_; | |||
| 182 | delete collatorPrimaryOnly_; | |||
| 183 | } | |||
| 184 | ||||
| 185 | int32_t | |||
| 186 | AlphabeticIndex::ImmutableIndex::getBucketCount() const { | |||
| 187 | return buckets_->getBucketCount(); | |||
| 188 | } | |||
| 189 | ||||
| 190 | int32_t | |||
| 191 | AlphabeticIndex::ImmutableIndex::getBucketIndex( | |||
| 192 | const UnicodeString &name, UErrorCode &errorCode) const { | |||
| 193 | return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, errorCode); | |||
| 194 | } | |||
| 195 | ||||
| 196 | const AlphabeticIndex::Bucket * | |||
| 197 | AlphabeticIndex::ImmutableIndex::getBucket(int32_t index) const { | |||
| 198 | if (0 <= index && index < buckets_->getBucketCount()) { | |||
| 199 | return icu::getBucket(*buckets_->immutableVisibleList_, index); | |||
| 200 | } else { | |||
| 201 | return NULL__null; | |||
| 202 | } | |||
| 203 | } | |||
| 204 | ||||
| 205 | AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status) | |||
| 206 | : inputList_(NULL__null), | |||
| 207 | labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL__null), | |||
| 208 | maxLabelCount_(99), | |||
| 209 | initialLabels_(NULL__null), firstCharsInScripts_(NULL__null), | |||
| 210 | collator_(NULL__null), collatorPrimaryOnly_(NULL__null), | |||
| 211 | buckets_(NULL__null) { | |||
| 212 | init(&locale, status); | |||
| 213 | } | |||
| 214 | ||||
| 215 | ||||
| 216 | AlphabeticIndex::AlphabeticIndex(RuleBasedCollator *collator, UErrorCode &status) | |||
| 217 | : inputList_(NULL__null), | |||
| 218 | labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL__null), | |||
| 219 | maxLabelCount_(99), | |||
| 220 | initialLabels_(NULL__null), firstCharsInScripts_(NULL__null), | |||
| 221 | collator_(collator), collatorPrimaryOnly_(NULL__null), | |||
| 222 | buckets_(NULL__null) { | |||
| 223 | init(NULL__null, status); | |||
| ||||
| 224 | } | |||
| 225 | ||||
| 226 | ||||
| 227 | ||||
| 228 | AlphabeticIndex::~AlphabeticIndex() { | |||
| 229 | delete collator_; | |||
| 230 | delete collatorPrimaryOnly_; | |||
| 231 | delete firstCharsInScripts_; | |||
| 232 | delete buckets_; | |||
| 233 | delete inputList_; | |||
| 234 | delete initialLabels_; | |||
| 235 | } | |||
| 236 | ||||
| 237 | ||||
| 238 | AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) { | |||
| 239 | if (U_FAILURE(status)) { | |||
| 240 | return *this; | |||
| 241 | } | |||
| 242 | initialLabels_->addAll(additions); | |||
| 243 | clearBuckets(); | |||
| 244 | return *this; | |||
| 245 | } | |||
| 246 | ||||
| 247 | ||||
| 248 | AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) { | |||
| 249 | addIndexExemplars(locale, status); | |||
| 250 | clearBuckets(); | |||
| 251 | return *this; | |||
| 252 | } | |||
| 253 | ||||
| 254 | ||||
| 255 | AlphabeticIndex::ImmutableIndex *AlphabeticIndex::buildImmutableIndex(UErrorCode &errorCode) { | |||
| 256 | if (U_FAILURE(errorCode)) { return NULL__null; } | |||
| 257 | // In C++, the ImmutableIndex must own its copy of the BucketList, | |||
| 258 | // even if it contains no records, for proper memory management. | |||
| 259 | // We could clone the buckets_ if they are not NULL, | |||
| 260 | // but that would be worth it only if this method is called multiple times, | |||
| 261 | // or called after using the old-style bucket iterator API. | |||
| 262 | LocalPointer<BucketList> immutableBucketList(createBucketList(errorCode)); | |||
| 263 | LocalPointer<RuleBasedCollator> coll(collatorPrimaryOnly_->clone()); | |||
| 264 | if (immutableBucketList.isNull() || coll.isNull()) { | |||
| 265 | errorCode = U_MEMORY_ALLOCATION_ERROR; | |||
| 266 | return NULL__null; | |||
| 267 | } | |||
| 268 | ImmutableIndex *immIndex = new ImmutableIndex(immutableBucketList.getAlias(), coll.getAlias()); | |||
| 269 | if (immIndex == NULL__null) { | |||
| 270 | errorCode = U_MEMORY_ALLOCATION_ERROR; | |||
| 271 | return NULL__null; | |||
| 272 | } | |||
| 273 | // The ImmutableIndex adopted its parameter objects. | |||
| 274 | immutableBucketList.orphan(); | |||
| 275 | coll.orphan(); | |||
| 276 | return immIndex; | |||
| 277 | } | |||
| 278 | ||||
| 279 | int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) { | |||
| 280 | initBuckets(status); | |||
| 281 | if (U_FAILURE(status)) { | |||
| 282 | return 0; | |||
| 283 | } | |||
| 284 | return buckets_->getBucketCount(); | |||
| 285 | } | |||
| 286 | ||||
| 287 | ||||
| 288 | int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) { | |||
| 289 | if (U_FAILURE(status) || inputList_ == NULL__null) { | |||
| 290 | return 0; | |||
| 291 | } | |||
| 292 | return inputList_->size(); | |||
| 293 | } | |||
| 294 | ||||
| 295 | void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode) const { | |||
| 296 | U_ASSERT(indexCharacters.hasDeleter())(void)0; | |||
| 297 | const Normalizer2 *nfkdNormalizer = Normalizer2::getNFKDInstance(errorCode); | |||
| 298 | if (U_FAILURE(errorCode)) { return; } | |||
| 299 | ||||
| 300 | const UnicodeString &firstScriptBoundary = *getString(*firstCharsInScripts_, 0); | |||
| 301 | const UnicodeString &overflowBoundary = | |||
| 302 | *getString(*firstCharsInScripts_, firstCharsInScripts_->size() - 1); | |||
| 303 | ||||
| 304 | // We make a sorted array of elements. | |||
| 305 | // Some of the input may be redundant. | |||
| 306 | // That is, we might have c, ch, d, where "ch" sorts just like "c", "h". | |||
| 307 | // We filter out those cases. | |||
| 308 | UnicodeSetIterator iter(*initialLabels_); | |||
| 309 | while (U_SUCCESS(errorCode) && iter.next()) { | |||
| 310 | const UnicodeString *item = &iter.getString(); | |||
| 311 | LocalPointer<UnicodeString> ownedItem; | |||
| 312 | UBool checkDistinct; | |||
| 313 | int32_t itemLength = item->length(); | |||
| 314 | if (!item->hasMoreChar32Than(0, itemLength, 1)) { | |||
| 315 | checkDistinct = FALSE0; | |||
| 316 | } else if(item->charAt(itemLength - 1) == 0x2a && // '*' | |||
| 317 | item->charAt(itemLength - 2) != 0x2a) { | |||
| 318 | // Use a label if it is marked with one trailing star, | |||
| 319 | // even if the label string sorts the same when all contractions are suppressed. | |||
| 320 | ownedItem.adoptInstead(new UnicodeString(*item, 0, itemLength - 1)); | |||
| 321 | item = ownedItem.getAlias(); | |||
| 322 | if (item == NULL__null) { | |||
| 323 | errorCode = U_MEMORY_ALLOCATION_ERROR; | |||
| 324 | return; | |||
| 325 | } | |||
| 326 | checkDistinct = FALSE0; | |||
| 327 | } else { | |||
| 328 | checkDistinct = TRUE1; | |||
| 329 | } | |||
| 330 | if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) { | |||
| 331 | // Ignore a primary-ignorable or non-alphabetic index character. | |||
| 332 | } else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) { | |||
| 333 | // Ignore an index character that will land in the overflow bucket. | |||
| 334 | } else if (checkDistinct && | |||
| 335 | collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) { | |||
| 336 | // Ignore a multi-code point index character that does not sort distinctly | |||
| 337 | // from the sequence of its separate characters. | |||
| 338 | } else { | |||
| 339 | int32_t insertionPoint = binarySearch(indexCharacters, *item, *collatorPrimaryOnly_); | |||
| 340 | if (insertionPoint < 0) { | |||
| 341 | indexCharacters.insertElementAt( | |||
| 342 | ownedString(*item, ownedItem, errorCode), ~insertionPoint, errorCode); | |||
| 343 | } else { | |||
| 344 | const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint); | |||
| 345 | if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlreadyIn)) { | |||
| 346 | indexCharacters.setElementAt( | |||
| 347 | ownedString(*item, ownedItem, errorCode), insertionPoint); | |||
| 348 | } | |||
| 349 | } | |||
| 350 | } | |||
| 351 | } | |||
| 352 | if (U_FAILURE(errorCode)) { return; } | |||
| 353 | ||||
| 354 | // if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element | |||
| 355 | ||||
| 356 | int32_t size = indexCharacters.size() - 1; | |||
| 357 | if (size > maxLabelCount_) { | |||
| 358 | int32_t count = 0; | |||
| 359 | int32_t old = -1; | |||
| 360 | for (int32_t i = 0; i < indexCharacters.size();) { | |||
| 361 | ++count; | |||
| 362 | int32_t bump = count * maxLabelCount_ / size; | |||
| 363 | if (bump == old) { | |||
| 364 | indexCharacters.removeElementAt(i); | |||
| 365 | } else { | |||
| 366 | old = bump; | |||
| 367 | ++i; | |||
| 368 | } | |||
| 369 | } | |||
| 370 | } | |||
| 371 | } | |||
| 372 | ||||
| 373 | namespace { | |||
| 374 | ||||
| 375 | const UnicodeString &fixLabel(const UnicodeString ¤t, UnicodeString &temp) { | |||
| 376 | if (!current.startsWith(BASE, BASE_LENGTH)) { | |||
| 377 | return current; | |||
| 378 | } | |||
| 379 | UChar rest = current.charAt(BASE_LENGTH); | |||
| 380 | if (0x2800 < rest && rest <= 0x28FF) { // stroke count | |||
| 381 | int32_t count = rest-0x2800; | |||
| 382 | temp.setTo((UChar)(0x30 + count % 10)); | |||
| 383 | if (count >= 10) { | |||
| 384 | count /= 10; | |||
| 385 | temp.insert(0, (UChar)(0x30 + count % 10)); | |||
| 386 | if (count >= 10) { | |||
| 387 | count /= 10; | |||
| 388 | temp.insert(0, (UChar)(0x30 + count)); | |||
| 389 | } | |||
| 390 | } | |||
| 391 | return temp.append((UChar)0x5283); | |||
| 392 | } | |||
| 393 | return temp.setTo(current, BASE_LENGTH); | |||
| 394 | } | |||
| 395 | ||||
| 396 | UBool hasMultiplePrimaryWeights( | |||
| 397 | const RuleBasedCollator &coll, uint32_t variableTop, | |||
| 398 | const UnicodeString &s, UVector64 &ces, UErrorCode &errorCode) { | |||
| 399 | ces.removeAllElements(); | |||
| 400 | coll.internalGetCEs(s, ces, errorCode); | |||
| 401 | if (U_FAILURE(errorCode)) { return FALSE0; } | |||
| 402 | UBool seenPrimary = FALSE0; | |||
| 403 | for (int32_t i = 0; i < ces.size(); ++i) { | |||
| 404 | int64_t ce = ces.elementAti(i); | |||
| 405 | uint32_t p = (uint32_t)(ce >> 32); | |||
| 406 | if (p > variableTop) { | |||
| 407 | // not primary ignorable | |||
| 408 | if (seenPrimary) { | |||
| 409 | return TRUE1; | |||
| 410 | } | |||
| 411 | seenPrimary = TRUE1; | |||
| 412 | } | |||
| 413 | } | |||
| 414 | return FALSE0; | |||
| 415 | } | |||
| 416 | ||||
| 417 | } // namespace | |||
| 418 | ||||
| 419 | BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const { | |||
| 420 | // Initialize indexCharacters. | |||
| 421 | UVector indexCharacters(errorCode); | |||
| 422 | indexCharacters.setDeleter(uprv_deleteUObjectuprv_deleteUObject_71); | |||
| 423 | initLabels(indexCharacters, errorCode); | |||
| 424 | if (U_FAILURE(errorCode)) { return NULL__null; } | |||
| 425 | ||||
| 426 | // Variables for hasMultiplePrimaryWeights(). | |||
| 427 | UVector64 ces(errorCode); | |||
| 428 | uint32_t variableTop; | |||
| 429 | if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) == UCOL_SHIFTED) { | |||
| 430 | variableTop = collatorPrimaryOnly_->getVariableTop(errorCode); | |||
| 431 | } else { | |||
| 432 | variableTop = 0; | |||
| 433 | } | |||
| 434 | UBool hasInvisibleBuckets = FALSE0; | |||
| 435 | ||||
| 436 | // Helper arrays for Chinese Pinyin collation. | |||
| 437 | Bucket *asciiBuckets[26] = { | |||
| 438 | NULL__null, NULL__null, NULL__null, NULL__null, NULL__null, NULL__null, NULL__null, NULL__null, NULL__null, NULL__null, NULL__null, NULL__null, NULL__null, | |||
| 439 | NULL__null, NULL__null, NULL__null, NULL__null, NULL__null, NULL__null, NULL__null, NULL__null, NULL__null, NULL__null, NULL__null, NULL__null, NULL__null | |||
| 440 | }; | |||
| 441 | Bucket *pinyinBuckets[26] = { | |||
| 442 | NULL__null, NULL__null, NULL__null, NULL__null, NULL__null, NULL__null, NULL__null, NULL__null, NULL__null, NULL__null, NULL__null, NULL__null, NULL__null, | |||
| 443 | NULL__null, NULL__null, NULL__null, NULL__null, NULL__null, NULL__null, NULL__null, NULL__null, NULL__null, NULL__null, NULL__null, NULL__null, NULL__null | |||
| 444 | }; | |||
| 445 | UBool hasPinyin = FALSE0; | |||
| 446 | ||||
| 447 | LocalPointer<UVector> bucketList(new UVector(errorCode), errorCode); | |||
| 448 | if (U_FAILURE(errorCode)) { | |||
| 449 | return NULL__null; | |||
| 450 | } | |||
| 451 | bucketList->setDeleter(uprv_deleteUObjectuprv_deleteUObject_71); | |||
| 452 | ||||
| 453 | // underflow bucket | |||
| 454 | LocalPointer<Bucket> bucket(new Bucket(getUnderflowLabel(), emptyString_, U_ALPHAINDEX_UNDERFLOW), errorCode); | |||
| 455 | if (U_FAILURE(errorCode)) { | |||
| 456 | return NULL__null; | |||
| 457 | } | |||
| 458 | bucketList->adoptElement(bucket.orphan(), errorCode); | |||
| 459 | if (U_FAILURE(errorCode)) { return NULL__null; } | |||
| 460 | ||||
| 461 | UnicodeString temp; | |||
| 462 | ||||
| 463 | // fix up the list, adding underflow, additions, overflow | |||
| 464 | // Insert inflow labels as needed. | |||
| 465 | int32_t scriptIndex = -1; | |||
| 466 | const UnicodeString *scriptUpperBoundary = &emptyString_; | |||
| 467 | for (int32_t i = 0; i < indexCharacters.size(); ++i) { | |||
| 468 | UnicodeString ¤t = *getString(indexCharacters, i); | |||
| 469 | if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) >= 0) { | |||
| 470 | // We crossed the script boundary into a new script. | |||
| 471 | const UnicodeString &inflowBoundary = *scriptUpperBoundary; | |||
| 472 | UBool skippedScript = FALSE0; | |||
| 473 | for (;;) { | |||
| 474 | scriptUpperBoundary = getString(*firstCharsInScripts_, ++scriptIndex); | |||
| 475 | if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) < 0) { | |||
| 476 | break; | |||
| 477 | } | |||
| 478 | skippedScript = TRUE1; | |||
| 479 | } | |||
| 480 | if (skippedScript && bucketList->size() > 1) { | |||
| 481 | // We are skipping one or more scripts, | |||
| 482 | // and we are not just getting out of the underflow label. | |||
| 483 | bucket.adoptInsteadAndCheckErrorCode( | |||
| 484 | new Bucket(getInflowLabel(), inflowBoundary, U_ALPHAINDEX_INFLOW), errorCode); | |||
| 485 | bucketList->adoptElement(bucket.orphan(), errorCode); | |||
| 486 | if (U_FAILURE(errorCode)) { return nullptr; } | |||
| 487 | } | |||
| 488 | } | |||
| 489 | // Add a bucket with the current label. | |||
| 490 | bucket.adoptInsteadAndCheckErrorCode( | |||
| 491 | new Bucket(fixLabel(current, temp), current, U_ALPHAINDEX_NORMAL), errorCode); | |||
| 492 | bucketList->adoptElement(bucket.orphan(), errorCode); | |||
| 493 | if (U_FAILURE(errorCode)) { return nullptr; } | |||
| 494 | // Remember ASCII and Pinyin buckets for Pinyin redirects. | |||
| 495 | UChar c; | |||
| 496 | if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5A) { // A-Z | |||
| 497 | asciiBuckets[c - 0x41] = (Bucket *)bucketList->lastElement(); | |||
| 498 | } else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BASE, BASE_LENGTH) && | |||
| 499 | 0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) { | |||
| 500 | pinyinBuckets[c - 0x41] = (Bucket *)bucketList->lastElement(); | |||
| 501 | hasPinyin = TRUE1; | |||
| 502 | } | |||
| 503 | // Check for multiple primary weights. | |||
| 504 | if (!current.startsWith(BASE, BASE_LENGTH) && | |||
| 505 | hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop, current, | |||
| 506 | ces, errorCode) && | |||
| 507 | current.charAt(current.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) { | |||
| 508 | // "AE-ligature" or "Sch" etc. | |||
| 509 | for (int32_t j = bucketList->size() - 2;; --j) { | |||
| 510 | Bucket *singleBucket = getBucket(*bucketList, j); | |||
| 511 | if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) { | |||
| 512 | // There is no single-character bucket since the last | |||
| 513 | // underflow or inflow label. | |||
| 514 | break; | |||
| 515 | } | |||
| 516 | if (singleBucket->displayBucket_ == NULL__null && | |||
| 517 | !hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop, | |||
| 518 | singleBucket->lowerBoundary_, | |||
| 519 | ces, errorCode)) { | |||
| 520 | // Add an invisible bucket that redirects strings greater than the expansion | |||
| 521 | // to the previous single-character bucket. | |||
| 522 | // For example, after ... Q R S Sch we add Sch\uFFFF->S | |||
| 523 | // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S. | |||
| 524 | bucket.adoptInsteadAndCheckErrorCode(new Bucket(emptyString_, | |||
| 525 | UnicodeString(current).append((UChar)0xFFFF), | |||
| 526 | U_ALPHAINDEX_NORMAL), | |||
| 527 | errorCode); | |||
| 528 | if (U_FAILURE(errorCode)) { | |||
| 529 | return NULL__null; | |||
| 530 | } | |||
| 531 | bucket->displayBucket_ = singleBucket; | |||
| 532 | bucketList->adoptElement(bucket.orphan(), errorCode); | |||
| 533 | if (U_FAILURE(errorCode)) { return nullptr; } | |||
| 534 | hasInvisibleBuckets = TRUE1; | |||
| 535 | break; | |||
| 536 | } | |||
| 537 | } | |||
| 538 | } | |||
| 539 | } | |||
| 540 | if (U_FAILURE(errorCode)) { return NULL__null; } | |||
| 541 | if (bucketList->size() == 1) { | |||
| 542 | // No real labels, show only the underflow label. | |||
| 543 | BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias()); | |||
| 544 | if (bl == NULL__null) { | |||
| 545 | errorCode = U_MEMORY_ALLOCATION_ERROR; | |||
| 546 | return NULL__null; | |||
| 547 | } | |||
| 548 | bucketList.orphan(); | |||
| 549 | return bl; | |||
| 550 | } | |||
| 551 | // overflow bucket | |||
| 552 | bucket.adoptInsteadAndCheckErrorCode( | |||
| 553 | new Bucket(getOverflowLabel(), *scriptUpperBoundary, U_ALPHAINDEX_OVERFLOW), errorCode); | |||
| 554 | bucketList->adoptElement(bucket.orphan(), errorCode); // final | |||
| 555 | if (U_FAILURE(errorCode)) { return nullptr; } | |||
| 556 | ||||
| 557 | if (hasPinyin) { | |||
| 558 | // Redirect Pinyin buckets. | |||
| 559 | Bucket *asciiBucket = NULL__null; | |||
| 560 | for (int32_t i = 0; i < 26; ++i) { | |||
| 561 | if (asciiBuckets[i] != NULL__null) { | |||
| 562 | asciiBucket = asciiBuckets[i]; | |||
| 563 | } | |||
| 564 | if (pinyinBuckets[i] != NULL__null && asciiBucket != NULL__null) { | |||
| 565 | pinyinBuckets[i]->displayBucket_ = asciiBucket; | |||
| 566 | hasInvisibleBuckets = TRUE1; | |||
| 567 | } | |||
| 568 | } | |||
| 569 | } | |||
| 570 | ||||
| 571 | if (U_FAILURE(errorCode)) { return NULL__null; } | |||
| 572 | if (!hasInvisibleBuckets) { | |||
| 573 | BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias()); | |||
| 574 | if (bl == NULL__null) { | |||
| 575 | errorCode = U_MEMORY_ALLOCATION_ERROR; | |||
| 576 | return NULL__null; | |||
| 577 | } | |||
| 578 | bucketList.orphan(); | |||
| 579 | return bl; | |||
| 580 | } | |||
| 581 | // Merge inflow buckets that are visually adjacent. | |||
| 582 | // Iterate backwards: Merge inflow into overflow rather than the other way around. | |||
| 583 | int32_t i = bucketList->size() - 1; | |||
| 584 | Bucket *nextBucket = getBucket(*bucketList, i); | |||
| 585 | while (--i > 0) { | |||
| 586 | Bucket *bucket = getBucket(*bucketList, i); | |||
| 587 | if (bucket->displayBucket_ != NULL__null) { | |||
| 588 | continue; // skip invisible buckets | |||
| 589 | } | |||
| 590 | if (bucket->labelType_ == U_ALPHAINDEX_INFLOW) { | |||
| 591 | if (nextBucket->labelType_ != U_ALPHAINDEX_NORMAL) { | |||
| 592 | bucket->displayBucket_ = nextBucket; | |||
| 593 | continue; | |||
| 594 | } | |||
| 595 | } | |||
| 596 | nextBucket = bucket; | |||
| 597 | } | |||
| 598 | ||||
| 599 | LocalPointer<UVector> publicBucketList(new UVector(errorCode), errorCode); | |||
| 600 | if (U_FAILURE(errorCode)) { | |||
| 601 | return NULL__null; | |||
| 602 | } | |||
| 603 | // Do not call publicBucketList->setDeleter(): | |||
| 604 | // This vector shares its objects with the bucketList. | |||
| 605 | for (int32_t j = 0; j < bucketList->size(); ++j) { | |||
| 606 | Bucket *bucket = getBucket(*bucketList, j); | |||
| 607 | if (bucket->displayBucket_ == NULL__null) { | |||
| 608 | publicBucketList->addElement(bucket, errorCode); | |||
| 609 | } | |||
| 610 | } | |||
| 611 | if (U_FAILURE(errorCode)) { return NULL__null; } | |||
| 612 | BucketList *bl = new BucketList(bucketList.getAlias(), publicBucketList.getAlias()); | |||
| 613 | if (bl == NULL__null) { | |||
| 614 | errorCode = U_MEMORY_ALLOCATION_ERROR; | |||
| 615 | return NULL__null; | |||
| 616 | } | |||
| 617 | bucketList.orphan(); | |||
| 618 | publicBucketList.orphan(); | |||
| 619 | return bl; | |||
| 620 | } | |||
| 621 | ||||
| 622 | /** | |||
| 623 | * Creates an index, and buckets and sorts the list of records into the index. | |||
| 624 | */ | |||
| 625 | void AlphabeticIndex::initBuckets(UErrorCode &errorCode) { | |||
| 626 | if (U_FAILURE(errorCode) || buckets_ != NULL__null) { | |||
| 627 | return; | |||
| 628 | } | |||
| 629 | buckets_ = createBucketList(errorCode); | |||
| 630 | if (U_FAILURE(errorCode) || inputList_ == NULL__null || inputList_->isEmpty()) { | |||
| 631 | return; | |||
| 632 | } | |||
| 633 | ||||
| 634 | // Sort the records by name. | |||
| 635 | // Stable sort preserves input order of collation duplicates. | |||
| 636 | inputList_->sortWithUComparator(recordCompareFn, collator_, errorCode); | |||
| 637 | ||||
| 638 | // Now, we traverse all of the input, which is now sorted. | |||
| 639 | // If the item doesn't go in the current bucket, we find the next bucket that contains it. | |||
| 640 | // This makes the process order n*log(n), since we just sort the list and then do a linear process. | |||
| 641 | // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so | |||
| 642 | // we need to improve it for that case. | |||
| 643 | ||||
| 644 | Bucket *currentBucket = getBucket(*buckets_->bucketList_, 0); | |||
| 645 | int32_t bucketIndex = 1; | |||
| 646 | Bucket *nextBucket; | |||
| 647 | const UnicodeString *upperBoundary; | |||
| 648 | if (bucketIndex < buckets_->bucketList_->size()) { | |||
| 649 | nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++); | |||
| 650 | upperBoundary = &nextBucket->lowerBoundary_; | |||
| 651 | } else { | |||
| 652 | nextBucket = NULL__null; | |||
| 653 | upperBoundary = NULL__null; | |||
| 654 | } | |||
| 655 | for (int32_t i = 0; i < inputList_->size(); ++i) { | |||
| 656 | Record *r = getRecord(*inputList_, i); | |||
| 657 | // if the current bucket isn't the right one, find the one that is | |||
| 658 | // We have a special flag for the last bucket so that we don't look any further | |||
| 659 | while (upperBoundary != NULL__null && | |||
| 660 | collatorPrimaryOnly_->compare(r->name_, *upperBoundary, errorCode) >= 0) { | |||
| 661 | currentBucket = nextBucket; | |||
| 662 | // now reset the boundary that we compare against | |||
| 663 | if (bucketIndex < buckets_->bucketList_->size()) { | |||
| 664 | nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++); | |||
| 665 | upperBoundary = &nextBucket->lowerBoundary_; | |||
| 666 | } else { | |||
| 667 | upperBoundary = NULL__null; | |||
| 668 | } | |||
| 669 | } | |||
| 670 | // now put the record into the bucket. | |||
| 671 | Bucket *bucket = currentBucket; | |||
| 672 | if (bucket->displayBucket_ != NULL__null) { | |||
| 673 | bucket = bucket->displayBucket_; | |||
| 674 | } | |||
| 675 | if (bucket->records_ == NULL__null) { | |||
| 676 | LocalPointer<UVector> records(new UVector(errorCode), errorCode); | |||
| 677 | if (U_FAILURE(errorCode)) { | |||
| 678 | return; | |||
| 679 | } | |||
| 680 | bucket->records_ = records.orphan(); | |||
| 681 | } | |||
| 682 | bucket->records_->addElement(r, errorCode); | |||
| 683 | } | |||
| 684 | } | |||
| 685 | ||||
| 686 | void AlphabeticIndex::clearBuckets() { | |||
| 687 | if (buckets_ != NULL__null) { | |||
| 688 | delete buckets_; | |||
| 689 | buckets_ = NULL__null; | |||
| 690 | internalResetBucketIterator(); | |||
| 691 | } | |||
| 692 | } | |||
| 693 | ||||
| 694 | void AlphabeticIndex::internalResetBucketIterator() { | |||
| 695 | labelsIterIndex_ = -1; | |||
| 696 | currentBucket_ = NULL__null; | |||
| 697 | } | |||
| 698 | ||||
| 699 | ||||
| 700 | void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) { | |||
| 701 | LocalULocaleDataPointer uld(ulocdata_openulocdata_open_71(locale.getName(), &status)); | |||
| 702 | if (U_FAILURE(status)) { | |||
| 703 | return; | |||
| 704 | } | |||
| 705 | ||||
| 706 | UnicodeSet exemplars; | |||
| 707 | ulocdata_getExemplarSetulocdata_getExemplarSet_71(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status); | |||
| 708 | if (U_SUCCESS(status)) { | |||
| 709 | initialLabels_->addAll(exemplars); | |||
| 710 | return; | |||
| 711 | } | |||
| 712 | status = U_ZERO_ERROR; // Clear out U_MISSING_RESOURCE_ERROR | |||
| 713 | ||||
| 714 | // The locale data did not include explicit Index characters. | |||
| 715 | // Synthesize a set of them from the locale's standard exemplar characters. | |||
| 716 | ulocdata_getExemplarSetulocdata_getExemplarSet_71(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status); | |||
| 717 | if (U_FAILURE(status)) { | |||
| 718 | return; | |||
| 719 | } | |||
| 720 | ||||
| 721 | // question: should we add auxiliary exemplars? | |||
| 722 | if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.isEmpty()) { | |||
| 723 | exemplars.add(0x61, 0x7A); | |||
| 724 | } | |||
| 725 | if (exemplars.containsSome(0xAC00, 0xD7A3)) { // Hangul syllables | |||
| 726 | // cut down to small list | |||
| 727 | exemplars.remove(0xAC00, 0xD7A3). | |||
| 728 | add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C). | |||
| 729 | add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544). | |||
| 730 | add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0). | |||
| 731 | add(0xD30C).add(0xD558); | |||
| 732 | } | |||
| 733 | if (exemplars.containsSome(0x1200, 0x137F)) { // Ethiopic block | |||
| 734 | // cut down to small list | |||
| 735 | // make use of the fact that Ethiopic is allocated in 8's, where | |||
| 736 | // the base is 0 mod 8. | |||
| 737 | UnicodeSet ethiopic(UnicodeString(u"[ሀለሐመሠረሰሸቀቈቐቘበቨተቸኀኈነኘአከኰኸዀወዐዘዠየደዸጀገጐጘጠጨጰጸፀፈፐፘ]"), status); | |||
| 738 | ethiopic.retainAll(exemplars); | |||
| 739 | exemplars.remove(u'ሀ', 0x137F).addAll(ethiopic); | |||
| 740 | } | |||
| 741 | ||||
| 742 | // Upper-case any that aren't already so. | |||
| 743 | // (We only do this for synthesized index characters.) | |||
| 744 | UnicodeSetIterator it(exemplars); | |||
| 745 | UnicodeString upperC; | |||
| 746 | while (it.next()) { | |||
| 747 | const UnicodeString &exemplarC = it.getString(); | |||
| 748 | upperC = exemplarC; | |||
| 749 | upperC.toUpper(locale); | |||
| 750 | initialLabels_->add(upperC); | |||
| 751 | } | |||
| 752 | } | |||
| 753 | ||||
| 754 | UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) { | |||
| 755 | UnicodeSet contractions; | |||
| 756 | collatorPrimaryOnly_->internalAddContractions(BASE[0], contractions, errorCode); | |||
| 757 | if (U_FAILURE(errorCode) || contractions.isEmpty()) { return FALSE0; } | |||
| 758 | initialLabels_->addAll(contractions); | |||
| 759 | UnicodeSetIterator iter(contractions); | |||
| 760 | while (iter.next()) { | |||
| 761 | const UnicodeString &s = iter.getString(); | |||
| 762 | U_ASSERT (s.startsWith(BASE, BASE_LENGTH))(void)0; | |||
| 763 | UChar c = s.charAt(s.length() - 1); | |||
| 764 | if (0x41 <= c && c <= 0x5A) { // A-Z | |||
| 765 | // There are Pinyin labels, add ASCII A-Z labels as well. | |||
| 766 | initialLabels_->add(0x41, 0x5A); // A-Z | |||
| 767 | break; | |||
| 768 | } | |||
| 769 | } | |||
| 770 | return TRUE1; | |||
| 771 | } | |||
| 772 | ||||
| 773 | ||||
| 774 | /* | |||
| 775 | * Return the string with interspersed CGJs. Input must have more than 2 codepoints. | |||
| 776 | */ | |||
| 777 | static const UChar CGJ = 0x034F; | |||
| 778 | UnicodeString AlphabeticIndex::separated(const UnicodeString &item) { | |||
| 779 | UnicodeString result; | |||
| 780 | if (item.length() == 0) { | |||
| 781 | return result; | |||
| 782 | } | |||
| 783 | int32_t i = 0; | |||
| 784 | for (;;) { | |||
| 785 | UChar32 cp = item.char32At(i); | |||
| 786 | result.append(cp); | |||
| 787 | i = item.moveIndex32(i, 1); | |||
| 788 | if (i >= item.length()) { | |||
| 789 | break; | |||
| 790 | } | |||
| 791 | result.append(CGJ); | |||
| 792 | } | |||
| 793 | return result; | |||
| 794 | } | |||
| 795 | ||||
| 796 | ||||
| 797 | bool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const { | |||
| 798 | return false; | |||
| 799 | } | |||
| 800 | ||||
| 801 | ||||
| 802 | bool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const { | |||
| 803 | return false; | |||
| 804 | } | |||
| 805 | ||||
| 806 | ||||
| 807 | const RuleBasedCollator &AlphabeticIndex::getCollator() const { | |||
| 808 | return *collator_; | |||
| 809 | } | |||
| 810 | ||||
| 811 | ||||
| 812 | const UnicodeString &AlphabeticIndex::getInflowLabel() const { | |||
| 813 | return inflowLabel_; | |||
| 814 | } | |||
| 815 | ||||
| 816 | const UnicodeString &AlphabeticIndex::getOverflowLabel() const { | |||
| 817 | return overflowLabel_; | |||
| 818 | } | |||
| 819 | ||||
| 820 | ||||
| 821 | const UnicodeString &AlphabeticIndex::getUnderflowLabel() const { | |||
| 822 | return underflowLabel_; | |||
| 823 | } | |||
| 824 | ||||
| 825 | ||||
| 826 | AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { | |||
| 827 | inflowLabel_ = label; | |||
| 828 | clearBuckets(); | |||
| 829 | return *this; | |||
| 830 | } | |||
| 831 | ||||
| 832 | ||||
| 833 | AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { | |||
| 834 | overflowLabel_ = label; | |||
| 835 | clearBuckets(); | |||
| 836 | return *this; | |||
| 837 | } | |||
| 838 | ||||
| 839 | ||||
| 840 | AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { | |||
| 841 | underflowLabel_ = label; | |||
| 842 | clearBuckets(); | |||
| 843 | return *this; | |||
| 844 | } | |||
| 845 | ||||
| 846 | ||||
| 847 | int32_t AlphabeticIndex::getMaxLabelCount() const { | |||
| 848 | return maxLabelCount_; | |||
| 849 | } | |||
| 850 | ||||
| 851 | ||||
| 852 | AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) { | |||
| 853 | if (U_FAILURE(status)) { | |||
| 854 | return *this; | |||
| 855 | } | |||
| 856 | if (maxLabelCount <= 0) { | |||
| 857 | status = U_ILLEGAL_ARGUMENT_ERROR; | |||
| 858 | return *this; | |||
| 859 | } | |||
| 860 | maxLabelCount_ = maxLabelCount; | |||
| 861 | clearBuckets(); | |||
| 862 | return *this; | |||
| 863 | } | |||
| 864 | ||||
| 865 | ||||
| 866 | // | |||
| 867 | // init() - Common code for constructors. | |||
| 868 | // | |||
| 869 | ||||
| 870 | void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) { | |||
| 871 | if (U_FAILURE(status)) { return; } | |||
| 872 | if (locale
| |||
| 873 | status = U_ILLEGAL_ARGUMENT_ERROR; | |||
| 874 | return; | |||
| 875 | } | |||
| 876 | ||||
| 877 | initialLabels_ = new UnicodeSet(); | |||
| 878 | if (initialLabels_ == NULL__null) { | |||
| 879 | status = U_MEMORY_ALLOCATION_ERROR; | |||
| 880 | return; | |||
| 881 | } | |||
| 882 | ||||
| 883 | inflowLabel_.setTo((UChar)0x2026); // Ellipsis | |||
| 884 | overflowLabel_ = inflowLabel_; | |||
| 885 | underflowLabel_ = inflowLabel_; | |||
| 886 | ||||
| 887 | if (collator_ == NULL__null) { | |||
| 888 | Collator *coll = Collator::createInstance(*locale, status); | |||
| ||||
| 889 | if (U_FAILURE(status)) { | |||
| 890 | delete coll; | |||
| 891 | return; | |||
| 892 | } | |||
| 893 | if (coll == NULL__null) { | |||
| 894 | status = U_MEMORY_ALLOCATION_ERROR; | |||
| 895 | return; | |||
| 896 | } | |||
| 897 | collator_ = dynamic_cast<RuleBasedCollator *>(coll); | |||
| 898 | if (collator_ == NULL__null) { | |||
| 899 | delete coll; | |||
| 900 | status = U_UNSUPPORTED_ERROR; | |||
| 901 | return; | |||
| 902 | } | |||
| 903 | } | |||
| 904 | collatorPrimaryOnly_ = collator_->clone(); | |||
| 905 | if (collatorPrimaryOnly_ == NULL__null) { | |||
| 906 | status = U_MEMORY_ALLOCATION_ERROR; | |||
| 907 | return; | |||
| 908 | } | |||
| 909 | collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status); | |||
| 910 | firstCharsInScripts_ = firstStringsInScript(status); | |||
| 911 | if (U_FAILURE(status)) { return; } | |||
| 912 | firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status); | |||
| 913 | // Guard against a degenerate collator where | |||
| 914 | // some script boundary strings are primary ignorable. | |||
| 915 | for (;;) { | |||
| 916 | if (U_FAILURE(status)) { return; } | |||
| 917 | if (firstCharsInScripts_->isEmpty()) { | |||
| 918 | // AlphabeticIndex requires some non-ignorable script boundary strings. | |||
| 919 | status = U_ILLEGAL_ARGUMENT_ERROR; | |||
| 920 | return; | |||
| 921 | } | |||
| 922 | if (collatorPrimaryOnly_->compare( | |||
| 923 | *static_cast<UnicodeString *>(firstCharsInScripts_->elementAt(0)), | |||
| 924 | emptyString_, status) == UCOL_EQUAL) { | |||
| 925 | firstCharsInScripts_->removeElementAt(0); | |||
| 926 | } else { | |||
| 927 | break; | |||
| 928 | } | |||
| 929 | } | |||
| 930 | ||||
| 931 | // Chinese index characters, which are specific to each of the several Chinese tailorings, | |||
| 932 | // take precedence over the single locale data exemplar set per language. | |||
| 933 | if (!addChineseIndexCharacters(status) && locale != NULL__null) { | |||
| 934 | addIndexExemplars(*locale, status); | |||
| 935 | } | |||
| 936 | } | |||
| 937 | ||||
| 938 | ||||
| 939 | // | |||
| 940 | // Comparison function for UVector<UnicodeString *> sorting with a collator. | |||
| 941 | // | |||
| 942 | static int32_t U_CALLCONV | |||
| 943 | collatorComparator(const void *context, const void *left, const void *right) { | |||
| 944 | const UElement *leftElement = static_cast<const UElement *>(left); | |||
| 945 | const UElement *rightElement = static_cast<const UElement *>(right); | |||
| 946 | const UnicodeString *leftString = static_cast<const UnicodeString *>(leftElement->pointer); | |||
| 947 | const UnicodeString *rightString = static_cast<const UnicodeString *>(rightElement->pointer); | |||
| 948 | ||||
| 949 | if (leftString == rightString) { | |||
| 950 | // Catches case where both are NULL | |||
| 951 | return 0; | |||
| 952 | } | |||
| 953 | if (leftString == NULL__null) { | |||
| 954 | return 1; | |||
| 955 | } | |||
| 956 | if (rightString == NULL__null) { | |||
| 957 | return -1; | |||
| 958 | } | |||
| 959 | const Collator *col = static_cast<const Collator *>(context); | |||
| 960 | UErrorCode errorCode = U_ZERO_ERROR; | |||
| 961 | return col->compare(*leftString, *rightString, errorCode); | |||
| 962 | } | |||
| 963 | ||||
| 964 | // | |||
| 965 | // Comparison function for UVector<Record *> sorting with a collator. | |||
| 966 | // | |||
| 967 | static int32_t U_CALLCONV | |||
| 968 | recordCompareFn(const void *context, const void *left, const void *right) { | |||
| 969 | const UElement *leftElement = static_cast<const UElement *>(left); | |||
| 970 | const UElement *rightElement = static_cast<const UElement *>(right); | |||
| 971 | const AlphabeticIndex::Record *leftRec = static_cast<const AlphabeticIndex::Record *>(leftElement->pointer); | |||
| 972 | const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightElement->pointer); | |||
| 973 | const Collator *col = static_cast<const Collator *>(context); | |||
| 974 | UErrorCode errorCode = U_ZERO_ERROR; | |||
| 975 | return col->compare(leftRec->name_, rightRec->name_, errorCode); | |||
| 976 | } | |||
| 977 | ||||
| 978 | UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) { | |||
| 979 | if (U_FAILURE(status)) { | |||
| 980 | return NULL__null; | |||
| 981 | } | |||
| 982 | LocalPointer<UVector> dest(new UVector(status), status); | |||
| 983 | if (U_FAILURE(status)) { | |||
| 984 | return NULL__null; | |||
| 985 | } | |||
| 986 | dest->setDeleter(uprv_deleteUObjectuprv_deleteUObject_71); | |||
| 987 | // Fetch the script-first-primary contractions which are defined in the root collator. | |||
| 988 | // They all start with U+FDD1. | |||
| 989 | UnicodeSet set; | |||
| 990 | collatorPrimaryOnly_->internalAddContractions(0xFDD1, set, status); | |||
| 991 | if (U_FAILURE(status)) { | |||
| 992 | return NULL__null; | |||
| 993 | } | |||
| 994 | if (set.isEmpty()) { | |||
| 995 | status = U_UNSUPPORTED_ERROR; | |||
| 996 | return NULL__null; | |||
| 997 | } | |||
| 998 | UnicodeSetIterator iter(set); | |||
| 999 | while (iter.next()) { | |||
| 1000 | const UnicodeString &boundary = iter.getString(); | |||
| 1001 | uint32_t gcMask = U_GET_GC_MASK(boundary.char32At(1))((uint32_t)1<<(u_charType_71(boundary.char32At(1)))); | |||
| 1002 | if ((gcMask & (U_GC_L_MASK(((uint32_t)1<<(U_UPPERCASE_LETTER))|((uint32_t)1<< (U_LOWERCASE_LETTER))|((uint32_t)1<<(U_TITLECASE_LETTER ))|((uint32_t)1<<(U_MODIFIER_LETTER))|((uint32_t)1<< (U_OTHER_LETTER))) | U_GC_CN_MASK((uint32_t)1<<(U_GENERAL_OTHER_TYPES)))) == 0) { | |||
| 1003 | // Ignore boundaries for the special reordering groups. | |||
| 1004 | // Take only those for "real scripts" (where the sample character is a Letter, | |||
| 1005 | // and the one for unassigned implicit weights (Cn). | |||
| 1006 | continue; | |||
| 1007 | } | |||
| 1008 | LocalPointer<UnicodeString> s(new UnicodeString(boundary), status); | |||
| 1009 | dest->adoptElement(s.orphan(), status); | |||
| 1010 | if (U_FAILURE(status)) { | |||
| 1011 | return nullptr; | |||
| 1012 | } | |||
| 1013 | } | |||
| 1014 | return dest.orphan(); | |||
| 1015 | } | |||
| 1016 | ||||
| 1017 | ||||
| 1018 | namespace { | |||
| 1019 | ||||
| 1020 | /** | |||
| 1021 | * Returns true if one index character string is "better" than the other. | |||
| 1022 | * Shorter NFKD is better, and otherwise NFKD-binary-less-than is | |||
| 1023 | * better, and otherwise binary-less-than is better. | |||
| 1024 | */ | |||
| 1025 | UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer, | |||
| 1026 | const UnicodeString &one, const UnicodeString &other) { | |||
| 1027 | // This is called with primary-equal strings, but never with one.equals(other). | |||
| 1028 | UErrorCode status = U_ZERO_ERROR; | |||
| 1029 | UnicodeString n1 = nfkdNormalizer.normalize(one, status); | |||
| 1030 | UnicodeString n2 = nfkdNormalizer.normalize(other, status); | |||
| 1031 | if (U_FAILURE(status)) { return FALSE0; } | |||
| 1032 | int32_t result = n1.countChar32() - n2.countChar32(); | |||
| 1033 | if (result != 0) { | |||
| 1034 | return result < 0; | |||
| 1035 | } | |||
| 1036 | result = n1.compareCodePointOrder(n2); | |||
| 1037 | if (result != 0) { | |||
| 1038 | return result < 0; | |||
| 1039 | } | |||
| 1040 | return one.compareCodePointOrder(other) < 0; | |||
| 1041 | } | |||
| 1042 | ||||
| 1043 | } // namespace | |||
| 1044 | ||||
| 1045 | // | |||
| 1046 | // Constructor & Destructor for AlphabeticIndex::Record | |||
| 1047 | // | |||
| 1048 | // Records are internal only, instances are not directly surfaced in the public API. | |||
| 1049 | // This class is mostly struct-like, with all public fields. | |||
| 1050 | ||||
| 1051 | AlphabeticIndex::Record::Record(const UnicodeString &name, const void *data) | |||
| 1052 | : name_(name), data_(data) {} | |||
| 1053 | ||||
| 1054 | AlphabeticIndex::Record::~Record() { | |||
| 1055 | } | |||
| 1056 | ||||
| 1057 | ||||
| 1058 | AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) { | |||
| 1059 | if (U_FAILURE(status)) { | |||
| 1060 | return *this; | |||
| 1061 | } | |||
| 1062 | if (inputList_ == NULL__null) { | |||
| 1063 | LocalPointer<UVector> inputList(new UVector(status), status); | |||
| 1064 | if (U_FAILURE(status)) { | |||
| 1065 | return *this; | |||
| 1066 | } | |||
| 1067 | inputList_ = inputList.orphan(); | |||
| 1068 | inputList_->setDeleter(alphaIndex_deleteRecord); | |||
| 1069 | } | |||
| 1070 | LocalPointer<Record> r(new Record(name, data), status); | |||
| 1071 | inputList_->adoptElement(r.orphan(), status); | |||
| 1072 | if (U_FAILURE(status)) { | |||
| 1073 | return *this; | |||
| 1074 | } | |||
| 1075 | clearBuckets(); | |||
| 1076 | //std::string ss; | |||
| 1077 | //std::string ss2; | |||
| 1078 | //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" << | |||
| 1079 | // " sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl; | |||
| 1080 | return *this; | |||
| 1081 | } | |||
| 1082 | ||||
| 1083 | ||||
| 1084 | AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) { | |||
| 1085 | if (U_SUCCESS(status) && inputList_ != NULL__null && !inputList_->isEmpty()) { | |||
| 1086 | inputList_->removeAllElements(); | |||
| 1087 | clearBuckets(); | |||
| 1088 | } | |||
| 1089 | return *this; | |||
| 1090 | } | |||
| 1091 | ||||
| 1092 | int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) { | |||
| 1093 | initBuckets(status); | |||
| 1094 | if (U_FAILURE(status)) { | |||
| 1095 | return 0; | |||
| 1096 | } | |||
| 1097 | return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, status); | |||
| 1098 | } | |||
| 1099 | ||||
| 1100 | ||||
| 1101 | int32_t AlphabeticIndex::getBucketIndex() const { | |||
| 1102 | return labelsIterIndex_; | |||
| 1103 | } | |||
| 1104 | ||||
| 1105 | ||||
| 1106 | UBool AlphabeticIndex::nextBucket(UErrorCode &status) { | |||
| 1107 | if (U_FAILURE(status)) { | |||
| 1108 | return FALSE0; | |||
| 1109 | } | |||
| 1110 | if (buckets_ == NULL__null && currentBucket_ != NULL__null) { | |||
| 1111 | status = U_ENUM_OUT_OF_SYNC_ERROR; | |||
| 1112 | return FALSE0; | |||
| 1113 | } | |||
| 1114 | initBuckets(status); | |||
| 1115 | if (U_FAILURE(status)) { | |||
| 1116 | return FALSE0; | |||
| 1117 | } | |||
| 1118 | ++labelsIterIndex_; | |||
| 1119 | if (labelsIterIndex_ >= buckets_->getBucketCount()) { | |||
| 1120 | labelsIterIndex_ = buckets_->getBucketCount(); | |||
| 1121 | return FALSE0; | |||
| 1122 | } | |||
| 1123 | currentBucket_ = getBucket(*buckets_->immutableVisibleList_, labelsIterIndex_); | |||
| 1124 | resetRecordIterator(); | |||
| 1125 | return TRUE1; | |||
| 1126 | } | |||
| 1127 | ||||
| 1128 | const UnicodeString &AlphabeticIndex::getBucketLabel() const { | |||
| 1129 | if (currentBucket_ != NULL__null) { | |||
| 1130 | return currentBucket_->label_; | |||
| 1131 | } else { | |||
| 1132 | return emptyString_; | |||
| 1133 | } | |||
| 1134 | } | |||
| 1135 | ||||
| 1136 | ||||
| 1137 | UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const { | |||
| 1138 | if (currentBucket_ != NULL__null) { | |||
| 1139 | return currentBucket_->labelType_; | |||
| 1140 | } else { | |||
| 1141 | return U_ALPHAINDEX_NORMAL; | |||
| 1142 | } | |||
| 1143 | } | |||
| 1144 | ||||
| 1145 | ||||
| 1146 | int32_t AlphabeticIndex::getBucketRecordCount() const { | |||
| 1147 | if (currentBucket_ != NULL__null && currentBucket_->records_ != NULL__null) { | |||
| 1148 | return currentBucket_->records_->size(); | |||
| 1149 | } else { | |||
| 1150 | return 0; | |||
| 1151 | } | |||
| 1152 | } | |||
| 1153 | ||||
| 1154 | ||||
| 1155 | AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) { | |||
| 1156 | if (U_FAILURE(status)) { | |||
| 1157 | return *this; | |||
| 1158 | } | |||
| 1159 | internalResetBucketIterator(); | |||
| 1160 | return *this; | |||
| 1161 | } | |||
| 1162 | ||||
| 1163 | ||||
| 1164 | UBool AlphabeticIndex::nextRecord(UErrorCode &status) { | |||
| 1165 | if (U_FAILURE(status)) { | |||
| 1166 | return FALSE0; | |||
| 1167 | } | |||
| 1168 | if (currentBucket_ == NULL__null) { | |||
| 1169 | // We are trying to iterate over the items in a bucket, but there is no | |||
| 1170 | // current bucket from the enumeration of buckets. | |||
| 1171 | status = U_INVALID_STATE_ERROR; | |||
| 1172 | return FALSE0; | |||
| 1173 | } | |||
| 1174 | if (buckets_ == NULL__null) { | |||
| 1175 | status = U_ENUM_OUT_OF_SYNC_ERROR; | |||
| 1176 | return FALSE0; | |||
| 1177 | } | |||
| 1178 | if (currentBucket_->records_ == NULL__null) { | |||
| 1179 | return FALSE0; | |||
| 1180 | } | |||
| 1181 | ++itemsIterIndex_; | |||
| 1182 | if (itemsIterIndex_ >= currentBucket_->records_->size()) { | |||
| 1183 | itemsIterIndex_ = currentBucket_->records_->size(); | |||
| 1184 | return FALSE0; | |||
| 1185 | } | |||
| 1186 | return TRUE1; | |||
| 1187 | } | |||
| 1188 | ||||
| 1189 | ||||
| 1190 | const UnicodeString &AlphabeticIndex::getRecordName() const { | |||
| 1191 | const UnicodeString *retStr = &emptyString_; | |||
| 1192 | if (currentBucket_ != NULL__null && currentBucket_->records_ != NULL__null && | |||
| 1193 | itemsIterIndex_ >= 0 && | |||
| 1194 | itemsIterIndex_ < currentBucket_->records_->size()) { | |||
| 1195 | Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_)); | |||
| 1196 | retStr = &item->name_; | |||
| 1197 | } | |||
| 1198 | return *retStr; | |||
| 1199 | } | |||
| 1200 | ||||
| 1201 | const void *AlphabeticIndex::getRecordData() const { | |||
| 1202 | const void *retPtr = NULL__null; | |||
| 1203 | if (currentBucket_ != NULL__null && currentBucket_->records_ != NULL__null && | |||
| 1204 | itemsIterIndex_ >= 0 && | |||
| 1205 | itemsIterIndex_ < currentBucket_->records_->size()) { | |||
| 1206 | Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_)); | |||
| 1207 | retPtr = item->data_; | |||
| 1208 | } | |||
| 1209 | return retPtr; | |||
| 1210 | } | |||
| 1211 | ||||
| 1212 | ||||
| 1213 | AlphabeticIndex & AlphabeticIndex::resetRecordIterator() { | |||
| 1214 | itemsIterIndex_ = -1; | |||
| 1215 | return *this; | |||
| 1216 | } | |||
| 1217 | ||||
| 1218 | ||||
| 1219 | ||||
| 1220 | AlphabeticIndex::Bucket::Bucket(const UnicodeString &label, | |||
| 1221 | const UnicodeString &lowerBoundary, | |||
| 1222 | UAlphabeticIndexLabelType type) | |||
| 1223 | : label_(label), lowerBoundary_(lowerBoundary), labelType_(type), | |||
| 1224 | displayBucket_(NULL__null), displayIndex_(-1), | |||
| 1225 | records_(NULL__null) { | |||
| 1226 | } | |||
| 1227 | ||||
| 1228 | ||||
| 1229 | AlphabeticIndex::Bucket::~Bucket() { | |||
| 1230 | delete records_; | |||
| 1231 | } | |||
| 1232 | ||||
| 1233 | U_NAMESPACE_END} | |||
| 1234 | ||||
| 1235 | #endif // !UCONFIG_NO_COLLATION |