Bug Summary

File: out/../deps/icu-small/source/common/unormcmp.cpp
Warning: line 332, column 23
Assigned value is garbage or undefined

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-unknown-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name unormcmp.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -pic-is-pie -mframe-pointer=all -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/home/maurizio/node-v18.6.0/out -resource-dir /usr/local/lib/clang/16.0.0 -D V8_DEPRECATION_WARNINGS -D V8_IMMINENT_DEPRECATION_WARNINGS -D _GLIBCXX_USE_CXX11_ABI=1 -D NODE_OPENSSL_CONF_NAME=nodejs_conf -D NODE_OPENSSL_HAS_QUIC -D __STDC_FORMAT_MACROS -D OPENSSL_NO_PINSHARED -D OPENSSL_THREADS -D U_COMMON_IMPLEMENTATION=1 -D U_ATTRIBUTE_DEPRECATED= -D _CRT_SECURE_NO_DEPRECATE= -D U_STATIC_IMPLEMENTATION=1 -D UCONFIG_NO_SERVICE=1 -D U_ENABLE_DYLOAD=0 -D U_HAVE_STD_STRING=1 -D UCONFIG_NO_BREAK_ITERATION=0 -I ../deps/icu-small/source/common -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../include/c++/8 -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../include/c++/8/x86_64-redhat-linux -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../include/c++/8/backward -internal-isystem /usr/local/lib/clang/16.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../x86_64-redhat-linux/include -internal-externc-isystem 
/include -internal-externc-isystem /usr/include -O3 -Wno-unused-parameter -Wno-deprecated-declarations -Wno-strict-aliasing -std=gnu++17 -fdeprecated-macro -fdebug-compilation-dir=/home/maurizio/node-v18.6.0/out -ferror-limit 19 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2022-08-22-142216-507842-1 -x c++ ../deps/icu-small/source/common/unormcmp.cpp
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6* Copyright (C) 2001-2014, International Business Machines
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: unormcmp.cpp
11* encoding: UTF-8
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2004sep13
16* created by: Markus W. Scherer
17*
18* unorm_compare() function moved here from unorm.cpp for better modularization.
19* Depends on both normalization and case folding.
20* Allows unorm.cpp to not depend on any character properties code.
21*/
22
23#include "unicode/utypes.h"
24
25#if !UCONFIG_NO_NORMALIZATION0
26
27#include "unicode/unorm.h"
28#include "unicode/ustring.h"
29#include "cmemory.h"
30#include "normalizer2impl.h"
31#include "ucase.h"
32#include "uprops.h"
33#include "ustr_imp.h"
34
35U_NAMESPACE_USEusing namespace icu_71;
36
37/* compare canonically equivalent ------------------------------------------- */
38
39/*
40 * Compare two strings for canonical equivalence.
41 * Further options include case-insensitive comparison and
42 * code point order (as opposed to code unit order).
43 *
44 * In this function, canonical equivalence is optional as well.
45 * If canonical equivalence is tested, then both strings must fulfill
46 * the FCD check.
47 *
48 * Semantically, this is equivalent to
49 * strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2)))
50 * where code point order, NFD and foldCase are all optional.
51 *
52 * String comparisons almost always yield results before processing both strings
53 * completely.
54 * They are generally more efficient working incrementally instead of
55 * performing the sub-processing (strlen, normalization, case-folding)
56 * on the entire strings first.
57 *
58 * It is also unnecessary to normalize identical characters.
59 *
60 * This function works in principle as follows:
61 *
62 * loop {
63 * get one code unit c1 from s1 (-1 if end of source)
64 * get one code unit c2 from s2 (-1 if end of source)
65 *
66 * if(either string finished) {
67 * return result;
68 * }
69 * if(c1==c2) {
70 * continue;
71 * }
72 *
73 * // c1!=c2
74 * try to decompose/case-fold c1/c2, and continue if one does;
75 *
76 * // still c1!=c2 and neither decomposes/case-folds, return result
77 * return c1-c2;
78 * }
79 *
80 * When a character decomposes, then the pointer for that source changes to
81 * the decomposition, pushing the previous pointer onto a stack.
82 * When the end of the decomposition is reached, then the code unit reader
83 * pops the previous source from the stack.
84 * (Same for case-folding.)
85 *
86 * This is complicated further by operating on variable-width UTF-16.
87 * The top part of the loop works on code units, while lookups for decomposition
88 * and case-folding need code points.
89 * Code points are assembled after the equality/end-of-source part.
90 * The source pointer is only advanced beyond all code units when the code point
91 * actually decomposes/case-folds.
92 *
93 * If we were on a trail surrogate unit when assembling a code point,
94 * and the code point decomposes/case-folds, then the decomposition/folding
95 * result must be compared with the part of the other string that corresponds to
96 * this string's lead surrogate.
97 * Since we only assemble a code point when hitting a trail unit when the
98 * preceding lead units were identical, we back up the other string by one unit
99 * in such a case.
100 *
101 * The optional code point order comparison at the end works with
102 * the same fix-up as the other code point order comparison functions.
103 * See ustring.c and the comment near the end of this function.
104 *
105 * Assumption: A decomposition or case-folding result string never contains
106 * a single surrogate. This is a safe assumption in the Unicode Standard.
107 * Therefore, we do not need to check for surrogate pairs across
108 * decomposition/case-folding boundaries.
109 *
110 * Further assumptions (see verifications in tstnorm.cpp):
111 * The API function checks for FCD first, while the core function
112 * first case-folds and then decomposes. This requires that case-folding does not
113 * un-FCD any strings.
114 *
115 * The API function may also NFD the input and turn off decomposition.
116 * This requires that case-folding does not un-NFD strings either.
117 *
118 * TODO If either of the above two assumptions is violated,
119 * then this entire code must be re-thought.
120 * If this happens, then a simple solution is to case-fold both strings up front
121 * and to turn off UNORM_INPUT_IS_FCD.
122 * We already do this when not both strings are in FCD because makeFCD
123 * would be a partial NFD before the case folding, which does not work.
124 * Note that all of this is only a problem when case-folding _and_
125 * canonical equivalence come together.
126 * (Comments in unorm_compare() are more up to date than this TODO.)
127 */
128
129/* stack element for previous-level source/decomposition pointers */
130struct CmpEquivLevel {
131 const UChar *start, *s, *limit;
132};
133typedef struct CmpEquivLevel CmpEquivLevel;
134
135/**
136 * Internal option for unorm_cmpEquivFold() for decomposing.
137 * If not set, just do strcasecmp().
138 */
139#define _COMPARE_EQUIV0x80000 0x80000
140
141/* internal function */
142static int32_t
143unorm_cmpEquivFold(const UChar *s1, int32_t length1,
144 const UChar *s2, int32_t length2,
145 uint32_t options,
146 UErrorCode *pErrorCode) {
147 const Normalizer2Impl *nfcImpl;
148
149 /* current-level start/limit - s1/s2 as current */
150 const UChar *start1, *start2, *limit1, *limit2;
151
152 /* decomposition and case folding variables */
153 const UChar *p;
154 int32_t length;
155
156 /* stacks of previous-level start/current/limit */
157 CmpEquivLevel stack1[2], stack2[2];
158
159 /* buffers for algorithmic decompositions */
160 UChar decomp1[4], decomp2[4];
161
162 /* case folding buffers, only use current-level start/limit */
163 UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];
164
165 /* track which is the current level per string */
166 int32_t level1, level2;
167
168 /* current code units, and code points for lookups */
169 UChar32 c1, c2, cp1, cp2;
170
171 /* no argument error checking because this itself is not an API */
172
173 /*
174 * assume that at least one of the options _COMPARE_EQUIV and U_COMPARE_IGNORE_CASE is set
175 * otherwise this function must behave exactly as uprv_strCompare()
176 * not checking for that here makes testing this function easier
177 */
178
179 /* normalization/properties data loaded? */
180 if((options&_COMPARE_EQUIV0x80000)!=0) {
1
Assuming the condition is false
2
Taking false branch
181 nfcImpl=Normalizer2Factory::getNFCImpl(*pErrorCode);
182 } else {
183 nfcImpl=NULL__null;
184 }
185 if(U_FAILURE(*pErrorCode)) {
3
Taking false branch
186 return 0;
187 }
188
189 /* initialize */
190 start1=s1;
191 if(length1==-1) {
4
Assuming the condition is false
5
Taking false branch
192 limit1=NULL__null;
193 } else {
194 limit1=s1+length1;
195 }
196
197 start2=s2;
198 if(length2==-1) {
6
Assuming the condition is false
7
Taking false branch
199 limit2=NULL__null;
200 } else {
201 limit2=s2+length2;
202 }
203
204 level1=level2=0;
205 c1=c2=-1;
206
207 /* comparison loop */
208 for(;;) {
8
Loop condition is true. Entering loop body
38
Loop condition is true. Entering loop body
209 /*
210 * here a code unit value of -1 means "get another code unit"
211 * below it will mean "this source is finished"
212 */
213
214 if(c1
8.1
'c1' is < 0
<0
) {
9
Taking true branch
39
Assuming 'c1' is >= 0
40
Taking false branch
215 /* get next code unit from string 1, post-increment */
216 for(;;) {
217 if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL__null || (options&_STRNCMP_STYLE0x1000)))) {
10
Assuming 's1' is not equal to 'limit1'
11
Assuming the condition is false
218 if(level1==0) {
219 c1=-1;
220 break;
221 }
222 } else {
223 ++s1;
224 break;
225 }
226
227 /* reached end of level buffer, pop one level */
228 do {
229 --level1;
230 start1=stack1[level1].start; /*Not uninitialized*/
231 } while(start1==NULL__null);
232 s1=stack1[level1].s; /*Not uninitialized*/
233 limit1=stack1[level1].limit; /*Not uninitialized*/
234 }
235 }
236
237 if(c2
12.1
'c2' is < 0
40.1
'c2' is < 0
<0) {
12
Execution continues on line 237
13
Taking true branch
41
Taking true branch
238 /* get next code unit from string 2, post-increment */
239 for(;;) {
240 if(s2
41.1
's2' is not equal to 'limit2'
==limit2
|| ((c2=*s2)==0 && (limit2==NULL__null || (options&_STRNCMP_STYLE0x1000)))) {
14
Assuming 's2' is not equal to 'limit2'
15
Assuming the condition is false
241 if(level2==0) {
242 c2=-1;
243 break;
244 }
245 } else {
246 ++s2;
247 break;
248 }
249
250 /* reached end of level buffer, pop one level */
251 do {
252 --level2;
253 start2=stack2[level2].start; /*Not uninitialized*/
254 } while(start2==NULL__null);
255 s2=stack2[level2].s; /*Not uninitialized*/
256 limit2=stack2[level2].limit; /*Not uninitialized*/
257 }
258 }
259
260 /*
261 * compare c1 and c2
262 * either variable c1, c2 is -1 only if the corresponding string is finished
263 */
264 if(c1==c2) {
16
Execution continues on line 264
17
Assuming 'c1' is not equal to 'c2'
18
Taking false branch
42
Execution continues on line 264
43
Assuming 'c1' is not equal to 'c2'
44
Taking false branch
265 if(c1<0) {
266 return 0; /* c1==c2==-1 indicating end of strings */
267 }
268 c1=c2=-1; /* make us fetch new code units */
269 continue;
270 } else if(c1
18.1
'c1' is >= 0
44.1
'c1' is >= 0
<0) {
19
Taking false branch
45
Taking false branch
271 return -1; /* string 1 ends before string 2 */
272 } else if(c2
19.1
'c2' is >= 0
45.1
'c2' is >= 0
<0) {
20
Taking false branch
46
Taking false branch
273 return 1; /* string 2 ends before string 1 */
274 }
275 /* c1!=c2 && c1>=0 && c2>=0 */
276
277 /* get complete code points for c1, c2 for lookups if either is a surrogate */
278 cp1=c1;
279 if(U_IS_SURROGATE(c1)(((c1)&0xfffff800)==0xd800)) {
21
Assuming the condition is false
22
Taking false branch
47
Assuming the condition is true
48
Taking true branch
280 UChar c;
281
282 if(U_IS_SURROGATE_LEAD(c1)(((c1)&0x400)==0)) {
49
Assuming the condition is false
283 if(s1!=limit1 && U16_IS_TRAIL(c=*s1)(((c=*s1)&0xfffffc00)==0xdc00)) {
284 /* advance ++s1; only below if cp1 decomposes/case-folds */
285 cp1=U16_GET_SUPPLEMENTARY(c1, c)(((UChar32)(c1)<<10UL)+(UChar32)(c)-((0xd800<<10UL
)+0xdc00-0x10000))
;
286 }
287 } else /* isTrail(c1) */ {
288 if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))(((c=*(s1-2))&0xfffffc00)==0xd800)) {
289 cp1=U16_GET_SUPPLEMENTARY(c, c1)(((UChar32)(c)<<10UL)+(UChar32)(c1)-((0xd800<<10UL
)+0xdc00-0x10000))
;
290 }
291 }
292 }
293
294 cp2=c2;
295 if(U_IS_SURROGATE(c2)(((c2)&0xfffff800)==0xd800)) {
23
Assuming the condition is true
24
Taking true branch
50
Assuming the condition is false
296 UChar c;
297
298 if(U_IS_SURROGATE_LEAD(c2)(((c2)&0x400)==0)) {
25
Assuming the condition is false
299 if(s2!=limit2 && U16_IS_TRAIL(c=*s2)(((c=*s2)&0xfffffc00)==0xdc00)) {
300 /* advance ++s2; only below if cp2 decomposes/case-folds */
301 cp2=U16_GET_SUPPLEMENTARY(c2, c)(((UChar32)(c2)<<10UL)+(UChar32)(c)-((0xd800<<10UL
)+0xdc00-0x10000))
;
302 }
303 } else /* isTrail(c2) */ {
304 if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))(((c=*(s2-2))&0xfffffc00)==0xd800)) {
305 cp2=U16_GET_SUPPLEMENTARY(c, c2)(((UChar32)(c)<<10UL)+(UChar32)(c2)-((0xd800<<10UL
)+0xdc00-0x10000))
;
306 }
307 }
308 }
309
310 /*
311 * go down one level for each string
312 * continue with the main loop as soon as there is a real change
313 */
314
315 if( level1
25.1
'level1' is equal to 0
50.1
'level1' is equal to 0
==0 && (options&U_COMPARE_IGNORE_CASE0x10000) &&
26
Assuming the condition is true
52
Taking true branch
316 (length=ucase_toFullFoldingucase_toFullFolding_71((UChar32)cp1, &p, options))>=0
27
Assuming the condition is false
51
Assuming the condition is true
317 ) {
318 /* cp1 case-folds to the code point "length" or to p[length] */
319 if(U_IS_SURROGATE(c1)(((c1)&0xfffff800)==0xd800)) {
53
Taking true branch
320 if(U_IS_SURROGATE_LEAD(c1)(((c1)&0x400)==0)) {
54
Taking false branch
321 /* advance beyond source surrogate pair if it case-folds */
322 ++s1;
323 } else /* isTrail(c1) */ {
324 /*
325 * we got a supplementary code point when hitting its trail surrogate,
326 * therefore the lead surrogate must have been the same as in the other string;
327 * compare this case folding result with the lead surrogate in the other string;
328 * remember that this simulates bulk text replacement:
329 * the case folding result would replace the entire code point
330 */
331 --s2;
332 c2=*(s2-1);
55
Assigned value is garbage or undefined
333 }
334 }
335
336 /* push current level pointers */
337 stack1[0].start=start1;
338 stack1[0].s=s1;
339 stack1[0].limit=limit1;
340 ++level1;
341
342 /* copy the folding result to fold1[] */
343 if(length<=UCASE_MAX_STRING_LENGTH) {
344 u_memcpyu_memcpy_71(fold1, p, length);
345 } else {
346 int32_t i=0;
347 U16_APPEND_UNSAFE(fold1, i, length)do { if((uint32_t)(length)<=0xffff) { (fold1)[(i)++]=(uint16_t
)(length); } else { (fold1)[(i)++]=(uint16_t)(((length)>>
10)+0xd7c0); (fold1)[(i)++]=(uint16_t)(((length)&0x3ff)|0xdc00
); } } while (false)
;
348 length=i;
349 }
350
351 /* set next level pointers to case folding */
352 start1=s1=fold1;
353 limit1=fold1+length;
354
355 /* get ready to read from the case folding result, continue with loop */
356 c1=-1;
357 continue;
358 }
359
360 if( level2
27.1
'level2' is equal to 0
==0 && (options&U_COMPARE_IGNORE_CASE0x10000) &&
29
Taking true branch
361 (length=ucase_toFullFoldingucase_toFullFolding_71((UChar32)cp2, &p, options))>=0
28
Assuming the condition is true
362 ) {
363 /* cp2 case-folds to the code point "length" or to p[length] */
364 if(U_IS_SURROGATE(c2)(((c2)&0xfffff800)==0xd800)) {
30
Taking true branch
365 if(U_IS_SURROGATE_LEAD(c2)(((c2)&0x400)==0)) {
31
Taking false branch
366 /* advance beyond source surrogate pair if it case-folds */
367 ++s2;
368 } else /* isTrail(c2) */ {
369 /*
370 * we got a supplementary code point when hitting its trail surrogate,
371 * therefore the lead surrogate must have been the same as in the other string;
372 * compare this case folding result with the lead surrogate in the other string;
373 * remember that this simulates bulk text replacement:
374 * the case folding result would replace the entire code point
375 */
376 --s1;
377 c1=*(s1-1);
378 }
379 }
380
381 /* push current level pointers */
382 stack2[0].start=start2;
383 stack2[0].s=s2;
384 stack2[0].limit=limit2;
385 ++level2;
386
387 /* copy the folding result to fold2[] */
388 if(length<=UCASE_MAX_STRING_LENGTH) {
32
Assuming 'length' is > UCASE_MAX_STRING_LENGTH
33
Taking false branch
389 u_memcpyu_memcpy_71(fold2, p, length);
390 } else {
391 int32_t i=0;
392 U16_APPEND_UNSAFE(fold2, i, length)do { if((uint32_t)(length)<=0xffff) { (fold2)[(i)++]=(uint16_t
)(length); } else { (fold2)[(i)++]=(uint16_t)(((length)>>
10)+0xd7c0); (fold2)[(i)++]=(uint16_t)(((length)&0x3ff)|0xdc00
); } } while (false)
;
34
Assuming 'length' is <= 65535
35
Taking true branch
36
Loop condition is false. Exiting loop
393 length=i;
394 }
395
396 /* set next level pointers to case folding */
397 start2=s2=fold2;
398 limit2=fold2+length;
399
400 /* get ready to read from the case folding result, continue with loop */
401 c2=-1;
402 continue;
37
Execution continues on line 208
403 }
404
405 if( level1<2 && (options&_COMPARE_EQUIV0x80000) &&
406 0!=(p=nfcImpl->getDecomposition((UChar32)cp1, decomp1, length))
407 ) {
408 /* cp1 decomposes into p[length] */
409 if(U_IS_SURROGATE(c1)(((c1)&0xfffff800)==0xd800)) {
410 if(U_IS_SURROGATE_LEAD(c1)(((c1)&0x400)==0)) {
411 /* advance beyond source surrogate pair if it decomposes */
412 ++s1;
413 } else /* isTrail(c1) */ {
414 /*
415 * we got a supplementary code point when hitting its trail surrogate,
416 * therefore the lead surrogate must have been the same as in the other string;
417 * compare this decomposition with the lead surrogate in the other string
418 * remember that this simulates bulk text replacement:
419 * the decomposition would replace the entire code point
420 */
421 --s2;
422 c2=*(s2-1);
423 }
424 }
425
426 /* push current level pointers */
427 stack1[level1].start=start1;
428 stack1[level1].s=s1;
429 stack1[level1].limit=limit1;
430 ++level1;
431
432 /* set empty intermediate level if skipped */
433 if(level1<2) {
434 stack1[level1++].start=NULL__null;
435 }
436
437 /* set next level pointers to decomposition */
438 start1=s1=p;
439 limit1=p+length;
440
441 /* get ready to read from decomposition, continue with loop */
442 c1=-1;
443 continue;
444 }
445
446 if( level2<2 && (options&_COMPARE_EQUIV0x80000) &&
447 0!=(p=nfcImpl->getDecomposition((UChar32)cp2, decomp2, length))
448 ) {
449 /* cp2 decomposes into p[length] */
450 if(U_IS_SURROGATE(c2)(((c2)&0xfffff800)==0xd800)) {
451 if(U_IS_SURROGATE_LEAD(c2)(((c2)&0x400)==0)) {
452 /* advance beyond source surrogate pair if it decomposes */
453 ++s2;
454 } else /* isTrail(c2) */ {
455 /*
456 * we got a supplementary code point when hitting its trail surrogate,
457 * therefore the lead surrogate must have been the same as in the other string;
458 * compare this decomposition with the lead surrogate in the other string
459 * remember that this simulates bulk text replacement:
460 * the decomposition would replace the entire code point
461 */
462 --s1;
463 c1=*(s1-1);
464 }
465 }
466
467 /* push current level pointers */
468 stack2[level2].start=start2;
469 stack2[level2].s=s2;
470 stack2[level2].limit=limit2;
471 ++level2;
472
473 /* set empty intermediate level if skipped */
474 if(level2<2) {
475 stack2[level2++].start=NULL__null;
476 }
477
478 /* set next level pointers to decomposition */
479 start2=s2=p;
480 limit2=p+length;
481
482 /* get ready to read from decomposition, continue with loop */
483 c2=-1;
484 continue;
485 }
486
487 /*
488 * no decomposition/case folding, max level for both sides:
489 * return difference result
490 *
491 * code point order comparison must not just return cp1-cp2
492 * because when single surrogates are present then the surrogate pairs
493 * that formed cp1 and cp2 may be from different string indexes
494 *
495 * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
496 * c1=d800 cp1=10001 c2=dc00 cp2=10000
497 * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
498 *
499 * therefore, use same fix-up as in ustring.c/uprv_strCompare()
500 * except: uprv_strCompare() fetches c=*s while this function fetches c=*s++
501 * so we have slightly different pointer/start/limit comparisons here
502 */
503
504 if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER0x8000)) {
505 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
506 if(
507 (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)(((*s1)&0xfffffc00)==0xdc00)) ||
508 (U16_IS_TRAIL(c1)(((c1)&0xfffffc00)==0xdc00) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2))(((*(s1-2))&0xfffffc00)==0xd800))
509 ) {
510 /* part of a surrogate pair, leave >=d800 */
511 } else {
512 /* BMP code point - may be surrogate code point - make <d800 */
513 c1-=0x2800;
514 }
515
516 if(
517 (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)(((*s2)&0xfffffc00)==0xdc00)) ||
518 (U16_IS_TRAIL(c2)(((c2)&0xfffffc00)==0xdc00) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2))(((*(s2-2))&0xfffffc00)==0xd800))
519 ) {
520 /* part of a surrogate pair, leave >=d800 */
521 } else {
522 /* BMP code point - may be surrogate code point - make <d800 */
523 c2-=0x2800;
524 }
525 }
526
527 return c1-c2;
528 }
529}
530
531static
532UBool _normalize(const Normalizer2 *n2, const UChar *s, int32_t length,
533 UnicodeString &normalized, UErrorCode *pErrorCode) {
534 UnicodeString str(length<0, s, length);
535
536 // check whether s fulfills the conditions
537 int32_t spanQCYes=n2->spanQuickCheckYes(str, *pErrorCode);
538 if (U_FAILURE(*pErrorCode)) {
539 return FALSE0;
540 }
541 /*
542 * ICU 2.4 had a further optimization:
543 * If both strings were not in FCD, then they were both NFD'ed,
544 * and the _COMPARE_EQUIV option was turned off.
545 * It is not entirely clear that this is valid with the current
546 * definition of the canonical caseless match.
547 * Therefore, ICU 2.6 removes that optimization.
548 */
549 if(spanQCYes<str.length()) {
550 UnicodeString unnormalized=str.tempSubString(spanQCYes);
551 normalized.setTo(FALSE0, str.getBuffer(), spanQCYes);
552 n2->normalizeSecondAndAppend(normalized, unnormalized, *pErrorCode);
553 if (U_SUCCESS(*pErrorCode)) {
554 return TRUE1;
555 }
556 }
557 return FALSE0;
558}
559
560U_CAPIextern "C" int32_t U_EXPORT2
561unorm_compareunorm_compare_71(const UChar *s1, int32_t length1,
562 const UChar *s2, int32_t length2,
563 uint32_t options,
564 UErrorCode *pErrorCode) {
565 /* argument checking */
566 if(U_FAILURE(*pErrorCode)) {
567 return 0;
568 }
569 if(s1==0 || length1<-1 || s2==0 || length2<-1) {
570 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
571 return 0;
572 }
573
574 UnicodeString fcd1, fcd2;
575 int32_t normOptions=(int32_t)(options>>UNORM_COMPARE_NORM_OPTIONS_SHIFT20);
576 options|=_COMPARE_EQUIV0x80000;
577
578 /*
579 * UAX #21 Case Mappings, as fixed for Unicode version 4
580 * (see Jitterbug 2021), defines a canonical caseless match as
581 *
582 * A string X is a canonical caseless match
583 * for a string Y if and only if
584 * NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y)))
585 *
586 * For better performance, we check for FCD (or let the caller tell us that
587 * both strings are in FCD) for the inner normalization.
588 * BasicNormalizerTest::FindFoldFCDExceptions() makes sure that
589 * case-folding preserves the FCD-ness of a string.
590 * The outer normalization is then only performed by unorm_cmpEquivFold()
591 * when there is a difference.
592 *
593 * Exception: When using the Turkic case-folding option, we do perform
594 * full NFD first. This is because in the Turkic case precomposed characters
595 * with 0049 capital I or 0069 small i fold differently whether they
596 * are first decomposed or not, so an FCD check - a check only for
597 * canonical order - is not sufficient.
598 */
599 if(!(options&UNORM_INPUT_IS_FCD0x20000) || (options&U_FOLD_CASE_EXCLUDE_SPECIAL_I1)) {
600 const Normalizer2 *n2;
601 if(options&U_FOLD_CASE_EXCLUDE_SPECIAL_I1) {
602 n2=Normalizer2::getNFDInstance(*pErrorCode);
603 } else {
604 n2=Normalizer2Factory::getFCDInstance(*pErrorCode);
605 }
606 if (U_FAILURE(*pErrorCode)) {
607 return 0;
608 }
609
610 if(normOptions&UNORM_UNICODE_3_2) {
611 const UnicodeSet *uni32=uniset_getUnicode32Instanceuniset_getUnicode32Instance_71(*pErrorCode);
612 FilteredNormalizer2 fn2(*n2, *uni32);
613 if(_normalize(&fn2, s1, length1, fcd1, pErrorCode)) {
614 s1=fcd1.getBuffer();
615 length1=fcd1.length();
616 }
617 if(_normalize(&fn2, s2, length2, fcd2, pErrorCode)) {
618 s2=fcd2.getBuffer();
619 length2=fcd2.length();
620 }
621 } else {
622 if(_normalize(n2, s1, length1, fcd1, pErrorCode)) {
623 s1=fcd1.getBuffer();
624 length1=fcd1.length();
625 }
626 if(_normalize(n2, s2, length2, fcd2, pErrorCode)) {
627 s2=fcd2.getBuffer();
628 length2=fcd2.length();
629 }
630 }
631 }
632
633 if(U_SUCCESS(*pErrorCode)) {
634 return unorm_cmpEquivFold(s1, length1, s2, length2, options, pErrorCode);
635 } else {
636 return 0;
637 }
638}
639
640#endif /* #if !UCONFIG_NO_NORMALIZATION */