Bug Summary

File:out/../deps/icu-small/source/common/bmpset.cpp
Warning:line 559, column 58
Although the value stored to 'b' is used in the enclosing expression, the value is never actually read from 'b'

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-unknown-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name bmpset.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -pic-is-pie -mframe-pointer=all -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/home/maurizio/node-v18.6.0/out -resource-dir /usr/local/lib/clang/16.0.0 -D V8_DEPRECATION_WARNINGS -D V8_IMMINENT_DEPRECATION_WARNINGS -D _GLIBCXX_USE_CXX11_ABI=1 -D NODE_OPENSSL_CONF_NAME=nodejs_conf -D NODE_OPENSSL_HAS_QUIC -D __STDC_FORMAT_MACROS -D OPENSSL_NO_PINSHARED -D OPENSSL_THREADS -D U_COMMON_IMPLEMENTATION=1 -D U_ATTRIBUTE_DEPRECATED= -D _CRT_SECURE_NO_DEPRECATE= -D U_STATIC_IMPLEMENTATION=1 -D UCONFIG_NO_SERVICE=1 -D U_ENABLE_DYLOAD=0 -D U_HAVE_STD_STRING=1 -D UCONFIG_NO_BREAK_ITERATION=0 -I ../deps/icu-small/source/common -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../include/c++/8 -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../include/c++/8/x86_64-redhat-linux -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../include/c++/8/backward -internal-isystem /usr/local/lib/clang/16.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../x86_64-redhat-linux/include -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -Wno-unused-parameter -Wno-deprecated-declarations -Wno-strict-aliasing -std=gnu++17 -fdeprecated-macro -fdebug-compilation-dir=/home/maurizio/node-v18.6.0/out -ferror-limit 19 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2022-08-22-142216-507842-1 -x c++ ../deps/icu-small/source/common/bmpset.cpp
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4******************************************************************************
5*
6* Copyright (C) 2007-2012, International Business Machines
7* Corporation and others. All Rights Reserved.
8*
9******************************************************************************
10* file name: bmpset.cpp
11* encoding: UTF-8
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2007jan29
16* created by: Markus W. Scherer
17*/
18
19#include "unicode/utypes.h"
20#include "unicode/uniset.h"
21#include "unicode/utf8.h"
22#include "unicode/utf16.h"
23#include "cmemory.h"
24#include "bmpset.h"
25#include "uassert.h"
26
27U_NAMESPACE_BEGINnamespace icu_71 {
28
29BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
30 list(parentList), listLength(parentListLength) {
31 uprv_memset(latin1Contains, 0, sizeof(latin1Contains)):: memset(latin1Contains, 0, sizeof(latin1Contains));
32 uprv_memset(table7FF, 0, sizeof(table7FF)):: memset(table7FF, 0, sizeof(table7FF));
33 uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits)):: memset(bmpBlockBits, 0, sizeof(bmpBlockBits));
34
35 /*
36 * Set the list indexes for binary searches for
37 * U+0800, U+1000, U+2000, .., U+F000, U+10000.
38 * U+0800 is the first 3-byte-UTF-8 code point. Lower code points are
39 * looked up in the bit tables.
40 * The last pair of indexes is for finding supplementary code points.
41 */
42 list4kStarts[0]=findCodePoint(0x800, 0, listLength-1);
43 int32_t i;
44 for(i=1; i<=0x10; ++i) {
45 list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1);
46 }
47 list4kStarts[0x11]=listLength-1;
48 containsFFFD=containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10]);
49
50 initBits();
51 overrideIllegal();
52}
53
54BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
55 containsFFFD(otherBMPSet.containsFFFD),
56 list(newParentList), listLength(newParentListLength) {
57 uprv_memcpy(latin1Contains, otherBMPSet.latin1Contains, sizeof(latin1Contains))do { clang diagnostic push clang diagnostic ignored "-Waddress"
(void)0; (void)0; clang diagnostic pop :: memcpy(latin1Contains
, otherBMPSet.latin1Contains, sizeof(latin1Contains)); } while
(false)
;
58 uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF))do { clang diagnostic push clang diagnostic ignored "-Waddress"
(void)0; (void)0; clang diagnostic pop :: memcpy(table7FF,
otherBMPSet.table7FF, sizeof(table7FF)); } while (false)
;
59 uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits))do { clang diagnostic push clang diagnostic ignored "-Waddress"
(void)0; (void)0; clang diagnostic pop :: memcpy(bmpBlockBits
, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits)); } while (false
)
;
60 uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts))do { clang diagnostic push clang diagnostic ignored "-Waddress"
(void)0; (void)0; clang diagnostic pop :: memcpy(list4kStarts
, otherBMPSet.list4kStarts, sizeof(list4kStarts)); } while (false
)
;
61}
62
63BMPSet::~BMPSet() {
64}
65
66/*
67 * Set bits in a bit rectangle in "vertical" bit organization.
68 * start<limit<=0x800
69 */
70static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) {
71 U_ASSERT(start<limit)(void)0;
72 U_ASSERT(limit<=0x800)(void)0;
73
74 int32_t lead=start>>6; // Named for UTF-8 2-byte lead byte with upper 5 bits.
75 int32_t trail=start&0x3f; // Named for UTF-8 2-byte trail byte with lower 6 bits.
76
77 // Set one bit indicating an all-one block.
78 uint32_t bits=(uint32_t)1<<lead;
79 if((start+1)==limit) { // Single-character shortcut.
80 table[trail]|=bits;
81 return;
82 }
83
84 int32_t limitLead=limit>>6;
85 int32_t limitTrail=limit&0x3f;
86
87 if(lead==limitLead) {
88 // Partial vertical bit column.
89 while(trail<limitTrail) {
90 table[trail++]|=bits;
91 }
92 } else {
93 // Partial vertical bit column,
94 // followed by a bit rectangle,
95 // followed by another partial vertical bit column.
96 if(trail>0) {
97 do {
98 table[trail++]|=bits;
99 } while(trail<64);
100 ++lead;
101 }
102 if(lead<limitLead) {
103 bits=~(((unsigned)1<<lead)-1);
104 if(limitLead<0x20) {
105 bits&=((unsigned)1<<limitLead)-1;
106 }
107 for(trail=0; trail<64; ++trail) {
108 table[trail]|=bits;
109 }
110 }
111 // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
112 // In that case, bits=1<<limitLead is undefined but the bits value
113 // is not used because trail<limitTrail is already false.
114 bits=(uint32_t)1<<((limitLead == 0x20) ? (limitLead - 1) : limitLead);
115 for(trail=0; trail<limitTrail; ++trail) {
116 table[trail]|=bits;
117 }
118 }
119}
120
121void BMPSet::initBits() {
122 UChar32 start, limit;
123 int32_t listIndex=0;
124
125 // Set latin1Contains[].
126 do {
127 start=list[listIndex++];
128 if(listIndex<listLength) {
129 limit=list[listIndex++];
130 } else {
131 limit=0x110000;
132 }
133 if(start>=0x100) {
134 break;
135 }
136 do {
137 latin1Contains[start++]=1;
138 } while(start<limit && start<0x100);
139 } while(limit<=0x100);
140
141 // Find the first range overlapping with (or after) 80..FF again,
142 // to include them in table7FF as well.
143 for(listIndex=0;;) {
144 start=list[listIndex++];
145 if(listIndex<listLength) {
146 limit=list[listIndex++];
147 } else {
148 limit=0x110000;
149 }
150 if(limit>0x80) {
151 if(start<0x80) {
152 start=0x80;
153 }
154 break;
155 }
156 }
157
158 // Set table7FF[].
159 while(start<0x800) {
160 set32x64Bits(table7FF, start, limit<=0x800 ? limit : 0x800);
161 if(limit>0x800) {
162 start=0x800;
163 break;
164 }
165
166 start=list[listIndex++];
167 if(listIndex<listLength) {
168 limit=list[listIndex++];
169 } else {
170 limit=0x110000;
171 }
172 }
173
174 // Set bmpBlockBits[].
175 int32_t minStart=0x800;
176 while(start<0x10000) {
177 if(limit>0x10000) {
178 limit=0x10000;
179 }
180
181 if(start<minStart) {
182 start=minStart;
183 }
184 if(start<limit) { // Else: Another range entirely in a known mixed-value block.
185 if(start&0x3f) {
186 // Mixed-value block of 64 code points.
187 start>>=6;
188 bmpBlockBits[start&0x3f]|=0x10001<<(start>>6);
189 start=(start+1)<<6; // Round up to the next block boundary.
190 minStart=start; // Ignore further ranges in this block.
191 }
192 if(start<limit) {
193 if(start<(limit&~0x3f)) {
194 // Multiple all-ones blocks of 64 code points each.
195 set32x64Bits(bmpBlockBits, start>>6, limit>>6);
196 }
197
198 if(limit&0x3f) {
199 // Mixed-value block of 64 code points.
200 limit>>=6;
201 bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6);
202 limit=(limit+1)<<6; // Round up to the next block boundary.
203 minStart=limit; // Ignore further ranges in this block.
204 }
205 }
206 }
207
208 if(limit==0x10000) {
209 break;
210 }
211
212 start=list[listIndex++];
213 if(listIndex<listLength) {
214 limit=list[listIndex++];
215 } else {
216 limit=0x110000;
217 }
218 }
219}
220
221/*
222 * Override some bits and bytes to the result of contains(FFFD)
223 * for faster validity checking at runtime.
224 * No need to set 0 values where they were reset to 0 in the constructor
225 * and not modified by initBits().
226 * (table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
227 * Need to set 0 values for surrogates D800..DFFF.
228 */
229void BMPSet::overrideIllegal() {
230 uint32_t bits, mask;
231 int32_t i;
232
233 if(containsFFFD) {
234 bits=3; // Lead bytes 0xC0 and 0xC1.
235 for(i=0; i<64; ++i) {
236 table7FF[i]|=bits;
237 }
238
239 bits=1; // Lead byte 0xE0.
240 for(i=0; i<32; ++i) { // First half of 4k block.
241 bmpBlockBits[i]|=bits;
242 }
243
244 mask= static_cast<uint32_t>(~(0x10001<<0xd)); // Lead byte 0xED.
245 bits=1<<0xd;
246 for(i=32; i<64; ++i) { // Second half of 4k block.
247 bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
248 }
249 } else {
250 mask= static_cast<uint32_t>(~(0x10001<<0xd)); // Lead byte 0xED.
251 for(i=32; i<64; ++i) { // Second half of 4k block.
252 bmpBlockBits[i]&=mask;
253 }
254 }
255}
256
257int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
258 /* Examples:
259 findCodePoint(c)
260 set list[] c=0 1 3 4 7 8
261 === ============== ===========
262 [] [110000] 0 0 0 0 0 0
263 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
264 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
265 [:Any:] [0, 110000] 1 1 1 1 1 1
266 */
267
268 // Return the smallest i such that c < list[i]. Assume
269 // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
270 if (c < list[lo])
271 return lo;
272 // High runner test. c is often after the last range, so an
273 // initial check for this condition pays off.
274 if (lo >= hi || c >= list[hi-1])
275 return hi;
276 // invariant: c >= list[lo]
277 // invariant: c < list[hi]
278 for (;;) {
279 int32_t i = (lo + hi) >> 1;
280 if (i == lo) {
281 break; // Found!
282 } else if (c < list[i]) {
283 hi = i;
284 } else {
285 lo = i;
286 }
287 }
288 return hi;
289}
290
291UBool
292BMPSet::contains(UChar32 c) const {
293 if((uint32_t)c<=0xff) {
294 return (UBool)latin1Contains[c];
295 } else if((uint32_t)c<=0x7ff) {
296 return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0);
297 } else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) {
298 int lead=c>>12;
299 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
300 if(twoBits<=1) {
301 // All 64 code points with the same bits 15..6
302 // are either in the set or not.
303 return (UBool)twoBits;
304 } else {
305 // Look up the code point in its 4k block of code points.
306 return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]);
307 }
308 } else if((uint32_t)c<=0x10ffff) {
309 // surrogate or supplementary code point
310 return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
311 } else {
312 // Out-of-range code points get FALSE, consistent with long-standing
313 // behavior of UnicodeSet::contains(c).
314 return FALSE0;
315 }
316}
317
318/*
319 * Check for sufficient length for trail unit for each surrogate pair.
320 * Handle single surrogates as surrogate code points as usual in ICU.
321 */
322const UChar *
323BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
324 UChar c, c2;
325
326 if(spanCondition) {
327 // span
328 do {
329 c=*s;
330 if(c<=0xff) {
331 if(!latin1Contains[c]) {
332 break;
333 }
334 } else if(c<=0x7ff) {
335 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
336 break;
337 }
338 } else if(c<0xd800 || c>=0xe000) {
339 int lead=c>>12;
340 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
341 if(twoBits<=1) {
342 // All 64 code points with the same bits 15..6
343 // are either in the set or not.
344 if(twoBits==0) {
345 break;
346 }
347 } else {
348 // Look up the code point in its 4k block of code points.
349 if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
350 break;
351 }
352 }
353 } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
354 // surrogate code point
355 if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
356 break;
357 }
358 } else {
359 // surrogate pair
360 if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2)(((UChar32)(c)<<10UL)+(UChar32)(c2)-((0xd800<<10UL
)+0xdc00-0x10000))
, list4kStarts[0x10], list4kStarts[0x11])) {
361 break;
362 }
363 ++s;
364 }
365 } while(++s<limit);
366 } else {
367 // span not
368 do {
369 c=*s;
370 if(c<=0xff) {
371 if(latin1Contains[c]) {
372 break;
373 }
374 } else if(c<=0x7ff) {
375 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
376 break;
377 }
378 } else if(c<0xd800 || c>=0xe000) {
379 int lead=c>>12;
380 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
381 if(twoBits<=1) {
382 // All 64 code points with the same bits 15..6
383 // are either in the set or not.
384 if(twoBits!=0) {
385 break;
386 }
387 } else {
388 // Look up the code point in its 4k block of code points.
389 if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
390 break;
391 }
392 }
393 } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
394 // surrogate code point
395 if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
396 break;
397 }
398 } else {
399 // surrogate pair
400 if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2)(((UChar32)(c)<<10UL)+(UChar32)(c2)-((0xd800<<10UL
)+0xdc00-0x10000))
, list4kStarts[0x10], list4kStarts[0x11])) {
401 break;
402 }
403 ++s;
404 }
405 } while(++s<limit);
406 }
407 return s;
408}
409
410/* Symmetrical with span(). */
411const UChar *
412BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
413 UChar c, c2;
414
415 if(spanCondition) {
416 // span
417 for(;;) {
418 c=*(--limit);
419 if(c<=0xff) {
420 if(!latin1Contains[c]) {
421 break;
422 }
423 } else if(c<=0x7ff) {
424 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
425 break;
426 }
427 } else if(c<0xd800 || c>=0xe000) {
428 int lead=c>>12;
429 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
430 if(twoBits<=1) {
431 // All 64 code points with the same bits 15..6
432 // are either in the set or not.
433 if(twoBits==0) {
434 break;
435 }
436 } else {
437 // Look up the code point in its 4k block of code points.
438 if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
439 break;
440 }
441 }
442 } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
443 // surrogate code point
444 if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
445 break;
446 }
447 } else {
448 // surrogate pair
449 if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c)(((UChar32)(c2)<<10UL)+(UChar32)(c)-((0xd800<<10UL
)+0xdc00-0x10000))
, list4kStarts[0x10], list4kStarts[0x11])) {
450 break;
451 }
452 --limit;
453 }
454 if(s==limit) {
455 return s;
456 }
457 }
458 } else {
459 // span not
460 for(;;) {
461 c=*(--limit);
462 if(c<=0xff) {
463 if(latin1Contains[c]) {
464 break;
465 }
466 } else if(c<=0x7ff) {
467 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
468 break;
469 }
470 } else if(c<0xd800 || c>=0xe000) {
471 int lead=c>>12;
472 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
473 if(twoBits<=1) {
474 // All 64 code points with the same bits 15..6
475 // are either in the set or not.
476 if(twoBits!=0) {
477 break;
478 }
479 } else {
480 // Look up the code point in its 4k block of code points.
481 if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
482 break;
483 }
484 }
485 } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
486 // surrogate code point
487 if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
488 break;
489 }
490 } else {
491 // surrogate pair
492 if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c)(((UChar32)(c2)<<10UL)+(UChar32)(c)-((0xd800<<10UL
)+0xdc00-0x10000))
, list4kStarts[0x10], list4kStarts[0x11])) {
493 break;
494 }
495 --limit;
496 }
497 if(s==limit) {
498 return s;
499 }
500 }
501 }
502 return limit+1;
503}
504
505/*
506 * Precheck for sufficient trail bytes at end of string only once per span.
507 * Check validity.
508 */
509const uint8_t *
510BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
511 const uint8_t *limit=s+length;
512 uint8_t b=*s;
513 if(U8_IS_SINGLE(b)(((b)&0x80)==0)) {
514 // Initial all-ASCII span.
515 if(spanCondition) {
516 do {
517 if(!latin1Contains[b] || ++s==limit) {
518 return s;
519 }
520 b=*s;
521 } while(U8_IS_SINGLE(b)(((b)&0x80)==0));
522 } else {
523 do {
524 if(latin1Contains[b] || ++s==limit) {
525 return s;
526 }
527 b=*s;
528 } while(U8_IS_SINGLE(b)(((b)&0x80)==0));
529 }
530 length=(int32_t)(limit-s);
531 }
532
533 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
534 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
535 }
536
537 const uint8_t *limit0=limit;
538
539 /*
540 * Make sure that the last 1/2/3/4-byte sequence before limit is complete
541 * or runs into a lead byte.
542 * In the span loop compare s with limit only once
543 * per multi-byte character.
544 *
545 * Give a trailing illegal sequence the same value as the result of contains(FFFD),
546 * including it if that is part of the span, otherwise set limit0 to before
547 * the truncated sequence.
548 */
549 b=*(limit-1);
550 if((int8_t)b<0) {
551 // b>=0x80: lead or trail byte
552 if(b<0xc0) {
553 // single trail byte, check for preceding 3- or 4-byte lead byte
554 if(length>=2 && (b=*(limit-2))>=0xe0) {
555 limit-=2;
556 if(containsFFFD!=spanCondition) {
557 limit0=limit;
558 }
559 } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
Although the value stored to 'b' is used in the enclosing expression, the value is never actually read from 'b'
560 // 4-byte lead byte with only two trail bytes
561 limit-=3;
562 if(containsFFFD!=spanCondition) {
563 limit0=limit;
564 }
565 }
566 } else {
567 // lead byte with no trail bytes
568 --limit;
569 if(containsFFFD!=spanCondition) {
570 limit0=limit;
571 }
572 }
573 }
574
575 uint8_t t1, t2, t3;
576
577 while(s<limit) {
578 b=*s;
579 if(U8_IS_SINGLE(b)(((b)&0x80)==0)) {
580 // ASCII
581 if(spanCondition) {
582 do {
583 if(!latin1Contains[b]) {
584 return s;
585 } else if(++s==limit) {
586 return limit0;
587 }
588 b=*s;
589 } while(U8_IS_SINGLE(b)(((b)&0x80)==0));
590 } else {
591 do {
592 if(latin1Contains[b]) {
593 return s;
594 } else if(++s==limit) {
595 return limit0;
596 }
597 b=*s;
598 } while(U8_IS_SINGLE(b)(((b)&0x80)==0));
599 }
600 }
601 ++s; // Advance past the lead byte.
602 if(b>=0xe0) {
603 if(b<0xf0) {
604 if( /* handle U+0000..U+FFFF inline */
605 (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
606 (t2=(uint8_t)(s[1]-0x80)) <= 0x3f
607 ) {
608 b&=0xf;
609 uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001;
610 if(twoBits<=1) {
611 // All 64 code points with this lead byte and middle trail byte
612 // are either in the set or not.
613 if(twoBits!=(uint32_t)spanCondition) {
614 return s-1;
615 }
616 } else {
617 // Look up the code point in its 4k block of code points.
618 UChar32 c=(b<<12)|(t1<<6)|t2;
619 if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) {
620 return s-1;
621 }
622 }
623 s+=2;
624 continue;
625 }
626 } else if( /* handle U+10000..U+10FFFF inline */
627 (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
628 (t2=(uint8_t)(s[1]-0x80)) <= 0x3f &&
629 (t3=(uint8_t)(s[2]-0x80)) <= 0x3f
630 ) {
631 // Give an illegal sequence the same value as the result of contains(FFFD).
632 UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
633 if( ( (0x10000<=c && c<=0x10ffff) ?
634 containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
635 containsFFFD
636 ) != spanCondition
637 ) {
638 return s-1;
639 }
640 s+=3;
641 continue;
642 }
643 } else {
644 if( /* handle U+0000..U+07FF inline */
645 b>=0xc0 &&
646 (t1=(uint8_t)(*s-0x80)) <= 0x3f
647 ) {
648 if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
649 return s-1;
650 }
651 ++s;
652 continue;
653 }
654 }
655
656 // Give an illegal sequence the same value as the result of contains(FFFD).
657 // Handle each byte of an illegal sequence separately to simplify the code;
658 // no need to optimize error handling.
659 if(containsFFFD!=spanCondition) {
660 return s-1;
661 }
662 }
663
664 return limit0;
665}
666
667/*
668 * While going backwards through UTF-8 optimize only for ASCII.
669 * Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not
670 * possible to tell from the last byte in a multi-byte sequence how many
671 * preceding bytes there should be. Therefore, going backwards through UTF-8
672 * is much harder than going forward.
673 */
674int32_t
675BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
676 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
677 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
678 }
679
680 uint8_t b;
681
682 do {
683 b=s[--length];
684 if(U8_IS_SINGLE(b)(((b)&0x80)==0)) {
685 // ASCII sub-span
686 if(spanCondition) {
687 do {
688 if(!latin1Contains[b]) {
689 return length+1;
690 } else if(length==0) {
691 return 0;
692 }
693 b=s[--length];
694 } while(U8_IS_SINGLE(b)(((b)&0x80)==0));
695 } else {
696 do {
697 if(latin1Contains[b]) {
698 return length+1;
699 } else if(length==0) {
700 return 0;
701 }
702 b=s[--length];
703 } while(U8_IS_SINGLE(b)(((b)&0x80)==0));
704 }
705 }
706
707 int32_t prev=length;
708 UChar32 c;
709 // trail byte: collect a multi-byte character
710 // (or lead byte in last-trail position)
711 c=utf8_prevCharSafeBodyutf8_prevCharSafeBody_71(s, 0, &length, b, -3);
712 // c is a valid code point, not ASCII, not a surrogate
713 if(c<=0x7ff) {
714 if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) {
715 return prev+1;
716 }
717 } else if(c<=0xffff) {
718 int lead=c>>12;
719 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
720 if(twoBits<=1) {
721 // All 64 code points with the same bits 15..6
722 // are either in the set or not.
723 if(twoBits!=(uint32_t)spanCondition) {
724 return prev+1;
725 }
726 } else {
727 // Look up the code point in its 4k block of code points.
728 if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) {
729 return prev+1;
730 }
731 }
732 } else {
733 if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) {
734 return prev+1;
735 }
736 }
737 } while(length>0);
738 return 0;
739}
740
741U_NAMESPACE_END}