../deps/icu-small/source/common/unisetspan.cpp

Bug Summary

File:	out/../deps/icu-small/source/common/unisetspan.cpp
Warning:	line 154, column 16 Although the value stored to 'result' is used in the enclosing expression, the value is never actually read from 'result'

Annotated Source Code

Press '?' to see keyboard shortcuts

Show analyzer invocation

clang -cc1 -cc1 -triple x86_64-unknown-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name unisetspan.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -pic-is-pie -mframe-pointer=all -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/home/maurizio/node-v18.6.0/out -resource-dir /usr/local/lib/clang/16.0.0 -D V8_DEPRECATION_WARNINGS -D V8_IMMINENT_DEPRECATION_WARNINGS -D _GLIBCXX_USE_CXX11_ABI=1 -D NODE_OPENSSL_CONF_NAME=nodejs_conf -D NODE_OPENSSL_HAS_QUIC -D __STDC_FORMAT_MACROS -D OPENSSL_NO_PINSHARED -D OPENSSL_THREADS -D U_COMMON_IMPLEMENTATION=1 -D U_ATTRIBUTE_DEPRECATED= -D _CRT_SECURE_NO_DEPRECATE= -D U_STATIC_IMPLEMENTATION=1 -D UCONFIG_NO_SERVICE=1 -D U_ENABLE_DYLOAD=0 -D U_HAVE_STD_STRING=1 -D UCONFIG_NO_BREAK_ITERATION=0 -I ../deps/icu-small/source/common -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../include/c++/8 -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../include/c++/8/x86_64-redhat-linux -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../include/c++/8/backward -internal-isystem /usr/local/lib/clang/16.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../x86_64-redhat-linux/include -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -Wno-unused-parameter -Wno-deprecated-declarations -Wno-strict-aliasing -std=gnu++17 -fdeprecated-macro -fdebug-compilation-dir=/home/maurizio/node-v18.6.0/out -ferror-limit 19 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2022-08-22-142216-507842-1 -x c++ ../deps/icu-small/source/common/unisetspan.cpp

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	******************************************************************************
5	*
6	* Copyright (C) 2007-2012, International Business Machines
7	* Corporation and others. All Rights Reserved.
8	*
9	******************************************************************************
10	* file name: unisetspan.cpp
11	* encoding: UTF-8
12	* tab size: 8 (not used)
13	* indentation:4
14	*
15	* created on: 2007mar01
16	* created by: Markus W. Scherer
17	*/
18
19	#include "unicode/utypes.h"
20	#include "unicode/uniset.h"
21	#include "unicode/ustring.h"
22	#include "unicode/utf8.h"
23	#include "unicode/utf16.h"
24	#include "cmemory.h"
25	#include "uvector.h"
26	#include "unisetspan.h"
27
28	U_NAMESPACE_BEGINnamespace icu_71 {
29
30	/*
31	* List of offsets from the current position from where to try matching
32	* a code point or a string.
33	* Store offsets rather than indexes to simplify the code and use the same list
34	* for both increments (in span()) and decrements (in spanBack()).
35	*
36	* Assumption: The maximum offset is limited, and the offsets that are stored
37	* at any one time are relatively dense, that is, there are normally no gaps of
38	* hundreds or thousands of offset values.
39	*
40	* The implementation uses a circular buffer of byte flags,
41	* each indicating whether the corresponding offset is in the list.
42	* This avoids inserting into a sorted list of offsets (or absolute indexes) and
43	* physically moving part of the list.
44	*
45	* Note: In principle, the caller should setMaxLength() to the maximum of the
46	* max string length and U16_LENGTH/U8_LENGTH to account for
47	* "long" single code points.
48	* However, this implementation uses at least a staticList with more than
49	* U8_LENGTH entries anyway.
50	*
51	* Note: If maxLength were guaranteed to be no more than 32 or 64,
52	* the list could be stored as bit flags in a single integer.
53	* Rather than handling a circular buffer with a start list index,
54	* the integer would simply be shifted when lower offsets are removed.
55	* UnicodeSet does not have a limit on the lengths of strings.
56	*/
57	class OffsetList { // Only ever stack-allocated, does not need to inherit UMemory.
58	public:
59	OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
60
61	~OffsetList() {
62	if(list!=staticList) {
63	uprv_freeuprv_free_71(list);
64	}
65	}
66
67	// Call exactly once if the list is to be used.
68	void setMaxLength(int32_t maxLength) {
69	if(maxLength<=(int32_t)sizeof(staticList)) {
70	capacity=(int32_t)sizeof(staticList);
71	} else {
72	UBool l=(UBool )uprv_mallocuprv_malloc_71(maxLength);
73	if(l!=NULL__null) {
74	list=l;
75	capacity=maxLength;
76	}
77	}
78	uprv_memset(list, 0, capacity):: memset(list, 0, capacity);
79	}
80
81	void clear() {
82	uprv_memset(list, 0, capacity):: memset(list, 0, capacity);
83	start=length=0;
84	}
85
86	UBool isEmpty() const {
87	return (UBool)(length==0);
88	}
89
90	// Reduce all stored offsets by delta, used when the current position
91	// moves by delta.
92	// There must not be any offsets lower than delta.
93	// If there is an offset equal to delta, it is removed.
94	// delta=[1..maxLength]
95	void shift(int32_t delta) {
96	int32_t i=start+delta;
97	if(i>=capacity) {
98	i-=capacity;
99	}
100	if(list[i]) {
101	list[i]=FALSE0;
102	--length;
103	}
104	start=i;
105	}
106
107	// Add an offset. The list must not contain it yet.
108	// offset=[1..maxLength]
109	void addOffset(int32_t offset) {
110	int32_t i=start+offset;
111	if(i>=capacity) {
112	i-=capacity;
113	}
114	list[i]=TRUE1;
115	++length;
116	}
117
118	// offset=[1..maxLength]
119	UBool containsOffset(int32_t offset) const {
120	int32_t i=start+offset;
121	if(i>=capacity) {
122	i-=capacity;
123	}
124	return list[i];
125	}
126
127	// Find the lowest stored offset from a non-empty list, remove it,
128	// and reduce all other offsets by this minimum.
129	// Returns [1..maxLength].
130	int32_t popMinimum() {
131	// Look for the next offset in list[start+1..capacity-1].
132	int32_t i=start, result;
133	while(++i<capacity) {
134	if(list[i]) {
135	list[i]=FALSE0;
136	--length;
137	result=i-start;
138	start=i;
139	return result;
140	}
141	}
142	// i==capacity
143
144	// Wrap around and look for the next offset in list[0..start].
145	// Since the list is not empty, there will be one.
146	result=capacity-start;
147	i=0;
148	while(!list[i]) {
149	++i;
150	}
151	list[i]=FALSE0;
152	--length;
153	start=i;
154	return result+=i;
	Although the value stored to 'result' is used in the enclosing expression, the value is never actually read from 'result'
155	}
156
157	private:
158	UBool *list;
159	int32_t capacity;
160	int32_t length;
161	int32_t start;
162
163	UBool staticList[16];
164	};
165
166	// Get the number of UTF-8 bytes for a UTF-16 (sub)string.
167	static int32_t
168	getUTF8Length(const UChar *s, int32_t length) {
169	UErrorCode errorCode=U_ZERO_ERROR;
170	int32_t length8=0;
171	u_strToUTF8u_strToUTF8_71(NULL__null, 0, &length8, s, length, &errorCode);
172	if(U_SUCCESS(errorCode) \|\| errorCode==U_BUFFER_OVERFLOW_ERROR) {
173	return length8;
174	} else {
175	// The string contains an unpaired surrogate.
176	// Ignore this string.
177	return 0;
178	}
179	}
180
181	// Append the UTF-8 version of the string to t and return the appended UTF-8 length.
182	static int32_t
183	appendUTF8(const UChar s, int32_t length, uint8_t t, int32_t capacity) {
184	UErrorCode errorCode=U_ZERO_ERROR;
185	int32_t length8=0;
186	u_strToUTF8u_strToUTF8_71((char *)t, capacity, &length8, s, length, &errorCode);
187	if(U_SUCCESS(errorCode)) {
188	return length8;
189	} else {
190	// The string contains an unpaired surrogate.
191	// Ignore this string.
192	return 0;
193	}
194	}
195
196	static inline uint8_t
197	makeSpanLengthByte(int32_t spanLength) {
198	// 0xfe==UnicodeSetStringSpan::LONG_SPAN
199	return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;
200	}
201
202	// Construct for all variants of span(), or only for any one variant.
203	// Initialize as little as possible, for single use.
204	UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
205	const UVector &setStrings,
206	uint32_t which)
207	: spanSet(0, 0x10ffff), pSpanNotSet(NULL__null), strings(setStrings),
208	utf8Lengths(NULL__null), spanLengths(NULL__null), utf8(NULL__null),
209	utf8Length(0),
210	maxLength16(0), maxLength8(0),
211	all((UBool)(which==ALL)) {
212	spanSet.retainAll(set);
213	if(which&NOT_CONTAINED) {
214	// Default to the same sets.
215	// addToSpanNotSet() will create a separate set if necessary.
216	pSpanNotSet=&spanSet;
217	}
218
219	// Determine if the strings even need to be taken into account at all for span() etc.
220	// If any string is relevant, then all strings need to be used for
221	// span(longest match) but only the relevant ones for span(while contained).
222	// TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
223	// and do not store UTF-8 strings if !thisRelevant and CONTAINED.
224	// (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
225	// Also count the lengths of the UTF-8 versions of the strings for memory allocation.
226	int32_t stringsLength=strings.size();
227
228	int32_t i, spanLength;
229	UBool someRelevant=FALSE0;
230	for(i=0; i<stringsLength; ++i) {
231	const UnicodeString &string=(const UnicodeString )strings.elementAt(i);
232	const UChar *s16=string.getBuffer();
233	int32_t length16=string.length();
234	if (length16==0) {
235	continue; // skip the empty string
236	}
237	UBool thisRelevant;
238	spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
239	if(spanLength<length16) { // Relevant string.
240	someRelevant=thisRelevant=TRUE1;
241	} else {
242	thisRelevant=FALSE0;
243	}
244	if((which&UTF16) && length16>maxLength16) {
245	maxLength16=length16;
246	}
247	if((which&UTF8) && (thisRelevant \|\| (which&CONTAINED))) {
248	int32_t length8=getUTF8Length(s16, length16);
249	utf8Length+=length8;
250	if(length8>maxLength8) {
251	maxLength8=length8;
252	}
253	}
254	}
255	if(!someRelevant) {
256	maxLength16=maxLength8=0;
257	return;
258	}
259
260	// Freeze after checking for the need to use strings at all because freezing
261	// a set takes some time and memory which are wasted if there are no relevant strings.
262	if(all) {
263	spanSet.freeze();
264	}
265
266	uint8_t *spanBackLengths;
267	uint8_t *spanUTF8Lengths;
268	uint8_t *spanBackUTF8Lengths;
269
270	// Allocate a block of meta data.
271	int32_t allocSize;
272	if(all) {
273	// UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
274	allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
275	} else {
276	allocSize=stringsLength; // One set of span lengths.
277	if(which&UTF8) {
278	// UTF-8 lengths and UTF-8 strings.
279	allocSize+=stringsLength*4+utf8Length;
280	}
281	}
282	if(allocSize<=(int32_t)sizeof(staticLengths)) {
283	utf8Lengths=staticLengths;
284	} else {
285	utf8Lengths=(int32_t *)uprv_mallocuprv_malloc_71(allocSize);
286	if(utf8Lengths==NULL__null) {
287	maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
288	return; // Out of memory.
289	}
290	}
291
292	if(all) {
293	// Store span lengths for all span() variants.
294	spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
295	spanBackLengths=spanLengths+stringsLength;
296	spanUTF8Lengths=spanBackLengths+stringsLength;
297	spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
298	utf8=spanBackUTF8Lengths+stringsLength;
299	} else {
300	// Store span lengths for only one span() variant.
301	if(which&UTF8) {
302	spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
303	utf8=spanLengths+stringsLength;
304	} else {
305	spanLengths=(uint8_t *)utf8Lengths;
306	}
307	spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
308	}
309
310	// Set the meta data and pSpanNotSet and write the UTF-8 strings.
311	int32_t utf8Count=0; // Count UTF-8 bytes written so far.
312
313	for(i=0; i<stringsLength; ++i) {
314	const UnicodeString &string=(const UnicodeString )strings.elementAt(i);
315	const UChar *s16=string.getBuffer();
316	int32_t length16=string.length();
317	spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
318	if(spanLength<length16 && length16>0) { // Relevant string.
319	if(which&UTF16) {
320	if(which&CONTAINED) {
321	if(which&FWD) {
322	spanLengths[i]=makeSpanLengthByte(spanLength);
323	}
324	if(which&BACK) {
325	spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
326	spanBackLengths[i]=makeSpanLengthByte(spanLength);
327	}
328	} else /* not CONTAINED, not all, but NOT_CONTAINED */ {
329	spanLengths[i]=spanBackLengths[i]=0; // Only store a relevant/irrelevant flag.
330	}
331	}
332	if(which&UTF8) {
333	uint8_t *s8=utf8+utf8Count;
334	int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
335	utf8Count+=utf8Lengths[i]=length8;
336	if(length8==0) { // Irrelevant for UTF-8 because not representable in UTF-8.
337	spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;
338	} else { // Relevant for UTF-8.
339	if(which&CONTAINED) {
340	if(which&FWD) {
341	spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
342	spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
343	}
344	if(which&BACK) {
345	spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
346	spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
347	}
348	} else /* not CONTAINED, not all, but NOT_CONTAINED */ {
349	spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0; // Only store a relevant/irrelevant flag.
350	}
351	}
352	}
353	if(which&NOT_CONTAINED) {
354	// Add string start and end code points to the spanNotSet so that
355	// a span(while not contained) stops before any string.
356	UChar32 c;
357	if(which&FWD) {
358	int32_t len=0;
359	U16_NEXT(s16, len, length16, c)do { (c)=(s16)[(len)++]; if((((c)&0xfffffc00)==0xd800)) { uint16_t __c2; if((len)!=(length16) && (((__c2=(s16) [(len)])&0xfffffc00)==0xdc00)) { ++(len); (c)=(((UChar32) ((c))<<10UL)+(UChar32)(__c2)-((0xd800<<10UL)+0xdc00 -0x10000)); } } } while (false);
360	addToSpanNotSet(c);
361	}
362	if(which&BACK) {
363	int32_t len=length16;
364	U16_PREV(s16, 0, len, c)do { (c)=(s16)[--(len)]; if((((c)&0xfffffc00)==0xdc00)) { uint16_t __c2; if((len)>(0) && (((__c2=(s16)[(len )-1])&0xfffffc00)==0xd800)) { --(len); (c)=(((UChar32)(__c2 )<<10UL)+(UChar32)((c))-((0xd800<<10UL)+0xdc00-0x10000 )); } } } while (false);
365	addToSpanNotSet(c);
366	}
367	}
368	} else { // Irrelevant string. (Also the empty string.)
369	if(which&UTF8) {
370	if(which&CONTAINED) { // Only necessary for LONGEST_MATCH.
371	uint8_t *s8=utf8+utf8Count;
372	int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
373	utf8Count+=utf8Lengths[i]=length8;
374	} else {
375	utf8Lengths[i]=0;
376	}
377	}
378	if(all) {
379	spanLengths[i]=spanBackLengths[i]=
380	spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
381	(uint8_t)ALL_CP_CONTAINED;
382	} else {
383	// All spanXYZLengths pointers contain the same address.
384	spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;
385	}
386	}
387	}
388
389	// Finish.
390	if(all) {
391	pSpanNotSet->freeze();
392	}
393	}
394
395	// Copy constructor. Assumes which==ALL for a frozen set.
396	UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
397	const UVector &newParentSetStrings)
398	: spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL__null), strings(newParentSetStrings),
399	utf8Lengths(NULL__null), spanLengths(NULL__null), utf8(NULL__null),
400	utf8Length(otherStringSpan.utf8Length),
401	maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
402	all(TRUE1) {
403	if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
404	pSpanNotSet=&spanSet;
405	} else {
406	pSpanNotSet=otherStringSpan.pSpanNotSet->clone();
407	}
408
409	// Allocate a block of meta data.
410	// UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
411	int32_t stringsLength=strings.size();
412	int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
413	if(allocSize<=(int32_t)sizeof(staticLengths)) {
414	utf8Lengths=staticLengths;
415	} else {
416	utf8Lengths=(int32_t *)uprv_mallocuprv_malloc_71(allocSize);
417	if(utf8Lengths==NULL__null) {
418	maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
419	return; // Out of memory.
420	}
421	}
422
423	spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
424	utf8=spanLengths+stringsLength*4;
425	uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize)do { clang diagnostic push clang diagnostic ignored "-Waddress" (void)0; (void)0; clang diagnostic pop :: memcpy(utf8Lengths , otherStringSpan.utf8Lengths, allocSize); } while (false);
426	}
427
428	UnicodeSetStringSpan::~UnicodeSetStringSpan() {
429	if(pSpanNotSet!=NULL__null && pSpanNotSet!=&spanSet) {
430	delete pSpanNotSet;
431	}
432	if(utf8Lengths!=NULL__null && utf8Lengths!=staticLengths) {
433	uprv_freeuprv_free_71(utf8Lengths);
434	}
435	}
436
437	void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
438	if(pSpanNotSet==NULL__null \|\| pSpanNotSet==&spanSet) {
439	if(spanSet.contains(c)) {
440	return; // Nothing to do.
441	}
442	UnicodeSet *newSet=spanSet.cloneAsThawed();
443	if(newSet==NULL__null) {
444	return; // Out of memory.
445	} else {
446	pSpanNotSet=newSet;
447	}
448	}
449	pSpanNotSet->add(c);
450	}
451
452	// Compare strings without any argument checks. Requires length>0.
453	static inline UBool
454	matches16(const UChar s, const UChar t, int32_t length) {
455	do {
456	if(s++!=t++) {
457	return FALSE0;
458	}
459	} while(--length>0);
460	return TRUE1;
461	}
462
463	static inline UBool
464	matches8(const uint8_t s, const uint8_t t, int32_t length) {
465	do {
466	if(s++!=t++) {
467	return FALSE0;
468	}
469	} while(--length>0);
470	return TRUE1;
471	}
472
473	// Compare 16-bit Unicode strings (which may be malformed UTF-16)
474	// at code point boundaries.
475	// That is, each edge of a match must not be in the middle of a surrogate pair.
476	static inline UBool
477	matches16CPB(const UChar s, int32_t start, int32_t limit, const UChar t, int32_t length) {
478	s+=start;
479	limit-=start;
480	return matches16(s, t, length) &&
481	!(0<start && U16_IS_LEAD(s[-1])(((s[-1])&0xfffffc00)==0xd800) && U16_IS_TRAIL(s[0])(((s[0])&0xfffffc00)==0xdc00)) &&
482	!(length<limit && U16_IS_LEAD(s[length-1])(((s[length-1])&0xfffffc00)==0xd800) && U16_IS_TRAIL(s[length])(((s[length])&0xfffffc00)==0xdc00));
483	}
484
485	// Does the set contain the next code point?
486	// If so, return its length; otherwise return its negative length.
487	static inline int32_t
488	spanOne(const UnicodeSet &set, const UChar *s, int32_t length) {
489	UChar c=*s, c2;
490	if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])(((c2=s[1])&0xfffffc00)==0xdc00)) {
491	return set.contains(U16_GET_SUPPLEMENTARY(c, c2)(((UChar32)(c)<<10UL)+(UChar32)(c2)-((0xd800<<10UL )+0xdc00-0x10000))) ? 2 : -2;
492	}
493	return set.contains(c) ? 1 : -1;
494	}
495
496	static inline int32_t
497	spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
498	UChar c=s[length-1], c2;
499	if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])(((c2=s[length-2])&0xfffffc00)==0xd800)) {
500	return set.contains(U16_GET_SUPPLEMENTARY(c2, c)(((UChar32)(c2)<<10UL)+(UChar32)(c)-((0xd800<<10UL )+0xdc00-0x10000))) ? 2 : -2;
501	}
502	return set.contains(c) ? 1 : -1;
503	}
504
505	static inline int32_t
506	spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
507	UChar32 c=*s;
508	if(U8_IS_SINGLE(c)(((c)&0x80)==0)) {
509	return set.contains(c) ? 1 : -1;
510	}
511	// Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
512	int32_t i=0;
513	U8_NEXT_OR_FFFD(s, i, length, c)do { (c)=(uint8_t)(s)[(i)++]; if(!(((c)&0x80)==0)) { uint8_t __t = 0; if((i)!=(length) && ((c)>=0xe0 ? ((c)< 0xf0 ? "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30" [(c)&=0xf]&(1<<((__t=(s)[i])>>5)) && (__t&=0x3f, 1) : ((c)-=0xf0)<=4 && "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00" [(__t=(s)[i])>>4]&(1<<(c)) && ((c)=(( c)<<6)\|(__t&0x3f), ++(i)!=(length)) && (__t =(s)[i]-0x80)<=0x3f) && ((c)=((c)<<6)\|__t, ++ (i)!=(length)) : (c)>=0xc2 && ((c)&=0x1f, 1)) && (__t=(s)[i]-0x80)<=0x3f && ((c)=((c)<<6)\|__t , ++(i), 1)) { } else { (c)=(0xfffd); } } } while (false);
514	return set.contains(c) ? i : -i;
515	}
516
517	static inline int32_t
518	spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
519	UChar32 c=s[length-1];
520	if(U8_IS_SINGLE(c)(((c)&0x80)==0)) {
521	return set.contains(c) ? 1 : -1;
522	}
523	int32_t i=length-1;
524	c=utf8_prevCharSafeBodyutf8_prevCharSafeBody_71(s, 0, &i, c, -3);
525	length-=i;
526	return set.contains(c) ? length : -length;
527	}
528
529	/*
530	* Note: In span() when spanLength==0 (after a string match, or at the beginning
531	* after an empty code point span) and in spanNot() and spanNotUTF8(),
532	* string matching could use a binary search
533	* because all string matches are done from the same start index.
534	*
535	* For UTF-8, this would require a comparison function that returns UTF-16 order.
536	*
537	* This optimization should not be necessary for normal UnicodeSets because
538	* most sets have no strings, and most sets with strings have
539	* very few very short strings.
540	* For cases with many strings, it might be better to use a different API
541	* and implementation with a DFA (state machine).
542	*/
543
544	/*
545	* Algorithm for span(USET_SPAN_CONTAINED)
546	*
547	* Theoretical algorithm:
548	* - Iterate through the string, and at each code point boundary:
549	* + If the code point there is in the set, then remember to continue after it.
550	* + If a set string matches at the current position, then remember to continue after it.
551	* + Either recursively span for each code point or string match,
552	* or recursively span for all but the shortest one and
553	* iteratively continue the span with the shortest local match.
554	* + Remember the longest recursive span (the farthest end point).
555	* + If there is no match at the current position, neither for the code point there
556	* nor for any set string, then stop and return the longest recursive span length.
557	*
558	* Optimized implementation:
559	*
560	* (We assume that most sets will have very few very short strings.
561	* A span using a string-less set is extremely fast.)
562	*
563	* Create and cache a spanSet which contains all of the single code points
564	* of the original set but none of its strings.
565	*
566	* - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
567	* - Loop:
568	* + Try to match each set string at the end of the spanLength.
569	* ~ Set strings that start with set-contained code points must be matched
570	* with a partial overlap because the recursive algorithm would have tried
571	* to match them at every position.
572	* ~ Set strings that entirely consist of set-contained code points
573	* are irrelevant for span(USET_SPAN_CONTAINED) because the
574	* recursive algorithm would continue after them anyway
575	* and find the longest recursive match from their end.
576	* ~ Rather than recursing, note each end point of a set string match.
577	* + If no set string matched after spanSet.span(), then return
578	* with where the spanSet.span() ended.
579	* + If at least one set string matched after spanSet.span(), then
580	* pop the shortest string match end point and continue
581	* the loop, trying to match all set strings from there.
582	* + If at least one more set string matched after a previous string match,
583	* then test if the code point after the previous string match is also
584	* contained in the set.
585	* Continue the loop with the shortest end point of either this code point
586	* or a matching set string.
587	* + If no more set string matched after a previous string match,
588	* then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
589	* Stop if spanLength==0, otherwise continue the loop.
590	*
591	* By noting each end point of a set string match,
592	* the function visits each string position at most once and finishes
593	* in linear time.
594	*
595	* The recursive algorithm may visit the same string position many times
596	* if multiple paths lead to it and finishes in exponential time.
597	*/
598
599	/*
600	* Algorithm for span(USET_SPAN_SIMPLE)
601	*
602	* Theoretical algorithm:
603	* - Iterate through the string, and at each code point boundary:
604	* + If the code point there is in the set, then remember to continue after it.
605	* + If a set string matches at the current position, then remember to continue after it.
606	* + Continue from the farthest match position and ignore all others.
607	* + If there is no match at the current position,
608	* then stop and return the current position.
609	*
610	* Optimized implementation:
611	*
612	* (Same assumption and spanSet as above.)
613	*
614	* - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
615	* - Loop:
616	* + Try to match each set string at the end of the spanLength.
617	* ~ Set strings that start with set-contained code points must be matched
618	* with a partial overlap because the standard algorithm would have tried
619	* to match them earlier.
620	* ~ Set strings that entirely consist of set-contained code points
621	* must be matched with a full overlap because the longest-match algorithm
622	* would hide set string matches that end earlier.
623	* Such set strings need not be matched earlier inside the code point span
624	* because the standard algorithm would then have continued after
625	* the set string match anyway.
626	* ~ Remember the longest set string match (farthest end point) from the earliest
627	* starting point.
628	* + If no set string matched after spanSet.span(), then return
629	* with where the spanSet.span() ended.
630	* + If at least one set string matched, then continue the loop after the
631	* longest match from the earliest position.
632	* + If no more set string matched after a previous string match,
633	* then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
634	* Stop if spanLength==0, otherwise continue the loop.
635	*/
636
637	int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
638	if(spanCondition==USET_SPAN_NOT_CONTAINED) {
639	return spanNot(s, length);
640	}
641	int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
642	if(spanLength==length) {
643	return length;
644	}
645
646	// Consider strings; they may overlap with the span.
647	OffsetList offsets;
648	if(spanCondition==USET_SPAN_CONTAINED) {
649	// Use offset list to try all possibilities.
650	offsets.setMaxLength(maxLength16);
651	}
652	int32_t pos=spanLength, rest=length-pos;
653	int32_t i, stringsLength=strings.size();
654	for(;;) {
655	if(spanCondition==USET_SPAN_CONTAINED) {
656	for(i=0; i<stringsLength; ++i) {
657	int32_t overlap=spanLengths[i];
658	if(overlap==ALL_CP_CONTAINED) {
659	continue; // Irrelevant string. (Also the empty string.)
660	}
661	const UnicodeString &string=(const UnicodeString )strings.elementAt(i);
662	const UChar *s16=string.getBuffer();
663	int32_t length16=string.length();
664	U_ASSERT(length>0)(void)0;
665
666	// Try to match this string at pos-overlap..pos.
667	if(overlap>=LONG_SPAN) {
668	overlap=length16;
669	// While contained: No point matching fully inside the code point span.
670	U16_BACK_1(s16, 0, overlap)do { if(((((s16)[--(overlap)])&0xfffffc00)==0xdc00) && (overlap)>(0) && ((((s16)[(overlap)-1])&0xfffffc00 )==0xd800)) { --(overlap); } } while (false); // Length of the string minus the last code point.
671	}
672	if(overlap>spanLength) {
673	overlap=spanLength;
674	}
675	int32_t inc=length16-overlap; // Keep overlap+inc==length16.
676	for(;;) {
677	if(inc>rest) {
678	break;
679	}
680	// Try to match if the increment is not listed already.
681	if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
682	if(inc==rest) {
683	return length; // Reached the end of the string.
684	}
685	offsets.addOffset(inc);
686	}
687	if(overlap==0) {
688	break;
689	}
690	--overlap;
691	++inc;
692	}
693	}
694	} else /* USET_SPAN_SIMPLE */ {
695	int32_t maxInc=0, maxOverlap=0;
696	for(i=0; i<stringsLength; ++i) {
697	int32_t overlap=spanLengths[i];
698	// For longest match, we do need to try to match even an all-contained string
699	// to find the match from the earliest start.
700
701	const UnicodeString &string=(const UnicodeString )strings.elementAt(i);
702	const UChar *s16=string.getBuffer();
703	int32_t length16=string.length();
704	if (length16==0) {
705	continue; // skip the empty string
706	}
707
708	// Try to match this string at pos-overlap..pos.
709	if(overlap>=LONG_SPAN) {
710	overlap=length16;
711	// Longest match: Need to match fully inside the code point span
712	// to find the match from the earliest start.
713	}
714	if(overlap>spanLength) {
715	overlap=spanLength;
716	}
717	int32_t inc=length16-overlap; // Keep overlap+inc==length16.
718	for(;;) {
719	if(inc>rest \|\| overlap<maxOverlap) {
720	break;
721	}
722	// Try to match if the string is longer or starts earlier.
723	if( (overlap>maxOverlap \|\| /* redundant overlap==maxOverlap && */ inc>maxInc) &&
724	matches16CPB(s, pos-overlap, length, s16, length16)
725	) {
726	maxInc=inc; // Longest match from earliest start.
727	maxOverlap=overlap;
728	break;
729	}
730	--overlap;
731	++inc;
732	}
733	}
734
735	if(maxInc!=0 \|\| maxOverlap!=0) {
736	// Longest-match algorithm, and there was a string match.
737	// Simply continue after it.
738	pos+=maxInc;
739	rest-=maxInc;
740	if(rest==0) {
741	return length; // Reached the end of the string.
742	}
743	spanLength=0; // Match strings from after a string match.
744	continue;
745	}
746	}
747	// Finished trying to match all strings at pos.
748
749	if(spanLength!=0 \|\| pos==0) {
750	// The position is after an unlimited code point span (spanLength!=0),
751	// not after a string match.
752	// The only position where spanLength==0 after a span is pos==0.
753	// Otherwise, an unlimited code point span is only tried again when no
754	// strings match, and if such a non-initial span fails we stop.
755	if(offsets.isEmpty()) {
756	return pos; // No strings matched after a span.
757	}
758	// Match strings from after the next string match.
759	} else {
760	// The position is after a string match (or a single code point).
761	if(offsets.isEmpty()) {
762	// No more strings matched after a previous string match.
763	// Try another code point span from after the last string match.
764	spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
765	if( spanLength==rest \|\| // Reached the end of the string, or
766	spanLength==0 // neither strings nor span progressed.
767	) {
768	return pos+spanLength;
769	}
770	pos+=spanLength;
771	rest-=spanLength;
772	continue; // spanLength>0: Match strings from after a span.
773	} else {
774	// Try to match only one code point from after a string match if some
775	// string matched beyond it, so that we try all possible positions
776	// and don't overshoot.
777	spanLength=spanOne(spanSet, s+pos, rest);
778	if(spanLength>0) {
779	if(spanLength==rest) {
780	return length; // Reached the end of the string.
781	}
782	// Match strings after this code point.
783	// There cannot be any increments below it because UnicodeSet strings
784	// contain multiple code points.
785	pos+=spanLength;
786	rest-=spanLength;
787	offsets.shift(spanLength);
788	spanLength=0;
789	continue; // Match strings from after a single code point.
790	}
791	// Match strings from after the next string match.
792	}
793	}
794	int32_t minOffset=offsets.popMinimum();
795	pos+=minOffset;
796	rest-=minOffset;
797	spanLength=0; // Match strings from after a string match.
798	}
799	}
800
801	int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
802	if(spanCondition==USET_SPAN_NOT_CONTAINED) {
803	return spanNotBack(s, length);
804	}
805	int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
806	if(pos==0) {
807	return 0;
808	}
809	int32_t spanLength=length-pos;
810
811	// Consider strings; they may overlap with the span.
812	OffsetList offsets;
813	if(spanCondition==USET_SPAN_CONTAINED) {
814	// Use offset list to try all possibilities.
815	offsets.setMaxLength(maxLength16);
816	}
817	int32_t i, stringsLength=strings.size();
818	uint8_t *spanBackLengths=spanLengths;
819	if(all) {
820	spanBackLengths+=stringsLength;
821	}
822	for(;;) {
823	if(spanCondition==USET_SPAN_CONTAINED) {
824	for(i=0; i<stringsLength; ++i) {
825	int32_t overlap=spanBackLengths[i];
826	if(overlap==ALL_CP_CONTAINED) {
827	continue; // Irrelevant string. (Also the empty string.)
828	}
829	const UnicodeString &string=(const UnicodeString )strings.elementAt(i);
830	const UChar *s16=string.getBuffer();
831	int32_t length16=string.length();
832	U_ASSERT(length>0)(void)0;
833
834	// Try to match this string at pos-(length16-overlap)..pos-length16.
835	if(overlap>=LONG_SPAN) {
836	overlap=length16;
837	// While contained: No point matching fully inside the code point span.
838	int32_t len1=0;
839	U16_FWD_1(s16, len1, overlap)do { if(((((s16)[(len1)++])&0xfffffc00)==0xd800) && (len1)!=(overlap) && ((((s16)[len1])&0xfffffc00) ==0xdc00)) { ++(len1); } } while (false);
840	overlap-=len1; // Length of the string minus the first code point.
841	}
842	if(overlap>spanLength) {
843	overlap=spanLength;
844	}
845	int32_t dec=length16-overlap; // Keep dec+overlap==length16.
846	for(;;) {
847	if(dec>pos) {
848	break;
849	}
850	// Try to match if the decrement is not listed already.
851	if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
852	if(dec==pos) {
853	return 0; // Reached the start of the string.
854	}
855	offsets.addOffset(dec);
856	}
857	if(overlap==0) {
858	break;
859	}
860	--overlap;
861	++dec;
862	}
863	}
864	} else /* USET_SPAN_SIMPLE */ {
865	int32_t maxDec=0, maxOverlap=0;
866	for(i=0; i<stringsLength; ++i) {
867	int32_t overlap=spanBackLengths[i];
868	// For longest match, we do need to try to match even an all-contained string
869	// to find the match from the latest end.
870
871	const UnicodeString &string=(const UnicodeString )strings.elementAt(i);
872	const UChar *s16=string.getBuffer();
873	int32_t length16=string.length();
874	if (length16==0) {
875	continue; // skip the empty string
876	}
877
878	// Try to match this string at pos-(length16-overlap)..pos-length16.
879	if(overlap>=LONG_SPAN) {
880	overlap=length16;
881	// Longest match: Need to match fully inside the code point span
882	// to find the match from the latest end.
883	}
884	if(overlap>spanLength) {
885	overlap=spanLength;
886	}
887	int32_t dec=length16-overlap; // Keep dec+overlap==length16.
888	for(;;) {
889	if(dec>pos \|\| overlap<maxOverlap) {
890	break;
891	}
892	// Try to match if the string is longer or ends later.
893	if( (overlap>maxOverlap \|\| /* redundant overlap==maxOverlap && */ dec>maxDec) &&
894	matches16CPB(s, pos-dec, length, s16, length16)
895	) {
896	maxDec=dec; // Longest match from latest end.
897	maxOverlap=overlap;
898	break;
899	}
900	--overlap;
901	++dec;
902	}
903	}
904
905	if(maxDec!=0 \|\| maxOverlap!=0) {
906	// Longest-match algorithm, and there was a string match.
907	// Simply continue before it.
908	pos-=maxDec;
909	if(pos==0) {
910	return 0; // Reached the start of the string.
911	}
912	spanLength=0; // Match strings from before a string match.
913	continue;
914	}
915	}
916	// Finished trying to match all strings at pos.
917
918	if(spanLength!=0 \|\| pos==length) {
919	// The position is before an unlimited code point span (spanLength!=0),
920	// not before a string match.
921	// The only position where spanLength==0 before a span is pos==length.
922	// Otherwise, an unlimited code point span is only tried again when no
923	// strings match, and if such a non-initial span fails we stop.
924	if(offsets.isEmpty()) {
925	return pos; // No strings matched before a span.
926	}
927	// Match strings from before the next string match.
928	} else {
929	// The position is before a string match (or a single code point).
930	if(offsets.isEmpty()) {
931	// No more strings matched before a previous string match.
932	// Try another code point span from before the last string match.
933	int32_t oldPos=pos;
934	pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
935	spanLength=oldPos-pos;
936	if( pos==0 \|\| // Reached the start of the string, or
937	spanLength==0 // neither strings nor span progressed.
938	) {
939	return pos;
940	}
941	continue; // spanLength>0: Match strings from before a span.
942	} else {
943	// Try to match only one code point from before a string match if some
944	// string matched beyond it, so that we try all possible positions
945	// and don't overshoot.
946	spanLength=spanOneBack(spanSet, s, pos);
947	if(spanLength>0) {
948	if(spanLength==pos) {
949	return 0; // Reached the start of the string.
950	}
951	// Match strings before this code point.
952	// There cannot be any decrements below it because UnicodeSet strings
953	// contain multiple code points.
954	pos-=spanLength;
955	offsets.shift(spanLength);
956	spanLength=0;
957	continue; // Match strings from before a single code point.
958	}
959	// Match strings from before the next string match.
960	}
961	}
962	pos-=offsets.popMinimum();
963	spanLength=0; // Match strings from before a string match.
964	}
965	}
966
967	int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
968	if(spanCondition==USET_SPAN_NOT_CONTAINED) {
969	return spanNotUTF8(s, length);
970	}
971	int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);
972	if(spanLength==length) {
973	return length;
974	}
975
976	// Consider strings; they may overlap with the span.
977	OffsetList offsets;
978	if(spanCondition==USET_SPAN_CONTAINED) {
979	// Use offset list to try all possibilities.
980	offsets.setMaxLength(maxLength8);
981	}
982	int32_t pos=spanLength, rest=length-pos;
983	int32_t i, stringsLength=strings.size();
984	uint8_t *spanUTF8Lengths=spanLengths;
985	if(all) {
986	spanUTF8Lengths+=2*stringsLength;
987	}
988	for(;;) {
989	const uint8_t *s8=utf8;
990	int32_t length8;
991	if(spanCondition==USET_SPAN_CONTAINED) {
992	for(i=0; i<stringsLength; ++i) {
993	length8=utf8Lengths[i];
994	if(length8==0) {
995	continue; // String not representable in UTF-8.
996	}
997	int32_t overlap=spanUTF8Lengths[i];
998	if(overlap==ALL_CP_CONTAINED) {
999	s8+=length8;
1000	continue; // Irrelevant string.
1001	}
1002
1003	// Try to match this string at pos-overlap..pos.
1004	if(overlap>=LONG_SPAN) {
1005	overlap=length8;
1006	// While contained: No point matching fully inside the code point span.
1007	U8_BACK_1(s8, 0, overlap)do { if(((int8_t)((s8)[--(overlap)])<-0x40)) { (overlap)=utf8_back1SafeBody_71 (s8, 0, (overlap)); } } while (false); // Length of the string minus the last code point.
1008	}
1009	if(overlap>spanLength) {
1010	overlap=spanLength;
1011	}
1012	int32_t inc=length8-overlap; // Keep overlap+inc==length8.
1013	for(;;) {
1014	if(inc>rest) {
1015	break;
1016	}
1017	// Try to match if the increment is not listed already.
1018	// Match at code point boundaries. (The UTF-8 strings were converted
1019	// from UTF-16 and are guaranteed to be well-formed.)
1020	if(!U8_IS_TRAIL(s[pos-overlap])((int8_t)(s[pos-overlap])<-0x40) &&
1021	!offsets.containsOffset(inc) &&
1022	matches8(s+pos-overlap, s8, length8)) {
1023	if(inc==rest) {
1024	return length; // Reached the end of the string.
1025	}
1026	offsets.addOffset(inc);
1027	}
1028	if(overlap==0) {
1029	break;
1030	}
1031	--overlap;
1032	++inc;
1033	}
1034	s8+=length8;
1035	}
1036	} else /* USET_SPAN_SIMPLE */ {
1037	int32_t maxInc=0, maxOverlap=0;
1038	for(i=0; i<stringsLength; ++i) {
1039	length8=utf8Lengths[i];
1040	if(length8==0) {
1041	continue; // String not representable in UTF-8.
1042	}
1043	int32_t overlap=spanUTF8Lengths[i];
1044	// For longest match, we do need to try to match even an all-contained string
1045	// to find the match from the earliest start.
1046
1047	// Try to match this string at pos-overlap..pos.
1048	if(overlap>=LONG_SPAN) {
1049	overlap=length8;
1050	// Longest match: Need to match fully inside the code point span
1051	// to find the match from the earliest start.
1052	}
1053	if(overlap>spanLength) {
1054	overlap=spanLength;
1055	}
1056	int32_t inc=length8-overlap; // Keep overlap+inc==length8.
1057	for(;;) {
1058	if(inc>rest \|\| overlap<maxOverlap) {
1059	break;
1060	}
1061	// Try to match if the string is longer or starts earlier.
1062	// Match at code point boundaries. (The UTF-8 strings were converted
1063	// from UTF-16 and are guaranteed to be well-formed.)
1064	if(!U8_IS_TRAIL(s[pos-overlap])((int8_t)(s[pos-overlap])<-0x40) &&
1065	(overlap>maxOverlap \|\|
1066	/* redundant overlap==maxOverlap && */ inc>maxInc) &&
1067	matches8(s+pos-overlap, s8, length8)) {
1068	maxInc=inc; // Longest match from earliest start.
1069	maxOverlap=overlap;
1070	break;
1071	}
1072	--overlap;
1073	++inc;
1074	}
1075	s8+=length8;
1076	}
1077
1078	if(maxInc!=0 \|\| maxOverlap!=0) {
1079	// Longest-match algorithm, and there was a string match.
1080	// Simply continue after it.
1081	pos+=maxInc;
1082	rest-=maxInc;
1083	if(rest==0) {
1084	return length; // Reached the end of the string.
1085	}
1086	spanLength=0; // Match strings from after a string match.
1087	continue;
1088	}
1089	}
1090	// Finished trying to match all strings at pos.
1091
1092	if(spanLength!=0 \|\| pos==0) {
1093	// The position is after an unlimited code point span (spanLength!=0),
1094	// not after a string match.
1095	// The only position where spanLength==0 after a span is pos==0.
1096	// Otherwise, an unlimited code point span is only tried again when no
1097	// strings match, and if such a non-initial span fails we stop.
1098	if(offsets.isEmpty()) {
1099	return pos; // No strings matched after a span.
1100	}
1101	// Match strings from after the next string match.
1102	} else {
1103	// The position is after a string match (or a single code point).
1104	if(offsets.isEmpty()) {
1105	// No more strings matched after a previous string match.
1106	// Try another code point span from after the last string match.
1107	spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);
1108	if( spanLength==rest \|\| // Reached the end of the string, or
1109	spanLength==0 // neither strings nor span progressed.
1110	) {
1111	return pos+spanLength;
1112	}
1113	pos+=spanLength;
1114	rest-=spanLength;
1115	continue; // spanLength>0: Match strings from after a span.
1116	} else {
1117	// Try to match only one code point from after a string match if some
1118	// string matched beyond it, so that we try all possible positions
1119	// and don't overshoot.
1120	spanLength=spanOneUTF8(spanSet, s+pos, rest);
1121	if(spanLength>0) {
1122	if(spanLength==rest) {
1123	return length; // Reached the end of the string.
1124	}
1125	// Match strings after this code point.
1126	// There cannot be any increments below it because UnicodeSet strings
1127	// contain multiple code points.
1128	pos+=spanLength;
1129	rest-=spanLength;
1130	offsets.shift(spanLength);
1131	spanLength=0;
1132	continue; // Match strings from after a single code point.
1133	}
1134	// Match strings from after the next string match.
1135	}
1136	}
1137	int32_t minOffset=offsets.popMinimum();
1138	pos+=minOffset;
1139	rest-=minOffset;
1140	spanLength=0; // Match strings from after a string match.
1141	}
1142	}
1143
1144	int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
1145	if(spanCondition==USET_SPAN_NOT_CONTAINED) {
1146	return spanNotBackUTF8(s, length);
1147	}
1148	int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);
1149	if(pos==0) {
1150	return 0;
1151	}
1152	int32_t spanLength=length-pos;
1153
1154	// Consider strings; they may overlap with the span.
1155	OffsetList offsets;
1156	if(spanCondition==USET_SPAN_CONTAINED) {
1157	// Use offset list to try all possibilities.
1158	offsets.setMaxLength(maxLength8);
1159	}
1160	int32_t i, stringsLength=strings.size();
1161	uint8_t *spanBackUTF8Lengths=spanLengths;
1162	if(all) {
1163	spanBackUTF8Lengths+=3*stringsLength;
1164	}
1165	for(;;) {
1166	const uint8_t *s8=utf8;
1167	int32_t length8;
1168	if(spanCondition==USET_SPAN_CONTAINED) {
1169	for(i=0; i<stringsLength; ++i) {
1170	length8=utf8Lengths[i];
1171	if(length8==0) {
1172	continue; // String not representable in UTF-8.
1173	}
1174	int32_t overlap=spanBackUTF8Lengths[i];
1175	if(overlap==ALL_CP_CONTAINED) {
1176	s8+=length8;
1177	continue; // Irrelevant string.
1178	}
1179
1180	// Try to match this string at pos-(length8-overlap)..pos-length8.
1181	if(overlap>=LONG_SPAN) {
1182	overlap=length8;
1183	// While contained: No point matching fully inside the code point span.
1184	int32_t len1=0;
1185	U8_FWD_1(s8, len1, overlap)do { uint8_t __b=(s8)[(len1)++]; if(((uint8_t)((__b)-0xc2)<= 0x32) && (len1)!=(overlap)) { uint8_t __t1=(s8)[len1] ; if((0xe0<=__b && __b<0xf0)) { if(("\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30" [(__b)&0xf]&(1<<((uint8_t)(__t1)>>5))) && ++(len1)!=(overlap) && ((int8_t)((s8)[len1])<-0x40 )) { ++(len1); } } else if(__b<0xe0) { if(((int8_t)(__t1)< -0x40)) { ++(len1); } } else { if(("\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00" [(uint8_t)(__t1)>>4]&(1<<((__b)&7))) && ++(len1)!=(overlap) && ((int8_t)((s8)[len1])<-0x40 ) && ++(len1)!=(overlap) && ((int8_t)((s8)[len1 ])<-0x40)) { ++(len1); } } } } while (false);
1186	overlap-=len1; // Length of the string minus the first code point.
1187	}
1188	if(overlap>spanLength) {
1189	overlap=spanLength;
1190	}
1191	int32_t dec=length8-overlap; // Keep dec+overlap==length8.
1192	for(;;) {
1193	if(dec>pos) {
1194	break;
1195	}
1196	// Try to match if the decrement is not listed already.
1197	// Match at code point boundaries. (The UTF-8 strings were converted
1198	// from UTF-16 and are guaranteed to be well-formed.)
1199	if( !U8_IS_TRAIL(s[pos-dec])((int8_t)(s[pos-dec])<-0x40) &&
1200	!offsets.containsOffset(dec) &&
1201	matches8(s+pos-dec, s8, length8)
1202	) {
1203	if(dec==pos) {
1204	return 0; // Reached the start of the string.
1205	}
1206	offsets.addOffset(dec);
1207	}
1208	if(overlap==0) {
1209	break;
1210	}
1211	--overlap;
1212	++dec;
1213	}
1214	s8+=length8;
1215	}
1216	} else /* USET_SPAN_SIMPLE */ {
1217	int32_t maxDec=0, maxOverlap=0;
1218	for(i=0; i<stringsLength; ++i) {
1219	length8=utf8Lengths[i];
1220	if(length8==0) {
1221	continue; // String not representable in UTF-8.
1222	}
1223	int32_t overlap=spanBackUTF8Lengths[i];
1224	// For longest match, we do need to try to match even an all-contained string
1225	// to find the match from the latest end.
1226
1227	// Try to match this string at pos-(length8-overlap)..pos-length8.
1228	if(overlap>=LONG_SPAN) {
1229	overlap=length8;
1230	// Longest match: Need to match fully inside the code point span
1231	// to find the match from the latest end.
1232	}
1233	if(overlap>spanLength) {
1234	overlap=spanLength;
1235	}
1236	int32_t dec=length8-overlap; // Keep dec+overlap==length8.
1237	for(;;) {
1238	if(dec>pos \|\| overlap<maxOverlap) {
1239	break;
1240	}
1241	// Try to match if the string is longer or ends later.
1242	// Match at code point boundaries. (The UTF-8 strings were converted
1243	// from UTF-16 and are guaranteed to be well-formed.)
1244	if( !U8_IS_TRAIL(s[pos-dec])((int8_t)(s[pos-dec])<-0x40) &&
1245	(overlap>maxOverlap \|\| /* redundant overlap==maxOverlap && */ dec>maxDec) &&
1246	matches8(s+pos-dec, s8, length8)
1247	) {
1248	maxDec=dec; // Longest match from latest end.
1249	maxOverlap=overlap;
1250	break;
1251	}
1252	--overlap;
1253	++dec;
1254	}
1255	s8+=length8;
1256	}
1257
1258	if(maxDec!=0 \|\| maxOverlap!=0) {
1259	// Longest-match algorithm, and there was a string match.
1260	// Simply continue before it.
1261	pos-=maxDec;
1262	if(pos==0) {
1263	return 0; // Reached the start of the string.
1264	}
1265	spanLength=0; // Match strings from before a string match.
1266	continue;
1267	}
1268	}
1269	// Finished trying to match all strings at pos.
1270
1271	if(spanLength!=0 \|\| pos==length) {
1272	// The position is before an unlimited code point span (spanLength!=0),
1273	// not before a string match.
1274	// The only position where spanLength==0 before a span is pos==length.
1275	// Otherwise, an unlimited code point span is only tried again when no
1276	// strings match, and if such a non-initial span fails we stop.
1277	if(offsets.isEmpty()) {
1278	return pos; // No strings matched before a span.
1279	}
1280	// Match strings from before the next string match.
1281	} else {
1282	// The position is before a string match (or a single code point).
1283	if(offsets.isEmpty()) {
1284	// No more strings matched before a previous string match.
1285	// Try another code point span from before the last string match.
1286	int32_t oldPos=pos;
1287	pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);
1288	spanLength=oldPos-pos;
1289	if( pos==0 \|\| // Reached the start of the string, or
1290	spanLength==0 // neither strings nor span progressed.
1291	) {
1292	return pos;
1293	}
1294	continue; // spanLength>0: Match strings from before a span.
1295	} else {
1296	// Try to match only one code point from before a string match if some
1297	// string matched beyond it, so that we try all possible positions
1298	// and don't overshoot.
1299	spanLength=spanOneBackUTF8(spanSet, s, pos);
1300	if(spanLength>0) {
1301	if(spanLength==pos) {
1302	return 0; // Reached the start of the string.
1303	}
1304	// Match strings before this code point.
1305	// There cannot be any decrements below it because UnicodeSet strings
1306	// contain multiple code points.
1307	pos-=spanLength;
1308	offsets.shift(spanLength);
1309	spanLength=0;
1310	continue; // Match strings from before a single code point.
1311	}
1312	// Match strings from before the next string match.
1313	}
1314	}
1315	pos-=offsets.popMinimum();
1316	spanLength=0; // Match strings from before a string match.
1317	}
1318	}
1319
1320	/*
1321	* Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
1322	*
1323	* Theoretical algorithm:
1324	* - Iterate through the string, and at each code point boundary:
1325	* + If the code point there is in the set, then return with the current position.
1326	* + If a set string matches at the current position, then return with the current position.
1327	*
1328	* Optimized implementation:
1329	*
1330	* (Same assumption as for span() above.)
1331	*
1332	* Create and cache a spanNotSet which contains all of the single code points
1333	* of the original set but none of its strings.
1334	* For each set string add its initial code point to the spanNotSet.
1335	* (Also add its final code point for spanNotBack().)
1336	*
1337	* - Loop:
1338	* + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
1339	* + If the current code point is in the original set, then
1340	* return the current position.
1341	* + If any set string matches at the current position, then
1342	* return the current position.
1343	* + If there is no match at the current position, neither for the code point there
1344	* nor for any set string, then skip this code point and continue the loop.
1345	* This happens for set-string-initial code points that were added to spanNotSet
1346	* when there is not actually a match for such a set string.
1347	*/
1348
1349	int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {
1350	int32_t pos=0, rest=length;
1351	int32_t i, stringsLength=strings.size();
1352	do {
1353	// Span until we find a code point from the set,
1354	// or a code point that starts or ends some string.
1355	i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
1356	if(i==rest) {
1357	return length; // Reached the end of the string.
1358	}
1359	pos+=i;
1360	rest-=i;
1361
1362	// Check whether the current code point is in the original set,
1363	// without the string starts and ends.
1364	int32_t cpLength=spanOne(spanSet, s+pos, rest);
1365	if(cpLength>0) {
1366	return pos; // There is a set element at pos.
1367	}
1368
1369	// Try to match the strings at pos.
1370	for(i=0; i<stringsLength; ++i) {
1371	if(spanLengths[i]==ALL_CP_CONTAINED) {
1372	continue; // Irrelevant string. (Also the empty string.)
1373	}
1374	const UnicodeString &string=(const UnicodeString )strings.elementAt(i);
1375	const UChar *s16=string.getBuffer();
1376	int32_t length16=string.length();
1377	U_ASSERT(length>0)(void)0;
1378	if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
1379	return pos; // There is a set element at pos.
1380	}
1381	}
1382
1383	// The span(while not contained) ended on a string start/end which is
1384	// not in the original set. Skip this code point and continue.
1385	// cpLength<0
1386	pos-=cpLength;
1387	rest+=cpLength;
1388	} while(rest!=0);
1389	return length; // Reached the end of the string.
1390	}
1391
1392	int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const {
1393	int32_t pos=length;
1394	int32_t i, stringsLength=strings.size();
1395	do {
1396	// Span until we find a code point from the set,
1397	// or a code point that starts or ends some string.
1398	pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
1399	if(pos==0) {
1400	return 0; // Reached the start of the string.
1401	}
1402
1403	// Check whether the current code point is in the original set,
1404	// without the string starts and ends.
1405	int32_t cpLength=spanOneBack(spanSet, s, pos);
1406	if(cpLength>0) {
1407	return pos; // There is a set element at pos.
1408	}
1409
1410	// Try to match the strings at pos.
1411	for(i=0; i<stringsLength; ++i) {
1412	// Use spanLengths rather than a spanBackLengths pointer because
1413	// it is easier and we only need to know whether the string is irrelevant
1414	// which is the same in either array.
1415	if(spanLengths[i]==ALL_CP_CONTAINED) {
1416	continue; // Irrelevant string. (Also the empty string.)
1417	}
1418	const UnicodeString &string=(const UnicodeString )strings.elementAt(i);
1419	const UChar *s16=string.getBuffer();
1420	int32_t length16=string.length();
1421	U_ASSERT(length>0)(void)0;
1422	if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
1423	return pos; // There is a set element at pos.
1424	}
1425	}
1426
1427	// The span(while not contained) ended on a string start/end which is
1428	// not in the original set. Skip this code point and continue.
1429	// cpLength<0
1430	pos+=cpLength;
1431	} while(pos!=0);
1432	return 0; // Reached the start of the string.
1433	}
1434
1435	int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
1436	int32_t pos=0, rest=length;
1437	int32_t i, stringsLength=strings.size();
1438	uint8_t *spanUTF8Lengths=spanLengths;
1439	if(all) {
1440	spanUTF8Lengths+=2*stringsLength;
1441	}
1442	do {
1443	// Span until we find a code point from the set,
1444	// or a code point that starts or ends some string.
1445	i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);
1446	if(i==rest) {
1447	return length; // Reached the end of the string.
1448	}
1449	pos+=i;
1450	rest-=i;
1451
1452	// Check whether the current code point is in the original set,
1453	// without the string starts and ends.
1454	int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
1455	if(cpLength>0) {
1456	return pos; // There is a set element at pos.
1457	}
1458
1459	// Try to match the strings at pos.
1460	const uint8_t *s8=utf8;
1461	int32_t length8;
1462	for(i=0; i<stringsLength; ++i) {
1463	length8=utf8Lengths[i];
1464	// ALL_CP_CONTAINED: Irrelevant string.
1465	if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
1466	return pos; // There is a set element at pos.
1467	}
1468	s8+=length8;
1469	}
1470
1471	// The span(while not contained) ended on a string start/end which is
1472	// not in the original set. Skip this code point and continue.
1473	// cpLength<0
1474	pos-=cpLength;
1475	rest+=cpLength;
1476	} while(rest!=0);
1477	return length; // Reached the end of the string.
1478	}
1479
1480	int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
1481	int32_t pos=length;
1482	int32_t i, stringsLength=strings.size();
1483	uint8_t *spanBackUTF8Lengths=spanLengths;
1484	if(all) {
1485	spanBackUTF8Lengths+=3*stringsLength;
1486	}
1487	do {
1488	// Span until we find a code point from the set,
1489	// or a code point that starts or ends some string.
1490	pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);
1491	if(pos==0) {
1492	return 0; // Reached the start of the string.
1493	}
1494
1495	// Check whether the current code point is in the original set,
1496	// without the string starts and ends.
1497	int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
1498	if(cpLength>0) {
1499	return pos; // There is a set element at pos.
1500	}
1501
1502	// Try to match the strings at pos.
1503	const uint8_t *s8=utf8;
1504	int32_t length8;
1505	for(i=0; i<stringsLength; ++i) {
1506	length8=utf8Lengths[i];
1507	// ALL_CP_CONTAINED: Irrelevant string.
1508	if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
1509	return pos; // There is a set element at pos.
1510	}
1511	s8+=length8;
1512	}
1513
1514	// The span(while not contained) ended on a string start/end which is
1515	// not in the original set. Skip this code point and continue.
1516	// cpLength<0
1517	pos+=cpLength;
1518	} while(pos!=0);
1519	return 0; // Reached the start of the string.
1520	}
1521
1522	U_NAMESPACE_END}