../deps/icu-small/source/i18n/regexcmp.cpp

Bug Summary

File:	out/../deps/icu-small/source/i18n/regexcmp.cpp
Warning:	line 406, column 21 Value stored to 'op' during its initialization is never read

Annotated Source Code

Press '?' to see keyboard shortcuts

Show analyzer invocation

clang -cc1 -cc1 -triple x86_64-unknown-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name regexcmp.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -pic-is-pie -mframe-pointer=all -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/home/maurizio/node-v18.6.0/out -resource-dir /usr/local/lib/clang/16.0.0 -D V8_DEPRECATION_WARNINGS -D V8_IMMINENT_DEPRECATION_WARNINGS -D _GLIBCXX_USE_CXX11_ABI=1 -D NODE_OPENSSL_CONF_NAME=nodejs_conf -D NODE_OPENSSL_HAS_QUIC -D __STDC_FORMAT_MACROS -D OPENSSL_NO_PINSHARED -D OPENSSL_THREADS -D U_COMMON_IMPLEMENTATION=1 -D U_I18N_IMPLEMENTATION=1 -D U_IO_IMPLEMENTATION=1 -D U_TOOLUTIL_IMPLEMENTATION=1 -D U_ATTRIBUTE_DEPRECATED= -D _CRT_SECURE_NO_DEPRECATE= -D U_STATIC_IMPLEMENTATION=1 -D UCONFIG_NO_SERVICE=1 -D U_ENABLE_DYLOAD=0 -D U_HAVE_STD_STRING=1 -D UCONFIG_NO_BREAK_ITERATION=0 -I ../deps/icu-small/source/common -I ../deps/icu-small/source/i18n -I ../deps/icu-small/source/tools/toolutil -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../include/c++/8 -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../include/c++/8/x86_64-redhat-linux -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../include/c++/8/backward -internal-isystem /usr/local/lib/clang/16.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../x86_64-redhat-linux/include -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -Wno-unused-parameter -Wno-deprecated-declarations -Wno-strict-aliasing -std=gnu++17 -fdeprecated-macro -fdebug-compilation-dir=/home/maurizio/node-v18.6.0/out -ferror-limit 19 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2022-08-22-142216-507842-1 -x c++ ../deps/icu-small/source/i18n/regexcmp.cpp

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	//
4	// file: regexcmp.cpp
5	//
6	// Copyright (C) 2002-2016 International Business Machines Corporation and others.
7	// All Rights Reserved.
8	//
9	// This file contains the ICU regular expression compiler, which is responsible
10	// for processing a regular expression pattern into the compiled form that
11	// is used by the match finding engine.
12	//
13
14	#include "unicode/utypes.h"
15
16	#if !UCONFIG_NO_REGULAR_EXPRESSIONS0
17
18	#include "unicode/ustring.h"
19	#include "unicode/unistr.h"
20	#include "unicode/uniset.h"
21	#include "unicode/uchar.h"
22	#include "unicode/uchriter.h"
23	#include "unicode/parsepos.h"
24	#include "unicode/parseerr.h"
25	#include "unicode/regex.h"
26	#include "unicode/utf.h"
27	#include "unicode/utf16.h"
28	#include "patternprops.h"
29	#include "putilimp.h"
30	#include "cmemory.h"
31	#include "cstr.h"
32	#include "cstring.h"
33	#include "uvectr32.h"
34	#include "uvectr64.h"
35	#include "uassert.h"
36	#include "uinvchar.h"
37
38	#include "regeximp.h"
39	#include "regexcst.h" // Contains state table for the regex pattern parser.
40	// generated by a Perl script.
41	#include "regexcmp.h"
42	#include "regexst.h"
43	#include "regextxt.h"
44
45
46
47	U_NAMESPACE_BEGINnamespace icu_71 {
48
49
50	//------------------------------------------------------------------------------
51	//
52	// Constructor.
53	//
54	//------------------------------------------------------------------------------
55	RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
56	fParenStack(status), fSetStack(uprv_deleteUObjectuprv_deleteUObject_71, nullptr, status), fSetOpStack(status)
57	{
58	// Lazy init of all shared global sets (needed for init()'s empty text)
59	RegexStaticSets::initGlobals(&status);
60
61	fStatus = &status;
62
63	fRXPat = rxp;
64	fScanIndex = 0;
65	fLastChar = -1;
66	fPeekChar = -1;
67	fLineNum = 1;
68	fCharNum = 0;
69	fQuoteMode = FALSE0;
70	fInBackslashQuote = FALSE0;
71	fModeFlags = fRXPat->fFlags \| 0x80000000;
72	fEOLComments = TRUE1;
73
74	fMatchOpenParen = -1;
75	fMatchCloseParen = -1;
76	fCaptureName = NULL__null;
77	fLastSetLiteral = U_SENTINEL(-1);
78
79	if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
80	status = rxp->fDeferredStatus;
81	}
82	}
83
84	static const UChar chAmp = 0x26; // '&'
85	static const UChar chDash = 0x2d; // '-'
86
87
88	//------------------------------------------------------------------------------
89	//
90	// Destructor
91	//
92	//------------------------------------------------------------------------------
93	RegexCompile::~RegexCompile() {
94	delete fCaptureName; // Normally will be NULL, but can exist if pattern
95	// compilation stops with a syntax error.
96	}
97
98	static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
99	set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
100	}
101
102	//------------------------------------------------------------------------------
103	//
104	// Compile regex pattern. The state machine for rexexp pattern parsing is here.
105	// The state tables are hand-written in the file regexcst.txt,
106	// and converted to the form used here by a perl
107	// script regexcst.pl
108	//
109	//------------------------------------------------------------------------------
110	void RegexCompile::compile(
111	const UnicodeString &pat, // Source pat to be compiled.
112	UParseError &pp, // Error position info
113	UErrorCode &e) // Error Code
114	{
115	fRXPat->fPatternString = new UnicodeString(pat);
116	UText patternText = UTEXT_INITIALIZER{ UTEXT_MAGIC, 0, 0, sizeof(UText), 0, 0, 0, 0, 0, 0, __null, __null, __null, __null, __null, __null, __null, __null, 0, 0 , 0, 0, 0, 0 };
117	utext_openConstUnicodeStringutext_openConstUnicodeString_71(&patternText, fRXPat->fPatternString, &e);
118
119	if (U_SUCCESS(e)) {
120	compile(&patternText, pp, e);
121	utext_closeutext_close_71(&patternText);
122	}
123	}
124
125	//
126	// compile, UText mode
127	// All the work is actually done here.
128	//
129	void RegexCompile::compile(
130	UText *pat, // Source pat to be compiled.
131	UParseError &pp, // Error position info
132	UErrorCode &e) // Error Code
133	{
134	fStatus = &e;
135	fParseErr = &pp;
136	fStackPtr = 0;
137	fStack[fStackPtr] = 0;
138
139	if (U_FAILURE(*fStatus)) {
140	return;
141	}
142
143	// There should be no pattern stuff in the RegexPattern object. They can not be reused.
144	U_ASSERT(fRXPat->fPattern == NULL \|\| utext_nativeLength(fRXPat->fPattern) == 0)(void)0;
145
146	// Prepare the RegexPattern object to receive the compiled pattern.
147	fRXPat->fPattern = utext_cloneutext_clone_71(fRXPat->fPattern, pat, FALSE0, TRUE1, fStatus);
148	if (U_FAILURE(*fStatus)) {
149	return;
150	}
151
152	// Initialize the pattern scanning state machine
153	fPatternLength = utext_nativeLengthutext_nativeLength_71(pat);
154	uint16_t state = 1;
155	const RegexTableEl *tableEl;
156
157	// UREGEX_LITERAL force entire pattern to be treated as a literal string.
158	if (fModeFlags & UREGEX_LITERAL) {
159	fQuoteMode = TRUE1;
160	}
161
162	nextChar(fC); // Fetch the first char from the pattern string.
163
164	//
165	// Main loop for the regex pattern parsing state machine.
166	// Runs once per state transition.
167	// Each time through optionally performs, depending on the state table,
168	// - an advance to the the next pattern char
169	// - an action to be performed.
170	// - pushing or popping a state to/from the local state return stack.
171	// file regexcst.txt is the source for the state table. The logic behind
172	// recongizing the pattern syntax is there, not here.
173	//
174	for (;;) {
175	// Bail out if anything has gone wrong.
176	// Regex pattern parsing stops on the first error encountered.
177	if (U_FAILURE(*fStatus)) {
178	break;
179	}
180
181	U_ASSERT(state != 0)(void)0;
182
183	// Find the state table element that matches the input char from the pattern, or the
184	// class of the input character. Start with the first table row for this
185	// state, then linearly scan forward until we find a row that matches the
186	// character. The last row for each state always matches all characters, so
187	// the search will stop there, if not before.
188	//
189	tableEl = &gRuleParseStateTable[state];
190	REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d) state=%s ",
191	fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
192
193	for (;;) { // loop through table rows belonging to this state, looking for one
194	// that matches the current input char.
195	REGEX_SCAN_DEBUG_PRINTF(("."));
196	if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE0 && tableEl->fCharClass == fC.fChar) {
197	// Table row specified an individual character, not a set, and
198	// the input character is not quoted, and
199	// the input character matched it.
200	break;
201	}
202	if (tableEl->fCharClass == 255) {
203	// Table row specified default, match anything character class.
204	break;
205	}
206	if (tableEl->fCharClass == 254 && fC.fQuoted) {
207	// Table row specified "quoted" and the char was quoted.
208	break;
209	}
210	if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1) {
211	// Table row specified eof and we hit eof on the input.
212	break;
213	}
214
215	if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class &&
216	fC.fQuoted == FALSE0 && // char is not escaped &&
217	fC.fChar != (UChar32)-1) { // char is not EOF
218	U_ASSERT(tableEl->fCharClass <= 137)(void)0;
219	if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
220	// Table row specified a character class, or set of characters,
221	// and the current char matches it.
222	break;
223	}
224	}
225
226	// No match on this row, advance to the next row for this state,
227	tableEl++;
228	}
229	REGEX_SCAN_DEBUG_PRINTF(("\n"));
230
231	//
232	// We've found the row of the state table that matches the current input
233	// character from the rules string.
234	// Perform any action specified by this row in the state table.
235	if (doParseActions(tableEl->fAction) == FALSE0) {
236	// Break out of the state machine loop if the
237	// the action signalled some kind of error, or
238	// the action was to exit, occurs on normal end-of-rules-input.
239	break;
240	}
241
242	if (tableEl->fPushState != 0) {
243	fStackPtr++;
244	if (fStackPtr >= kStackSize) {
245	error(U_REGEX_INTERNAL_ERROR);
246	REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
247	fStackPtr--;
248	}
249	fStack[fStackPtr] = tableEl->fPushState;
250	}
251
252	//
253	// NextChar. This is where characters are actually fetched from the pattern.
254	// Happens under control of the 'n' tag in the state table.
255	//
256	if (tableEl->fNextChar) {
257	nextChar(fC);
258	}
259
260	// Get the next state from the table entry, or from the
261	// state stack if the next state was specified as "pop".
262	if (tableEl->fNextState != 255) {
263	state = tableEl->fNextState;
264	} else {
265	state = fStack[fStackPtr];
266	fStackPtr--;
267	if (fStackPtr < 0) {
268	// state stack underflow
269	// This will occur if the user pattern has mis-matched parentheses,
270	// with extra close parens.
271	//
272	fStackPtr++;
273	error(U_REGEX_MISMATCHED_PAREN);
274	}
275	}
276
277	}
278
279	if (U_FAILURE(*fStatus)) {
280	// Bail out if the pattern had errors.
281	return;
282	}
283
284	//
285	// The pattern has now been read and processed, and the compiled code generated.
286	//
287
288	//
289	// The pattern's fFrameSize so far has accumulated the requirements for
290	// storage for capture parentheses, counters, etc. that are encountered
291	// in the pattern. Add space for the two variables that are always
292	// present in the saved state: the input string position (int64_t) and
293	// the position in the compiled pattern.
294	//
295	allocateStackData(RESTACKFRAME_HDRCOUNT2);
296
297	//
298	// Optimization pass 1: NOPs, back-references, and case-folding
299	//
300	stripNOPs();
301
302	//
303	// Get bounds for the minimum and maximum length of a string that this
304	// pattern can match. Used to avoid looking for matches in strings that
305	// are too short.
306	//
307	fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
308
309	//
310	// Optimization pass 2: match start type
311	//
312	matchStartType();
313
314	//
315	// Set up fast latin-1 range sets
316	//
317	int32_t numSets = fRXPat->fSets->size();
318	fRXPat->fSets8 = new Regex8BitSet[numSets];
319	// Null pointer check.
320	if (fRXPat->fSets8 == NULL__null) {
321	e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
322	return;
323	}
324	int32_t i;
325	for (i=0; i<numSets; i++) {
326	UnicodeSet s = (UnicodeSet )fRXPat->fSets->elementAt(i);
327	fRXPat->fSets8[i].init(s);
328	}
329
330	}
331
332
333
334
335
336	//------------------------------------------------------------------------------
337	//
338	// doParseAction Do some action during regex pattern parsing.
339	// Called by the parse state machine.
340	//
341	// Generation of the match engine PCode happens here, or
342	// in functions called from the parse actions defined here.
343	//
344	//
345	//------------------------------------------------------------------------------
346	UBool RegexCompile::doParseActions(int32_t action)
347	{
348	UBool returnVal = TRUE1;
349
350	switch ((Regex_PatternParseAction)action) {
351
352	case doPatStart:
353	// Start of pattern compiles to:
354	//0 SAVE 2 Fall back to position of FAIL
355	//1 jmp 3
356	//2 FAIL Stop if we ever reach here.
357	//3 NOP Dummy, so start of pattern looks the same as
358	// the start of an ( grouping.
359	//4 NOP Resreved, will be replaced by a save if there are
360	// OR \| operators at the top level
361	appendOp(URX_STATE_SAVE, 2);
362	appendOp(URX_JMP, 3);
363	appendOp(URX_FAIL, 0);
364
365	// Standard open nonCapture paren action emits the two NOPs and
366	// sets up the paren stack frame.
367	doParseActions(doOpenNonCaptureParen);
368	break;
369
370	case doPatFinish:
371	// We've scanned to the end of the pattern
372	// The end of pattern compiles to:
373	// URX_END
374	// which will stop the runtime match engine.
375	// Encountering end of pattern also behaves like a close paren,
376	// and forces fixups of the State Save at the beginning of the compiled pattern
377	// and of any OR operations at the top level.
378	//
379	handleCloseParen();
380	if (fParenStack.size() > 0) {
381	// Missing close paren in pattern.
382	error(U_REGEX_MISMATCHED_PAREN);
383	}
384
385	// add the END operation to the compiled pattern.
386	appendOp(URX_END, 0);
387
388	// Terminate the pattern compilation state machine.
389	returnVal = FALSE0;
390	break;
391
392
393
394	case doOrOperator:
395	// Scanning a '\|', as in (A\|B)
396	{
397	// Generate code for any pending literals preceding the '\|'
398	fixLiterals(FALSE0);
399
400	// Insert a SAVE operation at the start of the pattern section preceding
401	// this OR at this level. This SAVE will branch the match forward
402	// to the right hand side of the OR in the event that the left hand
403	// side fails to match and backtracks. Locate the position for the
404	// save from the location on the top of the parentheses stack.
405	int32_t savePosition = fParenStack.popi();
406	int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition);
	Value stored to 'op' during its initialization is never read
407	U_ASSERT(URX_TYPE(op) == URX_NOP)(void)0; // original contents of reserved location
408	op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
409	fRXPat->fCompiledPat->setElementAt(op, savePosition);
410
411	// Append an JMP operation into the compiled pattern. The operand for
412	// the JMP will eventually be the location following the ')' for the
413	// group. This will be patched in later, when the ')' is encountered.
414	appendOp(URX_JMP, 0);
415
416	// Push the position of the newly added JMP op onto the parentheses stack.
417	// This registers if for fixup when this block's close paren is encountered.
418	fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
419
420	// Append a NOP to the compiled pattern. This is the slot reserved
421	// for a SAVE in the event that there is yet another '\|' following
422	// this one.
423	appendOp(URX_NOP, 0);
424	fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
425	}
426	break;
427
428
429	case doBeginNamedCapture:
430	// Scanning (?<letter.
431	// The first letter of the name will come through again under doConinueNamedCapture.
432	fCaptureName = new UnicodeString();
433	if (fCaptureName == NULL__null) {
434	error(U_MEMORY_ALLOCATION_ERROR);
435	}
436	break;
437
438	case doContinueNamedCapture:
439	fCaptureName->append(fC.fChar);
440	break;
441
442	case doBadNamedCapture:
443	error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
444	break;
445
446	case doOpenCaptureParen:
447	// Open Capturing Paren, possibly named.
448	// Compile to a
449	// - NOP, which later may be replaced by a save-state if the
450	// parenthesized group gets a * quantifier, followed by
451	// - START_CAPTURE n where n is stack frame offset to the capture group variables.
452	// - NOP, which may later be replaced by a save-state if there
453	// is an '\|' alternation within the parens.
454	//
455	// Each capture group gets three slots in the save stack frame:
456	// 0: Capture Group start position (in input string being matched.)
457	// 1: Capture Group end position.
458	// 2: Start of Match-in-progress.
459	// The first two locations are for a completed capture group, and are
460	// referred to by back references and the like.
461	// The third location stores the capture start position when an START_CAPTURE is
462	// encountered. This will be promoted to a completed capture when (and if) the corresponding
463	// END_CAPTURE is encountered.
464	{
465	fixLiterals();
466	appendOp(URX_NOP, 0);
467	int32_t varsLoc = allocateStackData(3); // Reserve three slots in match stack frame.
468	appendOp(URX_START_CAPTURE, varsLoc);
469	appendOp(URX_NOP, 0);
470
471	// On the Parentheses stack, start a new frame and add the positions
472	// of the two NOPs. Depending on what follows in the pattern, the
473	// NOPs may be changed to SAVE_STATE or JMP ops, with a target
474	// address of the end of the parenthesized group.
475	fParenStack.push(fModeFlags, *fStatus); // Match mode state
476	fParenStack.push(capturing, *fStatus); // Frame type.
477	fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP location
478	fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc
479
480	// Save the mapping from group number to stack frame variable position.
481	fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
482
483	// If this is a named capture group, add the name->group number mapping.
484	if (fCaptureName != NULL__null) {
485	if (!fRXPat->initNamedCaptureMap()) {
486	if (U_SUCCESS(*fStatus)) {
487	error(fRXPat->fDeferredStatus);
488	}
489	break;
490	}
491	int32_t groupNumber = fRXPat->fGroupMap->size();
492	int32_t previousMapping = uhash_putiuhash_puti_71(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
493	fCaptureName = NULL__null; // hash table takes ownership of the name (key) string.
494	if (previousMapping > 0 && U_SUCCESS(*fStatus)) {
495	error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
496	}
497	}
498	}
499	break;
500
501	case doOpenNonCaptureParen:
502	// Open non-caputuring (grouping only) Paren.
503	// Compile to a
504	// - NOP, which later may be replaced by a save-state if the
505	// parenthesized group gets a * quantifier, followed by
506	// - NOP, which may later be replaced by a save-state if there
507	// is an '\|' alternation within the parens.
508	{
509	fixLiterals();
510	appendOp(URX_NOP, 0);
511	appendOp(URX_NOP, 0);
512
513	// On the Parentheses stack, start a new frame and add the positions
514	// of the two NOPs.
515	fParenStack.push(fModeFlags, *fStatus); // Match mode state
516	fParenStack.push(plain, *fStatus); // Begin a new frame.
517	fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
518	fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc
519	}
520	break;
521
522
523	case doOpenAtomicParen:
524	// Open Atomic Paren. (?>
525	// Compile to a
526	// - NOP, which later may be replaced if the parenthesized group
527	// has a quantifier, followed by
528	// - STO_SP save state stack position, so it can be restored at the ")"
529	// - NOP, which may later be replaced by a save-state if there
530	// is an '\|' alternation within the parens.
531	{
532	fixLiterals();
533	appendOp(URX_NOP, 0);
534	int32_t varLoc = allocateData(1); // Reserve a data location for saving the state stack ptr.
535	appendOp(URX_STO_SP, varLoc);
536	appendOp(URX_NOP, 0);
537
538	// On the Parentheses stack, start a new frame and add the positions
539	// of the two NOPs. Depending on what follows in the pattern, the
540	// NOPs may be changed to SAVE_STATE or JMP ops, with a target
541	// address of the end of the parenthesized group.
542	fParenStack.push(fModeFlags, *fStatus); // Match mode state
543	fParenStack.push(atomic, *fStatus); // Frame type.
544	fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP
545	fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
546	}
547	break;
548
549
550	case doOpenLookAhead:
551	// Positive Look-ahead (?= stuff )
552	//
553	// Note: Addition of transparent input regions, with the need to
554	// restore the original regions when failing out of a lookahead
555	// block, complicated this sequence. Some combined opcodes
556	// might make sense - or might not, lookahead aren't that common.
557	//
558	// Caution: min match length optimization knows about this
559	// sequence; don't change without making updates there too.
560	//
561	// Compiles to
562	// 1 LA_START dataLoc Saves SP, Input Pos, Active input region.
563	// 2. STATE_SAVE 4 on failure of lookahead, goto 4
564	// 3 JMP 6 continue ...
565	//
566	// 4. LA_END Look Ahead failed. Restore regions.
567	// 5. BACKTRACK and back track again.
568	//
569	// 6. NOP reserved for use by quantifiers on the block.
570	// Look-ahead can't have quantifiers, but paren stack
571	// compile time conventions require the slot anyhow.
572	// 7. NOP may be replaced if there is are '\|' ops in the block.
573	// 8. code for parenthesized stuff.
574	// 9. LA_END
575	//
576	// Four data slots are reserved, for saving state on entry to the look-around
577	// 0: stack pointer on entry.
578	// 1: input position on entry.
579	// 2: fActiveStart, the active bounds start on entry.
580	// 3: fActiveLimit, the active bounds limit on entry.
581	{
582	fixLiterals();
583	int32_t dataLoc = allocateData(4);
584	appendOp(URX_LA_START, dataLoc);
585	appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
586	appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
587	appendOp(URX_LA_END, dataLoc);
588	appendOp(URX_BACKTRACK, 0);
589	appendOp(URX_NOP, 0);
590	appendOp(URX_NOP, 0);
591
592	// On the Parentheses stack, start a new frame and add the positions
593	// of the NOPs.
594	fParenStack.push(fModeFlags, *fStatus); // Match mode state
595	fParenStack.push(lookAhead, *fStatus); // Frame type.
596	fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
597	fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location
598	}
599	break;
600
601	case doOpenLookAheadNeg:
602	// Negated Lookahead. (?! stuff )
603	// Compiles to
604	// 1. LA_START dataloc
605	// 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state,
606	// // which continues with the match.
607	// 3. NOP // Std. Open Paren sequence, for possible '\|'
608	// 4. code for parenthesized stuff.
609	// 5. LA_END // Cut back stack, remove saved state from step 2.
610	// 6. BACKTRACK // code in block succeeded, so neg. lookahead fails.
611	// 7. END_LA // Restore match region, in case look-ahead was using
612	// an alternate (transparent) region.
613	// Four data slots are reserved, for saving state on entry to the look-around
614	// 0: stack pointer on entry.
615	// 1: input position on entry.
616	// 2: fActiveStart, the active bounds start on entry.
617	// 3: fActiveLimit, the active bounds limit on entry.
618	{
619	fixLiterals();
620	int32_t dataLoc = allocateData(4);
621	appendOp(URX_LA_START, dataLoc);
622	appendOp(URX_STATE_SAVE, 0); // dest address will be patched later.
623	appendOp(URX_NOP, 0);
624
625	// On the Parentheses stack, start a new frame and add the positions
626	// of the StateSave and NOP.
627	fParenStack.push(fModeFlags, *fStatus); // Match mode state
628	fParenStack.push(negLookAhead, *fStatus); // Frame type
629	fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The STATE_SAVE location
630	fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location
631
632	// Instructions #5 - #7 will be added when the ')' is encountered.
633	}
634	break;
635
636	case doOpenLookBehind:
637	{
638	// Compile a (?<= look-behind open paren.
639	//
640	// Compiles to
641	// 0 URX_LB_START dataLoc
642	// 1 URX_LB_CONT dataLoc
643	// 2 MinMatchLen
644	// 3 MaxMatchLen
645	// 4 URX_NOP Standard '(' boilerplate.
646	// 5 URX_NOP Reserved slot for use with '\|' ops within (block).
647	// 6 <code for LookBehind expression>
648	// 7 URX_LB_END dataLoc # Check match len, restore input len
649	// 8 URX_LA_END dataLoc # Restore stack, input pos
650	//
651	// Allocate a block of matcher data, to contain (when running a match)
652	// 0: Stack ptr on entry
653	// 1: Input Index on entry
654	// 2: fActiveStart, the active bounds start on entry.
655	// 3: fActiveLimit, the active bounds limit on entry.
656	// 4: Start index of match current match attempt.
657	// The first four items must match the layout of data for LA_START / LA_END
658
659	// Generate match code for any pending literals.
660	fixLiterals();
661
662	// Allocate data space
663	int32_t dataLoc = allocateData(5);
664
665	// Emit URX_LB_START
666	appendOp(URX_LB_START, dataLoc);
667
668	// Emit URX_LB_CONT
669	appendOp(URX_LB_CONT, dataLoc);
670	appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled later.
671	appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled later.
672
673	// Emit the NOPs
674	appendOp(URX_NOP, 0);
675	appendOp(URX_NOP, 0);
676
677	// On the Parentheses stack, start a new frame and add the positions
678	// of the URX_LB_CONT and the NOP.
679	fParenStack.push(fModeFlags, *fStatus); // Match mode state
680	fParenStack.push(lookBehind, *fStatus); // Frame type
681	fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
682	fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location
683
684	// The final two instructions will be added when the ')' is encountered.
685	}
686
687	break;
688
689	case doOpenLookBehindNeg:
690	{
691	// Compile a (?<! negated look-behind open paren.
692	//
693	// Compiles to
694	// 0 URX_LB_START dataLoc # Save entry stack, input len
695	// 1 URX_LBN_CONT dataLoc # Iterate possible match positions
696	// 2 MinMatchLen
697	// 3 MaxMatchLen
698	// 4 continueLoc (9)
699	// 5 URX_NOP Standard '(' boilerplate.
700	// 6 URX_NOP Reserved slot for use with '\|' ops within (block).
701	// 7 <code for LookBehind expression>
702	// 8 URX_LBN_END dataLoc # Check match len, cause a FAIL
703	// 9 ...
704	//
705	// Allocate a block of matcher data, to contain (when running a match)
706	// 0: Stack ptr on entry
707	// 1: Input Index on entry
708	// 2: fActiveStart, the active bounds start on entry.
709	// 3: fActiveLimit, the active bounds limit on entry.
710	// 4: Start index of match current match attempt.
711	// The first four items must match the layout of data for LA_START / LA_END
712
713	// Generate match code for any pending literals.
714	fixLiterals();
715
716	// Allocate data space
717	int32_t dataLoc = allocateData(5);
718
719	// Emit URX_LB_START
720	appendOp(URX_LB_START, dataLoc);
721
722	// Emit URX_LBN_CONT
723	appendOp(URX_LBN_CONT, dataLoc);
724	appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled later.
725	appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled later.
726	appendOp(URX_RESERVED_OP, 0); // Continue Loc. To be filled later.
727
728	// Emit the NOPs
729	appendOp(URX_NOP, 0);
730	appendOp(URX_NOP, 0);
731
732	// On the Parentheses stack, start a new frame and add the positions
733	// of the URX_LB_CONT and the NOP.
734	fParenStack.push(fModeFlags, *fStatus); // Match mode state
735	fParenStack.push(lookBehindN, *fStatus); // Frame type
736	fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
737	fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location
738
739	// The final two instructions will be added when the ')' is encountered.
740	}
741	break;
742
743	case doConditionalExpr:
744	// Conditionals such as (?(1)a:b)
745	case doPerlInline:
746	// Perl inline-conditionals. (?{perl code}a\|b) We're not perl, no way to do them.
747	error(U_REGEX_UNIMPLEMENTED);
748	break;
749
750
751	case doCloseParen:
752	handleCloseParen();
753	if (fParenStack.size() <= 0) {
754	// Extra close paren, or missing open paren.
755	error(U_REGEX_MISMATCHED_PAREN);
756	}
757	break;
758
759	case doNOP:
760	break;
761
762
763	case doBadOpenParenType:
764	case doRuleError:
765	error(U_REGEX_RULE_SYNTAX);
766	break;
767
768
769	case doMismatchedParenErr:
770	error(U_REGEX_MISMATCHED_PAREN);
771	break;
772
773	case doPlus:
774	// Normal '+' compiles to
775	// 1. stuff to be repeated (already built)
776	// 2. jmp-sav 1
777	// 3. ...
778	//
779	// Or, if the item to be repeated can match a zero length string,
780	// 1. STO_INP_LOC data-loc
781	// 2. body of stuff to be repeated
782	// 3. JMP_SAV_X 2
783	// 4. ...
784
785	//
786	// Or, if the item to be repeated is simple
787	// 1. Item to be repeated.
788	// 2. LOOP_SR_I set number (assuming repeated item is a set ref)
789	// 3. LOOP_C stack location
790	{
791	int32_t topLoc = blockTopLoc(FALSE0); // location of item #1
792	int32_t frameLoc;
793
794	// Check for simple constructs, which may get special optimized code.
795	if (topLoc == fRXPat->fCompiledPat->size() - 1) {
796	int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
797
798	if (URX_TYPE(repeatedOp)((uint32_t)(repeatedOp) >> 24) == URX_SETREF) {
799	// Emit optimized code for [char set]+
800	appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp)((repeatedOp) & 0xffffff));
801	frameLoc = allocateStackData(1);
802	appendOp(URX_LOOP_C, frameLoc);
803	break;
804	}
805
806	if (URX_TYPE(repeatedOp)((uint32_t)(repeatedOp) >> 24) == URX_DOTANY \|\|
807	URX_TYPE(repeatedOp)((uint32_t)(repeatedOp) >> 24) == URX_DOTANY_ALL \|\|
808	URX_TYPE(repeatedOp)((uint32_t)(repeatedOp) >> 24) == URX_DOTANY_UNIX) {
809	// Emit Optimized code for .+ operations.
810	int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
811	if (URX_TYPE(repeatedOp)((uint32_t)(repeatedOp) >> 24) == URX_DOTANY_ALL) {
812	// URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
813	loopOpI \|= 1;
814	}
815	if (fModeFlags & UREGEX_UNIX_LINES) {
816	loopOpI \|= 2;
817	}
818	appendOp(loopOpI);
819	frameLoc = allocateStackData(1);
820	appendOp(URX_LOOP_C, frameLoc);
821	break;
822	}
823
824	}
825
826	// General case.
827
828	// Check for minimum match length of zero, which requires
829	// extra loop-breaking code.
830	if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
831	// Zero length match is possible.
832	// Emit the code sequence that can handle it.
833	insertOp(topLoc);
834	frameLoc = allocateStackData(1);
835
836	int32_t op = buildOp(URX_STO_INP_LOC, frameLoc);
837	fRXPat->fCompiledPat->setElementAt(op, topLoc);
838
839	appendOp(URX_JMP_SAV_X, topLoc+1);
840	} else {
841	// Simpler code when the repeated body must match something non-empty
842	appendOp(URX_JMP_SAV, topLoc);
843	}
844	}
845	break;
846
847	case doNGPlus:
848	// Non-greedy '+?' compiles to
849	// 1. stuff to be repeated (already built)
850	// 2. state-save 1
851	// 3. ...
852	{
853	int32_t topLoc = blockTopLoc(FALSE0);
854	appendOp(URX_STATE_SAVE, topLoc);
855	}
856	break;
857
858
859	case doOpt:
860	// Normal (greedy) ? quantifier.
861	// Compiles to
862	// 1. state save 3
863	// 2. body of optional block
864	// 3. ...
865	// Insert the state save into the compiled pattern, and we're done.
866	{
867	int32_t saveStateLoc = blockTopLoc(TRUE1);
868	int32_t saveStateOp = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
869	fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
870	}
871	break;
872
873	case doNGOpt:
874	// Non-greedy ?? quantifier
875	// compiles to
876	// 1. jmp 4
877	// 2. body of optional block
878	// 3 jmp 5
879	// 4. state save 2
880	// 5 ...
881	// This code is less than ideal, with two jmps instead of one, because we can only
882	// insert one instruction at the top of the block being iterated.
883	{
884	int32_t jmp1_loc = blockTopLoc(TRUE1);
885	int32_t jmp2_loc = fRXPat->fCompiledPat->size();
886
887	int32_t jmp1_op = buildOp(URX_JMP, jmp2_loc+1);
888	fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
889
890	appendOp(URX_JMP, jmp2_loc+2);
891
892	appendOp(URX_STATE_SAVE, jmp1_loc+1);
893	}
894	break;
895
896
897	case doStar:
898	// Normal (greedy) * quantifier.
899	// Compiles to
900	// 1. STATE_SAVE 4
901	// 2. body of stuff being iterated over
902	// 3. JMP_SAV 2
903	// 4. ...
904	//
905	// Or, if the body is a simple [Set],
906	// 1. LOOP_SR_I set number
907	// 2. LOOP_C stack location
908	// ...
909	//
910	// Or if this is a .*
911	// 1. LOOP_DOT_I (. matches all mode flag)
912	// 2. LOOP_C stack location
913	//
914	// Or, if the body can match a zero-length string, to inhibit infinite loops,
915	// 1. STATE_SAVE 5
916	// 2. STO_INP_LOC data-loc
917	// 3. body of stuff
918	// 4. JMP_SAV_X 2
919	// 5. ...
920	{
921	// location of item #1, the STATE_SAVE
922	int32_t topLoc = blockTopLoc(FALSE0);
923	int32_t dataLoc = -1;
924
925	// Check for simple *, where the construct being repeated
926	// compiled to single opcode, and might be optimizable.
927	if (topLoc == fRXPat->fCompiledPat->size() - 1) {
928	int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
929
930	if (URX_TYPE(repeatedOp)((uint32_t)(repeatedOp) >> 24) == URX_SETREF) {
931	// Emit optimized code for a [char set]*
932	int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp)((repeatedOp) & 0xffffff));
933	fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
934	dataLoc = allocateStackData(1);
935	appendOp(URX_LOOP_C, dataLoc);
936	break;
937	}
938
939	if (URX_TYPE(repeatedOp)((uint32_t)(repeatedOp) >> 24) == URX_DOTANY \|\|
940	URX_TYPE(repeatedOp)((uint32_t)(repeatedOp) >> 24) == URX_DOTANY_ALL \|\|
941	URX_TYPE(repeatedOp)((uint32_t)(repeatedOp) >> 24) == URX_DOTANY_UNIX) {
942	// Emit Optimized code for .* operations.
943	int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
944	if (URX_TYPE(repeatedOp)((uint32_t)(repeatedOp) >> 24) == URX_DOTANY_ALL) {
945	// URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
946	loopOpI \|= 1;
947	}
948	if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
949	loopOpI \|= 2;
950	}
951	fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
952	dataLoc = allocateStackData(1);
953	appendOp(URX_LOOP_C, dataLoc);
954	break;
955	}
956	}
957
958	// Emit general case code for this *
959	// The optimizations did not apply.
960
961	int32_t saveStateLoc = blockTopLoc(TRUE1);
962	int32_t jmpOp = buildOp(URX_JMP_SAV, saveStateLoc+1);
963
964	// Check for minimum match length of zero, which requires
965	// extra loop-breaking code.
966	if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
967	insertOp(saveStateLoc);
968	dataLoc = allocateStackData(1);
969
970	int32_t op = buildOp(URX_STO_INP_LOC, dataLoc);
971	fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
972	jmpOp = buildOp(URX_JMP_SAV_X, saveStateLoc+2);
973	}
974
975	// Locate the position in the compiled pattern where the match will continue
976	// after completing the *. (4 or 5 in the comment above)
977	int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
978
979	// Put together the save state op and store it into the compiled code.
980	int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc);
981	fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
982
983	// Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
984	appendOp(jmpOp);
985	}
986	break;
987
988	case doNGStar:
989	// Non-greedy *? quantifier
990	// compiles to
991	// 1. JMP 3
992	// 2. body of stuff being iterated over
993	// 3. STATE_SAVE 2
994	// 4 ...
995	{
996	int32_t jmpLoc = blockTopLoc(TRUE1); // loc 1.
997	int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3.
998	int32_t jmpOp = buildOp(URX_JMP, saveLoc);
999	fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
1000	appendOp(URX_STATE_SAVE, jmpLoc+1);
1001	}
1002	break;
1003
1004
1005	case doIntervalInit:
1006	// The '{' opening an interval quantifier was just scanned.
1007	// Init the counter variables that will accumulate the values as the digits
1008	// are scanned.
1009	fIntervalLow = 0;
1010	fIntervalUpper = -1;
1011	break;
1012
1013	case doIntevalLowerDigit:
1014	// Scanned a digit from the lower value of an {lower,upper} interval
1015	{
1016	int32_t digitValue = u_charDigitValueu_charDigitValue_71(fC.fChar);
1017	U_ASSERT(digitValue >= 0)(void)0;
1018	int64_t val = (int64_t)fIntervalLow*10 + digitValue;
1019	if (val > INT32_MAX(2147483647)) {
1020	error(U_REGEX_NUMBER_TOO_BIG);
1021	} else {
1022	fIntervalLow = (int32_t)val;
1023	}
1024	}
1025	break;
1026
1027	case doIntervalUpperDigit:
1028	// Scanned a digit from the upper value of an {lower,upper} interval
1029	{
1030	if (fIntervalUpper < 0) {
1031	fIntervalUpper = 0;
1032	}
1033	int32_t digitValue = u_charDigitValueu_charDigitValue_71(fC.fChar);
1034	U_ASSERT(digitValue >= 0)(void)0;
1035	int64_t val = (int64_t)fIntervalUpper*10 + digitValue;
1036	if (val > INT32_MAX(2147483647)) {
1037	error(U_REGEX_NUMBER_TOO_BIG);
1038	} else {
1039	fIntervalUpper = (int32_t)val;
1040	}
1041	}
1042	break;
1043
1044	case doIntervalSame:
1045	// Scanned a single value interval like {27}. Upper = Lower.
1046	fIntervalUpper = fIntervalLow;
1047	break;
1048
1049	case doInterval:
1050	// Finished scanning a normal {lower,upper} interval. Generate the code for it.
1051	if (compileInlineInterval() == FALSE0) {
1052	compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1053	}
1054	break;
1055
1056	case doPossessiveInterval:
1057	// Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it.
1058	{
1059	// Remember the loc for the top of the block being looped over.
1060	// (Can not reserve a slot in the compiled pattern at this time, because
1061	// compileInterval needs to reserve also, and blockTopLoc can only reserve
1062	// once per block.)
1063	int32_t topLoc = blockTopLoc(FALSE0);
1064
1065	// Produce normal looping code.
1066	compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1067
1068	// Surround the just-emitted normal looping code with a STO_SP ... LD_SP
1069	// just as if the loop was inclosed in atomic parentheses.
1070
1071	// First the STO_SP before the start of the loop
1072	insertOp(topLoc);
1073
1074	int32_t varLoc = allocateData(1); // Reserve a data location for saving the
1075	int32_t op = buildOp(URX_STO_SP, varLoc);
1076	fRXPat->fCompiledPat->setElementAt(op, topLoc);
1077
1078	int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
1079	U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc)(void)0;
1080	loopOp++; // point LoopOp after the just-inserted STO_SP
1081	fRXPat->fCompiledPat->push(loopOp, *fStatus);
1082
1083	// Then the LD_SP after the end of the loop
1084	appendOp(URX_LD_SP, varLoc);
1085	}
1086
1087	break;
1088
1089	case doNGInterval:
1090	// Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it.
1091	compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
1092	break;
1093
1094	case doIntervalError:
1095	error(U_REGEX_BAD_INTERVAL);
1096	break;
1097
1098	case doLiteralChar:
1099	// We've just scanned a "normal" character from the pattern,
1100	literalChar(fC.fChar);
1101	break;
1102
1103
1104	case doEscapedLiteralChar:
1105	// We've just scanned an backslashed escaped character with no
1106	// special meaning. It represents itself.
1107	if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1108	((fC.fChar >= 0x41 && fC.fChar<= 0x5A) \|\| // in [A-Z]
1109	(fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z]
1110	error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1111	}
1112	literalChar(fC.fChar);
1113	break;
1114
1115
1116	case doDotAny:
1117	// scanned a ".", match any single character.
1118	{
1119	fixLiterals(FALSE0);
1120	if (fModeFlags & UREGEX_DOTALL) {
1121	appendOp(URX_DOTANY_ALL, 0);
1122	} else if (fModeFlags & UREGEX_UNIX_LINES) {
1123	appendOp(URX_DOTANY_UNIX, 0);
1124	} else {
1125	appendOp(URX_DOTANY, 0);
1126	}
1127	}
1128	break;
1129
1130	case doCaret:
1131	{
1132	fixLiterals(FALSE0);
1133	if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1134	appendOp(URX_CARET, 0);
1135	} else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1136	appendOp(URX_CARET_M, 0);
1137	} else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1138	appendOp(URX_CARET, 0); // Only testing true start of input.
1139	} else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1140	appendOp(URX_CARET_M_UNIX, 0);
1141	}
1142	}
1143	break;
1144
1145	case doDollar:
1146	{
1147	fixLiterals(FALSE0);
1148	if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1149	appendOp(URX_DOLLAR, 0);
1150	} else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1151	appendOp(URX_DOLLAR_M, 0);
1152	} else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1153	appendOp(URX_DOLLAR_D, 0);
1154	} else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1155	appendOp(URX_DOLLAR_MD, 0);
1156	}
1157	}
1158	break;
1159
1160	case doBackslashA:
1161	fixLiterals(FALSE0);
1162	appendOp(URX_CARET, 0);
1163	break;
1164
1165	case doBackslashB:
1166	{
1167	#if UCONFIG_NO_BREAK_ITERATION0==1
1168	if (fModeFlags & UREGEX_UWORD) {
1169	error(U_UNSUPPORTED_ERROR);
1170	}
1171	#endif
1172	fixLiterals(FALSE0);
1173	int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1174	appendOp(op, 1);
1175	}
1176	break;
1177
1178	case doBackslashb:
1179	{
1180	#if UCONFIG_NO_BREAK_ITERATION0==1
1181	if (fModeFlags & UREGEX_UWORD) {
1182	error(U_UNSUPPORTED_ERROR);
1183	}
1184	#endif
1185	fixLiterals(FALSE0);
1186	int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1187	appendOp(op, 0);
1188	}
1189	break;
1190
1191	case doBackslashD:
1192	fixLiterals(FALSE0);
1193	appendOp(URX_BACKSLASH_D, 1);
1194	break;
1195
1196	case doBackslashd:
1197	fixLiterals(FALSE0);
1198	appendOp(URX_BACKSLASH_D, 0);
1199	break;
1200
1201	case doBackslashG:
1202	fixLiterals(FALSE0);
1203	appendOp(URX_BACKSLASH_G, 0);
1204	break;
1205
1206	case doBackslashH:
1207	fixLiterals(FALSE0);
1208	appendOp(URX_BACKSLASH_H, 1);
1209	break;
1210
1211	case doBackslashh:
1212	fixLiterals(FALSE0);
1213	appendOp(URX_BACKSLASH_H, 0);
1214	break;
1215
1216	case doBackslashR:
1217	fixLiterals(FALSE0);
1218	appendOp(URX_BACKSLASH_R, 0);
1219	break;
1220
1221	case doBackslashS:
1222	fixLiterals(FALSE0);
1223	appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
1224	break;
1225
1226	case doBackslashs:
1227	fixLiterals(FALSE0);
1228	appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
1229	break;
1230
1231	case doBackslashV:
1232	fixLiterals(FALSE0);
1233	appendOp(URX_BACKSLASH_V, 1);
1234	break;
1235
1236	case doBackslashv:
1237	fixLiterals(FALSE0);
1238	appendOp(URX_BACKSLASH_V, 0);
1239	break;
1240
1241	case doBackslashW:
1242	fixLiterals(FALSE0);
1243	appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
1244	break;
1245
1246	case doBackslashw:
1247	fixLiterals(FALSE0);
1248	appendOp(URX_STATIC_SETREF, URX_ISWORD_SET);
1249	break;
1250
1251	case doBackslashX:
1252	#if UCONFIG_NO_BREAK_ITERATION0==1
1253	// Grapheme Cluster Boundary requires ICU break iteration.
1254	error(U_UNSUPPORTED_ERROR);
1255	#endif
1256	fixLiterals(FALSE0);
1257	appendOp(URX_BACKSLASH_X, 0);
1258	break;
1259
1260	case doBackslashZ:
1261	fixLiterals(FALSE0);
1262	appendOp(URX_DOLLAR, 0);
1263	break;
1264
1265	case doBackslashz:
1266	fixLiterals(FALSE0);
1267	appendOp(URX_BACKSLASH_Z, 0);
1268	break;
1269
1270	case doEscapeError:
1271	error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1272	break;
1273
1274	case doExit:
1275	fixLiterals(FALSE0);
1276	returnVal = FALSE0;
1277	break;
1278
1279	case doProperty:
1280	{
1281	fixLiterals(FALSE0);
1282	UnicodeSet *theSet = scanProp();
1283	compileSet(theSet);
1284	}
1285	break;
1286
1287	case doNamedChar:
1288	{
1289	UChar32 c = scanNamedChar();
1290	literalChar(c);
1291	}
1292	break;
1293
1294
1295	case doBackRef:
1296	// BackReference. Somewhat unusual in that the front-end can not completely parse
1297	// the regular expression, because the number of digits to be consumed
1298	// depends on the number of capture groups that have been defined. So
1299	// we have to do it here instead.
1300	{
1301	int32_t numCaptureGroups = fRXPat->fGroupMap->size();
1302	int32_t groupNum = 0;
1303	UChar32 c = fC.fChar;
1304
1305	for (;;) {
1306	// Loop once per digit, for max allowed number of digits in a back reference.
1307	int32_t digit = u_charDigitValueu_charDigitValue_71(c);
1308	groupNum = groupNum * 10 + digit;
1309	if (groupNum >= numCaptureGroups) {
1310	break;
1311	}
1312	c = peekCharLL();
1313	if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE0) {
1314	break;
1315	}
1316	nextCharLL();
1317	}
1318
1319	// Scan of the back reference in the source regexp is complete. Now generate
1320	// the compiled code for it.
1321	// Because capture groups can be forward-referenced by back-references,
1322	// we fill the operand with the capture group number. At the end
1323	// of compilation, it will be changed to the variable's location.
1324	U_ASSERT(groupNum > 0)(void)0; // Shouldn't happen. '\0' begins an octal escape sequence,
1325	// and shouldn't enter this code path at all.
1326	fixLiterals(FALSE0);
1327	if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1328	appendOp(URX_BACKREF_I, groupNum);
1329	} else {
1330	appendOp(URX_BACKREF, groupNum);
1331	}
1332	}
1333	break;
1334
1335	case doBeginNamedBackRef:
1336	U_ASSERT(fCaptureName == NULL)(void)0;
1337	fCaptureName = new UnicodeString;
1338	if (fCaptureName == NULL__null) {
1339	error(U_MEMORY_ALLOCATION_ERROR);
1340	}
1341	break;
1342
1343	case doContinueNamedBackRef:
1344	fCaptureName->append(fC.fChar);
1345	break;
1346
1347	case doCompleteNamedBackRef:
1348	{
1349	int32_t groupNumber =
1350	fRXPat->fNamedCaptureMap ? uhash_getiuhash_geti_71(fRXPat->fNamedCaptureMap, fCaptureName) : 0;
1351	if (groupNumber == 0) {
1352	// Group name has not been defined.
1353	// Could be a forward reference. If we choose to support them at some
1354	// future time, extra mechanism will be required at this point.
1355	error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
1356	} else {
1357	// Given the number, handle identically to a \n numbered back reference.
1358	// See comments above, under doBackRef
1359	fixLiterals(FALSE0);
1360	if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1361	appendOp(URX_BACKREF_I, groupNumber);
1362	} else {
1363	appendOp(URX_BACKREF, groupNumber);
1364	}
1365	}
1366	delete fCaptureName;
1367	fCaptureName = NULL__null;
1368	break;
1369	}
1370
1371	case doPossessivePlus:
1372	// Possessive ++ quantifier.
1373	// Compiles to
1374	// 1. STO_SP
1375	// 2. body of stuff being iterated over
1376	// 3. STATE_SAVE 5
1377	// 4. JMP 2
1378	// 5. LD_SP
1379	// 6. ...
1380	//
1381	// Note: TODO: This is pretty inefficient. A mass of saved state is built up
1382	// then unconditionally discarded. Perhaps introduce a new opcode. Ticket 6056
1383	//
1384	{
1385	// Emit the STO_SP
1386	int32_t topLoc = blockTopLoc(TRUE1);
1387	int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr.
1388	int32_t op = buildOp(URX_STO_SP, stoLoc);
1389	fRXPat->fCompiledPat->setElementAt(op, topLoc);
1390
1391	// Emit the STATE_SAVE
1392	appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
1393
1394	// Emit the JMP
1395	appendOp(URX_JMP, topLoc+1);
1396
1397	// Emit the LD_SP
1398	appendOp(URX_LD_SP, stoLoc);
1399	}
1400	break;
1401
1402	case doPossessiveStar:
1403	// Possessive *+ quantifier.
1404	// Compiles to
1405	// 1. STO_SP loc
1406	// 2. STATE_SAVE 5
1407	// 3. body of stuff being iterated over
1408	// 4. JMP 2
1409	// 5. LD_SP loc
1410	// 6 ...
1411	// TODO: do something to cut back the state stack each time through the loop.
1412	{
1413	// Reserve two slots at the top of the block.
1414	int32_t topLoc = blockTopLoc(TRUE1);
1415	insertOp(topLoc);
1416
1417	// emit STO_SP loc
1418	int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr.
1419	int32_t op = buildOp(URX_STO_SP, stoLoc);
1420	fRXPat->fCompiledPat->setElementAt(op, topLoc);
1421
1422	// Emit the SAVE_STATE 5
1423	int32_t L7 = fRXPat->fCompiledPat->size()+1;
1424	op = buildOp(URX_STATE_SAVE, L7);
1425	fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1426
1427	// Append the JMP operation.
1428	appendOp(URX_JMP, topLoc+1);
1429
1430	// Emit the LD_SP loc
1431	appendOp(URX_LD_SP, stoLoc);
1432	}
1433	break;
1434
1435	case doPossessiveOpt:
1436	// Possessive ?+ quantifier.
1437	// Compiles to
1438	// 1. STO_SP loc
1439	// 2. SAVE_STATE 5
1440	// 3. body of optional block
1441	// 4. LD_SP loc
1442	// 5. ...
1443	//
1444	{
1445	// Reserve two slots at the top of the block.
1446	int32_t topLoc = blockTopLoc(TRUE1);
1447	insertOp(topLoc);
1448
1449	// Emit the STO_SP
1450	int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr.
1451	int32_t op = buildOp(URX_STO_SP, stoLoc);
1452	fRXPat->fCompiledPat->setElementAt(op, topLoc);
1453
1454	// Emit the SAVE_STATE
1455	int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
1456	op = buildOp(URX_STATE_SAVE, continueLoc);
1457	fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1458
1459	// Emit the LD_SP
1460	appendOp(URX_LD_SP, stoLoc);
1461	}
1462	break;
1463
1464
1465	case doBeginMatchMode:
1466	fNewModeFlags = fModeFlags;
1467	fSetModeFlag = TRUE1;
1468	break;
1469
1470	case doMatchMode: // (?i) and similar
1471	{
1472	int32_t bit = 0;
1473	switch (fC.fChar) {
1474	case 0x69: /* 'i' */ bit = UREGEX_CASE_INSENSITIVE; break;
1475	case 0x64: /* 'd' */ bit = UREGEX_UNIX_LINES; break;
1476	case 0x6d: /* 'm' */ bit = UREGEX_MULTILINE; break;
1477	case 0x73: /* 's' */ bit = UREGEX_DOTALL; break;
1478	case 0x75: /* 'u' / bit = 0; / Unicode casing */ break;
1479	case 0x77: /* 'w' */ bit = UREGEX_UWORD; break;
1480	case 0x78: /* 'x' */ bit = UREGEX_COMMENTS; break;
1481	case 0x2d: /* '-' */ fSetModeFlag = FALSE0; break;
1482	default:
1483	UPRV_UNREACHABLE_EXITabort(); // Should never happen. Other chars are filtered out
1484	// by the scanner.
1485	}
1486	if (fSetModeFlag) {
1487	fNewModeFlags \|= bit;
1488	} else {
1489	fNewModeFlags &= ~bit;
1490	}
1491	}
1492	break;
1493
1494	case doSetMatchMode:
1495	// Emit code to match any pending literals, using the not-yet changed match mode.
1496	fixLiterals();
1497
1498	// We've got a (?i) or similar. The match mode is being changed, but
1499	// the change is not scoped to a parenthesized block.
1500	U_ASSERT(fNewModeFlags < 0)(void)0;
1501	fModeFlags = fNewModeFlags;
1502
1503	break;
1504
1505
1506	case doMatchModeParen:
1507	// We've got a (?i: or similar. Begin a parenthesized block, save old
1508	// mode flags so they can be restored at the close of the block.
1509	//
1510	// Compile to a
1511	// - NOP, which later may be replaced by a save-state if the
1512	// parenthesized group gets a * quantifier, followed by
1513	// - NOP, which may later be replaced by a save-state if there
1514	// is an '\|' alternation within the parens.
1515	{
1516	fixLiterals(FALSE0);
1517	appendOp(URX_NOP, 0);
1518	appendOp(URX_NOP, 0);
1519
1520	// On the Parentheses stack, start a new frame and add the positions
1521	// of the two NOPs (a normal non-capturing () frame, except for the
1522	// saving of the original mode flags.)
1523	fParenStack.push(fModeFlags, *fStatus);
1524	fParenStack.push(flags, *fStatus); // Frame Marker
1525	fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP
1526	fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
1527
1528	// Set the current mode flags to the new values.
1529	U_ASSERT(fNewModeFlags < 0)(void)0;
1530	fModeFlags = fNewModeFlags;
1531	}
1532	break;
1533
1534	case doBadModeFlag:
1535	error(U_REGEX_INVALID_FLAG);
1536	break;
1537
1538	case doSuppressComments:
1539	// We have just scanned a '(?'. We now need to prevent the character scanner from
1540	// treating a '#' as a to-the-end-of-line comment.
1541	// (This Perl compatibility just gets uglier and uglier to do...)
1542	fEOLComments = FALSE0;
1543	break;
1544
1545
1546	case doSetAddAmp:
1547	{
1548	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1549	set->add(chAmp);
1550	}
1551	break;
1552
1553	case doSetAddDash:
1554	{
1555	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1556	set->add(chDash);
1557	}
1558	break;
1559
1560	case doSetBackslash_s:
1561	{
1562	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1563	set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1564	break;
1565	}
1566
1567	case doSetBackslash_S:
1568	{
1569	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1570	UnicodeSet SSet;
1571	SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]).complement();
1572	set->addAll(SSet);
1573	break;
1574	}
1575
1576	case doSetBackslash_d:
1577	{
1578	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1579	// TODO - make a static set, ticket 6058.
1580	addCategory(set, U_GC_ND_MASK((uint32_t)1<<(U_DECIMAL_DIGIT_NUMBER)), *fStatus);
1581	break;
1582	}
1583
1584	case doSetBackslash_D:
1585	{
1586	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1587	UnicodeSet digits;
1588	// TODO - make a static set, ticket 6058.
1589	digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK((uint32_t)1<<(U_DECIMAL_DIGIT_NUMBER)), *fStatus);
1590	digits.complement();
1591	set->addAll(digits);
1592	break;
1593	}
1594
1595	case doSetBackslash_h:
1596	{
1597	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1598	UnicodeSet h;
1599	h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK((uint32_t)1<<(U_SPACE_SEPARATOR)), *fStatus);
1600	h.add((UChar32)9); // Tab
1601	set->addAll(h);
1602	break;
1603	}
1604
1605	case doSetBackslash_H:
1606	{
1607	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1608	UnicodeSet h;
1609	h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK((uint32_t)1<<(U_SPACE_SEPARATOR)), *fStatus);
1610	h.add((UChar32)9); // Tab
1611	h.complement();
1612	set->addAll(h);
1613	break;
1614	}
1615
1616	case doSetBackslash_v:
1617	{
1618	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1619	set->add((UChar32)0x0a, (UChar32)0x0d); // add range
1620	set->add((UChar32)0x85);
1621	set->add((UChar32)0x2028, (UChar32)0x2029);
1622	break;
1623	}
1624
1625	case doSetBackslash_V:
1626	{
1627	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1628	UnicodeSet v;
1629	v.add((UChar32)0x0a, (UChar32)0x0d); // add range
1630	v.add((UChar32)0x85);
1631	v.add((UChar32)0x2028, (UChar32)0x2029);
1632	v.complement();
1633	set->addAll(v);
1634	break;
1635	}
1636
1637	case doSetBackslash_w:
1638	{
1639	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1640	set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1641	break;
1642	}
1643
1644	case doSetBackslash_W:
1645	{
1646	UnicodeSet set = (UnicodeSet )fSetStack.peek();
1647	UnicodeSet SSet;
1648	SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]).complement();
1649	set->addAll(SSet);
1650	break;
1651	}
1652
1653	case doSetBegin:
1654	{
1655	fixLiterals(FALSE0);
1656	LocalPointer<UnicodeSet> lpSet(new UnicodeSet(), *fStatus);
1657	fSetStack.push(lpSet.orphan(), *fStatus);
1658	fSetOpStack.push(setStart, *fStatus);
1659	if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1660	fSetOpStack.push(setCaseClose, *fStatus);
1661	}
1662	break;
1663	}
1664
1665	case doSetBeginDifference1:
1666	// We have scanned something like [[abc]-[
1667	// Set up a new UnicodeSet for the set beginning with the just-scanned '['
1668	// Push a Difference operator, which will cause the new set to be subtracted from what
1669	// went before once it is created.
1670	setPushOp(setDifference1);
1671	fSetOpStack.push(setStart, *fStatus);
1672	if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1673	fSetOpStack.push(setCaseClose, *fStatus);
1674	}
1675	break;
1676
1677	case doSetBeginIntersection1:
1678	// We have scanned something like [[abc]&[
1679	// Need both the '&' operator and the open '[' operator.
1680	setPushOp(setIntersection1);
1681	fSetOpStack.push(setStart, *fStatus);
1682	if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1683	fSetOpStack.push(setCaseClose, *fStatus);
1684	}
1685	break;
1686
1687	case doSetBeginUnion:
1688	// We have scanned something like [[abc][
1689	// Need to handle the union operation explicitly [[abc] \| [
1690	setPushOp(setUnion);
1691	fSetOpStack.push(setStart, *fStatus);
1692	if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1693	fSetOpStack.push(setCaseClose, *fStatus);
1694	}
1695	break;
1696
1697	case doSetDifference2:
1698	// We have scanned something like [abc--
1699	// Consider this to unambiguously be a set difference operator.
1700	setPushOp(setDifference2);
1701	break;
1702
1703	case doSetEnd:
1704	// Have encountered the ']' that closes a set.
1705	// Force the evaluation of any pending operations within this set,
1706	// leave the completed set on the top of the set stack.
1707	setEval(setEnd);
1708	U_ASSERT(fSetOpStack.peeki()==setStart)(void)0;
1709	fSetOpStack.popi();
1710	break;
1711
1712	case doSetFinish:
1713	{
1714	// Finished a complete set expression, including all nested sets.
1715	// The close bracket has already triggered clearing out pending set operators,
1716	// the operator stack should be empty and the operand stack should have just
1717	// one entry, the result set.
1718	U_ASSERT(fSetOpStack.empty())(void)0;
1719	UnicodeSet theSet = (UnicodeSet )fSetStack.pop();
1720	U_ASSERT(fSetStack.empty())(void)0;
1721	compileSet(theSet);
1722	break;
1723	}
1724
1725	case doSetIntersection2:
1726	// Have scanned something like [abc&&
1727	setPushOp(setIntersection2);
1728	break;
1729
1730	case doSetLiteral:
1731	// Union the just-scanned literal character into the set being built.
1732	// This operation is the highest precedence set operation, so we can always do
1733	// it immediately, without waiting to see what follows. It is necessary to perform
1734	// any pending '-' or '&' operation first, because these have the same precedence
1735	// as union-ing in a literal'
1736	{
1737	setEval(setUnion);
1738	UnicodeSet s = (UnicodeSet )fSetStack.peek();
1739	s->add(fC.fChar);
1740	fLastSetLiteral = fC.fChar;
1741	break;
1742	}
1743
1744	case doSetLiteralEscaped:
1745	// A back-slash escaped literal character was encountered.
1746	// Processing is the same as with setLiteral, above, with the addition of
1747	// the optional check for errors on escaped ASCII letters.
1748	{
1749	if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1750	((fC.fChar >= 0x41 && fC.fChar<= 0x5A) \|\| // in [A-Z]
1751	(fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z]
1752	error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1753	}
1754	setEval(setUnion);
1755	UnicodeSet s = (UnicodeSet )fSetStack.peek();
1756	s->add(fC.fChar);
1757	fLastSetLiteral = fC.fChar;
1758	break;
1759	}
1760
1761	case doSetNamedChar:
1762	// Scanning a \N{UNICODE CHARACTER NAME}
1763	// Aside from the source of the character, the processing is identical to doSetLiteral,
1764	// above.
1765	{
1766	UChar32 c = scanNamedChar();
1767	setEval(setUnion);
1768	UnicodeSet s = (UnicodeSet )fSetStack.peek();
1769	s->add(c);
1770	fLastSetLiteral = c;
1771	break;
1772	}
1773
1774	case doSetNamedRange:
1775	// We have scanned literal-\N{CHAR NAME}. Add the range to the set.
1776	// The left character is already in the set, and is saved in fLastSetLiteral.
1777	// The right side needs to be picked up, the scan is at the 'N'.
1778	// Lower Limit > Upper limit being an error matches both Java
1779	// and ICU UnicodeSet behavior.
1780	{
1781	UChar32 c = scanNamedChar();
1782	if (U_SUCCESS(*fStatus) && (fLastSetLiteral == U_SENTINEL(-1) \|\| fLastSetLiteral > c)) {
1783	error(U_REGEX_INVALID_RANGE);
1784	}
1785	UnicodeSet s = (UnicodeSet )fSetStack.peek();
1786	s->add(fLastSetLiteral, c);
1787	fLastSetLiteral = c;
1788	break;
1789	}
1790
1791
1792	case doSetNegate:
1793	// Scanned a '^' at the start of a set.
1794	// Push the negation operator onto the set op stack.
1795	// A twist for case-insensitive matching:
1796	// the case closure operation must happen _before_ negation.
1797	// But the case closure operation will already be on the stack if it's required.
1798	// This requires checking for case closure, and swapping the stack order
1799	// if it is present.
1800	{
1801	int32_t tosOp = fSetOpStack.peeki();
1802	if (tosOp == setCaseClose) {
1803	fSetOpStack.popi();
1804	fSetOpStack.push(setNegation, *fStatus);
1805	fSetOpStack.push(setCaseClose, *fStatus);
1806	} else {
1807	fSetOpStack.push(setNegation, *fStatus);
1808	}
1809	}
1810	break;
1811
1812	case doSetNoCloseError:
1813	error(U_REGEX_MISSING_CLOSE_BRACKET);
1814	break;
1815
1816	case doSetOpError:
1817	error(U_REGEX_RULE_SYNTAX); // -- or && at the end of a set. Illegal.
1818	break;
1819
1820	case doSetPosixProp:
1821	{
1822	UnicodeSet *s = scanPosixProp();
1823	if (s != NULL__null) {
1824	UnicodeSet tos = (UnicodeSet )fSetStack.peek();
1825	tos->addAll(*s);
1826	delete s;
1827	} // else error. scanProp() reported the error status already.
1828	}
1829	break;
1830
1831	case doSetProp:
1832	// Scanned a \p \P within [brackets].
1833	{
1834	UnicodeSet *s = scanProp();
1835	if (s != NULL__null) {
1836	UnicodeSet tos = (UnicodeSet )fSetStack.peek();
1837	tos->addAll(*s);
1838	delete s;
1839	} // else error. scanProp() reported the error status already.
1840	}
1841	break;
1842
1843
1844	case doSetRange:
1845	// We have scanned literal-literal. Add the range to the set.
1846	// The left character is already in the set, and is saved in fLastSetLiteral.
1847	// The right side is the current character.
1848	// Lower Limit > Upper limit being an error matches both Java
1849	// and ICU UnicodeSet behavior.
1850	{
1851
1852	if (fLastSetLiteral == U_SENTINEL(-1) \|\| fLastSetLiteral > fC.fChar) {
1853	error(U_REGEX_INVALID_RANGE);
1854	}
1855	UnicodeSet s = (UnicodeSet )fSetStack.peek();
1856	s->add(fLastSetLiteral, fC.fChar);
1857	break;
1858	}
1859
1860	default:
1861	UPRV_UNREACHABLE_EXITabort();
1862	}
1863
1864	if (U_FAILURE(*fStatus)) {
1865	returnVal = FALSE0;
1866	}
1867
1868	return returnVal;
1869	}
1870
1871
1872
1873	//------------------------------------------------------------------------------
1874	//
1875	// literalChar We've encountered a literal character from the pattern,
1876	// or an escape sequence that reduces to a character.
1877	// Add it to the string containing all literal chars/strings from
1878	// the pattern.
1879	//
1880	//------------------------------------------------------------------------------
1881	void RegexCompile::literalChar(UChar32 c) {
1882	fLiteralChars.append(c);
1883	}
1884
1885
1886	//------------------------------------------------------------------------------
1887	//
1888	// fixLiterals When compiling something that can follow a literal
1889	// string in a pattern, emit the code to match the
1890	// accumulated literal string.
1891	//
1892	// Optionally, split the last char of the string off into
1893	// a single "ONE_CHAR" operation, so that quantifiers can
1894	// apply to that char alone. Example: abc*
1895	// The * must apply to the 'c' only.
1896	//
1897	//------------------------------------------------------------------------------
1898	void RegexCompile::fixLiterals(UBool split) {
1899
1900	// If no literal characters have been scanned but not yet had code generated
1901	// for them, nothing needs to be done.
1902	if (fLiteralChars.length() == 0) {
1903	return;
1904	}
1905
1906	int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1907	UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1908
1909	// Split: We need to ensure that the last item in the compiled pattern
1910	// refers only to the last literal scanned in the pattern, so that
1911	// quantifiers (*, +, etc.) affect only it, and not a longer string.
1912	// Split before case folding for case insensitive matches.
1913
1914	if (split) {
1915	fLiteralChars.truncate(indexOfLastCodePoint);
1916	fixLiterals(FALSE0); // Recursive call, emit code to match the first part of the string.
1917	// Note that the truncated literal string may be empty, in which case
1918	// nothing will be emitted.
1919
1920	literalChar(lastCodePoint); // Re-add the last code point as if it were a new literal.
1921	fixLiterals(FALSE0); // Second recursive call, code for the final code point.
1922	return;
1923	}
1924
1925	// If we are doing case-insensitive matching, case fold the string. This may expand
1926	// the string, e.g. the German sharp-s turns into "ss"
1927	if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1928	fLiteralChars.foldCase();
1929	indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1930	lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1931	}
1932
1933	if (indexOfLastCodePoint == 0) {
1934	// Single character, emit a URX_ONECHAR op to match it.
1935	if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
1936	u_hasBinaryPropertyu_hasBinaryProperty_71(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
1937	appendOp(URX_ONECHAR_I, lastCodePoint);
1938	} else {
1939	appendOp(URX_ONECHAR, lastCodePoint);
1940	}
1941	} else {
1942	// Two or more chars, emit a URX_STRING to match them.
1943	if (fLiteralChars.length() > 0x00ffffff \|\| fRXPat->fLiteralText.length() > 0x00ffffff) {
1944	error(U_REGEX_PATTERN_TOO_BIG);
1945	}
1946	if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1947	appendOp(URX_STRING_I, fRXPat->fLiteralText.length());
1948	} else {
1949	// TODO here: add optimization to split case sensitive strings of length two
1950	// into two single char ops, for efficiency.
1951	appendOp(URX_STRING, fRXPat->fLiteralText.length());
1952	}
1953	appendOp(URX_STRING_LEN, fLiteralChars.length());
1954
1955	// Add this string into the accumulated strings of the compiled pattern.
1956	fRXPat->fLiteralText.append(fLiteralChars);
1957	}
1958
1959	fLiteralChars.remove();
1960	}
1961
1962
1963	int32_t RegexCompile::buildOp(int32_t type, int32_t val) {
1964	if (U_FAILURE(*fStatus)) {
1965	return 0;
1966	}
1967	if (type < 0 \|\| type > 255) {
1968	UPRV_UNREACHABLE_EXITabort();
1969	}
1970	if (val > 0x00ffffff) {
1971	UPRV_UNREACHABLE_EXITabort();
1972	}
1973	if (val < 0) {
1974	if (!(type == URX_RESERVED_OP_N \|\| type == URX_RESERVED_OP)) {
1975	UPRV_UNREACHABLE_EXITabort();
1976	}
1977	if (URX_TYPE(val)((uint32_t)(val) >> 24) != 0xff) {
1978	UPRV_UNREACHABLE_EXITabort();
1979	}
1980	type = URX_RESERVED_OP_N;
1981	}
1982	return (type << 24) \| val;
1983	}
1984
1985
1986	//------------------------------------------------------------------------------
1987	//
1988	// appendOp() Append a new instruction onto the compiled pattern
1989	// Includes error checking, limiting the size of the
1990	// pattern to lengths that can be represented in the
1991	// 24 bit operand field of an instruction.
1992	//
1993	//------------------------------------------------------------------------------
1994	void RegexCompile::appendOp(int32_t op) {
1995	if (U_FAILURE(*fStatus)) {
1996	return;
1997	}
1998	fRXPat->fCompiledPat->addElement(op, *fStatus);
1999	if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) {
2000	error(U_REGEX_PATTERN_TOO_BIG);
2001	}
2002	}
2003
2004	void RegexCompile::appendOp(int32_t type, int32_t val) {
2005	appendOp(buildOp(type, val));
2006	}
2007
2008
2009	//------------------------------------------------------------------------------
2010	//
2011	// insertOp() Insert a slot for a new opcode into the already
2012	// compiled pattern code.
2013	//
2014	// Fill the slot with a NOP. Our caller will replace it
2015	// with what they really wanted.
2016	//
2017	//------------------------------------------------------------------------------
2018	void RegexCompile::insertOp(int32_t where) {
2019	UVector64 *code = fRXPat->fCompiledPat;
2020	U_ASSERT(where>0 && where < code->size())(void)0;
2021
2022	int32_t nop = buildOp(URX_NOP, 0);
2023	code->insertElementAt(nop, where, *fStatus);
2024
2025	// Walk through the pattern, looking for any ops with targets that
2026	// were moved down by the insert. Fix them.
2027	int32_t loc;
2028	for (loc=0; loc<code->size(); loc++) {
2029	int32_t op = (int32_t)code->elementAti(loc);
2030	int32_t opType = URX_TYPE(op)((uint32_t)(op) >> 24);
2031	int32_t opValue = URX_VAL(op)((op) & 0xffffff);
2032	if ((opType == URX_JMP \|\|
2033	opType == URX_JMPX \|\|
2034	opType == URX_STATE_SAVE \|\|
2035	opType == URX_CTR_LOOP \|\|
2036	opType == URX_CTR_LOOP_NG \|\|
2037	opType == URX_JMP_SAV \|\|
2038	opType == URX_JMP_SAV_X \|\|
2039	opType == URX_RELOC_OPRND) && opValue > where) {
2040	// Target location for this opcode is after the insertion point and
2041	// needs to be incremented to adjust for the insertion.
2042	opValue++;
2043	op = buildOp(opType, opValue);
2044	code->setElementAt(op, loc);
2045	}
2046	}
2047
2048	// Now fix up the parentheses stack. All positive values in it are locations in
2049	// the compiled pattern. (Negative values are frame boundaries, and don't need fixing.)
2050	for (loc=0; loc<fParenStack.size(); loc++) {
2051	int32_t x = fParenStack.elementAti(loc);
2052	U_ASSERT(x < code->size())(void)0;
2053	if (x>where) {
2054	x++;
2055	fParenStack.setElementAt(x, loc);
2056	}
2057	}
2058
2059	if (fMatchCloseParen > where) {
2060	fMatchCloseParen++;
2061	}
2062	if (fMatchOpenParen > where) {
2063	fMatchOpenParen++;
2064	}
2065	}
2066
2067
2068	//------------------------------------------------------------------------------
2069	//
2070	// allocateData() Allocate storage in the matcher's static data area.
2071	// Return the index for the newly allocated data.
2072	// The storage won't actually exist until we are running a match
2073	// operation, but the storage indexes are inserted into various
2074	// opcodes while compiling the pattern.
2075	//
2076	//------------------------------------------------------------------------------
2077	int32_t RegexCompile::allocateData(int32_t size) {
2078	if (U_FAILURE(*fStatus)) {
2079	return 0;
2080	}
2081	if (size <= 0 \|\| size > 0x100 \|\| fRXPat->fDataSize < 0) {
2082	error(U_REGEX_INTERNAL_ERROR);
2083	return 0;
2084	}
2085	int32_t dataIndex = fRXPat->fDataSize;
2086	fRXPat->fDataSize += size;
2087	if (fRXPat->fDataSize >= 0x00fffff0) {
2088	error(U_REGEX_INTERNAL_ERROR);
2089	}
2090	return dataIndex;
2091	}
2092
2093
2094	//------------------------------------------------------------------------------
2095	//
2096	// allocateStackData() Allocate space in the back-tracking stack frame.
2097	// Return the index for the newly allocated data.
2098	// The frame indexes are inserted into various
2099	// opcodes while compiling the pattern, meaning that frame
2100	// size must be restricted to the size that will fit
2101	// as an operand (24 bits).
2102	//
2103	//------------------------------------------------------------------------------
2104	int32_t RegexCompile::allocateStackData(int32_t size) {
2105	if (U_FAILURE(*fStatus)) {
2106	return 0;
2107	}
2108	if (size <= 0 \|\| size > 0x100 \|\| fRXPat->fFrameSize < 0) {
2109	error(U_REGEX_INTERNAL_ERROR);
2110	return 0;
2111	}
2112	int32_t dataIndex = fRXPat->fFrameSize;
2113	fRXPat->fFrameSize += size;
2114	if (fRXPat->fFrameSize >= 0x00fffff0) {
2115	error(U_REGEX_PATTERN_TOO_BIG);
2116	}
2117	return dataIndex;
2118	}
2119
2120
2121	//------------------------------------------------------------------------------
2122	//
2123	// blockTopLoc() Find or create a location in the compiled pattern
2124	// at the start of the operation or block that has
2125	// just been compiled. Needed when a quantifier (* or
2126	// whatever) appears, and we need to add an operation
2127	// at the start of the thing being quantified.
2128	//
2129	// (Parenthesized Blocks) have a slot with a NOP that
2130	// is reserved for this purpose. .* or similar don't
2131	// and a slot needs to be added.
2132	//
2133	// parameter reserveLoc : TRUE - ensure that there is space to add an opcode
2134	// at the returned location.
2135	// FALSE - just return the address,
2136	// do not reserve a location there.
2137	//
2138	//------------------------------------------------------------------------------
2139	int32_t RegexCompile::blockTopLoc(UBool reserveLoc) {
2140	int32_t theLoc;
2141	fixLiterals(TRUE1); // Emit code for any pending literals.
2142	// If last item was a string, emit separate op for the its last char.
2143	if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
2144	{
2145	// The item just processed is a parenthesized block.
2146	theLoc = fMatchOpenParen; // A slot is already reserved for us.
2147	U_ASSERT(theLoc > 0)(void)0;
2148	U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP)(void)0;
2149	}
2150	else {
2151	// Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
2152	// No slot for STATE_SAVE was pre-reserved in the compiled code.
2153	// We need to make space now.
2154	theLoc = fRXPat->fCompiledPat->size()-1;
2155	int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc);
2156	if (URX_TYPE(opAtTheLoc)((uint32_t)(opAtTheLoc) >> 24) == URX_STRING_LEN) {
2157	// Strings take two opcode, we want the position of the first one.
2158	// We can have a string at this point if a single character case-folded to two.
2159	theLoc--;
2160	}
2161	if (reserveLoc) {
2162	int32_t nop = buildOp(URX_NOP, 0);
2163	fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
2164	}
2165	}
2166	return theLoc;
2167	}
2168
2169
2170
2171	//------------------------------------------------------------------------------
2172	//
2173	// handleCloseParen When compiling a close paren, we need to go back
2174	// and fix up any JMP or SAVE operations within the
2175	// parenthesized block that need to target the end
2176	// of the block. The locations of these are kept on
2177	// the paretheses stack.
2178	//
2179	// This function is called both when encountering a
2180	// real ) and at the end of the pattern.
2181	//
2182	//------------------------------------------------------------------------------
2183	void RegexCompile::handleCloseParen() {
2184	int32_t patIdx;
2185	int32_t patOp;
2186	if (fParenStack.size() <= 0) {
2187	error(U_REGEX_MISMATCHED_PAREN);
2188	return;
2189	}
2190
2191	// Emit code for any pending literals.
2192	fixLiterals(FALSE0);
2193
2194	// Fixup any operations within the just-closed parenthesized group
2195	// that need to reference the end of the (block).
2196	// (The first one popped from the stack is an unused slot for
2197	// alternation (OR) state save, but applying the fixup to it does no harm.)
2198	for (;;) {
2199	patIdx = fParenStack.popi();
2200	if (patIdx < 0) {
2201	// value < 0 flags the start of the frame on the paren stack.
2202	break;
2203	}
2204	U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size())(void)0;
2205	patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx);
2206	U_ASSERT(URX_VAL(patOp) == 0)(void)0; // Branch target for JMP should not be set.
2207	patOp \|= fRXPat->fCompiledPat->size(); // Set it now.
2208	fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
2209	fMatchOpenParen = patIdx;
2210	}
2211
2212	// At the close of any parenthesized block, restore the match mode flags to
2213	// the value they had at the open paren. Saved value is
2214	// at the top of the paren stack.
2215	fModeFlags = fParenStack.popi();
2216	U_ASSERT(fModeFlags < 0)(void)0;
2217
2218	// DO any additional fixups, depending on the specific kind of
2219	// parentesized grouping this is
2220
2221	switch (patIdx) {
2222	case plain:
2223	case flags:
2224	// No additional fixups required.
2225	// (Grouping-only parentheses)
2226	break;
2227	case capturing:
2228	// Capturing Parentheses.
2229	// Insert a End Capture op into the pattern.
2230	// The frame offset of the variables for this cg is obtained from the
2231	// start capture op and put it into the end-capture op.
2232	{
2233	int32_t captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2234	U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE)(void)0;
2235
2236	int32_t frameVarLocation = URX_VAL(captureOp)((captureOp) & 0xffffff);
2237	appendOp(URX_END_CAPTURE, frameVarLocation);
2238	}
2239	break;
2240	case atomic:
2241	// Atomic Parenthesis.
2242	// Insert a LD_SP operation to restore the state stack to the position
2243	// it was when the atomic parens were entered.
2244	{
2245	int32_t stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2246	U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP)(void)0;
2247	int32_t stoLoc = URX_VAL(stoOp)((stoOp) & 0xffffff);
2248	appendOp(URX_LD_SP, stoLoc);
2249	}
2250	break;
2251
2252	case lookAhead:
2253	{
2254	int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2255	U_ASSERT(URX_TYPE(startOp) == URX_LA_START)(void)0;
2256	int32_t dataLoc = URX_VAL(startOp)((startOp) & 0xffffff);
2257	appendOp(URX_LA_END, dataLoc);
2258	}
2259	break;
2260
2261	case negLookAhead:
2262	{
2263	// See comment at doOpenLookAheadNeg
2264	int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
2265	U_ASSERT(URX_TYPE(startOp) == URX_LA_START)(void)0;
2266	int32_t dataLoc = URX_VAL(startOp)((startOp) & 0xffffff);
2267	appendOp(URX_LA_END, dataLoc);
2268	appendOp(URX_BACKTRACK, 0);
2269	appendOp(URX_LA_END, dataLoc);
2270
2271	// Patch the URX_SAVE near the top of the block.
2272	// The destination of the SAVE is the final LA_END that was just added.
2273	int32_t saveOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
2274	U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE)(void)0;
2275	int32_t dest = fRXPat->fCompiledPat->size()-1;
2276	saveOp = buildOp(URX_STATE_SAVE, dest);
2277	fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
2278	}
2279	break;
2280
2281	case lookBehind:
2282	{
2283	// See comment at doOpenLookBehind.
2284
2285	// Append the URX_LB_END and URX_LA_END to the compiled pattern.
2286	int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4);
2287	U_ASSERT(URX_TYPE(startOp) == URX_LB_START)(void)0;
2288	int32_t dataLoc = URX_VAL(startOp)((startOp) & 0xffffff);
2289	appendOp(URX_LB_END, dataLoc);
2290	appendOp(URX_LA_END, dataLoc);
2291
2292	// Determine the min and max bounds for the length of the
2293	// string that the pattern can match.
2294	// An unbounded upper limit is an error.
2295	int32_t patEnd = fRXPat->fCompiledPat->size() - 1;
2296	int32_t minML = minMatchLength(fMatchOpenParen, patEnd);
2297	int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd);
2298	if (URX_TYPE(maxML)((uint32_t)(maxML) >> 24) != 0) {
2299	error(U_REGEX_LOOK_BEHIND_LIMIT);
2300	break;
2301	}
2302	if (maxML == INT32_MAX(2147483647)) {
2303	error(U_REGEX_LOOK_BEHIND_LIMIT);
2304	break;
2305	}
2306	if (minML == INT32_MAX(2147483647)) {
2307	// This condition happens when no match is possible, such as with a
2308	// [set] expression containing no elements.
2309	// In principle, the generated code to evaluate the expression could be deleted,
2310	// but it's probably not worth the complication.
2311	minML = 0;
2312	}
2313	U_ASSERT(minML <= maxML)(void)0;
2314
2315	// Insert the min and max match len bounds into the URX_LB_CONT op that
2316	// appears at the top of the look-behind block, at location fMatchOpenParen+1
2317	fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-2);
2318	fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-1);
2319
2320	}
2321	break;
2322
2323
2324
2325	case lookBehindN:
2326	{
2327	// See comment at doOpenLookBehindNeg.
2328
2329	// Append the URX_LBN_END to the compiled pattern.
2330	int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2331	U_ASSERT(URX_TYPE(startOp) == URX_LB_START)(void)0;
2332	int32_t dataLoc = URX_VAL(startOp)((startOp) & 0xffffff);
2333	appendOp(URX_LBN_END, dataLoc);
2334
2335	// Determine the min and max bounds for the length of the
2336	// string that the pattern can match.
2337	// An unbounded upper limit is an error.
2338	int32_t patEnd = fRXPat->fCompiledPat->size() - 1;
2339	int32_t minML = minMatchLength(fMatchOpenParen, patEnd);
2340	int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd);
2341	if (URX_TYPE(maxML)((uint32_t)(maxML) >> 24) != 0) {
2342	error(U_REGEX_LOOK_BEHIND_LIMIT);
2343	break;
2344	}
2345	if (maxML == INT32_MAX(2147483647)) {
2346	error(U_REGEX_LOOK_BEHIND_LIMIT);
2347	break;
2348	}
2349	if (minML == INT32_MAX(2147483647)) {
2350	// This condition happens when no match is possible, such as with a
2351	// [set] expression containing no elements.
2352	// In principle, the generated code to evaluate the expression could be deleted,
2353	// but it's probably not worth the complication.
2354	minML = 0;
2355	}
2356
2357	U_ASSERT(minML <= maxML)(void)0;
2358
2359	// Insert the min and max match len bounds into the URX_LB_CONT op that
2360	// appears at the top of the look-behind block, at location fMatchOpenParen+1
2361	fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-3);
2362	fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-2);
2363
2364	// Insert the pattern location to continue at after a successful match
2365	// as the last operand of the URX_LBN_CONT
2366	int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
2367	fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-1);
2368	}
2369	break;
2370
2371
2372
2373	default:
2374	UPRV_UNREACHABLE_EXITabort();
2375	}
2376
2377	// remember the next location in the compiled pattern.
2378	// The compilation of Quantifiers will look at this to see whether its looping
2379	// over a parenthesized block or a single item
2380	fMatchCloseParen = fRXPat->fCompiledPat->size();
2381	}
2382
2383
2384
2385	//------------------------------------------------------------------------------
2386	//
2387	// compileSet Compile the pattern operations for a reference to a
2388	// UnicodeSet.
2389	//
2390	//------------------------------------------------------------------------------
2391	void RegexCompile::compileSet(UnicodeSet *theSet)
2392	{
2393	if (theSet == NULL__null) {
2394	return;
2395	}
2396	// Remove any strings from the set.
2397	// There shouldn't be any, but just in case.
2398	// (Case Closure can add them; if we had a simple case closure available that
2399	// ignored strings, that would be better.)
2400	theSet->removeAllStrings();
2401	int32_t setSize = theSet->size();
2402
2403	switch (setSize) {
2404	case 0:
2405	{
2406	// Set of no elements. Always fails to match.
2407	appendOp(URX_BACKTRACK, 0);
2408	delete theSet;
2409	}
2410	break;
2411
2412	case 1:
2413	{
2414	// The set contains only a single code point. Put it into
2415	// the compiled pattern as a single char operation rather
2416	// than a set, and discard the set itself.
2417	literalChar(theSet->charAt(0));
2418	delete theSet;
2419	}
2420	break;
2421
2422	default:
2423	{
2424	// The set contains two or more chars. (the normal case)
2425	// Put it into the compiled pattern as a set.
2426	theSet->freeze();
2427	int32_t setNumber = fRXPat->fSets->size();
2428	fRXPat->fSets->addElement(theSet, *fStatus);
2429	if (U_SUCCESS(*fStatus)) {
2430	appendOp(URX_SETREF, setNumber);
2431	} else {
2432	delete theSet;
2433	}
2434	}
2435	}
2436	}
2437
2438
2439	//------------------------------------------------------------------------------
2440	//
2441	// compileInterval Generate the code for a {min, max} style interval quantifier.
2442	// Except for the specific opcodes used, the code is the same
2443	// for all three types (greedy, non-greedy, possessive) of
2444	// intervals. The opcodes are supplied as parameters.
2445	// (There are two sets of opcodes - greedy & possessive use the
2446	// same ones, while non-greedy has it's own.)
2447	//
2448	// The code for interval loops has this form:
2449	// 0 CTR_INIT counter loc (in stack frame)
2450	// 1 5 patt address of CTR_LOOP at bottom of block
2451	// 2 min count
2452	// 3 max count (-1 for unbounded)
2453	// 4 ... block to be iterated over
2454	// 5 CTR_LOOP
2455	//
2456	// In
2457	//------------------------------------------------------------------------------
2458	void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp)
2459	{
2460	// The CTR_INIT op at the top of the block with the {n,m} quantifier takes
2461	// four slots in the compiled code. Reserve them.
2462	int32_t topOfBlock = blockTopLoc(TRUE1);
2463	insertOp(topOfBlock);
2464	insertOp(topOfBlock);
2465	insertOp(topOfBlock);
2466
2467	// The operands for the CTR_INIT opcode include the index in the matcher data
2468	// of the counter. Allocate it now. There are two data items
2469	// counterLoc --> Loop counter
2470	// +1 --> Input index (for breaking non-progressing loops)
2471	// (Only present if unbounded upper limit on loop)
2472	int32_t dataSize = fIntervalUpper < 0 ? 2 : 1;
2473	int32_t counterLoc = allocateStackData(dataSize);
2474
2475	int32_t op = buildOp(InitOp, counterLoc);
2476	fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
2477
2478	// The second operand of CTR_INIT is the location following the end of the loop.
2479	// Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
2480	// compilation of something later on causes the code to grow and the target
2481	// position to move.
2482	int32_t loopEnd = fRXPat->fCompiledPat->size();
2483	op = buildOp(URX_RELOC_OPRND, loopEnd);
2484	fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
2485
2486	// Followed by the min and max counts.
2487	fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
2488	fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
2489
2490	// Append the CTR_LOOP op. The operand is the location of the CTR_INIT op.
2491	// Goes at end of the block being looped over, so just append to the code so far.
2492	appendOp(LoopOp, topOfBlock);
2493
2494	if ((fIntervalLow & 0xff000000) != 0 \|\|
2495	(fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) {
2496	error(U_REGEX_NUMBER_TOO_BIG);
2497	}
2498
2499	if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
2500	error(U_REGEX_MAX_LT_MIN);
2501	}
2502	}
2503
2504
2505
2506	UBool RegexCompile::compileInlineInterval() {
2507	if (fIntervalUpper > 10 \|\| fIntervalUpper < fIntervalLow) {
2508	// Too big to inline. Fail, which will cause looping code to be generated.
2509	// (Upper < Lower picks up unbounded upper and errors, both.)
2510	return FALSE0;
2511	}
2512
2513	int32_t topOfBlock = blockTopLoc(FALSE0);
2514	if (fIntervalUpper == 0) {
2515	// Pathological case. Attempt no matches, as if the block doesn't exist.
2516	// Discard the generated code for the block.
2517	// If the block included parens, discard the info pertaining to them as well.
2518	fRXPat->fCompiledPat->setSize(topOfBlock);
2519	if (fMatchOpenParen >= topOfBlock) {
2520	fMatchOpenParen = -1;
2521	}
2522	if (fMatchCloseParen >= topOfBlock) {
2523	fMatchCloseParen = -1;
2524	}
2525	return TRUE1;
2526	}
2527
2528	if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
2529	// The thing being repeated is not a single op, but some
2530	// more complex block. Do it as a loop, not inlines.
2531	// Note that things "repeated" a max of once are handled as inline, because
2532	// the one copy of the code already generated is just fine.
2533	return FALSE0;
2534	}
2535
2536	// Pick up the opcode that is to be repeated
2537	//
2538	int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock);
2539
2540	// Compute the pattern location where the inline sequence
2541	// will end, and set up the state save op that will be needed.
2542	//
2543	int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
2544	+ fIntervalUpper + (fIntervalUpper-fIntervalLow);
2545	int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc);
2546	if (fIntervalLow == 0) {
2547	insertOp(topOfBlock);
2548	fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
2549	}
2550
2551
2552
2553	// Loop, emitting the op for the thing being repeated each time.
2554	// Loop starts at 1 because one instance of the op already exists in the pattern,
2555	// it was put there when it was originally encountered.
2556	int32_t i;
2557	for (i=1; i<fIntervalUpper; i++ ) {
2558	if (i >= fIntervalLow) {
2559	appendOp(saveOp);
2560	}
2561	appendOp(op);
2562	}
2563	return TRUE1;
2564	}
2565
2566
2567
2568	//------------------------------------------------------------------------------
2569	//
2570	// caseInsensitiveStart given a single code point from a pattern string, determine the
2571	// set of characters that could potentially begin a case-insensitive
2572	// match of a string beginning with that character, using full Unicode
2573	// case insensitive matching.
2574	//
2575	// This is used in optimizing find().
2576	//
2577	// closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
2578	// misses cases like this:
2579	// A string from the pattern begins with 'ss' (although all we know
2580	// in this context is that it begins with 's')
2581	// The pattern could match a string beginning with a German sharp-s
2582	//
2583	// To the ordinary case closure for a character c, we add all other
2584	// characters cx where the case closure of cx includes a string form that begins
2585	// with the original character c.
2586	//
2587	// This function could be made smarter. The full pattern string is available
2588	// and it would be possible to verify that the extra characters being added
2589	// to the starting set fully match, rather than having just a first-char of the
2590	// folded form match.
2591	//
2592	//------------------------------------------------------------------------------
2593	void RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) {
2594
2595	// Machine Generated below.
2596	// It may need updating with new versions of Unicode.
2597	// Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
2598	// The update tool is here:
2599	// https://github.com/unicode-org/icu/tree/main/tools/unicode/c/genregexcasing
2600
2601	// Machine Generated Data. Do not hand edit.
2602	static const UChar32 RECaseFixCodePoints[] = {
2603	0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc,
2604	0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565,
2605	0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07,
2606	0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61,
2607	0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000};
2608
2609	static const int16_t RECaseFixStringOffsets[] = {
2610	0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10,
2611	0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f,
2612	0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43,
2613	0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57,
2614	0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0};
2615
2616	static const int16_t RECaseFixCounts[] = {
2617	0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1,
2618	0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1,
2619	0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2620	0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2621	0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0};
2622
2623	static const UChar RECaseFixData[] = {
2624	0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf,
2625	0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3,
2626	0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3,
2627	0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3,
2628	0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14,
2629	0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83,
2630	0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90,
2631	0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95,
2632	0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2,
2633	0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7,
2634	0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0};
2635
2636	// End of machine generated data.
2637
2638	if (c < UCHAR_MIN_VALUE0 \|\| c > UCHAR_MAX_VALUE0x10ffff) {
2639	// This function should never be called with an invalid input character.
2640	UPRV_UNREACHABLE_EXITabort();
2641	} else if (u_hasBinaryPropertyu_hasBinaryProperty_71(c, UCHAR_CASE_SENSITIVE)) {
2642	UChar32 caseFoldedC = u_foldCaseu_foldCase_71(c, U_FOLD_CASE_DEFAULT0);
2643	starterChars->set(caseFoldedC, caseFoldedC);
2644
2645	int32_t i;
2646	for (i=0; RECaseFixCodePoints[i]<c ; i++) {
2647	// Simple linear search through the sorted list of interesting code points.
2648	}
2649
2650	if (RECaseFixCodePoints[i] == c) {
2651	int32_t dataIndex = RECaseFixStringOffsets[i];
2652	int32_t numCharsToAdd = RECaseFixCounts[i];
2653	UChar32 cpToAdd = 0;
2654	for (int32_t j=0; j<numCharsToAdd; j++) {
2655	U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd)do { (cpToAdd)=(RECaseFixData)[(dataIndex)++]; if((((cpToAdd) &0xfffffc00)==0xd800)) { (cpToAdd)=(((UChar32)((cpToAdd)) <<10UL)+(UChar32)((RECaseFixData)[(dataIndex)++])-((0xd800 <<10UL)+0xdc00-0x10000)); } } while (false);
2656	starterChars->add(cpToAdd);
2657	}
2658	}
2659
2660	starterChars->closeOver(USET_CASE_INSENSITIVE);
2661	starterChars->removeAllStrings();
2662	} else {
2663	// Not a cased character. Just return it alone.
2664	starterChars->set(c, c);
2665	}
2666	}
2667
2668
2669	// Increment with overflow check.
2670	// val and delta will both be positive.
2671
2672	static int32_t safeIncrement(int32_t val, int32_t delta) {
2673	if (INT32_MAX(2147483647) - val > delta) {
2674	return val + delta;
2675	} else {
2676	return INT32_MAX(2147483647);
2677	}
2678	}
2679
2680
2681	//------------------------------------------------------------------------------
2682	//
2683	// matchStartType Determine how a match can start.
2684	// Used to optimize find() operations.
2685	//
2686	// Operation is very similar to minMatchLength(). Walk the compiled
2687	// pattern, keeping an on-going minimum-match-length. For any
2688	// op where the min match coming in is zero, add that ops possible
2689	// starting matches to the possible starts for the overall pattern.
2690	//
2691	//------------------------------------------------------------------------------
2692	void RegexCompile::matchStartType() {
2693	if (U_FAILURE(*fStatus)) {
2694	return;
2695	}
2696
2697
2698	int32_t loc; // Location in the pattern of the current op being processed.
2699	int32_t op; // The op being processed
2700	int32_t opType; // The opcode type of the op
2701	int32_t currentLen = 0; // Minimum length of a match to this point (loc) in the pattern
2702	int32_t numInitialStrings = 0; // Number of strings encountered that could match at start.
2703
2704	UBool atStart = TRUE1; // True if no part of the pattern yet encountered
2705	// could have advanced the position in a match.
2706	// (Maximum match length so far == 0)
2707
2708	// forwardedLength is a vector holding minimum-match-length values that
2709	// are propagated forward in the pattern by JMP or STATE_SAVE operations.
2710	// It must be one longer than the pattern being checked because some ops
2711	// will jmp to a end-of-block+1 location from within a block, and we must
2712	// count those when checking the block.
2713	int32_t end = fRXPat->fCompiledPat->size();
2714	UVector32 forwardedLength(end+1, *fStatus);
2715	forwardedLength.setSize(end+1);
2716	for (loc=3; loc<end; loc++) {
2717	forwardedLength.setElementAt(INT32_MAX(2147483647), loc);
2718	}
2719
2720	for (loc = 3; loc<end; loc++) {
2721	op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2722	opType = URX_TYPE(op)((uint32_t)(op) >> 24);
2723
2724	// The loop is advancing linearly through the pattern.
2725	// If the op we are now at was the destination of a branch in the pattern,
2726	// and that path has a shorter minimum length than the current accumulated value,
2727	// replace the current accumulated value.
2728	if (forwardedLength.elementAti(loc) < currentLen) {
2729	currentLen = forwardedLength.elementAti(loc);
2730	U_ASSERT(currentLen>=0 && currentLen < INT32_MAX)(void)0;
2731	}
2732
2733	switch (opType) {
2734	// Ops that don't change the total length matched
2735	case URX_RESERVED_OP:
2736	case URX_END:
2737	case URX_FAIL:
2738	case URX_STRING_LEN:
2739	case URX_NOP:
2740	case URX_START_CAPTURE:
2741	case URX_END_CAPTURE:
2742	case URX_BACKSLASH_B:
2743	case URX_BACKSLASH_BU:
2744	case URX_BACKSLASH_G:
2745	case URX_BACKSLASH_Z:
2746	case URX_DOLLAR:
2747	case URX_DOLLAR_M:
2748	case URX_DOLLAR_D:
2749	case URX_DOLLAR_MD:
2750	case URX_RELOC_OPRND:
2751	case URX_STO_INP_LOC:
2752	case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
2753	case URX_BACKREF_I:
2754
2755	case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
2756	case URX_LD_SP:
2757	break;
2758
2759	case URX_CARET:
2760	if (atStart) {
2761	fRXPat->fStartType = START_START;
2762	}
2763	break;
2764
2765	case URX_CARET_M:
2766	case URX_CARET_M_UNIX:
2767	if (atStart) {
2768	fRXPat->fStartType = START_LINE;
2769	}
2770	break;
2771
2772	case URX_ONECHAR:
2773	if (currentLen == 0) {
2774	// This character could appear at the start of a match.
2775	// Add it to the set of possible starting characters.
2776	fRXPat->fInitialChars->add(URX_VAL(op)((op) & 0xffffff));
2777	numInitialStrings += 2;
2778	}
2779	currentLen = safeIncrement(currentLen, 1);
2780	atStart = FALSE0;
2781	break;
2782
2783
2784	case URX_SETREF:
2785	if (currentLen == 0) {
2786	int32_t sn = URX_VAL(op)((op) & 0xffffff);
2787	U_ASSERT(sn > 0 && sn < fRXPat->fSets->size())(void)0;
2788	const UnicodeSet s = (UnicodeSet )fRXPat->fSets->elementAt(sn);
2789	fRXPat->fInitialChars->addAll(*s);
2790	numInitialStrings += 2;
2791	}
2792	currentLen = safeIncrement(currentLen, 1);
2793	atStart = FALSE0;
2794	break;
2795
2796	case URX_LOOP_SR_I:
2797	// [Set]*, like a SETREF, above, in what it can match,
2798	// but may not match at all, so currentLen is not incremented.
2799	if (currentLen == 0) {
2800	int32_t sn = URX_VAL(op)((op) & 0xffffff);
2801	U_ASSERT(sn > 0 && sn < fRXPat->fSets->size())(void)0;
2802	const UnicodeSet s = (UnicodeSet )fRXPat->fSets->elementAt(sn);
2803	fRXPat->fInitialChars->addAll(*s);
2804	numInitialStrings += 2;
2805	}
2806	atStart = FALSE0;
2807	break;
2808
2809	case URX_LOOP_DOT_I:
2810	if (currentLen == 0) {
2811	// .* at the start of a pattern.
2812	// Any character can begin the match.
2813	fRXPat->fInitialChars->clear();
2814	fRXPat->fInitialChars->complement();
2815	numInitialStrings += 2;
2816	}
2817	atStart = FALSE0;
2818	break;
2819
2820
2821	case URX_STATIC_SETREF:
2822	if (currentLen == 0) {
2823	int32_t sn = URX_VAL(op)((op) & 0xffffff);
2824	U_ASSERT(sn>0 && sn<URX_LAST_SET)(void)0;
2825	const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[sn];
2826	fRXPat->fInitialChars->addAll(s);
2827	numInitialStrings += 2;
2828	}
2829	currentLen = safeIncrement(currentLen, 1);
2830	atStart = FALSE0;
2831	break;
2832
2833
2834
2835	case URX_STAT_SETREF_N:
2836	if (currentLen == 0) {
2837	int32_t sn = URX_VAL(op)((op) & 0xffffff);
2838	UnicodeSet sc;
2839	sc.addAll(RegexStaticSets::gStaticSets->fPropSets[sn]).complement();
2840	fRXPat->fInitialChars->addAll(sc);
2841	numInitialStrings += 2;
2842	}
2843	currentLen = safeIncrement(currentLen, 1);
2844	atStart = FALSE0;
2845	break;
2846
2847
2848
2849	case URX_BACKSLASH_D:
2850	// Digit Char
2851	if (currentLen == 0) {
2852	UnicodeSet s;
2853	s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK((uint32_t)1<<(U_DECIMAL_DIGIT_NUMBER)), *fStatus);
2854	if (URX_VAL(op)((op) & 0xffffff) != 0) {
2855	s.complement();
2856	}
2857	fRXPat->fInitialChars->addAll(s);
2858	numInitialStrings += 2;
2859	}
2860	currentLen = safeIncrement(currentLen, 1);
2861	atStart = FALSE0;
2862	break;
2863
2864
2865	case URX_BACKSLASH_H:
2866	// Horiz white space
2867	if (currentLen == 0) {
2868	UnicodeSet s;
2869	s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK((uint32_t)1<<(U_SPACE_SEPARATOR)), *fStatus);
2870	s.add((UChar32)9); // Tab
2871	if (URX_VAL(op)((op) & 0xffffff) != 0) {
2872	s.complement();
2873	}
2874	fRXPat->fInitialChars->addAll(s);
2875	numInitialStrings += 2;
2876	}
2877	currentLen = safeIncrement(currentLen, 1);
2878	atStart = FALSE0;
2879	break;
2880
2881
2882	case URX_BACKSLASH_R: // Any line ending sequence
2883	case URX_BACKSLASH_V: // Any line ending code point, with optional negation
2884	if (currentLen == 0) {
2885	UnicodeSet s;
2886	s.add((UChar32)0x0a, (UChar32)0x0d); // add range
2887	s.add((UChar32)0x85);
2888	s.add((UChar32)0x2028, (UChar32)0x2029);
2889	if (URX_VAL(op)((op) & 0xffffff) != 0) {
2890	// Complement option applies to URX_BACKSLASH_V only.
2891	s.complement();
2892	}
2893	fRXPat->fInitialChars->addAll(s);
2894	numInitialStrings += 2;
2895	}
2896	currentLen = safeIncrement(currentLen, 1);
2897	atStart = FALSE0;
2898	break;
2899
2900
2901
2902	case URX_ONECHAR_I:
2903	// Case Insensitive Single Character.
2904	if (currentLen == 0) {
2905	UChar32 c = URX_VAL(op)((op) & 0xffffff);
2906	if (u_hasBinaryPropertyu_hasBinaryProperty_71(c, UCHAR_CASE_SENSITIVE)) {
2907	UnicodeSet starters(c, c);
2908	starters.closeOver(USET_CASE_INSENSITIVE);
2909	// findCaseInsensitiveStarters(c, &starters);
2910	// For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
2911	// The expanded folding can't match the pattern.
2912	fRXPat->fInitialChars->addAll(starters);
2913	} else {
2914	// Char has no case variants. Just add it as-is to the
2915	// set of possible starting chars.
2916	fRXPat->fInitialChars->add(c);
2917	}
2918	numInitialStrings += 2;
2919	}
2920	currentLen = safeIncrement(currentLen, 1);
2921	atStart = FALSE0;
2922	break;
2923
2924
2925	case URX_BACKSLASH_X: // Grapheme Cluster. Minimum is 1, max unbounded.
2926	case URX_DOTANY_ALL: // . matches one or two.
2927	case URX_DOTANY:
2928	case URX_DOTANY_UNIX:
2929	if (currentLen == 0) {
2930	// These constructs are all bad news when they appear at the start
2931	// of a match. Any character can begin the match.
2932	fRXPat->fInitialChars->clear();
2933	fRXPat->fInitialChars->complement();
2934	numInitialStrings += 2;
2935	}
2936	currentLen = safeIncrement(currentLen, 1);
2937	atStart = FALSE0;
2938	break;
2939
2940
2941	case URX_JMPX:
2942	loc++; // Except for extra operand on URX_JMPX, same as URX_JMP.
2943	U_FALLTHROUGH[[clang::fallthrough]];
2944	case URX_JMP:
2945	{
2946	int32_t jmpDest = URX_VAL(op)((op) & 0xffffff);
2947	if (jmpDest < loc) {
2948	// Loop of some kind. Can safely ignore, the worst that will happen
2949	// is that we understate the true minimum length
2950	currentLen = forwardedLength.elementAti(loc+1);
2951
2952	} else {
2953	// Forward jump. Propagate the current min length to the target loc of the jump.
2954	U_ASSERT(jmpDest <= end+1)(void)0;
2955	if (forwardedLength.elementAti(jmpDest) > currentLen) {
2956	forwardedLength.setElementAt(currentLen, jmpDest);
2957	}
2958	}
2959	}
2960	atStart = FALSE0;
2961	break;
2962
2963	case URX_JMP_SAV:
2964	case URX_JMP_SAV_X:
2965	// Combo of state save to the next loc, + jmp backwards.
2966	// Net effect on min. length computation is nothing.
2967	atStart = FALSE0;
2968	break;
2969
2970	case URX_BACKTRACK:
2971	// Fails are kind of like a branch, except that the min length was
2972	// propagated already, by the state save.
2973	currentLen = forwardedLength.elementAti(loc+1);
2974	atStart = FALSE0;
2975	break;
2976
2977
2978	case URX_STATE_SAVE:
2979	{
2980	// State Save, for forward jumps, propagate the current minimum.
2981	// of the state save.
2982	int32_t jmpDest = URX_VAL(op)((op) & 0xffffff);
2983	if (jmpDest > loc) {
2984	if (currentLen < forwardedLength.elementAti(jmpDest)) {
2985	forwardedLength.setElementAt(currentLen, jmpDest);
2986	}
2987	}
2988	}
2989	atStart = FALSE0;
2990	break;
2991
2992
2993
2994
2995	case URX_STRING:
2996	{
2997	loc++;
2998	int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2999	int32_t stringLen = URX_VAL(stringLenOp)((stringLenOp) & 0xffffff);
3000	U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN)(void)0;
3001	U_ASSERT(stringLenOp >= 2)(void)0;
3002	if (currentLen == 0) {
3003	// Add the starting character of this string to the set of possible starting
3004	// characters for this pattern.
3005	int32_t stringStartIdx = URX_VAL(op)((op) & 0xffffff);
3006	UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
3007	fRXPat->fInitialChars->add(c);
3008
3009	// Remember this string. After the entire pattern has been checked,
3010	// if nothing else is identified that can start a match, we'll use it.
3011	numInitialStrings++;
3012	fRXPat->fInitialStringIdx = stringStartIdx;
3013	fRXPat->fInitialStringLen = stringLen;
3014	}
3015
3016	currentLen = safeIncrement(currentLen, stringLen);
3017	atStart = FALSE0;
3018	}
3019	break;
3020
3021	case URX_STRING_I:
3022	{
3023	// Case-insensitive string. Unlike exact-match strings, we won't
3024	// attempt a string search for possible match positions. But we
3025	// do update the set of possible starting characters.
3026	loc++;
3027	int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3028	int32_t stringLen = URX_VAL(stringLenOp)((stringLenOp) & 0xffffff);
3029	U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN)(void)0;
3030	U_ASSERT(stringLenOp >= 2)(void)0;
3031	if (currentLen == 0) {
3032	// Add the starting character of this string to the set of possible starting
3033	// characters for this pattern.
3034	int32_t stringStartIdx = URX_VAL(op)((op) & 0xffffff);
3035	UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
3036	UnicodeSet s;
3037	findCaseInsensitiveStarters(c, &s);
3038	fRXPat->fInitialChars->addAll(s);
3039	numInitialStrings += 2; // Matching on an initial string not possible.
3040	}
3041	currentLen = safeIncrement(currentLen, stringLen);
3042	atStart = FALSE0;
3043	}
3044	break;
3045
3046	case URX_CTR_INIT:
3047	case URX_CTR_INIT_NG:
3048	{
3049	// Loop Init Ops. These don't change the min length, but they are 4 word ops
3050	// so location must be updated accordingly.
3051	// Loop Init Ops.
3052	// If the min loop count == 0
3053	// move loc forwards to the end of the loop, skipping over the body.
3054	// If the min count is > 0,
3055	// continue normal processing of the body of the loop.
3056	int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3057	loopEndLoc = URX_VAL(loopEndLoc)((loopEndLoc) & 0xffffff);
3058	int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3059	if (minLoopCount == 0) {
3060	// Min Loop Count of 0, treat like a forward branch and
3061	// move the current minimum length up to the target
3062	// (end of loop) location.
3063	U_ASSERT(loopEndLoc <= end+1)(void)0;
3064	if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
3065	forwardedLength.setElementAt(currentLen, loopEndLoc);
3066	}
3067	}
3068	loc+=3; // Skips over operands of CTR_INIT
3069	}
3070	atStart = FALSE0;
3071	break;
3072
3073
3074	case URX_CTR_LOOP:
3075	case URX_CTR_LOOP_NG:
3076	// Loop ops.
3077	// The jump is conditional, backwards only.
3078	atStart = FALSE0;
3079	break;
3080
3081	case URX_LOOP_C:
3082	// More loop ops. These state-save to themselves.
3083	// don't change the minimum match
3084	atStart = FALSE0;
3085	break;
3086
3087
3088	case URX_LA_START:
3089	case URX_LB_START:
3090	{
3091	// Look-around. Scan forward until the matching look-ahead end,
3092	// without processing the look-around block. This is overly pessimistic.
3093
3094	// Keep track of the nesting depth of look-around blocks. Boilerplate code for
3095	// lookahead contains two LA_END instructions, so count goes up by two
3096	// for each LA_START.
3097	int32_t depth = (opType == URX_LA_START? 2: 1);
3098	for (;;) {
3099	loc++;
3100	op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3101	if (URX_TYPE(op)((uint32_t)(op) >> 24) == URX_LA_START) {
3102	depth+=2;
3103	}
3104	if (URX_TYPE(op)((uint32_t)(op) >> 24) == URX_LB_START) {
3105	depth++;
3106	}
3107	if (URX_TYPE(op)((uint32_t)(op) >> 24) == URX_LA_END \|\| URX_TYPE(op)((uint32_t)(op) >> 24)==URX_LBN_END) {
3108	depth--;
3109	if (depth == 0) {
3110	break;
3111	}
3112	}
3113	if (URX_TYPE(op)((uint32_t)(op) >> 24) == URX_STATE_SAVE) {
3114	// Need this because neg lookahead blocks will FAIL to outside
3115	// of the block.
3116	int32_t jmpDest = URX_VAL(op)((op) & 0xffffff);
3117	if (jmpDest > loc) {
3118	if (currentLen < forwardedLength.elementAti(jmpDest)) {
3119	forwardedLength.setElementAt(currentLen, jmpDest);
3120	}
3121	}
3122	}
3123	U_ASSERT(loc <= end)(void)0;
3124	}
3125	}
3126	break;
3127
3128	case URX_LA_END:
3129	case URX_LB_CONT:
3130	case URX_LB_END:
3131	case URX_LBN_CONT:
3132	case URX_LBN_END:
3133	UPRV_UNREACHABLE_EXITabort(); // Shouldn't get here. These ops should be
3134	// consumed by the scan in URX_LA_START and LB_START
3135	default:
3136	UPRV_UNREACHABLE_EXITabort();
3137	}
3138
3139	}
3140
3141
3142	// We have finished walking through the ops. Check whether some forward jump
3143	// propagated a shorter length to location end+1.
3144	if (forwardedLength.elementAti(end+1) < currentLen) {
3145	currentLen = forwardedLength.elementAti(end+1);
3146	}
3147
3148
3149	fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
3150
3151
3152	// Sort out what we should check for when looking for candidate match start positions.
3153	// In order of preference,
3154	// 1. Start of input text buffer.
3155	// 2. A literal string.
3156	// 3. Start of line in multi-line mode.
3157	// 4. A single literal character.
3158	// 5. A character from a set of characters.
3159	//
3160	if (fRXPat->fStartType == START_START) {
3161	// Match only at the start of an input text string.
3162	// start type is already set. We're done.
3163	} else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
3164	// Match beginning only with a literal string.
3165	UChar32 c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
3166	U_ASSERT(fRXPat->fInitialChars->contains(c))(void)0;
3167	fRXPat->fStartType = START_STRING;
3168	fRXPat->fInitialChar = c;
3169	} else if (fRXPat->fStartType == START_LINE) {
3170	// Match at start of line in Multi-Line mode.
3171	// Nothing to do here; everything is already set.
3172	} else if (fRXPat->fMinMatchLen == 0) {
3173	// Zero length match possible. We could start anywhere.
3174	fRXPat->fStartType = START_NO_INFO;
3175	} else if (fRXPat->fInitialChars->size() == 1) {
3176	// All matches begin with the same char.
3177	fRXPat->fStartType = START_CHAR;
3178	fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
3179	U_ASSERT(fRXPat->fInitialChar != (UChar32)-1)(void)0;
3180	} else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE0 &&
3181	fRXPat->fMinMatchLen > 0) {
3182	// Matches start with a set of character smaller than the set of all chars.
3183	fRXPat->fStartType = START_SET;
3184	} else {
3185	// Matches can start with anything
3186	fRXPat->fStartType = START_NO_INFO;
3187	}
3188
3189	return;
3190	}
3191
3192
3193
3194	//------------------------------------------------------------------------------
3195	//
3196	// minMatchLength Calculate the length of the shortest string that could
3197	// match the specified pattern.
3198	// Length is in 16 bit code units, not code points.
3199	//
3200	// The calculated length may not be exact. The returned
3201	// value may be shorter than the actual minimum; it must
3202	// never be longer.
3203	//
3204	// start and end are the range of p-code operations to be
3205	// examined. The endpoints are included in the range.
3206	//
3207	//------------------------------------------------------------------------------
3208	int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
3209	if (U_FAILURE(*fStatus)) {
3210	return 0;
3211	}
3212
3213	U_ASSERT(start <= end)(void)0;
3214	U_ASSERT(end < fRXPat->fCompiledPat->size())(void)0;
3215
3216
3217	int32_t loc;
3218	int32_t op;
3219	int32_t opType;
3220	int32_t currentLen = 0;
3221
3222
3223	// forwardedLength is a vector holding minimum-match-length values that
3224	// are propagated forward in the pattern by JMP or STATE_SAVE operations.
3225	// It must be one longer than the pattern being checked because some ops
3226	// will jmp to a end-of-block+1 location from within a block, and we must
3227	// count those when checking the block.
3228	UVector32 forwardedLength(end+2, *fStatus);
3229	forwardedLength.setSize(end+2);
3230	for (loc=start; loc<=end+1; loc++) {
3231	forwardedLength.setElementAt(INT32_MAX(2147483647), loc);
3232	}
3233
3234	for (loc = start; loc<=end; loc++) {
3235	op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3236	opType = URX_TYPE(op)((uint32_t)(op) >> 24);
3237
3238	// The loop is advancing linearly through the pattern.
3239	// If the op we are now at was the destination of a branch in the pattern,
3240	// and that path has a shorter minimum length than the current accumulated value,
3241	// replace the current accumulated value.
3242	// U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); // MinLength == INT32_MAX for some
3243	// no-match-possible cases.
3244	if (forwardedLength.elementAti(loc) < currentLen) {
3245	currentLen = forwardedLength.elementAti(loc);
3246	U_ASSERT(currentLen>=0 && currentLen < INT32_MAX)(void)0;
3247	}
3248
3249	switch (opType) {
3250	// Ops that don't change the total length matched
3251	case URX_RESERVED_OP:
3252	case URX_END:
3253	case URX_STRING_LEN:
3254	case URX_NOP:
3255	case URX_START_CAPTURE:
3256	case URX_END_CAPTURE:
3257	case URX_BACKSLASH_B:
3258	case URX_BACKSLASH_BU:
3259	case URX_BACKSLASH_G:
3260	case URX_BACKSLASH_Z:
3261	case URX_CARET:
3262	case URX_DOLLAR:
3263	case URX_DOLLAR_M:
3264	case URX_DOLLAR_D:
3265	case URX_DOLLAR_MD:
3266	case URX_RELOC_OPRND:
3267	case URX_STO_INP_LOC:
3268	case URX_CARET_M:
3269	case URX_CARET_M_UNIX:
3270	case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
3271	case URX_BACKREF_I:
3272
3273	case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
3274	case URX_LD_SP:
3275
3276	case URX_JMP_SAV:
3277	case URX_JMP_SAV_X:
3278	break;
3279
3280
3281	// Ops that match a minimum of one character (one or two 16 bit code units.)
3282	//
3283	case URX_ONECHAR:
3284	case URX_STATIC_SETREF:
3285	case URX_STAT_SETREF_N:
3286	case URX_SETREF:
3287	case URX_BACKSLASH_D:
3288	case URX_BACKSLASH_H:
3289	case URX_BACKSLASH_R:
3290	case URX_BACKSLASH_V:
3291	case URX_ONECHAR_I:
3292	case URX_BACKSLASH_X: // Grapheme Cluster. Minimum is 1, max unbounded.
3293	case URX_DOTANY_ALL: // . matches one or two.
3294	case URX_DOTANY:
3295	case URX_DOTANY_UNIX:
3296	currentLen = safeIncrement(currentLen, 1);
3297	break;
3298
3299
3300	case URX_JMPX:
3301	loc++; // URX_JMPX has an extra operand, ignored here,
3302	// otherwise processed identically to URX_JMP.
3303	U_FALLTHROUGH[[clang::fallthrough]];
3304	case URX_JMP:
3305	{
3306	int32_t jmpDest = URX_VAL(op)((op) & 0xffffff);
3307	if (jmpDest < loc) {
3308	// Loop of some kind. Can safely ignore, the worst that will happen
3309	// is that we understate the true minimum length
3310	currentLen = forwardedLength.elementAti(loc+1);
3311	} else {
3312	// Forward jump. Propagate the current min length to the target loc of the jump.
3313	U_ASSERT(jmpDest <= end+1)(void)0;
3314	if (forwardedLength.elementAti(jmpDest) > currentLen) {
3315	forwardedLength.setElementAt(currentLen, jmpDest);
3316	}
3317	}
3318	}
3319	break;
3320
3321	case URX_BACKTRACK:
3322	{
3323	// Back-tracks are kind of like a branch, except that the min length was
3324	// propagated already, by the state save.
3325	currentLen = forwardedLength.elementAti(loc+1);
3326	}
3327	break;
3328
3329
3330	case URX_STATE_SAVE:
3331	{
3332	// State Save, for forward jumps, propagate the current minimum.
3333	// of the state save.
3334	int32_t jmpDest = URX_VAL(op)((op) & 0xffffff);
3335	if (jmpDest > loc) {
3336	if (currentLen < forwardedLength.elementAti(jmpDest)) {
3337	forwardedLength.setElementAt(currentLen, jmpDest);
3338	}
3339	}
3340	}
3341	break;
3342
3343
3344	case URX_STRING:
3345	{
3346	loc++;
3347	int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3348	currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp)((stringLenOp) & 0xffffff));
3349	}
3350	break;
3351
3352
3353	case URX_STRING_I:
3354	{
3355	loc++;
3356	// TODO: with full case folding, matching input text may be shorter than
3357	// the string we have here. More smarts could put some bounds on it.
3358	// Assume a min length of one for now. A min length of zero causes
3359	// optimization failures for a pattern like "string"+
3360	// currentLen += URX_VAL(stringLenOp);
3361	currentLen = safeIncrement(currentLen, 1);
3362	}
3363	break;
3364
3365	case URX_CTR_INIT:
3366	case URX_CTR_INIT_NG:
3367	{
3368	// Loop Init Ops.
3369	// If the min loop count == 0
3370	// move loc forwards to the end of the loop, skipping over the body.
3371	// If the min count is > 0,
3372	// continue normal processing of the body of the loop.
3373	int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3374	loopEndLoc = URX_VAL(loopEndLoc)((loopEndLoc) & 0xffffff);
3375	int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3376	if (minLoopCount == 0) {
3377	loc = loopEndLoc;
3378	} else {
3379	loc+=3; // Skips over operands of CTR_INIT
3380	}
3381	}
3382	break;
3383
3384
3385	case URX_CTR_LOOP:
3386	case URX_CTR_LOOP_NG:
3387	// Loop ops.
3388	// The jump is conditional, backwards only.
3389	break;
3390
3391	case URX_LOOP_SR_I:
3392	case URX_LOOP_DOT_I:
3393	case URX_LOOP_C:
3394	// More loop ops. These state-save to themselves.
3395	// don't change the minimum match - could match nothing at all.
3396	break;
3397
3398
3399	case URX_LA_START:
3400	case URX_LB_START:
3401	{
3402	// Look-around. Scan forward until the matching look-ahead end,
3403	// without processing the look-around block. This is overly pessimistic for look-ahead,
3404	// it assumes that the look-ahead match might be zero-length.
3405	// TODO: Positive lookahead could recursively do the block, then continue
3406	// with the longer of the block or the value coming in. Ticket 6060
3407	int32_t depth = (opType == URX_LA_START? 2: 1);
3408	for (;;) {
3409	loc++;
3410	op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3411	if (URX_TYPE(op)((uint32_t)(op) >> 24) == URX_LA_START) {
3412	// The boilerplate for look-ahead includes two LA_END instructions,
3413	// Depth will be decremented by each one when it is seen.
3414	depth += 2;
3415	}
3416	if (URX_TYPE(op)((uint32_t)(op) >> 24) == URX_LB_START) {
3417	depth++;
3418	}
3419	if (URX_TYPE(op)((uint32_t)(op) >> 24) == URX_LA_END) {
3420	depth--;
3421	if (depth == 0) {
3422	break;
3423	}
3424	}
3425	if (URX_TYPE(op)((uint32_t)(op) >> 24)==URX_LBN_END) {
3426	depth--;
3427	if (depth == 0) {
3428	break;
3429	}
3430	}
3431	if (URX_TYPE(op)((uint32_t)(op) >> 24) == URX_STATE_SAVE) {
3432	// Need this because neg lookahead blocks will FAIL to outside
3433	// of the block.
3434	int32_t jmpDest = URX_VAL(op)((op) & 0xffffff);
3435	if (jmpDest > loc) {
3436	if (currentLen < forwardedLength.elementAti(jmpDest)) {
3437	forwardedLength.setElementAt(currentLen, jmpDest);
3438	}
3439	}
3440	}
3441	U_ASSERT(loc <= end)(void)0;
3442	}
3443	}
3444	break;
3445
3446	case URX_LA_END:
3447	case URX_LB_CONT:
3448	case URX_LB_END:
3449	case URX_LBN_CONT:
3450	case URX_LBN_END:
3451	// Only come here if the matching URX_LA_START or URX_LB_START was not in the
3452	// range being sized, which happens when measuring size of look-behind blocks.
3453	break;
3454
3455	default:
3456	UPRV_UNREACHABLE_EXITabort();
3457	}
3458
3459	}
3460
3461	// We have finished walking through the ops. Check whether some forward jump
3462	// propagated a shorter length to location end+1.
3463	if (forwardedLength.elementAti(end+1) < currentLen) {
3464	currentLen = forwardedLength.elementAti(end+1);
3465	U_ASSERT(currentLen>=0 && currentLen < INT32_MAX)(void)0;
3466	}
3467
3468	return currentLen;
3469	}
3470
3471	//------------------------------------------------------------------------------
3472	//
3473	// maxMatchLength Calculate the length of the longest string that could
3474	// match the specified pattern.
3475	// Length is in 16 bit code units, not code points.
3476	//
3477	// The calculated length may not be exact. The returned
3478	// value may be longer than the actual maximum; it must
3479	// never be shorter.
3480	//
3481	// start, end: the range of the pattern to check.
3482	// end is inclusive.
3483	//
3484	//------------------------------------------------------------------------------
3485	int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
3486	if (U_FAILURE(*fStatus)) {
3487	return 0;
3488	}
3489	U_ASSERT(start <= end)(void)0;
3490	U_ASSERT(end < fRXPat->fCompiledPat->size())(void)0;
3491
3492	int32_t loc;
3493	int32_t op;
3494	int32_t opType;
3495	int32_t currentLen = 0;
3496	UVector32 forwardedLength(end+1, *fStatus);
3497	forwardedLength.setSize(end+1);
3498
3499	for (loc=start; loc<=end; loc++) {
3500	forwardedLength.setElementAt(0, loc);
3501	}
3502
3503	for (loc = start; loc<=end; loc++) {
3504	op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3505	opType = URX_TYPE(op)((uint32_t)(op) >> 24);
3506
3507	// The loop is advancing linearly through the pattern.
3508	// If the op we are now at was the destination of a branch in the pattern,
3509	// and that path has a longer maximum length than the current accumulated value,
3510	// replace the current accumulated value.
3511	if (forwardedLength.elementAti(loc) > currentLen) {
3512	currentLen = forwardedLength.elementAti(loc);
3513	}
3514
3515	switch (opType) {
3516	// Ops that don't change the total length matched
3517	case URX_RESERVED_OP:
3518	case URX_END:
3519	case URX_STRING_LEN:
3520	case URX_NOP:
3521	case URX_START_CAPTURE:
3522	case URX_END_CAPTURE:
3523	case URX_BACKSLASH_B:
3524	case URX_BACKSLASH_BU:
3525	case URX_BACKSLASH_G:
3526	case URX_BACKSLASH_Z:
3527	case URX_CARET:
3528	case URX_DOLLAR:
3529	case URX_DOLLAR_M:
3530	case URX_DOLLAR_D:
3531	case URX_DOLLAR_MD:
3532	case URX_RELOC_OPRND:
3533	case URX_STO_INP_LOC:
3534	case URX_CARET_M:
3535	case URX_CARET_M_UNIX:
3536
3537	case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
3538	case URX_LD_SP:
3539
3540	case URX_LB_END:
3541	case URX_LB_CONT:
3542	case URX_LBN_CONT:
3543	case URX_LBN_END:
3544	break;
3545
3546
3547	// Ops that increase that cause an unbounded increase in the length
3548	// of a matched string, or that increase it a hard to characterize way.
3549	// Call the max length unbounded, and stop further checking.
3550	case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
3551	case URX_BACKREF_I:
3552	case URX_BACKSLASH_X: // Grapheme Cluster. Minimum is 1, max unbounded.
3553	currentLen = INT32_MAX(2147483647);
3554	break;
3555
3556
3557	// Ops that match a max of one character (possibly two 16 bit code units.)
3558	//
3559	case URX_STATIC_SETREF:
3560	case URX_STAT_SETREF_N:
3561	case URX_SETREF:
3562	case URX_BACKSLASH_D:
3563	case URX_BACKSLASH_H:
3564	case URX_BACKSLASH_R:
3565	case URX_BACKSLASH_V:
3566	case URX_ONECHAR_I:
3567	case URX_DOTANY_ALL:
3568	case URX_DOTANY:
3569	case URX_DOTANY_UNIX:
3570	currentLen = safeIncrement(currentLen, 2);
3571	break;
3572
3573	// Single literal character. Increase current max length by one or two,
3574	// depending on whether the char is in the supplementary range.
3575	case URX_ONECHAR:
3576	currentLen = safeIncrement(currentLen, 1);
3577	if (URX_VAL(op)((op) & 0xffffff) > 0x10000) {
3578	currentLen = safeIncrement(currentLen, 1);
3579	}
3580	break;
3581
3582	// Jumps.
3583	//
3584	case URX_JMP:
3585	case URX_JMPX:
3586	case URX_JMP_SAV:
3587	case URX_JMP_SAV_X:
3588	{
3589	int32_t jmpDest = URX_VAL(op)((op) & 0xffffff);
3590	if (jmpDest < loc) {
3591	// Loop of some kind. Max match length is unbounded.
3592	currentLen = INT32_MAX(2147483647);
3593	} else {
3594	// Forward jump. Propagate the current min length to the target loc of the jump.
3595	if (forwardedLength.elementAti(jmpDest) < currentLen) {
3596	forwardedLength.setElementAt(currentLen, jmpDest);
3597	}
3598	currentLen = 0;
3599	}
3600	}
3601	break;
3602
3603	case URX_BACKTRACK:
3604	// back-tracks are kind of like a branch, except that the max length was
3605	// propagated already, by the state save.
3606	currentLen = forwardedLength.elementAti(loc+1);
3607	break;
3608
3609
3610	case URX_STATE_SAVE:
3611	{
3612	// State Save, for forward jumps, propagate the current minimum.
3613	// of the state save.
3614	// For backwards jumps, they create a loop, maximum
3615	// match length is unbounded.
3616	int32_t jmpDest = URX_VAL(op)((op) & 0xffffff);
3617	if (jmpDest > loc) {
3618	if (currentLen > forwardedLength.elementAti(jmpDest)) {
3619	forwardedLength.setElementAt(currentLen, jmpDest);
3620	}
3621	} else {
3622	currentLen = INT32_MAX(2147483647);
3623	}
3624	}
3625	break;
3626
3627
3628
3629
3630	case URX_STRING:
3631	{
3632	loc++;
3633	int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3634	currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp)((stringLenOp) & 0xffffff));
3635	break;
3636	}
3637
3638	case URX_STRING_I:
3639	// TODO: This code assumes that any user string that matches will be no longer
3640	// than our compiled string, with case insensitive matching.
3641	// Our compiled string has been case-folded already.
3642	//
3643	// Any matching user string will have no more code points than our
3644	// compiled (folded) string. Folding may add code points, but
3645	// not remove them.
3646	//
3647	// There is a potential problem if a supplemental code point
3648	// case-folds to a BMP code point. In this case our compiled string
3649	// could be shorter (in code units) than a matching user string.
3650	//
3651	// At this time (Unicode 6.1) there are no such characters, and this case
3652	// is not being handled. A test, intltest regex/Bug9283, will fail if
3653	// any problematic characters are added to Unicode.
3654	//
3655	// If this happens, we can make a set of the BMP chars that the
3656	// troublesome supplementals fold to, scan our string, and bump the
3657	// currentLen one extra for each that is found.
3658	//
3659	{
3660	loc++;
3661	int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3662	currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp)((stringLenOp) & 0xffffff));
3663	}
3664	break;
3665
3666	case URX_CTR_INIT:
3667	case URX_CTR_INIT_NG:
3668	// For Loops, recursively call this function on the pattern for the loop body,
3669	// then multiply the result by the maximum loop count.
3670	{
3671	int32_t loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1))((fRXPat->fCompiledPat->elementAti(loc+1)) & 0xffffff );
3672	if (loopEndLoc == loc+4) {
3673	// Loop has an empty body. No affect on max match length.
3674	// Continue processing with code after the loop end.
3675	loc = loopEndLoc;
3676	break;
3677	}
3678
3679	int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc+3));
3680	if (maxLoopCount == -1) {
3681	// Unbounded Loop. No upper bound on match length.
3682	currentLen = INT32_MAX(2147483647);
3683	break;
3684	}
3685
3686	U_ASSERT(loopEndLoc >= loc+4)(void)0;
3687	int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Recursive call.
3688	int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCount;
3689	if (updatedLen >= INT32_MAX(2147483647)) {
3690	currentLen = INT32_MAX(2147483647);
3691	break;
3692	}
3693	currentLen = (int32_t)updatedLen;
3694	loc = loopEndLoc;
3695	break;
3696	}
3697
3698	case URX_CTR_LOOP:
3699	case URX_CTR_LOOP_NG:
3700	// These opcodes will be skipped over by code for URX_CTR_INIT.
3701	// We shouldn't encounter them here.
3702	UPRV_UNREACHABLE_EXITabort();
3703
3704	case URX_LOOP_SR_I:
3705	case URX_LOOP_DOT_I:
3706	case URX_LOOP_C:
3707	// For anything to do with loops, make the match length unbounded.
3708	currentLen = INT32_MAX(2147483647);
3709	break;
3710
3711
3712
3713	case URX_LA_START:
3714	case URX_LA_END:
3715	// Look-ahead. Just ignore, treat the look-ahead block as if
3716	// it were normal pattern. Gives a too-long match length,
3717	// but good enough for now.
3718	break;
3719
3720	// End of look-ahead ops should always be consumed by the processing at
3721	// the URX_LA_START op.
3722	// UPRV_UNREACHABLE_EXIT;
3723
3724	case URX_LB_START:
3725	{
3726	// Look-behind. Scan forward until the matching look-around end,
3727	// without processing the look-behind block.
3728	int32_t dataLoc = URX_VAL(op)((op) & 0xffffff);
3729	for (loc = loc + 1; loc <= end; ++loc) {
3730	op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3731	int32_t opType = URX_TYPE(op)((uint32_t)(op) >> 24);
3732	if ((opType == URX_LA_END \|\| opType == URX_LBN_END) && (URX_VAL(op)((op) & 0xffffff) == dataLoc)) {
3733	break;
3734	}
3735	}
3736	U_ASSERT(loc <= end)(void)0;
3737	}
3738	break;
3739
3740	default:
3741	UPRV_UNREACHABLE_EXITabort();
3742	}
3743
3744
3745	if (currentLen == INT32_MAX(2147483647)) {
3746	// The maximum length is unbounded.
3747	// Stop further processing of the pattern.
3748	break;
3749	}
3750
3751	}
3752	return currentLen;
3753
3754	}
3755
3756
3757	//------------------------------------------------------------------------------
3758	//
3759	// stripNOPs Remove any NOP operations from the compiled pattern code.
3760	// Extra NOPs are inserted for some constructs during the initial
3761	// code generation to provide locations that may be patched later.
3762	// Many end up unneeded, and are removed by this function.
3763	//
3764	// In order to minimize the number of passes through the pattern,
3765	// back-reference fixup is also performed here (adjusting
3766	// back-reference operands to point to the correct frame offsets).
3767	//
3768	//------------------------------------------------------------------------------
3769	void RegexCompile::stripNOPs() {
3770
3771	if (U_FAILURE(*fStatus)) {
3772	return;
3773	}
3774
3775	int32_t end = fRXPat->fCompiledPat->size();
3776	UVector32 deltas(end, *fStatus);
3777
3778	// Make a first pass over the code, computing the amount that things
3779	// will be offset at each location in the original code.
3780	int32_t loc;
3781	int32_t d = 0;
3782	for (loc=0; loc<end; loc++) {
3783	deltas.addElement(d, *fStatus);
3784	int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3785	if (URX_TYPE(op)((uint32_t)(op) >> 24) == URX_NOP) {
3786	d++;
3787	}
3788	}
3789
3790	UnicodeString caseStringBuffer;
3791
3792	// Make a second pass over the code, removing the NOPs by moving following
3793	// code up, and patching operands that refer to code locations that
3794	// are being moved. The array of offsets from the first step is used
3795	// to compute the new operand values.
3796	int32_t src;
3797	int32_t dst = 0;
3798	for (src=0; src<end; src++) {
3799	int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src);
3800	int32_t opType = URX_TYPE(op)((uint32_t)(op) >> 24);
3801	switch (opType) {
3802	case URX_NOP:
3803	break;
3804
3805	case URX_STATE_SAVE:
3806	case URX_JMP:
3807	case URX_CTR_LOOP:
3808	case URX_CTR_LOOP_NG:
3809	case URX_RELOC_OPRND:
3810	case URX_JMPX:
3811	case URX_JMP_SAV:
3812	case URX_JMP_SAV_X:
3813	// These are instructions with operands that refer to code locations.
3814	{
3815	int32_t operandAddress = URX_VAL(op)((op) & 0xffffff);
3816	U_ASSERT(operandAddress>=0 && operandAddress<deltas.size())(void)0;
3817	int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
3818	op = buildOp(opType, fixedOperandAddress);
3819	fRXPat->fCompiledPat->setElementAt(op, dst);
3820	dst++;
3821	break;
3822	}
3823
3824	case URX_BACKREF:
3825	case URX_BACKREF_I:
3826	{
3827	int32_t where = URX_VAL(op)((op) & 0xffffff);
3828	if (where > fRXPat->fGroupMap->size()) {
3829	error(U_REGEX_INVALID_BACK_REF);
3830	break;
3831	}
3832	where = fRXPat->fGroupMap->elementAti(where-1);
3833	op = buildOp(opType, where);
3834	fRXPat->fCompiledPat->setElementAt(op, dst);
3835	dst++;
3836
3837	fRXPat->fNeedsAltInput = TRUE1;
3838	break;
3839	}
3840	case URX_RESERVED_OP:
3841	case URX_RESERVED_OP_N:
3842	case URX_BACKTRACK:
3843	case URX_END:
3844	case URX_ONECHAR:
3845	case URX_STRING:
3846	case URX_STRING_LEN:
3847	case URX_START_CAPTURE:
3848	case URX_END_CAPTURE:
3849	case URX_STATIC_SETREF:
3850	case URX_STAT_SETREF_N:
3851	case URX_SETREF:
3852	case URX_DOTANY:
3853	case URX_FAIL:
3854	case URX_BACKSLASH_B:
3855	case URX_BACKSLASH_BU:
3856	case URX_BACKSLASH_G:
3857	case URX_BACKSLASH_X:
3858	case URX_BACKSLASH_Z:
3859	case URX_DOTANY_ALL:
3860	case URX_BACKSLASH_D:
3861	case URX_CARET:
3862	case URX_DOLLAR:
3863	case URX_CTR_INIT:
3864	case URX_CTR_INIT_NG:
3865	case URX_DOTANY_UNIX:
3866	case URX_STO_SP:
3867	case URX_LD_SP:
3868	case URX_STO_INP_LOC:
3869	case URX_LA_START:
3870	case URX_LA_END:
3871	case URX_ONECHAR_I:
3872	case URX_STRING_I:
3873	case URX_DOLLAR_M:
3874	case URX_CARET_M:
3875	case URX_CARET_M_UNIX:
3876	case URX_LB_START:
3877	case URX_LB_CONT:
3878	case URX_LB_END:
3879	case URX_LBN_CONT:
3880	case URX_LBN_END:
3881	case URX_LOOP_SR_I:
3882	case URX_LOOP_DOT_I:
3883	case URX_LOOP_C:
3884	case URX_DOLLAR_D:
3885	case URX_DOLLAR_MD:
3886	case URX_BACKSLASH_H:
3887	case URX_BACKSLASH_R:
3888	case URX_BACKSLASH_V:
3889	// These instructions are unaltered by the relocation.
3890	fRXPat->fCompiledPat->setElementAt(op, dst);
3891	dst++;
3892	break;
3893
3894	default:
3895	// Some op is unaccounted for.
3896	UPRV_UNREACHABLE_EXITabort();
3897	}
3898	}
3899
3900	fRXPat->fCompiledPat->setSize(dst);
3901	}
3902
3903
3904
3905
3906	//------------------------------------------------------------------------------
3907	//
3908	// Error Report a rule parse error.
3909	// Only report it if no previous error has been recorded.
3910	//
3911	//------------------------------------------------------------------------------
3912	void RegexCompile::error(UErrorCode e) {
3913	if (U_SUCCESS(*fStatus) \|\| e == U_MEMORY_ALLOCATION_ERROR) {
3914	*fStatus = e;
3915	// Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
3916	// API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
3917	// int64_t. If the values of the latter are out of range for the former,
3918	// set them to the appropriate "field not supported" values.
3919	if (fLineNum > 0x7FFFFFFF) {
3920	fParseErr->line = 0;
3921	fParseErr->offset = -1;
3922	} else if (fCharNum > 0x7FFFFFFF) {
3923	fParseErr->line = (int32_t)fLineNum;
3924	fParseErr->offset = -1;
3925	} else {
3926	fParseErr->line = (int32_t)fLineNum;
3927	fParseErr->offset = (int32_t)fCharNum;
3928	}
3929
3930	UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
3931
3932	// Fill in the context.
3933	// Note: extractBetween() pins supplied indices to the string bounds.
3934	uprv_memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext)):: memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext ));
3935	uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext)):: memset(fParseErr->postContext, 0, sizeof(fParseErr-> postContext));
3936	utext_extractutext_extract_71(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
3937	utext_extractutext_extract_71(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
3938	}
3939	}
3940
3941
3942	//
3943	// Assorted Unicode character constants.
3944	// Numeric because there is no portable way to enter them as literals.
3945	// (Think EBCDIC).
3946	//
3947	static const UChar chCR = 0x0d; // New lines, for terminating comments.
3948	static const UChar chLF = 0x0a; // Line Feed
3949	static const UChar chPound = 0x23; // '#', introduces a comment.
3950	static const UChar chDigit0 = 0x30; // '0'
3951	static const UChar chDigit7 = 0x37; // '9'
3952	static const UChar chColon = 0x3A; // ':'
3953	static const UChar chE = 0x45; // 'E'
3954	static const UChar chQ = 0x51; // 'Q'
3955	//static const UChar chN = 0x4E; // 'N'
3956	static const UChar chP = 0x50; // 'P'
3957	static const UChar chBackSlash = 0x5c; // '\' introduces a char escape
3958	//static const UChar chLBracket = 0x5b; // '['
3959	static const UChar chRBracket = 0x5d; // ']'
3960	static const UChar chUp = 0x5e; // '^'
3961	static const UChar chLowerP = 0x70;
3962	static const UChar chLBrace = 0x7b; // '{'
3963	static const UChar chRBrace = 0x7d; // '}'
3964	static const UChar chNEL = 0x85; // NEL newline variant
3965	static const UChar chLS = 0x2028; // Unicode Line Separator
3966
3967
3968	//------------------------------------------------------------------------------
3969	//
3970	// nextCharLL Low Level Next Char from the regex pattern.
3971	// Get a char from the string, keep track of input position
3972	// for error reporting.
3973	//
3974	//------------------------------------------------------------------------------
3975	UChar32 RegexCompile::nextCharLL() {
3976	UChar32 ch;
3977
3978	if (fPeekChar != -1) {
3979	ch = fPeekChar;
3980	fPeekChar = -1;
3981	return ch;
3982	}
3983
3984	// assume we're already in the right place
3985	ch = UTEXT_NEXT32(fRXPat->fPattern)((fRXPat->fPattern)->chunkOffset < (fRXPat->fPattern )->chunkLength && ((fRXPat->fPattern)->chunkContents )[(fRXPat->fPattern)->chunkOffset]<0xd800 ? ((fRXPat ->fPattern)->chunkContents)[((fRXPat->fPattern)-> chunkOffset)++] : utext_next32_71(fRXPat->fPattern));
3986	if (ch == U_SENTINEL(-1)) {
3987	return ch;
3988	}
3989
3990	if (ch == chCR \|\|
3991	ch == chNEL \|\|
3992	ch == chLS \|\|
3993	(ch == chLF && fLastChar != chCR)) {
3994	// Character is starting a new line. Bump up the line number, and
3995	// reset the column to 0.
3996	fLineNum++;
3997	fCharNum=0;
3998	}
3999	else {
4000	// Character is not starting a new line. Except in the case of a
4001	// LF following a CR, increment the column position.
4002	if (ch != chLF) {
4003	fCharNum++;
4004	}
4005	}
4006	fLastChar = ch;
4007	return ch;
4008	}
4009
4010	//------------------------------------------------------------------------------
4011	//
4012	// peekCharLL Low Level Character Scanning, sneak a peek at the next
4013	// character without actually getting it.
4014	//
4015	//------------------------------------------------------------------------------
4016	UChar32 RegexCompile::peekCharLL() {
4017	if (fPeekChar == -1) {
4018	fPeekChar = nextCharLL();
4019	}
4020	return fPeekChar;
4021	}
4022
4023
4024	//------------------------------------------------------------------------------
4025	//
4026	// nextChar for pattern scanning. At this level, we handle stripping
4027	// out comments and processing some backslash character escapes.
4028	// The rest of the pattern grammar is handled at the next level up.
4029	//
4030	//------------------------------------------------------------------------------
4031	void RegexCompile::nextChar(RegexPatternChar &c) {
4032	tailRecursion:
4033	fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern)((fRXPat->fPattern)->chunkOffset <= (fRXPat->fPattern )->nativeIndexingLimit? (fRXPat->fPattern)->chunkNativeStart +(fRXPat->fPattern)->chunkOffset : (fRXPat->fPattern )->pFuncs->mapOffsetToNative(fRXPat->fPattern));
4034	c.fChar = nextCharLL();
4035	c.fQuoted = FALSE0;
4036
4037	if (fQuoteMode) {
4038	c.fQuoted = TRUE1;
4039	if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) \|\|
4040	c.fChar == (UChar32)-1) {
4041	fQuoteMode = FALSE0; // Exit quote mode,
4042	nextCharLL(); // discard the E
4043	// nextChar(c); // recurse to get the real next char
4044	goto tailRecursion; // Note: fuzz testing produced testcases that
4045	// resulted in stack overflow here.
4046	}
4047	}
4048	else if (fInBackslashQuote) {
4049	// The current character immediately follows a '\'
4050	// Don't check for any further escapes, just return it as-is.
4051	// Don't set c.fQuoted, because that would prevent the state machine from
4052	// dispatching on the character.
4053	fInBackslashQuote = FALSE0;
4054	}
4055	else
4056	{
4057	// We are not in a \Q quoted region \E of the source.
4058	//
4059	if (fModeFlags & UREGEX_COMMENTS) {
4060	//
4061	// We are in free-spacing and comments mode.
4062	// Scan through any white space and comments, until we
4063	// reach a significant character or the end of input.
4064	for (;;) {
4065	if (c.fChar == (UChar32)-1) {
4066	break; // End of Input
4067	}
4068	if (c.fChar == chPound && fEOLComments == TRUE1) {
4069	// Start of a comment. Consume the rest of it, until EOF or a new line
4070	for (;;) {
4071	c.fChar = nextCharLL();
4072	if (c.fChar == (UChar32)-1 \|\| // EOF
4073	c.fChar == chCR \|\|
4074	c.fChar == chLF \|\|
4075	c.fChar == chNEL \|\|
4076	c.fChar == chLS) {
4077	break;
4078	}
4079	}
4080	}
4081	// TODO: check what Java & Perl do with non-ASCII white spaces. Ticket 6061.
4082	if (PatternProps::isWhiteSpace(c.fChar) == FALSE0) {
4083	break;
4084	}
4085	c.fChar = nextCharLL();
4086	}
4087	}
4088
4089	//
4090	// check for backslash escaped characters.
4091	//
4092	if (c.fChar == chBackSlash) {
4093	int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern)((fRXPat->fPattern)->chunkOffset <= (fRXPat->fPattern )->nativeIndexingLimit? (fRXPat->fPattern)->chunkNativeStart +(fRXPat->fPattern)->chunkOffset : (fRXPat->fPattern )->pFuncs->mapOffsetToNative(fRXPat->fPattern));
4094	if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
4095	//
4096	// A '\' sequence that is handled by ICU's standard unescapeAt function.
4097	// Includes \uxxxx, \n, \r, many others.
4098	// Return the single equivalent character.
4099	//
4100	nextCharLL(); // get & discard the peeked char.
4101	c.fQuoted = TRUE1;
4102
4103	if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)((0==((fRXPat->fPattern)->chunkNativeStart))&&( (fPatternLength)==((fRXPat->fPattern)->chunkNativeLimit ))&&((fPatternLength)==((fRXPat->fPattern)->nativeIndexingLimit )))) {
4104	int32_t endIndex = (int32_t)pos;
4105	c.fChar = u_unescapeAtu_unescapeAt_71(uregex_ucstr_unescape_charAturegex_ucstr_unescape_charAt_71, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents);
4106
4107	if (endIndex == pos) {
4108	error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4109	}
4110	fCharNum += endIndex - pos;
4111	UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex)do { int64_t __offset = (endIndex) - (fRXPat->fPattern)-> chunkNativeStart; if (__offset>=0 && __offset<( int64_t)(fRXPat->fPattern)->nativeIndexingLimit && (fRXPat->fPattern)->chunkContents[__offset]<0xdc00) { (fRXPat->fPattern)->chunkOffset=(int32_t)__offset; } else { utext_setNativeIndex_71((fRXPat->fPattern), (endIndex )); } } while (false);
4112	} else {
4113	int32_t offset = 0;
4114	struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern){ (fRXPat->fPattern), -1 };
4115
4116	UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos)do { int64_t __offset = (pos) - (fRXPat->fPattern)->chunkNativeStart ; if (__offset>=0 && __offset<(int64_t)(fRXPat-> fPattern)->nativeIndexingLimit && (fRXPat->fPattern )->chunkContents[__offset]<0xdc00) { (fRXPat->fPattern )->chunkOffset=(int32_t)__offset; } else { utext_setNativeIndex_71 ((fRXPat->fPattern), (pos)); } } while (false);
4117	c.fChar = u_unescapeAtu_unescapeAt_71(uregex_utext_unescape_charAturegex_utext_unescape_charAt_71, &offset, INT32_MAX(2147483647), &context);
4118
4119	if (offset == 0) {
4120	error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4121	} else if (context.lastOffset == offset) {
4122	UTEXT_PREVIOUS32(fRXPat->fPattern)((fRXPat->fPattern)->chunkOffset > 0 && (fRXPat ->fPattern)->chunkContents[(fRXPat->fPattern)->chunkOffset -1] < 0xd800 ? (fRXPat->fPattern)->chunkContents[--( (fRXPat->fPattern)->chunkOffset)] : utext_previous32_71 (fRXPat->fPattern));
4123	} else if (context.lastOffset != offset-1) {
4124	utext_moveIndex32utext_moveIndex32_71(fRXPat->fPattern, offset - context.lastOffset - 1);
4125	}
4126	fCharNum += offset;
4127	}
4128	}
4129	else if (peekCharLL() == chDigit0) {
4130	// Octal Escape, using Java Regexp Conventions
4131	// which are \0 followed by 1-3 octal digits.
4132	// Different from ICU Unescape handling of Octal, which does not
4133	// require the leading 0.
4134	// Java also has the convention of only consuming 2 octal digits if
4135	// the three digit number would be > 0xff
4136	//
4137	c.fChar = 0;
4138	nextCharLL(); // Consume the initial 0.
4139	int index;
4140	for (index=0; index<3; index++) {
4141	int32_t ch = peekCharLL();
4142	if (ch<chDigit0 \|\| ch>chDigit7) {
4143	if (index==0) {
4144	// \0 is not followed by any octal digits.
4145	error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4146	}
4147	break;
4148	}
4149	c.fChar <<= 3;
4150	c.fChar += ch&7;
4151	if (c.fChar <= 255) {
4152	nextCharLL();
4153	} else {
4154	// The last digit made the number too big. Forget we saw it.
4155	c.fChar >>= 3;
4156	}
4157	}
4158	c.fQuoted = TRUE1;
4159	}
4160	else if (peekCharLL() == chQ) {
4161	// "\Q" enter quote mode, which will continue until "\E"
4162	fQuoteMode = TRUE1;
4163	nextCharLL(); // discard the 'Q'.
4164	// nextChar(c); // recurse to get the real next char.
4165	goto tailRecursion; // Note: fuzz testing produced test cases that
4166	// resulted in stack overflow here.
4167	}
4168	else
4169	{
4170	// We are in a '\' escape that will be handled by the state table scanner.
4171	// Just return the backslash, but remember that the following char is to
4172	// be taken literally.
4173	fInBackslashQuote = TRUE1;
4174	}
4175	}
4176	}
4177
4178	// re-enable # to end-of-line comments, in case they were disabled.
4179	// They are disabled by the parser upon seeing '(?', but this lasts for
4180	// the fetching of the next character only.
4181	fEOLComments = TRUE1;
4182
4183	// putc(c.fChar, stdout);
4184	}
4185
4186
4187
4188	//------------------------------------------------------------------------------
4189	//
4190	// scanNamedChar
4191	// Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
4192	//
4193	// The scan position will be at the 'N'. On return
4194	// the scan position should be just after the '}'
4195	//
4196	// Return the UChar32
4197	//
4198	//------------------------------------------------------------------------------
4199	UChar32 RegexCompile::scanNamedChar() {
4200	if (U_FAILURE(*fStatus)) {
4201	return 0;
4202	}
4203
4204	nextChar(fC);
4205	if (fC.fChar != chLBrace) {
4206	error(U_REGEX_PROPERTY_SYNTAX);
4207	return 0;
4208	}
4209
4210	UnicodeString charName;
4211	for (;;) {
4212	nextChar(fC);
4213	if (fC.fChar == chRBrace) {
4214	break;
4215	}
4216	if (fC.fChar == -1) {
4217	error(U_REGEX_PROPERTY_SYNTAX);
4218	return 0;
4219	}
4220	charName.append(fC.fChar);
4221	}
4222
4223	char name[100];
4224	if (!uprv_isInvariantUStringuprv_isInvariantUString_71(charName.getBuffer(), charName.length()) \|\|
4225	(uint32_t)charName.length()>=sizeof(name)) {
4226	// All Unicode character names have only invariant characters.
4227	// The API to get a character, given a name, accepts only char *, forcing us to convert,
4228	// which requires this error check
4229	error(U_REGEX_PROPERTY_SYNTAX);
4230	return 0;
4231	}
4232	charName.extract(0, charName.length(), name, sizeof(name), US_INVicu::UnicodeString::kInvariant);
4233
4234	UChar32 theChar = u_charFromNameu_charFromName_71(U_UNICODE_CHAR_NAME, name, fStatus);
4235	if (U_FAILURE(*fStatus)) {
4236	error(U_REGEX_PROPERTY_SYNTAX);
4237	}
4238
4239	nextChar(fC); // Continue overall regex pattern processing with char after the '}'
4240	return theChar;
4241	}
4242
4243	//------------------------------------------------------------------------------
4244	//
4245	// scanProp Construct a UnicodeSet from the text at the current scan
4246	// position, which will be of the form \p{whaterver}
4247	//
4248	// The scan position will be at the 'p' or 'P'. On return
4249	// the scan position should be just after the '}'
4250	//
4251	// Return a UnicodeSet, constructed from the \P pattern,
4252	// or NULL if the pattern is invalid.
4253	//
4254	//------------------------------------------------------------------------------
4255	UnicodeSet *RegexCompile::scanProp() {
4256	UnicodeSet *uset = NULL__null;
4257
4258	if (U_FAILURE(*fStatus)) {
4259	return NULL__null;
4260	}
4261	(void)chLowerP; // Suppress compiler unused variable warning.
4262	U_ASSERT(fC.fChar == chLowerP \|\| fC.fChar == chP)(void)0;
4263	UBool negated = (fC.fChar == chP);
4264
4265	UnicodeString propertyName;
4266	nextChar(fC);
4267	if (fC.fChar != chLBrace) {
4268	error(U_REGEX_PROPERTY_SYNTAX);
4269	return NULL__null;
4270	}
4271	for (;;) {
4272	nextChar(fC);
4273	if (fC.fChar == chRBrace) {
4274	break;
4275	}
4276	if (fC.fChar == -1) {
4277	// Hit the end of the input string without finding the closing '}'
4278	error(U_REGEX_PROPERTY_SYNTAX);
4279	return NULL__null;
4280	}
4281	propertyName.append(fC.fChar);
4282	}
4283	uset = createSetForProperty(propertyName, negated);
4284	nextChar(fC); // Move input scan to position following the closing '}'
4285	return uset;
4286	}
4287
4288	//------------------------------------------------------------------------------
4289	//
4290	// scanPosixProp Construct a UnicodeSet from the text at the current scan
4291	// position, which is expected be of the form [:property expression:]
4292	//
4293	// The scan position will be at the opening ':'. On return
4294	// the scan position must be on the closing ']'
4295	//
4296	// Return a UnicodeSet constructed from the pattern,
4297	// or NULL if this is not a valid POSIX-style set expression.
4298	// If not a property expression, restore the initial scan position
4299	// (to the opening ':')
4300	//
4301	// Note: the opening '[:' is not sufficient to guarantee that
4302	// this is a [:property:] expression.
4303	// [:'+=,] is a perfectly good ordinary set expression that
4304	// happens to include ':' as one of its characters.
4305	//
4306	//------------------------------------------------------------------------------
4307	UnicodeSet *RegexCompile::scanPosixProp() {
4308	UnicodeSet *uset = NULL__null;
4309
4310	if (U_FAILURE(*fStatus)) {
4311	return NULL__null;
4312	}
4313
4314	U_ASSERT(fC.fChar == chColon)(void)0;
4315
4316	// Save the scanner state.
4317	// TODO: move this into the scanner, with the state encapsulated in some way. Ticket 6062
4318	int64_t savedScanIndex = fScanIndex;
4319	int64_t savedNextIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern)((fRXPat->fPattern)->chunkOffset <= (fRXPat->fPattern )->nativeIndexingLimit? (fRXPat->fPattern)->chunkNativeStart +(fRXPat->fPattern)->chunkOffset : (fRXPat->fPattern )->pFuncs->mapOffsetToNative(fRXPat->fPattern));
4320	UBool savedQuoteMode = fQuoteMode;
4321	UBool savedInBackslashQuote = fInBackslashQuote;
4322	UBool savedEOLComments = fEOLComments;
4323	int64_t savedLineNum = fLineNum;
4324	int64_t savedCharNum = fCharNum;
4325	UChar32 savedLastChar = fLastChar;
4326	UChar32 savedPeekChar = fPeekChar;
4327	RegexPatternChar savedfC = fC;
4328
4329	// Scan for a closing ]. A little tricky because there are some perverse
4330	// edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expression,
4331	// ending on the second closing ].
4332
4333	UnicodeString propName;
4334	UBool negated = FALSE0;
4335
4336	// Check for and consume the '^' in a negated POSIX property, e.g. [:^Letter:]
4337	nextChar(fC);
4338	if (fC.fChar == chUp) {
4339	negated = TRUE1;
4340	nextChar(fC);
4341	}
4342
4343	// Scan for the closing ":]", collecting the property name along the way.
4344	UBool sawPropSetTerminator = FALSE0;
4345	for (;;) {
4346	propName.append(fC.fChar);
4347	nextChar(fC);
4348	if (fC.fQuoted \|\| fC.fChar == -1) {
4349	// Escaped characters or end of input - either says this isn't a [:Property:]
4350	break;
4351	}
4352	if (fC.fChar == chColon) {
4353	nextChar(fC);
4354	if (fC.fChar == chRBracket) {
4355	sawPropSetTerminator = TRUE1;
4356	}
4357	break;
4358	}
4359	}
4360
4361	if (sawPropSetTerminator) {
4362	uset = createSetForProperty(propName, negated);
4363	}
4364	else
4365	{
4366	// No closing ":]".
4367	// Restore the original scan position.
4368	// The main scanner will retry the input as a normal set expression,
4369	// not a [:Property:] expression.
4370	fScanIndex = savedScanIndex;
4371	fQuoteMode = savedQuoteMode;
4372	fInBackslashQuote = savedInBackslashQuote;
4373	fEOLComments = savedEOLComments;
4374	fLineNum = savedLineNum;
4375	fCharNum = savedCharNum;
4376	fLastChar = savedLastChar;
4377	fPeekChar = savedPeekChar;
4378	fC = savedfC;
4379	UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex)do { int64_t __offset = (savedNextIndex) - (fRXPat->fPattern )->chunkNativeStart; if (__offset>=0 && __offset <(int64_t)(fRXPat->fPattern)->nativeIndexingLimit && (fRXPat->fPattern)->chunkContents[__offset]<0xdc00) { (fRXPat->fPattern)->chunkOffset=(int32_t)__offset; } else { utext_setNativeIndex_71((fRXPat->fPattern), (savedNextIndex )); } } while (false);
4380	}
4381	return uset;
4382	}
4383
4384	static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
4385	set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
4386	addCategory(set, U_GC_CF_MASK((uint32_t)1<<(U_FORMAT_CHAR)), ec);
4387	}
4388
4389	//
4390	// Create a Unicode Set from a Unicode Property expression.
4391	// This is common code underlying both \p{...} and [:...:] expressions.
4392	// Includes trying the Java "properties" that aren't supported as
4393	// normal ICU UnicodeSet properties
4394	//
4395	UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
4396
4397	if (U_FAILURE(*fStatus)) {
4398	return nullptr;
4399	}
4400	LocalPointer<UnicodeSet> set;
4401	UErrorCode status = U_ZERO_ERROR;
4402
4403	do { // non-loop, exists to allow breaks from the block.
4404	//
4405	// First try the property as we received it
4406	//
4407	UnicodeString setExpr;
4408	uint32_t usetFlags = 0;
4409	setExpr.append(u"[\\p{", -1);
4410	setExpr.append(propName);
4411	setExpr.append(u"}]", -1);
4412	if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
4413	usetFlags \|= USET_CASE_INSENSITIVE;
4414	}
4415	set.adoptInsteadAndCheckErrorCode(new UnicodeSet(setExpr, usetFlags, NULL__null, status), status);
4416	if (U_SUCCESS(status) \|\| status == U_MEMORY_ALLOCATION_ERROR) {
4417	break;
4418	}
4419
4420	//
4421	// The incoming property wasn't directly recognized by ICU.
4422
4423	// Check [:word:] and [:all:]. These are not recognized as a properties by ICU UnicodeSet.
4424	// Java accepts 'word' with mixed case.
4425	// Java accepts 'all' only in all lower case.
4426
4427	status = U_ZERO_ERROR;
4428	if (propName.caseCompare(u"word", -1, 0) == 0) {
4429	set.adoptInsteadAndCheckErrorCode(
4430	RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].cloneAsThawed(), status);
4431	break;
4432	}
4433	if (propName.compare(u"all", -1) == 0) {
4434	set.adoptInsteadAndCheckErrorCode(new UnicodeSet(0, 0x10ffff), status);
4435	break;
4436	}
4437
4438
4439	// Do Java InBlock expressions
4440	//
4441	UnicodeString mPropName = propName;
4442	if (mPropName.startsWith(u"In", 2) && mPropName.length() >= 3) {
4443	status = U_ZERO_ERROR;
4444	set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
4445	if (U_FAILURE(status)) {
4446	break;
4447	}
4448	UnicodeString blockName(mPropName, 2); // Property with the leading "In" removed.
4449	set->applyPropertyAlias(UnicodeString(u"Block"), blockName, status);
4450	break;
4451	}
4452
4453	// Check for the Java form "IsBooleanPropertyValue", which we will recast
4454	// as "BooleanPropertyValue". The property value can be either a
4455	// a General Category or a Script Name.
4456
4457	if (propName.startsWith(u"Is", 2) && propName.length()>=3) {
4458	mPropName.remove(0, 2); // Strip the "Is"
4459	if (mPropName.indexOf(u'=') >= 0) {
4460	// Reject any "Is..." property expression containing an '=', that is,
4461	// any non-binary property expression.
4462	status = U_REGEX_PROPERTY_SYNTAX;
4463	break;
4464	}
4465
4466	if (mPropName.caseCompare(u"assigned", -1, 0) == 0) {
4467	mPropName.setTo(u"unassigned", -1);
4468	negated = !negated;
4469	} else if (mPropName.caseCompare(u"TitleCase", -1, 0) == 0) {
4470	mPropName.setTo(u"Titlecase_Letter", -1);
4471	}
4472
4473	mPropName.insert(0, u"[\\p{", -1);
4474	mPropName.append(u"}]", -1);
4475	set.adoptInsteadAndCheckErrorCode(new UnicodeSet(mPropName, *fStatus), status);
4476
4477	if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4478	set->closeOver(USET_CASE_INSENSITIVE);
4479	}
4480	break;
4481
4482	}
4483
4484	if (propName.startsWith(u"java", -1)) {
4485	status = U_ZERO_ERROR;
4486	set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
4487	if (U_FAILURE(status)) {
4488	break;
4489	}
4490	//
4491	// Try the various Java specific properties.
4492	// These all begin with "java"
4493	//
4494	if (propName.compare(u"javaDefined", -1) == 0) {
4495	addCategory(set.getAlias(), U_GC_CN_MASK((uint32_t)1<<(U_GENERAL_OTHER_TYPES)), status);
4496	set->complement();
4497	}
4498	else if (propName.compare(u"javaDigit", -1) == 0) {
4499	addCategory(set.getAlias(), U_GC_ND_MASK((uint32_t)1<<(U_DECIMAL_DIGIT_NUMBER)), status);
4500	}
4501	else if (propName.compare(u"javaIdentifierIgnorable", -1) == 0) {
4502	addIdentifierIgnorable(set.getAlias(), status);
4503	}
4504	else if (propName.compare(u"javaISOControl", -1) == 0) {
4505	set->add(0, 0x1F).add(0x7F, 0x9F);
4506	}
4507	else if (propName.compare(u"javaJavaIdentifierPart", -1) == 0) {
4508	addCategory(set.getAlias(), U_GC_L_MASK(((uint32_t)1<<(U_UPPERCASE_LETTER))\|((uint32_t)1<< (U_LOWERCASE_LETTER))\|((uint32_t)1<<(U_TITLECASE_LETTER ))\|((uint32_t)1<<(U_MODIFIER_LETTER))\|((uint32_t)1<< (U_OTHER_LETTER))), status);
4509	addCategory(set.getAlias(), U_GC_SC_MASK((uint32_t)1<<(U_CURRENCY_SYMBOL)), status);
4510	addCategory(set.getAlias(), U_GC_PC_MASK((uint32_t)1<<(U_CONNECTOR_PUNCTUATION)), status);
4511	addCategory(set.getAlias(), U_GC_ND_MASK((uint32_t)1<<(U_DECIMAL_DIGIT_NUMBER)), status);
4512	addCategory(set.getAlias(), U_GC_NL_MASK((uint32_t)1<<(U_LETTER_NUMBER)), status);
4513	addCategory(set.getAlias(), U_GC_MC_MASK((uint32_t)1<<(U_COMBINING_SPACING_MARK)), status);
4514	addCategory(set.getAlias(), U_GC_MN_MASK((uint32_t)1<<(U_NON_SPACING_MARK)), status);
4515	addIdentifierIgnorable(set.getAlias(), status);
4516	}
4517	else if (propName.compare(u"javaJavaIdentifierStart", -1) == 0) {
4518	addCategory(set.getAlias(), U_GC_L_MASK(((uint32_t)1<<(U_UPPERCASE_LETTER))\|((uint32_t)1<< (U_LOWERCASE_LETTER))\|((uint32_t)1<<(U_TITLECASE_LETTER ))\|((uint32_t)1<<(U_MODIFIER_LETTER))\|((uint32_t)1<< (U_OTHER_LETTER))), status);
4519	addCategory(set.getAlias(), U_GC_NL_MASK((uint32_t)1<<(U_LETTER_NUMBER)), status);
4520	addCategory(set.getAlias(), U_GC_SC_MASK((uint32_t)1<<(U_CURRENCY_SYMBOL)), status);
4521	addCategory(set.getAlias(), U_GC_PC_MASK((uint32_t)1<<(U_CONNECTOR_PUNCTUATION)), status);
4522	}
4523	else if (propName.compare(u"javaLetter", -1) == 0) {
4524	addCategory(set.getAlias(), U_GC_L_MASK(((uint32_t)1<<(U_UPPERCASE_LETTER))\|((uint32_t)1<< (U_LOWERCASE_LETTER))\|((uint32_t)1<<(U_TITLECASE_LETTER ))\|((uint32_t)1<<(U_MODIFIER_LETTER))\|((uint32_t)1<< (U_OTHER_LETTER))), status);
4525	}
4526	else if (propName.compare(u"javaLetterOrDigit", -1) == 0) {
4527	addCategory(set.getAlias(), U_GC_L_MASK(((uint32_t)1<<(U_UPPERCASE_LETTER))\|((uint32_t)1<< (U_LOWERCASE_LETTER))\|((uint32_t)1<<(U_TITLECASE_LETTER ))\|((uint32_t)1<<(U_MODIFIER_LETTER))\|((uint32_t)1<< (U_OTHER_LETTER))), status);
4528	addCategory(set.getAlias(), U_GC_ND_MASK((uint32_t)1<<(U_DECIMAL_DIGIT_NUMBER)), status);
4529	}
4530	else if (propName.compare(u"javaLowerCase", -1) == 0) {
4531	addCategory(set.getAlias(), U_GC_LL_MASK((uint32_t)1<<(U_LOWERCASE_LETTER)), status);
4532	}
4533	else if (propName.compare(u"javaMirrored", -1) == 0) {
4534	set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, status);
4535	}
4536	else if (propName.compare(u"javaSpaceChar", -1) == 0) {
4537	addCategory(set.getAlias(), U_GC_Z_MASK(((uint32_t)1<<(U_SPACE_SEPARATOR))\|((uint32_t)1<< (U_LINE_SEPARATOR))\|((uint32_t)1<<(U_PARAGRAPH_SEPARATOR ))), status);
4538	}
4539	else if (propName.compare(u"javaSupplementaryCodePoint", -1) == 0) {
4540	set->add(0x10000, UnicodeSet::MAX_VALUE);
4541	}
4542	else if (propName.compare(u"javaTitleCase", -1) == 0) {
4543	addCategory(set.getAlias(), U_GC_LT_MASK((uint32_t)1<<(U_TITLECASE_LETTER)), status);
4544	}
4545	else if (propName.compare(u"javaUnicodeIdentifierStart", -1) == 0) {
4546	addCategory(set.getAlias(), U_GC_L_MASK(((uint32_t)1<<(U_UPPERCASE_LETTER))\|((uint32_t)1<< (U_LOWERCASE_LETTER))\|((uint32_t)1<<(U_TITLECASE_LETTER ))\|((uint32_t)1<<(U_MODIFIER_LETTER))\|((uint32_t)1<< (U_OTHER_LETTER))), status);
4547	addCategory(set.getAlias(), U_GC_NL_MASK((uint32_t)1<<(U_LETTER_NUMBER)), status);
4548	}
4549	else if (propName.compare(u"javaUnicodeIdentifierPart", -1) == 0) {
4550	addCategory(set.getAlias(), U_GC_L_MASK(((uint32_t)1<<(U_UPPERCASE_LETTER))\|((uint32_t)1<< (U_LOWERCASE_LETTER))\|((uint32_t)1<<(U_TITLECASE_LETTER ))\|((uint32_t)1<<(U_MODIFIER_LETTER))\|((uint32_t)1<< (U_OTHER_LETTER))), status);
4551	addCategory(set.getAlias(), U_GC_PC_MASK((uint32_t)1<<(U_CONNECTOR_PUNCTUATION)), status);
4552	addCategory(set.getAlias(), U_GC_ND_MASK((uint32_t)1<<(U_DECIMAL_DIGIT_NUMBER)), status);
4553	addCategory(set.getAlias(), U_GC_NL_MASK((uint32_t)1<<(U_LETTER_NUMBER)), status);
4554	addCategory(set.getAlias(), U_GC_MC_MASK((uint32_t)1<<(U_COMBINING_SPACING_MARK)), status);
4555	addCategory(set.getAlias(), U_GC_MN_MASK((uint32_t)1<<(U_NON_SPACING_MARK)), status);
4556	addIdentifierIgnorable(set.getAlias(), status);
4557	}
4558	else if (propName.compare(u"javaUpperCase", -1) == 0) {
4559	addCategory(set.getAlias(), U_GC_LU_MASK((uint32_t)1<<(U_UPPERCASE_LETTER)), status);
4560	}
4561	else if (propName.compare(u"javaValidCodePoint", -1) == 0) {
4562	set->add(0, UnicodeSet::MAX_VALUE);
4563	}
4564	else if (propName.compare(u"javaWhitespace", -1) == 0) {
4565	addCategory(set.getAlias(), U_GC_Z_MASK(((uint32_t)1<<(U_SPACE_SEPARATOR))\|((uint32_t)1<< (U_LINE_SEPARATOR))\|((uint32_t)1<<(U_PARAGRAPH_SEPARATOR ))), status);
4566	set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
4567	set->add(9, 0x0d).add(0x1c, 0x1f);
4568	} else {
4569	status = U_REGEX_PROPERTY_SYNTAX;
4570	}
4571
4572	if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4573	set->closeOver(USET_CASE_INSENSITIVE);
4574	}
4575	break;
4576	}
4577
4578	// Unrecognized property. ICU didn't like it as it was, and none of the Java compatibility
4579	// extensions matched it.
4580	status = U_REGEX_PROPERTY_SYNTAX;
4581	} while (false); // End of do loop block. Code above breaks out of the block on success or hard failure.
4582
4583	if (U_SUCCESS(status)) {
4584	// ICU 70 adds emoji properties of strings, but as long as Java does not say how to
4585	// deal with properties of strings and character classes with strings, we ignore them.
4586	// Just in case something downstream might stumble over the strings,
4587	// we remove them from the set.
4588	// Note that when we support strings, the complement of a property (as with \P)
4589	// should be implemented as .complement().removeAllStrings() (code point complement).
4590	set->removeAllStrings();
4591	U_ASSERT(set.isValid())(void)0;
4592	if (negated) {
4593	set->complement();
4594	}
4595	return set.orphan();
4596	} else {
4597	if (status == U_ILLEGAL_ARGUMENT_ERROR) {
4598	status = U_REGEX_PROPERTY_SYNTAX;
4599	}
4600	error(status);
4601	return nullptr;
4602	}
4603	}
4604
4605
4606	//
4607	// SetEval Part of the evaluation of [set expressions].
4608	// Perform any pending (stacked) operations with precedence
4609	// equal or greater to that of the next operator encountered
4610	// in the expression.
4611	//
4612	void RegexCompile::setEval(int32_t nextOp) {
4613	UnicodeSet *rightOperand = NULL__null;
4614	UnicodeSet *leftOperand = NULL__null;
4615	for (;;) {
4616	U_ASSERT(fSetOpStack.empty()==FALSE)(void)0;
4617	int32_t pendingSetOperation = fSetOpStack.peeki();
4618	if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) {
4619	break;
4620	}
4621	fSetOpStack.popi();
4622	U_ASSERT(fSetStack.empty() == FALSE)(void)0;
4623	rightOperand = (UnicodeSet *)fSetStack.peek();
4624	// ICU 70 adds emoji properties of strings, but createSetForProperty() removes all strings
4625	// (see comments there).
4626	// We also do not yet support string literals in character classes,
4627	// so there should not be any strings.
4628	// Note that when we support strings, the complement of a set (as with ^ or \P)
4629	// should be implemented as .complement().removeAllStrings() (code point complement).
4630	U_ASSERT(!rightOperand->hasStrings())(void)0;
4631	switch (pendingSetOperation) {
4632	case setNegation:
4633	rightOperand->complement();
4634	break;
4635	case setCaseClose:
4636	// TODO: need a simple close function. Ticket 6065
4637	rightOperand->closeOver(USET_CASE_INSENSITIVE);
4638	rightOperand->removeAllStrings();
4639	break;
4640	case setDifference1:
4641	case setDifference2:
4642	fSetStack.pop();
4643	leftOperand = (UnicodeSet *)fSetStack.peek();
4644	leftOperand->removeAll(*rightOperand);
4645	delete rightOperand;
4646	break;
4647	case setIntersection1:
4648	case setIntersection2:
4649	fSetStack.pop();
4650	leftOperand = (UnicodeSet *)fSetStack.peek();
4651	leftOperand->retainAll(*rightOperand);
4652	delete rightOperand;
4653	break;
4654	case setUnion:
4655	fSetStack.pop();
4656	leftOperand = (UnicodeSet *)fSetStack.peek();
4657	leftOperand->addAll(*rightOperand);
4658	delete rightOperand;
4659	break;
4660	default:
4661	UPRV_UNREACHABLE_EXITabort();
4662	}
4663	}
4664	}
4665
4666	void RegexCompile::setPushOp(int32_t op) {
4667	setEval(op);
4668	fSetOpStack.push(op, *fStatus);
4669	LocalPointer<UnicodeSet> lpSet(new UnicodeSet(), *fStatus);
4670	fSetStack.push(lpSet.orphan(), *fStatus);
4671	}
4672
4673	U_NAMESPACE_END}
4674	#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
4675