Bug Summary

File:out/../deps/icu-small/source/i18n/regexcmp.cpp
Warning:line 2273, column 21
Value stored to 'saveOp' during its initialization is never read

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-unknown-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name regexcmp.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -pic-is-pie -mframe-pointer=all -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/home/maurizio/node-v18.6.0/out -resource-dir /usr/local/lib/clang/16.0.0 -D V8_DEPRECATION_WARNINGS -D V8_IMMINENT_DEPRECATION_WARNINGS -D _GLIBCXX_USE_CXX11_ABI=1 -D NODE_OPENSSL_CONF_NAME=nodejs_conf -D NODE_OPENSSL_HAS_QUIC -D __STDC_FORMAT_MACROS -D OPENSSL_NO_PINSHARED -D OPENSSL_THREADS -D U_COMMON_IMPLEMENTATION=1 -D U_I18N_IMPLEMENTATION=1 -D U_IO_IMPLEMENTATION=1 -D U_TOOLUTIL_IMPLEMENTATION=1 -D U_ATTRIBUTE_DEPRECATED= -D _CRT_SECURE_NO_DEPRECATE= -D U_STATIC_IMPLEMENTATION=1 -D UCONFIG_NO_SERVICE=1 -D U_ENABLE_DYLOAD=0 -D U_HAVE_STD_STRING=1 -D UCONFIG_NO_BREAK_ITERATION=0 -I ../deps/icu-small/source/common -I ../deps/icu-small/source/i18n -I ../deps/icu-small/source/tools/toolutil -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../include/c++/8 -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../include/c++/8/x86_64-redhat-linux -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../include/c++/8/backward -internal-isystem /usr/local/lib/clang/16.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../x86_64-redhat-linux/include -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -Wno-unused-parameter -Wno-deprecated-declarations -Wno-strict-aliasing -std=gnu++17 -fdeprecated-macro -fdebug-compilation-dir=/home/maurizio/node-v18.6.0/out -ferror-limit 19 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2022-08-22-142216-507842-1 -x c++ ../deps/icu-small/source/i18n/regexcmp.cpp
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3//
4// file: regexcmp.cpp
5//
6// Copyright (C) 2002-2016 International Business Machines Corporation and others.
7// All Rights Reserved.
8//
9// This file contains the ICU regular expression compiler, which is responsible
10// for processing a regular expression pattern into the compiled form that
11// is used by the match finding engine.
12//
13
14#include "unicode/utypes.h"
15
16#if !UCONFIG_NO_REGULAR_EXPRESSIONS0
17
18#include "unicode/ustring.h"
19#include "unicode/unistr.h"
20#include "unicode/uniset.h"
21#include "unicode/uchar.h"
22#include "unicode/uchriter.h"
23#include "unicode/parsepos.h"
24#include "unicode/parseerr.h"
25#include "unicode/regex.h"
26#include "unicode/utf.h"
27#include "unicode/utf16.h"
28#include "patternprops.h"
29#include "putilimp.h"
30#include "cmemory.h"
31#include "cstr.h"
32#include "cstring.h"
33#include "uvectr32.h"
34#include "uvectr64.h"
35#include "uassert.h"
36#include "uinvchar.h"
37
38#include "regeximp.h"
39#include "regexcst.h" // Contains state table for the regex pattern parser.
40 // generated by a Perl script.
41#include "regexcmp.h"
42#include "regexst.h"
43#include "regextxt.h"
44
45
46
47U_NAMESPACE_BEGINnamespace icu_71 {
48
49
50//------------------------------------------------------------------------------
51//
52// Constructor.
53//
54//------------------------------------------------------------------------------
55RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
56 fParenStack(status), fSetStack(uprv_deleteUObjectuprv_deleteUObject_71, nullptr, status), fSetOpStack(status)
57{
58 // Lazy init of all shared global sets (needed for init()'s empty text)
59 RegexStaticSets::initGlobals(&status);
60
61 fStatus = &status;
62
63 fRXPat = rxp;
64 fScanIndex = 0;
65 fLastChar = -1;
66 fPeekChar = -1;
67 fLineNum = 1;
68 fCharNum = 0;
69 fQuoteMode = FALSE0;
70 fInBackslashQuote = FALSE0;
71 fModeFlags = fRXPat->fFlags | 0x80000000;
72 fEOLComments = TRUE1;
73
74 fMatchOpenParen = -1;
75 fMatchCloseParen = -1;
76 fCaptureName = NULL__null;
77 fLastSetLiteral = U_SENTINEL(-1);
78
79 if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
80 status = rxp->fDeferredStatus;
81 }
82}
83
84static const UChar chAmp = 0x26; // '&'
85static const UChar chDash = 0x2d; // '-'
86
87
88//------------------------------------------------------------------------------
89//
90// Destructor
91//
92//------------------------------------------------------------------------------
93RegexCompile::~RegexCompile() {
94 delete fCaptureName; // Normally will be NULL, but can exist if pattern
95 // compilation stops with a syntax error.
96}
97
98static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
99 set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
100}
101
102//------------------------------------------------------------------------------
103//
104// Compile regex pattern. The state machine for rexexp pattern parsing is here.
105// The state tables are hand-written in the file regexcst.txt,
106// and converted to the form used here by a perl
107// script regexcst.pl
108//
109//------------------------------------------------------------------------------
110void RegexCompile::compile(
111 const UnicodeString &pat, // Source pat to be compiled.
112 UParseError &pp, // Error position info
113 UErrorCode &e) // Error Code
114{
115 fRXPat->fPatternString = new UnicodeString(pat);
116 UText patternText = UTEXT_INITIALIZER{ UTEXT_MAGIC, 0, 0, sizeof(UText), 0, 0, 0, 0, 0, 0, __null,
__null, __null, __null, __null, __null, __null, __null, 0, 0
, 0, 0, 0, 0 }
;
117 utext_openConstUnicodeStringutext_openConstUnicodeString_71(&patternText, fRXPat->fPatternString, &e);
118
119 if (U_SUCCESS(e)) {
120 compile(&patternText, pp, e);
121 utext_closeutext_close_71(&patternText);
122 }
123}
124
125//
126// compile, UText mode
127// All the work is actually done here.
128//
129void RegexCompile::compile(
130 UText *pat, // Source pat to be compiled.
131 UParseError &pp, // Error position info
132 UErrorCode &e) // Error Code
133{
134 fStatus = &e;
135 fParseErr = &pp;
136 fStackPtr = 0;
137 fStack[fStackPtr] = 0;
138
139 if (U_FAILURE(*fStatus)) {
140 return;
141 }
142
143 // There should be no pattern stuff in the RegexPattern object. They can not be reused.
144 U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0)(void)0;
145
146 // Prepare the RegexPattern object to receive the compiled pattern.
147 fRXPat->fPattern = utext_cloneutext_clone_71(fRXPat->fPattern, pat, FALSE0, TRUE1, fStatus);
148 if (U_FAILURE(*fStatus)) {
149 return;
150 }
151
152 // Initialize the pattern scanning state machine
153 fPatternLength = utext_nativeLengthutext_nativeLength_71(pat);
154 uint16_t state = 1;
155 const RegexTableEl *tableEl;
156
157 // UREGEX_LITERAL force entire pattern to be treated as a literal string.
158 if (fModeFlags & UREGEX_LITERAL) {
159 fQuoteMode = TRUE1;
160 }
161
162 nextChar(fC); // Fetch the first char from the pattern string.
163
164 //
165 // Main loop for the regex pattern parsing state machine.
166 // Runs once per state transition.
167 // Each time through optionally performs, depending on the state table,
168 // - an advance to the the next pattern char
169 // - an action to be performed.
170 // - pushing or popping a state to/from the local state return stack.
171 // file regexcst.txt is the source for the state table. The logic behind
172 // recongizing the pattern syntax is there, not here.
173 //
174 for (;;) {
175 // Bail out if anything has gone wrong.
176 // Regex pattern parsing stops on the first error encountered.
177 if (U_FAILURE(*fStatus)) {
178 break;
179 }
180
181 U_ASSERT(state != 0)(void)0;
182
183 // Find the state table element that matches the input char from the pattern, or the
184 // class of the input character. Start with the first table row for this
185 // state, then linearly scan forward until we find a row that matches the
186 // character. The last row for each state always matches all characters, so
187 // the search will stop there, if not before.
188 //
189 tableEl = &gRuleParseStateTable[state];
190 REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d) state=%s ",
191 fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
192
193 for (;;) { // loop through table rows belonging to this state, looking for one
194 // that matches the current input char.
195 REGEX_SCAN_DEBUG_PRINTF(("."));
196 if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE0 && tableEl->fCharClass == fC.fChar) {
197 // Table row specified an individual character, not a set, and
198 // the input character is not quoted, and
199 // the input character matched it.
200 break;
201 }
202 if (tableEl->fCharClass == 255) {
203 // Table row specified default, match anything character class.
204 break;
205 }
206 if (tableEl->fCharClass == 254 && fC.fQuoted) {
207 // Table row specified "quoted" and the char was quoted.
208 break;
209 }
210 if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1) {
211 // Table row specified eof and we hit eof on the input.
212 break;
213 }
214
215 if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class &&
216 fC.fQuoted == FALSE0 && // char is not escaped &&
217 fC.fChar != (UChar32)-1) { // char is not EOF
218 U_ASSERT(tableEl->fCharClass <= 137)(void)0;
219 if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
220 // Table row specified a character class, or set of characters,
221 // and the current char matches it.
222 break;
223 }
224 }
225
226 // No match on this row, advance to the next row for this state,
227 tableEl++;
228 }
229 REGEX_SCAN_DEBUG_PRINTF(("\n"));
230
231 //
232 // We've found the row of the state table that matches the current input
233 // character from the rules string.
234 // Perform any action specified by this row in the state table.
235 if (doParseActions(tableEl->fAction) == FALSE0) {
236 // Break out of the state machine loop if the
237 // the action signalled some kind of error, or
238 // the action was to exit, occurs on normal end-of-rules-input.
239 break;
240 }
241
242 if (tableEl->fPushState != 0) {
243 fStackPtr++;
244 if (fStackPtr >= kStackSize) {
245 error(U_REGEX_INTERNAL_ERROR);
246 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
247 fStackPtr--;
248 }
249 fStack[fStackPtr] = tableEl->fPushState;
250 }
251
252 //
253 // NextChar. This is where characters are actually fetched from the pattern.
254 // Happens under control of the 'n' tag in the state table.
255 //
256 if (tableEl->fNextChar) {
257 nextChar(fC);
258 }
259
260 // Get the next state from the table entry, or from the
261 // state stack if the next state was specified as "pop".
262 if (tableEl->fNextState != 255) {
263 state = tableEl->fNextState;
264 } else {
265 state = fStack[fStackPtr];
266 fStackPtr--;
267 if (fStackPtr < 0) {
268 // state stack underflow
269 // This will occur if the user pattern has mis-matched parentheses,
270 // with extra close parens.
271 //
272 fStackPtr++;
273 error(U_REGEX_MISMATCHED_PAREN);
274 }
275 }
276
277 }
278
279 if (U_FAILURE(*fStatus)) {
280 // Bail out if the pattern had errors.
281 return;
282 }
283
284 //
285 // The pattern has now been read and processed, and the compiled code generated.
286 //
287
288 //
289 // The pattern's fFrameSize so far has accumulated the requirements for
290 // storage for capture parentheses, counters, etc. that are encountered
291 // in the pattern. Add space for the two variables that are always
292 // present in the saved state: the input string position (int64_t) and
293 // the position in the compiled pattern.
294 //
295 allocateStackData(RESTACKFRAME_HDRCOUNT2);
296
297 //
298 // Optimization pass 1: NOPs, back-references, and case-folding
299 //
300 stripNOPs();
301
302 //
303 // Get bounds for the minimum and maximum length of a string that this
304 // pattern can match. Used to avoid looking for matches in strings that
305 // are too short.
306 //
307 fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
308
309 //
310 // Optimization pass 2: match start type
311 //
312 matchStartType();
313
314 //
315 // Set up fast latin-1 range sets
316 //
317 int32_t numSets = fRXPat->fSets->size();
318 fRXPat->fSets8 = new Regex8BitSet[numSets];
319 // Null pointer check.
320 if (fRXPat->fSets8 == NULL__null) {
321 e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
322 return;
323 }
324 int32_t i;
325 for (i=0; i<numSets; i++) {
326 UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i);
327 fRXPat->fSets8[i].init(s);
328 }
329
330}
331
332
333
334
335
336//------------------------------------------------------------------------------
337//
338// doParseAction Do some action during regex pattern parsing.
339// Called by the parse state machine.
340//
341// Generation of the match engine PCode happens here, or
342// in functions called from the parse actions defined here.
343//
344//
345//------------------------------------------------------------------------------
346UBool RegexCompile::doParseActions(int32_t action)
347{
348 UBool returnVal = TRUE1;
349
350 switch ((Regex_PatternParseAction)action) {
351
352 case doPatStart:
353 // Start of pattern compiles to:
354 //0 SAVE 2 Fall back to position of FAIL
355 //1 jmp 3
356 //2 FAIL Stop if we ever reach here.
357 //3 NOP Dummy, so start of pattern looks the same as
358 // the start of an ( grouping.
359 //4 NOP Resreved, will be replaced by a save if there are
360 // OR | operators at the top level
361 appendOp(URX_STATE_SAVE, 2);
362 appendOp(URX_JMP, 3);
363 appendOp(URX_FAIL, 0);
364
365 // Standard open nonCapture paren action emits the two NOPs and
366 // sets up the paren stack frame.
367 doParseActions(doOpenNonCaptureParen);
368 break;
369
370 case doPatFinish:
371 // We've scanned to the end of the pattern
372 // The end of pattern compiles to:
373 // URX_END
374 // which will stop the runtime match engine.
375 // Encountering end of pattern also behaves like a close paren,
376 // and forces fixups of the State Save at the beginning of the compiled pattern
377 // and of any OR operations at the top level.
378 //
379 handleCloseParen();
380 if (fParenStack.size() > 0) {
381 // Missing close paren in pattern.
382 error(U_REGEX_MISMATCHED_PAREN);
383 }
384
385 // add the END operation to the compiled pattern.
386 appendOp(URX_END, 0);
387
388 // Terminate the pattern compilation state machine.
389 returnVal = FALSE0;
390 break;
391
392
393
394 case doOrOperator:
395 // Scanning a '|', as in (A|B)
396 {
397 // Generate code for any pending literals preceding the '|'
398 fixLiterals(FALSE0);
399
400 // Insert a SAVE operation at the start of the pattern section preceding
401 // this OR at this level. This SAVE will branch the match forward
402 // to the right hand side of the OR in the event that the left hand
403 // side fails to match and backtracks. Locate the position for the
404 // save from the location on the top of the parentheses stack.
405 int32_t savePosition = fParenStack.popi();
406 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition);
407 U_ASSERT(URX_TYPE(op) == URX_NOP)(void)0; // original contents of reserved location
408 op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
409 fRXPat->fCompiledPat->setElementAt(op, savePosition);
410
411 // Append an JMP operation into the compiled pattern. The operand for
412 // the JMP will eventually be the location following the ')' for the
413 // group. This will be patched in later, when the ')' is encountered.
414 appendOp(URX_JMP, 0);
415
416 // Push the position of the newly added JMP op onto the parentheses stack.
417 // This registers if for fixup when this block's close paren is encountered.
418 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
419
420 // Append a NOP to the compiled pattern. This is the slot reserved
421 // for a SAVE in the event that there is yet another '|' following
422 // this one.
423 appendOp(URX_NOP, 0);
424 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
425 }
426 break;
427
428
429 case doBeginNamedCapture:
430 // Scanning (?<letter.
431 // The first letter of the name will come through again under doConinueNamedCapture.
432 fCaptureName = new UnicodeString();
433 if (fCaptureName == NULL__null) {
434 error(U_MEMORY_ALLOCATION_ERROR);
435 }
436 break;
437
438 case doContinueNamedCapture:
439 fCaptureName->append(fC.fChar);
440 break;
441
442 case doBadNamedCapture:
443 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
444 break;
445
446 case doOpenCaptureParen:
447 // Open Capturing Paren, possibly named.
448 // Compile to a
449 // - NOP, which later may be replaced by a save-state if the
450 // parenthesized group gets a * quantifier, followed by
451 // - START_CAPTURE n where n is stack frame offset to the capture group variables.
452 // - NOP, which may later be replaced by a save-state if there
453 // is an '|' alternation within the parens.
454 //
455 // Each capture group gets three slots in the save stack frame:
456 // 0: Capture Group start position (in input string being matched.)
457 // 1: Capture Group end position.
458 // 2: Start of Match-in-progress.
459 // The first two locations are for a completed capture group, and are
460 // referred to by back references and the like.
461 // The third location stores the capture start position when an START_CAPTURE is
462 // encountered. This will be promoted to a completed capture when (and if) the corresponding
463 // END_CAPTURE is encountered.
464 {
465 fixLiterals();
466 appendOp(URX_NOP, 0);
467 int32_t varsLoc = allocateStackData(3); // Reserve three slots in match stack frame.
468 appendOp(URX_START_CAPTURE, varsLoc);
469 appendOp(URX_NOP, 0);
470
471 // On the Parentheses stack, start a new frame and add the positions
472 // of the two NOPs. Depending on what follows in the pattern, the
473 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
474 // address of the end of the parenthesized group.
475 fParenStack.push(fModeFlags, *fStatus); // Match mode state
476 fParenStack.push(capturing, *fStatus); // Frame type.
477 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP location
478 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc
479
480 // Save the mapping from group number to stack frame variable position.
481 fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
482
483 // If this is a named capture group, add the name->group number mapping.
484 if (fCaptureName != NULL__null) {
485 if (!fRXPat->initNamedCaptureMap()) {
486 if (U_SUCCESS(*fStatus)) {
487 error(fRXPat->fDeferredStatus);
488 }
489 break;
490 }
491 int32_t groupNumber = fRXPat->fGroupMap->size();
492 int32_t previousMapping = uhash_putiuhash_puti_71(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
493 fCaptureName = NULL__null; // hash table takes ownership of the name (key) string.
494 if (previousMapping > 0 && U_SUCCESS(*fStatus)) {
495 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
496 }
497 }
498 }
499 break;
500
501 case doOpenNonCaptureParen:
502 // Open non-caputuring (grouping only) Paren.
503 // Compile to a
504 // - NOP, which later may be replaced by a save-state if the
505 // parenthesized group gets a * quantifier, followed by
506 // - NOP, which may later be replaced by a save-state if there
507 // is an '|' alternation within the parens.
508 {
509 fixLiterals();
510 appendOp(URX_NOP, 0);
511 appendOp(URX_NOP, 0);
512
513 // On the Parentheses stack, start a new frame and add the positions
514 // of the two NOPs.
515 fParenStack.push(fModeFlags, *fStatus); // Match mode state
516 fParenStack.push(plain, *fStatus); // Begin a new frame.
517 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
518 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc
519 }
520 break;
521
522
523 case doOpenAtomicParen:
524 // Open Atomic Paren. (?>
525 // Compile to a
526 // - NOP, which later may be replaced if the parenthesized group
527 // has a quantifier, followed by
528 // - STO_SP save state stack position, so it can be restored at the ")"
529 // - NOP, which may later be replaced by a save-state if there
530 // is an '|' alternation within the parens.
531 {
532 fixLiterals();
533 appendOp(URX_NOP, 0);
534 int32_t varLoc = allocateData(1); // Reserve a data location for saving the state stack ptr.
535 appendOp(URX_STO_SP, varLoc);
536 appendOp(URX_NOP, 0);
537
538 // On the Parentheses stack, start a new frame and add the positions
539 // of the two NOPs. Depending on what follows in the pattern, the
540 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
541 // address of the end of the parenthesized group.
542 fParenStack.push(fModeFlags, *fStatus); // Match mode state
543 fParenStack.push(atomic, *fStatus); // Frame type.
544 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP
545 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
546 }
547 break;
548
549
550 case doOpenLookAhead:
551 // Positive Look-ahead (?= stuff )
552 //
553 // Note: Addition of transparent input regions, with the need to
554 // restore the original regions when failing out of a lookahead
555 // block, complicated this sequence. Some combined opcodes
556 // might make sense - or might not, lookahead aren't that common.
557 //
558 // Caution: min match length optimization knows about this
559 // sequence; don't change without making updates there too.
560 //
561 // Compiles to
562 // 1 LA_START dataLoc Saves SP, Input Pos, Active input region.
563 // 2. STATE_SAVE 4 on failure of lookahead, goto 4
564 // 3 JMP 6 continue ...
565 //
566 // 4. LA_END Look Ahead failed. Restore regions.
567 // 5. BACKTRACK and back track again.
568 //
569 // 6. NOP reserved for use by quantifiers on the block.
570 // Look-ahead can't have quantifiers, but paren stack
571 // compile time conventions require the slot anyhow.
572 // 7. NOP may be replaced if there is are '|' ops in the block.
573 // 8. code for parenthesized stuff.
574 // 9. LA_END
575 //
576 // Four data slots are reserved, for saving state on entry to the look-around
577 // 0: stack pointer on entry.
578 // 1: input position on entry.
579 // 2: fActiveStart, the active bounds start on entry.
580 // 3: fActiveLimit, the active bounds limit on entry.
581 {
582 fixLiterals();
583 int32_t dataLoc = allocateData(4);
584 appendOp(URX_LA_START, dataLoc);
585 appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
586 appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
587 appendOp(URX_LA_END, dataLoc);
588 appendOp(URX_BACKTRACK, 0);
589 appendOp(URX_NOP, 0);
590 appendOp(URX_NOP, 0);
591
592 // On the Parentheses stack, start a new frame and add the positions
593 // of the NOPs.
594 fParenStack.push(fModeFlags, *fStatus); // Match mode state
595 fParenStack.push(lookAhead, *fStatus); // Frame type.
596 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
597 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location
598 }
599 break;
600
601 case doOpenLookAheadNeg:
602 // Negated Lookahead. (?! stuff )
603 // Compiles to
604 // 1. LA_START dataloc
605 // 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state,
606 // // which continues with the match.
607 // 3. NOP // Std. Open Paren sequence, for possible '|'
608 // 4. code for parenthesized stuff.
609 // 5. LA_END // Cut back stack, remove saved state from step 2.
610 // 6. BACKTRACK // code in block succeeded, so neg. lookahead fails.
611 // 7. END_LA // Restore match region, in case look-ahead was using
612 // an alternate (transparent) region.
613 // Four data slots are reserved, for saving state on entry to the look-around
614 // 0: stack pointer on entry.
615 // 1: input position on entry.
616 // 2: fActiveStart, the active bounds start on entry.
617 // 3: fActiveLimit, the active bounds limit on entry.
618 {
619 fixLiterals();
620 int32_t dataLoc = allocateData(4);
621 appendOp(URX_LA_START, dataLoc);
622 appendOp(URX_STATE_SAVE, 0); // dest address will be patched later.
623 appendOp(URX_NOP, 0);
624
625 // On the Parentheses stack, start a new frame and add the positions
626 // of the StateSave and NOP.
627 fParenStack.push(fModeFlags, *fStatus); // Match mode state
628 fParenStack.push(negLookAhead, *fStatus); // Frame type
629 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The STATE_SAVE location
630 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location
631
632 // Instructions #5 - #7 will be added when the ')' is encountered.
633 }
634 break;
635
636 case doOpenLookBehind:
637 {
638 // Compile a (?<= look-behind open paren.
639 //
640 // Compiles to
641 // 0 URX_LB_START dataLoc
642 // 1 URX_LB_CONT dataLoc
643 // 2 MinMatchLen
644 // 3 MaxMatchLen
645 // 4 URX_NOP Standard '(' boilerplate.
646 // 5 URX_NOP Reserved slot for use with '|' ops within (block).
647 // 6 <code for LookBehind expression>
648 // 7 URX_LB_END dataLoc # Check match len, restore input len
649 // 8 URX_LA_END dataLoc # Restore stack, input pos
650 //
651 // Allocate a block of matcher data, to contain (when running a match)
652 // 0: Stack ptr on entry
653 // 1: Input Index on entry
654 // 2: fActiveStart, the active bounds start on entry.
655 // 3: fActiveLimit, the active bounds limit on entry.
656 // 4: Start index of match current match attempt.
657 // The first four items must match the layout of data for LA_START / LA_END
658
659 // Generate match code for any pending literals.
660 fixLiterals();
661
662 // Allocate data space
663 int32_t dataLoc = allocateData(5);
664
665 // Emit URX_LB_START
666 appendOp(URX_LB_START, dataLoc);
667
668 // Emit URX_LB_CONT
669 appendOp(URX_LB_CONT, dataLoc);
670 appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled later.
671 appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled later.
672
673 // Emit the NOPs
674 appendOp(URX_NOP, 0);
675 appendOp(URX_NOP, 0);
676
677 // On the Parentheses stack, start a new frame and add the positions
678 // of the URX_LB_CONT and the NOP.
679 fParenStack.push(fModeFlags, *fStatus); // Match mode state
680 fParenStack.push(lookBehind, *fStatus); // Frame type
681 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
682 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location
683
684 // The final two instructions will be added when the ')' is encountered.
685 }
686
687 break;
688
689 case doOpenLookBehindNeg:
690 {
691 // Compile a (?<! negated look-behind open paren.
692 //
693 // Compiles to
694 // 0 URX_LB_START dataLoc # Save entry stack, input len
695 // 1 URX_LBN_CONT dataLoc # Iterate possible match positions
696 // 2 MinMatchLen
697 // 3 MaxMatchLen
698 // 4 continueLoc (9)
699 // 5 URX_NOP Standard '(' boilerplate.
700 // 6 URX_NOP Reserved slot for use with '|' ops within (block).
701 // 7 <code for LookBehind expression>
702 // 8 URX_LBN_END dataLoc # Check match len, cause a FAIL
703 // 9 ...
704 //
705 // Allocate a block of matcher data, to contain (when running a match)
706 // 0: Stack ptr on entry
707 // 1: Input Index on entry
708 // 2: fActiveStart, the active bounds start on entry.
709 // 3: fActiveLimit, the active bounds limit on entry.
710 // 4: Start index of match current match attempt.
711 // The first four items must match the layout of data for LA_START / LA_END
712
713 // Generate match code for any pending literals.
714 fixLiterals();
715
716 // Allocate data space
717 int32_t dataLoc = allocateData(5);
718
719 // Emit URX_LB_START
720 appendOp(URX_LB_START, dataLoc);
721
722 // Emit URX_LBN_CONT
723 appendOp(URX_LBN_CONT, dataLoc);
724 appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled later.
725 appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled later.
726 appendOp(URX_RESERVED_OP, 0); // Continue Loc. To be filled later.
727
728 // Emit the NOPs
729 appendOp(URX_NOP, 0);
730 appendOp(URX_NOP, 0);
731
732 // On the Parentheses stack, start a new frame and add the positions
733 // of the URX_LB_CONT and the NOP.
734 fParenStack.push(fModeFlags, *fStatus); // Match mode state
735 fParenStack.push(lookBehindN, *fStatus); // Frame type
736 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
737 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location
738
739 // The final two instructions will be added when the ')' is encountered.
740 }
741 break;
742
743 case doConditionalExpr:
744 // Conditionals such as (?(1)a:b)
745 case doPerlInline:
746 // Perl inline-conditionals. (?{perl code}a|b) We're not perl, no way to do them.
747 error(U_REGEX_UNIMPLEMENTED);
748 break;
749
750
751 case doCloseParen:
752 handleCloseParen();
753 if (fParenStack.size() <= 0) {
754 // Extra close paren, or missing open paren.
755 error(U_REGEX_MISMATCHED_PAREN);
756 }
757 break;
758
759 case doNOP:
760 break;
761
762
763 case doBadOpenParenType:
764 case doRuleError:
765 error(U_REGEX_RULE_SYNTAX);
766 break;
767
768
769 case doMismatchedParenErr:
770 error(U_REGEX_MISMATCHED_PAREN);
771 break;
772
773 case doPlus:
774 // Normal '+' compiles to
775 // 1. stuff to be repeated (already built)
776 // 2. jmp-sav 1
777 // 3. ...
778 //
779 // Or, if the item to be repeated can match a zero length string,
780 // 1. STO_INP_LOC data-loc
781 // 2. body of stuff to be repeated
782 // 3. JMP_SAV_X 2
783 // 4. ...
784
785 //
786 // Or, if the item to be repeated is simple
787 // 1. Item to be repeated.
788 // 2. LOOP_SR_I set number (assuming repeated item is a set ref)
789 // 3. LOOP_C stack location
790 {
791 int32_t topLoc = blockTopLoc(FALSE0); // location of item #1
792 int32_t frameLoc;
793
794 // Check for simple constructs, which may get special optimized code.
795 if (topLoc == fRXPat->fCompiledPat->size() - 1) {
796 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
797
798 if (URX_TYPE(repeatedOp)((uint32_t)(repeatedOp) >> 24) == URX_SETREF) {
799 // Emit optimized code for [char set]+
800 appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp)((repeatedOp) & 0xffffff));
801 frameLoc = allocateStackData(1);
802 appendOp(URX_LOOP_C, frameLoc);
803 break;
804 }
805
806 if (URX_TYPE(repeatedOp)((uint32_t)(repeatedOp) >> 24) == URX_DOTANY ||
807 URX_TYPE(repeatedOp)((uint32_t)(repeatedOp) >> 24) == URX_DOTANY_ALL ||
808 URX_TYPE(repeatedOp)((uint32_t)(repeatedOp) >> 24) == URX_DOTANY_UNIX) {
809 // Emit Optimized code for .+ operations.
810 int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
811 if (URX_TYPE(repeatedOp)((uint32_t)(repeatedOp) >> 24) == URX_DOTANY_ALL) {
812 // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
813 loopOpI |= 1;
814 }
815 if (fModeFlags & UREGEX_UNIX_LINES) {
816 loopOpI |= 2;
817 }
818 appendOp(loopOpI);
819 frameLoc = allocateStackData(1);
820 appendOp(URX_LOOP_C, frameLoc);
821 break;
822 }
823
824 }
825
826 // General case.
827
828 // Check for minimum match length of zero, which requires
829 // extra loop-breaking code.
830 if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
831 // Zero length match is possible.
832 // Emit the code sequence that can handle it.
833 insertOp(topLoc);
834 frameLoc = allocateStackData(1);
835
836 int32_t op = buildOp(URX_STO_INP_LOC, frameLoc);
837 fRXPat->fCompiledPat->setElementAt(op, topLoc);
838
839 appendOp(URX_JMP_SAV_X, topLoc+1);
840 } else {
841 // Simpler code when the repeated body must match something non-empty
842 appendOp(URX_JMP_SAV, topLoc);
843 }
844 }
845 break;
846
847 case doNGPlus:
848 // Non-greedy '+?' compiles to
849 // 1. stuff to be repeated (already built)
850 // 2. state-save 1
851 // 3. ...
852 {
853 int32_t topLoc = blockTopLoc(FALSE0);
854 appendOp(URX_STATE_SAVE, topLoc);
855 }
856 break;
857
858
859 case doOpt:
860 // Normal (greedy) ? quantifier.
861 // Compiles to
862 // 1. state save 3
863 // 2. body of optional block
864 // 3. ...
865 // Insert the state save into the compiled pattern, and we're done.
866 {
867 int32_t saveStateLoc = blockTopLoc(TRUE1);
868 int32_t saveStateOp = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
869 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
870 }
871 break;
872
873 case doNGOpt:
874 // Non-greedy ?? quantifier
875 // compiles to
876 // 1. jmp 4
877 // 2. body of optional block
878 // 3 jmp 5
879 // 4. state save 2
880 // 5 ...
881 // This code is less than ideal, with two jmps instead of one, because we can only
882 // insert one instruction at the top of the block being iterated.
883 {
884 int32_t jmp1_loc = blockTopLoc(TRUE1);
885 int32_t jmp2_loc = fRXPat->fCompiledPat->size();
886
887 int32_t jmp1_op = buildOp(URX_JMP, jmp2_loc+1);
888 fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
889
890 appendOp(URX_JMP, jmp2_loc+2);
891
892 appendOp(URX_STATE_SAVE, jmp1_loc+1);
893 }
894 break;
895
896
897 case doStar:
898 // Normal (greedy) * quantifier.
899 // Compiles to
900 // 1. STATE_SAVE 4
901 // 2. body of stuff being iterated over
902 // 3. JMP_SAV 2
903 // 4. ...
904 //
905 // Or, if the body is a simple [Set],
906 // 1. LOOP_SR_I set number
907 // 2. LOOP_C stack location
908 // ...
909 //
910 // Or if this is a .*
911 // 1. LOOP_DOT_I (. matches all mode flag)
912 // 2. LOOP_C stack location
913 //
914 // Or, if the body can match a zero-length string, to inhibit infinite loops,
915 // 1. STATE_SAVE 5
916 // 2. STO_INP_LOC data-loc
917 // 3. body of stuff
918 // 4. JMP_SAV_X 2
919 // 5. ...
920 {
921 // location of item #1, the STATE_SAVE
922 int32_t topLoc = blockTopLoc(FALSE0);
923 int32_t dataLoc = -1;
924
925 // Check for simple *, where the construct being repeated
926 // compiled to single opcode, and might be optimizable.
927 if (topLoc == fRXPat->fCompiledPat->size() - 1) {
928 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
929
930 if (URX_TYPE(repeatedOp)((uint32_t)(repeatedOp) >> 24) == URX_SETREF) {
931 // Emit optimized code for a [char set]*
932 int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp)((repeatedOp) & 0xffffff));
933 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
934 dataLoc = allocateStackData(1);
935 appendOp(URX_LOOP_C, dataLoc);
936 break;
937 }
938
939 if (URX_TYPE(repeatedOp)((uint32_t)(repeatedOp) >> 24) == URX_DOTANY ||
940 URX_TYPE(repeatedOp)((uint32_t)(repeatedOp) >> 24) == URX_DOTANY_ALL ||
941 URX_TYPE(repeatedOp)((uint32_t)(repeatedOp) >> 24) == URX_DOTANY_UNIX) {
942 // Emit Optimized code for .* operations.
943 int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
944 if (URX_TYPE(repeatedOp)((uint32_t)(repeatedOp) >> 24) == URX_DOTANY_ALL) {
945 // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
946 loopOpI |= 1;
947 }
948 if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
949 loopOpI |= 2;
950 }
951 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
952 dataLoc = allocateStackData(1);
953 appendOp(URX_LOOP_C, dataLoc);
954 break;
955 }
956 }
957
958 // Emit general case code for this *
959 // The optimizations did not apply.
960
961 int32_t saveStateLoc = blockTopLoc(TRUE1);
962 int32_t jmpOp = buildOp(URX_JMP_SAV, saveStateLoc+1);
963
964 // Check for minimum match length of zero, which requires
965 // extra loop-breaking code.
966 if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
967 insertOp(saveStateLoc);
968 dataLoc = allocateStackData(1);
969
970 int32_t op = buildOp(URX_STO_INP_LOC, dataLoc);
971 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
972 jmpOp = buildOp(URX_JMP_SAV_X, saveStateLoc+2);
973 }
974
975 // Locate the position in the compiled pattern where the match will continue
976 // after completing the *. (4 or 5 in the comment above)
977 int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
978
979 // Put together the save state op and store it into the compiled code.
980 int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc);
981 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
982
983 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
984 appendOp(jmpOp);
985 }
986 break;
987
988 case doNGStar:
989 // Non-greedy *? quantifier
990 // compiles to
991 // 1. JMP 3
992 // 2. body of stuff being iterated over
993 // 3. STATE_SAVE 2
994 // 4 ...
995 {
996 int32_t jmpLoc = blockTopLoc(TRUE1); // loc 1.
997 int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3.
998 int32_t jmpOp = buildOp(URX_JMP, saveLoc);
999 fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
1000 appendOp(URX_STATE_SAVE, jmpLoc+1);
1001 }
1002 break;
1003
1004
1005 case doIntervalInit:
1006 // The '{' opening an interval quantifier was just scanned.
1007 // Init the counter variables that will accumulate the values as the digits
1008 // are scanned.
1009 fIntervalLow = 0;
1010 fIntervalUpper = -1;
1011 break;
1012
1013 case doIntevalLowerDigit:
1014 // Scanned a digit from the lower value of an {lower,upper} interval
1015 {
1016 int32_t digitValue = u_charDigitValueu_charDigitValue_71(fC.fChar);
1017 U_ASSERT(digitValue >= 0)(void)0;
1018 int64_t val = (int64_t)fIntervalLow*10 + digitValue;
1019 if (val > INT32_MAX(2147483647)) {
1020 error(U_REGEX_NUMBER_TOO_BIG);
1021 } else {
1022 fIntervalLow = (int32_t)val;
1023 }
1024 }
1025 break;
1026
1027 case doIntervalUpperDigit:
1028 // Scanned a digit from the upper value of an {lower,upper} interval
1029 {
1030 if (fIntervalUpper < 0) {
1031 fIntervalUpper = 0;
1032 }
1033 int32_t digitValue = u_charDigitValueu_charDigitValue_71(fC.fChar);
1034 U_ASSERT(digitValue >= 0)(void)0;
1035 int64_t val = (int64_t)fIntervalUpper*10 + digitValue;
1036 if (val > INT32_MAX(2147483647)) {
1037 error(U_REGEX_NUMBER_TOO_BIG);
1038 } else {
1039 fIntervalUpper = (int32_t)val;
1040 }
1041 }
1042 break;
1043
1044 case doIntervalSame:
1045 // Scanned a single value interval like {27}. Upper = Lower.
1046 fIntervalUpper = fIntervalLow;
1047 break;
1048
1049 case doInterval:
1050 // Finished scanning a normal {lower,upper} interval. Generate the code for it.
1051 if (compileInlineInterval() == FALSE0) {
1052 compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1053 }
1054 break;
1055
1056 case doPossessiveInterval:
1057 // Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it.
1058 {
1059 // Remember the loc for the top of the block being looped over.
1060 // (Can not reserve a slot in the compiled pattern at this time, because
1061 // compileInterval needs to reserve also, and blockTopLoc can only reserve
1062 // once per block.)
1063 int32_t topLoc = blockTopLoc(FALSE0);
1064
1065 // Produce normal looping code.
1066 compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1067
1068 // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
1069 // just as if the loop was inclosed in atomic parentheses.
1070
1071 // First the STO_SP before the start of the loop
1072 insertOp(topLoc);
1073
1074 int32_t varLoc = allocateData(1); // Reserve a data location for saving the
1075 int32_t op = buildOp(URX_STO_SP, varLoc);
1076 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1077
1078 int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
1079 U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc)(void)0;
1080 loopOp++; // point LoopOp after the just-inserted STO_SP
1081 fRXPat->fCompiledPat->push(loopOp, *fStatus);
1082
1083 // Then the LD_SP after the end of the loop
1084 appendOp(URX_LD_SP, varLoc);
1085 }
1086
1087 break;
1088
1089 case doNGInterval:
1090 // Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it.
1091 compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
1092 break;
1093
1094 case doIntervalError:
1095 error(U_REGEX_BAD_INTERVAL);
1096 break;
1097
1098 case doLiteralChar:
1099 // We've just scanned a "normal" character from the pattern,
1100 literalChar(fC.fChar);
1101 break;
1102
1103
1104 case doEscapedLiteralChar:
1105 // We've just scanned an backslashed escaped character with no
1106 // special meaning. It represents itself.
1107 if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1108 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z]
1109 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z]
1110 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1111 }
1112 literalChar(fC.fChar);
1113 break;
1114
1115
1116 case doDotAny:
1117 // scanned a ".", match any single character.
1118 {
1119 fixLiterals(FALSE0);
1120 if (fModeFlags & UREGEX_DOTALL) {
1121 appendOp(URX_DOTANY_ALL, 0);
1122 } else if (fModeFlags & UREGEX_UNIX_LINES) {
1123 appendOp(URX_DOTANY_UNIX, 0);
1124 } else {
1125 appendOp(URX_DOTANY, 0);
1126 }
1127 }
1128 break;
1129
1130 case doCaret:
1131 {
1132 fixLiterals(FALSE0);
1133 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1134 appendOp(URX_CARET, 0);
1135 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1136 appendOp(URX_CARET_M, 0);
1137 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1138 appendOp(URX_CARET, 0); // Only testing true start of input.
1139 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1140 appendOp(URX_CARET_M_UNIX, 0);
1141 }
1142 }
1143 break;
1144
1145 case doDollar:
1146 {
1147 fixLiterals(FALSE0);
1148 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1149 appendOp(URX_DOLLAR, 0);
1150 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1151 appendOp(URX_DOLLAR_M, 0);
1152 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1153 appendOp(URX_DOLLAR_D, 0);
1154 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1155 appendOp(URX_DOLLAR_MD, 0);
1156 }
1157 }
1158 break;
1159
1160 case doBackslashA:
1161 fixLiterals(FALSE0);
1162 appendOp(URX_CARET, 0);
1163 break;
1164
1165 case doBackslashB:
1166 {
1167 #if UCONFIG_NO_BREAK_ITERATION0==1
1168 if (fModeFlags & UREGEX_UWORD) {
1169 error(U_UNSUPPORTED_ERROR);
1170 }
1171 #endif
1172 fixLiterals(FALSE0);
1173 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1174 appendOp(op, 1);
1175 }
1176 break;
1177
1178 case doBackslashb:
1179 {
1180 #if UCONFIG_NO_BREAK_ITERATION0==1
1181 if (fModeFlags & UREGEX_UWORD) {
1182 error(U_UNSUPPORTED_ERROR);
1183 }
1184 #endif
1185 fixLiterals(FALSE0);
1186 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1187 appendOp(op, 0);
1188 }
1189 break;
1190
1191 case doBackslashD:
1192 fixLiterals(FALSE0);
1193 appendOp(URX_BACKSLASH_D, 1);
1194 break;
1195
1196 case doBackslashd:
1197 fixLiterals(FALSE0);
1198 appendOp(URX_BACKSLASH_D, 0);
1199 break;
1200
1201 case doBackslashG:
1202 fixLiterals(FALSE0);
1203 appendOp(URX_BACKSLASH_G, 0);
1204 break;
1205
1206 case doBackslashH:
1207 fixLiterals(FALSE0);
1208 appendOp(URX_BACKSLASH_H, 1);
1209 break;
1210
1211 case doBackslashh:
1212 fixLiterals(FALSE0);
1213 appendOp(URX_BACKSLASH_H, 0);
1214 break;
1215
1216 case doBackslashR:
1217 fixLiterals(FALSE0);
1218 appendOp(URX_BACKSLASH_R, 0);
1219 break;
1220
1221 case doBackslashS:
1222 fixLiterals(FALSE0);
1223 appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
1224 break;
1225
1226 case doBackslashs:
1227 fixLiterals(FALSE0);
1228 appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
1229 break;
1230
1231 case doBackslashV:
1232 fixLiterals(FALSE0);
1233 appendOp(URX_BACKSLASH_V, 1);
1234 break;
1235
1236 case doBackslashv:
1237 fixLiterals(FALSE0);
1238 appendOp(URX_BACKSLASH_V, 0);
1239 break;
1240
1241 case doBackslashW:
1242 fixLiterals(FALSE0);
1243 appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
1244 break;
1245
1246 case doBackslashw:
1247 fixLiterals(FALSE0);
1248 appendOp(URX_STATIC_SETREF, URX_ISWORD_SET);
1249 break;
1250
1251 case doBackslashX:
1252 #if UCONFIG_NO_BREAK_ITERATION0==1
1253 // Grapheme Cluster Boundary requires ICU break iteration.
1254 error(U_UNSUPPORTED_ERROR);
1255 #endif
1256 fixLiterals(FALSE0);
1257 appendOp(URX_BACKSLASH_X, 0);
1258 break;
1259
1260 case doBackslashZ:
1261 fixLiterals(FALSE0);
1262 appendOp(URX_DOLLAR, 0);
1263 break;
1264
1265 case doBackslashz:
1266 fixLiterals(FALSE0);
1267 appendOp(URX_BACKSLASH_Z, 0);
1268 break;
1269
1270 case doEscapeError:
1271 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1272 break;
1273
1274 case doExit:
1275 fixLiterals(FALSE0);
1276 returnVal = FALSE0;
1277 break;
1278
1279 case doProperty:
1280 {
1281 fixLiterals(FALSE0);
1282 UnicodeSet *theSet = scanProp();
1283 compileSet(theSet);
1284 }
1285 break;
1286
1287 case doNamedChar:
1288 {
1289 UChar32 c = scanNamedChar();
1290 literalChar(c);
1291 }
1292 break;
1293
1294
1295 case doBackRef:
1296 // BackReference. Somewhat unusual in that the front-end can not completely parse
1297 // the regular expression, because the number of digits to be consumed
1298 // depends on the number of capture groups that have been defined. So
1299 // we have to do it here instead.
1300 {
1301 int32_t numCaptureGroups = fRXPat->fGroupMap->size();
1302 int32_t groupNum = 0;
1303 UChar32 c = fC.fChar;
1304
1305 for (;;) {
1306 // Loop once per digit, for max allowed number of digits in a back reference.
1307 int32_t digit = u_charDigitValueu_charDigitValue_71(c);
1308 groupNum = groupNum * 10 + digit;
1309 if (groupNum >= numCaptureGroups) {
1310 break;
1311 }
1312 c = peekCharLL();
1313 if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE0) {
1314 break;
1315 }
1316 nextCharLL();
1317 }
1318
1319 // Scan of the back reference in the source regexp is complete. Now generate
1320 // the compiled code for it.
1321 // Because capture groups can be forward-referenced by back-references,
1322 // we fill the operand with the capture group number. At the end
1323 // of compilation, it will be changed to the variable's location.
1324 U_ASSERT(groupNum > 0)(void)0; // Shouldn't happen. '\0' begins an octal escape sequence,
1325 // and shouldn't enter this code path at all.
1326 fixLiterals(FALSE0);
1327 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1328 appendOp(URX_BACKREF_I, groupNum);
1329 } else {
1330 appendOp(URX_BACKREF, groupNum);
1331 }
1332 }
1333 break;
1334
1335 case doBeginNamedBackRef:
1336 U_ASSERT(fCaptureName == NULL)(void)0;
1337 fCaptureName = new UnicodeString;
1338 if (fCaptureName == NULL__null) {
1339 error(U_MEMORY_ALLOCATION_ERROR);
1340 }
1341 break;
1342
1343 case doContinueNamedBackRef:
1344 fCaptureName->append(fC.fChar);
1345 break;
1346
1347 case doCompleteNamedBackRef:
1348 {
1349 int32_t groupNumber =
1350 fRXPat->fNamedCaptureMap ? uhash_getiuhash_geti_71(fRXPat->fNamedCaptureMap, fCaptureName) : 0;
1351 if (groupNumber == 0) {
1352 // Group name has not been defined.
1353 // Could be a forward reference. If we choose to support them at some
1354 // future time, extra mechanism will be required at this point.
1355 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
1356 } else {
1357 // Given the number, handle identically to a \n numbered back reference.
1358 // See comments above, under doBackRef
1359 fixLiterals(FALSE0);
1360 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1361 appendOp(URX_BACKREF_I, groupNumber);
1362 } else {
1363 appendOp(URX_BACKREF, groupNumber);
1364 }
1365 }
1366 delete fCaptureName;
1367 fCaptureName = NULL__null;
1368 break;
1369 }
1370
1371 case doPossessivePlus:
1372 // Possessive ++ quantifier.
1373 // Compiles to
1374 // 1. STO_SP
1375 // 2. body of stuff being iterated over
1376 // 3. STATE_SAVE 5
1377 // 4. JMP 2
1378 // 5. LD_SP
1379 // 6. ...
1380 //
1381 // Note: TODO: This is pretty inefficient. A mass of saved state is built up
1382 // then unconditionally discarded. Perhaps introduce a new opcode. Ticket 6056
1383 //
1384 {
1385 // Emit the STO_SP
1386 int32_t topLoc = blockTopLoc(TRUE1);
1387 int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr.
1388 int32_t op = buildOp(URX_STO_SP, stoLoc);
1389 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1390
1391 // Emit the STATE_SAVE
1392 appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
1393
1394 // Emit the JMP
1395 appendOp(URX_JMP, topLoc+1);
1396
1397 // Emit the LD_SP
1398 appendOp(URX_LD_SP, stoLoc);
1399 }
1400 break;
1401
1402 case doPossessiveStar:
1403 // Possessive *+ quantifier.
1404 // Compiles to
1405 // 1. STO_SP loc
1406 // 2. STATE_SAVE 5
1407 // 3. body of stuff being iterated over
1408 // 4. JMP 2
1409 // 5. LD_SP loc
1410 // 6 ...
1411 // TODO: do something to cut back the state stack each time through the loop.
1412 {
1413 // Reserve two slots at the top of the block.
1414 int32_t topLoc = blockTopLoc(TRUE1);
1415 insertOp(topLoc);
1416
1417 // emit STO_SP loc
1418 int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr.
1419 int32_t op = buildOp(URX_STO_SP, stoLoc);
1420 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1421
1422 // Emit the SAVE_STATE 5
1423 int32_t L7 = fRXPat->fCompiledPat->size()+1;
1424 op = buildOp(URX_STATE_SAVE, L7);
1425 fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1426
1427 // Append the JMP operation.
1428 appendOp(URX_JMP, topLoc+1);
1429
1430 // Emit the LD_SP loc
1431 appendOp(URX_LD_SP, stoLoc);
1432 }
1433 break;
1434
1435 case doPossessiveOpt:
1436 // Possessive ?+ quantifier.
1437 // Compiles to
1438 // 1. STO_SP loc
1439 // 2. SAVE_STATE 5
1440 // 3. body of optional block
1441 // 4. LD_SP loc
1442 // 5. ...
1443 //
1444 {
1445 // Reserve two slots at the top of the block.
1446 int32_t topLoc = blockTopLoc(TRUE1);
1447 insertOp(topLoc);
1448
1449 // Emit the STO_SP
1450 int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr.
1451 int32_t op = buildOp(URX_STO_SP, stoLoc);
1452 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1453
1454 // Emit the SAVE_STATE
1455 int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
1456 op = buildOp(URX_STATE_SAVE, continueLoc);
1457 fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1458
1459 // Emit the LD_SP
1460 appendOp(URX_LD_SP, stoLoc);
1461 }
1462 break;
1463
1464
1465 case doBeginMatchMode:
1466 fNewModeFlags = fModeFlags;
1467 fSetModeFlag = TRUE1;
1468 break;
1469
1470 case doMatchMode: // (?i) and similar
1471 {
1472 int32_t bit = 0;
1473 switch (fC.fChar) {
1474 case 0x69: /* 'i' */ bit = UREGEX_CASE_INSENSITIVE; break;
1475 case 0x64: /* 'd' */ bit = UREGEX_UNIX_LINES; break;
1476 case 0x6d: /* 'm' */ bit = UREGEX_MULTILINE; break;
1477 case 0x73: /* 's' */ bit = UREGEX_DOTALL; break;
1478 case 0x75: /* 'u' */ bit = 0; /* Unicode casing */ break;
1479 case 0x77: /* 'w' */ bit = UREGEX_UWORD; break;
1480 case 0x78: /* 'x' */ bit = UREGEX_COMMENTS; break;
1481 case 0x2d: /* '-' */ fSetModeFlag = FALSE0; break;
1482 default:
1483 UPRV_UNREACHABLE_EXITabort(); // Should never happen. Other chars are filtered out
1484 // by the scanner.
1485 }
1486 if (fSetModeFlag) {
1487 fNewModeFlags |= bit;
1488 } else {
1489 fNewModeFlags &= ~bit;
1490 }
1491 }
1492 break;
1493
1494 case doSetMatchMode:
1495 // Emit code to match any pending literals, using the not-yet changed match mode.
1496 fixLiterals();
1497
1498 // We've got a (?i) or similar. The match mode is being changed, but
1499 // the change is not scoped to a parenthesized block.
1500 U_ASSERT(fNewModeFlags < 0)(void)0;
1501 fModeFlags = fNewModeFlags;
1502
1503 break;
1504
1505
1506 case doMatchModeParen:
1507 // We've got a (?i: or similar. Begin a parenthesized block, save old
1508 // mode flags so they can be restored at the close of the block.
1509 //
1510 // Compile to a
1511 // - NOP, which later may be replaced by a save-state if the
1512 // parenthesized group gets a * quantifier, followed by
1513 // - NOP, which may later be replaced by a save-state if there
1514 // is an '|' alternation within the parens.
1515 {
1516 fixLiterals(FALSE0);
1517 appendOp(URX_NOP, 0);
1518 appendOp(URX_NOP, 0);
1519
1520 // On the Parentheses stack, start a new frame and add the positions
1521 // of the two NOPs (a normal non-capturing () frame, except for the
1522 // saving of the original mode flags.)
1523 fParenStack.push(fModeFlags, *fStatus);
1524 fParenStack.push(flags, *fStatus); // Frame Marker
1525 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP
1526 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
1527
1528 // Set the current mode flags to the new values.
1529 U_ASSERT(fNewModeFlags < 0)(void)0;
1530 fModeFlags = fNewModeFlags;
1531 }
1532 break;
1533
1534 case doBadModeFlag:
1535 error(U_REGEX_INVALID_FLAG);
1536 break;
1537
1538 case doSuppressComments:
1539 // We have just scanned a '(?'. We now need to prevent the character scanner from
1540 // treating a '#' as a to-the-end-of-line comment.
1541 // (This Perl compatibility just gets uglier and uglier to do...)
1542 fEOLComments = FALSE0;
1543 break;
1544
1545
1546 case doSetAddAmp:
1547 {
1548 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1549 set->add(chAmp);
1550 }
1551 break;
1552
1553 case doSetAddDash:
1554 {
1555 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1556 set->add(chDash);
1557 }
1558 break;
1559
1560 case doSetBackslash_s:
1561 {
1562 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1563 set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1564 break;
1565 }
1566
1567 case doSetBackslash_S:
1568 {
1569 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1570 UnicodeSet SSet;
1571 SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]).complement();
1572 set->addAll(SSet);
1573 break;
1574 }
1575
1576 case doSetBackslash_d:
1577 {
1578 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1579 // TODO - make a static set, ticket 6058.
1580 addCategory(set, U_GC_ND_MASK((uint32_t)1<<(U_DECIMAL_DIGIT_NUMBER)), *fStatus);
1581 break;
1582 }
1583
1584 case doSetBackslash_D:
1585 {
1586 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1587 UnicodeSet digits;
1588 // TODO - make a static set, ticket 6058.
1589 digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK((uint32_t)1<<(U_DECIMAL_DIGIT_NUMBER)), *fStatus);
1590 digits.complement();
1591 set->addAll(digits);
1592 break;
1593 }
1594
1595 case doSetBackslash_h:
1596 {
1597 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1598 UnicodeSet h;
1599 h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK((uint32_t)1<<(U_SPACE_SEPARATOR)), *fStatus);
1600 h.add((UChar32)9); // Tab
1601 set->addAll(h);
1602 break;
1603 }
1604
1605 case doSetBackslash_H:
1606 {
1607 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1608 UnicodeSet h;
1609 h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK((uint32_t)1<<(U_SPACE_SEPARATOR)), *fStatus);
1610 h.add((UChar32)9); // Tab
1611 h.complement();
1612 set->addAll(h);
1613 break;
1614 }
1615
1616 case doSetBackslash_v:
1617 {
1618 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1619 set->add((UChar32)0x0a, (UChar32)0x0d); // add range
1620 set->add((UChar32)0x85);
1621 set->add((UChar32)0x2028, (UChar32)0x2029);
1622 break;
1623 }
1624
1625 case doSetBackslash_V:
1626 {
1627 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1628 UnicodeSet v;
1629 v.add((UChar32)0x0a, (UChar32)0x0d); // add range
1630 v.add((UChar32)0x85);
1631 v.add((UChar32)0x2028, (UChar32)0x2029);
1632 v.complement();
1633 set->addAll(v);
1634 break;
1635 }
1636
1637 case doSetBackslash_w:
1638 {
1639 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1640 set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1641 break;
1642 }
1643
1644 case doSetBackslash_W:
1645 {
1646 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1647 UnicodeSet SSet;
1648 SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]).complement();
1649 set->addAll(SSet);
1650 break;
1651 }
1652
1653 case doSetBegin:
1654 {
1655 fixLiterals(FALSE0);
1656 LocalPointer<UnicodeSet> lpSet(new UnicodeSet(), *fStatus);
1657 fSetStack.push(lpSet.orphan(), *fStatus);
1658 fSetOpStack.push(setStart, *fStatus);
1659 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1660 fSetOpStack.push(setCaseClose, *fStatus);
1661 }
1662 break;
1663 }
1664
1665 case doSetBeginDifference1:
1666 // We have scanned something like [[abc]-[
1667 // Set up a new UnicodeSet for the set beginning with the just-scanned '['
1668 // Push a Difference operator, which will cause the new set to be subtracted from what
1669 // went before once it is created.
1670 setPushOp(setDifference1);
1671 fSetOpStack.push(setStart, *fStatus);
1672 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1673 fSetOpStack.push(setCaseClose, *fStatus);
1674 }
1675 break;
1676
1677 case doSetBeginIntersection1:
1678 // We have scanned something like [[abc]&[
1679 // Need both the '&' operator and the open '[' operator.
1680 setPushOp(setIntersection1);
1681 fSetOpStack.push(setStart, *fStatus);
1682 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1683 fSetOpStack.push(setCaseClose, *fStatus);
1684 }
1685 break;
1686
1687 case doSetBeginUnion:
1688 // We have scanned something like [[abc][
1689 // Need to handle the union operation explicitly [[abc] | [
1690 setPushOp(setUnion);
1691 fSetOpStack.push(setStart, *fStatus);
1692 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1693 fSetOpStack.push(setCaseClose, *fStatus);
1694 }
1695 break;
1696
1697 case doSetDifference2:
1698 // We have scanned something like [abc--
1699 // Consider this to unambiguously be a set difference operator.
1700 setPushOp(setDifference2);
1701 break;
1702
1703 case doSetEnd:
1704 // Have encountered the ']' that closes a set.
1705 // Force the evaluation of any pending operations within this set,
1706 // leave the completed set on the top of the set stack.
1707 setEval(setEnd);
1708 U_ASSERT(fSetOpStack.peeki()==setStart)(void)0;
1709 fSetOpStack.popi();
1710 break;
1711
1712 case doSetFinish:
1713 {
1714 // Finished a complete set expression, including all nested sets.
1715 // The close bracket has already triggered clearing out pending set operators,
1716 // the operator stack should be empty and the operand stack should have just
1717 // one entry, the result set.
1718 U_ASSERT(fSetOpStack.empty())(void)0;
1719 UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop();
1720 U_ASSERT(fSetStack.empty())(void)0;
1721 compileSet(theSet);
1722 break;
1723 }
1724
1725 case doSetIntersection2:
1726 // Have scanned something like [abc&&
1727 setPushOp(setIntersection2);
1728 break;
1729
1730 case doSetLiteral:
1731 // Union the just-scanned literal character into the set being built.
1732 // This operation is the highest precedence set operation, so we can always do
1733 // it immediately, without waiting to see what follows. It is necessary to perform
1734 // any pending '-' or '&' operation first, because these have the same precedence
1735 // as union-ing in a literal'
1736 {
1737 setEval(setUnion);
1738 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1739 s->add(fC.fChar);
1740 fLastSetLiteral = fC.fChar;
1741 break;
1742 }
1743
1744 case doSetLiteralEscaped:
1745 // A back-slash escaped literal character was encountered.
1746 // Processing is the same as with setLiteral, above, with the addition of
1747 // the optional check for errors on escaped ASCII letters.
1748 {
1749 if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1750 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z]
1751 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z]
1752 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1753 }
1754 setEval(setUnion);
1755 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1756 s->add(fC.fChar);
1757 fLastSetLiteral = fC.fChar;
1758 break;
1759 }
1760
1761 case doSetNamedChar:
1762 // Scanning a \N{UNICODE CHARACTER NAME}
1763 // Aside from the source of the character, the processing is identical to doSetLiteral,
1764 // above.
1765 {
1766 UChar32 c = scanNamedChar();
1767 setEval(setUnion);
1768 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1769 s->add(c);
1770 fLastSetLiteral = c;
1771 break;
1772 }
1773
1774 case doSetNamedRange:
1775 // We have scanned literal-\N{CHAR NAME}. Add the range to the set.
1776 // The left character is already in the set, and is saved in fLastSetLiteral.
1777 // The right side needs to be picked up, the scan is at the 'N'.
1778 // Lower Limit > Upper limit being an error matches both Java
1779 // and ICU UnicodeSet behavior.
1780 {
1781 UChar32 c = scanNamedChar();
1782 if (U_SUCCESS(*fStatus) && (fLastSetLiteral == U_SENTINEL(-1) || fLastSetLiteral > c)) {
1783 error(U_REGEX_INVALID_RANGE);
1784 }
1785 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1786 s->add(fLastSetLiteral, c);
1787 fLastSetLiteral = c;
1788 break;
1789 }
1790
1791
1792 case doSetNegate:
1793 // Scanned a '^' at the start of a set.
1794 // Push the negation operator onto the set op stack.
1795 // A twist for case-insensitive matching:
1796 // the case closure operation must happen _before_ negation.
1797 // But the case closure operation will already be on the stack if it's required.
1798 // This requires checking for case closure, and swapping the stack order
1799 // if it is present.
1800 {
1801 int32_t tosOp = fSetOpStack.peeki();
1802 if (tosOp == setCaseClose) {
1803 fSetOpStack.popi();
1804 fSetOpStack.push(setNegation, *fStatus);
1805 fSetOpStack.push(setCaseClose, *fStatus);
1806 } else {
1807 fSetOpStack.push(setNegation, *fStatus);
1808 }
1809 }
1810 break;
1811
1812 case doSetNoCloseError:
1813 error(U_REGEX_MISSING_CLOSE_BRACKET);
1814 break;
1815
1816 case doSetOpError:
1817 error(U_REGEX_RULE_SYNTAX); // -- or && at the end of a set. Illegal.
1818 break;
1819
1820 case doSetPosixProp:
1821 {
1822 UnicodeSet *s = scanPosixProp();
1823 if (s != NULL__null) {
1824 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1825 tos->addAll(*s);
1826 delete s;
1827 } // else error. scanProp() reported the error status already.
1828 }
1829 break;
1830
1831 case doSetProp:
1832 // Scanned a \p \P within [brackets].
1833 {
1834 UnicodeSet *s = scanProp();
1835 if (s != NULL__null) {
1836 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1837 tos->addAll(*s);
1838 delete s;
1839 } // else error. scanProp() reported the error status already.
1840 }
1841 break;
1842
1843
1844 case doSetRange:
1845 // We have scanned literal-literal. Add the range to the set.
1846 // The left character is already in the set, and is saved in fLastSetLiteral.
1847 // The right side is the current character.
1848 // Lower Limit > Upper limit being an error matches both Java
1849 // and ICU UnicodeSet behavior.
1850 {
1851
1852 if (fLastSetLiteral == U_SENTINEL(-1) || fLastSetLiteral > fC.fChar) {
1853 error(U_REGEX_INVALID_RANGE);
1854 }
1855 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1856 s->add(fLastSetLiteral, fC.fChar);
1857 break;
1858 }
1859
1860 default:
1861 UPRV_UNREACHABLE_EXITabort();
1862 }
1863
1864 if (U_FAILURE(*fStatus)) {
1865 returnVal = FALSE0;
1866 }
1867
1868 return returnVal;
1869}
1870
1871
1872
1873//------------------------------------------------------------------------------
1874//
1875// literalChar We've encountered a literal character from the pattern,
1876// or an escape sequence that reduces to a character.
1877// Add it to the string containing all literal chars/strings from
1878// the pattern.
1879//
1880//------------------------------------------------------------------------------
1881void RegexCompile::literalChar(UChar32 c) {
1882 fLiteralChars.append(c);
1883}
1884
1885
1886//------------------------------------------------------------------------------
1887//
1888// fixLiterals When compiling something that can follow a literal
1889// string in a pattern, emit the code to match the
1890// accumulated literal string.
1891//
1892// Optionally, split the last char of the string off into
1893// a single "ONE_CHAR" operation, so that quantifiers can
1894// apply to that char alone. Example: abc*
1895// The * must apply to the 'c' only.
1896//
1897//------------------------------------------------------------------------------
1898void RegexCompile::fixLiterals(UBool split) {
1899
1900 // If no literal characters have been scanned but not yet had code generated
1901 // for them, nothing needs to be done.
1902 if (fLiteralChars.length() == 0) {
1903 return;
1904 }
1905
1906 int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1907 UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1908
1909 // Split: We need to ensure that the last item in the compiled pattern
1910 // refers only to the last literal scanned in the pattern, so that
1911 // quantifiers (*, +, etc.) affect only it, and not a longer string.
1912 // Split before case folding for case insensitive matches.
1913
1914 if (split) {
1915 fLiteralChars.truncate(indexOfLastCodePoint);
1916 fixLiterals(FALSE0); // Recursive call, emit code to match the first part of the string.
1917 // Note that the truncated literal string may be empty, in which case
1918 // nothing will be emitted.
1919
1920 literalChar(lastCodePoint); // Re-add the last code point as if it were a new literal.
1921 fixLiterals(FALSE0); // Second recursive call, code for the final code point.
1922 return;
1923 }
1924
1925 // If we are doing case-insensitive matching, case fold the string. This may expand
1926 // the string, e.g. the German sharp-s turns into "ss"
1927 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1928 fLiteralChars.foldCase();
1929 indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1930 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1931 }
1932
1933 if (indexOfLastCodePoint == 0) {
1934 // Single character, emit a URX_ONECHAR op to match it.
1935 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
1936 u_hasBinaryPropertyu_hasBinaryProperty_71(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
1937 appendOp(URX_ONECHAR_I, lastCodePoint);
1938 } else {
1939 appendOp(URX_ONECHAR, lastCodePoint);
1940 }
1941 } else {
1942 // Two or more chars, emit a URX_STRING to match them.
1943 if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length() > 0x00ffffff) {
1944 error(U_REGEX_PATTERN_TOO_BIG);
1945 }
1946 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1947 appendOp(URX_STRING_I, fRXPat->fLiteralText.length());
1948 } else {
1949 // TODO here: add optimization to split case sensitive strings of length two
1950 // into two single char ops, for efficiency.
1951 appendOp(URX_STRING, fRXPat->fLiteralText.length());
1952 }
1953 appendOp(URX_STRING_LEN, fLiteralChars.length());
1954
1955 // Add this string into the accumulated strings of the compiled pattern.
1956 fRXPat->fLiteralText.append(fLiteralChars);
1957 }
1958
1959 fLiteralChars.remove();
1960}
1961
1962
1963int32_t RegexCompile::buildOp(int32_t type, int32_t val) {
1964 if (U_FAILURE(*fStatus)) {
1965 return 0;
1966 }
1967 if (type < 0 || type > 255) {
1968 UPRV_UNREACHABLE_EXITabort();
1969 }
1970 if (val > 0x00ffffff) {
1971 UPRV_UNREACHABLE_EXITabort();
1972 }
1973 if (val < 0) {
1974 if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) {
1975 UPRV_UNREACHABLE_EXITabort();
1976 }
1977 if (URX_TYPE(val)((uint32_t)(val) >> 24) != 0xff) {
1978 UPRV_UNREACHABLE_EXITabort();
1979 }
1980 type = URX_RESERVED_OP_N;
1981 }
1982 return (type << 24) | val;
1983}
1984
1985
1986//------------------------------------------------------------------------------
1987//
1988// appendOp() Append a new instruction onto the compiled pattern
1989// Includes error checking, limiting the size of the
1990// pattern to lengths that can be represented in the
1991// 24 bit operand field of an instruction.
1992//
1993//------------------------------------------------------------------------------
1994void RegexCompile::appendOp(int32_t op) {
1995 if (U_FAILURE(*fStatus)) {
1996 return;
1997 }
1998 fRXPat->fCompiledPat->addElement(op, *fStatus);
1999 if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) {
2000 error(U_REGEX_PATTERN_TOO_BIG);
2001 }
2002}
2003
2004void RegexCompile::appendOp(int32_t type, int32_t val) {
2005 appendOp(buildOp(type, val));
2006}
2007
2008
2009//------------------------------------------------------------------------------
2010//
2011// insertOp() Insert a slot for a new opcode into the already
2012// compiled pattern code.
2013//
2014// Fill the slot with a NOP. Our caller will replace it
2015// with what they really wanted.
2016//
2017//------------------------------------------------------------------------------
2018void RegexCompile::insertOp(int32_t where) {
2019 UVector64 *code = fRXPat->fCompiledPat;
2020 U_ASSERT(where>0 && where < code->size())(void)0;
2021
2022 int32_t nop = buildOp(URX_NOP, 0);
2023 code->insertElementAt(nop, where, *fStatus);
2024
2025 // Walk through the pattern, looking for any ops with targets that
2026 // were moved down by the insert. Fix them.
2027 int32_t loc;
2028 for (loc=0; loc<code->size(); loc++) {
2029 int32_t op = (int32_t)code->elementAti(loc);
2030 int32_t opType = URX_TYPE(op)((uint32_t)(op) >> 24);
2031 int32_t opValue = URX_VAL(op)((op) & 0xffffff);
2032 if ((opType == URX_JMP ||
2033 opType == URX_JMPX ||
2034 opType == URX_STATE_SAVE ||
2035 opType == URX_CTR_LOOP ||
2036 opType == URX_CTR_LOOP_NG ||
2037 opType == URX_JMP_SAV ||
2038 opType == URX_JMP_SAV_X ||
2039 opType == URX_RELOC_OPRND) && opValue > where) {
2040 // Target location for this opcode is after the insertion point and
2041 // needs to be incremented to adjust for the insertion.
2042 opValue++;
2043 op = buildOp(opType, opValue);
2044 code->setElementAt(op, loc);
2045 }
2046 }
2047
2048 // Now fix up the parentheses stack. All positive values in it are locations in
2049 // the compiled pattern. (Negative values are frame boundaries, and don't need fixing.)
2050 for (loc=0; loc<fParenStack.size(); loc++) {
2051 int32_t x = fParenStack.elementAti(loc);
2052 U_ASSERT(x < code->size())(void)0;
2053 if (x>where) {
2054 x++;
2055 fParenStack.setElementAt(x, loc);
2056 }
2057 }
2058
2059 if (fMatchCloseParen > where) {
2060 fMatchCloseParen++;
2061 }
2062 if (fMatchOpenParen > where) {
2063 fMatchOpenParen++;
2064 }
2065}
2066
2067
2068//------------------------------------------------------------------------------
2069//
2070// allocateData() Allocate storage in the matcher's static data area.
2071// Return the index for the newly allocated data.
2072// The storage won't actually exist until we are running a match
2073// operation, but the storage indexes are inserted into various
2074// opcodes while compiling the pattern.
2075//
2076//------------------------------------------------------------------------------
2077int32_t RegexCompile::allocateData(int32_t size) {
2078 if (U_FAILURE(*fStatus)) {
2079 return 0;
2080 }
2081 if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) {
2082 error(U_REGEX_INTERNAL_ERROR);
2083 return 0;
2084 }
2085 int32_t dataIndex = fRXPat->fDataSize;
2086 fRXPat->fDataSize += size;
2087 if (fRXPat->fDataSize >= 0x00fffff0) {
2088 error(U_REGEX_INTERNAL_ERROR);
2089 }
2090 return dataIndex;
2091}
2092
2093
2094//------------------------------------------------------------------------------
2095//
2096// allocateStackData() Allocate space in the back-tracking stack frame.
2097// Return the index for the newly allocated data.
2098// The frame indexes are inserted into various
2099// opcodes while compiling the pattern, meaning that frame
2100// size must be restricted to the size that will fit
2101// as an operand (24 bits).
2102//
2103//------------------------------------------------------------------------------
2104int32_t RegexCompile::allocateStackData(int32_t size) {
2105 if (U_FAILURE(*fStatus)) {
2106 return 0;
2107 }
2108 if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) {
2109 error(U_REGEX_INTERNAL_ERROR);
2110 return 0;
2111 }
2112 int32_t dataIndex = fRXPat->fFrameSize;
2113 fRXPat->fFrameSize += size;
2114 if (fRXPat->fFrameSize >= 0x00fffff0) {
2115 error(U_REGEX_PATTERN_TOO_BIG);
2116 }
2117 return dataIndex;
2118}
2119
2120
2121//------------------------------------------------------------------------------
2122//
2123// blockTopLoc() Find or create a location in the compiled pattern
2124// at the start of the operation or block that has
2125// just been compiled. Needed when a quantifier (* or
2126// whatever) appears, and we need to add an operation
2127// at the start of the thing being quantified.
2128//
2129// (Parenthesized Blocks) have a slot with a NOP that
2130// is reserved for this purpose. .* or similar don't
2131// and a slot needs to be added.
2132//
2133// parameter reserveLoc : TRUE - ensure that there is space to add an opcode
2134// at the returned location.
2135// FALSE - just return the address,
2136// do not reserve a location there.
2137//
2138//------------------------------------------------------------------------------
2139int32_t RegexCompile::blockTopLoc(UBool reserveLoc) {
2140 int32_t theLoc;
2141 fixLiterals(TRUE1); // Emit code for any pending literals.
2142 // If last item was a string, emit separate op for the its last char.
2143 if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
2144 {
2145 // The item just processed is a parenthesized block.
2146 theLoc = fMatchOpenParen; // A slot is already reserved for us.
2147 U_ASSERT(theLoc > 0)(void)0;
2148 U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP)(void)0;
2149 }
2150 else {
2151 // Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
2152 // No slot for STATE_SAVE was pre-reserved in the compiled code.
2153 // We need to make space now.
2154 theLoc = fRXPat->fCompiledPat->size()-1;
2155 int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc);
2156 if (URX_TYPE(opAtTheLoc)((uint32_t)(opAtTheLoc) >> 24) == URX_STRING_LEN) {
2157 // Strings take two opcode, we want the position of the first one.
2158 // We can have a string at this point if a single character case-folded to two.
2159 theLoc--;
2160 }
2161 if (reserveLoc) {
2162 int32_t nop = buildOp(URX_NOP, 0);
2163 fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
2164 }
2165 }
2166 return theLoc;
2167}
2168
2169
2170
2171//------------------------------------------------------------------------------
2172//
2173// handleCloseParen When compiling a close paren, we need to go back
2174// and fix up any JMP or SAVE operations within the
2175// parenthesized block that need to target the end
2176// of the block. The locations of these are kept on
2177// the paretheses stack.
2178//
2179// This function is called both when encountering a
2180// real ) and at the end of the pattern.
2181//
2182//------------------------------------------------------------------------------
2183void RegexCompile::handleCloseParen() {
2184 int32_t patIdx;
2185 int32_t patOp;
2186 if (fParenStack.size() <= 0) {
2187 error(U_REGEX_MISMATCHED_PAREN);
2188 return;
2189 }
2190
2191 // Emit code for any pending literals.
2192 fixLiterals(FALSE0);
2193
2194 // Fixup any operations within the just-closed parenthesized group
2195 // that need to reference the end of the (block).
2196 // (The first one popped from the stack is an unused slot for
2197 // alternation (OR) state save, but applying the fixup to it does no harm.)
2198 for (;;) {
2199 patIdx = fParenStack.popi();
2200 if (patIdx < 0) {
2201 // value < 0 flags the start of the frame on the paren stack.
2202 break;
2203 }
2204 U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size())(void)0;
2205 patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx);
2206 U_ASSERT(URX_VAL(patOp) == 0)(void)0; // Branch target for JMP should not be set.
2207 patOp |= fRXPat->fCompiledPat->size(); // Set it now.
2208 fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
2209 fMatchOpenParen = patIdx;
2210 }
2211
2212 // At the close of any parenthesized block, restore the match mode flags to
2213 // the value they had at the open paren. Saved value is
2214 // at the top of the paren stack.
2215 fModeFlags = fParenStack.popi();
2216 U_ASSERT(fModeFlags < 0)(void)0;
2217
2218 // DO any additional fixups, depending on the specific kind of
2219 // parentesized grouping this is
2220
2221 switch (patIdx) {
2222 case plain:
2223 case flags:
2224 // No additional fixups required.
2225 // (Grouping-only parentheses)
2226 break;
2227 case capturing:
2228 // Capturing Parentheses.
2229 // Insert a End Capture op into the pattern.
2230 // The frame offset of the variables for this cg is obtained from the
2231 // start capture op and put it into the end-capture op.
2232 {
2233 int32_t captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2234 U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE)(void)0;
2235
2236 int32_t frameVarLocation = URX_VAL(captureOp)((captureOp) & 0xffffff);
2237 appendOp(URX_END_CAPTURE, frameVarLocation);
2238 }
2239 break;
2240 case atomic:
2241 // Atomic Parenthesis.
2242 // Insert a LD_SP operation to restore the state stack to the position
2243 // it was when the atomic parens were entered.
2244 {
2245 int32_t stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2246 U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP)(void)0;
2247 int32_t stoLoc = URX_VAL(stoOp)((stoOp) & 0xffffff);
2248 appendOp(URX_LD_SP, stoLoc);
2249 }
2250 break;
2251
2252 case lookAhead:
2253 {
2254 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2255 U_ASSERT(URX_TYPE(startOp) == URX_LA_START)(void)0;
2256 int32_t dataLoc = URX_VAL(startOp)((startOp) & 0xffffff);
2257 appendOp(URX_LA_END, dataLoc);
2258 }
2259 break;
2260
2261 case negLookAhead:
2262 {
2263 // See comment at doOpenLookAheadNeg
2264 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
2265 U_ASSERT(URX_TYPE(startOp) == URX_LA_START)(void)0;
2266 int32_t dataLoc = URX_VAL(startOp)((startOp) & 0xffffff);
2267 appendOp(URX_LA_END, dataLoc);
2268 appendOp(URX_BACKTRACK, 0);
2269 appendOp(URX_LA_END, dataLoc);
2270
2271 // Patch the URX_SAVE near the top of the block.
2272 // The destination of the SAVE is the final LA_END that was just added.
2273 int32_t saveOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
Value stored to 'saveOp' during its initialization is never read
2274 U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE)(void)0;
2275 int32_t dest = fRXPat->fCompiledPat->size()-1;
2276 saveOp = buildOp(URX_STATE_SAVE, dest);
2277 fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
2278 }
2279 break;
2280
2281 case lookBehind:
2282 {
2283 // See comment at doOpenLookBehind.
2284
2285 // Append the URX_LB_END and URX_LA_END to the compiled pattern.
2286 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4);
2287 U_ASSERT(URX_TYPE(startOp) == URX_LB_START)(void)0;
2288 int32_t dataLoc = URX_VAL(startOp)((startOp) & 0xffffff);
2289 appendOp(URX_LB_END, dataLoc);
2290 appendOp(URX_LA_END, dataLoc);
2291
2292 // Determine the min and max bounds for the length of the
2293 // string that the pattern can match.
2294 // An unbounded upper limit is an error.
2295 int32_t patEnd = fRXPat->fCompiledPat->size() - 1;
2296 int32_t minML = minMatchLength(fMatchOpenParen, patEnd);
2297 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd);
2298 if (URX_TYPE(maxML)((uint32_t)(maxML) >> 24) != 0) {
2299 error(U_REGEX_LOOK_BEHIND_LIMIT);
2300 break;
2301 }
2302 if (maxML == INT32_MAX(2147483647)) {
2303 error(U_REGEX_LOOK_BEHIND_LIMIT);
2304 break;
2305 }
2306 if (minML == INT32_MAX(2147483647)) {
2307 // This condition happens when no match is possible, such as with a
2308 // [set] expression containing no elements.
2309 // In principle, the generated code to evaluate the expression could be deleted,
2310 // but it's probably not worth the complication.
2311 minML = 0;
2312 }
2313 U_ASSERT(minML <= maxML)(void)0;
2314
2315 // Insert the min and max match len bounds into the URX_LB_CONT op that
2316 // appears at the top of the look-behind block, at location fMatchOpenParen+1
2317 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-2);
2318 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-1);
2319
2320 }
2321 break;
2322
2323
2324
2325 case lookBehindN:
2326 {
2327 // See comment at doOpenLookBehindNeg.
2328
2329 // Append the URX_LBN_END to the compiled pattern.
2330 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2331 U_ASSERT(URX_TYPE(startOp) == URX_LB_START)(void)0;
2332 int32_t dataLoc = URX_VAL(startOp)((startOp) & 0xffffff);
2333 appendOp(URX_LBN_END, dataLoc);
2334
2335 // Determine the min and max bounds for the length of the
2336 // string that the pattern can match.
2337 // An unbounded upper limit is an error.
2338 int32_t patEnd = fRXPat->fCompiledPat->size() - 1;
2339 int32_t minML = minMatchLength(fMatchOpenParen, patEnd);
2340 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd);
2341 if (URX_TYPE(maxML)((uint32_t)(maxML) >> 24) != 0) {
2342 error(U_REGEX_LOOK_BEHIND_LIMIT);
2343 break;
2344 }
2345 if (maxML == INT32_MAX(2147483647)) {
2346 error(U_REGEX_LOOK_BEHIND_LIMIT);
2347 break;
2348 }
2349 if (minML == INT32_MAX(2147483647)) {
2350 // This condition happens when no match is possible, such as with a
2351 // [set] expression containing no elements.
2352 // In principle, the generated code to evaluate the expression could be deleted,
2353 // but it's probably not worth the complication.
2354 minML = 0;
2355 }
2356
2357 U_ASSERT(minML <= maxML)(void)0;
2358
2359 // Insert the min and max match len bounds into the URX_LB_CONT op that
2360 // appears at the top of the look-behind block, at location fMatchOpenParen+1
2361 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-3);
2362 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-2);
2363
2364 // Insert the pattern location to continue at after a successful match
2365 // as the last operand of the URX_LBN_CONT
2366 int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
2367 fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-1);
2368 }
2369 break;
2370
2371
2372
2373 default:
2374 UPRV_UNREACHABLE_EXITabort();
2375 }
2376
2377 // remember the next location in the compiled pattern.
2378 // The compilation of Quantifiers will look at this to see whether its looping
2379 // over a parenthesized block or a single item
2380 fMatchCloseParen = fRXPat->fCompiledPat->size();
2381}
2382
2383
2384
2385//------------------------------------------------------------------------------
2386//
2387// compileSet Compile the pattern operations for a reference to a
2388// UnicodeSet.
2389//
2390//------------------------------------------------------------------------------
2391void RegexCompile::compileSet(UnicodeSet *theSet)
2392{
2393 if (theSet == NULL__null) {
2394 return;
2395 }
2396 // Remove any strings from the set.
2397 // There shouldn't be any, but just in case.
2398 // (Case Closure can add them; if we had a simple case closure available that
2399 // ignored strings, that would be better.)
2400 theSet->removeAllStrings();
2401 int32_t setSize = theSet->size();
2402
2403 switch (setSize) {
2404 case 0:
2405 {
2406 // Set of no elements. Always fails to match.
2407 appendOp(URX_BACKTRACK, 0);
2408 delete theSet;
2409 }
2410 break;
2411
2412 case 1:
2413 {
2414 // The set contains only a single code point. Put it into
2415 // the compiled pattern as a single char operation rather
2416 // than a set, and discard the set itself.
2417 literalChar(theSet->charAt(0));
2418 delete theSet;
2419 }
2420 break;
2421
2422 default:
2423 {
2424 // The set contains two or more chars. (the normal case)
2425 // Put it into the compiled pattern as a set.
2426 theSet->freeze();
2427 int32_t setNumber = fRXPat->fSets->size();
2428 fRXPat->fSets->addElement(theSet, *fStatus);
2429 if (U_SUCCESS(*fStatus)) {
2430 appendOp(URX_SETREF, setNumber);
2431 } else {
2432 delete theSet;
2433 }
2434 }
2435 }
2436}
2437
2438
2439//------------------------------------------------------------------------------
2440//
2441// compileInterval Generate the code for a {min, max} style interval quantifier.
2442// Except for the specific opcodes used, the code is the same
2443// for all three types (greedy, non-greedy, possessive) of
2444// intervals. The opcodes are supplied as parameters.
2445// (There are two sets of opcodes - greedy & possessive use the
2446// same ones, while non-greedy has it's own.)
2447//
2448// The code for interval loops has this form:
2449// 0 CTR_INIT counter loc (in stack frame)
2450// 1 5 patt address of CTR_LOOP at bottom of block
2451// 2 min count
2452// 3 max count (-1 for unbounded)
2453// 4 ... block to be iterated over
2454// 5 CTR_LOOP
2455//
2456// In
2457//------------------------------------------------------------------------------
2458void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp)
2459{
2460 // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
2461 // four slots in the compiled code. Reserve them.
2462 int32_t topOfBlock = blockTopLoc(TRUE1);
2463 insertOp(topOfBlock);
2464 insertOp(topOfBlock);
2465 insertOp(topOfBlock);
2466
2467 // The operands for the CTR_INIT opcode include the index in the matcher data
2468 // of the counter. Allocate it now. There are two data items
2469 // counterLoc --> Loop counter
2470 // +1 --> Input index (for breaking non-progressing loops)
2471 // (Only present if unbounded upper limit on loop)
2472 int32_t dataSize = fIntervalUpper < 0 ? 2 : 1;
2473 int32_t counterLoc = allocateStackData(dataSize);
2474
2475 int32_t op = buildOp(InitOp, counterLoc);
2476 fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
2477
2478 // The second operand of CTR_INIT is the location following the end of the loop.
2479 // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
2480 // compilation of something later on causes the code to grow and the target
2481 // position to move.
2482 int32_t loopEnd = fRXPat->fCompiledPat->size();
2483 op = buildOp(URX_RELOC_OPRND, loopEnd);
2484 fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
2485
2486 // Followed by the min and max counts.
2487 fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
2488 fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
2489
2490 // Append the CTR_LOOP op. The operand is the location of the CTR_INIT op.
2491 // Goes at end of the block being looped over, so just append to the code so far.
2492 appendOp(LoopOp, topOfBlock);
2493
2494 if ((fIntervalLow & 0xff000000) != 0 ||
2495 (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) {
2496 error(U_REGEX_NUMBER_TOO_BIG);
2497 }
2498
2499 if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
2500 error(U_REGEX_MAX_LT_MIN);
2501 }
2502}
2503
2504
2505
2506UBool RegexCompile::compileInlineInterval() {
2507 if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
2508 // Too big to inline. Fail, which will cause looping code to be generated.
2509 // (Upper < Lower picks up unbounded upper and errors, both.)
2510 return FALSE0;
2511 }
2512
2513 int32_t topOfBlock = blockTopLoc(FALSE0);
2514 if (fIntervalUpper == 0) {
2515 // Pathological case. Attempt no matches, as if the block doesn't exist.
2516 // Discard the generated code for the block.
2517 // If the block included parens, discard the info pertaining to them as well.
2518 fRXPat->fCompiledPat->setSize(topOfBlock);
2519 if (fMatchOpenParen >= topOfBlock) {
2520 fMatchOpenParen = -1;
2521 }
2522 if (fMatchCloseParen >= topOfBlock) {
2523 fMatchCloseParen = -1;
2524 }
2525 return TRUE1;
2526 }
2527
2528 if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
2529 // The thing being repeated is not a single op, but some
2530 // more complex block. Do it as a loop, not inlines.
2531 // Note that things "repeated" a max of once are handled as inline, because
2532 // the one copy of the code already generated is just fine.
2533 return FALSE0;
2534 }
2535
2536 // Pick up the opcode that is to be repeated
2537 //
2538 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock);
2539
2540 // Compute the pattern location where the inline sequence
2541 // will end, and set up the state save op that will be needed.
2542 //
2543 int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
2544 + fIntervalUpper + (fIntervalUpper-fIntervalLow);
2545 int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc);
2546 if (fIntervalLow == 0) {
2547 insertOp(topOfBlock);
2548 fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
2549 }
2550
2551
2552
2553 // Loop, emitting the op for the thing being repeated each time.
2554 // Loop starts at 1 because one instance of the op already exists in the pattern,
2555 // it was put there when it was originally encountered.
2556 int32_t i;
2557 for (i=1; i<fIntervalUpper; i++ ) {
2558 if (i >= fIntervalLow) {
2559 appendOp(saveOp);
2560 }
2561 appendOp(op);
2562 }
2563 return TRUE1;
2564}
2565
2566
2567
2568//------------------------------------------------------------------------------
2569//
2570// caseInsensitiveStart given a single code point from a pattern string, determine the
2571// set of characters that could potentially begin a case-insensitive
2572// match of a string beginning with that character, using full Unicode
2573// case insensitive matching.
2574//
2575// This is used in optimizing find().
2576//
2577// closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
2578// misses cases like this:
2579// A string from the pattern begins with 'ss' (although all we know
2580// in this context is that it begins with 's')
2581// The pattern could match a string beginning with a German sharp-s
2582//
2583// To the ordinary case closure for a character c, we add all other
2584// characters cx where the case closure of cx includes a string form that begins
2585// with the original character c.
2586//
2587// This function could be made smarter. The full pattern string is available
2588// and it would be possible to verify that the extra characters being added
2589// to the starting set fully match, rather than having just a first-char of the
2590// folded form match.
2591//
2592//------------------------------------------------------------------------------
2593void RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) {
2594
2595// Machine Generated below.
2596// It may need updating with new versions of Unicode.
2597// Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
2598// The update tool is here:
2599// https://github.com/unicode-org/icu/tree/main/tools/unicode/c/genregexcasing
2600
2601// Machine Generated Data. Do not hand edit.
2602 static const UChar32 RECaseFixCodePoints[] = {
2603 0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc,
2604 0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565,
2605 0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07,
2606 0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61,
2607 0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000};
2608
2609 static const int16_t RECaseFixStringOffsets[] = {
2610 0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10,
2611 0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f,
2612 0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43,
2613 0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57,
2614 0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0};
2615
2616 static const int16_t RECaseFixCounts[] = {
2617 0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1,
2618 0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1,
2619 0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2620 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2621 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0};
2622
2623 static const UChar RECaseFixData[] = {
2624 0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf,
2625 0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3,
2626 0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3,
2627 0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3,
2628 0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14,
2629 0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83,
2630 0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90,
2631 0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95,
2632 0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2,
2633 0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7,
2634 0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0};
2635
2636// End of machine generated data.
2637
2638 if (c < UCHAR_MIN_VALUE0 || c > UCHAR_MAX_VALUE0x10ffff) {
2639 // This function should never be called with an invalid input character.
2640 UPRV_UNREACHABLE_EXITabort();
2641 } else if (u_hasBinaryPropertyu_hasBinaryProperty_71(c, UCHAR_CASE_SENSITIVE)) {
2642 UChar32 caseFoldedC = u_foldCaseu_foldCase_71(c, U_FOLD_CASE_DEFAULT0);
2643 starterChars->set(caseFoldedC, caseFoldedC);
2644
2645 int32_t i;
2646 for (i=0; RECaseFixCodePoints[i]<c ; i++) {
2647 // Simple linear search through the sorted list of interesting code points.
2648 }
2649
2650 if (RECaseFixCodePoints[i] == c) {
2651 int32_t dataIndex = RECaseFixStringOffsets[i];
2652 int32_t numCharsToAdd = RECaseFixCounts[i];
2653 UChar32 cpToAdd = 0;
2654 for (int32_t j=0; j<numCharsToAdd; j++) {
2655 U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd)do { (cpToAdd)=(RECaseFixData)[(dataIndex)++]; if((((cpToAdd)
&0xfffffc00)==0xd800)) { (cpToAdd)=(((UChar32)((cpToAdd))
<<10UL)+(UChar32)((RECaseFixData)[(dataIndex)++])-((0xd800
<<10UL)+0xdc00-0x10000)); } } while (false)
;
2656 starterChars->add(cpToAdd);
2657 }
2658 }
2659
2660 starterChars->closeOver(USET_CASE_INSENSITIVE);
2661 starterChars->removeAllStrings();
2662 } else {
2663 // Not a cased character. Just return it alone.
2664 starterChars->set(c, c);
2665 }
2666}
2667
2668
2669// Increment with overflow check.
2670// val and delta will both be positive.
2671
2672static int32_t safeIncrement(int32_t val, int32_t delta) {
2673 if (INT32_MAX(2147483647) - val > delta) {
2674 return val + delta;
2675 } else {
2676 return INT32_MAX(2147483647);
2677 }
2678}
2679
2680
2681//------------------------------------------------------------------------------
2682//
2683// matchStartType Determine how a match can start.
2684// Used to optimize find() operations.
2685//
2686// Operation is very similar to minMatchLength(). Walk the compiled
2687// pattern, keeping an on-going minimum-match-length. For any
2688// op where the min match coming in is zero, add that ops possible
2689// starting matches to the possible starts for the overall pattern.
2690//
2691//------------------------------------------------------------------------------
2692void RegexCompile::matchStartType() {
2693 if (U_FAILURE(*fStatus)) {
2694 return;
2695 }
2696
2697
2698 int32_t loc; // Location in the pattern of the current op being processed.
2699 int32_t op; // The op being processed
2700 int32_t opType; // The opcode type of the op
2701 int32_t currentLen = 0; // Minimum length of a match to this point (loc) in the pattern
2702 int32_t numInitialStrings = 0; // Number of strings encountered that could match at start.
2703
2704 UBool atStart = TRUE1; // True if no part of the pattern yet encountered
2705 // could have advanced the position in a match.
2706 // (Maximum match length so far == 0)
2707
2708 // forwardedLength is a vector holding minimum-match-length values that
2709 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
2710 // It must be one longer than the pattern being checked because some ops
2711 // will jmp to a end-of-block+1 location from within a block, and we must
2712 // count those when checking the block.
2713 int32_t end = fRXPat->fCompiledPat->size();
2714 UVector32 forwardedLength(end+1, *fStatus);
2715 forwardedLength.setSize(end+1);
2716 for (loc=3; loc<end; loc++) {
2717 forwardedLength.setElementAt(INT32_MAX(2147483647), loc);
2718 }
2719
2720 for (loc = 3; loc<end; loc++) {
2721 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2722 opType = URX_TYPE(op)((uint32_t)(op) >> 24);
2723
2724 // The loop is advancing linearly through the pattern.
2725 // If the op we are now at was the destination of a branch in the pattern,
2726 // and that path has a shorter minimum length than the current accumulated value,
2727 // replace the current accumulated value.
2728 if (forwardedLength.elementAti(loc) < currentLen) {
2729 currentLen = forwardedLength.elementAti(loc);
2730 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX)(void)0;
2731 }
2732
2733 switch (opType) {
2734 // Ops that don't change the total length matched
2735 case URX_RESERVED_OP:
2736 case URX_END:
2737 case URX_FAIL:
2738 case URX_STRING_LEN:
2739 case URX_NOP:
2740 case URX_START_CAPTURE:
2741 case URX_END_CAPTURE:
2742 case URX_BACKSLASH_B:
2743 case URX_BACKSLASH_BU:
2744 case URX_BACKSLASH_G:
2745 case URX_BACKSLASH_Z:
2746 case URX_DOLLAR:
2747 case URX_DOLLAR_M:
2748 case URX_DOLLAR_D:
2749 case URX_DOLLAR_MD:
2750 case URX_RELOC_OPRND:
2751 case URX_STO_INP_LOC:
2752 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
2753 case URX_BACKREF_I:
2754
2755 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
2756 case URX_LD_SP:
2757 break;
2758
2759 case URX_CARET:
2760 if (atStart) {
2761 fRXPat->fStartType = START_START;
2762 }
2763 break;
2764
2765 case URX_CARET_M:
2766 case URX_CARET_M_UNIX:
2767 if (atStart) {
2768 fRXPat->fStartType = START_LINE;
2769 }
2770 break;
2771
2772 case URX_ONECHAR:
2773 if (currentLen == 0) {
2774 // This character could appear at the start of a match.
2775 // Add it to the set of possible starting characters.
2776 fRXPat->fInitialChars->add(URX_VAL(op)((op) & 0xffffff));
2777 numInitialStrings += 2;
2778 }
2779 currentLen = safeIncrement(currentLen, 1);
2780 atStart = FALSE0;
2781 break;
2782
2783
2784 case URX_SETREF:
2785 if (currentLen == 0) {
2786 int32_t sn = URX_VAL(op)((op) & 0xffffff);
2787 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size())(void)0;
2788 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2789 fRXPat->fInitialChars->addAll(*s);
2790 numInitialStrings += 2;
2791 }
2792 currentLen = safeIncrement(currentLen, 1);
2793 atStart = FALSE0;
2794 break;
2795
2796 case URX_LOOP_SR_I:
2797 // [Set]*, like a SETREF, above, in what it can match,
2798 // but may not match at all, so currentLen is not incremented.
2799 if (currentLen == 0) {
2800 int32_t sn = URX_VAL(op)((op) & 0xffffff);
2801 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size())(void)0;
2802 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2803 fRXPat->fInitialChars->addAll(*s);
2804 numInitialStrings += 2;
2805 }
2806 atStart = FALSE0;
2807 break;
2808
2809 case URX_LOOP_DOT_I:
2810 if (currentLen == 0) {
2811 // .* at the start of a pattern.
2812 // Any character can begin the match.
2813 fRXPat->fInitialChars->clear();
2814 fRXPat->fInitialChars->complement();
2815 numInitialStrings += 2;
2816 }
2817 atStart = FALSE0;
2818 break;
2819
2820
2821 case URX_STATIC_SETREF:
2822 if (currentLen == 0) {
2823 int32_t sn = URX_VAL(op)((op) & 0xffffff);
2824 U_ASSERT(sn>0 && sn<URX_LAST_SET)(void)0;
2825 const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[sn];
2826 fRXPat->fInitialChars->addAll(s);
2827 numInitialStrings += 2;
2828 }
2829 currentLen = safeIncrement(currentLen, 1);
2830 atStart = FALSE0;
2831 break;
2832
2833
2834
2835 case URX_STAT_SETREF_N:
2836 if (currentLen == 0) {
2837 int32_t sn = URX_VAL(op)((op) & 0xffffff);
2838 UnicodeSet sc;
2839 sc.addAll(RegexStaticSets::gStaticSets->fPropSets[sn]).complement();
2840 fRXPat->fInitialChars->addAll(sc);
2841 numInitialStrings += 2;
2842 }
2843 currentLen = safeIncrement(currentLen, 1);
2844 atStart = FALSE0;
2845 break;
2846
2847
2848
2849 case URX_BACKSLASH_D:
2850 // Digit Char
2851 if (currentLen == 0) {
2852 UnicodeSet s;
2853 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK((uint32_t)1<<(U_DECIMAL_DIGIT_NUMBER)), *fStatus);
2854 if (URX_VAL(op)((op) & 0xffffff) != 0) {
2855 s.complement();
2856 }
2857 fRXPat->fInitialChars->addAll(s);
2858 numInitialStrings += 2;
2859 }
2860 currentLen = safeIncrement(currentLen, 1);
2861 atStart = FALSE0;
2862 break;
2863
2864
2865 case URX_BACKSLASH_H:
2866 // Horiz white space
2867 if (currentLen == 0) {
2868 UnicodeSet s;
2869 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK((uint32_t)1<<(U_SPACE_SEPARATOR)), *fStatus);
2870 s.add((UChar32)9); // Tab
2871 if (URX_VAL(op)((op) & 0xffffff) != 0) {
2872 s.complement();
2873 }
2874 fRXPat->fInitialChars->addAll(s);
2875 numInitialStrings += 2;
2876 }
2877 currentLen = safeIncrement(currentLen, 1);
2878 atStart = FALSE0;
2879 break;
2880
2881
2882 case URX_BACKSLASH_R: // Any line ending sequence
2883 case URX_BACKSLASH_V: // Any line ending code point, with optional negation
2884 if (currentLen == 0) {
2885 UnicodeSet s;
2886 s.add((UChar32)0x0a, (UChar32)0x0d); // add range
2887 s.add((UChar32)0x85);
2888 s.add((UChar32)0x2028, (UChar32)0x2029);
2889 if (URX_VAL(op)((op) & 0xffffff) != 0) {
2890 // Complement option applies to URX_BACKSLASH_V only.
2891 s.complement();
2892 }
2893 fRXPat->fInitialChars->addAll(s);
2894 numInitialStrings += 2;
2895 }
2896 currentLen = safeIncrement(currentLen, 1);
2897 atStart = FALSE0;
2898 break;
2899
2900
2901
2902 case URX_ONECHAR_I:
2903 // Case Insensitive Single Character.
2904 if (currentLen == 0) {
2905 UChar32 c = URX_VAL(op)((op) & 0xffffff);
2906 if (u_hasBinaryPropertyu_hasBinaryProperty_71(c, UCHAR_CASE_SENSITIVE)) {
2907 UnicodeSet starters(c, c);
2908 starters.closeOver(USET_CASE_INSENSITIVE);
2909 // findCaseInsensitiveStarters(c, &starters);
2910 // For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
2911 // The expanded folding can't match the pattern.
2912 fRXPat->fInitialChars->addAll(starters);
2913 } else {
2914 // Char has no case variants. Just add it as-is to the
2915 // set of possible starting chars.
2916 fRXPat->fInitialChars->add(c);
2917 }
2918 numInitialStrings += 2;
2919 }
2920 currentLen = safeIncrement(currentLen, 1);
2921 atStart = FALSE0;
2922 break;
2923
2924
2925 case URX_BACKSLASH_X: // Grapheme Cluster. Minimum is 1, max unbounded.
2926 case URX_DOTANY_ALL: // . matches one or two.
2927 case URX_DOTANY:
2928 case URX_DOTANY_UNIX:
2929 if (currentLen == 0) {
2930 // These constructs are all bad news when they appear at the start
2931 // of a match. Any character can begin the match.
2932 fRXPat->fInitialChars->clear();
2933 fRXPat->fInitialChars->complement();
2934 numInitialStrings += 2;
2935 }
2936 currentLen = safeIncrement(currentLen, 1);
2937 atStart = FALSE0;
2938 break;
2939
2940
2941 case URX_JMPX:
2942 loc++; // Except for extra operand on URX_JMPX, same as URX_JMP.
2943 U_FALLTHROUGH[[clang::fallthrough]];
2944 case URX_JMP:
2945 {
2946 int32_t jmpDest = URX_VAL(op)((op) & 0xffffff);
2947 if (jmpDest < loc) {
2948 // Loop of some kind. Can safely ignore, the worst that will happen
2949 // is that we understate the true minimum length
2950 currentLen = forwardedLength.elementAti(loc+1);
2951
2952 } else {
2953 // Forward jump. Propagate the current min length to the target loc of the jump.
2954 U_ASSERT(jmpDest <= end+1)(void)0;
2955 if (forwardedLength.elementAti(jmpDest) > currentLen) {
2956 forwardedLength.setElementAt(currentLen, jmpDest);
2957 }
2958 }
2959 }
2960 atStart = FALSE0;
2961 break;
2962
2963 case URX_JMP_SAV:
2964 case URX_JMP_SAV_X:
2965 // Combo of state save to the next loc, + jmp backwards.
2966 // Net effect on min. length computation is nothing.
2967 atStart = FALSE0;
2968 break;
2969
2970 case URX_BACKTRACK:
2971 // Fails are kind of like a branch, except that the min length was
2972 // propagated already, by the state save.
2973 currentLen = forwardedLength.elementAti(loc+1);
2974 atStart = FALSE0;
2975 break;
2976
2977
2978 case URX_STATE_SAVE:
2979 {
2980 // State Save, for forward jumps, propagate the current minimum.
2981 // of the state save.
2982 int32_t jmpDest = URX_VAL(op)((op) & 0xffffff);
2983 if (jmpDest > loc) {
2984 if (currentLen < forwardedLength.elementAti(jmpDest)) {
2985 forwardedLength.setElementAt(currentLen, jmpDest);
2986 }
2987 }
2988 }
2989 atStart = FALSE0;
2990 break;
2991
2992
2993
2994
2995 case URX_STRING:
2996 {
2997 loc++;
2998 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2999 int32_t stringLen = URX_VAL(stringLenOp)((stringLenOp) & 0xffffff);
3000 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN)(void)0;
3001 U_ASSERT(stringLenOp >= 2)(void)0;
3002 if (currentLen == 0) {
3003 // Add the starting character of this string to the set of possible starting
3004 // characters for this pattern.
3005 int32_t stringStartIdx = URX_VAL(op)((op) & 0xffffff);
3006 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
3007 fRXPat->fInitialChars->add(c);
3008
3009 // Remember this string. After the entire pattern has been checked,
3010 // if nothing else is identified that can start a match, we'll use it.
3011 numInitialStrings++;
3012 fRXPat->fInitialStringIdx = stringStartIdx;
3013 fRXPat->fInitialStringLen = stringLen;
3014 }
3015
3016 currentLen = safeIncrement(currentLen, stringLen);
3017 atStart = FALSE0;
3018 }
3019 break;
3020
3021 case URX_STRING_I:
3022 {
3023 // Case-insensitive string. Unlike exact-match strings, we won't
3024 // attempt a string search for possible match positions. But we
3025 // do update the set of possible starting characters.
3026 loc++;
3027 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3028 int32_t stringLen = URX_VAL(stringLenOp)((stringLenOp) & 0xffffff);
3029 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN)(void)0;
3030 U_ASSERT(stringLenOp >= 2)(void)0;
3031 if (currentLen == 0) {
3032 // Add the starting character of this string to the set of possible starting
3033 // characters for this pattern.
3034 int32_t stringStartIdx = URX_VAL(op)((op) & 0xffffff);
3035 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
3036 UnicodeSet s;
3037 findCaseInsensitiveStarters(c, &s);
3038 fRXPat->fInitialChars->addAll(s);
3039 numInitialStrings += 2; // Matching on an initial string not possible.
3040 }
3041 currentLen = safeIncrement(currentLen, stringLen);
3042 atStart = FALSE0;
3043 }
3044 break;
3045
3046 case URX_CTR_INIT:
3047 case URX_CTR_INIT_NG:
3048 {
3049 // Loop Init Ops. These don't change the min length, but they are 4 word ops
3050 // so location must be updated accordingly.
3051 // Loop Init Ops.
3052 // If the min loop count == 0
3053 // move loc forwards to the end of the loop, skipping over the body.
3054 // If the min count is > 0,
3055 // continue normal processing of the body of the loop.
3056 int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3057 loopEndLoc = URX_VAL(loopEndLoc)((loopEndLoc) & 0xffffff);
3058 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3059 if (minLoopCount == 0) {
3060 // Min Loop Count of 0, treat like a forward branch and
3061 // move the current minimum length up to the target
3062 // (end of loop) location.
3063 U_ASSERT(loopEndLoc <= end+1)(void)0;
3064 if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
3065 forwardedLength.setElementAt(currentLen, loopEndLoc);
3066 }
3067 }
3068 loc+=3; // Skips over operands of CTR_INIT
3069 }
3070 atStart = FALSE0;
3071 break;
3072
3073
3074 case URX_CTR_LOOP:
3075 case URX_CTR_LOOP_NG:
3076 // Loop ops.
3077 // The jump is conditional, backwards only.
3078 atStart = FALSE0;
3079 break;
3080
3081 case URX_LOOP_C:
3082 // More loop ops. These state-save to themselves.
3083 // don't change the minimum match
3084 atStart = FALSE0;
3085 break;
3086
3087
3088 case URX_LA_START:
3089 case URX_LB_START:
3090 {
3091 // Look-around. Scan forward until the matching look-ahead end,
3092 // without processing the look-around block. This is overly pessimistic.
3093
3094 // Keep track of the nesting depth of look-around blocks. Boilerplate code for
3095 // lookahead contains two LA_END instructions, so count goes up by two
3096 // for each LA_START.
3097 int32_t depth = (opType == URX_LA_START? 2: 1);
3098 for (;;) {
3099 loc++;
3100 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3101 if (URX_TYPE(op)((uint32_t)(op) >> 24) == URX_LA_START) {
3102 depth+=2;
3103 }
3104 if (URX_TYPE(op)((uint32_t)(op) >> 24) == URX_LB_START) {
3105 depth++;
3106 }
3107 if (URX_TYPE(op)((uint32_t)(op) >> 24) == URX_LA_END || URX_TYPE(op)((uint32_t)(op) >> 24)==URX_LBN_END) {
3108 depth--;
3109 if (depth == 0) {
3110 break;
3111 }
3112 }
3113 if (URX_TYPE(op)((uint32_t)(op) >> 24) == URX_STATE_SAVE) {
3114 // Need this because neg lookahead blocks will FAIL to outside
3115 // of the block.
3116 int32_t jmpDest = URX_VAL(op)((op) & 0xffffff);
3117 if (jmpDest > loc) {
3118 if (currentLen < forwardedLength.elementAti(jmpDest)) {
3119 forwardedLength.setElementAt(currentLen, jmpDest);
3120 }
3121 }
3122 }
3123 U_ASSERT(loc <= end)(void)0;
3124 }
3125 }
3126 break;
3127
3128 case URX_LA_END:
3129 case URX_LB_CONT:
3130 case URX_LB_END:
3131 case URX_LBN_CONT:
3132 case URX_LBN_END:
3133 UPRV_UNREACHABLE_EXITabort(); // Shouldn't get here. These ops should be
3134 // consumed by the scan in URX_LA_START and LB_START
3135 default:
3136 UPRV_UNREACHABLE_EXITabort();
3137 }
3138
3139 }
3140
3141
3142 // We have finished walking through the ops. Check whether some forward jump
3143 // propagated a shorter length to location end+1.
3144 if (forwardedLength.elementAti(end+1) < currentLen) {
3145 currentLen = forwardedLength.elementAti(end+1);
3146 }
3147
3148
3149 fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
3150
3151
3152 // Sort out what we should check for when looking for candidate match start positions.
3153 // In order of preference,
3154 // 1. Start of input text buffer.
3155 // 2. A literal string.
3156 // 3. Start of line in multi-line mode.
3157 // 4. A single literal character.
3158 // 5. A character from a set of characters.
3159 //
3160 if (fRXPat->fStartType == START_START) {
3161 // Match only at the start of an input text string.
3162 // start type is already set. We're done.
3163 } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
3164 // Match beginning only with a literal string.
3165 UChar32 c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
3166 U_ASSERT(fRXPat->fInitialChars->contains(c))(void)0;
3167 fRXPat->fStartType = START_STRING;
3168 fRXPat->fInitialChar = c;
3169 } else if (fRXPat->fStartType == START_LINE) {
3170 // Match at start of line in Multi-Line mode.
3171 // Nothing to do here; everything is already set.
3172 } else if (fRXPat->fMinMatchLen == 0) {
3173 // Zero length match possible. We could start anywhere.
3174 fRXPat->fStartType = START_NO_INFO;
3175 } else if (fRXPat->fInitialChars->size() == 1) {
3176 // All matches begin with the same char.
3177 fRXPat->fStartType = START_CHAR;
3178 fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
3179 U_ASSERT(fRXPat->fInitialChar != (UChar32)-1)(void)0;
3180 } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE0 &&
3181 fRXPat->fMinMatchLen > 0) {
3182 // Matches start with a set of character smaller than the set of all chars.
3183 fRXPat->fStartType = START_SET;
3184 } else {
3185 // Matches can start with anything
3186 fRXPat->fStartType = START_NO_INFO;
3187 }
3188
3189 return;
3190}
3191
3192
3193
3194//------------------------------------------------------------------------------
3195//
3196// minMatchLength Calculate the length of the shortest string that could
3197// match the specified pattern.
3198// Length is in 16 bit code units, not code points.
3199//
3200// The calculated length may not be exact. The returned
3201// value may be shorter than the actual minimum; it must
3202// never be longer.
3203//
3204// start and end are the range of p-code operations to be
3205// examined. The endpoints are included in the range.
3206//
3207//------------------------------------------------------------------------------
3208int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
3209 if (U_FAILURE(*fStatus)) {
3210 return 0;
3211 }
3212
3213 U_ASSERT(start <= end)(void)0;
3214 U_ASSERT(end < fRXPat->fCompiledPat->size())(void)0;
3215
3216
3217 int32_t loc;
3218 int32_t op;
3219 int32_t opType;
3220 int32_t currentLen = 0;
3221
3222
3223 // forwardedLength is a vector holding minimum-match-length values that
3224 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
3225 // It must be one longer than the pattern being checked because some ops
3226 // will jmp to a end-of-block+1 location from within a block, and we must
3227 // count those when checking the block.
3228 UVector32 forwardedLength(end+2, *fStatus);
3229 forwardedLength.setSize(end+2);
3230 for (loc=start; loc<=end+1; loc++) {
3231 forwardedLength.setElementAt(INT32_MAX(2147483647), loc);
3232 }
3233
3234 for (loc = start; loc<=end; loc++) {
3235 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3236 opType = URX_TYPE(op)((uint32_t)(op) >> 24);
3237
3238 // The loop is advancing linearly through the pattern.
3239 // If the op we are now at was the destination of a branch in the pattern,
3240 // and that path has a shorter minimum length than the current accumulated value,
3241 // replace the current accumulated value.
3242 // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); // MinLength == INT32_MAX for some
3243 // no-match-possible cases.
3244 if (forwardedLength.elementAti(loc) < currentLen) {
3245 currentLen = forwardedLength.elementAti(loc);
3246 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX)(void)0;
3247 }
3248
3249 switch (opType) {
3250 // Ops that don't change the total length matched
3251 case URX_RESERVED_OP:
3252 case URX_END:
3253 case URX_STRING_LEN:
3254 case URX_NOP:
3255 case URX_START_CAPTURE:
3256 case URX_END_CAPTURE:
3257 case URX_BACKSLASH_B:
3258 case URX_BACKSLASH_BU:
3259 case URX_BACKSLASH_G:
3260 case URX_BACKSLASH_Z:
3261 case URX_CARET:
3262 case URX_DOLLAR:
3263 case URX_DOLLAR_M:
3264 case URX_DOLLAR_D:
3265 case URX_DOLLAR_MD:
3266 case URX_RELOC_OPRND:
3267 case URX_STO_INP_LOC:
3268 case URX_CARET_M:
3269 case URX_CARET_M_UNIX:
3270 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
3271 case URX_BACKREF_I:
3272
3273 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
3274 case URX_LD_SP:
3275
3276 case URX_JMP_SAV:
3277 case URX_JMP_SAV_X:
3278 break;
3279
3280
3281 // Ops that match a minimum of one character (one or two 16 bit code units.)
3282 //
3283 case URX_ONECHAR:
3284 case URX_STATIC_SETREF:
3285 case URX_STAT_SETREF_N:
3286 case URX_SETREF:
3287 case URX_BACKSLASH_D:
3288 case URX_BACKSLASH_H:
3289 case URX_BACKSLASH_R:
3290 case URX_BACKSLASH_V:
3291 case URX_ONECHAR_I:
3292 case URX_BACKSLASH_X: // Grapheme Cluster. Minimum is 1, max unbounded.
3293 case URX_DOTANY_ALL: // . matches one or two.
3294 case URX_DOTANY:
3295 case URX_DOTANY_UNIX:
3296 currentLen = safeIncrement(currentLen, 1);
3297 break;
3298
3299
3300 case URX_JMPX:
3301 loc++; // URX_JMPX has an extra operand, ignored here,
3302 // otherwise processed identically to URX_JMP.
3303 U_FALLTHROUGH[[clang::fallthrough]];
3304 case URX_JMP:
3305 {
3306 int32_t jmpDest = URX_VAL(op)((op) & 0xffffff);
3307 if (jmpDest < loc) {
3308 // Loop of some kind. Can safely ignore, the worst that will happen
3309 // is that we understate the true minimum length
3310 currentLen = forwardedLength.elementAti(loc+1);
3311 } else {
3312 // Forward jump. Propagate the current min length to the target loc of the jump.
3313 U_ASSERT(jmpDest <= end+1)(void)0;
3314 if (forwardedLength.elementAti(jmpDest) > currentLen) {
3315 forwardedLength.setElementAt(currentLen, jmpDest);
3316 }
3317 }
3318 }
3319 break;
3320
3321 case URX_BACKTRACK:
3322 {
3323 // Back-tracks are kind of like a branch, except that the min length was
3324 // propagated already, by the state save.
3325 currentLen = forwardedLength.elementAti(loc+1);
3326 }
3327 break;
3328
3329
3330 case URX_STATE_SAVE:
3331 {
3332 // State Save, for forward jumps, propagate the current minimum.
3333 // of the state save.
3334 int32_t jmpDest = URX_VAL(op)((op) & 0xffffff);
3335 if (jmpDest > loc) {
3336 if (currentLen < forwardedLength.elementAti(jmpDest)) {
3337 forwardedLength.setElementAt(currentLen, jmpDest);
3338 }
3339 }
3340 }
3341 break;
3342
3343
3344 case URX_STRING:
3345 {
3346 loc++;
3347 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3348 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp)((stringLenOp) & 0xffffff));
3349 }
3350 break;
3351
3352
3353 case URX_STRING_I:
3354 {
3355 loc++;
3356 // TODO: with full case folding, matching input text may be shorter than
3357 // the string we have here. More smarts could put some bounds on it.
3358 // Assume a min length of one for now. A min length of zero causes
3359 // optimization failures for a pattern like "string"+
3360 // currentLen += URX_VAL(stringLenOp);
3361 currentLen = safeIncrement(currentLen, 1);
3362 }
3363 break;
3364
3365 case URX_CTR_INIT:
3366 case URX_CTR_INIT_NG:
3367 {
3368 // Loop Init Ops.
3369 // If the min loop count == 0
3370 // move loc forwards to the end of the loop, skipping over the body.
3371 // If the min count is > 0,
3372 // continue normal processing of the body of the loop.
3373 int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3374 loopEndLoc = URX_VAL(loopEndLoc)((loopEndLoc) & 0xffffff);
3375 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3376 if (minLoopCount == 0) {
3377 loc = loopEndLoc;
3378 } else {
3379 loc+=3; // Skips over operands of CTR_INIT
3380 }
3381 }
3382 break;
3383
3384
3385 case URX_CTR_LOOP:
3386 case URX_CTR_LOOP_NG:
3387 // Loop ops.
3388 // The jump is conditional, backwards only.
3389 break;
3390
3391 case URX_LOOP_SR_I:
3392 case URX_LOOP_DOT_I:
3393 case URX_LOOP_C:
3394 // More loop ops. These state-save to themselves.
3395 // don't change the minimum match - could match nothing at all.
3396 break;
3397
3398
3399 case URX_LA_START:
3400 case URX_LB_START:
3401 {
3402 // Look-around. Scan forward until the matching look-ahead end,
3403 // without processing the look-around block. This is overly pessimistic for look-ahead,
3404 // it assumes that the look-ahead match might be zero-length.
3405 // TODO: Positive lookahead could recursively do the block, then continue
3406 // with the longer of the block or the value coming in. Ticket 6060
3407 int32_t depth = (opType == URX_LA_START? 2: 1);
3408 for (;;) {
3409 loc++;
3410 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3411 if (URX_TYPE(op)((uint32_t)(op) >> 24) == URX_LA_START) {
3412 // The boilerplate for look-ahead includes two LA_END instructions,
3413 // Depth will be decremented by each one when it is seen.
3414 depth += 2;
3415 }
3416 if (URX_TYPE(op)((uint32_t)(op) >> 24) == URX_LB_START) {
3417 depth++;
3418 }
3419 if (URX_TYPE(op)((uint32_t)(op) >> 24) == URX_LA_END) {
3420 depth--;
3421 if (depth == 0) {
3422 break;
3423 }
3424 }
3425 if (URX_TYPE(op)((uint32_t)(op) >> 24)==URX_LBN_END) {
3426 depth--;
3427 if (depth == 0) {
3428 break;
3429 }
3430 }
3431 if (URX_TYPE(op)((uint32_t)(op) >> 24) == URX_STATE_SAVE) {
3432 // Need this because neg lookahead blocks will FAIL to outside
3433 // of the block.
3434 int32_t jmpDest = URX_VAL(op)((op) & 0xffffff);
3435 if (jmpDest > loc) {
3436 if (currentLen < forwardedLength.elementAti(jmpDest)) {
3437 forwardedLength.setElementAt(currentLen, jmpDest);
3438 }
3439 }
3440 }
3441 U_ASSERT(loc <= end)(void)0;
3442 }
3443 }
3444 break;
3445
3446 case URX_LA_END:
3447 case URX_LB_CONT:
3448 case URX_LB_END:
3449 case URX_LBN_CONT:
3450 case URX_LBN_END:
3451 // Only come here if the matching URX_LA_START or URX_LB_START was not in the
3452 // range being sized, which happens when measuring size of look-behind blocks.
3453 break;
3454
3455 default:
3456 UPRV_UNREACHABLE_EXITabort();
3457 }
3458
3459 }
3460
3461 // We have finished walking through the ops. Check whether some forward jump
3462 // propagated a shorter length to location end+1.
3463 if (forwardedLength.elementAti(end+1) < currentLen) {
3464 currentLen = forwardedLength.elementAti(end+1);
3465 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX)(void)0;
3466 }
3467
3468 return currentLen;
3469}
3470
3471//------------------------------------------------------------------------------
3472//
3473// maxMatchLength Calculate the length of the longest string that could
3474// match the specified pattern.
3475// Length is in 16 bit code units, not code points.
3476//
3477// The calculated length may not be exact. The returned
3478// value may be longer than the actual maximum; it must
3479// never be shorter.
3480//
3481// start, end: the range of the pattern to check.
3482// end is inclusive.
3483//
3484//------------------------------------------------------------------------------
3485int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
3486 if (U_FAILURE(*fStatus)) {
3487 return 0;
3488 }
3489 U_ASSERT(start <= end)(void)0;
3490 U_ASSERT(end < fRXPat->fCompiledPat->size())(void)0;
3491
3492 int32_t loc;
3493 int32_t op;
3494 int32_t opType;
3495 int32_t currentLen = 0;
3496 UVector32 forwardedLength(end+1, *fStatus);
3497 forwardedLength.setSize(end+1);
3498
3499 for (loc=start; loc<=end; loc++) {
3500 forwardedLength.setElementAt(0, loc);
3501 }
3502
3503 for (loc = start; loc<=end; loc++) {
3504 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3505 opType = URX_TYPE(op)((uint32_t)(op) >> 24);
3506
3507 // The loop is advancing linearly through the pattern.
3508 // If the op we are now at was the destination of a branch in the pattern,
3509 // and that path has a longer maximum length than the current accumulated value,
3510 // replace the current accumulated value.
3511 if (forwardedLength.elementAti(loc) > currentLen) {
3512 currentLen = forwardedLength.elementAti(loc);
3513 }
3514
3515 switch (opType) {
3516 // Ops that don't change the total length matched
3517 case URX_RESERVED_OP:
3518 case URX_END:
3519 case URX_STRING_LEN:
3520 case URX_NOP:
3521 case URX_START_CAPTURE:
3522 case URX_END_CAPTURE:
3523 case URX_BACKSLASH_B:
3524 case URX_BACKSLASH_BU:
3525 case URX_BACKSLASH_G:
3526 case URX_BACKSLASH_Z:
3527 case URX_CARET:
3528 case URX_DOLLAR:
3529 case URX_DOLLAR_M:
3530 case URX_DOLLAR_D:
3531 case URX_DOLLAR_MD:
3532 case URX_RELOC_OPRND:
3533 case URX_STO_INP_LOC:
3534 case URX_CARET_M:
3535 case URX_CARET_M_UNIX:
3536
3537 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
3538 case URX_LD_SP:
3539
3540 case URX_LB_END:
3541 case URX_LB_CONT:
3542 case URX_LBN_CONT:
3543 case URX_LBN_END:
3544 break;
3545
3546
3547 // Ops that increase that cause an unbounded increase in the length
3548 // of a matched string, or that increase it a hard to characterize way.
3549 // Call the max length unbounded, and stop further checking.
3550 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
3551 case URX_BACKREF_I:
3552 case URX_BACKSLASH_X: // Grapheme Cluster. Minimum is 1, max unbounded.
3553 currentLen = INT32_MAX(2147483647);
3554 break;
3555
3556
3557 // Ops that match a max of one character (possibly two 16 bit code units.)
3558 //
3559 case URX_STATIC_SETREF:
3560 case URX_STAT_SETREF_N:
3561 case URX_SETREF:
3562 case URX_BACKSLASH_D:
3563 case URX_BACKSLASH_H:
3564 case URX_BACKSLASH_R:
3565 case URX_BACKSLASH_V:
3566 case URX_ONECHAR_I:
3567 case URX_DOTANY_ALL:
3568 case URX_DOTANY:
3569 case URX_DOTANY_UNIX:
3570 currentLen = safeIncrement(currentLen, 2);
3571 break;
3572
3573 // Single literal character. Increase current max length by one or two,
3574 // depending on whether the char is in the supplementary range.
3575 case URX_ONECHAR:
3576 currentLen = safeIncrement(currentLen, 1);
3577 if (URX_VAL(op)((op) & 0xffffff) > 0x10000) {
3578 currentLen = safeIncrement(currentLen, 1);
3579 }
3580 break;
3581
3582 // Jumps.
3583 //
3584 case URX_JMP:
3585 case URX_JMPX:
3586 case URX_JMP_SAV:
3587 case URX_JMP_SAV_X:
3588 {
3589 int32_t jmpDest = URX_VAL(op)((op) & 0xffffff);
3590 if (jmpDest < loc) {
3591 // Loop of some kind. Max match length is unbounded.
3592 currentLen = INT32_MAX(2147483647);
3593 } else {
3594 // Forward jump. Propagate the current min length to the target loc of the jump.
3595 if (forwardedLength.elementAti(jmpDest) < currentLen) {
3596 forwardedLength.setElementAt(currentLen, jmpDest);
3597 }
3598 currentLen = 0;
3599 }
3600 }
3601 break;
3602
3603 case URX_BACKTRACK:
3604 // back-tracks are kind of like a branch, except that the max length was
3605 // propagated already, by the state save.
3606 currentLen = forwardedLength.elementAti(loc+1);
3607 break;
3608
3609
3610 case URX_STATE_SAVE:
3611 {
3612 // State Save, for forward jumps, propagate the current minimum.
3613 // of the state save.
3614 // For backwards jumps, they create a loop, maximum
3615 // match length is unbounded.
3616 int32_t jmpDest = URX_VAL(op)((op) & 0xffffff);
3617 if (jmpDest > loc) {
3618 if (currentLen > forwardedLength.elementAti(jmpDest)) {
3619 forwardedLength.setElementAt(currentLen, jmpDest);
3620 }
3621 } else {
3622 currentLen = INT32_MAX(2147483647);
3623 }
3624 }
3625 break;
3626
3627
3628
3629
3630 case URX_STRING:
3631 {
3632 loc++;
3633 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3634 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp)((stringLenOp) & 0xffffff));
3635 break;
3636 }
3637
3638 case URX_STRING_I:
3639 // TODO: This code assumes that any user string that matches will be no longer
3640 // than our compiled string, with case insensitive matching.
3641 // Our compiled string has been case-folded already.
3642 //
3643 // Any matching user string will have no more code points than our
3644 // compiled (folded) string. Folding may add code points, but
3645 // not remove them.
3646 //
3647 // There is a potential problem if a supplemental code point
3648 // case-folds to a BMP code point. In this case our compiled string
3649 // could be shorter (in code units) than a matching user string.
3650 //
3651 // At this time (Unicode 6.1) there are no such characters, and this case
3652 // is not being handled. A test, intltest regex/Bug9283, will fail if
3653 // any problematic characters are added to Unicode.
3654 //
3655 // If this happens, we can make a set of the BMP chars that the
3656 // troublesome supplementals fold to, scan our string, and bump the
3657 // currentLen one extra for each that is found.
3658 //
3659 {
3660 loc++;
3661 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3662 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp)((stringLenOp) & 0xffffff));
3663 }
3664 break;
3665
3666 case URX_CTR_INIT:
3667 case URX_CTR_INIT_NG:
3668 // For Loops, recursively call this function on the pattern for the loop body,
3669 // then multiply the result by the maximum loop count.
3670 {
3671 int32_t loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1))((fRXPat->fCompiledPat->elementAti(loc+1)) & 0xffffff
)
;
3672 if (loopEndLoc == loc+4) {
3673 // Loop has an empty body. No affect on max match length.
3674 // Continue processing with code after the loop end.
3675 loc = loopEndLoc;
3676 break;
3677 }
3678
3679 int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc+3));
3680 if (maxLoopCount == -1) {
3681 // Unbounded Loop. No upper bound on match length.
3682 currentLen = INT32_MAX(2147483647);
3683 break;
3684 }
3685
3686 U_ASSERT(loopEndLoc >= loc+4)(void)0;
3687 int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Recursive call.
3688 int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCount;
3689 if (updatedLen >= INT32_MAX(2147483647)) {
3690 currentLen = INT32_MAX(2147483647);
3691 break;
3692 }
3693 currentLen = (int32_t)updatedLen;
3694 loc = loopEndLoc;
3695 break;
3696 }
3697
3698 case URX_CTR_LOOP:
3699 case URX_CTR_LOOP_NG:
3700 // These opcodes will be skipped over by code for URX_CTR_INIT.
3701 // We shouldn't encounter them here.
3702 UPRV_UNREACHABLE_EXITabort();
3703
3704 case URX_LOOP_SR_I:
3705 case URX_LOOP_DOT_I:
3706 case URX_LOOP_C:
3707 // For anything to do with loops, make the match length unbounded.
3708 currentLen = INT32_MAX(2147483647);
3709 break;
3710
3711
3712
3713 case URX_LA_START:
3714 case URX_LA_END:
3715 // Look-ahead. Just ignore, treat the look-ahead block as if
3716 // it were normal pattern. Gives a too-long match length,
3717 // but good enough for now.
3718 break;
3719
3720 // End of look-ahead ops should always be consumed by the processing at
3721 // the URX_LA_START op.
3722 // UPRV_UNREACHABLE_EXIT;
3723
3724 case URX_LB_START:
3725 {
3726 // Look-behind. Scan forward until the matching look-around end,
3727 // without processing the look-behind block.
3728 int32_t dataLoc = URX_VAL(op)((op) & 0xffffff);
3729 for (loc = loc + 1; loc <= end; ++loc) {
3730 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3731 int32_t opType = URX_TYPE(op)((uint32_t)(op) >> 24);
3732 if ((opType == URX_LA_END || opType == URX_LBN_END) && (URX_VAL(op)((op) & 0xffffff) == dataLoc)) {
3733 break;
3734 }
3735 }
3736 U_ASSERT(loc <= end)(void)0;
3737 }
3738 break;
3739
3740 default:
3741 UPRV_UNREACHABLE_EXITabort();
3742 }
3743
3744
3745 if (currentLen == INT32_MAX(2147483647)) {
3746 // The maximum length is unbounded.
3747 // Stop further processing of the pattern.
3748 break;
3749 }
3750
3751 }
3752 return currentLen;
3753
3754}
3755
3756
3757//------------------------------------------------------------------------------
3758//
3759// stripNOPs Remove any NOP operations from the compiled pattern code.
3760// Extra NOPs are inserted for some constructs during the initial
3761// code generation to provide locations that may be patched later.
3762// Many end up unneeded, and are removed by this function.
3763//
3764// In order to minimize the number of passes through the pattern,
3765// back-reference fixup is also performed here (adjusting
3766// back-reference operands to point to the correct frame offsets).
3767//
3768//------------------------------------------------------------------------------
3769void RegexCompile::stripNOPs() {
3770
3771 if (U_FAILURE(*fStatus)) {
3772 return;
3773 }
3774
3775 int32_t end = fRXPat->fCompiledPat->size();
3776 UVector32 deltas(end, *fStatus);
3777
3778 // Make a first pass over the code, computing the amount that things
3779 // will be offset at each location in the original code.
3780 int32_t loc;
3781 int32_t d = 0;
3782 for (loc=0; loc<end; loc++) {
3783 deltas.addElement(d, *fStatus);
3784 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3785 if (URX_TYPE(op)((uint32_t)(op) >> 24) == URX_NOP) {
3786 d++;
3787 }
3788 }
3789
3790 UnicodeString caseStringBuffer;
3791
3792 // Make a second pass over the code, removing the NOPs by moving following
3793 // code up, and patching operands that refer to code locations that
3794 // are being moved. The array of offsets from the first step is used
3795 // to compute the new operand values.
3796 int32_t src;
3797 int32_t dst = 0;
3798 for (src=0; src<end; src++) {
3799 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src);
3800 int32_t opType = URX_TYPE(op)((uint32_t)(op) >> 24);
3801 switch (opType) {
3802 case URX_NOP:
3803 break;
3804
3805 case URX_STATE_SAVE:
3806 case URX_JMP:
3807 case URX_CTR_LOOP:
3808 case URX_CTR_LOOP_NG:
3809 case URX_RELOC_OPRND:
3810 case URX_JMPX:
3811 case URX_JMP_SAV:
3812 case URX_JMP_SAV_X:
3813 // These are instructions with operands that refer to code locations.
3814 {
3815 int32_t operandAddress = URX_VAL(op)((op) & 0xffffff);
3816 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size())(void)0;
3817 int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
3818 op = buildOp(opType, fixedOperandAddress);
3819 fRXPat->fCompiledPat->setElementAt(op, dst);
3820 dst++;
3821 break;
3822 }
3823
3824 case URX_BACKREF:
3825 case URX_BACKREF_I:
3826 {
3827 int32_t where = URX_VAL(op)((op) & 0xffffff);
3828 if (where > fRXPat->fGroupMap->size()) {
3829 error(U_REGEX_INVALID_BACK_REF);
3830 break;
3831 }
3832 where = fRXPat->fGroupMap->elementAti(where-1);
3833 op = buildOp(opType, where);
3834 fRXPat->fCompiledPat->setElementAt(op, dst);
3835 dst++;
3836
3837 fRXPat->fNeedsAltInput = TRUE1;
3838 break;
3839 }
3840 case URX_RESERVED_OP:
3841 case URX_RESERVED_OP_N:
3842 case URX_BACKTRACK:
3843 case URX_END:
3844 case URX_ONECHAR:
3845 case URX_STRING:
3846 case URX_STRING_LEN:
3847 case URX_START_CAPTURE:
3848 case URX_END_CAPTURE:
3849 case URX_STATIC_SETREF:
3850 case URX_STAT_SETREF_N:
3851 case URX_SETREF:
3852 case URX_DOTANY:
3853 case URX_FAIL:
3854 case URX_BACKSLASH_B:
3855 case URX_BACKSLASH_BU:
3856 case URX_BACKSLASH_G:
3857 case URX_BACKSLASH_X:
3858 case URX_BACKSLASH_Z:
3859 case URX_DOTANY_ALL:
3860 case URX_BACKSLASH_D:
3861 case URX_CARET:
3862 case URX_DOLLAR:
3863 case URX_CTR_INIT:
3864 case URX_CTR_INIT_NG:
3865 case URX_DOTANY_UNIX:
3866 case URX_STO_SP:
3867 case URX_LD_SP:
3868 case URX_STO_INP_LOC:
3869 case URX_LA_START:
3870 case URX_LA_END:
3871 case URX_ONECHAR_I:
3872 case URX_STRING_I:
3873 case URX_DOLLAR_M:
3874 case URX_CARET_M:
3875 case URX_CARET_M_UNIX:
3876 case URX_LB_START:
3877 case URX_LB_CONT:
3878 case URX_LB_END:
3879 case URX_LBN_CONT:
3880 case URX_LBN_END:
3881 case URX_LOOP_SR_I:
3882 case URX_LOOP_DOT_I:
3883 case URX_LOOP_C:
3884 case URX_DOLLAR_D:
3885 case URX_DOLLAR_MD:
3886 case URX_BACKSLASH_H:
3887 case URX_BACKSLASH_R:
3888 case URX_BACKSLASH_V:
3889 // These instructions are unaltered by the relocation.
3890 fRXPat->fCompiledPat->setElementAt(op, dst);
3891 dst++;
3892 break;
3893
3894 default:
3895 // Some op is unaccounted for.
3896 UPRV_UNREACHABLE_EXITabort();
3897 }
3898 }
3899
3900 fRXPat->fCompiledPat->setSize(dst);
3901}
3902
3903
3904
3905
3906//------------------------------------------------------------------------------
3907//
3908// Error Report a rule parse error.
3909// Only report it if no previous error has been recorded.
3910//
3911//------------------------------------------------------------------------------
3912void RegexCompile::error(UErrorCode e) {
3913 if (U_SUCCESS(*fStatus) || e == U_MEMORY_ALLOCATION_ERROR) {
3914 *fStatus = e;
3915 // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
3916 // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
3917 // int64_t. If the values of the latter are out of range for the former,
3918 // set them to the appropriate "field not supported" values.
3919 if (fLineNum > 0x7FFFFFFF) {
3920 fParseErr->line = 0;
3921 fParseErr->offset = -1;
3922 } else if (fCharNum > 0x7FFFFFFF) {
3923 fParseErr->line = (int32_t)fLineNum;
3924 fParseErr->offset = -1;
3925 } else {
3926 fParseErr->line = (int32_t)fLineNum;
3927 fParseErr->offset = (int32_t)fCharNum;
3928 }
3929
3930 UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
3931
3932 // Fill in the context.
3933 // Note: extractBetween() pins supplied indices to the string bounds.
3934 uprv_memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext)):: memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext
))
;
3935 uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext)):: memset(fParseErr->postContext, 0, sizeof(fParseErr->
postContext))
;
3936 utext_extractutext_extract_71(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
3937 utext_extractutext_extract_71(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
3938 }
3939}
3940
3941
3942//
3943// Assorted Unicode character constants.
3944// Numeric because there is no portable way to enter them as literals.
3945// (Think EBCDIC).
3946//
3947static const UChar chCR = 0x0d; // New lines, for terminating comments.
3948static const UChar chLF = 0x0a; // Line Feed
3949static const UChar chPound = 0x23; // '#', introduces a comment.
3950static const UChar chDigit0 = 0x30; // '0'
3951static const UChar chDigit7 = 0x37; // '9'
3952static const UChar chColon = 0x3A; // ':'
3953static const UChar chE = 0x45; // 'E'
3954static const UChar chQ = 0x51; // 'Q'
3955//static const UChar chN = 0x4E; // 'N'
3956static const UChar chP = 0x50; // 'P'
3957static const UChar chBackSlash = 0x5c; // '\' introduces a char escape
3958//static const UChar chLBracket = 0x5b; // '['
3959static const UChar chRBracket = 0x5d; // ']'
3960static const UChar chUp = 0x5e; // '^'
3961static const UChar chLowerP = 0x70;
3962static const UChar chLBrace = 0x7b; // '{'
3963static const UChar chRBrace = 0x7d; // '}'
3964static const UChar chNEL = 0x85; // NEL newline variant
3965static const UChar chLS = 0x2028; // Unicode Line Separator
3966
3967
3968//------------------------------------------------------------------------------
3969//
3970// nextCharLL Low Level Next Char from the regex pattern.
3971// Get a char from the string, keep track of input position
3972// for error reporting.
3973//
3974//------------------------------------------------------------------------------
3975UChar32 RegexCompile::nextCharLL() {
3976 UChar32 ch;
3977
3978 if (fPeekChar != -1) {
3979 ch = fPeekChar;
3980 fPeekChar = -1;
3981 return ch;
3982 }
3983
3984 // assume we're already in the right place
3985 ch = UTEXT_NEXT32(fRXPat->fPattern)((fRXPat->fPattern)->chunkOffset < (fRXPat->fPattern
)->chunkLength && ((fRXPat->fPattern)->chunkContents
)[(fRXPat->fPattern)->chunkOffset]<0xd800 ? ((fRXPat
->fPattern)->chunkContents)[((fRXPat->fPattern)->
chunkOffset)++] : utext_next32_71(fRXPat->fPattern))
;
3986 if (ch == U_SENTINEL(-1)) {
3987 return ch;
3988 }
3989
3990 if (ch == chCR ||
3991 ch == chNEL ||
3992 ch == chLS ||
3993 (ch == chLF && fLastChar != chCR)) {
3994 // Character is starting a new line. Bump up the line number, and
3995 // reset the column to 0.
3996 fLineNum++;
3997 fCharNum=0;
3998 }
3999 else {
4000 // Character is not starting a new line. Except in the case of a
4001 // LF following a CR, increment the column position.
4002 if (ch != chLF) {
4003 fCharNum++;
4004 }
4005 }
4006 fLastChar = ch;
4007 return ch;
4008}
4009
4010//------------------------------------------------------------------------------
4011//
4012// peekCharLL Low Level Character Scanning, sneak a peek at the next
4013// character without actually getting it.
4014//
4015//------------------------------------------------------------------------------
4016UChar32 RegexCompile::peekCharLL() {
4017 if (fPeekChar == -1) {
4018 fPeekChar = nextCharLL();
4019 }
4020 return fPeekChar;
4021}
4022
4023
4024//------------------------------------------------------------------------------
4025//
4026// nextChar for pattern scanning. At this level, we handle stripping
4027// out comments and processing some backslash character escapes.
4028// The rest of the pattern grammar is handled at the next level up.
4029//
4030//------------------------------------------------------------------------------
4031void RegexCompile::nextChar(RegexPatternChar &c) {
4032 tailRecursion:
4033 fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern)((fRXPat->fPattern)->chunkOffset <= (fRXPat->fPattern
)->nativeIndexingLimit? (fRXPat->fPattern)->chunkNativeStart
+(fRXPat->fPattern)->chunkOffset : (fRXPat->fPattern
)->pFuncs->mapOffsetToNative(fRXPat->fPattern))
;
4034 c.fChar = nextCharLL();
4035 c.fQuoted = FALSE0;
4036
4037 if (fQuoteMode) {
4038 c.fQuoted = TRUE1;
4039 if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) ||
4040 c.fChar == (UChar32)-1) {
4041 fQuoteMode = FALSE0; // Exit quote mode,
4042 nextCharLL(); // discard the E
4043 // nextChar(c); // recurse to get the real next char
4044 goto tailRecursion; // Note: fuzz testing produced testcases that
4045 // resulted in stack overflow here.
4046 }
4047 }
4048 else if (fInBackslashQuote) {
4049 // The current character immediately follows a '\'
4050 // Don't check for any further escapes, just return it as-is.
4051 // Don't set c.fQuoted, because that would prevent the state machine from
4052 // dispatching on the character.
4053 fInBackslashQuote = FALSE0;
4054 }
4055 else
4056 {
4057 // We are not in a \Q quoted region \E of the source.
4058 //
4059 if (fModeFlags & UREGEX_COMMENTS) {
4060 //
4061 // We are in free-spacing and comments mode.
4062 // Scan through any white space and comments, until we
4063 // reach a significant character or the end of input.
4064 for (;;) {
4065 if (c.fChar == (UChar32)-1) {
4066 break; // End of Input
4067 }
4068 if (c.fChar == chPound && fEOLComments == TRUE1) {
4069 // Start of a comment. Consume the rest of it, until EOF or a new line
4070 for (;;) {
4071 c.fChar = nextCharLL();
4072 if (c.fChar == (UChar32)-1 || // EOF
4073 c.fChar == chCR ||
4074 c.fChar == chLF ||
4075 c.fChar == chNEL ||
4076 c.fChar == chLS) {
4077 break;
4078 }
4079 }
4080 }
4081 // TODO: check what Java & Perl do with non-ASCII white spaces. Ticket 6061.
4082 if (PatternProps::isWhiteSpace(c.fChar) == FALSE0) {
4083 break;
4084 }
4085 c.fChar = nextCharLL();
4086 }
4087 }
4088
4089 //
4090 // check for backslash escaped characters.
4091 //
4092 if (c.fChar == chBackSlash) {
4093 int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern)((fRXPat->fPattern)->chunkOffset <= (fRXPat->fPattern
)->nativeIndexingLimit? (fRXPat->fPattern)->chunkNativeStart
+(fRXPat->fPattern)->chunkOffset : (fRXPat->fPattern
)->pFuncs->mapOffsetToNative(fRXPat->fPattern))
;
4094 if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
4095 //
4096 // A '\' sequence that is handled by ICU's standard unescapeAt function.
4097 // Includes \uxxxx, \n, \r, many others.
4098 // Return the single equivalent character.
4099 //
4100 nextCharLL(); // get & discard the peeked char.
4101 c.fQuoted = TRUE1;
4102
4103 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)((0==((fRXPat->fPattern)->chunkNativeStart))&&(
(fPatternLength)==((fRXPat->fPattern)->chunkNativeLimit
))&&((fPatternLength)==((fRXPat->fPattern)->nativeIndexingLimit
)))
) {
4104 int32_t endIndex = (int32_t)pos;
4105 c.fChar = u_unescapeAtu_unescapeAt_71(uregex_ucstr_unescape_charAturegex_ucstr_unescape_charAt_71, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents);
4106
4107 if (endIndex == pos) {
4108 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4109 }
4110 fCharNum += endIndex - pos;
4111 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex)do { int64_t __offset = (endIndex) - (fRXPat->fPattern)->
chunkNativeStart; if (__offset>=0 && __offset<(
int64_t)(fRXPat->fPattern)->nativeIndexingLimit &&
(fRXPat->fPattern)->chunkContents[__offset]<0xdc00)
{ (fRXPat->fPattern)->chunkOffset=(int32_t)__offset; }
else { utext_setNativeIndex_71((fRXPat->fPattern), (endIndex
)); } } while (false)
;
4112 } else {
4113 int32_t offset = 0;
4114 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern){ (fRXPat->fPattern), -1 };
4115
4116 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos)do { int64_t __offset = (pos) - (fRXPat->fPattern)->chunkNativeStart
; if (__offset>=0 && __offset<(int64_t)(fRXPat->
fPattern)->nativeIndexingLimit && (fRXPat->fPattern
)->chunkContents[__offset]<0xdc00) { (fRXPat->fPattern
)->chunkOffset=(int32_t)__offset; } else { utext_setNativeIndex_71
((fRXPat->fPattern), (pos)); } } while (false)
;
4117 c.fChar = u_unescapeAtu_unescapeAt_71(uregex_utext_unescape_charAturegex_utext_unescape_charAt_71, &offset, INT32_MAX(2147483647), &context);
4118
4119 if (offset == 0) {
4120 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4121 } else if (context.lastOffset == offset) {
4122 UTEXT_PREVIOUS32(fRXPat->fPattern)((fRXPat->fPattern)->chunkOffset > 0 && (fRXPat
->fPattern)->chunkContents[(fRXPat->fPattern)->chunkOffset
-1] < 0xd800 ? (fRXPat->fPattern)->chunkContents[--(
(fRXPat->fPattern)->chunkOffset)] : utext_previous32_71
(fRXPat->fPattern))
;
4123 } else if (context.lastOffset != offset-1) {
4124 utext_moveIndex32utext_moveIndex32_71(fRXPat->fPattern, offset - context.lastOffset - 1);
4125 }
4126 fCharNum += offset;
4127 }
4128 }
4129 else if (peekCharLL() == chDigit0) {
4130 // Octal Escape, using Java Regexp Conventions
4131 // which are \0 followed by 1-3 octal digits.
4132 // Different from ICU Unescape handling of Octal, which does not
4133 // require the leading 0.
4134 // Java also has the convention of only consuming 2 octal digits if
4135 // the three digit number would be > 0xff
4136 //
4137 c.fChar = 0;
4138 nextCharLL(); // Consume the initial 0.
4139 int index;
4140 for (index=0; index<3; index++) {
4141 int32_t ch = peekCharLL();
4142 if (ch<chDigit0 || ch>chDigit7) {
4143 if (index==0) {
4144 // \0 is not followed by any octal digits.
4145 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4146 }
4147 break;
4148 }
4149 c.fChar <<= 3;
4150 c.fChar += ch&7;
4151 if (c.fChar <= 255) {
4152 nextCharLL();
4153 } else {
4154 // The last digit made the number too big. Forget we saw it.
4155 c.fChar >>= 3;
4156 }
4157 }
4158 c.fQuoted = TRUE1;
4159 }
4160 else if (peekCharLL() == chQ) {
4161 // "\Q" enter quote mode, which will continue until "\E"
4162 fQuoteMode = TRUE1;
4163 nextCharLL(); // discard the 'Q'.
4164 // nextChar(c); // recurse to get the real next char.
4165 goto tailRecursion; // Note: fuzz testing produced test cases that
4166 // resulted in stack overflow here.
4167 }
4168 else
4169 {
4170 // We are in a '\' escape that will be handled by the state table scanner.
4171 // Just return the backslash, but remember that the following char is to
4172 // be taken literally.
4173 fInBackslashQuote = TRUE1;
4174 }
4175 }
4176 }
4177
4178 // re-enable # to end-of-line comments, in case they were disabled.
4179 // They are disabled by the parser upon seeing '(?', but this lasts for
4180 // the fetching of the next character only.
4181 fEOLComments = TRUE1;
4182
4183 // putc(c.fChar, stdout);
4184}
4185
4186
4187
4188//------------------------------------------------------------------------------
4189//
4190// scanNamedChar
4191// Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
4192//
4193// The scan position will be at the 'N'. On return
4194// the scan position should be just after the '}'
4195//
4196// Return the UChar32
4197//
4198//------------------------------------------------------------------------------
4199UChar32 RegexCompile::scanNamedChar() {
4200 if (U_FAILURE(*fStatus)) {
4201 return 0;
4202 }
4203
4204 nextChar(fC);
4205 if (fC.fChar != chLBrace) {
4206 error(U_REGEX_PROPERTY_SYNTAX);
4207 return 0;
4208 }
4209
4210 UnicodeString charName;
4211 for (;;) {
4212 nextChar(fC);
4213 if (fC.fChar == chRBrace) {
4214 break;
4215 }
4216 if (fC.fChar == -1) {
4217 error(U_REGEX_PROPERTY_SYNTAX);
4218 return 0;
4219 }
4220 charName.append(fC.fChar);
4221 }
4222
4223 char name[100];
4224 if (!uprv_isInvariantUStringuprv_isInvariantUString_71(charName.getBuffer(), charName.length()) ||
4225 (uint32_t)charName.length()>=sizeof(name)) {
4226 // All Unicode character names have only invariant characters.
4227 // The API to get a character, given a name, accepts only char *, forcing us to convert,
4228 // which requires this error check
4229 error(U_REGEX_PROPERTY_SYNTAX);
4230 return 0;
4231 }
4232 charName.extract(0, charName.length(), name, sizeof(name), US_INVicu::UnicodeString::kInvariant);
4233
4234 UChar32 theChar = u_charFromNameu_charFromName_71(U_UNICODE_CHAR_NAME, name, fStatus);
4235 if (U_FAILURE(*fStatus)) {
4236 error(U_REGEX_PROPERTY_SYNTAX);
4237 }
4238
4239 nextChar(fC); // Continue overall regex pattern processing with char after the '}'
4240 return theChar;
4241}
4242
4243//------------------------------------------------------------------------------
4244//
4245// scanProp Construct a UnicodeSet from the text at the current scan
4246// position, which will be of the form \p{whaterver}
4247//
4248// The scan position will be at the 'p' or 'P'. On return
4249// the scan position should be just after the '}'
4250//
4251// Return a UnicodeSet, constructed from the \P pattern,
4252// or NULL if the pattern is invalid.
4253//
4254//------------------------------------------------------------------------------
4255UnicodeSet *RegexCompile::scanProp() {
4256 UnicodeSet *uset = NULL__null;
4257
4258 if (U_FAILURE(*fStatus)) {
4259 return NULL__null;
4260 }
4261 (void)chLowerP; // Suppress compiler unused variable warning.
4262 U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP)(void)0;
4263 UBool negated = (fC.fChar == chP);
4264
4265 UnicodeString propertyName;
4266 nextChar(fC);
4267 if (fC.fChar != chLBrace) {
4268 error(U_REGEX_PROPERTY_SYNTAX);
4269 return NULL__null;
4270 }
4271 for (;;) {
4272 nextChar(fC);
4273 if (fC.fChar == chRBrace) {
4274 break;
4275 }
4276 if (fC.fChar == -1) {
4277 // Hit the end of the input string without finding the closing '}'
4278 error(U_REGEX_PROPERTY_SYNTAX);
4279 return NULL__null;
4280 }
4281 propertyName.append(fC.fChar);
4282 }
4283 uset = createSetForProperty(propertyName, negated);
4284 nextChar(fC); // Move input scan to position following the closing '}'
4285 return uset;
4286}
4287
4288//------------------------------------------------------------------------------
4289//
4290// scanPosixProp Construct a UnicodeSet from the text at the current scan
4291// position, which is expected be of the form [:property expression:]
4292//
4293// The scan position will be at the opening ':'. On return
4294// the scan position must be on the closing ']'
4295//
4296// Return a UnicodeSet constructed from the pattern,
4297// or NULL if this is not a valid POSIX-style set expression.
4298// If not a property expression, restore the initial scan position
4299// (to the opening ':')
4300//
4301// Note: the opening '[:' is not sufficient to guarantee that
4302// this is a [:property:] expression.
4303// [:'+=,] is a perfectly good ordinary set expression that
4304// happens to include ':' as one of its characters.
4305//
4306//------------------------------------------------------------------------------
4307UnicodeSet *RegexCompile::scanPosixProp() {
4308 UnicodeSet *uset = NULL__null;
4309
4310 if (U_FAILURE(*fStatus)) {
4311 return NULL__null;
4312 }
4313
4314 U_ASSERT(fC.fChar == chColon)(void)0;
4315
4316 // Save the scanner state.
4317 // TODO: move this into the scanner, with the state encapsulated in some way. Ticket 6062
4318 int64_t savedScanIndex = fScanIndex;
4319 int64_t savedNextIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern)((fRXPat->fPattern)->chunkOffset <= (fRXPat->fPattern
)->nativeIndexingLimit? (fRXPat->fPattern)->chunkNativeStart
+(fRXPat->fPattern)->chunkOffset : (fRXPat->fPattern
)->pFuncs->mapOffsetToNative(fRXPat->fPattern))
;
4320 UBool savedQuoteMode = fQuoteMode;
4321 UBool savedInBackslashQuote = fInBackslashQuote;
4322 UBool savedEOLComments = fEOLComments;
4323 int64_t savedLineNum = fLineNum;
4324 int64_t savedCharNum = fCharNum;
4325 UChar32 savedLastChar = fLastChar;
4326 UChar32 savedPeekChar = fPeekChar;
4327 RegexPatternChar savedfC = fC;
4328
4329 // Scan for a closing ]. A little tricky because there are some perverse
4330 // edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expression,
4331 // ending on the second closing ].
4332
4333 UnicodeString propName;
4334 UBool negated = FALSE0;
4335
4336 // Check for and consume the '^' in a negated POSIX property, e.g. [:^Letter:]
4337 nextChar(fC);
4338 if (fC.fChar == chUp) {
4339 negated = TRUE1;
4340 nextChar(fC);
4341 }
4342
4343 // Scan for the closing ":]", collecting the property name along the way.
4344 UBool sawPropSetTerminator = FALSE0;
4345 for (;;) {
4346 propName.append(fC.fChar);
4347 nextChar(fC);
4348 if (fC.fQuoted || fC.fChar == -1) {
4349 // Escaped characters or end of input - either says this isn't a [:Property:]
4350 break;
4351 }
4352 if (fC.fChar == chColon) {
4353 nextChar(fC);
4354 if (fC.fChar == chRBracket) {
4355 sawPropSetTerminator = TRUE1;
4356 }
4357 break;
4358 }
4359 }
4360
4361 if (sawPropSetTerminator) {
4362 uset = createSetForProperty(propName, negated);
4363 }
4364 else
4365 {
4366 // No closing ":]".
4367 // Restore the original scan position.
4368 // The main scanner will retry the input as a normal set expression,
4369 // not a [:Property:] expression.
4370 fScanIndex = savedScanIndex;
4371 fQuoteMode = savedQuoteMode;
4372 fInBackslashQuote = savedInBackslashQuote;
4373 fEOLComments = savedEOLComments;
4374 fLineNum = savedLineNum;
4375 fCharNum = savedCharNum;
4376 fLastChar = savedLastChar;
4377 fPeekChar = savedPeekChar;
4378 fC = savedfC;
4379 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex)do { int64_t __offset = (savedNextIndex) - (fRXPat->fPattern
)->chunkNativeStart; if (__offset>=0 && __offset
<(int64_t)(fRXPat->fPattern)->nativeIndexingLimit &&
(fRXPat->fPattern)->chunkContents[__offset]<0xdc00)
{ (fRXPat->fPattern)->chunkOffset=(int32_t)__offset; }
else { utext_setNativeIndex_71((fRXPat->fPattern), (savedNextIndex
)); } } while (false)
;
4380 }
4381 return uset;
4382}
4383
4384static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
4385 set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
4386 addCategory(set, U_GC_CF_MASK((uint32_t)1<<(U_FORMAT_CHAR)), ec);
4387}
4388
4389//
4390// Create a Unicode Set from a Unicode Property expression.
4391// This is common code underlying both \p{...} and [:...:] expressions.
4392// Includes trying the Java "properties" that aren't supported as
4393// normal ICU UnicodeSet properties
4394//
4395UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
4396
4397 if (U_FAILURE(*fStatus)) {
4398 return nullptr;
4399 }
4400 LocalPointer<UnicodeSet> set;
4401 UErrorCode status = U_ZERO_ERROR;
4402
4403 do { // non-loop, exists to allow breaks from the block.
4404 //
4405 // First try the property as we received it
4406 //
4407 UnicodeString setExpr;
4408 uint32_t usetFlags = 0;
4409 setExpr.append(u"[\\p{", -1);
4410 setExpr.append(propName);
4411 setExpr.append(u"}]", -1);
4412 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
4413 usetFlags |= USET_CASE_INSENSITIVE;
4414 }
4415 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(setExpr, usetFlags, NULL__null, status), status);
4416 if (U_SUCCESS(status) || status == U_MEMORY_ALLOCATION_ERROR) {
4417 break;
4418 }
4419
4420 //
4421 // The incoming property wasn't directly recognized by ICU.
4422
4423 // Check [:word:] and [:all:]. These are not recognized as a properties by ICU UnicodeSet.
4424 // Java accepts 'word' with mixed case.
4425 // Java accepts 'all' only in all lower case.
4426
4427 status = U_ZERO_ERROR;
4428 if (propName.caseCompare(u"word", -1, 0) == 0) {
4429 set.adoptInsteadAndCheckErrorCode(
4430 RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].cloneAsThawed(), status);
4431 break;
4432 }
4433 if (propName.compare(u"all", -1) == 0) {
4434 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(0, 0x10ffff), status);
4435 break;
4436 }
4437
4438
4439 // Do Java InBlock expressions
4440 //
4441 UnicodeString mPropName = propName;
4442 if (mPropName.startsWith(u"In", 2) && mPropName.length() >= 3) {
4443 status = U_ZERO_ERROR;
4444 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
4445 if (U_FAILURE(status)) {
4446 break;
4447 }
4448 UnicodeString blockName(mPropName, 2); // Property with the leading "In" removed.
4449 set->applyPropertyAlias(UnicodeString(u"Block"), blockName, status);
4450 break;
4451 }
4452
4453 // Check for the Java form "IsBooleanPropertyValue", which we will recast
4454 // as "BooleanPropertyValue". The property value can be either a
4455 // a General Category or a Script Name.
4456
4457 if (propName.startsWith(u"Is", 2) && propName.length()>=3) {
4458 mPropName.remove(0, 2); // Strip the "Is"
4459 if (mPropName.indexOf(u'=') >= 0) {
4460 // Reject any "Is..." property expression containing an '=', that is,
4461 // any non-binary property expression.
4462 status = U_REGEX_PROPERTY_SYNTAX;
4463 break;
4464 }
4465
4466 if (mPropName.caseCompare(u"assigned", -1, 0) == 0) {
4467 mPropName.setTo(u"unassigned", -1);
4468 negated = !negated;
4469 } else if (mPropName.caseCompare(u"TitleCase", -1, 0) == 0) {
4470 mPropName.setTo(u"Titlecase_Letter", -1);
4471 }
4472
4473 mPropName.insert(0, u"[\\p{", -1);
4474 mPropName.append(u"}]", -1);
4475 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(mPropName, *fStatus), status);
4476
4477 if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4478 set->closeOver(USET_CASE_INSENSITIVE);
4479 }
4480 break;
4481
4482 }
4483
4484 if (propName.startsWith(u"java", -1)) {
4485 status = U_ZERO_ERROR;
4486 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
4487 if (U_FAILURE(status)) {
4488 break;
4489 }
4490 //
4491 // Try the various Java specific properties.
4492 // These all begin with "java"
4493 //
4494 if (propName.compare(u"javaDefined", -1) == 0) {
4495 addCategory(set.getAlias(), U_GC_CN_MASK((uint32_t)1<<(U_GENERAL_OTHER_TYPES)), status);
4496 set->complement();
4497 }
4498 else if (propName.compare(u"javaDigit", -1) == 0) {
4499 addCategory(set.getAlias(), U_GC_ND_MASK((uint32_t)1<<(U_DECIMAL_DIGIT_NUMBER)), status);
4500 }
4501 else if (propName.compare(u"javaIdentifierIgnorable", -1) == 0) {
4502 addIdentifierIgnorable(set.getAlias(), status);
4503 }
4504 else if (propName.compare(u"javaISOControl", -1) == 0) {
4505 set->add(0, 0x1F).add(0x7F, 0x9F);
4506 }
4507 else if (propName.compare(u"javaJavaIdentifierPart", -1) == 0) {
4508 addCategory(set.getAlias(), U_GC_L_MASK(((uint32_t)1<<(U_UPPERCASE_LETTER))|((uint32_t)1<<
(U_LOWERCASE_LETTER))|((uint32_t)1<<(U_TITLECASE_LETTER
))|((uint32_t)1<<(U_MODIFIER_LETTER))|((uint32_t)1<<
(U_OTHER_LETTER)))
, status);
4509 addCategory(set.getAlias(), U_GC_SC_MASK((uint32_t)1<<(U_CURRENCY_SYMBOL)), status);
4510 addCategory(set.getAlias(), U_GC_PC_MASK((uint32_t)1<<(U_CONNECTOR_PUNCTUATION)), status);
4511 addCategory(set.getAlias(), U_GC_ND_MASK((uint32_t)1<<(U_DECIMAL_DIGIT_NUMBER)), status);
4512 addCategory(set.getAlias(), U_GC_NL_MASK((uint32_t)1<<(U_LETTER_NUMBER)), status);
4513 addCategory(set.getAlias(), U_GC_MC_MASK((uint32_t)1<<(U_COMBINING_SPACING_MARK)), status);
4514 addCategory(set.getAlias(), U_GC_MN_MASK((uint32_t)1<<(U_NON_SPACING_MARK)), status);
4515 addIdentifierIgnorable(set.getAlias(), status);
4516 }
4517 else if (propName.compare(u"javaJavaIdentifierStart", -1) == 0) {
4518 addCategory(set.getAlias(), U_GC_L_MASK(((uint32_t)1<<(U_UPPERCASE_LETTER))|((uint32_t)1<<
(U_LOWERCASE_LETTER))|((uint32_t)1<<(U_TITLECASE_LETTER
))|((uint32_t)1<<(U_MODIFIER_LETTER))|((uint32_t)1<<
(U_OTHER_LETTER)))
, status);
4519 addCategory(set.getAlias(), U_GC_NL_MASK((uint32_t)1<<(U_LETTER_NUMBER)), status);
4520 addCategory(set.getAlias(), U_GC_SC_MASK((uint32_t)1<<(U_CURRENCY_SYMBOL)), status);
4521 addCategory(set.getAlias(), U_GC_PC_MASK((uint32_t)1<<(U_CONNECTOR_PUNCTUATION)), status);
4522 }
4523 else if (propName.compare(u"javaLetter", -1) == 0) {
4524 addCategory(set.getAlias(), U_GC_L_MASK(((uint32_t)1<<(U_UPPERCASE_LETTER))|((uint32_t)1<<
(U_LOWERCASE_LETTER))|((uint32_t)1<<(U_TITLECASE_LETTER
))|((uint32_t)1<<(U_MODIFIER_LETTER))|((uint32_t)1<<
(U_OTHER_LETTER)))
, status);
4525 }
4526 else if (propName.compare(u"javaLetterOrDigit", -1) == 0) {
4527 addCategory(set.getAlias(), U_GC_L_MASK(((uint32_t)1<<(U_UPPERCASE_LETTER))|((uint32_t)1<<
(U_LOWERCASE_LETTER))|((uint32_t)1<<(U_TITLECASE_LETTER
))|((uint32_t)1<<(U_MODIFIER_LETTER))|((uint32_t)1<<
(U_OTHER_LETTER)))
, status);
4528 addCategory(set.getAlias(), U_GC_ND_MASK((uint32_t)1<<(U_DECIMAL_DIGIT_NUMBER)), status);
4529 }
4530 else if (propName.compare(u"javaLowerCase", -1) == 0) {
4531 addCategory(set.getAlias(), U_GC_LL_MASK((uint32_t)1<<(U_LOWERCASE_LETTER)), status);
4532 }
4533 else if (propName.compare(u"javaMirrored", -1) == 0) {
4534 set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, status);
4535 }
4536 else if (propName.compare(u"javaSpaceChar", -1) == 0) {
4537 addCategory(set.getAlias(), U_GC_Z_MASK(((uint32_t)1<<(U_SPACE_SEPARATOR))|((uint32_t)1<<
(U_LINE_SEPARATOR))|((uint32_t)1<<(U_PARAGRAPH_SEPARATOR
)))
, status);
4538 }
4539 else if (propName.compare(u"javaSupplementaryCodePoint", -1) == 0) {
4540 set->add(0x10000, UnicodeSet::MAX_VALUE);
4541 }
4542 else if (propName.compare(u"javaTitleCase", -1) == 0) {
4543 addCategory(set.getAlias(), U_GC_LT_MASK((uint32_t)1<<(U_TITLECASE_LETTER)), status);
4544 }
4545 else if (propName.compare(u"javaUnicodeIdentifierStart", -1) == 0) {
4546 addCategory(set.getAlias(), U_GC_L_MASK(((uint32_t)1<<(U_UPPERCASE_LETTER))|((uint32_t)1<<
(U_LOWERCASE_LETTER))|((uint32_t)1<<(U_TITLECASE_LETTER
))|((uint32_t)1<<(U_MODIFIER_LETTER))|((uint32_t)1<<
(U_OTHER_LETTER)))
, status);
4547 addCategory(set.getAlias(), U_GC_NL_MASK((uint32_t)1<<(U_LETTER_NUMBER)), status);
4548 }
4549 else if (propName.compare(u"javaUnicodeIdentifierPart", -1) == 0) {
4550 addCategory(set.getAlias(), U_GC_L_MASK(((uint32_t)1<<(U_UPPERCASE_LETTER))|((uint32_t)1<<
(U_LOWERCASE_LETTER))|((uint32_t)1<<(U_TITLECASE_LETTER
))|((uint32_t)1<<(U_MODIFIER_LETTER))|((uint32_t)1<<
(U_OTHER_LETTER)))
, status);
4551 addCategory(set.getAlias(), U_GC_PC_MASK((uint32_t)1<<(U_CONNECTOR_PUNCTUATION)), status);
4552 addCategory(set.getAlias(), U_GC_ND_MASK((uint32_t)1<<(U_DECIMAL_DIGIT_NUMBER)), status);
4553 addCategory(set.getAlias(), U_GC_NL_MASK((uint32_t)1<<(U_LETTER_NUMBER)), status);
4554 addCategory(set.getAlias(), U_GC_MC_MASK((uint32_t)1<<(U_COMBINING_SPACING_MARK)), status);
4555 addCategory(set.getAlias(), U_GC_MN_MASK((uint32_t)1<<(U_NON_SPACING_MARK)), status);
4556 addIdentifierIgnorable(set.getAlias(), status);
4557 }
4558 else if (propName.compare(u"javaUpperCase", -1) == 0) {
4559 addCategory(set.getAlias(), U_GC_LU_MASK((uint32_t)1<<(U_UPPERCASE_LETTER)), status);
4560 }
4561 else if (propName.compare(u"javaValidCodePoint", -1) == 0) {
4562 set->add(0, UnicodeSet::MAX_VALUE);
4563 }
4564 else if (propName.compare(u"javaWhitespace", -1) == 0) {
4565 addCategory(set.getAlias(), U_GC_Z_MASK(((uint32_t)1<<(U_SPACE_SEPARATOR))|((uint32_t)1<<
(U_LINE_SEPARATOR))|((uint32_t)1<<(U_PARAGRAPH_SEPARATOR
)))
, status);
4566 set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
4567 set->add(9, 0x0d).add(0x1c, 0x1f);
4568 } else {
4569 status = U_REGEX_PROPERTY_SYNTAX;
4570 }
4571
4572 if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4573 set->closeOver(USET_CASE_INSENSITIVE);
4574 }
4575 break;
4576 }
4577
4578 // Unrecognized property. ICU didn't like it as it was, and none of the Java compatibility
4579 // extensions matched it.
4580 status = U_REGEX_PROPERTY_SYNTAX;
4581 } while (false); // End of do loop block. Code above breaks out of the block on success or hard failure.
4582
4583 if (U_SUCCESS(status)) {
4584 // ICU 70 adds emoji properties of strings, but as long as Java does not say how to
4585 // deal with properties of strings and character classes with strings, we ignore them.
4586 // Just in case something downstream might stumble over the strings,
4587 // we remove them from the set.
4588 // Note that when we support strings, the complement of a property (as with \P)
4589 // should be implemented as .complement().removeAllStrings() (code point complement).
4590 set->removeAllStrings();
4591 U_ASSERT(set.isValid())(void)0;
4592 if (negated) {
4593 set->complement();
4594 }
4595 return set.orphan();
4596 } else {
4597 if (status == U_ILLEGAL_ARGUMENT_ERROR) {
4598 status = U_REGEX_PROPERTY_SYNTAX;
4599 }
4600 error(status);
4601 return nullptr;
4602 }
4603}
4604
4605
4606//
4607// SetEval Part of the evaluation of [set expressions].
4608// Perform any pending (stacked) operations with precedence
4609// equal or greater to that of the next operator encountered
4610// in the expression.
4611//
4612void RegexCompile::setEval(int32_t nextOp) {
4613 UnicodeSet *rightOperand = NULL__null;
4614 UnicodeSet *leftOperand = NULL__null;
4615 for (;;) {
4616 U_ASSERT(fSetOpStack.empty()==FALSE)(void)0;
4617 int32_t pendingSetOperation = fSetOpStack.peeki();
4618 if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) {
4619 break;
4620 }
4621 fSetOpStack.popi();
4622 U_ASSERT(fSetStack.empty() == FALSE)(void)0;
4623 rightOperand = (UnicodeSet *)fSetStack.peek();
4624 // ICU 70 adds emoji properties of strings, but createSetForProperty() removes all strings
4625 // (see comments there).
4626 // We also do not yet support string literals in character classes,
4627 // so there should not be any strings.
4628 // Note that when we support strings, the complement of a set (as with ^ or \P)
4629 // should be implemented as .complement().removeAllStrings() (code point complement).
4630 U_ASSERT(!rightOperand->hasStrings())(void)0;
4631 switch (pendingSetOperation) {
4632 case setNegation:
4633 rightOperand->complement();
4634 break;
4635 case setCaseClose:
4636 // TODO: need a simple close function. Ticket 6065
4637 rightOperand->closeOver(USET_CASE_INSENSITIVE);
4638 rightOperand->removeAllStrings();
4639 break;
4640 case setDifference1:
4641 case setDifference2:
4642 fSetStack.pop();
4643 leftOperand = (UnicodeSet *)fSetStack.peek();
4644 leftOperand->removeAll(*rightOperand);
4645 delete rightOperand;
4646 break;
4647 case setIntersection1:
4648 case setIntersection2:
4649 fSetStack.pop();
4650 leftOperand = (UnicodeSet *)fSetStack.peek();
4651 leftOperand->retainAll(*rightOperand);
4652 delete rightOperand;
4653 break;
4654 case setUnion:
4655 fSetStack.pop();
4656 leftOperand = (UnicodeSet *)fSetStack.peek();
4657 leftOperand->addAll(*rightOperand);
4658 delete rightOperand;
4659 break;
4660 default:
4661 UPRV_UNREACHABLE_EXITabort();
4662 }
4663 }
4664 }
4665
4666void RegexCompile::setPushOp(int32_t op) {
4667 setEval(op);
4668 fSetOpStack.push(op, *fStatus);
4669 LocalPointer<UnicodeSet> lpSet(new UnicodeSet(), *fStatus);
4670 fSetStack.push(lpSet.orphan(), *fStatus);
4671}
4672
4673U_NAMESPACE_END}
4674#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
4675