Bug Summary

File: out/../deps/icu-small/source/tools/toolutil/xmlparser.cpp
Warning: line 724, column 13
Called C++ object pointer is null

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-unknown-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name xmlparser.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -pic-is-pie -mframe-pointer=all -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/home/maurizio/node-v18.6.0/out -resource-dir /usr/local/lib/clang/16.0.0 -D V8_DEPRECATION_WARNINGS -D V8_IMMINENT_DEPRECATION_WARNINGS -D _GLIBCXX_USE_CXX11_ABI=1 -D NODE_OPENSSL_CONF_NAME=nodejs_conf -D NODE_OPENSSL_HAS_QUIC -D __STDC_FORMAT_MACROS -D OPENSSL_NO_PINSHARED -D OPENSSL_THREADS -D U_COMMON_IMPLEMENTATION=1 -D U_I18N_IMPLEMENTATION=1 -D U_IO_IMPLEMENTATION=1 -D U_TOOLUTIL_IMPLEMENTATION=1 -D U_ATTRIBUTE_DEPRECATED= -D _CRT_SECURE_NO_DEPRECATE= -D U_STATIC_IMPLEMENTATION=1 -D UCONFIG_NO_SERVICE=1 -D U_ENABLE_DYLOAD=0 -D U_HAVE_STD_STRING=1 -D UCONFIG_NO_BREAK_ITERATION=0 -I ../deps/icu-small/source/common -I ../deps/icu-small/source/i18n -I ../deps/icu-small/source/tools/toolutil -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../include/c++/8 -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../include/c++/8/x86_64-redhat-linux -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../include/c++/8/backward -internal-isystem 
/usr/local/lib/clang/16.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-redhat-linux/8/../../../../x86_64-redhat-linux/include -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -Wno-unused-parameter -Wno-deprecated-declarations -Wno-strict-aliasing -std=gnu++17 -fdeprecated-macro -fdebug-compilation-dir=/home/maurizio/node-v18.6.0/out -ferror-limit 19 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2022-08-22-142216-507842-1 -x c++ ../deps/icu-small/source/tools/toolutil/xmlparser.cpp
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6* Copyright (C) 2004-2010, International Business Machines
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: xmlparser.cpp
11* encoding: UTF-8
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2004jul21
16* created by: Andy Heninger
17*/
18
19#include <stdio.h>
20#include "unicode/uchar.h"
21#include "unicode/ucnv.h"
22#include "unicode/regex.h"
23#include "filestrm.h"
24#include "xmlparser.h"
25
26#if !UCONFIG_NO_REGULAR_EXPRESSIONS0 && !UCONFIG_NO_CONVERSION0
27
28// character constants
29enum {
30 x_QUOT=0x22,
31 x_AMP=0x26,
32 x_APOS=0x27,
33 x_LT=0x3c,
34 x_GT=0x3e,
35 x_l=0x6c
36};
37
// Regular-expression fragments (ICU regex syntax) for the XML lexical
// productions used below.  They are string literals so that they can be
// spliced into larger patterns by adjacent-literal concatenation.

// XML #3: white space
#define XML_SPACES "[ \\u0009\\u000d\\u000a]"

// XML #4: characters that may start a name
#define XML_NAMESTARTCHAR \
    "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \
    "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \
    "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \
    "[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]"

// XML #5: characters that may continue a name
#define XML_NAMECHAR \
    "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]"

// XML #6: a complete name
#define XML_NAME XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*"
51
52U_NAMESPACE_BEGINnamespace icu_71 {
53
54UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser)UClassID UXMLParser::getStaticClassID() { static char classID
= 0; return (UClassID)&classID; } UClassID UXMLParser::getDynamicClassID
() const { return UXMLParser::getStaticClassID(); }
55UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement)UClassID UXMLElement::getStaticClassID() { static char classID
= 0; return (UClassID)&classID; } UClassID UXMLElement::
getDynamicClassID() const { return UXMLElement::getStaticClassID
(); }
56
57//
58// UXMLParser constructor. Mostly just initializes the ICU regexes that are
59// used for parsing.
60//
61UXMLParser::UXMLParser(UErrorCode &status) :
62 // XML Declaration. XML Production #23.
63 // example: "<?xml version=1.0 encoding="utf-16" ?>
64 // This is a sloppy implementation - just look for the leading <?xml and the closing ?>
65 // allow for a possible leading BOM.
66 mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INVicu::UnicodeString::kInvariant), 0, status),
67
68 // XML Comment production #15
69 // example: "<!-- whatever -->
70 // note, does not detect an illegal "--" within comments
71 mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INVicu::UnicodeString::kInvariant), 0, status),
72
73 // XML Spaces
74 // production [3]
75 mXMLSP(UnicodeString(XML_SPACES"[ \\u0009\\u000d\\u000a]" "+", -1, US_INVicu::UnicodeString::kInvariant), 0, status),
76
77 // XML Doctype decl production #28
78 // example "<!DOCTYPE foo SYSTEM "somewhere" >
79 // or "<!DOCTYPE foo [internal dtd]>
80 // TODO: we don't actually parse the DOCTYPE or internal subsets.
81 // Some internal dtd subsets could confuse this simple-minded
82 // attempt at skipping over them, specifically, occurrences
83 // of closing square brackets. These could appear in comments,
84 // or in parameter entity declarations, for example.
85 mXMLDoctype(UnicodeString(
86 "(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INVicu::UnicodeString::kInvariant
87 ), 0, status),
88
89 // XML PI production #16
90 // example "<?target stuff?>
91 mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INVicu::UnicodeString::kInvariant), 0, status),
92
93 // XML Element Start Productions #40, #41
94 // example <foo att1='abc' att2="d e f" >
95 // capture #1: the tag name
96 //
97 mXMLElemStart (UnicodeString("(?s)<(" XML_NAME"[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]"
"[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]"
"[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]" "(?:" "[" "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]"
"[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]"
"[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]"
"[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]" "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]"
")*"
")" // match "<tag_name"
98 "(?:"
99 XML_SPACES"[ \\u0009\\u000d\\u000a]" "+" XML_NAME"[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]"
"[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]"
"[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]" "(?:" "[" "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]"
"[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]"
"[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]"
"[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]" "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]"
")*"
XML_SPACES"[ \\u0009\\u000d\\u000a]" "*=" XML_SPACES"[ \\u0009\\u000d\\u000a]" "*" // match "ATTR_NAME = "
100 "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"'
101 ")*" // * for zero or more attributes.
102 XML_SPACES"[ \\u0009\\u000d\\u000a]" "*?>", -1, US_INVicu::UnicodeString::kInvariant), 0, status), // match " >"
103
104 // XML Element End production #42
105 // example </foo>
106 mXMLElemEnd (UnicodeString("</(" XML_NAME"[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]"
"[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]"
"[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]" "(?:" "[" "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]"
"[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]"
"[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]"
"[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]" "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]"
")*"
")" XML_SPACES"[ \\u0009\\u000d\\u000a]" "*>", -1, US_INVicu::UnicodeString::kInvariant), 0, status),
107
108 // XML Element Empty production #44
109 // example <foo att1="abc" att2="d e f" />
110 mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME"[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]"
"[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]"
"[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]" "(?:" "[" "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]"
"[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]"
"[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]"
"[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]" "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]"
")*"
")" // match "<tag_name"
111 "(?:"
112 XML_SPACES"[ \\u0009\\u000d\\u000a]" "+" XML_NAME"[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]"
"[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]"
"[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]" "(?:" "[" "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]"
"[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]"
"[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]"
"[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]" "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]"
")*"
XML_SPACES"[ \\u0009\\u000d\\u000a]" "*=" XML_SPACES"[ \\u0009\\u000d\\u000a]" "*" // match "ATTR_NAME = "
113 "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"'
114 ")*" // * for zero or more attributes.
115 XML_SPACES"[ \\u0009\\u000d\\u000a]" "*?/>", -1, US_INVicu::UnicodeString::kInvariant), 0, status), // match " />"
116
117
118 // XMLCharData. Everything but '<'. Note that & will be dealt with later.
119 mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INVicu::UnicodeString::kInvariant), 0, status),
120
121 // Attribute name = "value". XML Productions 10, 40/41
122 // Capture group 1 is name,
123 // 2 is the attribute value, including the quotes.
124 //
125 // Note that attributes are scanned twice. The first time is with
126 // the regex for an entire element start. There, the attributes
127 // are checked syntactically, but not separated out one by one.
128 // Here, we match a single attribute, and make its name and
129 // attribute value available to the parser code.
130 mAttrValue(UnicodeString(XML_SPACES"[ \\u0009\\u000d\\u000a]" "+(" XML_NAME"[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]"
"[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]"
"[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]" "(?:" "[" "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]"
"[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]"
"[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]"
"[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]" "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]"
")*"
")" XML_SPACES"[ \\u0009\\u000d\\u000a]" "*=" XML_SPACES"[ \\u0009\\u000d\\u000a]" "*"
131 "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INVicu::UnicodeString::kInvariant), 0, status),
132
133
134 mAttrNormalizer(UnicodeString(XML_SPACES"[ \\u0009\\u000d\\u000a]", -1, US_INVicu::UnicodeString::kInvariant), 0, status),
135
136 // Match any of the new-line sequences in content.
137 // All are changed to \u000a.
138 mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INVicu::UnicodeString::kInvariant), 0, status),
139
140 // & char references
141 // We will figure out what we've got based on which capture group has content.
142 // The last one is a catchall for unrecognized entity references..
143 // 1 2 3 4 5 6 7 8
144 mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"),
145 0, status),
146
147 fNames(status),
148 fElementStack(status),
149 fOneLF((UChar)0x0a) // Plain new-line string, used in new line normalization.
150 {
151 }
152
153UXMLParser *
154UXMLParser::createParser(UErrorCode &errorCode) {
155 if (U_FAILURE(errorCode)) {
156 return NULL__null;
157 } else {
158 return new UXMLParser(errorCode);
159 }
160}
161
162UXMLParser::~UXMLParser() {}
163
164UXMLElement *
165UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) {
166 char bytes[4096], charsetBuffer[100];
167 FileStream *f;
168 const char *charset, *pb;
169 UnicodeString src;
170 UConverter *cnv;
171 UChar *buffer, *pu;
172 int32_t fileLength, bytesLength, length, capacity;
173 UBool flush;
174
175 if(U_FAILURE(errorCode)) {
176 return NULL__null;
177 }
178
179 f=T_FileStream_open(filename, "rb");
180 if(f==NULL__null) {
181 errorCode=U_FILE_ACCESS_ERROR;
182 return NULL__null;
183 }
184
185 bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
186 if(bytesLength<(int32_t)sizeof(bytes)) {
187 // we have already read the entire file
188 fileLength=bytesLength;
189 } else {
190 // get the file length
191 fileLength=T_FileStream_size(f);
192 }
193
194 /*
195 * get the charset:
196 * 1. Unicode signature
197 * 2. treat as ISO-8859-1 and read XML encoding="charser"
198 * 3. default to UTF-8
199 */
200 charset=ucnv_detectUnicodeSignatureucnv_detectUnicodeSignature_71(bytes, bytesLength, NULL__null, &errorCode);
201 if(U_SUCCESS(errorCode) && charset!=NULL__null) {
202 // open converter according to Unicode signature
203 cnv=ucnv_openucnv_open_71(charset, &errorCode);
204 } else {
205 // read as Latin-1 and parse the XML declaration and encoding
206 cnv=ucnv_openucnv_open_71("ISO-8859-1", &errorCode);
207 if(U_FAILURE(errorCode)) {
208 // unexpected error opening Latin-1 converter
209 goto exit;
210 }
211
212 buffer=toUCharPtr(src.getBuffer(bytesLength));
213 if(buffer==NULL__null) {
214 // unexpected failure to reserve some string capacity
215 errorCode=U_MEMORY_ALLOCATION_ERROR;
216 goto exit;
217 }
218 pb=bytes;
219 pu=buffer;
220 ucnv_toUnicodeucnv_toUnicode_71(
221 cnv,
222 &pu, buffer+src.getCapacity(),
223 &pb, bytes+bytesLength,
224 NULL__null, TRUE1, &errorCode);
225 src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
226 ucnv_closeucnv_close_71(cnv);
227 cnv=NULL__null;
228 if(U_FAILURE(errorCode)) {
229 // unexpected error in conversion from Latin-1
230 src.remove();
231 goto exit;
232 }
233
234 // parse XML declaration
235 if(mXMLDecl.reset(src).lookingAt(0, errorCode)) {
236 int32_t declEnd=mXMLDecl.end(errorCode);
237 // go beyond <?xml
238 int32_t pos=src.indexOf((UChar)x_l)+1;
239
240 mAttrValue.reset(src);
241 while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) { // loop runs once per attribute on this element.
242 UnicodeString attName = mAttrValue.group(1, errorCode);
243 UnicodeString attValue = mAttrValue.group(2, errorCode);
244
245 // Trim the quotes from the att value. These are left over from the original regex
246 // that parsed the attribute, which couldn't conveniently strip them.
247 attValue.remove(0,1); // one char from the beginning
248 attValue.truncate(attValue.length()-1); // and one from the end.
249
250 if(attName==UNICODE_STRING("encoding", 8)icu::UnicodeString(true, u"encoding", 8)) {
251 length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer));
252 charset=charsetBuffer;
253 break;
254 }
255 pos = mAttrValue.end(2, errorCode);
256 }
257
258 if(charset==NULL__null) {
259 // default to UTF-8
260 charset="UTF-8";
261 }
262 cnv=ucnv_openucnv_open_71(charset, &errorCode);
263 }
264 }
265
266 if(U_FAILURE(errorCode)) {
267 // unable to open the converter
268 goto exit;
269 }
270
271 // convert the file contents
272 capacity=fileLength; // estimated capacity
273 src.getBuffer(capacity);
274 src.releaseBuffer(0); // zero length
275 flush=FALSE0;
276 for(;;) {
277 // convert contents of bytes[bytesLength]
278 pb=bytes;
279 for(;;) {
280 length=src.length();
281 buffer=toUCharPtr(src.getBuffer(capacity));
282 if(buffer==NULL__null) {
283 // unexpected failure to reserve some string capacity
284 errorCode=U_MEMORY_ALLOCATION_ERROR;
285 goto exit;
286 }
287
288 pu=buffer+length;
289 ucnv_toUnicodeucnv_toUnicode_71(
290 cnv, &pu, buffer+src.getCapacity(),
291 &pb, bytes+bytesLength,
292 NULL__null, FALSE0, &errorCode);
293 src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
294 if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
295 errorCode=U_ZERO_ERROR;
296 capacity=(3*src.getCapacity())/2; // increase capacity by 50%
297 } else {
298 break;
299 }
300 }
301
302 if(U_FAILURE(errorCode)) {
303 break; // conversion error
304 }
305
306 if(flush) {
307 break; // completely converted the file
308 }
309
310 // read next block
311 bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
312 if(bytesLength==0) {
313 // reached end of file, convert once more to flush the converter
314 flush=TRUE1;
315 }
316 }
317
318exit:
319 ucnv_closeucnv_close_71(cnv);
320 T_FileStream_close(f);
321
322 if(U_SUCCESS(errorCode)) {
323 return parse(src, errorCode);
324 } else {
325 return NULL__null;
326 }
327}
328
329UXMLElement *
330UXMLParser::parse(const UnicodeString &src, UErrorCode &status) {
331 if(U_FAILURE(status)) {
332 return NULL__null;
333 }
334
335 UXMLElement *root = NULL__null;
336 fPos = 0; // TODO use just a local pos variable and pass it into functions
337 // where necessary?
338
339 // set all matchers to work on the input string
340 mXMLDecl.reset(src);
341 mXMLComment.reset(src);
342 mXMLSP.reset(src);
343 mXMLDoctype.reset(src);
344 mXMLPI.reset(src);
345 mXMLElemStart.reset(src);
346 mXMLElemEnd.reset(src);
347 mXMLElemEmpty.reset(src);
348 mXMLCharData.reset(src);
349 mAttrValue.reset(src);
350 mAttrNormalizer.reset(src);
351 mNewLineNormalizer.reset(src);
352 mAmps.reset(src);
353
354 // Consume the XML Declaration, if present.
355 if (mXMLDecl.lookingAt(fPos, status)) {
356 fPos = mXMLDecl.end(status);
357 }
358
359 // Consume "misc" [XML production 27] appearing before DocType
360 parseMisc(status);
361
362 // Consume a DocType declaration, if present.
363 if (mXMLDoctype.lookingAt(fPos, status)) {
364 fPos = mXMLDoctype.end(status);
365 }
366
367 // Consume additional "misc" [XML production 27] appearing after the DocType
368 parseMisc(status);
369
370 // Get the root element
371 if (mXMLElemEmpty.lookingAt(fPos, status)) {
372 // Root is an empty element (no nested elements or content)
373 root = createElement(mXMLElemEmpty, status);
374 fPos = mXMLElemEmpty.end(status);
375 } else {
376 if (mXMLElemStart.lookingAt(fPos, status) == FALSE0) {
377 error("Root Element expected", status);
378 goto errorExit;
379 }
380 root = createElement(mXMLElemStart, status);
381 UXMLElement *el = root;
382
383 //
384 // This is the loop that consumes the root element of the document,
385 // including all nested content. Nested elements are handled by
386 // explicit pushes/pops of the element stack; there is no recursion
387 // in the control flow of this code.
388 // "el" always refers to the current element, the one to which content
389 // is being added. It is above the top of the element stack.
390 for (;;) {
391 // Nested Element Start
392 if (mXMLElemStart.lookingAt(fPos, status)) {
393 UXMLElement *t = createElement(mXMLElemStart, status);
394 el->fChildren.addElement(t, status);
395 t->fParent = el;
396 fElementStack.push(el, status);
397 el = t;
398 continue;
399 }
400
401 // Text Content. String is concatenated onto the current node's content,
402 // but only if it contains something other than spaces.
403 UnicodeString s = scanContent(status);
404 if (s.length() > 0) {
405 mXMLSP.reset(s);
406 if (mXMLSP.matches(status) == FALSE0) {
407 // This chunk of text contains something other than just
408 // white space. Make a child node for it.
409 replaceCharRefs(s, status);
410 el->fChildren.addElement(s.clone(), status);
411 }
412 mXMLSP.reset(src); // The matchers need to stay set to the main input string.
413 continue;
414 }
415
416 // Comments. Discard.
417 if (mXMLComment.lookingAt(fPos, status)) {
418 fPos = mXMLComment.end(status);
419 continue;
420 }
421
422 // PIs. Discard.
423 if (mXMLPI.lookingAt(fPos, status)) {
424 fPos = mXMLPI.end(status);
425 continue;
426 }
427
428 // Element End
429 if (mXMLElemEnd.lookingAt(fPos, status)) {
430 fPos = mXMLElemEnd.end(0, status);
431 const UnicodeString name = mXMLElemEnd.group(1, status);
432 if (name != *el->fName) {
433 error("Element start / end tag mismatch", status);
434 goto errorExit;
435 }
436 if (fElementStack.empty()) {
437 // Close of the root element. We're done with the doc.
438 el = NULL__null;
439 break;
440 }
441 el = (UXMLElement *)fElementStack.pop();
442 continue;
443 }
444
445 // Empty Element. Stored as a child of the current element, but not stacked.
446 if (mXMLElemEmpty.lookingAt(fPos, status)) {
447 UXMLElement *t = createElement(mXMLElemEmpty, status);
448 el->fChildren.addElement(t, status);
449 continue;
450 }
451
452 // Hit something within the document that doesn't match anything.
453 // It's an error.
454 error("Unrecognized markup", status);
455 break;
456 }
457
458 if (el != NULL__null || !fElementStack.empty()) {
459 // We bailed out early, for some reason.
460 error("Root element not closed.", status);
461 goto errorExit;
462 }
463 }
464
465 // Root Element parse is complete.
466 // Consume the annoying xml "Misc" that can appear at the end of the doc.
467 parseMisc(status);
468
469 // We should have reached the end of the input
470 if (fPos != src.length()) {
471 error("Extra content at the end of the document", status);
472 goto errorExit;
473 }
474
475 // Success!
476 return root;
477
478errorExit:
479 delete root;
480 return NULL__null;
481}
482
483//
484// createElement
485// We've just matched an element start tag. Create and fill in a UXMLElement object
486// for it.
487//
488UXMLElement *
489UXMLParser::createElement(RegexMatcher &mEl, UErrorCode &status) {
490 // First capture group is the element's name.
491 UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status), status);
492
493 // Scan for attributes.
494 int32_t pos = mEl.end(1, status); // The position after the end of the tag name
495
496 while (mAttrValue.lookingAt(pos, status)) { // loop runs once per attribute on this element.
497 UnicodeString attName = mAttrValue.group(1, status);
498 UnicodeString attValue = mAttrValue.group(2, status);
499
500 // Trim the quotes from the att value. These are left over from the original regex
501 // that parsed the attribute, which couldn't conveniently strip them.
502 attValue.remove(0,1); // one char from the beginning
503 attValue.truncate(attValue.length()-1); // and one from the end.
504
505 // XML Attribute value normalization.
506 // This is one of the really screwy parts of the XML spec.
507 // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize
508 // Note that non-validating parsers must treat all entities as type CDATA
509 // which simplifies things some.
510
511 // Att normalization step 1: normalize any newlines in the attribute value
512 mNewLineNormalizer.reset(attValue);
513 attValue = mNewLineNormalizer.replaceAll(fOneLF, status);
514
515 // Next change all xml white space chars to plain \u0020 spaces.
516 mAttrNormalizer.reset(attValue);
517 UnicodeString oneSpace((UChar)0x0020);
518 attValue = mAttrNormalizer.replaceAll(oneSpace, status);
519
520 // Replace character entities.
521 replaceCharRefs(attValue, status);
522
523 // Save the attribute name and value in our document structure.
524 el->fAttNames.addElement((void *)intern(attName, status), status);
525 el->fAttValues.addElement(attValue.clone(), status);
526 pos = mAttrValue.end(2, status);
527 }
528 fPos = mEl.end(0, status);
529 return el;
530}
531
532//
533// parseMisc
534// Consume XML "Misc" [production #27]
535// which is any combination of space, PI and comments
536// Need to watch end-of-input because xml MISC stuff is allowed after
537// the document element, so we WILL scan off the end in this function
538//
539void
540UXMLParser::parseMisc(UErrorCode &status) {
541 for (;;) {
542 if (fPos >= mXMLPI.input().length()) {
543 break;
544 }
545 if (mXMLPI.lookingAt(fPos, status)) {
546 fPos = mXMLPI.end(status);
547 continue;
548 }
549 if (mXMLSP.lookingAt(fPos, status)) {
550 fPos = mXMLSP.end(status);
551 continue;
552 }
553 if (mXMLComment.lookingAt(fPos, status)) {
554 fPos = mXMLComment.end(status);
555 continue;
556 }
557 break;
558 }
559}
560
561//
562// Scan for document content.
563//
564UnicodeString
565UXMLParser::scanContent(UErrorCode &status) {
566 UnicodeString result;
567 if (mXMLCharData.lookingAt(fPos, status)) {
568 result = mXMLCharData.group((int32_t)0, status);
569 // Normalize the new-lines. (Before char ref substitution)
570 mNewLineNormalizer.reset(result);
571 result = mNewLineNormalizer.replaceAll(fOneLF, status);
572
573 // TODO: handle CDATA
574 fPos = mXMLCharData.end(0, status);
575 }
576
577 return result;
578}
579
580//
581// replaceCharRefs
582//
583// replace the char entities &lt; &amp; &#123; &#x12ab; etc. in a string
584// with the corresponding actual character.
585//
586void
587UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) {
588 UnicodeString result;
589 UnicodeString replacement;
590 int i;
591
592 mAmps.reset(s);
593 // See the initialization for the regex matcher mAmps.
594 // Which entity we've matched is determined by which capture group has content,
595 // which is flagged by start() of that group not being -1.
596 while (mAmps.find()) {
597 if (mAmps.start(1, status) != -1) {
598 replacement.setTo((UChar)x_AMP);
599 } else if (mAmps.start(2, status) != -1) {
600 replacement.setTo((UChar)x_LT);
601 } else if (mAmps.start(3, status) != -1) {
602 replacement.setTo((UChar)x_GT);
603 } else if (mAmps.start(4, status) != -1) {
604 replacement.setTo((UChar)x_APOS);
605 } else if (mAmps.start(5, status) != -1) {
606 replacement.setTo((UChar)x_QUOT);
607 } else if (mAmps.start(6, status) != -1) {
608 UnicodeString hexString = mAmps.group(6, status);
609 UChar32 val = 0;
610 for (i=0; i<hexString.length(); i++) {
611 val = (val << 4) + u_digitu_digit_71(hexString.charAt(i), 16);
612 }
613 // TODO: some verification that the character is valid
614 replacement.setTo(val);
615 } else if (mAmps.start(7, status) != -1) {
616 UnicodeString decimalString = mAmps.group(7, status);
617 UChar32 val = 0;
618 for (i=0; i<decimalString.length(); i++) {
619 val = val*10 + u_digitu_digit_71(decimalString.charAt(i), 10);
620 }
621 // TODO: some verification that the character is valid
622 replacement.setTo(val);
623 } else {
624 // An unrecognized &entity; Leave it alone.
625 // TODO: check that it really looks like an entity, and is not some
626 // random & in the text.
627 replacement = mAmps.group((int32_t)0, status);
628 }
629 mAmps.appendReplacement(result, replacement, status);
630 }
631 mAmps.appendTail(result);
632 s = result;
633}
634
635void
636UXMLParser::error(const char *message, UErrorCode &status) {
637 // TODO: something better here...
638 const UnicodeString &src=mXMLDecl.input();
639 int line = 0;
640 int ci = 0;
641 while (ci < fPos && ci>=0) {
642 ci = src.indexOf((UChar)0x0a, ci+1);
643 line++;
644 }
645 fprintf(stderrstderr, "Error: %s at line %d\n", message, line);
646 if (U_SUCCESS(status)) {
647 status = U_PARSE_ERROR;
648 }
649}
650
651// intern strings like in Java
652
653const UnicodeString *
654UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) {
655 const UHashElement *he=fNames.find(s);
656 if(he!=NULL__null) {
657 // already a known name, return its hashed key pointer
658 return (const UnicodeString *)he->key.pointer;
659 } else {
660 // add this new name and return its hashed key pointer
661 fNames.puti(s, 1, errorCode);
662 he=fNames.find(s);
663 return (const UnicodeString *)he->key.pointer;
664 }
665}
666
667const UnicodeString *
668UXMLParser::findName(const UnicodeString &s) const {
669 const UHashElement *he=fNames.find(s);
670 if(he!=NULL__null) {
671 // a known name, return its hashed key pointer
672 return (const UnicodeString *)he->key.pointer;
673 } else {
674 // unknown name
675 return NULL__null;
676 }
677}
678
679// UXMLElement ------------------------------------------------------------- ***
680
681UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) :
682 fParser(parser),
683 fName(name),
684 fAttNames(errorCode),
685 fAttValues(errorCode),
686 fChildren(errorCode),
687 fParent(NULL__null)
688{
689}
690
691UXMLElement::~UXMLElement() {
692 int i;
693 // attribute names are owned by the UXMLParser, don't delete them here
694 for (i=fAttValues.size()-1; i>=0; i--) {
695 delete (UObject *)fAttValues.elementAt(i);
696 }
697 for (i=fChildren.size()-1; i>=0; i--) {
698 delete (UObject *)fChildren.elementAt(i);
699 }
700}
701
702const UnicodeString &
703UXMLElement::getTagName() const {
704 return *fName;
705}
706
707UnicodeString
708UXMLElement::getText(UBool recurse) const {
709 UnicodeString text;
710 appendText(text, recurse);
1
Calling 'UXMLElement::appendText'
711 return text;
712}
713
714void
715UXMLElement::appendText(UnicodeString &text, UBool recurse) const {
716 const UObject *node;
717 int32_t i, count=fChildren.size();
718 for(i=0; i<count; ++i) {
2
Assuming 'i' is < 'count'
3
Loop condition is true. Entering loop body
719 node=(const UObject *)fChildren.elementAt(i);
4
Value assigned to 'node'
720 const UnicodeString *s=dynamic_cast<const UnicodeString *>(node);
721 if(s!=NULL__null) {
5
Assuming 's' is equal to NULL
6
Taking false branch
722 text.append(*s);
723 } else if(recurse) /* must be a UXMLElement */ {
7
Assuming 'recurse' is not equal to 0
8
Taking true branch
724 ((const UXMLElement *)node)->appendText(text, recurse);
9
Called C++ object pointer is null
725 }
726 }
727}
728
729int32_t
730UXMLElement::countAttributes() const {
731 return fAttNames.size();
732}
733
734const UnicodeString *
735UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const {
736 if(0<=i && i<fAttNames.size()) {
737 name.setTo(*(const UnicodeString *)fAttNames.elementAt(i));
738 value.setTo(*(const UnicodeString *)fAttValues.elementAt(i));
739 return &value; // or return (UnicodeString *)fAttValues.elementAt(i);
740 } else {
741 return NULL__null;
742 }
743}
744
745const UnicodeString *
746UXMLElement::getAttribute(const UnicodeString &name) const {
747 // search for the attribute name by comparing the interned pointer,
748 // not the string contents
749 const UnicodeString *p=fParser->findName(name);
750 if(p==NULL__null) {
751 return NULL__null; // no such attribute seen by the parser at all
752 }
753
754 int32_t i, count=fAttNames.size();
755 for(i=0; i<count; ++i) {
756 if(p==(const UnicodeString *)fAttNames.elementAt(i)) {
757 return (const UnicodeString *)fAttValues.elementAt(i);
758 }
759 }
760 return NULL__null;
761}
762
763int32_t
764UXMLElement::countChildren() const {
765 return fChildren.size();
766}
767
768const UObject *
769UXMLElement::getChild(int32_t i, UXMLNodeType &type) const {
770 if(0<=i && i<fChildren.size()) {
771 const UObject *node=(const UObject *)fChildren.elementAt(i);
772 if(dynamic_cast<const UXMLElement *>(node)!=NULL__null) {
773 type=UXML_NODE_TYPE_ELEMENT;
774 } else {
775 type=UXML_NODE_TYPE_STRING;
776 }
777 return node;
778 } else {
779 return NULL__null;
780 }
781}
782
783const UXMLElement *
784UXMLElement::nextChildElement(int32_t &i) const {
785 if(i<0) {
786 return NULL__null;
787 }
788
789 const UObject *node;
790 int32_t count=fChildren.size();
791 while(i<count) {
792 node=(const UObject *)fChildren.elementAt(i++);
793 const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node);
794 if(elem!=NULL__null) {
795 return elem;
796 }
797 }
798 return NULL__null;
799}
800
801const UXMLElement *
802UXMLElement::getChildElement(const UnicodeString &name) const {
803 // search for the element name by comparing the interned pointer,
804 // not the string contents
805 const UnicodeString *p=fParser->findName(name);
806 if(p==NULL__null) {
807 return NULL__null; // no such element seen by the parser at all
808 }
809
810 const UObject *node;
811 int32_t i, count=fChildren.size();
812 for(i=0; i<count; ++i) {
813 node=(const UObject *)fChildren.elementAt(i);
814 const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node);
815 if(elem!=NULL__null) {
816 if(p==elem->fName) {
817 return elem;
818 }
819 }
820 }
821 return NULL__null;
822}
823
824U_NAMESPACE_END}
825
826#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
827