Bug Summary

File:d/quotehtml.c
Warning:line 351, column 9
Duplicate code detected
Note:line 358, column 9
Similar code here

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-unknown-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name quotehtml.c -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/home/isvv/naviserver/nsd -resource-dir /usr/local/lib/clang/15.0.0 -D _FORTIFY_SOURCE=2 -D NDEBUG -D SYSTEM_MALLOC -I ../include -I /usr/include/tcl8.6 -D HAVE_CONFIG_H -internal-isystem /usr/local/lib/clang/15.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/11/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -std=c99 -fdebug-compilation-dir=/home/isvv/naviserver/nsd -ferror-limit 19 -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-checker alpha -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2022-07-23-130959-11103-1 -x c quotehtml.c
1/*
2 * The contents of this file are subject to the Mozilla Public License
3 * Version 1.1 (the "License"); you may not use this file except in
4 * compliance with the License. You may obtain a copy of the License at
5 * http://mozilla.org/.
6 *
7 * Software distributed under the License is distributed on an "AS IS"
8 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
9 * the License for the specific language governing rights and limitations
10 * under the License.
11 *
12 * The Original Code is AOLserver Code and related documentation
13 * distributed by AOL.
14 *
15 * The Initial Developer of the Original Code is America Online,
16 * Inc. Portions created by AOL are Copyright (C) 1999 America Online,
17 * Inc. All Rights Reserved.
18 *
19 * Alternatively, the contents of this file may be used under the terms
20 * of the GNU General Public License (the "GPL"), in which case the
21 * provisions of GPL are applicable instead of those above. If you wish
22 * to allow use of your version of this file only under the terms of the
23 * GPL and not to allow others to use your version of this file under the
24 * License, indicate your decision by deleting the provisions above and
25 * replace them with the notice and other provisions required by the GPL.
26 * If you do not delete the provisions above, a recipient may use your
27 * version of this file under either the License or the GPL.
28 */
29
30
31/*
32 * quotehtml.c --
33 *
34 * Take text and make it safe for HTML.
35 */
36
37#include "nsd.h"
38
39/*
40 * Static functions defined in this file.
41 */
42static void QuoteHtml(Ns_DStringTcl_DString *dsPtr, const char *breakChar, const char *htmlString)
43 NS_GNUC_NONNULL(1)__attribute__((__nonnull__(1))) NS_GNUC_NONNULL(2)__attribute__((__nonnull__(2))) NS_GNUC_NONNULL(3)__attribute__((__nonnull__(3)));
44
45static bool_Bool WordEndsInSemi(const char *word, size_t *lengthPtr)
46 NS_GNUC_NONNULL(1)__attribute__((__nonnull__(1)));
47
48static int ToUTF8(long value, char *outPtr)
49 NS_GNUC_NONNULL(2)__attribute__((__nonnull__(2)));
50
51static size_t EntityDecode(const char *entity, size_t length, bool_Bool *needEncodePtr, char *outPtr)
52 NS_GNUC_NONNULL(1)__attribute__((__nonnull__(1))) NS_GNUC_NONNULL(3)__attribute__((__nonnull__(3))) NS_GNUC_NONNULL(4)__attribute__((__nonnull__(4)));
53
54
55
56/*
57 *----------------------------------------------------------------------
58 *
59 * Ns_QuoteHtml --
60 *
61 * Quote an HTML string.
62 *
63 * Results:
64 * None.
65 *
66 * Side effects:
67 * Copies quoted HTML to given dstring.
68 *
69 *----------------------------------------------------------------------
70 */
71static void
72QuoteHtml(Ns_DStringTcl_DString *dsPtr, const char *breakChar, const char *htmlString)
73{
74 const char *toProcess = htmlString;
75
76 NS_NONNULL_ASSERT(dsPtr != NULL)((void) (0));
77 NS_NONNULL_ASSERT(breakChar != NULL)((void) (0));
78 NS_NONNULL_ASSERT(htmlString != NULL)((void) (0));
79
80 do {
81 /*
82 * Append the first part, escape the protected char, and
83 * continue.
84 */
85 Ns_DStringNAppendTcl_DStringAppend(dsPtr, toProcess, (int)(breakChar - toProcess));
86 switch (*breakChar) {
87 case '<':
88 Ns_DStringNAppendTcl_DStringAppend(dsPtr, "&lt;", 4);
89 break;
90
91 case '>':
92 Ns_DStringNAppendTcl_DStringAppend(dsPtr, "&gt;", 4);
93 break;
94
95 case '&':
96 Ns_DStringNAppendTcl_DStringAppend(dsPtr, "&amp;", 5);
97 break;
98
99 case '\'':
100 Ns_DStringNAppendTcl_DStringAppend(dsPtr, "&#39;", 5);
101 break;
102
103 case '"':
104 Ns_DStringNAppendTcl_DStringAppend(dsPtr, "&#34;", 5);
105 break;
106
107 default:
108 /*should not happen */ assert(0)((void) (0));
109 break;
110 }
111 /*
112 * Check for further protected characters.
113 */
114 toProcess = breakChar + 1;
115 breakChar = strpbrk(toProcess, "<>&'\"");
116
117 } while (breakChar != NULL((void*)0));
118
119 /*
120 * Append the last part if nonempty.
121 */
122 if (toProcess != NULL((void*)0)) {
123 Ns_DStringAppend(dsPtr, toProcess)Tcl_DStringAppend((dsPtr), (toProcess), -1);
124 }
125}
126
127
128void
129Ns_QuoteHtml(Ns_DStringTcl_DString *dsPtr, const char *htmlString)
130{
131 NS_NONNULL_ASSERT(dsPtr != NULL)((void) (0));
132 NS_NONNULL_ASSERT(htmlString != NULL)((void) (0));
133
134 /*
135 * If the first character is a null character, there is nothing to do.
136 */
137 if (*htmlString != '\0') {
138 const char *breakChar = strpbrk(htmlString, "<>&'\"");
139
140 if (breakChar != NULL((void*)0)) {
141 QuoteHtml(dsPtr, strpbrk(htmlString, "<>&'\""), htmlString);
142 } else {
143 Ns_DStringAppend(dsPtr, htmlString)Tcl_DStringAppend((dsPtr), (htmlString), -1);
144 }
145 }
146}
147
148
149
150/*
151 *----------------------------------------------------------------------
152 *
153 * NsTclQuoteHtmlObjCmd --
154 *
155 * Implements "ns_quotehtml".
156 *
157 * Results:
158 * Tcl result.
159 *
160 * Side effects:
161 * See docs.
162 *
163 *----------------------------------------------------------------------
164 */
165
166int
167NsTclQuoteHtmlObjCmd(ClientData UNUSED(clientData)UNUSED_clientData __attribute__((__unused__)), Tcl_Interp *interp, int objc, Tcl_Obj *const* objv)
168{
169 int result = TCL_OK0;
170 Tcl_Obj *htmlObj;
171 Ns_ObjvSpec args[] = {
172 {"html", Ns_ObjvObj, &htmlObj, NULL((void*)0)},
173 {NULL((void*)0), NULL((void*)0), NULL((void*)0), NULL((void*)0)}
174 };
175
176 if (Ns_ParseObjv(NULL((void*)0), args, interp, 1, objc, objv) != NS_OK) {
177 result = TCL_ERROR1;
178
179 } else {
180 const char *htmlString = Tcl_GetString(htmlObj);
181
182 if (*htmlString != '\0') {
183 const char *breakChar = strpbrk(htmlString, "<>&'\"");
184
185 if (breakChar == NULL((void*)0)) {
186 /*
187 * No need to copy anything.
188 */
189 Tcl_SetObjResult(interp, htmlObj);
190 } else {
191 Ns_DStringTcl_DString ds;
192
193 Ns_DStringInitTcl_DStringInit(&ds);
194 QuoteHtml(&ds, breakChar, htmlString);
195 Tcl_DStringResult(interp, &ds);
196
197 }
198 }
199 }
200
201 return result;
202}
203
204
205
206/*
207 *----------------------------------------------------------------------
208 *
209 * NsTclUnquoteHtmlObjCmd --
210 *
211 * This is essentially the opposite operation of NsTclQuoteHtmlObjCmd.
212 *
213 * Implements "ns_unquotehtml".
214 *
215 * Results:
216 * Tcl result.
217 *
218 * Side effects:
219 * See docs.
220 *
221 *----------------------------------------------------------------------
222 */
223
224int
225NsTclUnquoteHtmlObjCmd(ClientData UNUSED(clientData)UNUSED_clientData __attribute__((__unused__)), Tcl_Interp *interp, int objc, Tcl_Obj *const* objv)
226{
227 int result = TCL_OK0;
228 Tcl_Obj *htmlObj;
229 Ns_ObjvSpec args[] = {
230 {"html", Ns_ObjvObj, &htmlObj, NULL((void*)0)},
231 {NULL((void*)0), NULL((void*)0), NULL((void*)0), NULL((void*)0)}
232 };
233
234 if (Ns_ParseObjv(NULL((void*)0), args, interp, 1, objc, objv) != NS_OK) {
235 result = TCL_ERROR1;
236
237 } else {
238 Ns_DStringTcl_DString ds, *dsPtr = &ds;
239 const char *htmlString = Tcl_GetString(htmlObj);
240 bool_Bool needEncode = NS_FALSE0;
241
242 Ns_DStringInitTcl_DStringInit(&ds);
243
244 if (*htmlString != '\0') {
245
246 for (;;) {
247 const char *possibleEntity = strchr(htmlString, '&');
248
249 if (possibleEntity == NULL((void*)0)) {
250 /*
251 * We are done.
252 */
253 break;
254
255 } else {
256 size_t length = 0u;
257 int prefixLength = (int)(possibleEntity - htmlString);
258
259 /*
260 * Add the string leading to the ampersand to the output
261 * and proceed in the string by this amount of bytes.
262 */
263 if (possibleEntity != htmlString) {
264 Ns_DStringNAppendTcl_DStringAppend(dsPtr, htmlString, prefixLength);
265 htmlString += prefixLength;
266 }
267
268 if (WordEndsInSemi(possibleEntity, &length)) {
269 size_t decoded;
270 int oldLength = dsPtr->length;
271
272 /*
273 * The appended characters are max 4 bytes; make sure, we
274 * have this space in the Tcl_DString.
275 */
276 Tcl_DStringSetLength(dsPtr, oldLength + 4);
277 decoded = EntityDecode(possibleEntity + 1u, length, &needEncode,
278 dsPtr->string + oldLength);
279 Tcl_DStringSetLength(dsPtr, oldLength + (int)decoded);
280
281 /*
282 * Include the boundary characters "&" and ";" in the
283 * length calculation.
284 */
285 htmlString += (length + 2);
286 } else {
287 Ns_DStringNAppendTcl_DStringAppend(dsPtr, "&", 1);
288 htmlString ++;
289 }
290 }
291 }
292
293 /*
294 * Append the last chunk
295 */
296 Ns_DStringNAppendTcl_DStringAppend(dsPtr, htmlString, -1);
297
298 }
299
300 if (needEncode) {
301 Tcl_DString ds2;
302
303 (void)Tcl_ExternalToUtfDString(Ns_GetCharsetEncoding("utf-8"),
304 dsPtr->string, dsPtr->length, &ds2);
305 Tcl_DStringResult(interp, &ds2);
306 Tcl_DStringFree(dsPtr);
307
308 } else {
309 Tcl_DStringResult(interp, dsPtr);
310 }
311 }
312
313 return result;
314}
315
316/*
317 *----------------------------------------------------------------------
318 *
319 * ToUTF8 --
320 *
321 * Convert a Unicode code point to UTF-8. The function writes from 0 up to
322 * 4 bytes to the output.
323 *
324 * Results:
325 * Returns number of bytes written to the output. The value of 0 means
326 * invalid input.
327 *
328 * Side effects:
329 * None.
330 *
331 *----------------------------------------------------------------------
332 */
333
334static int
335ToUTF8(long value, char *outPtr)
336{
337 int length = 0;
338
339 NS_NONNULL_ASSERT(outPtr != NULL)((void) (0));
340
341 if(value <= 0x7F) {
342 *outPtr = (char)value;
343 length = 1;
344
345 } else if (value <= 0x7FF) {
346 *outPtr++ = (char)(((value >> 6) & 0x1F) | 0xC0);
347 *outPtr++ = (char)(((value >> 0) & 0x3F) | 0x80);
348 length = 2;
349
350 } else if (value <= 0xFFFF) {
351 *outPtr++ = (char) (((value >> 12) & 0x0F) | 0xE0);
Duplicate code detected
352 *outPtr++ = (char) (((value >> 6) & 0x3F) | 0x80);
353 *outPtr++ = (char) (((value >> 0) & 0x3F) | 0x80);
354 length = 3;
355
356 } else if (value <= 0x10FFFF) {
357 *outPtr++ = (char) (((value >> 18) & 0x07) | 0xF0);
358 *outPtr++ = (char) (((value >> 12) & 0x3F) | 0x80);
Similar code here
359 *outPtr++ = (char) (((value >> 6) & 0x3F) | 0x80);
360 *outPtr++ = (char) (((value >> 0) & 0x3F) | 0x80);
361 length = 4;
362 } else {
363 length = 0;
364 }
365 return length;
366}
367
368
369/*
370 *----------------------------------------------------------------------
371 *
372 * EntityDecode --
373 *
374 * Decode an HTML/XML entity, which might be numeric (starting with a '#'
375 * sign) or non-numeric.
376 *
377 * Results:
378 * Number of bytes written to outPtr (0 when nothing was decoded).
379 *
380 * Side effects:
381 * None.
382 *
383 *----------------------------------------------------------------------
384 */
385
386typedef struct namedEntity_t {
387 const char *name;
388 size_t length;
389 const char *value;
390 size_t outputLength;
391} namedEntity_t;
392
393static const namedEntity_t namedEntities[] = {
394 {"AElig", 5, "\xc3\x86", 2}, /* "Æ" */
395 {"Aacute", 6, "\xc3\x81", 2}, /* "Á" */
396 {"Acirc", 5, "\xc3\x82", 2}, /* "Â" */
397 {"Agrave", 6, "\xc3\x80", 2}, /* "À" */
398 {"Alpha", 5, "\xce\x91", 2}, /* "Α" */
399 {"Aring", 5, "\xc3\x85", 2}, /* "Å" */
400 {"Atilde", 6, "\xc3\x83", 2}, /* "Ã" */
401 {"Auml", 4, "\xc3\x84", 2}, /* "Ä" */
402 {"Beta", 4, "\xce\x92", 2}, /* "Β" */
403 {"Ccedil", 6, "\xc3\x87", 2}, /* "Ç" */
404 {"Chi", 3, "\xce\xa7", 2}, /* "Χ" */
405 {"Delta", 5, "\xce\x94", 2}, /* "Δ" */
406 {"ETH", 3, "\xc3\x90", 2}, /* "Ð" */
407 {"Eacute", 6, "\xc3\x89", 2}, /* "É" */
408 {"Ecirc", 5, "\xc3\x8a", 2}, /* "Ê" */
409 {"Egrave", 6, "\xc3\x88", 2}, /* "È" */
410 {"Epsilon", 7, "\xce\x95", 2}, /* "Ε" */
411 {"Eta", 3, "\xce\x97", 2}, /* "Η" */
412 {"Euml", 4, "\xc3\x8b", 2}, /* "Ë" */
413 {"Gamma", 5, "\xce\x93", 2}, /* "Γ" */
414 {"Iacute", 6, "\xc3\x8d", 2}, /* "Í" */
415 {"Icirc", 5, "\xc3\x8e", 2}, /* "Î" */
416 {"Igrave", 6, "\xc3\x8c", 2}, /* "Ì" */
417 {"Iota", 4, "\xce\x99", 2}, /* "Ι" */
418 {"Iuml", 4, "\xc3\x8f", 2}, /* "Ï" */
419 {"Kappa", 5, "\xce\x9a", 2}, /* "Κ" */
420 {"Lambda", 6, "\xce\x9b", 2}, /* "Λ" */
421 {"Mu", 2, "\xce\x9c", 2}, /* "Μ" */
422 {"Ntilde", 6, "\xc3\x91", 2}, /* "Ñ" */
423 {"Nu", 2, "\xce\x9d", 2}, /* "Ν" */
424 {"Oacute", 6, "\xc3\x93", 2}, /* "Ó" */
425 {"Ocirc", 5, "\xc3\x94", 2}, /* "Ô" */
426 {"Ograve", 6, "\xc3\x92", 2}, /* "Ò" */
427 {"Omega", 5, "\xce\xa9", 2}, /* "Ω" */
428 {"Omicron", 7, "\xce\x9f", 2}, /* "Ο" */
429 {"Oslash", 6, "\xc3\x98", 2}, /* "Ø" */
430 {"Otilde", 6, "\xc3\x95", 2}, /* "Õ" */
431 {"Ouml", 4, "\xc3\x96", 2}, /* "Ö" */
432 {"Phi", 3, "\xce\xa6", 2}, /* "Φ" */
433 {"Pi", 2, "\xce\xa0", 2}, /* "Π" */
434 {"Prime", 5, "\xe2\x80\xb3", 3}, /* "″" */
435 {"Psi", 3, "\xce\xa8", 2}, /* "Ψ" */
436 {"Rho", 3, "\xce\xa1", 2}, /* "Ρ" */
437 {"Sigma", 5, "\xce\xa3", 2}, /* "Σ" */
438 {"THORN", 5, "\xc3\x9e", 2}, /* "Þ" */
439 {"Tau", 3, "\xce\xa4", 2}, /* "Τ" */
440 {"Theta", 5, "\xce\x98", 2}, /* "Θ" */
441 {"Uacute", 6, "\xc3\x9a", 2}, /* "Ú" */
442 {"Ucirc", 5, "\xc3\x9b", 2}, /* "Û" */
443 {"Ugrave", 6, "\xc3\x99", 2}, /* "Ù" */
444 {"Upsilon", 7, "\xce\xa5", 2}, /* "Υ" */
445 {"Uuml", 4, "\xc3\x9c", 2}, /* "Ü" */
446 {"Xi", 2, "\xce\x9e", 2}, /* "Ξ" */
447 {"Yacute", 6, "\xc3\x9d", 2}, /* "Ý" */
448 {"Zeta", 4, "\xce\x96", 2}, /* "Ζ" */
449 {"aacute", 6, "\xc3\xa1", 2}, /* "á" */
450 {"acirc", 5, "\xc3\xa2", 2}, /* "â" */
451 {"acute", 5, "\xc2\xb4", 2}, /* "´" */
452 {"aelig", 5, "\xc3\xa6", 2}, /* "æ" */
453 {"agrave", 6, "\xc3\xa0", 2}, /* "à" */
454 {"alefsym", 7, "\xe2\x84\xb5", 3}, /* "ℵ" */
455 {"alpha", 5, "\xce\xb1", 2}, /* "α" */
456 {"amp", 3, "\x26", 1}, /* "&" */
457 {"and", 3, "\xe2\x88\xa7", 3}, /* "∧" */
458 {"ang", 3, "\xe2\x88\xa0", 3}, /* "∠" */
459 {"apos", 4, "\x27", 1}, /* "'" */
460 {"aring", 5, "\xc3\xa5", 2}, /* "å" */
461 {"asymp", 5, "\xe2\x89\x88", 3}, /* "≈" */
462 {"atilde", 6, "\xc3\xa3", 2}, /* "ã" */
463 {"auml", 4, "\xc3\xa4", 2}, /* "ä" */
464 {"beta", 4, "\xce\xb2", 2}, /* "β" */
465 {"brvbar", 6, "\xc2\xa6", 2}, /* "¦" */
466 {"bull", 4, "\xe2\x80\xa2", 3}, /* "•" */
467 {"cap", 3, "\xe2\x88\xa9", 3}, /* "∩" */
468 {"ccedil", 6, "\xc3\xa7", 2}, /* "ç" */
469 {"cedil", 5, "\xc2\xb8", 2}, /* "¸" */
470 {"cent", 4, "\xc2\xa2", 2}, /* "¢" */
471 {"chi", 3, "\xcf\x87", 2}, /* "χ" */
472 {"clubs", 5, "\xe2\x99\xa3", 3}, /* "♣" */
473 {"cong", 4, "\xe2\x89\x85", 3}, /* "≅" */
474 {"copy", 4, "\xc2\xa9", 2}, /* "©" */
475 {"crarr", 5, "\xe2\x86\xb5", 3}, /* "↵" */
476 {"cup", 3, "\xe2\x88\xaa", 3}, /* "∪" */
477 {"curren", 6, "\xc2\xa4", 2}, /* "¤" */
478 {"dArr", 4, "\xe2\x87\x93", 3}, /* "⇓" */
479 {"darr", 4, "\xe2\x86\x93", 3}, /* "↓" */
480 {"deg", 3, "\xc2\xb0", 2}, /* "°" */
481 {"delta", 5, "\xce\xb4", 2}, /* "δ" */
482 {"diams", 5, "\xe2\x99\xa6", 3}, /* "♦" */
483 {"divide", 6, "\xc3\xb7", 2}, /* "÷" */
484 {"eacute", 6, "\xc3\xa9", 2}, /* "é" */
485 {"ecirc", 5, "\xc3\xaa", 2}, /* "ê" */
486 {"egrave", 6, "\xc3\xa8", 2}, /* "è" */
487 {"empty", 5, "\xe2\x88\x85", 3}, /* "∅" */
488 {"epsilon", 7, "\xce\xb5", 2}, /* "ε" */
489 {"equiv", 5, "\xe2\x89\xa1", 3}, /* "≡" */
490 {"eta", 3, "\xce\xb7", 2}, /* "η" */
491 {"eth", 3, "\xc3\xb0", 2}, /* "ð" */
492 {"euml", 4, "\xc3\xab", 2}, /* "ë" */
493 {"euro", 4, "\xe2\x82\xac", 3}, /* "€" */
494 {"exist", 5, "\xe2\x88\x83", 3}, /* "∃" */
495 {"fnof", 4, "\xc6\x92", 2}, /* "ƒ" */
496 {"forall", 6, "\xe2\x88\x80", 3}, /* "∀" */
497 {"frac12", 6, "\xc2\xbd", 2}, /* "½" */
498 {"frac14", 6, "\xc2\xbc", 2}, /* "¼" */
499 {"frac34", 6, "\xc2\xbe", 2}, /* "¾" */
500 {"frasl", 5, "\xe2\x81\x84", 3}, /* "⁄" */
501 {"gamma", 5, "\xce\xb3", 2}, /* "γ" */
502 {"ge", 2, "\xe2\x89\xa5", 3}, /* "≥" */
503 {"gt", 2, "\x3e", 1}, /* ">" */
504 {"hArr", 4, "\xe2\x87\x94", 3}, /* "⇔" */
505 {"harr", 4, "\xe2\x86\x94", 3}, /* "↔" */
506 {"hearts", 6, "\xe2\x99\xa5", 3}, /* "♥" */
507 {"hellip", 6, "\xe2\x80\xa6", 3}, /* "…" */
508 {"iacute", 6, "\xc3\xad", 2}, /* "í" */
509 {"icirc", 5, "\xc3\xae", 2}, /* "î" */
510 {"iexcl", 5, "\xc2\xa1", 2}, /* "¡" */
511 {"igrave", 6, "\xc3\xac", 2}, /* "ì" */
512 {"image", 5, "\xe2\x84\x91", 3}, /* "ℑ" */
513 {"infin", 5, "\xe2\x88\x9e", 3}, /* "∞" */
514 {"int", 3, "\xe2\x88\xab", 3}, /* "∫" */
515 {"iota", 4, "\xce\xb9", 2}, /* "ι" */
516 {"iquest", 6, "\xc2\xbf", 2}, /* "¿" */
517 {"isin", 4, "\xe2\x88\x88", 3}, /* "∈" */
518 {"iuml", 4, "\xc3\xaf", 2}, /* "ï" */
519 {"kappa", 5, "\xce\xba", 2}, /* "κ" */
520 {"lArr", 4, "\xe2\x87\x90", 3}, /* "⇐" */
521 {"lambda", 6, "\xce\xbb", 2}, /* "λ" */
522 {"lang", 4, "\xe3\x80\x88", 3}, /* "〈" */
523 {"laquo", 5, "\xc2\xab", 2}, /* "«" */
524 {"larr", 4, "\xe2\x86\x90", 3}, /* "←" */
525 {"lceil", 5, "\xe2\x8c\x88", 3}, /* "⌈" */
526 {"le", 2, "\xe2\x89\xa4", 3}, /* "≤" */
527 {"lfloor", 6, "\xe2\x8c\x8a", 3}, /* "⌊" */
528 {"lowast", 6, "\xe2\x88\x97", 3}, /* "∗" */
529 {"loz", 3, "\xe2\x97\x8a", 3}, /* "◊" */
530 {"lt", 2, "\x3c", 1}, /* "<" */
531 {"macr", 4, "\xc2\xaf", 2}, /* "¯" */
532 {"micro", 5, "\xc2\xb5", 2}, /* "µ" */
533 {"middot", 6, "\xc2\xb7", 2}, /* "·" */
534 {"minus", 5, "\xe2\x88\x92", 3}, /* "−" */
535 {"mu", 2, "\xce\xbc", 2}, /* "μ" */
536 {"nabla", 5, "\xe2\x88\x87", 3}, /* "∇" */
537 {"nbsp", 4, "\x20", 1}, /* " " */
538 {"ne", 2, "\xe2\x89\xa0", 3}, /* "≠" */
539 {"ni", 2, "\xe2\x88\x8b", 3}, /* "∋" */
540 {"not", 3, "\xc2\xac", 2}, /* "¬" */
541 {"notin", 5, "\xe2\x88\x89", 3}, /* "∉" */
542 {"nsub", 4, "\xe2\x8a\x84", 3}, /* "⊄" */
543 {"ntilde", 6, "\xc3\xb1", 2}, /* "ñ" */
544 {"nu", 2, "\xce\xbd", 2}, /* "ν" */
545 {"oacute", 6, "\xc3\xb3", 2}, /* "ó" */
546 {"ocirc", 5, "\xc3\xb4", 2}, /* "ô" */
547 {"ograve", 6, "\xc3\xb2", 2}, /* "ò" */
548 {"oline", 5, "\xe2\x80\xbe", 3}, /* "‾" */
549 {"omega", 5, "\xcf\x89", 2}, /* "ω" */
550 {"omicron", 7, "\xce\xbf", 2}, /* "ο" */
551 {"oplus", 5, "\xe2\x8a\x95", 3}, /* "⊕" */
552 {"or", 2, "\xe2\x88\xa8", 3}, /* "∨" */
553 {"ordf", 4, "\xc2\xaa", 2}, /* "ª" */
554 {"ordm", 4, "\xc2\xba", 2}, /* "º" */
555 {"oslash", 6, "\xc3\xb8", 2}, /* "ø" */
556 {"otilde", 6, "\xc3\xb5", 2}, /* "õ" */
557 {"otimes", 6, "\xe2\x8a\x97", 3}, /* "⊗" */
558 {"ouml", 4, "\xc3\xb6", 2}, /* "ö" */
559 {"para", 4, "\xc2\xb6", 2}, /* "¶" */
560 {"part", 4, "\xe2\x88\x82", 3}, /* "∂" */
561 {"perp", 4, "\xe2\x8a\xa5", 3}, /* "⊥" */
562 {"phi", 3, "\xcf\x86", 2}, /* "φ" */
563 {"pi", 2, "\xcf\x80", 2}, /* "π" */
564 {"piv", 3, "\xcf\x96", 2}, /* "ϖ" */
565 {"plusmn", 6, "\xc2\xb1", 2}, /* "±" */
566 {"pound", 5, "\xc2\xa3", 2}, /* "£" */
567 {"prime", 5, "\xe2\x80\xb2", 3}, /* "′" */
568 {"prod", 4, "\xe2\x88\x8f", 3}, /* "∏" */
569 {"prop", 4, "\xe2\x88\x9d", 3}, /* "∝" */
570 {"psi", 3, "\xcf\x88", 2}, /* "ψ" */
571 {"quot", 4, "\x22", 1}, /* "\"" */
572 {"rArr", 4, "\xe2\x87\x92", 3}, /* "⇒" */
573 {"radic", 5, "\xe2\x88\x9a", 3}, /* "√" */
574 {"rang", 4, "\xe3\x80\x89", 3}, /* "〉" */
575 {"raquo", 5, "\xc2\xbb", 2}, /* "»" */
576 {"rarr", 4, "\xe2\x86\x92", 3}, /* "→" */
577 {"rceil", 5, "\xe2\x8c\x89", 3}, /* "⌉" */
578 {"real", 4, "\xe2\x84\x9c", 3}, /* "ℜ" */
579 {"reg", 3, "\xc2\xae", 2}, /* "®" */
580 {"rfloor", 6, "\xe2\x8c\x8b", 3}, /* "⌋" */
581 {"rho", 3, "\xcf\x81", 2}, /* "ρ" */
582 {"sdot", 4, "\xe2\x8b\x85", 3}, /* "⋅" */
583 {"sect", 4, "\xc2\xa7", 2}, /* "§" */
584 {"shy", 3, "\xc2\xad", 2}, /* "­" */
585 {"sigma", 5, "\xcf\x83", 2}, /* "σ" */
586 {"sigmaf", 6, "\xcf\x82", 2}, /* "ς" */
587 {"sim", 3, "\xe2\x88\xbc", 3}, /* "∼" */
588 {"spades", 6, "\xe2\x99\xa0", 3}, /* "♠" */
589 {"sub", 3, "\xe2\x8a\x82", 3}, /* "⊂" */
590 {"sube", 4, "\xe2\x8a\x86", 3}, /* "⊆" */
591 {"sum", 3, "\xe2\x88\x91", 3}, /* "∑" */
592 {"sup", 3, "\xe2\x8a\x83", 3}, /* "⊃" */
593 {"sup1", 4, "\xc2\xb9", 2}, /* "¹" */
594 {"sup2", 4, "\xc2\xb2", 2}, /* "²" */
595 {"sup3", 4, "\xc2\xb3", 2}, /* "³" */
596 {"supe", 4, "\xe2\x8a\x87", 3}, /* "⊇" */
597 {"szlig", 5, "\xc3\x9f", 2}, /* "ß" */
598 {"tau", 3, "\xcf\x84", 2}, /* "τ" */
599 {"there4", 6, "\xe2\x88\xb4", 3}, /* "∴" */
600 {"theta", 5, "\xce\xb8", 2}, /* "θ" */
601 {"thetasym", 8, "\xcf\x91", 2}, /* "ϑ" */
602 {"thorn", 5, "\xc3\xbe", 2}, /* "þ" */
603 {"times", 5, "\xc3\x97", 2}, /* "×" */
604 {"trade", 5, "\xe2\x84\xa2", 3}, /* "™" */
605 {"uArr", 4, "\xe2\x87\x91", 3}, /* "⇑" */
606 {"uacute", 6, "\xc3\xba", 2}, /* "ú" */
607 {"uarr", 4, "\xe2\x86\x91", 3}, /* "↑" */
608 {"ucirc", 5, "\xc3\xbb", 2}, /* "û" */
609 {"ugrave", 6, "\xc3\xb9", 2}, /* "ù" */
610 {"uml", 3, "\xc2\xa8", 2}, /* "¨" */
611 {"upsih", 5, "\xcf\x92", 2}, /* "ϒ" */
612 {"upsilon", 7, "\xcf\x85", 2}, /* "υ" */
613 {"uuml", 4, "\xc3\xbc", 2}, /* "ü" */
614 {"weierp", 6, "\xe2\x84\x98", 3}, /* "℘" */
615 {"xi", 2, "\xce\xbe", 2}, /* "ξ" */
616 {"yacute", 6, "\xc3\xbd", 2}, /* "ý" */
617 {"yen", 3, "\xc2\xa5", 2}, /* "¥" */
618 {"yuml", 4, "\xc3\xbf", 2}, /* "ÿ" */
619 {"zeta", 4, "\xce\xb6", 2}, /* "ζ" */
620 {NULL((void*)0), 0, "", 0}
621};
622
623
624static size_t
625EntityDecode(const char *entity, size_t length, bool_Bool *needEncodePtr, char *outPtr)
626{
627 size_t decoded = 0u;
628
629 NS_NONNULL_ASSERT(entity != NULL)((void) (0));
630 NS_NONNULL_ASSERT(outPtr != NULL)((void) (0));
631 NS_NONNULL_ASSERT(needEncodePtr != NULL)((void) (0));
632
633 /*
634 * Handle numeric entities.
635 */
636 if (*entity == '#') {
637 long value;
638
639 if (CHARTYPE(digit, *(entity + 1))(((*__ctype_b_loc ())[(int) (((int)((unsigned char)(*(entity +
1)))))] & (unsigned short int) _ISdigit))
!= 0) {
640 /*
641 * Decimal numeric entity.
642 */
643 value = strtol(entity + 1, NULL((void*)0), 10);
644
645 } else if (*(entity + 1) == 'x' && length >= 3 && length <= 8) {
646 /*
647 * Hexadecimal numeric entity.
648 */
649 value = strtol(entity + 2, NULL((void*)0), 16);
650
651 } else {
652 Ns_Log(Warning, "invalid numeric entity: '%s'", entity);
653 value = 0;
654 }
655
656 if (value >= 32) {
657 int outLength;
658
659 outLength = ToUTF8(value, outPtr);
660 decoded += (size_t)outLength;
661
662 Ns_Log(Debug, "entity decode: code point %.2lx %.2lx "
663 "corresponds to %d UTF-8 characters",
664 ((value >> 8) & 0xff), (value & 0xff), outLength);
665
666 if (value > 127) {
667 *needEncodePtr = NS_TRUE1;
668 }
669 } else {
670 /*
671 * ASCII device control characters should not be present in HTML.
672 */
673 Ns_Log(Notice, "entity decode: ignore numeric entity with value %ld", value);
674 }
675 } else {
676 size_t i;
677
678 for (i = 0; namedEntities[i].name != NULL((void*)0); i++) {
679 char firstChar = *namedEntities[i].name;
680
681 if (firstChar == *entity
682 && length == namedEntities[i].length
683 && strncmp(entity, namedEntities[i].name, length) == 0) {
684
685 /*if (strlen(entities[i].value) != entities[i].outputLength) {
686 fprintf(stderr, "--> name %s found l = %lu\n",
687 entities[i].name, strlen(entities[i].value));
688 }*/
689 if (namedEntities[i].outputLength > 1) {
690
691 memcpy(outPtr, namedEntities[i].value, namedEntities[i].outputLength);
692 decoded += namedEntities[i].outputLength;
693 } else {
694 *outPtr = *namedEntities[i].value;
695 decoded++;
696 }
697 break;
698 }
699
700 if (firstChar > *entity) {
701 Ns_Log(Warning, "ignore unknown named entity '%s'", entity);
702 break;
703 }
704 }
705 }
706
707 return decoded;
708}
709
710
711/*
712 *----------------------------------------------------------------------
713 *
714 * WordEndsInSemi --
715 *
716 * Does this word end in a semicolon or a space?
717 *
718 * Results:
719 * Returns true if the word ends with a semicolon.
720 *
721 * Side effects:
722 * Undefined behavior if the string is not NUL-terminated.
723 *
724 *----------------------------------------------------------------------
725 */
726
727static bool_Bool
728WordEndsInSemi(const char *word, size_t *lengthPtr)
729{
730 const char *start;
731
732 NS_NONNULL_ASSERT(word != NULL)((void) (0));
733
734 /*
735 * Advance past the first '&' so we can check for a second
736 * (i.e. to handle "ben&jerry&nbsp;")
737 */
738 if (*word == '&') {
739 word++;
740 }
741 start = word;
742 while((*word != '\0') && (*word != ' ') && (*word != ';') && (*word != '&')) {
743 word++;
744 }
745 *lengthPtr = (size_t)(word - start);
746
747 return (*word == ';');
748}
749
750
751
752/*
753 *----------------------------------------------------------------------
754 *
755 * NsTclStripHtmlObjCmd --
756 *
757 * Implements "ns_striphtml".
758 *
759 * Results:
760 * Tcl result.
761 *
762 * Side effects:
763 * See docs.
764 *
765 *----------------------------------------------------------------------
766 */
767
768int
769NsTclStripHtmlObjCmd(ClientData UNUSED(clientData)UNUSED_clientData __attribute__((__unused__)), Tcl_Interp *interp, int objc, Tcl_Obj *const* objv)
770{
771 int result = TCL_OK0;
772 char *htmlString = (char *)NS_EMPTY_STRING;
773 Ns_ObjvSpec args[] = {
774 {"html", Ns_ObjvString, &htmlString, NULL((void*)0)},
775 {NULL((void*)0), NULL((void*)0), NULL((void*)0), NULL((void*)0)}
776 };
777
778 if (Ns_ParseObjv(NULL((void*)0), args, interp, 1, objc, objv) != NS_OK) {
779 result = TCL_ERROR1;
780
781 } else {
782 bool_Bool intag; /* flag to see if we are inside a tag */
783 bool_Bool incomment; /* flag to see if we are inside a comment */
784 char *inString; /* copy of input string */
785 char *outPtr; /* moving pointer to output string */
786 const char *inPtr; /* moving pointer to input string */
787 bool_Bool needEncode;
788
789 /*
790 * Make a copy of the input and point the moving and output ptrs to it.
791 */
792 inString = ns_strdup(htmlString);
793 inPtr = inString;
794 outPtr = inString;
795 intag = NS_FALSE0;
796 incomment = NS_FALSE0;
797 needEncode = NS_FALSE0;
798
799 while (*inPtr != '\0') {
800
801 Ns_Log(Debug, "inptr %c intag %d incomment %d string <%s>",
802 *inPtr, intag, incomment, inPtr);
803
804 if (*inPtr == '<') {
805 intag = NS_TRUE1;
806 if ((*(inPtr + 1) == '!')
807 && (*(inPtr + 2) == '-')
808 && (*(inPtr + 3) == '-')) {
809 incomment = NS_TRUE1;
810 }
811 } else if (incomment) {
812 if ((*(inPtr) == '-')
813 && (*(inPtr + 1) == '-')
814 && (*(inPtr + 2) == '>')) {
815 incomment = NS_FALSE0;
816 }
817 } else if (intag && (*inPtr == '>')) {
818 /*
819 * Closing a tag.
820 */
821 intag = NS_FALSE0;
822
823 } else if (!intag) {
824 /*
825 * Regular text
826 */
827
828 if (*inPtr == '&') {
829 size_t length = 0u;
830
831 /*
832 * Starting an entity.
833 */
834 if (WordEndsInSemi(inPtr, &length)) {
835 size_t decoded = EntityDecode(inPtr + 1u, length, &needEncode, outPtr);
836
837 inPtr += (length + 1u);
838 outPtr += decoded;
839 }
840 Ns_Log(Debug, "...... after entity inptr '%c' intag %d incomment %d string <%s> needEncode %d",
841 *inPtr, intag, incomment, inPtr, needEncode);
842 } else {
843 /*
844 * Plain Text output
845 */
846 *outPtr++ = *inPtr;
847 }
848
849 } else {
850 /*
851 * Must be intag
852 */
853 }
854 ++inPtr;
855 }
856
857 /*
858 * Terminate output string.
859 */
860 *outPtr = '\0';
861
862 if (needEncode) {
863 Tcl_DString ds;
864
865 (void)Tcl_ExternalToUtfDString(Ns_GetCharsetEncoding("utf-8"),
866 inString, (int)strlen(inString), &ds);
867 Tcl_DStringResult(interp, &ds);
868 } else {
869 Tcl_SetObjResult(interp, Tcl_NewStringObj(inString, -1));
870 }
871 ns_free(inString);
872 }
873 return result;
874}
875
876
877/*
878 * Local Variables:
879 * mode: c
880 * c-basic-offset: 4
881 * fill-column: 78
882 * indent-tabs-mode: nil
883 * End:
884 */