File: | d/quotehtml.c |
Warning: | line 351, column 9 Duplicate code detected |
Note: | line 358, column 9 Similar code here |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | /* |
2 | * The contents of this file are subject to the Mozilla Public License |
3 | * Version 1.1 (the "License"); you may not use this file except in |
4 | * compliance with the License. You may obtain a copy of the License at |
5 | * http://mozilla.org/. |
6 | * |
7 | * Software distributed under the License is distributed on an "AS IS" |
8 | * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See |
9 | * the License for the specific language governing rights and limitations |
10 | * under the License. |
11 | * |
12 | * The Original Code is AOLserver Code and related documentation |
13 | * distributed by AOL. |
14 | * |
15 | * The Initial Developer of the Original Code is America Online, |
16 | * Inc. Portions created by AOL are Copyright (C) 1999 America Online, |
17 | * Inc. All Rights Reserved. |
18 | * |
19 | * Alternatively, the contents of this file may be used under the terms |
20 | * of the GNU General Public License (the "GPL"), in which case the |
21 | * provisions of GPL are applicable instead of those above. If you wish |
22 | * to allow use of your version of this file only under the terms of the |
23 | * GPL and not to allow others to use your version of this file under the |
24 | * License, indicate your decision by deleting the provisions above and |
25 | * replace them with the notice and other provisions required by the GPL. |
26 | * If you do not delete the provisions above, a recipient may use your |
27 | * version of this file under either the License or the GPL. |
28 | */ |
29 | |
30 | |
31 | /* |
32 | * quotehtml.c -- |
33 | * |
34 | * Take text and make it safe for HTML. |
35 | */ |
36 | |
37 | #include "nsd.h" |
38 | |
39 | /* |
40 | * Static functions defined in this file. |
41 | */ |
42 | static void QuoteHtml(Ns_DStringTcl_DString *dsPtr, const char *breakChar, const char *htmlString) |
43 | NS_GNUC_NONNULL(1)__attribute__((__nonnull__(1))) NS_GNUC_NONNULL(2)__attribute__((__nonnull__(2))) NS_GNUC_NONNULL(3)__attribute__((__nonnull__(3))); |
44 | |
45 | static bool_Bool WordEndsInSemi(const char *word, size_t *lengthPtr) |
46 | NS_GNUC_NONNULL(1)__attribute__((__nonnull__(1))); |
47 | |
48 | static int ToUTF8(long value, char *outPtr) |
49 | NS_GNUC_NONNULL(2)__attribute__((__nonnull__(2))); |
50 | |
51 | static size_t EntityDecode(const char *entity, size_t length, bool_Bool *needEncodePtr, char *outPtr) |
52 | NS_GNUC_NONNULL(1)__attribute__((__nonnull__(1))) NS_GNUC_NONNULL(3)__attribute__((__nonnull__(3))) NS_GNUC_NONNULL(4)__attribute__((__nonnull__(4))); |
53 | |
54 | |
55 | |
56 | /* |
57 | *---------------------------------------------------------------------- |
58 | * |
59 | * Ns_QuoteHtml -- |
60 | * |
61 | * Quote an HTML string. |
62 | * |
63 | * Results: |
64 | * None. |
65 | * |
66 | * Side effects: |
67 | * Copies quoted HTML to given dstring. |
68 | * |
69 | *---------------------------------------------------------------------- |
70 | */ |
71 | static void |
72 | QuoteHtml(Ns_DStringTcl_DString *dsPtr, const char *breakChar, const char *htmlString) |
73 | { |
74 | const char *toProcess = htmlString; |
75 | |
76 | NS_NONNULL_ASSERT(dsPtr != NULL)((void) (0)); |
77 | NS_NONNULL_ASSERT(breakChar != NULL)((void) (0)); |
78 | NS_NONNULL_ASSERT(htmlString != NULL)((void) (0)); |
79 | |
80 | do { |
81 | /* |
82 | * Append the first part, escape the protected char, and |
83 | * continue. |
84 | */ |
85 | Ns_DStringNAppendTcl_DStringAppend(dsPtr, toProcess, (int)(breakChar - toProcess)); |
86 | switch (*breakChar) { |
87 | case '<': |
88 | Ns_DStringNAppendTcl_DStringAppend(dsPtr, "<", 4); |
89 | break; |
90 | |
91 | case '>': |
92 | Ns_DStringNAppendTcl_DStringAppend(dsPtr, ">", 4); |
93 | break; |
94 | |
95 | case '&': |
96 | Ns_DStringNAppendTcl_DStringAppend(dsPtr, "&", 5); |
97 | break; |
98 | |
99 | case '\'': |
100 | Ns_DStringNAppendTcl_DStringAppend(dsPtr, "'", 5); |
101 | break; |
102 | |
103 | case '"': |
104 | Ns_DStringNAppendTcl_DStringAppend(dsPtr, """, 5); |
105 | break; |
106 | |
107 | default: |
108 | /*should not happen */ assert(0)((void) (0)); |
109 | break; |
110 | } |
111 | /* |
112 | * Check for further protected characters. |
113 | */ |
114 | toProcess = breakChar + 1; |
115 | breakChar = strpbrk(toProcess, "<>&'\""); |
116 | |
117 | } while (breakChar != NULL((void*)0)); |
118 | |
119 | /* |
120 | * Append the last part if nonempty. |
121 | */ |
122 | if (toProcess != NULL((void*)0)) { |
123 | Ns_DStringAppend(dsPtr, toProcess)Tcl_DStringAppend((dsPtr), (toProcess), -1); |
124 | } |
125 | } |
126 | |
127 | |
128 | void |
129 | Ns_QuoteHtml(Ns_DStringTcl_DString *dsPtr, const char *htmlString) |
130 | { |
131 | NS_NONNULL_ASSERT(dsPtr != NULL)((void) (0)); |
132 | NS_NONNULL_ASSERT(htmlString != NULL)((void) (0)); |
133 | |
134 | /* |
135 | * If the first character is a null character, there is nothing to do. |
136 | */ |
137 | if (*htmlString != '\0') { |
138 | const char *breakChar = strpbrk(htmlString, "<>&'\""); |
139 | |
140 | if (breakChar != NULL((void*)0)) { |
141 | QuoteHtml(dsPtr, strpbrk(htmlString, "<>&'\""), htmlString); |
142 | } else { |
143 | Ns_DStringAppend(dsPtr, htmlString)Tcl_DStringAppend((dsPtr), (htmlString), -1); |
144 | } |
145 | } |
146 | } |
147 | |
148 | |
149 | |
150 | /* |
151 | *---------------------------------------------------------------------- |
152 | * |
153 | * NsTclQuoteHtmlObjCmd -- |
154 | * |
155 | * Implements "ns_quotehtml". |
156 | * |
157 | * Results: |
158 | * Tcl result. |
159 | * |
160 | * Side effects: |
161 | * See docs. |
162 | * |
163 | *---------------------------------------------------------------------- |
164 | */ |
165 | |
166 | int |
167 | NsTclQuoteHtmlObjCmd(ClientData UNUSED(clientData)UNUSED_clientData __attribute__((__unused__)), Tcl_Interp *interp, int objc, Tcl_Obj *const* objv) |
168 | { |
169 | int result = TCL_OK0; |
170 | Tcl_Obj *htmlObj; |
171 | Ns_ObjvSpec args[] = { |
172 | {"html", Ns_ObjvObj, &htmlObj, NULL((void*)0)}, |
173 | {NULL((void*)0), NULL((void*)0), NULL((void*)0), NULL((void*)0)} |
174 | }; |
175 | |
176 | if (Ns_ParseObjv(NULL((void*)0), args, interp, 1, objc, objv) != NS_OK) { |
177 | result = TCL_ERROR1; |
178 | |
179 | } else { |
180 | const char *htmlString = Tcl_GetString(htmlObj); |
181 | |
182 | if (*htmlString != '\0') { |
183 | const char *breakChar = strpbrk(htmlString, "<>&'\""); |
184 | |
185 | if (breakChar == NULL((void*)0)) { |
186 | /* |
187 | * No need to copy anything. |
188 | */ |
189 | Tcl_SetObjResult(interp, htmlObj); |
190 | } else { |
191 | Ns_DStringTcl_DString ds; |
192 | |
193 | Ns_DStringInitTcl_DStringInit(&ds); |
194 | QuoteHtml(&ds, breakChar, htmlString); |
195 | Tcl_DStringResult(interp, &ds); |
196 | |
197 | } |
198 | } |
199 | } |
200 | |
201 | return result; |
202 | } |
203 | |
204 | |
205 | |
206 | /* |
207 | *---------------------------------------------------------------------- |
208 | * |
209 | * NsTclUnquoteHtmlObjCmd -- |
210 | * |
211 | * This is essentially the opposite operation of NsTclQuoteHtmlObjCmd. |
212 | * |
213 | * Implements "ns_unquotehtml". |
214 | * |
215 | * Results: |
216 | * Tcl result. |
217 | * |
218 | * Side effects: |
219 | * See docs. |
220 | * |
221 | *---------------------------------------------------------------------- |
222 | */ |
223 | |
224 | int |
225 | NsTclUnquoteHtmlObjCmd(ClientData UNUSED(clientData)UNUSED_clientData __attribute__((__unused__)), Tcl_Interp *interp, int objc, Tcl_Obj *const* objv) |
226 | { |
227 | int result = TCL_OK0; |
228 | Tcl_Obj *htmlObj; |
229 | Ns_ObjvSpec args[] = { |
230 | {"html", Ns_ObjvObj, &htmlObj, NULL((void*)0)}, |
231 | {NULL((void*)0), NULL((void*)0), NULL((void*)0), NULL((void*)0)} |
232 | }; |
233 | |
234 | if (Ns_ParseObjv(NULL((void*)0), args, interp, 1, objc, objv) != NS_OK) { |
235 | result = TCL_ERROR1; |
236 | |
237 | } else { |
238 | Ns_DStringTcl_DString ds, *dsPtr = &ds; |
239 | const char *htmlString = Tcl_GetString(htmlObj); |
240 | bool_Bool needEncode = NS_FALSE0; |
241 | |
242 | Ns_DStringInitTcl_DStringInit(&ds); |
243 | |
244 | if (*htmlString != '\0') { |
245 | |
246 | for (;;) { |
247 | const char *possibleEntity = strchr(htmlString, '&'); |
248 | |
249 | if (possibleEntity == NULL((void*)0)) { |
250 | /* |
251 | * We are done. |
252 | */ |
253 | break; |
254 | |
255 | } else { |
256 | size_t length = 0u; |
257 | int prefixLength = (int)(possibleEntity - htmlString); |
258 | |
259 | /* |
260 | * Add the string leading to the ampersand to the output |
261 | * and proceed in the string by this amount of bytes. |
262 | */ |
263 | if (possibleEntity != htmlString) { |
264 | Ns_DStringNAppendTcl_DStringAppend(dsPtr, htmlString, prefixLength); |
265 | htmlString += prefixLength; |
266 | } |
267 | |
268 | if (WordEndsInSemi(possibleEntity, &length)) { |
269 | size_t decoded; |
270 | int oldLength = dsPtr->length; |
271 | |
272 | /* |
273 | * The appended characters are max 4 bytes; make sure, we |
274 | * have this space in the Tcl_DString. |
275 | */ |
276 | Tcl_DStringSetLength(dsPtr, oldLength + 4); |
277 | decoded = EntityDecode(possibleEntity + 1u, length, &needEncode, |
278 | dsPtr->string + oldLength); |
279 | Tcl_DStringSetLength(dsPtr, oldLength + (int)decoded); |
280 | |
281 | /* |
282 | * Include the boundary characters "&" and ";" in the |
283 | * length calculation. |
284 | */ |
285 | htmlString += (length + 2); |
286 | } else { |
287 | Ns_DStringNAppendTcl_DStringAppend(dsPtr, "&", 1); |
288 | htmlString ++; |
289 | } |
290 | } |
291 | } |
292 | |
293 | /* |
294 | * Append the last chunk |
295 | */ |
296 | Ns_DStringNAppendTcl_DStringAppend(dsPtr, htmlString, -1); |
297 | |
298 | } |
299 | |
300 | if (needEncode) { |
301 | Tcl_DString ds2; |
302 | |
303 | (void)Tcl_ExternalToUtfDString(Ns_GetCharsetEncoding("utf-8"), |
304 | dsPtr->string, dsPtr->length, &ds2); |
305 | Tcl_DStringResult(interp, &ds2); |
306 | Tcl_DStringFree(dsPtr); |
307 | |
308 | } else { |
309 | Tcl_DStringResult(interp, dsPtr); |
310 | } |
311 | } |
312 | |
313 | return result; |
314 | } |
315 | |
/*
 *----------------------------------------------------------------------
 *
 * ToUTF8 --
 *
 *      Convert a Unicode code point to UTF-8.  The function writes from
 *      0 up to 4 bytes to the output; no terminating NUL is written.
 *
 *      Values that are not Unicode scalar values are rejected: negative
 *      numbers, values above 0x10FFFF, and the UTF-16 surrogate range
 *      0xD800..0xDFFF (which has no well-formed UTF-8 encoding).
 *
 * Results:
 *      Returns number of bytes written to the output.  The value of 0
 *      means invalid input; the output is left untouched in that case.
 *
 * Side effects:
 *      None.
 *
 *----------------------------------------------------------------------
 */

static int
ToUTF8(long value, char *outPtr)
{
    int length;

    NS_NONNULL_ASSERT(outPtr != NULL);

    if (value < 0 || value > 0x10FFFF
        || (value >= 0xD800 && value <= 0xDFFF)) {
        length = 0;

    } else if (value <= 0x7F) {
        *outPtr = (char)value;
        length = 1;

    } else {
        long leadMarker;
        int  i;

        /*
         * Multi-byte sequence: pick the length and the marker bits of
         * the lead byte.
         */
        if (value <= 0x7FF) {
            length = 2;
            leadMarker = 0xC0;
        } else if (value <= 0xFFFF) {
            length = 3;
            leadMarker = 0xE0;
        } else {
            length = 4;
            leadMarker = 0xF0;
        }

        /*
         * Fill the continuation bytes back to front, six payload bits
         * each; the bits left over afterwards belong to the lead byte.
         */
        for (i = length - 1; i > 0; i--) {
            outPtr[i] = (char)((value & 0x3F) | 0x80);
            value >>= 6;
        }
        outPtr[0] = (char)(value | leadMarker);
    }
    return length;
}
367 | |
368 | |
369 | /* |
370 | *---------------------------------------------------------------------- |
371 | * |
372 | * EntityDecode -- |
373 | * |
374 | * Decode an HTML/XML entity, which might be numeric (starting with a '#' |
375 | * sign) or non-numeric. |
376 | * |
377 | * Results: |
378 | * Number of decoded characters. |
379 | * |
380 | * Side effects: |
381 | * None. |
382 | * |
383 | *---------------------------------------------------------------------- |
384 | */ |
385 | |
/*
 * One row of the named-entity lookup table.
 */
typedef struct namedEntity_t {
    const char *name;          /* entity name without "&" and ";" */
    size_t      length;        /* number of bytes in name */
    const char *value;         /* replacement bytes (UTF-8) */
    size_t      outputLength;  /* number of bytes in value */
} namedEntity_t;

/*
 * Build a table row from two string literals; both lengths are taken
 * from the literals themselves, so they cannot get out of step with the
 * text.
 */
#define NAMED_ENTITY(entityName, utf8Bytes) \
    {(entityName), sizeof(entityName) - 1u, (utf8Bytes), sizeof(utf8Bytes) - 1u}

/*
 * Table of the known named entities.  The rows are grouped by ascending
 * first character of the name (upper case before lower case); the lookup
 * in EntityDecode() stops at the first row whose name starts with a
 * larger character, so new rows must keep that order.  The table ends
 * with a row whose name is NULL.
 */
static const namedEntity_t namedEntities[] = {
    NAMED_ENTITY("AElig", "\xc3\x86"),        /* "Æ" */
    NAMED_ENTITY("Aacute", "\xc3\x81"),       /* "Á" */
    NAMED_ENTITY("Acirc", "\xc3\x82"),        /* "Â" */
    NAMED_ENTITY("Agrave", "\xc3\x80"),       /* "À" */
    NAMED_ENTITY("Alpha", "\xce\x91"),        /* "Α" */
    NAMED_ENTITY("Aring", "\xc3\x85"),        /* "Å" */
    NAMED_ENTITY("Atilde", "\xc3\x83"),       /* "Ã" */
    NAMED_ENTITY("Auml", "\xc3\x84"),         /* "Ä" */
    NAMED_ENTITY("Beta", "\xce\x92"),         /* "Β" */
    NAMED_ENTITY("Ccedil", "\xc3\x87"),       /* "Ç" */
    NAMED_ENTITY("Chi", "\xce\xa7"),          /* "Χ" */
    NAMED_ENTITY("Delta", "\xce\x94"),        /* "Δ" */
    NAMED_ENTITY("ETH", "\xc3\x90"),          /* "Ð" */
    NAMED_ENTITY("Eacute", "\xc3\x89"),       /* "É" */
    NAMED_ENTITY("Ecirc", "\xc3\x8a"),        /* "Ê" */
    NAMED_ENTITY("Egrave", "\xc3\x88"),       /* "È" */
    NAMED_ENTITY("Epsilon", "\xce\x95"),      /* "Ε" */
    NAMED_ENTITY("Eta", "\xce\x97"),          /* "Η" */
    NAMED_ENTITY("Euml", "\xc3\x8b"),         /* "Ë" */
    NAMED_ENTITY("Gamma", "\xce\x93"),        /* "Γ" */
    NAMED_ENTITY("Iacute", "\xc3\x8d"),       /* "Í" */
    NAMED_ENTITY("Icirc", "\xc3\x8e"),        /* "Î" */
    NAMED_ENTITY("Igrave", "\xc3\x8c"),       /* "Ì" */
    NAMED_ENTITY("Iota", "\xce\x99"),         /* "Ι" */
    NAMED_ENTITY("Iuml", "\xc3\x8f"),         /* "Ï" */
    NAMED_ENTITY("Kappa", "\xce\x9a"),        /* "Κ" */
    NAMED_ENTITY("Lambda", "\xce\x9b"),       /* "Λ" */
    NAMED_ENTITY("Mu", "\xce\x9c"),           /* "Μ" */
    NAMED_ENTITY("Ntilde", "\xc3\x91"),       /* "Ñ" */
    NAMED_ENTITY("Nu", "\xce\x9d"),           /* "Ν" */
    NAMED_ENTITY("Oacute", "\xc3\x93"),       /* "Ó" */
    NAMED_ENTITY("Ocirc", "\xc3\x94"),        /* "Ô" */
    NAMED_ENTITY("Ograve", "\xc3\x92"),       /* "Ò" */
    NAMED_ENTITY("Omega", "\xce\xa9"),        /* "Ω" */
    NAMED_ENTITY("Omicron", "\xce\x9f"),      /* "Ο" */
    NAMED_ENTITY("Oslash", "\xc3\x98"),       /* "Ø" */
    NAMED_ENTITY("Otilde", "\xc3\x95"),       /* "Õ" */
    NAMED_ENTITY("Ouml", "\xc3\x96"),         /* "Ö" */
    NAMED_ENTITY("Phi", "\xce\xa6"),          /* "Φ" */
    NAMED_ENTITY("Pi", "\xce\xa0"),           /* "Π" */
    NAMED_ENTITY("Prime", "\xe2\x80\xb3"),    /* "″" */
    NAMED_ENTITY("Psi", "\xce\xa8"),          /* "Ψ" */
    NAMED_ENTITY("Rho", "\xce\xa1"),          /* "Ρ" */
    NAMED_ENTITY("Sigma", "\xce\xa3"),        /* "Σ" */
    NAMED_ENTITY("THORN", "\xc3\x9e"),        /* "Þ" */
    NAMED_ENTITY("Tau", "\xce\xa4"),          /* "Τ" */
    NAMED_ENTITY("Theta", "\xce\x98"),        /* "Θ" */
    NAMED_ENTITY("Uacute", "\xc3\x9a"),       /* "Ú" */
    NAMED_ENTITY("Ucirc", "\xc3\x9b"),        /* "Û" */
    NAMED_ENTITY("Ugrave", "\xc3\x99"),       /* "Ù" */
    NAMED_ENTITY("Upsilon", "\xce\xa5"),      /* "Υ" */
    NAMED_ENTITY("Uuml", "\xc3\x9c"),         /* "Ü" */
    NAMED_ENTITY("Xi", "\xce\x9e"),           /* "Ξ" */
    NAMED_ENTITY("Yacute", "\xc3\x9d"),       /* "Ý" */
    NAMED_ENTITY("Zeta", "\xce\x96"),         /* "Ζ" */
    NAMED_ENTITY("aacute", "\xc3\xa1"),       /* "á" */
    NAMED_ENTITY("acirc", "\xc3\xa2"),        /* "â" */
    NAMED_ENTITY("acute", "\xc2\xb4"),        /* "´" */
    NAMED_ENTITY("aelig", "\xc3\xa6"),        /* "æ" */
    NAMED_ENTITY("agrave", "\xc3\xa0"),       /* "à" */
    NAMED_ENTITY("alefsym", "\xe2\x84\xb5"),  /* "ℵ" */
    NAMED_ENTITY("alpha", "\xce\xb1"),        /* "α" */
    NAMED_ENTITY("amp", "\x26"),              /* "&" */
    NAMED_ENTITY("and", "\xe2\x88\xa7"),      /* "∧" */
    NAMED_ENTITY("ang", "\xe2\x88\xa0"),      /* "∠" */
    NAMED_ENTITY("apos", "\x27"),             /* "'" */
    NAMED_ENTITY("aring", "\xc3\xa5"),        /* "å" */
    NAMED_ENTITY("asymp", "\xe2\x89\x88"),    /* "≈" */
    NAMED_ENTITY("atilde", "\xc3\xa3"),       /* "ã" */
    NAMED_ENTITY("auml", "\xc3\xa4"),         /* "ä" */
    NAMED_ENTITY("beta", "\xce\xb2"),         /* "β" */
    NAMED_ENTITY("brvbar", "\xc2\xa6"),       /* "¦" */
    NAMED_ENTITY("bull", "\xe2\x80\xa2"),     /* "•" */
    NAMED_ENTITY("cap", "\xe2\x88\xa9"),      /* "∩" */
    NAMED_ENTITY("ccedil", "\xc3\xa7"),       /* "ç" */
    NAMED_ENTITY("cedil", "\xc2\xb8"),        /* "¸" */
    NAMED_ENTITY("cent", "\xc2\xa2"),         /* "¢" */
    NAMED_ENTITY("chi", "\xcf\x87"),          /* "χ" */
    NAMED_ENTITY("clubs", "\xe2\x99\xa3"),    /* "♣" */
    NAMED_ENTITY("cong", "\xe2\x89\x85"),     /* "≅" */
    NAMED_ENTITY("copy", "\xc2\xa9"),         /* "©" */
    NAMED_ENTITY("crarr", "\xe2\x86\xb5"),    /* "↵" */
    NAMED_ENTITY("cup", "\xe2\x88\xaa"),      /* "∪" */
    NAMED_ENTITY("curren", "\xc2\xa4"),       /* "¤" */
    NAMED_ENTITY("dArr", "\xe2\x87\x93"),     /* "⇓" */
    NAMED_ENTITY("darr", "\xe2\x86\x93"),     /* "↓" */
    NAMED_ENTITY("deg", "\xc2\xb0"),          /* "°" */
    NAMED_ENTITY("delta", "\xce\xb4"),        /* "δ" */
    NAMED_ENTITY("diams", "\xe2\x99\xa6"),    /* "♦" */
    NAMED_ENTITY("divide", "\xc3\xb7"),       /* "÷" */
    NAMED_ENTITY("eacute", "\xc3\xa9"),       /* "é" */
    NAMED_ENTITY("ecirc", "\xc3\xaa"),        /* "ê" */
    NAMED_ENTITY("egrave", "\xc3\xa8"),       /* "è" */
    NAMED_ENTITY("empty", "\xe2\x88\x85"),    /* "∅" */
    NAMED_ENTITY("epsilon", "\xce\xb5"),      /* "ε" */
    NAMED_ENTITY("equiv", "\xe2\x89\xa1"),    /* "≡" */
    NAMED_ENTITY("eta", "\xce\xb7"),          /* "η" */
    NAMED_ENTITY("eth", "\xc3\xb0"),          /* "ð" */
    NAMED_ENTITY("euml", "\xc3\xab"),         /* "ë" */
    NAMED_ENTITY("euro", "\xe2\x82\xac"),     /* "€" */
    NAMED_ENTITY("exist", "\xe2\x88\x83"),    /* "∃" */
    NAMED_ENTITY("fnof", "\xc6\x92"),         /* "ƒ" */
    NAMED_ENTITY("forall", "\xe2\x88\x80"),   /* "∀" */
    NAMED_ENTITY("frac12", "\xc2\xbd"),       /* "½" */
    NAMED_ENTITY("frac14", "\xc2\xbc"),       /* "¼" */
    NAMED_ENTITY("frac34", "\xc2\xbe"),       /* "¾" */
    NAMED_ENTITY("frasl", "\xe2\x81\x84"),    /* "⁄" */
    NAMED_ENTITY("gamma", "\xce\xb3"),        /* "γ" */
    NAMED_ENTITY("ge", "\xe2\x89\xa5"),       /* "≥" */
    NAMED_ENTITY("gt", "\x3e"),               /* ">" */
    NAMED_ENTITY("hArr", "\xe2\x87\x94"),     /* "⇔" */
    NAMED_ENTITY("harr", "\xe2\x86\x94"),     /* "↔" */
    NAMED_ENTITY("hearts", "\xe2\x99\xa5"),   /* "♥" */
    NAMED_ENTITY("hellip", "\xe2\x80\xa6"),   /* "…" */
    NAMED_ENTITY("iacute", "\xc3\xad"),       /* "í" */
    NAMED_ENTITY("icirc", "\xc3\xae"),        /* "î" */
    NAMED_ENTITY("iexcl", "\xc2\xa1"),        /* "¡" */
    NAMED_ENTITY("igrave", "\xc3\xac"),       /* "ì" */
    NAMED_ENTITY("image", "\xe2\x84\x91"),    /* "ℑ" */
    NAMED_ENTITY("infin", "\xe2\x88\x9e"),    /* "∞" */
    NAMED_ENTITY("int", "\xe2\x88\xab"),      /* "∫" */
    NAMED_ENTITY("iota", "\xce\xb9"),         /* "ι" */
    NAMED_ENTITY("iquest", "\xc2\xbf"),       /* "¿" */
    NAMED_ENTITY("isin", "\xe2\x88\x88"),     /* "∈" */
    NAMED_ENTITY("iuml", "\xc3\xaf"),         /* "ï" */
    NAMED_ENTITY("kappa", "\xce\xba"),        /* "κ" */
    NAMED_ENTITY("lArr", "\xe2\x87\x90"),     /* "⇐" */
    NAMED_ENTITY("lambda", "\xce\xbb"),       /* "λ" */
    NAMED_ENTITY("lang", "\xe3\x80\x88"),     /* "〈" */
    NAMED_ENTITY("laquo", "\xc2\xab"),        /* "«" */
    NAMED_ENTITY("larr", "\xe2\x86\x90"),     /* "←" */
    NAMED_ENTITY("lceil", "\xe2\x8c\x88"),    /* "⌈" */
    NAMED_ENTITY("le", "\xe2\x89\xa4"),       /* "≤" */
    NAMED_ENTITY("lfloor", "\xe2\x8c\x8a"),   /* "⌊" */
    NAMED_ENTITY("lowast", "\xe2\x88\x97"),   /* "∗" */
    NAMED_ENTITY("loz", "\xe2\x97\x8a"),      /* "◊" */
    NAMED_ENTITY("lt", "\x3c"),               /* "<" */
    NAMED_ENTITY("macr", "\xc2\xaf"),         /* "¯" */
    NAMED_ENTITY("micro", "\xc2\xb5"),        /* "µ" */
    NAMED_ENTITY("middot", "\xc2\xb7"),       /* "·" */
    NAMED_ENTITY("minus", "\xe2\x88\x92"),    /* "−" */
    NAMED_ENTITY("mu", "\xce\xbc"),           /* "μ" */
    NAMED_ENTITY("nabla", "\xe2\x88\x87"),    /* "∇" */
    NAMED_ENTITY("nbsp", "\x20"),             /* plain ASCII space (0x20), not U+00A0 */
    NAMED_ENTITY("ne", "\xe2\x89\xa0"),       /* "≠" */
    NAMED_ENTITY("ni", "\xe2\x88\x8b"),       /* "∋" */
    NAMED_ENTITY("not", "\xc2\xac"),          /* "¬" */
    NAMED_ENTITY("notin", "\xe2\x88\x89"),    /* "∉" */
    NAMED_ENTITY("nsub", "\xe2\x8a\x84"),     /* "⊄" */
    NAMED_ENTITY("ntilde", "\xc3\xb1"),       /* "ñ" */
    NAMED_ENTITY("nu", "\xce\xbd"),           /* "ν" */
    NAMED_ENTITY("oacute", "\xc3\xb3"),       /* "ó" */
    NAMED_ENTITY("ocirc", "\xc3\xb4"),        /* "ô" */
    NAMED_ENTITY("ograve", "\xc3\xb2"),       /* "ò" */
    NAMED_ENTITY("oline", "\xe2\x80\xbe"),    /* "‾" */
    NAMED_ENTITY("omega", "\xcf\x89"),        /* "ω" */
    NAMED_ENTITY("omicron", "\xce\xbf"),      /* "ο" */
    NAMED_ENTITY("oplus", "\xe2\x8a\x95"),    /* "⊕" */
    NAMED_ENTITY("or", "\xe2\x88\xa8"),       /* "∨" */
    NAMED_ENTITY("ordf", "\xc2\xaa"),         /* "ª" */
    NAMED_ENTITY("ordm", "\xc2\xba"),         /* "º" */
    NAMED_ENTITY("oslash", "\xc3\xb8"),       /* "ø" */
    NAMED_ENTITY("otilde", "\xc3\xb5"),       /* "õ" */
    NAMED_ENTITY("otimes", "\xe2\x8a\x97"),   /* "⊗" */
    NAMED_ENTITY("ouml", "\xc3\xb6"),         /* "ö" */
    NAMED_ENTITY("para", "\xc2\xb6"),         /* "¶" */
    NAMED_ENTITY("part", "\xe2\x88\x82"),     /* "∂" */
    NAMED_ENTITY("perp", "\xe2\x8a\xa5"),     /* "⊥" */
    NAMED_ENTITY("phi", "\xcf\x86"),          /* "φ" */
    NAMED_ENTITY("pi", "\xcf\x80"),           /* "π" */
    NAMED_ENTITY("piv", "\xcf\x96"),          /* "ϖ" */
    NAMED_ENTITY("plusmn", "\xc2\xb1"),       /* "±" */
    NAMED_ENTITY("pound", "\xc2\xa3"),        /* "£" */
    NAMED_ENTITY("prime", "\xe2\x80\xb2"),    /* "′" */
    NAMED_ENTITY("prod", "\xe2\x88\x8f"),     /* "∏" */
    NAMED_ENTITY("prop", "\xe2\x88\x9d"),     /* "∝" */
    NAMED_ENTITY("psi", "\xcf\x88"),          /* "ψ" */
    NAMED_ENTITY("quot", "\x22"),             /* "\"" */
    NAMED_ENTITY("rArr", "\xe2\x87\x92"),     /* "⇒" */
    NAMED_ENTITY("radic", "\xe2\x88\x9a"),    /* "√" */
    NAMED_ENTITY("rang", "\xe3\x80\x89"),     /* "〉" */
    NAMED_ENTITY("raquo", "\xc2\xbb"),        /* "»" */
    NAMED_ENTITY("rarr", "\xe2\x86\x92"),     /* "→" */
    NAMED_ENTITY("rceil", "\xe2\x8c\x89"),    /* "⌉" */
    NAMED_ENTITY("real", "\xe2\x84\x9c"),     /* "ℜ" */
    NAMED_ENTITY("reg", "\xc2\xae"),          /* "®" */
    NAMED_ENTITY("rfloor", "\xe2\x8c\x8b"),   /* "⌋" */
    NAMED_ENTITY("rho", "\xcf\x81"),          /* "ρ" */
    NAMED_ENTITY("sdot", "\xe2\x8b\x85"),     /* "⋅" */
    NAMED_ENTITY("sect", "\xc2\xa7"),         /* "§" */
    NAMED_ENTITY("shy", "\xc2\xad"),          /* soft hyphen */
    NAMED_ENTITY("sigma", "\xcf\x83"),        /* "σ" */
    NAMED_ENTITY("sigmaf", "\xcf\x82"),       /* "ς" */
    NAMED_ENTITY("sim", "\xe2\x88\xbc"),      /* "∼" */
    NAMED_ENTITY("spades", "\xe2\x99\xa0"),   /* "♠" */
    NAMED_ENTITY("sub", "\xe2\x8a\x82"),      /* "⊂" */
    NAMED_ENTITY("sube", "\xe2\x8a\x86"),     /* "⊆" */
    NAMED_ENTITY("sum", "\xe2\x88\x91"),      /* "∑" */
    NAMED_ENTITY("sup", "\xe2\x8a\x83"),      /* "⊃" */
    NAMED_ENTITY("sup1", "\xc2\xb9"),         /* "¹" */
    NAMED_ENTITY("sup2", "\xc2\xb2"),         /* "²" */
    NAMED_ENTITY("sup3", "\xc2\xb3"),         /* "³" */
    NAMED_ENTITY("supe", "\xe2\x8a\x87"),     /* "⊇" */
    NAMED_ENTITY("szlig", "\xc3\x9f"),        /* "ß" */
    NAMED_ENTITY("tau", "\xcf\x84"),          /* "τ" */
    NAMED_ENTITY("there4", "\xe2\x88\xb4"),   /* "∴" */
    NAMED_ENTITY("theta", "\xce\xb8"),        /* "θ" */
    NAMED_ENTITY("thetasym", "\xcf\x91"),     /* "ϑ" */
    NAMED_ENTITY("thorn", "\xc3\xbe"),        /* "þ" */
    NAMED_ENTITY("times", "\xc3\x97"),        /* "×" */
    NAMED_ENTITY("trade", "\xe2\x84\xa2"),    /* "™" */
    NAMED_ENTITY("uArr", "\xe2\x87\x91"),     /* "⇑" */
    NAMED_ENTITY("uacute", "\xc3\xba"),       /* "ú" */
    NAMED_ENTITY("uarr", "\xe2\x86\x91"),     /* "↑" */
    NAMED_ENTITY("ucirc", "\xc3\xbb"),        /* "û" */
    NAMED_ENTITY("ugrave", "\xc3\xb9"),       /* "ù" */
    NAMED_ENTITY("uml", "\xc2\xa8"),          /* "¨" */
    NAMED_ENTITY("upsih", "\xcf\x92"),        /* "ϒ" */
    NAMED_ENTITY("upsilon", "\xcf\x85"),      /* "υ" */
    NAMED_ENTITY("uuml", "\xc3\xbc"),         /* "ü" */
    NAMED_ENTITY("weierp", "\xe2\x84\x98"),   /* "℘" */
    NAMED_ENTITY("xi", "\xce\xbe"),           /* "ξ" */
    NAMED_ENTITY("yacute", "\xc3\xbd"),       /* "ý" */
    NAMED_ENTITY("yen", "\xc2\xa5"),          /* "¥" */
    NAMED_ENTITY("yuml", "\xc3\xbf"),         /* "ÿ" */
    NAMED_ENTITY("zeta", "\xce\xb6"),         /* "ζ" */
    {NULL, 0, "", 0}
};

#undef NAMED_ENTITY
622 | |
623 | |
624 | static size_t |
625 | EntityDecode(const char *entity, size_t length, bool_Bool *needEncodePtr, char *outPtr) |
626 | { |
627 | size_t decoded = 0u; |
628 | |
629 | NS_NONNULL_ASSERT(entity != NULL)((void) (0)); |
630 | NS_NONNULL_ASSERT(outPtr != NULL)((void) (0)); |
631 | NS_NONNULL_ASSERT(needEncodePtr != NULL)((void) (0)); |
632 | |
633 | /* |
634 | * Handle numeric entities. |
635 | */ |
636 | if (*entity == '#') { |
637 | long value; |
638 | |
639 | if (CHARTYPE(digit, *(entity + 1))(((*__ctype_b_loc ())[(int) (((int)((unsigned char)(*(entity + 1)))))] & (unsigned short int) _ISdigit)) != 0) { |
640 | /* |
641 | * Decimal numeric entity. |
642 | */ |
643 | value = strtol(entity + 1, NULL((void*)0), 10); |
644 | |
645 | } else if (*(entity + 1) == 'x' && length >= 3 && length <= 8) { |
646 | /* |
647 | * Hexadecimal numeric entity. |
648 | */ |
649 | value = strtol(entity + 2, NULL((void*)0), 16); |
650 | |
651 | } else { |
652 | Ns_Log(Warning, "invalid numeric entity: '%s'", entity); |
653 | value = 0; |
654 | } |
655 | |
656 | if (value >= 32) { |
657 | int outLength; |
658 | |
659 | outLength = ToUTF8(value, outPtr); |
660 | decoded += (size_t)outLength; |
661 | |
662 | Ns_Log(Debug, "entity decode: code point %.2lx %.2lx " |
663 | "corresponds to %d UTF-8 characters", |
664 | ((value >> 8) & 0xff), (value & 0xff), outLength); |
665 | |
666 | if (value > 127) { |
667 | *needEncodePtr = NS_TRUE1; |
668 | } |
669 | } else { |
670 | /* |
671 | * ASCII device control characters should not be present in HTML. |
672 | */ |
673 | Ns_Log(Notice, "entity decode: ignore numeric entity with value %ld", value); |
674 | } |
675 | } else { |
676 | size_t i; |
677 | |
678 | for (i = 0; namedEntities[i].name != NULL((void*)0); i++) { |
679 | char firstChar = *namedEntities[i].name; |
680 | |
681 | if (firstChar == *entity |
682 | && length == namedEntities[i].length |
683 | && strncmp(entity, namedEntities[i].name, length) == 0) { |
684 | |
685 | /*if (strlen(entities[i].value) != entities[i].outputLength) { |
686 | fprintf(stderr, "--> name %s found l = %lu\n", |
687 | entities[i].name, strlen(entities[i].value)); |
688 | }*/ |
689 | if (namedEntities[i].outputLength > 1) { |
690 | |
691 | memcpy(outPtr, namedEntities[i].value, namedEntities[i].outputLength); |
692 | decoded += namedEntities[i].outputLength; |
693 | } else { |
694 | *outPtr = *namedEntities[i].value; |
695 | decoded++; |
696 | } |
697 | break; |
698 | } |
699 | |
700 | if (firstChar > *entity) { |
701 | Ns_Log(Warning, "ignore unknown named entity '%s'", entity); |
702 | break; |
703 | } |
704 | } |
705 | } |
706 | |
707 | return decoded; |
708 | } |
709 | |
710 | |
/*
 *----------------------------------------------------------------------
 *
 * WordEndsInSemi --
 *
 *      Check whether the word at the given position is terminated by a
 *      semicolon.  One leading '&' is skipped; the word then extends up
 *      to the first space, ';', '&' or the end of the string.  Stopping
 *      at a further '&' handles input such as "ben&jerry ".
 *
 * Results:
 *      Returns true if the word ends with a semicolon.  The number of
 *      bytes in the word (without the skipped '&' and without the
 *      terminator) is stored in *lengthPtr in every case.
 *
 * Side effects:
 *      Undefined behavior if the string is not NUL-terminated.
 *
 *----------------------------------------------------------------------
 */

static bool
WordEndsInSemi(const char *word, size_t *lengthPtr)
{
    const char *body;
    size_t      span;

    NS_NONNULL_ASSERT(word != NULL);

    /*
     * Skip the ampersand introducing the word, if there is one.
     */
    body = (*word == '&') ? (word + 1) : word;

    /*
     * Measure up to the first terminator; strcspn() also stops at the
     * end of the string.
     */
    span = strcspn(body, " ;&");
    *lengthPtr = span;

    return (body[span] == ';');
}
749 | |
750 | |
751 | |
752 | /* |
753 | *---------------------------------------------------------------------- |
754 | * |
755 | * NsTclStripHtmlObjCmd -- |
756 | * |
757 | * Implements "ns_striphtml". |
758 | * |
759 | * Results: |
760 | * Tcl result. |
761 | * |
762 | * Side effects: |
763 | * See docs. |
764 | * |
765 | *---------------------------------------------------------------------- |
766 | */ |
767 | |
768 | int |
769 | NsTclStripHtmlObjCmd(ClientData UNUSED(clientData)UNUSED_clientData __attribute__((__unused__)), Tcl_Interp *interp, int objc, Tcl_Obj *const* objv) |
770 | { |
771 | int result = TCL_OK0; |
772 | char *htmlString = (char *)NS_EMPTY_STRING; |
773 | Ns_ObjvSpec args[] = { |
774 | {"html", Ns_ObjvString, &htmlString, NULL((void*)0)}, |
775 | {NULL((void*)0), NULL((void*)0), NULL((void*)0), NULL((void*)0)} |
776 | }; |
777 | |
778 | if (Ns_ParseObjv(NULL((void*)0), args, interp, 1, objc, objv) != NS_OK) { |
779 | result = TCL_ERROR1; |
780 | |
781 | } else { |
782 | bool_Bool intag; /* flag to see if are we inside a tag */ |
783 | bool_Bool incomment; /* flag to see if we are inside a comment */ |
784 | char *inString; /* copy of input string */ |
785 | char *outPtr; /* moving pointer to output string */ |
786 | const char *inPtr; /* moving pointer to input string */ |
787 | bool_Bool needEncode; |
788 | |
789 | /* |
790 | * Make a copy of the input and point the moving and output ptrs to it. |
791 | */ |
792 | inString = ns_strdup(htmlString); |
793 | inPtr = inString; |
794 | outPtr = inString; |
795 | intag = NS_FALSE0; |
796 | incomment = NS_FALSE0; |
797 | needEncode = NS_FALSE0; |
798 | |
799 | while (*inPtr != '\0') { |
800 | |
801 | Ns_Log(Debug, "inptr %c intag %d incomment %d string <%s>", |
802 | *inPtr, intag, incomment, inPtr); |
803 | |
804 | if (*inPtr == '<') { |
805 | intag = NS_TRUE1; |
806 | if ((*(inPtr + 1) == '!') |
807 | && (*(inPtr + 2) == '-') |
808 | && (*(inPtr + 3) == '-')) { |
809 | incomment = NS_TRUE1; |
810 | } |
811 | } else if (incomment) { |
812 | if ((*(inPtr) == '-') |
813 | && (*(inPtr + 1) == '-') |
814 | && (*(inPtr + 2) == '>')) { |
815 | incomment = NS_FALSE0; |
816 | } |
817 | } else if (intag && (*inPtr == '>')) { |
818 | /* |
819 | * Closing a tag. |
820 | */ |
821 | intag = NS_FALSE0; |
822 | |
823 | } else if (!intag) { |
824 | /* |
825 | * Regular text |
826 | */ |
827 | |
828 | if (*inPtr == '&') { |
829 | size_t length = 0u; |
830 | |
831 | /* |
832 | * Starting an entity. |
833 | */ |
834 | if (WordEndsInSemi(inPtr, &length)) { |
835 | size_t decoded = EntityDecode(inPtr + 1u, length, &needEncode, outPtr); |
836 | |
837 | inPtr += (length + 1u); |
838 | outPtr += decoded; |
839 | } |
840 | Ns_Log(Debug, "...... after entity inptr '%c' intag %d incomment %d string <%s> needEncode %d", |
841 | *inPtr, intag, incomment, inPtr, needEncode); |
842 | } else { |
843 | /* |
844 | * Plain Text output |
845 | */ |
846 | *outPtr++ = *inPtr; |
847 | } |
848 | |
849 | } else { |
850 | /* |
851 | * Must be intag |
852 | */ |
853 | } |
854 | ++inPtr; |
855 | } |
856 | |
857 | /* |
858 | * Terminate output string. |
859 | */ |
860 | *outPtr = '\0'; |
861 | |
862 | if (needEncode) { |
863 | Tcl_DString ds; |
864 | |
865 | (void)Tcl_ExternalToUtfDString(Ns_GetCharsetEncoding("utf-8"), |
866 | inString, (int)strlen(inString), &ds); |
867 | Tcl_DStringResult(interp, &ds); |
868 | } else { |
869 | Tcl_SetObjResult(interp, Tcl_NewStringObj(inString, -1)); |
870 | } |
871 | ns_free(inString); |
872 | } |
873 | return result; |
874 | } |
875 | |
876 | |
877 | /* |
878 | * Local Variables: |
879 | * mode: c |
880 | * c-basic-offset: 4 |
881 | * fill-column: 78 |
882 | * indent-tabs-mode: nil |
883 | * End: |
884 | */ |