| File: | d/quotehtml.c |
| Warning: | line 351, column 9 Duplicate code detected |
| Note: | line 358, column 9 Similar code here |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
| 1 | /* |
| 2 | * The contents of this file are subject to the Mozilla Public License |
| 3 | * Version 1.1 (the "License"); you may not use this file except in |
| 4 | * compliance with the License. You may obtain a copy of the License at |
| 5 | * http://mozilla.org/. |
| 6 | * |
| 7 | * Software distributed under the License is distributed on an "AS IS" |
| 8 | * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See |
| 9 | * the License for the specific language governing rights and limitations |
| 10 | * under the License. |
| 11 | * |
| 12 | * The Original Code is AOLserver Code and related documentation |
| 13 | * distributed by AOL. |
| 14 | * |
| 15 | * The Initial Developer of the Original Code is America Online, |
| 16 | * Inc. Portions created by AOL are Copyright (C) 1999 America Online, |
| 17 | * Inc. All Rights Reserved. |
| 18 | * |
| 19 | * Alternatively, the contents of this file may be used under the terms |
| 20 | * of the GNU General Public License (the "GPL"), in which case the |
| 21 | * provisions of GPL are applicable instead of those above. If you wish |
| 22 | * to allow use of your version of this file only under the terms of the |
| 23 | * GPL and not to allow others to use your version of this file under the |
| 24 | * License, indicate your decision by deleting the provisions above and |
| 25 | * replace them with the notice and other provisions required by the GPL. |
| 26 | * If you do not delete the provisions above, a recipient may use your |
| 27 | * version of this file under either the License or the GPL. |
| 28 | */ |
| 29 | |
| 30 | |
| 31 | /* |
| 32 | * quotehtml.c -- |
| 33 | * |
| 34 | * Take text and make it safe for HTML. |
| 35 | */ |
| 36 | |
| 37 | #include "nsd.h" |
| 38 | |
| 39 | /* |
| 40 | * Static functions defined in this file. |
| 41 | */ |
/* Appends htmlString to dsPtr with HTML-special characters escaped;
 * breakChar is the position of a special character inside htmlString. */
static void QuoteHtml(Ns_DString *dsPtr, const char *breakChar, const char *htmlString)
    NS_GNUC_NONNULL(1) NS_GNUC_NONNULL(2) NS_GNUC_NONNULL(3);

/* Measures the word following an optional leading '&' and reports whether
 * it is terminated by ';'. */
static bool WordEndsInSemi(const char *word, size_t *lengthPtr)
    NS_GNUC_NONNULL(1);

/* Writes the UTF-8 encoding of a code point to outPtr and returns the
 * number of bytes written. */
static int ToUTF8(long value, char *outPtr)
    NS_GNUC_NONNULL(2);

/* Decodes one numeric or named entity into outPtr and returns the number
 * of bytes written. */
static size_t EntityDecode(const char *entity, size_t length, bool *needEncodePtr, char *outPtr)
    NS_GNUC_NONNULL(1) NS_GNUC_NONNULL(3) NS_GNUC_NONNULL(4);
| 53 | |
| 54 | |
| 55 | |
| 56 | /* |
| 57 | *---------------------------------------------------------------------- |
| 58 | * |
| 59 | * Ns_QuoteHtml -- |
| 60 | * |
| 61 | * Quote an HTML string. |
| 62 | * |
| 63 | * Results: |
| 64 | * None. |
| 65 | * |
| 66 | * Side effects: |
| 67 | * Copies quoted HTML to given dstring. |
| 68 | * |
| 69 | *---------------------------------------------------------------------- |
| 70 | */ |
| 71 | static void |
| 72 | QuoteHtml(Ns_DStringTcl_DString *dsPtr, const char *breakChar, const char *htmlString) |
| 73 | { |
| 74 | const char *toProcess = htmlString; |
| 75 | |
| 76 | NS_NONNULL_ASSERT(dsPtr != NULL)((void) (0)); |
| 77 | NS_NONNULL_ASSERT(breakChar != NULL)((void) (0)); |
| 78 | NS_NONNULL_ASSERT(htmlString != NULL)((void) (0)); |
| 79 | |
| 80 | do { |
| 81 | /* |
| 82 | * Append the first part, escape the protected char, and |
| 83 | * continue. |
| 84 | */ |
| 85 | Ns_DStringNAppendTcl_DStringAppend(dsPtr, toProcess, (int)(breakChar - toProcess)); |
| 86 | switch (*breakChar) { |
| 87 | case '<': |
| 88 | Ns_DStringNAppendTcl_DStringAppend(dsPtr, "<", 4); |
| 89 | break; |
| 90 | |
| 91 | case '>': |
| 92 | Ns_DStringNAppendTcl_DStringAppend(dsPtr, ">", 4); |
| 93 | break; |
| 94 | |
| 95 | case '&': |
| 96 | Ns_DStringNAppendTcl_DStringAppend(dsPtr, "&", 5); |
| 97 | break; |
| 98 | |
| 99 | case '\'': |
| 100 | Ns_DStringNAppendTcl_DStringAppend(dsPtr, "'", 5); |
| 101 | break; |
| 102 | |
| 103 | case '"': |
| 104 | Ns_DStringNAppendTcl_DStringAppend(dsPtr, """, 5); |
| 105 | break; |
| 106 | |
| 107 | default: |
| 108 | /*should not happen */ assert(0)((void) (0)); |
| 109 | break; |
| 110 | } |
| 111 | /* |
| 112 | * Check for further protected characters. |
| 113 | */ |
| 114 | toProcess = breakChar + 1; |
| 115 | breakChar = strpbrk(toProcess, "<>&'\""); |
| 116 | |
| 117 | } while (breakChar != NULL((void*)0)); |
| 118 | |
| 119 | /* |
| 120 | * Append the last part if nonempty. |
| 121 | */ |
| 122 | if (toProcess != NULL((void*)0)) { |
| 123 | Ns_DStringAppend(dsPtr, toProcess)Tcl_DStringAppend((dsPtr), (toProcess), -1); |
| 124 | } |
| 125 | } |
| 126 | |
| 127 | |
| 128 | void |
| 129 | Ns_QuoteHtml(Ns_DStringTcl_DString *dsPtr, const char *htmlString) |
| 130 | { |
| 131 | NS_NONNULL_ASSERT(dsPtr != NULL)((void) (0)); |
| 132 | NS_NONNULL_ASSERT(htmlString != NULL)((void) (0)); |
| 133 | |
| 134 | /* |
| 135 | * If the first character is a null character, there is nothing to do. |
| 136 | */ |
| 137 | if (*htmlString != '\0') { |
| 138 | const char *breakChar = strpbrk(htmlString, "<>&'\""); |
| 139 | |
| 140 | if (breakChar != NULL((void*)0)) { |
| 141 | QuoteHtml(dsPtr, strpbrk(htmlString, "<>&'\""), htmlString); |
| 142 | } else { |
| 143 | Ns_DStringAppend(dsPtr, htmlString)Tcl_DStringAppend((dsPtr), (htmlString), -1); |
| 144 | } |
| 145 | } |
| 146 | } |
| 147 | |
| 148 | |
| 149 | |
| 150 | /* |
| 151 | *---------------------------------------------------------------------- |
| 152 | * |
| 153 | * NsTclQuoteHtmlObjCmd -- |
| 154 | * |
| 155 | * Implements "ns_quotehtml". |
| 156 | * |
| 157 | * Results: |
| 158 | * Tcl result. |
| 159 | * |
| 160 | * Side effects: |
| 161 | * See docs. |
| 162 | * |
| 163 | *---------------------------------------------------------------------- |
| 164 | */ |
| 165 | |
| 166 | int |
| 167 | NsTclQuoteHtmlObjCmd(ClientData UNUSED(clientData)UNUSED_clientData __attribute__((__unused__)), Tcl_Interp *interp, int objc, Tcl_Obj *const* objv) |
| 168 | { |
| 169 | int result = TCL_OK0; |
| 170 | Tcl_Obj *htmlObj; |
| 171 | Ns_ObjvSpec args[] = { |
| 172 | {"html", Ns_ObjvObj, &htmlObj, NULL((void*)0)}, |
| 173 | {NULL((void*)0), NULL((void*)0), NULL((void*)0), NULL((void*)0)} |
| 174 | }; |
| 175 | |
| 176 | if (Ns_ParseObjv(NULL((void*)0), args, interp, 1, objc, objv) != NS_OK) { |
| 177 | result = TCL_ERROR1; |
| 178 | |
| 179 | } else { |
| 180 | const char *htmlString = Tcl_GetString(htmlObj); |
| 181 | |
| 182 | if (*htmlString != '\0') { |
| 183 | const char *breakChar = strpbrk(htmlString, "<>&'\""); |
| 184 | |
| 185 | if (breakChar == NULL((void*)0)) { |
| 186 | /* |
| 187 | * No need to copy anything. |
| 188 | */ |
| 189 | Tcl_SetObjResult(interp, htmlObj); |
| 190 | } else { |
| 191 | Ns_DStringTcl_DString ds; |
| 192 | |
| 193 | Ns_DStringInitTcl_DStringInit(&ds); |
| 194 | QuoteHtml(&ds, breakChar, htmlString); |
| 195 | Tcl_DStringResult(interp, &ds); |
| 196 | |
| 197 | } |
| 198 | } |
| 199 | } |
| 200 | |
| 201 | return result; |
| 202 | } |
| 203 | |
| 204 | |
| 205 | |
| 206 | /* |
| 207 | *---------------------------------------------------------------------- |
| 208 | * |
| 209 | * NsTclUnquoteHtmlObjCmd -- |
| 210 | * |
| 211 | * This is essentially the opposite operation of NsTclQuoteHtmlObjCmd. |
| 212 | * |
| 213 | * Implements "ns_unquotehtml". |
| 214 | * |
| 215 | * Results: |
| 216 | * Tcl result. |
| 217 | * |
| 218 | * Side effects: |
| 219 | * See docs. |
| 220 | * |
| 221 | *---------------------------------------------------------------------- |
| 222 | */ |
| 223 | |
| 224 | int |
| 225 | NsTclUnquoteHtmlObjCmd(ClientData UNUSED(clientData)UNUSED_clientData __attribute__((__unused__)), Tcl_Interp *interp, int objc, Tcl_Obj *const* objv) |
| 226 | { |
| 227 | int result = TCL_OK0; |
| 228 | Tcl_Obj *htmlObj; |
| 229 | Ns_ObjvSpec args[] = { |
| 230 | {"html", Ns_ObjvObj, &htmlObj, NULL((void*)0)}, |
| 231 | {NULL((void*)0), NULL((void*)0), NULL((void*)0), NULL((void*)0)} |
| 232 | }; |
| 233 | |
| 234 | if (Ns_ParseObjv(NULL((void*)0), args, interp, 1, objc, objv) != NS_OK) { |
| 235 | result = TCL_ERROR1; |
| 236 | |
| 237 | } else { |
| 238 | Ns_DStringTcl_DString ds, *dsPtr = &ds; |
| 239 | const char *htmlString = Tcl_GetString(htmlObj); |
| 240 | bool_Bool needEncode = NS_FALSE0; |
| 241 | |
| 242 | Ns_DStringInitTcl_DStringInit(&ds); |
| 243 | |
| 244 | if (*htmlString != '\0') { |
| 245 | |
| 246 | for (;;) { |
| 247 | const char *possibleEntity = strchr(htmlString, '&'); |
| 248 | |
| 249 | if (possibleEntity == NULL((void*)0)) { |
| 250 | /* |
| 251 | * We are done. |
| 252 | */ |
| 253 | break; |
| 254 | |
| 255 | } else { |
| 256 | size_t length = 0u; |
| 257 | int prefixLength = (int)(possibleEntity - htmlString); |
| 258 | |
| 259 | /* |
| 260 | * Add the string leading to the ampersand to the output |
| 261 | * and proceed in the string by this amount of bytes. |
| 262 | */ |
| 263 | if (possibleEntity != htmlString) { |
| 264 | Ns_DStringNAppendTcl_DStringAppend(dsPtr, htmlString, prefixLength); |
| 265 | htmlString += prefixLength; |
| 266 | } |
| 267 | |
| 268 | if (WordEndsInSemi(possibleEntity, &length)) { |
| 269 | size_t decoded; |
| 270 | int oldLength = dsPtr->length; |
| 271 | |
| 272 | /* |
| 273 | * The appended characters are max 4 bytes; make sure, we |
| 274 | * have this space in the Tcl_DString. |
| 275 | */ |
| 276 | Tcl_DStringSetLength(dsPtr, oldLength + 4); |
| 277 | decoded = EntityDecode(possibleEntity + 1u, length, &needEncode, |
| 278 | dsPtr->string + oldLength); |
| 279 | Tcl_DStringSetLength(dsPtr, oldLength + (int)decoded); |
| 280 | |
| 281 | /* |
| 282 | * Include the boundary characters "&" and ";" in the |
| 283 | * length calculation. |
| 284 | */ |
| 285 | htmlString += (length + 2); |
| 286 | } else { |
| 287 | Ns_DStringNAppendTcl_DStringAppend(dsPtr, "&", 1); |
| 288 | htmlString ++; |
| 289 | } |
| 290 | } |
| 291 | } |
| 292 | |
| 293 | /* |
| 294 | * Append the last chunk |
| 295 | */ |
| 296 | Ns_DStringNAppendTcl_DStringAppend(dsPtr, htmlString, -1); |
| 297 | |
| 298 | } |
| 299 | |
| 300 | if (needEncode) { |
| 301 | Tcl_DString ds2; |
| 302 | |
| 303 | (void)Tcl_ExternalToUtfDString(Ns_GetCharsetEncoding("utf-8"), |
| 304 | dsPtr->string, dsPtr->length, &ds2); |
| 305 | Tcl_DStringResult(interp, &ds2); |
| 306 | Tcl_DStringFree(dsPtr); |
| 307 | |
| 308 | } else { |
| 309 | Tcl_DStringResult(interp, dsPtr); |
| 310 | } |
| 311 | } |
| 312 | |
| 313 | return result; |
| 314 | } |
| 315 | |
/*
 *----------------------------------------------------------------------
 *
 * ToUTF8 --
 *
 *      Convert a Unicode code point to UTF-8. The function writes from 0
 *      up to 4 bytes to the output; no terminating NUL is written.
 *
 * Results:
 *      Returns number of bytes written to the output. The value of 0
 *      means invalid input: a negative value, a value above 0x10FFFF,
 *      or a surrogate (0xD800..0xDFFF), which has no UTF-8 encoding.
 *
 * Side effects:
 *      None.
 *
 *----------------------------------------------------------------------
 */

static int
ToUTF8(long value, char *outPtr)
{
    int  length, i;
    long leadMarker;

    assert(outPtr != NULL);

    if (value < 0 || value > 0x10FFFF
        || (value >= 0xD800 && value <= 0xDFFF)) {
        return 0;
    }

    /*
     * The magnitude of the code point determines the length of the
     * sequence and the marker bits of its first byte.
     */
    if (value <= 0x7F) {
        length = 1;
        leadMarker = 0x00;
    } else if (value <= 0x7FF) {
        length = 2;
        leadMarker = 0xC0;
    } else if (value <= 0xFFFF) {
        length = 3;
        leadMarker = 0xE0;
    } else {
        length = 4;
        leadMarker = 0xF0;
    }

    /*
     * Fill the continuation bytes from the last one backwards; each one
     * carries the next 6 low-order bits. The bits remaining afterwards
     * belong into the first byte.
     */
    for (i = length - 1; i > 0; i--) {
        outPtr[i] = (char)((value & 0x3F) | 0x80);
        value >>= 6;
    }
    outPtr[0] = (char)(value | leadMarker);

    return length;
}
| 367 | |
| 368 | |
| 369 | /* |
| 370 | *---------------------------------------------------------------------- |
| 371 | * |
| 372 | * EntityDecode -- |
| 373 | * |
| 374 | * Decode an HTML/XML entity, which might be numeric (starting with a '#' |
| 375 | * sign) or non-numeric. |
| 376 | * |
| 377 | * Results: |
| 378 | * Number of bytes written to outPtr; 0 when the entity was not decoded. |
| 379 | * |
| 380 | * Side effects: |
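| 381 | * Sets *needEncodePtr for numeric entities with a value above 127; writes log entries for invalid, ignored, or unknown entities. |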
| 382 | * |
| 383 | *---------------------------------------------------------------------- |
| 384 | */ |
| 385 | |
/*
 * One row of the table of named entities.
 */
typedef struct namedEntity_t {
    const char *name;       /* entity name, without the enclosing '&' and ';' */
    size_t length;          /* number of bytes in name */
    const char *value;      /* byte sequence emitted for the entity */
    size_t outputLength;    /* number of bytes in value */
} namedEntity_t;
| 392 | |
/*
 * Named entities known to EntityDecode(). Every row holds the entity name,
 * the length of the name, the UTF-8 byte sequence to emit, and the length
 * of that sequence (at most 3 bytes in this table).
 *
 * The rows are ordered by the byte value of the first character of the
 * name. EntityDecode() relies on this order: it gives up the search at the
 * first row whose initial character is greater than the initial character
 * of the entity looked up. New rows have to be inserted accordingly. The
 * row with the NULL name terminates the table.
 */
static const namedEntity_t namedEntities[] = {
    {"AElig", 5, "\xc3\x86", 2}, /* "Æ" */
    {"Aacute", 6, "\xc3\x81", 2}, /* "Á" */
    {"Acirc", 5, "\xc3\x82", 2}, /* "Â" */
    {"Agrave", 6, "\xc3\x80", 2}, /* "À" */
    {"Alpha", 5, "\xce\x91", 2}, /* "Α" */
    {"Aring", 5, "\xc3\x85", 2}, /* "Å" */
    {"Atilde", 6, "\xc3\x83", 2}, /* "Ã" */
    {"Auml", 4, "\xc3\x84", 2}, /* "Ä" */
    {"Beta", 4, "\xce\x92", 2}, /* "Β" */
    {"Ccedil", 6, "\xc3\x87", 2}, /* "Ç" */
    {"Chi", 3, "\xce\xa7", 2}, /* "Χ" */
    {"Delta", 5, "\xce\x94", 2}, /* "Δ" */
    {"ETH", 3, "\xc3\x90", 2}, /* "Ð" */
    {"Eacute", 6, "\xc3\x89", 2}, /* "É" */
    {"Ecirc", 5, "\xc3\x8a", 2}, /* "Ê" */
    {"Egrave", 6, "\xc3\x88", 2}, /* "È" */
    {"Epsilon", 7, "\xce\x95", 2}, /* "Ε" */
    {"Eta", 3, "\xce\x97", 2}, /* "Η" */
    {"Euml", 4, "\xc3\x8b", 2}, /* "Ë" */
    {"Gamma", 5, "\xce\x93", 2}, /* "Γ" */
    {"Iacute", 6, "\xc3\x8d", 2}, /* "Í" */
    {"Icirc", 5, "\xc3\x8e", 2}, /* "Î" */
    {"Igrave", 6, "\xc3\x8c", 2}, /* "Ì" */
    {"Iota", 4, "\xce\x99", 2}, /* "Ι" */
    {"Iuml", 4, "\xc3\x8f", 2}, /* "Ï" */
    {"Kappa", 5, "\xce\x9a", 2}, /* "Κ" */
    {"Lambda", 6, "\xce\x9b", 2}, /* "Λ" */
    {"Mu", 2, "\xce\x9c", 2}, /* "Μ" */
    {"Ntilde", 6, "\xc3\x91", 2}, /* "Ñ" */
    {"Nu", 2, "\xce\x9d", 2}, /* "Ν" */
    {"Oacute", 6, "\xc3\x93", 2}, /* "Ó" */
    {"Ocirc", 5, "\xc3\x94", 2}, /* "Ô" */
    {"Ograve", 6, "\xc3\x92", 2}, /* "Ò" */
    {"Omega", 5, "\xce\xa9", 2}, /* "Ω" */
    {"Omicron", 7, "\xce\x9f", 2}, /* "Ο" */
    {"Oslash", 6, "\xc3\x98", 2}, /* "Ø" */
    {"Otilde", 6, "\xc3\x95", 2}, /* "Õ" */
    {"Ouml", 4, "\xc3\x96", 2}, /* "Ö" */
    {"Phi", 3, "\xce\xa6", 2}, /* "Φ" */
    {"Pi", 2, "\xce\xa0", 2}, /* "Π" */
    {"Prime", 5, "\xe2\x80\xb3", 3}, /* "″" */
    {"Psi", 3, "\xce\xa8", 2}, /* "Ψ" */
    {"Rho", 3, "\xce\xa1", 2}, /* "Ρ" */
    {"Sigma", 5, "\xce\xa3", 2}, /* "Σ" */
    {"THORN", 5, "\xc3\x9e", 2}, /* "Þ" */
    {"Tau", 3, "\xce\xa4", 2}, /* "Τ" */
    {"Theta", 5, "\xce\x98", 2}, /* "Θ" */
    {"Uacute", 6, "\xc3\x9a", 2}, /* "Ú" */
    {"Ucirc", 5, "\xc3\x9b", 2}, /* "Û" */
    {"Ugrave", 6, "\xc3\x99", 2}, /* "Ù" */
    {"Upsilon", 7, "\xce\xa5", 2}, /* "Υ" */
    {"Uuml", 4, "\xc3\x9c", 2}, /* "Ü" */
    {"Xi", 2, "\xce\x9e", 2}, /* "Ξ" */
    {"Yacute", 6, "\xc3\x9d", 2}, /* "Ý" */
    {"Zeta", 4, "\xce\x96", 2}, /* "Ζ" */
    {"aacute", 6, "\xc3\xa1", 2}, /* "á" */
    {"acirc", 5, "\xc3\xa2", 2}, /* "â" */
    {"acute", 5, "\xc2\xb4", 2}, /* "´" */
    {"aelig", 5, "\xc3\xa6", 2}, /* "æ" */
    {"agrave", 6, "\xc3\xa0", 2}, /* "à" */
    {"alefsym", 7, "\xe2\x84\xb5", 3}, /* "ℵ" */
    {"alpha", 5, "\xce\xb1", 2}, /* "α" */
    {"amp", 3, "\x26", 1}, /* "&" */
    {"and", 3, "\xe2\x88\xa7", 3}, /* "∧" */
    {"ang", 3, "\xe2\x88\xa0", 3}, /* "∠" */
    {"apos", 4, "\x27", 1}, /* "'" */
    {"aring", 5, "\xc3\xa5", 2}, /* "å" */
    {"asymp", 5, "\xe2\x89\x88", 3}, /* "≈" */
    {"atilde", 6, "\xc3\xa3", 2}, /* "ã" */
    {"auml", 4, "\xc3\xa4", 2}, /* "ä" */
    {"beta", 4, "\xce\xb2", 2}, /* "β" */
    {"brvbar", 6, "\xc2\xa6", 2}, /* "¦" */
    {"bull", 4, "\xe2\x80\xa2", 3}, /* "•" */
    {"cap", 3, "\xe2\x88\xa9", 3}, /* "∩" */
    {"ccedil", 6, "\xc3\xa7", 2}, /* "ç" */
    {"cedil", 5, "\xc2\xb8", 2}, /* "¸" */
    {"cent", 4, "\xc2\xa2", 2}, /* "¢" */
    {"chi", 3, "\xcf\x87", 2}, /* "χ" */
    {"clubs", 5, "\xe2\x99\xa3", 3}, /* "♣" */
    {"cong", 4, "\xe2\x89\x85", 3}, /* "≅" */
    {"copy", 4, "\xc2\xa9", 2}, /* "©" */
    {"crarr", 5, "\xe2\x86\xb5", 3}, /* "↵" */
    {"cup", 3, "\xe2\x88\xaa", 3}, /* "∪" */
    {"curren", 6, "\xc2\xa4", 2}, /* "¤" */
    {"dArr", 4, "\xe2\x87\x93", 3}, /* "⇓" */
    {"darr", 4, "\xe2\x86\x93", 3}, /* "↓" */
    {"deg", 3, "\xc2\xb0", 2}, /* "°" */
    {"delta", 5, "\xce\xb4", 2}, /* "δ" */
    {"diams", 5, "\xe2\x99\xa6", 3}, /* "♦" */
    {"divide", 6, "\xc3\xb7", 2}, /* "÷" */
    {"eacute", 6, "\xc3\xa9", 2}, /* "é" */
    {"ecirc", 5, "\xc3\xaa", 2}, /* "ê" */
    {"egrave", 6, "\xc3\xa8", 2}, /* "è" */
    {"empty", 5, "\xe2\x88\x85", 3}, /* "∅" */
    {"epsilon", 7, "\xce\xb5", 2}, /* "ε" */
    {"equiv", 5, "\xe2\x89\xa1", 3}, /* "≡" */
    {"eta", 3, "\xce\xb7", 2}, /* "η" */
    {"eth", 3, "\xc3\xb0", 2}, /* "ð" */
    {"euml", 4, "\xc3\xab", 2}, /* "ë" */
    {"euro", 4, "\xe2\x82\xac", 3}, /* "€" */
    {"exist", 5, "\xe2\x88\x83", 3}, /* "∃" */
    {"fnof", 4, "\xc6\x92", 2}, /* "ƒ" */
    {"forall", 6, "\xe2\x88\x80", 3}, /* "∀" */
    {"frac12", 6, "\xc2\xbd", 2}, /* "½" */
    {"frac14", 6, "\xc2\xbc", 2}, /* "¼" */
    {"frac34", 6, "\xc2\xbe", 2}, /* "¾" */
    {"frasl", 5, "\xe2\x81\x84", 3}, /* "⁄" */
    {"gamma", 5, "\xce\xb3", 2}, /* "γ" */
    {"ge", 2, "\xe2\x89\xa5", 3}, /* "≥" */
    {"gt", 2, "\x3e", 1}, /* ">" */
    {"hArr", 4, "\xe2\x87\x94", 3}, /* "⇔" */
    {"harr", 4, "\xe2\x86\x94", 3}, /* "↔" */
    {"hearts", 6, "\xe2\x99\xa5", 3}, /* "♥" */
    {"hellip", 6, "\xe2\x80\xa6", 3}, /* "…" */
    {"iacute", 6, "\xc3\xad", 2}, /* "í" */
    {"icirc", 5, "\xc3\xae", 2}, /* "î" */
    {"iexcl", 5, "\xc2\xa1", 2}, /* "¡" */
    {"igrave", 6, "\xc3\xac", 2}, /* "ì" */
    {"image", 5, "\xe2\x84\x91", 3}, /* "ℑ" */
    {"infin", 5, "\xe2\x88\x9e", 3}, /* "∞" */
    {"int", 3, "\xe2\x88\xab", 3}, /* "∫" */
    {"iota", 4, "\xce\xb9", 2}, /* "ι" */
    {"iquest", 6, "\xc2\xbf", 2}, /* "¿" */
    {"isin", 4, "\xe2\x88\x88", 3}, /* "∈" */
    {"iuml", 4, "\xc3\xaf", 2}, /* "ï" */
    {"kappa", 5, "\xce\xba", 2}, /* "κ" */
    {"lArr", 4, "\xe2\x87\x90", 3}, /* "⇐" */
    {"lambda", 6, "\xce\xbb", 2}, /* "λ" */
    {"lang", 4, "\xe3\x80\x88", 3}, /* "〈" */
    {"laquo", 5, "\xc2\xab", 2}, /* "«" */
    {"larr", 4, "\xe2\x86\x90", 3}, /* "←" */
    {"lceil", 5, "\xe2\x8c\x88", 3}, /* "⌈" */
    {"le", 2, "\xe2\x89\xa4", 3}, /* "≤" */
    {"lfloor", 6, "\xe2\x8c\x8a", 3}, /* "⌊" */
    {"lowast", 6, "\xe2\x88\x97", 3}, /* "∗" */
    {"loz", 3, "\xe2\x97\x8a", 3}, /* "◊" */
    {"lt", 2, "\x3c", 1}, /* "<" */
    {"macr", 4, "\xc2\xaf", 2}, /* "¯" */
    {"micro", 5, "\xc2\xb5", 2}, /* "µ" */
    {"middot", 6, "\xc2\xb7", 2}, /* "·" */
    {"minus", 5, "\xe2\x88\x92", 3}, /* "−" */
    {"mu", 2, "\xce\xbc", 2}, /* "μ" */
    {"nabla", 5, "\xe2\x88\x87", 3}, /* "∇" */
    {"nbsp", 4, "\x20", 1}, /* " " (emitted as a plain space, U+0020) */
    {"ne", 2, "\xe2\x89\xa0", 3}, /* "≠" */
    {"ni", 2, "\xe2\x88\x8b", 3}, /* "∋" */
    {"not", 3, "\xc2\xac", 2}, /* "¬" */
    {"notin", 5, "\xe2\x88\x89", 3}, /* "∉" */
    {"nsub", 4, "\xe2\x8a\x84", 3}, /* "⊄" */
    {"ntilde", 6, "\xc3\xb1", 2}, /* "ñ" */
    {"nu", 2, "\xce\xbd", 2}, /* "ν" */
    {"oacute", 6, "\xc3\xb3", 2}, /* "ó" */
    {"ocirc", 5, "\xc3\xb4", 2}, /* "ô" */
    {"ograve", 6, "\xc3\xb2", 2}, /* "ò" */
    {"oline", 5, "\xe2\x80\xbe", 3}, /* "‾" */
    {"omega", 5, "\xcf\x89", 2}, /* "ω" */
    {"omicron", 7, "\xce\xbf", 2}, /* "ο" */
    {"oplus", 5, "\xe2\x8a\x95", 3}, /* "⊕" */
    {"or", 2, "\xe2\x88\xa8", 3}, /* "∨" */
    {"ordf", 4, "\xc2\xaa", 2}, /* "ª" */
    {"ordm", 4, "\xc2\xba", 2}, /* "º" */
    {"oslash", 6, "\xc3\xb8", 2}, /* "ø" */
    {"otilde", 6, "\xc3\xb5", 2}, /* "õ" */
    {"otimes", 6, "\xe2\x8a\x97", 3}, /* "⊗" */
    {"ouml", 4, "\xc3\xb6", 2}, /* "ö" */
    {"para", 4, "\xc2\xb6", 2}, /* "¶" */
    {"part", 4, "\xe2\x88\x82", 3}, /* "∂" */
    {"perp", 4, "\xe2\x8a\xa5", 3}, /* "⊥" */
    {"phi", 3, "\xcf\x86", 2}, /* "φ" */
    {"pi", 2, "\xcf\x80", 2}, /* "π" */
    {"piv", 3, "\xcf\x96", 2}, /* "ϖ" */
    {"plusmn", 6, "\xc2\xb1", 2}, /* "±" */
    {"pound", 5, "\xc2\xa3", 2}, /* "£" */
    {"prime", 5, "\xe2\x80\xb2", 3}, /* "′" */
    {"prod", 4, "\xe2\x88\x8f", 3}, /* "∏" */
    {"prop", 4, "\xe2\x88\x9d", 3}, /* "∝" */
    {"psi", 3, "\xcf\x88", 2}, /* "ψ" */
    {"quot", 4, "\x22", 1}, /* "\"" */
    {"rArr", 4, "\xe2\x87\x92", 3}, /* "⇒" */
    {"radic", 5, "\xe2\x88\x9a", 3}, /* "√" */
    {"rang", 4, "\xe3\x80\x89", 3}, /* "〉" */
    {"raquo", 5, "\xc2\xbb", 2}, /* "»" */
    {"rarr", 4, "\xe2\x86\x92", 3}, /* "→" */
    {"rceil", 5, "\xe2\x8c\x89", 3}, /* "⌉" */
    {"real", 4, "\xe2\x84\x9c", 3}, /* "ℜ" */
    {"reg", 3, "\xc2\xae", 2}, /* "®" */
    {"rfloor", 6, "\xe2\x8c\x8b", 3}, /* "⌋" */
    {"rho", 3, "\xcf\x81", 2}, /* "ρ" */
    {"sdot", 4, "\xe2\x8b\x85", 3}, /* "⋅" */
    {"sect", 4, "\xc2\xa7", 2}, /* "§" */
    {"shy", 3, "\xc2\xad", 2}, /* soft hyphen, U+00AD (not visible) */
    {"sigma", 5, "\xcf\x83", 2}, /* "σ" */
    {"sigmaf", 6, "\xcf\x82", 2}, /* "ς" */
    {"sim", 3, "\xe2\x88\xbc", 3}, /* "∼" */
    {"spades", 6, "\xe2\x99\xa0", 3}, /* "♠" */
    {"sub", 3, "\xe2\x8a\x82", 3}, /* "⊂" */
    {"sube", 4, "\xe2\x8a\x86", 3}, /* "⊆" */
    {"sum", 3, "\xe2\x88\x91", 3}, /* "∑" */
    {"sup", 3, "\xe2\x8a\x83", 3}, /* "⊃" */
    {"sup1", 4, "\xc2\xb9", 2}, /* "¹" */
    {"sup2", 4, "\xc2\xb2", 2}, /* "²" */
    {"sup3", 4, "\xc2\xb3", 2}, /* "³" */
    {"supe", 4, "\xe2\x8a\x87", 3}, /* "⊇" */
    {"szlig", 5, "\xc3\x9f", 2}, /* "ß" */
    {"tau", 3, "\xcf\x84", 2}, /* "τ" */
    {"there4", 6, "\xe2\x88\xb4", 3}, /* "∴" */
    {"theta", 5, "\xce\xb8", 2}, /* "θ" */
    {"thetasym", 8, "\xcf\x91", 2}, /* "ϑ" */
    {"thorn", 5, "\xc3\xbe", 2}, /* "þ" */
    {"times", 5, "\xc3\x97", 2}, /* "×" */
    {"trade", 5, "\xe2\x84\xa2", 3}, /* "™" */
    {"uArr", 4, "\xe2\x87\x91", 3}, /* "⇑" */
    {"uacute", 6, "\xc3\xba", 2}, /* "ú" */
    {"uarr", 4, "\xe2\x86\x91", 3}, /* "↑" */
    {"ucirc", 5, "\xc3\xbb", 2}, /* "û" */
    {"ugrave", 6, "\xc3\xb9", 2}, /* "ù" */
    {"uml", 3, "\xc2\xa8", 2}, /* "¨" */
    {"upsih", 5, "\xcf\x92", 2}, /* "ϒ" */
    {"upsilon", 7, "\xcf\x85", 2}, /* "υ" */
    {"uuml", 4, "\xc3\xbc", 2}, /* "ü" */
    {"weierp", 6, "\xe2\x84\x98", 3}, /* "℘" */
    {"xi", 2, "\xce\xbe", 2}, /* "ξ" */
    {"yacute", 6, "\xc3\xbd", 2}, /* "ý" */
    {"yen", 3, "\xc2\xa5", 2}, /* "¥" */
    {"yuml", 4, "\xc3\xbf", 2}, /* "ÿ" */
    {"zeta", 4, "\xce\xb6", 2}, /* "ζ" */
    {NULL, 0, "", 0}
};
| 622 | |
| 623 | |
| 624 | static size_t |
| 625 | EntityDecode(const char *entity, size_t length, bool_Bool *needEncodePtr, char *outPtr) |
| 626 | { |
| 627 | size_t decoded = 0u; |
| 628 | |
| 629 | NS_NONNULL_ASSERT(entity != NULL)((void) (0)); |
| 630 | NS_NONNULL_ASSERT(outPtr != NULL)((void) (0)); |
| 631 | NS_NONNULL_ASSERT(needEncodePtr != NULL)((void) (0)); |
| 632 | |
| 633 | /* |
| 634 | * Handle numeric entities. |
| 635 | */ |
| 636 | if (*entity == '#') { |
| 637 | long value; |
| 638 | |
| 639 | if (CHARTYPE(digit, *(entity + 1))(((*__ctype_b_loc ())[(int) (((int)((unsigned char)(*(entity + 1)))))] & (unsigned short int) _ISdigit)) != 0) { |
| 640 | /* |
| 641 | * Decimal numeric entity. |
| 642 | */ |
| 643 | value = strtol(entity + 1, NULL((void*)0), 10); |
| 644 | |
| 645 | } else if (*(entity + 1) == 'x' && length >= 3 && length <= 8) { |
| 646 | /* |
| 647 | * Hexadecimal numeric entity. |
| 648 | */ |
| 649 | value = strtol(entity + 2, NULL((void*)0), 16); |
| 650 | |
| 651 | } else { |
| 652 | Ns_Log(Warning, "invalid numeric entity: '%s'", entity); |
| 653 | value = 0; |
| 654 | } |
| 655 | |
| 656 | if (value >= 32) { |
| 657 | int outLength; |
| 658 | |
| 659 | outLength = ToUTF8(value, outPtr); |
| 660 | decoded += (size_t)outLength; |
| 661 | |
| 662 | Ns_Log(Debug, "entity decode: code point %.2lx %.2lx " |
| 663 | "corresponds to %d UTF-8 characters", |
| 664 | ((value >> 8) & 0xff), (value & 0xff), outLength); |
| 665 | |
| 666 | if (value > 127) { |
| 667 | *needEncodePtr = NS_TRUE1; |
| 668 | } |
| 669 | } else { |
| 670 | /* |
| 671 | * ASCII device control characters should not be present in HTML. |
| 672 | */ |
| 673 | Ns_Log(Notice, "entity decode: ignore numeric entity with value %ld", value); |
| 674 | } |
| 675 | } else { |
| 676 | size_t i; |
| 677 | |
| 678 | for (i = 0; namedEntities[i].name != NULL((void*)0); i++) { |
| 679 | char firstChar = *namedEntities[i].name; |
| 680 | |
| 681 | if (firstChar == *entity |
| 682 | && length == namedEntities[i].length |
| 683 | && strncmp(entity, namedEntities[i].name, length) == 0) { |
| 684 | |
| 685 | /*if (strlen(entities[i].value) != entities[i].outputLength) { |
| 686 | fprintf(stderr, "--> name %s found l = %lu\n", |
| 687 | entities[i].name, strlen(entities[i].value)); |
| 688 | }*/ |
| 689 | if (namedEntities[i].outputLength > 1) { |
| 690 | |
| 691 | memcpy(outPtr, namedEntities[i].value, namedEntities[i].outputLength); |
| 692 | decoded += namedEntities[i].outputLength; |
| 693 | } else { |
| 694 | *outPtr = *namedEntities[i].value; |
| 695 | decoded++; |
| 696 | } |
| 697 | break; |
| 698 | } |
| 699 | |
| 700 | if (firstChar > *entity) { |
| 701 | Ns_Log(Warning, "ignore unknown named entity '%s'", entity); |
| 702 | break; |
| 703 | } |
| 704 | } |
| 705 | } |
| 706 | |
| 707 | return decoded; |
| 708 | } |
| 709 | |
| 710 | |
/*
 *----------------------------------------------------------------------
 *
 * WordEndsInSemi --
 *
 *      Determine whether the word at the start of the string is
 *      terminated by a semicolon. A leading '&' is skipped; the word
 *      then extends up to the first space, ';', '&' or the end of the
 *      string.
 *
 * Results:
 *      Returns true if the word ends with a semicolon. The length of
 *      the word (without the leading '&' and without the terminating
 *      character) is stored in *lengthPtr in every case.
 *
 * Side effects:
 *      Undefined behavior if the string is not NUL-terminated.
 *
 *----------------------------------------------------------------------
 */

static bool
WordEndsInSemi(const char *word, size_t *lengthPtr)
{
    const char *nameStart;
    size_t      nameLength;

    assert(word != NULL);

    /*
     * Step over the introducing '&'. A further '&' then ends the word,
     * such that in "&ben&jerry;" only "ben" is examined.
     */
    nameStart = (*word == '&') ? word + 1 : word;

    nameLength = strcspn(nameStart, " ;&");
    *lengthPtr = nameLength;

    return (nameStart[nameLength] == ';');
}
| 749 | |
| 750 | |
| 751 | |
| 752 | /* |
| 753 | *---------------------------------------------------------------------- |
| 754 | * |
| 755 | * NsTclStripHtmlObjCmd -- |
| 756 | * |
| 757 | * Implements "ns_striphtml". |
| 758 | * |
| 759 | * Results: |
| 760 | * Tcl result. |
| 761 | * |
| 762 | * Side effects: |
| 763 | * See docs. |
| 764 | * |
| 765 | *---------------------------------------------------------------------- |
| 766 | */ |
| 767 | |
| 768 | int |
| 769 | NsTclStripHtmlObjCmd(ClientData UNUSED(clientData)UNUSED_clientData __attribute__((__unused__)), Tcl_Interp *interp, int objc, Tcl_Obj *const* objv) |
| 770 | { |
| 771 | int result = TCL_OK0; |
| 772 | char *htmlString = (char *)NS_EMPTY_STRING; |
| 773 | Ns_ObjvSpec args[] = { |
| 774 | {"html", Ns_ObjvString, &htmlString, NULL((void*)0)}, |
| 775 | {NULL((void*)0), NULL((void*)0), NULL((void*)0), NULL((void*)0)} |
| 776 | }; |
| 777 | |
| 778 | if (Ns_ParseObjv(NULL((void*)0), args, interp, 1, objc, objv) != NS_OK) { |
| 779 | result = TCL_ERROR1; |
| 780 | |
| 781 | } else { |
| 782 | bool_Bool intag; /* flag to see if are we inside a tag */ |
| 783 | bool_Bool incomment; /* flag to see if we are inside a comment */ |
| 784 | char *inString; /* copy of input string */ |
| 785 | char *outPtr; /* moving pointer to output string */ |
| 786 | const char *inPtr; /* moving pointer to input string */ |
| 787 | bool_Bool needEncode; |
| 788 | |
| 789 | /* |
| 790 | * Make a copy of the input and point the moving and output ptrs to it. |
| 791 | */ |
| 792 | inString = ns_strdup(htmlString); |
| 793 | inPtr = inString; |
| 794 | outPtr = inString; |
| 795 | intag = NS_FALSE0; |
| 796 | incomment = NS_FALSE0; |
| 797 | needEncode = NS_FALSE0; |
| 798 | |
| 799 | while (*inPtr != '\0') { |
| 800 | |
| 801 | Ns_Log(Debug, "inptr %c intag %d incomment %d string <%s>", |
| 802 | *inPtr, intag, incomment, inPtr); |
| 803 | |
| 804 | if (*inPtr == '<') { |
| 805 | intag = NS_TRUE1; |
| 806 | if ((*(inPtr + 1) == '!') |
| 807 | && (*(inPtr + 2) == '-') |
| 808 | && (*(inPtr + 3) == '-')) { |
| 809 | incomment = NS_TRUE1; |
| 810 | } |
| 811 | } else if (incomment) { |
| 812 | if ((*(inPtr) == '-') |
| 813 | && (*(inPtr + 1) == '-') |
| 814 | && (*(inPtr + 2) == '>')) { |
| 815 | incomment = NS_FALSE0; |
| 816 | } |
| 817 | } else if (intag && (*inPtr == '>')) { |
| 818 | /* |
| 819 | * Closing a tag. |
| 820 | */ |
| 821 | intag = NS_FALSE0; |
| 822 | |
| 823 | } else if (!intag) { |
| 824 | /* |
| 825 | * Regular text |
| 826 | */ |
| 827 | |
| 828 | if (*inPtr == '&') { |
| 829 | size_t length = 0u; |
| 830 | |
| 831 | /* |
| 832 | * Starting an entity. |
| 833 | */ |
| 834 | if (WordEndsInSemi(inPtr, &length)) { |
| 835 | size_t decoded = EntityDecode(inPtr + 1u, length, &needEncode, outPtr); |
| 836 | |
| 837 | inPtr += (length + 1u); |
| 838 | outPtr += decoded; |
| 839 | } |
| 840 | Ns_Log(Debug, "...... after entity inptr '%c' intag %d incomment %d string <%s> needEncode %d", |
| 841 | *inPtr, intag, incomment, inPtr, needEncode); |
| 842 | } else { |
| 843 | /* |
| 844 | * Plain Text output |
| 845 | */ |
| 846 | *outPtr++ = *inPtr; |
| 847 | } |
| 848 | |
| 849 | } else { |
| 850 | /* |
| 851 | * Must be intag |
| 852 | */ |
| 853 | } |
| 854 | ++inPtr; |
| 855 | } |
| 856 | |
| 857 | /* |
| 858 | * Terminate output string. |
| 859 | */ |
| 860 | *outPtr = '\0'; |
| 861 | |
| 862 | if (needEncode) { |
| 863 | Tcl_DString ds; |
| 864 | |
| 865 | (void)Tcl_ExternalToUtfDString(Ns_GetCharsetEncoding("utf-8"), |
| 866 | inString, (int)strlen(inString), &ds); |
| 867 | Tcl_DStringResult(interp, &ds); |
| 868 | } else { |
| 869 | Tcl_SetObjResult(interp, Tcl_NewStringObj(inString, -1)); |
| 870 | } |
| 871 | ns_free(inString); |
| 872 | } |
| 873 | return result; |
| 874 | } |
| 875 | |
| 876 | |
| 877 | /* |
| 878 | * Local Variables: |
| 879 | * mode: c |
| 880 | * c-basic-offset: 4 |
| 881 | * fill-column: 78 |
| 882 | * indent-tabs-mode: nil |
| 883 | * End: |
| 884 | */ |