Improve the efficiency of xml_convert. The old version was wasteful of space;

it started by allocating six times the size of the input string because in the worst case each byte can take up to six bytes when escaped (&#xXX;). It was wasteful of time because it built the string up with strncpy, which pads the remainder of the destination buffer with null bytes every time it is called. This led to quadratic time complexity, not linear as expected. The new version uses the usual strategy of doubling the size of the buffer whenever it runs out of space. It builds up the string using memcpy, checking each time that there is space for the new copy.
2025-12-15 12:19:02 +00:00 · 2009-02-08 05:28:38 +00:00
parent 590e7b7574
commit aaaed61c52
1 changed files with 54 additions and 57 deletions
--- a/output.cc
+++ b/output.cc
@@ -887,66 +887,63 @@ log_write(LOG_PLAIN, "%d service%s unrecognized despite returning data. If you k
 }


-/* Note that this escapes newlines, which is generally needed in
-   attributes to avoid parser normalization, but might not be needed
-   or desirable in XML content outside of attributes.  So if we find
-   some cases where we don't want \r\n\t escaped, we'll have to add a
-   parameter to control this. */
-char* xml_convert (const char* str) {
-  char *temp, ch=0, prevch = 0, *p;
-  int strl = strlen(str);
-  temp = (char *) safe_malloc(strl*6+1);
-  char *end = temp + strl * 6 + 1;
-  for (p = temp;(prevch = ch, ch = *str);str++) {
-    const char *a;
-    if ((unsigned char) ch > 0x7F) {
-      /* Escape anything outside of ASCII--we have to emit UTF-8 and an easy
-         way to do that is to emit ASCII. */
-      char buf[32];
-      Snprintf(buf, sizeof(buf), "&#x%02X;", (unsigned char) ch);
-      a = buf;
+/* Escape a string for inclusion in XML. This gets <>&, "' for attribute values,
+   -- for inside comments, and characters with value > 0x7F. It also gets
+   control characters with value < 0x20 to avoid parser normalization of \r\n\t
+   in attribute values. If this is not desired in some cases, we'll have to add
+   a parameter to control this. */
+char *xml_convert(const char *str) {
+  /* result is the result buffer; n + 1 is the allocated size. Double the
+     allocation when space runs out. */
+  char *result = NULL;
+  size_t n = 0, len;
+  const char *p;
+  int i;
+
+  i = 0;
+  for (p = str; *p != '\0'; p++) {
+    const char *repl;
+    char buf[32];
+
+    if (*p == '<')
+      repl = "&lt;";
+    else if (*p == '>')
+      repl = "&gt;";
+    else if (*p == '&')
+      repl = "&amp;";
+    else if (*p == '"')
+      repl = "&quot;";
+    else if (*p == '\'')
+      repl = "&apos;";
+    else if (*p == '-' && p > str && *(p - 1) == '-') {
+      /* Escape -- for comments. */
+      repl = "&#45;";
+    } else if (*p < 0x20 || (unsigned char) *p > 0x7F) {
+      /* Escape control characters and anything outside of ASCII. We have to
+         emit UTF-8 and an easy way to do that is to emit ASCII. */
+      Snprintf(buf, sizeof(buf), "&#x%x;", (unsigned char) *p);
+      repl = buf;
    } else {
-      switch (ch) {
-      case '\t':
-        a = "&#x9;";
-        break;
-      case '\r':
-        a = "&#xd;";
-        break;
-      case '\n':
-        a = "&#xa;";
-        break;
-      case '<':
-        a = "&lt;";
-        break;
-      case '>':
-        a = "&gt;";
-        break;
-      case '&':
-        a =  "&amp;";
-        break;
-      case '"':
-        a = "&quot;";
-        break;
-      case '\'':
-        a = "&apos;";
-        break;
-      case '-': 
-        if (prevch == '-') { /* Must escape -- for comments */
-          a =  "&#45;";
-          break;
-        }
-      default:
-        *p++ = ch;
-        continue;
-      }
+      /* Unescaped character. */
+      buf[0] = *p;
+      buf[1] = '\0';
+      repl = buf;
    }
-    assert(end - p > 1);
-    Strncpy(p,a, end - p - 1); p += strlen(a); // SAFE
+
+    len = strlen(repl);
+    /* Double the size of the result buffer if necessary. */
+    if (i + len > n) {
+      n = (i + len) * 2;
+      result = (char *) safe_realloc(result, n + 1);
+    }
+    memcpy(result + i, repl, len);
+    i += len;
  }
-  *p = 0;
-  temp = (char *) safe_realloc(temp,strlen(temp)+1);
-  return temp;
+  /* Trim to length. (Also does initial allocation when str is empty.) */
+  result = (char *) safe_realloc(result, i + 1);
+  result[i] = '\0';
+
+  return result;
 }

 char *logfilename(const char *str, struct tm *tm)