Parsing improvements for UDP payloads

Save some memory and effort when parsing UDP payloads by reusing the rather large buffer inside each token when possible, and only using std::string::append() when necessary. For the current file, this avoids *all* reallocations.
2026-01-24 15:19:03 +00:00 · 2021-12-12 23:42:39 +00:00
parent 2cbc7712da
commit 85c1fd9b18
2 changed files with 161 additions and 88 deletions
--- a/2
+++ b/2
@@ -29,6 +29,8 @@
 # which case they will be all be sent concurrently. There is a limit
 # of 255 payloads per port.
 #
+# Lines longer than 1024 characters will be ignored.
+#
 # Example:
 # udp 1234 "payloaddatapayloaddata"
 #   "payloaddatapayloaddata"
--- a/payload.cc
+++ b/payload.cc
@@ -82,7 +82,17 @@ extern NmapOps o;

 struct payload {
  std::string data;
+
+  payload (const char *c, size_t n)
+    : data(c, n)
+    {}
  /* Extra data such as source port goes here. */
+
+  /* If 2 payloads are equivalent according to this operator, we'll only keep
+   * the first one, so be sure you update it when adding other attributes. */
+  bool operator==(const payload& other) const {
+    return data == other.data;
+  }
 };

 /* The key for the payload lookup map is a (proto, port) pair. */
@@ -108,16 +118,18 @@ static std::vector<struct payload *> uniquePayloads; // for accounting

 /* Newlines are significant because keyword directives (like "source") that
   follow the payload string are significant to the end of the line. */
-enum token_type {
+typedef enum token_type {
+  TOKEN_ERROR = -1,
  TOKEN_EOF = 0,
  TOKEN_NEWLINE,
  TOKEN_SYMBOL,
  TOKEN_STRING,
-};
+} token_t;

 struct token {
-  char text[1024];
+  token_t type;
  size_t len;
+  char text[1024];
 };

 static unsigned long line_no;
@@ -126,7 +138,7 @@ static unsigned long line_no;
   error. The token type is also stored in token->type. For TOKEN_SYMBOL and
   TOKEN_STRING, the text is stored in token->text and token->len. The text is
   null terminated. */
-static int next_token(FILE *fp, struct token *token) {
+static token_t next_token(FILE *fp, struct token *token) {
  unsigned int i, tmplen;
  int c;

@@ -136,79 +148,86 @@ static int next_token(FILE *fp, struct token *token) {
  while (isspace(c = fgetc(fp)) && c != '\n')
    ;

-  if (c == EOF) {
-    return TOKEN_EOF;
-  } else if (c == '\n') {
-    line_no++;
-    return TOKEN_NEWLINE;
-  } else if (c == '#') {
-    while ((c = fgetc(fp)) != EOF && c != '\n')
-      ;
-    if (c == EOF) {
-      return TOKEN_EOF;
-    } else {
+  switch(c) {
+    case EOF:
+      token->type = TOKEN_EOF;
+      break;
+    case '\n':
      line_no++;
-      return TOKEN_NEWLINE;
-    }
-  } else if (c == '"') {
-    i = 0;
-    while ((c = fgetc(fp)) != EOF && c != '\n' && c != '"') {
-      if (i + 1 >= sizeof(token->text))
-        return -1;
-      if (c == '\\') {
-        token->text[i++] = '\\';
-        if (i + 1 >= sizeof(token->text))
-          return -1;
-        c = fgetc(fp);
-        if (c == EOF)
-          return -1;
+      token->type = TOKEN_NEWLINE;
+      break;
+    case '#':
+      while ((c = fgetc(fp)) != EOF && c != '\n')
+        ;
+      if (c == EOF) {
+        token->type = TOKEN_EOF;
+      } else {
+        line_no++;
+        token->type = TOKEN_NEWLINE;
      }
+      break;
+    case '"':
+      token->type = TOKEN_STRING;
+      i = 0;
+      while ((c = fgetc(fp)) != EOF && c != '\n' && c != '"') {
+        if (i + 1 >= sizeof(token->text))
+          return TOKEN_ERROR;
+        if (c == '\\') {
+          token->text[i++] = '\\';
+          if (i + 1 >= sizeof(token->text))
+            return TOKEN_ERROR;
+          c = fgetc(fp);
+          if (c == EOF)
+            return TOKEN_ERROR;
+        }
+        token->text[i++] = c;
+      }
+      if (c != '"')
+        return TOKEN_ERROR;
+      token->text[i] = '\0';
+      if (cstring_unescape(token->text, &tmplen) == NULL)
+        return TOKEN_ERROR;
+      token->len = tmplen;
+      break;
+    default:
+      token->type = TOKEN_SYMBOL;
+      i = 0;
      token->text[i++] = c;
-    }
-    if (c != '"')
-      return -1;
-    token->text[i] = '\0';
-    if (cstring_unescape(token->text, &tmplen) == NULL)
-      return -1;
-    token->len = tmplen;
-    return TOKEN_STRING;
-  } else {
-    i = 0;
-    token->text[i++] = c;
-    while ((c = fgetc(fp)) != EOF && (isalnum(c) || c == ',' || c == '-')) {
-      if (i + 1 >= sizeof(token->text))
-        return -1;
-      token->text[i++] = c;
-    }
-    ungetc(c, fp);
-    token->text[i] = '\0';
-    token->len = i;
-    return TOKEN_SYMBOL;
+      while ((c = fgetc(fp)) != EOF && (isalnum(c) || c == ',' || c == '-')) {
+        if (i + 1 >= sizeof(token->text))
+          return TOKEN_ERROR;
+        token->text[i++] = c;
+      }
+      ungetc(c, fp);
+      token->text[i] = '\0';
+      token->len = i;
+      break;
  }

-  return -1;
+  return token->type;
 }

 /* Loop over fp, reading tokens and adding payloads to the global payloads map
   as they are completed. Returns -1 on error. */
 static int load_payloads_from_file(FILE *fp) {
  struct token token;
-  int type;
+  unsigned long firstline = 0;

  line_no = 1;
-  type = next_token(fp, &token);
+  token_t type = next_token(fp, &token);
  for (;;) {
    unsigned short *ports;
    int count;
+    bool duplicate = false;

-    while (type == TOKEN_NEWLINE)
+    /* Skip everything (unknown keywords from previous payload, unknown file
+     * keywords, etc.) until the next payload entry or EOF */
+    while (type != TOKEN_EOF && !(type == TOKEN_SYMBOL && strcmp(token.text, "udp") == 0))
      type = next_token(fp, &token);
    if (type == TOKEN_EOF)
      break;
-    if (type != TOKEN_SYMBOL || strcmp(token.text, "udp") != 0) {
-      fprintf(stderr, "Expected \"udp\" at line %lu of %s.\n", line_no, PAYLOAD_FILENAME);
-      return -1;
-    }
+
+    firstline = line_no;

    type = next_token(fp, &token);
    if (type != TOKEN_SYMBOL) {
@@ -221,46 +240,98 @@ static int load_payloads_from_file(FILE *fp) {
      return -1;
    }

-    struct payload *portPayload = new struct payload;
-    uniquePayloads.push_back(portPayload);
-    for (;;) {
-      type = next_token(fp, &token);
-      if (type == TOKEN_STRING)
-        portPayload->data.append(token.text, token.len);
-      else if (type == TOKEN_NEWLINE)
-        ; /* Nothing. */
-      else
-        break;
+    while(TOKEN_NEWLINE == (type = next_token(fp, &token)))
+      ; // skip newlines
+
+    if (type != TOKEN_STRING) {
+      log_write(LOG_STDERR, "Payload missing data at line %lu of %s.\n", line_no, PAYLOAD_FILENAME);
+      // Try a new payload
+      free(ports);
+      continue;
    }

-    /* Ignore keywords like "source" to the end of the line. */
-    if (type == TOKEN_SYMBOL && strcmp(token.text, "udp") != 0) {
-      while (type != -1 && type != TOKEN_EOF && type != TOKEN_NEWLINE)
-        type = next_token(fp, &token);
+    struct payload *portPayload = NULL;
+    // Peek at the next significant token
+    struct token peek_token;
+    while (TOKEN_NEWLINE == (type = next_token(fp, &peek_token)))
+      ; // skip newlines
+
+    // If it's a string continuation, see if we can squeeze it into the current token.
+    while (type == TOKEN_STRING) {
+      if (token.len + peek_token.len < sizeof(token.text)) {
+        // Next string fits in this one's buffer!
+        memcpy(token.text + token.len, peek_token.text, peek_token.len);
+        token.len += peek_token.len;
+      }
+      else {
+        // Token is full
+        if (portPayload == NULL) {
+          // Allocate new payload
+          portPayload = new struct payload (token.text, token.len);
+        }
+        else {
+          // append token to current payload
+          portPayload->data.append(token.text, token.len);
+        }
+        // peek_token becomes the previous token
+        token = peek_token;
+      }
+      // Keep peeking forward
+      while (TOKEN_NEWLINE == (type = next_token(fp, &peek_token)))
+        ; // skip newlines
+    }
+
+    // If the string is still going, but we got an error, abandon this payload.
+    if (type == TOKEN_ERROR && peek_token.type == TOKEN_STRING) {
+      log_write(LOG_STDERR, "Error parsing payload data at line %lu of %s.\n", line_no, PAYLOAD_FILENAME);
+      if (portPayload)
+        delete portPayload;
+      // maybe we can pick up at the next payload.
+      type = next_token(fp, &token);
+      free(ports);
+      continue;
+    }
+
+    // Otherwise, stash the last token in the payload and move on.
+    if (portPayload == NULL) {
+      // Allocate new payload
+      portPayload = new struct payload (token.text, token.len);
+    }
+    else {
+      // append token to current payload
+      portPayload->data.append(token.text, token.len);
+    }
+    token = peek_token;
+
+    // Here we would parse additional keywords like "source" that we might care about.
+
+    // Make sure these payloads are actually unique!
+    for (std::vector<struct payload *>::const_iterator it = uniquePayloads.begin();
+        it != uniquePayloads.end(); ++it) {
+      if (**it == *portPayload) {
+        // Probably not what they intended.
+        log_write(LOG_STDERR, "Duplicate payload on line %lu of %s.\n", firstline, PAYLOAD_FILENAME);
+        // Since they're functionally equivalent, only keep one copy.
+        duplicate = true;
+        delete portPayload;
+        portPayload = *it;
+        break;
+      }
+    }
+    if (!duplicate) {
+      uniquePayloads.push_back(portPayload);
+      duplicate = false;
    }

    for (int p = 0; p < count; p++) {
-      std::vector<struct payload *>::const_iterator portPayloadVectorIterator;
      const struct proto_dport key(IPPROTO_UDP, ports[p]);
-      bool duplicate = false;

      std::vector<struct payload *> &portPayloadVector = portPayloads[key];

-      for (portPayloadVectorIterator = portPayloadVector.begin();
-          portPayloadVectorIterator != portPayloadVector.end();
-          portPayloadVectorIterator++) {
-          if (*portPayloadVectorIterator == portPayload) {
-            log_write(LOG_STDERR, "UDP port payload duplication found on port: %u\n", ports[p]);
-            duplicate = true;
-            break;
-          }
-      }
-
-      if (!duplicate) {
-        portPayloadVector.push_back(portPayload);
-        if (portPayloadVector.size() > MAX_PAYLOADS_PER_PORT) {
-          fatal("Number of UDP payloads for port %u exceeds the limit of %u.\n", ports[p], MAX_PAYLOADS_PER_PORT);
-        }
+      // Ports are unique, and we ensured payloads are unique earlier, so no chance of duplicate here.
+      portPayloadVector.push_back(portPayload);
+      if (portPayloadVector.size() > MAX_PAYLOADS_PER_PORT) {
+        fatal("Number of UDP payloads for port %u exceeds the limit of %u.\n", ports[p], MAX_PAYLOADS_PER_PORT);
      }
    }