1
0
mirror of https://github.com/nmap/nmap.git synced 2025-12-09 14:11:29 +00:00

Upgrading libpcre from 7.4 to 7.6. Tested on Linux and Windows XP.

This commit is contained in:
kris
2008-02-16 20:55:50 +00:00
parent a02bf67a70
commit a3a78c535b
39 changed files with 1567 additions and 609 deletions

View File

@@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2007 University of Cambridge
Copyright (c) 1997-2008 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -48,6 +48,7 @@ supporting internal functions that are not used by other modules. */
#include "config.h"
#endif
#define NLBLOCK cd /* Block containing newline information */
#define PSSTART start_pattern /* Field containing processed string start */
#define PSEND end_pattern /* Field containing processed string end */
@@ -243,7 +244,7 @@ static const char error_texts[] =
/* 10 */
"operand of unlimited repeat could match the empty string\0" /** DEAD **/
"internal error: unexpected repeat\0"
"unrecognized character after (?\0"
"unrecognized character after (? or (?-\0"
"POSIX named classes are supported only within a class\0"
"missing )\0"
/* 15 */
@@ -302,7 +303,9 @@ static const char error_texts[] =
"(*VERB) with an argument is not supported\0"
/* 60 */
"(*VERB) not recognized\0"
"number is too big";
"number is too big\0"
"subpattern name expected\0"
"digit expected after (?+";
/* Table to identify digits and hex digits. This is used when compiling
@@ -498,16 +501,16 @@ ptr--; /* Set pointer back to the last byte */
if (c == 0) *errorcodeptr = ERR1;
/* Non-alphamerics are literals. For digits or letters, do an initial lookup in
a table. A non-zero result is something that can be returned immediately.
/* Non-alphanumerics are literals. For digits or letters, do an initial lookup
in a table. A non-zero result is something that can be returned immediately.
Otherwise further processing may be required. */
#ifndef EBCDIC /* ASCII coding */
else if (c < '0' || c > 'z') {} /* Not alphameric */
else if (c < '0' || c > 'z') {} /* Not alphanumeric */
else if ((i = escapes[c - '0']) != 0) c = i;
#else /* EBCDIC coding */
else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
else if ((i = escapes[c - 0x48]) != 0) c = i;
#endif
@@ -724,10 +727,10 @@ else
break;
/* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
for Perl compatibility, it is a literal. This code looks a bit odd, but
there used to be some cases other than the default, and there may be again
in future, so I haven't "optimized" it. */
other alphanumeric following \ is an error if PCRE_EXTRA was set;
otherwise, for Perl compatibility, it is a literal. This code looks a bit
odd, but there used to be some cases other than the default, and there may
be again in future, so I haven't "optimized" it. */
default:
if ((options & PCRE_EXTRA) != 0) switch(c)
@@ -1508,8 +1511,9 @@ for (;;)
can match the empty string or not. It is called from could_be_empty()
below and from compile_branch() when checking for an unlimited repeat of a
group that can match nothing. Note that first_significant_code() skips over
assertions. If we hit an unclosed bracket, we return "empty" - this means we've
struck an inner bracket whose current branch will already have been scanned.
backward and negative forward assertions when its final argument is TRUE. If we
hit an unclosed bracket, we return "empty" - this means we've struck an inner
bracket whose current branch will already have been scanned.
Arguments:
code points to start of search
@@ -1531,6 +1535,16 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE
c = *code;
/* Skip over forward assertions; the other assertions are skipped by
first_significant_code() with a TRUE final argument. */
if (c == OP_ASSERT)
{
do code += GET(code, 1); while (*code == OP_ALT);
c = *code;
continue;
}
/* Groups with zero repeats can of course be empty; skip them. */
if (c == OP_BRAZERO || c == OP_BRAMINZERO)
@@ -1726,29 +1740,48 @@ return TRUE;
*************************************************/
/* This function is called when the sequence "[:" or "[." or "[=" is
encountered in a character class. It checks whether this is followed by an
optional ^ and then a sequence of letters, terminated by a matching ":]" or
".]" or "=]".
encountered in a character class. It checks whether this is followed by a
sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
reach an unescaped ']' without the special preceding character, return FALSE.
Argument:
Originally, this function only recognized a sequence of letters between the
terminators, but it seems that Perl recognizes any sequence of characters,
though of course unknown POSIX names are subsequently rejected. Perl gives an
"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
didn't consider this to be a POSIX class. Likewise for [:1234:].
The problem in trying to be exactly like Perl is in the handling of escapes. We
have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
below handles the special case of \], but does not try to do any other escape
processing. This makes it different from Perl for cases such as [:l\ower:]
where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
"l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
I think.
Arguments:
ptr pointer to the initial [
endptr where to return the end pointer
cd pointer to compile data
Returns: TRUE or FALSE
*/
static BOOL
check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
check_posix_syntax(const uschar *ptr, const uschar **endptr)
{
int terminator; /* Don't combine these lines; the Solaris cc */
terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
if (*(++ptr) == '^') ptr++;
while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
if (*ptr == terminator && ptr[1] == ']')
for (++ptr; *ptr != 0; ptr++)
{
*endptr = ptr;
return TRUE;
if (*ptr == '\\' && ptr[1] == ']') ptr++; else
{
if (*ptr == ']') return FALSE;
if (*ptr == terminator && ptr[1] == ']')
{
*endptr = ptr;
return TRUE;
}
}
}
return FALSE;
}
@@ -2346,6 +2379,7 @@ uschar classbits[32];
BOOL class_utf8;
BOOL utf8 = (options & PCRE_UTF8) != 0;
uschar *class_utf8data;
uschar *class_utf8data_base;
uschar utf8_char[6];
#else
BOOL utf8 = FALSE;
@@ -2385,6 +2419,7 @@ req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
for (;; ptr++)
{
BOOL negate_class;
BOOL should_flip_negation;
BOOL possessive_quantifier;
BOOL is_quantifier;
BOOL is_recurse;
@@ -2608,7 +2643,7 @@ for (;; ptr++)
they are encountered at the top level, so we'll do that too. */
if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
check_posix_syntax(ptr, &tempptr, cd))
check_posix_syntax(ptr, &tempptr))
{
*errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
goto FAILED;
@@ -2633,6 +2668,12 @@ for (;; ptr++)
else break;
}
/* If a class contains a negative special such as \S, we need to flip the
negation flag at the end, so that support for characters > 255 works
correctly (they are all included in the class). */
should_flip_negation = FALSE;
/* Keep a count of chars with values < 256 so that we can optimize the case
of just a single character (as long as it's < 256). However, for higher
valued UTF-8 characters, we don't yet do any optimization. */
@@ -2650,6 +2691,7 @@ for (;; ptr++)
#ifdef SUPPORT_UTF8
class_utf8 = FALSE; /* No chars >= 256 */
class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
#endif
/* Process characters until ] is reached. By writing this as a "do" it
@@ -2665,6 +2707,18 @@ for (;; ptr++)
{ /* Braces are required because the */
GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
}
/* In the pre-compile phase, accumulate the length of any UTF-8 extra
data and reset the pointer. This is so that very large classes that
contain a zillion UTF-8 characters no longer overwrite the work space
(which is on the stack). */
if (lengthptr != NULL)
{
*lengthptr += class_utf8data - class_utf8data_base;
class_utf8data = class_utf8data_base;
}
#endif
/* Inside \Q...\E everything is literal except \E */
@@ -2688,7 +2742,7 @@ for (;; ptr++)
if (c == '[' &&
(ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
check_posix_syntax(ptr, &tempptr, cd))
check_posix_syntax(ptr, &tempptr))
{
BOOL local_negate = FALSE;
int posix_class, taboffset, tabopt;
@@ -2705,6 +2759,7 @@ for (;; ptr++)
if (*ptr == '^')
{
local_negate = TRUE;
should_flip_negation = TRUE; /* Note negative special */
ptr++;
}
@@ -2779,7 +2834,7 @@ for (;; ptr++)
c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
if (*errorcodeptr != 0) goto FAILED;
if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
else if (-c == ESC_Q) /* Handle start of quoted string */
@@ -2807,6 +2862,7 @@ for (;; ptr++)
continue;
case ESC_D:
should_flip_negation = TRUE;
for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
continue;
@@ -2815,6 +2871,7 @@ for (;; ptr++)
continue;
case ESC_W:
should_flip_negation = TRUE;
for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
continue;
@@ -2824,13 +2881,11 @@ for (;; ptr++)
continue;
case ESC_S:
should_flip_negation = TRUE;
for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
continue;
case ESC_E: /* Perl ignores an orphan \E */
continue;
default: /* Not recognized; fall through */
break; /* Need "default" setting to stop compiler warning. */
}
@@ -3065,7 +3120,7 @@ for (;; ptr++)
d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
if (*errorcodeptr != 0) goto FAILED;
/* \b is backslash; \X is literal X; \R is literal R; any other
/* \b is backspace; \X is literal X; \R is literal R; any other
special means the '-' was literal */
if (d < 0)
@@ -3329,11 +3384,14 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
zeroreqbyte = reqbyte;
/* If there are characters with values > 255, we have to compile an
extended class, with its own opcode. If there are no characters < 256,
we can omit the bitmap in the actual compiled code. */
extended class, with its own opcode, unless there was a negated special
such as \S in the class, because in that case all characters > 255 are in
the class, so any that were explicitly given as well can be ignored. If
(when there are explicit characters > 255 that must be listed) there are no
characters < 256, we can omit the bitmap in the actual compiled code. */
#ifdef SUPPORT_UTF8
if (class_utf8)
if (class_utf8 && !should_flip_negation)
{
*class_utf8data++ = XCL_END; /* Marks the end of extra data */
*code++ = OP_XCLASS;
@@ -3359,20 +3417,19 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
}
#endif
/* If there are no characters > 255, negate the 32-byte map if necessary,
and copy it into the code vector. If this is the first thing in the branch,
there can be no first char setting, whatever the repeat count. Any reqbyte
setting must remain unchanged after any kind of repeat. */
/* If there are no characters > 255, set the opcode to OP_CLASS or
OP_NCLASS, depending on whether the whole class was negated and whether
there were negative specials such as \S in the class. Then copy the 32-byte
map into the code vector, negating it if necessary. */
*code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
if (negate_class)
{
*code++ = OP_NCLASS;
if (lengthptr == NULL) /* Save time in the pre-compile phase */
for (c = 0; c < 32; c++) code[c] = ~classbits[c];
}
else
{
*code++ = OP_CLASS;
memcpy(code, classbits, 32);
}
code += 32;
@@ -4008,7 +4065,9 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
int len;
if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
*tempcode == OP_NOTEXACT)
tempcode += _pcre_OP_lengths[*tempcode];
tempcode += _pcre_OP_lengths[*tempcode] +
((*tempcode == OP_TYPEEXACT &&
(tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
len = code - tempcode;
if (len > 0) switch (*tempcode)
{
@@ -4235,16 +4294,13 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
*errorcodeptr = ERR58;
goto FAILED;
}
if (refsign == '-')
recno = (refsign == '-')?
cd->bracount - recno + 1 : recno +cd->bracount;
if (recno <= 0 || recno > cd->final_bracount)
{
recno = cd->bracount - recno + 1;
if (recno <= 0)
{
*errorcodeptr = ERR15;
goto FAILED;
}
*errorcodeptr = ERR15;
goto FAILED;
}
else recno += cd->bracount;
PUT2(code, 2+LINK_SIZE, recno);
break;
}
@@ -4316,9 +4372,10 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
skipbytes = 1;
}
/* Check for the "name" actually being a subpattern number. */
/* Check for the "name" actually being a subpattern number. We are
in the second pass here, so final_bracount is set. */
else if (recno > 0)
else if (recno > 0 && recno <= cd->final_bracount)
{
PUT2(code, 2+LINK_SIZE, recno);
}
@@ -4512,7 +4569,9 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
/* We come here from the Python syntax above that handles both
references (?P=name) and recursion (?P>name), as well as falling
through from the Perl recursion syntax (?&name). */
through from the Perl recursion syntax (?&name). We also come here from
the Perl \k<name> or \k'name' back reference syntax and the \k{name}
.NET syntax. */
NAMED_REF_OR_RECURSE:
name = ++ptr;
@@ -4524,6 +4583,11 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
if (lengthptr != NULL)
{
if (namelen == 0)
{
*errorcodeptr = ERR62;
goto FAILED;
}
if (*ptr != terminator)
{
*errorcodeptr = ERR42;
@@ -4537,14 +4601,19 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
recno = 0;
}
/* In the real compile, seek the name in the table */
/* In the real compile, seek the name in the table. We check the name
first, and then check that we have reached the end of the name in the
table. That way, if the name is longer than any in the table,
the comparison will fail without reading beyond the table entry. */
else
{
slot = cd->name_table;
for (i = 0; i < cd->names_found; i++)
{
if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
slot[2+namelen] == 0)
break;
slot += cd->name_entry_size;
}
@@ -4581,7 +4650,15 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
{
const uschar *called;
if ((refsign = *ptr) == '+') ptr++;
if ((refsign = *ptr) == '+')
{
ptr++;
if ((digitab[*ptr] & ctype_digit) == 0)
{
*errorcodeptr = ERR63;
goto FAILED;
}
}
else if (refsign == '-')
{
if ((digitab[ptr[1]] & ctype_digit) == 0)
@@ -5747,7 +5824,6 @@ to fill in forward references to subpatterns. */
uschar cworkspace[COMPILE_WORK_SIZE];
/* Set this early so that early errors get offset 0. */
ptr = (const uschar *)pattern;
@@ -5908,7 +5984,7 @@ to compile parts of the pattern into; the compiled code is discarded when it is
no longer needed, so hopefully this workspace will never overflow, though there
is a test for its doing so. */
cd->bracount = 0;
cd->bracount = cd->final_bracount = 0;
cd->names_found = 0;
cd->name_entry_size = 0;
cd->name_table = NULL;
@@ -5985,6 +6061,7 @@ field. Reset the bracket count and the names_found field. Also reset the hwm
field; this time it's used for remembering forward references to subpatterns.
*/
cd->final_bracount = cd->bracount; /* Save for checking forward references */
cd->bracount = 0;
cd->names_found = 0;
cd->name_table = (uschar *)re + re->name_table_offset;