diff --git a/libpcre/AUTHORS b/libpcre/AUTHORS index 33df90f8d..adb4fc401 100644 --- a/libpcre/AUTHORS +++ b/libpcre/AUTHORS @@ -8,7 +8,7 @@ Email domain: cam.ac.uk University of Cambridge Computing Service, Cambridge, England. Phone: +44 1223 334714. -Copyright (c) 1997-2005 University of Cambridge +Copyright (c) 1997-2006 University of Cambridge All rights reserved @@ -17,7 +17,7 @@ THE C++ WRAPPER LIBRARY Written by: Google Inc. -Copyright (c) 2005 Google Inc +Copyright (c) 2006 Google Inc All rights reserved #### diff --git a/libpcre/LICENCE b/libpcre/LICENCE index e8eb0d937..daea2e48a 100644 --- a/libpcre/LICENCE +++ b/libpcre/LICENCE @@ -22,7 +22,7 @@ Email domain: cam.ac.uk University of Cambridge Computing Service, Cambridge, England. Phone: +44 1223 334714. -Copyright (c) 1997-2005 University of Cambridge +Copyright (c) 1997-2006 University of Cambridge All rights reserved. @@ -31,7 +31,7 @@ THE C++ WRAPPER FUNCTIONS Contributed by: Google Inc. -Copyright (c) 2005, Google Inc. +Copyright (c) 2006, Google Inc. All rights reserved. 
diff --git a/libpcre/Makefile.in b/libpcre/Makefile.in index e9b83e8ef..6ec7d09d5 100644 --- a/libpcre/Makefile.in +++ b/libpcre/Makefile.in @@ -83,6 +83,7 @@ CXX = @CXX@ CFLAGS = @CFLAGS@ CXXFLAGS = @CXXFLAGS@ LDFLAGS = @LDFLAGS@ +CXXLDFLAGS = @CXXLDFLAGS@ CC_FOR_BUILD = @CC_FOR_BUILD@ CFLAGS_FOR_BUILD = @CFLAGS_FOR_BUILD@ @@ -94,7 +95,7 @@ UCP = @UCP@ NEWLINE = @NEWLINE@ POSIX_MALLOC_THRESHOLD = @POSIX_MALLOC_THRESHOLD@ LINK_SIZE = @LINK_SIZE@ -MATCH_LIMIT = @MATCH_LIMIT@ +MATCH_LIMIT = @MATCH_LIMIT@ @MATCH_LIMIT_RECURSION@ NO_RECURSE = @NO_RECURSE@ EBCDIC = @EBCDIC@ @@ -139,83 +140,83 @@ pcre_chartables.@OBJEXT@: pcre_chartables.c $(CC) -c $(CFLAGS) $(INCLS) $(DEFS) \ $(POSIX_MALLOC_THRESHOLD) pcre_chartables.c -pcre_compile.@OBJEXT@: Makefile config.h pcre.h \ +pcre_compile.@OBJEXT@: Makefile config.h $(top_srcdir)/pcre.h \ $(top_srcdir)/pcre_internal.h $(top_srcdir)/pcre_compile.c $(CC) -c $(CFLAGS) $(INCLS) $(DEFS) $(POSIX_MALLOC_THRESHOLD) \ $(top_srcdir)/pcre_compile.c -pcre_config.@OBJEXT@: Makefile config.h pcre.h \ +pcre_config.@OBJEXT@: Makefile config.h $(top_srcdir)/pcre.h \ $(top_srcdir)/pcre_internal.h $(top_srcdir)/pcre_config.c $(CC) -c $(CFLAGS) $(INCLS) $(DEFS) $(POSIX_MALLOC_THRESHOLD) \ $(top_srcdir)/pcre_config.c -pcre_dfa_exec.@OBJEXT@: Makefile config.h pcre.h \ +pcre_dfa_exec.@OBJEXT@: Makefile config.h $(top_srcdir)/pcre.h \ $(top_srcdir)/pcre_internal.h $(top_srcdir)/pcre_dfa_exec.c $(CC) -c $(CFLAGS) $(INCLS) $(DEFS) $(POSIX_MALLOC_THRESHOLD) \ $(top_srcdir)/pcre_dfa_exec.c -pcre_exec.@OBJEXT@: Makefile config.h pcre.h \ +pcre_exec.@OBJEXT@: Makefile config.h $(top_srcdir)/pcre.h \ $(top_srcdir)/pcre_internal.h $(top_srcdir)/pcre_exec.c $(CC) -c $(CFLAGS) $(INCLS) $(DEFS) $(POSIX_MALLOC_THRESHOLD) \ $(top_srcdir)/pcre_exec.c -pcre_fullinfo.@OBJEXT@: Makefile config.h pcre.h \ +pcre_fullinfo.@OBJEXT@: Makefile config.h $(top_srcdir)/pcre.h \ $(top_srcdir)/pcre_internal.h $(top_srcdir)/pcre_fullinfo.c $(CC) -c $(CFLAGS) $(INCLS) $(DEFS) 
$(POSIX_MALLOC_THRESHOLD) \ $(top_srcdir)/pcre_fullinfo.c -pcre_get.@OBJEXT@: Makefile config.h pcre.h \ +pcre_get.@OBJEXT@: Makefile config.h $(top_srcdir)/pcre.h \ $(top_srcdir)/pcre_internal.h $(top_srcdir)/pcre_get.c $(CC) -c $(CFLAGS) $(INCLS) $(DEFS) $(POSIX_MALLOC_THRESHOLD) \ $(top_srcdir)/pcre_get.c -pcre_globals.@OBJEXT@: Makefile config.h pcre.h \ +pcre_globals.@OBJEXT@: Makefile config.h $(top_srcdir)/pcre.h \ $(top_srcdir)/pcre_internal.h $(top_srcdir)/pcre_globals.c $(CC) -c $(CFLAGS) $(INCLS) $(DEFS) $(POSIX_MALLOC_THRESHOLD) \ $(top_srcdir)/pcre_globals.c -pcre_info.@OBJEXT@: Makefile config.h pcre.h \ +pcre_info.@OBJEXT@: Makefile config.h $(top_srcdir)/pcre.h \ $(top_srcdir)/pcre_internal.h $(top_srcdir)/pcre_info.c $(CC) -c $(CFLAGS) $(INCLS) $(DEFS) $(POSIX_MALLOC_THRESHOLD) \ $(top_srcdir)/pcre_info.c -pcre_maketables.@OBJEXT@: Makefile config.h pcre.h \ +pcre_maketables.@OBJEXT@: Makefile config.h $(top_srcdir)/pcre.h \ $(top_srcdir)/pcre_internal.h $(top_srcdir)/pcre_maketables.c $(CC) -c $(CFLAGS) $(INCLS) $(DEFS) $(POSIX_MALLOC_THRESHOLD) \ $(top_srcdir)/pcre_maketables.c -pcre_refcount.@OBJEXT@: Makefile config.h pcre.h \ +pcre_refcount.@OBJEXT@: Makefile config.h $(top_srcdir)/pcre.h \ $(top_srcdir)/pcre_internal.h $(top_srcdir)/pcre_refcount.c $(CC) -c $(CFLAGS) $(INCLS) $(DEFS) $(POSIX_MALLOC_THRESHOLD) \ $(top_srcdir)/pcre_refcount.c -pcre_study.@OBJEXT@: Makefile config.h pcre.h \ +pcre_study.@OBJEXT@: Makefile config.h $(top_srcdir)/pcre.h \ $(top_srcdir)/pcre_internal.h $(top_srcdir)/pcre_study.c $(CC) -c $(CFLAGS) $(INCLS) $(DEFS) $(POSIX_MALLOC_THRESHOLD) \ $(top_srcdir)/pcre_study.c -pcre_tables.@OBJEXT@: Makefile config.h pcre.h \ +pcre_tables.@OBJEXT@: Makefile config.h $(top_srcdir)/pcre.h \ $(top_srcdir)/pcre_internal.h $(top_srcdir)/pcre_tables.c $(CC) -c $(CFLAGS) $(INCLS) $(DEFS) $(POSIX_MALLOC_THRESHOLD) \ $(top_srcdir)/pcre_tables.c -pcre_try_flipped.@OBJEXT@: Makefile config.h pcre.h \ +pcre_try_flipped.@OBJEXT@: 
Makefile config.h $(top_srcdir)/pcre.h \ $(top_srcdir)/pcre_internal.h $(top_srcdir)/pcre_try_flipped.c $(CC) -c $(CFLAGS) $(INCLS) $(DEFS) $(POSIX_MALLOC_THRESHOLD) \ $(top_srcdir)/pcre_try_flipped.c -pcre_version.@OBJEXT@: Makefile config.h pcre.h \ +pcre_version.@OBJEXT@: Makefile config.h $(top_srcdir)/pcre.h \ $(top_srcdir)/pcre_internal.h $(top_srcdir)/pcre_version.c $(CC) -c $(CFLAGS) $(INCLS) $(DEFS) $(POSIX_MALLOC_THRESHOLD) \ $(top_srcdir)/pcre_version.c -pcre_xclass.@OBJEXT@: Makefile config.h pcre.h \ +pcre_xclass.@OBJEXT@: Makefile config.h $(top_srcdir)/pcre.h \ $(top_srcdir)/pcre_internal.h $(top_srcdir)/pcre_xclass.c $(CC) -c $(CFLAGS) $(INCLS) $(DEFS) $(POSIX_MALLOC_THRESHOLD) \ $(top_srcdir)/pcre_xclass.c pcreposix.@OBJEXT@: $(top_srcdir)/pcreposix.c $(top_srcdir)/pcreposix.h \ - $(top_srcdir)/pcre_internal.h pcre.h config.h Makefile + $(top_srcdir)/pcre_internal.h $(top_srcdir)/pcre.h config.h Makefile @$(LTCOMPILE) $(POSIX_MALLOC_THRESHOLD) $(top_srcdir)/pcreposix.c $(TARGET): $(OBJ) @@ -231,7 +232,7 @@ pcre_chartables.c: dftables@BUILD_EXEEXT@ dftables.@BUILD_OBJEXT@: $(top_srcdir)/dftables.c \ $(top_srcdir)/pcre_maketables.c $(top_srcdir)/pcre_internal.h \ - pcre.h config.h Makefile + $(top_srcdir)/pcre.h config.h Makefile $(CC) -c $(CFLAGS) $(INCLS) $(DEFS) -I. $(top_srcdir)/dftables.c dftables@BUILD_EXEEXT@: dftables.@BUILD_OBJEXT@ @@ -247,7 +248,7 @@ clean:; -rm -rf *.@OBJEXT@ *.lo *.a *.la .libs pcretest@EXEEXT@ pcre_str distclean: clean -rm -f pcre_chartables.c libtool pcre-config libpcre.pc \ - pcre.h pcre_stringpiece.h pcrecpp.h \ + pcre_stringpiece.h pcrecpparg.h \ dftables@EXEEXT@ RunGrepTest RunTest \ Makefile config.h config.status config.log config.cache diff --git a/libpcre/NON-UNIX-USE b/libpcre/NON-UNIX-USE index fc02ba154..459bdf2b4 100644 --- a/libpcre/NON-UNIX-USE +++ b/libpcre/NON-UNIX-USE @@ -11,8 +11,9 @@ the Contrib directory on the ftp site that you may find useful. 
See If you want to compile PCRE for a non-Unix system (or perhaps, more strictly, for a system that does not support "configure" and "make" files), note that -PCRE consists entirely of code written in Standard C, and so should compile -successfully on any system that has a Standard C compiler and library. +the basic PCRE library consists entirely of code written in Standard C, and so +should compile successfully on any system that has a Standard C compiler and +library. The C++ wrapper functions are a separate issue (see below). GENERIC INSTRUCTIONS FOR THE C LIBRARY @@ -34,27 +35,16 @@ your compiler gives to '\n'. rem Use write, because notepad cannot handle UNIX files. Change values. write config.h -(2) Copy or rename the file pcre.in as pcre.h, and change the macro definitions -for PCRE_MAJOR, PCRE_MINOR, and PCRE_DATE near its start to the values set in -configure.in. - - rem Mark Tetrode's commands - copy pcre.in pcre.h - rem Read values from configure.in - write configure.in - rem Change values - write pcre.h - -(3) Compile dftables.c as a stand-alone program, and then run it with +(2) Compile dftables.c as a stand-alone program, and then run it with the single argument "pcre_chartables.c". This generates a set of standard character tables and writes them to that file. rem Mark Tetrode's commands rem Compile & run cl -DSUPPORT_UTF8 -DSUPPORT_UCP dftables.c - dftables.exe chartables.c + dftables.exe pcre_chartables.c -(4) Compile the following source files: +(3) Compile the following source files: pcre_chartables.c pcre_compile.c @@ -67,12 +57,11 @@ character tables and writes them to that file. pcre_info.c pcre_maketables.c pcre_ord2utf8.c - pcre_printint.c pcre_refcount.c pcre_study.c pcre_tables.c pcre_try_flipped.c - pcre_ucp_findchar.c + pcre_ucp_searchfuncs.c pcre_valid_utf8.c pcre_version.c pcre_xclass.c @@ -88,7 +77,7 @@ shared libraries, you may have to do this once for each type. 
cl -DSUPPORT_UTF8 -DSUPPORT_UCP -DPOSIX_MALLOC_THRESHOLD=10 /c maketables.c get.c study.c pcre.c lib /OUT:pcre.lib maketables.obj get.obj study.obj pcre.obj -(5) Similarly, compile pcreposix.c and link it (on its own) as the pcreposix +(4) Similarly, compile pcreposix.c and link it (on its own) as the pcreposix library. rem Mark Tetrode's commands, for a static library @@ -96,14 +85,14 @@ library. cl -DSUPPORT_UTF8 -DSUPPORT_UCP -DPOSIX_MALLOC_THRESHOLD=10 /c pcreposix.c lib /OUT:pcreposix.lib pcreposix.obj -(6) Compile the test program pcretest.c. This needs the functions in the +(5) Compile the test program pcretest.c. This needs the functions in the pcre and pcreposix libraries when linking. rem Mark Tetrode's commands rem compile & link cl /F0x400000 pcretest.c pcre.lib pcreposix.lib -(7) Run pcretest on the testinput files in the testdata directory, and check +(6) Run pcretest on the testinput files in the testdata directory, and check that the output matches the corresponding testoutput files. You must use the -i option when checking testinput2. Note that the supplied files are in Unix format, with just LF characters as line terminators. You may need to edit them @@ -126,6 +115,9 @@ to change this if your system uses a different convention. Note that there are now three more tests (7, 8, 9) that did not exist when Mark wrote those comments. The test the new pcre_dfa_exec() function. +(7) If you want to use the pcregrep command, compile and link pcregrep.c; it +uses only the basic PCRE library. + THE C++ WRAPPER FUNCTIONS diff --git a/libpcre/README b/libpcre/README index 2ede6fc2e..24d0b9749 100644 --- a/libpcre/README +++ b/libpcre/README @@ -34,7 +34,7 @@ Documentation for PCRE ---------------------- If you install PCRE in the normal way, you will end up with an installed set of -man pages whose names all start with "pcre". The one that is called "pcre" +man pages whose names all start with "pcre". The one that is just called "pcre" lists all the others. 
In addition to these man pages, the PCRE documentation is supplied in two other forms; however, as there is no standard place to install them, they are left in the doc directory of the unpacked source distribution. @@ -68,6 +68,9 @@ others are pointers to URLs containing relevant files. Building PCRE on a Unix-like system ----------------------------------- +If you are using HP's ANSI C++ compiler (aCC), please see the special note +in the section entitled "Using HP's ANSI C++ compiler (aCC)" below. + To build PCRE on a Unix-like system, first run the "configure" command from the PCRE distribution directory, with your current directory set to the directory where you want the files to be created. This command is a standard GNU @@ -91,6 +94,10 @@ into /source/pcre/pcre-xxx, but you want to build it in /build/pcre/pcre-xxx: cd /build/pcre/pcre-xxx /source/pcre/pcre-xxx/configure +PCRE is written in C and is normally compiled as a C library. However, it is +possible to build it as a C++ library, though the provided building apparatus +does not have any features to support this. + There are some optional features that can be included or omitted from the PCRE library. You can read more about them in the pcrebuild man page. @@ -107,15 +114,17 @@ library. You can read more about them in the pcrebuild man page. . If, in addition to support for UTF-8 character strings, you want to include support for the \P, \p, and \X sequences that recognize Unicode character properties, you must add --enable-unicode-properties to the "configure" - command. This adds about 90K to the size of the library (in the form of a + command. This adds about 30K to the size of the library (in the form of a property table); only the basic two-letter properties such as Lu are supported. -. You can build PCRE to recognized CR or NL as the newline character, instead - of whatever your compiler uses for "\n", by adding --newline-is-cr or - --newline-is-nl to the "configure" command, respectively. 
Only do this if you - really understand what you are doing. On traditional Unix-like systems, the - newline character is NL. +. You can build PCRE to recognize either CR or LF or the sequence CRLF as + indicating the end of a line. Whatever you specify at build time is the + default; the caller of PCRE can change the selection at run time. The default + newline indicator is a single LF character (the Unix standard). You can + specify the default newline indicator by adding --newline-is-cr or + --newline-is-lf or --newline-is-crlf to the "configure" command, + respectively. . When called via the POSIX interface, PCRE uses malloc() to get additional storage for processing capturing parentheses if there are more than 10 of @@ -135,6 +144,16 @@ library. You can read more about them in the pcrebuild man page. pcre_exec() can supply their own value. There is discussion on the pcreapi man page. +. There is a separate counter that limits the depth of recursive function calls + during a matching process. This also has a default of ten million, which is + essentially "unlimited". You can change the default by setting, for example, + + --with-match-limit-recursion=500000 + + Recursive function calls use up the runtime stack; running out of stack can + cause programs to crash in strange ways. There is a discussion about stack + sizes in the pcrestack man page. + . The default maximum compiled pattern size is around 64K. You can increase this by adding --with-link-size=3 to the "configure" command. You can increase it even more by setting --with-link-size=4, but this is unlikely @@ -158,7 +177,6 @@ library. You can read more about them in the pcrebuild man page. The "configure" script builds eight files for the basic C library: -. pcre.h is the header file for C programs that call PCRE . Makefile is the makefile that builds the library . config.h contains build-time configuration options for the library . 
pcre-config is a script that shows the settings of "configure" options @@ -262,6 +280,22 @@ when calling the "configure" command. If they are not specified, they default to the values of CC and CFLAGS. +Using HP's ANSI C++ compiler (aCC) +---------------------------------- + +Unless C++ support is disabled by specifying the "--disable-cpp" option of the +"configure" script, you *must* include the "-AA" option in the CXXFLAGS +environment variable in order for the C++ components to compile correctly. + +Also, note that the aCC compiler on PA-RISC platforms may have a defect whereby +needed libraries fail to get included when specifying the "-AA" compiler +option. If you experience unresolved symbols when linking the C++ programs, +use the workaround of specifying the following environment variable prior to +running the "configure" script: + + CXXLDFLAGS="-lstd_v2 -lCsup_v2" + + Building on non-Unix systems ---------------------------- @@ -409,28 +443,28 @@ The distribution should contain the following files: pcre_info.c ) pcre_maketables.c ) pcre_ord2utf8.c ) - pcre_printint.c ) + pcre_refcount.c ) pcre_study.c ) pcre_tables.c ) pcre_try_flipped.c ) - pcre_ucp_findchar.c ) + pcre_ucp_searchfuncs.c) pcre_valid_utf8.c ) pcre_version.c ) pcre_xclass.c ) - - ucp_findchar.c ) - ucp.h ) source for the code that is used for - ucpinternal.h ) Unicode property handling ucptable.c ) - ucptypetable.c ) - pcre.in "source" for the header for the external API; pcre.h - is built from this by "configure" + pcre_printint.src ) debugging function that is #included in pcretest, and + ) can also be #included in pcre_compile() + + pcre.h the public PCRE header file pcreposix.h header for the external POSIX wrapper API pcre_internal.h header for internal use + ucp.h ) headers concerned with + ucpinternal.h ) Unicode property handling config.in template for config.h, which is built by configure - pcrecpp.h.in "source" for the header file for the C++ wrapper + pcrecpp.h the header file 
for the C++ wrapper + pcrecpparg.h.in "source" for another C++ header file pcrecpp.cc ) pcre_scanner.cc ) source for the C++ wrapper library @@ -453,8 +487,9 @@ The distribution should contain the following files: RunGrepTest.in template for a Unix shell script for pcregrep tests config.guess ) files used by libtool, config.sub ) used only when building a shared library + config.h.in "source" for the config.h header file configure a configuring shell script (built by autoconf) - configure.in the autoconf input used to build configure + configure.ac the autoconf input used to build configure doc/Tech.Notes notes on the encoding doc/*.3 man page sources for the PCRE functions doc/*.1 man page sources for pcregrep and pcretest @@ -482,7 +517,6 @@ The distribution should contain the following files: libpcre.def libpcreposix.def - pcre.def (D) Auxiliary file for VPASCAL @@ -491,4 +525,4 @@ The distribution should contain the following files: Philip Hazel Email local part: ph10 Email domain: cam.ac.uk -August 2005 +June 2006 diff --git a/libpcre/config.h.in b/libpcre/config.h.in index a6d1451db..efc537154 100644 --- a/libpcre/config.h.in +++ b/libpcre/config.h.in @@ -1,15 +1,16 @@ -/* On Unix systems config.in is converted by configure into config.h. PCRE is -written in Standard C, but there are a few non-standard things it can cope -with, allowing it to run on SunOS4 and other "close to standard" systems. +/* On Unix-like systems config.in is converted by "configure" into config.h. +Some other environments also support the use of "configure". PCRE is written in +Standard C, but there are a few non-standard things it can cope with, allowing +it to run on SunOS4 and other "close to standard" systems. -On a non-Unix system you should just copy this file into config.h, and set up -the macros the way you need them. You should normally change the definitions of -HAVE_STRERROR and HAVE_MEMMOVE to 1. 
Unfortunately, because of the way autoconf -works, these cannot be made the defaults. If your system has bcopy() and not -memmove(), change the definition of HAVE_BCOPY instead of HAVE_MEMMOVE. If your -system has neither bcopy() nor memmove(), leave them both as 0; an emulation -function will be used. */ +On a non-Unix-like system you should just copy this file into config.h, and set +up the macros the way you need them. You should normally change the definitions +of HAVE_STRERROR and HAVE_MEMMOVE to 1. Unfortunately, because of the way +autoconf works, these cannot be made the defaults. If your system has bcopy() +and not memmove(), change the definition of HAVE_BCOPY instead of HAVE_MEMMOVE. +If your system has neither bcopy() nor memmove(), leave them both as 0; an +emulation function will be used. */ /* If you are compiling for a system that uses EBCDIC instead of ASCII character codes, define this macro as 1. On systems that can use "configure", @@ -19,76 +20,70 @@ this can be done via --enable-ebcdic. */ #define EBCDIC 0 #endif -/* If you are compiling for a system that needs some magic to be inserted -before the definition of an exported function, define this macro to contain the -relevant magic. It apears at the start of every exported function. */ +/* If you are compiling for a system other than a Unix-like system or Win32, +and it needs some magic to be inserted before the definition of a function that +is exported by the library, define this macro to contain the relevant magic. If +you do not define this macro, it defaults to "extern" for a C compiler and +"extern C" for a C++ compiler on non-Win32 systems. This macro appears at the +start of every exported function that is part of the external API. It does not +appear on functions that are "external" in the C sense, but which are internal +to the library. */ -#define PCRE_EXPORT +/* #define PCRE_DATA_SCOPE */ -/* Define to empty if the "const" keyword does not work. 
*/ +/* Define the following macro to empty if the "const" keyword does not work. */ #undef const -/* Define to "unsigned" if <sys/types.h> doesn't define size_t. */ +/* Define the following macro to "unsigned" if <sys/types.h> does not define +size_t. */ #undef size_t /* The following two definitions are mainly for the benefit of SunOS4, which -doesn't have the strerror() or memmove() functions that should be present in +does not have the strerror() or memmove() functions that should be present in all Standard C libraries. The macros HAVE_STRERROR and HAVE_MEMMOVE should normally be defined with the value 1 for other systems, but unfortunately we -can't make this the default because "configure" files generated by autoconf +cannot make this the default because "configure" files generated by autoconf will only change 0 to 1; they won't change 1 to 0 if the functions are not found. */ #define HAVE_STRERROR 0 #define HAVE_MEMMOVE 0 -/* There are some non-Unix systems that don't even have bcopy(). If this macro -is false, an emulation is used. If HAVE_MEMMOVE is set to 1, the value of +/* There are some non-Unix-like systems that don't even have bcopy(). If this +macro is false, an emulation is used. If HAVE_MEMMOVE is set to 1, the value of HAVE_BCOPY is not relevant. */ #define HAVE_BCOPY 0 /* The value of NEWLINE determines the newline character. The default is to leave it up to the compiler, but some sites want to force a particular value. -On Unix systems, "configure" can be used to override this default. */ +On Unix-like systems, "configure" can be used to override this default. */ #ifndef NEWLINE #define NEWLINE '\n' #endif -/* The value of LINK_SIZE determines the number of bytes used to store -links as offsets within the compiled regex. The default is 2, which allows for -compiled patterns up to 64K long. This covers the vast majority of cases. -However, PCRE can also be compiled to use 3 or 4 bytes instead. This allows for -longer patterns in extreme cases. 
On Unix systems, "configure" can be used to -override this default. */ +/* The value of LINK_SIZE determines the number of bytes used to store links as +offsets within the compiled regex. The default is 2, which allows for compiled +patterns up to 64K long. This covers the vast majority of cases. However, PCRE +can also be compiled to use 3 or 4 bytes instead. This allows for longer +patterns in extreme cases. On systems that support it, "configure" can be used +to override this default. */ #ifndef LINK_SIZE #define LINK_SIZE 2 #endif -/* The value of MATCH_LIMIT determines the default number of times the match() -function can be called during a single execution of pcre_exec(). (There is a -runtime method of setting a different limit.) The limit exists in order to -catch runaway regular expressions that take for ever to determine that they do -not match. The default is set very large so that it does not accidentally catch -legitimate cases. On Unix systems, "configure" can be used to override this -default default. */ - -#ifndef MATCH_LIMIT -#define MATCH_LIMIT 10000000 -#endif - /* When calling PCRE via the POSIX interface, additional working storage is required for holding the pointers to capturing substrings because PCRE requires three integers per substring, whereas the POSIX interface provides only two. If the number of expected substrings is small, the wrapper function uses space on the stack, because this is faster than using malloc() for each call. The -threshold above which the stack is no longer use is defined by POSIX_MALLOC_ -THRESHOLD. On Unix systems, "configure" can be used to override this default. -*/ +threshold above which the stack is no longer used is defined by POSIX_MALLOC_ +THRESHOLD. On systems that support it, "configure" can be used to override this +default. */ #ifndef POSIX_MALLOC_THRESHOLD #define POSIX_MALLOC_THRESHOLD 10 @@ -97,11 +92,52 @@ THRESHOLD. On Unix systems, "configure" can be used to override this default. 
/* PCRE uses recursive function calls to handle backtracking while matching. This can sometimes be a problem on systems that have stacks of limited size. Define NO_RECURSE to get a version that doesn't use recursion in the match() -function; instead it creates its own stack by steam using pcre_recurse_malloc -to get memory. For more detail, see comments and other stuff just above the -match() function. On Unix systems, "configure" can be used to set this in the -Makefile (use --disable-stack-for-recursion). */ +function; instead it creates its own stack by steam using pcre_recurse_malloc() +to obtain memory from the heap. For more detail, see the comments and other +stuff just above the match() function. On systems that support it, "configure" +can be used to set this in the Makefile (use --disable-stack-for-recursion). */ /* #define NO_RECURSE */ +/* The value of MATCH_LIMIT determines the default number of times the internal +match() function can be called during a single execution of pcre_exec(). There +is a runtime interface for setting a different limit. The limit exists in order +to catch runaway regular expressions that take for ever to determine that they +do not match. The default is set very large so that it does not accidentally +catch legitimate cases. On systems that support it, "configure" can be used to +override this default. */ + +#ifndef MATCH_LIMIT +#define MATCH_LIMIT 10000000 +#endif + +/* The above limit applies to all calls of match(), whether or not they +increase the recursion depth. In some environments it is desirable to limit the +depth of recursive calls of match() more strictly, in order to restrict the +maximum amount of stack (or heap, if NO_RECURSE is defined) that is used. The +value of MATCH_LIMIT_RECURSION applies only to recursive calls of match(). To +have any useful effect, it must be less than the value of MATCH_LIMIT. There is +a runtime method for setting a different limit. 
On systems that support it, +"configure" can be used to override this default. */ + +#ifndef MATCH_LIMIT_RECURSION +#define MATCH_LIMIT_RECURSION MATCH_LIMIT +#endif + +/* These three limits are parameterized just in case anybody ever wants to +change them. Care must be taken if they are increased, because they guard +against integer overflow caused by enormously large patterns. */ + +#ifndef MAX_NAME_SIZE +#define MAX_NAME_SIZE 32 +#endif + +#ifndef MAX_NAME_COUNT +#define MAX_NAME_COUNT 10000 +#endif + +#ifndef MAX_DUPLENGTH +#define MAX_DUPLENGTH 30000 +#endif + /* End */ diff --git a/libpcre/configure b/libpcre/configure index 871a74dd7..ba13c87e3 100755 --- a/libpcre/configure +++ b/libpcre/configure @@ -272,6 +272,7 @@ PACKAGE_STRING= PACKAGE_BUGREPORT= ac_unique_file="dftables.c" +ac_unique_file="pcre.h" # Factoring default headers for most tests. ac_includes_default="\ #include @@ -309,7 +310,7 @@ ac_includes_default="\ # include #endif" -ac_subst_vars='SHELL PATH_SEPARATOR PACKAGE_NAME PACKAGE_TARNAME PACKAGE_VERSION PACKAGE_STRING PACKAGE_BUGREPORT exec_prefix prefix program_transform_name bindir sbindir libexecdir datadir sysconfdir sharedstatedir localstatedir libdir includedir oldincludedir infodir mandir build_alias host_alias target_alias DEFS ECHO_C ECHO_N ECHO_T LIBS CC CFLAGS LDFLAGS CPPFLAGS ac_ct_CC EXEEXT OBJEXT RANLIB ac_ct_RANLIB CPP EGREP pcre_have_long_long pcre_have_ulong_long build build_cpu build_vendor build_os host host_cpu host_vendor host_os BUILD_EXEEXT BUILD_OBJEXT CC_FOR_BUILD CFLAGS_FOR_BUILD EBCDIC HAVE_MEMMOVE HAVE_STRERROR LINK_SIZE MATCH_LIMIT NEWLINE NO_RECURSE PCRE_MAJOR PCRE_MINOR PCRE_DATE PCRE_VERSION PCRE_LIB_VERSION PCRE_POSIXLIB_VERSION POSIX_MALLOC_THRESHOLD UCP UTF8 POSIX_OBJ POSIX_LOBJ POSIX_LIB LIBOBJS LTLIBOBJS' +ac_subst_vars='SHELL PATH_SEPARATOR PACKAGE_NAME PACKAGE_TARNAME PACKAGE_VERSION PACKAGE_STRING PACKAGE_BUGREPORT exec_prefix prefix program_transform_name bindir sbindir libexecdir datadir 
sysconfdir sharedstatedir localstatedir libdir includedir oldincludedir infodir mandir build_alias host_alias target_alias DEFS ECHO_C ECHO_N ECHO_T LIBS CC CFLAGS LDFLAGS CPPFLAGS ac_ct_CC EXEEXT OBJEXT RANLIB ac_ct_RANLIB CPP EGREP pcre_have_long_long pcre_have_ulong_long build build_cpu build_vendor build_os host host_cpu host_vendor host_os BUILD_EXEEXT BUILD_OBJEXT CC_FOR_BUILD CFLAGS_FOR_BUILD CXXLDFLAGS EBCDIC HAVE_MEMMOVE HAVE_STRERROR LINK_SIZE MATCH_LIMIT MATCH_LIMIT_RECURSION NEWLINE NO_RECURSE PCRE_LIB_VERSION PCRE_POSIXLIB_VERSION PCRE_VERSION POSIX_MALLOC_THRESHOLD UCP UTF8 POSIX_OBJ POSIX_LOBJ POSIX_LIB LIBOBJS LTLIBOBJS' ac_subst_files='' # Initialize some variables set by options. @@ -848,15 +849,17 @@ Optional Features: --enable-unicode-properties enable Unicode properties support --enable-newline-is-cr use CR as the newline character --enable-newline-is-lf use LF as the newline character + --enable-newline-is-crlf use CRLF as the newline sequence --enable-ebcdic assume EBCDIC coding rather than ASCII --disable-stack-for-recursion disable use of stack recursion when matching Optional Packages: --with-PACKAGE[=ARG] use PACKAGE [ARG=yes] --without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no) - --with-posix-malloc-threshold=5 threshold for POSIX malloc usage + --with-posix-malloc-threshold=10 threshold for POSIX malloc usage --with-link-size=2 internal link size (2, 3, or 4 allowed) - --with-match-limit=10000000 default limit on internal looping) + --with-match-limit=10000000 default limit on internal looping + --with-match-limit-recursion=10000000 default limit on internal recursion Some influential environment variables: CC C compiler command @@ -1307,18 +1310,13 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu + ac_config_headers="$ac_config_headers config.h" -PCRE_MAJOR=6 -PCRE_MINOR=4 -PCRE_DATE=05-Sep-2005 -PCRE_VERSION=${PCRE_MAJOR}.${PCRE_MINOR} - - POSIX_MALLOC_THRESHOLD=-DPOSIX_MALLOC_THRESHOLD=10 @@ -1326,6 +1324,12 @@ 
PCRE_LIB_VERSION=0:1:0 PCRE_POSIXLIB_VERSION=0:0:0 +PCRE_MAJOR=`grep '#define PCRE_MAJOR' ${srcdir}/pcre.h | cut -c 29-` +PCRE_MINOR=`grep '#define PCRE_MINOR' ${srcdir}/pcre.h | cut -c 29-` +PCRE_PRERELEASE=`grep '#define PCRE_PRERELEASE' ${srcdir}/pcre.h | cut -c 29-` +PCRE_VERSION=${PCRE_MAJOR}.${PCRE_MINOR}${PCRE_PRERELEASE} + + ac_ext=c ac_cpp='$CPP $CPPFLAGS' ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' @@ -3447,6 +3451,16 @@ fi fi; +# Check whether --enable-newline-is-crlf or --disable-newline-is-crlf was given. +if test "${enable_newline_is_crlf+set}" = set; then + enableval="$enable_newline_is_crlf" + if test "$enableval" = "yes"; then + NEWLINE=-DNEWLINE=3338 +fi + +fi; + + # Check whether --enable-ebcdic or --disable-ebcdic was given. if test "${enable_ebcdic+set}" = set; then enableval="$enable_ebcdic" @@ -3495,6 +3509,15 @@ if test "${with_match_limit+set}" = set; then fi; + +# Check whether --with-match-limit-recursion or --without-match-limit-recursion was given. +if test "${with_match_limit_recursion+set}" = set; then + withval="$with_match_limit_recursion" + MATCH_LIMIT_RECURSION=-DMATCH_LIMIT_RECURSION=$withval + +fi; + + if test "$UCP" != "" ; then UTF8=-DSUPPORT_UTF8 fi @@ -3615,7 +3638,6 @@ esac - if test "x$enable_shared" = "xno" ; then @@ -3626,7 +3648,7 @@ _ACEOF fi - ac_config_files="$ac_config_files Makefile pcre.h:pcre.h.in" + ac_config_files="$ac_config_files Makefile" cat >confcache <<\_ACEOF # This file is a shell script that caches the results of configure # tests run on this system so they can be shared between configure @@ -4152,7 +4174,6 @@ do case "$ac_config_target" in # Handling of arguments. 
"Makefile" ) CONFIG_FILES="$CONFIG_FILES Makefile" ;; - "pcre.h" ) CONFIG_FILES="$CONFIG_FILES pcre.h:pcre.h.in" ;; "config.h" ) CONFIG_HEADERS="$CONFIG_HEADERS config.h" ;; *) { { echo "$as_me:$LINENO: error: invalid argument: $ac_config_target" >&5 echo "$as_me: error: invalid argument: $ac_config_target" >&2;} @@ -4263,19 +4284,18 @@ s,@BUILD_EXEEXT@,$BUILD_EXEEXT,;t t s,@BUILD_OBJEXT@,$BUILD_OBJEXT,;t t s,@CC_FOR_BUILD@,$CC_FOR_BUILD,;t t s,@CFLAGS_FOR_BUILD@,$CFLAGS_FOR_BUILD,;t t +s,@CXXLDFLAGS@,$CXXLDFLAGS,;t t s,@EBCDIC@,$EBCDIC,;t t s,@HAVE_MEMMOVE@,$HAVE_MEMMOVE,;t t s,@HAVE_STRERROR@,$HAVE_STRERROR,;t t s,@LINK_SIZE@,$LINK_SIZE,;t t s,@MATCH_LIMIT@,$MATCH_LIMIT,;t t +s,@MATCH_LIMIT_RECURSION@,$MATCH_LIMIT_RECURSION,;t t s,@NEWLINE@,$NEWLINE,;t t s,@NO_RECURSE@,$NO_RECURSE,;t t -s,@PCRE_MAJOR@,$PCRE_MAJOR,;t t -s,@PCRE_MINOR@,$PCRE_MINOR,;t t -s,@PCRE_DATE@,$PCRE_DATE,;t t -s,@PCRE_VERSION@,$PCRE_VERSION,;t t s,@PCRE_LIB_VERSION@,$PCRE_LIB_VERSION,;t t s,@PCRE_POSIXLIB_VERSION@,$PCRE_POSIXLIB_VERSION,;t t +s,@PCRE_VERSION@,$PCRE_VERSION,;t t s,@POSIX_MALLOC_THRESHOLD@,$POSIX_MALLOC_THRESHOLD,;t t s,@UCP@,$UCP,;t t s,@UTF8@,$UTF8,;t t diff --git a/libpcre/configure.ac b/libpcre/configure.ac index 5412d05a8..240e78432 100644 --- a/libpcre/configure.ac +++ b/libpcre/configure.ac @@ -13,27 +13,16 @@ dnl This is required at the start; the name is the name of a file dnl it should be seeing, to verify it is in the same directory. AC_INIT(dftables.c) +AC_CONFIG_SRCDIR([pcre.h]) dnl A safety precaution AC_PREREQ(2.57) -dnl Arrange to build config.h from config.h.in. Note that pcre.h is -dnl built differently, as it is just a "substitution" file. +dnl Arrange to build config.h from config.h.in. dnl Manual says this macro should come right after AC_INIT. AC_CONFIG_HEADER(config.h) -dnl Provide the current PCRE version information. 
Do not use numbers -dnl with leading zeros for the minor version, as they end up in a C -dnl macro, and may be treated as octal constants. Stick to single -dnl digits for minor numbers less than 10. There are unlikely to be -dnl that many releases anyway. - -PCRE_MAJOR=6 -PCRE_MINOR=4 -PCRE_DATE=05-Sep-2005 -PCRE_VERSION=${PCRE_MAJOR}.${PCRE_MINOR} - dnl Default values for miscellaneous macros POSIX_MALLOC_THRESHOLD=-DPOSIX_MALLOC_THRESHOLD=10 @@ -44,6 +33,14 @@ dnl are built by default on Unix systems. PCRE_LIB_VERSION=0:1:0 PCRE_POSIXLIB_VERSION=0:0:0 +dnl Find the PCRE version from the pcre.h file. The PCRE_VERSION variable is +dnl substituted in pcre-config.in. + +PCRE_MAJOR=`grep '#define PCRE_MAJOR' ${srcdir}/pcre.h | cut -c 29-` +PCRE_MINOR=`grep '#define PCRE_MINOR' ${srcdir}/pcre.h | cut -c 29-` +PCRE_PRERELEASE=`grep '#define PCRE_PRERELEASE' ${srcdir}/pcre.h | cut -c 29-` +PCRE_VERSION=${PCRE_MAJOR}.${PCRE_MINOR}${PCRE_PRERELEASE} + dnl Checks for programs. AC_PROG_CC @@ -120,6 +117,15 @@ if test "$enableval" = "yes"; then fi ) +dnl Handle --enable-newline-is-crlf + +AC_ARG_ENABLE(newline-is-crlf, +[ --enable-newline-is-crlf use CRLF as the newline sequence], +if test "$enableval" = "yes"; then + NEWLINE=-DNEWLINE=3338 +fi +) + dnl Handle --enable-ebcdic AC_ARG_ENABLE(ebcdic, @@ -145,7 +151,7 @@ dnl I've done. 
dnl Handle --with-posix-malloc-threshold=n AC_ARG_WITH(posix-malloc-threshold, -[ --with-posix-malloc-threshold=5 threshold for POSIX malloc usage], +[ --with-posix-malloc-threshold=10 threshold for POSIX malloc usage], POSIX_MALLOC_THRESHOLD=-DPOSIX_MALLOC_THRESHOLD=$withval ) @@ -156,13 +162,20 @@ AC_ARG_WITH(link-size, LINK_SIZE=-DLINK_SIZE=$withval ) -dnl Handle --with-match_limit=n +dnl Handle --with-match-limit=n AC_ARG_WITH(match-limit, -[ --with-match-limit=10000000 default limit on internal looping)], +[ --with-match-limit=10000000 default limit on internal looping], MATCH_LIMIT=-DMATCH_LIMIT=$withval ) +dnl Handle --with-match-limit_recursion=n + +AC_ARG_WITH(match-limit-recursion, +[ --with-match-limit-recursion=10000000 default limit on internal recursion], + MATCH_LIMIT_RECURSION=-DMATCH_LIMIT_RECURSION=$withval +) + dnl Unicode character property support implies UTF-8 support if test "$UCP" != "" ; then @@ -187,19 +200,18 @@ AC_SUBST(BUILD_EXEEXT) AC_SUBST(BUILD_OBJEXT) AC_SUBST(CC_FOR_BUILD) AC_SUBST(CFLAGS_FOR_BUILD) +AC_SUBST(CXXLDFLAGS) AC_SUBST(EBCDIC) AC_SUBST(HAVE_MEMMOVE) AC_SUBST(HAVE_STRERROR) AC_SUBST(LINK_SIZE) AC_SUBST(MATCH_LIMIT) +AC_SUBST(MATCH_LIMIT_RECURSION) AC_SUBST(NEWLINE) AC_SUBST(NO_RECURSE) -AC_SUBST(PCRE_MAJOR) -AC_SUBST(PCRE_MINOR) -AC_SUBST(PCRE_DATE) -AC_SUBST(PCRE_VERSION) AC_SUBST(PCRE_LIB_VERSION) AC_SUBST(PCRE_POSIXLIB_VERSION) +AC_SUBST(PCRE_VERSION) AC_SUBST(POSIX_MALLOC_THRESHOLD) AC_SUBST(UCP) AC_SUBST(UTF8) @@ -214,4 +226,4 @@ if test "x$enable_shared" = "xno" ; then fi dnl This must be last; it determines what files are written as well as config.h -AC_OUTPUT(Makefile pcre.h:pcre.h.in) +AC_OUTPUT(Makefile ) diff --git a/libpcre/dftables.c b/libpcre/dftables.c index 480753887..5ba207aeb 100644 --- a/libpcre/dftables.c +++ b/libpcre/dftables.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. 
Written by Philip Hazel - Copyright (c) 1997-2005 University of Cambridge + Copyright (c) 1997-2006 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without diff --git a/libpcre/pcre.h b/libpcre/pcre.h index 2f79a8ee9..7a7051ca1 100644 --- a/libpcre/pcre.h +++ b/libpcre/pcre.h @@ -2,8 +2,8 @@ * Perl-Compatible Regular Expressions * *************************************************/ -/* In its original form, this is the .in file that is transformed by -"configure" into pcre.h. +/* This is the public header file for the PCRE library, to be #included by +applications that call the PCRE functions. Copyright (c) 1997-2005 University of Cambridge @@ -39,17 +39,40 @@ POSSIBILITY OF SUCH DAMAGE. #ifndef _PCRE_H #define _PCRE_H -/* The file pcre.h is build by "configure". Do not edit it; instead -make changes to pcre.in. */ +/* The current PCRE version information. */ + +/* NOTES FOR FUTURE MAINTAINERS: Do not use numbers with leading zeros, because +they may be treated as octal constants. The PCRE_PRERELEASE feature is for +identifying release candidates. It might be defined as -RC2, for example. In +real releases, it should be defined empty. Do not change the alignment of these +statments. The code in ./configure greps out the version numbers by using "cut" +to get values from column 29 onwards. These are substituted into pcre-config +and libpcre.pc. The values are not put into configure.ac and substituted here +(which would simplify this issue) because that makes life harder for those who +cannot run ./configure. As it now stands, this file need not be edited in that +circumstance. */ #define PCRE_MAJOR 6 -#define PCRE_MINOR 3 -#define PCRE_DATE 15-Aug-2005 +#define PCRE_MINOR 7 +#define PCRE_PRERELEASE +#define PCRE_DATE 04-Jul-2006 -/* Win32 uses DLL by default; it needs special stuff for exported functions. 
*/ -/* Removed some defines here as I always compile staticly */ +/* Win32 uses DLL by default; it needs special stuff for exported functions +when building PCRE. */ -/* For other operating systems, we use the standard "extern". */ +#ifdef _WIN32 +# ifdef PCRE_DEFINITION +# ifdef DLL_EXPORT +# define PCRE_DATA_SCOPE __declspec(dllexport) +# endif +# else +# ifndef PCRE_STATIC +# define PCRE_DATA_SCOPE extern __declspec(dllimport) +# endif +# endif +#endif + +/* Otherwise, we use the standard "extern". */ #ifndef PCRE_DATA_SCOPE # ifdef __cplusplus @@ -91,6 +114,10 @@ extern "C" { #define PCRE_DFA_SHORTEST 0x00010000 #define PCRE_DFA_RESTART 0x00020000 #define PCRE_FIRSTLINE 0x00040000 +#define PCRE_DUPNAMES 0x00080000 +#define PCRE_NEWLINE_CR 0x00100000 +#define PCRE_NEWLINE_LF 0x00200000 +#define PCRE_NEWLINE_CRLF 0x00300000 /* Exec-time and get/set-time error codes */ @@ -114,6 +141,7 @@ extern "C" { #define PCRE_ERROR_DFA_UMLIMIT (-18) #define PCRE_ERROR_DFA_WSSIZE (-19) #define PCRE_ERROR_DFA_RECURSE (-20) +#define PCRE_ERROR_RECURSIONLIMIT (-21) /* Request types for pcre_fullinfo() */ @@ -131,7 +159,8 @@ extern "C" { #define PCRE_INFO_STUDYSIZE 10 #define PCRE_INFO_DEFAULT_TABLES 11 -/* Request types for pcre_config() */ +/* Request types for pcre_config(). Do not re-arrange, in order to remain +compatible. */ #define PCRE_CONFIG_UTF8 0 #define PCRE_CONFIG_NEWLINE 1 @@ -140,19 +169,30 @@ extern "C" { #define PCRE_CONFIG_MATCH_LIMIT 4 #define PCRE_CONFIG_STACKRECURSE 5 #define PCRE_CONFIG_UNICODE_PROPERTIES 6 +#define PCRE_CONFIG_MATCH_LIMIT_RECURSION 7 -/* Bit flags for the pcre_extra structure */ +/* Bit flags for the pcre_extra structure. Do not re-arrange or redefine +these bits, just add new ones on the end, in order to remain compatible. 
*/ -#define PCRE_EXTRA_STUDY_DATA 0x0001 -#define PCRE_EXTRA_MATCH_LIMIT 0x0002 -#define PCRE_EXTRA_CALLOUT_DATA 0x0004 -#define PCRE_EXTRA_TABLES 0x0008 +#define PCRE_EXTRA_STUDY_DATA 0x0001 +#define PCRE_EXTRA_MATCH_LIMIT 0x0002 +#define PCRE_EXTRA_CALLOUT_DATA 0x0004 +#define PCRE_EXTRA_TABLES 0x0008 +#define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0x0010 /* Types */ struct real_pcre; /* declaration; the definition is private */ typedef struct real_pcre pcre; +/* When PCRE is compiled as a C++ library, the subject pointer type can be +replaced with a custom type. For conventional use, the public interface is a +const char *. */ + +#ifndef PCRE_SPTR +#define PCRE_SPTR const char * +#endif + /* The structure for passing additional data to pcre_exec(). This is defined in such as way as to be extensible. Always add new fields at the end, in order to remain compatible. */ @@ -163,6 +203,7 @@ typedef struct pcre_extra { unsigned long int match_limit; /* Maximum number of calls to match() */ void *callout_data; /* Data passed back in callouts */ const unsigned char *tables; /* Pointer to character tables */ + unsigned long int match_limit_recursion; /* Max recursive calls to match() */ } pcre_extra; /* The structure for passing out data via the pcre_callout_function. 
We use a @@ -175,7 +216,7 @@ typedef struct pcre_callout_block { /* ------------------------ Version 0 ------------------------------- */ int callout_number; /* Number compiled into pattern */ int *offset_vector; /* The offset vector */ - const char *subject; /* The subject being matched */ + PCRE_SPTR subject; /* The subject being matched */ int subject_length; /* The length of the subject */ int start_match; /* Offset to start of this match attempt */ int current_position; /* Where we currently are in the subject */ @@ -221,7 +262,7 @@ PCRE_DATA_SCOPE int pcre_copy_substring(const char *, int *, int, int, char *, int); PCRE_DATA_SCOPE int pcre_dfa_exec(const pcre *, const pcre_extra *, const char *, int, int, int, int *, int , int *, int); -PCRE_DATA_SCOPE int pcre_exec(const pcre *, const pcre_extra *, const char *, +PCRE_DATA_SCOPE int pcre_exec(const pcre *, const pcre_extra *, PCRE_SPTR, int, int, int, int *, int); PCRE_DATA_SCOPE void pcre_free_substring(const char *); PCRE_DATA_SCOPE void pcre_free_substring_list(const char **); @@ -230,6 +271,8 @@ PCRE_DATA_SCOPE int pcre_fullinfo(const pcre *, const pcre_extra *, int, PCRE_DATA_SCOPE int pcre_get_named_substring(const pcre *, const char *, int *, int, const char *, const char **); PCRE_DATA_SCOPE int pcre_get_stringnumber(const pcre *, const char *); +PCRE_DATA_SCOPE int pcre_get_stringtable_entries(const pcre *, const char *, + char **, char **); PCRE_DATA_SCOPE int pcre_get_substring(const char *, int *, int, int, const char **); PCRE_DATA_SCOPE int pcre_get_substring_list(const char *, int *, int, diff --git a/libpcre/pcre_compile.c b/libpcre/pcre_compile.c index 9850399a8..a91ae8861 100644 --- a/libpcre/pcre_compile.c +++ b/libpcre/pcre_compile.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. 
Written by Philip Hazel - Copyright (c) 1997-2005 University of Cambridge + Copyright (c) 1997-2006 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -42,6 +42,7 @@ POSSIBILITY OF SUCH DAMAGE. supporting internal functions that are not used by other modules. */ +#define NLBLOCK cd /* The block containing newline information */ #include "pcre_internal.h" @@ -116,7 +117,7 @@ static const short int escapes[] = { /* Tables of names of POSIX character classes and their lengths. The list is -terminated by a zero length entry. The first three must be alpha, upper, lower, +terminated by a zero length entry. The first three must be alpha, lower, upper, as this is assumed for handling case independence. */ static const char *const posix_names[] = { @@ -127,25 +128,31 @@ static const char *const posix_names[] = { static const uschar posix_name_lengths[] = { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 }; -/* Table of class bit maps for each POSIX class; up to three may be combined -to form the class. The table for [:blank:] is dynamically modified to remove -the vertical space characters. */ +/* Table of class bit maps for each POSIX class. Each class is formed from a +base map, with an optional addition or removal of another map. Then, for some +classes, there is some additional tweaking: for [:blank:] the vertical space +characters are removed, and for [:alpha:] and [:alnum:] the underscore +character is removed. The triples in the table consist of the base map offset, +second map offset or -1 if no second map, and a non-negative value for map +addition or a negative value for map subtraction (if there are two maps). The +absolute value of the third field has these meanings: 0 => no tweaking, 1 => +remove vertical space characters, 2 => remove underscore. 
*/ static const int posix_class_maps[] = { - cbit_lower, cbit_upper, -1, /* alpha */ - cbit_lower, -1, -1, /* lower */ - cbit_upper, -1, -1, /* upper */ - cbit_digit, cbit_lower, cbit_upper, /* alnum */ - cbit_print, cbit_cntrl, -1, /* ascii */ - cbit_space, -1, -1, /* blank - a GNU extension */ - cbit_cntrl, -1, -1, /* cntrl */ - cbit_digit, -1, -1, /* digit */ - cbit_graph, -1, -1, /* graph */ - cbit_print, -1, -1, /* print */ - cbit_punct, -1, -1, /* punct */ - cbit_space, -1, -1, /* space */ - cbit_word, -1, -1, /* word - a Perl extension */ - cbit_xdigit,-1, -1 /* xdigit */ + cbit_word, cbit_digit, -2, /* alpha */ + cbit_lower, -1, 0, /* lower */ + cbit_upper, -1, 0, /* upper */ + cbit_word, -1, 2, /* alnum - word without underscore */ + cbit_print, cbit_cntrl, 0, /* ascii */ + cbit_space, -1, 1, /* blank - a GNU extension */ + cbit_cntrl, -1, 0, /* cntrl */ + cbit_digit, -1, 0, /* digit */ + cbit_graph, -1, 0, /* graph */ + cbit_print, -1, 0, /* print */ + cbit_punct, -1, 0, /* punct */ + cbit_space, -1, 0, /* space */ + cbit_word, -1, 0, /* word - a Perl extension */ + cbit_xdigit,-1, 0 /* xdigit */ }; @@ -184,7 +191,7 @@ static const char *error_texts[] = { "unrecognized character after (?<", /* 25 */ "lookbehind assertion is not fixed length", - "malformed number after (?(", + "malformed number or name after (?(", "conditional group contains more than two branches", "assertion expected after (?(", "(?R or (?digits must be followed by )", @@ -204,12 +211,17 @@ static const char *error_texts[] = { "recursive call could loop indefinitely", "unrecognized character after (?P", "syntax error after (?P", - "two named groups have the same name", + "two named subpatterns have the same name", "invalid UTF-8 string", /* 45 */ "support for \\P, \\p, and \\X has not been compiled", "malformed \\P or \\p sequence", - "unknown property name after \\P or \\p" + "unknown property name after \\P or \\p", + "subpattern name is too long (maximum 32 characters)", + "too many 
named subpatterns (maximum 10,000)", + /* 50 */ + "repeated subpattern is too long", + "octal value is greater than \\377 (not in UTF-8 mode)" }; @@ -371,12 +383,15 @@ static int check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount, int options, BOOL isclass) { -const uschar *ptr = *ptrptr; +BOOL utf8 = (options & PCRE_UTF8) != 0; +const uschar *ptr = *ptrptr + 1; int c, i; +GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */ +ptr--; /* Set pointer back to the last byte */ + /* If backslash is at the end of the pattern, it's an error. */ -c = *(++ptr); if (c == 0) *errorcodeptr = ERR1; /* Non-alphamerics are literals. For digits or letters, do an initial lookup in @@ -451,49 +466,56 @@ else } /* \0 always starts an octal number, but we may drop through to here with a - larger first octal digit. */ + larger first octal digit. The original code used just to take the least + significant 8 bits of octal numbers (I think this is what early Perls used + to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more + than 3 octal digits. */ case '0': c -= '0'; while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7') c = c * 8 + *(++ptr) - '0'; - c &= 255; /* Take least significant 8 bits */ + if (!utf8 && c > 255) *errorcodeptr = ERR51; break; - /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number - which can be greater than 0xff, but only if the ddd are hex digits. */ + /* \x is complicated. \x{ddd} is a character number which can be greater + than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is + treated as a data character. 
*/ case 'x': -#ifdef SUPPORT_UTF8 - if (ptr[1] == '{' && (options & PCRE_UTF8) != 0) + if (ptr[1] == '{') { const uschar *pt = ptr + 2; - register int count = 0; + int count = 0; + c = 0; while ((digitab[*pt] & ctype_xdigit) != 0) { - int cc = *pt++; + register int cc = *pt++; + if (c == 0 && cc == '0') continue; /* Leading zeroes */ count++; + #if !EBCDIC /* ASCII coding */ if (cc >= 'a') cc -= 32; /* Convert to upper case */ - c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10)); + c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10)); #else /* EBCDIC coding */ if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */ - c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10)); + c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10)); #endif } + if (*pt == '}') { - if (c < 0 || count > 8) *errorcodeptr = ERR34; + if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34; ptr = pt; break; } + /* If the sequence of hex digits does not end with '}', then we don't recognize this construct; fall through to the normal \x handling. */ } -#endif - /* Read just a single hex char */ + /* Read just a single-byte hex-defined char */ c = 0; while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0) @@ -569,25 +591,26 @@ escape sequence. 
Argument: ptrptr points to the pattern position pointer negptr points to a boolean that is set TRUE for negation else FALSE + dptr points to an int that is set to the detailed property value errorcodeptr points to the error code variable -Returns: value from ucp_type_table, or -1 for an invalid type +Returns: type value from ucp_type_table, or -1 for an invalid type */ static int -get_ucp(const uschar **ptrptr, BOOL *negptr, int *errorcodeptr) +get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr) { int c, i, bot, top; const uschar *ptr = *ptrptr; -char name[4]; +char name[32]; c = *(++ptr); if (c == 0) goto ERROR_RETURN; *negptr = FALSE; -/* \P or \p can be followed by a one- or two-character name in {}, optionally -preceded by ^ for negation. */ +/* \P or \p can be followed by a name in {}, optionally preceded by ^ for +negation. */ if (c == '{') { @@ -596,18 +619,14 @@ if (c == '{') *negptr = TRUE; ptr++; } - for (i = 0; i <= 2; i++) + for (i = 0; i < sizeof(name) - 1; i++) { c = *(++ptr); if (c == 0) goto ERROR_RETURN; if (c == '}') break; name[i] = c; } - if (c !='}') /* Try to distinguish error cases */ - { - while (*(++ptr) != 0 && *ptr != '}'); - if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN; - } + if (c !='}') goto ERROR_RETURN; name[i] = 0; } @@ -628,13 +647,16 @@ top = _pcre_utt_size; while (bot < top) { - i = (bot + top)/2; + i = (bot + top) >> 1; c = strcmp(name, _pcre_utt[i].name); - if (c == 0) return _pcre_utt[i].value; + if (c == 0) + { + *dptr = _pcre_utt[i].value; + return _pcre_utt[i].type; + } if (c > 0) bot = i + 1; else top = i; } -UNKNOWN_RETURN: *errorcodeptr = ERR47; *ptrptr = ptr; return -1; @@ -749,6 +771,48 @@ return p; +/************************************************* +* Find forward referenced named subpattern * +*************************************************/ + +/* This function scans along a pattern looking for capturing subpatterns, and +counting them. 
If it finds a named pattern that matches the name it is given, +it returns its number. This is used for forward references to named +subpatterns. We know that if (?P< is encountered, the name will be terminated +by '>' because that is checked in the first pass. + +Arguments: + pointer current position in the pattern + count current count of capturing parens + name name to seek + namelen name length + +Returns: the number of the named subpattern, or -1 if not found +*/ + +static int +find_named_parens(const uschar *ptr, int count, const uschar *name, int namelen) +{ +const uschar *thisname; +for (; *ptr != 0; ptr++) + { + if (*ptr == '\\' && ptr[1] != 0) { ptr++; continue; } + if (*ptr != '(') continue; + if (ptr[1] != '?') { count++; continue; } + if (ptr[2] == '(') { ptr += 2; continue; } + if (ptr[2] != 'P' || ptr[3] != '<') continue; + count++; + ptr += 4; + thisname = ptr; + while (*ptr != '>') ptr++; + if (namelen == ptr - thisname && strncmp(name, thisname, namelen) == 0) + return count; + } +return -1; +} + + + /************************************************* * Find first significant op code * *************************************************/ @@ -904,6 +968,7 @@ for (;;) case OP_CHAR: case OP_CHARNC: + case OP_NOT: branchlength++; cc += 2; #ifdef SUPPORT_UTF8 @@ -937,7 +1002,7 @@ for (;;) case OP_PROP: case OP_NOTPROP: - cc++; + cc += 2; /* Fall through */ case OP_NOT_DIGIT: @@ -1018,14 +1083,19 @@ Returns: pointer to the opcode for the bracket, or NULL if not found static const uschar * find_bracket(const uschar *code, BOOL utf8, int number) { -#ifndef SUPPORT_UTF8 -utf8 = utf8; /* Stop pedantic compilers complaining */ -#endif - for (;;) { register int c = *code; if (c == OP_END) return NULL; + + /* XCLASS is used for classes that cannot be represented just by a bit + map. This includes negated single high-valued characters. The length in + the table is zero; the actual length is stored in the compiled code. 
*/ + + if (c == OP_XCLASS) code += GET(code, 1); + + /* Handle bracketed group */ + else if (c > OP_BRA) { int n = c - OP_BRA; @@ -1033,17 +1103,16 @@ for (;;) if (n == number) return (uschar *)code; code += _pcre_OP_lengths[OP_BRA]; } + + /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes + that are followed by a character may be followed by a multi-byte character. + The length in the table is a minimum, so we have to scan along to skip the + extra bytes. All opcodes are less than 128, so we can use relatively + efficient code. */ + else { code += _pcre_OP_lengths[c]; - -#ifdef SUPPORT_UTF8 - - /* In UTF-8 mode, opcodes that are followed by a character may be followed - by a multi-byte character. The length in the table is a minimum, so we have - to scan along to skip the extra bytes. All opcodes are less than 128, so we - can use relatively efficient code. */ - if (utf8) switch(c) { case OP_CHAR: @@ -1059,16 +1128,7 @@ for (;;) case OP_MINQUERY: while ((*code & 0xc0) == 0x80) code++; break; - - /* XCLASS is used for classes that cannot be represented just by a bit - map. This includes negated single high-valued characters. The length in - the table is zero; the actual length is stored in the compiled code. */ - - case OP_XCLASS: - code += GET(code, 1) + 1; - break; } -#endif } } } @@ -1092,30 +1152,34 @@ Returns: pointer to the opcode for OP_RECURSE, or NULL if not found static const uschar * find_recurse(const uschar *code, BOOL utf8) { -#ifndef SUPPORT_UTF8 -utf8 = utf8; /* Stop pedantic compilers complaining */ -#endif - for (;;) { register int c = *code; if (c == OP_END) return NULL; - else if (c == OP_RECURSE) return code; + if (c == OP_RECURSE) return code; + + /* XCLASS is used for classes that cannot be represented just by a bit + map. This includes negated single high-valued characters. The length in + the table is zero; the actual length is stored in the compiled code. 
*/ + + if (c == OP_XCLASS) code += GET(code, 1); + + /* All bracketed groups have the same length. */ + else if (c > OP_BRA) { code += _pcre_OP_lengths[OP_BRA]; } + + /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes + that are followed by a character may be followed by a multi-byte character. + The length in the table is a minimum, so we have to scan along to skip the + extra bytes. All opcodes are less than 128, so we can use relatively + efficient code. */ + else { code += _pcre_OP_lengths[c]; - -#ifdef SUPPORT_UTF8 - - /* In UTF-8 mode, opcodes that are followed by a character may be followed - by a multi-byte character. The length in the table is a minimum, so we have - to scan along to skip the extra bytes. All opcodes are less than 128, so we - can use relatively efficient code. */ - if (utf8) switch(c) { case OP_CHAR: @@ -1131,16 +1195,7 @@ for (;;) case OP_MINQUERY: while ((*code & 0xc0) == 0x80) code++; break; - - /* XCLASS is used for classes that cannot be represented just by a bit - map. This includes negated single high-valued characters. The length in - the table is zero; the actual length is stored in the compiled code. 
*/ - - case OP_XCLASS: - code += GET(code, 1) + 1; - break; } -#endif } } } @@ -1497,13 +1552,10 @@ Yield: TRUE when range returned; FALSE when no more static BOOL get_othercase_range(int *cptr, int d, int *ocptr, int *odptr) { -int c, chartype, othercase, next; +int c, othercase, next; for (c = *cptr; c <= d; c++) - { - if (_pcre_ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0) - break; - } + { if ((othercase = _pcre_ucp_othercase(c)) >= 0) break; } if (c > d) return FALSE; @@ -1512,9 +1564,7 @@ next = othercase + 1; for (++c; c <= d; c++) { - if (_pcre_ucp_findchar(c, &chartype, &othercase) != ucp_L || - othercase != next) - break; + if (_pcre_ucp_othercase(c) != next) break; next++; } @@ -1561,7 +1611,6 @@ int greedy_default, greedy_non_default; int firstbyte, reqbyte; int zeroreqbyte, zerofirstbyte; int req_caseopt, reqvary, tempreqvary; -int condcount = 0; int options = *optionsptr; int after_manual_callout = 0; register int c; @@ -1675,10 +1724,14 @@ for (;; ptr++) if ((cd->ctypes[c] & ctype_space) != 0) continue; if (c == '#') { - /* The space before the ; is to avoid a warning on a silly compiler - on the Macintosh. */ - while ((c = *(++ptr)) != 0 && c != NEWLINE) ; - if (c != 0) continue; /* Else fall through to handle end of string */ + while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) break; + if (*ptr != 0) + { + ptr += cd->nllen - 1; + continue; + } + /* Else fall through to handle end of string */ + c = 0; } } @@ -1731,11 +1784,11 @@ for (;; ptr++) *code++ = OP_ANY; break; - /* Character classes. If the included characters are all < 255 in value, we - build a 32-byte bitmap of the permitted characters, except in the special - case where there is only one such character. For negated classes, we build - the map as usual, then invert it at the end. However, we use a different - opcode so that data characters > 255 can be handled correctly. + /* Character classes. 
If the included characters are all < 256, we build a + 32-byte bitmap of the permitted characters, except in the special case + where there is only one such character. For negated classes, we build the + map as usual, then invert it at the end. However, we use a different opcode + so that data characters > 255 can be handled correctly. If the class contains characters outside the 0-255 range, a different opcode is compiled. It may optionally have a bit map for characters < 256, @@ -1826,8 +1879,9 @@ for (;; ptr++) check_posix_syntax(ptr, &tempptr, cd)) { BOOL local_negate = FALSE; - int posix_class, i; + int posix_class, taboffset, tabopt; register const uschar *cbits = cd->cbits; + uschar pbits[32]; if (ptr[1] != ':') { @@ -1856,31 +1910,45 @@ for (;; ptr++) if ((options & PCRE_CASELESS) != 0 && posix_class <= 2) posix_class = 0; - /* Or into the map we are building up to 3 of the static class - tables, or their negations. The [:blank:] class sets up the same - chars as the [:space:] class (all white space). We remove the vertical - white space chars afterwards. */ + /* We build the bit map for the POSIX class in a chunk of local store + because we may be adding and subtracting from it, and we don't want to + subtract bits that may be in the main map already. At the end we or the + result into the bit map that is being built. */ posix_class *= 3; - for (i = 0; i < 3; i++) - { - BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0; - int taboffset = posix_class_maps[posix_class + i]; - if (taboffset < 0) break; - if (local_negate) + + /* Copy in the first table (always present) */ + + memcpy(pbits, cbits + posix_class_maps[posix_class], + 32 * sizeof(uschar)); + + /* If there is a second table, add or remove it as required. 
*/ + + taboffset = posix_class_maps[posix_class + 1]; + tabopt = posix_class_maps[posix_class + 2]; + + if (taboffset >= 0) { - if (i == 0) - for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset]; + if (tabopt >= 0) + for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset]; else - for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset]; - if (blankclass) classbits[1] |= 0x3c; + for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset]; } + + /* Not see if we need to remove any special characters. An option + value of 1 removes vertical space and 2 removes underscore. */ + + if (tabopt < 0) tabopt = -tabopt; + if (tabopt == 1) pbits[1] &= ~0x3c; + else if (tabopt == 2) pbits[11] &= 0x7f; + + /* Add the POSIX table or its complement into the main table that is + being built and we are done. */ + + if (local_negate) + for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c]; else - { - for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset]; - if (blankclass) classbits[1] &= ~0x3c; - } - } + for (c = 0; c < 32; c++) classbits[c] |= pbits[c]; ptr = tempptr + 1; class_charcount = 10; /* Set > 1; assumes more than 1 per class */ @@ -1948,12 +2016,14 @@ for (;; ptr++) case ESC_P: { BOOL negated; - int property = get_ucp(&ptr, &negated, errorcodeptr); - if (property < 0) goto FAILED; + int pdata; + int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr); + if (ptype < 0) goto FAILED; class_utf8 = TRUE; *class_utf8data++ = ((-c == ESC_p) != negated)? 
XCL_PROP : XCL_NOTPROP; - *class_utf8data++ = property; + *class_utf8data++ = ptype; + *class_utf8data++ = pdata; class_charcount -= 2; /* Not a < 256 character */ } continue; @@ -2135,10 +2205,8 @@ for (;; ptr++) #ifdef SUPPORT_UCP if ((options & PCRE_CASELESS) != 0) { - int chartype; int othercase; - if (_pcre_ucp_findchar(c, &chartype, &othercase) >= 0 && - othercase > 0) + if ((othercase = _pcre_ucp_othercase(c)) >= 0) { *class_utf8data++ = XCL_SINGLE; class_utf8data += _pcre_ord2utf8(othercase, class_utf8data); @@ -2423,13 +2491,17 @@ for (;; ptr++) else if (*previous < OP_EODN) { uschar *oldcode; - int prop_type; + int prop_type, prop_value; op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */ c = *previous; OUTPUT_SINGLE_REPEAT: - prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)? - previous[1] : -1; + if (*previous == OP_PROP || *previous == OP_NOTPROP) + { + prop_type = previous[1]; + prop_value = previous[2]; + } + else prop_type = prop_value = -1; oldcode = code; code = previous; /* Usually overwrite previous item */ @@ -2490,7 +2562,7 @@ for (;; ptr++) /* If the maximum is unlimited, insert an OP_STAR. Before doing so, we have to insert the character for the previous code. For a repeated - Unicode property match, there is an extra byte that defines the + Unicode property match, there are two extra bytes that define the required property. In UTF-8 mode, long characters have their length in c, with the 0x80 bit as a flag. 
*/ @@ -2506,7 +2578,11 @@ for (;; ptr++) #endif { *code++ = c; - if (prop_type >= 0) *code++ = prop_type; + if (prop_type >= 0) + { + *code++ = prop_type; + *code++ = prop_value; + } } *code++ = OP_STAR + repeat_type; } @@ -2525,7 +2601,11 @@ for (;; ptr++) else #endif *code++ = c; - if (prop_type >= 0) *code++ = prop_type; + if (prop_type >= 0) + { + *code++ = prop_type; + *code++ = prop_value; + } repeat_max -= repeat_min; *code++ = OP_UPTO + repeat_type; PUT2INC(code, 0, repeat_max); @@ -2544,11 +2624,15 @@ for (;; ptr++) #endif *code++ = c; - /* For a repeated Unicode property match, there is an extra byte that - defines the required property. */ + /* For a repeated Unicode property match, there are two extra bytes that + define the required property. */ #ifdef SUPPORT_UCP - if (prop_type >= 0) *code++ = prop_type; + if (prop_type >= 0) + { + *code++ = prop_type; + *code++ = prop_value; + } #endif } @@ -2812,37 +2896,91 @@ for (;; ptr++) case '(': bravalue = OP_COND; /* Conditional group */ - /* Condition to test for recursion */ + /* A condition can be a number, referring to a numbered group, a name, + referring to a named group, 'R', referring to recursion, or an + assertion. There are two unfortunate ambiguities, caused by history. + (a) 'R' can be the recursive thing or the name 'R', and (b) a number + could be a name that consists of digits. In both cases, we look for a + name first; if not found, we try the other cases. If the first + character after (?( is a word character, we know the rest up to ) will + also be word characters because the syntax was checked in the first + pass. */ - if (ptr[1] == 'R') + if ((cd->ctypes[ptr[1]] & ctype_word) != 0) { - code[1+LINK_SIZE] = OP_CREF; - PUT2(code, 2+LINK_SIZE, CREF_RECURSE); + int i, namelen; + int condref = 0; + const uschar *name; + uschar *slot = cd->name_table; + + /* This is needed for all successful cases. */ + skipbytes = 3; - ptr += 3; - } - /* Condition to test for a numbered subpattern match. 
We know that - if a digit follows ( then there will just be digits until ) because - the syntax was checked in the first pass. */ + /* Read the name, but also get it as a number if it's all digits */ - else if ((digitab[ptr[1]] && ctype_digit) != 0) - { - int condref; /* Don't amalgamate; some compilers */ - condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */ - while (*(++ptr) != ')') condref = condref*10 + *ptr - '0'; - if (condref == 0) + name = ++ptr; + while (*ptr != ')') { - *errorcodeptr = ERR35; - goto FAILED; + if (condref >= 0) + condref = ((digitab[*ptr] & ctype_digit) != 0)? + condref * 10 + *ptr - '0' : -1; + ptr++; } + namelen = ptr - name; ptr++; + + for (i = 0; i < cd->names_found; i++) + { + if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break; + slot += cd->name_entry_size; + } + + /* Found a previous named subpattern */ + + if (i < cd->names_found) + { + condref = GET2(slot, 0); + code[1+LINK_SIZE] = OP_CREF; + PUT2(code, 2+LINK_SIZE, condref); + } + + /* Search the pattern for a forward reference */ + + else if ((i = find_named_parens(ptr, *brackets, name, namelen)) > 0) + { + code[1+LINK_SIZE] = OP_CREF; + PUT2(code, 2+LINK_SIZE, i); + } + + /* Check for 'R' for recursion */ + + else if (namelen == 1 && *name == 'R') + { + code[1+LINK_SIZE] = OP_CREF; + PUT2(code, 2+LINK_SIZE, CREF_RECURSE); + } + + /* Check for a subpattern number */ + + else if (condref > 0) + { code[1+LINK_SIZE] = OP_CREF; PUT2(code, 2+LINK_SIZE, condref); - skipbytes = 3; } + + /* Either an unidentified subpattern, or a reference to (?(0) */ + + else + { + *errorcodeptr = (condref == 0)? ERR35: ERR15; + goto FAILED; + } + } + /* For conditions that are assertions, we just fall through, having set bravalue above. 
*/ + break; case '=': /* Positive lookahead */ @@ -2914,10 +3052,13 @@ for (;; ptr++) { if (slot[2+namelen] == 0) { + if ((options & PCRE_DUPNAMES) == 0) + { *errorcodeptr = ERR43; goto FAILED; } - crc = -1; /* Current name is substring */ + } + else crc = -1; /* Current name is substring */ } if (crc < 0) { @@ -2950,14 +3091,18 @@ for (;; ptr++) if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break; slot += cd->name_entry_size; } - if (i >= cd->names_found) + + if (i < cd->names_found) /* Back reference */ + { + recno = GET2(slot, 0); + } + else if ((recno = /* Forward back reference */ + find_named_parens(ptr, *brackets, name, namelen)) <= 0) { *errorcodeptr = ERR15; goto FAILED; } - recno = GET2(slot, 0); - if (type == '>') goto HANDLE_RECURSION; /* A few lines below */ /* Back reference */ @@ -2997,9 +3142,8 @@ for (;; ptr++) regex in case it doesn't exist. */ *code = OP_END; - called = (recno == 0)? - cd->start_code : find_bracket(cd->start_code, utf8, recno); - + called = (recno == 0)? cd->start_code : + find_bracket(cd->start_code, utf8, recno); if (called == NULL) { *errorcodeptr = ERR15; @@ -3016,11 +3160,20 @@ for (;; ptr++) goto FAILED; } - /* Insert the recursion/subroutine item */ + /* Insert the recursion/subroutine item, automatically wrapped inside + "once" brackets. 
*/ + + *code = OP_ONCE; + PUT(code, 1, 2 + 2*LINK_SIZE); + code += 1 + LINK_SIZE; *code = OP_RECURSE; PUT(code, 1, called - cd->start_code); code += 1 + LINK_SIZE; + + *code = OP_KET; + PUT(code, 1, 2 + 2*LINK_SIZE); + code += 1 + LINK_SIZE; } continue; @@ -3037,6 +3190,7 @@ for (;; ptr++) case '-': optset = &unset; break; case 'i': *optset |= PCRE_CASELESS; break; + case 'J': *optset |= PCRE_DUPNAMES; break; case 'm': *optset |= PCRE_MULTILINE; break; case 's': *optset |= PCRE_DOTALL; break; case 'x': *optset |= PCRE_EXTENDED; break; @@ -3153,7 +3307,7 @@ for (;; ptr++) else if (bravalue == OP_COND) { uschar *tc = code; - condcount = 0; + int condcount = 0; do { condcount++; @@ -3290,10 +3444,12 @@ for (;; ptr++) else if (-c == ESC_P || -c == ESC_p) { BOOL negated; - int value = get_ucp(&ptr, &negated, errorcodeptr); + int pdata; + int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr); previous = code; *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP; - *code++ = value; + *code++ = ptype; + *code++ = pdata; } #endif @@ -3848,7 +4004,7 @@ Returns: pointer to compiled data block, or NULL on error, with errorptr and erroroffset set */ -PCRE_EXPORT pcre * +PCRE_DATA_SCOPE pcre * pcre_compile(const char *pattern, int options, const char **errorptr, int *erroroffset, const unsigned char *tables) { @@ -3856,13 +4012,14 @@ return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables); } -PCRE_EXPORT pcre * + +PCRE_DATA_SCOPE pcre * pcre_compile2(const char *pattern, int options, int *errorcodeptr, const char **errorptr, int *erroroffset, const unsigned char *tables) { real_pcre *re; int length = 1 + LINK_SIZE; /* For initial BRA plus length */ -int c, firstbyte, reqbyte; +int c, firstbyte, reqbyte, newline; int bracount = 0; int branch_extra = 0; int branch_newextra; @@ -3883,6 +4040,7 @@ uschar *code; const uschar *codestart; const uschar *ptr; compile_data compile_block; +compile_data *cd = &compile_block; int brastack[BRASTACK_SIZE]; uschar 
bralenstack[BRASTACK_SIZE]; @@ -3936,18 +4094,42 @@ if ((options & ~PUBLIC_OPTIONS) != 0) /* Set up pointers to the individual character tables */ if (tables == NULL) tables = _pcre_default_tables; -compile_block.lcc = tables + lcc_offset; -compile_block.fcc = tables + fcc_offset; -compile_block.cbits = tables + cbits_offset; -compile_block.ctypes = tables + ctypes_offset; +cd->lcc = tables + lcc_offset; +cd->fcc = tables + fcc_offset; +cd->cbits = tables + cbits_offset; +cd->ctypes = tables + ctypes_offset; + +/* Handle different types of newline. The two bits give four cases. The current +code allows for one- or two-byte sequences. */ + +switch (options & PCRE_NEWLINE_CRLF) + { + default: newline = NEWLINE; break; /* Compile-time default */ + case PCRE_NEWLINE_CR: newline = '\r'; break; + case PCRE_NEWLINE_LF: newline = '\n'; break; + case PCRE_NEWLINE_CR+ + PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break; + } + +if (newline > 255) + { + cd->nllen = 2; + cd->nl[0] = (newline >> 8) & 255; + cd->nl[1] = newline & 255; + } +else + { + cd->nllen = 1; + cd->nl[0] = newline; + } /* Maximum back reference and backref bitmap. This is updated for numeric references during the first pass, but for named references during the actual compile pass. The bitmap records up to 31 back references to help in deciding whether (.*) can be treated as anchored or not. */ -compile_block.top_backref = 0; -compile_block.backref_map = 0; +cd->top_backref = 0; +cd->backref_map = 0; /* Reflect pattern for debugging output */ @@ -3981,15 +4163,17 @@ while ((c = *(++ptr)) != 0) if ((options & PCRE_EXTENDED) != 0) { - if ((compile_block.ctypes[c] & ctype_space) != 0) continue; + if ((cd->ctypes[c] & ctype_space) != 0) continue; if (c == '#') { - /* The space before the ; is to avoid a warning on a silly compiler - on the Macintosh. 
*/ - while ((c = *(++ptr)) != 0 && c != NEWLINE) ; - if (c == 0) break; + while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) break; + if (*ptr != 0) + { + ptr += cd->nllen - 1; continue; } + break; /* End loop at end of pattern */ + } } item_count++; /* Is zero for the first non-comment item */ @@ -4049,15 +4233,17 @@ while ((c = *(++ptr)) != 0) #endif /* \P and \p are for Unicode properties, but only when the support has - been compiled. Each item needs 2 bytes. */ + been compiled. Each item needs 3 bytes. */ else if (-c == ESC_P || -c == ESC_p) { #ifdef SUPPORT_UCP BOOL negated; - length += 2; - lastitemlength = 2; - if (get_ucp(&ptr, &negated, &errorcode) < 0) goto PCRE_ERROR_RETURN; + BOOL pdata; + length += 3; + lastitemlength = 3; + if (get_ucp(&ptr, &negated, &pdata, &errorcode) < 0) + goto PCRE_ERROR_RETURN; continue; #else errorcode = ERR45; @@ -4076,9 +4262,9 @@ while ((c = *(++ptr)) != 0) if (c <= -ESC_REF) { int refnum = -c - ESC_REF; - compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1; - if (refnum > compile_block.top_backref) - compile_block.top_backref = refnum; + cd->backref_map |= (refnum < 32)? (1 << refnum) : 1; + if (refnum > cd->top_backref) + cd->top_backref = refnum; length += 2; /* For single back reference */ if (ptr[1] == '{' && is_counted_repeat(ptr+2)) { @@ -4223,7 +4409,7 @@ while ((c = *(++ptr)) != 0) class_utf8 = TRUE; length += LINK_SIZE + 2; } - length += 2; + length += 3; } #endif } @@ -4232,7 +4418,9 @@ while ((c = *(++ptr)) != 0) /* Check the syntax for POSIX stuff. The bits we actually handle are checked during the real compile phase. */ - else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block)) + else if (*ptr == '[' && + (ptr[1] == ':' || ptr[1] == '.' 
|| ptr[1] == '=') && + check_posix_syntax(ptr, &ptr, cd)) { ptr++; class_optcount = 10; /* Make sure > 1 */ @@ -4465,6 +4653,61 @@ while ((c = *(++ptr)) != 0) ptr += 2; break; + /* Named subpatterns are an extension copied from Python */ + + case 'P': + ptr += 3; + + /* Handle the definition of a named subpattern */ + + if (*ptr == '<') + { + const uschar *p; /* Don't amalgamate; some compilers */ + p = ++ptr; /* grumble at autoincrement in declaration */ + while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++; + if (*ptr != '>') + { + errorcode = ERR42; + goto PCRE_ERROR_RETURN; + } + name_count++; + if (name_count > MAX_NAME_COUNT) + { + errorcode = ERR49; + goto PCRE_ERROR_RETURN; + } + if (ptr - p > max_name_size) + { + max_name_size = (ptr - p); + if (max_name_size > MAX_NAME_SIZE) + { + errorcode = ERR48; + goto PCRE_ERROR_RETURN; + } + } + capturing = TRUE; /* Named parentheses are always capturing */ + break; /* Go handle capturing parentheses */ + } + + /* Handle back references and recursive calls to named subpatterns */ + + if (*ptr == '=' || *ptr == '>') + { + length += 3 + 3*LINK_SIZE; /* Allow for the automatic "once" */ + while ((cd->ctypes[*(++ptr)] & ctype_word) != 0); + if (*ptr != ')') + { + errorcode = ERR42; + goto PCRE_ERROR_RETURN; + } + goto RECURSE_CHECK_QUANTIFIED; + } + + /* Unknown character after (?P */ + + errorcode = ERR41; + goto PCRE_ERROR_RETURN; + /* (?R) specifies a recursive call to the regex, which is an extension to provide the facility which can be obtained by (?p{perl-code}) in Perl 5.6. In Perl 5.8 this has become (??{perl-code}). @@ -4486,12 +4729,14 @@ while ((c = *(++ptr)) != 0) errorcode = ERR29; goto PCRE_ERROR_RETURN; } - length += 1 + LINK_SIZE; + length += 3 + 3*LINK_SIZE; /* Allows for the automatic "once" */ /* If this item is quantified, it will get wrapped inside brackets so as to use the code for quantified brackets. We jump down and use the - code that handles this for real brackets. 
*/ + code that handles this for real brackets. Come here from code for + named recursions/subroutines. */ + RECURSE_CHECK_QUANTIFIED: if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{') { length += 2 + 2 * LINK_SIZE; /* to make bracketed */ @@ -4515,47 +4760,6 @@ while ((c = *(++ptr)) != 0) length += 2 + 2*LINK_SIZE; continue; - /* Named subpatterns are an extension copied from Python */ - - case 'P': - ptr += 3; - - /* Handle the definition of a named subpattern */ - - if (*ptr == '<') - { - const uschar *p; /* Don't amalgamate; some compilers */ - p = ++ptr; /* grumble at autoincrement in declaration */ - while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++; - if (*ptr != '>') - { - errorcode = ERR42; - goto PCRE_ERROR_RETURN; - } - name_count++; - if (ptr - p > max_name_size) max_name_size = (ptr - p); - capturing = TRUE; /* Named parentheses are always capturing */ - break; - } - - /* Handle back references and recursive calls to named subpatterns */ - - if (*ptr == '=' || *ptr == '>') - { - while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0); - if (*ptr != ')') - { - errorcode = ERR42; - goto PCRE_ERROR_RETURN; - } - break; - } - - /* Unknown character after (?P */ - - errorcode = ERR41; - goto PCRE_ERROR_RETURN; - /* Lookbehinds are in Perl from version 5.005 */ case '<': @@ -4571,19 +4775,17 @@ while ((c = *(++ptr)) != 0) /* Conditionals are in Perl from version 5.005. The bracket must either be followed by a number (for bracket reference) or by an assertion - group, or (a PCRE extension) by 'R' for a recursion test. */ + group. PCRE extends this by allowing a name to reference a named group; + unfortunately, previously 'R' was implemented for a recursion test. + When this is compiled, we look for the named group 'R' first. At this + point we just do a basic syntax check. 
*/ case '(': - if (ptr[3] == 'R' && ptr[4] == ')') + if ((cd->ctypes[ptr[3]] & ctype_word) != 0) { ptr += 4; length += 3; - } - else if ((digitab[ptr[3]] & ctype_digit) != 0) - { - ptr += 4; - length += 3; - while ((digitab[*ptr] & ctype_digit) != 0) ptr++; + while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++; if (*ptr != ')') { errorcode = ERR26; @@ -4622,6 +4824,11 @@ while ((c = *(++ptr)) != 0) *optset |= PCRE_CASELESS; continue; + case 'J': + *optset |= PCRE_DUPNAMES; + options |= PCRE_JCHANGED; /* Record that it changed */ + continue; + case 'm': *optset |= PCRE_MULTILINE; continue; @@ -4687,16 +4894,13 @@ while ((c = *(++ptr)) != 0) will lead to an over-estimate on the length, but this shouldn't matter very much. We also have to allow for resetting options at the start of any alternations, which we do by setting - branch_newextra to 2. Finally, we record whether the case-dependent - flag ever changes within the regex. This is used by the "required - character" code. */ + branch_newextra to 2. */ case ':': if (((set|unset) & PCRE_IMS) != 0) { length += 4; branch_newextra = 2; - if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED; } goto END_OPTIONS; @@ -4776,6 +4980,12 @@ while ((c = *(++ptr)) != 0) { duplength = length - brastack[--brastackptr]; branch_extra = bralenstack[brastackptr]; + /* This is a paranoid check to stop integer overflow later on */ + if (duplength > MAX_DUPLENGTH) + { + errorcode = ERR50; + goto PCRE_ERROR_RETURN; + } } else duplength = 0; @@ -4880,7 +5090,8 @@ if (length > MAX_PATTERN_SIZE) } /* Compute the size of data block needed and get it, either from malloc or -externally provided function. */ +externally provided function. Integer overflow should no longer be possible +because nowadays we limit the maximum value of name_count and max_name size. 
*/ size = length + sizeof(real_pcre) + name_count * (max_name_size + 3); re = (real_pcre *)(pcre_malloc)(size); @@ -4910,14 +5121,14 @@ re->nullpad = NULL; /* The starting points of the name/number translation table and of the code are passed around in the compile data block. */ -compile_block.names_found = 0; -compile_block.name_entry_size = max_name_size + 3; -compile_block.name_table = (uschar *)re + re->name_table_offset; -codestart = compile_block.name_table + re->name_entry_size * re->name_count; -compile_block.start_code = codestart; -compile_block.start_pattern = (const uschar *)pattern; -compile_block.req_varyopt = 0; -compile_block.nopartial = FALSE; +cd->names_found = 0; +cd->name_entry_size = max_name_size + 3; +cd->name_table = (uschar *)re + re->name_table_offset; +codestart = cd->name_table + re->name_entry_size * re->name_count; +cd->start_code = codestart; +cd->start_pattern = (const uschar *)pattern; +cd->req_varyopt = 0; +cd->nopartial = FALSE; /* Set up a starting, non-extracting bracket, then compile the expression. On error, errorcode will be set non-zero, so we don't need to look at the result @@ -4928,11 +5139,11 @@ code = (uschar *)codestart; *code = OP_BRA; bracount = 0; (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr, - &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block); + &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd); re->top_bracket = bracount; -re->top_backref = compile_block.top_backref; +re->top_backref = cd->top_backref; -if (compile_block.nopartial) re->options |= PCRE_NOPARTIAL; +if (cd->nopartial) re->options |= PCRE_NOPARTIAL; /* If not reached end of pattern on success, there's an excess bracket. */ @@ -4978,7 +5189,7 @@ start with ^. and also when all branches start with .* for non-DOTALL matches. 
if ((options & PCRE_ANCHORED) == 0) { int temp_options = options; - if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map)) + if (is_anchored(codestart, &temp_options, 0, cd->backref_map)) re->options |= PCRE_ANCHORED; else { @@ -4988,10 +5199,10 @@ if ((options & PCRE_ANCHORED) == 0) { int ch = firstbyte & 255; re->first_byte = ((firstbyte & REQ_CASELESS) != 0 && - compile_block.fcc[ch] == ch)? ch : firstbyte; + cd->fcc[ch] == ch)? ch : firstbyte; re->options |= PCRE_FIRSTSET; } - else if (is_startline(codestart, 0, compile_block.backref_map)) + else if (is_startline(codestart, 0, cd->backref_map)) re->options |= PCRE_STARTLINE; } } @@ -5005,7 +5216,7 @@ if (reqbyte >= 0 && { int ch = reqbyte & 255; re->req_byte = ((reqbyte & REQ_CASELESS) != 0 && - compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte; + cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte; re->options |= PCRE_REQCHSET; } @@ -5019,11 +5230,10 @@ printf("Length = %d top_bracket = %d top_backref = %d\n", if (re->options != 0) { - printf("%s%s%s%s%s%s%s%s%s%s\n", + printf("%s%s%s%s%s%s%s%s%s\n", ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "", ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "", ((re->options & PCRE_CASELESS) != 0)? "caseless " : "", - ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "", ((re->options & PCRE_EXTENDED) != 0)? "extended " : "", ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "", ((re->options & PCRE_DOTALL) != 0)? "dotall " : "", @@ -5035,7 +5245,8 @@ if (re->options != 0) if ((re->options & PCRE_FIRSTSET) != 0) { int ch = re->first_byte & 255; - const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)"; + const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? 
+ "" : " (caseless)"; if (isprint(ch)) printf("First char = %c%s\n", ch, caseless); else printf("First char = \\x%02x%s\n", ch, caseless); } @@ -5043,7 +5254,8 @@ if ((re->options & PCRE_FIRSTSET) != 0) if ((re->options & PCRE_REQCHSET) != 0) { int ch = re->req_byte & 255; - const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)"; + const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? + "" : " (caseless)"; if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless); else printf("Req char = \\x%02x%s\n", ch, caseless); } diff --git a/libpcre/pcre_config.c b/libpcre/pcre_config.c index 5538a70a3..29e6c1a35 100644 --- a/libpcre/pcre_config.c +++ b/libpcre/pcre_config.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2005 University of Cambridge + Copyright (c) 1997-2006 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -58,7 +58,7 @@ Arguments: Returns: 0 if data returned, negative on error */ -PCRE_EXPORT int +PCRE_DATA_SCOPE int pcre_config(int what, void *where) { switch (what) @@ -95,6 +95,10 @@ switch (what) *((unsigned int *)where) = MATCH_LIMIT; break; + case PCRE_CONFIG_MATCH_LIMIT_RECURSION: + *((unsigned int *)where) = MATCH_LIMIT_RECURSION; + break; + case PCRE_CONFIG_STACKRECURSE: #ifdef NO_RECURSE *((int *)where) = 0; diff --git a/libpcre/pcre_dfa_exec.c b/libpcre/pcre_dfa_exec.c index c68f2329f..5396d633b 100644 --- a/libpcre/pcre_dfa_exec.c +++ b/libpcre/pcre_dfa_exec.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. 
Written by Philip Hazel - Copyright (c) 1997-2005 University of Cambridge + Copyright (c) 1997-2006 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -43,6 +43,7 @@ alternative matching function that uses a DFA algorithm. This is NOT Perl- compatible, but it has advantages in certain applications. */ +#define NLBLOCK md /* The block containing newline information */ #include "pcre_internal.h" @@ -288,7 +289,9 @@ const uschar *start_subject = md->start_subject; const uschar *end_subject = md->end_subject; const uschar *start_code = md->start_code; +#ifdef SUPPORT_UTF8 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0; +#endif rlevel++; offsetcount &= (-2); @@ -421,7 +424,8 @@ ptr = current_subject; for (;;) { int i, j; - int c, d, clen, dlen; + int clen, dlen; + unsigned int c, d; /* Make the new state list into the active state list and empty the new state list. */ @@ -480,7 +484,7 @@ for (;;) const uschar *code; int state_offset = current_state->offset; int count, codevalue; - int chartype, othercase; + int chartype, script; #ifdef DEBUG printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset); @@ -645,7 +649,10 @@ for (;;) /*-----------------------------------------------------------------*/ case OP_CIRC: if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) || - ((ims & PCRE_MULTILINE) != 0 && ptr[-1] == NEWLINE)) + ((ims & PCRE_MULTILINE) != 0 && + ptr >= start_subject + md->nllen && + ptr != end_subject && + IS_NEWLINE(ptr - md->nllen))) { ADD_ACTIVE(state_offset + 1, 0); } break; @@ -679,13 +686,16 @@ for (;;) /*-----------------------------------------------------------------*/ case OP_ANY: - if (clen > 0 && (c != NEWLINE || (ims & PCRE_DOTALL) != 0)) + if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || + ptr > end_subject - md->nllen || + !IS_NEWLINE(ptr))) { ADD_NEW(state_offset + 1, 0); } break; 
/*-----------------------------------------------------------------*/ case OP_EODN: - if (clen == 0 || (c == NEWLINE && ptr + 1 == end_subject)) + if (clen == 0 || + (ptr == end_subject - md->nllen && IS_NEWLINE(ptr))) { ADD_ACTIVE(state_offset + 1, 0); } break; @@ -693,11 +703,14 @@ for (;;) case OP_DOLL: if ((md->moptions & PCRE_NOTEOL) == 0) { - if (clen == 0 || (c == NEWLINE && (ptr + 1 == end_subject || - (ims & PCRE_MULTILINE) != 0))) + if (clen == 0 || + (ptr <= end_subject - md->nllen && IS_NEWLINE(ptr) && + ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen) + )) { ADD_ACTIVE(state_offset + 1, 0); } } - else if (c == NEWLINE && (ims & PCRE_MULTILINE) != 0) + else if ((ims & PCRE_MULTILINE) != 0 && + ptr <= end_subject - md->nllen && IS_NEWLINE(ptr)) { ADD_ACTIVE(state_offset + 1, 0); } break; @@ -757,19 +770,38 @@ for (;;) case OP_NOTPROP: if (clen > 0) { - int rqdtype, category; - category = _pcre_ucp_findchar(c, &chartype, &othercase); - rqdtype = code[1]; - if (rqdtype >= 128) + BOOL OK; + int category = _pcre_ucp_findprop(c, &chartype, &script); + switch(code[1]) { - if ((rqdtype - 128 == category) == (codevalue == OP_PROP)) - { ADD_NEW(state_offset + 2, 0); } - } - else - { - if ((rqdtype == chartype) == (codevalue == OP_PROP)) - { ADD_NEW(state_offset + 2, 0); } + case PT_ANY: + OK = TRUE; + break; + + case PT_LAMP: + OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt; + break; + + case PT_GC: + OK = category == code[2]; + break; + + case PT_PC: + OK = chartype == code[2]; + break; + + case PT_SC: + OK = script == code[2]; + break; + + /* Should never occur, but keep compilers from grumbling. 
*/ + + default: + OK = codevalue != OP_PROP; + break; } + + if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); } } break; #endif @@ -790,7 +822,11 @@ for (;;) { if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || (c < 256 && - (d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) && + (d != OP_ANY || + (ims & PCRE_DOTALL) != 0 || + ptr > end_subject - md->nllen || + !IS_NEWLINE(ptr) + ) && ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) { count++; @@ -807,7 +843,11 @@ for (;;) { if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || (c < 256 && - (d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) && + (d != OP_ANY || + (ims & PCRE_DOTALL) != 0 || + ptr > end_subject - md->nllen || + !IS_NEWLINE(ptr) + ) && ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) { ADD_NEW(state_offset + 2, 0); @@ -823,7 +863,11 @@ for (;;) { if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || (c < 256 && - (d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) && + (d != OP_ANY || + (ims & PCRE_DOTALL) != 0 || + ptr > end_subject - md->nllen || + !IS_NEWLINE(ptr) + ) && ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) { ADD_NEW(state_offset, 0); @@ -842,7 +886,11 @@ for (;;) { if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || (c < 256 && - (d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) && + (d != OP_ANY || + (ims & PCRE_DOTALL) != 0 || + ptr > end_subject - md->nllen || + !IS_NEWLINE(ptr) + ) && ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) { if (++count >= GET2(code, 1)) @@ -862,14 +910,41 @@ for (;;) case OP_PROP_EXTRA + OP_TYPEPLUS: case OP_PROP_EXTRA + OP_TYPEMINPLUS: count = current_state->count; /* Already matched */ - if (count > 0) { ADD_ACTIVE(state_offset + 3, 0); } + if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); } if (clen > 0) { - int category = _pcre_ucp_findchar(c, &chartype, &othercase); - int rqdtype = code[2]; - if ((d == OP_PROP) 
== - (rqdtype == ((rqdtype >= 128)? (category + 128) : chartype))) - { count++; ADD_NEW(state_offset, count); } + BOOL OK; + int category = _pcre_ucp_findprop(c, &chartype, &script); + switch(code[2]) + { + case PT_ANY: + OK = TRUE; + break; + + case PT_LAMP: + OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt; + break; + + case PT_GC: + OK = category == code[3]; + break; + + case PT_PC: + OK = chartype == code[3]; + break; + + case PT_SC: + OK = script == code[3]; + break; + + /* Should never occur, but keep compilers from grumbling. */ + + default: + OK = codevalue != OP_PROP; + break; + } + + if (OK == (d == OP_PROP)) { count++; ADD_NEW(state_offset, count); } } break; @@ -878,7 +953,7 @@ for (;;) case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS: count = current_state->count; /* Already matched */ if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } - if (clen > 0 && _pcre_ucp_findchar(c, &chartype, &othercase) != ucp_M) + if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M) { const uschar *nptr = ptr + clen; int ncount = 0; @@ -887,7 +962,7 @@ for (;;) int nd; int ndlen = 1; GETCHARLEN(nd, nptr, ndlen); - if (_pcre_ucp_findchar(nd, &chartype, &othercase) != ucp_M) break; + if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break; ncount++; nptr += ndlen; } @@ -899,7 +974,7 @@ for (;;) /*-----------------------------------------------------------------*/ case OP_PROP_EXTRA + OP_TYPEQUERY: case OP_PROP_EXTRA + OP_TYPEMINQUERY: - count = 3; + count = 4; goto QS1; case OP_PROP_EXTRA + OP_TYPESTAR: @@ -908,14 +983,41 @@ for (;;) QS1: - ADD_ACTIVE(state_offset + 3, 0); + ADD_ACTIVE(state_offset + 4, 0); if (clen > 0) { - int category = _pcre_ucp_findchar(c, &chartype, &othercase); - int rqdtype = code[2]; - if ((d == OP_PROP) == - (rqdtype == ((rqdtype >= 128)? 
(category + 128) : chartype))) - { ADD_NEW(state_offset + count, 0); } + BOOL OK; + int category = _pcre_ucp_findprop(c, &chartype, &script); + switch(code[2]) + { + case PT_ANY: + OK = TRUE; + break; + + case PT_LAMP: + OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt; + break; + + case PT_GC: + OK = category == code[3]; + break; + + case PT_PC: + OK = chartype == code[3]; + break; + + case PT_SC: + OK = script == code[3]; + break; + + /* Should never occur, but keep compilers from grumbling. */ + + default: + OK = codevalue != OP_PROP; + break; + } + + if (OK == (d == OP_PROP)) { ADD_NEW(state_offset + count, 0); } } break; @@ -932,7 +1034,7 @@ for (;;) QS2: ADD_ACTIVE(state_offset + 2, 0); - if (clen > 0 && _pcre_ucp_findchar(c, &chartype, &othercase) != ucp_M) + if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M) { const uschar *nptr = ptr + clen; int ncount = 0; @@ -941,7 +1043,7 @@ for (;;) int nd; int ndlen = 1; GETCHARLEN(nd, nptr, ndlen); - if (_pcre_ucp_findchar(nd, &chartype, &othercase) != ucp_M) break; + if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break; ncount++; nptr += ndlen; } @@ -954,17 +1056,45 @@ for (;;) case OP_PROP_EXTRA + OP_TYPEUPTO: case OP_PROP_EXTRA + OP_TYPEMINUPTO: if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT) - { ADD_ACTIVE(state_offset + 5, 0); } + { ADD_ACTIVE(state_offset + 6, 0); } count = current_state->count; /* Number already matched */ if (clen > 0) { - int category = _pcre_ucp_findchar(c, &chartype, &othercase); - int rqdtype = code[4]; - if ((d == OP_PROP) == - (rqdtype == ((rqdtype >= 128)? 
(category + 128) : chartype))) + BOOL OK; + int category = _pcre_ucp_findprop(c, &chartype, &script); + switch(code[4]) + { + case PT_ANY: + OK = TRUE; + break; + + case PT_LAMP: + OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt; + break; + + case PT_GC: + OK = category == code[5]; + break; + + case PT_PC: + OK = chartype == code[5]; + break; + + case PT_SC: + OK = script == code[5]; + break; + + /* Should never occur, but keep compilers from grumbling. */ + + default: + OK = codevalue != OP_PROP; + break; + } + + if (OK == (d == OP_PROP)) { if (++count >= GET2(code, 1)) - { ADD_NEW(state_offset + 5, 0); } + { ADD_NEW(state_offset + 6, 0); } else { ADD_NEW(state_offset, count); } } @@ -978,7 +1108,7 @@ for (;;) if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT) { ADD_ACTIVE(state_offset + 4, 0); } count = current_state->count; /* Number already matched */ - if (clen > 0 && _pcre_ucp_findchar(c, &chartype, &othercase) != ucp_M) + if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M) { const uschar *nptr = ptr + clen; int ncount = 0; @@ -987,7 +1117,7 @@ for (;;) int nd; int ndlen = 1; GETCHARLEN(nd, nptr, ndlen); - if (_pcre_ucp_findchar(nd, &chartype, &othercase) != ucp_M) break; + if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break; ncount++; nptr += ndlen; } @@ -1018,17 +1148,17 @@ for (;;) { if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else { + int othercase; if (c < 128) othercase = fcc[c]; else /* If we have Unicode property support, we can use it to test the - other case of the character, if there is one. The result of - _pcre_ucp_findchar() is < 0 if the char isn't found, and othercase is - returned as zero if there isn't another case. */ + other case of the character. 
*/ #ifdef SUPPORT_UCP - if (_pcre_ucp_findchar(c, &chartype, &othercase) < 0) -#endif + othercase = _pcre_ucp_othercase(c); +#else othercase = -1; +#endif if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); } } @@ -1050,7 +1180,7 @@ for (;;) to wait for them to pass before continuing. */ case OP_EXTUNI: - if (clen > 0 && _pcre_ucp_findchar(c, &chartype, &othercase) != ucp_M) + if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M) { const uschar *nptr = ptr + clen; int ncount = 0; @@ -1058,7 +1188,7 @@ for (;;) { int nclen = 1; GETCHARLEN(c, nptr, nclen); - if (_pcre_ucp_findchar(c, &chartype, &othercase) != ucp_M) break; + if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break; ncount++; nptr += nclen; } @@ -1093,10 +1223,10 @@ for (;;) if ((ims & PCRE_CASELESS) != 0) { #ifdef SUPPORT_UTF8 - if (utf8 && c >= 128) + if (utf8 && d >= 128) { #ifdef SUPPORT_UCP - if (_pcre_ucp_findchar(d, &chartype, &otherd) < 0) otherd = -1; + otherd = _pcre_ucp_othercase(d); #endif /* SUPPORT_UCP */ } else @@ -1117,13 +1247,13 @@ for (;;) if (clen > 0) { int otherd = -1; - if ((ims && PCRE_CASELESS) != 0) + if ((ims & PCRE_CASELESS) != 0) { #ifdef SUPPORT_UTF8 - if (utf8 && c >= 128) + if (utf8 && d >= 128) { #ifdef SUPPORT_UCP - if (_pcre_ucp_findchar(c, &chartype, &otherd) < 0) otherd = -1; + otherd = _pcre_ucp_othercase(d); #endif /* SUPPORT_UCP */ } else @@ -1144,13 +1274,13 @@ for (;;) if (clen > 0) { int otherd = -1; - if ((ims && PCRE_CASELESS) != 0) + if ((ims & PCRE_CASELESS) != 0) { #ifdef SUPPORT_UTF8 - if (utf8 && c >= 128) + if (utf8 && d >= 128) { #ifdef SUPPORT_UCP - if (_pcre_ucp_findchar(c, &chartype, &otherd) < 0) otherd = -1; + otherd = _pcre_ucp_othercase(d); #endif /* SUPPORT_UCP */ } else @@ -1178,10 +1308,10 @@ for (;;) if ((ims & PCRE_CASELESS) != 0) { #ifdef SUPPORT_UTF8 - if (utf8 && c >= 128) + if (utf8 && d >= 128) { #ifdef SUPPORT_UCP - if (_pcre_ucp_findchar(d, &chartype, &otherd) < 0) otherd = -1; + otherd = 
_pcre_ucp_othercase(d); #endif /* SUPPORT_UCP */ } else @@ -1267,7 +1397,8 @@ for (;;) { ADD_ACTIVE(next_state_offset + 5, 0); } if (isinclass) { - if (++count >= GET2(ecode, 3)) + int max = GET2(ecode, 3); + if (++count >= max && max != 0) /* Max 0 => no limit */ { ADD_NEW(next_state_offset + 5, 0); } else { ADD_NEW(state_offset, count); } @@ -1519,7 +1650,7 @@ for (;;) cb.version = 1; /* Version 1 of the callout block */ cb.callout_number = code[1]; cb.offset_vector = offsets; - cb.subject = (char *)start_subject; + cb.subject = (PCRE_SPTR)start_subject; cb.subject_length = end_subject - start_subject; cb.start_match = current_subject - start_subject; cb.current_position = ptr - start_subject; @@ -1567,7 +1698,7 @@ for (;;) DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n" "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count, rlevel*2-2, SP)); - return match_count; + break; /* In effect, "return", but see the comment below */ } /* One or more states are active for the next character. */ @@ -1575,11 +1706,13 @@ for (;;) ptr += clen; /* Advance to next subject character */ } /* Loop to move along the subject string */ -/* Control never gets here, but we must keep the compiler happy. */ +/* Control gets here from "break" a few lines above. We do it this way because +if we use "return" above, we have compiler trouble. Some compilers warn if +there's nothing here because they think the function doesn't return a value. On +the other hand, if we put a dummy statement here, some more clever compilers +complain that it can't be reached. Sigh. 
*/ -DPRINTF(("%.*s+++ Unexpected end of internal_dfa_exec %d +++\n" - "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, rlevel*2-2, SP)); -return PCRE_ERROR_NOMATCH; +return match_count; } @@ -1611,13 +1744,14 @@ Returns: > 0 => number of match offset pairs placed in offsets < -1 => some kind of unexpected problem */ -PCRE_EXPORT int +PCRE_DATA_SCOPE int pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data, const char *subject, int length, int start_offset, int options, int *offsets, int offsetcount, int *workspace, int wscount) { real_pcre *re = (real_pcre *)argument_re; dfa_match_data match_block; +dfa_match_data *md = &match_block; BOOL utf8, anchored, startline, firstline; const uschar *current_subject, *end_subject, *lcc; @@ -1632,6 +1766,7 @@ BOOL req_byte_caseless = FALSE; int first_byte = -1; int req_byte = -1; int req_byte2 = -1; +int newline; /* Plausibility checks */ @@ -1646,8 +1781,8 @@ flipping, so we scan the extra_data block first. This may set two fields in the match block, so we must initialize them beforehand. However, the other fields in the match block must not be set until after the byte flipping. */ -match_block.tables = re->tables; -match_block.callout_data = NULL; +md->tables = re->tables; +md->callout_data = NULL; if (extra_data != NULL) { @@ -1655,10 +1790,12 @@ if (extra_data != NULL) if ((flags & PCRE_EXTRA_STUDY_DATA) != 0) study = (const pcre_study_data *)extra_data->study_data; if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT; + if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0) + return PCRE_ERROR_DFA_UMLIMIT; if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) - match_block.callout_data = extra_data->callout_data; + md->callout_data = extra_data->callout_data; if ((flags & PCRE_EXTRA_TABLES) != 0) - match_block.tables = extra_data->tables; + md->tables = extra_data->tables; } /* Check that the first field in the block is the magic number. 
If it is not, @@ -1679,17 +1816,48 @@ current_subject = (const unsigned char *)subject + start_offset; end_subject = (const unsigned char *)subject + length; req_byte_ptr = current_subject - 1; +#ifdef SUPPORT_UTF8 utf8 = (re->options & PCRE_UTF8) != 0; -anchored = (options & PCRE_ANCHORED) != 0 || (re->options & PCRE_ANCHORED) != 0; +#else +utf8 = FALSE; +#endif + +anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 || + (re->options & PCRE_ANCHORED) != 0; /* The remaining fixed data for passing around. */ -match_block.start_code = (const uschar *)argument_re + +md->start_code = (const uschar *)argument_re + re->name_table_offset + re->name_count * re->name_entry_size; -match_block.start_subject = (const unsigned char *)subject; -match_block.end_subject = end_subject; -match_block.moptions = options; -match_block.poptions = re->options; +md->start_subject = (const unsigned char *)subject; +md->end_subject = end_subject; +md->moptions = options; +md->poptions = re->options; + +/* Handle different types of newline. The two bits give four cases. If nothing +is set at run time, whatever was used at compile time applies. */ + +switch ((((options & PCRE_NEWLINE_CRLF) == 0)? re->options : options) & + PCRE_NEWLINE_CRLF) + { + default: newline = NEWLINE; break; /* Compile-time default */ + case PCRE_NEWLINE_CR: newline = '\r'; break; + case PCRE_NEWLINE_LF: newline = '\n'; break; + case PCRE_NEWLINE_CR+ + PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break; + } + +if (newline > 255) + { + md->nllen = 2; + md->nl[0] = (newline >> 8) & 255; + md->nl[1] = newline & 255; + } +else + { + md->nllen = 1; + md->nl[0] = newline; + } /* Check a UTF-8 string if required. Unfortunately there's no way of passing back the character offset. */ @@ -1715,12 +1883,12 @@ if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0) is a feature that makes it possible to save compiled regex and re-use them in other programs later. 
*/ -if (match_block.tables == NULL) match_block.tables = _pcre_default_tables; +if (md->tables == NULL) md->tables = _pcre_default_tables; /* The lower casing table and the "must be at the start of a line" flag are used in a loop when finding where to start. */ -lcc = match_block.tables + lcc_offset; +lcc = md->tables + lcc_offset; startline = (re->options & PCRE_STARTLINE) != 0; firstline = (re->options & PCRE_FIRSTLINE) != 0; @@ -1753,7 +1921,7 @@ if ((re->options & PCRE_REQCHSET) != 0) { req_byte = re->req_byte & 255; req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0; - req_byte2 = (match_block.tables + fcc_offset)[req_byte]; /* case flipped */ + req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */ } /* Call the main matching function, looping for a non-anchored regex after a @@ -1771,14 +1939,14 @@ for (;;) /* Advance to a unique first char if possible. If firstline is TRUE, the start of the match is constrained to the first line of a multiline string. - Implement this by temporarily adjusting end_subject so that we stop scanning - at a newline. If the match fails at the newline, later code breaks this loop. - */ + Implement this by temporarily adjusting end_subject so that we stop + scanning at a newline. If the match fails at the newline, later code breaks + this loop. 
*/ if (firstline) { const uschar *t = current_subject; - while (t < save_end_subject && *t != '\n') t++; + while (t <= save_end_subject - md->nllen && !IS_NEWLINE(t)) t++; end_subject = t; } @@ -1793,13 +1961,15 @@ for (;;) current_subject++; } - /* Or to just after \n for a multiline match if possible */ + /* Or to just after a linebreak for a multiline match if possible */ else if (startline) { - if (current_subject > match_block.start_subject + start_offset) + if (current_subject > md->start_subject + md->nllen + + start_offset) { - while (current_subject < end_subject && current_subject[-1] != NEWLINE) + while (current_subject <= end_subject && + !IS_NEWLINE(current_subject - md->nllen)) current_subject++; } } @@ -1880,8 +2050,8 @@ for (;;) /* OK, now we can do the business */ rc = internal_dfa_exec( - &match_block, /* fixed match data */ - match_block.start_code, /* this subexpression's code */ + md, /* fixed match data */ + md->start_code, /* this subexpression's code */ current_subject, /* where we currently are */ start_offset, /* start offset in subject */ offsets, /* offset vector */ @@ -1900,17 +2070,15 @@ for (;;) /* Advance to the next subject character unless we are at the end of a line and firstline is set. */ - if (firstline && *current_subject == NEWLINE) break; + if (firstline && + current_subject <= end_subject - md->nllen && + IS_NEWLINE(current_subject)) break; current_subject++; - -#ifdef SUPPORT_UTF8 if (utf8) { while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80) current_subject++; } -#endif - if (current_subject > end_subject) break; } diff --git a/libpcre/pcre_exec.c b/libpcre/pcre_exec.c index 133b3a743..13ad9b140 100644 --- a/libpcre/pcre_exec.c +++ b/libpcre/pcre_exec.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. 
Written by Philip Hazel - Copyright (c) 1997-2005 University of Cambridge + Copyright (c) 1997-2006 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -42,7 +42,7 @@ POSSIBILITY OF SUCH DAMAGE. pattern matching using an NFA algorithm, trying to mimic Perl as closely as possible. There are also some static supporting functions. */ - +#define NLBLOCK md /* The block containing newline information */ #include "pcre_internal.h" @@ -54,7 +54,7 @@ are on the heap, not on the stack. */ typedef struct eptrblock { struct eptrblock *epb_prev; - const uschar *epb_saved_eptr; + USPTR epb_saved_eptr; } eptrblock; /* Flag bits for the match() function */ @@ -128,10 +128,10 @@ Returns: TRUE if matched */ static BOOL -match_ref(int offset, register const uschar *eptr, int length, match_data *md, +match_ref(int offset, register USPTR eptr, int length, match_data *md, unsigned long int ims) { -const uschar *p = md->start_subject + md->offset_vector[offset]; +USPTR p = md->start_subject + md->offset_vector[offset]; #ifdef DEBUG if (eptr >= md->end_subject) @@ -169,32 +169,50 @@ return TRUE; **************************************************************************** RECURSION IN THE match() FUNCTION -The match() function is highly recursive. Some regular expressions can cause -it to recurse thousands of times. I was writing for Unix, so I just let it -call itself recursively. This uses the stack for saving everything that has -to be saved for a recursive call. On Unix, the stack can be large, and this -works fine. +The match() function is highly recursive, though not every recursive call +increases the recursive depth. Nevertheless, some regular expressions can cause +it to recurse to a great depth. I was writing for Unix, so I just let it call +itself recursively. This uses the stack for saving everything that has to be +saved for a recursive call. 
On Unix, the stack can be large, and this works +fine. -It turns out that on non-Unix systems there are problems with programs that -use a lot of stack. (This despite the fact that every last chip has oodles -of memory these days, and techniques for extending the stack have been known -for decades.) So.... +It turns out that on some non-Unix-like systems there are problems with +programs that use a lot of stack. (This despite the fact that every last chip +has oodles of memory these days, and techniques for extending the stack have +been known for decades.) So.... There is a fudge, triggered by defining NO_RECURSE, which avoids recursive calls by keeping local variables that need to be preserved in blocks of memory -obtained from malloc instead instead of on the stack. Macros are used to +obtained from malloc() instead instead of on the stack. Macros are used to achieve this so that the actual code doesn't look very different to what it always used to. **************************************************************************** ***************************************************************************/ -/* These versions of the macros use the stack, as normal */ +/* These versions of the macros use the stack, as normal. There are debugging +versions and production versions. */ #ifndef NO_RECURSE #define REGISTER register -#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) rx = match(ra,rb,rc,rd,re,rf,rg) +#ifdef DEBUG +#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) \ + { \ + printf("match() called in line %d\n", __LINE__); \ + rx = match(ra,rb,rc,rd,re,rf,rg,rdepth+1); \ + printf("to line %d\n", __LINE__); \ + } +#define RRETURN(ra) \ + { \ + printf("match() returned %d from line %d ", ra, __LINE__); \ + return ra; \ + } +#else +#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) \ + rx = match(ra,rb,rc,rd,re,rf,rg,rdepth+1) #define RRETURN(ra) return ra +#endif + #else @@ -215,6 +233,7 @@ match(), which never changes. 
*/ newframe->Xims = re;\ newframe->Xeptrb = rf;\ newframe->Xflags = rg;\ + newframe->Xrdepth = frame->Xrdepth + 1;\ newframe->Xprevframe = frame;\ frame = newframe;\ DPRINTF(("restarting from line %d\n", __LINE__));\ @@ -256,6 +275,7 @@ typedef struct heapframe { long int Xims; eptrblock *Xeptrb; int Xflags; + unsigned int Xrdepth; /* Function local variables */ @@ -278,11 +298,11 @@ typedef struct heapframe { #ifdef SUPPORT_UCP int Xprop_type; + int Xprop_value; int Xprop_fail_result; int Xprop_category; int Xprop_chartype; - int Xprop_othercase; - int Xprop_test_against; + int Xprop_script; int *Xprop_test_variable; #endif @@ -343,17 +363,18 @@ Arguments: flags can contain match_condassert - this is an assertion condition match_isgroup - this is the start of a bracketed group + rdepth the recursion depth Returns: MATCH_MATCH if matched ) these values are >= 0 MATCH_NOMATCH if failed to match ) a negative PCRE_ERROR_xxx value if aborted by an error condition - (e.g. stopped by recursion limit) + (e.g. stopped by repeated call or recursion limit) */ static int -match(REGISTER const uschar *eptr, REGISTER const uschar *ecode, +match(REGISTER USPTR eptr, REGISTER const uschar *ecode, int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb, - int flags) + int flags, unsigned int rdepth) { /* These variables do not need to be preserved over recursion in this function, so they can be ordinary variables in all cases. Mark them with "register" @@ -361,7 +382,7 @@ because they are used a lot in loops. 
*/ register int rrc; /* Returns from recursive calls */ register int i; /* Used for loops not involving calls to RMATCH() */ -register int c; /* Character values not kept over RMATCH() calls */ +register unsigned int c; /* Character values not kept over RMATCH() calls */ register BOOL utf8; /* Local copy of UTF-8 flag for speed */ /* When recursion is not being used, all "local" variables that have to be @@ -381,6 +402,7 @@ frame->Xoffset_top = offset_top; frame->Xims = ims; frame->Xeptrb = eptrb; frame->Xflags = flags; +frame->Xrdepth = rdepth; /* This is where control jumps back to to effect "recursion" */ @@ -394,6 +416,7 @@ HEAP_RECURSE: #define ims frame->Xims #define eptrb frame->Xeptrb #define flags frame->Xflags +#define rdepth frame->Xrdepth /* Ditto for the local variables */ @@ -418,11 +441,11 @@ HEAP_RECURSE: #ifdef SUPPORT_UCP #define prop_type frame->Xprop_type +#define prop_value frame->Xprop_value #define prop_fail_result frame->Xprop_fail_result #define prop_category frame->Xprop_category #define prop_chartype frame->Xprop_chartype -#define prop_othercase frame->Xprop_othercase -#define prop_test_against frame->Xprop_test_against +#define prop_script frame->Xprop_script #define prop_test_variable frame->Xprop_test_variable #endif @@ -452,20 +475,20 @@ i, and fc and c, can be the same variables. */ #define fc c -#ifdef SUPPORT_UTF8 /* Many of these variables are used ony */ -const uschar *charptr; /* small blocks of the code. My normal */ +#ifdef SUPPORT_UTF8 /* Many of these variables are used only */ +const uschar *charptr; /* in small blocks of the code. My normal */ #endif /* style of coding would have declared */ const uschar *callpat; /* them within each of those blocks. 
*/ const uschar *data; /* However, in order to accommodate the */ const uschar *next; /* version of this code that uses an */ -const uschar *pp; /* external "stack" implemented on the */ -const uschar *prev; /* heap, it is easier to declare them */ -const uschar *saved_eptr; /* all here, so the declarations can */ - /* be cut out in a block. The only */ -recursion_info new_recursive; /* declarations within blocks below are */ - /* for variables that do not have to */ -BOOL cur_is_word; /* be preserved over a recursive call */ -BOOL condition; /* to RMATCH(). */ +USPTR pp; /* external "stack" implemented on the */ +const uschar *prev; /* heap, it is easier to declare them all */ +USPTR saved_eptr; /* here, so the declarations can be cut */ + /* out in a block. The only declarations */ +recursion_info new_recursive; /* within blocks below are for variables */ + /* that do not have to be preserved over */ +BOOL cur_is_word; /* a recursive call to RMATCH(). */ +BOOL condition; BOOL minimize; BOOL prev_is_word; @@ -473,11 +496,11 @@ unsigned long int original_ims; #ifdef SUPPORT_UCP int prop_type; +int prop_value; int prop_fail_result; int prop_category; int prop_chartype; -int prop_othercase; -int prop_test_against; +int prop_script; int *prop_test_variable; #endif @@ -499,22 +522,39 @@ eptrblock newptrb; variables. */ #ifdef SUPPORT_UCP +prop_value = 0; prop_fail_result = 0; -prop_test_against = 0; prop_test_variable = NULL; #endif -/* OK, now we can get on with the real code of the function. Recursion is -specified by the macros RMATCH and RRETURN. When NO_RECURSE is *not* defined, -these just turn into a recursive call to match() and a "return", respectively. -However, RMATCH isn't like a function call because it's quite a complicated -macro. It has to be used in one particular way. This shouldn't, however, impact -performance when true recursion is being used. 
*/ +/* This label is used for tail recursion, which is used in a few cases even +when NO_RECURSE is not defined, in order to reduce the amount of stack that is +used. Thanks to Ian Taylor for noticing this possibility and sending the +original patch. */ + +TAIL_RECURSE: + +/* OK, now we can get on with the real code of the function. Recursive calls +are specified by the macro RMATCH and RRETURN is used to return. When +NO_RECURSE is *not* defined, these just turn into a recursive call to match() +and a "return", respectively (possibly with some debugging if DEBUG is +defined). However, RMATCH isn't like a function call because it's quite a +complicated macro. It has to be used in one particular way. This shouldn't, +however, impact performance when true recursion is being used. */ + +/* First check that we haven't called match() too many times, or that we +haven't exceeded the recursive call limit. */ if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT); +if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT); original_ims = ims; /* Save for resetting on ')' */ + +#ifdef SUPPORT_UTF8 utf8 = md->utf8; /* Local copy of the flag */ +#else +utf8 = FALSE; +#endif /* At the start of a bracketed group, add the current subject pointer to the stack of such pointers, to be re-instated at the end of the group when we hit @@ -614,21 +654,38 @@ for (;;) { case OP_BRA: /* Non-capturing bracket: optimized */ DPRINTF(("start bracket 0\n")); - do + + /* Loop for all the alternatives */ + + for (;;) { + /* When we get to the final alternative within the brackets, we would + return the result of a recursive call to match() whatever happened. We + can reduce stack usage by turning this into a tail recursion. 
*/ + + if (ecode[GET(ecode, 1)] != OP_ALT) + { + ecode += 1 + LINK_SIZE; + flags = match_isgroup; + DPRINTF(("bracket 0 tail recursion\n")); + goto TAIL_RECURSE; + } + + /* For non-final alternatives, continue the loop for a NOMATCH result; + otherwise return. */ + RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, match_isgroup); if (rrc != MATCH_NOMATCH) RRETURN(rrc); ecode += GET(ecode, 1); } - while (*ecode == OP_ALT); - DPRINTF(("bracket 0 failed\n")); - RRETURN(MATCH_NOMATCH); + /* Control never reaches here. */ /* Conditional group: compilation checked that there are no more than two branches. If the condition is false, skipping the first branch takes us past the end if there is only one branch, but that's OK because that is - exactly what going to the ket would do. */ + exactly what going to the ket would do. As there is only one branch to be + obeyed, we can use tail recursion to avoid using another stack frame. */ case OP_COND: if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */ @@ -637,10 +694,9 @@ for (;;) condition = (offset == CREF_RECURSE * 2)? (md->recursive != NULL) : (offset < offset_top && md->offset_vector[offset] >= 0); - RMATCH(rrc, eptr, ecode + (condition? - (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))), - offset_top, md, ims, eptrb, match_isgroup); - RRETURN(rrc); + ecode += condition? (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1)); + flags = match_isgroup; + goto TAIL_RECURSE; } /* The condition is an assertion. Call match() to evaluate it - setting @@ -660,9 +716,13 @@ for (;;) RRETURN(rrc); /* Need braces because of following else */ } else ecode += GET(ecode, 1); - RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, - match_isgroup); - RRETURN(rrc); + + /* We are now at the branch that is to be obeyed. As there is only one, + we can use tail recursion to avoid using another stack frame. 
*/ + + ecode += 1 + LINK_SIZE; + flags = match_isgroup; + goto TAIL_RECURSE; } /* Control never reaches here */ @@ -681,7 +741,7 @@ for (;;) if (md->recursive != NULL && md->recursive->group_num == 0) { recursion_info *rec = md->recursive; - DPRINTF(("Hit the end in a (?0) recursion\n")); + DPRINTF(("End of pattern in a (?0) recursion\n")); md->recursive = rec->prevrec; memmove(md->offset_vector, rec->offset_save, rec->saved_max * sizeof(int)); @@ -800,7 +860,7 @@ for (;;) cb.version = 1; /* Version 1 of the callout block */ cb.callout_number = ecode[1]; cb.offset_vector = md->offset_vector; - cb.subject = (const char *)md->start_subject; + cb.subject = (PCRE_SPTR)md->start_subject; cb.subject_length = md->end_subject - md->start_subject; cb.start_match = md->start_match - md->start_subject; cb.current_position = eptr - md->start_subject; @@ -882,12 +942,17 @@ for (;;) eptrb, match_isgroup); if (rrc == MATCH_MATCH) { + DPRINTF(("Recursion matched\n")); md->recursive = new_recursive.prevrec; if (new_recursive.offset_save != stacksave) (pcre_free)(new_recursive.offset_save); RRETURN(MATCH_MATCH); } - else if (rrc != MATCH_NOMATCH) RRETURN(rrc); + else if (rrc != MATCH_NOMATCH) + { + DPRINTF(("Recursion gave error %d\n", rrc)); + RRETURN(rrc); + } md->recursive = &new_recursive; memcpy(md->offset_vector, new_recursive.offset_save, @@ -912,7 +977,6 @@ for (;;) the end of a normal bracket, leaving the subject pointer. */ case OP_ONCE: - { prev = ecode; saved_eptr = eptr; @@ -951,9 +1015,10 @@ for (;;) } /* The repeating kets try the rest of the pattern or restart from the - preceding bracket, in the appropriate order. We need to reset any options - that changed within the bracket before re-running it, so check the next - opcode. */ + preceding bracket, in the appropriate order. The second "call" of match() + uses tail recursion, to avoid using another stack frame. 
We need to reset + any options that changed within the bracket before re-running it, so + check the next opcode. */ if (ecode[1+LINK_SIZE] == OP_OPT) { @@ -965,18 +1030,19 @@ for (;;) { RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); + ecode = prev; + flags = match_isgroup; + goto TAIL_RECURSE; } else /* OP_KETRMAX */ { RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - } + ecode += 1 + LINK_SIZE; + flags = 0; + goto TAIL_RECURSE; } - RRETURN(MATCH_NOMATCH); + /* Control never gets here */ /* An alternation is the end of a branch; scan along to find the end of the bracketed group and go to there. */ @@ -1020,7 +1086,6 @@ for (;;) case OP_KET: case OP_KETRMIN: case OP_KETRMAX: - { prev = ecode - GET(ecode, 1); saved_eptr = eptrb->epb_saved_eptr; @@ -1109,25 +1174,26 @@ for (;;) } /* The repeating kets try the rest of the pattern or restart from the - preceding bracket, in the appropriate order. */ + preceding bracket, in the appropriate order. In the second case, we can use + tail recursion to avoid using another stack frame. 
*/ if (*ecode == OP_KETRMIN) { RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); + ecode = prev; + flags = match_isgroup; + goto TAIL_RECURSE; } else /* OP_KETRMAX */ { RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); - } + ecode += 1 + LINK_SIZE; + flags = 0; + goto TAIL_RECURSE; } - - RRETURN(MATCH_NOMATCH); + /* Control never gets here */ /* Start of subject unless notbol, or after internal newline if multiline */ @@ -1135,7 +1201,10 @@ for (;;) if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH); if ((ims & PCRE_MULTILINE) != 0) { - if (eptr != md->start_subject && eptr[-1] != NEWLINE) + if (eptr != md->start_subject && + (eptr == md->end_subject || + eptr < md->start_subject + md->nllen || + !IS_NEWLINE(eptr - md->nllen))) RRETURN(MATCH_NOMATCH); ecode++; break; @@ -1163,7 +1232,7 @@ for (;;) if ((ims & PCRE_MULTILINE) != 0) { if (eptr < md->end_subject) - { if (*eptr != NEWLINE) RRETURN(MATCH_NOMATCH); } + { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); } else { if (md->noteol) RRETURN(MATCH_NOMATCH); } ecode++; @@ -1174,14 +1243,14 @@ for (;;) if (md->noteol) RRETURN(MATCH_NOMATCH); if (!md->endonly) { - if (eptr < md->end_subject - 1 || - (eptr == md->end_subject - 1 && *eptr != NEWLINE)) + if (eptr != md->end_subject && + (eptr != md->end_subject - md->nllen || !IS_NEWLINE(eptr))) RRETURN(MATCH_NOMATCH); ecode++; break; } } - /* ... else fall through */ + /* ... 
else fall through for endonly */ /* End of subject assertion (\z) */ @@ -1193,8 +1262,9 @@ for (;;) /* End of subject or ending \n assertion (\Z) */ case OP_EODN: - if (eptr < md->end_subject - 1 || - (eptr == md->end_subject - 1 && *eptr != NEWLINE)) RRETURN(MATCH_NOMATCH); + if (eptr != md->end_subject && + (eptr != md->end_subject - md->nllen || !IS_NEWLINE(eptr))) + RRETURN(MATCH_NOMATCH); ecode++; break; @@ -1247,13 +1317,14 @@ for (;;) /* Match a single character type; inline for speed */ case OP_ANY: - if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE) + if ((ims & PCRE_DOTALL) == 0) + { + if (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); + } if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH); -#ifdef SUPPORT_UTF8 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; -#endif ecode++; break; @@ -1352,23 +1423,43 @@ for (;;) if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); { - int chartype, rqdtype; - int othercase; - int category = _pcre_ucp_findchar(c, &chartype, &othercase); + int chartype, script; + int category = _pcre_ucp_findprop(c, &chartype, &script); - rqdtype = *(++ecode); - ecode++; + switch(ecode[1]) + { + case PT_ANY: + if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH); + break; - if (rqdtype >= 128) - { - if ((rqdtype - 128 != category) == (op == OP_PROP)) + case PT_LAMP: + if ((chartype == ucp_Lu || + chartype == ucp_Ll || + chartype == ucp_Lt) == (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH); - } - else - { - if ((rqdtype != chartype) == (op == OP_PROP)) + break; + + case PT_GC: + if ((ecode[2] != category) == (op == OP_PROP)) RRETURN(MATCH_NOMATCH); + break; + + case PT_PC: + if ((ecode[2] != chartype) == (op == OP_PROP)) + RRETURN(MATCH_NOMATCH); + break; + + case PT_SC: + if ((ecode[2] != script) == (op == OP_PROP)) + RRETURN(MATCH_NOMATCH); + break; + + default: + RRETURN(PCRE_ERROR_INTERNAL); + break; } + + ecode += 3; } break; @@ 
-1379,9 +1470,8 @@ for (;;) if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); { - int chartype; - int othercase; - int category = _pcre_ucp_findchar(c, &chartype, &othercase); + int chartype, script; + int category = _pcre_ucp_findprop(c, &chartype, &script); if (category == ucp_M) RRETURN(MATCH_NOMATCH); while (eptr < md->end_subject) { @@ -1390,7 +1480,7 @@ for (;;) { GETCHARLEN(c, eptr, len); } - category = _pcre_ucp_findchar(c, &chartype, &othercase); + category = _pcre_ucp_findprop(c, &chartype, &script); if (category != ucp_M) break; eptr += len; } @@ -1683,8 +1773,8 @@ for (;;) while (eptr >= pp) { RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); - eptr--; if (rrc != MATCH_NOMATCH) RRETURN(rrc); + eptr--; } } @@ -1841,16 +1931,12 @@ for (;;) ecode += length; /* If we have Unicode property support, we can use it to test the other - case of the character, if there is one. The result of _pcre_ucp_findchar() is - < 0 if the char isn't found, and othercase is returned as zero if there - isn't one. */ + case of the character, if there is one. 
*/ if (fc != dc) { #ifdef SUPPORT_UCP - int chartype; - int othercase; - if (_pcre_ucp_findchar(fc, &chartype, &othercase) < 0 || dc != othercase) + if (dc != _pcre_ucp_othercase(fc)) #endif RRETURN(MATCH_NOMATCH); } @@ -1918,10 +2004,9 @@ for (;;) #ifdef SUPPORT_UCP int othercase; - int chartype; if ((ims & PCRE_CASELESS) != 0 && - _pcre_ucp_findchar(fc, &chartype, &othercase) >= 0 && - othercase > 0) + (othercase = _pcre_ucp_othercase(fc)) >= 0 && + othercase >= 0) oclength = _pcre_ord2utf8(othercase, occhars); #endif /* SUPPORT_UCP */ @@ -2408,16 +2493,7 @@ for (;;) { prop_fail_result = ctype == OP_NOTPROP; prop_type = *ecode++; - if (prop_type >= 128) - { - prop_test_against = prop_type - 128; - prop_test_variable = &prop_category; - } - else - { - prop_test_against = prop_type; - prop_test_variable = &prop_chartype; - } + prop_value = *ecode++; } else prop_type = -1; #endif @@ -2434,15 +2510,69 @@ for (;;) if (min > 0) { #ifdef SUPPORT_UCP - if (prop_type > 0) + if (prop_type >= 0) { + switch(prop_type) + { + case PT_ANY: + if (prop_fail_result) RRETURN(MATCH_NOMATCH); for (i = 1; i <= min; i++) { + if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); - prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase); - if ((*prop_test_variable == prop_test_against) == prop_fail_result) + } + break; + + case PT_LAMP: + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + if ((prop_chartype == ucp_Lu || + prop_chartype == ucp_Ll || + prop_chartype == ucp_Lt) == prop_fail_result) + RRETURN(MATCH_NOMATCH); + } + break; + + case PT_GC: + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + if ((prop_category == prop_value) == prop_fail_result) RRETURN(MATCH_NOMATCH); } + break; + + 
case PT_PC: + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + if ((prop_chartype == prop_value) == prop_fail_result) + RRETURN(MATCH_NOMATCH); + } + break; + + case PT_SC: + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + if ((prop_script == prop_value) == prop_fail_result) + RRETURN(MATCH_NOMATCH); + } + break; + + default: + RRETURN(PCRE_ERROR_INTERNAL); + break; + } } /* Match extended Unicode sequences. We will get here only if the @@ -2453,7 +2583,7 @@ for (;;) for (i = 1; i <= min; i++) { GETCHARINCTEST(c, eptr); - prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH); while (eptr < md->end_subject) { @@ -2462,7 +2592,7 @@ for (;;) { GETCHARLEN(c, eptr, len); } - prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); if (prop_category != ucp_M) break; eptr += len; } @@ -2481,8 +2611,11 @@ for (;;) for (i = 1; i <= min; i++) { if (eptr >= md->end_subject || - (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0)) + ((ims & PCRE_DOTALL) == 0 && + eptr <= md->end_subject - md->nllen && + IS_NEWLINE(eptr))) RRETURN(MATCH_NOMATCH); + eptr++; while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; } break; @@ -2567,7 +2700,11 @@ for (;;) if ((ims & PCRE_DOTALL) == 0) { for (i = 1; i <= min; i++) - if (*eptr++ == NEWLINE) RRETURN(MATCH_NOMATCH); + { + if (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)) + RRETURN(MATCH_NOMATCH); + eptr++; + } } else eptr += min; break; @@ -2624,18 +2761,79 @@ for (;;) if (minimize) { #ifdef SUPPORT_UCP - if (prop_type > 0) + if 
(prop_type >= 0) { + switch(prop_type) + { + case PT_ANY: + for (fi = min;; fi++) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + if (prop_fail_result) RRETURN(MATCH_NOMATCH); + } + break; + + case PT_LAMP: + for (fi = min;; fi++) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + if ((prop_chartype == ucp_Lu || + prop_chartype == ucp_Ll || + prop_chartype == ucp_Lt) == prop_fail_result) + RRETURN(MATCH_NOMATCH); + } + break; + + case PT_GC: + for (fi = min;; fi++) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + if ((prop_category == prop_value) == prop_fail_result) + RRETURN(MATCH_NOMATCH); + } + break; + + case PT_PC: + for (fi = min;; fi++) + { + RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + GETCHARINC(c, eptr); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + if ((prop_chartype == prop_value) == prop_fail_result) + RRETURN(MATCH_NOMATCH); + } + break; + + case PT_SC: for (fi = min;; fi++) { RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); - prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase); - if ((*prop_test_variable == prop_test_against) == prop_fail_result) + prop_category = 
_pcre_ucp_findprop(c, &prop_chartype, &prop_script); + if ((prop_script == prop_value) == prop_fail_result) RRETURN(MATCH_NOMATCH); } + break; + + default: + RRETURN(PCRE_ERROR_INTERNAL); + break; + } } /* Match extended Unicode sequences. We will get here only if the @@ -2649,7 +2847,7 @@ for (;;) if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); - prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH); while (eptr < md->end_subject) { @@ -2658,7 +2856,7 @@ for (;;) { GETCHARLEN(c, eptr, len); } - prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); if (prop_category != ucp_M) break; eptr += len; } @@ -2676,13 +2874,15 @@ for (;;) { RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (fi >= max || eptr >= md->end_subject || + (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 && + eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))) + RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); switch(ctype) { - case OP_ANY: - if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH); + case OP_ANY: /* This is the DOTALL case */ break; case OP_ANYBYTE: @@ -2731,12 +2931,15 @@ for (;;) { RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + if (fi >= max || eptr >= md->end_subject || + ((ims & PCRE_DOTALL) == 0 && + eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))) + RRETURN(MATCH_NOMATCH); + c = *eptr++; switch(ctype) { - case OP_ANY: - if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH); + case OP_ANY: /* This is 
the DOTALL case */ break; case OP_ANYBYTE: @@ -2783,18 +2986,75 @@ for (;;) pp = eptr; /* Remember where we started */ #ifdef SUPPORT_UCP - if (prop_type > 0) + if (prop_type >= 0) { + switch(prop_type) + { + case PT_ANY: + for (i = min; i < max; i++) + { + int len = 1; + if (eptr >= md->end_subject) break; + GETCHARLEN(c, eptr, len); + if (prop_fail_result) break; + eptr+= len; + } + break; + + case PT_LAMP: + for (i = min; i < max; i++) + { + int len = 1; + if (eptr >= md->end_subject) break; + GETCHARLEN(c, eptr, len); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + if ((prop_chartype == ucp_Lu || + prop_chartype == ucp_Ll || + prop_chartype == ucp_Lt) == prop_fail_result) + break; + eptr+= len; + } + break; + + case PT_GC: + for (i = min; i < max; i++) + { + int len = 1; + if (eptr >= md->end_subject) break; + GETCHARLEN(c, eptr, len); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + if ((prop_category == prop_value) == prop_fail_result) + break; + eptr+= len; + } + break; + + case PT_PC: for (i = min; i < max; i++) { int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); - prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase); - if ((*prop_test_variable == prop_test_against) == prop_fail_result) + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + if ((prop_chartype == prop_value) == prop_fail_result) break; eptr+= len; } + break; + + case PT_SC: + for (i = min; i < max; i++) + { + int len = 1; + if (eptr >= md->end_subject) break; + GETCHARLEN(c, eptr, len); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + if ((prop_script == prop_value) == prop_fail_result) + break; + eptr+= len; + } + break; + } /* eptr is now past the end of the maximum run */ @@ -2816,7 +3076,7 @@ for (;;) { if (eptr >= md->end_subject) break; GETCHARINCTEST(c, eptr); - prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase); + prop_category = 
_pcre_ucp_findprop(c, &prop_chartype, &prop_script); if (prop_category == ucp_M) break; while (eptr < md->end_subject) { @@ -2825,7 +3085,7 @@ for (;;) { GETCHARLEN(c, eptr, len); } - prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); if (prop_category != ucp_M) break; eptr += len; } @@ -2846,7 +3106,7 @@ for (;;) { GETCHARLEN(c, eptr, len); } - prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase); + prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); if (prop_category != ucp_M) break; eptr--; } @@ -2865,9 +3125,9 @@ for (;;) { case OP_ANY: - /* Special code is required for UTF8, but when the maximum is unlimited - we don't need it, so we repeat the non-UTF8 code. This is probably - worth it, because .* is quite a common idiom. */ + /* Special code is required for UTF8, but when the maximum is + unlimited we don't need it, so we repeat the non-UTF8 code. This is + probably worth it, because .* is quite a common idiom. 
*/ if (max < INT_MAX) { @@ -2875,7 +3135,9 @@ for (;;) { for (i = min; i < max; i++) { - if (eptr >= md->end_subject || *eptr == NEWLINE) break; + if (eptr >= md->end_subject || + (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))) + break; eptr++; while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; } @@ -2884,6 +3146,7 @@ for (;;) { for (i = min; i < max; i++) { + if (eptr >= md->end_subject) break; eptr++; while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; } @@ -2898,7 +3161,9 @@ for (;;) { for (i = min; i < max; i++) { - if (eptr >= md->end_subject || *eptr == NEWLINE) break; + if (eptr >= md->end_subject || + (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))) + break; eptr++; } break; @@ -3012,7 +3277,9 @@ for (;;) { for (i = min; i < max; i++) { - if (eptr >= md->end_subject || *eptr == NEWLINE) break; + if (eptr >= md->end_subject || + (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))) + break; eptr++; } break; @@ -3200,16 +3467,17 @@ Returns: > 0 => success; value is the number of elements filled in < -1 => some kind of unexpected problem */ -PCRE_EXPORT int +PCRE_DATA_SCOPE int pcre_exec(const pcre *argument_re, const pcre_extra *extra_data, - const char *subject, int length, int start_offset, int options, int *offsets, + PCRE_SPTR subject, int length, int start_offset, int options, int *offsets, int offsetcount) { int rc, resetcount, ocount; int first_byte = -1; int req_byte = -1; int req_byte2 = -1; -unsigned long int ims = 0; +int newline; +unsigned long int ims; BOOL using_temporary_offsets = FALSE; BOOL anchored; BOOL startline; @@ -3217,11 +3485,12 @@ BOOL firstline; BOOL first_byte_caseless = FALSE; BOOL req_byte_caseless = FALSE; match_data match_block; +match_data *md = &match_block; const uschar *tables; const uschar *start_bits = NULL; -const uschar *start_match = (const uschar *)subject + start_offset; -const uschar *end_subject; -const uschar *req_byte_ptr = start_match - 1; +USPTR start_match = 
(USPTR)subject + start_offset; +USPTR end_subject; +USPTR req_byte_ptr = start_match - 1; pcre_study_data internal_study; const pcre_study_data *study; @@ -3241,8 +3510,9 @@ if (offsetcount < 0) return PCRE_ERROR_BADCOUNT; the default values. */ study = NULL; -match_block.match_limit = MATCH_LIMIT; -match_block.callout_data = NULL; +md->match_limit = MATCH_LIMIT; +md->match_limit_recursion = MATCH_LIMIT_RECURSION; +md->callout_data = NULL; /* The table pointer is always in native byte order. */ @@ -3254,9 +3524,11 @@ if (extra_data != NULL) if ((flags & PCRE_EXTRA_STUDY_DATA) != 0) study = (const pcre_study_data *)extra_data->study_data; if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) - match_block.match_limit = extra_data->match_limit; + md->match_limit = extra_data->match_limit; + if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0) + md->match_limit_recursion = extra_data->match_limit_recursion; if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) - match_block.callout_data = extra_data->callout_data; + md->callout_data = extra_data->callout_data; if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables; } @@ -3286,39 +3558,64 @@ firstline = (re->options & PCRE_FIRSTLINE) != 0; /* The code starts after the real_pcre block and the capture name table. 
*/ -match_block.start_code = (const uschar *)external_re + re->name_table_offset + +md->start_code = (const uschar *)external_re + re->name_table_offset + re->name_count * re->name_entry_size; -match_block.start_subject = (const uschar *)subject; -match_block.start_offset = start_offset; -match_block.end_subject = match_block.start_subject + length; -end_subject = match_block.end_subject; +md->start_subject = (USPTR)subject; +md->start_offset = start_offset; +md->end_subject = md->start_subject + length; +end_subject = md->end_subject; -match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; -match_block.utf8 = (re->options & PCRE_UTF8) != 0; +md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; +md->utf8 = (re->options & PCRE_UTF8) != 0; -match_block.notbol = (options & PCRE_NOTBOL) != 0; -match_block.noteol = (options & PCRE_NOTEOL) != 0; -match_block.notempty = (options & PCRE_NOTEMPTY) != 0; -match_block.partial = (options & PCRE_PARTIAL) != 0; -match_block.hitend = FALSE; +md->notbol = (options & PCRE_NOTBOL) != 0; +md->noteol = (options & PCRE_NOTEOL) != 0; +md->notempty = (options & PCRE_NOTEMPTY) != 0; +md->partial = (options & PCRE_PARTIAL) != 0; +md->hitend = FALSE; -match_block.recursive = NULL; /* No recursion at top level */ +md->recursive = NULL; /* No recursion at top level */ -match_block.lcc = tables + lcc_offset; -match_block.ctypes = tables + ctypes_offset; +md->lcc = tables + lcc_offset; +md->ctypes = tables + ctypes_offset; + +/* Handle different types of newline. The two bits give four cases. If nothing +is set at run time, whatever was used at compile time applies. */ + +switch ((((options & PCRE_NEWLINE_CRLF) == 0)? 
re->options : options) & + PCRE_NEWLINE_CRLF) + { + default: newline = NEWLINE; break; /* Compile-time default */ + case PCRE_NEWLINE_CR: newline = '\r'; break; + case PCRE_NEWLINE_LF: newline = '\n'; break; + case PCRE_NEWLINE_CR+ + PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break; + } + +if (newline > 255) + { + md->nllen = 2; + md->nl[0] = (newline >> 8) & 255; + md->nl[1] = newline & 255; + } +else + { + md->nllen = 1; + md->nl[0] = newline; + } /* Partial matching is supported only for a restricted set of regexes at the moment. */ -if (match_block.partial && (re->options & PCRE_NOPARTIAL) != 0) +if (md->partial && (re->options & PCRE_NOPARTIAL) != 0) return PCRE_ERROR_BADPARTIAL; /* Check a UTF-8 string if required. Unfortunately there's no way of passing back the character offset. */ #ifdef SUPPORT_UTF8 -if (match_block.utf8 && (options & PCRE_NO_UTF8_CHECK) == 0) +if (md->utf8 && (options & PCRE_NO_UTF8_CHECK) == 0) { if (_pcre_valid_utf8((uschar *)subject, length) >= 0) return PCRE_ERROR_BADUTF8; @@ -3350,17 +3647,17 @@ ocount = offsetcount - (offsetcount % 3); if (re->top_backref > 0 && re->top_backref >= ocount/3) { ocount = re->top_backref * 3 + 3; - match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int)); - if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY; + md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int)); + if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY; using_temporary_offsets = TRUE; DPRINTF(("Got memory to hold back references\n")); } -else match_block.offset_vector = offsets; +else md->offset_vector = offsets; -match_block.offset_end = ocount; -match_block.offset_max = (2*ocount)/3; -match_block.offset_overflow = FALSE; -match_block.capture_last = -1; +md->offset_end = ocount; +md->offset_max = (2*ocount)/3; +md->offset_overflow = FALSE; +md->capture_last = -1; /* Compute the minimum number of offsets that we need to reset each time. 
Doing this makes a huge difference to execution time when there aren't many brackets @@ -3373,9 +3670,9 @@ if (resetcount > offsetcount) resetcount = ocount; never be used unless previously set, but they get saved and restored, and so we initialize them to avoid reading uninitialized locations. */ -if (match_block.offset_vector != NULL) +if (md->offset_vector != NULL) { - register int *iptr = match_block.offset_vector + ocount; + register int *iptr = md->offset_vector + ocount; register int *iend = iptr - resetcount/2 + 1; while (--iptr >= iend) *iptr = -1; } @@ -3392,7 +3689,7 @@ if (!anchored) { first_byte = re->first_byte & 255; if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE) - first_byte = match_block.lcc[first_byte]; + first_byte = md->lcc[first_byte]; } else if (!startline && study != NULL && @@ -3415,13 +3712,13 @@ the loop runs just once. */ do { - const uschar *save_end_subject = end_subject; + USPTR save_end_subject = end_subject; /* Reset the maximum number of extractions we might see. 
*/ - if (match_block.offset_vector != NULL) + if (md->offset_vector != NULL) { - register int *iptr = match_block.offset_vector; + register int *iptr = md->offset_vector; register int *iend = iptr + resetcount; while (iptr < iend) *iptr++ = -1; } @@ -3434,8 +3731,8 @@ do if (firstline) { - const uschar *t = start_match; - while (t < save_end_subject && *t != '\n') t++; + USPTR t = start_match; + while (t <= save_end_subject - md->nllen && !IS_NEWLINE(t)) t++; end_subject = t; } @@ -3445,20 +3742,22 @@ do { if (first_byte_caseless) while (start_match < end_subject && - match_block.lcc[*start_match] != first_byte) + md->lcc[*start_match] != first_byte) start_match++; else while (start_match < end_subject && *start_match != first_byte) start_match++; } - /* Or to just after \n for a multiline match if possible */ + /* Or to just after a linebreak for a multiline match if possible */ else if (startline) { - if (start_match > match_block.start_subject + start_offset) + if (start_match >= md->start_subject + md->nllen + + start_offset) { - while (start_match < end_subject && start_match[-1] != NEWLINE) + while (start_match <= end_subject && + !IS_NEWLINE(start_match - md->nllen)) start_match++; } } @@ -3480,7 +3779,7 @@ do #ifdef DEBUG /* Sigh. Some compilers never learn. */ printf(">>>> Match against: "); - pchars(start_match, end_subject - start_match, TRUE, &match_block); + pchars(start_match, end_subject - start_match, TRUE, md); printf("\n"); #endif @@ -3502,9 +3801,9 @@ do if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX && - !match_block.partial) + !md->partial) { - register const uschar *p = start_match + ((first_byte >= 0)? 1 : 0); + register USPTR p = start_match + ((first_byte >= 0)? 1 : 0); /* We don't need to repeat the search if we haven't yet reached the place we found it at last time. */ @@ -3546,11 +3845,10 @@ do those back references that we can. In this case there need not be overflow if certain parts of the pattern were not used. 
*/ - match_block.start_match = start_match; - match_block.match_call_count = 0; + md->start_match = start_match; + md->match_call_count = 0; - rc = match(start_match, match_block.start_code, 2, &match_block, ims, NULL, - match_isgroup); + rc = match(start_match, md->start_code, 2, md, ims, NULL, match_isgroup, 0); /* When the result is no match, if the subject's first character was a newline and the PCRE_FIRSTLINE option is set, break (which will return @@ -3561,10 +3859,13 @@ do if (rc == MATCH_NOMATCH) { - if (firstline && *start_match == NEWLINE) break; + if (firstline && + start_match <= md->end_subject - md->nllen && + IS_NEWLINE(start_match)) + break; start_match++; #ifdef SUPPORT_UTF8 - if (match_block.utf8) + if (md->utf8) while(start_match < end_subject && (*start_match & 0xc0) == 0x80) start_match++; #endif @@ -3584,23 +3885,23 @@ do { if (offsetcount >= 4) { - memcpy(offsets + 2, match_block.offset_vector + 2, + memcpy(offsets + 2, md->offset_vector + 2, (offsetcount - 2) * sizeof(int)); DPRINTF(("Copied offsets from temporary memory\n")); } - if (match_block.end_offset_top > offsetcount) - match_block.offset_overflow = TRUE; + if (md->end_offset_top > offsetcount) + md->offset_overflow = TRUE; DPRINTF(("Freeing temporary memory\n")); - (pcre_free)(match_block.offset_vector); + (pcre_free)(md->offset_vector); } - rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2; + rc = md->offset_overflow? 
0 : md->end_offset_top/2; if (offsetcount < 2) rc = 0; else { - offsets[0] = start_match - match_block.start_subject; - offsets[1] = match_block.end_match_ptr - match_block.start_subject; + offsets[0] = start_match - md->start_subject; + offsets[1] = md->end_match_ptr - md->start_subject; } DPRINTF((">>>> returning %d\n", rc)); @@ -3614,10 +3915,10 @@ while (!anchored && start_match <= end_subject); if (using_temporary_offsets) { DPRINTF(("Freeing temporary memory\n")); - (pcre_free)(match_block.offset_vector); + (pcre_free)(md->offset_vector); } -if (match_block.partial && match_block.hitend) +if (md->partial && md->hitend) { DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n")); return PCRE_ERROR_PARTIAL; diff --git a/libpcre/pcre_fullinfo.c b/libpcre/pcre_fullinfo.c index cd1a9a4ff..4a8edc6f4 100644 --- a/libpcre/pcre_fullinfo.c +++ b/libpcre/pcre_fullinfo.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2005 University of Cambridge + Copyright (c) 1997-2006 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -61,7 +61,7 @@ Arguments: Returns: 0 if data returned, negative on error */ -PCRE_EXPORT int +PCRE_DATA_SCOPE int pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what, void *where) { diff --git a/libpcre/pcre_get.c b/libpcre/pcre_get.c index fc4a14ac1..0588c61a7 100644 --- a/libpcre/pcre_get.c +++ b/libpcre/pcre_get.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2005 University of Cambridge + Copyright (c) 1997-2006 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -50,8 +50,8 @@ for these functions came from Scott Wimer. 
*/ * Find number for named string * *************************************************/ -/* This function is used by the two extraction functions below, as well -as being generally available. +/* This function is used by the get_first_set() function below, as well +as being generally available. It assumes that names are unique. Arguments: code the compiled regex @@ -93,6 +93,113 @@ return PCRE_ERROR_NOSUBSTRING; +/************************************************* +* Find (multiple) entries for named string * +*************************************************/ + +/* This is used by the get_first_set() function below, as well as being +generally available. It is used when duplicated names are permitted. + +Arguments: + code the compiled regex + stringname the name whose entries required + firstptr where to put the pointer to the first entry + lastptr where to put the pointer to the last entry + +Returns: the length of each entry, or a negative number + (PCRE_ERROR_NOSUBSTRING) if not found +*/ + +int +pcre_get_stringtable_entries(const pcre *code, const char *stringname, + char **firstptr, char **lastptr) +{ +int rc; +int entrysize; +int top, bot; +uschar *nametable, *lastentry; + +if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMECOUNT, &top)) != 0) + return rc; +if (top <= 0) return PCRE_ERROR_NOSUBSTRING; + +if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMEENTRYSIZE, &entrysize)) != 0) + return rc; +if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMETABLE, &nametable)) != 0) + return rc; + +lastentry = nametable + entrysize * (top - 1); +bot = 0; +while (top > bot) + { + int mid = (top + bot) / 2; + uschar *entry = nametable + entrysize*mid; + int c = strcmp(stringname, (char *)(entry + 2)); + if (c == 0) + { + uschar *first = entry; + uschar *last = entry; + while (first > nametable) + { + if (strcmp(stringname, (char *)(first - entrysize + 2)) != 0) break; + first -= entrysize; + } + while (last < lastentry) + { + if (strcmp(stringname, (char *)(last + entrysize 
+ 2)) != 0) break; + last += entrysize; + } + *firstptr = (char *)first; + *lastptr = (char *)last; + return entrysize; + } + if (c > 0) bot = mid + 1; else top = mid; + } + +return PCRE_ERROR_NOSUBSTRING; +} + + + +/************************************************* +* Find first set of multiple named strings * +*************************************************/ + +/* This function allows for duplicate names in the table of named substrings. +It returns the number of the first one that was set in a pattern match. + +Arguments: + code the compiled regex + stringname the name of the capturing substring + ovector the vector of matched substrings + +Returns: the number of the first that is set, + or the number of the last one if none are set, + or a negative number on error +*/ + +static int +get_first_set(const pcre *code, const char *stringname, int *ovector) +{ +const real_pcre *re = (const real_pcre *)code; +int entrysize; +char *first, *last; +uschar *entry; +if ((re->options & (PCRE_DUPNAMES | PCRE_JCHANGED)) == 0) + return pcre_get_stringnumber(code, stringname); +entrysize = pcre_get_stringtable_entries(code, stringname, &first, &last); +if (entrysize <= 0) return entrysize; +for (entry = (uschar *)first; entry <= (uschar *)last; entry += entrysize) + { + int n = (entry[0] << 8) + entry[1]; + if (ovector[n*2] >= 0) return n; + } +return (first[0] << 8) + first[1]; +} + + + + /************************************************* * Copy captured string to given buffer * *************************************************/ @@ -142,7 +249,8 @@ return yield; *************************************************/ /* This function copies a single captured substring into a given buffer, -identifying it by name. +identifying it by name. If the regex permits duplicate names, the first +substring that is set is chosen. 
Arguments: code the compiled regex @@ -168,7 +276,7 @@ int pcre_copy_named_substring(const pcre *code, const char *subject, int *ovector, int stringcount, const char *stringname, char *buffer, int size) { -int n = pcre_get_stringnumber(code, stringname); +int n = get_first_set(code, stringname, ovector); if (n <= 0) return n; return pcre_copy_substring(subject, ovector, stringcount, n, buffer, size); } @@ -299,7 +407,8 @@ return yield; *************************************************/ /* This function copies a single captured substring, identified by name, into -new store. +new store. If the regex permits duplicate names, the first substring that is +set is chosen. Arguments: code the compiled regex @@ -324,9 +433,10 @@ int pcre_get_named_substring(const pcre *code, const char *subject, int *ovector, int stringcount, const char *stringname, const char **stringptr) { -int n = pcre_get_stringnumber(code, stringname); +int n = get_first_set(code, stringname, ovector); if (n <= 0) return n; return pcre_get_substring(subject, ovector, stringcount, n, stringptr); + } diff --git a/libpcre/pcre_globals.c b/libpcre/pcre_globals.c index 1a839802b..f829acfb1 100644 --- a/libpcre/pcre_globals.c +++ b/libpcre/pcre_globals.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2005 University of Cambridge + Copyright (c) 1997-2006 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without diff --git a/libpcre/pcre_info.c b/libpcre/pcre_info.c index 18741b173..b318b93ea 100644 --- a/libpcre/pcre_info.c +++ b/libpcre/pcre_info.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. 
Written by Philip Hazel - Copyright (c) 1997-2005 University of Cambridge + Copyright (c) 1997-2006 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -68,7 +68,7 @@ Returns: number of capturing subpatterns or negative values on error */ -PCRE_EXPORT int +PCRE_DATA_SCOPE int pcre_info(const pcre *argument_re, int *optptr, int *first_byte) { real_pcre internal_re; diff --git a/libpcre/pcre_internal.h b/libpcre/pcre_internal.h index e2330396f..b13a4abcb 100644 --- a/libpcre/pcre_internal.h +++ b/libpcre/pcre_internal.h @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2005 University of Cambridge + Copyright (c) 1997-2006 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -121,6 +121,32 @@ Unix, where it is defined in sys/types, so use "uschar" instead. */ typedef unsigned char uschar; +/* PCRE is able to support 3 different kinds of newline (CR, LF, CRLF). The +following macro is used to package up testing for newlines. NLBLOCK is defined +in the various modules to indicate in which datablock the parameters exist. */ + +#define IS_NEWLINE(p) \ + ((p)[0] == NLBLOCK->nl[0] && \ + (NLBLOCK->nllen == 1 || (p)[1] == NLBLOCK->nl[1])) + +/* When PCRE is compiled as a C++ library, the subject pointer can be +replaced with a custom type. This makes it possible, for example, to +allow pcre_exec() to process subject strings that are discontinuous by +using a smart pointer class. It must always be possible to inspect all +of the subject string in pcre_exec() because of the way it +backtracks. Two macros are required in the normal case, for +sign-unspecified and unsigned char pointers. 
The former is used for +the external interface and appears in pcre.h, which is why its name +must begin with PCRE_. */ + +#ifdef CUSTOM_SUBJECT_PTR +#define PCRE_SPTR CUSTOM_SUBJECT_PTR +#define USPTR CUSTOM_SUBJECT_PTR +#else +#define PCRE_SPTR const char * +#define USPTR const unsigned char * +#endif + /* Include the public PCRE header and the definitions of UCP character property values. */ @@ -156,13 +182,14 @@ case in PCRE. */ #if HAVE_BCOPY #define memmove(a, b, c) bcopy(b, a, c) #else /* HAVE_BCOPY */ -void * +static void * pcre_memmove(unsigned char *dest, const unsigned char *src, size_t n) { -int i; +size_t i; dest += n; src += n; for (i = 0; i < n; ++i) *(--dest) = *(--src); +return dest; } #define memmove(a, b, c) pcre_memmove(a, b, c) #endif /* not HAVE_BCOPY */ @@ -368,16 +395,17 @@ Standard C system should have one. */ #define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL) -/* Private options flags start at the most significant end of the four bytes, -but skip the top bit so we can use ints for convenience without getting tangled -with negative values. The public options defined in pcre.h start at the least -significant end. Make sure they don't overlap! */ +/* Private options flags start at the most significant end of the four bytes. +The public options defined in pcre.h start at the least significant end. Make +sure they don't overlap! The bits are getting a bit scarce now -- when we run +out, there is a dummy word in the structure that could be used for the private +bits. 
*/ +#define PCRE_NOPARTIAL 0x80000000 /* can't use partial with this regex */ #define PCRE_FIRSTSET 0x40000000 /* first_byte is set */ #define PCRE_REQCHSET 0x20000000 /* req_byte is set */ #define PCRE_STARTLINE 0x10000000 /* start after \n for multiline */ -#define PCRE_ICHANGED 0x08000000 /* i option changes within regex */ -#define PCRE_NOPARTIAL 0x04000000 /* can't use partial with this regex */ +#define PCRE_JCHANGED 0x08000000 /* j option changes within regex */ /* Options for the "extra" block produced by pcre_study(). */ @@ -389,15 +417,17 @@ time, run time, or study time, respectively. */ #define PUBLIC_OPTIONS \ (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \ PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \ - PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE) + PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \ + PCRE_DUPNAMES|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF) #define PUBLIC_EXEC_OPTIONS \ (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \ - PCRE_PARTIAL) + PCRE_PARTIAL|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF) #define PUBLIC_DFA_EXEC_OPTIONS \ (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \ - PCRE_PARTIAL|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART) + PCRE_PARTIAL|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART|PCRE_NEWLINE_CR| \ + PCRE_NEWLINE_LF) #define PUBLIC_STUDY_OPTIONS 0 /* None defined */ @@ -456,6 +486,26 @@ ESC_n is defined as yet another macro, which is set in config.h to either \n #define ESC_tee '\t' #endif +/* Codes for different types of Unicode property */ + +#define PT_ANY 0 /* Any property - matches all chars */ +#define PT_LAMP 1 /* L& - the union of Lu, Ll, Lt */ +#define PT_GC 2 /* General characteristic (e.g. L) */ +#define PT_PC 3 /* Particular characteristic (e.g. Lu) */ +#define PT_SC 4 /* Script (e.g. 
Han) */ + +/* Flag bits and data types for the extended class (OP_XCLASS) for classes that +contain UTF-8 characters with values greater than 255. */ + +#define XCL_NOT 0x01 /* Flag: this is a negative class */ +#define XCL_MAP 0x02 /* Flag: a 32-byte map is present */ + +#define XCL_END 0 /* Marks end of individual items */ +#define XCL_SINGLE 1 /* Single item (one multibyte char) follows */ +#define XCL_RANGE 2 /* A range (two multibyte chars) follows */ +#define XCL_PROP 3 /* Unicode property (2-byte property code follows) */ +#define XCL_NOTPROP 4 /* Unicode inverted property (ditto) */ + /* These are escaped items that aren't just an encoding of a particular data value such as \n. They must have non-zero values, as check_escape() returns their negation. Also, they must appear in the same order as in the opcode @@ -471,19 +521,6 @@ enum { ESC_A = 1, ESC_G, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W, ESC_w, ESC_dum1, ESC_C, ESC_P, ESC_p, ESC_X, ESC_Z, ESC_z, ESC_E, ESC_Q, ESC_REF }; -/* Flag bits and data types for the extended class (OP_XCLASS) for classes that -contain UTF-8 characters with values greater than 255. */ - -#define XCL_NOT 0x01 /* Flag: this is a negative class */ -#define XCL_MAP 0x02 /* Flag: a 32-byte map is present */ - -#define XCL_END 0 /* Marks end of individual items */ -#define XCL_SINGLE 1 /* Single item (one multibyte char) follows */ -#define XCL_RANGE 2 /* A range (two multibyte chars) follows */ -#define XCL_PROP 3 /* Unicode property (one property code) follows */ -#define XCL_NOTPROP 4 /* Unicode inverted property (ditto) */ - - /* Opcode table: OP_BRA must be last, as all values >= it are used for brackets that extract substrings. Starting from 1 (i.e. after OP_END), the values up to OP_EOD must correspond in order to the list of escapes immediately above. 
@@ -518,7 +555,7 @@ enum { OP_DOLL, /* 20 End of line - varies with multiline switch */ OP_CHAR, /* 21 Match one character, casefully */ OP_CHARNC, /* 22 Match one character, caselessly */ - OP_NOT, /* 23 Match anything but the following char */ + OP_NOT, /* 23 Match one character, not the following one */ OP_STAR, /* 24 The maximizing and minimizing versions of */ OP_MINSTAR, /* 25 all these opcodes must come in pairs, with */ @@ -647,7 +684,7 @@ in UTF-8 mode. The code that uses this table must know about such things. */ 1, /* End */ \ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* \A, \G, \B, \B, \D, \d, \S, \s, \W, \w */ \ 1, 1, /* Any, Anybyte */ \ - 2, 2, 1, /* NOTPROP, PROP, EXTUNI */ \ + 3, 3, 1, /* NOTPROP, PROP, EXTUNI */ \ 1, 1, 2, 1, 1, /* \Z, \z, Opt, ^, $ */ \ 2, /* Char - the minimum length */ \ 2, /* Charnc - the minimum length */ \ @@ -698,7 +735,8 @@ enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20, ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, - ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47 }; + ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, + ERR50, ERR51 }; /* The real format of the start of the pcre block; the index of names and the code vector run on as long as necessary after the end. 
We store an explicit @@ -762,6 +800,8 @@ typedef struct compile_data { unsigned int backref_map; /* Bitmap of low back refs */ int req_varyopt; /* "After variable item" flag for reqbyte */ BOOL nopartial; /* Set TRUE if partial won't work */ + int nllen; /* 1 or 2 for newline string length */ + uschar nl[4]; /* Newline string */ } compile_data; /* Structure for maintaining a chain of pointers to the currently incomplete @@ -779,18 +819,18 @@ typedef struct recursion_info { struct recursion_info *prevrec; /* Previous recursion record (or NULL) */ int group_num; /* Number of group that was called */ const uschar *after_call; /* "Return value": points after the call in the expr */ - const uschar *save_start; /* Old value of md->start_match */ + USPTR save_start; /* Old value of md->start_match */ int *offset_save; /* Pointer to start of saved offsets */ int saved_max; /* Number of saved offsets */ } recursion_info; /* When compiling in a mode that doesn't use recursive calls to match(), a structure is used to remember local variables on the heap. It is defined in -pcre.c, close to the match() function, so that it is easy to keep it in step -with any changes of local variable. However, the pointer to the current frame -must be saved in some "static" place over a longjmp(). We declare the -structure here so that we can put a pointer in the match_data structure. -NOTE: This isn't used for a "normal" compilation of pcre. */ +pcre_exec.c, close to the match() function, so that it is easy to keep it in +step with any changes of local variable. However, the pointer to the current +frame must be saved in some "static" place over a longjmp(). We declare the +structure here so that we can put a pointer in the match_data structure. NOTE: +This isn't used for a "normal" compilation of pcre. */ struct heapframe; @@ -799,10 +839,13 @@ doing traditional NFA matching, so that they are thread-safe. 
*/ typedef struct match_data { unsigned long int match_call_count; /* As it says */ - unsigned long int match_limit;/* As it says */ + unsigned long int match_limit; /* As it says */ + unsigned long int match_limit_recursion; /* As it says */ int *offset_vector; /* Offset vector */ int offset_end; /* One past the end */ int offset_max; /* The maximum usable for return data */ + int nllen; /* 1 or 2 for newline string length */ + uschar nl[4]; /* Newline string */ const uschar *lcc; /* Points to lower casing table */ const uschar *ctypes; /* Points to table of type maps */ BOOL offset_overflow; /* Set if too many extractions */ @@ -814,10 +857,10 @@ typedef struct match_data { BOOL partial; /* PARTIAL flag */ BOOL hitend; /* Hit the end of the subject at some point */ const uschar *start_code; /* For use when recursing */ - const uschar *start_subject; /* Start of the subject string */ - const uschar *end_subject; /* End of the subject string */ - const uschar *start_match; /* Start of this match attempt */ - const uschar *end_match_ptr; /* Subject position at end match */ + USPTR start_subject; /* Start of the subject string */ + USPTR end_subject; /* End of the subject string */ + USPTR start_match; /* Start of this match attempt */ + USPTR end_match_ptr; /* Subject position at end match */ int end_offset_top; /* Highwater mark at end of match */ int capture_last; /* Most recent capture number */ int start_offset; /* The start offset value */ @@ -836,6 +879,8 @@ typedef struct dfa_match_data { const uschar *tables; /* Character tables */ int moptions; /* Match options */ int poptions; /* Pattern options */ + int nllen; /* 1 or 2 for newline string length */ + uschar nl[4]; /* Newline string */ void *callout_data; /* To pass back to callouts */ } dfa_match_data; @@ -872,12 +917,13 @@ total length. 
*/ #define ctypes_offset (cbits_offset + cbit_length) #define tables_length (ctypes_offset + 256) -/* Layout of the UCP type table that translates property names into codes for -pcre_ucp_findchar(). */ +/* Layout of the UCP type table that translates property names into types and +codes. */ typedef struct { const char *name; - int value; + pcre_uint16 type; + pcre_uint16 value; } ucp_type_table; @@ -908,7 +954,8 @@ sense, but are not part of the PCRE public API. */ extern int _pcre_ord2utf8(int, uschar *); extern real_pcre * _pcre_try_flipped(const real_pcre *, real_pcre *, const pcre_study_data *, pcre_study_data *); -extern int _pcre_ucp_findchar(const int, int *, int *); +extern int _pcre_ucp_findprop(const unsigned int, int *, int *); +extern int _pcre_ucp_othercase(const int); extern int _pcre_valid_utf8(const uschar *, int); extern BOOL _pcre_xclass(int, const uschar *); diff --git a/libpcre/pcre_maketables.c b/libpcre/pcre_maketables.c index c4954b264..afe63d1b1 100644 --- a/libpcre/pcre_maketables.c +++ b/libpcre/pcre_maketables.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2005 University of Cambridge + Copyright (c) 1997-2006 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -86,29 +86,22 @@ for (i = 0; i < 256; i++) *p++ = tolower(i); for (i = 0; i < 256; i++) *p++ = islower(i)? toupper(i) : tolower(i); -/* Then the character class tables. Don't try to be clever and save effort -on exclusive ones - in some locales things may be different. Note that the -table for "space" includes everything "isspace" gives, including VT in the -default locale. This makes it work for the POSIX class [:space:]. */ +/* Then the character class tables. Don't try to be clever and save effort on +exclusive ones - in some locales things may be different. 
Note that the table +for "space" includes everything "isspace" gives, including VT in the default +locale. This makes it work for the POSIX class [:space:]. Note also that it is +possible for a character to be alnum or alpha without being lower or upper, +such as "male and female ordinals" (\xAA and \xBA) in the fr_FR locale (at +least under Debian Linux's locales as of 12/2005). So we must test for alnum +specially. */ memset(p, 0, cbit_length); for (i = 0; i < 256; i++) { - if (isdigit(i)) - { - p[cbit_digit + i/8] |= 1 << (i&7); - p[cbit_word + i/8] |= 1 << (i&7); - } - if (isupper(i)) - { - p[cbit_upper + i/8] |= 1 << (i&7); - p[cbit_word + i/8] |= 1 << (i&7); - } - if (islower(i)) - { - p[cbit_lower + i/8] |= 1 << (i&7); - p[cbit_word + i/8] |= 1 << (i&7); - } + if (isdigit(i)) p[cbit_digit + i/8] |= 1 << (i&7); + if (isupper(i)) p[cbit_upper + i/8] |= 1 << (i&7); + if (islower(i)) p[cbit_lower + i/8] |= 1 << (i&7); + if (isalnum(i)) p[cbit_word + i/8] |= 1 << (i&7); if (i == '_') p[cbit_word + i/8] |= 1 << (i&7); if (isspace(i)) p[cbit_space + i/8] |= 1 << (i&7); if (isxdigit(i))p[cbit_xdigit + i/8] |= 1 << (i&7); @@ -137,7 +130,9 @@ for (i = 0; i < 256; i++) meta-character, which in this sense is any character that terminates a run of data characters. */ - if (strchr("*+?{^.$|()[", i) != 0) x += ctype_meta; *p++ = x; } + if (strchr("*+?{^.$|()[", i) != 0) x += ctype_meta; + *p++ = x; + } return yield; } diff --git a/libpcre/pcre_printint.src b/libpcre/pcre_printint.src deleted file mode 100644 index 410f92082..000000000 --- a/libpcre/pcre_printint.src +++ /dev/null @@ -1,454 +0,0 @@ -/************************************************* -* Perl-Compatible Regular Expressions * -*************************************************/ - -/* PCRE is a library of functions to support regular expressions whose syntax -and semantics are as close as possible to those of the Perl 5 language. 
- - Written by Philip Hazel - Copyright (c) 1997-2005 University of Cambridge - ------------------------------------------------------------------------------ -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of the University of Cambridge nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------ -*/ - - -/* This module contains a PCRE private debugging function for printing out the -internal form of a compiled regular expression, along with some supporting -local functions. 
This source file is used in two places: - -(1) It is #included by pcre_compile.c when it is compiled in debugging mode -(DEBUG defined in pcre_internal.h). It is not included in production compiles. - -(2) It is always #included by pcretest.c, which can be asked to print out a -compiled regex for debugging purposes. */ - - -static const char *OP_names[] = { OP_NAME_LIST }; - - -/************************************************* -* Print single- or multi-byte character * -*************************************************/ - -static int -print_char(FILE *f, uschar *ptr, BOOL utf8) -{ -int c = *ptr; - -if (!utf8 || (c & 0xc0) != 0xc0) - { - if (isprint(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c); - return 0; - } -else - { - int i; - int a = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ - int s = 6*a; - c = (c & _pcre_utf8_table3[a]) << s; - for (i = 1; i <= a; i++) - { - /* This is a check for malformed UTF-8; it should only occur if the sanity - check has been turned off. Rather than swallow random bytes, just stop if - we hit a bad one. Print it with \X instead of \x as an indication. */ - - if ((ptr[i] & 0xc0) != 0x80) - { - fprintf(f, "\\X{%x}", c); - return i - 1; - } - - /* The byte is OK */ - - s -= 6; - c |= (ptr[i] & 0x3f) << s; - } - if (c < 128) fprintf(f, "\\x%02x", c); else fprintf(f, "\\x{%x}", c); - return a; - } -} - - - -/************************************************* -* Find Unicode property name * -*************************************************/ - -static const char * -get_ucpname(int property) -{ -#ifdef SUPPORT_UCP -int i; -for (i = _pcre_utt_size; i >= 0; i--) - { - if (property == _pcre_utt[i].value) break; - } -return (i >= 0)? _pcre_utt[i].name : "??"; -#else -return "??"; -#endif -} - - - -/************************************************* -* Print compiled regex * -*************************************************/ - -/* Make this function work for a regex with integers either byte order. 
-However, we assume that what we are passed is a compiled regex. */ - -static void -pcre_printint(pcre *external_re, FILE *f) -{ -real_pcre *re = (real_pcre *)external_re; -uschar *codestart, *code; -BOOL utf8; - -unsigned int options = re->options; -int offset = re->name_table_offset; -int count = re->name_count; -int size = re->name_entry_size; - -if (re->magic_number != MAGIC_NUMBER) - { - offset = ((offset << 8) & 0xff00) | ((offset >> 8) & 0xff); - count = ((count << 8) & 0xff00) | ((count >> 8) & 0xff); - size = ((size << 8) & 0xff00) | ((size >> 8) & 0xff); - options = ((options << 24) & 0xff000000) | - ((options << 8) & 0x00ff0000) | - ((options >> 8) & 0x0000ff00) | - ((options >> 24) & 0x000000ff); - } - -code = codestart = (uschar *)re + offset + count * size; -utf8 = (options & PCRE_UTF8) != 0; - -for(;;) - { - uschar *ccode; - int c; - int extra = 0; - - fprintf(f, "%3d ", (int)(code - codestart)); - - if (*code >= OP_BRA) - { - if (*code - OP_BRA > EXTRACT_BASIC_MAX) - fprintf(f, "%3d Bra extra\n", GET(code, 1)); - else - fprintf(f, "%3d Bra %d\n", GET(code, 1), *code - OP_BRA); - code += _pcre_OP_lengths[OP_BRA]; - continue; - } - - switch(*code) - { - case OP_END: - fprintf(f, " %s\n", OP_names[*code]); - fprintf(f, "------------------------------------------------------------------\n"); - return; - - case OP_OPT: - fprintf(f, " %.2x %s", code[1], OP_names[*code]); - break; - - case OP_CHAR: - { - fprintf(f, " "); - do - { - code++; - code += 1 + print_char(f, code, utf8); - } - while (*code == OP_CHAR); - fprintf(f, "\n"); - continue; - } - break; - - case OP_CHARNC: - { - fprintf(f, " NC "); - do - { - code++; - code += 1 + print_char(f, code, utf8); - } - while (*code == OP_CHARNC); - fprintf(f, "\n"); - continue; - } - break; - - case OP_KETRMAX: - case OP_KETRMIN: - case OP_ALT: - case OP_KET: - case OP_ASSERT: - case OP_ASSERT_NOT: - case OP_ASSERTBACK: - case OP_ASSERTBACK_NOT: - case OP_ONCE: - case OP_COND: - case OP_REVERSE: - fprintf(f, 
"%3d %s", GET(code, 1), OP_names[*code]); - break; - - case OP_BRANUMBER: - printf("%3d %s", GET2(code, 1), OP_names[*code]); - break; - - case OP_CREF: - if (GET2(code, 1) == CREF_RECURSE) - fprintf(f, " Cond recurse"); - else - fprintf(f, "%3d %s", GET2(code,1), OP_names[*code]); - break; - - case OP_STAR: - case OP_MINSTAR: - case OP_PLUS: - case OP_MINPLUS: - case OP_QUERY: - case OP_MINQUERY: - case OP_TYPESTAR: - case OP_TYPEMINSTAR: - case OP_TYPEPLUS: - case OP_TYPEMINPLUS: - case OP_TYPEQUERY: - case OP_TYPEMINQUERY: - fprintf(f, " "); - if (*code >= OP_TYPESTAR) - { - fprintf(f, "%s", OP_names[code[1]]); - if (code[1] == OP_PROP || code[1] == OP_NOTPROP) - { - fprintf(f, " %s ", get_ucpname(code[2])); - extra = 1; - } - } - else extra = print_char(f, code+1, utf8); - fprintf(f, "%s", OP_names[*code]); - break; - - case OP_EXACT: - case OP_UPTO: - case OP_MINUPTO: - fprintf(f, " "); - extra = print_char(f, code+3, utf8); - fprintf(f, "{"); - if (*code != OP_EXACT) fprintf(f, ","); - fprintf(f, "%d}", GET2(code,1)); - if (*code == OP_MINUPTO) fprintf(f, "?"); - break; - - case OP_TYPEEXACT: - case OP_TYPEUPTO: - case OP_TYPEMINUPTO: - fprintf(f, " %s", OP_names[code[3]]); - if (code[3] == OP_PROP || code[3] == OP_NOTPROP) - { - fprintf(f, " %s ", get_ucpname(code[4])); - extra = 1; - } - fprintf(f, "{"); - if (*code != OP_TYPEEXACT) fprintf(f, "0,"); - fprintf(f, "%d}", GET2(code,1)); - if (*code == OP_TYPEMINUPTO) fprintf(f, "?"); - break; - - case OP_NOT: - if (isprint(c = code[1])) fprintf(f, " [^%c]", c); - else fprintf(f, " [^\\x%02x]", c); - break; - - case OP_NOTSTAR: - case OP_NOTMINSTAR: - case OP_NOTPLUS: - case OP_NOTMINPLUS: - case OP_NOTQUERY: - case OP_NOTMINQUERY: - if (isprint(c = code[1])) fprintf(f, " [^%c]", c); - else fprintf(f, " [^\\x%02x]", c); - fprintf(f, "%s", OP_names[*code]); - break; - - case OP_NOTEXACT: - case OP_NOTUPTO: - case OP_NOTMINUPTO: - if (isprint(c = code[3])) fprintf(f, " [^%c]{", c); - else fprintf(f, " 
[^\\x%02x]{", c); - if (*code != OP_NOTEXACT) fprintf(f, "0,"); - fprintf(f, "%d}", GET2(code,1)); - if (*code == OP_NOTMINUPTO) fprintf(f, "?"); - break; - - case OP_RECURSE: - fprintf(f, "%3d %s", GET(code, 1), OP_names[*code]); - break; - - case OP_REF: - fprintf(f, " \\%d", GET2(code,1)); - ccode = code + _pcre_OP_lengths[*code]; - goto CLASS_REF_REPEAT; - - case OP_CALLOUT: - fprintf(f, " %s %d %d %d", OP_names[*code], code[1], GET(code,2), - GET(code, 2 + LINK_SIZE)); - break; - - case OP_PROP: - case OP_NOTPROP: - fprintf(f, " %s %s", OP_names[*code], get_ucpname(code[1])); - break; - - /* OP_XCLASS can only occur in UTF-8 mode. However, there's no harm in - having this code always here, and it makes it less messy without all those - #ifdefs. */ - - case OP_CLASS: - case OP_NCLASS: - case OP_XCLASS: - { - int i, min, max; - BOOL printmap; - - fprintf(f, " ["); - - if (*code == OP_XCLASS) - { - extra = GET(code, 1); - ccode = code + LINK_SIZE + 1; - printmap = (*ccode & XCL_MAP) != 0; - if ((*ccode++ & XCL_NOT) != 0) fprintf(f, "^"); - } - else - { - printmap = TRUE; - ccode = code + 1; - } - - /* Print a bit map */ - - if (printmap) - { - for (i = 0; i < 256; i++) - { - if ((ccode[i/8] & (1 << (i&7))) != 0) - { - int j; - for (j = i+1; j < 256; j++) - if ((ccode[j/8] & (1 << (j&7))) == 0) break; - if (i == '-' || i == ']') fprintf(f, "\\"); - if (isprint(i)) fprintf(f, "%c", i); else fprintf(f, "\\x%02x", i); - if (--j > i) - { - if (j != i + 1) fprintf(f, "-"); - if (j == '-' || j == ']') fprintf(f, "\\"); - if (isprint(j)) fprintf(f, "%c", j); else fprintf(f, "\\x%02x", j); - } - i = j; - } - } - ccode += 32; - } - - /* For an XCLASS there is always some additional data */ - - if (*code == OP_XCLASS) - { - int ch; - while ((ch = *ccode++) != XCL_END) - { - if (ch == XCL_PROP) - { - fprintf(f, "\\p{%s}", get_ucpname(*ccode++)); - } - else if (ch == XCL_NOTPROP) - { - fprintf(f, "\\P{%s}", get_ucpname(*ccode++)); - } - else - { - ccode += 1 + print_char(f, 
ccode, TRUE); - if (ch == XCL_RANGE) - { - fprintf(f, "-"); - ccode += 1 + print_char(f, ccode, TRUE); - } - } - } - } - - /* Indicate a non-UTF8 class which was created by negation */ - - fprintf(f, "]%s", (*code == OP_NCLASS)? " (neg)" : ""); - - /* Handle repeats after a class or a back reference */ - - CLASS_REF_REPEAT: - switch(*ccode) - { - case OP_CRSTAR: - case OP_CRMINSTAR: - case OP_CRPLUS: - case OP_CRMINPLUS: - case OP_CRQUERY: - case OP_CRMINQUERY: - fprintf(f, "%s", OP_names[*ccode]); - extra += _pcre_OP_lengths[*ccode]; - break; - - case OP_CRRANGE: - case OP_CRMINRANGE: - min = GET2(ccode,1); - max = GET2(ccode,3); - if (max == 0) fprintf(f, "{%d,}", min); - else fprintf(f, "{%d,%d}", min, max); - if (*ccode == OP_CRMINRANGE) fprintf(f, "?"); - extra += _pcre_OP_lengths[*ccode]; - break; - } - } - break; - - /* Anything else is just an item with no data*/ - - default: - fprintf(f, " %s", OP_names[*code]); - break; - } - - code += _pcre_OP_lengths[*code] + extra; - fprintf(f, "\n"); - } -} - -/* End of pcre_printint.src */ diff --git a/libpcre/pcre_refcount.c b/libpcre/pcre_refcount.c index 79fde4249..e6965812e 100644 --- a/libpcre/pcre_refcount.c +++ b/libpcre/pcre_refcount.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. 
Written by Philip Hazel - Copyright (c) 1997-2005 University of Cambridge + Copyright (c) 1997-2006 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -63,7 +63,7 @@ Returns: the (possibly updated) count value (a non-negative number), or a negative error number */ -PCRE_EXPORT int +PCRE_DATA_SCOPE int pcre_refcount(pcre *argument_re, int adjust) { real_pcre *re = (real_pcre *)argument_re; diff --git a/libpcre/pcre_study.c b/libpcre/pcre_study.c index 58f241400..5091324ae 100644 --- a/libpcre/pcre_study.c +++ b/libpcre/pcre_study.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2005 University of Cambridge + Copyright (c) 1997-2006 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -95,6 +95,13 @@ set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless, { register int c; +#if 0 +/* ========================================================================= */ +/* The following comment and code was inserted in January 1999. In May 2006, +when it was observed to cause compiler warnings about unused values, I took it +out again. If anybody is still using OS/2, they will have to put it back +manually. */ + /* This next statement and the later reference to dummy are here in order to trick the optimizer of the IBM C compiler for OS/2 into generating correct code. Apparently IBM isn't going to fix the problem, and we would rather not @@ -102,6 +109,8 @@ disable optimization (in this module it actually makes a big difference, and the pcre module can use all the optimization it can get). 
*/ volatile int dummy; +/* ========================================================================= */ +#endif do { @@ -159,7 +168,11 @@ do case OP_BRAMINZERO: if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd)) return FALSE; +/* ========================================================================= + See the comment at the head of this function concerning the next line, + which was an old fudge for the benefit of OS/2. dummy = 1; + ========================================================================= */ do tcode += GET(tcode,1); while (*tcode == OP_ALT); tcode += 1+LINK_SIZE; break; @@ -215,15 +228,29 @@ do try_next = FALSE; break; + /* The cbit_space table has vertical tab as whitespace; we have to + discard it. */ + case OP_NOT_WHITESPACE: for (c = 0; c < 32; c++) - start_bits[c] |= ~cd->cbits[c+cbit_space]; + { + int d = cd->cbits[c+cbit_space]; + if (c == 1) d &= ~0x08; + start_bits[c] |= ~d; + } try_next = FALSE; break; + /* The cbit_space table has vertical tab as whitespace; we have to + discard it. */ + case OP_WHITESPACE: for (c = 0; c < 32; c++) - start_bits[c] |= cd->cbits[c+cbit_space]; + { + int d = cd->cbits[c+cbit_space]; + if (c == 1) d &= ~0x08; + start_bits[c] |= d; + } try_next = FALSE; break; @@ -277,14 +304,28 @@ do start_bits[c] |= cd->cbits[c+cbit_digit]; break; + /* The cbit_space table has vertical tab as whitespace; we have to + discard it. */ + case OP_NOT_WHITESPACE: for (c = 0; c < 32; c++) - start_bits[c] |= ~cd->cbits[c+cbit_space]; + { + int d = cd->cbits[c+cbit_space]; + if (c == 1) d &= ~0x08; + start_bits[c] |= ~d; + } break; + /* The cbit_space table has vertical tab as whitespace; we have to + discard it. 
*/ + case OP_WHITESPACE: for (c = 0; c < 32; c++) - start_bits[c] |= cd->cbits[c+cbit_space]; + { + int d = cd->cbits[c+cbit_space]; + if (c == 1) d &= ~0x08; + start_bits[c] |= d; + } break; case OP_NOT_WORDCHAR: @@ -401,17 +442,16 @@ Returns: pointer to a pcre_extra block, with study_data filled in and the NULL on error or if no optimization possible */ -PCRE_EXPORT pcre_extra * +PCRE_DATA_SCOPE pcre_extra * pcre_study(const pcre *external_re, int options, const char **errorptr) { uschar start_bits[32]; pcre_extra *extra; pcre_study_data *study; const uschar *tables; -const real_pcre *re = (const real_pcre *)external_re; -uschar *code = (uschar *)re + re->name_table_offset + - (re->name_count * re->name_entry_size); +uschar *code; compile_data compile_block; +const real_pcre *re = (const real_pcre *)external_re; *errorptr = NULL; @@ -427,6 +467,9 @@ if ((options & ~PUBLIC_STUDY_OPTIONS) != 0) return NULL; } +code = (uschar *)re + re->name_table_offset + + (re->name_count * re->name_entry_size); + /* For an anchored pattern, or an unanchored pattern that has a first char, or a multiline pattern that matches only at "line starts", no further processing at present. */ diff --git a/libpcre/pcre_tables.c b/libpcre/pcre_tables.c index f91e881c6..480257cab 100644 --- a/libpcre/pcre_tables.c +++ b/libpcre/pcre_tables.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2005 University of Cambridge + Copyright (c) 1997-2006 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -48,7 +48,7 @@ clashes with the library. */ /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that -the definition is next to the definition of the opcodes in internal.h. */ +the definition is next to the definition of the opcodes in pcre_internal.h. 
*/ const uschar _pcre_OP_lengths[] = { OP_LENGTHS }; @@ -82,47 +82,110 @@ const uschar _pcre_utf8_table4[] = { 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 }; -/* This table translates Unicode property names into code values for the -ucp_findchar() function. */ +/* This table translates Unicode property names into type and code values. It +is searched by binary chop, so must be in collating sequence of name. */ const ucp_type_table _pcre_utt[] = { - { "C", 128 + ucp_C }, - { "Cc", ucp_Cc }, - { "Cf", ucp_Cf }, - { "Cn", ucp_Cn }, - { "Co", ucp_Co }, - { "Cs", ucp_Cs }, - { "L", 128 + ucp_L }, - { "Ll", ucp_Ll }, - { "Lm", ucp_Lm }, - { "Lo", ucp_Lo }, - { "Lt", ucp_Lt }, - { "Lu", ucp_Lu }, - { "M", 128 + ucp_M }, - { "Mc", ucp_Mc }, - { "Me", ucp_Me }, - { "Mn", ucp_Mn }, - { "N", 128 + ucp_N }, - { "Nd", ucp_Nd }, - { "Nl", ucp_Nl }, - { "No", ucp_No }, - { "P", 128 + ucp_P }, - { "Pc", ucp_Pc }, - { "Pd", ucp_Pd }, - { "Pe", ucp_Pe }, - { "Pf", ucp_Pf }, - { "Pi", ucp_Pi }, - { "Po", ucp_Po }, - { "Ps", ucp_Ps }, - { "S", 128 + ucp_S }, - { "Sc", ucp_Sc }, - { "Sk", ucp_Sk }, - { "Sm", ucp_Sm }, - { "So", ucp_So }, - { "Z", 128 + ucp_Z }, - { "Zl", ucp_Zl }, - { "Zp", ucp_Zp }, - { "Zs", ucp_Zs } + { "Any", PT_ANY, 0 }, + { "Arabic", PT_SC, ucp_Arabic }, + { "Armenian", PT_SC, ucp_Armenian }, + { "Bengali", PT_SC, ucp_Bengali }, + { "Bopomofo", PT_SC, ucp_Bopomofo }, + { "Braille", PT_SC, ucp_Braille }, + { "Buginese", PT_SC, ucp_Buginese }, + { "Buhid", PT_SC, ucp_Buhid }, + { "C", PT_GC, ucp_C }, + { "Canadian_Aboriginal", PT_SC, ucp_Canadian_Aboriginal }, + { "Cc", PT_PC, ucp_Cc }, + { "Cf", PT_PC, ucp_Cf }, + { "Cherokee", PT_SC, ucp_Cherokee }, + { "Cn", PT_PC, ucp_Cn }, + { "Co", PT_PC, ucp_Co }, + { "Common", PT_SC, ucp_Common }, + { "Coptic", PT_SC, ucp_Coptic }, + { "Cs", PT_PC, ucp_Cs }, + { "Cypriot", PT_SC, ucp_Cypriot }, + { "Cyrillic", PT_SC, ucp_Cyrillic }, + { "Deseret", PT_SC, ucp_Deseret }, + { "Devanagari", PT_SC, 
ucp_Devanagari }, + { "Ethiopic", PT_SC, ucp_Ethiopic }, + { "Georgian", PT_SC, ucp_Georgian }, + { "Glagolitic", PT_SC, ucp_Glagolitic }, + { "Gothic", PT_SC, ucp_Gothic }, + { "Greek", PT_SC, ucp_Greek }, + { "Gujarati", PT_SC, ucp_Gujarati }, + { "Gurmukhi", PT_SC, ucp_Gurmukhi }, + { "Han", PT_SC, ucp_Han }, + { "Hangul", PT_SC, ucp_Hangul }, + { "Hanunoo", PT_SC, ucp_Hanunoo }, + { "Hebrew", PT_SC, ucp_Hebrew }, + { "Hiragana", PT_SC, ucp_Hiragana }, + { "Inherited", PT_SC, ucp_Inherited }, + { "Kannada", PT_SC, ucp_Kannada }, + { "Katakana", PT_SC, ucp_Katakana }, + { "Kharoshthi", PT_SC, ucp_Kharoshthi }, + { "Khmer", PT_SC, ucp_Khmer }, + { "L", PT_GC, ucp_L }, + { "L&", PT_LAMP, 0 }, + { "Lao", PT_SC, ucp_Lao }, + { "Latin", PT_SC, ucp_Latin }, + { "Limbu", PT_SC, ucp_Limbu }, + { "Linear_B", PT_SC, ucp_Linear_B }, + { "Ll", PT_PC, ucp_Ll }, + { "Lm", PT_PC, ucp_Lm }, + { "Lo", PT_PC, ucp_Lo }, + { "Lt", PT_PC, ucp_Lt }, + { "Lu", PT_PC, ucp_Lu }, + { "M", PT_GC, ucp_M }, + { "Malayalam", PT_SC, ucp_Malayalam }, + { "Mc", PT_PC, ucp_Mc }, + { "Me", PT_PC, ucp_Me }, + { "Mn", PT_PC, ucp_Mn }, + { "Mongolian", PT_SC, ucp_Mongolian }, + { "Myanmar", PT_SC, ucp_Myanmar }, + { "N", PT_GC, ucp_N }, + { "Nd", PT_PC, ucp_Nd }, + { "New_Tai_Lue", PT_SC, ucp_New_Tai_Lue }, + { "Nl", PT_PC, ucp_Nl }, + { "No", PT_PC, ucp_No }, + { "Ogham", PT_SC, ucp_Ogham }, + { "Old_Italic", PT_SC, ucp_Old_Italic }, + { "Old_Persian", PT_SC, ucp_Old_Persian }, + { "Oriya", PT_SC, ucp_Oriya }, + { "Osmanya", PT_SC, ucp_Osmanya }, + { "P", PT_GC, ucp_P }, + { "Pc", PT_PC, ucp_Pc }, + { "Pd", PT_PC, ucp_Pd }, + { "Pe", PT_PC, ucp_Pe }, + { "Pf", PT_PC, ucp_Pf }, + { "Pi", PT_PC, ucp_Pi }, + { "Po", PT_PC, ucp_Po }, + { "Ps", PT_PC, ucp_Ps }, + { "Runic", PT_SC, ucp_Runic }, + { "S", PT_GC, ucp_S }, + { "Sc", PT_PC, ucp_Sc }, + { "Shavian", PT_SC, ucp_Shavian }, + { "Sinhala", PT_SC, ucp_Sinhala }, + { "Sk", PT_PC, ucp_Sk }, + { "Sm", PT_PC, ucp_Sm }, + { "So", PT_PC, ucp_So }, + { 
"Syloti_Nagri", PT_SC, ucp_Syloti_Nagri }, + { "Syriac", PT_SC, ucp_Syriac }, + { "Tagalog", PT_SC, ucp_Tagalog }, + { "Tagbanwa", PT_SC, ucp_Tagbanwa }, + { "Tai_Le", PT_SC, ucp_Tai_Le }, + { "Tamil", PT_SC, ucp_Tamil }, + { "Telugu", PT_SC, ucp_Telugu }, + { "Thaana", PT_SC, ucp_Thaana }, + { "Thai", PT_SC, ucp_Thai }, + { "Tibetan", PT_SC, ucp_Tibetan }, + { "Tifinagh", PT_SC, ucp_Tifinagh }, + { "Ugaritic", PT_SC, ucp_Ugaritic }, + { "Yi", PT_SC, ucp_Yi }, + { "Z", PT_GC, ucp_Z }, + { "Zl", PT_PC, ucp_Zl }, + { "Zp", PT_PC, ucp_Zp }, + { "Zs", PT_PC, ucp_Zs } }; const int _pcre_utt_size = sizeof(_pcre_utt)/sizeof(ucp_type_table); diff --git a/libpcre/pcre_try_flipped.c b/libpcre/pcre_try_flipped.c index 536eb05a4..00c94fccf 100644 --- a/libpcre/pcre_try_flipped.c +++ b/libpcre/pcre_try_flipped.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2005 University of Cambridge + Copyright (c) 1997-2006 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -62,8 +62,8 @@ Arguments: Returns: the flipped value */ -static long int -byteflip(long int value, int n) +static unsigned long int +byteflip(unsigned long int value, int n) { if (n == 2) return ((value & 0x00ff) << 8) | ((value & 0xff00) >> 8); return ((value & 0x000000ff) << 24) | @@ -94,7 +94,7 @@ Returns: the new block if is is indeed a byte-flipped regex NULL if it is not */ -PCRE_EXPORT real_pcre * +real_pcre * _pcre_try_flipped(const real_pcre *re, real_pcre *internal_re, const pcre_study_data *study, pcre_study_data *internal_study) { diff --git a/libpcre/pcre_version.c b/libpcre/pcre_version.c index 2d3080a70..c2aad4da0 100644 --- a/libpcre/pcre_version.c +++ b/libpcre/pcre_version.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. 
Written by Philip Hazel - Copyright (c) 1997-2005 University of Cambridge + Copyright (c) 1997-2006 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -52,10 +52,13 @@ string that identifies the PCRE version that is in use. */ #define STRING(a) # a #define XSTRING(s) STRING(s) -PCRE_EXPORT const char * +PCRE_DATA_SCOPE const char * pcre_version(void) { -return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE); +return XSTRING(PCRE_MAJOR) + "." XSTRING(PCRE_MINOR) + XSTRING(PCRE_PRERELEASE) + " " XSTRING(PCRE_DATE); } /* End of pcre_version.c */ diff --git a/libpcre/pcre_xclass.c b/libpcre/pcre_xclass.c index 4a4f895f0..57c514b78 100644 --- a/libpcre/pcre_xclass.c +++ b/libpcre/pcre_xclass.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2005 University of Cambridge + Copyright (c) 1997-2006 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -60,7 +60,7 @@ Arguments: Returns: TRUE if character matches, else FALSE */ -PCRE_EXPORT BOOL +BOOL _pcre_xclass(int c, const uschar *data) { int t; @@ -100,17 +100,40 @@ while ((t = *data++) != XCL_END) #ifdef SUPPORT_UCP else /* XCL_PROP & XCL_NOTPROP */ { - int chartype, othercase; - int rqdtype = *data++; - int category = _pcre_ucp_findchar(c, &chartype, &othercase); - if (rqdtype >= 128) + int chartype, script; + int category = _pcre_ucp_findprop(c, &chartype, &script); + + switch(*data) { - if ((rqdtype - 128 == category) == (t == XCL_PROP)) return !negated; - } - else - { - if ((rqdtype == chartype) == (t == XCL_PROP)) return !negated; + case PT_ANY: + if (t == XCL_PROP) return !negated; + break; + + case PT_LAMP: + if ((chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt) 
== + (t == XCL_PROP)) return !negated; + break; + + case PT_GC: + if ((data[1] == category) == (t == XCL_PROP)) return !negated; + break; + + case PT_PC: + if ((data[1] == chartype) == (t == XCL_PROP)) return !negated; + break; + + case PT_SC: + if ((data[1] == script) == (t == XCL_PROP)) return !negated; + break; + + /* This should never occur, but compilers may mutter if there is no + default. */ + + default: + return FALSE; } + + data += 2; } #endif /* SUPPORT_UCP */ } diff --git a/libpcre/pcreposix.c b/libpcre/pcreposix.c index c8f25ad3a..79397f029 100644 --- a/libpcre/pcreposix.c +++ b/libpcre/pcreposix.c @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2005 University of Cambridge + Copyright (c) 1997-2006 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -77,7 +77,7 @@ static const int eint[] = { REG_ASSERT, /* internal error: code overflow */ REG_BADPAT, /* unrecognized character after (?< */ REG_BADPAT, /* lookbehind assertion is not fixed length */ - REG_BADPAT, /* malformed number after (?( */ + REG_BADPAT, /* malformed number or name after (?( */ REG_BADPAT, /* conditional group containe more than two branches */ REG_BADPAT, /* assertion expected after (?( */ REG_BADPAT, /* (?R or (?digits must be followed by ) */ @@ -94,11 +94,15 @@ static const int eint[] = { REG_BADPAT, /* recursive call could loop indefinitely */ REG_BADPAT, /* unrecognized character after (?P */ REG_BADPAT, /* syntax error after (?P */ - REG_BADPAT, /* two named groups have the same name */ + REG_BADPAT, /* two named subpatterns have the same name */ REG_BADPAT, /* invalid UTF-8 string */ REG_BADPAT, /* support for \P, \p, and \X has not been compiled */ REG_BADPAT, /* malformed \P or \p sequence */ - REG_BADPAT /* unknown property name after \P or \p */ + REG_BADPAT, /* unknown 
property name after \P or \p */ + REG_BADPAT, /* subpattern name is too long (maximum 32 characters) */ + REG_BADPAT, /* too many named subpatterns (maximum 10,000) */ + REG_BADPAT, /* repeated subpattern is too long */ + REG_BADPAT /* octal value is greater than \377 (not in UTF-8 mode) */ }; /* Table of texts corresponding to POSIX error codes */ @@ -131,7 +135,7 @@ static const char *const pstring[] = { * Translate error code to string * *************************************************/ -PCRE_EXPORT size_t +PCRE_DATA_SCOPE size_t regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size) { const char *message, *addmessage; @@ -166,7 +170,7 @@ return length + addlength; * Free store held by a regex * *************************************************/ -PCRE_EXPORT void +PCRE_DATA_SCOPE void regfree(regex_t *preg) { (pcre_free)(preg->re_pcre); @@ -189,7 +193,7 @@ Returns: 0 on success various non-zero codes on failure */ -PCRE_EXPORT int +PCRE_DATA_SCOPE int regcomp(regex_t *preg, const char *pattern, int cflags) { const char *errorptr; @@ -200,6 +204,8 @@ int options = 0; if ((cflags & REG_ICASE) != 0) options |= PCRE_CASELESS; if ((cflags & REG_NEWLINE) != 0) options |= PCRE_MULTILINE; if ((cflags & REG_DOTALL) != 0) options |= PCRE_DOTALL; +if ((cflags & REG_NOSUB) != 0) options |= PCRE_NO_AUTO_CAPTURE; +if ((cflags & REG_UTF8) != 0) options |= PCRE_UTF8; preg->re_pcre = pcre_compile2(pattern, options, &errorcode, &errorptr, &erroffset, NULL); @@ -223,9 +229,13 @@ substring, so we have to get and release working store instead of just using the POSIX structures as was done in earlier releases when PCRE needed only 2 ints. However, if the number of possible capturing brackets is small, use a block of store on the stack, to reduce the use of malloc/free. The threshold is -in a macro that can be changed at configure time. */ +in a macro that can be changed at configure time. 
-PCRE_EXPORT int +If REG_NOSUB was specified at compile time, the PCRE_NO_AUTO_CAPTURE flag will +be set. When this is the case, the nmatch and pmatch arguments are ignored, and +the only result is yes/no/error. */ + +PCRE_DATA_SCOPE int regexec(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags) { @@ -234,13 +244,20 @@ int options = 0; int *ovector = NULL; int small_ovector[POSIX_MALLOC_THRESHOLD * 3]; BOOL allocated_ovector = FALSE; +BOOL nosub = + (((const pcre *)preg->re_pcre)->options & PCRE_NO_AUTO_CAPTURE) != 0; if ((eflags & REG_NOTBOL) != 0) options |= PCRE_NOTBOL; if ((eflags & REG_NOTEOL) != 0) options |= PCRE_NOTEOL; ((regex_t *)preg)->re_erroffset = (size_t)(-1); /* Only has meaning after compile */ -if (nmatch > 0) +/* When no string data is being returned, ensure that nmatch is zero. +Otherwise, ensure the vector for holding the return data is large enough. */ + +if (nosub) nmatch = 0; + +else if (nmatch > 0) { if (nmatch <= POSIX_MALLOC_THRESHOLD) { @@ -248,6 +265,7 @@ if (nmatch > 0) } else { + if (nmatch > INT_MAX/(sizeof(int) * 3)) return REG_ESPACE; ovector = (int *)malloc(sizeof(int) * nmatch * 3); if (ovector == NULL) return REG_ESPACE; allocated_ovector = TRUE; @@ -262,6 +280,8 @@ if (rc == 0) rc = nmatch; /* All captured slots were filled in */ if (rc >= 0) { size_t i; + if (!nosub) + { for (i = 0; i < (size_t)rc; i++) { pmatch[i].rm_so = ovector[i*2]; @@ -269,6 +289,7 @@ if (rc >= 0) } if (allocated_ovector) free(ovector); for (; i < nmatch; i++) pmatch[i].rm_so = pmatch[i].rm_eo = -1; + } return 0; } diff --git a/libpcre/pcreposix.h b/libpcre/pcreposix.h index 4f1b1abd4..31ee03749 100644 --- a/libpcre/pcreposix.h +++ b/libpcre/pcreposix.h @@ -9,7 +9,7 @@ Compatible Regular Expression library. It defines the things POSIX says should be there. I hope. 
- Copyright (c) 1997-2005 University of Cambridge + Copyright (c) 1997-2006 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -50,22 +50,20 @@ POSSIBILITY OF SUCH DAMAGE. extern "C" { #endif -/* Options defined by POSIX. */ +/* Options, mostly defined by POSIX, but with a couple of extras. */ -#define REG_ICASE 0x01 -#define REG_NEWLINE 0x02 -#define REG_NOTBOL 0x04 -#define REG_NOTEOL 0x08 +#define REG_ICASE 0x0001 +#define REG_NEWLINE 0x0002 +#define REG_NOTBOL 0x0004 +#define REG_NOTEOL 0x0008 +#define REG_DOTALL 0x0010 /* NOT defined by POSIX. */ +#define REG_NOSUB 0x0020 +#define REG_UTF8 0x0040 /* NOT defined by POSIX. */ -/* Additional options, not defined by POSIX, but somebody wanted them. */ - -#define REG_DOTALL 0x10 - -/* These are not used by PCRE, but by defining them we make it easier +/* This is not used by PCRE, but by defining it we make it easier to slot PCRE into existing programs that make POSIX calls. */ #define REG_EXTENDED 0 -#define REG_NOSUB 0 /* Error values. Not all these are relevant or used by the wrapper. */ @@ -107,12 +105,40 @@ typedef struct { regoff_t rm_eo; } regmatch_t; +/* Win32 uses DLL by default; it needs special stuff for exported functions +when building PCRE. */ + +#ifndef PCRE_DATA_SCOPE +#ifdef _WIN32 +# ifdef PCRE_DEFINITION +# ifdef DLL_EXPORT +# define PCRE_DATA_SCOPE __declspec(dllexport) +# endif +# else +# ifndef PCRE_STATIC +# define PCRE_DATA_SCOPE extern __declspec(dllimport) +# endif +# endif +#endif +#endif + +/* Otherwise, we use the standard "extern". 
*/ + +#ifndef PCRE_DATA_SCOPE +# ifdef __cplusplus +# define PCRE_DATA_SCOPE extern "C" +# else +# define PCRE_DATA_SCOPE extern +# endif +#endif + /* The functions */ -extern int regcomp(regex_t *, const char *, int); -extern int regexec(const regex_t *, const char *, size_t, regmatch_t *, int); -extern size_t regerror(int, const regex_t *, char *, size_t); -extern void regfree(regex_t *); +PCRE_DATA_SCOPE int regcomp(regex_t *, const char *, int); +PCRE_DATA_SCOPE int regexec(const regex_t *, const char *, size_t, + regmatch_t *, int); +PCRE_DATA_SCOPE size_t regerror(int, const regex_t *, char *, size_t); +PCRE_DATA_SCOPE void regfree(regex_t *); #ifdef __cplusplus } /* extern "C" */ diff --git a/libpcre/ucp.h b/libpcre/ucp.h index b216e0e5e..1ca23c9c1 100644 --- a/libpcre/ucp.h +++ b/libpcre/ucp.h @@ -1,12 +1,14 @@ /************************************************* -* libucp - Unicode Property Table handler * +* Unicode Property Table handler * *************************************************/ - #ifndef _UCP_H #define _UCP_H -/* These are the character categories that are returned by ucp_findchar */ +/* This file contains definitions of the property values that are returned by +the function _pcre_ucp_findprop(). */ + +/* These are the general character categories. */ enum { ucp_C, /* Other */ @@ -18,7 +20,7 @@ enum { ucp_Z /* Separator */ }; -/* These are the detailed character types that are returned by ucp_findchar */ +/* These are the particular character types. */ enum { ucp_Cc, /* Control */ @@ -53,6 +55,72 @@ enum { ucp_Zs /* Space separator */ }; +/* These are the script identifications. 
*/ + +enum { + ucp_Arabic, + ucp_Armenian, + ucp_Bengali, + ucp_Bopomofo, + ucp_Braille, + ucp_Buginese, + ucp_Buhid, + ucp_Canadian_Aboriginal, + ucp_Cherokee, + ucp_Common, + ucp_Coptic, + ucp_Cypriot, + ucp_Cyrillic, + ucp_Deseret, + ucp_Devanagari, + ucp_Ethiopic, + ucp_Georgian, + ucp_Glagolitic, + ucp_Gothic, + ucp_Greek, + ucp_Gujarati, + ucp_Gurmukhi, + ucp_Han, + ucp_Hangul, + ucp_Hanunoo, + ucp_Hebrew, + ucp_Hiragana, + ucp_Inherited, + ucp_Kannada, + ucp_Katakana, + ucp_Kharoshthi, + ucp_Khmer, + ucp_Lao, + ucp_Latin, + ucp_Limbu, + ucp_Linear_B, + ucp_Malayalam, + ucp_Mongolian, + ucp_Myanmar, + ucp_New_Tai_Lue, + ucp_Ogham, + ucp_Old_Italic, + ucp_Old_Persian, + ucp_Oriya, + ucp_Osmanya, + ucp_Runic, + ucp_Shavian, + ucp_Sinhala, + ucp_Syloti_Nagri, + ucp_Syriac, + ucp_Tagalog, + ucp_Tagbanwa, + ucp_Tai_Le, + ucp_Tamil, + ucp_Telugu, + ucp_Thaana, + ucp_Thai, + ucp_Tibetan, + ucp_Tifinagh, + ucp_Ugaritic, + ucp_Yi +}; + #endif /* End of ucp.h */