diff --git a/ChangeLog b/ChangeLog index ce57091..9d61911 100644 --- a/ChangeLog +++ b/ChangeLog @@ -216,6 +216,9 @@ unit". Previously only non-anchored patterns did this. 49. Update extended grapheme breaking rules to the latest set that are in Unicode Standard Annex #29. +50. Added experimental foreign pattern conversion facilities +(pcre2_pattern_convert() and friends). + Version 10.23 14-February-2017 ------------------------------ diff --git a/Makefile.am b/Makefile.am index 56d3434..bbf23b8 100644 --- a/Makefile.am +++ b/Makefile.am @@ -36,6 +36,10 @@ dist_html_DATA = \ doc/html/pcre2_compile_context_create.html \ doc/html/pcre2_compile_context_free.html \ doc/html/pcre2_config.html \ + doc/html/pcre2_convert_context_copy.html \ + doc/html/pcre2_convert_context_create.html \ + doc/html/pcre2_convert_context_free.html \ + doc/html/pcre2_converted_pattern_free.html \ doc/html/pcre2_dfa_match.html \ doc/html/pcre2_general_context_copy.html \ doc/html/pcre2_general_context_create.html \ @@ -59,6 +63,7 @@ dist_html_DATA = \ doc/html/pcre2_match_data_create.html \ doc/html/pcre2_match_data_create_from_pattern.html \ doc/html/pcre2_match_data_free.html \ + doc/html/pcre2_pattern_convert.html \ doc/html/pcre2_pattern_info.html \ doc/html/pcre2_serialize_decode.html \ doc/html/pcre2_serialize_encode.html \ @@ -70,6 +75,8 @@ dist_html_DATA = \ doc/html/pcre2_set_compile_extra_options.html \ doc/html/pcre2_set_compile_recursion_guard.html \ doc/html/pcre2_set_depth_limit.html \ + doc/html/pcre2_set_glob_escape.html \ + doc/html/pcre2_set_glob_separator.html \ doc/html/pcre2_set_heap_limit.html \ doc/html/pcre2_set_match_limit.html \ doc/html/pcre2_set_max_pattern_length.html \ @@ -94,6 +101,7 @@ dist_html_DATA = \ doc/html/pcre2build.html \ doc/html/pcre2callout.html \ doc/html/pcre2compat.html \ + doc/html/pcre2convert.html \ doc/html/pcre2demo.html \ doc/html/pcre2grep.html \ doc/html/pcre2jit.html \ @@ -121,6 +129,10 @@ dist_man_MANS = \ 
doc/pcre2_compile_context_create.3 \ doc/pcre2_compile_context_free.3 \ doc/pcre2_config.3 \ + doc/pcre2_convert_context_copy.3 \ + doc/pcre2_convert_context_create.3 \ + doc/pcre2_convert_context_free.3 \ + doc/pcre2_converted_pattern_free.3 \ doc/pcre2_dfa_match.3 \ doc/pcre2_general_context_copy.3 \ doc/pcre2_general_context_create.3 \ @@ -144,6 +156,7 @@ dist_man_MANS = \ doc/pcre2_match_data_create.3 \ doc/pcre2_match_data_create_from_pattern.3 \ doc/pcre2_match_data_free.3 \ + doc/pcre2_pattern_convert.3 \ doc/pcre2_pattern_info.3 \ doc/pcre2_serialize_decode.3 \ doc/pcre2_serialize_encode.3 \ @@ -155,6 +168,8 @@ dist_man_MANS = \ doc/pcre2_set_compile_extra_options.3 \ doc/pcre2_set_compile_recursion_guard.3 \ doc/pcre2_set_depth_limit.3 \ + doc/pcre2_set_glob_escape.3 \ + doc/pcre2_set_glob_separator.3 \ doc/pcre2_set_heap_limit.3 \ doc/pcre2_set_match_limit.3 \ doc/pcre2_set_max_pattern_length.3 \ @@ -179,6 +194,7 @@ dist_man_MANS = \ doc/pcre2build.3 \ doc/pcre2callout.3 \ doc/pcre2compat.3 \ + doc/pcre2convert.3 \ doc/pcre2demo.3 \ doc/pcre2grep.1 \ doc/pcre2jit.3 \ diff --git a/doc/html/index.html b/doc/html/index.html index 2a373f5..b9393d9 100644 --- a/doc/html/index.html +++ b/doc/html/index.html @@ -35,6 +35,9 @@ first. pcre2compat   Compability with Perl +pcre2convert +   Experimental foreign pattern conversion functions + pcre2demo   A demonstration C program that uses the PCRE2 library @@ -112,6 +115,18 @@ in the library. pcre2_config   Show build-time configuration options +pcre2_convert_context_copy +   Copy a convert context + +pcre2_convert_context_create +   Create a convert context + +pcre2_convert_context_free +   Free a convert context + +pcre2_converted_pattern_free +   Free converted foreign pattern + pcre2_dfa_match   Match a compiled pattern to a subject string (DFA algorithm; not Perl compatible) @@ -183,6 +198,9 @@ in the library. 
pcre2_match_data_free   Free a match data block +pcre2_pattern_convert +   Experimental foreign pattern converter + pcre2_pattern_info   Extract information about a pattern @@ -216,6 +234,12 @@ in the library. pcre2_set_depth_limit   Set the match backtracking depth limit +pcre2_set_glob_escape +   Set glob escape character + +pcre2_set_glob_separator +   Set glob separator character + pcre2_set_heap_limit   Set the match backtracking heap limit diff --git a/doc/html/pcre2_convert_context_copy.html b/doc/html/pcre2_convert_context_copy.html new file mode 100644 index 0000000..3c44ac6 --- /dev/null +++ b/doc/html/pcre2_convert_context_copy.html @@ -0,0 +1,40 @@ + + +pcre2_convert_context_copy specification + + +

pcre2_convert_context_copy man page

+

+Return to the PCRE2 index page. +

+

+This page is part of the PCRE2 HTML documentation. It was generated +automatically from the original man page. If there is any nonsense in it, +please consult the man page, in case the conversion went wrong. +
+
+SYNOPSIS +
+

+#include <pcre2.h> +

+

+pcre2_convert_context *pcre2_convert_context_copy( + pcre2_convert_context *cvcontext); +

+
+DESCRIPTION +
+

+This function is part of an experimental set of pattern conversion functions. +It makes a new copy of a convert context, using the memory allocation function +that was used for the original context. The result is NULL if the memory cannot +be obtained. +

+

+The pattern conversion functions are described in the +pcre2convert +documentation. +

+Return to the PCRE2 index page. +

diff --git a/doc/html/pcre2_convert_context_create.html b/doc/html/pcre2_convert_context_create.html new file mode 100644 index 0000000..2564780 --- /dev/null +++ b/doc/html/pcre2_convert_context_create.html @@ -0,0 +1,41 @@ + + +pcre2_convert_context_create specification + + +

pcre2_convert_context_create man page

+

+Return to the PCRE2 index page. +

+

+This page is part of the PCRE2 HTML documentation. It was generated +automatically from the original man page. If there is any nonsense in it, +please consult the man page, in case the conversion went wrong. +
+
+SYNOPSIS +
+

+#include <pcre2.h> +

+

+pcre2_convert_context *pcre2_convert_context_create( + pcre2_general_context *gcontext); +

+
+DESCRIPTION +
+

+This function is part of an experimental set of pattern conversion functions. +It creates and initializes a new convert context. If its argument is +NULL, malloc() is used to get the necessary memory; otherwise the memory +allocation function within the general context is used. The result is NULL if +the memory could not be obtained. +

+

+The pattern conversion functions are described in the +pcre2convert +documentation. +

+Return to the PCRE2 index page. +

diff --git a/doc/html/pcre2_convert_context_free.html b/doc/html/pcre2_convert_context_free.html new file mode 100644 index 0000000..ab6db6c --- /dev/null +++ b/doc/html/pcre2_convert_context_free.html @@ -0,0 +1,39 @@ + + +pcre2_convert_context_free specification + + +

pcre2_convert_context_free man page

+

+Return to the PCRE2 index page. +

+

+This page is part of the PCRE2 HTML documentation. It was generated +automatically from the original man page. If there is any nonsense in it, +please consult the man page, in case the conversion went wrong. +
+
+SYNOPSIS +
+

+#include <pcre2.h> +

+

+void pcre2_convert_context_free(pcre2_convert_context *cvcontext); +

+
+DESCRIPTION +
+

+This function is part of an experimental set of pattern conversion functions. +It frees the memory occupied by a convert context, using the memory +freeing function from the general context with which it was created, or +free() if that was not set. +

+

+The pattern conversion functions are described in the +pcre2convert +documentation. +

+Return to the PCRE2 index page. +

diff --git a/doc/html/pcre2_converted_pattern_free.html b/doc/html/pcre2_converted_pattern_free.html new file mode 100644 index 0000000..961f04f --- /dev/null +++ b/doc/html/pcre2_converted_pattern_free.html @@ -0,0 +1,39 @@ + + +pcre2_converted_pattern_free specification + + +

pcre2_converted_pattern_free man page

+

+Return to the PCRE2 index page. +

+

+This page is part of the PCRE2 HTML documentation. It was generated +automatically from the original man page. If there is any nonsense in it, +please consult the man page, in case the conversion went wrong. +
+
+SYNOPSIS +
+

+#include <pcre2.h> +

+

+void pcre2_converted_pattern_free(PCRE2_UCHAR *converted_pattern); +

+
+DESCRIPTION +
+

+This function is part of an experimental set of pattern conversion functions. +It frees the memory occupied by a converted pattern that was obtained by +calling pcre2_pattern_convert() with arguments that caused it to place +the converted pattern into newly obtained heap memory. +

+

+The pattern conversion functions are described in the +pcre2convert +documentation. +

+Return to the PCRE2 index page. +

diff --git a/doc/html/pcre2_pattern_convert.html b/doc/html/pcre2_pattern_convert.html new file mode 100644 index 0000000..2fcd7cc --- /dev/null +++ b/doc/html/pcre2_pattern_convert.html @@ -0,0 +1,70 @@ + + +pcre2_pattern_convert specification + + +

pcre2_pattern_convert man page

+

+Return to the PCRE2 index page. +

+

+This page is part of the PCRE2 HTML documentation. It was generated +automatically from the original man page. If there is any nonsense in it, +please consult the man page, in case the conversion went wrong. +
+
+SYNOPSIS +
+

+#include <pcre2.h> +

+

+int pcre2_pattern_convert(PCRE2_SPTR pattern, PCRE2_SIZE length, + uint32_t options, PCRE2_UCHAR **buffer, + PCRE2_SIZE *blength, pcre2_convert_context *cvcontext); +

+
+DESCRIPTION +
+

+This function is part of an experimental set of pattern conversion functions. +It converts a foreign pattern (for example, a glob) into a PCRE2 regular +expression pattern. Its arguments are: +

+  pattern     The foreign pattern
+  length      The length of the input pattern or PCRE2_ZERO_TERMINATED
+  options     Option bits
+  buffer      Pointer to pointer to output buffer, or NULL
+  blength     Pointer to output length field
+  cvcontext   Pointer to a convert context or NULL
+
+The length of the converted pattern (excluding the terminating zero) is +returned via blength. If buffer is NULL, the function just returns +the output length. If buffer points to a NULL pointer, heap memory is +obtained for the converted pattern, using the allocator in the context if +present (or else malloc()), and the field pointed to by buffer is +updated. If buffer points to a non-NULL field, that must point to a +buffer whose size is in the variable pointed to by blength. This value is +updated. +

+

+The option bits are: +

+  PCRE2_CONVERT_UTF                     Input is UTF
+  PCRE2_CONVERT_NO_UTF_CHECK            Do not check UTF validity
+  PCRE2_CONVERT_POSIX_BASIC             Convert POSIX basic pattern
+  PCRE2_CONVERT_POSIX_EXTENDED          Convert POSIX extended pattern
+  PCRE2_CONVERT_GLOB                    ) Convert
+  PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR  )   various types
+  PCRE2_CONVERT_GLOB_NO_STARSTAR        )     of glob
+
+The return value from pcre2_pattern_convert() is zero on success or a +non-zero PCRE2 error code. +

+

+The pattern conversion functions are described in the +pcre2convert +documentation. +

+Return to the PCRE2 index page. +

diff --git a/doc/html/pcre2_set_glob_escape.html b/doc/html/pcre2_set_glob_escape.html new file mode 100644 index 0000000..2b55627 --- /dev/null +++ b/doc/html/pcre2_set_glob_escape.html @@ -0,0 +1,43 @@ + + +pcre2_set_glob_escape specification + + +

pcre2_set_glob_escape man page

+

+Return to the PCRE2 index page. +

+

+This page is part of the PCRE2 HTML documentation. It was generated +automatically from the original man page. If there is any nonsense in it, +please consult the man page, in case the conversion went wrong. +
+
+SYNOPSIS +
+

+#include <pcre2.h> +

+

+int pcre2_set_glob_escape(pcre2_convert_context *cvcontext, + uint32_t escape_char); +

+
+DESCRIPTION +
+

+This function is part of an experimental set of pattern conversion functions. +It sets the escape character that is used when converting globs. The second +argument must either be zero (meaning there is no escape character) or a +punctuation character whose code point is less than 256. The default is grave +accent if running under Windows, otherwise backslash. The result of the +function is zero for success or PCRE2_ERROR_BADDATA if the second argument is +invalid. +

+

+The pattern conversion functions are described in the +pcre2convert +documentation. +

+Return to the PCRE2 index page. +

diff --git a/doc/html/pcre2_set_glob_separator.html b/doc/html/pcre2_set_glob_separator.html new file mode 100644 index 0000000..538748d --- /dev/null +++ b/doc/html/pcre2_set_glob_separator.html @@ -0,0 +1,42 @@ + + +pcre2_set_glob_separator specification + + +

pcre2_set_glob_separator man page

+

+Return to the PCRE2 index page. +

+

+This page is part of the PCRE2 HTML documentation. It was generated +automatically from the original man page. If there is any nonsense in it, +please consult the man page, in case the conversion went wrong. +
+
+SYNOPSIS +
+

+#include <pcre2.h> +

+

+int pcre2_set_glob_separator(pcre2_convert_context *cvcontext, + uint32_t separator_char); +

+
+DESCRIPTION +
+

+This function is part of an experimental set of pattern conversion functions. +It sets the component separator character that is used when converting globs. +The second argument must be one of the characters forward slash, backslash, or +dot. The default is backslash when running under Windows, otherwise forward +slash. The result of the function is zero for success or PCRE2_ERROR_BADDATA if +the second argument is invalid. +

+

+The pattern conversion functions are described in the +pcre2convert +documentation. +

+Return to the PCRE2 index page. +

diff --git a/doc/html/pcre2api.html b/doc/html/pcre2api.html index 1fb5738..67c5802 100644 --- a/doc/html/pcre2api.html +++ b/doc/html/pcre2api.html @@ -24,37 +24,38 @@ please consult the man page, in case the conversion went wrong.
  • PCRE2 NATIVE API SERIALIZATION FUNCTIONS
  • PCRE2 NATIVE API AUXILIARY FUNCTIONS
  • PCRE2 NATIVE API OBSOLETE FUNCTIONS -
  • PCRE2 8-BIT, 16-BIT, AND 32-BIT LIBRARIES -
  • PCRE2 API OVERVIEW -
  • STRING LENGTHS AND OFFSETS -
  • NEWLINES -
  • MULTITHREADING -
  • PCRE2 CONTEXTS -
  • CHECKING BUILD-TIME OPTIONS -
  • COMPILING A PATTERN -
  • COMPILATION ERROR CODES -
  • JUST-IN-TIME (JIT) COMPILATION -
  • LOCALE SUPPORT -
  • INFORMATION ABOUT A COMPILED PATTERN -
  • INFORMATION ABOUT A PATTERN'S CALLOUTS -
  • SERIALIZATION AND PRECOMPILING -
  • THE MATCH DATA BLOCK -
  • MATCHING A PATTERN: THE TRADITIONAL FUNCTION -
  • NEWLINE HANDLING WHEN MATCHING -
  • HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS -
  • OTHER INFORMATION ABOUT A MATCH -
  • ERROR RETURNS FROM pcre2_match() -
  • OBTAINING A TEXTUAL ERROR MESSAGE -
  • EXTRACTING CAPTURED SUBSTRINGS BY NUMBER -
  • EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS -
  • EXTRACTING CAPTURED SUBSTRINGS BY NAME -
  • CREATING A NEW STRING WITH SUBSTITUTIONS -
  • DUPLICATE SUBPATTERN NAMES -
  • FINDING ALL POSSIBLE MATCHES AT ONE POSITION -
  • MATCHING A PATTERN: THE ALTERNATIVE FUNCTION -
  • SEE ALSO -
  • AUTHOR -
  • REVISION +
  • PCRE2 EXPERIMENTAL PATTERN CONVERSION FUNCTIONS +
  • PCRE2 8-BIT, 16-BIT, AND 32-BIT LIBRARIES +
  • PCRE2 API OVERVIEW +
  • STRING LENGTHS AND OFFSETS +
  • NEWLINES +
  • MULTITHREADING +
  • PCRE2 CONTEXTS +
  • CHECKING BUILD-TIME OPTIONS +
  • COMPILING A PATTERN +
  • COMPILATION ERROR CODES +
  • JUST-IN-TIME (JIT) COMPILATION +
  • LOCALE SUPPORT +
  • INFORMATION ABOUT A COMPILED PATTERN +
  • INFORMATION ABOUT A PATTERN'S CALLOUTS +
  • SERIALIZATION AND PRECOMPILING +
  • THE MATCH DATA BLOCK +
  • MATCHING A PATTERN: THE TRADITIONAL FUNCTION +
  • NEWLINE HANDLING WHEN MATCHING +
  • HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS +
  • OTHER INFORMATION ABOUT A MATCH +
  • ERROR RETURNS FROM pcre2_match() +
  • OBTAINING A TEXTUAL ERROR MESSAGE +
  • EXTRACTING CAPTURED SUBSTRINGS BY NUMBER +
  • EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS +
  • EXTRACTING CAPTURED SUBSTRINGS BY NAME +
  • CREATING A NEW STRING WITH SUBSTITUTIONS +
  • DUPLICATE SUBPATTERN NAMES +
  • FINDING ALL POSSIBLE MATCHES AT ONE POSITION +
  • MATCHING A PATTERN: THE ALTERNATIVE FUNCTION +
  • SEE ALSO +
  • AUTHOR +
  • REVISION

    #include <pcre2.h> @@ -334,7 +335,43 @@ backward compatibility. They should not be used in new code. The first is replaced by pcre2_set_depth_limit(); the second is no longer needed and has no effect (it always returns zero).

    -
    PCRE2 8-BIT, 16-BIT, AND 32-BIT LIBRARIES
    +
    PCRE2 EXPERIMENTAL PATTERN CONVERSION FUNCTIONS
    +

    +pcre2_convert_context *pcre2_convert_context_create( + pcre2_general_context *gcontext); +
    +
    +pcre2_convert_context *pcre2_convert_context_copy( + pcre2_convert_context *cvcontext); +
    +
    +void pcre2_convert_context_free(pcre2_convert_context *cvcontext); +
    +
    +int pcre2_set_glob_escape(pcre2_convert_context *cvcontext, + uint32_t escape_char); +
    +
    +int pcre2_set_glob_separator(pcre2_convert_context *cvcontext, + uint32_t separator_char); +
    +
    +int pcre2_pattern_convert(PCRE2_SPTR pattern, PCRE2_SIZE length, + uint32_t options, PCRE2_UCHAR **buffer, + PCRE2_SIZE *blength, pcre2_convert_context *cvcontext); +
    +
    +void pcre2_converted_pattern_free(PCRE2_UCHAR *converted_pattern); +
    +
    +These functions provide a way of converting non-PCRE2 patterns into +patterns that can be processed by pcre2_compile(). This facility is +experimental and may be changed in future releases. At present, "globs" and +POSIX basic and extended patterns can be converted. Details are given in the +pcre2convert +documentation. +

    +
    PCRE2 8-BIT, 16-BIT, AND 32-BIT LIBRARIES

    There are three PCRE2 libraries, supporting 8-bit, 16-bit, and 32-bit code units, respectively. However, there is just one header file, pcre2.h. @@ -395,7 +432,7 @@ In the function summaries above, and in the rest of this document and other PCRE2 documents, functions and data types are described using their generic names, without the _8, _16, or _32 suffix.

    -
    PCRE2 API OVERVIEW
    +
    PCRE2 API OVERVIEW

    PCRE2 has its own native API, which is described in this document. There are also some wrapper functions for the 8-bit library that correspond to the @@ -503,7 +540,7 @@ Functions with names ending with _free() are used for freeing memory blocks of various sorts. In all cases, if one of these functions is called with a NULL argument, it does nothing.

    -
    STRING LENGTHS AND OFFSETS
    +
    STRING LENGTHS AND OFFSETS

    The PCRE2 API uses string lengths and offsets into strings of code units in several places. These values are always of type PCRE2_SIZE, which is an @@ -513,7 +550,7 @@ as a special indicator for zero-terminated strings and unset offsets. Therefore, the longest string that can be handled is one less than this maximum.

    -
    NEWLINES
    +
    NEWLINES

    PCRE2 supports five different conventions for indicating line breaks in strings: a single CR (carriage return) character, a single LF (linefeed) @@ -548,7 +585,7 @@ The choice of newline convention does not affect the interpretation of the \n or \r escape sequences, nor does it affect what \R matches; this has its own separate convention.

    -
    MULTITHREADING
    +
    MULTITHREADING

    In a multithreaded application it is important to keep thread-specific data separate from data that can be shared between threads. The PCRE2 library code @@ -628,7 +665,7 @@ match. This includes details of what was matched, as well as additional information such as the name of a (*MARK) setting. Each thread must provide its own copy of this memory.

    -
    PCRE2 CONTEXTS
    +
    PCRE2 CONTEXTS

    Some PCRE2 functions have a lot of parameters, many of which are used only by specialist applications, for example, those that use custom memory management @@ -1013,7 +1050,7 @@ where ddd is a decimal number. However, such a setting is ignored unless ddd is less than the limit set by the caller of pcre2_match() or pcre2_dfa_match() or, if no such limit is set, less than the default.

    -
    CHECKING BUILD-TIME OPTIONS
    +
    CHECKING BUILD-TIME OPTIONS

    int pcre2_config(uint32_t what, void *where);

    @@ -1150,7 +1187,7 @@ the PCRE2 version string, zero-terminated. The number of code units used is returned. This is the length of the string plus one unit for the terminating zero.

    -
    COMPILING A PATTERN
    +
    COMPILING A PATTERN

    pcre2_code *pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE length, uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset, @@ -1741,7 +1778,7 @@ dangerous option. Use with care. PCRE2_EXTRA_MATCH_LINE This option is provided for use by the -x option of pcre2grep. It -causes the pattern only to match complete lines. This is achieved by +causes the pattern only to match complete lines. This is achieved by automatically inserting the code for "^(?:" at the start of the compiled pattern and ")$" at the end. Thus, when PCRE2_MULTILINE is set, the matched line may be in the middle of the subject string. This option can be used with @@ -1756,7 +1793,7 @@ at the start of the compiled pattern and ")\b" at the end. The option may be used with PCRE2_LITERAL. However, it is ignored if PCRE2_EXTRA_MATCH_LINE is also set.

    -
    COMPILATION ERROR CODES
    +
    COMPILATION ERROR CODES

    There are nearly 100 positive error codes that pcre2_compile() may return (via errorcode) if it finds an error in the pattern. There are also some @@ -1769,7 +1806,7 @@ error message" below) can be called to obtain a textual error message from any error code.

    -
    JUST-IN-TIME (JIT) COMPILATION
    +
    JUST-IN-TIME (JIT) COMPILATION

    int pcre2_jit_compile(pcre2_code *code, uint32_t options);
    @@ -1807,7 +1844,7 @@ patterns to be analyzed, and for one-off matches and simple patterns the benefit of faster execution might be offset by a much slower compilation time. Most (but not all) patterns can be optimized by the JIT compiler.

    -
    LOCALE SUPPORT
    +
    LOCALE SUPPORT

    PCRE2 handles caseless matching, and determines whether characters are letters, digits, or whatever, by reference to a set of tables, indexed by character code @@ -1863,7 +1900,7 @@ is saved with the compiled pattern, and the same tables are used by compilation and matching both happen in the same locale, but different patterns can be processed in different locales.

    -
    INFORMATION ABOUT A COMPILED PATTERN
    +
    INFORMATION ABOUT A COMPILED PATTERN

    int pcre2_pattern_info(const pcre2 *code, uint32_t what, void *where);

    @@ -2188,7 +2225,7 @@ value returned by this option, because there are cases where the code that calculates the size has to over-estimate. Processing a pattern with the JIT compiler does not alter the value returned by this option.

    -
    INFORMATION ABOUT A PATTERN'S CALLOUTS
    +
    INFORMATION ABOUT A PATTERN'S CALLOUTS

    int pcre2_callout_enumerate(const pcre2_code *code, int (*callback)(pcre2_callout_enumerate_block *, void *), @@ -2207,7 +2244,7 @@ contents of the callout enumeration block are described in the pcre2callout documentation, which also gives further details about callouts.

    -
    SERIALIZATION AND PRECOMPILING
    +
    SERIALIZATION AND PRECOMPILING

    It is possible to save compiled patterns on disc or elsewhere, and reload them later, subject to a number of restrictions. The functions whose names begin @@ -2216,7 +2253,7 @@ the pcre2serialize documentation.

    -
    THE MATCH DATA BLOCK
    +
    THE MATCH DATA BLOCK

    pcre2_match_data *pcre2_match_data_create(uint32_t ovecsize, pcre2_general_context *gcontext); @@ -2287,7 +2324,7 @@ match data block (for that match) have taken place. When a match data block itself is no longer needed, it should be freed by calling pcre2_match_data_free().

    -
    MATCHING A PATTERN: THE TRADITIONAL FUNCTION
    +
    MATCHING A PATTERN: THE TRADITIONAL FUNCTION

    int pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, PCRE2_SIZE startoffset, @@ -2525,7 +2562,7 @@ examples, in the pcre2partial documentation.

    -
    NEWLINE HANDLING WHEN MATCHING
    +
    NEWLINE HANDLING WHEN MATCHING

    When PCRE2 is built, a default newline convention is set; this is usually the standard convention for the operating system. The default can be overridden in @@ -2565,7 +2602,7 @@ does \s, even though it includes CR and LF in the characters that it matches. Notwithstanding the above, anomalous effects may still occur when CRLF is a valid newline sequence and explicit \r or \n escapes appear in the pattern.

    -
    HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS
    +
    HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS

    uint32_t pcre2_get_ovector_count(pcre2_match_data *match_data);
    @@ -2664,7 +2701,7 @@ parentheses, no more than ovector[0] to ovector[2n+1] are set by pcre2_match(). The other elements retain whatever values they previously had.

    -
    OTHER INFORMATION ABOUT A MATCH
    +
    OTHER INFORMATION ABOUT A MATCH

    PCRE2_SPTR pcre2_get_mark(pcre2_match_data *match_data);
    @@ -2714,7 +2751,7 @@ the code unit offset of the invalid UTF character. Details are given in the pcre2unicode page.

    -
    ERROR RETURNS FROM pcre2_match()
    +
    ERROR RETURNS FROM pcre2_match()

    If pcre2_match() fails, it returns a negative number. This can be converted to a text string by calling the pcre2_get_error_message() @@ -2820,7 +2857,7 @@ faulted at compile time, but more complicated cases, in particular mutual recursions between two different subpatterns, cannot be detected until matching is attempted.

    -
    OBTAINING A TEXTUAL ERROR MESSAGE
    +
    OBTAINING A TEXTUAL ERROR MESSAGE

    int pcre2_get_error_message(int errorcode, PCRE2_UCHAR *buffer, PCRE2_SIZE bufflen); @@ -2841,7 +2878,7 @@ returned. If the buffer is too small, the message is truncated (but still with a trailing zero), and the negative error code PCRE2_ERROR_NOMEMORY is returned. None of the messages are very long; a buffer size of 120 code units is ample.

    -
    EXTRACTING CAPTURED SUBSTRINGS BY NUMBER
    +
    EXTRACTING CAPTURED SUBSTRINGS BY NUMBER

    int pcre2_substring_length_bynumber(pcre2_match_data *match_data, uint32_t number, PCRE2_SIZE *length); @@ -2938,7 +2975,7 @@ The substring did not participate in the match. For example, if the pattern is (abc)|(def) and the subject is "def", and the ovector contains at least two capturing slots, substring number 1 is unset.

    -
    EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS
    +
    EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS

    int pcre2_substring_list_get(pcre2_match_data *match_data, " PCRE2_UCHAR ***listptr, PCRE2_SIZE **lengthsptr); @@ -2977,7 +3014,7 @@ can be distinguished from a genuine zero-length substring by inspecting the appropriate offset in the ovector, which contain PCRE2_UNSET for unset substrings, or by calling pcre2_substring_length_bynumber().

    -
    EXTRACTING CAPTURED SUBSTRINGS BY NAME
    +
    EXTRACTING CAPTURED SUBSTRINGS BY NAME

    int pcre2_substring_number_from_name(const pcre2_code *code, PCRE2_SPTR name); @@ -3037,7 +3074,7 @@ names are not included in the compiled code. The matching process uses only numbers. For this reason, the use of different names for subpatterns of the same number causes an error at compile time.

    -
    CREATING A NEW STRING WITH SUBSTITUTIONS
    +
    CREATING A NEW STRING WITH SUBSTITUTIONS

    int pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, PCRE2_SIZE startoffset, @@ -3244,7 +3281,7 @@ obtained by calling the pcre2_get_error_message() function (see "Obtaining a textual error message" above).

    -
    DUPLICATE SUBPATTERN NAMES
    +
    DUPLICATE SUBPATTERN NAMES

    int pcre2_substring_nametable_scan(const pcre2_code *code, PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last); @@ -3289,7 +3326,7 @@ in the section entitled Information about a pattern. Given all the relevant entries for the name, you can extract each of their numbers, and hence the captured data.

    -
    FINDING ALL POSSIBLE MATCHES AT ONE POSITION
    +
    FINDING ALL POSSIBLE MATCHES AT ONE POSITION

    The traditional matching function uses a similar algorithm to Perl, which stops when it finds the first match at a given point in the subject. If you want to @@ -3307,7 +3344,7 @@ substring. Then return 1, which forces pcre2_match() to backtrack and try other alternatives. Ultimately, when it runs out of matches, pcre2_match() will yield PCRE2_ERROR_NOMATCH.

    -
    MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
    +
    MATCHING A PATTERN: THE ALTERNATIVE FUNCTION

    int pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, PCRE2_SIZE startoffset, @@ -3503,13 +3540,13 @@ some plausibility checks are made on the contents of the workspace, which should contain data about the previous partial match. If any of these checks fail, this error is given.

    -
    SEE ALSO
    +
    SEE ALSO

    pcre2build(3), pcre2callout(3), pcre2demo(3), pcre2matching(3), pcre2partial(3), pcre2posix(3), pcre2sample(3), pcre2unicode(3).

    -
    AUTHOR
    +
    AUTHOR

    Philip Hazel
    @@ -3518,9 +3555,9 @@ University Computing Service Cambridge, England.

    -
    REVISION
    +
    REVISION

    -Last updated: 16 June 2017 +Last updated: 10 July 2017
    Copyright © 1997-2017 University of Cambridge.
    diff --git a/doc/html/pcre2convert.html b/doc/html/pcre2convert.html new file mode 100644 index 0000000..8b4d87f --- /dev/null +++ b/doc/html/pcre2convert.html @@ -0,0 +1,190 @@ + + +pcre2convert specification + + +

    pcre2convert man page

    +

    +Return to the PCRE2 index page. +

    +

    +This page is part of the PCRE2 HTML documentation. It was generated +automatically from the original man page. If there is any nonsense in it, +please consult the man page, in case the conversion went wrong. +
    +

    +
    EXPERIMENTAL PATTERN CONVERSION FUNCTIONS
    +

    +This document describes a set of functions that can be used to convert +"foreign" patterns into PCRE2 regular expressions. This facility is currently +experimental, and may be changed in future releases. Two kinds of pattern, +globs and POSIX patterns, are supported. +

    +
    THE CONVERT CONTEXT
    +

    +pcre2_convert_context *pcre2_convert_context_create( + pcre2_general_context *gcontext); +
    +
    +pcre2_convert_context *pcre2_convert_context_copy( + pcre2_convert_context *cvcontext); +
    +
    +void pcre2_convert_context_free(pcre2_convert_context *cvcontext); +
    +
    +int pcre2_set_glob_escape(pcre2_convert_context *cvcontext, + uint32_t escape_char); +
    +
    +int pcre2_set_glob_separator(pcre2_convert_context *cvcontext, + uint32_t separator_char); +
    +
    +A convert context is used to hold parameters that affect the way that pattern +conversion works. As with all PCRE2 contexts, you need to use a context only if +you want to override the defaults. There are the usual create, copy, and free +functions. If custom memory management functions are set in a general context +that is passed to pcre2_convert_context_create(), they are used for all +memory management within the conversion functions. +

    +

    +There are only two parameters in the convert context at present. Both apply +only to glob conversions. The escape character defaults to grave accent under +Windows, otherwise backslash. It can be set to zero, meaning no escape +character, or to any punctuation character with a code point less than 256. +The separator character defaults to backslash under Windows, otherwise forward +slash. It can be set to forward slash, backslash, or dot. +

    +

    +The two setting functions return zero on success, or PCRE2_ERROR_BADDATA if +their second argument is invalid. +

    +
    THE CONVERSION FUNCTION
    +

    +int pcre2_pattern_convert(PCRE2_SPTR pattern, PCRE2_SIZE length, + uint32_t options, PCRE2_UCHAR **buffer, + PCRE2_SIZE *blength, pcre2_convert_context *cvcontext); +
    +
    +void pcre2_converted_pattern_free(PCRE2_UCHAR *converted_pattern); +
    +
    +The first two arguments of pcre2_pattern_convert() define the foreign +pattern that is to be converted. The length may be given as +PCRE2_ZERO_TERMINATED. The options argument defines how the pattern is to +be processed. If the input is UTF, the PCRE2_CONVERT_UTF option should be set. +PCRE2_CONVERT_NO_UTF_CHECK may also be set if you are sure the input is valid. +One or more of the glob options, or one of the following POSIX options must be +set to define the type of conversion that is required: +

    +  PCRE2_CONVERT_GLOB
    +  PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR
    +  PCRE2_CONVERT_GLOB_NO_STARSTAR
    +  PCRE2_CONVERT_POSIX_BASIC
    +  PCRE2_CONVERT_POSIX_EXTENDED
    +
    +Details of the conversions are given below. The buffer and blength +arguments define how the output is handled: +

    +

    +If buffer is NULL, the function just returns the length of the converted +pattern via blength. This is one less than the length of buffer needed, +because a terminating zero is always added to the output. +

    +

    +If buffer points to a NULL pointer, an output buffer is obtained using +the allocator in the context or malloc() if no context is supplied. A +pointer to this buffer is placed in the variable to which buffer points. +When no longer needed the output buffer must be freed by calling +pcre2_converted_pattern_free(). +

    +

    +If buffer points to a non-NULL pointer, blength must be set to the +actual length of the buffer provided (in code units). +

    +

    +In all cases, after successful conversion, the variable pointed to by +blength is updated to the length actually used (in code units), excluding +the terminating zero that is always added. +

    +

    +If an error occurs, the length (via blength) is set to the offset +within the input pattern where the error was detected. Only gross syntax errors +are caught; there are plenty of errors that will get passed on for +pcre2_compile() to discover. +

    +

    +The return from pcre2_pattern_convert() is zero on success or a non-zero +PCRE2 error code. Note that PCRE2 error codes may be positive or negative: +pcre2_compile() uses mostly positive codes and pcre2_match() +negative ones; pcre2_pattern_convert() uses existing codes of both kinds. A +textual error message can be obtained by calling +pcre2_get_error_message(). +

    +
    CONVERTING GLOBS
    +

    +Globs are used to match file names, and consequently have the concept of a +"path separator", which defaults to backslash under Windows and forward slash +otherwise. If PCRE2_CONVERT_GLOB is set, the wildcards * and ? are not +permitted to match separator characters, but the double-star (**) feature +(which does match separators) is supported. +

    +

    +PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR matches globs with wildcards allowed to +match separator characters. PCRE2_CONVERT_GLOB_NO_STARSTAR matches globs with the +double-star feature disabled. These options may be given together. +

    +
    CONVERTING POSIX PATTERNS
    +

    +POSIX defines two kinds of regular expression pattern: basic and extended. +These can be processed by setting PCRE2_CONVERT_POSIX_BASIC or +PCRE2_CONVERT_POSIX_EXTENDED, respectively. +

    +

    +In POSIX patterns, backslash is not special in a character class. Unmatched +closing parentheses are treated as literals. +

    +

    +In basic patterns, ? + | {} and () must be escaped to be recognized +as metacharacters outside a character class. If the first character in the +pattern is * it is treated as a literal. ^ is a metacharacter only at the start +of a branch. +

    +

    +In extended patterns, a backslash not in a character class always +makes the next character literal, whatever it is. There are no backreferences. +

    +

    +Note: POSIX mandates that the longest possible match at the first matching +position must be found. This is not what pcre2_match() does; it yields +the first match that is found. An application can use pcre2_dfa_match() +to find the longest match, but that does not support backreferences (but then +neither do POSIX extended patterns). +

    +
    AUTHOR
    +

    +Philip Hazel +
    +University Computing Service +
    +Cambridge, England. +
    +

    +
    REVISION
    +

    +Last updated: 12 July 2017 +
    +Copyright © 1997-2017 University of Cambridge. +
    +

    +Return to the PCRE2 index page. +

    diff --git a/doc/html/pcre2test.html b/doc/html/pcre2test.html index aaf8336..12ff36b 100644 --- a/doc/html/pcre2test.html +++ b/doc/html/pcre2test.html @@ -630,6 +630,10 @@ heavily used in the test files. bsr=[anycrlf|unicode] specify \R handling /B bincode show binary code without lengths callout_info show callout information + convert=<options> request foreign pattern conversion + convert_glob_escape=c set glob escape character + convert_glob_separator=c set glob separator character + convert_length set convert buffer length debug same as info,fullbincode framesize show matching frame size fullbincode show binary code with lengths @@ -1065,6 +1069,41 @@ are ignored (for the stacked copy), with a warning message, except for replace, which causes an error. Note that jitverify, which is allowed, does not carry through to any subsequent matching that uses a stacked pattern. +

    +
    +Testing foreign pattern conversion +
    +

    +The experimental foreign pattern conversion functions in PCRE2 can be tested by +setting the convert modifier. Its argument is a colon-separated list of +options, which set the equivalent option for the pcre2_pattern_convert() +function: +

    +  glob                    PCRE2_CONVERT_GLOB
    +  glob_no_starstar        PCRE2_CONVERT_GLOB_NO_STARSTAR
    +  glob_no_wild_separator  PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR
    +  posix_basic             PCRE2_CONVERT_POSIX_BASIC
    +  posix_extended          PCRE2_CONVERT_POSIX_EXTENDED
    +  unset                   Unset all options
    +
    +The "unset" value is useful for turning off a default that has been set by a +#pattern command. When one of these options is set, the input pattern is +passed to pcre2_pattern_convert(). If the conversion is successful, the +result is reflected in the output and then passed to pcre2_compile(). The +normal utf and no_utf_check options, if set, cause the +PCRE2_CONVERT_UTF and PCRE2_CONVERT_NO_UTF_CHECK options to be passed to +pcre2_pattern_convert(). +

    +

    +By default, the conversion function is allowed to allocate a buffer for its +output. However, if the convert_length modifier is set to a value greater +than zero, pcre2test passes a buffer of the given length. This makes it +possible to test the length check. +

    +

    +The convert_glob_escape and convert_glob_separator modifiers can be +used to specify the escape and separator characters for glob processing, +overriding the defaults, which are operating-system dependent.


    SUBJECT MODIFIERS

    @@ -1866,7 +1905,7 @@ Cambridge, England.


    REVISION

    -Last updated: 02 July 2017 +Last updated: 12 July 2017
    Copyright © 1997-2017 University of Cambridge.
    diff --git a/doc/index.html.src b/doc/index.html.src index 2a373f5..b9393d9 100644 --- a/doc/index.html.src +++ b/doc/index.html.src @@ -35,6 +35,9 @@ first. pcre2compat   Compability with Perl +pcre2convert +   Experimental foreign pattern conversion functions + pcre2demo   A demonstration C program that uses the PCRE2 library @@ -112,6 +115,18 @@ in the library. pcre2_config   Show build-time configuration options +pcre2_convert_context_copy +   Copy a convert context + +pcre2_convert_context_create +   Create a convert context + +pcre2_convert_context_free +   Free a convert context + +pcre2_converted_pattern_free +   Free converted foreign pattern + pcre2_dfa_match   Match a compiled pattern to a subject string (DFA algorithm; not Perl compatible) @@ -183,6 +198,9 @@ in the library. pcre2_match_data_free   Free a match data block +pcre2_pattern_convert +   Experimental foreign pattern converter + pcre2_pattern_info   Extract information about a pattern @@ -216,6 +234,12 @@ in the library. pcre2_set_depth_limit   Set the match backtracking depth limit +pcre2_set_glob_escape +   Set glob escape character + +pcre2_set_glob_separator +   Set glob separator character + pcre2_set_heap_limit   Set the match backtracking heap limit diff --git a/doc/pcre2.txt b/doc/pcre2.txt index 1f7be3d..186cbc7 100644 --- a/doc/pcre2.txt +++ b/doc/pcre2.txt @@ -413,6 +413,35 @@ PCRE2 NATIVE API OBSOLETE FUNCTIONS needed and has no effect (it always returns zero). 
+PCRE2 EXPERIMENTAL PATTERN CONVERSION FUNCTIONS + + pcre2_convert_context *pcre2_convert_context_create( + pcre2_general_context *gcontext); + + pcre2_convert_context *pcre2_convert_context_copy( + pcre2_convert_context *cvcontext); + + void pcre2_convert_context_free(pcre2_convert_context *cvcontext); + + int pcre2_set_glob_escape(pcre2_convert_context *cvcontext, + uint32_t escape_char); + + int pcre2_set_glob_separator(pcre2_convert_context *cvcontext, + uint32_t separator_char); + + int pcre2_pattern_convert(PCRE2_SPTR pattern, PCRE2_SIZE length, + uint32_t options, PCRE2_UCHAR **buffer, + PCRE2_SIZE *blength, pcre2_convert_context *cvcontext); + + void pcre2_converted_pattern_free(PCRE2_UCHAR *converted_pattern); + + These functions provide a way of converting non-PCRE2 patterns into + patterns that can be processed by pcre2_compile(). This facility is + experimental and may be changed in future releases. At present, "globs" + and POSIX basic and extended patterns can be converted. Details are + given in the pcre2convert documentation. + + PCRE2 8-BIT, 16-BIT, AND 32-BIT LIBRARIES There are three PCRE2 libraries, supporting 8-bit, 16-bit, and 32-bit @@ -3400,7 +3429,7 @@ AUTHOR REVISION - Last updated: 16 June 2017 + Last updated: 10 July 2017 Copyright (c) 1997-2017 University of Cambridge. ------------------------------------------------------------------------------ diff --git a/doc/pcre2_convert_context_copy.3 b/doc/pcre2_convert_context_copy.3 new file mode 100644 index 0000000..827c3e9 --- /dev/null +++ b/doc/pcre2_convert_context_copy.3 @@ -0,0 +1,26 @@ +.TH PCRE2_CONVERT_CONTEXT_COPY 3 "10 July 2017" "PCRE2 10.30" +.SH NAME +PCRE2 - Perl-compatible regular expressions (revised API) +.SH SYNOPSIS +.rs +.sp +.B #include +.PP +.nf +.B pcre2_convert_context *pcre2_convert_context_copy( +.B " pcre2_convert_context *\fIcvcontext\fP);" +.fi +. +.SH DESCRIPTION +.rs +.sp +This function is part of an experimental set of pattern conversion functions. 
+It makes a new copy of a convert context, using the memory allocation function +that was used for the original context. The result is NULL if the memory cannot +be obtained. +.P +The pattern conversion functions are described in the +.\" HREF +\fBpcre2convert\fP +.\" +documentation. diff --git a/doc/pcre2_convert_context_create.3 b/doc/pcre2_convert_context_create.3 new file mode 100644 index 0000000..91c17fb --- /dev/null +++ b/doc/pcre2_convert_context_create.3 @@ -0,0 +1,27 @@ +.TH PCRE2_CONVERT_CONTEXT_CREATE 3 "10 July 2017" "PCRE2 10.30" +.SH NAME +PCRE2 - Perl-compatible regular expressions (revised API) +.SH SYNOPSIS +.rs +.sp +.B #include +.PP +.nf +.B pcre2_convert_context *pcre2_convert_context_create( +.B " pcre2_general_context *\fIgcontext\fP);" +.fi +. +.SH DESCRIPTION +.rs +.sp +This function is part of an experimental set of pattern conversion functions. +It creates and initializes a new convert context. If its argument is +NULL, \fBmalloc()\fP is used to get the necessary memory; otherwise the memory +allocation function within the general context is used. The result is NULL if +the memory could not be obtained. +.P +The pattern conversion functions are described in the +.\" HREF +\fBpcre2convert\fP +.\" +documentation. diff --git a/doc/pcre2_convert_context_free.3 b/doc/pcre2_convert_context_free.3 new file mode 100644 index 0000000..fd5b13c --- /dev/null +++ b/doc/pcre2_convert_context_free.3 @@ -0,0 +1,25 @@ +.TH PCRE2_CONVERT_CONTEXT_FREE 3 "10 July 2017" "PCRE2 10.30" +.SH NAME +PCRE2 - Perl-compatible regular expressions (revised API) +.SH SYNOPSIS +.rs +.sp +.B #include +.PP +.nf +.B void pcre2_convert_context_free(pcre2_convert_context *\fIcvcontext\fP); +.fi +. +.SH DESCRIPTION +.rs +.sp +This function is part of an experimental set of pattern conversion functions. 
+It frees the memory occupied by a convert context, using the memory +freeing function from the general context with which it was created, or +\fBfree()\fP if that was not set. +.P +The pattern conversion functions are described in the +.\" HREF +\fBpcre2convert\fP +.\" +documentation. diff --git a/doc/pcre2_converted_pattern_free.3 b/doc/pcre2_converted_pattern_free.3 new file mode 100644 index 0000000..1f4c8e6 --- /dev/null +++ b/doc/pcre2_converted_pattern_free.3 @@ -0,0 +1,25 @@ +.TH PCRE2_CONVERTED_PATTERN_FREE 3 "11 July 2017" "PCRE2 10.30" +.SH NAME +PCRE2 - Perl-compatible regular expressions (revised API) +.SH SYNOPSIS +.rs +.sp +.B #include +.PP +.nf +.B void pcre2_converted_pattern_free(PCRE2_UCHAR *\fIconverted_pattern\fP); +.fi +. +.SH DESCRIPTION +.rs +.sp +This function is part of an experimental set of pattern conversion functions. +It frees the memory occupied by a converted pattern that was obtained by +calling \fBpcre2_pattern_convert()\fP with arguments that caused it to place +the converted pattern into newly obtained heap memory. +.P +The pattern conversion functions are described in the +.\" HREF +\fBpcre2convert\fP +.\" +documentation. diff --git a/doc/pcre2_pattern_convert.3 b/doc/pcre2_pattern_convert.3 new file mode 100644 index 0000000..b72acb7 --- /dev/null +++ b/doc/pcre2_pattern_convert.3 @@ -0,0 +1,55 @@ +.TH PCRE2_PATTERN_CONVERT 3 "11 July 2017" "PCRE2 10.30" +.SH NAME +PCRE2 - Perl-compatible regular expressions (revised API) +.SH SYNOPSIS +.rs +.sp +.B #include +.PP +.nf +.B int pcre2_pattern_convert(PCRE2_SPTR \fIpattern\fP, PCRE2_SIZE \fIlength\fP, +.B " uint32_t \fIoptions\fP, PCRE2_UCHAR **\fIbuffer\fP," +.B " PCRE2_SIZE *\fIblength\fP, pcre2_convert_context *\fIcvcontext\fP);" +.fi +. +.SH DESCRIPTION +.rs +.sp +This function is part of an experimental set of pattern conversion functions. +It converts a foreign pattern (for example, a glob) into a PCRE2 regular +expression pattern. 
Its arguments are: +.sp + \fIpattern\fP The foreign pattern + \fIlength\fP The length of the input pattern or PCRE2_ZERO_TERMINATED + \fIoptions\fP Option bits + \fIbuffer\fP Pointer to pointer to output buffer, or NULL + \fIblength\fP Pointer to output length field + \fIcvcontext\fP Pointer to a convert context or NULL +.sp +The length of the converted pattern (excluding the terminating zero) is +returned via \fIblength\fP. If \fIbuffer\fP is NULL, the function just returns +the output length. If \fIbuffer\fP points to a NULL pointer, heap memory is +obtained for the converted pattern, using the allocator in the context if +present (or else \fBmalloc()\fP), and the field pointed to by \fIbuffer\fP is +updated. If \fIbuffer\fP points to a non-NULL field, that must point to a +buffer whose size is in the variable pointed to by \fIblength\fP. This value is +updated. +.P +The option bits are: +.sp + PCRE2_CONVERT_UTF Input is UTF + PCRE2_CONVERT_NO_UTF_CHECK Do not check UTF validity + PCRE2_CONVERT_POSIX_BASIC Convert POSIX basic pattern + PCRE2_CONVERT_POSIX_EXTENDED Convert POSIX extended pattern + PCRE2_CONVERT_GLOB ) Convert + PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR ) various types + PCRE2_CONVERT_GLOB_NO_STARSTAR ) of glob +.sp +The return value from \fBpcre2_pattern_convert()\fP is zero on success or a +non-zero PCRE2 error code. +.P +The pattern conversion functions are described in the +.\" HREF +\fBpcre2convert\fP +.\" +documentation. diff --git a/doc/pcre2_set_glob_escape.3 b/doc/pcre2_set_glob_escape.3 new file mode 100644 index 0000000..d5637af --- /dev/null +++ b/doc/pcre2_set_glob_escape.3 @@ -0,0 +1,29 @@ +.TH PCRE2_SET_GLOB_ESCAPE 3 "11 July 2017" "PCRE2 10.30" +.SH NAME +PCRE2 - Perl-compatible regular expressions (revised API) +.SH SYNOPSIS +.rs +.sp +.B #include +.PP +.nf +.B int pcre2_set_glob_escape(pcre2_convert_context *\fIcvcontext\fP, +.B " uint32_t \fIescape_char\fP);" +.fi +. 
+.SH DESCRIPTION +.rs +.sp +This function is part of an experimental set of pattern conversion functions. +It sets the escape character that is used when converting globs. The second +argument must either be zero (meaning there is no escape character) or a +punctuation character whose code point is less than 256. The default is grave +accent if running under Windows, otherwise backslash. The result of the +function is zero for success or PCRE2_ERROR_BADDATA if the second argument is +invalid. +.P +The pattern conversion functions are described in the +.\" HREF +\fBpcre2convert\fP +.\" +documentation. diff --git a/doc/pcre2_set_glob_separator.3 b/doc/pcre2_set_glob_separator.3 new file mode 100644 index 0000000..273b515 --- /dev/null +++ b/doc/pcre2_set_glob_separator.3 @@ -0,0 +1,28 @@ +.TH PCRE2_SET_GLOB_SEPARATOR 3 "11 July 2017" "PCRE2 10.30" +.SH NAME +PCRE2 - Perl-compatible regular expressions (revised API) +.SH SYNOPSIS +.rs +.sp +.B #include +.PP +.nf +.B int pcre2_set_glob_separator(pcre2_convert_context *\fIcvcontext\fP, +.B " uint32_t \fIseparator_char\fP);" +.fi +. +.SH DESCRIPTION +.rs +.sp +This function is part of an experimental set of pattern conversion functions. +It sets the component separator character that is used when converting globs. +The second argument must be one of the characters forward slash, backslash, or +dot. The default is backslash when running under Windows, otherwise forward +slash. The result of the function is zero for success or PCRE2_ERROR_BADDATA if +the second argument is invalid. +.P +The pattern conversion functions are described in the +.\" HREF +\fBpcre2convert\fP +.\" +documentation.
diff --git a/doc/pcre2api.3 b/doc/pcre2api.3 index fe589fb..f80ae58 100644 --- a/doc/pcre2api.3 +++ b/doc/pcre2api.3 @@ -1,4 +1,4 @@ -.TH PCRE2API 3 "16 June 2017" "PCRE2 10.30" +.TH PCRE2API 3 "10 July 2017" "PCRE2 10.30" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .sp @@ -272,6 +272,41 @@ replaced by \fBpcre2_set_depth_limit()\fP; the second is no longer needed and has no effect (it always returns zero). . . +.SH "PCRE2 EXPERIMENTAL PATTERN CONVERSION FUNCTIONS" +.rs +.sp +.nf +.B pcre2_convert_context *pcre2_convert_context_create( +.B " pcre2_general_context *\fIgcontext\fP);" +.sp +.B pcre2_convert_context *pcre2_convert_context_copy( +.B " pcre2_convert_context *\fIcvcontext\fP);" +.sp +.B void pcre2_convert_context_free(pcre2_convert_context *\fIcvcontext\fP); +.sp +.B int pcre2_set_glob_escape(pcre2_convert_context *\fIcvcontext\fP, +.B " uint32_t \fIescape_char\fP);" +.sp +.B int pcre2_set_glob_separator(pcre2_convert_context *\fIcvcontext\fP, +.B " uint32_t \fIseparator_char\fP);" +.sp +.B int pcre2_pattern_convert(PCRE2_SPTR \fIpattern\fP, PCRE2_SIZE \fIlength\fP, +.B " uint32_t \fIoptions\fP, PCRE2_UCHAR **\fIbuffer\fP," +.B " PCRE2_SIZE *\fIblength\fP, pcre2_convert_context *\fIcvcontext\fP);" +.sp +.B void pcre2_converted_pattern_free(PCRE2_UCHAR *\fIconverted_pattern\fP); +.fi +.sp +These functions provide a way of converting non-PCRE2 patterns into +patterns that can be processed by \fBpcre2_compile()\fP. This facility is +experimental and may be changed in future releases. At present, "globs" and +POSIX basic and extended patterns can be converted. Details are given in the +.\" HREF +\fBpcre2convert\fP +.\" +documentation. +. +. .SH "PCRE2 8-BIT, 16-BIT, AND 32-BIT LIBRARIES" .rs .sp @@ -1695,7 +1730,7 @@ dangerous option. Use with care. PCRE2_EXTRA_MATCH_LINE .sp This option is provided for use by the \fB-x\fP option of \fBpcre2grep\fP. It -causes the pattern only to match complete lines. 
This is achieved by +causes the pattern only to match complete lines. This is achieved by automatically inserting the code for "^(?:" at the start of the compiled pattern and ")$" at the end. Thus, when PCRE2_MULTILINE is set, the matched line may be in the middle of the subject string. This option can be used with @@ -3539,6 +3574,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 16 June 2017 +Last updated: 10 July 2017 Copyright (c) 1997-2017 University of Cambridge. .fi diff --git a/doc/pcre2convert.3 b/doc/pcre2convert.3 new file mode 100644 index 0000000..3dadf6e --- /dev/null +++ b/doc/pcre2convert.3 @@ -0,0 +1,163 @@ +.TH PCRE2CONVERT 3 "12 July 2017" "PCRE2 10.30" +.SH NAME +PCRE2 - Perl-compatible regular expressions (revised API) +.SH "EXPERIMENTAL PATTERN CONVERSION FUNCTIONS" +.rs +.sp +This document describes a set of functions that can be used to convert +"foreign" patterns into PCRE2 regular expressions. This facility is currently +experimental, and may be changed in future releases. Two kinds of pattern, +globs and POSIX patterns, are supported. +. +. +.SH "THE CONVERT CONTEXT" +.rs +.sp +.nf +.B pcre2_convert_context *pcre2_convert_context_create( +.B " pcre2_general_context *\fIgcontext\fP);" +.sp +.B pcre2_convert_context *pcre2_convert_context_copy( +.B " pcre2_convert_context *\fIcvcontext\fP);" +.sp +.B void pcre2_convert_context_free(pcre2_convert_context *\fIcvcontext\fP); +.sp +.B int pcre2_set_glob_escape(pcre2_convert_context *\fIcvcontext\fP, +.B " uint32_t \fIescape_char\fP);" +.sp +.B int pcre2_set_glob_separator(pcre2_convert_context *\fIcvcontext\fP, +.B " uint32_t \fIseparator_char\fP);" +.fi +.sp +A convert context is used to hold parameters that affect the way that pattern +conversion works. Like all PCRE2 contexts, you need to use a context only if +you want to override the defaults. There are the usual create, copy, and free +functions. 
If custom memory management functions are set in a general context +that is passed to \fBpcre2_convert_context_create()\fP, they are used for all +memory management within the conversion functions. +.P +There are only two parameters in the convert context at present. Both apply +only to glob conversions. The escape character defaults to grave accent under +Windows, otherwise backslash. It can be set to zero, meaning no escape +character, or to any punctuation character with a code point less than 256. +The separator character defaults to backslash under Windows, otherwise forward +slash. It can be set to forward slash, backslash, or dot. +.P +The two setting functions return zero on success, or PCRE2_ERROR_BADDATA if +their second argument is invalid. +. +. +.SH "THE CONVERSION FUNCTION" +.rs +.sp +.nf +.B int pcre2_pattern_convert(PCRE2_SPTR \fIpattern\fP, PCRE2_SIZE \fIlength\fP, +.B " uint32_t \fIoptions\fP, PCRE2_UCHAR **\fIbuffer\fP," +.B " PCRE2_SIZE *\fIblength\fP, pcre2_convert_context *\fIcvcontext\fP);" +.sp +.B void pcre2_converted_pattern_free(PCRE2_UCHAR *\fIconverted_pattern\fP); +.fi +.sp +The first two arguments of \fBpcre2_pattern_convert()\fP define the foreign +pattern that is to be converted. The length may be given as +PCRE2_ZERO_TERMINATED. The \fBoptions\fP argument defines how the pattern is to +be processed. If the input is UTF, the PCRE2_CONVERT_UTF option should be set. +PCRE2_CONVERT_NO_UTF_CHECK may also be set if you are sure the input is valid. +One or more of the glob options, or one of the following POSIX options must be +set to define the type of conversion that is required: +.sp + PCRE2_CONVERT_GLOB + PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR + PCRE2_CONVERT_GLOB_NO_STARSTAR + PCRE2_CONVERT_POSIX_BASIC + PCRE2_CONVERT_POSIX_EXTENDED +.sp +Details of the conversions are given below. 
The \fBbuffer\fP and \fBblength\fP +arguments define how the output is handled: +.P +If \fBbuffer\fP is NULL, the function just returns the length of the converted +pattern via \fBblength\fP. This is one less than the length of buffer needed, +because a terminating zero is always added to the output. +.P +If \fBbuffer\fP points to a NULL pointer, an output buffer is obtained using +the allocator in the context or \fBmalloc()\fP if no context is supplied. A +pointer to this buffer is placed in the variable to which \fBbuffer\fP points. +When no longer needed the output buffer must be freed by calling +\fBpcre2_converted_pattern_free()\fP. +.P +If \fBbuffer\fP points to a non-NULL pointer, \fBblength\fP must be set to the +actual length of the buffer provided (in code units). +.P +In all cases, after successful conversion, the variable pointed to by +\fBblength\fP is updated to the length actually used (in code units), excluding +the terminating zero that is always added. +.P +If an error occurs, the length (via \fBblength\fP) is set to the offset +within the input pattern where the error was detected. Only gross syntax errors +are caught; there are plenty of errors that will get passed on for +\fBpcre2_compile()\fP to discover. +.P +The return from \fBpcre2_pattern_convert()\fP is zero on success or a non-zero +PCRE2 error code. Note that PCRE2 error codes may be positive or negative: +\fBpcre2_compile()\fP uses mostly positive codes and \fBpcre2_match()\fP +negative ones; \fBpcre2_pattern_convert()\fP uses existing codes of both kinds. A +textual error message can be obtained by calling +\fBpcre2_get_error_message()\fP. +. +. +.SH "CONVERTING GLOBS" +.rs +.sp +Globs are used to match file names, and consequently have the concept of a +"path separator", which defaults to backslash under Windows and forward slash +otherwise. If PCRE2_CONVERT_GLOB is set, the wildcards * and ?
are not +permitted to match separator characters, but the double-star (**) feature +(which does match separators) is supported. +.P +PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR matches globs with wildcards allowed to +match separator characters. PCRE2_CONVERT_GLOB_NO_STARSTAR matches globs with the +double-star feature disabled. These options may be given together. +. +. +.SH "CONVERTING POSIX PATTERNS" +.rs +.sp +POSIX defines two kinds of regular expression pattern: basic and extended. +These can be processed by setting PCRE2_CONVERT_POSIX_BASIC or +PCRE2_CONVERT_POSIX_EXTENDED, respectively. +.P +In POSIX patterns, backslash is not special in a character class. Unmatched +closing parentheses are treated as literals. +.P +In basic patterns, ? + | {} and () must be escaped to be recognized +as metacharacters outside a character class. If the first character in the +pattern is * it is treated as a literal. ^ is a metacharacter only at the start +of a branch. +.P +In extended patterns, a backslash not in a character class always +makes the next character literal, whatever it is. There are no backreferences. +.P +Note: POSIX mandates that the longest possible match at the first matching +position must be found. This is not what \fBpcre2_match()\fP does; it yields +the first match that is found. An application can use \fBpcre2_dfa_match()\fP +to find the longest match, but that does not support backreferences (but then +neither do POSIX extended patterns). +. +. +.SH AUTHOR +.rs +.sp +.nf +Philip Hazel +University Computing Service +Cambridge, England. +.fi +. +. +.SH REVISION +.rs +.sp +.nf +Last updated: 12 July 2017 +Copyright (c) 1997-2017 University of Cambridge. +.fi diff --git a/doc/pcre2test.1 b/doc/pcre2test.1 index c13188c..5f6f9dc 100644 --- a/doc/pcre2test.1 +++ b/doc/pcre2test.1 @@ -1,4 +1,4 @@ -.TH PCRE2TEST 1 "02 July 2017" "PCRE 10.30" +.TH PCRE2TEST 1 "12 July 2017" "PCRE 10.30" .SH NAME pcre2test - a program for testing Perl-compatible regular expressions.
.SH SYNOPSIS @@ -592,6 +592,10 @@ heavily used in the test files. bsr=[anycrlf|unicode] specify \eR handling /B bincode show binary code without lengths callout_info show callout information + convert= request foreign pattern conversion + convert_glob_escape=c set glob escape character + convert_glob_separator=c set glob separator character + convert_length set convert buffer length debug same as info,fullbincode framesize show matching frame size fullbincode show binary code with lengths @@ -1035,6 +1039,39 @@ allowed, does not carry through to any subsequent matching that uses a stacked pattern. . . +.SS "Testing foreign pattern conversion" +.rs +.sp +The experimental foreign pattern conversion functions in PCRE2 can be tested by +setting the \fBconvert\fP modifier. Its argument is a colon-separated list of +options, which set the equivalent option for the \fBpcre2_pattern_convert()\fP +function: +.sp + glob PCRE2_CONVERT_GLOB + glob_no_starstar PCRE2_CONVERT_GLOB_NO_STARSTAR + glob_no_wild_separator PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR + posix_basic PCRE2_CONVERT_POSIX_BASIC + posix_extended PCRE2_CONVERT_POSIX_EXTENDED + unset Unset all options +.sp +The "unset" value is useful for turning off a default that has been set by a +\fB#pattern\fP command. When one of these options is set, the input pattern is +passed to \fBpcre2_pattern_convert()\fP. If the conversion is successful, the +result is reflected in the output and then passed to \fBpcre2_compile()\fP. The +normal \fButf\fP and \fBno_utf_check\fP options, if set, cause the +PCRE2_CONVERT_UTF and PCRE2_CONVERT_NO_UTF_CHECK options to be passed to +\fBpcre2_pattern_convert()\fP. +.P +By default, the conversion function is allowed to allocate a buffer for its +output. However, if the \fBconvert_length\fP modifier is set to a value greater +than zero, \fBpcre2test\fP passes a buffer of the given length. This makes it +possible to test the length check. 
+.P +The \fBconvert_glob_escape\fP and \fBconvert_glob_separator\fP modifiers can be +used to specify the escape and separator characters for glob processing, +overriding the defaults, which are operating-system dependent. +. +. .\" HTML .SH "SUBJECT MODIFIERS" .rs @@ -1850,6 +1887,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 02 July 2017 +Last updated: 12 July 2017 Copyright (c) 1997-2017 University of Cambridge. .fi diff --git a/doc/pcre2test.txt b/doc/pcre2test.txt index c1a8302..56577f8 100644 --- a/doc/pcre2test.txt +++ b/doc/pcre2test.txt @@ -570,6 +570,10 @@ PATTERN MODIFIERS bsr=[anycrlf|unicode] specify \R handling /B bincode show binary code without lengths callout_info show callout information + convert= request foreign pattern conversion + convert_glob_escape=c set glob escape character + convert_glob_separator=c set glob separator character + convert_length set convert buffer length debug same as info,fullbincode framesize show matching frame size fullbincode show binary code with lengths @@ -953,6 +957,37 @@ PATTERN MODIFIERS that jitverify, which is allowed, does not carry through to any subse- quent matching that uses a stacked pattern. + Testing foreign pattern conversion + + The experimental foreign pattern conversion functions in PCRE2 can be + tested by setting the convert modifier. Its argument is a colon-sepa- + rated list of options, which set the equivalent option for the + pcre2_pattern_convert() function: + + glob PCRE2_CONVERT_GLOB + glob_no_starstar PCRE2_CONVERT_GLOB_NO_STARSTAR + glob_no_wild_separator PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR + posix_basic PCRE2_CONVERT_POSIX_BASIC + posix_extended PCRE2_CONVERT_POSIX_EXTENDED + unset Unset all options + + The "unset" value is useful for turning off a default that has been set + by a #pattern command. When one of these options is set, the input pat- + tern is passed to pcre2_pattern_convert(). 
If the conversion is suc- + cessful, the result is reflected in the output and then passed to + pcre2_compile(). The normal utf and no_utf_check options, if set, cause + the PCRE2_CONVERT_UTF and PCRE2_CONVERT_NO_UTF_CHECK options to be + passed to pcre2_pattern_convert(). + + By default, the conversion function is allowed to allocate a buffer for + its output. However, if the convert_length modifier is set to a value + greater than zero, pcre2test passes a buffer of the given length. This + makes it possible to test the length check. + + The convert_glob_escape and convert_glob_separator modifiers can be + used to specify the escape and separator characters for glob process- + ing, overriding the defaults, which are operating-system dependent. + SUBJECT MODIFIERS @@ -1692,5 +1727,5 @@ AUTHOR REVISION - Last updated: 02 July 2017 + Last updated: 12 July 2017 Copyright (c) 1997-2017 University of Cambridge. diff --git a/src/pcre2.h b/src/pcre2.h index bab45b2..36d357a 100644 --- a/src/pcre2.h +++ b/src/pcre2.h @@ -193,8 +193,6 @@ ignored for pcre2_jit_match(). */ #define PCRE2_CONVERT_GLOB 0x00000010u #define PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR 0x00000030u #define PCRE2_CONVERT_GLOB_NO_STARSTAR 0x00000050u -#define PCRE2_CONVERT_GLOB_BASIC 0x00000070u -#define PCRE2_CONVERT_GLOB_IGNORE_DOT_START 0x00000080u /* Newline and \R settings, for use in compile contexts. The newline values must be kept in step with values set in config.h and both sets must all be diff --git a/src/pcre2.h.in b/src/pcre2.h.in index a110638..399ddcd 100644 --- a/src/pcre2.h.in +++ b/src/pcre2.h.in @@ -193,8 +193,6 @@ ignored for pcre2_jit_match(). */ #define PCRE2_CONVERT_GLOB 0x00000010u #define PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR 0x00000030u #define PCRE2_CONVERT_GLOB_NO_STARSTAR 0x00000050u -#define PCRE2_CONVERT_GLOB_BASIC 0x00000070u -#define PCRE2_CONVERT_GLOB_IGNORE_DOT_START 0x00000080u /* Newline and \R settings, for use in compile contexts. 
The newline values must be kept in step with values set in config.h and both sets must all be diff --git a/src/pcre2_convert.c b/src/pcre2_convert.c index 00d713a..0e29aca 100644 --- a/src/pcre2_convert.c +++ b/src/pcre2_convert.c @@ -49,7 +49,6 @@ POSSIBILITY OF SUCH DAMAGE. PCRE2_CONVERT_POSIX_BASIC|PCRE2_CONVERT_POSIX_EXTENDED) #define ALL_OPTIONS (PCRE2_CONVERT_UTF|PCRE2_CONVERT_NO_UTF_CHECK| \ - PCRE2_CONVERT_GLOB_IGNORE_DOT_START| \ PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR| \ PCRE2_CONVERT_GLOB_NO_STARSTAR| \ TYPE_OPTIONS) diff --git a/src/pcre2test.c b/src/pcre2test.c index e825947..3ce261c 100644 --- a/src/pcre2test.c +++ b/src/pcre2test.c @@ -401,8 +401,6 @@ typedef struct convertstruct { static convertstruct convertlist[] = { { "glob", PCRE2_CONVERT_GLOB }, - { "glob_basic", PCRE2_CONVERT_GLOB_BASIC }, - { "glob_ignore_dot_start", PCRE2_CONVERT_GLOB_IGNORE_DOT_START }, { "glob_no_starstar", PCRE2_CONVERT_GLOB_NO_STARSTAR }, { "glob_no_wild_separator", PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR }, { "posix_basic", PCRE2_CONVERT_POSIX_BASIC }, diff --git a/testdata/testinput25 b/testdata/testinput25 index 021cb16..f21d9ad 100644 --- a/testdata/testinput25 +++ b/testdata/testinput25 @@ -8,7 +8,7 @@ # Set the glob separator explicitly so that different OS defaults are not a # problem. Then test various errors. -#pattern convert=glob_basic,convert_glob_escape=\,convert_glob_separator=/ +#pattern convert=glob,convert_glob_escape=\,convert_glob_separator=/ # The fact that this one works in 13 bytes in the 8-bit library shows that the # output is in UTF-8, though pcre2test shows the character as an escape. diff --git a/testdata/testoutput25 b/testdata/testoutput25 index 80abbe5..4990293 100644 --- a/testdata/testoutput25 +++ b/testdata/testoutput25 @@ -8,7 +8,7 @@ # Set the glob separator explicitly so that different OS defaults are not a # problem. Then test various errors. 
-#pattern convert=glob_basic,convert_glob_escape=\,convert_glob_separator=/ +#pattern convert=glob,convert_glob_escape=\,convert_glob_separator=/ # The fact that this one works in 13 bytes in the 8-bit library shows that the # output is in UTF-8, though pcre2test shows the character as an escape.