From 05a8186117530323b2b8aaeedd190a31ae0e109f Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Sat, 22 Apr 2017 14:35:14 +0000 Subject: [PATCH] Add a fancy test for multiple named subpatterns. --- testdata/testinput1 | 61 +++++++++++++++++++++++++++++++++++++ testdata/testoutput1 | 72 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 133 insertions(+) diff --git a/testdata/testinput1 b/testdata/testinput1 index b449969..1f32d95 100644 --- a/testdata/testinput1 +++ b/testdata/testinput1 @@ -5920,4 +5920,65 @@ ef) x/x,mark /^(?1)\d{3}(a)/ a123a +# This pattern uses a lot of named subpatterns in order to match email +# addresses in various formats. It's a heavy test for named subpatterns. In the +# <atext> group, slash is coded as \x{2f} so that this pattern can also be +# processed by perltest.sh, which does not cater for an escaped delimiter +# within the pattern. All $ and @ characters in subject strings are escaped so +# that Perl doesn't interpret them as variable insertions and " characters must +# also be escaped for Perl. + +# This set of subpatterns is more or less a direct transliteration of the BNF +# definitions in RFC2822, without any of the obsolete features. The addition of +# a possessive + to the definition of <phrase> reduced the match limit in PCRE2 +# from over 5 million to just under 400, and eliminated a very noticeable delay +# when this file was passed to perltest.sh. + +/(?ix)(?(DEFINE) +(?<addr_spec> (?&local_part) \@ (?&domain) ) +(?<angle_addr> (?&CFWS)?+ < (?&addr_spec) > (?&CFWS)?+ ) +(?<atext> [a-z\d!#$%&'*+-\x{2f}=?^_`{|}~] ) +(?<atom> (?&CFWS)?+ (?&atext)+ (?&CFWS)?+ ) +(?<ccontent> (?&ctext) | (?&quoted_pair) | (?&comment) ) +(?<ctext> [^\x{9}\x{10}\x{13}\x{7f}-\x{ff}\ ()\\] ) +(?<comment> \( (?: (?&FWS)?+ (?&ccontent) )*+ (?&FWS)?+ \) ) +(?<CFWS> (?: (?&FWS)?+ (?&comment) )* (?# NOT possessive) + (?: (?&FWS)?+ (?&comment) | (?&FWS) ) ) +(?<dcontent> (?&dtext) | (?&quoted_pair) ) +(?<display_name> (?&phrase) ) +(?<domain> (?&dot_atom) | (?&domain_literal) ) +(?<domain_literal> (?&CFWS)?+ \[ (?: (?&FWS)?+ (?&dcontent) )* (?&FWS)?+ \] + (?&CFWS)?+ ) +(?<dot_atom>
(?&CFWS)?+ (?&dot_atom_text) (?&CFWS)?+ ) +(?<dot_atom_text> (?&atext)++ (?: \. (?&atext)++)*+ ) +(?<dtext> [^\x{9}\x{10}\x{13}\x{7f}-\x{ff}\ \[\]\\] ) +(?<FWS> (?: [\t\ ]*+ \n)?+ [\t\ ]++ ) +(?<local_part> (?&dot_atom) | (?&quoted_string) ) +(?<mailbox> (?&name_addr) | (?&addr_spec) ) +(?<name_addr> (?&display_name)? (?&angle_addr) ) +(?<phrase> (?&word)++ ) +(?<qcontent> (?&qtext) | (?&quoted_pair) ) +(?<quoted_pair> " (?&text) ) +(?<quoted_string> (?&CFWS)?+ " (?: (?&FWS)?+ (?&qcontent))* (?&FWS)?+ " + (?&CFWS)?+ ) +(?<qtext> [^\x{9}\x{10}\x{13}\x{7f}-\x{ff}\ "\\] ) +(?<text> [^\r\n] ) +(?<word> (?&atom) | (?&quoted_string) ) +) # End DEFINE +^(?&mailbox)$/ + Alan Other <user\@dom.ain> + <user\@dom.ain> + user\@dom.ain + user\@[] + user\@[domain literal] + user\@[domain literal with \"[square brackets\"] inside] + \"A. Other\" <user.1234\@dom.ain> (a comment) + A. Other <user.1234\@dom.ain> (a comment) + \"/s=user/ou=host/o=place/prmd=uu.yy/admd= /c=gb/\"\@x400-re.lay +\= Expect no match + A missing angle group, slash is coded as \x{2f} so that this pattern can also be +# processed by perltest.sh, which does not cater for an escaped delimiter +# within the pattern. All $ and @ characters in subject strings are escaped so +# that Perl doesn't interpret them as variable insertions and " characters must +# also be escaped for Perl. + +# This set of subpatterns is more or less a direct transliteration of the BNF +# definitions in RFC2822, without any of the obsolete features. The addition of +# a possessive + to the definition of <phrase> reduced the match limit in PCRE2 +# from over 5 million to just under 400, and eliminated a very noticeable delay +# when this file was passed to perltest.sh. + +/(?ix)(?(DEFINE) +(?<addr_spec> (?&local_part) \@ (?&domain) ) +(?<angle_addr> (?&CFWS)?+ < (?&addr_spec) > (?&CFWS)?+ ) +(?<atext> [a-z\d!#$%&'*+-\x{2f}=?^_`{|}~] ) +(?<atom> (?&CFWS)?+ (?&atext)+ (?&CFWS)?+ ) +(?<ccontent> (?&ctext) | (?&quoted_pair) | (?&comment) ) +(?<ctext> [^\x{9}\x{10}\x{13}\x{7f}-\x{ff}\ ()\\] ) +(?<comment> \( (?: (?&FWS)?+ (?&ccontent) )*+ (?&FWS)?+ \) ) +(?<CFWS> (?: (?&FWS)?+ (?&comment) )* (?# NOT possessive) + (?: (?&FWS)?+ (?&comment) | (?&FWS) ) ) +(?<dcontent> (?&dtext) | (?&quoted_pair) ) +(?<display_name> (?&phrase) ) +(?<domain>
(?&dot_atom) | (?&domain_literal) ) +(?<domain_literal> (?&CFWS)?+ \[ (?: (?&FWS)?+ (?&dcontent) )* (?&FWS)?+ \] + (?&CFWS)?+ ) +(?<dot_atom> (?&CFWS)?+ (?&dot_atom_text) (?&CFWS)?+ ) +(?<dot_atom_text> (?&atext)++ (?: \. (?&atext)++)*+ ) +(?<dtext> [^\x{9}\x{10}\x{13}\x{7f}-\x{ff}\ \[\]\\] ) +(?<FWS> (?: [\t\ ]*+ \n)?+ [\t\ ]++ ) +(?<local_part> (?&dot_atom) | (?&quoted_string) ) +(?<mailbox> (?&name_addr) | (?&addr_spec) ) +(?<name_addr> (?&display_name)? (?&angle_addr) ) +(?<phrase> (?&word)++ ) +(?<qcontent> (?&qtext) | (?&quoted_pair) ) +(?<quoted_pair> " (?&text) ) +(?<quoted_string> (?&CFWS)?+ " (?: (?&FWS)?+ (?&qcontent))* (?&FWS)?+ " + (?&CFWS)?+ ) +(?<qtext> [^\x{9}\x{10}\x{13}\x{7f}-\x{ff}\ "\\] ) +(?<text> [^\r\n] ) +(?<word> (?&atom) | (?&quoted_string) ) +) # End DEFINE +^(?&mailbox)$/ + Alan Other <user\@dom.ain> + 0: Alan Other <user@dom.ain> + <user\@dom.ain> + 0: <user@dom.ain> + user\@dom.ain + 0: user@dom.ain + user\@[] + 0: user@[] + user\@[domain literal] + 0: user@[domain literal] + user\@[domain literal with \"[square brackets\"] inside] + 0: user@[domain literal with "[square brackets"] inside] + \"A. Other\" <user.1234\@dom.ain> (a comment) + 0: "A. Other" <user.1234@dom.ain> (a comment) + A. Other <user.1234\@dom.ain> (a comment) + 0: A. Other <user.1234@dom.ain> (a comment) + \"/s=user/ou=host/o=place/prmd=uu.yy/admd= /c=gb/\"\@x400-re.lay + 0: "/s=user/ou=host/o=place/prmd=uu.yy/admd= /c=gb/"@x400-re.lay +\= Expect no match + A missing angle