From bdac9df4cfeaa66c0a76366b4f6ab004edc86048 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zolt=C3=A1n=20Herczeg?= <hzmester@freemail.hu>
Date: Fri, 31 Mar 2017 05:40:37 +0000
Subject: [PATCH] Fix JIT character type detection when 32-bit and UCP are
 enabled but UTF is not.

---
 src/pcre2_jit_compile.c  | 31 +++++++++++++++++++++++++++++++
 testdata/testinput12     |  2 +-
 testdata/testoutput12-16 |  2 +-
 testdata/testoutput12-32 |  2 +-
 4 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/src/pcre2_jit_compile.c b/src/pcre2_jit_compile.c
index 846510a..e93143d 100644
--- a/src/pcre2_jit_compile.c
+++ b/src/pcre2_jit_compile.c
@@ -588,6 +588,8 @@ the start pointers when the end of the capturing group has not yet reached. */
 
 #define READ_CHAR_MAX 0x7fffffff
 
+#define INVALID_UTF_CHAR 888
+
 static PCRE2_SPTR bracketend(PCRE2_SPTR cc)
 {
 SLJIT_ASSERT((*cc >= OP_ASSERT && *cc <= OP_ASSERTBACK_NOT) || (*cc >= OP_ONCE && *cc <= OP_SCOND));
@@ -3558,10 +3560,30 @@ static void do_getucd(compiler_common *common)
 /* Search the UCD record for the character comes in TMP1.
 Returns chartype in TMP1 and UCD offset in TMP2. */
 DEFINE_COMPILER;
+#if PCRE2_CODE_UNIT_WIDTH == 32
+struct sljit_jump *jump;
+#endif
+
+#if defined SLJIT_DEBUG && SLJIT_DEBUG
+/* dummy_ucd_record */
+const ucd_record *record = GET_UCD(INVALID_UTF_CHAR);
+SLJIT_ASSERT(record->script == ucp_Common && record->chartype == ucp_Cn && record->gbprop == ucp_gbOther);
+SLJIT_ASSERT(record->caseset == 0 && record->other_case == 0);
+#endif
 
 SLJIT_ASSERT(UCD_BLOCK_SIZE == 128 && sizeof(ucd_record) == 8);
 
 sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
+
+#if PCRE2_CODE_UNIT_WIDTH == 32
+if (!common->utf)
+  {
+  jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, MAX_UTF_CODE_POINT + 1);
+  OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR);
+  JUMPHERE(jump);
+  }
+#endif
+
 OP2(SLJIT_LSHR, TMP2, 0, TMP1, 0, SLJIT_IMM, UCD_BLOCK_SHIFT);
 OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_stage1));
 OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, UCD_BLOCK_MASK);
@@ -5969,6 +5991,15 @@ if (needstype || needsscript)
   if (needschar && !charsaved)
     OP1(SLJIT_MOV, RETURN_ADDR, 0, TMP1, 0);
 
+#if PCRE2_CODE_UNIT_WIDTH == 32
+  if (!common->utf)
+    {
+    jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, MAX_UTF_CODE_POINT + 1);
+    OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR);
+    JUMPHERE(jump);
+    }
+#endif
+
   OP2(SLJIT_LSHR, TMP2, 0, TMP1, 0, SLJIT_IMM, UCD_BLOCK_SHIFT);
   OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_stage1));
   OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, UCD_BLOCK_MASK);
diff --git a/testdata/testinput12 b/testdata/testinput12
index decfe82..cca5dfa 100644
--- a/testdata/testinput12
+++ b/testdata/testinput12
@@ -361,6 +361,6 @@
 /[\s[:^ascii:]]/B,ucp
 
 /\pP/ucp
-    \x{7fffffff}\=no_jit
+    \x{7fffffff}
 
 # End of testinput12
diff --git a/testdata/testoutput12-16 b/testdata/testoutput12-16
index 41e0a48..33b8a33 100644
--- a/testdata/testoutput12-16
+++ b/testdata/testoutput12-16
@@ -1416,7 +1416,7 @@ No match
 ------------------------------------------------------------------
 
 /\pP/ucp
-    \x{7fffffff}\=no_jit
+    \x{7fffffff}
 ** Character \x{7fffffff} is greater than 0xffff and UTF-16 mode is not enabled.
 ** Truncation will probably give the wrong result.
 No match
diff --git a/testdata/testoutput12-32 b/testdata/testoutput12-32
index e9130b9..1abeb59 100644
--- a/testdata/testoutput12-32
+++ b/testdata/testoutput12-32
@@ -1410,7 +1410,7 @@ No match
 ------------------------------------------------------------------
 
 /\pP/ucp
-    \x{7fffffff}\=no_jit
+    \x{7fffffff}
 No match
 
 # End of testinput12