1 files changed, 256 insertions, 0 deletions
diff --git a/dev-lang/perl/files/perl-5.8.8-CVE-2008-1927.patch b/dev-lang/perl/files/perl-5.8.8-CVE-2008-1927.patch
new file mode 100644
index 000000000000..470005804f57
--- /dev/null
+++ b/dev-lang/perl/files/perl-5.8.8-CVE-2008-1927.patch
@@ -0,0 +1,256 @@
+Fix a double free / segfault with utf8 regexps 
+Debian #454792
+[rt.cpan.org #48156]
+[rt.cpan.org #40641]
+upstream change 29204
+
+UTF8_ALLOW_DEFAULT definition in utf8.h picked from upstream change 27688
+
+diff --git a/embed.fnc b/embed.fnc
+index edfbc0e..26524c7 100644
+--- a/embed.fnc
++++ b/embed.fnc
+@@ -1168,6 +1168,7 @@ Es	|void	|reguni		|NN const struct RExC_state_t *state|UV uv|NN char *s|NN STRLE
+ Es	|regnode*|regclass	|NN struct RExC_state_t *state
+ ERs	|I32	|regcurly	|NN const char *
+ Es	|regnode*|reg_node	|NN struct RExC_state_t *state|U8 op
++Es	|UV	|reg_recode	|const char value|NULLOK SV **encp
+ Es	|regnode*|regpiece	|NN struct RExC_state_t *state|NN I32 *flagp
+ Es	|void	|reginsert	|NN struct RExC_state_t *state|U8 op|NN regnode *opnd
+ Es	|void	|regoptail	|NN struct RExC_state_t *state|NN regnode *p|NN regnode *val
+diff --git a/embed.h b/embed.h
+index 2b38fd5..372b04f 100644
+--- a/embed.h
++++ b/embed.h
+@@ -1234,6 +1234,7 @@
+ #define regclass		S_regclass
+ #define regcurly		S_regcurly
+ #define reg_node		S_reg_node
++#define reg_recode		S_reg_recode
+ #define regpiece		S_regpiece
+ #define reginsert		S_reginsert
+ #define regoptail		S_regoptail
+@@ -3277,6 +3278,7 @@
+ #define regclass(a)		S_regclass(aTHX_ a)
+ #define regcurly(a)		S_regcurly(aTHX_ a)
+ #define reg_node(a,b)		S_reg_node(aTHX_ a,b)
++#define reg_recode(a,b)		S_reg_recode(aTHX_ a,b)
+ #define regpiece(a,b)		S_regpiece(aTHX_ a,b)
+ #define reginsert(a,b,c)	S_reginsert(aTHX_ a,b,c)
+ #define regoptail(a,b,c)	S_regoptail(aTHX_ a,b,c)
+diff --git a/pod/perldiag.pod b/pod/perldiag.pod
+index 9b3134c..7d95216 100644
+--- a/pod/perldiag.pod
++++ b/pod/perldiag.pod
+@@ -1900,6 +1900,15 @@ recognized by Perl or by a user-supplied handler.  See L<attributes>.
+ (W printf) Perl does not understand the given format conversion.  See
+ L<perlfunc/sprintf>.
+ 
++=item Invalid escape in the specified encoding in regex; marked by <-- HERE in m/%s/
++
++(W regexp) The numeric escape (for example C<\xHH>) of value < 256
++didn't correspond to a single character through the conversion
++from the encoding specified by the encoding pragma.
++The escape was replaced with REPLACEMENT CHARACTER (U+FFFD) instead.
++The <-- HERE shows in the regular expression about where the
++escape was discovered.
++
+ =item Invalid [] range "%s" in regex; marked by <-- HERE in m/%s/
+ 
+ (F) The range specified in a character class had a minimum character
+diff --git a/proto.h b/proto.h
+index 6d185dd..ef6c0cf 100644
+--- a/proto.h
++++ b/proto.h
+@@ -1748,6 +1748,7 @@ STATIC I32	S_regcurly(pTHX_ const char *)
+ 			__attribute__warn_unused_result__;
+ 
+ STATIC regnode*	S_reg_node(pTHX_ struct RExC_state_t *state, U8 op);
++STATIC UV	S_reg_recode(pTHX_ const char value, SV **encp);
+ STATIC regnode*	S_regpiece(pTHX_ struct RExC_state_t *state, I32 *flagp);
+ STATIC void	S_reginsert(pTHX_ struct RExC_state_t *state, U8 op, regnode *opnd);
+ STATIC void	S_regoptail(pTHX_ struct RExC_state_t *state, regnode *p, regnode *val);
+diff --git a/regcomp.c b/regcomp.c
+index 928cf39..98d48dd 100644
+--- a/regcomp.c
++++ b/regcomp.c
+@@ -2791,6 +2791,39 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
+ }
+ 
+ /*
++ * reg_recode
++ *
++ * It returns the code point in utf8 for the value in *encp.
++ *    value: a code value in the source encoding
++ *    encp:  a pointer to an Encode object
++ *
++ * If the result from Encode is not a single character,
++ * it returns U+FFFD (Replacement character) and sets *encp to NULL.
++ */
++STATIC UV
++S_reg_recode(pTHX_ const char value, SV **encp)
++{
++    STRLEN numlen = 1;
++    SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
++    const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
++					 : SvPVX(sv);
++    const STRLEN newlen = SvCUR(sv);
++    UV uv = UNICODE_REPLACEMENT;
++
++    if (newlen)
++	uv = SvUTF8(sv)
++	     ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
++	     : *(U8*)s;
++
++    if (!newlen || numlen != newlen) {
++	uv = UNICODE_REPLACEMENT;
++	if (encp)
++	    *encp = NULL;
++    }
++    return uv;
++}
++
++/*
+  - regatom - the lowest level
+  *
+  * Optimization:  gobbles an entire sequence of ordinary characters so that
+@@ -3182,6 +3215,8 @@ tryagain:
+ 			    ender = grok_hex(p, &numlen, &flags, NULL);
+ 			    p += numlen;
+ 			}
++			if (PL_encoding && ender < 0x100)
++			    goto recode_encoding;
+ 			break;
+ 		    case 'c':
+ 			p++;
+@@ -3201,6 +3236,17 @@ tryagain:
+ 			    --p;
+ 			    goto loopdone;
+ 			}
++			if (PL_encoding && ender < 0x100)
++			    goto recode_encoding;
++			break;
++		    recode_encoding:
++			{
++			    SV* enc = PL_encoding;
++			    ender = reg_recode((const char)(U8)ender, &enc);
++			    if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
++				vWARN(p, "Invalid escape in the specified encoding");
++			    RExC_utf8 = 1;
++			}
+ 			break;
+ 		    case '\0':
+ 			if (p >= RExC_end)
+@@ -3331,32 +3377,6 @@ tryagain:
+ 	break;
+     }
+ 
+-    /* If the encoding pragma is in effect recode the text of
+-     * any EXACT-kind nodes. */
+-    if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
+-	STRLEN oldlen = STR_LEN(ret);
+-	SV *sv        = sv_2mortal(newSVpvn(STRING(ret), oldlen));
+-
+-	if (RExC_utf8)
+-	    SvUTF8_on(sv);
+-	if (sv_utf8_downgrade(sv, TRUE)) {
+-	    const char * const s = sv_recode_to_utf8(sv, PL_encoding);
+-	    const STRLEN newlen = SvCUR(sv);
+-
+-	    if (SvUTF8(sv))
+-		RExC_utf8 = 1;
+-	    if (!SIZE_ONLY) {
+-		DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
+-				      (int)oldlen, STRING(ret),
+-				      (int)newlen, s));
+-		Copy(s, STRING(ret), newlen, char);
+-		STR_LEN(ret) += newlen - oldlen;
+-		RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
+-	    } else
+-		RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
+-	}
+-    }
+-
+     return(ret);
+ }
+ 
+@@ -3734,6 +3754,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
+ 		    value = grok_hex(RExC_parse, &numlen, &flags, NULL);
+ 		    RExC_parse += numlen;
+ 		}
++		if (PL_encoding && value < 0x100)
++		    goto recode_encoding;
+ 		break;
+ 	    case 'c':
+ 		value = UCHARAT(RExC_parse++);
+@@ -3741,13 +3763,24 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
+ 		break;
+ 	    case '0': case '1': case '2': case '3': case '4':
+ 	    case '5': case '6': case '7': case '8': case '9':
+-            {
+-                I32 flags = 0;
+-		numlen = 3;
+-		value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
+-		RExC_parse += numlen;
+-		break;
+-            }
++		{
++		    I32 flags = 0;
++		    numlen = 3;
++		    value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
++		    RExC_parse += numlen;
++		    if (PL_encoding && value < 0x100)
++			goto recode_encoding;
++		    break;
++		}
++	    recode_encoding:
++		{
++		    SV* enc = PL_encoding;
++		    value = reg_recode((const char)(U8)value, &enc);
++		    if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
++			vWARN(RExC_parse,
++			      "Invalid escape in the specified encoding");
++		    break;
++		}
+ 	    default:
+ 		if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
+ 		    vWARN2(RExC_parse,
+diff --git a/t/uni/tr_utf8.t b/t/uni/tr_utf8.t
+index 606a84a..354156a 100755
+--- a/t/uni/tr_utf8.t
++++ b/t/uni/tr_utf8.t
+@@ -31,7 +31,7 @@ BEGIN {
+ }
+ 
+ use strict;
+-use Test::More tests => 7;
++use Test::More tests => 8;
+ 
+ use encoding 'utf8';
+ 
+@@ -67,4 +67,12 @@ is($str, $hiragana, "s/// # hiragana -> katakana");
+   $line =~ tr/bcdeghijklmnprstvwxyz$02578/בצדעגהיײקלמנפּרסטװשכיזשױתײחא/;
+   is($line, "aבצדעfגהיײקלמנoפqּרסuטװשכיזש1ױ34ת6ײח9", "[perl #16843]");
+ }
++
++{
++  # [perl #40641]
++  my $str = qq/Gebääääääääääääääääääääude/;
++  my $reg = qr/Gebääääääääääääääääääääude/;
++  ok($str =~ /$reg/, "[perl #40641]");
++}
++
+ __END__
+diff --git a/utf8.h b/utf8.h
+index 6d63897..3800866 100644
+--- a/utf8.h
++++ b/utf8.h
+@@ -198,6 +198,8 @@ encoded character.
+ 					 UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF)
+ #define UTF8_ALLOW_ANY			0x00FF
+ #define UTF8_CHECK_ONLY			0x0200
++#define UTF8_ALLOW_DEFAULT             (ckWARN(WARN_UTF8) ? 0 : \
++                                        UTF8_ALLOW_ANYUV)
+ 
+ #define UNICODE_SURROGATE_FIRST		0xD800
+ #define UNICODE_SURROGATE_LAST		0xDFFF