Skip to content

Commit 2532d83

Browse files
authored
Merge pull request #1361 from WebFuzzing/external-pr-lmasroca
Regex support for octal, control and long hex escapes
2 parents 209e449 + 1a9dd53 commit 2532d83

7 files changed

Lines changed: 134 additions & 36 deletions

File tree

core/src/main/antlr4/org/evomaster/core/parser/RegexEcma262.g4

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -96,24 +96,22 @@ atom
9696

9797

9898
//TODO
99-
CharacterEscape
100-
// : ControlEscape
101-
// | 'c' ControlLetter
102-
: HexEscapeSequence
99+
fragment CharacterEscape
100+
: ControlEscape
101+
| 'c' ControlLetter
102+
| HexEscapeSequence
103103
| UnicodeEscapeSequence
104104
//| IdentityEscape
105105
;
106106

107-
//TODO
108-
//ControlEscape
109-
// //one of f n r t v
110-
// : [fnrtv]
111-
// ;
107+
fragment ControlEscape
108+
//one of f n r t v
109+
: [fnrtv]
110+
;
112111

113-
//TODO
114-
//ControlLetter
115-
// : [a-zA-Z]
116-
// ;
112+
fragment ControlLetter
113+
: [a-zA-Z]
114+
;
117115

118116

119117
//TODO
@@ -238,11 +236,11 @@ BaseChar
238236
: ~[0-9,^$\\.*+?()[\]{}|-]
239237
;
240238

241-
UnicodeEscapeSequence
239+
fragment UnicodeEscapeSequence
242240
: 'u' HexDigit HexDigit HexDigit HexDigit
243241
;
244242

245-
HexEscapeSequence
243+
fragment HexEscapeSequence
246244
: 'x' HexDigit HexDigit
247245
;
248246

core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4

Lines changed: 25 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -119,24 +119,23 @@ quoteChar
119119
;
120120

121121
//TODO
122-
CharacterEscape
123-
// : ControlEscape
124-
// | 'c' ControlLetter
125-
: HexEscapeSequence
122+
fragment CharacterEscape
123+
: ControlEscape
124+
| 'c' ControlLetter
125+
| HexEscapeSequence
126126
| UnicodeEscapeSequence
127+
| OctalEscapeSequence
127128
//| IdentityEscape
128129
;
129130

130-
//TODO
131-
//ControlEscape
132-
// //one of f n r t v
133-
// : [fnrtv]
134-
// ;
131+
fragment ControlEscape
132+
//one of a e f n r t
133+
: [aefnrt]
134+
;
135135

136-
//TODO
137-
//ControlLetter
138-
// : [a-zA-Z]
139-
// ;
136+
fragment ControlLetter
137+
: [?-_a-z]
138+
;
140139

141140

142141
//TODO
@@ -267,18 +266,29 @@ BaseChar
267266
: ~[0-9,^$\\.*+?()[\]{}|-]
268267
;
269268

270-
UnicodeEscapeSequence:
269+
fragment OctalEscapeSequence
270+
: '0' OctalDigit
271+
| '0' OctalDigit OctalDigit
272+
| '0' [0-3] OctalDigit OctalDigit
273+
;
274+
275+
fragment UnicodeEscapeSequence:
271276
'u' HexDigit HexDigit HexDigit HexDigit
272277
;
273278

274-
HexEscapeSequence
279+
fragment HexEscapeSequence
275280
: 'x' HexDigit HexDigit
281+
| 'x' BRACE_open HexDigit+ BRACE_close
276282
;
277283

278284
fragment HexDigit:
279285
[a-fA-F0-9]
280286
;
281287

288+
fragment OctalDigit:
289+
[0-7]
290+
;
291+
282292
//TODO
283293
//DecimalIntegerLiteral
284294
// : '0'

core/src/main/kotlin/org/evomaster/core/parser/GeneRegexEcma262Visitor.kt

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,9 +171,23 @@ class GeneRegexEcma262Visitor : RegexEcma262BaseVisitor<VisitResult>(){
171171
if(ctx.AtomEscape() != null) {
172172
val txt = ctx.AtomEscape().text
173173
when {
174+
txt[1]== 'c' -> {
175+
val controlLetterValue = txt[2].uppercaseChar().code.xor(0x40)
176+
return VisitResult(PatternCharacterBlockGene(txt, controlLetterValue.toChar().toString()))
177+
}
178+
txt[1] in "fnrtv" -> {
179+
val escape = when {
180+
txt[1] == 'n' -> "\u000A"
181+
txt[1] == 'v' -> "\u000B"
182+
txt[1] == 'f' -> "\u000C"
183+
txt[1] == 'r' -> "\u000D"
184+
else -> "\u0009"
185+
}
186+
return VisitResult(PatternCharacterBlockGene(txt, escape))
187+
}
174188
txt[1] == 'x' || txt[1] == 'u' -> {
175189
val hexValue =
176-
txt.subSequence(2, txt.length).toString().toInt(16)
190+
txt.substring(2).toInt(16)
177191
return VisitResult(
178192
PatternCharacterBlockGene(
179193
txt,

core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt

Lines changed: 37 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -183,13 +183,47 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor<VisitResult>(){
183183
if(ctx.AtomEscape() != null){
184184
val txt = ctx.AtomEscape().text
185185
when {
186+
txt[1] == '0' -> {
187+
val octalValue = txt.substring(2).toInt(8)
188+
return VisitResult(
189+
PatternCharacterBlockGene(
190+
txt,
191+
String(Character.toChars(octalValue))
192+
)
193+
)
194+
}
195+
txt[1]== 'c' -> {
196+
val controlLetterValue = if (txt[2].isLowerCase()){
197+
txt[2].uppercaseChar().code.xor(0x60)
198+
} else {
199+
txt[2].code.xor(0x40)
200+
}
201+
return VisitResult(PatternCharacterBlockGene(txt, controlLetterValue.toChar().toString()))
202+
}
203+
txt[1] in "aefnrt" -> {
204+
val escape = when {
205+
txt[1] == 'a' -> "\u0007"
206+
txt[1] == 'e' -> "\u001B"
207+
txt[1] == 'f' -> "\u000C"
208+
txt[1] == 'n' -> "\u000A"
209+
txt[1] == 'r' -> "\u000D"
210+
else -> "\u0009"
211+
}
212+
return VisitResult(PatternCharacterBlockGene(txt, escape))
213+
}
186214
txt[1] == 'x' || txt[1] == 'u' -> {
187-
val hexValue =
188-
txt.subSequence(2, txt.length).toString().toInt(16)
215+
val hexValue = when {
216+
txt[1] == 'x' && txt.length > 4 && txt[2] == '{' && txt[txt.length - 1] == '}'
217+
-> txt.substring(3, txt.length - 1).toInt(16)
218+
else -> txt.substring(2).toInt(16)
219+
}
220+
if(hexValue !in Character.MIN_CODE_POINT..Character.MAX_CODE_POINT){
221+
throw IllegalArgumentException("Hexadecimal escape out of range: ${ctx.text}")
222+
}
189223
return VisitResult(
190224
PatternCharacterBlockGene(
191225
txt,
192-
hexValue.toChar().toString()
226+
String(Character.toChars(hexValue))
193227
)
194228
)
195229
}

core/src/test/kotlin/org/evomaster/core/parser/GeneRegexEcma262VisitorTest.kt

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -329,11 +329,23 @@ open class GeneRegexEcma262VisitorTest : RegexTestTemplate(){
329329

330330
@Test
331331
fun testHexEscape(){
332-
checkSameAsJava("""\x00\x0a\xba\xFF""")
332+
checkSameAsJava("""x00\x00\x0a\xba\xFF""")
333333
}
334334

335335
@Test
336336
fun testUnicodeEscape(){
337-
checkSameAsJava("""\u0000\u0a0b\uffff""")
337+
checkSameAsJava("""u0000\u0000\u0a0b\uffff""")
338+
}
339+
340+
@Test
341+
open fun testControlEscape(){
342+
checkSameAsJava("""ftnrv\f\t\n\r\v""")
343+
}
344+
345+
@Test
346+
open fun testControlLetterEscape(){
347+
checkSameAsJava("""cac!\cA\cG\cZ""")
348+
// The following escape sequences behave differently in Java and JavaScript.
349+
checkCanSample("""\ca\cg\cz""","\u0001\u0007\u001A",10_000)
338350
}
339351
}

core/src/test/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitorTest.kt

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,4 +72,24 @@ class GeneRegexJavaVisitorTest : GeneRegexEcma262VisitorTest() {
7272
//checkSameAsJava("[9-1]") //not valid in Java
7373
checkCanSample("[9-1]", listOf("1","5","9"),200)
7474
}
75+
76+
@Test
77+
fun testJavaHexEscape(){
78+
checkSameAsJava("""x{3}\x{0}\x{FFFf}\x{0FFFf}\x{01FFFf}\x{10FFFf}""")
79+
}
80+
81+
@Test
82+
fun testJavaOctalEscape(){
83+
checkSameAsJava("""00\00\07\077\0377\0378\0400""")
84+
}
85+
86+
@Test
87+
override fun testControlEscape(){
88+
checkSameAsJava("""aefnrt\a\e\f\n\r\t""")
89+
}
90+
91+
@Test
92+
override fun testControlLetterEscape() {
93+
checkSameAsJava("""cac!\ca\cg\cz\cA\cG\cZ\c@\c[\c\\c]\c^\c\c_\c?""")
94+
}
7595
}

core/src/test/kotlin/org/evomaster/core/parser/RegexHandlerTest.kt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,11 +125,21 @@ internal class RegexHandlerTest{
125125
fun testCreateGeneForJVMInvalidRegex() {
126126

127127
assertThrows(ParseCancellationException::class.java) { RegexHandler.createGeneForJVM("\\xR") }
128+
assertThrows(ParseCancellationException::class.java) { RegexHandler.createGeneForJVM("\\ugggg") }
129+
assertThrows(ParseCancellationException::class.java) { RegexHandler.createGeneForJVM("\\x{}") }
130+
assertThrows(ParseCancellationException::class.java) { RegexHandler.createGeneForJVM("\\x{") }
131+
assertThrows(ParseCancellationException::class.java) { RegexHandler.createGeneForJVM("\\x}") }
132+
assertThrows(ParseCancellationException::class.java) { RegexHandler.createGeneForJVM("\\x[h}") }
133+
assertThrows(IllegalArgumentException::class.java) { RegexHandler.createGeneForJVM("\\x{110000}") }
134+
assertThrows(IllegalArgumentException::class.java) { RegexHandler.createGeneForJVM("\\x{ffffff}") }
135+
assertThrows(ParseCancellationException::class.java) { RegexHandler.createGeneForJVM("\\0") }
136+
assertThrows(ParseCancellationException::class.java) { RegexHandler.createGeneForJVM("\\09") }
128137
}
129138

130139
@Test
131140
fun testCreateGeneForEcma262InvalidRegex() {
132141

133142
assertThrows(ParseCancellationException::class.java) { RegexHandler.createGeneForEcma262("\\xR") }
143+
assertThrows(ParseCancellationException::class.java) { RegexHandler.createGeneForEcma262("\\ugggg") }
134144
}
135145
}

0 commit comments

Comments
 (0)