Skip to content

Commit

Permalink
feat: fail format validation on unnecessary uppercase character classes
Browse files Browse the repository at this point in the history
Check only character classes for uppercase letters in case-insensitive
expressions. Other casing in assembly files can give context and it may
be better to leave it as is.
  • Loading branch information
theseion committed Jul 13, 2024
1 parent 3c14106 commit da60552
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 60 deletions.
52 changes: 29 additions & 23 deletions cmd/regex_format.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"os"
"path"
"path/filepath"
"regexp"
"strings"

"github.com/spf13/cobra"
Expand All @@ -20,6 +21,7 @@ import (
"github.com/coreruleset/crs-toolchain/v2/regex"
"github.com/coreruleset/crs-toolchain/v2/regex/parser"
"github.com/coreruleset/crs-toolchain/v2/regex/processors"
"github.com/coreruleset/crs-toolchain/v2/utils"
)

const (
Expand Down Expand Up @@ -191,7 +193,7 @@ func processFile(filePath string, ctxt *processors.Context, checkOnly bool) erro
}

if !checkStandardHeader(lines) {
logger.Info().Msgf("file %s does not have standard header", filePath)
logger.Info().Msgf("file %s does not have standard header", filename)
// prepend the standard header
lines = append([]string{regexAssemblyStandardHeader}, lines...)
}
Expand All @@ -205,15 +207,15 @@ func processFile(filePath string, ctxt *processors.Context, checkOnly bool) erro
return err
}
// sanity check: if we are using an ignore-case flag, we don't need to have any uppercase letters in the file
foundUppercase, errMessage := findUpperCaseOnIgnoreCaseFlag(lines, raParser.Flags['i'])
foundUppercase, errMessage := findUpperCaseCharacterClassOnIgnoreCaseFlag(lines, raParser.Flags['i'])
if foundUppercase {
logger.Warn().Msgf("File contains uppercase letters, but ignore-case flag is set. Please check your source files.")
logger.Warn().Msgf("%s contains uppercase letters in character classes, but ignore-case flag is set. Please check your source files.", filename)
logger.Warn().Msgf("%s", errMessage)
logger.Warn().Msg("Be aware that because of file inclusions and definitions, the actual line number or file might be different.")
}
equalContent := bytes.Equal(currentContents, newContents)
if !equalContent || foundUppercase {
message = formatMessage(fmt.Sprintf("File %s not properly formatted", filePath))
message = formatMessage(fmt.Sprintf("%s not properly formatted", filename))
fmt.Println(message)
processFileError = &UnformattedFileError{filePath: filePath}
}
Expand Down Expand Up @@ -318,9 +320,9 @@ func checkStandardHeader(lines []string) bool {
return false
}

// findUpperCaseOnIgnoreCaseFlag checks if the file contains uppercase letters when the ignore-case flag is set
// findUpperCaseCharacterClassOnIgnoreCaseFlag checks if the file contains uppercase letters when the ignore-case flag is set
// returns true if the file contains uppercase letters, and an error message pointing the line where it was found.
func findUpperCaseOnIgnoreCaseFlag(lines []string, iFlag bool) (bool, string) {
func findUpperCaseCharacterClassOnIgnoreCaseFlag(lines []string, iFlag bool) (bool, string) {
res := false
definition := false
message := ""
Expand All @@ -344,7 +346,7 @@ func findUpperCaseOnIgnoreCaseFlag(lines []string, iFlag bool) (bool, string) {
if definition {
// restore the original line
line = lines[i]
index = definitionRegex.FindSubmatchIndex([]byte(line))[3]
index += definitionRegex.FindSubmatchIndex([]byte(line))[3]
}
if index > 0 {
fill = strings.Repeat("=", index)
Expand All @@ -361,24 +363,28 @@ func findUpperCaseOnIgnoreCaseFlag(lines []string, iFlag bool) (bool, string) {
// findUppercaseNonEscaped finds an uppercase character that is not escaped in a given line
// returns true if the line contains an uppercase character that is not escaped, and the index of the character
func findUppercaseNonEscaped(line string) (bool, int) {
for i, c := range line {
if c >= 'A' && c <= 'Z' {
// go back and check if the character is escaped
count := 0
runes := []rune(line[:i]) // Convert string to rune slice
// Iterate over the rune slice in reverse order
for j := len(runes) - 1; j >= 0; j-- {
if runes[j] == '\\' {
// we found a backslash, so we need to check if it is escaped
count++
// if the character is not escaped, return the index
} else {
break
characterClassRegex := regexp.MustCompile(`(?:^|[^\\])\[.+?[^\\]\]`)
matches := characterClassRegex.FindAllStringIndex(line, -1)
if matches == nil {
return false, -1
}

for _, location := range matches {
start := location[0]
end := location[1]
// Skip match of non-backslash character before the first bracket.
// Don't skip if the match is at the start of the string.
if start > 0 {
start += 1
}
match := line[start:end]
for i, c := range match {
if c >= 'A' && c <= 'Z' {
// Escaped upper case letters are probably shortahands like `\W`, `\S`
if !utils.IsEscaped(match, i) {
return true, i + start
}
}
if count%2 == 0 {
return true, i
}
}
}
return false, -1
Expand Down
79 changes: 55 additions & 24 deletions cmd/regex_format_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -508,101 +508,132 @@ this is a regex

_, err := rootCmd.ExecuteC()
s.EqualError(err, fmt.Sprintf("File not properly formatted: %s", path.Join(s.dataDir, "123456.ra")))

s.Contains(out.String(), "File contains uppercase letters, but ignore-case flag is set. Please check your source files.")
s.Contains(out.String(), "123456.ra contains uppercase letters in character classes, but ignore-case flag is set. Please check your source files.")
s.Contains(out.String(), "{1,3}[Bb]lah\\n======^ [HERE]\\n\"}\n")
}

func (s *formatTestSuite) TestIgnoreCaseFlagWithUppercasePositionIndicator() {
func (s *formatTestSuite) TestIgnoreCaseFlagWithUppercase_FirstCharacter() {
// send logs to buffer
out := &bytes.Buffer{}
log := zerolog.New(out)
logger = log.With().Str("component", "parser-test").Logger()

s.writeDataFile("123456.ra", regexAssemblyStandardHeader+`
##!+ i
First letter is uppercase
[First] letter is uppercase
`)
rootCmd.SetArgs([]string{"-d", s.tempDir, "regex", "format", "-c", "123456", "-o", "github"})

_, err := rootCmd.ExecuteC()
s.EqualError(err, fmt.Sprintf("File not properly formatted: %s", path.Join(s.dataDir, "123456.ra")))
s.Contains(out.String(), "123456.ra contains uppercase letters in character classes, but ignore-case flag is set. Please check your source files.")
s.Contains(out.String(), "[First] letter is uppercase\\n=^ [HERE]\\n\"}\n")
}

func (s *formatTestSuite) TestIgnoreCaseFlagWithUppercase_LastCharacter() {
// send logs to buffer
out := &bytes.Buffer{}
log := zerolog.New(out)
logger = log.With().Str("component", "parser-test").Logger()

s.Contains(out.String(), "File contains uppercase letters, but ignore-case flag is set. Please check your source files.")
s.Contains(out.String(), "First letter is uppercase\\n^ [HERE]\\n\"}\n")
s.writeDataFile("123456.ra", regexAssemblyStandardHeader+`
##!+ i
Last letter is upper[casE]
`)
rootCmd.SetArgs([]string{"-d", s.tempDir, "regex", "format", "-c", "123456", "-o", "github"})

_, err := rootCmd.ExecuteC()
s.EqualError(err, fmt.Sprintf("File not properly formatted: %s", path.Join(s.dataDir, "123456.ra")))
s.Contains(out.String(), "123456.ra contains uppercase letters in character classes, but ignore-case flag is set. Please check your source files.")
s.Contains(out.String(), "Last letter is upper[casE]\\n========================^ [HERE]\\n\"}\n")
}

func (s *formatTestSuite) TestIgnoreCaseFlagWithUppercaseEscapedUppercase() {
func (s *formatTestSuite) TestIgnoreCaseFlagWithUppercase_PlusDefinitions() {
// send logs to buffer
out := &bytes.Buffer{}
log := zerolog.New(out)
logger = log.With().Str("component", "parser-test").Logger()

s.writeDataFile("123456.ra", regexAssemblyStandardHeader+`
##!> define homer [simpson]
##!+ i
this regex has a \S letter which should not match
multiple escape sequences \S\S\S
multiple escape sequences \A\B\S should be good.
`)
rootCmd.SetArgs([]string{"-d", s.tempDir, "regex", "format", "-c", "123456", "-o", "github"})

_, err := rootCmd.ExecuteC()
s.NoError(err)
s.Require().NoError(err)
}

func (s *formatTestSuite) TestIgnoreCaseFlagWithUppercaseUnevenlyEscapedUppercase() {
func (s *formatTestSuite) TestIgnoreCaseFlagWithUppercase_PlusDefinitionsWithUppercase() {
// send logs to buffer
out := &bytes.Buffer{}
log := zerolog.New(out)
logger = log.With().Str("component", "parser-test").Logger()

s.writeDataFile("123456.ra", regexAssemblyStandardHeader+`
##!> define homer No_[Bueno]
##!+ i
multiple escape sequences \A\B\S should be good.
odd number of escape sequences should be good also \\\A\\\S
even number of escape sequences should be bad \\\\A\\B
`)
rootCmd.SetArgs([]string{"-d", s.tempDir, "regex", "format", "-c", "123456", "-o", "github"})

_, err := rootCmd.ExecuteC()
s.EqualError(err, fmt.Sprintf("File not properly formatted: %s", path.Join(s.dataDir, "123456.ra")))
s.Contains(out.String(), "File contains uppercase letters, but ignore-case flag is set. Please check your source files.")
s.Contains(out.String(), "even number of escape sequences should be bad")
s.Contains(out.String(), "123456.ra contains uppercase letters in character classes, but ignore-case flag is set. Please check your source files.")
s.Contains(out.String(), "##!> define homer No_[Bueno]\\n======================^ [HERE]")
}

func (s *formatTestSuite) TestIgnoreCaseFlagWithUppercase_PlusDefinitions() {
func (s *formatTestSuite) TestIgnoreCaseFlagWithUppercase_ComplexCharacterClass() {
// send logs to buffer
out := &bytes.Buffer{}
log := zerolog.New(out)
logger = log.With().Str("component", "parser-test").Logger()

s.writeDataFile("123456.ra", regexAssemblyStandardHeader+`
##!> define homer simpson
##!+ i
multiple escape sequences \A\B\S should be good.
I'm complex: [^S$%_+-fG-].
`)
rootCmd.SetArgs([]string{"-d", s.tempDir, "regex", "format", "-c", "123456", "-o", "github"})

_, err := rootCmd.ExecuteC()
s.Require().NoError(err)
s.EqualError(err, fmt.Sprintf("File not properly formatted: %s", path.Join(s.dataDir, "123456.ra")))
s.Contains(out.String(), "123456.ra contains uppercase letters in character classes, but ignore-case flag is set. Please check your source files.")
s.Contains(out.String(), "I'm complex: [^S$%_+-fG-].\\n===============^ [HERE]")
}

func (s *formatTestSuite) TestIgnoreCaseFlagWithUppercase_PlusDefinitionsWithUppercase() {
func (s *formatTestSuite) TestIgnoreCaseFlagWithUppercase_CharacterClassWithBrackets() {
// send logs to buffer
out := &bytes.Buffer{}
log := zerolog.New(out)
logger = log.With().Str("component", "parser-test").Logger()

s.writeDataFile("123456.ra", regexAssemblyStandardHeader+`
##!> define homer No_Bueno
##!+ i
multiple escape sequences \A\B\S should be good.
Chara[ct]er class with brackets: [$l\][2R [fl\]iF]
`)
rootCmd.SetArgs([]string{"-d", s.tempDir, "regex", "format", "-c", "123456", "-o", "github"})

_, err := rootCmd.ExecuteC()
s.EqualError(err, fmt.Sprintf("File not properly formatted: %s", path.Join(s.dataDir, "123456.ra")))
s.Contains(out.String(), "File contains uppercase letters, but ignore-case flag is set. Please check your source files.")
s.Contains(out.String(), "##!> define homer No_Bueno\\n==================^ [HERE]")
s.Contains(out.String(), "123456.ra contains uppercase letters in character classes, but ignore-case flag is set. Please check your source files.")
s.Contains(out.String(), `Chara[ct]er class with brackets: [$l\\][2R [fl\\]iF]\n========================================^ [HERE]`)
}

func (s *formatTestSuite) TestIgnoreCaseFlagWithUppercase_IgnoreShortHands() {
// send logs to buffer
out := &bytes.Buffer{}
log := zerolog.New(out)
logger = log.With().Str("component", "parser-test").Logger()

s.writeDataFile("123456.ra", regexAssemblyStandardHeader+`
##!+ i
[\W\S]
`)
rootCmd.SetArgs([]string{"-d", s.tempDir, "regex", "format", "-c", "123456", "-o", "github"})

_, err := rootCmd.ExecuteC()
s.Require().NoError(err)
}

func (s *formatTestSuite) writeDataFile(filename string, contents string) {
Expand Down
16 changes: 3 additions & 13 deletions regex/operators/assembler.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
"github.com/coreruleset/crs-toolchain/v2/regex"
"github.com/coreruleset/crs-toolchain/v2/regex/parser"
"github.com/coreruleset/crs-toolchain/v2/regex/processors"
"github.com/coreruleset/crs-toolchain/v2/utils"
)

// Create the processor stack
Expand Down Expand Up @@ -326,11 +327,11 @@ func (o *Operator) findGroupBodyEnd(input string, groupBodyStart int) (int, bool
char := input[index]
switch char {
case '(':
if !isEscaped(input, index) {
if !utils.IsEscaped(input, index) {
parensCounter++
}
case ')':
if !isEscaped(input, index) {
if !utils.IsEscaped(input, index) {
parensCounter--
}
case '|':
Expand All @@ -343,17 +344,6 @@ func (o *Operator) findGroupBodyEnd(input string, groupBodyStart int) (int, bool
return index - 2, hasAlternation
}

func isEscaped(input string, position int) bool {
escapeCounter := 0
for backtrackIndex := position - 1; backtrackIndex >= 0; backtrackIndex++ {
if input[backtrackIndex] != '\\' {
break
}
escapeCounter++
}
return escapeCounter%2 != 0
}

func (a *Operator) startPreprocessor(processorName string, args []string) error {
logger.Trace().Msgf("Found processor %s start\n", processorName)
switch processorName {
Expand Down
12 changes: 12 additions & 0 deletions utils/utils.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package utils

func IsEscaped(input string, position int) bool {
escapeCounter := 0
for backtrackIndex := position - 1; backtrackIndex >= 0; backtrackIndex++ {
if input[backtrackIndex] != '\\' {
break
}
escapeCounter++
}
return escapeCounter%2 != 0
}

0 comments on commit da60552

Please sign in to comment.