Skip to content

Commit

Permalink
Compile adapted regex patterns on .NET 7+ only if they don't contain …
Browse files Browse the repository at this point in the history
…negative lookaround assertions
  • Loading branch information
adams85 committed Apr 1, 2024
1 parent a5a6c4b commit d0ac02b
Show file tree
Hide file tree
Showing 11 changed files with 291 additions and 239 deletions.
3 changes: 3 additions & 0 deletions src/Acornima/Ast/INode.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,8 @@ public interface INode
ref readonly Range RangeRef { get; }
SourceLocation Location { get; }
ref readonly SourceLocation LocationRef { get; }
/// <remarks>
/// The operation is not guaranteed to be thread-safe. In case concurrent access or update is possible, the necessary synchronization is caller's responsibility.
/// </remarks>
object? UserData { get; }
}
3 changes: 3 additions & 0 deletions src/Acornima/RegExpParseMode.cs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ public enum RegExpParseMode
/// In case an invalid regular expression is encountered, <see cref="SyntaxErrorException"/> is thrown.<br/>
/// In the case of a valid regular expression for which an equivalent <see cref="Regex"/> cannot be constructed, either <see cref="RegExpConversionErrorException"/> is thrown
/// or a <see cref="Token"/> is created with the <see cref="Token.Value"/> property set to <see langword="null"/>, depending on the <see cref="TokenizerOptions.Tolerant"/> option.
/// <para>
/// Please note that adapted patterns containing negative lookaround assertions won't be compiled on .NET 7+ because of a <seealso href="https://github.com/dotnet/runtime/issues/97455">regression of .NET's regex compiler</seealso>.
/// </para>
/// </remarks>
AdaptToCompiled,
}
21 changes: 17 additions & 4 deletions src/Acornima/Tokenizer.RegExpParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ internal partial struct RegExpParser
private const int SetRangeNotStarted = int.MaxValue;
private const int SetRangeStartedWithCharClass = int.MaxValue - 1;

// Negative lookaround assertions don't work as expected under .NET 7 and .NET 8 when the regex is compiled
// (see also https://github.com/dotnet/runtime/issues/97455).
private static readonly bool s_canCompileNegativeLookaroundAssertions = typeof(Regex).Assembly.GetName().Version?.Major < 7;

internal static RegExpFlags ParseFlags(string value, int startIndex, Tokenizer tokenizer)
{
var flags = RegExpFlags.None;
Expand Down Expand Up @@ -175,7 +179,7 @@ public RegExpParseResult Parse()
}
}

var adaptedPattern = ParseCore(out var capturingGroups, out conversionError);
var adaptedPattern = ParseCore(out var capturingGroups, out conversionError, out var canCompile);
if (adaptedPattern is null)
{
// NOTE: ParseCore should return null
Expand All @@ -188,7 +192,7 @@ public RegExpParseResult Parse()
Debug.Assert(conversionError is null);
capturingGroups.TrimExcess();

var options = FlagsToOptions(_flags, compiled: _tokenizer._options._regExpParseMode == RegExpParseMode.AdaptToCompiled);
var options = FlagsToOptions(_flags, compiled: _tokenizer._options._regExpParseMode == RegExpParseMode.AdaptToCompiled && canCompile);
var matchTimeout = _tokenizer._options._regexTimeout;

try
Expand All @@ -202,7 +206,7 @@ public RegExpParseResult Parse()
}
}

internal string? ParseCore(out ArrayList<RegExpCapturingGroup> capturingGroups, out RegExpConversionError? conversionError)
internal string? ParseCore(out ArrayList<RegExpCapturingGroup> capturingGroups, out RegExpConversionError? conversionError, out bool canCompile)
{
_tokenizer.AcquireStringBuilder(out var sb);
try
Expand Down Expand Up @@ -234,9 +238,11 @@ public RegExpParseResult Parse()
};
context.SetFollowingQuantifierError(RegExpNothingToRepeat);

return (_flags & RegExpFlags.Unicode) != 0
var adaptedPattern = (_flags & RegExpFlags.Unicode) != 0
? ParsePattern(UnicodeMode.Instance, ref context, out conversionError)
: ParsePattern(LegacyMode.Instance, ref context, out conversionError);
canCompile = context.CanCompile;
return adaptedPattern;
}
finally
{
Expand Down Expand Up @@ -514,6 +520,10 @@ private void CheckBracesBalance(out ArrayList<RegExpCapturingGroup> capturingGro
context.SetFollowingQuantifierError(RegExpNothingToRepeat);
break;
}
else if (!s_canCompileNegativeLookaroundAssertions && groupType is RegExpGroupType.NegativeLookaheadAssertion or RegExpGroupType.NegativeLookbehindAssertion)
{
context.CanCompile = false;
}

sb?.Append(_pattern, i, 1 + ((int)groupType >> 2));
i += (int)groupType >> 2;
Expand Down Expand Up @@ -1166,6 +1176,7 @@ public ParsePatternContext(StringBuilder? sb, ReadOnlySpan<RegExpCapturingGroup>

CapturingGroups = capturingGroups;
CapturingGroupNames = capturingGroupNames;
CanCompile = true;
}

public int Index;
Expand Down Expand Up @@ -1225,6 +1236,8 @@ public void SetFollowingQuantifierError(string message, [CallerArgumentExpressio
// * Lone surrogates need special care too.
// We use the following list to build the adjusted character set.
public ArrayList<CodePointRange> UnicodeSet;

public bool CanCompile;
}

private interface IMode
Expand Down
2 changes: 1 addition & 1 deletion test/Acornima.Tests/Acornima.Tests.csproj
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFrameworks>net8.0</TargetFrameworks>
<TargetFrameworks>net6.0;net8.0</TargetFrameworks>
<TargetFrameworks Condition="'$(OS)' == 'Windows_NT'">$(TargetFrameworks);net462</TargetFrameworks>
<SignAssembly>true</SignAssembly>
<AssemblyOriginatorKeyFile>..\..\src\Karambolo.Public.snk</AssemblyOriginatorKeyFile>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
<AssemblyName>Acornima.Tests</AssemblyName>
<OutputType>Exe</OutputType>
<TargetFramework>net8.0</TargetFramework>
<Nullable>enable</Nullable>
<SignAssembly>true</SignAssembly>
<AssemblyOriginatorKeyFile>..\..\..\..\src\Karambolo.Public.snk</AssemblyOriginatorKeyFile>
</PropertyGroup>

<ItemGroup>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ VisualStudioVersion = 17.6.33717.318
MinimumVisualStudioVersion = 10.0.40219.1
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Generator", "Generator.csproj", "{5FC6B784-BAA9-4BF6-8845-0D762D938816}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Acornima.Core", "..\..\..\..\src\Acornima.Core\Acornima.Core.csproj", "{C3607046-115D-45CE-8EA2-580D270DDC08}"
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Acornima.Core", "..\..\..\..\src\Acornima\Acornima.csproj", "{C3607046-115D-45CE-8EA2-580D270DDC08}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Expand Down
2 changes: 1 addition & 1 deletion test/Acornima.Tests/Fixtures.RegExp/Generator/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ static string DecodeStringIfEscaped(string value) => JavaScriptString.IsStringLi
var flags = DecodeStringIfEscaped(parts[1]);

var regexParser = new Tokenizer.RegExpParser(pattern, flags, tokenizerOptions);
try { adaptedPattern = regexParser.ParseCore(out _, out _) ?? ")inconvertible("; }
try { adaptedPattern = regexParser.ParseCore(out _, out _, out _) ?? ")inconvertible("; }
catch (SyntaxErrorException) { adaptedPattern = ")syntax-error("; }
var encodedDotnetPattern = JavaScriptString.Encode(adaptedPattern, addDoubleQuotes: false);
if (adaptedPattern != encodedDotnetPattern)
Expand Down
Loading

0 comments on commit d0ac02b

Please sign in to comment.