Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add is valid sql descriptor #1332

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@
"evaluate>=0.4.1",
"transformers[torch]>=4.39.3",
"sentence-transformers>=2.7.0",
"sqlglot>=25.24.3",
],
"spark": ["pyspark>=3.4.0"],
"fsspec": [
Expand Down
2 changes: 2 additions & 0 deletions src/evidently/descriptors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from .custom_descriptor import CustomPairColumnEval
from .hf_descriptor import HuggingFaceModel
from .hf_descriptor import HuggingFaceToxicityModel
from .is_valid_sql_descriptor import IsValidSQL
from .llm_judges import BiasLLMEval
from .llm_judges import ContextQualityLLMEval
from .llm_judges import DeclineLLMEval
Expand Down Expand Up @@ -55,5 +56,6 @@
"SentenceCount",
"Sentiment",
"RegExp",
"IsValidSQL",
"_registry",
]
3 changes: 3 additions & 0 deletions src/evidently/descriptors/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,3 +100,6 @@
"evidently.descriptors.custom_descriptor.CustomPairColumnEval",
"evidently:descriptor:CustomPairColumnEval",
)
# Register the IsValidSQL descriptor: class import path -> serialized type alias.
register_type_alias(
    FeatureDescriptor,
    "evidently.descriptors.is_valid_sql_descriptor.IsValidSQL",
    "evidently:descriptor:IsValidSQL",
)
11 changes: 11 additions & 0 deletions src/evidently/descriptors/is_valid_sql_descriptor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from evidently.features import is_valid_sql_feature
from evidently.features.generated_features import FeatureDescriptor
from evidently.features.generated_features import GeneratedFeature


class IsValidSQL(FeatureDescriptor):
    """Descriptor that yields the SQL-validity generated feature for a column."""

    class Config:
        type_alias = "evidently:descriptor:IsValidSQL"

    def feature(self, column_name: str) -> GeneratedFeature:
        """Build the ``IsValidSQL`` generated feature for ``column_name``.

        The descriptor's own ``display_name`` is forwarded to the feature.
        """
        sql_feature = is_valid_sql_feature.IsValidSQL(
            column_name=column_name,
            display_name=self.display_name,
        )
        return sql_feature
3 changes: 3 additions & 0 deletions src/evidently/features/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,3 +74,6 @@
# Register generated features: class import path -> serialized type alias.
register_type_alias(
    GeneratedFeatures,
    "evidently.features.words_feature.WordsPresence",
    "evidently:feature:WordsPresence",
)
register_type_alias(
    GeneratedFeatures,
    "evidently.features.is_valid_sql_feature.IsValidSQL",
    "evidently:feature:IsValidSQL",
)
44 changes: 44 additions & 0 deletions src/evidently/features/is_valid_sql_feature.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from typing import Any
from typing import ClassVar
from typing import Optional

import sqlvalidator

from evidently import ColumnType
from evidently.features.generated_features import ApplyColumnGeneratedFeature


class IsValidSQL(ApplyColumnGeneratedFeature):
    """Generated feature flagging whether each value of a column is valid SQL.

    Each value of ``column_name`` is mapped to a boolean: ``True`` when the
    value is a string holding at least one statement and every
    semicolon-separated statement is formatted by ``sqlvalidator.format_sql``
    without raising; ``False`` otherwise.

    NOTE(review): this depends on the third-party ``sqlvalidator`` package, so
    it must be listed among the project's install requirements -- confirm.
    """

    class Config:
        type_alias = "evidently:feature:IsValidSQL"

    __feature_type__: ClassVar = ColumnType.Categorical
    display_name_template: ClassVar = "SQL Validity Check for {column_name}"

    column_name: str

    def __init__(self, column_name: str, display_name: Optional[str] = None):
        """Store the source column name and the optional display name."""
        self.column_name = column_name
        self.display_name = display_name
        super().__init__()

    def apply(self, value: Any):
        """Return the validity flag for one cell value.

        Anything that is not a ``str`` (including ``None``) is reported as
        invalid without being passed to the validator.
        """
        if not isinstance(value, str):
            return False

        return self.is_valid_sql(value)

    def is_valid_sql(self, query: str) -> bool:
        """Return ``True`` if ``query`` holds only valid SQL statements.

        The text is split on ``;`` and every non-empty fragment is checked
        separately. Text with no statement at all (empty, whitespace-only or
        made up of semicolons only) is reported as invalid rather than
        vacuously valid.
        """
        # NOTE: the split is purely textual, so a ';' inside a quoted literal
        # also splits the statement.
        fragments = (part.strip() for part in query.split(";"))
        statements = [fragment for fragment in fragments if fragment]

        if not statements:
            # Nothing to validate: do not report empty text as valid SQL.
            return False

        for statement in statements:
            try:
                sqlvalidator.format_sql(statement)
            except Exception:
                # The validator's exception types are not visible here, so any
                # failure is deliberately treated as "invalid SQL".
                return False

        return True
29 changes: 29 additions & 0 deletions tests/features/test_is_valid_sql_feature.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import pandas as pd

from evidently.features.is_valid_sql_feature import IsValidSQL
from evidently.pipeline.column_mapping import ColumnMapping
from evidently.utils.data_preprocessing import create_data_definition


def test_is_valid_sql_feature():
    """Check IsValidSQL on valid, invalid and multi-statement inputs."""
    # Each case pairs an input text with the validity flag expected for it.
    cases = [
        ("SELECT * FROM users", True),  # Valid SQL (simple query)
        ("SELECT id, address FROM users; SELECT count(id) FROM users", True),  # Valid SQL (multiple SQL queries)
        ("INSERT INTO table", False),  # Invalid SQL (incomplete query)
        ("SLECT * FROM users", False),  # Invalid SQL (typo)
        ("SLECT * FROM users; SELECT id, address FROM users", False),  # Invalid SQL (1 invalid sub-query)
    ]
    queries = [text for text, _ in cases]
    flags = [flag for _, flag in cases]

    data = pd.DataFrame({"column_1": queries})
    definition = create_data_definition(None, data, ColumnMapping())

    result = IsValidSQL("column_1").generate_feature(
        data=data,
        data_definition=definition,
    )

    expected_result = pd.DataFrame({"column_1": flags})
    assert result.equals(expected_result)