-
-
Notifications
You must be signed in to change notification settings - Fork 4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
model to score content #15
Labels
enhancement
New feature or request
Comments
import json

import joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
class AIDetector:
    """
    A machine learning model to detect AI-generated content in news articles.

    The model is a TF-IDF + logistic regression pipeline that predicts which
    rewrite "version" a piece of text belongs to.

    Attributes:
        pipeline: A pipeline consisting of TF-IDF vectorization and logistic regression.
        le: A LabelEncoder to encode the version numbers as integers.
    """

    def __init__(self):
        self.pipeline = Pipeline([
            ('vectorizer', TfidfVectorizer(stop_words='english')),
            ('clf', LogisticRegression(max_iter=10000))
        ])
        self.le = LabelEncoder()

    def load_dataset(self, json_files):
        """
        Load the dataset from JSON files.

        Each file must hold a JSON object with an 'original' key and any
        number of 'version_<N>' keys. Keys that do not match that pattern are
        ignored, so extra metadata in a file does not break loading.

        Args:
            json_files (list): A list of JSON files containing the dataset.

        Returns:
            pd.DataFrame: One row per rewritten version, with the columns
            'original', 'rewritten' and 'version' (the integer N).

        Raises:
            KeyError: If a file has no 'original' key.
        """
        prefix = 'version_'
        rows = []
        for path in json_files:
            with open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            original_text = data['original']
            # Discover the versions from the keys themselves instead of
            # assuming every non-'original' key is a version.
            versions = {}
            for key, value in data.items():
                suffix = key[len(prefix):]
                if key.startswith(prefix) and suffix.isdecimal():
                    versions[int(suffix)] = value
            rows.extend(
                (original_text, versions[number], number)
                for number in sorted(versions)
            )
        return pd.DataFrame(rows, columns=['original', 'rewritten', 'version'])

    def train(self, X, y):
        """
        Train the model on the dataset.

        Runs a 5-fold grid search (macro F1) on 80% of the data and keeps the
        best pipeline. The remaining 20% is split off but not used here.

        Args:
            X (pd.Series): The rewritten news articles.
            y (pd.Series): The version numbers of the rewritten articles.

        Returns:
            self: The trained model.
        """
        y_encoded = self.le.fit_transform(y)
        X_train, _, y_train, _ = train_test_split(
            X, y_encoded, test_size=0.2, random_state=42
        )
        param_grid = {
            'vectorizer__max_df': [0.5, 0.75, 1.0],
            # An integer min_df must be >= 1; 1 means "no pruning", which is
            # what the previous value 0 was meant to express.
            'vectorizer__min_df': [1, 0.1, 0.5],
            'clf__C': [0.1, 1, 10],
            'clf__penalty': ['l1', 'l2'],
            # The default 'lbfgs' solver rejects the l1 penalty; 'saga'
            # supports both penalties, including for multiclass targets.
            'clf__solver': ['saga'],
        }
        grid_search = GridSearchCV(self.pipeline, param_grid, cv=5, scoring='f1_macro')
        grid_search.fit(X_train, y_train)
        self.pipeline = grid_search.best_estimator_
        return self

    def predict(self, text):
        """
        Predict the version number of a rewritten news article.

        Args:
            text (str): The rewritten news article.

        Returns:
            int: The predicted version number.
        """
        # The pipeline vectorizes and classifies in one call; its output is
        # already the 1-D array that inverse_transform expects.
        encoded_prediction = self.pipeline.predict([text])
        return self.le.inverse_transform(encoded_prediction)[0]

    def evaluate(self, X, y):
        """
        Evaluate the model on the test set.

        Args:
            X (pd.Series): The rewritten news articles.
            y (pd.Series): The version numbers of the rewritten articles.

        Returns:
            tuple: A tuple containing the accuracy, classification report, and confusion matrix.
        """
        # The pipeline was fitted on encoded labels, so map its predictions
        # back to version numbers before comparing them with `y`.
        y_pred = self.le.inverse_transform(self.pipeline.predict(X))
        accuracy = accuracy_score(y, y_pred)
        report = classification_report(y, y_pred)
        matrix = confusion_matrix(y, y_pred)
        return accuracy, report, matrix

    def save(self, filename):
        """
        Save the trained model to a file.

        The fitted label encoder is stored together with the pipeline so that
        a reloaded model can still translate predictions to version numbers.

        Args:
            filename (str): The filename to save the model to.
        """
        joblib.dump({'pipeline': self.pipeline, 'label_encoder': self.le}, filename)

    def load(self, filename):
        """
        Load a trained model from a file.

        Args:
            filename (str): The filename to load the model from.

        Returns:
            self: The loaded model.
        """
        # NOTE: joblib files are pickle-based; only load files from a trusted source.
        payload = joblib.load(filename)
        if isinstance(payload, dict):
            self.pipeline = payload['pipeline']
            self.le = payload['label_encoder']
        else:
            # Older files contain only the pipeline; the label encoder stays
            # unfitted, so predict() cannot map back to version numbers.
            self.pipeline = payload
        return self

# Documentation: Class: Attributes:
Methods:
# Example Usage:
# `json_files` lists the dataset files to load; the paths below are
# placeholders and must be replaced with real files.
json_files = ['article_1.json', 'article_2.json']

# Build the dataset and train the detector on the rewritten texts.
ai_detector = AIDetector()
dataset = ai_detector.load_dataset(json_files)
X = dataset['rewritten']
y = dataset['version']
ai_detector.train(X, y)

# Score a new piece of text.
new_text = "This is a rewritten news article."
ai_score = ai_detector.predict(new_text)
print("AI score:", ai_score)

# Persist the model, reload it, and score the same text again.
ai_detector.save('ai_detector_model.joblib')
loaded_ai_detector = AIDetector().load('ai_detector_model.joblib')
loaded_ai_score = loaded_ai_detector.predict(new_text)
print("Loaded AI score:", loaded_ai_score)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Improvements:
LabelEncoder to encode the version numbers as integers, which is required for logistic regression.
predict_ai_score function to use the inverse_transform method to convert the predicted integer back to the original version number.

Documentation for the PoC:
Dataset:
Machine Learning Pipeline:
Hyperparameter Tuning:
max_df and min_df for the TF-IDF vectorizer.
C and penalty for the logistic regression model.

Evaluation Metrics:
Model Deployment:
Example Usage:
The predict_ai_score function takes in a rewritten news article text and returns the predicted version number.

The text was updated successfully, but these errors were encountered: