126 roc #131
Merged · 12 commits · Aug 9, 2023
Showing changes from all commits.
2 changes: 1 addition & 1 deletion poetry.lock

Generated file; the diff is not rendered by default.

68 changes: 64 additions & 4 deletions pxtextmining/factories/factory_model_performance.py
@@ -132,7 +132,6 @@ def get_multilabel_metrics(
enhance_with_rules=enhance_with_rules,
already_encoded=already_encoded,
)
y_pred = np.array(y_pred_df)[:, :-1].astype("int64")
elif model_type == "sklearn":
y_pred_df = predict_multilabel_sklearn(
x_test,
@@ -143,17 +142,28 @@ def get_multilabel_metrics(
enhance_with_probs=True,
enhance_with_rules=enhance_with_rules,
)
y_pred = np.array(y_pred_df)[:, :-1].astype("int64")
else:
raise ValueError(
'Please select valid model_type. Options are "bert" or "sklearn"'
)
y_pred = np.array(y_pred_df[labels]).astype("int64")
# Calculate various metrics
model_metrics["exact_accuracy"] = metrics.accuracy_score(y_test, y_pred)
model_metrics["hamming_loss"] = metrics.hamming_loss(y_test, y_pred)
model_metrics["macro_jaccard_score"] = metrics.jaccard_score(
y_test, y_pred, average="macro"
)
y_probs = y_pred_df.filter(like="Probability", axis=1)
model_metrics["macro_roc_auc"] = metrics.roc_auc_score(
y_test, y_probs, multi_class="ovr"
)
model_metrics[
"Label ranking average precision"
] = metrics.label_ranking_average_precision_score(
y_test,
y_probs,
)
# Model summary
if model_type in ("bert", "tf"):
stringlist = []
model.summary(print_fn=lambda x: stringlist.append(x))
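
The new metrics rely on the probability columns that the prediction functions now attach to `y_pred_df`: the one-hot test labels go in as `y_true` and the filtered probability columns as the scores. A minimal sketch of the same scikit-learn calls on made-up toy arrays (not the package's real data):

```python
import numpy as np
from sklearn import metrics

# Toy one-hot encoded test labels for three classes (made-up data).
y_test = np.array([[1, 0, 0], [0, 1, 1], [0, 0, 1], [1, 1, 0]])

# Matching predicted probabilities, one column per label, as recovered from
# the 'Probability of ...' columns of the predictions dataframe.
y_probs = np.array(
    [[0.9, 0.2, 0.1], [0.3, 0.8, 0.7], [0.1, 0.4, 0.9], [0.7, 0.6, 0.2]]
)

# Macro-averaged ROC AUC over the binary indicator targets.
macro_roc_auc = metrics.roc_auc_score(y_test, y_probs, multi_class="ovr")

# Label ranking average precision uses the same probability scores.
lrap = metrics.label_ranking_average_precision_score(y_test, y_probs)

print(macro_roc_auc, lrap)
```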
@@ -218,14 +228,64 @@ def parse_metrics_file(metrics_file, labels):
"precision": [],
"recall": [],
"f1_score": [],
"support": [],
"support (label count in test data)": [],
}
for each in lines:
splitted = each.split(" ")
metrics_dict["label"].append(splitted[0].strip())
metrics_dict["precision"].append(splitted[1].strip())
metrics_dict["recall"].append(splitted[2].strip())
metrics_dict["f1_score"].append(splitted[3].strip())
metrics_dict["support"].append(splitted[4].strip())
metrics_dict["support (label count in test data)"].append(splitted[4].strip())
metrics_df = pd.DataFrame.from_dict(metrics_dict)
return metrics_df


def get_y_score(probs):
"""Converts probabilities into format (n_samples, n_classes) so they can be passed into sklearn roc_auc_score function

Args:
probs (np.ndarray): Probability estimates output by the model

Returns:
np.ndarray: Probability estimates with shape (n_samples, n_classes)
"""
if probs.ndim == 3:
score = np.transpose([pred[:, 1] for pred in probs])
elif probs.ndim == 2:
score = probs
return score
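
The 3-dimensional branch exists because some scikit-learn multilabel estimators return `predict_proba` output as one `(n_samples, 2)` array per label; slicing out the positive-class column and transposing gives the `(n_samples, n_classes)` shape that `roc_auc_score` expects. A hedged sketch with made-up arrays:

```python
import numpy as np

# Hypothetical predict_proba output from a multilabel sklearn estimator:
# one (n_samples, 2) array of [P(negative), P(positive)] per label,
# stacked into shape (n_labels, n_samples, 2) -> here (2, 3, 2).
probs_3d = np.array(
    [
        [[0.9, 0.1], [0.4, 0.6], [0.2, 0.8]],  # label 0
        [[0.3, 0.7], [0.8, 0.2], [0.5, 0.5]],  # label 1
    ]
)

# Keep only the positive-class probability for each label and transpose,
# as the ndim == 3 branch does, giving shape (n_samples, n_labels).
score = np.transpose([pred[:, 1] for pred in probs_3d])
print(score.shape)  # (3, 2)

# A 2-dimensional array (e.g. sigmoid outputs from a Keras model) is
# already (n_samples, n_labels) and is passed through unchanged.
probs_2d = np.array([[0.1, 0.7], [0.6, 0.2], [0.8, 0.5]])
print(probs_2d.shape)  # (3, 2)
```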


def additional_analysis(preds_df, y_true, labels):
"""For given predictions, returns dataframe containing: macro one-vs-one ROC AUC score, number of True Positives, True Negatives, False Positives, and False Negatives.

Args:
preds_df (pd.DataFrame): Dataframe containing predicted labels in one-hot encoded format
y_true (np.array): One-hot encoded real Y values
labels (List): List of the target labels

Returns:
pd.DataFrame: dataframe containing: macro one-vs-one ROC AUC score, number of True Positives, True Negatives, False Positives, and False Negatives.
"""
# include threshold?? (later)
y_score = np.array(preds_df.filter(like="Probability", axis=1))
cm = metrics.multilabel_confusion_matrix(y_true, np.array(preds_df[labels]))
cm_dict = {}
average_precision = {}
for i, label in enumerate(labels):
cm_meaning = {}
tn, fp = cm[i][0]
fn, tp = cm[i][1]
cm_meaning["True Negative"] = tn
cm_meaning["False Negative"] = fn
cm_meaning["True Positive"] = tp
cm_meaning["False Positive"] = fp
cm_dict[label] = cm_meaning
average_precision[label] = metrics.average_precision_score(
y_true[:, i], y_score[:, i]
)
df = pd.DataFrame.from_dict(cm_dict, orient="index")
average_precision = pd.Series(average_precision)
df["average_precision_score"] = average_precision
return df
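
The unpacking of `cm[i]` depends on the layout of scikit-learn's `multilabel_confusion_matrix`, which returns one 2x2 block per label arranged as `[[TN, FP], [FN, TP]]`. A small sketch with toy values illustrating that layout:

```python
import numpy as np
from sklearn import metrics

# Toy one-hot encoded truth and predictions for two labels (made-up values).
y_true = np.array([[1, 0], [1, 1], [0, 1], [0, 0]])
y_pred = np.array([[1, 0], [0, 1], [0, 1], [1, 0]])

cm = metrics.multilabel_confusion_matrix(y_true, y_pred)

# cm[i] is the 2x2 matrix [[TN, FP], [FN, TP]] for label i, which is why
# the first row unpacks to (tn, fp) and the second row to (fn, tp).
for i in range(cm.shape[0]):
    (tn, fp), (fn, tp) = cm[i]
    print(f"label {i}: TN={tn} FP={fp} FN={fn} TP={tp}")
```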
8 changes: 8 additions & 0 deletions pxtextmining/factories/factory_predict_unlabelled_text.py
@@ -84,6 +84,11 @@ def predict_multilabel_sklearn(
predictions[row][label_index] = 1
preds_df = pd.DataFrame(predictions, index=processed_text.index, columns=labels)
preds_df["labels"] = preds_df.apply(get_labels, args=(labels,), axis=1)
# add probs to df
if pred_probs.ndim == 3:
pred_probs = np.transpose([pred[:, 1] for pred in pred_probs])
label_list = ['Probability of "' + label + '"' for label in labels]
preds_df[label_list] = pred_probs
return preds_df


@@ -142,6 +147,9 @@ def predict_multilabel_bert(
predictions = y_binary
preds_df = pd.DataFrame(predictions, index=processed_text.index, columns=labels)
preds_df["labels"] = preds_df.apply(get_labels, args=(labels,), axis=1)
# add probs to df
label_list = ['Probability of "' + label + '"' for label in labels]
preds_df[label_list] = y_probs
return preds_df
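
Both prediction helpers now append one probability column per label, named `Probability of "<label>"`, so that the metrics and analysis code can recover the scores later with `preds_df.filter(like="Probability", axis=1)`. A minimal sketch of that naming scheme with invented labels and scores:

```python
import numpy as np
import pandas as pd

labels = ["Access", "Staff"]  # hypothetical label names
predictions = np.array([[1, 0], [0, 1]])  # one-hot predicted labels
pred_probs = np.array([[0.81, 0.12], [0.35, 0.66]])  # per-label probabilities

preds_df = pd.DataFrame(predictions, columns=labels)

# One probability column per label, named so downstream code can recover
# the scores with preds_df.filter(like="Probability", axis=1).
prob_cols = ['Probability of "' + label + '"' for label in labels]
preds_df[prob_cols] = pred_probs

print(preds_df.filter(like="Probability", axis=1))
```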


73 changes: 44 additions & 29 deletions pxtextmining/factories/factory_write_results.py
@@ -1,17 +1,20 @@
import pickle
import os
import pickle

import numpy as np
import pandas as pd

from tensorflow.keras import Model, Sequential

from pxtextmining.factories.factory_model_performance import (
additional_analysis,
parse_metrics_file,
)
from pxtextmining.factories.factory_predict_unlabelled_text import (
get_labels,
predict_multilabel_sklearn,
predict_multilabel_bert,
get_probabilities,
predict_with_bert
predict_multilabel_bert,
predict_multilabel_sklearn,
)
from pxtextmining.factories.factory_model_performance import parse_metrics_file


def write_multilabel_models_and_metrics(models, model_metrics, path):
@@ -40,7 +43,14 @@ def write_multilabel_models_and_metrics(models, model_metrics, path):


def write_model_preds(
x, y, model, labels, additional_features=True, path="labels.xlsx"
x,
y,
model,
labels,
additional_features=True,
path="labels.xlsx",
enhance_with_rules=False,
return_df=False,
):
"""Writes an Excel file to enable easier analysis of model outputs using the test set. Columns of the Excel file are: comment_id, actual_labels, predicted_labels, actual_label_probs, and predicted_label_probs.

@@ -59,38 +69,33 @@
)
actual_labels.name = "actual_labels"
if isinstance(model, Model) is True:
predicted_labels = predict_multilabel_bert(
preds_df = predict_multilabel_bert(
x,
model,
labels=labels,
additional_features=additional_features,
label_fix=True,
).reset_index()["labels"]
enhance_with_rules=enhance_with_rules,
)

else:
predicted_labels = predict_multilabel_sklearn(
preds_df = predict_multilabel_sklearn(
x,
model,
labels=labels,
additional_features=additional_features,
label_fix=True,
enhance_with_probs=True,
).reset_index()["labels"]
enhance_with_rules=enhance_with_rules,
)
predicted_labels = preds_df.reset_index()["labels"]
predicted_labels.name = "predicted_labels"
df = x.reset_index()
probabilities = np.array(preds_df.filter(like="Probability", axis=1))
if isinstance(model, Model) is True:
probabilities = predict_with_bert(
x,
model,
max_length=150,
additional_features=additional_features,
already_encoded=False,
)
else:
probabilities = np.array(model.predict_proba(x))
if isinstance(model, Model) is True:
model_type = 'bert'
model_type = "bert"
else:
model_type = 'sklearn'
model_type = "sklearn"
probs_actual = get_probabilities(
actual_labels, labels, probabilities, model_type=model_type
)
@@ -102,13 +107,18 @@
df = df.merge(probs_actual, left_index=True, right_index=True)
df = df.merge(probs_predicted, left_index=True, right_index=True)
# Deal with any rogue characters
df.applymap(lambda x: x.encode('unicode_escape').
decode('utf-8') if isinstance(x, str) else x)
df.applymap(
lambda x: x.encode("unicode_escape").decode("utf-8")
if isinstance(x, str)
else x
)
df.to_excel(path, index=False)
print(f"Successfully completed, written to {path}")
if return_df is True:
return preds_df


def write_model_analysis(model_name, labels, dataset, path):
def write_model_analysis(model_name, labels, dataset, path, preds_df=None, y_true=None):
"""Writes an Excel file with the performance metrics of each label, as well as the counts of samples for each label.

Args:
@@ -120,6 +130,11 @@
metrics_df = parse_metrics_file(f"{path}/{model_name}.txt", labels=labels)
label_counts = pd.DataFrame(dataset[labels].sum())
label_counts = label_counts.reset_index()
label_counts = label_counts.rename(columns={"index": "label", 0: "label_count"})
metrics_df = metrics_df.merge(label_counts, on="label")
metrics_df.to_excel(f"{path}/{model_name}_perf.xlsx", index=False)
label_counts = label_counts.rename(
columns={"index": "label", 0: "label_count_in_full_dataset"}
)
metrics_df = metrics_df.merge(label_counts, on="label").set_index("label")
if preds_df is not None and y_true is not None:
more_metrics = additional_analysis(preds_df, y_true, labels)
metrics_df = pd.concat([metrics_df, more_metrics], axis=1)
metrics_df.to_excel(f"{path}/{model_name}_perf.xlsx", index=True)
52 changes: 38 additions & 14 deletions pxtextmining/pipelines/multilabel_pipeline.py
@@ -98,15 +98,23 @@ def run_sklearn_pipeline(
if include_analysis is True:
for i in range(len(models)):
model_name = f"model_{i}"
write_model_preds(
preds_df = write_model_preds(
X_test,
Y_test,
models[i],
labels=target,
additional_features=additional_features,
path=f"{path}/{model_name}_labels.xlsx",
return_df=True,
)
write_model_analysis(
model_name,
labels=target,
dataset=df,
path=path,
preds_df=preds_df,
y_true=Y_test,
)
write_model_analysis(model_name, labels=target, dataset=df, path=path)
print("Pipeline complete")


@@ -157,15 +165,23 @@ def run_svc_pipeline(
)
write_multilabel_models_and_metrics([model], [model_metrics], path=path)
if include_analysis is True:
write_model_preds(
preds_df = write_model_preds(
X_test,
Y_test,
model,
labels=target,
additional_features=additional_features,
path=f"{path}/labels.xlsx",
return_df=True,
)
write_model_analysis(
model_name="model_0",
labels=target,
dataset=df,
path=path,
preds_df=preds_df,
y_true=Y_test,
)
write_model_analysis(model_name="model_0", labels=target, dataset=df, path=path)
print("Pipeline complete!")


@@ -237,15 +253,23 @@ def run_bert_pipeline(
)
write_multilabel_models_and_metrics([model_trained], [model_metrics], path=path)
if include_analysis is True:
write_model_preds(
preds_df = write_model_preds(
X_test,
Y_test,
model,
labels=target,
additional_features=additional_features,
path=f"{path}/labels.xlsx",
return_df=True,
)
write_model_analysis(
model_name="model_0",
labels=target,
dataset=df,
path=path,
preds_df=preds_df,
y_true=Y_test,
)
write_model_analysis(model_name="model_0", labels=target, dataset=df, path=path)
print("Pipeline complete!")


@@ -346,18 +370,18 @@ def run_two_layer_sklearn_pipeline(
# path="test_multilabel/v6_230724/svc_nofeats",
# include_analysis=True,
# )
# run_svc_pipeline(
# additional_features=True,
# target=minor_cats,
# path="test_multilabel/v6_230724/svc",
# include_analysis=True,
# )
run_bert_pipeline(
run_svc_pipeline(
additional_features=True,
path="test_multilabel/v6_230724/bert",
target=minor_cats,
path="test_multilabel/test_roc/svc",
include_analysis=True,
)
# run_bert_pipeline(
# additional_features=True,
# path="test_multilabel/v6_230724/bert",
# target=minor_cats,
# include_analysis=True,
# )
# run_sklearn_pipeline(
# additional_features=True,
# target=minor_cats,
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -17,7 +17,7 @@ matplotlib = "^3.3.2"
numpy = ">=1.22"
pandas = "^1.4.0"
scikit-learn = "1.0.2"
tensorflow = "^2.11.0"
tensorflow = "2.12.0"
transformers = "^4.26.1"
scipy = "^1.10.1"
xgboost = "^1.7.5"
8 changes: 5 additions & 3 deletions tests/test_api.py
@@ -1,7 +1,9 @@
from fastapi.testclient import TestClient
from api.api import app
from unittest.mock import AsyncMock, Mock, patch

import numpy as np
from fastapi.testclient import TestClient

from api.api import app

client = TestClient(app)

@@ -76,4 +78,4 @@ def test_sentiment_predictions():
]
response = client.post("/predict_sentiment", json=test_json).json()
assert len(test_json) == len(response)
assert type(response[0]["sentiment"]) == int
assert isinstance(response[0]["sentiment"], int) is True