diff --git a/poetry.lock b/poetry.lock
index cad3715..e7dd46d 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -2819,7 +2819,7 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more
 [metadata]
 lock-version = "1.1"
 python-versions = ">=3.8, <3.11"
-content-hash = "af285a24507a1a3688f515a97d1bcde4d76a818869df18c1498c27a970f71fc9"
+content-hash = "81ef676376f6ce7390e5c720a4a5cbfb626038a6544bbf5d78d1a11617263e31"
 
 [metadata.files]
 absl-py = [
diff --git a/pxtextmining/factories/factory_model_performance.py b/pxtextmining/factories/factory_model_performance.py
index fc8bbfc..9737acb 100644
--- a/pxtextmining/factories/factory_model_performance.py
+++ b/pxtextmining/factories/factory_model_performance.py
@@ -132,7 +132,6 @@ def get_multilabel_metrics(
             enhance_with_rules=enhance_with_rules,
             already_encoded=already_encoded,
         )
-        y_pred = np.array(y_pred_df)[:, :-1].astype("int64")
     elif model_type == "sklearn":
         y_pred_df = predict_multilabel_sklearn(
             x_test,
@@ -143,17 +142,28 @@
             enhance_with_probs=True,
             enhance_with_rules=enhance_with_rules,
         )
-        y_pred = np.array(y_pred_df)[:, :-1].astype("int64")
     else:
         raise ValueError(
            'Please select valid model_type. Options are "bert" or "sklearn"'
         )
+    y_pred = np.array(y_pred_df[labels]).astype("int64")
     # Calculate various metrics
     model_metrics["exact_accuracy"] = metrics.accuracy_score(y_test, y_pred)
     model_metrics["hamming_loss"] = metrics.hamming_loss(y_test, y_pred)
     model_metrics["macro_jaccard_score"] = metrics.jaccard_score(
         y_test, y_pred, average="macro"
     )
+    y_probs = y_pred_df.filter(like="Probability", axis=1)
+    model_metrics["macro_roc_auc"] = metrics.roc_auc_score(
+        y_test, y_probs, multi_class="ovr"
+    )
+    model_metrics[
+        "Label ranking average precision"
+    ] = metrics.label_ranking_average_precision_score(
+        y_test,
+        y_probs,
+    )
+    # Model summary
     if model_type in ("bert", "tf"):
         stringlist = []
         model.summary(print_fn=lambda x: stringlist.append(x))
@@ -218,7 +228,7 @@ def parse_metrics_file(metrics_file, labels):
         "precision": [],
         "recall": [],
         "f1_score": [],
-        "support": [],
+        "support (label count in test data)": [],
     }
     for each in lines:
         splitted = each.split(" ")
@@ -226,6 +236,56 @@
         metrics_dict["precision"].append(splitted[1].strip())
         metrics_dict["recall"].append(splitted[2].strip())
         metrics_dict["f1_score"].append(splitted[3].strip())
-        metrics_dict["support"].append(splitted[4].strip())
+        metrics_dict["support (label count in test data)"].append(splitted[4].strip())
     metrics_df = pd.DataFrame.from_dict(metrics_dict)
     return metrics_df
+
+
+def get_y_score(probs):
+    """Converts probabilities into format (n_samples, n_classes) so they can be passed into sklearn roc_auc_score function
+
+    Args:
+        probs (np.ndarray): Probability estimates outputted by model
+
+    Returns:
+        np.ndarray: Probability estimates in format (n_samples, n_classes)
+    """
+    if probs.ndim == 3:
+        score = np.transpose([pred[:, 1] for pred in probs])
+    elif probs.ndim == 2:
+        score = probs
+    return score
+
+
+def additional_analysis(preds_df, y_true, labels):
+    """For given predictions, returns a dataframe containing the average precision score and the number of True Positives, True Negatives, False Positives, and False Negatives for each label.
+
+    Args:
+        preds_df (pd.DataFrame): Dataframe containing predicted labels in one-hot encoded format
+        y_true (np.array): One-hot encoded real Y values
+        labels (List): List of the target labels
+
+    Returns:
+        pd.DataFrame: dataframe containing the average precision score and the number of True Positives, True Negatives, False Positives, and False Negatives for each label.
+    """
+    # include threshold?? (later)
+    y_score = np.array(preds_df.filter(like="Probability", axis=1))
+    cm = metrics.multilabel_confusion_matrix(y_true, np.array(preds_df[labels]))
+    cm_dict = {}
+    average_precision = {}
+    for i, label in enumerate(labels):
+        cm_meaning = {}
+        tn, fp = cm[i][0]
+        fn, tp = cm[i][1]
+        cm_meaning["True Negative"] = tn
+        cm_meaning["False Negative"] = fn
+        cm_meaning["True Positive"] = tp
+        cm_meaning["False Positive"] = fp
+        cm_dict[label] = cm_meaning
+        average_precision[label] = metrics.average_precision_score(
+            y_true[:, i], y_score[:, i]
+        )
+    df = pd.DataFrame.from_dict(cm_dict, orient="index")
+    average_precision = pd.Series(average_precision)
+    df["average_precision_score"] = average_precision
+    return df
diff --git a/pxtextmining/factories/factory_predict_unlabelled_text.py b/pxtextmining/factories/factory_predict_unlabelled_text.py
index 58d566c..ca95e7f 100644
--- a/pxtextmining/factories/factory_predict_unlabelled_text.py
+++ b/pxtextmining/factories/factory_predict_unlabelled_text.py
@@ -84,6 +84,11 @@ def predict_multilabel_sklearn(
             predictions[row][label_index] = 1
     preds_df = pd.DataFrame(predictions, index=processed_text.index, columns=labels)
     preds_df["labels"] = preds_df.apply(get_labels, args=(labels,), axis=1)
+    # add probs to df
+    if pred_probs.ndim == 3:
+        pred_probs = np.transpose([pred[:, 1] for pred in pred_probs])
+    label_list = ['Probability of "' + label + '"' for label in labels]
+    preds_df[label_list] = pred_probs
     return preds_df
 
 
@@ -142,6 +147,9 @@ def predict_multilabel_bert(
         predictions = y_binary
     preds_df = pd.DataFrame(predictions, index=processed_text.index, columns=labels)
     preds_df["labels"] = preds_df.apply(get_labels, args=(labels,), axis=1)
+    # add probs to df
+    label_list = ['Probability of "' + label + '"' for label in labels]
+    preds_df[label_list] = y_probs
     return preds_df
 
 
diff --git a/pxtextmining/factories/factory_write_results.py b/pxtextmining/factories/factory_write_results.py
index baa63ed..a1c4815 100644
--- a/pxtextmining/factories/factory_write_results.py
+++ b/pxtextmining/factories/factory_write_results.py
@@ -1,17 +1,20 @@
-import pickle
 import os
+import pickle
+
 import numpy as np
 import pandas as pd
-
 from tensorflow.keras import Model, Sequential
+
+from pxtextmining.factories.factory_model_performance import (
+    additional_analysis,
+    parse_metrics_file,
+)
 from pxtextmining.factories.factory_predict_unlabelled_text import (
     get_labels,
-    predict_multilabel_sklearn,
-    predict_multilabel_bert,
     get_probabilities,
-    predict_with_bert
+    predict_multilabel_bert,
+    predict_multilabel_sklearn,
 )
-from pxtextmining.factories.factory_model_performance import parse_metrics_file
 
 
 def write_multilabel_models_and_metrics(models, model_metrics, path):
@@ -40,7 +43,14 @@
 def write_model_preds(
-    x, y, model, labels, additional_features=True, path="labels.xlsx"
+    x,
+    y,
+    model,
+    labels,
+    additional_features=True,
+    path="labels.xlsx",
+    enhance_with_rules=False,
+    return_df=False,
 ):
     """Writes an Excel file to enable easier analysis of model outputs using the test set.
     Columns of the Excel file are: comment_id, actual_labels, predicted_labels, actual_label_probs, and predicted_label_probs.
@@ -59,38 +69,33 @@
     )
     actual_labels.name = "actual_labels"
     if isinstance(model, Model) is True:
-        predicted_labels = predict_multilabel_bert(
+        preds_df = predict_multilabel_bert(
             x,
             model,
             labels=labels,
             additional_features=additional_features,
             label_fix=True,
-        ).reset_index()["labels"]
+            enhance_with_rules=enhance_with_rules,
+        )
+
     else:
-        predicted_labels = predict_multilabel_sklearn(
+        preds_df = predict_multilabel_sklearn(
             x,
             model,
             labels=labels,
             additional_features=additional_features,
             label_fix=True,
             enhance_with_probs=True,
-        ).reset_index()["labels"]
+            enhance_with_rules=enhance_with_rules,
+        )
+    predicted_labels = preds_df.reset_index()["labels"]
     predicted_labels.name = "predicted_labels"
     df = x.reset_index()
+    probabilities = np.array(preds_df.filter(like="Probability", axis=1))
     if isinstance(model, Model) is True:
-        probabilities = predict_with_bert(
-            x,
-            model,
-            max_length=150,
-            additional_features=additional_features,
-            already_encoded=False,
-        )
-    else:
-        probabilities = np.array(model.predict_proba(x))
-    if isinstance(model, Model) is True:
-        model_type = 'bert'
+        model_type = "bert"
     else:
-        model_type = 'sklearn'
+        model_type = "sklearn"
     probs_actual = get_probabilities(
         actual_labels, labels, probabilities, model_type=model_type
     )
@@ -102,13 +107,18 @@
     df = df.merge(probs_actual, left_index=True, right_index=True)
     df = df.merge(probs_predicted, left_index=True, right_index=True)
     # Deal with any rogue characters
-    df.applymap(lambda x: x.encode('unicode_escape').
-                decode('utf-8') if isinstance(x, str) else x)
+    df.applymap(
+        lambda x: x.encode("unicode_escape").decode("utf-8")
+        if isinstance(x, str)
+        else x
+    )
     df.to_excel(path, index=False)
     print(f"Successfully completed, written to {path}")
+    if return_df is True:
+        return preds_df
 
 
-def write_model_analysis(model_name, labels, dataset, path):
+def write_model_analysis(model_name, labels, dataset, path, preds_df=None, y_true=None):
     """Writes an Excel file with the performance metrics of each label, as well as the counts of samples for each label.
 
     Args:
@@ -120,6 +130,11 @@
     metrics_df = parse_metrics_file(f"{path}/{model_name}.txt", labels=labels)
     label_counts = pd.DataFrame(dataset[labels].sum())
     label_counts = label_counts.reset_index()
-    label_counts = label_counts.rename(columns={"index": "label", 0: "label_count"})
-    metrics_df = metrics_df.merge(label_counts, on="label")
-    metrics_df.to_excel(f"{path}/{model_name}_perf.xlsx", index=False)
+    label_counts = label_counts.rename(
+        columns={"index": "label", 0: "label_count_in_full_dataset"}
+    )
+    metrics_df = metrics_df.merge(label_counts, on="label").set_index("label")
+    if preds_df is not None and y_true is not None:
+        more_metrics = additional_analysis(preds_df, y_true, labels)
+        metrics_df = pd.concat([metrics_df, more_metrics], axis=1)
+    metrics_df.to_excel(f"{path}/{model_name}_perf.xlsx", index=True)
diff --git a/pxtextmining/pipelines/multilabel_pipeline.py b/pxtextmining/pipelines/multilabel_pipeline.py
index fdb136c..43bcd55 100644
--- a/pxtextmining/pipelines/multilabel_pipeline.py
+++ b/pxtextmining/pipelines/multilabel_pipeline.py
@@ -98,15 +98,23 @@ def run_sklearn_pipeline(
     if include_analysis is True:
         for i in range(len(models)):
             model_name = f"model_{i}"
-            write_model_preds(
+            preds_df = write_model_preds(
                 X_test,
                 Y_test,
                 models[i],
                 labels=target,
                 additional_features=additional_features,
                 path=f"{path}/{model_name}_labels.xlsx",
+                return_df=True,
+            )
+            write_model_analysis(
+                model_name,
+                labels=target,
+                dataset=df,
+                path=path,
+                preds_df=preds_df,
+                y_true=Y_test,
             )
-            write_model_analysis(model_name, labels=target, dataset=df, path=path)
     print("Pipeline complete")
 
 
@@ -157,15 +165,23 @@ def run_svc_pipeline(
     )
     write_multilabel_models_and_metrics([model], [model_metrics], path=path)
     if include_analysis is True:
-        write_model_preds(
+        preds_df = write_model_preds(
            X_test,
            Y_test,
            model,
            labels=target,
            additional_features=additional_features,
            path=f"{path}/labels.xlsx",
+           return_df=True,
+        )
+        write_model_analysis(
+            model_name="model_0",
+            labels=target,
+            dataset=df,
+            path=path,
+            preds_df=preds_df,
+            y_true=Y_test,
         )
-        write_model_analysis(model_name="model_0", labels=target, dataset=df, path=path)
     print("Pipeline complete!")
 
 
@@ -237,15 +253,23 @@ def run_bert_pipeline(
     )
     write_multilabel_models_and_metrics([model_trained], [model_metrics], path=path)
     if include_analysis is True:
-        write_model_preds(
+        preds_df = write_model_preds(
            X_test,
            Y_test,
            model,
            labels=target,
            additional_features=additional_features,
            path=f"{path}/labels.xlsx",
+           return_df=True,
+        )
+        write_model_analysis(
+            model_name="model_0",
+            labels=target,
+            dataset=df,
+            path=path,
+            preds_df=preds_df,
+            y_true=Y_test,
         )
-        write_model_analysis(model_name="model_0", labels=target, dataset=df, path=path)
     print("Pipeline complete!")
 
 
@@ -346,18 +370,18 @@ def run_two_layer_sklearn_pipeline(
     #     path="test_multilabel/v6_230724/svc_nofeats",
     #     include_analysis=True,
     # )
-    # run_svc_pipeline(
-    #     additional_features=True,
-    #     target=minor_cats,
-    #     path="test_multilabel/v6_230724/svc",
-    #     include_analysis=True,
-    # )
-    run_bert_pipeline(
+    run_svc_pipeline(
         additional_features=True,
-        path="test_multilabel/v6_230724/bert",
         target=minor_cats,
+        path="test_multilabel/test_roc/svc",
         include_analysis=True,
     )
+    # run_bert_pipeline(
+    #     additional_features=True,
+    #     path="test_multilabel/v6_230724/bert",
+    #     target=minor_cats,
+    #     include_analysis=True,
+    # )
     # run_sklearn_pipeline(
     #     additional_features=True,
     #     target=minor_cats,
diff --git a/pyproject.toml b/pyproject.toml
index d95e329..baa8f2d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,7 +17,7 @@ matplotlib = "^3.3.2"
 numpy = ">=1.22"
 pandas = "^1.4.0"
 scikit-learn = "1.0.2"
-tensorflow = "^2.11.0"
+tensorflow = "2.12.0"
 transformers = "^4.26.1"
 scipy = "^1.10.1"
 xgboost = "^1.7.5"
diff --git a/tests/test_api.py b/tests/test_api.py
index 1420bef..c103894 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -1,7 +1,9 @@
-from fastapi.testclient import TestClient
-from api.api import app
 from unittest.mock import AsyncMock, Mock, patch
+
 import numpy as np
+from fastapi.testclient import TestClient
+
+from api.api import app
 
 client = TestClient(app)
 
@@ -76,4 +78,4 @@ def test_sentiment_predictions():
     ]
     response = client.post("/predict_sentiment", json=test_json).json()
     assert len(test_json) == len(response)
-    assert type(response[0]["sentiment"]) == int
+    assert isinstance(response[0]["sentiment"], int) is True
diff --git a/tests/test_factory_pipeline.py b/tests/test_factory_pipeline.py
index 3c8c223..ff2b35c 100644
--- a/tests/test_factory_pipeline.py
+++ b/tests/test_factory_pipeline.py
@@ -15,7 +15,7 @@ def test_create_sklearn_pipeline_sentiment(model_type, additional_features):
     pipe, params = factory_pipeline.create_sklearn_pipeline_sentiment(
         model_type, 3, tokenizer=None, additional_features=additional_features
     )
-    assert type(params) == dict
+    assert isinstance(params, dict) is True
     assert is_classifier(pipe) is True
 
 
@@ -23,7 +23,7 @@
 def test_create_bert_model(multilabel):
     Y_train = np.array([[0, 1, 0], [1, 0, 0], [0, 0, 1]])
     model = factory_pipeline.create_bert_model(Y_train, multilabel=multilabel)
-    assert type(model) == Functional
+    assert isinstance(model, Functional) is True
 
 
 @pytest.mark.parametrize("multilabel", [True, False])
@@ -32,7 +32,7 @@
 def test_create_bert_model_additional_features(multilabel):
     Y_train = np.array([[0, 1, 0], [1, 0, 0], [0, 0, 1]])
     model = factory_pipeline.create_bert_model_additional_features(
         Y_train, multilabel=multilabel
     )
-    assert type(model) == Functional
+    assert isinstance(model, Functional) is True
 
 
 def test_train_bert_model():
@@ -43,7 +43,7 @@
         train_dataset, test_dataset, model
     )
     model.fit.assert_called_once()
-    assert type(training_time) == str
+    assert isinstance(training_time, str) is True
 
 
 def test_calculating_class_weights():
@@ -51,7 +51,7 @@
         [[0, 1, 0], [1, 0, 0], [0, 0, 1], [0, 1, 0], [1, 0, 0], [1, 0, 0]]
     )
     class_weights_dict = factory_pipeline.calculating_class_weights(Y_train)
-    assert type(class_weights_dict) == dict
+    assert isinstance(class_weights_dict, dict) is True
 
 
 @pytest.mark.parametrize("model_type", ["svm", "xgb", "rfc", "mnb", "knn"])
@@ -62,7 +62,7 @@
         model_type, tokenizer, additional_features
     )
     assert is_classifier(pipe) is True
-    assert type(params) == dict
+    assert isinstance(params, dict) is True
 
 
 @pytest.mark.parametrize("target", ["sentiment", None])
diff --git a/tests/test_helpers.py b/tests/test_helpers.py
index 55edd9c..f1cdf88 100644
--- a/tests/test_helpers.py
+++ b/tests/test_helpers.py
@@ -1,13 +1,16 @@
-from pxtextmining.helpers.text_preprocessor import tf_preprocessing
 # from pxtextmining.helpers.tokenization import spacy_tokenizer
 import numpy as np
 
+from pxtextmining.helpers.text_preprocessor import tf_preprocessing
+
+
 def test_text_preprocessor(grab_test_X_additional_feats):
-    data = grab_test_X_additional_feats['FFT answer']
+    data = grab_test_X_additional_feats["FFT answer"]
     X_pad, vocab_size = tf_preprocessing(data)
-    assert type(X_pad) == np.ndarray
+    assert isinstance(X_pad, np.ndarray) is True
     assert len(X_pad) == data.shape[0]
-    assert type(vocab_size) == int
+    assert isinstance(vocab_size, int) is True
+
 
 # def test_spacy_tokenizer():
 #     document = 'This is some incredibly interesting text'
diff --git a/tests/test_model_performance.py b/tests/test_model_performance.py
index c842788..194d280 100644
--- a/tests/test_model_performance.py
+++ b/tests/test_model_performance.py
@@ -78,7 +78,7 @@ def test_multiclass_metrics_sklearn(grab_test_X_additional_feats):
     metrics_string = factory_model_performance.get_multiclass_metrics(
         x, y, labels, random_state, model, additional_features
     )
-    assert type(metrics_string) == str
+    assert isinstance(metrics_string, str) is True
 
 
 def test_multiclass_metrics_bert(
@@ -101,7 +101,7 @@
     metrics_string = factory_model_performance.get_multiclass_metrics(
         x, y, labels, random_state, model, additional_features
     )
-    assert type(metrics_string) == str
+    assert isinstance(metrics_string, str) is True
 
 
 def test_multilabel_metrics_sklearn(grab_test_X_additional_feats):
@@ -129,7 +129,7 @@
         model,
         additional_features=additional_features,
     )
-    assert type(metrics_string) == str
+    assert isinstance(metrics_string, str) is True
 
 
 def test_multilabel_metrics_bert(
@@ -159,7 +159,7 @@
         model,
         additional_features=additional_features,
     )
-    assert type(metrics_string) == str
+    assert isinstance(metrics_string, str) is True
 
 
 def test_accuracy_per_class():
diff --git a/tests/test_predict_unlabelled_text.py b/tests/test_predict_unlabelled_text.py
index 379e654..4332496 100644
--- a/tests/test_predict_unlabelled_text.py
+++ b/tests/test_predict_unlabelled_text.py
@@ -1,8 +1,10 @@
-from pxtextmining.factories import factory_predict_unlabelled_text
-import pandas as pd
-import numpy as np
 from unittest.mock import Mock
+
+import numpy as np
+import pandas as pd
+
+from pxtextmining.factories import factory_predict_unlabelled_text
+
 
 def test_get_probabilities_bert():
     label_series = pd.Series([["label_one"], ["label_two", "label_three"]], name="test")
@@ -65,11 +67,25 @@ def test_predict_multilabel_sklearn():
         ]
     ).set_index("Comment ID")
     predictions = np.array([[0, 1, 0], [1, 0, 1], [0, 0, 1]])
-    predicted_probs = [
-        [[0.80465788, 0.19534212], [0.94292979, 0.05707021], [0.33439024, 0.66560976]],
-        [[0.33439024, 0.66560976], [0.9949298, 0.0050702], [0.99459238, 0.00540762]],
-        [[0.97472981, 0.02527019], [0.25069129, 0.74930871], [0.33439024, 0.66560976]],
-    ]
+    predicted_probs = np.array(
+        [
+            [
+                [0.80465788, 0.19534212],
+                [0.94292979, 0.05707021],
+                [0.33439024, 0.66560976],
+            ],
+            [
+                [0.33439024, 0.66560976],
+                [0.9949298, 0.0050702],
+                [0.99459238, 0.00540762],
+            ],
+            [
+                [0.97472981, 0.02527019],
+                [0.25069129, 0.74930871],
+                [0.33439024, 0.66560976],
+            ],
+        ]
+    )
     labels = ["first", "second", "third"]
     model = Mock(
         predict=Mock(return_value=predictions),
@@ -78,7 +94,8 @@
     preds_df = factory_predict_unlabelled_text.predict_multilabel_sklearn(
         data, model, labels=labels, additional_features=True
     )
-    assert preds_df.shape == (3, 4)
+    cols = len(labels) * 2 + 1
+    assert preds_df.shape == (3, cols)
 
 
 def test_predict_multilabel_sklearn_additional_params(grab_test_X_additional_feats):
@@ -94,7 +111,8 @@ def test_predict_multilabel_sklearn_additional_params(grab_test_X_additional_fea
         label_fix=False,
         enhance_with_probs=False,
     )
-    assert preds_df.shape == (3, 4)
+    cols = len(labels) * 2 + 1
+    assert preds_df.shape == (3, cols)
 
 
 def test_predict_multilabel_bert():
@@ -140,7 +158,8 @@ def test_predict_multilabel_bert():
     preds_df = factory_predict_unlabelled_text.predict_multilabel_bert(
         data, model, labels=labels, additional_features=True
     )
-    assert preds_df.shape == (4, 6)
+    cols = len(labels) * 2 + 1
+    assert preds_df.shape == (4, cols)
 
 
 def test_predict_sentiment_bert():
diff --git a/tests/test_write_results.py b/tests/test_write_results.py
index 390651f..b4a1532 100644
--- a/tests/test_write_results.py
+++ b/tests/test_write_results.py
@@ -1,8 +1,10 @@
-from pxtextmining.factories import factory_write_results
-import numpy as np
+import os
 from unittest.mock import Mock, mock_open, patch
+
+import numpy as np
 from tensorflow.keras import Model
-import os
+
+from pxtextmining.factories import factory_write_results
 
 
 @patch("pickle.dump", Mock())
@@ -43,9 +45,10 @@ def test_write_model_preds_sklearn(mock_toexcel, grab_test_X_additional_feats):
     # act
     factory_write_results.write_model_preds(x, y, mock_model, labels, path=path)
     # assert
-    mock_model.predict_proba.assert_called_with(x)
+    mock_model.predict_proba.assert_called()
     mock_toexcel.assert_called()
 
+
 @patch("pxtextmining.factories.factory_write_results.pd.DataFrame.to_excel")
 def test_write_model_preds_bert(mock_toexcel, grab_test_X_additional_feats):
     # arrange
@@ -57,10 +60,8 @@
             [9.8868138e-01, 1.9990385e-03, 5.4453085e-03],
             [5.6546849e-01, 4.2310607e-01, 9.3136989e-03],
         ]
-        )
-    mock_model = Mock(spec=Model,
-        predict=Mock(return_value=predicted_probs)
     )
+    mock_model = Mock(spec=Model, predict=Mock(return_value=predicted_probs))
     labels = ["A", "B", "C"]
     path = "somepath.xlsx"
     # act