From 3b906ea97e8a88a539e505936960f94d86b3a20f Mon Sep 17 00:00:00 2001 From: LemonadeXyz <62137944+LemonadeXyz@users.noreply.github.com> Date: Fri, 20 Aug 2021 21:53:36 +0800 Subject: [PATCH] Add files via upload 20210820 by Y.Wang --- README.md | 52 +++++++- main.py | 248 ++++++++++++++++++++++++++++++++++++ my_kappa_calculator.py | 18 +++ quadratic_weighted_kappa.py | 229 +++++++++++++++++++++++++++++++++ stepwise_selection.py | 87 +++++++++++++ 5 files changed, 632 insertions(+), 2 deletions(-) create mode 100644 main.py create mode 100644 my_kappa_calculator.py create mode 100644 quadratic_weighted_kappa.py create mode 100644 stepwise_selection.py diff --git a/README.md b/README.md index 76f4b08..28a3f73 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,50 @@ -# L2C-rater -Automated Essay Scoring Method for Chinese Second Language Writing +# L2C-rater (Automated Essay Scoring Method for Chinese Second Language Writing) + +An automatic essay scoring method for Chinese L2 writing based on linear models, tree-structure models and logistic regression models, especially Ordinal Logistic Regression(OLR), combining 90 linguistic features and Tf-Idf textual representations. + +### Enviroment +Python 3.6+ + +### Main Package +numpy 1.19.5 +pandas 1.1.5 +scipy 1.5.2 +sci-kit-learn 0.24.1 +pyltp 0.1.9.1 +jieba +xgboost +mord + +### Set Up ### + +* Configure the environment +* Prepare data +* Run main.py + +### Dataset ### + +We conduct 5-fold cross validation on HSK Dynamic Composition Corpus to evaluate our system. This dataset (this code can be run with part of the data in [here]()) is publicly available [here](http://hsk.blcu.edu.cn/). + +### Options + +You can see the list of available options by running: + +```bash +python main.py -h +``` + +For ```feature_mode```, ```t``` means using **text representation** only when training the model, ```l``` means using **linguistics features** only, and ```b``` means using both the above features. + +For ```feature_type```, ```c``` means segmenting the essay to **character** level when generating text representation, ```w``` means segmenting the essay to **word** level, ```cw``` means using both the above, ```wp``` means segmenting the essay to **word** level and tagging their **part-of-speech(POS)**, and ```cwp``` means using both ```c``` and ```wp```. + +### Example ### + +The following command trains the best model among all automated models. Note that you will not get a convincing result with data provided [here](), because the amount of data is not large enough. + +```bash +python main.py -m b -t wp +``` + +### Publication ### + +Wang Y., Hu R. (2021) A Prompt-Independent and Interpretable Automated Essay Scoring Method for Chinese Second Language Writing. In: Li S. et al. (eds) Chinese Computational Linguistics. CCL 2021. Lecture Notes in Computer Science, vol 12869. Springer, Cham. https://doi.org/10.1007/978-3-030-84186-7_30 diff --git a/main.py b/main.py new file mode 100644 index 0000000..f59503e --- /dev/null +++ b/main.py @@ -0,0 +1,248 @@ +import argparse +import time +import logging +import numpy as np +import pandas as pd +import scipy.stats +import xgboost as xgb +from sklearn.utils import shuffle +from sklearn.preprocessing import MinMaxScaler +from sklearn.linear_model import LinearRegression, Lasso, Ridge, LogisticRegression +from sklearn.ensemble import RandomForestRegressor +from mord import LogisticAT, OrdinalRidge +from sklearn.metrics import mean_squared_error, confusion_matrix +from utils.text import * +from utils.text_repre import * +from linguistic import getLinguisticIndices +from stepwise_selection import * +from my_kappa_calculator import quadratic_weighted_kappa as qwk + +# set logger +logging.basicConfig(filename='hsk.log', level=logging.INFO) +logger = logging.getLogger(__name__) + +# set argparse +parser = argparse.ArgumentParser() +parser.add_argument("-mp", "--modelpath", dest="model_path", type=str, metavar='', default='./ltp_models', + help="the model directory") +parser.add_argument("-dp", "--datapath", dest="data_path", type=str, metavar='', default='./data', + help="the input directory") +parser.add_argument("-m", "--feature_mode", dest="feature_mode", type=str, metavar='', default='l', + help="feature mode (t|l|b)(default=l)") + +parser.add_argument("-t", "--feature_type", dest="feature_type", type=str, metavar='', default='c', + help="feature type (c|w|cw|wp|cwp) (default=c)") + +parser.add_argument("--max_depth", dest="max_depth", type=int, metavar='', default=40, + help="max_depth of random forest regressior") + +parser.add_argument("--maxd_xgb", dest="maxd_xgb", type=int, metavar='', default=3, + help="max_depth of XGBRegressor") +parser.add_argument("-lr", dest="learning_rate", type=float, metavar='', default=0.05, + help="learning_rate of XGBRegressor") +parser.add_argument("--min_cw", dest="min_child_weight", type=int, metavar='', default=5, + help="min_child_weight of XGBRegressor") +parser.add_argument("--n_estimators", dest="n_estimators", type=int, metavar='', default=300, + help="n_estimators of XGBRegressor") +parser.add_argument("-gm", dest="gamma", type=int, metavar='', default=5, + help="gamma of XBClassifier") + +parser.add_argument("--olr_penalty", dest="olr_penalty", type=int, metavar='', default=1., + help="regular parameter for OLR. Zero value means no regularization") +parser.add_argument("--rdg_penalty", dest="rdg_penalty", type=int, metavar='', default=1., + help="regular parameter for Ridge") + +parser.add_argument("--ngram_max", dest="max_n", type=int, metavar='', default=1, + help="upper bound of ngram range") +parser.add_argument("--ngram_min", dest="min_n", type=int, metavar='', default=1, + help="lower bound of ngram range") +parser.add_argument("--df_threshold", dest="min_df", type=int, metavar='', default=10, + help="document frequency when building vocabulary") +args = parser.parse_args() + +model_path = args.model_path +data_path = args.data_path +feature_mode = args.feature_mode +feature_type = args.feature_type +max_depth = args.max_depth +maxd_xgb = args.maxd_xgb +n_estimators = args.n_estimators +lr = args.learning_rate +min_child_weight = args.min_child_weight +gamma = args.gamma +olr_penalty = args.olr_penalty +rdg_penalty = args.rdg_penalty +max_n = args.max_n +min_n = args.min_n +min_df = args.min_df + +logger.info("feature_mode: " + str(feature_mode)) +logger.info("feature_type: " + str(feature_type)) +logger.info("n-grams range: [ %s, %s]" % (str(min_n), str(max_n))) +logger.info("term frequency threshold: " + str(min_df)) + +t1 = time.time() + +# Generating Linguistics Features +index_data = {} +corpus = [] +df_data = pd.read_csv(data_path + '/essay_revised.csv') + +# shuffle +# df_data = shuffle(df_data) +# df_data = df_data.reset_index(drop=True) + +segmentor, postagger, parser = load_ltpmodel(model_path) + +for index, row in df_data.iterrows(): + essay_id, essay_text, essay_score = row['essay_ID'], row['ESSAY'], row['SCORE'] + text_dict = text_process(essay_text, segmentor, postagger, parser) + # build corpus for "text|both" feature setting using the text_dict, the parsing results from ltp + if feature_mode in ['t', 'b']: + corpus_line = get_text_feature_from_ltp_results(essay_text, text_dict, feature=feature_type) + corpus.append(corpus_line) + indices = getLinguisticIndices(text_dict) + indices['essay_ID'] = essay_id + indices['SCORE'] = essay_score + index_data[index] = indices + +release_ltpmodel(segmentor, postagger, parser) + +df_ling = pd.DataFrame.from_dict(index_data, orient='index') + +for column_name in df_ling.columns: + if column_name not in ['SCORE', 'essay_ID']: + x = df_ling[column_name].values.reshape(-1, 1) + min_max_scaler = MinMaxScaler() + df_ling[column_name] = min_max_scaler.fit_transform(x) + +df_ling = df_ling.fillna(0) +df_ling.to_csv(data_path + '/essay_linguistic_indices.csv', index=False) + +ling_name = list(df_ling.columns) +ling_name.remove('essay_ID') + +# Get effective linguistic feature stepwise regression +logger.info('>>>>>>>>> STEPWISE REGRESSION <<<<<<<<<') +features_automatic = stepwiseSelection(df_ling.filter(items=ling_name), 'SCORE', verbose=False) +logger.info(f'Linguistics Features Selected:\n{features_automatic}') + +if feature_mode == 't': # text mode + # Get text representation + X = get_text_matrix(corpus, ngram_min=min_n, ngram_max=max_n, df_threshold=min_df) + Y = np.array([score2ord[score] for score in df_data['SCORE']]) + +elif feature_mode == 'l': # ling mode + # Get linguistic Feature Vec + X = np.array(df_ling.filter(items=features_automatic)) + Y = np.array([score2ord[score] for score in df_ling['SCORE']]) + +else: # both mode + # Get text representation + txt_vec = get_text_matrix(corpus, ngram_min=min_n, ngram_max=max_n, + df_threshold=min_df, sparse=True) + # Get linguistic Feature Vec + lng_vec = np.array(df_ling.filter(items=features_automatic)) + X = np.concatenate((txt_vec, lng_vec), axis=1) + Y = np.array([score2ord[score] for score in df_ling['SCORE']]) + + +# Set models and their parameters +# Note that OrdinalRidge and Ridge Regression are identical +models = { + 'LinearRegression': LinearRegression(), + 'Ridge': Ridge(alpha=rdg_penalty), + 'RandomForestRegressor': RandomForestRegressor(max_depth=max_depth, random_state=0), + 'XGBRegressor': xgb.XGBRegressor(max_depth=maxd_xgb, + learning_rate=lr, + n_estimators=n_estimators, + min_child_weight=min_child_weight, + subsample=0.7, + colsample_bytree=0.7, + reg_alpha=0, + reg_lambda=0, + silent=True, + objective='reg:gamma', + missing=None, + seed=123, + gamma=gamma), + 'LogisticRegression': LogisticRegression(max_iter=1000), + 'OrderedLR_LogisticAT': LogisticAT(alpha=olr_penalty) + # 'OrdinalRidge': OrdinalRidge(alpha=olr_penalty) + } + +# train/dev/test split +# Rewrite following codes according to your data +X_tr_val = X[:90] +Y_tr_val = Y[:90] +X_test = X[90:] +Y_test = Y[90:] + +X_lst = np.array_split(X_tr_val, 5) +Y_lst = np.array_split(Y_tr_val, 5) + +logger.info('>>>>>>>>>> Cross Validation <<<<<<<<<') + +# split train_valid to train/valid/test and CV +for i in range(5): + X_frame = list() + Y_frame = list() + for j in range(5): + if j != i: + X_frame.append(X_lst[j]) + Y_frame.append(Y_lst[j]) + X_train = np.concatenate(X_frame, axis=0) + Y_train = np.concatenate(Y_frame, axis=0) + X_valid = X_lst[i] + Y_valid = Y_lst[i] + + # write shape of train/dev/test in log + logger.info('\n########## Fold {} ##########'.format(i+1)) + logger.info(f'tr: {X_train.shape} dev: {X_valid.shape} ts: {X_test.shape}') + logger.info(f'tr: {Y_train.shape} dev: {Y_valid.shape} ts: {Y_test.shape}') + + for name, lm in models.items(): + logger.info('\n=========== {} ============'.format(name)) + lm.fit(X_train, Y_train) + + # predict + Y_train_pred = lm.predict(X_train) + Y_valid_pred = lm.predict(X_valid) + Y_test_pred = lm.predict(X_test) + + # Modified too small values to the lower bound 0 + # Modified too large values to the upper bound 11 + Y_train_pred[np.where(Y_train_pred < 0)] = 0 + Y_train_pred[np.where(Y_train_pred > 11)] = 11 + Y_valid_pred[np.where(Y_valid_pred < 0)] = 0 + Y_valid_pred[np.where(Y_valid_pred > 11)] = 11 + Y_test_pred[np.where(Y_test_pred < 0)] = 0 + Y_test_pred[np.where(Y_test_pred > 11)] = 11 + + # transform predicted values to integer + Y_train_pred_int = np.rint(Y_train_pred).astype('int32') + Y_valid_pred_int = np.rint(Y_valid_pred).astype('int32') + Y_test_pred_int = np.rint(Y_test_pred).astype('int32') + + # output + logger.info(f'[TRAIN QWK] {qwk(Y_train, Y_train_pred_int, 0, 11)}') + logger.info(f'[VALID QWK] {qwk(Y_valid, Y_valid_pred_int, 0, 11)}') + logger.info(f'[TEST QWK] {qwk(Y_test, Y_test_pred_int, 0, 11)}') + + train_rmse = np.sqrt(mean_squared_error(Y_train, Y_train_pred_int)) + valid_rmse = np.sqrt(mean_squared_error(Y_valid, Y_valid_pred_int)) + test_rmse = np.sqrt(mean_squared_error(Y_test, Y_test_pred_int)) + logger.info(f'[TRAIN RMSE] {train_rmse}') + logger.info(f'[VALID RMSE] {valid_rmse}') + logger.info(f'[TEST RMSE] {test_rmse}') + + train_pears = scipy.stats.pearsonr(Y_train, Y_train_pred_int) + valid_pears = scipy.stats.pearsonr(Y_valid, Y_valid_pred_int) + test_pears = scipy.stats.pearsonr(Y_test, Y_test_pred_int) + logger.info(f'[TRAIN PEARS] {train_pears[0]}') + logger.info(f'[VALID PEARS] {valid_pears[0]}') + logger.info(f'[TEST PEARS] {test_pears[0]}') + +t2 = time.time() +logger.info('\nMain Program Runs for %s s' % str(t2-t1)) +logger.info('===============================================================\n') diff --git a/my_kappa_calculator.py b/my_kappa_calculator.py new file mode 100644 index 0000000..573f5e6 --- /dev/null +++ b/my_kappa_calculator.py @@ -0,0 +1,18 @@ +import numpy as np +from quadratic_weighted_kappa import quadratic_weighted_kappa as qwk +from quadratic_weighted_kappa import linear_weighted_kappa as lwk + + +def assert_inputs(rater_a, rater_b): + assert np.issubdtype(rater_a.dtype, np.integer), 'Integer array expected, got ' + str(rater_a.dtype) + assert np.issubdtype(rater_b.dtype, np.integer), 'Integer array expected, got ' + str(rater_b.dtype) + + +def quadratic_weighted_kappa(rater_a, rater_b, min_rating, max_rating): + assert_inputs(rater_a, rater_b) + return qwk(rater_a, rater_b, min_rating, max_rating) + + +def linear_weighted_kappa(rater_a, rater_b, min_rating, max_rating): + assert_inputs(rater_a, rater_b) + return lwk(rater_a, rater_b, min_rating, max_rating) diff --git a/quadratic_weighted_kappa.py b/quadratic_weighted_kappa.py new file mode 100644 index 0000000..a12a5d0 --- /dev/null +++ b/quadratic_weighted_kappa.py @@ -0,0 +1,229 @@ +import numpy as np + + +def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None): + """ + Returns the confusion matrix between rater's ratings + """ + assert(len(rater_a) == len(rater_b)) + if min_rating is None: # if there's no min, get min + min_rating = min(rater_a + rater_b) # splice instead of plus + if max_rating is None: # if there's no max, get max + max_rating = max(rater_a + rater_b) + num_ratings = int(max_rating - min_rating + 1) + # initialize the conf_mat to all zeros + conf_mat = [[0 for i in range(num_ratings)] + for j in range(num_ratings)] + for a, b in zip(rater_a, rater_b): # count how many ratings are there in both raters' results + conf_mat[a - min_rating][b - min_rating] += 1 + return conf_mat + + +def histogram(ratings, min_rating=None, max_rating=None): + """ + Returns the counts of each type of rating that a rater made + """ + if min_rating is None: + min_rating = min(ratings) + if max_rating is None: + max_rating = max(ratings) + num_ratings = int(max_rating - min_rating + 1) + hist_ratings = [0 for x in range(num_ratings)] + for r in ratings: + hist_ratings[r - min_rating] += 1 + return hist_ratings + + +def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None): + """ + Calculates the quadratic weighted kappa + quadratic_weighted_kappa calculates the quadratic weighted kappa + value, which is a measure of inter-rater agreement between two raters + that provide discrete numeric ratings. Potential values range from -1 + (representing complete disagreement) to 1 (representing complete + agreement). A kappa value of 0 is expected if all agreement is due to + chance. + + quadratic_weighted_kappa(rater_a, rater_b), where rater_a and rater_b + each correspond to a list of integer ratings. These lists must have the + same length. + + The ratings should be integers, and it is assumed that they contain + the complete range of possible ratings. + + quadratic_weighted_kappa(X, min_rating, max_rating), where min_rating + is the minimum possible rating, and max_rating is the maximum possible + rating + """ + rater_a = np.array(rater_a, dtype=int) + rater_b = np.array(rater_b, dtype=int) + assert(len(rater_a) == len(rater_b)) + if min_rating is None: + min_rating = min(min(rater_a), min(rater_b)) + if max_rating is None: + max_rating = max(max(rater_a), max(rater_b)) + conf_mat = confusion_matrix(rater_a, rater_b, + min_rating, max_rating) + num_ratings = len(conf_mat) + num_scored_items = float(len(rater_a)) + + # these two vector refer to their counts + hist_rater_a = histogram(rater_a, min_rating, max_rating) + hist_rater_b = histogram(rater_b, min_rating, max_rating) + + numerator = 0.0 + denominator = 0.0 + + for i in range(num_ratings): + for j in range(num_ratings): + # expected_count is so call outer product of two vec?? + expected_count = (hist_rater_a[i] * hist_rater_b[j] + / num_scored_items) + d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0) + numerator += d * conf_mat[i][j] / num_scored_items + denominator += d * expected_count / num_scored_items + + return 1.0 - numerator / denominator + + +def linear_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None): + """ + Calculates the linear weighted kappa + linear_weighted_kappa calculates the linear weighted kappa + value, which is a measure of inter-rater agreement between two raters + that provide discrete numeric ratings. Potential values range from -1 + (representing complete disagreement) to 1 (representing complete + agreement). A kappa value of 0 is expected if all agreement is due to + chance. + + linear_weighted_kappa(rater_a, rater_b), where rater_a and rater_b + each correspond to a list of integer ratings. These lists must have the + same length. + + The ratings should be integers, and it is assumed that they contain + the complete range of possible ratings. + + linear_weighted_kappa(X, min_rating, max_rating), where min_rating + is the minimum possible rating, and max_rating is the maximum possible + rating + """ + assert(len(rater_a) == len(rater_b)) + if min_rating is None: + min_rating = min(rater_a + rater_b) + if max_rating is None: + max_rating = max(rater_a + rater_b) + conf_mat = confusion_matrix(rater_a, rater_b, + min_rating, max_rating) + num_ratings = len(conf_mat) + num_scored_items = float(len(rater_a)) + + hist_rater_a = histogram(rater_a, min_rating, max_rating) + hist_rater_b = histogram(rater_b, min_rating, max_rating) + + numerator = 0.0 + denominator = 0.0 + + for i in range(num_ratings): + for j in range(num_ratings): + expected_count = (hist_rater_a[i] * hist_rater_b[j] + / num_scored_items) + d = abs(i - j) / float(num_ratings - 1) + numerator += d * conf_mat[i][j] / num_scored_items + denominator += d * expected_count / num_scored_items + + return 1.0 - numerator / denominator + + +def kappa(rater_a, rater_b, min_rating=None, max_rating=None): + """ + Calculates the kappa + kappa calculates the kappa + value, which is a measure of inter-rater agreement between two raters + that provide discrete numeric ratings. Potential values range from -1 + (representing complete disagreement) to 1 (representing complete + agreement). A kappa value of 0 is expected if all agreement is due to + chance. + + kappa(rater_a, rater_b), where rater_a and rater_b + each correspond to a list of integer ratings. These lists must have the + same length. + + The ratings should be integers, and it is assumed that they contain + the complete range of possible ratings. + + kappa(X, min_rating, max_rating), where min_rating + is the minimum possible rating, and max_rating is the maximum possible + rating + """ + assert(len(rater_a) == len(rater_b)) + if min_rating is None: + min_rating = min(rater_a + rater_b) + if max_rating is None: + max_rating = max(rater_a + rater_b) + conf_mat = confusion_matrix(rater_a, rater_b, + min_rating, max_rating) + num_ratings = len(conf_mat) + num_scored_items = float(len(rater_a)) + + hist_rater_a = histogram(rater_a, min_rating, max_rating) + hist_rater_b = histogram(rater_b, min_rating, max_rating) + + numerator = 0.0 + denominator = 0.0 + + for i in range(num_ratings): + for j in range(num_ratings): + expected_count = (hist_rater_a[i] * hist_rater_b[j] + / num_scored_items) + if i == j: + d = 0.0 + else: + d = 1.0 + numerator += d * conf_mat[i][j] / num_scored_items + denominator += d * expected_count / num_scored_items + + return 1.0 - numerator / denominator + + +def mean_quadratic_weighted_kappa(kappas, weights=None): + """ + Calculates the mean of the quadratic + weighted kappas after applying Fisher's r-to-z transform, which is + approximately a variance-stabilizing transformation. This + transformation is undefined if one of the kappas is 1.0, so all kappa + values are capped in the range (-0.999, 0.999). The reverse + transformation is then applied before returning the result. + + mean_quadratic_weighted_kappa(kappas), where kappas is a vector of + kappa values + + mean_quadratic_weighted_kappa(kappas, weights), where weights is a vector + of weights that is the same size as kappas. Weights are applied in the + z-space + """ + kappas = np.array(kappas, dtype=float) + if weights is None: + weights = np.ones(np.shape(kappas)) + else: + weights = weights / np.mean(weights) + + # ensure that kappas are in the range [-.999, .999] + kappas = np.array([min(x, .999) for x in kappas]) + kappas = np.array([max(x, -.999) for x in kappas]) + + z = 0.5 * np.log((1 + kappas) / (1 - kappas)) * weights + z = np.mean(z) + return (np.exp(2 * z) - 1) / (np.exp(2 * z) + 1) + + +def weighted_mean_quadratic_weighted_kappa(solution, submission): + predicted_score = submission[submission.columns[-1]].copy() + predicted_score.name = "predicted_score" + if predicted_score.index[0] == 0: + predicted_score = predicted_score[:len(solution)] + predicted_score.index = solution.index + combined = solution.join(predicted_score, how="left") + groups = combined.groupby(by="essay_set") + kappas = [quadratic_weighted_kappa(group[1]["essay_score"], group[1]["predicted_score"]) for group in groups] + weights = [group[1]["essay_weight"].irow(0) for group in groups] + return mean_quadratic_weighted_kappa(kappas, weights=weights) diff --git a/stepwise_selection.py b/stepwise_selection.py new file mode 100644 index 0000000..01cc2a1 --- /dev/null +++ b/stepwise_selection.py @@ -0,0 +1,87 @@ +import pandas as pd +import statsmodels.api as sm +from statsmodels.formula.api import ols + + +def stepwiseSelection(data, tag, + initial_list=list(), + threshold_in=0.05, + threshold_out=0.10, + verbose=True): + ''' + data: contaning indep/dep variables + tag: name of target + initial_list: independent variables that must be included + threshold_in: partial F-test threshold for entering a variable + threshold_in: partial F-test threshold for eliminating a variable + verbose: show stepwise details of entering and eliminating + >>> NOTE that {threshold_in < threshold_out} must be satisfied! + ''' + + ab_dic = dict() + ab_mark = ['~', '+', '-', ':', '*', '/'] + for col in data.columns: + for mark in ab_mark: + if mark in col: + ab_dic[col.replace(mark, '')] = col + data.rename(columns={col: col.replace(mark, '')}, inplace=True) + + included = initial_list + formula = f'{tag}~1' # set a constant model as initial reduced_model + best_r2_dif = .0 + + while True: # end loop when no variable gets in/out + + excluded = list(set(data.columns) - set(included)) + excluded.remove(tag) + changed = False + + full_model = ols(formula=formula, data=data).fit() + last_adj_r2 = full_model.rsquared_adj + + # forward step + for new_feature in excluded: + + # Note here the test_model has more variable than full_model + test_model = ols(formula=formula+f'+{new_feature}', data=data).fit() + + # find feature whose contribution to adj_r2 largest + if test_model.rsquared_adj - last_adj_r2 > best_r2_dif: + best_r2_dif = test_model.rsquared_adj - last_adj_r2 + last_adj_r2 = test_model.rsquared_adj + best_feature = new_feature + + # Partial F-test + # Note that in anova_lm models with few variables are put forward + full_model_pro = ols(formula=formula+f'+{best_feature}', data=data).fit() + anova_tbl = sm.stats.anova_lm(full_model, full_model_pro) + criterion = anova_tbl['Pr(>F)'][1] + + if criterion <= threshold_in: + included.append(best_feature) + formula += f'+{best_feature}' + full_model = full_model_pro + changed = True + best_r2_dif = .0 + if verbose: + print('Add {:25} with f_pvalue {:.6}'.format(best_feature, criterion)) + + # backward step + for old_feature in included: + test_model = ols(formula=formula.replace(f'+{old_feature}', ''), data=data).fit() + + # Note here the test_model has less variable than full_model + anova_tbl = sm.stats.anova_lm(test_model, full_model) + criterion = anova_tbl['Pr(>F)'][1] + if criterion >= threshold_out: + included.remove(old_feature) + formula = formula.replace(f'+{old_feature}', '') + changed = True + best_r2_dif = .0 + if verbose: + print('Drop {:25} with f_pvalue {:.6}'.format(old_feature, criterion)) + + if not changed: + break + + return [ab_dic[x] if x in ab_dic.keys() else x for x in included]