From 3b906ea97e8a88a539e505936960f94d86b3a20f Mon Sep 17 00:00:00 2001
From: LemonadeXyz <62137944+LemonadeXyz@users.noreply.github.com>
Date: Fri, 20 Aug 2021 21:53:36 +0800
Subject: [PATCH] Add files via upload
20210820 by Y.Wang
---
README.md | 52 +++++++-
main.py | 248 ++++++++++++++++++++++++++++++++++++
my_kappa_calculator.py | 18 +++
quadratic_weighted_kappa.py | 229 +++++++++++++++++++++++++++++++++
stepwise_selection.py | 87 +++++++++++++
5 files changed, 632 insertions(+), 2 deletions(-)
create mode 100644 main.py
create mode 100644 my_kappa_calculator.py
create mode 100644 quadratic_weighted_kappa.py
create mode 100644 stepwise_selection.py
diff --git a/README.md b/README.md
index 76f4b08..28a3f73 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,50 @@
-# L2C-rater
-Automated Essay Scoring Method for Chinese Second Language Writing
+# L2C-rater (Automated Essay Scoring Method for Chinese Second Language Writing)
+
+An automated essay scoring method for Chinese L2 writing that combines 90 linguistic features with TF-IDF text representations, based on linear models, tree-structured models, and logistic regression models, especially Ordinal Logistic Regression (OLR).
+
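+As a minimal sketch of the core idea (not the actual pipeline in main.py; the toy essays, the random placeholder for the 90 linguistic features, and the parameter values below are illustrative assumptions), the two feature families can be combined and fed to an ordinal model like this:
+
+```python
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+from mord import LogisticAT
+
+# toy pre-segmented essays (tokens joined by spaces) and ordinal scores
+essays = ["我 喜欢 学习 汉语", "汉语 很 难 但是 很 有趣",
+          "我 每天 练习 写 汉字", "写 作文 需要 很多 词汇"]
+scores = np.array([0, 1, 2, 3])
+
+# random placeholder for the 90 linguistic features main.py computes with pyltp
+ling = np.random.rand(4, 90)
+
+# TF-IDF over whitespace-separated tokens, concatenated with linguistic features
+tfidf = TfidfVectorizer(token_pattern=r"\S+").fit_transform(essays).toarray()
+X = np.concatenate((tfidf, ling), axis=1)
+
+olr = LogisticAT(alpha=1.0).fit(X, scores)  # Ordinal Logistic Regression
+print(olr.predict(X))
+```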
+### Environment
+Python 3.6+
+
+### Main Packages
+numpy 1.19.5
+pandas 1.1.5
+scipy 1.5.2
+scikit-learn 0.24.1
+pyltp 0.1.9.1
+jieba
+xgboost
+mord
+
+### Set Up ###
+
+* Configure the environment
+* Prepare data
+* Run main.py
+
+### Dataset ###
+
+We evaluate our system with 5-fold cross validation on the HSK Dynamic Composition Corpus, which is publicly available [here](http://hsk.blcu.edu.cn/). This code can also be run with the sample data provided [here]().
+
+### Options
+
+You can see the list of available options by running:
+
+```bash
+python main.py -h
+```
+
+For ```feature_mode```, ```t``` means using the **text representation** only when training the model, ```l``` means using the **linguistic features** only, and ```b``` means using both of the above.
+
+For ```feature_type```, ```c``` means segmenting the essay to the **character** level when generating the text representation, ```w``` means segmenting to the **word** level, ```cw``` means using both of the above, ```wp``` means segmenting to the **word** level and tagging **part-of-speech (POS)**, and ```cwp``` means using both ```c``` and ```wp```.
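+
+For instance, to train on linguistic features only, or on a word-plus-POS text representation only:
+
+```bash
+python main.py -m l
+python main.py -m t -t wp
+```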
+
+### Example ###
+
+The following command trains the best model among all automated models. Note that you will not get convincing results with the sample data provided [here](), because it is not large enough.
+
+```bash
+python main.py -m b -t wp
+```
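+
+Here ```-m b``` trains on both the text representation and the linguistic features, and ```-t wp``` segments essays to the word level with POS tags.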
+
+### Publication ###
+
+Wang Y., Hu R. (2021) A Prompt-Independent and Interpretable Automated Essay Scoring Method for Chinese Second Language Writing. In: Li S. et al. (eds) Chinese Computational Linguistics. CCL 2021. Lecture Notes in Computer Science, vol 12869. Springer, Cham. https://doi.org/10.1007/978-3-030-84186-7_30
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..f59503e
--- /dev/null
+++ b/main.py
@@ -0,0 +1,248 @@
+import argparse
+import time
+import logging
+import numpy as np
+import pandas as pd
+import scipy.stats
+import xgboost as xgb
+from sklearn.utils import shuffle
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.linear_model import LinearRegression, Lasso, Ridge, LogisticRegression
+from sklearn.ensemble import RandomForestRegressor
+from mord import LogisticAT, OrdinalRidge
+from sklearn.metrics import mean_squared_error, confusion_matrix
+from utils.text import *
+from utils.text_repre import *
+from linguistic import getLinguisticIndices
+from stepwise_selection import *
+from my_kappa_calculator import quadratic_weighted_kappa as qwk
+
+# set logger
+logging.basicConfig(filename='hsk.log', level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# set argparse
+parser = argparse.ArgumentParser()
+parser.add_argument("-mp", "--modelpath", dest="model_path", type=str, metavar='', default='./ltp_models',
+ help="the model directory")
+parser.add_argument("-dp", "--datapath", dest="data_path", type=str, metavar='', default='./data',
+ help="the input directory")
+parser.add_argument("-m", "--feature_mode", dest="feature_mode", type=str, metavar='', default='l',
+ help="feature mode (t|l|b)(default=l)")
+
+parser.add_argument("-t", "--feature_type", dest="feature_type", type=str, metavar='', default='c',
+ help="feature type (c|w|cw|wp|cwp) (default=c)")
+
+parser.add_argument("--max_depth", dest="max_depth", type=int, metavar='', default=40,
+ help="max_depth of random forest regressior")
+
+parser.add_argument("--maxd_xgb", dest="maxd_xgb", type=int, metavar='', default=3,
+ help="max_depth of XGBRegressor")
+parser.add_argument("-lr", dest="learning_rate", type=float, metavar='', default=0.05,
+ help="learning_rate of XGBRegressor")
+parser.add_argument("--min_cw", dest="min_child_weight", type=int, metavar='', default=5,
+ help="min_child_weight of XGBRegressor")
+parser.add_argument("--n_estimators", dest="n_estimators", type=int, metavar='', default=300,
+ help="n_estimators of XGBRegressor")
+parser.add_argument("-gm", dest="gamma", type=int, metavar='', default=5,
+ help="gamma of XBClassifier")
+
+parser.add_argument("--olr_penalty", dest="olr_penalty", type=int, metavar='', default=1.,
+ help="regular parameter for OLR. Zero value means no regularization")
+parser.add_argument("--rdg_penalty", dest="rdg_penalty", type=int, metavar='', default=1.,
+ help="regular parameter for Ridge")
+
+parser.add_argument("--ngram_max", dest="max_n", type=int, metavar='', default=1,
+ help="upper bound of ngram range")
+parser.add_argument("--ngram_min", dest="min_n", type=int, metavar='', default=1,
+ help="lower bound of ngram range")
+parser.add_argument("--df_threshold", dest="min_df", type=int, metavar='', default=10,
+ help="document frequency when building vocabulary")
+args = parser.parse_args()
+
+model_path = args.model_path
+data_path = args.data_path
+feature_mode = args.feature_mode
+feature_type = args.feature_type
+max_depth = args.max_depth
+maxd_xgb = args.maxd_xgb
+n_estimators = args.n_estimators
+lr = args.learning_rate
+min_child_weight = args.min_child_weight
+gamma = args.gamma
+olr_penalty = args.olr_penalty
+rdg_penalty = args.rdg_penalty
+max_n = args.max_n
+min_n = args.min_n
+min_df = args.min_df
+
+logger.info("feature_mode: " + str(feature_mode))
+logger.info("feature_type: " + str(feature_type))
+logger.info("n-grams range: [ %s, %s]" % (str(min_n), str(max_n)))
+logger.info("term frequency threshold: " + str(min_df))
+
+t1 = time.time()
+
+# Generating Linguistics Features
+index_data = {}
+corpus = []
+df_data = pd.read_csv(data_path + '/essay_revised.csv')
+
+# shuffle
+# df_data = shuffle(df_data)
+# df_data = df_data.reset_index(drop=True)
+
+segmentor, postagger, parser = load_ltpmodel(model_path)
+
+for index, row in df_data.iterrows():
+ essay_id, essay_text, essay_score = row['essay_ID'], row['ESSAY'], row['SCORE']
+ text_dict = text_process(essay_text, segmentor, postagger, parser)
+    # build the corpus for the "t"/"b" feature modes from text_dict, the LTP parsing results
+ if feature_mode in ['t', 'b']:
+ corpus_line = get_text_feature_from_ltp_results(essay_text, text_dict, feature=feature_type)
+ corpus.append(corpus_line)
+ indices = getLinguisticIndices(text_dict)
+ indices['essay_ID'] = essay_id
+ indices['SCORE'] = essay_score
+ index_data[index] = indices
+
+release_ltpmodel(segmentor, postagger, parser)
+
+df_ling = pd.DataFrame.from_dict(index_data, orient='index')
+
+for column_name in df_ling.columns:
+ if column_name not in ['SCORE', 'essay_ID']:
+ x = df_ling[column_name].values.reshape(-1, 1)
+ min_max_scaler = MinMaxScaler()
+ df_ling[column_name] = min_max_scaler.fit_transform(x)
+
+df_ling = df_ling.fillna(0)
+df_ling.to_csv(data_path + '/essay_linguistic_indices.csv', index=False)
+
+ling_name = list(df_ling.columns)
+ling_name.remove('essay_ID')
+
+# Select effective linguistic features via stepwise regression
+logger.info('>>>>>>>>> STEPWISE REGRESSION <<<<<<<<<')
+features_automatic = stepwiseSelection(df_ling.filter(items=ling_name), 'SCORE', verbose=False)
+logger.info(f'Linguistics Features Selected:\n{features_automatic}')
+
+if feature_mode == 't': # text mode
+ # Get text representation
+ X = get_text_matrix(corpus, ngram_min=min_n, ngram_max=max_n, df_threshold=min_df)
+ Y = np.array([score2ord[score] for score in df_data['SCORE']])
+
+elif feature_mode == 'l': # ling mode
+ # Get linguistic Feature Vec
+ X = np.array(df_ling.filter(items=features_automatic))
+ Y = np.array([score2ord[score] for score in df_ling['SCORE']])
+
+else: # both mode
+ # Get text representation
+ txt_vec = get_text_matrix(corpus, ngram_min=min_n, ngram_max=max_n,
+ df_threshold=min_df, sparse=True)
+ # Get linguistic Feature Vec
+ lng_vec = np.array(df_ling.filter(items=features_automatic))
+ X = np.concatenate((txt_vec, lng_vec), axis=1)
+ Y = np.array([score2ord[score] for score in df_ling['SCORE']])
+
+
+# Set models and their parameters
+# Note that OrdinalRidge and Ridge Regression are identical
+models = {
+ 'LinearRegression': LinearRegression(),
+ 'Ridge': Ridge(alpha=rdg_penalty),
+ 'RandomForestRegressor': RandomForestRegressor(max_depth=max_depth, random_state=0),
+ 'XGBRegressor': xgb.XGBRegressor(max_depth=maxd_xgb,
+ learning_rate=lr,
+ n_estimators=n_estimators,
+ min_child_weight=min_child_weight,
+ subsample=0.7,
+ colsample_bytree=0.7,
+ reg_alpha=0,
+ reg_lambda=0,
+ silent=True,
+ objective='reg:gamma',
+ missing=None,
+ seed=123,
+ gamma=gamma),
+ 'LogisticRegression': LogisticRegression(max_iter=1000),
+ 'OrderedLR_LogisticAT': LogisticAT(alpha=olr_penalty)
+ # 'OrdinalRidge': OrdinalRidge(alpha=olr_penalty)
+ }
+
+# train/dev/test split
+# Rewrite the following code according to your data
+X_tr_val = X[:90]
+Y_tr_val = Y[:90]
+X_test = X[90:]
+Y_test = Y[90:]
+
+X_lst = np.array_split(X_tr_val, 5)
+Y_lst = np.array_split(Y_tr_val, 5)
+
+logger.info('>>>>>>>>>> Cross Validation <<<<<<<<<')
+
+# split the train+valid portion into 5 folds for cross validation
+for i in range(5):
+ X_frame = list()
+ Y_frame = list()
+ for j in range(5):
+ if j != i:
+ X_frame.append(X_lst[j])
+ Y_frame.append(Y_lst[j])
+ X_train = np.concatenate(X_frame, axis=0)
+ Y_train = np.concatenate(Y_frame, axis=0)
+ X_valid = X_lst[i]
+ Y_valid = Y_lst[i]
+
+ # write shape of train/dev/test in log
+ logger.info('\n########## Fold {} ##########'.format(i+1))
+ logger.info(f'tr: {X_train.shape} dev: {X_valid.shape} ts: {X_test.shape}')
+ logger.info(f'tr: {Y_train.shape} dev: {Y_valid.shape} ts: {Y_test.shape}')
+
+ for name, lm in models.items():
+ logger.info('\n=========== {} ============'.format(name))
+ lm.fit(X_train, Y_train)
+
+ # predict
+ Y_train_pred = lm.predict(X_train)
+ Y_valid_pred = lm.predict(X_valid)
+ Y_test_pred = lm.predict(X_test)
+
+        # clip out-of-range predictions to the score bounds [0, 11]
+ Y_train_pred[np.where(Y_train_pred < 0)] = 0
+ Y_train_pred[np.where(Y_train_pred > 11)] = 11
+ Y_valid_pred[np.where(Y_valid_pred < 0)] = 0
+ Y_valid_pred[np.where(Y_valid_pred > 11)] = 11
+ Y_test_pred[np.where(Y_test_pred < 0)] = 0
+ Y_test_pred[np.where(Y_test_pred > 11)] = 11
+
+ # transform predicted values to integer
+ Y_train_pred_int = np.rint(Y_train_pred).astype('int32')
+ Y_valid_pred_int = np.rint(Y_valid_pred).astype('int32')
+ Y_test_pred_int = np.rint(Y_test_pred).astype('int32')
+
+ # output
+ logger.info(f'[TRAIN QWK] {qwk(Y_train, Y_train_pred_int, 0, 11)}')
+ logger.info(f'[VALID QWK] {qwk(Y_valid, Y_valid_pred_int, 0, 11)}')
+ logger.info(f'[TEST QWK] {qwk(Y_test, Y_test_pred_int, 0, 11)}')
+
+ train_rmse = np.sqrt(mean_squared_error(Y_train, Y_train_pred_int))
+ valid_rmse = np.sqrt(mean_squared_error(Y_valid, Y_valid_pred_int))
+ test_rmse = np.sqrt(mean_squared_error(Y_test, Y_test_pred_int))
+ logger.info(f'[TRAIN RMSE] {train_rmse}')
+ logger.info(f'[VALID RMSE] {valid_rmse}')
+ logger.info(f'[TEST RMSE] {test_rmse}')
+
+ train_pears = scipy.stats.pearsonr(Y_train, Y_train_pred_int)
+ valid_pears = scipy.stats.pearsonr(Y_valid, Y_valid_pred_int)
+ test_pears = scipy.stats.pearsonr(Y_test, Y_test_pred_int)
+ logger.info(f'[TRAIN PEARS] {train_pears[0]}')
+ logger.info(f'[VALID PEARS] {valid_pears[0]}')
+ logger.info(f'[TEST PEARS] {test_pears[0]}')
+
+t2 = time.time()
+logger.info('\nMain program ran for %s s' % str(t2 - t1))
+logger.info('===============================================================\n')
diff --git a/my_kappa_calculator.py b/my_kappa_calculator.py
new file mode 100644
index 0000000..573f5e6
--- /dev/null
+++ b/my_kappa_calculator.py
@@ -0,0 +1,18 @@
+import numpy as np
+from quadratic_weighted_kappa import quadratic_weighted_kappa as qwk
+from quadratic_weighted_kappa import linear_weighted_kappa as lwk
+
+
+def assert_inputs(rater_a, rater_b):
+ assert np.issubdtype(rater_a.dtype, np.integer), 'Integer array expected, got ' + str(rater_a.dtype)
+ assert np.issubdtype(rater_b.dtype, np.integer), 'Integer array expected, got ' + str(rater_b.dtype)
+
+
+def quadratic_weighted_kappa(rater_a, rater_b, min_rating, max_rating):
+ assert_inputs(rater_a, rater_b)
+ return qwk(rater_a, rater_b, min_rating, max_rating)
+
+
+def linear_weighted_kappa(rater_a, rater_b, min_rating, max_rating):
+ assert_inputs(rater_a, rater_b)
+ return lwk(rater_a, rater_b, min_rating, max_rating)
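+
+
+# Illustrative usage (integer numpy arrays are required by assert_inputs):
+#   quadratic_weighted_kappa(np.array([0, 1, 2]), np.array([0, 1, 3]), 0, 11)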
diff --git a/quadratic_weighted_kappa.py b/quadratic_weighted_kappa.py
new file mode 100644
index 0000000..a12a5d0
--- /dev/null
+++ b/quadratic_weighted_kappa.py
@@ -0,0 +1,229 @@
+import numpy as np
+
+
+def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
+ """
+    Returns the confusion matrix between two raters' ratings
+ """
+ assert(len(rater_a) == len(rater_b))
+ if min_rating is None: # if there's no min, get min
+        min_rating = min(rater_a + rater_b)  # list concatenation, not element-wise addition
+ if max_rating is None: # if there's no max, get max
+ max_rating = max(rater_a + rater_b)
+ num_ratings = int(max_rating - min_rating + 1)
+ # initialize the conf_mat to all zeros
+ conf_mat = [[0 for i in range(num_ratings)]
+ for j in range(num_ratings)]
+    for a, b in zip(rater_a, rater_b):  # count co-occurrences of each pair of ratings
+ conf_mat[a - min_rating][b - min_rating] += 1
+ return conf_mat
+
+
+def histogram(ratings, min_rating=None, max_rating=None):
+ """
+ Returns the counts of each type of rating that a rater made
+ """
+ if min_rating is None:
+ min_rating = min(ratings)
+ if max_rating is None:
+ max_rating = max(ratings)
+ num_ratings = int(max_rating - min_rating + 1)
+ hist_ratings = [0 for x in range(num_ratings)]
+ for r in ratings:
+ hist_ratings[r - min_rating] += 1
+ return hist_ratings
+
+
+def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None):
+ """
+ Calculates the quadratic weighted kappa
+ quadratic_weighted_kappa calculates the quadratic weighted kappa
+ value, which is a measure of inter-rater agreement between two raters
+ that provide discrete numeric ratings. Potential values range from -1
+ (representing complete disagreement) to 1 (representing complete
+ agreement). A kappa value of 0 is expected if all agreement is due to
+ chance.
+
+ quadratic_weighted_kappa(rater_a, rater_b), where rater_a and rater_b
+ each correspond to a list of integer ratings. These lists must have the
+ same length.
+
+ The ratings should be integers, and it is assumed that they contain
+ the complete range of possible ratings.
+
+    quadratic_weighted_kappa(rater_a, rater_b, min_rating, max_rating), where
+    min_rating is the minimum possible rating, and max_rating is the maximum
+    possible rating
+ """
+ rater_a = np.array(rater_a, dtype=int)
+ rater_b = np.array(rater_b, dtype=int)
+ assert(len(rater_a) == len(rater_b))
+ if min_rating is None:
+ min_rating = min(min(rater_a), min(rater_b))
+ if max_rating is None:
+ max_rating = max(max(rater_a), max(rater_b))
+ conf_mat = confusion_matrix(rater_a, rater_b,
+ min_rating, max_rating)
+ num_ratings = len(conf_mat)
+ num_scored_items = float(len(rater_a))
+
+    # histograms of each rater's rating counts
+ hist_rater_a = histogram(rater_a, min_rating, max_rating)
+ hist_rater_b = histogram(rater_b, min_rating, max_rating)
+
+ numerator = 0.0
+ denominator = 0.0
+
+ for i in range(num_ratings):
+ for j in range(num_ratings):
+            # expected_count is the (i, j) entry of the outer product of the two histograms, normalized by the item count
+ expected_count = (hist_rater_a[i] * hist_rater_b[j]
+ / num_scored_items)
+ d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)
+ numerator += d * conf_mat[i][j] / num_scored_items
+ denominator += d * expected_count / num_scored_items
+
+ return 1.0 - numerator / denominator
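+
+# Illustrative sanity checks:
+#   quadratic_weighted_kappa([1, 2, 3], [1, 2, 3]) -> 1.0   (perfect agreement)
+#   quadratic_weighted_kappa([1, 2, 3], [3, 2, 1]) -> -1.0  (complete reversal)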
+
+
+def linear_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None):
+ """
+ Calculates the linear weighted kappa
+ linear_weighted_kappa calculates the linear weighted kappa
+ value, which is a measure of inter-rater agreement between two raters
+ that provide discrete numeric ratings. Potential values range from -1
+ (representing complete disagreement) to 1 (representing complete
+ agreement). A kappa value of 0 is expected if all agreement is due to
+ chance.
+
+ linear_weighted_kappa(rater_a, rater_b), where rater_a and rater_b
+ each correspond to a list of integer ratings. These lists must have the
+ same length.
+
+ The ratings should be integers, and it is assumed that they contain
+ the complete range of possible ratings.
+
+    linear_weighted_kappa(rater_a, rater_b, min_rating, max_rating), where
+    min_rating is the minimum possible rating, and max_rating is the maximum
+    possible rating
+ """
+ assert(len(rater_a) == len(rater_b))
+ if min_rating is None:
+ min_rating = min(rater_a + rater_b)
+ if max_rating is None:
+ max_rating = max(rater_a + rater_b)
+ conf_mat = confusion_matrix(rater_a, rater_b,
+ min_rating, max_rating)
+ num_ratings = len(conf_mat)
+ num_scored_items = float(len(rater_a))
+
+ hist_rater_a = histogram(rater_a, min_rating, max_rating)
+ hist_rater_b = histogram(rater_b, min_rating, max_rating)
+
+ numerator = 0.0
+ denominator = 0.0
+
+ for i in range(num_ratings):
+ for j in range(num_ratings):
+ expected_count = (hist_rater_a[i] * hist_rater_b[j]
+ / num_scored_items)
+ d = abs(i - j) / float(num_ratings - 1)
+ numerator += d * conf_mat[i][j] / num_scored_items
+ denominator += d * expected_count / num_scored_items
+
+ return 1.0 - numerator / denominator
+
+
+def kappa(rater_a, rater_b, min_rating=None, max_rating=None):
+ """
+ Calculates the kappa
+ kappa calculates the kappa
+ value, which is a measure of inter-rater agreement between two raters
+ that provide discrete numeric ratings. Potential values range from -1
+ (representing complete disagreement) to 1 (representing complete
+ agreement). A kappa value of 0 is expected if all agreement is due to
+ chance.
+
+ kappa(rater_a, rater_b), where rater_a and rater_b
+ each correspond to a list of integer ratings. These lists must have the
+ same length.
+
+ The ratings should be integers, and it is assumed that they contain
+ the complete range of possible ratings.
+
+    kappa(rater_a, rater_b, min_rating, max_rating), where min_rating
+    is the minimum possible rating, and max_rating is the maximum possible
+    rating
+ """
+ assert(len(rater_a) == len(rater_b))
+ if min_rating is None:
+ min_rating = min(rater_a + rater_b)
+ if max_rating is None:
+ max_rating = max(rater_a + rater_b)
+ conf_mat = confusion_matrix(rater_a, rater_b,
+ min_rating, max_rating)
+ num_ratings = len(conf_mat)
+ num_scored_items = float(len(rater_a))
+
+ hist_rater_a = histogram(rater_a, min_rating, max_rating)
+ hist_rater_b = histogram(rater_b, min_rating, max_rating)
+
+ numerator = 0.0
+ denominator = 0.0
+
+ for i in range(num_ratings):
+ for j in range(num_ratings):
+ expected_count = (hist_rater_a[i] * hist_rater_b[j]
+ / num_scored_items)
+ if i == j:
+ d = 0.0
+ else:
+ d = 1.0
+ numerator += d * conf_mat[i][j] / num_scored_items
+ denominator += d * expected_count / num_scored_items
+
+ return 1.0 - numerator / denominator
+
+
+def mean_quadratic_weighted_kappa(kappas, weights=None):
+ """
+ Calculates the mean of the quadratic
+ weighted kappas after applying Fisher's r-to-z transform, which is
+ approximately a variance-stabilizing transformation. This
+ transformation is undefined if one of the kappas is 1.0, so all kappa
+ values are capped in the range (-0.999, 0.999). The reverse
+ transformation is then applied before returning the result.
+
+ mean_quadratic_weighted_kappa(kappas), where kappas is a vector of
+ kappa values
+
+ mean_quadratic_weighted_kappa(kappas, weights), where weights is a vector
+ of weights that is the same size as kappas. Weights are applied in the
+ z-space
+ """
+ kappas = np.array(kappas, dtype=float)
+ if weights is None:
+ weights = np.ones(np.shape(kappas))
+ else:
+ weights = weights / np.mean(weights)
+
+ # ensure that kappas are in the range [-.999, .999]
+ kappas = np.array([min(x, .999) for x in kappas])
+ kappas = np.array([max(x, -.999) for x in kappas])
+
+ z = 0.5 * np.log((1 + kappas) / (1 - kappas)) * weights
+ z = np.mean(z)
+ return (np.exp(2 * z) - 1) / (np.exp(2 * z) + 1)
+
+
+def weighted_mean_quadratic_weighted_kappa(solution, submission):
+ predicted_score = submission[submission.columns[-1]].copy()
+ predicted_score.name = "predicted_score"
+ if predicted_score.index[0] == 0:
+ predicted_score = predicted_score[:len(solution)]
+ predicted_score.index = solution.index
+ combined = solution.join(predicted_score, how="left")
+ groups = combined.groupby(by="essay_set")
+ kappas = [quadratic_weighted_kappa(group[1]["essay_score"], group[1]["predicted_score"]) for group in groups]
+    weights = [group[1]["essay_weight"].iloc[0] for group in groups]
+ return mean_quadratic_weighted_kappa(kappas, weights=weights)
diff --git a/stepwise_selection.py b/stepwise_selection.py
new file mode 100644
index 0000000..01cc2a1
--- /dev/null
+++ b/stepwise_selection.py
@@ -0,0 +1,87 @@
+import pandas as pd
+import statsmodels.api as sm
+from statsmodels.formula.api import ols
+
+
+def stepwiseSelection(data, tag,
+                      initial_list=None,
+                      threshold_in=0.05,
+                      threshold_out=0.10,
+                      verbose=True):
+    '''
+    data: DataFrame containing the independent/dependent variables
+    tag: name of the target column
+    initial_list: independent variables that must be included
+    threshold_in: partial F-test threshold for entering a variable
+    threshold_out: partial F-test threshold for eliminating a variable
+    verbose: show stepwise details of entering and eliminating
+    >>> NOTE that {threshold_in < threshold_out} must be satisfied!
+    '''
+
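+    # column names containing patsy formula operators (e.g. '+' or '~') would
+    # break the ols() formula string; strip the marks and keep a mapping so the
+    # original names can be restored in the returned list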
+ ab_dic = dict()
+ ab_mark = ['~', '+', '-', ':', '*', '/']
+ for col in data.columns:
+ for mark in ab_mark:
+ if mark in col:
+ ab_dic[col.replace(mark, '')] = col
+ data.rename(columns={col: col.replace(mark, '')}, inplace=True)
+
+    included = list(initial_list) if initial_list else []  # copy, so a shared default is never mutated
+    formula = f'{tag}~1'  # start from an intercept-only (constant) model
+ best_r2_dif = .0
+
+ while True: # end loop when no variable gets in/out
+
+ excluded = list(set(data.columns) - set(included))
+ excluded.remove(tag)
+ changed = False
+
+ full_model = ols(formula=formula, data=data).fit()
+ last_adj_r2 = full_model.rsquared_adj
+
+        # forward step
+        best_feature = None
+        for new_feature in excluded:
+
+            # Note that test_model has one more variable than full_model
+            test_model = ols(formula=formula + f'+{new_feature}', data=data).fit()
+
+            # keep the candidate whose contribution to adjusted R^2 is largest;
+            # the baseline last_adj_r2 stays fixed while candidates are compared
+            if test_model.rsquared_adj - last_adj_r2 > best_r2_dif:
+                best_r2_dif = test_model.rsquared_adj - last_adj_r2
+                best_feature = new_feature
+
+        # Partial F-test on the best candidate
+        # Note that in anova_lm the model with fewer variables comes first
+        if best_feature is not None:
+            full_model_pro = ols(formula=formula + f'+{best_feature}', data=data).fit()
+            anova_tbl = sm.stats.anova_lm(full_model, full_model_pro)
+            criterion = anova_tbl['Pr(>F)'][1]
+
+            if criterion <= threshold_in:
+                included.append(best_feature)
+                formula += f'+{best_feature}'
+                full_model = full_model_pro
+                changed = True
+                best_r2_dif = .0
+                if verbose:
+                    print('Add {:25} with f_pvalue {:.6}'.format(best_feature, criterion))
+
+        # backward step (iterate over a copy, since included may shrink)
+        for old_feature in list(included):
+            test_model = ols(formula=formula.replace(f'+{old_feature}', ''), data=data).fit()
+
+            # Note that test_model has one variable less than full_model
+            anova_tbl = sm.stats.anova_lm(test_model, full_model)
+            criterion = anova_tbl['Pr(>F)'][1]
+            if criterion >= threshold_out:
+                included.remove(old_feature)
+                formula = formula.replace(f'+{old_feature}', '')
+                full_model = test_model  # keep the reference model in sync
+                changed = True
+                best_r2_dif = .0
+                if verbose:
+                    print('Drop {:25} with f_pvalue {:.6}'.format(old_feature, criterion))
+
+ if not changed:
+ break
+
+ return [ab_dic[x] if x in ab_dic.keys() else x for x in included]
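+
+
+# Usage sketch (hypothetical file name; any DataFrame with a 'SCORE' column works):
+#   df = pd.read_csv('essay_linguistic_indices.csv')
+#   selected = stepwiseSelection(df, 'SCORE', verbose=True)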