From 3b906ea97e8a88a539e505936960f94d86b3a20f Mon Sep 17 00:00:00 2001
From: LemonadeXyz <62137944+LemonadeXyz@users.noreply.github.com>
Date: Fri, 20 Aug 2021 21:53:36 +0800
Subject: [PATCH] Add files via upload
20210820 by Y.Wang
---
README.md | 52 +++++++-
main.py | 248 ++++++++++++++++++++++++++++++++++++
my_kappa_calculator.py | 18 +++
quadratic_weighted_kappa.py | 229 +++++++++++++++++++++++++++++++++
stepwise_selection.py | 87 +++++++++++++
5 files changed, 632 insertions(+), 2 deletions(-)
create mode 100644 main.py
create mode 100644 my_kappa_calculator.py
create mode 100644 quadratic_weighted_kappa.py
create mode 100644 stepwise_selection.py
diff --git a/README.md b/README.md
index 76f4b08..28a3f73 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,50 @@
-# L2C-rater
-Automated Essay Scoring Method for Chinese Second Language Writing
+# L2C-rater (Automated Essay Scoring Method for Chinese Second Language Writing)
+
+An automated essay scoring method for Chinese L2 writing that combines 90 linguistic features with TF-IDF text representations, based on linear models, tree-structured models, and logistic regression models, especially Ordinal Logistic Regression (OLR).
+
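+As a minimal sketch of the core idea (not the actual pipeline in main.py; the toy essays, the random placeholder for the 90 linguistic features, and the parameter values below are illustrative assumptions), the two feature families can be combined and fed to an ordinal model like this:
+
+```python
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+from mord import LogisticAT
+
+# toy pre-segmented essays (tokens joined by spaces) and ordinal scores
+essays = ["我 喜欢 学习 汉语", "汉语 很 难 但是 很 有趣",
+          "我 每天 练习 写 汉字", "写 作文 需要 很多 词汇"]
+scores = np.array([0, 1, 2, 3])
+
+# random placeholder for the 90 linguistic features main.py computes with pyltp
+ling = np.random.rand(4, 90)
+
+# TF-IDF over whitespace-separated tokens, concatenated with linguistic features
+tfidf = TfidfVectorizer(token_pattern=r"\S+").fit_transform(essays).toarray()
+X = np.concatenate((tfidf, ling), axis=1)
+
+olr = LogisticAT(alpha=1.0).fit(X, scores)  # Ordinal Logistic Regression
+print(olr.predict(X))
+```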
+### Environment
+Python 3.6+
+
+### Main Packages
+numpy 1.19.5
+pandas 1.1.5
+scipy 1.5.2
+scikit-learn 0.24.1
+pyltp 0.1.9.1
+jieba
+xgboost
+mord
+
+### Set Up ###
+
+* Configure the environment
+* Prepare data
+* Run main.py
+
+### Dataset ###
+
+We evaluate our system with 5-fold cross validation on the HSK Dynamic Composition Corpus, which is publicly available [here](http://hsk.blcu.edu.cn/). This code can also be run with the sample data provided [here]().
+
+### Options
+
+You can see the list of available options by running:
+
+```bash
+python main.py -h
+```
+
+For ```feature_mode```, ```t``` means using the **text representation** only when training the model, ```l``` means using the **linguistic features** only, and ```b``` means using both of the above.
+
+For ```feature_type```, ```c``` means segmenting the essay to the **character** level when generating the text representation, ```w``` means segmenting to the **word** level, ```cw``` means using both of the above, ```wp``` means segmenting to the **word** level and tagging **part-of-speech (POS)**, and ```cwp``` means using both ```c``` and ```wp```.
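+
+For instance, to train on linguistic features only, or on a word-plus-POS text representation only:
+
+```bash
+python main.py -m l
+python main.py -m t -t wp
+```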
+
+### Example ###
+
+The following command trains the best model among all automated models. Note that you will not get convincing results with the sample data provided [here](), because it is not large enough.
+
+```bash
+python main.py -m b -t wp
+```
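+
+Here ```-m b``` trains on both the text representation and the linguistic features, and ```-t wp``` segments essays to the word level with POS tags.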
+
+### Publication ###
+
+Wang Y., Hu R. (2021) A Prompt-Independent and Interpretable Automated Essay Scoring Method for Chinese Second Language Writing. In: Li S. et al. (eds) Chinese Computational Linguistics. CCL 2021. Lecture Notes in Computer Science, vol 12869. Springer, Cham. https://doi.org/10.1007/978-3-030-84186-7_30
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..f59503e
--- /dev/null
+++ b/main.py
@@ -0,0 +1,248 @@
+import argparse
+import time
+import logging
+import numpy as np
+import pandas as pd
+import scipy.stats
+import xgboost as xgb
+from sklearn.utils import shuffle
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.linear_model import LinearRegression, Lasso, Ridge, LogisticRegression
+from sklearn.ensemble import RandomForestRegressor
+from mord import LogisticAT, OrdinalRidge
+from sklearn.metrics import mean_squared_error, confusion_matrix
+from utils.text import *
+from utils.text_repre import *
+from linguistic import getLinguisticIndices
+from stepwise_selection import *
+from my_kappa_calculator import quadratic_weighted_kappa as qwk
+
+# set logger
+logging.basicConfig(filename='hsk.log', level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# set argparse
+parser = argparse.ArgumentParser()
+parser.add_argument("-mp", "--modelpath", dest="model_path", type=str, metavar='', default='./ltp_models',
+ help="the model directory")
+parser.add_argument("-dp", "--datapath", dest="data_path", type=str, metavar='', default='./data',
+ help="the input directory")
+parser.add_argument("-m", "--feature_mode", dest="feature_mode", type=str, metavar='', default='l',
+ help="feature mode (t|l|b)(default=l)")
+
+parser.add_argument("-t", "--feature_type", dest="feature_type", type=str, metavar='', default='c',
+ help="feature type (c|w|cw|wp|cwp) (default=c)")
+
+parser.add_argument("--max_depth", dest="max_depth", type=int, metavar='', default=40,
+ help="max_depth of random forest regressior")
+
+parser.add_argument("--maxd_xgb", dest="maxd_xgb", type=int, metavar='', default=3,
+ help="max_depth of XGBRegressor")
+parser.add_argument("-lr", dest="learning_rate", type=float, metavar='', default=0.05,
+ help="learning_rate of XGBRegressor")
+parser.add_argument("--min_cw", dest="min_child_weight", type=int, metavar='', default=5,
+ help="min_child_weight of XGBRegressor")
+parser.add_argument("--n_estimators", dest="n_estimators", type=int, metavar='', default=300,
+ help="n_estimators of XGBRegressor")
+parser.add_argument("-gm", dest="gamma", type=int, metavar='', default=5,
+ help="gamma of XBClassifier")
+
+parser.add_argument("--olr_penalty", dest="olr_penalty", type=int, metavar='', default=1.,
+ help="regular parameter for OLR. Zero value means no regularization")
+parser.add_argument("--rdg_penalty", dest="rdg_penalty", type=int, metavar='', default=1.,
+ help="regular parameter for Ridge")
+
+parser.add_argument("--ngram_max", dest="max_n", type=int, metavar='', default=1,
+ help="upper bound of ngram range")
+parser.add_argument("--ngram_min", dest="min_n", type=int, metavar='', default=1,
+ help="lower bound of ngram range")
+parser.add_argument("--df_threshold", dest="min_df", type=int, metavar='', default=10,
+ help="document frequency when building vocabulary")
+args = parser.parse_args()
+
+model_path = args.model_path
+data_path = args.data_path
+feature_mode = args.feature_mode
+feature_type = args.feature_type
+max_depth = args.max_depth
+maxd_xgb = args.maxd_xgb
+n_estimators = args.n_estimators
+lr = args.learning_rate
+min_child_weight = args.min_child_weight
+gamma = args.gamma
+olr_penalty = args.olr_penalty
+rdg_penalty = args.rdg_penalty
+max_n = args.max_n
+min_n = args.min_n
+min_df = args.min_df
+
+logger.info("feature_mode: " + str(feature_mode))
+logger.info("feature_type: " + str(feature_type))
+logger.info("n-grams range: [ %s, %s]" % (str(min_n), str(max_n)))
+logger.info("term frequency threshold: " + str(min_df))
+
+t1 = time.time()
+
+# Generating Linguistics Features
+index_data = {}
+corpus = []
+df_data = pd.read_csv(data_path + '/essay_revised.csv')
+
+# shuffle
+# df_data = shuffle(df_data)
+# df_data = df_data.reset_index(drop=True)
+
+segmentor, postagger, parser = load_ltpmodel(model_path)
+
+for index, row in df_data.iterrows():
+ essay_id, essay_text, essay_score = row['essay_ID'], row['ESSAY'], row['SCORE']
+ text_dict = text_process(essay_text, segmentor, postagger, parser)
+    # build the corpus for the "t"/"b" feature modes from text_dict, the LTP parsing results
+ if feature_mode in ['t', 'b']:
+ corpus_line = get_text_feature_from_ltp_results(essay_text, text_dict, feature=feature_type)
+ corpus.append(corpus_line)
+ indices = getLinguisticIndices(text_dict)
+ indices['essay_ID'] = essay_id
+ indices['SCORE'] = essay_score
+ index_data[index] = indices
+
+release_ltpmodel(segmentor, postagger, parser)
+
+df_ling = pd.DataFrame.from_dict(index_data, orient='index')
+
+for column_name in df_ling.columns:
+ if column_name not in ['SCORE', 'essay_ID']:
+ x = df_ling[column_name].values.reshape(-1, 1)
+ min_max_scaler = MinMaxScaler()
+ df_ling[column_name] = min_max_scaler.fit_transform(x)
+
+df_ling = df_ling.fillna(0)
+df_ling.to_csv(data_path + '/essay_linguistic_indices.csv', index=False)
+
+ling_name = list(df_ling.columns)
+ling_name.remove('essay_ID')
+
+# Select effective linguistic features via stepwise regression
+logger.info('>>>>>>>>> STEPWISE REGRESSION <<<<<<<<<')
+features_automatic = stepwiseSelection(df_ling.filter(items=ling_name), 'SCORE', verbose=False)
+logger.info(f'Linguistics Features Selected:\n{features_automatic}')
+
+if feature_mode == 't': # text mode
+ # Get text representation
+ X = get_text_matrix(corpus, ngram_min=min_n, ngram_max=max_n, df_threshold=min_df)
+ Y = np.array([score2ord[score] for score in df_data['SCORE']])
+
+elif feature_mode == 'l': # ling mode
+ # Get linguistic Feature Vec
+ X = np.array(df_ling.filter(items=features_automatic))
+ Y = np.array([score2ord[score] for score in df_ling['SCORE']])
+
+else: # both mode
+ # Get text representation
+ txt_vec = get_text_matrix(corpus, ngram_min=min_n, ngram_max=max_n,
+ df_threshold=min_df, sparse=True)
+ # Get linguistic Feature Vec
+ lng_vec = np.array(df_ling.filter(items=features_automatic))
+ X = np.concatenate((txt_vec, lng_vec), axis=1)
+ Y = np.array([score2ord[score] for score in df_ling['SCORE']])
+
+
+# Set models and their parameters
+# Note that OrdinalRidge and Ridge Regression are identical
+models = {
+ 'LinearRegression': LinearRegression(),
+ 'Ridge': Ridge(alpha=rdg_penalty),
+ 'RandomForestRegressor': RandomForestRegressor(max_depth=max_depth, random_state=0),
+ 'XGBRegressor': xgb.XGBRegressor(max_depth=maxd_xgb,
+ learning_rate=lr,
+ n_estimators=n_estimators,
+ min_child_weight=min_child_weight,
+ subsample=0.7,
+ colsample_bytree=0.7,
+ reg_alpha=0,
+ reg_lambda=0,
+ silent=True,
+ objective='reg:gamma',
+ missing=None,
+ seed=123,
+ gamma=gamma),
+ 'LogisticRegression': LogisticRegression(max_iter=1000),
+ 'OrderedLR_LogisticAT': LogisticAT(alpha=olr_penalty)
+ # 'OrdinalRidge': OrdinalRidge(alpha=olr_penalty)
+ }
+
+# train/dev/test split
+# Rewrite the following code according to your data
+X_tr_val = X[:90]
+Y_tr_val = Y[:90]
+X_test = X[90:]
+Y_test = Y[90:]
+
+X_lst = np.array_split(X_tr_val, 5)
+Y_lst = np.array_split(Y_tr_val, 5)
+
+logger.info('>>>>>>>>>> Cross Validation <<<<<<<<<')
+
+# split the train+valid portion into 5 folds for cross validation
+for i in range(5):
+ X_frame = list()
+ Y_frame = list()
+ for j in range(5):
+ if j != i:
+ X_frame.append(X_lst[j])
+ Y_frame.append(Y_lst[j])
+ X_train = np.concatenate(X_frame, axis=0)
+ Y_train = np.concatenate(Y_frame, axis=0)
+ X_valid = X_lst[i]
+ Y_valid = Y_lst[i]
+
+ # write shape of train/dev/test in log
+ logger.info('\n########## Fold {} ##########'.format(i+1))
+ logger.info(f'tr: {X_train.shape} dev: {X_valid.shape} ts: {X_test.shape}')
+ logger.info(f'tr: {Y_train.shape} dev: {Y_valid.shape} ts: {Y_test.shape}')
+
+ for name, lm in models.items():
+ logger.info('\n=========== {} ============'.format(name))
+ lm.fit(X_train, Y_train)
+
+ # predict
+ Y_train_pred = lm.predict(X_train)
+ Y_valid_pred = lm.predict(X_valid)
+ Y_test_pred = lm.predict(X_test)
+
+        # clip out-of-range predictions to the score bounds [0, 11]
+ Y_train_pred[np.where(Y_train_pred < 0)] = 0
+ Y_train_pred[np.where(Y_train_pred > 11)] = 11
+ Y_valid_pred[np.where(Y_valid_pred < 0)] = 0
+ Y_valid_pred[np.where(Y_valid_pred > 11)] = 11
+ Y_test_pred[np.where(Y_test_pred < 0)] = 0
+ Y_test_pred[np.where(Y_test_pred > 11)] = 11
+
+ # transform predicted values to integer
+ Y_train_pred_int = np.rint(Y_train_pred).astype('int32')
+ Y_valid_pred_int = np.rint(Y_valid_pred).astype('int32')
+ Y_test_pred_int = np.rint(Y_test_pred).astype('int32')
+
+ # output
+ logger.info(f'[TRAIN QWK] {qwk(Y_train, Y_train_pred_int, 0, 11)}')
+ logger.info(f'[VALID QWK] {qwk(Y_valid, Y_valid_pred_int, 0, 11)}')
+ logger.info(f'[TEST QWK] {qwk(Y_test, Y_test_pred_int, 0, 11)}')
+
+ train_rmse = np.sqrt(mean_squared_error(Y_train, Y_train_pred_int))
+ valid_rmse = np.sqrt(mean_squared_error(Y_valid, Y_valid_pred_int))
+ test_rmse = np.sqrt(mean_squared_error(Y_test, Y_test_pred_int))
+ logger.info(f'[TRAIN RMSE] {train_rmse}')
+ logger.info(f'[VALID RMSE] {valid_rmse}')
+ logger.info(f'[TEST RMSE] {test_rmse}')
+
+ train_pears = scipy.stats.pearsonr(Y_train, Y_train_pred_int)
+ valid_pears = scipy.stats.pearsonr(Y_valid, Y_valid_pred_int)
+ test_pears = scipy.stats.pearsonr(Y_test, Y_test_pred_int)
+ logger.info(f'[TRAIN PEARS] {train_pears[0]}')
+ logger.info(f'[VALID PEARS] {valid_pears[0]}')
+ logger.info(f'[TEST PEARS] {test_pears[0]}')
+
+t2 = time.time()
+logger.info('\nMain program ran for %s s' % str(t2 - t1))
+logger.info('===============================================================\n')
diff --git a/my_kappa_calculator.py b/my_kappa_calculator.py
new file mode 100644
index 0000000..573f5e6
--- /dev/null
+++ b/my_kappa_calculator.py
@@ -0,0 +1,18 @@
+import numpy as np
+from quadratic_weighted_kappa import quadratic_weighted_kappa as qwk
+from quadratic_weighted_kappa import linear_weighted_kappa as lwk
+
+
+def assert_inputs(rater_a, rater_b):
+ assert np.issubdtype(rater_a.dtype, np.integer), 'Integer array expected, got ' + str(rater_a.dtype)
+ assert np.issubdtype(rater_b.dtype, np.integer), 'Integer array expected, got ' + str(rater_b.dtype)
+
+
+def quadratic_weighted_kappa(rater_a, rater_b, min_rating, max_rating):
+ assert_inputs(rater_a, rater_b)
+ return qwk(rater_a, rater_b, min_rating, max_rating)
+
+
+def linear_weighted_kappa(rater_a, rater_b, min_rating, max_rating):
+ assert_inputs(rater_a, rater_b)
+ return lwk(rater_a, rater_b, min_rating, max_rating)
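+
+
+# Illustrative usage (integer numpy arrays are required by assert_inputs):
+#   quadratic_weighted_kappa(np.array([0, 1, 2]), np.array([0, 1, 3]), 0, 11)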
diff --git a/quadratic_weighted_kappa.py b/quadratic_weighted_kappa.py
new file mode 100644
index 0000000..a12a5d0
--- /dev/null
+++ b/quadratic_weighted_kappa.py
@@ -0,0 +1,229 @@
+import numpy as np
+
+
+def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
+ """
+    Returns the confusion matrix between two raters' ratings
+ """
+ assert(len(rater_a) == len(rater_b))
+ if min_rating is None: # if there's no min, get min
+        min_rating = min(rater_a + rater_b)  # list concatenation, not element-wise addition
+ if max_rating is None: # if there's no max, get max
+ max_rating = max(rater_a + rater_b)
+ num_ratings = int(max_rating - min_rating + 1)
+ # initialize the conf_mat to all zeros
+ conf_mat = [[0 for i in range(num_ratings)]
+ for j in range(num_ratings)]
+    for a, b in zip(rater_a, rater_b):  # count co-occurrences of each pair of ratings
+ conf_mat[a - min_rating][b - min_rating] += 1
+ return conf_mat
+
+
+def histogram(ratings, min_rating=None, max_rating=None):
+ """
+ Returns the counts of each type of rating that a rater made
+ """
+ if min_rating is None:
+ min_rating = min(ratings)
+ if max_rating is None:
+ max_rating = max(ratings)
+ num_ratings = int(max_rating - min_rating + 1)
+ hist_ratings = [0 for x in range(num_ratings)]
+ for r in ratings:
+ hist_ratings[r - min_rating] += 1
+ return hist_ratings
+
+
+def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None):
+ """
+ Calculates the quadratic weighted kappa
+ quadratic_weighted_kappa calculates the quadratic weighted kappa
+ value, which is a measure of inter-rater agreement between two raters
+ that provide discrete numeric ratings. Potential values range from -1
+ (representing complete disagreement) to 1 (representing complete
+ agreement). A kappa value of 0 is expected if all agreement is due to
+ chance.
+
+ quadratic_weighted_kappa(rater_a, rater_b), where rater_a and rater_b
+ each correspond to a list of integer ratings. These lists must have the
+ same length.
+
+ The ratings should be integers, and it is assumed that they contain
+ the complete range of possible ratings.
+
+    quadratic_weighted_kappa(rater_a, rater_b, min_rating, max_rating), where
+    min_rating is the minimum possible rating, and max_rating is the maximum
+    possible rating
+ """
+ rater_a = np.array(rater_a, dtype=int)
+ rater_b = np.array(rater_b, dtype=int)
+ assert(len(rater_a) == len(rater_b))
+ if min_rating is None:
+ min_rating = min(min(rater_a), min(rater_b))
+ if max_rating is None:
+ max_rating = max(max(rater_a), max(rater_b))
+ conf_mat = confusion_matrix(rater_a, rater_b,
+ min_rating, max_rating)
+ num_ratings = len(conf_mat)
+ num_scored_items = float(len(rater_a))
+
+    # histograms of each rater's rating counts
+ hist_rater_a = histogram(rater_a, min_rating, max_rating)
+ hist_rater_b = histogram(rater_b, min_rating, max_rating)
+
+ numerator = 0.0
+ denominator = 0.0
+
+ for i in range(num_ratings):
+ for j in range(num_ratings):
+            # expected_count is the (i, j) entry of the outer product of the two histograms, normalized by the item count
+ expected_count = (hist_rater_a[i] * hist_rater_b[j]
+ / num_scored_items)
+ d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)
+ numerator += d * conf_mat[i][j] / num_scored_items
+ denominator += d * expected_count / num_scored_items
+
+ return 1.0 - numerator / denominator
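+
+# Illustrative sanity checks:
+#   quadratic_weighted_kappa([1, 2, 3], [1, 2, 3]) -> 1.0   (perfect agreement)
+#   quadratic_weighted_kappa([1, 2, 3], [3, 2, 1]) -> -1.0  (complete reversal)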
+
+
+def linear_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None):
+ """
+ Calculates the linear weighted kappa
+ linear_weighted_kappa calculates the linear weighted kappa
+ value, which is a measure of inter-rater agreement between two raters
+ that provide discrete numeric ratings. Potential values range from -1
+ (representing complete disagreement) to 1 (representing complete
+ agreement). A kappa value of 0 is expected if all agreement is due to
+ chance.
+
+ linear_weighted_kappa(rater_a, rater_b), where rater_a and rater_b
+ each correspond to a list of integer ratings. These lists must have the
+ same length.
+
+ The ratings should be integers, and it is assumed that they contain
+ the complete range of possible ratings.
+
+    linear_weighted_kappa(rater_a, rater_b, min_rating, max_rating), where
+    min_rating is the minimum possible rating, and max_rating is the maximum
+    possible rating
+ """
+ assert(len(rater_a) == len(rater_b))
+ if min_rating is None:
+ min_rating = min(rater_a + rater_b)
+ if max_rating is None:
+ max_rating = max(rater_a + rater_b)
+ conf_mat = confusion_matrix(rater_a, rater_b,
+ min_rating, max_rating)
+ num_ratings = len(conf_mat)
+ num_scored_items = float(len(rater_a))
+
+ hist_rater_a = histogram(rater_a, min_rating, max_rating)
+ hist_rater_b = histogram(rater_b, min_rating, max_rating)
+
+ numerator = 0.0
+ denominator = 0.0
+
+ for i in range(num_ratings):
+ for j in range(num_ratings):
+ expected_count = (hist_rater_a[i] * hist_rater_b[j]
+ / num_scored_items)
+ d = abs(i - j) / float(num_ratings - 1)
+ numerator += d * conf_mat[i][j] / num_scored_items
+ denominator += d * expected_count / num_scored_items
+
+ return 1.0 - numerator / denominator
+
+
+def kappa(rater_a, rater_b, min_rating=None, max_rating=None):
+ """
+ Calculates the kappa
+ kappa calculates the kappa
+ value, which is a measure of inter-rater agreement between two raters
+ that provide discrete numeric ratings. Potential values range from -1
+ (representing complete disagreement) to 1 (representing complete
+ agreement). A kappa value of 0 is expected if all agreement is due to
+ chance.
+
+ kappa(rater_a, rater_b), where rater_a and rater_b
+ each correspond to a list of integer ratings. These lists must have the
+ same length.
+
+ The ratings should be integers, and it is assumed that they contain
+ the complete range of possible ratings.
+
+    kappa(rater_a, rater_b, min_rating, max_rating), where min_rating
+    is the minimum possible rating, and max_rating is the maximum possible
+    rating
+ """
+ assert(len(rater_a) == len(rater_b))
+ if min_rating is None:
+ min_rating = min(rater_a + rater_b)
+ if max_rating is None:
+ max_rating = max(rater_a + rater_b)
+ conf_mat = confusion_matrix(rater_a, rater_b,
+ min_rating, max_rating)
+ num_ratings = len(conf_mat)
+ num_scored_items = float(len(rater_a))
+
+ hist_rater_a = histogram(rater_a, min_rating, max_rating)
+ hist_rater_b = histogram(rater_b, min_rating, max_rating)
+
+ numerator = 0.0
+ denominator = 0.0
+
+ for i in range(num_ratings):
+ for j in range(num_ratings):
+ expected_count = (hist_rater_a[i] * hist_rater_b[j]
+ / num_scored_items)
+ if i == j:
+ d = 0.0
+ else:
+ d = 1.0
+ numerator += d * conf_mat[i][j] / num_scored_items
+ denominator += d * expected_count / num_scored_items
+
+ return 1.0 - numerator / denominator
+
+
+def mean_quadratic_weighted_kappa(kappas, weights=None):
+ """
+ Calculates the mean of the quadratic
+ weighted kappas after applying Fisher's r-to-z transform, which is
+ approximately a variance-stabilizing transformation. This
+ transformation is undefined if one of the kappas is 1.0, so all kappa
+ values are capped in the range (-0.999, 0.999). The reverse
+ transformation is then applied before returning the result.
+
+ mean_quadratic_weighted_kappa(kappas), where kappas is a vector of
+ kappa values
+
+ mean_quadratic_weighted_kappa(kappas, weights), where weights is a vector
+ of weights that is the same size as kappas. Weights are applied in the
+ z-space
+ """
+ kappas = np.array(kappas, dtype=float)
+ if weights is None:
+ weights = np.ones(np.shape(kappas))
+ else:
+ weights = weights / np.mean(weights)
+
+ # ensure that kappas are in the range [-.999, .999]
+ kappas = np.array([min(x, .999) for x in kappas])
+ kappas = np.array([max(x, -.999) for x in kappas])
+
+ z = 0.5 * np.log((1 + kappas) / (1 - kappas)) * weights
+ z = np.mean(z)
+ return (np.exp(2 * z) - 1) / (np.exp(2 * z) + 1)
+
+
+def weighted_mean_quadratic_weighted_kappa(solution, submission):
+ predicted_score = submission[submission.columns[-1]].copy()
+ predicted_score.name = "predicted_score"
+ if predicted_score.index[0] == 0:
+ predicted_score = predicted_score[:len(solution)]
+ predicted_score.index = solution.index
+ combined = solution.join(predicted_score, how="left")
+ groups = combined.groupby(by="essay_set")
+ kappas = [quadratic_weighted_kappa(group[1]["essay_score"], group[1]["predicted_score"]) for group in groups]
+    weights = [group[1]["essay_weight"].iloc[0] for group in groups]
+ return mean_quadratic_weighted_kappa(kappas, weights=weights)
diff --git a/stepwise_selection.py b/stepwise_selection.py
new file mode 100644
index 0000000..01cc2a1
--- /dev/null
+++ b/stepwise_selection.py
@@ -0,0 +1,87 @@
+import pandas as pd
+import statsmodels.api as sm
+from statsmodels.formula.api import ols
+
+
+def stepwiseSelection(data, tag,
+                      initial_list=None,
+                      threshold_in=0.05,
+                      threshold_out=0.10,
+                      verbose=True):
+    '''
+    data: DataFrame containing the independent/dependent variables
+    tag: name of the target column
+    initial_list: independent variables that must be included
+    threshold_in: partial F-test threshold for entering a variable
+    threshold_out: partial F-test threshold for eliminating a variable
+    verbose: show stepwise details of entering and eliminating
+    >>> NOTE that {threshold_in < threshold_out} must be satisfied!
+    '''
+
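+    # column names containing patsy formula operators (e.g. '+' or '~') would
+    # break the ols() formula string; strip the marks and keep a mapping so the
+    # original names can be restored in the returned list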
+ ab_dic = dict()
+ ab_mark = ['~', '+', '-', ':', '*', '/']
+ for col in data.columns:
+ for mark in ab_mark:
+ if mark in col:
+ ab_dic[col.replace(mark, '')] = col
+ data.rename(columns={col: col.replace(mark, '')}, inplace=True)
+
+    included = list(initial_list) if initial_list else []  # copy, so a shared default is never mutated
+    formula = f'{tag}~1'  # start from an intercept-only (constant) model
+ best_r2_dif = .0
+
+ while True: # end loop when no variable gets in/out
+
+ excluded = list(set(data.columns) - set(included))
+ excluded.remove(tag)
+ changed = False
+
+ full_model = ols(formula=formula, data=data).fit()
+ last_adj_r2 = full_model.rsquared_adj
+
+        # forward step
+        best_feature = None
+        for new_feature in excluded:
+
+            # Note that test_model has one more variable than full_model
+            test_model = ols(formula=formula + f'+{new_feature}', data=data).fit()
+
+            # keep the candidate whose contribution to adjusted R^2 is largest;
+            # the baseline last_adj_r2 stays fixed while candidates are compared
+            if test_model.rsquared_adj - last_adj_r2 > best_r2_dif:
+                best_r2_dif = test_model.rsquared_adj - last_adj_r2
+                best_feature = new_feature
+
+        # Partial F-test on the best candidate
+        # Note that in anova_lm the model with fewer variables comes first
+        if best_feature is not None:
+            full_model_pro = ols(formula=formula + f'+{best_feature}', data=data).fit()
+            anova_tbl = sm.stats.anova_lm(full_model, full_model_pro)
+            criterion = anova_tbl['Pr(>F)'][1]
+
+            if criterion <= threshold_in:
+                included.append(best_feature)
+                formula += f'+{best_feature}'
+                full_model = full_model_pro
+                changed = True
+                best_r2_dif = .0
+                if verbose:
+                    print('Add {:25} with f_pvalue {:.6}'.format(best_feature, criterion))
+
+        # backward step (iterate over a copy, since included may shrink)
+        for old_feature in list(included):
+            test_model = ols(formula=formula.replace(f'+{old_feature}', ''), data=data).fit()
+
+            # Note that test_model has one variable less than full_model
+            anova_tbl = sm.stats.anova_lm(test_model, full_model)
+            criterion = anova_tbl['Pr(>F)'][1]
+            if criterion >= threshold_out:
+                included.remove(old_feature)
+                formula = formula.replace(f'+{old_feature}', '')
+                full_model = test_model  # keep the reference model in sync
+                changed = True
+                best_r2_dif = .0
+                if verbose:
+                    print('Drop {:25} with f_pvalue {:.6}'.format(old_feature, criterion))
+
+ if not changed:
+ break
+
+ return [ab_dic[x] if x in ab_dic.keys() else x for x in included]
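+
+
+# Usage sketch (hypothetical file name; any DataFrame with a 'SCORE' column works):
+#   df = pd.read_csv('essay_linguistic_indices.csv')
+#   selected = stepwiseSelection(df, 'SCORE', verbose=True)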