model_training.R
library(xgboost)
library(caret)
# Prepare a data frame for XGBoost: split off the target column and
# convert the remaining features into the xgb.DMatrix format expected
# by xgb.train() and xgb.cv().
prepare_data_for_xgboost <- function(data, target_column) {
  # Separate the feature columns from the target column
  features <- data[, !colnames(data) %in% target_column, drop = FALSE]
  target <- data[[target_column]]
  # as.matrix() requires all feature columns to be numeric
  feature_matrix <- as.matrix(features)
  # xgb.DMatrix is XGBoost's internal data format
  dtrain <- xgb.DMatrix(data = feature_matrix, label = target)
  return(list(dtrain = dtrain, feature_names = colnames(features)))
}
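# A minimal guard one might call before prepare_data_for_xgboost() (a
# sketch, not part of the original script): as.matrix() on a data frame
# containing factor or character columns silently produces a character
# matrix, which xgb.DMatrix() cannot accept.
check_features_numeric <- function(features) {
  non_numeric <- names(features)[!vapply(features, is.numeric, logical(1))]
  if (length(non_numeric) > 0) {
    stop("Non-numeric feature columns: ", paste(non_numeric, collapse = ", "))
  }
  invisible(TRUE)
}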
# Main function for training an XGBoost binary classifier
train_xgboost_model <- function(data,
                                target_column,
                                params = list(),
                                nrounds = 100) {
  prepared_data <- prepare_data_for_xgboost(data, target_column)
  dtrain <- prepared_data$dtrain
  feature_names <- prepared_data$feature_names
  # Default hyperparameters for binary classification
  default_params <- list(
    objective = "binary:logistic",
    eval_metric = "auc",
    eta = 0.1,
    max_depth = 6,
    subsample = 0.8,
    colsample_bytree = 0.8
  )
  # User-provided parameters override the defaults
  final_params <- modifyList(default_params, params)
  # Train the model. Note: with only the training set in the watchlist,
  # early stopping monitors training AUC, which rarely stops improving;
  # a held-out validation set would make early stopping meaningful.
  model <- xgb.train(
    params = final_params,
    data = dtrain,
    nrounds = nrounds,
    watchlist = list(train = dtrain),
    early_stopping_rounds = 10,
    verbose = 1
  )
  # Keep the feature names with the model for later prediction
  model$feature_names <- feature_names
  return(model)
}
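# Sketch of scoring new data with a trained model (an addition, not part
# of the original script). It relies on the feature_names stored on the
# model above to select and order columns; with objective
# "binary:logistic", predict() returns probabilities.
predict_xgboost_model <- function(model, new_data) {
  feature_matrix <- as.matrix(new_data[, model$feature_names, drop = FALSE])
  predict(model, feature_matrix)
}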
# Cross-validation function
cross_validate_xgboost <- function(data, target_column, params = list(),
                                   nrounds = 100, nfold = 5,
                                   early_stopping_rounds = 10, verbose = 1) {
  # Prepare the data for XGBoost using the existing data preparation function
  prepared_data <- prepare_data_for_xgboost(data, target_column)
  dtrain <- prepared_data$dtrain
  # Define default parameters for the XGBoost model
  default_params <- list(
    objective = "binary:logistic",
    eval_metric = "auc",
    eta = 0.1,
    max_depth = 6,
    subsample = 0.8,
    colsample_bytree = 0.8
  )
  # Merge default parameters with any user-provided parameters
  final_params <- modifyList(default_params, params)
  # Pass the function arguments through rather than hardcoding them
  cv_results <- xgb.cv(
    params = final_params,
    data = dtrain,
    nrounds = nrounds,
    nfold = nfold,
    early_stopping_rounds = early_stopping_rounds,
    verbose = verbose
  )
  return(cv_results)
}
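# Helper sketch (an addition, not in the original script): xgb.cv()
# returns an object whose $evaluation_log records per-round mean/std
# metrics and whose $best_iteration is the round selected by early
# stopping. With eval_metric = "auc", the log columns include
# test_auc_mean and test_auc_std.
summarize_cv_results <- function(cv_results) {
  best <- cv_results$best_iteration
  log <- cv_results$evaluation_log
  list(
    best_iteration = best,
    test_auc_mean = log$test_auc_mean[best],
    test_auc_std = log$test_auc_std[best]
  )
}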
if (interactive()) {
  featured_data <- read.csv("data/processed/featured_ngs_data.csv")
  # Assume 'target' is the column name for our prediction target
  target_column <- "target"
  # Train model
  model <- train_xgboost_model(featured_data, target_column)
  # Perform cross-validation
  cv_results <- cross_validate_xgboost(featured_data, target_column)
  # Print results
  print(model)
  print(cv_results)
}
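# Persistence sketch (an addition; not part of the original script).
# saveRDS() preserves R-level attributes such as the feature_names set in
# train_xgboost_model(), whereas xgb.save() writes only the native booster.
save_xgboost_model <- function(model, path) {
  saveRDS(model, path)
}
load_xgboost_model <- function(path) {
  readRDS(path)
}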