param <- list("objective" = "multi:softprob",
"eval_metric" = "mlogloss",
"num_class" = 12)
cv.nround <- 11
cv.nfold <- 5
mdcv <-xgb.cv(data=dtrain,params = param,nthread=6,nfold = cv.nfold,nrounds = cv.nround,verbose = T)
md <-xgb.train(data=dtrain,params = param,nround = 80,watchlist = list(train=dtrain,test=dtest),nthread=6)
This post was edited by Rakesh Racharla at September 12, 2020 5:47 PM IST
I think you need more details about the parameters used in your code. I would try to explain these parameters as follow :
start_char: int. This character is used to mark the start of the sequence. This function needs to Set to 1 because 0 is usually the padding character.
oov_char: int. words that were cut out because of the num_words or skip_top limit will be replaced with this character.
index_from: int. Index actual words with this index and higher.
It looks like the word indices in your dictionary starts from 1.
If you noticed that the indices returned by your keras have <START> and <UNKNOWN> as indexes 1 and 2. (And it assumes you will use 0 for <PADDING>).
Code for solution:
mport keras
NUM_WORDS=1000 # only use top 1000 words
INDEX_FROM=3 # word index offset
train,test = keras.datasets.imdb.load_data(num_words=NUM_WORDS, index_from=INDEX_FROM)
train_x,train_y = train
test_x,test_y = test
word_to_id = keras.datasets.imdb.get_word_index()
word_to_id = {k:(v+INDEX_FROM) for k,v in word_to_id.items()}
word_to_id["<PAD>"] = 0
word_to_id["<START>"] = 1
word_to_id["<UNK>"] = 2
id_to_word = {value:key for key,value in word_to_id.items()}
print(' '.join(id_to_word[id] for id in train_x[0] ))
Output:
"<START> this film was just brilliant casting <UNK> <UNK> story direction <UNK> really <UNK> the part they played and you could just imagine being there robert <UNK> is an amazing actor ...
min_rmse_index <- mdcv$best_iteration
min_rmse <- mdcv$evaluation_log[min_rmse_index]$test_rmse_mean
And depends on the application (linear, logistic,etc...), the objective, eval_metric and parameters shall be adjusted accordingly.
For the convenience of anyone who is running a regression, here is the slightly adjusted version of code (most are the same as above).
library(xgboost)
# Matrix for xgb: dtrain and dtest, "label" is the dependent variable
dtrain <- xgb.DMatrix(X_train, label = Y_train)
dtest <- xgb.DMatrix(X_test, label = Y_test)
best_param <- list()
best_seednumber <- 1234
best_rmse <- Inf
best_rmse_index <- 0
set.seed(123)
for (iter in 1:100) {
param <- list(objective = "reg:linear",
eval_metric = "rmse",
max_depth = sample(6:10, 1),
eta = runif(1, .01, .3), # Learning rate, default: 0.3
subsample = runif(1, .6, .9),
colsample_bytree = runif(1, .5, .8),
min_child_weight = sample(1:40, 1),
max_delta_step = sample(1:10, 1)
)
cv.nround <- 1000
cv.nfold <- 5 # 5-fold cross-validation
seed.number <- sample.int(10000, 1) # set seed for the cv
set.seed(seed.number)
mdcv <- xgb.cv(data = dtrain, params = param,
nfold = cv.nfold, nrounds = cv.nround,
verbose = F, early_stopping_rounds = 8, maximize = FALSE)
min_rmse_index <- mdcv$best_iteration
min_rmse <- mdcv$evaluation_log[min_rmse_index]$test_rmse_mean
if (min_rmse < best_rmse) {
best_rmse <- min_rmse
best_rmse_index <- min_rmse_index
best_seednumber <- seed.number
best_param <- param
}
}
# The best index (min_rmse_index) is the best "nround" in the model
nround = best_rmse_index
set.seed(best_seednumber)
xg_mod <- xgboost(data = dtest, params = best_param, nround = nround, verbose = F)
# Check error in testing data
yhat_xg <- predict(xg_mod, dtest)
(MSE_xgb <- mean((yhat_xg - Y_test)^2))
cv_folds <- KFold(dataFTR$isPreIctalTrain, nfolds = 5, stratified = FALSE, seed = seedNum)
xgb_cv_bayes <- function(nround,max.depth, min_child_weight, subsample,eta,gamma,colsample_bytree,max_delta_step) {
param<-list(booster = "gbtree",
max_depth = max.depth,
min_child_weight = min_child_weight,
eta=eta,gamma=gamma,
subsample = subsample, colsample_bytree = colsample_bytree,
max_delta_step=max_delta_step,
lambda = 1, alpha = 0,
objective = "binary:logistic",
eval_metric = "auc")
cv <- xgb.cv(params = param, data = dtrain, folds = cv_folds,nrounds = 1000,early_stopping_rounds = 10, maximize = TRUE, verbose = verbose)
list(Score = cv$evaluation_log$test_auc_mean[cv$best_iteration],
Pred=cv$best_iteration)
# we don't need cross-validation prediction and we need the number of rounds.
# a workaround is to pass the number of rounds(best_iteration) to the Pred, which is a default parameter in the rbayesianoptimization library.
}
OPT_Res <- BayesianOptimization(xgb_cv_bayes,
bounds = list(max.depth =c(3L, 10L),min_child_weight = c(1L, 40L),
subsample = c(0.6, 0.9),
eta=c(0.01,0.3),gamma = c(0.0, 0.2),
colsample_bytree=c(0.5,0.8),max_delta_step=c(1L,10L)),
init_grid_dt = NULL, init_points = 10, n_iter = 10,
acq = "ucb", kappa = 2.576, eps = 0.0,
verbose = verbose)
best_param <- list(
booster = "gbtree",
eval.metric = "auc",
objective = "binary:logistic",
max_depth = OPT_Res$Best_Par["max.depth"],
eta = OPT_Res$Best_Par["eta"],
gamma = OPT_Res$Best_Par["gamma"],
subsample = OPT_Res$Best_Par["subsample"],
colsample_bytree = OPT_Res$Best_Par["colsample_bytree"],
min_child_weight = OPT_Res$Best_Par["min_child_weight"],
max_delta_step = OPT_Res$Best_Par["max_delta_step"])
# number of rounds should be tuned using CV
#https://www.hackerearth.com/practice/machine-learning/machine-learning-algorithms/beginners-tutorial-on-xgboost-parameter-tuning-r/tutorial/
# However, nrounds can not be directly derivied from the bayesianoptimization function
# Here, OPT_Res$Pred, which was supposed to be used for cross-validation, is used to record the number of rounds
nrounds=OPT_Res$Pred[[which.max(OPT_Res$History$Value)]]
xgb_model <- xgb.train (params = best_param, data = dtrain, nrounds = nrounds)
param <- list(objective = "multi:softprob",
eval_metric = "mlogloss",
num_class = 12,
max_depth = 8,
eta = 0.05,
gamma = 0.01,
subsample = 0.9,
colsample_bytree = 0.8,
min_child_weight = 4,
max_delta_step = 1
)
cv.nround = 1000
cv.nfold = 5
mdcv <- xgb.cv(data=dtrain, params = param, nthread=6,
nfold=cv.nfold, nrounds=cv.nround,
verbose = T)
min_logloss = min(mdcv[, test.mlogloss.mean])
min_logloss_index = which.min(mdcv[, test.mlogloss.mean])
min_logloss = 2.1457, min_logloss_index = 840
min_logloss = 2.2293, min_logloss_index = 920
min_logloss = 1.9745, min_logloss_index = 780
# best_param is global best param with minimum min_logloss
# best_min_logloss_index is the global minimum logloss index
nround = 780
md <- xgb.train(data=dtrain, params=best_param, nrounds=nround, nthread=6)
mdcv <- xgb.cv(data=dtrain, params=param, nthread=6,
nfold=cv.nfold, nrounds=cv.nround,
verbose = T, early.stop.round=8, maximize=FALSE)
best_param = list()
best_seednumber = 1234
best_logloss = Inf
best_logloss_index = 0
for (iter in 1:100) {
param <- list(objective = "multi:softprob",
eval_metric = "mlogloss",
num_class = 12,
max_depth = sample(6:10, 1),
eta = runif(1, .01, .3),
gamma = runif(1, 0.0, 0.2),
subsample = runif(1, .6, .9),
colsample_bytree = runif(1, .5, .8),
min_child_weight = sample(1:40, 1),
max_delta_step = sample(1:10, 1)
)
cv.nround = 1000
cv.nfold = 5
seed.number = sample.int(10000, 1)[[1]]
set.seed(seed.number)
mdcv <- xgb.cv(data=dtrain, params = param, nthread=6,
nfold=cv.nfold, nrounds=cv.nround,
verbose = T, early.stop.round=8, maximize=FALSE)
min_logloss = min(mdcv[, test.mlogloss.mean])
min_logloss_index = which.min(mdcv[, test.mlogloss.mean])
if (min_logloss < best_logloss) {
best_logloss = min_logloss
best_logloss_index = min_logloss_index
best_seednumber = seed.number
best_param = param
}
}
nround = best_logloss_index
set.seed(best_seednumber)
md <- xgb.train(data=dtrain, params=best_param, nrounds=nround, nthread=6)
I found silo's answer is very helpful. In addition to his approach of random research, you may want to use Bayesian optimization to facilitate the process of hyperparameter search, e.g. rBayesianOptimization library. The following is my code with rbayesianoptimization library.
cv_folds <- KFold(dataFTR$isPreIctalTrain, nfolds = 5, stratified = FALSE, seed = seedNum)
xgb_cv_bayes <- function(nround,max.depth, min_child_weight, subsample,eta,gamma,colsample_bytree,max_delta_step) {
param<-list(booster = "gbtree",
max_depth = max.depth,
min_child_weight = min_child_weight,
eta=eta,gamma=gamma,
subsample = subsample, colsample_bytree = colsample_bytree,
max_delta_step=max_delta_step,
lambda = 1, alpha = 0,
objective = "binary:logistic",
eval_metric = "auc")
cv <- xgb.cv(params = param, data = dtrain, folds = cv_folds,nrounds = 1000,early_stopping_rounds = 10, maximize = TRUE, verbose = verbose)
list(Score = cv$evaluation_log$test_auc_mean[cv$best_iteration],
Pred=cv$best_iteration)
# we don't need cross-validation prediction and we need the number of rounds.
# a workaround is to pass the number of rounds(best_iteration) to the Pred, which is a default parameter in the rbayesianoptimization library.
}
OPT_Res <- BayesianOptimization(xgb_cv_bayes,
bounds = list(max.depth =c(3L, 10L),min_child_weight = c(1L, 40L),
subsample = c(0.6, 0.9),
eta=c(0.01,0.3),gamma = c(0.0, 0.2),
colsample_bytree=c(0.5,0.8),max_delta_step=c(1L,10L)),
init_grid_dt = NULL, init_points = 10, n_iter = 10,
acq = "ucb", kappa = 2.576, eps = 0.0,
verbose = verbose)
best_param <- list(
booster = "gbtree",
eval.metric = "auc",
objective = "binary:logistic",
max_depth = OPT_Res$Best_Par["max.depth"],
eta = OPT_Res$Best_Par["eta"],
gamma = OPT_Res$Best_Par["gamma"],
subsample = OPT_Res$Best_Par["subsample"],
colsample_bytree = OPT_Res$Best_Par["colsample_bytree"],
min_child_weight = OPT_Res$Best_Par["min_child_weight"],
max_delta_step = OPT_Res$Best_Par["max_delta_step"])
# number of rounds should be tuned using CV
#https://www.hackerearth.com/practice/machine-learning/machine-learning-algorithms/beginners-tutorial-on-xgboost-parameter-tuning-r/tutorial/
# However, nrounds can not be directly derivied from the bayesianoptimization function
# Here, OPT_Res$Pred, which was supposed to be used for cross-validation, is used to record the number of rounds
nrounds=OPT_Res$Pred[[which.max(OPT_Res$History$Value)]]
xgb_model <- xgb.train (params = best_param, data = dtrain, nrounds = nrounds)