The superml R package is designed to unify the model training process in R, much like Python. People often spend a lot of time searching for packages and figuring out the syntax for training machine learning models in R. This behaviour is especially apparent in users who frequently switch between R and Python. This package provides a Python scikit-learn style interface (fit, predict) to train models faster.
In addition to building machine learning models, the package provides handy functionalities for feature engineering.
This ambitious package is my ongoing effort to help the R community build ML models easily and quickly in R.
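To give a flavour of the interface, every trainer follows the same two-step pattern. Below is a minimal sketch; train_data, test_data and "target" are placeholder names, and RFTrainer with its arguments is shown in full later in this document.
library(superml)
# the same fit/predict pattern applies to every trainer
rf <- RFTrainer$new(n_estimators = 100)
rf$fit(X = train_data, y = "target")   # train_data: a data.frame / data.table
pred <- rf$predict(df = test_data)     # test_data: same features without the target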
You can install the latest CRAN version using (recommended):
install.packages("superml")
You can install the development version directly from github using:
devtools::install_github("saraswatmks/superml")
For machine learning, superml is built on top of existing R packages. Hence, installing the package does not install all the dependencies. Instead, while training any model, superml will automatically install a required package if it's not found. Still, if you want to install all dependencies at once, you can simply do:
install.packages("superml", dependencies=TRUE)
This package uses existing R packages to build machine learning models. In this tutorial, we'll use the data.table package for all tasks related to data manipulation.
We'll quickly prepare the data set to be served for model training.
load("../data/reg_train.rda")
# if the above doesn't work, you can try: load("reg_train.rda")
library(data.table)
library(caret)
#> Loading required package: ggplot2
#> Loading required package: lattice
library(superml)
library(Metrics)
#>
#> Attaching package: 'Metrics'
#> The following objects are masked from 'package:caret':
#>
#> precision, recall
head(reg_train)
#> Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour
#> 1: 1 60 RL 65 8450 Pave <NA> Reg Lvl
#> 2: 2 20 RL 80 9600 Pave <NA> Reg Lvl
#> 3: 3 60 RL 68 11250 Pave <NA> IR1 Lvl
#> 4: 4 70 RL 60 9550 Pave <NA> IR1 Lvl
#> 5: 5 60 RL 84 14260 Pave <NA> IR1 Lvl
#> 6: 6 50 RL 85 14115 Pave <NA> IR1 Lvl
#> Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType
#> 1: AllPub Inside Gtl CollgCr Norm Norm 1Fam
#> 2: AllPub FR2 Gtl Veenker Feedr Norm 1Fam
#> 3: AllPub Inside Gtl CollgCr Norm Norm 1Fam
#> 4: AllPub Corner Gtl Crawfor Norm Norm 1Fam
#> 5: AllPub FR2 Gtl NoRidge Norm Norm 1Fam
#> 6: AllPub Inside Gtl Mitchel Norm Norm 1Fam
#> HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl
#> 1: 2Story 7 5 2003 2003 Gable CompShg
#> 2: 1Story 6 8 1976 1976 Gable CompShg
#> 3: 2Story 7 5 2001 2002 Gable CompShg
#> 4: 2Story 7 5 1915 1970 Gable CompShg
#> 5: 2Story 8 5 2000 2000 Gable CompShg
#> 6: 1.5Fin 5 5 1993 1995 Gable CompShg
#> Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation
#> 1: VinylSd VinylSd BrkFace 196 Gd TA PConc
#> 2: MetalSd MetalSd None 0 TA TA CBlock
#> 3: VinylSd VinylSd BrkFace 162 Gd TA PConc
#> 4: Wd Sdng Wd Shng None 0 TA TA BrkTil
#> 5: VinylSd VinylSd BrkFace 350 Gd TA PConc
#> 6: VinylSd VinylSd None 0 TA TA Wood
#> BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
#> 1: Gd TA No GLQ 706 Unf
#> 2: Gd TA Gd ALQ 978 Unf
#> 3: Gd TA Mn GLQ 486 Unf
#> 4: TA Gd No ALQ 216 Unf
#> 5: Gd TA Av GLQ 655 Unf
#> 6: Gd TA No GLQ 732 Unf
#> BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
#> 1: 0 150 856 GasA Ex Y SBrkr
#> 2: 0 284 1262 GasA Ex Y SBrkr
#> 3: 0 434 920 GasA Ex Y SBrkr
#> 4: 0 540 756 GasA Gd Y SBrkr
#> 5: 0 490 1145 GasA Ex Y SBrkr
#> 6: 0 64 796 GasA Ex Y SBrkr
#> 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
#> 1: 856 854 0 1710 1 0 2
#> 2: 1262 0 0 1262 0 1 2
#> 3: 920 866 0 1786 1 0 2
#> 4: 961 756 0 1717 1 0 1
#> 5: 1145 1053 0 2198 1 0 2
#> 6: 796 566 0 1362 1 0 1
#> HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
#> 1: 1 3 1 Gd 8 Typ
#> 2: 0 3 1 TA 6 Typ
#> 3: 1 3 1 Gd 6 Typ
#> 4: 0 3 1 Gd 7 Typ
#> 5: 1 4 1 Gd 9 Typ
#> 6: 1 1 1 TA 5 Typ
#> Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars
#> 1: 0 <NA> Attchd 2003 RFn 2
#> 2: 1 TA Attchd 1976 RFn 2
#> 3: 1 TA Attchd 2001 RFn 2
#> 4: 1 Gd Detchd 1998 Unf 3
#> 5: 1 TA Attchd 2000 RFn 3
#> 6: 0 <NA> Attchd 1993 Unf 2
#> GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
#> 1: 548 TA TA Y 0 61
#> 2: 460 TA TA Y 298 0
#> 3: 608 TA TA Y 0 42
#> 4: 642 TA TA Y 0 35
#> 5: 836 TA TA Y 192 84
#> 6: 480 TA TA Y 40 30
#> EnclosedPorch 3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature
#> 1: 0 0 0 0 <NA> <NA> <NA>
#> 2: 0 0 0 0 <NA> <NA> <NA>
#> 3: 0 0 0 0 <NA> <NA> <NA>
#> 4: 272 0 0 0 <NA> <NA> <NA>
#> 5: 0 0 0 0 <NA> <NA> <NA>
#> 6: 0 320 0 0 <NA> MnPrv Shed
#> MiscVal MoSold YrSold SaleType SaleCondition SalePrice
#> 1: 0 2 2008 WD Normal 208500
#> 2: 0 5 2007 WD Normal 181500
#> 3: 0 9 2008 WD Normal 223500
#> 4: 0 2 2006 WD Abnorml 140000
#> 5: 0 12 2008 WD Normal 250000
#> 6: 700 10 2009 WD Normal 143000
split <- createDataPartition(y = reg_train$SalePrice, p = 0.7)
xtrain <- reg_train[split$Resample1]
xtest <- reg_train[!split$Resample1]
# remove features with 90% or more missing values
# we will also remove the Id column because it doesn't contain
# any useful information
na_cols <- colSums(is.na(xtrain)) / nrow(xtrain)
na_cols <- names(na_cols[which(na_cols > 0.9)])

xtrain[, c(na_cols, "Id") := NULL]
xtest[, c(na_cols, "Id") := NULL]
# encode categorical variables
cat_cols <- names(xtrain)[sapply(xtrain, is.character)]

for(c in cat_cols){
    lbl <- LabelEncoder$new()
    lbl$fit(c(xtrain[[c]], xtest[[c]]))
    xtrain[[c]] <- lbl$transform(xtrain[[c]])
    xtest[[c]] <- lbl$transform(xtest[[c]])
}
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
#> The data contains NA values. Imputing NA with 'NA'
# removing noise columns
noise <- c('GrLivArea','TotalBsmtSF')

xtrain[, c(noise) := NULL]
xtest[, c(noise) := NULL]

# fill missing values with -1
xtrain[is.na(xtrain)] <- -1
xtest[is.na(xtest)] <- -1
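Before training, it's worth a quick sanity check that no missing values remain (a verification step, not part of the original workflow):
# both should return 0 after the -1 imputation above
sum(is.na(xtrain))
sum(is.na(xtest))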
KNN Regression
knn <- KNNTrainer$new(k = 2, prob = T, type = 'reg')
knn$fit(train = xtrain, test = xtest, y = 'SalePrice')
probs <- knn$predict(type = 'prob')
labels <- knn$predict(type = 'raw')
rmse(actual = xtest$SalePrice, predicted = labels)
#> [1] 57335.33
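The choice of k = 2 above is arbitrary. Since the trainer exposes the same interface for any k, a small loop can compare a few values (a sketch reusing the functions above; the candidate values are illustrative):
# hypothetical sweep over k; smaller RMSE is better
for (k in c(2, 5, 10, 20)) {
    knn <- KNNTrainer$new(k = k, prob = T, type = 'reg')
    knn$fit(train = xtrain, test = xtest, y = 'SalePrice')
    labels <- knn$predict(type = 'raw')
    cat("k =", k, "rmse =", rmse(actual = xtest$SalePrice, predicted = labels), "\n")
}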
SVM Regression
svm <- SVMTrainer$new()
svm$fit(xtrain, 'SalePrice')
pred <- svm$predict(xtest)
rmse(actual = xtest$SalePrice, predicted = pred)
Simple Regression
<- LMTrainer$new(family="gaussian")
lf $fit(X = xtrain, y = "SalePrice")
lfsummary(lf$model)
#>
#> Call:
#> stats::glm(formula = f, family = self$family, data = X, weights = self$weights)
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -330921 -14137 -1245 11893 267369
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -1.226e+06 1.666e+06 -0.736 0.462036
#> MSSubClass -1.421e+02 5.630e+01 -2.524 0.011761 *
#> MSZoning 3.102e+02 1.468e+03 0.211 0.832694
#> LotFrontage -1.620e+01 3.474e+01 -0.466 0.641062
#> LotArea 3.865e-01 1.283e-01 3.012 0.002666 **
#> Street -1.813e+04 3.604e+04 -0.503 0.615036
#> LotShape 2.264e+03 2.204e+03 1.027 0.304638
#> LandContour -4.321e+03 1.693e+03 -2.553 0.010846 *
#> Utilities -5.654e+04 3.505e+04 -1.613 0.107070
#> LotConfig 7.147e+02 1.140e+03 0.627 0.530715
#> LandSlope 1.383e+04 4.566e+03 3.029 0.002519 **
#> Neighborhood -8.145e+02 2.015e+02 -4.042 5.73e-05 ***
#> Condition1 -2.848e+03 1.036e+03 -2.749 0.006087 **
#> Condition2 -1.319e+04 3.694e+03 -3.569 0.000376 ***
#> BldgType -3.667e+02 2.160e+03 -0.170 0.865229
#> HouseStyle 6.169e+01 8.609e+02 0.072 0.942886
#> OverallQual 1.635e+04 1.440e+03 11.355 < 2e-16 ***
#> OverallCond 5.797e+03 1.266e+03 4.578 5.33e-06 ***
#> YearBuilt 2.976e+02 8.817e+01 3.376 0.000766 ***
#> YearRemodAdd 1.628e+02 8.245e+01 1.974 0.048637 *
#> RoofStyle 6.383e+03 2.039e+03 3.131 0.001798 **
#> RoofMatl -1.326e+04 2.236e+03 -5.932 4.19e-09 ***
#> Exterior1st -9.721e+02 6.013e+02 -1.617 0.106262
#> Exterior2nd 7.869e+02 5.352e+02 1.470 0.141797
#> MasVnrType 3.058e+03 1.710e+03 1.788 0.074098 .
#> MasVnrArea 1.675e+01 7.692e+00 2.178 0.029645 *
#> ExterQual 3.332e+03 2.526e+03 1.319 0.187452
#> ExterCond -1.988e+02 2.803e+03 -0.071 0.943465
#> Foundation -3.813e+03 2.111e+03 -1.807 0.071116 .
#> BsmtQual 7.079e+03 1.591e+03 4.451 9.57e-06 ***
#> BsmtCond -4.050e+03 1.955e+03 -2.071 0.038582 *
#> BsmtExposure 1.527e+03 9.947e+02 1.536 0.124968
#> BsmtFinType1 -1.391e+03 6.765e+02 -2.056 0.040013 *
#> BsmtFinSF1 7.268e+00 6.089e+00 1.194 0.232901
#> BsmtFinType2 -1.604e+03 1.031e+03 -1.557 0.119909
#> BsmtFinSF2 1.823e+01 1.085e+01 1.681 0.093177 .
#> BsmtUnfSF 2.660e+00 5.801e+00 0.459 0.646698
#> Heating -1.669e+03 3.480e+03 -0.480 0.631565
#> HeatingQC -2.749e+03 1.479e+03 -1.858 0.063486 .
#> CentralAir 1.820e+03 5.718e+03 0.318 0.750274
#> Electrical 5.472e+03 3.136e+03 1.745 0.081346 .
#> `1stFlrSF` 5.262e+01 7.552e+00 6.968 6.02e-12 ***
#> `2ndFlrSF` 5.335e+01 5.987e+00 8.910 < 2e-16 ***
#> LowQualFinSF 5.399e+01 2.587e+01 2.086 0.037205 *
#> BsmtFullBath 1.010e+04 3.052e+03 3.309 0.000971 ***
#> BsmtHalfBath 5.129e+03 5.042e+03 1.017 0.309327
#> FullBath 6.117e+03 3.345e+03 1.829 0.067767 .
#> HalfBath -4.578e+02 3.143e+03 -0.146 0.884243
#> BedroomAbvGr -6.693e+03 2.082e+03 -3.215 0.001349 **
#> KitchenAbvGr -1.383e+04 6.351e+03 -2.178 0.029623 *
#> KitchenQual 8.734e+03 1.881e+03 4.642 3.93e-06 ***
#> TotRmsAbvGrd 1.964e+03 1.476e+03 1.330 0.183742
#> Functional -5.005e+03 1.576e+03 -3.177 0.001538 **
#> Fireplaces 9.951e+02 2.695e+03 0.369 0.712072
#> FireplaceQu 3.271e+03 1.376e+03 2.377 0.017646 *
#> GarageType -8.831e+01 1.338e+03 -0.066 0.947379
#> GarageYrBlt -9.273e+00 4.862e+00 -1.907 0.056797 .
#> GarageFinish 1.970e+03 1.529e+03 1.288 0.197962
#> GarageCars 1.720e+04 3.565e+03 4.824 1.64e-06 ***
#> GarageArea -6.004e+00 1.216e+01 -0.494 0.621666
#> GarageQual 3.204e+01 2.139e+03 0.015 0.988053
#> GarageCond -1.661e+03 2.992e+03 -0.555 0.579030
#> PavedDrive -4.212e+02 3.172e+03 -0.133 0.894393
#> WoodDeckSF 2.917e+01 9.431e+00 3.093 0.002037 **
#> OpenPorchSF -1.177e+01 1.765e+01 -0.667 0.505015
#> EnclosedPorch 7.927e+00 1.929e+01 0.411 0.681125
#> `3SsnPorch` 3.307e+01 3.336e+01 0.991 0.321886
#> ScreenPorch 5.248e+01 1.889e+01 2.778 0.005574 **
#> PoolArea -4.912e+01 3.112e+01 -1.578 0.114870
#> Fence -1.169e+03 1.313e+03 -0.890 0.373727
#> MiscVal 3.280e+00 3.743e+00 0.876 0.381108
#> MoSold -1.747e+02 3.943e+02 -0.443 0.657803
#> YrSold 1.349e+02 8.313e+02 0.162 0.871151
#> SaleType 3.308e+03 1.429e+03 2.315 0.020828 *
#> SaleCondition 2.184e+03 1.538e+03 1.420 0.155956
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for gaussian family taken to be 1065673415)
#>
#> Null deviance: 6.4763e+12 on 1023 degrees of freedom
#> Residual deviance: 1.0113e+12 on 949 degrees of freedom
#> AIC: 24266
#>
#> Number of Fisher Scoring iterations: 2
predictions <- lf$predict(df = xtest)
rmse(actual = xtest$SalePrice, predicted = predictions)
#> [1] 31052.78
Lasso Regression
<- LMTrainer$new(family = "gaussian", alpha = 1, lambda = 1000)
lf $fit(X = xtrain, y = "SalePrice")
lf<- lf$predict(df = xtest)
predictions rmse(actual = xtest$SalePrice, predicted = predictions)
#> [1] 36575.61
Ridge Regression
<- LMTrainer$new(family = "gaussian", alpha=0)
lf $fit(X = xtrain, y = "SalePrice")
lf<- lf$predict(df = xtest)
predictions rmse(actual = xtest$SalePrice, predicted = predictions)
#> [1] 36881.46
Linear Regression with CV
<- LMTrainer$new(family = "gaussian")
lf $cv_model(X = xtrain, y = 'SalePrice', nfolds = 5, parallel = FALSE)
lf<- lf$cv_predict(df = xtest)
predictions <- lf$get_importance()
coefs rmse(actual = xtest$SalePrice, predicted = predictions)
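get_importance() on a cross-validated linear model returns the fitted coefficients, which can be inspected directly (a usage sketch; the exact structure follows the underlying glmnet model):
# peek at the first few coefficients
head(coefs)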
Random Forest
rf <- RFTrainer$new(n_estimators = 500, classification = 0)
rf$fit(X = xtrain, y = "SalePrice")
pred <- rf$predict(df = xtest)
rf$get_importance()
#> tmp.order.tmp..decreasing...TRUE..
#> OverallQual 834476848152
#> GarageCars 510012304162
#> 1stFlrSF 451232914440
#> GarageArea 434552531675
#> YearBuilt 385936284845
#> GarageYrBlt 303667654374
#> BsmtFinSF1 273157791836
#> FullBath 268627238645
#> 2ndFlrSF 218927159791
#> YearRemodAdd 188553798393
#> LotArea 186274672234
#> ExterQual 184166782017
#> Fireplaces 166267239903
#> TotRmsAbvGrd 158712323105
#> FireplaceQu 152475747600
#> KitchenQual 130819908447
#> MasVnrArea 127523147818
#> BsmtQual 107269073251
#> Neighborhood 100596580689
#> Foundation 97764380344
#> LotFrontage 96099956679
#> OpenPorchSF 92040362246
#> BsmtFinType1 72011674399
#> WoodDeckSF 68342931902
#> BsmtUnfSF 65123909353
#> HeatingQC 48551532657
#> Exterior2nd 42614732188
#> BedroomAbvGr 42324982884
#> MoSold 42205131243
#> MSSubClass 36049902052
#> GarageType 33828515003
#> OverallCond 33518533857
#> HalfBath 33030418712
#> RoofStyle 32429340814
#> Exterior1st 27372452250
#> GarageFinish 27266328366
#> HouseStyle 26926668870
#> BsmtFullBath 23777127946
#> YrSold 22298786177
#> LotShape 21624370631
#> BsmtExposure 20265925749
#> PoolArea 19890279154
#> SaleCondition 19139456614
#> LandContour 19058016432
#> SaleType 18075779821
#> MasVnrType 15679728702
#> ScreenPorch 13883083306
#> BsmtHalfBath 13880768867
#> Fence 13408713216
#> MSZoning 13062092694
#> RoofMatl 12283375783
#> GarageQual 12088685592
#> BldgType 12050964102
#> LandSlope 11842142417
#> LotConfig 11630524776
#> GarageCond 11247793644
#> Condition1 10535908684
#> CentralAir 9452861962
#> BsmtCond 9306403356
#> KitchenAbvGr 8400243459
#> BsmtFinSF2 7867830778
#> EnclosedPorch 7847895078
#> BsmtFinType2 5367972150
#> Functional 4776347446
#> ExterCond 4712428836
#> PavedDrive 4079217252
#> LowQualFinSF 2916480851
#> Heating 2812130826
#> 3SsnPorch 2771742639
#> MiscVal 2287937272
#> Electrical 2237279769
#> Condition2 1472736680
#> Street 178667557
#> Utilities 14780242
rmse(actual = xtest$SalePrice, predicted = pred)
#> [1] 27917.52
Xgboost
<- XGBTrainer$new(objective = "reg:linear"
xgb n_estimators = 500
, eval_metric = "rmse"
, maximize = F
, learning_rate = 0.1
, max_depth = 6)
,$fit(X = xtrain, y = "SalePrice", valid = xtest)
xgb<- xgb$predict(xtest)
pred rmse(actual = xtest$SalePrice, predicted = pred)
Grid Search
<- XGBTrainer$new(objective = "reg:linear")
xgb
<- GridSearchCV$new(trainer = xgb,
gst parameters = list(n_estimators = c(10,50), max_depth = c(5,2)),
n_folds = 3,
scoring = c('accuracy','auc'))
$fit(xtrain, "SalePrice")
gst$best_iteration() gst
Random Search
rf <- RFTrainer$new()
rst <- RandomSearchCV$new(trainer = rf,
                          parameters = list(n_estimators = c(5,10),
                                            max_depth = c(5,2)),
                          n_folds = 3,
                          scoring = c('accuracy','auc'),
                          n_iter = 3)
rst$fit(xtrain, "SalePrice")
#> [1] "In total, 3 models will be trained"
rst$best_iteration()
#> $n_estimators
#> [1] 10
#>
#> $max_depth
#> [1] 2
#>
#> $accuracy_avg
#> [1] 0.006837021
#>
#> $accuracy_sd
#> [1] 0.006110339
#>
#> $auc_avg
#> [1] NaN
#>
#> $auc_sd
#> [1] NA
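The list returned by best_iteration() can be plugged back in to train a final model on the full training set (a sketch; classification = 0 mirrors the regression forest trained earlier, and max_depth is assumed to be a valid RFTrainer argument since it is tuned above):
best <- rst$best_iteration()

# retrain with the best parameters found by the search
rf_final <- RFTrainer$new(n_estimators = best$n_estimators,
                          max_depth = best$max_depth,
                          classification = 0)
rf_final$fit(X = xtrain, y = "SalePrice")
pred <- rf_final$predict(df = xtest)
rmse(actual = xtest$SalePrice, predicted = pred)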
Here, we will solve a simple binary classification problem: predicting which passengers survived the Titanic disaster. The idea is to demonstrate how to use this package to solve classification problems.
Data Preparation
# load classification data
load('../data/cla_train.rda')
# if the above doesn't work, you can try: load("cla_train.rda")
head(cla_train)
#> PassengerId Survived Pclass
#> 1: 1 0 3
#> 2: 2 1 1
#> 3: 3 1 3
#> 4: 4 1 1
#> 5: 5 0 3
#> 6: 6 0 3
#> Name Sex Age SibSp Parch
#> 1: Braund, Mr. Owen Harris male 22 1 0
#> 2: Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1 0
#> 3: Heikkinen, Miss. Laina female 26 0 0
#> 4: Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0
#> 5: Allen, Mr. William Henry male 35 0 0
#> 6: Moran, Mr. James male NA 0 0
#> Ticket Fare Cabin Embarked
#> 1: A/5 21171 7.2500 S
#> 2: PC 17599 71.2833 C85 C
#> 3: STON/O2. 3101282 7.9250 S
#> 4: 113803 53.1000 C123 S
#> 5: 373450 8.0500 S
#> 6: 330877 8.4583 Q
# split the data
split <- createDataPartition(y = cla_train$Survived, p = 0.7)
xtrain <- cla_train[split$Resample1]
xtest <- cla_train[!split$Resample1]
# encode categorical variables - shorter way
for(c in c('Embarked','Sex','Cabin')) {
    lbl <- LabelEncoder$new()
    lbl$fit(c(xtrain[[c]], xtest[[c]]))
    xtrain[[c]] <- lbl$transform(xtrain[[c]])
    xtest[[c]] <- lbl$transform(xtest[[c]])
}
#> The data contains blank values. Imputing them with 'NA'
#> The data contains blank values. Imputing them with 'NA'
#> The data contains blank values. Imputing them with 'NA'
#> The data contains blank values. Imputing them with 'NA'
#> The data contains blank values. Imputing them with 'NA'
# impute missing values
xtrain[, Age := replace(Age, is.na(Age), median(Age, na.rm = T))]
xtest[, Age := replace(Age, is.na(Age), median(Age, na.rm = T))]
# drop these features
to_drop <- c('PassengerId','Ticket','Name')

xtrain <- xtrain[,-c(to_drop), with=F]
xtest <- xtest[,-c(to_drop), with=F]
Now our data is ready for model training. Let's do it.
KNN Classification
knn <- KNNTrainer$new(k = 2, prob = T, type = 'class')
knn$fit(train = xtrain, test = xtest, y = 'Survived')
probs <- knn$predict(type = 'prob')
labels <- knn$predict(type = 'raw')
auc(actual = xtest$Survived, predicted = labels)
#> [1] 0.6385027
Naive Bayes Classification
nb <- NBTrainer$new()
nb$fit(xtrain, 'Survived')
pred <- nb$predict(xtest)
#> Warning: predict.naive_bayes(): more features in the newdata are provided as
#> there are probability tables in the object. Calculation is performed based on
#> features to be found in the tables.
auc(actual = xtest$Survived, predicted = pred)
#> [1] 0.7771836
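The warning above appears because xtest still contains the Survived column, which was never a feature during training; dropping it before predicting avoids the warning (a sketch):
# predict on features only; 'Survived' is the target, not a feature
pred <- nb$predict(xtest[, setdiff(names(xtest), "Survived"), with = FALSE])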
SVM Classification
#predicts labels
svm <- SVMTrainer$new()
svm$fit(xtrain, 'Survived')
pred <- svm$predict(xtest)
auc(actual = xtest$Survived, predicted = pred)
Logistic Regression
<- LMTrainer$new(family = "binomial")
lf $fit(X = xtrain, y = "Survived")
lfsummary(lf$model)
#>
#> Call:
#> stats::glm(formula = f, family = self$family, data = X, weights = self$weights)
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -2.6102 -0.6018 -0.4367 0.7038 2.4493
#>
#> Coefficients:
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) 1.830070 0.616894 2.967 0.00301 **
#> Pclass -0.980785 0.192493 -5.095 3.48e-07 ***
#> Sex 2.508241 0.230374 10.888 < 2e-16 ***
#> Age -0.041034 0.009309 -4.408 1.04e-05 ***
#> SibSp -0.235520 0.117715 -2.001 0.04542 *
#> Parch -0.098742 0.137791 -0.717 0.47361
#> Fare 0.001281 0.002842 0.451 0.65230
#> Cabin 0.008408 0.004786 1.757 0.07899 .
#> Embarked 0.248088 0.166616 1.489 0.13649
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 831.52 on 623 degrees of freedom
#> Residual deviance: 564.76 on 615 degrees of freedom
#> AIC: 582.76
#>
#> Number of Fisher Scoring iterations: 5
predictions <- lf$predict(df = xtest)
auc(actual = xtest$Survived, predicted = predictions)
#> [1] 0.8832145
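Since a binomial LMTrainer predicts probabilities, class labels can be obtained by thresholding (a sketch using the conventional 0.5 cutoff, which is an assumption rather than part of the package API):
# convert predicted probabilities into 0/1 labels
pred_labels <- as.integer(predictions > 0.5)
mean(pred_labels == xtest$Survived)   # simple accuracy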
Lasso Logistic Regression
<- LMTrainer$new(family="binomial", alpha=1)
lf $cv_model(X = xtrain, y = "Survived", nfolds = 5, parallel = FALSE)
lf<- lf$cv_predict(df = xtest)
pred auc(actual = xtest$Survived, predicted = pred)
Ridge Logistic Regression
<- LMTrainer$new(family="binomial", alpha=0)
lf $cv_model(X = xtrain, y = "Survived", nfolds = 5, parallel = FALSE)
lf<- lf$cv_predict(df = xtest)
pred auc(actual = xtest$Survived, predicted = pred)
Random Forest
rf <- RFTrainer$new(n_estimators = 500, classification = 1, max_features = 3)
rf$fit(X = xtrain, y = "Survived")

pred <- rf$predict(df = xtest)
rf$get_importance()
#> tmp.order.tmp..decreasing...TRUE..
#> Sex 67.80128
#> Fare 57.97193
#> Age 48.37045
#> Pclass 24.64915
#> Cabin 21.45972
#> SibSp 13.51637
#> Parch 10.45743
#> Embarked 10.23844
auc(actual = xtest$Survived, predicted = pred)
#> [1] 0.7976827
Xgboost
<- XGBTrainer$new(objective = "binary:logistic"
xgb n_estimators = 500
, eval_metric = "auc"
, maximize = T
, learning_rate = 0.1
, max_depth = 6)
,$fit(X = xtrain, y = "Survived", valid = xtest)
xgb
<- xgb$predict(xtest)
pred auc(actual = xtest$Survived, predicted = pred)
Grid Search
<- XGBTrainer$new(objective="binary:logistic")
xgb <-GridSearchCV$new(trainer = xgb,
gst parameters = list(n_estimators = c(10,50),
max_depth = c(5,2)),
n_folds = 3,
scoring = c('accuracy','auc'))
$fit(xtrain, "Survived")
gst$best_iteration() gst
Random Search
rf <- RFTrainer$new()
rst <- RandomSearchCV$new(trainer = rf,
                          parameters = list(n_estimators = c(10,50), max_depth = c(5,2)),
                          n_folds = 3,
                          scoring = c('accuracy','auc'),
                          n_iter = 3)
rst$fit(xtrain, "Survived")
#> [1] "In total, 3 models will be trained"
rst$best_iteration()
#> $n_estimators
#> [1] 50
#>
#> $max_depth
#> [1] 5
#>
#> $accuracy_avg
#> [1] 0.7964744
#>
#> $accuracy_sd
#> [1] 0.03090914
#>
#> $auc_avg
#> [1] 0.7729436
#>
#> $auc_sd
#> [1] 0.04283084
Let's create a new feature based on the target variable using target encoding and test a model.
# add target encoding features
xtrain[, feat_01 := smoothMean(train_df = xtrain,
                               test_df = xtest,
                               colname = "Embarked",
                               target = "Survived")$train[[2]]]
xtest[, feat_01 := smoothMean(train_df = xtrain,
                              test_df = xtest,
                              colname = "Embarked",
                              target = "Survived")$test[[2]]]
# train a random forest
rf <- RFTrainer$new(n_estimators = 500, classification = 1, max_features = 4)
rf$fit(X = xtrain, y = "Survived")
pred <- rf$predict(df = xtest)
rf$get_importance()
#> tmp.order.tmp..decreasing...TRUE..
#> Sex 69.787235
#> Fare 60.832089
#> Age 52.982604
#> Pclass 24.419818
#> Cabin 21.419274
#> SibSp 13.112177
#> Parch 10.175269
#> feat_01 6.675399
#> Embarked 6.450819
auc(actual = xtest$Survived, predicted = pred)
#> [1] 0.8018717