library("quanteda.textmodels")
library("quanteda")
## Package version: 3.2.3
## Unicode version: 14.0
## ICU version: 70.1
## Parallel computing: 10 of 10 threads used.
## See https://quanteda.io for tutorials and examples.
quanteda.textmodels implements fast methods for fitting and predicting Naive Bayes textmodels built especially for sparse document-feature matrices from textual data. It implements two models: multinomial and Bernoulli. (See Manning, Raghavan, and Schütze 2008, Chapter 13.)
Here, we compare the performance of the two models, and then compare them with the performance of two other packages that fit these models.
For these tests, we will choose the dataset of 50,000 movie reviews from Maas et al. (2011). We will use their partition into test and training sets for training and testing our models.
# Large Movie Review Dataset: 50,000 movie reviews (Maas et al. 2011).
# The loaded .rda file is expected to provide the corpus object
# `data_corpus_LMRD` used below.
load(url("https://quanteda.org/data/data_corpus_LMRD.rda"))

# Tokenise the corpus and build a document-feature matrix from the tokens.
dfmat <- tokens(data_corpus_LMRD) %>%
  dfm()

# Split into training and test matrices on the `set` document variable.
dfmat_train <- dfm_subset(dfmat, set == "train")
dfmat_test <- dfm_subset(dfmat, set == "test")
Comparing the performance of fitting the model:
library("microbenchmark")
# Benchmark model fitting only: each expression is evaluated 20 times,
# once per event distribution supported by textmodel_nb().
microbenchmark(
  multi = textmodel_nb(
    dfmat_train,
    dfmat_train$polarity,
    distribution = "multinomial"
  ),
  bern = textmodel_nb(
    dfmat_train,
    dfmat_train$polarity,
    distribution = "Bernoulli"
  ),
  times = 20
)
## Warning in microbenchmark(multi = textmodel_nb(dfmat_train,
## dfmat_train$polarity, : less accurate nanosecond times to avoid potential
## integer overflows
## Unit: milliseconds
## expr min lq mean median uq max neval
## multi 61.39000 64.11133 66.99309 65.77839 68.45346 83.24009 20
## bern 71.24263 72.95905 78.45931 76.31775 82.19606 101.66655 20
And for prediction:
# Benchmark fitting followed by prediction on the test matrix: each
# expression refits the model and then predicts, 20 times per distribution.
microbenchmark(
  multi = predict(
    textmodel_nb(
      dfmat_train,
      dfmat_train$polarity,
      distribution = "multinomial"
    ),
    newdata = dfmat_test
  ),
  bern = predict(
    textmodel_nb(
      dfmat_train,
      dfmat_train$polarity,
      distribution = "Bernoulli"
    ),
    newdata = dfmat_test
  ),
  times = 20
)
## Unit: milliseconds
## expr min lq mean median uq max neval
## multi 73.78959 76.19046 80.88285 77.25919 79.75921 108.1905 20
## bern 105.20981 106.44771 114.85248 112.08242 124.13461 132.1529 20
Now let’s see how textmodel_nb() compares to equivalent functions from other packages. Multinomial:
library("fastNaiveBayes")
library("naivebayes")
## naivebayes 0.9.7 loaded
# Compare multinomial Naive Bayes (fit + predict) across three packages.
# Each arm fits on the training matrix with Laplace smoothing of 1 and then
# predicts on the test matrix; the other packages receive the matrices
# coerced to "dgCMatrix".
microbenchmark(
  textmodels = {
    tmod <- textmodel_nb(dfmat_train, dfmat_train$polarity, smooth = 1,
                         distribution = "multinomial")
    pred <- predict(tmod, newdata = dfmat_test)
  },
  fastNaiveBayes = {
    tmod <- fnb.multinomial(as(dfmat_train, "dgCMatrix"),
                            y = dfmat_train$polarity,
                            laplace = 1, sparse = TRUE)
    pred <- predict(tmod, newdata = as(dfmat_test, "dgCMatrix"))
  },
  naivebayes = {
    tmod <- multinomial_naive_bayes(as(dfmat_train, "dgCMatrix"),
                                    dfmat_train$polarity, laplace = 1)
    pred <- predict(tmod, newdata = as(dfmat_test, "dgCMatrix"))
  },
  times = 20
)
## Unit: milliseconds
## expr min lq mean median uq max
## textmodels 74.15322 77.37278 80.6312 78.65249 81.30023 96.74073
## fastNaiveBayes 100.90112 104.12653 110.8884 108.84645 116.13853 135.95924
## naivebayes 83.41003 87.28964 100.0759 89.05637 92.83388 271.13074
## neval
## 20
## 20
## 20
And Bernoulli. Note here that while we are supplying the boolean
matrix to textmodel_nb(), this re-weighting from the count
matrix would have been performed automatically within the function had
we not done so in advance - it’s done here just for comparison.
# Convert the count matrices to boolean (presence/absence) weights ahead of
# the Bernoulli benchmarks, so the re-weighting is done outside the timings.
dfmat_train_bern <- dfm_weight(dfmat_train, scheme = "boolean")
dfmat_test_bern <- dfm_weight(dfmat_test, scheme = "boolean")
# Compare Bernoulli Naive Bayes (fit + predict) across three packages.
# Each arm fits on the boolean-weighted training matrix with Laplace
# smoothing of 1; the other packages receive the matrices coerced to
# "dgCMatrix".
microbenchmark(
  textmodels = {
    tmod <- textmodel_nb(dfmat_train_bern, dfmat_train$polarity, smooth = 1,
                         distribution = "Bernoulli")
    # NOTE(review): this arm predicts on the count matrix `dfmat_test`,
    # whereas the other two arms use `dfmat_test_bern` -- confirm intended.
    pred <- predict(tmod, newdata = dfmat_test)
  },
  fastNaiveBayes = {
    tmod <- fnb.bernoulli(as(dfmat_train_bern, "dgCMatrix"),
                          y = dfmat_train$polarity,
                          laplace = 1, sparse = TRUE)
    pred <- predict(tmod, newdata = as(dfmat_test_bern, "dgCMatrix"))
  },
  naivebayes = {
    tmod <- bernoulli_naive_bayes(as(dfmat_train_bern, "dgCMatrix"),
                                  dfmat_train$polarity, laplace = 1)
    pred <- predict(tmod, newdata = as(dfmat_test_bern, "dgCMatrix"))
  },
  times = 20
)
## Unit: milliseconds
## expr min lq mean median uq max neval
## textmodels 104.86525 107.91493 117.5311 116.7810 126.6451 131.5138 20
## fastNaiveBayes 114.11329 116.01469 125.5177 123.1699 131.8557 146.4291 20
## naivebayes 96.52031 98.37962 107.5170 100.7663 102.7416 201.5234 20
Maas, Andrew L., Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts (2011). “Learning Word Vectors for Sentiment Analysis”. The 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011).
Majka M (2020). naivebayes: High Performance Implementation of the Naive Bayes Algorithm in R. R package version 0.9.7, <URL: https://CRAN.R-project.org/package=naivebayes>. Date: 2020-03-08.
Manning, Christopher D., Prabhakar Raghavan, and Hinrich Schütze (2008). Introduction to Information Retrieval. Cambridge University Press.
Skogholt, Martin (2020). fastNaiveBayes: Extremely Fast Implementation of a Naive Bayes Classifier. R package version 2.2.0. https://github.com/mskogholt/fastNaiveBayes. Date: 2020-02-23.