textmodel Performance Comparisons

Kenneth Benoit

library("quanteda.textmodels")
library("quanteda")
## Package version: 3.2.3
## Unicode version: 14.0
## ICU version: 70.1
## Parallel computing: 10 of 10 threads used.
## See https://quanteda.io for tutorials and examples.

Naive Bayes

quanteda.textmodels implements fast methods for fitting and predicting Naive Bayes textmodels built especially for sparse document-feature matrices from textual data. It implements two models: multinomial and Bernoulli. (See Manning, Raghavan, and Schütze 2008, Chapter 13.)

Here, we compare performance for the two models, and then to the performance from two other packages for fitting these models.

For these tests, we will choose the dataset of 50,000 movie reviews from Maas et. al. (2011). We will use their partition into test and training sets for training and fitting our models.

# large movie review database of 50,000 movie reviews
load(url("https://quanteda.org/data/data_corpus_LMRD.rda"))

dfmat <- tokens(data_corpus_LMRD) %>%
  dfm()
dfmat_train <- dfm_subset(dfmat, set == "train")
dfmat_test <- dfm_subset(dfmat, set == "test")

Comparing the performance of fitting the model:

library("microbenchmark")
microbenchmark(
    multi = textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "multinomial"),
    bern = textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "Bernoulli"),
    times = 20
)
## Warning in microbenchmark(multi = textmodel_nb(dfmat_train,
## dfmat_train$polarity, : less accurate nanosecond times to avoid potential
## integer overflows
## Unit: milliseconds
##   expr      min       lq     mean   median       uq       max neval
##  multi 61.39000 64.11133 66.99309 65.77839 68.45346  83.24009    20
##   bern 71.24263 72.95905 78.45931 76.31775 82.19606 101.66655    20

And for prediction:

microbenchmark(
    multi = predict(textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "multinomial"),
                    newdata = dfmat_test),
    bern = predict(textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "Bernoulli"),
                   newdata = dfmat_test),
    times = 20
)
## Unit: milliseconds
##   expr       min        lq      mean    median        uq      max neval
##  multi  73.78959  76.19046  80.88285  77.25919  79.75921 108.1905    20
##   bern 105.20981 106.44771 114.85248 112.08242 124.13461 132.1529    20

Now let’s see how textmodel_nb() compares to equivalent functions from other packages. Multinomial:

library("fastNaiveBayes")
library("naivebayes")
## naivebayes 0.9.7 loaded

microbenchmark(
    textmodels = {
      tmod <-  textmodel_nb(dfmat_train, dfmat_train$polarity, smooth = 1, distribution = "multinomial")
      pred <- predict(tmod, newdata = dfmat_test)
    },
    fastNaiveBayes = { 
      tmod <- fnb.multinomial(as(dfmat_train, "dgCMatrix"), y = dfmat_train$polarity, laplace = 1, sparse = TRUE)
      pred <- predict(tmod, newdata = as(dfmat_test, "dgCMatrix"))
    },
    naivebayes = {
      tmod = multinomial_naive_bayes(as(dfmat_train, "dgCMatrix"), dfmat_train$polarity, laplace = 1)
      pred <- predict(tmod, newdata = as(dfmat_test, "dgCMatrix"))
    },
    times = 20
)
## Unit: milliseconds
##            expr       min        lq     mean    median        uq       max
##      textmodels  74.15322  77.37278  80.6312  78.65249  81.30023  96.74073
##  fastNaiveBayes 100.90112 104.12653 110.8884 108.84645 116.13853 135.95924
##      naivebayes  83.41003  87.28964 100.0759  89.05637  92.83388 271.13074
##  neval
##     20
##     20
##     20

And Bernoulli. Note here that while we are supplying the boolean matrix to textmodel_nb(), this re-weighting from the count matrix would have been performed automatically within the function had we not done so in advance - it’s done here just for comparison.

dfmat_train_bern <- dfm_weight(dfmat_train, scheme = "boolean")
dfmat_test_bern <- dfm_weight(dfmat_test, scheme = "boolean")

microbenchmark(
    textmodels = {
      tmod <-  textmodel_nb(dfmat_train_bern, dfmat_train$polarity, smooth = 1, distribution = "Bernoulli")
      pred <- predict(tmod, newdata = dfmat_test)
    },
    fastNaiveBayes = { 
      tmod <- fnb.bernoulli(as(dfmat_train_bern, "dgCMatrix"), y = dfmat_train$polarity, laplace = 1, sparse = TRUE)
      pred <- predict(tmod, newdata = as(dfmat_test_bern, "dgCMatrix"))
    },
    naivebayes = {
      tmod = bernoulli_naive_bayes(as(dfmat_train_bern, "dgCMatrix"), dfmat_train$polarity, laplace = 1)
      pred <- predict(tmod, newdata = as(dfmat_test_bern, "dgCMatrix"))
    },
    times = 20
)
## Unit: milliseconds
##            expr       min        lq     mean   median       uq      max neval
##      textmodels 104.86525 107.91493 117.5311 116.7810 126.6451 131.5138    20
##  fastNaiveBayes 114.11329 116.01469 125.5177 123.1699 131.8557 146.4291    20
##      naivebayes  96.52031  98.37962 107.5170 100.7663 102.7416 201.5234    20

References

Maas, Andrew L., Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts (2011). “Learning Word Vectors for Sentiment Analysis”. The 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011).

Majka M (2020). naivebayes: High Performance Implementation of the Naive Bayes Algorithm in R. R package version 0.9.7, <URL: https://CRAN.R-project.org/package=naivebayes>. Date: 2020-03-08.

Manning, Christopher D., Prabhakar Raghavan, and Hinrich Schütze (2008). Introduction to Information Retrieval. Cambridge University Press.

Skogholt, Martin (2020). fastNaiveBayes: Extremely Fast Implementation of a Naive Bayes Classifier. R package version 2.2.0. https://github.com/mskogholt/fastNaiveBayes. Date: 2020-02-23.