Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ Imports:
Rcpp,
RcppProgress,
RSpectra,
utils
utils,
mvrsquared
Suggests:
spelling,
digest,
Expand All @@ -36,6 +37,7 @@ Suggests:
knitr,
lda,
MASS,
markdown,
rmarkdown,
SnowballC,
stringi,
Expand Down
2 changes: 1 addition & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Generated by roxygen2: do not edit by hand

S3method(fitted,lda_topic_model)
S3method(posterior,lda_topic_model)
S3method(predict,ctm_topic_model)
S3method(predict,lda_topic_model)
Expand All @@ -11,7 +12,6 @@ export(CalcJSDivergence)
export(CalcLikelihood)
export(CalcLikelihoodC)
export(CalcProbCoherence)
export(CalcSumSquares)
export(CalcTopicModelR2)
export(Cluster2TopicModel)
export(CreateDtm)
Expand Down
4 changes: 0 additions & 4 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,6 @@ CalcLikelihoodC <- function(dtm, phi, theta) {
.Call(`_textmineR_CalcLikelihoodC`, dtm, phi, theta)
}

CalcSumSquares <- function(dtm, phi, theta, ybar) {
.Call(`_textmineR_CalcSumSquares`, dtm, phi, theta, ybar)
}

Dtm2DocsC <- function(dtm, vocab) {
.Call(`_textmineR_Dtm2DocsC`, dtm, vocab)
}
Expand Down
74 changes: 24 additions & 50 deletions R/evaluation_metrics.R
Original file line number Diff line number Diff line change
Expand Up @@ -216,21 +216,21 @@ CalcProbCoherence<- function(phi, dtm, M = 5){

#' Calculate the R-squared of a topic model.
#' @description Function to calculate R-squared for a topic model.
#' This uses a geometric interpretation of R-squared as the proportion of total distance
#' each document is from the center of all the documents that is explained by the model.
#' This is a wrapper for \code{mvrsquared::calc_rsquared} with additional
#' checks in place to check for the inputs.
#' @param dtm A documents by terms dimensional document term matrix of class
#' \code{dgCMatrix} or of class \code{matrix}.
#' @param phi A topics by terms dimensional matrix where each entry is p(term_i |topic_j)
#' @param theta A documents by topics dimensional matrix where each entry is p(topic_j|document_d)
#' @param ... Other arguments to be passed to \code{\link[textmineR]{TmParallelApply}}. See note, below.
#' @param phi A topics by terms dimensional matrix where each entry is
#' p(term_i |topic_j)
#' @param theta A documents by topics dimensional matrix where each entry
#' is p(topic_j|document_d)
#' @param cpus An integer of how many threads to use. Defaults to 1 and is
#' equivalent of setting \code{threads} in \link{mvrsquared}[calc_rsquared].
#' @param ... Other arguments to be passed to
#' \code{\link[mvrsquared]{calc_rsquared}}. See note, below.
#' @return
#' Returns an object of class \code{numeric} representing the proportion of variability
#' in the data that is explained by the topic model.
#' @note
#' This function performs parallel computation if \code{dtm} has more than 3,000
#' rows. The default is to use all available cores according to \code{\link[parallel]{detectCores}}.
#' However, this can be modified by passing the \code{cpus} argument when calling
#' this function.
#' Returns an object of class \code{numeric} representing the proportion
#' of variability in the data that is explained by the topic model.
#' @export
#' @examples
#' # Load a pre-formatted dtm and topic model
Expand All @@ -244,7 +244,7 @@ CalcProbCoherence<- function(phi, dtm, M = 5){
#'
#'
#' r2
CalcTopicModelR2 <- function(dtm, phi, theta, ...){
CalcTopicModelR2 <- function(dtm, phi, theta, cpus = 1, ...){

# check that inputs have necessary formats
if(nrow(dtm) != nrow(theta) ){
Expand Down Expand Up @@ -305,40 +305,14 @@ CalcTopicModelR2 <- function(dtm, phi, theta, ...){


# get ybar, the "center" of the documents
ybar <- Matrix::colMeans(dtm)

# do in parallel in batches of about 3000 if we have more than 3000 docs
if(nrow(dtm) > 3000){

batches <- seq(1, nrow(dtm), by = 3000)

data_divided <- lapply(batches, function(j){

dtm_divided <- dtm[ j:min(j + 2999, nrow(dtm)) , ]

theta_divided <- theta[ j:min(j + 2999, nrow(dtm)) , ]

list(dtm_divided=dtm_divided, theta_divided=theta_divided)
})

result <-TmParallelApply(X = data_divided, FUN = function(x){
CalcSumSquares(dtm = x$dtm_divided,
phi = phi,
theta = x$theta_divided,
ybar=ybar)
}, export=c("phi", "ybar"), ...)

result <- do.call(rbind, result)

result <- 1 - sum(result[ , 1 ]) / sum(result[ , 2 ])

# do sequentially otherwise
}else{
sum_squares <- CalcSumSquares(dtm = dtm, phi = phi, theta = theta, ybar=ybar)

result <- 1 - sum_squares[ 1 ] / sum_squares[ 2 ]

}

result
}
ybar <- Matrix::colMeans(dtm)

# use mvrsquared to calculate rsquared
result <- mvrsquared::calc_rsquared(
y = dtm,
yhat = list(x = Matrix::rowSums(dtm) * theta,
w = phi),
ybar = ybar, threads = cpus, ...
)
return(result)
}
1 change: 0 additions & 1 deletion R/textmineR.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ NULL
#' @importFrom methods as
#' @importFrom stopwords stopwords
#' @export CalcLikelihoodC
#' @export CalcSumSquares
#' @export Dtm2DocsC
#' @export Hellinger_cpp
#' @export HellingerMat
Expand Down
17 changes: 17 additions & 0 deletions R/topic_modeling_core.R
Original file line number Diff line number Diff line change
Expand Up @@ -1284,3 +1284,20 @@ update.lda_topic_model <- function(object, dtm, additional_k = 0,
result

}

#' Get the fitted values for a Latent Dirichlet Allocation topic model
#' @description Obtains the fitted values for the specified LDA model.
#' @param object a fitted object of class \code{lda_topic_model}
#' @param ... other arguments.
#' @return a matrix of class \code{dgCMatrix}
#' @export
#' @examples
#' # load a premade topic model
#' data(nih_sample_topic_model)
#' # fit values using fitted()
#' fitted_values <- fitted(nih_sample_topic_model)
#' fitted_values

fitted.lda_topic_model <- function(object, ...) {
(Matrix::rowSums(object$data) * object$theta) %*% object$phi
}
28 changes: 14 additions & 14 deletions man/CalcTopicModelR2.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

26 changes: 26 additions & 0 deletions man/fitted.lda_topic_model.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

57 changes: 0 additions & 57 deletions src/CalcSumSquares.cpp

This file was deleted.

15 changes: 0 additions & 15 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,6 @@ BEGIN_RCPP
return rcpp_result_gen;
END_RCPP
}
// CalcSumSquares
NumericVector CalcSumSquares(arma::sp_mat dtm, NumericMatrix phi, NumericMatrix theta, NumericVector ybar);
RcppExport SEXP _textmineR_CalcSumSquares(SEXP dtmSEXP, SEXP phiSEXP, SEXP thetaSEXP, SEXP ybarSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< arma::sp_mat >::type dtm(dtmSEXP);
Rcpp::traits::input_parameter< NumericMatrix >::type phi(phiSEXP);
Rcpp::traits::input_parameter< NumericMatrix >::type theta(thetaSEXP);
Rcpp::traits::input_parameter< NumericVector >::type ybar(ybarSEXP);
rcpp_result_gen = Rcpp::wrap(CalcSumSquares(dtm, phi, theta, ybar));
return rcpp_result_gen;
END_RCPP
}
// Dtm2DocsC
List Dtm2DocsC(arma::sp_mat dtm, std::vector< std::string> vocab);
RcppExport SEXP _textmineR_Dtm2DocsC(SEXP dtmSEXP, SEXP vocabSEXP) {
Expand Down Expand Up @@ -142,7 +128,6 @@ END_RCPP

static const R_CallMethodDef CallEntries[] = {
{"_textmineR_CalcLikelihoodC", (DL_FUNC) &_textmineR_CalcLikelihoodC, 3},
{"_textmineR_CalcSumSquares", (DL_FUNC) &_textmineR_CalcSumSquares, 4},
{"_textmineR_Dtm2DocsC", (DL_FUNC) &_textmineR_Dtm2DocsC, 2},
{"_textmineR_HellingerMat", (DL_FUNC) &_textmineR_HellingerMat, 1},
{"_textmineR_Hellinger_cpp", (DL_FUNC) &_textmineR_Hellinger_cpp, 2},
Expand Down