Skip to content

Commit

Permalink
Full draft of H5Results and R3step extraction; switch to fread from r…
Browse files Browse the repository at this point in the history
…ead.table
  • Loading branch information
michaelhallquist committed May 14, 2024
1 parent cc38899 commit 47b4d6c
Show file tree
Hide file tree
Showing 9 changed files with 211 additions and 76 deletions.
2 changes: 0 additions & 2 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,6 @@ importFrom(stats,reshape)
importFrom(stats,setNames)
importFrom(stats,update)
importFrom(utils,packageDescription)
importFrom(utils,read.fwf)
importFrom(utils,read.table)
importFrom(utils,tail)
importFrom(utils,write.table)
importFrom(xtable,xtable)
3 changes: 3 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
Version 1.2 (DEVELOPMENT)
- feature: New submitModels function supports asynchronous submission of Mplus models on high-performance clusters using slurm or torque
- feature: Rewrote parser for fixed-width Mplus output objects (esp. SAVEDATA files). Now 10x faster!
- feature: Use data.table fread for reading delimited files (e.g., BPARAMETERS). Approximately 10x faster!
- feature: Parser for SAVEDATA outputs now handles massive records where Mplus splits records across rows (esp. SAVE = FSCORES)
- feature: readModels extracts results of the R3STEP procedure in $r3step. Closes #204
- feature: readModels extracts new H5RESULTS object from Mplus v8.11 in $h5results
- notice: Package namespace has been cleaned up to remove long-deprecated functions including extractModelSummaries, extractModelParameters, and extractModIndices.

Version 1.1.2
Expand Down
50 changes: 50 additions & 0 deletions R/extractAuxDis.R
Original file line number Diff line number Diff line change
Expand Up @@ -282,3 +282,53 @@ extractAux <- function(outfiletext, filename) {

return(ret)
}

#' Extract output of R3STEP procedure
#'
#' @param outfiletext character vector of Mplus output file from which to extract the AUX section
#' @param filename filename of the Mplus output file being processed
#' @return A list containing the parsed R3STEP sections
#' @keywords internal
extractR3step <- function(outfiletext, filename) {
allSections <- list() # holds parameters for all identified sections

parR3stepSection <- getSection("^TESTS OF CATEGORICAL LATENT VARIABLE MULTINOMIAL LOGISTIC REGRESSIONS USING::^THE 3-STEP PROCEDURE", outfiletext)
if (is.null(parR3stepSection)) return(allSections) # nothing

# The highest class value is the default reference, so we need to add 1 to the highest alternative reference class number to infer the first class
classNums <- strapply(parR3stepSection, "Parameterization using Reference Class (\\d+)", as.numeric, empty=NA_integer_, simplify=TRUE)
firstClass <- max(na.omit(classNums)) + 1
firstLine <- paste("Parameterization using Reference Class", firstClass) # Add this on the first line so that we parse the structure correctly

parR3stepSection <- c(firstLine, parR3stepSection)
allSections <- appendListElements(allSections, extractParameters_1section(filename, parR3stepSection, "multinomialTests"))

# odds ratios
parR3oddsSection <- getSection("^ODDS RATIOS FOR TESTS OF CATEGORICAL LATENT VARIABLE MULTINOMIAL LOGISTIC REGRESSIONS::^USING THE 3-STEP PROCEDURE$", outfiletext)

if (!is.null(parR3oddsSection)) {
parR3oddsSection <- c(firstLine, parR3oddsSection)
allSections <- appendListElements(allSections, extractParameters_1section(filename, parR3oddsSection, "multinomialOdds"))
}

# confidence intervals for 3-step procedure
ciR3stepSection <- getSection("^CONFIDENCE INTERVALS FOR TESTS OF CATEGORICAL LATENT VARIABLE MULTINOMIAL LOGISTIC REGRESSIONS$::^USING THE 3-STEP PROCEDURE$", outfiletext)

if (!is.null(ciR3stepSection)) {
ciR3stepSection <- c(firstLine, ciR3stepSection)
hline <- detectColumnNames(filename, ciR3stepSection, "confidence_intervals")
allSections <- appendListElements(allSections, extractParameters_1section(filename, ciR3stepSection, "ci.unstandardized"))
}

# confidence intervals odds ratios for 3-step procedure
ciR3stepOddsSection <- getSection("^CONFIDENCE INTERVALS OF ODDS RATIOS FOR TESTS OF CATEGORICAL LATENT VARIABLE MULTINOMIAL$::^LOGISTIC REGRESSIONS USING THE 3-STEP PROCEDURE$", outfiletext)

if (!is.null(ciR3stepOddsSection)) {
# at present, the CI header is not reprinted in the odds ratio section, so we need to copy-paste header line for parsing
ciR3stepOddsSection <- c(firstLine, ciR3stepSection[attr(hline, "header_lines")], ciR3stepOddsSection)
allSections <- appendListElements(allSections, extractParameters_1section(filename, ciR3stepOddsSection, "ci.odds"))
}

return(allSections)

}
28 changes: 11 additions & 17 deletions R/extractParameters.R
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ extractParameters_1section <- function(filename, modelSection, sectionName) {

#detectColumnNames sub-divides (perhaps unnecessarily) the matches based on the putative section type of the output
#current distinctions include modification indices, confidence intervals, and model results.
if (sectionName %in% c("ci.unstandardized", "ci.stdyx.standardized", "ci.stdy.standardized", "ci.std.standardized")) {
if (sectionName %in% c("ci.unstandardized", "ci.stdyx.standardized", "ci.stdy.standardized", "ci.std.standardized", "ci.odds")) {
sectionType <- "confidence_intervals"
} else if (sectionName == "irt.parameterization" || sectionName == "probability.scale") {
#the IRT section follows from the MODEL RESULTS section, and column headers are not reprinted.
Expand Down Expand Up @@ -278,20 +278,23 @@ extractParameters_1section <- function(filename, modelSection, sectionName) {
catLatentMatches <- grep("^\\s*Categorical Latent Variables\\s*$", modelSection, ignore.case=TRUE)
classPropMatches <- grep("^\\s*Class Proportions\\s*$", modelSection, ignore.case=TRUE)
classSpecificMatches <- grep("^\\s*(Results|Parameters) for Class-specific Model Parts of [\\w_\\.]+\\s*$", modelSection, ignore.case=TRUE, perl=TRUE)
referenceClassMatches <- grep("^\\s*Parameterization using Reference Class [0-9]+", modelSection, ignore.case=TRUE)

topLevelMatches <- sort(c(betweenWithinMatches, latentClassMatches, multipleGroupMatches, catLatentMatches, classPropMatches, classSpecificMatches))
topLevelMatches <- sort(c(betweenWithinMatches, latentClassMatches, multipleGroupMatches, catLatentMatches, classPropMatches, classSpecificMatches, referenceClassMatches))

if (length(topLevelMatches) > 0) {

lcNum <- NULL
bwWi <- NULL
groupName <- NULL
rcNum <- NULL

matchIndex <- 1
for (match in topLevelMatches) {
if (grepl("\\s*Within-Level Standardized Estimates Averaged Over Clusters\\s*", modelSection[match], perl=TRUE)) {
bwWi <- "Within.Std.Averaged.Over.Clusters"
} else if (match %in% betweenWithinMatches) { bwWi <- sub("\\s+Level\\s*$", "", modelSection[match], perl=TRUE)
} else if (match %in% betweenWithinMatches) {
bwWi <- sub("\\s+Level\\s*$", "", modelSection[match], perl=TRUE)
} else if (match %in% latentClassMatches) {
if ((pos <- regexpr("Pattern", modelSection[match], ignore.case=TRUE)) > 0) {
#need to pull out and concatenate all numerical values following pattern
Expand Down Expand Up @@ -320,6 +323,10 @@ extractParameters_1section <- function(filename, modelSection, sectionName) {
#N.B.: 15Mar2012. the parse chunk routine can't handle the class proportions output right now
#because there is no nesting. Need to come back and fix.
#for now, this output is just ignored.
} else if (match %in% referenceClassMatches) {
lcNum <- NULL
bwWi <- NULL
rcNum <- sub("^\\s*Parameterization using Reference Class ([0-9]+)\\s*$", "\\1", modelSection[match], perl=TRUE)
}

#if the subsequent top level match is more than 2 lines away, assume that there is a
Expand Down Expand Up @@ -395,6 +402,7 @@ extractParameters_1section <- function(filename, modelSection, sectionName) {
parsedChunk$LatentClass <- lcNum
parsedChunk$BetweenWithin <- bwWi
parsedChunk$Group <- groupName
parsedChunk$ReferenceClass <- rcNum
allSectionParameters <- rbind(allSectionParameters, parsedChunk)
}
}
Expand Down Expand Up @@ -464,20 +472,6 @@ extractParameters_1file <- function(outfiletext, filename, resultType, efa = FAL
# warning(paste0("EFA, MIXTURE EFA, and TWOLEVEL EFA files are not currently supported by extractModelParameters.\n Skipping outfile: ", filename))
# return(NULL) #skip file
# }

# copy elements of append into target. note that data.frames inherit list,
# so could be wonky if append is a data.frame (shouldn't happen here)
appendListElements <- function(target, append) {
if (!is.list(target)) stop("target is not a list.")
if (!is.list(append)) stop("append is not a list.")

for (elementName in names(append)) {
if (!is.null(target[[elementName]])) warning("Element is already present in target list: ", elementName)
target[[elementName]] <- append[[elementName]]
}

return(target)
}

allSections <- list() #holds parameters for all identified sections
if (length(multisectionMatches <- grep("^\\s*MODEL RESULTS FOR .*", outfiletext, perl=TRUE, value=TRUE)) > 0L) {
Expand Down
31 changes: 19 additions & 12 deletions R/extractSaveData.R
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,8 @@ l_getSavedata_Fileinfo <- function(outfile, outfiletext, summaries) {
"Sample/H1/Pooled-Within Matrix", #sample
"Bayesian Parameters", #bparameters
"Within and between sample statistics with Weight matrix", #swmatrix
"Model Estimated Covariance Matrix" #covariance
"Model Estimated Covariance Matrix", #covariance
"Results in H5 Format"
)

#extract entire savedata section
Expand All @@ -93,7 +94,7 @@ l_getSavedata_Fileinfo <- function(outfile, outfiletext, summaries) {
#initialize these variables to empty character strings so that list return value is complete
#important in cases where some savedata output available, but other sections unused
listVars <- c("saveFile", "fileVarNames", "fileVarFormats", "fileVarWidths", "bayesFile", "bayesVarNames",
"tech3File", "tech4File", "covFile", "sampleFile", "estimatesFile")
"tech3File", "tech4File", "covFile", "sampleFile", "estimatesFile", "h5resultsFile")
l_ply(listVars, assign, value=NA_character_, envir=environment())

# NULL bparameters iterations if not available (needs to be NULL instead of NA since the return is a data.frame)
Expand Down Expand Up @@ -340,13 +341,23 @@ l_getSavedata_Fileinfo <- function(outfile, outfiletext, summaries) {
#future: plausible values output from Bayesian runs
#PLAUSIBLE VALUE MEAN, MEDIAN, SD, AND PERCENTILES FOR EACH OBSERVATION


# v8.11+: results in HDF5 format
h5resultsSection <- getSection("^\\s*Results in H5 Format\\s*$", savedataSection, sectionStarts)

if (!is.null(h5resultsSection)) {
h5Start <- grep("^\\s*Save file\\s*$", h5resultsSection, ignore.case=TRUE, perl=TRUE)
if (length(h5Start) > 0L) {
h5resultsFile <- trimSpace(h5resultsSection[h5Start+1]) # save file is on the next line
}
linesParsed <- c(linesParsed, attr(h5resultsSection, "lines"))
}

#return the file information as a list
#N.B. Would like to shift return to "saveFile", but need to update everywhere and note deprecation in changelog

#bayesVarTypes=bayesVarTypes,
return(list(fileName=saveFile, fileVarNames=fileVarNames, fileVarFormats=fileVarFormats, fileVarWidths=fileVarWidths, chunkVarNames=chunkVarNames, chunkVarWidths=chunkVarWidths,
bayesFile=bayesFile, bayesVarNames=bayesVarNames, bayesIterDetails=bayesIterDetails, tech3File=tech3File, tech4File=tech4File))
bayesFile=bayesFile, bayesVarNames=bayesVarNames, bayesIterDetails=bayesIterDetails, tech3File=tech3File, tech4File=tech4File, h5resultsFile=h5resultsFile))
}

#' Load an analysis dataset from the SAVEDATA command into an R data.frame
Expand Down Expand Up @@ -529,7 +540,6 @@ l_getSavedata_Bparams <- function(outfile, outfiletext, fileInfo, discardBurnin=
#' @param input list of parsed Mplus input section extracted upstream in readModels
#' @return A data frame of the extracted data.
#' @keywords internal
#' @importFrom utils read.table read.fwf
#' @importFrom data.table setDF setnames fread
getSavedata_readRawFile <- function(outfile, outfiletext, format="fixed", fileName, varNames, varWidths, input) {
outfileDirectory <- splitFilePath(outfile)$directory
Expand All @@ -556,10 +566,6 @@ getSavedata_readRawFile <- function(outfile, outfiletext, format="fixed", fileNa
#cat("Savedata directory: ", savedataSplit$directory, "\n")
#cat("concat result: ", savedataFile, "\n")

#need to read as fixed width format given the way Mplus left-aligns missing vals (*)
#dataset <- read.table(file=file.path(path, fileInfo$fileName), header=FALSE,
# na.strings="*", col.names=fileInfo$varNames)

if (format == "free") {
#handle case where filename contains * indicating Monte Carlo (MC) or multiple imputation (MI) dataset with reps
if (grepl("\\*", savedataSplit$filename, perl=TRUE)) {
Expand Down Expand Up @@ -613,13 +619,14 @@ getSavedata_readRawFile <- function(outfile, outfiletext, format="fixed", fileNa
warning("Unable to locate SAVEDATA files for filename: ", resplit$filename)
} else {
for (f in 1:length(fileList)) {
dataset[[make.names(fileNames[f])]] <- read.table(file=fileList[f], header=FALSE,
na.strings="*", col.names=varNames, strip.white=TRUE)
dataset[[make.names(fileNames[f])]] <- fread(file=fileList[f], header=FALSE,
na.strings="*", col.names=varNames, strip.white=TRUE, data.table=FALSE)
}
}

} else {
dataset <- read.table(file=savedataFile, header=FALSE, na.strings="*", strip.white=TRUE)
dataset <- fread(file=savedataFile, header=FALSE, na.strings="*", strip.white=TRUE, data.table = FALSE)

if (length(varNames) > ncol(dataset)) {
warning("Number of variable names for Bayesian Parameters section exceeds number of columns in: ", savedataFile)
varNames <- varNames[1:ncol(dataset)] #heuristically, just using columns names up to the last column in the data
Expand Down
Loading

0 comments on commit 47b4d6c

Please sign in to comment.