Skip to content

Commit

Permalink
Custom parser for fwf files that is 10x faster and handles multi-line…
Browse files Browse the repository at this point in the history
… records
  • Loading branch information
michaelhallquist committed May 9, 2024
1 parent d2258ba commit 2a984b4
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 35 deletions.
2 changes: 2 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
Version 1.2 (DEVELOPMENT)
- feature: New submitModels function supports asynchronous submission of Mplus models on high-performance clusters using slurm or torque
- feature: Rewrote parser for fixed-width Mplus output objects (esp. SAVEDATA files). Now 10x faster!
- feature: Parser for SAVEDATA outputs now handles massive records where Mplus splits records across rows (esp. SAVE = FSCORES)
- notice: Package namespace has been cleaned up to remove long-deprecated functions including extractModelSummaries, extractModelParameters, and extractModIndices.

Version 1.1.2
Expand Down
12 changes: 6 additions & 6 deletions R/extractParameters.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
extractParameters_1chunk <- function(filename, thisChunk, columnNames, sectionName) {
if (isEmpty(thisChunk)) stop("Missing chunk to parse.\n ", filename)
if (isEmpty(columnNames)) stop("Missing column names for chunk.\n ", filename)
if (missing(sectionName)) { sectionName <- "" } #right now, just use sectionName for R-SQUARE section, where there are no subheaders per se
if (missing(sectionName)) sectionName <- "" #right now, just use sectionName for R-SQUARE section, where there are no subheaders per se

#R-SQUARE sections are not divided into the usual subheader sections, and the order of top-level headers is not comparable to typical output.
#Create a single match for the whole section
Expand Down Expand Up @@ -66,7 +66,7 @@ extractParameters_1chunk <- function(filename, thisChunk, columnNames, sectionNa

#sometimes chunks have no parameters because they are empty. e.g., stdyx for ex7.30
#in this case, return null
if (nrow(convertMatches)==0) { return(NULL) }
if (nrow(convertMatches)==0) return(NULL)

#develop a dataframe that divides into keyword matches versus variable matches
convertMatches <- ddply(convertMatches, "startline", function(row) {
Expand Down Expand Up @@ -498,16 +498,16 @@ extractParameters_1file <- function(outfiletext, filename, resultType, efa = FAL
allSections <- appendListElements(allSections, extractParameters_1section(filename, unstandardizedSection, "unstandardized"))
}

alt.undstandardized <- getSection("^ALTERNATIVE PARAMETERIZATIONS FOR THE CATEGORICAL LATENT VARIABLE REGRESSION$", outfiletext)
alt.unstandardized <- getSection("^ALTERNATIVE PARAMETERIZATIONS FOR THE CATEGORICAL LATENT VARIABLE REGRESSION$", outfiletext)

if (!is.null(alt.undstandardized)) {
header.idx <- c(grep("Parameterization using Reference Class (\\d+)", alt.undstandardized), length(alt.undstandardized))
if (!is.null(alt.unstandardized)) {
header.idx <- c(grep("Parameterization using Reference Class (\\d+)", alt.unstandardized), length(alt.unstandardized))

alt.params <- llply(1:(length(header.idx)-1), function(i) {
start.line <- header.idx[[i]]
end.line <- header.idx[[i+1]]-1

alt.section <- c(alt.undstandardized[1:4], alt.undstandardized[start.line:end.line])
alt.section <- c(alt.unstandardized[1:4], alt.unstandardized[start.line:end.line])

res <- extractParameters_1section(filename, alt.section, "alt")

Expand Down
99 changes: 78 additions & 21 deletions R/extractSaveData.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,7 @@ getSavedata_Fileinfo <- function(outfile) {
#if returnData is false, just the variable names are returned
#considering using addHeader to prepend a header row

if(!file.exists(outfile)) {
stop("Cannot locate outfile: ", outfile)
}
if(!file.exists(outfile)) stop("Cannot locate outfile: ", outfile)

outfiletext <- scan(outfile, what="character", sep="\n", strip.white=FALSE, blank.lines.skip=FALSE, quiet=TRUE)

Expand All @@ -45,7 +43,6 @@ getSavedata_Fileinfo <- function(outfile) {
}

return(l_getSavedata_Fileinfo(outfile, outfiletext))

}

#' local function that does the work of getSaveData_Fileinfo
Expand Down Expand Up @@ -250,14 +247,6 @@ l_getSavedata_Fileinfo <- function(outfile, outfiletext, summaries) {
saveFile <- trimSpace(savedataSection[(saveFile.text[1L] + 1)])
}

#save file format (sometimes on same line, sometimes on next line)
# saveFileFormat.text <- getSection("^\\s*Save file format", savedataSection, sectionStarts)
#
# #actually, sometimes the format is "Free" and falls on the same line...
# if (!is.null(saveFileFormat.text)) {
# saveFileFormat <- trimSpace(saveFileFormat.text[1]) #first line contains format
# }
#

# #file record length
# savefile.recordlength <- getSection("^\\s*Save file record length\\s+\\d+\\s*$", savedataSection, sectionStarts)
Expand All @@ -280,22 +269,60 @@ l_getSavedata_Fileinfo <- function(outfile, outfiletext, summaries) {
varsSplit <- strsplit(trimSpace(variablesToParse), "\\s+")

# Use rlang::flatten to remove the nested lists that result from the inner lapply over imputations
# 2024: removed rlang::flatten in favor of inner unlist. Cannot find case where this was important
# 2024: removed rlang::flatten in favor of local function. Allows us to remove dependency
varsSplit <- lapply(1:length(varsSplit), function(x) {
vname <- varsSplit[[x]]
if (x %in% which_imp) {
#replicate the variable for each imputation
return(unlist(lapply(1:nimp, function(i) { c(vname[1:(length(vname)-1)], sprintf("I_%03d", i), vname[length(vname)]) })))
return(lapply(1:nimp, function(i) { c(vname[1:(length(vname)-1)], sprintf("I_%03d", i), vname[length(vname)]) }))
} else {
return(vname)
}
})

# https://stackoverflow.com/questions/19734412/flatten-nested-list-into-1-deep-list
renquote <- function(l) if (is.list(l)) lapply(l, renquote) else enquote(l)
varsSplit <- lapply(unlist(renquote(varsSplit)), eval) # flatten list hierarchy

fileVarNames <- sapply(varsSplit, function(x) { paste(x[1:(length(x) - 1)], collapse=".") })
fileVarFormats <- sapply(varsSplit, function(x) { x[length(x)] }) #last element
fileVarWidths <- strapply(fileVarFormats, "[IEFG]+(\\d+)(\\.\\d+)*", as.numeric, perl=TRUE, simplify=TRUE)

}

# For some large outputs, records span multiple rows, which is a hard fixed-width parsing demand. Identify the chunks and split out the variables by chunk
# save file format (sometimes on same line, sometimes on next line)
# Mplus uses slashes to denote when single records span multiple lines: 512F10.3 / 505F10.3 / 505F10.3 / 540F10.3 I6 I7
saveFileFormat.text <- grep("^\\s*Save file format\\s*$", savedataSection, perl=TRUE)

if (length(saveFileFormat.text) > 0L) {
saveFileFormat <- trimSpace(savedataSection[saveFileFormat.text[1L] + 1])
by_row <- strsplit(saveFileFormat, "\\s*/\\s*", perl=TRUE)[[1]]
record_chunks <- length(by_row)
chunkVarWidths <- lapply(by_row, function(cc) {
# add a 1 in front of records that have no repeat digit(s) in front
cc <- gsub("(?<![0-9])([IEFG][0-9\\.]+)", "1\\1", cc, perl=TRUE)
nreps <- strapply(cc, "(\\d+)[IEFG][0-9\\.]+", as.numeric, perl=TRUE, simplify = TRUE)
var_widths <- strapply(cc, "(?:\\d+)[IEFG]+(\\d+)(\\.\\d+)*", as.numeric, perl=TRUE, simplify = TRUE)

# now repeat the widths the correct number of times to form the line
var_widths <- rep(var_widths, nreps)
return(var_widths)
})

# assign variable names to chunks based on record lengths
chunkVarIdx <- c(0, cumsum(sapply(chunkVarWidths, length)))
chunkVarNames <- lapply(2:length(chunkVarIdx), function(ii) fileVarNames[(chunkVarIdx[ii-1]+1):chunkVarIdx[ii]])
} else {
chunkVarWidths <- list(fileVarWidths) # a list of variable widths for multi-line ragged files
chunkVarNames <- list(fileVarNames) # a list of variable names for multi-line ragged files
}

# #actually, sometimes the format is "Free" and falls on the same line...
# if (!is.null(saveFileFormat.text)) {
# saveFileFormat <- trimSpace(saveFileFormat.text[1]) #first line contains format
# }
#

#Monte carlo and multiple imputation output: contains only order of variables, not their format
order.text <- getMultilineSection("Order of variables", savedataSection, outfile, allowMultiple=FALSE)
Expand All @@ -310,7 +337,6 @@ l_getSavedata_Fileinfo <- function(outfile, outfiletext, summaries) {
fileVarNames <- trimSpace(variablesToParse)
}


#future: plausible values output from Bayesian runs
#PLAUSIBLE VALUE MEAN, MEDIAN, SD, AND PERCENTILES FOR EACH OBSERVATION

Expand All @@ -319,7 +345,7 @@ l_getSavedata_Fileinfo <- function(outfile, outfiletext, summaries) {
#N.B. Would like to shift return to "saveFile", but need to update everywhere and note deprecation in changelog

#bayesVarTypes=bayesVarTypes,
return(list(fileName=saveFile, fileVarNames=fileVarNames, fileVarFormats=fileVarFormats, fileVarWidths=fileVarWidths,
return(list(fileName=saveFile, fileVarNames=fileVarNames, fileVarFormats=fileVarFormats, fileVarWidths=fileVarWidths, chunkVarNames=chunkVarNames, chunkVarWidths=chunkVarWidths,
bayesFile=bayesFile, bayesVarNames=bayesVarNames, bayesIterDetails=bayesIterDetails, tech3File=tech3File, tech4File=tech4File))
}

Expand Down Expand Up @@ -379,7 +405,7 @@ getSavedata_Data <- function(outfile) {
#' }
#' @keywords internal
getSavedata_Bparams <- function(outfile, discardBurnin=TRUE) {
#exposed wrapper for l_getSavedata_readRawFile, which pulls bayesian parameters into a data.frame
#exposed wrapper for getSavedata_readRawFile, which pulls bayesian parameters into a data.frame

message("getSavedata_Bparams has been deprecated. Please use readModels(\"nameofMplusoutfile.out\", what=\"bparameters\")$bparameters to replicate the old functionality.")
return(invisible(NULL))
Expand All @@ -402,7 +428,7 @@ l_getSavedata_Bparams <- function(outfile, outfiletext, fileInfo, discardBurnin=
#missing fileInfo
if (is.null(fileInfo)) return(NULL)

bp <- l_getSavedata_readRawFile(outfile, outfiletext, format="free", fileName=fileInfo[["bayesFile"]], varNames=fileInfo[["bayesVarNames"]])
bp <- getSavedata_readRawFile(outfile, outfiletext, format="free", fileName=fileInfo[["bayesFile"]], varNames=fileInfo[["bayesVarNames"]])

if (is.null(bp)) return(NULL)

Expand Down Expand Up @@ -504,7 +530,8 @@ l_getSavedata_Bparams <- function(outfile, outfiletext, fileInfo, discardBurnin=
#' @return A data frame of the extracted data.
#' @keywords internal
#' @importFrom utils read.table read.fwf
l_getSavedata_readRawFile <- function(outfile, outfiletext, format="fixed", fileName, varNames, varWidths, input) {
#' @importFrom data.table setDF setnames fread
getSavedata_readRawFile <- function(outfile, outfiletext, format="fixed", fileName, varNames, varWidths, input) {
outfileDirectory <- splitFilePath(outfile)$directory

#if file requested is missing, then abort data pull
Expand Down Expand Up @@ -606,9 +633,39 @@ l_getSavedata_readRawFile <- function(outfile, outfiletext, format="fixed", file
#strip.white is necessary for na.strings to work effectively with fixed width fields
#otherwise would need something like "* " for na.strings

dataset <- read.fwf(file=savedataFile, widths=varWidths, header=FALSE,
na.strings="*", col.names=varNames, strip.white=TRUE)
# custom parser using data.table
mplus_read_fwf <- function(file, widths, header=FALSE, na.strings="*", col.names=NULL) {
dt <- fread(savedataFile, header=F, sep="\n", strip.white = FALSE)
if (!is.list(varWidths)) varWidths <- list(varWidths) # allow vector input for nchunks = 1
if (!is.list(col.names)) col.names <- list(col.names) # allow vector input for nchunks = 1

cs <- cumsum(unlist(varWidths))
cols <- data.frame(beg=c(1, cs[1:length(cs)-1]+1), end=cs)
nchunks <- length(varWidths)

if (nchunks > 1L) { # need to paste together multi-line records on one line for simpler parsing
dt[, cnum := rep(1:(nrow(dt)/nchunks), each=nchunks)]
dt <- dt[, .(V1=paste0(V1, collapse="")), by=cnum]
}

# take the appropriate substring positions for each variable on each row, use type.convert to convert characters to numbers
conv <- dt[ , lapply(seq_len(length(cols$beg)), function(ii) {
type.convert(trimws(substr(V1, cols$beg[ii], cols$end[ii])), as.is=TRUE, na.strings=na.strings)
})]

setnames(conv, make.names(unlist(col.names)))
setDF(conv) # return standard data.frame
return(conv)
}

dataset <- mplus_read_fwf(savedataFile, varWidths, header=FALSE, na.strings="*", col.names=varNames)

# old, slow approach
# dataset <- read.fwf(file=savedataFile, widths=unlist(varWidths), header=FALSE,
# na.strings="*", col.names=unlist(varNames), strip.white=TRUE)

}

return(dataset)
}

15 changes: 7 additions & 8 deletions R/utilityFunctions.R
Original file line number Diff line number Diff line change
Expand Up @@ -334,8 +334,8 @@ getSection <- function(sectionHeader, outfiletext, headers="standard") {
#identify headers after this section to find end of section
subsequentHeaders <- which(h > beginSection)

if (length(subsequentHeaders) == 0) { nextHeader <- length(outfiletext) #just return the whole enchilada
} else { nextHeader <- h[subsequentHeaders[1]] - 1 }
if (length(subsequentHeaders) == 0) nextHeader <- length(outfiletext) #just return the whole enchilada
else nextHeader <- h[subsequentHeaders[1]] - 1

section.found <- outfiletext[(beginSection+1):nextHeader]
attr(section.found, "lines") <- beginSection:nextHeader
Expand Down Expand Up @@ -385,6 +385,7 @@ parse_into_sections <- function(outfiletext) {
"BETWEEN-LEVEL FACTOR SCORE COMPARISONS",
"ALTERNATIVE PARAMETERIZATIONS FOR THE CATEGORICAL LATENT VARIABLE REGRESSION",
"ODDS RATIO FOR THE ALTERNATIVE PARAMETERIZATIONS FOR THE CATEGORICAL LATENT VARIABLE REGRESSION",
"ODDS RATIOS FOR TESTS OF CATEGORICAL LATENT VARIABLE MULTINOMIAL LOGISTIC REGRESSIONS",
"LATENT CLASS ODDS RATIO RESULTS", "LOGRANK OUTPUT", "STANDARDIZED MODEL RESULTS",
"WITHIN-LEVEL STANDARDIZED MODEL RESULTS FOR CLUSTER \\d+",
"R-SQUARE", "QUALITY OF NUMERICAL RESULTS", "QUALITY OF NUMERICAL RESULTS FOR .*", "TECHNICAL OUTPUT", "TECHNICAL \\d+ OUTPUT",
Expand All @@ -406,6 +407,7 @@ parse_into_sections <- function(outfiletext) {
"EQUALITY TESTS OF MEANS ACROSS CLASSES USING THE BCH PROCEDURE",
"EQUALITY TESTS OF MEANS ACROSS CLASSES USING THE 3-STEP PROCEDURE",
"EQUALITY TESTS OF MEANS/PROBABILITIES ACROSS CLASSES",
"TESTS OF CATEGORICAL LATENT VARIABLE MULTINOMIAL LOGISTIC REGRESSIONS USING", # 3-step regression
"THE FOLLOWING DATA SET\\(S\\) DID NOT RESULT IN A COMPLETED REPLICATION:",
"RESIDUAL OUTPUT", "RESIDUAL OUTPUT FOR THE.*",
"MODEL MODIFICATION INDICES", "MODIFICATION INDICES",
Expand Down Expand Up @@ -880,16 +882,14 @@ detectColumnNames <- function(filename, modelSection, sectionType="model_results
else if (identical(thisLine, c("StdYX", "StdY", "Std")) && identical (nextLine, c("Estimate", "Estimate", "Estimate")))
varNames <- c("param", "stdyx", "stdy", "std")

}
else if (sectionType == "mod_indices") {
} else if (sectionType == "mod_indices") {
nhl <- 1 #default for mod indices
if (identical(thisLine, c("M.I.", "E.P.C.", "Std", "E.P.C.", "StdYX", "E.P.C."))) {
varNames <- c("modV1", "operator", "modV2", "MI", "EPC", "Std_EPC", "StdYX_EPC")
} else if (identical(thisLine, c("M.I.", "E.P.C."))) {
varNames <- c("modV1", "operator", "modV2", "MI", "EPC")
}
}
else if (sectionType == "confidence_intervals") {
} else if (sectionType == "confidence_intervals") {
nhl <- 1 #default for CIs
if (identical(thisLine, c("Lower",".5%","Lower","2.5%","Lower","5%",
"Estimate","Upper","5%","Upper","2.5%","Upper",".5%" ))) {
Expand All @@ -898,8 +898,7 @@ detectColumnNames <- function(filename, modelSection, sectionType="model_results
"Estimate","Upper","2.5%","Upper",".5%" ))) {
varNames <- c("param", "low.5", "low2.5", "est", "up2.5", "up.5")
}
}
else if (sectionType == "auxe") { #currently unused
} else if (sectionType == "auxe") { #currently unused
nhl <- 1
if (identical(thisLine, c("Mean", "S.E.", "Mean", "S.E."))) {
varNames <- c("Mean", "SE", "Mean", "SE")
Expand Down

0 comments on commit 2a984b4

Please sign in to comment.