From 3a4779f6e7a123ea13fbd4a71a0086b62976704d Mon Sep 17 00:00:00 2001 From: Dao McGill <77309217+daomcgill@users.noreply.github.com> Date: Wed, 23 Oct 2024 15:15:13 -1000 Subject: [PATCH] i #313 Updates to Syntax Extractor - Added parameter for excluding licenses in class and file-level comment extraction - Implemented function extraction for function names with optional parameters - Implemented variable extraction with optional types - Added examples for removing empty comments and/or comment delimiters Signed-off-by: Dao McGill --- NAMESPACE | 2 + R/src.R | 182 +++++++++++++++++++++++++++++-- man/query_src_text_class_docs.Rd | 4 +- man/query_src_text_file_docs.Rd | 4 +- man/query_src_text_functions.Rd | 21 ++++ man/query_src_text_variables.Rd | 21 ++++ vignettes/syntax_extractor.Rmd | 160 ++++++++++++++++++++++++--- 7 files changed, 367 insertions(+), 27 deletions(-) create mode 100644 man/query_src_text_functions.Rd create mode 100644 man/query_src_text_variables.Rd diff --git a/NAMESPACE b/NAMESPACE index 4c6ae7d1..8aa0302d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -136,7 +136,9 @@ export(query_src_text) export(query_src_text_class_docs) export(query_src_text_class_names) export(query_src_text_file_docs) +export(query_src_text_functions) export(query_src_text_namespace) +export(query_src_text_variables) export(read_temporary_file) export(recolor_network_by_community) export(refresh_jira_issues) diff --git a/R/src.R b/R/src.R index 73a4699e..d3d88f7c 100644 --- a/R/src.R +++ b/R/src.R @@ -444,16 +444,17 @@ query_src_text_namespace <- function(srcml_path,srcml_filepath){ #' #' @param srcml_path The path to the srcML binary. #' @param srcml_filepath The path to the srcML file to be queried. +#' @param exclude_license If TRUE, excludes license-related comments. Defaults to TRUE. #' #' @return A data.table containing the file path and the file-level documentation comment. 
#' @export -query_src_text_file_docs <- function(srcml_path, srcml_filepath) { +query_src_text_file_docs <- function(srcml_path, srcml_filepath, exclude_license = TRUE) { # Expand paths srcml_path <- path.expand(srcml_path) srcml_filepath <- path.expand(srcml_filepath) - # This XPath query is designed to extract meaningful file-level documentation comments - # while excluding the common license blocks found at the top of many source files. + # XPath query with and without license comments, depending on exclude_license parameter + # This query is designed to extract meaningful file-level documentation comments. # It works as follows: # # 1. "//src:unit/src:comment": Selects all elements within the (file) node, @@ -462,8 +463,11 @@ query_src_text_file_docs <- function(srcml_path, srcml_filepath) { # 2. "[not(contains(., 'Licensed')) and not(contains(., 'license'))]": Filters out # comments that contain the text 'Licensed' or 'license', which are typically used # for licensing information like the Apache License. - - xpath_query <- "//src:unit/src:comment[not(contains(., 'Licensed')) and not(contains(., 'license'))]" + if (exclude_license) { + xpath_query <- "//src:unit/src:comment[not(contains(., 'Licensed')) and not(contains(., 'license'))]" + } else { + xpath_query <- "//src:unit/src:comment" + } # Execute query using the srcML binary srcml_output <- query_src_text(srcml_path, xpath_query, srcml_filepath) @@ -502,15 +506,17 @@ query_src_text_file_docs <- function(srcml_path, srcml_filepath) { #' #' @param srcml_path The path to the srcML binary. #' @param srcml_filepath The path to the srcML file to be queried. +#' @param exclude_license If TRUE, excludes license-related comments. Defaults to TRUE. #' #' @return A data.table containing the file path and the class-level documentation comment. 
#' @export -query_src_text_class_docs <- function(srcml_path, srcml_filepath) { +query_src_text_class_docs <- function(srcml_path, srcml_filepath, exclude_license = TRUE) { # Expand paths srcml_path <- path.expand(srcml_path) srcml_filepath <- path.expand(srcml_filepath) - # This XPath query is designed to extract meaningful class-level Javadoc comments + # XPath query with and without license comments, depending on exclude_license parameter + # This query is designed to extract meaningful class-level Javadoc comments # while avoiding license blocks. It works as follows: # # 1. "//src:class": Selects all elements representing class definitions @@ -524,8 +530,11 @@ query_src_text_class_docs <- function(srcml_path, srcml_filepath) { # 3. "/preceding-sibling::src:comment[@format='javadoc'][1]": Once the class is found, # this selects the first Javadoc comment that directly precedes the class definition # and is closest to it. - - xpath_query <- "//src:class[preceding-sibling::src:comment[@format='javadoc' and not(contains(., 'Licensed'))]]/preceding-sibling::src:comment[@format='javadoc'][1]" + if (exclude_license) { + xpath_query <- "//src:class[preceding-sibling::src:comment[@format='javadoc' and not(contains(., 'Licensed'))]]/preceding-sibling::src:comment[@format='javadoc'][1]" + } else { + xpath_query <- "//src:class[preceding-sibling::src:comment[@format='javadoc']]/preceding-sibling::src:comment[@format='javadoc'][1]" + } # Execute query srcml_output <- query_src_text(srcml_path, xpath_query, srcml_filepath) @@ -551,6 +560,161 @@ query_src_text_class_docs <- function(srcml_path, srcml_filepath) { } +#' Query srcML Variable Declarations +#' +#' This function extracts variable names and their types from the srcML XML file. +#' +#' @param srcml_path The path to the srcML binary. +#' @param srcml_filepath The path to the srcML file to be queried (see \code{\link{annotate_src_text}}). +#' @param var_type If TRUE, includes variable types. Defaults to TRUE. 
+#'
+#' @return A data.table containing filepath, variable_name, and optionally variable_type.
+#' @export
+query_src_text_variables <- function(srcml_path, srcml_filepath, var_type = TRUE) {
+  # Expand paths
+  srcml_path <- path.expand(srcml_path)
+  srcml_filepath <- path.expand(srcml_filepath)
+
+  # Namespace prefix mapping used by the XPath expressions evaluated in R
+  ns <- c(src = "http://www.srcML.org/srcML/src")
+
+  # Define the XPath query to select variable declarations
+  xpath_query <- "//src:decl_stmt/src:decl"
+
+  # Run the XPath query using the query_src_text function
+  srcml_output <- query_src_text(srcml_path, xpath_query, srcml_filepath)
+
+  # Parse the XML output
+  srcml_output <- XML::xmlTreeParse(paste(srcml_output, collapse = "\n"), useInternalNodes = TRUE)
+  srcml_root <- XML::xmlRoot(srcml_output)
+
+  # Children of the root node; each one is processed as one variable declaration
+  srcml_variable_nodes <- XML::xmlChildren(srcml_root)
+
+  # Nothing matched: return an empty table that still has the documented columns,
+  # instead of the zero-column table rbindlist() yields for an empty list
+  if (length(srcml_variable_nodes) == 0) {
+    if (var_type) {
+      return(data.table(
+        filepath = character(0),
+        variable_name = character(0),
+        variable_type = character(0)
+      ))
+    }
+    return(data.table(filepath = character(0), variable_name = character(0)))
+  }
+
+  # Function to parse each unit node and extract variable information
+  parse_variable_info <- function(unit) {
+    # Get the filepath from the 'filename' attribute. A missing attribute becomes NA
+    # (not NULL), so the filepath column is never silently dropped from the row
+    filepath <- XML::xmlGetAttr(unit, "filename", default = NA_character_)
+
+    # Extract the variable name: the first src:name element that is not nested inside
+    # a src:type element. Guard the empty case instead of indexing [[1]] blindly
+    var_name_nodes <- XML::getNodeSet(unit, ".//src:name[not(ancestor::src:type)]", namespaces = ns)
+    variable_name <- if (length(var_name_nodes) > 0) {
+      XML::xmlValue(var_name_nodes[[1]])
+    } else {
+      NA_character_
+    }
+
+    if (!var_type) {
+      # Return a data.table with filepath and variable_name
+      return(data.table(filepath = filepath, variable_name = variable_name))
+    }
+
+    # Extract the variable type from the src:name elements that are DIRECT children of
+    # src:type. The text value of such a name already contains its nested parts
+    # (generic arguments, qualifiers), so also collecting the nested src:name elements,
+    # as ".//src:type//src:name" does, repeats them in the result
+    var_type_nodes <- XML::getNodeSet(unit, ".//src:type/src:name", namespaces = ns)
+    variable_type <- if (length(var_type_nodes) > 0) {
+      paste(vapply(var_type_nodes, XML::xmlValue, character(1)), collapse = "")
+    } else {
+      # Typed NA keeps variable_type a character column even when every row lacks a type
+      NA_character_
+    }
+
+    # Return a data.table with filepath, variable_name, and variable_type
+    data.table(filepath = filepath, variable_name = variable_name, variable_type = variable_type)
+  }
+
+  # Apply the parsing function to each unit node
+  dt_variables <- rbindlist(lapply(srcml_variable_nodes, parse_variable_info))
+
+  return(dt_variables)
+}
+
+
+#' Query srcML Function Declarations
+#'
+#' This function extracts function names and optionally their parameters from the srcML XML file.
+#'
+#' @param srcml_path The path to the srcML binary.
+#' @param srcml_filepath The path to the srcML file to be queried (see \code{\link{annotate_src_text}}).
+#' @param include_parameters If TRUE, includes function parameters. Defaults to TRUE.
+#'
+#' @return A data.table containing filepath, function_name, and optionally parameters.
+#' @export
+query_src_text_functions <- function(srcml_path, srcml_filepath, include_parameters = TRUE) {
+  # Expand paths
+  # NOTE(review): srcml_path is never read in this function; the srcML file is parsed
+  # directly with XML::xmlParse() rather than queried through the srcML binary. The
+  # argument is kept so existing callers keep working.
+  srcml_filepath <- path.expand(srcml_filepath)
+
+  # Define the namespace mapping
+  ns <- c(src = "http://www.srcML.org/srcML/src")
+
+  # Parse the XML file
+  srcml_doc <- XML::xmlParse(srcml_filepath)
+  srcml_root <- XML::xmlRoot(srcml_doc)
+
+  # Get the list of function_decl nodes
+  # NOTE(review): only src:function_decl elements are selected; anything srcML marks up
+  # as src:function is not matched by this query - confirm this is intended.
+  function_nodes <- XML::getNodeSet(srcml_root, "//src:function_decl", namespaces = ns)
+
+  # Nothing matched: return an empty table that still has the documented columns,
+  # instead of the zero-column table rbindlist() yields for an empty list
+  if (length(function_nodes) == 0) {
+    if (include_parameters) {
+      return(data.table(
+        filepath = character(0),
+        function_name = character(0),
+        parameters = character(0)
+      ))
+    }
+    return(data.table(filepath = character(0), function_name = character(0)))
+  }
+
+  # Format one parameter declaration as "<type> <name>"
+  format_parameter <- function(param_node) {
+    # Names that are direct children of the first src:type element of the parameter.
+    # Their text already includes nested parts (generics, qualifiers), so descending
+    # into nested src:name elements would repeat those parts
+    type_name_nodes <- XML::getNodeSet(param_node, "(.//src:type)[1]/src:name", namespaces = ns)
+    param_type <- paste(vapply(type_name_nodes, XML::xmlValue, character(1)), collapse = "")
+
+    # Get parameter name
+    name_nodes <- XML::getNodeSet(param_node, "./src:name", namespaces = ns)
+    param_name <- if (length(name_nodes) > 0) {
+      XML::xmlValue(name_nodes[[1]])
+    } else {
+      ""
+    }
+
+    # Combine type and name; trimws() avoids a stray leading or trailing space when
+    # either part is empty
+    trimws(paste(param_type, param_name))
+  }
+
+  # Function to parse each function_decl node and extract information
+  parse_function_info <- function(unit) {
+    # Get the filepath from the nearest ancestor unit node; NA when there is no such
+    # node or it carries no 'filename' attribute
+    unit_ancestors <- XML::getNodeSet(unit, "ancestor::src:unit[1]", namespaces = ns)
+    filepath <- if (length(unit_ancestors) > 0) {
+      XML::xmlGetAttr(unit_ancestors[[1]], "filename", default = NA_character_)
+    } else {
+      NA_character_
+    }
+
+    # Extract the function name
+    function_name_nodes <- XML::getNodeSet(unit, "./src:name", namespaces = ns)
+    function_name <- if (length(function_name_nodes) > 0) {
+      XML::xmlValue(function_name_nodes[[1]])
+    } else {
+      NA_character_
+    }
+
+    if (!include_parameters) {
+      # Return a data.table without parameters
+      return(data.table(filepath = filepath, function_name = function_name))
+    }
+
+    # Extract parameters and combine them into a single string. With no parameters the
+    # empty character vector collapses to ""
+    param_nodes <- XML::getNodeSet(
+      unit,
+      "./src:parameter_list/src:parameter/src:decl",
+      namespaces = ns
+    )
+    parameters <- paste(vapply(param_nodes, format_parameter, character(1)), collapse = ", ")
+
+    # Return a data.table with parameters
+    data.table(
+      filepath = filepath,
+      function_name = function_name,
+      parameters = parameters
+    )
+  }
+
+  # Apply the parsing function to each function_decl node
+  dt_functions <- rbindlist(lapply(function_nodes, parse_function_info))
+
+  return(dt_functions)
+}
+
+
 
 ############## GoF Detection ##############
 
diff --git a/man/query_src_text_class_docs.Rd b/man/query_src_text_class_docs.Rd
index 744fdf4a..e62b16eb 100644
--- a/man/query_src_text_class_docs.Rd
+++ b/man/query_src_text_class_docs.Rd
@@ -4,12 +4,14 @@
 \alias{query_src_text_class_docs}
 \title{Query srcML Class Documentation Comments}
 \usage{
-query_src_text_class_docs(srcml_path, srcml_filepath)
+query_src_text_class_docs(srcml_path, srcml_filepath, exclude_license = TRUE)
 }
 \arguments{
 \item{srcml_path}{The path to the srcML binary.}
 
 \item{srcml_filepath}{The path to the srcML file to be queried.}
+
+\item{exclude_license}{If TRUE, excludes 
license-related comments. Defaults to TRUE.} } \value{ A data.table containing the file path and the class-level documentation comment. diff --git a/man/query_src_text_file_docs.Rd b/man/query_src_text_file_docs.Rd index fe7df883..9f31a19a 100644 --- a/man/query_src_text_file_docs.Rd +++ b/man/query_src_text_file_docs.Rd @@ -4,12 +4,14 @@ \alias{query_src_text_file_docs} \title{Query srcML File-Level Documentation Comments} \usage{ -query_src_text_file_docs(srcml_path, srcml_filepath) +query_src_text_file_docs(srcml_path, srcml_filepath, exclude_license = TRUE) } \arguments{ \item{srcml_path}{The path to the srcML binary.} \item{srcml_filepath}{The path to the srcML file to be queried.} + +\item{exclude_license}{If TRUE, excludes license-related comments. Defaults to TRUE.} } \value{ A data.table containing the file path and the file-level documentation comment. diff --git a/man/query_src_text_functions.Rd b/man/query_src_text_functions.Rd new file mode 100644 index 00000000..67b299ec --- /dev/null +++ b/man/query_src_text_functions.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/src.R +\name{query_src_text_functions} +\alias{query_src_text_functions} +\title{Query srcML Function Declarations} +\usage{ +query_src_text_functions(srcml_path, srcml_filepath, include_parameters = TRUE) +} +\arguments{ +\item{srcml_path}{The path to the srcML binary.} + +\item{srcml_filepath}{The path to the srcML file to be queried (see \code{\link{annotate_src_text}}).} + +\item{include_parameters}{If TRUE, includes function parameters. Defaults to TRUE.} +} +\value{ +A data.table containing filepath, function_name, and optionally parameters. +} +\description{ +This function extracts function names and optionally their parameters from the srcML XML file. 
+} diff --git a/man/query_src_text_variables.Rd b/man/query_src_text_variables.Rd new file mode 100644 index 00000000..45677264 --- /dev/null +++ b/man/query_src_text_variables.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/src.R +\name{query_src_text_variables} +\alias{query_src_text_variables} +\title{Query srcML Variable Declarations} +\usage{ +query_src_text_variables(srcml_path, srcml_filepath, var_type = TRUE) +} +\arguments{ +\item{srcml_path}{The path to the srcML binary.} + +\item{srcml_filepath}{The path to the srcML file to be queried (see \code{\link{annotate_src_text}}).} + +\item{var_type}{If TRUE, includes variable types. Defaults to TRUE.} +} +\value{ +A data.table containing filepath, variable_name, and optionally variable_type. +} +\description{ +This function extracts variable names and their types from the srcML XML file. +} diff --git a/vignettes/syntax_extractor.Rmd b/vignettes/syntax_extractor.Rmd index 1ad216e6..eff4e6e0 100644 --- a/vignettes/syntax_extractor.Rmd +++ b/vignettes/syntax_extractor.Rmd @@ -90,28 +90,45 @@ Here’s how you can set up the configuration: ``` {r eval=FALSE} # Load the project configuration -tool <- yaml::read_yaml("tools.yml") -conf <- yaml::read_yaml("conf/maven.yml") - -# Path to srcML binary -srcml_path <- tool[["srcml"]] +tool <- yaml::read_yaml("../tools.yml") +conf <- yaml::read_yaml("../conf/maven.yml") # Paths from config -srcml_path <- conf[["tool"]][["srcml_path"]] -src_folder <- conf[["version_control"]][["log"]] -srcml_output_path <- conf[["tool"]][["srcml_output_path"]] -srcml_filepath <- conf[["tool"]][["srcml_output_path"]] +srcml_path <- conf[["srcml"]][["srcml_path"]] +src_folder <- conf[["srcml"]][["src_folder"]] +# srcml_output_path <- conf[["tool"]][["srcml_output_path"]] +srcml_filepath <- conf[["srcml"]][["srcml_filepath"]] # Filters file_extensions <- conf[["filter"]][["keep_filepaths_ending_with"]] exclude_paths <- 
conf[["filter"]][["remove_filepaths_containing"]] -# Analysis topics -topics <- conf[["analysis"]][["topics"]] ``` +The configuration file read above (`conf/maven.yml`) has the following structure: + +```yaml +# Project Configuration File for Maven Project +project: + website: https://github.com/apache/maven + +filter: + # File extensions to include in the analysis + keep_filepaths_ending_with: + - .java + - .xml + # File paths to exclude from the analysis (e.g., test files) + remove_filepaths_containing: + - test + - example + +srcml: + # path to tool binary (change this location later) + srcml_path: /usr/local/bin/srcml + # path to the folder to analyze + src_folder: ../../maven/maven-artifact/src/main/java/org/apache/maven/artifact/ + srcml_filepath: ../../analysis/maven/srcml_output.xml +``` + + # Running the Syntax Extractor ## How the Syntax Extractor Works @@ -209,7 +226,7 @@ Explanation: #### Step 3: Testing the XPath Query Once you have written the XPath query, you can test it in Kaiaulu using the query_src_text() function. Here’s an example of how to use it: -``` {r eval=FALSE} +``` # Extracting function documentation comments function_comments <- query_src_text( srcml_path = "path/to/srcML", @@ -253,7 +270,7 @@ The query_src_text() function: Here’s an example of how you might use it: -``` {r eval=FALSE} +``` # Running an XPath query on the annotated XML query_result <- query_src_text( srcml_path = "path/to/srcML", @@ -316,12 +333,15 @@ This is especially useful in larger projects like Maven, where code is split acr ## Extract File-Level Documentation -To extract file-level documentation comments, we want to ignore the licensing information and focus on comments that describe the file’s purpose or contents. In Maven, file-class comments appear following package or import statements. +In Maven, file-level comments appear following package or import statements. + +- The function has an `exclude_license` parameter, which defaults to `TRUE`. Set it to `FALSE` if you want the extraction to include licensing information (e.g., the Apache license header found on all Maven files). 
``` {r eval=FALSE} file_docs <- query_src_text_file_docs( srcml_path = srcml_path, - srcml_filepath = srcml_filepath + srcml_filepath = srcml_filepath, + exclude_license = TRUE ) # Display the file-level documentation @@ -329,14 +349,79 @@ file_docs %>% gt() ``` +The generated output table can be post-processed with regular expressions using the stringi library. For example, the output contains empty comments (`/** */`). We can remove these to obtain a table with the empty comments filtered out: + +``` {r eval=FALSE} +dt_file_docs <- file_docs + +# Define a function to check if a comment is empty +is_comment_empty <- function(comment_text) { + # Remove comment markers and asterisks + cleaned_text <- stri_replace_all_regex( + comment_text, + pattern = "/\\*\\*?|\\*/|\\*", + replacement = "" + ) + # Trim whitespaces + cleaned_text <- stri_trim_both(cleaned_text) + # Check if the cleaned text is empty + return(nchar(cleaned_text) == 0) +} + +# Create a vector indicating which comments are empty +empty_comments <- sapply(dt_file_docs$file_docs, is_comment_empty) + +# Filter out the empty comments +dt_file_docs <- dt_file_docs[!empty_comments, ] + +# Display the filtered data +dt_file_docs %>% + gt() +``` + +Another example is removing the comment delimiters, leaving just the plain text: + +```{r eval=FALSE} +# Define a function to clean the comment text +clean_comment <- function(comment_text) { + # Remove the starting /** and ending */ + comment_text <- stri_replace_all_regex( + comment_text, + pattern = "^/\\*\\*?|\\*/$", + replacement = "" + ) + # Split the comment into lines + comment_lines <- unlist(stri_split_lines(comment_text)) + # Remove leading * and whitespace from each line + comment_lines <- stri_replace_all_regex( + comment_lines, + pattern = "^\\s*\\*\\s?", + replacement = "" + ) + # Combine the lines back into a single string + cleaned_text <- stri_trim_both(paste(comment_lines, collapse = "\n")) + return(cleaned_text) +} + +# Apply the 
cleaning function to the 'file_docs' column +dt_file_docs[, file_docs := sapply(file_docs, clean_comment)] + +# Display the cleaned data +dt_file_docs %>% + gt() +``` + ## Class-Level Documentation Class-level documentation comments are essential for understanding the purpose and functionality of a class. Meaningful comments about the class appear just before the <class> tag (class definitions). +- The function has an `exclude_license` parameter, which defaults to `TRUE`. Set it to `FALSE` if you want the extraction to include licensing information (e.g., the Apache license header found on all Maven files). + ``` {r eval=FALSE} class_docs <- query_src_text_class_docs( srcml_path = srcml_path, - srcml_filepath = srcml_filepath + srcml_filepath = srcml_filepath, + exclude_license = TRUE ) # Display the class-level documentation @@ -344,5 +429,48 @@ class_docs %>% gt() ``` +## Variable Extraction +`query_src_text_variables()` extracts the variables declared in the source code. Set the `var_type` parameter to `TRUE` (the default) to also include each variable's type. +``` {r eval=FALSE} +# Extract variables with types +variables_with_types <- query_src_text_variables( + srcml_path = srcml_path, + srcml_filepath = srcml_filepath, + var_type = TRUE +) + +# Display the result +variables_with_types %>% + gt() +``` + +## Extract Functions +`query_src_text_functions()` extracts function names. Set `include_parameters` to `TRUE` (the default) to also list each function's parameters, or to `FALSE` to return only the function names. + +```{r eval=FALSE} +# Extract functions with parameters +functions_with_params <- query_src_text_functions( + srcml_path = srcml_path, + srcml_filepath = srcml_filepath, + include_parameters = TRUE +) + +# Display the result +functions_with_params %>% + gt() + +``` +```{r eval=FALSE} +# Extract functions without parameters +functions_without_params <- query_src_text_functions( + srcml_path = srcml_path, + srcml_filepath = srcml_filepath, + include_parameters = FALSE +) + +# Display the result +functions_without_params %>% + gt() +```