From 0cbc1c8ce47aea1df955a37272a71749c497fe35 Mon Sep 17 00:00:00 2001 From: Rob Hyndman Date: Mon, 22 Jul 2024 18:17:18 +1000 Subject: [PATCH 01/15] Added dist_kde and helpers --- NAMESPACE | 11 +- R/density.R | 283 ++++++++++++++++++++++++------------------- man/as_kde.Rd | 37 ------ man/dist_kde.Rd | 33 +++++ man/kde_bandwidth.Rd | 2 +- 5 files changed, 204 insertions(+), 162 deletions(-) delete mode 100644 man/as_kde.Rd create mode 100644 man/dist_kde.Rd diff --git a/NAMESPACE b/NAMESPACE index 97cff5e..eccba49 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,16 +1,23 @@ # Generated by roxygen2: do not edit by hand S3method(autoplot,kde) +S3method(density,dist_kde) S3method(density_scores,default) S3method(density_scores,gam) S3method(density_scores,kde) S3method(density_scores,lm) -S3method(print,kde) +S3method(distributional::cdf,dist_kde) +S3method(distributional::covariance,dist_kde) +S3method(distributional::generate,dist_kde) +S3method(distributional::skewness,dist_kde) +S3method(format,dist_kde) +S3method(mean,dist_kde) +S3method(median,dist_kde) S3method(print,weird_conflicts) -export(as_kde) export(autoplot) export(chauvenet_anomalies) export(density_scores) +export(dist_kde) export(dixon_anomalies) export(fetch_wine_reviews) export(gg_bagplot) diff --git a/R/density.R b/R/density.R index 3ed5363..8867235 100644 --- a/R/density.R +++ b/R/density.R @@ -1,130 +1,90 @@ -#' @export -print.kde <- function(x, ...) { - kde <- !(is.null(x$h) & is.null(x$H)) - if (inherits(x$eval.points, "list")) { - d <- length(x$eval.points) - } else { - d <- 1L - } - if(!kde) { - cat("Density of: [", - paste0(x$names, collapse = ", "), "]\n", sep = "") - } else { - cat("Kernel density estimate of: [", - paste0(x$names, collapse = ", "), "]\n", sep = "") - } - if(d == 1L){ - ngrid <- length(x$eval.points) - } else { - ngrid <- lapply(x$eval.points, length) - } - cat("Computed on a grid of size", paste(ngrid, collapse = " x "), "\n") - if(kde) { - cat("Bandwidth: ") - if (d == 1L) { - cat("h = ", format(x$h, digits = 4)) - } else { - cat("H = \n") - cat(format(x$H, digits = 4), quote=FALSE) - } - } - invisible(x) -} - -#' Convert data frame or matrix object to kde class -#' -#' A density specified as a data frame or matrix can be converted to a kde object. -#' This is useful for plotting the density using \code{\link{autoplot.kde}}. -#' As kde objects are defined on a grid, the density values are interpolated -#' based on the points in the data frame or matrix. +#' Create kde distributional object #' -#' @param object Data frame or matrix with numerical columns, where one column -#' (specified by `density_column`) contains the density values, and the -#' remaining columns define the points at which the density is evaluated. -#' @param density_column Name of the column containing the density values, specified -#' as a bare expression. If missing, the last column is used. -#' @param ngrid Number of points to use for the grid in each dimension. Default is -#' 10001 for univariate densities and 101 for multivariate densities. -#' @param ... Additional arguments are ignored. -#' @return An object of class "kde" -#' @author Rob J Hyndman +#' Creates a distributional object from a kernel density estimate. The +#' `ks::kde()` function is used to compute the density. The cdf and quantiles +#' are consistent with the kde. +#' @details This is a replacement for `distributional::dist_kde()` which also +#' uses a kde when computing the density. However, it uses `stats::density()` on +#' the univariate margins, rather than computing a potentially multivariate +#' density using `ks::kde()`. Also, the quantiles and cdf of a `dist_kde` +#' object are not consistent with the density estimate. (e.g., the cdf is not +#' the integral of the density, and the quantiles are not the inverse of the +#' cdf). +#' @param y Numerical vector or matrix of data, or a list of such objects. If a +#' list is provided, then all objects should be of the same dimension. e.g., +#' all vectors, or all matrices with the same number of columns. +#' @param bandwidth Function to compute bandwidth. +#' @param multiplier to be used for bandwidth. Set to 2 for anomaly detection, and 1 otherwise. +#' @param ... Other arguments are passed to \code{\link[ks]{kde}}. #' @examples -#' tibble(y = seq(-4, 4, by = 0.01), density = dnorm(y)) |> -#' as_kde() +#' dist_kde(c(rnorm(200), rnorm(100, 5))) +#' #' @export -as_kde <- function(object, density_column, ngrid, ...) { - # Check columns of object are all numerical - object <- as.data.frame(object) - if(!all(sapply(object, is.numeric))) { - stop("All columns of object must be numeric") - } - # Check density_column is in object - if(missing(density_column)) { - density_column <- tail(colnames(object), 1) - } else { - density_column <- dplyr::as_label(dplyr::enquo(density_column)) +dist_kde <- function( + y, + bandwidth = kde_bandwidth, + k = 1, + ...) { + if (!is.list(y)) { + y <- list(y) } - if(!(density_column %in% colnames(object))) { - stop(paste(density_column, "not found")) + d <- unlist(lapply(y, function(u) NCOL(u))) + if (!all(d == d[1])) { + stop("All data sets must have the same dimension") } - # Separate points from density values - den <- object[[density_column]] - object[[density_column]] <- NULL - # Find the dimension - d <- NCOL(object) - if (d == 1L) { - if(missing(ngrid)) { - ngrid <- 10001 - } - # Interpolate density on finer grid - density <- list(eval.points = seq(min(object), max(object), length = ngrid)) - density$estimate <- approx(object[[1]], den, xout = density$eval.points)$y - } else if (d == 2L) { - if(missing(ngrid)) { - ngrid <- 101 + y <- vctrs::as_list_of(y, .ptype = vctrs::vec_ptype(y[[1]])) + # Estimate density using ks::kde + density <- lapply( + y, + function(u) { + # browser() + if (NCOL(u) == 1L) { + ks::kde(u, h = kde_bandwidth(u, method = "double"), ...) + } else { + ks::kde(u, H = kde_bandwidth(u, method = "double"), ...) + } } - density <- density_on_grid(as.matrix(object), den, ngrid) - } else { - stop("Only univariate and bivariate densities are supported") - } - # Find falpha using quantile method - missing <- is.na(density$estimate) - samplex <- sample(density$estimate[!missing], - size = 50000, replace = TRUE, - prob = density$estimate[!missing] ) - density$cont <- quantile(samplex, prob = (99:1) / 100, type = 8) - # Set missing values to 0 - density$estimate[is.na(density$estimate)] <- 0 - # Add names - density$names <- colnames(object) - if(is.null(density$names)) { - if(d == 1) { - density$names <- "y" - } else { - density$names <- paste0("y", seq_len(d)) - } - } - structure(density, class = "kde") + # Convert to distributional object + distributional::new_dist(kde = density, class = "dist_kde") } -density_on_grid <- function(y, fy, ngrid) { - y <- as.matrix(y) - if (NCOL(y) != 2L) { - stop("y must be a matrix with 2 columns") +#' @export +format.dist_kde <- function(x, ...) { + d <- vapply(x, function(u) { + NCOL(u$x) + }, integer(1L)) + ngrid <- vapply(x, function(u) { + length(u$eval.points) + }, integer(1L)) + if (d == 1) { + # Find bandwidth and convert to string + h <- vapply(x, function(u) { + u$h + }, numeric(1L)) + sprintf("kde[%sd, h=%.2g]", d, h) + } else { + # Create matrix as string + H <- c(vapply(x, function(u) { + u$H + }, numeric(d * d))) + Hstring <- "{" + for (i in seq(d)) { + Hstring <- paste0( + Hstring, "(", + paste0(sprintf("%.2g", H[(i - 1) * d + seq(d)]), + sep = "", collapse = ", " + ), + ")'" + ) + if (i < d) { + Hstring <- paste0(Hstring, ", ") + } + } + Hstring <- paste0(Hstring, "}") + sprintf("kde[%sd, H=%s]", d, Hstring) } - # Create grid of points - density <- list(eval.points = list( - seq(min(y[, 1]), max(y[, 1]), length = ngrid), - seq(min(y[, 2]), max(y[, 2]), length = ngrid) - )) - # Bivariate interpolation - grid <- expand.grid(density$eval.points[[1]], density$eval.points[[2]]) - ifun <- interpolation::interpfun(x = y[, 1], y = y[, 2], z = fy) - density$estimate <- ifun(grid[,1], grid[,2]) |> - matrix(nrow = ngrid) - return(density) } #' Robust bandwidth estimation for kernel density estimation @@ -142,7 +102,7 @@ density_on_grid <- function(y, fy, ngrid) { #' # Univariate bandwidth calculation #' kde_bandwidth(oldfaithful$duration) #' # Bivariate bandwidth calculation -#' kde_bandwidth(oldfaithful[,2:3]) +#' kde_bandwidth(oldfaithful[, 2:3]) #' @export kde_bandwidth <- function(data, method = c("robust_normal", "double", "lookout"), @@ -150,16 +110,16 @@ kde_bandwidth <- function(data, method = c("robust_normal", "double", "lookout") method <- match.arg(method) d <- NCOL(data) n <- NROW(data) - if(d > 1) { + if (d > 1) { # Find robust covariance matrix of data S <- robustbase::covOGK(data, sigmamu = robustbase::s_IQR)$cov } - if(method != "lookout") { + if (method != "lookout") { k <- ifelse(method == "double", 2, 1) - if(d == 1L) + if (d == 1L) { return(k * 1.06 * robustbase::s_IQR(data) * n^(-0.2)) - else { - return((4/(n * (d + 2)))^(2/(d + 4)) * k^2 * S) + } else { + return((4 / (n * (d + 2)))^(2 / (d + 4)) * k^2 * S) } } else { stop("Not yet implemented") @@ -189,3 +149,82 @@ kde_bandwidth <- function(data, method = c("robust_normal", "double", "lookout") } } +#' @export +density.dist_kde <- function(x, at, ..., na.rm = TRUE) { + d <- NCOL(x$kde$x) + if (d == 1) { + d <- stats::approx(x$kde$eval.points, x$kde$estimate, xout = at)$y + } else { + stop("Not yet implemented") + } + d[is.na(d)] <- 0 + return(d) +} + +#' @exportS3Method distributional::cdf +cdf.dist_kde <- function(x, q, ..., na.rm = TRUE) { + # Apply independently over sample variates + if (is.matrix(x$x)) { + return( + apply(x$x, 2, + function(x, ...) cdf.dist_kde(list(x = x), ...), + q = q, ..., na.rm = TRUE + ) + ) + } + # Integrate density + delta <- x$kde$eval.points[2] - x$kde$eval.points[1] + F <- cumsum(x$kde$estimate) * delta + stats::approx(x$kde$eval.points, F, xout = q, yleft = 0, yright = 1, ..., na.rm = na.rm)$y +} + +#' @exportS3Method distributional::generate +generate.dist_kde <- function(x, times, ...) { + d <- NCOL(x$kde$x) + if (d == 1) { + h <- x$kde$h + sample(x$kde$x, size = times, replace = TRUE) + rnorm(times, sd = h) + } else { + stop("Not yet implemented") + } +} + +#' @export +mean.dist_kde <- function(x, ...) { + if (is.matrix(x$kde$x)) { + apply(x$kde$x, 2, mean, ...) + } else { + mean(x$kde$x, ...) + } +} + +#' @export +median.dist_kde <- function(x, na.rm = FALSE, ...) { + if (is.matrix(x$kde$x)) { + apply(x$kde$x, 2, median, na.rm = na.rm, ...) + } else { + median(x$kde$x, na.rm = na.rm, ...) + } +} + +#' @exportS3Method distributional::covariance +covariance.dist_kde <- function(x, ...) { + if (is.matrix(x$kde$x)) { + stats::cov(x$kde$x, ...) + } else { + stats::var(x$kde$x, ...) + x$kde$h^2 + } +} + +#' @exportS3Method distributional::skewness +skewness.dist_kde <- function(x, ..., na.rm = FALSE) { + if (is.matrix(x$kde$x)) { + abort("Multivariate sample skewness is not yet implemented.") + } + n <- lengths(x$kde$x, use.names = FALSE) + x <- lapply(x$kde$x, function(.) . - mean(., na.rm = na.rm)) + sum_x2 <- vapply(x$kde$x, function(.) sum(.^2, na.rm = na.rm), numeric(1L), USE.NAMES = FALSE) + sum_x3 <- vapply(x$kde$x, function(.) sum(.^3, na.rm = na.rm), numeric(1L), USE.NAMES = FALSE) + y <- sqrt(n) * sum_x3 / (sum_x2^(3 / 2)) + y * ((1 - 1 / n))^(3 / 2) +} diff --git a/man/as_kde.Rd b/man/as_kde.Rd deleted file mode 100644 index d93827e..0000000 --- a/man/as_kde.Rd +++ /dev/null @@ -1,37 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/density.R -\name{as_kde} -\alias{as_kde} -\title{Convert data frame or matrix object to kde class} -\usage{ -as_kde(object, density_column, ngrid, ...) -} -\arguments{ -\item{object}{Data frame or matrix with numerical columns, where one column -(specified by \code{density_column}) contains the density values, and the -remaining columns define the points at which the density is evaluated.} - -\item{density_column}{Name of the column containing the density values, specified -as a bare expression. If missing, the last column is used.} - -\item{ngrid}{Number of points to use for the grid in each dimension. Default is -10001 for univariate densities and 101 for multivariate densities.} - -\item{...}{Additional arguments are ignored.} -} -\value{ -An object of class "kde" -} -\description{ -A density specified as a data frame or matrix can be converted to a kde object. -This is useful for plotting the density using \code{\link{autoplot.kde}}. -As kde objects are defined on a grid, the density values are interpolated -based on the points in the data frame or matrix. -} -\examples{ -tibble(y = seq(-4, 4, by = 0.01), density = dnorm(y)) |> - as_kde() -} -\author{ -Rob J Hyndman -} diff --git a/man/dist_kde.Rd b/man/dist_kde.Rd new file mode 100644 index 0000000..f6f976f --- /dev/null +++ b/man/dist_kde.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/density.R +\name{dist_kde} +\alias{dist_kde} +\title{Create kde distributional object} +\usage{ +dist_kde(y, bandwidth = kde_bandwidth, k = 1, ...) +} +\arguments{ +\item{y}{Numerical vector or matrix of data, or a list of such objects. If a +list is provided, then all objects should be of the same dimension. e.g., +all vectors, or all matrices with the same number of columns.} + +\item{bandwidth}{Function to compute bandwidth.} + +\item{...}{Other arguments are passed to \code{\link[ks]{kde}}.} + +\item{multiplier}{to be used for bandwidth. Set to 2 for anomaly detection, and 1 otherwise.} +} +\description{ +Creates a distributional object from a kernel density estimate. The +\code{ks::kde()} function is used to compute the density. The cdf and quantiles +are consistent with the kde. +} +\details{ +This is a replacement for \code{distributional::dist_kde()} which also +uses a kde when computing the density. However, it uses \code{stats::density()} on +the univariate margins, rather than computing a potentially multivariate +density using \code{ks::kde()}. Also, the quantiles and cdf of a \code{dist_kde} +object are not consistent with the density estimate. (e.g., the cdf is not +the integral of the density, and the quantiles are not the inverse of the +cdf). +} diff --git a/man/kde_bandwidth.Rd b/man/kde_bandwidth.Rd index 358a5b3..40cc84d 100644 --- a/man/kde_bandwidth.Rd +++ b/man/kde_bandwidth.Rd @@ -31,7 +31,7 @@ Robust bandwidth estimation for kernel density estimation # Univariate bandwidth calculation kde_bandwidth(oldfaithful$duration) # Bivariate bandwidth calculation -kde_bandwidth(oldfaithful[,2:3]) +kde_bandwidth(oldfaithful[, 2:3]) } \author{ Rob J Hyndman From deaf714d88cb05a1c017cb6bf8dda05d509bffa3 Mon Sep 17 00:00:00 2001 From: Rob Hyndman Date: Mon, 22 Jul 2024 18:17:36 +1000 Subject: [PATCH 02/15] pkgdown heading case --- _pkgdown.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_pkgdown.yml b/_pkgdown.yml index c669cf7..be72f3f 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -55,7 +55,7 @@ reference: - as_kde - autoplot.kde - kde_bandwidth - - title: Functions for Graphics and HDR calculations + - title: Functions for graphics and HDR calculations contents: - gg_bagplot - gg_hdrboxplot From 69155e6e3b1eb95595f6e6c9c51be9c3bcc7b3f7 Mon Sep 17 00:00:00 2001 From: Rob Hyndman Date: Mon, 22 Jul 2024 18:17:54 +1000 Subject: [PATCH 03/15] Improved docs --- man/score_tail_prob.Rd | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/man/score_tail_prob.Rd b/man/score_tail_prob.Rd index faaeacc..efa0d72 100644 --- a/man/score_tail_prob.Rd +++ b/man/score_tail_prob.Rd @@ -43,7 +43,9 @@ These probabilities may be computed in three different ways. \item Given a specified \code{distribution}. \item Using a Generalized Pareto Distribution fitted to the most extreme values of \code{g} (those above the \code{threshold_probability} quantile). This -option is used if \code{GPD = TRUE}. +option is used if \code{GPD = TRUE}. For \code{g} values with tail probability +greater than \code{threshold_probability}, the value of \code{threshold_probability} +is returned. \item Empirically as the proportion of values above \code{g}. This option is used when \code{GPD = FALSE} and \code{distribution = NULL}. } From 28ae1bf9270079bca51266a15ee1f0cb11d4c205 Mon Sep 17 00:00:00 2001 From: Rob Hyndman Date: Tue, 23 Jul 2024 18:18:17 +1000 Subject: [PATCH 04/15] Allow numeric bandwidths in dist_kde --- R/density.R | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/R/density.R b/R/density.R index 8867235..2679e48 100644 --- a/R/density.R +++ b/R/density.R @@ -13,19 +13,17 @@ #' @param y Numerical vector or matrix of data, or a list of such objects. If a #' list is provided, then all objects should be of the same dimension. e.g., #' all vectors, or all matrices with the same number of columns. -#' @param bandwidth Function to compute bandwidth. -#' @param multiplier to be used for bandwidth. Set to 2 for anomaly detection, and 1 otherwise. -#' @param ... Other arguments are passed to \code{\link[ks]{kde}}. +#' @param bandwidth Either a function to compute bandwidths from data, +#' or a numerical value (for univariate data), or a numerical matrix (for +#' multivariate data). +#' @param kde_options A named list containing arguments for \code{\link[ks]{kde}}. +#' @param ... Other arguments are passed to the `bandwidth` function. #' @examples -#' dist_kde(c(rnorm(200), rnorm(100, 5))) +#' dist_kde(c(rnorm(200), rnorm(100, 5)), method = "double") #' #' @export -dist_kde <- function( - y, - bandwidth = kde_bandwidth, - k = 1, - ...) { +dist_kde <- function(y, bandwidth = kde_bandwidth, kde_options = NULL, ...) { if (!is.list(y)) { y <- list(y) } @@ -38,11 +36,13 @@ dist_kde <- function( density <- lapply( y, function(u) { - # browser() + if(!is.numeric(bandwidth)) { + bandwidth <- bandwidth(u, ...) + } if (NCOL(u) == 1L) { - ks::kde(u, h = kde_bandwidth(u, method = "double"), ...) + do.call(ks::kde, c(list(x = u, h = bandwidth), kde_options)) } else { - ks::kde(u, H = kde_bandwidth(u, method = "double"), ...) + do.call(ks::kde, c(list(x = u, H = bandwidth), kde_options)) } } ) @@ -209,22 +209,24 @@ median.dist_kde <- function(x, na.rm = FALSE, ...) { #' @exportS3Method distributional::covariance covariance.dist_kde <- function(x, ...) { + n <- NROW(x$kde$x) if (is.matrix(x$kde$x)) { - stats::cov(x$kde$x, ...) + stop("Multivariate kde covariance is not yet implemented.") + stats::cov(x$kde$x, ...) # Needs adjustment } else { - stats::var(x$kde$x, ...) + x$kde$h^2 + (n-1) / n * stats::var(x$kde$x, ...) + x$kde$h^2 } } #' @exportS3Method distributional::skewness skewness.dist_kde <- function(x, ..., na.rm = FALSE) { + n <- NROW(x$kde$x) if (is.matrix(x$kde$x)) { - abort("Multivariate sample skewness is not yet implemented.") + stop("Multivariate sample skewness is not yet implemented.") + } else { + m1 <- mean(x$kde$x) # E(X) + m2 <- x$kde$h^2 + mean(x$kde$x^2) # E(X^2) + m3 <- 3 * x$kde$h^2 * m1 - mean(x$kde$x^3) # E(X^3) + m3 - 3*m1*m2 + 2*m1^2 } - n <- lengths(x$kde$x, use.names = FALSE) - x <- lapply(x$kde$x, function(.) . - mean(., na.rm = na.rm)) - sum_x2 <- vapply(x$kde$x, function(.) sum(.^2, na.rm = na.rm), numeric(1L), USE.NAMES = FALSE) - sum_x3 <- vapply(x$kde$x, function(.) sum(.^3, na.rm = na.rm), numeric(1L), USE.NAMES = FALSE) - y <- sqrt(n) * sum_x3 / (sum_x2^(3 / 2)) - y * ((1 - 1 / n))^(3 / 2) } From 28f8db356def1dd27ebe50470ad554c6137d8aca Mon Sep 17 00:00:00 2001 From: Rob Hyndman Date: Wed, 24 Jul 2024 11:49:28 +1000 Subject: [PATCH 05/15] Simpler arguments for dist_kde --- NAMESPACE | 1 + R/density.R | 38 ++++++++++++++++++++------------------ 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index eccba49..0ad78cf 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -9,6 +9,7 @@ S3method(density_scores,lm) S3method(distributional::cdf,dist_kde) S3method(distributional::covariance,dist_kde) S3method(distributional::generate,dist_kde) +S3method(distributional::kurtosis,dist_kde) S3method(distributional::skewness,dist_kde) S3method(format,dist_kde) S3method(mean,dist_kde) diff --git a/R/density.R b/R/density.R index 2679e48..39be5b3 100644 --- a/R/density.R +++ b/R/density.R @@ -1,29 +1,24 @@ #' Create kde distributional object #' -#' Creates a distributional object from a kernel density estimate. The +#' Creates a distributional object using a kernel density estimate. The #' `ks::kde()` function is used to compute the density. The cdf and quantiles #' are consistent with the kde. -#' @details This is a replacement for `distributional::dist_kde()` which also -#' uses a kde when computing the density. However, it uses `stats::density()` on -#' the univariate margins, rather than computing a potentially multivariate -#' density using `ks::kde()`. Also, the quantiles and cdf of a `dist_kde` -#' object are not consistent with the density estimate. (e.g., the cdf is not -#' the integral of the density, and the quantiles are not the inverse of the -#' cdf). +#' #' @param y Numerical vector or matrix of data, or a list of such objects. If a #' list is provided, then all objects should be of the same dimension. e.g., #' all vectors, or all matrices with the same number of columns. -#' @param bandwidth Either a function to compute bandwidths from data, -#' or a numerical value (for univariate data), or a numerical matrix (for -#' multivariate data). +#' @param h Bandwidth for univariate distribution. If `NULL`, the +#' \code{\link{kde_bandwidth}} function is used. +#' @param H Bandwidth matrix for multivariate distribution. If `NULL`, the +#' \code{\link{kde_bandwidth}} function is used. #' @param kde_options A named list containing arguments for \code{\link[ks]{kde}}. -#' @param ... Other arguments are passed to the `bandwidth` function. +#' @param ... Other arguments are passed to the \code{\link{kde_bandwidth}} function. #' @examples #' dist_kde(c(rnorm(200), rnorm(100, 5)), method = "double") #' #' @export -dist_kde <- function(y, bandwidth = kde_bandwidth, kde_options = NULL, ...) { +dist_kde <- function(y, h = NULL, H = NULL, kde_options = NULL, ...) { if (!is.list(y)) { y <- list(y) } @@ -36,13 +31,20 @@ dist_kde <- function(y, bandwidth = kde_bandwidth, kde_options = NULL, ...) { density <- lapply( y, function(u) { - if(!is.numeric(bandwidth)) { - bandwidth <- bandwidth(u, ...) - } if (NCOL(u) == 1L) { - do.call(ks::kde, c(list(x = u, h = bandwidth), kde_options)) + if (is.null(h)) { + if (!is.null(H)) { + h <- sqrt(H) + } else { + h <- kde_bandwidth(u, ...) + } + } + do.call(ks::kde, c(list(x = u, h = h), kde_options)) } else { - do.call(ks::kde, c(list(x = u, H = bandwidth), kde_options)) + if (is.null(H)) { + H <- kde_bandwidth(u, ...) + } + do.call(ks::kde, c(list(x = u, H = H), kde_options)) } } ) From f11e5f55d58be9e5e4bb646d44cdac1a96204745 Mon Sep 17 00:00:00 2001 From: Rob Hyndman Date: Wed, 24 Jul 2024 11:49:54 +1000 Subject: [PATCH 06/15] Cleaner code --- R/density.R | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/R/density.R b/R/density.R index 39be5b3..396be8f 100644 --- a/R/density.R +++ b/R/density.R @@ -165,7 +165,7 @@ density.dist_kde <- function(x, at, ..., na.rm = TRUE) { #' @exportS3Method distributional::cdf cdf.dist_kde <- function(x, q, ..., na.rm = TRUE) { - # Apply independently over sample variates + # Apply independently over margins if (is.matrix(x$x)) { return( apply(x$x, 2, @@ -184,8 +184,7 @@ cdf.dist_kde <- function(x, q, ..., na.rm = TRUE) { generate.dist_kde <- function(x, times, ...) { d <- NCOL(x$kde$x) if (d == 1) { - h <- x$kde$h - sample(x$kde$x, size = times, replace = TRUE) + rnorm(times, sd = h) + sample(x$kde$x, size = times, replace = TRUE) + rnorm(times, sd = x$kde$h) } else { stop("Not yet implemented") } @@ -216,7 +215,7 @@ covariance.dist_kde <- function(x, ...) { stop("Multivariate kde covariance is not yet implemented.") stats::cov(x$kde$x, ...) # Needs adjustment } else { - (n-1) / n * stats::var(x$kde$x, ...) + x$kde$h^2 + (n - 1) / n * stats::var(x$kde$x, ...) + x$kde$h^2 } } From c18518a118c0dedb23145815acb30eff1a1c9024 Mon Sep 17 00:00:00 2001 From: Rob Hyndman Date: Wed, 24 Jul 2024 11:50:28 +1000 Subject: [PATCH 07/15] Simpler skewness calculation and added kurtosis --- R/density.R | 20 ++++++++++++++------ man/dist_kde.Rd | 25 ++++++++++++------------- 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/R/density.R b/R/density.R index 396be8f..a01f3bd 100644 --- a/R/density.R +++ b/R/density.R @@ -221,13 +221,21 @@ covariance.dist_kde <- function(x, ...) { #' @exportS3Method distributional::skewness skewness.dist_kde <- function(x, ..., na.rm = FALSE) { - n <- NROW(x$kde$x) if (is.matrix(x$kde$x)) { - stop("Multivariate sample skewness is not yet implemented.") + stop("Multivariate skewness is not yet implemented.") + } else { + mean((x$kde$x - mean(x$kde$x))^3) / variance(x)^1.5 + } +} + +#' @exportS3Method distributional::kurtosis +# Excess kurtosis for consistency with distributional package +kurtosis.dist_kde <- function(x, ..., na.rm = FALSE) { + if (is.matrix(x$kde$x)) { + stop("Multivariate kurtosis is not yet implemented.") } else { - m1 <- mean(x$kde$x) # E(X) - m2 <- x$kde$h^2 + mean(x$kde$x^2) # E(X^2) - m3 <- 3 * x$kde$h^2 * m1 - mean(x$kde$x^3) # E(X^3) - m3 - 3*m1*m2 + 2*m1^2 + h <- x$kde$h + v <- variance(x) + (mean((x$kde$x - mean(x$kde$x))^4) + 6 * h^2 * v - 3 * h^4) / v^2 - 3 } } diff --git a/man/dist_kde.Rd b/man/dist_kde.Rd index f6f976f..cb6afbc 100644 --- a/man/dist_kde.Rd +++ b/man/dist_kde.Rd @@ -4,30 +4,29 @@ \alias{dist_kde} \title{Create kde distributional object} \usage{ -dist_kde(y, bandwidth = kde_bandwidth, k = 1, ...) +dist_kde(y, h = NULL, H = NULL, kde_options = NULL, ...) } \arguments{ \item{y}{Numerical vector or matrix of data, or a list of such objects. If a list is provided, then all objects should be of the same dimension. e.g., all vectors, or all matrices with the same number of columns.} -\item{bandwidth}{Function to compute bandwidth.} +\item{h}{Bandwidth for univariate distribution. If \code{NULL}, the +\code{\link{kde_bandwidth}} function is used.} -\item{...}{Other arguments are passed to \code{\link[ks]{kde}}.} +\item{H}{Bandwidth matrix for multivariate distribution. If \code{NULL}, the +\code{\link{kde_bandwidth}} function is used.} -\item{multiplier}{to be used for bandwidth. Set to 2 for anomaly detection, and 1 otherwise.} +\item{kde_options}{A named list containing arguments for \code{\link[ks]{kde}}.} + +\item{...}{Other arguments are passed to the \code{\link{kde_bandwidth}} function.} } \description{ -Creates a distributional object from a kernel density estimate. The +Creates a distributional object using a kernel density estimate. The \code{ks::kde()} function is used to compute the density. The cdf and quantiles are consistent with the kde. } -\details{ -This is a replacement for \code{distributional::dist_kde()} which also -uses a kde when computing the density. However, it uses \code{stats::density()} on -the univariate margins, rather than computing a potentially multivariate -density using \code{ks::kde()}. Also, the quantiles and cdf of a \code{dist_kde} -object are not consistent with the density estimate. (e.g., the cdf is not -the integral of the density, and the quantiles are not the inverse of the -cdf). +\examples{ +dist_kde(c(rnorm(200), rnorm(100, 5)), method = "double") + } From 7e42853020a2fa2037780f136b2e8a1a72ec74fa Mon Sep 17 00:00:00 2001 From: Rob Hyndman Date: Wed, 24 Jul 2024 16:48:02 +1000 Subject: [PATCH 08/15] Added quantile.dist_kde --- NAMESPACE | 1 + R/density.R | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index 0ad78cf..b85e81d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -15,6 +15,7 @@ S3method(format,dist_kde) S3method(mean,dist_kde) S3method(median,dist_kde) S3method(print,weird_conflicts) +S3method(quantile,dist_kde) export(autoplot) export(chauvenet_anomalies) export(density_scores) diff --git a/R/density.R b/R/density.R index a01f3bd..b6cefa3 100644 --- a/R/density.R +++ b/R/density.R @@ -180,6 +180,15 @@ cdf.dist_kde <- function(x, q, ..., na.rm = TRUE) { stats::approx(x$kde$eval.points, F, xout = q, yleft = 0, yright = 1, ..., na.rm = na.rm)$y } +#' @export +quantile.dist_kde <- function (x, p, ..., na.rm = TRUE) { + # Integrate density + delta <- x$kde$eval.points[2] - x$kde$eval.points[1] + F <- cumsum(x$kde$estimate) * delta + stats::approx(F, x$kde$eval.points, xout = p, yleft = min(x$kde$eval.points), + yright = max(x$kde$eval.points), ties = mean, ..., na.rm = na.rm)$y +} + #' @exportS3Method distributional::generate generate.dist_kde <- function(x, times, ...) { d <- NCOL(x$kde$x) From 876c24c7ffbae3b2cc7533ba8abbf8a2fb621a1e Mon Sep 17 00:00:00 2001 From: Rob Hyndman Date: Wed, 24 Jul 2024 16:48:52 +1000 Subject: [PATCH 09/15] Added autoplot.distribution --- NAMESPACE | 1 + R/autoplot.R | 200 +++++++++++++++++++++++++++++++++++ R/ggplot.R | 2 +- man/autoplot.distribution.Rd | 79 ++++++++++++++ man/autoplot.kde.Rd | 4 +- 5 files changed, 283 insertions(+), 3 deletions(-) create mode 100644 R/autoplot.R create mode 100644 man/autoplot.distribution.Rd diff --git a/NAMESPACE b/NAMESPACE index b85e81d..8ec67db 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,6 +12,7 @@ S3method(distributional::generate,dist_kde) S3method(distributional::kurtosis,dist_kde) S3method(distributional::skewness,dist_kde) S3method(format,dist_kde) +S3method(ggplot2::autoplot,distribution) S3method(mean,dist_kde) S3method(median,dist_kde) S3method(print,weird_conflicts) diff --git a/R/autoplot.R b/R/autoplot.R new file mode 100644 index 0000000..df6ee6a --- /dev/null +++ b/R/autoplot.R @@ -0,0 +1,200 @@ +#' Produce ggplot of densities from distributional objects in 1 or 2 dimensions +#' +#' @details +#' This function produces a ggplot of a density from a distributional object. +#' For univariate densities, it produces a line plot of the density function, with +#' an optional ribbon showing some highest density regions (HDRs) and/or the observations. +#' For bivariate densities, it produces a contour plot of the density function, with +#' the observations optionally shown as points. +#' The mode can also be drawn as a point with the HDRs. +#' For bivariate densities, the combination of `fill = TRUE`, `show_points = TRUE`, +#' `show_mode = TRUE`, and `prob = c(0.5, 0.99)` is equivalent to an HDR boxplot. +#' For univariate densities, the combination of `show_hdr = TRUE`, `show_points = TRUE`, +#' `show_mode = TRUE`, and `prob = c(0.5, 0.99)` is equivalent to an HDR boxplot. +#' +#' @param object distribution object from the distributional package or +#' \code{\link{dist_kde}}() +#' @param prob Probability of the HDR contours to be drawn (for a bivariate plot only). +#' @param fill If `TRUE`, and the density is bivariate, the bivariate contours +#' are shown as filled regions rather than lines. +#' @param show_hdr If `TRUE`, and the density is univariate, then the HDR regions +#' specified by `prob` are shown as a ribbon below the density. +#' @param show_points If `TRUE`, then individual points are plotted. +#' @param show_mode If `TRUE`, then the mode of the distribution is shown. +#' @param show_lookout If `TRUE`, then the observations with lookout probabilities less than 0.05 are shown in red. +#' @param color Color used for mode and HDR contours. If `palette = hdr_palette`, +#' it is also used as the basis for HDR regions. +#' @param palette Color palette function to use for HDR filled regions +#' (if `fill` is `TRUE` or `show_hdr` is `TRUE`). +#' @param alpha Transparency of points. When `fill` is `FALSE`, defaults to +#' min(1, 1000/n), where n is the number of observations. Otherwise, set to 1. +#' @param ... Additional arguments are currently ignored. +#' @return A ggplot object. +#' @author Rob J Hyndman +#' @examples +#' # Univariate density +#' dist_kde(c(rnorm(500), rnorm(500, 4, 1.5))) |> +#' autoplot(show_hdr = TRUE, prob = c(0.5, 0.95), color = "#c14b14") +#' ymat <- tibble(y1 = rnorm(5000), y2 = y1 + rnorm(5000)) +#' ymat |> +#' dist_kde(ymat) |> +#' autoplot(show_points = TRUE, alpha = 0.1, fill = TRUE) +#' @exportS3Method ggplot2::autoplot + +autoplot.distribution <- function( + object, prob = seq(9) / 10, fill = FALSE, + show_hdr = FALSE, show_points = FALSE, show_mode = FALSE, show_lookout = FALSE, + color = "#00659e", palette = hdr_palette, alpha = ifelse(fill, 1, min(1, 1000 / NROW(object$x))), + ...) { + if (min(prob) <= 0 | max(prob) >= 1) { + stop("prob must be between 0 and 1") + } + if (identical(palette, hdr_palette)) { + colors <- hdr_palette(color = color, prob = prob) + } else { + colors <- palette(n = length(prob) + 1) + } + dist <- family(object) + no_groups <- length(dist) == 1L + # Names of distributions + dist_names <- format(object) + if(length(unique(dist_names)) != length(dist_names)) { + # Find duplicates + dup <- duplicated(dist_names) + dist_names[dup] <- paste0(dist_names[dup], "a") + } + + # Extract data + x <- lapply(vctrs::vec_data(object), function(u) u$kde$x) + names(x) <- dist_names + # Check if multivariate + d <- unlist(lapply(x, NCOL)) + d[dist != "kde"] <- 1 + if (any(d > 2)) { + stop("Only univariate and bivariate densities are supported") + } else if (any(d == 2)) { + stop("Bivariate plotting not yet implemented") + } + if (show_points) { + if (all(lengths(x) == 0)) { + warning("No observations found") + show_points <- FALSE + } + } + # Set up data frame for densities + range_x <- range(unlist(quantile(object, p = c(0.002, 0.998)))) + if (show_points) { + range_x <- range(range_x, unlist(x)) + } + y <- seq(min(range_x), max(range_x), length = 501) + df <- c(list(y), density(object, at = y)) + names(df) <- c("y", dist_names) + df <- tibble::as_tibble(df) |> + tidyr::pivot_longer( + cols = -y, names_to = "Distribution", + values_to = "Density" + ) + maxden <- max(df$Density) + + # Add density lines to plot + p <- ggplot(df) + if (no_groups) { + p <- p + geom_line(aes(x = y, y = Density)) + } else { + p <- p + geom_line(aes(x = y, y = Density, color = Distribution)) + } + + # Set up HDRs if needed + if (show_hdr) { + prob <- sort(unique(prob), decreasing = TRUE) + hdr <- purrr::map_dfr(prob, function(u) { + hdri <- hdr(object, size = u * 100) + tibble( + level = u*100, + Distribution = dist_names, + lower = vctrs::field(hdri, "lower"), + upper = vctrs::field(hdri, "upper") + ) |> + tidyr::unnest(c(lower, upper)) + }) + } + + # Show observations in bottom margin + if (show_points) { + if(show_hdr) { + # Only show points outside largest HDR + fi <- purrr::map2(object, x, function(u,x) { + if(is.null(x)) { + return(NULL) + } else { + density(u, at = x)[[1]] + } + }) + threshold <- lapply(fi, quantile, prob = 1 - max(prob), type = 8) + idx <- purrr::map2(fi, threshold, function(f,t){which(f < t)}) + x <- purrr::map2(x, idx, function(x,i) x[i]) + } + # Drop distributions with no data + some_data <- names(x)[lengths(x) > 0] + x <- x[some_data] + show_x <- tibble::as_tibble(x) |> + tidyr::pivot_longer( + cols = everything(), names_to = "Distribution", + values_to = "x" + ) + if (no_groups) { + a <- aes(x = x, y = -maxden * as.numeric(factor(Distribution)) / 40) + } else { + a <- aes(x = x, y = -maxden * (as.numeric(factor(Distribution))- 0.5) / 20, + color = Distribution) + } + p <- p + ggplot2::geom_point(data = show_x, mapping = a, alpha = alpha) + } + if (show_lookout) { + stop("Not yet implemented") + if (!show_hdr) { + kscores <- calc_kde_scores(object$x, h = object$h, ...) + } + lookout_highlight <- lookout(density_scores = kscores$scores, loo_scores = kscores$loo) < 0.05 + lookout <- tibble(x = object$x[lookout_highlight]) + p <- p + ggplot2::geom_point( + data = lookout, mapping = aes(x = x, y = -maxden / 40), + color = "#ff0000" + ) + } + + if (show_hdr) { + p <- p + + ggplot2::geom_rect( + data = hdr |> mutate(id = row_number()), + aes( + xmin = lower, xmax = upper, + ymin = -maxden * as.numeric(factor(Distribution)) / 20, + ymax = -maxden * (as.numeric(factor(Distribution))-1) / 20, + fill = factor(level, levels = sort(prob * 100)), + color = Distribution, + ) + ) + + scale_fill_manual( + breaks = rev(prob * 100), + values = colors[-1], + labels = paste0(100 * rev(prob), "%") + ) + + ggplot2::guides(fill = ggplot2::guide_legend(title = "HDR coverage")) + } + if (show_mode) { + modes <- df |> + dplyr::group_by(Distribution) |> + dplyr::filter(Density == max(Density)) |> + select(mode = y, Distribution) + if (no_groups) { + a <- aes(x = mode, y = -maxden / 20) + } else { + a <- aes(x = mode, y = -maxden * (as.numeric(factor(Distribution))- 0.5) / 20, color = Distribution) + } + p <- p + + ggplot2::geom_point(data = modes, mapping = a, shape = 17, size = 3) + } + + return(p) +} diff --git a/R/ggplot.R b/R/ggplot.R index a246fa1..ecdb715 100644 --- a/R/ggplot.R +++ b/R/ggplot.R @@ -1,4 +1,4 @@ -#' Produce ggplot of densities in 1 or 2 dimensions +#' Produce ggplot of densities from kde objects in 1 or 2 dimensions #' #' @details #' This function produces a ggplot of the density estimate produced by `ks::kde()`. diff --git a/man/autoplot.distribution.Rd b/man/autoplot.distribution.Rd new file mode 100644 index 0000000..b5e9339 --- /dev/null +++ b/man/autoplot.distribution.Rd @@ -0,0 +1,79 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/autoplot.R +\name{autoplot.distribution} +\alias{autoplot.distribution} +\title{Produce ggplot of densities from distributional objects in 1 or 2 dimensions} +\usage{ +\method{autoplot}{distribution}( + object, + prob = seq(9)/10, + fill = FALSE, + show_hdr = FALSE, + show_points = FALSE, + show_mode = FALSE, + show_lookout = FALSE, + color = "#00659e", + palette = hdr_palette, + alpha = ifelse(fill, 1, min(1, 1000/NROW(object$x))), + ... +) +} +\arguments{ +\item{object}{distribution object from the distributional package or +\code{\link{dist_kde}}()} + +\item{prob}{Probability of the HDR contours to be drawn (for a bivariate plot only).} + +\item{fill}{If \code{TRUE}, and the density is bivariate, the bivariate contours +are shown as filled regions rather than lines.} + +\item{show_hdr}{If \code{TRUE}, and the density is univariate, then the HDR regions +specified by \code{prob} are shown as a ribbon below the density.} + +\item{show_points}{If \code{TRUE}, then individual points are plotted.} + +\item{show_mode}{If \code{TRUE}, then the mode of the distribution is shown.} + +\item{show_lookout}{If \code{TRUE}, then the observations with lookout probabilities less than 0.05 are shown in red.} + +\item{color}{Color used for mode and HDR contours. If \code{palette = hdr_palette}, +it is also used as the basis for HDR regions.} + +\item{palette}{Color palette function to use for HDR filled regions +(if \code{fill} is \code{TRUE} or \code{show_hdr} is \code{TRUE}).} + +\item{alpha}{Transparency of points. When \code{fill} is \code{FALSE}, defaults to +min(1, 1000/n), where n is the number of observations. Otherwise, set to 1.} + +\item{...}{Additional arguments are currently ignored.} +} +\value{ +A ggplot object. +} +\description{ +Produce ggplot of densities from distributional objects in 1 or 2 dimensions +} +\details{ +This function produces a ggplot of a density from a distributional object. +For univariate densities, it produces a line plot of the density function, with +an optional ribbon showing some highest density regions (HDRs) and/or the observations. +For bivariate densities, it produces a contour plot of the density function, with +the observations optionally shown as points. +The mode can also be drawn as a point with the HDRs. +For bivariate densities, the combination of \code{fill = TRUE}, \code{show_points = TRUE}, +\code{show_mode = TRUE}, and \code{prob = c(0.5, 0.99)} is equivalent to an HDR boxplot. +For univariate densities, the combination of \code{show_hdr = TRUE}, \code{show_points = TRUE}, +\code{show_mode = TRUE}, and \code{prob = c(0.5, 0.99)} is equivalent to an HDR boxplot. +} +\examples{ +# Univariate density +dist_kde(c(rnorm(500), rnorm(500, 4, 1.5))) |> + autoplot(show_hdr = TRUE, prob = c(0.5, 0.95), color = "#c14b14") +ymat <- tibble(y1 = rnorm(5000), y2 = y1 + rnorm(5000)) +ymat |> + dist_kde(ymat) |> + autoplot(show_points = TRUE, alpha = 0.1, fill = TRUE) +} +\author{ +Rob J Hyndman +} diff --git a/man/autoplot.kde.Rd b/man/autoplot.kde.Rd index d126989..ab0e0e5 100644 --- a/man/autoplot.kde.Rd +++ b/man/autoplot.kde.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/ggplot.R \name{autoplot.kde} \alias{autoplot.kde} -\title{Produce ggplot of densities in 1 or 2 dimensions} +\title{Produce ggplot of densities from kde objects in 1 or 2 dimensions} \usage{ \method{autoplot}{kde}( object, @@ -50,7 +50,7 @@ min(1, 1000/n), where n is the number of observations. Otherwise, set to 1.} A ggplot object. } \description{ -Produce ggplot of densities in 1 or 2 dimensions +Produce ggplot of densities from kde objects in 1 or 2 dimensions } \details{ This function produces a ggplot of the density estimate produced by \code{ks::kde()}. From 879f73786ae8c105e209585f0263135b1020da96 Mon Sep 17 00:00:00 2001 From: Rob Hyndman Date: Wed, 24 Jul 2024 16:49:01 +1000 Subject: [PATCH 10/15] Updated docs for dist_kde --- R/density.R | 5 +++-- man/dist_kde.Rd | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/R/density.R b/R/density.R index b6cefa3..f96a28a 100644 --- a/R/density.R +++ b/R/density.R @@ -1,8 +1,9 @@ #' Create kde distributional object #' #' Creates a distributional object using a kernel density estimate. The -#' `ks::kde()` function is used to compute the density. The cdf and quantiles -#' are consistent with the kde. +#' `ks::kde()` function is used to compute the density. The cdf, quantiles and +#' moments are consistent with the kde. Generating random values from the kde +#' is equivalent to a smoothed bootstrap. #' #' @param y Numerical vector or matrix of data, or a list of such objects. If a #' list is provided, then all objects should be of the same dimension. e.g., diff --git a/man/dist_kde.Rd b/man/dist_kde.Rd index cb6afbc..57b77fa 100644 --- a/man/dist_kde.Rd +++ b/man/dist_kde.Rd @@ -23,8 +23,9 @@ all vectors, or all matrices with the same number of columns.} } \description{ Creates a distributional object using a kernel density estimate. The -\code{ks::kde()} function is used to compute the density. The cdf and quantiles -are consistent with the kde. +\code{ks::kde()} function is used to compute the density. The cdf, quantiles and +moments are consistent with the kde. Generating random values from the kde +is equivalent to a smoothed bootstrap. } \examples{ dist_kde(c(rnorm(200), rnorm(100, 5)), method = "double") From d80073268cd4d9bcb09dee9f590a847261cccf93 Mon Sep 17 00:00:00 2001 From: Rob Hyndman Date: Wed, 24 Jul 2024 17:35:53 +1000 Subject: [PATCH 11/15] Fixed HDR calculation and display --- R/autoplot.R | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/R/autoplot.R b/R/autoplot.R index df6ee6a..65827af 100644 --- a/R/autoplot.R +++ b/R/autoplot.R @@ -123,6 +123,13 @@ autoplot.distribution <- function( if (show_points) { if(show_hdr) { # Only show points outside largest HDR + thresh <- tibble(object = object, Distribution = dist_names) |> + left_join(hdr |> filter(level == max(level)), by = "Distribution") |> + dplyr::rowwise() |> + mutate(fi = density(object, at = lower)) |> + select(Distribution, fi) + threshold <- as.list(thresh$fi) + names(threshold) <- thresh$Distribution fi <- purrr::map2(object, x, function(u,x) { if(is.null(x)) { return(NULL) @@ -130,18 +137,16 @@ autoplot.distribution <- function( density(u, at = x)[[1]] } }) - threshold <- lapply(fi, quantile, prob = 1 - max(prob), type = 8) idx <- purrr::map2(fi, threshold, function(f,t){which(f < t)}) x <- purrr::map2(x, idx, function(x,i) x[i]) } # Drop distributions with no data some_data <- names(x)[lengths(x) > 0] x <- x[some_data] - show_x <- tibble::as_tibble(x) |> - tidyr::pivot_longer( - cols = everything(), names_to = "Distribution", - values_to = "x" - ) + show_x <- tibble( + Distribution = rep(names(x), lengths(x)), + x = unlist(x) + ) if (no_groups) { a <- aes(x = x, y = -maxden * as.numeric(factor(Distribution)) / 40) } else { @@ -171,16 +176,16 @@ autoplot.distribution <- function( xmin = lower, xmax = upper, ymin = -maxden * as.numeric(factor(Distribution)) / 20, ymax = -maxden * (as.numeric(factor(Distribution))-1) / 20, - fill = factor(level, levels = sort(prob * 100)), - color = Distribution, + alpha = -level, + fill = Distribution, ) ) + - scale_fill_manual( - breaks = rev(prob * 100), - values = colors[-1], - labels = paste0(100 * rev(prob), "%") - ) + - ggplot2::guides(fill = ggplot2::guide_legend(title = "HDR coverage")) + scale_alpha( + name = "HDR coverage", + breaks = -100*prob, + labels = paste0(100 * prob, "%"), + range = c(0.2,1) + ) } if (show_mode) { modes <- df |> From b198226afc844148a52f54d20a3ed1c7acfe6cd5 Mon Sep 17 00:00:00 2001 From: Rob Hyndman Date: Thu, 25 Jul 2024 10:44:10 +1000 Subject: [PATCH 12/15] Addressing checks --- .Rbuildignore | 1 + DESCRIPTION | 4 ++-- NAMESPACE | 2 +- R/autoplot.R | 23 +++++++++++++---------- R/density.R | 14 ++++++++------ man/autoplot.distribution.Rd | 6 +++--- 6 files changed, 28 insertions(+), 22 deletions(-) diff --git a/.Rbuildignore b/.Rbuildignore index b06f0e3..316ea0f 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -8,3 +8,4 @@ _pkgdown.yml ^pkgdown$ ^CRAN-SUBMISSION$ +.vscode diff --git a/DESCRIPTION b/DESCRIPTION index 8b88e33..eff154d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -23,14 +23,14 @@ Imports: evd, ggplot2 (>= 3.1.1), grDevices, - interpolation, ks, purrr (>= 0.2.4), rlang, robustbase, rstudioapi (>= 0.7), stray, - tibble (>= 1.4.2) + tibble (>= 1.4.2), + vctrs License: GPL-3 Encoding: UTF-8 LazyData: true diff --git a/NAMESPACE b/NAMESPACE index 8ec67db..5bd918e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -14,9 +14,9 @@ S3method(distributional::skewness,dist_kde) S3method(format,dist_kde) S3method(ggplot2::autoplot,distribution) S3method(mean,dist_kde) -S3method(median,dist_kde) S3method(print,weird_conflicts) S3method(quantile,dist_kde) +S3method(stats::median,dist_kde) export(autoplot) export(chauvenet_anomalies) export(density_scores) diff --git a/R/autoplot.R b/R/autoplot.R index 65827af..e3b48ff 100644 --- a/R/autoplot.R +++ b/R/autoplot.R @@ -36,9 +36,9 @@ #' dist_kde(c(rnorm(500), rnorm(500, 4, 1.5))) |> #' autoplot(show_hdr = TRUE, prob = c(0.5, 0.95), color = "#c14b14") #' ymat <- tibble(y1 = rnorm(5000), y2 = y1 + rnorm(5000)) -#' ymat |> -#' dist_kde(ymat) |> -#' autoplot(show_points = TRUE, alpha = 0.1, fill = TRUE) +#' #ymat |> +#' # dist_kde(ymat) |> +#' # autoplot(show_points = TRUE, alpha = 0.1, fill = TRUE) #' @exportS3Method ggplot2::autoplot autoplot.distribution <- function( @@ -54,7 +54,7 @@ autoplot.distribution <- function( } else { colors <- palette(n = length(prob) + 1) } - dist <- family(object) + dist <- stats::family(object) no_groups <- length(dist) == 1L # Names of distributions dist_names <- format(object) @@ -108,7 +108,7 @@ autoplot.distribution <- function( if (show_hdr) { prob <- sort(unique(prob), decreasing = TRUE) hdr <- purrr::map_dfr(prob, function(u) { - hdri <- hdr(object, size = u * 100) + hdri <- distributional::hdr(object, size = u * 100) tibble( level = u*100, Distribution = dist_names, @@ -117,6 +117,7 @@ autoplot.distribution <- function( ) |> tidyr::unnest(c(lower, upper)) }) + hdr$id <- seq(NROW(hdr)) } # Show observations in bottom margin @@ -124,10 +125,10 @@ autoplot.distribution <- function( if(show_hdr) { # Only show points outside largest HDR thresh <- tibble(object = object, Distribution = dist_names) |> - left_join(hdr |> filter(level == max(level)), by = "Distribution") |> + dplyr::left_join(hdr |> filter(level == max(level)), by = "Distribution") |> dplyr::rowwise() |> - mutate(fi = density(object, at = lower)) |> - select(Distribution, fi) + dplyr::mutate(fi = density(object, at = lower)) |> + dplyr::select(Distribution, fi) threshold <- as.list(thresh$fi) names(threshold) <- thresh$Distribution fi <- purrr::map2(object, x, function(u,x) { @@ -171,7 +172,7 @@ autoplot.distribution <- function( if (show_hdr) { p <- p + ggplot2::geom_rect( - data = hdr |> mutate(id = row_number()), + data = hdr, aes( xmin = lower, xmax = upper, ymin = -maxden * as.numeric(factor(Distribution)) / 20, @@ -180,7 +181,7 @@ autoplot.distribution <- function( fill = Distribution, ) ) + - scale_alpha( + ggplot2::scale_alpha( name = "HDR coverage", breaks = -100*prob, labels = paste0(100 * prob, "%"), @@ -203,3 +204,5 @@ autoplot.distribution <- function( return(p) } + +utils::globalVariables(c("Density","Distribution","level")) diff --git a/R/density.R b/R/density.R index f96a28a..53cc662 100644 --- a/R/density.R +++ b/R/density.R @@ -22,6 +22,8 @@ dist_kde <- function(y, h = NULL, H = NULL, kde_options = NULL, ...) { if (!is.list(y)) { y <- list(y) + } else if(is.data.frame(y)) { + y <- list(as.matrix(y)) } d <- unlist(lapply(y, function(u) NCOL(u))) if (!all(d == d[1])) { @@ -194,7 +196,7 @@ quantile.dist_kde <- function (x, p, ..., na.rm = TRUE) { generate.dist_kde <- function(x, times, ...) { d <- NCOL(x$kde$x) if (d == 1) { - sample(x$kde$x, size = times, replace = TRUE) + rnorm(times, sd = x$kde$h) + sample(x$kde$x, size = times, replace = TRUE) + stats::rnorm(times, sd = x$kde$h) } else { stop("Not yet implemented") } @@ -209,12 +211,12 @@ mean.dist_kde <- function(x, ...) { } } -#' @export +#' @exportS3Method stats::median median.dist_kde <- function(x, na.rm = FALSE, ...) { if (is.matrix(x$kde$x)) { - apply(x$kde$x, 2, median, na.rm = na.rm, ...) + apply(x$kde$x, 2, stats::median, na.rm = na.rm, ...) } else { - median(x$kde$x, na.rm = na.rm, ...) + stats::median(x$kde$x, na.rm = na.rm, ...) } } @@ -234,7 +236,7 @@ skewness.dist_kde <- function(x, ..., na.rm = FALSE) { if (is.matrix(x$kde$x)) { stop("Multivariate skewness is not yet implemented.") } else { - mean((x$kde$x - mean(x$kde$x))^3) / variance(x)^1.5 + mean((x$kde$x - mean(x$kde$x))^3) / distributional::variance(x)^1.5 } } @@ -245,7 +247,7 @@ kurtosis.dist_kde <- function(x, ..., na.rm = FALSE) { stop("Multivariate kurtosis is not yet implemented.") } else { h <- x$kde$h - v <- variance(x) + v <- distributional::variance(x) (mean((x$kde$x - mean(x$kde$x))^4) + 6 * h^2 * v - 3 * h^4) / v^2 - 3 } } diff --git a/man/autoplot.distribution.Rd b/man/autoplot.distribution.Rd index b5e9339..289f442 100644 --- a/man/autoplot.distribution.Rd +++ b/man/autoplot.distribution.Rd @@ -70,9 +70,9 @@ For univariate densities, the combination of \code{show_hdr = TRUE}, \code{show dist_kde(c(rnorm(500), rnorm(500, 4, 1.5))) |> autoplot(show_hdr = TRUE, prob = c(0.5, 0.95), color = "#c14b14") ymat <- tibble(y1 = rnorm(5000), y2 = y1 + rnorm(5000)) -ymat |> - dist_kde(ymat) |> - autoplot(show_points = TRUE, alpha = 0.1, fill = TRUE) +#ymat |> +# dist_kde(ymat) |> +# autoplot(show_points = TRUE, alpha = 0.1, fill = TRUE) } \author{ Rob J Hyndman From ad2f1de717967c02d8b8531b3411b80a3d616d24 Mon Sep 17 00:00:00 2001 From: Rob Hyndman Date: Thu, 25 Jul 2024 10:46:08 +1000 Subject: [PATCH 13/15] Updated pkgdown --- _pkgdown.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/_pkgdown.yml b/_pkgdown.yml index be72f3f..af43c05 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -52,11 +52,12 @@ reference: - mvscale - title: Functions for working with kernel density estimates contents: - - as_kde - - autoplot.kde + - dist_kde - kde_bandwidth - title: Functions for graphics and HDR calculations contents: + - autoplot.distribution + - autoplot.kde - gg_bagplot - gg_hdrboxplot - hdr_palette From e9c99cc81619f63ebc020ecccd4b219d614d2c53 Mon Sep 17 00:00:00 2001 From: Rob Hyndman Date: Thu, 25 Jul 2024 10:51:29 +1000 Subject: [PATCH 14/15] Adding back some kde functions --- DESCRIPTION | 1 + NAMESPACE | 2 + R/kde.R | 128 ++++++++++++++++++++++++++++++++++++++++++++++++++ man/as_kde.Rd | 37 +++++++++++++++ 4 files changed, 168 insertions(+) create mode 100644 R/kde.R create mode 100644 man/as_kde.Rd diff --git a/DESCRIPTION b/DESCRIPTION index eff154d..f56043f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -23,6 +23,7 @@ Imports: evd, ggplot2 (>= 3.1.1), grDevices, + interpolation, ks, purrr (>= 0.2.4), rlang, diff --git a/NAMESPACE b/NAMESPACE index 5bd918e..595f96d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -14,9 +14,11 @@ S3method(distributional::skewness,dist_kde) S3method(format,dist_kde) S3method(ggplot2::autoplot,distribution) S3method(mean,dist_kde) +S3method(print,kde) S3method(print,weird_conflicts) S3method(quantile,dist_kde) S3method(stats::median,dist_kde) +export(as_kde) export(autoplot) export(chauvenet_anomalies) export(density_scores) diff --git a/R/kde.R b/R/kde.R new file mode 100644 index 0000000..8ffc8e2 --- /dev/null +++ b/R/kde.R @@ -0,0 +1,128 @@ +#' @export +print.kde <- function(x, ...) { + kde <- !(is.null(x$h) & is.null(x$H)) + if (inherits(x$eval.points, "list")) { + d <- length(x$eval.points) + } else { + d <- 1L + } + if(!kde) { + cat("Density of: [", + paste0(x$names, collapse = ", "), "]\n", sep = "") + } else { + cat("Kernel density estimate of: [", + paste0(x$names, collapse = ", "), "]\n", sep = "") + } + if(d == 1L){ + ngrid <- length(x$eval.points) + } else { + ngrid <- lapply(x$eval.points, length) + } + cat("Computed on a grid of size", paste(ngrid, collapse = " x "), "\n") + if(kde) { + cat("Bandwidth: ") + if (d == 1L) { + cat("h = ", format(x$h, digits = 4)) + } else { + cat("H = \n") + cat(format(x$H, digits = 4), quote=FALSE) + } + } + invisible(x) +} + +#' Convert data frame or matrix object to kde class +#' +#' A density specified as a data frame or matrix can be converted to a kde object. +#' This is useful for plotting the density using \code{\link{autoplot.kde}}. +#' As kde objects are defined on a grid, the density values are interpolated +#' based on the points in the data frame or matrix. +#' +#' @param object Data frame or matrix with numerical columns, where one column +#' (specified by `density_column`) contains the density values, and the +#' remaining columns define the points at which the density is evaluated. +#' @param density_column Name of the column containing the density values, specified +#' as a bare expression. If missing, the last column is used. +#' @param ngrid Number of points to use for the grid in each dimension. Default is +#' 10001 for univariate densities and 101 for multivariate densities. +#' @param ... Additional arguments are ignored. +#' @return An object of class "kde" +#' @author Rob J Hyndman +#' @examples +#' tibble(y = seq(-4, 4, by = 0.01), density = dnorm(y)) |> +#' as_kde() +#' @export + +as_kde <- function(object, density_column, ngrid, ...) { + # Check columns of object are all numerical + object <- as.data.frame(object) + if(!all(sapply(object, is.numeric))) { + stop("All columns of object must be numeric") + } + # Check density_column is in object + if(missing(density_column)) { + density_column <- tail(colnames(object), 1) + } else { + density_column <- dplyr::as_label(dplyr::enquo(density_column)) + } + if(!(density_column %in% colnames(object))) { + stop(paste(density_column, "not found")) + } + # Separate points from density values + den <- object[[density_column]] + object[[density_column]] <- NULL + # Find the dimension + d <- NCOL(object) + if (d == 1L) { + if(missing(ngrid)) { + ngrid <- 10001 + } + # Interpolate density on finer grid + density <- list(eval.points = seq(min(object), max(object), length = ngrid)) + density$estimate <- approx(object[[1]], den, xout = density$eval.points)$y + } else if (d == 2L) { + if(missing(ngrid)) { + ngrid <- 101 + } + density <- density_on_grid(as.matrix(object), den, ngrid) + } else { + stop("Only univariate and bivariate densities are supported") + } + # Find falpha using quantile method + missing <- is.na(density$estimate) + samplex <- sample(density$estimate[!missing], + size = 50000, replace = TRUE, + prob = density$estimate[!missing] + ) + density$cont <- quantile(samplex, prob = (99:1) / 100, type = 8) + # Set missing values to 0 + density$estimate[is.na(density$estimate)] <- 0 + # Add names + density$names <- colnames(object) + if(is.null(density$names)) { + if(d == 1) { + density$names <- "y" + } else { + density$names <- paste0("y", seq_len(d)) + } + } + structure(density, class = "kde") +} + +density_on_grid <- function(y, fy, ngrid) { + y <- as.matrix(y) + if (NCOL(y) != 2L) { + stop("y must be a matrix with 2 columns") + } + # Create grid of points + density <- list(eval.points = list( + seq(min(y[, 1]), max(y[, 1]), length = ngrid), + seq(min(y[, 2]), max(y[, 2]), length = ngrid) + )) + # Bivariate interpolation + grid <- expand.grid(density$eval.points[[1]], density$eval.points[[2]]) + ifun <- interpolation::interpfun(x = y[, 1], y = y[, 2], z = fy) + density$estimate <- ifun(grid[,1], grid[,2]) |> + matrix(nrow = ngrid) + return(density) +} diff --git a/man/as_kde.Rd b/man/as_kde.Rd new file mode 100644 index 0000000..14fc7ad --- /dev/null +++ b/man/as_kde.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/kde.R +\name{as_kde} +\alias{as_kde} +\title{Convert data frame or matrix object to kde class} +\usage{ +as_kde(object, density_column, ngrid, ...) +} +\arguments{ +\item{object}{Data frame or matrix with numerical columns, where one column +(specified by \code{density_column}) contains the density values, and the +remaining columns define the points at which the density is evaluated.} + +\item{density_column}{Name of the column containing the density values, specified +as a bare expression. If missing, the last column is used.} + +\item{ngrid}{Number of points to use for the grid in each dimension. Default is +10001 for univariate densities and 101 for multivariate densities.} + +\item{...}{Additional arguments are ignored.} +} +\value{ +An object of class "kde" +} +\description{ +A density specified as a data frame or matrix can be converted to a kde object. +This is useful for plotting the density using \code{\link{autoplot.kde}}. +As kde objects are defined on a grid, the density values are interpolated +based on the points in the data frame or matrix. +} +\examples{ +tibble(y = seq(-4, 4, by = 0.01), density = dnorm(y)) |> + as_kde() +} +\author{ +Rob J Hyndman +} From 2e86c12fd8a23b3f9e0b7c1ae27df572ef8f7a74 Mon Sep 17 00:00:00 2001 From: Rob Hyndman Date: Thu, 25 Jul 2024 10:59:46 +1000 Subject: [PATCH 15/15] Fixed pkgdown --- _pkgdown.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/_pkgdown.yml b/_pkgdown.yml index af43c05..123ba0a 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -52,6 +52,7 @@ reference: - mvscale - title: Functions for working with kernel density estimates contents: + - as_kde - dist_kde - kde_bandwidth - title: Functions for graphics and HDR calculations