Skip to content

Commit

Permalink
Single phase + outlier clusterer support and other additions (#14)
Browse files Browse the repository at this point in the history
* First additions

* Adding documentation for the single-pass and outlier capabilities.

* Final documentation changed for version 1.4

* Updated documentation about outliers. Fixed DSD_Gaussians random cluster centroid generation.

* Small addition to the DSD definition

* Fixed an issue with Mahalanobis distance calculation and covariance matrix random generation, which lead to the separation issues.

* Removed Matrix dependency, fixed documentation issues.

* Speed up the separation calculation for Mahalanobis

* Updated DSD_Gaussians interface
  • Loading branch information
dkrleza authored Dec 1, 2020
1 parent 5321443 commit f42a2d3
Show file tree
Hide file tree
Showing 46 changed files with 2,067 additions and 727 deletions.
2 changes: 2 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,5 @@ proj$
^.*\.Rproj$
^LICENSE
Work
^doc$
^Meta$
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,8 @@ vignettes/*.pdf
*.o
*.so
*.Rproj
*.RData

Work
doc
Meta
8 changes: 5 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
Package: stream
Version: 1.3-2
Date: 2020-05-04
Version: 1.4-0
Date: 2020-10-20
Encoding: UTF-8
Title: Infrastructure for Data Stream Mining
Authors@R: c(person("Michael", "Hahsler", role = c("aut", "cre", "cph"),
email = "mhahsler@lyle.smu.edu"),
person("Matthew", "Bolanos", role = c("aut", "cph")),
person("John", "Forrest", role = c("ctb")),
person("Matthias", "Carnein", role = c("ctb")),
person("Dennis", "Assenmacher", role = c("ctb"))
person("Dennis", "Assenmacher", role = c("ctb")),
person("Dalibor", "Krleža", role = c("ctb"))
)
Description: A framework for data stream modeling and associated data mining tasks such as clustering and classification. The development of this package was supported in part by NSF IIS-0948893 and NIH R21HG005912. Hahsler et al (2017) <doi:10.18637/jss.v076.i14>.
Depends: R (>= 2.13.0), methods, proxy (>= 0.4-7)
Expand Down
24 changes: 23 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import(proxy)

importFrom(methods, "is", "new")
importFrom(utils, "head", "read.table", "write.table")
importFrom(stats, "runif", "complete.cases", "rnorm", "prcomp", "na.omit")
importFrom(stats, "runif", "complete.cases", "rnorm", "prcomp", "na.omit", "mahalanobis")
importFrom(graphics, "plot", "par", "layout", "title", "pairs", "points",
"lines", "image")
importFrom(grDevices, "gray", "gray.colors", "rgb", "col2rgb")
Expand All @@ -30,6 +30,8 @@ export(
DSC_R,
DSC_Micro,
DSC_Macro,
DSC_Outlier,
DSC_SinglePass,

DSC_BIRCH,
DSC_DStream,
Expand All @@ -49,6 +51,9 @@ export(

DSC_Static,

EvalCallback,
DefaultEvalCallback,

# DSD
DSD,
DSD_R,
Expand Down Expand Up @@ -105,13 +110,17 @@ export(
get_weights,
get_assignment,
get_copy,
clean_outliers,
nclusters,
# merge_DSC,
prune_clusters,
microToMacro,

evaluate,
evaluate_callback,
evaluate_with_callbacks,
evaluate_cluster,
evaluate_cluster_with_callbacks,

animate_cluster,
animate_data,
Expand All @@ -129,6 +138,9 @@ export(
get_microweights,
get_macroclusters,
get_macroweights,
get_outlier_positions,
recheck_outlier,
noutliers,

saveDSC,
readDSC,
Expand Down Expand Up @@ -161,6 +173,7 @@ S3method(reset_stream, DSD)
S3method(reset_stream, DSD_Memory)
S3method(reset_stream, DSD_ReadCSV)
S3method(reset_stream, DSD_ScaleStream)
S3method(reset_stream, DSD_Gaussians)

### DSO Methods
S3method(print, DSO)
Expand All @@ -184,6 +197,7 @@ S3method(get_assignment, DSC)
S3method(get_assignment, DSC_DBSTREAM)
S3method(get_assignment, DSC_DStream)
S3method(get_assignment, DSC_TwoStage)
S3method(get_assignment, DSC_SinglePass)

S3method(get_centers, DSC_Macro)
S3method(get_centers, DSC_Micro)
Expand All @@ -200,6 +214,14 @@ S3method(microToMacro, DSC_DBSTREAM)
S3method(microToMacro, DSC_DStream)
S3method(microToMacro, DSC_TwoStage)

S3method(clean_outliers, DSC_Outlier)
S3method(get_outlier_positions, DSC_Outlier)
S3method(recheck_outlier, DSC_Outlier)
S3method(noutliers, DSC_Outlier)
S3method(print, DSC_Outlier)
S3method(get_outlier_positions, DSC_TwoStage)
S3method(recheck_outlier, DSC_TwoStage)

S3method(get_copy, DSC_R)

S3method(print, stream_eval)
Expand Down
9 changes: 9 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
# stream 1.4-0 (10/20/20)

## New Features
* Added additional features to the DSD_Gaussians, now capable of using Mahalanobis distance and
generating outliers.
* Updated evaluation procedure, now capable of performing external indices calculation using callbacks
* Added support for single-pass clusterers and outlier detectors
* Added outlier correctness assessment indices

# stream 1.3-2 (05/04/20)

## Bug Fixes
Expand Down
157 changes: 128 additions & 29 deletions R/DSC.R
Original file line number Diff line number Diff line change
Expand Up @@ -102,76 +102,91 @@ summary.DSC <- function(object, ...) print(object)

#plot.DSC will call super question.
plot.DSC <- function(x, dsd = NULL, n = 500,
col_points=NULL,
col_clusters=c("red", "blue"),
weights=TRUE,
scale=c(1,5),
cex=1,
pch=NULL,
method="pairs", dim=NULL,
type=c("auto", "micro", "macro", "both"),
assignment = FALSE, ### assignment is not implemented
...) {
col_points=NULL,
col_clusters=c("red", "blue", "green"),
weights=TRUE,
scale=c(1,5),
cex=1,
pch=NULL,
method="pairs", dim=NULL,
type=c("auto", "micro", "macro", "both", "all", "outliers"), # we keep 'both' for compatibility reasons
assignment = FALSE, ### assignment is not implemented
...) {

type <- match.arg(type)
if(type == "outliers" && !is(x, "DSC_Outlier"))
stop("The clusterer is not an outlier detector, cannot draw outliers")

if(is.null(col_points)) col_points <- .points_col

if(type !="both") {
if(type !="both" && type != "all") {
if(type =="auto") type <- get_type(x)
## method can be pairs, scatter or pc (projection with PCA)
centers <- get_centers(x, type=type)
if(type != "outliers") {
centers <- get_centers(x, type=type)
} else {
centers <- get_outlier_positions(x)
}
k <- nrow(centers)

if(k<1) {
warning("No clusters to plot!")
warning("No clusters or outliers to plot!")
plot(NA, NA, xlim=c(0,0), ylim=c(0,0))
return()
}

if(weights) cex_clusters <- get_weights(x, type=type, scale=scale)
else cex_clusters <- rep(1, k)
if(type != "outliers") {
if(weights) cex_clusters <- get_weights(x, type=type, scale=scale)
else cex_clusters <- rep(1, k)
} else cex_clusters <- rep(2, k)

if(type=="micro") {
col <- rep(col_clusters[1], k)
mpch <- rep(1, k)
lwd <- rep(1, k)
}else{
} else if(type=="macro") {
cex_clusters <- cex_clusters*1.5
col <- rep(col_clusters[2], k)
mpch <- rep(3, k)
lwd <- rep(2, k)
} else {
col <- rep(col_clusters[3], k)
mpch <- rep(1, k)
lwd <- rep(1, k)
}

}else{ ### both
} else { ### both
centers_mi <- get_centers(x, type="micro")
centers_ma <- get_centers(x, type="macro")
centers_out <- data.frame()
if(type=="all" && is(x, "DSC_Outlier")) centers_out <- get_outlier_positions(x)
k_mi <- nrow(centers_mi)
k_ma <- nrow(centers_ma)
k_out <- nrow(centers_out)

if(k_mi<1) {
warning("No clusters to plot!")
if((k_mi+k_out)<1) {
warning("No clusters or outliers to plot!")
plot(NA, NA, xlim=c(0,0), ylim=c(0,0))
return()
}

### Fix names if necessary
colnames(centers_mi) <- colnames(centers_ma)
if(nrow(centers_out)>0) colnames(centers_out) <- colnames(centers_ma)

centers <- rbind(centers_mi, centers_ma)
centers <- rbind(centers_mi, centers_ma, centers_out)

if(weights) cex_clusters <- c(get_weights(x, type="micro", scale=scale),
get_weights(x, type="macro", scale=scale*1.5))
else cex_clusters <- c(rep(cex, k_mi), rep(cex*2,+k_ma))
get_weights(x, type="macro", scale=scale*1.5), rep(2, k_out))
else cex_clusters <- c(rep(cex, k_mi), rep(cex*2, k_ma), rep(2, k_out))

col <- c(rep(col_clusters[1], k_mi), rep(col_clusters[2], k_ma))
mpch <- c(rep(1, k_mi), rep(3, k_ma))
lwd <- c(rep(1, k_mi), rep(2, k_ma))
col <- c(rep(col_clusters[1], k_mi), rep(col_clusters[2], k_ma), rep(col_clusters[3], k_out))
mpch <- c(rep(1, k_mi), rep(3, k_ma), rep(1, k_out))
lwd <- c(rep(1, k_mi), rep(2, k_ma), rep(1, k_out))
}

### prepend data if given
if(!is.null(dsd)) {
d <- get_points(dsd, n, cluster = TRUE)
d <- get_points(dsd, n, cluster = TRUE, outlier=TRUE)
# names(d) <- names(centers)
# fix center names
colnames(centers) <- colnames(d)
Expand All @@ -180,6 +195,7 @@ plot.DSC <- function(x, dsd = NULL, n = 500,
col <- c(rep(col_points,n)[1:n], col)
cex_clusters <- c(rep(cex, n), cex_clusters)
mpch <- c(attr(d, "cluster"), mpch)
mpch <- mpch %% 25
lwd <- c(rep(1,n), lwd)

### handle noise
Expand All @@ -204,13 +220,96 @@ plot.DSC <- function(x, dsd = NULL, n = 500,
plot(p$x, col=col, cex=cex_clusters, pch=mpch, lwd=lwd, ...)
}else if(ncol(centers) == 1){
plot(centers[[1]], rep(0, length(centers[[1]])),
col=col, cex=cex_clusters, pch=mpch, lwd=lwd,
ylab = "", xlab = colnames(centers)[1], ...)
col=col, cex=cex_clusters, pch=mpch, lwd=lwd,
ylab = "", xlab = colnames(centers)[1], ...)
}else { ## plot first 2 dimensions
if(ncol(centers)>2) centers <- centers[,1:2]
plot(centers, col=col, cex=cex_clusters, pch=mpch, lwd=lwd, ...)
}

}

DSC_Outlier <- function(...) stop("DSC_Outlier is an abstract class and cannot be instantiated!")
clean_outliers <- function(x, ...)
UseMethod("clean_outliers")
clean_outliers.default <- function(x, ...) {
stop(gettextf("clean_outliers not implemented for class '%s'.", paste(class(x), collapse=", ")))
}
clean_outliers.DSC_Outlier <- function(x, ...) {
stop(gettextf("No clean outliers available for class '%s'.", paste(class(x), collapse=", ")))
}
get_outlier_positions <- function(x, ...)
UseMethod("get_outlier_positions")
get_outlier_positions.default <- function(x, ...) {
stop(gettextf("get_outlier_positions not implemented for class '%s'.", paste(class(x), collapse=", ")))
}
get_outlier_positions.DSC_Outlier <- function(x, ...) {
stop(gettextf("No outlier getter method available for class '%s'.", paste(class(x), collapse=", ")))
}
recheck_outlier <- function(x, outlier_correlated_id, ...)
UseMethod("recheck_outlier")
recheck_outlier.default <- function(x, outlier_correlated_id, ...) {
stop(gettextf("recheck_outlier not implemented for class '%s'.", paste(class(x), collapse=", ")))
}
recheck_outlier.DSC_Outlier <- function(x, outlier_correlated_id, ...) {
stop(gettextf("No outlier rechecking method available for class '%s'.", paste(class(x), collapse=", ")))
}
noutliers <- function(x, ...) UseMethod("noutliers")
noutliers.default <- function(x, ...) {
stop(gettextf("noutliers not implemented for class '%s'.", paste(class(x), collapse=", ")))
}
noutliers.DSC_Outlier <- function(x, ...) {
nrow(get_outlier_positions(x))
}
print.DSC_Outlier <- function(x, ...) {
cat(.line_break(paste(x$description)))
cat("Class:", paste(class(x), collapse=", "), "\n")
if(!is(nc <- try(nclusters(x, type="micro"), silent=TRUE), "try-error"))
cat(paste('Number of micro-clusters:', nc, '\n'))
if(!is(nc <- try(nclusters(x, type="macro"), silent=TRUE), "try-error"))
cat(paste('Number of macro-clusters:', nc, '\n'))
if(!is(no <- try(noutliers(x), silent=TRUE), "try-error"))
cat(paste('Number of outliers:', no, '\n'))
}
get_assignment.DSC_Outlier <- function(x, points, type=c("auto", "micro", "macro"),
method=c("auto", "nn", "model"), outlier_threshold=0.05, ...) {

method <- match.arg(method)
if(method=="auto") method <- "nn"

if(method=="model") {
warning("method model not implemented! using Euclidean nearest neighbor instead!")
method <- "nn"
}

c1 <- get_centers(x, type=type, ...)
c2 <- get_outlier_positions(x, ...)
c <- rbind(c1, c2)

if(nrow(c)>0L) {
dist <- dist(points, c, method="Euclidean")
# Find the minimum distance and save the class
predict <- apply(dist, 1L, which.min)
outliers <- predict > nrow(c1)
outliers[apply(dist, 1L, min) > outlier_threshold] <- FALSE

to_noise <- predict > nrow(c1)
to_noise[apply(dist, 1L, min) <= outlier_threshold] <- FALSE
predict[to_noise] <- NA_integer_
} else {
warning("There are no clusters and outliers!")
predict <- rep(NA_integer_, nrow(points))
outliers <- rep(FALSE, nrow(points))
}

attr(predict, "method") <- method
attr(predict, "outliers") <- outliers

predict
}

DSC_SinglePass <- function(...) stop("DSC_SinglePass is an abstract class and cannot be instantiated!")
get_assignment.DSC_SinglePass <- function(dsc, points, type=c("auto", "micro", "macro"),
method=c("auto", "nn", "model"), ...) {
stop(gettextf("No assignments and update available for class '%s'.", paste(class(dsc), collapse=", ")))
}
2 changes: 1 addition & 1 deletion R/DSC_DStream.R
Original file line number Diff line number Diff line change
Expand Up @@ -422,7 +422,7 @@ plot.DSC_DStream <- function(x, dsd=NULL, n=500,

### get varnames from data stream
if(!is.null(dsd)) {
ps <- get_points(dsd, n=n, cluster=TRUE)
ps <- get_points(dsd, n=n, cluster=TRUE, outlier=TRUE)
varnames <- colnames(ps)
}

Expand Down
Loading

0 comments on commit f42a2d3

Please sign in to comment.