Skip to content

Commit

Permalink
Merge pull request #22 from poissonconsulting/preserve
Browse files Browse the repository at this point in the history
`duplicates()` now preserves groups and attributes for "regular" and "sf" data frames
  • Loading branch information
aylapear authored Sep 13, 2024
2 parents d543f7f + b46dbe2 commit 6434791
Show file tree
Hide file tree
Showing 4 changed files with 128 additions and 17 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: tidyplus
Title: Additional 'tidyverse' Functions
Version: 0.1.0
Version: 0.1.0.9000
Authors@R: c(
person("Joe", "Thorley", , "joe@poissonconsulting.ca", role = "aut",
comment = c(ORCID = "0000-0002-7683-4592")),
Expand Down
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
<!-- NEWS.md is maintained by https://fledge.cynkra.com, contributors should not edit this file -->

# tidyplus 0.1.0.9000

- `duplicates()` now preserves groups and attributes for "regular" and "sf" data frames.
- Added tests for `duplicates()`.

# tidyplus 0.1.0

- Added the `str_replace_vec()` function.
Expand Down
24 changes: 21 additions & 3 deletions R/duplicates.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ duplicates <- function(.data, ..., .keep_all = TRUE) {
if (length(col) == 0) {
col_names <- colnames(.data)
} else {
col_names <- vapply(col, rlang::as_string, character(1))
col_names <- vapply(col, rlang::as_string, character(1))
}
col_names <- unique(col_names)
chk_vector(col_names)
Expand All @@ -34,13 +34,31 @@ duplicates <- function(.data, ..., .keep_all = TRUE) {
if (!length(col_names)) {
return(.data)
}

grouped <- dplyr::is_grouped_df(.data)
groups <- dplyr::group_vars(.data)
groups_sym <- rlang::syms(groups)

is_sf <- any(class(.data) == "sf")
if (is_sf) {
col_name_sf <- attributes(.data)$sf_column
}

.data <- tibble::as_tibble(.data)

.data_dup <- dplyr::select(.data, dplyr::all_of(col_names))
.data_dup <- .data_dup[duplicated(.data_dup), , drop = FALSE]
.data_dup <- unique(.data_dup)
.data <- dplyr::inner_join(.data, .data_dup, by = col_names)

.data <- dplyr::inner_join(.data, .data_dup, by = col_names)
if (!(.keep_all)) {
.data <- dplyr::select(.data, dplyr::all_of(col_names))
}
.data <- dplyr::as_tibble(.data)
if (grouped) {
.data <- dplyr::group_by(.data, !!!groups_sym)
}
if (is_sf) {
.data <- sf::st_as_sf(.data, sf_column_name = col_name_sf)
}
.data
}
114 changes: 101 additions & 13 deletions tests/testthat/test-duplicates.R
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
test_that("returns only duplicated rows of selected columns", {
tib <- dplyr::tibble(x = c(1, 2, 1), y = c(1, 1, 1))
tib <- tibble::tibble(x = c(1, 2, 1), y = c(1, 1, 1))
expect_identical(duplicates(tib), tib[c(1, 3), ])
expect_identical(duplicates(as.data.frame(tib)), tib[c(1, 3), ])

expect_identical(
duplicates(data.frame(x = c(1, 2, 1), y = 1:3), x),
dplyr::tibble(x = c(1, 1), y = c(1L, 3L))
)

expect_identical(
duplicates(data.frame(x = c(1, 2, 1), y = 1:3), x, y),
dplyr::tibble(x = double(0), y = integer(0))
Expand Down Expand Up @@ -40,32 +38,27 @@ test_that("keep_all working", {


test_that("handles data frame with no rows", {
data <- dplyr::tibble(x = integer(), y = integer())

data <- tibble::tibble(x = integer(), y = integer())
expect_equal(
duplicates(data),
data
)

expect_equal(
duplicates(data, x),
data
)

expect_equal(
duplicates(data, x, .keep_all = FALSE),
dplyr::tibble(x = integer())
)
})

test_that("handles data frame with no columns", {
data <- dplyr::tibble()

data <- tibble::tibble()
expect_identical(
duplicates(data),
data
)

expect_identical(
duplicates(data, .keep_all = FALSE),
data
Expand All @@ -74,17 +67,14 @@ test_that("handles data frame with no columns", {

test_that("handles columns with missing values", {
data <- tibble::tibble(x = c(1, 2, NA, 1, 1), y = c(1, 1, NA, NA, NA))

expect_identical(
duplicates(data),
tibble::tibble(x = c(1, 1), y = as.double(c(NA, NA)))
)

expect_identical(
duplicates(data, y),
data
)

expect_identical(
duplicates(data, y, .keep_all = FALSE),
tibble::tibble(y = c(1, 1, NA, NA, NA))
Expand All @@ -106,3 +96,101 @@ test_that("errors when input argument is not a data.frame", {
expect_error(duplicates(NULL), "Data.frame must be a data.frame.")
expect_error(duplicates(NA), "Data.frame must be a data.frame.")
})

test_that("preserves groups", {
data <- tibble::tibble(
X = c(1, 2, 2, 3, 3, 4, 4),
Y = c(11, 12, 13, 14, 14, 15, 15),
a = c("red", "orange", "yellow", "green", "green", "blue", "blue"),
b = c("white", "white", "white", "white", "white", "white", "white")
)
data <- dplyr::group_by(data, a, b)
data <- duplicates(data)

data_dup <- tibble::tibble(
X = c(3, 3, 4, 4),
Y = c(14, 14, 15, 15),
a = c("green", "green", "blue", "blue"),
b = c("white", "white", "white", "white")
)
data_dup <- dplyr::group_by(data_dup, a, b)

expect_identical(data_dup, data)
})

test_that("preserves single active geometry column called geometry", {
skip_if_not_installed("sf")

data <- tibble::tibble(
X = c(1, 2, 2, 3, 3, 4, 4),
Y = c(11, 12, 13, 14, 14, 15, 15),
a = c("red", "orange", "yellow", "green", "green", "blue", "blue"),
b = c("white", "white", "white", "white", "white", "white", "white")
)
data <- sf::st_as_sf(data, coords = c("X", "Y"))
data <- duplicates(data)

data_dup <- tibble::tibble(
X = c(3, 3, 4, 4),
Y = c(14, 14, 15, 15),
a = c("green", "green", "blue", "blue"),
b = c("white", "white", "white", "white")
)
data_dup <- sf::st_as_sf(data_dup, coords = c("X", "Y"))

expect_identical(data_dup, data)
})

test_that("preserves single active geometry column called map", {
skip_if_not_installed("sf")

data <- tibble::tibble(
X = c(1, 2, 2, 3, 3, 4, 4),
Y = c(11, 12, 13, 14, 14, 15, 15),
a = c("red", "orange", "yellow", "green", "green", "blue", "blue"),
b = c("white", "white", "white", "white", "white", "white", "white")
)
data <- sf::st_as_sf(data, coords = c("X", "Y"), sf_column_name = "map")
data <- duplicates(data)

data_dup <- tibble::tibble(
X = c(3, 3, 4, 4),
Y = c(14, 14, 15, 15),
a = c("green", "green", "blue", "blue"),
b = c("white", "white", "white", "white")
)
data_dup <- sf::st_as_sf(data_dup, coords = c("X", "Y"), sf_column_name = "map")

expect_identical(data_dup, data)
})

test_that("deals with one active geometry column and one inactive geometry column", {
skip_if_not_installed("sf")

data <- tibble::tibble(
X = c(1, 2, 2, 3, 3, 4, 4),
Y = c(11, 12, 13, 14, 14, 15, 15),
a = c("red", "orange", "yellow", "green", "green", "blue", "blue"),
b = c("white", "white", "white", "white", "white", "white", "white"),
I = c(101, 102, 102, 103, 103, 104, 104),
J = c(1001, 1002, 1003, 1004, 1004, 1005, 1005)
)
data <- sf::st_as_sf(data, coords = c("X", "Y"))
data <- tibble::as_tibble(data)
data <- sf::st_as_sf(data, coords = c("I", "J"), sf_column_name = "map")
data <- duplicates(data)

data_dup <- tibble::tibble(
X = c(3, 3, 4, 4),
Y = c(14, 14, 15, 15),
a = c("green", "green", "blue", "blue"),
b = c("white", "white", "white", "white"),
I = c(103, 103, 104, 104),
J = c(1004, 1004, 1005, 1005)
)
data_dup <- sf::st_as_sf(data_dup, coords = c("X", "Y"))
data_dup <- tibble::as_tibble(data_dup)
data_dup <- sf::st_as_sf(data_dup, coords = c("I", "J"), sf_column_name = "map")

expect_identical(data_dup, data)
})

0 comments on commit 6434791

Please sign in to comment.