Skip to content

Commit

Permalink
added pins: changelog, sources, hierarchy, raw
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewallenbruce committed Nov 29, 2024
1 parent 683c103 commit 1f01bee
Show file tree
Hide file tree
Showing 16 changed files with 383 additions and 79 deletions.
5 changes: 4 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,13 @@ URL: https://github.com/andrewallenbruce/arktax,
https://andrewallenbruce.github.io/arktax/
BugReports: https://github.com/andrewallenbruce/arktax/issues
Imports:
cli,
fs,
fuimus,
glue,
pins
pins,
rlang,
stringfish
Suggests:
qs,
roxyglobals,
Expand Down
5 changes: 4 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@ export(get_pin)
export(gh_raw)
export(list_pins)
export(mount_board)
export(retrieve_ark)
export(retrieve_changelog)
export(retrieve_hierarchy)
export(retrieve_raw)
export(retrieve_sources)
importFrom(fs,path_package)
importFrom(fuimus,search_in_if)
importFrom(glue,glue)
Expand Down
2 changes: 1 addition & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# arktax (development version)
# arktax 0.1.0.9000 (2024-11-16)

* Initialize package
131 changes: 109 additions & 22 deletions R/arktax.R
Original file line number Diff line number Diff line change
@@ -1,42 +1,129 @@
#' Get Taxomony Source File
#' Retrieve Taxonomy Source File
#'
#' @param year `<int>` year of source file release; options are `2009:2024`
#'
#' @param version `<int>` version of source file; options are `0` or `1`
#'
#' @param code `<chr>` Health Care Provider Taxonomy code, a unique alphanumeric code, ten characters in length
#'
#' @param which `<chr>` `wide` or `long` version of the taxonomy
#' @param code `<chr>` Health Care Provider Taxonomy code, a unique
#' alphanumeric code, ten characters in length
#'
#' @returns `<tibble>` of search results
#'
#' @examples
#' retrieve_ark(year = 2024, code = "101Y00000X", which = "wide")
#' retrieve_raw(year = 2024, code = "101Y00000X")
#'
#' retrieve_ark(code = "101Y00000X", which = "long")
#' retrieve_raw(code = "101Y00000X")
#'
#' @importFrom fuimus search_in_if
#'
#' @autoglobal
#'
#' @export
retrieve_ark <- function(year = NULL,
retrieve_raw <- function(year = NULL,
version = NULL,
code = NULL,
which = c("wide", "long")) {

ark <- switch(
which,
wide = get_pin("ark_taxonomy"),
long = get_pin("ark_long")
)

if (which == "wide") {
ark <- search_in_if(ark, ark[["year"]], year)
ark <- search_in_if(ark, ark[["version"]], version)
}
code = NULL) {
check_nchar(code, 10)
pin <- get_pin("ark_taxonomy")
pin <- search_in_if(pin, pin[["year"]], year)
pin <- search_in_if(pin, pin[["version"]], version)
pin <- search_in_if(pin, pin[["code"]], code)
return(pin)
}

#' Retrieve Taxonomy Sources
#'
#' @param code `<chr>` Health Care Provider Taxonomy code, a unique
#' alphanumeric code, ten characters in length
#'
#' @returns `<tibble>` of search results
#'
#' @examples
#' retrieve_sources(code = "101Y00000X")
#'
#' retrieve_sources(code = "103TA0400X")
#'
#' @importFrom fuimus search_in_if
#'
#' @autoglobal
#'
#' @export
retrieve_sources <- function(code = NULL) {
check_nchar(code, 10)
pin <- get_pin("sources")
search_in_if(pin, pin[["code"]], code)
}

#' Retrieve Taxonomy Changelog
#'
#' @param code `<chr>` Health Care Provider Taxonomy code, a unique
#' alphanumeric code, ten characters in length
#'
#' @returns `<tibble>` of search results
#'
#' @examples
#' retrieve_changelog(code = "103GC0700X")
#'
#' retrieve_changelog(code = "103G00000X")
#'
#' @importFrom fuimus search_in_if
#'
#' @autoglobal
#'
#' @export
retrieve_changelog <- function(code = NULL) {
check_nchar(code, 10)
pin <- get_pin("changelog")
search_in_if(pin, pin[["code"]], code)
}

#' Retrieve Taxonomy Hierarchy
#'
#' @param code `<chr>` Health Care Provider Taxonomy code, a unique
#' alphanumeric code, ten characters in length
#'
#' @returns `<tibble>` of search results
#'
#' @examples
#' retrieve_hierarchy(code = "101Y00000X")
#'
#' retrieve_hierarchy(code = "103TA0400X")
#'
#' @importFrom fuimus search_in_if
#'
#' @autoglobal
#'
#' @export
retrieve_hierarchy <- function(code = NULL) {
check_nchar(code, 10)
pin <- get_pin("ark_long")
search_in_if(pin, pin[["code"]], code)
}


#' Check that input is `n` character(s) long
#'
#' @param x `<chr>` string
#'
#' @param n `<int>` number of characters
#'
#' @autoglobal
#'
#' @noRd
check_nchar <- function(x, n) {

if (!is.null(x)) {

ark <- search_in_if(ark, ark[["code"]], code)
stopifnot(rlang::is_integerish(n), n > 0)

return(ark)
arg <- rlang::caller_arg(x)
call <- rlang::caller_env()

if (any(stringfish::sf_nchar(x) != n, na.rm = TRUE)) {
cli::cli_abort(
"{.arg {arg}} must be {.val {n}} character{?s} long.",
arg = arg,
call = call)
}
stringfish::sf_toupper(x)
}
}
6 changes: 6 additions & 0 deletions data-raw/nucc_pins.R
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,12 @@ pin_update(
description = "Health Care Provider Taxonomy Code Set Archive 2009-2024 (Long)"
)

get_pin("ark_long") |>
dplyr::count(code, level, sort = TRUE) |>
dplyr::filter(n > 1) |>
dplyr::pull(code)


ark_long |>
dplyr::group_by(code) |>
dplyr::mutate(years = ivs::iv(min(year), max(year))) |>
Expand Down
155 changes: 135 additions & 20 deletions data-raw/splitting.R
Original file line number Diff line number Diff line change
@@ -1,21 +1,87 @@
source(here::here("data-raw", "pins_internal.R"))

wide <- arktax::retrieve_ark(year = 2024, which = "wide") |>
dplyr::select(-modified, -effective, -deactivated)
# Use wide version
wide <- arktax::retrieve_ark(
year = 2024,
version = 1,
which = "wide") |>
dplyr::select(-year, -version) |>
fuimus::remove_quiet()

#------------ BASE
base <- wide |>
dplyr::select(code, notes) |>
dplyr::distinct(code, notes, .keep_all = TRUE) |>
dplyr::distinct(
code,
notes,
.keep_all = TRUE) |>
dplyr::filter(codex::not_na(notes)) |>
dplyr::mutate(
.id = dplyr::row_number(),
updates = stringr::str_extract_all(notes, "\\[.*\\]"),
sources = stringr::str_remove_all(notes, "\\s?\\[.*\\]") |> dplyr::na_if(""),
notes = NULL) |>
tidyr::unnest(updates, keep_empty = TRUE)

#--------- SOURCES
sources <- base |>
dplyr::select(-updates, -.id) |>
dplyr::filter(codex::not_na(sources)) |>
dplyr::distinct() |>
dplyr::mutate(
sources = dplyr::case_when(
code == "2085U0001X" ~ stringr::str_glue(
"{sources}. Additional Resources: See 2085R0202X. ",
"The American Osteopathic Board of Radiology no longer offers a certificate in this specialty ",
"(Diagnostic Ultrasound is part of the scope of a Diagnostic Radiologist)."),
.default = sources)) |>
dplyr::reframe(
code,
type = "primary",
source = stringr::str_remove(sources, "^[Ss]ources?: "),
addition = stringr::str_detect(sources, " Additional Resources: "))

sources <- vctrs::vec_rbind(
sources |>
dplyr::filter(!addition),
sources |>
dplyr::filter(addition) |>
tidyr::separate_longer_delim(
cols = source,
delim = stringr::fixed(" Additional Resources: ")) |>
dplyr::mutate(
id = dplyr::row_number(),
.before = 1,
.by = code) |>
dplyr::mutate(
type = dplyr::if_else(
id == 1,
"primary",
"additional"),
id = NULL)) |>
dplyr::select(-addition) |>
dplyr::mutate(
type = factor(type,
levels = c("primary", "additional"), ordered = TRUE)) |>
dplyr::arrange(code, type)

# 2085U0001X
# 2008-07-01
# Additional Resources: The American Osteopathic Board of Radiology no longer offers a certificate in this specialty.
# Note: In medical practice, Diagnostic Ultrasound is part of the scope of training and practice of a Diagnostic Radiologist - see Taxonomy Code 2085R0202X.

pin_update(
sources,
"sources",
"NUCC Taxonomy Sources 2009-2024",
"Health Care Provider Taxonomy Code Set Sources 2009-2024"
)


#----------------------------- UPDATES
updates <- base |>
dplyr::select(-sources) |>
dplyr::filter(not_na(updates)) |>
dplyr::filter(codex::not_na(updates)) |>
tidyr::separate_longer_delim(
cols = updates,
delim = stringr::fixed("; ")) |>
Expand All @@ -24,8 +90,8 @@ updates <- base |>
first = stringr::str_extract_all(updates, ", \\d{1,2}")) |>
tidyr::unnest(first, keep_empty = TRUE) |>
dplyr::mutate(
first = str_remove(first, ", "),
multi = str_count(updates, ", \\d{1}/\\d{1}/\\d{4}"))
first = stringr::str_remove(first, ", "),
multi = stringr::str_count(updates, ", \\d{1,2}/\\d{1,2}/\\d{2,4}"))

updates <- vctrs::vec_rbind(
updates |> dplyr::filter(multi == 0),
Expand All @@ -36,21 +102,70 @@ updates <- vctrs::vec_rbind(
delim = stringr::regex(", \\d{1,2}")) |>
dplyr::mutate(updates = dplyr::if_else(
stringr::str_detect(updates, "^[/]"),
stringr::str_glue("{first}{updates}"), updates))
) |>
dplyr::select(-first, -multi) |>
stringr::str_glue("{first}{updates}"), updates))) |>
dplyr::select(-multi, -first) |>
dplyr::mutate(
dates = stringr::str_extract_all(updates, "\\d{1,2}[/]\\d{1,2}[/]\\d{4}"),
updates = stringr::str_remove_all(updates, "\\d{1,2}[/]\\d{1,2}[/]\\d{4}[:, ]\\s?")) |>
dates = stringr::str_extract_all(updates, "\\d{1,2}[/]\\d{1,2}[/]\\d{2,4}"),
updates = stringr::str_remove_all(updates, "\\d{1,2}[/]\\d{1,2}[/]\\d{2,4}[:]\\s?"),
updates = dplyr::if_else(updates == "New", "new", updates),
updates = stringr::str_remove_all(updates, "\\d{1}[/]\\d{1}[/]\\d{4},?\\s?"),
updates = dplyr::if_else(updates == "42 U.S.C. 1396d(1)(3)(B)] [added definition", "added definition", updates)) |>
tidyr::unnest(dates, keep_empty = TRUE) |>
dplyr::mutate(dates = clock::date_parse(dates, format = "%m/%d/%Y"))

updates |>
dplyr::mutate(
date_type = dplyr::case_when(
stringr::str_detect(updates, "[Nn]ew") ~ "effective",
dates = dplyr::if_else(dates == "7/1/24", "7/1/2024", dates),
dates = clock::date_parse(dates, format = "%m/%d/%Y"),
updates = dplyr::case_when(
code == "207RX0202X" & dates == "2007-07-01" ~ "added definition, added source",
code == "207RX0202X" & dates == "2007-11-05" ~ "corrected definition",
code == "320600000X" & dates == "2003-07-01" ~ "new",
code == "320600000X" & dates == "2021-01-01" ~ "modified title and definition",
code == "2085U0001X" & dates == "2008-07-01" ~ "definition added, source added",
.default = updates)) |>
dplyr::distinct() |>
dplyr::mutate(
category = dplyr::case_when(
stringr::str_detect(updates, "new") ~ "effective",
stringr::str_detect(updates, "inactive") ~ "deactivated",
.default = "modified") |> forcats::as_factor()) |>
dplyr::arrange(code, dates) |>
dplyr::count(updates, sort = TRUE) |>
print(n = 50)
.default = "modified"),
category = factor(
category,
levels = c("effective", "modified", "deactivated"), ordered = TRUE)
) |>
dplyr::arrange(code, dates, category) |>
dplyr::reframe(
code,
date_type = category,
date = dates,
change = stringr::str_replace(updates, "^marked inactive, use value ", "marked inactive, use "))

pin_update(
updates,
"changelog",
"NUCC Taxonomy Changelog 2009-2024",
"Health Care Provider Taxonomy Code Set Changelog 2009-2024"
)

#--------- Comparing raw dates to the updates
updates |> dplyr::filter(type == "modified")

arktax::retrieve_ark(which = "wide") |>
dplyr::select(code, modified) |>
dplyr::filter(codex::not_na(modified)) |>
dplyr::distinct() |>
print(n = 200)

updates |> dplyr::filter(type == "effective")

arktax::retrieve_ark(which = "wide") |>
dplyr::select(code, effective) |>
dplyr::filter(codex::not_na(effective)) |>
dplyr::distinct()

updates |>
dplyr::filter(type == "deactivated") |>
print(n = 30)

arktax::retrieve_ark(which = "wide") |>
dplyr::select(code, deactivated) |>
dplyr::filter(codex::not_na(deactivated)) |>
dplyr::distinct()
4 changes: 4 additions & 0 deletions inst/extdata/pins/_pins.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,7 @@ ark_long:
- ark_long/20241129T111715Z-13827/
ark_taxonomy:
- ark_taxonomy/20241124T214441Z-05252/
changelog:
- changelog/20241129T163133Z-a31f6/
sources:
- sources/20241129T163125Z-e153d/
Binary file not shown.
Loading

0 comments on commit 1f01bee

Please sign in to comment.