Thanks to visit codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ export(AggregatedContinuousData)
export(BelowNationalWQXLowerThreshold)
export(ConvertDepthUnits)
export(ConvertResultUnits)
export(ConvertSpecialChars)
export(FilterFieldReview)
export(FilterFields)
export(FilterParFieldReview)
Expand Down
75 changes: 47 additions & 28 deletions R/DataDiscoveryRetrieval.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
#' Generate TADA-compatible dataframe from WQP Data
#'
#' Retrieve data from Water Quality Portal (WQP) and generate a TADA-compatible
#' dataframe.
#' dataframe. Note that the inputs (e.g. project, organization, siteType) with the
#' exceptions of endDate and startDate match the web service call format from the
#' online WQP GUI. endDate and startDate match the format suggested in USGS's
#' dataRetrieval package (endDate = "YYYY-MM-DD"), which is a more familiar date
#' format for R users than the WQP GUI's endDateHi = "MM-DD-YYYY".
#'
#'
#' This function will create and/or edit the following columns:
#' TADA.DetectionLimitMeasureValue.Flag
Expand All @@ -13,7 +18,7 @@
#'
#' Keep in mind that all the query filters for the WQP work as an AND
#' but within the fields there are ORs. So for example,
#' characteristics – if you choose pH & DO – it’s an OR. Similarly, if you
#' characteristicNames – if you choose pH & DO – it’s an OR. Similarly, if you
#' choose VA and IL, it’s an OR. But the combo of fields are ANDs.
#' Such as State/VA AND Characteristic/DO.
#' "Characteristic" and "Characteristic Group" also work as an AND.
Expand All @@ -30,21 +35,21 @@
#' censored data later on (i.e., nondetections)
#'
#' Users can reference the \href{https://www.epa.gov/waterdata/storage-and-retrieval-and-water-quality-exchange-domain-services-and-downloads}{WQX domain tables}
#' to find allowable vales for queries, e.g., reference the WQX domain table to find countycode and statecode: https://cdx.epa.gov/wqx/download/DomainValues/County_CSV.zip
#' to find allowable values for queries, e.g., reference the WQX domain table to find countycode and statecode: https://cdx.epa.gov/wqx/download/DomainValues/County_CSV.zip
#' Alternatively, you can use the WQP services to find areas where data is available in the US: https://www.waterqualitydata.us/Codes/countycode
#'
#' See ?MeasureValueSpecialCharacters and ?autoclean documentation for more information.
#'
#' @param statecode Code that identifies a state
#' @param startDate Start Date in the format MM-DD-YYYY
#' @param startDate Start Date string in the format YYYY-MM-DD, for example, "2020-01-01"
#' @param countycode Code that identifies a county
#' @param siteid Unique monitoring station identifier
#' @param siteType Type of waterbody
#' @param characteristicName Name of parameter
#' @param sampleMedia Sampling substrate such as water, air, or sediment
#' @param ProjectIdentifier A string of letters and/or numbers (some additional characters also possible) used to signify a project with data in the Water Quality Portal
#' @param OrganizationIdentifier A string of letters and/or numbers (some additional characters also possible) used to signify an organization with data in the Water Quality Portal
#' @param endDate End Date in the format YYYY-MM-DD
#' @param project A string of letters and/or numbers (some additional characters also possible) used to signify a project with data in the Water Quality Portal
#' @param organization A string of letters and/or numbers (some additional characters also possible) used to signify an organization with data in the Water Quality Portal
#' @param endDate End Date string in the format YYYY-MM-DD
#' @param applyautoclean Logical, defaults to TRUE. Applies TADA's autoclean function on the returned data profile.
#'
#' @return TADA-compatible dataframe
Expand All @@ -56,10 +61,13 @@
#' tada1 <- TADAdataRetrieval(statecode = "WI",
#' countycode = "Dane",
#' characteristicName = "Phosphorus")
#' tada2 <- TADAdataRetrieval(ProjectIdentifier = "Anchorage Bacteria 20-21")
#'
#' tada2 <- TADAdataRetrieval(project = "Anchorage Bacteria 20-21")
#'
#' tada3 <- TADAdataRetrieval(statecode = "UT",
#' characteristicName = c("Ammonia", "Nitrate", "Nitrogen"),
#' startDate = "10-01-2020")
#' startDate = "2020-10-01")
#'
#' tada4 <- TADAdataRetrieval(statecode = "SC", countycode = "Abbeville")
#' # countycode queries require a statecode
#' tada5 <- TADAdataRetrieval(countycode = "US:02:020")
Expand All @@ -73,8 +81,8 @@ TADAdataRetrieval <- function(statecode = "null",
siteType = "null",
characteristicName = "null",
sampleMedia = "null",
ProjectIdentifier = "null",
OrganizationIdentifier = "null",
project = "null",
organization = "null",
endDate = "null",
applyautoclean = TRUE
) {
Expand All @@ -88,8 +96,14 @@ TADAdataRetrieval <- function(statecode = "null",
}

if (length(startDate)>1) {
if(is.na(suppressWarnings(lubridate::parse_date_time(startDate[1], orders = "ymd")))){
stop("Incorrect date format. Please use the format YYYY-MM-DD.")
}
WQPquery <- c(WQPquery, startDate = list(startDate))
} else if (startDate != "null") {
if(is.na(suppressWarnings(lubridate::parse_date_time(startDate, orders = "ymd")))){
stop("Incorrect date format. Please use the format YYYY-MM-DD.")
}
WQPquery <- c(WQPquery, startDate = startDate)
}

Expand Down Expand Up @@ -123,21 +137,27 @@ TADAdataRetrieval <- function(statecode = "null",
WQPquery <- c(WQPquery, sampleMedia = sampleMedia)
}

if (length(ProjectIdentifier)>1) {
WQPquery <- c(WQPquery, project = list(ProjectIdentifier))
} else if (ProjectIdentifier != "null") {
WQPquery <- c(WQPquery, project = ProjectIdentifier)
if (length(project)>1) {
WQPquery <- c(WQPquery, project = list(project))
} else if (project != "null") {
WQPquery <- c(WQPquery, project = project)
}

if (length(OrganizationIdentifier)>1) {
WQPquery <- c(WQPquery, organization = list(OrganizationIdentifier))
} else if (OrganizationIdentifier != "null") {
WQPquery <- c(WQPquery, organization = OrganizationIdentifier)
if (length(organization)>1) {
WQPquery <- c(WQPquery, organization = list(organization))
} else if (organization != "null") {
WQPquery <- c(WQPquery, organization = organization)
}

if (length(endDate)>1) {
if(is.na(suppressWarnings(lubridate::parse_date_time(endDate[1], orders = "ymd")))){
stop("Incorrect date format. Please use the format YYYY-MM-DD.")
}
WQPquery <- c(WQPquery, endDate = list(endDate))
} else if (endDate != "null") {
if(is.na(suppressWarnings(lubridate::parse_date_time(endDate, orders = "ymd")))){
stop("Incorrect date format. Please use the format YYYY-MM-DD.")
}
WQPquery <- c(WQPquery, endDate = endDate)
}

Expand Down Expand Up @@ -316,23 +336,22 @@ TADABigdataRetrieval <- function(startDate = "null",
if(!"null"%in%statecode&!"null"%in%huc){stop("Please provide either state code(s) OR huc(s) to proceed.")}

if(!startDate=="null"){
startDate_Low = lubridate::ymd(startDate)
startYearLo = lubridate::year(startDate_Low)
startDate = lubridate::ymd(startDate)
startYearLo = lubridate::year(startDate)
}else{ # else: pick a date before which any data are unlikely to be in WQP
startDate = "1800-01-01"
startDate_Low = lubridate::ymd(startDate)
startYearLo = lubridate::year(startDate_Low)
startDate = lubridate::ymd("1800-01-01")
startYearLo = lubridate::year(startDate)
}

# Logic: if the input endDate is not null, convert to date and obtain year
# for summary
if(!endDate=="null"){
endDate_High = lubridate::ymd(endDate)
endYearHi = lubridate::year(endDate_High)
endDate = lubridate::ymd(endDate)
endYearHi = lubridate::year(endDate)
}else{ # else: if not populated, default to using today's date/year for summary
endDate = Sys.Date()
endDate_High = lubridate::ymd(endDate)
endYearHi = lubridate::year(endDate_High)
endDate = lubridate::ymd(endDate)
endYearHi = lubridate::year(endDate)
}

# Create readWQPsummary query
Expand Down
64 changes: 62 additions & 2 deletions R/Utilities.R
Original file line number Diff line number Diff line change
Expand Up @@ -64,15 +64,25 @@ autoclean <- function(.data) {
toupper(.data$DetectionQuantitationLimitMeasure.MeasureUnitCode)
# .data$BiologicalIntentName = toupper(.data$BiologicalIntentName)

# Remove duplicate rows
.data <- .data[!duplicated(.data), ]
# Remove duplicate rows - turned into a test because duplicated() takes a long
# time acting on all columns in a large dataset.
if(!length(unique(.data$ResultIdentifier))==dim(.data)[1]){
print("Duplicate records may be present. Filtering to unique records. This may take a while on large datasets.")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

change to "Duplicate records are present" ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wrote "may be present" because I could potentially see someone using autoclean on a dataset where they joined detection limit data to result data and thus have unique rows with the same result identifier and different detection limits. In this case, the function will check to make sure these cases are truly unique.

dup_rids = names(table(.data$ResultIdentifier)[table(.data$ResultIdentifier)>1])
dup_check = .data%>%dplyr::filter(ResultIdentifier%in%dup_rids)%>%dplyr::group_by(ResultIdentifier)%>%dplyr::distinct()
not_dups = .data%>%dplyr::filter(!ResultIdentifier%in%dup_rids)
.data = plyr::rbind.fill(dup_check, not_dups)
}


# Remove complex biological data
.data <- dplyr::filter(.data, ActivityMediaName == "WATER")
# .data = dplyr::filter(.data, BiologicalIntentName != "TISSUE" | "TOXICITY" | is.na(InvalidCoordinates)== TRUE)

# run MeasureValueSpecialCharacters function
.data <- MeasureValueSpecialCharacters(.data)
# .data <- ConvertSpecialChars(.data, "ResultMeasureValue")
# .data <- ConvertSpecialChars(.data, "DetectionQuantitationLimitMeasure.MeasureValue")

# change latitude and longitude measures to class numeric
.data$LatitudeMeasure <- as.numeric(.data$LatitudeMeasure)
Expand Down Expand Up @@ -402,3 +412,53 @@ checkColumns <- function(.data, expected_cols) {
stop("The dataframe does not contain the required fields to use TADA. Use either the full physical/chemical profile downloaded from WQP or download the TADA profile template available on the EPA TADA webpage.")
}
}



#' ConvertSpecialChars
#'
#' Screens a column of the user's choice for special characters and converts
#' it to numeric. The original column is left untouched. Two new columns are
#' inserted immediately after it: `<col>.nochar`, holding the numeric values
#' obtained after stripping the characters `<`, `>`, `~`, `,` and `%`, and
#' `<col>.nochar_flag`, describing the content of each original value prior to
#' conversion.
#'
#' Flags are assigned in the following priority order (first match wins):
#' "ND or NA", "Numeric", "Less Than", "Greater Than", "Approximate Value",
#' "Text", "Percentage", "Comma-Separated Numeric", and finally
#' "Coerced to NA" for anything else.
#'
#' @param .data A TADA profile object
#' @param col A single string naming the character column to be converted to
#'   numeric
#'
#' @return The input dataframe with the columns `<col>.nochar` and
#'   `<col>.nochar_flag` added directly after `col`. If columns with those
#'   names already exist they are replaced.
#'
#' @export
#'

ConvertSpecialChars <- function(.data, col) {
  # col must be exactly one column name; a vector of names would otherwise
  # make the `if` conditions below ambiguous.
  if (!is.character(col) || length(col) != 1L || is.na(col)) {
    stop("Invalid column name specified for input dataset.")
  }
  if (!col %in% names(.data)) {
    stop("Invalid column name specified for input dataset.")
  }
  # Inspect the column itself, not the column *name* (which is always a
  # character string and therefore never "numeric").
  if (is.numeric(.data[[col]])) {
    stop("Column is already numeric. This conversion not needed.")
  }

  # Work on a character copy so factors are judged by their labels rather
  # than by their underlying integer codes.
  original <- as.character(.data[[col]])
  parsed <- suppressWarnings(as.numeric(original))

  # Classification rules in priority order (first match wins).
  rules <- list(
    "ND or NA" = is.na(original),
    "Numeric" = !is.na(parsed),
    "Less Than" = grepl("<", original, fixed = TRUE),
    "Greater Than" = grepl(">", original, fixed = TRUE),
    "Approximate Value" = grepl("~", original, fixed = TRUE),
    "Text" = grepl("[A-Za-z]", original),
    "Percentage" = grepl("%", original, fixed = TRUE),
    "Comma-Separated Numeric" = grepl(",", original, fixed = TRUE)
  )
  flag <- rep("Coerced to NA", length(original))
  # Apply the rules from lowest to highest priority so that a higher-priority
  # rule overwrites any lower-priority match.
  for (label in rev(names(rules))) {
    flag[rules[[label]]] <- label
  }

  # Strip the special characters and convert; anything still non-numeric
  # (e.g. free text) becomes NA.
  cleaned <- suppressWarnings(as.numeric(gsub("[<>~,%]", "", original)))

  value_name <- paste0(col, ".nochar")
  flag_name <- paste0(col, ".nochar_flag")
  .data[[value_name]] <- cleaned
  .data[[flag_name]] <- flag

  # Place the two new columns directly after the original column without
  # temporarily renaming anything, so existing columns are never clobbered.
  other_cols <- setdiff(names(.data), c(value_name, flag_name))
  col_order <- append(
    other_cols,
    c(value_name, flag_name),
    after = match(col, other_cols)
  )
  .data[, col_order, drop = FALSE]
}
19 changes: 19 additions & 0 deletions man/ConvertSpecialChars.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

29 changes: 18 additions & 11 deletions man/TADAdataRetrieval.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion tests/testthat/test-DataDiscoveryRetrieval.R
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ test_that("TADAdataRetrieval", {
test_that("TADAdataRetrieval", {
check_autoclean_meters_works <- TADAdataRetrieval(statecode = "UT",
characteristicName = c("Ammonia", "Nitrate", "Nitrogen"),
startDate = "01-01-2021")
startDate = "2021-01-01")
expect_true(any(check_autoclean_meters_works$ActivityDepthHeightMeasure.MeasureUnitCode!="meters"))
})

Expand Down
Loading