49 changes: 36 additions & 13 deletions R/RequiredCols.R
@@ -341,34 +341,55 @@ TADA_GetTemplate <- function() {



#' TADA Module 1 Required Fields Check
#' TADA Required Fields Check
#'
#' This function checks if all required fields for TADA Module 1 are
#' included in the input dataframe.
#' This function checks if all fields required to run TADA functions are included in the input
#' dataframe. It is used in the TADA Shiny application to test user-supplied files for compatibility
#' with the application.
#'
#' @param .data A dataframe
#'
#' @return Boolean result indicating whether or not the input dataframe contains all of the TADA profile fields.
#' @return Boolean result, TRUE or FALSE, indicating whether or not the input dataframe contains all
#' of the required fields. If FALSE, an error will be returned that includes the names of all
#' missing columns.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' # Find web service URLs for each Profile using WQP User Interface (https://www.waterqualitydata.us/)
#' # Example WQP URL: https://www.waterqualitydata.us/#statecode=US%3A09&characteristicType=Nutrient&startDateLo=04-01-2023&startDateHi=11-01-2023&mimeType=csv&providers=NWIS&providers=STEWARDS&providers=STORET
#'
#'
#' # Use TADA_ReadWQPWebServices to load the Station, Project, and Phys-Chem Result profiles
#' stationProfile <- TADA_ReadWQPWebServices("https://www.waterqualitydata.us/data/Station/search?statecode=US%3A09&characteristicType=Nutrient&startDateLo=04-01-2023&startDateHi=11-01-2023&mimeType=csv&zip=yes&providers=NWIS&providers=STEWARDS&providers=STORET")
#' physchemProfile <- TADA_ReadWQPWebServices("https://www.waterqualitydata.us/data/Result/search?statecode=US%3A09&characteristicType=Nutrient&startDateLo=04-01-2023&startDateHi=11-01-2023&mimeType=csv&zip=yes&dataProfile=resultPhysChem&providers=NWIS&providers=STEWARDS&providers=STORET")
#' projectProfile <- TADA_ReadWQPWebServices("https://www.waterqualitydata.us/data/Project/search?statecode=US%3A09&characteristicType=Nutrient&startDateLo=04-01-2023&startDateHi=11-01-2023&mimeType=csv&zip=yes&providers=NWIS&providers=STEWARDS&providers=STORET")
#'
#'
#' # Join all three profiles using TADA_JoinWQPProfiles
#' TADAProfile <- TADA_JoinWQPProfiles(FullPhysChem = physchemProfile, Sites = stationProfile, Projects = projectProfile)
#'
#' # Run TADA_CheckRequiredFields
#' CheckRequirements_TADAProfile <- TADA_CheckRequiredFields(TADAProfile)
#' TADAProfile <- TADA_JoinWQPProfiles(FullPhysChem = physchemProfile, Sites = stationProfile,
#' Projects = projectProfile)
#'
#' # Run TADA_CheckRequiredFields, which returns the error message
#' # 'TADA_CheckRequiredFields: the dataframe does not contain the required fields: ActivityStartDateTime'
#' TADA_CheckRequiredFields(TADAProfile)
#'
#' # Add the missing column using dataRetrieval's internal create_dateTime function
#' TADAProfile1 <- dataRetrieval:::create_dateTime(df = TADAProfile,
#' date_col = "ActivityStartDate",
#' time_col = "ActivityStartTime.Time",
#' tz_col = "ActivityStartTime.TimeZoneCode",
#' tz = "UTC")
#'
#' review_TADAProfile1 = TADAProfile1 %>% dplyr::select(c("ActivityStartDate",
#' "ActivityStartTime.Time",
#' "ActivityStartTime.TimeZoneCode",
#' "ActivityStartDateTime",
#' "ActivityStartTime.TimeZoneCode_offset"))
#'
#' # Re-run TADA_CheckRequiredFields, which now returns TRUE
#' TADA_CheckRequiredFields(TADAProfile1)
#' }
#'
#'
TADA_CheckRequiredFields <- function(.data) {
# remove names with TADA. string from require.cols
require.originals <- Filter(function(x) !any(grepl("TADA.", x)), require.cols)
@@ -380,8 +401,10 @@ TADA_CheckRequiredFields <- function(.data) {
if (all(require.originals %in% colnames(.data)) == TRUE) {
TRUE
} else {
stop("The dataframe does not contain the required fields.")
}
missingcols <- base::setdiff(require.originals, colnames(.data))
stop("TADA_CheckRequiredFields: the dataframe does not contain the required fields: ",
paste(as.character(missingcols), collapse = ", "))
}
}


47 changes: 35 additions & 12 deletions R/Utilities.R
@@ -224,7 +224,6 @@ TADA_AutoClean <- function(.data) {
# execute function after checks are passed



# check to make sure columns do not already exist and capitalize fields with known synonyms that
# only differ in caps
print("TADA_Autoclean: creating TADA-specific columns.")
@@ -272,6 +271,18 @@ TADA_AutoClean <- function(.data) {
.data$TADA.ResultMeasure.MeasureUnitCode <- toupper(.data$ResultMeasure.MeasureUnitCode)
}

if ("ActivityStartDateTime" %in% colnames(.data)) {
@cristinamullin (Collaborator, Author) commented on Dec 17, 2024:

I'd like to add ActivityStartDateTime and convert time zones in our autoclean function here, in the same format/way as the DR functions do. This is needed in cases where, in the TADAShiny app, a user brings their own data file that was not downloaded using either EPATADA or dataRetrieval, but has all other required cols. It should replicate any additional things dataRetrieval readWQPdata does to a df.

dataRetrieval produces "2022-06-08 16:00:00" while my code below produces "2023-05-11 11:45:00 UTC". DR also converts times and time zones to UTC, but that's not included in the value here. The time zone conversion and addition of ActivityStartDateTime occur even when attributes are ignored (which is fine, we need that col):

  results.DR <- dataRetrieval::readWQPdata(WQPquery,
    dataProfile = "resultPhysChem",
    ignore_attributes = TRUE
  )

I'd like to understand this piece of dataRetrieval better. If there is a stand-alone function that can change all date/time fields to UTC and also creates this ActivityStartDateTime col (and does anything else that is happening behind the scenes), we could leverage that here instead.

@ldecicco-USGS do you have any advice on this topic? Can you please point us to the location in your code where this occurs? I did a quick search of your repo but couldn't find it. Thanks!

Contributor replied:

The internal code is here:
https://github.com/DOI-USGS/dataRetrieval/blob/main/R/importWQP.R#L223
(you can call it, you'd just need to do a triple colon: dataRetrieval:::create_dateTime)

offsetLibrary is a dataframe saved in sysdata.rda
You can see where and how it gets called here:
https://github.com/DOI-USGS/dataRetrieval/blob/main/R/importWQP.R#L160

Let me know if there's something unclear

@cristinamullin (Collaborator, Author) replied:

Thank you! This really helped me understand how to interpret both the ActivityStartDateTime and offset cols (these started showing up in TADA df's for the first time last month: ActivityStartTime.TimeZoneCode_offset and ActivityEndTime.TimeZoneCode_offset) that are returned from dataRetrieval. For dataRetrieval users, I think it might be more user-friendly to include a column titled ActivityStartDateTime.TimeZoneCode (UTC in this case) instead of the ActivityStartTime.TimeZoneCode_offset (which includes the number of hours). As is, the target time zone for ActivityStartDateTime (a function input here) is not documented anywhere in the returned df (see review_TADAProfile1 below). Alternatively, UTC could potentially be included in ActivityStartDateTime, but that might break people's workflows (e.g. "2023-05-11 11:45:00 UTC"). I am going to create a separate issue in the TADA repo to document this issue and discuss how to address it. Is this something you are potentially interested in updating in dataRetrieval?

# Find web service URLs for each Profile using WQP User Interface (https://www.waterqualitydata.us/)
# Example WQP URL: https://www.waterqualitydata.us/#statecode=US%3A09&characteristicType=Nutrient&startDateLo=04-01-2023&startDateHi=11-01-2023&mimeType=csv&providers=NWIS&providers=STEWARDS&providers=STORET

# Use TADA_ReadWQPWebServices to load the Station, Project, and Phys-Chem Result profiles
stationProfile <- TADA_ReadWQPWebServices("https://www.waterqualitydata.us/data/Station/search?statecode=US%3A09&characteristicType=Nutrient&startDateLo=04-01-2023&startDateHi=11-01-2023&mimeType=csv&zip=yes&providers=NWIS&providers=STEWARDS&providers=STORET")
physchemProfile <- TADA_ReadWQPWebServices("https://www.waterqualitydata.us/data/Result/search?statecode=US%3A09&characteristicType=Nutrient&startDateLo=04-01-2023&startDateHi=11-01-2023&mimeType=csv&zip=yes&dataProfile=resultPhysChem&providers=NWIS&providers=STEWARDS&providers=STORET")
projectProfile <- TADA_ReadWQPWebServices("https://www.waterqualitydata.us/data/Project/search?statecode=US%3A09&characteristicType=Nutrient&startDateLo=04-01-2023&startDateHi=11-01-2023&mimeType=csv&zip=yes&providers=NWIS&providers=STEWARDS&providers=STORET")

# Join all three profiles using TADA_JoinWQPProfiles
TADAProfile <- TADA_JoinWQPProfiles(FullPhysChem = physchemProfile, Sites = stationProfile, Projects = projectProfile)

# Run TADA_CheckRequiredFields, returns error message, 'The dataframe does not contain the required fields: ActivityStartDateTime'
TADA_CheckRequiredFields(TADAProfile)

# Add missing col
TADAProfile1 <- dataRetrieval:::create_dateTime(df = TADAProfile, 
                                         date_col = "ActivityStartDate", 
                                         time_col = "ActivityStartTime.Time",
                                         tz_col = "ActivityStartTime.TimeZoneCode", 
                                         tz = "UTC")

review_TADAProfile1 = TADAProfile1 %>% dplyr::select(c("ActivityStartDate", 
                          "ActivityStartTime.Time", 
                          "ActivityStartTime.TimeZoneCode", 
                          "ActivityStartDateTime",
                          "ActivityStartTime.TimeZoneCode_offset"))

# re-run TADA_CheckRequiredFields, returns TRUE
TADA_CheckRequiredFields(TADAProfile1)
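
As a rough sketch of the column suggested above (illustrative only; ActivityStartDateTime.TimeZoneCode is a proposed name, and TADAProfile2 is an illustrative object, neither currently produced by dataRetrieval or EPATADA), the target time zone could be copied out of the POSIXct "tzone" attribute:

# Sketch only: expose the target time zone as its own column next to the offset.
# Assumes create_dateTime set the "tzone" attribute on ActivityStartDateTime (e.g. "UTC").
TADAProfile2 <- TADAProfile1
TADAProfile2$ActivityStartDateTime.TimeZoneCode <-
  attr(TADAProfile2$ActivityStartDateTime, "tzone")

head(TADAProfile2[, c("ActivityStartDateTime",
                      "ActivityStartTime.TimeZoneCode_offset",
                      "ActivityStartDateTime.TimeZoneCode")])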

@cristinamullin (Collaborator, Author) added:

created a separate issue here to continue the conversation: #558

Contributor replied:

Just making sure we're both on the same page: in dataRetrieval, the default timezone is UTC, set here:
https://github.com/DOI-USGS/dataRetrieval/blob/main/R/readWQPdata.R#L200
You can read about changing timezones here:
https://doi-usgs.github.io/dataRetrieval/reference/readWQPdata.html#arg-tz
This sets the time zone attribute of the POSIX object.

Like this:

library(dataRetrieval)
nameToUse <- "pH"
pHData <- readWQPdata(siteid = "USGS-04024315", 
                      characteristicName = nameToUse,
                      service = "ResultWQX")
attr(pHData$Activity_StartDateTime, "tzone")
[1] "UTC"
pHData$Activity_StartDateTime[1]
[1] "1975-09-27 15:50:00 UTC"

pHData2 <- readWQPdata(siteid = "USGS-04024315", 
                      characteristicName = nameToUse,
                      tz = "America/Chicago",
                      service = "ResultWQX")
attr(pHData2$Activity_StartDateTime, "tzone")
[1] "America/Chicago"
pHData2$Activity_StartDateTime[1]
[1] "1975-09-27 10:50:00 CDT"

So what you are asking for is a column that converts the offset number of hours to the timezone it was converted to?

Note there's also the link in the help to the OlsonNames() base R function, which talks about how R handles timezones. The issue is that different operating systems, depending on where in the world the computer thinks you are, will want different abbreviations for timezones (that's why using OlsonNames() is what has been working best for dataRetrieval).
https://rdrr.io/r/base/timezones.html
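
A minimal sketch of that point (the date and values are illustrative only): OlsonNames() lists the valid IANA time zone names, and changing the "tzone" attribute only changes how the same instant is displayed.

# Minimal sketch: same instant, different display depending on the "tzone" attribute.
t_utc <- as.POSIXct("2023-05-11 11:45:00", tz = "UTC")
"America/Chicago" %in% OlsonNames()   # TRUE on systems with the IANA tz database
attr(t_utc, "tzone") <- "America/Chicago"
t_utc                                 # "2023-05-11 06:45:00 CDT" -- same moment in time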

.data <- .data
} else {
# creates ActivityStartDateTime and ActivityStartTime.TimeZoneCode_offset
# this is only needed when dataRetrieval is not used to get WQP data
.data <- dataRetrieval:::create_dateTime(df = .data,
date_col = "ActivityStartDate",
time_col = "ActivityStartTime.Time",
tz_col = "ActivityStartTime.TimeZoneCode",
tz = "UTC")
}

# Transform "Dissolved oxygen (DO)" characteristic name to "DISSOLVED OXYGEN SATURATION" IF
# result unit is "%" or "% SATURATN".

@@ -915,7 +926,9 @@ TADA_GetUniqueNearbySites <- function(.data) {
#'
#' Retrieves data for a period of time in the past 20 years using
#' TADA_DataRetrieval. This function can be used for testing functions on
#' random datasets.
#' random datasets. Only random datasets with 10 or more results will be returned.
#' If a random dataset has fewer than 10 results, the function will automatically
#' create another random WQP query until a df with 10 or more results is returned.
#'
#' @param number_of_days Numeric. The default is 1, which will query and retrieve
#' data for a random two-day period (e.g. startDate = "2015-04-21",
@@ -943,20 +956,23 @@ TADA_GetUniqueNearbySites <- function(.data) {
#' df <- TADA_RandomTestingData(number_of_days = 5, choose_random_state = TRUE, autoclean = FALSE)
#' }
#'
TADA_RandomTestingData <- function(number_of_days = 1, choose_random_state = FALSE, autoclean = TRUE) {
while (TRUE) {
TADA_RandomTestingData <- function(number_of_days = 1, choose_random_state = FALSE,
autoclean = TRUE) {

get_random_data <- function(ndays = number_of_days, state_choice = choose_random_state,
ac = autoclean) {
# choose a random day within the last 20 years
twenty_yrs_ago <- Sys.Date() - 20 * 365
random_start_date <- twenty_yrs_ago + sample(20 * 365, 1)
# choose a random start date and add any number_of_days (set that as the end date)
end_date <- random_start_date + number_of_days
end_date <- random_start_date + ndays

if (choose_random_state == TRUE) {
if (state_choice == TRUE) {
load(system.file("extdata", "statecodes_df.Rdata", package = "EPATADA"))
state <- sample(statecodes_df$STUSAB, 1)
}

if (choose_random_state == FALSE) {
if (state_choice == FALSE) {
state <- "null"
}

@@ -966,7 +982,7 @@ TADA_RandomTestingData <- function(number_of_days = 1, choose_random_state = FAL
statecode = state
))

if (autoclean == TRUE) {
if (ac == TRUE) {
dat <- TADA_DataRetrieval(
startDate = as.character(random_start_date),
endDate = as.character(end_date),
Expand All @@ -975,19 +991,26 @@ TADA_RandomTestingData <- function(number_of_days = 1, choose_random_state = FAL
)
}

if (autoclean == FALSE) {
if (ac == FALSE) {
dat <- TADA_DataRetrieval(
startDate = as.character(random_start_date),
endDate = as.character(end_date),
statecode = state,
applyautoclean = FALSE
)
}

if (nrow(dat) > 0) {
return(dat)
return(dat)
}

verify_random_data <- function() {
df <- get_random_data()
while(nrow(df) < 10) {
df <- get_random_data()
}
return(df)
}

verify_random_data()
}

#' Aggregate multiple result values to a min, max, or mean
35 changes: 28 additions & 7 deletions man/TADA_CheckRequiredFields.Rd


4 changes: 3 additions & 1 deletion man/TADA_RandomTestingData.Rd

