USEPA · cristinamullin · Feb 13, 2023 · Feb 9, 2023 · Feb 9, 2023 · Feb 9, 2023
diff --git a/NAMESPACE b/NAMESPACE
@@ -6,6 +6,7 @@ export(AggregatedContinuousData)
 export(BelowNationalWQXLowerThreshold)
 export(ConvertDepthUnits)
 export(ConvertResultUnits)
+export(ConvertSpecialChars)
 export(FilterFieldReview)
 export(FilterFields)
 export(FilterParFieldReview)

diff --git a/R/DataDiscoveryRetrieval.R b/R/DataDiscoveryRetrieval.R
@@ -1,7 +1,12 @@
 #' Generate TADA-compatible dataframe from WQP Data
 #'
 #' Retrieve data from Water Quality Portal (WQP) and generate a TADA-compatible
-#' dataframe.
+#' dataframe. Note that the inputs (e.g. project, organization, siteType), with
+#' the exception of endDate and startDate, match the web service call format of
+#' the online WQP GUI. endDate and startDate instead use the format suggested in
+#' USGS's dataRetrieval package (endDate = "YYYY-MM-DD"), which is a more
+#' familiar date format for R users than the WQP GUI's endDateHi = "MM-DD-YYYY".
+#'  
 #' 
 #' This function will create and/or edit the following columns:
 #' TADA.DetectionLimitMeasureValue.Flag
@@ -13,7 +18,7 @@
 #' 
 #' Keep in mind that all the query filters for the WQP work as an AND 
 #' but within the fields there are ORs. So for example, 
-#' characteristics – if you choose pH & DO – it’s an OR. Similarly, if you
+#' characteristicName – if you choose pH & DO – it’s an OR. Similarly, if you
 #' choose VA and IL, it’s an OR. But the combo of fields are ANDs. 
 #' Such as State/VA AND Characteristic/DO". 
 #' "Characteristic" and "Characteristic Group" also work as an AND. 
@@ -30,21 +35,21 @@
 #' censored data later on (i.e., nondetections)
 #' 
 #' Users can reference the \href{https://www.epa.gov/waterdata/storage-and-retrieval-and-water-quality-exchange-domain-services-and-downloads}{WQX domain tables}
-#' to find allowable vales for queries, e.g., reference the WQX domain table to find countycode and statecode: https://cdx.epa.gov/wqx/download/DomainValues/County_CSV.zip
+#' to find allowable values for queries, e.g., reference the WQX domain table to find countycode and statecode: https://cdx.epa.gov/wqx/download/DomainValues/County_CSV.zip
 #' Alternatively, you can use the WQP services to find areas where data is available in the US: https://www.waterqualitydata.us/Codes/countycode
 #'  
 #' See ?MeasureValueSpecialCharacters and ?autoclean documentation for more information.
 #' 
 #' @param statecode Code that identifies a state
-#' @param startDate Start Date in the format MM-DD-YYYY
+#' @param startDate Start Date string in the format YYYY-MM-DD, for example, "2020-01-01"
 #' @param countycode Code that identifies a county 
 #' @param siteid Unique monitoring station identifier
 #' @param siteType Type of waterbody
 #' @param characteristicName Name of parameter
 #' @param sampleMedia Sampling substrate such as water, air, or sediment
-#' @param ProjectIdentifier A string of letters and/or numbers (some additional characters also possible) used to signify a project with data in the Water Quality Portal
-#' @param OrganizationIdentifier A string of letters and/or numbers (some additional characters also possible) used to signify an organization with data in the Water Quality Portal
-#' @param endDate End Date in the format YYYY-MM-DD
+#' @param project A string of letters and/or numbers (some additional characters also possible) used to signify a project with data in the Water Quality Portal
+#' @param organization A string of letters and/or numbers (some additional characters also possible) used to signify an organization with data in the Water Quality Portal
+#' @param endDate End Date string in the format YYYY-MM-DD
 #' @param applyautoclean Logical, defaults to TRUE. Applies TADA's autoclean function on the returned data profile.
 #'
 #' @return TADA-compatible dataframe
@@ -56,10 +61,13 @@
 #' tada1 <- TADAdataRetrieval(statecode = "WI",
 #'                            countycode = "Dane",
 #'                            characteristicName = "Phosphorus")
-#' tada2 <- TADAdataRetrieval(ProjectIdentifier = "Anchorage Bacteria 20-21")
+#' 
+#' tada2 <- TADAdataRetrieval(project = "Anchorage Bacteria 20-21")
+#' 
 #' tada3 <- TADAdataRetrieval(statecode = "UT", 
 #'                            characteristicName = c("Ammonia", "Nitrate", "Nitrogen"), 
-#'                            startDate = "10-01-2020")
+#'                            startDate = "2020-10-01")
+#' 
 #' tada4 <- TADAdataRetrieval(statecode = "SC", countycode  = "Abbeville")
 #' # countycode queries require a statecode
 #' tada5 <- TADAdataRetrieval(countycode = "US:02:020")
@@ -73,8 +81,8 @@ TADAdataRetrieval <- function(statecode = "null",
                               siteType = "null",
                               characteristicName = "null",
                               sampleMedia = "null",
-                              ProjectIdentifier = "null",
-                              OrganizationIdentifier = "null",
+                              project = "null",
+                              organization = "null",
                               endDate = "null",
                               applyautoclean = TRUE
                               ) {
@@ -88,8 +96,14 @@ TADAdataRetrieval <- function(statecode = "null",
   }
 
   if (length(startDate)>1) {
+    if(is.na(suppressWarnings(lubridate::parse_date_time(startDate[1], orders = "ymd")))){
+      stop("Incorrect date format. Please use the format YYYY-MM-DD.")
+    }
     WQPquery <- c(WQPquery, startDate = list(startDate)) 
   } else if (startDate != "null") {
+    if(is.na(suppressWarnings(lubridate::parse_date_time(startDate, orders = "ymd")))){
+      stop("Incorrect date format. Please use the format YYYY-MM-DD.")
+    }
     WQPquery <- c(WQPquery, startDate = startDate)
   }
 
@@ -123,21 +137,27 @@ TADAdataRetrieval <- function(statecode = "null",
     WQPquery <- c(WQPquery, sampleMedia = sampleMedia)
   }
 
-  if (length(ProjectIdentifier)>1) {
-    WQPquery <- c(WQPquery, project = list(ProjectIdentifier)) 
-  } else if (ProjectIdentifier != "null") {
-    WQPquery <- c(WQPquery, project = ProjectIdentifier)
+  if (length(project)>1) {
+    WQPquery <- c(WQPquery, project = list(project)) 
+  } else if (project != "null") {
+    WQPquery <- c(WQPquery, project = project)
   }
 
-  if (length(OrganizationIdentifier)>1) {
-    WQPquery <- c(WQPquery, organization = list(OrganizationIdentifier)) 
-  } else if (OrganizationIdentifier != "null") {
-    WQPquery <- c(WQPquery, organization = OrganizationIdentifier)
+  if (length(organization)>1) {
+    WQPquery <- c(WQPquery, organization = list(organization)) 
+  } else if (organization != "null") {
+    WQPquery <- c(WQPquery, organization = organization)
   }
 
   if (length(endDate)>1) {
+    if(is.na(suppressWarnings(lubridate::parse_date_time(endDate[1], orders = "ymd")))){
+      stop("Incorrect date format. Please use the format YYYY-MM-DD.")
+    }
     WQPquery <- c(WQPquery, endDate = list(endDate)) 
   } else if (endDate != "null") {
+    if(is.na(suppressWarnings(lubridate::parse_date_time(endDate, orders = "ymd")))){
+      stop("Incorrect date format. Please use the format YYYY-MM-DD.")
+    }
     WQPquery <- c(WQPquery, endDate = endDate)
   }
 
@@ -316,23 +336,22 @@ TADABigdataRetrieval <- function(startDate = "null",
   if(!"null"%in%statecode&!"null"%in%huc){stop("Please provide either state code(s) OR huc(s) to proceed.")}
 
   if(!startDate=="null"){
-    startDate_Low = lubridate::ymd(startDate)
-    startYearLo = lubridate::year(startDate_Low)
+    startDate = lubridate::ymd(startDate)
+    startYearLo = lubridate::year(startDate)
   }else{ # else: pick a date before which any data are unlikely to be in WQP
-    startDate = "1800-01-01"
-    startDate_Low = lubridate::ymd(startDate)
-    startYearLo = lubridate::year(startDate_Low)
+    startDate = lubridate::ymd("1800-01-01")
+    startYearLo = lubridate::year(startDate)
   } 
 
 # Logic: if the input endDate is not null, convert to date and obtain year
   # for summary
   if(!endDate=="null"){
-    endDate_High = lubridate::ymd(endDate)
-    endYearHi = lubridate::year(endDate_High)
+    endDate = lubridate::ymd(endDate)
+    endYearHi = lubridate::year(endDate)
   }else{ # else: if not populated, default to using today's date/year for summary
     endDate = Sys.Date()
-    endDate_High = lubridate::ymd(endDate)
-    endYearHi = lubridate::year(endDate_High)
+    endDate = lubridate::ymd(endDate)
+    endYearHi = lubridate::year(endDate)
   }
 
   # Create readWQPsummary query

diff --git a/R/Utilities.R b/R/Utilities.R
@@ -64,15 +64,25 @@ autoclean <- function(.data) {
     toupper(.data$DetectionQuantitationLimitMeasure.MeasureUnitCode)
   # .data$BiologicalIntentName = toupper(.data$BiologicalIntentName)
 
-  # Remove duplicate rows
-  .data <- .data[!duplicated(.data), ]
+  # Remove duplicate rows. Check for repeated ResultIdentifier values first,
+  # because running duplicated() on every column of a large dataset is slow.
+  if(!length(unique(.data$ResultIdentifier))==dim(.data)[1]){
+    print("Duplicate records may be present. Filtering to unique records. This may take a while on large datasets.")
+    dup_rids = names(table(.data$ResultIdentifier)[table(.data$ResultIdentifier)>1])
+    dup_check = .data%>%dplyr::filter(ResultIdentifier%in%dup_rids)%>%dplyr::group_by(ResultIdentifier)%>%dplyr::distinct()
+    not_dups = .data%>%dplyr::filter(!ResultIdentifier%in%dup_rids)
+    .data = plyr::rbind.fill(dup_check, not_dups)
+  }
+
 
   # Remove complex biological data
   .data <- dplyr::filter(.data, ActivityMediaName == "WATER")
   # .data = dplyr::filter(.data, BiologicalIntentName != "TISSUE" | "TOXICITY" | is.na(InvalidCoordinates)== TRUE)
 
   # run MeasureValueSpecialCharacters function
   .data <- MeasureValueSpecialCharacters(.data)
+  # .data <- ConvertSpecialChars(.data, "ResultMeasureValue")
+  # .data <- ConvertSpecialChars(.data, "DetectionQuantitationLimitMeasure.MeasureValue")
 
   # change latitude and longitude measures to class numeric
   .data$LatitudeMeasure <- as.numeric(.data$LatitudeMeasure)
@@ -402,3 +412,53 @@ checkColumns <- function(.data, expected_cols) {
     stop("The dataframe does not contain the required fields to use TADA. Use either the full physical/chemical profile downloaded from WQP or download the TADA profile template available on the EPA TADA webpage.")
   }
 }
+
+
+
+#' Convert a character column with special characters to numeric
+#'
+#' This function will screen a column of the user's choice for special
+#' characters. It inserts two new columns directly after that column:
+#' "<col>.nochar" holds the values converted to numeric after removing the
+#' characters "<", ">", "~", "," and "%", and "<col>.nochar_flag" describes
+#' the content of each value prior to conversion. The original column is
+#' returned unchanged.
+#'
+#' @param .data A TADA profile object
+#' @param col Name (a single string) of the character column to be converted
+#'
+#' @return .data with the ".nochar" and ".nochar_flag" columns added
+#'
+#' @export
+ConvertSpecialChars <- function(.data, col) {
+  if (!is.character(col) || length(col) != 1L || !col %in% names(.data)) {
+    stop("Invalid column name specified for input dataset.")
+  }
+  # Test the column itself; col holds only the column's name.
+  if (is.numeric(.data[[col]])) {
+    stop("Column is already numeric. This conversion not needed.")
+  }
+  vals <- as.character(.data[[col]])
+  as_num <- function(x) suppressWarnings(as.numeric(x))
+
+  # Rules run from lowest to highest priority: each later assignment
+  # overwrites earlier ones, so the highest-priority matching flag wins.
+  rules <- c(
+    "Comma-Separated Numeric" = ",", "Percentage" = "%", "Text" = "[A-Za-z]",
+    "Approximate Value" = "~", "Greater Than" = ">", "Less Than" = "<"
+  )
+  flag <- rep("Coerced to NA", length(vals))
+  for (label in names(rules)) {
+    flag[grepl(rules[[label]], vals)] <- label
+  }
+  flag[!is.na(as_num(vals))] <- "Numeric"
+  flag[is.na(vals)] <- "ND or NA"
+
+  new_cols <- paste0(col, c(".nochar", ".nochar_flag"))
+  .data[[new_cols[1]]] <- as_num(gsub("[<>~,%]", "", vals))
+  .data[[new_cols[2]]] <- flag
+
+  # Move the two new columns to sit immediately after the source column.
+  kept <- setdiff(names(.data), new_cols)
+  .data[append(kept, new_cols, after = match(col, kept))]
+}
diff --git a/man/ConvertSpecialChars.Rd b/man/ConvertSpecialChars.Rd
diff --git a/man/TADAdataRetrieval.Rd b/man/TADAdataRetrieval.Rd
diff --git a/tests/testthat/test-DataDiscoveryRetrieval.R b/tests/testthat/test-DataDiscoveryRetrieval.R
@@ -131,7 +131,7 @@ test_that("TADAdataRetrieval", {
 test_that("TADAdataRetrieval", {
   check_autoclean_meters_works <- TADAdataRetrieval(statecode = "UT",
                                     characteristicName = c("Ammonia", "Nitrate", "Nitrogen"),
-                                    startDate = "01-01-2021")
+                                    startDate = "2021-01-01")
   expect_true(any(check_autoclean_meters_works$ActivityDepthHeightMeasure.MeasureUnitCode!="meters"))
   })