USEPA · mthawley · Jan 26, 2022 · Jan 24, 2022 · Jan 26, 2022
diff --git a/R/TADA_AutoClean.R b/R/TADA_AutoClean.R
@@ -1,37 +1,73 @@
-
-library(dplyr)
-
-#' Title
+#' TADA Auto Clean
+#'
+#' Removes exact duplicate rows from a TADA dataset, then either removes or
+#' flags type 2 duplicates (rows identical in every field other than the
+#' identifier, organization, project and comment fields listed in the function)
+#' and continuous data (ResultDetectionConditionText "Reported in Raw Data (attached)").
 #'
-#' @param .data Full physical/chemical results dataset from WQP
+#' @param .data TADA dataset
+#' @param FlaggedData Boolean argument indicating whether output will have columns appended to flag data or the output will be a cleaned dataset.
 #'
-#' @return Full physical/chemical results dataset with duplicate records and continuous data
+#' @return If FlaggedData is FALSE, the dataset with duplicate records and
+#' continuous data removed. If FlaggedData is TRUE, the dataset without exact
+#' duplicates plus two columns: Duplicate.2 (1 for type 2 duplicates, otherwise
+#' 0) and ContDataFlag (1 for continuous data, otherwise NA). Row order is kept.
 #' @export
 #'
 #' @examples WQP.QCed <- TADAautoClean(WQP.raw)
 
-TADAautoClean <- function(.data){
+TADAautoClean <- function(.data, FlaggedData = TRUE){
 
-  excluded.columns <- c("ActivityIdentifier", "ActivityConductingOrganizationText",
-                        "OrganizationFormalName", "OrganizationIdentifier",
-                        "ProjectIdentifier", "ResultCommentText", "ActivityCommentText")
+  if(!TADAprofileCheck(.data)) {
+    stop("The dataframe does not contain the required fields to use TADA. Use either the full physical/chemical profile downloaded from WQP or download the TADA profile template available on the EPA TADA webpage.")
+  }
+  # Checked before any work so NA or non-logical input gets this message
+  if(!(isTRUE(FlaggedData) || isFALSE(FlaggedData))) {
+    stop("FlaggedData argument must be Boolean (TRUE or FALSE)")
+  }
 
-  WQP <- .data %>% 
-    # Remove type 1 duplicate data
-    distinct() %>%
-    distinct(across(-all_of(excluded.columns)), .keep_all = TRUE) %>%
-    # Remove continuous data
-    filter(ResultDetectionConditionText != "Reported in Raw Data (attached)" |
-             is.na(ResultDetectionConditionText))
+  # Fields allowed to differ between records that are otherwise duplicates
+  excluded.fields <- c("ActivityIdentifier", "ActivityConductingOrganizationText",
+                       "OrganizationFormalName", "OrganizationIdentifier",
+                       "ProjectIdentifier", "ResultCommentText", "ActivityCommentText")
+  dupe.fields <- setdiff(colnames(.data), excluded.fields)
 
+  # Remove type 1 duplicate (rows identical in every field)
+  unique.data <- .data[!duplicated(.data),]
+  # Continuous data rows; %in% treats NA condition text as not continuous.
+  # Base subsetting is used instead of an unqualified filter(), which resolves
+  # to stats::filter when dplyr is not attached.
+  is.cont <- unique.data$ResultDetectionConditionText %in%
+    "Reported in Raw Data (attached)"
+
+  if(!FlaggedData) {
+    # Remove type 2 duplicate (first record of each group is kept) and
+    # continuous data
+    clean.data <- unique.data[!duplicated(unique.data[dupe.fields]) & !is.cont,]
+    rownames(clean.data) <- NULL
+    return(clean.data)
+  }
+
+  # Flag type 2 duplicate: every record in a duplicated group is flagged
+  unique.data$Duplicate.2 <- as.integer(duplicated(unique.data[dupe.fields]) |
+                                          duplicated(unique.data[dupe.fields],
+                                                     fromLast = TRUE))
+  # Flag continuous data with 1 and leave NA elsewhere; building the column as
+  # a vector also works when no record is continuous
+  unique.data$ContDataFlag <- ifelse(is.cont, 1, NA_real_)
+  rownames(unique.data) <- NULL
+
+  unique.data
 }
 
 
-#' Title
+#' TADA Remove Empty Columns
+#' 
+#' Removes every column of the dataset that contains only NA values.
 #'
-#' @param .data Full physical/chemical results dataset from WQP
+#' @param .data TADA dataset
 #'
-#' @return Full physical/chemical results dataset without columns containing only NA values
+#' @return Full TADA dataset without columns containing only NA values
 #' @export
 #'
 #' @examples WQP.QCed <- TADAremoveEmptyColumns(WQP.raw)

diff --git a/R/TADA_ProfileCheck.R b/R/TADA_ProfileCheck.R
@@ -0,0 +1,61 @@
+#' TADA Profile Check
+#' 
+#' This function checks if the column names in a dataframe include the TADA
+#' profile fields. It is used at the beginning of TADA functions to ensure the
+#' input data frame is suitable (i.e. is either the full physical/chemical
+#' results profile downloaded from WQP or the TADA profile template downloaded
+#' from the EPA TADA webpage.)
+#'
+#' @param .data A dataframe
+#'
+#' @return Boolean result indicating whether or not the input dataframe contains
+#' all of the TADA profile fields.
+#'
+#' @examples TADAprofileCheck(df)
+
+TADAprofileCheck <- function(.data){
+
+  # inherits() yields one TRUE/FALSE even when the object carries several classes
+  # (e.g. a data.frame subclass), whereas class(x) != "data.frame" is a vector
+  # and is not a valid if() condition in that case
+  if(!inherits(.data, "data.frame")) {
+    stop("Input object must be of class 'data.frame'")
+  }
+
+  TADA.fields <- c("OrganizationIdentifier", "OrganizationFormalName",
+                   "ActivityIdentifier", "ActivityTypeCode",
+                   "ActivityMediaName", "ActivityMediaSubdivisionName", 
+                   "ActivityStartDate", "ActivityStartTime.Time", 
+                   "ActivityStartTime.TimeZoneCode", "ActivityEndDate", 
+                   "ActivityEndTime.Time", "ActivityEndTime.TimeZoneCode", 
+                   "ActivityDepthHeightMeasure.MeasureValue", "ActivityDepthHeightMeasure.MeasureUnitCode", 
+                   "ActivityDepthAltitudeReferencePointText", "ActivityTopDepthHeightMeasure.MeasureValue", 
+                   "ActivityTopDepthHeightMeasure.MeasureUnitCode", "ActivityBottomDepthHeightMeasure.MeasureValue",
+                   "ActivityBottomDepthHeightMeasure.MeasureUnitCode", "ProjectIdentifier", 
+                   "ActivityConductingOrganizationText", "MonitoringLocationIdentifier", 
+                   "ActivityCommentText", "SampleAquifer", 
+                   "HydrologicCondition", "HydrologicEvent", 
+                   "SampleCollectionMethod.MethodIdentifier", "SampleCollectionMethod.MethodIdentifierContext",
+                   "SampleCollectionMethod.MethodName", "SampleCollectionEquipmentName",
+                   "ResultDetectionConditionText", "CharacteristicName", 
+                   "ResultSampleFractionText", "ResultMeasureValue", 
+                   "ResultMeasure.MeasureUnitCode", "MeasureQualifierCode", 
+                   "ResultStatusIdentifier", "StatisticalBaseCode", 
+                   "ResultValueTypeName", "ResultWeightBasisText", 
+                   "ResultTimeBasisText", "ResultTemperatureBasisText", 
+                   "ResultParticleSizeBasisText", "PrecisionValue",
+                   "ResultCommentText", "USGSPCode", 
+                   "ResultDepthHeightMeasure.MeasureValue", "ResultDepthHeightMeasure.MeasureUnitCode",
+                   "ResultDepthAltitudeReferencePointText", "SubjectTaxonomicName",
+                   "SampleTissueAnatomyName", "ResultAnalyticalMethod.MethodIdentifier",
+                   "ResultAnalyticalMethod.MethodIdentifierContext", "ResultAnalyticalMethod.MethodName",
+                   "MethodDescriptionText", "LaboratoryName",
+                   "AnalysisStartDate", "ResultLaboratoryCommentText",
+                   "DetectionQuantitationLimitTypeName", "DetectionQuantitationLimitMeasure.MeasureValue",
+                   "DetectionQuantitationLimitMeasure.MeasureUnitCode", "PreparationStartDate",
+                   "ProviderName", "ActivityStartDateTime", "ActivityEndDateTime")
+
+  # all() over %in% is already a single TRUE/FALSE, so it is returned directly
+  all(TADA.fields %in% colnames(.data))
+
+}