From f0b6b1c30d0bd5456f983592cae59dc75108980e Mon Sep 17 00:00:00 2001 From: cristinamullin <46969696+cristinamullin@users.noreply.github.com> Date: Fri, 14 Mar 2025 14:44:53 -0400 Subject: [PATCH 1/7] Create GeospatialDataIntegration.Rmd --- vignettes/GeospatialDataIntegration.Rmd | 647 ++++++++++++++++++++++++ 1 file changed, 647 insertions(+) create mode 100644 vignettes/GeospatialDataIntegration.Rmd diff --git a/vignettes/GeospatialDataIntegration.Rmd b/vignettes/GeospatialDataIntegration.Rmd new file mode 100644 index 000000000..3223735b7 --- /dev/null +++ b/vignettes/GeospatialDataIntegration.Rmd @@ -0,0 +1,647 @@ +--- +title: "Geospatial Data Integration" +format: html +editor: visual +date: "`r Sys.Date()`" +output: + rmarkdown::html_vignette: + toc: true + fig_caption: yes + fig_height: 8 + fig_width: 8 +vignette: > + %\VignetteEncoding{UTF-8} + %\VignetteIndexEntry{Geospatial Data Integration} + %\VignetteEngine{knitr::rmarkdown} +editor_options: + chunk_output_type: console + markdown: + wrap: 72 +authors: + - name: "Cristina Mullin" + orcid: 0000-0002-0615-6087 + affiliation: EPA + - name: "Hillary Marler" + affiliation: EPA + - name: "Marc Weber" + affiliation: EPA + - name: "Dave Blodgett" + affiliation: USGS + - name: "Michael Dumelle" + affiliation: EPA + - name: "Shelly Thawley" + affiliation: EPA + - name: "Kenny Wong" + affiliation: ORISE at EPA +--- + +```{r knitsetup, include = FALSE} +library(knitr) + +knitr::opts_chunk$set( + echo = TRUE, + warning = FALSE, + message = FALSE +) +``` + +# Overview + +Let's walk through how to create an efficient and reproducible workflow +that integrates several R Packages developed by the U.S. Environmental +Protection Agency (EPA) and the U.S. Geological Survey (USGS) to support +water quality programs (such as the Clean Water Act) and geospatial +(watershed or waterbody) level analyses. 
+ +This workflow demonstrates potential uses (beyond their original +collection purpose) for publicly available water quality data from WQP. +To start, participants will learn how to use EPA's Tools for Automated +Data Analysis (TADA) R Package to retrieve, wrangle, harmonize, quality +check, visualize and analyze WQP data from multiple organizations. + +Next, we will showcase how to bring in other web services and libraries +for easy integration of additional hydrologic and geospatial data. We +then plan to touch briefly on packages that can assist with building +statistical models. Finally, we will demonstrate an example for +analyzing water quality by Assessment Units (AUs), which are state or +tribal nation defined watershed or waterbody areas used for CWA +assessments and reporting water quality conditions to EPA and the +public. + +**Intended Audience** + +Water Quality eXchange (WQX) and Water Quality Portal (WQP) community, +Clean Water Act (CWA) community (EPA, States and Tribal Nations), water +quality and geospatial data analysts/researchers, EPA/USGS and other +federal agencies. + +**Leveraged R Packages** + +- EPA: EPATADA, StreamCatTools, spsurvey, spmodel, SSN2 + +- USGS: dataRetrieval, nhdplusTools, hydroloom + +- Fundamental geospatial packages: sf, prism, terra, leaflet and tmap + +# Install and load packages + +We will be leveraging the EPATADA R Package for WQP data retrieval, +cleaning, visualization and other steps needed to prepare for analysis. +Let's dive into Green Bay, WI! + +First, install and load the remotes package specifying the repo. This is +needed before installing EPATADA because it is only available on GitHub +(not CRAN). + +```{r install_remotes, results = 'hide'} +install.packages("remotes", + repos = "http://cran.us.r-project.org" +) +library(remotes) +``` + +Next, install and load EPATADA using the remotes package. All other +dependencies for this workflow will be downloaded automatically. 
+ +```{r devsetup, include = F, eval = T} +remotes::install_github("USEPA/EPATADA", + ref = "prerelease", + dependencies = TRUE, + force = TRUE) +library(EPATADA) +``` + +```{r usersetup, eval = F, results = 'hide'} +remotes::install_github("USEPA/EPATADA", + ref = "develop", + dependencies = TRUE, + force = TRUE) +library(EPATADA) +``` + +It's go time! Let's time our process. + +```{r startime} +# Record start time +start.time <- Sys.time() +``` + +# WQP data discovery and cleaning + +This is an abbreviated introduction to key TADA Module 1 WQP Data +Discovery and Cleaning functions. Additional functions and a more +detailed example workflow are available +[here](https://usepa.github.io/EPATADA/articles/TADAModule1.html). + +**Retrieve data from the WQP** + +In this example, we will first use EPA's How's My Waterway (HMW) +application to find an applicable Hydrologic Unit Code (HUC) for our +area of interest - the [Fox River, Green Bay, +WI](https://mywaterway.epa.gov/community/040302040405/monitoring). Next, +let's query the WQP using the identified HUC, state abbreviation, and a +date range. In this example, we'll start by pulling all data available +in the WQP for this HUC 12 in Wisconsin for the last 10 years. + +WATERSHED: City of Green Bay-Fox River (040302040405) + +```{r getdata} +# # Uncomment to query the WQP +# GreenBay_FoxRiver <- TADA_DataRetrieval( +# statecode = "WI", +# startDate = "2015-01-01", +# endDate = "2024-12-30", +# huc = c("040302040405"), +# applyautoclean = TRUE +# ) + +GreenBay_FoxRiver <- NMCWorkshopData::GreenBay_FoxRiver +``` + +**Wrangle** + +Now, let's use EPATADA functions to review, visualize, and whittle the +returned WQP data down to include only results that are applicable to +our water quality analysis and area of interest. 
+ +**Flag and remove duplicate results from a single organization** + +```{r single-org-dups} +GreenBay_FoxRiver <- TADA_FindPotentialDuplicatesSingleOrg(GreenBay_FoxRiver) + +GreenBay_FoxRiver <- dplyr::filter(GreenBay_FoxRiver, TADA.SingleOrgDup.Flag == "Unique") +``` + +**Autoclean** + +```{r autoclean} +GreenBay_FoxRiver <- TADA_AutoClean(GreenBay_FoxRiver) +``` + +**Handle censored results** + +```{r censored} +GreenBay_FoxRiver <- TADA_SimpleCensoredMethods(GreenBay_FoxRiver, nd_method = "multiplier", nd_multiplier = 0.5, od_method = "as-is", od_multiplier = "null") +``` + +**Flag and remove duplicates from multiple organizations** + +Two organizations sometimes submit the same exact data to WQP. + +```{r multi-org-dups, eval = F} +GreenBay_FoxRiver <- TADA_FindPotentialDuplicatesMultipleOrgs(GreenBay_FoxRiver) + +GreenBay_FoxRiver <- dplyr::filter(GreenBay_FoxRiver, TADA.ResultSelectedMultipleOrgs == "Y") +``` + +**Filter out any remaining irrelevant data, NA's and empty columns** + +```{r autofilter} +unique(GreenBay_FoxRiver$TADA.ResultMeasureValueDataTypes.Flag) + +sum(is.na(GreenBay_FoxRiver$TADA.ResultMeasureValue)) + +GreenBay_FoxRiver <- TADA_AutoFilter(GreenBay_FoxRiver) +# Topic for discussion: Would users like results with NA units to be dealt with in TADA_AutoFilter or in the TADA_ConvertResultUnits()? These have not yet been addressed. 
+ +unique(GreenBay_FoxRiver$TADA.ResultMeasureValueDataTypes.Flag) + +sum(is.na(GreenBay_FoxRiver$TADA.ResultMeasureValue)) +``` + +**Flag and remove QAQC samples and suspect results** + +```{r flag} +GreenBay_FoxRiver <- TADA_RunKeyFlagFunctions(GreenBay_FoxRiver, clean = TRUE) +``` + +**Flag results above and below threshold, but do not remove them** + +```{r thresholds} +GreenBay_FoxRiver <- TADA_FlagAboveThreshold(GreenBay_FoxRiver, clean = FALSE, flaggedonly = FALSE) + +GreenBay_FoxRiver <- TADA_FlagBelowThreshold(GreenBay_FoxRiver, clean = FALSE, flaggedonly = FALSE) +``` + +**Harmonize synonyms across characteristic, fraction, and speciation** + +```{r harmonize} +GreenBay_FoxRiver <- TADA_HarmonizeSynonyms(GreenBay_FoxRiver) +``` + +**Calculate Total N and Total P from various species and fractions** + +```{r TN-TP} +GreenBay_FoxRiver <- TADA_CalculateTotalNP(GreenBay_FoxRiver, daily_agg = "max") +``` + +**Review unique characteristic, fraction, and species combinations** + +```{r table} +GreenBay_FoxRiver_Counts <- TADA_FieldValuesTable(GreenBay_FoxRiver, field = "TADA.ComparableDataIdentifier") + +DT::datatable(GreenBay_FoxRiver_Counts, fillContainer = TRUE) +``` + +**Filter to focus on frequently monitored characteristics in example +data** + +```{r subset} +GreenBay_FoxRiver_Subset <- GreenBay_FoxRiver %>% + dplyr::filter(TADA.ComparableDataIdentifier %in% + c("SPECIFIC CONDUCTANCE_NA_NA_US/CM", + "PH_NA_NA_NA", + "TOTAL NITROGEN, MIXED FORMS_UNFILTERED_AS N_MG/L", + "TOTAL PHOSPHORUS, MIXED FORMS_UNFILTERED_AS P_UG/L", + "DISSOLVED OXYGEN (DO)_NA_NA_MG/L")) +``` + +**Review organizations for subset** + +```{r org-review} +# Create pie of results by organization +TADA_FieldValuesPie(GreenBay_FoxRiver_Subset, field = "OrganizationFormalName") +``` + +# Exploratory visualizations + +**Generate stats table** + +```{r stats} +GreenBay_FoxRiver_Subset_Stats <- TADA_Stats(GreenBay_FoxRiver_Subset) + +DT::datatable(GreenBay_FoxRiver_Subset_Stats, fillContainer 
= TRUE) +``` + +**Generate scatterplot** + +```{r scatterplot} +TADA_TwoCharacteristicScatterplot(GreenBay_FoxRiver_Subset, id_cols = "TADA.ComparableDataIdentifier", groups = c("TOTAL PHOSPHORUS, MIXED FORMS_UNFILTERED_AS P_UG/L", "TOTAL NITROGEN, MIXED FORMS_UNFILTERED_AS N_MG/L")) +``` + +**Generate map** + +```{r map} +TADA_OverviewMap(GreenBay_FoxRiver_Subset) + +GreenBay_FoxRiver = TADA_FlagCoordinates(GreenBay_FoxRiver_Subset, clean_outsideUSA = "change sign", clean_imprecise = FALSE) + +# CM note for TADA team discussion: Should results with NA lat/long be addressed within TADA_FlagCoordinates? For example, this df has NA lons from USGS that must be addressed before TADA_MakeSpatial can be run... +# sum(is.na(GreenBay_FoxRiver$LongitudeMeasure)) +GreenBay_FoxRiver_Subset <- GreenBay_FoxRiver_Subset[!is.na(GreenBay_FoxRiver_Subset$LongitudeMeasure),] +``` + +# Geospatial data integration + +## Make spatial + +Leverage TADA_MakeSpatial to transform a WQP dataframe into a geospatial +sf object. + +```{r} +GreenBay_FoxRiver_sf = TADA_MakeSpatial(GreenBay_FoxRiver_Subset) +``` + +Then create a unique identifier based on shared lat long values and +filter to just the 25 unique locations. + +```{r} +GreenBay_FoxRiver_sf$latlon <- paste0(GreenBay_FoxRiver_sf$TADA.LongitudeMeasure, GreenBay_FoxRiver_sf$TADA.LatitudeMeasure) + +GreenBay_FoxRiver_sf <- GreenBay_FoxRiver_sf |> + dplyr::group_by(latlon) |> + dplyr::mutate(loc_id = dplyr::cur_group_id()) + +GreenBay_FoxRiver_sf_locs <- GreenBay_FoxRiver_sf |> + dplyr::filter(!duplicated(loc_id)) +``` + +## Access NHDPlus COMIDs for sites + +We use `StreamCatTools` function `sc_get_comid` (which uses an +`nhdplusTools` web service client) to get the comid for each location. 
+ +```{r} +GreenBay_FoxRiver_sf_locs$COMID <- as.integer(strsplit(StreamCatTools::sc_get_comid(GreenBay_FoxRiver_sf_locs), split = ",")[[1]]) + +nhdplus_data <- nhdplusTools::subset_nhdplus(GreenBay_FoxRiver_sf_locs$COMID, nhdplus_data = "download") + +outlet <- dplyr::filter(nhdplus_data$NHDFlowline_Network, hydroseq == min(hydroseq)) + +nhdplusTools::plot_nhdplus(bbox = sf::st_bbox(outlet)) +plot(sf::st_transform(sf::st_geometry(GreenBay_FoxRiver_sf_locs), 3857), add = TRUE) +``` + +## dataRetrieval/NLDI, nhdplusTools, hydroloom + +Do a network navigation and get NHDPlus for our data. Note that the +network navigation only includes flowline geometry. `nhdplusTools` +subsets all of the NHDPlus. + +```{r} +all_network <- dataRetrieval::findNLDI(comid = outlet$comid, nav = "UT", distance_km = 500) + +# we could select only comids on network +if(FALSE) # don't run this one +nhdplus_data <- nhdplusTools::subset_nhdplus(comids = as.integer(all_network$UT_flowlines$nhdplus_comid), nhdplus_data = "download", flowline_only = FALSE) + +# or we could just get everything in the bbox to be sure we get non-network stuff too! +nhdplus_data <- nhdplusTools::subset_nhdplus( + bbox = sf::st_bbox(all_network$UT_flowlines), + nhdplus_data = "download", + flowline_only = FALSE) + +# see ?nhdplusTools::subset_nhdplus for lots more options! + +sapply(nhdplus_data, nrow) + +sapply(nhdplus_data, names) +``` + +## Addressing sites to the network + +There are two forms of hydrographic addresses: catchment indexing and +linear referencing. The former is established with a point in polygon +analysis. The latter is more nuanced. The following block shows how to +establish both with the data we just retrieved. + +Note that hydroloom is compatible with nhdplus and other attribute +systems. 
See [hydroloom documentation for +more!](https://doi-usgs.github.io/hydroloom/articles/hydroloom.html) + +```{r} +GreenBay_FoxRiver_sf_locs <- sf::st_join( + GreenBay_FoxRiver_sf_locs, + hydroloom::st_compatibalize(dplyr::select(nhdplus_data$CatchmentSP, featureid), + GreenBay_FoxRiver_sf_locs)) + +# NOTE that featureid and comid are the same!! +all(GreenBay_FoxRiver_sf_locs$COMID == GreenBay_FoxRiver_sf_locs$featureid) + +(linear_references <- hydroloom::index_points_to_lines( + nhdplus_data$NHDFlowline_Network, + GreenBay_FoxRiver_sf_locs)) + +GreenBay_FoxRiver_sf_locs <- dplyr::bind_cols(GreenBay_FoxRiver_sf_locs, linear_references) +``` + +We can take this one step further by indexing points to waterbodies! The +return here tells us what waterbody our locations are near or within. +For on-network waterbodies, it will also include the outlet flowline for +each waterbody. + +```{r} +all_wb <- dplyr::bind_rows(dplyr::select(nhdplus_data$NHDWaterbody, wbid = comid), + dplyr::select(nhdplus_data$NHDArea, wbid = comid)) + +(waterbody_indexes <- hydroloom::index_points_to_waterbodies( + sf::st_transform(all_wb, 5070), + GreenBay_FoxRiver_sf_locs, + flines = nhdplus_data$NHDFlowline_Network, + search_radius = units::as_units(1000, "m"))) +``` + +```{r} +par(mar=c(0,0,0,0)) +nhdplusTools::plot_nhdplus(bbox = sf::st_bbox(GreenBay_FoxRiver_sf), + cache_data = tempfile(fileext = ".rds")) +plot(sf::st_transform(all_wb[all_wb$wbid %in% waterbody_indexes$near_wbid,], + 3857), + add = TRUE, + col = "darkblue", border = NA) +plot(sf::st_transform(sf::st_geometry(GreenBay_FoxRiver_sf_locs), 3857), add = TRUE, col = "white") +``` + +There's much much more where that came from. See the pkgdown sites for +[nhdplusTools](https://doi-usgs.github.io/nhdplusTools/) and +[hydroloom](https://doi-usgs.github.io/hydroloom/index.html) for more! + +## Accessing watershed information for sites + +### Discover what StreamCat metrics we might want to use + +More to come here... 
+ +### Discover land cover of waterhsheds for sites + +We'll pull in all the NLCD categories at the local catchment level for +each location + +```{r} +library(ggplot2) +GB_FR_NLCD <- sc_nlcd(year='2019', aoi='cat', comid=GreenBay_FoxRiver_sf_locs$COMID) + + +GB_FR_Urb <- GB_FR_NLCD |> + dplyr::mutate(Pct_Urbanized = pcturbop2019cat+pcturbmd2019cat+pcturblo2019cat+pcturbhi2019cat) |> + dplyr::select(comid,Pct_Urbanized) +GB_FR_Urb +``` + +### Visualize urbanization for local catchment for each location + +```{r} +ggplot(GB_FR_Urb, aes(x=Pct_Urbanized)) + + geom_density() +``` + +### Pull in data for modeling using `StreamCatTools` + +Now we'll just demonstrate pulling in watershed data that we might use +in a modeling exercise as spatial covariates + +```{r} +ws_data <- sc_get_data(metric='fert,nsurp,nani,manure,IWI', aoi='cat,ws', comid=GreenBay_FoxRiver_sf_locs$COMID) +``` + +# Example Use Case 1: Building Statistical Models + +## Spatial Dependence + +- For spatial data, nearby observations tend to be more similar than + distant observations + +- This phenomenon is called *spatial dependence* and can be built into + statistical models + +- The benefits of incorporating spatial dependence are *significant* + and include: + + - More realistic characterization of ecological drivers + - More precise predictions at unobserved locations + +## The `spmodel` R package + +- The `spmodel` R package makes spatial models accessible via + straightforward extensions to common modeling functions like `lm()` + and `glm()` +- Spatial dependence is based on Euclidean (straight-line) distance +- Learn more at [usepa.github.io/spmodel](https://usepa.github.io/spmodel/) + +## The `SSN2` R package + +- Like `spmodel`, `SSN2` extends common modeling functions like `lm()` + and `glm()` +- Spatial dependence is based on stream network distance + (flow-connected, flow-unconnected) +- `SSN2` is an updated version of `SSN` (`SSN` has been archived) +- Learn more at [usepa.github.io/SSN2](https://usepa.github.io/SSN2/) + +# Example Use Case 2: Clean Water Act (CWA) Section 303(d) Assessments Part A + 
+TADA_MakeSpatial(), TADA_GetATTAINS(), TADA_ViewATTAINS() + +## TADA Module 2: Geospatial Functions + +Additional functions and a more detailed example workflow are available +[here](https://usepa.github.io/EPATADA/articles/TADAModule2.html). + +## CWA Assessment Process + +We do not have time to cover the full process today. Let's focus on +geospatial aspects! + +[Integrated Reporting Memoranda under CWA Sections 303(d), 305(b) and +314](https://www.epa.gov/tmdl/Integrated%20Reporting%20Guidance%20under%20CWA%20Sections%20303%28d%29%2C%20305%28b%29%20and%20314). + +## What are Assessment Units? + +Geospatial areas for analysis. Let's assign data to those units! + +CWA assessment determinations are made by assessment unit, meaning the +entire assessment unit is assessed as either meeting or not meeting +water quality standards (i.e., thresholds or criteria) for all +designated uses. + +## How are assessment units delineated? + +Assessment units are typically delineated by using watershed-oriented +collections of stream reaches, often broken down by physical features +like waterfalls, bridge crossings, or changes in land use, to analyze +water quality impairments within a specific area, ensuring data +homogeneity and spatial clarity within the assessment unit. + +- Existing Assessment Units are available from ATTAINS geospatial + services + +## Associating ATTAINS Assessment Units with WQP Monitoring Locations + +One of the first steps in the CWA assessment process is to define +Assessment Units and associate data with them. A major source for water +quality data is the WQP. 
+ +## Associating ATTAINS Assessment Units with WQP Monitoring Locations + +- Assessment Units: state or tribal waterbody geospatial features + - These may be lines, areas or points +- Water Quality Portal Monitoring Locations + - These are points + +## TADA_GetATTAINS() + +- Automates matching of WQP monitoring locations with ATTAINS + assessment units that fall within (intersect) the same NHDPlus + catchment + ([details](https://usepa.github.io/EPATADA/articles/TADAModule2.html)) +- The function uses high resolution NHDPlus catchments by default + because 80% of state submitted assessment units in ATTAINS were + developed based on high res NHD; users can select med-res if + applicable to their use case + +```{r} +WQP_with_ATTAINSonly <- TADA_GetATTAINS(GreenBay_FoxRiver_Subset, fill_catchments = FALSE, return_sf = TRUE) +``` + +## TADA_ViewATTAINS() + +- Allows for viewing the + +## Challenges with Automated Approach + +- Certain NHDPlus high res catchments overlap multiple ATTAINS + assessment units (state submitted hydrography) which means the sites + are assigned to both AUs in the current functions. Another challenge + is that the WQP sites are not always accurate (imprecise + coordinates). WQP location metadata may also be helpful for + matching/QAQC'ing waterbody names with ATTAINS waterbody names + instead of relying solely on the lat/long and geospatial/mapping + information. Users must manually review associations for accuracy. + +## Using all available data from the WQP + +Finally, some waterbodies have data available in the WQP or from other +sources, but there are no existing Assessment Units in ATTAINS for them. +In the next section, we will share a way to create AU's using NHDPlus +high resolution catchments. 
+ +# Example Use Case 2: Clean Water Act (CWA) Section 303(d) Assessments Part B + +nhdplusTools, TADA::fetchNHD(), TADA_GetATTAINS(), TADA_ViewATTAINS() + +## Creating new AUs to assess additional areas (leveraging USGS's nhdplusTools and TADA geospatial functions) + +TADA has included a way to do this using the TADA_GetATTAINS() +fill_catchments argument. This is included for exploratory +purposes only. In theory, states and tribal nations could use the high +res catchments as new assessment unit polygons to assess additional +areas where there is WQP data but no Assessment Unit yet in ATTAINS, but +that process is outside of TADA. + +## Creating new AUs to assess additional areas + +For WQP monitoring sites that DO NOT overlap an existing ATTAINS feature +(neither ATTAINS NHDPlus high res catchment snapshot nor state submitted +points/lines/polys), there is nothing to use from ATTAINS because these +are areas where there is WQP data but no ATTAINS Assessment Unit yet. + +## Creating new AUs to assess additional areas + +For these, we implemented a solution using nhdplusTools to pull in +either NHDPlus high res or med res catchments (user can choose, but high +res is the default) and match those with the WQP sites & create new IDs +(essentially creating new AUs that are the catchments that intersect +these WQP sites). + +```{r} +WQP_withATTAINSandNHDPluscatchments <- TADA_GetATTAINS(GreenBay_FoxRiver_Subset, fill_catchments = TRUE, return_sf = TRUE) +``` + +# Visualizing Water Quality Issues + +- Placeholder: Now that we have data assigned to watershed/waterbodies + (Assessment Units), let's showcase how to visualize water quality + issues for a few characteristics of interest on a map and in figures + using available packages. + +# Conclusion + +```{r} +end.time <- Sys.time() + +end.time - start.time +``` + +# Contribute + +Note: TADA is still under development. 
New functionality is added +weekly, and sometimes we need to make bug fixes in response to tester +and user feedback. We appreciate your feedback, patience, and interest +in these helpful tools. + +If you are interested in contributing to TADA development, more +information is available at: + + + +We welcome collaboration with external partners. + +Contribute to EPATADA in a way that helps elevate work you have already +done, broadens the user base of the package, or improves the resource +for all! + +**Thank you to our workshop contributors!** + +- EPA: Cristina Mullin (mullin.cristina\@epa.gov), Marc Weber, Hillary + Marler, Kenny Wong, Michael Dumelle, Shelly Thawley + +- USGS: Dave Blodgett From d8f5a3cade056b59bd0921075756ad47ce3ecd4d Mon Sep 17 00:00:00 2001 From: cristinamullin <46969696+cristinamullin@users.noreply.github.com> Date: Fri, 14 Mar 2025 16:27:24 -0400 Subject: [PATCH 2/7] add depends - StreamCatTools --- DESCRIPTION | 3 ++- vignettes/GeospatialDataIntegration.Rmd | 10 +++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 3a64720ce..9544d1c3c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -99,7 +99,8 @@ Suggests: yaml, remotes Remotes: - DOI-USGS/dataRetrieval@develop + DOI-USGS/dataRetrieval@develop, + USEPA/StreamCatTools VignetteBuilder: knitr, rmarkdown Language: en-US Config/testthat/edition: 3 diff --git a/vignettes/GeospatialDataIntegration.Rmd b/vignettes/GeospatialDataIntegration.Rmd index 3223735b7..1b2dc7246 100644 --- a/vignettes/GeospatialDataIntegration.Rmd +++ b/vignettes/GeospatialDataIntegration.Rmd @@ -105,7 +105,7 @@ dependencies for this workflow will be downloaded automatically. 
```{r devsetup, include = F, eval = T} remotes::install_github("USEPA/EPATADA", - ref = "prerelease", + ref = "geospatial-vignette", dependencies = TRUE, force = TRUE) library(EPATADA) @@ -428,8 +428,7 @@ We'll pull in all the NLCD categories at the local catchment level for each location ```{r} -library(ggplot2) -GB_FR_NLCD <- sc_nlcd(year='2019', aoi='cat', comid=GreenBay_FoxRiver_sf_locs$COMID) +GB_FR_NLCD <- StreamCatTools::sc_nlcd(year='2019', aoi='cat', comid=GreenBay_FoxRiver_sf_locs$COMID) GB_FR_Urb <- GB_FR_NLCD |> @@ -441,7 +440,7 @@ GB_FR_Urb ### Visualize urbanization for local catchment for each location ```{r} -ggplot(GB_FR_Urb, aes(x=Pct_Urbanized)) + +ggplot2::ggplot(GB_FR_Urb, aes(x=Pct_Urbanized)) + geom_density() ``` @@ -603,7 +602,8 @@ res is the default) and match those with the WQP sites & create new IDs these WQP sites). ```{r} -WQP_withATTAINSandNHDPluscatchments <- TADA_GetATTAINS(GreenBay_FoxRiver_Subset, fill_catchments = TRUE, return_sf = TRUE) +# # need to troubleshoot why this is not working +# WQP_withATTAINSandNHDPluscatchments <- TADA_GetATTAINS(GreenBay_FoxRiver_Subset, fill_catchments = TRUE, return_sf = TRUE) ``` # Visualizing Water Quality Issues From 8da4e079e86090cf81d9dc708596f091a7e20072 Mon Sep 17 00:00:00 2001 From: Weber Date: Fri, 21 Mar 2025 15:07:49 -0700 Subject: [PATCH 3/7] A couple suggested updates --- vignettes/GeospatialDataIntegration.Rmd | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/vignettes/GeospatialDataIntegration.Rmd b/vignettes/GeospatialDataIntegration.Rmd index 1b2dc7246..2b6d07176 100644 --- a/vignettes/GeospatialDataIntegration.Rmd +++ b/vignettes/GeospatialDataIntegration.Rmd @@ -146,16 +146,16 @@ in the WQP for this HUC 12 in Wisconsin for the last 5 years. 
WATERSHED: City of Green Bay-Fox River (040302040405) ```{r getdata} -# # Uncomment to query the WQP -# GreenBay_FoxRiver <- TADA_DataRetrieval( -# statecode = "WI", -# startDate = "2015-01-01", -# endDate = "2024-12-30", -# huc = c("040302040405"), -# applyautoclean = TRUE -# ) +# Uncomment to query the WQP +GreenBay_FoxRiver <- TADA_DataRetrieval( + statecode = "WI", + startDate = "2015-01-01", + endDate = "2024-12-30", + huc = c("040302040405"), + applyautoclean = TRUE +) -GreenBay_FoxRiver <- NMCWorkshopData::GreenBay_FoxRiver +# GreenBay_FoxRiver <- NMCWorkshopData::GreenBay_FoxRiver ``` **Wrangle** @@ -417,10 +417,15 @@ There's much much more where that came from. See the pkgdown sites for [hydroloom](https://doi-usgs.github.io/hydroloom/index.html) for more! ## Accessing watershed information for sites +We can access watershed information for each unique site location, getting both the landscape data for local catchment or the full upstream watershed for each particular site using [StreamCatTools](https://usepa.github.io/StreamCatTools/index.html) ### Discover what StreamCat metrics we might want to use -More to come here... 
+```{r} +metrics <- sc_get_params(param='name') +print(paste0('A selection of available StreamCat metrics include: ',paste(metrics[1:10],collapse = ', '))) +``` + ### Discover land cover of waterhsheds for sites From 0bd74bed289d1b97accbadc5abaa4327d593891e Mon Sep 17 00:00:00 2001 From: hillarymarler Date: Mon, 31 Mar 2025 09:26:10 -0400 Subject: [PATCH 4/7] Update GeospatialDataIntegration.Rmd --- vignettes/GeospatialDataIntegration.Rmd | 51 ++++++++++++++++++++----- 1 file changed, 41 insertions(+), 10 deletions(-) diff --git a/vignettes/GeospatialDataIntegration.Rmd b/vignettes/GeospatialDataIntegration.Rmd index 2b6d07176..4c0d1d1e2 100644 --- a/vignettes/GeospatialDataIntegration.Rmd +++ b/vignettes/GeospatialDataIntegration.Rmd @@ -152,7 +152,8 @@ GreenBay_FoxRiver <- TADA_DataRetrieval( startDate = "2015-01-01", endDate = "2024-12-30", huc = c("040302040405"), - applyautoclean = TRUE + applyautoclean = TRUE, + ask = FALSE ) # GreenBay_FoxRiver <- NMCWorkshopData::GreenBay_FoxRiver @@ -164,33 +165,60 @@ Now, let's use EPATADA functions to review, visualize, and whittle the returned WQP data down to include only results that are applicable to our water quality analysis and area of interest. +**Autoclean** + +In the default arguments for TADA_DataRetrieval, applyautoclean = TRUE. +This runs TADA_AutoClean on the newly retrieved data frame. 
+TADA_AutoClean is a powerful function which performs a variety of tasks +including: (1) creating new "TADA" prefixed columns and capitalizing +their contents to reduce case sensitivity issues, (2) converting special +characters in value columns, (3) converting latitude and longitude values +to numeric, (4) replacing "meters" with "m", (5) replacing deprecated +characteristic names with current WQX names, (6) harmonizing result and +detection limit units to WQX, TADA or user supplied target units, (7) +converting depths to meters, and (8) creating the column +TADA.ComparableDataIdentifier by concatenating characteristic name, +result sample fraction, method speciation, and result measure unit. + **Flag and remove duplicate results from a single organization** +We can identify data records uploaded by the same organization with the +same date, time, monitoring location, activity type, characteristic +name, fraction, taxonomic name, depth columns, and result value and +flag them as potential duplicates. The data user must determine if the +data records are unique or represent overlap that could cause issues in +analysis. For this example, we will retain only results flagged as +"Unique". + ```{r single-org-dups} +# find duplicate results submitted by single org GreenBay_FoxRiver <- TADA_FindPotentialDuplicatesSingleOrg(GreenBay_FoxRiver) +# retain unique flagged results GreenBay_FoxRiver <- dplyr::filter(GreenBay_FoxRiver, TADA.SingleOrgDup.Flag == "Unique") ``` -**Autoclean** +**Censored results** -```{r autoclean} -GreenBay_FoxRiver <- TADA_AutoClean(GreenBay_FoxRiver) -``` - -**Handle censored results** +TADA provides some simple methods for dealing with censored results, +such as multiplying the detection limit by a user supplied value +or leaving the result as is. 
```{r censored} +# substitute nondetects with 0.5 detection limit, leave overdetects as is GreenBay_FoxRiver <- TADA_SimpleCensoredMethods(GreenBay_FoxRiver, nd_method = "multiplier", nd_multiplier = 0.5, od_method = "as-is", od_multiplier = "null") ``` **Flag and remove duplicates from multiple organizations** -Two organizations sometimes submit the same exact data to WQP. +Two organizations sometimes submit the same exact data to WQP. Filtering +out these duplicates can prevent issues in analysis. ```{r multi-org-dups, eval = F} +# find potential dups multiple orgs GreenBay_FoxRiver <- TADA_FindPotentialDuplicatesMultipleOrgs(GreenBay_FoxRiver) +# filter out GreenBay_FoxRiver <- dplyr::filter(GreenBay_FoxRiver, TADA.ResultSelectedMultipleOrgs == "Y") ``` @@ -417,7 +445,11 @@ There's much much more where that came from. See the pkgdown sites for [hydroloom](https://doi-usgs.github.io/hydroloom/index.html) for more! ## Accessing watershed information for sites -We can access watershed information for each unique site location, getting both the landscape data for local catchment or the full upstream watershed for each particular site using [StreamCatTools](https://usepa.github.io/StreamCatTools/index.html) + +We can access watershed information for each unique site location, +getting both the landscape data for local catchment and the full upstream +watershed for each particular site using +[StreamCatTools](https://usepa.github.io/StreamCatTools/index.html) ### Discover what StreamCat metrics we might want to use @@ -426,7 +458,6 @@ metrics <- sc_get_params(param='name') print(paste0('A selection of available StreamCat metrics include: ',paste(metrics[1:10],collapse = ', '))) ``` - ### Discover land cover of waterhsheds for sites We'll pull in all the NLCD categories at the local catchment level for From f842ddaeb44cd5b6634a2012a8695c53fd9bbad1 Mon Sep 17 00:00:00 2001 From: hillarymarler Date: Mon, 7 Apr 2025 10:52:29 -0400 Subject: [PATCH 5/7] Update 
GeospatialDataIntegration.Rmd add install streamcatools to vignette --- vignettes/GeospatialDataIntegration.Rmd | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/vignettes/GeospatialDataIntegration.Rmd b/vignettes/GeospatialDataIntegration.Rmd index 4c0d1d1e2..e09140c28 100644 --- a/vignettes/GeospatialDataIntegration.Rmd +++ b/vignettes/GeospatialDataIntegration.Rmd @@ -101,7 +101,9 @@ library(remotes) ``` Next, install and load EPATADA using the remotes package. All other -dependencies for this workflow will be downloaded automatically. +dependencies for this workflow will be downloaded automatically. We will +also need to install StreamCatTools for some steps later in the +workflow. ```{r devsetup, include = F, eval = T} remotes::install_github("USEPA/EPATADA", @@ -116,7 +118,16 @@ remotes::install_github("USEPA/EPATADA", ref = "develop", dependencies = TRUE, force = TRUE) + +remotes::install_github("USEPA/StreamCatTools", + ref = "master", + dependencies = TRUE, + force = TRUE) + library(EPATADA) +library(StreamCatTools) + + ``` It's go time! Let's time our process. 
From 223ca6ffee8643f37d4f668449118227c975478e Mon Sep 17 00:00:00 2001 From: cristinamullin <46969696+cristinamullin@users.noreply.github.com> Date: Mon, 5 May 2025 17:08:11 -0400 Subject: [PATCH 6/7] finish draft --- DESCRIPTION | 4 +- vignettes/GeospatialDataIntegration.Rmd | 75 ++++++++++++------------- 2 files changed, 37 insertions(+), 42 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 0b55c722b..b05039471 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -80,7 +80,7 @@ Imports: DT, dataRetrieval, dbscan, - janitor, + janitor Depends: R (>= 3.5.0) Suggests: @@ -103,7 +103,7 @@ Suggests: remotes Remotes: DOI-USGS/dataRetrieval@develop, - USEPA/StreamCatTools + USEPA/StreamCatTools@master VignetteBuilder: knitr, rmarkdown Language: en-US Config/testthat/edition: 3 diff --git a/vignettes/GeospatialDataIntegration.Rmd b/vignettes/GeospatialDataIntegration.Rmd index e09140c28..7d178969f 100644 --- a/vignettes/GeospatialDataIntegration.Rmd +++ b/vignettes/GeospatialDataIntegration.Rmd @@ -126,8 +126,6 @@ remotes::install_github("USEPA/StreamCatTools", library(EPATADA) library(StreamCatTools) - - ``` It's go time! Let's time our process. @@ -241,8 +239,11 @@ unique(GreenBay_FoxRiver$TADA.ResultMeasureValueDataTypes.Flag) sum(is.na(GreenBay_FoxRiver$TADA.ResultMeasureValue)) GreenBay_FoxRiver <- TADA_AutoFilter(GreenBay_FoxRiver) -# Topic for discussion: Would users like results with NA units to be dealt with in TADA_AutoFilter or in the TADA_ConvertResultUnits()? These have not yet been addressed. +``` + +Check to make sure there are no more NA's in TADA.ResultMeasureValue. 
+```{r recheck} unique(GreenBay_FoxRiver$TADA.ResultMeasureValueDataTypes.Flag) sum(is.na(GreenBay_FoxRiver$TADA.ResultMeasureValue)) @@ -282,8 +283,7 @@ GreenBay_FoxRiver_Counts <- TADA_FieldValuesTable(GreenBay_FoxRiver, field = "TA DT::datatable(GreenBay_FoxRiver_Counts, fillContainer = TRUE) ``` -**Filter to focus on frequently monitored characteristics in example -data** +Filter to focus on frequently monitored characteristics in example data ```{r subset} GreenBay_FoxRiver_Subset <- GreenBay_FoxRiver %>% @@ -320,14 +320,24 @@ TADA_TwoCharacteristicScatterplot(GreenBay_FoxRiver_Subset, id_cols = "TADA.Comp **Generate map** -```{r map} +```{r TADA Overview Map} TADA_OverviewMap(GreenBay_FoxRiver_Subset) +``` + +**Clean up coordinate issues** +```{r Coordinate issues} +# Change coordinate sign if appropriate GreenBay_FoxRiver = TADA_FlagCoordinates(GreenBay_FoxRiver_Subset, clean_outsideUSA = "change sign", clean_imprecise = FALSE) -# CM note for TADA team discussion: Should results with NA lat/long be addressed within TADA_FlagCoordinates? For example, this df has NA lons from USGS that must be addressed before TADA_MakeSpatial can be run... -# sum(is.na(GreenBay_FoxRiver$LongitudeMeasure)) +# This df has NA lons from USGS that must be addressed before TADA_MakeSpatial can be run... +sum(is.na(GreenBay_FoxRiver_Subset$LongitudeMeasure)) + +# Remove rows with NA lons from df GreenBay_FoxRiver_Subset <- GreenBay_FoxRiver_Subset[!is.na(GreenBay_FoxRiver_Subset$LongitudeMeasure),] + +# Recheck +sum(is.na(GreenBay_FoxRiver_Subset$LongitudeMeasure)) ``` # Geospatial data integration @@ -341,8 +351,7 @@ sf object. GreenBay_FoxRiver_sf = TADA_MakeSpatial(GreenBay_FoxRiver_Subset) ``` -Then create a unique identifier based on shared lat long values and -filter to just the 25 unique locations. +Then create a unique identifier based on shared lat long values and filter to just the 25 unique locations. 
```{r} GreenBay_FoxRiver_sf$latlon <- paste0(GreenBay_FoxRiver_sf$TADA.LongitudeMeasure, GreenBay_FoxRiver_sf$TADA.LatitudeMeasure) @@ -465,11 +474,11 @@ watershed for each particular site using ### Discover what StreamCat metrics we might want to use ```{r} -metrics <- sc_get_params(param='name') +metrics <- StreamCatTools::sc_get_params(param = 'metric_names') print(paste0('A selection of available StreamCat metrics include: ',paste(metrics[1:10],collapse = ', '))) ``` -### Discover land cover of waterhsheds for sites +### Discover land cover of watersheds for sites We'll pull in all the NLCD categories at the local catchment level for each location @@ -487,8 +496,8 @@ GB_FR_Urb ### Visualize urbanization for local catchment for each location ```{r} -ggplot2::ggplot(GB_FR_Urb, aes(x=Pct_Urbanized)) + - geom_density() +ggplot2::ggplot(GB_FR_Urb, ggplot2::aes(x=Pct_Urbanized)) + + ggplot2::geom_density() ``` ### Pull in data for modeling using `StreamCatTools` @@ -497,10 +506,10 @@ Now we'll just demonstrate pulling in watershed data that we might use in a modeling exercise as spatial covariates ```{r} -ws_data <- sc_get_data(metric='fert,nsurp,nani,manure,IWI', aoi='cat,ws', comid=GreenBay_FoxRiver_sf_locs$COMID) +ws_data <- StreamCatTools::sc_get_data(metric='fert,nsurp,nani,manure,IWI', aoi='cat,ws', comid=GreenBay_FoxRiver_sf_locs$COMID) ``` -# Example Use Case 1: Building Statistical Models +# Building Statistical Models ## Spatial Dependence @@ -533,10 +542,6 @@ ws_data <- sc_get_data(metric='fert,nsurp,nani,manure,IWI', aoi='cat,ws', comid= - `SSN2` is an updated version of `SSN` (`SSN` has been archived) - Learn more at -# Example Use Case 2: Clean Water Act (CWA) Section 303(d) Assessments Part A - -TADA_MakeSpatial(), TADA_GetATTAINS(), TADA_ViewATTAINS() - ## TADA Module 2: Geospatial Functions Additional functions and a more detailed example workflow is available @@ -583,7 +588,7 @@ quality data is the WQP. 
- Water Quality Portal Monitoring Locations - These are points -## TADA_GetATTAINS() +## TADA_GetATTAINS() Part A - Automates matching of WQP monitoring locations with ATTAINS assessment units that fall within (intersect) the same NHDPlus @@ -596,11 +601,9 @@ quality data is the WQP. ```{r} WQP_with_ATTAINSonly <- TADA_GetATTAINS(GreenBay_FoxRiver_Subset, fill_catchments = FALSE, return_sf = TRUE) -``` - -## TADA_ViewATTAINS() -- Allows for viewing the +TADA_ViewATTAINS(WQP_with_ATTAINSonly) +``` ## Challenges with Automated Approach @@ -617,16 +620,14 @@ WQP_with_ATTAINSonly <- TADA_GetATTAINS(GreenBay_FoxRiver_Subset, fill_catchment Finally, some waterbodies have data available in the WQP or from other sources, but there are no existing Assessment Units in ATTAINS for them. -In the next section, we will share a way to create AU's using NHDPlus -high resolution catchments. -# Example Use Case 2: Clean Water Act (CWA) Section 303(d) Assessments Part B +# TADA_GetATTAINS() Part B -nhdplusTools, TADA::fetchNHD(), TADA_GetATTAINS(), TADA_ViewATTAINS() +See: nhdplusTools, EPATADA::fetchNHD(), TADA_GetATTAINS(), TADA_ViewATTAINS() ## Creating new AUs to assess additional areas (leveraging USGS's nhdplusTools and TADA geospatial functions) -TADA has included a way to do this using TADA_GetATTAINS() +TADA has included a way to explore this using TADA_GetATTAINS() fill_catchments function input. This is included for exploratory purposes only. In theory, states and tribal nations could use the high res catchments as new assessment unit polygons to assess additional @@ -649,18 +650,12 @@ res is the default) and match those with the WQP sites & create new IDs these WQP sites). 
```{r} -# # need to troubleshoot why this is not working -# WQP_withATTAINSandNHDPluscatchments <- TADA_GetATTAINS(GreenBay_FoxRiver_Subset, fill_catchments = TRUE, return_sf = TRUE) -``` +WQP_withATTAINSandNHDPluscatchments <- TADA_GetATTAINS(GreenBay_FoxRiver_Subset, fill_catchments = TRUE, return_sf = TRUE) -# Visualizing Water Quality Issues - -- Placeholder: Now that we have data assigned to watershed/waterbodies - (Assessment Units). Let's showcase how to visualize water quality - issues for a few characteristics of interest on a map and in figures - using available packages. +TADA_ViewATTAINS(WQP_withATTAINSandNHDPluscatchments) +``` -# Conclusion +# The end! ```{r} end.time <- Sys.time() From 9ecc1ed282ce136e051ac9afc12c25ebe2bdfdf2 Mon Sep 17 00:00:00 2001 From: cristinamullin <46969696+cristinamullin@users.noreply.github.com> Date: Mon, 5 May 2025 17:20:18 -0400 Subject: [PATCH 7/7] add StreamCatTools to suggests --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index b05039471..5084ad873 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -84,6 +84,7 @@ Imports: Depends: R (>= 3.5.0) Suggests: + StreamCatTools, lwgeom, RColorBrewer, rmarkdown,