Start with the necessary packages and seed for the vignette.
<- c("geojsonsf", "ggmap", "ggplot2", "graphics", "grDevices", "sf", "sparrpowR", "spatstat.geom", "terra", "tidyterra") loadedPackages invisible(lapply(loadedPackages, library, character.only = TRUE)) set.seed(1234) # for reproducibility
Import data from Open Data DC website.
# Washington, D.C. boundary <- "https://opendata.arcgis.com/datasets/7241f6d500b44288ad983f0942b39663_10.geojson" gis_path1 <- geojsonsf::geojson_sf(gis_path1) dc # American Community Survey 2018 Census Tracts <- "https://opendata.arcgis.com/datasets/faea4d66e7134e57bf8566197f25b3a8_0.geojson" gis_path2 <- geojsonsf::geojson_sf(gis_path2)census
We want to create a realistic boundary (i.e., polygon) of our study area. We are going to spatially clip our DC boundary by the census tracts in an attempt to remove major bodies of water where people do not reside.
<- sf::st_union(census) clipwin <- sf::st_intersection(dc, clipwin) dcc # Plot plot(sf::st_geometry(dc), main = "DC Boundary") plot(sf::st_geometry(census), main = "American Community Survey\n2018") plot(sf::st_geometry(dcc), main = "Clipped Boundary")
Our developed method, sparrpowR,
relies on the spatstat package
suite to simulate data, which assumes point locations are on a planar
(i.e., flat) surface. Our boundary is made up of geographical
coordinates on Earth (i.e., a sphere), so we need to flatten our
boundary by spatially projecting it with an appropriate spatial
reference system (SRS). For the District of Columbia, we are going to
use the World Geodetic System 1984 (WGS84) Universal Transverse Mercator
(UTM) Zone 18N EPSG:32619. We then
convert the boundary into a
owin object required by the spatstat.geom
<- sf::st_transform(sf::st_as_sf(dcc), crs = sf::st_crs(32618)) dcp <- spatstat.geom::as.owin(sf::st_geometry(dcp))dco
In this hypothetical example, we want to estimate the local power of detecting a spatial case cluster relative to control locations. Study participants that tested positive for a disease (i.e., cases) are hypothesized to be located in a circular area around the Navy Yard, an Environmental Protection Agency (EPA) Superfund Site (see the success story).
<- data.frame(lon = 326414.70444451, lat = 4304571.1539442) navy <- sf::st_as_sf(navy, coords = c("lon", "lat"), crs = sf::st_crs(32618)) spf # Plot plot(sf::st_geometry(dcp), main = "Location of Hypothetical\nDisease Cluster") plot(spf, col = "magenta", add = T, pch = 4, cex = 2) ::legend("bottom", xpd = T, y.intersp = -1.5, legend = c("Navy Yard"), col = "magenta", pch = 4, cex = 1, bty = "n")graphics
We will assume that approximately 50 cases (e.g.,
n_case = 50) were clustered around the center of the Navy
samp_case = "MVN") with more cases near the
center and fewer cases about 1 kilometers away (e.g.,
s_case = 1000).
If we were to conduct a study, where would we be sufficiently
statistically powered to detect this spatial case cluster? To answer
this question we will randomly sample approximately 950 participants
n_conrol = 950 or 5% disease prevalence) around the
Navy Yard (e.g.,
samp_control = "MVN") sampling more
participants near the center and fewer participants about 2 kilometers
s_control = 2000). These participants would
test negative for a disease (i.e., controls) were we to conduct a study.
We can then resample control locations iteratively, as if we conducted
the same study multiple times (e.g.,
sim_total = 100). We
can conclude that we are sufficiently powered to detect a spatial
clustering in areas where a statistically significant spatial case
cluster was located in at least 80% (e.g.,
p_thresh = 0.8)
of these theoretical studies. The
calculates both a one-tailed, lower tailed hypothesis (i.e., case
clustering only) and a two-tailed hypothesis (i.e., case and control
clustering). Use the
cascon argument in the
spatial_plots() function to plot either test.
<- Sys.time() # record start time start_time <- sparrpowR::spatial_power(x_case = navy[], y_case = navy[], # center of cluster sim_power x_control = navy[], y_control = navy[], # center of cluster n_case = 50, n_control = 950, # sample size of case/control samp_case = "MVN", samp_control = "MVN", # samplers s_case = 1000, s_control = 2000, # approximate size of clusters alpha = 0.05, # critical p-value sim_total = 100, # number of iterations win = dco, # study area resolution = 100, # number gridded knots on x-axis edge = "diggle", # correct for edge effects adapt = FALSE, # fixed-bandwidth h0 = NULL, # automatically select bandwidth for each iteration verbose = FALSE) # no printout <- Sys.time() # record end time end_time <- end_time - start_time # Calculate run timetime_srr
The process above took about 10.5 minutes to run. Of the 100 iterations, we simulated 40 case locations and an average 766 (SD: 11.61) control locations for an average prevalence of 5.22% (SD: 0.08%). The average bandwidth for the statistic was 0.8 kilometers (SD: 0.01). Fewer case locations and controls locations were simulated than specified in the inputs due to being placed outside of our study window (i.e., Maryland, Virginia, or in the water features around the District of Columbia). Users can modify their inputs to achieve the correct number of cases and controls in their output.
We plot the statistical power for a one-tailed, lower-tail hypothesis
cascon = FALSE) at
alpha = 0.05 using the
<- c("deepskyblue", "springgreen", "red", "navyblue") # colors for plots cols <- c(4,5) # symbols for point-locations chars <- c(0.5,0.5) # size of point-locations sizes <- 0.8 # 80% of iterations with statistically significant results p_thresh ## Data Visualization of Input and Power ::spatial_plots(input = sim_power, # use output of above simulation sparrpowRp_thresh = p_thresh, # power cut-off cascon = FALSE, # one-tail, lower tail hypothesis test (i.e., case clustering) plot_pts = TRUE, # display the points in the second plot chars = chars, # case, control sizes = sizes, # case, control cols = cols) # colors of plot
Now, lets overlay our results on top of a basemap. Here, we will use an open-source map from Stamen, that is unprojected in WGS84. We extract the rectangular box (i.e., bounding box) surrounding our polygon boundary of the District of Columbia (WGS84).
<- sf::st_bbox(sf::st_buffer(sf::st_as_sf(dc), dist = 0.015)) dcbb <- matrix(dcbb, nrow = 2) dcbbm <- ggmap::get_map(location = dcbbm, maptype = "terrain", source = "stamen")base_map
Prepare the points from the first simulation for plotting in ggplot2 suite and prepare the original boundary for plotting in ggplot2 suite.
<- sim_power$sim # extract points from first iteration sim_pts <- sf::st_as_sf(sim_pts) # convert to simple features sim_pts names(sim_pts) <- "mark" ::st_crs(sim_pts) <- sf::st_crs(32618) sf<- sf::st_transform(sim_pts, crs = sf::st_crs(4326)) # project to basemapsim_pts_wgs84
Prepare the SpatRaster from the simulation for plotting in ggplot2 suite.
<- data.frame(x = sim_power$rx, pvalprop y = sim_power$ry, z = sim_power$pval_prop_cas) # extract proportion significant <- na.omit(pvalprop) # remove NAs lrr_narm <- terra::rast(lrr_narm) # convert to SpatRaster pvalprop_raster rm(pvalprop, lrr_narm) # conserve memory ::crs(pvalprop_raster) <- terra::crs(dcp) # set output project (UTM 18N) terra<- terra::project(pvalprop_raster, dc) # unproject (WGS84) pvalprop_raster <- grDevices::colorRampPalette(colors = c(cols, cols), space="Lab")(length(terra::values(pvalprop_raster))) # set colorramprampcols
Plot local power as a continuous outcome with point-locations using the ggplot2 suite.
::ggmap(base_map) + # basemap ggmap::geom_sf(data = dcc, # original boundary, ggplot2fill = "transparent", colour = "black", inherit.aes = FALSE) + ::geom_spatraster(data = pvalprop_raster, # output SpatRaster tidyterrasize = 0, alpha = 0.5) + ::scale_fill_gradientn(colours = rampcols, na.value = NA) + # colors for SpatRaster ggplot2::geom_sf(data = sim_pts_wgs84[-1, ], # simulated point-locations ggplot2::aes(color = mark, shape = mark), ggplot2alpha = 0.8, inherit.aes = FALSE) + ::scale_color_manual(values = cols[3:4]) + # fill of point-locations ggplot2::scale_shape_manual(values = chars) + # shape of point-locations ggplot2::labs(x = "", y = "", fill = "Power", color = "", shape = "") # legend labels ggplot2
Plot local power as a categorical outcome with point-locations using the ggplot2 suite.
<- pvalprop_raster pvalprop_reclass ::values(pvalprop_reclass) <- cut(terra::values(pvalprop_raster), c(-Inf, p_thresh, Inf)) terra ::ggmap(base_map) + # basemap ggmap::geom_sf(data = dcc, # original boundary, ggplot2fill = "transparent", colour = "black", inherit.aes = FALSE) + ::geom_spatraster(data = pvalprop_reclass, # output SpatRaster tidyterrasize = 0, alpha = 0.5) + ::scale_fill_manual(values = cols[c(1,2)], ggplot2labels = c("insufficient", "sufficient"), na.translate = FALSE, na.value = NA) + # colors for SpatRaster ::labs(x = "", y = "", fill = "Power") # legend labels ggplot2