Description

The New York State Sewersheds shapefile is a GIS layer that contains estimates for every municipal sewershed in New York States as of November 2022 with estimated populations higher than 100 persons. For data on sewersheds with fewer that 100 persons, please contact the author. Each sewershed is a georeferenced polygon for the estimated boundary of all parcels of land that are connected to the public wastewater treatment plant and the sewers flow into the common point, which is the treatment plant’s location. Please include the citation below when using these data.

Link to full dataset

Citation

Hill DT, Larsen DA (2023) Using geographic information systems to link population estimates to wastewater surveillance data in New York State, USA. PLOS Glob Public Health 3(1): e0001062. https://doi.org/10.1371/journal.pgph.0001062

New York State Sewersheds

Map of sewersheds across NY

Population estimation code

Packages

library(tidyverse)
library(sf)
library(tidycensus)
library(dplyr)
library(readxl)
library(stringi)
library(readr)
library(tidyr)
library(tigris)
library(tmap)
library(data.table)
library(stringr)
library(knitr)

Methods

To get estimates for 2018 and 2020 population by sewershed, we use 2010 block population values from the decennial census overlayed with the sewersheds and assign an apportioned value for each block partially in the sewershed.

Then we obtain a population change estimate using 2018 American Community Survey (ACS) data at the block group level and 2010 ACS data. We do this because there are no recent block estimates for population and we want our values to be as accurate as possible. Comparison of block group to block estimates for sewersheds were roughly the same with between 75 and 80% of each other. We chose to use the population change value for the ACS data applied to the 2010 block data to keep our esimates as close to the granular block data as possible.

Additional covariates for modelling are brought in as percentages of the total popualtion per sewershed using 2018 ACS block group data. The only variable being used from 2010 is the estimated percentage of the population living in nursing homes.

Procedure steps

  1. Define objects for each function (county for sewersheds, block shapefile, block group shapefile, sewershed shapefile)

  2. Create functions for each step

  3. Create shapefile with 2010 block census data

  4. Change coordinate system of sewershed shapefile to match block shapefile

  5. Calculated sewershed values using dissolve and aggregation of overlayed shapefiles

  6. Merge population data to sewershed shapefile

  7. Create shapefile with 2010 and 2018 block group census data

  8. Repeat steps 4 - 6

  9. Save new sewershed file with population data

Functions

These are the functions used for each step of the process which allow for repeating the procedure for each county in New York states. This method can be adapted to any state in the U.S.

Notes

A Census API key is needed to use the “tidycensus” package.

2010 block data grab and analysis functions

# block function
data_grab_2010_Block_function <- function(x){
  dataGrab_2010 <- get_decennial(geography = "block",
                                 state = "NY",
                                 county = county_sewer,
                                 year = 2010,
                                 
                                 variables = c(population = "H010001"),
                                 moe_level = 95)
  
  #no margin of error for decennial census
  wwCensus_2010 <- dataGrab_2010 %>% 
    spread(variable, value) %>% 
    distinct()
  
  wwCensus_2010$year <- rep(2010, nrow(wwCensus_2010))
  
  
  
  wwCensus_2010 <- wwCensus_2010 %>% 
    dplyr::select(GEOID, year, population)
  censusGeo_block$GEOID <- censusGeo_block$GEOID10
  wwCensusGeo_block <- merge(censusGeo_block, wwCensus_2010, by = "GEOID")
  
}

Census block dissolve and aggregate function

#function calculate area for each block, intersect, and calculate new area then dissolve
census_dissolve_function <- function(wwCensusGeo_block, sewersheds){
  #area for each block
  wwCensusGeo_block$area_BG <- st_area(wwCensusGeo_block)
  
  #calculate area proportion in each sewershed
  
  #need to add slight buffer for intersection to work
  wwCensusGeo_block <- st_buffer(wwCensusGeo_block, dist=0)
  sewersheds <- st_buffer(sewersheds, dist = 0)
  Block_clip <- st_intersection(wwCensusGeo_block, sewersheds)
  #area for clipped blocks
  Block_clip$area_clip <- st_area(Block_clip)
  
  #area proportion in each
  Block_clip$area_prop <- Block_clip$area_clip / Block_clip$area_BG
  Block_clip$population_2010_est <- Block_clip$area_prop * Block_clip$population
  Block_dt <- as.data.table(Block_clip)
  Block_dt_final <-
    Block_dt %>%
    group_by(Sewershed) %>%
    summarise(population_block_2010 = sum(population_2010_est))
  
}

Data grab for 2010 and 2018 census block groups

datagrab_BG_function <- function(x){
  
  dataGrab_2010 <- get_decennial(geography = "block group",
                                 state = "NY",
                                 county = county_sewer,
                                 year = 2010,
                                 
                                 variables = c(population_BG_2010 = "H010001",
                                               group_quartersPop = "P029026",
                                               group_quartersNursing = "P042005"
                                 ),
                                 moe_level = 95)
  
  #no margin of error for decennial census
  wwCensus_2010 <- dataGrab_2010 %>% 
    spread(variable, value) %>% 
    distinct()
  
  
  dataGrab_2018 <- get_acs(geography = "block group",
                           state = "NY",
                           county = county_sewer,
                           variables = c(population_BG_2018 = "B01001_001",
                                         housing_units = "B25001_001",
                                         vacant_units = "B25002_003",
                                         owner_occupied = "B25002_002",
                                         Male_age_65_66 = "B01001_020",
                                         Male_age_67_69 = "B01001_021",
                                         Male_age_70_74 = "B01001_022",
                                         Male_age_75_79 = "B01001_023",
                                         Male_age_80_84 = "B01001_024",
                                         Male_age_85_plus = "B01001_025",
                                         F_age_65_66 = "B01001_044",
                                         F_age_67_69 = "B01001_045",
                                         F_age_70_74 = "B01001_046",
                                         F_age_75_79 = "B01001_047",
                                         F_age_80_84 = "B01001_048",
                                         F_age_85_plus = "B01001_049",
                                         black_alone = "B02001_003",
                                         Amer_indian = "B02001_004",
                                         Asian = "B02001_005",
                                         Pacific_islander = "B02001_006",
                                         Other_race = "B02001_007",
                                         Two_or_more_races = "B02001_008",
                                         hispanic_pop = "B03003_003",
                                         median_house_value = "B25077_001",
                                         Total_Education = "B15003_001",
                                         overHS1 = "B15003_017",
                                         overHS2 = "B15003_018",
                                         overHS3 = "B15003_019",
                                         overHS4 = "B15003_020",
                                         overHS5 = "B15003_021",
                                         overHS6 = "B15003_022",
                                         overHS7 = "B15003_023",
                                         overHS8 = "B15003_024",
                                         overHS9 = "B15003_025"
                           ),
                           year = 2018)
  
  wwCensus_2018 <- dataGrab_2018 %>% 
    dplyr::select(-moe) %>% 
    spread(variable, estimate) %>% 
    distinct()
  
  
  wwCensus_2018 <- wwCensus_2018 %>% 
    mutate(
      Over64_Per = (Male_age_65_66 + Male_age_67_69 +Male_age_70_74 +
                      Male_age_75_79 + Male_age_80_84 + Male_age_85_plus +
                      F_age_65_66 + F_age_67_69 + F_age_70_74 +
                      F_age_75_79 + F_age_80_84 + F_age_85_plus),
      Nonwhite_Pop = (black_alone + Amer_indian + Asian + Pacific_islander + Other_race + Two_or_more_races),
      Under_HS_tot = (overHS1 + overHS2 + overHS3 + overHS4 + overHS5 + overHS6 +
                        overHS7 + overHS8 +overHS9))
  
  
  wwCensus_2018 <- wwCensus_2018 %>% 
    dplyr::select(GEOID, population_BG_2018, owner_occupied, Over64_Per, housing_units, vacant_units, black_alone, hispanic_pop, Nonwhite_Pop, Total_Education, Under_HS_tot)
  wwCensus <- left_join(wwCensus_2010, wwCensus_2018, by = "GEOID")
  #ratio which can be multiplied by 2010 value to determine the increase or decrease in population
  
  #
  #merge to shapefile loaded earlier
  wwCensusGeo_change <- merge(censusGeo_BG, wwCensus, by = "GEOID")
  
}

Census block group dissolve and aggregate function

census_BG_dissolve_function <- function(wwCensusBG_2010_2018, sewersheds){
  #area for each block group
  wwCensusBG_2010_2018$area_BG <- st_area(wwCensusBG_2010_2018)
  
  #calculate area proportion in each sewershed so can assign to the population change values to get estimate for population in each based on block group rather than block data
  wwCensusBG_2010_2018 <- st_buffer(wwCensusBG_2010_2018, dist=0)
  sewersheds <- st_buffer(sewersheds, dist=0)
  BG_clip <- st_intersection(wwCensusBG_2010_2018, sewersheds)
  #area for clipped block groups
  BG_clip$area_clip <- st_area(BG_clip)
  
  #area proportion in each
  BG_clip$area_prop <- BG_clip$area_clip / BG_clip$area_BG
  
  #now multiply by the population values for 2010 and 2018
  # block population data
  #now multiply by the population values for 2010 
  BG_clip_dt <- as.data.table(BG_clip)
  BG_clip_dt <- BG_clip_dt[, 12:24 := lapply(.SD, '*', area_prop), .SDcols = 12:24][]
  
  #now dissolve to sewershed and compare to block values to get a margin of error
  #but first change to sf object
  BG.final <-
    BG_clip_dt %>%
    group_by(Sewershed) %>%
    summarise(bg_pop_2010 = sum(population_BG_2010),
              bg_pop_2018 = sum(population_BG_2018),
              owner_occupied_2018 = sum(owner_occupied),
              Over64_2018 = sum(Over64_Per),
              housing_units_2018 = sum(housing_units),
              vacant_units_2018 = sum(vacant_units),
              group_quartersNursing_2010 = sum(group_quartersNursing),
              group_quartersPop_2010 = sum(group_quartersPop),
              black_alone_2018 = sum(black_alone),
              hispanic_pop_2018 = sum(hispanic_pop),
              Nonwhite_pop_2018 = sum(Nonwhite_Pop),
              Total_Education_2018 = sum(Total_Education),
              Under_HS_tot_2018 = sum(Under_HS_tot)
    )
  
}

Test with Cayuga county sewersheds

### Step 1
# sewershed shapefile
sewersheds <- st_read("data/Cayuga catchments/Cayuga catchments.shp")

# how many counties to download?
table(sewersheds$OTH_COUNT) # table of extent 0, only one county (cayuga)

# define county where sewersheds are
county_sewer <- c("Cayuga")

# census block shapefile
censusGeo_block <- st_read("E:/Data/Census/County_blocks/Cayuga/tl_2010_36011_tabblock10.shp")
#censusGeo_block <- blocks("New York", county = county_sewer)

# census block groups for shapefile download
censusGeo_BG <- block_groups("New York", county = county_sewer, cb = TRUE, year = 2018)

#create SW_ID
sewersheds$SW_ID <- 
  paste(sewersheds$StateID, sewersheds$CountyID, sewersheds$WWTP_ID, sewersheds$Site_ID,
        sep = "")
### Step 2

##load functions (performed above)

### Step 3 shapefile for 2010 block
wwCensusGeo_block <- data_grab_2010_Block_function(county_sewer)

### Step 4
#change crs to match
# drop old variables sewersheds

sewersheds <- st_transform(sewersheds, st_crs(wwCensusGeo_block))

# turn off spherical geometry
sf::sf_use_s2(FALSE)

### Step 5 calculate sewershed pop values
Block_dt_final <- census_dissolve_function(wwCensusGeo_block, sewersheds)

### Step 6
#merge into sewershed shapefile
# drop old variables sewersheds
names(sewersheds)
sewersheds <- sewersheds[c(1:11,22:24)]
sewersheds2 <- merge(sewersheds, Block_dt_final, by = "Sewershed")

### Step 7
# pull BG data
wwCensusBG_2010_2018 <- datagrab_BG_function(county_sewer)

### Step 8
#change crs of sewersheds
sewersheds <- st_transform(sewersheds, st_crs(wwCensusBG_2010_2018))

# run final function
BG.final <- census_BG_dissolve_function(wwCensusBG_2010_2018, sewersheds)

#change variables to percentage and obtain population change number
# note for saving as esri shapefile, colnames need to be 10 or less characters
BG.final$BG_pop_change <- BG.final$bg_pop_2018 / BG.final$bg_pop_2010
BG.final$VacPer18 <- BG.final$vacant_units_2018 / BG.final$housing_units_2018
BG.final$NursPer10 <- BG.final$group_quartersNursing_2010 / BG.final$group_quartersPop_2010
BG.final$BlkPer18 <- BG.final$black_alone_2018 / BG.final$bg_pop_2018
BG.final$HispPer18 <- BG.final$hispanic_pop_2018 / BG.final$bg_pop_2018
BG.final$NWPer18 <- BG.final$Nonwhite_pop_2018 / BG.final$bg_pop_2018
BG.final$EduPer18 <- 1- (units::drop_units(BG.final$Under_HS_tot_2018))/units::drop_units(BG.final$Total_Education_2018)

#average annual growth rate
BG.final$GrwthRt <- (((BG.final$bg_pop_2018 - BG.final$bg_pop_2010)/BG.final$bg_pop_2010))/8
#select final variables to merge to sewershed data
BG.variables <- BG.final %>%
  dplyr::select(Sewershed, BG_pop_change,housing_units_2018, VacPer18, NursPer10, BlkPer18,
                HispPer18, NWPer18, EduPer18, GrwthRt)


sewersheds3 <- merge(sewersheds2, BG.variables, by = "Sewershed")

#calculate 2018 pop estimate using BG pop change and 2020 using growth rate
sewersheds3$POP18 <- sewersheds3$population_block_2010 * sewersheds3$BG_pop_change
sewersheds3$POP2020 <- units::drop_units(sewersheds3$population_block_2010) * (1+ units::drop_units(sewersheds3$GrwthRt))^10
#calculated 2020 population using one year change

#round to nearest whole number pop and housing units
sewersheds3$POP18 <- round(sewersheds3$POP18, 0)
sewersheds3$POP2020 <- round(sewersheds3$POP2020, 0)
sewersheds3$housing_units_2018 <- round(sewersheds3$housing_units_2018, 0)
sewersheds3$population_block_2010 <- round(sewersheds3$population_block_2010, 0)

#rename housing variable
sewersheds3$HOUSING18 <- sewersheds3$housing_units_2018
names(sewersheds3)

#remove columns we do not need
sewersheds4 <- sewersheds3[c(2:9,1,10:14,18:25)]
names(sewersheds4)
summary(sewersheds4$POP2020)
### Step 9
#save file
st_write(sewersheds4, "data/Cayuga catchments/Cayuga catchments.shp", append=FALSE)

#load to be sure saved properly
Cayuga <- st_read("data/Cayuga catchments/Cayuga catchments.shp")

ggplot()+
  geom_sf(data = Cayuga)