## ----setup, include=FALSE, cache=FALSE------------------------------
# set global chunk options
## this code is only required to set up knitr for reproducible documentation
## it has nothing to do with the exercise itself, please ignore!
require(knitr)
opts_knit$set(aliases=c(h = 'fig.height', w = 'fig.width'),
              out.format='latex', use.highlight=TRUE)
opts_chunk$set(fig.path='kgraph/exArealData-', fig.align='center', fig.show='hold',
               # do not show R prompts, makes it easier to cut-paste
               prompt=FALSE, comment=NA, fig.width=6, fig.height=6,
               out.width='.65\\linewidth', size='scriptsize',
               cache=FALSE, message=FALSE)
# options to be read by formatR
options(replace.assign=TRUE, width=70)
options(warn = -1)


## ----echo=FALSE, results="hide"-------------------------------------
rm(list=ls())


## ----child-data, child='exArealData1.Rnw'---------------------------

## ----data-set-parent, echo=FALSE, cache=FALSE-----------------------
knitr::set_parent('exArealData.Rnw')


## ----load-packages--------------------------------------------------
library(sf)
library(sp)
library(spdep)
library(spatialreg)
library(RColorBrewer)
library(gstat)
library(ggplot2)


## ----eval=F---------------------------------------------------------
## list.files(path = "./NY_data", pattern="*.shp")


## ----echo=F---------------------------------------------------------
## this is the file specification on the author's system; he organizes datasets into a subdirectory named `ds', and then
##   each data source into its own sub-subdirectory
list.files(path = "../ds/ASDAR/NY_data/", pattern="*.shp")


## ----eval=F---------------------------------------------------------
## NY8 <- st_read(dsn = "./NY_data", layer = "NY8_utm18")
## class(NY8)
## str(NY8, max.level=2)
## summary(NY8)
## NY8$AREANAME <- as.factor(NY8$AREANAME)
## #
## cities <- st_read(dsn = "./NY_data", layer = "NY8cities")
## class(cities)
## print(cities)


## ----echo=F---------------------------------------------------------
NY8 <- st_read(dsn = "../ds/ASDAR/NY_data", layer = "NY8_utm18")
str(NY8)
summary(NY8)
NY8$AREANAME <- as.factor(NY8$AREANAME)
#
cities <- st_read(dsn = "../ds/ASDAR/NY_data", layer = "NY8cities")
summary(cities)
print(cities)


## ----bingo----------------------------------------------------------
ix <- which(cities$names == "Binghampton")
cities$names[ix] <- "Binghamton"


## ----check-validity-------------------------------------------------
# find errors
tmp <- st_is_valid(NY8, reason = TRUE)
ix <- which(tmp != "Valid Geometry")
print(data.frame(polygonID = ix, reason = tmp[ix]))
# fix it
NY8 <- st_make_valid(NY8)
all(st_is_valid(NY8))


## ----ny8-cities, out.width='.9\\linewidth'--------------------------
ggplot(data = NY8) +
  geom_sf(aes(fill = POP8)) +
  geom_sf(data = cities) +
  geom_sf_label(data = cities, aes(label = names))


## -------------------------------------------------------------------
st_crs(NY8)$proj4string
st_crs(cities)$proj4string


## ----all-equal-q----------------------------------------------------
NY8_nb <- poly2nb(NY8)
class(NY8_nb)
summary(NY8_nb)
NY8_nb[[1]]


## -------------------------------------------------------------------
NY8_nb2 <- poly2nb(NY8, queen=FALSE)
summary(NY8_nb2)
NY8_nb[[1]]
NY8_nb2[[1]]



## -------------------------------------------------------------------
# st_touches includes single points of contact
NY8_st <- st_touches(NY8)
NY8_st[[1]]
class(NY8_st)


## ----as-nb-sgbp-----------------------------------------------------
NY8_st_nb <- NY8_st
class(NY8_st_nb) <- "nb"


## -------------------------------------------------------------------
(n.links.queen <- sum(unlist(lapply(NY8_nb, length))))
(n.links.rook <- sum(unlist(lapply(NY8_nb2, length))))
n.links.queen - n.links.rook


## ----plot-NY8-queen-rook, fig.height=12, fig.width=12, out.width='\\linewidth'----
plot(st_geometry(NY8), border="grey", 
     main = "NY8 8-County Census Tracts", 
     axes = TRUE)
coords_centers <- st_coordinates(st_centroid(st_geometry(NY8)))
plot(NY8_nb, coords_centers, add=TRUE,
     pch=19, cex = 0.6, col="red")
plot(NY8_nb2, coords_centers, add=TRUE,
     pch=19, cex = 0.6,col="blue")
legend(461000, 4801000, c("rook","queen"), 
       lty=1, col=c("blue","red"))
grid()


## -------------------------------------------------------------------
(n.nb <- unlist(lapply(NY8_nb, length)))
table(n.nb)


## -------------------------------------------------------------------
min(n.nb); max(n.nb)
(ix.min <- which(n.nb==min(n.nb)))
NY8[ix.min,]
(ix.max <- which(n.nb==max(n.nb)))
NY8[ix.max,]


## ----plot-ny8-nb-2, fig.height=12, fig.width=12, out.width='\\linewidth'----
plot(st_geometry(NY8), border="grey", 
     main = "NY8 Census Tracts with max/min neighbours", 
     sub = "min: red; max: green",
     axes = TRUE)
plot(NY8[ix.min,], border="black", col="red", lwd=1, add=T)
plot(NY8[ix.max,], border="black", col="green", lwd=1, add=T)
plot(NY8_nb, coords_centers, add=TRUE, 
     pch=19, cex = 0.6, col="black", lwd=0.2)
grid()


## ----dnearneigh-----------------------------------------------------
NY8_nb_d <- dnearneigh(coords_centers, d1=0, d2=5000, 
                       row.names=row.names(NY8))
summary(NY8_nb_d)


## ----plot-NY8-10km, fig.height=12, fig.width=12, out.width='\\linewidth'----
plot(st_geometry(NY8), border="grey", 
     main = "NY8 Census Tracts, 10 km neighbours", axes = TRUE)
plot(NY8_nb_d, coords_centers, pch=19, cex = 0.6, add=TRUE, col="blue")
grid()


## ----knn------------------------------------------------------------
knn3 <- knearneigh(coords_centers, k=3)
str(knn3$nn)
knn3$nn[1:3,]
NY8_nb_3nn <- knn2nb(knn3, row.names=row.names(NY8))
NY8_nb_3nn


## ----neigh-knear, fig.height=12, fig.width=12, out.width='\\linewidth'----
plot(st_geometry(NY8), border="grey", 
     main = "NY8 Census Tracts, 3 nearest neighbours", axes = TRUE)
plot(NY8_nb_3nn, coords_centers, pch=19, cex=0.6, col="blue", add=TRUE)
grid()


## -------------------------------------------------------------------
names(NY8)
length(unique(NY8$AREANAME))
sort(unique(NY8$AREANAME))
which(NY8$AREANAME == "Syracuse city")


## ----subset-Syr-----------------------------------------------------
(ix <- grep("Syracuse city", NY8$AREANAME, fixed = TRUE))
# find near neighbours by distance
(iy <- unique(unlist(NY8_nb_d[ix])))
Syr <- NY8[iz <- union(ix, iy),]
dim(Syr)


## ----subset-Syr-nb--------------------------------------------------
Syr_nb <- poly2nb(Syr)


## ----plot-syr-nb, out.width='.9\\linewidth'-------------------------
plot(st_geometry(Syr), border="grey", 
     main = "Syracuse Census Tracts", axes = TRUE)
coords_centers_Syr <- st_coordinates(st_centroid(st_geometry(Syr)))
plot(Syr_nb, coords_centers_Syr, col = "blue", lwd = 0.7, 
     pch=19, cex=0.6, add=TRUE)
text(coords_centers_Syr, row.names(Syr))
grid()


## ----eval=FALSE-----------------------------------------------------
## Syr.ll <- st_transform(Syr, 4326)
## st_write(Syr.ll, "./Syr.kml", driver = "kml", delete_dsn = TRUE)


## ----echo=FALSE, results='hide'-------------------------------------
Syr.ll <- st_transform(Syr, 4326)
st_write(Syr.ll, "../ds/ASDAR/Syr.kml", driver = "kml", delete_dsn = TRUE)


## ----make-weights-NY8-----------------------------------------------
NY8_lw_W <- nb2listw(NY8_nb)
print(NY8_lw_W)
print(NY8_lw_W$neighbours[[1]])
print(NY8_lw_W$weights[[1]])


## ----NY8-lw-minmax--------------------------------------------------
unlist(unique(NY8_lw_W$weights[ix.min]))
unlist(unique(NY8_lw_W$weights[ix.max]))


## ----make-weights-Syr-----------------------------------------------
Syr_lw_W <- nb2listw(Syr_nb)
print(Syr_lw_W)


## ----show.weights.as.matrix-----------------------------------------
build.wts.matrix <- function(wts.list) {
    # set up a matrix to receive the weights, initially all 0
    len <- length(wts.list$weights)
    wts.matrix <- as.data.frame(matrix(0, nrow=len, ncol=len))
    row.names(wts.matrix) <- attr(wts.list$neighbours, "region.id")
    colnames(wts.matrix) <- attr(wts.list$neighbours, "region.id")
    for (i in 1:len) { # each item in the weights list
        nl <- wts.list$neighbours[[i]]  ## one row's neighbours
        wl <- wts.list$weights[[i]]     ## one row's weights
        if (nl[1] != 0) {  # empty neighbour lists have a single `0' element
            # fill in this row of the weights matrix
            for (j in 1:length(nl))  wts.matrix[i, nl[j]] <- wl[j]
            }
    }
    return(wts.matrix)
}
tmp <- build.wts.matrix(NY8_lw_W)
tmp[1,]
round(tmp[1:9,1:9], 4)
rm(tmp)


## -------------------------------------------------------------------
summary(Syr$Z)


## ----plot-z, out.width='.8\\linewidth'------------------------------
ggplot(data = Syr) +
  geom_sf(aes(fill = Z)) +
  labs(title = "Leukemia incidence")


## ----plot-rank-z, out.width='.8\\linewidth'-------------------------
rank <- rank(Syr$Z)
ggplot(data = Syr) +
  geom_sf(aes(fill = rank)) +
  scale_fill_gradient2(guide = guide_colourbar(reverse = TRUE)) +
  labs(title = "Rank of Leukemia incidence")


## ----plot-rel-z, out.width='.8\\linewidth'--------------------------
rel.risk  <- ((Syr$Z-min(Syr$Z))/(max(Syr$Z)-min(Syr$Z)))
n <- length(Syr$Z)
rel.risk <- pmax(ceiling(n*rel.risk), 1)
rel.risk.pal <- rev(brewer.pal(9, 'Spectral'))
ggplot(data = Syr) +
  geom_sf(aes(fill = rel.risk)) +
  scale_fill_gradientn(colors = rel.risk.pal) +
  labs(title = "Relative risk of Leukemia incidence")


## -------------------------------------------------------------------
(moran.z <- moran.test(Syr$Z, Syr_lw_W))


## -------------------------------------------------------------------
n.nb.s <- unlist(lapply(Syr_nb, length))
table(n.nb.s)
min(n.nb.s); max(n.nb.s)
(ix.min.s <- which(n.nb.s==min(n.nb.s)))
Syr[ix.min.s,]
(ix.max.s <- which(n.nb.s==max(n.nb.s)))
Syr[ix.max.s,]


## -------------------------------------------------------------------
Syr_lw_W <- nb2listw(Syr_nb)
summary(Syr_lw_W)
Syr_lw_B <- nb2listw(Syr_nb, style="B")
summary(Syr_lw_B)
Syr_lw_C <- nb2listw(Syr_nb, style="C")
summary(Syr_lw_C)
Syr_lw_U <- nb2listw(Syr_nb, style="U")
summary(Syr_lw_U)


## -------------------------------------------------------------------
row.names(Syr[1,])
Syr_nb[[1]]
dsts <- nbdists(Syr_nb, coords_centers_Syr)
dsts[1]
idw <- lapply(dsts, function(x) 1/(x/1000))
# could do this, IDW^2
# idw2 <- lapply(dsts, function(x) (1/(x/1000)^2))
idw[1]
Syr_lw_idwB <- nb2listw(Syr_nb, glist=idw, style="B")
Syr_lw_idwB$weights[[1]]


## -------------------------------------------------------------------
summary(unlist(Syr_lw_idwB$weights))


## -------------------------------------------------------------------
summary(sapply(Syr_lw_idwB$weights, sum))


## -------------------------------------------------------------------
print(moran.z.idwB <- moran.test(Syr$Z, Syr_lw_idwB))
print(moran.z)
# (moran.pctage65p.idwB <- moran.test(Syr$PCTAGE65P, NY8_lw_idwB))


## ----compare.weight.matrices----------------------------------------
tmp <- build.wts.matrix(Syr_lw_W)
round(tmp[1:9,1:9],4)
tmp <- build.wts.matrix(Syr_lw_B)
round(tmp[1:9,1:9], 4)
tmp <- build.wts.matrix(Syr_lw_C)
round(tmp[1:9,1:9], 4)
tmp <- build.wts.matrix(Syr_lw_U)
round(tmp[1:9,1:9], 4)
tmp <- build.wts.matrix(Syr_lw_idwB)
round(tmp[1:9,1:9], 4)


## -------------------------------------------------------------------
mp <- moran.plot(Syr$Z, Syr_lw_W, xlab="Z",
                 ylab="average neighbour Z")
title(main="Moran scatterplot, Syracuse leukemia incidence",
      sub="weights style `W'")


## -------------------------------------------------------------------
str(mp)
mp[1,]


## -------------------------------------------------------------------
ix <- which(infl <- mp$is_inf)
mp[ix, ]
print(cbind(Syr[ix, c("AREAKEY","Z")], ix))
Syr[Syr_lw_W$neighbours[ix][[1]],c("AREAKEY","Z")]


## ----localmoran-1---------------------------------------------------
lm1 <- localmoran(Syr$Z, Syr_lw_W)
summary(lm1)
ix <- which(lm1[,"Pr(z != E(Ii))"] < 0.05)
print(cbind(Syr[ix,c("AREAKEY","POP8","Z")],ix))


## ----out.width='\\textwidth', fig.width=12, fig.height=4------------
par(mfrow=c(1,3))
mp <- moran.plot(Syr$Z, Syr_lw_W, xlab="Z",
                 ylab="average neighbour Z")
title(main="Moran scatterplot, Syracuse leukemia",
      sub="weights style `W'")
mp <- moran.plot(Syr$Z, Syr_lw_B, xlab="Z",
                 ylab="average neighbour Z")
title(main="Moran scatterplot, Syracuse leukemia",
      sub="weights style `B' (binary)")
mp <- moran.plot(Syr$Z, Syr_lw_idwB, xlab="Z",
                 ylab="average neighbour Z")
title(main="Moran scatterplot, Syracuse leukemia",
      sub="weights style `I' (inverse distance)")
par(mfrow=c(1,1))


## ----localmoran-compare---------------------------------------------
##
moran.test(Syr$Z, Syr_lw_W)
lm1 <- localmoran(Syr$Z, Syr_lw_W)
summary(lm1[,"Ii"])
(ix <- which(lm1[,"Pr(z != E(Ii))"] < 0.05))
##
moran.test(Syr$Z, Syr_lw_B)
lm1 <- localmoran(Syr$Z, Syr_lw_B)
summary(lm1[,"Ii"])
(ix <- which(lm1[,"Pr(z != E(Ii))"] < 0.05))
##
moran.test(Syr$Z, Syr_lw_idwB)
lm1 <- localmoran(Syr$Z,Syr_lw_idwB)
summary(lm1[,"Ii"])
(ix <- which(lm1[,"Pr(z != E(Ii))"] < 0.05))
##


## ----gettis-ord-----------------------------------------------------
summary(gi <- localG(Syr$Z, Syr_lw_W))


## ----plot-gi, out.width='.9\\linewidth', fig.width=8, fig.height=8----
shade <- as.numeric(round(n*((gi-min(gi))/
                             (max(gi)-min(gi)))))+1
colfunc <- colorRampPalette(c("blue", "green", "yellow", "red"))
ramp <- colfunc(n+1)
plot(Syr["Z"], border="grey60", axes=TRUE,
     col=ramp[shade],
     main="Getis-Ord Gi, Syracuse leukemia incidence")
grid()
text(coords_centers_Syr, as.character(round(gi,2)), col="black")


## ----compute-gistar-nblist------------------------------------------
Syr_nbi <- Syr_nb
## add the index its own list
for (i in 1:length(Syr_nb)) {
    Syr_nbi[[i]] <- sort(c(Syr_nbi[[i]], i))
}
## convert to weights
Syr_lw_Wi <- nb2listw(Syr_nbi)
print(Syr_lw_W)
print(Syr_lw_W$weights[[1]])
print(Syr_lw_Wi)
print(Syr_lw_Wi$weights[[1]])


## ----compute-gistar-------------------------------------------------
summary(gi.star <- localG(Syr$Z, Syr_lw_Wi))
summary(gi)
summary(gi.star-gi)


## ----plot-gistar, out.width='.9\\linewidth', fig.width=8, fig.height=8----
## these from plot-gi, above
# n <- length(Syr$Z)
# colfunc <- colorRampPalette(c("blue", "green", "yellow", "red"))
# ramp <- colfunc(n+1)
shade <- as.numeric(round(n*((gi.star-min(gi.star))/
                             (max(gi.star)-min(gi.star)))))+1
plot(Syr["Z"], border="grey60", axes=TRUE,
     col=ramp[shade],
     main="Getis-Ord Gi*, Syracuse leukemia incidence")
grid()
text(coords_centers_Syr, as.character(round(gi.star,2)), col="black")


## ----eval=F---------------------------------------------------------
## TCE <- st_read("./NY_data", "TCE")


## ----echo=F, results='hide'-----------------------------------------
TCE <- st_read("../ds/ASDAR/NY_data", "TCE")


## ----TCE-sources, out.width='.9\\linewidth', fig.width=8, fig.height=8----
ggplot(data = NY8) +
  geom_sf(aes(fill = Z)) +
  scale_fill_gradientn(colors = rel.risk.pal) +
  labs(title = "8 counties, TCE sources", fill = "Incidence") +
  geom_sf(data = TCE, cex = 2, pch = 21, fill = "red")


## -------------------------------------------------------------------
summary(NY8)
m.z.ppp <- lm(Z ~ PEXPOSURE + PCTAGE65P + PCTOWNHOME, data=NY8)
summary(m.z.ppp)


## ----out.width='.75\\linewidth', fig.height=4, fig.width=6----------
hist(residuals(m.z.ppp))
rug(residuals(m.z.ppp))
ix <- which.max(residuals(m.z.ppp))
NY8[ix,]


## -------------------------------------------------------------------
NY8listwB <- nb2listw(NY8_nb, style = "B")
(m.z.ppp.moran.test <- lm.morantest(m.z.ppp, NY8listwB))


## ----leukemia-lm-resid, out.width='.9\\linewidth'-------------------
NY8$lmresid <- residuals(m.z.ppp)
summary(NY8$lmresid)
ggplot(data = NY8) +
  geom_sf(aes(fill = lmresid)) +
  labs(fill = "Linear model residuals", main = "Central NY State") +
  scale_fill_gradientn(colors = colorspace::diverge_hcl(12)) +
  geom_sf(data = TCE, aes(col = name)) +
  labs(col = "TCE sites")


## ----summary-sar----------------------------------------------------
m.z.ppp.sar <- spatialreg::spautolm(Z ~ PEXPOSURE + PCTAGE65P + PCTOWNHOME,
                        data=NY8, listw=NY8listwB)
summary(m.z.ppp.sar)


## ----compare-coeff-ppp-pppsar---------------------------------------
round(summary(m.z.ppp)$coefficients[,c(1,4)],4)
round(summary(m.z.ppp.sar)$Coef[,c(1,4)],4)


## ----plot-sar-resid, out.width='.9\\linewidth'----------------------
NY8$sarresid <- residuals(m.z.ppp.sar)
summary(NY8$sarresid)
ggplot(data = NY8) +
  geom_sf(aes(fill = sarresid)) +
  labs(fill = "SAR model residuals", main = "Central NY State") +
  scale_fill_gradientn(colors = colorspace::diverge_hcl(12)) +
  geom_sf(data = TCE, aes(col = name)) +
  labs(col = "TCE sites")


## -------------------------------------------------------------------
moran.test(NY8$sarresid, NY8listwB)


## ----sar-resid-change, out.width='.7\\linewidth'--------------------
NY8$resid.change <- (NY8$sarresid - NY8$lmresid)
summary(NY8$resid.change)
ggplot(data = NY8) +
  geom_sf(aes(fill = resid.change)) +
  scale_fill_gradientn(colors = colorspace::diverge_hcl(12)) +
  labs(title="SAR error model residuals - linear model residuals",
       fill = 'SAR - LM residual')


## ----trend-stochastic, out.width='\\linewidth', fig.width=10, fig.height=6----
class(m.z.ppp.sar)
NY8$sar_trend <- m.z.ppp.sar$fit$signal_trend
NY8$sar_stochastic <- m.z.ppp.sar$fit$signal_stochastic
rds <- colorRampPalette(brewer.pal(8, "RdBu"))
g1 <- ggplot(data = NY8) +
  geom_sf(aes(fill = sar_trend)) +
  scale_fill_gradientn(colours = bpy.colors(12)) +
  labs(fill = "SAR deterministic component")
#
g2 <- ggplot(data = NY8) +
  geom_sf(aes(fill = sar_stochastic)) +
   scale_fill_gradientn(colours = bpy.colors(12)) +
  labs(fill = "SAR stochastic component")
gridExtra::grid.arrange(g1, g2, nrow = 1)


## ----fit-lagsarlm---------------------------------------------------
m.z.ppp.lagsar <- spatialreg::lagsarlm(Z ~ PEXPOSURE + PCTAGE65P + PCTOWNHOME,
                           data=NY8, listw=NY8listwB)
summary(m.z.ppp.lagsar)


## ----compare-coeff-lagsar-------------------------------------------
round(summary(m.z.ppp.sar)$Coef[,c(1,4)],4)
round(summary(m.z.ppp.lagsar)$Coef[,c(1,4)],4)


## ----fit-durbinlm---------------------------------------------------
m.z.ppp.durbin <- spatialreg::lagsarlm(Z ~ PEXPOSURE + PCTAGE65P + PCTOWNHOME,
                           data=NY8, listw=NY8listwB, type="mixed")
summary(m.z.ppp.durbin)


## ----compare-coeff-durbin-------------------------------------------
round(summary(m.z.ppp.sar)$Coef[,c(1,4)],4)
round(summary(m.z.ppp.durbin)$Coef[,c(1,4)],4)


## ----make-spdf------------------------------------------------------
NY8.pts <- st_centroid(NY8)
str(NY8.pts)
ggplot(data = NY8.pts) +
  geom_sf(aes(color=Z))


## ----compute-ols----------------------------------------------------
m.z.ppp.pts <- lm(Z ~ PEXPOSURE + PCTAGE65P + PCTOWNHOME,
       data = NY8.pts)
coefficients(m.z.ppp.pts)
coefficients(m.z.ppp)


## ----bubble-function, out.width='0.75\\linewidth'-------------------
bubble.sf <- function(.point.obj.name, .field.name, .field.label, .title="") {
    # make a plus/minus indicator
    eval(parse(
        text = paste0("pm <- factor(", 
                      .point.obj.name, "$", 
                      .field.name, "> 0)")
    ))
    # rename them
    levels(pm) <- c("-, overprediction", "+, underprediction")
    # plot
    eval(parse(
      text = paste0("ggplot(", 
                    .point.obj.name,
                    ") + geom_sf(aes(colour = pm, size = abs(", 
                    .field.name, 
                    ")), shape = 1) + labs(size = paste('+/-', .field.label), 
                    colour = '', title = '",
                    .title, "') +
                    scale_colour_manual(values = c('red', 'green'))")  
    ))
} 


## ----out.width='.7\\linewidth', ols-residuals-bubble----------------
NY8.pts$lm.resid <- residuals(m.z.ppp.pts)
bubble.sf("NY8.pts", "lm.resid", 
       "Leukemia incidence, residuals", "OLS")


## ----ols-residuals-vgm, out.width='.55\\linewidth'------------------
vr <- variogram(lm.resid ~ 1, loc=NY8.pts)
plot(vr, plot.numbers=T)


