# extended functionality though packages

# spatial and spacetime packages
library(sp)
library(spdep)
library(rgdal)
library(raster)
# library(spacetime)

# data management packages
library(plyr)
library(reshape)

# advanced graphics
library(lattice)
library(latticeExtra)
library(RColorBrewer)

# cluster analysis
library(ape)
library(cluster)
library(MASS)

# time-series filtering
# library(robfilter)

# custom functions
source('panel.tsplot.R')

## color schemes
cols <- rev(brewer.pal(n=11, name='Spectral'))
cols.palette <- colorRampPalette(cols)
ncuts <- 20
cols.set1 <- brewer.pal('Set1', n=9)


## Step 1: import sample dataset
# load the local copy, don't auto-convert characters -> factors
d <- read.csv(gzfile('static_data/HRCLIM.csv.gz'), as.is=TRUE)

# tidy-up
names(d) <- c('station', 'lat', 'lon', 'date', 'temp') # nicer names
# convert years into Date class objects, starting from Jan 01
d$date <- as.Date(d$date, format="%Y-%m-%d") 
# re-order according to date, by station
d <- d[order(d$station, d$date), ]
# convert station ID to factor datatype
d$station <- factor(d$station)

# check
str(d)

# Lattice Graphics -- very powerful
# quick check on the first 10 stations
# pdf(file='figures/hrclim_first_impressions.pdf', width=10, height=5.5)
xyplot(temp ~ date | station, data=d, type=c('l','g'), subset=station %in% 1:10, 
layout=c(2, 5), as.table=TRUE, xlab='', ylab='Temperature', col='black',
scales=list(alternating=1), strip=strip.custom(bg=grey(0.8)))
# dev.off()


## note that we are working with data in 'long format'
## its simpler to work with most types of data in long format
## the **ply() family of functions are helpful in this case

# check basic distribution of data
d.summary <- ddply(d, .(date), .fun=summarize, 
q05=quantile(temp, prob=0.05, na.rm=TRUE),
q50=quantile(temp, prob=0.50, na.rm=TRUE),
q95=quantile(temp, prob=0.95, na.rm=TRUE),
n.missing=length(which(is.na(temp))))

# replace n.missing on days with no missing data with NA
d.summary$n.missing <- ifelse(d.summary$n.missing == 0, NA, d.summary$n.missing)
# rescale to {0,1}
d.summary$n.missing <- d.summary$n.missing / max(d.summary$n.missing, na.rm=TRUE)

# check
str(d.summary)
head(d.summary)


# plot median bounded by 5% and 95% percentiles
p1 <- xyplot(q05 + q50 + q95 ~ date, data=d.summary, type=c('l','g'), 
lty=c(2,1,2), col='black', lwd=c(1,2,1), xlab='Date', ylab='Temperature',
scales=list(cex=1))

# add visual cues on missing data
p2 <- p1 + layer(panel.points(x=d.summary$date, y=32, cex=d.summary$n.missing * 2, pch='|', col='red'))

# pdf(file='figures/hrclim_data_range.pdf', width=8, height=4)
print(p2)
# dev.off()


## working with the original data again
# check on deviations from the _global_ mean and scale by SD
d$temp_standardized <- scale(d$temp)

# check
head(d)

# get station IDs
st <- levels(d$station)

# leveplot colors each cell (date x station) according to standardized temperature
# first 79 stations
d.1 <- d[d$station %in% st[1:79], ]
# png(file='figures/hrclim_levelplot_1.png', width=600, height=800)
levelplot(temp_standardized ~ date*station, data=d.1, xlab='', ylab='Station',
scales=list(y=list(cex=0.5), x=list(alternating=1, tick.number=12, format="%b\n%Y", cex=1)),
col.regions=cols.palette(ncuts), cuts=ncuts-1, colorkey=list(labels=list(cex=1.5)))
# dev.off()

# next 79 stations
d.2 <- d[d$station %in% st[80:158], ]
# png(file='figures/hrclim_levelplot_2.png', width=600, height=800)
levelplot(temp_standardized ~ date*station, data=d.2, xlab='', ylab='Station',
scales=list(y=list(cex=0.5), x=list(alternating=1, tick.number=12, format="%b\n%Y", cex=1)),
col.regions=cols.palette(ncuts), cuts=ncuts-1, colorkey=list(labels=list(cex=1.5)))
# dev.off()

# last 20 stations
d.3 <- d[d$station %in% st[138:158], ]
# pdf(file='figures/hrclim_levelplot_3.pdf', width=8, height=5)
levelplot(temp_standardized ~ date*station, data=d.3, xlab='', ylab='Station',
scales=list(y=list(cex=1), x=list(alternating=1, tick.number=12, format="%b\n%Y", cex=1)),
col.regions=cols.palette(ncuts), cuts=ncuts-1, colorkey=list(labels=list(cex=1.5)), main='Deviations from the global mean')
# dev.off()


## pause to ponder what we just accomplished ...


## compute deviations from the date-wise mean and scale by SD
# takes a moment to complete ... (use progress indicator)
d.date_wise_deviations <- ddply(d, .(date), .progress='text', .fun=function(i) {
  temp.scaled <- scale(i$temp)
  df <- data.frame(station=unique(i$station), temp.scaled=temp.scaled)
  return(df)
  })


# subset the last 20 stations, and plot
d.sub <- d.date_wise_deviations[d.date_wise_deviations$station %in% st[138:158], ]
# pdf(file='figures/hrclim_levelplot_date_wise_deviations.pdf', width=8, height=5)
levelplot(temp.scaled ~ date*station, data=d.sub, xlab='', ylab='Station',
scales=list(y=list(cex=1), x=list(alternating=1, tick.number=12, format="%b\n%Y", cex=1)),
col.regions=cols.palette(ncuts), cuts=ncuts-1, colorkey=list(labels=list(cex=1.5)), main='deviations from date-wise mean')
# dev.off()

# clean-up: remove extra columns, and extra objects
d$temp_standardized <- NULL # remove a column from a data.frame
str(d)

# remove objects from memory, and 'garbage collect'
# any CS majors want to explain garbge collection?
ls()
rm(d.1, d.2, d.3, d.date_wise_deviations, d.sub, st, p1, p2) ; gc()
ls()


## next tabulate missing data by station
d.record_summary <- ddply(
d, .(station), .fun=summarize, 
n=length(temp), 
n.missing=length(which(is.na(temp)))
)

# check
head(d.record_summary, 60)


# get a vector of stations that are not missing any data
# these are simplest to work with
no.missing <- d.record_summary$station[
d.record_summary$n == 367 & d.record_summary$n.missing == 0
]

# what did we extract?
# station IDs (they are factors)
no.missing


# clean out missing data, and drop missing levels from station ID
d.complete <- subset(d, subset= station %in% no.missing)
d.complete$station <- factor(d.complete$station)

# which stations did we remove?
# setdiff(full set [compared with] subset)
setdiff(levels(d$station), levels(d.complete$station))



## tangent: long format to wide and back
# give stations a name that is suitible for dataframe column names
d.complete$station <- factor(
d.complete$station, 
levels=levels(d.complete$station), 
labels=paste('st', levels(d.complete$station), sep='_')
)

# check
head(d.complete)

# convert to intermediate format, then 'cast' to wide format
# while we wait, note subtle humor invoked by 'reshape' 
# package author Hadley Wickham
d.molten <- melt(d.complete, id.vars=c('station','lat','lon','date'))
d.wide <- cast(d.molten, date ~ station, value='temp')
d.wide <- as.data.frame(d.wide)

# what have we done with our data?
str(d.wide)


# check to make sure that worked: YES
# take time to let this sink in
subset(d.complete, subset=date == as.Date('2008-01-01') & station == 'st_1')
d.wide[1, c('date', 'st_1')]


## things to try with wide-format data:
# 1. PCA: by date (remember to mask out the first column, the date)
d.pca <- princomp(d.wide[, 2:139], cor=TRUE)

# plot
# pdf(file='figures/hrclim_pca_by_date.pdf', width=8, height=5)
par(mar=c(2,3,1,1))
plot(d.wide$date, predict(d.pca)[, 1], type='n', xlab='', ylab='')
matlines(d.wide$date, predict(d.pca)[, 1:3], type='l', lty=1, col=1:3)
legend('bottomright', lty=1, col=1:3, legend=paste('PCA', 1:3, c('(96%)','(2%)','(<1%)')), bty='n', cex=1)
# dev.off()

# thats kind of interesting


## lets try something else with wide-format data
## dissimilarity between stations, using temporal patterns in temperature
# transpose: stations as rows, dates as columns:
d.wide <- cast(d.molten, station ~ date, value='temp')
d.wide <- as.data.frame(d.wide)

# hierarchical clustering: assumes no NA
d.h <- hclust(dist(d.wide[, 2:368]), method='average')
d.h$labels <- d.wide$station

# 'hlcust' objects do not have a very nice plot() method, yuck.
plot(d.h)

# use classes from the 'ape' package for nicer plots
d.phylo <- as.phylo(d.h)

# cut into classes, and set some colors for later use
n.classes <- 4
d.clusters <- cutree(d.h, n.classes)
d.cols <- brewer.pal('Set1', n=n.classes)

# wait, what did cutree() do?
d.clusters


# plot dendrogram
# pdf(file='figures/hrclim_temporal_hclust.pdf', width=8, height=2.5)
plot(d.phylo, direction='down', show.tip.label=FALSE, no.margin=TRUE)
tiplabels(cex=1, pch=15, col=d.cols[d.clusters])
legend('topright', col=d.cols, legend=1:n.classes, pch=15, cex=1.5, bty='n')
# dev.off()

# vizualize data according to these 4 classes... what do they mean?
# save cluster label
d.wide$cluster <- d.clusters

# convert back to long format -- for plotting
d.long <- melt(d.wide, id.var=c('station', 'cluster'))
# fix dates
d.long$date <- as.Date(d.long$variable)
# re-order for plotting
d.long <- d.long[order(d.long$date), ]

# check: OK
str(d.long)
head(d.long)

# plot: IQR of clusters
# pdf(file='figures/hrclim_temporal_hclust_tsplot.pdf', width=8, height=4)
xyplot(value ~ date, groups=cluster, data=d.long, type='l', 
par.settings=list(superpose.polygon=list(col=d.cols), superpose.lines=list(col=d.cols)),
auto.key=list(rectangles=TRUE, lines=FALSE, points=FALSE, columns=n.classes),
scales=list(cex=1), xlab='', ylab='IQR of Temperature by Cluster',
panel=panel.tsplot, range.fun='IQR')
# dev.off()

# what the heck is this: panel=panel.tsplot
# ... if interested, we can talk about custom panel functions

# clean-up
rm(d.long, d.wide); gc()


################################################################################
################################################################################
################################################################################


## check out spatial component

# working with all the data, complete + incomplete records
#  aggregate by station, compute mean annual temp
d.sub <- ddply(d, .(station, lat, lon), .fun=summarize, temp=mean(temp, na.rm=TRUE))

# check
str(d.sub)

# upgrade to SPDF... what?
coordinates(d.sub) <- ~ lon + lat
str(d.sub) # aha!

# note that we are missing CRS

# set spatial reference system, using proj4 notation
proj4string(d.sub) <- '+proj=longlat +ellps=bessel +towgs84=550.499,164.116,475.142,5.80967,2.07902,-11.62386,0.99999445824'

# visual check
plot(d.sub)

# quick map of mean annual temperature: !! stats are wrong due to missing data !!
# pdf(file='figures/hrclim_mean_annual_temp_map.pdf', width=7, height=6)
spplot(d.sub, zcol='temp', key.space='right', col.regions=cols.palette(10), cuts=9)
# dev.off()

# look at internal structure in detail
str(d.sub)


# project to UTM (cartesian coordinate system)
# again, using proj4 notation
d.sub.utm <- spTransform(d.sub, 
CRS('+proj=utm +zone=33 +ellps=WGS84 +datum=WGS84 +units=m')) 

# check conversion: OK
dimnames(d.sub.utm@coords)[[2]] <- c('x','y') # fix projected coordinate names

# align LL, and xy coordinates side-by-side
head(cbind(coordinates(d.sub), coordinates(d.sub.utm)))


## basic distance calculation
# extract the first station, using [i,j] notation
st.1 <- d.sub.utm[1, ]

# basic concept
coordinates(st.1)
coordinates(st.1)[, 1]
coordinates(st.1)[, 2]

# compute distance from first station to all others
dx <- coordinates(d.sub.utm)[,1] - coordinates(st.1)[,1]
dy <- coordinates(d.sub.utm)[,2] - coordinates(st.1)[,2]

# results are vectors, talk about vectorized computation
dx


# distance to the nearest meter via pythagorean eq. 
# note that there is no for-loop
d.sub.utm$distance <- round(sqrt(dx^2 + dy^2))


# depict linear distance from station 1, and supress legend
# note method used to highlight station 1
# pdf(file='figures/hrclim_dist_from_st_1.pdf', width=6, height=6)
spplot(
d.sub.utm, zcol='distance', col.regions=cols.palette(ncuts), cuts=ncuts-1, auto.key=FALSE, par.settings=list(superpose.symbol=list(pch=21)),
sp.layout=list('sp.points', st.1, pch=0, cex=2, col='black')
)
# dev.off()

# azimuth from first station to all others
# converted to degrees CW from North (0 deg), truncated to 0-360, rounded to nearest degree
d.sub.utm$azimuth <- round((90 - (atan2(dy, dx) * 180 / pi)) %% 360)

# depict az from station 1, and supress legend
# pdf(file='figures/hrclim_az_from_st_1.pdf', width=6, height=6)
spplot(d.sub.utm, zcol='azimuth', col.regions=cols.palette(ncuts), cuts=ncuts-1, auto.key=FALSE,
sp.layout=list('sp.points', d.sub.utm[1, ], pch=0, cex=2, col='black'), 
par.settings=list(superpose.symbol=list(pch=21)))
# dev.off()


## simple spatial clustering-- based on linear distances
# using PAM algorithm, variables standardized, 5 classes
d.clara <- clara(coordinates(d.sub.utm), stand=TRUE, k=5)

# inspect, results no too surprising, given input
str(d.clara)
clusplot(d.clara)

# save clustering vector to our SPDF
d.sub.utm$cluster <- factor(d.clara$clustering)

# simple map with cluster and temperature summary
# pdf(file='figures/spatial_cluster_ex_1.pdf', width=8, height=4)
par(mar=c(2.25,1.5,1,1), mfcol=c(1,2))
plot(d.sub.utm, col=cols.set1[d.clara$clustering], pch=1)
points(d.clara$medoids, col=cols.set1[1:5], pch=0, cex=1.5, lwd=2)
legend('bottomleft', legend=c(1:5), pch=15, col=cols.set1[1:5])
box()

# summarize mean annual temp, by spatial cluster
boxplot(temp ~ cluster, data=d.sub.utm@data, varwidth=TRUE, 
boxwex=0.5, las=1, border=cols.set1[1:5])
# dev.off()

# try again, this time using temperature data (single time-slice)
d.clara.1 <- clara(cbind(coordinates(d.sub.utm), d.sub.utm$temp), stand=TRUE, k=5)
d.sub.utm$cluster.1 <- factor(d.clara.1$clustering)

# compare methods, note that cluster labels aren't always the same
par(mar=c(2.25,1.5,3,1), mfcol=c(1,2))
plot(d.sub.utm, col=cols.set1[d.clara$clustering], pch=1)
points(d.clara$medoids, col=cols.set1[1:5], pch=0, cex=1.5, lwd=2)
legend('bottomleft', legend=c(1:5), pch=15, col=cols.set1[1:5])
box() ; title('x,y')

plot(d.sub.utm, col=cols.set1[d.clara.1$clustering], pch=1)
points(d.clara.1$medoids, col=cols.set1[1:5], pch=0, cex=1.5, lwd=2)
box() ; title('x,y')

## more spatial opertions
# check out k-closest spatial neighbors
# pdf(file='figures/knn_example.pdf', width=8, height=2.5)
par(mfcol=c(1,3), mar=c(0,0,1,0))
for(i in 2:4)
  {
	nb <- knn2nb(knearneigh(coordinates(d.sub.utm), k=i))
	plot(nb, coordinates(d.sub.utm), col='Orange', pch=16)
	title(paste(i, ' closest neighbors'))
	}
# dev.off()

# wait, how does that 'nb' object work?
data.frame(number.of.stations=nrow(d.sub.utm), items.in.nb=length(nb))

# nb is a list, check the neighbor indices for the first station
nb[[1]]

# therefore:
d.sub.utm$station[nb[[1]]]

# look-up data for the 4 nn, for first station:
rbind(d.sub.utm[1,], d.sub.utm[nb[[1]], ])


## compare each station's temp with mean of nearest 4 stations
## might be useful for filling missing data

# mean of 4 NN related to station 1
mean(d.sub.utm$temp[nb[[1]]])

# for every station, compute mean of 4 NN
# define a function, that will compute the mean temperature
# based on the set of indices
# note scoping rules, and deletion of NA
f <- function(i) mean(d.sub.utm$temp[i], na.rm=TRUE)

# check
all.equal(f(1:10), mean(d.sub.utm$temp[1:10]))

# iterate over the elements of nb (a list)
# and sending the contents of each element to function 'f'
# collate the results into a vector
d.sub.utm$nn4_mean_temp <- sapply(nb, f)

# tangent: sapply vs lapply
head(sapply(nb, f)) # 'simple apply' returns a vector
head(lapply(nb, f)) # 'list apply' returns a list

# check structure of our DF
head(as.data.frame(d.sub.utm[101:105, c('temp', 'nn4_mean_temp')]))

# clear?



# evaluate with linear model: how well do 4 NN predict measured temp?
# hmm not so well
summary(l <- lm(temp ~ nn4_mean_temp, data=d.sub.utm))



# visual check -- this method of data-filling 
# may be more useful in some regions than others
# pdf(file='figures/temp_vs_nn4_mean_temp.pdf', width=6, height=6)
par(mar=c(4.5,4.5,1,1))
plot(temp ~ nn4_mean_temp, data=d.sub.utm, asp=1, type='n',
ylab='Measured Temperature', xlab='Mean Temperature of Nearest 4 Stations')
grid(col=grey(0.5))
abline(l, lwd=2)
points(temp ~ nn4_mean_temp, data=d.sub.utm, bg=cols.set1[d.sub.utm$cluster], pch=21)
legend('bottomright', legend=c(1:5), pch=21, pt.bg=cols.set1[1:5])
# dev.off()

## how much deviation by cluster?
# note funny syntax
ddply(as.data.frame(d.sub.utm), 
.(cluster), .fun=summarize, 
MAE=mean(abs(temp - nn4_mean_temp), na.rm=TRUE)
)

# compare with plot ...

## just for demonstration purposes, fill-missing data with mean of NN4
# looks like there is only 1 station missing the entire year's worth of data
d.missing.idx <- which(is.na(d.sub.utm$temp))
d.sub.utm[d.missing.idx, ]

# general approach to filling NA (annual data) from NN4 mean
d.sub.utm$temp_final <- with(as.data.frame(d.sub.utm), ifelse(is.na(temp), nn4_mean_temp, temp))

# check: OK
head(as.data.frame(d.sub.utm[101:105, c('temp', 'temp_final', 'nn4_mean_temp')]))


# bonus for later in the week
# fill all missing data in time-series with mean from nearest 4 stations


## OGR demo
# read-in local country boundaries (polygons)
countries <- readOGR(dsn='static_data', layer='countries_s')
# set CRS
proj4string(countries) <- '+proj=utm +zone=33 +ellps=WGS84 +datum=WGS84 +units=m +no_defs'

# check internal structure [truncated]
str(countries[1, ])


# simple map, but PDF format is nice for publication
# pdf(file='figures/countries_map.pdf', width=6, height=4)
spplot(countries, zcol='NAME', col.regions=brewer.pal('Set2', n=nrow(countries)))
# dev.off()


##  extended demo, sampling SPolyDF
plot(countries)
points(spsample(countries, type='stratified', n=10), col='red', pch=16)

# !! that didn't do what I thought it would!

# hexagonal grid from lower-left corner
s <- sapply(slot(countries, 'polygons'), function(i) spsample(i, n=10, type='hexagonal', offset=c(0,0)))

# stack into a single SpatialPoints object
s.merged <- do.call('rbind', s)

# extract the original IDs
ids <- sapply(slot(countries, 'polygons'), function(i) slot(i, 'ID'))

# determine the number of ACTUAL sample points generated for each poly
npts <- sapply(s, function(i) nrow(i@coords))

# generate a reconstituted vector of point IDs
pt_id <- rep(ids, npts)

# promote to SpatialPointsDataFrame
s.final <- SpatialPointsDataFrame(s.merged, data=data.frame(poly_id=pt_id))

# check:
plot(countries) ; points(s.final, col='red', pch=3, cex=0.5)


## 
## read-in DEM data, via maptools functions
dem <- read.asciigrid('static_data/HRdem.asc')

# re-name elevation data... pathnames can cause problems later
names(dem) <- 'elev'

# mask-out missing data (coded as 0)
dem$elev[dem$elev == 0] <- NA

# set CRS
proj4string(dem) <- '+proj=utm +zone=33 +ellps=WGS84 +datum=WGS84 +units=m +no_defs'

# internal structure
str(dem)


## GDAL demo
# try saving as a GeoTiff, will embedd NODATA and CRS
writeGDAL(dem, fname='static_data/dem.tif', type='Int32', driver='GTiff')
# check image details
GDALinfo('static_data/dem.tif')

# optionally clean-up
# unlink('static_data/dem.tif')


# visual check: do points line-up with the DEM?
# ... looks like it
image(dem, col=cols.palette(ncuts))
points(d.sub.utm)

# sample DEM at stations [sp methods]
d.sub.utm$elev <- overlay(dem, d.sub.utm)@data[, 1]

head(as.data.frame(d.sub.utm))

# what does mean ann. temp ~ elev look like
# not so linear...
plot(temp ~ elev, data=d.sub.utm)



## extended example of overlay using {raster}
# sample DEM at stations [raster methods]
dem.r <- raster(dem)
str(dem.r)

# sampling
# vector of length(d.sub.utm) returned
e.simple <- extract(dem.r, d.sub.utm, method='simple') # same result as overlay()
e.bilinear <- extract(dem.r, d.sub.utm, method='bilinear') # sample 4-nearest cells

# list of length(d.sub.utm) returned: takes a while
e.buffer.list <- extract(dem.r, d.sub.utm, buffer=2000) # buffer of 2000 map units (m)
# reduce to vector, by taking element-wise mean
e.buffer <- sapply(e.buffer.list, mean)

# combine and compute summaries
e.combined <- make.groups(e.simple, e.bilinear, e.buffer)
by(e.combined$data, e.combined$which, summary)


# make a nice map: note custom panel function
# png(file='figures/hrclim_nice_map.png', width=600, height=550)
spplot(dem, col.regions=cols.palette(ncuts), cuts=ncuts-1,
colorkey=list(labels=list(cex=1.25)), panel=function(...){
  panel.gridplot(...)
  sp.polygons(countries, col='black')
  sp.points(d.sub.utm, col='black', pch=1)
  })
# dev.off()


## more demos from Ca Soil Resource Website



