Tips on Using the 'sp' classes in R

 
Premise
The sp package provides several useful classes and methods for working with spatial data in R. The documentation is complete for the main set of classes and methods, however there are few listing of extensive examples that may cover some of the less frequently encountered situations. Some examples of where the complexity of the sp classes can cause confusion include:

  • dealing with NA values
  • preserving NA values during import/export
  • replacing data attached to an sp class object
  • plotting several spatial data at once

In addition, there are still some minor bugs to be worked out in terms of reading/writing vector data using methods in the rgdal package. This is currently being worked on by members of the sp and GRASS development community.

 
Joining New Data to an Existing sp Object

# use to read in some vector data
library(rgdal)

# read something in, rows are identified by a column called 'id'
spatial.data <- readOGR(...)

# read in some tabular data, rows are identified by a column called 'id'
new_table <- read.csv(...)

# 'join' the new data with merge()
# all.x=TRUE is used to ensure we have the same number of rows after the join
# in case that the new table has fewer
merged <- merge(x=spatial.data@data, y=new_table, by.x='id', by.y='id', all.x=TRUE)

# generate a vector that represents the original ordering of rows in the sp object
correct.ordering <- match(spatial.data@data$id, merged$id)

# overwrite the original dataframe with the new merged dataframe, in the correct order
spatial.data@data <- merged[correct.ordering, ]

# check the ordering of the merged data, with the original spatial data
cbind(spatial.data@data$id, merged$id[correct.ordering])

 
Correctly Write 'NA' Values to Shapefile [bug in writeOGR()]

# libraries we need
require(rgdal)
require(foreign)

# pass 1: write the shapefile
writeOGR(spatial.data, dsn='new_folder', driver='ESRI Shapefile', layer='new_layer')

# re-make the DBF:
write.dbf(spatial.data@data, file='new_folder/new_layer.dbf')

 
Handle NA in Several Raster Images (when all maps share the same NA spatial distribution)

# need this to read in GRASS data
require(spgrass6)

# read in several rasters into a new sp object:
x <- readRAST6(c('r1','r2','r3','r4'))

# we want to do some analysis that will not work when all rasters contain NA
# save a vector of positions where all of the rasters have a value of NA
x.nas <- which(is.na(x@data$r1) & is.na(x@data$r2) & is.na(x@data$r3) & is.na(x@data$r4))

# save a vector of positions that have no NA in any raster
x.vals <- which( !is.na(x@data$r1) & !is.na(x@data$r2) & !is.na(x@data$r3) & !is.na(x@data$r4))

# extract the records that only have complete data (no NA)
x.no.na <- x@data[x.vals, ]

# do something with the data
require(cluster)
x.clara <- clara(x.no.na, k=6, stand=TRUE)

# add a new column to the original sp object to contain the results of the analysis
x@data$terrain_group <- NA

# fill in the locations where there were complete records with the results
# note that we use the original locations of the non-NA values
x@data$terrain_group[x.vals] <- x.clara$clustering

# save result back to GRASS
writeRAST6(x, zcol='cluster', vname='x_groups')

 
Handle NA in Several Raster Images (when maps have different NA spatial distribution)

# this one is from R. Bivand (thanks)
# assume that we are working with the 'data' slot on an sp opject:
# generate some data; sprinkle in some NA ; convert to DF
x <- rnorm(100)
x[sample(1:100, 5)] <- NA
x.mat <- round(matrix(x, ncol=5), 2)
x.df <- as.data.frame(x.mat)

# extract an NA mask
x.na_mask <- complete.cases(x.df)
 
# just the 'rows' with complete data (across bands)
 x.df[x.na_mask,]

      V1    V2    V3    V4    V5
1   1.32 -1.27  0.00 -0.87  1.52
2  -0.44  1.33  0.78 -1.27  0.00
5   2.70 -0.08 -0.77  0.38  0.32
6  -0.52 -0.07  0.13 -0.43  1.12
7  -0.63  2.02  0.45  0.48 -0.59
8  -0.52 -0.78 -0.59 -0.07 -0.08
11 -1.37 -1.15  0.23  0.73  0.07
12 -0.11 -0.66  0.50 -1.14  0.71
13 -0.38 -0.08  1.00 -0.88 -0.19
14 -1.14 -0.45 -1.37 -0.43 -2.18
15  1.92  0.46 -0.72  0.12  0.27
16  0.51  0.22 -0.39 -1.54 -1.04
17  1.14 -1.11  1.04  0.00 -1.11
18  0.53  0.62 -0.10 -0.17  0.15
19  0.84  0.27 -1.01  0.87  0.31
20  0.82  0.87  0.46  1.35  0.13


# all the data
x.df
      V1    V2    V3    V4    V5
1   1.32 -1.27  0.00 -0.87  1.52
2  -0.44  1.33  0.78 -1.27  0.00
3  -0.31 -0.13 -1.14  2.08    NA
4   0.76    NA -1.20 -0.54 -0.85
5   2.70 -0.08 -0.77  0.38  0.32
6  -0.52 -0.07  0.13 -0.43  1.12
7  -0.63  2.02  0.45  0.48 -0.59
8  -0.52 -0.78 -0.59 -0.07 -0.08
9  -0.38 -0.51  0.44    NA -0.81
10 -1.38    NA -0.65 -0.50    NA
11 -1.37 -1.15  0.23  0.73  0.07
12 -0.11 -0.66  0.50 -1.14  0.71
13 -0.38 -0.08  1.00 -0.88 -0.19
14 -1.14 -0.45 -1.37 -0.43 -2.18
15  1.92  0.46 -0.72  0.12  0.27
16  0.51  0.22 -0.39 -1.54 -1.04
17  1.14 -1.11  1.04  0.00 -1.11
18  0.53  0.62 -0.10 -0.17  0.15
19  0.84  0.27 -1.01  0.87  0.31
20  0.82  0.87  0.46  1.35  0.13

# generate something cool from complete data
library(MASS)
p <- princomp(~ . , data=x.df[x.na_mask,])

# assign back to original data
x.df$pca_1[x.na_mask] <-  predict(p)[,1]