Submitted by dylan on Fri, 2006-03-10 01:21.
Hierarchical clustering methods can be used to create a series of nested groupings of data based on the interplay between variables associated with an observation. In order for the algorithm to produce meaningful results data often need to be scaled accordingly. The first example, CA 2004 election results by county, illustrates the need to convert absolute vote counts into relative percentages of a total vote within a county. More information on the various clustering analysis packages for R can be found here.
Figure 1: CA 2004 election
Figure 3: Soil PSA data
Figure 4
Example 1: CA 2004 Election Data
Preprocessing:
#grab the election data, and save to ca_election.data
http://www.usatoday.com/news/politicselections/vote2004/PresidentialByCounty.aspx?oi=P&rti=G&sp=CA&tf=l
#remove unwanted columns and commas
awk -F"\t" 'BEGIN{OFS = "\t"} {gsub(",",""); print $1, $4, $5}' ca_election.data > ca_election-clean.data
R Code:
#read in the election data and use the first column as the row names
x <- read.table("ca_election-clean.data", sep="\t", header=T, row.names=1)
#initial hcluster: biased by total county population --> no good !
hc <- hclust(log(dist(x)), "ave")
plot(hc, hang=.5, cex=0.8)
#normalize population influence by turning votes for each candidate into a percentage
#add these normalized percentages to the original data frame x
x$B <- x$Bush / (x$Bush + x$Kerry)
x$K <- x$Kerry / (x$Bush + x$Kerry)
#subset x to form a new data frame y: keeping only our normalized vote percentages:
y <- subset(x, select=B:K)
#re-cluster and dendrogram class, plotting looks nicer
hc <- hclust(dist(y), "ave")
dend <- as.dendrogram(hc)
#calculate total votes per county and scale, generate lists of red and blue counties
#note that these are global variables
pop <<- log((x$Bush + x$Kerry) / sum(x$Bush + x$Kerry) * 1000, 10) / 2
reds <<- as.factor(row.names(y[y$B > 0.5, ]))
blues <<- as.factor(row.names(y[y$K > 0.5, ]))
#define a function for coloring and sizing node elements:
colLab <- function(n)
{
if(is.leaf(n))
{
a <- attributes(n)
if ( length(which(blues == a$label)) == 1 )
{
attr(n, "nodePar") <- c(a$nodePar, list(lab.col = "blue", lab.cex=.7, col="blue", cex=pop[n], pch=16 ))
}
else
{
attr(n, "nodePar") <- c(a$nodePar, list(lab.col = "red", lab.cex=.7, col="red", cex=pop[n], pch=16))
}
}
n
}
#modfiy dendrogram nodes and re-plot
dend_colored <- dendrapply(dend, colLab)
plot(dend_colored, main="CA 2004 Election Results by County")
legend(x=55, y=14, legend=c("Dem", "Rep"), col=c("blue", "red"), cex=0.5, pch=16)
Example 3: Soil Particle Size Analysis Data
require(plotrix)
#read in processed granulometer data:
run1 <- read.table("gran_runs_1-4.dat", header=T)
row.names(run1) <- run1$hz_id
#subset all texture data:
x <- subset(run1, select=surf_area:clay)
#just the pedon_ids:
pedon_ids <<- unique(run1$pedon_id)
#function definition for uniq_pedon_color() and colLab()
uniq_pedon_color <- function(pedon_id_list)
{
col_list <- numeric()
len <- length(unique(pedon_id_list))
rainbow_cols <-rainbow(len)
pedon_ids <- table(run1$pedon_id)
for ( i in 1:len )
{
a <- rep(rainbow_cols[i],pedon_ids[i])
col_list <- c(col_list,a)
}
col_list
}
#define a function for coloring and sizing node elements:
colLab <- function(n)
{
a <- attributes(n)
if(is.leaf(n))
{
attr(n, "nodePar") <- c(a$nodePar, list(lab.col = lab_cols[n], lab.cex=.7, cex=0 ))
}
n
}
#normalize the data by dividing be the mean in each col
attach(x)
x_stand <- matrix( c(surf_area/mean(surf_area), vf_sand/mean(vf_sand),
fi_sand/mean(fi_sand), med_sand/mean(med_sand), co_sand/mean(co_sand),
vc_sand/mean(vc_sand), sand/mean(sand), silt/mean(silt), clay/mean(clay)), ncol=9 )
detach(x)
row.names(x_stand) <- run1$hz_id
#clustering ideas:
hc <- hclust(dist(x_stand), "ave")
dend <- as.dendrogram(hc)
#generate a list of lable colors based on the ordering of the pedon_ids in the input file
#note that pedon_ids must be in order
lab_cols <<- uniq_pedon_color(run1$pedon_id)
#modfiy dendrogram nodes and re-plot
dend_colored <- dendrapply(dend, colLab)
#plot the results
par(mfcol=c(1,2))
plot(dend_colored, horiz=T)
soil.texture(ssc, show.lines=TRUE\, show.names=TRUE\, show.grid=TRUE\, pch=18, col.symbols=lab_cols, bg.names="gray")
Example 4: Soil Particle Size Split into 3 classes via kmeans() Figure 4
#re-color dendrogram based on a 3 class, kmeans() classification:
x_kmeans <- kmeans(x_stand, 3 ,nstart=10)
k_means_cols <- as.vector(x_kmeans$cluster)
lab_cols <<- k_means_cols
#reverse-transform the 3 centroids
x_kmeans_center_texture <- matrix( cbind(x_kmeans$centers[,7] * mean(x$sand) , x_kmeans$centers[,8] * mean(x$silt), x_kmeans$centers[,9] *
mean(x$clay)), ncol=3 )
#re-plot dendrogram, with new colors, and soil texture triangle
dend_colored <- dendrapply(dend, colLab)
plot(dend_colored, horiz=T)
soil.texture(ssc, show.lines=TRUE\, show.names=TRUE\, show.grid=TRUE\, pch=18, col.symbols=lab_cols, bg.names="gray")
#plot the centroids on the soil texture triange
show.soil.texture(x_kmeans_center_texture, pch=8, col=1)