142 lines
4.6 KiB
R
142 lines
4.6 KiB
R
# Lab 2 for the University of Tulsa's CS-6643 Bioinformatics Course
|
||
# Hierarchical clustering (upgma and linkage matrix), WGCNA, Multidimensional scaling
|
||
# Professor: Dr. McKinney, Fall 2022
|
||
# Noah L. Schrick - 1492657
|
||
|
||
## Set Working Directory to file directory - RStudio approach
|
||
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
|
||
|
||
#### Part A: hclust
|
||
## 1 - Writing a 2-column linkage matrix
|
||
# No supporting code - see report PDF for work.
|
||
|
||
## 2 - Plot the tree based on Part 1 results
|
||
# initialize an empty list that will contain fields of our hclust object
|
||
a <- list()
|
||
# encoding rules:
|
||
# negative numbers are leaves (A,B,...,E) -> (-1,-2,...,-5)
|
||
# positive are merged clusters (defined by row number in $merge)
|
||
|
||
# each merged pair in a row
|
||
a$merge <- rbind(c(-1, -2), # +1 (A-B)
|
||
c( 1, -3), # +2 (A-B)-C
|
||
c(-4, -5), # +3 (D-E)
|
||
c( 2, 3), # +4 ((A-B)-C)-(DE)
|
||
c( 4, -6)) # +5 (((A-B)-C)-(DE))-F
|
||
|
||
a$height <- c(1, 2, 1.5, 3, 4) # merge heights
|
||
a$order <- 1:6 # order of leaves
|
||
a$labels <- LETTERS[1:6] # labels of leaves as Letters
|
||
class(a) <- "hclust" # force a to be hclust object
|
||
# plot the tree
|
||
# plot knows that a is an hclust object and plots accordingly
|
||
plot(a,hang=-1) # or use plot(as.dendrogram(a))
|
||
|
||
#### Part B - UPGMA and WGCNA
|
||
## 1 - Read in the distance matrix and plot the dendrogram
|
||
my.dist1 <- read.table("distance_matrix.txt",sep="\t", header=T)
|
||
rownames(my.dist1) = colnames(my.dist1)
|
||
my.dist1[lower.tri(my.dist1)] = t(my.dist1)[lower.tri(my.dist1)]
|
||
hc1 <- hclust(as.dist(my.dist1), method="average")
|
||
hc1
|
||
plot(hc1,hang=-1)
|
||
|
||
|
||
## 2 - Paste the distance matrix in the report. Use header=T
|
||
|
||
## 3 - no supporting code
|
||
|
||
## 4 - install and run WGCNA
|
||
# Install
|
||
if (!require("BiocManager")) install.packages("BiocManager")
|
||
library(BiocManager)
|
||
if (!require("WGCNA"))
|
||
BiocManager::install("WGCNA")
|
||
|
||
# load WGCNA and run on the hc1 dendrogram
|
||
cutree(hc1,k=3) # hclust cluster labels with k=3 clusters
|
||
library(WGCNA)
|
||
dynamicMods = cutreeDynamic(
|
||
dendro = hc1,
|
||
distM = my.dist1,
|
||
deepSplit = 2,
|
||
pamRespectsDendro = FALSE,
|
||
minClusterSize = 2)
|
||
names(dynamicMods) <- colnames(my.dist1)
|
||
|
||
## 5 - identify clusters
|
||
dynamicMods
|
||
|
||
|
||
## 6 - show clustered data
|
||
dynamicColors = labels2colors(dynamicMods) # colors might not work
|
||
table(dynamicColors)
|
||
# Plot the dendrogram and colors underneath
|
||
sizeGrWindow(8,6)
|
||
plotDendroAndColors(hc1,
|
||
dynamicColors,
|
||
"Dynamic Tree Cut",
|
||
dendroLabels = NULL,
|
||
hang = -1,
|
||
addGuide = TRUE,
|
||
#guideHang = 0.05,
|
||
main = "dynamic cut clustering")
|
||
|
||
#### Part C - Distance Matrix
|
||
## 1 - no supporting code
|
||
|
||
## 2 - no supporting code
|
||
|
||
## 3 - Plot the dendrogram
|
||
# initialize an empty list that will contain fields of our hclust object
|
||
a <- list()
|
||
# encoding rules:
|
||
# negative numbers are leaves (A,B,...,E) -> (-1,-2,...,-5)
|
||
# positive are merged clusters (defined by row number in $merge)
|
||
|
||
# each merged pair in a row
|
||
a$merge <- rbind(c(-1, -2), # +1 (A-B)
|
||
c(1, -4), # +2 (A-B)-D
|
||
c(2, -3)) # +3 ((A-B)-D)-C
|
||
|
||
|
||
a$height <- c(0.14, 0.305, 0.51) # merge heights
|
||
a$order <- 1:4 # order of leaves
|
||
a$labels <- LETTERS[1:4] # labels of leaves as Letters
|
||
class(a) <- "hclust" # force a to be hclust object
|
||
# plot the tree
|
||
# plot knows that a is an hclust object and plots accordingly
|
||
plot(a) # or use plot(as.dendrogram(a))
|
||
|
||
|
||
## 4 - Verify results
|
||
# create distance matrix
|
||
my.dist2 <- matrix(data=rep(0,16),ncol=4)
|
||
my.dist2[1,2]<-.14; my.dist2[1,3]<-.48; my.dist2[1,4]<-.33;
|
||
my.dist2[2,3]<-.50; my.dist2[2,4]<-.28;
|
||
my.dist2[3,4]<-.55;
|
||
# make matrix symmetric
|
||
# make the lower triangle equal to the upper triangle
|
||
my.dist2[lower.tri(my.dist2)] = t(my.dist2)[lower.tri(my.dist2)]
|
||
diag(my.dist2)<-999 # big number to guarantee it’s not the min
|
||
# compare with hclust
|
||
test <- hclust(as.dist(my.dist2), method="average") # average linkage, UPGMA
|
||
test$merge
|
||
test$height
|
||
test$order
|
||
test$labels<-LETTERS[1:4]
|
||
plot(test)
|
||
|
||
#### Part D - MDS
|
||
## 1 - Show the 2D MDS
|
||
## MDS, you might need to refresh (sweep) the plot window
|
||
locs<-cmdscale(as.dist(my.dist2), k=2) # k is number of mds components
|
||
x<-locs[,1]
|
||
y<-locs[,2]
|
||
# pch=NA hides plot symbols:
|
||
plot(x,y,main="Multi-dimensional Scaling",
|
||
xlab="MDS dimension-1", ylab="MDS dimension-2", pch=NA,asp=1)
|
||
# pch specifies plot symbol; we want null because we are using letter next
|
||
# plot the text labels instead of symbols:
|
||
text(x,y,test$labels,cex=1)
|