CS-6643-Bioinformatics-Lab-2/Schrick-Noah_CS-6643_Lab-2.R
2022-09-15 17:48:15 -05:00

142 lines
4.6 KiB
R
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Lab 2 for the University of Tulsa's CS-6643 Bioinformatics Course
# Hierarchical clustering (upgma and linkage matrix), WGCNA, Multidimensional scaling
# Professor: Dr. McKinney, Fall 2022
# Noah L. Schrick - 1492657
## Set Working Directory to file directory - RStudio approach
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
#### Part A: hclust
## 1 - Writing a 2-column linkage matrix
# No supporting code - see report PDF for work.
## 2 - Plot the tree based on Part 1 results
# initialize an empty list that will contain fields of our hclust object
a <- list()
# encoding rules:
# negative numbers are leaves (A,B,...,E) -> (-1,-2,...,-5)
# positive are merged clusters (defined by row number in $merge)
# each merged pair in a row
a$merge <- rbind(c(-1, -2), # +1 (A-B)
c( 1, -3), # +2 (A-B)-C
c(-4, -5), # +3 (D-E)
c( 2, 3), # +4 ((A-B)-C)-(DE)
c( 4, -6)) # +5 (((A-B)-C)-(DE))-F
a$height <- c(1, 2, 1.5, 3, 4) # merge heights
a$order <- 1:6 # order of leaves
a$labels <- LETTERS[1:6] # labels of leaves as Letters
class(a) <- "hclust" # force a to be hclust object
# plot the tree
# plot knows that a is an hclust object and plots accordingly
plot(a,hang=-1) # or use plot(as.dendrogram(a))
#### Part B - UPGMA and WGCNA
## 1 - Read in the distance matrix and plot the dendrogram
my.dist1 <- read.table("distance_matrix.txt",sep="\t", header=T)
rownames(my.dist1) = colnames(my.dist1)
my.dist1[lower.tri(my.dist1)] = t(my.dist1)[lower.tri(my.dist1)]
hc1 <- hclust(as.dist(my.dist1), method="average")
hc1
plot(hc1,hang=-1)
## 2 - Paste the distance matrix in the report. Use header=T
## 3 - no supporting code
## 4 - install and run WGCNA
# Install
if (!require("BiocManager")) install.packages("BiocManager")
library(BiocManager)
if (!require("WGCNA"))
BiocManager::install("WGCNA")
# load WGCNA and run on the hc1 dendrogram
cutree(hc1,k=3) # hclust cluster labels with k=3 clusters
library(WGCNA)
dynamicMods = cutreeDynamic(
dendro = hc1,
distM = my.dist1,
deepSplit = 2,
pamRespectsDendro = FALSE,
minClusterSize = 2)
names(dynamicMods) <- colnames(my.dist1)
## 5 - identify clusters
dynamicMods
## 6 - show clustered data
dynamicColors = labels2colors(dynamicMods) # colors might not work
table(dynamicColors)
# Plot the dendrogram and colors underneath
sizeGrWindow(8,6)
plotDendroAndColors(hc1,
dynamicColors,
"Dynamic Tree Cut",
dendroLabels = NULL,
hang = -1,
addGuide = TRUE,
#guideHang = 0.05,
main = "dynamic cut clustering")
#### Part C - Distance Matrix
## 1 - no supporting code
## 2 - no supporting code
## 3 - Plot the dendrogram
# initialize an empty list that will contain fields of our hclust object
a <- list()
# encoding rules:
# negative numbers are leaves (A,B,...,E) -> (-1,-2,...,-5)
# positive are merged clusters (defined by row number in $merge)
# each merged pair in a row
a$merge <- rbind(c(-1, -2), # +1 (A-B)
c(1, -4), # +2 (A-B)-D
c(2, -3)) # +3 ((A-B)-D)-C
a$height <- c(0.14, 0.305, 0.51) # merge heights
a$order <- 1:4 # order of leaves
a$labels <- LETTERS[1:4] # labels of leaves as Letters
class(a) <- "hclust" # force a to be hclust object
# plot the tree
# plot knows that a is an hclust object and plots accordingly
plot(a) # or use plot(as.dendrogram(a))
## 4 - Verify results
# create distance matrix
my.dist2 <- matrix(data=rep(0,16),ncol=4)
my.dist2[1,2]<-.14; my.dist2[1,3]<-.48; my.dist2[1,4]<-.33;
my.dist2[2,3]<-.50; my.dist2[2,4]<-.28;
my.dist2[3,4]<-.55;
# make matrix symmetric
# make the lower triangle equal to the upper triangle
my.dist2[lower.tri(my.dist2)] = t(my.dist2)[lower.tri(my.dist2)]
diag(my.dist2)<-999 # big number to guarantee its not the min
# compare with hclust
test <- hclust(as.dist(my.dist2), method="average") # average linkage, UPGMA
test$merge
test$height
test$order
test$labels<-LETTERS[1:4]
plot(test)
#### Part D - MDS
## 1 - Show the 2D MDS
## MDS, you might need to refresh (sweep) the plot window
locs<-cmdscale(as.dist(my.dist2), k=2) # k is number of mds components
x<-locs[,1]
y<-locs[,2]
# pch=NA hides plot symbols:
plot(x,y,main="Multi-dimensional Scaling",
xlab="MDS dimension-1", ylab="MDS dimension-2", pch=NA,asp=1)
# pch specifies plot symbol; we want null because we are using letter next
# plot the text labels instead of symbols:
text(x,y,test$labels,cex=1)