commit f17c2a1422192efbd3aa16ddb35045a4a27a707f Author: noah Date: Thu Sep 8 14:58:43 2022 -0500 Lab 2 init and README diff --git a/README.md b/README.md new file mode 100644 index 0000000..c65e941 --- /dev/null +++ b/README.md @@ -0,0 +1,141 @@ +# Bioinformatics Lab 2 + +## Part A - hclust + +1.) Write a 2-column linkage matrix using the following hclust rules + +- Label the objects A, B, C, D, E, F as negative numbers -1, -2, -3, -4, -5, -6 +- Put the closest pair of objects in the first row of the matrix. The first row will be: -1, -2 +- Merge AB (1) with C (-3) +- Repeat for the rest of the tree. +- Show the full merge matrix and the heights of each row in the merge matrix. + +2.) Create a script with the following *incomplete* code. Based on your UPGMA merging calculation and your linkage matrix above, fill in the blanks in the code below to plot the tree. Show the resulting plot. + + # initialize an empty list that will contain fields of our hclust object + a <- list() + # encoding rules: + # negative numbers are leaves (A,B,...,E) -> (-1,-2,...,-5) + # positive are merged clusters (defined by row number in $merge) + + # each merged pair in a row + a$merge <- rbind(c(-1, -2), # +1 (A-B) + c(1,-3) , # +2 (A-B)-C + _______ , # +3 (D-E) + _______ , # +4 ((A-B)-C)-(DE) + _______) # +5 (((A-B)-C)-(DE))-F + + a$height <- c(1, 2, 1.5, 3, 4) # merge heights + a$order <- 1:6 # order of leaves + a$labels <- LETTERS[1:6] # labels of leaves as Letters + class(a) <- "hclust" # force a to be hclust object + # plot the tree + # plot knows that a is an hclust object and plots accordingly + plot(a,hang=-1) # or use plot(as.dendrogram(a)) + +## Part B - UPGMA and WGCNA + +1) Add the code below to your script to read in the distance matrix distance_matrix.txt and plot the dendrogram. Show the dendrogram. + + # Set the Working Directory First + my.dist1 <- read.table("distance_matrix.txt",sep="\t", header=T) + rownames(my.dist1) = colnames(my.dist1) + my.dist1[lower.tri(my.dist1)] = t(my.dist1)[lower.tri(my.dist1)] + hc1 <- hclust(as.dist(my.dist1), method="average") + plot(hc1,hang=-1) + +2.) View the distance matrix in Excel and paste it in the report. Use header=T. + +3.) What does header=T do in this situation? + +4.) Add the following code to install and run WGCNA. + + # method 1 + install.packages("WGCNA", dependencies=T) + source("http://bioconductor.org/biocLite.R") + biocLite(c("AnnotationDbi", "impute", "GO.db", "preprocessCore")) + biocLite("WGCNA") + install.packages("WGCNA") + + # method 2 (easier method) + install.packages("BiocManager") + library(BiocManager) + BiocManager::install("WGCNA") + + # load WGCNA and run on the hc1 dendrogram + cutree(hc1,k=3) # hclust cluster labels with k=3 clusters + library(WGCNA) + dynamicMods = cutreeDynamic( + dendro = hc1, + distM = my.dist1, + deepSplit = 2, + pamRespectsDendro = FALSE, + minClusterSize = 2) + names(dynamicMods) <- colnames(my.dist1) + +5.) Looking at dynamicMods, what leaves are clustered together? + +6.) Add the following code to show the clustered data. What do the colors at the bottom represent? + + dynamicColors = labels2colors(dynamicMods) # colors might not work + table(dynamicColors) + # Plot the dendrogram and colors underneath + sizeGrWindow(8,6) + plotDendroAndColors(hc1, + dynamicColors, + "Dynamic Tree Cut", + dendroLabels = NULL, + hang = -1, + addGuide = TRUE, + #guideHang = 0.05, + main = "dynamic cut clustering") + +## Part C - Distance Matrix + +| <> | A | B | C | D | +|:------:|:------:|:------:|:------:|:------:| +| A | x | 0.14 | 0.48 | 0.33 | +| B | | x | 0.5 | 0.28 | +| C | | | x | 0.55 | +| D | | | | x | + + +1.) Do the UPGMA by hand, showing the new UPGMA distance matrices as you merge clusters. + +2.) Create the linkage matrix as you did in A.1, and list the height of each branch. For example, the first row of the linkage matrix is -1, -2 with height .14. + +3.) Mirror the code you used in A.2 to plot the dendrogram for this new matrix. + +4.) Use code below to verify your results by applying hclust to the distance matrix. + + # create distance matrix + my.dist2 <- matrix(data=rep(0,16),ncol=4) + my.dist2[1,2]<-.14; my.dist2[1,3]<-.48; my.dist2[1,4]<-.33; + my.dist2[2,3]<-.50; my.dist2[2,4]<-.28; + my.dist2[3,4]<-.55; + # make matrix symmetric + # make the lower triangle equal to the upper triangle + my.dist2[lower.tri(my.dist2)] = t(my.dist2)[lower.tri(my.dist2)] + diag(my.dist2)<-999 # big number to guarantee it’s not the min + # compare with hclust + test <- hclust(as.dist(my.dist2), method="average") # average linkage, UPGMA + test$merge + test$height + test$order + test$labels<-LETTERS[1:4] + plot(test,hang=-1) + +## Part D - Multidimensional Scaling + +1.) Show the 2d MDS plot for my.dist2. How does this visualization change your perspective on how the objects are clustered? + + ## MDS, you might need to refresh (sweep) the plot window + locs<-cmdscale(as.dist(my.dist2), k=2) # k is number of mds components + x<-locs[,1] + y<-locs[,2] + # pch=NA hides plot symbols: + plot(x,y,main="Multi-dimensional Scaling", + xlab="MDS dimension-1", ylab="MDS dimension-2", pch=NA,asp=1) + # pch specifies plot symbol; we want null because we are using letter next + # plot the text labels instead of symbols: + text(x,y,test$labels,cex=1) diff --git a/Schrick-Noah_CS-6643_Lab-2.R b/Schrick-Noah_CS-6643_Lab-2.R new file mode 100644 index 0000000..f0bd440 --- /dev/null +++ b/Schrick-Noah_CS-6643_Lab-2.R @@ -0,0 +1,4 @@ +# Lab 2 for the University of Tulsa's CS-6643 Bioinformatics Course +# Hierarchical clustering (upgma and linkage matrix), WGCNA, Multidimensional scaling +# Professor: Dr. McKinney, Fall 2022 +# Noah L. Schrick - 1492657 diff --git a/Schrick-Noah_CS-6643_Lab-2.docx b/Schrick-Noah_CS-6643_Lab-2.docx new file mode 100644 index 0000000..14ed45c Binary files /dev/null and b/Schrick-Noah_CS-6643_Lab-2.docx differ diff --git a/assignment.docx b/assignment.docx new file mode 100644 index 0000000..14ed45c Binary files /dev/null and b/assignment.docx differ diff --git a/distance_matrix.txt b/distance_matrix.txt new file mode 100644 index 0000000..39dece8 --- /dev/null +++ b/distance_matrix.txt @@ -0,0 +1 @@ +A B C D E F 999 2 4 6 6 8 999 999 4 6 6 8 999 999 999 6 6 8 999 999 999 999 3 8 999 999 999 999 999 8 999 999 999 999 999 999 \ No newline at end of file diff --git a/labels2colors.R b/labels2colors.R new file mode 100644 index 0000000..43e5de1 --- /dev/null +++ b/labels2colors.R @@ -0,0 +1,46 @@ +labels2colors <- function (labels, zeroIsGrey = TRUE, colorSeq = NULL, naColor = "grey", commonColorCode = TRUE) +{ + if (is.null(colorSeq)) + colorSeq = standardColors() + if (is.numeric(labels)) { + if (zeroIsGrey) + minLabel = 0 + else minLabel = 1 + if (any(labels < 0, na.rm = TRUE)) + minLabel = min(c(labels), na.rm = TRUE) + nLabels = labels + } + else { + if (commonColorCode) { + factors = factor(c(as.matrix(as.data.frame(labels)))) + nLabels = as.numeric(factors) + dim(nLabels) = dim(labels) + } + else { + labels = as.matrix(as.data.frame(labels)) + factors = list() + for (c in 1:ncol(labels)) factors[[c]] = factor(labels[, c]) + nLabels = sapply(factors, as.numeric) + } + } + if (max(nLabels, na.rm = TRUE) > length(colorSeq)) { + nRepeats = as.integer((max(labels) - 1)/length(colorSeq)) + + 1 + warning(paste("labels2colors: Number of labels exceeds number of avilable colors.", + "Some colors will be repeated", nRepeats, "times.")) + extColorSeq = colorSeq + for (rep in 1:nRepeats) extColorSeq = c(extColorSeq, paste(colorSeq, ".", rep, sep = "")) + } + else { + nRepeats = 1 + extColorSeq = colorSeq + } + colors = rep("grey", length(nLabels)) + fin = !is.na(nLabels) + colors[!fin] = naColor + finLabels = nLabels[fin] + colors[fin][finLabels != 0] = extColorSeq[finLabels[finLabels != 0]] + if (!is.null(dim(labels))) + dim(colors) = dim(labels) + colors +}