# Lab 3 for the University of Tulsa's CS-6643 Bioinformatics Course # Expression Exploratory Analysis # Professor: Dr. McKinney, Fall 2022 # Noah L. Schrick - 1492657 ## Set Working Directory to file directory - RStudio approach setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) #### Part A: Loading Data ## 1: Loading Gene Expression Data load("sense.filtered.cpm.Rdata") dim(sense.filtered.cpm) colnames(sense.filtered.cpm) ## 2: Demographic Data # Loading subject.attrs <- read.csv("Demographic_symptom.csv", stringsAsFactors = FALSE) dim(subject.attrs) # 160 subjects x 40 attributes colnames(subject.attrs) # interested in X (sample ids) and Diag (diagnosis) subject.attrs$X subject.attrs$Diag # Matching gene expression samples with their diagnosis if (!require("dplyr")) install.packages("dplyr") library(dplyr) # create a phenotype vector # grab X (subject ids) and Diag (Diagnosis) from subject.attrs that # intersect %in% with the RNA-Seq data phenos.df <- subject.attrs %>% filter(X %in% colnames(sense.filtered.cpm)) %>% dplyr::select(X, Diag) colnames(phenos.df) # $Diag is mdd diagnosis # grab Diag column and convert character to factor mddPheno <- as.factor(phenos.df$Diag) # this is our phenotype/class vector summary(mddPheno) # MDD -- major depressive disorder, HC -- healthy control #### Part B: Normalization ## 1: log2 transformation # raw cpm boxplots and histogram of one sample boxplot(sense.filtered.cpm,range=0, ylab="raw probe intensity", main="Raw", names=mddPheno) hist(sense.filtered.cpm[,1], freq=F, ylab="density", xlab="raw probe intensity", main="Raw Data Density for Sample 1") # log2 transformed boxplots and histogram boxplot(log2(sense.filtered.cpm), range=0, ylab="log2 intensity", main="Log2 Transformed", names=mddPheno) hist(log2(sense.filtered.cpm[,1]), freq=F, ylab="density", xlab="log2 probe intensity", main="log2 Data Density for Sample 1") getmode <- function(v) { uniqv <-unique(v) uniqv[which.max(tabulate(match(v, uniqv)))] } data <- data.frame(Mean = c(mean(sense.filtered.cpm[,1]), mean(log2(sense.filtered.cpm[,1]))), Mode = c(getmode(sense.filtered.cpm[,1]), getmode(log2(sense.filtered.cpm[,1]))), Median = c(median(sense.filtered.cpm[,1]), median(log2(sense.filtered.cpm[,1]))) ) rownames(data) = c("Original", "Log2 Transformed") data ## 2: Quantile Normalization # install quantile normalize #install.packages("BiocManager") if (!require("BiocManager")) install.packages("BiocManager") library(BiocManager) if (!require("preprocessCore")) BiocManager::install("preprocessCore") library(preprocessCore) # replace with custom function? # apply quantile normalization mddExprData_quantile <- normalize.quantiles(sense.filtered.cpm) boxplot(mddExprData_quantile,range=0, ylab="raw intensity", main="Quantile Normalized") head(mddExprData_quantile) ## 3: Log2 on quantile normalized data mddExprData_quantileLog2 <- log2(mddExprData_quantile) # add phenotype names to matrix colnames(mddExprData_quantileLog2) <- mddPheno boxplot(mddExprData_quantileLog2,range=0, ylab="log2 intensity", main="Quantile Normalized Log2") hist(log2(mddExprData_quantileLog2[,1]), freq=F, ylab="density", xlab="log2 probe intensity", main="log2 Quantile Normalized for Sample 1") ## 4: Means mean(mddExprData_quantileLog2[,1]) colMeans(mddExprData_quantileLog2) #### Part C: Exploratory clustering of microarray data ## 1: MDS # transpose data matrix and convert to data frame # ggplot wants data frame and subjects as rows expr_SxG <- data.frame(t(mddExprData_quantileLog2)) # Subject x Gene colnames(expr_SxG) <- rownames(sense.filtered.cpm) # add gene names # MDS of subjects #d<-dist(expr_SxG) # Euclidean metric mddCorr<-cor(t(expr_SxG)) # distance based on correlation d <- sqrt(1-mddCorr) mdd.mds <- cmdscale(d, k = 2) x <- mdd.mds[,1] y <- mdd.mds[,2] mdd.mds.df <- data.frame(mdd.mds) colnames(mdd.mds.df) <- c("dim1","dim2") if (!require("ggplot2")) BiocManager::install("ggplot2") library(ggplot2) p <- ggplot() # initialize empty ggplot object p <- p + geom_point(data=mdd.mds.df, aes(x=dim1, y=dim2, color=mddPheno, shape=mddPheno), size=3) p <- p + ggtitle("MDS") + xlab("Dim 1") + ylab("Dim 2") print(p) dim(mdd.mds) ## 2: hierarchical cluster of subjects mddTree = hclust(as.dist(d)) mddTree$labels <- mddPheno plot(mddTree) num.clust <- 5 mddCuts <- cutree(mddTree,k=num.clust) mddCutstable <- table(names(mddCuts),mddCuts) prop.table(mddCutstable, margin=2) if (!require("dendextend")) install.packages("dendextend") library(dendextend) dend <- as.dendrogram(mddTree) dend=color_labels(dend, k=num.clust) #dend <- dend %>% color_branches(k = 4) plot(dend) # selective coloring of branches AND labels ## 3: Clusters if (!require("WGCNA")) BiocManager::install("WGCNA") library(WGCNA) # Plot the dendrogram and colors underneath sizeGrWindow(8,6) dynamicMods = cutreeDynamic(dendro = mddTree, distM = d, deepSplit = 2, pamRespectsDendro = FALSE, minClusterSize = 2) mddColors = labels2colors(dynamicMods) table(mddColors) mddColorstable <- table(mddColors,names(mddCuts)) prop.table(mddColorstable, margin = 1) plotDendroAndColors(mddTree, mddColors, "Dynamic Clusters", dendroLabels = NULL, # hang = -1, addGuide = TRUE, #guideHang = 0.05, main = "Clustering with WGCNA") ## 4: UMAP if (!require("umap")) install.packages("umap") library(umap) # change umap config parameters custom.config = umap.defaults custom.config$random_state = 123 custom.config$n_epochs = 500 # umap to cluster observations obs_umap = umap(expr_SxG, config=custom.config) #add colors for MDD/HC colors = rep("black",nrow(expr_SxG)) for (i in 1:nrow(expr_SxG)) { colors[i] <- ifelse(grepl("MDD", rownames(expr_SxG)[i]), "black", "red") } dim(obs_umap$layout) plot(obs_umap$layout, col=colors, main="umap of observations", xlab="umap dim1", ylab="umap dim2") legend("bottomleft", legend = c("MDD","HC"), col=c("black","red"), pch=19)