CS-6643-Bioinformatics-Lab-3/Schrick-Noah_CS-6643_Lab-3.R

190 lines
6.1 KiB
R

# Lab 3 for the University of Tulsa's CS-6643 Bioinformatics Course
# Expression Exploratory Analysis
# Professor: Dr. McKinney, Fall 2022
# Noah L. Schrick - 1492657
## Set Working Directory to file directory - RStudio approach
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
#### Part A: Loading Data
## 1: Loading Gene Expression Data
load("sense.filtered.cpm.Rdata")
dim(sense.filtered.cpm)
colnames(sense.filtered.cpm)
## 2: Demographic Data
# Loading
subject.attrs <- read.csv("Demographic_symptom.csv", stringsAsFactors = FALSE)
dim(subject.attrs) # 160 subjects x 40 attributes
colnames(subject.attrs) # interested in X (sample ids) and Diag (diagnosis)
subject.attrs$X
subject.attrs$Diag
# Matching gene expression samples with their diagnosis
if (!require("dplyr")) install.packages("dplyr")
library(dplyr)
# create a phenotype vector
# grab X (subject ids) and Diag (Diagnosis) from subject.attrs that
# intersect %in% with the RNA-Seq data
phenos.df <- subject.attrs %>%
filter(X %in% colnames(sense.filtered.cpm)) %>%
dplyr::select(X, Diag)
colnames(phenos.df) # $Diag is mdd diagnosis
# grab Diag column and convert character to factor
mddPheno <- as.factor(phenos.df$Diag) # this is our phenotype/class vector
summary(mddPheno) # MDD -- major depressive disorder, HC -- healthy control
#### Part B: Normalization
## 1: log2 transformation
# raw cpm boxplots and histogram of one sample
boxplot(sense.filtered.cpm,range=0,
ylab="raw probe intensity", main="Raw", names=mddPheno)
hist(sense.filtered.cpm[,1], freq=F, ylab="density",
xlab="raw probe intensity", main="Raw Data Density for Sample 1")
# log2 transformed boxplots and histogram
boxplot(log2(sense.filtered.cpm), range=0,
ylab="log2 intensity", main="Log2 Transformed", names=mddPheno)
hist(log2(sense.filtered.cpm[,1]), freq=F, ylab="density",
xlab="log2 probe intensity", main="log2 Data Density for Sample 1")
getmode <- function(v) {
uniqv <-unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}
data <- data.frame(Mean = c(mean(sense.filtered.cpm[,1]),
mean(log2(sense.filtered.cpm[,1]))),
Mode = c(getmode(sense.filtered.cpm[,1]),
getmode(log2(sense.filtered.cpm[,1]))),
Median = c(median(sense.filtered.cpm[,1]),
median(log2(sense.filtered.cpm[,1])))
)
rownames(data) = c("Original", "Log2 Transformed")
data
## 2: Quantile Normalization
# install quantile normalize
#install.packages("BiocManager")
if (!require("BiocManager")) install.packages("BiocManager")
library(BiocManager)
if (!require("preprocessCore"))
BiocManager::install("preprocessCore")
library(preprocessCore) # replace with custom function?
# apply quantile normalization
mddExprData_quantile <- normalize.quantiles(sense.filtered.cpm)
boxplot(mddExprData_quantile,range=0,
ylab="raw intensity", main="Quantile Normalized")
head(mddExprData_quantile)
## 3: Log2 on quantile normalized data
mddExprData_quantileLog2 <- log2(mddExprData_quantile)
# add phenotype names to matrix
colnames(mddExprData_quantileLog2) <- mddPheno
boxplot(mddExprData_quantileLog2,range=0,
ylab="log2 intensity", main="Quantile Normalized Log2")
hist(log2(mddExprData_quantileLog2[,1]), freq=F,
ylab="density", xlab="log2 probe intensity",
main="log2 Quantile Normalized for Sample 1")
## 4: Means
mean(mddExprData_quantileLog2[,1])
colMeans(mddExprData_quantileLog2)
#### Part C: Exploratory clustering of microarray data
## 1: MDS
# transpose data matrix and convert to data frame
# ggplot wants data frame and subjects as rows
expr_SxG <- data.frame(t(mddExprData_quantileLog2)) # Subject x Gene
colnames(expr_SxG) <- rownames(sense.filtered.cpm) # add gene names
# MDS of subjects
#d<-dist(expr_SxG) # Euclidean metric
mddCorr<-cor(t(expr_SxG)) # distance based on correlation
d <- sqrt(1-mddCorr)
mdd.mds <- cmdscale(d, k = 2)
x <- mdd.mds[,1]
y <- mdd.mds[,2]
mdd.mds.df <- data.frame(mdd.mds)
colnames(mdd.mds.df) <- c("dim1","dim2")
if (!require("ggplot2"))
BiocManager::install("ggplot2")
library(ggplot2)
p <- ggplot() # initialize empty ggplot object
p <- p + geom_point(data=mdd.mds.df, aes(x=dim1, y=dim2, color=mddPheno, shape=mddPheno), size=3)
p <- p + ggtitle("MDS") + xlab("Dim 1") + ylab("Dim 2")
print(p)
dim(mdd.mds)
## 2: hierarchical cluster of subjects
mddTree = hclust(as.dist(d))
mddTree$labels <- mddPheno
plot(mddTree)
num.clust <- 5
mddCuts <- cutree(mddTree,k=num.clust)
mddCutstable <- table(names(mddCuts),mddCuts)
prop.table(mddCutstable, margin=2)
if (!require("dendextend")) install.packages("dendextend")
library(dendextend)
dend <- as.dendrogram(mddTree)
dend=color_labels(dend, k=num.clust)
#dend <- dend %>% color_branches(k = 4)
plot(dend) # selective coloring of branches AND labels
## 3: Clusters
if (!require("WGCNA"))
BiocManager::install("WGCNA")
library(WGCNA)
# Plot the dendrogram and colors underneath
sizeGrWindow(8,6)
dynamicMods = cutreeDynamic(dendro = mddTree, distM = d,
deepSplit = 2, pamRespectsDendro = FALSE,
minClusterSize = 2)
mddColors = labels2colors(dynamicMods)
table(mddColors)
mddColorstable <- table(mddColors,names(mddCuts))
prop.table(mddColorstable, margin = 1)
plotDendroAndColors(mddTree, mddColors, "Dynamic Clusters",
dendroLabels = NULL, # hang = -1,
addGuide = TRUE, #guideHang = 0.05,
main = "Clustering with WGCNA")
## 4: UMAP
if (!require("umap")) install.packages("umap")
library(umap)
# change umap config parameters
custom.config = umap.defaults
custom.config$random_state = 123
custom.config$n_epochs = 500
# umap to cluster observations
obs_umap = umap(expr_SxG, config=custom.config)
#add colors for MDD/HC
colors = rep("black",nrow(expr_SxG))
for (i in 1:nrow(expr_SxG)) {
colors[i] <- ifelse(grepl("MDD", rownames(expr_SxG)[i]), "black", "red")
}
dim(obs_umap$layout)
plot(obs_umap$layout, col=colors,
main="umap of observations", xlab="umap dim1", ylab="umap dim2")
legend("bottomleft", legend = c("MDD","HC"),
col=c("black","red"), pch=19)