# Lab 3 for the University of Tulsa's CS-6643 Bioinformatics Course
# Expression Exploratory Analysis 
# Professor: Dr. McKinney, Fall 2022
# Noah L. Schrick - 1492657

## Set Working Directory to file directory - RStudio approach
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

#### Part A: Loading Data
## 1: Loading Gene Expression Data
load("sense.filtered.cpm.Rdata")
dim(sense.filtered.cpm) 
colnames(sense.filtered.cpm)

## 2: Demographic Data
# Loading
subject.attrs <- read.csv("Demographic_symptom.csv", stringsAsFactors = FALSE)
dim(subject.attrs)  # 160 subjects x 40 attributes
colnames(subject.attrs)  # interested in X (sample ids) and Diag (diagnosis)
subject.attrs$X
subject.attrs$Diag

# Matching gene expression samples with their diagnosis
if (!require("dplyr")) install.packages("dplyr")
library(dplyr)
# create a phenotype vector
# grab X (subject ids) and Diag (Diagnosis) from subject.attrs that 
#   intersect %in% with the RNA-Seq data
phenos.df <- subject.attrs %>% 
  filter(X %in% colnames(sense.filtered.cpm)) %>%
  dplyr::select(X, Diag)  
colnames(phenos.df) # $Diag is mdd diagnosis
# grab Diag column and convert character to factor
mddPheno <- as.factor(phenos.df$Diag)  # this is our phenotype/class vector 

summary(mddPheno) # MDD -- major depressive disorder, HC -- healthy control

#### Part B: Normalization
## 1: log2 transformation
# raw cpm boxplots and histogram of one sample
boxplot(sense.filtered.cpm,range=0,
        ylab="raw probe intensity", main="Raw", names=mddPheno)

hist(sense.filtered.cpm[,1], freq=F, ylab="density", 
     xlab="raw probe intensity", main="Raw Data Density for Sample 1")

# log2 transformed boxplots and histogram
boxplot(log2(sense.filtered.cpm), range=0,
        ylab="log2 intensity", main="Log2 Transformed", names=mddPheno)

hist(log2(sense.filtered.cpm[,1]), freq=F, ylab="density", 
     xlab="log2 probe intensity", main="log2 Data Density for Sample 1")

getmode <- function(v) {
  uniqv <-unique(v)
  uniqv[which.max(tabulate(match(v, uniqv)))]
}

data <- data.frame(Mean = c(mean(sense.filtered.cpm[,1]),
                            mean(log2(sense.filtered.cpm[,1]))),
                   Mode = c(getmode(sense.filtered.cpm[,1]),
                            getmode(log2(sense.filtered.cpm[,1]))),
                   Median = c(median(sense.filtered.cpm[,1]),
                              median(log2(sense.filtered.cpm[,1])))
                   )
rownames(data) = c("Original", "Log2 Transformed")
data

## 2: Quantile Normalization
# install quantile normalize
#install.packages("BiocManager")
if (!require("BiocManager")) install.packages("BiocManager")
library(BiocManager)
if (!require("preprocessCore"))
  BiocManager::install("preprocessCore")
library(preprocessCore)    # replace with custom function?

# apply quantile normalization
mddExprData_quantile <- normalize.quantiles(sense.filtered.cpm)

boxplot(mddExprData_quantile,range=0,
        ylab="raw intensity", main="Quantile Normalized")

head(mddExprData_quantile)

## 3: Log2 on quantile normalized data
mddExprData_quantileLog2 <- log2(mddExprData_quantile)
# add phenotype names to matrix
colnames(mddExprData_quantileLog2) <- mddPheno  

boxplot(mddExprData_quantileLog2,range=0,
        ylab="log2 intensity", main="Quantile Normalized Log2")

hist(log2(mddExprData_quantileLog2[,1]), freq=F, 
     ylab="density", xlab="log2 probe intensity", 
     main="log2 Quantile Normalized for Sample 1")

## 4: Means
mean(mddExprData_quantileLog2[,1])
colMeans(mddExprData_quantileLog2)


#### Part C: Exploratory clustering of microarray data
## 1: MDS
# transpose data matrix and convert to data frame
# ggplot wants data frame and subjects as rows
expr_SxG <- data.frame(t(mddExprData_quantileLog2)) # Subject x Gene
colnames(expr_SxG) <- rownames(sense.filtered.cpm)  # add gene names

# MDS of subjects 
#d<-dist(expr_SxG)         # Euclidean metric
mddCorr<-cor(t(expr_SxG))  # distance based on correlation
d <- sqrt(1-mddCorr)
mdd.mds <- cmdscale(d, k = 2)
x <- mdd.mds[,1]
y <- mdd.mds[,2]
mdd.mds.df <- data.frame(mdd.mds)
colnames(mdd.mds.df) <- c("dim1","dim2")

if (!require("ggplot2"))
  BiocManager::install("ggplot2")
library(ggplot2)

p <- ggplot() # initialize empty ggplot object
p <- p + geom_point(data=mdd.mds.df, aes(x=dim1, y=dim2, color=mddPheno, shape=mddPheno), size=3)
p <- p + ggtitle("MDS") + xlab("Dim 1") + ylab("Dim 2")
print(p)

dim(mdd.mds)

## 2: hierarchical cluster of subjects
mddTree = hclust(as.dist(d))
mddTree$labels <- mddPheno
plot(mddTree)

num.clust <- 5
mddCuts <- cutree(mddTree,k=num.clust)
mddCutstable <- table(names(mddCuts),mddCuts)
prop.table(mddCutstable, margin=2)


if (!require("dendextend")) install.packages("dendextend")
library(dendextend)
dend <- as.dendrogram(mddTree)
dend=color_labels(dend, k=num.clust)
#dend <- dend %>% color_branches(k = 4) 
plot(dend) # selective coloring of branches AND labels 


## 3: Clusters
if (!require("WGCNA"))
  BiocManager::install("WGCNA")
library(WGCNA)
# Plot the dendrogram and colors underneath
sizeGrWindow(8,6)
dynamicMods = cutreeDynamic(dendro = mddTree, distM = d,
                            deepSplit = 2, pamRespectsDendro = FALSE,
                            minClusterSize = 2)
mddColors = labels2colors(dynamicMods)
table(mddColors)
mddColorstable <- table(mddColors,names(mddCuts))
prop.table(mddColorstable, margin = 1)
plotDendroAndColors(mddTree, mddColors, "Dynamic Clusters",
                    dendroLabels = NULL, # hang = -1,
                    addGuide = TRUE, #guideHang = 0.05,
                    main = "Clustering with WGCNA")

## 4: UMAP
if (!require("umap")) install.packages("umap")
library(umap)
# change umap config parameters
custom.config = umap.defaults
custom.config$random_state = 123
custom.config$n_epochs = 500

# umap to cluster observations
obs_umap = umap(expr_SxG, config=custom.config)
#add colors for MDD/HC
colors = rep("black",nrow(expr_SxG))

for (i in 1:nrow(expr_SxG)) {
  colors[i] <- ifelse(grepl("MDD", rownames(expr_SxG)[i]), "black", "red")
}

dim(obs_umap$layout)
plot(obs_umap$layout, col=colors, 
     main="umap of observations", xlab="umap dim1", ylab="umap dim2")
legend("bottomleft", legend = c("MDD","HC"), 
       col=c("black","red"), pch=19)