CS-6643-Bioinformatics-Lab-5/Schrick-Noah_CS-6643_Lab-5.R
2022-10-13 18:29:27 -05:00

251 lines
9.2 KiB
R
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Lab 5 for the University of Tulsa's CS-6643 Bioinformatics Course
# Gene Expression Statistical Learning
# Professor: Dr. McKinney, Fall 2022
# Noah L. Schrick - 1492657
## Set Working Directory to file directory - RStudio approach
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
#### 0: Process and filter data
# load gene expression data
load("sense.filtered.cpm.Rdata") # setwd!
# load phenotype (mdd/hc) data
subject.attrs <- read.csv("Demographic_symptom.csv",
stringsAsFactors = FALSE)
if (!require("dplyr")) install.packages("dplyr")
library(dplyr)
# grab intersecting X (subject ids) and Diag (Diagnosis) from columns
phenos.df <- subject.attrs %>%
filter(X %in% colnames(sense.filtered.cpm)) %>%
dplyr::select(X, Diag)
mddPheno <- as.factor(phenos.df$Diag)
# Normalized and transform
if (!require("preprocessCore")) install.packages("preprocessCore")
library(preprocessCore)
mddExprData_quantile <- normalize.quantiles(sense.filtered.cpm)
mddExprData_quantileLog2 <- log2(mddExprData_quantile)
# attach phenotype names and gene names to data
colnames(mddExprData_quantileLog2) <- mddPheno
rownames(mddExprData_quantileLog2) <- rownames(sense.filtered.cpm)
# coefficient of variation filter sd(x)/abs(mean(x))
CoV_values <- apply(mddExprData_quantileLog2,1,
function(x) {sd(x)/abs(mean(x))})
# smaller threshold, the higher the experimental effect relative to the
# measurement precision
sum(CoV_values<.045)
# there is one gene that has 0 variation -- remove
sd_values <- apply(mddExprData_quantileLog2,1, function(x) {sd(x)})
rownames(mddExprData_quantileLog2)[sd_values==0]
# filter the data matrix
GxS.covfilter <- mddExprData_quantileLog2[CoV_values<.045 & sd_values>0,]
dim(GxS.covfilter)
# convert phenotype to factor
pheno.factor <- as.factor(colnames(GxS.covfilter))
pheno.factor
str(pheno.factor)
levels(pheno.factor)
#### Part A: Logistic Regression
# make sure HC is the reference level
pheno.factor.relevel <- relevel(pheno.factor,"HC")
levels(pheno.factor.relevel)
# also rename levels "0"/"1" from 1/2
levels(pheno.factor.relevel)[levels(pheno.factor.relevel)=="MDD"] <- 1
levels(pheno.factor.relevel)[levels(pheno.factor.relevel)=="HC"] <- 0
## Fit logistic model of first gene to phenotype data
gene.row <- 2
gene.name <-rownames(GxS.covfilter)[gene.row]
gene.expr <- GxS.covfilter[gene.row,]
gene.fit <- glm(pheno.factor.relevel~gene.expr, family=binomial)
summary(gene.fit)
coeff.mat <- coef(summary(gene.fit))
b0 <- coeff.mat[1,1]
b1 <- coeff.mat[2,1]
b1.pval <- coeff.mat[2,4]
## GLM
modelfn <- function(x){1/(1+exp(-(b0+b1*x)))}
g.min <- min(gene.expr)
g.max <- max(gene.expr)
curve(modelfn,g.min,g.max) # plot for domain of actual genes expression
abline(h=.5, lty=2)
curve(modelfn,3,8) # replot with extended domain to see the S shape
## Predict Function
predicted.probs <- predict(gene.fit, gene.expr=gene.expr,
type="response")
if (!require("ggplot2")) install.packages("ggplot2")
library(ggplot2)
phenotype <- pheno.factor # just rename for legend
gene.gg.df <- data.frame(expression=gene.expr,
prediction=predicted.probs)
# plot predicted probabilities versus gene expression
p <- ggplot(data=gene.gg.df)
p <- p + geom_point(aes(x=expression,
y=prediction,
color=phenotype, # shape and color
shape=phenotype), # are true phenotype
size=3)
p <- p + geom_hline(yintercept = .5, linetype="dashed", color="red")
print(p) + abline(h=.5, lty=2)
#### Part B: Logistic Regression as Classification
# vector of logistic output probabilities for this model (glm/eq. above)
# prob = .5 is the threshold for prediction class = 1 vs 0
predicted.class <- as.integer(predicted.probs >=.5) # predicted class
pheno.factor.relevel # true class
# vector of True (correctly predicted) and False (wrongly predicted)
correct.classified <- predicted.class == pheno.factor.relevel
# sum correct.classified
table(pheno.factor.relevel,predicted.class)
# accuracy:
correct.acc <- sum(correct.classified == "TRUE") / length(correct.classified)
#### Part C: Logistic Regression Ranking of all Genes
# logistic regression function for one gene row
lr.fn <- function(i){
gene=rownames(GxS.covfilter)[i]
gene.expr <- GxS.covfilter[i,]
gene.fit <- glm(pheno.factor.relevel~gene.expr,
family=binomial)
coeff.mat <- coef(summary(gene.fit))
b1 <- coeff.mat[2,1]
b1.pval <- coeff.mat[2,4]
coefvec <- gene.fit$estimate # intercept, gene
pvec <- gene.fit$p.value # intercept, gene
c(gene, b1, b1.pval)
}
# Testing with just one row
lr.fn(2)
# initialize results matrix
num.genes<-nrow(GxS.covfilter)
lr.results.mat <- matrix(0, nrow=nrow(GxS.covfilter), ncol=3)
# for loop the function to all genes
for (i in 1:num.genes){
lr.results.mat[i,] <- lr.fn(i)
}
lr.results.df <- data.frame(lr.results.mat)
colnames(lr.results.df) <- c("gene", "b1", "p.val")
# sort results b1 coefficient p-value
library(dplyr)
lr.results.sorted <- lr.results.df %>%
mutate_at("p.val", as.character) %>%
mutate_at("p.val", as.numeric) %>%
arrange(p.val)
lr.results.sorted[1:10,]
## Scatter plot of model fit and accuracy computation of top gene
# Get top gene data
gene.row <- which(rownames(GxS.covfilter)==lr.results.sorted[1,1])
gene.expr <- GxS.covfilter[gene.row,]
gene.fit <- glm(pheno.factor.relevel~gene.expr,
family=binomial)
predicted.probs <- predict(gene.fit, gene.expr=gene.expr, type="response")
# Plotting Code
gene.gg.df <- data.frame(expression=gene.expr,
prediction=predicted.probs)
# plot predicted probabilities versus gene expression
p <- ggplot(data=gene.gg.df)
p <- p + geom_point(aes(x=expression,
y=prediction,
color=phenotype, # shape and color
shape=phenotype), # are true phenotype
size=3)
p <- p + geom_hline(yintercept = .5, linetype="dashed", color="red") +
ggtitle(lr.results.sorted[1,1])
print(p) + abline(h=.5, lty=2)
# code for accuracy
# vector of logistic output probabilities for this model (glm/eq. above)
# prob = .5 is the threshold for prediction class = 1 vs 0
predicted.class <- as.integer(predicted.probs >=.5) # predicted class
pheno.factor.relevel # true class
# vector of True (correctly predicted) and False (wrongly predicted)
correct.classified <- predicted.class == pheno.factor.relevel
# sum correct.classified
table(pheno.factor.relevel,predicted.class)
# accuracy:
correct.acc <- sum(correct.classified == "TRUE") / length(correct.classified)
correct.acc
#### Part D: Statistical Learning
if (!require("glmnet")) install.packages("glmnet")
library(glmnet)
# alpha=0 means ridge, alpha=1 means lasso
# keep has to do with storing the results from cross-validiation
glmnet.model <- cv.glmnet(t(GxS.covfilter), pheno.factor.relevel, alpha=1,
family="binomial", type.measure="class", keep=T)
plot(glmnet.model) # plot of CV error vs lambda penalty
glmnet.model$lambda.min # lambda that minimizes the CV error
# Easy way to extract non-zero coeffs
if (!require("coefplot")) install.packages("coefplot")
library(coefplot)
extract.coef(glmnet.model)
# get the penalized regression coefficients
glmnet.coeffs <- predict(glmnet.model,type="coefficients",
s=glmnet.model$lambda.min)
# get the coefficients that are not zero (these are the selected variables)
glmnet.nonzero.coeffs <- glmnet.coeffs@Dimnames[[1]][which(glmnet.coeffs!=0)]
glmnet.nonzero.coeffs
top_glmnet <- data.frame(as.matrix(glmnet.coeffs)) %>%
tibble::rownames_to_column(var = "features") %>%
filter(s1!=0, features!="(Intercept)")
top_glmnet_features <- top_glmnet %>% pull(features)
# apply the glmnet model to the data to get class predictions
glmnet.predicted <- predict(glmnet.model,
s=glmnet.model$lambda.min, # lambda to use
type="class", # classify
newx=t(GxS.covfilter)) # apply to original
glmnet.accuracy <- mean(factor(glmnet.predicted)==pheno.factor.relevel)
glmnet.accuracy
table(glmnet.predicted,pheno.factor.relevel) # confusion matrix
#### Part E: NPDR
if (!require("devtools")) install.packages("devtools")
library(devtools)
install_github("insilico/npdr")
library(npdr) #https://github.com/insilico/npdr
SxG.dat <- t(GxS.covfilter)
npdr.MDD.results <- npdr(pheno.factor, SxG.dat,
regression.type="binomial",
attr.diff.type="numeric-abs",
nbd.metric = "manhattan",
knn=knnSURF.balanced(pheno.factor, sd.frac = .5),
dopar.nn = F, dopar.reg=F,
padj.method="bonferroni", verbose = T)
colnames(npdr.MDD.results) # column names of the output
library(dplyr)
# gets genes (attributes/att) with FDR-adjusted p-value<.05
top.p05.npdr <- npdr.MDD.results %>% filter(pval.adj<.05) %>% pull(att)
top.p05.npdr
# grab top 200, remove NA, remove "", get att col
top.npdr <- npdr.MDD.results %>% dplyr::slice(1:200) %>%
filter(pval.adj<1) %>% pull(att)
write.table(top.npdr,row.names=F,col.names=F,quote=F)