251 lines
9.2 KiB
R
251 lines
9.2 KiB
R
# Lab 5 for the University of Tulsa's CS-6643 Bioinformatics Course
|
||
# Gene Expression Statistical Learning
|
||
# Professor: Dr. McKinney, Fall 2022
|
||
# Noah L. Schrick - 1492657
|
||
|
||
## Set Working Directory to file directory - RStudio approach
|
||
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
|
||
|
||
#### 0: Process and filter data
|
||
# load gene expression data
|
||
load("sense.filtered.cpm.Rdata") # setwd!
|
||
|
||
# load phenotype (mdd/hc) data
|
||
subject.attrs <- read.csv("Demographic_symptom.csv",
|
||
stringsAsFactors = FALSE)
|
||
|
||
if (!require("dplyr")) install.packages("dplyr")
|
||
library(dplyr)
|
||
# grab intersecting X (subject ids) and Diag (Diagnosis) from columns
|
||
phenos.df <- subject.attrs %>%
|
||
filter(X %in% colnames(sense.filtered.cpm)) %>%
|
||
dplyr::select(X, Diag)
|
||
mddPheno <- as.factor(phenos.df$Diag)
|
||
|
||
# Normalized and transform
|
||
if (!require("preprocessCore")) install.packages("preprocessCore")
|
||
library(preprocessCore)
|
||
mddExprData_quantile <- normalize.quantiles(sense.filtered.cpm)
|
||
mddExprData_quantileLog2 <- log2(mddExprData_quantile)
|
||
# attach phenotype names and gene names to data
|
||
colnames(mddExprData_quantileLog2) <- mddPheno
|
||
rownames(mddExprData_quantileLog2) <- rownames(sense.filtered.cpm)
|
||
|
||
# coefficient of variation filter sd(x)/abs(mean(x))
|
||
CoV_values <- apply(mddExprData_quantileLog2,1,
|
||
function(x) {sd(x)/abs(mean(x))})
|
||
# smaller threshold, the higher the experimental effect relative to the
|
||
# measurement precision
|
||
sum(CoV_values<.045)
|
||
# there is one gene that has 0 variation -- remove
|
||
sd_values <- apply(mddExprData_quantileLog2,1, function(x) {sd(x)})
|
||
rownames(mddExprData_quantileLog2)[sd_values==0]
|
||
# filter the data matrix
|
||
GxS.covfilter <- mddExprData_quantileLog2[CoV_values<.045 & sd_values>0,]
|
||
dim(GxS.covfilter)
|
||
|
||
# convert phenotype to factor
|
||
pheno.factor <- as.factor(colnames(GxS.covfilter))
|
||
pheno.factor
|
||
str(pheno.factor)
|
||
levels(pheno.factor)
|
||
|
||
|
||
#### Part A: Logistic Regression
|
||
# make sure HC is the reference level
|
||
pheno.factor.relevel <- relevel(pheno.factor,"HC")
|
||
levels(pheno.factor.relevel)
|
||
# also rename levels "0"/"1" from 1/2
|
||
levels(pheno.factor.relevel)[levels(pheno.factor.relevel)=="MDD"] <- 1
|
||
levels(pheno.factor.relevel)[levels(pheno.factor.relevel)=="HC"] <- 0
|
||
|
||
## Fit logistic model of first gene to phenotype data
|
||
gene.row <- 2
|
||
gene.name <-rownames(GxS.covfilter)[gene.row]
|
||
gene.expr <- GxS.covfilter[gene.row,]
|
||
gene.fit <- glm(pheno.factor.relevel~gene.expr, family=binomial)
|
||
summary(gene.fit)
|
||
coeff.mat <- coef(summary(gene.fit))
|
||
b0 <- coeff.mat[1,1]
|
||
b1 <- coeff.mat[2,1]
|
||
b1.pval <- coeff.mat[2,4]
|
||
|
||
## GLM
|
||
modelfn <- function(x){1/(1+exp(-(b0+b1*x)))}
|
||
g.min <- min(gene.expr)
|
||
g.max <- max(gene.expr)
|
||
curve(modelfn,g.min,g.max) # plot for domain of actual gene’s expression
|
||
abline(h=.5, lty=2)
|
||
curve(modelfn,3,8) # replot with extended domain to see the S shape
|
||
|
||
## Predict Function
|
||
predicted.probs <- predict(gene.fit, gene.expr=gene.expr,
|
||
type="response")
|
||
if (!require("ggplot2")) install.packages("ggplot2")
|
||
library(ggplot2)
|
||
phenotype <- pheno.factor # just rename for legend
|
||
gene.gg.df <- data.frame(expression=gene.expr,
|
||
prediction=predicted.probs)
|
||
# plot predicted probabilities versus gene expression
|
||
p <- ggplot(data=gene.gg.df)
|
||
p <- p + geom_point(aes(x=expression,
|
||
y=prediction,
|
||
color=phenotype, # shape and color
|
||
shape=phenotype), # are true phenotype
|
||
size=3)
|
||
p <- p + geom_hline(yintercept = .5, linetype="dashed", color="red")
|
||
print(p) + abline(h=.5, lty=2)
|
||
|
||
|
||
#### Part B: Logistic Regression as Classification
|
||
# vector of logistic output probabilities for this model (glm/eq. above)
|
||
# prob = .5 is the threshold for prediction class = 1 vs 0
|
||
predicted.class <- as.integer(predicted.probs >=.5) # predicted class
|
||
pheno.factor.relevel # true class
|
||
# vector of True (correctly predicted) and False (wrongly predicted)
|
||
correct.classified <- predicted.class == pheno.factor.relevel
|
||
# sum correct.classified
|
||
table(pheno.factor.relevel,predicted.class)
|
||
# accuracy:
|
||
correct.acc <- sum(correct.classified == "TRUE") / length(correct.classified)
|
||
|
||
|
||
#### Part C: Logistic Regression Ranking of all Genes
|
||
# logistic regression function for one gene row
|
||
lr.fn <- function(i){
|
||
gene=rownames(GxS.covfilter)[i]
|
||
gene.expr <- GxS.covfilter[i,]
|
||
gene.fit <- glm(pheno.factor.relevel~gene.expr,
|
||
family=binomial)
|
||
coeff.mat <- coef(summary(gene.fit))
|
||
b1 <- coeff.mat[2,1]
|
||
b1.pval <- coeff.mat[2,4]
|
||
coefvec <- gene.fit$estimate # intercept, gene
|
||
pvec <- gene.fit$p.value # intercept, gene
|
||
c(gene, b1, b1.pval)
|
||
}
|
||
# Testing with just one row
|
||
lr.fn(2)
|
||
|
||
# initialize results matrix
|
||
num.genes<-nrow(GxS.covfilter)
|
||
lr.results.mat <- matrix(0, nrow=nrow(GxS.covfilter), ncol=3)
|
||
# for loop the function to all genes
|
||
for (i in 1:num.genes){
|
||
lr.results.mat[i,] <- lr.fn(i)
|
||
}
|
||
lr.results.df <- data.frame(lr.results.mat)
|
||
colnames(lr.results.df) <- c("gene", "b1", "p.val")
|
||
|
||
# sort results b1 coefficient p-value
|
||
library(dplyr)
|
||
lr.results.sorted <- lr.results.df %>%
|
||
mutate_at("p.val", as.character) %>%
|
||
mutate_at("p.val", as.numeric) %>%
|
||
arrange(p.val)
|
||
lr.results.sorted[1:10,]
|
||
|
||
## Scatter plot of model fit and accuracy computation of top gene
|
||
# Get top gene data
|
||
gene.row <- which(rownames(GxS.covfilter)==lr.results.sorted[1,1])
|
||
gene.expr <- GxS.covfilter[gene.row,]
|
||
gene.fit <- glm(pheno.factor.relevel~gene.expr,
|
||
family=binomial)
|
||
predicted.probs <- predict(gene.fit, gene.expr=gene.expr, type="response")
|
||
# Plotting Code
|
||
gene.gg.df <- data.frame(expression=gene.expr,
|
||
prediction=predicted.probs)
|
||
# plot predicted probabilities versus gene expression
|
||
p <- ggplot(data=gene.gg.df)
|
||
p <- p + geom_point(aes(x=expression,
|
||
y=prediction,
|
||
color=phenotype, # shape and color
|
||
shape=phenotype), # are true phenotype
|
||
size=3)
|
||
p <- p + geom_hline(yintercept = .5, linetype="dashed", color="red") +
|
||
ggtitle(lr.results.sorted[1,1])
|
||
print(p) + abline(h=.5, lty=2)
|
||
|
||
# code for accuracy
|
||
# vector of logistic output probabilities for this model (glm/eq. above)
|
||
# prob = .5 is the threshold for prediction class = 1 vs 0
|
||
predicted.class <- as.integer(predicted.probs >=.5) # predicted class
|
||
pheno.factor.relevel # true class
|
||
# vector of True (correctly predicted) and False (wrongly predicted)
|
||
correct.classified <- predicted.class == pheno.factor.relevel
|
||
# sum correct.classified
|
||
table(pheno.factor.relevel,predicted.class)
|
||
# accuracy:
|
||
correct.acc <- sum(correct.classified == "TRUE") / length(correct.classified)
|
||
correct.acc
|
||
|
||
|
||
#### Part D: Statistical Learning
|
||
if (!require("glmnet")) install.packages("glmnet")
|
||
library(glmnet)
|
||
# alpha=0 means ridge, alpha=1 means lasso
|
||
# keep has to do with storing the results from cross-validiation
|
||
glmnet.model <- cv.glmnet(t(GxS.covfilter), pheno.factor.relevel, alpha=1,
|
||
family="binomial", type.measure="class", keep=T)
|
||
plot(glmnet.model) # plot of CV error vs lambda penalty
|
||
glmnet.model$lambda.min # lambda that minimizes the CV error
|
||
|
||
# Easy way to extract non-zero coeffs
|
||
if (!require("coefplot")) install.packages("coefplot")
|
||
library(coefplot)
|
||
extract.coef(glmnet.model)
|
||
|
||
# get the penalized regression coefficients
|
||
glmnet.coeffs <- predict(glmnet.model,type="coefficients",
|
||
s=glmnet.model$lambda.min)
|
||
|
||
# get the coefficients that are not zero (these are the selected variables)
|
||
glmnet.nonzero.coeffs <- glmnet.coeffs@Dimnames[[1]][which(glmnet.coeffs!=0)]
|
||
glmnet.nonzero.coeffs
|
||
|
||
top_glmnet <- data.frame(as.matrix(glmnet.coeffs)) %>%
|
||
tibble::rownames_to_column(var = "features") %>%
|
||
filter(s1!=0, features!="(Intercept)")
|
||
|
||
top_glmnet_features <- top_glmnet %>% pull(features)
|
||
|
||
# apply the glmnet model to the data to get class predictions
|
||
glmnet.predicted <- predict(glmnet.model,
|
||
s=glmnet.model$lambda.min, # lambda to use
|
||
type="class", # classify
|
||
newx=t(GxS.covfilter)) # apply to original
|
||
glmnet.accuracy <- mean(factor(glmnet.predicted)==pheno.factor.relevel)
|
||
glmnet.accuracy
|
||
table(glmnet.predicted,pheno.factor.relevel) # confusion matrix
|
||
|
||
#### Part E: NPDR
|
||
if (!require("devtools")) install.packages("devtools")
|
||
library(devtools)
|
||
install_github("insilico/npdr")
|
||
|
||
library(npdr) #https://github.com/insilico/npdr
|
||
SxG.dat <- t(GxS.covfilter)
|
||
npdr.MDD.results <- npdr(pheno.factor, SxG.dat,
|
||
regression.type="binomial",
|
||
attr.diff.type="numeric-abs",
|
||
nbd.metric = "manhattan",
|
||
knn=knnSURF.balanced(pheno.factor, sd.frac = .5),
|
||
dopar.nn = F, dopar.reg=F,
|
||
padj.method="bonferroni", verbose = T)
|
||
colnames(npdr.MDD.results) # column names of the output
|
||
|
||
library(dplyr)
|
||
# gets genes (attributes/att) with FDR-adjusted p-value<.05
|
||
top.p05.npdr <- npdr.MDD.results %>% filter(pval.adj<.05) %>% pull(att)
|
||
top.p05.npdr
|
||
|
||
|
||
# grab top 200, remove NA, remove "", get att col
|
||
top.npdr <- npdr.MDD.results %>% dplyr::slice(1:200) %>%
|
||
filter(pval.adj<1) %>% pull(att)
|
||
|
||
write.table(top.npdr,row.names=F,col.names=F,quote=F)
|
||
|
||
|
||
|