2022-10-13 18:29:27 -05:00

513 lines
19 KiB
R
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

################# Max-Log-Likelihood #################
n <- length(g.breaks.clean)
kmin <- g.breaks.clean[1]
alpha.ML <- 1 + n/sum(log(g.breaks.clean/kmin))
alpha.ML
lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4)
# Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course
# Degree Distribution
# Professor: Dr. McKinney, Spring 2022
# Noah Schrick - 1492657
library(igraph)
library(igraphdata)
data(yeast)
g <- yeast
g.netname <- "Yeast"
################# Set up Work #################
g.vec <- degree(g)
g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname,
" Network"))
legend("topright", c("Guess", "Poisson", "Least-Squares Fit",
"Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6",
"#006CD1", "#E66100", "#D35FB7"))
g.mean <- mean(g.vec)
g.seq <- 0:max(g.vec) # x-axis
################# Guessing Alpha #################
alpha.guess <- 1.5
lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1, lwd=3)
################# Poisson #################
g.pois <- dpois(g.seq, g.mean, log=F)
lines(g.seq, g.pois, col="#006CD1", lty=2, lwd=3)
################# Linear model: Least-Squares Fit #################
g.breaks <- g.hist$breaks[-c(1)] # remove 0
g.probs <- g.hist$density[-1] # make lengths match
# Need to clean up probabilities that are 0
nz.probs.mask <- g.probs!=0
g.breaks.clean <- g.breaks[nz.probs.mask]
g.probs.clean <- g.probs[nz.probs.mask]
#plot(log(g.breaks.clean), log(g.probs.clean))
g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean))
summary(g.fit)
alpha.LM <- coef(g.fit)[2]
lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3, lwd=3)
################# Max-Log-Likelihood #################
n <- length(g.breaks.clean)
kmin <- g.breaks.clean[1]
alpha.ML <- 1 + n/sum(log(g.breaks.clean/kmin))
alpha.ML
lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4, lwd=3)
plot(yeast)
hist(yeast)
hist(g.vec)
g.pois
g.mean
alpha.LM
alpha.ML
degree(g)
sort(degree(g))
sort(degree(g),decreasing=FALSE)
sort(degree(g),decreasing=F)
sort(degree(g),decreasing=false)
sort(degree(g), decreasing = TRUE)
head(sort(degree(g), decreasing = TRUE))
stddev(degree(g))
sd(degree(g))
tail(sort(degree(g), decreasing = TRUE))
plot(log(g.breaks.clean), log(g.probs.clean))
# Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course
# Degree Distribution
# Professor: Dr. McKinney, Spring 2022
# Noah Schrick - 1492657
library(igraph)
library(igraphdata)
data(yeast)
g <- yeast
g.netname <- "Yeast"
################# Set up Work #################
g.vec <- degree(g)
g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname,
" Network"))
legend("topright", c("Guess", "Poisson", "Least-Squares Fit",
"Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6",
"#006CD1", "#E66100", "#D35FB7"))
g.mean <- mean(g.vec)
g.seq <- 0:max(g.vec) # x-axis
################# Guessing Alpha #################
alpha.guess <- 1.5
lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1, lwd=3)
################# Poisson #################
g.pois <- dpois(g.seq, g.mean, log=F)
lines(g.seq, g.pois, col="#006CD1", lty=2, lwd=3)
################# Linear model: Least-Squares Fit #################
g.breaks <- g.hist$breaks[-c(1)] # remove 0
g.probs <- g.hist$density[-1] # make lengths match
# Need to clean up probabilities that are 0
nz.probs.mask <- g.probs!=0
g.breaks.clean <- g.breaks[nz.probs.mask]
g.probs.clean <- g.probs[nz.probs.mask]
plot(log(g.breaks.clean), log(g.probs.clean))
g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean))
summary(g.fit)
alpha.LM <- coef(g.fit)[2]
lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3, lwd=3)
################# Max-Log-Likelihood #################
n <- length(g.breaks.clean)
kmin <- g.breaks.clean[1]
alpha.ML <- 1 + n/sum(log(g.breaks.clean/kmin))
alpha.ML
lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4, lwd=3)
plot(log(g.breaks.clean), log(g.probs.clean))
g.breaks.clean <- g.breaks[nz.probs.mask]
g.probs.clean <- g.probs[nz.probs.mask]
plot(log(g.breaks.clean), log(g.probs.clean))
# Lab 5 for the University of Tulsa's CS-6643 Bioinformatics Course
# Gene Expression Statistical Learning
# Professor: Dr. McKinney, Fall 2022
# Noah L. Schrick - 1492657
## Set Working Directory to file directory - RStudio approach
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
#### 0: Process and filter data
# load gene expression data
load("sense.filtered.cpm.Rdata") # setwd!
# load phenotype (mdd/hc) data
subject.attrs <- read.csv("Demographic_symptom.csv",
stringsAsFactors = FALSE)
if (!require("dplyr")) install.packages("dplyr")
library(dplyr)
# grab intersecting X (subject ids) and Diag (Diagnosis) from columns
phenos.df <- subject.attrs %>%
filter(X %in% colnames(sense.filtered.cpm)) %>%
dplyr::select(X, Diag)
mddPheno <- as.factor(phenos.df$Diag)
# Normalized and transform
if (!require("preprocessCore")) install.packages("preprocessCore")
library(preprocessCore)
mddExprData_quantile <- normalize.quantiles(sense.filtered.cpm)
mddExprData_quantileLog2 <- log2(mddExprData_quantile)
# attach phenotype names and gene names to data
colnames(mddExprData_quantileLog2) <- mddPheno
rownames(mddExprData_quantileLog2) <- rownames(sense.filtered.cpm)
# coefficient of variation filter sd(x)/abs(mean(x))
CoV_values <- apply(mddExprData_quantileLog2,1,
function(x) {sd(x)/abs(mean(x))})
# smaller threshold, the higher the experimental effect relative to the
# measurement precision
sum(CoV_values<.045)
# there is one gene that has 0 variation -- remove
sd_values <- apply(mddExprData_quantileLog2,1, function(x) {sd(x)})
rownames(mddExprData_quantileLog2)[sd_values==0]
# filter the data matrix
GxS.covfilter <- mddExprData_quantileLog2[CoV_values<.045 & sd_values>0,]
dim(GxS.covfilter)
# convert phenotype to factor
pheno.factor <- as.factor(colnames(GxS.covfilter))
pheno.factor
str(pheno.factor)
levels(pheno.factor)
#### Part A: Logistic Regression
# make sure HC is the reference level
pheno.factor.relevel <- relevel(pheno.factor,"HC")
levels(pheno.factor.relevel)
# also rename levels "0"/"1" from 1/2
levels(pheno.factor.relevel)[levels(pheno.factor.relevel)=="MDD"] <- 1
levels(pheno.factor.relevel)[levels(pheno.factor.relevel)=="HC"] <- 0
# Fit logistic model of first gene to phenotype data
gene.row <- 2
gene.name <-rownames(GxS.covfilter)[gene.row]
gene.expr <- GxS.covfilter[gene.row,]
gene.fit <- glm(pheno.factor.relevel~gene.expr, family=binomial)
summary(gene.fit)
coeff.mat <- coef(summary(gene.fit))
b0 <- coeff.mat[1,1]
b1 <- coeff.mat[2,1]
b1.pval <- coeff.mat[2,4]
summary(gene.fit)
coeff.mat <- coef(summary(gene.fit))
b0 <- coeff.mat[1,1]
b0
b1
coeff.mat
b1.pval
## GLM
modelfn <- function(x){1/(1+exp(-(b0+b1*x)))}
g.min <- min(gene.expr)
g.max <- max(gene.expr)
curve(modelfn,g.min,g.max) # plot for domain of actual genes expression
abline(h=.5, lty=2)
curve(modelfn,3,8) # replot with extended domain to see the S shape
## Predict Function
Predicted.probs <- predict(gene.fit, gene.expr=gene.expr,
type="response")
if (!require("ggplot2")) install.packages("ggplot2")
library(ggplot2)
phenotype <- pheno.factor # just rename for legend
gene.gg.df <- data.frame(expression=gene.expr,
prediction=predicted.probs)
# plot predicted probabilities versus gene expression
p <- ggplot(data=gene.gg.df)
p <- p + geom_point(aes(x=expression,
y=prediction,
color=phenotype, # shape and color
shape=phenotype), # are true phenotype
size=3)
p <- p + geom_hline(yintercept = .5, linetype="dashed", color="red")
print(p)
abline(h=.5, lty=2)
Predicted.probs <- predict(gene.fit, gene.expr=gene.expr,
type="response")
if (!require("ggplot2")) install.packages("ggplot2")
library(ggplot2)
phenotype <- pheno.factor # just rename for legend
gene.gg.df <- data.frame(expression=gene.expr,
prediction=predicted.probs)
predicted.probs <- predict(gene.fit, gene.expr=gene.expr,
type="response")
if (!require("ggplot2")) install.packages("ggplot2")
library(ggplot2)
phenotype <- pheno.factor # just rename for legend
gene.gg.df <- data.frame(expression=gene.expr,
prediction=predicted.probs)
# plot predicted probabilities versus gene expression
p <- ggplot(data=gene.gg.df)
p <- p + geom_point(aes(x=expression,
y=prediction,
color=phenotype, # shape and color
shape=phenotype), # are true phenotype
size=3)
p <- p + geom_hline(yintercept = .5, linetype="dashed", color="red")
print(p)
abline(h=.5, lty=2)
predicted.probs <- predict(gene.fit, gene.expr=gene.expr,
type="response")
if (!require("ggplot2")) install.packages("ggplot2")
library(ggplot2)
phenotype <- pheno.factor # just rename for legend
gene.gg.df <- data.frame(expression=gene.expr,
prediction=predicted.probs)
# plot predicted probabilities versus gene expression
p <- ggplot(data=gene.gg.df)
p <- p + geom_point(aes(x=expression,
y=prediction,
color=phenotype, # shape and color
shape=phenotype), # are true phenotype
size=3)
p <- p + geom_hline(yintercept = .5, linetype="dashed", color="red")
print(p)
abline(h=.5, lty=2)
## Predict Function
predicted.probs <- predict(gene.fit, gene.expr=gene.expr,
type="response")
if (!require("ggplot2")) install.packages("ggplot2")
library(ggplot2)
phenotype <- pheno.factor # just rename for legend
gene.gg.df <- data.frame(expression=gene.expr,
prediction=predicted.probs)
# plot predicted probabilities versus gene expression
p <- ggplot(data=gene.gg.df)
p <- p + geom_point(aes(x=expression,
y=prediction,
color=phenotype, # shape and color
shape=phenotype), # are true phenotype
size=3)
p <- p + geom_hline(yintercept = .5, linetype="dashed", color="red")
print(p)
abline(h=.5, lty=2)
plot(p)
abline(h=.5, lty=2)
print(p) + abline(h=.5, lty=2)
# Lab 5 for the University of Tulsa's CS-6643 Bioinformatics Course
# Gene Expression Statistical Learning
# Professor: Dr. McKinney, Fall 2022
# Noah L. Schrick - 1492657
## Set Working Directory to file directory - RStudio approach
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
#### 0: Process and filter data
# load gene expression data
load("sense.filtered.cpm.Rdata") # setwd!
# load phenotype (mdd/hc) data
subject.attrs <- read.csv("Demographic_symptom.csv",
stringsAsFactors = FALSE)
if (!require("dplyr")) install.packages("dplyr")
library(dplyr)
# grab intersecting X (subject ids) and Diag (Diagnosis) from columns
phenos.df <- subject.attrs %>%
filter(X %in% colnames(sense.filtered.cpm)) %>%
dplyr::select(X, Diag)
mddPheno <- as.factor(phenos.df$Diag)
# Normalized and transform
if (!require("preprocessCore")) install.packages("preprocessCore")
library(preprocessCore)
mddExprData_quantile <- normalize.quantiles(sense.filtered.cpm)
mddExprData_quantileLog2 <- log2(mddExprData_quantile)
# attach phenotype names and gene names to data
colnames(mddExprData_quantileLog2) <- mddPheno
rownames(mddExprData_quantileLog2) <- rownames(sense.filtered.cpm)
# coefficient of variation filter sd(x)/abs(mean(x))
CoV_values <- apply(mddExprData_quantileLog2,1,
function(x) {sd(x)/abs(mean(x))})
# smaller threshold, the higher the experimental effect relative to the
# measurement precision
sum(CoV_values<.045)
# there is one gene that has 0 variation -- remove
sd_values <- apply(mddExprData_quantileLog2,1, function(x) {sd(x)})
rownames(mddExprData_quantileLog2)[sd_values==0]
# filter the data matrix
GxS.covfilter <- mddExprData_quantileLog2[CoV_values<.045 & sd_values>0,]
dim(GxS.covfilter)
# convert phenotype to factor
pheno.factor <- as.factor(colnames(GxS.covfilter))
pheno.factor
str(pheno.factor)
levels(pheno.factor)
#### Part A: Logistic Regression
# make sure HC is the reference level
pheno.factor.relevel <- relevel(pheno.factor,"HC")
levels(pheno.factor.relevel)
# also rename levels "0"/"1" from 1/2
levels(pheno.factor.relevel)[levels(pheno.factor.relevel)=="MDD"] <- 1
levels(pheno.factor.relevel)[levels(pheno.factor.relevel)=="HC"] <- 0
## Fit logistic model of first gene to phenotype data
gene.row <- 2
gene.name <-rownames(GxS.covfilter)[gene.row]
gene.expr <- GxS.covfilter[gene.row,]
gene.fit <- glm(pheno.factor.relevel~gene.expr, family=binomial)
summary(gene.fit)
coeff.mat <- coef(summary(gene.fit))
b0 <- coeff.mat[1,1]
b1 <- coeff.mat[2,1]
b1.pval <- coeff.mat[2,4]
## GLM
modelfn <- function(x){1/(1+exp(-(b0+b1*x)))}
g.min <- min(gene.expr)
g.max <- max(gene.expr)
curve(modelfn,g.min,g.max) # plot for domain of actual genes expression
abline(h=.5, lty=2)
curve(modelfn,3,8) # replot with extended domain to see the S shape
## Predict Function
predicted.probs <- predict(gene.fit, gene.expr=gene.expr,
type="response")
if (!require("ggplot2")) install.packages("ggplot2")
library(ggplot2)
phenotype <- pheno.factor # just rename for legend
gene.gg.df <- data.frame(expression=gene.expr,
prediction=predicted.probs)
# plot predicted probabilities versus gene expression
p <- ggplot(data=gene.gg.df)
p <- p + geom_point(aes(x=expression,
y=prediction,
color=phenotype, # shape and color
shape=phenotype), # are true phenotype
size=3)
p <- p + geom_hline(yintercept = .5, linetype="dashed", color="red")
print(p) + abline(h=.5, lty=2)
# vector of logistic output probabilities for this model (glm/eq. above)
# prob = .5 is the threshold for prediction class = 1 vs 0
predicted.class <- as.integer(predicted.probs >=.5) # predicted class
pheno.factor.relevel # true class
# vector of True (correctly predicted) and False (wrongly predicted)
correct.classified <- predicted.class == pheno.factor.relevel
# sum correct.classified
table(pheno.factor.relevel,predicted.class)
predicted.probs
predicted.probs[1]
as.integer(predicted.probs[1] >= 0.5)
# sum correct.classified
table(pheno.factor.relevel,predicted.class)
pheno.factor.relevel
correct.classified
length(correct.classified)
sum(correct.classified == "TRUE")
# accuracy:
correct.acc <- sum(correct.classified == "TRUE") / length(correct.classified)
correct.acc
lr.fn(2)
# logistic regression function for one gene row
lr.fn <- function(i){
gene=rownames(GxS.covfilter)[i]
gene.expr <- GxS.covfilter[i,]
gene.fit <- glm(pheno.factor.relevel~gene.expr,
family=binomial)
coeff.mat <- coef(summary(gene.fit))
b1 <- coeff.mat[2,1]
b1.pval <- coeff.mat[2,4]
coefvec <- gene.fit$estimate # intercept, gene
pvec <- gene.fit$p.value # intercept, gene
c(gene, b1, b1.pval)
}
lr.fn(2)
# initialize results matrix
num.genes<-nrow(GxS.covfilter)
lr.results.mat <- matrix(0, nrow=nrow(GxS.covfilter), ncol=3)
# for loop the function to all genes
for (i in 1:num.genes){
lr.results.mat[i,] <- lr.fn(i)
}
lr.results.df <- data.frame(lr.results.mat)
colnames(lr.results.df) <- c("gene", "b1", "p.val")
# sort results b1 coefficient p-value
library(dplyr)
lr.results.sorted <- lr.results.df %>%
mutate_at("p.val", as.character) %>%
mutate_at("p.val", as.numeric) %>%
arrange(p.val)
lr.results.sorted[1:10,]
lr.results.sorted[1,]
lr.results.sorted[1,1]
gene.row <- which(rownames(GxS.covfilter)==lr.results.sorted[1,1])
gene.row
gene.expr <- GxS.covfilter[gene.row,]
gene.fit <- glm(pheno.factor.relevel~gene.expr,
family=binomial)
predicted.probs <- predict(gene.fit, gene.expr=gene.expr, type="response")
gene.gg.df
# Plotting Code
gene.gg.df <- data.frame(expression=gene.expr,
prediction=predicted.probs)
# plot predicted probabilities versus gene expression
p <- ggplot(data=gene.gg.df)
p <- p + geom_point(aes(x=expression,
y=prediction,
color=phenotype, # shape and color
shape=phenotype), # are true phenotype
size=3)
p <- p + geom_hline(yintercept = .5, linetype="dashed", color="red")
print(p) + abline(h=.5, lty=2)
p <- p + geom_point(aes(x=expression,
y=prediction,
color=phenotype, # shape and color
shape=phenotype,
main=lr.results.sorted[1,1]), # are true phenotype
size=3)
?aes
?ggplot
p <- p + geom_hline(yintercept = .5, linetype="dashed", color="red") +
ggtitle(lr.results.sorted[1,1])
print(p) + abline(h=.5, lty=2)
# code for accuracy
# vector of logistic output probabilities for this model (glm/eq. above)
# prob = .5 is the threshold for prediction class = 1 vs 0
predicted.class <- as.integer(predicted.probs >=.5) # predicted class
pheno.factor.relevel # true class
# vector of True (correctly predicted) and False (wrongly predicted)
correct.classified <- predicted.class == pheno.factor.relevel
# sum correct.classified
table(pheno.factor.relevel,predicted.class)
# accuracy:
correct.acc <- sum(correct.classified == "TRUE") / length(correct.classified)
correct.acc
if (!require("glmnet")) install.packages("glmnet")
library(glmnet)
# alpha=0 means ridge, alpha=1 means lasso
# keep has to do with storing the results from cross-validiation
glmnet.model <- cv.glmnet(t(GxS.covfilter), pheno.factor.relevel, alpha=1,
family="binomial", type.measure="class", keep=T)
plot(glmnet.model) # plot of CV error vs lambda penalty
glmnet.model$lambda.min # lambda that minimizes the CV error
glmnet.model
# Convient way to extract non-zero
if (!require("coefplot")) install.packages("coefplot")
library(coefplot)
extract.coef(glmnet.model)
# get the penalized regression coefficients
glmnet.coeffs <- predict(glmnet.model,type="coefficients",
s=glmnet.model$lambda.min)
top_glmnet <- data.frame(as.matrix(glmnet.coeffs)) %>%
tibble::rownames_to_column(var = "features") %>%
filter(s1!=0, features!="(Intercept)")
top_glmnet_features <- top_glmnet %>% pull(features)
# apply the glmnet model to the data to get class predictions
glmnet.predicted <- predict(glmnet.model,
s=glmnet.model$lambda.min, # lambda to use
type="class", # classify
newx=t(GxS.covfilter)) # apply to original
glmnet.accuracy <- mean(factor(glmnet.predicted)==pheno.factor.relevel)
glmnet.accuracy
# get the coefficients that are not zero (these are the selected variables)
glmnet.nonzero.coeffs <- #glmnet.coeffs@Dimnames[[1]][which(glmnet.coeffs!=0)]
glmnet.nonzero.coeffs
# get the coefficients that are not zero (these are the selected variables)
glmnet.nonzero.coeffs <- #glmnet.coeffs@Dimnames[[1]][which(glmnet.coeffs!=0)]
glmnet.nonzero.coeffs
# get the coefficients that are not zero (these are the selected variables)
glmnet.nonzero.coeffs <- glmnet.coeffs@Dimnames[[1]][which(glmnet.coeffs!=0)]
glmnet.nonzero.coeffs
#### Part E: NPDR
if (!require("devtools")) install.packages("devtools")
library(devtools)
install_github("insilico/npdr")
library(npdr) #https://github.com/insilico/npdr
SxG.dat <- t(GxS.covfilter)
npdr.MDD.results <- npdr(pheno.factor, SxG.dat,
regression.type="binomial",
attr.diff.type="numeric-abs",
nbd.metric = "manhattan",
knn=knnSURF.balanced(pheno.factor, sd.frac = .5),
dopar.nn = F, dopar.reg=F,
padj.method="bonferroni", verbose = T)
colnames(npdr.MDD.results) # column names of the output
library(dplyr)
# gets genes (attributes/att) with FDR-adjusted p-value<.05
top.p05.npdr <- npdr.MDD.results %>% filter(pval.adj<.05) %>% pull(att)
top.p05.npdr
# grab top 200, remove NA, remove "", get att col
top.npdr <- npdr.MDD.results %>% dplyr::slice(1:200) %>%
filter(att!="") %>% pull(att)
write.table(top.npdr,row.names=F,col.names=F,quote=F)
# grab top 200, remove NA, remove "", get att col
top.npdr <- npdr.MDD.results %>% dplyr::slice(1:200) %>%
filter(att!="") filter(pval.adj<1) %>% pull(att)
# grab top 200, remove NA, remove "", get att col
top.npdr <- npdr.MDD.results %>% dplyr::slice(1:200) %>%
filter(pval.adj<1) %>% pull(att)
write.table(top.npdr,row.names=F,col.names=F,quote=F)