# Homework 4 for the University of Tulsa's CS-7863 Network Theory Course
# Degree Distribution
# Professor: Dr. McKinney, Spring 2022
# Noah Schrick - 1492657

library(igraph)
library(igraphdata)

data(yeast)
g <- yeast
g.netname <- "Yeast"

################# Set up Work #################
g.vec <- degree(g)
g.hist <- hist(g.vec, freq=FALSE,
               main=paste("Histogram of the", g.netname, "Network"))
legend("topright",
       c("Guess", "Poisson", "Least-Squares Fit", "Max Log-Likelihood"),
       lty=c(1,2,3,4), col=c("#40B0A6", "#006CD1", "#E66100", "#D35FB7"))
g.mean <- mean(g.vec)
g.seq <- 0:max(g.vec)   # x-axis

################# Guessing Alpha #################
alpha.guess <- 1.5
lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1, lwd=3)

################# Poisson #################
g.pois <- dpois(g.seq, g.mean, log=FALSE)
lines(g.seq, g.pois, col="#006CD1", lty=2, lwd=3)

################# Linear model: Least-Squares Fit #################
g.breaks <- g.hist$breaks[-c(1)]   # remove the 0 break
g.probs <- g.hist$density[-1]      # make lengths match
# Need to clean up probabilities that are 0 before taking logs
nz.probs.mask <- g.probs != 0
g.breaks.clean <- g.breaks[nz.probs.mask]
g.probs.clean <- g.probs[nz.probs.mask]
# plot(log(g.breaks.clean), log(g.probs.clean))   # diagnostic log-log scatter (starts a new plot)
g.fit <- lm(log(g.probs.clean) ~ log(g.breaks.clean))
summary(g.fit)
alpha.LM <- -coef(g.fit)[2]   # the log-log slope estimates -alpha, so negate it
lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3, lwd=3)

################# Max-Log-Likelihood #################
n <- length(g.breaks.clean)
kmin <- g.breaks.clean[1]
alpha.ML <- 1 + n/sum(log(g.breaks.clean/kmin))
alpha.ML
lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4, lwd=3)

################# Exploration #################
# plot(g)   # draw the full network (slow for a graph this size)
g.mean                                  # mean degree
sd(g.vec)                               # degree standard deviation
head(sort(g.vec, decreasing = TRUE))    # highest-degree vertices
tail(sort(g.vec, decreasing = TRUE))    # lowest-degree vertices
alpha.LM
alpha.ML
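# Cross-check (sketch, not part of the assignment): the same maximum-likelihood
# estimator, alpha = 1 + n / sum(log(k_i / kmin)), applied to the raw degree
# sequence instead of the histogram bin endpoints. It reuses kmin from the
# cleaned histogram breaks above; a fuller analysis would choose kmin by
# goodness of fit.
k.raw <- g.vec[g.vec >= kmin]
alpha.ML.raw <- 1 + length(k.raw) / sum(log(k.raw / kmin))
alpha.ML.raw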
# Lab 5 for the University of Tulsa's CS-6643 Bioinformatics Course
# Gene Expression Statistical Learning
# Professor: Dr. McKinney, Fall 2022
# Noah L. Schrick - 1492657

## Set Working Directory to the file's directory - RStudio approach
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
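# Hedged fallback sketch for running this script outside RStudio (e.g., with
# Rscript), where rstudioapi is unavailable; assumes the data files sit next to
# the script. Not part of the original lab.
if (!requireNamespace("rstudioapi", quietly = TRUE) || !rstudioapi::isAvailable()) {
  cmd.args <- commandArgs(trailingOnly = FALSE)
  file.arg <- grep("^--file=", cmd.args, value = TRUE)
  if (length(file.arg) == 1) setwd(dirname(sub("^--file=", "", file.arg)))
}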
#### 0: Process and filter data
# load gene expression data (genes x samples matrix)
load("sense.filtered.cpm.Rdata")   # requires the setwd above

# load phenotype (MDD/HC) data
subject.attrs <- read.csv("Demographic_symptom.csv", stringsAsFactors = FALSE)
if (!require("dplyr")) install.packages("dplyr")
library(dplyr)
# grab the intersecting X (subject ids) and Diag (diagnosis) columns
phenos.df <- subject.attrs %>%
  filter(X %in% colnames(sense.filtered.cpm)) %>%
  dplyr::select(X, Diag)
mddPheno <- as.factor(phenos.df$Diag)

# Normalize and log2-transform
# (preprocessCore is a Bioconductor package, so install it through BiocManager)
if (!require("preprocessCore")) {
  if (!require("BiocManager")) install.packages("BiocManager")
  BiocManager::install("preprocessCore")
}
library(preprocessCore)
mddExprData_quantile <- normalize.quantiles(sense.filtered.cpm)
mddExprData_quantileLog2 <- log2(mddExprData_quantile)
# attach phenotype names and gene names to the data
colnames(mddExprData_quantileLog2) <- mddPheno
rownames(mddExprData_quantileLog2) <- rownames(sense.filtered.cpm)

# coefficient of variation filter: sd(x)/abs(mean(x));
# the smaller the CoV threshold, the larger the experimental effect relative
# to the measurement precision
CoV_values <- apply(mddExprData_quantileLog2, 1, function(x) {sd(x)/abs(mean(x))})
sum(CoV_values < .045)   # number of genes passing the filter
# there is one gene that has 0 variation -- remove it
sd_values <- apply(mddExprData_quantileLog2, 1, function(x) {sd(x)})
rownames(mddExprData_quantileLog2)[sd_values == 0]
# filter the data matrix
GxS.covfilter <- mddExprData_quantileLog2[CoV_values < .045 & sd_values > 0, ]
dim(GxS.covfilter)

# convert the phenotype (stored in the column names) to a factor
pheno.factor <- as.factor(colnames(GxS.covfilter))
pheno.factor
str(pheno.factor)
levels(pheno.factor)
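# Note on the labeling step above: assigning mddPheno to the column names assumes
# the rows of phenos.df are already in the same order as the columns of
# sense.filtered.cpm. A hedged sanity-check/reordering sketch (base R only);
# phenos.ordered is an illustrative helper name, not part of the original lab.
stopifnot(all(colnames(sense.filtered.cpm) %in% phenos.df$X))
phenos.ordered <- phenos.df[match(colnames(sense.filtered.cpm), phenos.df$X), ]
# phenos.ordered$Diag then lines up with the expression columns by construction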
#### Part A: Logistic Regression
# make sure HC is the reference level
pheno.factor.relevel <- relevel(pheno.factor, "HC")
levels(pheno.factor.relevel)
# also rename the levels to 0 (HC) and 1 (MDD)
levels(pheno.factor.relevel)[levels(pheno.factor.relevel) == "MDD"] <- 1
levels(pheno.factor.relevel)[levels(pheno.factor.relevel) == "HC"] <- 0

## Fit a logistic model of a single gene (row 2) to the phenotype
gene.row <- 2
gene.name <- rownames(GxS.covfilter)[gene.row]
gene.expr <- GxS.covfilter[gene.row, ]
gene.fit <- glm(pheno.factor.relevel ~ gene.expr, family=binomial)
summary(gene.fit)
coeff.mat <- coef(summary(gene.fit))
b0 <- coeff.mat[1,1]        # intercept
b1 <- coeff.mat[2,1]        # gene coefficient
b1.pval <- coeff.mat[2,4]   # gene p-value

## GLM
modelfn <- function(x){1/(1+exp(-(b0+b1*x)))}
g.min <- min(gene.expr)
g.max <- max(gene.expr)
curve(modelfn, g.min, g.max)   # plot over the domain of the actual gene's expression
abline(h=.5, lty=2)
curve(modelfn, 3, 8)           # replot with an extended domain to see the S shape

## Predict Function
predicted.probs <- predict(gene.fit, type="response")   # fitted probabilities on the training samples
if (!require("ggplot2")) install.packages("ggplot2")
library(ggplot2)
phenotype <- pheno.factor   # just renamed for the legend
gene.gg.df <- data.frame(expression=gene.expr, prediction=predicted.probs)
# plot predicted probabilities versus gene expression
p <- ggplot(data=gene.gg.df)
p <- p + geom_point(aes(x=expression, y=prediction,
                        color=phenotype,    # shape and color
                        shape=phenotype),   # are the true phenotype
                    size=3)
p <- p + geom_hline(yintercept=.5, linetype="dashed", color="red")
print(p)
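# Cross-check (sketch, not part of the original lab): the probabilities from
# predict(type="response") should equal the logistic curve modelfn evaluated at
# each training expression value, up to numerical tolerance.
all.equal(unname(predicted.probs), unname(modelfn(gene.expr)))   # expected TRUE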
## Accuracy
# vector of logistic output probabilities for this model (the glm above);
# prob >= .5 is the threshold for predicting class 1 vs 0
predicted.class <- as.integer(predicted.probs >= .5)   # predicted class
pheno.factor.relevel                                   # true class
# vector of TRUE (correctly predicted) and FALSE (wrongly predicted)
correct.classified <- predicted.class == pheno.factor.relevel
sum(correct.classified)
table(pheno.factor.relevel, predicted.class)   # confusion matrix
# accuracy:
correct.acc <- mean(correct.classified)
correct.acc

## Logistic regression across all genes
# logistic regression function for one gene row:
# returns the gene name, the gene coefficient b1, and its p-value
lr.fn <- function(i){
  gene <- rownames(GxS.covfilter)[i]
  gene.expr <- GxS.covfilter[i, ]
  gene.fit <- glm(pheno.factor.relevel ~ gene.expr, family=binomial)
  coeff.mat <- coef(summary(gene.fit))
  b1 <- coeff.mat[2,1]        # gene coefficient
  b1.pval <- coeff.mat[2,4]   # gene p-value
  c(gene, b1, b1.pval)
}
lr.fn(2)   # sanity check on the same gene as Part A

# initialize the results matrix
num.genes <- nrow(GxS.covfilter)
lr.results.mat <- matrix(0, nrow=nrow(GxS.covfilter), ncol=3)
# loop the function over all genes
# (the matrix is coerced to character because each row also stores the gene name)
for (i in 1:num.genes){
  lr.results.mat[i, ] <- lr.fn(i)
}
lr.results.df <- data.frame(lr.results.mat)
colnames(lr.results.df) <- c("gene", "b1", "p.val")

# sort the results by the b1 coefficient's p-value
library(dplyr)
lr.results.sorted <- lr.results.df %>%
  mutate_at("p.val", as.character) %>%
  mutate_at("p.val", as.numeric) %>%
  arrange(p.val)
lr.results.sorted[1:10, ]

# refit and replot the top-ranked gene
gene.row <- which(rownames(GxS.covfilter) == lr.results.sorted[1,1])
gene.row
gene.expr <- GxS.covfilter[gene.row, ]
gene.fit <- glm(pheno.factor.relevel ~ gene.expr, family=binomial)
predicted.probs <- predict(gene.fit, type="response")

# Plotting Code
gene.gg.df <- data.frame(expression=gene.expr, prediction=predicted.probs)
# plot predicted probabilities versus gene expression
p <- ggplot(data=gene.gg.df)
p <- p + geom_point(aes(x=expression, y=prediction,
                        color=phenotype,    # shape and color
                        shape=phenotype),   # are the true phenotype
                    size=3)
p <- p + geom_hline(yintercept=.5, linetype="dashed", color="red") +
  ggtitle(lr.results.sorted[1,1])   # title with the top gene's name
print(p)
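# The per-gene p-values in lr.results.sorted are unadjusted for multiple testing.
# Hedged sketch adding Bonferroni- and FDR-adjusted columns with base R's
# p.adjust (the new column names are illustrative, not from the original lab):
lr.results.sorted$p.val.bonf <- p.adjust(lr.results.sorted$p.val, method = "bonferroni")
lr.results.sorted$p.val.fdr  <- p.adjust(lr.results.sorted$p.val, method = "fdr")
head(lr.results.sorted)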
## Accuracy for the top gene's model
# prob >= .5 is the threshold for predicting class 1 vs 0
predicted.class <- as.integer(predicted.probs >= .5)   # predicted class
pheno.factor.relevel                                   # true class
# vector of TRUE (correctly predicted) and FALSE (wrongly predicted)
correct.classified <- predicted.class == pheno.factor.relevel
table(pheno.factor.relevel, predicted.class)   # confusion matrix
# accuracy:
correct.acc <- mean(correct.classified)
correct.acc

#### Penalized logistic regression: glmnet (lasso)
if (!require("glmnet")) install.packages("glmnet")
library(glmnet)
# alpha=0 means ridge, alpha=1 means lasso;
# keep=TRUE stores the fits from cross-validation
glmnet.model <- cv.glmnet(t(GxS.covfilter), pheno.factor.relevel, alpha=1,
                          family="binomial", type.measure="class", keep=TRUE)
plot(glmnet.model)        # CV misclassification error vs the lambda penalty
glmnet.model$lambda.min   # lambda that minimizes the CV error
glmnet.model

# Convenient way to extract the non-zero coefficients
if (!require("coefplot")) install.packages("coefplot")
library(coefplot)
extract.coef(glmnet.model)

# get the penalized regression coefficients at lambda.min
glmnet.coeffs <- predict(glmnet.model, type="coefficients", s=glmnet.model$lambda.min)
top_glmnet <- data.frame(as.matrix(glmnet.coeffs)) %>%
  tibble::rownames_to_column(var = "features") %>%
  filter(s1 != 0, features != "(Intercept)")
top_glmnet_features <- top_glmnet %>% pull(features)

# apply the glmnet model to the data to get class predictions
glmnet.predicted <- predict(glmnet.model,
                            s=glmnet.model$lambda.min,   # lambda to use
                            type="class",                # classify
                            newx=t(GxS.covfilter))       # apply to the original data
glmnet.accuracy <- mean(factor(glmnet.predicted) == pheno.factor.relevel)
glmnet.accuracy

# get the coefficients that are not zero (these are the selected variables)
glmnet.nonzero.coeffs <- glmnet.coeffs@Dimnames[[1]][which(glmnet.coeffs != 0)]
glmnet.nonzero.coeffs

#### Part E: NPDR
if (!require("devtools")) install.packages("devtools")
library(devtools)
install_github("insilico/npdr")
library(npdr)   # https://github.com/insilico/npdr
SxG.dat <- t(GxS.covfilter)
npdr.MDD.results <- npdr(pheno.factor, SxG.dat,
                         regression.type="binomial",
                         attr.diff.type="numeric-abs",
                         nbd.metric="manhattan",
                         knn=knnSURF.balanced(pheno.factor, sd.frac=.5),
                         dopar.nn=FALSE, dopar.reg=FALSE,
                         padj.method="bonferroni", verbose=TRUE)
colnames(npdr.MDD.results)   # column names of the output

library(dplyr)
# genes (attributes/att) with Bonferroni-adjusted p-value < .05
top.p05.npdr <- npdr.MDD.results %>% filter(pval.adj < .05) %>% pull(att)
top.p05.npdr

# grab the top 200 attributes, drop empty names and NA-adjusted rows, keep the att column
top.npdr <- npdr.MDD.results %>%
  dplyr::slice(1:200) %>%
  filter(att != "", pval.adj < 1) %>%
  pull(att)
write.table(top.npdr, row.names=FALSE, col.names=FALSE, quote=FALSE)
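# Quick comparison (sketch, not part of the original lab): overlap between the
# lasso-selected features and the top NPDR attributes; both vectors are built
# above and intersect() is base R.
overlap.genes <- intersect(top_glmnet_features, top.npdr)
length(overlap.genes)
overlap.genes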