diff --git a/.Rhistory b/.Rhistory new file mode 100644 index 0000000..8fe61f1 --- /dev/null +++ b/.Rhistory @@ -0,0 +1,512 @@ +################# Max-Log-Likelihood ################# +n <- length(g.breaks.clean) +kmin <- g.breaks.clean[1] +alpha.ML <- 1 + n/sum(log(g.breaks.clean/kmin)) +alpha.ML +lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4) +# Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course +# Degree Distribution +# Professor: Dr. McKinney, Spring 2022 +# Noah Schrick - 1492657 +library(igraph) +library(igraphdata) +data(yeast) +g <- yeast +g.netname <- "Yeast" +################# Set up Work ################# +g.vec <- degree(g) +g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname, +" Network")) +legend("topright", c("Guess", "Poisson", "Least-Squares Fit", +"Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6", +"#006CD1", "#E66100", "#D35FB7")) +g.mean <- mean(g.vec) +g.seq <- 0:max(g.vec) # x-axis +################# Guessing Alpha ################# +alpha.guess <- 1.5 +lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1, lwd=3) +################# Poisson ################# +g.pois <- dpois(g.seq, g.mean, log=F) +lines(g.seq, g.pois, col="#006CD1", lty=2, lwd=3) +################# Linear model: Least-Squares Fit ################# +g.breaks <- g.hist$breaks[-c(1)] # remove 0 +g.probs <- g.hist$density[-1] # make lengths match +# Need to clean up probabilities that are 0 +nz.probs.mask <- g.probs!=0 +g.breaks.clean <- g.breaks[nz.probs.mask] +g.probs.clean <- g.probs[nz.probs.mask] +#plot(log(g.breaks.clean), log(g.probs.clean)) +g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean)) +summary(g.fit) +alpha.LM <- coef(g.fit)[2] +lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3, lwd=3) +################# Max-Log-Likelihood ################# +n <- length(g.breaks.clean) +kmin <- g.breaks.clean[1] +alpha.ML <- 1 + n/sum(log(g.breaks.clean/kmin)) +alpha.ML +lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4, lwd=3) +plot(yeast) +hist(yeast) +hist(g.vec) +g.pois +g.mean +alpha.LM +alpha.ML +degree(g) +sort(degree(g)) +sort(degree(g),decreasing=FALSE) +sort(degree(g),decreasing=F) +sort(degree(g),decreasing=false) +sort(degree(g), decreasing = TRUE) +head(sort(degree(g), decreasing = TRUE)) +stddev(degree(g)) +sd(degree(g)) +tail(sort(degree(g), decreasing = TRUE)) +plot(log(g.breaks.clean), log(g.probs.clean)) +# Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course +# Degree Distribution +# Professor: Dr. McKinney, Spring 2022 +# Noah Schrick - 1492657 +library(igraph) +library(igraphdata) +data(yeast) +g <- yeast +g.netname <- "Yeast" +################# Set up Work ################# +g.vec <- degree(g) +g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname, +" Network")) +legend("topright", c("Guess", "Poisson", "Least-Squares Fit", +"Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6", +"#006CD1", "#E66100", "#D35FB7")) +g.mean <- mean(g.vec) +g.seq <- 0:max(g.vec) # x-axis +################# Guessing Alpha ################# +alpha.guess <- 1.5 +lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1, lwd=3) +################# Poisson ################# +g.pois <- dpois(g.seq, g.mean, log=F) +lines(g.seq, g.pois, col="#006CD1", lty=2, lwd=3) +################# Linear model: Least-Squares Fit ################# +g.breaks <- g.hist$breaks[-c(1)] # remove 0 +g.probs <- g.hist$density[-1] # make lengths match +# Need to clean up probabilities that are 0 +nz.probs.mask <- g.probs!=0 +g.breaks.clean <- g.breaks[nz.probs.mask] +g.probs.clean <- g.probs[nz.probs.mask] +plot(log(g.breaks.clean), log(g.probs.clean)) +g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean)) +summary(g.fit) +alpha.LM <- coef(g.fit)[2] +lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3, lwd=3) +################# Max-Log-Likelihood ################# +n <- length(g.breaks.clean) +kmin <- g.breaks.clean[1] +alpha.ML <- 1 + n/sum(log(g.breaks.clean/kmin)) +alpha.ML +lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4, lwd=3) +plot(log(g.breaks.clean), log(g.probs.clean)) +g.breaks.clean <- g.breaks[nz.probs.mask] +g.probs.clean <- g.probs[nz.probs.mask] +plot(log(g.breaks.clean), log(g.probs.clean)) +# Lab 5 for the University of Tulsa's CS-6643 Bioinformatics Course +# Gene Expression Statistical Learning +# Professor: Dr. McKinney, Fall 2022 +# Noah L. Schrick - 1492657 +## Set Working Directory to file directory - RStudio approach +setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) +#### 0: Process and filter data +# load gene expression data +load("sense.filtered.cpm.Rdata") # setwd! +# load phenotype (mdd/hc) data +subject.attrs <- read.csv("Demographic_symptom.csv", +stringsAsFactors = FALSE) +if (!require("dplyr")) install.packages("dplyr") +library(dplyr) +# grab intersecting X (subject ids) and Diag (Diagnosis) from columns +phenos.df <- subject.attrs %>% +filter(X %in% colnames(sense.filtered.cpm)) %>% +dplyr::select(X, Diag) +mddPheno <- as.factor(phenos.df$Diag) +# Normalized and transform +if (!require("preprocessCore")) install.packages("preprocessCore") +library(preprocessCore) +mddExprData_quantile <- normalize.quantiles(sense.filtered.cpm) +mddExprData_quantileLog2 <- log2(mddExprData_quantile) +# attach phenotype names and gene names to data +colnames(mddExprData_quantileLog2) <- mddPheno +rownames(mddExprData_quantileLog2) <- rownames(sense.filtered.cpm) +# coefficient of variation filter sd(x)/abs(mean(x)) +CoV_values <- apply(mddExprData_quantileLog2,1, +function(x) {sd(x)/abs(mean(x))}) +# smaller threshold, the higher the experimental effect relative to the +# measurement precision +sum(CoV_values<.045) +# there is one gene that has 0 variation -- remove +sd_values <- apply(mddExprData_quantileLog2,1, function(x) {sd(x)}) +rownames(mddExprData_quantileLog2)[sd_values==0] +# filter the data matrix +GxS.covfilter <- mddExprData_quantileLog2[CoV_values<.045 & sd_values>0,] +dim(GxS.covfilter) +# convert phenotype to factor +pheno.factor <- as.factor(colnames(GxS.covfilter)) +pheno.factor +str(pheno.factor) +levels(pheno.factor) +#### Part A: Logistic Regression +# make sure HC is the reference level +pheno.factor.relevel <- relevel(pheno.factor,"HC") +levels(pheno.factor.relevel) +# also rename levels "0"/"1" from 1/2 +levels(pheno.factor.relevel)[levels(pheno.factor.relevel)=="MDD"] <- 1 +levels(pheno.factor.relevel)[levels(pheno.factor.relevel)=="HC"] <- 0 +# Fit logistic model of first gene to phenotype data +gene.row <- 2 +gene.name <-rownames(GxS.covfilter)[gene.row] +gene.expr <- GxS.covfilter[gene.row,] +gene.fit <- glm(pheno.factor.relevel~gene.expr, family=binomial) +summary(gene.fit) +coeff.mat <- coef(summary(gene.fit)) +b0 <- coeff.mat[1,1] +b1 <- coeff.mat[2,1] +b1.pval <- coeff.mat[2,4] +summary(gene.fit) +coeff.mat <- coef(summary(gene.fit)) +b0 <- coeff.mat[1,1] +b0 +b1 +coeff.mat +b1.pval +## GLM +modelfn <- function(x){1/(1+exp(-(b0+b1*x)))} +g.min <- min(gene.expr) +g.max <- max(gene.expr) +curve(modelfn,g.min,g.max) # plot for domain of actual gene’s expression +abline(h=.5, lty=2) +curve(modelfn,3,8) # replot with extended domain to see the S shape +## Predict Function +Predicted.probs <- predict(gene.fit, gene.expr=gene.expr, +type="response") +if (!require("ggplot2")) install.packages("ggplot2") +library(ggplot2) +phenotype <- pheno.factor # just rename for legend +gene.gg.df <- data.frame(expression=gene.expr, +prediction=predicted.probs) +# plot predicted probabilities versus gene expression +p <- ggplot(data=gene.gg.df) +p <- p + geom_point(aes(x=expression, +y=prediction, +color=phenotype, # shape and color +shape=phenotype), # are true phenotype +size=3) +p <- p + geom_hline(yintercept = .5, linetype="dashed", color="red") +print(p) +abline(h=.5, lty=2) +Predicted.probs <- predict(gene.fit, gene.expr=gene.expr, +type="response") +if (!require("ggplot2")) install.packages("ggplot2") +library(ggplot2) +phenotype <- pheno.factor # just rename for legend +gene.gg.df <- data.frame(expression=gene.expr, +prediction=predicted.probs) +predicted.probs <- predict(gene.fit, gene.expr=gene.expr, +type="response") +if (!require("ggplot2")) install.packages("ggplot2") +library(ggplot2) +phenotype <- pheno.factor # just rename for legend +gene.gg.df <- data.frame(expression=gene.expr, +prediction=predicted.probs) +# plot predicted probabilities versus gene expression +p <- ggplot(data=gene.gg.df) +p <- p + geom_point(aes(x=expression, +y=prediction, +color=phenotype, # shape and color +shape=phenotype), # are true phenotype +size=3) +p <- p + geom_hline(yintercept = .5, linetype="dashed", color="red") +print(p) +abline(h=.5, lty=2) +predicted.probs <- predict(gene.fit, gene.expr=gene.expr, +type="response") +if (!require("ggplot2")) install.packages("ggplot2") +library(ggplot2) +phenotype <- pheno.factor # just rename for legend +gene.gg.df <- data.frame(expression=gene.expr, +prediction=predicted.probs) +# plot predicted probabilities versus gene expression +p <- ggplot(data=gene.gg.df) +p <- p + geom_point(aes(x=expression, +y=prediction, +color=phenotype, # shape and color +shape=phenotype), # are true phenotype +size=3) +p <- p + geom_hline(yintercept = .5, linetype="dashed", color="red") +print(p) +abline(h=.5, lty=2) +## Predict Function +predicted.probs <- predict(gene.fit, gene.expr=gene.expr, +type="response") +if (!require("ggplot2")) install.packages("ggplot2") +library(ggplot2) +phenotype <- pheno.factor # just rename for legend +gene.gg.df <- data.frame(expression=gene.expr, +prediction=predicted.probs) +# plot predicted probabilities versus gene expression +p <- ggplot(data=gene.gg.df) +p <- p + geom_point(aes(x=expression, +y=prediction, +color=phenotype, # shape and color +shape=phenotype), # are true phenotype +size=3) +p <- p + geom_hline(yintercept = .5, linetype="dashed", color="red") +print(p) +abline(h=.5, lty=2) +plot(p) +abline(h=.5, lty=2) +print(p) + abline(h=.5, lty=2) +# Lab 5 for the University of Tulsa's CS-6643 Bioinformatics Course +# Gene Expression Statistical Learning +# Professor: Dr. McKinney, Fall 2022 +# Noah L. Schrick - 1492657 +## Set Working Directory to file directory - RStudio approach +setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) +#### 0: Process and filter data +# load gene expression data +load("sense.filtered.cpm.Rdata") # setwd! +# load phenotype (mdd/hc) data +subject.attrs <- read.csv("Demographic_symptom.csv", +stringsAsFactors = FALSE) +if (!require("dplyr")) install.packages("dplyr") +library(dplyr) +# grab intersecting X (subject ids) and Diag (Diagnosis) from columns +phenos.df <- subject.attrs %>% +filter(X %in% colnames(sense.filtered.cpm)) %>% +dplyr::select(X, Diag) +mddPheno <- as.factor(phenos.df$Diag) +# Normalized and transform +if (!require("preprocessCore")) install.packages("preprocessCore") +library(preprocessCore) +mddExprData_quantile <- normalize.quantiles(sense.filtered.cpm) +mddExprData_quantileLog2 <- log2(mddExprData_quantile) +# attach phenotype names and gene names to data +colnames(mddExprData_quantileLog2) <- mddPheno +rownames(mddExprData_quantileLog2) <- rownames(sense.filtered.cpm) +# coefficient of variation filter sd(x)/abs(mean(x)) +CoV_values <- apply(mddExprData_quantileLog2,1, +function(x) {sd(x)/abs(mean(x))}) +# smaller threshold, the higher the experimental effect relative to the +# measurement precision +sum(CoV_values<.045) +# there is one gene that has 0 variation -- remove +sd_values <- apply(mddExprData_quantileLog2,1, function(x) {sd(x)}) +rownames(mddExprData_quantileLog2)[sd_values==0] +# filter the data matrix +GxS.covfilter <- mddExprData_quantileLog2[CoV_values<.045 & sd_values>0,] +dim(GxS.covfilter) +# convert phenotype to factor +pheno.factor <- as.factor(colnames(GxS.covfilter)) +pheno.factor +str(pheno.factor) +levels(pheno.factor) +#### Part A: Logistic Regression +# make sure HC is the reference level +pheno.factor.relevel <- relevel(pheno.factor,"HC") +levels(pheno.factor.relevel) +# also rename levels "0"/"1" from 1/2 +levels(pheno.factor.relevel)[levels(pheno.factor.relevel)=="MDD"] <- 1 +levels(pheno.factor.relevel)[levels(pheno.factor.relevel)=="HC"] <- 0 +## Fit logistic model of first gene to phenotype data +gene.row <- 2 +gene.name <-rownames(GxS.covfilter)[gene.row] +gene.expr <- GxS.covfilter[gene.row,] +gene.fit <- glm(pheno.factor.relevel~gene.expr, family=binomial) +summary(gene.fit) +coeff.mat <- coef(summary(gene.fit)) +b0 <- coeff.mat[1,1] +b1 <- coeff.mat[2,1] +b1.pval <- coeff.mat[2,4] +## GLM +modelfn <- function(x){1/(1+exp(-(b0+b1*x)))} +g.min <- min(gene.expr) +g.max <- max(gene.expr) +curve(modelfn,g.min,g.max) # plot for domain of actual gene’s expression +abline(h=.5, lty=2) +curve(modelfn,3,8) # replot with extended domain to see the S shape +## Predict Function +predicted.probs <- predict(gene.fit, gene.expr=gene.expr, +type="response") +if (!require("ggplot2")) install.packages("ggplot2") +library(ggplot2) +phenotype <- pheno.factor # just rename for legend +gene.gg.df <- data.frame(expression=gene.expr, +prediction=predicted.probs) +# plot predicted probabilities versus gene expression +p <- ggplot(data=gene.gg.df) +p <- p + geom_point(aes(x=expression, +y=prediction, +color=phenotype, # shape and color +shape=phenotype), # are true phenotype +size=3) +p <- p + geom_hline(yintercept = .5, linetype="dashed", color="red") +print(p) + abline(h=.5, lty=2) +# vector of logistic output probabilities for this model (glm/eq. above) +# prob = .5 is the threshold for prediction class = 1 vs 0 +predicted.class <- as.integer(predicted.probs >=.5) # predicted class +pheno.factor.relevel # true class +# vector of True (correctly predicted) and False (wrongly predicted) +correct.classified <- predicted.class == pheno.factor.relevel +# sum correct.classified +table(pheno.factor.relevel,predicted.class) +predicted.probs +predicted.probs[1] +as.integer(predicted.probs[1] >= 0.5) +# sum correct.classified +table(pheno.factor.relevel,predicted.class) +pheno.factor.relevel +correct.classified +length(correct.classified) +sum(correct.classified == "TRUE") +# accuracy: +correct.acc <- sum(correct.classified == "TRUE") / length(correct.classified) +correct.acc +lr.fn(2) +# logistic regression function for one gene row +lr.fn <- function(i){ +gene=rownames(GxS.covfilter)[i] +gene.expr <- GxS.covfilter[i,] +gene.fit <- glm(pheno.factor.relevel~gene.expr, +family=binomial) +coeff.mat <- coef(summary(gene.fit)) +b1 <- coeff.mat[2,1] +b1.pval <- coeff.mat[2,4] +coefvec <- gene.fit$estimate # intercept, gene +pvec <- gene.fit$p.value # intercept, gene +c(gene, b1, b1.pval) +} +lr.fn(2) +# initialize results matrix +num.genes<-nrow(GxS.covfilter) +lr.results.mat <- matrix(0, nrow=nrow(GxS.covfilter), ncol=3) +# for loop the function to all genes +for (i in 1:num.genes){ +lr.results.mat[i,] <- lr.fn(i) +} +lr.results.df <- data.frame(lr.results.mat) +colnames(lr.results.df) <- c("gene", "b1", "p.val") +# sort results b1 coefficient p-value +library(dplyr) +lr.results.sorted <- lr.results.df %>% +mutate_at("p.val", as.character) %>% +mutate_at("p.val", as.numeric) %>% +arrange(p.val) +lr.results.sorted[1:10,] +lr.results.sorted[1,] +lr.results.sorted[1,1] +gene.row <- which(rownames(GxS.covfilter)==lr.results.sorted[1,1]) +gene.row +gene.expr <- GxS.covfilter[gene.row,] +gene.fit <- glm(pheno.factor.relevel~gene.expr, +family=binomial) +predicted.probs <- predict(gene.fit, gene.expr=gene.expr, type="response") +gene.gg.df +# Plotting Code +gene.gg.df <- data.frame(expression=gene.expr, +prediction=predicted.probs) +# plot predicted probabilities versus gene expression +p <- ggplot(data=gene.gg.df) +p <- p + geom_point(aes(x=expression, +y=prediction, +color=phenotype, # shape and color +shape=phenotype), # are true phenotype +size=3) +p <- p + geom_hline(yintercept = .5, linetype="dashed", color="red") +print(p) + abline(h=.5, lty=2) +p <- p + geom_point(aes(x=expression, +y=prediction, +color=phenotype, # shape and color +shape=phenotype, +main=lr.results.sorted[1,1]), # are true phenotype +size=3) +?aes +?ggplot +p <- p + geom_hline(yintercept = .5, linetype="dashed", color="red") + +ggtitle(lr.results.sorted[1,1]) +print(p) + abline(h=.5, lty=2) +# code for accuracy +# vector of logistic output probabilities for this model (glm/eq. above) +# prob = .5 is the threshold for prediction class = 1 vs 0 +predicted.class <- as.integer(predicted.probs >=.5) # predicted class +pheno.factor.relevel # true class +# vector of True (correctly predicted) and False (wrongly predicted) +correct.classified <- predicted.class == pheno.factor.relevel +# sum correct.classified +table(pheno.factor.relevel,predicted.class) +# accuracy: +correct.acc <- sum(correct.classified == "TRUE") / length(correct.classified) +correct.acc +if (!require("glmnet")) install.packages("glmnet") +library(glmnet) +# alpha=0 means ridge, alpha=1 means lasso +# keep has to do with storing the results from cross-validiation +glmnet.model <- cv.glmnet(t(GxS.covfilter), pheno.factor.relevel, alpha=1, +family="binomial", type.measure="class", keep=T) +plot(glmnet.model) # plot of CV error vs lambda penalty +glmnet.model$lambda.min # lambda that minimizes the CV error +glmnet.model +# Convient way to extract non-zero +if (!require("coefplot")) install.packages("coefplot") +library(coefplot) +extract.coef(glmnet.model) +# get the penalized regression coefficients +glmnet.coeffs <- predict(glmnet.model,type="coefficients", +s=glmnet.model$lambda.min) +top_glmnet <- data.frame(as.matrix(glmnet.coeffs)) %>% +tibble::rownames_to_column(var = "features") %>% +filter(s1!=0, features!="(Intercept)") +top_glmnet_features <- top_glmnet %>% pull(features) +# apply the glmnet model to the data to get class predictions +glmnet.predicted <- predict(glmnet.model, +s=glmnet.model$lambda.min, # lambda to use +type="class", # classify +newx=t(GxS.covfilter)) # apply to original +glmnet.accuracy <- mean(factor(glmnet.predicted)==pheno.factor.relevel) +glmnet.accuracy +# get the coefficients that are not zero (these are the selected variables) +glmnet.nonzero.coeffs <- #glmnet.coeffs@Dimnames[[1]][which(glmnet.coeffs!=0)] +glmnet.nonzero.coeffs +# get the coefficients that are not zero (these are the selected variables) +glmnet.nonzero.coeffs <- #glmnet.coeffs@Dimnames[[1]][which(glmnet.coeffs!=0)] +glmnet.nonzero.coeffs +# get the coefficients that are not zero (these are the selected variables) +glmnet.nonzero.coeffs <- glmnet.coeffs@Dimnames[[1]][which(glmnet.coeffs!=0)] +glmnet.nonzero.coeffs +#### Part E: NPDR +if (!require("devtools")) install.packages("devtools") +library(devtools) +install_github("insilico/npdr") +library(npdr) #https://github.com/insilico/npdr +SxG.dat <- t(GxS.covfilter) +npdr.MDD.results <- npdr(pheno.factor, SxG.dat, +regression.type="binomial", +attr.diff.type="numeric-abs", +nbd.metric = "manhattan", +knn=knnSURF.balanced(pheno.factor, sd.frac = .5), +dopar.nn = F, dopar.reg=F, +padj.method="bonferroni", verbose = T) +colnames(npdr.MDD.results) # column names of the output +library(dplyr) +# gets genes (attributes/att) with FDR-adjusted p-value<.05 +top.p05.npdr <- npdr.MDD.results %>% filter(pval.adj<.05) %>% pull(att) +top.p05.npdr +# grab top 200, remove NA, remove "", get att col +top.npdr <- npdr.MDD.results %>% dplyr::slice(1:200) %>% +filter(att!="") %>% pull(att) +write.table(top.npdr,row.names=F,col.names=F,quote=F) +# grab top 200, remove NA, remove "", get att col +top.npdr <- npdr.MDD.results %>% dplyr::slice(1:200) %>% +filter(att!="") filter(pval.adj<1) %>% pull(att) +# grab top 200, remove NA, remove "", get att col +top.npdr <- npdr.MDD.results %>% dplyr::slice(1:200) %>% +filter(pval.adj<1) %>% pull(att) +write.table(top.npdr,row.names=F,col.names=F,quote=F) diff --git a/.~lock.lab5_expression3.docx# b/.~lock.lab5_expression3.docx# index dbc8386..9390f1d 100644 --- a/.~lock.lab5_expression3.docx# +++ b/.~lock.lab5_expression3.docx# @@ -1 +1 @@ -,noah,NovaArchSys,13.10.2022 18:05,file:///home/noah/.config/libreoffice/4; \ No newline at end of file +,noah,NovaArchSys,13.10.2022 18:27,file:///home/noah/.config/libreoffice/4; \ No newline at end of file diff --git a/Schrick-Noah_CS-6643_Lab-5.R b/Schrick-Noah_CS-6643_Lab-5.R index 48e295c..25e1934 100644 --- a/Schrick-Noah_CS-6643_Lab-5.R +++ b/Schrick-Noah_CS-6643_Lab-5.R @@ -59,7 +59,7 @@ levels(pheno.factor.relevel) levels(pheno.factor.relevel)[levels(pheno.factor.relevel)=="MDD"] <- 1 levels(pheno.factor.relevel)[levels(pheno.factor.relevel)=="HC"] <- 0 -# Fit logistic model of first gene to phenotype data +## Fit logistic model of first gene to phenotype data gene.row <- 2 gene.name <-rownames(GxS.covfilter)[gene.row] gene.expr <- GxS.covfilter[gene.row,] @@ -68,4 +68,183 @@ summary(gene.fit) coeff.mat <- coef(summary(gene.fit)) b0 <- coeff.mat[1,1] b1 <- coeff.mat[2,1] -b1.pval <- coeff.mat[2,4] \ No newline at end of file +b1.pval <- coeff.mat[2,4] + +## GLM +modelfn <- function(x){1/(1+exp(-(b0+b1*x)))} +g.min <- min(gene.expr) +g.max <- max(gene.expr) +curve(modelfn,g.min,g.max) # plot for domain of actual gene’s expression +abline(h=.5, lty=2) +curve(modelfn,3,8) # replot with extended domain to see the S shape + +## Predict Function +predicted.probs <- predict(gene.fit, gene.expr=gene.expr, + type="response") +if (!require("ggplot2")) install.packages("ggplot2") +library(ggplot2) +phenotype <- pheno.factor # just rename for legend +gene.gg.df <- data.frame(expression=gene.expr, + prediction=predicted.probs) +# plot predicted probabilities versus gene expression +p <- ggplot(data=gene.gg.df) +p <- p + geom_point(aes(x=expression, + y=prediction, + color=phenotype, # shape and color + shape=phenotype), # are true phenotype + size=3) +p <- p + geom_hline(yintercept = .5, linetype="dashed", color="red") +print(p) + abline(h=.5, lty=2) + + +#### Part B: Logistic Regression as Classification +# vector of logistic output probabilities for this model (glm/eq. above) +# prob = .5 is the threshold for prediction class = 1 vs 0 +predicted.class <- as.integer(predicted.probs >=.5) # predicted class +pheno.factor.relevel # true class +# vector of True (correctly predicted) and False (wrongly predicted) +correct.classified <- predicted.class == pheno.factor.relevel +# sum correct.classified +table(pheno.factor.relevel,predicted.class) +# accuracy: +correct.acc <- sum(correct.classified == "TRUE") / length(correct.classified) + + +#### Part C: Logistic Regression Ranking of all Genes +# logistic regression function for one gene row +lr.fn <- function(i){ + gene=rownames(GxS.covfilter)[i] + gene.expr <- GxS.covfilter[i,] + gene.fit <- glm(pheno.factor.relevel~gene.expr, + family=binomial) + coeff.mat <- coef(summary(gene.fit)) + b1 <- coeff.mat[2,1] + b1.pval <- coeff.mat[2,4] + coefvec <- gene.fit$estimate # intercept, gene + pvec <- gene.fit$p.value # intercept, gene + c(gene, b1, b1.pval) +} +# Testing with just one row +lr.fn(2) + +# initialize results matrix +num.genes<-nrow(GxS.covfilter) +lr.results.mat <- matrix(0, nrow=nrow(GxS.covfilter), ncol=3) +# for loop the function to all genes +for (i in 1:num.genes){ + lr.results.mat[i,] <- lr.fn(i) +} +lr.results.df <- data.frame(lr.results.mat) +colnames(lr.results.df) <- c("gene", "b1", "p.val") + +# sort results b1 coefficient p-value +library(dplyr) +lr.results.sorted <- lr.results.df %>% + mutate_at("p.val", as.character) %>% + mutate_at("p.val", as.numeric) %>% + arrange(p.val) +lr.results.sorted[1:10,] + +## Scatter plot of model fit and accuracy computation of top gene +# Get top gene data +gene.row <- which(rownames(GxS.covfilter)==lr.results.sorted[1,1]) +gene.expr <- GxS.covfilter[gene.row,] +gene.fit <- glm(pheno.factor.relevel~gene.expr, + family=binomial) +predicted.probs <- predict(gene.fit, gene.expr=gene.expr, type="response") +# Plotting Code +gene.gg.df <- data.frame(expression=gene.expr, + prediction=predicted.probs) +# plot predicted probabilities versus gene expression +p <- ggplot(data=gene.gg.df) +p <- p + geom_point(aes(x=expression, + y=prediction, + color=phenotype, # shape and color + shape=phenotype), # are true phenotype + size=3) +p <- p + geom_hline(yintercept = .5, linetype="dashed", color="red") + + ggtitle(lr.results.sorted[1,1]) +print(p) + abline(h=.5, lty=2) + +# code for accuracy +# vector of logistic output probabilities for this model (glm/eq. above) +# prob = .5 is the threshold for prediction class = 1 vs 0 +predicted.class <- as.integer(predicted.probs >=.5) # predicted class +pheno.factor.relevel # true class +# vector of True (correctly predicted) and False (wrongly predicted) +correct.classified <- predicted.class == pheno.factor.relevel +# sum correct.classified +table(pheno.factor.relevel,predicted.class) +# accuracy: +correct.acc <- sum(correct.classified == "TRUE") / length(correct.classified) +correct.acc + + +#### Part D: Statistical Learning +if (!require("glmnet")) install.packages("glmnet") +library(glmnet) +# alpha=0 means ridge, alpha=1 means lasso +# keep has to do with storing the results from cross-validiation +glmnet.model <- cv.glmnet(t(GxS.covfilter), pheno.factor.relevel, alpha=1, + family="binomial", type.measure="class", keep=T) +plot(glmnet.model) # plot of CV error vs lambda penalty +glmnet.model$lambda.min # lambda that minimizes the CV error + +# Easy way to extract non-zero coeffs +if (!require("coefplot")) install.packages("coefplot") +library(coefplot) +extract.coef(glmnet.model) + +# get the penalized regression coefficients +glmnet.coeffs <- predict(glmnet.model,type="coefficients", + s=glmnet.model$lambda.min) + +# get the coefficients that are not zero (these are the selected variables) +glmnet.nonzero.coeffs <- glmnet.coeffs@Dimnames[[1]][which(glmnet.coeffs!=0)] +glmnet.nonzero.coeffs + +top_glmnet <- data.frame(as.matrix(glmnet.coeffs)) %>% + tibble::rownames_to_column(var = "features") %>% + filter(s1!=0, features!="(Intercept)") + +top_glmnet_features <- top_glmnet %>% pull(features) + +# apply the glmnet model to the data to get class predictions +glmnet.predicted <- predict(glmnet.model, + s=glmnet.model$lambda.min, # lambda to use + type="class", # classify + newx=t(GxS.covfilter)) # apply to original +glmnet.accuracy <- mean(factor(glmnet.predicted)==pheno.factor.relevel) +glmnet.accuracy +table(glmnet.predicted,pheno.factor.relevel) # confusion matrix + +#### Part E: NPDR +if (!require("devtools")) install.packages("devtools") +library(devtools) +install_github("insilico/npdr") + +library(npdr) #https://github.com/insilico/npdr +SxG.dat <- t(GxS.covfilter) +npdr.MDD.results <- npdr(pheno.factor, SxG.dat, + regression.type="binomial", + attr.diff.type="numeric-abs", + nbd.metric = "manhattan", + knn=knnSURF.balanced(pheno.factor, sd.frac = .5), + dopar.nn = F, dopar.reg=F, + padj.method="bonferroni", verbose = T) +colnames(npdr.MDD.results) # column names of the output + +library(dplyr) +# gets genes (attributes/att) with FDR-adjusted p-value<.05 +top.p05.npdr <- npdr.MDD.results %>% filter(pval.adj<.05) %>% pull(att) +top.p05.npdr + + +# grab top 200, remove NA, remove "", get att col +top.npdr <- npdr.MDD.results %>% dplyr::slice(1:200) %>% + filter(pval.adj<1) %>% pull(att) + +write.table(top.npdr,row.names=F,col.names=F,quote=F) + + + diff --git a/lab5_expression3.docx b/lab5_expression3.docx index c8805b6..15df232 100644 Binary files a/lab5_expression3.docx and b/lab5_expression3.docx differ