################# Max-Log-Likelihood ################# n <- length(g.breaks.clean) kmin <- g.breaks.clean[1] alpha.ML <- 1 + n/sum(log(g.breaks.clean)/kmin) alpha.ML lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4) # Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course # Degree Distribution # Professor: Dr. McKinney, Spring 2022 # Noah Schrick - 1492657 library(igraph) library(igraphdata) data(yeast) g <- yeast g.netname <- "Yeast" ################# Set up Work ################# g.vec <- degree(g) g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname, " Network")) legend("topright", c("Guess", "Poisson", "Least-Squares Fit", "Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6", "#006CD1", "#E66100", "#D35FB7")) g.mean <- mean(g.vec) g.seq <- 0:max(g.vec) # x-axis ################# Guessing Alpha ################# alpha.guess <- 1.5 lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1) ################# Poisson ################# g.pois <- dpois(g.seq, g.mean, log=F) lines(g.seq, g.pois, col="#006CD1", lty=2) ################# Linear model: Least-Squares Fit ################# #g.breaks <- g.hist$breaks[-c(1)] # remove 0 g.breaks <- g.hist$breaks # remove 0 g.probs <- g.hist$density[-1] # make lengths match # Need to clean up probabilities that are 0 nz.probs.mask <- g.probs!=0 g.breaks.clean <- g.breaks[nz.probs.mask] g.probs.clean <- g.breaks[nz.probs.mask] #plot(log(g.breaks.clean), log(g.probs.clean)) g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean)) summary(g.fit) alpha.LM <- coef(g.fit)[2] lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3) ################# Max-Log-Likelihood ################# n <- length(g.breaks.clean) kmin <- g.breaks.clean[1] alpha.ML <- 1 + n/sum(log(g.breaks.clean)/kmin) alpha.ML lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4) # Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course # Degree Distribution # Professor: Dr. McKinney, Spring 2022 # Noah Schrick - 1492657 library(igraph) library(igraphdata) data(yeast) g <- yeast g.netname <- "Yeast" ################# Set up Work ################# g.vec <- degree(g) g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname, " Network")) legend("topright", c("Guess", "Poisson", "Least-Squares Fit", "Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6", "#006CD1", "#E66100", "#D35FB7")) g.mean <- mean(g.vec) g.seq <- 0:max(g.vec) # x-axis ################# Guessing Alpha ################# alpha.guess <- 1.5 lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1) ################# Poisson ################# g.pois <- dpois(g.seq, g.mean, log=F) lines(g.seq, g.pois, col="#006CD1", lty=2) ################# Linear model: Least-Squares Fit ################# g.breaks <- g.hist$breaks[-c(1)] # remove 0 g.probs <- g.hist$density[-1] # make lengths match # Need to clean up probabilities that are 0 nz.probs.mask <- g.probs!=0 g.breaks.clean <- g.breaks[nz.probs.mask] g.probs.clean <- g.probs[nz.probs.mask] #plot(log(g.breaks.clean), log(g.probs.clean)) g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean)) summary(g.fit) alpha.LM <- coef(g.fit)[2] lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3) ################# Max-Log-Likelihood ################# n <- length(g.breaks.clean) kmin <- g.breaks.clean[1] alpha.ML <- 1 + n/sum(log(g.breaks.clean)/kmin) alpha.ML lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4) alpha.LM # Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course # Degree Distribution # Professor: Dr. McKinney, Spring 2022 # Noah Schrick - 1492657 library(igraph) library(igraphdata) data(yeast) g <- yeast g.netname <- "Yeast" ################# Set up Work ################# g.vec <- degree(g) g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname, " Network")) legend("topright", c("Guess", "Poisson", "Least-Squares Fit", "Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6", "#006CD1", "#E66100", "#D35FB7")) g.mean <- mean(g.vec) g.seq <- 0:max(g.vec) # x-axis ################# Guessing Alpha ################# alpha.guess <- 1.5 lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1) ################# Poisson ################# g.pois <- dpois(g.seq, g.mean, log=F) lines(g.seq, g.pois, col="#006CD1", lty=2) ################# Linear model: Least-Squares Fit ################# g.breaks <- g.hist$breaks[-c(1)] # remove 0 g.probs <- g.hist$density[-1] # make lengths match # Need to clean up probabilities that are 0 nz.probs.mask <- g.probs!=0 g.breaks.clean <- g.breaks[nz.probs.mask] g.probs.clean <- g.probs[nz.probs.mask] #plot(log(g.breaks.clean), log(g.probs.clean)) g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean)) summary(g.fit) alpha.LM <- coef(g.fit)[2] lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3) ################# Max-Log-Likelihood ################# n <- length(g.breaks.clean) kmin <- g.breaks.clean[1] alpha.ML <- 1 + n/sum(log(g.breaks.clean/kmin)) alpha.ML lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4) # Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course # Degree Distribution # Professor: Dr. McKinney, Spring 2022 # Noah Schrick - 1492657 library(igraph) library(igraphdata) data(yeast) g <- yeast g.netname <- "Yeast" ################# Set up Work ################# g.vec <- degree(g) g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname, " Network")) legend("topright", c("Guess", "Poisson", "Least-Squares Fit", "Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6", "#006CD1", "#E66100", "#D35FB7")) g.mean <- mean(g.vec) g.seq <- 0:max(g.vec) # x-axis ################# Guessing Alpha ################# alpha.guess <- 1.5 lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1, lwd=5) ################# Poisson ################# g.pois <- dpois(g.seq, g.mean, log=F) lines(g.seq, g.pois, col="#006CD1", lty=2) ################# Linear model: Least-Squares Fit ################# g.breaks <- g.hist$breaks[-c(1)] # remove 0 g.probs <- g.hist$density[-1] # make lengths match # Need to clean up probabilities that are 0 nz.probs.mask <- g.probs!=0 g.breaks.clean <- g.breaks[nz.probs.mask] g.probs.clean <- g.probs[nz.probs.mask] #plot(log(g.breaks.clean), log(g.probs.clean)) g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean)) summary(g.fit) alpha.LM <- coef(g.fit)[2] lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3) ################# Max-Log-Likelihood ################# n <- length(g.breaks.clean) kmin <- g.breaks.clean[1] alpha.ML <- 1 + n/sum(log(g.breaks.clean/kmin)) alpha.ML lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4) # Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course # Degree Distribution # Professor: Dr. McKinney, Spring 2022 # Noah Schrick - 1492657 library(igraph) library(igraphdata) data(yeast) g <- yeast g.netname <- "Yeast" ################# Set up Work ################# g.vec <- degree(g) g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname, " Network")) legend("topright", c("Guess", "Poisson", "Least-Squares Fit", "Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6", "#006CD1", "#E66100", "#D35FB7")) g.mean <- mean(g.vec) g.seq <- 0:max(g.vec) # x-axis ################# Guessing Alpha ################# alpha.guess <- 1.5 lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1, lwd=3) ################# Poisson ################# g.pois <- dpois(g.seq, g.mean, log=F) lines(g.seq, g.pois, col="#006CD1", lty=2) ################# Linear model: Least-Squares Fit ################# g.breaks <- g.hist$breaks[-c(1)] # remove 0 g.probs <- g.hist$density[-1] # make lengths match # Need to clean up probabilities that are 0 nz.probs.mask <- g.probs!=0 g.breaks.clean <- g.breaks[nz.probs.mask] g.probs.clean <- g.probs[nz.probs.mask] #plot(log(g.breaks.clean), log(g.probs.clean)) g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean)) summary(g.fit) alpha.LM <- coef(g.fit)[2] lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3) ################# Max-Log-Likelihood ################# n <- length(g.breaks.clean) kmin <- g.breaks.clean[1] alpha.ML <- 1 + n/sum(log(g.breaks.clean/kmin)) alpha.ML lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4) # Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course # Degree Distribution # Professor: Dr. McKinney, Spring 2022 # Noah Schrick - 1492657 library(igraph) library(igraphdata) data(yeast) g <- yeast g.netname <- "Yeast" ################# Set up Work ################# g.vec <- degree(g) g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname, " Network")) legend("topright", c("Guess", "Poisson", "Least-Squares Fit", "Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6", "#006CD1", "#E66100", "#D35FB7")) g.mean <- mean(g.vec) g.seq <- 0:max(g.vec) # x-axis ################# Guessing Alpha ################# alpha.guess <- 1.5 lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1, lwd=3) ################# Poisson ################# g.pois <- dpois(g.seq, g.mean, log=F) lines(g.seq, g.pois, col="#006CD1", lty=2, lwd=3) ################# Linear model: Least-Squares Fit ################# g.breaks <- g.hist$breaks[-c(1)] # remove 0 g.probs <- g.hist$density[-1] # make lengths match # Need to clean up probabilities that are 0 nz.probs.mask <- g.probs!=0 g.breaks.clean <- g.breaks[nz.probs.mask] g.probs.clean <- g.probs[nz.probs.mask] #plot(log(g.breaks.clean), log(g.probs.clean)) g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean)) summary(g.fit) alpha.LM <- coef(g.fit)[2] lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3, lwd=3) ################# Max-Log-Likelihood ################# n <- length(g.breaks.clean) kmin <- g.breaks.clean[1] alpha.ML <- 1 + n/sum(log(g.breaks.clean/kmin)) alpha.ML lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4, lwd=3) plot(yeast) hist(yeast) hist(g.vec) g.pois g.mean alpha.LM alpha.ML degree(g) sort(degree(g)) sort(degree(g),decreasing=FALSE) sort(degree(g),decreasing=F) sort(degree(g),decreasing=false) sort(degree(g), decreasing = TRUE) head(sort(degree(g), decreasing = TRUE)) stddev(degree(g)) sd(degree(g)) tail(sort(degree(g), decreasing = TRUE)) plot(log(g.breaks.clean), log(g.probs.clean)) # Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course # Degree Distribution # Professor: Dr. McKinney, Spring 2022 # Noah Schrick - 1492657 library(igraph) library(igraphdata) data(yeast) g <- yeast g.netname <- "Yeast" ################# Set up Work ################# g.vec <- degree(g) g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname, " Network")) legend("topright", c("Guess", "Poisson", "Least-Squares Fit", "Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6", "#006CD1", "#E66100", "#D35FB7")) g.mean <- mean(g.vec) g.seq <- 0:max(g.vec) # x-axis ################# Guessing Alpha ################# alpha.guess <- 1.5 lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1, lwd=3) ################# Poisson ################# g.pois <- dpois(g.seq, g.mean, log=F) lines(g.seq, g.pois, col="#006CD1", lty=2, lwd=3) ################# Linear model: Least-Squares Fit ################# g.breaks <- g.hist$breaks[-c(1)] # remove 0 g.probs <- g.hist$density[-1] # make lengths match # Need to clean up probabilities that are 0 nz.probs.mask <- g.probs!=0 g.breaks.clean <- g.breaks[nz.probs.mask] g.probs.clean <- g.probs[nz.probs.mask] plot(log(g.breaks.clean), log(g.probs.clean)) g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean)) summary(g.fit) alpha.LM <- coef(g.fit)[2] lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3, lwd=3) ################# Max-Log-Likelihood ################# n <- length(g.breaks.clean) kmin <- g.breaks.clean[1] alpha.ML <- 1 + n/sum(log(g.breaks.clean/kmin)) alpha.ML lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4, lwd=3) plot(log(g.breaks.clean), log(g.probs.clean)) g.breaks.clean <- g.breaks[nz.probs.mask] g.probs.clean <- g.probs[nz.probs.mask] plot(log(g.breaks.clean), log(g.probs.clean)) ## Turn into function to repeat for all SNPs LR.fn <- function(i){ lr <- glm(pheno.factor~genotypes.df[[i]],family=binomial) td.lr <- tidy(lr) pval_vec <- td.lr$p.value # vector of $p.value from td.lr coef_vec <- td.lr$estimate # vector of $estimate cbind(snp.ids[i], coef_vec[1], coef_vec[2], coef_vec[3], pval_vec[1], pval_vec[2], pval_vec[3]) } # apply Logistic Regression model to all SNPs LRresults.df <- data.frame(t(sapply(1:ncol(genotypes.df), LR.fn))) # Lab 6 for the University of Tulsa's CS-6643 Bioinformatics Course # GWAS # Professor: Dr. McKinney, Fall 2022 # Noah L. Schrick - 1492657 ## Set Working Directory to file directory - RStudio approach setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) #### Part 0: PLINK if (!require("BiocManager")) install.packages("BiocManager") library(BiocManager) if (!require("snpStats")) BiocManager::install("snpStats") library(snpStats) ex.data <- read.pedfile(file="extra.ped", snps="extra.map") ex.data$fam phenotype <- ex.data$fam$affected-1 # change pheno from 1/2 to 0/1 genotypes <- ex.data$genotypes # encoded as AA/AB/BB snp.ids <- as.character(ex.data$map$snp.names) genotypes.df <- data.frame(as(genotypes, "character")) colnames(genotypes.df) <- snp.ids # observed contingency table for SNP rs630969 table(phenotype,genotypes.df$rs630969, dnn=c("phenotype","genotype")) # dnn dimension names of table dim(genotypes.df) #### Part A: Chi-Square Test # creates list of observed contingency tables for all SNPs # sapply acts on each column of genotypes.df observed.tables.list <- sapply(genotypes.df, function(x) table(phenotype,x,dnn=c("phenotype","genotype"))) test.table <- observed.tables.list$rs634228 genoMarg.vec <- colSums(test.table) # margin vector phenoMarg.vec <- rowSums(test.table) # margin vector totalSubj <- sum(genoMarg.vec) # total subjects expect.test <- outer(phenoMarg.vec,genoMarg.vec/totalSubj,'*') ## Fisher Test # Fisher exact test (chi-square test) for all SNPs fish_fn <- function(i){ cbind(snp.ids[i], fisher.test(observed.tables.list[[i]])$p.value) } # apply fisher exact test to all SNPs fish.df <- data.frame(t(sapply(1:ncol(genotypes.df), fish_fn))) colnames(fish.df) <- c("rs", "p_value") # sort SNPs by Fisher exact p-value if (!require("dplyr")) install.packages("dplyr") library(dplyr) fish.results <- fish.df %>% mutate_at("p_value", as.character) %>% mutate_at("p_value", as.numeric) %>% arrange(p_value) print(fish.results) #### Part B: Logistic regression with genotypes if (!require("ggplot2")) BiocManager::install("ggplot2") library(ggplot2) i<-8 A1<-ex.data$map$allele.1[i] A2<-ex.data$map$allele.2[i] geno.labels <- c(paste(A1,A1,sep=""),paste(A1,A2,sep=""),paste(A2,A2,sep="")) ## Plot with ggplot2 # data from the one SNP oneSNP.df <- data.frame(cbind(genotypes.df[[i]],as.numeric(phenotype))) colnames(oneSNP.df) <- c("genotypes","phenotypes") lr.plot <- ggplot(oneSNP.df, aes(x=genotypes, y=phenotypes)) + geom_point(position = position_jitter(w = 0.1, h = 0.1)) + # stat_smooth plots the probability based on the model stat_smooth(method="glm", method.args = list(family = "binomial")) #+ #xlim(geno.labels) + ggtitle(snp.ids[i]) print(lr.plot) ## Fit a logistic regression model of phenotype with SNP in the 8th column if (!require("broom")) install.packages("broom") library(broom) # for tidy function pheno.factor <- factor(phenotype,labels=c(0,1)) i<-8 lr <- glm(pheno.factor~genotypes.df[[i]],family=binomial) td.lr <- tidy(lr) pval_vec <- td.lr$p.value # vector of $p.value from td.lr coef_vec <- td.lr$estimate # vector of $estimate cbind(snp.ids[i], coef_vec[1], coef_vec[2], coef_vec[3], pval_vec[1], pval_vec[2], pval_vec[3]) ## Turn into function to repeat for all SNPs LR.fn <- function(i){ lr <- glm(pheno.factor~genotypes.df[[i]],family=binomial) td.lr <- tidy(lr) pval_vec <- td.lr$p.value # vector of $p.value from td.lr coef_vec <- td.lr$estimate # vector of $estimate cbind(snp.ids[i], coef_vec[1], coef_vec[2], coef_vec[3], pval_vec[1], pval_vec[2], pval_vec[3]) } # apply Logistic Regression model to all SNPs LRresults.df <- data.frame(t(sapply(1:ncol(genotypes.df), LR.fn))) # add column names to results data frame colnames(LRresults.df) <- c("rs", "AAintercept", "ABcoef", "BBcoef", "AA.pval", "AB.pval", "BB.pval") # The following sorts LR results by the p-value of the BB # homozygous coefficient. tidy made $p_value a factor and when you try to # convert directly to numeric (as.numeric) turns factors into integer and # this messes up sorting especially with scientific notation lr.results.sorted <- LRresults.df %>% mutate_at("BB.pval", as.character) %>% # convert to char before numeric mutate_at("BB.pval", as.numeric) %>% # convert to numeric for arrange arrange(BB.pval) # sort as.matrix(lr.results.sorted %>% pull(rs,BB.pval)) fish.df fish.results fish.results$rs lr.results.sorted lr.results.sorted$rs table(fish.results$rs, lr.results.sorted$rs) expand.grid(fish=fish.results$rs, lr=lr.results.sorted$rs) CJ(fish=fish.results$rs, lr=lr.results.sorted$rs) table(fish=fish.results$rs, lr=lr.results.sorted$rs) as.table(fish=fish.results$rs, lr=lr.results.sorted$rs) setNames(fish.results$rs, lr.results.sorted$rs) as.table(setNames(fish.results$rs, lr.results.sorted$rs)) data.frame(fish.results$rs, lr.results.sorted$rs) tmp <- data.frame(fish.results$rs, lr.results.sorted$rs) ?plot length(fish.results$rs) length(lr.results.sorted$rs) ?plot(x=1:length(fish.results$rs)) ?plot(y=1:length(fish.results$rs), x=fish.results$rs) plot(y=1:length(fish.results$rs), x=fish.results$rs) plot(fish.results) plot(fish.results$rs) fish.results$rs class(fish.results$rs) as.vector(fish.results$rs) plot(as.vector(fish.results$rs)) as.data.frame(fish.results$rs) plot(as.data.frame(fish.results$rs)) ggplot(as.data.frame(fish.results$rs)) ggplot(as.data.frame(fish.results$rs)) plot(as.data.frame(fish.results$rs)) tmpy<-1:length(fish.results$rs) tmpy yvals <- 1:length(fish.results$rs) xfish <- fish.results$rs xfish which(fish.results$rs == "rs17786052") which(fish.results$rs == "rs17786052")[[1]] snp.ids snp.ids[i] match(fish.results$rs, snp.ids) yvals <- 1:length(fish.results$rs) xvals <- snp.ids fishvals <- match(fish.results$rs, snp.ids) lrvals <- match(lr.results.sorted$rs, snp.ids) plot(xvals, fishvals) fishvals plot(xlimxvals, fishvals, type="l") plot(xvals, fishvals, type="l") plot(1:17, yvals, type=l, xaxt="none", xlab="snp") plot(1:17, yvals, type="l", xaxt="none", xlab="snp") axis(1, axTicks(1), labels=snp.ids) snp.ids axis(labels=snp.ids) ?axis axis(labels=snp.ids, side=1) axis(labels=snp.ids, side=1, at=1) axis(labels=snp.ids, side=1, at=17) axis(labels=snp.ids, side=1, at=17) length(snp.ids) length(snp.ids[1]) xlabels <- c(snp.ids) xlabels axis(xlabels=snp.ids) axis(labels=xlabels) axis(labels=xlabels, side=1) axis(side = 1, at = 1:length(fish.results$rs), labels=snp.ids) plot(1:17, fishvals, type="l", xaxt="none", xlab="snp", ylab="rank") axis(side = 1, at = 1:length(fish.results$rs), labels=snp.ids) lines(1:length(lr.results.sorted$rs), lrvals, type="l", col="red", lwd =2, lty=1) plot(1:length(fish.results$rs), fishvals, type="l", xaxt="none", xlab="snp", ylab="rank", lty=1, col=1) axis(side = 1, at = 1:length(fish.results$rs), labels=snp.ids) lines(1:length(lr.results.sorted$rs), lrvals, type="l", col=2, lty=2) legend(x="topright", legend=c("Chi-Square Fisher Test", "Logistic Regression"), lty=c(1,2), col=c(1,2)) plot(1:length(fish.results$rs), fishvals, type="l", xaxt="none", xlab="snp", ylab="rank", lty=1, col=1) axis(side = 1, at = 1:length(fish.results$rs), labels=snp.ids) lines(1:length(lr.results.sorted$rs), lrvals, type="l", col=2, lty=2, lwd=2) legend(x="topright", legend=c("Chi-Square Fisher Test", "Logistic Regression"), lty=c(1,2), col=c(1,2))