diff --git a/.Rhistory b/.Rhistory new file mode 100644 index 0000000..471e486 --- /dev/null +++ b/.Rhistory @@ -0,0 +1,512 @@ +################# Max-Log-Likelihood ################# +n <- length(g.breaks.clean) +kmin <- g.breaks.clean[1] +alpha.ML <- 1 + n/sum(log(g.breaks.clean)/kmin) +alpha.ML +lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4) +# Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course +# Degree Distribution +# Professor: Dr. McKinney, Spring 2022 +# Noah Schrick - 1492657 +library(igraph) +library(igraphdata) +data(yeast) +g <- yeast +g.netname <- "Yeast" +################# Set up Work ################# +g.vec <- degree(g) +g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname, +" Network")) +legend("topright", c("Guess", "Poisson", "Least-Squares Fit", +"Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6", +"#006CD1", "#E66100", "#D35FB7")) +g.mean <- mean(g.vec) +g.seq <- 0:max(g.vec) # x-axis +################# Guessing Alpha ################# +alpha.guess <- 1.5 +lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1) +################# Poisson ################# +g.pois <- dpois(g.seq, g.mean, log=F) +lines(g.seq, g.pois, col="#006CD1", lty=2) +################# Linear model: Least-Squares Fit ################# +#g.breaks <- g.hist$breaks[-c(1)] # remove 0 +g.breaks <- g.hist$breaks # remove 0 +g.probs <- g.hist$density[-1] # make lengths match +# Need to clean up probabilities that are 0 +nz.probs.mask <- g.probs!=0 +g.breaks.clean <- g.breaks[nz.probs.mask] +g.probs.clean <- g.breaks[nz.probs.mask] +#plot(log(g.breaks.clean), log(g.probs.clean)) +g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean)) +summary(g.fit) +alpha.LM <- coef(g.fit)[2] +lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3) +################# Max-Log-Likelihood ################# +n <- length(g.breaks.clean) +kmin <- g.breaks.clean[1] +alpha.ML <- 1 + n/sum(log(g.breaks.clean)/kmin) +alpha.ML +lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4) +# Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course +# Degree Distribution +# Professor: Dr. McKinney, Spring 2022 +# Noah Schrick - 1492657 +library(igraph) +library(igraphdata) +data(yeast) +g <- yeast +g.netname <- "Yeast" +################# Set up Work ################# +g.vec <- degree(g) +g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname, +" Network")) +legend("topright", c("Guess", "Poisson", "Least-Squares Fit", +"Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6", +"#006CD1", "#E66100", "#D35FB7")) +g.mean <- mean(g.vec) +g.seq <- 0:max(g.vec) # x-axis +################# Guessing Alpha ################# +alpha.guess <- 1.5 +lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1) +################# Poisson ################# +g.pois <- dpois(g.seq, g.mean, log=F) +lines(g.seq, g.pois, col="#006CD1", lty=2) +################# Linear model: Least-Squares Fit ################# +g.breaks <- g.hist$breaks[-c(1)] # remove 0 +g.probs <- g.hist$density[-1] # make lengths match +# Need to clean up probabilities that are 0 +nz.probs.mask <- g.probs!=0 +g.breaks.clean <- g.breaks[nz.probs.mask] +g.probs.clean <- g.probs[nz.probs.mask] +#plot(log(g.breaks.clean), log(g.probs.clean)) +g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean)) +summary(g.fit) +alpha.LM <- coef(g.fit)[2] +lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3) +################# Max-Log-Likelihood ################# +n <- length(g.breaks.clean) +kmin <- g.breaks.clean[1] +alpha.ML <- 1 + n/sum(log(g.breaks.clean)/kmin) +alpha.ML +lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4) +alpha.LM +# Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course +# Degree Distribution +# Professor: Dr. McKinney, Spring 2022 +# Noah Schrick - 1492657 +library(igraph) +library(igraphdata) +data(yeast) +g <- yeast +g.netname <- "Yeast" +################# Set up Work ################# +g.vec <- degree(g) +g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname, +" Network")) +legend("topright", c("Guess", "Poisson", "Least-Squares Fit", +"Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6", +"#006CD1", "#E66100", "#D35FB7")) +g.mean <- mean(g.vec) +g.seq <- 0:max(g.vec) # x-axis +################# Guessing Alpha ################# +alpha.guess <- 1.5 +lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1) +################# Poisson ################# +g.pois <- dpois(g.seq, g.mean, log=F) +lines(g.seq, g.pois, col="#006CD1", lty=2) +################# Linear model: Least-Squares Fit ################# +g.breaks <- g.hist$breaks[-c(1)] # remove 0 +g.probs <- g.hist$density[-1] # make lengths match +# Need to clean up probabilities that are 0 +nz.probs.mask <- g.probs!=0 +g.breaks.clean <- g.breaks[nz.probs.mask] +g.probs.clean <- g.probs[nz.probs.mask] +#plot(log(g.breaks.clean), log(g.probs.clean)) +g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean)) +summary(g.fit) +alpha.LM <- coef(g.fit)[2] +lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3) +################# Max-Log-Likelihood ################# +n <- length(g.breaks.clean) +kmin <- g.breaks.clean[1] +alpha.ML <- 1 + n/sum(log(g.breaks.clean/kmin)) +alpha.ML +lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4) +# Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course +# Degree Distribution +# Professor: Dr. McKinney, Spring 2022 +# Noah Schrick - 1492657 +library(igraph) +library(igraphdata) +data(yeast) +g <- yeast +g.netname <- "Yeast" +################# Set up Work ################# +g.vec <- degree(g) +g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname, +" Network")) +legend("topright", c("Guess", "Poisson", "Least-Squares Fit", +"Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6", +"#006CD1", "#E66100", "#D35FB7")) +g.mean <- mean(g.vec) +g.seq <- 0:max(g.vec) # x-axis +################# Guessing Alpha ################# +alpha.guess <- 1.5 +lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1, lwd=5) +################# Poisson ################# +g.pois <- dpois(g.seq, g.mean, log=F) +lines(g.seq, g.pois, col="#006CD1", lty=2) +################# Linear model: Least-Squares Fit ################# +g.breaks <- g.hist$breaks[-c(1)] # remove 0 +g.probs <- g.hist$density[-1] # make lengths match +# Need to clean up probabilities that are 0 +nz.probs.mask <- g.probs!=0 +g.breaks.clean <- g.breaks[nz.probs.mask] +g.probs.clean <- g.probs[nz.probs.mask] +#plot(log(g.breaks.clean), log(g.probs.clean)) +g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean)) +summary(g.fit) +alpha.LM <- coef(g.fit)[2] +lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3) +################# Max-Log-Likelihood ################# +n <- length(g.breaks.clean) +kmin <- g.breaks.clean[1] +alpha.ML <- 1 + n/sum(log(g.breaks.clean/kmin)) +alpha.ML +lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4) +# Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course +# Degree Distribution +# Professor: Dr. McKinney, Spring 2022 +# Noah Schrick - 1492657 +library(igraph) +library(igraphdata) +data(yeast) +g <- yeast +g.netname <- "Yeast" +################# Set up Work ################# +g.vec <- degree(g) +g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname, +" Network")) +legend("topright", c("Guess", "Poisson", "Least-Squares Fit", +"Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6", +"#006CD1", "#E66100", "#D35FB7")) +g.mean <- mean(g.vec) +g.seq <- 0:max(g.vec) # x-axis +################# Guessing Alpha ################# +alpha.guess <- 1.5 +lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1, lwd=3) +################# Poisson ################# +g.pois <- dpois(g.seq, g.mean, log=F) +lines(g.seq, g.pois, col="#006CD1", lty=2) +################# Linear model: Least-Squares Fit ################# +g.breaks <- g.hist$breaks[-c(1)] # remove 0 +g.probs <- g.hist$density[-1] # make lengths match +# Need to clean up probabilities that are 0 +nz.probs.mask <- g.probs!=0 +g.breaks.clean <- g.breaks[nz.probs.mask] +g.probs.clean <- g.probs[nz.probs.mask] +#plot(log(g.breaks.clean), log(g.probs.clean)) +g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean)) +summary(g.fit) +alpha.LM <- coef(g.fit)[2] +lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3) +################# Max-Log-Likelihood ################# +n <- length(g.breaks.clean) +kmin <- g.breaks.clean[1] +alpha.ML <- 1 + n/sum(log(g.breaks.clean/kmin)) +alpha.ML +lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4) +# Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course +# Degree Distribution +# Professor: Dr. McKinney, Spring 2022 +# Noah Schrick - 1492657 +library(igraph) +library(igraphdata) +data(yeast) +g <- yeast +g.netname <- "Yeast" +################# Set up Work ################# +g.vec <- degree(g) +g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname, +" Network")) +legend("topright", c("Guess", "Poisson", "Least-Squares Fit", +"Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6", +"#006CD1", "#E66100", "#D35FB7")) +g.mean <- mean(g.vec) +g.seq <- 0:max(g.vec) # x-axis +################# Guessing Alpha ################# +alpha.guess <- 1.5 +lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1, lwd=3) +################# Poisson ################# +g.pois <- dpois(g.seq, g.mean, log=F) +lines(g.seq, g.pois, col="#006CD1", lty=2, lwd=3) +################# Linear model: Least-Squares Fit ################# +g.breaks <- g.hist$breaks[-c(1)] # remove 0 +g.probs <- g.hist$density[-1] # make lengths match +# Need to clean up probabilities that are 0 +nz.probs.mask <- g.probs!=0 +g.breaks.clean <- g.breaks[nz.probs.mask] +g.probs.clean <- g.probs[nz.probs.mask] +#plot(log(g.breaks.clean), log(g.probs.clean)) +g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean)) +summary(g.fit) +alpha.LM <- coef(g.fit)[2] +lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3, lwd=3) +################# Max-Log-Likelihood ################# +n <- length(g.breaks.clean) +kmin <- g.breaks.clean[1] +alpha.ML <- 1 + n/sum(log(g.breaks.clean/kmin)) +alpha.ML +lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4, lwd=3) +plot(yeast) +hist(yeast) +hist(g.vec) +g.pois +g.mean +alpha.LM +alpha.ML +degree(g) +sort(degree(g)) +sort(degree(g),decreasing=FALSE) +sort(degree(g),decreasing=F) +sort(degree(g),decreasing=false) +sort(degree(g), decreasing = TRUE) +head(sort(degree(g), decreasing = TRUE)) +stddev(degree(g)) +sd(degree(g)) +tail(sort(degree(g), decreasing = TRUE)) +plot(log(g.breaks.clean), log(g.probs.clean)) +# Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course +# Degree Distribution +# Professor: Dr. McKinney, Spring 2022 +# Noah Schrick - 1492657 +library(igraph) +library(igraphdata) +data(yeast) +g <- yeast +g.netname <- "Yeast" +################# Set up Work ################# +g.vec <- degree(g) +g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname, +" Network")) +legend("topright", c("Guess", "Poisson", "Least-Squares Fit", +"Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6", +"#006CD1", "#E66100", "#D35FB7")) +g.mean <- mean(g.vec) +g.seq <- 0:max(g.vec) # x-axis +################# Guessing Alpha ################# +alpha.guess <- 1.5 +lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1, lwd=3) +################# Poisson ################# +g.pois <- dpois(g.seq, g.mean, log=F) +lines(g.seq, g.pois, col="#006CD1", lty=2, lwd=3) +################# Linear model: Least-Squares Fit ################# +g.breaks <- g.hist$breaks[-c(1)] # remove 0 +g.probs <- g.hist$density[-1] # make lengths match +# Need to clean up probabilities that are 0 +nz.probs.mask <- g.probs!=0 +g.breaks.clean <- g.breaks[nz.probs.mask] +g.probs.clean <- g.probs[nz.probs.mask] +plot(log(g.breaks.clean), log(g.probs.clean)) +g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean)) +summary(g.fit) +alpha.LM <- coef(g.fit)[2] +lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3, lwd=3) +################# Max-Log-Likelihood ################# +n <- length(g.breaks.clean) +kmin <- g.breaks.clean[1] +alpha.ML <- 1 + n/sum(log(g.breaks.clean/kmin)) +alpha.ML +lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4, lwd=3) +plot(log(g.breaks.clean), log(g.probs.clean)) +g.breaks.clean <- g.breaks[nz.probs.mask] +g.probs.clean <- g.probs[nz.probs.mask] +plot(log(g.breaks.clean), log(g.probs.clean)) +## Turn into function to repeat for all SNPs +LR.fn <- function(i){ +lr <- glm(pheno.factor~genotypes.df[[i]],family=binomial) +td.lr <- tidy(lr) +pval_vec <- td.lr$p.value # vector of $p.value from td.lr +coef_vec <- td.lr$estimate # vector of $estimate +cbind(snp.ids[i], coef_vec[1], coef_vec[2], coef_vec[3], pval_vec[1], pval_vec[2], pval_vec[3]) +} +# apply Logistic Regression model to all SNPs +LRresults.df <- data.frame(t(sapply(1:ncol(genotypes.df), LR.fn))) +# Lab 6 for the University of Tulsa's CS-6643 Bioinformatics Course +# GWAS +# Professor: Dr. McKinney, Fall 2022 +# Noah L. Schrick - 1492657 +## Set Working Directory to file directory - RStudio approach +setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) +#### Part 0: PLINK +if (!require("BiocManager")) install.packages("BiocManager") +library(BiocManager) +if (!require("snpStats")) BiocManager::install("snpStats") +library(snpStats) +ex.data <- read.pedfile(file="extra.ped", snps="extra.map") +ex.data$fam +phenotype <- ex.data$fam$affected-1 # change pheno from 1/2 to 0/1 +genotypes <- ex.data$genotypes # encoded as AA/AB/BB +snp.ids <- as.character(ex.data$map$snp.names) +genotypes.df <- data.frame(as(genotypes, "character")) +colnames(genotypes.df) <- snp.ids +# observed contingency table for SNP rs630969 +table(phenotype,genotypes.df$rs630969, +dnn=c("phenotype","genotype")) # dnn dimension names of table +dim(genotypes.df) +#### Part A: Chi-Square Test +# creates list of observed contingency tables for all SNPs +# sapply acts on each column of genotypes.df +observed.tables.list <- sapply(genotypes.df, function(x) +table(phenotype,x,dnn=c("phenotype","genotype"))) +test.table <- observed.tables.list$rs634228 +genoMarg.vec <- colSums(test.table) # margin vector +phenoMarg.vec <- rowSums(test.table) # margin vector +totalSubj <- sum(genoMarg.vec) # total subjects +expect.test <- outer(phenoMarg.vec,genoMarg.vec/totalSubj,'*') +## Fisher Test +# Fisher exact test (chi-square test) for all SNPs +fish_fn <- function(i){ +cbind(snp.ids[i], fisher.test(observed.tables.list[[i]])$p.value) +} +# apply fisher exact test to all SNPs +fish.df <- data.frame(t(sapply(1:ncol(genotypes.df), fish_fn))) +colnames(fish.df) <- c("rs", "p_value") +# sort SNPs by Fisher exact p-value +if (!require("dplyr")) install.packages("dplyr") +library(dplyr) +fish.results <- fish.df %>% +mutate_at("p_value", as.character) %>% +mutate_at("p_value", as.numeric) %>% +arrange(p_value) +print(fish.results) +#### Part B: Logistic regression with genotypes +if (!require("ggplot2")) BiocManager::install("ggplot2") +library(ggplot2) +i<-8 +A1<-ex.data$map$allele.1[i] +A2<-ex.data$map$allele.2[i] +geno.labels <- c(paste(A1,A1,sep=""),paste(A1,A2,sep=""),paste(A2,A2,sep="")) +## Plot with ggplot2 +# data from the one SNP +oneSNP.df <- data.frame(cbind(genotypes.df[[i]],as.numeric(phenotype))) +colnames(oneSNP.df) <- c("genotypes","phenotypes") +lr.plot <- ggplot(oneSNP.df, aes(x=genotypes, y=phenotypes)) + +geom_point(position = position_jitter(w = 0.1, h = 0.1)) + +# stat_smooth plots the probability based on the model +stat_smooth(method="glm", method.args = list(family = "binomial")) #+ +#xlim(geno.labels) + ggtitle(snp.ids[i]) +print(lr.plot) +## Fit a logistic regression model of phenotype with SNP in the 8th column +if (!require("broom")) install.packages("broom") +library(broom) # for tidy function +pheno.factor <- factor(phenotype,labels=c(0,1)) +i<-8 +lr <- glm(pheno.factor~genotypes.df[[i]],family=binomial) +td.lr <- tidy(lr) +pval_vec <- td.lr$p.value # vector of $p.value from td.lr +coef_vec <- td.lr$estimate # vector of $estimate +cbind(snp.ids[i], coef_vec[1], coef_vec[2], coef_vec[3], pval_vec[1], pval_vec[2], pval_vec[3]) +## Turn into function to repeat for all SNPs +LR.fn <- function(i){ +lr <- glm(pheno.factor~genotypes.df[[i]],family=binomial) +td.lr <- tidy(lr) +pval_vec <- td.lr$p.value # vector of $p.value from td.lr +coef_vec <- td.lr$estimate # vector of $estimate +cbind(snp.ids[i], coef_vec[1], coef_vec[2], coef_vec[3], pval_vec[1], pval_vec[2], pval_vec[3]) +} +# apply Logistic Regression model to all SNPs +LRresults.df <- data.frame(t(sapply(1:ncol(genotypes.df), LR.fn))) +# add column names to results data frame +colnames(LRresults.df) <- c("rs", "AAintercept", "ABcoef", "BBcoef", "AA.pval", "AB.pval", "BB.pval") +# The following sorts LR results by the p-value of the BB +# homozygous coefficient. tidy made $p_value a factor and when you try to +# convert directly to numeric (as.numeric) turns factors into integer and +# this messes up sorting especially with scientific notation +lr.results.sorted <- LRresults.df %>% +mutate_at("BB.pval", as.character) %>% # convert to char before numeric +mutate_at("BB.pval", as.numeric) %>% # convert to numeric for arrange +arrange(BB.pval) # sort +as.matrix(lr.results.sorted %>% pull(rs,BB.pval)) +fish.df +fish.results +fish.results$rs +lr.results.sorted +lr.results.sorted$rs +table(fish.results$rs, lr.results.sorted$rs) +expand.grid(fish=fish.results$rs, lr=lr.results.sorted$rs) +CJ(fish=fish.results$rs, lr=lr.results.sorted$rs) +table(fish=fish.results$rs, lr=lr.results.sorted$rs) +as.table(fish=fish.results$rs, lr=lr.results.sorted$rs) +setNames(fish.results$rs, lr.results.sorted$rs) +as.table(setNames(fish.results$rs, lr.results.sorted$rs)) +data.frame(fish.results$rs, lr.results.sorted$rs) +tmp <- data.frame(fish.results$rs, lr.results.sorted$rs) +?plot +length(fish.results$rs) +length(lr.results.sorted$rs) +?plot(x=1:length(fish.results$rs)) +?plot(y=1:length(fish.results$rs), x=fish.results$rs) +plot(y=1:length(fish.results$rs), x=fish.results$rs) +plot(fish.results) +plot(fish.results$rs) +fish.results$rs +class(fish.results$rs) +as.vector(fish.results$rs) +plot(as.vector(fish.results$rs)) +as.data.frame(fish.results$rs) +plot(as.data.frame(fish.results$rs)) +ggplot(as.data.frame(fish.results$rs)) +ggplot(as.data.frame(fish.results$rs)) +plot(as.data.frame(fish.results$rs)) +tmpy<-1:length(fish.results$rs) +tmpy +yvals <- 1:length(fish.results$rs) +xfish <- fish.results$rs +xfish +which(fish.results$rs == "rs17786052") +which(fish.results$rs == "rs17786052")[[1]] +snp.ids +snp.ids[i] +match(fish.results$rs, snp.ids) +yvals <- 1:length(fish.results$rs) +xvals <- snp.ids +fishvals <- match(fish.results$rs, snp.ids) +lrvals <- match(lr.results.sorted$rs, snp.ids) +plot(xvals, fishvals) +fishvals +plot(xlimxvals, fishvals, type="l") +plot(xvals, fishvals, type="l") +plot(1:17, yvals, type=l, xaxt="none", xlab="snp") +plot(1:17, yvals, type="l", xaxt="none", xlab="snp") +axis(1, axTicks(1), labels=snp.ids) +snp.ids +axis(labels=snp.ids) +?axis +axis(labels=snp.ids, side=1) +axis(labels=snp.ids, side=1, at=1) +axis(labels=snp.ids, side=1, at=17) +axis(labels=snp.ids, side=1, at=17) +length(snp.ids) +length(snp.ids[1]) +xlabels <- c(snp.ids) +xlabels +axis(xlabels=snp.ids) +axis(labels=xlabels) +axis(labels=xlabels, side=1) +axis(side = 1, at = 1:length(fish.results$rs), labels=snp.ids) +plot(1:17, fishvals, type="l", xaxt="none", xlab="snp", ylab="rank") +axis(side = 1, at = 1:length(fish.results$rs), labels=snp.ids) +lines(1:length(lr.results.sorted$rs), lrvals, type="l", col="red", lwd =2, lty=1) +plot(1:length(fish.results$rs), fishvals, type="l", xaxt="none", +xlab="snp", ylab="rank", lty=1, col=1) +axis(side = 1, at = 1:length(fish.results$rs), labels=snp.ids) +lines(1:length(lr.results.sorted$rs), lrvals, type="l", col=2, lty=2) +legend(x="topright", legend=c("Chi-Square Fisher Test", "Logistic Regression"), +lty=c(1,2), col=c(1,2)) +plot(1:length(fish.results$rs), fishvals, type="l", xaxt="none", +xlab="snp", ylab="rank", lty=1, col=1) +axis(side = 1, at = 1:length(fish.results$rs), labels=snp.ids) +lines(1:length(lr.results.sorted$rs), lrvals, type="l", col=2, lty=2, lwd=2) +legend(x="topright", legend=c("Chi-Square Fisher Test", "Logistic Regression"), +lty=c(1,2), col=c(1,2)) diff --git a/.~lock.lab_gwas_stats.docx# b/.~lock.lab_gwas_stats.docx# deleted file mode 100644 index 43e9721..0000000 --- a/.~lock.lab_gwas_stats.docx# +++ /dev/null @@ -1 +0,0 @@ -,noah,NovaArchSys,20.10.2022 16:40,file:///home/noah/.config/libreoffice/4; \ No newline at end of file diff --git a/Schrick-Noah_CS-6643_Lab-6-Report.pdf b/Schrick-Noah_CS-6643_Lab-6-Report.pdf new file mode 100644 index 0000000..e0e2412 Binary files /dev/null and b/Schrick-Noah_CS-6643_Lab-6-Report.pdf differ diff --git a/Schrick-Noah_CS-6643_Lab-6.R b/Schrick-Noah_CS-6643_Lab-6.R index bd72376..d817006 100644 --- a/Schrick-Noah_CS-6643_Lab-6.R +++ b/Schrick-Noah_CS-6643_Lab-6.R @@ -54,3 +54,81 @@ fish.results <- fish.df %>% mutate_at("p_value", as.numeric) %>% arrange(p_value) print(fish.results) + + +#### Part B: Logistic regression with genotypes +if (!require("ggplot2")) BiocManager::install("ggplot2") +library(ggplot2) +i<-8 +A1<-ex.data$map$allele.1[i] +A2<-ex.data$map$allele.2[i] +geno.labels <- c(paste(A1,A1,sep=""),paste(A1,A2,sep=""),paste(A2,A2,sep="")) + +## Plot with ggplot2 +# data from the one SNP +oneSNP.df <- data.frame(cbind(genotypes.df[[i]],as.numeric(phenotype))) +colnames(oneSNP.df) <- c("genotypes","phenotypes") + +lr.plot <- ggplot(oneSNP.df, aes(x=genotypes, y=phenotypes)) + + geom_point(position = position_jitter(w = 0.1, h = 0.1)) + + # stat_smooth plots the probability based on the model + stat_smooth(method="glm", method.args = list(family = "binomial")) #+ + #xlim(geno.labels) + ggtitle(snp.ids[i]) +print(lr.plot) + +## Fit a logistic regression model of phenotype with SNP in the 8th column +if (!require("broom")) install.packages("broom") +library(broom) # for tidy function +pheno.factor <- factor(phenotype,labels=c(0,1)) +i<-8 +lr <- glm(pheno.factor~genotypes.df[[i]],family=binomial) +td.lr <- tidy(lr) + +pval_vec <- td.lr$p.value # vector of $p.value from td.lr +coef_vec <- td.lr$estimate # vector of $estimate +cbind(snp.ids[i], coef_vec[1], coef_vec[2], coef_vec[3], pval_vec[1], pval_vec[2], pval_vec[3]) + +## Turn into function to repeat for all SNPs +LR.fn <- function(i){ + lr <- glm(pheno.factor~genotypes.df[[i]],family=binomial) + td.lr <- tidy(lr) + + pval_vec <- td.lr$p.value # vector of $p.value from td.lr + coef_vec <- td.lr$estimate # vector of $estimate + cbind(snp.ids[i], coef_vec[1], coef_vec[2], coef_vec[3], pval_vec[1], pval_vec[2], pval_vec[3]) +} + +# apply Logistic Regression model to all SNPs +LRresults.df <- data.frame(t(sapply(1:ncol(genotypes.df), LR.fn))) + +# add column names to results data frame +colnames(LRresults.df) <- c("rs", "AAintercept", "ABcoef", "BBcoef", "AA.pval", "AB.pval", "BB.pval") + +# The following sorts LR results by the p-value of the BB +# homozygous coefficient. tidy made $p_value a factor and when you try to +# convert directly to numeric (as.numeric) turns factors into integer and +# this messes up sorting especially with scientific notation + +lr.results.sorted <- LRresults.df %>% + mutate_at("BB.pval", as.character) %>% # convert to char before numeric + mutate_at("BB.pval", as.numeric) %>% # convert to numeric for arrange + arrange(BB.pval) # sort + +as.matrix(lr.results.sorted %>% pull(rs,BB.pval)) + +## Comparing Rankings +# Via Table +data.frame(fish.results$rs, lr.results.sorted$rs) + +# Via Plot +yvals <- 1:length(fish.results$rs) +xvals <- snp.ids +fishvals <- match(fish.results$rs, snp.ids) +lrvals <- match(lr.results.sorted$rs, snp.ids) + +plot(1:length(fish.results$rs), fishvals, type="l", xaxt="none", + xlab="snp", ylab="rank", lty=1, col=1) +axis(side = 1, at = 1:length(fish.results$rs), labels=snp.ids) +lines(1:length(lr.results.sorted$rs), lrvals, type="l", col=2, lty=2, lwd=2) +legend(x="topright", legend=c("Chi-Square Fisher Test", "Logistic Regression"), + lty=c(1,2), col=c(1,2)) diff --git a/lab_gwas_stats.docx b/lab_gwas_stats.docx index 04334a7..b5f3324 100644 Binary files a/lab_gwas_stats.docx and b/lab_gwas_stats.docx differ