diff --git a/.Rhistory b/.Rhistory
new file mode 100644
index 0000000..471e486
--- /dev/null
+++ b/.Rhistory
@@ -0,0 +1,512 @@
+################# Max-Log-Likelihood #################
+n <- length(g.breaks.clean)
+kmin <- g.breaks.clean[1]
+alpha.ML <- 1 + n/sum(log(g.breaks.clean)/kmin)
+alpha.ML
+lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4)
+# Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course
+# Degree Distribution
+# Professor: Dr. McKinney, Spring 2022
+# Noah Schrick - 1492657
+library(igraph)
+library(igraphdata)
+data(yeast)
+g <- yeast
+g.netname <- "Yeast"
+################# Set up Work #################
+g.vec <- degree(g)
+g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname,
+" Network"))
+legend("topright", c("Guess", "Poisson", "Least-Squares Fit",
+"Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6",
+"#006CD1", "#E66100", "#D35FB7"))
+g.mean <- mean(g.vec)
+g.seq <- 0:max(g.vec) # x-axis
+################# Guessing Alpha #################
+alpha.guess <- 1.5
+lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1)
+################# Poisson #################
+g.pois <- dpois(g.seq, g.mean, log=F)
+lines(g.seq, g.pois, col="#006CD1", lty=2)
+################# Linear model: Least-Squares Fit #################
+#g.breaks <- g.hist$breaks[-c(1)] # remove 0
+g.breaks <- g.hist$breaks # remove 0
+g.probs <- g.hist$density[-1] # make lengths match
+# Need to clean up probabilities that are 0
+nz.probs.mask <- g.probs!=0
+g.breaks.clean <- g.breaks[nz.probs.mask]
+g.probs.clean <- g.breaks[nz.probs.mask]
+#plot(log(g.breaks.clean), log(g.probs.clean))
+g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean))
+summary(g.fit)
+alpha.LM <- coef(g.fit)[2]
+lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3)
+################# Max-Log-Likelihood #################
+n <- length(g.breaks.clean)
+kmin <- g.breaks.clean[1]
+alpha.ML <- 1 + n/sum(log(g.breaks.clean)/kmin)
+alpha.ML
+lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4)
+# Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course
+# Degree Distribution
+# Professor: Dr. McKinney, Spring 2022
+# Noah Schrick - 1492657
+library(igraph)
+library(igraphdata)
+data(yeast)
+g <- yeast
+g.netname <- "Yeast"
+################# Set up Work #################
+g.vec <- degree(g)
+g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname,
+" Network"))
+legend("topright", c("Guess", "Poisson", "Least-Squares Fit",
+"Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6",
+"#006CD1", "#E66100", "#D35FB7"))
+g.mean <- mean(g.vec)
+g.seq <- 0:max(g.vec) # x-axis
+################# Guessing Alpha #################
+alpha.guess <- 1.5
+lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1)
+################# Poisson #################
+g.pois <- dpois(g.seq, g.mean, log=F)
+lines(g.seq, g.pois, col="#006CD1", lty=2)
+################# Linear model: Least-Squares Fit #################
+g.breaks <- g.hist$breaks[-c(1)] # remove 0
+g.probs <- g.hist$density[-1] # make lengths match
+# Need to clean up probabilities that are 0
+nz.probs.mask <- g.probs!=0
+g.breaks.clean <- g.breaks[nz.probs.mask]
+g.probs.clean <- g.probs[nz.probs.mask]
+#plot(log(g.breaks.clean), log(g.probs.clean))
+g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean))
+summary(g.fit)
+alpha.LM <- coef(g.fit)[2]
+lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3)
+################# Max-Log-Likelihood #################
+n <- length(g.breaks.clean)
+kmin <- g.breaks.clean[1]
+alpha.ML <- 1 + n/sum(log(g.breaks.clean)/kmin)
+alpha.ML
+lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4)
+alpha.LM
+# Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course
+# Degree Distribution
+# Professor: Dr. McKinney, Spring 2022
+# Noah Schrick - 1492657
+library(igraph)
+library(igraphdata)
+data(yeast)
+g <- yeast
+g.netname <- "Yeast"
+################# Set up Work #################
+g.vec <- degree(g)
+g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname,
+" Network"))
+legend("topright", c("Guess", "Poisson", "Least-Squares Fit",
+"Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6",
+"#006CD1", "#E66100", "#D35FB7"))
+g.mean <- mean(g.vec)
+g.seq <- 0:max(g.vec) # x-axis
+################# Guessing Alpha #################
+alpha.guess <- 1.5
+lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1)
+################# Poisson #################
+g.pois <- dpois(g.seq, g.mean, log=F)
+lines(g.seq, g.pois, col="#006CD1", lty=2)
+################# Linear model: Least-Squares Fit #################
+g.breaks <- g.hist$breaks[-c(1)] # remove 0
+g.probs <- g.hist$density[-1] # make lengths match
+# Need to clean up probabilities that are 0
+nz.probs.mask <- g.probs!=0
+g.breaks.clean <- g.breaks[nz.probs.mask]
+g.probs.clean <- g.probs[nz.probs.mask]
+#plot(log(g.breaks.clean), log(g.probs.clean))
+g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean))
+summary(g.fit)
+alpha.LM <- coef(g.fit)[2]
+lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3)
+################# Max-Log-Likelihood #################
+n <- length(g.breaks.clean)
+kmin <- g.breaks.clean[1]
+alpha.ML <- 1 + n/sum(log(g.breaks.clean/kmin))
+alpha.ML
+lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4)
+# Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course
+# Degree Distribution
+# Professor: Dr. McKinney, Spring 2022
+# Noah Schrick - 1492657
+library(igraph)
+library(igraphdata)
+data(yeast)
+g <- yeast
+g.netname <- "Yeast"
+################# Set up Work #################
+g.vec <- degree(g)
+g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname,
+" Network"))
+legend("topright", c("Guess", "Poisson", "Least-Squares Fit",
+"Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6",
+"#006CD1", "#E66100", "#D35FB7"))
+g.mean <- mean(g.vec)
+g.seq <- 0:max(g.vec) # x-axis
+################# Guessing Alpha #################
+alpha.guess <- 1.5
+lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1, lwd=5)
+################# Poisson #################
+g.pois <- dpois(g.seq, g.mean, log=F)
+lines(g.seq, g.pois, col="#006CD1", lty=2)
+################# Linear model: Least-Squares Fit #################
+g.breaks <- g.hist$breaks[-c(1)] # remove 0
+g.probs <- g.hist$density[-1] # make lengths match
+# Need to clean up probabilities that are 0
+nz.probs.mask <- g.probs!=0
+g.breaks.clean <- g.breaks[nz.probs.mask]
+g.probs.clean <- g.probs[nz.probs.mask]
+#plot(log(g.breaks.clean), log(g.probs.clean))
+g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean))
+summary(g.fit)
+alpha.LM <- coef(g.fit)[2]
+lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3)
+################# Max-Log-Likelihood #################
+n <- length(g.breaks.clean)
+kmin <- g.breaks.clean[1]
+alpha.ML <- 1 + n/sum(log(g.breaks.clean/kmin))
+alpha.ML
+lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4)
+# Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course
+# Degree Distribution
+# Professor: Dr. McKinney, Spring 2022
+# Noah Schrick - 1492657
+library(igraph)
+library(igraphdata)
+data(yeast)
+g <- yeast
+g.netname <- "Yeast"
+################# Set up Work #################
+g.vec <- degree(g)
+g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname,
+" Network"))
+legend("topright", c("Guess", "Poisson", "Least-Squares Fit",
+"Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6",
+"#006CD1", "#E66100", "#D35FB7"))
+g.mean <- mean(g.vec)
+g.seq <- 0:max(g.vec) # x-axis
+################# Guessing Alpha #################
+alpha.guess <- 1.5
+lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1, lwd=3)
+################# Poisson #################
+g.pois <- dpois(g.seq, g.mean, log=F)
+lines(g.seq, g.pois, col="#006CD1", lty=2)
+################# Linear model: Least-Squares Fit #################
+g.breaks <- g.hist$breaks[-c(1)] # remove 0
+g.probs <- g.hist$density[-1] # make lengths match
+# Need to clean up probabilities that are 0
+nz.probs.mask <- g.probs!=0
+g.breaks.clean <- g.breaks[nz.probs.mask]
+g.probs.clean <- g.probs[nz.probs.mask]
+#plot(log(g.breaks.clean), log(g.probs.clean))
+g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean))
+summary(g.fit)
+alpha.LM <- coef(g.fit)[2]
+lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3)
+################# Max-Log-Likelihood #################
+n <- length(g.breaks.clean)
+kmin <- g.breaks.clean[1]
+alpha.ML <- 1 + n/sum(log(g.breaks.clean/kmin))
+alpha.ML
+lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4)
+# Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course
+# Degree Distribution
+# Professor: Dr. McKinney, Spring 2022
+# Noah Schrick - 1492657
+library(igraph)
+library(igraphdata)
+data(yeast)
+g <- yeast
+g.netname <- "Yeast"
+################# Set up Work #################
+g.vec <- degree(g)
+g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname,
+" Network"))
+legend("topright", c("Guess", "Poisson", "Least-Squares Fit",
+"Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6",
+"#006CD1", "#E66100", "#D35FB7"))
+g.mean <- mean(g.vec)
+g.seq <- 0:max(g.vec) # x-axis
+################# Guessing Alpha #################
+alpha.guess <- 1.5
+lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1, lwd=3)
+################# Poisson #################
+g.pois <- dpois(g.seq, g.mean, log=F)
+lines(g.seq, g.pois, col="#006CD1", lty=2, lwd=3)
+################# Linear model: Least-Squares Fit #################
+g.breaks <- g.hist$breaks[-c(1)] # remove 0
+g.probs <- g.hist$density[-1] # make lengths match
+# Need to clean up probabilities that are 0
+nz.probs.mask <- g.probs!=0
+g.breaks.clean <- g.breaks[nz.probs.mask]
+g.probs.clean <- g.probs[nz.probs.mask]
+#plot(log(g.breaks.clean), log(g.probs.clean))
+g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean))
+summary(g.fit)
+alpha.LM <- coef(g.fit)[2]
+lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3, lwd=3)
+################# Max-Log-Likelihood #################
+n <- length(g.breaks.clean)
+kmin <- g.breaks.clean[1]
+alpha.ML <- 1 + n/sum(log(g.breaks.clean/kmin))
+alpha.ML
+lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4, lwd=3)
+plot(yeast)
+hist(yeast)
+hist(g.vec)
+g.pois
+g.mean
+alpha.LM
+alpha.ML
+degree(g)
+sort(degree(g))
+sort(degree(g),decreasing=FALSE)
+sort(degree(g),decreasing=F)
+sort(degree(g),decreasing=false)
+sort(degree(g), decreasing = TRUE)
+head(sort(degree(g), decreasing = TRUE))
+stddev(degree(g))
+sd(degree(g))
+tail(sort(degree(g), decreasing = TRUE))
+plot(log(g.breaks.clean), log(g.probs.clean))
+# Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course
+# Degree Distribution
+# Professor: Dr. McKinney, Spring 2022
+# Noah Schrick - 1492657
+library(igraph)
+library(igraphdata)
+data(yeast)
+g <- yeast
+g.netname <- "Yeast"
+################# Set up Work #################
+g.vec <- degree(g)
+g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname,
+" Network"))
+legend("topright", c("Guess", "Poisson", "Least-Squares Fit",
+"Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6",
+"#006CD1", "#E66100", "#D35FB7"))
+g.mean <- mean(g.vec)
+g.seq <- 0:max(g.vec) # x-axis
+################# Guessing Alpha #################
+alpha.guess <- 1.5
+lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1, lwd=3)
+################# Poisson #################
+g.pois <- dpois(g.seq, g.mean, log=F)
+lines(g.seq, g.pois, col="#006CD1", lty=2, lwd=3)
+################# Linear model: Least-Squares Fit #################
+g.breaks <- g.hist$breaks[-c(1)] # remove 0
+g.probs <- g.hist$density[-1] # make lengths match
+# Need to clean up probabilities that are 0
+nz.probs.mask <- g.probs!=0
+g.breaks.clean <- g.breaks[nz.probs.mask]
+g.probs.clean <- g.probs[nz.probs.mask]
+plot(log(g.breaks.clean), log(g.probs.clean))
+g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean))
+summary(g.fit)
+alpha.LM <- coef(g.fit)[2]
+lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3, lwd=3)
+################# Max-Log-Likelihood #################
+n <- length(g.breaks.clean)
+kmin <- g.breaks.clean[1]
+alpha.ML <- 1 + n/sum(log(g.breaks.clean/kmin))
+alpha.ML
+lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4, lwd=3)
+plot(log(g.breaks.clean), log(g.probs.clean))
+g.breaks.clean <- g.breaks[nz.probs.mask]
+g.probs.clean <- g.probs[nz.probs.mask]
+plot(log(g.breaks.clean), log(g.probs.clean))
+## Turn into function to repeat for all SNPs
+LR.fn <- function(i){
+lr <- glm(pheno.factor~genotypes.df[[i]],family=binomial)
+td.lr <- tidy(lr)
+pval_vec <- td.lr$p.value # vector of $p.value from td.lr
+coef_vec <- td.lr$estimate # vector of $estimate
+cbind(snp.ids[i], coef_vec[1], coef_vec[2], coef_vec[3], pval_vec[1], pval_vec[2], pval_vec[3])
+}
+# apply Logistic Regression model to all SNPs
+LRresults.df <- data.frame(t(sapply(1:ncol(genotypes.df), LR.fn)))
+# Lab 6 for the University of Tulsa's CS-6643 Bioinformatics Course
+# GWAS
+# Professor: Dr. McKinney, Fall 2022
+# Noah L. Schrick - 1492657
+## Set Working Directory to file directory - RStudio approach
+setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
+#### Part 0: PLINK
+if (!require("BiocManager")) install.packages("BiocManager")
+library(BiocManager)
+if (!require("snpStats")) BiocManager::install("snpStats")
+library(snpStats)
+ex.data <- read.pedfile(file="extra.ped", snps="extra.map")
+ex.data$fam
+phenotype <- ex.data$fam$affected-1  # change pheno from 1/2 to 0/1
+genotypes <- ex.data$genotypes       # encoded as AA/AB/BB
+snp.ids <- as.character(ex.data$map$snp.names)
+genotypes.df <- data.frame(as(genotypes, "character"))
+colnames(genotypes.df) <- snp.ids
+# observed contingency table for SNP rs630969
+table(phenotype,genotypes.df$rs630969,
+dnn=c("phenotype","genotype")) # dnn dimension names of table
+dim(genotypes.df)
+#### Part A: Chi-Square Test
+# creates list of observed contingency tables for all SNPs
+# sapply acts on each column of genotypes.df
+observed.tables.list <- sapply(genotypes.df, function(x)
+table(phenotype,x,dnn=c("phenotype","genotype")))
+test.table <- observed.tables.list$rs634228
+genoMarg.vec  <-  colSums(test.table)           # margin vector
+phenoMarg.vec <- rowSums(test.table)            # margin vector
+totalSubj <- sum(genoMarg.vec)                  # total subjects
+expect.test <- outer(phenoMarg.vec,genoMarg.vec/totalSubj,'*')
+## Fisher Test
+# Fisher exact test (chi-square test) for all SNPs
+fish_fn <- function(i){
+cbind(snp.ids[i], fisher.test(observed.tables.list[[i]])$p.value)
+}
+# apply fisher exact test to all SNPs
+fish.df <- data.frame(t(sapply(1:ncol(genotypes.df), fish_fn)))
+colnames(fish.df) <- c("rs", "p_value")
+# sort SNPs by Fisher exact p-value
+if (!require("dplyr")) install.packages("dplyr")
+library(dplyr)
+fish.results <- fish.df                            %>%
+mutate_at("p_value", as.character) %>%
+mutate_at("p_value", as.numeric)   %>%
+arrange(p_value)
+print(fish.results)
+#### Part B: Logistic regression with genotypes
+if (!require("ggplot2")) BiocManager::install("ggplot2")
+library(ggplot2)
+i<-8
+A1<-ex.data$map$allele.1[i]
+A2<-ex.data$map$allele.2[i]
+geno.labels <- c(paste(A1,A1,sep=""),paste(A1,A2,sep=""),paste(A2,A2,sep=""))
+## Plot with ggplot2
+# data from the one SNP
+oneSNP.df <- data.frame(cbind(genotypes.df[[i]],as.numeric(phenotype)))
+colnames(oneSNP.df) <- c("genotypes","phenotypes")
+lr.plot <- ggplot(oneSNP.df, aes(x=genotypes, y=phenotypes)) +
+geom_point(position = position_jitter(w = 0.1, h = 0.1)) +
+# stat_smooth plots the probability based on the model
+stat_smooth(method="glm", method.args = list(family = "binomial")) #+
+#xlim(geno.labels) + ggtitle(snp.ids[i])
+print(lr.plot)
+## Fit a logistic regression model of phenotype with SNP in the 8th column
+if (!require("broom")) install.packages("broom")
+library(broom) # for tidy function
+pheno.factor <- factor(phenotype,labels=c(0,1))
+i<-8
+lr <- glm(pheno.factor~genotypes.df[[i]],family=binomial)
+td.lr <- tidy(lr)
+pval_vec <- td.lr$p.value # vector of $p.value from td.lr
+coef_vec <- td.lr$estimate # vector of $estimate
+cbind(snp.ids[i], coef_vec[1], coef_vec[2], coef_vec[3], pval_vec[1], pval_vec[2], pval_vec[3])
+## Turn into function to repeat for all SNPs
+LR.fn <- function(i){
+lr <- glm(pheno.factor~genotypes.df[[i]],family=binomial)
+td.lr <- tidy(lr)
+pval_vec <- td.lr$p.value # vector of $p.value from td.lr
+coef_vec <- td.lr$estimate # vector of $estimate
+cbind(snp.ids[i], coef_vec[1], coef_vec[2], coef_vec[3], pval_vec[1], pval_vec[2], pval_vec[3])
+}
+# apply Logistic Regression model to all SNPs
+LRresults.df <- data.frame(t(sapply(1:ncol(genotypes.df), LR.fn)))
+# add column names to results data frame
+colnames(LRresults.df) <- c("rs", "AAintercept", "ABcoef", "BBcoef", "AA.pval", "AB.pval", "BB.pval")
+# The following sorts LR results by the p-value of the BB
+# homozygous coefficient. tidy made $p_value a factor and when you try to
+# convert directly to numeric (as.numeric) turns factors into integer and
+# this messes up sorting especially with scientific notation
+lr.results.sorted <- LRresults.df    %>%
+mutate_at("BB.pval", as.character) %>%  # convert to char before numeric
+mutate_at("BB.pval", as.numeric)   %>%  # convert to numeric for arrange
+arrange(BB.pval)                        # sort
+as.matrix(lr.results.sorted %>% pull(rs,BB.pval))
+fish.df
+fish.results
+fish.results$rs
+lr.results.sorted
+lr.results.sorted$rs
+table(fish.results$rs, lr.results.sorted$rs)
+expand.grid(fish=fish.results$rs, lr=lr.results.sorted$rs)
+CJ(fish=fish.results$rs, lr=lr.results.sorted$rs)
+table(fish=fish.results$rs, lr=lr.results.sorted$rs)
+as.table(fish=fish.results$rs, lr=lr.results.sorted$rs)
+setNames(fish.results$rs, lr.results.sorted$rs)
+as.table(setNames(fish.results$rs, lr.results.sorted$rs))
+data.frame(fish.results$rs, lr.results.sorted$rs)
+tmp <- data.frame(fish.results$rs, lr.results.sorted$rs)
+?plot
+length(fish.results$rs)
+length(lr.results.sorted$rs)
+?plot(x=1:length(fish.results$rs))
+?plot(y=1:length(fish.results$rs), x=fish.results$rs)
+plot(y=1:length(fish.results$rs), x=fish.results$rs)
+plot(fish.results)
+plot(fish.results$rs)
+fish.results$rs
+class(fish.results$rs)
+as.vector(fish.results$rs)
+plot(as.vector(fish.results$rs))
+as.data.frame(fish.results$rs)
+plot(as.data.frame(fish.results$rs))
+ggplot(as.data.frame(fish.results$rs))
+ggplot(as.data.frame(fish.results$rs))
+plot(as.data.frame(fish.results$rs))
+tmpy<-1:length(fish.results$rs)
+tmpy
+yvals <- 1:length(fish.results$rs)
+xfish <- fish.results$rs
+xfish
+which(fish.results$rs == "rs17786052")
+which(fish.results$rs == "rs17786052")[[1]]
+snp.ids
+snp.ids[i]
+match(fish.results$rs, snp.ids)
+yvals <- 1:length(fish.results$rs)
+xvals <- snp.ids
+fishvals <- match(fish.results$rs, snp.ids)
+lrvals <- match(lr.results.sorted$rs, snp.ids)
+plot(xvals, fishvals)
+fishvals
+plot(xlimxvals, fishvals, type="l")
+plot(xvals, fishvals, type="l")
+plot(1:17, yvals, type=l, xaxt="none", xlab="snp")
+plot(1:17, yvals, type="l", xaxt="none", xlab="snp")
+axis(1, axTicks(1), labels=snp.ids)
+snp.ids
+axis(labels=snp.ids)
+?axis
+axis(labels=snp.ids, side=1)
+axis(labels=snp.ids, side=1, at=1)
+axis(labels=snp.ids, side=1, at=17)
+axis(labels=snp.ids, side=1, at=17)
+length(snp.ids)
+length(snp.ids[1])
+xlabels <- c(snp.ids)
+xlabels
+axis(xlabels=snp.ids)
+axis(labels=xlabels)
+axis(labels=xlabels, side=1)
+axis(side = 1, at = 1:length(fish.results$rs), labels=snp.ids)
+plot(1:17, fishvals, type="l", xaxt="none", xlab="snp", ylab="rank")
+axis(side = 1, at = 1:length(fish.results$rs), labels=snp.ids)
+lines(1:length(lr.results.sorted$rs), lrvals, type="l", col="red", lwd =2, lty=1)
+plot(1:length(fish.results$rs), fishvals, type="l", xaxt="none",
+xlab="snp", ylab="rank", lty=1, col=1)
+axis(side = 1, at = 1:length(fish.results$rs), labels=snp.ids)
+lines(1:length(lr.results.sorted$rs), lrvals, type="l", col=2, lty=2)
+legend(x="topright", legend=c("Chi-Square Fisher Test", "Logistic Regression"),
+lty=c(1,2), col=c(1,2))
+plot(1:length(fish.results$rs), fishvals, type="l", xaxt="none",
+xlab="snp", ylab="rank", lty=1, col=1)
+axis(side = 1, at = 1:length(fish.results$rs), labels=snp.ids)
+lines(1:length(lr.results.sorted$rs), lrvals, type="l", col=2, lty=2, lwd=2)
+legend(x="topright", legend=c("Chi-Square Fisher Test", "Logistic Regression"),
+lty=c(1,2), col=c(1,2))
diff --git a/.~lock.lab_gwas_stats.docx# b/.~lock.lab_gwas_stats.docx#
deleted file mode 100644
index 43e9721..0000000
--- a/.~lock.lab_gwas_stats.docx#
+++ /dev/null
@@ -1 +0,0 @@
-,noah,NovaArchSys,20.10.2022 16:40,file:///home/noah/.config/libreoffice/4;
\ No newline at end of file
diff --git a/Schrick-Noah_CS-6643_Lab-6-Report.pdf b/Schrick-Noah_CS-6643_Lab-6-Report.pdf
new file mode 100644
index 0000000..e0e2412
Binary files /dev/null and b/Schrick-Noah_CS-6643_Lab-6-Report.pdf differ
diff --git a/Schrick-Noah_CS-6643_Lab-6.R b/Schrick-Noah_CS-6643_Lab-6.R
index bd72376..d817006 100644
--- a/Schrick-Noah_CS-6643_Lab-6.R
+++ b/Schrick-Noah_CS-6643_Lab-6.R
@@ -54,3 +54,81 @@ fish.results <- fish.df                            %>%
   mutate_at("p_value", as.numeric)   %>%
   arrange(p_value) 
 print(fish.results)
+
+
+#### Part B: Logistic regression with genotypes
+if (!require("ggplot2")) BiocManager::install("ggplot2")
+library(ggplot2) 
+i<-8
+A1<-ex.data$map$allele.1[i]
+A2<-ex.data$map$allele.2[i]
+geno.labels <- c(paste(A1,A1,sep=""),paste(A1,A2,sep=""),paste(A2,A2,sep=""))
+
+## Plot with ggplot2
+# data from the one SNP
+oneSNP.df <- data.frame(cbind(genotypes.df[[i]],as.numeric(phenotype)))
+colnames(oneSNP.df) <- c("genotypes","phenotypes")
+
+lr.plot <- ggplot(oneSNP.df, aes(x=genotypes, y=phenotypes)) +
+  geom_point(position = position_jitter(w = 0.1, h = 0.1)) +
+  # stat_smooth plots the probability based on the model
+  stat_smooth(method="glm", method.args = list(family = "binomial")) #+
+  #xlim(geno.labels) + ggtitle(snp.ids[i])
+print(lr.plot)
+
+## Fit a logistic regression model of phenotype with SNP in the 8th column
+if (!require("broom")) install.packages("broom")
+library(broom) # for tidy function
+pheno.factor <- factor(phenotype,labels=c(0,1))
+i<-8
+lr <- glm(pheno.factor~genotypes.df[[i]],family=binomial)
+td.lr <- tidy(lr)	
+
+pval_vec <- td.lr$p.value # vector of $p.value from td.lr
+coef_vec <- td.lr$estimate # vector of $estimate
+cbind(snp.ids[i], coef_vec[1], coef_vec[2], coef_vec[3], pval_vec[1], pval_vec[2], pval_vec[3])  
+
+## Turn into function to repeat for all SNPs
+LR.fn <- function(i){
+  lr <- glm(pheno.factor~genotypes.df[[i]],family=binomial)
+  td.lr <- tidy(lr)	
+  
+  pval_vec <- td.lr$p.value # vector of $p.value from td.lr
+  coef_vec <- td.lr$estimate # vector of $estimate
+  cbind(snp.ids[i], coef_vec[1], coef_vec[2], coef_vec[3], pval_vec[1], pval_vec[2], pval_vec[3])  
+} 
+
+# apply Logistic Regression model to all SNPs
+LRresults.df <- data.frame(t(sapply(1:ncol(genotypes.df), LR.fn)))
+
+# add column names to results data frame
+colnames(LRresults.df) <- c("rs", "AAintercept", "ABcoef", "BBcoef", "AA.pval", "AB.pval", "BB.pval")
+
+# The following sorts LR results by the p-value of the BB 
+# homozygous coefficient. tidy made $p_value a factor and when you try to 
+# convert directly to numeric (as.numeric) turns factors into integer and 
+# this messes up sorting especially with scientific notation
+
+lr.results.sorted <- LRresults.df    %>% 
+  mutate_at("BB.pval", as.character) %>%  # convert to char before numeric
+  mutate_at("BB.pval", as.numeric)   %>%  # convert to numeric for arrange 
+  arrange(BB.pval)                        # sort 
+
+as.matrix(lr.results.sorted %>% pull(rs,BB.pval))
+
+## Comparing Rankings
+# Via Table
+data.frame(fish.results$rs, lr.results.sorted$rs)
+
+# Via Plot
+yvals <- 1:length(fish.results$rs)
+xvals <- snp.ids
+fishvals <- match(fish.results$rs, snp.ids)
+lrvals <- match(lr.results.sorted$rs, snp.ids)
+
+plot(1:length(fish.results$rs), fishvals, type="l", xaxt="none",
+     xlab="snp", ylab="rank", lty=1, col=1)
+axis(side = 1, at = 1:length(fish.results$rs), labels=snp.ids)
+lines(1:length(lr.results.sorted$rs), lrvals, type="l", col=2, lty=2, lwd=2)
+legend(x="topright", legend=c("Chi-Square Fisher Test", "Logistic Regression"),
+       lty=c(1,2), col=c(1,2))
diff --git a/lab_gwas_stats.docx b/lab_gwas_stats.docx
index 04334a7..b5f3324 100644
Binary files a/lab_gwas_stats.docx and b/lab_gwas_stats.docx differ