CS-6643-Bioinformatics-Lab-6/Schrick-Noah_CS-6643_Lab-6.R

135 lines
5.0 KiB
R

# Lab 6 for the University of Tulsa's CS-6643 Bioinformatics Course
# GWAS
# Professor: Dr. McKinney, Fall 2022
# Noah L. Schrick - 1492657
## Set Working Directory to file directory - RStudio approach
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
#### Part 0: PLINK
if (!require("BiocManager")) install.packages("BiocManager")
library(BiocManager)
if (!require("snpStats")) BiocManager::install("snpStats")
library(snpStats)
ex.data <- read.pedfile(file="extra.ped", snps="extra.map")
ex.data$fam
phenotype <- ex.data$fam$affected-1 # change pheno from 1/2 to 0/1
genotypes <- ex.data$genotypes # encoded as AA/AB/BB
snp.ids <- as.character(ex.data$map$snp.names)
genotypes.df <- data.frame(as(genotypes, "character"))
colnames(genotypes.df) <- snp.ids
# observed contingency table for SNP rs630969
table(phenotype,genotypes.df$rs630969,
dnn=c("phenotype","genotype")) # dnn dimension names of table
dim(genotypes.df)
#### Part A: Chi-Square Test
# creates list of observed contingency tables for all SNPs
# sapply acts on each column of genotypes.df
observed.tables.list <- sapply(genotypes.df, function(x)
table(phenotype,x,dnn=c("phenotype","genotype")))
test.table <- observed.tables.list$rs634228
genoMarg.vec <- colSums(test.table) # margin vector
phenoMarg.vec <- rowSums(test.table) # margin vector
totalSubj <- sum(genoMarg.vec) # total subjects
expect.test <- outer(phenoMarg.vec,genoMarg.vec/totalSubj,'*')
## Fisher Test
# Fisher exact test (chi-square test) for all SNPs
fish_fn <- function(i){
cbind(snp.ids[i], fisher.test(observed.tables.list[[i]])$p.value)
}
# apply fisher exact test to all SNPs
fish.df <- data.frame(t(sapply(1:ncol(genotypes.df), fish_fn)))
colnames(fish.df) <- c("rs", "p_value")
# sort SNPs by Fisher exact p-value
if (!require("dplyr")) install.packages("dplyr")
library(dplyr)
fish.results <- fish.df %>%
mutate_at("p_value", as.character) %>%
mutate_at("p_value", as.numeric) %>%
arrange(p_value)
print(fish.results)
#### Part B: Logistic regression with genotypes
if (!require("ggplot2")) BiocManager::install("ggplot2")
library(ggplot2)
i<-8
A1<-ex.data$map$allele.1[i]
A2<-ex.data$map$allele.2[i]
geno.labels <- c(paste(A1,A1,sep=""),paste(A1,A2,sep=""),paste(A2,A2,sep=""))
## Plot with ggplot2
# data from the one SNP
oneSNP.df <- data.frame(cbind(genotypes.df[[i]],as.numeric(phenotype)))
colnames(oneSNP.df) <- c("genotypes","phenotypes")
lr.plot <- ggplot(oneSNP.df, aes(x=genotypes, y=phenotypes)) +
geom_point(position = position_jitter(w = 0.1, h = 0.1)) +
# stat_smooth plots the probability based on the model
stat_smooth(method="glm", method.args = list(family = "binomial")) #+
#xlim(geno.labels) + ggtitle(snp.ids[i])
print(lr.plot)
## Fit a logistic regression model of phenotype with SNP in the 8th column
if (!require("broom")) install.packages("broom")
library(broom) # for tidy function
pheno.factor <- factor(phenotype,labels=c(0,1))
i<-8
lr <- glm(pheno.factor~genotypes.df[[i]],family=binomial)
td.lr <- tidy(lr)
pval_vec <- td.lr$p.value # vector of $p.value from td.lr
coef_vec <- td.lr$estimate # vector of $estimate
cbind(snp.ids[i], coef_vec[1], coef_vec[2], coef_vec[3], pval_vec[1], pval_vec[2], pval_vec[3])
## Turn into function to repeat for all SNPs
LR.fn <- function(i){
lr <- glm(pheno.factor~genotypes.df[[i]],family=binomial)
td.lr <- tidy(lr)
pval_vec <- td.lr$p.value # vector of $p.value from td.lr
coef_vec <- td.lr$estimate # vector of $estimate
cbind(snp.ids[i], coef_vec[1], coef_vec[2], coef_vec[3], pval_vec[1], pval_vec[2], pval_vec[3])
}
# apply Logistic Regression model to all SNPs
LRresults.df <- data.frame(t(sapply(1:ncol(genotypes.df), LR.fn)))
# add column names to results data frame
colnames(LRresults.df) <- c("rs", "AAintercept", "ABcoef", "BBcoef", "AA.pval", "AB.pval", "BB.pval")
# The following sorts LR results by the p-value of the BB
# homozygous coefficient. tidy made $p_value a factor and when you try to
# convert directly to numeric (as.numeric) turns factors into integer and
# this messes up sorting especially with scientific notation
lr.results.sorted <- LRresults.df %>%
mutate_at("BB.pval", as.character) %>% # convert to char before numeric
mutate_at("BB.pval", as.numeric) %>% # convert to numeric for arrange
arrange(BB.pval) # sort
as.matrix(lr.results.sorted %>% pull(rs,BB.pval))
## Comparing Rankings
# Via Table
data.frame(fish.results$rs, lr.results.sorted$rs)
# Via Plot
yvals <- 1:length(fish.results$rs)
xvals <- snp.ids
fishvals <- match(fish.results$rs, snp.ids)
lrvals <- match(lr.results.sorted$rs, snp.ids)
plot(1:length(fish.results$rs), fishvals, type="l", xaxt="none",
xlab="snp", ylab="rank", lty=1, col=1)
axis(side = 1, at = 1:length(fish.results$rs), labels=snp.ids)
lines(1:length(lr.results.sorted$rs), lrvals, type="l", col=2, lty=2, lwd=2)
legend(x="topright", legend=c("Chi-Square Fisher Test", "Logistic Regression"),
lty=c(1,2), col=c(1,2))