CS-6643-Bioinformatics-Lab-1/Schrick-Noah_CS-6643_Lab1.R
2022-09-06 23:41:26 -05:00

151 lines
4.7 KiB
R

# Lab 1 for the University of Tulsa's CS-6643 Bioinformatics Course
# Intro to R, online bioinformatics resources, nucleotide frequency statistics
# Professor: Dr. McKinney, Fall 2022
# Noah L. Schrick - 1492657
## Set Working Directory to file directory - RStudio approach
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
#### Part A: Seq Function
## a
AAvec <- seq(from = 1, to = 33, by = 2)
## b
ABvec <- seq(from = 7, to = 40, length.out = 15)
## c
my.dna <- sample(c("A", "C", "G", "T"), size = 20, replace = T)
## d
my.dna.A <- length(which(my.dna == "A"))
## e
my.dna.table <- table(my.dna)
my.dna.table.df <- as.data.frame(my.dna.table)
cols <- rainbow(nrow(my.dna.table))
my.dna.table.df$percent <-
round(100*my.dna.table.df$Freq/sum(my.dna.table.df$Freq), digits = 1)
my.dna.table.df$label <- paste(my.dna.table.df$my.dna,
" (", my.dna.table.df$percent, "%)", sep = "")
pie(my.dna.table.df$Freq, labels = my.dna.table.df$label, col = cols,
main = "Pie Chart Representation of Random ACTG Sample")
bp <- barplot(as.matrix(my.dna.table), beside = TRUE, xlab = "Nucleotide",
ylab = "Frequency", ylim = c(-1, max(as.numeric(my.dna.table))+2),
main = "Bar Plot Representation of Random ACTG Sample", col = cols,
legend = TRUE)
text(x = bp, y = my.dna.table + 0.5, labels = as.numeric(my.dna.table))
text(x = bp, y = -0.5, labels = names(my.dna.table))
## f
my.dna2 <- sample(c("A", "C", "G", "T"), size = 20, replace = T,
prob = c(0.1, 0.4, 0.4, 0.1))
my.dna2.table <- table(my.dna2)
my.dna2.table
#### Part B: NCBI (no supporting R code for this part)
#### Part C: Reading fasta files, nucelotide and dinucleotide frequencies
## Pre-cursor: Load associated supportive libraries
if (!require("seqinr")) install.packages("seqinr")
library(seqinr)
## Load in the fasta file as a string
myfasta <- read.fasta(file="apoe.fasta", as.string= TRUE)
## a
class(myfasta)
## b
fasta2vec <- function(fasta.file){
if (!require("seqinr")) install.packages("seqinr")
library(seqinr)
fasta <- read.fasta(file=fasta.file, as.string= TRUE)
fasta.string <- fasta[[1]][1]
fasta.list <- strsplit(fasta.string,"")
fasta.vec <- unlist(fasta.list)
}
fasta.vec <- fasta2vec("apoe.fasta")
## c
fasta.len <- length(fasta.vec)
## d
fasta.vec[1:20]
## e
fasta.table <- table(fasta.vec)
## f
fasta.cols <- rainbow(nrow(fasta.table))
fasta.bp <- barplot(as.matrix(fasta.table), beside = TRUE, xlab = "Letter",
ylab = "Frequency", ylim = c(-0.25*max(as.numeric(fasta.table)),
1.25*max(as.numeric(fasta.table))),
main = "Bar Plot Representation of APOE Nucelotides",
col = fasta.cols, legend = TRUE)
text(x = fasta.bp, y = 1.1*fasta.table, labels = as.numeric(fasta.table))
text(x = fasta.bp, y = -0.10*max(as.numeric(fasta.table)),
labels = names(fasta.table))
## g
fasta.nuc_prob <- fasta.table/fasta.len
#### Part D: GC Content
## a
fasta.gc <- (sum(fasta.vec=="g") + sum(fasta.vec=="c"))/fasta.len
# Verify:
fasta.gc == GC(fasta.vec)
## b
fasta.pairs <- seqinr::count(fasta.vec,2)
fasta.pairs.gc <- fasta.pairs['gc']
## c
fasta.pairs.cols <- rainbow(nrow(fasta.pairs))
fasta.pairs.bp <- barplot(as.matrix(fasta.pairs), beside = TRUE, xlab = "Pairs",
ylab = "Frequency",
ylim = c(-0.25*max(as.numeric(fasta.pairs)),
1.25*max(as.numeric(fasta.pairs))),
main = "Bar Plot Representation of APOE Nucelotide Pairs",
col = fasta.pairs.cols)
text(x = fasta.pairs.bp, y = 1.1*fasta.pairs, labels = as.numeric(fasta.pairs))
text(x = fasta.pairs.bp, y = -0.10*max(as.numeric(fasta.pairs)),
labels = names(fasta.pairs))
#### Part E: Coronavirus
## a
covid.vec <- fasta2vec("covid.fasta")
covid.table <- table(covid.vec)
covid.nuc_prob <- covid.table / length(covid.vec)
compare.bind <- rbind(fasta.nuc_prob, covid.nuc_prob)
compare.bp <- barplot(as.matrix(compare.bind), beside = TRUE,
xlab = "Nucelotide",
ylab = "Probability",
ylim = c(-0.0,
1.25*max(c(fasta.nuc_prob, covid.nuc_prob))),
main = "Bar Plot Representation and Comparison of APOE to COVID Nucleotide Probabilities",
legend = TRUE, legend.text = c("APOE", "COVID"))
ycords = 1.03*c(rbind(as.vector(fasta.nuc_prob), as.vector(covid.nuc_prob)))
tlabs = c(rbind(as.vector(fasta.nuc_prob), as.vector(covid.nuc_prob)))
text(x = compare.bp, y = ycords,
labels = round(as.numeric(tlabs), digits = 3))
## b
covid.gc <- GC(covid.vec)