151 lines
4.7 KiB
R
151 lines
4.7 KiB
R
# Lab 1 for the University of Tulsa's CS-6643 Bioinformatics Course
|
|
# Intro to R, online bioinformatics resources, nucleotide frequency statistics
|
|
# Professor: Dr. McKinney, Fall 2022
|
|
# Noah L. Schrick - 1492657
|
|
|
|
## Set Working Directory to file directory - RStudio approach
|
|
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
|
|
|
|
#### Part A: Seq Function
|
|
## a
|
|
AAvec <- seq(from = 1, to = 33, by = 2)
|
|
|
|
## b
|
|
ABvec <- seq(from = 7, to = 40, length.out = 15)
|
|
|
|
## c
|
|
my.dna <- sample(c("A", "C", "G", "T"), size = 20, replace = T)
|
|
|
|
## d
|
|
my.dna.A <- length(which(my.dna == "A"))
|
|
|
|
## e
|
|
my.dna.table <- table(my.dna)
|
|
|
|
my.dna.table.df <- as.data.frame(my.dna.table)
|
|
cols <- rainbow(nrow(my.dna.table))
|
|
my.dna.table.df$percent <-
|
|
round(100*my.dna.table.df$Freq/sum(my.dna.table.df$Freq), digits = 1)
|
|
my.dna.table.df$label <- paste(my.dna.table.df$my.dna,
|
|
" (", my.dna.table.df$percent, "%)", sep = "")
|
|
pie(my.dna.table.df$Freq, labels = my.dna.table.df$label, col = cols,
|
|
main = "Pie Chart Representation of Random ACTG Sample")
|
|
|
|
|
|
bp <- barplot(as.matrix(my.dna.table), beside = TRUE, xlab = "Nucleotide",
|
|
ylab = "Frequency", ylim = c(-1, max(as.numeric(my.dna.table))+2),
|
|
main = "Bar Plot Representation of Random ACTG Sample", col = cols,
|
|
legend = TRUE)
|
|
|
|
text(x = bp, y = my.dna.table + 0.5, labels = as.numeric(my.dna.table))
|
|
text(x = bp, y = -0.5, labels = names(my.dna.table))
|
|
|
|
## f
|
|
my.dna2 <- sample(c("A", "C", "G", "T"), size = 20, replace = T,
|
|
prob = c(0.1, 0.4, 0.4, 0.1))
|
|
my.dna2.table <- table(my.dna2)
|
|
my.dna2.table
|
|
|
|
|
|
#### Part B: NCBI (no supporting R code for this part)
|
|
|
|
#### Part C: Reading fasta files, nucelotide and dinucleotide frequencies
|
|
|
|
## Pre-cursor: Load associated supportive libraries
|
|
if (!require("seqinr")) install.packages("seqinr")
|
|
library(seqinr)
|
|
|
|
## Load in the fasta file as a string
|
|
myfasta <- read.fasta(file="apoe.fasta", as.string= TRUE)
|
|
|
|
## a
|
|
class(myfasta)
|
|
|
|
## b
|
|
fasta2vec <- function(fasta.file){
|
|
if (!require("seqinr")) install.packages("seqinr")
|
|
library(seqinr)
|
|
fasta <- read.fasta(file=fasta.file, as.string= TRUE)
|
|
fasta.string <- fasta[[1]][1]
|
|
fasta.list <- strsplit(fasta.string,"")
|
|
fasta.vec <- unlist(fasta.list)
|
|
}
|
|
|
|
fasta.vec <- fasta2vec("apoe.fasta")
|
|
|
|
## c
|
|
fasta.len <- length(fasta.vec)
|
|
|
|
## d
|
|
fasta.vec[1:20]
|
|
|
|
## e
|
|
fasta.table <- table(fasta.vec)
|
|
|
|
## f
|
|
fasta.cols <- rainbow(nrow(fasta.table))
|
|
|
|
fasta.bp <- barplot(as.matrix(fasta.table), beside = TRUE, xlab = "Letter",
|
|
ylab = "Frequency", ylim = c(-0.25*max(as.numeric(fasta.table)),
|
|
1.25*max(as.numeric(fasta.table))),
|
|
main = "Bar Plot Representation of APOE Nucelotides",
|
|
col = fasta.cols, legend = TRUE)
|
|
|
|
text(x = fasta.bp, y = 1.1*fasta.table, labels = as.numeric(fasta.table))
|
|
text(x = fasta.bp, y = -0.10*max(as.numeric(fasta.table)),
|
|
labels = names(fasta.table))
|
|
|
|
## g
|
|
fasta.nuc_prob <- fasta.table/fasta.len
|
|
|
|
#### Part D: GC Content
|
|
|
|
## a
|
|
fasta.gc <- (sum(fasta.vec=="g") + sum(fasta.vec=="c"))/fasta.len
|
|
# Verify:
|
|
fasta.gc == GC(fasta.vec)
|
|
|
|
## b
|
|
fasta.pairs <- seqinr::count(fasta.vec,2)
|
|
fasta.pairs.gc <- fasta.pairs['gc']
|
|
|
|
## c
|
|
fasta.pairs.cols <- rainbow(nrow(fasta.pairs))
|
|
|
|
fasta.pairs.bp <- barplot(as.matrix(fasta.pairs), beside = TRUE, xlab = "Pairs",
|
|
ylab = "Frequency",
|
|
ylim = c(-0.25*max(as.numeric(fasta.pairs)),
|
|
1.25*max(as.numeric(fasta.pairs))),
|
|
main = "Bar Plot Representation of APOE Nucelotide Pairs",
|
|
col = fasta.pairs.cols)
|
|
|
|
text(x = fasta.pairs.bp, y = 1.1*fasta.pairs, labels = as.numeric(fasta.pairs))
|
|
text(x = fasta.pairs.bp, y = -0.10*max(as.numeric(fasta.pairs)),
|
|
labels = names(fasta.pairs))
|
|
|
|
#### Part E: Coronavirus
|
|
|
|
## a
|
|
covid.vec <- fasta2vec("covid.fasta")
|
|
covid.table <- table(covid.vec)
|
|
covid.nuc_prob <- covid.table / length(covid.vec)
|
|
|
|
compare.bind <- rbind(fasta.nuc_prob, covid.nuc_prob)
|
|
|
|
compare.bp <- barplot(as.matrix(compare.bind), beside = TRUE,
|
|
xlab = "Nucelotide",
|
|
ylab = "Probability",
|
|
ylim = c(-0.0,
|
|
1.25*max(c(fasta.nuc_prob, covid.nuc_prob))),
|
|
main = "Bar Plot Representation and Comparison of APOE to COVID Nucleotide Probabilities",
|
|
legend = TRUE, legend.text = c("APOE", "COVID"))
|
|
|
|
ycords = 1.03*c(rbind(as.vector(fasta.nuc_prob), as.vector(covid.nuc_prob)))
|
|
tlabs = c(rbind(as.vector(fasta.nuc_prob), as.vector(covid.nuc_prob)))
|
|
text(x = compare.bp, y = ycords,
|
|
labels = round(as.numeric(tlabs), digits = 3))
|
|
|
|
## b
|
|
covid.gc <- GC(covid.vec)
|
|
|