# Lab 1 for the University of Tulsa's CS-6643 Bioinformatics Course # Intro to R, online bioinformatics resources, nucleotide frequency statistics # Professor: Dr. McKinney, Fall 2022 # Noah L. Schrick - 1492657 ## Set Working Directory to file directory - RStudio approach setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) #### Part A: Seq Function ## a AAvec <- seq(from = 1, to = 33, by = 2) ## b ABvec <- seq(from = 7, to = 40, length.out = 15) ## c my.dna <- sample(c("A", "C", "G", "T"), size = 20, replace = T) ## d my.dna.A <- length(which(my.dna == "A")) ## e my.dna.table <- table(my.dna) my.dna.table.df <- as.data.frame(my.dna.table) cols <- rainbow(nrow(my.dna.table)) my.dna.table.df$percent <- round(100*my.dna.table.df$Freq/sum(my.dna.table.df$Freq), digits = 1) my.dna.table.df$label <- paste(my.dna.table.df$my.dna, " (", my.dna.table.df$percent, "%)", sep = "") pie(my.dna.table.df$Freq, labels = my.dna.table.df$label, col = cols, main = "Pie Chart Representation of Random ACTG Sample") bp <- barplot(as.matrix(my.dna.table), beside = TRUE, xlab = "Nucleotide", ylab = "Frequency", ylim = c(-1, max(as.numeric(my.dna.table))+2), main = "Bar Plot Representation of Random ACTG Sample", col = cols, legend = TRUE) text(x = bp, y = my.dna.table + 0.5, labels = as.numeric(my.dna.table)) text(x = bp, y = -0.5, labels = names(my.dna.table)) ## f my.dna2 <- sample(c("A", "C", "G", "T"), size = 20, replace = T, prob = c(0.1, 0.4, 0.4, 0.1)) my.dna2.table <- table(my.dna2) my.dna2.table #### Part B: NCBI (no supporting R code for this part) #### Part C: Reading fasta files, nucelotide and dinucleotide frequencies ## Pre-cursor: Load associated supportive libraries if (!require("seqinr")) install.packages("seqinr") library(seqinr) ## Load in the fasta file as a string myfasta <- read.fasta(file="apoe.fasta", as.string= TRUE) ## a class(myfasta) ## b fasta2vec <- function(fasta.file){ if (!require("seqinr")) install.packages("seqinr") library(seqinr) fasta <- read.fasta(file=fasta.file, as.string= TRUE) fasta.string <- fasta[[1]][1] fasta.list <- strsplit(fasta.string,"") fasta.vec <- unlist(fasta.list) } fasta.vec <- fasta2vec("apoe.fasta") ## c fasta.len <- length(fasta.vec) ## d fasta.vec[1:20] ## e fasta.table <- table(fasta.vec) ## f fasta.cols <- rainbow(nrow(fasta.table)) fasta.bp <- barplot(as.matrix(fasta.table), beside = TRUE, xlab = "Letter", ylab = "Frequency", ylim = c(-0.25*max(as.numeric(fasta.table)), 1.25*max(as.numeric(fasta.table))), main = "Bar Plot Representation of APOE Nucelotides", col = fasta.cols, legend = TRUE) text(x = fasta.bp, y = 1.1*fasta.table, labels = as.numeric(fasta.table)) text(x = fasta.bp, y = -0.10*max(as.numeric(fasta.table)), labels = names(fasta.table)) ## g fasta.nuc_prob <- fasta.table/fasta.len #### Part D: GC Content ## a fasta.gc <- (sum(fasta.vec=="g") + sum(fasta.vec=="c"))/fasta.len # Verify: fasta.gc == GC(fasta.vec) ## b fasta.pairs <- seqinr::count(fasta.vec,2) fasta.pairs.gc <- fasta.pairs['gc'] ## c fasta.pairs.cols <- rainbow(nrow(fasta.pairs)) fasta.pairs.bp <- barplot(as.matrix(fasta.pairs), beside = TRUE, xlab = "Pairs", ylab = "Frequency", ylim = c(-0.25*max(as.numeric(fasta.pairs)), 1.25*max(as.numeric(fasta.pairs))), main = "Bar Plot Representation of APOE Nucelotide Pairs", col = fasta.pairs.cols) text(x = fasta.pairs.bp, y = 1.1*fasta.pairs, labels = as.numeric(fasta.pairs)) text(x = fasta.pairs.bp, y = -0.10*max(as.numeric(fasta.pairs)), labels = names(fasta.pairs)) #### Part E: Coronavirus ## a covid.vec <- fasta2vec("covid.fasta") covid.table <- table(covid.vec) covid.nuc_prob <- covid.table / length(covid.vec) compare.bind <- rbind(fasta.nuc_prob, covid.nuc_prob) compare.bp <- barplot(as.matrix(compare.bind), beside = TRUE, xlab = "Nucelotide", ylab = "Probability", ylim = c(-0.0, 1.25*max(c(fasta.nuc_prob, covid.nuc_prob))), main = "Bar Plot Representation and Comparison of APOE to COVID Nucleotide Probabilities", legend = TRUE, legend.text = c("APOE", "COVID")) ycords = 1.03*c(rbind(as.vector(fasta.nuc_prob), as.vector(covid.nuc_prob))) tlabs = c(rbind(as.vector(fasta.nuc_prob), as.vector(covid.nuc_prob))) text(x = compare.bp, y = ycords, labels = round(as.numeric(tlabs), digits = 3)) ## b covid.gc <- GC(covid.vec)