# Lab 10 for the University of Tulsa's CS-6643 Bioinformatics Course
# Phylogenetic Analysis
# Professor: Dr. McKinney, Fall 2022
# Noah L. Schrick - 1492657

## Set working directory to this file's directory - RStudio approach.
## Guarded so the script still runs outside RStudio (e.g. via Rscript) or
## from an unsaved document; in those cases the working directory is left
## unchanged instead of raising an error.
if (requireNamespace("rstudioapi", quietly = TRUE) &&
    rstudioapi::isAvailable()) {
  script.path <- rstudioapi::getActiveDocumentContext()$path
  if (nzchar(script.path)) {
    setwd(dirname(script.path))
  }
}

#### Part A: GenBank sequences and a multiple fasta file
# Install on demand, then attach. requireNamespace() only probes for the
# package; library() errors loudly if the install did not succeed.
if (!requireNamespace("ape", quietly = TRUE)) install.packages("ape")
library(ape) # needed for read.GenBank

# Fetch the mtDNA sequences (one character vector of bases per accession)
mtDNA.MultiSeqs.list <- read.GenBank(
  c(
    "AF011222", "AF254446", "X90314", "AF089820",
    "AF176766", "AF451972", "AY079510",
    "AF050738", "AF176722", "AF315498",
    "AF176731", "AF451964"
  ),
  as.character = TRUE
)

# Look at species names
mtDNA.Species <- attr(mtDNA.MultiSeqs.list, "species")

# Use species as name instead of GenBank id
names(mtDNA.MultiSeqs.list) <- mtDNA.Species

# Some names need fixing. The positions refer to the order of the accession
# numbers requested above, so keep the two in sync.
names(mtDNA.MultiSeqs.list)[c(1, 2, 3, 6, 12)] <- c(
  "German_Neanderthal",
  "Russian_Neanderthal",
  "Human",
  "Puti_Orangutan",
  "Jari_Orangutan"
)

# Number of bases in the human sequence
length(mtDNA.MultiSeqs.list[["Human"]])

# Look at one of the sequences
mtDNA.MultiSeqs.list[["Human"]]

## Convert to Biostrings object for the sequences
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
library(BiocManager)
if (!requireNamespace("Biostrings", quietly = TRUE)) {
  BiocManager::install("Biostrings")
}
library(Biostrings)

# Build the inputs for Biostrings: one name and one collapsed sequence
# string per species (vectorized instead of growing vectors in a loop).
Names.vec <- names(mtDNA.MultiSeqs.list) # species names string vector
Seqs.vec <- vapply(
  mtDNA.MultiSeqs.list,
  paste,
  character(1),
  collapse = "",
  USE.NAMES = FALSE
) # sequence string vector
mtDNA.multSeqs.bstr <- DNAStringSet(Seqs.vec) # convert to Biostring

# Name the Biostring sequences and compute stats
names(mtDNA.multSeqs.bstr) <- Names.vec

# Count nucleotides and sequence lengths.
# Columns 1:4 of alphabetFrequency() are taken as the A, C, G, T counts, so
# mtDNA.lengths is the number of unambiguous bases per sequence.
num.nts <- alphabetFrequency(mtDNA.multSeqs.bstr)[, 1:4]
mtDNA.lengths <- rowSums(num.nts)
# Row i of num.nts is divided by mtDNA.lengths[i] (column-wise recycling)
proportion.nts <- num.nts / mtDNA.lengths

# Obtain name and length of species with longest sequence.
# cbind() of numbers and strings yields a character matrix, so the maximum
# is located on the numeric vector rather than on the coerced column.
nlengthnames <- cbind(mtDNA.lengths, Names.vec)
idx <- which.max(mtDNA.lengths)
nlengthnames[idx, ]