Finalizing Multiple Sequence Alignment

2022-11-21 17:16:46 -06:00 · 2022-11-21 17:16:46 -06:00 · 056dcb299b
commit 056dcb299b
parent bd4cfb8688
9 changed files with 933 additions and 1 deletions
--- a/.Rhistory
+++ b/.Rhistory
@ -0,0 +1,512 @@
 ################# Max-Log-Likelihood #################
 n <- length(g.breaks.clean)
 kmin <- g.breaks.clean[1]
 alpha.ML <- 1 + n/sum(log(g.breaks.clean)/kmin)
 alpha.ML
 lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4)
 # Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course
 # Degree Distribution
 # Professor: Dr. McKinney, Spring 2022
 # Noah Schrick - 1492657
 library(igraph)
 library(igraphdata)
 data(yeast)
 g <- yeast
 g.netname <- "Yeast"
 ################# Set up Work #################
 g.vec <- degree(g)
 g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname,
 " Network"))
 legend("topright", c("Guess", "Poisson", "Least-Squares Fit",
 "Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6",
 "#006CD1", "#E66100", "#D35FB7"))
 g.mean <- mean(g.vec)
 g.seq <- 0:max(g.vec) # x-axis
 ################# Guessing Alpha #################
 alpha.guess <- 1.5
 lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1)
 ################# Poisson #################
 g.pois <- dpois(g.seq, g.mean, log=F)
 lines(g.seq, g.pois, col="#006CD1", lty=2)
 ################# Linear model: Least-Squares Fit #################
 g.breaks <- g.hist$breaks[-c(1,2,3)] # remove 0
 g.probs <- g.hist$density[-1] # make lengths match
 # Need to clean up probabilities that are 0
 nz.probs.mask <- g.probs!=0
 g.breaks.clean <- g.breaks[nz.probs.mask]
 g.probs.clean <- g.breaks[nz.probs.mask]
 #plot(log(g.breaks.clean), log(g.probs.clean))
 g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean))
 summary(g.fit)
 alpha.LM <- coef(g.fit)[2]
 lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3)
 ################# Max-Log-Likelihood #################
 n <- length(g.breaks.clean)
 kmin <- g.breaks.clean[1]
 alpha.ML <- 1 + n/sum(log(g.breaks.clean)/kmin)
 alpha.ML
 lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4)
 # Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course
 # Degree Distribution
 # Professor: Dr. McKinney, Spring 2022
 # Noah Schrick - 1492657
 library(igraph)
 library(igraphdata)
 data(yeast)
 g <- yeast
 g.netname <- "Yeast"
 ################# Set up Work #################
 g.vec <- degree(g)
 g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname,
 " Network"))
 legend("topright", c("Guess", "Poisson", "Least-Squares Fit",
 "Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6",
 "#006CD1", "#E66100", "#D35FB7"))
 g.mean <- mean(g.vec)
 g.seq <- 0:max(g.vec) # x-axis
 ################# Guessing Alpha #################
 alpha.guess <- 1.5
 lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1)
 ################# Poisson #################
 g.pois <- dpois(g.seq, g.mean, log=F)
 lines(g.seq, g.pois, col="#006CD1", lty=2)
 ################# Linear model: Least-Squares Fit #################
 g.breaks <- g.hist$breaks[-c(1)] # remove 0
 g.probs <- g.hist$density[-1] # make lengths match
 # Need to clean up probabilities that are 0
 nz.probs.mask <- g.probs!=0
 g.breaks.clean <- g.breaks[nz.probs.mask]
 g.probs.clean <- g.breaks[nz.probs.mask]
 #plot(log(g.breaks.clean), log(g.probs.clean))
 g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean))
 summary(g.fit)
 alpha.LM <- coef(g.fit)[2]
 lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3)
 ################# Max-Log-Likelihood #################
 n <- length(g.breaks.clean)
 kmin <- g.breaks.clean[1]
 alpha.ML <- 1 + n/sum(log(g.breaks.clean)/kmin)
 alpha.ML
 lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4)
 # Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course
 # Degree Distribution
 # Professor: Dr. McKinney, Spring 2022
 # Noah Schrick - 1492657
 library(igraph)
 library(igraphdata)
 data(yeast)
 g <- yeast
 g.netname <- "Yeast"
 ################# Set up Work #################
 g.vec <- degree(g)
 g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname,
 " Network"))
 legend("topright", c("Guess", "Poisson", "Least-Squares Fit",
 "Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6",
 "#006CD1", "#E66100", "#D35FB7"))
 g.mean <- mean(g.vec)
 g.seq <- 0:max(g.vec) # x-axis
 ################# Guessing Alpha #################
 alpha.guess <- 1.5
 lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1)
 ################# Poisson #################
 g.pois <- dpois(g.seq, g.mean, log=F)
 lines(g.seq, g.pois, col="#006CD1", lty=2)
 ################# Linear model: Least-Squares Fit #################
 #g.breaks <- g.hist$breaks[-c(1)] # remove 0
 g.breaks <- g.hist$breaks # remove 0
 g.probs <- g.hist$density[-1] # make lengths match
 # Need to clean up probabilities that are 0
 nz.probs.mask <- g.probs!=0
 g.breaks.clean <- g.breaks[nz.probs.mask]
 g.probs.clean <- g.breaks[nz.probs.mask]
 #plot(log(g.breaks.clean), log(g.probs.clean))
 g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean))
 summary(g.fit)
 alpha.LM <- coef(g.fit)[2]
 lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3)
 ################# Max-Log-Likelihood #################
 n <- length(g.breaks.clean)
 kmin <- g.breaks.clean[1]
 alpha.ML <- 1 + n/sum(log(g.breaks.clean)/kmin)
 alpha.ML
 lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4)
 # Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course
 # Degree Distribution
 # Professor: Dr. McKinney, Spring 2022
 # Noah Schrick - 1492657
 library(igraph)
 library(igraphdata)
 data(yeast)
 g <- yeast
 g.netname <- "Yeast"
 ################# Set up Work #################
 g.vec <- degree(g)
 g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname,
 " Network"))
 legend("topright", c("Guess", "Poisson", "Least-Squares Fit",
 "Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6",
 "#006CD1", "#E66100", "#D35FB7"))
 g.mean <- mean(g.vec)
 g.seq <- 0:max(g.vec) # x-axis
 ################# Guessing Alpha #################
 alpha.guess <- 1.5
 lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1)
 ################# Poisson #################
 g.pois <- dpois(g.seq, g.mean, log=F)
 lines(g.seq, g.pois, col="#006CD1", lty=2)
 ################# Linear model: Least-Squares Fit #################
 g.breaks <- g.hist$breaks[-c(1)] # remove 0
 g.probs <- g.hist$density[-1] # make lengths match
 # Need to clean up probabilities that are 0
 nz.probs.mask <- g.probs!=0
 g.breaks.clean <- g.breaks[nz.probs.mask]
 g.probs.clean <- g.probs[nz.probs.mask]
 #plot(log(g.breaks.clean), log(g.probs.clean))
 g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean))
 summary(g.fit)
 alpha.LM <- coef(g.fit)[2]
 lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3)
 ################# Max-Log-Likelihood #################
 n <- length(g.breaks.clean)
 kmin <- g.breaks.clean[1]
 alpha.ML <- 1 + n/sum(log(g.breaks.clean)/kmin)
 alpha.ML
 lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4)
 alpha.LM
 # Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course
 # Degree Distribution
 # Professor: Dr. McKinney, Spring 2022
 # Noah Schrick - 1492657
 library(igraph)
 library(igraphdata)
 data(yeast)
 g <- yeast
 g.netname <- "Yeast"
 ################# Set up Work #################
 g.vec <- degree(g)
 g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname,
 " Network"))
 legend("topright", c("Guess", "Poisson", "Least-Squares Fit",
 "Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6",
 "#006CD1", "#E66100", "#D35FB7"))
 g.mean <- mean(g.vec)
 g.seq <- 0:max(g.vec) # x-axis
 ################# Guessing Alpha #################
 alpha.guess <- 1.5
 lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1)
 ################# Poisson #################
 g.pois <- dpois(g.seq, g.mean, log=F)
 lines(g.seq, g.pois, col="#006CD1", lty=2)
 ################# Linear model: Least-Squares Fit #################
 g.breaks <- g.hist$breaks[-c(1)] # remove 0
 g.probs <- g.hist$density[-1] # make lengths match
 # Need to clean up probabilities that are 0
 nz.probs.mask <- g.probs!=0
 g.breaks.clean <- g.breaks[nz.probs.mask]
 g.probs.clean <- g.probs[nz.probs.mask]
 #plot(log(g.breaks.clean), log(g.probs.clean))
 g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean))
 summary(g.fit)
 alpha.LM <- coef(g.fit)[2]
 lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3)
 ################# Max-Log-Likelihood #################
 n <- length(g.breaks.clean)
 kmin <- g.breaks.clean[1]
 alpha.ML <- 1 + n/sum(log(g.breaks.clean/kmin))
 alpha.ML
 lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4)
 # Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course
 # Degree Distribution
 # Professor: Dr. McKinney, Spring 2022
 # Noah Schrick - 1492657
 library(igraph)
 library(igraphdata)
 data(yeast)
 g <- yeast
 g.netname <- "Yeast"
 ################# Set up Work #################
 g.vec <- degree(g)
 g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname,
 " Network"))
 legend("topright", c("Guess", "Poisson", "Least-Squares Fit",
 "Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6",
 "#006CD1", "#E66100", "#D35FB7"))
 g.mean <- mean(g.vec)
 g.seq <- 0:max(g.vec) # x-axis
 ################# Guessing Alpha #################
 alpha.guess <- 1.5
 lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1, lwd=5)
 ################# Poisson #################
 g.pois <- dpois(g.seq, g.mean, log=F)
 lines(g.seq, g.pois, col="#006CD1", lty=2)
 ################# Linear model: Least-Squares Fit #################
 g.breaks <- g.hist$breaks[-c(1)] # remove 0
 g.probs <- g.hist$density[-1] # make lengths match
 # Need to clean up probabilities that are 0
 nz.probs.mask <- g.probs!=0
 g.breaks.clean <- g.breaks[nz.probs.mask]
 g.probs.clean <- g.probs[nz.probs.mask]
 #plot(log(g.breaks.clean), log(g.probs.clean))
 g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean))
 summary(g.fit)
 alpha.LM <- coef(g.fit)[2]
 lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3)
 ################# Max-Log-Likelihood #################
 n <- length(g.breaks.clean)
 kmin <- g.breaks.clean[1]
 alpha.ML <- 1 + n/sum(log(g.breaks.clean/kmin))
 alpha.ML
 lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4)
 # Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course
 # Degree Distribution
 # Professor: Dr. McKinney, Spring 2022
 # Noah Schrick - 1492657
 library(igraph)
 library(igraphdata)
 data(yeast)
 g <- yeast
 g.netname <- "Yeast"
 ################# Set up Work #################
 g.vec <- degree(g)
 g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname,
 " Network"))
 legend("topright", c("Guess", "Poisson", "Least-Squares Fit",
 "Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6",
 "#006CD1", "#E66100", "#D35FB7"))
 g.mean <- mean(g.vec)
 g.seq <- 0:max(g.vec) # x-axis
 ################# Guessing Alpha #################
 alpha.guess <- 1.5
 lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1, lwd=3)
 ################# Poisson #################
 g.pois <- dpois(g.seq, g.mean, log=F)
 lines(g.seq, g.pois, col="#006CD1", lty=2)
 ################# Linear model: Least-Squares Fit #################
 g.breaks <- g.hist$breaks[-c(1)] # remove 0
 g.probs <- g.hist$density[-1] # make lengths match
 # Need to clean up probabilities that are 0
 nz.probs.mask <- g.probs!=0
 g.breaks.clean <- g.breaks[nz.probs.mask]
 g.probs.clean <- g.probs[nz.probs.mask]
 #plot(log(g.breaks.clean), log(g.probs.clean))
 g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean))
 summary(g.fit)
 alpha.LM <- coef(g.fit)[2]
 lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3)
 ################# Max-Log-Likelihood #################
 n <- length(g.breaks.clean)
 kmin <- g.breaks.clean[1]
 alpha.ML <- 1 + n/sum(log(g.breaks.clean/kmin))
 alpha.ML
 lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4)
 # Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course
 # Degree Distribution
 # Professor: Dr. McKinney, Spring 2022
 # Noah Schrick - 1492657
 library(igraph)
 library(igraphdata)
 data(yeast)
 g <- yeast
 g.netname <- "Yeast"
 ################# Set up Work #################
 g.vec <- degree(g)
 g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname,
 " Network"))
 legend("topright", c("Guess", "Poisson", "Least-Squares Fit",
 "Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6",
 "#006CD1", "#E66100", "#D35FB7"))
 g.mean <- mean(g.vec)
 g.seq <- 0:max(g.vec) # x-axis
 ################# Guessing Alpha #################
 alpha.guess <- 1.5
 lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1, lwd=3)
 ################# Poisson #################
 g.pois <- dpois(g.seq, g.mean, log=F)
 lines(g.seq, g.pois, col="#006CD1", lty=2, lwd=3)
 ################# Linear model: Least-Squares Fit #################
 g.breaks <- g.hist$breaks[-c(1)] # remove 0
 g.probs <- g.hist$density[-1] # make lengths match
 # Need to clean up probabilities that are 0
 nz.probs.mask <- g.probs!=0
 g.breaks.clean <- g.breaks[nz.probs.mask]
 g.probs.clean <- g.probs[nz.probs.mask]
 #plot(log(g.breaks.clean), log(g.probs.clean))
 g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean))
 summary(g.fit)
 alpha.LM <- coef(g.fit)[2]
 lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3, lwd=3)
 ################# Max-Log-Likelihood #################
 n <- length(g.breaks.clean)
 kmin <- g.breaks.clean[1]
 alpha.ML <- 1 + n/sum(log(g.breaks.clean/kmin))
 alpha.ML
 lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4, lwd=3)
 plot(yeast)
 hist(yeast)
 hist(g.vec)
 g.pois
 g.mean
 alpha.LM
 alpha.ML
 degree(g)
 sort(degree(g))
 sort(degree(g),decreasing=FALSE)
 sort(degree(g),decreasing=F)
 sort(degree(g),decreasing=false)
 sort(degree(g), decreasing = TRUE)
 head(sort(degree(g), decreasing = TRUE))
 stddev(degree(g))
 sd(degree(g))
 tail(sort(degree(g), decreasing = TRUE))
 plot(log(g.breaks.clean), log(g.probs.clean))
 # Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course
 # Degree Distribution
 # Professor: Dr. McKinney, Spring 2022
 # Noah Schrick - 1492657
 library(igraph)
 library(igraphdata)
 data(yeast)
 g <- yeast
 g.netname <- "Yeast"
 ################# Set up Work #################
 g.vec <- degree(g)
 g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname,
 " Network"))
 legend("topright", c("Guess", "Poisson", "Least-Squares Fit",
 "Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6",
 "#006CD1", "#E66100", "#D35FB7"))
 g.mean <- mean(g.vec)
 g.seq <- 0:max(g.vec) # x-axis
 ################# Guessing Alpha #################
 alpha.guess <- 1.5
 lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1, lwd=3)
 ################# Poisson #################
 g.pois <- dpois(g.seq, g.mean, log=F)
 lines(g.seq, g.pois, col="#006CD1", lty=2, lwd=3)
 ################# Linear model: Least-Squares Fit #################
 g.breaks <- g.hist$breaks[-c(1)] # remove 0
 g.probs <- g.hist$density[-1] # make lengths match
 # Need to clean up probabilities that are 0
 nz.probs.mask <- g.probs!=0
 g.breaks.clean <- g.breaks[nz.probs.mask]
 g.probs.clean <- g.probs[nz.probs.mask]
 plot(log(g.breaks.clean), log(g.probs.clean))
 g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean))
 summary(g.fit)
 alpha.LM <- coef(g.fit)[2]
 lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3, lwd=3)
 ################# Max-Log-Likelihood #################
 n <- length(g.breaks.clean)
 kmin <- g.breaks.clean[1]
 alpha.ML <- 1 + n/sum(log(g.breaks.clean/kmin))
 alpha.ML
 lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4, lwd=3)
 plot(log(g.breaks.clean), log(g.probs.clean))
 g.breaks.clean <- g.breaks[nz.probs.mask]
 g.probs.clean <- g.probs[nz.probs.mask]
 plot(log(g.breaks.clean), log(g.probs.clean))
 ## Set Working Directory to file directory - RStudio approach
 setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
 #### Part A: GenBank sequences and a multiple fasta file
 if (!require("ape")) install.packages("ape")
 library(ape) # needed for read.GenBank
 # fetch the mtDNA sequences
 mtDNA.MultiSeqs.list<-read.GenBank(c("AF011222","AF254446","X90314","AF089820",
 "AF176766","AF451972", "AY079510",
 "AF050738","AF176722","AF315498",
 "AF176731","AF451964"), as.character=TRUE)
 # look at species names
 mtDNA.Species<-attr(mtDNA.MultiSeqs.list,"species")
 # use species as name instead of genbank id
 names(mtDNA.MultiSeqs.list)<-mtDNA.Species
 names(mtDNA.MultiSeqs.list)
 # need to fix some names
 names(mtDNA.MultiSeqs.list)[1] <- paste("German_Neanderthal",sep="")
 names(mtDNA.MultiSeqs.list)[2] <- paste("Russian_Neanderthal",sep="")
 names(mtDNA.MultiSeqs.list)[3] <- paste("Human")
 names(mtDNA.MultiSeqs.list)[6] <- paste("Puti_Orangutan",sep="")
 names(mtDNA.MultiSeqs.list)[12] <- paste("Jari_Orangutan",sep="")
 names(mtDNA.MultiSeqs.list)
 # look at one of the sequences using $
 mtDNA.MultiSeqs.list$Human
 length(mtDNA.MultiSeqs.list$Human)
 ## Convert to Biostrings object for the sequences
 if (!require("BiocManager")) install.packages("BiocManager")
 library(BiocManager)
 if (!require("Biostrings")) BiocManager::install("Biostrings")
 library(Biostrings)
 # loop through the list to create vector of strings for Biostrings input
 Names.vec <- c() # initialize speices names string vector
 Seqs.vec <- c()  # initialize sequence string vector
 for (mtDNA.name in names(mtDNA.MultiSeqs.list))
 {
 Names.vec <- c(Names.vec,mtDNA.name) # concatenate vector
 Seqs.vec <-c(Seqs.vec,paste(mtDNA.MultiSeqs.list[[mtDNA.name]],collapse=""))
 }
 mtDNA.multSeqs.bstr <- DNAStringSet(Seqs.vec) # convert to Biostring
 # name the Biostring sequences and compute stats
 names(mtDNA.multSeqs.bstr) <- Names.vec # count nucs and sequence lengths
 num.nts <- alphabetFrequency(mtDNA.multSeqs.bstr)[,1:4]
 mtDNA.lengths <- rowSums(num.nts)
 proportion.nts <- num.nts/mtDNA.lengths
 num.nts
 names(mtDA.multSeqs.bstr)
 names(mtDNA.multSeqs.bstr)
 mtDNA.multSeqs.bstr
 mtDNA.multSeqs.bstr
 mtDNA.multSeqs.bstr
 mtDNA.multSeqs.bstr --help
 print(mtDNA.multSeqs.bstr)
 print(mtDNA.multSeqs.bstr, -n40)
 print(mtDNA.multSeqs.bstr, -n 40)
 print(mtDNA.multSeqs.bstr, n=20)
 class(mtDNA.multSeqs.bstr)
 print(mtDNA.multSeqs.bstr)
 ?print()
 ?print
 table(mtDNA.multSeqs.bstr)
 mtDNA.multSeqs.bstr
 mtDNA.multSeqs.bstr$width
 mtDNA.multSeqs.bstr[,1]$width
 mtDNA.multSeqs.bstr[1,]$width
 mtDNA.multSeqs.bstr[1]$width
 mtDNA.multSeqs.bstr[1]
 mtDNA.multSeqs.bstr[1]$seq
 mtDNA.multSeqs.bstr[1]$width
 mtDNA.multSeqs.bstr[1]$names
 mtDNA.multSeqs.bstr$names
 # name the Biostring sequences and compute stats
 names(mtDNA.multSeqs.bstr) <- Names.vec # count nucs and sequence lengths
 mtDNA.multSeqs.bstr$names
 mtDNA.multSeqs.bstr$Names
 mtDNA.lengths
 table(mtDNA.lengths, Names.vec)
 cbind(mtDNA.lengths, Names.vec)
 sort(cbind(mtDNA.lengths, Names.vec))
 cbind(mtDNA.lengths, Names.vec)
 cbind(mtDNA.lengths, Names.vec)
 table(cbind(mtDNA.lengths, Names.vec))
 rbind(cbind(mtDNA.lengths, Names.vec))
 sort(rbind(cbind(mtDNA.lengths, Names.vec)))
 rbind(mtDNA.lengths, Names.vec)
 cbind(mtDNA.lengths, Names.vec)
 max(cbind(mtDNA.lengths, Names.vec))
 max(cbind(mtDNA.lengths, Names.vec))[,1]
 max(cbind(mtDNA.lengths, Names.vec)[,1])
 max(cbind(mtDNA.lengths, Names.vec)[1,])
 max(cbind(mtDNA.lengths, Names.vec)[,1])
 max(cbind(mtDNA.lengths, Names.vec))
 cbind(mtDNA.lengths, Names.vec)
 nlengthnames <- cbind(mtDNA.lengths, Names.vec)
 max(nlengthnames[,1])
 nlengthnames <- cbind(mtDNA.lengths, Names.vec)
 nlengthnames[which.max(nlengthnames[,1])]
 idx <- which.max(nlengthnames[,1])
 idx
 nlengthnames[idx, idx]
 nlengthnames[idx]
 nlengthnames
 nlengthnames[idx,]
 proportion.nts <- num.nts/mtDNA.lengths
 proportion.nts
--- a/MSA.png
+++ b/MSA.png
--- a/Schrick-Noah_CS-6643_Lab-10.R
+++ b/Schrick-Noah_CS-6643_Lab-10.R
@ -48,7 +48,7 @@ mtDNA.multSeqs.bstr <- DNAStringSet(Seqs.vec) # convert to Biostring
 # name the Biostring sequences and compute stats
 names(mtDNA.multSeqs.bstr) <- Names.vec # count nucs and sequence lengths
-# num.nts <- alphabetFrequency(mtDNA.multSeqs.bstr)[,1:4]
+num.nts <- alphabetFrequency(mtDNA.multSeqs.bstr)[,1:4]
 mtDNA.lengths <- rowSums(num.nts)
 proportion.nts <- num.nts/mtDNA.lengths
@ -56,3 +56,40 @@ proportion.nts <- num.nts/mtDNA.lengths
 nlengthnames <- cbind(mtDNA.lengths, Names.vec)
 idx <- which.max(nlengthnames[,1])
 nlengthnames[idx,]
 #### Part B: Multiple Sequence Alignment
 if (!require("BiocManager")) install.packages("BiocManager")
 library(BiocManager)
 if (!require("msa")) BiocManager::install("msa")
 library(msa)
 mtDNA.msa <- msa(mtDNA.multSeqs.bstr,method="ClustalOmega")
 msaPrettyPrint(mtDNA.msa, file="mtDNA.pdf", output="pdf", showNames="left",
               showLogo="none", askForOverwrite=FALSE, verbose=TRUE )
 ## loop to make results data frame
 num_seqs <- length(Names.vec)
 # initialize data frame
 align.stats.df <- data.frame(species=rep(NA,num_seqs), seqlen=rep(0,num_seqs), 
                             numgaps=rep(0,num_seqs), nt_a=rep(NA,num_seqs), 
                             nt_c=rep(NA,num_seqs), nt_g=rep(NA,num_seqs), 
                             nt_t=rep(NA,num_seqs))
 # DNAbin type required for dist.dna and helpful for other calculations
 mtDNA.msa.DNAbin <- as.DNAbin(mtDNA.msa) 
 for (i in 1:num_seqs){
  seq_name <- Names.vec[i]
  seq.vec <- as.character(mtDNA.msa.DNAbin[i,])
  num.gaps <- sum(seq.vec=="-")
  prop.nt.i <- proportion.nts[i,]
  align.stats.df[i,] <- c(seq_name, mtDNA.lengths[i], num.gaps,
                          round(prop.nt.i[1],digits=2), round(prop.nt.i[2],digits=2), 
                          round(prop.nt.i[3],digits=2), round(prop.nt.i[4],digits=2))
 }
 # write to file
 write.table(align.stats.df,file="alignstats.tab",sep = "\t", row.names=FALSE, quote=FALSE)
 # you can use $ operator to grab a named column from a data.frame 
 # similar to grabbing a named variable from a list 
 align.stats.df$species
 align.stats.df$nt_a             # strings by default
 as.numeric(align.stats.df$nt_a) # convert to numeric
--- a/Schrick-Noah_CS-6643_Lab-10.docx
+++ b/Schrick-Noah_CS-6643_Lab-10.docx
--- a/alignstats.tab
+++ b/alignstats.tab
@ -0,0 +1,13 @@
 species	seqlen	numgaps	nt_a	nt_c	nt_g	nt_t
 German_Neanderthal	379	81	0.31	0.32	0.13	0.24
 Russian_Neanderthal	345	76	0.33	0.34	0.11	0.22
 Human	360	76	0.33	0.34	0.11	0.22
 Gorilla_beringei_beringei	374	96	0.29	0.36	0.13	0.23
 Pan_troglodytes_troglodytes	340	105	0.32	0.36	0.1	0.22
 Puti_Orangutan	354	90	0.31	0.4	0.1	0.19
 Gorilla_gorilla_gorilla	367	71	0.3	0.35	0.14	0.21
 Gorilla_beringei_graueri	374	105	0.28	0.35	0.14	0.23
 Pan_troglodytes_schweinfurthii	339	110	0.34	0.35	0.09	0.22
 Pan_troglodytes_ellioti	411	111	0.33	0.36	0.1	0.21
 Pan_troglodytes_verus	339	39	0.34	0.37	0.1	0.2
 Jari_Orangutan	345	111	0.32	0.39	0.1	0.19
--- a/mtDNA.aux
+++ b/mtDNA.aux
@ -0,0 +1,14 @@
 \relax 
 \savedseqlength{1}{1}{369}
 \savedseqlength{1}{2}{374}
 \savedseqlength{1}{3}{374}
 \savedseqlength{1}{4}{354}
 \savedseqlength{1}{5}{345}
 \savedseqlength{1}{6}{360}
 \savedseqlength{1}{7}{379}
 \savedseqlength{1}{8}{345}
 \savedseqlength{1}{9}{340}
 \savedseqlength{1}{10}{339}
 \savedseqlength{1}{11}{411}
 \savedseqlength{1}{12}{339}
 \gdef \@abspage@last{2}
--- a/mtDNA.log
+++ b/mtDNA.log
@ -0,0 +1,308 @@
 This is pdfTeX, Version 3.141592653-2.6-1.40.24 (TeX Live 2022/Arch Linux) (preloaded format=pdflatex 2022.11.8)  21 NOV 2022 16:54
 entering extended mode
 restricted \write18 enabled.
 %&-line parsing enabled.
 **mtDNA.tex
 (./mtDNA.tex
 LaTeX2e <2021-11-15> patch level 1
 L3 programming layer <2022-04-10>
 (/usr/share/texmf-dist/tex/latex/base/article.cls
 Document Class: article 2021/10/04 v1.4n Standard LaTeX document class
 (/usr/share/texmf-dist/tex/latex/base/size10.clo
 File: size10.clo 2021/10/04 v1.4n Standard LaTeX file (size option)
 )
 \c@part=\count185
 \c@section=\count186
 \c@subsection=\count187
 \c@subsubsection=\count188
 \c@paragraph=\count189
 \c@subparagraph=\count190
 \c@figure=\count191
 \c@table=\count192
 \abovecaptionskip=\skip47
 \belowcaptionskip=\skip48
 \bibindent=\dimen138
 )
 (/usr/lib/R/library/msa/tex/texshade.sty
 Package: texshade 2021/04/01 LaTeX TeXshade (v1.26)
 Package `texshade', Version 1.26 of 2021/04/01.
 (/usr/share/texmf-dist/tex/latex/graphics/color.sty
 Package: color 2021/12/07 v1.3c Standard LaTeX Color (DPC)
 (/usr/share/texmf-dist/tex/latex/graphics-cfg/color.cfg
 File: color.cfg 2016/01/02 v1.6 sample color configuration
 )
 Package color Info: Driver file: dvips.def on input line 149.
 (/usr/share/texmf-dist/tex/latex/graphics-def/dvips.def
 File: dvips.def 2017/06/20 v3.1d Graphics/color driver for dvips
 )
 (/usr/share/texmf-dist/tex/latex/graphics/dvipsnam.def
 File: dvipsnam.def 2016/06/17 v3.0m Driver-dependent file (DPC,SPQR)
 ))
 (/usr/share/texmf-dist/tex/latex/graphics/graphics.sty
 Package: graphics 2021/03/04 v1.4d Standard LaTeX Graphics (DPC,SPQR)
 (/usr/share/texmf-dist/tex/latex/graphics/trig.sty
 Package: trig 2021/08/11 v1.11 sin cos tan (DPC)
 )
 (/usr/share/texmf-dist/tex/latex/graphics-cfg/graphics.cfg
 File: graphics.cfg 2016/06/04 v1.11 sample graphics configuration
 )
 Package graphics Info: Driver file: pdftex.def on input line 107.
 (/usr/share/texmf-dist/tex/latex/graphics-def/pdftex.def
 File: pdftex.def 2020/10/05 v1.2a Graphics/color driver for pdftex
 ))
 \structurefile=\read2
 \featurefile=\write3
 \alignfile=\read3
 \sublogofile=\read4
 \exp@rtfile=\write4
 \exp@rt@chimerafile=\write5
 (/usr/share/texmf-dist/tex/latex/amsfonts/amssymb.sty
 Package: amssymb 2013/01/14 v3.01 AMS font symbols
 (/usr/share/texmf-dist/tex/latex/amsfonts/amsfonts.sty
 Package: amsfonts 2013/01/14 v3.01 Basic AMSFonts support
 \@emptytoks=\toks16
 \symAMSa=\mathgroup4
 \symAMSb=\mathgroup5
 LaTeX Font Info:    Redeclaring math symbol \hbar on input line 98.
 LaTeX Font Info:    Overwriting math alphabet `\mathfrak' in version `bold'
 (Font)                  U/euf/m/n --> U/euf/b/n on input line 106.
 ))
 \symalphahelix=\mathgroup6
 \loopcount=\count193
 \innerloopcount=\count194
 \outerloopcount=\count195
 \seq@count=\count196
 \killseq@count=\count197
 \seq@percent=\count198
 \res@count=\count199
 \seq@pointer=\count266
 \pos@count=\count267
 \res@perline=\count268
 \end@count=\count269
 \cons@count=\count270
 \total@count=\count271
 \temp@count=\count272
 \triple@count=\count273
 \temp@@count=\count274
 \pos@sum=\count275
 \box@width=\skip49
 \name@width=\skip50
 \box@depth=\skip51
 \width@tmp=\skip52
 \box@height=\skip53
 \number@width=\skip54
 \line@stretch=\skip55
 \center@fill=\skip56
 \arrow@width=\skip57
 \arrow@height=\skip58
 \rule@thick=\skip59
 \arrow@thick=\skip60
 \logo@height=\skip61
 \equal@width=\skip62
 \equal@tmp=\skip63
 \equal@height=\skip64
 \temp@@length=\skip65
 \vspace@legend=\skip66
 \hspace@legend=\skip67
 \bar@length=\skip68
 Package color Info: Redefining color LightGray on input line 1716.
 Package color Info: Redefining color LightLightGray on input line 1817.
 Package color Info: Redefining color LightLightLightGray on input line 1919.
 )
 (/usr/share/texmf-dist/tex/latex/l3backend/l3backend-pdftex.def
 File: l3backend-pdftex.def 2022-04-14 L3 backend support: PDF output (pdfTeX)
 \l__color_backend_stack_int=\count276
 \l__pdf_internal_box=\box50
 )
 No file mtDNA.aux.
 \openout1 = `mtDNA.aux'.
 LaTeX Font Info:    Checking defaults for OML/cmm/m/it on input line 24.
 LaTeX Font Info:    ... okay on input line 24.
 LaTeX Font Info:    Checking defaults for OMS/cmsy/m/n on input line 24.
 LaTeX Font Info:    ... okay on input line 24.
 LaTeX Font Info:    Checking defaults for OT1/cmr/m/n on input line 24.
 LaTeX Font Info:    ... okay on input line 24.
 LaTeX Font Info:    Checking defaults for T1/cmr/m/n on input line 24.
 LaTeX Font Info:    ... okay on input line 24.
 LaTeX Font Info:    Checking defaults for TS1/cmr/m/n on input line 24.
 LaTeX Font Info:    ... okay on input line 24.
 LaTeX Font Info:    Checking defaults for OMX/cmex/m/n on input line 24.
 LaTeX Font Info:    ... okay on input line 24.
 LaTeX Font Info:    Checking defaults for U/cmr/m/n on input line 24.
 LaTeX Font Info:    ... okay on input line 24.
 (/usr/share/texmf-dist/tex/context/base/mkii/supp-pdf.mkii
 [Loading MPS to PDF converter (version 2006.09.02).]
 \scratchcounter=\count277
 \scratchdimen=\dimen139
 \scratchbox=\box51
 \nofMPsegments=\count278
 \nofMParguments=\count279
 \everyMPshowfont=\toks17
 \MPscratchCnt=\count280
 \MPscratchDim=\dimen140
 \MPnumerator=\count281
 \makeMPintoPDFobject=\count282
 \everyMPtoPDFconversion=\toks18
 ) (/usr/share/texmf-dist/tex/latex/epstopdf-pkg/epstopdf-base.sty
 Package: epstopdf-base 2020-01-24 v2.11 Base part for package epstopdf
 Package epstopdf-base Info: Redefining graphics rule for `.eps' on input line 4
 85.
 (/usr/share/texmf-dist/tex/latex/latexconfig/epstopdf-sys.cfg
 File: epstopdf-sys.cfg 2010/07/13 v1.3 Configuration of (r)epstopdf for TeX Liv
 e
 ))
 (/tmp/Rtmp9tSm8e/seq57d10064df.fasta:
 Runaway argument?
 ! Paragraph ended before \inf@@get was complete.
 <to be read again> 
                   \par 
 l.25 ...hade}{/tmp/Rtmp9tSm8e/seq57d10064df.fasta}
 I suspect you've forgotten a `}', causing me to apply this
 control sequence to too much text. How can we recover?
 My plan is to forget the whole thing and hope for the best.
 ! Misplaced alignment tab character &.
 \msfline ->\par &
                  & & & @
 l.25 ...hade}{/tmp/Rtmp9tSm8e/seq57d10064df.fasta}
 I can't figure out why you would want to use a tab mark
 here. If you just want an ampersand, the remedy is
 simple: Just type `I\&' now. But if some right brace
 up above has ended a previous alignment prematurely,
 you're probably due for more error messages, and you
 might try typing `S' now just to see what is salvageable.
 ! Misplaced alignment tab character &.
 \msfline ->\par & &
                    & & @
 l.25 ...hade}{/tmp/Rtmp9tSm8e/seq57d10064df.fasta}
 I can't figure out why you would want to use a tab mark
 here. If you just want an ampersand, the remedy is
 simple: Just type `I\&' now. But if some right brace
 up above has ended a previous alignment prematurely,
 you're probably due for more error messages, and you
 might try typing `S' now just to see what is salvageable.
 ! Misplaced alignment tab character &.
 \msfline ->\par & & &
                      & @
 l.25 ...hade}{/tmp/Rtmp9tSm8e/seq57d10064df.fasta}
 I can't figure out why you would want to use a tab mark
 here. If you just want an ampersand, the remedy is
 simple: Just type `I\&' now. But if some right brace
 up above has ended a previous alignment prematurely,
 you're probably due for more error messages, and you
 might try typing `S' now just to see what is salvageable.
 ! Misplaced alignment tab character &.
 \msfline ->\par & & & &
                        @
 l.25 ...hade}{/tmp/Rtmp9tSm8e/seq57d10064df.fasta}
 I can't figure out why you would want to use a tab mark
 here. If you just want an ampersand, the remedy is
 simple: Just type `I\&' now. But if some right brace
 up above has ended a previous alignment prematurely,
 you're probably due for more error messages, and you
 might try typing `S' now just to see what is salvageable.
 Runaway argument?
 ! Paragraph ended before \check@letter was complete.
 <to be read again> 
                   \par 
 l.25 ...hade}{/tmp/Rtmp9tSm8e/seq57d10064df.fasta}
 I suspect you've forgotten a `}', causing me to apply this
 control sequence to too much text. How can we recover?
 My plan is to forget the whole thing and hope for the best.
 Runaway argument?
 ! Paragraph ended before \firstchar@get was complete.
 <to be read again> 
                   \par 
 l.25 ...hade}{/tmp/Rtmp9tSm8e/seq57d10064df.fasta}
 I suspect you've forgotten a `}', causing me to apply this
 control sequence to too much text. How can we recover?
 My plan is to forget the whole thing and hope for the best.
 Runaway argument?
 ! Paragraph ended before \firstchar@get was complete.
 <to be read again> 
                   \par 
 l.47 \end{texshade}
 I suspect you've forgotten a `}', causing me to apply this
 control sequence to too much text. How can we recover?
 My plan is to forget the whole thing and hope for the best.
 Runaway argument?
 ! Paragraph ended before \res@get was complete.
 <to be read again> 
                   \par 
 l.47 \end{texshade}
 I suspect you've forgotten a `}', causing me to apply this
 control sequence to too much text. How can we recover?
 My plan is to forget the whole thing and hope for the best.
 ! Misplaced alignment tab character &.
 \seq@line ->\par &
                  @
 l.47 \end{texshade}
 I can't figure out why you would want to use a tab mark
 here. If you just want an ampersand, the remedy is
 simple: Just type `I\&' now. But if some right brace
 up above has ended a previous alignment prematurely,
 you're probably due for more error messages, and you
 might try typing `S' now just to see what is salvageable.
 . . . . . [1
 Non-PDF special ignored!
 <special> papersize=794.96999pt,614.295pt
 {/var/lib/texmf/fonts/map/pdftex/updmap/pdftex.map}] )
 LaTeX Font Info:    Trying to load font information for U+msa on input line 47.
 (/usr/share/texmf-dist/tex/latex/amsfonts/umsa.fd
 File: umsa.fd 2013/01/14 v3.01 AMS symbols A
 )
 LaTeX Font Info:    Trying to load font information for U+msb on input line 47.
 (/usr/share/texmf-dist/tex/latex/amsfonts/umsb.fd
 File: umsb.fd 2013/01/14 v3.01 AMS symbols B
 ) [2] (./mtDNA.aux) ) 
 Here is how much of TeX's memory you used:
 8179 strings out of 478238
 97095 string characters out of 5850456
 1334999 words of memory out of 5000000
 26433 multiletter control sequences out of 15000+600000
 471599 words of font info for 39 fonts, out of 8000000 for 9000
 1141 hyphenation exceptions out of 8191
 541i,6n,65p,182b,4214s stack positions out of 5000i,500n,10000p,200000b,80000s
 </usr/share/texmf-dist/fonts/type1/public/amsfonts/cm/cmr1
 0.pfb></usr/share/texmf-dist/fonts/type1/public/amsfonts/cm/cmsy10.pfb></usr/sh
 are/texmf-dist/fonts/type1/public/amsfonts/cm/cmtt10.pfb>
 Output written on mtDNA.pdf (2 pages, 86917 bytes).
 PDF statistics:
 26 PDF objects out of 1000 (max. 8388607)
 15 compressed objects within 1 object stream
 0 named destinations out of 1000 (max. 500000)
 1 words of extra memory for PDF output out of 10000 (max. 10000000)
--- a/mtDNA.pdf
+++ b/mtDNA.pdf
--- a/mtDNA.tex
+++ b/mtDNA.tex
@ -0,0 +1,48 @@
 \documentclass[10pt]{article}
 \usepackage{texshade}
 \headheight=0pt
 \headsep=0pt
 \hoffset=0pt
 \voffset=0pt
 \paperwidth=11in
 \paperheight=8.5in
 \ifx\pdfoutput\undefined
 \relax
 \else
 \pdfpagewidth=\paperwidth
 \pdfpageheight=\paperheight
 \fi
 \oddsidemargin=-0.9in
 \topmargin=-0.7in
 \textwidth=10.8in
 \textheight=7.9in
 \pagestyle{empty}
 \begin{document}
 \begin{texshade}{/tmp/Rtmp9tSm8e/seq57d10064df.fasta}
 \seqtype{N}
 \shadingmode{identical}
 \threshold{50}
 \showconsensus[ColdHot]{bottom}
 \shadingcolors{blues}
 \hidelogoscale
 \shownames{left}
 \nameseq{1}{Gorilla gorilla gorilla}
 \nameseq{2}{Gorilla beringei beringei}
 \nameseq{3}{Gorilla beringei graueri}
 \nameseq{4}{Puti Orangutan}
 \nameseq{5}{Jari Orangutan}
 \nameseq{6}{Human}
 \nameseq{7}{German Neanderthal}
 \nameseq{8}{Russian Neanderthal}
 \nameseq{9}{Pan troglodytes troglodytes}
 \nameseq{10}{Pan troglodytes schweinfurthii}
 \nameseq{11}{Pan troglodytes ellioti}
 \nameseq{12}{Pan troglodytes verus}
 \shownumbering{right}
 \showlegend
 \end{texshade}
 \end{document}