diff --git a/.Rhistory b/.Rhistory new file mode 100644 index 0000000..58aa9db --- /dev/null +++ b/.Rhistory @@ -0,0 +1,512 @@ +################# Max-Log-Likelihood ################# +n <- length(g.breaks.clean) +kmin <- g.breaks.clean[1] +alpha.ML <- 1 + n/sum(log(g.breaks.clean)/kmin) +alpha.ML +lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4) +# Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course +# Degree Distribution +# Professor: Dr. McKinney, Spring 2022 +# Noah Schrick - 1492657 +library(igraph) +library(igraphdata) +data(yeast) +g <- yeast +g.netname <- "Yeast" +################# Set up Work ################# +g.vec <- degree(g) +g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname, +" Network")) +legend("topright", c("Guess", "Poisson", "Least-Squares Fit", +"Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6", +"#006CD1", "#E66100", "#D35FB7")) +g.mean <- mean(g.vec) +g.seq <- 0:max(g.vec) # x-axis +################# Guessing Alpha ################# +alpha.guess <- 1.5 +lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1) +################# Poisson ################# +g.pois <- dpois(g.seq, g.mean, log=F) +lines(g.seq, g.pois, col="#006CD1", lty=2) +################# Linear model: Least-Squares Fit ################# +g.breaks <- g.hist$breaks[-c(1,2,3)] # remove 0 +g.probs <- g.hist$density[-1] # make lengths match +# Need to clean up probabilities that are 0 +nz.probs.mask <- g.probs!=0 +g.breaks.clean <- g.breaks[nz.probs.mask] +g.probs.clean <- g.breaks[nz.probs.mask] +#plot(log(g.breaks.clean), log(g.probs.clean)) +g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean)) +summary(g.fit) +alpha.LM <- coef(g.fit)[2] +lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3) +################# Max-Log-Likelihood ################# +n <- length(g.breaks.clean) +kmin <- g.breaks.clean[1] +alpha.ML <- 1 + n/sum(log(g.breaks.clean)/kmin) +alpha.ML +lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4) +# Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course +# Degree Distribution +# Professor: Dr. McKinney, Spring 2022 +# Noah Schrick - 1492657 +library(igraph) +library(igraphdata) +data(yeast) +g <- yeast +g.netname <- "Yeast" +################# Set up Work ################# +g.vec <- degree(g) +g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname, +" Network")) +legend("topright", c("Guess", "Poisson", "Least-Squares Fit", +"Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6", +"#006CD1", "#E66100", "#D35FB7")) +g.mean <- mean(g.vec) +g.seq <- 0:max(g.vec) # x-axis +################# Guessing Alpha ################# +alpha.guess <- 1.5 +lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1) +################# Poisson ################# +g.pois <- dpois(g.seq, g.mean, log=F) +lines(g.seq, g.pois, col="#006CD1", lty=2) +################# Linear model: Least-Squares Fit ################# +g.breaks <- g.hist$breaks[-c(1)] # remove 0 +g.probs <- g.hist$density[-1] # make lengths match +# Need to clean up probabilities that are 0 +nz.probs.mask <- g.probs!=0 +g.breaks.clean <- g.breaks[nz.probs.mask] +g.probs.clean <- g.breaks[nz.probs.mask] +#plot(log(g.breaks.clean), log(g.probs.clean)) +g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean)) +summary(g.fit) +alpha.LM <- coef(g.fit)[2] +lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3) +################# Max-Log-Likelihood ################# +n <- length(g.breaks.clean) +kmin <- g.breaks.clean[1] +alpha.ML <- 1 + n/sum(log(g.breaks.clean)/kmin) +alpha.ML +lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4) +# Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course +# Degree Distribution +# Professor: Dr. McKinney, Spring 2022 +# Noah Schrick - 1492657 +library(igraph) +library(igraphdata) +data(yeast) +g <- yeast +g.netname <- "Yeast" +################# Set up Work ################# +g.vec <- degree(g) +g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname, +" Network")) +legend("topright", c("Guess", "Poisson", "Least-Squares Fit", +"Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6", +"#006CD1", "#E66100", "#D35FB7")) +g.mean <- mean(g.vec) +g.seq <- 0:max(g.vec) # x-axis +################# Guessing Alpha ################# +alpha.guess <- 1.5 +lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1) +################# Poisson ################# +g.pois <- dpois(g.seq, g.mean, log=F) +lines(g.seq, g.pois, col="#006CD1", lty=2) +################# Linear model: Least-Squares Fit ################# +#g.breaks <- g.hist$breaks[-c(1)] # remove 0 +g.breaks <- g.hist$breaks # remove 0 +g.probs <- g.hist$density[-1] # make lengths match +# Need to clean up probabilities that are 0 +nz.probs.mask <- g.probs!=0 +g.breaks.clean <- g.breaks[nz.probs.mask] +g.probs.clean <- g.breaks[nz.probs.mask] +#plot(log(g.breaks.clean), log(g.probs.clean)) +g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean)) +summary(g.fit) +alpha.LM <- coef(g.fit)[2] +lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3) +################# Max-Log-Likelihood ################# +n <- length(g.breaks.clean) +kmin <- g.breaks.clean[1] +alpha.ML <- 1 + n/sum(log(g.breaks.clean)/kmin) +alpha.ML +lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4) +# Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course +# Degree Distribution +# Professor: Dr. McKinney, Spring 2022 +# Noah Schrick - 1492657 +library(igraph) +library(igraphdata) +data(yeast) +g <- yeast +g.netname <- "Yeast" +################# Set up Work ################# +g.vec <- degree(g) +g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname, +" Network")) +legend("topright", c("Guess", "Poisson", "Least-Squares Fit", +"Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6", +"#006CD1", "#E66100", "#D35FB7")) +g.mean <- mean(g.vec) +g.seq <- 0:max(g.vec) # x-axis +################# Guessing Alpha ################# +alpha.guess <- 1.5 +lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1) +################# Poisson ################# +g.pois <- dpois(g.seq, g.mean, log=F) +lines(g.seq, g.pois, col="#006CD1", lty=2) +################# Linear model: Least-Squares Fit ################# +g.breaks <- g.hist$breaks[-c(1)] # remove 0 +g.probs <- g.hist$density[-1] # make lengths match +# Need to clean up probabilities that are 0 +nz.probs.mask <- g.probs!=0 +g.breaks.clean <- g.breaks[nz.probs.mask] +g.probs.clean <- g.probs[nz.probs.mask] +#plot(log(g.breaks.clean), log(g.probs.clean)) +g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean)) +summary(g.fit) +alpha.LM <- coef(g.fit)[2] +lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3) +################# Max-Log-Likelihood ################# +n <- length(g.breaks.clean) +kmin <- g.breaks.clean[1] +alpha.ML <- 1 + n/sum(log(g.breaks.clean)/kmin) +alpha.ML +lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4) +alpha.LM +# Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course +# Degree Distribution +# Professor: Dr. McKinney, Spring 2022 +# Noah Schrick - 1492657 +library(igraph) +library(igraphdata) +data(yeast) +g <- yeast +g.netname <- "Yeast" +################# Set up Work ################# +g.vec <- degree(g) +g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname, +" Network")) +legend("topright", c("Guess", "Poisson", "Least-Squares Fit", +"Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6", +"#006CD1", "#E66100", "#D35FB7")) +g.mean <- mean(g.vec) +g.seq <- 0:max(g.vec) # x-axis +################# Guessing Alpha ################# +alpha.guess <- 1.5 +lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1) +################# Poisson ################# +g.pois <- dpois(g.seq, g.mean, log=F) +lines(g.seq, g.pois, col="#006CD1", lty=2) +################# Linear model: Least-Squares Fit ################# +g.breaks <- g.hist$breaks[-c(1)] # remove 0 +g.probs <- g.hist$density[-1] # make lengths match +# Need to clean up probabilities that are 0 +nz.probs.mask <- g.probs!=0 +g.breaks.clean <- g.breaks[nz.probs.mask] +g.probs.clean <- g.probs[nz.probs.mask] +#plot(log(g.breaks.clean), log(g.probs.clean)) +g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean)) +summary(g.fit) +alpha.LM <- coef(g.fit)[2] +lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3) +################# Max-Log-Likelihood ################# +n <- length(g.breaks.clean) +kmin <- g.breaks.clean[1] +alpha.ML <- 1 + n/sum(log(g.breaks.clean/kmin)) +alpha.ML +lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4) +# Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course +# Degree Distribution +# Professor: Dr. McKinney, Spring 2022 +# Noah Schrick - 1492657 +library(igraph) +library(igraphdata) +data(yeast) +g <- yeast +g.netname <- "Yeast" +################# Set up Work ################# +g.vec <- degree(g) +g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname, +" Network")) +legend("topright", c("Guess", "Poisson", "Least-Squares Fit", +"Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6", +"#006CD1", "#E66100", "#D35FB7")) +g.mean <- mean(g.vec) +g.seq <- 0:max(g.vec) # x-axis +################# Guessing Alpha ################# +alpha.guess <- 1.5 +lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1, lwd=5) +################# Poisson ################# +g.pois <- dpois(g.seq, g.mean, log=F) +lines(g.seq, g.pois, col="#006CD1", lty=2) +################# Linear model: Least-Squares Fit ################# +g.breaks <- g.hist$breaks[-c(1)] # remove 0 +g.probs <- g.hist$density[-1] # make lengths match +# Need to clean up probabilities that are 0 +nz.probs.mask <- g.probs!=0 +g.breaks.clean <- g.breaks[nz.probs.mask] +g.probs.clean <- g.probs[nz.probs.mask] +#plot(log(g.breaks.clean), log(g.probs.clean)) +g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean)) +summary(g.fit) +alpha.LM <- coef(g.fit)[2] +lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3) +################# Max-Log-Likelihood ################# +n <- length(g.breaks.clean) +kmin <- g.breaks.clean[1] +alpha.ML <- 1 + n/sum(log(g.breaks.clean/kmin)) +alpha.ML +lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4) +# Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course +# Degree Distribution +# Professor: Dr. McKinney, Spring 2022 +# Noah Schrick - 1492657 +library(igraph) +library(igraphdata) +data(yeast) +g <- yeast +g.netname <- "Yeast" +################# Set up Work ################# +g.vec <- degree(g) +g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname, +" Network")) +legend("topright", c("Guess", "Poisson", "Least-Squares Fit", +"Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6", +"#006CD1", "#E66100", "#D35FB7")) +g.mean <- mean(g.vec) +g.seq <- 0:max(g.vec) # x-axis +################# Guessing Alpha ################# +alpha.guess <- 1.5 +lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1, lwd=3) +################# Poisson ################# +g.pois <- dpois(g.seq, g.mean, log=F) +lines(g.seq, g.pois, col="#006CD1", lty=2) +################# Linear model: Least-Squares Fit ################# +g.breaks <- g.hist$breaks[-c(1)] # remove 0 +g.probs <- g.hist$density[-1] # make lengths match +# Need to clean up probabilities that are 0 +nz.probs.mask <- g.probs!=0 +g.breaks.clean <- g.breaks[nz.probs.mask] +g.probs.clean <- g.probs[nz.probs.mask] +#plot(log(g.breaks.clean), log(g.probs.clean)) +g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean)) +summary(g.fit) +alpha.LM <- coef(g.fit)[2] +lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3) +################# Max-Log-Likelihood ################# +n <- length(g.breaks.clean) +kmin <- g.breaks.clean[1] +alpha.ML <- 1 + n/sum(log(g.breaks.clean/kmin)) +alpha.ML +lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4) +# Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course +# Degree Distribution +# Professor: Dr. McKinney, Spring 2022 +# Noah Schrick - 1492657 +library(igraph) +library(igraphdata) +data(yeast) +g <- yeast +g.netname <- "Yeast" +################# Set up Work ################# +g.vec <- degree(g) +g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname, +" Network")) +legend("topright", c("Guess", "Poisson", "Least-Squares Fit", +"Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6", +"#006CD1", "#E66100", "#D35FB7")) +g.mean <- mean(g.vec) +g.seq <- 0:max(g.vec) # x-axis +################# Guessing Alpha ################# +alpha.guess <- 1.5 +lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1, lwd=3) +################# Poisson ################# +g.pois <- dpois(g.seq, g.mean, log=F) +lines(g.seq, g.pois, col="#006CD1", lty=2, lwd=3) +################# Linear model: Least-Squares Fit ################# +g.breaks <- g.hist$breaks[-c(1)] # remove 0 +g.probs <- g.hist$density[-1] # make lengths match +# Need to clean up probabilities that are 0 +nz.probs.mask <- g.probs!=0 +g.breaks.clean <- g.breaks[nz.probs.mask] +g.probs.clean <- g.probs[nz.probs.mask] +#plot(log(g.breaks.clean), log(g.probs.clean)) +g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean)) +summary(g.fit) +alpha.LM <- coef(g.fit)[2] +lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3, lwd=3) +################# Max-Log-Likelihood ################# +n <- length(g.breaks.clean) +kmin <- g.breaks.clean[1] +alpha.ML <- 1 + n/sum(log(g.breaks.clean/kmin)) +alpha.ML +lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4, lwd=3) +plot(yeast) +hist(yeast) +hist(g.vec) +g.pois +g.mean +alpha.LM +alpha.ML +degree(g) +sort(degree(g)) +sort(degree(g),decreasing=FALSE) +sort(degree(g),decreasing=F) +sort(degree(g),decreasing=false) +sort(degree(g), decreasing = TRUE) +head(sort(degree(g), decreasing = TRUE)) +stddev(degree(g)) +sd(degree(g)) +tail(sort(degree(g), decreasing = TRUE)) +plot(log(g.breaks.clean), log(g.probs.clean)) +# Homework 4 for the University of Tulsa' s CS-7863 Network Theory Course +# Degree Distribution +# Professor: Dr. McKinney, Spring 2022 +# Noah Schrick - 1492657 +library(igraph) +library(igraphdata) +data(yeast) +g <- yeast +g.netname <- "Yeast" +################# Set up Work ################# +g.vec <- degree(g) +g.hist <- hist(g.vec, freq=FALSE, main=paste("Histogram of the", g.netname, +" Network")) +legend("topright", c("Guess", "Poisson", "Least-Squares Fit", +"Max Log-Likelihood"), lty=c(1,2,3,4), col=c("#40B0A6", +"#006CD1", "#E66100", "#D35FB7")) +g.mean <- mean(g.vec) +g.seq <- 0:max(g.vec) # x-axis +################# Guessing Alpha ################# +alpha.guess <- 1.5 +lines(g.seq, g.seq^(-alpha.guess), col="#40B0A6", lty=1, lwd=3) +################# Poisson ################# +g.pois <- dpois(g.seq, g.mean, log=F) +lines(g.seq, g.pois, col="#006CD1", lty=2, lwd=3) +################# Linear model: Least-Squares Fit ################# +g.breaks <- g.hist$breaks[-c(1)] # remove 0 +g.probs <- g.hist$density[-1] # make lengths match +# Need to clean up probabilities that are 0 +nz.probs.mask <- g.probs!=0 +g.breaks.clean <- g.breaks[nz.probs.mask] +g.probs.clean <- g.probs[nz.probs.mask] +plot(log(g.breaks.clean), log(g.probs.clean)) +g.fit <- lm(log(g.probs.clean)~log(g.breaks.clean)) +summary(g.fit) +alpha.LM <- coef(g.fit)[2] +lines(g.seq, g.seq^(-alpha.LM), col="#E66100", lty=3, lwd=3) +################# Max-Log-Likelihood ################# +n <- length(g.breaks.clean) +kmin <- g.breaks.clean[1] +alpha.ML <- 1 + n/sum(log(g.breaks.clean/kmin)) +alpha.ML +lines(g.seq, g.seq^(-alpha.ML), col="#D35FB7", lty=4, lwd=3) +plot(log(g.breaks.clean), log(g.probs.clean)) +g.breaks.clean <- g.breaks[nz.probs.mask] +g.probs.clean <- g.probs[nz.probs.mask] +plot(log(g.breaks.clean), log(g.probs.clean)) +## Set Working Directory to file directory - RStudio approach +setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) +#### Part A: GenBank sequences and a multiple fasta file +if (!require("ape")) install.packages("ape") +library(ape) # needed for read.GenBank +# fetch the mtDNA sequences +mtDNA.MultiSeqs.list<-read.GenBank(c("AF011222","AF254446","X90314","AF089820", +"AF176766","AF451972", "AY079510", +"AF050738","AF176722","AF315498", +"AF176731","AF451964"), as.character=TRUE) +# look at species names +mtDNA.Species<-attr(mtDNA.MultiSeqs.list,"species") +# use species as name instead of genbank id +names(mtDNA.MultiSeqs.list)<-mtDNA.Species +names(mtDNA.MultiSeqs.list) +# need to fix some names +names(mtDNA.MultiSeqs.list)[1] <- paste("German_Neanderthal",sep="") +names(mtDNA.MultiSeqs.list)[2] <- paste("Russian_Neanderthal",sep="") +names(mtDNA.MultiSeqs.list)[3] <- paste("Human") +names(mtDNA.MultiSeqs.list)[6] <- paste("Puti_Orangutan",sep="") +names(mtDNA.MultiSeqs.list)[12] <- paste("Jari_Orangutan",sep="") +names(mtDNA.MultiSeqs.list) +# look at one of the sequences using $ +mtDNA.MultiSeqs.list$Human +length(mtDNA.MultiSeqs.list$Human) +## Convert to Biostrings object for the sequences +if (!require("BiocManager")) install.packages("BiocManager") +library(BiocManager) +if (!require("Biostrings")) BiocManager::install("Biostrings") +library(Biostrings) +# loop through the list to create vector of strings for Biostrings input +Names.vec <- c() # initialize speices names string vector +Seqs.vec <- c() # initialize sequence string vector +for (mtDNA.name in names(mtDNA.MultiSeqs.list)) +{ +Names.vec <- c(Names.vec,mtDNA.name) # concatenate vector +Seqs.vec <-c(Seqs.vec,paste(mtDNA.MultiSeqs.list[[mtDNA.name]],collapse="")) +} +mtDNA.multSeqs.bstr <- DNAStringSet(Seqs.vec) # convert to Biostring +# name the Biostring sequences and compute stats +names(mtDNA.multSeqs.bstr) <- Names.vec # count nucs and sequence lengths +num.nts <- alphabetFrequency(mtDNA.multSeqs.bstr)[,1:4] +mtDNA.lengths <- rowSums(num.nts) +proportion.nts <- num.nts/mtDNA.lengths +num.nts +names(mtDA.multSeqs.bstr) +names(mtDNA.multSeqs.bstr) +mtDNA.multSeqs.bstr +mtDNA.multSeqs.bstr +mtDNA.multSeqs.bstr +mtDNA.multSeqs.bstr --help +print(mtDNA.multSeqs.bstr) +print(mtDNA.multSeqs.bstr, -n40) +print(mtDNA.multSeqs.bstr, -n 40) +print(mtDNA.multSeqs.bstr, n=20) +class(mtDNA.multSeqs.bstr) +print(mtDNA.multSeqs.bstr) +?print() +?print +table(mtDNA.multSeqs.bstr) +mtDNA.multSeqs.bstr +mtDNA.multSeqs.bstr$width +mtDNA.multSeqs.bstr[,1]$width +mtDNA.multSeqs.bstr[1,]$width +mtDNA.multSeqs.bstr[1]$width +mtDNA.multSeqs.bstr[1] +mtDNA.multSeqs.bstr[1]$seq +mtDNA.multSeqs.bstr[1]$width +mtDNA.multSeqs.bstr[1]$names +mtDNA.multSeqs.bstr$names +# name the Biostring sequences and compute stats +names(mtDNA.multSeqs.bstr) <- Names.vec # count nucs and sequence lengths +mtDNA.multSeqs.bstr$names +mtDNA.multSeqs.bstr$Names +mtDNA.lengths +table(mtDNA.lengths, Names.vec) +cbind(mtDNA.lengths, Names.vec) +sort(cbind(mtDNA.lengths, Names.vec)) +cbind(mtDNA.lengths, Names.vec) +cbind(mtDNA.lengths, Names.vec) +table(cbind(mtDNA.lengths, Names.vec)) +rbind(cbind(mtDNA.lengths, Names.vec)) +sort(rbind(cbind(mtDNA.lengths, Names.vec))) +rbind(mtDNA.lengths, Names.vec) +cbind(mtDNA.lengths, Names.vec) +max(cbind(mtDNA.lengths, Names.vec)) +max(cbind(mtDNA.lengths, Names.vec))[,1] +max(cbind(mtDNA.lengths, Names.vec)[,1]) +max(cbind(mtDNA.lengths, Names.vec)[1,]) +max(cbind(mtDNA.lengths, Names.vec)[,1]) +max(cbind(mtDNA.lengths, Names.vec)) +cbind(mtDNA.lengths, Names.vec) +nlengthnames <- cbind(mtDNA.lengths, Names.vec) +max(nlengthnames[,1]) +nlengthnames <- cbind(mtDNA.lengths, Names.vec) +nlengthnames[which.max(nlengthnames[,1])] +idx <- which.max(nlengthnames[,1]) +idx +nlengthnames[idx, idx] +nlengthnames[idx] +nlengthnames +nlengthnames[idx,] +proportion.nts <- num.nts/mtDNA.lengths +proportion.nts diff --git a/MSA.png b/MSA.png new file mode 100644 index 0000000..f7daf77 Binary files /dev/null and b/MSA.png differ diff --git a/Schrick-Noah_CS-6643_Lab-10.R b/Schrick-Noah_CS-6643_Lab-10.R index 44ea929..5e036c9 100644 --- a/Schrick-Noah_CS-6643_Lab-10.R +++ b/Schrick-Noah_CS-6643_Lab-10.R @@ -48,7 +48,7 @@ mtDNA.multSeqs.bstr <- DNAStringSet(Seqs.vec) # convert to Biostring # name the Biostring sequences and compute stats names(mtDNA.multSeqs.bstr) <- Names.vec # count nucs and sequence lengths -# num.nts <- alphabetFrequency(mtDNA.multSeqs.bstr)[,1:4] +num.nts <- alphabetFrequency(mtDNA.multSeqs.bstr)[,1:4] mtDNA.lengths <- rowSums(num.nts) proportion.nts <- num.nts/mtDNA.lengths @@ -56,3 +56,40 @@ proportion.nts <- num.nts/mtDNA.lengths nlengthnames <- cbind(mtDNA.lengths, Names.vec) idx <- which.max(nlengthnames[,1]) nlengthnames[idx,] + + +#### Part B: Multiple Sequence Alignment +if (!require("BiocManager")) install.packages("BiocManager") +library(BiocManager) +if (!require("msa")) BiocManager::install("msa") +library(msa) +mtDNA.msa <- msa(mtDNA.multSeqs.bstr,method="ClustalOmega") +msaPrettyPrint(mtDNA.msa, file="mtDNA.pdf", output="pdf", showNames="left", + showLogo="none", askForOverwrite=FALSE, verbose=TRUE ) +## loop to make results data frame +num_seqs <- length(Names.vec) +# initialize data frame +align.stats.df <- data.frame(species=rep(NA,num_seqs), seqlen=rep(0,num_seqs), + numgaps=rep(0,num_seqs), nt_a=rep(NA,num_seqs), + nt_c=rep(NA,num_seqs), nt_g=rep(NA,num_seqs), + nt_t=rep(NA,num_seqs)) +# DNAbin type required for dist.dna and helpful for other calculations +mtDNA.msa.DNAbin <- as.DNAbin(mtDNA.msa) +for (i in 1:num_seqs){ + seq_name <- Names.vec[i] + seq.vec <- as.character(mtDNA.msa.DNAbin[i,]) + num.gaps <- sum(seq.vec=="-") + prop.nt.i <- proportion.nts[i,] + align.stats.df[i,] <- c(seq_name, mtDNA.lengths[i], num.gaps, + round(prop.nt.i[1],digits=2), round(prop.nt.i[2],digits=2), + round(prop.nt.i[3],digits=2), round(prop.nt.i[4],digits=2)) +} + +# write to file +write.table(align.stats.df,file="alignstats.tab",sep = "\t", row.names=FALSE, quote=FALSE) + +# you can use $ operator to grab a named column from a data.frame +# similar to grabbing a named variable from a list +align.stats.df$species +align.stats.df$nt_a # strings by default +as.numeric(align.stats.df$nt_a) # convert to numeric diff --git a/Schrick-Noah_CS-6643_Lab-10.docx b/Schrick-Noah_CS-6643_Lab-10.docx index ec49494..b690bb1 100644 Binary files a/Schrick-Noah_CS-6643_Lab-10.docx and b/Schrick-Noah_CS-6643_Lab-10.docx differ diff --git a/alignstats.tab b/alignstats.tab new file mode 100644 index 0000000..26ea19f --- /dev/null +++ b/alignstats.tab @@ -0,0 +1,13 @@ +species seqlen numgaps nt_a nt_c nt_g nt_t +German_Neanderthal 379 81 0.31 0.32 0.13 0.24 +Russian_Neanderthal 345 76 0.33 0.34 0.11 0.22 +Human 360 76 0.33 0.34 0.11 0.22 +Gorilla_beringei_beringei 374 96 0.29 0.36 0.13 0.23 +Pan_troglodytes_troglodytes 340 105 0.32 0.36 0.1 0.22 +Puti_Orangutan 354 90 0.31 0.4 0.1 0.19 +Gorilla_gorilla_gorilla 367 71 0.3 0.35 0.14 0.21 +Gorilla_beringei_graueri 374 105 0.28 0.35 0.14 0.23 +Pan_troglodytes_schweinfurthii 339 110 0.34 0.35 0.09 0.22 +Pan_troglodytes_ellioti 411 111 0.33 0.36 0.1 0.21 +Pan_troglodytes_verus 339 39 0.34 0.37 0.1 0.2 +Jari_Orangutan 345 111 0.32 0.39 0.1 0.19 diff --git a/mtDNA.aux b/mtDNA.aux new file mode 100644 index 0000000..e3b20cc --- /dev/null +++ b/mtDNA.aux @@ -0,0 +1,14 @@ +\relax +\savedseqlength{1}{1}{369} +\savedseqlength{1}{2}{374} +\savedseqlength{1}{3}{374} +\savedseqlength{1}{4}{354} +\savedseqlength{1}{5}{345} +\savedseqlength{1}{6}{360} +\savedseqlength{1}{7}{379} +\savedseqlength{1}{8}{345} +\savedseqlength{1}{9}{340} +\savedseqlength{1}{10}{339} +\savedseqlength{1}{11}{411} +\savedseqlength{1}{12}{339} +\gdef \@abspage@last{2} diff --git a/mtDNA.log b/mtDNA.log new file mode 100644 index 0000000..e4b56d1 --- /dev/null +++ b/mtDNA.log @@ -0,0 +1,308 @@ +This is pdfTeX, Version 3.141592653-2.6-1.40.24 (TeX Live 2022/Arch Linux) (preloaded format=pdflatex 2022.11.8) 21 NOV 2022 16:54 +entering extended mode + restricted \write18 enabled. + %&-line parsing enabled. +**mtDNA.tex +(./mtDNA.tex +LaTeX2e <2021-11-15> patch level 1 +L3 programming layer <2022-04-10> +(/usr/share/texmf-dist/tex/latex/base/article.cls +Document Class: article 2021/10/04 v1.4n Standard LaTeX document class +(/usr/share/texmf-dist/tex/latex/base/size10.clo +File: size10.clo 2021/10/04 v1.4n Standard LaTeX file (size option) +) +\c@part=\count185 +\c@section=\count186 +\c@subsection=\count187 +\c@subsubsection=\count188 +\c@paragraph=\count189 +\c@subparagraph=\count190 +\c@figure=\count191 +\c@table=\count192 +\abovecaptionskip=\skip47 +\belowcaptionskip=\skip48 +\bibindent=\dimen138 +) +(/usr/lib/R/library/msa/tex/texshade.sty +Package: texshade 2021/04/01 LaTeX TeXshade (v1.26) + +Package `texshade', Version 1.26 of 2021/04/01. +(/usr/share/texmf-dist/tex/latex/graphics/color.sty +Package: color 2021/12/07 v1.3c Standard LaTeX Color (DPC) + +(/usr/share/texmf-dist/tex/latex/graphics-cfg/color.cfg +File: color.cfg 2016/01/02 v1.6 sample color configuration +) +Package color Info: Driver file: dvips.def on input line 149. + +(/usr/share/texmf-dist/tex/latex/graphics-def/dvips.def +File: dvips.def 2017/06/20 v3.1d Graphics/color driver for dvips +) +(/usr/share/texmf-dist/tex/latex/graphics/dvipsnam.def +File: dvipsnam.def 2016/06/17 v3.0m Driver-dependent file (DPC,SPQR) +)) +(/usr/share/texmf-dist/tex/latex/graphics/graphics.sty +Package: graphics 2021/03/04 v1.4d Standard LaTeX Graphics (DPC,SPQR) + +(/usr/share/texmf-dist/tex/latex/graphics/trig.sty +Package: trig 2021/08/11 v1.11 sin cos tan (DPC) +) +(/usr/share/texmf-dist/tex/latex/graphics-cfg/graphics.cfg +File: graphics.cfg 2016/06/04 v1.11 sample graphics configuration +) +Package graphics Info: Driver file: pdftex.def on input line 107. + +(/usr/share/texmf-dist/tex/latex/graphics-def/pdftex.def +File: pdftex.def 2020/10/05 v1.2a Graphics/color driver for pdftex +)) +\structurefile=\read2 +\featurefile=\write3 +\alignfile=\read3 +\sublogofile=\read4 +\exp@rtfile=\write4 +\exp@rt@chimerafile=\write5 + +(/usr/share/texmf-dist/tex/latex/amsfonts/amssymb.sty +Package: amssymb 2013/01/14 v3.01 AMS font symbols + +(/usr/share/texmf-dist/tex/latex/amsfonts/amsfonts.sty +Package: amsfonts 2013/01/14 v3.01 Basic AMSFonts support +\@emptytoks=\toks16 +\symAMSa=\mathgroup4 +\symAMSb=\mathgroup5 +LaTeX Font Info: Redeclaring math symbol \hbar on input line 98. +LaTeX Font Info: Overwriting math alphabet `\mathfrak' in version `bold' +(Font) U/euf/m/n --> U/euf/b/n on input line 106. +)) +\symalphahelix=\mathgroup6 +\loopcount=\count193 +\innerloopcount=\count194 +\outerloopcount=\count195 +\seq@count=\count196 +\killseq@count=\count197 +\seq@percent=\count198 +\res@count=\count199 +\seq@pointer=\count266 +\pos@count=\count267 +\res@perline=\count268 +\end@count=\count269 +\cons@count=\count270 +\total@count=\count271 +\temp@count=\count272 +\triple@count=\count273 +\temp@@count=\count274 +\pos@sum=\count275 +\box@width=\skip49 +\name@width=\skip50 +\box@depth=\skip51 +\width@tmp=\skip52 +\box@height=\skip53 +\number@width=\skip54 +\line@stretch=\skip55 +\center@fill=\skip56 +\arrow@width=\skip57 +\arrow@height=\skip58 +\rule@thick=\skip59 +\arrow@thick=\skip60 +\logo@height=\skip61 +\equal@width=\skip62 +\equal@tmp=\skip63 +\equal@height=\skip64 +\temp@@length=\skip65 +\vspace@legend=\skip66 +\hspace@legend=\skip67 +\bar@length=\skip68 +Package color Info: Redefining color LightGray on input line 1716. +Package color Info: Redefining color LightLightGray on input line 1817. +Package color Info: Redefining color LightLightLightGray on input line 1919. +) +(/usr/share/texmf-dist/tex/latex/l3backend/l3backend-pdftex.def +File: l3backend-pdftex.def 2022-04-14 L3 backend support: PDF output (pdfTeX) +\l__color_backend_stack_int=\count276 +\l__pdf_internal_box=\box50 +) +No file mtDNA.aux. +\openout1 = `mtDNA.aux'. + +LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 24. +LaTeX Font Info: ... okay on input line 24. +LaTeX Font Info: Checking defaults for OMS/cmsy/m/n on input line 24. +LaTeX Font Info: ... okay on input line 24. +LaTeX Font Info: Checking defaults for OT1/cmr/m/n on input line 24. +LaTeX Font Info: ... okay on input line 24. +LaTeX Font Info: Checking defaults for T1/cmr/m/n on input line 24. +LaTeX Font Info: ... okay on input line 24. +LaTeX Font Info: Checking defaults for TS1/cmr/m/n on input line 24. +LaTeX Font Info: ... okay on input line 24. +LaTeX Font Info: Checking defaults for OMX/cmex/m/n on input line 24. +LaTeX Font Info: ... okay on input line 24. +LaTeX Font Info: Checking defaults for U/cmr/m/n on input line 24. +LaTeX Font Info: ... okay on input line 24. +(/usr/share/texmf-dist/tex/context/base/mkii/supp-pdf.mkii +[Loading MPS to PDF converter (version 2006.09.02).] +\scratchcounter=\count277 +\scratchdimen=\dimen139 +\scratchbox=\box51 +\nofMPsegments=\count278 +\nofMParguments=\count279 +\everyMPshowfont=\toks17 +\MPscratchCnt=\count280 +\MPscratchDim=\dimen140 +\MPnumerator=\count281 +\makeMPintoPDFobject=\count282 +\everyMPtoPDFconversion=\toks18 +) (/usr/share/texmf-dist/tex/latex/epstopdf-pkg/epstopdf-base.sty +Package: epstopdf-base 2020-01-24 v2.11 Base part for package epstopdf +Package epstopdf-base Info: Redefining graphics rule for `.eps' on input line 4 +85. + +(/usr/share/texmf-dist/tex/latex/latexconfig/epstopdf-sys.cfg +File: epstopdf-sys.cfg 2010/07/13 v1.3 Configuration of (r)epstopdf for TeX Liv +e +)) +(/tmp/Rtmp9tSm8e/seq57d10064df.fasta: +Runaway argument? +! Paragraph ended before \inf@@get was complete. + + \par +l.25 ...hade}{/tmp/Rtmp9tSm8e/seq57d10064df.fasta} + +I suspect you've forgotten a `}', causing me to apply this +control sequence to too much text. How can we recover? +My plan is to forget the whole thing and hope for the best. + +! Misplaced alignment tab character &. +\msfline ->\par & + & & & @ +l.25 ...hade}{/tmp/Rtmp9tSm8e/seq57d10064df.fasta} + +I can't figure out why you would want to use a tab mark +here. If you just want an ampersand, the remedy is +simple: Just type `I\&' now. But if some right brace +up above has ended a previous alignment prematurely, +you're probably due for more error messages, and you +might try typing `S' now just to see what is salvageable. + +! Misplaced alignment tab character &. +\msfline ->\par & & + & & @ +l.25 ...hade}{/tmp/Rtmp9tSm8e/seq57d10064df.fasta} + +I can't figure out why you would want to use a tab mark +here. If you just want an ampersand, the remedy is +simple: Just type `I\&' now. But if some right brace +up above has ended a previous alignment prematurely, +you're probably due for more error messages, and you +might try typing `S' now just to see what is salvageable. + +! Misplaced alignment tab character &. +\msfline ->\par & & & + & @ +l.25 ...hade}{/tmp/Rtmp9tSm8e/seq57d10064df.fasta} + +I can't figure out why you would want to use a tab mark +here. If you just want an ampersand, the remedy is +simple: Just type `I\&' now. But if some right brace +up above has ended a previous alignment prematurely, +you're probably due for more error messages, and you +might try typing `S' now just to see what is salvageable. + +! Misplaced alignment tab character &. +\msfline ->\par & & & & + @ +l.25 ...hade}{/tmp/Rtmp9tSm8e/seq57d10064df.fasta} + +I can't figure out why you would want to use a tab mark +here. If you just want an ampersand, the remedy is +simple: Just type `I\&' now. But if some right brace +up above has ended a previous alignment prematurely, +you're probably due for more error messages, and you +might try typing `S' now just to see what is salvageable. + +Runaway argument? +! Paragraph ended before \check@letter was complete. + + \par +l.25 ...hade}{/tmp/Rtmp9tSm8e/seq57d10064df.fasta} + +I suspect you've forgotten a `}', causing me to apply this +control sequence to too much text. How can we recover? +My plan is to forget the whole thing and hope for the best. + +Runaway argument? +! Paragraph ended before \firstchar@get was complete. + + \par +l.25 ...hade}{/tmp/Rtmp9tSm8e/seq57d10064df.fasta} + +I suspect you've forgotten a `}', causing me to apply this +control sequence to too much text. How can we recover? +My plan is to forget the whole thing and hope for the best. + +Runaway argument? +! Paragraph ended before \firstchar@get was complete. + + \par +l.47 \end{texshade} + +I suspect you've forgotten a `}', causing me to apply this +control sequence to too much text. How can we recover? +My plan is to forget the whole thing and hope for the best. + +Runaway argument? +! Paragraph ended before \res@get was complete. + + \par +l.47 \end{texshade} + +I suspect you've forgotten a `}', causing me to apply this +control sequence to too much text. How can we recover? +My plan is to forget the whole thing and hope for the best. + +! Misplaced alignment tab character &. +\seq@line ->\par & + @ +l.47 \end{texshade} + +I can't figure out why you would want to use a tab mark +here. If you just want an ampersand, the remedy is +simple: Just type `I\&' now. But if some right brace +up above has ended a previous alignment prematurely, +you're probably due for more error messages, and you +might try typing `S' now just to see what is salvageable. + +. . . . . [1 +Non-PDF special ignored! + papersize=794.96999pt,614.295pt + +{/var/lib/texmf/fonts/map/pdftex/updmap/pdftex.map}] ) +LaTeX Font Info: Trying to load font information for U+msa on input line 47. + + +(/usr/share/texmf-dist/tex/latex/amsfonts/umsa.fd +File: umsa.fd 2013/01/14 v3.01 AMS symbols A +) +LaTeX Font Info: Trying to load font information for U+msb on input line 47. + + +(/usr/share/texmf-dist/tex/latex/amsfonts/umsb.fd +File: umsb.fd 2013/01/14 v3.01 AMS symbols B +) [2] (./mtDNA.aux) ) +Here is how much of TeX's memory you used: + 8179 strings out of 478238 + 97095 string characters out of 5850456 + 1334999 words of memory out of 5000000 + 26433 multiletter control sequences out of 15000+600000 + 471599 words of font info for 39 fonts, out of 8000000 for 9000 + 1141 hyphenation exceptions out of 8191 + 541i,6n,65p,182b,4214s stack positions out of 5000i,500n,10000p,200000b,80000s + +Output written on mtDNA.pdf (2 pages, 86917 bytes). +PDF statistics: + 26 PDF objects out of 1000 (max. 8388607) + 15 compressed objects within 1 object stream + 0 named destinations out of 1000 (max. 500000) + 1 words of extra memory for PDF output out of 10000 (max. 10000000) + diff --git a/mtDNA.pdf b/mtDNA.pdf new file mode 100644 index 0000000..c29f725 Binary files /dev/null and b/mtDNA.pdf differ diff --git a/mtDNA.tex b/mtDNA.tex new file mode 100644 index 0000000..22be093 --- /dev/null +++ b/mtDNA.tex @@ -0,0 +1,48 @@ +\documentclass[10pt]{article} + +\usepackage{texshade} + +\headheight=0pt +\headsep=0pt +\hoffset=0pt +\voffset=0pt +\paperwidth=11in +\paperheight=8.5in +\ifx\pdfoutput\undefined +\relax +\else +\pdfpagewidth=\paperwidth +\pdfpageheight=\paperheight +\fi +\oddsidemargin=-0.9in +\topmargin=-0.7in +\textwidth=10.8in +\textheight=7.9in + +\pagestyle{empty} + +\begin{document} +\begin{texshade}{/tmp/Rtmp9tSm8e/seq57d10064df.fasta} +\seqtype{N} +\shadingmode{identical} +\threshold{50} +\showconsensus[ColdHot]{bottom} +\shadingcolors{blues} +\hidelogoscale +\shownames{left} +\nameseq{1}{Gorilla gorilla gorilla} +\nameseq{2}{Gorilla beringei beringei} +\nameseq{3}{Gorilla beringei graueri} +\nameseq{4}{Puti Orangutan} +\nameseq{5}{Jari Orangutan} +\nameseq{6}{Human} +\nameseq{7}{German Neanderthal} +\nameseq{8}{Russian Neanderthal} +\nameseq{9}{Pan troglodytes troglodytes} +\nameseq{10}{Pan troglodytes schweinfurthii} +\nameseq{11}{Pan troglodytes ellioti} +\nameseq{12}{Pan troglodytes verus} +\shownumbering{right} +\showlegend +\end{texshade} +\end{document}