# Homework 4 for the University of Tulsa's CS-7863 Network Theory Course
# Degree Distribution
# Professor: Dr. McKinney, Spring 2022
# Noah Schrick - 1492657
#
# Draws the degree histogram of the yeast network and overlays four curves:
# a guessed power law, a Poisson with the sample mean, a power law whose
# exponent comes from a least-squares fit on the log-log histogram, and a
# power law whose exponent comes from the maximum-likelihood formula.

library(igraph)
library(igraphdata)

data(yeast)
g <- yeast
g.netname <- "Yeast"

################# Set up Work #################
g.vec <- degree(g)
g.hist <- hist(g.vec, freq = FALSE,
               main = paste("Histogram of the", g.netname, "Network"))
legend("topright",
       c("Guess", "Poisson", "Least-Squares Fit", "Max Log-Likelihood"),
       lty = c(1, 2, 3, 4),
       col = c("#40B0A6", "#006CD1", "#E66100", "#D35FB7"))
g.mean <- mean(g.vec)
g.seq <- 0:max(g.vec) # x-axis (k = 0 is a valid Poisson outcome)
# Power laws are undefined at k = 0 (0^(-alpha) is Inf), so they are drawn
# on the strictly positive part of the axis only.
g.seq.pl <- g.seq[g.seq > 0]

################# Guessing Alpha #################
alpha.guess <- 1.5
lines(g.seq.pl, g.seq.pl^(-alpha.guess), col = "#40B0A6", lty = 1, lwd = 3)

################# Poisson #################
g.pois <- dpois(g.seq, g.mean, log = FALSE)
lines(g.seq, g.pois, col = "#006CD1", lty = 2, lwd = 3)

################# Linear model: Least-Squares Fit #################
# hist() returns one more break than densities. Dropping only the first
# break leaves the right edge of every bin, which lines up one-to-one with
# the density vector and is always > 0, so log() is safe.
g.breaks <- g.hist$breaks[-1]
g.probs <- g.hist$density
stopifnot(length(g.breaks) == length(g.probs))
# Empty bins have density 0 and would give log(0) = -Inf in the regression
nz.probs.mask <- g.probs > 0
g.breaks.clean <- g.breaks[nz.probs.mask]
g.probs.clean <- g.probs[nz.probs.mask]
#plot(log(g.breaks.clean), log(g.probs.clean))
g.fit <- lm(log(g.probs.clean) ~ log(g.breaks.clean))
summary(g.fit)
# For p(k) ~ k^(-alpha) the log-log slope is -alpha, so flip the sign
alpha.LM <- -unname(coef(g.fit)[2])
alpha.LM
lines(g.seq.pl, g.seq.pl^(-alpha.LM), col = "#E66100", lty = 3, lwd = 3)

################# Max-Log-Likelihood #################
# alpha = 1 + n / sum(log(k_i / kmin)) is taken over the observed degrees
# k_i >= kmin, not over histogram bin edges.
kmin <- g.breaks.clean[1]
g.tail <- g.vec[g.vec >= kmin]
n <- length(g.tail)
alpha.ML <- 1 + n / sum(log(g.tail / kmin))
alpha.ML
lines(g.seq.pl, g.seq.pl^(-alpha.ML), col = "#D35FB7", lty = 4, lwd = 3)
#### Part B: Loop Recursion Warmup

# Count the monotone lattice paths across a grid that is n edges wide and
# m edges tall (each step moves one edge along a row or a column).
#
# Args:
#   n: number of column edges, a single non-negative number.
#   m: number of row edges, a single non-negative number.
# Returns:
#   The number of paths from one corner to the opposite corner; this equals
#   choose(n + m, n). A grid with n == 0 or m == 0 has exactly one path.
calc.num.paths <- function(n, m) {
  stopifnot(
    is.numeric(n), length(n) == 1, n >= 0,
    is.numeric(m), length(m) == 1, m >= 0
  )
  # First row and first column stay 1: there is one way to reach any node
  # on the border.
  path_matrix <- matrix(1, nrow = m + 1, ncol = n + 1)
  # seq_len() yields an empty sequence for 0, whereas seq(2, 1) counts down
  # and would index outside the matrix.
  for (i in seq_len(m) + 1) {
    for (j in seq_len(n) + 1) {
      # A node is reached either from above or from the left
      path_matrix[i, j] <- path_matrix[i - 1, j] + path_matrix[i, j - 1]
    }
  }
  path_matrix[m + 1, n + 1]
}

#### Part A: EMBOSS pairwise alignment server and influenza

# Read the first record of a FASTA file as a vector of single characters.
#
# Args:
#   fasta.file: path to a FASTA file.
# Returns:
#   A character vector with one element per character of the first sequence
#   in the file, as read by seqinr::read.fasta().
fasta2vec <- function(fasta.file) {
  # Install seqinr on first use, as before, but test for it without
  # attaching the package from inside a function.
  if (!requireNamespace("seqinr", quietly = TRUE)) {
    install.packages("seqinr")
  }
  fasta <- seqinr::read.fasta(file = fasta.file, as.string = TRUE)
  fasta.string <- fasta[[1]][1]
  # Return visibly; ending on an assignment made the result invisible
  unlist(strsplit(fasta.string, ""))
}
#### Part C: Dinucleotide Signals

# Slide a window along a sequence and compute, for every window, the count
# of the "cg" dinucleotide divided by the product of the "c" and "g" counts.
#
# Args:
#   fastaVec: character vector with one base per element. The counts are
#     looked up under the lowercase names "cg", "c" and "g".
#   slideWin: window offset; each window spans fastaVec[i:(i + slideWin)],
#     i.e. slideWin + 1 consecutive bases.
# Returns:
#   A numeric vector with one ratio per window start, of length
#   length(fastaVec) - slideWin, or numeric(0) when the sequence is too
#   short to hold a single window. The ratio is a plain count ratio; it is
#   not rescaled by the window length. A window without any "c" or without
#   any "g" divides by zero and yields NaN or Inf.
calc.sliding.cpg <- function(fastaVec, slideWin) {
  n_windows <- length(fastaVec) - slideWin
  # 1:(n - slideWin) would count downwards here and index out of range
  if (n_windows < 1) {
    return(numeric(0))
  }
  # Allocate the full output once instead of growing it inside the loop
  cg_dinuc_oddsRatio <- numeric(n_windows)
  for (i in seq_len(n_windows)) {
    window <- fastaVec[i:(i + slideWin)]
    # Dinucleotide and single-nucleotide counts for this window
    dinucs <- seqinr::count(window, 2)
    nucs <- seqinr::count(window, 1)
    # Times the "cg" pair appeared per times the individual bases appeared
    cg_dinuc_oddsRatio[i] <- dinucs["cg"] / (nucs["c"] * nucs["g"])
  }
  cg_dinuc_oddsRatio
}
calc.sliding.cpg(apoe.fasta.vec,150) +plot(cpg_vec,type="l",main="Observed vs Expected CG",xlab="Base Index", ylab="Obs/Exp") diff --git a/Schrick-Noah_CS-6643_Lab-8.doc b/Schrick-Noah_CS-6643_Lab-8.doc index 548acc2..e2581ed 100644 Binary files a/Schrick-Noah_CS-6643_Lab-8.doc and b/Schrick-Noah_CS-6643_Lab-8.doc differ diff --git a/Schrick-Noah_CS-6643_Lab-8.pdf b/Schrick-Noah_CS-6643_Lab-8.pdf new file mode 100644 index 0000000..0ee1b7d Binary files /dev/null and b/Schrick-Noah_CS-6643_Lab-8.pdf differ diff --git a/apoe.fasta b/apoe.fasta new file mode 100644 index 0000000..82b1eaa --- /dev/null +++ b/apoe.fasta @@ -0,0 +1,54 @@ +>NC_000019.10:44905796-44909393 Homo sapiens chromosome 19, GRCh38.p14 Primary Assembly +CTACTCAGCCCCAGCGGAGGTGAAGGACGTCCTTCCCCAGGAGCCGGTGAGAAGCGCAGTCGGGGGCACG +GGGATGAGCTCAGGGGCCTCTAGAAAGAGCTGGGACCCTGGGAACCCCTGGCCTCCAGGTAGTCTCAGGA +GAGCTACTCGGGGTCGGGCTTGGGGAGAGGAGGAGCGGGGGTGAGGCAAGCAGCAGGGGACTGGACCTGG +GAAGGGCTGGGCAGCAGAGACGACCCGACCCGCTAGAAGGTGGGGTGGGGAGAGCAGCTGGACTGGGATG +TAAGCCATAGCAGGACTCCACGAGTTGTCACTATCATTTATCGAGCACCTACTGGGTGTCCCCAGTGTCC +TCAGATCTCCATAACTGGGGAGCCAGGGGCAGCGACACGGTAGCTAGCCGTCGATTGGAGAACTTTAAAA +TGAGGACTGAATTAGCTCATAAATGGAACACGGCGCTTAACTGTGAGGTTGGAGCTTAGAATGTGAAGGG +AGAATGAGGAATGCGAGACTGGGACTGAGATGGAACCGGCGGTGGGGAGGGGGTGGGGGGATGGAATTTG +AACCCCGGGAGAGGAAGATGGAATTTTCTATGGAGGCCGACCTGGGGATGGGGAGATAAGAGAAGACCAG +GAGGGAGTTAAATAGGGAATGGGTTGGGGGCGGCTTGGTAAATGTGCTGGGATTAGGCTGTTGCAGATAA +TGCAACAAGGCTTGGAAGGCTAACCTGGGGTGAGGCCGGGTTGGGGCCGGGCTGGGGGTGGGAGGAGTCC +TCACTGGCGGTTGATTGACAGTTTCTCCTTCCCCAGACTGGCCAATCACAGGCAGGAAGATGAAGGTTCT +GTGGGCTGCGTTGCTGGTCACATTCCTGGCAGGTATGGGGGCGGGGCTTGCTCGGTTCCCCCCGCTCCTC +CCCCTCTCATCCTCACCTCAACCTCCTGGCCCCATTCAGGCAGACCCTGGGCCCCCTCTTCTGAGGCTTC +TGTGCTGCTTCCTGGCTCTGAACAGCGATTTGACGCTCTCTGGGCCTCGGTTTCCCCCATCCTTGAGATA +GGAGTTAGAAGTTGTTTTGTTGTTGTTGTTTGTTGTTGTTGTTTTGTTTTTTTGAGATGAAGTCTCGCTC +TGTCGCCCAGGCTGGAGTGCAGTGGCGGGATCTCGGCTCACTGCAAGCTCCGCCTCCCAGGTCCACGCCA 
+TTCTCCTGCCTCAGCCTCCCAAGTAGCTGGGACTACAGGCACATGCCACCACACCCGACTAACTTTTTTG +TATTTTCAGTAGAGACGGGGTTTCACCATGTTGGCCAGGCTGGTCTGGAACTCCTGACCTCAGGTGATCT +GCCCGTTTCGATCTCCCAAAGTGCTGGGATTACAGGCGTGAGCCACCGCACCTGGCTGGGAGTTAGAGGT +TTCTAATGCATTGCAGGCAGATAGTGAATACCAGACACGGGGCAGCTGTGATCTTTATTCTCCATCACCC +CCACACAGCCCTGCCTGGGGCACACAAGGACACTCAATACATGCTTTTCCGCTGGGCGCGGTGGCTCACC +CCTGTAATCCCAGCACTTTGGGAGGCCAAGGTGGGAGGATCACTTGAGCCCAGGAGTTCAACACCAGCCT +GGGCAACATAGTGAGACCCTGTCTCTACTAAAAATACAAAAATTAGCCAGGCATGGTGCCACACACCTGT +GCTCTCAGCTACTCAGGAGGCTGAGGCAGGAGGATCGCTTGAGCCCAGAAGGTCAAGGTTGCAGTGAACC +ATGTTCAGGCCGCTGCACTCCAGCCTGGGTGACAGAGCAAGACCCTGTTTATAAATACATAATGCTTTCC +AAGTGATTAAACCGACTCCCCCCTCACCCTGCCCACCATGGCTCCAAAGAAGCATTTGTGGAGCACCTTC +TGTGTGCCCCTAGGTACTAGATGCCTGGACGGGGTCAGAAGGACCCTGACCCACCTTGAACTTGTTCCAC +ACAGGATGCCAGGCCAAGGTGGAGCAAGCGGTGGAGACAGAGCCGGAGCCCGAGCTGCGCCAGCAGACCG +AGTGGCAGAGCGGCCAGCGCTGGGAACTGGCACTGGGTCGCTTTTGGGATTACCTGCGCTGGGTGCAGAC +ACTGTCTGAGCAGGTGCAGGAGGAGCTGCTCAGCTCCCAGGTCACCCAGGAACTGAGGTGAGTGTCCCCA +TCCTGGCCCTTGACCCTCCTGGTGGGCGGCTATACCTCCCCAGGTCCAGGTTTCATTCTGCCCCTGTCGC +TAAGTCTTGGGGGGCCTGGGTCTCTGCTGGTTCTAGCTTCCTCTTCCCATTTCTGACTCCTGGCTTTAGC +TCTCTGGAATTCTCTCTCTCAGCTTTGTCTCTCTCTCTTCCCTTCTGACTCAGTCTCTCACACTCGTCCT +GGCTCTGTCTCTGTCCTTCCCTAGCTCTTTTATATAGAGACAGAGAGATGGGGTCTCACTGTGTTGCCCA +GGCTGGTCTTGAACTTCTGGGCTCAAGCGATCCTCCCGCCTCGGCCTCCCAAAGTGCTGGGATTAGAGGC +ATGAGCCACCTTGCCCGGCCTCCTAGCTCCTTCTTCGTCTCTGCCTCTGCCCTCTGCATCTGCTCTCTGC +ATCTGTCTCTGTCTCCTTCTCTCGGCCTCTGCCCCGTTCCTTCTCTCCCTCTTGGGTCTCTCTGGCTCAT +CCCCATCTCGCCCGCCCCATCCCAGCCCTTCTCCCCGCCTCCCACTGTGCGACACCCTCCCGCCCTCTCG +GCCGCAGGGCGCTGATGGACGAGACCATGAAGGAGTTGAAGGCCTACAAATCGGAACTGGAGGAACAACT +GACCCCGGTGGCGGAGGAGACGCGGGCACGGCTGTCCAAGGAGCTGCAGGCGGCGCAGGCCCGGCTGGGC +GCGGACATGGAGGACGTGTGCGGCCGCCTGGTGCAGTACCGCGGCGAGGTGCAGGCCATGCTCGGCCAGA +GCACCGAGGAGCTGCGGGTGCGCCTCGCCTCCCACCTGCGCAAGCTGCGTAAGCGGCTCCTCCGCGATGC +CGATGACCTGCAGAAGCGCCTGGCAGTGTACCAGGCCGGGGCCCGCGAGGGCGCCGAGCGCGGCCTCAGC 
+GCCATCCGCGAGCGCCTGGGGCCCCTGGTGGAACAGGGCCGCGTGCGGGCCGCCACTGTGGGCTCCCTGG +CCGGCCAGCCGCTACAGGAGCGGGCCCAGGCCTGGGGCGAGCGGCTGCGCGCGCGGATGGAGGAGATGGG +CAGCCGGACCCGCGACCGCCTGGACGAGGTGAAGGAGCAGGTGGCGGAGGTGCGCGCCAAGCTGGAGGAG +CAGGCCCAGCAGATACGCCTGCAGGCCGAGGCCTTCCAGGCCCGCCTCAAGAGCTGGTTCGAGCCCCTGG +TGGAAGACATGCAGCGCCAGTGGGCCGGGCTGGTGGAGAAGGTGCAGGCTGCCGTGGGCACCAGCGCCGC +CCCTGTGCCCAGCGACAATCACTGAACGCCGAAGCCTGCAGCCATGCGACCCCACGCCACCCCGTGCCTC +CTGCCTCCGCGCAGCCTGCAGCGGGAGACCCTGTCCCCGCCCCAGCCGTCCTCCTGGGGTGGACCCTAGT +TTAATAAAGATTCACCAAGTTTCACGCA +