# Lab 8 for the University of Tulsa's CS-6643 Bioinformatics Course # Pairwise Sequence Alignment # Professor: Dr. McKinney, Fall 2022 # Noah L. Schrick - 1492657 ## Set Working Directory to file directory - RStudio approach setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) #### Part A: EMBOSS pairwise alignment server and influenza ## Load associated supportive libraries if (!require("seqinr")) install.packages("seqinr") library(seqinr) ## Load in the fasta file fasta2vec <- function(fasta.file){ if (!require("seqinr")) install.packages("seqinr") library(seqinr) fasta <- read.fasta(file=fasta.file, as.string= TRUE) fasta.string <- fasta[[1]][1] fasta.list <- strsplit(fasta.string,"") fasta.vec <- unlist(fasta.list) } h1n1.Cali.dna.vec <- fasta2vec("FJ969540.1.fasta") h1n1.Bris.dna.vec <- fasta2vec("CY030230.1.fasta") ## Convert DNA seq into amino acid seq h1n1.Cali.aa.vec<-seqinr::translate(h1n1.Cali.dna.vec) h1n1.Cali.aa.table<-table(h1n1.Cali.aa.vec) # aa count table ## Create dotchart of amino acid freqs # sort the table (smallest to largest) h1n1.Cali.aa.sortedtable <-h1n1.Cali.aa.table[order(h1n1.Cali.aa.table)] # convert the AA table names to 3-letter code # You can ignore the warning or remove the offending letters from # the table names(h1n1.Cali.aa.sortedtable)<-aaa(names(h1n1.Cali.aa.sortedtable)) dotchart(h1n1.Cali.aa.sortedtable) # Repeating for Brisbane and accounting for shift in start codon h1n1.Bris.aa.vec <- seqinr::translate(h1n1.Bris.dna.vec) h1n1.Bris.aa.vec <- h1n1.Bris.aa.vec[-seq(1, match('M', h1n1.Bris.aa.vec)-1)] h1n1.Bris.aa.table<-table(h1n1.Bris.aa.vec) # aa count table h1n1.Bris.aa.sortedtable <-h1n1.Bris.aa.table[order(h1n1.Bris.aa.table)] names(h1n1.Bris.aa.sortedtable)<-aaa(names(h1n1.Bris.aa.sortedtable)) dotchart(h1n1.Bris.aa.sortedtable) paste(h1n1.Bris.aa.vec,collapse="",sep="") #### Part B: Loop Recursion Warmup calc.num.paths <- function(n,m){ path_matrix <- matrix(1, nrow=m+1, ncol=n+1) for (i in seq(2,m+1)){ for (j in seq(2, n+1)){ path_matrix[i,j] <- path_matrix[i-1,j] + path_matrix[i,j-1] } } path_matrix[m+1,n+1] } m <- 5 # row edges n <- 5 # col edges calc.num.paths(n,m) factorial(n+m)/(factorial(n)*factorial(m)) m <- 5 # row edges n <- 6 # col edges calc.num.paths(n,m) factorial(n+m)/(factorial(n)*factorial(m)) m <- 10 # row edges n <- 10 # col edges calc.num.paths(n,m) factorial(n+m)/(factorial(n)*factorial(m)) #### Part C: Dinucleotide Signals calc.sliding.cpg <- function(fastaVec, slideWin){ # allocate memory for odds ratio output cg_dinuc_oddsRatio <- double() n<-length(fastaVec) for (i in 1:(n-slideWin)){ # array slice the vectpr on sliding window size and obtain dinuc appearances dinucs<-count(fastaVec[i:(i+slideWin)],2) # retrieve number of times "CG" appeared cg_dinuc_count <- dinucs["cg"] # get counts of all nucleotides nucs<-count(fastaVec[i:(i+slideWin)],1) # obtain times dinuc pairing appeared per times the indiv nucs appeared cg_dinuc_oddsRatio[i] <- cg_dinuc_count/(nucs["c"]*nucs["g"]) } return(cg_dinuc_oddsRatio) # returns vector } apoe.fasta.vec <- fasta2vec("apoe.fasta") cpg_vec <- calc.sliding.cpg(apoe.fasta.vec,150) plot(cpg_vec,type="l",main="Observed vs Expected CG",xlab="Base Index", ylab="Obs/Exp")