# Lab 8 for the University of Tulsa's CS-6643 Bioinformatics Course
# Pairwise Sequence Alignment
# Professor: Dr. McKinney, Fall 2022
# Noah L. Schrick - 1492657

## Set Working Directory to file directory - RStudio approach
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

#### Part A: EMBOSS pairwise alignment server and influenza
## Load associated supportive libraries
if (!require("seqinr")) install.packages("seqinr")
library(seqinr)

## Load in the fasta file
fasta2vec <- function(fasta.file){
  if (!require("seqinr")) install.packages("seqinr")
  library(seqinr)
  fasta <- read.fasta(file=fasta.file, as.string= TRUE)
  fasta.string <- fasta[[1]][1]
  fasta.list <- strsplit(fasta.string,"")
  fasta.vec <- unlist(fasta.list)
}

h1n1.Cali.dna.vec <- fasta2vec("FJ969540.1.fasta")
h1n1.Bris.dna.vec <- fasta2vec("CY030230.1.fasta")

## Convert DNA seq into amino acid seq
h1n1.Cali.aa.vec<-seqinr::translate(h1n1.Cali.dna.vec)
h1n1.Cali.aa.table<-table(h1n1.Cali.aa.vec) # aa count table

## Create dotchart of amino acid freqs
# sort the table (smallest to largest)
h1n1.Cali.aa.sortedtable <-h1n1.Cali.aa.table[order(h1n1.Cali.aa.table)]
# convert the AA table names to 3-letter code
# You can ignore the warning or remove the offending letters from
# the table
names(h1n1.Cali.aa.sortedtable)<-aaa(names(h1n1.Cali.aa.sortedtable)) 
dotchart(h1n1.Cali.aa.sortedtable)

# Repeating for Brisbane and accounting for shift in start codon
h1n1.Bris.aa.vec <- seqinr::translate(h1n1.Bris.dna.vec)
h1n1.Bris.aa.vec <- h1n1.Bris.aa.vec[-seq(1, match('M', h1n1.Bris.aa.vec)-1)]

h1n1.Bris.aa.table<-table(h1n1.Bris.aa.vec) # aa count table
h1n1.Bris.aa.sortedtable <-h1n1.Bris.aa.table[order(h1n1.Bris.aa.table)]

names(h1n1.Bris.aa.sortedtable)<-aaa(names(h1n1.Bris.aa.sortedtable)) 
dotchart(h1n1.Bris.aa.sortedtable)
paste(h1n1.Bris.aa.vec,collapse="",sep="")


#### Part B: Loop Recursion Warmup
calc.num.paths <- function(n,m){
  path_matrix <- matrix(1, nrow=m+1, ncol=n+1)
  for (i in seq(2,m+1)){
    for (j in seq(2, n+1)){
      path_matrix[i,j] <- path_matrix[i-1,j] + path_matrix[i,j-1]
    }
  }
  path_matrix[m+1,n+1]
}

m <- 5 # row edges
n <- 5 # col edges
calc.num.paths(n,m)
factorial(n+m)/(factorial(n)*factorial(m))

m <- 5 # row edges
n <- 6 # col edges
calc.num.paths(n,m)
factorial(n+m)/(factorial(n)*factorial(m))

m <- 10 # row edges
n <- 10 # col edges
calc.num.paths(n,m)
factorial(n+m)/(factorial(n)*factorial(m))

#### Part C: Dinucleotide Signals
calc.sliding.cpg <- function(fastaVec, slideWin){
  # allocate memory for odds ratio output 
  cg_dinuc_oddsRatio <- double() 
  n<-length(fastaVec)
  for (i in 1:(n-slideWin)){
    # array slice the vectpr on sliding window size and obtain dinuc appearances
    dinucs<-count(fastaVec[i:(i+slideWin)],2)
    # retrieve number of times "CG" appeared
    cg_dinuc_count <- dinucs["cg"]
    # get counts of all nucleotides
    nucs<-count(fastaVec[i:(i+slideWin)],1)
    # obtain times dinuc pairing appeared per times the indiv nucs appeared
    cg_dinuc_oddsRatio[i] <- 		
      cg_dinuc_count/(nucs["c"]*nucs["g"])
  }
  return(cg_dinuc_oddsRatio) # returns vector
}

apoe.fasta.vec <- fasta2vec("apoe.fasta")
cpg_vec <- calc.sliding.cpg(apoe.fasta.vec,150)
plot(cpg_vec,type="l",main="Observed vs Expected CG",xlab="Base Index", ylab="Obs/Exp")