Finalizing EMBOSS pairwise alignment with influenza

This commit is contained in:
Noah L. Schrick 2022-11-09 17:36:42 -06:00
parent d37c6cf3f9
commit 15c29fa1ef
4 changed files with 112 additions and 0 deletions

31
CY030230.1.fasta Normal file
View File

@ -0,0 +1,31 @@
>ENA|CY030230|CY030230.1 Influenza A virus (A/Brisbane/59/2007(H1N1)) segment 4 sequence.
AGCAAAAGCAGGGGATAATAAAAACAACCAGAATGAAAGTAAAACTACTGGTCCTGTTAT
GCACATTTACAGCTACATATGCAGACACAATATGTATAGGCTACCATGCTAACAACTCGA
CCGACACTGTTGACACAGTACTTGAAAAGAATGTGACAGTGACACACTCTGTCAACCTGC
TTGAGAACAGTCACAATGGAAAACTATGTCTATTAAAAGGAATAGCCCCACTACAATTGG
GTAATTGCAGCGTTGCCGGGTGGATCTTAGGAAACCCAGAATGCGAATTACTGATTTCCA
AGGAGTCATGGTCCTACATTGTAGAAAAACCAAATCCTGAGAATGGAACATGTTACCCAG
GGCATTTCGCTGACTATGAGGAACTGAGGGAGCAATTGAGTTCAGTATCTTCATTTGAGA
GGTTCGAAATATTCCCCAAAGAAAGCTCATGGCCCAACCACACCGTAACCGGAGTGTCAG
CATCATGCTCCCATAATGGGGAAAGCAGTTTTTACAGAAATTTGCTATGGCTGACGGGGA
AGAATGGTTTGTACCCAAACCTGAGCAAGTCCTATGCAAACAACAAAGAAAAAGAAGTCC
TTGTACTATGGGGTGTTCATCACCCGCCAAACATAGGTGACCAAAAGGCCCTCTATCATA
CAGAAAATGCTTATGTCTCTGTAGTGTCTTCACATTATAGCAGAAAATTCACCCCAGAAA
TAGCCAAAAGACCCAAAGTAAGAGATCAAGAAGGAAGAATCAATTACTACTGGACTCTGC
TTGAACCCGGGGATACAATAATATTTGAGGCAAATGGAAATCTAATAGCGCCAAGATATG
CTTTCGCACTGAGTAGAGGCTTTGGATCAGGAATCATCAACTCAAATGCACCAATGGATA
AATGTGATGCGAAGTGCCAAACACCTCAGGGAGCTATAAACAGCAGTCTTCCTTTCCAGA
ACGTACACCCAGTCACAATAGGAGAGTGTCCAAAGTATGTCAGGAGTGCAAAATTAAGGA
TGGTTACAGGACTAAGGAACATCCCATCCATTCAATCCAGAGGTTTGTTTGGAGCCATTG
CCGGTTTCATTGAAGGGGGGTGGACTGGAATGGTAGATGGTTGGTATGGTTATCATCATC
AGAATGAGCAAGGATCTGGCTATGCTGCAGATCAAAAAAGCACACAAAATGCCATTAATG
GGATTACAAACAAGGTGAATTCTGTAATTGAGAAAATGAACACTCAATTCACAGCAGTGG
GCAAAGAATTCAACAAATTGGAAAGAAGGATGGAAAACTTGAATAAAAAAGTTGATGATG
GGTTTATAGACATTTGGACATATAATGCAGAACTGTTGGTTCTACTGGAAAATGAAAGGA
CTTTGGATTTCCATGACTCCAATGTGAAGAATCTGTATGAGAAAGTAAAAAGCCAGTTAA
AGAATAATGCTAAAGAAATAGGAAATGGGTGTTTTGAATTCTATCACAAGTGTAACGATG
AATGCATGGAGAGTGTAAAGAATGGAACTTATGACTATCCAAAATATTCCGAAGAATCAA
AGTTAAACAGGGAGAAAATTGATGGAGTGAAATTGGAATCAATGGGAGTCTATCAGATTC
TGGCGATCTACTCAACAGTCGCCAGTTCTCTGGTTCTTTTGGTCTCCCTGGGGGCAATCA
GCTTCTGGATGTGTTCCAATGGGTCTTTACAGTGTAGAATATGCATCTAAGACCAGAATT
TCAGAAATATAAGGAAAAACA

30
FJ969540.1.fasta Normal file
View File

@ -0,0 +1,30 @@
>ENA|FJ969540|FJ969540.1 Influenza A virus (A/California/07/2009(H1N1)) segment 4 hemagglutinin (HA) gene, complete cds.
ATGAAGGCAATACTAGTAGTTCTGCTATATACATTTGCAACCGCAAATGCAGACACATTA
TGTATAGGTTATCATGCGAACAATTCAACAGACACTGTAGACACAGTACTAGAAAAGAAT
GTAACAGTAACACACTCTGTTAACCTTCTAGAAGACAAGCATAACGGGAAACTATGCAAA
CTAAGAGGGGTAGCCCCATTGCATTTGGGTAAATGTAACATTGCTGGCTGGATCCTGGGA
AATCCAGAGTGTGAATCACTCTCCACAGCAAGCTCATGGTCCTACATTGTGGAAACACCT
AGTTCAGACAATGGAACGTGTTACCCAGGAGATTTCATCGATTATGAGGAGCTAAGAGAG
CAATTGAGCTCAGTGTCATCATTTGAAAGGTTTGAGATATTCCCCAAGACAAGTTCATGG
CCCAATCATGACTCGAACAAAGGTGTAACGGCAGCATGTCCTCATGCTGGAGCAAAAAGC
TTCTACAAAAATTTAATATGGCTAGTTAAAAAAGGAAATTCATACCCAAAGCTCAGCAAA
TCCTACATTAATGATAAAGGGAAAGAAGTCCTCGTGCTATGGGGCATTCACCATCCATCT
ACTAGTGCTGACCAACAAAGTCTCTATCAGAATGCAGATGCATATGTTTTTGTGGGGTCA
TCAAGATACAGCAAGAAGTTCAAGCCGGAAATAGCAATAAGACCCAAAGTGAGGGRTCRA
GAAGGGAGAATGAACTATTACTGGACACTAGTAGAGCCGGGAGACAAAATAACATTCGAA
GCAACTGGAAATCTAGTGGTACCGAGATATGCATTCGCAATGGAAAGAAATGCTGGATCT
GGTATTATCATTTCAGATACACCAGTCCACGATTGCAATACAACTTGTCAAACACCCAAG
GGTGCTATAAACACCAGCCTCCCATTTCAGAATATACATCCGATCACAATTGGAAAATGT
CCAAAATATGTAAAAAGCACAAAATTGAGACTGGCCACAGGATTGAGGAATATCCCGTCT
ATTCAATCTAGAGGCCTATTTGGGGCCATTGCCGGTTTCATTGAAGGGGGGTGGACAGGG
ATGGTAGATGGATGGTACGGTTATCACCATCAAAATGAGCAGGGGTCAGGATATGCAGCC
GACCTGAAGAGCACACAGAATGCCATTGACGAGATTACTAACAAAGTAAATTCTGTTATT
GAAAAGATGAATACACAGTTCACAGCAGTAGGTAAAGAGTTCAACCACCTGGAAAAAAGA
ATAGAGAATTTAAATAAAAAAGTTGATGATGGTTTCCTGGACATTTGGACTTACAATGCC
GAACTGTTGGTTCTATTGGAAAATGAAAGAACTTTGGACTACCACGATTCAAATGTGAAG
AACTTATATGAAAAGGTAAGAAGCCAGCTAAAAAACAATGCCAAGGAAATTGGAAACGGC
TGCTTTGAATTTTACCACAAATGCGATAACACGTGCATGGAAAGTGTCAAAAATGGGACT
TATGACTACCCAAAATACTCAGAGGAAGCAAAATTAAACAGAGAAGAAATAGATGGGGTA
AAGCTGGAATCAACAAGGATTTACCAGATTTTGGCGATCTATTCAACTGTCGCCAGTTCA
TTGGTACTGGTAGTCTCCCTGGGGGCAATCAGTTTCTGGATGTGCTCTAATGGGTCTCTA
CAGTGTAGAATATGTATTTAA

View File

@ -0,0 +1,51 @@
# Lab 8 for the University of Tulsa's CS-6643 Bioinformatics Course
# Pairwise Sequence Alignment
# Professor: Dr. McKinney, Fall 2022
# Noah L. Schrick - 1492657
## Set working directory to this file's directory so the relative FASTA
## paths used below resolve - RStudio approach.
## rstudioapi only works inside a running RStudio session; when the script
## is run any other way (e.g. Rscript) the call would error, so it is
## guarded and the current working directory is kept instead.
if (requireNamespace("rstudioapi", quietly = TRUE) &&
    rstudioapi::isAvailable()) {
  script.path <- rstudioapi::getActiveDocumentContext()$path
  # An unsaved document reports an empty path; setwd("") would fail.
  if (nzchar(script.path)) {
    setwd(dirname(script.path))
  }
}
#### Part A: EMBOSS pairwise alignment server and influenza
## Load associated supportive libraries
if (!require("seqinr")) install.packages("seqinr")
library(seqinr)
## Load in the fasta file
#' Read the first record of a FASTA file as a vector of single characters.
#'
#' Only the first sequence in the file is used; any further records are
#' ignored.
#'
#' @param fasta.file Path of the FASTA file, passed to `seqinr::read.fasta()`.
#' @return A character vector with one element per character of the first
#'   sequence, as returned by `seqinr::read.fasta()` (which lower-cases DNA
#'   by default).
fasta2vec <- function(fasta.file) {
  # Best-effort install as before, but without attaching the package to the
  # caller's search path from inside a helper.
  if (!requireNamespace("seqinr", quietly = TRUE)) {
    install.packages("seqinr")
  }
  fasta <- seqinr::read.fasta(file = fasta.file, as.string = TRUE)
  # `[1]` drops the seqinr attributes and leaves a plain length-1 string.
  fasta.string <- fasta[[1]][1]
  fasta.vec <- unlist(strsplit(fasta.string, ""))
  # Return the value explicitly (the original returned it invisibly via a
  # trailing assignment).
  fasta.vec
}
## Read both H1N1 segment 4 nucleotide sequences, one base per element
h1n1.Cali.dna.vec <- fasta2vec(fasta.file = "FJ969540.1.fasta")
h1n1.Bris.dna.vec <- fasta2vec(fasta.file = "CY030230.1.fasta")

## California: translate the nucleotides (default reading frame) and count
## how often each amino acid occurs
h1n1.Cali.aa.vec <- seqinr::translate(seq = h1n1.Cali.dna.vec)
h1n1.Cali.aa.table <- table(h1n1.Cali.aa.vec)

## Dotchart of the amino acid counts, ordered from rarest to most common
h1n1.Cali.aa.sortedtable <- sort(h1n1.Cali.aa.table)
# Relabel with 3-letter codes. aaa() warns about one-letter symbols it
# cannot map; the warning can be ignored, or the offending letters can be
# removed from the table first.
names(h1n1.Cali.aa.sortedtable) <-
  seqinr::aaa(names(h1n1.Cali.aa.sortedtable))
dotchart(h1n1.Cali.aa.sortedtable)
# Repeating for Brisbane and accounting for the shift in start codon.
# CY030230.1 is the whole segment rather than just the coding sequence: its
# first ATG is at nucleotide 33, i.e. after a 32-base leader that is not a
# multiple of three. A frame-0 translation therefore never reads that ATG
# as a codon, and trimming the frame-0 protein to its first "M" gives the
# wrong protein. Instead, find the first ATG in the nucleotide sequence and
# translate from there (assumes the first ATG is the start codon).
h1n1.Bris.cds.start <- as.integer(regexpr(
  "atg",
  paste(h1n1.Bris.dna.vec, collapse = ""),
  ignore.case = TRUE
))
if (h1n1.Bris.cds.start < 1L) {
  stop("No ATG start codon found in the Brisbane sequence", call. = FALSE)
}
h1n1.Bris.aa.vec <- seqinr::translate(
  h1n1.Bris.dna.vec[h1n1.Bris.cds.start:length(h1n1.Bris.dna.vec)]
)
# Drop whatever was translated from the bases after the first stop codon.
# The stop symbol itself is kept so the result has the same shape as the
# California translation, whose sequence ends in a stop codon.
h1n1.Bris.stop <- match("*", h1n1.Bris.aa.vec)
if (!is.na(h1n1.Bris.stop)) {
  h1n1.Bris.aa.vec <- h1n1.Bris.aa.vec[seq_len(h1n1.Bris.stop)]
}
h1n1.Bris.aa.table <- table(h1n1.Bris.aa.vec) # aa count table
# sort the table (smallest to largest) and relabel with 3-letter codes
h1n1.Bris.aa.sortedtable <- h1n1.Bris.aa.table[order(h1n1.Bris.aa.table)]
names(h1n1.Bris.aa.sortedtable) <- aaa(names(h1n1.Bris.aa.sortedtable))
dotchart(h1n1.Bris.aa.sortedtable)
# Print the protein as a single string (e.g. to paste into the EMBOSS
# pairwise alignment server)
paste(h1n1.Bris.aa.vec, collapse = "")
#

Binary file not shown.