# Lab 3 for the University of Tulsa's CS-6643 Bioinformatics Course
# Expression Exploratory Analysis
# Professor: Dr. McKinney, Fall 2022
# Noah L. Schrick - 1492657

## Set Working Directory to file directory - RStudio approach
# Only attempted inside an RStudio session with a saved document. Anywhere
# else (e.g. Rscript) the working directory is left untouched so the script
# does not abort before any data is loaded.
if (requireNamespace("rstudioapi", quietly = TRUE) &&
    rstudioapi::isAvailable()) {
  script_path <- rstudioapi::getActiveDocumentContext()$path
  # An unsaved document reports an empty path; skip setwd() in that case.
  if (nzchar(script_path)) {
    setwd(dirname(script_path))
  }
}

#### Part A: Loading Data
## 1: Loading Gene Expression Data
# load() restores the objects stored in the .Rdata file into the workspace;
# the file is expected to provide `sense.filtered.cpm`, used below.
load("sense.filtered.cpm.Rdata")
dim(sense.filtered.cpm)
colnames(sense.filtered.cpm)

## 2: Demographic Data
# Loading
subject.attrs <- read.csv("Demographic_symptom.csv", stringsAsFactors = FALSE)
dim(subject.attrs) # 160 subjects x 40 attributes
colnames(subject.attrs) # interested in X (sample ids) and Diag (diagnosis)
subject.attrs$X
subject.attrs$Diag

# Matching gene expression samples with their diagnosis
# requireNamespace() only checks that dplyr is installed; library() then
# attaches it and errors loudly if the install did not succeed.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)

# create a phenotype vector
# grab X (subject ids) and Diag (Diagnosis) from subject.attrs that
# intersect %in% with the RNA-Seq data
expr_ids <- colnames(sense.filtered.cpm)

phenos.df <- subject.attrs %>%
  filter(X %in% expr_ids) %>%
  dplyr::select(X, Diag) %>%
  # filter() keeps the CSV row order; reorder the rows to follow the column
  # order of the expression data so phenotype i describes expression column i.
  arrange(match(X, expr_ids))
colnames(phenos.df) # $Diag is mdd diagnosis

# Expression samples with no demographic row would leave the phenotype
# vector shorter than the number of expression columns; make that visible.
unmatched_ids <- setdiff(expr_ids, phenos.df$X)
if (length(unmatched_ids) > 0) {
  warning(
    length(unmatched_ids),
    " expression sample(s) have no row in Demographic_symptom.csv: ",
    paste(unmatched_ids, collapse = ", "),
    call. = FALSE
  )
}

# grab Diag column and convert character to factor
mddPheno <- as.factor(phenos.df$Diag) # this is our phenotype/class vector
summary(mddPheno) # MDD -- major depressive disorder, HC -- healthy control