From abe72f45da34f807b3b55b1b3865867908a3483e Mon Sep 17 00:00:00 2001
From: noah
Date: Thu, 13 Oct 2022 15:08:33 -0500
Subject: [PATCH] Data pre-processing

---
 .~lock.lab5_expression3.docx# |  1 +
 Schrick-Noah_CS-6643_Lab-5.R  | 79 +++++++++++++++++++++++++++++++++++
 2 files changed, 80 insertions(+)
 create mode 100644 .~lock.lab5_expression3.docx#
 create mode 100644 Schrick-Noah_CS-6643_Lab-5.R

diff --git a/.~lock.lab5_expression3.docx# b/.~lock.lab5_expression3.docx#
new file mode 100644
index 0000000..9c2cc5e
--- /dev/null
+++ b/.~lock.lab5_expression3.docx#
@@ -0,0 +1 @@
+,noah,NovaArchSys,13.10.2022 15:00,file:///home/noah/.config/libreoffice/4;
\ No newline at end of file
diff --git a/Schrick-Noah_CS-6643_Lab-5.R b/Schrick-Noah_CS-6643_Lab-5.R
new file mode 100644
index 0000000..35eb43f
--- /dev/null
+++ b/Schrick-Noah_CS-6643_Lab-5.R
@@ -0,0 +1,79 @@
+# Lab 5 for the University of Tulsa's CS-6643 Bioinformatics Course
+# Gene Expression Statistical Learning
+# Professor: Dr. McKinney, Fall 2022
+# Noah L. Schrick - 1492657
+
+## Set working directory to this file's directory - RStudio approach.
+## Guarded so the script also runs outside RStudio (e.g. via Rscript);
+## in that case the data files must be in the current working directory.
+if (requireNamespace("rstudioapi", quietly = TRUE) &&
+    rstudioapi::isAvailable()) {
+  script.path <- rstudioapi::getActiveDocumentContext()$path
+  # path is "" for an unsaved document or the console; skip setwd then
+  if (nzchar(script.path)) setwd(dirname(script.path))
+}
+
+#### 0: Process and filter data
+# load gene expression data; this must define sense.filtered.cpm
+# (genes in rows, subject ids as column names)
+load("sense.filtered.cpm.Rdata")
+
+# load phenotype (mdd/hc) data
+subject.attrs <- read.csv("Demographic_symptom.csv",
+                          stringsAsFactors = FALSE)
+
+# dplyr is on CRAN: install it on first use, then attach it
+if (!requireNamespace("dplyr", quietly = TRUE)) install.packages("dplyr")
+library(dplyr)
+
+# Subjects present in both tables, in expression-matrix column order.
+shared.ids <- intersect(colnames(sense.filtered.cpm),
+                        as.character(subject.attrs$X))
+stopifnot(length(shared.ids) > 0)
+expr.cpm <- as.matrix(sense.filtered.cpm[, shared.ids, drop = FALSE])
+
+# grab X (subject ids) and Diag (Diagnosis) for the shared subjects.
+# match() puts the rows in the same order as the expression columns;
+# filtering with %in% alone would keep the CSV's row order and could
+# attach diagnoses to the wrong samples.
+phenos.df <- subject.attrs %>%
+  dplyr::select(X, Diag) %>%
+  slice(match(shared.ids, as.character(X)))
+mddPheno <- as.factor(phenos.df$Diag)
+
+# Normalize and transform
+# preprocessCore is distributed through Bioconductor, not CRAN, so
+# install.packages("preprocessCore") cannot install it.
+if (!requireNamespace("preprocessCore", quietly = TRUE)) {
+  stop("Package 'preprocessCore' is required; install it from ",
+       "Bioconductor, e.g. BiocManager::install(\"preprocessCore\").",
+       call. = FALSE)
+}
+library(preprocessCore)
+mddExprData_quantile <- normalize.quantiles(expr.cpm)
+mddExprData_quantileLog2 <- log2(mddExprData_quantile)
+# attach phenotype names and gene names to data
+colnames(mddExprData_quantileLog2) <- as.character(mddPheno)
+rownames(mddExprData_quantileLog2) <- rownames(expr.cpm)
+
+# coefficient of variation filter sd(x)/abs(mean(x))
+sd_values <- apply(mddExprData_quantileLog2, 1, sd)
+mean_values <- apply(mddExprData_quantileLog2, 1, mean)
+CoV_values <- sd_values / abs(mean_values)
+# smaller threshold, the higher the experimental effect relative to the
+# measurement precision
+cov.threshold <- 0.045
+sum(CoV_values < cov.threshold, na.rm = TRUE)
+# the original run found one gene with 0 variation -- list and remove
+rownames(mddExprData_quantileLog2)[which(sd_values == 0)]
+# filter the data matrix; which() drops genes whose CoV is NA/NaN
+# rather than turning them into all-NA rows
+keep.genes <- which(CoV_values < cov.threshold & sd_values > 0)
+GxS.covfilter <- mddExprData_quantileLog2[keep.genes, , drop = FALSE]
+dim(GxS.covfilter)
+
+# convert phenotype to factor
+pheno.factor <- as.factor(colnames(GxS.covfilter))
+pheno.factor
+str(pheno.factor)
+levels(pheno.factor)