diff --git a/Schrick-Noah_Homework-6.R b/Schrick-Noah_Homework-6.R
index 825137b..d0d842a 100644
--- a/Schrick-Noah_Homework-6.R
+++ b/Schrick-Noah_Homework-6.R
@@ -1,8 +1,11 @@
-# Project 6 for the University of Tulsa's CS-7863 Sci-Stat Course
+# Homework 6 for the University of Tulsa's CS-7863 Sci-Stat Course
 # Penalized Machine Learning
 # Professor: Dr. McKinney, Spring 2023
 # Noah L. Schrick - 1492657
 
+if (!require("data.table")) install.packages("data.table")
+library(data.table)
+
 # 1. Penalized Regression and Classification
 ## a. Modified Ridge classification for LASSO penalties
 source("Schrick-Noah_Ridge-LASSO-Regression.R")
@@ -10,27 +13,58 @@ source("Schrick-Noah_Ridge-LASSO-Regression.R")
 ### Use npdro simulated data to test
 source("Schrick-Noah_Simulated-Data.R")
 bundled_data <- create_data()
-# bundled_data$train.X = train.X
-lasso.df <- data.frame(att=c("intercept", colnames(train.X)),
+### LASSO
+unpen_beta <- unpen_coeff(bundled_data$train.X, bundled_data$train.y)
+lasso.df <- data.frame(att=c("intercept", colnames(bundled_data$train.X)),
                        scores=unpen_beta$betas,
                        abs_scores=abs(unpen_beta$betas))
-dplyr::slice_max(lasso.df,order_by=abs_scores,n=20)
+lasso.res <- dplyr::slice_max(lasso.df,order_by=abs_scores,n=20)
+lasso.table <- as.data.table(lasso.res)
 
 ### Compare with Ridge
+#### Find lambda
+tune_results <- tune_ridge(bundled_data$train.X, bundled_data$train.y,
+                           num_folds=10, 2^seq(-5,5,1), verbose=F)
+plot(log(tune_results$cv.table$hyp), tune_results$cv.table$means, type="l",
+     xlab="lambda", ylab="CV Mean Loss")
+abline(v=tune_results$lam.min)
+tune_results$lam.min
+
+#### Use lam.min for Ridge Regression
+ridge_result <- ridge_betas(bundled_data$train.X, bundled_data$train.y,
+                            beta_init = NULL, lam=tune_results$lam.min, method="BFGS")
+ridge.df <- data.frame(att=c("intercept", colnames(bundled_data$train.X)),
+                       scores=ridge_result$betas,
+                       abs_scores=abs(ridge_result$betas))
+ridge.res <- dplyr::slice_max(ridge.df,order_by=abs_scores,n=20)
+ridge.table <- as.data.table(ridge.res)
 
 ### Compare with Random Forest
 source("Schrick-Noah_Random-Forest.R")
-rf_comp(train)
+rf_result <- rf_comp(bundled_data$train)
+rf.df <- data.frame(att=c(colnames(bundled_data$train.X)),
+                    scores=rf_result$rf2_imp$rf_score)
+rf_res <- dplyr::slice_max(rf.df,order_by=scores, n=20)
+rf.table <- as.data.table(rf_res)
 
 ### Compare with glmnet
 source("Schrick-Noah_glmnet.R")
-
 #### Alpha = 0
-glm_fcn(train.X, train.y, 0)
+glm.res.0 <- glm_fcn(bundled_data$train.X, bundled_data$train.y, 0)
+glm.df.0 <- data.frame(att=c("intercept", colnames(bundled_data$train.X)),
+                       scores=glm.res.0$lambda.1se,
+                       abs_scores=glm.res.0$abs_scores)
+glm.df.0.res <- dplyr::slice_max(glm.df.0,order_by=abs_scores,n=20)
+glm.0.table <- as.data.table(glm.df.0.res)
 
 #### Alpha = 1
-glm_fcn(train.X, train.y, 1)
+glm.res.1 <- glm_fcn(bundled_data$train.X, bundled_data$train.y, 1) # alpha=1
+glm.df.1 <- data.frame(att=c("intercept", colnames(bundled_data$train.X)),
+                       scores=glm.res.1$lambda.1se,
+                       abs_scores=glm.res.1$abs_scores)
+glm.df.1.res <- dplyr::slice_max(glm.df.1,order_by=abs_scores,n=20)
+glm.1.table <- as.data.table(glm.df.1.res)
 
 ## b. Repeat comparison using a graph with clusters
 if (!require("igraph")) install.packages("igraph")
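Aside (not part of the patch): part 1a ultimately compares the four methods, so below is a minimal sketch of how the four top-20 tables built above might be laid side by side once the patched Schrick-Noah_Homework-6.R has run. It assumes lasso.table, ridge.table, rf.table, and glm.1.table exist as created by the script; the helper top20 and the name comp.table are hypothetical, not from the patch.

library(data.table)
# head() guards against dplyr::slice_max() returning more than 20 rows on ties
top20 <- function(tab) head(tab$att, 20)
comp.table <- data.table(lasso  = top20(lasso.table),
                         ridge  = top20(ridge.table),
                         rf     = top20(rf.table),
                         glmnet = top20(glm.1.table))
print(comp.table)  # one row per rank, one column per method
# overlap between the LASSO and Ridge top-20 attribute sets
length(intersect(comp.table$lasso, comp.table$ridge))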
diff --git a/Schrick-Noah_Random-Forest.R b/Schrick-Noah_Random-Forest.R
index e7d2410..433d861 100644
--- a/Schrick-Noah_Random-Forest.R
+++ b/Schrick-Noah_Random-Forest.R
@@ -10,7 +10,7 @@ rf_comp <- function(train){
   detach("package:ranger", unload=TRUE)
   rf_imp<-data.frame(rf_score=importance(rf, type=1)) # Cannot do if ranger is loaded
   #dplyr::arrange(rf_imp,-MeanDecreaseAccuracy)
-  print(dplyr::slice_max(rf_imp,order_by=MeanDecreaseAccuracy, n=20))
+  #print(dplyr::slice_max(rf_imp,order_by=MeanDecreaseAccuracy, n=20))
 
   library(ranger)
   rf2<-ranger(as.factor(train$class) ~ ., data=train, num.trees=5000,
@@ -18,8 +18,10 @@ rf_comp <- function(train){
   print(rf2) # error
   rf2_imp<-data.frame(rf_score=rf2$variable.importance)
   #dplyr::arrange(rf_imp,-MeanDecreaseAccuracy)
-  print(dplyr::slice_max(rf2_imp,order_by=rf_score, n=20))
+  #print(dplyr::slice_max(rf2_imp,order_by=rf_score, n=20))
 
   #rftest <- predict(rf, newdata=test, type="class")
   #confusionMatrix(table(rftest,test$class))
-}
\ No newline at end of file
+
+  return(list(rf_imp=rf_imp, rf2_imp=rf2_imp))
+}
diff --git a/Schrick-Noah_Ridge-LASSO-Regression.R b/Schrick-Noah_Ridge-LASSO-Regression.R
index 22e87de..b1c7f37 100644
--- a/Schrick-Noah_Ridge-LASSO-Regression.R
+++ b/Schrick-Noah_Ridge-LASSO-Regression.R
@@ -68,4 +68,37 @@ unpen_coeff <- function(X, y, lambda=0){
       beta <- beta+lambda
     }
   }
-}
\ No newline at end of file
+  return(unpen_beta)
+}
+
+if (!require("caret")) install.packages("caret")
+library(caret)
+tune_ridge <- function(X, y, num_folds, tune_grid, verbose=T){
+  folds <- caret::createFolds(y, k = num_folds)
+  cv.results <- list()
+  for (fold.id in seq(1,num_folds)){
+    te.idx <- folds[[fold.id]]
+    if (verbose){cat("fold", fold.id, "of",num_folds,"\n")}
+    if (verbose){cat("\t inner loop over hyperparameters...\n")}
+    # iterate over hyperparameter
+    scores <- sapply(tune_grid, # hyp loop var
+                     function(lam){
+                       # train beta's
+                       btrain <- ridge_betas(X[-te.idx,], y[-te.idx],
+                                             beta_init = NULL,
+                                             lam=lam, method="BFGS")$betas
+                       # get test loss with training beta's
+                       penalized_loss(X[te.idx,], y[te.idx], btrain, lam=lam, alpha=0)
+                     }
+    ) # end sapply hyp loop over hyperparameters
+    cv.results[[fold.id]] <- scores # scores vector
+  } # end for folds loop
+  cv.results <- data.frame(cv.results) # turn list to df
+  cv.results$means <- rowMeans(as.matrix(cv.results))
+  cv.results$hyp <- tune_grid
+  colnames(cv.results) <- c(names(folds),"means","hyp")
+  #### Select best performance
+  best.idx <- which.min(cv.results$means) # lowest mean CV loss
+  return(list(cv.table = cv.results,
+              lam.min = cv.results$hyp[best.idx]))
+}
diff --git a/Schrick-Noah_glmnet.R b/Schrick-Noah_glmnet.R
index 8168760..2f4e1b6 100644
--- a/Schrick-Noah_glmnet.R
+++ b/Schrick-Noah_glmnet.R
@@ -14,5 +14,6 @@ glm_fcn <- function(train.X, train.y, alpha_p){
   glmnet.df <- data.frame(as.matrix(glmnet.class.coeffs))
   glmnet.df$abs_scores <- abs(glmnet.df$lambda.1se)
 
-  dplyr::slice_max(glmnet.df,order_by=abs_scores,n=21)
-}
\ No newline at end of file
+  #dplyr::slice_max(glmnet.df,order_by=abs_scores,n=21)
+  return(glmnet.df)
+}
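Aside (not part of the patch): a minimal usage sketch for the tune_ridge() helper added in Schrick-Noah_Ridge-LASSO-Regression.R, run on synthetic data. It assumes ridge_betas() and penalized_loss() behave as the patch suggests (betas holds an intercept followed by one coefficient per column, and y is a 0/1 class label, neither of which the patch shows in full); the toy data and the names X, y, tuned, fit are illustrative only.

source("Schrick-Noah_Ridge-LASSO-Regression.R")
set.seed(42)
X <- matrix(rnorm(100 * 10), nrow = 100,
            dimnames = list(NULL, paste0("x", 1:10)))  # 100 samples, 10 attributes
y <- rbinom(100, 1, plogis(X[, 1] - X[, 2]))           # class driven by x1 and x2
tuned <- tune_ridge(X, y, num_folds = 5, tune_grid = 2^seq(-5, 5, 1), verbose = FALSE)
tuned$lam.min    # lambda with the lowest mean CV loss
fit <- ridge_betas(X, y, beta_init = NULL, lam = tuned$lam.min, method = "BFGS")
head(fit$betas)  # intercept first, then per-attribute coefficients (assumed layout)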