Part 1a: Regression comparisons

Noah L. Schrick 2023-04-12 22:36:09 -05:00
parent c2cd91b766
commit 93eb5d2f23
4 changed files with 84 additions and 14 deletions

View File

@@ -1,8 +1,11 @@
-# Project 6 for the University of Tulsa's CS-7863 Sci-Stat Course
+# Homework 6 for the University of Tulsa's CS-7863 Sci-Stat Course
 # Penalized Machine Learning
 # Professor: Dr. McKinney, Spring 2023
 # Noah L. Schrick - 1492657
+if (!require("data.table")) install.packages("data.table")
+library(data.table)
 # 1. Penalized Regression and Classification
 ## a. Modified Ridge classification for LASSO penalties
 source("Schrick-Noah_Ridge-LASSO-Regression.R")
@@ -10,27 +13,58 @@ source("Schrick-Noah_Ridge-LASSO-Regression.R")
 ### Use npdro simulated data to test
 source("Schrick-Noah_Simulated-Data.R")
 bundled_data <- create_data()
-lasso.df <- data.frame(att=c("intercept", colnames(train.X)),
+# bundled_data$train.X = train.X
+### LASSO
+unpen_beta <- unpen_coeff(bundled_data$train.X, bundled_data$train.y)
+lasso.df <- data.frame(att=c("intercept", colnames(bundled_data$train.X)),
                        scores=unpen_beta$betas,
                        abs_scores=abs(unpen_beta$betas))
-dplyr::slice_max(lasso.df,order_by=abs_scores,n=20)
+lasso.res <- dplyr::slice_max(lasso.df,order_by=abs_scores,n=20)
+lasso.table <- as.data.table(lasso.res)
 ### Compare with Ridge
+#### Find lambda
+tune_results <- tune_ridge(bundled_data$train.X, bundled_data$train.y,
+                           num_folds=10, 2^seq(-5,5,1), verbose=F)
+plot(log(tune_results$cv.table$hyp), tune_results$cv.table$means, type="l",
+     xlab="lambda", ylab="CV Mean Loss")
+abline(v=tune_results$lam.min)
+tune_results$lam.min
+#### Use lam.min for Ridge Regression
+ridge_result <- ridge_betas(bundled_data$train.X, bundled_data$train.y,
+                            beta_init = NULL, lam=tune_results$lam.min, method="BFGS")
+ridge.df <- data.frame(att=c("intercept", colnames(bundled_data$train.X)),
+                       scores=ridge_result$betas,
+                       abs_scores=abs(ridge_result$betas))
+ridge.res <- dplyr::slice_max(ridge.df,order_by=abs_scores,n=20)
+ridge.table <- as.data.table(ridge.res)
 ### Compare with Random Forest
 source("Schrick-Noah_Random-Forest.R")
-rf_comp(train)
+rf_result <- rf_comp(bundled_data$train)
+rf.df <- data.frame(att=c(colnames(bundled_data$train.X)),
+                    scores=rf_result$rf2_imp$rf_score)
+rf_res <- dplyr::slice_max(rf.df,order_by=scores, n=20)
+rf.table <- as.data.table(rf_res)
 ### Compare with glmnet
 source("Schrick-Noah_glmnet.R")
 #### Alpha = 0
-glm_fcn(train.X, train.y, 0)
+glm.res.0 <- glm_fcn(bundled_data$train.X, bundled_data$train.y, 0)
+glm.df.0 <- data.frame(att=c("intercept", colnames(bundled_data$train.X)),
+                       scores=glm.res.0$lambda.1se,
+                       abs_scores=glm.res.0$abs_scores)
+glm.df.0.res <- dplyr::slice_max(glm.df.0,order_by=abs_scores,n=20)
+glm.0.table <- as.data.table(glm.df.0.res)
 #### Alpha = 1
-glm_fcn(train.X, train.y, 1)
+glm.res.1 <- glm_fcn(bundled_data$train.X, bundled_data$train.y, 1) # alpha=1
+glm.df.1 <- data.frame(att=c("intercept", colnames(bundled_data$train.X)),
+                       scores=glm.res.1$lambda.1se,
+                       abs_scores=glm.res.1$abs_scores)
+glm.df.1.res <- dplyr::slice_max(glm.df.1,order_by=abs_scores,n=20)
+glm.1.table <- as.data.table(glm.df.1.res)
 ## b. Repeat comparison using a graph with clusters
 if (!require("igraph")) install.packages("igraph")
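
The script above now ends each method with a top-20 data.table (lasso.table, ridge.table, rf.table, glm.0.table, glm.1.table), but the side-by-side comparison itself is not part of this hunk. Below is a minimal standalone sketch of one way the rankings could be lined up with data.table; the toy tables are stand-ins for the real objects, which would be passed in the same way.

library(data.table)
# Toy stand-ins for the per-method top-k tables built above (att + abs_scores).
lasso.table <- data.table(att = c("x3", "x1", "x7"), abs_scores = c(0.9, 0.6, 0.2))
ridge.table <- data.table(att = c("x1", "x3", "x5"), abs_scores = c(0.8, 0.7, 0.1))
# One column of attribute names per method, ordered by rank.
comparison <- data.table(rank  = seq_len(nrow(lasso.table)),
                         LASSO = lasso.table$att,
                         Ridge = ridge.table$att)
print(comparison)

With the real objects, the same data.table() call over each table's att column gives a per-method ranking grid for the comparison.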

Schrick-Noah_Random-Forest.R View File

@@ -10,7 +10,7 @@ rf_comp <- function(train){
   detach("package:ranger", unload=TRUE)
   rf_imp<-data.frame(rf_score=importance(rf, type=1)) # Cannot do if ranger is loaded
   #dplyr::arrange(rf_imp,-MeanDecreaseAccuracy)
-  print(dplyr::slice_max(rf_imp,order_by=MeanDecreaseAccuracy, n=20))
+  #print(dplyr::slice_max(rf_imp,order_by=MeanDecreaseAccuracy, n=20))
   library(ranger)
   rf2<-ranger(as.factor(train$class) ~ ., data=train, num.trees=5000,
@@ -18,8 +18,10 @@ rf_comp <- function(train){
   print(rf2) # error
   rf2_imp<-data.frame(rf_score=rf2$variable.importance)
   #dplyr::arrange(rf_imp,-MeanDecreaseAccuracy)
-  print(dplyr::slice_max(rf2_imp,order_by=rf_score, n=20))
+  #print(dplyr::slice_max(rf2_imp,order_by=rf_score, n=20))
   #rftest <- predict(rf, newdata=test, type="class")
   #confusionMatrix(table(rftest,test$class))
-}
+  return(list(rf_imp=rf_imp, rf2_imp=rf2_imp))
+}
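
rf_comp() now returns its two importance tables (randomForest permutation importance and ranger permutation importance) instead of printing them, leaving the ranking to the caller. A minimal standalone sketch of that pattern with ranger follows; the toy data frame, its column names, and the smaller num.trees are illustrative assumptions, not the repo's settings.

library(ranger)
set.seed(42)
toy <- data.frame(matrix(rnorm(200 * 5), ncol = 5))           # 5 fake predictors X1..X5
toy$class <- factor(rbinom(200, 1, plogis(toy$X1 - toy$X2)))  # class driven by X1 and X2
rf2 <- ranger(class ~ ., data = toy, num.trees = 500, importance = "permutation")
rf2_imp <- data.frame(att = names(rf2$variable.importance),   # same shape as rf_comp's rf2_imp,
                      rf_score = rf2$variable.importance)     # plus an explicit attribute column
dplyr::slice_max(rf2_imp, order_by = rf_score, n = 3)         # top features by permutation importance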

Schrick-Noah_Ridge-LASSO-Regression.R View File

@@ -68,4 +68,37 @@ unpen_coeff <- function(X, y, lambda=0){
       beta <- beta+lambda
     }
   }
-}
+  return(unpen_beta)
+}
+if (!require("caret")) install.packages("caret")
+library(caret)
+tune_ridge <- function(X, y, num_folds, tune_grid, verbose=T){
+  folds <- caret::createFolds(y, k = num_folds)
+  cv.results <- list()
+  for (fold.id in seq(1,num_folds)){
+    te.idx <- folds[[fold.id]]
+    if (verbose){cat("fold", fold.id, "of",num_folds,"\n")}
+    if(verbose){cat("\t inner loop over hyperparameters...\n")}
+    # iterate over hyperparameter
+    scores <- sapply(tune_grid, # hyp loop var
+                     function(lam){
+                       # train beta's
+                       btrain <- ridge_betas(X[-te.idx,], y[-te.idx],
+                                             beta_init = NULL,
+                                             lam=lam, method="BFGS")$betas
+                       # get test loss with training beta's
+                       penalized_loss(X[te.idx,], y[te.idx], btrain, lam=lam, alpha=0)
+                     }
+    ) # end sapply hyp loop over hyperparameters
+    cv.results[[fold.id]] <- scores # scores vector
+  } # end for folds loop
+  cv.results <- data.frame(cv.results) # turn list to df
+  cv.results$means <- rowMeans(as.matrix(cv.results))
+  cv.results$hyp <- tune_grid
+  colnames(cv.results) <- c(names(folds),"means","hyp")
+  #### Select best performance
+  best.idx <- which.min(cv.results$means) # accuracy
+  return(list(cv.table = cv.results,
+              lam.min = cv.results$hyp[best.idx]))
+}
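
tune_ridge() depends on ridge_betas() and penalized_loss(), which are defined elsewhere in this file and not shown in the hunk, so it is not runnable from this diff alone. Its structure is ordinary k-fold cross-validation over a lambda grid. The sketch below reproduces that structure in a self-contained way, substituting a closed-form linear ridge fit and squared-error validation loss for the repo's functions (toy data, no intercept); it is an analogue of the search, not the repo's code.

library(caret)
set.seed(1)
n <- 100; p <- 5
X <- matrix(rnorm(n * p), n, p)
y <- drop(X %*% c(2, -1, 0, 0, 0.5)) + rnorm(n)
folds <- caret::createFolds(y, k = 5)          # held-out indices per fold
grid  <- 2^seq(-5, 5, 1)                       # lambda grid, as in the main script
cv_means <- sapply(grid, function(lam) {
  mean(sapply(folds, function(te.idx) {
    Xtr <- X[-te.idx, ]; ytr <- y[-te.idx]
    beta <- solve(crossprod(Xtr) + lam * diag(p), crossprod(Xtr, ytr))  # closed-form ridge fit
    mean((y[te.idx] - drop(X[te.idx, ] %*% beta))^2)                    # held-out squared error
  }))
})
grid[which.min(cv_means)]   # analogous to lam.min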

Schrick-Noah_glmnet.R View File

@@ -14,5 +14,6 @@ glm_fcn <- function(train.X, train.y, alpha_p){
   glmnet.df <- data.frame(as.matrix(glmnet.class.coeffs))
   glmnet.df$abs_scores <- abs(glmnet.df$lambda.1se)
-  dplyr::slice_max(glmnet.df,order_by=abs_scores,n=21)
+  #dplyr::slice_max(glmnet.df,order_by=abs_scores,n=21)
-}
+  return(glmnet.df)
+}
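
Only the tail of glm_fcn() appears in this diff; it evidently returns a data frame of glmnet coefficients at lambda.1se along with their absolute values. For context, here is a minimal standalone sketch of that kind of cv.glmnet workflow on simulated data; the fit call, the simulated data, and the column names are assumptions rather than the repo's exact code.

library(glmnet)
set.seed(7)
x <- matrix(rnorm(200 * 10), 200, 10)
y <- rbinom(200, 1, plogis(x[, 1] - x[, 2]))
cvfit <- cv.glmnet(x, y, family = "binomial", alpha = 1)      # alpha = 1: LASSO; alpha = 0: ridge
coefs <- as.matrix(coef(cvfit, s = "lambda.1se"))             # intercept + one coefficient per column of x
glmnet.df <- data.frame(att = rownames(coefs),
                        score = coefs[, 1],
                        abs_scores = abs(coefs[, 1]))
dplyr::slice_max(glmnet.df, order_by = abs_scores, n = 5)     # top attributes, as in the main script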