Part 1a: Regression comparisons
commit 93eb5d2f23 (parent c2cd91b766)
@@ -1,8 +1,11 @@
# Project 6 for the University of Tulsa's CS-7863 Sci-Stat Course
# Homework 6 for the University of Tulsa's CS-7863 Sci-Stat Course
# Penalized Machine Learning
# Professor: Dr. McKinney, Spring 2023
# Noah L. Schrick - 1492657

if (!require("data.table")) install.packages("data.table")
library(data.table)

# 1. Penalized Regression and Classification
## a. Modified Ridge classification for LASSO penalties
source("Schrick-Noah_Ridge-LASSO-Regression.R")
@@ -10,27 +13,58 @@ source("Schrick-Noah_Ridge-LASSO-Regression.R")
### Use npdro simulated data to test
source("Schrick-Noah_Simulated-Data.R")
bundled_data <- create_data()
# bundled_data$train.X = train.X

lasso.df <- data.frame(att=c("intercept", colnames(train.X)),
### LASSO
unpen_beta <- unpen_coeff(bundled_data$train.X, bundled_data$train.y)
lasso.df <- data.frame(att=c("intercept", colnames(bundled_data$train.X)),
                       scores=unpen_beta$betas,
                       abs_scores=abs(unpen_beta$betas))
dplyr::slice_max(lasso.df,order_by=abs_scores,n=20)
lasso.res <- dplyr::slice_max(lasso.df,order_by=abs_scores,n=20)
lasso.table <- as.data.table(lasso.res)
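# If create_data() does not already standardize train.X (an assumption here), the |beta|
# ranking above is easier to justify after putting the predictors on a common scale;
# a minimal sketch:
train.X.std <- scale(bundled_data$train.X)                       # center and scale each column
unpen_beta.std <- unpen_coeff(train.X.std, bundled_data$train.y)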

### Compare with Ridge
#### Find lambda
tune_results <- tune_ridge(bundled_data$train.X, bundled_data$train.y,
                           num_folds=10, 2^seq(-5,5,1), verbose=F)
plot(log(tune_results$cv.table$hyp), tune_results$cv.table$means, type="l",
     xlab="lambda", ylab="CV Mean Loss")
abline(v=tune_results$lam.min)
tune_results$lam.min
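# lam.min is simply the grid value with the smallest mean CV loss; a minimal sketch of
# recovering it directly from the returned cv.table (which carries "hyp" and "means" columns):
best <- which.min(tune_results$cv.table$means)
tune_results$cv.table$hyp[best]    # should equal tune_results$lam.min
# Note: the plot's x-axis is log(lambda), so abline(v=log(tune_results$lam.min)) would put
# the reference line on the curve itself.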

#### Use lam.min for Ridge Regression
ridge_result <- ridge_betas(bundled_data$train.X, bundled_data$train.y,
                            beta_init = NULL, lam=tune_results$lam.min, method="BFGS")
ridge.df <- data.frame(att=c("intercept", colnames(bundled_data$train.X)),
                       scores=ridge_result$betas,
                       abs_scores=abs(ridge_result$betas))
ridge.res <- dplyr::slice_max(ridge.df,order_by=abs_scores,n=20)
ridge.table <- as.data.table(ridge.res)
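# An illustrative comparison of the two rankings built above (not part of the sourced
# files): attributes that appear in both the LASSO-style and the ridge top-20 tables.
intersect(lasso.table$att, ridge.table$att)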

### Compare with Random Forest
source("Schrick-Noah_Random-Forest.R")
rf_comp(train)
rf_result <- rf_comp(bundled_data$train)
rf.df <- data.frame(att=c(colnames(bundled_data$train.X)),
                    scores=rf_result$rf2_imp$rf_score)
rf_res <- dplyr::slice_max(rf.df,order_by=scores, n=20)
rf.table <- as.data.table(rf_res)

### Compare with glmnet
source("Schrick-Noah_glmnet.R")

#### Alpha = 0
glm_fcn(train.X, train.y, 0)
glm.res.0 <- glm_fcn(bundled_data$train.X, bundled_data$train.y, 0)
glm.df.0 <- data.frame(att=c("intercept", colnames(bundled_data$train.X)),
                       scores=glm.res.0$lambda.1se,
                       abs_scores=glm.res.0$abs_scores)
glm.df.0.res <- dplyr::slice_max(glm.df.0,order_by=abs_scores,n=20)
glm.0.table <- as.data.table(glm.df.0.res)

#### Alpha = 1
glm_fcn(train.X, train.y, 1)
glm.res.1 <- glm_fcn(bundled_data$train.X, bundled_data$train.y, 1) # alpha=1
glm.df.1 <- data.frame(att=c("intercept", colnames(bundled_data$train.X)),
                       scores=glm.res.1$lambda.1se,
                       abs_scores=glm.res.1$abs_scores)
glm.df.1.res <- dplyr::slice_max(glm.df.1,order_by=abs_scores,n=20)
glm.1.table <- as.data.table(glm.df.1.res)

## b. Repeat comparison using a graph with clusters
if (!require("igraph")) install.packages("igraph")

@@ -10,7 +10,7 @@ rf_comp <- function(train){
  detach("package:ranger", unload=TRUE)
  rf_imp<-data.frame(rf_score=importance(rf, type=1)) # Cannot do if ranger is loaded
  #dplyr::arrange(rf_imp,-MeanDecreaseAccuracy)
  print(dplyr::slice_max(rf_imp,order_by=MeanDecreaseAccuracy, n=20))
  #print(dplyr::slice_max(rf_imp,order_by=MeanDecreaseAccuracy, n=20))

  library(ranger)
  rf2<-ranger(as.factor(train$class) ~ ., data=train, num.trees=5000,

@@ -18,8 +18,10 @@ rf_comp <- function(train){
  print(rf2) # error
  rf2_imp<-data.frame(rf_score=rf2$variable.importance)
  #dplyr::arrange(rf_imp,-MeanDecreaseAccuracy)
  print(dplyr::slice_max(rf2_imp,order_by=rf_score, n=20))
  #print(dplyr::slice_max(rf2_imp,order_by=rf_score, n=20))

  #rftest <- predict(rf, newdata=test, type="class")
  #confusionMatrix(table(rftest,test$class))

  return(list(rf_imp=rf_imp, rf2_imp=rf2_imp))
}
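# importance(rf, type=1) above is the permutation-based MeanDecreaseAccuracy measure from the
# randomForest package (it requires the forest to be grown with importance=TRUE), while
# rf2$variable.importance is ranger's analogue. A minimal usage sketch of the return value,
# assuming the bundled_data objects from the README script are in scope:
rf_out <- rf_comp(bundled_data$train)
head(rf_out$rf2_imp[order(-rf_out$rf2_imp$rf_score), , drop = FALSE], 10)  # top ranger scores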
@@ -68,4 +68,37 @@ unpen_coeff <- function(X, y, lambda=0){
      beta <- beta+lambda
    }
  }
  return(unpen_beta)
}

if (!require("caret")) install.packages("caret")
library(caret)
tune_ridge <- function(X, y, num_folds, tune_grid, verbose=T){
  folds <- caret::createFolds(y, k = num_folds)
  cv.results <- list()
  for (fold.id in seq(1,num_folds)){
    te.idx <- folds[[fold.id]]
    if (verbose){cat("fold", fold.id, "of",num_folds,"\n")}
    if(verbose){cat("\t inner loop over hyperparameters...\n")}
    # iterate over hyperparameter
    scores <- sapply(tune_grid, # hyp loop var
                     function(lam){
                       # train beta's
                       btrain <- ridge_betas(X[-te.idx,], y[-te.idx],
                                             beta_init = NULL,
                                             lam=lam, method="BFGS")$betas
                       # get test loss with training beta's
                       penalized_loss(X[te.idx,], y[te.idx], btrain, lam=lam, alpha=0)
                     }
    ) # end sapply hyp loop over hyperparameters
    cv.results[[fold.id]] <- scores # scores vector
  } # end for folds loop
  cv.results <- data.frame(cv.results) # turn list to df
  cv.results$means <- rowMeans(as.matrix(cv.results))
  cv.results$hyp <- tune_grid
  colnames(cv.results) <- c(names(folds),"means","hyp")
  #### Select best performance
  best.idx <- which.min(cv.results$means) # accuracy
  return(list(cv.table = cv.results,
              lam.min = cv.results$hyp[best.idx]))
}
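# A minimal usage sketch with toy data (assumes ridge_betas() and penalized_loss() above
# accept a numeric matrix X and a 0/1 response y, as in the README usage):
set.seed(1)
X.toy <- matrix(rnorm(100 * 5), ncol = 5, dimnames = list(NULL, paste0("x", 1:5)))
y.toy <- rbinom(100, 1, plogis(X.toy[, 1]))
toy.tune <- tune_ridge(X.toy, y.toy, num_folds = 5, tune_grid = 2^seq(-5, 5, 1), verbose = FALSE)
toy.tune$lam.min     # grid value with the lowest mean CV loss
toy.tune$cv.table    # one loss column per fold, plus "means" and "hyp"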
@@ -14,5 +14,6 @@ glm_fcn <- function(train.X, train.y, alpha_p){

  glmnet.df <- data.frame(as.matrix(glmnet.class.coeffs))
  glmnet.df$abs_scores <- abs(glmnet.df$lambda.1se)
  dplyr::slice_max(glmnet.df,order_by=abs_scores,n=21)
  #dplyr::slice_max(glmnet.df,order_by=abs_scores,n=21)
  return(glmnet.df)
}
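# Where glmnet.class.coeffs presumably comes from (the earlier part of glm_fcn is not shown
# in this hunk, so the sketch below is an assumption and kept commented out): a cross-validated
# glmnet fit whose coefficients are taken at the 1-SE lambda, matching the lambda.1se column
# used above. In glmnet, alpha = 0 is the ridge penalty and alpha = 1 is the LASSO penalty.
# library(glmnet)
# cv.fit <- cv.glmnet(as.matrix(train.X), train.y, family = "binomial", alpha = alpha_p)
# glmnet.class.coeffs <- coef(cv.fit, s = "lambda.1se")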