Part 1a: Regression comparisons
This commit is contained in:
parent
c2cd91b766
commit
93eb5d2f23
@ -1,8 +1,11 @@
|
|||||||
# Project 6 for the University of Tulsa's CS-7863 Sci-Stat Course
|
# Homework 6 for the University of Tulsa's CS-7863 Sci-Stat Course
|
||||||
# Penalized Machine Learning
|
# Penalized Machine Learning
|
||||||
# Professor: Dr. McKinney, Spring 2023
|
# Professor: Dr. McKinney, Spring 2023
|
||||||
# Noah L. Schrick - 1492657
|
# Noah L. Schrick - 1492657
|
||||||
|
|
||||||
|
if (!require("data.table")) install.packages("data.table")
|
||||||
|
library(data.table)
|
||||||
|
|
||||||
# 1. Penalized Regression and Classification
|
# 1. Penalized Regression and Classification
|
||||||
## a. Modified Ridge classification for LASSO penalties
|
## a. Modified Ridge classification for LASSO penalties
|
||||||
source("Schrick-Noah_Ridge-LASSO-Regression.R")
|
source("Schrick-Noah_Ridge-LASSO-Regression.R")
|
||||||
@ -10,27 +13,58 @@ source("Schrick-Noah_Ridge-LASSO-Regression.R")
|
|||||||
### Use npdro simulated data to test
|
### Use npdro simulated data to test
|
||||||
source("Schrick-Noah_Simulated-Data.R")
|
source("Schrick-Noah_Simulated-Data.R")
|
||||||
bundled_data <- create_data()
|
bundled_data <- create_data()
|
||||||
# bundled_data$train.X = train.X
|
|
||||||
|
|
||||||
lasso.df <- data.frame(att=c("intercept", colnames(train.X)),
|
### LASSO
|
||||||
|
unpen_beta <- unpen_coeff(bundled_data$train.X, bundled_data$train.y)
|
||||||
|
lasso.df <- data.frame(att=c("intercept", colnames(bundled_data$train.X)),
|
||||||
scores=unpen_beta$betas,
|
scores=unpen_beta$betas,
|
||||||
abs_scores=abs(unpen_beta$betas))
|
abs_scores=abs(unpen_beta$betas))
|
||||||
dplyr::slice_max(lasso.df,order_by=abs_scores,n=20)
|
lasso.res <- dplyr::slice_max(lasso.df,order_by=abs_scores,n=20)
|
||||||
|
lasso.table <- as.data.table(lasso.res)
|
||||||
|
|
||||||
### Compare with Ridge
|
### Compare with Ridge
|
||||||
|
#### Find lambda
|
||||||
|
tune_results <- tune_ridge(bundled_data$train.X, bundled_data$train.y,
|
||||||
|
num_folds=10, 2^seq(-5,5,1), verbose=F)
|
||||||
|
plot(log(tune_results$cv.table$hyp), tune_results$cv.table$means, type="l",
|
||||||
|
xlab="lambda", ylab="CV Mean Loss")
|
||||||
|
abline(v=tune_results$lam.min)
|
||||||
|
tune_results$lam.min
|
||||||
|
|
||||||
|
#### Use lam.min for Ridge Regression
|
||||||
|
ridge_result <- ridge_betas(bundled_data$train.X, bundled_data$train.y,
|
||||||
|
beta_init = NULL, lam=tune_results$lam.min, method="BFGS")
|
||||||
|
ridge.df <- data.frame(att=c("intercept", colnames(bundled_data$train.X)),
|
||||||
|
scores=ridge_result$betas,
|
||||||
|
abs_scores=abs(ridge_result$betas))
|
||||||
|
ridge.res <- dplyr::slice_max(ridge.df,order_by=abs_scores,n=20)
|
||||||
|
ridge.table <- as.data.table(ridge.res)
|
||||||
|
|
||||||
### Compare with Random Forest
|
### Compare with Random Forest
|
||||||
source("Schrick-Noah_Random-Forest.R")
|
source("Schrick-Noah_Random-Forest.R")
|
||||||
rf_comp(train)
|
rf_result <- rf_comp(bundled_data$train)
|
||||||
|
rf.df <- data.frame(att=c(colnames(bundled_data$train.X)),
|
||||||
|
scores=rf_result$rf2_imp$rf_score)
|
||||||
|
rf_res <- dplyr::slice_max(rf.df,order_by=scores, n=20)
|
||||||
|
rf.table <- as.data.table(rf_res)
|
||||||
|
|
||||||
### Compare with glmnet
|
### Compare with glmnet
|
||||||
source("Schrick-Noah_glmnet.R")
|
source("Schrick-Noah_glmnet.R")
|
||||||
|
|
||||||
#### Alpha = 0
|
#### Alpha = 0
|
||||||
glm_fcn(train.X, train.y, 0)
|
glm.res.0 <- glm_fcn(bundled_data$train.X, bundled_data$train.y, 0)
|
||||||
|
glm.df.0 <- data.frame(att=c("intercept", colnames(bundled_data$train.X)),
|
||||||
|
scores=glm.res.0$lambda.1se,
|
||||||
|
abs_scores=glm.res.0$abs_scores)
|
||||||
|
glm.df.0.res <- dplyr::slice_max(glm.df.0,order_by=abs_scores,n=20)
|
||||||
|
glm.0.table <- as.data.table(glm.df.0.res)
|
||||||
|
|
||||||
#### Alpha = 1
|
#### Alpha = 1
|
||||||
glm_fcn(train.X, train.y, 1)
|
glm.res.1 <- glm_fcn(bundled_data$train.X, bundled_data$train.y, 1) # alpha=1
|
||||||
|
glm.df.1 <- data.frame(att=c("intercept", colnames(bundled_data$train.X)),
|
||||||
|
scores=glm.res.1$lambda.1se,
|
||||||
|
abs_scores=glm.res.1$abs_scores)
|
||||||
|
glm.df.1.res <- dplyr::slice_max(glm.df.1,order_by=abs_scores,n=20)
|
||||||
|
glm.1.table <- as.data.table(glm.df.1.res)
|
||||||
|
|
||||||
## b. Repeat comparison using a graph with clusters
|
## b. Repeat comparison using a graph with clusters
|
||||||
if (!require("igraph")) install.packages("igraph")
|
if (!require("igraph")) install.packages("igraph")
|
||||||
|
|||||||
@ -10,7 +10,7 @@ rf_comp <- function(train){
|
|||||||
detach("package:ranger", unload=TRUE)
|
detach("package:ranger", unload=TRUE)
|
||||||
rf_imp<-data.frame(rf_score=importance(rf, type=1)) # Cannot do if ranger is loaded
|
rf_imp<-data.frame(rf_score=importance(rf, type=1)) # Cannot do if ranger is loaded
|
||||||
#dplyr::arrange(rf_imp,-MeanDecreaseAccuracy)
|
#dplyr::arrange(rf_imp,-MeanDecreaseAccuracy)
|
||||||
print(dplyr::slice_max(rf_imp,order_by=MeanDecreaseAccuracy, n=20))
|
#print(dplyr::slice_max(rf_imp,order_by=MeanDecreaseAccuracy, n=20))
|
||||||
|
|
||||||
library(ranger)
|
library(ranger)
|
||||||
rf2<-ranger(as.factor(train$class) ~ ., data=train, num.trees=5000,
|
rf2<-ranger(as.factor(train$class) ~ ., data=train, num.trees=5000,
|
||||||
@ -18,8 +18,10 @@ rf_comp <- function(train){
|
|||||||
print(rf2) # error
|
print(rf2) # error
|
||||||
rf2_imp<-data.frame(rf_score=rf2$variable.importance)
|
rf2_imp<-data.frame(rf_score=rf2$variable.importance)
|
||||||
#dplyr::arrange(rf_imp,-MeanDecreaseAccuracy)
|
#dplyr::arrange(rf_imp,-MeanDecreaseAccuracy)
|
||||||
print(dplyr::slice_max(rf2_imp,order_by=rf_score, n=20))
|
#print(dplyr::slice_max(rf2_imp,order_by=rf_score, n=20))
|
||||||
|
|
||||||
#rftest <- predict(rf, newdata=test, type="class")
|
#rftest <- predict(rf, newdata=test, type="class")
|
||||||
#confusionMatrix(table(rftest,test$class))
|
#confusionMatrix(table(rftest,test$class))
|
||||||
|
|
||||||
|
return(list(rf_imp=rf_imp, rf2_imp=rf2_imp))
|
||||||
}
|
}
|
||||||
@ -68,4 +68,37 @@ unpen_coeff <- function(X, y, lambda=0){
|
|||||||
beta <- beta+lambda
|
beta <- beta+lambda
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
return(unpen_beta)
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!require("caret")) install.packages("caret")
|
||||||
|
library(caret)
|
||||||
|
tune_ridge <- function(X, y, num_folds, tune_grid, verbose=T){
|
||||||
|
folds <- caret::createFolds(y, k = num_folds)
|
||||||
|
cv.results <- list()
|
||||||
|
for (fold.id in seq(1,num_folds)){
|
||||||
|
te.idx <- folds[[fold.id]]
|
||||||
|
if (verbose){cat("fold", fold.id, "of",num_folds,"\n")}
|
||||||
|
if(verbose){cat("\t inner loop over hyperparameters...\n")}
|
||||||
|
# iterate over hyperparameter
|
||||||
|
scores <- sapply(tune_grid, # hyp loop var
|
||||||
|
function(lam){
|
||||||
|
# train beta's
|
||||||
|
btrain <- ridge_betas(X[-te.idx,], y[-te.idx],
|
||||||
|
beta_init = NULL,
|
||||||
|
lam=lam, method="BFGS")$betas
|
||||||
|
# get test loss with training beta's
|
||||||
|
penalized_loss(X[te.idx,], y[te.idx], btrain, lam=lam, alpha=0)
|
||||||
|
}
|
||||||
|
) # end sapply hyp loop over hyperparameters
|
||||||
|
cv.results[[fold.id]] <- scores # scores vector
|
||||||
|
} # end for folds loop
|
||||||
|
cv.results <- data.frame(cv.results) # turn list to df
|
||||||
|
cv.results$means <- rowMeans(as.matrix(cv.results))
|
||||||
|
cv.results$hyp <- tune_grid
|
||||||
|
colnames(cv.results) <- c(names(folds),"means","hyp")
|
||||||
|
#### Select best performance
|
||||||
|
best.idx <- which.min(cv.results$means) # accuracy
|
||||||
|
return(list(cv.table = cv.results,
|
||||||
|
lam.min = cv.results$hyp[best.idx]))
|
||||||
}
|
}
|
||||||
@ -14,5 +14,6 @@ glm_fcn <- function(train.X, train.y, alpha_p){
|
|||||||
|
|
||||||
glmnet.df <- data.frame(as.matrix(glmnet.class.coeffs))
|
glmnet.df <- data.frame(as.matrix(glmnet.class.coeffs))
|
||||||
glmnet.df$abs_scores <- abs(glmnet.df$lambda.1se)
|
glmnet.df$abs_scores <- abs(glmnet.df$lambda.1se)
|
||||||
dplyr::slice_max(glmnet.df,order_by=abs_scores,n=21)
|
#dplyr::slice_max(glmnet.df,order_by=abs_scores,n=21)
|
||||||
|
return(glmnet.df)
|
||||||
}
|
}
|
||||||
Loading…
x
Reference in New Issue
Block a user