diff --git a/Schrick-Noah_Homework-6.R b/Schrick-Noah_Homework-6.R
index a9c43f4..2421496 100644
--- a/Schrick-Noah_Homework-6.R
+++ b/Schrick-Noah_Homework-6.R
@@ -14,156 +14,94 @@
 source("Schrick-Noah_Ridge-LASSO-Regression.R")
 source("Schrick-Noah_Simulated-Data.R")
 bundled_data <- create_data()
-### LASSO
-unpen_beta <- unpen_coeff(bundled_data$train.X, bundled_data$train.y)
-lasso.df <- data.frame(att=c("intercept", colnames(bundled_data$train.X)),
-                       scores=unpen_beta$betas,
-                       abs_scores=abs(unpen_beta$betas))
-lasso.res <- dplyr::slice_max(lasso.df,order_by=abs_scores,n=20)
-lasso.table <- as.data.table(lasso.res)
-
-### Compare with Ridge
-#### Find lambda
-tune_results <- tune_ridge(bundled_data$train.X, bundled_data$train.y,
-                           num_folds=10, 2^seq(-5,5,1), verbose=F)
-plot(log(tune_results$cv.table$hyp), tune_results$cv.table$means, type="l",
-     xlab="lambda", ylab="CV Mean Loss")
-abline(v=tune_results$lam.min)
-tune_results$lam.min
-
-#### Use lam.min for Ridge Regression
-ridge_result <- ridge_betas(bundled_data$train.X, bundled_data$train.y,
-                            beta_init = NULL, lam=tune_results$lam.min, method="BFGS")
-ridge.df <- data.frame(att=c("intercept", colnames(bundled_data$train.X)),
-                       scores=ridge_result$betas,
-                       abs_scores=abs(ridge_result$betas))
-ridge.res <- dplyr::slice_max(ridge.df,order_by=abs_scores,n=20)
-ridge.table <- as.data.table(ridge.res)
-
-### Compare with Random Forest
-source("Schrick-Noah_Random-Forest.R")
-rf_result <- rf_comp(bundled_data$train)
-rf.df <- data.frame(att=c(colnames(bundled_data$train.X)),
-                    scores=rf_result$rf2_imp$rf_score)
-rf_res <- dplyr::slice_max(rf.df,order_by=scores, n=20)
-rf.table <- as.data.table(rf_res)
-
-### Compare with glmnet
-source("Schrick-Noah_glmnet.R")
-#### Alpha = 0
-glm.res.0 <- glm_fcn(bundled_data$train.X, bundled_data$train.y, 0)
-glm.df.0 <- data.frame(att=c("intercept", colnames(bundled_data$train.X)),
-                       scores=glm.res.0$lambda.1se,
-                       abs_scores=glm.res.0$abs_scores)
-glm.df.0.res <- dplyr::slice_max(glm.df.0,order_by=abs_scores,n=20)
-glm.0.table <- as.data.table(glm.df.0.res)
-
-#### Alpha = 1
-glm.res.1 <- glm_fcn(bundled_data$train.X, bundled_data$train.y, 1) # alpha=1
-glm.df.1 <- data.frame(att=c("intercept", colnames(bundled_data$train.X)),
-                       scores=glm.res.1$lambda.1se,
-                       abs_scores=glm.res.1$abs_scores)
-glm.df.1.res <- dplyr::slice_max(glm.df.1,order_by=abs_scores,n=20)
-glm.1.table <- as.data.table(glm.df.1.res)
-
-### Plot
-#### Convert names to indices
-lasso.df$att <- match(lasso.df$att,colnames(bundled_data$train))
-ridge.df$att <- match(ridge.df$att,colnames(bundled_data$train))
-rf.df$att <- match(rf.df$att,colnames(bundled_data$train))
-glm.df.0$att <- match(glm.df.0$att,colnames(bundled_data$train))
-glm.df.1$att <- match(glm.df.1$att,colnames(bundled_data$train))
-
-#### Scale
-lasso.df$abs_scores <- scale(lasso.df$abs_scores)
-ridge.df$abs_scores <- scale(ridge.df$abs_scores)
-rf.df$scores <- scale(rf.df$scores)
-glm.df.0$abs_scores <- scale(glm.df.0$abs_scores)
-glm.df.1$abs_scores <- scale(glm.df.1$abs_scores)
-
-plot(x=lasso.df$att, y=lasso.df$abs_scores, type="l", xlab="Vars",
-     ylab="Coefficients (Abs Scores)", xaxt="n", col="blue", ylim=c(-1,3),
-     main="Scaled scores for simulated data feature selection")
-axis(1, at=1:101, labels=colnames(bundled_data$train), cex.axis=0.5)
-lines(x=ridge.df$att, y=ridge.df$abs_scores, col="red")
-lines(x=rf.df$att, y=rf.df$scores, col="green")
-lines(x=glm.df.0$att, y=glm.df.0$abs_scores, col="bisque4")
-lines(x=glm.df.1$att, y=glm.df.1$abs_scores, col="purple")
-legend(x="topleft",
-       legend=c("LASSO", "Ridge", "Random Forest","glmnet (alpha=0)", "glmnet (alpha=1)"),
-       lty=c(1,1,1,1,1),
-       col=c("blue", "red", "green", "bisque4", "purple"),
-       cex=1)
-
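+# Part a is refactored into run_comparison() so the identical LASSO / Ridge /
+# random forest / glmnet comparison can be re-run unchanged on the
+# graph-structured data generated in part b.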
col="purple") -legend(x="topleft", - legend=c("LASSO", "Ridge", "Random Forest","glmnet (alpha=0)", "glmnet (alpha=1)"), - lty=c(1,1,1,1,1), - col=c("blue", "red", "green", "bisque4", "purple"), - cex=1) - +run_comparison <- function(bundled_data){ + ### LASSO + unpen_beta <- unpen_coeff(bundled_data$train.X, bundled_data$train.y) + lasso.df <- data.frame(att=c("intercept", colnames(bundled_data$train.X)), + scores=unpen_beta$betas, + abs_scores=abs(unpen_beta$betas)) + lasso.res <- dplyr::slice_max(lasso.df,order_by=abs_scores,n=20) + lasso.table <- as.data.table(lasso.res) + + ### Compare with Ridge + #### Find lambda + tune_results <- tune_ridge(bundled_data$train.X, bundled_data$train.y, + num_folds=10, 2^seq(-5,5,1), verbose=F) + plot(log(tune_results$cv.table$hyp), tune_results$cv.table$means, type="l", + xlab="lambda", ylab="CV Mean Loss") + abline(v=tune_results$lam.min) + tune_results$lam.min + + #### Use lam.min for Ridge Regression + ridge_result <- ridge_betas(bundled_data$train.X, bundled_data$train.y, + beta_init = NULL, lam=tune_results$lam.min, method="BFGS") + ridge.df <- data.frame(att=c("intercept", colnames(bundled_data$train.X)), + scores=ridge_result$betas, + abs_scores=abs(ridge_result$betas)) + ridge.res <- dplyr::slice_max(ridge.df,order_by=abs_scores,n=20) + ridge.table <- as.data.table(ridge.res) + + ### Compare with Random Forest + source("Schrick-Noah_Random-Forest.R") + rf_result <- rf_comp(bundled_data$train) + rf.df <- data.frame(att=c(colnames(bundled_data$train.X)), + scores=rf_result$rf2_imp$rf_score) + rf_res <- dplyr::slice_max(rf.df,order_by=scores, n=20) + rf.table <- as.data.table(rf_res) + + ### Compare with glmnet + source("Schrick-Noah_glmnet.R") + #### Alpha = 0 + glm.res.0 <- glm_fcn(bundled_data$train.X, bundled_data$train.y, 0) + glm.df.0 <- data.frame(att=c("intercept", colnames(bundled_data$train.X)), + scores=glm.res.0$lambda.1se, + abs_scores=glm.res.0$abs_scores) + glm.df.0.res <- dplyr::slice_max(glm.df.0,order_by=abs_scores,n=20) + glm.0.table <- as.data.table(glm.df.0.res) + + #### Alpha = 1 + glm.res.1 <- glm_fcn(bundled_data$train.X, bundled_data$train.y, 1) # alpha=1 + glm.df.1 <- data.frame(att=c("intercept", colnames(bundled_data$train.X)), + scores=glm.res.1$lambda.1se, + abs_scores=glm.res.1$abs_scores) + glm.df.1.res <- dplyr::slice_max(glm.df.1,order_by=abs_scores,n=20) + glm.1.table <- as.data.table(glm.df.1.res) + + ### Plot + #### Convert names to indices + lasso.df$att <- match(lasso.df$att,colnames(bundled_data$train)) + ridge.df$att <- match(ridge.df$att,colnames(bundled_data$train)) + rf.df$att <- match(rf.df$att,colnames(bundled_data$train)) + glm.df.0$att <- match(glm.df.0$att,colnames(bundled_data$train)) + glm.df.1$att <- match(glm.df.1$att,colnames(bundled_data$train)) + + #### Scale + lasso.df$abs_scores <- scale(lasso.df$abs_scores) + ridge.df$abs_scores <- scale(ridge.df$abs_scores) + rf.df$scores <- scale(rf.df$scores) + glm.df.0$abs_scores <- scale(glm.df.0$abs_scores) + glm.df.1$abs_scores <- scale(glm.df.1$abs_scores) + + plot(x=lasso.df$att, y=lasso.df$abs_scores, type="l", xlab="Vars", + ylab="Coefficients (Abs Scores)", xaxt="n", col="blue", ylim=c(-1,3), + main="Scaled scores for simulated data feature selection") + axis(1, at=1:101, labels=colnames(bundled_data$train), cex.axis=0.5) + lines(x=ridge.df$att, y=ridge.df$abs_scores, col="red") + lines(x=rf.df$att, y=rf.df$scores, col="green") + lines(x=glm.df.0$att, y=glm.df.0$abs_scores, col="bisque4") + lines(x=glm.df.1$att, y=glm.df.1$abs_scores, 
col="purple") + legend(x="topleft", + legend=c("LASSO", "Ridge", "Random Forest","glmnet (alpha=0)", "glmnet (alpha=1)"), + lty=c(1,1,1,1,1), + col=c("blue", "red", "green", "bisque4", "purple"), + cex=1) +} +run_comparison(bundled_data) ## b. Repeat comparison using a graph with clusters -if (!require("igraph")) install.packages("igraph") -library(igraph) -if (!require("Matrix")) install.packages("Matrix") -library(Matrix) # bdiag - -npc <-25 # nodes per cluster -n_clust <- 4 # 4 clusters with 25 nodes each - -# no clusters -g0 <- erdos.renyi.game(npc*n_clust, 0.2) -plot(g0) - -matlist = list() -for (i in 1:n_clust){ - matlist[[i]] = get.adjacency(erdos.renyi.game(npc, 0.2)) -} - -# merge clusters into one matrix -mat_clust <- bdiag(matlist) # create block-diagonal matrix - -## the following two things might not be necessary - -# check for loner nodes, connected to nothing, and join them to something -k <- rowSums(mat_clust) -node_vector <- seq(1,npc*n_clust) -for (i in node_vector){ - if (k[i]==0){ # if k=0, connect to something random - j <- sample(node_vector[-i],1) - mat_clust[i,j] <- 1 - mat_clust[j,i] <- 1 - } -} - -node_colors <- c(rep("red",npc), rep("green",npc), rep("blue",npc), rep("orange",npc)) -g1 <- graph_from_adjacency_matrix(mat_clust, mode="undirected", diag=F) -plot(g1, vertex.color=node_colors) - -### Dataset with g1 -dataset.graph <- npdro::createSimulation2(num.samples=num.samples, - num.variables=num.variables, - pct.imbalance=0.5, - pct.signals=0.2, - main.bias=0.5, - interaction.bias=1, - hi.cor=0.95, - lo.cor=0.2, - mix.type="main-interactionScalefree", - label="class", - sim.type="mixed", - pct.mixed=0.5, - pct.train=0.5, - pct.holdout=0.5, - pct.validation=0, - plot.graph=F, - graph.structure = g1, - verbose=T) - -train.graph <- dataset.graph$train #150x101 -test.graph <- dataset.graph$holdout -validation.graph <- dataset.graph$validation -dataset.graph$signal.names -colnames(train.graph) - -# separate the class vector from the predictor data matrix -train.graph.X <- train.graph[, -which(colnames(train.graph) == "class")] -train.graph.y <- train.graph[, "class"] -train.graph.y.01 <- as.numeric(train.graph.y)-1 +source("Schrick-Noah_graphs.R") +bundled_graph_data <- sim_graph_data() +run_comparison(bundled_graph_data) ## c. 
 
 ## c. Use npdro and igraph to create knn
 my.k <- 3 # larger k, fewer clusters
diff --git a/Schrick-Noah_graphs.R b/Schrick-Noah_graphs.R
new file mode 100644
index 0000000..0e86ba6
--- /dev/null
+++ b/Schrick-Noah_graphs.R
@@ -0,0 +1,43 @@
+source("Schrick-Noah_Simulated-Data.R")
+
+if (!require("igraph")) install.packages("igraph")
+library(igraph)
+if (!require("Matrix")) install.packages("Matrix")
+library(Matrix) # bdiag()
+
+sim_graph_data <- function(){
+  npc <- 25     # nodes per cluster
+  n_clust <- 4  # 4 clusters with 25 nodes each
+
+  # baseline graph with no cluster structure, plotted for visual comparison
+  g0 <- erdos.renyi.game(npc*n_clust, 0.2)
+  plot(g0)
+
+  # one Erdos-Renyi adjacency matrix per cluster
+  matlist <- list()
+  for (i in 1:n_clust){
+    matlist[[i]] <- get.adjacency(erdos.renyi.game(npc, 0.2))
+  }
+
+  # merge the clusters into one block-diagonal adjacency matrix
+  mat_clust <- bdiag(matlist)
+
+  ## the following step might not be necessary:
+  # check for loner nodes connected to nothing and join each to a random node
+  k <- rowSums(mat_clust)
+  node_vector <- seq(1, npc*n_clust)
+  for (i in node_vector){
+    if (k[i] == 0){ # degree zero: connect to a randomly chosen other node
+      j <- sample(node_vector[-i], 1)
+      mat_clust[i,j] <- 1
+      mat_clust[j,i] <- 1
+    }
+  }
+
+  # color each cluster's nodes for the plot
+  node_colors <- c(rep("red",npc), rep("green",npc), rep("blue",npc), rep("orange",npc))
+  g1 <- graph_from_adjacency_matrix(mat_clust, mode="undirected", diag=FALSE)
+  plot(g1, vertex.color=node_colors)
+
+  ### Dataset simulated on g1
+  bundled_graph_data <- create_data(graph.structure=g1)
+  return(bundled_graph_data)
+}
\ No newline at end of file