library(ggplot2)
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-10
library(cowplot)
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggplot2':
##
## ggsave
set.seed(113)
Coef = rnorm(5, 0, sd=1)
X = matrix(rnorm(500, 0, sd=1), nrow = 5, ncol = 100)
Y = apply(X, 2, function(x) Coef%*%x)
df = data.frame(x = c("a", "b", "c", "d", "e"), coef = Coef, Category="True")
ggplot(df, aes(x=x, y=coef, fill=Category)) + geom_bar(stat = "Identity") + theme_bw() +
xlab("") + ylab("Coefficients")
Reg_analysis <- function(Xtrain, Ytrain, Xtest, Ytest, Ronly = FALSE) {
model = glmnet(t(Xtrain), Ytrain, alpha=0)
Ypred = predict.glmnet(model, newx = t(Xtest), s=0)
df = data.frame(True = Ytest, Predict = as.numeric(Ypred))
dfcoef = rbind(
data.frame(Var = c("a", "b", "c", "d", "e"), Coef = Coef, Category="True"),
data.frame(Var = c("a", "b", "c", "d", "e"), Coef = coef.glmnet(model, s=0)[2:6], Category="Trained")
)
dfcoef$Category = factor(dfcoef$Category)
plots = list()
err = max(1-sum((Ypred-Ytest)^2)/sum((Ypred-mean(Ypred))^2), 0)
if (!Ronly) {
plots[[1]] = ggplot(df, aes(y=True, x=Predict)) + geom_point() + xlab("Prected Y") + ylab("True Y") +
annotate("text", x=min(df$Predict)*0.5, y=max(df$True), label=paste0("R squared = ",as.character(round(err, digits = 2))))
plots[[2]] = ggplot(dfcoef, aes(x=Var, y=Coef, fill=Category)) + geom_bar(stat = "Identity", position = "dodge") +
ylab("Coefficients")
return(plots)
} else{
return(err)
}
}
Train model with only 10 data samples, which works already fine for both predicting new data and coefficients. Response for independent test data (1000 observations) shows good correaltion with predicted response.
Xtrain = matrix(rnorm(50, 0, sd=1), nrow = 5, ncol = 10)
Ytrain = apply(Xtrain, 2, function(x) Coef%*%x)
Xtest = matrix(rnorm(1000, 0, sd=1), nrow = 5, ncol = 200)
Ytest = apply(Xtest, 2, function(x) Coef%*%x)
plots <- Reg_analysis(Xtrain, Ytrain, Xtest, Ytest)
plot_grid(plots[[1]], plots[[2]], labels = c("A", "B"), rel_widths = c(1,1.5))
Ntrain = c(10, 50, 100, 500)
plots = list()
for (Ndata in Ntrain) {
Xtrain = matrix(rnorm(5*Ndata, 0, sd=1), nrow = 5, ncol = Ndata)
Ytrain = apply(Xtrain, 2, function(x) Coef%*%x) + rnorm(Ndata, 0, 5)
plots = Reg_analysis(Xtrain, Ytrain, Xtest, Ytest)
#plots = c(plots, Reg_analysis(Xtrain, Ytrain, Xtest, Ytest))
plots[[1]] = plots[[1]] + ggtitle(paste0("#Training data = ", as.character(Ndata)))
plot(plot_grid(plotlist = plots, labels=c("A", "B"), rel_widths = c(1,1.5)))
}
#plot_grid(plotlist = plots, labels = c("A", "B", "C", "D", "E", "F"),rel_widths = c(1,1.5), ncol = 2)
Noise = seq(1,10, by=1)
Ntrain = seq(10, 1000, by = 10)
df = data.frame()
for(noise in Noise) {
for(Ndata in Ntrain) {
Xtrain = matrix(rnorm(5*Ndata, 0, sd=1), nrow = 5, ncol = Ndata)
Ytrain = apply(Xtrain, 2, function(x) Coef%*%x) + rnorm(Ndata, 0, noise)
Perf = Reg_analysis(Xtrain, Ytrain, Xtest, Ytest, Ronly = TRUE)
df = rbind(df, data.frame(Rsquare = Perf, Noise=noise, Ndata = Ndata))
}
}
ggplot(df, aes(x=Ndata, y=Rsquare)) + geom_smooth(aes(group=Noise, color = Noise), se=FALSE) +
xlab("Number of training data") + ylab("R squared")
## `geom_smooth()` using method = 'loess'