library(ggplot2)
library(glmnet)

## Loading required package: Matrix

## Loading required package: foreach

## Loaded glmnet 2.0-10

library(cowplot)

## 
## Attaching package: 'cowplot'

## The following object is masked from 'package:ggplot2':
## 
##     ggsave

set.seed(113)
Coef = rnorm(5, 0, sd=1)
X = matrix(rnorm(500, 0, sd=1), nrow = 5, ncol = 100)
Y = apply(X, 2, function(x) Coef%*%x)

df = data.frame(x = c("a", "b", "c", "d", "e"), coef = Coef, Category="True")
ggplot(df, aes(x=x, y=coef, fill=Category)) + geom_bar(stat = "Identity") + theme_bw() +
  xlab("") + ylab("Coefficients")

Reg_analysis <- function(Xtrain, Ytrain, Xtest, Ytest, Ronly = FALSE) {
  model = glmnet(t(Xtrain), Ytrain, alpha=0)
  Ypred = predict.glmnet(model, newx = t(Xtest), s=0)

  df = data.frame(True = Ytest, Predict = as.numeric(Ypred))
  dfcoef = rbind(
    data.frame(Var = c("a", "b", "c", "d", "e"), Coef = Coef, Category="True"),
    data.frame(Var = c("a", "b", "c", "d", "e"), Coef = coef.glmnet(model, s=0)[2:6], Category="Trained")
            )
  dfcoef$Category = factor(dfcoef$Category)
  plots = list()

  err = max(1-sum((Ypred-Ytest)^2)/sum((Ypred-mean(Ypred))^2), 0)
  if (!Ronly) {
    plots[[1]] = ggplot(df, aes(y=True, x=Predict)) + geom_point() + xlab("Prected Y") + ylab("True Y") + 
      annotate("text", x=min(df$Predict)*0.5, y=max(df$True), label=paste0("R squared = ",as.character(round(err, digits = 2))))
    plots[[2]] = ggplot(dfcoef, aes(x=Var, y=Coef, fill=Category)) + geom_bar(stat = "Identity", position = "dodge") +
      ylab("Coefficients")
    return(plots)
  } else{
    return(err)
  }
}

Training regression model on data without noise

Train model with only 10 data samples, which works already fine for both predicting new data and coefficients. Response for independent test data (1000 observations) shows good correaltion with predicted response.

Xtrain = matrix(rnorm(50, 0, sd=1), nrow = 5, ncol = 10)
Ytrain = apply(Xtrain, 2, function(x) Coef%*%x)

Xtest = matrix(rnorm(1000, 0, sd=1), nrow = 5, ncol = 200)
Ytest = apply(Xtest, 2, function(x) Coef%*%x)

plots <- Reg_analysis(Xtrain, Ytrain, Xtest, Ytest)

plot_grid(plots[[1]], plots[[2]], labels = c("A", "B"), rel_widths = c(1,1.5))

Model with noise and increased amount of data.

Ntrain = c(10, 50, 100, 500)
plots = list()
for (Ndata in Ntrain) {
  Xtrain = matrix(rnorm(5*Ndata, 0, sd=1), nrow = 5, ncol = Ndata)
  Ytrain = apply(Xtrain, 2, function(x) Coef%*%x) + rnorm(Ndata, 0, 5)

  plots = Reg_analysis(Xtrain, Ytrain, Xtest, Ytest)
  #plots = c(plots, Reg_analysis(Xtrain, Ytrain, Xtest, Ytest))
  plots[[1]] = plots[[1]] + ggtitle(paste0("#Training data = ", as.character(Ndata)))
  plot(plot_grid(plotlist = plots, labels=c("A", "B"), rel_widths = c(1,1.5)))
}

#plot_grid(plotlist = plots, labels = c("A", "B", "C", "D", "E", "F"),rel_widths = c(1,1.5), ncol = 2)

performance with varying noise levels and amount of data

Noise = seq(1,10, by=1)
Ntrain = seq(10, 1000, by = 10)

df = data.frame()

for(noise in Noise) {
  for(Ndata in Ntrain) {
    Xtrain = matrix(rnorm(5*Ndata, 0, sd=1), nrow = 5, ncol = Ndata)
    Ytrain = apply(Xtrain, 2, function(x) Coef%*%x) + rnorm(Ndata, 0, noise)
    
    Perf =  Reg_analysis(Xtrain, Ytrain, Xtest, Ytest, Ronly = TRUE)
    df = rbind(df, data.frame(Rsquare = Perf, Noise=noise, Ndata = Ndata))
  }
}

ggplot(df, aes(x=Ndata, y=Rsquare)) + geom_smooth(aes(group=Noise, color = Noise), se=FALSE) +
  xlab("Number of training data") + ylab("R squared")

## `geom_smooth()` using method = 'loess'

Numerical experiment on differnet number of samples

Training regression model on data without noise

Model with noise and increased amount of data.

performance with varying noise levels and amount of data